am 902868b7: Merge "YUV(NV21) to RGBA function NEON optimizations."

# By Vassilis Laganakos
# Via David Butcher (1) and Gerrit Code Review (1)
* commit '902868b7c94f0c16b53e28ee1dd68c4e4a24f964':
  YUV(NV21) to RGBA function NEON optimizations.
diff --git a/driver/rsdBcc.cpp b/driver/rsdBcc.cpp
index c78508c..e17c107 100644
--- a/driver/rsdBcc.cpp
+++ b/driver/rsdBcc.cpp
@@ -176,6 +176,7 @@
     MTLaunchStruct *mtls = (MTLaunchStruct *)usr;
     RsForEachStubParamStruct p;
     memcpy(&p, &mtls->fep, sizeof(p));
+    p.lid = idx;
     RsdHal * dc = (RsdHal *)mtls->rsc->mHal.drv;
     uint32_t sig = mtls->sig;
 
@@ -222,6 +223,7 @@
     MTLaunchStruct *mtls = (MTLaunchStruct *)usr;
     RsForEachStubParamStruct p;
     memcpy(&p, &mtls->fep, sizeof(p));
+    p.lid = idx;
     RsdHal * dc = (RsdHal *)mtls->rsc->mHal.drv;
     uint32_t sig = mtls->sig;
 
@@ -341,17 +343,40 @@
     Context *mrsc = (Context *)rsc;
     RsdHal * dc = (RsdHal *)mtls->rsc->mHal.drv;
 
-    if ((dc->mWorkers.mCount > 1) && s->mHal.info.isThreadable && !dc->mInForEach) {
+    if ((dc->mWorkers.mCount >= 1) && s->mHal.info.isThreadable && !dc->mInForEach) {
+        const size_t targetByteChunk = 16 * 1024;
         dc->mInForEach = true;
         if (mtls->fep.dimY > 1) {
-            mtls->mSliceSize = mtls->fep.dimY / (dc->mWorkers.mCount * 4);
+            uint32_t s1 = mtls->fep.dimY / ((dc->mWorkers.mCount + 1) * 4);
+            uint32_t s2 = 0;
+
+            // This chooses our slice size to rate limit atomic ops to
+            // one per 16k bytes of reads/writes.
+            if (mtls->fep.yStrideOut) {
+                s2 = targetByteChunk / mtls->fep.yStrideOut;
+            } else {
+                s2 = targetByteChunk / mtls->fep.yStrideIn;
+            }
+            mtls->mSliceSize = rsMin(s1, s2);
+
             if(mtls->mSliceSize < 1) {
                 mtls->mSliceSize = 1;
             }
 
             rsdLaunchThreads(mrsc, wc_xy, mtls);
         } else {
-            mtls->mSliceSize = mtls->fep.dimX / (dc->mWorkers.mCount * 4);
+            uint32_t s1 = mtls->fep.dimX / ((dc->mWorkers.mCount + 1) * 4);
+            uint32_t s2 = 0;
+
+            // This chooses our slice size to rate limit atomic ops to
+            // one per 16k bytes of reads/writes.
+            if (mtls->fep.eStrideOut) {
+                s2 = targetByteChunk / mtls->fep.eStrideOut;
+            } else {
+                s2 = targetByteChunk / mtls->fep.eStrideIn;
+            }
+            mtls->mSliceSize = rsMin(s1, s2);
+
             if(mtls->mSliceSize < 1) {
                 mtls->mSliceSize = 1;
             }
@@ -364,6 +389,7 @@
     } else {
         RsForEachStubParamStruct p;
         memcpy(&p, &mtls->fep, sizeof(p));
+        p.lid = 0;
         uint32_t sig = mtls->sig;
 
         //ALOGE("launch 3");
diff --git a/driver/rsdCore.cpp b/driver/rsdCore.cpp
index cdfc600..c8b8014 100644
--- a/driver/rsdCore.cpp
+++ b/driver/rsdCore.cpp
@@ -189,7 +189,8 @@
     while (!dc->mExit) {
         dc->mWorkers.mLaunchSignals[idx].wait();
         if (dc->mWorkers.mLaunchCallback) {
-           dc->mWorkers.mLaunchCallback(dc->mWorkers.mLaunchData, idx);
+            // idx + 1 is used because the calling thread is always worker 0.
+            dc->mWorkers.mLaunchCallback(dc->mWorkers.mLaunchData, idx+1);
         }
         android_atomic_dec(&dc->mWorkers.mRunningCount);
         dc->mWorkers.mCompleteSignal.set();
@@ -208,6 +209,13 @@
     for (uint32_t ct = 0; ct < dc->mWorkers.mCount; ct++) {
         dc->mWorkers.mLaunchSignals[ct].set();
     }
+
+    // We use the calling thread as one of the workers so we can start without
+    // the delay of the thread wakeup.
+    if (dc->mWorkers.mLaunchCallback) {
+       dc->mWorkers.mLaunchCallback(dc->mWorkers.mLaunchData, 0);
+    }
+
     while (android_atomic_acquire_load(&dc->mWorkers.mRunningCount) != 0) {
         dc->mWorkers.mCompleteSignal.wait();
     }
@@ -250,11 +258,13 @@
         cpu = rsc->props.mDebugMaxThreads;
     }
     if (cpu < 2) {
-        cpu = 0;
+        dc->mWorkers.mCount = 0;
+        return true;
     }
     ALOGV("%p Launching thread(s), CPUs %i", rsc, cpu);
 
-    dc->mWorkers.mCount = (uint32_t)cpu;
+    // Subtract one from the cpu count because we also use the command thread as a worker.
+    dc->mWorkers.mCount = (uint32_t)(cpu - 1);
     dc->mWorkers.mThreadId = (pthread_t *) calloc(dc->mWorkers.mCount, sizeof(pthread_t));
     dc->mWorkers.mNativeThreadId = (pid_t *) calloc(dc->mWorkers.mCount, sizeof(pid_t));
     dc->mWorkers.mLaunchSignals = new Signal[dc->mWorkers.mCount];
diff --git a/driver/rsdIntrinsicBlend.cpp b/driver/rsdIntrinsicBlend.cpp
index 22ad108..c35c379 100644
--- a/driver/rsdIntrinsicBlend.cpp
+++ b/driver/rsdIntrinsicBlend.cpp
@@ -103,9 +103,6 @@
     uint32_t x1 = xstart;
     uint32_t x2 = xend;
 
-    in += xstart;
-    out += xstart;
-
     switch (p->slot) {
     case BLEND_CLEAR:
         for (;x1 < x2; x1++, out++) {
diff --git a/driver/rsdIntrinsicBlur.cpp b/driver/rsdIntrinsicBlur.cpp
index 9c1fe68..5cd671e 100644
--- a/driver/rsdIntrinsicBlur.cpp
+++ b/driver/rsdIntrinsicBlur.cpp
@@ -29,6 +29,8 @@
     short ip[104];
     float radius;
     int iradius;
+    void **scratch;
+    size_t *scratchSize;
     ObjectBaseRef<Allocation> alloc;
 };
 
@@ -139,6 +141,7 @@
         out->xyzw = blurredPixel;
         x1++;
         out++;
+        gPtr++;
     }
 }
 
@@ -161,7 +164,8 @@
 static void Blur_uchar4(const RsForEachStubParamStruct *p,
                                     uint32_t xstart, uint32_t xend,
                                     uint32_t instep, uint32_t outstep) {
-    float buf[4 * 2048];
+    float stackbuf[4 * 2048];
+    float *buf = &stackbuf[0];
     ConvolveParams *cp = (ConvolveParams *)p->usr;
     if (!cp->alloc.get()) {
         ALOGE("Blur executed without input, skipping");
@@ -174,16 +178,37 @@
     uint32_t x1 = xstart;
     uint32_t x2 = xend;
 
+    if (p->dimX > 2048) {
+        if ((p->dimX > cp->scratchSize[p->lid]) || !cp->scratch[p->lid]) {
+            cp->scratch[p->lid] = realloc(cp->scratch[p->lid], p->dimX * 16);
+            cp->scratchSize[p->lid] = p->dimX;
+        }
+        buf = (float *)cp->scratch[p->lid];
+    }
     float4 *fout = (float4 *)buf;
+
     int y = p->y;
+    uint32_t vx1 = x1;
+    uint32_t vx2 = x2;
+
+    if (vx1 > (uint32_t)cp->iradius) {
+        vx1 -= cp->iradius;
+    } else {
+        vx1 = 0;
+    }
+    vx2 += cp->iradius;
+    if (vx2 >= p->dimX) {
+        vx2 = p->dimX - 1;
+    }
+
     if ((y > cp->iradius) && (y < ((int)p->dimY - cp->iradius))) {
         const uchar *pi = pin + (y - cp->iradius) * din->lod[0].stride;
-        OneVF(fout, pi, din->lod[0].stride, cp->fp, cp->iradius * 2 + 1, x1, x2);
+        OneVF(fout + vx1, pi, din->lod[0].stride, cp->fp, cp->iradius * 2 + 1, vx1, vx2);
     } else {
-        while(x2 > x1) {
-            OneV(p, fout, x1, y, pin, din->lod[0].stride, cp->fp, cp->iradius);
+        while(vx2 > vx1) {
+            OneV(p, fout, vx1, y, pin, din->lod[0].stride, cp->fp, cp->iradius);
             fout++;
-            x1++;
+            vx1++;
         }
     }
 
@@ -208,17 +233,51 @@
 
 }
 
-void * rsdIntrinsic_InitBlur(const android::renderscript::Context *dc,
+static void Destroy(const Context *rsc, const Script *script, void * intrinsicData) {
+    RsdHal * dc = (RsdHal *)rsc->mHal.drv;
+    ConvolveParams *cp = (ConvolveParams *)intrinsicData;
+
+    if (cp) {
+        if (cp->scratch) {
+            for (size_t i = 0; i < dc->mWorkers.mCount + 1; i++) {
+                if (cp->scratch[i]) {
+                    free(cp->scratch[i]);
+                }
+            }
+            free(cp->scratch);
+        }
+        if (cp->scratchSize) {
+            free(cp->scratchSize);
+        }
+        free(cp);
+    }
+}
+
+void * rsdIntrinsic_InitBlur(const android::renderscript::Context *rsc,
                                     android::renderscript::Script *script,
                                     RsdIntriniscFuncs_t *funcs) {
 
+    RsdHal * dc = (RsdHal *)rsc->mHal.drv;
+
     script->mHal.info.exportedVariableCount = 2;
     funcs->setVarObj = Blur_Bind;
     funcs->setVar = Blur_SetVar;
     funcs->root = Blur_uchar4;
+    funcs->destroy = Destroy;
 
     ConvolveParams *cp = (ConvolveParams *)calloc(1, sizeof(ConvolveParams));
+    if (!cp) {
+        return NULL;
+    }
+
     cp->radius = 5;
+    cp->scratch = (void **)calloc(dc->mWorkers.mCount + 1, sizeof(void *));
+    cp->scratchSize = (size_t *)calloc(dc->mWorkers.mCount + 1, sizeof(size_t));
+    if (!cp->scratch || !cp->scratchSize) {
+        Destroy(rsc, script, cp);
+        return NULL;
+    }
+
     ComputeGaussianWeights(cp);
     return cp;
 }
diff --git a/driver/rsdIntrinsicColorMatrix.cpp b/driver/rsdIntrinsicColorMatrix.cpp
index 8f6c70c..cfe0333 100644
--- a/driver/rsdIntrinsicColorMatrix.cpp
+++ b/driver/rsdIntrinsicColorMatrix.cpp
@@ -97,9 +97,6 @@
     uint32_t x1 = xstart;
     uint32_t x2 = xend;
 
-    in += xstart;
-    out += xstart;
-
     if(x2 > x1) {
 #if defined(ARCH_ARM_HAVE_NEON)
         int32_t len = (x2 - x1) >> 2;
diff --git a/driver/rsdIntrinsicConvolve3x3.cpp b/driver/rsdIntrinsicConvolve3x3.cpp
index 55f4360..dac2f24 100644
--- a/driver/rsdIntrinsicConvolve3x3.cpp
+++ b/driver/rsdIntrinsicConvolve3x3.cpp
@@ -56,7 +56,7 @@
                         const float* coeff) {
 
     uint32_t x1 = rsMax((int32_t)x-1, 0);
-    uint32_t x2 = rsMin((int32_t)x+1, (int32_t)p->dimX);
+    uint32_t x2 = rsMin((int32_t)x+1, (int32_t)p->dimX-1);
 
     float4 px = convert_float4(py0[x1]) * coeff[0] +
                 convert_float4(py0[x]) * coeff[1] +
diff --git a/driver/rsdIntrinsicConvolve5x5.cpp b/driver/rsdIntrinsicConvolve5x5.cpp
index fc6b029..ac06304 100644
--- a/driver/rsdIntrinsicConvolve5x5.cpp
+++ b/driver/rsdIntrinsicConvolve5x5.cpp
@@ -134,7 +134,7 @@
 #if defined(ARCH_ARM_HAVE_NEON)
     if((x1 + 3) < x2) {
         uint32_t len = (x2 - x1 - 3) >> 1;
-        rsdIntrinsicConvolve5x5_K(out, py0, py1, py2, py3, py4, cp->ip, len);
+        rsdIntrinsicConvolve5x5_K(out, py0+x1-2, py1+x1-2, py2+x1-2, py3+x1-2, py4+x1-2, cp->ip, len);
         out += len << 1;
         x1 += len << 1;
     }
diff --git a/driver/rsdIntrinsicLUT.cpp b/driver/rsdIntrinsicLUT.cpp
index a75534e..818a132 100644
--- a/driver/rsdIntrinsicLUT.cpp
+++ b/driver/rsdIntrinsicLUT.cpp
@@ -44,9 +44,6 @@
     uint32_t x1 = xstart;
     uint32_t x2 = xend;
 
-    in += xstart;
-    out += xstart;
-
     DrvAllocation *din = (DrvAllocation *)cp->lut->mHal.drv;
     const uchar *tr = (const uchar *)din->lod[0].mallocPtr;
     const uchar *tg = &tr[256];
diff --git a/rs_hal.h b/rs_hal.h
index b0e10c9..f172fbf 100644
--- a/rs_hal.h
+++ b/rs_hal.h
@@ -58,6 +58,7 @@
     uint32_t lod;
     RsAllocationCubemapFace face;
     uint32_t ar[16];
+    uint32_t lid;
 
     uint32_t dimX;
     uint32_t dimY;