Cleanup of ForEachParams in cpu ref

Change-Id: I8cc51915b2a605c240d98e3010619b741a13bae2
diff --git a/cpu_ref/rsCpuCore.cpp b/cpu_ref/rsCpuCore.cpp
index 738eb84..40f4745 100644
--- a/cpu_ref/rsCpuCore.cpp
+++ b/cpu_ref/rsCpuCore.cpp
@@ -164,7 +164,7 @@
 
     // fast path for very small launches
     MTLaunchStruct *mtls = (MTLaunchStruct *)data;
-    if (mtls && mtls->fep.dimY <= 1 && mtls->xEnd <= mtls->xStart + mtls->mSliceSize) {
+    if (mtls && mtls->fep.dim.y <= 1 && mtls->xEnd <= mtls->xStart + mtls->mSliceSize) {
         if (mWorkers.mLaunchCallback) {
             mWorkers.mLaunchCallback(mWorkers.mLaunchData, 0);
         }
@@ -344,106 +344,100 @@
 }
 
 typedef void (*rs_t)(const void *, void *, const void *, uint32_t, uint32_t, uint32_t, uint32_t);
-typedef void (*walk_loop_t)(MTLaunchStruct*,
-                            RsExpandKernelParams&,
+typedef void (*walk_loop_t)(const MTLaunchStruct*,
+                            RsExpandKernelDriverInfo,
                             outer_foreach_t);
 
+static void kparamSetup(RsExpandKernelParams *kparams, const RsExpandKernelDriverInfo *fep) {
+    //ALOGE("kp  usr %p", fep->usr);
+    //ALOGE("kp  slot %i", fep->slot);
+    //ALOGE("kp  dim %i %i %i", fep->dim.x, fep->dim.y, fep->dim.z);
+    //ALOGE("kp  lid %i", fep->lid);
+    //ALOGE("kp  in[0] stide %i  ptr %p", fep->inStride[0], fep->inPtr[0]);
+    //ALOGE("kp  out[0] ptr %p", fep->outPtr[0]);
+    //ALOGE("kp  loc %i %i %i", fep->current.x, fep->current.y, fep->current.z);
 
-static void walk_wrapper(void* usr, uint32_t idx, walk_loop_t walk_loop) {
-    MTLaunchStruct *mtls = (MTLaunchStruct *)usr;
-
-    uint32_t inLen = mtls->fep.inLen;
-
-    RsExpandKernelParams kparams;
-    kparams.takeFields(mtls->fep);
-
-    // Used by CpuScriptGroup, IntrinsicBlur, and IntrinsicHistogram
-    kparams.lid = idx;
-
-    if (inLen > 0) {
-        // Allocate space for our input base pointers.
-        kparams.ins = (const void**)alloca(inLen * sizeof(void*));
-
-        // Allocate space for our input stride information.
-        kparams.inEStrides = (uint32_t*)alloca(inLen * sizeof(uint32_t));
-
-        // Fill our stride information.
-        for (int inIndex = inLen; --inIndex >= 0;) {
-          kparams.inEStrides[inIndex] = mtls->fep.inStrides[inIndex].eStride;
-        }
-    }
-
-    outer_foreach_t fn = (outer_foreach_t) mtls->kernel;
-
-    walk_loop(mtls, kparams, fn);
+    kparams->usr  = fep->usr;
+    kparams->slot = fep->slot;
+    kparams->dimX = fep->dim.x;
+    kparams->dimY = fep->dim.y;
+    kparams->dimZ = fep->dim.z;
+    kparams->lid = fep->lid;
+    kparams->inEStrides = (uint32_t *)&fep->inStride[0];
+    kparams->ins = (const void **)&fep->inPtr[0];
+    kparams->out = fep->outPtr[0];
+    kparams->y = fep->current.y;
+    kparams->z = fep->current.z;
 }
 
+static inline void fepPtrSetup(const MTLaunchStruct *mtls, RsExpandKernelDriverInfo *fep,
+                               uint32_t x, uint32_t y,
+                               uint32_t z = 0, uint32_t lod = 0,
+                               RsAllocationCubemapFace face = RS_ALLOCATION_CUBEMAP_FACE_POSITIVE_X,
+                               uint32_t a1 = 0, uint32_t a2 = 0, uint32_t a3 = 0, uint32_t a4 = 0) {
+
+    for (uint32_t i = 0; i < fep->inLen; i++) {
+        fep->inPtr[i] = (const uint8_t *)mtls->ains[i]->getPointerUnchecked(x, y, z, lod, face, a1, a2, a3, a4);
+    }
+
+    if (mtls->aout[0] != nullptr) {
+        fep->outPtr[0] = (uint8_t *)mtls->aout[0]->getPointerUnchecked(x, y, z, lod, face, a1, a2, a3, a4);
+    }
+}
+
+
 static void walk_2d(void *usr, uint32_t idx) {
-    walk_wrapper(usr, idx, [](MTLaunchStruct *mtls,
-                              RsExpandKernelParams &kparams,
-                              outer_foreach_t fn) {
+    MTLaunchStruct *mtls = (MTLaunchStruct *)usr;
+    RsExpandKernelDriverInfo fep = mtls->fep;
+    fep.lid = idx;
 
-        while (1) {
-            uint32_t slice  = (uint32_t)__sync_fetch_and_add(&mtls->mSliceNum, 1);
-            uint32_t yStart = mtls->yStart + slice * mtls->mSliceSize;
-            uint32_t yEnd   = yStart + mtls->mSliceSize;
+    while (1) {
+        uint32_t slice  = (uint32_t)__sync_fetch_and_add(&mtls->mSliceNum, 1);
+        uint32_t yStart = mtls->yStart + slice * mtls->mSliceSize;
+        uint32_t yEnd   = yStart + mtls->mSliceSize;
 
-            yEnd = rsMin(yEnd, mtls->yEnd);
+        yEnd = rsMin(yEnd, mtls->yEnd);
 
-            if (yEnd <= yStart) {
-                return;
-            }
-
-            for (kparams.y = yStart; kparams.y < yEnd; kparams.y++) {
-                kparams.out = mtls->fep.outPtr +
-                              (mtls->fep.outStride.yStride * kparams.y) +
-                              (mtls->fep.outStride.eStride * mtls->xStart);
-
-                for (int inIndex = mtls->fep.inLen; --inIndex >= 0;) {
-                    StridePair &strides = mtls->fep.inStrides[inIndex];
-
-                    kparams.ins[inIndex] =
-                      mtls->fep.inPtrs[inIndex] +
-                      (strides.yStride * kparams.y) +
-                      (strides.eStride * mtls->xStart);
-                }
-
-                fn(&kparams, mtls->xStart, mtls->xEnd,
-                   mtls->fep.outStride.eStride);
-            }
+        if (yEnd <= yStart) {
+            return;
         }
-    });
+
+        for (fep.current.y = yStart; fep.current.y < yEnd; fep.current.y++) {
+            fepPtrSetup(mtls, &fep, mtls->xStart, fep.current.y);
+
+            RsExpandKernelParams kparams;
+            kparamSetup(&kparams, &fep);
+
+            outer_foreach_t fn = (outer_foreach_t) mtls->kernel;
+            fn(&kparams, mtls->xStart, mtls->xEnd, fep.outStride[0]);
+        }
+    }
 }
 
 static void walk_1d(void *usr, uint32_t idx) {
-    walk_wrapper(usr, idx, [](MTLaunchStruct *mtls,
-                              RsExpandKernelParams &kparams,
-                              outer_foreach_t fn) {
+    MTLaunchStruct *mtls = (MTLaunchStruct *)usr;
+    RsExpandKernelDriverInfo fep = mtls->fep;
+    fep.lid = idx;
 
-        while (1) {
-            uint32_t slice  = (uint32_t)__sync_fetch_and_add(&mtls->mSliceNum, 1);
-            uint32_t xStart = mtls->xStart + slice * mtls->mSliceSize;
-            uint32_t xEnd   = xStart + mtls->mSliceSize;
+    while (1) {
+        uint32_t slice  = (uint32_t)__sync_fetch_and_add(&mtls->mSliceNum, 1);
+        uint32_t xStart = mtls->xStart + slice * mtls->mSliceSize;
+        uint32_t xEnd   = xStart + mtls->mSliceSize;
 
-            xEnd = rsMin(xEnd, mtls->xEnd);
+        xEnd = rsMin(xEnd, mtls->xEnd);
 
-            if (xEnd <= xStart) {
-                return;
-            }
-
-            kparams.out = mtls->fep.outPtr +
-                          (mtls->fep.outStride.eStride * xStart);
-
-            for (int inIndex = mtls->fep.inLen; --inIndex >= 0;) {
-                StridePair &strides = mtls->fep.inStrides[inIndex];
-
-                kparams.ins[inIndex] =
-                  mtls->fep.inPtrs[inIndex] + (strides.eStride * xStart);
-            }
-
-            fn(&kparams, xStart, xEnd, mtls->fep.outStride.eStride);
+        if (xEnd <= xStart) {
+            return;
         }
-    });
+
+        fepPtrSetup(mtls, &fep, xStart, 0);
+
+        RsExpandKernelParams kparams;
+        kparamSetup(&kparams, &fep);
+
+        outer_foreach_t fn = (outer_foreach_t) mtls->kernel;
+        fn(&kparams, xStart, xEnd, fep.outStride[0]);
+    }
 }
 
 
@@ -459,17 +453,17 @@
         const size_t targetByteChunk = 16 * 1024;
         mInForEach = true;
 
-        if (mtls->fep.dimY > 1) {
-            uint32_t s1 = mtls->fep.dimY / ((mWorkers.mCount + 1) * 4);
+        if (mtls->fep.dim.y > 1) {
+            uint32_t s1 = mtls->fep.dim.y / ((mWorkers.mCount + 1) * 4);
             uint32_t s2 = 0;
 
             // This chooses our slice size to rate limit atomic ops to
             // one per 16k bytes of reads/writes.
-            if (mtls->fep.outStride.yStride) {
-                s2 = targetByteChunk / mtls->fep.outStride.yStride;
+            if ((mtls->aout[0] != nullptr) && mtls->aout[0]->mHal.drvState.lod[0].stride) {
+                s2 = targetByteChunk / mtls->aout[0]->mHal.drvState.lod[0].stride;
             } else {
                 // We know that there is either an output or an input.
-                s2 = targetByteChunk / mtls->fep.inStrides[0].yStride;
+                s2 = targetByteChunk / mtls->ains[0]->mHal.drvState.lod[0].stride;
             }
             mtls->mSliceSize = rsMin(s1, s2);
 
@@ -479,16 +473,16 @@
 
             launchThreads(walk_2d, mtls);
         } else {
-            uint32_t s1 = mtls->fep.dimX / ((mWorkers.mCount + 1) * 4);
+            uint32_t s1 = mtls->fep.dim.x / ((mWorkers.mCount + 1) * 4);
             uint32_t s2 = 0;
 
             // This chooses our slice size to rate limit atomic ops to
             // one per 16k bytes of reads/writes.
-            if (mtls->fep.outStride.eStride) {
-                s2 = targetByteChunk / mtls->fep.outStride.eStride;
+            if ((mtls->aout[0] != nullptr) && mtls->aout[0]->getType()->getElementSizeBytes()) {
+                s2 = targetByteChunk / mtls->aout[0]->getType()->getElementSizeBytes();
             } else {
                 // We know that there is either an output or an input.
-                s2 = targetByteChunk / mtls->fep.inStrides[0].eStride;
+                s2 = targetByteChunk / mtls->ains[0]->getType()->getElementSizeBytes();
             }
             mtls->mSliceSize = rsMin(s1, s2);
 
@@ -501,53 +495,21 @@
         mInForEach = false;
 
     } else {
-        RsExpandKernelParams kparams;
-        kparams.takeFields(mtls->fep);
-
-        if (inLen > 0) {
-            // Allocate space for our input base pointers.
-            kparams.ins = (const void**)alloca(inLen * sizeof(void*));
-
-            // Allocate space for our input stride information.
-            kparams.inEStrides = (uint32_t*)alloca(inLen * sizeof(uint32_t));
-
-            // Fill our stride information.
-            for (int inIndex = inLen; --inIndex >= 0;) {
-                kparams.inEStrides[inIndex] =
-                    mtls->fep.inStrides[inIndex].eStride;
-            }
-        }
-
-        //ALOGE("launch 3");
         outer_foreach_t fn = (outer_foreach_t) mtls->kernel;
         for (uint32_t arrayIndex = mtls->arrayStart;
              arrayIndex < mtls->arrayEnd; arrayIndex++) {
 
-            for (kparams.z = mtls->zStart; kparams.z < mtls->zEnd;
-                 kparams.z++) {
+            for (mtls->fep.current.z = mtls->zStart; mtls->fep.current.z < mtls->zEnd;
+                 mtls->fep.current.z++) {
 
-                for (kparams.y = mtls->yStart; kparams.y < mtls->yEnd;
-                     kparams.y++) {
+                for (mtls->fep.current.y = mtls->yStart; mtls->fep.current.y < mtls->yEnd;
+                     mtls->fep.current.y++) {
 
-                    uint32_t offset =
-                      mtls->fep.dimY * mtls->fep.dimZ * arrayIndex +
-                      mtls->fep.dimY * kparams.z + kparams.y;
+                    fepPtrSetup(mtls, &mtls->fep, mtls->xStart, mtls->fep.current.y, mtls->fep.current.z);
 
-                    kparams.out = mtls->fep.outPtr +
-                                  (mtls->fep.outStride.yStride * offset) +
-                                  (mtls->fep.outStride.eStride * mtls->xStart);
-
-                    for (int inIndex = inLen; --inIndex >= 0;) {
-                        StridePair &strides = mtls->fep.inStrides[inIndex];
-
-                        kparams.ins[inIndex] =
-                          mtls->fep.inPtrs[inIndex] +
-                          (strides.yStride * offset) +
-                          (strides.eStride * mtls->xStart);
-                    }
-
-                    fn(&kparams, mtls->xStart, mtls->xEnd,
-                       mtls->fep.outStride.eStride);
+                    RsExpandKernelParams kparams;
+                    kparamSetup(&kparams, &mtls->fep);
+                    fn(&kparams, mtls->xStart, mtls->xEnd, mtls->fep.outStride[0]);
                 }
             }
         }