Add a basic implementation of the reduce kernel API to the CPU
reference implementation.

Bug: 22631253

For now, this just runs a serial reduction on one thread.

Change-Id: I34c96d24bb6f44274de72bb53160abcf79d143b0
diff --git a/cpu_ref/rsCpuCore.cpp b/cpu_ref/rsCpuCore.cpp
index 4367bd4..48e8dbb 100644
--- a/cpu_ref/rsCpuCore.cpp
+++ b/cpu_ref/rsCpuCore.cpp
@@ -45,11 +45,6 @@
 using namespace android;
 using namespace android::renderscript;
 
-typedef void (*outer_foreach_t)(
-    const RsExpandKernelDriverInfo *,
-    uint32_t x1, uint32_t x2, uint32_t outstep);
-
-
 static pthread_key_t gThreadTLSKey = 0;
 static uint32_t gThreadTLSKeyCount = 0;
 static pthread_mutex_t gInitMutex = PTHREAD_MUTEX_INITIALIZER;
@@ -153,13 +148,15 @@
     return nullptr;
 }
 
+// Launch a kernel.
+// The callback function is called to execute the kernel.
 void RsdCpuReferenceImpl::launchThreads(WorkerCallback_t cbk, void *data) {
     mWorkers.mLaunchData = data;
     mWorkers.mLaunchCallback = cbk;
 
     // fast path for very small launches
-    MTLaunchStruct *mtls = (MTLaunchStruct *)data;
-    if (mtls && mtls->fep.dim.y <= 1 && mtls->end.x <= mtls->start.x + mtls->mSliceSize) {
+    MTLaunchStructCommon *mtls = (MTLaunchStructCommon *)data;
+    if (mtls && mtls->dimPtr->y <= 1 && mtls->end.x <= mtls->start.x + mtls->mSliceSize) {
         if (mWorkers.mLaunchCallback) {
             mWorkers.mLaunchCallback(mWorkers.mLaunchData, 0);
         }
@@ -220,7 +217,6 @@
 
 bool RsdCpuReferenceImpl::init(uint32_t version_major, uint32_t version_minor,
                                sym_lookup_t lfn, script_lookup_t slfn) {
-
     mSymLookupFn = lfn;
     mScriptLookupFn = slfn;
 
@@ -328,16 +324,19 @@
 
 }
 
-static inline void FepPtrSetup(const MTLaunchStruct *mtls, RsExpandKernelDriverInfo *fep,
+// Set up the appropriate input and output pointers to the kernel driver info structure.
+// Inputs:
+//   mtls - The MTLaunchStruct holding information about the kernel launch
+//   fep - The forEach parameters (driver info structure)
+//   x, y, z, lod, face, a1, a2, a3, a4 - The start offsets into each dimension
+static inline void FepPtrSetup(const MTLaunchStructForEach *mtls, RsExpandKernelDriverInfo *fep,
                                uint32_t x, uint32_t y,
                                uint32_t z = 0, uint32_t lod = 0,
                                RsAllocationCubemapFace face = RS_ALLOCATION_CUBEMAP_FACE_POSITIVE_X,
                                uint32_t a1 = 0, uint32_t a2 = 0, uint32_t a3 = 0, uint32_t a4 = 0) {
-
     for (uint32_t i = 0; i < fep->inLen; i++) {
         fep->inPtr[i] = (const uint8_t *)mtls->ains[i]->getPointerUnchecked(x, y, z, lod, face, a1, a2, a3, a4);
     }
-
     if (mtls->aout[0] != nullptr) {
         fep->outPtr[0] = (uint8_t *)mtls->aout[0]->getPointerUnchecked(x, y, z, lod, face, a1, a2, a3, a4);
     }
@@ -356,7 +355,7 @@
     return n;
 }
 
-static bool SelectOuterSlice(const MTLaunchStruct *mtls, RsExpandKernelDriverInfo* fep, uint32_t sliceNum) {
+static bool SelectOuterSlice(const MTLaunchStructForEach *mtls, RsExpandKernelDriverInfo* fep, uint32_t sliceNum) {
 
     uint32_t r = sliceNum;
     r = sliceInt(&fep->current.z, r, mtls->start.z, mtls->end.z);
@@ -371,10 +370,10 @@
 
 
 static void walk_general(void *usr, uint32_t idx) {
-    MTLaunchStruct *mtls = (MTLaunchStruct *)usr;
+    MTLaunchStructForEach *mtls = (MTLaunchStructForEach *)usr;
     RsExpandKernelDriverInfo fep = mtls->fep;
     fep.lid = idx;
-    outer_foreach_t fn = (outer_foreach_t) mtls->kernel;
+    ForEachFunc_t fn = mtls->kernel;
 
 
     while(1) {
@@ -400,10 +399,10 @@
 }
 
 static void walk_2d(void *usr, uint32_t idx) {
-    MTLaunchStruct *mtls = (MTLaunchStruct *)usr;
+    MTLaunchStructForEach *mtls = (MTLaunchStructForEach *)usr;
     RsExpandKernelDriverInfo fep = mtls->fep;
     fep.lid = idx;
-    outer_foreach_t fn = (outer_foreach_t) mtls->kernel;
+    ForEachFunc_t fn = mtls->kernel;
 
     while (1) {
         uint32_t slice  = (uint32_t)__sync_fetch_and_add(&mtls->mSliceNum, 1);
@@ -425,10 +424,10 @@
 }
 
 static void walk_1d(void *usr, uint32_t idx) {
-    MTLaunchStruct *mtls = (MTLaunchStruct *)usr;
+    MTLaunchStructForEach *mtls = (MTLaunchStructForEach *)usr;
     RsExpandKernelDriverInfo fep = mtls->fep;
     fep.lid = idx;
-    outer_foreach_t fn = (outer_foreach_t) mtls->kernel;
+    ForEachFunc_t fn = mtls->kernel;
 
     while (1) {
         uint32_t slice  = (uint32_t)__sync_fetch_and_add(&mtls->mSliceNum, 1);
@@ -447,11 +446,30 @@
     }
 }
 
-void RsdCpuReferenceImpl::launchThreads(const Allocation ** ains,
+// Launch a reduce-style kernel.
+// Inputs:
+//  ain:  The allocation that contains the input
+//  aout: The allocation that will hold the output
+//  mtls: Holds launch parameters
+void RsdCpuReferenceImpl::launchReduce(const Allocation *ain,
+                                       Allocation *aout,
+                                       MTLaunchStructReduce *mtls) {
+    const uint32_t xStart = mtls->start.x;
+    const uint32_t xEnd = mtls->end.x;
+
+    if (xStart >= xEnd) {
+      return;
+    }
+
+    const uint32_t startOffset = ain->getType()->getElementSizeBytes() * xStart;
+    mtls->kernel(&mtls->inBuf[startOffset], mtls->outBuf, xEnd - xStart);
+}
+
+void RsdCpuReferenceImpl::launchForEach(const Allocation ** ains,
                                         uint32_t inLen,
                                         Allocation* aout,
                                         const RsScriptCall* sc,
-                                        MTLaunchStruct* mtls) {
+                                        MTLaunchStructForEach* mtls) {
 
     //android::StopWatch kernel_time("kernel time");
 
@@ -519,7 +537,7 @@
         mInForEach = false;
 
     } else {
-        outer_foreach_t fn = (outer_foreach_t) mtls->kernel;
+        ForEachFunc_t fn = mtls->kernel;
         uint32_t slice = 0;