Support for general reduction kernels.

Requires coordinated change in frameworks/base.

Requires coordinated change in frameworks/compile/libbcc in order
for RsTest to run.

At present, general reduction kernels are run single-threaded.

Also: Remove dead struct field MTLaunchStructForEach::sig.

Bug: 23535724
Change-Id: Ice17ccf20a902f8a106eaa62ec071d46e3c0ad8c
diff --git a/cpu_ref/rsCpuCore.cpp b/cpu_ref/rsCpuCore.cpp
index 48e8dbb..b8b4838 100644
--- a/cpu_ref/rsCpuCore.cpp
+++ b/cpu_ref/rsCpuCore.cpp
@@ -342,6 +342,18 @@
     }
 }
 
+// Set up the input pointers in the kernel driver info structure (output pointers are not touched).
+// Inputs:
+//   mtls - The MTLaunchStruct for this launch; its ains[] supplies the input allocations
+//   redp - The reduce parameters (driver info structure); inLen is read, inPtr[0..inLen-1] is written
+//   x, y, z - The coordinates handed to getPointerUnchecked() for every input allocation
+static inline void RedpPtrSetup(const MTLaunchStructReduceNew *mtls, RsExpandKernelDriverInfo *redp,
+                                uint32_t x, uint32_t y, uint32_t z) {
+    for (uint32_t i = 0; i < redp->inLen; i++) {
+        redp->inPtr[i] = (const uint8_t *)mtls->ains[i]->getPointerUnchecked(x, y, z);
+    }
+}
+
 static uint32_t sliceInt(uint32_t *p, uint32_t val, uint32_t start, uint32_t end) {
     if (start >= end) {
         *p = start;
@@ -355,16 +367,16 @@
     return n;
 }
 
-static bool SelectOuterSlice(const MTLaunchStructForEach *mtls, RsExpandKernelDriverInfo* fep, uint32_t sliceNum) {
+static bool SelectOuterSlice(const MTLaunchStructCommon *mtls, RsExpandKernelDriverInfo* info, uint32_t sliceNum) {  // fills info->current from sliceNum
 
     uint32_t r = sliceNum;
-    r = sliceInt(&fep->current.z, r, mtls->start.z, mtls->end.z);
-    r = sliceInt(&fep->current.lod, r, mtls->start.lod, mtls->end.lod);
-    r = sliceInt(&fep->current.face, r, mtls->start.face, mtls->end.face);
-    r = sliceInt(&fep->current.array[0], r, mtls->start.array[0], mtls->end.array[0]);
-    r = sliceInt(&fep->current.array[1], r, mtls->start.array[1], mtls->end.array[1]);
-    r = sliceInt(&fep->current.array[2], r, mtls->start.array[2], mtls->end.array[2]);
-    r = sliceInt(&fep->current.array[3], r, mtls->start.array[3], mtls->end.array[3]);
+    r = sliceInt(&info->current.z, r, mtls->start.z, mtls->end.z);  // each call's result feeds the next dimension
+    r = sliceInt(&info->current.lod, r, mtls->start.lod, mtls->end.lod);
+    r = sliceInt(&info->current.face, r, mtls->start.face, mtls->end.face);
+    r = sliceInt(&info->current.array[0], r, mtls->start.array[0], mtls->end.array[0]);
+    r = sliceInt(&info->current.array[1], r, mtls->start.array[1], mtls->end.array[1]);
+    r = sliceInt(&info->current.array[2], r, mtls->start.array[2], mtls->end.array[2]);
+    r = sliceInt(&info->current.array[3], r, mtls->start.array[3], mtls->end.array[3]);  // the result is true only if r ends at 0
     return r == 0;
 }
 
@@ -446,7 +458,7 @@
     }
 }
 
-// Launch a reduce-style kernel.
+// Launch a simple reduce-style kernel.
 // Inputs:
 //  ain:  The allocation that contains the input
 //  aout: The allocation that will hold the output
@@ -465,6 +477,50 @@
     mtls->kernel(&mtls->inBuf[startOffset], mtls->outBuf, xEnd - xStart);
 }
 
+// Launch a general reduce-style kernel; the whole reduction runs inline in this function.
+// Inputs:
+//   ains[0..inLen-1]: Array of allocations that contain the inputs (not read here; mtls->ains is used)
+//   aout:             The allocation that will hold the output (not read here; mtls->redp.outPtr[0] is used)
+//   mtls:             Holds launch parameters
+void RsdCpuReferenceImpl::launchReduceNew(const Allocation ** ains,
+                                          uint32_t inLen,
+                                          Allocation * aout,
+                                          MTLaunchStructReduceNew *mtls) {
+  // In the presence of outconverter, we allocate temporary memory for
+  // the accumulator (mtls->accumSize bytes, released after outconvert).
+  // NOTE(review): the malloc() result is not checked for nullptr before use.
+  // In the absence of outconverter, we use the output allocation as the
+  // accumulator.
+  uint8_t *const accumPtr = (mtls->outFunc
+                             ? static_cast<uint8_t *>(malloc(mtls->accumSize))
+                             : mtls->redp.outPtr[0]);
+
+  // initialize: call the supplied initializer if there is one, otherwise zero-fill accumSize bytes
+  if (mtls->initFunc) {
+    mtls->initFunc(accumPtr);
+  } else {
+    memset(accumPtr, 0, mtls->accumSize);
+  }
+
+  // accumulate: one accumFunc call per row (y) of each outer slice, passing start.x and end.x
+  const ReduceNewAccumulatorFunc_t fn = mtls->accumFunc;
+  uint32_t slice = 0;
+  while (SelectOuterSlice(mtls, &mtls->redp, slice++)) {
+    for (mtls->redp.current.y = mtls->start.y;
+         mtls->redp.current.y < mtls->end.y;
+         mtls->redp.current.y++) {
+      RedpPtrSetup(mtls, &mtls->redp, mtls->start.x, mtls->redp.current.y, mtls->redp.current.z);
+      fn(&mtls->redp, mtls->start.x, mtls->end.x, accumPtr);
+    }
+  }
+
+  // outconvert: pass the output pointer and the accumulator to outFunc, then free the temporary
+  if (mtls->outFunc) {
+    mtls->outFunc(mtls->redp.outPtr[0], accumPtr);
+    free(accumPtr);
+  }
+}
+
 void RsdCpuReferenceImpl::launchForEach(const Allocation ** ains,
                                         uint32_t inLen,
                                         Allocation* aout,