Support for general reduction kernels.

Requires coordinated change in frameworks/base.

Requires coordinated change in frameworks/compile/libbcc in order
for RsTest to run.

At present, general reduction kernels are run single-threaded.

Also: Remove dead struct field MTLaunchStructForEach::sig.

Bug: 23535724
Change-Id: Ice17ccf20a902f8a106eaa62ec071d46e3c0ad8c
diff --git a/cpu_ref/rsCpuCore.cpp b/cpu_ref/rsCpuCore.cpp
index 48e8dbb..b8b4838 100644
--- a/cpu_ref/rsCpuCore.cpp
+++ b/cpu_ref/rsCpuCore.cpp
@@ -342,6 +342,18 @@
     }
 }
 
+// Point every kernel input pointer at the cell where processing starts.
+// Inputs:
+//   mtls - The MTLaunchStruct holding information about the kernel launch
+//   redp - The reduce parameters (driver info structure); inPtr[] is updated
+//   x, y, z - The start offsets into each dimension
+static inline void RedpPtrSetup(const MTLaunchStructReduceNew *mtls, RsExpandKernelDriverInfo *redp,
+                                uint32_t x, uint32_t y, uint32_t z) {
+    for (uint32_t input = 0; input != redp->inLen; ++input) {
+        redp->inPtr[input] = (const uint8_t *)mtls->ains[input]->getPointerUnchecked(x, y, z);
+    }
+}
+
 static uint32_t sliceInt(uint32_t *p, uint32_t val, uint32_t start, uint32_t end) {
     if (start >= end) {
         *p = start;
@@ -355,16 +367,16 @@
     return n;
 }
 
-static bool SelectOuterSlice(const MTLaunchStructForEach *mtls, RsExpandKernelDriverInfo* fep, uint32_t sliceNum) {
+static bool SelectOuterSlice(const MTLaunchStructCommon *mtls, RsExpandKernelDriverInfo* info, uint32_t sliceNum) {
 
     uint32_t r = sliceNum;
-    r = sliceInt(&fep->current.z, r, mtls->start.z, mtls->end.z);
-    r = sliceInt(&fep->current.lod, r, mtls->start.lod, mtls->end.lod);
-    r = sliceInt(&fep->current.face, r, mtls->start.face, mtls->end.face);
-    r = sliceInt(&fep->current.array[0], r, mtls->start.array[0], mtls->end.array[0]);
-    r = sliceInt(&fep->current.array[1], r, mtls->start.array[1], mtls->end.array[1]);
-    r = sliceInt(&fep->current.array[2], r, mtls->start.array[2], mtls->end.array[2]);
-    r = sliceInt(&fep->current.array[3], r, mtls->start.array[3], mtls->end.array[3]);
+    r = sliceInt(&info->current.z, r, mtls->start.z, mtls->end.z);
+    r = sliceInt(&info->current.lod, r, mtls->start.lod, mtls->end.lod);
+    r = sliceInt(&info->current.face, r, mtls->start.face, mtls->end.face);
+    r = sliceInt(&info->current.array[0], r, mtls->start.array[0], mtls->end.array[0]);
+    r = sliceInt(&info->current.array[1], r, mtls->start.array[1], mtls->end.array[1]);
+    r = sliceInt(&info->current.array[2], r, mtls->start.array[2], mtls->end.array[2]);
+    r = sliceInt(&info->current.array[3], r, mtls->start.array[3], mtls->end.array[3]);
     return r == 0;
 }
 
@@ -446,7 +458,7 @@
     }
 }
 
-// Launch a reduce-style kernel.
+// Launch a simple reduce-style kernel.
 // Inputs:
 //  ain:  The allocation that contains the input
 //  aout: The allocation that will hold the output
@@ -465,6 +477,50 @@
     mtls->kernel(&mtls->inBuf[startOffset], mtls->outBuf, xEnd - xStart);
 }
 
+// Launch a general reduce-style kernel.  Runs single-threaded.
+// Inputs:
+//   ains[0..inLen-1]: Array of allocations that contain the inputs
+//   aout:             The allocation that will hold the output
+//   mtls:             Holds launch parameters
+void RsdCpuReferenceImpl::launchReduceNew(const Allocation ** ains,
+                                          uint32_t inLen,
+                                          Allocation * aout,
+                                          MTLaunchStructReduceNew *mtls) {
+  // With an outconverter, accumulate into temporary memory; without one,
+  // accumulate directly into the output allocation.
+  uint8_t *const accumPtr = (mtls->outFunc
+                             ? static_cast<uint8_t *>(malloc(mtls->accumSize))
+                             : mtls->redp.outPtr[0]);
+  if (accumPtr == nullptr) {
+    ALOGE("launchReduceNew: no accumulator buffer (%zu bytes)", mtls->accumSize);
+    return;
+  }
+
+  // initialize: user initializer if present, otherwise zero-fill
+  if (mtls->initFunc) {
+    mtls->initFunc(accumPtr);
+  } else {
+    memset(accumPtr, 0, mtls->accumSize);
+  }
+
+  // accumulate: one call per row (y) of every outer slice, over the full x range
+  const ReduceNewAccumulatorFunc_t fn = mtls->accumFunc;
+  uint32_t slice = 0;
+  while (SelectOuterSlice(mtls, &mtls->redp, slice++)) {
+    for (mtls->redp.current.y = mtls->start.y; mtls->redp.current.y < mtls->end.y;
+         mtls->redp.current.y++) {
+      RedpPtrSetup(mtls, &mtls->redp, mtls->start.x, mtls->redp.current.y, mtls->redp.current.z);
+      fn(&mtls->redp, mtls->start.x, mtls->end.x, accumPtr);
+    }
+  }
+
+  // outconvert: write the final result, then release the temporary accumulator
+  if (mtls->outFunc) {
+    mtls->outFunc(mtls->redp.outPtr[0], accumPtr);
+    free(accumPtr);
+  }
+}
+
 void RsdCpuReferenceImpl::launchForEach(const Allocation ** ains,
                                         uint32_t inLen,
                                         Allocation* aout,
diff --git a/cpu_ref/rsCpuCore.h b/cpu_ref/rsCpuCore.h
index cfdb29a..939b7ae 100644
--- a/cpu_ref/rsCpuCore.h
+++ b/cpu_ref/rsCpuCore.h
@@ -33,11 +33,21 @@
 
 // Function types found in RenderScript code
 typedef void (*ReduceFunc_t)(const uint8_t *inBuf, uint8_t *outBuf, uint32_t len);
+typedef void (*ReduceNewAccumulatorFunc_t)(const RsExpandKernelDriverInfo *info, uint32_t x1, uint32_t x2, uint8_t *accum);
+typedef void (*ReduceNewInitializerFunc_t)(uint8_t *accum);
+typedef void (*ReduceNewOutConverterFunc_t)(uint8_t *out, const uint8_t *accum);
 typedef void (*ForEachFunc_t)(const RsExpandKernelDriverInfo *info, uint32_t x1, uint32_t x2, uint32_t outStride);
 typedef void (*InvokeFunc_t)(void *params);
 typedef void (*InitOrDtorFunc_t)(void);
 typedef int  (*RootFunc_t)(void);
 
+struct ReduceNewDescription {
+    ReduceNewAccumulatorFunc_t  accumFunc;  // expanded accumulator function (".expand" symbol)
+    ReduceNewInitializerFunc_t  initFunc;   // user initializer function, or nullptr if none
+    ReduceNewOutConverterFunc_t outFunc;    // user outconverter function, or nullptr if none
+    size_t                      accumSize;  // accumulator datum size, in bytes
+};
+
 // Internal driver callback used to execute a kernel
 typedef void (*WorkerCallback_t)(void *usr, uint32_t idx);
 
@@ -72,7 +82,6 @@
     RsExpandKernelDriverInfo fep;
 
     ForEachFunc_t kernel;
-    uint32_t sig;
     const Allocation *ains[RS_KERNEL_INPUT_LIMIT];
     Allocation *aout[RS_KERNEL_INPUT_LIMIT];
 };
@@ -84,6 +93,19 @@
     RsLaunchDimensions inputDim;
 };
 
+struct MTLaunchStructReduceNew : public MTLaunchStructCommon {
+    // Driver info structure passed to the accumulator function
+    RsExpandKernelDriverInfo redp;
+
+    const Allocation *ains[RS_KERNEL_INPUT_LIMIT];  // input allocations
+
+    ReduceNewAccumulatorFunc_t accumFunc;  // expanded accumulator; must not be nullptr
+    ReduceNewInitializerFunc_t initFunc;   // optional; nullptr means zero-fill the accumulator
+    ReduceNewOutConverterFunc_t outFunc;   // optional; nullptr means accumulate into the output
+
+    size_t accumSize;  // accumulator datum size in bytes
+};
+
 class RsdCpuReferenceImpl : public RsdCpuReference {
 public:
     ~RsdCpuReferenceImpl() override;
@@ -107,10 +129,14 @@
     void launchForEach(const Allocation **ains, uint32_t inLen, Allocation *aout,
                        const RsScriptCall *sc, MTLaunchStructForEach *mtls);
 
-    // Launch a reduce kernel
+    // Launch a simple reduce kernel
     void launchReduce(const Allocation *ain, Allocation *aout,
                       MTLaunchStructReduce *mtls);
 
+    // Launch a general reduce kernel
+    void launchReduceNew(const Allocation ** ains, uint32_t inLen, Allocation *aout,
+                         MTLaunchStructReduceNew *mtls);
+
     CpuScript * createScript(const ScriptC *s, char const *resName, char const *cacheDir,
                              uint8_t const *bitcode, size_t bitcodeSize, uint32_t flags) override;
     CpuScript * createIntrinsic(const Script *s, RsScriptIntrinsicID iid, Element *e) override;
diff --git a/cpu_ref/rsCpuExecutable.cpp b/cpu_ref/rsCpuExecutable.cpp
index d654743..5dd31ee 100644
--- a/cpu_ref/rsCpuExecutable.cpp
+++ b/cpu_ref/rsCpuExecutable.cpp
@@ -319,6 +319,7 @@
     ForEachFunc_t* forEachFunctions = nullptr;
     uint32_t* forEachSignatures = nullptr;
     ReduceFunc_t* reduceFunctions = nullptr;
+    ReduceNewDescription* reduceNewDescriptions = nullptr;
     const char ** pragmaKeys = nullptr;
     const char ** pragmaValues = nullptr;
     uint32_t checksum = 0;
@@ -485,7 +486,7 @@
         }
     }
 
-    // Read general reduce kernels (for now, we expect the count to be zero)
+    // Read general reduce kernels
     if (strgets(line, MAXLINE, &rsInfo) == nullptr) {
         goto error;
     }
@@ -493,11 +494,91 @@
         ALOGE("Invalid export reduce new count!: %s", line);
         goto error;
     }
-    if (reduceNewCount != 0) {
-        ALOGE("Expected export reduce new count to be zero!: %s", line);
+
+    reduceNewDescriptions = new ReduceNewDescription[reduceNewCount];
+    if (reduceNewDescriptions == nullptr) {
         goto error;
     }
 
+    for (size_t i = 0; i < reduceNewCount; ++i) {
+        static const char kNoName[] = ".";
+
+        unsigned int tmpSig = 0;
+        size_t tmpSize = 0;
+        char tmpNameReduce[MAXLINE];
+        char tmpNameInitializer[MAXLINE];
+        char tmpNameAccumulator[MAXLINE];
+        char tmpNameCombiner[MAXLINE];
+        char tmpNameOutConverter[MAXLINE];
+        char tmpNameHalter[MAXLINE];
+
+        if (strgets(line, MAXLINE, &rsInfo) == nullptr) {
+            goto error;
+        }
+#define DELIMNAME " - %" MAKE_STR(MAXLINE) "s"
+        if (sscanf(line, "%u - %zu" DELIMNAME DELIMNAME DELIMNAME DELIMNAME DELIMNAME DELIMNAME,
+                   &tmpSig, &tmpSize, tmpNameReduce, tmpNameInitializer, tmpNameAccumulator,
+                   tmpNameCombiner, tmpNameOutConverter, tmpNameHalter) != 8) {
+            ALOGE("Invalid export reduce new!: %s", line);
+            goto error;
+        }
+#undef DELIMNAME
+
+        // For now, we expect
+        // - Reduce and Accumulator names
+        // - optional Initializer, Combiner, and OutConverter name
+        // - no Halter name
+        if (!strcmp(tmpNameReduce, kNoName) ||
+            !strcmp(tmpNameAccumulator, kNoName)) {
+            ALOGE("Expected reduce and accumulator names!: %s", line);
+            goto error;
+        }
+        if (strcmp(tmpNameHalter, kNoName)) {
+            ALOGE("Did not expect halter name!: %s", line);
+            goto error;
+        }
+
+        // The current implementation does not use the signature,
+        // reduce name, or combiner.
+
+        reduceNewDescriptions[i].accumSize = tmpSize;
+
+        // Process the (optional) initializer.
+        if (strcmp(tmpNameInitializer, kNoName)) {
+          // Lookup the original user-written initializer.
+          if (!(reduceNewDescriptions[i].initFunc =
+                (ReduceNewInitializerFunc_t) dlsym(sharedObj, tmpNameInitializer))) {
+            ALOGE("Failed to find initializer function address for %s(): %s",
+                  tmpNameInitializer, dlerror());
+            goto error;
+          }
+        } else {
+          reduceNewDescriptions[i].initFunc = nullptr;
+        }
+
+        // Lookup the expanded accumulator.
+        strncat(tmpNameAccumulator, ".expand", MAXLINE-1-strlen(tmpNameAccumulator));
+        if (!(reduceNewDescriptions[i].accumFunc =
+              (ReduceNewAccumulatorFunc_t) dlsym(sharedObj, tmpNameAccumulator))) {
+            ALOGE("Failed to find accumulator function address for %s(): %s",
+                  tmpNameAccumulator, dlerror());
+            goto error;
+        }
+
+        // Process the (optional) outconverter.
+        if (strcmp(tmpNameOutConverter, kNoName)) {
+          // Lookup the original user-written outconverter.
+          if (!(reduceNewDescriptions[i].outFunc =
+                (ReduceNewOutConverterFunc_t) dlsym(sharedObj, tmpNameOutConverter))) {
+            ALOGE("Failed to find outconverter function address for %s(): %s",
+                  tmpNameOutConverter, dlerror());
+            goto error;
+          }
+        } else {
+          reduceNewDescriptions[i].outFunc = nullptr;
+        }
+    }
+
     if (strgets(line, MAXLINE, &rsInfo) == nullptr) {
         goto error;
     }
@@ -631,6 +712,7 @@
         invokeFunctions, funcCount,
         forEachFunctions, forEachSignatures, forEachCount,
         reduceFunctions, reduceCount,
+        reduceNewDescriptions, reduceNewCount,
         pragmaKeys, pragmaValues, pragmaCount,
         rsGlobalNames, rsGlobalAddresses, rsGlobalSizes, rsGlobalProperties,
         numEntries, isThreadable, checksum);
diff --git a/cpu_ref/rsCpuExecutable.h b/cpu_ref/rsCpuExecutable.h
index fe9c2ad..6e4d325 100644
--- a/cpu_ref/rsCpuExecutable.h
+++ b/cpu_ref/rsCpuExecutable.h
@@ -69,6 +69,7 @@
                      ForEachFunc_t* forEachFunctions, uint32_t* forEachSignatures,
                      size_t forEachCount,
                      ReduceFunc_t* reduceFunctions, size_t reduceCount,
+                     ReduceNewDescription *reduceNewDescriptions, size_t reduceNewCount,
                      const char** pragmaKeys, const char** pragmaValues,
                      size_t pragmaCount,
                      const char **globalNames, const void **globalAddresses,
@@ -81,6 +82,7 @@
         mForEachFunctions(forEachFunctions), mForEachSignatures(forEachSignatures),
         mForEachCount(forEachCount),
         mReduceFunctions(reduceFunctions), mReduceCount(reduceCount),
+        mReduceNewDescriptions(reduceNewDescriptions), mReduceNewCount(reduceNewCount),
         mPragmaKeys(pragmaKeys), mPragmaValues(pragmaValues),
         mPragmaCount(pragmaCount), mGlobalNames(globalNames),
         mGlobalAddresses(globalAddresses), mGlobalSizes(globalSizes),
@@ -109,6 +111,8 @@
 
         delete[] mReduceFunctions;
 
+        delete[] mReduceNewDescriptions;
+
         delete[] mForEachSignatures;
         delete[] mForEachFunctions;
 
@@ -134,6 +138,7 @@
     size_t getExportedFunctionCount() const { return mFuncCount; }
     size_t getExportedForEachCount() const { return mForEachCount; }
     size_t getExportedReduceCount() const { return mReduceCount; }
+    size_t getExportedReduceNewCount() const { return mReduceNewCount; }
     size_t getPragmaCount() const { return mPragmaCount; }
 
     void* getFieldAddress(int slot) const { return mFieldAddress[slot]; }
@@ -148,6 +153,10 @@
 
     ReduceFunc_t getReduceFunction(int slot) const { return mReduceFunctions[slot]; }
 
+    const ReduceNewDescription* getReduceNewDescription(int slot) const {
+        return mReduceNewDescriptions + slot;  // no bounds check, like getReduceFunction
+    }
+
     const char ** getPragmaKeys() const { return mPragmaKeys; }
     const char ** getPragmaValues() const { return mPragmaValues; }
 
@@ -203,6 +212,9 @@
     ReduceFunc_t* mReduceFunctions;
     size_t mReduceCount;
 
+    ReduceNewDescription* mReduceNewDescriptions;
+    size_t mReduceNewCount;
+
     const char ** mPragmaKeys;
     const char ** mPragmaValues;
     size_t mPragmaCount;
diff --git a/cpu_ref/rsCpuScript.cpp b/cpu_ref/rsCpuScript.cpp
index 5adca54..7308b54 100644
--- a/cpu_ref/rsCpuScript.cpp
+++ b/cpu_ref/rsCpuScript.cpp
@@ -497,6 +497,7 @@
     // Copy info over to runtime
     script->mHal.info.exportedFunctionCount = mScriptExec->getExportedFunctionCount();
     script->mHal.info.exportedReduceCount = mScriptExec->getExportedReduceCount();
+    script->mHal.info.exportedReduceNewCount = mScriptExec->getExportedReduceNewCount();
     script->mHal.info.exportedForEachCount = mScriptExec->getExportedForEachCount();
     script->mHal.info.exportedVariableCount = mScriptExec->getExportedVariableCount();
     script->mHal.info.exportedPragmaCount = mScriptExec->getPragmaCount();;
@@ -553,7 +554,7 @@
     return true;
 }
 
-// Preliminary work to prepare a reduce-style kernel for launch.
+// Preliminary work to prepare a simple reduce-style kernel for launch.
 bool RsdCpuScriptImpl::reduceMtlsSetup(const Allocation *ain,
                                        const Allocation *aout,
                                        const RsScriptCall *sc,
@@ -591,6 +592,77 @@
     return true;
 }
 
+// Preliminary work to prepare a general reduce-style kernel for launch: zeroes
+// and fills in *mtls (at most RS_KERNEL_INPUT_LIMIT inputs); false on failure.
+bool RsdCpuScriptImpl::reduceNewMtlsSetup(const Allocation ** ains,
+                                          uint32_t inLen,
+                                          const Allocation * aout,
+                                          const RsScriptCall *sc,
+                                          MTLaunchStructReduceNew *mtls) {
+    rsAssert(ains && (inLen >= 1) && (inLen <= RS_KERNEL_INPUT_LIMIT) && aout);
+    memset(mtls, 0, sizeof(MTLaunchStructReduceNew));
+    mtls->dimPtr = &mtls->redp.dim;
+
+    for (int index = inLen; --index >= 0;) {
+        if (allocationLODIsNull(ains[index])) {
+            mCtx->getContext()->setError(RS_ERROR_BAD_SCRIPT,
+                                         "reduce called with null in allocations");
+            return false;
+        }
+    }
+
+    if (allocationLODIsNull(aout)) {
+        mCtx->getContext()->setError(RS_ERROR_BAD_SCRIPT,
+                                     "reduce called with null out allocation");
+        return false;
+    }
+
+    // Launch dimensions come from the first input; all others must match it.
+    const Allocation *ain0   = ains[0];
+    const Type       *inType = ain0->getType();
+    mtls->redp.dim.x = inType->getDimX();
+    mtls->redp.dim.y = inType->getDimY();
+    mtls->redp.dim.z = inType->getDimZ();
+
+    for (int Index = inLen; --Index >= 1;) {
+        if (!ain0->hasSameDims(ains[Index])) {
+            mCtx->getContext()->setError(RS_ERROR_BAD_SCRIPT,
+                                         "Failed to launch reduction kernel; "
+                                         "dimensions of input allocations do not match.");
+            return false;
+        }
+    }
+
+    if (!setUpMtlsDimensions(mtls, mtls->redp.dim, sc)) {
+        return false;
+    }
+
+    // The X & Y walkers always want 0-1 min even if dim is not present
+    mtls->end.x = rsMax((uint32_t)1, mtls->end.x);
+    mtls->end.y = rsMax((uint32_t)1, mtls->end.y);
+
+    mtls->rs = mCtx;
+    // Currently not threaded.
+    mtls->isThreadable = false;
+    mtls->mSliceNum = -1;
+
+    // Set up output.
+    mtls->redp.outLen = 1;
+    mtls->redp.outPtr[0] = (uint8_t *)aout->mHal.drvState.lod[0].mallocPtr;
+    mtls->redp.outStride[0] = aout->getType()->getElementSizeBytes();
+
+    // Set up input.
+    memcpy(mtls->ains, ains, inLen * sizeof(ains[0]));
+    mtls->redp.inLen = inLen;
+    for (int index = inLen; --index >= 0;) {
+        mtls->redp.inPtr[index] = (const uint8_t*)ains[index]->mHal.drvState.lod[0].mallocPtr;
+        mtls->redp.inStride[index] = ains[index]->getType()->getElementSizeBytes();
+    }
+
+    // All validation passed, ok to launch threads
+    return true;
+}
+
 // Preliminary work to prepare a forEach-style kernel for launch.
 bool RsdCpuScriptImpl::forEachMtlsSetup(const Allocation ** ains,
                                         uint32_t inLen,
@@ -626,13 +698,11 @@
         for (int Index = inLen; --Index >= 1;) {
             if (!ain0->hasSameDims(ains[Index])) {
                 mCtx->getContext()->setError(RS_ERROR_BAD_SCRIPT,
-                  "Failed to launch kernel; dimensions of input and output"
+                  "Failed to launch kernel; dimensions of input "
                   "allocations do not match.");
-
                 return false;
             }
         }
-
     } else if (aout != nullptr) {
         const Type *outType = aout->getType();
 
@@ -729,12 +799,25 @@
     }
 }
 
+// Prepare launch state for general reduce kernel `slot` and, if setup succeeds, run it.
+void RsdCpuScriptImpl::invokeReduceNew(uint32_t slot,
+                                       const Allocation ** ains, uint32_t inLen,
+                                       Allocation *aout, const RsScriptCall *sc) {
+  MTLaunchStructReduceNew mtls;
+  if (!reduceNewMtlsSetup(ains, inLen, aout, sc, &mtls)) {
+    return;  // setup rejected the launch; nothing to run
+  }
+  reduceNewKernelSetup(slot, &mtls);
+  RsdCpuScriptImpl *const previousTLS = mCtx->setTLS(this);  // restored after the launch
+  mCtx->launchReduceNew(ains, inLen, aout, &mtls);
+  mCtx->setTLS(previousTLS);
+}
+
 void RsdCpuScriptImpl::forEachKernelSetup(uint32_t slot, MTLaunchStructForEach *mtls) {
     mtls->script = this;
     mtls->fep.slot = slot;
     mtls->kernel = mScriptExec->getForEachFunction(slot);
     rsAssert(mtls->kernel != nullptr);
-    mtls->sig = mScriptExec->getForEachSignature(slot);
 }
 
 void RsdCpuScriptImpl::reduceKernelSetup(uint32_t slot, MTLaunchStructReduce *mtls) {
@@ -743,6 +826,19 @@
     rsAssert(mtls->kernel != nullptr);
 }
 
+// Record the owning script, slot and per-kernel function pointers in *mtls.
+void RsdCpuScriptImpl::reduceNewKernelSetup(uint32_t slot, MTLaunchStructReduceNew *mtls) {
+    const ReduceNewDescription &kernel = *mScriptExec->getReduceNewDescription(slot);
+    rsAssert(kernel.accumFunc != nullptr);
+
+    mtls->script    = this;
+    mtls->redp.slot = slot;
+    mtls->accumFunc = kernel.accumFunc;
+    mtls->initFunc  = kernel.initFunc;   // might legally be nullptr
+    mtls->outFunc   = kernel.outFunc;    // might legally be nullptr
+    mtls->accumSize = kernel.accumSize;
+}
+
 int RsdCpuScriptImpl::invokeRoot() {
     RsdCpuScriptImpl * oldTLS = mCtx->setTLS(this);
     int ret = mRoot();
diff --git a/cpu_ref/rsCpuScript.h b/cpu_ref/rsCpuScript.h
index 248e5c7..2909dab 100644
--- a/cpu_ref/rsCpuScript.h
+++ b/cpu_ref/rsCpuScript.h
@@ -65,6 +65,11 @@
                       Allocation* aout,
                       const RsScriptCall* sc) override;
 
+    void invokeReduceNew(uint32_t slot,
+                         const Allocation ** ains, uint32_t inLen,
+                         Allocation* aout,
+                         const RsScriptCall* sc) override;
+
     void invokeInit() override;
     void invokeFreeChildren() override;
 
@@ -89,12 +94,18 @@
 
     virtual void forEachKernelSetup(uint32_t slot, MTLaunchStructForEach *mtls);
 
-    // Build an MTLaunchStruct suitable for launching a reduce-style kernel.
+    // Build an MTLaunchStruct suitable for launching a simple reduce-style kernel.
     bool reduceMtlsSetup(const Allocation *ain, const Allocation *aout,
                          const RsScriptCall *sc, MTLaunchStructReduce *mtls);
-    // Finalize an MTLaunchStruct for launching a reduce-style kernel.
+    // Finalize an MTLaunchStruct for launching a simple reduce-style kernel.
     virtual void reduceKernelSetup(uint32_t slot, MTLaunchStructReduce *mtls);
 
+    // Build an MTLaunchStruct suitable for launching a general reduce-style kernel.
+    bool reduceNewMtlsSetup(const Allocation ** ains, uint32_t inLen, const Allocation *aout,
+                            const RsScriptCall *sc, MTLaunchStructReduceNew *mtls);
+    // Finalize an MTLaunchStruct for launching a general reduce-style kernel.
+    virtual void reduceNewKernelSetup(uint32_t slot, MTLaunchStructReduceNew *mtls);
+
     const RsdCpuReference::CpuSymbol * lookupSymbolMath(const char *sym);
     static void * lookupRuntimeStub(void* pContext, char const* name);
 
diff --git a/cpu_ref/rsd_cpu.h b/cpu_ref/rsd_cpu.h
index f2c7f19..49a999d 100644
--- a/cpu_ref/rsd_cpu.h
+++ b/cpu_ref/rsd_cpu.h
@@ -63,6 +63,11 @@
                                   Allocation *aout,
                                   const RsScriptCall *sc) = 0;
 
+        virtual void invokeReduceNew(uint32_t slot,
+                                     const Allocation ** ains, uint32_t inLen,
+                                     Allocation *aout,
+                                     const RsScriptCall *sc) = 0;
+
         virtual void invokeInit() = 0;
         virtual void invokeFreeChildren() = 0;