Support for general reduction kernels.

Requires coordinated change in frameworks/base.

Requires coordinated change in frameworks/compile/libbcc in order
for RsTest to run.

At present, general reduction kernels are run single-threaded.

Also: Remove dead struct field MTLaunchStructForEach::sig.

Bug: 23535724
Change-Id: Ice17ccf20a902f8a106eaa62ec071d46e3c0ad8c
diff --git a/cpu_ref/rsCpuCore.cpp b/cpu_ref/rsCpuCore.cpp
index 48e8dbb..b8b4838 100644
--- a/cpu_ref/rsCpuCore.cpp
+++ b/cpu_ref/rsCpuCore.cpp
@@ -342,6 +342,18 @@
     }
 }
 
+// Point each kernel input pointer in the driver info structure at its allocation's data.
+// Only redp->inPtr[0..inLen-1] is written; output pointers are not modified here.
+//   mtls - The MTLaunchStruct holding information about the kernel launch
+//   redp - The reduce parameters (driver info structure)
+//   x, y, z - The start offsets into each dimension
+static inline void RedpPtrSetup(const MTLaunchStructReduceNew *mtls, RsExpandKernelDriverInfo *redp,
+                                uint32_t x, uint32_t y, uint32_t z) {
+    for (uint32_t i = 0; i < redp->inLen; i++) {
+        redp->inPtr[i] = (const uint8_t *)mtls->ains[i]->getPointerUnchecked(x, y, z);
+    }
+}
+
 static uint32_t sliceInt(uint32_t *p, uint32_t val, uint32_t start, uint32_t end) {
     if (start >= end) {
         *p = start;
@@ -355,16 +367,16 @@
     return n;
 }
 
-static bool SelectOuterSlice(const MTLaunchStructForEach *mtls, RsExpandKernelDriverInfo* fep, uint32_t sliceNum) {
+static bool SelectOuterSlice(const MTLaunchStructCommon *mtls, RsExpandKernelDriverInfo* info, uint32_t sliceNum) {
 
     uint32_t r = sliceNum;
-    r = sliceInt(&fep->current.z, r, mtls->start.z, mtls->end.z);
-    r = sliceInt(&fep->current.lod, r, mtls->start.lod, mtls->end.lod);
-    r = sliceInt(&fep->current.face, r, mtls->start.face, mtls->end.face);
-    r = sliceInt(&fep->current.array[0], r, mtls->start.array[0], mtls->end.array[0]);
-    r = sliceInt(&fep->current.array[1], r, mtls->start.array[1], mtls->end.array[1]);
-    r = sliceInt(&fep->current.array[2], r, mtls->start.array[2], mtls->end.array[2]);
-    r = sliceInt(&fep->current.array[3], r, mtls->start.array[3], mtls->end.array[3]);
+    r = sliceInt(&info->current.z, r, mtls->start.z, mtls->end.z);
+    r = sliceInt(&info->current.lod, r, mtls->start.lod, mtls->end.lod);
+    r = sliceInt(&info->current.face, r, mtls->start.face, mtls->end.face);
+    r = sliceInt(&info->current.array[0], r, mtls->start.array[0], mtls->end.array[0]);
+    r = sliceInt(&info->current.array[1], r, mtls->start.array[1], mtls->end.array[1]);
+    r = sliceInt(&info->current.array[2], r, mtls->start.array[2], mtls->end.array[2]);
+    r = sliceInt(&info->current.array[3], r, mtls->start.array[3], mtls->end.array[3]);
     return r == 0;
 }
 
@@ -446,7 +458,7 @@
     }
 }
 
-// Launch a reduce-style kernel.
+// Launch a simple reduce-style kernel.
 // Inputs:
 //  ain:  The allocation that contains the input
 //  aout: The allocation that will hold the output
@@ -465,6 +477,50 @@
     mtls->kernel(&mtls->inBuf[startOffset], mtls->outBuf, xEnd - xStart);
 }
 
+// Launch a general reduce-style kernel (runs entirely on the calling thread).
+// Inputs:
+//   ains[0..inLen-1]: Array of allocations that contain the inputs
+//   aout:             The allocation that will hold the output
+//   mtls:             Holds launch parameters
+void RsdCpuReferenceImpl::launchReduceNew(const Allocation ** ains,
+                                          uint32_t inLen,
+                                          Allocation * aout,
+                                          MTLaunchStructReduceNew *mtls) {
+  // With an outconverter the accumulator is temporary heap memory; without
+  // one, the output allocation itself is used as the accumulator.
+  uint8_t *const accumPtr = (mtls->outFunc
+                             ? static_cast<uint8_t *>(malloc(mtls->accumSize))
+                             : mtls->redp.outPtr[0]);
+  if (accumPtr == nullptr) {
+    ALOGE("launchReduceNew: no accumulator storage (%zu bytes)", mtls->accumSize);
+    return;
+  }
+
+  // initialize: user initializer if present, otherwise zero-fill
+  if (mtls->initFunc) {
+    mtls->initFunc(accumPtr);
+  } else {
+    memset(accumPtr, 0, mtls->accumSize);
+  }
+
+  // accumulate: one accumFunc call per row (y) of each outer slice
+  uint32_t slice = 0;
+  while (SelectOuterSlice(mtls, &mtls->redp, slice++)) {
+    for (mtls->redp.current.y = mtls->start.y;
+         mtls->redp.current.y < mtls->end.y;
+         mtls->redp.current.y++) {
+      RedpPtrSetup(mtls, &mtls->redp, mtls->start.x, mtls->redp.current.y, mtls->redp.current.z);
+      mtls->accumFunc(&mtls->redp, mtls->start.x, mtls->end.x, accumPtr);
+    }
+  }
+
+  // outconvert, then release the temporary accumulator
+  if (mtls->outFunc) {
+    mtls->outFunc(mtls->redp.outPtr[0], accumPtr);
+    free(accumPtr);
+  }
+}
+
 void RsdCpuReferenceImpl::launchForEach(const Allocation ** ains,
                                         uint32_t inLen,
                                         Allocation* aout,
diff --git a/cpu_ref/rsCpuCore.h b/cpu_ref/rsCpuCore.h
index cfdb29a..939b7ae 100644
--- a/cpu_ref/rsCpuCore.h
+++ b/cpu_ref/rsCpuCore.h
@@ -33,11 +33,21 @@
 
 // Function types found in RenderScript code
 typedef void (*ReduceFunc_t)(const uint8_t *inBuf, uint8_t *outBuf, uint32_t len);
+typedef void (*ReduceNewAccumulatorFunc_t)(const RsExpandKernelDriverInfo *info, uint32_t x1, uint32_t x2, uint8_t *accum);
+typedef void (*ReduceNewInitializerFunc_t)(uint8_t *accum);
+typedef void (*ReduceNewOutConverterFunc_t)(uint8_t *out, const uint8_t *accum);
 typedef void (*ForEachFunc_t)(const RsExpandKernelDriverInfo *info, uint32_t x1, uint32_t x2, uint32_t outStride);
 typedef void (*InvokeFunc_t)(void *params);
 typedef void (*InitOrDtorFunc_t)(void);
 typedef int  (*RootFunc_t)(void);
 
+struct ReduceNewDescription {
+    ReduceNewAccumulatorFunc_t  accumFunc;  // expanded accumulator function (always set)
+    ReduceNewInitializerFunc_t  initFunc;   // user initializer function, or nullptr if none
+    ReduceNewOutConverterFunc_t outFunc;    // user outconverter function, or nullptr if none
+    size_t                      accumSize;  // accumulator datum size, in bytes
+};
+
 // Internal driver callback used to execute a kernel
 typedef void (*WorkerCallback_t)(void *usr, uint32_t idx);
 
@@ -72,7 +82,6 @@
     RsExpandKernelDriverInfo fep;
 
     ForEachFunc_t kernel;
-    uint32_t sig;
     const Allocation *ains[RS_KERNEL_INPUT_LIMIT];
     Allocation *aout[RS_KERNEL_INPUT_LIMIT];
 };
@@ -84,6 +93,19 @@
     RsLaunchDimensions inputDim;
 };
 
+struct MTLaunchStructReduceNew : public MTLaunchStructCommon {
+    // Driver info structure
+    RsExpandKernelDriverInfo redp;
+
+    const Allocation *ains[RS_KERNEL_INPUT_LIMIT];  // inputs; first redp.inLen entries are valid
+
+    ReduceNewAccumulatorFunc_t accumFunc;   // expanded accumulator; asserted non-null at setup
+    ReduceNewInitializerFunc_t initFunc;    // optional; nullptr => accumulator is zero-filled
+    ReduceNewOutConverterFunc_t outFunc;    // optional; nullptr => output doubles as accumulator
+
+    size_t accumSize;  // accumulator datum size in bytes
+};
+
 class RsdCpuReferenceImpl : public RsdCpuReference {
 public:
     ~RsdCpuReferenceImpl() override;
@@ -107,10 +129,14 @@
     void launchForEach(const Allocation **ains, uint32_t inLen, Allocation *aout,
                        const RsScriptCall *sc, MTLaunchStructForEach *mtls);
 
-    // Launch a reduce kernel
+    // Launch a simple reduce kernel
     void launchReduce(const Allocation *ain, Allocation *aout,
                       MTLaunchStructReduce *mtls);
 
+    // Launch a general reduce kernel
+    void launchReduceNew(const Allocation ** ains, uint32_t inLen, Allocation *aout,
+                         MTLaunchStructReduceNew *mtls);
+
     CpuScript * createScript(const ScriptC *s, char const *resName, char const *cacheDir,
                              uint8_t const *bitcode, size_t bitcodeSize, uint32_t flags) override;
     CpuScript * createIntrinsic(const Script *s, RsScriptIntrinsicID iid, Element *e) override;
diff --git a/cpu_ref/rsCpuExecutable.cpp b/cpu_ref/rsCpuExecutable.cpp
index d654743..5dd31ee 100644
--- a/cpu_ref/rsCpuExecutable.cpp
+++ b/cpu_ref/rsCpuExecutable.cpp
@@ -319,6 +319,7 @@
     ForEachFunc_t* forEachFunctions = nullptr;
     uint32_t* forEachSignatures = nullptr;
     ReduceFunc_t* reduceFunctions = nullptr;
+    ReduceNewDescription* reduceNewDescriptions = nullptr;
     const char ** pragmaKeys = nullptr;
     const char ** pragmaValues = nullptr;
     uint32_t checksum = 0;
@@ -485,7 +486,7 @@
         }
     }
 
-    // Read general reduce kernels (for now, we expect the count to be zero)
+    // Read general reduce kernels
     if (strgets(line, MAXLINE, &rsInfo) == nullptr) {
         goto error;
     }
@@ -493,11 +494,91 @@
         ALOGE("Invalid export reduce new count!: %s", line);
         goto error;
     }
-    if (reduceNewCount != 0) {
-        ALOGE("Expected export reduce new count to be zero!: %s", line);
+
+    reduceNewDescriptions = new ReduceNewDescription[reduceNewCount];
+    if (reduceNewDescriptions == nullptr) {
         goto error;
     }
 
+    for (size_t i = 0; i < reduceNewCount; ++i) {
+        static const char kNoName[] = ".";
+
+        unsigned int tmpSig = 0;
+        size_t tmpSize = 0;
+        char tmpNameReduce[MAXLINE];
+        char tmpNameInitializer[MAXLINE];
+        char tmpNameAccumulator[MAXLINE];
+        char tmpNameCombiner[MAXLINE];
+        char tmpNameOutConverter[MAXLINE];
+        char tmpNameHalter[MAXLINE];
+
+        if (strgets(line, MAXLINE, &rsInfo) == nullptr) {
+            goto error;
+        }
+#define DELIMNAME " - %" MAKE_STR(MAXLINE) "s"
+        if (sscanf(line, "%u - %zu" DELIMNAME DELIMNAME DELIMNAME DELIMNAME DELIMNAME DELIMNAME,
+                   &tmpSig, &tmpSize, tmpNameReduce, tmpNameInitializer, tmpNameAccumulator,
+                   tmpNameCombiner, tmpNameOutConverter, tmpNameHalter) != 8) {
+            ALOGE("Invalid export reduce new!: %s", line);
+            goto error;
+        }
+#undef DELIMNAME
+
+        // For now, we expect
+        // - Reduce and Accumulator names
+        // - optional Initializer, Combiner, and OutConverter name
+        // - no Halter name
+        if (!strcmp(tmpNameReduce, kNoName) ||
+            !strcmp(tmpNameAccumulator, kNoName)) {
+            ALOGE("Expected reduce and accumulator names!: %s", line);
+            goto error;
+        }
+        if (strcmp(tmpNameHalter, kNoName)) {
+            ALOGE("Did not expect halter name!: %s", line);
+            goto error;
+        }
+
+        // The current implementation does not use the signature,
+        // reduce name, or combiner.
+
+        reduceNewDescriptions[i].accumSize = tmpSize;
+
+        // Process the (optional) initializer.
+        if (strcmp(tmpNameInitializer, kNoName)) {
+          // Lookup the original user-written initializer.
+          if (!(reduceNewDescriptions[i].initFunc =
+                (ReduceNewInitializerFunc_t) dlsym(sharedObj, tmpNameInitializer))) {
+            ALOGE("Failed to find initializer function address for %s(): %s",
+                  tmpNameInitializer, dlerror());
+            goto error;
+          }
+        } else {
+          reduceNewDescriptions[i].initFunc = nullptr;
+        }
+
+        // Lookup the expanded accumulator.
+        strncat(tmpNameAccumulator, ".expand", MAXLINE-1-strlen(tmpNameAccumulator));
+        if (!(reduceNewDescriptions[i].accumFunc =
+              (ReduceNewAccumulatorFunc_t) dlsym(sharedObj, tmpNameAccumulator))) {
+            ALOGE("Failed to find accumulator function address for %s(): %s",
+                  tmpNameAccumulator, dlerror());
+            goto error;
+        }
+
+        // Process the (optional) outconverter.
+        if (strcmp(tmpNameOutConverter, kNoName)) {
+          // Lookup the original user-written outconverter.
+          if (!(reduceNewDescriptions[i].outFunc =
+                (ReduceNewOutConverterFunc_t) dlsym(sharedObj, tmpNameOutConverter))) {
+            ALOGE("Failed to find outconverter function address for %s(): %s",
+                  tmpNameOutConverter, dlerror());
+            goto error;
+          }
+        } else {
+          reduceNewDescriptions[i].outFunc = nullptr;
+        }
+    }
+
     if (strgets(line, MAXLINE, &rsInfo) == nullptr) {
         goto error;
     }
@@ -631,6 +712,7 @@
         invokeFunctions, funcCount,
         forEachFunctions, forEachSignatures, forEachCount,
         reduceFunctions, reduceCount,
+        reduceNewDescriptions, reduceNewCount,
         pragmaKeys, pragmaValues, pragmaCount,
         rsGlobalNames, rsGlobalAddresses, rsGlobalSizes, rsGlobalProperties,
         numEntries, isThreadable, checksum);
diff --git a/cpu_ref/rsCpuExecutable.h b/cpu_ref/rsCpuExecutable.h
index fe9c2ad..6e4d325 100644
--- a/cpu_ref/rsCpuExecutable.h
+++ b/cpu_ref/rsCpuExecutable.h
@@ -69,6 +69,7 @@
                      ForEachFunc_t* forEachFunctions, uint32_t* forEachSignatures,
                      size_t forEachCount,
                      ReduceFunc_t* reduceFunctions, size_t reduceCount,
+                     ReduceNewDescription *reduceNewDescriptions, size_t reduceNewCount,
                      const char** pragmaKeys, const char** pragmaValues,
                      size_t pragmaCount,
                      const char **globalNames, const void **globalAddresses,
@@ -81,6 +82,7 @@
         mForEachFunctions(forEachFunctions), mForEachSignatures(forEachSignatures),
         mForEachCount(forEachCount),
         mReduceFunctions(reduceFunctions), mReduceCount(reduceCount),
+        mReduceNewDescriptions(reduceNewDescriptions), mReduceNewCount(reduceNewCount),
         mPragmaKeys(pragmaKeys), mPragmaValues(pragmaValues),
         mPragmaCount(pragmaCount), mGlobalNames(globalNames),
         mGlobalAddresses(globalAddresses), mGlobalSizes(globalSizes),
@@ -109,6 +111,8 @@
 
         delete[] mReduceFunctions;
 
+        delete[] mReduceNewDescriptions;
+
         delete[] mForEachSignatures;
         delete[] mForEachFunctions;
 
@@ -134,6 +138,7 @@
     size_t getExportedFunctionCount() const { return mFuncCount; }
     size_t getExportedForEachCount() const { return mForEachCount; }
     size_t getExportedReduceCount() const { return mReduceCount; }
+    size_t getExportedReduceNewCount() const { return mReduceNewCount; }
     size_t getPragmaCount() const { return mPragmaCount; }
 
     void* getFieldAddress(int slot) const { return mFieldAddress[slot]; }
@@ -148,6 +153,10 @@
 
     ReduceFunc_t getReduceFunction(int slot) const { return mReduceFunctions[slot]; }
 
+    const ReduceNewDescription* getReduceNewDescription(int slot) const {
+        return &mReduceNewDescriptions[slot];  // slot is not range-checked against mReduceNewCount
+    }
+
     const char ** getPragmaKeys() const { return mPragmaKeys; }
     const char ** getPragmaValues() const { return mPragmaValues; }
 
@@ -203,6 +212,9 @@
     ReduceFunc_t* mReduceFunctions;
     size_t mReduceCount;
 
+    ReduceNewDescription* mReduceNewDescriptions;
+    size_t mReduceNewCount;
+
     const char ** mPragmaKeys;
     const char ** mPragmaValues;
     size_t mPragmaCount;
diff --git a/cpu_ref/rsCpuScript.cpp b/cpu_ref/rsCpuScript.cpp
index 5adca54..7308b54 100644
--- a/cpu_ref/rsCpuScript.cpp
+++ b/cpu_ref/rsCpuScript.cpp
@@ -497,6 +497,7 @@
     // Copy info over to runtime
     script->mHal.info.exportedFunctionCount = mScriptExec->getExportedFunctionCount();
     script->mHal.info.exportedReduceCount = mScriptExec->getExportedReduceCount();
+    script->mHal.info.exportedReduceNewCount = mScriptExec->getExportedReduceNewCount();
     script->mHal.info.exportedForEachCount = mScriptExec->getExportedForEachCount();
     script->mHal.info.exportedVariableCount = mScriptExec->getExportedVariableCount();
     script->mHal.info.exportedPragmaCount = mScriptExec->getPragmaCount();;
@@ -553,7 +554,7 @@
     return true;
 }
 
-// Preliminary work to prepare a reduce-style kernel for launch.
+// Preliminary work to prepare a simple reduce-style kernel for launch.
 bool RsdCpuScriptImpl::reduceMtlsSetup(const Allocation *ain,
                                        const Allocation *aout,
                                        const RsScriptCall *sc,
@@ -591,6 +592,77 @@
     return true;
 }
 
+// Preliminary work to prepare a general reduce-style kernel for launch.
+// Returns false if validation fails, in which case the kernel must not be launched.
+bool RsdCpuScriptImpl::reduceNewMtlsSetup(const Allocation ** ains,
+                                          uint32_t inLen,
+                                          const Allocation * aout,
+                                          const RsScriptCall *sc,
+                                          MTLaunchStructReduceNew *mtls) {
+    rsAssert(ains && (inLen >= 1) && aout);
+    memset(mtls, 0, sizeof(MTLaunchStructReduceNew));
+    mtls->dimPtr = &mtls->redp.dim;
+
+    // mtls->ains is a fixed-size array; reject launches that would overrun it.
+    if (inLen > RS_KERNEL_INPUT_LIMIT) {
+        mCtx->getContext()->setError(RS_ERROR_BAD_SCRIPT,
+                                     "reduce called with too many in allocations");
+        return false;
+    }
+    for (uint32_t index = 0; index < inLen; ++index) {
+        if (allocationLODIsNull(ains[index])) {
+            mCtx->getContext()->setError(RS_ERROR_BAD_SCRIPT,
+                                         "reduce called with null in allocations");
+            return false;
+        }
+    }
+    if (allocationLODIsNull(aout)) {
+        mCtx->getContext()->setError(RS_ERROR_BAD_SCRIPT,
+                                     "reduce called with null out allocation");
+        return false;
+    }
+
+    // The first input supplies the launch dimensions; every other input must match it.
+    const Allocation *ain0   = ains[0];
+    const Type       *inType = ain0->getType();
+    mtls->redp.dim.x = inType->getDimX();
+    mtls->redp.dim.y = inType->getDimY();
+    mtls->redp.dim.z = inType->getDimZ();
+    for (uint32_t index = 1; index < inLen; ++index) {
+        if (!ain0->hasSameDims(ains[index])) {
+            mCtx->getContext()->setError(RS_ERROR_BAD_SCRIPT,
+                                         "Failed to launch reduction kernel; "
+                                         "dimensions of input allocations do not match.");
+            return false;
+        }
+    }
+    if (!setUpMtlsDimensions(mtls, mtls->redp.dim, sc)) {
+        return false;
+    }
+    // The X & Y walkers always want 0-1 min even if dim is not present
+    mtls->end.x = rsMax((uint32_t)1, mtls->end.x);
+    mtls->end.y = rsMax((uint32_t)1, mtls->end.y);
+    mtls->rs = mCtx;
+    // Currently not threaded.
+    mtls->isThreadable = false;
+    mtls->mSliceNum = -1;
+
+    // Set up output.
+    mtls->redp.outLen = 1;
+    mtls->redp.outPtr[0] = (uint8_t *)aout->mHal.drvState.lod[0].mallocPtr;
+    mtls->redp.outStride[0] = aout->getType()->getElementSizeBytes();
+
+    // Set up input.
+    memcpy(mtls->ains, ains, inLen * sizeof(ains[0]));
+    mtls->redp.inLen = inLen;
+    for (uint32_t index = 0; index < inLen; ++index) {
+        mtls->redp.inPtr[index] = (const uint8_t*)ains[index]->mHal.drvState.lod[0].mallocPtr;
+        mtls->redp.inStride[index] = ains[index]->getType()->getElementSizeBytes();
+    }
+    // All validation passed; ok to launch.
+    return true;
+}
+
 // Preliminary work to prepare a forEach-style kernel for launch.
 bool RsdCpuScriptImpl::forEachMtlsSetup(const Allocation ** ains,
                                         uint32_t inLen,
@@ -626,13 +698,11 @@
         for (int Index = inLen; --Index >= 1;) {
             if (!ain0->hasSameDims(ains[Index])) {
                 mCtx->getContext()->setError(RS_ERROR_BAD_SCRIPT,
-                  "Failed to launch kernel; dimensions of input and output"
+                  "Failed to launch kernel; dimensions of input"
                   "allocations do not match.");
-
                 return false;
             }
         }
-
     } else if (aout != nullptr) {
         const Type *outType = aout->getType();
 
@@ -729,12 +799,25 @@
     }
 }
 
+void RsdCpuScriptImpl::invokeReduceNew(uint32_t slot,
+                                       const Allocation ** ains, uint32_t inLen,
+                                       Allocation *aout,
+                                       const RsScriptCall *sc) {
+  MTLaunchStructReduceNew mtls;
+  if (!reduceNewMtlsSetup(ains, inLen, aout, sc, &mtls)) {
+    return;  // setup rejected the launch; nothing to run
+  }
+  reduceNewKernelSetup(slot, &mtls);
+  RsdCpuScriptImpl *oldTLS = mCtx->setTLS(this);  // restored after the launch
+  mCtx->launchReduceNew(ains, inLen, aout, &mtls);
+  mCtx->setTLS(oldTLS);
+}
+
 void RsdCpuScriptImpl::forEachKernelSetup(uint32_t slot, MTLaunchStructForEach *mtls) {
     mtls->script = this;
     mtls->fep.slot = slot;
     mtls->kernel = mScriptExec->getForEachFunction(slot);
     rsAssert(mtls->kernel != nullptr);
-    mtls->sig = mScriptExec->getForEachSignature(slot);
 }
 
 void RsdCpuScriptImpl::reduceKernelSetup(uint32_t slot, MTLaunchStructReduce *mtls) {
@@ -743,6 +826,19 @@
     rsAssert(mtls->kernel != nullptr);
 }
 
+void RsdCpuScriptImpl::reduceNewKernelSetup(uint32_t slot, MTLaunchStructReduceNew *mtls) {
+    mtls->script = this;
+    mtls->redp.slot = slot;
+    // Copy the slot's kernel functions and accumulator size out of mScriptExec.
+    const ReduceNewDescription *desc = mScriptExec->getReduceNewDescription(slot);
+    mtls->accumFunc = desc->accumFunc;
+    mtls->initFunc  = desc->initFunc;   // might legally be nullptr
+    mtls->outFunc   = desc->outFunc;    // might legally be nullptr
+    mtls->accumSize = desc->accumSize;
+
+    rsAssert(mtls->accumFunc != nullptr);
+}
+
 int RsdCpuScriptImpl::invokeRoot() {
     RsdCpuScriptImpl * oldTLS = mCtx->setTLS(this);
     int ret = mRoot();
diff --git a/cpu_ref/rsCpuScript.h b/cpu_ref/rsCpuScript.h
index 248e5c7..2909dab 100644
--- a/cpu_ref/rsCpuScript.h
+++ b/cpu_ref/rsCpuScript.h
@@ -65,6 +65,11 @@
                       Allocation* aout,
                       const RsScriptCall* sc) override;
 
+    void invokeReduceNew(uint32_t slot,
+                         const Allocation ** ains, uint32_t inLen,
+                         Allocation* aout,
+                         const RsScriptCall* sc) override;
+
     void invokeInit() override;
     void invokeFreeChildren() override;
 
@@ -89,12 +94,18 @@
 
     virtual void forEachKernelSetup(uint32_t slot, MTLaunchStructForEach *mtls);
 
-    // Build an MTLaunchStruct suitable for launching a reduce-style kernel.
+    // Build an MTLaunchStruct suitable for launching a simple reduce-style kernel.
     bool reduceMtlsSetup(const Allocation *ain, const Allocation *aout,
                          const RsScriptCall *sc, MTLaunchStructReduce *mtls);
-    // Finalize an MTLaunchStruct for launching a reduce-style kernel.
+    // Finalize an MTLaunchStruct for launching a simple reduce-style kernel.
     virtual void reduceKernelSetup(uint32_t slot, MTLaunchStructReduce *mtls);
 
+    // Build an MTLaunchStruct suitable for launching a general reduce-style kernel.
+    bool reduceNewMtlsSetup(const Allocation ** ains, uint32_t inLen, const Allocation *aout,
+                            const RsScriptCall *sc, MTLaunchStructReduceNew *mtls);
+    // Finalize an MTLaunchStruct for launching a general reduce-style kernel.
+    virtual void reduceNewKernelSetup(uint32_t slot, MTLaunchStructReduceNew *mtls);
+
     const RsdCpuReference::CpuSymbol * lookupSymbolMath(const char *sym);
     static void * lookupRuntimeStub(void* pContext, char const* name);
 
diff --git a/cpu_ref/rsd_cpu.h b/cpu_ref/rsd_cpu.h
index f2c7f19..49a999d 100644
--- a/cpu_ref/rsd_cpu.h
+++ b/cpu_ref/rsd_cpu.h
@@ -63,6 +63,11 @@
                                   Allocation *aout,
                                   const RsScriptCall *sc) = 0;
 
+        virtual void invokeReduceNew(uint32_t slot,
+                                     const Allocation ** ains, uint32_t inLen,
+                                     Allocation *aout,
+                                     const RsScriptCall *sc) = 0;
+
         virtual void invokeInit() = 0;
         virtual void invokeFreeChildren() = 0;
 
diff --git a/driver/rsdBcc.cpp b/driver/rsdBcc.cpp
index 25659d8..af8d6ad 100644
--- a/driver/rsdBcc.cpp
+++ b/driver/rsdBcc.cpp
@@ -133,6 +133,15 @@
     cs->invokeReduce(slot, ain, aout, sc);
 }
 
+void rsdScriptInvokeReduceNew(const Context *dc, Script *s,
+                              uint32_t slot,
+                              const Allocation ** ains, size_t inLen,
+                              Allocation *aout,
+                              const RsScriptCall *sc) {
+    RsdCpuReference::CpuScript *cs = (RsdCpuReference::CpuScript *)s->mHal.drv;
+    cs->invokeReduceNew(slot, ains, inLen, aout, sc);  // inLen narrows from size_t to uint32_t
+}
+
 void rsdScriptSetGlobalVar(const Context *dc, const Script *s,
                            uint32_t slot, void *data, size_t dataLength) {
     RsdCpuReference::CpuScript *cs = (RsdCpuReference::CpuScript *)s->mHal.drv;
diff --git a/driver/rsdBcc.h b/driver/rsdBcc.h
index e95529b..a2bf8be 100644
--- a/driver/rsdBcc.h
+++ b/driver/rsdBcc.h
@@ -50,6 +50,14 @@
                            android::renderscript::Allocation *aout,
                            const RsScriptCall *sc);
 
+void rsdScriptInvokeReduceNew(const android::renderscript::Context *rsc,
+                              android::renderscript::Script *s,
+                              uint32_t slot,
+                              const android::renderscript::Allocation ** ains,
+                              size_t inLen,
+                              android::renderscript::Allocation *aout,
+                              const RsScriptCall *sc);
+
 void rsdScriptInvokeForEachMulti(const android::renderscript::Context *rsc,
                                  android::renderscript::Script *s,
                                  uint32_t slot,
diff --git a/driver/rsdCore.cpp b/driver/rsdCore.cpp
index f70b79b..f0a7334 100644
--- a/driver/rsdCore.cpp
+++ b/driver/rsdCore.cpp
@@ -101,6 +101,8 @@
         fnPtr[0] = (void *)rsdScriptUpdateCachedObject; break;
     case RS_HAL_SCRIPT_INVOKE_REDUCE:
         fnPtr[0] = (void *)rsdScriptInvokeReduce; break;
+    case RS_HAL_SCRIPT_INVOKE_REDUCE_NEW:
+        fnPtr[0] = (void *)rsdScriptInvokeReduceNew; break;
 
     case RS_HAL_ALLOCATION_INIT:
         fnPtr[0] = (void *)rsdAllocationInit; break;
diff --git a/java/tests/RsTest/src/com/android/rs/test/RSTestCore.java b/java/tests/RsTest/src/com/android/rs/test/RSTestCore.java
index efed0bf..7de6733 100644
--- a/java/tests/RsTest/src/com/android/rs/test/RSTestCore.java
+++ b/java/tests/RsTest/src/com/android/rs/test/RSTestCore.java
@@ -108,6 +108,7 @@
         unitTests.add(new UT_mesh(this, mRes, mCtx));*/
         unitTests.add(new UT_foreach_multi(this, mRes, mCtx));
         unitTests.add(new UT_fp_mad(this, mRes, mCtx));
+        unitTests.add(new UT_reduce(this, mRes, mCtx));
 
         /*
         unitTests.add(new UnitTest(null, "<Pass>", 1));
diff --git a/java/tests/RsTest/src/com/android/rs/test/UT_reduce.java b/java/tests/RsTest/src/com/android/rs/test/UT_reduce.java
new file mode 100644
index 0000000..6cc70d1
--- /dev/null
+++ b/java/tests/RsTest/src/com/android/rs/test/UT_reduce.java
@@ -0,0 +1,336 @@
+/*
+ * Copyright (C) 2016 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.android.rs.test;
+
+import android.content.Context;
+import android.content.res.Resources;
+import android.renderscript.*;
+import android.util.Log;
+import java.lang.Float;
+import java.util.Random;
+
+public class UT_reduce extends UnitTest {
+    private static final String TAG = "reduce";
+
+    protected UT_reduce(RSTestCore rstc, Resources res, Context ctx) {
+        super(rstc, "reduce", ctx);  // res is accepted but not used by this constructor
+    }
+
+    private byte[] createInputArrayByte(int len, int seed) {
+        final byte[] bytes = new byte[len];
+        new Random(seed).nextBytes(bytes);  // fixed seed: same contents on every run
+        return bytes;
+    }
+
+    private float[] createInputArrayFloat(int len, int seed) {
+        final Random rng = new Random(seed);  // fixed seed keeps the data reproducible
+        final float[] values = new float[len];
+        for (int idx = 0; idx != values.length; idx++)
+            values[idx] = rng.nextFloat();
+        return values;
+    }
+
+    private int[] createInputArrayInt(int len, int seed) {
+        final Random rng = new Random(seed);  // fixed seed keeps the data reproducible
+        final int[] values = new int[len];
+        for (int idx = 0; idx != values.length; idx++)
+            values[idx] = rng.nextInt();  // any int value, negatives included
+        return values;
+    }
+
+    private int[] createInputArrayInt(int len, int seed, int eltRange) {
+        final Random rng = new Random(seed);  // fixed seed keeps the data reproducible
+        final int[] values = new int[len];
+        for (int idx = 0; idx != values.length; idx++)
+            values[idx] = rng.nextInt(eltRange);  // element lies in [0, eltRange)
+        return values;
+    }
+
+    private <T extends Number> boolean result(String testName, T javaRslt, T rsRslt) {
+        final boolean matched = javaRslt.equals(rsRslt);  // boxed equals(), not ==
+        final String verdict = matched ? "PASSED" : "FAILED";
+        final String message = testName + ": java " + javaRslt + ", rs " + rsRslt + ": " + verdict;
+        Log.i(TAG, message);
+        return matched;
+    }
+
+    private boolean result(String testName, Int2 javaRslt, Int2 rsRslt) {
+        // The two Int2 values are compared field by field.
+        final boolean sameX = (javaRslt.x == rsRslt.x);
+        final boolean matched = sameX && (javaRslt.y == rsRslt.y);
+        final String javaText = "(" + javaRslt.x + ", " + javaRslt.y + ")";
+        final String rsText = "(" + rsRslt.x + ", " + rsRslt.y + ")";
+        Log.i(TAG, testName + ": java " + javaText + ", rs " + rsText + ": " + (matched ? "PASSED" : "FAILED"));
+        return matched;
+    }
+
+    ///////////////////////////////////////////////////////////////////
+
+    private int addint(int[] input) {
+        int total = 0;  // Java-side reference sum (int arithmetic wraps on overflow)
+        for (final int elt : input)
+            total += elt;
+        return total;
+    }
+
+    private boolean addint1D(RenderScript RS, ScriptC_reduce s) {
+        // 100000 elements in [0, 1 << 13): the sum stays below 100000 * 8192 and fits in an int.
+        final int[] data = createInputArrayInt(100000, 0, 1 << 13);
+        final int expected = addint(data);
+        final int actual = s.reduce_addint(data).get();
+
+        return result("addint1D", expected, actual);
+    }
+
+    private boolean addint2D(RenderScript RS, ScriptC_reduce s) {
+        final int dimX = 450;
+        final int dimY = 225;
+        final int[] cells = createInputArrayInt(dimX * dimY, 1, 1 << 13);
+
+        // Copy the same data into a dimX x dimY I32 allocation for the kernel.
+        final Type gridType = new Type.Builder(RS, Element.I32(RS)).setX(dimX).setY(dimY).create();
+        final Allocation grid = Allocation.createTyped(RS, gridType);
+        grid.copy2DRangeFrom(0, 0, dimX, dimY, cells);
+
+        final int expected = addint(cells);
+        final int actual = s.reduce_addint(grid).get();
+        return result("addint2D", expected, actual);
+    }
+
+    ///////////////////////////////////////////////////////////////////
+
+    private float dp(float[] input1, float[] input2) {
+        _RS_ASSERT("dp input length mismatch", input1.length == input2.length);
+        // Accumulate in float, in index order; the caller compares the result exactly.
+        float acc = 0;
+        for (int i = 0; i != input1.length; i++)
+            acc += input1[i] * input2[i];
+        return acc;
+    }
+
+    private boolean dp(RenderScript RS, ScriptC_reduce s) {
+        final float[] lhs = createInputArrayFloat(100000, 2);
+        final float[] rhs = createInputArrayFloat(100000, 3);
+
+        final float expected = dp(lhs, rhs);
+        final float actual = s.reduce_dp(lhs, rhs).get();
+
+        // NOTE: The check below compares two floating-point sums for
+        // exact equality, which is fragile.  It only holds while the
+        // Java reference and the RenderScript kernel add the same
+        // terms in the same order.  Different optimizations on either
+        // side, a multithreaded RenderScript run, or running the
+        // kernel on a GPU instead of the CPU could each change the
+        // rounding and break it.
+        //
+        // Open questions: should this accept results that are
+        // "sufficiently close"?  Should the inputs be chosen so the
+        // sum is deterministic?  Should the test use integers instead?
+        // (Left unresolved here; the exact comparison is kept.)
+        return result("dp", expected, actual);
+    }
+
+    ///////////////////////////////////////////////////////////////////
+
+    private Int2 findMinAndMax(float[] input) {
+        float minVal = Float.POSITIVE_INFINITY, maxVal = Float.NEGATIVE_INFINITY;
+        int minIdx = -1, maxIdx = -1;  // -1: no element has been selected yet
+
+        for (int i = 0; i < input.length; ++i) {
+            final float cur = input[i];
+            // Strict comparisons: among equal values the earliest index is kept.
+            if (cur < minVal) {
+                minVal = cur;
+                minIdx = i;
+            }
+            if (cur > maxVal) {
+                maxVal = cur;
+                maxIdx = i;
+            }
+        }
+
+        return new Int2(minIdx, maxIdx);
+    }
+
+    private boolean findMinAndMax(RenderScript RS, ScriptC_reduce s) {
+        final float[] samples = createInputArrayFloat(100000, 4);
+        // Both results are (index of the minimum, index of the maximum).
+        final Int2 expected = findMinAndMax(samples);
+        final Int2 actual = s.reduce_findMinAndMax(samples).get();
+
+        return result("findMinAndMax", expected, actual);
+    }
+
+    ///////////////////////////////////////////////////////////////////
+
+    private boolean fz(RenderScript RS, ScriptC_reduce s) {
+        final int inputLen = 100000;
+        int[] input = createInputArrayInt(inputLen, 5);
+        // Plant a zero at a random position so that at least one exists.
+        input[(new Random(6)).nextInt(inputLen)] = 0;
+
+        final int rsRslt = s.reduce_fz(input).get();
+        // Validate the index; -1 is the kernel's initial value and must fail, not throw.
+        final boolean inRange = (rsRslt >= 0) && (rsRslt < inputLen);
+        final boolean success = inRange && (input[rsRslt] == 0);
+        Log.i(TAG, "fz: input[" + rsRslt + "] == " +
+                (inRange ? String.valueOf(input[rsRslt]) : "<out of range>") + ": " + (success ? "PASSED" : "FAILED"));
+        return success;
+    }
+
+    ///////////////////////////////////////////////////////////////////
+
+    private boolean fz2(RenderScript RS, ScriptC_reduce s) {
+        final int dimX = 225, dimY = 450;
+        final int inputLen = dimX * dimY;
+
+        int[] inputArray = createInputArrayInt(inputLen, 7);
+        // Plant a zero at a random position so that at least one exists.
+        inputArray[(new Random(8)).nextInt(inputLen)] = 0;
+
+        Type.Builder typeBuilder = new Type.Builder(RS, Element.I32(RS));
+        typeBuilder.setX(dimX).setY(dimY);
+        Allocation inputAllocation = Allocation.createTyped(RS, typeBuilder.create());
+        inputAllocation.copy2DRangeFrom(0, 0, dimX, dimY, inputArray);
+
+        final Int2 rsRslt = s.reduce_fz2(inputAllocation).get();
+        // Validate each coordinate; (-1, -1) is the kernel's initial value and must fail, not throw.
+        final boolean inRange = rsRslt.x >= 0 && rsRslt.x < dimX && rsRslt.y >= 0 && rsRslt.y < dimY;
+        final int cellVal = inRange ? inputArray[rsRslt.x + dimX * rsRslt.y] : -1;  // -1: placeholder only
+        final boolean success = inRange && (cellVal == 0);
+        Log.i(TAG, "fz2: input[" + rsRslt.x + ", " + rsRslt.y + "] == " +
+                (inRange ? String.valueOf(cellVal) : "<out of range>") + ": " + (success ? "PASSED" : "FAILED"));
+        return success;
+    }
+
+    ///////////////////////////////////////////////////////////////////
+
+    private boolean fz3(RenderScript RS, ScriptC_reduce s) {
+        final int dimX = 59, dimY = 48, dimZ = 37;
+        final int inputLen = dimX * dimY * dimZ;
+
+        int[] inputArray = createInputArrayInt(inputLen, 9);
+        // Plant a zero at a random position so that at least one exists.
+        inputArray[(new Random(10)).nextInt(inputLen)] = 0;
+
+        Type.Builder typeBuilder = new Type.Builder(RS, Element.I32(RS)).setX(dimX).setY(dimY).setZ(dimZ);
+        Allocation inputAllocation = Allocation.createTyped(RS, typeBuilder.create());
+        inputAllocation.copy3DRangeFrom(0, 0, 0, dimX, dimY, dimZ, inputArray);
+
+        final Int3 rsRslt = s.reduce_fz3(inputAllocation).get();
+        // Validate each coordinate; (-1, -1, -1) is the kernel's initial value and must fail, not throw.
+        final boolean inRange = rsRslt.x >= 0 && rsRslt.x < dimX && rsRslt.y >= 0 && rsRslt.y < dimY &&
+                rsRslt.z >= 0 && rsRslt.z < dimZ;
+        final int cellVal = inRange ? inputArray[rsRslt.x + dimX * rsRslt.y + dimX * dimY * rsRslt.z] : -1;
+        final boolean success = inRange && (cellVal == 0);
+        Log.i(TAG, "fz3: input[" + rsRslt.x + ", " + rsRslt.y + ", " + rsRslt.z + "] == " +
+                (inRange ? String.valueOf(cellVal) : "<out of range>") + ": " + (success ? "PASSED" : "FAILED"));
+        return success;
+    }
+
+    ///////////////////////////////////////////////////////////////////
+
+    private static final int histogramBucketCount = 256;
+
+    private long[] histogram(RenderScript RS, final byte[] inputArray) {
+        Allocation inputAllocation = Allocation.createSized(RS, Element.U8(RS), inputArray.length);
+        inputAllocation.copyFrom(inputArray);
+
+        Allocation outputAllocation = Allocation.createSized(RS, Element.U32(RS), histogramBucketCount);
+        // Reference result: let the histogram intrinsic count the bytes.
+        ScriptIntrinsicHistogram scriptHsg = ScriptIntrinsicHistogram.create(RS, Element.U8(RS));
+        scriptHsg.setOutput(outputAllocation);
+        scriptHsg.forEach(inputAllocation);
+        // The U32 buckets are copied out as Java ints, so the top bit reads as a sign bit.
+        int[] outputArrayMistyped = new int[histogramBucketCount];
+        outputAllocation.copyTo(outputArrayMistyped);
+        // Widen as unsigned. The mask must be the long literal: (long)0xffffffff is -1L, a no-op mask.
+        long[] outputArray = new long[histogramBucketCount];
+        for (int i = 0; i < histogramBucketCount; ++i)
+            outputArray[i] = outputArrayMistyped[i] & 0xffffffffL;
+        return outputArray;
+    }
+
+    private boolean histogram(RenderScript RS, ScriptC_reduce s) {
+        final byte[] samples = createInputArrayByte(100000, 11);
+
+        final long[] expected = histogram(RS, samples);
+        _RS_ASSERT("javaRslt unexpected length: " + expected.length, expected.length == histogramBucketCount);
+        final long[] actual = s.reduce_histogram(samples).get();
+        _RS_ASSERT("rsRslt unexpected length: " + actual.length, actual.length == histogramBucketCount);
+
+        // Only the first mismatching bucket is reported.
+        for (int bucket = 0; bucket != histogramBucketCount; bucket++) {
+            if (expected[bucket] == actual[bucket]) continue;
+            Log.i(TAG,
+                    "histogram[" + bucket + "]: java " + expected[bucket] + ", rs " + actual[bucket] + ": FAILED");
+            return false;
+        }
+
+        Log.i(TAG, "histogram: PASSED");
+        return true;
+    }
+
+    //-----------------------------------------------------------------
+
+    private Int2 mode(RenderScript RS, final byte[] inputArray) {
+        final long[] counts = histogram(RS, inputArray);
+        // Strict '>' keeps the lowest bucket index when several buckets tie.
+        int best = 0;
+        for (int bucket = 1; bucket < counts.length; bucket++)
+            if (counts[bucket] > counts[best]) best = bucket;
+        return new Int2(best, (int) counts[best]);
+    }
+
+    private boolean mode(RenderScript RS, ScriptC_reduce s) {
+        final byte[] samples = createInputArrayByte(100000, 12);
+        // Both results are (most frequent bucket index, that bucket's count).
+        final Int2 expected = mode(RS, samples);
+        final Int2 actual = s.reduce_mode(samples).get();
+
+        return result("mode", expected, actual);
+    }
+
+    ///////////////////////////////////////////////////////////////////
+
+    public void run() {
+        final RenderScript rs = RenderScript.create(mCtx);
+        final ScriptC_reduce script = new ScriptC_reduce(rs);
+        script.set_negInf(Float.NEGATIVE_INFINITY);
+        script.set_posInf(Float.POSITIVE_INFINITY);
+
+        final boolean[] outcomes = {  // every entry runs, in this order, even after a failure
+            addint1D(rs, script),
+            addint2D(rs, script),
+            dp(rs, script),
+            findMinAndMax(rs, script),
+            fz(rs, script),
+            fz2(rs, script),
+            fz3(rs, script),
+            histogram(rs, script),
+            mode(rs, script),
+        };
+        boolean allPassed = true;
+        for (final boolean ok : outcomes) allPassed &= ok;
+
+        rs.finish();
+        rs.destroy();
+        Log.i(TAG, allPassed ? "PASSED" : "FAILED");
+        if (allPassed) passTest();
+        else failTest();
+    }
+}
diff --git a/java/tests/RsTest/src/com/android/rs/test/reduce.rs b/java/tests/RsTest/src/com/android/rs/test/reduce.rs
new file mode 100644
index 0000000..1983bc0
--- /dev/null
+++ b/java/tests/RsTest/src/com/android/rs/test/reduce.rs
@@ -0,0 +1,157 @@
+#include "shared.rsh"
+
+float negInf, posInf;
+
+/////////////////////////////////////////////////////////////////////////
+
+#pragma rs reduce(addint) \
+  accumulator(aiAccum)
+// Accumulator: adds one input element to the running sum.
+static void aiAccum(int *accum, int val) { *accum = *accum + val; }
+
+/////////////////////////////////////////////////////////////////////////
+
+#pragma rs reduce(dp) \
+  accumulator(dpAccum) combiner(dpSum)
+// Dot product: the accumulator adds one in1*in2 term per pair of input elements.
+static void dpAccum(float *accum, float in1, float in2) {
+  *accum += in1*in2;
+}
+
+// combiner function: adds another partial sum (*val) into *accum
+static void dpSum(float *accum, const float *val) { *accum += *val; }
+
+/////////////////////////////////////////////////////////////////////////
+
+#pragma rs reduce(findMinAndMax) \
+  initializer(fMMInit) accumulator(fMMAccumulator) \
+  combiner(fMMCombiner) outconverter(fMMOutConverter)
+
+typedef struct {
+  float val;  // an input element's value
+  int idx;    // the x coordinate that element was read at
+} IndexedVal;
+
+typedef struct {
+  IndexedVal min, max;  // smallest / largest element recorded so far
+} MinAndMax;
+
+static void fMMInit(MinAndMax *accum) {
+  accum->min.val = posInf;
+  accum->max.val = negInf;
+  // -1 marks "no element recorded yet" for both indices.
+  accum->min.idx = accum->max.idx = -1;
+}
+
+static void fMMAccumulator(MinAndMax *accum, float in, int x) {
+  // Strict comparisons, so an equal value never displaces the current holder.
+  const IndexedVal candidate = { in, x };
+  if (candidate.val < accum->min.val) {
+    accum->min = candidate;
+  }
+  if (candidate.val > accum->max.val) {
+    accum->max = candidate;
+  }
+}
+
+// Merge by replaying the other accumulator's two extremes through fMMAccumulator.
+static void fMMCombiner(MinAndMax *accum, const MinAndMax *val) {
+  fMMAccumulator(accum, val->min.val, val->min.idx);
+  fMMAccumulator(accum, val->max.val, val->max.idx);
+}
+
+// Final result: x = index of the minimum, y = index of the maximum.
+static void fMMOutConverter(int2 *result, const MinAndMax *val) {
+  result->y = val->max.idx;
+  result->x = val->min.idx;
+}
+
+/////////////////////////////////////////////////////////////////////////
+
+#pragma rs reduce(fz) \
+  initializer(fzInit) \
+  accumulator(fzAccum) combiner(fzCombine)
+// The accumulator is the index of a zero element that was seen, or -1 if none was.
+static void fzInit(int *accumIdx) { *accumIdx = -1; }
+
+static void fzAccum(int *accumIdx, int inVal, int x /* special arg */) {
+  if (inVal != 0) return;
+  *accumIdx = x;
+}
+
+static void fzCombine(int *accumIdx, const int *accumIdx2) {
+  *accumIdx = (*accumIdx2 >= 0) ? *accumIdx2 : *accumIdx;  // adopt only a real hit
+}
+
+/////////////////////////////////////////////////////////////////////////
+
+#pragma rs reduce(fz2) \
+  initializer(fz2Init) \
+  accumulator(fz2Accum) combiner(fz2Combine)
+// The accumulator is the (x, y) of a zero element that was seen, or (-1, -1) if none was.
+static void fz2Init(int2 *accum) { accum->x = -1; accum->y = -1; }
+
+static void fz2Accum(int2 *accum,
+                     int inVal,
+                     int x /* special arg */,
+                     int y /* special arg */) {
+  if (inVal != 0)
+    return;
+  accum->x = x;
+  accum->y = y;
+}
+
+static void fz2Combine(int2 *accum, const int2 *accum2) {
+  if (accum2->x >= 0) { accum->x = accum2->x; accum->y = accum2->y; }  // adopt only a real hit
+}
+
+/////////////////////////////////////////////////////////////////////////
+
+#pragma rs reduce(fz3) \
+  initializer(fz3Init) \
+  accumulator(fz3Accum) combiner(fz3Combine)
+// The accumulator is the (x, y, z) of a zero element that was seen, or (-1, -1, -1) if none was.
+static void fz3Init(int3 *accum) { accum->x = accum->y = accum->z = -1; }
+
+static void fz3Accum(int3 *accum,
+                     int inVal,
+                     int x /* special arg */,
+                     int y /* special arg */,
+                     int z /* special arg */) {
+  if (inVal==0) {
+    accum->x = x;
+    accum->y = y;
+    accum->z = z;
+  }
+}
+// Adopt the other accumulator only when IT holds a hit (accum2->x >= 0), like fzCombine/fz2Combine.
+static void fz3Combine(int3 *accum, const int3 *accum2) {
+  if (accum2->x >= 0) *accum = *accum2;
+}
+
+/////////////////////////////////////////////////////////////////////////
+
+#pragma rs reduce(histogram) \
+  accumulator(hsgAccum) combiner(hsgCombine)
+
+#define BUCKETS 256
+typedef uint32_t Histogram[BUCKETS];
+// Count one more occurrence of the input value; the value itself is the bucket index.
+static void hsgAccum(Histogram *h, uchar in) { (*h)[in] += 1; }
+
+static void hsgCombine(Histogram *accum, const Histogram *addend) {
+  for (int bucket = 0; bucket != BUCKETS; bucket++)
+    (*accum)[bucket] = (*accum)[bucket] + (*addend)[bucket];
+}
+
+#pragma rs reduce(mode) \
+  accumulator(hsgAccum) combiner(hsgCombine) \
+  outconverter(modeOutConvert)
+// Reports x = fullest bucket (lowest index on ties), y = that bucket's count.
+static void modeOutConvert(int2 *result, const Histogram *h) {
+  uint32_t best = 0;
+  for (int bucket = 1; bucket < BUCKETS; ++bucket)
+    if ((*h)[best] < (*h)[bucket]) best = bucket;
+  result->y = (*h)[best];
+  result->x = best;
+}
diff --git a/rs.spec b/rs.spec
index 6e2dc4a..45f960c 100644
--- a/rs.spec
+++ b/rs.spec
@@ -407,6 +407,14 @@
     param const RsScriptCall * sc
 }
 
+ScriptReduceNew {
+    param RsScript s
+    param uint32_t slot
+    param RsAllocation * ains
+    param RsAllocation aout
+    param const RsScriptCall * sc
+}
+
 ScriptSetVarI {
     param RsScript s
     param uint32_t slot
diff --git a/rsDriverLoader.cpp b/rsDriverLoader.cpp
index b39f4f3..426c519 100644
--- a/rsDriverLoader.cpp
+++ b/rsDriverLoader.cpp
@@ -71,6 +71,7 @@
     ret &= fn(RS_HAL_SCRIPT_INVOKE_ROOT, (void **)&rsc->mHal.funcs.script.invokeRoot);
     ret &= fn(RS_HAL_SCRIPT_INVOKE_FOR_EACH, (void **)&rsc->mHal.funcs.script.invokeForEach);
     ret &= fn(RS_HAL_SCRIPT_INVOKE_REDUCE, (void **)&rsc->mHal.funcs.script.invokeReduce);
+    ret &= fn(RS_HAL_SCRIPT_INVOKE_REDUCE_NEW, (void **)&rsc->mHal.funcs.script.invokeReduceNew);
     ret &= fn(RS_HAL_SCRIPT_INVOKE_INIT, (void **)&rsc->mHal.funcs.script.invokeInit);
     ret &= fn(RS_HAL_SCRIPT_INVOKE_FREE_CHILDREN, (void **)&rsc->mHal.funcs.script.invokeFreeChildren);
     ret &= fn(RS_HAL_SCRIPT_SET_GLOBAL_VAR, (void **)&rsc->mHal.funcs.script.setGlobalVar);
@@ -266,6 +267,3 @@
 
     return true;
 }
-
-
-
diff --git a/rsScript.cpp b/rsScript.cpp
index bc24292..bf28328 100644
--- a/rsScript.cpp
+++ b/rsScript.cpp
@@ -232,6 +232,18 @@
                             static_cast<Allocation *>(vaout), sc);
 }
 
+void rsi_ScriptReduceNew(Context *rsc, RsScript vs, uint32_t slot,
+                         RsAllocation *vains, size_t inLen,
+                         RsAllocation vaout, const RsScriptCall *sc,
+                         size_t scLen) {
+    // Convert the opaque handles to their typed objects and forward; scLen is not used here.
+    Script *s = static_cast<Script *>(vs);
+    Allocation **ains = (Allocation**)(vains);
+    s->runReduceNew(rsc, slot,
+                    const_cast<const Allocation **>(ains), inLen,
+                    static_cast<Allocation *>(vaout), sc);
+}
+
 void rsi_ScriptInvoke(Context *rsc, RsScript vs, uint32_t slot) {
     Script *s = static_cast<Script *>(vs);
     s->Invoke(rsc, slot, nullptr, 0);
diff --git a/rsScript.h b/rsScript.h
index bd6622d..c3241ab 100644
--- a/rsScript.h
+++ b/rsScript.h
@@ -86,6 +86,7 @@
             size_t exportedVariableCount;
             size_t exportedForEachCount;
             size_t exportedReduceCount;
+            size_t exportedReduceNewCount;
             size_t exportedFunctionCount;
             size_t exportedPragmaCount;
             char const **exportedPragmaKeyList;
@@ -135,6 +136,10 @@
     virtual void runReduce(Context *rsc, uint32_t slot, const Allocation *ain,
                            Allocation *aout, const RsScriptCall *sc) = 0;
 
+    virtual void runReduceNew(Context *rsc, uint32_t slot,  // slot: general reduction kernel index
+                              const Allocation **ains, size_t inLen,
+                              Allocation *aout, const RsScriptCall *sc) = 0;
+
     virtual void Invoke(Context *rsc, uint32_t slot, const void *data, size_t len) = 0;
     virtual void setupScript(Context *rsc) = 0;
     virtual uint32_t run(Context *) = 0;
diff --git a/rsScriptC.cpp b/rsScriptC.cpp
index 8a13c89..f643093 100644
--- a/rsScriptC.cpp
+++ b/rsScriptC.cpp
@@ -245,7 +245,7 @@
     ATRACE_CALL();
 
     if (slot >= mHal.info.exportedReduceCount) {
-        rsc->setError(RS_ERROR_BAD_SCRIPT, "The reduce kernel index is out of bounds");
+        rsc->setError(RS_ERROR_BAD_SCRIPT, "The simple reduce kernel index is out of bounds");
         return;
     }
     if (mRSC->hadFatalError()) return;
@@ -259,6 +259,27 @@
     rsc->mHal.funcs.script.invokeReduce(rsc, this, slot, ain, aout, sc);
 }
 
+void ScriptC::runReduceNew(Context *rsc, uint32_t slot,
+                           const Allocation ** ains, size_t inLen,
+                           Allocation *aout, const RsScriptCall *sc) {
+    // TODO: Record the name of the kernel in the tracing information.
+    ATRACE_CALL();
+
+    // Reject slots beyond the general reduction kernels recorded for this script.
+    if (slot >= mHal.info.exportedReduceNewCount) {
+        rsc->setError(RS_ERROR_BAD_SCRIPT, "The general reduce kernel index is out of bounds");
+        return;
+    }
+    if (mRSC->hadFatalError()) return;
+
+    setupScript(rsc);
+    if (rsc->props.mLogScripts) {
+        ALOGV("%p ScriptC::runReduceNew invoking slot %i, ptr %p", rsc, slot, this);
+    }
+    // Hand the launch to the driver through the HAL function table.
+    rsc->mHal.funcs.script.invokeReduceNew(rsc, this, slot, ains, inLen, aout, sc);
+}
+
 void ScriptC::Invoke(Context *rsc, uint32_t slot, const void *data, size_t len) {
     ATRACE_CALL();
 
diff --git a/rsScriptC.h b/rsScriptC.h
index bc94010..c8881a4 100644
--- a/rsScriptC.h
+++ b/rsScriptC.h
@@ -34,21 +34,25 @@
     ScriptC(Context *);
     virtual ~ScriptC();
 
-    virtual void Invoke(Context *rsc, uint32_t slot, const void *data, size_t len);
+    void Invoke(Context *rsc, uint32_t slot, const void *data, size_t len) override;
 
     virtual uint32_t run(Context *);
 
-    virtual void runForEach(Context *rsc,
-                            uint32_t slot,
-                            const Allocation ** ains,
-                            size_t inLen,
-                            Allocation * aout,
-                            const void * usr,
-                            size_t usrBytes,
-                            const RsScriptCall *sc = nullptr);
+    void runForEach(Context *rsc,
+                    uint32_t slot,
+                    const Allocation ** ains,
+                    size_t inLen,
+                    Allocation * aout,
+                    const void * usr,
+                    size_t usrBytes,
+                    const RsScriptCall *sc = nullptr) override;
 
-    virtual void runReduce(Context *rsc, uint32_t slot, const Allocation *ain,
-                           Allocation *aout, const RsScriptCall *sc);
+    void runReduce(Context *rsc, uint32_t slot, const Allocation *ain,
+                   Allocation *aout, const RsScriptCall *sc) override;
+
+    void runReduceNew(Context *rsc, uint32_t slot,  // defined in rsScriptC.cpp
+                      const Allocation ** ains, size_t inLen,
+                      Allocation *aout, const RsScriptCall *sc) override;
 
     virtual void serialize(Context *rsc, OStream *stream) const {    }
     virtual RsA3DClassID getClassId() const { return RS_A3D_CLASS_ID_SCRIPT_C; }
diff --git a/rsScriptIntrinsic.cpp b/rsScriptIntrinsic.cpp
index 223e93b..6e0f6ae 100644
--- a/rsScriptIntrinsic.cpp
+++ b/rsScriptIntrinsic.cpp
@@ -72,6 +72,10 @@
                                 Allocation *aout, const RsScriptCall *sc) {
 }
 
+void ScriptIntrinsic::runReduceNew(Context *rsc, uint32_t slot,
+                                   const Allocation ** ains, size_t inLen,
+                                   Allocation *aout, const RsScriptCall *sc) {
+}  // empty body: this override does nothing, like runReduce above it
 
 void ScriptIntrinsic::Invoke(Context *rsc, uint32_t slot, const void *data, size_t len) {
 }
diff --git a/rsScriptIntrinsic.h b/rsScriptIntrinsic.h
index fd48bdf..e2b04b8 100644
--- a/rsScriptIntrinsic.h
+++ b/rsScriptIntrinsic.h
@@ -52,6 +52,10 @@
     void runReduce(Context *rsc, uint32_t slot, const Allocation *ain,
                    Allocation *aout, const RsScriptCall *sc) override;
 
+    void runReduceNew(Context *rsc, uint32_t slot,  // the definition in rsScriptIntrinsic.cpp is empty
+                      const Allocation ** ains, size_t inLen,
+                      Allocation *aout, const RsScriptCall *sc) override;
+
     void Invoke(Context *rsc, uint32_t slot, const void *data, size_t len) override;
     void setupScript(Context *rsc) override;
     uint32_t run(Context *) override;
diff --git a/rs_hal.h b/rs_hal.h
index 390e90d..2f3aa1a 100644
--- a/rs_hal.h
+++ b/rs_hal.h
@@ -154,6 +154,11 @@
                              uint32_t slot, const Allocation *ain,
                              Allocation *aout,
                              const RsScriptCall *sc);
+        void (*invokeReduceNew)(const Context *rsc, Script *s,  // driver hook, RS_HAL_SCRIPT_INVOKE_REDUCE_NEW
+                                uint32_t slot,
+                                const Allocation ** ains, size_t inLen,
+                                Allocation *aout,
+                                const RsScriptCall *sc);
         void (*invokeInit)(const Context *rsc, Script *s);
         void (*invokeFreeChildren)(const Context *rsc, Script *s);
 
@@ -386,6 +391,7 @@
     RS_HAL_SCRIPT_INVOKE_FOR_EACH_MULTI                     = 1013,
     RS_HAL_SCRIPT_UPDATE_CACHED_OBJECT                      = 1014,
     RS_HAL_SCRIPT_INVOKE_REDUCE                             = 1015,
+    RS_HAL_SCRIPT_INVOKE_REDUCE_NEW                         = 1016,
 
     RS_HAL_ALLOCATION_INIT                                  = 2000,
     RS_HAL_ALLOCATION_INIT_ADAPTER                          = 2001,