Support for general reduction kernels.
Requires coordinated change in frameworks/base.
Requires coordinated change in frameworks/compile/libbcc in order
for RsTest to run.
At present, general reduction kernels are run single-threaded.
Also: Remove dead struct field MTLaunchStructForEach::sig.
Bug: 23535724
Change-Id: Ice17ccf20a902f8a106eaa62ec071d46e3c0ad8c
diff --git a/cpu_ref/rsCpuCore.cpp b/cpu_ref/rsCpuCore.cpp
index 48e8dbb..b8b4838 100644
--- a/cpu_ref/rsCpuCore.cpp
+++ b/cpu_ref/rsCpuCore.cpp
@@ -342,6 +342,18 @@
}
}
+// Point every input slot of the kernel driver info structure at cell (x, y, z)
+// of the corresponding input allocation. Only redp->inPtr[] is written; output
+// pointers are left as they are.
+// Inputs:
+//   mtls    - The MTLaunchStruct whose ains[] supplies the input allocations
+//   redp    - The reduce parameters (driver info structure); inPtr[0..inLen-1] are overwritten
+//   x, y, z - The coordinates handed to getPointerUnchecked() for each input
+static inline void RedpPtrSetup(const MTLaunchStructReduceNew *mtls, RsExpandKernelDriverInfo *redp,
+                                uint32_t x, uint32_t y, uint32_t z) {
+    const uint32_t inputCount = redp->inLen;
+    for (uint32_t slot = 0; slot != inputCount; ++slot) {
+        const Allocation *input = mtls->ains[slot];
+        redp->inPtr[slot] = (const uint8_t *)input->getPointerUnchecked(x, y, z);
+    }
+}
+
static uint32_t sliceInt(uint32_t *p, uint32_t val, uint32_t start, uint32_t end) {
if (start >= end) {
*p = start;
@@ -355,16 +367,16 @@
return n;
}
-static bool SelectOuterSlice(const MTLaunchStructForEach *mtls, RsExpandKernelDriverInfo* fep, uint32_t sliceNum) {
+static bool SelectOuterSlice(const MTLaunchStructCommon *mtls, RsExpandKernelDriverInfo* info, uint32_t sliceNum) {
uint32_t r = sliceNum;
- r = sliceInt(&fep->current.z, r, mtls->start.z, mtls->end.z);
- r = sliceInt(&fep->current.lod, r, mtls->start.lod, mtls->end.lod);
- r = sliceInt(&fep->current.face, r, mtls->start.face, mtls->end.face);
- r = sliceInt(&fep->current.array[0], r, mtls->start.array[0], mtls->end.array[0]);
- r = sliceInt(&fep->current.array[1], r, mtls->start.array[1], mtls->end.array[1]);
- r = sliceInt(&fep->current.array[2], r, mtls->start.array[2], mtls->end.array[2]);
- r = sliceInt(&fep->current.array[3], r, mtls->start.array[3], mtls->end.array[3]);
+ r = sliceInt(&info->current.z, r, mtls->start.z, mtls->end.z);
+ r = sliceInt(&info->current.lod, r, mtls->start.lod, mtls->end.lod);
+ r = sliceInt(&info->current.face, r, mtls->start.face, mtls->end.face);
+ r = sliceInt(&info->current.array[0], r, mtls->start.array[0], mtls->end.array[0]);
+ r = sliceInt(&info->current.array[1], r, mtls->start.array[1], mtls->end.array[1]);
+ r = sliceInt(&info->current.array[2], r, mtls->start.array[2], mtls->end.array[2]);
+ r = sliceInt(&info->current.array[3], r, mtls->start.array[3], mtls->end.array[3]);
return r == 0;
}
@@ -446,7 +458,7 @@
}
}
-// Launch a reduce-style kernel.
+// Launch a simple reduce-style kernel.
// Inputs:
// ain: The allocation that contains the input
// aout: The allocation that will hold the output
@@ -465,6 +477,50 @@
mtls->kernel(&mtls->inBuf[startOffset], mtls->outBuf, xEnd - xStart);
}
+// Launch a general reduce-style kernel.
+// The kernel is run inline by this function: initialize the accumulator, feed
+// it one row of inputs at a time, then (optionally) convert it to the output.
+// Inputs:
+//   ains[0..inLen-1]: Array of allocations that contain the inputs
+//   aout:             The allocation that will hold the output
+//   mtls:             Holds launch parameters
+// Note: the body reads its inputs and output through mtls (mtls->ains and
+// mtls->redp.outPtr[0]); the ains, inLen and aout arguments are not consulted.
+void RsdCpuReferenceImpl::launchReduceNew(const Allocation ** ains,
+                                          uint32_t inLen,
+                                          Allocation * aout,
+                                          MTLaunchStructReduceNew *mtls) {
+    // In the presence of outconverter, we allocate temporary memory for
+    // the accumulator.
+    //
+    // In the absence of outconverter, we use the output allocation as the
+    // accumulator.
+    uint8_t *const accumPtr = (mtls->outFunc
+                               ? static_cast<uint8_t *>(malloc(mtls->accumSize))
+                               : mtls->redp.outPtr[0]);
+    if (accumPtr == nullptr) {
+        // malloc() failed (or there is no output buffer). Every step below
+        // writes through accumPtr, so bail out instead of dereferencing null.
+        ALOGE("launchReduceNew: no accumulator storage (%zu bytes requested)",
+              mtls->accumSize);
+        return;
+    }
+
+    // initialize: use the user initializer if there is one, otherwise zero-fill
+    if (mtls->initFunc) {
+        mtls->initFunc(accumPtr);
+    } else {
+        memset(accumPtr, 0, mtls->accumSize);
+    }
+
+    // accumulate: walk every outer slice, and within it every row y; the
+    // accumulator function itself covers x in [start.x, end.x)
+    const ReduceNewAccumulatorFunc_t fn = mtls->accumFunc;
+    uint32_t slice = 0;
+    while (SelectOuterSlice(mtls, &mtls->redp, slice++)) {
+        for (mtls->redp.current.y = mtls->start.y;
+             mtls->redp.current.y < mtls->end.y;
+             mtls->redp.current.y++) {
+            RedpPtrSetup(mtls, &mtls->redp, mtls->start.x, mtls->redp.current.y, mtls->redp.current.z);
+            fn(&mtls->redp, mtls->start.x, mtls->end.x, accumPtr);
+        }
+    }
+
+    // outconvert: only in this case was the accumulator malloc'ed above
+    if (mtls->outFunc) {
+        mtls->outFunc(mtls->redp.outPtr[0], accumPtr);
+        free(accumPtr);
+    }
+}
+
void RsdCpuReferenceImpl::launchForEach(const Allocation ** ains,
uint32_t inLen,
Allocation* aout,
diff --git a/cpu_ref/rsCpuCore.h b/cpu_ref/rsCpuCore.h
index cfdb29a..939b7ae 100644
--- a/cpu_ref/rsCpuCore.h
+++ b/cpu_ref/rsCpuCore.h
@@ -33,11 +33,21 @@
// Function types found in RenderScript code
typedef void (*ReduceFunc_t)(const uint8_t *inBuf, uint8_t *outBuf, uint32_t len);
+typedef void (*ReduceNewAccumulatorFunc_t)(const RsExpandKernelDriverInfo *info, uint32_t x1, uint32_t x2, uint8_t *accum);
+typedef void (*ReduceNewInitializerFunc_t)(uint8_t *accum);
+typedef void (*ReduceNewOutConverterFunc_t)(uint8_t *out, const uint8_t *accum);
typedef void (*ForEachFunc_t)(const RsExpandKernelDriverInfo *info, uint32_t x1, uint32_t x2, uint32_t outStride);
typedef void (*InvokeFunc_t)(void *params);
typedef void (*InitOrDtorFunc_t)(void);
typedef int (*RootFunc_t)(void);
+struct ReduceNewDescription {  // one general reduce kernel, as resolved while reading the executable's info
+ ReduceNewAccumulatorFunc_t accumFunc; // expanded accumulator function (symbol looked up with an ".expand" suffix)
+ ReduceNewInitializerFunc_t initFunc; // user initializer function; nullptr when the kernel names none
+ ReduceNewOutConverterFunc_t outFunc; // user outconverter function; nullptr when the kernel names none
+ size_t accumSize; // accumulator datum size, in bytes
+};
+
// Internal driver callback used to execute a kernel
typedef void (*WorkerCallback_t)(void *usr, uint32_t idx);
@@ -72,7 +82,6 @@
RsExpandKernelDriverInfo fep;
ForEachFunc_t kernel;
- uint32_t sig;
const Allocation *ains[RS_KERNEL_INPUT_LIMIT];
Allocation *aout[RS_KERNEL_INPUT_LIMIT];
};
@@ -84,6 +93,19 @@
RsLaunchDimensions inputDim;
};
+struct MTLaunchStructReduceNew : public MTLaunchStructCommon {  // launch state for one general reduce kernel invocation
+ // Driver info structure; a pointer to it is passed to the accumulator function
+ RsExpandKernelDriverInfo redp;
+
+ const Allocation *ains[RS_KERNEL_INPUT_LIMIT];  // input allocations; the first redp.inLen entries are used
+
+ ReduceNewAccumulatorFunc_t accumFunc;  // expanded accumulator, called once per row of inputs
+ ReduceNewInitializerFunc_t initFunc;  // optional; when nullptr the accumulator is zero-filled instead
+ ReduceNewOutConverterFunc_t outFunc;  // optional; when nullptr the output buffer itself is the accumulator
+
+ size_t accumSize; // accumulator datum size in bytes
+};
+
class RsdCpuReferenceImpl : public RsdCpuReference {
public:
~RsdCpuReferenceImpl() override;
@@ -107,10 +129,14 @@
void launchForEach(const Allocation **ains, uint32_t inLen, Allocation *aout,
const RsScriptCall *sc, MTLaunchStructForEach *mtls);
- // Launch a reduce kernel
+ // Launch a simple reduce kernel
void launchReduce(const Allocation *ain, Allocation *aout,
MTLaunchStructReduce *mtls);
+ // Launch a general reduce kernel
+ void launchReduceNew(const Allocation ** ains, uint32_t inLen, Allocation *aout,
+ MTLaunchStructReduceNew *mtls);
+
CpuScript * createScript(const ScriptC *s, char const *resName, char const *cacheDir,
uint8_t const *bitcode, size_t bitcodeSize, uint32_t flags) override;
CpuScript * createIntrinsic(const Script *s, RsScriptIntrinsicID iid, Element *e) override;
diff --git a/cpu_ref/rsCpuExecutable.cpp b/cpu_ref/rsCpuExecutable.cpp
index d654743..5dd31ee 100644
--- a/cpu_ref/rsCpuExecutable.cpp
+++ b/cpu_ref/rsCpuExecutable.cpp
@@ -319,6 +319,7 @@
ForEachFunc_t* forEachFunctions = nullptr;
uint32_t* forEachSignatures = nullptr;
ReduceFunc_t* reduceFunctions = nullptr;
+ ReduceNewDescription* reduceNewDescriptions = nullptr;
const char ** pragmaKeys = nullptr;
const char ** pragmaValues = nullptr;
uint32_t checksum = 0;
@@ -485,7 +486,7 @@
}
}
- // Read general reduce kernels (for now, we expect the count to be zero)
+ // Read general reduce kernels
if (strgets(line, MAXLINE, &rsInfo) == nullptr) {
goto error;
}
@@ -493,11 +494,91 @@
ALOGE("Invalid export reduce new count!: %s", line);
goto error;
}
- if (reduceNewCount != 0) {
- ALOGE("Expected export reduce new count to be zero!: %s", line);
+
+ reduceNewDescriptions = new ReduceNewDescription[reduceNewCount];
+ if (reduceNewDescriptions == nullptr) {
goto error;
}
+ for (size_t i = 0; i < reduceNewCount; ++i) {
+ static const char kNoName[] = ".";
+
+ unsigned int tmpSig = 0;
+ size_t tmpSize = 0;
+ char tmpNameReduce[MAXLINE];
+ char tmpNameInitializer[MAXLINE];
+ char tmpNameAccumulator[MAXLINE];
+ char tmpNameCombiner[MAXLINE];
+ char tmpNameOutConverter[MAXLINE];
+ char tmpNameHalter[MAXLINE];
+
+ if (strgets(line, MAXLINE, &rsInfo) == nullptr) {
+ goto error;
+ }
+#define DELIMNAME " - %" MAKE_STR(MAXLINE) "s"
+ if (sscanf(line, "%u - %zu" DELIMNAME DELIMNAME DELIMNAME DELIMNAME DELIMNAME DELIMNAME,
+ &tmpSig, &tmpSize, tmpNameReduce, tmpNameInitializer, tmpNameAccumulator,
+ tmpNameCombiner, tmpNameOutConverter, tmpNameHalter) != 8) {
+ ALOGE("Invalid export reduce new!: %s", line);
+ goto error;
+ }
+#undef DELIMNAME
+
+ // For now, we expect
+ // - Reduce and Accumulator names
+ // - optional Initializer, Combiner, and OutConverter name
+ // - no Halter name
+ if (!strcmp(tmpNameReduce, kNoName) ||
+ !strcmp(tmpNameAccumulator, kNoName)) {
+ ALOGE("Expected reduce and accumulator names!: %s", line);
+ goto error;
+ }
+ if (strcmp(tmpNameHalter, kNoName)) {
+ ALOGE("Did not expect halter name!: %s", line);
+ goto error;
+ }
+
+ // The current implementation does not use the signature,
+ // reduce name, or combiner.
+
+ reduceNewDescriptions[i].accumSize = tmpSize;
+
+ // Process the (optional) initializer.
+ if (strcmp(tmpNameInitializer, kNoName)) {
+ // Lookup the original user-written initializer.
+ if (!(reduceNewDescriptions[i].initFunc =
+ (ReduceNewInitializerFunc_t) dlsym(sharedObj, tmpNameInitializer))) {
+ ALOGE("Failed to find initializer function address for %s(): %s",
+ tmpNameInitializer, dlerror());
+ goto error;
+ }
+ } else {
+ reduceNewDescriptions[i].initFunc = nullptr;
+ }
+
+ // Lookup the expanded accumulator.
+ strncat(tmpNameAccumulator, ".expand", MAXLINE-1-strlen(tmpNameAccumulator));
+ if (!(reduceNewDescriptions[i].accumFunc =
+ (ReduceNewAccumulatorFunc_t) dlsym(sharedObj, tmpNameAccumulator))) {
+ ALOGE("Failed to find accumulator function address for %s(): %s",
+ tmpNameAccumulator, dlerror());
+ goto error;
+ }
+
+ // Process the (optional) outconverter.
+ if (strcmp(tmpNameOutConverter, kNoName)) {
+ // Lookup the original user-written outconverter.
+ if (!(reduceNewDescriptions[i].outFunc =
+ (ReduceNewOutConverterFunc_t) dlsym(sharedObj, tmpNameOutConverter))) {
+ ALOGE("Failed to find outconverter function address for %s(): %s",
+ tmpNameOutConverter, dlerror());
+ goto error;
+ }
+ } else {
+ reduceNewDescriptions[i].outFunc = nullptr;
+ }
+ }
+
if (strgets(line, MAXLINE, &rsInfo) == nullptr) {
goto error;
}
@@ -631,6 +712,7 @@
invokeFunctions, funcCount,
forEachFunctions, forEachSignatures, forEachCount,
reduceFunctions, reduceCount,
+ reduceNewDescriptions, reduceNewCount,
pragmaKeys, pragmaValues, pragmaCount,
rsGlobalNames, rsGlobalAddresses, rsGlobalSizes, rsGlobalProperties,
numEntries, isThreadable, checksum);
diff --git a/cpu_ref/rsCpuExecutable.h b/cpu_ref/rsCpuExecutable.h
index fe9c2ad..6e4d325 100644
--- a/cpu_ref/rsCpuExecutable.h
+++ b/cpu_ref/rsCpuExecutable.h
@@ -69,6 +69,7 @@
ForEachFunc_t* forEachFunctions, uint32_t* forEachSignatures,
size_t forEachCount,
ReduceFunc_t* reduceFunctions, size_t reduceCount,
+ ReduceNewDescription *reduceNewDescriptions, size_t reduceNewCount,
const char** pragmaKeys, const char** pragmaValues,
size_t pragmaCount,
const char **globalNames, const void **globalAddresses,
@@ -81,6 +82,7 @@
mForEachFunctions(forEachFunctions), mForEachSignatures(forEachSignatures),
mForEachCount(forEachCount),
mReduceFunctions(reduceFunctions), mReduceCount(reduceCount),
+ mReduceNewDescriptions(reduceNewDescriptions), mReduceNewCount(reduceNewCount),
mPragmaKeys(pragmaKeys), mPragmaValues(pragmaValues),
mPragmaCount(pragmaCount), mGlobalNames(globalNames),
mGlobalAddresses(globalAddresses), mGlobalSizes(globalSizes),
@@ -109,6 +111,8 @@
delete[] mReduceFunctions;
+ delete[] mReduceNewDescriptions;
+
delete[] mForEachSignatures;
delete[] mForEachFunctions;
@@ -134,6 +138,7 @@
size_t getExportedFunctionCount() const { return mFuncCount; }
size_t getExportedForEachCount() const { return mForEachCount; }
size_t getExportedReduceCount() const { return mReduceCount; }
+ size_t getExportedReduceNewCount() const { return mReduceNewCount; }  // number of general reduce kernel descriptions given to the constructor
size_t getPragmaCount() const { return mPragmaCount; }
void* getFieldAddress(int slot) const { return mFieldAddress[slot]; }
@@ -148,6 +153,10 @@
ReduceFunc_t getReduceFunction(int slot) const { return mReduceFunctions[slot]; }
+    // Returns the description of the general reduce kernel in the given slot.
+    // The slot is not range-checked here; callers are expected to stay below
+    // getExportedReduceNewCount().
+    const ReduceNewDescription* getReduceNewDescription(int slot) const {
+        return mReduceNewDescriptions + slot;
+    }
+
const char ** getPragmaKeys() const { return mPragmaKeys; }
const char ** getPragmaValues() const { return mPragmaValues; }
@@ -203,6 +212,9 @@
ReduceFunc_t* mReduceFunctions;
size_t mReduceCount;
+ ReduceNewDescription* mReduceNewDescriptions;
+ size_t mReduceNewCount;
+
const char ** mPragmaKeys;
const char ** mPragmaValues;
size_t mPragmaCount;
diff --git a/cpu_ref/rsCpuScript.cpp b/cpu_ref/rsCpuScript.cpp
index 5adca54..7308b54 100644
--- a/cpu_ref/rsCpuScript.cpp
+++ b/cpu_ref/rsCpuScript.cpp
@@ -497,6 +497,7 @@
// Copy info over to runtime
script->mHal.info.exportedFunctionCount = mScriptExec->getExportedFunctionCount();
script->mHal.info.exportedReduceCount = mScriptExec->getExportedReduceCount();
+ script->mHal.info.exportedReduceNewCount = mScriptExec->getExportedReduceNewCount();
script->mHal.info.exportedForEachCount = mScriptExec->getExportedForEachCount();
script->mHal.info.exportedVariableCount = mScriptExec->getExportedVariableCount();
script->mHal.info.exportedPragmaCount = mScriptExec->getPragmaCount();;
@@ -553,7 +554,7 @@
return true;
}
-// Preliminary work to prepare a reduce-style kernel for launch.
+// Preliminary work to prepare a simple reduce-style kernel for launch.
bool RsdCpuScriptImpl::reduceMtlsSetup(const Allocation *ain,
const Allocation *aout,
const RsScriptCall *sc,
@@ -591,6 +592,77 @@
return true;
}
+// Preliminary work to prepare a general reduce-style kernel for launch.
+// Zeroes *mtls, validates the input and output allocations, then fills in the
+// launch dimensions and the driver info structure (mtls->redp).
+// Inputs:
+//   ains[0..inLen-1]: input allocations; all must have the same dimensions as ains[0]
+//   aout:             output allocation
+//   sc:               launch options, forwarded to setUpMtlsDimensions()
+//   mtls:             launch structure to initialize
+// Returns true when the launch may proceed. Returns false otherwise; for the
+// problems detected in this function an error is recorded on the context first.
+bool RsdCpuScriptImpl::reduceNewMtlsSetup(const Allocation ** ains,
+                                          uint32_t inLen,
+                                          const Allocation * aout,
+                                          const RsScriptCall *sc,
+                                          MTLaunchStructReduceNew *mtls) {
+    rsAssert(ains && (inLen >= 1) && aout);
+    memset(mtls, 0, sizeof(MTLaunchStructReduceNew));
+    mtls->dimPtr = &mtls->redp.dim;
+
+    // mtls->ains holds RS_KERNEL_INPUT_LIMIT entries and is filled from ains
+    // below; refuse a larger inLen rather than writing past the array.
+    if (inLen > RS_KERNEL_INPUT_LIMIT) {
+        mCtx->getContext()->setError(RS_ERROR_BAD_SCRIPT,
+                                     "reduce called with too many in allocations");
+        return false;
+    }
+
+    for (int index = inLen; --index >= 0;) {
+        if (allocationLODIsNull(ains[index])) {
+            mCtx->getContext()->setError(RS_ERROR_BAD_SCRIPT,
+                                         "reduce called with null in allocations");
+            return false;
+        }
+    }
+
+    if (allocationLODIsNull(aout)) {
+        mCtx->getContext()->setError(RS_ERROR_BAD_SCRIPT,
+                                     "reduce called with null out allocation");
+        return false;
+    }
+
+    // The launch dimensions come from the first input; every other input
+    // must match it.
+    const Allocation *ain0 = ains[0];
+    const Type *inType = ain0->getType();
+
+    mtls->redp.dim.x = inType->getDimX();
+    mtls->redp.dim.y = inType->getDimY();
+    mtls->redp.dim.z = inType->getDimZ();
+
+    for (int Index = inLen; --Index >= 1;) {
+        if (!ain0->hasSameDims(ains[Index])) {
+            // Adjacent literals are concatenated, so the separating space
+            // must be spelled out.
+            mCtx->getContext()->setError(RS_ERROR_BAD_SCRIPT,
+                                         "Failed to launch reduction kernel; "
+                                         "dimensions of input allocations do not match.");
+            return false;
+        }
+    }
+
+    if (!setUpMtlsDimensions(mtls, mtls->redp.dim, sc)) {
+        return false;
+    }
+
+    // The X & Y walkers always want 0-1 min even if dim is not present
+    mtls->end.x = rsMax((uint32_t)1, mtls->end.x);
+    mtls->end.y = rsMax((uint32_t)1, mtls->end.y);
+
+    mtls->rs = mCtx;
+
+    // Currently not threaded.
+    mtls->isThreadable = false;
+    mtls->mSliceNum = -1;
+
+    // Set up output.
+    mtls->redp.outLen = 1;
+    mtls->redp.outPtr[0] = (uint8_t *)aout->mHal.drvState.lod[0].mallocPtr;
+    mtls->redp.outStride[0] = aout->getType()->getElementSizeBytes();
+
+    // Set up input.
+    memcpy(mtls->ains, ains, inLen * sizeof(ains[0]));
+    mtls->redp.inLen = inLen;
+    for (int index = inLen; --index >= 0;) {
+        mtls->redp.inPtr[index] = (const uint8_t*)ains[index]->mHal.drvState.lod[0].mallocPtr;
+        mtls->redp.inStride[index] = ains[index]->getType()->getElementSizeBytes();
+    }
+
+    // All validation passed, ok to launch threads
+    return true;
+}
+
// Preliminary work to prepare a forEach-style kernel for launch.
bool RsdCpuScriptImpl::forEachMtlsSetup(const Allocation ** ains,
uint32_t inLen,
@@ -626,13 +698,11 @@
for (int Index = inLen; --Index >= 1;) {
if (!ain0->hasSameDims(ains[Index])) {
mCtx->getContext()->setError(RS_ERROR_BAD_SCRIPT,
- "Failed to launch kernel; dimensions of input and output"
+ "Failed to launch kernel; dimensions of input"
"allocations do not match.");
-
return false;
}
}
-
} else if (aout != nullptr) {
const Type *outType = aout->getType();
@@ -729,12 +799,25 @@
}
}
+// Run the general reduce kernel in the given slot over ains[0..inLen-1],
+// writing the result to aout. Nothing is launched if launch setup fails.
+void RsdCpuScriptImpl::invokeReduceNew(uint32_t slot,
+                                       const Allocation ** ains, uint32_t inLen,
+                                       Allocation *aout,
+                                       const RsScriptCall *sc) {
+    MTLaunchStructReduceNew launch;
+
+    // Setup validates the arguments; bail out if it rejects them.
+    if (!reduceNewMtlsSetup(ains, inLen, aout, sc, &launch)) {
+        return;
+    }
+    reduceNewKernelSetup(slot, &launch);
+
+    // Install this script through setTLS() for the duration of the launch,
+    // then put back the value setTLS() handed us.
+    RsdCpuScriptImpl *const previous = mCtx->setTLS(this);
+    mCtx->launchReduceNew(ains, inLen, aout, &launch);
+    mCtx->setTLS(previous);
+}
+
void RsdCpuScriptImpl::forEachKernelSetup(uint32_t slot, MTLaunchStructForEach *mtls) {
mtls->script = this;
mtls->fep.slot = slot;
mtls->kernel = mScriptExec->getForEachFunction(slot);
rsAssert(mtls->kernel != nullptr);
- mtls->sig = mScriptExec->getForEachSignature(slot);
}
void RsdCpuScriptImpl::reduceKernelSetup(uint32_t slot, MTLaunchStructReduce *mtls) {
@@ -743,6 +826,19 @@
rsAssert(mtls->kernel != nullptr);
}
+// Finalize an MTLaunchStruct for launching the general reduce kernel in the
+// given slot: record the owning script and slot, then copy the kernel's
+// functions and accumulator size out of the executable's description.
+void RsdCpuScriptImpl::reduceNewKernelSetup(uint32_t slot, MTLaunchStructReduceNew *mtls) {
+    mtls->script = this;
+    mtls->redp.slot = slot;
+
+    const ReduceNewDescription &kernel = *mScriptExec->getReduceNewDescription(slot);
+    mtls->accumFunc = kernel.accumFunc;
+    mtls->initFunc  = kernel.initFunc;   // might legally be nullptr
+    mtls->outFunc   = kernel.outFunc;    // might legally be nullptr
+    mtls->accumSize = kernel.accumSize;
+
+    // Unlike the initializer and outconverter, the accumulator is mandatory.
+    rsAssert(mtls->accumFunc != nullptr);
+}
+
int RsdCpuScriptImpl::invokeRoot() {
RsdCpuScriptImpl * oldTLS = mCtx->setTLS(this);
int ret = mRoot();
diff --git a/cpu_ref/rsCpuScript.h b/cpu_ref/rsCpuScript.h
index 248e5c7..2909dab 100644
--- a/cpu_ref/rsCpuScript.h
+++ b/cpu_ref/rsCpuScript.h
@@ -65,6 +65,11 @@
Allocation* aout,
const RsScriptCall* sc) override;
+ void invokeReduceNew(uint32_t slot,
+ const Allocation ** ains, uint32_t inLen,
+ Allocation* aout,
+ const RsScriptCall* sc) override;
+
void invokeInit() override;
void invokeFreeChildren() override;
@@ -89,12 +94,18 @@
virtual void forEachKernelSetup(uint32_t slot, MTLaunchStructForEach *mtls);
- // Build an MTLaunchStruct suitable for launching a reduce-style kernel.
+ // Build an MTLaunchStruct suitable for launching a simple reduce-style kernel.
bool reduceMtlsSetup(const Allocation *ain, const Allocation *aout,
const RsScriptCall *sc, MTLaunchStructReduce *mtls);
- // Finalize an MTLaunchStruct for launching a reduce-style kernel.
+ // Finalize an MTLaunchStruct for launching a simple reduce-style kernel.
virtual void reduceKernelSetup(uint32_t slot, MTLaunchStructReduce *mtls);
+ // Build an MTLaunchStruct suitable for launching a general reduce-style kernel.
+ bool reduceNewMtlsSetup(const Allocation ** ains, uint32_t inLen, const Allocation *aout,
+ const RsScriptCall *sc, MTLaunchStructReduceNew *mtls);
+ // Finalize an MTLaunchStruct for launching a general reduce-style kernel.
+ virtual void reduceNewKernelSetup(uint32_t slot, MTLaunchStructReduceNew *mtls);
+
const RsdCpuReference::CpuSymbol * lookupSymbolMath(const char *sym);
static void * lookupRuntimeStub(void* pContext, char const* name);
diff --git a/cpu_ref/rsd_cpu.h b/cpu_ref/rsd_cpu.h
index f2c7f19..49a999d 100644
--- a/cpu_ref/rsd_cpu.h
+++ b/cpu_ref/rsd_cpu.h
@@ -63,6 +63,11 @@
Allocation *aout,
const RsScriptCall *sc) = 0;
+ virtual void invokeReduceNew(uint32_t slot,
+ const Allocation ** ains, uint32_t inLen,
+ Allocation *aout,
+ const RsScriptCall *sc) = 0;
+
virtual void invokeInit() = 0;
virtual void invokeFreeChildren() = 0;