Support for general reduction kernels.
Requires coordinated change in frameworks/base.
Requires coordinated change in frameworks/compile/libbcc in order
for RsTest to run.
At present, general reduction kernels are run single-threaded.
Also: Remove dead struct field MTLaunchStructForEach::sig.
Bug: 23535724
Change-Id: Ice17ccf20a902f8a106eaa62ec071d46e3c0ad8c
diff --git a/cpu_ref/rsCpuCore.cpp b/cpu_ref/rsCpuCore.cpp
index 48e8dbb..b8b4838 100644
--- a/cpu_ref/rsCpuCore.cpp
+++ b/cpu_ref/rsCpuCore.cpp
@@ -342,6 +342,18 @@
}
}
+// Point every input slot of the kernel driver info structure at cell
+// (x, y, z) of the corresponding input allocation. Only redp->inPtr[] is
+// written; the output pointers are left as they are.
+// Inputs:
+//   mtls    - The MTLaunchStruct whose ains[] holds the input allocations
+//   redp    - The reduce parameters (driver info structure); redp->inLen entries are set
+//   x, y, z - The coordinates of the cell each input pointer should address
+static inline void RedpPtrSetup(const MTLaunchStructReduceNew *mtls, RsExpandKernelDriverInfo *redp,
+                                uint32_t x, uint32_t y, uint32_t z) {
+    const uint32_t inputCount = redp->inLen;
+    for (uint32_t idx = 0; idx != inputCount; ++idx) {
+        const Allocation *input = mtls->ains[idx];
+        redp->inPtr[idx] = (const uint8_t *)input->getPointerUnchecked(x, y, z);
+    }
+}
+
static uint32_t sliceInt(uint32_t *p, uint32_t val, uint32_t start, uint32_t end) {
if (start >= end) {
*p = start;
@@ -355,16 +367,16 @@
return n;
}
-static bool SelectOuterSlice(const MTLaunchStructForEach *mtls, RsExpandKernelDriverInfo* fep, uint32_t sliceNum) {
+// Select the outer slice numbered sliceNum for a kernel launch.
+// sliceNum is decomposed, through successive sliceInt() calls, over the z,
+// lod, face and array[0..3] ranges (start/end) held in mtls; the coordinate
+// chosen for each dimension is written to the matching field of info->current.
+// Returns true when nothing of sliceNum is left over after the last dimension
+// (r == 0) -- presumably meaning that sliceNum named an existing slice.
+// Takes MTLaunchStructCommon so that launch structs derived from it, such as
+// MTLaunchStructReduceNew, can be passed in.
+static bool SelectOuterSlice(const MTLaunchStructCommon *mtls, RsExpandKernelDriverInfo* info, uint32_t sliceNum) {
uint32_t r = sliceNum;
- r = sliceInt(&fep->current.z, r, mtls->start.z, mtls->end.z);
- r = sliceInt(&fep->current.lod, r, mtls->start.lod, mtls->end.lod);
- r = sliceInt(&fep->current.face, r, mtls->start.face, mtls->end.face);
- r = sliceInt(&fep->current.array[0], r, mtls->start.array[0], mtls->end.array[0]);
- r = sliceInt(&fep->current.array[1], r, mtls->start.array[1], mtls->end.array[1]);
- r = sliceInt(&fep->current.array[2], r, mtls->start.array[2], mtls->end.array[2]);
- r = sliceInt(&fep->current.array[3], r, mtls->start.array[3], mtls->end.array[3]);
+ r = sliceInt(&info->current.z, r, mtls->start.z, mtls->end.z);
+ r = sliceInt(&info->current.lod, r, mtls->start.lod, mtls->end.lod);
+ r = sliceInt(&info->current.face, r, mtls->start.face, mtls->end.face);
+ r = sliceInt(&info->current.array[0], r, mtls->start.array[0], mtls->end.array[0]);
+ r = sliceInt(&info->current.array[1], r, mtls->start.array[1], mtls->end.array[1]);
+ r = sliceInt(&info->current.array[2], r, mtls->start.array[2], mtls->end.array[2]);
+ r = sliceInt(&info->current.array[3], r, mtls->start.array[3], mtls->end.array[3]);
return r == 0;
}
@@ -446,7 +458,7 @@
}
}
-// Launch a reduce-style kernel.
+// Launch a simple reduce-style kernel.
// Inputs:
// ain: The allocation that contains the input
// aout: The allocation that will hold the output
@@ -465,6 +477,50 @@
mtls->kernel(&mtls->inBuf[startOffset], mtls->outBuf, xEnd - xStart);
}
+// Launch a general reduce-style kernel.
+// Inputs:
+//   ains[0..inLen-1]: Array of allocations that contain the inputs
+//   aout:             The allocation that will hold the output
+//   mtls:             Holds launch parameters
+//
+// ains, inLen and aout are not read here: the inputs are reached through
+// mtls->ains and the output through mtls->redp.outPtr[0]. The whole reduction
+// runs on the calling thread, in three steps: initialize the accumulator,
+// accumulate every input row into it, then (if the kernel has an
+// outconverter) convert the accumulator into the output.
+void RsdCpuReferenceImpl::launchReduceNew(const Allocation ** ains,
+                                          uint32_t inLen,
+                                          Allocation * aout,
+                                          MTLaunchStructReduceNew *mtls) {
+    // In the presence of outconverter, we allocate temporary memory for
+    // the accumulator.
+    //
+    // In the absence of outconverter, we use the output allocation as the
+    // accumulator.
+    uint8_t *const accumPtr = (mtls->outFunc
+                               ? static_cast<uint8_t *>(malloc(mtls->accumSize))
+                               : mtls->redp.outPtr[0]);
+    if (accumPtr == nullptr) {
+        // Every step below writes through accumPtr, so a failed malloc()
+        // must stop the launch here instead of dereferencing null.
+        ALOGE("launchReduceNew: no accumulator storage (%zu bytes requested)",
+              mtls->accumSize);
+        return;
+    }
+
+    // initialize
+    if (mtls->initFunc) {
+        mtls->initFunc(accumPtr);
+    } else {
+        // Without a user initializer the accumulator starts out all-zero.
+        memset(accumPtr, 0, mtls->accumSize);
+    }
+
+    // accumulate: for each outer slice, feed one row (fixed y, x from
+    // start.x to end.x) at a time to the expanded accumulator function.
+    const ReduceNewAccumulatorFunc_t fn = mtls->accumFunc;
+    uint32_t slice = 0;
+    while (SelectOuterSlice(mtls, &mtls->redp, slice++)) {
+        for (mtls->redp.current.y = mtls->start.y;
+             mtls->redp.current.y < mtls->end.y;
+             mtls->redp.current.y++) {
+            RedpPtrSetup(mtls, &mtls->redp, mtls->start.x, mtls->redp.current.y, mtls->redp.current.z);
+            fn(&mtls->redp, mtls->start.x, mtls->end.x, accumPtr);
+        }
+    }
+
+    // outconvert
+    if (mtls->outFunc) {
+        mtls->outFunc(mtls->redp.outPtr[0], accumPtr);
+        // accumPtr was malloc()ed above only in the outconverter case.
+        free(accumPtr);
+    }
+}
+
void RsdCpuReferenceImpl::launchForEach(const Allocation ** ains,
uint32_t inLen,
Allocation* aout,
diff --git a/cpu_ref/rsCpuCore.h b/cpu_ref/rsCpuCore.h
index cfdb29a..939b7ae 100644
--- a/cpu_ref/rsCpuCore.h
+++ b/cpu_ref/rsCpuCore.h
@@ -33,11 +33,21 @@
// Function types found in RenderScript code
typedef void (*ReduceFunc_t)(const uint8_t *inBuf, uint8_t *outBuf, uint32_t len);
+// Function types for general reduction kernels.
+// Accumulator (expanded form): given the driver info and the x bounds x1 and x2,
+// folds input into the accumulator data item at accum.
+typedef void (*ReduceNewAccumulatorFunc_t)(const RsExpandKernelDriverInfo *info, uint32_t x1, uint32_t x2, uint8_t *accum);
+// Initializer: sets up the accumulator data item at accum before any accumulation.
+typedef void (*ReduceNewInitializerFunc_t)(uint8_t *accum);
+// Outconverter: produces the final result at out from the accumulator data item at accum.
+typedef void (*ReduceNewOutConverterFunc_t)(uint8_t *out, const uint8_t *accum);
typedef void (*ForEachFunc_t)(const RsExpandKernelDriverInfo *info, uint32_t x1, uint32_t x2, uint32_t outStride);
typedef void (*InvokeFunc_t)(void *params);
typedef void (*InitOrDtorFunc_t)(void);
typedef int (*RootFunc_t)(void);
+// Entry points and accumulator size of one exported general reduction kernel.
+// initFunc and outFunc are nullptr when the kernel has no initializer or no
+// outconverter, respectively.
+struct ReduceNewDescription {
+ ReduceNewAccumulatorFunc_t accumFunc; // expanded accumulator function
+ ReduceNewInitializerFunc_t initFunc; // user initializer function
+ ReduceNewOutConverterFunc_t outFunc; // user outconverter function
+ size_t accumSize; // accumulator datum size, in bytes
+};
+
// Internal driver callback used to execute a kernel
typedef void (*WorkerCallback_t)(void *usr, uint32_t idx);
@@ -72,7 +82,6 @@
RsExpandKernelDriverInfo fep;
ForEachFunc_t kernel;
- uint32_t sig;
const Allocation *ains[RS_KERNEL_INPUT_LIMIT];
Allocation *aout[RS_KERNEL_INPUT_LIMIT];
};
@@ -84,6 +93,19 @@
RsLaunchDimensions inputDim;
};
+// Launch parameters for a general reduction kernel.
+struct MTLaunchStructReduceNew : public MTLaunchStructCommon {
+ // Driver info structure
+ RsExpandKernelDriverInfo redp;
+
+ // Input allocations; the first redp.inLen entries are the ones in use
+ const Allocation *ains[RS_KERNEL_INPUT_LIMIT];
+
+ // Kernel entry points; initFunc and outFunc may be nullptr
+ ReduceNewAccumulatorFunc_t accumFunc;
+ ReduceNewInitializerFunc_t initFunc;
+ ReduceNewOutConverterFunc_t outFunc;
+
+ size_t accumSize; // accumulator datum size in bytes
+};
+
class RsdCpuReferenceImpl : public RsdCpuReference {
public:
~RsdCpuReferenceImpl() override;
@@ -107,10 +129,14 @@
void launchForEach(const Allocation **ains, uint32_t inLen, Allocation *aout,
const RsScriptCall *sc, MTLaunchStructForEach *mtls);
- // Launch a reduce kernel
+ // Launch a simple reduce kernel
void launchReduce(const Allocation *ain, Allocation *aout,
MTLaunchStructReduce *mtls);
+ // Launch a general reduce kernel
+ void launchReduceNew(const Allocation ** ains, uint32_t inLen, Allocation *aout,
+ MTLaunchStructReduceNew *mtls);
+
CpuScript * createScript(const ScriptC *s, char const *resName, char const *cacheDir,
uint8_t const *bitcode, size_t bitcodeSize, uint32_t flags) override;
CpuScript * createIntrinsic(const Script *s, RsScriptIntrinsicID iid, Element *e) override;
diff --git a/cpu_ref/rsCpuExecutable.cpp b/cpu_ref/rsCpuExecutable.cpp
index d654743..5dd31ee 100644
--- a/cpu_ref/rsCpuExecutable.cpp
+++ b/cpu_ref/rsCpuExecutable.cpp
@@ -319,6 +319,7 @@
ForEachFunc_t* forEachFunctions = nullptr;
uint32_t* forEachSignatures = nullptr;
ReduceFunc_t* reduceFunctions = nullptr;
+ ReduceNewDescription* reduceNewDescriptions = nullptr;
const char ** pragmaKeys = nullptr;
const char ** pragmaValues = nullptr;
uint32_t checksum = 0;
@@ -485,7 +486,7 @@
}
}
- // Read general reduce kernels (for now, we expect the count to be zero)
+ // Read general reduce kernels
if (strgets(line, MAXLINE, &rsInfo) == nullptr) {
goto error;
}
@@ -493,11 +494,91 @@
ALOGE("Invalid export reduce new count!: %s", line);
goto error;
}
- if (reduceNewCount != 0) {
- ALOGE("Expected export reduce new count to be zero!: %s", line);
+
+ reduceNewDescriptions = new ReduceNewDescription[reduceNewCount];
+ if (reduceNewDescriptions == nullptr) {
goto error;
}
+ // Parse one description line per exported general reduction kernel and
+ // resolve the functions it names in sharedObj.
+ for (size_t i = 0; i < reduceNewCount; ++i) {
+ // A name field that is just "." means that the function is absent.
+ static const char kNoName[] = ".";
+
+ unsigned int tmpSig = 0;
+ size_t tmpSize = 0;
+ char tmpNameReduce[MAXLINE];
+ char tmpNameInitializer[MAXLINE];
+ char tmpNameAccumulator[MAXLINE];
+ char tmpNameCombiner[MAXLINE];
+ char tmpNameOutConverter[MAXLINE];
+ char tmpNameHalter[MAXLINE];
+
+ if (strgets(line, MAXLINE, &rsInfo) == nullptr) {
+ goto error;
+ }
+ // The line holds eight fields separated by " - ": signature,
+ // accumulator size, and then the reduce, initializer, accumulator,
+ // combiner, outconverter and halter names, in that order.
+#define DELIMNAME " - %" MAKE_STR(MAXLINE) "s"
+ if (sscanf(line, "%u - %zu" DELIMNAME DELIMNAME DELIMNAME DELIMNAME DELIMNAME DELIMNAME,
+ &tmpSig, &tmpSize, tmpNameReduce, tmpNameInitializer, tmpNameAccumulator,
+ tmpNameCombiner, tmpNameOutConverter, tmpNameHalter) != 8) {
+ ALOGE("Invalid export reduce new!: %s", line);
+ goto error;
+ }
+#undef DELIMNAME
+
+ // For now, we expect
+ // - Reduce and Accumulator names
+ // - optional Initializer, Combiner, and OutConverter name
+ // - no Halter name
+ if (!strcmp(tmpNameReduce, kNoName) ||
+ !strcmp(tmpNameAccumulator, kNoName)) {
+ ALOGE("Expected reduce and accumulator names!: %s", line);
+ goto error;
+ }
+ if (strcmp(tmpNameHalter, kNoName)) {
+ ALOGE("Did not expect halter name!: %s", line);
+ goto error;
+ }
+
+ // The current implementation does not use the signature,
+ // reduce name, or combiner.
+
+ reduceNewDescriptions[i].accumSize = tmpSize;
+
+ // Process the (optional) initializer.
+ if (strcmp(tmpNameInitializer, kNoName)) {
+ // Lookup the original user-written initializer.
+ if (!(reduceNewDescriptions[i].initFunc =
+ (ReduceNewInitializerFunc_t) dlsym(sharedObj, tmpNameInitializer))) {
+ ALOGE("Failed to find initializer function address for %s(): %s",
+ tmpNameInitializer, dlerror());
+ goto error;
+ }
+ } else {
+ reduceNewDescriptions[i].initFunc = nullptr;
+ }
+
+ // Lookup the expanded accumulator.
+ // The symbol looked up is the accumulator name with ".expand" appended.
+ // strncat() is given the space left in the buffer less one for the
+ // terminator, so an overlong name is truncated rather than overrun.
+ strncat(tmpNameAccumulator, ".expand", MAXLINE-1-strlen(tmpNameAccumulator));
+ if (!(reduceNewDescriptions[i].accumFunc =
+ (ReduceNewAccumulatorFunc_t) dlsym(sharedObj, tmpNameAccumulator))) {
+ ALOGE("Failed to find accumulator function address for %s(): %s",
+ tmpNameAccumulator, dlerror());
+ goto error;
+ }
+
+ // Process the (optional) outconverter.
+ if (strcmp(tmpNameOutConverter, kNoName)) {
+ // Lookup the original user-written outconverter.
+ if (!(reduceNewDescriptions[i].outFunc =
+ (ReduceNewOutConverterFunc_t) dlsym(sharedObj, tmpNameOutConverter))) {
+ ALOGE("Failed to find outconverter function address for %s(): %s",
+ tmpNameOutConverter, dlerror());
+ goto error;
+ }
+ } else {
+ reduceNewDescriptions[i].outFunc = nullptr;
+ }
+ }
+
if (strgets(line, MAXLINE, &rsInfo) == nullptr) {
goto error;
}
@@ -631,6 +712,7 @@
invokeFunctions, funcCount,
forEachFunctions, forEachSignatures, forEachCount,
reduceFunctions, reduceCount,
+ reduceNewDescriptions, reduceNewCount,
pragmaKeys, pragmaValues, pragmaCount,
rsGlobalNames, rsGlobalAddresses, rsGlobalSizes, rsGlobalProperties,
numEntries, isThreadable, checksum);
diff --git a/cpu_ref/rsCpuExecutable.h b/cpu_ref/rsCpuExecutable.h
index fe9c2ad..6e4d325 100644
--- a/cpu_ref/rsCpuExecutable.h
+++ b/cpu_ref/rsCpuExecutable.h
@@ -69,6 +69,7 @@
ForEachFunc_t* forEachFunctions, uint32_t* forEachSignatures,
size_t forEachCount,
ReduceFunc_t* reduceFunctions, size_t reduceCount,
+ ReduceNewDescription *reduceNewDescriptions, size_t reduceNewCount,
const char** pragmaKeys, const char** pragmaValues,
size_t pragmaCount,
const char **globalNames, const void **globalAddresses,
@@ -81,6 +82,7 @@
mForEachFunctions(forEachFunctions), mForEachSignatures(forEachSignatures),
mForEachCount(forEachCount),
mReduceFunctions(reduceFunctions), mReduceCount(reduceCount),
+ mReduceNewDescriptions(reduceNewDescriptions), mReduceNewCount(reduceNewCount),
mPragmaKeys(pragmaKeys), mPragmaValues(pragmaValues),
mPragmaCount(pragmaCount), mGlobalNames(globalNames),
mGlobalAddresses(globalAddresses), mGlobalSizes(globalSizes),
@@ -109,6 +111,8 @@
delete[] mReduceFunctions;
+ delete[] mReduceNewDescriptions;
+
delete[] mForEachSignatures;
delete[] mForEachFunctions;
@@ -134,6 +138,7 @@
size_t getExportedFunctionCount() const { return mFuncCount; }
size_t getExportedForEachCount() const { return mForEachCount; }
size_t getExportedReduceCount() const { return mReduceCount; }
+ // Number of exported general reduction kernels.
+ size_t getExportedReduceNewCount() const { return mReduceNewCount; }
size_t getPragmaCount() const { return mPragmaCount; }
void* getFieldAddress(int slot) const { return mFieldAddress[slot]; }
@@ -148,6 +153,10 @@
ReduceFunc_t getReduceFunction(int slot) const { return mReduceFunctions[slot]; }
+    // Description of the general reduction kernel exported in the given slot.
+    // slot is used as an index without any range check.
+    const ReduceNewDescription* getReduceNewDescription(int slot) const {
+        return mReduceNewDescriptions + slot;
+    }
+
const char ** getPragmaKeys() const { return mPragmaKeys; }
const char ** getPragmaValues() const { return mPragmaValues; }
@@ -203,6 +212,9 @@
ReduceFunc_t* mReduceFunctions;
size_t mReduceCount;
+ ReduceNewDescription* mReduceNewDescriptions;
+ size_t mReduceNewCount;
+
const char ** mPragmaKeys;
const char ** mPragmaValues;
size_t mPragmaCount;
diff --git a/cpu_ref/rsCpuScript.cpp b/cpu_ref/rsCpuScript.cpp
index 5adca54..7308b54 100644
--- a/cpu_ref/rsCpuScript.cpp
+++ b/cpu_ref/rsCpuScript.cpp
@@ -497,6 +497,7 @@
// Copy info over to runtime
script->mHal.info.exportedFunctionCount = mScriptExec->getExportedFunctionCount();
script->mHal.info.exportedReduceCount = mScriptExec->getExportedReduceCount();
+ script->mHal.info.exportedReduceNewCount = mScriptExec->getExportedReduceNewCount();
script->mHal.info.exportedForEachCount = mScriptExec->getExportedForEachCount();
script->mHal.info.exportedVariableCount = mScriptExec->getExportedVariableCount();
script->mHal.info.exportedPragmaCount = mScriptExec->getPragmaCount();;
@@ -553,7 +554,7 @@
return true;
}
-// Preliminary work to prepare a reduce-style kernel for launch.
+// Preliminary work to prepare a simple reduce-style kernel for launch.
bool RsdCpuScriptImpl::reduceMtlsSetup(const Allocation *ain,
const Allocation *aout,
const RsScriptCall *sc,
@@ -591,6 +592,77 @@
return true;
}
+// Preliminary work to prepare a general reduce-style kernel for launch.
+//
+// Zeroes *mtls and then fills in its launch dimensions (taken from ains[0]
+// and passed, together with sc, through setUpMtlsDimensions()) and the
+// input/output pointers and strides of mtls->redp.
+//
+// Inputs:
+//   ains[0..inLen-1]: input allocations; all must have the same dimensions
+//   inLen:            number of inputs, 1 <= inLen <= RS_KERNEL_INPUT_LIMIT
+//   aout:             output allocation
+//   sc:               launch options, handed to setUpMtlsDimensions()
+//   mtls:             launch struct to fill in
+//
+// Returns true when the kernel may be launched. Returns false when
+// validation fails; for null allocations, too many inputs and mismatched
+// input dimensions, RS_ERROR_BAD_SCRIPT is recorded on the context first.
+bool RsdCpuScriptImpl::reduceNewMtlsSetup(const Allocation ** ains,
+                                          uint32_t inLen,
+                                          const Allocation * aout,
+                                          const RsScriptCall *sc,
+                                          MTLaunchStructReduceNew *mtls) {
+    rsAssert(ains && (inLen >= 1) && aout);
+    memset(mtls, 0, sizeof(MTLaunchStructReduceNew));
+    mtls->dimPtr = &mtls->redp.dim;
+
+    // mtls->ains[] has room for RS_KERNEL_INPUT_LIMIT entries only; reject a
+    // longer input list instead of overrunning it in the memcpy() below.
+    if (inLen > RS_KERNEL_INPUT_LIMIT) {
+        mCtx->getContext()->setError(RS_ERROR_BAD_SCRIPT,
+                                     "reduce called with too many in allocations");
+        return false;
+    }
+
+    for (int index = inLen; --index >= 0;) {
+        if (allocationLODIsNull(ains[index])) {
+            mCtx->getContext()->setError(RS_ERROR_BAD_SCRIPT,
+                                         "reduce called with null in allocations");
+            return false;
+        }
+    }
+
+    if (allocationLODIsNull(aout)) {
+        mCtx->getContext()->setError(RS_ERROR_BAD_SCRIPT,
+                                     "reduce called with null out allocation");
+        return false;
+    }
+
+    // The first input defines the launch dimensions.
+    const Allocation *ain0 = ains[0];
+    const Type *inType = ain0->getType();
+
+    mtls->redp.dim.x = inType->getDimX();
+    mtls->redp.dim.y = inType->getDimY();
+    mtls->redp.dim.z = inType->getDimZ();
+
+    // Every other input must have the same dimensions as the first.
+    for (int Index = inLen; --Index >= 1;) {
+        if (!ain0->hasSameDims(ains[Index])) {
+            // Adjacent literals are concatenated, hence the trailing space.
+            mCtx->getContext()->setError(RS_ERROR_BAD_SCRIPT,
+                                         "Failed to launch reduction kernel; "
+                                         "dimensions of input allocations do not match.");
+            return false;
+        }
+    }
+
+    if (!setUpMtlsDimensions(mtls, mtls->redp.dim, sc)) {
+        return false;
+    }
+
+    // The X & Y walkers always want 0-1 min even if dim is not present
+    mtls->end.x = rsMax((uint32_t)1, mtls->end.x);
+    mtls->end.y = rsMax((uint32_t)1, mtls->end.y);
+
+    mtls->rs = mCtx;
+
+    // Currently not threaded.
+    mtls->isThreadable = false;
+    mtls->mSliceNum = -1;
+
+    // Set up output.
+    mtls->redp.outLen = 1;
+    mtls->redp.outPtr[0] = (uint8_t *)aout->mHal.drvState.lod[0].mallocPtr;
+    mtls->redp.outStride[0] = aout->getType()->getElementSizeBytes();
+
+    // Set up input.
+    memcpy(mtls->ains, ains, inLen * sizeof(ains[0]));
+    mtls->redp.inLen = inLen;
+    for (int index = inLen; --index >= 0;) {
+        mtls->redp.inPtr[index] = (const uint8_t*)ains[index]->mHal.drvState.lod[0].mallocPtr;
+        mtls->redp.inStride[index] = ains[index]->getType()->getElementSizeBytes();
+    }
+
+    // All validation passed, ok to launch threads
+    return true;
+}
+
// Preliminary work to prepare a forEach-style kernel for launch.
bool RsdCpuScriptImpl::forEachMtlsSetup(const Allocation ** ains,
uint32_t inLen,
@@ -626,13 +698,11 @@
for (int Index = inLen; --Index >= 1;) {
if (!ain0->hasSameDims(ains[Index])) {
+ // The two literals below are concatenated by the compiler, so the
+ // first must end in a space to keep "input" and "allocations" apart.
mCtx->getContext()->setError(RS_ERROR_BAD_SCRIPT,
- "Failed to launch kernel; dimensions of input and output"
+ "Failed to launch kernel; dimensions of input "
"allocations do not match."
-
return false;
}
}
-
} else if (aout != nullptr) {
const Type *outType = aout->getType();
@@ -729,12 +799,25 @@
}
}
+// Run the general reduction kernel in the given slot over the inputs
+// ains[0..inLen-1], with aout as the output allocation. Nothing is launched
+// when reduceNewMtlsSetup() rejects the arguments.
+void RsdCpuScriptImpl::invokeReduceNew(uint32_t slot,
+                                       const Allocation ** ains, uint32_t inLen,
+                                       Allocation *aout,
+                                       const RsScriptCall *sc) {
+    MTLaunchStructReduceNew launch;
+
+    if (!reduceNewMtlsSetup(ains, inLen, aout, sc, &launch)) {
+        return;
+    }
+    reduceNewKernelSetup(slot, &launch);
+
+    // Install this script through setTLS() for the duration of the launch,
+    // then put back whatever was installed before.
+    RsdCpuScriptImpl *const previous = mCtx->setTLS(this);
+    mCtx->launchReduceNew(ains, inLen, aout, &launch);
+    mCtx->setTLS(previous);
+}
+
+// Finalize an MTLaunchStruct for launching a forEach-style kernel: record
+// the owning script and the slot, and look up the slot's kernel function.
void RsdCpuScriptImpl::forEachKernelSetup(uint32_t slot, MTLaunchStructForEach *mtls) {
mtls->script = this;
mtls->fep.slot = slot;
mtls->kernel = mScriptExec->getForEachFunction(slot);
rsAssert(mtls->kernel != nullptr);
- mtls->sig = mScriptExec->getForEachSignature(slot);
}
void RsdCpuScriptImpl::reduceKernelSetup(uint32_t slot, MTLaunchStructReduce *mtls) {
@@ -743,6 +826,19 @@
rsAssert(mtls->kernel != nullptr);
}
+// Finalize an MTLaunchStruct for launching a general reduce-style kernel:
+// record the owning script and the slot, and copy the slot's kernel
+// description (entry points and accumulator size) out of mScriptExec.
+void RsdCpuScriptImpl::reduceNewKernelSetup(uint32_t slot, MTLaunchStructReduceNew *mtls) {
+    mtls->script = this;
+    mtls->redp.slot = slot;
+
+    const ReduceNewDescription &description = *mScriptExec->getReduceNewDescription(slot);
+    mtls->accumSize = description.accumSize;
+    mtls->outFunc = description.outFunc;      // nullptr when there is no outconverter
+    mtls->initFunc = description.initFunc;    // nullptr when there is no initializer
+    mtls->accumFunc = description.accumFunc;
+
+    // The accumulator is the one function every kernel must have.
+    rsAssert(mtls->accumFunc != nullptr);
+}
+
int RsdCpuScriptImpl::invokeRoot() {
RsdCpuScriptImpl * oldTLS = mCtx->setTLS(this);
int ret = mRoot();
diff --git a/cpu_ref/rsCpuScript.h b/cpu_ref/rsCpuScript.h
index 248e5c7..2909dab 100644
--- a/cpu_ref/rsCpuScript.h
+++ b/cpu_ref/rsCpuScript.h
@@ -65,6 +65,11 @@
Allocation* aout,
const RsScriptCall* sc) override;
+ // Launch the general reduction kernel in the given slot over the inputs
+ // ains[0..inLen-1], with aout as the output allocation.
+ void invokeReduceNew(uint32_t slot,
+ const Allocation ** ains, uint32_t inLen,
+ Allocation* aout,
+ const RsScriptCall* sc) override;
+
void invokeInit() override;
void invokeFreeChildren() override;
@@ -89,12 +94,18 @@
virtual void forEachKernelSetup(uint32_t slot, MTLaunchStructForEach *mtls);
- // Build an MTLaunchStruct suitable for launching a reduce-style kernel.
+ // Build an MTLaunchStruct suitable for launching a simple reduce-style kernel.
bool reduceMtlsSetup(const Allocation *ain, const Allocation *aout,
const RsScriptCall *sc, MTLaunchStructReduce *mtls);
- // Finalize an MTLaunchStruct for launching a reduce-style kernel.
+ // Finalize an MTLaunchStruct for launching a simple reduce-style kernel.
virtual void reduceKernelSetup(uint32_t slot, MTLaunchStructReduce *mtls);
+ // Build an MTLaunchStruct suitable for launching a general reduce-style kernel.
+ bool reduceNewMtlsSetup(const Allocation ** ains, uint32_t inLen, const Allocation *aout,
+ const RsScriptCall *sc, MTLaunchStructReduceNew *mtls);
+ // Finalize an MTLaunchStruct for launching a general reduce-style kernel.
+ virtual void reduceNewKernelSetup(uint32_t slot, MTLaunchStructReduceNew *mtls);
+
const RsdCpuReference::CpuSymbol * lookupSymbolMath(const char *sym);
static void * lookupRuntimeStub(void* pContext, char const* name);
diff --git a/cpu_ref/rsd_cpu.h b/cpu_ref/rsd_cpu.h
index f2c7f19..49a999d 100644
--- a/cpu_ref/rsd_cpu.h
+++ b/cpu_ref/rsd_cpu.h
@@ -63,6 +63,11 @@
Allocation *aout,
const RsScriptCall *sc) = 0;
+ // Launch the general reduction kernel in the given slot.
+ // ains[0..inLen-1] are the input allocations and aout is the output
+ // allocation; sc carries the launch options.
+ virtual void invokeReduceNew(uint32_t slot,
+ const Allocation ** ains, uint32_t inLen,
+ Allocation *aout,
+ const RsScriptCall *sc) = 0;
+
virtual void invokeInit() = 0;
virtual void invokeFreeChildren() = 0;
diff --git a/driver/rsdBcc.cpp b/driver/rsdBcc.cpp
index 25659d8..af8d6ad 100644
--- a/driver/rsdBcc.cpp
+++ b/driver/rsdBcc.cpp
@@ -133,6 +133,15 @@
cs->invokeReduce(slot, ain, aout, sc);
}
+// Driver entry point for launching a general reduction kernel: forwards the
+// call to the CPU script object stored in s->mHal.drv. dc is not used.
+void rsdScriptInvokeReduceNew(const Context *dc, Script *s,
+                              uint32_t slot,
+                              const Allocation ** ains, size_t inLen,
+                              Allocation *aout,
+                              const RsScriptCall *sc) {
+    RsdCpuReference::CpuScript *const cpuScript =
+            (RsdCpuReference::CpuScript *)s->mHal.drv;
+    cpuScript->invokeReduceNew(slot, ains, inLen, aout, sc);
+}
+
void rsdScriptSetGlobalVar(const Context *dc, const Script *s,
uint32_t slot, void *data, size_t dataLength) {
RsdCpuReference::CpuScript *cs = (RsdCpuReference::CpuScript *)s->mHal.drv;
diff --git a/driver/rsdBcc.h b/driver/rsdBcc.h
index e95529b..a2bf8be 100644
--- a/driver/rsdBcc.h
+++ b/driver/rsdBcc.h
@@ -50,6 +50,14 @@
android::renderscript::Allocation *aout,
const RsScriptCall *sc);
+// Launch the general reduction kernel in the given slot of script s, with
+// ains[0..inLen-1] as the input allocations and aout as the output allocation.
+void rsdScriptInvokeReduceNew(const android::renderscript::Context *rsc,
+ android::renderscript::Script *s,
+ uint32_t slot,
+ const android::renderscript::Allocation ** ains,
+ size_t inLen,
+ android::renderscript::Allocation *aout,
+ const RsScriptCall *sc);
+
void rsdScriptInvokeForEachMulti(const android::renderscript::Context *rsc,
android::renderscript::Script *s,
uint32_t slot,
diff --git a/driver/rsdCore.cpp b/driver/rsdCore.cpp
index f70b79b..f0a7334 100644
--- a/driver/rsdCore.cpp
+++ b/driver/rsdCore.cpp
@@ -101,6 +101,8 @@
fnPtr[0] = (void *)rsdScriptUpdateCachedObject; break;
case RS_HAL_SCRIPT_INVOKE_REDUCE:
fnPtr[0] = (void *)rsdScriptInvokeReduce; break;
+ case RS_HAL_SCRIPT_INVOKE_REDUCE_NEW:
+ fnPtr[0] = (void *)rsdScriptInvokeReduceNew; break;
case RS_HAL_ALLOCATION_INIT:
fnPtr[0] = (void *)rsdAllocationInit; break;
diff --git a/java/tests/RsTest/src/com/android/rs/test/RSTestCore.java b/java/tests/RsTest/src/com/android/rs/test/RSTestCore.java
index efed0bf..7de6733 100644
--- a/java/tests/RsTest/src/com/android/rs/test/RSTestCore.java
+++ b/java/tests/RsTest/src/com/android/rs/test/RSTestCore.java
@@ -108,6 +108,7 @@
unitTests.add(new UT_mesh(this, mRes, mCtx));*/
unitTests.add(new UT_foreach_multi(this, mRes, mCtx));
unitTests.add(new UT_fp_mad(this, mRes, mCtx));
+ unitTests.add(new UT_reduce(this, mRes, mCtx));
/*
unitTests.add(new UnitTest(null, "<Pass>", 1));
diff --git a/java/tests/RsTest/src/com/android/rs/test/UT_reduce.java b/java/tests/RsTest/src/com/android/rs/test/UT_reduce.java
new file mode 100644
index 0000000..6cc70d1
--- /dev/null
+++ b/java/tests/RsTest/src/com/android/rs/test/UT_reduce.java
@@ -0,0 +1,336 @@
+/*
+ * Copyright (C) 2016 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.android.rs.test;
+
+import android.content.Context;
+import android.content.res.Resources;
+import android.renderscript.*;
+import android.util.Log;
+import java.lang.Float;
+import java.util.Random;
+
+// Unit test for general reduction kernels. Each test method runs one
+// reduce_* kernel of ScriptC_reduce and checks the result, either against
+// the same quantity computed in Java or by looking at the input cell that
+// the kernel reports.
+public class UT_reduce extends UnitTest {
+    private static final String TAG = "reduce";
+
+    protected UT_reduce(RSTestCore rstc, Resources res, Context ctx) {
+        super(rstc, "reduce", ctx);
+    }
+
+    // The createInputArray* helpers return len pseudo-random values drawn
+    // from a Random seeded with seed, so every run sees the same input.
+
+    private byte[] createInputArrayByte(int len, int seed) {
+        byte[] array = new byte[len];
+        (new Random(seed)).nextBytes(array);
+        return array;
+    }
+
+    private float[] createInputArrayFloat(int len, int seed) {
+        Random rand = new Random(seed);
+        float[] array = new float[len];
+        for (int i = 0; i < len; ++i)
+            array[i] = rand.nextFloat();
+        return array;
+    }
+
+    private int[] createInputArrayInt(int len, int seed) {
+        Random rand = new Random(seed);
+        int[] array = new int[len];
+        for (int i = 0; i < len; ++i)
+            array[i] = rand.nextInt();
+        return array;
+    }
+
+    // As above, but every element is in the range [0, eltRange).
+    private int[] createInputArrayInt(int len, int seed, int eltRange) {
+        Random rand = new Random(seed);
+        int[] array = new int[len];
+        for (int i = 0; i < len; ++i)
+            array[i] = rand.nextInt(eltRange);
+        return array;
+    }
+
+    // Logs the Java and RenderScript results of testName and returns
+    // whether they are equal.
+    private <T extends Number> boolean result(String testName, T javaRslt, T rsRslt) {
+        final boolean success = javaRslt.equals(rsRslt);
+        Log.i(TAG,
+                testName + ": java " + javaRslt + ", rs " + rsRslt + ": " +
+                (success ? "PASSED" : "FAILED"));
+        return success;
+    }
+
+    // Same as above for Int2 results, which are compared component by component.
+    private boolean result(String testName, Int2 javaRslt, Int2 rsRslt) {
+        final boolean success = (javaRslt.x == rsRslt.x) && (javaRslt.y == rsRslt.y);
+        Log.i(TAG,
+                testName +
+                ": java (" + javaRslt.x + ", " + javaRslt.y + ")" +
+                ", rs (" + rsRslt.x + ", " + rsRslt.y + ")" +
+                ": " + (success ? "PASSED" : "FAILED"));
+        return success;
+    }
+
+    ///////////////////////////////////////////////////////////////////
+
+    // Java reference: sum of all elements (with int wraparound).
+    private int addint(int[] input) {
+        int rslt = 0;
+        for (int idx = 0; idx < input.length; ++idx)
+            rslt += input[idx];
+        return rslt;
+    }
+
+    private boolean addint1D(RenderScript RS, ScriptC_reduce s) {
+        final int[] input = createInputArrayInt(100000, 0, 1 << 13);
+
+        final int javaRslt = addint(input);
+        final int rsRslt = s.reduce_addint(input).get();
+
+        return result("addint1D", javaRslt, rsRslt);
+    }
+
+    // Same reduction as addint1D, but over a 2D input allocation.
+    private boolean addint2D(RenderScript RS, ScriptC_reduce s) {
+        final int dimX = 450, dimY = 225;
+
+        final int[] inputArray = createInputArrayInt(dimX * dimY, 1, 1 << 13);
+        Type.Builder typeBuilder = new Type.Builder(RS, Element.I32(RS));
+        typeBuilder.setX(dimX).setY(dimY);
+        Allocation inputAllocation = Allocation.createTyped(RS, typeBuilder.create());
+        inputAllocation.copy2DRangeFrom(0, 0, dimX, dimY, inputArray);
+
+        final int javaRslt = addint(inputArray);
+        final int rsRslt = s.reduce_addint(inputAllocation).get();
+
+        return result("addint2D", javaRslt, rsRslt);
+    }
+
+    ///////////////////////////////////////////////////////////////////
+
+    // Java reference: dot product of two equally long arrays.
+    private float dp(float[] input1, float[] input2) {
+        _RS_ASSERT("dp input length mismatch", input1.length == input2.length);
+
+        float rslt = 0;
+        for (int idx = 0; idx < input1.length; ++idx)
+            rslt += input1[idx] * input2[idx];
+        return rslt;
+    }
+
+    private boolean dp(RenderScript RS, ScriptC_reduce s) {
+        final float[] input1 = createInputArrayFloat(100000, 2);
+        final float[] input2 = createInputArrayFloat(100000, 3);
+
+        final float javaRslt = dp(input1, input2);
+        final float rsRslt = s.reduce_dp(input1, input2).get();
+
+        // NOTE: Using a floating point equality check to test for
+        // correctness -- as we do below -- is a bad idea.  It's only
+        // reliable if the Java and RenderScript implementation of dp
+        // use the same algorithm.  Equality could be broken by
+        // different optimizations between the two, or running the
+        // RenderScript algorithm multithreaded, or running the
+        // RenderScript algorithm on a GPU rather than the CPU.
+        //
+        // Should we be checking instead that the results are
+        // "sufficiently close"?  Cooking the input set to try to
+        // ensure a deterministic result?  Changing to integers
+        // instead?
+        return result("dp", javaRslt, rsRslt);
+    }
+
+    ///////////////////////////////////////////////////////////////////
+
+    // Java reference: (index of the minimum, index of the maximum). On ties
+    // the first occurrence wins, because the comparisons are strict.
+    private Int2 findMinAndMax(float[] input) {
+        float minVal = Float.POSITIVE_INFINITY;
+        int minIdx = -1;
+        float maxVal = Float.NEGATIVE_INFINITY;
+        int maxIdx = -1;
+
+        for (int idx = 0; idx < input.length; ++idx) {
+            if (input[idx] < minVal) {
+                minVal = input[idx];
+                minIdx = idx;
+            }
+            if (input[idx] > maxVal) {
+                maxVal = input[idx];
+                maxIdx = idx;
+            }
+        }
+
+        return new Int2(minIdx, maxIdx);
+    }
+
+    private boolean findMinAndMax(RenderScript RS, ScriptC_reduce s) {
+        final float[] input = createInputArrayFloat(100000, 4);
+
+        final Int2 javaRslt = findMinAndMax(input);
+        final Int2 rsRslt = s.reduce_findMinAndMax(input).get();
+
+        return result("findMinAndMax", javaRslt, rsRslt);
+    }
+
+    ///////////////////////////////////////////////////////////////////
+
+    // fz reports the index of some zero in the input. Any zero will do, so
+    // the check looks at the reported cell instead of comparing indices.
+    private boolean fz(RenderScript RS, ScriptC_reduce s) {
+        final int inputLen = 100000;
+        int[] input = createInputArrayInt(inputLen, 5);
+        // just in case we got unlucky
+        input[(new Random(6)).nextInt(inputLen)] = 0;
+
+        final int rsRslt = s.reduce_fz(input).get();
+
+        final boolean success = (input[rsRslt] == 0);
+        Log.i(TAG,
+                "fz: input[" + rsRslt + "] == " + input[rsRslt] + ": " +
+                (success ? "PASSED" : "FAILED"));
+        return success;
+    }
+
+    ///////////////////////////////////////////////////////////////////
+
+    // 2D variant of fz: the kernel reports the (x, y) coordinates of a zero.
+    private boolean fz2(RenderScript RS, ScriptC_reduce s) {
+        final int dimX = 225, dimY = 450;
+        final int inputLen = dimX * dimY;
+
+        int[] inputArray = createInputArrayInt(inputLen, 7);
+        // just in case we got unlucky
+        inputArray[(new Random(8)).nextInt(inputLen)] = 0;
+
+        Type.Builder typeBuilder = new Type.Builder(RS, Element.I32(RS));
+        typeBuilder.setX(dimX).setY(dimY);
+        Allocation inputAllocation = Allocation.createTyped(RS, typeBuilder.create());
+        inputAllocation.copy2DRangeFrom(0, 0, dimX, dimY, inputArray);
+
+        final Int2 rsRslt = s.reduce_fz2(inputAllocation).get();
+
+        final int cellVal = inputArray[rsRslt.x + dimX * rsRslt.y];
+        final boolean success = (cellVal == 0);
+        Log.i(TAG,
+                "fz2: input[" + rsRslt.x + ", " + rsRslt.y + "] == " + cellVal + ": " +
+                (success ? "PASSED" : "FAILED"));
+        return success;
+    }
+
+    ///////////////////////////////////////////////////////////////////
+
+    // 3D variant of fz: the kernel reports the (x, y, z) coordinates of a zero.
+    private boolean fz3(RenderScript RS, ScriptC_reduce s) {
+        final int dimX = 59, dimY = 48, dimZ = 37;
+        final int inputLen = dimX * dimY * dimZ;
+
+        int[] inputArray = createInputArrayInt(inputLen, 9);
+        // just in case we got unlucky
+        inputArray[(new Random(10)).nextInt(inputLen)] = 0;
+
+        Type.Builder typeBuilder = new Type.Builder(RS, Element.I32(RS));
+        typeBuilder.setX(dimX).setY(dimY).setZ(dimZ);
+        Allocation inputAllocation = Allocation.createTyped(RS, typeBuilder.create());
+        inputAllocation.copy3DRangeFrom(0, 0, 0, dimX, dimY, dimZ, inputArray);
+
+        final Int3 rsRslt = s.reduce_fz3(inputAllocation).get();
+
+        final int cellVal = inputArray[rsRslt.x + dimX * rsRslt.y + dimX * dimY * rsRslt.z];
+        final boolean success = (cellVal == 0);
+        Log.i(TAG,
+                "fz3: input[" + rsRslt.x + ", " + rsRslt.y + ", " + rsRslt.z + "] == " + cellVal + ": " +
+                (success ? "PASSED" : "FAILED"));
+        return success;
+    }
+
+    ///////////////////////////////////////////////////////////////////
+
+    private static final int histogramBucketCount = 256;
+
+    // Reference histogram of inputArray, computed with
+    // ScriptIntrinsicHistogram. The U32 bucket counts are copied out into
+    // an int[] and then widened to long as unsigned values.
+    private long[] histogram(RenderScript RS, final byte[] inputArray) {
+        Allocation inputAllocation = Allocation.createSized(RS, Element.U8(RS), inputArray.length);
+        inputAllocation.copyFrom(inputArray);
+
+        Allocation outputAllocation = Allocation.createSized(RS, Element.U32(RS), histogramBucketCount);
+
+        ScriptIntrinsicHistogram scriptHsg = ScriptIntrinsicHistogram.create(RS, Element.U8(RS));
+        scriptHsg.setOutput(outputAllocation);
+        scriptHsg.forEach(inputAllocation);
+
+        int[] outputArrayMistyped = new int[histogramBucketCount];
+        outputAllocation.copyTo(outputArrayMistyped);
+
+        long[] outputArray = new long[histogramBucketCount];
+        for (int i = 0; i < histogramBucketCount; ++i) {
+            // The mask has to be a long literal: (long)0xffffffff is -1L,
+            // which would leave the sign-extended count unchanged.
+            outputArray[i] = outputArrayMistyped[i] & 0xffffffffL;
+        }
+        return outputArray;
+    }
+
+    private boolean histogram(RenderScript RS, ScriptC_reduce s) {
+        final byte[] inputArray = createInputArrayByte(100000, 11);
+
+        final long[] javaRslt = histogram(RS, inputArray);
+        _RS_ASSERT("javaRslt unexpected length: " + javaRslt.length, javaRslt.length == histogramBucketCount);
+        final long[] rsRslt = s.reduce_histogram(inputArray).get();
+        _RS_ASSERT("rsRslt unexpected length: " + rsRslt.length, rsRslt.length == histogramBucketCount);
+
+        // Report the first bucket that differs, if any.
+        for (int i = 0; i < histogramBucketCount; ++i) {
+            if (javaRslt[i] != rsRslt[i]) {
+                Log.i(TAG,
+                        "histogram[" + i + "]: java " + javaRslt[i] + ", rs " + rsRslt[i] + ": FAILED");
+                return false;
+            }
+        }
+
+        Log.i(TAG, "histogram: PASSED");
+        return true;
+    }
+
+    //-----------------------------------------------------------------
+
+    // Reference mode of inputArray: (bucket with the highest count, that
+    // count). Among equally full buckets the lowest-numbered one wins.
+    private Int2 mode(RenderScript RS, final byte[] inputArray) {
+        long[] hsg = histogram(RS, inputArray);
+
+        int modeIdx = 0;
+        for (int i = 1; i < hsg.length; ++i)
+            if (hsg[i] > hsg[modeIdx]) modeIdx = i;
+        return new Int2(modeIdx, (int)hsg[modeIdx]);
+    }
+
+    private boolean mode(RenderScript RS, ScriptC_reduce s) {
+        final byte[] inputArray = createInputArrayByte(100000, 12);
+
+        final Int2 javaRslt = mode(RS, inputArray);
+        final Int2 rsRslt = s.reduce_mode(inputArray).get();
+
+        return result("mode", javaRslt, rsRslt);
+    }
+
+    ///////////////////////////////////////////////////////////////////
+
+    // Runs every test, even after a failure, and passes only if all passed.
+    public void run() {
+        RenderScript pRS = RenderScript.create(mCtx);
+        ScriptC_reduce s = new ScriptC_reduce(pRS);
+        s.set_negInf(Float.NEGATIVE_INFINITY);
+        s.set_posInf(Float.POSITIVE_INFINITY);
+
+        boolean pass = true;
+        pass &= addint1D(pRS, s);
+        pass &= addint2D(pRS, s);
+        pass &= dp(pRS, s);
+        pass &= findMinAndMax(pRS, s);
+        pass &= fz(pRS, s);
+        pass &= fz2(pRS, s);
+        pass &= fz3(pRS, s);
+        pass &= histogram(pRS, s);
+        pass &= mode(pRS, s);
+
+        pRS.finish();
+        pRS.destroy();
+
+        Log.i(TAG, pass ? "PASSED" : "FAILED");
+        if (pass)
+            passTest();
+        else
+            failTest();
+    }
+}
diff --git a/java/tests/RsTest/src/com/android/rs/test/reduce.rs b/java/tests/RsTest/src/com/android/rs/test/reduce.rs
new file mode 100644
index 0000000..1983bc0
--- /dev/null
+++ b/java/tests/RsTest/src/com/android/rs/test/reduce.rs
@@ -0,0 +1,157 @@
+#include "shared.rsh"
+
+float negInf, posInf;  // assigned from Java in run() via set_negInf/set_posInf (Float.NEGATIVE_INFINITY / Float.POSITIVE_INFINITY); read by fMMInit
+
+/////////////////////////////////////////////////////////////////////////
+
+#pragma rs reduce(addint) \
+ accumulator(aiAccum)
+
+static void aiAccum(int *accum, int val) { *accum = *accum + val; }  // accumulator: adds one input value to the running sum
+
+/////////////////////////////////////////////////////////////////////////
+
+#pragma rs reduce(dp) \
+ accumulator(dpAccum) combiner(dpSum)
+
+static void dpAccum(float *accum, float in1, float in2) {  // accumulator: adds the product of one input pair to the running sum
+  *accum = *accum + in1 * in2;
+}
+
+// combiner: folds the partial sum *val into *accum
+static void dpSum(float *accum, const float *val) { *accum = *accum + *val; }
+
+/////////////////////////////////////////////////////////////////////////
+
+#pragma rs reduce(findMinAndMax) \
+ initializer(fMMInit) accumulator(fMMAccumulator) \
+ combiner(fMMCombiner) outconverter(fMMOutConverter)
+
+typedef struct {  // one input value together with the index it was seen at
+ float val;  // the value itself
+ int idx;  // fMMAccumulator stores its x argument here; fMMInit writes -1 before anything is recorded
+} IndexedVal;
+
+typedef struct {  // accumulator type of the findMinAndMax reduction
+ IndexedVal min, max;  // smallest / largest entry recorded so far
+} MinAndMax;
+
+static void fMMInit(MinAndMax *accum) {
+  // initializer: "nothing recorded" state. Index -1; min starts at posInf and max at negInf.
+  const IndexedVal noMin = { posInf, -1 };
+  const IndexedVal noMax = { negInf, -1 };
+  accum->min = noMin; accum->max = noMax;
+}
+
+static void fMMAccumulator(MinAndMax *accum, float in, int x) {
+  // accumulator: folds one input element into the running minimum and maximum.
+  // The element is recorded as the pair (in, x); x becomes its idx.
+  const IndexedVal candidate = { in, x };
+  // Strict comparisons: on equal values the entry recorded earlier is kept.
+  if (accum->min.val > candidate.val)
+    accum->min = candidate;
+  if (accum->max.val < candidate.val)
+    accum->max = candidate;
+}
+
+static void fMMCombiner(MinAndMax *accum,  // combiner: merges the partial result *val into *accum
+                        const MinAndMax *val) {
+  if (val->min.val < accum->min.val) accum->min = val->min;  // min against min only: a *val still in its fMMInit state (min = posInf) changes nothing
+  if (val->max.val > accum->max.val) accum->max = val->max;  // max against max only: a *val still in its fMMInit state (max = negInf) changes nothing
+}
+
+static void fMMOutConverter(int2 *result,  // outconverter: the result carries only the two indices, not the values
+                            const MinAndMax *val) {
+  result->y = val->max.idx;  // y <- index of the maximum
+  result->x = val->min.idx;  // x <- index of the minimum
+}
+
+/////////////////////////////////////////////////////////////////////////
+
+#pragma rs reduce(fz) \
+ initializer(fzInit) \
+ accumulator(fzAccum) combiner(fzCombine)
+
+static void fzInit(int *accumIdx) { *accumIdx = -1; }  // initializer: -1 marks "no index recorded"; fzCombine ignores negative values
+
+static void fzAccum(int *accumIdx,  // accumulator: records the x coordinate of a zero-valued input
+                    int inVal, int x /* special arg */) {
+  if (!inVal) *accumIdx = x;  // every zero overwrites the previously recorded index
+}
+
+static void fzCombine(int *accumIdx, const int *accumIdx2) {  // combiner: merges the partial result *accumIdx2 into *accumIdx
+ if (*accumIdx2 >= 0) *accumIdx = *accumIdx2;  // *accumIdx2 stays -1 (from fzInit) until fzAccum records an index; only then does it replace *accumIdx
+}
+
+/////////////////////////////////////////////////////////////////////////
+
+#pragma rs reduce(fz2) \
+ initializer(fz2Init) \
+ accumulator(fz2Accum) combiner(fz2Combine)
+
+static void fz2Init(int2 *accum) { accum->y = -1; accum->x = -1; }  // initializer: (-1, -1) marks "no coordinate recorded"
+
+static void fz2Accum(int2 *accum,
+                     int inVal,
+                     int x /* special arg */,
+                     int y /* special arg */) {
+  // accumulator: records the (x, y) coordinate of a zero-valued input; every zero overwrites the previous one.
+  if (inVal != 0) return;
+  accum->x = x;
+  accum->y = y;
+}
+
+static void fz2Combine(int2 *accum, const int2 *accum2) {  // combiner: merges the partial result *accum2 into *accum
+ if (accum2->x >= 0) *accum = *accum2;  // accum2->x stays -1 (from fz2Init) until fz2Accum records a coordinate; only then does *accum2 replace *accum
+}
+
+/////////////////////////////////////////////////////////////////////////
+
+#pragma rs reduce(fz3) \
+ initializer(fz3Init) \
+ accumulator(fz3Accum) combiner(fz3Combine)
+
+static void fz3Init(int3 *accum) { accum->z = -1; accum->y = -1; accum->x = -1; }  // initializer: (-1, -1, -1) marks "no coordinate recorded"
+
+static void fz3Accum(int3 *accum,
+                     int inVal,
+                     int x /* special arg */,
+                     int y /* special arg */,
+                     int z /* special arg */) {
+  // accumulator: records the (x, y, z) coordinate of a zero-valued input; every zero overwrites the previous one.
+  if (inVal != 0) return;
+  accum->x = x;
+  accum->y = y;
+  accum->z = z;
+}
+
+static void fz3Combine(int3 *accum, const int3 *accum2) {  // combiner: merges the partial result *accum2 into *accum
+  if (accum2->x >= 0) *accum = *accum2;  // test the incoming value, as fzCombine/fz2Combine do: testing accum->x dropped found coordinates and let an empty *accum2 erase a found one
+}
+
+/////////////////////////////////////////////////////////////////////////
+
+#pragma rs reduce(histogram) \
+ accumulator(hsgAccum) combiner(hsgCombine)
+
+#define BUCKETS 256  // hsgAccum indexes buckets by a uchar input; presumably one bucket per possible uchar value (verify)
+typedef uint32_t Histogram[BUCKETS];  // accumulator type shared by the histogram and mode reductions
+
+static void hsgAccum(Histogram *h, uchar in) { (*h)[in] += 1; }  // accumulator: counts one occurrence of the input value
+
+static void hsgCombine(Histogram *accum, const Histogram *addend) {  // combiner: adds the partial histogram *addend into *accum, bucket by bucket
+  for (int bucket = 0; bucket < BUCKETS; ++bucket)
+    (*accum)[bucket] = (*accum)[bucket] + (*addend)[bucket];
+}
+
+#pragma rs reduce(mode) \
+ accumulator(hsgAccum) combiner(hsgCombine) \
+ outconverter(modeOutConvert)
+
+static void modeOutConvert(int2 *result, const Histogram *h) {  // outconverter: x = index of the fullest bucket, y = its count
+  uint32_t best = 0;  // only a strictly larger count replaces it, so a tie keeps the lowest index
+  for (int bucket = 1; bucket < BUCKETS; ++bucket)
+    if ((*h)[best] < (*h)[bucket]) best = bucket;
+  result->y = (*h)[best];
+  result->x = best;
+}
diff --git a/rs.spec b/rs.spec
index 6e2dc4a..45f960c 100644
--- a/rs.spec
+++ b/rs.spec
@@ -407,6 +407,14 @@
param const RsScriptCall * sc
}
+ScriptReduceNew {
+ param RsScript s
+ param uint32_t slot
+ param RsAllocation * ains
+ param RsAllocation aout
+ param const RsScriptCall * sc
+}
+
ScriptSetVarI {
param RsScript s
param uint32_t slot
diff --git a/rsDriverLoader.cpp b/rsDriverLoader.cpp
index b39f4f3..426c519 100644
--- a/rsDriverLoader.cpp
+++ b/rsDriverLoader.cpp
@@ -71,6 +71,7 @@
ret &= fn(RS_HAL_SCRIPT_INVOKE_ROOT, (void **)&rsc->mHal.funcs.script.invokeRoot);
ret &= fn(RS_HAL_SCRIPT_INVOKE_FOR_EACH, (void **)&rsc->mHal.funcs.script.invokeForEach);
ret &= fn(RS_HAL_SCRIPT_INVOKE_REDUCE, (void **)&rsc->mHal.funcs.script.invokeReduce);
+ ret &= fn(RS_HAL_SCRIPT_INVOKE_REDUCE_NEW, (void **)&rsc->mHal.funcs.script.invokeReduceNew);
ret &= fn(RS_HAL_SCRIPT_INVOKE_INIT, (void **)&rsc->mHal.funcs.script.invokeInit);
ret &= fn(RS_HAL_SCRIPT_INVOKE_FREE_CHILDREN, (void **)&rsc->mHal.funcs.script.invokeFreeChildren);
ret &= fn(RS_HAL_SCRIPT_SET_GLOBAL_VAR, (void **)&rsc->mHal.funcs.script.setGlobalVar);
@@ -266,6 +267,3 @@
return true;
}
-
-
-
diff --git a/rsScript.cpp b/rsScript.cpp
index bc24292..bf28328 100644
--- a/rsScript.cpp
+++ b/rsScript.cpp
@@ -232,6 +232,18 @@
static_cast<Allocation *>(vaout), sc);
}
+void rsi_ScriptReduceNew(Context *rsc, RsScript vs, uint32_t slot,
+                         RsAllocation *vains, size_t inLen,
+                         RsAllocation vaout, const RsScriptCall *sc,
+                         size_t scLen) {
+    // Converts the handle arguments to their internal types and forwards the
+    // launch to Script::runReduceNew. scLen is not used in this function.
+    Script *script = static_cast<Script *>(vs);
+    Allocation **inputs = reinterpret_cast<Allocation **>(vains);
+    Allocation *output = static_cast<Allocation *>(vaout);
+    script->runReduceNew(rsc, slot, const_cast<const Allocation **>(inputs), inLen, output, sc);
+}
+
void rsi_ScriptInvoke(Context *rsc, RsScript vs, uint32_t slot) {
Script *s = static_cast<Script *>(vs);
s->Invoke(rsc, slot, nullptr, 0);
diff --git a/rsScript.h b/rsScript.h
index bd6622d..c3241ab 100644
--- a/rsScript.h
+++ b/rsScript.h
@@ -86,6 +86,7 @@
size_t exportedVariableCount;
size_t exportedForEachCount;
size_t exportedReduceCount;
+ size_t exportedReduceNewCount;
size_t exportedFunctionCount;
size_t exportedPragmaCount;
char const **exportedPragmaKeyList;
@@ -135,6 +136,10 @@
virtual void runReduce(Context *rsc, uint32_t slot, const Allocation *ain,
Allocation *aout, const RsScriptCall *sc) = 0;
+ virtual void runReduceNew(Context *rsc, uint32_t slot,  // pure virtual: each concrete subclass supplies its own override (ScriptC forwards to the driver)
+ const Allocation **ains, size_t inLen,
+ Allocation *aout, const RsScriptCall *sc) = 0;
+
virtual void Invoke(Context *rsc, uint32_t slot, const void *data, size_t len) = 0;
virtual void setupScript(Context *rsc) = 0;
virtual uint32_t run(Context *) = 0;
diff --git a/rsScriptC.cpp b/rsScriptC.cpp
index 8a13c89..f643093 100644
--- a/rsScriptC.cpp
+++ b/rsScriptC.cpp
@@ -245,7 +245,7 @@
ATRACE_CALL();
if (slot >= mHal.info.exportedReduceCount) {
- rsc->setError(RS_ERROR_BAD_SCRIPT, "The reduce kernel index is out of bounds");
+ rsc->setError(RS_ERROR_BAD_SCRIPT, "The simple reduce kernel index is out of bounds");
return;
}
if (mRSC->hadFatalError()) return;
@@ -259,6 +259,27 @@
rsc->mHal.funcs.script.invokeReduce(rsc, this, slot, ain, aout, sc);
}
+void ScriptC::runReduceNew(Context *rsc, uint32_t slot,  // validates slot, sets the script up, then calls the driver's invokeReduceNew
+                           const Allocation ** ains, size_t inLen,
+                           Allocation *aout, const RsScriptCall *sc) {
+    // TODO: Record the name of the kernel in the tracing information.
+    ATRACE_CALL();
+
+    const bool slotInRange = slot < mHal.info.exportedReduceNewCount;
+    if (!slotInRange) {
+        rsc->setError(RS_ERROR_BAD_SCRIPT, "The general reduce kernel index is out of bounds");
+        return;
+    }
+    if (mRSC->hadFatalError()) return;  // note: consults the member context mRSC, not the rsc argument
+
+    setupScript(rsc);
+    if (rsc->props.mLogScripts) {
+        ALOGV("%p ScriptC::runReduceNew invoking slot %i, ptr %p", rsc, slot, this);
+    }
+
+    rsc->mHal.funcs.script.invokeReduceNew(rsc, this, slot, ains, inLen, aout, sc);
+}
+
void ScriptC::Invoke(Context *rsc, uint32_t slot, const void *data, size_t len) {
ATRACE_CALL();
diff --git a/rsScriptC.h b/rsScriptC.h
index bc94010..c8881a4 100644
--- a/rsScriptC.h
+++ b/rsScriptC.h
@@ -34,21 +34,25 @@
ScriptC(Context *);
virtual ~ScriptC();
- virtual void Invoke(Context *rsc, uint32_t slot, const void *data, size_t len);
+ void Invoke(Context *rsc, uint32_t slot, const void *data, size_t len) override;
virtual uint32_t run(Context *);
- virtual void runForEach(Context *rsc,
- uint32_t slot,
- const Allocation ** ains,
- size_t inLen,
- Allocation * aout,
- const void * usr,
- size_t usrBytes,
- const RsScriptCall *sc = nullptr);
+ void runForEach(Context *rsc,
+ uint32_t slot,
+ const Allocation ** ains,
+ size_t inLen,
+ Allocation * aout,
+ const void * usr,
+ size_t usrBytes,
+ const RsScriptCall *sc = nullptr) override;
- virtual void runReduce(Context *rsc, uint32_t slot, const Allocation *ain,
- Allocation *aout, const RsScriptCall *sc);
+ void runReduce(Context *rsc, uint32_t slot, const Allocation *ain,
+ Allocation *aout, const RsScriptCall *sc) override;
+
+ void runReduceNew(Context *rsc, uint32_t slot,
+ const Allocation ** ains, size_t inLen,
+ Allocation *aout, const RsScriptCall *sc) override;
virtual void serialize(Context *rsc, OStream *stream) const { }
virtual RsA3DClassID getClassId() const { return RS_A3D_CLASS_ID_SCRIPT_C; }
diff --git a/rsScriptIntrinsic.cpp b/rsScriptIntrinsic.cpp
index 223e93b..6e0f6ae 100644
--- a/rsScriptIntrinsic.cpp
+++ b/rsScriptIntrinsic.cpp
@@ -72,6 +72,10 @@
Allocation *aout, const RsScriptCall *sc) {
}
+void ScriptIntrinsic::runReduceNew(Context *rsc, uint32_t slot,
+ const Allocation ** ains, size_t inLen,
+ Allocation *aout, const RsScriptCall *sc) {  // empty body: this override does nothing and reports no error
+}
void ScriptIntrinsic::Invoke(Context *rsc, uint32_t slot, const void *data, size_t len) {
}
diff --git a/rsScriptIntrinsic.h b/rsScriptIntrinsic.h
index fd48bdf..e2b04b8 100644
--- a/rsScriptIntrinsic.h
+++ b/rsScriptIntrinsic.h
@@ -52,6 +52,10 @@
void runReduce(Context *rsc, uint32_t slot, const Allocation *ain,
Allocation *aout, const RsScriptCall *sc) override;
+ void runReduceNew(Context *rsc, uint32_t slot,
+ const Allocation ** ains, size_t inLen,
+ Allocation *aout, const RsScriptCall *sc) override;
+
void Invoke(Context *rsc, uint32_t slot, const void *data, size_t len) override;
void setupScript(Context *rsc) override;
uint32_t run(Context *) override;
diff --git a/rs_hal.h b/rs_hal.h
index 390e90d..2f3aa1a 100644
--- a/rs_hal.h
+++ b/rs_hal.h
@@ -154,6 +154,11 @@
uint32_t slot, const Allocation *ain,
Allocation *aout,
const RsScriptCall *sc);
+ void (*invokeReduceNew)(const Context *rsc, Script *s,  // driver entry point; loaded under RS_HAL_SCRIPT_INVOKE_REDUCE_NEW and called by ScriptC::runReduceNew
+ uint32_t slot,
+ const Allocation ** ains, size_t inLen,
+ Allocation *aout,
+ const RsScriptCall *sc);
void (*invokeInit)(const Context *rsc, Script *s);
void (*invokeFreeChildren)(const Context *rsc, Script *s);
@@ -386,6 +391,7 @@
RS_HAL_SCRIPT_INVOKE_FOR_EACH_MULTI = 1013,
RS_HAL_SCRIPT_UPDATE_CACHED_OBJECT = 1014,
RS_HAL_SCRIPT_INVOKE_REDUCE = 1015,
+ RS_HAL_SCRIPT_INVOKE_REDUCE_NEW = 1016,
RS_HAL_ALLOCATION_INIT = 2000,
RS_HAL_ALLOCATION_INIT_ADAPTER = 2001,