Cleanup of ForEachParams in cpu ref
Change-Id: I8cc51915b2a605c240d98e3010619b741a13bae2
diff --git a/cpu_ref/rsCpuCore.cpp b/cpu_ref/rsCpuCore.cpp
index 738eb84..40f4745 100644
--- a/cpu_ref/rsCpuCore.cpp
+++ b/cpu_ref/rsCpuCore.cpp
@@ -164,7 +164,7 @@
// fast path for very small launches
MTLaunchStruct *mtls = (MTLaunchStruct *)data;
- if (mtls && mtls->fep.dimY <= 1 && mtls->xEnd <= mtls->xStart + mtls->mSliceSize) {
+ if (mtls && mtls->fep.dim.y <= 1 && mtls->xEnd <= mtls->xStart + mtls->mSliceSize) {
if (mWorkers.mLaunchCallback) {
mWorkers.mLaunchCallback(mWorkers.mLaunchData, 0);
}
@@ -344,106 +344,100 @@
}
typedef void (*rs_t)(const void *, void *, const void *, uint32_t, uint32_t, uint32_t, uint32_t);
-typedef void (*walk_loop_t)(MTLaunchStruct*,
- RsExpandKernelParams&,
+typedef void (*walk_loop_t)(const MTLaunchStruct*,
+ RsExpandKernelDriverInfo,
outer_foreach_t);
+static void kparamSetup(RsExpandKernelParams *kparams, const RsExpandKernelDriverInfo *fep) {
+ //ALOGE("kp usr %p", fep->usr);
+ //ALOGE("kp slot %i", fep->slot);
+ //ALOGE("kp dim %i %i %i", fep->dim.x, fep->dim.y, fep->dim.z);
+ //ALOGE("kp lid %i", fep->lid);
+ //ALOGE("kp in[0] stride %i ptr %p", fep->inStride[0], fep->inPtr[0]);
+ //ALOGE("kp out[0] ptr %p", fep->outPtr[0]);
+ //ALOGE("kp loc %i %i %i", fep->current.x, fep->current.y, fep->current.z);
-static void walk_wrapper(void* usr, uint32_t idx, walk_loop_t walk_loop) {
- MTLaunchStruct *mtls = (MTLaunchStruct *)usr;
-
- uint32_t inLen = mtls->fep.inLen;
-
- RsExpandKernelParams kparams;
- kparams.takeFields(mtls->fep);
-
- // Used by CpuScriptGroup, IntrinsicBlur, and IntrinsicHistogram
- kparams.lid = idx;
-
- if (inLen > 0) {
- // Allocate space for our input base pointers.
- kparams.ins = (const void**)alloca(inLen * sizeof(void*));
-
- // Allocate space for our input stride information.
- kparams.inEStrides = (uint32_t*)alloca(inLen * sizeof(uint32_t));
-
- // Fill our stride information.
- for (int inIndex = inLen; --inIndex >= 0;) {
- kparams.inEStrides[inIndex] = mtls->fep.inStrides[inIndex].eStride;
- }
- }
-
- outer_foreach_t fn = (outer_foreach_t) mtls->kernel;
-
- walk_loop(mtls, kparams, fn);
+ kparams->usr = fep->usr;
+ kparams->slot = fep->slot;
+ kparams->dimX = fep->dim.x;
+ kparams->dimY = fep->dim.y;
+ kparams->dimZ = fep->dim.z;
+ kparams->lid = fep->lid;
+ kparams->inEStrides = (uint32_t *)&fep->inStride[0];
+ kparams->ins = (const void **)&fep->inPtr[0];
+ kparams->out = fep->outPtr[0];
+ kparams->y = fep->current.y;
+ kparams->z = fep->current.z;
}
+static inline void fepPtrSetup(const MTLaunchStruct *mtls, RsExpandKernelDriverInfo *fep,
+ uint32_t x, uint32_t y,
+ uint32_t z = 0, uint32_t lod = 0,
+ RsAllocationCubemapFace face = RS_ALLOCATION_CUBEMAP_FACE_POSITIVE_X,
+ uint32_t a1 = 0, uint32_t a2 = 0, uint32_t a3 = 0, uint32_t a4 = 0) {
+
+ for (uint32_t i = 0; i < fep->inLen; i++) {
+ fep->inPtr[i] = (const uint8_t *)mtls->ains[i]->getPointerUnchecked(x, y, z, lod, face, a1, a2, a3, a4);
+ }
+
+ if (mtls->aout[0] != nullptr) {
+ fep->outPtr[0] = (uint8_t *)mtls->aout[0]->getPointerUnchecked(x, y, z, lod, face, a1, a2, a3, a4);
+ }
+}
+
+
static void walk_2d(void *usr, uint32_t idx) {
- walk_wrapper(usr, idx, [](MTLaunchStruct *mtls,
- RsExpandKernelParams &kparams,
- outer_foreach_t fn) {
+ MTLaunchStruct *mtls = (MTLaunchStruct *)usr;
+ RsExpandKernelDriverInfo fep = mtls->fep;
+ fep.lid = idx;
- while (1) {
- uint32_t slice = (uint32_t)__sync_fetch_and_add(&mtls->mSliceNum, 1);
- uint32_t yStart = mtls->yStart + slice * mtls->mSliceSize;
- uint32_t yEnd = yStart + mtls->mSliceSize;
+ while (1) {
+ uint32_t slice = (uint32_t)__sync_fetch_and_add(&mtls->mSliceNum, 1);
+ uint32_t yStart = mtls->yStart + slice * mtls->mSliceSize;
+ uint32_t yEnd = yStart + mtls->mSliceSize;
- yEnd = rsMin(yEnd, mtls->yEnd);
+ yEnd = rsMin(yEnd, mtls->yEnd);
- if (yEnd <= yStart) {
- return;
- }
-
- for (kparams.y = yStart; kparams.y < yEnd; kparams.y++) {
- kparams.out = mtls->fep.outPtr +
- (mtls->fep.outStride.yStride * kparams.y) +
- (mtls->fep.outStride.eStride * mtls->xStart);
-
- for (int inIndex = mtls->fep.inLen; --inIndex >= 0;) {
- StridePair &strides = mtls->fep.inStrides[inIndex];
-
- kparams.ins[inIndex] =
- mtls->fep.inPtrs[inIndex] +
- (strides.yStride * kparams.y) +
- (strides.eStride * mtls->xStart);
- }
-
- fn(&kparams, mtls->xStart, mtls->xEnd,
- mtls->fep.outStride.eStride);
- }
+ if (yEnd <= yStart) {
+ return;
}
- });
+
+ for (fep.current.y = yStart; fep.current.y < yEnd; fep.current.y++) {
+ fepPtrSetup(mtls, &fep, mtls->xStart, fep.current.y);
+
+ RsExpandKernelParams kparams;
+ kparamSetup(&kparams, &fep);
+
+ outer_foreach_t fn = (outer_foreach_t) mtls->kernel;
+ fn(&kparams, mtls->xStart, mtls->xEnd, fep.outStride[0]);
+ }
+ }
}
static void walk_1d(void *usr, uint32_t idx) {
- walk_wrapper(usr, idx, [](MTLaunchStruct *mtls,
- RsExpandKernelParams &kparams,
- outer_foreach_t fn) {
+ MTLaunchStruct *mtls = (MTLaunchStruct *)usr;
+ RsExpandKernelDriverInfo fep = mtls->fep;
+ fep.lid = idx;
- while (1) {
- uint32_t slice = (uint32_t)__sync_fetch_and_add(&mtls->mSliceNum, 1);
- uint32_t xStart = mtls->xStart + slice * mtls->mSliceSize;
- uint32_t xEnd = xStart + mtls->mSliceSize;
+ while (1) {
+ uint32_t slice = (uint32_t)__sync_fetch_and_add(&mtls->mSliceNum, 1);
+ uint32_t xStart = mtls->xStart + slice * mtls->mSliceSize;
+ uint32_t xEnd = xStart + mtls->mSliceSize;
- xEnd = rsMin(xEnd, mtls->xEnd);
+ xEnd = rsMin(xEnd, mtls->xEnd);
- if (xEnd <= xStart) {
- return;
- }
-
- kparams.out = mtls->fep.outPtr +
- (mtls->fep.outStride.eStride * xStart);
-
- for (int inIndex = mtls->fep.inLen; --inIndex >= 0;) {
- StridePair &strides = mtls->fep.inStrides[inIndex];
-
- kparams.ins[inIndex] =
- mtls->fep.inPtrs[inIndex] + (strides.eStride * xStart);
- }
-
- fn(&kparams, xStart, xEnd, mtls->fep.outStride.eStride);
+ if (xEnd <= xStart) {
+ return;
}
- });
+
+ fepPtrSetup(mtls, &fep, xStart, 0);
+
+ RsExpandKernelParams kparams;
+ kparamSetup(&kparams, &fep);
+
+ outer_foreach_t fn = (outer_foreach_t) mtls->kernel;
+ fn(&kparams, xStart, xEnd, fep.outStride[0]);
+ }
}
@@ -459,17 +453,17 @@
const size_t targetByteChunk = 16 * 1024;
mInForEach = true;
- if (mtls->fep.dimY > 1) {
- uint32_t s1 = mtls->fep.dimY / ((mWorkers.mCount + 1) * 4);
+ if (mtls->fep.dim.y > 1) {
+ uint32_t s1 = mtls->fep.dim.y / ((mWorkers.mCount + 1) * 4);
uint32_t s2 = 0;
// This chooses our slice size to rate limit atomic ops to
// one per 16k bytes of reads/writes.
- if (mtls->fep.outStride.yStride) {
- s2 = targetByteChunk / mtls->fep.outStride.yStride;
+ if ((mtls->aout[0] != nullptr) && mtls->aout[0]->mHal.drvState.lod[0].stride) {
+ s2 = targetByteChunk / mtls->aout[0]->mHal.drvState.lod[0].stride;
} else {
// We know that there is either an output or an input.
- s2 = targetByteChunk / mtls->fep.inStrides[0].yStride;
+ s2 = targetByteChunk / mtls->ains[0]->mHal.drvState.lod[0].stride;
}
mtls->mSliceSize = rsMin(s1, s2);
@@ -479,16 +473,16 @@
launchThreads(walk_2d, mtls);
} else {
- uint32_t s1 = mtls->fep.dimX / ((mWorkers.mCount + 1) * 4);
+ uint32_t s1 = mtls->fep.dim.x / ((mWorkers.mCount + 1) * 4);
uint32_t s2 = 0;
// This chooses our slice size to rate limit atomic ops to
// one per 16k bytes of reads/writes.
- if (mtls->fep.outStride.eStride) {
- s2 = targetByteChunk / mtls->fep.outStride.eStride;
+ if ((mtls->aout[0] != nullptr) && mtls->aout[0]->getType()->getElementSizeBytes()) {
+ s2 = targetByteChunk / mtls->aout[0]->getType()->getElementSizeBytes();
} else {
// We know that there is either an output or an input.
- s2 = targetByteChunk / mtls->fep.inStrides[0].eStride;
+ s2 = targetByteChunk / mtls->ains[0]->getType()->getElementSizeBytes();
}
mtls->mSliceSize = rsMin(s1, s2);
@@ -501,53 +495,21 @@
mInForEach = false;
} else {
- RsExpandKernelParams kparams;
- kparams.takeFields(mtls->fep);
-
- if (inLen > 0) {
- // Allocate space for our input base pointers.
- kparams.ins = (const void**)alloca(inLen * sizeof(void*));
-
- // Allocate space for our input stride information.
- kparams.inEStrides = (uint32_t*)alloca(inLen * sizeof(uint32_t));
-
- // Fill our stride information.
- for (int inIndex = inLen; --inIndex >= 0;) {
- kparams.inEStrides[inIndex] =
- mtls->fep.inStrides[inIndex].eStride;
- }
- }
-
- //ALOGE("launch 3");
outer_foreach_t fn = (outer_foreach_t) mtls->kernel;
for (uint32_t arrayIndex = mtls->arrayStart;
arrayIndex < mtls->arrayEnd; arrayIndex++) {
- for (kparams.z = mtls->zStart; kparams.z < mtls->zEnd;
- kparams.z++) {
+ for (mtls->fep.current.z = mtls->zStart; mtls->fep.current.z < mtls->zEnd;
+ mtls->fep.current.z++) {
- for (kparams.y = mtls->yStart; kparams.y < mtls->yEnd;
- kparams.y++) {
+ for (mtls->fep.current.y = mtls->yStart; mtls->fep.current.y < mtls->yEnd;
+ mtls->fep.current.y++) {
- uint32_t offset =
- mtls->fep.dimY * mtls->fep.dimZ * arrayIndex +
- mtls->fep.dimY * kparams.z + kparams.y;
+ fepPtrSetup(mtls, &mtls->fep, mtls->xStart, mtls->fep.current.y, mtls->fep.current.z);
- kparams.out = mtls->fep.outPtr +
- (mtls->fep.outStride.yStride * offset) +
- (mtls->fep.outStride.eStride * mtls->xStart);
-
- for (int inIndex = inLen; --inIndex >= 0;) {
- StridePair &strides = mtls->fep.inStrides[inIndex];
-
- kparams.ins[inIndex] =
- mtls->fep.inPtrs[inIndex] +
- (strides.yStride * offset) +
- (strides.eStride * mtls->xStart);
- }
-
- fn(&kparams, mtls->xStart, mtls->xEnd,
- mtls->fep.outStride.eStride);
+ RsExpandKernelParams kparams;
+ kparamSetup(&kparams, &mtls->fep);
+ fn(&kparams, mtls->xStart, mtls->xEnd, mtls->fep.outStride[0]);
}
}
}
diff --git a/cpu_ref/rsCpuCore.h b/cpu_ref/rsCpuCore.h
index 8060826..a42cef7 100644
--- a/cpu_ref/rsCpuCore.h
+++ b/cpu_ref/rsCpuCore.h
@@ -25,7 +25,7 @@
#include <string>
-#define RS_KERNEL_INPUT_THRESHOLD 32
+#define RS_KERNEL_INPUT_LIMIT 8
namespace bcc {
class BCCContext;
@@ -41,39 +41,42 @@
uint32_t yStride;
};
+struct RsLaunchDimensions {
+ uint32_t x;
+ uint32_t y;
+ uint32_t z;
+ uint32_t lod;
+ uint32_t faces;
+ uint32_t array[4 /*make a define*/];
+};
+
struct RsExpandKernelDriverInfo {
- const uint8_t **inPtrs;
+ // Warning: This structure is shared with the compiler
+ // Any change to the fields here requires a matching compiler change
+
+ const uint8_t *inPtr[RS_KERNEL_INPUT_LIMIT];
+ uint32_t inStride[RS_KERNEL_INPUT_LIMIT];
uint32_t inLen;
- uint8_t *outPtr;
+ uint8_t *outPtr[RS_KERNEL_INPUT_LIMIT];
+ uint32_t outStride[RS_KERNEL_INPUT_LIMIT];
+ uint32_t outLen;
- StridePair *inStrides;
- StridePair outStride;
+ // Dimension of the launch
+ RsLaunchDimensions dim;
- uint32_t dimX;
- uint32_t dimY;
- uint32_t dimZ;
-
- uint32_t slot;
+ // The walking iterator of the launch
+ RsLaunchDimensions current;
const void *usr;
uint32_t usrLen;
- bool heapAllocatedArrays;
- RsExpandKernelDriverInfo() : heapAllocatedArrays(false) {}
- ~RsExpandKernelDriverInfo() {
- if (heapAllocatedArrays) {
- if (inPtrs != nullptr) {
- delete[] inPtrs;
- }
+ // Items below this line are not used by the compiler and can be changed in the driver
+ uint32_t lid;
+ uint32_t slot;
- if (inStrides != nullptr) {
- delete[] inStrides;
- }
- }
- }
};
struct RsExpandKernelParams {
@@ -99,16 +102,6 @@
* modify blur to not need it.
*/
uint32_t slot;
-
- /// Copy fields needed by a kernel from a driver struct.
- void takeFields(const RsExpandKernelDriverInfo &dstruct) {
- this->usr = dstruct.usr;
- this->slot = dstruct.slot;
-
- this->dimX = dstruct.dimX;
- this->dimY = dstruct.dimY;
- this->dimZ = dstruct.dimZ;
- }
};
extern bool gArchUseSIMD;
@@ -134,13 +127,17 @@
ForEachFunc_t kernel;
uint32_t sig;
- const Allocation ** ains;
- Allocation * aout;
+ const Allocation * ains[RS_KERNEL_INPUT_LIMIT];
+ Allocation * aout[RS_KERNEL_INPUT_LIMIT];
uint32_t mSliceSize;
volatile int mSliceNum;
bool isThreadable;
+ // origin of the launch
+ RsLaunchDimensions origin;
+
+ // TODO: convert to RsLaunchDimensions
uint32_t xStart;
uint32_t xEnd;
uint32_t yStart;
@@ -149,9 +146,6 @@
uint32_t zEnd;
uint32_t arrayStart;
uint32_t arrayEnd;
-
- const uint8_t *inPtrsBuff[RS_KERNEL_INPUT_THRESHOLD];
- StridePair inStridesBuff[RS_KERNEL_INPUT_THRESHOLD];
};
class RsdCpuReferenceImpl : public RsdCpuReference {
diff --git a/cpu_ref/rsCpuScript.cpp b/cpu_ref/rsCpuScript.cpp
index cfb6da2..f312866 100644
--- a/cpu_ref/rsCpuScript.cpp
+++ b/cpu_ref/rsCpuScript.cpp
@@ -1013,9 +1013,9 @@
const Allocation *ain0 = ains[0];
const Type *inType = ain0->getType();
- mtls->fep.dimX = inType->getDimX();
- mtls->fep.dimY = inType->getDimY();
- mtls->fep.dimZ = inType->getDimZ();
+ mtls->fep.dim.x = inType->getDimX();
+ mtls->fep.dim.y = inType->getDimY();
+ mtls->fep.dim.z = inType->getDimZ();
for (int Index = inLen; --Index >= 1;) {
if (!ain0->hasSameDims(ains[Index])) {
@@ -1029,9 +1029,9 @@
} else if (aout != nullptr) {
const Type *outType = aout->getType();
- mtls->fep.dimX = outType->getDimX();
- mtls->fep.dimY = outType->getDimY();
- mtls->fep.dimZ = outType->getDimZ();
+ mtls->fep.dim.x = outType->getDimX();
+ mtls->fep.dim.y = outType->getDimY();
+ mtls->fep.dim.z = outType->getDimZ();
} else {
mCtx->getContext()->setError(RS_ERROR_BAD_SCRIPT,
@@ -1049,35 +1049,35 @@
}
if (!sc || (sc->xEnd == 0)) {
- mtls->xEnd = mtls->fep.dimX;
+ mtls->xEnd = mtls->fep.dim.x;
} else {
- rsAssert(sc->xStart < mtls->fep.dimX);
- rsAssert(sc->xEnd <= mtls->fep.dimX);
+ rsAssert(sc->xStart < mtls->fep.dim.x);
+ rsAssert(sc->xEnd <= mtls->fep.dim.x);
rsAssert(sc->xStart < sc->xEnd);
- mtls->xStart = rsMin(mtls->fep.dimX, sc->xStart);
- mtls->xEnd = rsMin(mtls->fep.dimX, sc->xEnd);
+ mtls->xStart = rsMin(mtls->fep.dim.x, sc->xStart);
+ mtls->xEnd = rsMin(mtls->fep.dim.x, sc->xEnd);
if (mtls->xStart >= mtls->xEnd) return;
}
if (!sc || (sc->yEnd == 0)) {
- mtls->yEnd = mtls->fep.dimY;
+ mtls->yEnd = mtls->fep.dim.y;
} else {
- rsAssert(sc->yStart < mtls->fep.dimY);
- rsAssert(sc->yEnd <= mtls->fep.dimY);
+ rsAssert(sc->yStart < mtls->fep.dim.y);
+ rsAssert(sc->yEnd <= mtls->fep.dim.y);
rsAssert(sc->yStart < sc->yEnd);
- mtls->yStart = rsMin(mtls->fep.dimY, sc->yStart);
- mtls->yEnd = rsMin(mtls->fep.dimY, sc->yEnd);
+ mtls->yStart = rsMin(mtls->fep.dim.y, sc->yStart);
+ mtls->yEnd = rsMin(mtls->fep.dim.y, sc->yEnd);
if (mtls->yStart >= mtls->yEnd) return;
}
if (!sc || (sc->zEnd == 0)) {
- mtls->zEnd = mtls->fep.dimZ;
+ mtls->zEnd = mtls->fep.dim.z;
} else {
- rsAssert(sc->zStart < mtls->fep.dimZ);
- rsAssert(sc->zEnd <= mtls->fep.dimZ);
+ rsAssert(sc->zStart < mtls->fep.dim.z);
+ rsAssert(sc->zEnd <= mtls->fep.dim.z);
rsAssert(sc->zStart < sc->zEnd);
- mtls->zStart = rsMin(mtls->fep.dimZ, sc->zStart);
- mtls->zEnd = rsMin(mtls->fep.dimZ, sc->zEnd);
+ mtls->zStart = rsMin(mtls->fep.dim.z, sc->zStart);
+ mtls->zEnd = rsMin(mtls->fep.dim.z, sc->zEnd);
if (mtls->zStart >= mtls->zEnd) return;
}
@@ -1089,52 +1089,28 @@
rsAssert(inLen == 0 || (ains[0]->getType()->getDimZ() == 0));
mtls->rsc = mCtx;
- mtls->ains = ains;
- mtls->aout = aout;
+ if (ains) {
+ memcpy(mtls->ains, ains, inLen * sizeof(ains[0]));
+ }
+ mtls->aout[0] = aout;
mtls->fep.usr = usr;
mtls->fep.usrLen = usrLen;
mtls->mSliceSize = 1;
mtls->mSliceNum = 0;
- mtls->fep.inPtrs = nullptr;
- mtls->fep.inStrides = nullptr;
mtls->isThreadable = mIsThreadable;
if (inLen > 0) {
-
- if (inLen <= RS_KERNEL_INPUT_THRESHOLD) {
- mtls->fep.inPtrs = (const uint8_t**)mtls->inPtrsBuff;
- mtls->fep.inStrides = mtls->inStridesBuff;
- } else {
- mtls->fep.heapAllocatedArrays = true;
-
- mtls->fep.inPtrs = new const uint8_t*[inLen];
- mtls->fep.inStrides = new StridePair[inLen];
- }
-
mtls->fep.inLen = inLen;
-
for (int index = inLen; --index >= 0;) {
- const Allocation *ain = ains[index];
-
- mtls->fep.inPtrs[index] =
- (const uint8_t*)ain->mHal.drvState.lod[0].mallocPtr;
-
- mtls->fep.inStrides[index].eStride =
- ain->getType()->getElementSizeBytes();
- mtls->fep.inStrides[index].yStride =
- ain->mHal.drvState.lod[0].stride;
+ mtls->fep.inPtr[index] = (const uint8_t*)ains[index]->mHal.drvState.lod[0].mallocPtr;
+ mtls->fep.inStride[index] = ains[index]->getType()->getElementSizeBytes();
}
}
- mtls->fep.outPtr = nullptr;
- mtls->fep.outStride.eStride = 0;
- mtls->fep.outStride.yStride = 0;
if (aout != nullptr) {
- mtls->fep.outPtr = (uint8_t *)aout->mHal.drvState.lod[0].mallocPtr;
-
- mtls->fep.outStride.eStride = aout->getType()->getElementSizeBytes();
- mtls->fep.outStride.yStride = aout->mHal.drvState.lod[0].stride;
+ mtls->fep.outPtr[0] = (uint8_t *)aout->mHal.drvState.lod[0].mallocPtr;
+ mtls->fep.outStride[0] = aout->getType()->getElementSizeBytes();
}
}