am 902868b7: Merge "YUV(NV21) to RGBA function NEON optimizations."
# By Vassilis Laganakos
# Via David Butcher (1) and Gerrit Code Review (1)
* commit '902868b7c94f0c16b53e28ee1dd68c4e4a24f964':
YUV(NV21) to RGBA function NEON optimizations.
diff --git a/driver/rsdBcc.cpp b/driver/rsdBcc.cpp
index c78508c..e17c107 100644
--- a/driver/rsdBcc.cpp
+++ b/driver/rsdBcc.cpp
@@ -176,6 +176,7 @@
MTLaunchStruct *mtls = (MTLaunchStruct *)usr;
RsForEachStubParamStruct p;
memcpy(&p, &mtls->fep, sizeof(p));
+ p.lid = idx;
RsdHal * dc = (RsdHal *)mtls->rsc->mHal.drv;
uint32_t sig = mtls->sig;
@@ -222,6 +223,7 @@
MTLaunchStruct *mtls = (MTLaunchStruct *)usr;
RsForEachStubParamStruct p;
memcpy(&p, &mtls->fep, sizeof(p));
+ p.lid = idx;
RsdHal * dc = (RsdHal *)mtls->rsc->mHal.drv;
uint32_t sig = mtls->sig;
@@ -341,17 +343,40 @@
Context *mrsc = (Context *)rsc;
RsdHal * dc = (RsdHal *)mtls->rsc->mHal.drv;
- if ((dc->mWorkers.mCount > 1) && s->mHal.info.isThreadable && !dc->mInForEach) {
+ if ((dc->mWorkers.mCount >= 1) && s->mHal.info.isThreadable && !dc->mInForEach) {
+ const size_t targetByteChunk = 16 * 1024;
dc->mInForEach = true;
if (mtls->fep.dimY > 1) {
- mtls->mSliceSize = mtls->fep.dimY / (dc->mWorkers.mCount * 4);
+ uint32_t s1 = mtls->fep.dimY / ((dc->mWorkers.mCount + 1) * 4);
+ uint32_t s2 = 0;
+
+ // Choose the slice size so that atomic ops are rate-limited to
+ // roughly one per 16 KB of reads/writes.
+ if (mtls->fep.yStrideOut) {
+ s2 = targetByteChunk / mtls->fep.yStrideOut;
+ } else {
+ s2 = targetByteChunk / mtls->fep.yStrideIn;
+ }
+ mtls->mSliceSize = rsMin(s1, s2);
+
if(mtls->mSliceSize < 1) {
mtls->mSliceSize = 1;
}
rsdLaunchThreads(mrsc, wc_xy, mtls);
} else {
- mtls->mSliceSize = mtls->fep.dimX / (dc->mWorkers.mCount * 4);
+ uint32_t s1 = mtls->fep.dimX / ((dc->mWorkers.mCount + 1) * 4);
+ uint32_t s2 = 0;
+
+ // Choose the slice size so that atomic ops are rate-limited to
+ // roughly one per 16 KB of reads/writes.
+ if (mtls->fep.eStrideOut) {
+ s2 = targetByteChunk / mtls->fep.eStrideOut;
+ } else {
+ s2 = targetByteChunk / mtls->fep.eStrideIn;
+ }
+ mtls->mSliceSize = rsMin(s1, s2);
+
if(mtls->mSliceSize < 1) {
mtls->mSliceSize = 1;
}
@@ -364,6 +389,7 @@
} else {
RsForEachStubParamStruct p;
memcpy(&p, &mtls->fep, sizeof(p));
+ p.lid = 0;
uint32_t sig = mtls->sig;
//ALOGE("launch 3");
diff --git a/driver/rsdCore.cpp b/driver/rsdCore.cpp
index cdfc600..c8b8014 100644
--- a/driver/rsdCore.cpp
+++ b/driver/rsdCore.cpp
@@ -189,7 +189,8 @@
while (!dc->mExit) {
dc->mWorkers.mLaunchSignals[idx].wait();
if (dc->mWorkers.mLaunchCallback) {
- dc->mWorkers.mLaunchCallback(dc->mWorkers.mLaunchData, idx);
+ // idx + 1 is used because the calling thread is always worker 0.
+ dc->mWorkers.mLaunchCallback(dc->mWorkers.mLaunchData, idx+1);
}
android_atomic_dec(&dc->mWorkers.mRunningCount);
dc->mWorkers.mCompleteSignal.set();
@@ -208,6 +209,13 @@
for (uint32_t ct = 0; ct < dc->mWorkers.mCount; ct++) {
dc->mWorkers.mLaunchSignals[ct].set();
}
+
+ // We use the calling thread as one of the workers so we can start without
+ // the delay of the thread wakeup.
+ if (dc->mWorkers.mLaunchCallback) {
+ dc->mWorkers.mLaunchCallback(dc->mWorkers.mLaunchData, 0);
+ }
+
while (android_atomic_acquire_load(&dc->mWorkers.mRunningCount) != 0) {
dc->mWorkers.mCompleteSignal.wait();
}
@@ -250,11 +258,13 @@
cpu = rsc->props.mDebugMaxThreads;
}
if (cpu < 2) {
- cpu = 0;
+ dc->mWorkers.mCount = 0;
+ return true;
}
ALOGV("%p Launching thread(s), CPUs %i", rsc, cpu);
- dc->mWorkers.mCount = (uint32_t)cpu;
+ // Subtract one from the CPU count because the command thread also acts as a worker.
+ dc->mWorkers.mCount = (uint32_t)(cpu - 1);
dc->mWorkers.mThreadId = (pthread_t *) calloc(dc->mWorkers.mCount, sizeof(pthread_t));
dc->mWorkers.mNativeThreadId = (pid_t *) calloc(dc->mWorkers.mCount, sizeof(pid_t));
dc->mWorkers.mLaunchSignals = new Signal[dc->mWorkers.mCount];
diff --git a/driver/rsdIntrinsicBlend.cpp b/driver/rsdIntrinsicBlend.cpp
index 22ad108..c35c379 100644
--- a/driver/rsdIntrinsicBlend.cpp
+++ b/driver/rsdIntrinsicBlend.cpp
@@ -103,9 +103,6 @@
uint32_t x1 = xstart;
uint32_t x2 = xend;
- in += xstart;
- out += xstart;
-
switch (p->slot) {
case BLEND_CLEAR:
for (;x1 < x2; x1++, out++) {
diff --git a/driver/rsdIntrinsicBlur.cpp b/driver/rsdIntrinsicBlur.cpp
index 9c1fe68..5cd671e 100644
--- a/driver/rsdIntrinsicBlur.cpp
+++ b/driver/rsdIntrinsicBlur.cpp
@@ -29,6 +29,8 @@
short ip[104];
float radius;
int iradius;
+ void **scratch;
+ size_t *scratchSize;
ObjectBaseRef<Allocation> alloc;
};
@@ -139,6 +141,7 @@
out->xyzw = blurredPixel;
x1++;
out++;
+ gPtr++;
}
}
@@ -161,7 +164,8 @@
static void Blur_uchar4(const RsForEachStubParamStruct *p,
uint32_t xstart, uint32_t xend,
uint32_t instep, uint32_t outstep) {
- float buf[4 * 2048];
+ float stackbuf[4 * 2048];
+ float *buf = &stackbuf[0];
ConvolveParams *cp = (ConvolveParams *)p->usr;
if (!cp->alloc.get()) {
ALOGE("Blur executed without input, skipping");
@@ -174,16 +178,37 @@
uint32_t x1 = xstart;
uint32_t x2 = xend;
+ if (p->dimX > 2048) {
+ if ((p->dimX > cp->scratchSize[p->lid]) || !cp->scratch[p->lid]) {
+ cp->scratch[p->lid] = realloc(cp->scratch[p->lid], p->dimX * 16);
+ cp->scratchSize[p->lid] = p->dimX;
+ }
+ buf = (float *)cp->scratch[p->lid];
+ }
float4 *fout = (float4 *)buf;
+
int y = p->y;
+ uint32_t vx1 = x1;
+ uint32_t vx2 = x2;
+
+ if (vx1 > (uint32_t)cp->iradius) {
+ vx1 -= cp->iradius;
+ } else {
+ vx1 = 0;
+ }
+ vx2 += cp->iradius;
+ if (vx2 >= p->dimX) {
+ vx2 = p->dimX - 1;
+ }
+
if ((y > cp->iradius) && (y < ((int)p->dimY - cp->iradius))) {
const uchar *pi = pin + (y - cp->iradius) * din->lod[0].stride;
- OneVF(fout, pi, din->lod[0].stride, cp->fp, cp->iradius * 2 + 1, x1, x2);
+ OneVF(fout + vx1, pi, din->lod[0].stride, cp->fp, cp->iradius * 2 + 1, vx1, vx2);
} else {
- while(x2 > x1) {
- OneV(p, fout, x1, y, pin, din->lod[0].stride, cp->fp, cp->iradius);
+ while(vx2 > vx1) {
+ OneV(p, fout, vx1, y, pin, din->lod[0].stride, cp->fp, cp->iradius);
fout++;
- x1++;
+ vx1++;
}
}
@@ -208,17 +233,51 @@
}
-void * rsdIntrinsic_InitBlur(const android::renderscript::Context *dc,
+static void Destroy(const Context *rsc, const Script *script, void * intrinsicData) {
+ RsdHal * dc = (RsdHal *)rsc->mHal.drv;
+ ConvolveParams *cp = (ConvolveParams *)intrinsicData;
+
+ if (cp) {
+ if (cp->scratch) {
+ for (size_t i = 0; i < dc->mWorkers.mCount + 1; i++) {
+ if (cp->scratch[i]) {
+ free(cp->scratch[i]);
+ }
+ }
+ free(cp->scratch);
+ }
+ if (cp->scratchSize) {
+ free(cp->scratchSize);
+ }
+ free(cp);
+ }
+}
+
+void * rsdIntrinsic_InitBlur(const android::renderscript::Context *rsc,
android::renderscript::Script *script,
RsdIntriniscFuncs_t *funcs) {
+ RsdHal * dc = (RsdHal *)rsc->mHal.drv;
+
script->mHal.info.exportedVariableCount = 2;
funcs->setVarObj = Blur_Bind;
funcs->setVar = Blur_SetVar;
funcs->root = Blur_uchar4;
+ funcs->destroy = Destroy;
ConvolveParams *cp = (ConvolveParams *)calloc(1, sizeof(ConvolveParams));
+ if (!cp) {
+ return NULL;
+ }
+
cp->radius = 5;
+ cp->scratch = (void **)calloc(dc->mWorkers.mCount + 1, sizeof(void *));
+ cp->scratchSize = (size_t *)calloc(dc->mWorkers.mCount + 1, sizeof(size_t));
+ if (!cp->scratch || !cp->scratchSize) {
+ Destroy(rsc, script, cp);
+ return NULL;
+ }
+
ComputeGaussianWeights(cp);
return cp;
}
diff --git a/driver/rsdIntrinsicColorMatrix.cpp b/driver/rsdIntrinsicColorMatrix.cpp
index 8f6c70c..cfe0333 100644
--- a/driver/rsdIntrinsicColorMatrix.cpp
+++ b/driver/rsdIntrinsicColorMatrix.cpp
@@ -97,9 +97,6 @@
uint32_t x1 = xstart;
uint32_t x2 = xend;
- in += xstart;
- out += xstart;
-
if(x2 > x1) {
#if defined(ARCH_ARM_HAVE_NEON)
int32_t len = (x2 - x1) >> 2;
diff --git a/driver/rsdIntrinsicConvolve3x3.cpp b/driver/rsdIntrinsicConvolve3x3.cpp
index 55f4360..dac2f24 100644
--- a/driver/rsdIntrinsicConvolve3x3.cpp
+++ b/driver/rsdIntrinsicConvolve3x3.cpp
@@ -56,7 +56,7 @@
const float* coeff) {
uint32_t x1 = rsMax((int32_t)x-1, 0);
- uint32_t x2 = rsMin((int32_t)x+1, (int32_t)p->dimX);
+ uint32_t x2 = rsMin((int32_t)x+1, (int32_t)p->dimX-1);
float4 px = convert_float4(py0[x1]) * coeff[0] +
convert_float4(py0[x]) * coeff[1] +
diff --git a/driver/rsdIntrinsicConvolve5x5.cpp b/driver/rsdIntrinsicConvolve5x5.cpp
index fc6b029..ac06304 100644
--- a/driver/rsdIntrinsicConvolve5x5.cpp
+++ b/driver/rsdIntrinsicConvolve5x5.cpp
@@ -134,7 +134,7 @@
#if defined(ARCH_ARM_HAVE_NEON)
if((x1 + 3) < x2) {
uint32_t len = (x2 - x1 - 3) >> 1;
- rsdIntrinsicConvolve5x5_K(out, py0, py1, py2, py3, py4, cp->ip, len);
+ rsdIntrinsicConvolve5x5_K(out, py0+x1-2, py1+x1-2, py2+x1-2, py3+x1-2, py4+x1-2, cp->ip, len);
out += len << 1;
x1 += len << 1;
}
diff --git a/driver/rsdIntrinsicLUT.cpp b/driver/rsdIntrinsicLUT.cpp
index a75534e..818a132 100644
--- a/driver/rsdIntrinsicLUT.cpp
+++ b/driver/rsdIntrinsicLUT.cpp
@@ -44,9 +44,6 @@
uint32_t x1 = xstart;
uint32_t x2 = xend;
- in += xstart;
- out += xstart;
-
DrvAllocation *din = (DrvAllocation *)cp->lut->mHal.drv;
const uchar *tr = (const uchar *)din->lod[0].mallocPtr;
const uchar *tg = &tr[256];
diff --git a/rs_hal.h b/rs_hal.h
index b0e10c9..f172fbf 100644
--- a/rs_hal.h
+++ b/rs_hal.h
@@ -58,6 +58,7 @@
uint32_t lod;
RsAllocationCubemapFace face;
uint32_t ar[16];
+ uint32_t lid;
uint32_t dimX;
uint32_t dimY;