Cleanup pass + implement blur for single-channel uchar (U8)
Change-Id: Ib7f1c5218663b468a3c11daa2c3373ae132145ac
Conflicts:
cpu_ref/rsCpuIntrinsicBlend.cpp
diff --git a/cpu_ref/rsCpuCore.cpp b/cpu_ref/rsCpuCore.cpp
index 29539da..5ea28d4 100644
--- a/cpu_ref/rsCpuCore.cpp
+++ b/cpu_ref/rsCpuCore.cpp
@@ -423,13 +423,20 @@
return i;
}
-extern RsdCpuScriptImpl * rsdIntrinsic_Convolve3x3(RsdCpuReferenceImpl *ctx, const Script *s);
-extern RsdCpuScriptImpl * rsdIntrinsic_ColorMatrix(RsdCpuReferenceImpl *ctx, const Script *s);
-extern RsdCpuScriptImpl * rsdIntrinsic_LUT(RsdCpuReferenceImpl *ctx, const Script *s);
-extern RsdCpuScriptImpl * rsdIntrinsic_Convolve5x5(RsdCpuReferenceImpl *ctx, const Script *s);
-extern RsdCpuScriptImpl * rsdIntrinsic_Blur(RsdCpuReferenceImpl *ctx, const Script *s);
-extern RsdCpuScriptImpl * rsdIntrinsic_YuvToRGB(RsdCpuReferenceImpl *ctx, const Script *s);
-extern RsdCpuScriptImpl * rsdIntrinsic_Blend(RsdCpuReferenceImpl *ctx, const Script *s);
+extern RsdCpuScriptImpl * rsdIntrinsic_Convolve3x3(RsdCpuReferenceImpl *ctx,
+ const Script *s, const Element *e);
+extern RsdCpuScriptImpl * rsdIntrinsic_ColorMatrix(RsdCpuReferenceImpl *ctx,
+ const Script *s, const Element *e);
+extern RsdCpuScriptImpl * rsdIntrinsic_LUT(RsdCpuReferenceImpl *ctx,
+ const Script *s, const Element *e);
+extern RsdCpuScriptImpl * rsdIntrinsic_Convolve5x5(RsdCpuReferenceImpl *ctx,
+ const Script *s, const Element *e);
+extern RsdCpuScriptImpl * rsdIntrinsic_Blur(RsdCpuReferenceImpl *ctx,
+ const Script *s, const Element *e);
+extern RsdCpuScriptImpl * rsdIntrinsic_YuvToRGB(RsdCpuReferenceImpl *ctx,
+ const Script *s, const Element *e);
+extern RsdCpuScriptImpl * rsdIntrinsic_Blend(RsdCpuReferenceImpl *ctx,
+ const Script *s, const Element *e);
RsdCpuReference::CpuScript * RsdCpuReferenceImpl::createIntrinsic(const Script *s,
RsScriptIntrinsicID iid, Element *e) {
@@ -437,25 +444,25 @@
RsdCpuScriptImpl *i = NULL;
switch (iid) {
case RS_SCRIPT_INTRINSIC_ID_CONVOLVE_3x3:
- i = rsdIntrinsic_Convolve3x3(this, s);
+ i = rsdIntrinsic_Convolve3x3(this, s, e);
break;
case RS_SCRIPT_INTRINSIC_ID_COLOR_MATRIX:
- i = rsdIntrinsic_ColorMatrix(this, s);
+ i = rsdIntrinsic_ColorMatrix(this, s, e);
break;
case RS_SCRIPT_INTRINSIC_ID_LUT:
- i = rsdIntrinsic_LUT(this, s);
+ i = rsdIntrinsic_LUT(this, s, e);
break;
case RS_SCRIPT_INTRINSIC_ID_CONVOLVE_5x5:
- i = rsdIntrinsic_Convolve5x5(this, s);
+ i = rsdIntrinsic_Convolve5x5(this, s, e);
break;
case RS_SCRIPT_INTRINSIC_ID_BLUR:
- i = rsdIntrinsic_Blur(this, s);
+ i = rsdIntrinsic_Blur(this, s, e);
break;
case RS_SCRIPT_INTRINSIC_ID_YUV_TO_RGB:
- i = rsdIntrinsic_YuvToRGB(this, s);
+ i = rsdIntrinsic_YuvToRGB(this, s, e);
break;
case RS_SCRIPT_INTRINSIC_ID_BLEND:
- i = rsdIntrinsic_Blend(this, s);
+ i = rsdIntrinsic_Blend(this, s, e);
break;
default:
diff --git a/cpu_ref/rsCpuIntrinsic.cpp b/cpu_ref/rsCpuIntrinsic.cpp
index a4eef21..450ee30 100644
--- a/cpu_ref/rsCpuIntrinsic.cpp
+++ b/cpu_ref/rsCpuIntrinsic.cpp
@@ -21,10 +21,11 @@
using namespace android::renderscript;
RsdCpuScriptIntrinsic::RsdCpuScriptIntrinsic(RsdCpuReferenceImpl *ctx, const Script *s,
- RsScriptIntrinsicID iid)
+ const Element *e, RsScriptIntrinsicID iid)
: RsdCpuScriptImpl(ctx, s) {
mID = iid;
+ mElement.set(e);
}
RsdCpuScriptIntrinsic::~RsdCpuScriptIntrinsic() {
diff --git a/cpu_ref/rsCpuIntrinsic.h b/cpu_ref/rsCpuIntrinsic.h
index 1756115..35ffc69 100644
--- a/cpu_ref/rsCpuIntrinsic.h
+++ b/cpu_ref/rsCpuIntrinsic.h
@@ -47,11 +47,13 @@
virtual void setGlobalObj(uint32_t slot, ObjectBase *data);
virtual ~RsdCpuScriptIntrinsic();
- RsdCpuScriptIntrinsic(RsdCpuReferenceImpl *ctx, const Script *s, RsScriptIntrinsicID iid);
+ RsdCpuScriptIntrinsic(RsdCpuReferenceImpl *ctx, const Script *s, const Element *,
+ RsScriptIntrinsicID iid);
protected:
RsScriptIntrinsicID mID;
outer_foreach_t mRootPtr;
+ ObjectBaseRef<const Element> mElement;
};
diff --git a/cpu_ref/rsCpuIntrinsicBlend.cpp b/cpu_ref/rsCpuIntrinsicBlend.cpp
index 57286d5..d7b01b6 100644
--- a/cpu_ref/rsCpuIntrinsicBlend.cpp
+++ b/cpu_ref/rsCpuIntrinsicBlend.cpp
@@ -30,7 +30,7 @@
virtual void populateScript(Script *);
virtual ~RsdCpuScriptIntrinsicBlend();
- RsdCpuScriptIntrinsicBlend(RsdCpuReferenceImpl *ctx, const Script *s);
+ RsdCpuScriptIntrinsicBlend(RsdCpuReferenceImpl *ctx, const Script *s, const Element *e);
protected:
static void kernel(const RsForEachStubParamStruct *p,
@@ -456,8 +456,9 @@
}
-RsdCpuScriptIntrinsicBlend::RsdCpuScriptIntrinsicBlend(RsdCpuReferenceImpl *ctx, const Script *s)
- : RsdCpuScriptIntrinsic(ctx, s, RS_SCRIPT_INTRINSIC_ID_BLEND) {
+RsdCpuScriptIntrinsicBlend::RsdCpuScriptIntrinsicBlend(RsdCpuReferenceImpl *ctx,
+ const Script *s, const Element *e)
+ : RsdCpuScriptIntrinsic(ctx, s, e, RS_SCRIPT_INTRINSIC_ID_BLEND) {
mRootPtr = &kernel;
}
@@ -469,8 +470,9 @@
s->mHal.info.exportedVariableCount = 0;
}
-RsdCpuScriptImpl * rsdIntrinsic_Blend(RsdCpuReferenceImpl *ctx, const Script *s) {
- return new RsdCpuScriptIntrinsicBlend(ctx, s);
+RsdCpuScriptImpl * rsdIntrinsic_Blend(RsdCpuReferenceImpl *ctx,
+ const Script *s, const Element *e) {
+ return new RsdCpuScriptIntrinsicBlend(ctx, s, e);
}
diff --git a/cpu_ref/rsCpuIntrinsicBlur.cpp b/cpu_ref/rsCpuIntrinsicBlur.cpp
index 48363d1..1229f79 100644
--- a/cpu_ref/rsCpuIntrinsicBlur.cpp
+++ b/cpu_ref/rsCpuIntrinsicBlur.cpp
@@ -33,7 +33,7 @@
virtual void setGlobalObj(uint32_t slot, ObjectBase *data);
virtual ~RsdCpuScriptIntrinsicBlur();
- RsdCpuScriptIntrinsicBlur(RsdCpuReferenceImpl *ctx, const Script *s);
+ RsdCpuScriptIntrinsicBlur(RsdCpuReferenceImpl *ctx, const Script *s, const Element *e);
protected:
float fp[104];
@@ -42,9 +42,12 @@
int iradius;
ObjectBaseRef<Allocation> alloc;
- static void kernel(const RsForEachStubParamStruct *p,
- uint32_t xstart, uint32_t xend,
- uint32_t instep, uint32_t outstep);
+ static void kernelU4(const RsForEachStubParamStruct *p,
+ uint32_t xstart, uint32_t xend,
+ uint32_t instep, uint32_t outstep);
+ static void kernelU1(const RsForEachStubParamStruct *p,
+ uint32_t xstart, uint32_t xend,
+ uint32_t instep, uint32_t outstep);
void ComputeGaussianWeights();
};
@@ -104,8 +107,8 @@
-static void OneV(const RsForEachStubParamStruct *p, float4 *out, int32_t x, int32_t y,
- const uchar *ptrIn, int iStride, const float* gPtr, int iradius) {
+static void OneVU4(const RsForEachStubParamStruct *p, float4 *out, int32_t x, int32_t y,
+ const uchar *ptrIn, int iStride, const float* gPtr, int iradius) {
const uchar *pi = ptrIn + x*4;
@@ -122,19 +125,36 @@
out->xyzw = blurredPixel;
}
-extern "C" void rsdIntrinsicBlurVF_K(void *dst, const void *pin, int stride, const void *gptr, int rct, int x1, int x2);
-extern "C" void rsdIntrinsicBlurHF_K(void *dst, const void *pin, const void *gptr, int rct, int x1, int x2);
+static void OneVU1(const RsForEachStubParamStruct *p, float *out, int32_t x, int32_t y,
+ const uchar *ptrIn, int iStride, const float* gPtr, int iradius) { // vertical blur of one single-channel sample at (x, y), with edge clamping
-static void OneVF(float4 *out,
- const uchar *ptrIn, int iStride, const float* gPtr, int ct,
- int x1, int x2) {
+ const uchar *pi = ptrIn + x; // one byte per sample, so column x is byte offset x within a row
+
+ float blurredPixel = 0;
+ for (int r = -iradius; r <= iradius; r ++) { // 2*iradius+1 taps; gPtr is advanced once per tap
+ int validY = rsMax((y + r), 0); // rows above the image are clamped to row 0
+ validY = rsMin(validY, (int)(p->dimY - 1)); // rows below the image are clamped to the last row
+ float pf = (float)pi[validY * iStride];
+ blurredPixel += pf * gPtr[0];
+ gPtr++;
+ }
+
+ out[0] = blurredPixel; // stored as float; no conversion back to uchar happens here
+}
+
+// Assembly blur kernels operating on float4 intermediates. The two trailing
+// ints are used by every call site as a half-open [x1, x2) range (the caller
+// advances by x2 - x1 afterwards), which also matches the x1/x2 register notes
+// in rsCpuIntrinsics_neon.S; they are not a (start, count) pair.
+extern "C" void rsdIntrinsicBlurVFU4_K(void *dst, const void *pin, int stride, const void *gptr, int rct, int x1, int x2);
+extern "C" void rsdIntrinsicBlurHFU4_K(void *dst, const void *pin, const void *gptr, int rct, int x1, int x2);
+
+static void OneVFU4(float4 *out,
+ const uchar *ptrIn, int iStride, const float* gPtr, int ct,
+ int x1, int x2) {
#if defined(ARCH_ARM_HAVE_NEON)
{
int t = (x2 - x1);
t &= ~1;
if(t) {
- rsdIntrinsicBlurVF_K(out, ptrIn, iStride, gPtr, ct, x1, x1 + t);
+ rsdIntrinsicBlurVFU4_K(out, ptrIn, iStride, gPtr, ct, x1, x1 + t);
}
x1 += t;
}
@@ -157,8 +177,41 @@
}
}
-static void OneH(const RsForEachStubParamStruct *p, uchar4 *out, int32_t x,
- const float4 *ptrIn, const float* gPtr, int iradius) {
+// Vertical blur pass for single-channel data with no edge clamping: for each
+// of `len` consecutive columns starting at ptrIn, sums `ct` samples spaced
+// iStride bytes apart, weighted by gPtr[0..ct), and writes one float per
+// column to out[]. The caller must guarantee all `ct` rows are in bounds.
+static void OneVFU1(float *out,
+                    const uchar *ptrIn, int iStride, const float* gPtr, int ct, int len) {
+
+#if defined(ARCH_ARM_HAVE_NEON)
+    {
+        // Reuse the 4-channel kernel: each "pixel" it handles covers four
+        // adjacent single-channel columns. As in OneVFU4, it is only handed
+        // an even pixel count, hence the rounding down.
+        int t = len >> 2;
+        t &= ~1;
+        if(t) {
+            // The range end is the pixel count t, not the column count len:
+            // passing len would make the kernel walk 4*len columns and run
+            // past the end of both the source row and out[].
+            rsdIntrinsicBlurVFU4_K(out, ptrIn, iStride, gPtr, ct, 0, t);
+        }
+        len -= t << 2;
+        ptrIn += t << 2;
+        out += t << 2;
+    }
+#endif
+
+    // Scalar path: every column without NEON, or the leftover columns with it.
+    while(len > 0) {
+        const uchar *pi = ptrIn;
+        float blurredPixel = 0;
+        const float* gp = gPtr;
+
+        for (int r = 0; r < ct; r++) {
+            float pf = (float)pi[0];
+            blurredPixel += pf * gp[0];
+            pi += iStride;
+            gp++;
+        }
+        out[0] = blurredPixel;
+        len--;
+        // Step to the next source column along with the output; without this
+        // every remaining output repeats the first column's value.
+        ptrIn++;
+        out++;
+    }
+}
+
+static void OneHU4(const RsForEachStubParamStruct *p, uchar4 *out, int32_t x,
+ const float4 *ptrIn, const float* gPtr, int iradius) {
float4 blurredPixel = 0;
for (int r = -iradius; r <= iradius; r ++) {
@@ -172,10 +225,25 @@
out->xyzw = convert_uchar4(blurredPixel);
}
+static void OneHU1(const RsForEachStubParamStruct *p, uchar *out, int32_t x,
+ const float *ptrIn, const float* gPtr, int iradius) { // horizontal blur of one output sample at column x from a row of float intermediates
-void RsdCpuScriptIntrinsicBlur::kernel(const RsForEachStubParamStruct *p,
- uint32_t xstart, uint32_t xend,
- uint32_t instep, uint32_t outstep) {
+ float blurredPixel = 0;
+ for (int r = -iradius; r <= iradius; r ++) { // 2*iradius+1 taps; gPtr is advanced once per tap
+ int validX = rsMax((x + r), 0); // columns left of the row are clamped to column 0
+ validX = rsMin(validX, (int)(p->dimX - 1)); // columns right of the row are clamped to the last column
+ float pf = ptrIn[validX]; // ptrIn is indexed by absolute column
+ blurredPixel += pf * gPtr[0];
+ gPtr++;
+ }
+
+ out[0] = (uchar)blurredPixel; // plain cast: no rounding or clamping is applied here
+}
+
+
+void RsdCpuScriptIntrinsicBlur::kernelU4(const RsForEachStubParamStruct *p,
+ uint32_t xstart, uint32_t xend,
+ uint32_t instep, uint32_t outstep) {
float buf[4 * 2048];
RsdCpuScriptIntrinsicBlur *cp = (RsdCpuScriptIntrinsicBlur *)p->usr;
if (!cp->alloc.get()) {
@@ -193,10 +261,10 @@
int y = p->y;
if ((y > cp->iradius) && (y < ((int)p->dimY - cp->iradius))) {
const uchar *pi = pin + (y - cp->iradius) * stride;
- OneVF(fout, pi, stride, cp->fp, cp->iradius * 2 + 1, x1, x2);
+ OneVFU4(fout, pi, stride, cp->fp, cp->iradius * 2 + 1, x1, x2);
} else {
while(x2 > x1) {
- OneV(p, fout, x1, y, pin, stride, cp->fp, cp->iradius);
+ OneVU4(p, fout, x1, y, pin, stride, cp->fp, cp->iradius);
fout++;
x1++;
}
@@ -204,29 +272,90 @@
x1 = xstart;
while ((x1 < (uint32_t)cp->iradius) && (x1 < x2)) {
- OneH(p, out, x1, (float4 *)buf, cp->fp, cp->iradius);
+ OneHU4(p, out, x1, (float4 *)buf, cp->fp, cp->iradius);
out++;
x1++;
}
#if defined(ARCH_ARM_HAVE_NEON)
if ((x1 + cp->iradius) < x2) {
- rsdIntrinsicBlurHF_K(out, ((float4 *)buf) - cp->iradius, cp->fp, cp->iradius * 2 + 1, x1, x2 - cp->iradius);
+ rsdIntrinsicBlurHFU4_K(out, ((float4 *)buf) - cp->iradius, cp->fp,
+ cp->iradius * 2 + 1, x1, x2 - cp->iradius);
out += (x2 - cp->iradius) - x1;
x1 = x2 - cp->iradius;
}
#endif
while(x2 > x1) {
- OneH(p, out, x1, (float4 *)buf, cp->fp, cp->iradius);
+ OneHU4(p, out, x1, (float4 *)buf, cp->fp, cp->iradius);
out++;
x1++;
}
-
}
-RsdCpuScriptIntrinsicBlur::RsdCpuScriptIntrinsicBlur(RsdCpuReferenceImpl *ctx, const Script *s)
- : RsdCpuScriptIntrinsic(ctx, s, RS_SCRIPT_INTRINSIC_ID_BLUR) {
+// Per-row kernel for single-channel uchar blur. A vertical pass writes one
+// float per column into a stack buffer indexed by absolute column; a
+// horizontal pass then produces output columns [xstart, xend) of row p->y.
+// instep/outstep are not used.
+void RsdCpuScriptIntrinsicBlur::kernelU1(const RsForEachStubParamStruct *p,
+                                         uint32_t xstart, uint32_t xend,
+                                         uint32_t instep, uint32_t outstep) {
+    float buf[4 * 2048];
+    RsdCpuScriptIntrinsicBlur *cp = (RsdCpuScriptIntrinsicBlur *)p->usr;
+    if (!cp->alloc.get()) {
+        ALOGE("Blur executed without input, skipping");
+        return;
+    }
+    const uchar *pin = (const uchar *)cp->alloc->mHal.drvState.lod[0].mallocPtr;
+    const size_t stride = cp->alloc->mHal.drvState.lod[0].stride;
- mRootPtr = &kernel;
+    uchar *out = (uchar *)p->out;
+    uint32_t x1 = xstart;
+    uint32_t x2 = xend;
+    if (x1 >= x2) {
+        // Nothing to produce; also keeps the range arithmetic below well-formed.
+        return;
+    }
+
+    // The horizontal pass reads buf[] by absolute column and reaches up to
+    // iradius columns either side of [x1, x2), clamped to the row. The
+    // vertical pass therefore has to fill exactly that widened range, at the
+    // matching indices, rather than starting at buf[0].
+    const uint32_t rad = (uint32_t)cp->iradius;
+    uint32_t vx1 = (x1 > rad) ? (x1 - rad) : 0;
+    const uint32_t vx2 = ((x2 + rad) < p->dimX) ? (x2 + rad) : p->dimX;
+    if (vx2 > (sizeof(buf) / sizeof(buf[0]))) {
+        // The scratch row lives on the stack; refuse rather than overrun it.
+        ALOGE("Blur row too wide for scratch buffer, skipping");
+        return;
+    }
+
+    float *fout = ((float *)buf) + vx1;
+    int y = p->y;
+    if ((y > cp->iradius) && (y < ((int)p->dimY - cp->iradius))) {
+        // Whole vertical window is inside the image: unclamped fast path.
+        const uchar *pi = pin + (y - cp->iradius) * stride + vx1;
+        OneVFU1(fout, pi, stride, cp->fp, cp->iradius * 2 + 1, vx2 - vx1);
+    } else {
+        // Near the top/bottom edge: per-sample path that clamps the row index.
+        while(vx2 > vx1) {
+            OneVU1(p, fout, vx1, y, pin, stride, cp->fp, cp->iradius);
+            fout++;
+            vx1++;
+        }
+    }
+
+    while ((x1 < (uint32_t)cp->iradius) && (x1 < x2)) {
+        OneHU1(p, out, x1, buf, cp->fp, cp->iradius);
+        out++;
+        x1++;
+    }
+#if 0//defined(ARCH_ARM_HAVE_NEON)
+    // Disabled on purpose: this calls the float4 (U4) horizontal kernel on
+    // single-float data, and the 7-argument call does not match the
+    // 6-parameter declaration of rsdIntrinsicBlurHFU4_K. Do not enable as-is.
+    if ((x1 + cp->iradius) < x2) {
+        rsdIntrinsicBlurHFU4_K(out, ((float4 *)buf) - cp->iradius, cp->fp, cp->iradius * 2 + 1, x1, 0, x2 - cp->iradius);
+        out += (x2 - cp->iradius) - x1;
+        x1 = x2 - cp->iradius;
+    }
+#endif
+    while(x2 > x1) {
+        OneHU1(p, out, x1, buf, cp->fp, cp->iradius);
+        out++;
+        x1++;
+    }
+}
+
+// Picks the per-row kernel from the element: unsigned 8-bit with vector size
+// 1 uses kernelU1, vector size 4 uses kernelU4. Any other element leaves
+// mRootPtr NULL, which is reported and asserted. Then sets the default radius
+// and computes the Gaussian weights.
+RsdCpuScriptIntrinsicBlur::RsdCpuScriptIntrinsicBlur(RsdCpuReferenceImpl *ctx,
+                                                     const Script *s, const Element *e)
+    : RsdCpuScriptIntrinsic(ctx, s, e, RS_SCRIPT_INTRINSIC_ID_BLUR) {
+
+    mRootPtr = NULL;
+    // e is forwarded unchecked from createIntrinsic(); do not dereference NULL.
+    if ((e != NULL) && (e->getType() == RS_TYPE_UNSIGNED_8)) {
+        switch (e->getVectorSize()) {
+        case 1:
+            mRootPtr = &kernelU1;
+            break;
+        case 4:
+            mRootPtr = &kernelU4;
+            break;
+        default:
+            break;
+        }
+    }
+    if (mRootPtr == NULL) {
+        // Leave a trace in the log in case the assert below is compiled out.
+        ALOGE("Blur intrinsic created with unsupported element, expected U8 or U8_4");
+    }
+    rsAssert(mRootPtr);
radius = 5;
ComputeGaussianWeights();
}
@@ -243,9 +372,9 @@
}
-RsdCpuScriptImpl * rsdIntrinsic_Blur(RsdCpuReferenceImpl *ctx, const Script *s) {
+RsdCpuScriptImpl * rsdIntrinsic_Blur(RsdCpuReferenceImpl *ctx, const Script *s, const Element *e) {
- return new RsdCpuScriptIntrinsicBlur(ctx, s);
+ return new RsdCpuScriptIntrinsicBlur(ctx, s, e);
}
diff --git a/cpu_ref/rsCpuIntrinsicColorMatrix.cpp b/cpu_ref/rsCpuIntrinsicColorMatrix.cpp
index 8f3196d..3fc322c 100644
--- a/cpu_ref/rsCpuIntrinsicColorMatrix.cpp
+++ b/cpu_ref/rsCpuIntrinsicColorMatrix.cpp
@@ -32,7 +32,7 @@
virtual void setGlobalVar(uint32_t slot, const void *data, size_t dataLength);
virtual ~RsdCpuScriptIntrinsicColorMatrix();
- RsdCpuScriptIntrinsicColorMatrix(RsdCpuReferenceImpl *ctx, const Script *s);
+ RsdCpuScriptIntrinsicColorMatrix(RsdCpuReferenceImpl *ctx, const Script *s, const Element *e);
protected:
float fp[16];
@@ -191,8 +191,8 @@
RsdCpuScriptIntrinsicColorMatrix::RsdCpuScriptIntrinsicColorMatrix(
- RsdCpuReferenceImpl *ctx, const Script *s)
- : RsdCpuScriptIntrinsic(ctx, s, RS_SCRIPT_INTRINSIC_ID_COLOR_MATRIX) {
+ RsdCpuReferenceImpl *ctx, const Script *s, const Element *e)
+ : RsdCpuScriptIntrinsic(ctx, s, e, RS_SCRIPT_INTRINSIC_ID_COLOR_MATRIX) {
const static float defaultMatrix[] = {
1.f, 0.f, 0.f, 0.f,
@@ -210,9 +210,10 @@
s->mHal.info.exportedVariableCount = 1;
}
-RsdCpuScriptImpl * rsdIntrinsic_ColorMatrix(RsdCpuReferenceImpl *ctx, const Script *s) {
+RsdCpuScriptImpl * rsdIntrinsic_ColorMatrix(RsdCpuReferenceImpl *ctx,
+ const Script *s, const Element *e) {
- return new RsdCpuScriptIntrinsicColorMatrix(ctx, s);
+ return new RsdCpuScriptIntrinsicColorMatrix(ctx, s, e);
}
diff --git a/cpu_ref/rsCpuIntrinsicConvolve3x3.cpp b/cpu_ref/rsCpuIntrinsicConvolve3x3.cpp
index 1f49e1e..020fa6f 100644
--- a/cpu_ref/rsCpuIntrinsicConvolve3x3.cpp
+++ b/cpu_ref/rsCpuIntrinsicConvolve3x3.cpp
@@ -34,12 +34,13 @@
virtual void setGlobalObj(uint32_t slot, ObjectBase *data);
virtual ~RsdCpuScriptIntrinsicConvolve3x3();
- RsdCpuScriptIntrinsicConvolve3x3(RsdCpuReferenceImpl *ctx, const Script *s);
+ RsdCpuScriptIntrinsicConvolve3x3(RsdCpuReferenceImpl *ctx, const Script *s, const Element *);
protected:
- float fp[16];
- short ip[16];
- ObjectBaseRef<Allocation> alloc;
+ float mFp[16];
+ short mIp[16];
+ ObjectBaseRef<const Allocation> mAlloc;
+ ObjectBaseRef<const Element> mElement;
static void kernel(const RsForEachStubParamStruct *p,
uint32_t xstart, uint32_t xend,
@@ -52,15 +53,15 @@
void RsdCpuScriptIntrinsicConvolve3x3::setGlobalObj(uint32_t slot, ObjectBase *data) {
rsAssert(slot == 1);
- alloc.set(static_cast<Allocation *>(data));
+ mAlloc.set(static_cast<Allocation *>(data));
}
+// Slot 0 only: copies the caller's float coefficients into mFp and derives
+// the short table mIp (value * 255, rounded) from the first nine of them.
void RsdCpuScriptIntrinsicConvolve3x3::setGlobalVar(uint32_t slot, const void *data,
size_t dataLength) {
rsAssert(slot == 0);
- memcpy (&fp, data, dataLength);
+ // dataLength is caller-supplied: never copy more than the coefficient array holds.
+ memcpy (&mFp, data, (dataLength < sizeof(mFp)) ? dataLength : sizeof(mFp));
for(int ct=0; ct < 9; ct++) {
- ip[ct] = (short)(fp[ct] * 255.f + 0.5f);
+ mIp[ct] = (short)(mFp[ct] * 255.f + 0.5f);
}
}
@@ -95,12 +96,12 @@
uint32_t instep, uint32_t outstep) {
RsdCpuScriptIntrinsicConvolve3x3 *cp = (RsdCpuScriptIntrinsicConvolve3x3 *)p->usr;
- if (!cp->alloc.get()) {
+ if (!cp->mAlloc.get()) {
ALOGE("Convolve3x3 executed without input, skipping");
return;
}
- const uchar *pin = (const uchar *)cp->alloc->mHal.drvState.lod[0].mallocPtr;
- const size_t stride = cp->alloc->mHal.drvState.lod[0].stride;
+ const uchar *pin = (const uchar *)cp->mAlloc->mHal.drvState.lod[0].mallocPtr;
+ const size_t stride = cp->mAlloc->mHal.drvState.lod[0].stride;
uint32_t y1 = rsMin((int32_t)p->y + 1, (int32_t)(p->dimY-1));
uint32_t y2 = rsMax((int32_t)p->y - 1, 0);
@@ -112,7 +113,7 @@
uint32_t x1 = xstart;
uint32_t x2 = xend;
if(x1 == 0) {
- ConvolveOne(p, 0, out, py0, py1, py2, cp->fp);
+ ConvolveOne(p, 0, out, py0, py1, py2, cp->mFp);
x1 ++;
out++;
}
@@ -121,14 +122,14 @@
#if defined(ARCH_ARM_HAVE_NEON)
int32_t len = (x2 - x1 - 1) >> 1;
if(len > 0) {
- rsdIntrinsicConvolve3x3_K(out, &py0[x1-1], &py1[x1-1], &py2[x1-1], cp->ip, len);
+ rsdIntrinsicConvolve3x3_K(out, &py0[x1-1], &py1[x1-1], &py2[x1-1], cp->mIp, len);
x1 += len << 1;
out += len << 1;
}
#endif
while(x1 != x2) {
- ConvolveOne(p, x1, out, py0, py1, py2, cp->fp);
+ ConvolveOne(p, x1, out, py0, py1, py2, cp->mFp);
out++;
x1++;
}
@@ -136,13 +137,13 @@
}
RsdCpuScriptIntrinsicConvolve3x3::RsdCpuScriptIntrinsicConvolve3x3(
- RsdCpuReferenceImpl *ctx, const Script *s)
- : RsdCpuScriptIntrinsic(ctx, s, RS_SCRIPT_INTRINSIC_ID_CONVOLVE_3x3) {
+ RsdCpuReferenceImpl *ctx, const Script *s, const Element *e)
+ : RsdCpuScriptIntrinsic(ctx, s, e, RS_SCRIPT_INTRINSIC_ID_CONVOLVE_3x3) {
mRootPtr = &kernel;
for(int ct=0; ct < 9; ct++) {
- fp[ct] = 1.f / 9.f;
- ip[ct] = (short)(fp[ct] * 255.f + 0.5f);
+ mFp[ct] = 1.f / 9.f;
+ mIp[ct] = (short)(mFp[ct] * 255.f + 0.5f);
}
}
@@ -154,13 +155,13 @@
}
void RsdCpuScriptIntrinsicConvolve3x3::invokeFreeChildren() {
- alloc.clear();
+ mAlloc.clear();
}
-RsdCpuScriptImpl * rsdIntrinsic_Convolve3x3(RsdCpuReferenceImpl *ctx, const Script *s) {
+RsdCpuScriptImpl * rsdIntrinsic_Convolve3x3(RsdCpuReferenceImpl *ctx, const Script *s, const Element *e) {
- return new RsdCpuScriptIntrinsicConvolve3x3(ctx, s);
+ return new RsdCpuScriptIntrinsicConvolve3x3(ctx, s, e);
}
diff --git a/cpu_ref/rsCpuIntrinsicConvolve5x5.cpp b/cpu_ref/rsCpuIntrinsicConvolve5x5.cpp
index 2cae2c0..d36639f 100644
--- a/cpu_ref/rsCpuIntrinsicConvolve5x5.cpp
+++ b/cpu_ref/rsCpuIntrinsicConvolve5x5.cpp
@@ -34,7 +34,7 @@
virtual void setGlobalObj(uint32_t slot, ObjectBase *data);
virtual ~RsdCpuScriptIntrinsicConvolve5x5();
- RsdCpuScriptIntrinsicConvolve5x5(RsdCpuReferenceImpl *ctx, const Script *s);
+ RsdCpuScriptIntrinsicConvolve5x5(RsdCpuReferenceImpl *ctx, const Script *s, const Element *e);
protected:
float fp[28];
@@ -167,8 +167,8 @@
RsdCpuScriptIntrinsicConvolve5x5::RsdCpuScriptIntrinsicConvolve5x5(
- RsdCpuReferenceImpl *ctx, const Script *s)
- : RsdCpuScriptIntrinsic(ctx, s, RS_SCRIPT_INTRINSIC_ID_CONVOLVE_5x5) {
+ RsdCpuReferenceImpl *ctx, const Script *s, const Element *e)
+ : RsdCpuScriptIntrinsic(ctx, s, e, RS_SCRIPT_INTRINSIC_ID_CONVOLVE_5x5) {
mRootPtr = &kernel;
for(int ct=0; ct < 9; ct++) {
@@ -189,9 +189,10 @@
}
-RsdCpuScriptImpl * rsdIntrinsic_Convolve5x5(RsdCpuReferenceImpl *ctx, const Script *s) {
+RsdCpuScriptImpl * rsdIntrinsic_Convolve5x5(RsdCpuReferenceImpl *ctx,
+ const Script *s, const Element *e) {
- return new RsdCpuScriptIntrinsicConvolve5x5(ctx, s);
+ return new RsdCpuScriptIntrinsicConvolve5x5(ctx, s, e);
}
diff --git a/cpu_ref/rsCpuIntrinsicLUT.cpp b/cpu_ref/rsCpuIntrinsicLUT.cpp
index 188ed2b..0da1f75 100644
--- a/cpu_ref/rsCpuIntrinsicLUT.cpp
+++ b/cpu_ref/rsCpuIntrinsicLUT.cpp
@@ -33,7 +33,7 @@
virtual void setGlobalObj(uint32_t slot, ObjectBase *data);
virtual ~RsdCpuScriptIntrinsicLUT();
- RsdCpuScriptIntrinsicLUT(RsdCpuReferenceImpl *ctx, const Script *s);
+ RsdCpuScriptIntrinsicLUT(RsdCpuReferenceImpl *ctx, const Script *s, const Element *e);
protected:
ObjectBaseRef<Allocation> lut;
@@ -78,8 +78,9 @@
}
}
-RsdCpuScriptIntrinsicLUT::RsdCpuScriptIntrinsicLUT(RsdCpuReferenceImpl *ctx, const Script *s)
- : RsdCpuScriptIntrinsic(ctx, s, RS_SCRIPT_INTRINSIC_ID_LUT) {
+RsdCpuScriptIntrinsicLUT::RsdCpuScriptIntrinsicLUT(RsdCpuReferenceImpl *ctx,
+ const Script *s, const Element *e)
+ : RsdCpuScriptIntrinsic(ctx, s, e, RS_SCRIPT_INTRINSIC_ID_LUT) {
mRootPtr = &kernel;
}
@@ -96,9 +97,10 @@
}
-RsdCpuScriptImpl * rsdIntrinsic_LUT(RsdCpuReferenceImpl *ctx, const Script *s) {
+RsdCpuScriptImpl * rsdIntrinsic_LUT(RsdCpuReferenceImpl *ctx,
+ const Script *s, const Element *e) {
- return new RsdCpuScriptIntrinsicLUT(ctx, s);
+ return new RsdCpuScriptIntrinsicLUT(ctx, s, e);
}
diff --git a/cpu_ref/rsCpuIntrinsicYuvToRGB.cpp b/cpu_ref/rsCpuIntrinsicYuvToRGB.cpp
index 7b8f768..946d1ba 100644
--- a/cpu_ref/rsCpuIntrinsicYuvToRGB.cpp
+++ b/cpu_ref/rsCpuIntrinsicYuvToRGB.cpp
@@ -33,7 +33,7 @@
virtual void setGlobalObj(uint32_t slot, ObjectBase *data);
virtual ~RsdCpuScriptIntrinsicYuvToRGB();
- RsdCpuScriptIntrinsicYuvToRGB(RsdCpuReferenceImpl *ctx, const Script *s);
+ RsdCpuScriptIntrinsicYuvToRGB(RsdCpuReferenceImpl *ctx, const Script *s, const Element *e);
protected:
ObjectBaseRef<Allocation> alloc;
@@ -144,8 +144,8 @@
}
RsdCpuScriptIntrinsicYuvToRGB::RsdCpuScriptIntrinsicYuvToRGB(
- RsdCpuReferenceImpl *ctx, const Script *s)
- : RsdCpuScriptIntrinsic(ctx, s, RS_SCRIPT_INTRINSIC_ID_YUV_TO_RGB) {
+ RsdCpuReferenceImpl *ctx, const Script *s, const Element *e)
+ : RsdCpuScriptIntrinsic(ctx, s, e, RS_SCRIPT_INTRINSIC_ID_YUV_TO_RGB) {
mRootPtr = &kernel;
}
@@ -162,8 +162,9 @@
}
-RsdCpuScriptImpl * rsdIntrinsic_YuvToRGB(RsdCpuReferenceImpl *ctx, const Script *s) {
- return new RsdCpuScriptIntrinsicYuvToRGB(ctx, s);
+RsdCpuScriptImpl * rsdIntrinsic_YuvToRGB(RsdCpuReferenceImpl *ctx,
+ const Script *s, const Element *e) {
+ return new RsdCpuScriptIntrinsicYuvToRGB(ctx, s, e);
}
diff --git a/cpu_ref/rsCpuIntrinsics_neon.S b/cpu_ref/rsCpuIntrinsics_neon.S
index 04dd8b1..53c116d 100644
--- a/cpu_ref/rsCpuIntrinsics_neon.S
+++ b/cpu_ref/rsCpuIntrinsics_neon.S
@@ -284,7 +284,7 @@
r5 = sp+4, x1
r6 = sp+8, x2
*/
-ENTRY(rsdIntrinsicBlurVF_K)
+ENTRY(rsdIntrinsicBlurVFU4_K)
push {r4-r8, r10, r11, lr}
vpush {q4-q7}
@@ -324,7 +324,7 @@
vpop {q4-q7}
pop {r4-r8, r10, r11, lr}
bx lr
-END(rsdIntrinsicBlurVF_K)
+END(rsdIntrinsicBlurVFU4_K)
/*
static void OneVF(float4 *out, const uchar *ptrIn, int iStride,
@@ -337,7 +337,7 @@
r4 = sp, x1
r5 = sp+4, x2
*/
-ENTRY(rsdIntrinsicBlurHF_K)
+ENTRY(rsdIntrinsicBlurHFU4_K)
push {r4-r8, r10, r11, lr}
vpush {q4-q7}
@@ -357,8 +357,7 @@
2:
vld1.32 {q1}, [r7]!
vld1.32 {q2}, [r7]!
- vld1.32 {d6[0]}, [r10]!
- vld1.32 {d6[1]}, [r10]!
+ vld1.32 {d6}, [r10]!
vmla.f32 q0, q1, d6[0]
vmla.f32 q0, q2, d6[1]
subs r11, r11, #2
@@ -376,7 +375,7 @@
vpop {q4-q7}
pop {r4-r8, r10, r11, lr}
bx lr
-END(rsdIntrinsicBlurHF_K)
+END(rsdIntrinsicBlurHFU4_K)
/*
r0 = dst