| /* |
| * Copyright (C) 2014 The Android Open Source Project |
| * |
| * Licensed under the Apache License, Version 2.0 (the "License"); |
| * you may not use this file except in compliance with the License. |
| * You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| #include <math.h> |
| |
| #include <cstdint> |
| |
| #include "RenderScriptToolkit.h" |
| #include "TaskProcessor.h" |
| #include "Utils.h" |
| |
| #if defined(ARCH_X86_HAVE_AVX2) |
| #include <stdint.h> |
| #include <x86intrin.h> |
| #include <xmmintrin.h> |
| #endif |
| |
| #define LOG_TAG "renderscript.toolkit.Resize" |
| |
| namespace android { |
| namespace renderscript { |
| |
| class ResizeTask : public Task { |
| const uchar* mIn; |
| uchar* mOut; |
| float mScaleX; |
| float mScaleY; |
| size_t mInputSizeX; |
| size_t mInputSizeY; |
| |
| void kernelU1(uchar* outPtr, uint32_t xstart, uint32_t xend, uint32_t currentY); |
| void kernelU2(uchar* outPtr, uint32_t xstart, uint32_t xend, uint32_t currentY); |
| void kernelU4(uchar* outPtr, uint32_t xstart, uint32_t xend, uint32_t currentY); |
| #ifdef ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT |
| void kernelF1(uchar* outPtr, uint32_t xstart, uint32_t xend, uint32_t currentY); |
| void kernelF2(uchar* outPtr, uint32_t xstart, uint32_t xend, uint32_t currentY); |
| void kernelF4(uchar* outPtr, uint32_t xstart, uint32_t xend, uint32_t currentY); |
| #endif // ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT |
| |
| // Process a 2D tile of the overall work. threadIndex identifies which thread does the work. |
| virtual void processData(int threadIndex, size_t startX, size_t startY, size_t endX, |
| size_t endY) override; |
| |
| public: |
| ResizeTask(const uchar* input, uchar* output, size_t inputSizeX, size_t inputSizeY, |
| size_t vectorSize, size_t outputSizeX, size_t outputSizeY, |
| const Restriction* restriction) |
| : Task{outputSizeX, outputSizeY, vectorSize, false, restriction}, |
| mIn{input}, |
| mOut{output}, |
| mInputSizeX{inputSizeX}, |
| mInputSizeY{inputSizeY} { |
| mScaleX = static_cast<float>(inputSizeX) / outputSizeX; |
| mScaleY = static_cast<float>(inputSizeY) / outputSizeY; |
| } |
| }; |
| |
| void ResizeTask::processData(int /* threadIndex */, size_t startX, size_t startY, size_t endX, |
| size_t endY) { |
| typedef void (ResizeTask::*KernelFunction)(uchar*, uint32_t, uint32_t, uint32_t); |
| |
| KernelFunction kernel; |
| switch (mVectorSize) { |
| case 4: |
| kernel = &ResizeTask::kernelU4; |
| break; |
| case 3: |
| kernel = &ResizeTask::kernelU4; |
| break; |
| case 2: |
| kernel = &ResizeTask::kernelU2; |
| break; |
| case 1: |
| kernel = &ResizeTask::kernelU1; |
| break; |
| default: |
| ALOGE("Bad vector size %zd", mVectorSize); |
| } |
| |
| for (size_t y = startY; y < endY; y++) { |
| size_t offset = (mSizeX * y + startX) * paddedSize(mVectorSize); |
| uchar* out = mOut + offset; |
| std::invoke(kernel, this, out, startX, endX, y); |
| } |
| } |
| |
| static float4 cubicInterpolate(float4 p0, float4 p1, float4 p2, float4 p3, float x) { |
| return p1 + 0.5f * x * (p2 - p0 + x * (2.f * p0 - 5.f * p1 + 4.f * p2 - p3 |
| + x * (3.f * (p1 - p2) + p3 - p0))); |
| } |
| |
| static float2 cubicInterpolate(float2 p0,float2 p1,float2 p2,float2 p3, float x) { |
| return p1 + 0.5f * x * (p2 - p0 + x * (2.f * p0 - 5.f * p1 + 4.f * p2 - p3 |
| + x * (3.f * (p1 - p2) + p3 - p0))); |
| } |
| |
| |
/**
 * Scalar cubic interpolation through four equally spaced samples.
 *
 * Evaluates to p1 at x == 0 and to p2 at x == 1. Two variants exist: the AVX2 build computes
 * two of the inner terms with fused multiply-add/subtract intrinsics, the generic build uses
 * plain arithmetic.
 */
#if defined(ARCH_X86_HAVE_AVX2)
static float cubicInterpolate(float p0, float p1, float p2, float p3, float x) {
    // 4 * p2 - p3, computed as one fused multiply-subtract.
    const float fourP2MinusP3 = _mm_cvtss_f32(
            _mm_fmsub_ss(_mm_set1_ps(4.f), _mm_set1_ps(p2), _mm_set1_ps(p3)));
    // 3 * (p1 - p2) + (p3 - p0), computed as one fused multiply-add.
    const float cubicTerm = _mm_cvtss_f32(
            _mm_fmadd_ss(_mm_set1_ps(3.f), _mm_set1_ps(p1 - p2), _mm_set1_ps(p3 - p0)));

    return p1 +
           0.5f * x * (p2 - p0 + x * (2.f * p0 - 5.f * p1 + fourP2MinusP3 + x * (cubicTerm)));
}
#else
static float cubicInterpolate(float p0, float p1, float p2, float p3, float x) {
    // Kept as a single expression so the evaluation order matches the vector overloads.
    return p1 +
           0.5f * x *
                   (p2 - p0 +
                    x * (2.f * p0 - 5.f * p1 + 4.f * p2 - p3 +
                         x * (3.f * (p1 - p2) + p3 - p0)));
}
#endif
| |
| static uchar4 OneBiCubic(const uchar4 *yp0, const uchar4 *yp1, const uchar4 *yp2, const uchar4 *yp3, |
| float xf, float yf, int width) { |
| int startx = (int) floor(xf - 1); |
| xf = xf - floor(xf); |
| int maxx = width - 1; |
| int xs0 = std::max(0, startx + 0); |
| int xs1 = std::max(0, startx + 1); |
| int xs2 = std::min(maxx, startx + 2); |
| int xs3 = std::min(maxx, startx + 3); |
| |
| float4 p0 = cubicInterpolate(convert<float4>(yp0[xs0]), |
| convert<float4>(yp0[xs1]), |
| convert<float4>(yp0[xs2]), |
| convert<float4>(yp0[xs3]), xf); |
| |
| float4 p1 = cubicInterpolate(convert<float4>(yp1[xs0]), |
| convert<float4>(yp1[xs1]), |
| convert<float4>(yp1[xs2]), |
| convert<float4>(yp1[xs3]), xf); |
| |
| float4 p2 = cubicInterpolate(convert<float4>(yp2[xs0]), |
| convert<float4>(yp2[xs1]), |
| convert<float4>(yp2[xs2]), |
| convert<float4>(yp2[xs3]), xf); |
| |
| float4 p3 = cubicInterpolate(convert<float4>(yp3[xs0]), |
| convert<float4>(yp3[xs1]), |
| convert<float4>(yp3[xs2]), |
| convert<float4>(yp3[xs3]), xf); |
| |
| float4 p = cubicInterpolate(p0, p1, p2, p3, yf); |
| p = clamp(p + 0.5f, 0.f, 255.f); |
| return convert<uchar4>(p); |
| } |
| |
| static uchar2 OneBiCubic(const uchar2 *yp0, const uchar2 *yp1, const uchar2 *yp2, const uchar2 *yp3, |
| float xf, float yf, int width) { |
| int startx = (int) floor(xf - 1); |
| xf = xf - floor(xf); |
| int maxx = width - 1; |
| int xs0 = std::max(0, startx + 0); |
| int xs1 = std::max(0, startx + 1); |
| int xs2 = std::min(maxx, startx + 2); |
| int xs3 = std::min(maxx, startx + 3); |
| |
| float2 p0 = cubicInterpolate(convert<float2>(yp0[xs0]), |
| convert<float2>(yp0[xs1]), |
| convert<float2>(yp0[xs2]), |
| convert<float2>(yp0[xs3]), xf); |
| |
| float2 p1 = cubicInterpolate(convert<float2>(yp1[xs0]), |
| convert<float2>(yp1[xs1]), |
| convert<float2>(yp1[xs2]), |
| convert<float2>(yp1[xs3]), xf); |
| |
| float2 p2 = cubicInterpolate(convert<float2>(yp2[xs0]), |
| convert<float2>(yp2[xs1]), |
| convert<float2>(yp2[xs2]), |
| convert<float2>(yp2[xs3]), xf); |
| |
| float2 p3 = cubicInterpolate(convert<float2>(yp3[xs0]), |
| convert<float2>(yp3[xs1]), |
| convert<float2>(yp3[xs2]), |
| convert<float2>(yp3[xs3]), xf); |
| |
| float2 p = cubicInterpolate(p0, p1, p2, p3, yf); |
| p = clamp(p + 0.5f, 0.f, 255.f); |
| return convert<uchar2>(p); |
| } |
| |
| static uchar OneBiCubic(const uchar *yp0, const uchar *yp1, const uchar *yp2, const uchar *yp3, |
| float xf, float yf, int width) { |
| int startx = (int) floor(xf - 1); |
| xf = xf - floor(xf); |
| int maxx = width - 1; |
| int xs0 = std::max(0, startx + 0); |
| int xs1 = std::max(0, startx + 1); |
| int xs2 = std::min(maxx, startx + 2); |
| int xs3 = std::min(maxx, startx + 3); |
| |
| float p0 = cubicInterpolate((float)yp0[xs0], (float)yp0[xs1], |
| (float)yp0[xs2], (float)yp0[xs3], xf); |
| float p1 = cubicInterpolate((float)yp1[xs0], (float)yp1[xs1], |
| (float)yp1[xs2], (float)yp1[xs3], xf); |
| float p2 = cubicInterpolate((float)yp2[xs0], (float)yp2[xs1], |
| (float)yp2[xs2], (float)yp2[xs3], xf); |
| float p3 = cubicInterpolate((float)yp3[xs0], (float)yp3[xs1], |
| (float)yp3[xs2], (float)yp3[xs3], xf); |
| |
| float p = cubicInterpolate(p0, p1, p2, p3, yf); |
| p = clamp(p + 0.5f, 0.f, 255.f); |
| //ALOGI("CUC,%f,%u", p, (uchar)p); |
| return (uchar)p; |
| } |
| |
| extern "C" uint64_t rsdIntrinsicResize_oscctl_K(uint32_t xinc); |
| |
| extern "C" void rsdIntrinsicResizeB4_K( |
| uchar4 *dst, |
| size_t count, |
| uint32_t xf, |
| uint32_t xinc, |
| uchar4 const *srcn, |
| uchar4 const *src0, |
| uchar4 const *src1, |
| uchar4 const *src2, |
| size_t xclip, |
| size_t avail, |
| uint64_t osc_ctl, |
| int32_t const *yr); |
| |
| extern "C" void rsdIntrinsicResizeB2_K( |
| uchar2 *dst, |
| size_t count, |
| uint32_t xf, |
| uint32_t xinc, |
| uchar2 const *srcn, |
| uchar2 const *src0, |
| uchar2 const *src1, |
| uchar2 const *src2, |
| size_t xclip, |
| size_t avail, |
| uint64_t osc_ctl, |
| int32_t const *yr); |
| |
| extern "C" void rsdIntrinsicResizeB1_K( |
| uchar *dst, |
| size_t count, |
| uint32_t xf, |
| uint32_t xinc, |
| uchar const *srcn, |
| uchar const *src0, |
| uchar const *src1, |
| uchar const *src2, |
| size_t xclip, |
| size_t avail, |
| uint64_t osc_ctl, |
| int32_t const *yr); |
| |
| #if defined(ARCH_ARM_USE_INTRINSICS) |
/**
 * Fills yr[0..3] with the four vertical cubic coefficients for fraction yf, expressed in
 * 16.16 fixed point (0x10000 represents 1.0).
 *
 * yr[1] and yr[2] are the weights of the two inner rows. yr[0] and yr[3] are stored with the
 * opposite sign of the corresponding terms in cubicInterpolate.
 * NOTE(review): presumably the consumer subtracts the outer taps; confirm against the
 * rsdIntrinsicResizeB*_K implementation.
 */
static void mkYCoeff(int32_t *yr, float yf) {
    // yf, yf^2 and yf^3 rounded to 16.16 fixed point.
    const int32_t t1 = rint(yf * 0x10000);
    const int32_t t2 = rint(yf * yf * 0x10000);
    const int32_t t3 = rint(yf * yf * yf * 0x10000);

    const int32_t coefficients[4] = {
            -(2 * t2 - t3 - t1) >> 1,
            (3 * t3 - 5 * t2 + 0x20000) >> 1,
            (-3 * t3 + 4 * t2 + t1) >> 1,
            -(t3 - t2) >> 1,
    };

    for (int i = 0; i < 4; ++i) {
        yr[i] = coefficients[i];
    }
}
| #endif |
| |
| #ifdef ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT |
| static float4 OneBiCubic(const float4 *yp0, const float4 *yp1, const float4 *yp2, const float4 *yp3, |
| float xf, float yf, int width) { |
| int startx = (int) floor(xf - 1); |
| xf = xf - floor(xf); |
| int maxx = width - 1; |
| int xs0 = std::max(0, startx + 0); |
| int xs1 = std::max(0, startx + 1); |
| int xs2 = std::min(maxx, startx + 2); |
| int xs3 = std::min(maxx, startx + 3); |
| |
| float4 p0 = cubicInterpolate(yp0[xs0], yp0[xs1], |
| yp0[xs2], yp0[xs3], xf); |
| float4 p1 = cubicInterpolate(yp1[xs0], yp1[xs1], |
| yp1[xs2], yp1[xs3], xf); |
| float4 p2 = cubicInterpolate(yp2[xs0], yp2[xs1], |
| yp2[xs2], yp2[xs3], xf); |
| float4 p3 = cubicInterpolate(yp3[xs0], yp3[xs1], |
| yp3[xs2], yp3[xs3], xf); |
| |
| float4 p = cubicInterpolate(p0, p1, p2, p3, yf); |
| return p; |
| } |
| |
| static float2 OneBiCubic(const float2 *yp0, const float2 *yp1, const float2 *yp2, const float2 *yp3, |
| float xf, float yf, int width) { |
| int startx = (int) floor(xf - 1); |
| xf = xf - floor(xf); |
| int maxx = width - 1; |
| int xs0 = std::max(0, startx + 0); |
| int xs1 = std::max(0, startx + 1); |
| int xs2 = std::min(maxx, startx + 2); |
| int xs3 = std::min(maxx, startx + 3); |
| |
| float2 p0 = cubicInterpolate(yp0[xs0], yp0[xs1], |
| yp0[xs2], yp0[xs3], xf); |
| float2 p1 = cubicInterpolate(yp1[xs0], yp1[xs1], |
| yp1[xs2], yp1[xs3], xf); |
| float2 p2 = cubicInterpolate(yp2[xs0], yp2[xs1], |
| yp2[xs2], yp2[xs3], xf); |
| float2 p3 = cubicInterpolate(yp3[xs0], yp3[xs1], |
| yp3[xs2], yp3[xs3], xf); |
| |
| float2 p = cubicInterpolate(p0, p1, p2, p3, yf); |
| return p; |
| } |
| |
| static float OneBiCubic(const float *yp0, const float *yp1, const float *yp2, const float *yp3, |
| float xf, float yf, int width) { |
| int startx = (int) floor(xf - 1); |
| xf = xf - floor(xf); |
| int maxx = width - 1; |
| int xs0 = std::max(0, startx + 0); |
| int xs1 = std::max(0, startx + 1); |
| int xs2 = std::min(maxx, startx + 2); |
| int xs3 = std::min(maxx, startx + 3); |
| |
| float p0 = cubicInterpolate(yp0[xs0], yp0[xs1], |
| yp0[xs2], yp0[xs3], xf); |
| float p1 = cubicInterpolate(yp1[xs0], yp1[xs1], |
| yp1[xs2], yp1[xs3], xf); |
| float p2 = cubicInterpolate(yp2[xs0], yp2[xs1], |
| yp2[xs2], yp2[xs3], xf); |
| float p3 = cubicInterpolate(yp3[xs0], yp3[xs1], |
| yp3[xs2], yp3[xs3], xf); |
| |
| float p = cubicInterpolate(p0, p1, p2, p3, yf); |
| return p; |
| } |
| #endif |
| |
| void ResizeTask::kernelU4(uchar *outPtr, uint32_t xstart, uint32_t xend, uint32_t currentY) { |
| const uchar *pin = mIn; |
| const int srcHeight = mInputSizeY; |
| const int srcWidth = mInputSizeX; |
| const size_t stride = mInputSizeX * paddedSize(mVectorSize); |
| |
| |
| #if defined(ARCH_X86_HAVE_AVX2) |
| float yf = _mm_cvtss_f32(_mm_fmsub_ss(_mm_set1_ps(currentY + 0.5f), |
| _mm_set1_ps(scaleY), _mm_set1_ps(0.5f))); |
| #else |
| float yf = (currentY + 0.5f) * mScaleY - 0.5f; |
| #endif |
| |
| |
| int starty = (int) floor(yf - 1); |
| yf = yf - floor(yf); |
| int maxy = srcHeight - 1; |
| int ys0 = std::max(0, starty + 0); |
| int ys1 = std::max(0, starty + 1); |
| int ys2 = std::min(maxy, starty + 2); |
| int ys3 = std::min(maxy, starty + 3); |
| |
| const uchar4 *yp0 = (const uchar4 *)(pin + stride * ys0); |
| const uchar4 *yp1 = (const uchar4 *)(pin + stride * ys1); |
| const uchar4 *yp2 = (const uchar4 *)(pin + stride * ys2); |
| const uchar4 *yp3 = (const uchar4 *)(pin + stride * ys3); |
| |
| uchar4 *out = ((uchar4 *)outPtr); |
| uint32_t x1 = xstart; |
| uint32_t x2 = xend; |
| |
| #if defined(ARCH_ARM_USE_INTRINSICS) |
| if (mUsesSimd && x2 > x1 && mScaleX < 4.0f) { |
| float xf = (x1 + 0.5f) * mScaleX - 0.5f; |
| long xf16 = rint(xf * 0x10000); |
| uint32_t xinc16 = rint(mScaleX * 0x10000); |
| |
| int xoff = (xf16 >> 16) - 1; |
| int xclip = std::max(0, xoff) - xoff; |
| int len = x2 - x1; |
| |
| int32_t yr[4]; |
| uint64_t osc_ctl = rsdIntrinsicResize_oscctl_K(xinc16); |
| mkYCoeff(yr, yf); |
| |
| xoff += xclip; |
| |
| rsdIntrinsicResizeB4_K( |
| out, len, |
| xf16 & 0xffff, xinc16, |
| yp0 + xoff, yp1 + xoff, yp2 + xoff, yp3 + xoff, |
| xclip, srcWidth - xoff + xclip, |
| osc_ctl, yr); |
| out += len; |
| x1 += len; |
| } |
| #endif |
| |
| while(x1 < x2) { |
| #if defined(ARCH_X86_HAVE_AVX2) |
| float xf = _mm_cvtss_f32(_mm_fmsub_ss(_mm_set1_ps(x1 + 0.5f) , _mm_set1_ps(scaleX) , |
| _mm_set1_ps(0.5f))); |
| #else |
| float xf = (x1 + 0.5f) * mScaleX - 0.5f; |
| #endif |
| *out = OneBiCubic(yp0, yp1, yp2, yp3, xf, yf, srcWidth); |
| out++; |
| x1++; |
| } |
| } |
| |
| void ResizeTask::kernelU2(uchar* outPtr, uint32_t xstart, uint32_t xend, uint32_t currentY) { |
| const uchar *pin = mIn; |
| const int srcHeight = mInputSizeY; |
| const int srcWidth = mInputSizeX; |
| const size_t stride = mInputSizeX * mVectorSize; |
| |
| |
| #if defined(ARCH_X86_HAVE_AVX2) |
| float yf = _mm_cvtss_f32( |
| _mm_fmsub_ss(_mm_set1_ps(currentY + 0.5f), _mm_set1_ps(scaleY), _mm_set1_ps(0.5f))); |
| #else |
| float yf = (currentY + 0.5f) * mScaleY - 0.5f; |
| #endif |
| |
| int starty = (int) floor(yf - 1); |
| yf = yf - floor(yf); |
| int maxy = srcHeight - 1; |
| int ys0 = std::max(0, starty + 0); |
| int ys1 = std::max(0, starty + 1); |
| int ys2 = std::min(maxy, starty + 2); |
| int ys3 = std::min(maxy, starty + 3); |
| |
| const uchar2 *yp0 = (const uchar2 *)(pin + stride * ys0); |
| const uchar2 *yp1 = (const uchar2 *)(pin + stride * ys1); |
| const uchar2 *yp2 = (const uchar2 *)(pin + stride * ys2); |
| const uchar2 *yp3 = (const uchar2 *)(pin + stride * ys3); |
| |
| uchar2 *out = ((uchar2 *)outPtr); |
| uint32_t x1 = xstart; |
| uint32_t x2 = xend; |
| |
| #if defined(ARCH_ARM_USE_INTRINSICS) |
| if (mUsesSimd && x2 > x1 && mScaleX < 4.0f) { |
| float xf = (x1 + 0.5f) * mScaleX - 0.5f; |
| long xf16 = rint(xf * 0x10000); |
| uint32_t xinc16 = rint(mScaleX * 0x10000); |
| |
| int xoff = (xf16 >> 16) - 1; |
| int xclip = std::max(0, xoff) - xoff; |
| int len = x2 - x1; |
| |
| int32_t yr[4]; |
| uint64_t osc_ctl = rsdIntrinsicResize_oscctl_K(xinc16); |
| mkYCoeff(yr, yf); |
| |
| xoff += xclip; |
| |
| rsdIntrinsicResizeB2_K( |
| out, len, |
| xf16 & 0xffff, xinc16, |
| yp0 + xoff, yp1 + xoff, yp2 + xoff, yp3 + xoff, |
| xclip, srcWidth - xoff + xclip, |
| osc_ctl, yr); |
| out += len; |
| x1 += len; |
| } |
| #endif |
| |
| while(x1 < x2) { |
| |
| #if defined(ARCH_X86_HAVE_AVX2) |
| float xf = _mm_cvtss_f32(_mm_fmsub_ss(_mm_set1_ps(x1 + 0.5f) , _mm_set1_ps(scaleX) , |
| _mm_set1_ps(0.5f))); |
| #else |
| float xf = (x1 + 0.5f) * mScaleX - 0.5f; |
| #endif |
| *out = OneBiCubic(yp0, yp1, yp2, yp3, xf, yf, srcWidth); |
| out++; |
| x1++; |
| } |
| } |
| |
| void ResizeTask::kernelU1(uchar* outPtr, uint32_t xstart, uint32_t xend, uint32_t currentY) { |
| //ALOGI("TK kernelU1 xstart %u, xend %u, outstep %u", xstart, xend); |
| const uchar *pin = mIn; |
| const int srcHeight = mInputSizeY; |
| const int srcWidth = mInputSizeX; |
| const size_t stride = mInputSizeX * mVectorSize; |
| |
| // ALOGI("Toolkit ResizeU1 (%ux%u) by (%f,%f), xstart:%u to %u, stride %zu, out %p", srcWidth, |
| // srcHeight, scaleX, scaleY, xstart, xend, stride, outPtr); |
| |
| #if defined(ARCH_X86_HAVE_AVX2) |
| float yf = _mm_cvtss_f32( |
| _mm_fmsub_ss(_mm_set1_ps(currentY + 0.5f), _mm_set1_ps(scaleY), _mm_set1_ps(0.5f))); |
| #else |
| float yf = (currentY + 0.5f) * mScaleY - 0.5f; |
| #endif |
| |
| int starty = (int) floor(yf - 1); |
| yf = yf - floor(yf); |
| int maxy = srcHeight - 1; |
| int ys0 = std::max(0, starty + 0); |
| int ys1 = std::min(maxy, std::max(0, starty + 1)); |
| int ys2 = std::min(maxy, starty + 2); |
| int ys3 = std::min(maxy, starty + 3); |
| |
| const uchar *yp0 = pin + stride * ys0; |
| const uchar *yp1 = pin + stride * ys1; |
| const uchar *yp2 = pin + stride * ys2; |
| const uchar *yp3 = pin + stride * ys3; |
| |
| uchar *out = ((uchar *)outPtr); |
| uint32_t x1 = xstart; |
| uint32_t x2 = xend; |
| |
| #if defined(ARCH_ARM_USE_INTRINSICS) |
| if (mUsesSimd && x2 > x1 && mScaleX < 4.0f) { |
| float xf = (x1 + 0.5f) * mScaleX - 0.5f; |
| long xf16 = rint(xf * 0x10000); |
| uint32_t xinc16 = rint(mScaleX * 0x10000); |
| |
| int xoff = (xf16 >> 16) - 1; |
| int xclip = std::max(0, xoff) - xoff; |
| int len = x2 - x1; |
| |
| int32_t yr[4]; |
| uint64_t osc_ctl = rsdIntrinsicResize_oscctl_K(xinc16); |
| mkYCoeff(yr, yf); |
| |
| // ALOGI("ys0 %d, ys1 %d, ys2 %d, ys3 %d, x1 %u, x2 %u, xf %f, xf16 %ld, xinc16 %u, xoff %d, |
| // xclip %d, len %d, osc_ctl %lu)", |
| // ys0, ys1, ys2, ys3, x1, x2, xf, xf16, xinc16, xoff, xclip, len, (unsigned long) |
| // osc_ctl); |
| // ALOGI("TK scaleX %f, xf %f, xf16 %ld, xinc16 %d, xoff %d, xclip %d, len %d", scaleX, xf, |
| // xf16, xinc16, xoff, xclip, len); ALOGI("TK xf16 & 0xffff %ld, ys0 %u, ys1 %u, ys2 %u, ys3 |
| // %u, srcWidth - xoff + xclip %d", xf16 & 0xffff, ys0, ys1, ys2, ys3, srcWidth - xoff); |
| |
| xoff += xclip; |
| |
| rsdIntrinsicResizeB1_K( |
| out, len, |
| xf16 & 0xffff, xinc16, |
| yp0 + xoff, yp1 + xoff, yp2 + xoff, yp3 + xoff, |
| xclip, srcWidth - xoff + xclip, |
| osc_ctl, yr); |
| out += len; |
| x1 += len; |
| } |
| #endif |
| |
| while(x1 < x2) { |
| |
| #if defined(ARCH_X86_HAVE_AVX2) |
| float xf = _mm_cvtss_f32(_mm_fmsub_ss(_mm_set1_ps(x1 + 0.5f) , _mm_set1_ps(scaleX) , |
| _mm_set1_ps(0.5f))); |
| #else |
| float xf = (x1 + 0.5f) * mScaleX - 0.5f; |
| #endif |
| |
| *out = OneBiCubic(yp0, yp1, yp2, yp3, xf, yf, srcWidth); |
| out++; |
| x1++; |
| } |
| } |
| |
| #ifdef ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT |
| void ResizeTask::kernelF4(uchar* outPtr, uint32_t xstart, uint32_t xend, uint32_t currentY) { |
| const uchar *pin = mIn; |
| const int srcHeight = inputSizeY; |
| const int srcWidth = inputSizeX; |
| const size_t stride = sizeX * vectorSize; |
| |
| #if defined(ARCH_X86_HAVE_AVX2) |
| float yf = _mm_cvtss_f32( |
| _mm_fmsub_ss(_mm_set1_ps(currentY + 0.5f), _mm_set1_ps(scaleY), _mm_set1_ps(0.5f))); |
| #else |
| float yf = (currentY + 0.5f) * scaleY - 0.5f; |
| #endif |
| |
| int starty = (int) floor(yf - 1); |
| yf = yf - floor(yf); |
| int maxy = srcHeight - 1; |
| int ys0 = std::max(0, starty + 0); |
| int ys1 = std::max(0, starty + 1); |
| int ys2 = std::min(maxy, starty + 2); |
| int ys3 = std::min(maxy, starty + 3); |
| |
| const float4 *yp0 = (const float4 *)(pin + stride * ys0); |
| const float4 *yp1 = (const float4 *)(pin + stride * ys1); |
| const float4 *yp2 = (const float4 *)(pin + stride * ys2); |
| const float4 *yp3 = (const float4 *)(pin + stride * ys3); |
| |
| float4 *out = ((float4 *)outPtr); |
| uint32_t x1 = xstart; |
| uint32_t x2 = xend; |
| |
| while(x1 < x2) { |
| |
| #if defined(ARCH_X86_HAVE_AVX2) |
| float xf = _mm_cvtss_f32(_mm_fmsub_ss(_mm_set1_ps(x1 + 0.5f) , _mm_set1_ps(scaleX) , |
| _mm_set1_ps(0.5f))); |
| #else |
| float xf = (x1 + 0.5f) * scaleX - 0.5f; |
| #endif |
| |
| *out = OneBiCubic(yp0, yp1, yp2, yp3, xf, yf, srcWidth); |
| out++; |
| x1++; |
| } |
| } |
| |
| void ResizeTask::kernelF2(uchar* outPtr, uint32_t xstart, uint32_t xend, uint32_t currentY) { |
| const uchar *pin = mIn; |
| const int srcHeight = inputSizeY; |
| const int srcWidth = inputSizeX; |
| const size_t stride = sizeX * vectorSize; |
| |
| |
| #if defined(ARCH_X86_HAVE_AVX2) |
| float yf = _mm_cvtss_f32(_mm_fmsub_ss(_mm_set1_ps(currentY + 0.5f), |
| _mm_set1_ps(scaleY), _mm_set1_ps(0.5f))); |
| #else |
| float yf = (currentY + 0.5f) * scaleY - 0.5f; |
| #endif |
| |
| int starty = (int) floor(yf - 1); |
| yf = yf - floor(yf); |
| int maxy = srcHeight - 1; |
| int ys0 = std::max(0, starty + 0); |
| int ys1 = std::max(0, starty + 1); |
| int ys2 = std::min(maxy, starty + 2); |
| int ys3 = std::min(maxy, starty + 3); |
| |
| const float2 *yp0 = (const float2 *)(pin + stride * ys0); |
| const float2 *yp1 = (const float2 *)(pin + stride * ys1); |
| const float2 *yp2 = (const float2 *)(pin + stride * ys2); |
| const float2 *yp3 = (const float2 *)(pin + stride * ys3); |
| |
| float2 *out = ((float2 *)outPtr); |
| uint32_t x1 = xstart; |
| uint32_t x2 = xend; |
| |
| while(x1 < x2) { |
| |
| #if defined(ARCH_X86_HAVE_AVX2) |
| float xf = _mm_cvtss_f32(_mm_fmsub_ss(_mm_set1_ps(x1 + 0.5f) , _mm_set1_ps(scaleX) , |
| _mm_set1_ps(0.5f))); |
| #else |
| float xf = (x1 + 0.5f) * scaleX - 0.5f; |
| #endif |
| |
| *out = OneBiCubic(yp0, yp1, yp2, yp3, xf, yf, srcWidth); |
| out++; |
| x1++; |
| } |
| } |
| |
| void ResizeTask::kernelF1(uchar* outPtr, uint32_t xstart, uint32_t xend, uint32_t currentY) { |
| const uchar *pin = mIn; |
| const int srcHeight = inputSizeY; |
| const int srcWidth = inputSizeX; |
| const size_t stride = sizeX * vectorSize; |
| |
| |
| #if defined(ARCH_X86_HAVE_AVX2) |
| float yf = _mm_cvtss_f32(_mm_fmsub_ss(_mm_set1_ps(currentY + 0.5f), |
| _mm_set1_ps(scaleY), _mm_set1_ps(0.5f))); |
| #else |
| float yf = (currentY + 0.5f) * scaleY - 0.5f; |
| #endif |
| |
| int starty = (int) floor(yf - 1); |
| yf = yf - floor(yf); |
| int maxy = srcHeight - 1; |
| int ys0 = std::max(0, starty + 0); |
| int ys1 = std::max(0, starty + 1); |
| int ys2 = std::min(maxy, starty + 2); |
| int ys3 = std::min(maxy, starty + 3); |
| |
| const float *yp0 = (const float *)(pin + stride * ys0); |
| const float *yp1 = (const float *)(pin + stride * ys1); |
| const float *yp2 = (const float *)(pin + stride * ys2); |
| const float *yp3 = (const float *)(pin + stride * ys3); |
| |
| float *out = ((float *)outPtr); |
| uint32_t x1 = xstart; |
| uint32_t x2 = xend; |
| |
| while(x1 < x2) { |
| |
| #if defined(ARCH_X86_HAVE_AVX2) |
| float xf = _mm_cvtss_f32(_mm_fmsub_ss(_mm_set1_ps(x1 + 0.5f) , _mm_set1_ps(scaleX) , |
| _mm_set1_ps(0.5f))); |
| #else |
| float xf = (x1 + 0.5f) * scaleX - 0.5f; |
| #endif |
| |
| *out = OneBiCubic(yp0, yp1, yp2, yp3, xf, yf, srcWidth); |
| out++; |
| x1++; |
| } |
| } |
| |
| void ResizeTask::preLaunch(uint32_t slot, const RsScriptCall *sc) |
| { |
| |
| //check the data type to determine F or U. |
| if (mAlloc->getType()->getElement()->getType() == RS_TYPE_UNSIGNED_8) { |
| switch(mAlloc->getType()->getElement()->getVectorSize()) { |
| case 1: |
| mRootPtr = &kernelU1; |
| break; |
| case 2: |
| mRootPtr = &kernelU2; |
| break; |
| case 3: |
| case 4: |
| mRootPtr = &kernelU4; |
| break; |
| } |
| } else { |
| switch(mAlloc->getType()->getElement()->getVectorSize()) { |
| case 1: |
| mRootPtr = &kernelF1; |
| break; |
| case 2: |
| mRootPtr = &kernelF2; |
| break; |
| case 3: |
| case 4: |
| mRootPtr = &kernelF4; |
| break; |
| } |
| } |
| } |
| #endif // ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT |
| |
| void RenderScriptToolkit::resize(const uint8_t* input, uint8_t* output, size_t inputSizeX, |
| size_t inputSizeY, size_t vectorSize, size_t outputSizeX, |
| size_t outputSizeY, const Restriction* restriction) { |
| #ifdef ANDROID_RENDERSCRIPT_TOOLKIT_VALIDATE |
| if (!validRestriction(LOG_TAG, outputSizeX, outputSizeY, restriction)) { |
| return; |
| } |
| if (vectorSize < 1 || vectorSize > 4) { |
| ALOGE("The vectorSize should be between 1 and 4. %zu provided.", vectorSize); |
| return; |
| } |
| #endif |
| |
| ResizeTask task((const uchar*)input, (uchar*)output, inputSizeX, inputSizeY, vectorSize, |
| outputSizeX, outputSizeY, restriction); |
| processor->doTask(&task); |
| } |
| |
| } // namespace renderscript |
| } // namespace android |