// toolkit/Resize.cpp - platform/frameworks/rs - Git at Google

 /*
  * Copyright (C) 2014 The Android Open Source Project
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
  * You may obtain a copy of the License at
  *
  *      http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 #include <math.h>

 #include <cstdint>

 #include "RenderScriptToolkit.h"
 #include "TaskProcessor.h"
 #include "Utils.h"

 #if defined(ARCH_X86_HAVE_AVX2)
 #include <stdint.h>
 #include <x86intrin.h>
 #include <xmmintrin.h>
 #endif

 #define LOG_TAG "renderscript.toolkit.Resize"

 namespace android {
 namespace renderscript {

 class ResizeTask : public Task {
     const uchar* mIn;
     uchar* mOut;
     float mScaleX;
     float mScaleY;
     size_t mInputSizeX;
     size_t mInputSizeY;

     void kernelU1(uchar* outPtr, uint32_t xstart, uint32_t xend, uint32_t currentY);
     void kernelU2(uchar* outPtr, uint32_t xstart, uint32_t xend, uint32_t currentY);
     void kernelU4(uchar* outPtr, uint32_t xstart, uint32_t xend, uint32_t currentY);
 #ifdef ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT
     void kernelF1(uchar* outPtr, uint32_t xstart, uint32_t xend, uint32_t currentY);
     void kernelF2(uchar* outPtr, uint32_t xstart, uint32_t xend, uint32_t currentY);
     void kernelF4(uchar* outPtr, uint32_t xstart, uint32_t xend, uint32_t currentY);
 #endif  // ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT

     // Process a 2D tile of the overall work. threadIndex identifies which thread does the work.
     virtual void processData(int threadIndex, size_t startX, size_t startY, size_t endX,
                              size_t endY) override;

    public:
     ResizeTask(const uchar* input, uchar* output, size_t inputSizeX, size_t inputSizeY,
                size_t vectorSize, size_t outputSizeX, size_t outputSizeY,
                const Restriction* restriction)
         : Task{outputSizeX, outputSizeY, vectorSize, false, restriction},
           mIn{input},
           mOut{output},
           mInputSizeX{inputSizeX},
           mInputSizeY{inputSizeY} {
         mScaleX = static_cast<float>(inputSizeX) / outputSizeX;
         mScaleY = static_cast<float>(inputSizeY) / outputSizeY;
     }
 };

 void ResizeTask::processData(int /* threadIndex */, size_t startX, size_t startY, size_t endX,
                              size_t endY) {
     typedef void (ResizeTask::*KernelFunction)(uchar*, uint32_t, uint32_t, uint32_t);

     KernelFunction kernel;
     switch (mVectorSize) {
         case 4:
             kernel = &ResizeTask::kernelU4;
             break;
         case 3:
             kernel = &ResizeTask::kernelU4;
             break;
         case 2:
             kernel = &ResizeTask::kernelU2;
             break;
         case 1:
             kernel = &ResizeTask::kernelU1;
             break;
         default:
             ALOGE("Bad vector size %zd", mVectorSize);
     }

     for (size_t y = startY; y < endY; y++) {
         size_t offset = (mSizeX * y + startX) * paddedSize(mVectorSize);
         uchar* out = mOut + offset;
         std::invoke(kernel, this, out, startX, endX, y);
     }
 }

 // Cubic interpolation between p1 (x == 0) and p2 (x == 1), using p0 and p3 as the outer
 // neighbours. Applied independently to each of the four channels.
 static float4 cubicInterpolate(float4 p0, float4 p1, float4 p2, float4 p3, float x) {
     // Evaluated innermost term first, in the same order as a single nested expression.
     const float4 cubicTerm = 3.f * (p1 - p2) + p3 - p0;
     const float4 quadraticTerm = 2.f * p0 - 5.f * p1 + 4.f * p2 - p3 + x * cubicTerm;
     const float4 linearTerm = p2 - p0 + x * quadraticTerm;
     return p1 + 0.5f * x * linearTerm;
 }

 // Cubic interpolation between p1 (x == 0) and p2 (x == 1), using p0 and p3 as the outer
 // neighbours. Applied independently to each of the two channels.
 static float2 cubicInterpolate(float2 p0, float2 p1, float2 p2, float2 p3, float x) {
     // Evaluated innermost term first, in the same order as a single nested expression.
     const float2 cubicTerm = 3.f * (p1 - p2) + p3 - p0;
     const float2 quadraticTerm = 2.f * p0 - 5.f * p1 + 4.f * p2 - p3 + x * cubicTerm;
     const float2 linearTerm = p2 - p0 + x * quadraticTerm;
     return p1 + 0.5f * x * linearTerm;
 }


 #if defined(ARCH_X86_HAVE_AVX2)
 // Single-channel cubic interpolation between p1 (x == 0) and p2 (x == 1).
 // This variant computes two of the sub-terms with scalar fused multiply intrinsics.
 static float cubicInterpolate(float p0, float p1, float p2, float p3, float x) {
     // 4 * p2 - p3, fused.
     const float fourP2MinusP3 = _mm_cvtss_f32(
             _mm_fmsub_ss(_mm_set1_ps(4.f), _mm_set1_ps(p2), _mm_set1_ps(p3)));
     // 3 * (p1 - p2) + (p3 - p0), fused.
     const float cubicTerm = _mm_cvtss_f32(
             _mm_fmadd_ss(_mm_set1_ps(3.f), _mm_set1_ps(p1 - p2), _mm_set1_ps(p3 - p0)));
     return p1 + 0.5f * x * (p2 - p0 + x * (2.f * p0 - 5.f * p1 + fourP2MinusP3
             + x * cubicTerm));
 }
 #else
 // Single-channel cubic interpolation between p1 (x == 0) and p2 (x == 1), using p0 and p3
 // as the outer neighbours.
 static float cubicInterpolate(float p0, float p1, float p2, float p3, float x) {
     // Evaluated innermost term first, in the same order as a single nested expression.
     const float cubicTerm = 3.f * (p1 - p2) + p3 - p0;
     const float quadraticTerm = 2.f * p0 - 5.f * p1 + 4.f * p2 - p3 + x * cubicTerm;
     const float linearTerm = p2 - p0 + x * quadraticTerm;
     return p1 + 0.5f * x * linearTerm;
 }
 #endif

 // Computes one 4-channel 8-bit output cell.
 // yp0..yp3 are the four source rows to blend, xf is the source-space x coordinate,
 // yf is the vertical blend fraction and width is the number of cells in a source row.
 static uchar4 OneBiCubic(const uchar4 *yp0, const uchar4 *yp1, const uchar4 *yp2, const uchar4 *yp3,
                          float xf, float yf, int width) {
     // Leftmost of the four columns sampled, and the fractional position between the middle two.
     const int firstTap = static_cast<int>(floor(xf - 1));
     const float xFrac = xf - floor(xf);

     // Keep the sampled columns inside the row.
     const int lastColumn = width - 1;
     const int col0 = std::max(0, firstTap + 0);
     const int col1 = std::max(0, firstTap + 1);
     const int col2 = std::min(lastColumn, firstTap + 2);
     const int col3 = std::min(lastColumn, firstTap + 3);

     // Horizontal pass over a single source row.
     const auto blendRow = [=](const uchar4 *row) {
         return cubicInterpolate(convert<float4>(row[col0]),
                                 convert<float4>(row[col1]),
                                 convert<float4>(row[col2]),
                                 convert<float4>(row[col3]), xFrac);
     };
     const float4 row0 = blendRow(yp0);
     const float4 row1 = blendRow(yp1);
     const float4 row2 = blendRow(yp2);
     const float4 row3 = blendRow(yp3);

     // Vertical pass, then add 0.5 and clamp to the 8-bit range before converting.
     float4 blended = cubicInterpolate(row0, row1, row2, row3, yf);
     blended = clamp(blended + 0.5f, 0.f, 255.f);
     return convert<uchar4>(blended);
 }

 // Computes one 2-channel 8-bit output cell.
 // yp0..yp3 are the four source rows to blend, xf is the source-space x coordinate,
 // yf is the vertical blend fraction and width is the number of cells in a source row.
 static uchar2 OneBiCubic(const uchar2 *yp0, const uchar2 *yp1, const uchar2 *yp2, const uchar2 *yp3,
                          float xf, float yf, int width) {
     // Leftmost of the four columns sampled, and the fractional position between the middle two.
     const int firstTap = static_cast<int>(floor(xf - 1));
     const float xFrac = xf - floor(xf);

     // Keep the sampled columns inside the row.
     const int lastColumn = width - 1;
     const int col0 = std::max(0, firstTap + 0);
     const int col1 = std::max(0, firstTap + 1);
     const int col2 = std::min(lastColumn, firstTap + 2);
     const int col3 = std::min(lastColumn, firstTap + 3);

     // Horizontal pass over a single source row.
     const auto blendRow = [=](const uchar2 *row) {
         return cubicInterpolate(convert<float2>(row[col0]),
                                 convert<float2>(row[col1]),
                                 convert<float2>(row[col2]),
                                 convert<float2>(row[col3]), xFrac);
     };
     const float2 row0 = blendRow(yp0);
     const float2 row1 = blendRow(yp1);
     const float2 row2 = blendRow(yp2);
     const float2 row3 = blendRow(yp3);

     // Vertical pass, then add 0.5 and clamp to the 8-bit range before converting.
     float2 blended = cubicInterpolate(row0, row1, row2, row3, yf);
     blended = clamp(blended + 0.5f, 0.f, 255.f);
     return convert<uchar2>(blended);
 }

 // Computes one single-channel 8-bit output cell.
 // yp0..yp3 are the four source rows to blend, xf is the source-space x coordinate,
 // yf is the vertical blend fraction and width is the number of cells in a source row.
 static uchar OneBiCubic(const uchar *yp0, const uchar *yp1, const uchar *yp2, const uchar *yp3,
                         float xf, float yf, int width) {
     // Leftmost of the four columns sampled, and the fractional position between the middle two.
     const int firstTap = static_cast<int>(floor(xf - 1));
     const float xFrac = xf - floor(xf);

     // Keep the sampled columns inside the row.
     const int lastColumn = width - 1;
     const int col0 = std::max(0, firstTap + 0);
     const int col1 = std::max(0, firstTap + 1);
     const int col2 = std::min(lastColumn, firstTap + 2);
     const int col3 = std::min(lastColumn, firstTap + 3);

     // Horizontal pass over a single source row.
     const auto blendRow = [=](const uchar *row) {
         return cubicInterpolate(static_cast<float>(row[col0]), static_cast<float>(row[col1]),
                                 static_cast<float>(row[col2]), static_cast<float>(row[col3]),
                                 xFrac);
     };
     const float row0 = blendRow(yp0);
     const float row1 = blendRow(yp1);
     const float row2 = blendRow(yp2);
     const float row3 = blendRow(yp3);

     // Vertical pass, then add 0.5 and clamp to the 8-bit range before truncating.
     float blended = cubicInterpolate(row0, row1, row2, row3, yf);
     blended = clamp(blended + 0.5f, 0.f, 255.f);
     return static_cast<uchar>(blended);
 }

 // External resize routines with C linkage. They are only declared here; their definitions
 // are not part of this file. Within this file they are called solely from the
 // ARCH_ARM_USE_INTRINSICS branches of kernelU4 / kernelU2 / kernelU1.
 // NOTE(review): the parameter descriptions below are derived from those call sites only;
 // confirm against the actual implementation.

 // Called with the horizontal step in 16.16 fixed point (mScaleX * 0x10000). The 64-bit
 // result is passed unchanged as the osc_ctl argument of the rsdIntrinsicResizeB*_K routines.
 extern "C" uint64_t rsdIntrinsicResize_oscctl_K(uint32_t xinc);

 // Resizes a run of 4-channel 8-bit cells. As called from kernelU4:
 //   dst            - first output cell to write
 //   count          - number of output cells (xend - xstart)
 //   xf             - low 16 bits of the 16.16 fixed-point source x of the first output cell
 //   xinc           - horizontal step in 16.16 fixed point
 //   srcn/src0/1/2  - the four source rows, each already advanced by the clipped x offset
 //   xclip          - how far the initial x offset had to be moved to reach column 0
 //   avail          - srcWidth - xoff + xclip, computed after xoff has been clipped
 //   osc_ctl        - value returned by rsdIntrinsicResize_oscctl_K(xinc)
 //   yr             - four vertical coefficients produced by mkYCoeff()
 extern "C" void rsdIntrinsicResizeB4_K(
             uchar4 *dst,
             size_t count,
             uint32_t xf,
             uint32_t xinc,
             uchar4 const *srcn,
             uchar4 const *src0,
             uchar4 const *src1,
             uchar4 const *src2,
             size_t xclip,
             size_t avail,
             uint64_t osc_ctl,
             int32_t const *yr);

 // Same parameter list as rsdIntrinsicResizeB4_K, for 2-channel 8-bit cells (kernelU2).
 extern "C" void rsdIntrinsicResizeB2_K(
             uchar2 *dst,
             size_t count,
             uint32_t xf,
             uint32_t xinc,
             uchar2 const *srcn,
             uchar2 const *src0,
             uchar2 const *src1,
             uchar2 const *src2,
             size_t xclip,
             size_t avail,
             uint64_t osc_ctl,
             int32_t const *yr);

 // Same parameter list as rsdIntrinsicResizeB4_K, for single-channel 8-bit cells (kernelU1).
 extern "C" void rsdIntrinsicResizeB1_K(
             uchar *dst,
             size_t count,
             uint32_t xf,
             uint32_t xinc,
             uchar const *srcn,
             uchar const *src0,
             uchar const *src1,
             uchar const *src2,
             size_t xclip,
             size_t avail,
             uint64_t osc_ctl,
             int32_t const *yr);

 #if defined(ARCH_ARM_USE_INTRINSICS)
 // Fills yr[0..3] with the four vertical blend coefficients for fraction yf, expressed in
 // 16.16 fixed point (0x10000 == 1.0). The result is handed to the rsdIntrinsicResizeB*_K
 // routines.
 static void mkYCoeff(int32_t *yr, float yf) {
     constexpr float kFixedOne = 0x10000;

     // yf, yf^2 and yf^3 in 16.16 fixed point.
     const int32_t t1 = rint(yf * kFixedOne);
     const int32_t t2 = rint(yf * yf * kFixedOne);
     const int32_t t3 = rint(yf * yf * yf * kFixedOne);

     // Each polynomial is evaluated at twice its weight and halved by the final shift.
     const int32_t twice0 = t3 + t1 - 2 * t2;
     const int32_t twice1 = 3 * t3 - 5 * t2 + 0x20000;
     const int32_t twice2 = -3 * t3 + 4 * t2 + t1;
     const int32_t twice3 = t2 - t3;

     yr[0] = twice0 >> 1;
     yr[1] = twice1 >> 1;
     yr[2] = twice2 >> 1;
     yr[3] = twice3 >> 1;
 }
 #endif

 #ifdef ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT
 // Computes one 4-channel float output cell.
 // yp0..yp3 are the four source rows to blend, xf is the source-space x coordinate,
 // yf is the vertical blend fraction and width is the number of cells in a source row.
 // Unlike the 8-bit variants, the result is neither rounded nor clamped.
 static float4 OneBiCubic(const float4 *yp0, const float4 *yp1, const float4 *yp2, const float4 *yp3,
                          float xf, float yf, int width) {
     // Leftmost of the four columns sampled, and the fractional position between the middle two.
     const int firstTap = static_cast<int>(floor(xf - 1));
     const float xFrac = xf - floor(xf);

     // Keep the sampled columns inside the row.
     const int lastColumn = width - 1;
     const int col0 = std::max(0, firstTap + 0);
     const int col1 = std::max(0, firstTap + 1);
     const int col2 = std::min(lastColumn, firstTap + 2);
     const int col3 = std::min(lastColumn, firstTap + 3);

     // Horizontal pass over a single source row.
     const auto blendRow = [=](const float4 *row) {
         return cubicInterpolate(row[col0], row[col1], row[col2], row[col3], xFrac);
     };
     const float4 row0 = blendRow(yp0);
     const float4 row1 = blendRow(yp1);
     const float4 row2 = blendRow(yp2);
     const float4 row3 = blendRow(yp3);

     // Vertical pass.
     return cubicInterpolate(row0, row1, row2, row3, yf);
 }

 // Computes one 2-channel float output cell.
 // yp0..yp3 are the four source rows to blend, xf is the source-space x coordinate,
 // yf is the vertical blend fraction and width is the number of cells in a source row.
 // Unlike the 8-bit variants, the result is neither rounded nor clamped.
 static float2 OneBiCubic(const float2 *yp0, const float2 *yp1, const float2 *yp2, const float2 *yp3,
                          float xf, float yf, int width) {
     // Leftmost of the four columns sampled, and the fractional position between the middle two.
     const int firstTap = static_cast<int>(floor(xf - 1));
     const float xFrac = xf - floor(xf);

     // Keep the sampled columns inside the row.
     const int lastColumn = width - 1;
     const int col0 = std::max(0, firstTap + 0);
     const int col1 = std::max(0, firstTap + 1);
     const int col2 = std::min(lastColumn, firstTap + 2);
     const int col3 = std::min(lastColumn, firstTap + 3);

     // Horizontal pass over a single source row.
     const auto blendRow = [=](const float2 *row) {
         return cubicInterpolate(row[col0], row[col1], row[col2], row[col3], xFrac);
     };
     const float2 row0 = blendRow(yp0);
     const float2 row1 = blendRow(yp1);
     const float2 row2 = blendRow(yp2);
     const float2 row3 = blendRow(yp3);

     // Vertical pass.
     return cubicInterpolate(row0, row1, row2, row3, yf);
 }

 // Computes one single-channel float output cell.
 // yp0..yp3 are the four source rows to blend, xf is the source-space x coordinate,
 // yf is the vertical blend fraction and width is the number of cells in a source row.
 // Unlike the 8-bit variants, the result is neither rounded nor clamped.
 static float OneBiCubic(const float *yp0, const float *yp1, const float *yp2, const float *yp3,
                         float xf, float yf, int width) {
     // Leftmost of the four columns sampled, and the fractional position between the middle two.
     const int firstTap = static_cast<int>(floor(xf - 1));
     const float xFrac = xf - floor(xf);

     // Keep the sampled columns inside the row.
     const int lastColumn = width - 1;
     const int col0 = std::max(0, firstTap + 0);
     const int col1 = std::max(0, firstTap + 1);
     const int col2 = std::min(lastColumn, firstTap + 2);
     const int col3 = std::min(lastColumn, firstTap + 3);

     // Horizontal pass over a single source row.
     const auto blendRow = [=](const float *row) {
         return cubicInterpolate(row[col0], row[col1], row[col2], row[col3], xFrac);
     };
     const float row0 = blendRow(yp0);
     const float row1 = blendRow(yp1);
     const float row2 = blendRow(yp2);
     const float row3 = blendRow(yp3);

     // Vertical pass.
     return cubicInterpolate(row0, row1, row2, row3, yf);
 }
 #endif

 /**
  * Writes the cells [xstart, xend) of output row currentY for 4-channel 8-bit data
  * (also used for 3-channel data, see processData).
  *
  * @param outPtr   Destination of the cell at column xstart; later cells follow contiguously.
  * @param xstart   First output column (inclusive).
  * @param xend     Last output column (exclusive).
  * @param currentY Output row being produced.
  */
 void ResizeTask::kernelU4(uchar *outPtr, uint32_t xstart, uint32_t xend, uint32_t currentY) {
     const uchar *pin = mIn;
     const int srcHeight = mInputSizeY;
     const int srcWidth = mInputSizeX;
     // Bytes per source row.
     const size_t stride = mInputSizeX * paddedSize(mVectorSize);

     // Source-space y coordinate of the centre of the output row.
     // The AVX2 branch previously named a non-existent `scaleY`; the member is mScaleY.
 #if defined(ARCH_X86_HAVE_AVX2)
     float yf = _mm_cvtss_f32(_mm_fmsub_ss(_mm_set1_ps(currentY + 0.5f),
                                           _mm_set1_ps(mScaleY), _mm_set1_ps(0.5f)));
 #else
     float yf = (currentY + 0.5f) * mScaleY - 0.5f;
 #endif

     // Pick the four source rows around yf, kept inside the image, and reduce yf to the
     // fractional position between the two middle rows.
     int starty = (int) floor(yf - 1);
     yf = yf - floor(yf);
     int maxy = srcHeight - 1;
     int ys0 = std::max(0, starty + 0);
     // Capped on both sides, matching kernelU1, so the row index can never pass the last row.
     int ys1 = std::min(maxy, std::max(0, starty + 1));
     int ys2 = std::min(maxy, starty + 2);
     int ys3 = std::min(maxy, starty + 3);

     const uchar4 *yp0 = (const uchar4 *)(pin + stride * ys0);
     const uchar4 *yp1 = (const uchar4 *)(pin + stride * ys1);
     const uchar4 *yp2 = (const uchar4 *)(pin + stride * ys2);
     const uchar4 *yp3 = (const uchar4 *)(pin + stride * ys3);

     uchar4 *out = ((uchar4 *)outPtr);
     uint32_t x1 = xstart;
     uint32_t x2 = xend;

 #if defined(ARCH_ARM_USE_INTRINSICS)
     // Hand the whole span to the external routine when SIMD is enabled and the horizontal
     // scale is below 4; the scalar loop below then has nothing left to do.
     if (mUsesSimd && x2 > x1 && mScaleX < 4.0f) {
         float xf = (x1 + 0.5f) * mScaleX - 0.5f;
         // Start position and step in 16.16 fixed point.
         long xf16 = rint(xf * 0x10000);
         uint32_t xinc16 = rint(mScaleX * 0x10000);

         int xoff = (xf16 >> 16) - 1;
         // Amount by which xoff must be shifted to reach column 0 (0 if already inside).
         int xclip = std::max(0, xoff) - xoff;
         int len = x2 - x1;

         int32_t yr[4];
         uint64_t osc_ctl = rsdIntrinsicResize_oscctl_K(xinc16);
         mkYCoeff(yr, yf);

         xoff += xclip;

         rsdIntrinsicResizeB4_K(
                 out, len,
                 xf16 & 0xffff, xinc16,
                 yp0 + xoff, yp1 + xoff, yp2 + xoff, yp3 + xoff,
                 xclip, srcWidth - xoff + xclip,
                 osc_ctl, yr);
         out += len;
         x1 += len;
     }
 #endif

     // Scalar path: one output cell per iteration.
     while(x1 < x2) {
         // Source-space x coordinate of the centre of the output cell.
         // The AVX2 branch previously named a non-existent `scaleX`; the member is mScaleX.
 #if defined(ARCH_X86_HAVE_AVX2)
         float xf = _mm_cvtss_f32(_mm_fmsub_ss(_mm_set1_ps(x1 + 0.5f) , _mm_set1_ps(mScaleX) ,
                                               _mm_set1_ps(0.5f)));
 #else
         float xf = (x1 + 0.5f) * mScaleX - 0.5f;
 #endif
         *out = OneBiCubic(yp0, yp1, yp2, yp3, xf, yf, srcWidth);
         out++;
         x1++;
     }
 }

 /**
  * Writes the cells [xstart, xend) of output row currentY for 2-channel 8-bit data.
  *
  * @param outPtr   Destination of the cell at column xstart; later cells follow contiguously.
  * @param xstart   First output column (inclusive).
  * @param xend     Last output column (exclusive).
  * @param currentY Output row being produced.
  */
 void ResizeTask::kernelU2(uchar* outPtr, uint32_t xstart, uint32_t xend, uint32_t currentY) {
     const uchar *pin = mIn;
     const int srcHeight = mInputSizeY;
     const int srcWidth = mInputSizeX;
     // Bytes per source row.
     const size_t stride = mInputSizeX * mVectorSize;

     // Source-space y coordinate of the centre of the output row.
     // The AVX2 branch previously named a non-existent `scaleY`; the member is mScaleY.
 #if defined(ARCH_X86_HAVE_AVX2)
     float yf = _mm_cvtss_f32(
             _mm_fmsub_ss(_mm_set1_ps(currentY + 0.5f), _mm_set1_ps(mScaleY), _mm_set1_ps(0.5f)));
 #else
     float yf = (currentY + 0.5f) * mScaleY - 0.5f;
 #endif

     // Pick the four source rows around yf, kept inside the image, and reduce yf to the
     // fractional position between the two middle rows.
     int starty = (int) floor(yf - 1);
     yf = yf - floor(yf);
     int maxy = srcHeight - 1;
     int ys0 = std::max(0, starty + 0);
     // Capped on both sides, matching kernelU1, so the row index can never pass the last row.
     int ys1 = std::min(maxy, std::max(0, starty + 1));
     int ys2 = std::min(maxy, starty + 2);
     int ys3 = std::min(maxy, starty + 3);

     const uchar2 *yp0 = (const uchar2 *)(pin + stride * ys0);
     const uchar2 *yp1 = (const uchar2 *)(pin + stride * ys1);
     const uchar2 *yp2 = (const uchar2 *)(pin + stride * ys2);
     const uchar2 *yp3 = (const uchar2 *)(pin + stride * ys3);

     uchar2 *out = ((uchar2 *)outPtr);
     uint32_t x1 = xstart;
     uint32_t x2 = xend;

 #if defined(ARCH_ARM_USE_INTRINSICS)
     // Hand the whole span to the external routine when SIMD is enabled and the horizontal
     // scale is below 4; the scalar loop below then has nothing left to do.
     if (mUsesSimd && x2 > x1 && mScaleX < 4.0f) {
         float xf = (x1 + 0.5f) * mScaleX - 0.5f;
         // Start position and step in 16.16 fixed point.
         long xf16 = rint(xf * 0x10000);
         uint32_t xinc16 = rint(mScaleX * 0x10000);

         int xoff = (xf16 >> 16) - 1;
         // Amount by which xoff must be shifted to reach column 0 (0 if already inside).
         int xclip = std::max(0, xoff) - xoff;
         int len = x2 - x1;

         int32_t yr[4];
         uint64_t osc_ctl = rsdIntrinsicResize_oscctl_K(xinc16);
         mkYCoeff(yr, yf);

         xoff += xclip;

         rsdIntrinsicResizeB2_K(
                 out, len,
                 xf16 & 0xffff, xinc16,
                 yp0 + xoff, yp1 + xoff, yp2 + xoff, yp3 + xoff,
                 xclip, srcWidth - xoff + xclip,
                 osc_ctl, yr);
         out += len;
         x1 += len;
     }
 #endif

     // Scalar path: one output cell per iteration.
     while(x1 < x2) {

         // Source-space x coordinate of the centre of the output cell.
         // The AVX2 branch previously named a non-existent `scaleX`; the member is mScaleX.
 #if defined(ARCH_X86_HAVE_AVX2)
         float xf = _mm_cvtss_f32(_mm_fmsub_ss(_mm_set1_ps(x1 + 0.5f) , _mm_set1_ps(mScaleX) ,
                                               _mm_set1_ps(0.5f)));
 #else
         float xf = (x1 + 0.5f) * mScaleX - 0.5f;
 #endif
         *out = OneBiCubic(yp0, yp1, yp2, yp3, xf, yf, srcWidth);
         out++;
         x1++;
     }
 }

 /**
  * Writes the cells [xstart, xend) of output row currentY for single-channel 8-bit data.
  *
  * @param outPtr   Destination of the cell at column xstart; later cells follow contiguously.
  * @param xstart   First output column (inclusive).
  * @param xend     Last output column (exclusive).
  * @param currentY Output row being produced.
  */
 void ResizeTask::kernelU1(uchar* outPtr, uint32_t xstart, uint32_t xend, uint32_t currentY) {
     //ALOGI("TK kernelU1 xstart %u, xend %u, outstep %u", xstart, xend);
     const uchar *pin = mIn;
     const int srcHeight = mInputSizeY;
     const int srcWidth = mInputSizeX;
     // Bytes per source row.
     const size_t stride = mInputSizeX * mVectorSize;

     // ALOGI("Toolkit   ResizeU1 (%ux%u) by (%f,%f), xstart:%u to %u, stride %zu, out %p", srcWidth,
     // srcHeight, scaleX, scaleY, xstart, xend, stride, outPtr);

     // Source-space y coordinate of the centre of the output row.
     // The AVX2 branch previously named a non-existent `scaleY`; the member is mScaleY.
 #if defined(ARCH_X86_HAVE_AVX2)
     float yf = _mm_cvtss_f32(
             _mm_fmsub_ss(_mm_set1_ps(currentY + 0.5f), _mm_set1_ps(mScaleY), _mm_set1_ps(0.5f)));
 #else
     float yf = (currentY + 0.5f) * mScaleY - 0.5f;
 #endif

     // Pick the four source rows around yf, kept inside the image, and reduce yf to the
     // fractional position between the two middle rows.
     int starty = (int) floor(yf - 1);
     yf = yf - floor(yf);
     int maxy = srcHeight - 1;
     int ys0 = std::max(0, starty + 0);
     int ys1 = std::min(maxy, std::max(0, starty + 1));
     int ys2 = std::min(maxy, starty + 2);
     int ys3 = std::min(maxy, starty + 3);

     const uchar *yp0 = pin + stride * ys0;
     const uchar *yp1 = pin + stride * ys1;
     const uchar *yp2 = pin + stride * ys2;
     const uchar *yp3 = pin + stride * ys3;

     uchar *out = ((uchar *)outPtr);
     uint32_t x1 = xstart;
     uint32_t x2 = xend;

 #if defined(ARCH_ARM_USE_INTRINSICS)
     // Hand the whole span to the external routine when SIMD is enabled and the horizontal
     // scale is below 4; the scalar loop below then has nothing left to do.
     if (mUsesSimd && x2 > x1 && mScaleX < 4.0f) {
         float xf = (x1 + 0.5f) * mScaleX - 0.5f;
         // Start position and step in 16.16 fixed point.
         long xf16 = rint(xf * 0x10000);
         uint32_t xinc16 = rint(mScaleX * 0x10000);

         int xoff = (xf16 >> 16) - 1;
         // Amount by which xoff must be shifted to reach column 0 (0 if already inside).
         int xclip = std::max(0, xoff) - xoff;
         int len = x2 - x1;

         int32_t yr[4];
         uint64_t osc_ctl = rsdIntrinsicResize_oscctl_K(xinc16);
         mkYCoeff(yr, yf);

         // ALOGI("ys0 %d, ys1 %d, ys2 %d, ys3 %d, x1 %u, x2 %u, xf %f, xf16 %ld, xinc16 %u, xoff %d,
         // xclip %d, len %d, osc_ctl %lu)",
         //       ys0, ys1, ys2, ys3, x1, x2, xf, xf16, xinc16, xoff, xclip, len, (unsigned long)
         //       osc_ctl);
         // ALOGI("TK scaleX %f, xf %f, xf16 %ld, xinc16 %d, xoff %d, xclip %d, len %d", scaleX, xf,
         // xf16, xinc16, xoff, xclip, len); ALOGI("TK xf16 & 0xffff %ld, ys0 %u, ys1 %u, ys2 %u, ys3
         // %u, srcWidth - xoff + xclip %d", xf16 & 0xffff, ys0, ys1, ys2, ys3, srcWidth - xoff);

         xoff += xclip;

         rsdIntrinsicResizeB1_K(
                 out, len,
                 xf16 & 0xffff, xinc16,
                 yp0 + xoff, yp1 + xoff, yp2 + xoff, yp3 + xoff,
                 xclip, srcWidth - xoff + xclip,
                 osc_ctl, yr);
         out += len;
         x1 += len;
     }
 #endif

     // Scalar path: one output cell per iteration.
     while(x1 < x2) {

         // Source-space x coordinate of the centre of the output cell.
         // The AVX2 branch previously named a non-existent `scaleX`; the member is mScaleX.
 #if defined(ARCH_X86_HAVE_AVX2)
         float xf = _mm_cvtss_f32(_mm_fmsub_ss(_mm_set1_ps(x1 + 0.5f) , _mm_set1_ps(mScaleX) ,
                                               _mm_set1_ps(0.5f)));
 #else
         float xf = (x1 + 0.5f) * mScaleX - 0.5f;
 #endif

         *out = OneBiCubic(yp0, yp1, yp2, yp3, xf, yf, srcWidth);
         out++;
         x1++;
     }
 }

 #ifdef ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT
 /**
  * Writes the cells [xstart, xend) of output row currentY for 4-channel float data.
  *
  * This function previously referred to inputSizeX, inputSizeY, sizeX, vectorSize, scaleX
  * and scaleY, none of which exist; it now uses the corresponding ResizeTask members.
  *
  * @param outPtr   Destination of the cell at column xstart; later cells follow contiguously.
  * @param xstart   First output column (inclusive).
  * @param xend     Last output column (exclusive).
  * @param currentY Output row being produced.
  */
 void ResizeTask::kernelF4(uchar* outPtr, uint32_t xstart, uint32_t xend, uint32_t currentY) {
     const uchar *pin = mIn;
     const int srcHeight = mInputSizeY;
     const int srcWidth = mInputSizeX;
     // Bytes per source row: each row is indexed below as srcWidth float4 cells.
     // NOTE(review): assumes tightly packed rows, as the 8-bit kernels do - confirm.
     const size_t stride = mInputSizeX * sizeof(float4);

     // Source-space y coordinate of the centre of the output row.
 #if defined(ARCH_X86_HAVE_AVX2)
     float yf = _mm_cvtss_f32(
             _mm_fmsub_ss(_mm_set1_ps(currentY + 0.5f), _mm_set1_ps(mScaleY), _mm_set1_ps(0.5f)));
 #else
     float yf = (currentY + 0.5f) * mScaleY - 0.5f;
 #endif

     // Pick the four source rows around yf, kept inside the image, and reduce yf to the
     // fractional position between the two middle rows.
     int starty = (int) floor(yf - 1);
     yf = yf - floor(yf);
     int maxy = srcHeight - 1;
     int ys0 = std::max(0, starty + 0);
     // Capped on both sides, matching kernelU1, so the row index can never pass the last row.
     int ys1 = std::min(maxy, std::max(0, starty + 1));
     int ys2 = std::min(maxy, starty + 2);
     int ys3 = std::min(maxy, starty + 3);

     const float4 *yp0 = (const float4 *)(pin + stride * ys0);
     const float4 *yp1 = (const float4 *)(pin + stride * ys1);
     const float4 *yp2 = (const float4 *)(pin + stride * ys2);
     const float4 *yp3 = (const float4 *)(pin + stride * ys3);

     float4 *out = ((float4 *)outPtr);
     uint32_t x1 = xstart;
     uint32_t x2 = xend;

     // One output cell per iteration.
     while(x1 < x2) {

         // Source-space x coordinate of the centre of the output cell.
 #if defined(ARCH_X86_HAVE_AVX2)
         float xf = _mm_cvtss_f32(_mm_fmsub_ss(_mm_set1_ps(x1 + 0.5f) , _mm_set1_ps(mScaleX) ,
                                               _mm_set1_ps(0.5f)));
 #else
         float xf = (x1 + 0.5f) * mScaleX - 0.5f;
 #endif

         *out = OneBiCubic(yp0, yp1, yp2, yp3, xf, yf, srcWidth);
         out++;
         x1++;
     }
 }

 /**
  * Writes the cells [xstart, xend) of output row currentY for 2-channel float data.
  *
  * This function previously referred to inputSizeX, inputSizeY, sizeX, vectorSize, scaleX
  * and scaleY, none of which exist; it now uses the corresponding ResizeTask members.
  *
  * @param outPtr   Destination of the cell at column xstart; later cells follow contiguously.
  * @param xstart   First output column (inclusive).
  * @param xend     Last output column (exclusive).
  * @param currentY Output row being produced.
  */
 void ResizeTask::kernelF2(uchar* outPtr, uint32_t xstart, uint32_t xend, uint32_t currentY) {
     const uchar *pin = mIn;
     const int srcHeight = mInputSizeY;
     const int srcWidth = mInputSizeX;
     // Bytes per source row: each row is indexed below as srcWidth float2 cells.
     // NOTE(review): assumes tightly packed rows, as the 8-bit kernels do - confirm.
     const size_t stride = mInputSizeX * sizeof(float2);

     // Source-space y coordinate of the centre of the output row.
 #if defined(ARCH_X86_HAVE_AVX2)
     float yf = _mm_cvtss_f32(_mm_fmsub_ss(_mm_set1_ps(currentY + 0.5f),
                                           _mm_set1_ps(mScaleY), _mm_set1_ps(0.5f)));
 #else
     float yf = (currentY + 0.5f) * mScaleY - 0.5f;
 #endif

     // Pick the four source rows around yf, kept inside the image, and reduce yf to the
     // fractional position between the two middle rows.
     int starty = (int) floor(yf - 1);
     yf = yf - floor(yf);
     int maxy = srcHeight - 1;
     int ys0 = std::max(0, starty + 0);
     // Capped on both sides, matching kernelU1, so the row index can never pass the last row.
     int ys1 = std::min(maxy, std::max(0, starty + 1));
     int ys2 = std::min(maxy, starty + 2);
     int ys3 = std::min(maxy, starty + 3);

     const float2 *yp0 = (const float2 *)(pin + stride * ys0);
     const float2 *yp1 = (const float2 *)(pin + stride * ys1);
     const float2 *yp2 = (const float2 *)(pin + stride * ys2);
     const float2 *yp3 = (const float2 *)(pin + stride * ys3);

     float2 *out = ((float2 *)outPtr);
     uint32_t x1 = xstart;
     uint32_t x2 = xend;

     // One output cell per iteration.
     while(x1 < x2) {

         // Source-space x coordinate of the centre of the output cell.
 #if defined(ARCH_X86_HAVE_AVX2)
         float xf = _mm_cvtss_f32(_mm_fmsub_ss(_mm_set1_ps(x1 + 0.5f) , _mm_set1_ps(mScaleX) ,
                                               _mm_set1_ps(0.5f)));
 #else
         float xf = (x1 + 0.5f) * mScaleX - 0.5f;
 #endif

         *out = OneBiCubic(yp0, yp1, yp2, yp3, xf, yf, srcWidth);
         out++;
         x1++;
     }
 }

 /**
  * Writes the cells [xstart, xend) of output row currentY for single-channel float data.
  *
  * This function previously referred to inputSizeX, inputSizeY, sizeX, vectorSize, scaleX
  * and scaleY, none of which exist; it now uses the corresponding ResizeTask members.
  *
  * @param outPtr   Destination of the cell at column xstart; later cells follow contiguously.
  * @param xstart   First output column (inclusive).
  * @param xend     Last output column (exclusive).
  * @param currentY Output row being produced.
  */
 void ResizeTask::kernelF1(uchar* outPtr, uint32_t xstart, uint32_t xend, uint32_t currentY) {
     const uchar *pin = mIn;
     const int srcHeight = mInputSizeY;
     const int srcWidth = mInputSizeX;
     // Bytes per source row: each row is indexed below as srcWidth float cells.
     // NOTE(review): assumes tightly packed rows, as the 8-bit kernels do - confirm.
     const size_t stride = mInputSizeX * sizeof(float);

     // Source-space y coordinate of the centre of the output row.
 #if defined(ARCH_X86_HAVE_AVX2)
     float yf = _mm_cvtss_f32(_mm_fmsub_ss(_mm_set1_ps(currentY + 0.5f),
                                           _mm_set1_ps(mScaleY), _mm_set1_ps(0.5f)));
 #else
     float yf = (currentY + 0.5f) * mScaleY - 0.5f;
 #endif

     // Pick the four source rows around yf, kept inside the image, and reduce yf to the
     // fractional position between the two middle rows.
     int starty = (int) floor(yf - 1);
     yf = yf - floor(yf);
     int maxy = srcHeight - 1;
     int ys0 = std::max(0, starty + 0);
     // Capped on both sides, matching kernelU1, so the row index can never pass the last row.
     int ys1 = std::min(maxy, std::max(0, starty + 1));
     int ys2 = std::min(maxy, starty + 2);
     int ys3 = std::min(maxy, starty + 3);

     const float *yp0 = (const float *)(pin + stride * ys0);
     const float *yp1 = (const float *)(pin + stride * ys1);
     const float *yp2 = (const float *)(pin + stride * ys2);
     const float *yp3 = (const float *)(pin + stride * ys3);

     float *out = ((float *)outPtr);
     uint32_t x1 = xstart;
     uint32_t x2 = xend;

     // One output cell per iteration.
     while(x1 < x2) {

         // Source-space x coordinate of the centre of the output cell.
 #if defined(ARCH_X86_HAVE_AVX2)
         float xf = _mm_cvtss_f32(_mm_fmsub_ss(_mm_set1_ps(x1 + 0.5f) , _mm_set1_ps(mScaleX) ,
                                               _mm_set1_ps(0.5f)));
 #else
         float xf = (x1 + 0.5f) * mScaleX - 0.5f;
 #endif

         *out = OneBiCubic(yp0, yp1, yp2, yp3, xf, yf, srcWidth);
         out++;
         x1++;
     }
 }

 // Selects the row kernel from the element type and vector size of mAlloc and stores it in
 // mRootPtr: unsigned 8-bit elements get a kernelU* function, anything else a kernelF*
 // function; vector sizes 3 and 4 share the 4-channel kernel.
 //
 // NOTE(review): this function is only compiled when ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT
 // is defined, and it does not match the rest of this file: preLaunch is not declared in
 // the ResizeTask class above, and mAlloc, mRootPtr, RsScriptCall and RS_TYPE_UNSIGNED_8 are
 // not declared anywhere in this file. It looks like a leftover from an earlier
 // implementation - confirm whether it should be removed or ported.
 // NOTE(review): `&kernelU1` etc. is not the qualified form (`&ResizeTask::kernelU1`) that
 // processData uses to take a member-function pointer - confirm before enabling.
 // NOTE(review): neither switch has a default case, so mRootPtr is left untouched for any
 // vector size outside 1..4. Parameters slot and sc are not used.
 void ResizeTask::preLaunch(uint32_t slot, const RsScriptCall *sc)
 {

     // Check the data type to choose between the float (F) and unsigned 8-bit (U) kernels.
     if (mAlloc->getType()->getElement()->getType() == RS_TYPE_UNSIGNED_8) {
         switch(mAlloc->getType()->getElement()->getVectorSize()) {
         case 1:
             mRootPtr = &kernelU1;
             break;
         case 2:
             mRootPtr = &kernelU2;
             break;
         case 3:
         case 4:
             mRootPtr = &kernelU4;
             break;
         }
     } else {
         switch(mAlloc->getType()->getElement()->getVectorSize()) {
         case 1:
             mRootPtr = &kernelF1;
             break;
         case 2:
             mRootPtr = &kernelF2;
             break;
         case 3:
         case 4:
             mRootPtr = &kernelF4;
             break;
         }
     }
 }
 #endif  // ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT

 void RenderScriptToolkit::resize(const uint8_t* input, uint8_t* output, size_t inputSizeX,
                                  size_t inputSizeY, size_t vectorSize, size_t outputSizeX,
                                  size_t outputSizeY, const Restriction* restriction) {
 #ifdef ANDROID_RENDERSCRIPT_TOOLKIT_VALIDATE
     if (!validRestriction(LOG_TAG, outputSizeX, outputSizeY, restriction)) {
         return;
     }
     if (vectorSize < 1 || vectorSize > 4) {
         ALOGE("The vectorSize should be between 1 and 4. %zu provided.", vectorSize);
         return;
     }
 #endif

     ResizeTask task((const uchar*)input, (uchar*)output, inputSizeX, inputSizeY, vectorSize,
                     outputSizeX, outputSizeY, restriction);
     processor->doTask(&task);
 }

 }  // namespace renderscript
 }  // namespace android