| /* |
| * Copyright (C) 2012 The Android Open Source Project |
| * |
| * Licensed under the Apache License, Version 2.0 (the "License"); |
| * you may not use this file except in compliance with the License. |
| * You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| #include <assert.h> |
| |
| #include <cstdint> |
| |
| #include "RenderScriptToolkit.h" |
| #include "TaskProcessor.h" |
| #include "Utils.h" |
| |
| namespace android { |
| namespace renderscript { |
| |
| #define LOG_TAG "renderscript.toolkit.Blend" |
| |
| /** |
| * Blends a source into a destination, based on the mode. |
| */ |
| class BlendTask : public Task { |
| // The type of blending to do. |
| RenderScriptToolkit::BlendingMode mMode; |
| // The input we're blending. |
| const uchar4* mIn; |
| // The destination, used both for input and output. |
| uchar4* mOut; |
| |
| void blend(RenderScriptToolkit::BlendingMode mode, const uchar4* in, uchar4* out, |
| uint32_t length); |
| // Process a 2D tile of the overall work. threadIndex identifies which thread does the work. |
| virtual void processData(int threadIndex, size_t startX, size_t startY, size_t endX, |
| size_t endY) override; |
| |
| public: |
| BlendTask(RenderScriptToolkit::BlendingMode mode, const uint8_t* in, uint8_t* out, size_t sizeX, |
| size_t sizeY, const Restriction* restriction) |
| : Task{sizeX, sizeY, 4, true, restriction}, |
| mMode{mode}, |
| mIn{reinterpret_cast<const uchar4*>(in)}, |
| mOut{reinterpret_cast<uchar4*>(out)} {} |
| }; |
| |
| #if defined(ARCH_ARM_USE_INTRINSICS) |
| extern "C" int rsdIntrinsicBlend_K(uchar4 *out, uchar4 const *in, int slot, |
| uint32_t xstart, uint32_t xend); |
| #endif |
| |
| #if defined(ARCH_X86_HAVE_SSSE3) |
| extern void rsdIntrinsicBlendSrcOver_K(void *dst, const void *src, uint32_t count8); |
| extern void rsdIntrinsicBlendDstOver_K(void *dst, const void *src, uint32_t count8); |
| extern void rsdIntrinsicBlendSrcIn_K(void *dst, const void *src, uint32_t count8); |
| extern void rsdIntrinsicBlendDstIn_K(void *dst, const void *src, uint32_t count8); |
| extern void rsdIntrinsicBlendSrcOut_K(void *dst, const void *src, uint32_t count8); |
| extern void rsdIntrinsicBlendDstOut_K(void *dst, const void *src, uint32_t count8); |
| extern void rsdIntrinsicBlendSrcAtop_K(void *dst, const void *src, uint32_t count8); |
| extern void rsdIntrinsicBlendDstAtop_K(void *dst, const void *src, uint32_t count8); |
| extern void rsdIntrinsicBlendXor_K(void *dst, const void *src, uint32_t count8); |
| extern void rsdIntrinsicBlendMultiply_K(void *dst, const void *src, uint32_t count8); |
| extern void rsdIntrinsicBlendAdd_K(void *dst, const void *src, uint32_t count8); |
| extern void rsdIntrinsicBlendSub_K(void *dst, const void *src, uint32_t count8); |
| #endif |
| |
| // Convert vector to uchar4, clipping each value to 255. |
| template <typename TI> |
| static inline uchar4 convertClipped(TI amount) { |
| return uchar4 { static_cast<uchar>(amount.x > 255 ? 255 : amount.x), |
| static_cast<uchar>(amount.y > 255 ? 255 : amount.y), |
| static_cast<uchar>(amount.z > 255 ? 255 : amount.z), |
| static_cast<uchar>(amount.w > 255 ? 255 : amount.w)}; |
| } |
| |
| void BlendTask::blend(RenderScriptToolkit::BlendingMode mode, const uchar4* in, uchar4* out, |
| uint32_t length) { |
| uint32_t x1 = 0; |
| uint32_t x2 = length; |
| |
| #if defined(ARCH_ARM_USE_INTRINSICS) |
| if (mUsesSimd) { |
| if (rsdIntrinsicBlend_K(out, in, (int) mode, x1, x2) >= 0) { |
| return; |
| } else { |
| ALOGW("Intrinsic Blend failed to use SIMD for %d", mode); |
| } |
| } |
| #endif |
| switch (mode) { |
| case RenderScriptToolkit::BlendingMode::CLEAR: |
| for (;x1 < x2; x1++, out++) { |
| *out = 0; |
| } |
| break; |
| case RenderScriptToolkit::BlendingMode::SRC: |
| for (;x1 < x2; x1++, out++, in++) { |
| *out = *in; |
| } |
| break; |
| //RenderScriptToolkit::BlendingMode::DST is a NOP |
| case RenderScriptToolkit::BlendingMode::DST: |
| break; |
| case RenderScriptToolkit::BlendingMode::SRC_OVER: |
| #if defined(ARCH_X86_HAVE_SSSE3) |
| if (mUsesSimd) { |
| if ((x1 + 8) < x2) { |
| uint32_t len = (x2 - x1) >> 3; |
| rsdIntrinsicBlendSrcOver_K(out, in, len); |
| x1 += len << 3; |
| out += len << 3; |
| in += len << 3; |
| } |
| } |
| #endif |
| for (;x1 < x2; x1++, out++, in++) { |
| ushort4 in_s = convert<ushort4>(*in); |
| ushort4 out_s = convert<ushort4>(*out); |
| in_s = in_s + ((out_s * (ushort4)(255 - in_s.w)) >> (ushort4)8); |
| *out = convertClipped(in_s); |
| } |
| break; |
| case RenderScriptToolkit::BlendingMode::DST_OVER: |
| #if defined(ARCH_X86_HAVE_SSSE3) |
| if (mUsesSimd) { |
| if ((x1 + 8) < x2) { |
| uint32_t len = (x2 - x1) >> 3; |
| rsdIntrinsicBlendDstOver_K(out, in, len); |
| x1 += len << 3; |
| out += len << 3; |
| in += len << 3; |
| } |
| } |
| #endif |
| for (;x1 < x2; x1++, out++, in++) { |
| ushort4 in_s = convert<ushort4>(*in); |
| ushort4 out_s = convert<ushort4>(*out); |
| in_s = out_s + ((in_s * (ushort4)(255 - out_s.w)) >> (ushort4)8); |
| *out = convertClipped(in_s); |
| } |
| break; |
| case RenderScriptToolkit::BlendingMode::SRC_IN: |
| #if defined(ARCH_X86_HAVE_SSSE3) |
| if (mUsesSimd) { |
| if ((x1 + 8) < x2) { |
| uint32_t len = (x2 - x1) >> 3; |
| rsdIntrinsicBlendSrcIn_K(out, in, len); |
| x1 += len << 3; |
| out += len << 3; |
| in += len << 3; |
| } |
| } |
| #endif |
| for (;x1 < x2; x1++, out++, in++) { |
| ushort4 in_s = convert<ushort4>(*in); |
| in_s = (in_s * out->w) >> (ushort4)8; |
| *out = convert<uchar4>(in_s); |
| } |
| break; |
| case RenderScriptToolkit::BlendingMode::DST_IN: |
| #if defined(ARCH_X86_HAVE_SSSE3) |
| if (mUsesSimd) { |
| if ((x1 + 8) < x2) { |
| uint32_t len = (x2 - x1) >> 3; |
| rsdIntrinsicBlendDstIn_K(out, in, len); |
| x1 += len << 3; |
| out += len << 3; |
| in += len << 3; |
| } |
| } |
| #endif |
| for (;x1 < x2; x1++, out++, in++) { |
| ushort4 out_s = convert<ushort4>(*out); |
| out_s = (out_s * in->w) >> (ushort4)8; |
| *out = convert<uchar4>(out_s); |
| } |
| break; |
| case RenderScriptToolkit::BlendingMode::SRC_OUT: |
| #if defined(ARCH_X86_HAVE_SSSE3) |
| if (mUsesSimd) { |
| if ((x1 + 8) < x2) { |
| uint32_t len = (x2 - x1) >> 3; |
| rsdIntrinsicBlendSrcOut_K(out, in, len); |
| x1 += len << 3; |
| out += len << 3; |
| in += len << 3; |
| } |
| } |
| #endif |
| for (;x1 < x2; x1++, out++, in++) { |
| ushort4 in_s = convert<ushort4>(*in); |
| in_s = (in_s * (ushort4)(255 - out->w)) >> (ushort4)8; |
| *out = convert<uchar4>(in_s); |
| } |
| break; |
| case RenderScriptToolkit::BlendingMode::DST_OUT: |
| #if defined(ARCH_X86_HAVE_SSSE3) |
| if (mUsesSimd) { |
| if ((x1 + 8) < x2) { |
| uint32_t len = (x2 - x1) >> 3; |
| rsdIntrinsicBlendDstOut_K(out, in, len); |
| x1 += len << 3; |
| out += len << 3; |
| in += len << 3; |
| } |
| } |
| #endif |
| for (;x1 < x2; x1++, out++, in++) { |
| ushort4 out_s = convert<ushort4>(*out); |
| out_s = (out_s * (ushort4)(255 - in->w)) >> (ushort4)8; |
| *out = convert<uchar4>(out_s); |
| } |
| break; |
| case RenderScriptToolkit::BlendingMode::SRC_ATOP: |
| #if defined(ARCH_X86_HAVE_SSSE3) |
| if (mUsesSimd) { |
| if ((x1 + 8) < x2) { |
| uint32_t len = (x2 - x1) >> 3; |
| rsdIntrinsicBlendSrcAtop_K(out, in, len); |
| x1 += len << 3; |
| out += len << 3; |
| in += len << 3; |
| } |
| } |
| #endif |
| for (;x1 < x2; x1++, out++, in++) { |
| // The max value the operation could produce before the shift |
| // is 255 * 255 + 255 * (255 - 0) = 130050, or 0x1FC02. |
| // That value does not fit in a ushort, so we use uint. |
| uint4 in_s = convert<uint4>(*in); |
| uint4 out_s = convert<uint4>(*out); |
| out_s.xyz = ((in_s.xyz * out_s.w) + |
| (out_s.xyz * ((uint3)255 - (uint3)in_s.w))) >> (uint3)8; |
| *out = convertClipped(out_s); |
| } |
| break; |
| case RenderScriptToolkit::BlendingMode::DST_ATOP: |
| #if defined(ARCH_X86_HAVE_SSSE3) |
| if (mUsesSimd) { |
| if ((x1 + 8) < x2) { |
| uint32_t len = (x2 - x1) >> 3; |
| rsdIntrinsicBlendDstAtop_K(out, in, len); |
| x1 += len << 3; |
| out += len << 3; |
| in += len << 3; |
| } |
| } |
| #endif |
| for (;x1 < x2; x1++, out++, in++) { |
| uint4 in_s = convert<uint4>(*in); |
| uint4 out_s = convert<uint4>(*out); |
| out_s.xyz = ((out_s.xyz * in_s.w) + |
| (in_s.xyz * ((uint3)255 - (uint3)out_s.w))) >> (uint3)8; |
| out_s.w = in_s.w; |
| *out = convertClipped(out_s); |
| } |
| break; |
| case RenderScriptToolkit::BlendingMode::XOR: |
| #if defined(ARCH_X86_HAVE_SSSE3) |
| if (mUsesSimd) { |
| if ((x1 + 8) < x2) { |
| uint32_t len = (x2 - x1) >> 3; |
| rsdIntrinsicBlendXor_K(out, in, len); |
| x1 += len << 3; |
| out += len << 3; |
| in += len << 3; |
| } |
| } |
| #endif |
| for (;x1 < x2; x1++, out++, in++) { |
| *out = *in ^ *out; |
| } |
| break; |
| case RenderScriptToolkit::BlendingMode::MULTIPLY: |
| #if defined(ARCH_X86_HAVE_SSSE3) |
| if (mUsesSimd) { |
| if ((x1 + 8) < x2) { |
| uint32_t len = (x2 - x1) >> 3; |
| rsdIntrinsicBlendMultiply_K(out, in, len); |
| x1 += len << 3; |
| out += len << 3; |
| in += len << 3; |
| } |
| } |
| #endif |
| for (;x1 < x2; x1++, out++, in++) { |
| *out = convert<uchar4>((convert<ushort4>(*in) * convert<ushort4>(*out)) |
| >> (ushort4)8); |
| } |
| break; |
| case RenderScriptToolkit::BlendingMode::ADD: |
| #if defined(ARCH_X86_HAVE_SSSE3) |
| if (mUsesSimd) { |
| if((x1 + 8) < x2) { |
| uint32_t len = (x2 - x1) >> 3; |
| rsdIntrinsicBlendAdd_K(out, in, len); |
| x1 += len << 3; |
| out += len << 3; |
| in += len << 3; |
| } |
| } |
| #endif |
| for (;x1 < x2; x1++, out++, in++) { |
| uint32_t iR = in->x, iG = in->y, iB = in->z, iA = in->w, |
| oR = out->x, oG = out->y, oB = out->z, oA = out->w; |
| out->x = (oR + iR) > 255 ? 255 : oR + iR; |
| out->y = (oG + iG) > 255 ? 255 : oG + iG; |
| out->z = (oB + iB) > 255 ? 255 : oB + iB; |
| out->w = (oA + iA) > 255 ? 255 : oA + iA; |
| } |
| break; |
| case RenderScriptToolkit::BlendingMode::SUBTRACT: |
| #if defined(ARCH_X86_HAVE_SSSE3) |
| if (mUsesSimd) { |
| if((x1 + 8) < x2) { |
| uint32_t len = (x2 - x1) >> 3; |
| rsdIntrinsicBlendSub_K(out, in, len); |
| x1 += len << 3; |
| out += len << 3; |
| in += len << 3; |
| } |
| } |
| #endif |
| for (;x1 < x2; x1++, out++, in++) { |
| int32_t iR = in->x, iG = in->y, iB = in->z, iA = in->w, |
| oR = out->x, oG = out->y, oB = out->z, oA = out->w; |
| out->x = (oR - iR) < 0 ? 0 : oR - iR; |
| out->y = (oG - iG) < 0 ? 0 : oG - iG; |
| out->z = (oB - iB) < 0 ? 0 : oB - iB; |
| out->w = (oA - iA) < 0 ? 0 : oA - iA; |
| } |
| break; |
| |
| default: |
| ALOGE("Called unimplemented value %d", mode); |
| assert(false); |
| } |
| } |
| |
| void BlendTask::processData(int /* threadIndex */, size_t startX, size_t startY, size_t endX, |
| size_t endY) { |
| for (size_t y = startY; y < endY; y++) { |
| size_t offset = y * mSizeX + startX; |
| blend(mMode, mIn + offset, mOut + offset, endX - startX); |
| } |
| } |
| |
| void RenderScriptToolkit::blend(BlendingMode mode, const uint8_t* in, uint8_t* out, size_t sizeX, |
| size_t sizeY, const Restriction* restriction) { |
| #ifdef ANDROID_RENDERSCRIPT_TOOLKIT_VALIDATE |
| if (!validRestriction(LOG_TAG, sizeX, sizeY, restriction)) { |
| return; |
| } |
| #endif |
| |
| BlendTask task(mode, in, out, sizeX, sizeY, restriction); |
| processor->doTask(&task); |
| } |
| |
| } // namespace renderscript |
| } // namespace android |