| #include "caffe2/core/operator.h" |
| #include "caffe2/utils/cpu_neon.h" |
| #include "caffe2/utils/math.h" |
| |
| #ifdef USE_MKLDNN |
| #include <caffe2/ideep/operators/operator_fallback_ideep.h> |
| #include <caffe2/ideep/utils/ideep_operator.h> |
| #endif |
| |
| namespace caffe2 { |
| |
| #if defined(__ARM_NEON__) || defined(__ARM_NEON) |
| namespace { |
| |
| // |
| // ARM Neon code utilities |
| // |
| |
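| // Widens four u16 lanes to u32 and converts them to f32 |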
| inline float32x4_t to_v4_f32(uint16x4_t v) { |
| return vcvtq_f32_u32(vmovl_u16(v)); |
| } |
| |
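| // Converts 16 packed u8 values into four float32x4 vectors |
| // (lanes 0-3, 4-7, 8-11 and 12-15 of the input, in order) |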
| inline float32x4x4_t to_f32_v4_x4(uint8x16_t v) { |
| float32x4x4_t out; |
| |
| uint16x8_t lo_u16 = vmovl_u8(vget_low_u8(v)); |
| |
| out.val[0] = to_v4_f32(vget_low_u16(lo_u16)); |
| out.val[1] = to_v4_f32(vget_high_u16(lo_u16)); |
| |
| uint16x8_t hi_u16 = vmovl_u8(vget_high_u8(v)); |
| |
| out.val[2] = to_v4_f32(vget_low_u16(hi_u16)); |
| out.val[3] = to_v4_f32(vget_high_u16(hi_u16)); |
| |
| return out; |
| } |
| |
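| // Clamps each lane to the representable uint8 range [0, 255] |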
| inline void clamp(float32x4_t& v) { |
| v = vmaxq_f32(v, vdupq_n_f32(0)); |
| v = vminq_f32(v, vdupq_n_f32((float)std::numeric_limits<uint8_t>::max())); |
| } |
| |
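| // Adds the scalar mean to every lane, then clamps to [0, 255] |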
| inline void addMeanAndClamp(float32x4_t& v, float mean) { |
| v = vaddq_f32(v, vdupq_n_f32(mean)); |
| clamp(v); |
| } |
| |
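| // Converts two float vectors to integers and narrows them into eight |
| // u8 lanes; inputs must already be clamped to [0, 255] since the |
| // narrowing moves (vmovn) truncate rather than saturate |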
| inline uint8x8_t convertNarrowAndPack(float32x4_t v0, float32x4_t v1) { |
| uint16x4_t u16_0 = vmovn_u32(vcvtq_u32_f32(v0)); |
| uint16x4_t u16_1 = vmovn_u32(vcvtq_u32_f32(v1)); |
| uint16x8_t u16_01 = vcombine_u16(u16_0, u16_1); |
| return vmovn_u16(u16_01); |
| } |
| |
| } // unnamed namespace |
| #endif // defined(__ARM_NEON__) || defined(__ARM_NEON) |
| |
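| // Stylizer preprocessor: converts packed uint8 BGRA NHWC input into a |
| // float NCHW output with kOutputChannels channels, subtracting a |
| // per-channel mean and adding cached Gaussian noise |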
| class PackedInt8BGRANHWCToNCHWCStylizerPreprocessOp |
| : public Operator<CPUContext> { |
| public: |
| // Expect this many channels as input |
| static constexpr int kInputChannels = 4; |
| |
| // Expect this many channels as output |
| static constexpr int kOutputChannels = 3; |
| |
| // We read this much noise per vectorized cycle |
| static constexpr int kNeonNoiseReadSize = kOutputChannels * 16; |
| |
| USE_OPERATOR_FUNCTIONS(CPUContext); |
| explicit PackedInt8BGRANHWCToNCHWCStylizerPreprocessOp( |
| const OperatorDef& operator_def, |
| Workspace* ws) |
| : Operator<CPUContext>(operator_def, ws), ws_(ws) {} |
| |
| bool RunOnDevice() override { |
| const auto& X = Input(0); |
| const auto& mean = Input(1); |
| |
| auto* noiseBlob = ws_->CreateBlob("__CAFFE2_STYLIZER_NOISE__"); |
| auto defaultNoiseSize = OperatorBase::GetSingleArgument<int>( |
| "noise_size", 491 /* prime to avoid artifacts */); |
| |
| if (!BlobIsTensorType(*noiseBlob, CPU)) { |
| // Initialize random noise on first use. |
| // Cache it to maintain temporal consistency. |
| auto* t = BlobGetMutableTensor(noiseBlob, CPU); |
| |
| #if defined(__ARM_NEON__) || defined(__ARM_NEON) |
| // The NEON path allocates extra noise so that full vectorized |
| // loads near the end of the noise cycle stay within bounds |
| initNoiseCPUNeon(t, defaultNoiseSize); |
| #else |
| initNoiseCPU(t, defaultNoiseSize); |
| #endif |
| } |
| const auto& noise = noiseBlob->template Get<TensorCPU>(); |
| CAFFE_ENFORCE(noise.numel() >= defaultNoiseSize); |
| |
| CAFFE_ENFORCE(X.dim() == 4); |
| const int N = X.dim32(0), H = X.dim32(1), W = X.dim32(2), C = X.dim32(3); |
| // Assume BGR or BGRA |
| CAFFE_ENFORCE(mean.numel() == kOutputChannels); |
| |
| CAFFE_ENFORCE(C == kInputChannels); |
| auto* Y = Output(0, {N, kOutputChannels, H, W}, at::dtype<float>()); |
| |
| runBatch( |
| N, |
| C, |
| H, |
| W, |
| defaultNoiseSize, |
| X.data<uint8_t>(), |
| mean.data<float>(), |
| noise.data<float>(), |
| Y->template mutable_data<float>()); |
| |
| return true; |
| } |
| |
| #if !defined(__ARM_NEON__) && !defined(__ARM_NEON) |
| void initNoiseCPU(Tensor* noise, int size) { |
| noise->Resize(size); |
| |
| math::RandGaussian<float, CPUContext>( |
| size, |
| 0.0, |
| OperatorBase::GetSingleArgument<float>("noise_std", 10.0), |
| noise->template mutable_data<float>(), |
| &context_); |
| } |
| #endif // !defined(__ARM_NEON__) && !defined(__ARM_NEON) |
| |
| #if defined(__ARM_NEON__) || defined(__ARM_NEON) |
| void initNoiseCPUNeon(Tensor* noise, int size) { |
| // For ARM NEON, noise is consumed in chunks of kNeonNoiseReadSize |
| // floats since the inner loop is vectorized. Round up to the next |
| // multiple of kNeonNoiseReadSize and add the original size as |
| // headroom, so a full vectorized read starting anywhere within the |
| // noise cycle stays inside the buffer |
| size = math::RoundUp(size, kNeonNoiseReadSize) + size; |
| noise->Resize(size); |
| |
| math::RandGaussian<float, CPUContext>( |
| size, |
| 0.0, |
| OperatorBase::GetSingleArgument<float>("noise_std", 10.0), |
| noise->template mutable_data<float>(), |
| &context_); |
| } |
| #endif // defined(__ARM_NEON__) || defined(__ARM_NEON) |
| |
| void runBatch( |
| int N, |
| int /*C*/, |
| int H, |
| int W, |
| int noiseCycle, |
| const uint8_t* input, |
| const float* meanChannel, |
| const float* noise, |
| float* output) { |
| int planeSize = H * W; |
| |
| for (int n = 0; n < N; ++n) { |
| auto curInput = input + n * kInputChannels * planeSize; |
| auto curOutput = output + n * kOutputChannels * planeSize; |
| |
| #if defined(__ARM_NEON__) || defined(__ARM_NEON) |
| runCPUNeon(H, W, noiseCycle, curInput, meanChannel, noise, curOutput); |
| #else |
| runCPU(H, W, noiseCycle, curInput, meanChannel, noise, curOutput); |
| #endif // defined(__ARM_NEON__) || defined(__ARM_NEON) |
| } |
| } |
| |
| #if !defined(__ARM_NEON__) && !defined(__ARM_NEON) |
| void runCPU( |
| int H, |
| int W, |
| int noiseCycle, |
| const uint8_t* input, |
| const float* meanChannel, |
| const float* noise, |
| float* output) { |
| int planeSize = H * W; |
| int noiseOffset = 0; |
| |
| for (int point = 0; point < planeSize; ++point) { |
| for (int c = 0; c < kOutputChannels; ++c) { |
| float v = (float)input[point * kInputChannels + c]; |
| output[c * planeSize + point] = v - meanChannel[c] + noise[noiseOffset]; |
| |
| if (++noiseOffset >= noiseCycle) { |
| noiseOffset = 0; |
| } |
| } |
| } |
| } |
| #endif // !defined(__ARM_NEON__) && !defined(__ARM_NEON) |
| |
| #if defined(__ARM_NEON__) || defined(__ARM_NEON) |
| void runCPUNeon( |
| int H, |
| int W, |
| int noiseCycle, |
| const uint8_t* input, |
| const float* meanChannel, |
| const float* noise, |
| float* output) { |
| // Vectorized load parameters: |
| |
| // Loop unroll factor |
| // FIXME: this doesn't actually unroll; clang has per-loop unroll |
| // pragmas but GCC does not |
| constexpr int kUnroll = 1; |
| |
| // How much data we load for each inner loop |
| constexpr int kInnerLoadSize = sizeof(uint8x16x4_t); |
| |
| // What we write out |
| constexpr int kInnerStoreSize = sizeof(float32x4_t); |
| |
| // We load 16 pixels at a time, with 4 channels each |
| constexpr int kLoadPixels = kInnerLoadSize / kInputChannels; |
| static_assert(kLoadPixels == 16, "expected 16 pixels per vectorized load"); |
| |
| // How many pixels we load per loop |
| constexpr int kLoadPixelsPerLoop = kLoadPixels * kUnroll; |
| |
| // We need at least this much noise each loop through |
| CAFFE_ENFORCE_GE(noiseCycle, kOutputChannels * kLoadPixelsPerLoop); |
| |
| int noiseUsed = 0; |
| const float* curNoise = noise; |
| |
| float mean[kOutputChannels] = { |
| meanChannel[0], meanChannel[1], meanChannel[2]}; |
| int planeSize = H * W; |
| |
| // Vectorized portion |
| int point = 0; |
| |
| // If the slice is not aligned, then we have to use the |
| // un-vectorized version |
| bool isAligned = isPointerAligned(input, kInnerLoadSize) && |
| isPointerAligned(output, kInnerStoreSize) && |
| // Because we are writing to output at offsets of planeSize, |
| // planeSize has to be an even multiple of kInnerStoreSize |
| (planeSize % kInnerStoreSize == 0); |
| |
| // What portion the vectorized loop will handle |
| int limit = |
| isAligned ? (planeSize / kLoadPixelsPerLoop) * kLoadPixelsPerLoop : 0; |
| |
| for (; point < limit; point += kLoadPixelsPerLoop) { |
| // Unroll load/update/store by kUnroll |
| for (int j = 0; j < kUnroll; ++j) { |
| // We load 16 pixels x 4 channels at a time |
| const uint8_t* inputAligned = (const uint8_t*)__builtin_assume_aligned( |
| input + (point + j * kLoadPixels) * kInputChannels, |
| sizeof(uint8x16x4_t)); |
| uint8x16x4_t loadV = vld4q_u8(inputAligned); |
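| // vld4q_u8 de-interleaves the 64 loaded bytes, so loadV.val[c] |
| // holds channel c of all 16 pixels |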
| |
| // The compiler doesn't want to unroll this when we put it in a |
| // loop, and in GCC there's no per-loop unroll pragma, so we do |
| // it manually. |
| // This seems to involve no register spillage, crossing fingers |
| // that it remains that way. |
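| // Each unrolled step consumes kLoadPixels * kOutputChannels = 48 |
| // noise floats; channel c reads the 16 floats starting at offset |
| // c * 16 within that chunk |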
| { |
| constexpr int kChannel = 0; |
| float32x4_t noise0 = vld1q_f32(curNoise + j * 48 + 0); |
| float32x4_t noise1 = vld1q_f32(curNoise + j * 48 + 4); |
| float32x4_t noise2 = vld1q_f32(curNoise + j * 48 + 8); |
| float32x4_t noise3 = vld1q_f32(curNoise + j * 48 + 12); |
| |
| float32x4x4_t outV = to_f32_v4_x4(loadV.val[kChannel]); |
| float32x4_t meanV = vdupq_n_f32(mean[kChannel]); |
| outV.val[0] = vsubq_f32(outV.val[0], meanV); |
| outV.val[1] = vsubq_f32(outV.val[1], meanV); |
| outV.val[2] = vsubq_f32(outV.val[2], meanV); |
| outV.val[3] = vsubq_f32(outV.val[3], meanV); |
| |
| outV.val[0] = vaddq_f32(outV.val[0], noise0); |
| outV.val[1] = vaddq_f32(outV.val[1], noise1); |
| outV.val[2] = vaddq_f32(outV.val[2], noise2); |
| outV.val[3] = vaddq_f32(outV.val[3], noise3); |
| |
| float* outputAligned = (float*)__builtin_assume_aligned( |
| &output[kChannel * planeSize + (point + j * kLoadPixels)], |
| sizeof(float32x4_t)); |
| |
| vst1q_f32(outputAligned + 0, outV.val[0]); |
| vst1q_f32(outputAligned + 4, outV.val[1]); |
| vst1q_f32(outputAligned + 8, outV.val[2]); |
| vst1q_f32(outputAligned + 12, outV.val[3]); |
| } |
| |
| { |
| constexpr int kChannel = 1; |
| float32x4_t noise0 = vld1q_f32(curNoise + j * 48 + 16); |
| float32x4_t noise1 = vld1q_f32(curNoise + j * 48 + 20); |
| float32x4_t noise2 = vld1q_f32(curNoise + j * 48 + 24); |
| float32x4_t noise3 = vld1q_f32(curNoise + j * 48 + 28); |
| |
| float32x4x4_t outV = to_f32_v4_x4(loadV.val[kChannel]); |
| float32x4_t meanV = vdupq_n_f32(mean[kChannel]); |
| outV.val[0] = vsubq_f32(outV.val[0], meanV); |
| outV.val[1] = vsubq_f32(outV.val[1], meanV); |
| outV.val[2] = vsubq_f32(outV.val[2], meanV); |
| outV.val[3] = vsubq_f32(outV.val[3], meanV); |
| |
| outV.val[0] = vaddq_f32(outV.val[0], noise0); |
| outV.val[1] = vaddq_f32(outV.val[1], noise1); |
| outV.val[2] = vaddq_f32(outV.val[2], noise2); |
| outV.val[3] = vaddq_f32(outV.val[3], noise3); |
| |
| float* outputAligned = (float*)__builtin_assume_aligned( |
| &output[kChannel * planeSize + (point + j * kLoadPixels)], |
| sizeof(float32x4_t)); |
| |
| vst1q_f32(outputAligned + 0, outV.val[0]); |
| vst1q_f32(outputAligned + 4, outV.val[1]); |
| vst1q_f32(outputAligned + 8, outV.val[2]); |
| vst1q_f32(outputAligned + 12, outV.val[3]); |
| } |
| |
| { |
| constexpr int kChannel = 2; |
| float32x4_t noise0 = vld1q_f32(curNoise + j * 48 + 32); |
| float32x4_t noise1 = vld1q_f32(curNoise + j * 48 + 36); |
| float32x4_t noise2 = vld1q_f32(curNoise + j * 48 + 40); |
| float32x4_t noise3 = vld1q_f32(curNoise + j * 48 + 44); |
| |
| float32x4x4_t outV = to_f32_v4_x4(loadV.val[kChannel]); |
| float32x4_t meanV = vdupq_n_f32(mean[kChannel]); |
| outV.val[0] = vsubq_f32(outV.val[0], meanV); |
| outV.val[1] = vsubq_f32(outV.val[1], meanV); |
| outV.val[2] = vsubq_f32(outV.val[2], meanV); |
| outV.val[3] = vsubq_f32(outV.val[3], meanV); |
| |
| outV.val[0] = vaddq_f32(outV.val[0], noise0); |
| outV.val[1] = vaddq_f32(outV.val[1], noise1); |
| outV.val[2] = vaddq_f32(outV.val[2], noise2); |
| outV.val[3] = vaddq_f32(outV.val[3], noise3); |
| |
| float* outputAligned = (float*)__builtin_assume_aligned( |
| &output[kChannel * planeSize + (point + j * kLoadPixels)], |
| sizeof(float32x4_t)); |
| |
| vst1q_f32(outputAligned + 0, outV.val[0]); |
| vst1q_f32(outputAligned + 4, outV.val[1]); |
| vst1q_f32(outputAligned + 8, outV.val[2]); |
| vst1q_f32(outputAligned + 12, outV.val[3]); |
| } |
| } |
| |
| curNoise += (kLoadPixels * kOutputChannels) * kUnroll; |
| noiseUsed += (kLoadPixels * kOutputChannels) * kUnroll; |
| |
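| // Wrap the noise pointer once a full cycle has been consumed; |
| // keeping the offset modulo noiseCycle means successive cycles |
| // start at a different phase (the default noise_size is prime) |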
| if (noiseUsed >= noiseCycle) { |
| noiseUsed = 0; |
| curNoise = noise + ((curNoise - noise) % noiseCycle); |
| } |
| } |
| |
| // Epilogue: non-vectorized remainder |
| for (; point < planeSize; ++point) { |
| for (int c = 0; c < kOutputChannels; ++c) { |
| float v = (float)input[point * kInputChannels + c]; |
| output[c * planeSize + point] = v - mean[c] + *curNoise++; |
| ++noiseUsed; |
| } |
| |
| if (noiseUsed >= noiseCycle) { |
| noiseUsed = 0; |
| curNoise = noise + ((curNoise - noise) % noiseCycle); |
| } |
| } |
| } |
| #endif // defined(__ARM_NEON__) || defined(__ARM_NEON) |
| |
| private: |
| Workspace* ws_; |
| }; |
| |
| namespace { |
| |
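| // Saturating conversion from float to an integral type: values |
| // outside the target type's range are pinned to its min/max |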
| template <typename T> |
| static inline T clamped_cast(float f) { |
| if (f >= std::numeric_limits<T>::max()) { |
| return std::numeric_limits<T>::max(); |
| } |
| if (f <= std::numeric_limits<T>::min()) { |
| return std::numeric_limits<T>::min(); |
| } |
| return static_cast<T>(f); |
| } |
| |
| } // unnamed namespace |
| |
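| // Stylizer deprocessor: converts float NCHW input with kInputChannels |
| // channels back into packed uint8 BGRA NHWC output, adding the |
| // per-channel mean, clamping to [0, 255], and setting alpha to 255 |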
| class BRGNCHWCToPackedInt8BGRAStylizerDeprocessOp |
| : public Operator<CPUContext> { |
| public: |
| using Operator<CPUContext>::Operator; |
| |
| // Expect this many channels as input |
| static constexpr int kInputChannels = 3; |
| |
| // Expect this many channels as output |
| static constexpr int kOutputChannels = 4; |
| |
| bool RunOnDevice() override { |
| const auto& X = Input(0); |
| const auto& mean = Input(1); |
| |
| CAFFE_ENFORCE(X.dim() == 4); |
| const int N = X.dim32(0), C = X.dim32(1), H = X.dim32(2), W = X.dim32(3); |
| // Assume BGR or BGRA |
| CAFFE_ENFORCE(mean.numel() == kInputChannels); |
| CAFFE_ENFORCE(C == kInputChannels); |
| // Packed uint8 BGRA output |
| auto* Y = Output(0, {N, H, W, kOutputChannels}, at::dtype<uint8_t>()); |
| |
| runBatch( |
| N, |
| C, |
| H, |
| W, |
| X.data<float>(), |
| mean.data<float>(), |
| Y->template mutable_data<uint8_t>()); |
| |
| return true; |
| } |
| |
| void runBatch( |
| int N, |
| int /*C*/, |
| int H, |
| int W, |
| const float* input, |
| const float* meanChannel, |
| uint8_t* output) { |
| int planeSize = H * W; |
| |
| for (int n = 0; n < N; ++n) { |
| auto curInput = input + n * kInputChannels * planeSize; |
| auto curOutput = output + n * kOutputChannels * planeSize; |
| |
| #if defined(__ARM_NEON__) || defined(__ARM_NEON) |
| runCPUNeon(H, W, curInput, meanChannel, curOutput); |
| #else |
| runCPU(H, W, curInput, meanChannel, curOutput); |
| #endif // defined(__ARM_NEON__) || defined(__ARM_NEON) |
| } |
| } |
| |
| #if !defined(__ARM_NEON__) && !defined(__ARM_NEON) |
| void runCPU( |
| int H, |
| int W, |
| const float* input, |
| const float* meanChannel, |
| uint8_t* output) { |
| int planeSize = H * W; |
| |
| for (int point = 0; point < planeSize; ++point) { |
| for (int c = 0; c < kInputChannels; ++c) { |
| uint8_t v = clamped_cast<uint8_t>( |
| input[c * planeSize + point] + meanChannel[c]); |
| output[point * kOutputChannels + c] = v; |
| } |
| |
| // alpha |
| output[point * kOutputChannels + (kOutputChannels - 1)] = |
| std::numeric_limits<uint8_t>::max(); |
| } |
| } |
| #endif // !defined(__ARM_NEON__) && !defined(__ARM_NEON) |
| |
| #if defined(__ARM_NEON__) || defined(__ARM_NEON) |
| void runCPUNeon( |
| int H, |
| int W, |
| const float* input, |
| const float* meanChannel, |
| uint8_t* output) { |
| // Vectorized load parameters: |
| |
| // We load in chunks of this size |
| constexpr int kLoadUnit = sizeof(float32x4_t); |
| constexpr int kLoadFloats = (sizeof(float32x4_t) / sizeof(float)); |
| |
| // We store in chunks of this size |
| constexpr int kStoreUnit = sizeof(uint8x8x4_t); |
| |
| // The vector portion loads this many f32 pixels at a time (8) |
| constexpr int kLoadPixels = 2 * kLoadFloats; |
| |
| float mean[kInputChannels] = { |
| meanChannel[0], meanChannel[1], meanChannel[2]}; |
| int planeSize = H * W; |
| |
| // Vectorized portion |
| int point = 0; |
| |
| // If the slice is not aligned, then we have to use the |
| // un-vectorized version |
| bool isAligned = isPointerAligned(input, kLoadUnit) && |
| isPointerAligned(output, kStoreUnit) && |
| // Because we are reading from input at offsets of planeSize, |
| // planeSize has to be an even multiple of kLoadUnit |
| (planeSize % kLoadUnit == 0); |
| |
| // What portion the vectorized loop will handle |
| int limit = isAligned ? (planeSize / kLoadPixels) * kLoadPixels : 0; |
| |
| for (; point < limit; point += kLoadPixels) { |
| // Load 8 f32 pixels from each channel; loading 16 at a time |
| // appears to cause register spills |
| float32x4_t inputc0_0 = |
| vld1q_f32_aligned(input + 0 * planeSize + point + 0 * kLoadFloats); |
| float32x4_t inputc0_1 = |
| vld1q_f32_aligned(input + 0 * planeSize + point + 1 * kLoadFloats); |
| |
| float32x4_t inputc1_0 = |
| vld1q_f32_aligned(input + 1 * planeSize + point + 0 * kLoadFloats); |
| float32x4_t inputc1_1 = |
| vld1q_f32_aligned(input + 1 * planeSize + point + 1 * kLoadFloats); |
| |
| float32x4_t inputc2_0 = |
| vld1q_f32_aligned(input + 2 * planeSize + point + 0 * kLoadFloats); |
| float32x4_t inputc2_1 = |
| vld1q_f32_aligned(input + 2 * planeSize + point + 1 * kLoadFloats); |
| |
| addMeanAndClamp(inputc0_0, mean[0]); |
| addMeanAndClamp(inputc0_1, mean[0]); |
| uint8x8_t u8_c0 = convertNarrowAndPack(inputc0_0, inputc0_1); |
| |
| addMeanAndClamp(inputc1_0, mean[1]); |
| addMeanAndClamp(inputc1_1, mean[1]); |
| uint8x8_t u8_c1 = convertNarrowAndPack(inputc1_0, inputc1_1); |
| |
| addMeanAndClamp(inputc2_0, mean[2]); |
| addMeanAndClamp(inputc2_1, mean[2]); |
| uint8x8_t u8_c2 = convertNarrowAndPack(inputc2_0, inputc2_1); |
| |
| // This is the alpha channel |
| uint8x8_t u8_c3 = vdup_n_u8(std::numeric_limits<uint8_t>::max()); |
| |
| // We now have 8 bytes of each channel in a separate vector |
| // Write BGRA interleaved to output |
| uint8x8x4_t u8_out = {{ u8_c0, u8_c1, u8_c2, u8_c3 }}; |
| vst4_u8_aligned(output + kOutputChannels * point, u8_out); |
| } |
| |
| // Epilogue: non-vectorized remainder |
| for (; point < planeSize; ++point) { |
| for (int c = 0; c < kInputChannels; ++c) { |
| uint8_t v = |
| clamped_cast<uint8_t>(input[c * planeSize + point] + mean[c]); |
| output[point * kOutputChannels + c] = v; |
| } |
| |
| // alpha |
| output[point * kOutputChannels + (kOutputChannels - 1)] = |
| std::numeric_limits<uint8_t>::max(); |
| } |
| } |
| #endif // defined(__ARM_NEON__) || defined(__ARM_NEON) |
| }; |
| |
| namespace { |
| |
| REGISTER_CPU_OPERATOR( |
| PackedInt8BGRANHWCToNCHWCStylizerPreprocess, |
| PackedInt8BGRANHWCToNCHWCStylizerPreprocessOp); |
| OPERATOR_SCHEMA(PackedInt8BGRANHWCToNCHWCStylizerPreprocess) |
| .NumInputs(2) |
| .NumOutputs(1); |
| REGISTER_CPU_OPERATOR( |
| BRGNCHWCToPackedInt8BGRAStylizerDeprocess, |
| BRGNCHWCToPackedInt8BGRAStylizerDeprocessOp); |
| OPERATOR_SCHEMA(BRGNCHWCToPackedInt8BGRAStylizerDeprocess) |
| .NumInputs(2) |
| .NumOutputs(1); |
| |
| #ifdef USE_MKLDNN |
| REGISTER_IDEEP_OPERATOR( |
| BRGNCHWCToPackedInt8BGRAStylizerDeprocess, |
| IDEEPFallbackOp<BRGNCHWCToPackedInt8BGRAStylizerDeprocessOp, SkipIndices<0>>); |
| REGISTER_IDEEP_OPERATOR( |
| PackedInt8BGRANHWCToNCHWCStylizerPreprocess, |
| IDEEPFallbackOp<PackedInt8BGRANHWCToNCHWCStylizerPreprocessOp>); |
| #endif |
| } // namespace |
| } // namespace caffe2 |