#include "caffe2/core/operator.h"
#include "caffe2/utils/cpu_neon.h"
#include "caffe2/utils/math.h"
#ifdef USE_MKLDNN
#include <caffe2/ideep/operators/operator_fallback_ideep.h>
#include <caffe2/ideep/utils/ideep_operator.h>
#endif
namespace caffe2 {
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
namespace {
//
// ARM Neon code utilities
//
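// Widens four u16 lanes to u32, then converts them to f32.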
inline float32x4_t to_v4_f32(uint16x4_t v) {
return vcvtq_f32_u32(vmovl_u16(v));
}
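// Converts 16 packed u8 values into four f32x4 vectors, promoting one
// channel of 16 pixels to float in a single step.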
inline float32x4x4_t to_f32_v4_x4(uint8x16_t v) {
float32x4x4_t out;
uint16x8_t lo_u16 = vmovl_u8(vget_low_u8(v));
out.val[0] = to_v4_f32(vget_low_u16(lo_u16));
out.val[1] = to_v4_f32(vget_high_u16(lo_u16));
uint16x8_t hi_u16 = vmovl_u8(vget_high_u8(v));
out.val[2] = to_v4_f32(vget_low_u16(hi_u16));
out.val[3] = to_v4_f32(vget_high_u16(hi_u16));
return out;
}
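// Clamps each lane to the representable uint8 range [0, 255] while still
// in the float domain, so the later narrowing conversion cannot wrap.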
inline void clamp(float32x4_t& v) {
v = vmaxq_f32(v, vdupq_n_f32(0));
v = vminq_f32(v, vdupq_n_f32((float)std::numeric_limits<uint8_t>::max()));
}
inline void addMeanAndClamp(float32x4_t& v, float mean) {
v = vaddq_f32(v, vdupq_n_f32(mean));
clamp(v);
}
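// Narrows two f32x4 vectors (assumed already clamped to [0, 255]) down to
// eight u8 lanes: f32 -> u32 -> u16 -> u8.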
inline uint8x8_t convertNarrowAndPack(float32x4_t v0, float32x4_t v1) {
uint16x4_t u16_0 = vmovn_u32(vcvtq_u32_f32(v0));
uint16x4_t u16_1 = vmovn_u32(vcvtq_u32_f32(v1));
uint16x8_t u16_01 = vcombine_u16(u16_0, u16_1);
return vmovn_u16(u16_01);
}
} // unnamed namespace
#endif // defined(__ARM_NEON__) || defined(__ARM_NEON)
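// Converts a packed NHWC uint8 BGRA batch into an NCHW float tensor with
// kOutputChannels (BGR) planes, subtracting a per-channel mean and adding
// cached Gaussian noise: Y[n][c][p] = X[n][p][c] - mean[c] + noise.
//
// A hypothetical OperatorDef in text format (blob names are illustrative):
//
//   op {
//     type: "PackedInt8BGRANHWCToNCHWCStylizerPreprocess"
//     input: "frame_bgra"  # uint8, NHWC, C == 4
//     input: "mean_bgr"    # float, 3 values
//     output: "frame_nchw" # float, NCHW, C == 3
//   }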
class PackedInt8BGRANHWCToNCHWCStylizerPreprocessOp
: public Operator<CPUContext> {
public:
// Expect this many channels as input
static constexpr int kInputChannels = 4;
// Expect this many channels as output
static constexpr int kOutputChannels = 3;
// We read this much noise per vectorized cycle
static constexpr int kNeonNoiseReadSize = kOutputChannels * 16;
USE_OPERATOR_FUNCTIONS(CPUContext);
explicit PackedInt8BGRANHWCToNCHWCStylizerPreprocessOp(
    const OperatorDef& operator_def,
    Workspace* ws)
    : Operator<CPUContext>(operator_def, ws), ws_(ws) {}
bool RunOnDevice() override {
const auto& X = Input(0);
const auto& mean = Input(1);
auto* noiseBlob = ws_->CreateBlob("__CAFFE2_STYLIZER_NOISE__");
auto defaultNoiseSize = OperatorBase::GetSingleArgument<int>(
"noise_size", 491 /* prime to avoid artifacts */);
if (!BlobIsTensorType(*noiseBlob, CPU)) {
// Initialize random noise on first use.
// Cache it to maintain temporal consistency.
auto* t = BlobGetMutableTensor(noiseBlob, CPU);
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
// Noise space is larger for vectorized code due to the
// vectorized load
initNoiseCPUNeon(t, defaultNoiseSize);
#else
initNoiseCPU(t, defaultNoiseSize);
#endif
}
const auto& noise = noiseBlob->template Get<TensorCPU>();
CAFFE_ENFORCE(noise.numel() >= defaultNoiseSize);
CAFFE_ENFORCE(X.dim() == 4);
const int N = X.dim32(0), H = X.dim32(1), W = X.dim32(2), C = X.dim32(3);
// Assume BGR or BGRA
CAFFE_ENFORCE(mean.numel() == kOutputChannels);
CAFFE_ENFORCE(C == kInputChannels);
auto* Y = Output(0, {N, kOutputChannels, H, W}, at::dtype<float>());
runBatch(
N,
C,
H,
W,
defaultNoiseSize,
X.data<uint8_t>(),
mean.data<float>(),
noise.data<float>(),
Y->template mutable_data<float>());
return true;
}
#if !defined(__ARM_NEON__) && !defined(__ARM_NEON)
void initNoiseCPU(Tensor* noise, int size) {
noise->Resize(size);
math::RandGaussian<float, CPUContext>(
size,
0.0,
OperatorBase::GetSingleArgument<float>("noise_std", 10.0),
noise->template mutable_data<float>(),
&context_);
}
#endif // !defined(__ARM_NEON__) && !defined(__ARM_NEON)
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
void initNoiseCPUNeon(Tensor* noise, int size) {
// For ARM NEON, noise is read in chunks of kNeonNoiseReadSize inside the
// vectorized inner loop. Over-allocate (roughly doubling the buffer) so
// that a vectorized read beginning anywhere within the first `size`
// entries never runs past the end of the buffer.
size = math::RoundUp(size, kNeonNoiseReadSize) + size;
noise->Resize(size);
math::RandGaussian<float, CPUContext>(
size,
0.0,
OperatorBase::GetSingleArgument<float>("noise_std", 10.0),
noise->template mutable_data<float>(),
&context_);
}
#endif // defined(__ARM_NEON__) || defined(__ARM_NEON)
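// Processes each image in the batch independently, dispatching to the
// NEON kernel when compiled for ARM and to the scalar kernel otherwise.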
void runBatch(
int N,
int /*C*/,
int H,
int W,
int noiseCycle,
const uint8_t* input,
const float* meanChannel,
const float* noise,
float* output) {
int planeSize = H * W;
for (int n = 0; n < N; ++n) {
auto curInput = input + n * kInputChannels * planeSize;
auto curOutput = output + n * kOutputChannels * planeSize;
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
runCPUNeon(H, W, noiseCycle, curInput, meanChannel, noise, curOutput);
#else
runCPU(H, W, noiseCycle, curInput, meanChannel, noise, curOutput);
#endif // defined(__ARM_NEON__) || defined(__ARM_NEON)
}
}
#if !defined(__ARM_NEON__) && !defined(__ARM_NEON)
void runCPU(
int H,
int W,
int noiseCycle,
const uint8_t* input,
const float* meanChannel,
const float* noise,
float* output) {
int planeSize = H * W;
int noiseOffset = 0;
for (int point = 0; point < planeSize; ++point) {
for (int c = 0; c < kOutputChannels; ++c) {
float v = (float)input[point * kInputChannels + c];
output[c * planeSize + point] = v - meanChannel[c] + noise[noiseOffset];
if (++noiseOffset >= noiseCycle) {
noiseOffset = 0;
}
}
}
}
#endif // !defined(__ARM_NEON__) && !defined(__ARM_NEON)
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
void runCPUNeon(
int H,
int W,
int noiseCycle,
const uint8_t* input,
const float* meanChannel,
const float* noise,
float* output) {
// Vectorized load parameters:
// Loop unroll factor
// FIXME: this doesn't actually unroll; clang has per-loop unroll
// pragmas but GCC does not
constexpr int kUnroll = 1;
// How much data we load for each inner loop
constexpr int kInnerLoadSize = sizeof(uint8x16x4_t);
// What we write out
constexpr int kInnerStoreSize = sizeof(float32x4_t);
// We load 16 pixels at a time, with 4 channels each
constexpr int kLoadPixels = kInnerLoadSize / kInputChannels;
static_assert(kLoadPixels == 16, "unexpected");
// How many pixels we load per loop
constexpr int kLoadPixelsPerLoop = kLoadPixels * kUnroll;
// We need at least this much noise each loop through
CAFFE_ENFORCE_GE(noiseCycle, kOutputChannels * kLoadPixelsPerLoop);
int noiseUsed = 0;
const float* curNoise = noise;
float mean[kOutputChannels] = {
meanChannel[0], meanChannel[1], meanChannel[2]};
int planeSize = H * W;
// Vectorized portion
int point = 0;
// If the slice is not aligned, then we have to use the
// un-vectorized version
bool isAligned = isPointerAligned(input, kInnerLoadSize) &&
isPointerAligned(output, kInnerStoreSize) &&
// Because we are writing to output at offsets of planeSize,
// planeSize has to be a multiple of the vector width. (kInnerStoreSize
// is a byte count applied to a float count here, so this check is
// stricter than necessary, which is safe.)
(planeSize % kInnerStoreSize == 0);
// What portion the vectorized loop will handle
int limit =
isAligned ? (planeSize / kLoadPixelsPerLoop) * kLoadPixelsPerLoop : 0;
for (; point < limit; point += kLoadPixelsPerLoop) {
// Unroll load/update/store by kUnroll
for (int j = 0; j < kUnroll; ++j) {
// We load 16 pixels x 4 channels at a time
const uint8_t* inputAligned = (const uint8_t*)__builtin_assume_aligned(
input + (point + j * kLoadPixels) * kInputChannels,
sizeof(uint8x16x4_t));
uint8x16x4_t loadV = vld4q_u8(inputAligned);
// The compiler doesn't want to unroll this when we put it in a
// loop, and in GCC there's no per-loop unroll pragma, so we do
// it manually.
// This seems to involve no register spillage, crossing fingers
// that it remains that way.
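// Each unrolled step consumes kLoadPixels * kOutputChannels == 48 noise
// floats; the three channel blocks below take 16 of them apiece.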
{
constexpr int kChannel = 0;
float32x4_t noise0 = vld1q_f32(curNoise + j * 48 + 0);
float32x4_t noise1 = vld1q_f32(curNoise + j * 48 + 4);
float32x4_t noise2 = vld1q_f32(curNoise + j * 48 + 8);
float32x4_t noise3 = vld1q_f32(curNoise + j * 48 + 12);
float32x4x4_t outV = to_f32_v4_x4(loadV.val[kChannel]);
float32x4_t meanV = vdupq_n_f32(mean[kChannel]);
outV.val[0] = vsubq_f32(outV.val[0], meanV);
outV.val[1] = vsubq_f32(outV.val[1], meanV);
outV.val[2] = vsubq_f32(outV.val[2], meanV);
outV.val[3] = vsubq_f32(outV.val[3], meanV);
outV.val[0] = vaddq_f32(outV.val[0], noise0);
outV.val[1] = vaddq_f32(outV.val[1], noise1);
outV.val[2] = vaddq_f32(outV.val[2], noise2);
outV.val[3] = vaddq_f32(outV.val[3], noise3);
float* outputAligned = (float*)__builtin_assume_aligned(
&output[kChannel * planeSize + (point + j * kLoadPixels)],
sizeof(float32x4_t));
vst1q_f32(outputAligned + 0, outV.val[0]);
vst1q_f32(outputAligned + 4, outV.val[1]);
vst1q_f32(outputAligned + 8, outV.val[2]);
vst1q_f32(outputAligned + 12, outV.val[3]);
}
{
constexpr int kChannel = 1;
float32x4_t noise0 = vld1q_f32(curNoise + j * 48 + 16);
float32x4_t noise1 = vld1q_f32(curNoise + j * 48 + 20);
float32x4_t noise2 = vld1q_f32(curNoise + j * 48 + 24);
float32x4_t noise3 = vld1q_f32(curNoise + j * 48 + 28);
float32x4x4_t outV = to_f32_v4_x4(loadV.val[kChannel]);
float32x4_t meanV = vdupq_n_f32(mean[kChannel]);
outV.val[0] = vsubq_f32(outV.val[0], meanV);
outV.val[1] = vsubq_f32(outV.val[1], meanV);
outV.val[2] = vsubq_f32(outV.val[2], meanV);
outV.val[3] = vsubq_f32(outV.val[3], meanV);
outV.val[0] = vaddq_f32(outV.val[0], noise0);
outV.val[1] = vaddq_f32(outV.val[1], noise1);
outV.val[2] = vaddq_f32(outV.val[2], noise2);
outV.val[3] = vaddq_f32(outV.val[3], noise3);
float* outputAligned = (float*)__builtin_assume_aligned(
&output[kChannel * planeSize + (point + j * kLoadPixels)],
sizeof(float32x4_t));
vst1q_f32(outputAligned + 0, outV.val[0]);
vst1q_f32(outputAligned + 4, outV.val[1]);
vst1q_f32(outputAligned + 8, outV.val[2]);
vst1q_f32(outputAligned + 12, outV.val[3]);
}
{
constexpr int kChannel = 2;
float32x4_t noise0 = vld1q_f32(curNoise + j * 48 + 32);
float32x4_t noise1 = vld1q_f32(curNoise + j * 48 + 36);
float32x4_t noise2 = vld1q_f32(curNoise + j * 48 + 40);
float32x4_t noise3 = vld1q_f32(curNoise + j * 48 + 44);
float32x4x4_t outV = to_f32_v4_x4(loadV.val[kChannel]);
float32x4_t meanV = vdupq_n_f32(mean[kChannel]);
outV.val[0] = vsubq_f32(outV.val[0], meanV);
outV.val[1] = vsubq_f32(outV.val[1], meanV);
outV.val[2] = vsubq_f32(outV.val[2], meanV);
outV.val[3] = vsubq_f32(outV.val[3], meanV);
outV.val[0] = vaddq_f32(outV.val[0], noise0);
outV.val[1] = vaddq_f32(outV.val[1], noise1);
outV.val[2] = vaddq_f32(outV.val[2], noise2);
outV.val[3] = vaddq_f32(outV.val[3], noise3);
float* outputAligned = (float*)__builtin_assume_aligned(
&output[kChannel * planeSize + (point + j * kLoadPixels)],
sizeof(float32x4_t));
vst1q_f32(outputAligned + 0, outV.val[0]);
vst1q_f32(outputAligned + 4, outV.val[1]);
vst1q_f32(outputAligned + 8, outV.val[2]);
vst1q_f32(outputAligned + 12, outV.val[3]);
}
}
curNoise += (kLoadPixels * kOutputChannels) * kUnroll;
noiseUsed += (kLoadPixels * kOutputChannels) * kUnroll;
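// Wrap the noise pointer once a full cycle has been consumed. Keeping the
// offset modulo noiseCycle (rather than resetting to zero) preserves the
// periodicity of the noise sequence; the over-allocation done in
// initNoiseCPUNeon guarantees the reads issued before wrapping stay in
// bounds.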
if (noiseUsed >= noiseCycle) {
noiseUsed = 0;
curNoise = noise + ((curNoise - noise) % noiseCycle);
}
}
// Epilogue: non-vectorized remainder
for (; point < planeSize; ++point) {
for (int c = 0; c < kOutputChannels; ++c) {
float v = (float)input[point * kInputChannels + c];
output[c * planeSize + point] = v - mean[c] + *curNoise++;
++noiseUsed;
}
if (noiseUsed >= noiseCycle) {
noiseUsed = 0;
curNoise = noise + ((curNoise - noise) % noiseCycle);
}
}
}
#endif // defined(__ARM_NEON__) || defined(__ARM_NEON)
private:
Workspace* ws_;
};
namespace {
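// Saturating float-to-integer cast: values outside T's representable
// range are pinned to T's min/max instead of wrapping.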
template <typename T>
static inline T clamped_cast(float f) {
if (f >= std::numeric_limits<T>::max()) {
return std::numeric_limits<T>::max();
}
if (f <= std::numeric_limits<T>::min()) {
return std::numeric_limits<T>::min();
}
return static_cast<T>(f);
}
} // unnamed namespace
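// Inverse of the preprocess op: converts an NCHW float tensor with
// kInputChannels (BGR) planes back into a packed NHWC uint8 BGRA image,
// adding the per-channel mean, clamping to [0, 255], and forcing alpha
// to 255.
//
// A hypothetical OperatorDef in text format (blob names are illustrative):
//
//   op {
//     type: "BRGNCHWCToPackedInt8BGRAStylizerDeprocess"
//     input: "frame_nchw"  # float, NCHW, C == 3
//     input: "mean_bgr"    # float, 3 values
//     output: "frame_bgra" # uint8, NHWC, C == 4
//   }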
class BRGNCHWCToPackedInt8BGRAStylizerDeprocessOp
: public Operator<CPUContext> {
public:
using Operator<CPUContext>::Operator;
// Expect this many channels as input
static constexpr int kInputChannels = 3;
// Expect this many channels as output
static constexpr int kOutputChannels = 4;
bool RunOnDevice() override {
const auto& X = Input(0);
const auto& mean = Input(1);
CAFFE_ENFORCE(X.dim() == 4);
const int N = X.dim32(0), C = X.dim32(1), H = X.dim32(2), W = X.dim32(3);
// Assume BGR or BGRA
CAFFE_ENFORCE(mean.numel() == kInputChannels);
CAFFE_ENFORCE(C == kInputChannels);
// Output is packed BGRA (uint8); the alpha channel is filled with 255
auto* Y = Output(0, {N, H, W, kOutputChannels}, at::dtype<uint8_t>());
runBatch(
N,
C,
H,
W,
X.data<float>(),
mean.data<float>(),
Y->template mutable_data<uint8_t>());
return true;
}
void runBatch(
int N,
int /*C*/,
int H,
int W,
const float* input,
const float* meanChannel,
uint8_t* output) {
int planeSize = H * W;
for (int n = 0; n < N; ++n) {
auto curInput = input + n * kInputChannels * planeSize;
auto curOutput = output + n * kOutputChannels * planeSize;
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
runCPUNeon(H, W, curInput, meanChannel, curOutput);
#else
runCPU(H, W, curInput, meanChannel, curOutput);
#endif // defined(__ARM_NEON__) || defined(__ARM_NEON)
}
}
#if !defined(__ARM_NEON__) && !defined(__ARM_NEON)
void runCPU(
int H,
int W,
const float* input,
const float* meanChannel,
uint8_t* output) {
int planeSize = H * W;
for (int point = 0; point < planeSize; ++point) {
for (int c = 0; c < kInputChannels; ++c) {
uint8_t v = clamped_cast<uint8_t>(
input[c * planeSize + point] + meanChannel[c]);
output[point * kOutputChannels + c] = v;
}
// alpha
output[point * kOutputChannels + (kOutputChannels - 1)] =
std::numeric_limits<uint8_t>::max();
}
}
#endif // !defined(__ARM_NEON__) && !defined(__ARM_NEON)
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
void runCPUNeon(
int H,
int W,
const float* input,
const float* meanChannel,
uint8_t* output) {
// Vectorized load parameters:
// We load in chunks of this size
constexpr int kLoadUnit = sizeof(float32x4_t);
constexpr int kLoadFloats = (sizeof(float32x4_t) / sizeof(float));
// We store in chunks of this size
constexpr int kStoreUnit = sizeof(uint8x8x4_t);
// The vector portion loads this many f32 pixels at a time (8)
constexpr int kLoadPixels = 2 * kLoadFloats;
float mean[kInputChannels] = {
meanChannel[0], meanChannel[1], meanChannel[2]};
int planeSize = H * W;
// Vectorized portion
int point = 0;
// If the slice is not aligned, then we have to use the
// un-vectorized version
bool isAligned = isPointerAligned(input, kLoadUnit) &&
isPointerAligned(output, kStoreUnit) &&
// Because we are reading from input at offsets of planeSize,
// planeSize has to be a multiple of the vector width. (kLoadUnit is a
// byte count applied to a float count here, so this check is stricter
// than necessary, which is safe.)
(planeSize % kLoadUnit == 0);
// What portion the vectorized loop will handle
int limit = isAligned ? (planeSize / kLoadPixels) * kLoadPixels : 0;
for (; point < limit; point += kLoadPixels) {
// Load 8 f32 pixels from each channel; loading 16 at a time appears to
// cause register spills
float32x4_t inputc0_0 =
vld1q_f32_aligned(input + 0 * planeSize + point + 0 * kLoadFloats);
float32x4_t inputc0_1 =
vld1q_f32_aligned(input + 0 * planeSize + point + 1 * kLoadFloats);
float32x4_t inputc1_0 =
vld1q_f32_aligned(input + 1 * planeSize + point + 0 * kLoadFloats);
float32x4_t inputc1_1 =
vld1q_f32_aligned(input + 1 * planeSize + point + 1 * kLoadFloats);
float32x4_t inputc2_0 =
vld1q_f32_aligned(input + 2 * planeSize + point + 0 * kLoadFloats);
float32x4_t inputc2_1 =
vld1q_f32_aligned(input + 2 * planeSize + point + 1 * kLoadFloats);
addMeanAndClamp(inputc0_0, mean[0]);
addMeanAndClamp(inputc0_1, mean[0]);
uint8x8_t u8_c0 = convertNarrowAndPack(inputc0_0, inputc0_1);
addMeanAndClamp(inputc1_0, mean[1]);
addMeanAndClamp(inputc1_1, mean[1]);
uint8x8_t u8_c1 = convertNarrowAndPack(inputc1_0, inputc1_1);
addMeanAndClamp(inputc2_0, mean[2]);
addMeanAndClamp(inputc2_1, mean[2]);
uint8x8_t u8_c2 = convertNarrowAndPack(inputc2_0, inputc2_1);
// This is the alpha channel
uint8x8_t u8_c3 = vdup_n_u8(std::numeric_limits<uint8_t>::max());
// We now have 8 bytes of each channel in a separate vector
// Write BGRA interleaved to output
uint8x8x4_t u8_out = {{ u8_c0, u8_c1, u8_c2, u8_c3 }};
vst4_u8_aligned(output + kOutputChannels * point, u8_out);
}
// Epilogue: non-vectorized remainder
for (; point < planeSize; ++point) {
for (int c = 0; c < kInputChannels; ++c) {
uint8_t v =
clamped_cast<uint8_t>(input[c * planeSize + point] + mean[c]);
output[point * kOutputChannels + c] = v;
}
// alpha
output[point * kOutputChannels + (kOutputChannels - 1)] =
std::numeric_limits<uint8_t>::max();
}
}
#endif // defined(__ARM_NEON__) || defined(__ARM_NEON)
};
namespace {
REGISTER_CPU_OPERATOR(
PackedInt8BGRANHWCToNCHWCStylizerPreprocess,
PackedInt8BGRANHWCToNCHWCStylizerPreprocessOp);
OPERATOR_SCHEMA(PackedInt8BGRANHWCToNCHWCStylizerPreprocess)
.NumInputs(2)
.NumOutputs(1);
REGISTER_CPU_OPERATOR(
BRGNCHWCToPackedInt8BGRAStylizerDeprocess,
BRGNCHWCToPackedInt8BGRAStylizerDeprocessOp);
OPERATOR_SCHEMA(BRGNCHWCToPackedInt8BGRAStylizerDeprocess)
.NumInputs(2)
.NumOutputs(1);
#ifdef USE_MKLDNN
REGISTER_IDEEP_OPERATOR(
BRGNCHWCToPackedInt8BGRAStylizerDeprocess,
IDEEPFallbackOp<BRGNCHWCToPackedInt8BGRAStylizerDeprocessOp, SkipIndices<0>>);
REGISTER_IDEEP_OPERATOR(
PackedInt8BGRANHWCToNCHWCStylizerPreprocess,
IDEEPFallbackOp<PackedInt8BGRANHWCToNCHWCStylizerPreprocessOp>);
#endif
} // namespace
} // namespace caffe2