caffe2/image/transform_gpu.cu - platform/external/pytorch - Git at Google

 #include "caffe2/core/context_gpu.h"
 #include "caffe2/image/transform_gpu.h"
 #include "caffe2/utils/conversions.h"

 /**
  *
  * Copyright (c) 2016, NVIDIA CORPORATION, All rights reserved
  * Distributed under 2-clause BSD license; see accompanying LICENSE file
  *
  **/

 namespace caffe2 {

 namespace {

 // input in (int8, NHWC), output in (fp32, NCHW)
 template <typename In, typename Out>
 __global__ void transform_kernel(
     const int C,
     const int H,
     const int W,
     const float* mean,
     const float* std,
     const In* in,
     Out* out) {
   const auto n = blockIdx.x;

   const auto nStride = C*H*W;

   // pointers to data for this image
   const In *const input_ptr = &in[n*nStride];
   Out *const output_ptr = &out[n*nStride];

   // either read or write uncoalesced - try reading
   for (int c=0; c < C; ++c) {
     for (int h=threadIdx.y; h < H; h += blockDim.y) {
       for (int w=threadIdx.x; w < W; w += blockDim.x) {
         const int in_idx = c + C*w + C*W*h;  // HWC
         const int out_idx = c*H*W + h*W + w;  // CHW

         output_ptr[out_idx] = convert::To<float,Out>(
           (convert::To<In,float>(input_ptr[in_idx])-mean[c]) * std[c]);
       }
     }
   }
 }

 }

 template <typename T_IN, typename T_OUT, class Context>

 bool TransformOnGPU(
     Tensor& X,
     Tensor* Y,
     Tensor& mean,
     Tensor& std,
     Context* context) {
   const int N = X.dim32(0), C = X.dim32(3), H = X.dim32(1), W = X.dim32(2);
   auto* input_data = X.template data<T_IN>();
   auto* output_data = Y->template mutable_data<T_OUT>();

   transform_kernel<
     T_IN, T_OUT><<<N, dim3(16, 16), 0, context->cuda_stream()>>>(
       C, H, W, mean.template data<float>(), std.template data<float>(),
       input_data, output_data);
   C10_CUDA_KERNEL_LAUNCH_CHECK();

   return true;
 };

 template bool TransformOnGPU<uint8_t, float, CUDAContext>(
     Tensor& X,
     Tensor* Y,
     Tensor& mean,
     Tensor& std,
     CUDAContext* context);

 template bool TransformOnGPU<uint8_t, at::Half, CUDAContext>(
     Tensor& X,
     Tensor* Y,
     Tensor& mean,
     Tensor& std,
     CUDAContext* context);

 }  // namespace caffe2
	#include "caffe2/core/context_gpu.h"
	#include "caffe2/image/transform_gpu.h"
	#include "caffe2/utils/conversions.h"

	/**
	*
	* Copyright (c) 2016, NVIDIA CORPORATION, All rights reserved
	* Distributed under 2-clause BSD license; see accompanying LICENSE file
	*
	**/

	namespace caffe2 {

	namespace {

	// input in (int8, NHWC), output in (fp32, NCHW)
	template <typename In, typename Out>
	__global__ void transform_kernel(
	const int C,
	const int H,
	const int W,
	const float* mean,
	const float* std,
	const In* in,
	Out* out) {
	const auto n = blockIdx.x;

	const auto nStride = CHW;

	// pointers to data for this image
	const In const input_ptr = &in[nnStride];
	Out const output_ptr = &out[nnStride];

	// either read or write uncoalesced - try reading
	for (int c=0; c < C; ++c) {
	for (int h=threadIdx.y; h < H; h += blockDim.y) {
	for (int w=threadIdx.x; w < W; w += blockDim.x) {
	const int in_idx = c + Cw + CW*h; // HWC
	const int out_idx = cHW + h*W + w; // CHW

	output_ptr[out_idx] = convert::To<float,Out>(
	(convert::To<In,float>(input_ptr[in_idx])-mean[c]) * std[c]);
	}
	}
	}
	}

	}

	template <typename T_IN, typename T_OUT, class Context>

	bool TransformOnGPU(
	Tensor& X,
	Tensor* Y,
	Tensor& mean,
	Tensor& std,
	Context* context) {
	const int N = X.dim32(0), C = X.dim32(3), H = X.dim32(1), W = X.dim32(2);
	auto* input_data = X.template data<T_IN>();
	auto* output_data = Y->template mutable_data<T_OUT>();

	transform_kernel<
	T_IN, T_OUT><<<N, dim3(16, 16), 0, context->cuda_stream()>>>(
	C, H, W, mean.template data<float>(), std.template data<float>(),
	input_data, output_data);
	C10_CUDA_KERNEL_LAUNCH_CHECK();

	return true;
	};

	template bool TransformOnGPU<uint8_t, float, CUDAContext>(
	Tensor& X,
	Tensor* Y,
	Tensor& mean,
	Tensor& std,
	CUDAContext* context);

	template bool TransformOnGPU<uint8_t, at::Half, CUDAContext>(
	Tensor& X,
	Tensor* Y,
	Tensor& mean,
	Tensor& std,
	CUDAContext* context);

	} // namespace caffe2