caffe2/utils/math/broadcast.cu - platform/external/pytorch - Git at Google

 #include "caffe2/utils/math/broadcast.h"

 #include "caffe2/core/context_gpu.h"
 #include "caffe2/utils/math/utils.h"

 namespace caffe2 {
 namespace math {

 namespace {

 template <typename T>
 __global__ void AffineChannelNCHWCUDAKernel(
     const int C,
     const int M,
     const int HxW,
     const T* X,
     const T* scale,
     const T* bias,
     T* Y);

 template <>
 __global__ void AffineChannelNCHWCUDAKernel<float>(
     const int C,
     const int M,
     const int HxW,
     const float* X,
     const float* scale,
     const float* bias,
     float* Y) {
   const int nc = blockIdx.x / M;
   const int c = nc % C;
   const int w = blockIdx.x % M * CAFFE_CUDA_NUM_THREADS + threadIdx.x;
   if (w < HxW) {
     const int index = nc * HxW + w;
 #if __CUDA_ARCH__ >= 350 || defined(USE_ROCM)
     Y[index] = fmaf(__ldg(X + index), __ldg(scale + c), __ldg(bias + c));
 #else
     Y[index] = fmaf(X[index], scale[c], bias[c]);
 #endif
   }
 }

 template <typename T>
 __global__ void AffineChannelNHWCCUDAKernel(
     const int C,
     const T* X,
     const T* scale,
     const T* bias,
     T* Y);

 template <>
 __global__ void AffineChannelNHWCCUDAKernel<float>(
     const int C,
     const float* X,
     const float* scale,
     const float* bias,
     float* Y) {
   const int c = blockIdx.y * CAFFE_CUDA_NUM_THREADS + threadIdx.x;
   if (c < C) {
     const int index = blockIdx.x * C + c;
 #if __CUDA_ARCH__ >= 350 || defined(USE_ROCM)
     Y[index] = fmaf(__ldg(X + index), __ldg(scale + c), __ldg(bias + c));
 #else
     Y[index] = fmaf(X[index], scale[c], bias[c]);
 #endif
   }
 }

 } // namespace

 #define CAFFE2_SPECIALIZED_CUDA_AFFINE_CHANNEL(T)                            \
   template <>                                                                \
   CAFFE2_CUDA_EXPORT void AffineChannel<T, CUDAContext, StorageOrder::NCHW>( \
       const int N,                                                           \
       const int C,                                                           \
       const int HxW,                                                         \
       const T* X,                                                            \
       const T* scale,                                                        \
       const T* bias,                                                         \
       T* Y,                                                                  \
       CUDAContext* context) {                                                \
     const int M = DivUp(HxW, CAFFE_CUDA_NUM_THREADS);                        \
     AffineChannelNCHWCUDAKernel<T>                                           \
         <<<N * C * M, CAFFE_CUDA_NUM_THREADS, 0, context->cuda_stream()>>>(  \
             C, M, HxW, X, scale, bias, Y);                                   \
     C10_CUDA_KERNEL_LAUNCH_CHECK();                                          \
   }                                                                          \
   template <>                                                                \
   CAFFE2_CUDA_EXPORT void AffineChannel<T, CUDAContext, StorageOrder::NHWC>( \
       const int N,                                                           \
       const int C,                                                           \
       const int HxW,                                                         \
       const T* X,                                                            \
       const T* scale,                                                        \
       const T* bias,                                                         \
       T* Y,                                                                  \
       CUDAContext* context) {                                                \
     const int M = DivUp(C, CAFFE_CUDA_NUM_THREADS);                          \
     AffineChannelNHWCCUDAKernel<T>                                           \
         <<<dim3(N* HxW, M),                                                  \
            CAFFE_CUDA_NUM_THREADS,                                           \
            0,                                                                \
            context->cuda_stream()>>>(C, X, scale, bias, Y);                  \
     C10_CUDA_KERNEL_LAUNCH_CHECK();                                          \
   }
 CAFFE2_SPECIALIZED_CUDA_AFFINE_CHANNEL(float)
 #undef CAFFE2_SPECIALIZED_CUDA_AFFINE_CHANNEL

 } // namespace math
 } // namespace caffe2
	#include "caffe2/utils/math/broadcast.h"

	#include "caffe2/core/context_gpu.h"
	#include "caffe2/utils/math/utils.h"

	namespace caffe2 {
	namespace math {

	namespace {

	template <typename T>
	__global__ void AffineChannelNCHWCUDAKernel(
	const int C,
	const int M,
	const int HxW,
	const T* X,
	const T* scale,
	const T* bias,
	T* Y);

	template <>
	__global__ void AffineChannelNCHWCUDAKernel<float>(
	const int C,
	const int M,
	const int HxW,
	const float* X,
	const float* scale,
	const float* bias,
	float* Y) {
	const int nc = blockIdx.x / M;
	const int c = nc % C;
	const int w = blockIdx.x % M * CAFFE_CUDA_NUM_THREADS + threadIdx.x;
	if (w < HxW) {
	const int index = nc * HxW + w;
	#if __CUDA_ARCH__ >= 350 \|\| defined(USE_ROCM)
	Y[index] = fmaf(__ldg(X + index), __ldg(scale + c), __ldg(bias + c));
	#else
	Y[index] = fmaf(X[index], scale[c], bias[c]);
	#endif
	}
	}

	template <typename T>
	__global__ void AffineChannelNHWCCUDAKernel(
	const int C,
	const T* X,
	const T* scale,
	const T* bias,
	T* Y);

	template <>
	__global__ void AffineChannelNHWCCUDAKernel<float>(
	const int C,
	const float* X,
	const float* scale,
	const float* bias,
	float* Y) {
	const int c = blockIdx.y * CAFFE_CUDA_NUM_THREADS + threadIdx.x;
	if (c < C) {
	const int index = blockIdx.x * C + c;
	#if __CUDA_ARCH__ >= 350 \|\| defined(USE_ROCM)
	Y[index] = fmaf(__ldg(X + index), __ldg(scale + c), __ldg(bias + c));
	#else
	Y[index] = fmaf(X[index], scale[c], bias[c]);
	#endif
	}
	}

	} // namespace

	#define CAFFE2_SPECIALIZED_CUDA_AFFINE_CHANNEL(T) \
	template <> \
	CAFFE2_CUDA_EXPORT void AffineChannel<T, CUDAContext, StorageOrder::NCHW>( \
	const int N, \
	const int C, \
	const int HxW, \
	const T* X, \
	const T* scale, \
	const T* bias, \
	T* Y, \
	CUDAContext* context) { \
	const int M = DivUp(HxW, CAFFE_CUDA_NUM_THREADS); \
	AffineChannelNCHWCUDAKernel<T> \
	<<<N * C * M, CAFFE_CUDA_NUM_THREADS, 0, context->cuda_stream()>>>( \
	C, M, HxW, X, scale, bias, Y); \
	C10_CUDA_KERNEL_LAUNCH_CHECK(); \
	} \
	template <> \
	CAFFE2_CUDA_EXPORT void AffineChannel<T, CUDAContext, StorageOrder::NHWC>( \
	const int N, \
	const int C, \
	const int HxW, \
	const T* X, \
	const T* scale, \
	const T* bias, \
	T* Y, \
	CUDAContext* context) { \
	const int M = DivUp(C, CAFFE_CUDA_NUM_THREADS); \
	AffineChannelNHWCCUDAKernel<T> \
	<<<dim3(N* HxW, M), \
	CAFFE_CUDA_NUM_THREADS, \
	0, \
	context->cuda_stream()>>>(C, X, scale, bias, Y); \
	C10_CUDA_KERNEL_LAUNCH_CHECK(); \
	}
	CAFFE2_SPECIALIZED_CUDA_AFFINE_CHANNEL(float)
	#undef CAFFE2_SPECIALIZED_CUDA_AFFINE_CHANNEL

	} // namespace math
	} // namespace caffe2