| #include "caffe2/operators/relu_op.h" |
| |
#include <algorithm>
#include <functional>
#include <numeric>
| |
| #include "caffe2/core/context_gpu.h" |
| #include "caffe2/utils/math.h" |
| |
| namespace caffe2 { |
| |
| namespace { |
| |
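// When compiling with hipcc, the packed pair-of-half type is spelled
// __half2; alias it to half2 so the half2 kernel specializations below also
// build for ROCm.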
| #ifdef __HIPCC__ |
| using half2 = __half2; |
| #endif // __HIPCC__ |
| |
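// Generic ReLU kernel: each thread handles at most one element. The float
// instantiation is generated by the macro below and delegates to fmaxf;
// half and half2 get hand-written specializations further down.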
| template <typename T> |
| __global__ void ReluCUDAKernel(const int N, const T* X, T* Y); |
| |
| #define DELEGATE_RELU_CUDA_KERNEL(T, MaxFunc) \ |
| template <> \ |
| __global__ void ReluCUDAKernel<T>(const int N, const T* X, T* Y) { \ |
| const int i = blockIdx.x * CAFFE_CUDA_NUM_THREADS + threadIdx.x; \ |
| if (i < N) { \ |
| Y[i] = MaxFunc(X[i], T(0)); \ |
| } \ |
| } |
| DELEGATE_RELU_CUDA_KERNEL(float, fmaxf) |
| #undef DELEGATE_RELU_CUDA_KERNEL |
| |
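// half specialization. Native fp16 comparison (__hgt) is only available on
// sm_53+ (or sufficiently recent ROCm); older architectures compare in
// float instead.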
| template <> |
| __global__ void ReluCUDAKernel<half>(const int N, const half* X, half* Y) { |
| const int i = blockIdx.x * CAFFE_CUDA_NUM_THREADS + threadIdx.x; |
| if (i < N) { |
| const half kZero = __float2half(0.0f); |
| #if __CUDA_ARCH__ >= 530 || TORCH_HIP_VERSION >= 300 |
| Y[i] = __hgt(__ldg(X + i), kZero) ? __ldg(X + i) : kZero; |
| #else |
| Y[i] = (__half2float(X[i]) > 0) ? X[i] : kZero; |
| #endif |
| } |
| } |
| |
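// half2 specialization: each thread handles a packed pair of fp16 values.
// __hgt2 produces 1.0 or 0.0 per lane, so multiplying the mask by X
// implements ReLU without a branch.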
| template <> |
| __global__ void ReluCUDAKernel<half2>(const int N, const half2* X, half2* Y) { |
| const int i = blockIdx.x * CAFFE_CUDA_NUM_THREADS + threadIdx.x; |
| if (i < N) { |
| const half2 kZero = __float2half2_rn(0.0f); |
| #if __CUDA_ARCH__ >= 530 || TORCH_HIP_VERSION >= 300 |
| Y[i] = __hmul2(__hgt2(__ldg(X + i), kZero), __ldg(X + i)); |
| #else |
| const float2 xx = __half22float2(X[i]); |
    // The explicit casts to float are needed because on ROCm the conditional
    // expression can otherwise be ambiguous, which sometimes triggers:
    //
    // error: conditional expression is ambiguous; 'const hip_impl::Scalar_accessor<float, Native_vec_, 0>' can be
    // converted to 'float' and vice versa
| Y[i] = __floats2half2_rn(xx.x > 0.0f ? static_cast<float>(xx.x) : 0.0f, |
| xx.y > 0.0f ? static_cast<float>(xx.y) : 0.0f); |
| #endif |
| } |
| } |
| |
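// ReLU gradient: dX = dY where Y > 0, and 0 elsewhere. __ldg reads Y and dY
// through the read-only cache where the architecture supports it.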
| template <typename T> |
| __global__ void |
| ReluGradientCUDAKernel(const int N, const T* dY, const T* Y, T* dX) { |
| const int i = blockIdx.x * CAFFE_CUDA_NUM_THREADS + threadIdx.x; |
| if (i < N) { |
| #if __CUDA_ARCH__ >= 350 || TORCH_HIP_VERSION >= 300 |
| dX[i] = __ldg(Y + i) > T(0) ? __ldg(dY + i) : T(0); |
| #else |
| dX[i] = Y[i] > T(0) ? dY[i] : T(0); |
| #endif |
| } |
| } |
| |
| template <> |
| __global__ void ReluGradientCUDAKernel<half>( |
| const int N, |
| const half* dY, |
| const half* Y, |
| half* dX) { |
| const int i = blockIdx.x * CAFFE_CUDA_NUM_THREADS + threadIdx.x; |
| if (i < N) { |
| const half kZero = __float2half(0.0f); |
| #if __CUDA_ARCH__ >= 530 || TORCH_HIP_VERSION >= 300 |
| dX[i] = __hgt(__ldg(Y + i), kZero) ? __ldg(dY + i) : kZero; |
| #else |
| dX[i] = (__half2float(Y[i]) > 0) ? dY[i] : kZero; |
| #endif |
| } |
| } |
| |
| template <> |
| __global__ void ReluGradientCUDAKernel<half2>( |
| const int N, |
| const half2* dY, |
| const half2* Y, |
| half2* dX) { |
| const int i = blockIdx.x * CAFFE_CUDA_NUM_THREADS + threadIdx.x; |
| if (i < N) { |
| const half2 kZero = __float2half2_rn(0.0f); |
| #if __CUDA_ARCH__ >= 530 || TORCH_HIP_VERSION >= 300 |
| dX[i] = __hmul2(__hgt2(__ldg(Y + i), kZero), __ldg(dY + i)); |
| #else |
| const float2 dy = __half22float2(dY[i]); |
| const float2 yy = __half22float2(Y[i]); |
    // The explicit casts to float are needed because on ROCm the conditional
    // expression can otherwise be ambiguous, which sometimes triggers:
    //
    // error: conditional expression is ambiguous; 'const hip_impl::Scalar_accessor<float, Native_vec_, 1>' can be
    // converted to 'float' and vice versa
| dX[i] = __floats2half2_rn(yy.x > 0.0f ? static_cast<float>(dy.x) : 0.0f, |
| yy.y > 0.0f ? static_cast<float>(dy.y) : 0.0f); |
| #endif |
| } |
| } |
| |
| } // namespace |
| |
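// Forward functor: launches one thread per element, with
// CAFFE_CUDA_NUM_THREADS threads per block.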
| template <> |
| template <typename T> |
| bool ReluFunctor<CUDAContext>:: |
| operator()(const int N, const T* X, T* Y, CUDAContext* context) const { |
| if (N > 0) { |
| const int M = math::DivUp(N, CAFFE_CUDA_NUM_THREADS); |
| ReluCUDAKernel<T> |
| <<<M, CAFFE_CUDA_NUM_THREADS, 0, context->cuda_stream()>>>(N, X, Y); |
| C10_CUDA_KERNEL_LAUNCH_CHECK(); |
| } |
| return true; |
| } |
| |
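// at::Half specialization: when N is even, the buffers are reinterpreted as
// half2 so each thread processes two elements; odd N falls back to the
// scalar half kernel.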
| template <> |
| template <> |
| bool ReluFunctor<CUDAContext>::operator()<at::Half>( |
| const int N, |
| const at::Half* X, |
| at::Half* Y, |
| CUDAContext* context) const { |
| if (N == 0) { |
| return true; |
| } |
| if (N % 2 == 0) { |
| const int M = math::DivUp(N / 2, CAFFE_CUDA_NUM_THREADS); |
| ReluCUDAKernel<half2> |
| <<<M, CAFFE_CUDA_NUM_THREADS, 0, context->cuda_stream()>>>( |
| N / 2, |
| reinterpret_cast<const half2*>(X), |
| reinterpret_cast<half2*>(Y)); |
| C10_CUDA_KERNEL_LAUNCH_CHECK(); |
| } else { |
| const int M = math::DivUp(N, CAFFE_CUDA_NUM_THREADS); |
| ReluCUDAKernel<half> |
| <<<M, CAFFE_CUDA_NUM_THREADS, 0, context->cuda_stream()>>>( |
| N, reinterpret_cast<const half*>(X), reinterpret_cast<half*>(Y)); |
| C10_CUDA_KERNEL_LAUNCH_CHECK(); |
| } |
| return true; |
| } |
| |
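// Gradient functor: the gradient operator's Forward computes dX from Y and
// dY; N is the total element count derived from Y's dimensions.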
| template <> |
| template <typename T> |
| bool ReluGradientFunctor<CUDAContext>::Forward( |
| const std::vector<int>& Y_dims, |
| const std::vector<int>& /* dY_dims */, |
| const T* Y, |
| const T* dY, |
| T* dX, |
| CUDAContext* context) const { |
| const int N = std::accumulate( |
| Y_dims.cbegin(), Y_dims.cend(), 1, std::multiplies<int>()); |
| if (N > 0) { |
| const int M = math::DivUp(N, CAFFE_CUDA_NUM_THREADS); |
| ReluGradientCUDAKernel<T> |
| <<<M, CAFFE_CUDA_NUM_THREADS, 0, context->cuda_stream()>>>( |
| N, dY, Y, dX); |
| C10_CUDA_KERNEL_LAUNCH_CHECK(); |
| } |
| return true; |
| } |
| |
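// at::Half gradient specialization: same even/odd dispatch as the forward
// pass, using the packed half2 kernel when N is even.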
| template <> |
| template <> |
| bool ReluGradientFunctor<CUDAContext>::Forward<at::Half>( |
| const std::vector<int>& Y_dims, |
| const std::vector<int>& /* dY_dims */, |
| const at::Half* Y, |
| const at::Half* dY, |
| at::Half* dX, |
| CUDAContext* context) const { |
| const int N = std::accumulate( |
| Y_dims.cbegin(), Y_dims.cend(), 1, std::multiplies<int>()); |
| if (N == 0) { |
| return true; |
| } |
| if (N % 2 == 0) { |
| const int M = math::DivUp(N / 2, CAFFE_CUDA_NUM_THREADS); |
| ReluGradientCUDAKernel<half2> |
| <<<M, CAFFE_CUDA_NUM_THREADS, 0, context->cuda_stream()>>>( |
| N / 2, |
| reinterpret_cast<const half2*>(dY), |
| reinterpret_cast<const half2*>(Y), |
| reinterpret_cast<half2*>(dX)); |
| C10_CUDA_KERNEL_LAUNCH_CHECK(); |
| } else { |
| const int M = math::DivUp(N, CAFFE_CUDA_NUM_THREADS); |
| ReluGradientCUDAKernel<half> |
| <<<M, CAFFE_CUDA_NUM_THREADS, 0, context->cuda_stream()>>>( |
| N, |
| reinterpret_cast<const half*>(dY), |
| reinterpret_cast<const half*>(Y), |
| reinterpret_cast<half*>(dX)); |
| C10_CUDA_KERNEL_LAUNCH_CHECK(); |
| } |
| return true; |
| } |
| |
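// Register the CUDA implementations for float and at::Half inputs.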
| REGISTER_CUDA_OPERATOR( |
| Relu, |
| UnaryElementwiseOp< |
| TensorTypes<float, at::Half>, |
| CUDAContext, |
| ReluFunctor<CUDAContext>>); |
| REGISTER_CUDA_OPERATOR( |
| ReluGradient, |
| BinaryElementwiseOp< |
| TensorTypes<float, at::Half>, |
| CUDAContext, |
| ReluGradientFunctor<CUDAContext>>); |
| |
| } // namespace caffe2 |