#include "caffe2/operators/elementwise_ops.h"
#include "caffe2/utils/cub_namespace.cuh"
#include <cub/block/block_load.cuh>
#include <cub/block/block_reduce.cuh>
#include <cub/device/device_reduce.cuh>
#include "caffe2/core/common_gpu.h"
#include "caffe2/core/context_gpu.h"
#include "caffe2/utils/conversions.h"
#ifdef __HIPCC__
#if TORCH_HIP_VERSION < 210
// rocblas doesn't fully support fp16 yet
#define ROCBLAS_FP16 0
#endif
#endif
namespace caffe2 {
REGISTER_CUDA_OPERATOR(
    Not,
    UnaryElementwiseOp<BoolTypes, CUDAContext, NotFunctor<CUDAContext>>);

REGISTER_CUDA_OPERATOR(
    Sign,
    UnaryElementwiseOp<NumericTypes, CUDAContext, SignFunctor<CUDAContext>>);
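
// Comparison operators accept the common numeric and bool element types;
// the FixedType<bool> template argument pins the output type to bool
// regardless of the input element type.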
#define REGISTER_CUDA_COMPARE_OPERATOR(Op)                    \
  REGISTER_CUDA_OPERATOR(                                     \
      Op,                                                     \
      BinaryElementwiseOp<                                    \
          TensorTypes<bool, int32_t, int64_t, float, double>, \
          CUDAContext,                                        \
          Op##Functor<CUDAContext>,                           \
          FixedType<bool>>)
REGISTER_CUDA_COMPARE_OPERATOR(EQ);
REGISTER_CUDA_COMPARE_OPERATOR(NE);
REGISTER_CUDA_COMPARE_OPERATOR(LT);
REGISTER_CUDA_COMPARE_OPERATOR(LE);
REGISTER_CUDA_COMPARE_OPERATOR(GT);
REGISTER_CUDA_COMPARE_OPERATOR(GE);
#undef REGISTER_CUDA_COMPARE_OPERATOR
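
// Logical binary operators accept bool inputs only; the output type defaults
// to the input type, so no FixedType argument is needed here.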
#define REGISTER_CUDA_LOGICAL_BINARY_OPERATOR(Op) \
  REGISTER_CUDA_OPERATOR(                         \
      Op,                                         \
      BinaryElementwiseOp<BoolTypes, CUDAContext, Op##Functor<CUDAContext>>)
REGISTER_CUDA_LOGICAL_BINARY_OPERATOR(And);
REGISTER_CUDA_LOGICAL_BINARY_OPERATOR(Or);
REGISTER_CUDA_LOGICAL_BINARY_OPERATOR(Xor);
#undef REGISTER_CUDA_LOGICAL_BINARY_OPERATOR
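
// Bitwise operators are defined over the integral and bool element types
// (IntBoolTypes); floating-point inputs are excluded.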
#define REGISTER_CUDA_BITWISE_BINARY_OPERATOR(Op) \
  REGISTER_CUDA_OPERATOR(                         \
      Op,                                         \
      BinaryElementwiseOp<                        \
          IntBoolTypes,                           \
          CUDAContext,                            \
          Op##Functor<CUDAContext>>)
REGISTER_CUDA_BITWISE_BINARY_OPERATOR(BitwiseAnd);
REGISTER_CUDA_BITWISE_BINARY_OPERATOR(BitwiseOr);
REGISTER_CUDA_BITWISE_BINARY_OPERATOR(BitwiseXor);
#undef REGISTER_CUDA_BITWISE_BINARY_OPERATOR
namespace {
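
// Sums a [pre, N] input down to an [N] output (the post == 1 case of
// SumReduceLike): one thread per output element walks the pre dimension,
// accumulating in float for precision even when T is at::Half.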
template <typename T>
__global__ void
reduce_sum_like_post1(const T* g_idata, T* g_odata, int pre, int N) {
  int n = blockIdx.x * blockDim.x + threadIdx.x;
  if (n >= N) {
    return;
  }
  float sum = 0.0;
  for (int i = 0; i < pre; ++i) {
    sum += convert::To<T, float>(g_idata[i * N + n]);
  }
  g_odata[n] = convert::To<float, T>(sum);
}
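
// Full reduction of N elements to a single value via cub::DeviceReduce::Sum.
// The first call only queries the temporary-storage size; the scratch tensor
// is then resized (rounding the byte count up to a whole number of T
// elements) and used for the actual reduction.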
template <typename T>
void device_reduce(
    const T* d_in,
    T* d_out,
    int N,
    Tensor* buffer,
    CUDAContext* context) {
  // Determine temporary device storage requirements.
  size_t temp_storage_bytes = 0;
  cub::DeviceReduce::Sum(
      nullptr, temp_storage_bytes, d_in, d_out, N, context->cuda_stream());

  // Round the byte count up to a whole number of T elements.
  auto buffer_size = temp_storage_bytes / sizeof(T);
  buffer_size += temp_storage_bytes % sizeof(T) != 0 ? 1 : 0;
  buffer->Resize(buffer_size);
  void* d_temp_storage = static_cast<void*>(buffer->template mutable_data<T>());

  // Run the sum-reduction.
  cub::DeviceReduce::Sum(
      d_temp_storage,
      temp_storage_bytes,
      d_in,
      d_out,
      N,
      context->cuda_stream());
}
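
// at::Half specialization: there is no cub path here, so the sum is computed
// as a dot product against a single broadcast 1.0 (a stride-0 "ones" vector),
// using rocblas_hdot on ROCm builds with TORCH_HIP_VERSION >= 210 and
// cublasDotEx (with float accumulation) on CUDA builds.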
template <>
void device_reduce<at::Half>(
    const at::Half* in,
    at::Half* out,
    int N,
    Tensor* buffer,
    CUDAContext* context) {
  (void)in; // Suppress unused variable warning
  (void)out; // Suppress unused variable warning
  (void)N; // Suppress unused variable warning
  (void)buffer; // Suppress unused variable warning
  (void)context; // Suppress unused variable warning
// Note: the guards must check for an HIP build explicitly; a bare
// TORCH_HIP_VERSION test evaluates to 0 on CUDA builds, which would wrongly
// route them into the CAFFE_THROW branch below.
#if defined(__HIPCC__) && TORCH_HIP_VERSION >= 210
  auto buffer_size = 1;
  if (buffer->numel() != buffer_size) {
    buffer->Resize(buffer_size);
    // Fill the single-element "ones" vector; the dot product below reads it
    // with stride 0, so one element is all that is needed.
    math::Set<at::Half, CUDAContext>(
        buffer_size,
        convert::To<float, at::Half>(1.),
        buffer->template mutable_data<at::Half>(),
        context);
  }
  CUBLAS_ENFORCE(rocblas_hdot(
      context->cublas_handle(),
      N,
      reinterpret_cast<const rocblas_half*>(in),
      1,
      reinterpret_cast<const rocblas_half*>(buffer->data<at::Half>()),
      0,
      reinterpret_cast<rocblas_half*>(out)));
#elif defined(__HIPCC__)
  CAFFE_THROW("HIP rocblas doesn't fully support fp16 device_reduce yet.");
#else
  auto buffer_size = 1;
  if (buffer->numel() != buffer_size) {
    buffer->Resize(buffer_size);
    // As above: only the first element of the stride-0 "ones" vector is read.
    math::Set<at::Half, CUDAContext>(
        buffer_size,
        convert::To<float, at::Half>(1.),
        buffer->template mutable_data<at::Half>(),
        context);
  }
  CUBLAS_ENFORCE(cublasDotEx(
      context->cublas_handle(),
      N,
      in,
      CUDA_R_16F,
      1,
      buffer->data<at::Half>(),
      CUDA_R_16F,
      0,
      out,
      CUDA_R_16F,
      CUDA_R_32F));
#endif
}
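
// General case (post > 1): one block per output element n. Each thread
// strides over the pre * post input positions that map to n, and a
// cub::BlockReduce then combines the per-thread partial sums in shared
// memory.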
template <typename T, int BLOCK_THREADS>
__global__ void
reduce_sum_like(const T* g_idata, T* g_odata, int pre, int N, int post) {
  int n = blockIdx.x;
  float sum = 0.0;
  int limit = pre * post;
  for (int i = threadIdx.x; i < limit; i += blockDim.x) {
    int curPre = i / post;
    int curPost = i % post;
    sum +=
        convert::To<T, float>(g_idata[curPre * N * post + n * post + curPost]);
  }
  // Uses a shared-memory reduction within the block.
  typedef cub::BlockReduce<float, BLOCK_THREADS> BlockReduceT;
  // Shared memory
  __shared__ typename BlockReduceT::TempStorage temp_storage;
  float aggregate = BlockReduceT(temp_storage).Sum(sum);
  if (threadIdx.x == 0) {
    g_odata[n] = convert::To<float, T>(aggregate);
  }
}
} // namespace
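
// SumReduceLike reduces A down to the shape of B under Caffe2's legacy
// broadcast semantics: the shape is factored into [pre, n, post] and the
// pre/post dimensions are summed away, with the kernel chosen by the size
// of post.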
template <>
template <typename T>
bool SumReduceLikeOp<CUDAContext>::DoRunWithType() {
  const auto& A = Input(0);
  const auto& B = Input(1);
  auto* C = Output(0);
  auto count = A.size();
  CAFFE_ENFORCE(&B != C, "In-place is not allowed.");
  C->ResizeLike(B);
  const T* Adata = A.template data<T>();
  auto* Cdata = C->template mutable_data<T>();
  if (C->size() == 0) {
    // The output is empty; there is nothing to do, so skip the kernel
    // launches entirely.
    return true;
  }
  if (B.size() == 1) {
    device_reduce<T>(Adata, Cdata, count, &sum_buffer_, &context_);
  } else {
    size_t pre, n, post;
    std::tie(pre, n, post) =
        elementwise_ops_utils::ComputeLegacyBroadcastSizes(A, B, axis_);
    // Because shape(B) has already been checked to be contained in shape(A),
    // pre and post cannot both be 1 at the same time.
    if (post == 1) {
      reduce_sum_like_post1<T>
          <<<CAFFE_GET_BLOCKS(n),
             CAFFE_CUDA_NUM_THREADS,
             0,
             context_.cuda_stream()>>>(Adata, Cdata, pre, n);
      C10_CUDA_KERNEL_LAUNCH_CHECK();
    } else {
      // Pick a block size roughly matched to the amount of work per output
      // element (pre * post input values per block).
      if (post >= 128) {
        reduce_sum_like<T, 512>
            <<<n, 512, 0, context_.cuda_stream()>>>(Adata, Cdata, pre, n, post);
        C10_CUDA_KERNEL_LAUNCH_CHECK();
      } else if (post >= 64) {
        reduce_sum_like<T, 128>
            <<<n, 128, 0, context_.cuda_stream()>>>(Adata, Cdata, pre, n, post);
        C10_CUDA_KERNEL_LAUNCH_CHECK();
      } else if (post >= 32) {
        reduce_sum_like<T, 64>
            <<<n, 64, 0, context_.cuda_stream()>>>(Adata, Cdata, pre, n, post);
        C10_CUDA_KERNEL_LAUNCH_CHECK();
      } else {
        reduce_sum_like<T, 32>
            <<<n, 32, 0, context_.cuda_stream()>>>(Adata, Cdata, pre, n, post);
        C10_CUDA_KERNEL_LAUNCH_CHECK();
      }
    }
  }
  return true;
}
template <>
bool SumReduceLikeOp<CUDAContext>::RunOnDevice() {
  return DispatchHelper<TensorTypes<float, at::Half>>::call(this, Input(0));
}
REGISTER_CUDA_OPERATOR(SumReduceLike, SumReduceLikeOp<CUDAContext>);
} // namespace caffe2