#include "caffe2/operators/elementwise_ops.h"
#include "caffe2/utils/cub_namespace.cuh"
#include <cub/block/block_load.cuh>
#include <cub/block/block_reduce.cuh>
#include <cub/device/device_reduce.cuh>
#include "caffe2/core/common_gpu.h"
#include "caffe2/core/context_gpu.h"
#include "caffe2/utils/conversions.h"
#ifdef __HIPCC__
#if TORCH_HIP_VERSION < 210
// rocblas doesn't fully support fp16 yet
#define ROCBLAS_FP16 0
#endif
#endif
namespace caffe2 {
REGISTER_CUDA_OPERATOR(
    Not,
    UnaryElementwiseOp<BoolTypes, CUDAContext, NotFunctor<CUDAContext>>);

REGISTER_CUDA_OPERATOR(
    Sign,
    UnaryElementwiseOp<NumericTypes, CUDAContext, SignFunctor<CUDAContext>>);
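
// Comparison operators accept the common numeric and bool element types;
// the FixedType<bool> template argument pins the output type to bool
// regardless of the input element type.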
#define REGISTER_CUDA_COMPARE_OPERATOR(Op)                    \
  REGISTER_CUDA_OPERATOR(                                     \
      Op,                                                     \
      BinaryElementwiseOp<                                    \
          TensorTypes<bool, int32_t, int64_t, float, double>, \
          CUDAContext,                                        \
          Op##Functor<CUDAContext>,                           \
          FixedType<bool>>)
REGISTER_CUDA_COMPARE_OPERATOR(EQ);
REGISTER_CUDA_COMPARE_OPERATOR(NE);
REGISTER_CUDA_COMPARE_OPERATOR(LT);
REGISTER_CUDA_COMPARE_OPERATOR(LE);
REGISTER_CUDA_COMPARE_OPERATOR(GT);
REGISTER_CUDA_COMPARE_OPERATOR(GE);
#undef REGISTER_CUDA_COMPARE_OPERATOR
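
// Logical binary operators accept bool inputs only; the output type defaults
// to the input type, so no FixedType argument is needed here.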
#define REGISTER_CUDA_LOGICAL_BINARY_OPERATOR(Op) \
  REGISTER_CUDA_OPERATOR(                         \
      Op,                                         \
      BinaryElementwiseOp<BoolTypes, CUDAContext, Op##Functor<CUDAContext>>)
REGISTER_CUDA_LOGICAL_BINARY_OPERATOR(And);
REGISTER_CUDA_LOGICAL_BINARY_OPERATOR(Or);
REGISTER_CUDA_LOGICAL_BINARY_OPERATOR(Xor);
#undef REGISTER_CUDA_LOGICAL_BINARY_OPERATOR
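
// Bitwise operators are defined over the integral and bool element types
// (IntBoolTypes); floating-point inputs are excluded.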
#define REGISTER_CUDA_BITWISE_BINARY_OPERATOR(Op) \
  REGISTER_CUDA_OPERATOR(                         \
      Op,                                         \
      BinaryElementwiseOp<                        \
          IntBoolTypes,                           \
          CUDAContext,                            \
          Op##Functor<CUDAContext>>)
REGISTER_CUDA_BITWISE_BINARY_OPERATOR(BitwiseAnd);
REGISTER_CUDA_BITWISE_BINARY_OPERATOR(BitwiseOr);
REGISTER_CUDA_BITWISE_BINARY_OPERATOR(BitwiseXor);
#undef REGISTER_CUDA_BITWISE_BINARY_OPERATOR
namespace {
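
// Sums a [pre, N] input down to an [N] output (the post == 1 case of
// SumReduceLike): one thread per output element walks the pre dimension,
// accumulating in float for precision even when T is at::Half.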
template <typename T>
__global__ void
reduce_sum_like_post1(const T* g_idata, T* g_odata, int pre, int N) {
  int n = blockIdx.x * blockDim.x + threadIdx.x;
  if (n >= N) {
    return;
  }
  float sum = 0.0;
  for (int i = 0; i < pre; ++i) {
    sum += convert::To<T, float>(g_idata[i * N + n]);
  }
  g_odata[n] = convert::To<float, T>(sum);
}
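
// Full reduction of N elements to a single value via cub::DeviceReduce::Sum.
// The first call only queries the temporary-storage size; the scratch tensor
// is then resized (rounding the byte count up to a whole number of T
// elements) and used for the actual reduction.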
template <typename T>
void device_reduce(
    const T* d_in,
    T* d_out,
    int N,
    Tensor* buffer,
    CUDAContext* context) {
  // Determine temporary device storage requirements.
  size_t temp_storage_bytes = 0;
  cub::DeviceReduce::Sum(
      nullptr, temp_storage_bytes, d_in, d_out, N, context->cuda_stream());

  // Round the byte count up to a whole number of T elements.
  auto buffer_size = temp_storage_bytes / sizeof(T);
  buffer_size += temp_storage_bytes % sizeof(T) != 0 ? 1 : 0;
  buffer->Resize(buffer_size);
  void* d_temp_storage = static_cast<void*>(buffer->template mutable_data<T>());

  // Run the sum-reduction.
  cub::DeviceReduce::Sum(
      d_temp_storage,
      temp_storage_bytes,
      d_in,
      d_out,
      N,
      context->cuda_stream());
}
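
// at::Half specialization: there is no cub path here, so the sum is computed
// as a dot product against a single broadcast 1.0 (a stride-0 "ones" vector),
// using rocblas_hdot on ROCm builds with TORCH_HIP_VERSION >= 210 and
// cublasDotEx (with float accumulation) on CUDA builds.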
template <>
void device_reduce<at::Half>(
    const at::Half* in,
    at::Half* out,
    int N,
    Tensor* buffer,
    CUDAContext* context) {
  (void)in; // Suppress unused variable warning
  (void)out; // Suppress unused variable warning
  (void)N; // Suppress unused variable warning
  (void)buffer; // Suppress unused variable warning
  (void)context; // Suppress unused variable warning
// Note: the guards must check for an HIP build explicitly; a bare
// TORCH_HIP_VERSION test evaluates to 0 on CUDA builds, which would wrongly
// route them into the CAFFE_THROW branch below.
#if defined(__HIPCC__) && TORCH_HIP_VERSION >= 210
  auto buffer_size = 1;
  if (buffer->numel() != buffer_size) {
    buffer->Resize(buffer_size);
    // Fill the single-element "ones" vector; the dot product below reads it
    // with stride 0, so one element is all that is needed.
    math::Set<at::Half, CUDAContext>(
        buffer_size,
        convert::To<float, at::Half>(1.),
        buffer->template mutable_data<at::Half>(),
        context);
  }
  CUBLAS_ENFORCE(rocblas_hdot(
      context->cublas_handle(),
      N,
      reinterpret_cast<const rocblas_half*>(in),
      1,
      reinterpret_cast<const rocblas_half*>(buffer->data<at::Half>()),
      0,
      reinterpret_cast<rocblas_half*>(out)));
#elif defined(__HIPCC__)
  CAFFE_THROW("HIP rocblas doesn't fully support fp16 device_reduce yet.");
#else
  auto buffer_size = 1;
  if (buffer->numel() != buffer_size) {
    buffer->Resize(buffer_size);
    // As above: only the first element of the stride-0 "ones" vector is read.
    math::Set<at::Half, CUDAContext>(
        buffer_size,
        convert::To<float, at::Half>(1.),
        buffer->template mutable_data<at::Half>(),
        context);
  }
  CUBLAS_ENFORCE(cublasDotEx(
      context->cublas_handle(),
      N,
      in,
      CUDA_R_16F,
      1,
      buffer->data<at::Half>(),
      CUDA_R_16F,
      0,
      out,
      CUDA_R_16F,
      CUDA_R_32F));
#endif
}
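
// General case (post > 1): one block per output element n. Each thread
// strides over the pre * post input positions that map to n, and a
// cub::BlockReduce then combines the per-thread partial sums in shared
// memory.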
template <typename T, int BLOCK_THREADS>
__global__ void
reduce_sum_like(const T* g_idata, T* g_odata, int pre, int N, int post) {
  int n = blockIdx.x;
  float sum = 0.0;
  int limit = pre * post;
  for (int i = threadIdx.x; i < limit; i += blockDim.x) {
    int curPre = i / post;
    int curPost = i % post;
    sum +=
        convert::To<T, float>(g_idata[curPre * N * post + n * post + curPost]);
  }
  // Uses a shared-memory reduction within the block.
  typedef cub::BlockReduce<float, BLOCK_THREADS> BlockReduceT;
  // Shared memory
  __shared__ typename BlockReduceT::TempStorage temp_storage;
  float aggregate = BlockReduceT(temp_storage).Sum(sum);
  if (threadIdx.x == 0) {
    g_odata[n] = convert::To<float, T>(aggregate);
  }
}
} // namespace
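
// SumReduceLike reduces A down to the shape of B under Caffe2's legacy
// broadcast semantics: the shape is factored into [pre, n, post] and the
// pre/post dimensions are summed away, with the kernel chosen by the size
// of post.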
template <>
template <typename T>
bool SumReduceLikeOp<CUDAContext>::DoRunWithType() {
  const auto& A = Input(0);
  const auto& B = Input(1);
  auto* C = Output(0);
  auto count = A.size();
  CAFFE_ENFORCE(&B != C, "In-place is not allowed.");
  C->ResizeLike(B);
  const T* Adata = A.template data<T>();
  auto* Cdata = C->template mutable_data<T>();
  if (C->size() == 0) {
    // The output is empty; there is nothing to do, so skip the kernel
    // launches entirely.
    return true;
  }
  if (B.size() == 1) {
    device_reduce<T>(Adata, Cdata, count, &sum_buffer_, &context_);
  } else {
    size_t pre, n, post;
    std::tie(pre, n, post) =
        elementwise_ops_utils::ComputeLegacyBroadcastSizes(A, B, axis_);
    // Because shape(B) has already been checked to be contained in shape(A),
    // pre and post cannot both be 1 at the same time.
    if (post == 1) {
      reduce_sum_like_post1<T>
          <<<CAFFE_GET_BLOCKS(n),
             CAFFE_CUDA_NUM_THREADS,
             0,
             context_.cuda_stream()>>>(Adata, Cdata, pre, n);
      C10_CUDA_KERNEL_LAUNCH_CHECK();
    } else {
      // Pick a block size roughly matched to the amount of work per output
      // element (pre * post input values per block).
      if (post >= 128) {
        reduce_sum_like<T, 512>
            <<<n, 512, 0, context_.cuda_stream()>>>(Adata, Cdata, pre, n, post);
        C10_CUDA_KERNEL_LAUNCH_CHECK();
      } else if (post >= 64) {
        reduce_sum_like<T, 128>
            <<<n, 128, 0, context_.cuda_stream()>>>(Adata, Cdata, pre, n, post);
        C10_CUDA_KERNEL_LAUNCH_CHECK();
      } else if (post >= 32) {
        reduce_sum_like<T, 64>
            <<<n, 64, 0, context_.cuda_stream()>>>(Adata, Cdata, pre, n, post);
        C10_CUDA_KERNEL_LAUNCH_CHECK();
      } else {
        reduce_sum_like<T, 32>
            <<<n, 32, 0, context_.cuda_stream()>>>(Adata, Cdata, pre, n, post);
        C10_CUDA_KERNEL_LAUNCH_CHECK();
      }
    }
  }
  return true;
}
template <>
bool SumReduceLikeOp<CUDAContext>::RunOnDevice() {
  return DispatchHelper<TensorTypes<float, at::Half>>::call(this, Input(0));
}
REGISTER_CUDA_OPERATOR(SumReduceLike, SumReduceLikeOp<CUDAContext>);
} // namespace caffe2