| // Implements the math functions for GPU (CUDA and HIP). |
| |
| #include "caffe2/utils/math.h" |
| |
| #include <algorithm> |
| #include <cstring> |
| #include <functional> |
| #include <limits> |
| #include <numeric> |
| #include <random> |
| #include <vector> |
| |
| #include <cub/cub.cuh> |
| #include <cub/block/block_reduce.cuh> |
| #include "caffe2/utils/cub_namespace.cuh" |
| |
| #include <thrust/host_vector.h> |
| #include <thrust/device_vector.h> |
| #include <thrust/functional.h> |
| |
| #include "caffe2/core/context_gpu.h" |
| #include "caffe2/utils/GpuAtomics.cuh" |
| #include "caffe2/utils/conversions.h" |
| |
| #include "caffe2/utils/fixed_divisor.h" |
| // TODO: Move this to fixed_divisor.h |
| #if defined(USE_ROCM) |
| #define FIXED_DIVISOR int32_t |
| #define FIXED_DIVISOR_DIV(d, n) (n / d) |
| #define FIXED_DIVISOR_MOD(d, n) (n % d) |
| #define FIXED_DIVISOR_DIV_MOD(d, n, q, r) \ |
| do { \ |
| const auto n_copy = n; \ |
| *q = n_copy / d; \ |
| *r = n_copy % d; \ |
| } while (0) |
| #else // USE_ROCM |
| #define FIXED_DIVISOR FixedDivisor<int32_t> |
| #define FIXED_DIVISOR_DIV(d, n) (d.Div(n)) |
| #define FIXED_DIVISOR_MOD(d, n) (d.Mod(n)) |
| #define FIXED_DIVISOR_DIV_MOD(d, n, q, r) (d.DivMod(n, q, r)) |
| #endif // USE_ROCM |
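| |
| // Illustrative usage of the macros above (not part of the library): the |
| // kernels below use FIXED_DIVISOR to decompose flat output indices, e.g. |
| //   FIXED_DIVISOR cols_div(3); |
| //   int q, r; |
| //   FIXED_DIVISOR_DIV_MOD(cols_div, 7, &q, &r); // q == 2, r == 1 |
| // On CUDA, FixedDivisor<int32_t> precomputes the division so the per-element |
| // div/mod can be done with cheaper multiply/shift operations; on ROCm it |
| // falls back to plain / and %. |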
| |
| #if defined(USE_ROCM) |
| using CUBLAS_HALF_TYPE = rocblas_half; |
| #else // USE_ROCM |
| using CUBLAS_HALF_TYPE = __half; |
| #endif // USE_ROCM |
| |
| #include "caffe2/utils/math/utils.h" |
| |
| #if THRUST_VERSION >= 100800 |
| #define THRUST_SUPPORTS_PER_THREAD |
| #endif // THRUST_VERSION >= 100800 |
| |
| namespace caffe2 { |
| namespace math { |
| |
| namespace { |
| |
| #define DELEGATE_SIMPLE_HOST_DEVICE_BINARY_FUNCTOR(Func, expr) \ |
| template <typename T> \ |
| struct Func##Functor { \ |
| inline __host__ __device__ T \ |
| operator()(const T& lhs, const T& rhs) const { \ |
| return lhs expr rhs; \ |
| } \ |
| }; \ |
| template <> \ |
| struct Func##Functor<at::Half> { \ |
| inline __host__ __device__ at::Half operator()( \ |
| const at::Half& lhs, \ |
| const at::Half& rhs) const { \ |
| return convert::To<float, at::Half>(convert::To<at::Half, float>( \ |
| lhs) expr convert::To<at::Half, float>(rhs)); \ |
| } \ |
| }; |
| DELEGATE_SIMPLE_HOST_DEVICE_BINARY_FUNCTOR(Add, +) |
| DELEGATE_SIMPLE_HOST_DEVICE_BINARY_FUNCTOR(Sub, -) |
| DELEGATE_SIMPLE_HOST_DEVICE_BINARY_FUNCTOR(Mul, *) |
| DELEGATE_SIMPLE_HOST_DEVICE_BINARY_FUNCTOR(Div, /) |
| #undef DELEGATE_SIMPLE_HOST_DEVICE_BINARY_FUNCTOR |
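| |
| // For reference, DELEGATE_SIMPLE_HOST_DEVICE_BINARY_FUNCTOR(Add, +) above |
| // expands to roughly the following (sketch, not the literal expansion): |
| //   template <typename T> |
| //   struct AddFunctor { |
| //     __host__ __device__ T operator()(const T& lhs, const T& rhs) const { |
| //       return lhs + rhs; |
| //     } |
| //   }; |
| // plus an at::Half specialization that upcasts both operands to float, |
| // applies the operator, and converts the result back to at::Half. |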
| |
| template <typename TIn, typename TOut, class BinaryOperator> |
| __global__ void SimpleBinaryOpCUDAKernel( |
| const int N, |
| const BinaryOperator op, |
| const TIn* A, |
| const TIn* B, |
| TOut* C) { |
| CUDA_1D_KERNEL_LOOP(i, N) { |
| C[i] = op(A[i], B[i]); |
| } |
| } |
| |
| template <typename TIn, typename TOut, class BinaryOperator, bool broadcast_1st> |
| __global__ void RowwiseBinaryOpCUDAKernel( |
| const int size, |
| const FIXED_DIVISOR cols, |
| const BinaryOperator op, |
| const TIn* A, |
| const TIn* B, |
| TOut* C) { |
| CUDA_1D_KERNEL_LOOP(C_index, size) { |
| const int j = FIXED_DIVISOR_MOD(cols, C_index); |
| const int A_index = broadcast_1st ? j : C_index; |
| const int B_index = broadcast_1st ? C_index : j; |
| C[C_index] = op(A[A_index], B[B_index]); |
| } |
| } |
| |
| template <typename TIn, typename TOut, class BinaryOperator, bool broadcast_1st> |
| __global__ void ColwiseBinaryOpCUDAKernel( |
| const int size, |
| const FIXED_DIVISOR cols, |
| const BinaryOperator op, |
| const TIn* A, |
| const TIn* B, |
| TOut* C) { |
| CUDA_1D_KERNEL_LOOP(C_index, size) { |
| const int i = FIXED_DIVISOR_DIV(cols, C_index); |
| const int A_index = broadcast_1st ? i : C_index; |
| const int B_index = broadcast_1st ? C_index : i; |
| C[C_index] = op(A[A_index], B[B_index]); |
| } |
| } |
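| |
| // Example of the index math above (illustrative only): for a 2 x 3 output C |
| // with cols = 3, C_index = 4 maps to (i, j) = (1, 1). Rowwise broadcasting |
| // of B with shape (1, 3) reads B[j]; colwise broadcasting of B with shape |
| // (2, 1) reads B[i]. The broadcast_1st flag selects which operand is indexed |
| // with the reduced index. |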
| |
| template <typename TIn, typename TOut, class BinaryOperator, int D> |
| __global__ void BroadcastBinaryOpCUDAKernel( |
| const int size, |
| const SimpleArray<int, D> A_strides, |
| const SimpleArray<int, D> B_strides, |
| const SimpleArray<FIXED_DIVISOR, D> C_dims, |
| const BinaryOperator op, |
| const TIn* A, |
| const TIn* B, |
| TOut* C) { |
| CUDA_1D_KERNEL_LOOP(C_index, size) { |
| int A_index = 0; |
| int B_index = 0; |
| int C_index_val = C_index; |
| #pragma unroll |
| for (int i = D - 1; i >= 0; --i) { |
| int d; |
| FIXED_DIVISOR_DIV_MOD(C_dims.data[i], C_index_val, &C_index_val, &d); |
| A_index += d * A_strides.data[i]; |
| B_index += d * B_strides.data[i]; |
| } |
| C[C_index] = op(A[A_index], B[B_index]); |
| } |
| } |
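| |
| // Worked example for the strided kernel above (illustrative): broadcasting A |
| // of shape (2, 1) against B of shape (2, 3) gives C of shape (2, 3). With |
| // D = 2, A_strides = {1, 0}, B_strides = {3, 1} and C_dims = {2, 3}, the loop |
| // turns C_index = 5 into digits (1, 2), so A_index = 1 * 1 + 2 * 0 = 1 and |
| // B_index = 1 * 3 + 2 * 1 = 5. |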
| |
| template <typename TIn, typename TOut, class BinaryOperator> |
| CAFFE2_CUDA_EXPORT void BinaryOpWith2DBroadcasting( |
| const int rows, |
| const int cols, |
| const bool rowwise_broadcast, |
| const bool broadcast_1st, |
| const BinaryOperator& op, |
| const TIn* A, |
| const TIn* B, |
| TOut* C, |
| CUDAContext* context) { |
| if (rows == 0 || cols == 0) { |
| return; |
| } |
| const int size = rows * cols; |
| const FIXED_DIVISOR cols_div(cols); |
| if (rowwise_broadcast) { |
| if (broadcast_1st) { |
| RowwiseBinaryOpCUDAKernel<TIn, TOut, BinaryOperator, true> |
| <<<CAFFE_GET_BLOCKS(size), |
| CAFFE_CUDA_NUM_THREADS, |
| 0, |
| context->cuda_stream()>>>(size, cols_div, op, A, B, C); |
| C10_CUDA_KERNEL_LAUNCH_CHECK(); |
| } else { |
| RowwiseBinaryOpCUDAKernel<TIn, TOut, BinaryOperator, false> |
| <<<CAFFE_GET_BLOCKS(size), |
| CAFFE_CUDA_NUM_THREADS, |
| 0, |
| context->cuda_stream()>>>(size, cols_div, op, A, B, C); |
| C10_CUDA_KERNEL_LAUNCH_CHECK(); |
| } |
| } else { |
| if (broadcast_1st) { |
| ColwiseBinaryOpCUDAKernel<TIn, TOut, BinaryOperator, true> |
| <<<CAFFE_GET_BLOCKS(size), |
| CAFFE_CUDA_NUM_THREADS, |
| 0, |
| context->cuda_stream()>>>(size, cols_div, op, A, B, C); |
| C10_CUDA_KERNEL_LAUNCH_CHECK(); |
| } else { |
| ColwiseBinaryOpCUDAKernel<TIn, TOut, BinaryOperator, false> |
| <<<CAFFE_GET_BLOCKS(size), |
| CAFFE_CUDA_NUM_THREADS, |
| 0, |
| context->cuda_stream()>>>(size, cols_div, op, A, B, C); |
| C10_CUDA_KERNEL_LAUNCH_CHECK(); |
| } |
| } |
| } |
| |
| template <typename TIn, typename TOut, class BinaryOperator, int D> |
| CAFFE2_CUDA_EXPORT void BroadcastBinaryOpImpl( |
| const int* A_dims, |
| const int* B_dims, |
| const int* C_dims, |
| const BinaryOperator& op, |
| const TIn* A, |
| const TIn* B, |
| TOut* C, |
| CUDAContext* context) { |
| SimpleArray<int, D> A_strides_array; |
| SimpleArray<int, D> B_strides_array; |
| SimpleArray<FIXED_DIVISOR, D> C_dims_array; |
| int A_stride = 1; |
| int B_stride = 1; |
| for (int i = D - 1; i >= 0; --i) { |
| if (C_dims[i] == 0) { |
| return; |
| } |
| A_strides_array.data[i] = A_dims[i] == 1 ? 0 : A_stride; |
| B_strides_array.data[i] = B_dims[i] == 1 ? 0 : B_stride; |
| A_stride *= A_dims[i]; |
| B_stride *= B_dims[i]; |
| C_dims_array.data[i] = FIXED_DIVISOR(C_dims[i]); |
| } |
| const int size = |
| std::accumulate(C_dims, C_dims + D, 1, std::multiplies<int>()); |
| BroadcastBinaryOpCUDAKernel<TIn, TOut, BinaryOperator, D> |
| <<<CAFFE_GET_BLOCKS(size), |
| CAFFE_CUDA_NUM_THREADS, |
| 0, |
| context->cuda_stream()>>>( |
| size, A_strides_array, B_strides_array, C_dims_array, op, A, B, C); |
| C10_CUDA_KERNEL_LAUNCH_CHECK(); |
| } |
| |
| template <typename TIn, typename TOut, class BinaryOperator> |
| CAFFE2_CUDA_EXPORT void BroadcastBinaryOp( |
| const int A_ndim, |
| const int* A_dims, |
| const int B_ndim, |
| const int* B_dims, |
| const BinaryOperator& op, |
| const TIn* A, |
| const TIn* B, |
| TOut* C, |
| CUDAContext* context) { |
| const int ndim = std::max(A_ndim, B_ndim); |
| std::vector<int> A_dims_array(ndim); |
| std::vector<int> B_dims_array(ndim); |
| std::vector<int> C_dims_array(ndim); |
| utils::ComputeBroadcastBinaryOpDims( |
| A_ndim, |
| A_dims, |
| B_ndim, |
| B_dims, |
| A_dims_array.data(), |
| B_dims_array.data(), |
| C_dims_array.data()); |
| if (A_dims_array == B_dims_array) { |
| const int size = std::accumulate( |
| C_dims_array.cbegin(), C_dims_array.cend(), 1, std::multiplies<int>()); |
| SimpleBinaryOpCUDAKernel<TIn, TOut, BinaryOperator> |
| <<<CAFFE_GET_BLOCKS(size), |
| CAFFE_CUDA_NUM_THREADS, |
| 0, |
| context->cuda_stream()>>>(size, op, A, B, C); |
| C10_CUDA_KERNEL_LAUNCH_CHECK(); |
| return; |
| } |
| int rows; |
| int cols; |
| bool broadcast_1st; |
| if (utils::IsRowwiseBroadcastBinaryOp( |
| ndim, |
| A_dims_array.data(), |
| B_dims_array.data(), |
| &rows, |
| &cols, |
| &broadcast_1st)) { |
| BinaryOpWith2DBroadcasting<TIn, TOut, BinaryOperator>( |
| rows, cols, true, broadcast_1st, op, A, B, C, context); |
| return; |
| } |
| if (utils::IsColwiseBroadcastBinaryOp( |
| ndim, |
| A_dims_array.data(), |
| B_dims_array.data(), |
| &rows, |
| &cols, |
| &broadcast_1st)) { |
| BinaryOpWith2DBroadcasting<TIn, TOut, BinaryOperator>( |
| rows, cols, false, broadcast_1st, op, A, B, C, context); |
| return; |
| } |
| DISPATCH_FUNCTION_BY_VALUE_WITH_TYPE_3( |
| ndim, |
| BroadcastBinaryOpImpl, |
| TIn, |
| TOut, |
| BinaryOperator, |
| A_dims_array.data(), |
| B_dims_array.data(), |
| C_dims_array.data(), |
| op, |
| A, |
| B, |
| C, |
| context); |
| } |
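| |
| // Taken together: identical shapes go through SimpleBinaryOpCUDAKernel, |
| // broadcasts that collapse to a 2D row- or column-wise pattern go through the |
| // specialized 2D kernels, and everything else uses the general N-D strided |
| // kernel instantiated for the actual rank. For example (illustrative), A of |
| // shape (2, 3, 4) with B of shape (3, 1) cannot be collapsed to a 2D pattern |
| // and takes the general path. |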
| |
| } // namespace |
| |
| #define DELEGATE_2D_BROADCAST_CUDA_BINARY_FUNCTION(TIn, TOut, Func, Op) \ |
| template <> \ |
| CAFFE2_CUDA_EXPORT void Rowwise##Func<TIn, CUDAContext, true>( \ |
| const int rows, \ |
| const int cols, \ |
| const TIn* A, \ |
| const TIn* B, \ |
| TOut* C, \ |
| CUDAContext* context) { \ |
| if (rows == 0 || cols == 0) { \ |
| return; \ |
| } \ |
| const int size = rows * cols; \ |
| const FIXED_DIVISOR cols_div(cols); \ |
| RowwiseBinaryOpCUDAKernel<TIn, TOut, Op<TIn>, true> \ |
| <<<CAFFE_GET_BLOCKS(size), \ |
| CAFFE_CUDA_NUM_THREADS, \ |
| 0, \ |
| context->cuda_stream()>>>(size, cols_div, Op<TIn>(), A, B, C); \ |
| C10_CUDA_KERNEL_LAUNCH_CHECK(); \ |
| } \ |
| template <> \ |
| CAFFE2_CUDA_EXPORT void Rowwise##Func<TIn, CUDAContext, false>( \ |
| const int rows, \ |
| const int cols, \ |
| const TIn* A, \ |
| const TIn* B, \ |
| TOut* C, \ |
| CUDAContext* context) { \ |
| if (rows == 0 || cols == 0) { \ |
| return; \ |
| } \ |
| const int size = rows * cols; \ |
| const FIXED_DIVISOR cols_div(cols); \ |
| RowwiseBinaryOpCUDAKernel<TIn, TOut, Op<TIn>, false> \ |
| <<<CAFFE_GET_BLOCKS(size), \ |
| CAFFE_CUDA_NUM_THREADS, \ |
| 0, \ |
| context->cuda_stream()>>>(size, cols_div, Op<TIn>(), A, B, C); \ |
| C10_CUDA_KERNEL_LAUNCH_CHECK(); \ |
| } \ |
| template <> \ |
| CAFFE2_CUDA_EXPORT void Colwise##Func<TIn, CUDAContext, true>( \ |
| const int rows, \ |
| const int cols, \ |
| const TIn* A, \ |
| const TIn* B, \ |
| TOut* C, \ |
| CUDAContext* context) { \ |
| if (rows == 0 || cols == 0) { \ |
| return; \ |
| } \ |
| const int size = rows * cols; \ |
| const FIXED_DIVISOR cols_div(cols); \ |
| ColwiseBinaryOpCUDAKernel<TIn, TOut, Op<TIn>, true> \ |
| <<<CAFFE_GET_BLOCKS(size), \ |
| CAFFE_CUDA_NUM_THREADS, \ |
| 0, \ |
| context->cuda_stream()>>>(size, cols_div, Op<TIn>(), A, B, C); \ |
| C10_CUDA_KERNEL_LAUNCH_CHECK(); \ |
| } \ |
| template <> \ |
| CAFFE2_CUDA_EXPORT void Colwise##Func<TIn, CUDAContext, false>( \ |
| const int rows, \ |
| const int cols, \ |
| const TIn* A, \ |
| const TIn* B, \ |
| TOut* C, \ |
| CUDAContext* context) { \ |
| if (rows == 0 || cols == 0) { \ |
| return; \ |
| } \ |
| const int size = rows * cols; \ |
| const FIXED_DIVISOR cols_div(cols); \ |
| ColwiseBinaryOpCUDAKernel<TIn, TOut, Op<TIn>, false> \ |
| <<<CAFFE_GET_BLOCKS(size), \ |
| CAFFE_CUDA_NUM_THREADS, \ |
| 0, \ |
| context->cuda_stream()>>>(size, cols_div, Op<TIn>(), A, B, C); \ |
| C10_CUDA_KERNEL_LAUNCH_CHECK(); \ |
| } |
| |
| #define DEFINE_2D_BROADCAST_CUDA_COMPARE_FUNCTION(Func, Op) \ |
| DELEGATE_2D_BROADCAST_CUDA_BINARY_FUNCTION(std::int32_t, bool, Func, Op) \ |
| DELEGATE_2D_BROADCAST_CUDA_BINARY_FUNCTION(std::int64_t, bool, Func, Op) \ |
| DELEGATE_2D_BROADCAST_CUDA_BINARY_FUNCTION(float, bool, Func, Op) \ |
| DELEGATE_2D_BROADCAST_CUDA_BINARY_FUNCTION(double, bool, Func, Op) \ |
| DELEGATE_2D_BROADCAST_CUDA_BINARY_FUNCTION(bool, bool, Func, Op) |
| |
| DEFINE_2D_BROADCAST_CUDA_COMPARE_FUNCTION(EQ, thrust::equal_to) |
| DEFINE_2D_BROADCAST_CUDA_COMPARE_FUNCTION(NE, thrust::not_equal_to) |
| DEFINE_2D_BROADCAST_CUDA_COMPARE_FUNCTION(LT, thrust::less) |
| DEFINE_2D_BROADCAST_CUDA_COMPARE_FUNCTION(LE, thrust::less_equal) |
| DEFINE_2D_BROADCAST_CUDA_COMPARE_FUNCTION(GT, thrust::greater) |
| DEFINE_2D_BROADCAST_CUDA_COMPARE_FUNCTION(GE, thrust::greater_equal) |
| |
| #undef DEFINE_2D_BROADCAST_CUDA_COMPARE_FUNCTION |
| |
| #define DEFINE_2D_BROADCAST_CUDA_BINARY_FUNCTION(Func, Op) \ |
| DELEGATE_2D_BROADCAST_CUDA_BINARY_FUNCTION( \ |
| std::int32_t, std::int32_t, Func, Op) \ |
| DELEGATE_2D_BROADCAST_CUDA_BINARY_FUNCTION( \ |
| std::int64_t, std::int64_t, Func, Op) \ |
| DELEGATE_2D_BROADCAST_CUDA_BINARY_FUNCTION(float, float, Func, Op) \ |
| DELEGATE_2D_BROADCAST_CUDA_BINARY_FUNCTION(double, double, Func, Op) \ |
| DELEGATE_2D_BROADCAST_CUDA_BINARY_FUNCTION(at::Half, at::Half, Func, Op) |
| |
| DEFINE_2D_BROADCAST_CUDA_BINARY_FUNCTION(Add, AddFunctor) |
| DEFINE_2D_BROADCAST_CUDA_BINARY_FUNCTION(Sub, SubFunctor) |
| DEFINE_2D_BROADCAST_CUDA_BINARY_FUNCTION(Mul, MulFunctor) |
| DEFINE_2D_BROADCAST_CUDA_BINARY_FUNCTION(Div, DivFunctor) |
| |
| #undef DEFINE_2D_BROADCAST_CUDA_BINARY_FUNCTION |
| |
| DELEGATE_2D_BROADCAST_CUDA_BINARY_FUNCTION(bool, bool, And, thrust::logical_and) |
| DELEGATE_2D_BROADCAST_CUDA_BINARY_FUNCTION(bool, bool, Or, thrust::logical_or) |
| DELEGATE_2D_BROADCAST_CUDA_BINARY_FUNCTION(bool, bool, Xor, thrust::bit_xor) |
| |
| #define DEFINE_2D_BROADCAST_CUDA_BITWISE_BINARY_FUNCTION(Func, Op) \ |
| DELEGATE_2D_BROADCAST_CUDA_BINARY_FUNCTION(bool, bool, Func, Op) \ |
| DELEGATE_2D_BROADCAST_CUDA_BINARY_FUNCTION( \ |
| std::int32_t, std::int32_t, Func, Op) \ |
| DELEGATE_2D_BROADCAST_CUDA_BINARY_FUNCTION( \ |
| std::int64_t, std::int64_t, Func, Op) |
| |
| DEFINE_2D_BROADCAST_CUDA_BITWISE_BINARY_FUNCTION(BitwiseAnd, thrust::bit_and) |
| DEFINE_2D_BROADCAST_CUDA_BITWISE_BINARY_FUNCTION(BitwiseOr, thrust::bit_or) |
| DEFINE_2D_BROADCAST_CUDA_BITWISE_BINARY_FUNCTION(BitwiseXor, thrust::bit_xor) |
| |
| #undef DEFINE_2D_BROADCAST_CUDA_BITWISE_BINARY_FUNCTION |
| |
| #undef DELEGATE_2D_BROADCAST_CUDA_BINARY_FUNCTION |
| |
| #define DELEGATE_BROADCAST_CUDA_BINARY_FUNCTION(TIn, TOut, Func, Op) \ |
| template <> \ |
| CAFFE2_CUDA_EXPORT void Func<TIn, CUDAContext>( \ |
| const int A_ndim, \ |
| const int* A_dims, \ |
| const int B_ndim, \ |
| const int* B_dims, \ |
| const TIn* A, \ |
| const TIn* B, \ |
| TOut* C, \ |
| CUDAContext* context) { \ |
| BroadcastBinaryOp<TIn, TOut, Op<TIn>>( \ |
| A_ndim, A_dims, B_ndim, B_dims, Op<TIn>(), A, B, C, context); \ |
| } |
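| |
| // Sketch of a call site for the entry points generated below (dims, pointers |
| // and context chosen for illustration only): |
| //   const int A_dims[2] = {4, 5}; |
| //   const int B_dims[1] = {5}; |
| //   // A, B and C are device pointers; B is broadcast along the rows of A. |
| //   math::Add<float, CUDAContext>(2, A_dims, 1, B_dims, A, B, C, &context); |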
| |
| #define DEFINE_BROADCAST_CUDA_COMPARE_FUNCTION(Func, Op) \ |
| DELEGATE_BROADCAST_CUDA_BINARY_FUNCTION(std::int32_t, bool, Func, Op) \ |
| DELEGATE_BROADCAST_CUDA_BINARY_FUNCTION(std::int64_t, bool, Func, Op) \ |
| DELEGATE_BROADCAST_CUDA_BINARY_FUNCTION(float, bool, Func, Op) \ |
| DELEGATE_BROADCAST_CUDA_BINARY_FUNCTION(double, bool, Func, Op) \ |
| DELEGATE_BROADCAST_CUDA_BINARY_FUNCTION(bool, bool, Func, Op) |
| |
| DEFINE_BROADCAST_CUDA_COMPARE_FUNCTION(EQ, thrust::equal_to) |
| DEFINE_BROADCAST_CUDA_COMPARE_FUNCTION(NE, thrust::not_equal_to) |
| DEFINE_BROADCAST_CUDA_COMPARE_FUNCTION(LT, thrust::less) |
| DEFINE_BROADCAST_CUDA_COMPARE_FUNCTION(LE, thrust::less_equal) |
| DEFINE_BROADCAST_CUDA_COMPARE_FUNCTION(GT, thrust::greater) |
| DEFINE_BROADCAST_CUDA_COMPARE_FUNCTION(GE, thrust::greater_equal) |
| |
| #undef DEFINE_BROADCAST_CUDA_COMPARE_FUNCTION |
| |
| #define DEFINE_BROADCAST_CUDA_BINARY_FUNCTION(Func, Op) \ |
| DELEGATE_BROADCAST_CUDA_BINARY_FUNCTION( \ |
| std::int32_t, std::int32_t, Func, Op) \ |
| DELEGATE_BROADCAST_CUDA_BINARY_FUNCTION( \ |
| std::int64_t, std::int64_t, Func, Op) \ |
| DELEGATE_BROADCAST_CUDA_BINARY_FUNCTION(float, float, Func, Op) \ |
| DELEGATE_BROADCAST_CUDA_BINARY_FUNCTION(double, double, Func, Op) \ |
| DELEGATE_BROADCAST_CUDA_BINARY_FUNCTION(at::Half, at::Half, Func, Op) |
| |
| DEFINE_BROADCAST_CUDA_BINARY_FUNCTION(Add, AddFunctor) |
| DEFINE_BROADCAST_CUDA_BINARY_FUNCTION(Sub, SubFunctor) |
| DEFINE_BROADCAST_CUDA_BINARY_FUNCTION(Mul, MulFunctor) |
| DEFINE_BROADCAST_CUDA_BINARY_FUNCTION(Div, DivFunctor) |
| |
| #undef DEFINE_BROADCAST_CUDA_BINARY_FUNCTION |
| |
| DELEGATE_BROADCAST_CUDA_BINARY_FUNCTION(bool, bool, And, thrust::logical_and) |
| DELEGATE_BROADCAST_CUDA_BINARY_FUNCTION(bool, bool, Or, thrust::logical_or) |
| DELEGATE_BROADCAST_CUDA_BINARY_FUNCTION(bool, bool, Xor, thrust::bit_xor) |
| |
| #define DEFINE_BROADCAST_CUDA_BITWISE_BINARY_FUNCTION(Func, Op) \ |
| DELEGATE_BROADCAST_CUDA_BINARY_FUNCTION(bool, bool, Func, Op) \ |
| DELEGATE_BROADCAST_CUDA_BINARY_FUNCTION( \ |
| std::int32_t, std::int32_t, Func, Op) \ |
| DELEGATE_BROADCAST_CUDA_BINARY_FUNCTION(std::int64_t, std::int64_t, Func, Op) |
| |
| DEFINE_BROADCAST_CUDA_BITWISE_BINARY_FUNCTION(BitwiseAnd, thrust::bit_and) |
| DEFINE_BROADCAST_CUDA_BITWISE_BINARY_FUNCTION(BitwiseOr, thrust::bit_or) |
| DEFINE_BROADCAST_CUDA_BITWISE_BINARY_FUNCTION(BitwiseXor, thrust::bit_xor) |
| |
| #undef DEFINE_BROADCAST_CUDA_BITWISE_BINARY_FUNCTION |
| |
| #undef DELEGATE_BROADCAST_CUDA_BINARY_FUNCTION |
| |
| #define DELEGATE_REDUCTION_FUNCTION(T, Funcname, func) \ |
| template <> \ |
| CAFFE2_CUDA_EXPORT void Funcname<T, CUDAContext>( \ |
| const int N, \ |
| const T* src, \ |
| T* dst, \ |
| Tensor* scratch_ptr, \ |
| CUDAContext* context) { \ |
| size_t memRequired = 0; \ |
| cub::DeviceReduce::func( \ |
| nullptr, memRequired, src, dst, N, context->cuda_stream()); \ |
| auto buffer_size = \ |
| static_cast<int64_t>((memRequired + sizeof(T) - 1) / sizeof(T)); \ |
| scratch_ptr->Resize(std::vector<int64_t>{buffer_size}); \ |
| cub::DeviceReduce::func( \ |
| static_cast<void*>(scratch_ptr->mutable_data<T>()), \ |
| memRequired, \ |
| src, \ |
| dst, \ |
| N, \ |
| context->cuda_stream()); \ |
| } |
| |
| DELEGATE_REDUCTION_FUNCTION(float, ReduceMin, Min) |
| DELEGATE_REDUCTION_FUNCTION(float, ReduceMax, Max) |
| DELEGATE_REDUCTION_FUNCTION(int32_t, ReduceMax, Max) |
| DELEGATE_REDUCTION_FUNCTION(int64_t, ReduceMax, Max) |
| |
| #undef DELEGATE_REDUCTION_FUNCTION |
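| |
| // The specializations above follow the standard two-pass cub::DeviceReduce |
| // idiom (sketch with hypothetical names): |
| //   size_t bytes = 0; |
| //   cub::DeviceReduce::Max(nullptr, bytes, src, dst, N, stream); // size query |
| //   cub::DeviceReduce::Max(temp, bytes, src, dst, N, stream);    // reduce |
| // The first call only reports the required temporary-storage size; the |
| // scratch tensor is resized to hold at least that many bytes before the |
| // second call performs the actual reduction. |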
| |
| // The Caffe2 Gemm wrappers provide a simpler interface to the underlying GEMM |
| // routines, with the limitation that the data has to be contiguous in memory. |
| template <> |
| CAFFE2_CUDA_EXPORT void Gemm<float, CUDAContext>( |
| const CBLAS_TRANSPOSE trans_A, |
| const CBLAS_TRANSPOSE trans_B, |
| const int M, |
| const int N, |
| const int K, |
| const float alpha, |
| const float* A, |
| const float* B, |
| const float beta, |
| float* C, |
| CUDAContext* context, |
| TensorProto::DataType math_type) { |
| // Note that cublas follows fortran order, so the order is different from |
| // the cblas convention. |
| const int lda = (trans_A == CblasNoTrans) ? K : M; |
| const int ldb = (trans_B == CblasNoTrans) ? N : K; |
| const cublasOperation_t cu_trans_A = |
| (trans_A == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; |
| const cublasOperation_t cu_trans_B = |
| (trans_B == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; |
| CUBLAS_ENFORCE( |
| cublasSetPointerMode(context->cublas_handle(), CUBLAS_POINTER_MODE_HOST)); |
| CUBLAS_ENFORCE(cublasSgemm( |
| context->cublas_handle(), |
| cu_trans_B, |
| cu_trans_A, |
| N, |
| M, |
| K, |
| &alpha, |
| B, |
| ldb, |
| A, |
| lda, |
| &beta, |
| C, |
| N)); |
| } |
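| |
| // Why the operands are swapped above: cuBLAS works on column-major matrices, |
| // while Caffe2 tensors are row-major. A row-major M x N matrix is exactly a |
| // column-major N x M matrix, so computing C^T = B^T * A^T in column-major |
| // order yields the row-major C = A * B without any copies or transposes. |
| // That is why the call passes (cu_trans_B, cu_trans_A, N, M, K) and supplies |
| // B before A; the same trick is used by the other Gemm variants below. |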
| |
| template <> |
| CAFFE2_CUDA_EXPORT void Gemm<at::Half, CUDAContext>( |
| const CBLAS_TRANSPOSE trans_A, |
| const CBLAS_TRANSPOSE trans_B, |
| const int M, |
| const int N, |
| const int K, |
| const float alpha, |
| const at::Half* A, |
| const at::Half* B, |
| const float beta, |
| at::Half* C, |
| CUDAContext* context, |
| TensorProto::DataType math_type) { |
| // Note that cublas follows fortran order, so the order is different from |
| // the cblas convention. |
| const int lda = (trans_A == CblasNoTrans) ? K : M; |
| const int ldb = (trans_B == CblasNoTrans) ? N : K; |
| const cublasOperation_t cu_trans_A = |
| (trans_A == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; |
| const cublasOperation_t cu_trans_B = |
| (trans_B == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; |
| if (math_type == TensorProto_DataType_FLOAT) { |
| CUBLAS_ENFORCE(cublasSetPointerMode( |
| context->cublas_handle(), CUBLAS_POINTER_MODE_HOST)); |
| #if defined(USE_ROCM) |
| // rocblas does not provide a cublasSgemmEx-style API yet. It has the more |
| // general rocblas_gemm_ex API, which is closer to cublasGemmEx. Note that |
| // rocblas_gemm_ex computes D = alpha * op(A) * op(B) + beta * C, whereas |
| // cublasGemmEx computes C = alpha * op(A) * op(B) + beta * C. |
| ROCBLAS_ENFORCE(rocblas_gemm_ex( |
| context->rocblashandle(), |
| cu_trans_B, |
| cu_trans_A, |
| N, |
| M, |
| K, |
| &alpha, |
| B, |
| rocblas_datatype_f16_r, |
| ldb, |
| A, |
| rocblas_datatype_f16_r, |
| lda, |
| &beta, |
| C, |
| rocblas_datatype_f16_r, |
| N, |
| C, // D |
| rocblas_datatype_f16_r, // D type |
| N, // ldd |
| rocblas_datatype_f32_r, // compute type |
| rocblas_gemm_algo_standard, // rocblas_gemm_algo |
| 0, // solution index, reserved for future use |
| 0)); // flags, reserved for future use |
| #else |
| CUBLAS_ENFORCE(cublasSgemmEx( |
| context->cublas_handle(), |
| cu_trans_B, |
| cu_trans_A, |
| N, |
| M, |
| K, |
| &alpha, |
| B, |
| CUDA_R_16F, |
| ldb, |
| A, |
| CUDA_R_16F, |
| lda, |
| &beta, |
| C, |
| CUDA_R_16F, |
| N)); |
| #endif // USE_ROCM |
| } else if (math_type == TensorProto_DataType_FLOAT16) { |
| // convert alpha, beta from float -> __half |
| const __half alpha_fp16 = at::Half(alpha); |
| const __half beta_fp16 = at::Half(beta); |
| // call cublasHgemm |
| CUBLAS_ENFORCE(cublasSetPointerMode( |
| context->cublas_handle(), CUBLAS_POINTER_MODE_HOST)); |
| CUBLAS_ENFORCE(cublasHgemm( |
| context->cublas_handle(), |
| cu_trans_B, |
| cu_trans_A, |
| N, |
| M, |
| K, |
| reinterpret_cast<const CUBLAS_HALF_TYPE*>(&alpha_fp16), |
| reinterpret_cast<const CUBLAS_HALF_TYPE*>(B), |
| ldb, |
| reinterpret_cast<const CUBLAS_HALF_TYPE*>(A), |
| lda, |
| reinterpret_cast<const CUBLAS_HALF_TYPE*>(&beta_fp16), |
| reinterpret_cast<CUBLAS_HALF_TYPE*>(C), |
| N)); |
| } else { |
| // fail |
| CAFFE_THROW("Unsupported math type"); |
| } |
| } |
| |
| template <> |
| CAFFE2_CUDA_EXPORT void BiasCHW<float, CUDAContext>( |
| const float* bias, |
| const float* bias_multiplier, |
| const int bias_channels, |
| const int image_size, |
| float* image, |
| CUDAContext* context) { |
| Gemm<float, CUDAContext>( |
| CblasNoTrans, |
| CblasNoTrans, |
| bias_channels, |
| image_size, |
| 1, |
| 1, |
| bias, |
| bias_multiplier, |
| 1, |
| image, |
| context); |
| } |
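| |
| // BiasCHW adds a per-channel bias to a CHW image via a rank-1 GEMM: bias is |
| // treated as a (bias_channels x 1) matrix and bias_multiplier (typically a |
| // vector of ones) as a (1 x image_size) matrix, so with alpha = beta = 1 the |
| // call above computes image += bias * bias_multiplier, i.e. |
| // image[c][p] += bias[c] for every pixel p. |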
| |
| template <> |
| CAFFE2_CUDA_EXPORT void GemmBatched<float, CUDAContext>( |
| const CBLAS_TRANSPOSE trans_A, |
| const CBLAS_TRANSPOSE trans_B, |
| const int batch_size, |
| const int M, |
| const int N, |
| const int K, |
| const float alpha, |
| const float** A, |
| const float** B, |
| const float beta, |
| float** C, |
| CUDAContext* context, |
| TensorProto::DataType math_type) { |
| #if __CUDACC_VER_MAJOR__ < 8 || defined(USE_ROCM) |
| // loop over matrices in the batch |
| for (int i = 0; i < batch_size; ++i) { |
| Gemm<float, CUDAContext>( |
| trans_A, |
| trans_B, |
| M, |
| N, |
| K, |
| alpha, |
| A[i], |
| B[i], |
| beta, |
| C[i], |
| context, |
| math_type); |
| } |
| #else |
| // Note that cublas follows fortran order, so the order is different from |
| // the cblas convention. |
| const int lda = (trans_A == CblasNoTrans) ? K : M; |
| const int ldb = (trans_B == CblasNoTrans) ? N : K; |
| const int ldc = N; |
| const cublasOperation_t cu_trans_A = |
| (trans_A == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; |
| const cublasOperation_t cu_trans_B = |
| (trans_B == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; |
| thrust::device_vector<const float*> A_device(A, A + batch_size); |
| thrust::device_vector<const float*> B_device(B, B + batch_size); |
| thrust::device_vector<float*> C_device(C, C + batch_size); |
| CUBLAS_ENFORCE( |
| cublasSetPointerMode(context->cublas_handle(), CUBLAS_POINTER_MODE_HOST)); |
| CUBLAS_ENFORCE(cublasSgemmBatched( |
| context->cublas_handle(), |
| cu_trans_B, |
| cu_trans_A, |
| N, |
| M, |
| K, |
| &alpha, |
| B_device.data().get(), |
| ldb, |
| A_device.data().get(), |
| lda, |
| &beta, |
| C_device.data().get(), |
| ldc, |
| batch_size)); |
| #endif |
| } |
| |
| template <> |
| CAFFE2_CUDA_EXPORT void GemmStridedBatched<float, CUDAContext>( |
| const CBLAS_TRANSPOSE trans_A, |
| const CBLAS_TRANSPOSE trans_B, |
| const int batch_size, |
| const int M, |
| const int N, |
| const int K, |
| const float alpha, |
| const float* A, |
| const int A_stride, |
| const float* B, |
| const int B_stride, |
| const float beta, |
| float* C, |
| const int C_stride, |
| CUDAContext* context, |
| TensorProto::DataType math_type) { |
| #if __CUDACC_VER_MAJOR__ < 8 && !defined(USE_ROCM) |
| // loop over matrices in the batch |
| for (int i = 0; i < batch_size; ++i) { |
| Gemm<float, CUDAContext>( |
| trans_A, trans_B, M, N, K, alpha, A, B, beta, C, context, math_type); |
| A += A_stride; |
| B += B_stride; |
| C += C_stride; |
| } |
| #else |
| // Note that cublas follows fortran order, so the order is different from |
| // the cblas convention. |
| const int lda = (trans_A == CblasNoTrans) ? K : M; |
| const int ldb = (trans_B == CblasNoTrans) ? N : K; |
| const int ldc = N; |
| const cublasOperation_t cu_trans_A = |
| (trans_A == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; |
| const cublasOperation_t cu_trans_B = |
| (trans_B == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; |
| CUBLAS_ENFORCE( |
| cublasSetPointerMode(context->cublas_handle(), CUBLAS_POINTER_MODE_HOST)); |
| CUBLAS_ENFORCE(cublasSgemmStridedBatched( |
| context->cublas_handle(), |
| cu_trans_B, |
| cu_trans_A, |
| N, |
| M, |
| K, |
| &alpha, |
| B, |
| ldb, |
| B_stride, |
| A, |
| lda, |
| A_stride, |
| &beta, |
| C, |
| ldc, |
| C_stride, |
| batch_size)); |
| #endif |
| } |
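| |
| // Illustrative stride layout for the strided-batched call above: with |
| // batch_size contiguous row-major matrices one would typically pass |
| //   A_stride = M * K, B_stride = K * N, C_stride = M * N, |
| // so that batch i uses A + i * A_stride, B + i * B_stride and |
| // C + i * C_stride. |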
| |
| template <> |
| CAFFE2_CUDA_EXPORT void GemmBatched<at::Half, CUDAContext>( |
| const CBLAS_TRANSPOSE trans_A, |
| const CBLAS_TRANSPOSE trans_B, |
| const int batch_size, |
| const int M, |
| const int N, |
| const int K, |
| const float alpha, |
| const at::Half** A, |
| const at::Half** B, |
| const float beta, |
| at::Half** C, |
| CUDAContext* context, |
| TensorProto::DataType math_type) { |
| #if __CUDACC_VER_MAJOR__ < 9 |
| // loop over matrices in the batch |
| for (int i = 0; i < batch_size; ++i) { |
| Gemm<at::Half, CUDAContext>( |
| trans_A, |
| trans_B, |
| M, |
| N, |
| K, |
| alpha, |
| A[i], |
| B[i], |
| beta, |
| C[i], |
| context, |
| math_type); |
| } |
| #else |
| // Note that cublas follows fortran order, so the order is different from |
| // the cblas convention. |
| const int lda = (trans_A == CblasNoTrans) ? K : M; |
| const int ldb = (trans_B == CblasNoTrans) ? N : K; |
| const int ldc = N; |
| const cublasOperation_t cu_trans_A = |
| (trans_A == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; |
| const cublasOperation_t cu_trans_B = |
| (trans_B == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; |
| if (math_type == TensorProto_DataType_FLOAT) { |
| thrust::device_vector<const void*> A_device(A, A + batch_size); |
| thrust::device_vector<const void*> B_device(B, B + batch_size); |
| thrust::device_vector<void*> C_device(C, C + batch_size); |
| CUBLAS_ENFORCE(cublasSetPointerMode( |
| context->cublas_handle(), CUBLAS_POINTER_MODE_HOST)); |
| CUBLAS_ENFORCE(cublasGemmBatchedEx( |
| context->cublas_handle(), |
| cu_trans_B, |
| cu_trans_A, |
| N, |
| M, |
| K, |
| &alpha, |
| B_device.data().get(), |
| CUDA_R_16F, |
| ldb, |
| A_device.data().get(), |
| CUDA_R_16F, |
| lda, |
| &beta, |
| C_device.data().get(), |
| CUDA_R_16F, |
| ldc, |
| batch_size, |
| CUDA_R_32F, |
| CUBLAS_GEMM_DEFAULT_TENSOR_OP)); |
| } else if (math_type == TensorProto_DataType_FLOAT16) { |
| // Convert alpha, beta from float -> __half |
| const __half alpha_fp16 = at::Half(alpha); |
| const __half beta_fp16 = at::Half(beta); |
| thrust::host_vector<const __half*> A_array(batch_size); |
| thrust::host_vector<const __half*> B_array(batch_size); |
| thrust::host_vector<__half*> C_array(batch_size); |
| for (int i = 0; i < batch_size; ++i) { |
| A_array[i] = reinterpret_cast<const __half*>(A[i]); |
| B_array[i] = reinterpret_cast<const __half*>(B[i]); |
| C_array[i] = reinterpret_cast<__half*>(C[i]); |
| } |
| thrust::device_vector<const __half*> A_device( |
| A_array.cbegin(), A_array.cend()); |
| thrust::device_vector<const __half*> B_device( |
| B_array.cbegin(), B_array.cend()); |
| thrust::device_vector<__half*> C_device(C_array.cbegin(), C_array.cend()); |
| CUBLAS_ENFORCE(cublasSetPointerMode( |
| context->cublas_handle(), CUBLAS_POINTER_MODE_HOST)); |
| CUBLAS_ENFORCE(cublasHgemmBatched( |
| context->cublas_handle(), |
| cu_trans_B, |
| cu_trans_A, |
| N, |
| M, |
| K, |
| &alpha_fp16, |
| B_device.data().get(), |
| ldb, |
| A_device.data().get(), |
| lda, |
| &beta_fp16, |
| C_device.data().get(), |
| ldc, |
| batch_size)); |
| } else { |
| CAFFE_THROW("Unsupported math type"); |
| } |
| #endif |
| } |
| |
| template <> |
| CAFFE2_CUDA_EXPORT void GemmStridedBatched<at::Half, CUDAContext>( |
| const CBLAS_TRANSPOSE trans_A, |
| const CBLAS_TRANSPOSE trans_B, |
| const int batch_size, |
| const int M, |
| const int N, |
| const int K, |
| const float alpha, |
| const at::Half* A, |
| const int A_stride, |
| const at::Half* B, |
| const int B_stride, |
| const float beta, |
| at::Half* C, |
| const int C_stride, |
| CUDAContext* context, |
| TensorProto::DataType math_type) { |
| #if __CUDACC_VER_MAJOR__ < 8 && !defined(USE_ROCM) |
| // loop over matrices in the batch |
| for (int i = 0; i < batch_size; ++i) { |
| Gemm<at::Half, CUDAContext>( |
| trans_A, trans_B, M, N, K, alpha, A, B, beta, C, context, math_type); |
| A += A_stride; |
| B += B_stride; |
| C += C_stride; |
| } |
| #else |
| // Note that cublas follows fortran order, so the order is different from |
| // the cblas convention. |
| const int lda = (trans_A == CblasNoTrans) ? K : M; |
| const int ldb = (trans_B == CblasNoTrans) ? N : K; |
| const int ldc = N; |
| const cublasOperation_t cu_trans_A = |
| (trans_A == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; |
| const cublasOperation_t cu_trans_B = |
| (trans_B == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; |
| if (math_type == TensorProto_DataType_FLOAT) { |
| CUBLAS_ENFORCE(cublasSetPointerMode( |
| context->cublas_handle(), CUBLAS_POINTER_MODE_HOST)); |
| #if defined(USE_ROCM) |
| // D[i*stride_d] = alpha*op(A[i*stride_a])*op(B[i*stride_b]) + |
| // beta*C[i*stride_c], for i in [0,batch_count-1] |
| ROCBLAS_ENFORCE(rocblas_gemm_strided_batched_ex( |
| context->rocblashandle(), |
| cu_trans_B, |
| cu_trans_A, |
| N, |
| M, |
| K, |
| &alpha, |
| B, |
| rocblas_datatype_f16_r, |
| ldb, |
| B_stride, |
| A, |
| rocblas_datatype_f16_r, |
| lda, |
| A_stride, |
| &beta, |
| C, |
| rocblas_datatype_f16_r, |
| ldc, |
| C_stride, |
| C, // D |
| rocblas_datatype_f16_r, // D type |
| ldc, // ldd |
| C_stride, // D stride |
| batch_size, |
| rocblas_datatype_f32_r, // compute type |
| rocblas_gemm_algo_standard, // rocblas_gemm_algo |
| 0, // solution index, reserved for future use |
| 0)); // flags, reserved for future use |
| #else |
| CUBLAS_ENFORCE(cublasGemmStridedBatchedEx( |
| context->cublas_handle(), |
| cu_trans_B, |
| cu_trans_A, |
| N, |
| M, |
| K, |
| &alpha, |
| B, |
| CUDA_R_16F, |
| ldb, |
| B_stride, |
| A, |
| CUDA_R_16F, |
| lda, |
| A_stride, |
| &beta, |
| C, |
| CUDA_R_16F, |
| ldc, |
| C_stride, |
| batch_size, |
| CUDA_R_32F, |
| CUBLAS_GEMM_DEFAULT_TENSOR_OP)); |
| #endif // USE_ROCM |
| } else if (math_type == TensorProto_DataType_FLOAT16) { |
| // Convert alpha, beta from float -> __half |
| const __half alpha_fp16 = at::Half(alpha); |
| const __half beta_fp16 = at::Half(beta); |
| CUBLAS_ENFORCE(cublasSetPointerMode( |
| context->cublas_handle(), CUBLAS_POINTER_MODE_HOST)); |
| CUBLAS_ENFORCE(cublasHgemmStridedBatched( |
| context->cublas_handle(), |
| cu_trans_B, |
| cu_trans_A, |
| N, |
| M, |
| K, |
| reinterpret_cast<const CUBLAS_HALF_TYPE*>(&alpha_fp16), |
| reinterpret_cast<const CUBLAS_HALF_TYPE*>(B), |
| ldb, |
| B_stride, |
| reinterpret_cast<const CUBLAS_HALF_TYPE*>(A), |
| lda, |
| A_stride, |
| reinterpret_cast<const CUBLAS_HALF_TYPE*>(&beta_fp16), |
| reinterpret_cast<CUBLAS_HALF_TYPE*>(C), |
| ldc, |
| C_stride, |
| batch_size)); |
| } else { |
| CAFFE_THROW("Unsupported math type"); |
| } |
| #endif |
| } |
| |
| template <> |
| CAFFE2_CUDA_EXPORT void Gemv<float, CUDAContext>( |
| const CBLAS_TRANSPOSE trans_A, |
| const int M, |
| const int N, |
| const float alpha, |
| const float* A, |
| const float* x, |
| const float beta, |
| float* y, |
| CUDAContext* context, |
| TensorProto::DataType math_type) { |
| const cublasOperation_t cu_trans_A = |
| (trans_A == CblasNoTrans) ? CUBLAS_OP_T : CUBLAS_OP_N; |
| CUBLAS_ENFORCE( |
| cublasSetPointerMode(context->cublas_handle(), CUBLAS_POINTER_MODE_HOST)); |
| CUBLAS_ENFORCE(cublasSgemv( |
| context->cublas_handle(), |
| cu_trans_A, |
| N, |
| M, |
| &alpha, |
| A, |
| N, |
| x, |
| 1, |
| &beta, |
| y, |
| 1)); |
| } |
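| |
| // Why the transpose flag is flipped above: cuBLAS expects a column-major A, |
| // so the row-major M x N matrix is presented as a column-major N x M matrix |
| // with leading dimension N. Computing y = alpha * A * x + beta * y on the |
| // row-major A is therefore issued as a column-major GEMV on the transposed |
| // view, which is why CblasNoTrans maps to CUBLAS_OP_T and vice versa. |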
| |
| template <> |
| CAFFE2_CUDA_EXPORT void Gemv<at::Half, CUDAContext>( |
| const CBLAS_TRANSPOSE trans_A, |
| const int M, |
| const int N, |
| const float alpha, |
| const at::Half* A, |
| const at::Half* x, |
| const float beta, |
| at::Half* y, |
| CUDAContext* context, |
| TensorProto::DataType math_type) { |
| const cublasOperation_t cu_trans_A = |
| (trans_A == CblasNoTrans) ? CUBLAS_OP_T : CUBLAS_OP_N; |
| |
| // Set up the dimensions needed to call cublasSgemmEx / cublasHgemm below, |
| // treating the GEMV as a GEMM with a single output column. |
| const int m = (cu_trans_A == CUBLAS_OP_N) ? N : M; |
| const int k = (cu_trans_A == CUBLAS_OP_N) ? M : N; |
| const int lda = (cu_trans_A == CUBLAS_OP_N) ? m : k; |
| const int ldc = m; |
| |
| if (math_type == TensorProto_DataType_FLOAT) { |
| CUBLAS_ENFORCE(cublasSetPointerMode( |
| context->cublas_handle(), CUBLAS_POINTER_MODE_HOST)); |
| #if defined(USE_ROCM) |
| // rocblas does not provide a cublasSgemmEx-style API yet. It has the more |
| // general rocblas_gemm_ex API, which is closer to cublasGemmEx. Note that |
| // rocblas_gemm_ex computes D = alpha * op(A) * op(B) + beta * C, whereas |
| // cublasGemmEx computes C = alpha * op(A) * op(B) + beta * C. |
| ROCBLAS_ENFORCE(rocblas_gemm_ex( |
| context->rocblashandle(), |
| cu_trans_A, |
| rocblas_operation_none, |
| m, |
| 1, |
| k, |
| &alpha, |
| A, |
| rocblas_datatype_f16_r, |
| lda, |
| x, |
| rocblas_datatype_f16_r, |
| k, |
| &beta, |
| y, |
| rocblas_datatype_f16_r, |
| ldc, |
| y, // D |
| rocblas_datatype_f16_r, // D type |
| ldc, // ldd |
| rocblas_datatype_f32_r, // compute type |
| rocblas_gemm_algo_standard, // rocblas_gemm_algo |
| 0, // solution index, reserved for future use |
| 0)); // flags, reserved for future use |
| #else |
| CUBLAS_ENFORCE(cublasSgemmEx( |
| context->cublas_handle(), |
| cu_trans_A, |
| CUBLAS_OP_N, |
| m, |
| 1, |
| k, |
| &alpha, |
| A, |
| CUDA_R_16F, |
| lda, |
| x, |
| CUDA_R_16F, |
| k, |
| &beta, |
| y, |
| CUDA_R_16F, |
| ldc)); |
| #endif // USE_ROCM |
| } else if (math_type == TensorProto_DataType_FLOAT16) { |
| const __half alpha_fp16 = at::Half(alpha); |
| const __half beta_fp16 = at::Half(beta); |
| CUBLAS_ENFORCE(cublasSetPointerMode( |
| context->cublas_handle(), CUBLAS_POINTER_MODE_HOST)); |
| CUBLAS_ENFORCE(cublasHgemm( |
| context->cublas_handle(), |
| cu_trans_A, |
| CUBLAS_OP_N, |
| m, |
| 1, |
| k, |
| reinterpret_cast<const CUBLAS_HALF_TYPE*>(&alpha_fp16), |
| reinterpret_cast<const CUBLAS_HALF_TYPE*>(A), |
| lda, |
| reinterpret_cast<const CUBLAS_HALF_TYPE*>(x), |
| k, |
| reinterpret_cast<const CUBLAS_HALF_TYPE*>(&beta_fp16), |
| reinterpret_cast<CUBLAS_HALF_TYPE*>(y), |
| ldc)); |
| } else { |
| // fail |
| CAFFE_THROW("Unsupported math type"); |
| } |
| } |
| |
| #if !defined(USE_ROCM) |
| |
| // No behavioral change for float, but the specialization is required; defer |
| // to the default CUDA engine. |
| template <> |
| CAFFE2_CUDA_EXPORT void Gemm<float, CUDAContext, TensorCoreEngine>( |
| const CBLAS_TRANSPOSE trans_A, |
| const CBLAS_TRANSPOSE trans_B, |
| const int M, |
| const int N, |
| const int K, |
| const float alpha, |
| const float* A, |
| const float* B, |
| const float beta, |
| float* C, |
| CUDAContext* context, |
| TensorProto::DataType math_type) { |
| return Gemm<float, CUDAContext>( |
| trans_A, trans_B, M, N, K, alpha, A, B, beta, C, context, math_type); |
| } |
| |
| template <> |
| CAFFE2_CUDA_EXPORT void Gemm<at::Half, CUDAContext, TensorCoreEngine>( |
| const CBLAS_TRANSPOSE trans_A, |
| const CBLAS_TRANSPOSE trans_B, |
| const int M, |
| const int N, |
| const int K, |
| const float alpha, |
| const at::Half* A, |
| const at::Half* B, |
| const float beta, |
| at::Half* C, |
| CUDAContext* context, |
| TensorProto::DataType math_type) { |
| // Note that cublas follows fortran order, so the order is different from |
| // the cblas convention. |
| const int lda = (trans_A == CblasNoTrans) ? K : M; |
| const int ldb = (trans_B == CblasNoTrans) ? N : K; |
| const cublasOperation_t cu_trans_A = |
| (trans_A == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; |
| const cublasOperation_t cu_trans_B = |
| (trans_B == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; |
| |
| // enable TensorCore for this call on this handle |
| if (TensorCoreAvailable()) { |
| CUBLAS_ENFORCE( |
| cublasSetMathMode(context->cublas_handle(), CUBLAS_TENSOR_OP_MATH)); |
| } |
| |
| CUBLAS_ENFORCE( |
| cublasSetPointerMode(context->cublas_handle(), CUBLAS_POINTER_MODE_HOST)); |
| CUBLAS_ENFORCE(cublasGemmEx( |
| context->cublas_handle(), |
| cu_trans_B, |
| cu_trans_A, |
| N, |
| M, |
| K, |
| &alpha, |
| B, |
| CUDA_R_16F, |
| ldb, |
| A, |
| CUDA_R_16F, |
| lda, |
| &beta, |
| C, |
| CUDA_R_16F, |
| N, |
| CUDA_R_32F, |
| CUBLAS_GEMM_DFALT_TENSOR_OP)); |
| |
| // Now disable TensorCore math for subsequent calls to this handle |
| if (TensorCoreAvailable()) { |
| CUBLAS_ENFORCE( |
| cublasSetMathMode(context->cublas_handle(), CUBLAS_DEFAULT_MATH)); |
| } |
| } |
| |
| template <> |
| CAFFE2_CUDA_EXPORT void GemmBatched<float, CUDAContext, TensorCoreEngine>( |
| const CBLAS_TRANSPOSE trans_A, |
| const CBLAS_TRANSPOSE trans_B, |
| const int batch_size, |
| const int M, |
| const int N, |
| const int K, |
| const float alpha, |
| const float** A, |
| const float** B, |
| const float beta, |
| float** C, |
| CUDAContext* context, |
| TensorProto::DataType math_type) { |
| GemmBatched<float, CUDAContext, DefaultEngine>( |
| trans_A, |
| trans_B, |
| batch_size, |
| M, |
| N, |
| K, |
| alpha, |
| A, |
| B, |
| beta, |
| C, |
| context, |
| math_type); |
| } |
| |
| template <> |
| CAFFE2_CUDA_EXPORT void GemmBatched<at::Half, CUDAContext, TensorCoreEngine>( |
| const CBLAS_TRANSPOSE trans_A, |
| const CBLAS_TRANSPOSE trans_B, |
| const int batch_size, |
| const int M, |
| const int N, |
| const int K, |
| const float alpha, |
| const at::Half** A, |
| const at::Half** B, |
| const float beta, |
| at::Half** C, |
| CUDAContext* context, |
| TensorProto::DataType math_type) { |
| GemmBatched<at::Half, CUDAContext, DefaultEngine>( |
| trans_A, |
| trans_B, |
| batch_size, |
| M, |
| N, |
| K, |
| alpha, |
| A, |
| B, |
| beta, |
| C, |
| context, |
| math_type); |
| } |
| |
| template <> |
| CAFFE2_CUDA_EXPORT void |
| GemmStridedBatched<float, CUDAContext, TensorCoreEngine>( |
| const CBLAS_TRANSPOSE trans_A, |
| const CBLAS_TRANSPOSE trans_B, |
| const int batch_size, |
| const int M, |
| const int N, |
| const int K, |
| const float alpha, |
| const float* A, |
| const int A_stride, |
| const float* B, |
| const int B_stride, |
| const float beta, |
| float* C, |
| const int C_stride, |
| CUDAContext* context, |
| TensorProto::DataType math_type) { |
| GemmStridedBatched<float, CUDAContext, DefaultEngine>( |
| trans_A, |
| trans_B, |
| batch_size, |
| M, |
| N, |
| K, |
| alpha, |
| A, |
| A_stride, |
| B, |
| B_stride, |
| beta, |
| C, |
| C_stride, |
| context, |
| math_type); |
| } |
| |
| template <> |
| CAFFE2_CUDA_EXPORT void |
| GemmStridedBatched<at::Half, CUDAContext, TensorCoreEngine>( |
| const CBLAS_TRANSPOSE trans_A, |
| const CBLAS_TRANSPOSE trans_B, |
| const int batch_size, |
| const int M, |
| const int N, |
| const int K, |
| const float alpha, |
| const at::Half* A, |
| const int A_stride, |
| const at::Half* B, |
| const int B_stride, |
| const float beta, |
| at::Half* C, |
| const int C_stride, |
| CUDAContext* context, |
| TensorProto::DataType math_type) { |
| GemmStridedBatched<at::Half, CUDAContext, DefaultEngine>( |
| trans_A, |
| trans_B, |
| batch_size, |
| M, |
| N, |
| K, |
| alpha, |
| A, |
| A_stride, |
| B, |
| B_stride, |
| beta, |
| C, |
| C_stride, |
| context, |
| math_type); |
| } |
| |
| template <> |
| CAFFE2_CUDA_EXPORT void Gemv<float, CUDAContext, TensorCoreEngine>( |
| const CBLAS_TRANSPOSE trans_A, |
| const int M, |
| const int N, |
| const float alpha, |
| const float* A, |
| const float* x, |
| const float beta, |
| float* y, |
| CUDAContext* context, |
| TensorProto::DataType math_type) { |
| Gemv<float, CUDAContext, DefaultEngine>( |
| trans_A, M, N, alpha, A, x, beta, y, context, math_type); |
| } |
| |
| template <> |
| CAFFE2_CUDA_EXPORT void Gemv<at::Half, CUDAContext, TensorCoreEngine>( |
| const CBLAS_TRANSPOSE trans_A, |
| const int M, |
| const int N, |
| const float alpha, |
| const at::Half* A, |
| const at::Half* x, |
| const float beta, |
| at::Half* y, |
| CUDAContext* context, |
| TensorProto::DataType math_type) { |
| Gemv<at::Half, CUDAContext, DefaultEngine>( |
| trans_A, M, N, alpha, A, x, beta, y, context, math_type); |
| } |
| |
| #endif |
| |
| template <> |
| CAFFE2_CUDA_EXPORT void GemmEx<float, CUDAContext>( |
| const CBLAS_TRANSPOSE trans_A, |
| const CBLAS_TRANSPOSE trans_B, |
| const int M, |
| const int N, |
| const int K, |
| const float alpha, |
| const float* A, |
| const int lda, |
| const float* B, |
| const int ldb, |
| const float beta, |
| float* C, |
| const int ldc, |
| CUDAContext* context) { |
| // Note that cublas follows fortran order, so the order is different from |
| // the cblas convention. |
| const cublasOperation_t cu_trans_A = |
| (trans_A == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; |
| const cublasOperation_t cu_trans_B = |
| (trans_B == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; |
| CUBLAS_ENFORCE( |
| cublasSetPointerMode(context->cublas_handle(), CUBLAS_POINTER_MODE_HOST)); |
| CUBLAS_ENFORCE(cublasSgemm( |
| context->cublas_handle(), |
| cu_trans_B, |
| cu_trans_A, |
| N, |
| M, |
| K, |
| &alpha, |
| B, |
| ldb, |
| A, |
| lda, |
| &beta, |
| C, |
| ldc)); |
| } |
| |
| // Batched Add variants |
| namespace { |
| |
| template <typename T> |
| __global__ void AddStripedBatchKernel( |
| const int N, |
| const T* first, |
| T* Y, |
| const int stripe, |
| const int batch) { |
| for (int j = 0; j < batch; j++) { |
| const T* x = first + j * stripe; |
| CUDA_1D_KERNEL_LOOP(i, N) { |
| float tmpY = convert::To<T, float>(Y[i]); |
| tmpY += convert::To<T, float>(x[i]); |
| Y[i] = convert::To<float, T>(tmpY); |
| } |
| } |
| } |
| } // namespace |
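| |
| // Semantics of AddStripedBatchKernel, and thus of the AddStripedBatch |
| // specializations generated below (illustrative): for each output element i, |
| //   Y[i] += first[0 * stripe + i] + first[1 * stripe + i] + ... |
| //           + first[(batch - 1) * stripe + i], |
| // with the accumulation carried out in float even when T is at::Half. |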
| |
| #define CAFFE2_SPECIALIZED_CUDA_ADD_STRIPED_BATCH(T) \ |
| template <> \ |
| CAFFE2_CUDA_EXPORT void AddStripedBatch<T, CUDAContext>( \ |
| const int N, \ |
| const T* first, \ |
| T* Y, \ |
| const int stripe, \ |
| const int batch, \ |
| CUDAContext* context) { \ |
| AddStripedBatchKernel<T> \ |
| <<<CAFFE_GET_BLOCKS(N), \ |
| CAFFE_CUDA_NUM_THREADS, \ |
| 0, \ |
| context->cuda_stream()>>>(N, first, Y, stripe, batch); \ |
| C10_CUDA_KERNEL_LAUNCH_CHECK(); \ |
| } |
| |
| CAFFE2_SPECIALIZED_CUDA_ADD_STRIPED_BATCH(float); |
| CAFFE2_SPECIALIZED_CUDA_ADD_STRIPED_BATCH(at::Half); |
| #undef CAFFE2_SPECIALIZED_CUDA_ADD_STRIPED_BATCH |
| |
| namespace { |
| template <typename T> |
| __global__ void |
| UniformShift(const size_t N, const float min, const float max, T* x) { |
| float scale = max - min; |
| CUDA_1D_KERNEL_LOOP(i, N) { |
| x[i] = convert::To<float, T>(convert::To<T, float>(x[i]) * scale + min); |
| } |
| } |
| |
| __global__ void |
| UniformIntFit(const size_t N, const int min, const int max, unsigned int* x) { |
| int* x_int = reinterpret_cast<int*>(x); |
| int range = (max - min + 1); |
| CUDA_1D_KERNEL_LOOP(i, N) { |
| x_int[i] = min + static_cast<int>(x[i] % range); |
| } |
| } |
| } // namespace |
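| |
| // How the helpers above are used by the RandUniform specializations below |
| // (illustrative): curandGenerateUniform fills r with values in (0, 1], so |
| // UniformShift maps them to the requested range via x * (max - min) + min. |
| // UniformIntFit reinterprets raw 32-bit draws as integers in [min, max] via |
| // min + (x % (max - min + 1)); the modulo introduces a small bias when the |
| // range does not evenly divide 2^32, which this implementation accepts. |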
| |
| template <> |
| CAFFE2_CUDA_EXPORT void RandUniform<float, CUDAContext>( |
| const size_t n, |
| const float min, |
| const float max, |
| float* r, |
| CUDAContext* context) { |
| CURAND_ENFORCE(curandGenerateUniform(context->curand_generator(), r, n)); |
| UniformShift<float> |
| <<<CAFFE_GET_BLOCKS(n), |
| CAFFE_CUDA_NUM_THREADS, |
| 0, |
| context->cuda_stream()>>>(n, min, max, r); |
| C10_CUDA_KERNEL_LAUNCH_CHECK(); |
| } |
| |
| template <> |
| CAFFE2_CUDA_EXPORT void RandUniform<double, CUDAContext>( |
| const size_t n, |
| const double min, |
| const double max, |
| double* r, |
| CUDAContext* context) { |
| CURAND_ENFORCE( |
| curandGenerateUniformDouble(context->curand_generator(), r, n)); |
| UniformShift<double> |
| <<<CAFFE_GET_BLOCKS(n), |
| CAFFE_CUDA_NUM_THREADS, |
| 0, |
| context->cuda_stream()>>>(n, min, max, r); |
| C10_CUDA_KERNEL_LAUNCH_CHECK(); |
| } |
| |
| template <> |
| CAFFE2_CUDA_EXPORT void RandUniform<int, CUDAContext>( |
| const size_t n, |
| const int min, |
| const int max, |
| int* r, |
| CUDAContext* context) { |
| CURAND_ENFORCE(curandGenerate( |
| context->curand_generator(), reinterpret_cast<unsigned int*>(r), n)); |
| UniformIntFit<<< |
| CAFFE_GET_BLOCKS(n), |
| CAFFE_CUDA_NUM_THREADS, |
| 0, |
| context->cuda_stream()>>>( |
| n, min, max, reinterpret_cast<unsigned int*>(r)); |
| C10_CUDA_KERNEL_LAUNCH_CHECK(); |
| } |
| |
| template <typename T> |
| size_t HandleOddLengthRandGaussian( |
| const size_t n, |
| const T mean, |
| const T std, |
| T* r, |
| CUDAContext* context) { |
| if (n % 2 == 1) { |
| std::default_random_engine generator; |
| std::normal_distribution<T> distribution(mean, std); |
| const T random_value = distribution(generator); |
| Set<T, CUDAContext>(1, random_value, r + (n - 1), context); |
| return n - 1; |
| } |
| return n; |
| } |
| |
| template <> |
| CAFFE2_CUDA_EXPORT void RandGaussian<float, CUDAContext>( |
| const size_t n, |
| const float mean, |
| const float std, |
| float* r, |
| CUDAContext* context) { |
| // If n is odd, we add a random Gaussian value at the end manually |
| // and generate n-1 random values using curandGenerateNormal. |
| // curandGenerateNormal requires n to be even. |
| const size_t even_n = |
| HandleOddLengthRandGaussian<float>(n, mean, std, r, context); |
| CURAND_ENFORCE( |
| curandGenerateNormal(context->curand_generator(), r, even_n, mean, std)); |
| } |
| |
| template <> |
| CAFFE2_CUDA_EXPORT void RandGaussian<double, CUDAContext>( |
| const size_t n, |
| const double mean, |
| const double std, |
| double* r, |
| CUDAContext* context) { |
| const size_t even_n = |
| HandleOddLengthRandGaussian<double>(n, mean, std, r, context); |
| CURAND_ENFORCE(curandGenerateNormalDouble( |
| context->curand_generator(), r, even_n, mean, std)); |
| } |
| |
| template <> |
| CAFFE2_CUDA_EXPORT void Dot<float, CUDAContext>( |
| const int n, |
| const float* a, |
| const float* b, |
| float* y, |
| CUDAContext* context) { |
| CUBLAS_ENFORCE(cublasSetPointerMode( |
| context->cublas_handle(), CUBLAS_POINTER_MODE_DEVICE)); |
| CUBLAS_ENFORCE(cublasSdot(context->cublas_handle(), n, a, 1, b, 1, y)); |
| } |
| |
| template <> |
| CAFFE2_CUDA_EXPORT void Dot<at::Half, CUDAContext>( |
| const int n, |
| const at::Half* a, |
| const at::Half* b, |
| at::Half* y, |
| CUDAContext* context) { |
| #if defined(USE_ROCM) && (TORCH_HIP_VERSION < 210) |
| CAFFE_THROW("HIP currently does not support FP16 completely yet."); |
| #elif defined(USE_ROCM) && (TORCH_HIP_VERSION >= 210) |
| CUBLAS_ENFORCE(cublasSetPointerMode( |
| context->cublas_handle(), CUBLAS_POINTER_MODE_DEVICE)); |
| CUBLAS_ENFORCE(rocblas_hdot( |
| context->cublas_handle(), |
| n, |
| reinterpret_cast<const rocblas_half*>(a), |
| 1, |
| reinterpret_cast<const rocblas_half*>(b), |
| 1, |
| reinterpret_cast<rocblas_half*>(y))); |
| #else |
| // execute with 32-bit math |
| CUBLAS_ENFORCE(cublasSetPointerMode( |
| context->cublas_handle(), CUBLAS_POINTER_MODE_DEVICE)); |
| CUBLAS_ENFORCE(cublasDotEx( |
| context->cublas_handle(), |
| n, |
| a, |
| CUDA_R_16F, |
| 1, |
| b, |
| CUDA_R_16F, |
| 1, |
| y, |
| CUDA_R_16F, |
| CUDA_R_32F)); |
| #endif |
| } |
| |
| // A previous version of caffe2 used Thrust, but Thrust reductions perform an |
| // implicit scratch-space allocation and deallocation, which may interfere |
| // with NCCL and create a deadlock. Hence we use a custom reduction here. |
| #define SUM_KERNEL_NTHREADS 128 |
| template <typename T> |
| __global__ void SumKernel(const int N, const T* X, T* Y, bool square) { |
| const int idx = threadIdx.x; |
| __shared__ float reduction_buffer[SUM_KERNEL_NTHREADS]; |
| |
| reduction_buffer[idx] = 0; |
| |
| // A multilevel reduction. |
| // N -> 128 |
| if (!square) { |
| for (int i = idx; i < N; i += SUM_KERNEL_NTHREADS) { |
| reduction_buffer[idx] += convert::To<T, float>(X[i]); |
| } |
| } else { |
| for (int i = idx; i < N; i += SUM_KERNEL_NTHREADS) { |
| float Xi = convert::To<T, float>(X[i]); |
| reduction_buffer[idx] += Xi * Xi; |
| } |
| } |
| __syncthreads(); |
| // 128 -> 32 |
| if (idx < 32) { |
| reduction_buffer[idx] += reduction_buffer[idx + 32] + |
| reduction_buffer[idx + 64] + reduction_buffer[idx + 96]; |
| } |
| __syncthreads(); |
| // 32 -> 1 |
| if (idx == 0) { |
| float tmp = 0; |
| for (int i = 0; i < 32; ++i) { |
| tmp += reduction_buffer[i]; |
| } |
| *Y = convert::To<float, T>(tmp); |
| } |
| } |
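| |
| // Reduction shape of the kernel above (illustrative): a single block of 128 |
| // threads strides over the N inputs, so that reduction_buffer[idx] holds the |
| // sum of X[i] (or X[i] * X[i] when square is true) over all i with |
| // i % 128 == idx. The 128 partial sums are then folded to 32 and finally to |
| // a single value by thread 0, always accumulating in float. |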
| |
| // According to the benchmark script |
| // caffe2/caffe2/experiments/python/device_reduce_sum_bench.py, |
| // cub's device reduce is slower than the custom SumKernel for N <= 10000. |
| #define DEVICE_REDUCE_SIZE_THRESHOLD 10000 |
| |
| namespace { |
| |
| template <typename T> |
| __global__ void SumConvertKernel(float* sum, T* dest) { |
| *dest = convert::To<float, T>(*sum); |
| } |
| |
| template <typename T, typename IterT> |
| CAFFE2_CUDA_EXPORT void SumGenericIter( |
| const int N, |
| IterT it, |
| T*& dest, |
| CUDAContext* context, |
| Tensor* scratch_ptr) { |
| size_t memRequired = 0; |
| cub::DeviceReduce::Sum( |
| nullptr, memRequired, it, dest, N, context->cuda_stream()); |
| auto buffer_size = |
| static_cast<int64_t>((memRequired + sizeof(T) - 1) / sizeof(T)); |
| if (!dest) { |
| // allocate one more T at the end of scratch for dest |
| scratch_ptr->Resize(std::vector<int64_t>{buffer_size + 1}); |
| dest = scratch_ptr->template mutable_data<T>() + buffer_size; |
| } else { |
| scratch_ptr->Resize(std::vector<int64_t>{buffer_size}); |
| } |
| cub::DeviceReduce::Sum( |
| static_cast<void*>(scratch_ptr->template mutable_data<T>()), |
| memRequired, |
| it, |
| dest, |
| N, |
| context->cuda_stream()); |
| } |
| } // namespace |
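| |
| // Scratch layout used by SumGenericIter above (illustrative): when dest is |
| // null, the scratch tensor is sized to buffer_size + 1 elements of T and dest |
| // points at the final element, so cub's temporary storage (the first |
| // buffer_size elements) and the reduction result share one allocation. |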
| |
| template <> |
| CAFFE2_CUDA_EXPORT void Sum<float, CUDAContext>( |
| const int N, |
| const float* x, |
| float* y, |
| CUDAContext* context, |
| Tensor* scratch_ptr) { |
| if (scratch_ptr && N > DEVICE_REDUCE_SIZE_THRESHOLD) { |
| SumGenericIter<float>(N, x, y, context, scratch_ptr); |
| } else { |
| SumKernel<<<1, SUM_KERNEL_NTHREADS, 0, context->cuda_stream()>>>( |
| N, x, y, false); |
| C10_CUDA_KERNEL_LAUNCH_CHECK(); |
| } |
| } |
| |
| template <> |
| CAFFE2_CUDA_EXPORT void Sum<int32_t, CUDAContext>( |
| const int N, |
| const int32_t* x, |
| int32_t* y, |
| CUDAContext* context, |
| Tensor* scratch_ptr) { |
| if (scratch_ptr && N > DEVICE_REDUCE_SIZE_THRESHOLD) { |
| SumGenericIter<int32_t>(N, x, y, context, scratch_ptr); |
| } else { |
| SumKernel<<<1, SUM_KERNEL_NTHREADS, 0, context->cuda_stream()>>>( |
| N, x, y, false); |
| C10_CUDA_KERNEL_LAUNCH_CHECK(); |
| } |
| } |
| |
| namespace { |
| template <typename T> |
| struct FloatTransform { |
| inline __host__ __device__ float operator()(const T v) const { |
| return convert::To<T, float>(v); |
| } |
| }; |
| } // namespace |
| |
| #define CAFFE2_MATH_SUM_FUNC(T) \ |
| template <> \ |
| CAFFE2_CUDA_EXPORT void Sum<T, CUDAContext>( \ |
| const int N, \ |
| const T* x, \ |
| T* y, \ |
| CUDAContext* context, \ |
| Tensor* scratch_ptr) { \ |
| if (scratch_ptr && N > DEVICE_REDUCE_SIZE_THRESHOLD) { \ |
| FloatTransform<T> transform; \ |
| cub::TransformInputIterator<float, FloatTransform<T>, const T*> it( \ |
| x, transform); \ |
| float* sum = nullptr; \ |
| SumGenericIter<float>(N, it, sum, context, scratch_ptr); \ |
| SumConvertKernel<<<1, 1, 0, context->cuda_stream()>>>(sum, y); \ |
| C10_CUDA_KERNEL_LAUNCH_CHECK(); \ |
| } else { \ |
| SumKernel<<<1, SUM_KERNEL_NTHREADS, 0, context->cuda_stream()>>>( \ |
| N, x, y, false); \ |
| C10_CUDA_KERNEL_LAUNCH_CHECK(); \ |
| } \ |
| } |
| |
| CAFFE2_MATH_SUM_FUNC(at::Half) |
| #undef CAFFE2_MATH_SUM_FUNC |
| |
| namespace { |
| template <typename T> |
| struct SqrTransform { |
| inline __host__ __device__ T operator()(const T v) const { |
| return v * v; |
| } |
| }; |
| } // namespace |
| |
| template <> |
| CAFFE2_CUDA_EXPORT void SumSqr<float, CUDAContext>( |
| const int N, |
| const float* x, |
| float* y, |
| CUDAContext* context, |
| Tensor* scratch_ptr) { |
| if (scratch_ptr && N > DEVICE_REDUCE_SIZE_THRESHOLD) { |
| SqrTransform<float> transform; |
| cub::TransformInputIterator<float, SqrTransform<float>, const float*> it( |
| x, transform); |
| SumGenericIter<float>(N, it, y, context, scratch_ptr); |
| } else { |
| SumKernel<<<1, SUM_KERNEL_NTHREADS, 0, context->cuda_stream()>>>( |
| N, x, y, true); |
| C10_CUDA_KERNEL_LAUNCH_CHECK(); |
| } |
| } |
| |
| #define CAFFE2_MATH_SUMSQR_FUNC(T) \ |
| template <> \ |
| CAFFE2_CUDA_EXPORT void SumSqr<T, CUDAContext>( \ |
| const int N, \ |
| const T* x, \ |
| T* y, \ |
| CUDAContext* context, \ |
| Tensor* scratch_ptr) { \ |
| if (scratch_ptr && N > DEVICE_REDUCE_SIZE_THRESHOLD) { \ |
| FloatTransform<T> float_transform; \ |
| cub::TransformInputIterator<float, FloatTransform<T>, const T*> \ |
| float_it(x, float_transform); \ |
| SqrTransform<float> sqr_transform; \ |
| cub::TransformInputIterator< \ |
| float, \ |
| SqrTransform<float>, \ |
| decltype(float_it)> \ |
| it(float_it, sqr_transform); \ |
| float* sum = nullptr; \ |
| SumGenericIter<float>(N, it, sum, context, scratch_ptr); \ |
| SumConvertKernel<<<1, 1, 0, context->cuda_stream()>>>(sum, y); \ |
| C10_CUDA_KERNEL_LAUNCH_CHECK(); \ |
| } else { \ |
| SumKernel<<<1, SUM_KERNEL_NTHREADS, 0, context->cuda_stream()>>>( \ |
| N, x, y, true); \ |
| C10_CUDA_KERNEL_LAUNCH_CHECK(); \ |
| } \ |
| } |
| |
| CAFFE2_MATH_SUMSQR_FUNC(at::Half) |
| #undef CAFFE2_MATH_SUMSQR_FUNC |
| #undef DEVICE_REDUCE_SIZE_THRESHOLD |
| |
| namespace { |
| template <typename T> |
| __global__ void |
| SelectKernel(const int N, const int D, const T* x, const int* idx, T* y) { |
| CUDA_1D_KERNEL_LOOP(i, N) { |
| y[i] = x[i * D + idx[i]]; |
| } |
| } |
| } // namespace |
| |
| template <> |
| CAFFE2_CUDA_EXPORT void Select<float, CUDAContext>( |
| const int N, |
| const int D, |
| const float* x, |
| const int* idx, |
| float* y, |
| CUDAContext* context) { |
| SelectKernel<float> |
| <<<CAFFE_GET_BLOCKS(N), |
| CAFFE_CUDA_NUM_THREADS, |
| 0, |
| context->cuda_stream()>>>(N, D, x, idx, y); |
| C10_CUDA_KERNEL_LAUNCH_CHECK(); |
| } |
| |
| template <> |
| CAFFE2_CUDA_EXPORT void Select<at::Half, CUDAContext>( |
| const int N, |
| const int D, |
| const at::Half* x, |
| const int* idx, |
| at::Half* y, |
| CUDAContext* context) { |
| SelectKernel<at::Half> |
| <<<CAFFE_GET_BLOCKS(N), |
| CAFFE_CUDA_NUM_THREADS, |
| 0, |
| context->cuda_stream()>>>(N, D, x, idx, y); |
| C10_CUDA_KERNEL_LAUNCH_CHECK(); |
| } |
| |
| namespace { |
| |
| template <typename T> |
| __global__ void Im2ColNCHWCUDAKernel( |
| const int n, |
| const int input_h, |
| const int input_w, |
| const int kernel_h, |
| const int kernel_w, |
| const int dilation_h, |
| const int dilation_w, |
| const int pad_t, |
| const int pad_l, |
| const int stride_h, |
| const int stride_w, |
| const int output_h, |
| const int output_w, |
| const T* img_data, |
| T* col_data) { |
| CUDA_1D_KERNEL_LOOP(index, n) { |
| const int w_out = index % output_w; |
| const int h_index = index / output_w; |
| const int h_out = h_index % output_h; |
| const int channel_in = h_index / output_h; |
| const int channel_out = channel_in * kernel_h * kernel_w; |
| const int h_in = h_out * stride_h - pad_t; |
| const int w_in = w_out * stride_w - pad_l; |
| const int output_size = output_h * output_w; |
| T* col_data_ptr = |
| col_data + (channel_out * output_h + h_out) * output_w + w_out; |
| const T* img_data_ptr = |
| img_data + (channel_in * input_h + h_in) * input_w + w_in; |
| int dh = 0; |
| for (int i = 0; i < kernel_h; ++i) { |
| int dw = 0; |
| for (int j = 0; j < kernel_w; ++j) { |
| const int h = h_in + dh; |
| const int w = w_in + dw; |
| #if __CUDA_ARCH__ >= 350 || defined(USE_ROCM) |
| *col_data_ptr = utils::IsAGeZeroAndALtB(h, input_h) && |
| utils::IsAGeZeroAndALtB(w, input_w) |
| ? __ldg(img_data_ptr + dh * input_w + dw) |
| : 0; |
| #else |
| *col_data_ptr = utils::IsAGeZeroAndALtB(h, input_h) && |
| utils::IsAGeZeroAndALtB(w, input_w) |
| ? img_data_ptr[dh * input_w + dw] |
| : 0; |
| #endif |
| col_data_ptr += output_size; |
| dw += dilation_w; |
| } |
| dh += dilation_h; |
| } |
| } |
| } |
| |
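// Im2ColNHWCCUDAKernel: NHWC variant; one thread per (output_y, output_x,
// channel) triple. Column entries for a given output location are laid out as
// kernel_h * kernel_w contiguous groups of `channels`, so the write pointer
// advances by `channels` per kernel tap.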
| template <typename T> |
| __global__ void Im2ColNHWCCUDAKernel( |
| const int n, |
| const int input_h, |
| const int input_w, |
| const int kernel_h, |
| const int kernel_w, |
| const int dilation_h, |
| const int dilation_w, |
| const int pad_t, |
| const int pad_l, |
| const int stride_h, |
| const int stride_w, |
| const int output_w, |
| const int channels, |
| const T* img_data, |
| T* col_data) { |
| CUDA_1D_KERNEL_LOOP(index, n) { |
| const int channel_in = index % channels; |
| const int w_out = index / channels % output_w; |
| const int h_out = index / channels / output_w; |
| const int h_in = h_out * stride_h - pad_t; |
| const int w_in = w_out * stride_w - pad_l; |
| T* col_data_ptr = col_data + |
| (h_out * output_w + w_out) * channels * kernel_h * kernel_w + |
| channel_in; |
| int dh = 0; |
| for (int i = 0; i < kernel_h; ++i) { |
| int dw = 0; |
| for (int j = 0; j < kernel_w; ++j) { |
| const int h = h_in + dh; |
| const int w = w_in + dw; |
| #if __CUDA_ARCH__ >= 350 || defined(USE_ROCM) |
| *col_data_ptr = utils::IsAGeZeroAndALtB(h, input_h) && |
| utils::IsAGeZeroAndALtB(w, input_w) |
| ? __ldg(img_data + (h * input_w + w) * channels + channel_in) |
| : 0; |
| #else |
| *col_data_ptr = utils::IsAGeZeroAndALtB(h, input_h) && |
| utils::IsAGeZeroAndALtB(w, input_w) |
| ? img_data[(h * input_w + w) * channels + channel_in] |
| : 0; |
| #endif |
| col_data_ptr += channels; |
| dw += dilation_w; |
| } |
| dh += dilation_h; |
| } |
| } |
| } |
| |
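// Col2ImNCHWCUDAKernel: inverse of im2col. One thread per input pixel; it
// accumulates every column entry whose (dilated) patch covers that pixel.
// dpatch_h / dpatch_w are the dilated patch extents used to bound the range
// of contributing output positions.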
| template <typename T> |
| __global__ void Col2ImNCHWCUDAKernel( |
| const int n, |
| const int input_h, |
| const int input_w, |
| const int patch_h, |
| const int patch_w, |
| const int dilation_h, |
| const int dilation_w, |
| const int pad_t, |
| const int pad_l, |
| const int stride_h, |
| const int stride_w, |
| const int output_h, |
| const int output_w, |
| const T* col_data, |
| T* img_data) { |
| const int dpatch_h = dilation_h * (patch_h - 1) + 1; |
| const int dpatch_w = dilation_w * (patch_w - 1) + 1; |
| |
| CUDA_1D_KERNEL_LOOP(index, n) { |
| T val = 0; |
| const int w = index % input_w + pad_l; |
| const int h = index / input_w % input_h + pad_t; |
| const int c = index / (input_h * input_w); |
| |
// Compute the range of output positions whose dilated patches cover this
// input pixel.
| const int w_col_start = (w < dpatch_w) ? 0 : (w - dpatch_w) / stride_w + 1; |
| const int w_col_end = min(w / stride_w + 1, output_w); |
| const int h_col_start = (h < dpatch_h) ? 0 : (h - dpatch_h) / stride_h + 1; |
| const int h_col_end = min(h / stride_h + 1, output_h); |
| |
| for (int h_col = h_col_start; h_col < h_col_end; ++h_col) { |
| for (int w_col = w_col_start; w_col < w_col_end; ++w_col) { |
| int h_k = (h - h_col * stride_h); |
| int w_k = (w - w_col * stride_w); |
| if (h_k % dilation_h == 0 && w_k % dilation_w == 0) { |
| h_k /= dilation_h; |
| w_k /= dilation_w; |
| const int col_data_index = |
| (((c * patch_h + h_k) * patch_w + w_k) * output_h + h_col) * |
| output_w + |
| w_col; |
| #if __CUDA_ARCH__ >= 350 || defined(USE_ROCM) |
| val += __ldg(col_data + col_data_index); |
| #else |
| val += col_data[col_data_index]; |
| #endif |
| } |
| } |
| } |
| img_data[index] = val; |
| } |
| } |
| |
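// Col2ImNHWCCUDAKernel: NHWC counterpart of the kernel above; indexing into
// col_data uses channels_col = patch_h * patch_w * channels entries per
// output location.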
| template <typename T> |
| __global__ void Col2ImNHWCCUDAKernel( |
| const int n, |
| const int input_w, |
| const int channels, |
| const int patch_h, |
| const int patch_w, |
| const int dilation_h, |
| const int dilation_w, |
| const int pad_t, |
| const int pad_l, |
| const int stride_h, |
| const int stride_w, |
| const int output_h, |
| const int output_w, |
| const T* col_data, |
| T* img_data) { |
| const int dpatch_h = dilation_h * (patch_h - 1) + 1; |
| const int dpatch_w = dilation_w * (patch_w - 1) + 1; |
| |
| CUDA_1D_KERNEL_LOOP(index, n) { |
| T val = 0; |
| const int c = index % channels; |
| const int w = index / channels % input_w + pad_l; |
| const int h = index / channels / input_w + pad_t; |
// Compute the range of output positions whose dilated patches cover this
// input pixel.
| const int w_col_start = (w < dpatch_w) ? 0 : (w - dpatch_w) / stride_w + 1; |
| const int w_col_end = min(w / stride_w + 1, output_w); |
| const int h_col_start = (h < dpatch_h) ? 0 : (h - dpatch_h) / stride_h + 1; |
| const int h_col_end = min(h / stride_h + 1, output_h); |
| const int channels_col = patch_h * patch_w * channels; |
| |
| for (int h_col = h_col_start; h_col < h_col_end; ++h_col) { |
| for (int w_col = w_col_start; w_col < w_col_end; ++w_col) { |
| int h_k = h - h_col * stride_h; |
| int w_k = w - w_col * stride_w; |
| if (h_k % dilation_h == 0 && w_k % dilation_w == 0) { |
| h_k /= dilation_h; |
| w_k /= dilation_w; |
| const int c_col = (h_k * patch_w + w_k) * channels + c; |
| #if __CUDA_ARCH__ >= 350 || defined(USE_ROCM) |
| val += __ldg( |
| col_data + (h_col * output_w + w_col) * channels_col + c_col); |
| #else |
| val += col_data[(h_col * output_w + w_col) * channels_col + c_col]; |
| #endif |
| } |
| } |
| } |
| img_data[index] = val; |
| } |
| } |
| |
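// Im2ColNdNCHWCUDAKernel: N-dimensional NCHW im2col/col2im in one kernel,
// selected at compile time by kCol2Im. Each block handles one "outer" index
// (channel x kernel offset) and each thread strides over the "inner" spatial
// positions of the column buffer. In col2im mode the scatter into the image
// uses gpu_atomic_add, since several column entries can map to the same
// image element.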
| template <typename T, int N, bool kCol2Im> |
| __global__ void Im2ColNdNCHWCUDAKernel( |
| const int outer_size, |
| const int inner_size, |
| const int kernel_size, |
| SimpleArray<int, N + 1> img_shape, |
| SimpleArray<int, N + 1> col_shape, |
| SimpleArray<int, N> kernel_shape, |
| SimpleArray<int, N> stride, |
| SimpleArray<int, N> dilation, |
| SimpleArray<int, N> pad, |
| const T* X_data, |
| T* Y_data) { |
| int d_offset[N]; |
| int d_iter[N]; |
| for (int i = blockIdx.x; i < outer_size; i += gridDim.x) { |
| int offset_i = i; |
| #pragma unroll |
| for (int d_i = N - 1; d_i >= 0; --d_i) { |
| d_offset[d_i] = offset_i % kernel_shape.data[d_i]; |
| offset_i /= kernel_shape.data[d_i]; |
| } |
| for (int j = threadIdx.x; j < inner_size; j += blockDim.x) { |
| int offset_j = j; |
| #pragma unroll |
| for (int d_i = N - 1; d_i >= 0; --d_i) { |
| d_iter[d_i] = offset_j % col_shape.data[d_i + 1]; |
| offset_j /= col_shape.data[d_i + 1]; |
| } |
| const int col_index = i * inner_size + j; |
| int img_index = i / kernel_size; |
| bool is_padding = false; |
| #pragma unroll |
| for (int d_i = 0; d_i < N; ++d_i) { |
| const int d_img = d_iter[d_i] * stride.data[d_i] - pad.data[d_i] + |
| d_offset[d_i] * dilation.data[d_i]; |
| is_padding |= !utils::IsAGeZeroAndALtB(d_img, img_shape.data[d_i + 1]); |
| img_index = img_index * img_shape.data[d_i + 1] + d_img; |
| } |
| #if __CUDA_ARCH__ >= 350 || defined(USE_ROCM) |
| if (!kCol2Im) { |
| Y_data[col_index] = is_padding ? 0 : __ldg(X_data + img_index); |
| } else if (!is_padding) { |
| gpu_atomic_add(Y_data + img_index, __ldg(X_data + col_index)); |
| } |
| #else |
| if (!kCol2Im) { |
| Y_data[col_index] = is_padding ? 0 : X_data[img_index]; |
| } else if (!is_padding) { |
| gpu_atomic_add(Y_data + img_index, X_data[col_index]); |
| } |
| #endif |
| } |
| } |
| } |
| |
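// Im2ColNdNCHWCUDAImpl: host-side launcher. The shape/stride/dilation/pad
// arrays are copied into SimpleArray values so they can be passed to the
// kernel by value, then the N-dimensional kernel is launched in im2col mode.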
| template <typename T, int N> |
| CAFFE2_CUDA_EXPORT void Im2ColNdNCHWCUDAImpl( |
| const int img_size, |
| const int col_size, |
| const int* img_shape, |
| const int* col_shape, |
| const int* kernel_shape, |
| const int* stride, |
| const int* dilation, |
| const int* pad, |
| const float* img_data, |
| float* col_data, |
| CUDAContext* context) { |
| const int outer_size = col_shape[0]; |
| const int inner_size = col_size / outer_size; |
| const int kernel_size = std::accumulate( |
| kernel_shape, kernel_shape + N, 1, std::multiplies<int>()); |
| SimpleArray<int, N + 1> img_shape_array; |
| SimpleArray<int, N + 1> col_shape_array; |
| SimpleArray<int, N> kernel_shape_array; |
| SimpleArray<int, N> stride_array; |
| SimpleArray<int, N> dilation_array; |
| SimpleArray<int, N> pad_array; |
| std::memcpy(img_shape_array.data, img_shape, (N + 1) * sizeof(int)); |
| std::memcpy(col_shape_array.data, col_shape, (N + 1) * sizeof(int)); |
| std::memcpy(kernel_shape_array.data, kernel_shape, N * sizeof(int)); |
| std::memcpy(stride_array.data, stride, N * sizeof(int)); |
| std::memcpy(dilation_array.data, dilation, N * sizeof(int)); |
| std::memcpy(pad_array.data, pad, N * sizeof(int)); |
| Im2ColNdNCHWCUDAKernel<T, N, false> |
| <<<std::min(outer_size, CAFFE_MAXIMUM_NUM_BLOCKS), |
| CAFFE_CUDA_NUM_THREADS, |
| 0, |
| context->cuda_stream()>>>( |
| outer_size, |
| inner_size, |
| kernel_size, |
| img_shape_array, |
| col_shape_array, |
| kernel_shape_array, |
| stride_array, |
| dilation_array, |
| pad_array, |
| img_data, |
| col_data); |
| C10_CUDA_KERNEL_LAUNCH_CHECK(); |
| } |
| |
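// Col2ImNdNCHWCUDAImpl: same setup as above, but zero-fills the output image
// first and runs the kernel in col2im mode (kCol2Im = true), which
// accumulates into img_data.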
| template <typename T, int N> |
| CAFFE2_CUDA_EXPORT void Col2ImNdNCHWCUDAImpl( |
| const int img_size, |
| const int col_size, |
| const int* img_shape, |
| const int* col_shape, |
| const int* kernel_shape, |
| const int* stride, |
| const int* dilation, |
| const int* pad, |
| const float* col_data, |
| float* img_data, |
| CUDAContext* context) { |
| const int outer_size = col_shape[0]; |
| const int inner_size = col_size / outer_size; |
| const int kernel_size = std::accumulate( |
| kernel_shape, kernel_shape + N, 1, std::multiplies<int>()); |
| SimpleArray<int, N + 1> img_shape_array; |
| SimpleArray<int, N + 1> col_shape_array; |
| SimpleArray<int, N> kernel_shape_array; |
| SimpleArray<int, N> stride_array; |
| SimpleArray<int, N> dilation_array; |
| SimpleArray<int, N> pad_array; |
| std::memcpy(img_shape_array.data, img_shape, (N + 1) * sizeof(int)); |
| std::memcpy(col_shape_array.data, col_shape, (N + 1) * sizeof(int)); |
| std::memcpy(kernel_shape_array.data, kernel_shape, N * sizeof(int)); |
| std::memcpy(stride_array.data, stride, N * sizeof(int)); |
| std::memcpy(dilation_array.data, dilation, N * sizeof(int)); |
| std::memcpy(pad_array.data, pad, N * sizeof(int)); |
| Set<T, CUDAContext>(img_size, 0, img_data, context); |
| Im2ColNdNCHWCUDAKernel<T, N, true> |
| <<<std::min(outer_size, CAFFE_MAXIMUM_NUM_BLOCKS), |
| CAFFE_CUDA_NUM_THREADS, |
| 0, |
| context->cuda_stream()>>>( |
| outer_size, |
| inner_size, |
| kernel_size, |
| img_shape_array, |
| col_shape_array, |
| kernel_shape_array, |
| stride_array, |
| dilation_array, |
| pad_array, |
| col_data, |
| img_data); |
| C10_CUDA_KERNEL_LAUNCH_CHECK(); |
| } |
| |
| } // namespace |
| |
| template <> |
| CAFFE2_CUDA_EXPORT void Im2Col<float, CUDAContext, StorageOrder::NCHW>( |
| const int channels, |
| const int height, |
| const int width, |
| const int kernel_h, |
| const int kernel_w, |
| const int dilation_h, |
| const int dilation_w, |
| const int pad_t, |
| const int pad_l, |
| const int pad_b, |
| const int pad_r, |
| const int stride_h, |
| const int stride_w, |
| const float* img_data, |
| float* col_data, |
| CUDAContext* context, |
| const int /* groups */) { |
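// In NCHW, the number of groups doesn't affect Im2Col.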
| const int dkernel_h = dilation_h * (kernel_h - 1) + 1; |
| const int dkernel_w = dilation_w * (kernel_w - 1) + 1; |
| const int output_h = (height + pad_t + pad_b - dkernel_h) / stride_h + 1; |
| const int output_w = (width + pad_l + pad_r - dkernel_w) / stride_w + 1; |
| const int num_kernels = channels * output_h * output_w; |
| Im2ColNCHWCUDAKernel<float> |
| <<<CAFFE_GET_BLOCKS(num_kernels), |
| CAFFE_CUDA_NUM_THREADS, |
| 0, |
| context->cuda_stream()>>>( |
| num_kernels, |
| height, |
| width, |
| kernel_h, |
| kernel_w, |
| dilation_h, |
| dilation_w, |
| pad_t, |
| pad_l, |
| stride_h, |
| stride_w, |
| output_h, |
| output_w, |
| img_data, |
| col_data); |
| C10_CUDA_KERNEL_LAUNCH_CHECK(); |
| } |
| |
| template <> |
| CAFFE2_CUDA_EXPORT void Im2Col<float, CUDAContext, StorageOrder::NHWC>( |
| const int channels, |
| const int height, |
| const int width, |
| const int kernel_h, |
| const int kernel_w, |
| const int dilation_h, |
| const int dilation_w, |
| const int pad_t, |
| const int pad_l, |
| const int pad_b, |
| const int pad_r, |
| const int stride_h, |
| const int stride_w, |
| const float* img_data, |
| float* col_data, |
| CUDAContext* context, |
| const int groups) { |
| CAFFE_ENFORCE_EQ(groups, 1, "groups must be 1 for GPU NHWC Im2Col"); |
| |
| const int dkernel_h = dilation_h * (kernel_h - 1) + 1; |
| const int dkernel_w = dilation_w * (kernel_w - 1) + 1; |
| const int output_h = (height + pad_t + pad_b - dkernel_h) / stride_h + 1; |
| const int output_w = (width + pad_l + pad_r - dkernel_w) / stride_w + 1; |
| const int num_kernels = output_h * output_w * channels; |
| Im2ColNHWCCUDAKernel<float> |
| <<<CAFFE_GET_BLOCKS(num_kernels), |
| CAFFE_CUDA_NUM_THREADS, |
| 0, |
| context->cuda_stream()>>>( |
| num_kernels, |
| height, |
| width, |
| kernel_h, |
| kernel_w, |
| dilation_h, |
| dilation_w, |
| pad_t, |
| pad_l, |
| stride_h, |
| stride_w, |
| output_w, |
| channels, |
| img_data, |
| col_data); |
| C10_CUDA_KERNEL_LAUNCH_CHECK(); |
| } |
| |
| template <> |
| CAFFE2_CUDA_EXPORT void Col2Im<float, CUDAContext, StorageOrder::NCHW>( |
| const int channels, |
| const int height, |
| const int width, |
| const int kernel_h, |
| const int kernel_w, |
| const int dilation_h, |
| const int dilation_w, |
| const int pad_t, |
| const int pad_l, |
| const int pad_b, |
| const int pad_r, |
| const int stride_h, |
| const int stride_w, |
| const float* col_data, |
| float* img_data, |
| CUDAContext* context, |
| const int /* groups */) { |
| // In NCHW, the number of groups doesn't affect Col2Im. |
| const int dkernel_h = dilation_h * (kernel_h - 1) + 1; |
| const int dkernel_w = dilation_w * (kernel_w - 1) + 1; |
| const int output_h = (height + pad_t + pad_b - dkernel_h) / stride_h + 1; |
| const int output_w = (width + pad_l + pad_r - dkernel_w) / stride_w + 1; |
| const int num_kernels = channels * height * width; |
| Col2ImNCHWCUDAKernel<float> |
| <<<CAFFE_GET_BLOCKS(num_kernels), |
| CAFFE_CUDA_NUM_THREADS, |
| 0, |
| context->cuda_stream()>>>( |
| num_kernels, |
| height, |
| width, |
| kernel_h, |
| kernel_w, |
| dilation_h, |
| dilation_w, |
| pad_t, |
| pad_l, |
| stride_h, |
| stride_w, |
| output_h, |
| output_w, |
| col_data, |
| img_data); |
| C10_CUDA_KERNEL_LAUNCH_CHECK(); |
| } |
| |
| template <> |
| CAFFE2_CUDA_EXPORT void Col2Im<float, CUDAContext, StorageOrder::NHWC>( |
| const int channels, |
| const int height, |
| const int width, |
| const int kernel_h, |
| const int kernel_w, |
| const int dilation_h, |
| const int dilation_w, |
| const int pad_t, |
| const int pad_l, |
| const int pad_b, |
| const int pad_r, |
| const int stride_h, |
| const int stride_w, |
| const float* col_data, |
| float* img_data, |
| CUDAContext* context, |
| const int groups) { |
| CAFFE_ENFORCE_EQ(groups, 1, "groups must be 1 for GPU NHWC Col2Im"); |
| |
| const int dkernel_h = dilation_h * (kernel_h - 1) + 1; |
| const int dkernel_w = dilation_w * (kernel_w - 1) + 1; |
| const int output_h = (height + pad_t + pad_b - dkernel_h) / stride_h + 1; |
| const int output_w = (width + pad_l + pad_r - dkernel_w) / stride_w + 1; |
| const int num_kernels = height * width * channels; |
| Col2ImNHWCCUDAKernel<float> |
| <<<CAFFE_GET_BLOCKS(num_kernels), |
| CAFFE_CUDA_NUM_THREADS, |
| 0, |
| context->cuda_stream()>>>( |
| num_kernels, |
| width, |
| channels, |
| kernel_h, |
| kernel_w, |
| dilation_h, |
| dilation_w, |
| pad_t, |
| pad_l, |
| stride_h, |
| stride_w, |
| output_h, |
| output_w, |
| col_data, |
| img_data); |
| C10_CUDA_KERNEL_LAUNCH_CHECK(); |
| } |
| |
| template <> |
| CAFFE2_CUDA_EXPORT void Im2ColNd<float, CUDAContext, StorageOrder::NCHW>( |
| const int N, |
| const int img_size, |
| const int col_size, |
| const int* img_shape, |
| const int* col_shape, |
| const int* kernel_shape, |
| const int* stride, |
| const int* dilation, |
| const int* pad, |
| const float* img_data, |
| float* col_data, |
| CUDAContext* context, |
| const int /* groups */) { |
| // In NCHW, the number of groups doesn't affect Im2Col. |
| DISPATCH_FUNCTION_BY_VALUE_WITH_TYPE_1( |
| N, |
| Im2ColNdNCHWCUDAImpl, |
| float, |
| img_size, |
| col_size, |
| img_shape, |
| col_shape, |
| kernel_shape, |
| stride, |
| dilation, |
| pad, |
| img_data, |
| col_data, |
| context); |
| } |
| |
| template <> |
| CAFFE2_CUDA_EXPORT void Im2ColNd<float, CUDAContext, StorageOrder::NHWC>( |
| const int N, |
| const int img_size, |
| const int col_size, |
| const int* img_shape, |
| const int* col_shape, |
| const int* kernel_shape, |
| const int* stride, |
| const int* dilation, |
| const int* pad, |
| const float* img_data, |
| float* col_data, |
| CUDAContext* context, |
| const int groups) { |
| CAFFE_NOT_IMPLEMENTED; |
| } |
| |
| template <> |
| CAFFE2_CUDA_EXPORT void Col2ImNd<float, CUDAContext, StorageOrder::NCHW>( |
| const int N, |
| const int img_size, |
| const int col_size, |
| const int* img_shape, |
| const int* col_shape, |
| const int* kernel_shape, |
| const int* stride, |
| const int* dilation, |
| const int* pad, |
| const float* col_data, |
| float* img_data, |
| CUDAContext* context, |
| int /* groups */) { |
| // In NCHW, the number of groups doesn't affect Col2Im. |
| DISPATCH_FUNCTION_BY_VALUE_WITH_TYPE_1( |
| N, |
| Col2ImNdNCHWCUDAImpl, |
| float, |
| img_size, |
| col_size, |
| img_shape, |
| col_shape, |
| kernel_shape, |
| stride, |
| dilation, |
| pad, |
| col_data, |
| img_data, |
| context); |
| } |
| |
| template <> |
| CAFFE2_CUDA_EXPORT void Col2ImNd<float, CUDAContext, StorageOrder::NHWC>( |
| const int N, |
| const int img_size, |
| const int col_size, |
| const int* img_shape, |
| const int* col_shape, |
| const int* kernel_shape, |
| const int* stride, |
| const int* dilation, |
| const int* pad, |
| const float* col_data, |
| float* img_data, |
| CUDAContext* context, |
| int groups) { |
| CAFFE_NOT_IMPLEMENTED; |
| } |
| |
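// CopyMatrix: strided 2D copy of M rows of N items (itemsize bytes each) via
// cudaMemcpy2DAsync; lda and ldb are the source and destination leading
// dimensions in elements. Only data that needs no per-element copy function
// is supported, hence the check that `copy` is null.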
| template <> |
| CAFFE2_CUDA_EXPORT void CopyMatrix<CUDAContext>( |
| const size_t itemsize, |
| const int M, |
| const int N, |
| const void* A, |
| const int lda, |
| void* B, |
| const int ldb, |
| CUDAContext* context, |
| TypeMeta::Copy copy) { |
| CAFFE_ENFORCE(!copy, "Copy constructor is not supported in CUDA context"); |
| cudaMemcpy2DAsync( |
| B, |
| ldb * itemsize, |
| A, |
| lda * itemsize, |
| N * itemsize, |
| M, |
| cudaMemcpyDeviceToDevice, |
| context->cuda_stream()); |
| } |
| |
| #define CAFFE2_SPECIALIZED_CUDA_COPY_MATRIX(T) \ |
| template <> \ |
| void CopyMatrix<T, CUDAContext>( \ |
| const int M, \ |
| const int N, \ |
| const T* A, \ |
| const int lda, \ |
| T* B, \ |
| const int ldb, \ |
| CUDAContext* context) { \ |
| if (M == 0 || N == 0) { \ |
| return; \ |
| } \ |
| cudaMemcpy2DAsync( \ |
| B, \ |
| sizeof(T) * ldb, \ |
| A, \ |
| sizeof(T) * lda, \ |
| sizeof(T) * N, \ |
| M, \ |
| cudaMemcpyDeviceToDevice, \ |
| context->cuda_stream()); \ |
| } |
| CAFFE2_SPECIALIZED_CUDA_COPY_MATRIX(float) |
| CAFFE2_SPECIALIZED_CUDA_COPY_MATRIX(double) |
| CAFFE2_SPECIALIZED_CUDA_COPY_MATRIX(int) |
| CAFFE2_SPECIALIZED_CUDA_COPY_MATRIX(int64_t) |
| #undef CAFFE2_SPECIALIZED_CUDA_COPY_MATRIX |
| |
| template <> |
| CAFFE2_CUDA_EXPORT void CopyVector<float, CUDAContext>( |
| const int N, |
| const float* src, |
| float* dst, |
| CUDAContext* context) { |
| if (src != dst && N > 0) { |
| cudaMemcpyAsync( |
| dst, |
| src, |
| sizeof(float) * N, |
| cudaMemcpyDeviceToDevice, |
| context->cuda_stream()); |
| } |
| } |
| |
| template <> |
| CAFFE2_CUDA_EXPORT void CopyVector<int, CUDAContext>( |
| const int N, |
| const int* src, |
| int* dst, |
| CUDAContext* context) { |
| if (src != dst && N > 0) { |
| cudaMemcpyAsync( |
| dst, |
| src, |
| sizeof(int) * N, |
| cudaMemcpyDeviceToDevice, |
| context->cuda_stream()); |
| } |
| } |
| |
| namespace { |
| |
| template <typename T> |
| using BlockReduce = cub::BlockReduce<T, CAFFE_CUDA_NUM_THREADS>; |
| |
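// RowwiseReduceKernel / ColwiseReduceKernel: one block per row (or column).
// Each thread accumulates a strided slice with `reducer`, the partial results
// are combined with cub::BlockReduce, and thread 0 writes the result scaled
// by alpha.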
| template <typename T, class Reducer> |
| __global__ void RowwiseReduceKernel( |
| const int rows, |
| const int cols, |
| const Reducer reducer, |
| const T init, |
| const T alpha, |
| const T* X, |
| T* Y) { |
| __shared__ typename BlockReduce<T>::TempStorage temp_storage; |
| for (int i = blockIdx.x; i < rows; i += gridDim.x) { |
| T val = init; |
| for (int j = threadIdx.x; j < cols; j += blockDim.x) { |
| val = reducer(X[i * cols + j], val); |
| } |
| val = BlockReduce<T>(temp_storage).Reduce(val, reducer); |
| if (threadIdx.x == 0) { |
| Y[i] = val * alpha; |
| } |
| __syncthreads(); |
| } |
| } |
| |
| template <typename T, class Reducer> |
| __global__ void ColwiseReduceKernel( |
| const int rows, |
| const int cols, |
| const Reducer reducer, |
| const T init, |
| const T alpha, |
| const T* X, |
| T* Y) { |
| __shared__ typename BlockReduce<T>::TempStorage temp_storage; |
| for (int i = blockIdx.x; i < cols; i += gridDim.x) { |
| T val = init; |
| for (int j = threadIdx.x; j < rows; j += blockDim.x) { |
| val = reducer(X[j * cols + i], val); |
| } |
| val = BlockReduce<T>(temp_storage).Reduce(val, reducer); |
| if (threadIdx.x == 0) { |
| Y[i] = val * alpha; |
| } |
| __syncthreads(); |
| } |
| } |
| |
| } // namespace |
| |
| #define CAFFE2_SPECIALIZED_CUDA_ROWWISE_MAX(T) \ |
| template <> \ |
| CAFFE2_CUDA_EXPORT void RowwiseMax<T, CUDAContext>( \ |
| const int N, const int D, const T* x, T* y, CUDAContext* context) { \ |
| RowwiseReduceKernel<<< \ |
| std::min(N, CAFFE_MAXIMUM_NUM_BLOCKS), \ |
| CAFFE_CUDA_NUM_THREADS, \ |
| 0, \ |
| context->cuda_stream()>>>( \ |
| N, D, cub::Max(), std::numeric_limits<T>::lowest(), T(1), x, y); \ |
| C10_CUDA_KERNEL_LAUNCH_CHECK(); \ |
| } |
| CAFFE2_SPECIALIZED_CUDA_ROWWISE_MAX(float) |
| #undef CAFFE2_SPECIALIZED_CUDA_ROWWISE_MAX |
| |
| #define CAFFE2_SPECIALIZED_CUDA_COLWISE_MAX(T) \ |
| template <> \ |
| CAFFE2_CUDA_EXPORT void ColwiseMax<T, CUDAContext>( \ |
| const int N, const int D, const T* x, T* y, CUDAContext* context) { \ |
| ColwiseReduceKernel<<< \ |
| std::min(D, CAFFE_MAXIMUM_NUM_BLOCKS), \ |
| CAFFE_CUDA_NUM_THREADS, \ |
| 0, \ |
| context->cuda_stream()>>>( \ |
| N, D, cub::Max(), std::numeric_limits<T>::lowest(), T(1), x, y); \ |
| C10_CUDA_KERNEL_LAUNCH_CHECK(); \ |
| } |
| CAFFE2_SPECIALIZED_CUDA_COLWISE_MAX(float) |
| #undef CAFFE2_SPECIALIZED_CUDA_COLWISE_MAX |
| |
| namespace { |
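// Elementwise clamp from below: y[i] = max(x[i], alpha).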
| __global__ void |
| maximum_kernel(const int N, const float alpha, const float* x, float* y) { |
| CUDA_1D_KERNEL_LOOP(i, N) { |
| y[i] = fmaxf(x[i], alpha); |
| } |
| } |
| } // namespace |
| |
| template <> |
| CAFFE2_CUDA_EXPORT void Maximum( |
| const int N, |
| const float alpha, |
| const float* x, |
| float* y, |
| CUDAContext* context) { |
| maximum_kernel<<< |
| std::min(N, CAFFE_MAXIMUM_NUM_BLOCKS), |
| CAFFE_CUDA_NUM_THREADS, |
| 0, |
| context->cuda_stream()>>>(N, alpha, x, y); |
| C10_CUDA_KERNEL_LAUNCH_CHECK(); |
| } |
| |
| namespace { |
| |
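// BroadcastCUDAKernel: for every output element, decompose the linear Y index
// into D coordinates using the fixed divisors of Y's dims, map them back to
// an X offset via X_strides (stride 0 marks a broadcast dimension), and write
// alpha * X[X_index].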
| template <typename T, int D> |
| __global__ void BroadcastCUDAKernel( |
| const int Y_size, |
| const SimpleArray<int, D> X_strides, |
| const SimpleArray<FIXED_DIVISOR, D> Y_dims, |
| const T alpha, |
| const T* X, |
| T* Y) { |
| CUDA_1D_KERNEL_LOOP(Y_index, Y_size) { |
| int X_index = 0; |
| int Y_index_val = Y_index; |
| #pragma unroll |
| for (int i = D - 1; i >= 0; --i) { |
| int d; |
| FIXED_DIVISOR_DIV_MOD(Y_dims.data[i], Y_index_val, &Y_index_val, &d); |
| X_index += d * X_strides.data[i]; |
| } |
| #if __CUDA_ARCH__ >= 350 || defined(USE_ROCM) |
| Y[Y_index] = __ldg(X + X_index) * alpha; |
| #else |
| Y[Y_index] = X[X_index] * alpha; |
| #endif |
| } |
| } |
| |
| template <typename T, int D> |
| CAFFE2_CUDA_EXPORT void BroadcastCUDAImpl( |
| const int X_ndim, |
| const int* X_dims, |
| const int* Y_dims, |
| const T alpha, |
| const T* X, |
| T* Y, |
| CUDAContext* context) { |
| SimpleArray<int, D> X_strides_array; |
| SimpleArray<FIXED_DIVISOR, D> Y_dims_array; |
| const int d = D - X_ndim; |
| std::fill(X_strides_array.data, X_strides_array.data + d, 0); |
| int cur_stride = 1; |
| for (int i = D - 1; i >= d; --i) { |
| CAFFE_ENFORCE(X_dims[i - d] == 1 || X_dims[i - d] == Y_dims[i]); |
| X_strides_array.data[i] = X_dims[i - d] == 1 ? 0 : cur_stride; |
| cur_stride *= X_dims[i - d]; |
| } |
| for (int i = 0; i < D; ++i) { |
| if (Y_dims[i] == 0) { |
| return; |
| } |
| Y_dims_array.data[i] = FIXED_DIVISOR(Y_dims[i]); |
| } |
| const int Y_size = |
| std::accumulate(Y_dims, Y_dims + D, 1, std::multiplies<int>()); |
| BroadcastCUDAKernel<T, D> |
| <<<CAFFE_GET_BLOCKS(Y_size), |
| CAFFE_CUDA_NUM_THREADS, |
| 0, |
| context->cuda_stream()>>>( |
| Y_size, X_strides_array, Y_dims_array, alpha, X, Y); |
| C10_CUDA_KERNEL_LAUNCH_CHECK(); |
| } |
| |
| } // namespace |
| |
| #define CAFFE2_SPECIALIZED_CUDA_BROADCAST(T) \ |
| template <> \ |
| CAFFE2_CUDA_EXPORT void Broadcast<T, CUDAContext>( \ |
| const int X_ndim, \ |
| const int* X_dims, \ |
| const int Y_ndim, \ |
| const int* Y_dims, \ |
| const T alpha, \ |
| const T* X, \ |
| T* Y, \ |
| CUDAContext* context, \ |
| bool) { \ |
| CAFFE_ENFORCE_LE(X_ndim, Y_ndim); \ |
| DISPATCH_FUNCTION_BY_VALUE_WITH_TYPE_1( \ |
| Y_ndim, \ |
| BroadcastCUDAImpl, \ |
| T, \ |
| X_ndim, \ |
| X_dims, \ |
| Y_dims, \ |
| alpha, \ |
| X, \ |
| Y, \ |
| context); \ |
| } |
| CAFFE2_SPECIALIZED_CUDA_BROADCAST(std::int32_t) |
| CAFFE2_SPECIALIZED_CUDA_BROADCAST(std::int64_t) |
| CAFFE2_SPECIALIZED_CUDA_BROADCAST(float) |
| CAFFE2_SPECIALIZED_CUDA_BROADCAST(double) |
| #undef CAFFE2_SPECIALIZED_CUDA_BROADCAST |
| |
| namespace { |
| |
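// InvStdCUDAKernel: inv_std[i] = 1 / sqrt(var[i] + epsilon), implemented with
// the type-specific reciprocal square root function (rsqrtf for float).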
| template <typename T> |
| __global__ void |
| InvStdCUDAKernel(const int N, const T epsilon, const T* var, T* inv_std); |
| |
| #define DELEGATE_INV_STD_KERNEL_FUNCTION(T, Func) \ |
| template <> \ |
| __global__ void InvStdCUDAKernel<T>( \ |
| const int N, const T epsilon, const T* var, T* inv_std) { \ |
| CUDA_1D_KERNEL_LOOP(i, N) { \ |
| inv_std[i] = Func(var[i] + epsilon); \ |
| } \ |
| } |
| DELEGATE_INV_STD_KERNEL_FUNCTION(float, rsqrtf) |
| #undef DELEGATE_INV_STD_KERNEL_FUNCTION |
| |
| } // namespace |
| |
| #define CAFFE2_SPECIALIZED_CUDA_INV_STD(T) \ |
| template <> \ |
| CAFFE2_CUDA_EXPORT void InvStd<T, CUDAContext>( \ |
| const int N, \ |
| const T epsilon, \ |
| const T* var, \ |
| T* inv_std, \ |
| CUDAContext* context) { \ |
| InvStdCUDAKernel<T> \ |
| <<<CAFFE_GET_BLOCKS(N), \ |
| CAFFE_CUDA_NUM_THREADS, \ |
| 0, \ |
| context->cuda_stream()>>>(N, epsilon, var, inv_std); \ |
| C10_CUDA_KERNEL_LAUNCH_CHECK(); \ |
| } |
| CAFFE2_SPECIALIZED_CUDA_INV_STD(float) |
| #undef CAFFE2_SPECIALIZED_CUDA_INV_STD |
| |
| } // namespace math |
| } // namespace caffe2 |