| #include "caffe2/core/context_gpu.h" |
| #include "caffe2/operators/prelu_op.h" |
| |
| #include "caffe2/utils/cub_namespace.cuh" |
| #include <cub/block/block_reduce.cuh> |
| |
| namespace caffe2 { |
| namespace { |
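// Element-wise forward pass with a single shared slope: Y = X where X > 0 and
// Y = W[0] * X where X < 0 (both terms vanish at X == 0, so Y = 0 there).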
| template <typename T> |
| __global__ void PReluKernel(const int N, const T* X, const T* W, T* Y) { |
| CUDA_1D_KERNEL_LOOP(i, N) { |
| Y[i] = (X[i] > 0) * X[i] + (X[i] < 0) * X[i] * W[0]; |
| } |
| } |
| |
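// Forward pass with one slope per channel, NCHW layout. The channel index is
// recovered from the flat offset as (i / dim) % C, where dim is the spatial
// size of one channel plane.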
| template <typename T> |
| __global__ void PReluKernelNCHW( |
| const int N, |
| const int C, |
| const int dim, |
| const T* X, |
| const T* W, |
| T* Y) { |
| CUDA_1D_KERNEL_LOOP(i, N * C * dim) { |
| int c = (i / dim) % C; |
| Y[i] = (X[i] > 0) * X[i] + (X[i] < 0) * X[i] * W[c]; |
| } |
| } |
| |
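// Forward pass with one slope per channel, NHWC layout. Channels are the
// fastest-moving dimension, so the channel index is simply i % C.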
| template <typename T> |
| __global__ void |
| PReluKernelNHWC(const int nitems, const int C, const T* X, const T* W, T* Y) { |
| CUDA_1D_KERNEL_LOOP(i, nitems) { |
| int c = i % C; |
| Y[i] = (X[i] > 0) * X[i] + (X[i] < 0) * X[i] * W[c]; |
| } |
| } |
| |
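// Input gradient for the shared-slope case: dX = dY where X > 0 and
// dX = W[0] * dY where X <= 0 (the subgradient at X == 0 takes the
// negative-side slope).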
| template <typename T> |
| __global__ void |
| PReluGradientKernel(const int N, const T* X, const T* W, const T* dY, T* dX) { |
| CUDA_1D_KERNEL_LOOP(i, N) { |
| dX[i] = (X[i] > 0) * dY[i] + (X[i] <= 0) * dY[i] * W[0]; |
| } |
| } |
| |
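// Input gradient with per-channel slopes, NCHW layout.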
| template <typename T> |
| __global__ void PReluGradientKernelNCHW( |
| const int N, |
| const int C, |
| const int dim, |
| const T* X, |
| const T* W, |
| const T* dY, |
| T* dX) { |
| CUDA_1D_KERNEL_LOOP(i, N * C * dim) { |
| int c = (i / dim) % C; |
| dX[i] = (X[i] > 0) * dY[i] + (X[i] <= 0) * dY[i] * W[c]; |
| } |
| } |
| |
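// Input gradient with per-channel slopes, NHWC layout.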
| template <typename T> |
| __global__ void PReluGradientKernelNHWC( |
| const int nitems, |
| const int C, |
| const T* X, |
| const T* W, |
| const T* dY, |
| T* dX) { |
| CUDA_1D_KERNEL_LOOP(i, nitems) { |
| int c = i % C; |
| dX[i] = (X[i] > 0) * dY[i] + (X[i] <= 0) * dY[i] * W[c]; |
| } |
| } |
| |
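// Slope gradient for the shared-slope case: dW = sum of dY * X over all
// elements with X <= 0. A single thread block strides over the whole tensor
// and combines the partial sums with cub::BlockReduce; despite the NCHW
// suffix, the reduction is layout-agnostic.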
| template <typename T> |
| __global__ void PReluSharedWGradientKernelNCHW( |
| const int num_items, |
| const T* Xdata, |
| const T* dYdata, |
| T* dW) { |
| T wsum = 0.0; |
| for (int i = threadIdx.x; i < num_items; i += blockDim.x) { |
| wsum += (Xdata[i] <= 0) * dYdata[i] * Xdata[i]; |
| } |
| |
| typedef cub::BlockReduce<T, CAFFE_CUDA_NUM_THREADS> BlockReduce; |
| __shared__ typename BlockReduce::TempStorage temp_storage; |
| T sum = BlockReduce(temp_storage).Sum(wsum); |
| if (threadIdx.x == 0) { |
| *dW = sum; |
| } |
| } |
| |
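// Per-channel slope gradient, NCHW layout. Each thread block handles one
// channel (c = blockIdx.x); its threads stride over that channel's items
// across all N samples and combine partial sums with cub::BlockReduce.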
| template <typename T> |
| __global__ void PReluWGradientKernelNCHW( |
| const int C, |
| const int N, |
| const int num_items, |
| const T* Xdata, |
| const T* dYdata, |
| T* dW) { |
| int c = blockIdx.x; |
| |
| T wsum = 0.0; |
| int items_per_channel = num_items / C; |
| int items_per_sample_channel = items_per_channel / N; |
| for (int i = threadIdx.x; i < items_per_channel; i += blockDim.x) { |
    // TODO: simplify
    // Recover the flat NCHW offset of the i-th item of channel c: sample n,
    // then channel c within that sample, then the position inside the plane.
| int n = i / items_per_sample_channel; |
| int ii = n * items_per_sample_channel * C + c * items_per_sample_channel + |
| i % items_per_sample_channel; |
| wsum += (Xdata[ii] <= 0) * dYdata[ii] * Xdata[ii]; |
| } |
| |
| typedef cub::BlockReduce<T, CAFFE_CUDA_NUM_THREADS> BlockReduce; |
| __shared__ typename BlockReduce::TempStorage temp_storage; |
| T sum = BlockReduce(temp_storage).Sum(wsum); |
| if (threadIdx.x == 0) { |
| dW[c] = sum; |
| } |
| } |
| |
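// Per-channel slope gradient, NHWC layout. One thread block per channel; the
// items of channel c live at stride C in memory (offsets i * C + c).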
| template <typename T> |
| __global__ void PReluWGradientKernelNHWC( |
| const int C, |
| const int num_items, |
| const T* Xdata, |
| const T* dYdata, |
| T* dW) { |
| const auto c = blockIdx.x; |
| T wsum = 0.0; |
| const auto items_per_channel = num_items / C; |
| for (int i = threadIdx.x; i < items_per_channel; i += blockDim.x) { |
| const auto ii = i * C + c; |
| wsum += (Xdata[ii] <= 0) * dYdata[ii] * Xdata[ii]; |
| } |
| |
| typedef cub::BlockReduce<T, CAFFE_CUDA_NUM_THREADS> BlockReduce; |
| __shared__ typename BlockReduce::TempStorage temp_storage; |
| T sum = BlockReduce(temp_storage).Sum(wsum); |
| if (threadIdx.x == 0) { |
| dW[c] = sum; |
| } |
| } |
| |
| } // namespace |
| |
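// Forward operator: Y = PReLU(X, W), where W holds either a single shared
// slope or one slope per channel.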
| template <> |
| bool PReluOp<float, CUDAContext>::RunOnDevice() { |
| const auto& X = Input(0); |
| const auto& W = Input(1); |
| |
| auto* Y = Output(0, X.sizes(), at::dtype<float>()); |
| const auto* Xdata = X.data<float>(); |
| const auto* Wdata = W.data<float>(); |
| auto* Ydata = Y->template mutable_data<float>(); |
| |
| const auto C = order_ == StorageOrder::NCHW ? X.dim(1) : X.dim(X.dim() - 1); |
| const auto C_shared = (W.numel() == 1); |
| |
| if (!C_shared) { |
| CAFFE_ENFORCE_EQ(C, W.numel()); |
| } |
| if (C_shared) { |
| PReluKernel<<< |
| CAFFE_GET_BLOCKS(X.numel()), |
| CAFFE_CUDA_NUM_THREADS, |
| 0, |
| context_.cuda_stream()>>>(X.numel(), Xdata, Wdata, Ydata); |
| C10_CUDA_KERNEL_LAUNCH_CHECK(); |
| |
| return true; |
| } |
| // non-shared case. |
| switch (order_) { |
| case StorageOrder::NCHW: { |
| const auto N = X.dim(0); |
| const auto dim = X.size_from_dim(2); |
      CAFFE_ENFORCE_EQ(N * C * dim, X.numel());
| PReluKernelNCHW<<< |
| CAFFE_GET_BLOCKS(X.numel()), |
| CAFFE_CUDA_NUM_THREADS, |
| 0, |
| context_.cuda_stream()>>>(N, C, dim, Xdata, Wdata, Ydata); |
| C10_CUDA_KERNEL_LAUNCH_CHECK(); |
| |
| break; |
| } |
| case StorageOrder::NHWC: { |
| PReluKernelNHWC<<< |
| CAFFE_GET_BLOCKS(X.numel()), |
| CAFFE_CUDA_NUM_THREADS, |
| 0, |
| context_.cuda_stream()>>>(X.numel(), C, Xdata, Wdata, Ydata); |
| C10_CUDA_KERNEL_LAUNCH_CHECK(); |
| |
| break; |
| } |
| default: |
| CAFFE_THROW("Unknown storage order: ", order_); |
| } |
| return true; |
| } |
| |
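// Gradient operator: inputs are (Y, dY, X, W), outputs are (dX, dW).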
| template <> |
| bool PReluGradientOp<float, CUDAContext>::RunOnDevice() { |
| auto& Y = Input(0); |
| auto& dY = Input(1); |
| auto& X = Input(2); |
| auto& W = Input(3); |
| |
| CAFFE_ENFORCE(&Y != &X, "Cannot backpropagate through an in-place PReLU"); |
| |
| TORCH_DCHECK_EQ(dY.numel(), Y.numel()); |
| auto* dX = Output(0, Y.sizes(), at::dtype<float>()); |
| auto* dW = Output(1, W.sizes(), at::dtype<float>()); |
| |
| const auto C = order_ == StorageOrder::NCHW ? X.dim(1) : X.dim(X.dim() - 1); |
| const auto C_shared = (W.numel() == 1); |
| |
| const float* Ydata = Y.data<float>(); |
| const float* dYdata = dY.data<float>(); |
| const float* Xdata = X.data<float>(); |
| const float* Wdata = W.data<float>(); |
| float* dXdata = dX->template mutable_data<float>(); |
| float* dWdata = dW->template mutable_data<float>(); |
| int N = Y.dim(0); |
| |
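  // Shared slope: reduce dW over the whole tensor with a single block, then
  // compute dX element-wise.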
| if (C_shared) { |
| PReluSharedWGradientKernelNCHW<<< |
| 1, |
| CAFFE_CUDA_NUM_THREADS, |
| 0, |
| context_.cuda_stream()>>>(X.numel(), Xdata, dYdata, dWdata); |
| C10_CUDA_KERNEL_LAUNCH_CHECK(); |
| |
| PReluGradientKernel<<< |
| CAFFE_GET_BLOCKS(X.numel()), |
| CAFFE_CUDA_NUM_THREADS, |
| 0, |
| context_.cuda_stream()>>>(X.numel(), Xdata, Wdata, dYdata, dXdata); |
| C10_CUDA_KERNEL_LAUNCH_CHECK(); |
| |
| return true; |
| } |
| // non-shared case. |
| switch (order_) { |
| case StorageOrder::NCHW: { |
| const auto dim = Y.size_from_dim(2); |
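      // One block per channel for the slope gradient, followed by an
      // element-wise launch for the input gradient.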
| PReluWGradientKernelNCHW<<< |
| C, |
| CAFFE_CUDA_NUM_THREADS, |
| 0, |
| context_.cuda_stream()>>>(C, N, X.numel(), Xdata, dYdata, dWdata); |
| C10_CUDA_KERNEL_LAUNCH_CHECK(); |
| |
| PReluGradientKernelNCHW<<< |
| CAFFE_GET_BLOCKS(X.numel()), |
| CAFFE_CUDA_NUM_THREADS, |
| 0, |
| context_.cuda_stream()>>>(N, C, dim, Xdata, Wdata, dYdata, dXdata); |
| C10_CUDA_KERNEL_LAUNCH_CHECK(); |
| |
| break; |
| } |
| case StorageOrder::NHWC: { |
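      // Same structure as NCHW: one block per channel for dW, then an
      // element-wise launch for dX.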
| PReluWGradientKernelNHWC<<< |
| C, |
| CAFFE_CUDA_NUM_THREADS, |
| 0, |
| context_.cuda_stream()>>>(C, X.numel(), Xdata, dYdata, dWdata); |
| C10_CUDA_KERNEL_LAUNCH_CHECK(); |
| |
| PReluGradientKernelNHWC<<< |
| CAFFE_GET_BLOCKS(Y.numel()), |
| CAFFE_CUDA_NUM_THREADS, |
| 0, |
| context_.cuda_stream()>>>(X.numel(), C, Xdata, Wdata, dYdata, dXdata); |
| C10_CUDA_KERNEL_LAUNCH_CHECK(); |
| |
| break; |
| } |
| default: |
| CAFFE_THROW("Unknown storage order: ", order_); |
| } |
| return true; |
| } |
| |
| REGISTER_CUDA_OPERATOR(PRelu, PReluOp<float, CUDAContext>); |
| REGISTER_CUDA_OPERATOR(PReluGradient, PReluGradientOp<float, CUDAContext>); |
| } // namespace caffe2 |