| #include "caffe2/core/context_gpu.h" |
| #include "caffe2/operators/integral_image_op.h" |
| |
| namespace caffe2 { |
| |
| namespace { |
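// Forward pass. For an input X of shape (N, C, H, W), the integral image Y
// has shape (N, C, H + 1, W + 1), with a zero first row and first column:
//
//   Y(n, c, i, j) = sum over r < i, k < j of X(n, c, r, k)
//
// For example, a single 2 x 2 image [[a, b], [c, d]] yields the 3 x 3 output
// [[0, 0, 0], [0, a, a + b], [0, a + c, a + b + c + d]].
//
// The computation is split into two passes: RowPassKernel accumulates a
// running sum along each output row, then ColPassKernel accumulates the row
// sums down each output column. Each thread handles one full row (or column)
// of one channel of one image.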
| __global__ void RowPassKernel( |
| int count, |
| int rows_out, |
| int cols_out, |
| int chans, |
| const float* in, |
| float* out) { |
| CUDA_1D_KERNEL_LOOP(i, count) { |
| // Figure out which row, channel, and batch element we're processing |
| int row = i % rows_out; |
| int chan = (i / rows_out) % chans; |
| int ind = i / rows_out / chans; |
| // Input is (H, W) and output is (H + 1, W + 1) |
| int rows_in = rows_out - 1; |
| int cols_in = cols_out - 1; |
| // Row pointer to input data |
    // Input data is shifted (-1, -1) relative to output data, hence row - 1
| const float* row_in_data = |
| in + cols_in * ((row - 1) + rows_in * (chan + ind * chans)); |
| // Row pointer to output data |
| float* row_out_data = |
| out + cols_out * (row + rows_out * (chan + ind * chans)); |
| // The first row and first column of the output is all zeros |
| row_out_data[0] = 0.; |
| if (row == 0) { |
| for (int i = 1; i < cols_out; ++i) { |
| row_out_data[i] = 0.; |
| } |
| } else { |
| for (int i = 1; i < cols_out; ++i) { |
        // Recall that input data is shifted (-1, -1) relative to the output,
        // hence i - 1
| row_out_data[i] = row_out_data[i - 1] + row_in_data[i - 1]; |
| } |
| } |
| } |
| } |
| |
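// Backward pass. Each input element X(n, c, r, k) contributes to every output
// element Y(n, c, i, j) with i > r and j > k, so the gradient is a suffix sum
// of dY:
//
//   dX(n, c, r, k) = sum over i > r, j > k of dY(n, c, i, j)
//
// RowPassGradientKernel and ColPassGradientKernel implement this as two
// reverse cumulative sums, first along rows and then down columns, mirroring
// the forward decomposition.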
| __global__ void RowPassGradientKernel( |
| int count, |
| int rows_out, |
| int cols_out, |
| int chans, |
| const float* in, |
| float* out) { |
| CUDA_1D_KERNEL_LOOP(i, count) { |
| // Figure out which row, channel, and batch element we're processing |
| int row = i % rows_out; |
| int chan = (i / rows_out) % chans; |
| int ind = i / rows_out / chans; |
    // Input is (H + 1, W + 1) and output is (H + 1, W)
| int rows_in = rows_out; |
| int cols_in = cols_out + 1; |
    // Row pointer to input data
| const float* row_in_data = |
| in + cols_in * (row + rows_in * (chan + ind * chans)); |
    // Row pointer to output data
| float* row_out_data = |
| out + cols_out * (row + rows_out * (chan + ind * chans)); |
    // The gradient is a reverse (suffix) cumulative sum over the columns of
    // dY: column 0 of dY never contributes to any input element.
    row_out_data[cols_out - 1] = row_in_data[cols_out];
    for (int i = cols_out - 2; i >= 0; --i) {
      row_out_data[i] = row_out_data[i + 1] + row_in_data[i + 1];
    }
| } |
| } |
| |
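// Second forward pass: accumulate the row-pass results down each column, in
// place on the output tensor. Each thread walks one column of one channel of
// one image.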
| __global__ void |
| ColPassKernel(int count, int rows_out, int cols_out, int chans, float* out) { |
| CUDA_1D_KERNEL_LOOP(i, count) { |
| // Figure out which col, channel, and batch element we're processing |
| int col = i % cols_out; |
| int chan = (i / cols_out) % chans; |
| int ind = i / cols_out / chans; |
| float* col_out_data = |
| out + col + cols_out * rows_out * (chan + ind * chans); |
| for (int i = 1; i < rows_out; ++i) { |
| col_out_data[i * cols_out] += col_out_data[(i - 1) * cols_out]; |
| } |
| } |
| } |
| |
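// Second backward pass: reverse cumulative sum of the (H + 1, W) row-pass
// buffer down each column, producing dX of shape (H, W).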
| __global__ void ColPassGradientKernel( |
| int count, |
| int rows_out, |
| int cols_out, |
| int chans, |
| const float* in, |
| float* out) { |
| CUDA_1D_KERNEL_LOOP(i, count) { |
| // Figure out which col, channel, and batch element we're processing |
| int col = i % cols_out; |
| int chan = (i / cols_out) % chans; |
| int ind = i / cols_out / chans; |
| // Input is (H + 1, W) and output is (H, W) |
| int rows_in = rows_out + 1; |
| int cols_in = cols_out; |
| // Col pointer to input data |
| const float* col_in_data = |
| in + col + cols_in * rows_in * (chan + ind * chans); |
| // Col pointer to output data |
| float* col_out_data = |
| out + col + cols_out * rows_out * (chan + ind * chans); |
    // Reverse (suffix) cumulative sum down each column of the row-pass
    // buffer: row 0 of the buffer never contributes to any input element.
    col_out_data[(rows_out - 1) * cols_out] = col_in_data[rows_out * cols_in];
    for (int i = rows_out - 2; i >= 0; --i) {
      col_out_data[i * cols_out] =
          col_out_data[(i + 1) * cols_out] + col_in_data[(i + 1) * cols_in];
    }
| } |
| } |
| |
| } // namespace |
| |
| template <> |
| bool IntegralImageOp<float, CUDAContext>::RunOnDevice() { |
| auto& X = Input(0); |
| |
  CAFFE_ENFORCE(X.dim() == 4, "Only supports 4D tensors for the moment");
| |
| // Input is (N, C, H, W) |
| // Output is (N, C, H + 1, W + 1) |
| vector<int64_t> out_shape(X.sizes().vec()); |
| out_shape[2] += 1; // H + 1 output size |
| out_shape[3] += 1; // W + 1 output size |
| auto* Y = Output(0, out_shape, at::dtype<float>()); |
| |
| const int chans = X.dim32(1); |
| const int rows_out = Y->dim32(2); |
| const int cols_out = Y->dim32(3); |
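  // Launch configuration: RowPassKernel uses one thread per
  // (image, channel, output row), i.e. N * C * (H + 1) threads in total;
  // ColPassKernel uses one thread per (image, channel, output column),
  // i.e. N * C * (W + 1) threads.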
| // Integral image over rows of input X |
| const int row_pass_size = X.dim32(0) * chans * rows_out; |
| RowPassKernel<<< |
| CAFFE_GET_BLOCKS(row_pass_size), |
| CAFFE_CUDA_NUM_THREADS, |
| 0, |
| context_.cuda_stream()>>>( |
| row_pass_size, |
| rows_out, |
| cols_out, |
| chans, |
| X.data<float>(), |
| Y->template mutable_data<float>()); |
| C10_CUDA_KERNEL_LAUNCH_CHECK(); |
| |
| // Integral image over columns of the integral image over rows |
| const int col_pass_size = X.dim32(0) * chans * cols_out; |
| ColPassKernel<<< |
| CAFFE_GET_BLOCKS(col_pass_size), |
| CAFFE_CUDA_NUM_THREADS, |
| 0, |
| context_.cuda_stream()>>>( |
| col_pass_size, |
| rows_out, |
| cols_out, |
| chans, |
| Y->template mutable_data<float>()); |
| C10_CUDA_KERNEL_LAUNCH_CHECK(); |
| |
| return true; |
| } |
| |
| template <> |
| bool IntegralImageGradientOp<float, CUDAContext>::RunOnDevice() { |
| auto& X = Input(0); // Original input to "forward" op |
| auto& dY = Input(1); // Gradient of net w.r.t. output of "forward" op |
| // (aka "gradOutput") |
| |
| auto* dX = Output( |
| 0, X.sizes(), at::dtype<float>()); // Gradient of net w.r.t. input to |
| // "forward" op (aka "gradInput") |
| |
| // Row pass reduces shape of dY from (N, C, H + 1, W + 1) |
| // to (N, C, H + 1, W) |
| // Col pass reduces shape to (N, C, H, W) |
| vector<int64_t> row_pass_shape(dY.sizes().vec()); |
| row_pass_shape[3] -= 1; |
  ReinitializeTensor(
      &row_pass_buffer_, row_pass_shape, at::dtype<float>().device(CUDA));
| const int chans = row_pass_buffer_.dim32(1); |
| const int rows_out = row_pass_buffer_.dim32(2); |
| const int cols_out = row_pass_buffer_.dim32(3); |
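  // Thread mapping mirrors the forward op: one thread per
  // (image, channel, row) of the (H + 1, W) row-pass buffer, then one thread
  // per (image, channel, column) of dX. The col pass below is launched with
  // rows_out - 1 = H because dX has only H rows.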
  // Row pass of the gradient: reverse cumulative sum over the columns of dY
| const int row_pass_size = X.dim32(0) * chans * rows_out; |
| RowPassGradientKernel<<< |
| CAFFE_GET_BLOCKS(row_pass_size), |
| CAFFE_CUDA_NUM_THREADS, |
| 0, |
| context_.cuda_stream()>>>( |
| row_pass_size, |
| rows_out, |
| cols_out, |
| chans, |
| dY.data<float>(), |
| row_pass_buffer_.mutable_data<float>()); |
| C10_CUDA_KERNEL_LAUNCH_CHECK(); |
| |
  // Col pass of the gradient: reverse cumulative sum over the rows of the
  // row-pass buffer
| const int col_pass_size = X.dim32(0) * chans * cols_out; |
| ColPassGradientKernel<<< |
| CAFFE_GET_BLOCKS(col_pass_size), |
| CAFFE_CUDA_NUM_THREADS, |
| 0, |
| context_.cuda_stream()>>>( |
| col_pass_size, |
| rows_out - 1, |
| cols_out, |
| chans, |
| row_pass_buffer_.data<float>(), |
| dX->template mutable_data<float>()); |
| C10_CUDA_KERNEL_LAUNCH_CHECK(); |
| |
| return true; |
| } |
| |
| REGISTER_CUDA_OPERATOR(IntegralImage, IntegralImageOp<float, CUDAContext>); |
| REGISTER_CUDA_OPERATOR( |
| IntegralImageGradient, |
| IntegralImageGradientOp<float, CUDAContext>); |
| |
| } // namespace caffe2 |