blob: f4aeeb726d72d481b51ba4c689b25cc300804057 [file] [log] [blame]
#include "caffe2/core/context_gpu.h"
#include "caffe2/operators/integral_image_op.h"
namespace caffe2 {
namespace {
// Row pass of the forward integral image: for each output row, compute a
// running sum of the corresponding input row. Input `in` is (N, C, H, W);
// output `out` is (N, C, H + 1, W + 1) whose first row and first column are
// zero-filled, i.e. the input is shifted (-1, -1) relative to the output.
// One thread processes one full output row, so `count` = N * C * (H + 1).
__global__ void RowPassKernel(
    int count,
    int rows_out,
    int cols_out,
    int chans,
    const float* in,
    float* out) {
  CUDA_1D_KERNEL_LOOP(i, count) {
    // Decompose the flat index into (batch, channel, row).
    int row = i % rows_out;
    int chan = (i / rows_out) % chans;
    int ind = i / rows_out / chans;
    // Input is (H, W) and output is (H + 1, W + 1)
    int rows_in = rows_out - 1;
    int cols_in = cols_out - 1;
    // Row pointer to input data.
    // Input data is shifted (-1, -1) relative to output data, hence row - 1.
    // For row == 0 this points before the image start, but the row == 0
    // branch below never dereferences it.
    const float* row_in_data =
        in + cols_in * ((row - 1) + rows_in * (chan + ind * chans));
    // Row pointer to output data
    float* row_out_data =
        out + cols_out * (row + rows_out * (chan + ind * chans));
    // The first row and first column of the output is all zeros
    // (use float literals to avoid double arithmetic in device code).
    row_out_data[0] = 0.f;
    if (row == 0) {
      for (int col = 1; col < cols_out; ++col) {
        row_out_data[col] = 0.f;
      }
    } else {
      // Running sum across the row. Recall that input data is shifted
      // (-1, -1) relative to the output, hence col - 1.
      for (int col = 1; col < cols_out; ++col) {
        row_out_data[col] = row_out_data[col - 1] + row_in_data[col - 1];
      }
    }
  }
}
// Row pass of the gradient: for each row of `in`, write a running sum of
// its leading elements into `out` (out[c] = sum of in[0..c]). Input `in`
// is (N, C, H + 1, W + 1); output `out` is (N, C, H + 1, W). One thread
// processes one full row, so `count` = N * C * (H + 1).
__global__ void RowPassGradientKernel(
    int count,
    int rows_out,
    int cols_out,
    int chans,
    const float* in,
    float* out) {
  CUDA_1D_KERNEL_LOOP(i, count) {
    // Decompose the flat index into (batch, channel, row).
    int row = i % rows_out;
    int chan = (i / rows_out) % chans;
    int ind = i / rows_out / chans;
    // Input is (H + 1, W + 1) and output is (H + 1, W)
    int rows_in = rows_out;
    int cols_in = cols_out + 1;
    // Row pointer to input data
    const float* row_in_data =
        in + cols_in * (row + rows_in * (chan + ind * chans));
    // Row pointer to output data
    float* row_out_data =
        out + cols_out * (row + rows_out * (chan + ind * chans));
    // Running sum across the row; note the output row is one element
    // narrower than the input row.
    row_out_data[0] = row_in_data[0];
    for (int col = 1; col < cols_out; ++col) {
      row_out_data[col] = row_out_data[col - 1] + row_in_data[col];
    }
  }
}
// Column pass of the forward integral image: completes the integral image
// by accumulating a running sum down each column of `out` in place (run
// after RowPassKernel has filled `out` with per-row running sums). `out`
// is (N, C, rows_out, cols_out); one thread processes one full column, so
// `count` = N * C * cols_out.
__global__ void
ColPassKernel(int count, int rows_out, int cols_out, int chans, float* out) {
  CUDA_1D_KERNEL_LOOP(i, count) {
    // Decompose the flat index into (batch, channel, column).
    int col = i % cols_out;
    int chan = (i / cols_out) % chans;
    int ind = i / cols_out / chans;
    // Pointer to the top element of this column.
    float* col_out_data =
        out + col + cols_out * rows_out * (chan + ind * chans);
    // Accumulate down the column; consecutive column elements are
    // cols_out apart in memory.
    for (int row = 1; row < rows_out; ++row) {
      col_out_data[row * cols_out] += col_out_data[(row - 1) * cols_out];
    }
  }
}
// Column pass of the gradient: for each column, write a running sum of the
// first `rows_out` input elements into `out` (out[r] = sum of in[0..r]).
// Input `in` is (H + 1, W) per channel and output `out` is (H, W), so the
// input's last row is not read. One thread processes one full column, so
// `count` = N * C * W.
__global__ void ColPassGradientKernel(
    int count,
    int rows_out,
    int cols_out,
    int chans,
    const float* in,
    float* out) {
  CUDA_1D_KERNEL_LOOP(i, count) {
    // Decompose the flat index into (batch, channel, column).
    int col = i % cols_out;
    int chan = (i / cols_out) % chans;
    int ind = i / cols_out / chans;
    // Input is (H + 1, W) and output is (H, W)
    int rows_in = rows_out + 1;
    int cols_in = cols_out;
    // Col pointer to input data
    const float* col_in_data =
        in + col + cols_in * rows_in * (chan + ind * chans);
    // Col pointer to output data
    float* col_out_data =
        out + col + cols_out * rows_out * (chan + ind * chans);
    // Running sum down the column; consecutive column elements are one
    // full row apart in memory.
    col_out_data[0] = col_in_data[0];
    for (int row = 1; row < rows_out; ++row) {
      col_out_data[row * cols_out] =
          col_out_data[(row - 1) * cols_out] + col_in_data[row * cols_in];
    }
  }
}
} // namespace
// Forward pass: computes the integral image of X in two kernel launches —
// a running sum over rows into Y, then a running sum over Y's columns in
// place. Input X is (N, C, H, W); output Y is (N, C, H + 1, W + 1) with a
// zero first row and column.
template <>
bool IntegralImageOp<float, CUDAContext>::RunOnDevice() {
  auto& X = Input(0);
  CAFFE_ENFORCE(X.dim() == 4, "Only supports 4D tensors for the moment");
  // Input is (N, C, H, W)
  // Output is (N, C, H + 1, W + 1)
  vector<int64_t> out_shape(X.sizes().vec());
  out_shape[2] += 1; // H + 1 output size
  out_shape[3] += 1; // W + 1 output size
  auto* Y = Output(0, out_shape, at::dtype<float>());
  const int chans = X.dim32(1);
  const int rows_out = Y->dim32(2);
  const int cols_out = Y->dim32(3);
  // Integral image over rows of input X: one thread per output row.
  const int row_pass_size = X.dim32(0) * chans * rows_out;
  RowPassKernel<<<
      CAFFE_GET_BLOCKS(row_pass_size),
      CAFFE_CUDA_NUM_THREADS,
      0,
      context_.cuda_stream()>>>(
      row_pass_size,
      rows_out,
      cols_out,
      chans,
      X.data<float>(),
      Y->template mutable_data<float>());
  C10_CUDA_KERNEL_LAUNCH_CHECK();
  // Integral image over columns of the integral image over rows, computed
  // in place in Y: one thread per output column.
  const int col_pass_size = X.dim32(0) * chans * cols_out;
  ColPassKernel<<<
      CAFFE_GET_BLOCKS(col_pass_size),
      CAFFE_CUDA_NUM_THREADS,
      0,
      context_.cuda_stream()>>>(
      col_pass_size,
      rows_out,
      cols_out,
      chans,
      Y->template mutable_data<float>());
  C10_CUDA_KERNEL_LAUNCH_CHECK();
  return true;
}
// Backward pass: propagates dY (the gradient w.r.t. the (N, C, H+1, W+1)
// integral image) back to dX (gradient w.r.t. the (N, C, H, W) input) in
// two kernel launches, staging the intermediate in row_pass_buffer_.
template <>
bool IntegralImageGradientOp<float, CUDAContext>::RunOnDevice() {
  auto& X = Input(0); // Original input to "forward" op
  auto& dY = Input(1); // Gradient of net w.r.t. output of "forward" op
  // (aka "gradOutput")
  auto* dX = Output(
      0, X.sizes(), at::dtype<float>()); // Gradient of net w.r.t. input to
                                         // "forward" op (aka "gradInput")
  // Row pass reduces shape of dY from (N, C, H + 1, W + 1)
  // to (N, C, H + 1, W)
  // Col pass reduces shape to (N, C, H, W)
  vector<int64_t> row_pass_shape(dY.sizes().vec());
  row_pass_shape[3] -= 1;
  ReinitializeTensor(&row_pass_buffer_, row_pass_shape, at::dtype<float>().device(CUDA));
  // Note: rows_out/cols_out below are the *buffer's* dims (H + 1 and W),
  // not dX's dims.
  const int chans = row_pass_buffer_.dim32(1);
  const int rows_out = row_pass_buffer_.dim32(2);
  const int cols_out = row_pass_buffer_.dim32(3);
  // Row pass over dY: one thread per buffer row, N * C * (H + 1) total.
  const int row_pass_size = X.dim32(0) * chans * rows_out;
  RowPassGradientKernel<<<
      CAFFE_GET_BLOCKS(row_pass_size),
      CAFFE_CUDA_NUM_THREADS,
      0,
      context_.cuda_stream()>>>(
      row_pass_size,
      rows_out,
      cols_out,
      chans,
      dY.data<float>(),
      row_pass_buffer_.mutable_data<float>());
  C10_CUDA_KERNEL_LAUNCH_CHECK();
  // Col pass over the row-pass buffer into dX: one thread per column,
  // N * C * W total.
  const int col_pass_size = X.dim32(0) * chans * cols_out;
  ColPassGradientKernel<<<
      CAFFE_GET_BLOCKS(col_pass_size),
      CAFFE_CUDA_NUM_THREADS,
      0,
      context_.cuda_stream()>>>(
      col_pass_size,
      // The kernel's rows_out is dX's row count H (buffer rows minus one).
      rows_out - 1,
      cols_out,
      chans,
      row_pass_buffer_.data<float>(),
      dX->template mutable_data<float>());
  C10_CUDA_KERNEL_LAUNCH_CHECK();
  return true;
}
// Register the CUDA implementations of the forward and gradient operators.
REGISTER_CUDA_OPERATOR(IntegralImage, IntegralImageOp<float, CUDAContext>);
REGISTER_CUDA_OPERATOR(
    IntegralImageGradient,
    IntegralImageGradientOp<float, CUDAContext>);
} // namespace caffe2