blob: 372638abdd249dc97578cad12fc4cfce5fe0d383 [file] [log] [blame]
#include <cub/cub.cuh>
#include "caffe2/core/context_gpu.h"
#include "caffe2/operators/pack_segments.h"
#include "caffe2/utils/cub_namespace.cuh"
namespace caffe2 {
namespace {
// Packs ragged sequences into a dense [num_seq, max_length, cell_size]
// output, writing `padding` into cells beyond each sequence's length.
// `lengths_cum_sum` is the exclusive prefix sum of `lengths_ptr`, i.e. the
// starting row of each sequence inside `data_ptr`. If `presence_ptr` is
// non-null, it receives a [num_seq, max_length] bool mask marking which
// output cells contain real data (only the offset==0 thread of each cell
// writes it, so the mask is written exactly once per cell).
// Launched 1-D over num_seq * max_length * cell_size elements.
template <typename T, typename Data_T>
__global__ void PackSegmentsKernel(
    const Data_T* data_ptr,
    const T* lengths_ptr,
    const T* lengths_cum_sum,
    const T max_length,
    const int64_t num_seq,
    const int64_t cell_size,
    Data_T padding,
    bool* presence_ptr,
    Data_T* out_ptr) {
  CUDA_1D_KERNEL_LOOP(i, num_seq * max_length * cell_size) {
    int seq = (i / cell_size) / max_length;
    int cell = (i / cell_size) % max_length;
    int offset = i % cell_size;
    if (presence_ptr && offset == 0) {
      presence_ptr[i / cell_size] = cell < lengths_ptr[seq];
    }
    if (cell >= lengths_ptr[seq]) {
      out_ptr[i] = padding;
    } else {
      // Use 64-bit index arithmetic: (cum_sum + cell) * cell_size can exceed
      // INT32_MAX for large tensors; the previous int32_t here truncated.
      int64_t idx =
          (static_cast<int64_t>(lengths_cum_sum[seq]) + cell) * cell_size +
          offset;
      out_ptr[i] = data_ptr[idx];
    }
  }
}
// Inverse of PackSegmentsKernel: scatters a dense
// [num_seq, max_length, cell_size] tensor back to a ragged layout. Cell
// `cell` of sequence `seq` lands at row lengths_cum_sum[seq] + cell of the
// output; cells at positions >= the sequence length are padding and are
// skipped. Launched 1-D over num_seq * max_length * cell_size elements.
template <typename T, typename Data_T>
__global__ void UnpackSegmentsKernel(
    const Data_T* data_ptr,
    const T* lengths_ptr,
    const T* lengths_cum_sum,
    const T max_length,
    const int64_t num_seq,
    const int64_t cell_size,
    Data_T* out_ptr) {
  CUDA_1D_KERNEL_LOOP(i, num_seq * max_length * cell_size) {
    int seq = (i / cell_size) / max_length;
    int cell = (i / cell_size) % max_length;
    int offset = i % cell_size;
    if (cell < lengths_ptr[seq]) {
      // Use 64-bit index arithmetic: (cum_sum + cell) * cell_size can exceed
      // INT32_MAX for large tensors; the previous plain int here truncated.
      int64_t idx =
          (static_cast<int64_t>(lengths_cum_sum[seq]) + cell) * cell_size +
          offset;
      out_ptr[idx] = data_ptr[i];
    }
  }
}
// Sums `num_items` values of type T held in device memory and returns the
// total on the host as int64_t. `dev_buffer` is reusable scratch space;
// `dev_sum` / `host_sum` are 1-element tensors for the device result and its
// host copy. Blocks (FinishDeviceComputation) until the result is readable.
template <typename T>
int64_t int_array_sum(
    const T* dev_array,
    int64_t num_items,
    Tensor& dev_buffer,
    Tensor& dev_sum,
    Tensor& host_sum,
    CUDAContext& context) {
  // Retrieve buffer size: the first cub call with a null storage pointer
  // only writes the required temp-storage size into temp_storage_bytes.
  size_t temp_storage_bytes = 0;
  cub::DeviceReduce::Sum(
      nullptr,
      temp_storage_bytes,
      dev_array,
      dev_sum.mutable_data<int64_t>(),
      num_items,
      context.cuda_stream());
  // Allocate temporary storage (byte count rounded up to whole T elements
  // so it can be expressed as a Resize of a T tensor).
  auto buffer_size = (temp_storage_bytes + sizeof(T)) / sizeof(T);
  dev_buffer.Resize(buffer_size);
  void* dev_temp_storage = static_cast<void*>(dev_buffer.mutable_data<T>());
  // Compute the sum (second cub call performs the actual reduction).
  cub::DeviceReduce::Sum(
      dev_temp_storage,
      temp_storage_bytes,
      dev_array,
      dev_sum.mutable_data<int64_t>(),
      num_items,
      context.cuda_stream());
  // Copy to host and synchronize so the value is valid to dereference.
  host_sum.CopyFrom(dev_sum);
  context.FinishDeviceComputation();
  return *host_sum.data<int64_t>();
}
// Returns the maximum of `num_items` values of type T held in device memory.
// Follows the standard two-pass cub::DeviceReduce protocol: the first call
// (null storage pointer) only queries the scratch size, the second runs the
// reduction. The scalar result is copied into `host_max` and read after the
// stream has been drained.
template <typename T>
T array_max(
    const T* dev_array,
    int64_t num_items,
    Tensor& dev_max_buffer,
    Tensor& dev_max,
    Tensor& host_max,
    CUDAContext& context) {
  // Size-query pass.
  size_t scratch_bytes = 0;
  cub::DeviceReduce::Max(
      nullptr,
      scratch_bytes,
      dev_array,
      dev_max.mutable_data<T>(),
      num_items,
      context.cuda_stream());
  // Round the byte count up to a whole number of T elements for Resize.
  const auto scratch_elems = (scratch_bytes + sizeof(T)) / sizeof(T);
  dev_max_buffer.Resize(scratch_elems);
  void* scratch = static_cast<void*>(dev_max_buffer.mutable_data<T>());
  // Reduction pass.
  cub::DeviceReduce::Max(
      scratch,
      scratch_bytes,
      dev_array,
      dev_max.mutable_data<T>(),
      num_items,
      context.cuda_stream());
  // Bring the scalar to the host; synchronize before dereferencing it.
  host_max.CopyFrom(dev_max);
  context.FinishDeviceComputation();
  return *host_max.data<T>();
}
// Writes the exclusive prefix sum of `dev_array` (num_items entries) into
// `prefix_sum`, i.e. prefix_sum[k] = dev_array[0] + ... + dev_array[k-1],
// with prefix_sum[0] == 0. Uses the two-pass cub::DeviceScan protocol:
// pass one (null storage pointer) reports the scratch size, pass two
// performs the scan on the context's stream. Asynchronous — no sync here.
template <typename T>
void array_prefix_sum_exclusive(
    const T* dev_array,
    const int32_t num_items,
    Tensor& prefix_buffer,
    Tensor& prefix_sum,
    CUDAContext& context) {
  size_t scratch_bytes = 0;
  prefix_sum.Resize(num_items);
  // Size-query pass.
  cub::DeviceScan::ExclusiveSum(
      nullptr,
      scratch_bytes,
      dev_array,
      prefix_sum.mutable_data<T>(),
      num_items,
      context.cuda_stream());
  // Round the byte count up to whole T elements so Resize can hold it.
  const auto scratch_elems = (scratch_bytes + sizeof(T)) / sizeof(T);
  prefix_buffer.Resize(scratch_elems);
  void* scratch = static_cast<void*>(prefix_buffer.mutable_data<T>());
  // Scan pass.
  cub::DeviceScan::ExclusiveSum(
      scratch,
      scratch_bytes,
      dev_array,
      prefix_sum.mutable_data<T>(),
      num_items,
      context.cuda_stream());
}
} // namespace
// Second-level type dispatch: resolves the DATA tensor's element type and
// forwards to DoRunWithType2<T, Data_T> (T is the LENGTHS element type).
template <>
template <typename T>
bool PackSegmentsOp<CUDAContext>::DoRunWithType() {
  using SupportedDataTypes = TensorTypes2<char, int32_t, int64_t, float>;
  return DispatchHelper<SupportedDataTypes, T>::call(this, Input(DATA));
}
// Packs the ragged sequences in DATA (segment sizes given by LENGTHS) into a
// dense output of shape [num_seq, max_length, <trailing dims of DATA>],
// padding the unused cells. T = LENGTHS element type, Data_T = DATA element
// type. Optionally emits a bool presence mask as a second output.
template <>
template <typename T, typename Data_T>
bool PackSegmentsOp<CUDAContext>::DoRunWithType2() {
  const auto& data = Input(DATA);
  const auto& lengths = Input(LENGTHS);
  int64_t num_seq = lengths.dim(0);
  const Data_T* data_ptr = data.data<Data_T>();
  const T* lengths_ptr = lengths.data<T>();
  auto* out = Output(0);
  Tensor* presence_mask = nullptr;
  if (return_presence_mask_) {
    // Optional second output: [num_seq, max_length] bools, true where the
    // corresponding output cell holds real (non-padding) data.
    presence_mask = Output(1);
  }
  CAFFE_ENFORCE_GE(data.dim(), 1, "DATA should be at least 1-D");
  CAFFE_ENFORCE_EQ(lengths.dim(), 1, "LENGTH should be 1-D");
  // Find the length of the longest sequence. Reduce on device, copy the
  // scalar back; 0 when there are no sequences (reduction needs >= 1 item).
  dev_max_length_.Resize(1);
  host_max_length_.Resize(1);
  T temp = num_seq > 0 ? array_max<T>(
                             lengths_ptr,
                             num_seq,
                             dev_buffer_,
                             dev_max_length_,
                             host_max_length_,
                             context_)
                       : 0;
  if (max_length_ != -1) {
    // A user-configured max_length overrides the measured one, but must be
    // large enough to hold the longest sequence.
    CAFFE_ENFORCE_GE(
        max_length_,
        temp,
        "Pre-defined max_length should be greater than the real max_length");
    temp = max_length_;
  }
  const T& max_length = temp;
  // Compute prefix sum over the lengths: the exclusive scan gives each
  // sequence's starting row inside DATA's dimension 0.
  array_prefix_sum_exclusive<T>(
      lengths_ptr, num_seq, dev_buffer_, dev_lengths_prefix_sum_, context_);
  bool* presence_mask_data = nullptr;
  if (return_presence_mask_) {
    std::vector<int64_t> presence_shape{lengths.numel(), max_length};
    presence_mask->Resize(presence_shape);
    presence_mask_data = presence_mask->template mutable_data<bool>();
  }
  // create output tensor
  auto shape = data.sizes().vec(); // Shape of out is batch_size x max_len x ...
  shape[0] = max_length;
  shape.insert(shape.begin(), lengths.numel());
  out->Resize(shape);
  Data_T* out_ptr = static_cast<Data_T*>(out->raw_mutable_data(data.meta()));
  // Return empty out (with the proper shape) if first dim is 0.
  if (!data.dim(0)) {
    return true;
  }
  // Do padding. Note: padding_ is honored only for float outputs; every
  // other element type is padded with 0.
  Data_T padding = out->IsType<float>() ? padding_ : 0;
  // cell_size = number of elements per row of DATA (product of trailing dims).
  int64_t cell_size = data.numel() / data.dim(0);
  PackSegmentsKernel<<<
      CAFFE_GET_BLOCKS(num_seq * max_length * cell_size),
      CAFFE_CUDA_NUM_THREADS,
      0,
      context_.cuda_stream()>>>(
      data_ptr,
      lengths_ptr,
      dev_lengths_prefix_sum_.data<T>(),
      max_length,
      num_seq,
      cell_size,
      padding,
      presence_mask_data,
      out_ptr);
  C10_CUDA_KERNEL_LAUNCH_CHECK();
  return true;
}
// Second-level type dispatch: resolves the DATA tensor's element type and
// forwards to DoRunWithType2<T, Data_T> (T is the LENGTHS element type).
template <>
template <typename T>
bool UnpackSegmentsOp<CUDAContext>::DoRunWithType() {
  using SupportedDataTypes = TensorTypes2<char, int32_t, int64_t, float>;
  return DispatchHelper<SupportedDataTypes, T>::call(this, Input(DATA));
}
// Inverse of PackSegments: takes a dense [num_seq, max_length, ...] DATA
// tensor plus the original LENGTHS and rebuilds the concatenated ragged
// tensor of shape [sum(lengths), ...], dropping the padding cells.
template <>
template <typename T, typename Data_T>
bool UnpackSegmentsOp<CUDAContext>::DoRunWithType2() {
  const auto& data = Input(DATA);
  const auto& lengths = Input(LENGTHS);
  int64_t num_seq = lengths.dim(0);
  const Data_T* data_ptr = data.data<Data_T>();
  const T* lengths_ptr = lengths.data<T>();
  auto* out = Output(0);
  CAFFE_ENFORCE_GE(data.dim(), 1, "DATA should be at least 1-D");
  CAFFE_ENFORCE_EQ(lengths.dim(), 1, "LENGTH should be 1-D");
  // Compute prefix sum over the lengths: the exclusive scan gives each
  // sequence's destination row offset in the output.
  array_prefix_sum_exclusive<T>(
      lengths_ptr, num_seq, dev_buffer_, dev_lengths_prefix_sum_, context_);
  // compute max of the lengths (0 when there are no sequences, since the
  // reduction needs at least one item).
  dev_max_length_.Resize(1);
  host_max_length_.Resize(1);
  T temp = num_seq > 0 ? array_max<T>(
                             lengths_ptr,
                             num_seq,
                             dev_buffer_,
                             dev_max_length_,
                             host_max_length_,
                             context_)
                       : 0;
  if (max_length_ != -1) {
    // A configured max_length must match DATA's packed dimension and still
    // cover the longest sequence.
    CAFFE_ENFORCE_EQ(
        max_length_,
        data.dim(1),
        "max_length should be equal to the packed segments");
    CAFFE_ENFORCE_GE(
        max_length_,
        temp,
        "Pre-defined max_length should be greater than the real max_length");
    temp = max_length_;
  }
  const T& max_length = temp;
  // compute num of cells: sum of the lengths (total rows of the output).
  dev_num_cell_.Resize(1);
  host_num_cell_.Resize(1);
  const int64_t num_cell = int_array_sum<T>(
      lengths_ptr,
      num_seq,
      dev_buffer_,
      dev_num_cell_,
      host_num_cell_,
      context_);
  // create output tensor: drop the packed max_length dimension and set
  // dimension 0 to the total number of cells.
  auto shape = data.sizes().vec();
  CAFFE_ENFORCE_EQ(
      shape[0], lengths.dim(0), "LENGTH should match DATA in dimension 0");
  shape.erase(shape.begin());
  shape[0] = num_cell;
  out->Resize(shape);
  Data_T* out_ptr = static_cast<Data_T*>(out->raw_mutable_data(data.meta()));
  // Return empty out (with the proper shape) if any of the dimensions is 0.
  if (data.dim(0) == 0 || data.dim(1) == 0) {
    return true;
  }
  // Unpack. cell_size = elements per cell (product of dims after the first
  // two packed dimensions).
  int64_t cell_size = data.numel() / (data.dim(0) * data.dim(1));
  UnpackSegmentsKernel<<<
      CAFFE_GET_BLOCKS(num_seq * max_length * cell_size),
      CAFFE_CUDA_NUM_THREADS,
      0,
      context_.cuda_stream()>>>(
      data_ptr,
      lengths_ptr,
      dev_lengths_prefix_sum_.data<T>(),
      max_length,
      num_seq,
      cell_size,
      out_ptr);
  C10_CUDA_KERNEL_LAUNCH_CHECK();
  return true;
}
// Register the CUDA implementations with Caffe2's operator registry.
REGISTER_CUDA_OPERATOR(UnpackSegments, UnpackSegmentsOp<CUDAContext>);
REGISTER_CUDA_OPERATOR(PackSegments, PackSegmentsOp<CUDAContext>);
} // namespace caffe2