#include <algorithm>
#include <functional>
#include <numeric>

#include <cub/cub.cuh>
#include "caffe2/utils/cub_namespace.cuh"

#include "caffe2/core/context_gpu.h"
#include "caffe2/operators/sequence_ops.h"

#include "caffe2/core/operator.h"
#include "caffe2/core/tensor.h"

namespace caffe2 {

namespace {
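// Pads one sequence per CUDA block: the grid has lengths_size blocks, and the
// threads of each block cooperatively write the start padding, copy the
// payload, and write the end padding for their sequence. lengths_prefix_sum
// is an inclusive prefix sum of the (unpadded) lengths, in blocks; when it is
// null the kernel is launched with a single block, so element_idx == 0 and
// the prefix sum is never dereferenced. padding_start_ptr and padding_end_ptr
// each point to one row of block_size padding values, or are null for zero
// fill.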
template <typename T>
__global__ void AddPaddingKernel(
    const T* in,
    int block_size,
    int lengths_size,
    int outer_size,
    const int32_t* lengths_prefix_sum,
    const T* padding_start_ptr,
    int start_padding_width_blocks,
    const T* padding_end_ptr,
    int end_padding_width_blocks,
    T* out,
    int32_t* lengths_out) {
  int element_idx = blockIdx.x;
  int prior_padding =
      element_idx * (start_padding_width_blocks + end_padding_width_blocks);
  int out_start_idx = element_idx == 0
      ? 0
      : lengths_prefix_sum[element_idx - 1] + prior_padding;
  int len_blocks;
  int in_start_idx;
  if (lengths_prefix_sum) {
    len_blocks = lengths_prefix_sum[element_idx] -
        (element_idx == 0 ? 0 : lengths_prefix_sum[element_idx - 1]);
    in_start_idx = lengths_prefix_sum[element_idx] - len_blocks;
  } else {
    // Only one element, use the outer size
    CUDA_KERNEL_ASSERT(lengths_size == 1);
    len_blocks = outer_size;
    in_start_idx = 0;
  }

  out_start_idx *= block_size;
  in_start_idx *= block_size;

  int len = len_blocks * block_size;
  int start_padding_width = start_padding_width_blocks * block_size;
  int end_padding_width = end_padding_width_blocks * block_size;

  // start pad
  T* out_ptr = out + out_start_idx;
  for (int i = threadIdx.x; i < start_padding_width; i += blockDim.x) {
    T fill = padding_start_ptr ? padding_start_ptr[i % block_size] : T(0);
    out_ptr[i] = fill;
  }

  // payload
  for (int i = threadIdx.x; i < len; i += blockDim.x) {
    out_ptr[i + start_padding_width] = in[in_start_idx + i];
  }

  // end pad
  for (int i = threadIdx.x; i < end_padding_width; i += blockDim.x) {
    T fill = padding_end_ptr ? padding_end_ptr[i % block_size] : T(0);
    out_ptr[i + start_padding_width + len] = fill;
  }

  // update the lengths
  if (threadIdx.x == 0 && lengths_out != nullptr) {
    lengths_out[element_idx] =
        len_blocks + start_padding_width_blocks + end_padding_width_blocks;
  }
}

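// Inverse of AddPaddingKernel, again one sequence per CUDA block. Here the
// input lengths (and hence lengths_prefix_sum) describe the *padded*
// sequences, so the payload copy skips the start padding on the input side
// and the updated lengths subtract both padding widths.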
template <typename T>
__global__ void RemovePaddingKernel(
    const T* in,
    int block_size,
    int lengths_size,
    int outer_size,
    const int32_t* lengths_prefix_sum,
    int start_padding_width_blocks,
    int end_padding_width_blocks,
    T* out,
    int32_t* lengths_out) {
  int element_idx = blockIdx.x;
  int prior_padding =
      element_idx * (start_padding_width_blocks + end_padding_width_blocks);
  int out_start_idx = element_idx == 0
      ? 0
      : lengths_prefix_sum[element_idx - 1] - prior_padding;
  int len_blocks;
  int in_start_idx;
  if (lengths_prefix_sum) {
    len_blocks = lengths_prefix_sum[element_idx] -
        (element_idx == 0 ? 0 : lengths_prefix_sum[element_idx - 1]);
    in_start_idx = lengths_prefix_sum[element_idx] - len_blocks;
  } else {
    // Only one element, use the outer size
    CUDA_KERNEL_ASSERT(lengths_size == 1);
    len_blocks = outer_size;
    in_start_idx = 0;
  }

  out_start_idx *= block_size;
  in_start_idx *= block_size;

  // The input lengths include both padding regions, so the payload length
  // excludes them.
  int len =
      (len_blocks - start_padding_width_blocks - end_padding_width_blocks) *
      block_size;
  int start_padding_width = start_padding_width_blocks * block_size;

  // payload: the output is unpadded, so it is written contiguously from
  // out_start_idx, while the input is read past its start padding
  T* out_ptr = out + out_start_idx;
  for (int i = threadIdx.x; i < len; i += blockDim.x) {
    out_ptr[i] = in[in_start_idx + i + start_padding_width];
  }

  // update the lengths
  if (threadIdx.x == 0 && lengths_out != nullptr) {
    lengths_out[element_idx] =
        len_blocks - (start_padding_width_blocks + end_padding_width_blocks);
  }
}

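// Computes an inclusive (or, with Inclusive = false, exclusive) prefix sum of
// `lengths` on the device, following the standard two-phase cub::DeviceScan
// convention: the first call, with a null temp-storage pointer, only reports
// the required temp_storage_bytes; after allocating that scratch space out of
// prefix_buffer, an identical second call performs the actual scan into
// prefix_sum.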
template <bool Inclusive = true>
void lengths_prefix_sum(
    const int32_t* lengths,
    int32_t num_items,
    Tensor* prefix_buffer,
    Tensor* prefix_sum,
    CUDAContext* context) {
  // Retrieve buffer size
  size_t temp_storage_bytes = 0;
  prefix_sum->Resize(num_items);
  if (Inclusive) {
    cub::DeviceScan::InclusiveSum(
        nullptr,
        temp_storage_bytes,
        lengths,
        prefix_sum->template mutable_data<int32_t>(),
        num_items,
        context->cuda_stream());
  } else {
    cub::DeviceScan::ExclusiveSum(
        nullptr,
        temp_storage_bytes,
        lengths,
        prefix_sum->template mutable_data<int32_t>(),
        num_items,
        context->cuda_stream());
  }

  // Allocate temporary storage, in int32_t units covering temp_storage_bytes
  auto buffer_size = (temp_storage_bytes + sizeof(int32_t)) / sizeof(int32_t);
  prefix_buffer->Resize(buffer_size);
  void* d_temp_storage =
      static_cast<void*>(prefix_buffer->template mutable_data<int32_t>());

  if (Inclusive) {
    cub::DeviceScan::InclusiveSum(
        d_temp_storage,
        temp_storage_bytes,
        lengths,
        prefix_sum->template mutable_data<int32_t>(),
        num_items,
        context->cuda_stream());
  } else {
    cub::DeviceScan::ExclusiveSum(
        d_temp_storage,
        temp_storage_bytes,
        lengths,
        prefix_sum->template mutable_data<int32_t>(),
        num_items,
        context->cuda_stream());
  }
}
} // namespace

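// Host-side driver for AddPadding on CUDA: computes an inclusive prefix sum
// of the lengths on the device (when lengths are given), optionally emits the
// padded lengths to Output(1), and launches AddPaddingKernel with one block
// per sequence. For example, with lengths [2, 3] and start/end padding widths
// of 1, the output holds 4 + 5 rows and the emitted lengths are [4, 5].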
template <>
template <typename T>
bool AddPaddingOp<CUDAContext>::MakePadding(
    const T* in_ptr,
    T* out_ptr,
    const int32_t* lengths_ptr,
    int32_t lengths_size,
    int32_t outer_size,
    const T* padding_start_ptr,
    const T* padding_end_ptr,
    int64_t block_size) {
  // Step 1: compute the prefix sum over the lengths -- unless no lengths
  // were given, i.e., there is only one segment
  const int32_t* lengths_prefix_sum_ptr = nullptr;
  if (lengths_ptr != nullptr) {
    lengths_prefix_sum(
        lengths_ptr,
        lengths_size,
        &lengths_prefix_sum_buffer_,
        &lengths_prefix_sum_,
        &context_);
    lengths_prefix_sum_ptr = lengths_prefix_sum_.data<int32_t>();
  }

  int32_t* lengths_out_ptr = nullptr;
  if (OutputSize() > 1) {
    auto* lengths_out = Output(1, {lengths_size}, at::dtype<int32_t>());
    lengths_out_ptr = lengths_out->template mutable_data<int32_t>();
  }

  if (lengths_size == 0) {
    return true;
  }

  // Step 2: add the padding using the accumulated lengths, one CUDA block
  // per sequence
  AddPaddingKernel<T>
      <<<lengths_size, CAFFE_CUDA_NUM_THREADS, 0, context_.cuda_stream()>>>(
          in_ptr,
          block_size,
          lengths_size,
          outer_size,
          lengths_prefix_sum_ptr,
          padding_start_ptr,
          startPaddingWidth_,
          padding_end_ptr,
          endPaddingWidth_,
          out_ptr,
          lengths_out_ptr);
  C10_CUDA_KERNEL_LAUNCH_CHECK();

  return true;
}

REGISTER_CUDA_OPERATOR(AddPadding, AddPaddingOp<CUDAContext>);

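// Host-side driver for RemovePadding on CUDA. The optional lengths input
// describes the padded sequences, so the first output dimension shrinks by
// (startPaddingWidth_ + endPaddingWidth_) * lengths_size rows; the kernel is
// launched with one block per sequence, mirroring AddPadding.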
template <>
template <typename T>
bool RemovePaddingOp<CUDAContext>::DoRunWithType() {
  const auto& in = Input(0);
  CAFFE_ENFORCE_GE(in.dim(), 1);
  const int32_t outer_size = in.sizes()[0];
  const auto block_size = std::accumulate(
      in.sizes().begin() + 1,
      in.sizes().end(),
      int64_t{1},
      std::multiplies<int64_t>());

  // If no lengths input is provided, assume a single full-span entry
  const int32_t* lengths_ptr = nullptr;
  int32_t lengths_size = 1;
  if (InputSize() > 1) {
    const auto& lengths = Input(1);
    lengths_ptr = lengths.data<int32_t>();
    lengths_size = lengths.numel();
  }

  auto out_dims = in.sizes().vec();
  out_dims[0] -= (startPaddingWidth_ + endPaddingWidth_) * lengths_size;
  auto* out = Output(0, out_dims, at::dtype<T>());
  const auto* in_ptr = in.template data<T>();
  auto* out_ptr = out->template mutable_data<T>();

  // Step 1: compute the prefix sum over the (padded) lengths -- unless no
  // lengths were given, i.e., there is only one segment
  const int32_t* lengths_prefix_sum_ptr = nullptr;
  if (lengths_ptr != nullptr) {
    lengths_prefix_sum(
        lengths_ptr,
        lengths_size,
        &lengths_prefix_sum_buffer_,
        &lengths_prefix_sum_,
        &context_);
    lengths_prefix_sum_ptr = lengths_prefix_sum_.data<int32_t>();
  }

  int32_t* lengths_out_ptr = nullptr;
  if (OutputSize() > 1) {
    auto* lengths_out = Output(1, {lengths_size}, at::dtype<int32_t>());
    lengths_out_ptr = lengths_out->template mutable_data<int32_t>();
  }

  if (lengths_size == 0) {
    return true;
  }

  // Step 2: remove the padding using the accumulated lengths, one CUDA block
  // per sequence
  RemovePaddingKernel<T>
      <<<lengths_size, CAFFE_CUDA_NUM_THREADS, 0, context_.cuda_stream()>>>(
          in_ptr,
          block_size,
          lengths_size,
          outer_size,
          lengths_prefix_sum_ptr,
          startPaddingWidth_,
          endPaddingWidth_,
          out_ptr,
          lengths_out_ptr);
  C10_CUDA_KERNEL_LAUNCH_CHECK();

  return true;
}

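// Sums the padding regions of all K sequences into Y0 (start padding) and Y1
// (end padding), processing one column i of the width-N inner dimension per
// CUDA block. X is laid out as rows of width N; I holds the per-sequence
// (padded) lengths and L their exclusive prefix sum, so row L[j1] + j2 is the
// j2-th start-padding row of sequence j1, and row L[j1] + I[j1] - Y1Width + j2
// is its j2-th end-padding row. Partial per-thread sums are combined with
// cub::BlockReduce; when Y0 and Y1 alias, both sums go into the single output.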
template <typename T>
__global__ void gather_padding_kernel(
    const int K,
    const int N,
    const int Y0Width,
    const int Y1Width,
    const T* X,
    const int* I,
    const int* L,
    T* Y0,
    T* Y1) {
  // Reduce in T (rather than float) so non-float types keep full precision.
  typedef cub::BlockReduce<T, CAFFE_CUDA_NUM_THREADS> BlockReduce;
  __shared__ typename BlockReduce::TempStorage y0_tmp;
  __shared__ typename BlockReduce::TempStorage y1_tmp;
  for (int i = blockIdx.x; i < N; i += gridDim.x) {
    T sum_1 = T(0);
    T sum_2 = T(0);
    // Accumulate the start-padding rows of every sequence for column i.
    for (int j = threadIdx.x; j < K * Y0Width; j += blockDim.x) {
      const int j1 = j / Y0Width;
      const int j2 = j % Y0Width;
      const int idx1 = N * (L[j1] + j2);
      sum_1 += X[idx1 + i];
    }
    // Accumulate the end-padding rows of every sequence for column i.
    for (int j = threadIdx.x; j < K * Y1Width; j += blockDim.x) {
      const int j1 = j / Y1Width;
      const int j2 = j % Y1Width;
      const int idx1 = N * L[j1];
      const int idx2 = idx1 + N * (I[j1] - Y1Width + j2);
      sum_2 += X[idx2 + i];
    }
    sum_1 = BlockReduce(y0_tmp).Reduce(sum_1, cub::Sum());
    sum_2 = BlockReduce(y1_tmp).Reduce(sum_2, cub::Sum());
    if (threadIdx.x == 0) {
      if (Y0 != Y1) {
        Y0[i] = sum_1;
        Y1[i] = sum_2;
      } else {
        // Start and end padding share one output blob; store the total.
        Y0[i] = sum_1 + sum_2;
      }
    }
    __syncthreads();
  }
}

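// Host-side driver for GatherPadding on CUDA: builds an exclusive prefix sum
// of the lengths, then launches gather_padding_kernel with at most
// CAFFE_MAXIMUM_NUM_BLOCKS blocks, one per column of the inner dimension.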
template <>
template <typename T>
void GatherPaddingOp<CUDAContext>::GatherPadding(
    const int outer_size,
    const int lengths_size,
    const int block_size,
    const int pad_width,
    const T* in_ptr,
    const int* lengths_ptr,
    T* padding_start_ptr,
    T* padding_end_ptr) {
  if (lengths_size > 0) {
    lengths_prefix_sum<false>(
        lengths_ptr,
        lengths_size,
        &lengths_prefix_sum_buffer_,
        &lengths_prefix_sum_,
        &context_);
    gather_padding_kernel<T>
        <<<std::min(block_size, CAFFE_MAXIMUM_NUM_BLOCKS),
           CAFFE_CUDA_NUM_THREADS,
           0,
           context_.cuda_stream()>>>(
            lengths_size,
            block_size,
            startPaddingWidth_,
            endPaddingWidth_,
            in_ptr,
            lengths_ptr,
            lengths_prefix_sum_.template data<int>(),
            padding_start_ptr,
            padding_end_ptr);
    C10_CUDA_KERNEL_LAUNCH_CHECK();
  }
}
REGISTER_CUDA_OPERATOR(RemovePadding, RemovePaddingOp<CUDAContext>);
REGISTER_CUDA_OPERATOR(GatherPadding, GatherPaddingOp<CUDAContext>);
} // namespace caffe2