| #include "caffe2/core/common_gpu.h" |
| #include "caffe2/core/context_gpu.h" |
| #include "caffe2/operators/gather_op.h" |

namespace caffe2 {

// Kernels and index-mapping functions shared by the Gather and
// BatchGather ops.
namespace gather_helper {

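// BatchGatherKernel copies one contiguous block per (outer batch, index) pair.
// Argument meanings, as passed by gather_impl_cuda below:
//   M                   - product of DATA dims before the axis (outer batches),
//   N                   - total number of indices,
//   data_batch_size     - elements per outer batch of DATA (size_from_dim(axis)),
//   gathered_batch_size - elements per outer batch of OUTPUT (N * block_size),
//   block_size          - elements per gathered slice (size_from_dim(axis + 1)),
//   indexing_axis_dim   - size of DATA along the gathered axis,
//   wrap_indices        - whether negative indices wrap around (ONNX-style).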
template <typename T_INDEX, typename TData>
__global__ void BatchGatherKernel(
    const TData* src_base,
    TData* out,
    const T_INDEX* indices,
    const int M,
    const int N,
    const int data_batch_size,
    const int gathered_batch_size,
    const int block_size,
    const int indexing_axis_dim,
    const bool wrap_indices) {
  const int begin_idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int num_items = M * N * block_size;
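  // Grid-stride loop: each thread strides over the flattened output, so the
  // launch configuration does not have to cover all num_items exactly.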
  for (int s = begin_idx; s < num_items; s += blockDim.x * gridDim.x) {
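    // Decompose the flat id s into (i, j, k): i selects the outer batch,
    // j selects which index within that batch, k is the offset in the block.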
    const int k = s % block_size;
    const int j = s / block_size % N;
    const int i = s / block_size / N;
    T_INDEX idx = indices[j];
    if (wrap_indices && (idx < 0)) {
      idx = idx + (T_INDEX)indexing_axis_dim;
    }
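    // src_offset points at slice idx inside outer batch i of DATA; dst_offset
    // points at slice j inside outer batch i of the gathered output.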
    const TData* src_offset = src_base + i * data_batch_size + idx * block_size;
    TData* dst_offset = out + i * gathered_batch_size + j * block_size;
    dst_offset[k] = src_offset[k];
  }
}

// Actual gather implementation - resizes output and copies indexed data.
template <typename Index>
static bool gather_impl_cuda(
    Operator<CUDAContext>* op,
    int dataIdx,
    int indicesIdx,
    int outputIdx,
    int axis,
    bool wrap_indices,
    bool match_outer) {
  const Tensor& data = op->Input(dataIdx);
  const Tensor& indices = op->Input(indicesIdx);
  const TypeMeta dataType = data.dtype();
  size_t item_bytesize = dataType.itemsize();

  CAFFE_ENFORCE(
      !match_outer, "match_outer=true is currently NOT supported for CUDA");

  // ONNX allows a negative axis to index from the back; valid range: [-r, r - 1].
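  // e.g. for a (hypothetical) 3-D DATA tensor, axis = -1 maps to axis = 2.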
  if (axis < 0) {
    axis = data.dim() + axis;
  }
  CAFFE_ENFORCE_GE(
      data.dim(), axis + 1, "DATA should be at least [axis+1]-D");
  CAFFE_ENFORCE_GE(axis, 0, "Axis should be non-negative");
  CAFFE_ENFORCE_LT(axis, data.dim(), "Axis out of range");

  // New shape:
  //  [data dims before axis] + [indices dims] + [data dims after axis]
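  // For example (hypothetical shapes): DATA [2, 5, 3] gathered with
  // INDICES [4] along axis = 1 yields OUTPUT [2, 4, 3].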
  vector<int64_t> shape = calc_output_shape_vector<int64_t>(
      data.sizes(), indices.sizes(), axis, match_outer);
  Tensor* output = op->Output(outputIdx, shape, at::dtype(dataType));
  float* out = static_cast<float*>(output->raw_mutable_data(dataType));

  // Succeed early if the output is empty, which can happen for an empty batch
  // (i.e. a DATA dimension of size 0).
  // This *must* be done AFTER output->raw_mutable_data() above, as that call
  // has an important allocation side effect that we rely on.
  if (output->numel() == 0) {
    return true;
  }

  const Index* idxs = indices.template data<Index>();
  const float* src_base = static_cast<const float*>(data.raw_data());

  const int outer_dims_product = data.size_to_dim(axis);
  const int block_size = data.size_from_dim(axis + 1);

  const int src_indexing_axis_dim = data.size(axis);
  // Treat INDICES as a single flat list even if it has multiple dimensions.
  // The "gathered batch" is the concatenation of all N indexed blocks that
  // make up one outer batch of the output.
  const int N = indices.numel();
  const auto gathered_batch_size = N * block_size;
  const auto src_batch_size = data.size_from_dim(axis);
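  // With the hypothetical shapes above (DATA [2, 5, 3], INDICES [4], axis = 1):
  // outer_dims_product = 2, block_size = 3, src_indexing_axis_dim = 5, N = 4,
  // src_batch_size = 15 and gathered_batch_size = 12. All sizes are in
  // elements, and the float* casts above mean this path assumes a 4-byte
  // (float-sized) data type.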

  // Only run kernel if input is not empty.
  if (N > 0) {
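    // Launch at most CAFFE_MAXIMUM_NUM_BLOCKS blocks of at most
    // CAFFE_CUDA_NUM_THREADS threads; the grid-stride loop inside the kernel
    // covers any elements beyond the launched thread count.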
    BatchGatherKernel<<<
        std::min(outer_dims_product, CAFFE_MAXIMUM_NUM_BLOCKS),
        std::min(N * block_size, CAFFE_CUDA_NUM_THREADS),
        0,
        op->getContext()->cuda_stream()>>>(
        src_base,
        out,
        idxs,
        outer_dims_product,
        N,
        src_batch_size,
        gathered_batch_size,
        block_size,
        src_indexing_axis_dim,
        wrap_indices);
    C10_CUDA_KERNEL_LAUNCH_CHECK();
  }
  return true;
}
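
// Illustrative sketch (not part of this helper): an operator's DoRunWithType
// can dispatch into gather_impl_cuda roughly like this, assuming the usual
// GatherOp members (axis_, wrap_indices_, match_outer_) and input tags
// (DATA, INDICES) declared in gather_op.h:
//
//   template <>
//   template <typename Index>
//   bool GatherOp<CUDAContext>::DoRunWithType() {
//     return gather_helper::gather_impl_cuda<Index>(
//         this, DATA, INDICES, 0, axis_, wrap_indices_, match_outer_);
//   }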

} // namespace gather_helper
} // namespace caffe2