#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
#include <ATen/core/Tensor.h>
#include <ATen/Dispatch.h>
#include <ATen/cuda/CUDAContext.h>
#include <ATen/cuda/ThrustAllocator.h>
#include <c10/util/Load.h>
#ifndef AT_PER_OPERATOR_HEADERS
#include <ATen/Functions.h>
#else
#include <ATen/ops/_unique2_native.h>
#include <ATen/ops/_unique_native.h>
#include <ATen/ops/arange.h>
#include <ATen/ops/empty.h>
#include <ATen/ops/unique_consecutive_native.h>
#include <ATen/ops/unique_dim_consecutive_native.h>
#include <ATen/ops/unique_dim_native.h>
#endif
#include <tuple>
#include <iterator>
#include <thrust/adjacent_difference.h>
#include <thrust/execution_policy.h>
#include <thrust/unique.h>
#include <thrust/sort.h>
#include <thrust/scan.h>
#include <thrust/scatter.h>
#include <ATen/native/cuda/UniqueCub.cuh>
namespace at {
namespace native {
namespace {
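// Shared helper for the thrust-based unique kernels.  `data` must already be
// grouped so that equal elements are adjacent (e.g. sorted); equal runs are
// compacted in place and the helper returns:
//   - inverse_indices: for each original element, the index of its run in the
//     compacted output, scattered back to the original order through
//     `sorted_indices` (only computed when return_inverse is true),
//   - counts: the length of each run (only computed when return_counts is true),
//   - num_out: the number of unique runs left in `data`.
// `equal` / `not_equal` let callers plug in custom comparisons, e.g. comparing
// whole rows for unique_dim.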
template <
typename policy_t, typename scalar_t,
typename equal_t, typename not_equal_t
>
std::tuple<Tensor, Tensor, int64_t> compute_unique(
const policy_t &policy,
scalar_t *data,
int64_t num_inp,
const Tensor &sorted_indices,
const bool return_inverse,
const bool return_counts,
TensorOptions options,
equal_t equal,
not_equal_t not_equal
) {
// inverse indices
Tensor inverse_indices;
if (!return_inverse || num_inp == 0) {
inverse_indices = at::empty({0}, options);
} else {
TORCH_CHECK(sorted_indices.defined(),
"return_inverse is set to true, but sorted_indices is undefined. Send a bug report!");
const int64_t *sorted_indices_ptr = sorted_indices.data_ptr<int64_t>();
Tensor inv_loc = at::empty({num_inp}, options);
inverse_indices = at::empty({num_inp}, options);
int64_t* inv_loc_ptr = inv_loc.data_ptr<int64_t>();
int64_t* inverse_indices_ptr = inverse_indices.data_ptr<int64_t>();
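// Three-step inverse computation on the grouped data:
//   1. adjacent_difference with `not_equal` writes a nonzero flag wherever a
//      new run of equal elements starts (the leading slot is reset to 0 below),
//   2. inclusive_scan turns those flags into running group ids,
//   3. scatter writes each group id back to the element's original position
//      via sorted_indices.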
thrust::adjacent_difference(policy, data, data + num_inp, inv_loc_ptr, not_equal);
inv_loc[0] = 0;
thrust::inclusive_scan(policy, inv_loc_ptr, inv_loc_ptr + num_inp, inv_loc_ptr);
thrust::scatter(policy, inv_loc_ptr, inv_loc_ptr + num_inp, sorted_indices_ptr, inverse_indices_ptr);
}
// unique and count
Tensor counts = at::empty({0}, options);
int64_t num_out;
if (!return_counts) {
num_out = thrust::unique(policy, data, data + num_inp, equal) - data;
} else {
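// unique_by_key keeps, for each surviving element of `data`, the offset of
// the first element of its run in `range`.  Appending num_inp as a final
// boundary and taking adjacent differences of consecutive boundaries yields
// the run lengths, i.e. the counts.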
Tensor range = at::arange(0, num_inp + 1, options);
int64_t *range_ptr = range.data_ptr<int64_t>();
num_out = thrust::unique_by_key(policy, data, data + num_inp, range_ptr, equal).first - data;
range[num_out] = num_inp;
counts.resize_(num_out);
int64_t* counts_ptr = counts.data_ptr<int64_t>();
thrust::adjacent_difference(policy, range_ptr + 1, range_ptr + num_out + 1, counts_ptr);
}
AT_CUDA_CHECK(cudaGetLastError());
return std::tuple<Tensor, Tensor, int64_t>(inverse_indices, counts, num_out);
}
template <typename scalar_t>
std::tuple<Tensor, Tensor, Tensor> unique_dim_cuda_template(
const Tensor& self,
const int64_t dim,
const bool consecutive,
const bool return_inverse,
const bool return_counts
) {
/**
 * The idea for implementing this is basically the same as unique.
 * For unique_dim, we take the unique with respect to an index tensor,
 * but during the process we override the compare and equal operators
 * by checking the data underlying it instead. After the algorithm, we
 * use index_select to map the resulting indices back to the actual data.
 */
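//
// Illustrative example (assuming dim == 0): for the CUDA tensor
//   [[1, 2],
//    [3, 4],
//    [1, 2]]
// the row indices {0, 1, 2} are sorted by comparing the rows' contents,
// duplicate rows collapse, and the result is [[1, 2], [3, 4]] with
// inverse_indices == [0, 1, 0] and counts == [2, 1].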
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
at::cuda::ThrustAllocator allocator;
auto policy = thrust::cuda::par(allocator).on(stream);
auto sizes = self.sizes().vec();
// check how many zero dimensions exist
auto num_zero_dims = std::count(sizes.begin(), sizes.end(), 0);
// If the selected dimension is empty, unique is only well defined when no
// other dimension is zero-sized.
if (self.size(dim) == 0){
TORCH_CHECK(
num_zero_dims == 1,
"Number of zero sized dimensions is more than one, so unique cannot be applied ")
Tensor output = at::empty(sizes, self.options());
Tensor inverse_indices =
at::empty({0}, self.options().dtype(kLong));
Tensor counts = at::empty({0}, self.options().dtype(kLong));
return std::make_tuple(output, inverse_indices, counts);
}
TORCH_CHECK(num_zero_dims == 0,
"There are 0 sized dimensions, and they aren't selected, so unique cannot be applied");
int64_t num_inp = self.size(dim);
auto options = self.options().dtype(kLong);
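// Move `dim` to the front and flatten the remaining dimensions, so that each
// slice along `dim` becomes one contiguous row of length n.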
Tensor input_flat = self.transpose(dim, 0).contiguous().view({num_inp, -1});
int64_t n = input_flat.size(1);
scalar_t *input_flat_ptr = input_flat.data_ptr<scalar_t>();
Tensor indices = at::arange(0, num_inp, options);
int64_t *indices_data = indices.data_ptr<int64_t>();
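// Unless the caller asked for the consecutive variant, sort the row indices
// by lexicographic comparison of the underlying row data so that equal rows
// become adjacent.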
if (!consecutive) {
thrust::sort(policy, indices_data, indices_data + num_inp,
[=] __device__ (int64_t a, int64_t b) -> bool {
for (int64_t i = 0; i < n; ++i) {
scalar_t lhs = c10::load(&input_flat_ptr[i + a * n]);
scalar_t rhs = c10::load(&input_flat_ptr[i + b * n]);
if (lhs < rhs) {
return true;
} else if (lhs > rhs) {
return false;
}
}
return false;
}
);
}
Tensor inverse_indices, counts;
int64_t num_out;
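// Run the shared unique helper on the row indices, overriding the equality
// comparators so that two indices compare equal iff the rows they refer to
// are elementwise equal.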
std::tie(inverse_indices, counts, num_out) = compute_unique(
policy, indices_data, num_inp, indices,
return_inverse, return_counts, options,
[=] __device__ (int64_t a, int64_t b) -> bool {
for (int64_t i = 0; i < n; ++i) {
scalar_t lhs = c10::load(&input_flat_ptr[i + a * n]);
scalar_t rhs = c10::load(&input_flat_ptr[i + b * n]);
if (lhs != rhs) {
return false;
}
}
return true;
},
[=] __device__ (int64_t a, int64_t b) -> int64_t {
for (int64_t i = 0; i < n; ++i) {
scalar_t lhs = c10::load(&input_flat_ptr[i + a * n]);
scalar_t rhs = c10::load(&input_flat_ptr[i + b * n]);
if (lhs != rhs) {
return 1;
}
}
return 0;
}
);
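// Keep only the first num_out (unique) row indices and gather the
// corresponding slices of `self` along `dim`.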
indices.resize_(num_out);
return std::tuple<Tensor, Tensor, Tensor>(self.index_select(dim, indices), inverse_indices, counts);
}
} // namespace
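// Public CUDA entry points for the unique family of operators, registered
// through the ATen dispatcher.  The scalar variants forward to the CUB-based
// internal::unique_cuda_template from UniqueCub.cuh; the dim variants use the
// thrust-based unique_dim_cuda_template above.  Roughly (the exact routing is
// decided by the operator definitions, not in this file):
//   torch.unique(t, return_counts=True)   -> _unique2_cuda
//   torch.unique(t, dim=0)                -> unique_dim_cuda
//   torch.unique_consecutive(t)           -> unique_consecutive_cuda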
std::tuple<Tensor, Tensor>
_unique_cuda(const Tensor& self, const bool sorted, const bool return_inverse) {
return AT_DISPATCH_ALL_TYPES_AND2(kBool, kHalf, self.scalar_type(), "unique", [&] {
// The current CUDA implementation of unique always sorts due to the
// lack of a hashtable implementation in thrust
Tensor output, inverse;
std::tie(output, inverse, std::ignore) = internal::unique_cuda_template<scalar_t>(self, false, return_inverse, false);
return std::make_tuple(output, inverse);
});
}
std::tuple<Tensor, Tensor, Tensor>
_unique2_cuda(const Tensor& self, const bool sorted, const bool return_inverse, const bool return_counts) {
return AT_DISPATCH_ALL_TYPES_AND2(kBool, kHalf, self.scalar_type(), "unique", [&] {
// The current CUDA implementation of unique always sorts due to the
// lack of a hashtable implementation in thrust
return internal::unique_cuda_template<scalar_t>(self, false, return_inverse, return_counts);
});
}
std::tuple<Tensor, Tensor, Tensor>
unique_dim_cuda(const Tensor& self, const int64_t dim, const bool sorted, const bool return_inverse, const bool return_counts) {
return AT_DISPATCH_ALL_TYPES_AND2(kBool, kHalf, self.scalar_type(), "unique_dim", [&] {
return unique_dim_cuda_template<scalar_t>(self, dim, false, return_inverse, return_counts);
});
}
std::tuple<Tensor, Tensor, Tensor>
unique_dim_consecutive_cuda(const Tensor& self, const int64_t dim, const bool return_inverse, const bool return_counts) {
return AT_DISPATCH_ALL_TYPES_AND2(kBool, kHalf, self.scalar_type(), "unique_dim", [&] {
return unique_dim_cuda_template<scalar_t>(self, dim, true, return_inverse, return_counts);
});
}
std::tuple<Tensor, Tensor, Tensor>
unique_consecutive_cuda(const Tensor& self, const bool return_inverse, const bool return_counts, c10::optional<int64_t> dim) {
if (!dim.has_value()) {
return AT_DISPATCH_ALL_TYPES_AND2(kBool, kHalf, self.scalar_type(), "unique", [&] {
// The current CUDA implementation of unique always sorts due to the
// lack of a hashtable implementation in thrust
return internal::unique_cuda_template<scalar_t>(self, true, return_inverse, return_counts);
});
}
return unique_dim_consecutive_cuda(self, dim.value(), return_inverse, return_counts);
}
} // namespace native
} // namespace at