#include "caffe2/operators/transpose_op.h"
#include <algorithm>
#include <limits>
#include <type_traits>
#include <vector>
#include "caffe2/core/context_gpu.h"
#include "caffe2/core/cudnn_wrappers.h"
#include "caffe2/core/types.h"
#include "caffe2/utils/math.h"
namespace caffe2 {
namespace {
class CuDNNTransposeOp final : public Operator<CUDAContext> {
 public:
  USE_OPERATOR_FUNCTIONS(CUDAContext);

  template <class... Args>
  explicit CuDNNTransposeOp(Args&&... args)
      : Operator<CUDAContext>(std::forward<Args>(args)...),
        cudnn_wrapper_(&context_),
        axes_(OperatorBase::GetRepeatedArgument<int>("axes")) {
    // Check the legality of axes_: it should be a permutation of
    // {0, 1, ..., axes_.size() - 1}.
    std::vector<int> axes_sorted(axes_);
    std::sort(axes_sorted.begin(), axes_sorted.end());
    for (std::size_t i = 0; i < axes_sorted.size(); ++i) {
      if (axes_sorted[i] != static_cast<int>(i)) {
        CAFFE_THROW("Axes should be a permutation of 0 to ndim.");
      }
    }
    CUDNN_ENFORCE(cudnnCreateTensorDescriptor(&X_desc_));
    CUDNN_ENFORCE(cudnnCreateTensorDescriptor(&Y_desc_));
  }

  ~CuDNNTransposeOp() override {
    CUDNN_ENFORCE(cudnnDestroyTensorDescriptor(X_desc_));
    CUDNN_ENFORCE(cudnnDestroyTensorDescriptor(Y_desc_));
  }
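
  // Dispatch on the input's data type: DoRunWithType<float> or
  // DoRunWithType<int> is invoked depending on Input(0)'s dtype.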
  bool RunOnDevice() override {
    return DispatchHelper<TensorTypes<float, int>>::call(this, Input(0));
  }

  template <typename T>
  bool DoRunWithType() {
    const auto& X = Input(0);
    const int ndim = X.dim();
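    // An empty "axes" argument defaults to reversing the dimension order,
    // i.e. axes_ = {ndim - 1, ..., 1, 0}.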
    if (axes_.empty()) {
      axes_.resize(ndim);
      std::iota(axes_.rbegin(), axes_.rend(), 0);
    } else {
      CAFFE_ENFORCE_EQ(axes_.size(), ndim);
    }
    std::vector<std::int64_t> X_dims = X.sizes().vec();
    std::vector<std::int64_t> Y_dims(ndim);
    for (int i = 0; i < ndim; ++i) {
      Y_dims[i] = X_dims[axes_[i]];
    }
    auto* Y = Output(0, Y_dims, at::dtype<T>());
    const T* X_data = X.template data<T>();
    T* Y_data = Y->template mutable_data<T>();
    if (X.numel() == 0) {
      return true;
    }
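    // The cuDNN path only handles floating-point tensors with
    // 3 <= ndim <= CUDNN_DIM_MAX and fewer than INT32_MAX elements;
    // everything else goes through the generic CUDA transpose kernel.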
    if (!IsFloatType<T>() || !IsCuDNNValidTensor(X)) {
      math::Transpose<std::int64_t, T, CUDAContext>(
          ndim, X_dims.data(), axes_.data(), X_data, Y_data, &context_);
      return true;
    }
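    // Re-set the tensor descriptors only when the dtype or input shape
    // changes between runs; otherwise reuse the cached descriptors.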
    if (cudnnTypeWrapper<T>::type != cached_dtype_ ||
        X_dims != cached_X_dims_) {
      SetTensorDescriptor(cudnnTypeWrapper<T>::type, X_dims, Y_dims);
      cached_dtype_ = cudnnTypeWrapper<T>::type;
      cached_X_dims_ = X_dims;
    }
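    // With alpha = 1 and beta = 0, cudnnTransformTensor copies X, viewed
    // through X_desc_'s permuted strides, into the contiguous output buffer,
    // which performs the transpose on the GPU.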
    CUDNN_ENFORCE(cudnnTransformTensor(
        cudnn_wrapper_.inline_cudnn_handle(),
        cudnnTypeWrapper<T>::kOne(),
        X_desc_,
        X_data,
        cudnnTypeWrapper<T>::kZero(),
        Y_desc_,
        Y_data));
    return true;
  }

 private:
  template <typename T>
  constexpr bool IsFloatType() const {
    return std::is_same<T, float>::value || std::is_same<T, double>::value ||
        std::is_same<T, at::Half>::value;
  }

  bool IsCuDNNValidTensor(const Tensor& X) const {
    const int ndim = X.dim();
    return ndim >= 3 && ndim <= CUDNN_DIM_MAX &&
        X.numel() < std::numeric_limits<int32_t>::max();
  }
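
  // Both descriptors use Y's shape; X_desc_ additionally carries X's strides
  // permuted by axes_, so element i of X_desc_ addresses the element of X
  // that lands at position i of the transposed output.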
  void SetTensorDescriptor(
      const cudnnDataType_t data_type,
      const std::vector<std::int64_t>& X_dims,
      const std::vector<std::int64_t>& Y_dims) {
    const int ndim = X_dims.size();
    std::vector<int> dims(Y_dims.cbegin(), Y_dims.cend());
    std::vector<int> X_strides(ndim);
    std::vector<int> X_buff(ndim);
    std::vector<int> Y_strides(ndim);
    X_buff.back() = 1;
    Y_strides.back() = 1;
    for (int i = ndim - 1; i > 0; --i) {
      X_buff[i - 1] = X_buff[i] * X_dims[i];
      Y_strides[i - 1] = Y_strides[i] * Y_dims[i];
    }
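    // X_buff holds X's contiguous (row-major) strides; permuting them by
    // axes_ yields the strides of the transposed view of X.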
    for (int i = 0; i < ndim; ++i) {
      X_strides[i] = X_buff[axes_[i]];
    }
    CUDNN_ENFORCE(cudnnSetTensorNdDescriptor(
        X_desc_, data_type, ndim, dims.data(), X_strides.data()));
    CUDNN_ENFORCE(cudnnSetTensorNdDescriptor(
        Y_desc_, data_type, ndim, dims.data(), Y_strides.data()));
  }

  CuDNNWrapper cudnn_wrapper_;
  cudnnTensorDescriptor_t X_desc_;
  cudnnTensorDescriptor_t Y_desc_;
  cudnnDataType_t cached_dtype_ = cudnnTypeWrapper<float>::type;
  std::vector<std::int64_t> cached_X_dims_;
  std::vector<std::int32_t> axes_;
};

#if !CUDNN_VERSION_MIN(6, 0, 0)

// CuDNN 5.1 does not have int support yet, so int tensors always go through
// the generic CUDA transpose kernel.
template <>
bool CuDNNTransposeOp::DoRunWithType<int>() {
  const auto& X = Input(0);
  const int ndim = X.dim();
  if (axes_.empty()) {
    axes_.resize(ndim);
    std::iota(axes_.rbegin(), axes_.rend(), 0);
  } else {
    CAFFE_ENFORCE_EQ(axes_.size(), ndim);
  }
  std::vector<std::int64_t> X_dims = X.sizes().vec();
  std::vector<std::int64_t> Y_dims(ndim);
  for (int i = 0; i < ndim; ++i) {
    Y_dims[i] = X_dims[axes_[i]];
  }
  auto* Y = Output(0, Y_dims, at::dtype<int>());
  const int* X_data = X.template data<int>();
  int* Y_data = Y->template mutable_data<int>();
  math::Transpose<std::int64_t, int, CUDAContext>(
      ndim, X_dims.data(), axes_.data(), X_data, Y_data, &context_);
  return true;
}

#endif // !CUDNN_VERSION_MIN(6, 0, 0)

} // namespace

REGISTER_CUDNN_OPERATOR(Transpose, CuDNNTransposeOp);

} // namespace caffe2