#include "caffe2/operators/channel_shuffle_op.h"
#include <array>
#include "caffe2/core/context_gpu.h"
#include "caffe2/utils/math.h"
namespace caffe2 {
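
// ChannelShuffle splits the C channels into G groups of K = C / G channels,
// then transposes the (G, K) pair to (K, G). E.g. with G = 2, K = 3 the
// input channels [0, 1, 2, 3, 4, 5] come out as [0, 3, 1, 4, 2, 5].
//
// NCHW kernel: each block copies one CAFFE_CUDA_NUM_THREADS-wide spatial
// slab of one output channel. Output channel blockIdx.z = k * G + g reads
// from input channel g * K + k, hence g = blockIdx.z % G and
// k = blockIdx.z / G below. kNFirst selects whether the sample index lives
// in blockIdx.x or blockIdx.y.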
template <typename T, bool kNFirst>
__global__ void ChannelShuffleNCHWKernel(
    const int G,
    const int K,
    const int HxW,
    const T* X,
    T* Y) {
  const int C = G * K;
  const int n = kNFirst ? blockIdx.x : blockIdx.y;
  const int s = kNFirst ? blockIdx.y : blockIdx.x;
  const int g = blockIdx.z % G;
  const int k = blockIdx.z / G;
  const int offset = s * CAFFE_CUDA_NUM_THREADS + threadIdx.x;
  if (offset < HxW) {
#if __CUDA_ARCH__ >= 350
    Y[(n * C + blockIdx.z) * HxW + offset] =
        __ldg(X + (n * C + g * K + k) * HxW + offset);
#else
    Y[(n * C + blockIdx.z) * HxW + offset] =
        X[(n * C + g * K + k) * HxW + offset];
#endif
  }
}
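
// NHWC kernel: one block per (sample, spatial position) pair. The C
// contiguous channel values at that position are staged in shared memory,
// then each output channel i = k * G + g is written from the staged input
// channel g * K + k. kSharedSize must be at least C; the caller picks the
// smallest template instance that fits.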
template <typename T, int kSharedSize>
__global__ void
ChannelShuffleNHWCKernel(const int G, const int K, const T* X, T* Y) {
  __shared__ T sdata[kSharedSize];
  const int C = G * K;
  const int offset = blockIdx.x * C;
  for (int i = threadIdx.x; i < C; i += blockDim.x) {
#if __CUDA_ARCH__ >= 350
    sdata[i] = __ldg(X + offset + i);
#else
    sdata[i] = X[offset + i];
#endif
  }
  __syncthreads();
  for (int i = threadIdx.x; i < C; i += blockDim.x) {
    const int g = i % G;
    const int k = i / G;
    Y[offset + i] = sdata[g * K + k];
  }
}
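
// Forward NCHW: the grid is (spatial slabs, N, C), so gridDim.z indexes the
// output channel. When N exceeds kCUDAGridDimMaxY it no longer fits in
// gridDim.y, so the N and slab axes are swapped and the kNFirst = true
// kernel instance is launched instead.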
template <>
bool ChannelShuffleOp<float, CUDAContext>::RunOnDeviceWithOrderNCHW() {
  const auto& X = Input(0);
  auto* Y = Output(0, X.sizes(), at::dtype<float>());
  const int N = X.dim32(0);
  const int C = X.dim32(1);
  const int G = this->group_;
  CAFFE_ENFORCE_EQ(C % G, 0);
  if (X.numel() == 0) {
    return true;
  }
  const int K = C / G;
  const int HxW = X.numel() / (N * C);
  const int S = (HxW + CAFFE_CUDA_NUM_THREADS - 1) / CAFFE_CUDA_NUM_THREADS;
  const float* X_data = X.data<float>();
  float* Y_data = Y->mutable_data<float>();
  if (N <= kCUDAGridDimMaxY) {
    const dim3 dim_grid(S, N, C);
    ChannelShuffleNCHWKernel<float, false>
        <<<dim_grid, CAFFE_CUDA_NUM_THREADS, 0, context_.cuda_stream()>>>(
            G, K, HxW, X_data, Y_data);
    C10_CUDA_KERNEL_LAUNCH_CHECK();
  } else {
    const dim3 dim_grid(N, S, C);
    ChannelShuffleNCHWKernel<float, true>
        <<<dim_grid, CAFFE_CUDA_NUM_THREADS, 0, context_.cuda_stream()>>>(
            G, K, HxW, X_data, Y_data);
    C10_CUDA_KERNEL_LAUNCH_CHECK();
  }
  return true;
}
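
// Forward NHWC: for C up to 512 the shared-memory kernel is used, picking
// the smallest buffer size that holds C floats. Larger C falls back to the
// generic batched transpose (N * HxW, G, K) -> (N * HxW, K, G), which is
// exactly the shuffle applied at every spatial position.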
template <>
bool ChannelShuffleOp<float, CUDAContext>::RunOnDeviceWithOrderNHWC() {
  const auto& X = Input(0);
  auto* Y = Output(0, X.sizes(), at::dtype<float>());
  const int ndim = X.dim();
  const int N = X.dim32(0);
  const int C = X.dim32(ndim - 1);
  const int G = this->group_;
  CAFFE_ENFORCE_EQ(C % G, 0);
  if (X.numel() == 0) {
    return true;
  }
  const int K = C / G;
  const int HxW = X.numel() / (N * C);
  const int outer_size = N * HxW;
  const float* X_data = X.data<float>();
  float* Y_data = Y->mutable_data<float>();
  if (C <= 32) {
    ChannelShuffleNHWCKernel<float, 32>
        <<<outer_size, CAFFE_CUDA_NUM_THREADS, 0, context_.cuda_stream()>>>(
            G, K, X_data, Y_data);
    C10_CUDA_KERNEL_LAUNCH_CHECK();
  } else if (C <= 128) {
    ChannelShuffleNHWCKernel<float, 128>
        <<<outer_size, CAFFE_CUDA_NUM_THREADS, 0, context_.cuda_stream()>>>(
            G, K, X_data, Y_data);
    C10_CUDA_KERNEL_LAUNCH_CHECK();
  } else if (C <= 512) {
    ChannelShuffleNHWCKernel<float, 512>
        <<<outer_size, CAFFE_CUDA_NUM_THREADS, 0, context_.cuda_stream()>>>(
            G, K, X_data, Y_data);
    C10_CUDA_KERNEL_LAUNCH_CHECK();
  } else {
    const std::array<std::int64_t, 3> dims = {N * HxW, G, K};
    const std::array<std::int32_t, 3> axes = {0, 2, 1};
    math::Transpose<std::int64_t, float, CUDAContext>(
        3, dims.data(), axes.data(), X_data, Y_data, &context_);
  }
  return true;
}
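
// The gradient of a channel shuffle is the inverse shuffle, which is itself
// a channel shuffle with G and K swapped, so the gradient ops launch the
// same kernels with the two arguments exchanged.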
template <>
bool ChannelShuffleGradientOp<float, CUDAContext>::RunOnDeviceWithOrderNCHW() {
  const auto& dY = Input(0);
  auto* dX = Output(0, dY.sizes(), at::dtype<float>());
  const int N = dY.dim32(0);
  const int C = dY.dim32(1);
  const int G = this->group_;
  CAFFE_ENFORCE_EQ(C % G, 0);
  if (dY.numel() == 0) {
    return true;
  }
  const int K = C / G;
  const int HxW = dY.numel() / (N * C);
  const int S = (HxW + CAFFE_CUDA_NUM_THREADS - 1) / CAFFE_CUDA_NUM_THREADS;
  const float* dY_data = dY.data<float>();
  float* dX_data = dX->mutable_data<float>();
  if (N <= kCUDAGridDimMaxY) {
    const dim3 dim_grid(S, N, C);
    ChannelShuffleNCHWKernel<float, false>
        <<<dim_grid, CAFFE_CUDA_NUM_THREADS, 0, context_.cuda_stream()>>>(
            K, G, HxW, dY_data, dX_data);
    C10_CUDA_KERNEL_LAUNCH_CHECK();
  } else {
    const dim3 dim_grid(N, S, C);
    ChannelShuffleNCHWKernel<float, true>
        <<<dim_grid, CAFFE_CUDA_NUM_THREADS, 0, context_.cuda_stream()>>>(
            K, G, HxW, dY_data, dX_data);
    C10_CUDA_KERNEL_LAUNCH_CHECK();
  }
  return true;
}
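
// Gradient NHWC: same dispatch as the forward pass, with G and K swapped in
// the kernel arguments and in the transpose dims of the large-C fallback.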
template <>
bool ChannelShuffleGradientOp<float, CUDAContext>::RunOnDeviceWithOrderNHWC() {
  const auto& dY = Input(0);
  auto* dX = Output(0, dY.sizes(), at::dtype<float>());
  const int ndim = dY.dim();
  const int N = dY.dim32(0);
  const int C = dY.dim32(ndim - 1);
  const int G = this->group_;
  CAFFE_ENFORCE_EQ(C % G, 0);
  if (dY.numel() == 0) {
    return true;
  }
  const int K = C / G;
  const int HxW = dY.numel() / (N * C);
  const int outer_size = N * HxW;
  const float* dY_data = dY.data<float>();
  float* dX_data = dX->mutable_data<float>();
  if (C <= 32) {
    ChannelShuffleNHWCKernel<float, 32>
        <<<outer_size, CAFFE_CUDA_NUM_THREADS, 0, context_.cuda_stream()>>>(
            K, G, dY_data, dX_data);
    C10_CUDA_KERNEL_LAUNCH_CHECK();
  } else if (C <= 128) {
    ChannelShuffleNHWCKernel<float, 128>
        <<<outer_size, CAFFE_CUDA_NUM_THREADS, 0, context_.cuda_stream()>>>(
            K, G, dY_data, dX_data);
    C10_CUDA_KERNEL_LAUNCH_CHECK();
  } else if (C <= 512) {
    ChannelShuffleNHWCKernel<float, 512>
        <<<outer_size, CAFFE_CUDA_NUM_THREADS, 0, context_.cuda_stream()>>>(
            K, G, dY_data, dX_data);
    C10_CUDA_KERNEL_LAUNCH_CHECK();
  } else {
    const std::array<std::int64_t, 3> dims = {N * HxW, K, G};
    const std::array<std::int32_t, 3> axes = {0, 2, 1};
    math::Transpose<std::int64_t, float, CUDAContext>(
        3, dims.data(), axes.data(), dY_data, dX_data, &context_);
  }
  return true;
}

REGISTER_CUDA_OPERATOR(ChannelShuffle, ChannelShuffleOp<float, CUDAContext>);
REGISTER_CUDA_OPERATOR(
    ChannelShuffleGradient,
    ChannelShuffleGradientOp<float, CUDAContext>);

} // namespace caffe2