#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
#include <ATen/core/Tensor.h>
#include <ATen/ceil_div.h>
#include <ATen/Dispatch.h>
#include <ATen/cuda/Atomic.cuh>
#include <ATen/cuda/detail/IndexUtils.cuh>
#include <ATen/cuda/CUDAContext.h>
#include <ATen/TensorUtils.h>
#include <ATen/Utils.h>
#include <c10/util/Exception.h>
#ifndef AT_PER_OPERATOR_HEADERS
#include <ATen/Functions.h>
#include <ATen/NativeFunctions.h>
#else
#include <ATen/ops/empty_like.h>
#include <ATen/ops/replication_pad1d_native.h>
#include <ATen/ops/replication_pad1d_backward_native.h>
#include <ATen/ops/replication_pad2d_native.h>
#include <ATen/ops/replication_pad2d_backward_native.h>
#include <ATen/ops/replication_pad3d_native.h>
#include <ATen/ops/replication_pad3d_backward_native.h>
#endif
#include <algorithm>
#include <cfloat>
#include <cmath>
namespace at {
namespace native {
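// Simple host/device min/max helpers used below to clamp padded output
// coordinates back into the valid input range.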
__host__ __device__ __forceinline__ int imin(int a, int b) {
return a > b ? b : a;
}
__host__ __device__ __forceinline__ int imax(int a, int b) {
return a > b ? a : b;
}
namespace {
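// 1D forward kernel: one thread per output element along the padded width.
// The source index is the output index clamped to [padL, inputW + padL - 1]
// and then shifted back by padL, so positions in the left/right pad regions
// replicate the first/last input element.
//
// Worked example (assuming padL = 2, padR = 1, inputW = 4, so outputW = 7):
//   outputPointX = 0 -> imin(imax(2, 0), 5) - 2 = 0   (left pad copies input[0])
//   outputPointX = 3 -> imin(imax(2, 3), 5) - 2 = 1   (interior copy)
//   outputPointX = 6 -> imin(imax(2, 6), 5) - 2 = 3   (right pad copies input[3])
// blockIdx.y/z address (plane, batch); y_shift/z_shift let the host launcher
// tile these dimensions when they exceed the CUDA grid limit of 65535.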
template <typename scalar_t>
__global__ void replication_pad_forward_kernel1d(
PackedTensorAccessor64<scalar_t, 3> input,
PackedTensorAccessor64<scalar_t, 3> output,
const int padL,
const int y_shift,
const int z_shift) {
const int outputPointId = threadIdx.x + blockIdx.x * blockDim.x;
const int plane = blockIdx.y + y_shift;
const int batch = blockIdx.z + z_shift;
if (outputPointId >= output.size(2)) {
return;
}
const int outputPointX = outputPointId % output.size(2);
const int iStartX = imax(0, -padL);
const int oStartX = imax(0, padL);
const int inputPointX = imin(imax(padL, outputPointX), input.size(2) + padL - 1) - oStartX + iStartX;
scalar_t valueToCopy = input[batch][plane][inputPointX];
output[batch][plane][outputPointX] = valueToCopy;
}
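// 1D backward kernel: the mirror of the forward pass. Each thread reads one
// gradOutput element, computes the input position its forward value was
// replicated from, and accumulates into gradInput with an atomic add, since
// several padded output positions can map to the same edge element. The
// unordered atomic accumulation is what makes this backward pass
// nondeterministic for floating-point types.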
template <typename scalar_t>
__global__ void replication_pad_backward_kernel(
PackedTensorAccessor64<scalar_t, 3> gradInput,
PackedTensorAccessor64<scalar_t, 3> gradOutput,
const int padL,
const int y_shift,
const int z_shift) {
const int outputPointId = threadIdx.x + blockIdx.x * blockDim.x;
const int plane = blockIdx.y + y_shift;
const int batch = blockIdx.z + z_shift;
if (outputPointId >= gradOutput.size(2)) {
return;
}
const int outputPointX = outputPointId % gradOutput.size(2);
const int iStartX = imax(0, -padL);
const int oStartX = imax(0, padL);
const int inputPointX = imin(imax(padL, outputPointX), gradInput.size(2) + padL - 1) - oStartX + iStartX;
scalar_t valueToCopy = gradOutput[batch][plane][outputPointX];
gpuAtomicAddNoReturn(&gradInput[batch][plane][inputPointX], valueToCopy);
}
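// 2D forward kernel: outputPointId enumerates the H*W elements of one output
// plane; it is split into (outputPointY, outputPointX) and each coordinate is
// clamped independently, exactly as in the 1D case. Note that only the
// leading pads (padT, padL) are passed in; the trailing pads are implied by
// the output sizes.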
template <typename scalar_t>
__global__ void replication_pad_forward_kernel2d(
PackedTensorAccessor64<scalar_t, 4> input,
PackedTensorAccessor64<scalar_t, 4> output,
const int padT,
const int padL,
const int y_shift,
const int z_shift) {
const int outputPointId = threadIdx.x + blockIdx.x * blockDim.x;
const int plane = blockIdx.y + y_shift;
const int batch = blockIdx.z + z_shift;
if (outputPointId >= output.size(2) * output.size(3)) {
return;
}
const int outputPointX = outputPointId % output.size(3);
const int outputPointY = outputPointId / output.size(3);
const int iStartX = imax(0, -padL);
const int iStartY = imax(0, -padT);
const int oStartX = imax(0, padL);
const int oStartY = imax(0, padT);
const int inputPointX = imin(imax(padL, outputPointX), input.size(3) + padL - 1) - oStartX + iStartX;
const int inputPointY = imin(imax(padT, outputPointY), input.size(2) + padT - 1) - oStartY + iStartY;
scalar_t valueToCopy = input[batch][plane][inputPointY][inputPointX];
output[batch][plane][outputPointY][outputPointX] = valueToCopy;
}
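// 2D backward kernel: scatters each gradOutput element into the clamped
// (inputPointY, inputPointX) location of gradInput via atomic add.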
template <typename scalar_t>
__global__ void replication_pad_backward_kernel(
PackedTensorAccessor64<scalar_t, 4> gradInput,
PackedTensorAccessor64<scalar_t, 4> gradOutput,
const int padT,
const int padL,
const int y_shift,
const int z_shift) {
const int outputPointId = threadIdx.x + blockIdx.x * blockDim.x;
const int plane = blockIdx.y + y_shift;
const int batch = blockIdx.z + z_shift;
if (outputPointId >= gradOutput.size(2) * gradOutput.size(3)) {
return;
}
const int outputPointX = outputPointId % gradOutput.size(3);
const int outputPointY = outputPointId / gradOutput.size(3);
const int iStartX = imax(0, -padL);
const int iStartY = imax(0, -padT);
const int oStartX = imax(0, padL);
const int oStartY = imax(0, padT);
const int inputPointX = imin(imax(padL, outputPointX), gradInput.size(3) + padL - 1) - oStartX + iStartX;
const int inputPointY = imin(imax(padT, outputPointY), gradInput.size(2) + padT - 1) - oStartY + iStartY;
scalar_t valueToCopy = gradOutput[batch][plane][outputPointY][outputPointX];
gpuAtomicAddNoReturn(&gradInput[batch][plane][inputPointY][inputPointX], valueToCopy);
}
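// 3D forward kernel: the same scheme extended to (D, H, W). outputPointId is
// decomposed into (outputPointZ, outputPointY, outputPointX) and each
// coordinate is clamped into its input range before the copy.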
template <typename scalar_t>
__global__ void replication_pad_forward_kernel3d(
PackedTensorAccessor64<scalar_t, 5> input,
PackedTensorAccessor64<scalar_t, 5> output,
const int pfront,
const int ptop,
const int pleft,
const int y_shift,
const int z_shift) {
const int outputPointId = threadIdx.x + blockIdx.x * blockDim.x;
const int plane = blockIdx.y + y_shift;
const int batch = blockIdx.z + z_shift;
if (outputPointId >= (output.size(2) * output.size(3) *
output.size(4))) {
return;
}
const int outputPointX = outputPointId % output.size(4);
const int outputPointY = (outputPointId / output.size(4)) % output.size(3);
const int outputPointZ = outputPointId / (output.size(3) * output.size(4));
const int iStartX = imax(0, -pleft);
const int iStartY = imax(0, -ptop);
const int iStartZ = imax(0, -pfront);
const int oStartX = imax(0, pleft);
const int oStartY = imax(0, ptop);
const int oStartZ = imax(0, pfront);
const int inputPointX = imin(imax(pleft, outputPointX),
input.size(4) + pleft - 1) - oStartX + iStartX;
const int inputPointY = imin(imax(ptop, outputPointY),
input.size(3) + ptop - 1) - oStartY + iStartY;
const int inputPointZ = imin(imax(pfront, outputPointZ),
input.size(2) + pfront - 1) - oStartZ + iStartZ;
scalar_t valueToCopy =
input[batch][plane][inputPointZ][inputPointY][inputPointX];
output[batch][plane][outputPointZ][outputPointY][outputPointX] = valueToCopy;
}
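// 3D backward kernel: atomic scatter of gradOutput into the clamped
// (inputPointZ, inputPointY, inputPointX) location of gradInput.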
template <typename scalar_t>
__global__ void replication_pad_backward_kernel(
PackedTensorAccessor64<scalar_t, 5> gradInput,
PackedTensorAccessor64<scalar_t, 5> gradOutput,
const int pfront,
const int ptop,
const int pleft,
const int y_shift,
const int z_shift) {
const int outputPointId = threadIdx.x + blockIdx.x * blockDim.x;
const int plane = blockIdx.y + y_shift;
const int batch = blockIdx.z + z_shift;
if (outputPointId >= (gradOutput.size(2) * gradOutput.size(3) *
gradOutput.size(4))) {
return;
}
const int outputPointX = outputPointId % gradOutput.size(4);
const int outputPointY = (outputPointId / gradOutput.size(4)) %
gradOutput.size(3);
const int outputPointZ = outputPointId / (gradOutput.size(3) *
gradOutput.size(4));
const int iStartX = imax(0, -pleft);
const int iStartY = imax(0, -ptop);
const int iStartZ = imax(0, -pfront);
const int oStartX = imax(0, pleft);
const int oStartY = imax(0, ptop);
const int oStartZ = imax(0, pfront);
const int inputPointX = imin(imax(pleft, outputPointX),
gradInput.size(4) + pleft - 1) - oStartX + iStartX;
const int inputPointY = imin(imax(ptop, outputPointY),
gradInput.size(3) + ptop - 1) - oStartY + iStartY;
const int inputPointZ = imin(imax(pfront, outputPointZ),
gradInput.size(2) + pfront - 1) - oStartZ + iStartZ;
scalar_t valueToCopy =
gradOutput[batch][plane][outputPointZ][outputPointY][outputPointX];
gpuAtomicAddNoReturn(&gradInput[batch][plane][inputPointZ][inputPointY][inputPointX],
valueToCopy);
}
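// Host-side launcher for the 2D backward kernel. It validates shapes, zeroes
// gradInput, and launches one thread per gradOutput element with 256 threads
// per block. Because CUDA caps gridDim.y and gridDim.z at 65535, the plane
// and batch dimensions are processed in chunks of at most 65535 each, with
// block_y / block_z passed to the kernel as the y_shift / z_shift offsets.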
void replication_pad2d_backward_out_cuda_template(
Tensor& gradInput,
const Tensor& gradOutput,
const Tensor& input,
IntArrayRef paddingSize)
{
TORCH_CHECK(at::cuda::detail::canUse32BitIndexMath(input),
"input tensor must fit into 32-bit index math");
TORCH_CHECK(at::cuda::detail::canUse32BitIndexMath(gradOutput),
"output gradient tensor must fit into 32-bit index math");
TORCH_CHECK(paddingSize.size() == 4, "padding size is expected to be 4");
const auto padL = paddingSize[0];
const auto padR = paddingSize[1];
const auto padT = paddingSize[2];
const auto padB = paddingSize[3];
int planeDim = 0;
int dimh = 1;
int dimw = 2;
int numInputDims = input.dim();
if (numInputDims == 4) {
planeDim++;
dimh++;
dimw++;
}
const auto iheight = input.size(dimh);
const auto iwidth = input.size(dimw);
const auto oheight = iheight + padT + padB;
const auto owidth = iwidth + padL + padR;
TORCH_CHECK(owidth == gradOutput.size(dimw),
"gradOutput width unexpected. Expected: ", owidth, ", Got: ",
gradOutput.size(dimw));
TORCH_CHECK(oheight == gradOutput.size(dimh),
"gradOutput height unexpected. Expected: ", oheight, ", Got: ",
gradOutput.size(dimh));
gradInput.resize_as_(input);
if (gradInput.numel() == 0) {
return;
}
gradInput.zero_();
AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND1(kHalf,
input.scalar_type(), "replication_pad2d_backward_cuda", [&] {
auto gradInput_ = gradInput;
auto gradOutput_ = gradOutput;
if (numInputDims == 3) {
gradInput_ = gradInput.unsqueeze(0);
gradOutput_ = gradOutput.unsqueeze(0);
}
auto devGradInput = gradInput_.packed_accessor64<scalar_t, 4>();
auto devGradOutput = gradOutput_.packed_accessor64<scalar_t, 4>();
int64_t outputPlaneSize = devGradOutput.size(2) * devGradOutput.size(3);
int64_t size1 = devGradOutput.size(1);
int64_t size0 = devGradOutput.size(0);
for (int64_t block_y = 0; block_y < size1; block_y += 65535) {
int64_t block_y_size = std::min(size1 - block_y, static_cast<int64_t>(65535));
for (int64_t block_z = 0; block_z < size0; block_z += 65535) {
int64_t block_z_size = std::min(size0 - block_z, static_cast<int64_t>(65535));
dim3 gridSize(ceil_div(outputPlaneSize, static_cast<int64_t>(256)), block_y_size, block_z_size);
dim3 blockSize(outputPlaneSize > 256 ? 256 : outputPlaneSize);
replication_pad_backward_kernel <<<gridSize, blockSize, 0, at::cuda::getCurrentCUDAStream()>>>(
devGradInput, devGradOutput, padT, padL, block_y, block_z);
C10_CUDA_KERNEL_LAUNCH_CHECK();
}
}
}
);
}
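// Shape validation helpers for the 3D variants. shapeCheck3d validates the
// input alone; shapeAndGradOutputCheck3d additionally checks that gradOutput
// matches the padded output sizes implied by the padding arguments.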
static inline void shapeCheck3d(
const Tensor& input,
int pleft, int pright,
int ptop, int pbottom,
int pfront, int pback) {
TORCH_CHECK(at::cuda::detail::canUse32BitIndexMath(input),
"input tensor must fit into 32-bit index math");
int numInputDims = input.dim();
bool valid_dims = input.size(1) != 0 && input.size(2) != 0 && input.size(3) != 0;
TORCH_CHECK(
(numInputDims == 4 && input.size(0) != 0 && valid_dims) ||
(numInputDims == 5 && valid_dims && input.size(4) != 0),
"Expected 4D or 5D (batch mode) tensor with possibly 0 batch size and other non-zero dimensions for input, but got: ",
input.sizes());
int planeDim = 0;
int dimd = 1;
int dimh = 2;
int dimw = 3;
if (numInputDims == 5) {
planeDim++;
dimd++;
dimh++;
dimw++;
}
const int idepth = input.size(dimd);
const int iheight = input.size(dimh);
const int iwidth = input.size(dimw);
const int odepth = idepth + pfront + pback;
const int oheight = iheight + ptop + pbottom;
const int owidth = iwidth + pleft + pright;
TORCH_CHECK(owidth >= 1 && oheight >= 1 && odepth >= 1,
"input (D: ", idepth, ", H: ", iheight, ", W: ", iwidth,
") is too small."
" Calculated output D: ", odepth, ", H: ", oheight, ", W: ", owidth);
}
static inline void shapeAndGradOutputCheck3d(
const Tensor& input,
const Tensor& gradOutput,
int pleft, int pright,
int ptop, int pbottom,
int pfront, int pback) {
TORCH_CHECK(at::cuda::detail::canUse32BitIndexMath(input),
"input tensor must fit into 32-bit index math");
int numInputDims = input.dim();
bool valid_dims = input.size(1) != 0 && input.size(2) != 0 && input.size(3) != 0;
TORCH_CHECK(
(numInputDims == 4 && valid_dims) ||
(numInputDims == 5 && valid_dims && input.size(4) != 0),
"Expected 4D or 5D (batch mode) tensor with possibly 0 batch size and other non-zero dimensions for input, but got: ",
input.sizes());
int planeDim = 0;
int dimd = 1;
int dimh = 2;
int dimw = 3;
if (numInputDims == 5) {
planeDim++;
dimd++;
dimh++;
dimw++;
}
int numPlanes = input.size(planeDim);
int idepth = input.size(dimd);
int iheight = input.size(dimh);
int iwidth = input.size(dimw);
int odepth = idepth + pfront + pback;
int oheight = iheight + ptop + pbottom;
int owidth = iwidth + pleft + pright;
TORCH_CHECK(owidth >= 1 && oheight >= 1 && odepth >= 1,
"input (D: ", idepth, ", H: ", iheight, ", W: ", iwidth,
") is too small."
" Calculated output D: ", odepth, ", H: ", oheight, ", W: ", owidth);
TORCH_CHECK(at::cuda::detail::canUse32BitIndexMath(gradOutput),
"output gradient tensor must fit into 32-bit index math");
TORCH_CHECK(numPlanes == gradOutput.size(planeDim),
"gradOutput number of planes unexpected. Expected: ", numPlanes, ", Got: ",
gradOutput.size(planeDim));
TORCH_CHECK(owidth == gradOutput.size(dimw),
"gradOutput width unexpected. Expected: ", owidth, ", Got: ",
gradOutput.size(dimw));
TORCH_CHECK(oheight == gradOutput.size(dimh),
"gradOutput height unexpected. Expected: ", oheight, ", Got: ",
gradOutput.size(dimh));
TORCH_CHECK(odepth == gradOutput.size(dimd),
"gradOutput depth unexpected. Expected: ", odepth, ", Got: ",
gradOutput.size(dimd));
}
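// Same chunked launch strategy as replication_pad2d_backward_out_cuda_template
// above, extended to three spatial dimensions.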
void replication_pad3d_backward_out_cuda_template(
Tensor& gradInput,
const Tensor& gradOutput,
const Tensor& input,
IntArrayRef paddingSize)
{
TORCH_CHECK(paddingSize.size() == 6, "padding size is expected to be 6");
const auto pleft = paddingSize[0];
const auto pright = paddingSize[1];
const auto ptop = paddingSize[2];
const auto pbottom = paddingSize[3];
const auto pfront = paddingSize[4];
const auto pback = paddingSize[5];
shapeAndGradOutputCheck3d(input, gradOutput, pleft, pright, ptop,
pbottom, pfront, pback);
int planeDim = 0;
int dimd = 1;
int dimh = 2;
int dimw = 3;
int numInputDims = input.dim();
if (numInputDims == 5) {
planeDim++;
dimd++;
dimh++;
dimw++;
}
gradInput.resize_as_(input);
if (gradInput.numel() == 0) {
return;
}
gradInput.zero_();
AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND1(kHalf,
input.scalar_type(), "replication_pad3d_backward_cuda", [&] {
auto gradInput_ = gradInput;
auto gradOutput_ = gradOutput;
if (numInputDims == 4) {
gradInput_ = gradInput.unsqueeze(0);
gradOutput_ = gradOutput.unsqueeze(0);
}
auto devGradInput = gradInput_.packed_accessor64<scalar_t, 5>();
auto devGradOutput = gradOutput_.packed_accessor64<scalar_t, 5>();
const int64_t outputPlaneSize = devGradOutput.size(2) * devGradOutput.size(3) * devGradOutput.size(4);
const int64_t size1 = devGradOutput.size(1);
const int64_t size0 = devGradOutput.size(0);
for (int64_t block_y = 0; block_y < size1; block_y += 65535) {
int64_t block_y_size = std::min(size1 - block_y, static_cast<int64_t>(65535));
for (int64_t block_z = 0; block_z < size0; block_z += 65535) {
int64_t block_z_size = std::min(size0 - block_z, static_cast<int64_t>(65535));
dim3 gridSize(ceil_div(outputPlaneSize, static_cast<int64_t>(256)), block_y_size, block_z_size);
dim3 blockSize(outputPlaneSize > 256 ? 256 : outputPlaneSize);
replication_pad_backward_kernel <<<gridSize, blockSize, 0, at::cuda::getCurrentCUDAStream()>>>(
devGradInput, devGradOutput, pfront, ptop, pleft, block_y, block_z);
C10_CUDA_KERNEL_LAUNCH_CHECK();
}
}
}
);
}
} // namespace
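// The TORCH_IMPL_FUNC bodies below are structured-kernel implementations:
// shape checking and output allocation are handled by the corresponding meta
// functions, so these only set up accessors and launch the kernels. They are
// reached, for example, from torch.nn.functional.pad(..., mode='replicate')
// on CUDA tensors.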
TORCH_IMPL_FUNC(replication_pad1d_out_cuda) (
const Tensor& input, IntArrayRef paddingSize, const Tensor& output
) {
TORCH_CHECK(at::cuda::detail::canUse32BitIndexMath(input),
"input tensor must fit into 32-bit index math");
int64_t padL = paddingSize[0];
int64_t padR = paddingSize[1];
constexpr int64_t planeDim = -2;
constexpr int64_t dimw = -1;
int numInputDims = input.ndimension();
int64_t numPlanes = input.size(planeDim);
int64_t inputW = input.size(dimw);
int64_t outputW = output.size(dimw);
if (input.numel() == 0) {
return;
}
AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND1(kHalf,
input.scalar_type(), "replication_pad1d_cuda", [&] {
at::Tensor input_ = input;
at::Tensor output_ = output;
if (numInputDims == 2) {
input_ = input.unsqueeze(0);
output_ = output.unsqueeze(0);
}
auto devInput = input_.packed_accessor64<scalar_t, 3>();
auto devOutput = output_.packed_accessor64<scalar_t, 3>();
int64_t outputPlaneSize = devOutput.size(2);
int64_t size1 = devOutput.size(1);
int64_t size0 = devOutput.size(0);
for (int64_t block_y = 0; block_y < size1; block_y += 65535) {
int64_t block_y_size = std::min(size1 - block_y, static_cast<int64_t>(65535));
for (int64_t block_z = 0; block_z < size0; block_z += 65535) {
int64_t block_z_size = std::min(size0 - block_z, static_cast<int64_t>(65535));
dim3 gridSize(ceil_div(outputPlaneSize, static_cast<int64_t>(256)), block_y_size, block_z_size);
dim3 blockSize(outputPlaneSize > 256 ? 256 : outputPlaneSize);
replication_pad_forward_kernel1d <<<gridSize, blockSize, 0,
at::cuda::getCurrentCUDAStream()>>>(devInput, devOutput, padL, block_y, block_z);
C10_CUDA_KERNEL_LAUNCH_CHECK();
}
}
}
);
}
TORCH_IMPL_FUNC(replication_pad1d_backward_out_cuda) (
const Tensor& gradOutput,
const Tensor& input,
IntArrayRef paddingSize,
const Tensor& gradInput
) {
// See Note [Writing Nondeterministic Operations]
// Nondeterministic because of atomicAdd usage
globalContext().alertNotDeterministic("replication_pad1d_backward_cuda");
TORCH_CHECK(at::cuda::detail::canUse32BitIndexMath(input),
"input tensor must fit into 32-bit index math");
TORCH_CHECK(at::cuda::detail::canUse32BitIndexMath(gradOutput),
"output gradient tensor must fit into 32-bit index math");
const int padL = paddingSize[0];
int planeDim = 0;
int dimw = 1;
int numInputDims = input.ndimension();
if (numInputDims == 3) {
planeDim++;
dimw++;
}
int iwidth = input.size(dimw);
if (gradInput.numel() == 0) {
return;
}
gradInput.zero_();
AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND1(kHalf,
input.scalar_type(), "replication_pad1d_backward_cuda", [&] {
auto gradInput_ = gradInput;
auto gradOutput_ = gradOutput;
if (numInputDims == 2) {
gradInput_ = gradInput.unsqueeze(0);
gradOutput_ = gradOutput.unsqueeze(0);
}
auto devGradInput = gradInput_.packed_accessor64<scalar_t, 3>();
auto devGradOutput = gradOutput_.packed_accessor64<scalar_t, 3>();
int64_t outputPlaneSize = devGradOutput.size(2);
int64_t size1 = devGradOutput.size(1);
int64_t size0 = devGradOutput.size(0);
for (int64_t block_y = 0; block_y < size1; block_y += 65535) {
int64_t block_y_size = std::min(size1 - block_y, static_cast<int64_t>(65535));
for (int64_t block_z = 0; block_z < size0; block_z += 65535) {
int64_t block_z_size = std::min(size0 - block_z, static_cast<int64_t>(65535));
dim3 gridSize(ceil_div(outputPlaneSize, static_cast<int64_t>(256)), block_y_size, block_z_size);
dim3 blockSize(outputPlaneSize > 256 ? 256 : outputPlaneSize);
replication_pad_backward_kernel <<<gridSize, blockSize, 0, at::cuda::getCurrentCUDAStream()>>>(
devGradInput, devGradOutput, padL, block_y, block_z);
C10_CUDA_KERNEL_LAUNCH_CHECK();
}
}
});
}
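// 2D forward entry point. Only padL and padT are forwarded to the kernel:
// padR and padB are already reflected in the output sizes computed by the
// meta function, so the kernel never needs them explicitly.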
TORCH_IMPL_FUNC(replication_pad2d_out_cuda) (
const Tensor& input, IntArrayRef paddingSize, const Tensor& output
) {
TORCH_CHECK(at::cuda::detail::canUse32BitIndexMath(input),
"input tensor must fit into 32-bit index math");
if (input.numel() == 0) {
return;
}
const auto padL = paddingSize[0];
// const auto padR = paddingSize[1]; // This padding is ignored here
const auto padT = paddingSize[2];
// const auto padB = paddingSize[3]; // This padding is ignored here
AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND1(kHalf,
input.scalar_type(), "replication_pad2d_cuda", [&] {
at::Tensor input_ = input;
at::Tensor output_ = output;
if (input.dim() == 3) {
input_ = input.unsqueeze(0);
output_ = output.unsqueeze(0);
}
auto devInput = input_.packed_accessor64<scalar_t, 4>();
auto devOutput = output_.packed_accessor64<scalar_t, 4>();
int64_t outputPlaneSize = devOutput.size(2) * devOutput.size(3);
int64_t size1 = devOutput.size(1);
int64_t size0 = devOutput.size(0);
for (int64_t block_y = 0; block_y < size1; block_y += 65535) {
int64_t block_y_size = std::min(size1 - block_y, static_cast<int64_t>(65535));
for (int64_t block_z = 0; block_z < size0; block_z += 65535) {
int64_t block_z_size = std::min(size0 - block_z, static_cast<int64_t>(65535));
dim3 gridSize(ceil_div(outputPlaneSize, static_cast<int64_t>(256)), block_y_size, block_z_size);
dim3 blockSize(outputPlaneSize > 256 ? 256 : outputPlaneSize);
replication_pad_forward_kernel2d <<<gridSize, blockSize, 0, at::cuda::getCurrentCUDAStream()>>>(
devInput, devOutput, padT, padL, block_y, block_z);
C10_CUDA_KERNEL_LAUNCH_CHECK();
}
}
}
);
}
Tensor& replication_pad2d_backward_out_cuda(const Tensor& gradOutput,
const Tensor& input,
IntArrayRef paddingSize,
Tensor& gradInput)
{
// See Note [Writing Nondeterministic Operations]
// Nondeterministic because of atomicAdd usage
globalContext().alertNotDeterministic("replication_pad2d_backward_out_cuda");
replication_pad2d_backward_out_cuda_template(
gradInput, gradOutput, input, paddingSize);
return gradInput;
}
Tensor replication_pad2d_backward_cuda(
const Tensor& gradOutput,
const Tensor& input,
IntArrayRef paddingSize)
{
// See Note [Writing Nondeterministic Operations]
// Nondeterministic because of atomicAdd usage
globalContext().alertNotDeterministic("replication_pad2d_backward_cuda");
auto gradInput = at::empty_like(input, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
replication_pad2d_backward_out_cuda_template(
gradInput, gradOutput, input, paddingSize);
return gradInput;
}
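// 3D forward entry point, analogous to the 2D case: only the leading pad of
// each spatial dimension (pleft, ptop, pfront) is passed to the kernel.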
TORCH_IMPL_FUNC(replication_pad3d_out_cuda) (
const Tensor& input, IntArrayRef paddingSize, const Tensor& output
) {
const auto pleft = paddingSize[0];
// const auto pright = paddingSize[1]; // Ignored here
const auto ptop = paddingSize[2];
// const auto pbottom = paddingSize[3]; // Ignored here
const auto pfront = paddingSize[4];
// const auto pback = paddingSize[5]; // Ignored here
int planeDim = 0;
int dimd = 1;
int dimh = 2;
int dimw = 3;
int numInputDims = input.dim();
if (numInputDims == 5) {
planeDim++;
dimd++;
dimh++;
dimw++;
}
const auto numPlanes = input.size(planeDim);
const auto inputD = input.size(dimd);
const auto inputH = input.size(dimh);
const auto inputW = input.size(dimw);
const auto outputD = output.size(dimd);
const auto outputH = output.size(dimh);
const auto outputW = output.size(dimw);
if (input.numel() == 0) {
return;
}
AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND1(kHalf,
input.scalar_type(), "replication_pad3d_cuda", [&] {
at::Tensor input_ = input;
at::Tensor output_ = output;
if (numInputDims == 4) {
input_ = input.unsqueeze(0);
output_ = output.unsqueeze(0);
}
auto devInput = input_.packed_accessor64<scalar_t, 5>();
auto devOutput = output_.packed_accessor64<scalar_t, 5>();
const int64_t outputPlaneSize = devOutput.size(2) * devOutput.size(3) * devOutput.size(4);
const int64_t size1 = devOutput.size(1);
const int64_t size0 = devOutput.size(0);
for (int64_t block_y = 0; block_y < size1; block_y += 65535) {
int64_t block_y_size = std::min(size1 - block_y, static_cast<int64_t>(65535));
for (int64_t block_z = 0; block_z < size0; block_z += 65535) {
int64_t block_z_size = std::min(size0 - block_z, static_cast<int64_t>(65535));
dim3 gridSize(ceil_div(outputPlaneSize, static_cast<int64_t>(256)), block_y_size, block_z_size);
dim3 blockSize(outputPlaneSize > 256 ? 256 : outputPlaneSize);
replication_pad_forward_kernel3d <<<gridSize, blockSize, 0, at::cuda::getCurrentCUDAStream()>>>(
devInput, devOutput, pfront, ptop, pleft, block_y, block_z);
C10_CUDA_KERNEL_LAUNCH_CHECK();
}
}
}
);
}
Tensor& replication_pad3d_backward_out_cuda(const Tensor& gradOutput,
const Tensor& input,
IntArrayRef paddingSize,
Tensor& gradInput)
{
// See Note [Writing Nondeterministic Operations]
// Nondeterministic because of atomicAdd usage
globalContext().alertNotDeterministic("replication_pad3d_backward_out_cuda");
replication_pad3d_backward_out_cuda_template(
gradInput, gradOutput, input, paddingSize);
return gradInput;
}
Tensor replication_pad3d_backward_cuda(
const Tensor& gradOutput,
const Tensor& input,
IntArrayRef paddingSize)
{
// See Note [Writing Nondeterministic Operations]
// Nondeterministic because of atomicAdd usage
globalContext().alertNotDeterministic("replication_pad3d_backward_cuda");
auto gradInput = at::empty_like(input, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
replication_pad3d_backward_out_cuda_template(
gradInput, gradOutput, input, paddingSize);
return gradInput;
}
} // namespace native
} // namespace at