#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
#include <ATen/core/Tensor.h>
#include <ATen/AccumulateType.h>
#include <ATen/Dispatch.h>
#include <ATen/TensorUtils.h>
#include <ATen/Utils.h>
#include <ATen/cuda/Atomic.cuh>
#include <ATen/cuda/CUDAContext.h>
#include <c10/util/Exception.h>
#ifndef AT_PER_OPERATOR_HEADERS
#include <ATen/Functions.h>
#include <ATen/NativeFunctions.h>
#else
#include <ATen/ops/adaptive_avg_pool3d_backward_native.h>
#include <ATen/ops/adaptive_avg_pool3d_native.h>
#include <ATen/ops/empty.h>
#include <ATen/ops/zeros_like.h>
#endif
#include <algorithm>
#include <cfloat>
#include <cmath>
namespace at {
namespace native {
namespace {
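// Adaptive pooling window bounds: for output index a out of b output elements
// over c input elements, the window is [start_index, end_index), i.e.
// floor(a * c / b) and ceil((a + 1) * c / b). start_index is expanded as
// (a / b) * c + ((a % b) * c) / b to keep the intermediate product small.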
__device__ inline int64_t start_index(int64_t a, int64_t b, int64_t c) {
return (a / b) * c + ((a % b) * c) / b;
}
__device__ inline int64_t end_index(int64_t a, int64_t b, int64_t c) {
return 1 + ((a + 1) * c - 1) / b;
}
// 5d tensor B x D x T x H x W
// All kernels view batch dim B and dim D as collapsed.
/*
* Description:
* This function adaptively average pools an input 5D tensor along
* dimensions 2, 3, and 4: 5D input, 5D output.
*
* gridDim.y blocks work together on a single 2D output plane specified by
* (blockIdx.x + offsetZ).
*/
template <typename scalar_t, typename accscalar_t>
__global__ void adaptiveaveragepool(
scalar_t *input, scalar_t *output,
int isizeT, int isizeH, int isizeW,
int osizeT, int osizeH, int osizeW,
int64_t istrideD,
int64_t istrideT, int64_t istrideH, int64_t istrideW,
int64_t offsetZ) {
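// Thread mapping: threadIdx.x strides across output W, blockIdx.y * blockDim.y
// + threadIdx.y strides across output H, and blockIdx.x + offsetZ selects one
// (collapsed B*D, T) output plane.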
// iterators on output pixels
int ot, oh, ow;
// compute offsets based on thread/block ID
int ostartH = blockIdx.y * blockDim.y + threadIdx.y;
int oendH = osizeH;
int ostepH = gridDim.y * blockDim.y;
int ostartW = threadIdx.x;
int oendW = osizeW;
int ostepW = blockDim.x;
// select output plane
int64_t o_plane = blockIdx.x + offsetZ;
ot = o_plane % osizeT; // output frame/time
int d = o_plane / osizeT; // slice/feature
// input frame/time range is fixed.
int istartT = start_index(ot, osizeT, isizeT);
int iendT = end_index(ot, osizeT, isizeT);
int kT = iendT - istartT;
// input offset by slice/feature and earliest relevant frame/time
scalar_t *input_dt = input + d*istrideD + istartT*istrideT;
// output offset by slice/feature and frame/time
scalar_t *output_dt = output + o_plane*osizeH*osizeW;
// For all output pixels...
for (oh = ostartH; oh < oendH; oh += ostepH) {
int istartH = start_index(oh, osizeH, isizeH);
int iendH = end_index(oh, osizeH, isizeH);
int kH = iendH - istartH;
for (ow = ostartW; ow < oendW; ow += ostepW) {
int istartW = start_index(ow, osizeW, isizeW);
int iendW = end_index(ow, osizeW, isizeW);
int kW = iendW - istartW;
// Compute the average pooling from corresponding input pixels
scalar_t *ptr_input = input_dt + istartH*istrideH + istartW*istrideW;
scalar_t *ptr_output = output_dt + oh*osizeW + ow;
accscalar_t sum = static_cast<accscalar_t>(0);
int it, ih, iw;
for (it = 0; it < kT; ++it) {
for (ih = 0; ih < kH; ++ih) {
for (iw = 0; iw < kW; ++iw) {
scalar_t val = ptr_input[ih*istrideH + iw*istrideW];
sum += static_cast<accscalar_t>(val);
}
}
ptr_input += istrideT; // next input frame
}
// Update output
const accscalar_t divide_factor = static_cast<accscalar_t>(kT * kH * kW);
*ptr_output = static_cast<scalar_t>(sum / divide_factor);
}
}
}
template <typename scalar_t, typename accscalar_t>
void adaptiveaveragepool_loop(
scalar_t *input_data, scalar_t *output_data,
int64_t totalZ,
int isizeT, int isizeH, int isizeW,
int osizeT, int osizeH, int osizeW,
int64_t istrideD, int64_t istrideT, int64_t istrideH, int64_t istrideW) {
int64_t offsetZ = 0;
dim3 threads(32, 8);
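// 32 threads across the output W dimension x 8 across H = 256 threads per block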
// each H*W plane is processed by blocksH thread blocks
int blocksH = std::max((int)(16L / totalZ), 1);
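// launch at most 65535 planes per grid (the maximum grid dimension on older
// CUDA architectures); offsetZ advances the plane window between launches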
while (totalZ > 0) {
dim3 blocks(totalZ > 65535 ? 65535 : totalZ, blocksH);
adaptiveaveragepool<scalar_t, accscalar_t>
<<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(
input_data, output_data,
isizeT, isizeH, isizeW,
osizeT, osizeH, osizeW,
istrideD,
istrideT, istrideH, istrideW,
offsetZ);
C10_CUDA_KERNEL_LAUNCH_CHECK();
totalZ -= 65535;
offsetZ += 65535;
}
}
/*
* Description:
* This function computes the gradInput from gradOutput.
*
* gridDim.y blocks work together on a single 2D input plane specified by
* (blockIdx.x + offsetZ).
*/
template <typename scalar_t, typename accscalar_t>
__global__ void adaptiveaveragegradinput(
scalar_t *gradInput, scalar_t *gradOutput,
int isizeT, int isizeH, int isizeW,
int osizeT, int osizeH, int osizeW,
int64_t offsetZ)
{
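// Gather variant: each thread owns distinct gradInput elements and accumulates
// every overlapping output window into them, so no atomics are needed. The
// caller must zero-initialize gradInput.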
// iterators on input pixels
int it, ih, iw;
// compute offsets based on thread/block ID
int istartH = blockIdx.y * blockDim.y + threadIdx.y;
int iendH = isizeH;
int istepH = gridDim.y * blockDim.y;
int istartW = threadIdx.x;
int iendW = isizeW;
int istepW = blockDim.x;
// select input plane
int64_t i_plane = blockIdx.x + offsetZ;
it = i_plane % isizeT; // input frame/time
int d = i_plane / isizeT; // slice/feature
// output frame/time range is fixed.
int ostartT = start_index(it, isizeT, osizeT);
int oendT = end_index(it, isizeT, osizeT);
// gradInput offset by slice/feature and frame/time.
scalar_t *gradInput_dt = gradInput + i_plane*isizeH*isizeW;
// gradOutput offset by slice/feature and earliest relevant frame/time
scalar_t *gradOutput_dt = gradOutput + (d*osizeT + ostartT)*osizeH*osizeW;
// For all input pixels...
for (ih = istartH; ih < iendH; ih += istepH) {
int ostartH = start_index(ih, isizeH, osizeH);
int oendH = end_index(ih, isizeH, osizeH);
for (iw = istartW; iw < iendW; iw += istepW) {
int ostartW = start_index(iw, isizeW, osizeW);
int oendW = end_index(iw, isizeW, osizeW);
// Compute the gradients from corresponding output pixels
scalar_t *ptr_gradInput = gradInput_dt + ih*isizeW + iw;
scalar_t *ptr_gradOutput = gradOutput_dt;
// for all relevant output pixels
int ot, oh, ow;
for (ot = ostartT; ot < oendT; ++ot) {
int kT = end_index(ot, osizeT, isizeT) - start_index(ot, osizeT, isizeT);
for (oh = ostartH; oh < oendH; ++oh) {
int kH = end_index(oh, osizeH, isizeH) - start_index(oh, osizeH, isizeH);
for (ow = ostartW; ow < oendW; ++ow) {
int kW = end_index(ow, osizeW, isizeW) - start_index(ow, osizeW, isizeW);
const accscalar_t divide_factor = kW * kH * kT;
accscalar_t grad_delta = static_cast<accscalar_t>(ptr_gradOutput[oh*osizeW + ow] / divide_factor);
*ptr_gradInput += static_cast<scalar_t>(grad_delta);
}
}
ptr_gradOutput += osizeH*osizeW; // next output frame
}
}
}
}
template <typename scalar_t, typename accscalar_t>
void adaptiveaveragegradinput_loop(
scalar_t *gradInput_data, scalar_t *gradOutput_data,
int64_t totalZ,
int isizeT, int isizeH, int isizeW,
int osizeT, int osizeH, int osizeW) {
int64_t offsetZ = 0;
dim3 threads(32, 8);
// each H*W plane is processed by blocksH thread blocks
int blocksH = std::max((int)(16L / totalZ), 1);
while (totalZ > 0) {
dim3 blocks(totalZ > 65535 ? 65535 : totalZ, blocksH);
adaptiveaveragegradinput<scalar_t, accscalar_t>
<<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(
gradInput_data, gradOutput_data,
isizeT, isizeH, isizeW,
osizeT, osizeH, osizeW,
offsetZ);
C10_CUDA_KERNEL_LAUNCH_CHECK();
totalZ -= 65535;
offsetZ += 65535;
}
}
/*
* Description:
* This function computes the gradInput from gradOutput.
*
* gridDim.y blocks work together on a single 2D output plane specified by
* (blockIdx.x + offsetZ).
*
* (uses atomic add)
*
*/
template <typename scalar_t>
__global__ void atomicadaptiveaveragegradinput(
scalar_t *gradInput, scalar_t *gradOutput,
int isizeT, int isizeH, int isizeW,
int osizeT, int osizeH, int osizeW,
int64_t offsetZ)
{
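// Scatter variant: threads are mapped over output pixels and each gradient is
// scattered into all contributing input positions, so concurrent updates to the
// same gradInput element go through gpuAtomicAddNoReturn.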
// iterators on output pixels
int ot, oh, ow;
// compute offsets based on thread/block ID
int ostartH = blockIdx.y * blockDim.y + threadIdx.y;
int oendH = osizeH;
int ostepH = gridDim.y * blockDim.y;
int ostartW = threadIdx.x;
int oendW = osizeW;
int ostepW = blockDim.x;
// select output plane
int64_t o_plane = blockIdx.x + offsetZ;
ot = o_plane % osizeT; // output frame/time
int d = o_plane / osizeT; // output slice/feature
// input frame/time range is fixed.
int istartT = start_index(ot, osizeT, isizeT);
int iendT = end_index(ot, osizeT, isizeT);
int kT = iendT - istartT;
// gradInput offset by slice/feature and earliest relevant frame/time
scalar_t *gradInput_nt = gradInput + (d*isizeT + istartT)*isizeH*isizeW;
// gradOutput offset by slice/feature and frame/time
scalar_t *gradOutput_nt = gradOutput + o_plane*osizeH*osizeW;
// For all output pixels...
for (oh = ostartH; oh < oendH; oh += ostepH) {
int istartH = start_index(oh, osizeH, isizeH);
int iendH = end_index(oh, osizeH, isizeH);
int kH = iendH - istartH;
for (ow = ostartW; ow < oendW; ow += ostepW) {
int istartW = start_index(ow, osizeW, isizeW);
int iendW = end_index(ow, osizeW, isizeW);
int kW = iendW - istartW;
// Compute the gradients from corresponding input pixels
scalar_t *ptr_gradInput = gradInput_nt + istartH*isizeW + istartW;
scalar_t *ptr_gradOutput = gradOutput_nt + oh*osizeW + ow;
scalar_t grad_delta = *ptr_gradOutput / kT / kH / kW;
int it, ih, iw;
for (it = 0; it < kT; ++it) {
for (ih = 0; ih < kH; ++ih) {
for (iw = 0; iw < kW; ++iw) {
gpuAtomicAddNoReturn(&(ptr_gradInput[ih*isizeW + iw]), grad_delta);
}
}
ptr_gradInput += isizeH*isizeW; // next input frame
}
}
}
}
template <typename scalar_t>
void atomicadaptiveaveragegradinput_loop(
scalar_t* gradInput_data, scalar_t* gradOutput_data,
int64_t totalZ,
int isizeT, int isizeH, int isizeW,
int osizeT, int osizeH, int osizeW) {
int64_t offsetZ = 0;
dim3 threads(32, 8);
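// each H*W plane is processed by blocksH thread blocks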
int blocksH = std::max((int)(16L / totalZ), 1);
while (totalZ > 0) {
dim3 blocks(totalZ > 65535 ? 65535 : totalZ, blocksH);
atomicadaptiveaveragegradinput<<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(
gradInput_data, gradOutput_data,
isizeT, isizeH, isizeW,
osizeT, osizeH, osizeW,
offsetZ);
C10_CUDA_KERNEL_LAUNCH_CHECK();
totalZ -= 65535;
offsetZ += 65535;
}
}
// 5D tensor B x D x T x H x W
void adaptive_avg_pool3d_out_cuda_template(
Tensor& output,
const Tensor& input_,
IntArrayRef output_size) {
TensorArg output_arg{output, "output", 1};
TensorArg input_arg{input_, "input_", 2};
checkAllSameGPU("adaptive_avg_pool3d_cuda", {output_arg, input_arg});
for (int64_t i = 1; i < input_.ndimension(); i++) {
TORCH_CHECK(
input_.size(i) > 0,
"adaptive_avg_pool3d_cuda(): Expected input to have non-zero size for non-batch dimensions, "
"but input has sizes ", input_.sizes(),
" with dimension ", i, " being empty");
}
TORCH_CHECK(
(input_.ndimension() == 4 || input_.ndimension() == 5),
"adaptive_avg_pool3d_cuda(): Expected 4D or 5D tensor, but got ", input_.sizes());
// the jit sometimes passes output_size.size() == 1
TORCH_CHECK(
output_size.size() == 1 || output_size.size() == 3,
"adaptive_avg_pool3d: internal error: output_size.size() must be 1 or 3");
int64_t osizeT = output_size[0];
int64_t osizeH = output_size.size() == 1 ? output_size[0] : output_size[1];
int64_t osizeW = output_size.size() == 1 ? output_size[0] : output_size[2];
int64_t sizeD, isizeT, isizeH, isizeW;
int64_t istrideD, istrideT, istrideH, istrideW;
int64_t totalZ;
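// A 4D (D, T, H, W) input may have arbitrary strides because they are passed
// to the kernel explicitly. A 5D (B, D, T, H, W) input has B and D collapsed
// into a single plane index, which is only valid when
// stride(0) == sizeD * stride(1), so it is made contiguous first.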
const Tensor& input = input_.ndimension() == 4 ? input_ : input_.contiguous();
if (input.ndimension() == 4) {
sizeD = input.size(0);
isizeT = input.size(1);
isizeH = input.size(2);
isizeW = input.size(3);
istrideD = input.stride(0);
istrideT = input.stride(1);
istrideH = input.stride(2);
istrideW = input.stride(3);
output.resize_({sizeD, osizeT, osizeH, osizeW});
totalZ = sizeD * osizeT;
} else {
int64_t sizeB = input.size(0);
sizeD = input.size(1);
isizeT = input.size(2);
isizeH = input.size(3);
isizeW = input.size(4);
istrideD = input.stride(1);
istrideT = input.stride(2);
istrideH = input.stride(3);
istrideW = input.stride(4);
output.resize_({sizeB, sizeD, osizeT, osizeH, osizeW});
totalZ = sizeB * sizeD * osizeT;
}
if (output.numel() == 0) {
return;
}
AT_DISPATCH_FLOATING_TYPES_AND2(kHalf, kBFloat16,
input.scalar_type(), "adaptive_avg_pool3d_cuda", [&] {
using accscalar_t = at::acc_type<scalar_t, true>;
scalar_t* input_data = input.data_ptr<scalar_t>();
scalar_t* output_data = output.data_ptr<scalar_t>();
adaptiveaveragepool_loop<scalar_t, accscalar_t>(
input_data, output_data,
totalZ,
isizeT, isizeH, isizeW,
osizeT, osizeH, osizeW,
istrideD, istrideT, istrideH, istrideW);
});
}
void adaptive_avg_pool3d_backward_out_cuda_template(
Tensor& gradInput,
const Tensor& gradOutput_,
const Tensor& input) {
TensorArg grad_input_arg{gradInput, "gradInput", 1};
TensorArg grad_output_arg{gradOutput_, "gradOutput_", 2};
TensorArg input_arg{input, "input", 3};
checkAllSameGPU(
"adaptive_avg_pool3d_out_cuda",
{grad_input_arg, grad_output_arg, input_arg});
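// The backward kernels index gradOutput with dense (osizeH * osizeW) plane
// strides, so a contiguous copy is required.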
const Tensor gradOutput = gradOutput_.contiguous();
gradInput.resize_as_(input);
if (gradInput.numel() == 0) {
return;
}
gradInput.zero_();
int64_t sizeD, isizeT, isizeH, isizeW;
int64_t osizeT, osizeH, osizeW;
int64_t totalZ;
if (input.ndimension() == 4) {
sizeD = input.size(0);
isizeT = input.size(1);
isizeH = input.size(2);
isizeW = input.size(3);
osizeT = gradOutput.size(1);
osizeH = gradOutput.size(2);
osizeW = gradOutput.size(3);
} else {
sizeD = input.size(1);
isizeT = input.size(2);
isizeH = input.size(3);
isizeW = input.size(4);
osizeT = gradOutput.size(2);
osizeH = gradOutput.size(3);
osizeW = gradOutput.size(4);
}
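// Use the scatter/atomic backward kernel when the input extents are not exact
// multiples of the output extents; otherwise run the gather kernel, which
// assigns each gradInput element to a single thread and stays deterministic.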
bool atomic = (isizeW%osizeW != 0) || (isizeH%osizeH != 0) || (isizeT%osizeT != 0);
if (input.ndimension() == 4) {
totalZ = atomic ? sizeD * osizeT : sizeD * isizeT;
} else {
int64_t sizeB = input.size(0);
totalZ = atomic ? sizeB * sizeD * osizeT : sizeB * sizeD * isizeT;
}
if (atomic) {
AT_DISPATCH_FLOATING_TYPES_AND2(kHalf, kBFloat16,
input.scalar_type(), "adaptive_avg_pool3d_backward_cuda", [&] {
scalar_t* gradInput_data = gradInput.data_ptr<scalar_t>();
scalar_t* gradOutput_data = gradOutput.data_ptr<scalar_t>();
atomicadaptiveaveragegradinput_loop(
gradInput_data, gradOutput_data,
totalZ,
isizeT, isizeH, isizeW,
osizeT, osizeH, osizeW);
});
} else {
AT_DISPATCH_FLOATING_TYPES_AND2(kHalf, kBFloat16,
input.scalar_type(), "adaptive_avg_pool3d_backward_cuda", [&] {
using accscalar_t = at::acc_type<scalar_t, true>;
scalar_t* gradInput_data = gradInput.data_ptr<scalar_t>();
scalar_t* gradOutput_data = gradOutput.data_ptr<scalar_t>();
adaptiveaveragegradinput_loop<scalar_t, accscalar_t>(
gradInput_data, gradOutput_data,
totalZ,
isizeT, isizeH, isizeW,
osizeT, osizeH, osizeW);
});
}
}
} // namespace
Tensor& adaptive_avg_pool3d_out_cuda(const Tensor& input,
IntArrayRef output_size,
Tensor& output) {
adaptive_avg_pool3d_out_cuda_template(output, input, output_size);
return output;
}
Tensor adaptive_avg_pool3d_cuda(
const Tensor& input,
IntArrayRef output_size) {
auto output = at::empty({0}, input.options());
adaptive_avg_pool3d_out_cuda_template(output, input, output_size);
return output;
}
Tensor& adaptive_avg_pool3d_backward_out_cuda(const Tensor& gradOutput_,
const Tensor& input,
Tensor& gradInput) {
// See Note [Writing Nondeterministic Operations]
// Nondeterministic because of atomicAdd usage
globalContext().alertNotDeterministic("adaptive_avg_pool3d_backward_out_cuda");
adaptive_avg_pool3d_backward_out_cuda_template(gradInput, gradOutput_, input);
return gradInput;
}
Tensor adaptive_avg_pool3d_backward_cuda(
const Tensor& gradOutput_,
const Tensor& input) {
// See Note [Writing Nondeterministic Operations]
// Nondeterministic because of atomicAdd usage
globalContext().alertNotDeterministic("adaptive_avg_pool3d_backward_cuda");
auto gradInput = at::zeros_like(input, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
adaptive_avg_pool3d_backward_out_cuda_template(gradInput, gradOutput_, input);
return gradInput;
}
} // namespace native
} // namespace at