| #define TORCH_ASSERT_ONLY_METHOD_OPERATORS |
| #include <ATen/core/Tensor.h> |
| #include <ATen/ceil_div.h> |
| #include <ATen/Dispatch.h> |
| #include <ATen/cuda/Atomic.cuh> |
| #include <ATen/cuda/CUDAContext.h> |
| #include <ATen/TensorUtils.h> |
| #include <ATen/Utils.h> |
| #include <c10/util/Exception.h> |
| #include <ATen/native/cuda/LaunchUtils.h> |
| |
| #ifndef AT_PER_OPERATOR_HEADERS |
| #include <ATen/Functions.h> |
| #include <ATen/NativeFunctions.h> |
| #else |
| #include <ATen/ops/_adaptive_avg_pool2d_backward_native.h> |
| #include <ATen/ops/_adaptive_avg_pool2d_native.h> |
| #include <ATen/ops/empty.h> |
| #include <ATen/ops/zeros_like.h> |
| #endif |
| |
| #include <algorithm> |
| #include <cfloat> |
| #include <cmath> |
| #include <limits> |
| |
| #define START_IND(a,b,c) ((int64_t)((a / b) * c + ((a % b) * c) / b)) |
| #define END_IND(a,b,c) (1 + ((int64_t)(a + 1) * c - 1) / b) |
| |
| #define START_IND_INT(a,b,c) ((a * c) / b) |
| #define END_IND_INT(a,b,c) (((a + 1) * c + b - 1) / b) |
| // #define START_IND(a,b,c) a * c / b |
| // #define END_IND(a,b,c) (a + 1) * c / b + ((a + 1) * c % b > 0)?1:0 |
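| // Worked example (illustration only, not used by the code): with isize = 5 |
| // and osize = 3, the three output bins map to the input ranges |
| //   o = 0: [START_IND(0,3,5), END_IND(0,3,5)) = [0, 2) |
| //   o = 1: [START_IND(1,3,5), END_IND(1,3,5)) = [1, 4) |
| //   o = 2: [START_IND(2,3,5), END_IND(2,3,5)) = [3, 5) |
| // i.e. START_IND(o,osize,isize) = floor(o*isize/osize) and |
| // END_IND(o,osize,isize) = ceil((o+1)*isize/osize); adjacent bins can |
| // overlap by at most one element, and only when isize % osize != 0. The |
| // *_INT variants compute the same bounds with plain integer arithmetic and |
| // are used by the NHWC kernels below, where indices fit the operand type. |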
| |
| #define CUDA_MAX_THREADS 1024 // this is safe, in reality 256 is our limit |
| #define BLOCK_STRIDE 2 // increasing block_stride to lower # of blocks launched |
| |
| namespace at { |
| namespace native { |
| |
| namespace { |
| |
| // 4d tensor B x D x H x W |
| // All kernels view batch dim B and feature dim D as collapsed. |
| |
| /* |
| * Description: |
| * this function adaptively average pools an input 4D tensor along dimensions 2 and 3 |
| * 4D input, 4D output |
| */ |
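| // Summary of the computation below: for each output pixel (oh, ow) of a |
| // plane, |
| //   output[oh][ow] = (1 / (kH * kW)) * sum of input over the window |
| //     rows [START_IND(oh, osizeH, isizeH), END_IND(oh, osizeH, isizeH)) |
| //     cols [START_IND(ow, osizeW, isizeW), END_IND(ow, osizeW, isizeW)) |
| // where kH and kW are the (possibly non-uniform) window extents. |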
| template <typename T> |
| __global__ void adaptive_average_pool(T *input, T *output, |
| int isizeH, int isizeW, |
| int osizeH, int osizeW, |
| int64_t istrideD, int64_t istrideH, int64_t istrideW) |
| { |
| // iterators on output pixels |
| int oh, ow; |
| |
| // select input/output plane based on thread/block ID |
| int o_plane = blockIdx.x; |
| int i_plane = o_plane; |
| |
| output = output + o_plane*osizeH*osizeW; |
| input = input + i_plane*istrideD; |
| |
| int ostartH = blockDim.y*blockIdx.y + threadIdx.y; |
| int oendH = osizeH; |
| const int ostepH = blockDim.y*gridDim.y; |
| |
| int ostartW = threadIdx.x; |
| int oendW = osizeW; |
| const int ostepW = blockDim.x; |
| |
| // For all output pixels... |
| for(oh = ostartH; oh < oendH; oh += ostepH) { |
| |
| int istartH = START_IND(oh, osizeH, isizeH); |
| int iendH = END_IND(oh, osizeH, isizeH); |
| int kH = iendH - istartH; |
| |
| for(ow = ostartW; ow < oendW; ow += ostepW) { |
| |
| int istartW = START_IND(ow, osizeW, isizeW); |
| int iendW = END_IND(ow, osizeW, isizeW); |
| int kW = iendW - istartW; |
| |
| // Compute the average pooling over corresponding input pixels |
| T *ptr_input = input + istartH*istrideH + istartW*istrideW; |
| T *ptr_output = output + oh*osizeW + ow; |
| T sum = static_cast<T>(0); |
| int ih, iw; |
| for(ih = 0; ih < kH; ++ih) { |
| for(iw = 0; iw < kW; ++iw) { |
| T val = ptr_input[iw*istrideW]; |
| sum += val; |
| } |
| ptr_input += istrideH; // next input line |
| } |
| // Update output |
| *ptr_output = sum / kH / kW; |
| } |
| } |
| } |
| |
| /* |
| * Description: |
| * this function computes the gradInput from gradOutput |
| */ |
| template <typename T> |
| __global__ void adaptive_average_gradinput( |
| T *gradInput, T *gradOutput, |
| int isizeH, int isizeW, int osizeH, int osizeW |
| ) |
| { |
| // iterators on input pixels |
| int ih, iw; |
| |
| // select input/output plane based on thread/block ID |
| int i_plane = blockIdx.x; |
| int o_plane = i_plane; |
| |
| gradOutput = gradOutput + o_plane*osizeH*osizeW; |
| gradInput = gradInput + i_plane*isizeH*isizeW; |
| |
| int istartH = blockDim.y*blockIdx.y + threadIdx.y; |
| int iendH = isizeH; |
| int istepH = blockDim.y*gridDim.y; |
| |
| int istartW = threadIdx.x; |
| int iendW = isizeW; |
| int istepW = blockDim.x; |
| |
| // compute gradInput |
| for(ih = istartH; ih < iendH; ih += istepH) { |
| |
| int ostartH = START_IND(ih, isizeH, osizeH); |
| int oendH = END_IND(ih, isizeH, osizeH); |
| |
| for(iw = istartW; iw < iendW; iw += istepW) { |
| |
| int ostartW = START_IND(iw, isizeW, osizeW); |
| int oendW = END_IND(iw, isizeW, osizeW); |
| |
| // Compute the gradients over corresponding output pixels |
| T *ptr_gradInput = gradInput + ih*isizeW + iw; |
| |
| int oh, ow; |
| for(oh = ostartH; oh < oendH; ++oh) { |
| // kH is the pooling window height contributing to output row oh |
| int kH = END_IND(oh, osizeH, isizeH) - START_IND(oh, osizeH, isizeH); |
| for(ow = ostartW; ow < oendW; ++ow) { |
| // kW is the pooling window width contributing to output column ow |
| int kW = END_IND(ow, osizeW, isizeW) - START_IND(ow, osizeW, isizeW); |
| T grad_delta = gradOutput[ow + oh*osizeW] / kH / kW; |
| *ptr_gradInput += grad_delta; |
| } |
| } |
| } |
| } |
| } |
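| // Note: the kernel above is gather-style: each thread owns a distinct set |
| // of gradInput pixels and sums every gradOutput value that touches them, |
| // so no atomics are needed. The variant below is scatter-style: it walks |
| // output pixels and adds into the (possibly overlapping) input windows, |
| // which is why it uses atomic adds. |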
| |
| /* |
| * Description: |
| * this function computes the gradInput from gradOutput |
| * (uses atomic add) |
| */ |
| template <typename T> |
| __global__ void atomic_adaptive_average_gradinput( |
| T *gradInput, T *gradOutput, |
| int isizeH, int isizeW, int osizeH, int osizeW |
| ) |
| { |
| // iterators on output indices |
| int oh, ow; |
| |
| // select input/output plane based on thread/block ID |
| int o_plane = blockIdx.x; |
| int i_plane = o_plane; |
| |
| gradOutput = gradOutput + o_plane*osizeW*osizeH; |
| gradInput = gradInput + i_plane*isizeW*isizeH; |
| |
| int ostartH = blockDim.y*blockIdx.y + threadIdx.y; |
| int oendH = osizeH; |
| int ostepH = blockDim.y*gridDim.y; |
| |
| int ostartW = threadIdx.x; |
| int oendW = osizeW; |
| int ostepW = blockDim.x; |
| |
| // For all output pixels... |
| for(oh = ostartH; oh < oendH; oh += ostepH) { |
| |
| int istartH = START_IND(oh, osizeH, isizeH); |
| int iendH = END_IND(oh, osizeH, isizeH); |
| int kH = iendH - istartH; |
| |
| for(ow = ostartW; ow < oendW; ow += ostepW) { |
| |
| int istartW = START_IND(ow, osizeW, isizeW); |
| int iendW = END_IND(ow, osizeW, isizeW); |
| int kW = iendW - istartW; |
| |
| // Compute the gradients over the corresponding input pixels |
| T *ptr_gradInput = gradInput + istartH*isizeW + istartW; |
| T *ptr_gradOutput = gradOutput + oh*osizeW + ow; |
| T grad_delta = *ptr_gradOutput / kW / kH; |
| |
| int ih, iw; |
| for(ih = 0; ih < kH; ++ih) { |
| for(iw = 0; iw < kW; ++iw) { |
| // atomic add since different threads could update same variable |
| gpuAtomicAddNoReturn(&(ptr_gradInput[iw]), grad_delta); |
| } |
| ptr_gradInput += isizeW; // next input line |
| } |
| } |
| } |
| } |
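| // Illustration of the overlap (assumed sizes, not taken from the code): |
| // with isizeH = 5 and osizeH = 3 the row windows are [0,2), [1,4) and |
| // [3,5); input rows 1 and 3 each belong to two output windows, so threads |
| // handling different output rows may update the same gradInput element |
| // concurrently, hence gpuAtomicAddNoReturn above. |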
| |
| /* |
| * Description: |
| * this function adaptively average pools an input 4D tensor along dimensions 2 and 3 |
| * NHWC layout for both input and output tensor |
| * 4D input, 4D output |
| */ |
| template <typename index_t, typename scalar_t> |
| C10_LAUNCH_BOUNDS_1(CUDA_MAX_THREADS) |
| __global__ void adaptive_average_pool_nhwc(const scalar_t* __restrict__ input, scalar_t* __restrict__ output, |
| int sizeB, int sizeC, |
| int isizeH, int isizeW, |
| int osizeH, int osizeW, |
| int kernel_stride_C, int kernel_size_C, |
| index_t istrideB, index_t istrideC, |
| index_t istrideH, index_t istrideW) |
| { |
| extern __shared__ int smem[]; |
| scalar_t *out_cached = reinterpret_cast<scalar_t*>(smem); |
| |
| // flattening cta for pre-computation & smem initialization; |
| int thread_id = threadIdx.x + blockDim.x * (threadIdx.y + blockDim.y * threadIdx.z); |
| int block_size = blockDim.x * blockDim.y * blockDim.z; |
| |
| // use shared memory to store temporary output values. This is simply to |
| // reduce register usage. |
| for (index_t i = thread_id; i < kernel_size_C*blockDim.x*blockDim.y*blockDim.z; i+= block_size) { |
| out_cached[i] = scalar_t(0.0); |
| } |
| |
| __syncthreads(); |
| |
| // each CTA handles a portion of a single slice on batch dimension; |
| int batch_id = blockIdx.x % sizeB; |
| int channel_id = blockIdx.x / sizeB; |
| int channel_offset = threadIdx.x + channel_id * blockDim.x; |
| |
| // each CTA handles a portion of a single slice on the batch dimension; |
| // we use gridDim.x to handle striding on C as well. |
| output = output + batch_id * osizeH * osizeW * sizeC; |
| input = input + batch_id * istrideB; |
| |
| // split out_cached so each thread gets its own exclusive slice; |
| out_cached = &out_cached[(threadIdx.z * blockDim.y + threadIdx.y) * kernel_size_C * blockDim.x]; |
| |
| // iterate on output H & W. |
| // Each CTA handles a consecutive H & W tile; do NOT stride CTAs across |
| // tiles, so there is a better chance of hitting the L1 cache. |
| index_t oH = (osizeH + gridDim.z-1) / gridDim.z; |
| index_t oW = (osizeW + gridDim.y-1) / gridDim.y; |
| index_t ostartH = threadIdx.z + blockIdx.z*oH; |
| index_t oendH = ::min(ostartH+oH, osizeH); |
| index_t ostartW = threadIdx.y + blockIdx.y*oW; |
| index_t oendW = ::min(ostartW+oW, osizeW); |
| |
| // Threads stride through the tile; each warp reuses L1 as it goes, so the |
| // data theoretically has a better chance of surviving cache eviction. |
| for (int oh = ostartH; oh < oendH; oh+=blockDim.z) { |
| int istartH = START_IND_INT(oh, osizeH, isizeH); |
| int iendH = END_IND_INT(oh, osizeH, isizeH); |
| for (int ow = ostartW; ow < oendW; ow+=blockDim.y) { |
| int istartW = START_IND_INT(ow, osizeW, isizeW); |
| int iendW = END_IND_INT(ow, osizeW, isizeW); |
| scalar_t factor = scalar_t(1.0) / ((iendH-istartH) * (iendW-istartW)); |
| |
| // loop on input: hierarchy h->w->c; using shared memory here hopefully |
| // avoids stalling the global memory reads; |
| for (index_t ih = istartH; ih < iendH; ih++) { |
| for (index_t iw = istartW; iw < iendW; iw++) { |
| int cached_index = threadIdx.x; |
| const scalar_t *ptr_input = input + ih*istrideH + iw*istrideW; |
| for (index_t c = channel_offset; |
| c < sizeC; |
| c += blockDim.x*kernel_stride_C) { |
| out_cached[cached_index] += ptr_input[c*istrideC]; |
| cached_index += blockDim.x; |
| } |
| } |
| } |
| scalar_t *ptr_output = output + (oh * osizeW + ow) * sizeC; |
| |
| int cached_index = threadIdx.x; |
| // write accumulated output to global memory; |
| for (index_t c = channel_offset; |
| c < sizeC; |
| c += blockDim.x*kernel_stride_C) { |
| // Multiplying by the precomputed factor can differ slightly (numerically) |
| // from the NCHW kernel in unit tests; switching to the division form below |
| // can be used to verify correctness. |
| // output[c] = out_cached[c] / (iendH-istartH) / (iendW-istartW); |
| ptr_output[c] = out_cached[cached_index] * factor; |
| out_cached[cached_index] = scalar_t(0.0); |
| cached_index += blockDim.x; |
| } |
| // no need to __syncthreads(): each thread uses its own exclusive slice of out_cached. |
| } |
| } |
| } |
| |
| /* |
| * Description: |
| * this function computes the gradInput from gradOutput |
| * NHWC layout for both input and output tensor |
| * 4D input, 4D output |
| */ |
| template <typename index_t, typename scalar_t> |
| C10_LAUNCH_BOUNDS_1(CUDA_MAX_THREADS) |
| __global__ void adaptive_average_gradinput_nhwc(scalar_t* __restrict__ gradInput, const scalar_t* __restrict__ gradOutput, |
| int sizeB, int sizeC, |
| int isizeH, int isizeW, |
| int osizeH, int osizeW, |
| int kernel_stride_C, int kernel_size_C, |
| index_t ostrideB, index_t ostrideC, |
| index_t ostrideH, index_t ostrideW) |
| { |
| extern __shared__ int smem[]; |
| index_t *ostartW_cached = smem; |
| index_t *oendW_cached = &ostartW_cached[isizeW]; |
| |
| // be careful with alignment: in case scalar_t is fp16, we want to lay out |
| // the int buffers first. |
| scalar_t *r_kW_cached = reinterpret_cast<scalar_t*>(&oendW_cached[isizeW]); |
| scalar_t *r_kH_cached = &r_kW_cached[osizeW]; |
| scalar_t *out_cached = &r_kH_cached[osizeH]; |
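| // Dynamic shared memory layout, in order (this is what the host-side |
| // launch sizes via shmem_size): |
| //   ostartW_cached : isizeW index_t entries |
| //   oendW_cached   : isizeW index_t entries |
| //   r_kW_cached    : osizeW scalar_t entries |
| //   r_kH_cached    : osizeH scalar_t entries |
| //   out_cached     : kernel_size_C * blockDim.x * blockDim.y * blockDim.z |
| //                    scalar_t entries |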
| |
| // flattening cta for pre-computation & smem initialization; |
| int thread_id = threadIdx.x + blockDim.x * (threadIdx.y + blockDim.y * threadIdx.z); |
| int block_size = blockDim.x * blockDim.y * blockDim.z; |
| |
| // Precompute output start/end index per input index on the width dimension; |
| // we don't do this for the height dimension, as that is our outermost loop. |
| for (index_t i = thread_id; i < isizeW; i+= block_size) { |
| ostartW_cached[i] = START_IND_INT(i, isizeW, osizeW); |
| oendW_cached[i] = END_IND_INT(i, isizeW, osizeW); |
| } |
| |
| // Precompute the pooling height/width reciprocal factor for each output |
| // element; this is used to weight the output gradient when accumulating it |
| // onto the input gradient. |
| // Technically we don't have to compute it for the whole `osizeH`, since |
| // each cta only covers a consecutive portion of the entire output. But that |
| // would not save us from code divergence, and the shared memory cost is not |
| // an issue either, so just leave it as is for now. |
| for (index_t i = thread_id; i < osizeH; i+= block_size) { |
| r_kH_cached[i] = scalar_t(1.0) / (END_IND_INT(i, osizeH, isizeH) - START_IND_INT(i, osizeH, isizeH)); |
| } |
| for (index_t i = thread_id; i < osizeW; i+= block_size) { |
| r_kW_cached[i] = scalar_t(1.0) / (END_IND_INT(i, osizeW, isizeW) - START_IND_INT(i, osizeW, isizeW)); |
| } |
| |
| // each CTA handles a portion of a single slice on batch dimension; |
| int batch_id = blockIdx.x % sizeB; |
| int channel_id = blockIdx.x / sizeB; |
| int channel_offset = threadIdx.x + channel_id * blockDim.x; |
| |
| // use shared memory to store temporary output values. This is simply to |
| // reduce register usage. |
| for (index_t i = thread_id; i < kernel_size_C*blockDim.x*blockDim.y*blockDim.z; i+= block_size) { |
| out_cached[i] = scalar_t(0.0); |
| } |
| |
| __syncthreads(); |
| |
| // each CTA handles a portion of a single slice on batch dimension; |
| // We use gridDim.x to handle striding on C as well. |
| gradInput = gradInput + batch_id * isizeH * isizeW * sizeC; |
| gradOutput = gradOutput + batch_id * ostrideB; |
| |
| // split out_cached so each thread gets its own exclusive slice; |
| out_cached = &out_cached[(threadIdx.z * blockDim.y + threadIdx.y) * blockDim.x * kernel_size_C]; |
| |
| // iterate on input H & W. |
| // Each CTA handles a consecutive H & W tile; do NOT stride CTAs across |
| // tiles, so there is a better chance of hitting the L1 cache. |
| index_t iH = (isizeH + gridDim.z-1) / gridDim.z; |
| index_t iW = (isizeW + gridDim.y-1) / gridDim.y; |
| index_t istartH = threadIdx.z + blockIdx.z*iH; |
| index_t iendH = ::min(istartH+iH, isizeH); |
| index_t istartW = threadIdx.y + blockIdx.y*iW; |
| index_t iendW = ::min(istartW+iW, isizeW); |
| |
| // Threads stride through the tile; each warp reuses L1 as it goes, so the |
| // data theoretically has a better chance of surviving cache eviction. |
| for (index_t ih = istartH; ih < iendH; ih+=blockDim.z) { |
| index_t ostartH = START_IND_INT(ih, isizeH, osizeH); |
| index_t oendH = END_IND_INT(ih, isizeH, osizeH); |
| for (index_t iw = istartW; iw < iendW; iw+=blockDim.y) { |
| // loop on output: hierarchy h->w->c, so we can reuse the weight factor f, |
| // since it stays the same for a given oh & ow |
| for(index_t oh = ostartH; oh < oendH; ++oh) { |
| for(index_t ow = ostartW_cached[iw]; ow < oendW_cached[iw]; ++ow) { |
| scalar_t f = r_kW_cached[ow] * r_kH_cached[oh]; |
| const scalar_t* ptr_gradOutput = gradOutput + oh*ostrideH + ow*ostrideW; |
| int cached_index = threadIdx.x; |
| for (index_t c = channel_offset; |
| c < sizeC; |
| c += blockDim.x*kernel_stride_C) { |
| out_cached[cached_index] += ptr_gradOutput[c*ostrideC] * f; |
| cached_index += blockDim.x; |
| } |
| } |
| } |
| scalar_t *ptr_gradInput = gradInput + (ih * isizeW + iw) * sizeC; |
| int cached_index = threadIdx.x; |
| // write the accumulated gradInput to global memory; |
| for (index_t c = channel_offset; |
| c < sizeC; |
| c += blockDim.x*kernel_stride_C) { |
| ptr_gradInput[c] = out_cached[cached_index]; |
| out_cached[cached_index] = scalar_t(0.0); |
| cached_index += blockDim.x; |
| } |
| // no need to __syncthreads(): each thread uses its own exclusive slice of out_cached. |
| } |
| } |
| } |
| |
| // 4d tensor B x D x H x W |
| |
| void adaptive_avg_pool2d_out_cuda_template( |
| Tensor& output, |
| const Tensor& input, |
| IntArrayRef output_size) |
| { |
| TensorArg input_arg{ input, "input", 1 }, |
| output_arg{ output, "output", 2 }; |
| checkAllSameGPU(__func__, {input_arg, output_arg}); |
| |
| TORCH_CHECK(output_size.size() == 2, "adaptive_avg_pool2d: output_size must have 2 elements"); |
| int64_t ndim = input.dim(); |
| TORCH_CHECK((ndim == 3 || ndim == 4), |
| "adaptive_avg_pool2d(): Expected 3D or 4D tensor, but got ", input.sizes()); |
| for (const auto i : {-2, -1}) { |
| TORCH_CHECK(input.size(i) > 0, |
| "adaptive_avg_pool2d(): Expected input to have non-zero size for non-batch dimensions, " |
| "but input has sizes ", input.sizes(), " with dimension ", i + ndim, " being " |
| "empty"); |
| } |
| |
| Tensor input_ = input; |
| switch (input.suggest_memory_format()) { |
| case at::MemoryFormat::ChannelsLast: { |
| // special case for tensor memory format in channels_last |
| TORCH_CHECK(input.ndimension() == 4, |
| "adaptive_avg_pool2d(): Expected 4D tensor, but got ", |
| input.sizes()); |
| |
| int sizeB = input_.size(0); |
| int sizeC = input_.size(1); |
| int isizeH = input_.size(2); |
| int isizeW = input_.size(3); |
| |
| int64_t istrideB = input_.stride(0); |
| int64_t istrideC = input_.stride(1); |
| int64_t istrideH = input_.stride(2); |
| int64_t istrideW = input_.stride(3); |
| |
| int osizeH = output_size[0]; |
| int osizeW = output_size[1]; |
| // preserve channels_last stride on output tensor; |
| if (!output.is_contiguous(at::MemoryFormat::ChannelsLast)) { |
| // TODO: modify this after resize_ adds a `memory_format` tag |
| output.resize_({sizeB, sizeC, osizeH, osizeW}).as_strided_({sizeB, sizeC, osizeH, osizeW}, {sizeC*osizeH*osizeW, 1, osizeW*sizeC, sizeC}); |
| } |
| |
| if (output.numel() == 0) { |
| return; |
| } |
| |
| const int max_threads = std::min<int>( |
| at::cuda::getCurrentDeviceProperties()->maxThreadsPerBlock, CUDA_MAX_THREADS); |
| int* maxThreadsDim = at::cuda::getCurrentDeviceProperties()->maxThreadsDim; |
| int* maxGridSize = at::cuda::getCurrentDeviceProperties()->maxGridSize; |
| size_t sharedMemPerBlock = at::cuda::getCurrentDeviceProperties()->sharedMemPerBlock; |
| |
| // Launch kernel on output tensor elements. Logic behind launch config: |
| // output tensor size NCHW, strides NHWC; |
| // Launch on: |
| // N -> grid.x |
| // H -> grid.z * block.z |
| // W -> grid.y * block.y |
| // C -> block.x |
| // encourage larger block_y & block_z for better cache hits while maintaining |
| // a reasonable block_x for coalesced memory access; |
| int block_x = std::min<int>( |
| maxThreadsDim[0], std::min<int>(lastPow2(sizeC), at::cuda::warp_size())); |
| int block_y = std::min<int>( |
| maxThreadsDim[1], std::min<int>(lastPow2(osizeW), max_threads / block_x)); |
| int block_z = std::min<int>( |
| maxThreadsDim[2], std::min<int>(lastPow2(osizeH), max_threads / block_x / block_y)); |
| block_x = std::min<int>( |
| maxThreadsDim[0], std::min<int>(lastPow2(sizeC), max_threads / block_y / block_z)); |
| const dim3 block(block_x, block_y, block_z); |
| int kernel_stride_C = ceil_div(sizeC, block_x * 4); |
| int kernel_size_C = ceil_div(sizeC, block_x * kernel_stride_C); |
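| // Worked example (hypothetical sizes, assuming warp_size = 32, |
| // max_threads = 1024, maxThreadsDim = {1024, 1024, 64}): sizeC = 2048 with |
| // a 7x7 output gives block_y = block_z = lastPow2(7) = 4, block_x |
| // re-expanded to 1024 / (4*4) = 64, kernel_stride_C = ceil_div(2048, 64*4) |
| // = 8 and kernel_size_C = ceil_div(2048, 64*8) = 4: grid.x launches |
| // sizeB*8 blocks striding over C, and each thread caches 4 channel values |
| // in out_cached. |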
| |
| // Do NOT clip grid_x: striding over the batch dimension is not implemented |
| // in the kernel, although it could easily be added given the current kernel. |
| int grid_x = sizeB*kernel_stride_C; |
| // it's OK to clip grid_y & grid_z, as we block the two dimensions in the kernel; |
| int grid_y = std::min<int>( |
| maxGridSize[1], ceil_div(osizeW, block_y*BLOCK_STRIDE)); |
| int grid_z = std::min<int>( |
| maxGridSize[2], ceil_div(osizeH, block_z*BLOCK_STRIDE)); |
| const dim3 grid(grid_x, grid_y, grid_z); |
| |
| |
| // We are dealing with a packed tensor here, so the max index is the same |
| // as numel. |
| // TODO: to really support input tensors large enough to go beyond int32, |
| // we will need to restrict our shared memory usage and adjust the launch |
| // config; |
| AT_ASSERT(input_.numel() < std::numeric_limits<int32_t>::max()); |
| AT_DISPATCH_FLOATING_TYPES_AND2(kHalf, kBFloat16, |
| input_.scalar_type(), "adaptive_avg_pool2d_nhwc_cuda", [&] { |
| size_t shmem_size = (kernel_size_C * block_x * block_y * block_z) * sizeof(scalar_t); |
| AT_ASSERT(shmem_size <= sharedMemPerBlock); |
| adaptive_average_pool_nhwc<int32_t><<<grid, block, shmem_size, at::cuda::getCurrentCUDAStream()>>> ( |
| input_.data_ptr<scalar_t>(), |
| output.data_ptr<scalar_t>(), |
| sizeB, sizeC, isizeH, isizeW, osizeH, osizeW, |
| kernel_stride_C, kernel_size_C, |
| istrideB, istrideC, istrideH, istrideW); |
| C10_CUDA_KERNEL_LAUNCH_CHECK(); |
| } |
| ); |
| break; |
| } |
| case at::MemoryFormat::Contiguous: { |
| int64_t grid_x = input.size(-3); |
| if (input.ndimension() == 4) { |
| input_ = input.contiguous(); |
| grid_x *= input_.size(-4); |
| } |
| int64_t sizeD = input_.size(-3); |
| int64_t isizeH = input_.size(-2); |
| int64_t isizeW = input_.size(-1); |
| |
| int64_t istrideD = input_.stride(-3); |
| int64_t istrideH = input_.stride(-2); |
| int64_t istrideW = input_.stride(-1); |
| |
| int64_t osizeH = output_size[0]; |
| int64_t osizeW = output_size[1]; |
| if (input.ndimension() == 4) { |
| output.resize_({input_.size(-4), sizeD, osizeH, osizeW}); |
| } else { |
| output.resize_({sizeD, osizeH, osizeW}); |
| } |
| if (output.numel() == 0) { |
| return; |
| } |
| |
| AT_DISPATCH_FLOATING_TYPES_AND2(kHalf, kBFloat16, |
| input_.scalar_type(), "adaptive_avg_pool2d_cuda", [&] { |
| scalar_t *input_data = input_.data_ptr<scalar_t>(); |
| scalar_t *output_data = output.data_ptr<scalar_t>(); |
| |
| // cuda blocks & threads: |
| int blocksH = std::max<int64_t>((int)(16L / sizeD), 1); |
| dim3 blocks(grid_x, blocksH); |
| dim3 threads(32, 8); |
| |
| // run averagepool kernel |
| adaptive_average_pool <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>> ( |
| input_data, output_data, |
| isizeH, isizeW, osizeH, osizeW, |
| istrideD, istrideH, istrideW); |
| C10_CUDA_KERNEL_LAUNCH_CHECK(); |
| } |
| ); |
| break; |
| } |
| default: |
| TORCH_CHECK( |
| false, |
| "Unsupported memory format. Supports only ChannelsLast, Contiguous"); |
| } |
| } |
| |
| void adaptive_avg_pool2d_backward_out_cuda_template( |
| Tensor& gradInput, |
| const Tensor& gradOutput_, |
| const Tensor& input) |
| { |
| TensorArg grad_input_arg{ gradInput, "gradInput", 1 }, |
| grad_output_arg{ gradOutput_, "gradOutput_", 2 }, |
| input_arg{ input, "input", 3 }; |
| checkAllSameGPU(__func__, {grad_input_arg, grad_output_arg, input_arg}); |
| |
| switch (input.suggest_memory_format()) { |
| case at::MemoryFormat::ChannelsLast: { |
| // special case for tensor memory format in channels_last |
| TORCH_CHECK(input.ndimension() == 4, |
| "adaptive_avg_pool2d_backward_cuda(): Expected 4D tensor, but got ", input.ndimension()); |
| |
| int sizeB = input.size(0); |
| int sizeC = input.size(1); |
| int isizeH = input.size(2); |
| int isizeW = input.size(3); |
| |
| Tensor gradOutput = gradOutput_; |
| |
| int64_t ostrideB = gradOutput.stride(0); |
| int64_t ostrideC = gradOutput.stride(1); |
| int64_t ostrideH = gradOutput.stride(2); |
| int64_t ostrideW = gradOutput.stride(3); |
| |
| int osizeH = gradOutput.size(-2); |
| int osizeW = gradOutput.size(-1); |
| |
| // preserve channels_last stride on input tensor; |
| if (!gradInput.is_contiguous(at::MemoryFormat::ChannelsLast)) { |
| gradInput.as_strided_( |
| {sizeB, sizeC, isizeH, isizeW}, |
| {sizeC*isizeH*isizeW, 1, isizeW*sizeC, sizeC}); |
| } |
| |
| const int max_threads = std::min<int>( |
| at::cuda::getCurrentDeviceProperties()->maxThreadsPerBlock, CUDA_MAX_THREADS); |
| int* maxThreadsDim = at::cuda::getCurrentDeviceProperties()->maxThreadsDim; |
| int* maxGridSize = at::cuda::getCurrentDeviceProperties()->maxGridSize; |
| size_t sharedMemPerBlock = at::cuda::getCurrentDeviceProperties()->sharedMemPerBlock; |
| |
| // Launch kernel on input tensor elements. Logic behind launch config: |
| // input tensor size NCHW, strides NHWC; |
| // Launch on: |
| // N(C) -> grid.x (striding on C to reduce sh_mem usage) |
| // H -> grid.z * block.z |
| // W -> grid.y * block.y |
| // C -> block.x |
| // encourage larger block_y & block_z for better cache hits while maintaining |
| // a reasonable block_x for coalesced memory access; |
| int block_x = std::min<int>( |
| maxThreadsDim[0], std::min<int>(lastPow2(sizeC), at::cuda::warp_size())); |
| int block_y = std::min<int>( |
| maxThreadsDim[1], std::min<int>(lastPow2(isizeW), max_threads / block_x)); |
| int block_z = std::min<int>( |
| maxThreadsDim[2], std::min<int>(lastPow2(isizeH), max_threads / block_x / block_y)); |
| block_x = std::min<int>( |
| maxThreadsDim[0], std::min<int>(lastPow2(sizeC), max_threads / block_y / block_z)); |
| const dim3 block(block_x, block_y, block_z); |
| int kernel_stride_C = ceil_div(sizeC, block_x * 4); |
| int kernel_size_C = ceil_div(sizeC, block_x * kernel_stride_C); |
| |
| // Do NOT clip grid_x: striding over the batch dimension is not implemented |
| // in the kernel, although it could easily be added given the current kernel. |
| int grid_x = sizeB*kernel_stride_C; |
| // it's OK to clip grid_y & grid_z, as we block the two dimensions in the kernel; |
| int grid_y = std::min<int>( |
| maxGridSize[1], ceil_div(isizeW, block_y*BLOCK_STRIDE)); |
| int grid_z = std::min<int>( |
| maxGridSize[2], ceil_div(isizeH, block_z*BLOCK_STRIDE)); |
| const dim3 grid(grid_x, grid_y, grid_z); |
| |
| // We are dealing with a packed tensor here, so the max index is the same |
| // as numel. |
| // TODO: to really support input tensors large enough to go beyond int32, |
| // we will need to restrict our shared memory usage and adjust the launch |
| // config; |
| AT_ASSERT(input.numel() < std::numeric_limits<int32_t>::max()); |
| AT_DISPATCH_FLOATING_TYPES_AND2(kHalf, kBFloat16, |
| input.scalar_type(), "adaptive_avg_pool2d_backward_nhwc_cuda", [&] { |
| size_t shmem_size = (kernel_size_C * block_x * block_y * block_z + osizeH + osizeW) * sizeof(scalar_t) + 2 * isizeW * sizeof(int32_t); |
| AT_ASSERT(shmem_size <= sharedMemPerBlock); |
| adaptive_average_gradinput_nhwc<int32_t><<<grid, block, shmem_size, at::cuda::getCurrentCUDAStream()>>> ( |
| gradInput.data_ptr<scalar_t>(), |
| gradOutput.data_ptr<scalar_t>(), |
| sizeB, sizeC, isizeH, isizeW, osizeH, osizeW, |
| kernel_stride_C, kernel_size_C, |
| ostrideB, ostrideC, ostrideH, ostrideW); |
| C10_CUDA_KERNEL_LAUNCH_CHECK(); |
| } |
| ); |
| break; |
| } |
| case at::MemoryFormat::Contiguous: { |
| bool atomic = true; // suboptimal, but without atomic it doesn't pass the tests |
| |
| Tensor gradOutput = gradOutput_.contiguous(); |
| |
| int64_t sizeD = input.size(-3); |
| int64_t isizeH = input.size(-2); |
| int64_t isizeW = input.size(-1); |
| |
| int64_t osizeH = gradOutput.size(-2); |
| int64_t osizeW = gradOutput.size(-1); |
| |
| int64_t grid_x = sizeD; |
| if (input.ndimension() == 4) grid_x *= input.size(-4); |
| |
| //bool atomic = (isizeW%osizeW != 0) || (isizeH%osizeH != 0); |
| AT_DISPATCH_FLOATING_TYPES_AND2(kHalf, kBFloat16, |
| input.scalar_type(), "adaptive_avg_pool2d_backward_cuda", [&] { |
| scalar_t *gradOutput_data = gradOutput.data_ptr<scalar_t>(); |
| scalar_t *gradInput_data = gradInput.data_ptr<scalar_t>(); |
| |
| // cuda blocks & threads: |
| int blocksH = std::max((int)(16L / sizeD), 1); |
| dim3 blocks(grid_x, blocksH); |
| dim3 threads(32, 8); |
| |
| if(atomic) |
| { |
| // run updateGradInput kernel, accumulate gradients atomically |
| atomic_adaptive_average_gradinput <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>> ( |
| gradInput_data, gradOutput_data, |
| isizeH, isizeW, osizeH, osizeW); |
| C10_CUDA_KERNEL_LAUNCH_CHECK(); |
| } |
| else |
| { |
| // run updateGradInput kernel |
| adaptive_average_gradinput <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>> ( |
| gradInput_data, gradOutput_data, |
| isizeH, isizeW, osizeH, osizeW); |
| C10_CUDA_KERNEL_LAUNCH_CHECK(); |
| } |
| } |
| ); |
| break; |
| } |
| default: |
| TORCH_CHECK( |
| false, |
| "Unsupported memory format. Supports only ChannelsLast, Contiguous"); |
| |
| } |
| } |
| |
| } // namespace |
| |
| Tensor& adaptive_avg_pool2d_out_cuda( |
| const Tensor& input, |
| IntArrayRef output_size, |
| Tensor& output) |
| { |
| adaptive_avg_pool2d_out_cuda_template( |
| output, input, output_size); |
| return output; |
| } |
| |
| Tensor adaptive_avg_pool2d_cuda( |
| at::Tensor const& input, |
| IntArrayRef output_size) |
| { |
| auto output = at::empty({0}, input.options()); |
| adaptive_avg_pool2d_out_cuda_template( |
| output, input, output_size); |
| return output; |
| } |
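| // Typical call path (illustration only, assuming the standard ATen API): |
| //   auto x = at::randn({8, 64, 32, 32}, at::kCUDA); |
| //   auto y = at::adaptive_avg_pool2d(x, {7, 7}); // contiguous (NCHW) path |
| //   auto z = at::adaptive_avg_pool2d( |
| //       x.contiguous(at::MemoryFormat::ChannelsLast), {7, 7}); // NHWC path |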
| |
| Tensor& adaptive_avg_pool2d_backward_out_cuda( |
| Tensor& gradInput, |
| const Tensor& gradOutput, |
| const Tensor& input) |
| { |
| // See Note [Writing Nondeterministic Operations] |
| // Nondeterministic because of atomicAdd usage |
| globalContext().alertNotDeterministic("adaptive_avg_pool2d_backward_out_cuda"); |
| gradInput.resize_as_(input); |
| if (gradInput.numel() != 0) { |
| adaptive_avg_pool2d_backward_out_cuda_template( |
| gradInput, gradOutput, input); |
| } |
| return gradInput; |
| } |
| |
| Tensor adaptive_avg_pool2d_backward_cuda( |
| const Tensor& gradOutput, |
| const Tensor& input) |
| { |
| // See Note [Writing Nondeterministic Operations] |
| // Nondeterministic because of atomicAdd usage |
| globalContext().alertNotDeterministic("adaptive_avg_pool2d_backward_cuda"); |
| auto gradInput = at::zeros_like(input, LEGACY_CONTIGUOUS_MEMORY_FORMAT); |
| if (gradInput.numel() != 0) { |
| adaptive_avg_pool2d_backward_out_cuda_template( |
| gradInput, gradOutput, input); |
| } |
| return gradInput; |
| } |
| |
| } // at::native |
| } // at |
| |
| #undef BLOCK_STRIDE |
| #undef CUDA_MAX_THREADS |
| #undef START_IND |
| #undef END_IND |