| /** |
| * Copyright (c) 2016-present, Facebook, Inc. |
| * |
| * Licensed under the Apache License, Version 2.0 (the "License"); |
| * you may not use this file except in compliance with the License. |
| * You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| /** |
| * Adapted from https://github.com/torch/cunn/blob/master/lib/THCUNN/SpatialUpSamplingNearest.cu |
| * |
| * Copyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert) |
| * Copyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu) |
| * Copyright (c) 2011-2013 NYU (Clement Farabet) |
| * Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, |
| * Leon Bottou, Iain Melvin, Jason Weston) |
| * Copyright (c) 2006 Idiap Research Institute (Samy Bengio) |
| * Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert, |
| * Samy Bengio, Johnny Mariethoz) |
| * |
| * All rights reserved. |
| * |
| * Redistribution and use in source and binary forms, with or without |
| * modification, are permitted provided that the following conditions are met: |
| * |
| * 1. Redistributions of source code must retain the above copyright |
| * notice, this list of conditions and the following disclaimer. |
| * |
| * 2. Redistributions in binary form must reproduce the above copyright |
| * notice, this list of conditions and the following disclaimer in the |
| * documentation and/or other materials provided with the distribution. |
| * |
| * 3. Neither the names of NEC Laboratories American and IDIAP Research |
| * Institute nor the names of its contributors may be used to endorse or |
| * promote products derived from this software without specific prior |
| * written permission. |
| * |
| * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" |
| * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
| * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
| * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE |
| * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
| * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
| * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
| * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
| * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
| * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
| * POSSIBILITY OF SUCH DAMAGE. |
| */ |
| |
| |
| #include "caffe2/core/context_gpu.h" |
| #include "modules/detectron/upsample_nearest_op.h" |
| |
| namespace caffe2 { |
| |
| namespace { |
| __device__ int translate_idx(int ii, int d1, int d2, int d3, int scale_factor) { |
| int x, y, z, w; |
| w = ii % d3; |
| ii = ii/d3; |
| z = ii % d2; |
| ii = ii/d2; |
| y = ii % d1; |
| ii = ii/d1; |
| x = ii; |
| w = w/scale_factor; |
| z = z/scale_factor; |
| d2 /= scale_factor; |
| d3 /= scale_factor; |
| return (((x*d1+y)*d2)+z)*d3+w; |
| } |
| |
| __device__ int translate_idx_inv( |
| int ii, int d1, int d2, int d3, int scale_factor, int off_x, int off_y) { |
| int x, y, z, w; |
| w = ii % d3; |
| ii = ii/d3; |
| z = ii % d2; |
| ii = ii/d2; |
| y = ii % d1; |
| ii = ii/d1; |
| x = ii; |
| w = w*scale_factor+off_x; |
| z = z*scale_factor+off_y; |
| d2 *= scale_factor; |
| d3 *= scale_factor; |
| return (((x*d1+y)*d2)+z)*d3+w; |
| } |
| |
| __global__ void upscale(const float *input, float *output, long no_elements, |
| int scale_factor, int d1, int d2, int d3) { |
| long ii = threadIdx.x + blockDim.x * blockIdx.x; |
| ii += threadIdx.y + blockDim.y * (blockDim.x * gridDim.x) * blockIdx.y; |
| if (ii >= no_elements) return; |
| int ipidx = translate_idx(ii, d1, d2, d3, scale_factor); |
| output[ii]=input[ipidx]; |
| } |
| |
| __global__ void downscale(float *gradInput_data, const float *gradOutput_data, |
| long no_elements, int scale_factor, int d1, int d2, |
| int d3) { |
| long ii = threadIdx.x + blockDim.x * blockIdx.x; |
| ii += threadIdx.y + blockDim.y * (blockDim.x * gridDim.x) * blockIdx.y; |
| if (ii >= no_elements) return; |
| for (int i=0; i < scale_factor; i++){ |
| for(int j=0; j < scale_factor; j++){ |
| int ipidx = translate_idx_inv(ii, d1, d2, d3, scale_factor, i, j); |
| gradInput_data[ii] += gradOutput_data[ipidx]; |
| } |
| } |
| } |
| } // namespace |
| |
| template<> |
| bool UpsampleNearestOp<float, CUDAContext>::RunOnDevice() { |
| auto& X = Input(0); |
| auto* Y = Output(0); |
| |
| vector<int64_t> out_shape; |
| for (int i = 0; i < X.ndim(); ++i) { |
| out_shape.push_back(X.dim32(i)); |
| } |
| out_shape[X.ndim() - 1] *= scale_; |
| out_shape[X.ndim() - 2] *= scale_; |
| Y->Resize(out_shape); |
| |
| int d1; |
| int d2; |
| int d3; |
| if (X.ndim() == 3) { |
| d1 = Y->dim32(0); |
| d2 = Y->dim32(1); |
| d3 = Y->dim32(2); |
| } else { |
| d1 = Y->dim32(1); |
| d2 = Y->dim32(2); |
| d3 = Y->dim32(3); |
| } |
| long no_elements = Y->size(); |
| |
| const float *input_data = X.data<float>(); |
| float *output_data = Y->mutable_data<float>(); |
| |
| // cuda blocks & threads: |
| long nthreads = 256; |
| // Max number of blocks: http://en.wikipedia.org/wiki/CUDA |
| // 65535 for SM 2.x, 2^32 -1 for >= 3.0 |
| // TODO: When we move to SM 3.5 we should update this |
| long n_xblocks = min(max((int)ceil((float)no_elements / nthreads), 1), 65535); |
| long n_yblocks = (long)ceil( |
| (float)no_elements / (float)(n_xblocks * nthreads)); |
| CAFFE_ENFORCE(n_yblocks <= 65535); |
| dim3 blocks(n_xblocks, n_yblocks); |
| dim3 threads(nthreads); |
| |
| upscale<<<blocks, threads, 0, context_.cuda_stream()>>>( |
| input_data, output_data, no_elements, scale_, d1, d2, d3); |
| C10_CUDA_KERNEL_LAUNCH_CHECK(); |
| |
| return true; |
| } |
| |
| |
| template<> |
| bool UpsampleNearestGradientOp<float, CUDAContext>::RunOnDevice() { |
| auto& X = Input(0); // Original input to "forward" op |
| auto& dY = Input(1); // Gradient of net w.r.t. output of "forward" op |
| // (aka "gradOutput") |
| auto* dX = Output(0); // Gradient of net w.r.t. input to "forward" op |
| // (aka "gradInput") |
| |
| dX->ResizeLike(X); |
| float *gradInput_data = dX->mutable_data<float>(); |
| const float *gradOutput_data = dY.data<float>(); |
| |
| int d1; |
| int d2; |
| int d3; |
| if (dX->ndim() == 3) { |
| d1 = dX->dim32(0); |
| d2 = dX->dim32(1); |
| d3 = dX->dim32(2); |
| } else { |
| d1 = dX->dim32(1); |
| d2 = dX->dim32(2); |
| d3 = dX->dim32(3); |
| } |
| long no_elements = dX->size(); |
| |
| // cuda blocks & threads: |
| long nthreads = 256; |
| // Max number of blocks: http://en.wikipedia.org/wiki/CUDA |
| // 65535 for SM 2.x, 2^32 -1 for >= 3.0 |
| // TODO: When we move to SM 3.5 we should update this |
| long n_xblocks = min(max((int)ceil((float)no_elements / nthreads), 1), 65535); |
| long n_yblocks = (long)ceil( |
| (float)no_elements / (float)(n_xblocks * nthreads)); |
| CAFFE_ENFORCE(n_yblocks <= 65535); |
| dim3 blocks(n_xblocks, n_yblocks); |
| dim3 threads(nthreads); |
| |
| math::Set<float, CUDAContext>(no_elements, 0.f, gradInput_data, &context_); |
| downscale<<<blocks, threads, 0, context_.cuda_stream()>>>( |
| gradInput_data, gradOutput_data, no_elements, scale_, d1, d2, d3); |
| C10_CUDA_KERNEL_LAUNCH_CHECK(); |
| |
| return true; |
| } |
| |
| REGISTER_CUDA_OPERATOR(UpsampleNearest, |
| UpsampleNearestOp<float, CUDAContext>); |
| REGISTER_CUDA_OPERATOR(UpsampleNearestGradient, |
| UpsampleNearestGradientOp<float, CUDAContext>); |
| } // namespace caffe2 |