/**
* Copyright (c) 2016-present, Facebook, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* Adapted from https://github.com/torch/cunn/blob/master/lib/THCUNN/SpatialUpSamplingNearest.cu
*
* Copyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert)
* Copyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu)
* Copyright (c) 2011-2013 NYU (Clement Farabet)
* Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert,
* Leon Bottou, Iain Melvin, Jason Weston)
* Copyright (c) 2006 Idiap Research Institute (Samy Bengio)
* Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert,
* Samy Bengio, Johnny Mariethoz)
*
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* 3. Neither the names of NEC Laboratories American and IDIAP Research
* Institute nor the names of its contributors may be used to endorse or
* promote products derived from this software without specific prior
* written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include "caffe2/core/context_gpu.h"
#include "modules/detectron/upsample_nearest_op.h"
namespace caffe2 {
namespace {
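// Maps a flat index `ii` into the upsampled tensor, viewed as
// (outer, d1, d2, d3) with d2/d3 the upsampled spatial dims, to the flat
// index of its nearest-neighbor source element in the original tensor.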
__device__ int translate_idx(int ii, int d1, int d2, int d3, int scale_factor) {
int x, y, z, w;
w = ii % d3;
ii = ii/d3;
z = ii % d2;
ii = ii/d2;
y = ii % d1;
ii = ii/d1;
x = ii;
w = w/scale_factor;
z = z/scale_factor;
d2 /= scale_factor;
d3 /= scale_factor;
return (((x*d1+y)*d2)+z)*d3+w;
}
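// Inverse mapping used by the backward pass: maps a flat index `ii` into the
// smaller gradInput tensor, viewed as (outer, d1, d2, d3), plus an offset
// (off_x, off_y) inside the scale_factor x scale_factor upsampling window, to
// the flat index of the corresponding gradOutput element.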
__device__ int translate_idx_inv(
int ii, int d1, int d2, int d3, int scale_factor, int off_x, int off_y) {
int x, y, z, w;
w = ii % d3;
ii = ii/d3;
z = ii % d2;
ii = ii/d2;
y = ii % d1;
ii = ii/d1;
x = ii;
w = w*scale_factor+off_x;
z = z*scale_factor+off_y;
d2 *= scale_factor;
d3 *= scale_factor;
return (((x*d1+y)*d2)+z)*d3+w;
}
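// Forward kernel: one thread per output element; each thread copies the
// nearest-neighbor source value from `input` into `output`.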
__global__ void upscale(const float *input, float *output, long no_elements,
int scale_factor, int d1, int d2, int d3) {
long ii = threadIdx.x + blockDim.x * blockIdx.x;
ii += threadIdx.y + blockDim.y * (blockDim.x * gridDim.x) * blockIdx.y;
if (ii >= no_elements) return;
int ipidx = translate_idx(ii, d1, d2, d3, scale_factor);
  output[ii] = input[ipidx];
}
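// Backward kernel: one thread per gradInput element; each thread sums the
// gradients of the scale_factor x scale_factor output elements that were
// copied from it in the forward pass.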
__global__ void downscale(float *gradInput_data, const float *gradOutput_data,
long no_elements, int scale_factor, int d1, int d2,
int d3) {
long ii = threadIdx.x + blockDim.x * blockIdx.x;
ii += threadIdx.y + blockDim.y * (blockDim.x * gridDim.x) * blockIdx.y;
if (ii >= no_elements) return;
  for (int i = 0; i < scale_factor; i++) {
    for (int j = 0; j < scale_factor; j++) {
int ipidx = translate_idx_inv(ii, d1, d2, d3, scale_factor, i, j);
gradInput_data[ii] += gradOutput_data[ipidx];
}
}
}
} // namespace
template<>
bool UpsampleNearestOp<float, CUDAContext>::RunOnDevice() {
auto& X = Input(0);
auto* Y = Output(0);
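  // Y keeps X's shape except that the two trailing (spatial) dims are
  // multiplied by scale_.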
vector<int64_t> out_shape;
for (int i = 0; i < X.ndim(); ++i) {
out_shape.push_back(X.dim32(i));
}
out_shape[X.ndim() - 1] *= scale_;
out_shape[X.ndim() - 2] *= scale_;
Y->Resize(out_shape);
int d1;
int d2;
int d3;
if (X.ndim() == 3) {
d1 = Y->dim32(0);
d2 = Y->dim32(1);
d3 = Y->dim32(2);
} else {
d1 = Y->dim32(1);
d2 = Y->dim32(2);
d3 = Y->dim32(3);
}
long no_elements = Y->size();
const float *input_data = X.data<float>();
float *output_data = Y->mutable_data<float>();
// cuda blocks & threads:
long nthreads = 256;
// Max number of blocks: http://en.wikipedia.org/wiki/CUDA
// 65535 for SM 2.x, 2^32 -1 for >= 3.0
// TODO: When we move to SM 3.5 we should update this
long n_xblocks = min(max((int)ceil((float)no_elements / nthreads), 1), 65535);
long n_yblocks = (long)ceil(
(float)no_elements / (float)(n_xblocks * nthreads));
CAFFE_ENFORCE(n_yblocks <= 65535);
dim3 blocks(n_xblocks, n_yblocks);
dim3 threads(nthreads);
upscale<<<blocks, threads, 0, context_.cuda_stream()>>>(
input_data, output_data, no_elements, scale_, d1, d2, d3);
C10_CUDA_KERNEL_LAUNCH_CHECK();
return true;
}
template<>
bool UpsampleNearestGradientOp<float, CUDAContext>::RunOnDevice() {
auto& X = Input(0); // Original input to "forward" op
auto& dY = Input(1); // Gradient of net w.r.t. output of "forward" op
// (aka "gradOutput")
auto* dX = Output(0); // Gradient of net w.r.t. input to "forward" op
// (aka "gradInput")
dX->ResizeLike(X);
float *gradInput_data = dX->mutable_data<float>();
const float *gradOutput_data = dY.data<float>();
int d1;
int d2;
int d3;
if (dX->ndim() == 3) {
d1 = dX->dim32(0);
d2 = dX->dim32(1);
d3 = dX->dim32(2);
} else {
d1 = dX->dim32(1);
d2 = dX->dim32(2);
d3 = dX->dim32(3);
}
long no_elements = dX->size();
// cuda blocks & threads:
long nthreads = 256;
// Max number of blocks: http://en.wikipedia.org/wiki/CUDA
// 65535 for SM 2.x, 2^32 -1 for >= 3.0
// TODO: When we move to SM 3.5 we should update this
long n_xblocks = min(max((int)ceil((float)no_elements / nthreads), 1), 65535);
long n_yblocks = (long)ceil(
(float)no_elements / (float)(n_xblocks * nthreads));
CAFFE_ENFORCE(n_yblocks <= 65535);
dim3 blocks(n_xblocks, n_yblocks);
dim3 threads(nthreads);
math::Set<float, CUDAContext>(no_elements, 0.f, gradInput_data, &context_);
downscale<<<blocks, threads, 0, context_.cuda_stream()>>>(
gradInput_data, gradOutput_data, no_elements, scale_, d1, d2, d3);
C10_CUDA_KERNEL_LAUNCH_CHECK();
return true;
}
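// Example (illustrative, not part of the original file): with scale_ == 2 and
// an input X of shape (1, 3, 4, 4), the forward op produces Y of shape
// (1, 3, 8, 8) with Y(n, c, y, x) == X(n, c, y / 2, x / 2); the gradient op
// sums each 2 x 2 window of dY back into the matching element of dX.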
REGISTER_CUDA_OPERATOR(UpsampleNearest,
UpsampleNearestOp<float, CUDAContext>);
REGISTER_CUDA_OPERATOR(UpsampleNearestGradient,
UpsampleNearestGradientOp<float, CUDAContext>);
} // namespace caffe2