/**
 * Copyright (c) 2016-present, Facebook, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "caffe2/core/context_gpu.h"
#include "caffe2/core/operator.h"
#include "caffe2/utils/math.h"
#include "modules/detectron/spatial_narrow_as_op.h"

namespace caffe2 {

namespace {
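// CopyKernel copies, for each (n, c), the top-left out_H x out_W window of the
// larger in_H x in_W input plane into the smaller output plane. One thread
// handles one output element (out_index is simply the loop index).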
template <typename T>
__global__ void CopyKernel(
    const int N,
    const int C,
    const int in_H,
    const int in_W,
    const int out_H,
    const int out_W,
    const T* in_data,
    T* out_data) {
  CUDA_1D_KERNEL_LOOP(index, N * C * out_H * out_W) {
    int w = index % out_W;
    int h = (index / out_W) % out_H;
    int c = (index / out_W / out_H) % C;
    int n = (index / out_W / out_H / C);
    int in_index = n * C * in_H * in_W + c * in_H * in_W + h * in_W + w;
    int out_index = n * C * out_H * out_W + c * out_H * out_W + h * out_W + w;
    out_data[out_index] = in_data[in_index];
  }
}

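// CopyGradientKernel is the backward counterpart: it scatters each element of
// the narrowed gradient (in_H x in_W) into the top-left window of the
// full-size gradient plane (out_H x out_W); elements outside stay untouched.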
template <typename T>
__global__ void CopyGradientKernel(
    const int N,
    const int C,
    const int in_H,
    const int in_W,
    const int out_H,
    const int out_W,
    const T* in_data,
    T* out_data) {
  CUDA_1D_KERNEL_LOOP(index, N * C * in_H * in_W) {
    int w = index % in_W;
    int h = (index / in_W) % in_H;
    int c = (index / in_W / in_H) % C;
    int n = (index / in_W / in_H / C);
    int in_index = n * C * in_H * in_W + c * in_H * in_W + h * in_W + w;
    int out_index = n * C * out_H * out_W + c * out_H * out_W + h * out_W + w;
    out_data[out_index] = in_data[in_index];
  }
}
} // namespace

template <>
bool SpatialNarrowAsOp<CUDAContext>::RunOnDevice() {
  return DispatchHelper<TensorTypes<float, int32_t>>::call(this, Input(0));
}

template <>
template <typename T>
bool SpatialNarrowAsOp<CUDAContext>::DoRunWithType() {
  // Narrows input 0 (A) spatially to match input 1 (B)
  auto& A = Input(0);
  auto& B = Input(1);

  CAFFE_ENFORCE_EQ(A.dim32(0), B.dim32(0), "Input dim 0 must be equal.");
  std::vector<int64_t> sizes;
  if (A.ndim() == B.ndim()) {
    CAFFE_ENFORCE_EQ(A.dim32(1), B.dim32(1), "Input dim 1 must be equal.");
    CAFFE_ENFORCE_GE(
        A.dim32(2), B.dim32(2), "Input 0 height must be >= input 1 height.");
    CAFFE_ENFORCE_GE(
        A.dim32(3), B.dim32(3), "Input 0 width must be >= input 1 width.");
    sizes = B.sizes().vec();
  } else {
    // For (N, H, W) case
    CAFFE_ENFORCE_EQ(A.ndim() - 1, B.ndim(), "Dimension mismatch.");
    CAFFE_ENFORCE_GE(
        A.dim32(2), B.dim32(1), "Input 0 height must be >= input 1 height.");
    CAFFE_ENFORCE_GE(
        A.dim32(3), B.dim32(2), "Input 0 width must be >= input 1 width.");
    sizes = {A.dim32(0), A.dim32(1), B.dim32(1), B.dim32(2)};
  }
  auto* C = Output(0, sizes, at::dtype<T>());
  int out_width = C->dim32(3);
  int out_height = C->dim32(2);
  int in_width = A.dim32(3);
  int in_height = A.dim32(2);

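  // One thread per element of the narrowed output C; each thread copies the
  // matching element from A's top-left out_height x out_width window.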
  CopyKernel<T><<<
      CAFFE_GET_BLOCKS(C->size()),
      CAFFE_CUDA_NUM_THREADS,
      0,
      context_.cuda_stream()>>>(
      C->dim32(0),
      C->dim32(1),
      in_height,
      in_width,
      out_height,
      out_width,
      A.template data<T>(),
      C->template mutable_data<T>());
  C10_CUDA_KERNEL_LAUNCH_CHECK();

  return true;
}

template <>
bool SpatialNarrowAsGradientOp<CUDAContext>::RunOnDevice() {
  return DispatchHelper<TensorTypes<float, int32_t>>::call(this, Input(0));
}

template <>
template <typename T>
bool SpatialNarrowAsGradientOp<CUDAContext>::DoRunWithType() {
  auto& A = Input(0);
  // B is unused here; it only determines the narrowed shape in the forward pass
  auto& B = Input(1);
  auto& dC = Input(2); // Gradient of net w.r.t. output of forward op
  // Gradient of net w.r.t. input to forward op
  auto* dA = Output(0, A.sizes(), at::dtype<T>());

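  // Zero-fill dA so positions outside the narrowed window receive no gradient,
  // then scatter dC into dA's top-left in_height x in_width window.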
  math::Set<T, CUDAContext>(
      dA->size(), T(0), dA->template mutable_data<T>(), &context_);
  int out_width = dA->dim32(3);
  int out_height = dA->dim32(2);
  int in_width = dC.dim32(3);
  int in_height = dC.dim32(2);

  CopyGradientKernel<T><<<
      CAFFE_GET_BLOCKS(dC.size()),
      CAFFE_CUDA_NUM_THREADS,
      0,
      context_.cuda_stream()>>>(
      dA->dim32(0),
      dA->dim32(1),
      in_height,
      in_width,
      out_height,
      out_width,
      dC.template data<T>(),
      dA->template mutable_data<T>());
  C10_CUDA_KERNEL_LAUNCH_CHECK();

  return true;
}

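// Register the CUDA implementations. As used above, SpatialNarrowAs consumes
// (A, B) and produces C with A's leading dimensions and B's spatial extent;
// SpatialNarrowAsGradient consumes (A, B, dC) and produces dA with A's shape.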
REGISTER_CUDA_OPERATOR(SpatialNarrowAs, SpatialNarrowAsOp<CUDAContext>);
REGISTER_CUDA_OPERATOR(
    SpatialNarrowAsGradient,
    SpatialNarrowAsGradientOp<CUDAContext>);
} // namespace caffe2