/**
 * Copyright (c) 2016-present, Facebook, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include "caffe2/core/context_gpu.h"
#include "modules/detectron/smooth_l1_loss_op.h"
namespace caffe2 {
namespace {
template <typename T>
__global__ void SmoothL1Kernel(
    const int n, const T* in, T* out, T beta) {
  // f(x) = 0.5 * x^2 / beta   if |x| < beta
  //        |x| - 0.5 * beta   otherwise
  CUDA_1D_KERNEL_LOOP(index, n) {
    T val = in[index];
    T abs_val = c10::cuda::compat::abs(val);
    if (abs_val < beta) {
      out[index] = 0.5 * val * val / beta;
    } else {
      out[index] = abs_val - 0.5 * beta;
    }
  }
}
template <typename T>
__global__ void SmoothL1GradientKernel(
    const int n,
    const T* in,
    T* out,
    const T* d_loss_data,
    T norm,
    T beta) {
  // f'(x) = x / beta   if |x| < beta
  //       = sign(x)    otherwise
  // We also scale by norm * d_loss in this kernel for convenience.
  CUDA_1D_KERNEL_LOOP(index, n) {
    T val = in[index];
    T abs_val = c10::cuda::compat::abs(val);
    T d_loss = *d_loss_data;
    if (abs_val < beta) {
      out[index] = norm * d_loss * val / beta;
    } else {
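      // Branchless sign(val): (T(0) < val) - (val < T(0)) evaluates to
      // +1, -1, or 0 without introducing a divergent branch.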
      out[index] = norm * d_loss * ((T(0) < val) - (val < T(0)));
    }
  }
}
} // namespace
template<>
bool SmoothL1LossOp<float, CUDAContext>::RunOnDevice() {
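  // Inputs: Y_hat (predictions), Y (targets), alpha_in (element-wise weights
  // applied to the difference before the smooth L1), and alpha_out
  // (element-wise weights applied to the per-element loss afterwards).
  // Output: avg_loss, a scalar.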
  auto& Y_hat = Input(0);
  auto& Y = Input(1);
  auto& alpha_in = Input(2);
  auto& alpha_out = Input(3);
  int N = Y.dim32(0);
  // Require the same number of elements along axis 0 (batch size), but
  // otherwise don't care about the shape (just the number of elements)
  CAFFE_ENFORCE_EQ(Y_hat.dim32(0), Y.dim32(0),
      "Y_hat and Y must have the same number of elements along axis 0");
  CAFFE_ENFORCE_EQ(Y_hat.size(), Y.size(),
      "Y_hat and Y must have the same number of elements");
  CAFFE_ENFORCE_EQ(Y_hat.size(), alpha_in.size());
  CAFFE_ENFORCE_EQ(Y_hat.size(), alpha_out.size());
  auto* avg_loss = Output(0, vector<int64_t>(), at::dtype<float>());
  buff_.ResizeLike(Y);
  // Difference
  // d := y_hat - y
  math::Sub<float, CUDAContext>(
      Y.size(), Y_hat.data<float>(), Y.data<float>(),
      buff_.mutable_data<float>(), &context_);
  // Element-wise weighted difference (can be used to ignore or reweight
  // specific components)
  // d := alpha_in * (y_hat - y)
  math::Mul<float, CUDAContext>(
      buff_.size(), buff_.data<float>(), alpha_in.data<float>(),
      buff_.mutable_data<float>(), &context_);
  // Element-wise smooth L1 loss
  // l := SmoothL1(alpha_in * (y_hat - y))
  SmoothL1Kernel<float>
      <<<CAFFE_GET_BLOCKS(buff_.size()),
         CAFFE_CUDA_NUM_THREADS,
         0,
         context_.cuda_stream()>>>(
          buff_.size(), buff_.data<float>(), buff_.mutable_data<float>(),
          beta_);
  C10_CUDA_KERNEL_LAUNCH_CHECK();
  // Element-wise weighted smooth L1 loss (can be used to specify a per-element
  // loss weight)
  // l := alpha_out * SmoothL1(alpha_in * (y_hat - y))
  math::Mul<float, CUDAContext>(
      buff_.size(), buff_.data<float>(), alpha_out.data<float>(),
      buff_.mutable_data<float>(), &context_);
  // Sum of all losses
  // al := sum_i l_i
  float* avg_loss_data = avg_loss->mutable_data<float>();
  math::Sum<float, CUDAContext>(
      buff_.size(), buff_.data<float>(), avg_loss_data, &context_);
  // Average over the batch and apply the user-specified scale
  // al := scale / N * al
  math::Scale<float, float, CUDAContext>(
      1, scale_ / N, avg_loss_data, avg_loss_data, &context_);
  return true;
}
template<>
bool SmoothL1LossGradientOp<float, CUDAContext>::RunOnDevice() {
  auto& Y_hat = Input(0);
  auto& Y = Input(1);
  auto& alpha_in = Input(2);
  auto& alpha_out = Input(3);
  auto& d_avg_loss = Input(4); // gradient of net w.r.t. avg_loss ("gradOutput")
  // We intentionally don't compute gradients for Y, alpha_{in,out} since they
  // are not needed (can change in the future if desired)
  int N = Y.dim32(0);
  // Require the same number of elements along axis 0 (batch size), but
  // otherwise don't care about the shape (just the number of elements)
  CAFFE_ENFORCE_EQ(Y_hat.dim32(0), Y.dim32(0),
      "Y_hat and Y must have the same number of elements along axis 0");
  CAFFE_ENFORCE_EQ(Y_hat.size(), Y.size(),
      "Y_hat and Y must have the same number of elements");
  CAFFE_ENFORCE_EQ(Y_hat.size(), alpha_in.size());
  CAFFE_ENFORCE_EQ(Y_hat.size(), alpha_out.size());
  CAFFE_ENFORCE_EQ(d_avg_loss.size(), 1);
  auto* d_Y_hat = Output(
      0, Y_hat.sizes(), at::dtype<float>()); // gradient of net w.r.t. Y_hat ("gradInput")
  buff_.ResizeLike(Y);
  // Difference
  // d := y_hat - y
  math::Sub<float, CUDAContext>(
      Y.size(), Y_hat.data<float>(), Y.data<float>(),
      buff_.mutable_data<float>(), &context_);
  // Element-wise weighted difference (can be used to ignore or reweight
  // specific components)
  // d := alpha_in * (y_hat - y)
  math::Mul<float, CUDAContext>(
      buff_.size(), buff_.data<float>(), alpha_in.data<float>(),
      buff_.mutable_data<float>(), &context_);
  // d_Y_hat := scale / N * d_avg_loss * SmoothL1'(alpha_in * (y_hat - y))
  SmoothL1GradientKernel<float>
      <<<CAFFE_GET_BLOCKS(buff_.size()),
         CAFFE_CUDA_NUM_THREADS,
         0,
         context_.cuda_stream()>>>(
          buff_.size(), buff_.data<float>(), d_Y_hat->mutable_data<float>(),
          d_avg_loss.data<float>(), scale_ / N, beta_);
  C10_CUDA_KERNEL_LAUNCH_CHECK();
  // Element-wise scale by alpha_in and alpha_out
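  // (alpha_in enters via the chain rule through the inner weighting
  // alpha_in * (y_hat - y); alpha_out is the outer per-element loss weight.)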
  math::Mul<float, CUDAContext>(
      d_Y_hat->size(), d_Y_hat->data<float>(), alpha_in.data<float>(),
      d_Y_hat->mutable_data<float>(), &context_);
  math::Mul<float, CUDAContext>(
      d_Y_hat->size(), d_Y_hat->data<float>(), alpha_out.data<float>(),
      d_Y_hat->mutable_data<float>(), &context_);
  return true;
}
REGISTER_CUDA_OPERATOR(SmoothL1Loss,
                       SmoothL1LossOp<float, CUDAContext>);
REGISTER_CUDA_OPERATOR(SmoothL1LossGradient,
                       SmoothL1LossGradientOp<float, CUDAContext>);
} // namespace caffe2