/**
 * Copyright (c) 2016-present, Facebook, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include "caffe2/core/context_gpu.h"
#include "modules/detectron/smooth_l1_loss_op.h"
namespace caffe2 {
namespace {
template <typename T>
__global__ void SmoothL1Kernel(
    const int n, const T* in, T* out, T beta) {
  // f(x) = 0.5 * x^2 / beta   if |x| < beta
  //        |x| - 0.5 * beta   otherwise
  CUDA_1D_KERNEL_LOOP(index, n) {
    T val = in[index];
    T abs_val = c10::cuda::compat::abs(val);
    if (abs_val < beta) {
      out[index] = 0.5 * val * val / beta;
    } else {
      out[index] = abs_val - 0.5 * beta;
    }
  }
}
template <typename T>
__global__ void SmoothL1GradientKernel(
    const int n,
    const T* in,
    T* out,
    const T* d_loss_data,
    T norm,
    T beta) {
  // f'(x) = x / beta   if |x| < beta
  //       = sign(x)    otherwise
  // We also scale by norm * d_loss in this kernel for convenience.
  CUDA_1D_KERNEL_LOOP(index, n) {
    T val = in[index];
    T abs_val = c10::cuda::compat::abs(val);
    T d_loss = *d_loss_data;
    if (abs_val < beta) {
      out[index] = norm * d_loss * val / beta;
    } else {
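      // Branchless sign(val): (T(0) < val) - (val < T(0)) evaluates to
      // +1, -1, or 0 without introducing a divergent branch.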
      out[index] = norm * d_loss * ((T(0) < val) - (val < T(0)));
    }
  }
}
} // namespace
template<>
bool SmoothL1LossOp<float, CUDAContext>::RunOnDevice() {
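  // Inputs: Y_hat (predictions), Y (targets), alpha_in (element-wise weights
  // applied to the difference before the smooth L1), and alpha_out
  // (element-wise weights applied to the per-element loss afterwards).
  // Output: avg_loss, a scalar.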
  auto& Y_hat = Input(0);
  auto& Y = Input(1);
  auto& alpha_in = Input(2);
  auto& alpha_out = Input(3);
  int N = Y.dim32(0);
  // Require the same number of elements along axis 0 (batch size), but
  // otherwise don't care about the shape (just the number of elements)
  CAFFE_ENFORCE_EQ(Y_hat.dim32(0), Y.dim32(0),
      "Y_hat and Y must have the same number of elements along axis 0");
  CAFFE_ENFORCE_EQ(Y_hat.size(), Y.size(),
      "Y_hat and Y must have the same number of elements");
  CAFFE_ENFORCE_EQ(Y_hat.size(), alpha_in.size());
  CAFFE_ENFORCE_EQ(Y_hat.size(), alpha_out.size());
  auto* avg_loss = Output(0, vector<int64_t>(), at::dtype<float>());
  buff_.ResizeLike(Y);
  // Difference
  // d := y_hat - y
  math::Sub<float, CUDAContext>(
      Y.size(), Y_hat.data<float>(), Y.data<float>(),
      buff_.mutable_data<float>(), &context_);
  // Element-wise weighted difference (can be used to ignore or reweight
  // specific components)
  // d := alpha_in * (y_hat - y)
  math::Mul<float, CUDAContext>(
      buff_.size(), buff_.data<float>(), alpha_in.data<float>(),
      buff_.mutable_data<float>(), &context_);
  // Element-wise smooth L1 loss
  // l := SmoothL1(alpha_in * (y_hat - y))
  SmoothL1Kernel<float>
      <<<CAFFE_GET_BLOCKS(buff_.size()),
         CAFFE_CUDA_NUM_THREADS,
         0,
         context_.cuda_stream()>>>(
          buff_.size(), buff_.data<float>(), buff_.mutable_data<float>(),
          beta_);
  C10_CUDA_KERNEL_LAUNCH_CHECK();
  // Element-wise weighted smooth L1 loss (can be used to specify a per-element
  // loss weight)
  // l := alpha_out * SmoothL1(alpha_in * (y_hat - y))
  math::Mul<float, CUDAContext>(
      buff_.size(), buff_.data<float>(), alpha_out.data<float>(),
      buff_.mutable_data<float>(), &context_);
  // Sum of all losses
  // al := sum_i l_i
  float* avg_loss_data = avg_loss->mutable_data<float>();
  math::Sum<float, CUDAContext>(
      buff_.size(), buff_.data<float>(), avg_loss_data, &context_);
  // Average over the batch and apply the user-specified scale
  // al := scale / N * al
  math::Scale<float, float, CUDAContext>(
      1, scale_ / N, avg_loss_data, avg_loss_data, &context_);
  return true;
}
template<>
bool SmoothL1LossGradientOp<float, CUDAContext>::RunOnDevice() {
  auto& Y_hat = Input(0);
  auto& Y = Input(1);
  auto& alpha_in = Input(2);
  auto& alpha_out = Input(3);
  auto& d_avg_loss = Input(4); // gradient of net w.r.t. avg_loss ("gradOutput")
  // We intentionally don't compute gradients for Y, alpha_{in,out} since they
  // are not needed (can change in the future if desired)
  int N = Y.dim32(0);
  // Require the same number of elements along axis 0 (batch size), but
  // otherwise don't care about the shape (just the number of elements)
  CAFFE_ENFORCE_EQ(Y_hat.dim32(0), Y.dim32(0),
      "Y_hat and Y must have the same number of elements along axis 0");
  CAFFE_ENFORCE_EQ(Y_hat.size(), Y.size(),
      "Y_hat and Y must have the same number of elements");
  CAFFE_ENFORCE_EQ(Y_hat.size(), alpha_in.size());
  CAFFE_ENFORCE_EQ(Y_hat.size(), alpha_out.size());
  CAFFE_ENFORCE_EQ(d_avg_loss.size(), 1);
  auto* d_Y_hat = Output(
      0, Y_hat.sizes(), at::dtype<float>()); // gradient of net w.r.t. Y_hat ("gradInput")
  buff_.ResizeLike(Y);
  // Difference
  // d := y_hat - y
  math::Sub<float, CUDAContext>(
      Y.size(), Y_hat.data<float>(), Y.data<float>(),
      buff_.mutable_data<float>(), &context_);
  // Element-wise weighted difference (can be used to ignore or reweight
  // specific components)
  // d := alpha_in * (y_hat - y)
  math::Mul<float, CUDAContext>(
      buff_.size(), buff_.data<float>(), alpha_in.data<float>(),
      buff_.mutable_data<float>(), &context_);
  // d_Y_hat := scale / N * d_avg_loss * SmoothL1'(alpha_in * (y_hat - y))
  SmoothL1GradientKernel<float>
      <<<CAFFE_GET_BLOCKS(buff_.size()),
         CAFFE_CUDA_NUM_THREADS,
         0,
         context_.cuda_stream()>>>(
          buff_.size(), buff_.data<float>(), d_Y_hat->mutable_data<float>(),
          d_avg_loss.data<float>(), scale_ / N, beta_);
  C10_CUDA_KERNEL_LAUNCH_CHECK();
  // Element-wise scale by alpha_in and alpha_out
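  // (alpha_in enters via the chain rule through the inner weighting
  // alpha_in * (y_hat - y); alpha_out is the outer per-element loss weight.)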
  math::Mul<float, CUDAContext>(
      d_Y_hat->size(), d_Y_hat->data<float>(), alpha_in.data<float>(),
      d_Y_hat->mutable_data<float>(), &context_);
  math::Mul<float, CUDAContext>(
      d_Y_hat->size(), d_Y_hat->data<float>(), alpha_out.data<float>(),
      d_Y_hat->mutable_data<float>(), &context_);
  return true;
}
REGISTER_CUDA_OPERATOR(SmoothL1Loss,
                       SmoothL1LossOp<float, CUDAContext>);
REGISTER_CUDA_OPERATOR(SmoothL1LossGradient,
                       SmoothL1LossGradientOp<float, CUDAContext>);
} // namespace caffe2