/**
* Copyright (c) 2016-present, Facebook, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <cfloat>
#include "caffe2/core/context_gpu.h"
#include "modules/detectron/softmax_focal_loss_op.h"
namespace caffe2 {
namespace {
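// One thread per (n, a, y, x) cell: computes a numerically stable softmax
// over that cell's num_classes channels of X and writes the probabilities
// to P (same layout as X).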
__global__ void SpatialSoftmaxKernel(const int N, const int A,
    const int H, const int W, const float* Xdata, float* Pdata,
    const int num_classes) {
  CUDA_1D_KERNEL_LOOP(index, N * A * H * W) {
    int D = num_classes * A;
    int x = index % W;
    int y = (index / W) % H;
    int a = (index / (W * H)) % A;
    int i = index / W / H / A;

    // Subtract max on each cell for numerical reasons
    float max_val = -FLT_MAX;
    for (int c = a * num_classes; c < (a + 1) * num_classes; ++c) {
      int idx = i * (H * W * D) + c * (H * W) + y * W + x;
      max_val = max(max_val, Xdata[idx]);
    }
    // Exponentiate
    float expsum = 0.0f;
    for (int c = a * num_classes; c < (a + 1) * num_classes; ++c) {
      int idx = i * (H * W * D) + c * (H * W) + y * W + x;
      float expx = exp(Xdata[idx] - max_val);
      Pdata[idx] = expx;
      expsum += expx;
    }
    // Normalize
    for (int c = a * num_classes; c < (a + 1) * num_classes; ++c) {
      int idx = i * (H * W * D) + c * (H * W) + y * W + x;
      Pdata[idx] /= expsum;
    }
  }
}
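
// Per-location focal loss:
//   FL(p_t) = -z * (1 - p_t)^gamma * log(p_t)
// where p_t is the softmax probability of the target class and
//   z = (1 - alpha) / Np  for background cells (label == 0),
//   z = alpha / Np        for foreground cells (label >= 1),
// with Np the foreground count (clamped to >= 1). Cells with label < 0
// are ignored and contribute zero loss.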
__global__ void SoftmaxFocalLossKernel(
    const int N, const int A, const int H, const int W,
    const float* Pdata, const int* targets, float* losses,
    const float* weight_pos, const float gamma, const float alpha,
    const int num_classes) {
  CUDA_1D_KERNEL_LOOP(i, N * A * H * W) {
    int D = A * num_classes;
    int x = i % W;
    int y = (i / W) % H;
    int a = (i / (W * H)) % A;
    int n = i / (W * H * A);
    const int label = targets[i];

    float Np = c10::cuda::compat::max(weight_pos[0], static_cast<float>(1.0));
    float z = (label == 0) * (1 - alpha) / Np +
              (label >= 1) * alpha / Np;

    losses[i] = 0.0;
    if (label >= 0) {
      int offset = a * num_classes;
      int idx = n * (H * W * D) + (offset + label) * (H * W) + y * W + x;
      losses[i] =
          -(pow(1.0f - Pdata[idx], gamma) *
            log(c10::cuda::compat::max(Pdata[idx], FLT_MIN))) * z;
    }
  }
}
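
// Per-anchor gradient weight: buff = dFL/dp_t * p_t, i.e.
//   buff = z * (-(1 - p_t)^gamma
//               + gamma * (1 - p_t)^(gamma - 1) * p_t * log(p_t))
// Folding one factor of p_t from the softmax Jacobian into buff lets the
// next kernel compute dFL/dx_c = buff * (1{c == label} - p_c).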
__global__ void SoftmaxFocalLossGradientWeightKernel(
    const int N, const int A, const int H, const int W,
    const float* Pdata, const int* targets, float* buff,
    const float* weight_pos, const float gamma, const float alpha,
    const int num_classes) {
  CUDA_1D_KERNEL_LOOP(i, N * A * H * W) {
    int D = A * num_classes;
    int x = i % W;
    int y = (i / W) % H;
    int a = (i / (W * H)) % A;
    int n = i / (W * H * A);
    const int label = targets[i];

    float Np = c10::cuda::compat::max(weight_pos[0], static_cast<float>(1.0));
    float z = (label == 0) * (1 - alpha) / Np +
              (label >= 1) * alpha / Np;

    buff[i] = 0.0;
    if (label >= 0) {
      int offset = a * num_classes;
      int idx = n * (H * W * D) + (offset + label) * (H * W) + y * W + x;
      float p = Pdata[idx];
      float onemp = 1.0f - p;
      buff[i] =
          (-pow(onemp, gamma) +
           gamma * pow(onemp, gamma - 1) * p *
               log(c10::cuda::compat::max(p, FLT_MIN))) * z;
    }
  }
}
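
// Broadcasts the per-anchor weight over the anchor's num_classes logit
// channels through the softmax Jacobian:
//   dX[n, a * num_classes + c, y, x] =
//       d_loss * buff * (1{c == label} - p_c)
// Ignored cells (label < 0) get a zero gradient via the c1 mask.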
__global__ void SoftmaxFocalLossGradientKernel(
    const int N, const int D, const int H, const int W,
    const float* Pdata, const int* targets, const float* buff,
    const float* d_loss_data, float* dX, const int num_classes) {
  CUDA_1D_KERNEL_LOOP(i, N * D * H * W) {
    int A = D / num_classes;
    int x = i % W;
    int y = (i / W) % H;
    int d = (i / (W * H)) % D;
    int a = d / num_classes;
    int c = d % num_classes;
    int n = i / (W * H * D);
    float d_loss = *d_loss_data;

    int ind = n * (H * W * A) + a * (H * W) + y * W + x;
    const int label = targets[ind];
    float c1 = (label >= 0) * 1.0;
    float c2 = (label == c) * 1.0;
    dX[i] = c1 * d_loss * buff[ind] * (c2 - Pdata[i]);
  }
}
} // namespace
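
// Forward pass: runs the spatial softmax, evaluates the per-location focal
// losses, and reduces them to a single scaled scalar. The softmax output P
// is exposed as a second output so the gradient op can reuse it.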
template <>
bool SoftmaxFocalLossOp<float, CUDAContext>::RunOnDevice() {
  auto& X = Input(0);   // Logits
  auto& T = Input(1);   // Labels
  auto& wp = Input(2);  // Number of foreground examples

  TORCH_DCHECK_EQ(X.ndim(), 4);
  int N = X.dim32(0);
  int D = X.dim32(1);
  int H = X.dim32(2);
  int W = X.dim32(3);
  int A = D / num_classes_;

  ReinitializeTensor(
      &losses_, {N * A * H * W}, at::dtype<float>().device(CUDA));
  // Softmax probability, reused by the gradient op
  auto* P = Output(1, {N * D * H * W}, at::dtype<float>());
  // Average loss as (scalar) output
  auto* avg_loss = Output(0, vector<int64_t>(), at::dtype<float>());
  math::Set<float, CUDAContext>(
      avg_loss->size(), 0.f, avg_loss->mutable_data<float>(), &context_);
  math::Set<float, CUDAContext>(
      P->size(), 0.f, P->mutable_data<float>(), &context_);
  math::Set<float, CUDAContext>(
      losses_.size(), 0.f, losses_.mutable_data<float>(), &context_);

  const float* Xdata = X.data<float>();
  const float* Wdata = wp.data<float>();

  // Spatial softmax over each anchor's classes
  SpatialSoftmaxKernel
      <<<CAFFE_GET_BLOCKS(N * A * H * W), CAFFE_CUDA_NUM_THREADS,
         0, context_.cuda_stream()>>>(
      N, A, H, W, Xdata, P->mutable_data<float>(), num_classes_);
  C10_CUDA_KERNEL_LAUNCH_CHECK();

  // Compute the loss for each (x, y) location
  const int* Tdata = T.data<int>();
  SoftmaxFocalLossKernel
      <<<CAFFE_GET_BLOCKS(N * A * H * W), CAFFE_CUDA_NUM_THREADS,
         0, context_.cuda_stream()>>>(
      N, A, H, W, P->data<float>(), Tdata, losses_.mutable_data<float>(),
      Wdata, gamma_, alpha_, num_classes_);
  C10_CUDA_KERNEL_LAUNCH_CHECK();

  // Sum the per-location losses and apply the loss scale
  float* avg_loss_data = avg_loss->mutable_data<float>();
  math::Sum<float, CUDAContext>(
      losses_.size(), losses_.data<float>(), avg_loss_data, &context_);
  math::Scale<float, float, CUDAContext>(
      1, scale_, avg_loss_data, avg_loss_data, &context_);

  return true;
}
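
// Backward pass: the first kernel computes a per-anchor weight from P and
// the targets; the second expands it across each anchor's num_classes
// channels and applies the incoming d_avg_loss and loss scale.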
template <>
bool SoftmaxFocalLossGradientOp<float, CUDAContext>::RunOnDevice() {
  auto& X = Input(0);           // Logits
  auto& T = Input(1);           // Labels
  auto& wp = Input(2);          // Number of foreground examples
  auto& P = Input(3);           // Softmax probability
  auto& d_avg_loss = Input(4);  // Gradient of the scalar loss

  int N = X.dim32(0);
  int D = X.dim32(1);
  int H = X.dim32(2);
  int W = X.dim32(3);
  int A = D / num_classes_;

  ReinitializeTensor(
      &buff_, {N * A * H * W}, at::dtype<float>().device(CUDA));
  auto* dX = Output(0, X.sizes(), at::dtype<float>()); // Gradient w.r.t. logits

  const int* Tdata = T.data<int>();
  const float* Pdata = P.data<float>();
  const float* Wdata = wp.data<float>();

  // Compute the per-anchor weight for the gradients
  SoftmaxFocalLossGradientWeightKernel
      <<<CAFFE_GET_BLOCKS(N * A * H * W), CAFFE_CUDA_NUM_THREADS,
         0, context_.cuda_stream()>>>(
      N, A, H, W, Pdata, Tdata, buff_.mutable_data<float>(),
      Wdata, gamma_, alpha_, num_classes_);
  C10_CUDA_KERNEL_LAUNCH_CHECK();

  // Compute the gradient with the weights
  const float* Bdata = buff_.data<float>();
  SoftmaxFocalLossGradientKernel
      <<<CAFFE_GET_BLOCKS(N * D * H * W), CAFFE_CUDA_NUM_THREADS,
         0, context_.cuda_stream()>>>(
      N, D, H, W, Pdata, Tdata, Bdata, d_avg_loss.data<float>(),
      dX->mutable_data<float>(), num_classes_);
  C10_CUDA_KERNEL_LAUNCH_CHECK();

  // Apply the same loss scale as the forward pass
  math::Scale<float, float, CUDAContext>(
      dX->size(),
      scale_,
      dX->data<float>(),
      dX->mutable_data<float>(),
      &context_);

  return true;
}
REGISTER_CUDA_OPERATOR(SoftmaxFocalLoss,
                       SoftmaxFocalLossOp<float, CUDAContext>);
REGISTER_CUDA_OPERATOR(SoftmaxFocalLossGradient,
                       SoftmaxFocalLossGradientOp<float, CUDAContext>);
} // namespace caffe2