/**
* Copyright (c) 2016-present, Facebook, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <cfloat>

#include "caffe2/core/context_gpu.h"
#include "modules/detectron/group_spatial_softmax_op.h"

namespace caffe2 {

namespace {
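
// Input layout (inferred from the index arithmetic below): X is an
// (N, D, H, W) tensor with D = A * num_classes, i.e. A groups of
// num_classes contiguous channels. Each kernel decomposes a flat index
// into (n, a, y, x) and walks the channel range
// [a * num_classes, (a + 1) * num_classes) of that group.
// For example, with A = 2 and num_classes = 3 (so D = 6), channels
// {0, 1, 2} form group 0 and channels {3, 4, 5} form group 1; a softmax
// is computed over each group independently at every spatial location.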
__global__ void GroupSpatialSoftmaxKernel(const int num, const int A,
    const int W, const int H, const float* Xdata, float* Pdata,
    const int num_classes) {
  // Loop through labels (N x A x H x W)
  CUDA_1D_KERNEL_LOOP(index, num * A * H * W) {
    int D = num_classes * A;
    int x = index % W;
    int y = (index / W) % H;
    int a = (index / (W * H)) % A;
    int i = index / W / H / A;

    // Subtract max on each cell for numerical reasons
    float max_val = -FLT_MAX;
    for (int c = a * num_classes; c < (a + 1) * num_classes; ++c) {
      int idx = i * (H * W * D) + c * (H * W) + y * W + x;
      max_val = max(max_val, Xdata[idx]);
    }
    // Exponentiate
    float expsum = 0.0f;
    for (int c = a * num_classes; c < (a + 1) * num_classes; ++c) {
      int idx = i * (H * W * D) + c * (H * W) + y * W + x;
      float expx = exp(Xdata[idx] - max_val);
      Pdata[idx] = expx;
      expsum += expx;
    }
    // Normalize
    for (int c = a * num_classes; c < (a + 1) * num_classes; ++c) {
      int idx = i * (H * W * D) + c * (H * W) + y * W + x;
      Pdata[idx] /= expsum;
    }
  }
}
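
// For each (n, a, y, x) cell, accumulates s = sum_c Y[c] * dY[c] over the
// group's num_classes channels; this is the "sum_j h_j dy_j" term of the
// softmax gradient derived in the gradient op's RunOnDevice below.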
__global__ void SumProbsKernel(const int N, const int A, const int W,
    const int H, const float* Ydata, const float* dYdata,
    float* sum_probs_data, const int num_classes) {
  CUDA_1D_KERNEL_LOOP(i, N * A * W * H) {
    int D = num_classes * A;
    int x = i % W;
    int y = (i / W) % H;
    int a = (i / (W * H)) % A;
    int n = i / (W * H * A);

    sum_probs_data[i] = 0.0f;
    for (int c = a * num_classes; c < (a + 1) * num_classes; ++c) {
      int idx = n * (H * W * D) + c * (H * W) + y * W + x;
      sum_probs_data[i] += (Ydata[idx] * dYdata[idx]);
    }
  }
}
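
// Broadcasts each group's sum back over its num_classes channels: element i
// of the (N, D, H, W) gradient maps to its group's entry in sum_probs_data
// (shape N x A x H x W), which is subtracted in place. dXdata is expected
// to already hold a copy of dY (Step 0 in the gradient op below).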
__global__ void SubSumKernel(
    const int N, const int A, const int W, const int H,
    const float* sum_probs_data, float* dXdata, const int num_classes) {
  CUDA_1D_KERNEL_LOOP(i, N * (A * num_classes) * W * H) {
    int D = num_classes * A;
    int x = i % W;
    int y = (i / W) % H;
    int a = ((i / (W * H)) % D) / num_classes;
    int n = i / W / H / D;
    int idx = n * (H * W * A) + a * (H * W) + y * W + x;
    dXdata[i] = (dXdata[i] - sum_probs_data[idx]);
  }
}

} // namespace

template <>
bool GroupSpatialSoftmaxOp<float, CUDAContext>::RunOnDevice() {
  auto& X = Input(0); // Logits
  TORCH_DCHECK_EQ(X.ndim(), 4);

  int N = X.dim32(0);
  int D = X.dim32(1);
  int H = X.dim32(2);
  int W = X.dim32(3);
  int A = D / num_classes_;

  auto* P = Output(0, X.sizes(), at::dtype<float>()); // Probabilities from softmax
  const float* Xdata = X.data<float>();
  float* Pdata = P->mutable_data<float>();

  // Softmax for each (x, y) location. The kernel's grid-stride loop covers
  // N * A * H * W elements, so size the grid to the full index space (the
  // loop keeps any smaller grid correct, but underutilizes the device).
  GroupSpatialSoftmaxKernel<<<CAFFE_GET_BLOCKS(N * A * H * W),
                              CAFFE_CUDA_NUM_THREADS, 0,
                              context_.cuda_stream()>>>(
      N, A, W, H, Xdata, Pdata, num_classes_);
  C10_CUDA_KERNEL_LAUNCH_CHECK();
  return true;
}
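
// Illustrative usage (a sketch, not taken from this file): from the Caffe2
// Python frontend the forward op could be constructed roughly as
//   op = core.CreateOperator(
//       "GroupSpatialSoftmax", ["X"], ["P"], num_classes=num_classes)
// where X is an (N, A * num_classes, H, W) logits blob. The argument name
// "num_classes" is an assumption matching the num_classes_ member above.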

template <>
bool GroupSpatialSoftmaxGradientOp<float, CUDAContext>::RunOnDevice() {
  auto& Y = Input(0); // Probabilities from softmax
  auto& dY = Input(1);
  TORCH_DCHECK_EQ(Y.ndim(), 4);

  int N = Y.dim32(0);
  int D = Y.dim32(1);
  int H = Y.dim32(2);
  int W = Y.dim32(3);
  int A = D / num_classes_;

  auto* dX = Output(0, Y.sizes(), at::dtype<float>());

  if (sum_probs_.size() != N * A * H * W) {
    ReinitializeTensor(
        &sum_probs_, {N * A * H * W}, at::dtype<float>().device(CUDA));
  }
  const float* Ydata = Y.data<float>();
  const float* dYdata = dY.data<float>();
  float* dXdata = dX->mutable_data<float>();
  float* sum_probs_data = sum_probs_.mutable_data<float>();
  math::Set<float, CUDAContext>(
      sum_probs_.size(), 0.0f, sum_probs_data, &context_);
  // Complete math (applied independently within each group of num_classes
  // channels at every spatial location):
  //   J_ij = h_i (delta_ij - h_j)
  //   dx_i = sum_j J_ij * dy_j
  //        = sum_j h_i (delta_ij - h_j) * dy_j
  //        = h_i dy_i - sum_j h_i h_j dy_j
  //        = h_i dy_i - h_i sum_j h_j dy_j
  //        = h_i (dy_i - sum_j h_j dy_j)
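  //
  // A quick numeric check for one group: with h = (0.7, 0.2, 0.1) and
  // dy = (1, 0, 0), s = sum_j h_j dy_j = 0.7, so
  // dx = h * (dy - s) = (0.21, -0.14, -0.07), which sums to zero as a
  // softmax gradient must.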

  // Step 0: dX = dY
  context_.Copy<float, CUDAContext, CUDAContext>(Y.size(), dYdata, dXdata);

  // Step 1: s = sum_j dY[j] * Y[j], one value per (n, a, y, x) cell. The
  // grid-stride loop covers N * A * H * W elements, so the grid is sized
  // to match.
  SumProbsKernel<<<CAFFE_GET_BLOCKS(N * A * H * W), CAFFE_CUDA_NUM_THREADS, 0,
                   context_.cuda_stream()>>>(
      N, A, W, H, Ydata, dYdata, sum_probs_data, num_classes_);
  C10_CUDA_KERNEL_LAUNCH_CHECK();

  // Step 2: dX[i] = dX[i] - s
  SubSumKernel<<<CAFFE_GET_BLOCKS(Y.size()), CAFFE_CUDA_NUM_THREADS, 0,
                 context_.cuda_stream()>>>(
      N, A, W, H, sum_probs_data, dXdata, num_classes_);
  C10_CUDA_KERNEL_LAUNCH_CHECK();

  // Step 3: dX[i] = Y[i] * dX[i]
  math::Mul<float, CUDAContext>(Y.size(), dXdata, Ydata, dXdata, &context_);
  return true;
}

REGISTER_CUDA_OPERATOR(GroupSpatialSoftmax,
                       GroupSpatialSoftmaxOp<float, CUDAContext>);
REGISTER_CUDA_OPERATOR(GroupSpatialSoftmaxGradient,
                       GroupSpatialSoftmaxGradientOp<float, CUDAContext>);
} // namespace caffe2