// blob: e4d1f34b754c1ed59e07174ca1d6e965e3a26a1d [file] [log] [blame]
#include <algorithm>
#include <cub/block/block_reduce.cuh>
#include "caffe2/core/context_gpu.h"
#include "caffe2/operators/normalize_l1_op.h"
#include "caffe2/operators/normalize_op.h"
#include "caffe2/utils/cub_namespace.cuh"
namespace caffe2 {
// L2-normalizes each of the `n` slices of length `m` in `xData`, writing
// the result to `yData`. Consecutive elements of a slice are `sf` apart in
// memory; slice i starts at (i / sf) * sf * m + (i % sf).
// Launch layout: one block per slice, grid-striding over slices, so any
// grid size is correct. Requires blockDim.x == CAFFE_CUDA_NUM_THREADS,
// since BlockReduce is specialized on that thread count.
__global__ void NormalizeKernel(
    const int m,
    const int n,
    const int sf,
    const float* xData,
    float* yData,
    const float kEps) {
  typedef cub::BlockReduce<float, CAFFE_CUDA_NUM_THREADS> BlockReduce;
  __shared__ BlockReduce::TempStorage temp_storage;
  for (int i = blockIdx.x; i < n; i += gridDim.x) {
    // Offset of the first element of slice i.
    auto base = (i / sf) * sf * m + (i % sf);
    float sum = 0.0f; // float literal: avoid accidental double promotion
    __shared__ float norm;
    // Per-thread partial sum of squares over the slice.
    for (int j = threadIdx.x; j < m; j += blockDim.x) {
      const auto x_ij = xData[base + j * sf];
      sum += x_ij * x_ij;
    }
    float reduce_result = BlockReduce(temp_storage).Sum(sum);
    if (threadIdx.x == 0) {
      // Clamp the norm away from zero so the division below is safe.
      norm = sqrtf(reduce_result);
      norm = fmaxf(norm, kEps);
    }
    // Publish `norm` to every thread of the block before scaling.
    __syncthreads();
    for (int j = threadIdx.x; j < m; j += blockDim.x) {
      const auto index = base + j * sf;
      yData[index] = xData[index] / norm;
    }
  }
}
// Gradient of L2 normalization for M slices of length N (elements of a
// slice are SF apart in memory):
//   grad_x = dY / ||x|| - x * <x, dY> / ||x||^3
// with ||x|| clamped below by kEps, matching the forward kernel.
// Launch layout: one block per slice, grid-striding over slices.
// Requires blockDim.x == CAFFE_CUDA_NUM_THREADS (BlockReduce is
// specialized on that thread count).
__global__ void NormalizeGradientKernel(
    const int M,
    const int N,
    const int SF,
    const float* in_mat,
    const float* grad_out_mat,
    float* grad_mat,
    const float kEps) {
  typedef cub::BlockReduce<float, CAFFE_CUDA_NUM_THREADS> BlockReduce;
  __shared__ BlockReduce::TempStorage temp_storage_sum;
  __shared__ BlockReduce::TempStorage temp_storage_norm;
  for (int i = blockIdx.x; i < M; i += gridDim.x) {
    // Float literals: avoid accidental double-precision accumulation.
    float sum = 0.0f;
    float norm = 0.0f;
    __shared__ float row_sum;
    __shared__ float row_norm;
    __shared__ float row_norm_3;
    auto base = (i / SF) * SF * N + (i % SF);
    // Per-thread partials of <x, dY> and <x, x> over the slice.
    for (int j = threadIdx.x; j < N; j += blockDim.x) {
      int index = base + j * SF;
      sum += in_mat[index] * grad_out_mat[index];
      norm += in_mat[index] * in_mat[index];
    }
    float reduce_result = BlockReduce(temp_storage_sum).Sum(sum);
    float reduce_norm = BlockReduce(temp_storage_norm).Sum(norm);
    if (threadIdx.x == 0) {
      row_sum = reduce_result;
      // Clamp the norm away from zero so the divisions below are safe.
      row_norm = sqrtf(reduce_norm);
      row_norm = fmaxf(row_norm, kEps);
      row_norm_3 = powf(row_norm, 3.0f); // float exponent for float overload
    }
    // Publish row_sum / row_norm / row_norm_3 to the whole block.
    __syncthreads();
    for (int j = threadIdx.x; j < N; j += blockDim.x) {
      int index = base + j * SF;
      const float x_ij = in_mat[index];
      const float dy_ij = grad_out_mat[index];
      grad_mat[index] = (dy_ij / row_norm) - ((x_ij / row_norm_3) * row_sum);
    }
  }
}
// CUDA implementation of the L2 normalize forward pass: launches one
// block per slice (capped at the device grid limit; the kernel
// grid-strides over the remainder) on this operator's stream.
template <>
void NormalizeOp<float, CUDAContext>::DoNormalize(
    const float* xData,
    float* yData,
    const int m,
    const int n,
    const int sf) {
  const int num_blocks = std::min(n, CAFFE_MAXIMUM_NUM_BLOCKS);
  NormalizeKernel<<<
      num_blocks,
      CAFFE_CUDA_NUM_THREADS,
      0,
      context_.cuda_stream()>>>(m, n, sf, xData, yData, kEps_);
  C10_CUDA_KERNEL_LAUNCH_CHECK();
}
// CUDA implementation of the L2 normalize backward pass. Reads the
// forward input X (Input 0) and the upstream gradient dY (Input 1),
// and writes dX (Output 0) with the same shape as X.
template <>
bool NormalizeGradientOp<float, CUDAContext>::RunOnDevice() {
  const auto& X = Input(0);
  const auto& dY = Input(1);
  auto* dX = Output(0);
  dX->ResizeLike(X);
  // Canonicalize the (possibly negative) "axis" argument, then derive:
  // N = extent of the normalization axis, M = number of slices,
  // SF = stride between consecutive elements of a slice.
  const int axis_arg = OperatorBase::GetSingleArgument<int>("axis", -1);
  const auto canonical_axis = X.canonical_axis_index(axis_arg);
  const int N = X.dim32(canonical_axis);
  const int M = X.numel() / N;
  const int SF = X.size_from_dim(canonical_axis + 1);
  // One block per slice, capped at the device grid limit; the kernel
  // grid-strides over the remainder.
  const int num_blocks = std::min(M, CAFFE_MAXIMUM_NUM_BLOCKS);
  NormalizeGradientKernel<<<
      num_blocks,
      CAFFE_CUDA_NUM_THREADS,
      0,
      context_.cuda_stream()>>>(
      M,
      N,
      SF,
      X.data<float>(),
      dY.data<float>(),
      dX->template mutable_data<float>(),
      kEps_);
  C10_CUDA_KERNEL_LAUNCH_CHECK();
  return true;
}
namespace {
// L1-normalizes each of the `n` slices of length `m` in `xData`:
// y = x / sum(|x|). Consecutive elements of a slice are `sf` apart in
// memory. Unlike the L2 kernel there is no epsilon clamp: slices whose
// L1 norm is exactly zero are skipped and the corresponding yData
// entries are not written.
// NOTE(review): if yData is freshly allocated, skipped slices stay
// uninitialized — confirm callers expect that.
// Launch layout: one block per slice, grid-striding over slices.
// Requires blockDim.x == CAFFE_CUDA_NUM_THREADS.
__global__ void NormalizeL1Kernel(
    const int m,
    const int n,
    const int sf,
    const float* xData,
    float* yData) {
  typedef cub::BlockReduce<float, CAFFE_CUDA_NUM_THREADS> BlockReduce;
  __shared__ BlockReduce::TempStorage temp_storage;
  for (int i = blockIdx.x; i < n; i += gridDim.x) {
    // Offset of the first element of slice i.
    auto base = (i / sf) * sf * m + (i % sf);
    float sum = 0.0f; // float literal: avoid accidental double promotion
    __shared__ float norm;
    // Per-thread partial sum of absolute values over the slice.
    for (int j = threadIdx.x; j < m; j += blockDim.x) {
      const auto x_ij = xData[base + j * sf];
      sum += fabsf(x_ij);
    }
    float reduce_result = BlockReduce(temp_storage).Sum(sum);
    if (threadIdx.x == 0) {
      norm = reduce_result;
    }
    // Publish `norm` to every thread of the block before scaling.
    __syncthreads();
    // `norm` is shared, so this branch is uniform across the block.
    if (norm != 0) {
      for (int j = threadIdx.x; j < m; j += blockDim.x) {
        const auto index = base + j * sf;
        yData[index] = xData[index] / norm;
      }
    }
  }
}
} // namespace
// CUDA implementation of the L1 normalize forward pass: launches one
// block per slice (capped at the device grid limit; the kernel
// grid-strides over the remainder) on this operator's stream.
template <>
void NormalizeL1Op<float, CUDAContext>::DoNormalize(
    const float* xData,
    float* yData,
    const int m,
    const int n,
    const int sf) {
  const int num_blocks = std::min(n, CAFFE_MAXIMUM_NUM_BLOCKS);
  NormalizeL1Kernel<<<
      num_blocks,
      CAFFE_CUDA_NUM_THREADS,
      0,
      context_.cuda_stream()>>>(m, n, sf, xData, yData);
  C10_CUDA_KERNEL_LAUNCH_CHECK();
}
// Register the CUDA (float) implementations of the L2 Normalize operator,
// its gradient, and the L1 Normalize operator with the caffe2 registry.
REGISTER_CUDA_OPERATOR(Normalize, NormalizeOp<float, CUDAContext>);
REGISTER_CUDA_OPERATOR(
    NormalizeGradient,
    NormalizeGradientOp<float, CUDAContext>);
REGISTER_CUDA_OPERATOR(NormalizeL1, NormalizeL1Op<float, CUDAContext>);
} // namespace caffe2