| #include "caffe2/sgd/momentum_sgd_op.h" |
| #include "caffe2/core/common_gpu.h" |
| #include "caffe2/core/context_gpu.h" |
| |
| namespace caffe2 { |
| |
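// Compute the number of blocks needed to cover N elements at
// CAFFE_CUDA_NUM_THREADS threads per block.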
| inline int CaffeGetBlocksSGD(const int N) { |
| return std::max( |
| (N + CAFFE_CUDA_NUM_THREADS - 1) / CAFFE_CUDA_NUM_THREADS, |
      // Use at least 1 block, since CUDA does not allow an empty grid.
      1);
| } |
| template <bool nesterov> |
| __global__ void MomentumSGDKernel( |
| const int N, |
| const float* g, |
| const float* m, |
| float* ng, |
| float* nm, |
| const float* lr, |
| const float momentum, |
| float* param); |
| |
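// Nesterov momentum: refresh the buffer as m' = momentum * m + lr * g, then
// apply the lookahead step (1 + momentum) * m' - momentum * m.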
| template <> |
| __global__ void MomentumSGDKernel<true>( |
| const int N, |
| const float* g, |
| const float* m, |
| float* ng, |
| float* nm, |
| const float* lr, |
| const float momentum, |
| float* param) { |
| const float LR = lr[0]; |
| CUDA_1D_KERNEL_LOOP(i, N) { |
| const float mi = m[i]; |
| const float mi_new = momentum * mi + LR * g[i]; |
| nm[i] = mi_new; |
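    // fmaf(a, b, c) fuses a * b + c into one instruction, so this computes
    // momentum * (mi_new - mi) + mi_new,
    // i.e. (1 + momentum) * mi_new - momentum * mi.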
| ng[i] = fmaf(momentum, mi_new - mi, mi_new); |
| if (param != nullptr) { |
| param[i] -= ng[i]; |
| } |
| } |
| } |
| |
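// Standard (heavy-ball) momentum: the refreshed buffer value
// lr * g + momentum * m is itself the step applied to the parameters.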
| template <> |
| __global__ void MomentumSGDKernel<false>( |
| const int N, |
| const float* g, |
| const float* m, |
| float* ng, |
| float* nm, |
| const float* lr, |
| const float momentum, |
| float* param) { |
| const float LR = lr[0]; |
| CUDA_1D_KERNEL_LOOP(i, N) { |
| const float adjusted_gradient = LR * g[i] + momentum * m[i]; |
| nm[i] = adjusted_gradient; |
| ng[i] = adjusted_gradient; |
| if (param != nullptr) { |
| param[i] -= adjusted_gradient; |
| } |
| } |
| } |
| |
| template <> |
| void momentum_sgd_update<CUDAContext>( |
| const int N, |
| const float* g, |
| const float* m, |
| float* ng, |
| float* nm, |
| const float* lr, |
| const float momentum, |
| const bool nesterov, |
| float* param, |
| CUDAContext* context) { |
| if (nesterov) { |
| MomentumSGDKernel<true> |
| <<<CaffeGetBlocksSGD(N), |
| CAFFE_CUDA_NUM_THREADS, |
| 0, |
| context->cuda_stream()>>>(N, g, m, ng, nm, lr, momentum, param); |
| C10_CUDA_KERNEL_LAUNCH_CHECK(); |
| } else { |
| MomentumSGDKernel<false> |
| <<<CaffeGetBlocksSGD(N), |
| CAFFE_CUDA_NUM_THREADS, |
| 0, |
| context->cuda_stream()>>>(N, g, m, ng, nm, lr, momentum, param); |
| C10_CUDA_KERNEL_LAUNCH_CHECK(); |
| } |
| } |
| |
| |
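// Sparse variant: thread i handles element i of the gradient; indices[i / sz]
// picks the parameter row and i % sz the offset within that row's slice.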
template <typename SIndex>
__global__ void SparseMomentumSGDKernel(
    const size_t N,
    const size_t sz,
    const float momentum,
    const bool nesterov,
    float* param,
    float* param_mom,
    const SIndex* indices,
    const float* gradIn,
    float* gradOut,
    const float* lr) {
  const float LR = lr[0];
  CUDA_1D_KERNEL_LOOP(i, N) {
    const size_t gradIdx = i;
    const SIndex index = indices[i / sz];
    const size_t paramIdx = index * sz + (i % sz);

    if (!nesterov) {
      const float adjusted_gradient =
          LR * gradIn[gradIdx] + momentum * param_mom[paramIdx];
      gradOut[gradIdx] = adjusted_gradient;
      param_mom[paramIdx] = adjusted_gradient;
      param[paramIdx] -= adjusted_gradient;
    } else {
      // Same Nesterov lookahead as the dense kernel above.
      const float mom_old = param_mom[paramIdx];
      const float mom_new = LR * gradIn[gradIdx] + momentum * mom_old;
      param_mom[paramIdx] = mom_new;
      const float adjusted_gradient =
          (1 + momentum) * mom_new - momentum * mom_old;
      gradOut[gradIdx] = adjusted_gradient;
      param[paramIdx] -= adjusted_gradient;
    }
  }
}
| |
| |
// CUDA specialization of DoRunWithType: one grid-stride loop over all N
// gradient elements, grouped into slices of grad_slice_sz contiguous values.
| template <> |
| template <typename SIndex> |
| bool SparseMomentumSGDUpdateOp<float, CUDAContext>::DoRunWithType() { |
  auto N = Input(GRAD).numel();
  auto grad_slice_sz = Input(GRAD).size_from_dim(Input(INDICES).dim());
| |
  SparseMomentumSGDKernel<SIndex>
      <<<CAFFE_GET_BLOCKS(N),
         CAFFE_CUDA_NUM_THREADS,
         0,
         context_.cuda_stream()>>>(
          N,
          grad_slice_sz,
          momentum_,
          nesterov_,
          Output(OUTPUT_PARAM)->template mutable_data<float>(),
          Output(OUTPUT_MOMENTUM)->template mutable_data<float>(),
          Input(INDICES).template data<SIndex>(),
          Input(GRAD).template data<float>(),
          Output(OUTPUT_GRAD)->template mutable_data<float>(),
          Input(LR).template data<float>());
| C10_CUDA_KERNEL_LAUNCH_CHECK(); |
| return true; |
| } |
| |
| REGISTER_CUDA_OPERATOR(MomentumSGD, MomentumSGDOp<float, CUDAContext>); |
REGISTER_CUDA_OPERATOR(
    MomentumSGDUpdate,
    MomentumSGDUpdateOp<float, CUDAContext>);
REGISTER_CUDA_OPERATOR(
    SparseMomentumSGDUpdate,
    SparseMomentumSGDUpdateOp<float, CUDAContext>);
| |
| } |