| #include "caffe2/sgd/momentum_sgd_op.h" |
| #include "caffe2/core/common_gpu.h" |
| #include "caffe2/core/context_gpu.h" |
| |
| namespace caffe2 { |
| |
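// Compute the number of blocks needed to cover N elements at
// CAFFE_CUDA_NUM_THREADS threads per block.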
| inline int CaffeGetBlocksSGD(const int N) { |
| return std::max( |
| (N + CAFFE_CUDA_NUM_THREADS - 1) / CAFFE_CUDA_NUM_THREADS, |
      // Use at least 1 block, since CUDA does not allow an empty grid.
      1);
| } |
| template <bool nesterov> |
| __global__ void MomentumSGDKernel( |
| const int N, |
| const float* g, |
| const float* m, |
| float* ng, |
| float* nm, |
| const float* lr, |
| const float momentum, |
| float* param); |
| |
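// Nesterov momentum: refresh the buffer as m' = momentum * m + lr * g, then
// apply the lookahead step (1 + momentum) * m' - momentum * m.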
| template <> |
| __global__ void MomentumSGDKernel<true>( |
| const int N, |
| const float* g, |
| const float* m, |
| float* ng, |
| float* nm, |
| const float* lr, |
| const float momentum, |
| float* param) { |
| const float LR = lr[0]; |
| CUDA_1D_KERNEL_LOOP(i, N) { |
| const float mi = m[i]; |
| const float mi_new = momentum * mi + LR * g[i]; |
| nm[i] = mi_new; |
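    // fmaf(a, b, c) fuses a * b + c into one instruction, so this computes
    // momentum * (mi_new - mi) + mi_new,
    // i.e. (1 + momentum) * mi_new - momentum * mi.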
| ng[i] = fmaf(momentum, mi_new - mi, mi_new); |
| if (param != nullptr) { |
| param[i] -= ng[i]; |
| } |
| } |
| } |
| |
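// Standard (heavy-ball) momentum: the refreshed buffer value
// lr * g + momentum * m is itself the step applied to the parameters.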
| template <> |
| __global__ void MomentumSGDKernel<false>( |
| const int N, |
| const float* g, |
| const float* m, |
| float* ng, |
| float* nm, |
| const float* lr, |
| const float momentum, |
| float* param) { |
| const float LR = lr[0]; |
| CUDA_1D_KERNEL_LOOP(i, N) { |
| const float adjusted_gradient = LR * g[i] + momentum * m[i]; |
| nm[i] = adjusted_gradient; |
| ng[i] = adjusted_gradient; |
| if (param != nullptr) { |
| param[i] -= adjusted_gradient; |
| } |
| } |
| } |
| |
| template <> |
| void momentum_sgd_update<CUDAContext>( |
| const int N, |
| const float* g, |
| const float* m, |
| float* ng, |
| float* nm, |
| const float* lr, |
| const float momentum, |
| const bool nesterov, |
| float* param, |
| CUDAContext* context) { |
| if (nesterov) { |
| MomentumSGDKernel<true> |
| <<<CaffeGetBlocksSGD(N), |
| CAFFE_CUDA_NUM_THREADS, |
| 0, |
| context->cuda_stream()>>>(N, g, m, ng, nm, lr, momentum, param); |
| C10_CUDA_KERNEL_LAUNCH_CHECK(); |
| } else { |
| MomentumSGDKernel<false> |
| <<<CaffeGetBlocksSGD(N), |
| CAFFE_CUDA_NUM_THREADS, |
| 0, |
| context->cuda_stream()>>>(N, g, m, ng, nm, lr, momentum, param); |
| C10_CUDA_KERNEL_LAUNCH_CHECK(); |
| } |
| } |
| |
| |
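// Sparse variant: thread i handles element i of the gradient; indices[i / sz]
// picks the parameter row and i % sz the offset within that row's slice.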
template <typename SIndex>
__global__ void SparseMomentumSGDKernel(
    const size_t N,
    const size_t sz,
    const float momentum,
    const bool nesterov,
    float* param,
    float* param_mom,
    const SIndex* indices,
    const float* gradIn,
    float* gradOut,
    const float* lr) {
  const float LR = lr[0];
  CUDA_1D_KERNEL_LOOP(i, N) {
    const size_t gradIdx = i;
    const SIndex index = indices[i / sz];
    const size_t paramIdx = index * sz + (i % sz);

    if (!nesterov) {
      const float adjusted_gradient =
          LR * gradIn[gradIdx] + momentum * param_mom[paramIdx];
      gradOut[gradIdx] = adjusted_gradient;
      param_mom[paramIdx] = adjusted_gradient;
      param[paramIdx] -= adjusted_gradient;
    } else {
      // Same Nesterov lookahead as the dense kernel above.
      const float mom_old = param_mom[paramIdx];
      const float mom_new = LR * gradIn[gradIdx] + momentum * mom_old;
      param_mom[paramIdx] = mom_new;
      const float adjusted_gradient =
          (1 + momentum) * mom_new - momentum * mom_old;
      gradOut[gradIdx] = adjusted_gradient;
      param[paramIdx] -= adjusted_gradient;
    }
  }
}
| |
| |
// CUDA specialization of DoRunWithType: one grid-stride loop over all N
// gradient elements, grouped into slices of grad_slice_sz contiguous values.
| template <> |
| template <typename SIndex> |
| bool SparseMomentumSGDUpdateOp<float, CUDAContext>::DoRunWithType() { |
  auto N = Input(GRAD).numel();
  auto grad_slice_sz = Input(GRAD).size_from_dim(Input(INDICES).dim());
| |
  SparseMomentumSGDKernel<SIndex>
      <<<CAFFE_GET_BLOCKS(N),
         CAFFE_CUDA_NUM_THREADS,
         0,
         context_.cuda_stream()>>>(
          N,
          grad_slice_sz,
          momentum_,
          nesterov_,
          Output(OUTPUT_PARAM)->template mutable_data<float>(),
          Output(OUTPUT_MOMENTUM)->template mutable_data<float>(),
          Input(INDICES).template data<SIndex>(),
          Input(GRAD).template data<float>(),
          Output(OUTPUT_GRAD)->template mutable_data<float>(),
          Input(LR).template data<float>());
| C10_CUDA_KERNEL_LAUNCH_CHECK(); |
| return true; |
| } |
| |
| REGISTER_CUDA_OPERATOR(MomentumSGD, MomentumSGDOp<float, CUDAContext>); |
REGISTER_CUDA_OPERATOR(
    MomentumSGDUpdate,
    MomentumSGDUpdateOp<float, CUDAContext>);
REGISTER_CUDA_OPERATOR(
    SparseMomentumSGDUpdate,
    SparseMomentumSGDUpdateOp<float, CUDAContext>);
| |
| } |