// caffe2/sgd/lars_op.h - platform/external/pytorch - Git at Google

 #ifndef CAFFE2_OPERATORS_LARS_OP_H_
 #define CAFFE2_OPERATORS_LARS_OP_H_

 #include "caffe2/core/context.h"
 #include "caffe2/core/logging.h"
 #include "caffe2/core/operator.h"
 #include "caffe2/utils/math.h"

 namespace caffe2 {

 // Operator that produces a single rescaled learning-rate value from the L2
 // norms of a parameter tensor and of its gradient.
 //
 // Inputs (by index, as read in RunOnDevice):
 //   0: X       - parameter tensor
 //   1: dX      - gradient tensor; must have the same element count as X
 //   2: wd      - forwarded to ComputeLearningRate as `wd`
 //   3: trust   - forwarded to ComputeLearningRate as `trust`
 //   4: lr_max  - forwarded to ComputeLearningRate as `lr_max`
 // Output:
 //   0: lr_rescaled - tensor of shape {1} with element type T
 //
 // Operator arguments:
 //   offset (default 0.5)  - must be >= 0
 //   lr_min (default 0.02) - must be >= 0
 //
 // ComputeLearningRate is only declared in this class; its definition is not
 // part of this header, so the exact formula is intentionally not described
 // here.
 template <typename T, class Context>
 class LarsOp final : public Operator<Context> {
  public:
   USE_OPERATOR_CONTEXT_FUNCTIONS;

   // Reads the `offset` and `lr_min` arguments (as float) from the operator
   // definition. Range validation happens in RunOnDevice, not here.
   LarsOp(const OperatorDef& operator_def, Workspace* ws)
       : Operator<Context>(operator_def, ws),
         offset_(this->template GetSingleArgument<float>("offset", 0.5)),
         lr_min_(this->template GetSingleArgument<float>("lr_min", 0.02)) {}

   // Validates the inputs, computes ||X|| and ||dX|| into scratch tensors and
   // delegates to ComputeLearningRate to fill the one-element output.
   // Returns true on success; validation failures are raised through the
   // CAFFE_ENFORCE* macros.
   bool RunOnDevice() override {
     const auto& X = Input(0);
     const auto& dX = Input(1);
     // The _EQ form reports both element counts when the check fails.
     CAFFE_ENFORCE_EQ(
         dX.numel(), X.numel(), "Gradient size doesn't match parameter size.");
     CAFFE_ENFORCE_GE(offset_, 0);
     CAFFE_ENFORCE_GE(lr_min_, 0);

     const auto& wd = Input(2);
     const auto& trust = Input(3);
     const auto& lr_max = Input(4);
     // These three inputs are handed to ComputeLearningRate as bare pointers
     // without a length, so it presumably reads one element from each. Reject
     // empty tensors up front instead of letting that read go out of bounds.
     CAFFE_ENFORCE_GT(wd.numel(), 0, "wd must contain at least one element.");
     CAFFE_ENFORCE_GT(
         trust.numel(), 0, "trust must contain at least one element.");
     CAFFE_ENFORCE_GT(
         lr_max.numel(), 0, "lr_max must contain at least one element.");

     auto* lr_rescaled = Output(0, vector<int64_t>{1}, at::dtype<T>());

     // One-element scratch buffers on the operator's device; they are members
     // so the tensors are kept between runs.
     ReinitializeTensor(
         &X_norm_tensor_,
         {1},
         at::dtype<T>().device(Context::GetDeviceType()));
     T* X_norm = X_norm_tensor_.template mutable_data<T>();

     ReinitializeTensor(
         &dX_norm_tensor_,
         {1},
         at::dtype<T>().device(Context::GetDeviceType()));
     T* dX_norm = dX_norm_tensor_.template mutable_data<T>();

     ComputeNorms(
         dX.numel(),
         X.template data<T>(),
         dX.template data<T>(),
         X_norm,
         dX_norm);

     ComputeLearningRate(
         wd.template data<T>(),
         trust.template data<T>(),
         lr_max.template data<T>(),
         offset_,
         lr_min_,
         X_norm,
         dX_norm,
         lr_rescaled->template mutable_data<T>());

     return true;
   }

  private:
   // Compute the l2 norm of X_data and dX_data.
   //   N        - number of elements in each of X_data and dX_data
   //   X_norm   - receives sqrt(sum of squares of X_data)
   //   dX_norm  - receives sqrt(sum of squares of dX_data)
   void ComputeNorms(
       int64_t N,
       const T* X_data,
       const T* dX_data,
       T* X_norm,
       T* dX_norm) {
     math::SumSqr(N, X_data, X_norm, &context_);
     // The square root is taken in place on the single accumulated value.
     math::Sqrt(1, X_norm, X_norm, &context_);
     math::SumSqr(N, dX_data, dX_norm, &context_);
     math::Sqrt(1, dX_norm, dX_norm, &context_);
   }

   // Compute the learning rate and apply clipping.
   // Declared only; the definition is provided outside this header.
   void ComputeLearningRate(
       const T* wd,
       const T* trust,
       const T* lr_max,
       T offset,
       T lr_min,
       T* X_norm,
       T* dX_norm,
       T* lr_rescaled);

   // Values of the `offset` and `lr_min` operator arguments.
   T offset_;
   T lr_min_;

   // Scratch storage for the two norms computed in RunOnDevice.
   Tensor X_norm_tensor_;
   Tensor dX_norm_tensor_;
 };

 } // namespace caffe2

 #endif // CAFFE2_OPERATORS_LARS_OP_H_
	#ifndef CAFFE2_OPERATORS_LARS_OP_H_
	#define CAFFE2_OPERATORS_LARS_OP_H_

	#include "caffe2/core/context.h"
	#include "caffe2/core/logging.h"
	#include "caffe2/core/operator.h"
	#include "caffe2/utils/math.h"

	namespace caffe2 {

	// Operator that produces a single rescaled learning-rate value from the L2
	// norms of a parameter tensor and of its gradient.
	//
	// Inputs (by index, as read in RunOnDevice):
	//   0: X       - parameter tensor
	//   1: dX      - gradient tensor; must have the same element count as X
	//   2: wd      - forwarded to ComputeLearningRate as `wd`
	//   3: trust   - forwarded to ComputeLearningRate as `trust`
	//   4: lr_max  - forwarded to ComputeLearningRate as `lr_max`
	// Output:
	//   0: lr_rescaled - tensor of shape {1} with element type T
	//
	// Operator arguments:
	//   offset (default 0.5)  - must be >= 0
	//   lr_min (default 0.02) - must be >= 0
	//
	// ComputeLearningRate is only declared in this class; its definition is not
	// part of this header, so the exact formula is intentionally not described
	// here.
	template <typename T, class Context>
	class LarsOp final : public Operator<Context> {
	 public:
	  USE_OPERATOR_CONTEXT_FUNCTIONS;

	  // Reads the `offset` and `lr_min` arguments (as float) from the operator
	  // definition. Range validation happens in RunOnDevice, not here.
	  LarsOp(const OperatorDef& operator_def, Workspace* ws)
	      : Operator<Context>(operator_def, ws),
	        offset_(this->template GetSingleArgument<float>("offset", 0.5)),
	        lr_min_(this->template GetSingleArgument<float>("lr_min", 0.02)) {}

	  // Validates the inputs, computes ||X|| and ||dX|| into scratch tensors and
	  // delegates to ComputeLearningRate to fill the one-element output.
	  // Returns true on success; validation failures are raised through the
	  // CAFFE_ENFORCE* macros.
	  bool RunOnDevice() override {
	    const auto& X = Input(0);
	    const auto& dX = Input(1);
	    // The _EQ form reports both element counts when the check fails.
	    CAFFE_ENFORCE_EQ(
	        dX.numel(), X.numel(), "Gradient size doesn't match parameter size.");
	    CAFFE_ENFORCE_GE(offset_, 0);
	    CAFFE_ENFORCE_GE(lr_min_, 0);

	    const auto& wd = Input(2);
	    const auto& trust = Input(3);
	    const auto& lr_max = Input(4);
	    // These three inputs are handed to ComputeLearningRate as bare pointers
	    // without a length, so it presumably reads one element from each. Reject
	    // empty tensors up front instead of letting that read go out of bounds.
	    CAFFE_ENFORCE_GT(wd.numel(), 0, "wd must contain at least one element.");
	    CAFFE_ENFORCE_GT(
	        trust.numel(), 0, "trust must contain at least one element.");
	    CAFFE_ENFORCE_GT(
	        lr_max.numel(), 0, "lr_max must contain at least one element.");

	    auto* lr_rescaled = Output(0, vector<int64_t>{1}, at::dtype<T>());

	    // One-element scratch buffers on the operator's device; they are members
	    // so the tensors are kept between runs.
	    ReinitializeTensor(
	        &X_norm_tensor_,
	        {1},
	        at::dtype<T>().device(Context::GetDeviceType()));
	    T* X_norm = X_norm_tensor_.template mutable_data<T>();

	    ReinitializeTensor(
	        &dX_norm_tensor_,
	        {1},
	        at::dtype<T>().device(Context::GetDeviceType()));
	    T* dX_norm = dX_norm_tensor_.template mutable_data<T>();

	    ComputeNorms(
	        dX.numel(),
	        X.template data<T>(),
	        dX.template data<T>(),
	        X_norm,
	        dX_norm);

	    ComputeLearningRate(
	        wd.template data<T>(),
	        trust.template data<T>(),
	        lr_max.template data<T>(),
	        offset_,
	        lr_min_,
	        X_norm,
	        dX_norm,
	        lr_rescaled->template mutable_data<T>());

	    return true;
	  }

	 private:
	  // Compute the l2 norm of X_data and dX_data.
	  //   N        - number of elements in each of X_data and dX_data
	  //   X_norm   - receives sqrt(sum of squares of X_data)
	  //   dX_norm  - receives sqrt(sum of squares of dX_data)
	  void ComputeNorms(
	      int64_t N,
	      const T* X_data,
	      const T* dX_data,
	      T* X_norm,
	      T* dX_norm) {
	    math::SumSqr(N, X_data, X_norm, &context_);
	    // The square root is taken in place on the single accumulated value.
	    math::Sqrt(1, X_norm, X_norm, &context_);
	    math::SumSqr(N, dX_data, dX_norm, &context_);
	    math::Sqrt(1, dX_norm, dX_norm, &context_);
	  }

	  // Compute the learning rate and apply clipping.
	  // Declared only; the definition is provided outside this header.
	  void ComputeLearningRate(
	      const T* wd,
	      const T* trust,
	      const T* lr_max,
	      T offset,
	      T lr_min,
	      T* X_norm,
	      T* dX_norm,
	      T* lr_rescaled);

	  // Values of the `offset` and `lr_min` operator arguments.
	  T offset_;
	  T lr_min_;

	  // Scratch storage for the two norms computed in RunOnDevice.
	  Tensor X_norm_tensor_;
	  Tensor dX_norm_tensor_;
	};

	} // namespace caffe2

	#endif // CAFFE2_OPERATORS_LARS_OP_H_