| #ifndef CAFFE2_OPERATORS_LARS_OP_H_ |
| #define CAFFE2_OPERATORS_LARS_OP_H_ |
| |
| #include "caffe2/core/context.h" |
| #include "caffe2/core/logging.h" |
| #include "caffe2/core/operator.h" |
| #include "caffe2/utils/math.h" |
| |
| namespace caffe2 { |
| |
| template <typename T, class Context> |
| class LarsOp final : public Operator<Context> { |
| public: |
| USE_OPERATOR_CONTEXT_FUNCTIONS; |
| LarsOp(const OperatorDef& operator_def, Workspace* ws) |
| : Operator<Context>(operator_def, ws), |
| offset_(this->template GetSingleArgument<float>("offset", 0.5)), |
| lr_min_(this->template GetSingleArgument<float>("lr_min", 0.02)) {} |
| |
| bool RunOnDevice() override { |
| auto& X = Input(0); |
| auto& dX = Input(1); |
| CAFFE_ENFORCE( |
| dX.numel() == X.numel(), "Gradient size doesn't match parameter size."); |
| CAFFE_ENFORCE_GE(offset_, 0); |
| CAFFE_ENFORCE_GE(lr_min_, 0); |
| |
| auto& wd = Input(2); |
| auto& trust = Input(3); |
| auto& lr_max = Input(4); |
| |
| auto* lr_rescaled = Output(0, vector<int64_t>{1}, at::dtype<T>()); |
| |
| ReinitializeTensor(&X_norm_tensor_, {1}, at::dtype<T>().device(Context::GetDeviceType())); |
| T* X_norm_ = X_norm_tensor_.template mutable_data<T>(); |
| |
| ReinitializeTensor(&dX_norm_tensor_, {1}, at::dtype<T>().device(Context::GetDeviceType())); |
| T* dX_norm_ = dX_norm_tensor_.template mutable_data<T>(); |
| |
| ComputeNorms( |
| dX.numel(), |
| X.template data<T>(), |
| dX.template data<T>(), |
| X_norm_, |
| dX_norm_); |
| |
| ComputeLearningRate( |
| wd.template data<T>(), |
| trust.template data<T>(), |
| lr_max.template data<T>(), |
| offset_, |
| lr_min_, |
| X_norm_, |
| dX_norm_, |
| lr_rescaled->template mutable_data<T>()); |
| |
| return true; |
| } |
| |
| private: |
| // Compute the l2 norm of X_data and dX_data |
| void ComputeNorms( |
| int64_t N, |
| const T* X_data, |
| const T* dX_data, |
| T* X_norm, |
| T* dX_norm) { |
| math::SumSqr(N, X_data, X_norm, &context_); |
| math::Sqrt(1, X_norm, X_norm, &context_); |
| math::SumSqr(N, dX_data, dX_norm, &context_); |
| math::Sqrt(1, dX_norm, dX_norm, &context_); |
| } |
| // Compute the learning rate and apply clipping |
| void ComputeLearningRate( |
| const T* wd, |
| const T* trust, |
| const T* lr_max, |
| T offset, |
| T lr_min, |
| T* X_norm, |
| T* dX_norm, |
| T* lr_rescaled); |
| |
| T offset_; |
| T lr_min_; |
| |
| Tensor X_norm_tensor_; |
| Tensor dX_norm_tensor_; |
| }; |
| |
| } // namespace caffe2 |
| |
| #endif // CAFFE2_OPERATORS_LARS_OP_H_ |