caffe2/sgd/storm_op.cc - platform/external/pytorch - Git at Google

 #include "storm_op.h"

 namespace caffe2 {

 REGISTER_CPU_OPERATOR(Storm, StormOp<CPUContext>);
 OPERATOR_SCHEMA(Storm)
     .NumInputs(5)
     .NumOutputs(3)
     .AllowInplace({{0, 0}, {1, 1}, {2, 2}})
     .SetDoc(R"DOC(

 Computes the STORM (https://arxiv.org/abs/1905.10018) update for an input
 gradient and accumulated history of gradients. Concretely, given inputs
 (param, moment, grad_sq_sum, grad, lr), computes:

     new_grad_sq_sum = grad_sq_sum + norm(grad)^2
     effective_lr = lr / (beta + new_grad_sq_sum)^1/3
     alpha = momentum * square(effective_lr)
     new_moment = grad + (1 - alpha) * (moment - grad)
     new_param = param + effective_lr * new_moment

 and returns (new_param, new_moment, new_grad_sq_sum).

 Note that due to caffe2 limitation, it is difficult to re-calculate gradient
 in the previous iteration using the current example. We simplied calculation
 for new_moment by using the gradient from the current iteration.

 )DOC")
     .Input(0, "param", "Parameters to be updated.")
     .Input(1, "moment", "Moment history.")
     .Input(2, "grad_sq_sum", "Sum of observed squared gradients.")
     .Input(3, "grad", "Gradients computed.")
     .Input(4, "lr", "Learning rate, k in the original paper.")
     .Output(0, "output_param", "Updated parameters.")
     .Output(1, "output_moment", "Updated moment.")
     .Output(2, "output_grad_sq_sum", "Updated sum of squared gradients.")
     .Arg("momentum", "Momentum hyperparameter, c in the original paper.")
     .Arg(
         "beta",
         "denominator in adaptive learning rate, w in the original paper.");

 REGISTER_CPU_OPERATOR(SparseStorm, SparseStormOp<CPUContext>);
 OPERATOR_SCHEMA(SparseStorm)
     .NumInputs(6)
     .NumOutputs(3)
     .EnforceOneToOneInplace()
     .SetDoc(R"DOC(

 This operator implement the STORM (https://arxiv.org/abs/1905.10018)
 optimization algorithm. Given inputs (param, moment, grad_sq_sum, grad,
 indices, lr), computes the dense STORM update on (param, moment[indices],
 grad_sq_sum, grad, lr), and returns (new_param, new_moment, new_grad_sq_sum)
 as in the dense case.
 )DOC")
     .Input(0, "param", "Parameters to be updated.")
     .Input(1, "moment", "Moment history.")
     .Input(2, "grad_sq_sum", "Sum of observed squared gradients.")
     .Input(3, "grad", "Gradients computed.")
     .Input(4, "indices", "Sparse indices.")
     .Input(5, "lr", "Learning rate, k in the original paper.")
     .Output(0, "output_param", "Updated parameters.")
     .Output(1, "output_moment", "Updated moment.")
     .Output(2, "output_grad_sq_sum", "Updated sum of squared gradients.")
     .Arg("momentum", "Momentum hyperparameter, c in the original paper.")
     .Arg(
         "beta",
         "denominator in adaptive learning rate, w in the original paper.");

 SHOULD_NOT_DO_GRADIENT(Storm);
 SHOULD_NOT_DO_GRADIENT(SparseStorm);
 } // namespace caffe2
	#include "storm_op.h"

	namespace caffe2 {

	REGISTER_CPU_OPERATOR(Storm, StormOp<CPUContext>);
	OPERATOR_SCHEMA(Storm)
	.NumInputs(5)
	.NumOutputs(3)
	.AllowInplace({{0, 0}, {1, 1}, {2, 2}})
	.SetDoc(R"DOC(

	Computes the STORM (https://arxiv.org/abs/1905.10018) update for an input
	gradient and accumulated history of gradients. Concretely, given inputs
	(param, moment, grad_sq_sum, grad, lr), computes:

	new_grad_sq_sum = grad_sq_sum + norm(grad)^2
	effective_lr = lr / (beta + new_grad_sq_sum)^1/3
	alpha = momentum * square(effective_lr)
	new_moment = grad + (1 - alpha) * (moment - grad)
	new_param = param + effective_lr * new_moment

	and returns (new_param, new_moment, new_grad_sq_sum).

	Note that due to caffe2 limitation, it is difficult to re-calculate gradient
	in the previous iteration using the current example. We simplied calculation
	for new_moment by using the gradient from the current iteration.

	)DOC")
	.Input(0, "param", "Parameters to be updated.")
	.Input(1, "moment", "Moment history.")
	.Input(2, "grad_sq_sum", "Sum of observed squared gradients.")
	.Input(3, "grad", "Gradients computed.")
	.Input(4, "lr", "Learning rate, k in the original paper.")
	.Output(0, "output_param", "Updated parameters.")
	.Output(1, "output_moment", "Updated moment.")
	.Output(2, "output_grad_sq_sum", "Updated sum of squared gradients.")
	.Arg("momentum", "Momentum hyperparameter, c in the original paper.")
	.Arg(
	"beta",
	"denominator in adaptive learning rate, w in the original paper.");

	REGISTER_CPU_OPERATOR(SparseStorm, SparseStormOp<CPUContext>);
	OPERATOR_SCHEMA(SparseStorm)
	.NumInputs(6)
	.NumOutputs(3)
	.EnforceOneToOneInplace()
	.SetDoc(R"DOC(

	This operator implement the STORM (https://arxiv.org/abs/1905.10018)
	optimization algorithm. Given inputs (param, moment, grad_sq_sum, grad,
	indices, lr), computes the dense STORM update on (param, moment[indices],
	grad_sq_sum, grad, lr), and returns (new_param, new_moment, new_grad_sq_sum)
	as in the dense case.
	)DOC")
	.Input(0, "param", "Parameters to be updated.")
	.Input(1, "moment", "Moment history.")
	.Input(2, "grad_sq_sum", "Sum of observed squared gradients.")
	.Input(3, "grad", "Gradients computed.")
	.Input(4, "indices", "Sparse indices.")
	.Input(5, "lr", "Learning rate, k in the original paper.")
	.Output(0, "output_param", "Updated parameters.")
	.Output(1, "output_moment", "Updated moment.")
	.Output(2, "output_grad_sq_sum", "Updated sum of squared gradients.")
	.Arg("momentum", "Momentum hyperparameter, c in the original paper.")
	.Arg(
	"beta",
	"denominator in adaptive learning rate, w in the original paper.");

	SHOULD_NOT_DO_GRADIENT(Storm);
	SHOULD_NOT_DO_GRADIENT(SparseStorm);
	} // namespace caffe2