| #include "caffe2/operators/softmax_op.h" |
| |
| #include "caffe2/operators/softmax_utils.h" |
| |
| namespace caffe2 { |
| |
| // Implementation for the CPU context. |
| template <> |
| bool SoftmaxOp<float, CPUContext>::RunOnDevice() { |
| const auto& X = Input(0); |
| const int canonical_axis = X.canonical_axis_index(axis_); |
| const int N = X.size_to_dim(canonical_axis); |
| const int D = X.size_from_dim(canonical_axis); |
| auto* Y = Output(0, X.sizes(), at::dtype<float>()); |
| const float* X_data = X.data<float>(); |
| float* Y_data = Y->mutable_data<float>(); |
| if (N == 0 || D == 0) { |
| return true; |
| } |
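  // Lazily allocate (and resize as needed) the per-row scratch buffer passed to
  // the softmax kernel; it is a member tensor and persists across runs.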
  if (!scale_.defined()) {
    scale_ = caffe2::empty({N}, at::dtype<float>().device(CPU));
  } else if (scale_.numel() != N) {
    scale_.Resize(N);
  }
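  // The third argument toggles log-softmax; `false` here computes the standard
  // softmax, writing the result into Y_data and using scale_ as scratch space.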
  softmax_utils::SoftmaxCPU<float>(
      N, D, false, X_data, Y_data, scale_.mutable_data<float>(), &context_);
  return true;
}

// Implementation for the CPU context.
template <>
bool SoftmaxGradientOp<float, CPUContext>::RunOnDevice() {
  auto& Y = Input(0);
  auto& dY = Input(1);

  const auto canonical_axis = Y.canonical_axis_index(axis_);
  const int64_t N = Y.size_to_dim(canonical_axis);
  const int64_t D = Y.size_from_dim(canonical_axis);
  // First, allocate (or resize) scratch for the per-row dot products <Y_i, dY_i>.
  if (!scale_.defined()) {
    scale_ = caffe2::empty({N}, at::dtype<float>().device(CPU));
  } else if (scale_.numel() != N) {
    scale_.Resize(N);
  }

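  // sum_multiplier_ is a length-D vector of ones; it is used below to broadcast
  // each per-row scale across the D columns of that row via a rank-1 Gemm.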
  if (!sum_multiplier_.defined()) {
    sum_multiplier_ = caffe2::empty({D}, at::dtype<float>().device(CPU));
    math::Set<float, CPUContext>(
        D, 1.f, sum_multiplier_.mutable_data<float>(), &context_);
  } else if (sum_multiplier_.numel() != D) {
    sum_multiplier_.Resize(D);
    math::Set<float, CPUContext>(
        D, 1.f, sum_multiplier_.mutable_data<float>(), &context_);
  }

  auto* dX = Output(0, Y.sizes(), at::dtype<float>());
  const float* Ydata = Y.data<float>();
  const float* dYdata = dY.data<float>();
  float* dXdata = dX->mutable_data<float>();
  if (N == 0 || D == 0) {
    return true;
  }
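  // Softmax gradient: for each row i,
  //   dX_i = Y_i * (dY_i - <Y_i, dY_i>),
  // where <.,.> is the dot product over the D elements of the row and the outer
  // multiplication by Y_i is elementwise. Start from a copy of dY.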
  context_.CopySameDevice<float>(Y.numel(), dYdata, dXdata);
  float* scaledata = scale_.mutable_data<float>();
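  // scale_[i] = <Y_i, dY_i>, the dot product of row i of Y with row i of dY.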
  for (int i = 0; i < N; ++i) {
    math::Dot<float, CPUContext>(
        D, Ydata + i * D, dYdata + i * D, scaledata + i, &context_);
  }
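  // Rank-1 update dX -= scale * 1^T: subtracts scale_[i] from every element of
  // row i (alpha = -1 scales the (N x 1) * (1 x D) product, beta = 1 keeps dX).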
  math::Gemm<float, CPUContext>(
      CblasNoTrans,
      CblasNoTrans,
      N,
      D,
      1,
      -1,
      scaledata,
      sum_multiplier_.data<float>(),
      1,
      dXdata,
      &context_);
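  // Finish with the elementwise product by Y: dX = Y * (dY - scale).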
  math::Mul<float, CPUContext>(Y.numel(), dXdata, Ydata, dXdata, &context_);
  return true;
}

REGISTER_CPU_OPERATOR(Softmax, SoftmaxOp<float, CPUContext>);
REGISTER_CPU_GRADIENT_OPERATOR(
    SoftmaxGradient,
    SoftmaxGradientOp<float, CPUContext>);

OPERATOR_SCHEMA(Softmax)
    .NumInputs(1)
    .NumOutputs(1)
    .IdenticalTypeAndShape()
    .SetDoc(R"DOC(

Applies the Softmax function to an n-dimensional input Tensor, rescaling it so
that the elements of the n-dimensional output Tensor lie in the range (0,1) and
sum to 1. The softmax operator is typically the last layer in a classifier network,
as its output can be interpreted as the confidence (probability) of the input
belonging to each class. The input is treated as a 2-D tensor of size (batch_size x
input_feature_dimensions). The output tensor has the same shape and contains the
softmax-normalized values of the corresponding input. The softmax function is
defined as follows:

$$softmax(x_i) = \frac{\exp(x_i)}{\sum_{j} \exp(x_j)}$$

The input does not need to be an explicitly 2D matrix; rather, it will be coerced
into one. For an arbitrary n-dimensional tensor `X` of shape
$[a_0, a_1, ..., a_{k-1}, a_k, ..., a_{n-1}]$, where $k$ is the `axis` provided,
`X` will be coerced into a 2-dimensional tensor with dimensions
$[(a_0 * ... * a_{k-1}), (a_k * ... * a_{n-1})]$. For the default case where
`axis`=1, the `X` tensor will be coerced into a 2D tensor of dimensions
$[a_0, (a_1 * ... * a_{n-1})]$, where $a_0$ is often the batch size. In this
situation, we must have $a_0 = N$ and $a_1 * ... * a_{n-1} = D$. Each of these
dimensions must match; otherwise the operator will throw errors.
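
For illustration (a sketch using the same Python workspace API as the example
below; the shape and `axis` value are chosen arbitrarily), an input of shape
$[2, 3, 4, 5]$ with `axis`=2 is coerced into a $6 \times 20$ matrix, so each of
the 6 coerced rows of the output sums to 1:

```
from caffe2.python import core, workspace
import numpy as np

workspace.FeedBlob("X", np.random.randn(2, 3, 4, 5).astype(np.float32))
op = core.CreateOperator("Softmax", ["X"], ["Y"], axis=2)
workspace.RunOperatorOnce(op)
Y = workspace.FetchBlob("Y")
print(Y.reshape(6, 20).sum(axis=1))  # each of the 6 sums is ~1.0
```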

Github Links:

- https://github.com/pytorch/pytorch/blob/master/caffe2/operators/softmax_op.h
- https://github.com/pytorch/pytorch/blob/master/caffe2/operators/softmax_op.cc


<details>

<summary> <b>Example</b> </summary>

**Code**

```
from caffe2.python import core, workspace
import numpy as np

workspace.ResetWorkspace()

op = core.CreateOperator(
    "Softmax",
    ["X"],
    ["Y"]
)

workspace.FeedBlob("X", np.random.randn(1, 5).astype(np.float32))
print("input:", workspace.FetchBlob("X"))
workspace.RunOperatorOnce(op)
print("softmax:", workspace.FetchBlob("Y"))

```

**Result**

```
input: [[ 0.0417839 0.61960053 -0.23150268 -0.64389366 -3.0000346 ]]
softmax: [[0.24422921 0.43525138 0.18582782 0.12303016 0.01166145]]

```

</details>



)DOC")
    .Arg(
        "axis",
        "*(type: int; default: 1)* Axis of the input along which it is coerced into a 2D matrix, as described above.")
    .Input(
        0,
        "X",
        "*(type: Tensor`<float>`)* Input tensor that's coerced into a 2D matrix of size (NxD) as described above.")
    .Output(
        0,
        "Y",
        "*(type: Tensor`<float>`)* The softmax-normalized output tensor with the same shape as the input tensor.")
    .InheritOnnxSchema();

// Input: Y, dY. Output: dX
GRADIENT_OPERATOR_SCHEMA(SoftmaxGradient).NumInputs(2).NumOutputs(1);

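// The gradient operator consumes the forward output Y (O(0)) and the gradient of
// the output dY (GO(0)), and produces the gradient of the original input (GI(0)).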
class GetSoftmaxGradient : public GradientMakerBase {
  using GradientMakerBase::GradientMakerBase;
  vector<OperatorDef> GetGradientDefs() override {
    return SingleGradientDef(
        def_.type() + "Gradient",
        "",
        vector<string>{O(0), GO(0)},
        vector<string>{GI(0)});
  }
};
REGISTER_GRADIENT(Softmax, GetSoftmaxGradient);
REGISTER_GRADIENT(SoftmaxFp16, GetSoftmaxGradient);

} // namespace caffe2