| #include "caffe2/operators/softmax_op.h" |
| |
| #include "caffe2/operators/softmax_utils.h" |
| |
| namespace caffe2 { |
| |
| // Implementation for the CPU context. |
| template <> |
| bool SoftmaxOp<float, CPUContext>::RunOnDevice() { |
| const auto& X = Input(0); |
| const int canonical_axis = X.canonical_axis_index(axis_); |
| const int N = X.size_to_dim(canonical_axis); |
| const int D = X.size_from_dim(canonical_axis); |
| auto* Y = Output(0, X.sizes(), at::dtype<float>()); |
| const float* X_data = X.data<float>(); |
| float* Y_data = Y->mutable_data<float>(); |
| if (N == 0 || D == 0) { |
| return true; |
| } |
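  // Lazily allocate (and resize as needed) the per-row scratch buffer passed to
  // the softmax kernel; it is a member tensor and persists across runs.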
  if (!scale_.defined()) {
    scale_ = caffe2::empty({N}, at::dtype<float>().device(CPU));
  } else if (scale_.numel() != N) {
    scale_.Resize(N);
  }
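  // The third argument toggles log-softmax; `false` here computes the standard
  // softmax, writing the result into Y_data and using scale_ as scratch space.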
  softmax_utils::SoftmaxCPU<float>(
      N, D, false, X_data, Y_data, scale_.mutable_data<float>(), &context_);
  return true;
}

// Implementation for the CPU context.
template <>
bool SoftmaxGradientOp<float, CPUContext>::RunOnDevice() {
  auto& Y = Input(0);
  auto& dY = Input(1);

  const auto canonical_axis = Y.canonical_axis_index(axis_);
  const int64_t N = Y.size_to_dim(canonical_axis);
  const int64_t D = Y.size_from_dim(canonical_axis);
  // First, allocate (or resize) scratch for the per-row dot products <Y_i, dY_i>.
  if (!scale_.defined()) {
    scale_ = caffe2::empty({N}, at::dtype<float>().device(CPU));
  } else if (scale_.numel() != N) {
    scale_.Resize(N);
  }

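  // sum_multiplier_ is a length-D vector of ones; it is used below to broadcast
  // each per-row scale across the D columns of that row via a rank-1 Gemm.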
  if (!sum_multiplier_.defined()) {
    sum_multiplier_ = caffe2::empty({D}, at::dtype<float>().device(CPU));
    math::Set<float, CPUContext>(
        D, 1.f, sum_multiplier_.mutable_data<float>(), &context_);
  } else if (sum_multiplier_.numel() != D) {
    sum_multiplier_.Resize(D);
    math::Set<float, CPUContext>(
        D, 1.f, sum_multiplier_.mutable_data<float>(), &context_);
  }

  auto* dX = Output(0, Y.sizes(), at::dtype<float>());
  const float* Ydata = Y.data<float>();
  const float* dYdata = dY.data<float>();
  float* dXdata = dX->mutable_data<float>();
  if (N == 0 || D == 0) {
    return true;
  }
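  // Softmax gradient: for each row i,
  //   dX_i = Y_i * (dY_i - <Y_i, dY_i>),
  // where <.,.> is the dot product over the D elements of the row and the outer
  // multiplication by Y_i is elementwise. Start from a copy of dY.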
  context_.CopySameDevice<float>(Y.numel(), dYdata, dXdata);
  float* scaledata = scale_.mutable_data<float>();
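  // scale_[i] = <Y_i, dY_i>, the dot product of row i of Y with row i of dY.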
  for (int i = 0; i < N; ++i) {
    math::Dot<float, CPUContext>(
        D, Ydata + i * D, dYdata + i * D, scaledata + i, &context_);
  }
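  // Rank-1 update dX -= scale * 1^T: subtracts scale_[i] from every element of
  // row i (alpha = -1 scales the (N x 1) * (1 x D) product, beta = 1 keeps dX).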
  math::Gemm<float, CPUContext>(
      CblasNoTrans,
      CblasNoTrans,
      N,
      D,
      1,
      -1,
      scaledata,
      sum_multiplier_.data<float>(),
      1,
      dXdata,
      &context_);
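  // Finish with the elementwise product by Y: dX = Y * (dY - scale).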
  math::Mul<float, CPUContext>(Y.numel(), dXdata, Ydata, dXdata, &context_);
  return true;
}

REGISTER_CPU_OPERATOR(Softmax, SoftmaxOp<float, CPUContext>);
REGISTER_CPU_GRADIENT_OPERATOR(
    SoftmaxGradient,
    SoftmaxGradientOp<float, CPUContext>);

OPERATOR_SCHEMA(Softmax)
    .NumInputs(1)
    .NumOutputs(1)
    .IdenticalTypeAndShape()
    .SetDoc(R"DOC(

Applies the Softmax function to an n-dimensional input Tensor, rescaling it so
that the elements of the n-dimensional output Tensor lie in the range (0,1) and
sum to 1. The softmax operator is typically the last layer in a classifier network,
as its output can be interpreted as the confidence (probability) of the input
belonging to each class. The input is treated as a 2-D tensor of size (batch_size x
input_feature_dimensions). The output tensor has the same shape and contains the
softmax-normalized values of the corresponding input. The softmax function is
defined as follows:

$$softmax(x_i) = \frac{\exp(x_i)}{\sum_{j} \exp(x_j)}$$

The input does not need to be an explicitly 2D matrix; rather, it will be coerced
into one. For an arbitrary n-dimensional tensor `X` of shape
$[a_0, a_1, ..., a_{k-1}, a_k, ..., a_{n-1}]$, where $k$ is the `axis` provided,
`X` will be coerced into a 2-dimensional tensor with dimensions
$[(a_0 * ... * a_{k-1}), (a_k * ... * a_{n-1})]$. For the default case where
`axis`=1, the `X` tensor will be coerced into a 2D tensor of dimensions
$[a_0, (a_1 * ... * a_{n-1})]$, where $a_0$ is often the batch size. In this
situation, we must have $a_0 = N$ and $a_1 * ... * a_{n-1} = D$. Each of these
dimensions must match; otherwise the operator will throw errors.
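
For illustration (a sketch using the same Python workspace API as the example
below; the shape and `axis` value are chosen arbitrarily), an input of shape
$[2, 3, 4, 5]$ with `axis`=2 is coerced into a $6 \times 20$ matrix, so each of
the 6 coerced rows of the output sums to 1:

```
from caffe2.python import core, workspace
import numpy as np

workspace.FeedBlob("X", np.random.randn(2, 3, 4, 5).astype(np.float32))
op = core.CreateOperator("Softmax", ["X"], ["Y"], axis=2)
workspace.RunOperatorOnce(op)
Y = workspace.FetchBlob("Y")
print(Y.reshape(6, 20).sum(axis=1))  # each of the 6 sums is ~1.0
```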

Github Links:

- https://github.com/pytorch/pytorch/blob/master/caffe2/operators/softmax_op.h
- https://github.com/pytorch/pytorch/blob/master/caffe2/operators/softmax_op.cc


<details>

<summary> <b>Example</b> </summary>

**Code**

```
from caffe2.python import core, workspace
import numpy as np

workspace.ResetWorkspace()

op = core.CreateOperator(
    "Softmax",
    ["X"],
    ["Y"]
)

workspace.FeedBlob("X", np.random.randn(1, 5).astype(np.float32))
print("input:", workspace.FetchBlob("X"))
workspace.RunOperatorOnce(op)
print("softmax:", workspace.FetchBlob("Y"))

```

**Result**

```
input: [[ 0.0417839 0.61960053 -0.23150268 -0.64389366 -3.0000346 ]]
softmax: [[0.24422921 0.43525138 0.18582782 0.12303016 0.01166145]]

```

</details>



)DOC")
    .Arg(
        "axis",
        "*(type: int; default: 1)* Axis of the input along which it is coerced into a 2D matrix, as described above.")
    .Input(
        0,
        "X",
        "*(type: Tensor`<float>`)* Input tensor that's coerced into a 2D matrix of size (NxD) as described above.")
    .Output(
        0,
        "Y",
        "*(type: Tensor`<float>`)* The softmax-normalized output tensor with the same shape as the input tensor.")
    .InheritOnnxSchema();

// Input: Y, dY. Output: dX
GRADIENT_OPERATOR_SCHEMA(SoftmaxGradient).NumInputs(2).NumOutputs(1);

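// The gradient operator consumes the forward output Y (O(0)) and the gradient of
// the output dY (GO(0)), and produces the gradient of the original input (GI(0)).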
class GetSoftmaxGradient : public GradientMakerBase {
  using GradientMakerBase::GradientMakerBase;
  vector<OperatorDef> GetGradientDefs() override {
    return SingleGradientDef(
        def_.type() + "Gradient",
        "",
        vector<string>{O(0), GO(0)},
        vector<string>{GI(0)});
  }
};
REGISTER_GRADIENT(Softmax, GetSoftmaxGradient);
REGISTER_GRADIENT(SoftmaxFp16, GetSoftmaxGradient);

} // namespace caffe2