| #include "caffe2/operators/cross_entropy_op.h" |
| #include "caffe2/utils/eigen_utils.h" |
| |
| namespace caffe2 { |
| |
| namespace { |
| |
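| // The helpers below compute element-wise binary cross entropy directly from |
| // logits. sigmoid_xent_forward(lgt, tgt) returns |
| //   tgt * log(sigmoid(lgt)) + (1 - tgt) * log(1 - sigmoid(lgt)) |
| //     = tgt * lgt - log(1 + exp(lgt)), |
| // rewritten so that exp() is only ever evaluated at a non-positive argument |
| // and therefore cannot overflow. |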
| inline float sigmoid_xent_forward(float lgt, float tgt) { |
| // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions) |
| return lgt * (tgt - (lgt >= 0)) - log(1 + exp(lgt - 2 * lgt * (lgt >= 0))); |
| } |
| |
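| // Derivative of sigmoid_xent_forward with respect to the logit: |
| //   d/d(lgt) [tgt * lgt - log(1 + exp(lgt))] = tgt - sigmoid(lgt). |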
| inline float sigmoid_xent_backward(float lgt, float tgt) { |
| // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions) |
| return tgt - 1. / (1. + exp(-lgt)); |
| } |
| |
| inline float sigmoid_partition(float lgt) { |
| // Computes log(1 + exp(lgt)), arranged so that exp() is only evaluated at |
| // non-positive arguments, avoiding overflow for large positive lgt. |
| // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions) |
| return lgt * (lgt >= 0) + log(1 + exp(lgt - 2 * lgt * (lgt >= 0))); |
| } |
| |
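| // "Log D trick" variant of the loss (see the log_D_trick arg in the |
| // SigmoidCrossEntropyWithLogits schema below). The forward pass computes |
| //   (2 * tgt - 1) * log(sigmoid(lgt)), |
| // using lgt - sigmoid_partition(lgt) == log(sigmoid(lgt)); the backward pass |
| // returns its derivative, (2 * tgt - 1) * (1 - sigmoid(lgt)). |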
| inline float sigmoid_xent_forward_with_log_d_trick(float lgt, float tgt) { |
| // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions) |
| return (2 * tgt - 1.) * (lgt - sigmoid_partition(lgt)); |
| } |
| |
| inline float sigmoid_xent_backward_with_log_d_trick(float lgt, float tgt) { |
| // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions) |
| return (2 * tgt - 1.) / (1. + exp(lgt)); |
| } |
| |
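| // "Unjoined" logistic-regression loss (see the unjoined_lr_loss arg in the |
| // SigmoidCrossEntropyWithLogits schema below), for datasets where examples |
| // currently labeled negative may later reappear as positives. The forward |
| // pass computes |
| //   tgt * lgt - (1 - tgt) * log(1 + exp(lgt)), |
| // again arranged so exp() only sees non-positive arguments; the backward |
| // pass returns its derivative, tgt - (1 - tgt) * sigmoid(lgt). |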
| inline float unjoined_sigmoid_xent_forward(float lgt, float tgt) { |
| // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions) |
| return lgt * tgt + (tgt - 1) * lgt * (lgt >= 0) - |
| // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions) |
| (1 - tgt) * log(1 + exp(lgt - 2 * lgt * (lgt >= 0))); |
| } |
| |
| inline float unjoined_sigmoid_xent_backward(float lgt, float tgt) { |
| // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions) |
| return tgt - (1. - tgt) / (1. + exp(-lgt)); |
| } |
| |
| } // namespace |
| |
| template <> |
| bool LabelCrossEntropyOp<float, CPUContext>::RunOnDevice() { |
| auto& X = Input(0); |
| auto& label = Input(1); |
| |
| // NOLINTNEXTLINE(cppcoreguidelines-init-variables) |
| int N, D; |
| if (X.dim() > 1) { |
| N = X.dim32(0); |
| D = X.size_from_dim(1); |
| } else { |
| N = 1; |
| D = X.dim32(0); |
| } |
| CAFFE_ENFORCE( |
| (label.dim() == 1) || (label.dim() == 2 && label.dim32(1) == 1)); |
| CAFFE_ENFORCE_EQ(label.dim32(0), N); |
| auto* Y = Output(0, {N}, at::dtype<float>()); |
| const auto* Xdata = X.data<float>(); |
| const auto* labelData = label.data<int>(); |
| auto* Ydata = Y->template mutable_data<float>(); |
| CAFFE_ENFORCE( |
| (ConstEigenVectorArrayMap<int>(labelData, N) < D).all() && |
| (ConstEigenVectorArrayMap<int>(labelData, N) >= 0).all(), |
| "Label seems to be outside of supported range. Supported labels are in " |
| "range [0,", |
| D, |
| ")"); |
| for (int i = 0; i < N; ++i) { |
| Ydata[i] = -log(std::max(Xdata[i * D + labelData[i]], kLOG_THRESHOLD())); |
| } |
| return true; |
| } |
| |
| template <> |
| bool SigmoidCrossEntropyWithLogitsOp<float, CPUContext>::RunOnDevice() { |
| auto& logits = Input(0); |
| auto& targets = Input(1); |
| CAFFE_ENFORCE_EQ(logits.sizes(), targets.sizes()); |
| const auto inner_size = logits.dim() > 0 ? logits.sizes().back() : 1; |
| const auto outer_size = logits.numel() / inner_size; |
| |
| std::vector<int64_t> dims; |
| if (logits.dim() != 0) { |
| dims = |
| std::vector<int64_t>(logits.sizes().begin(), logits.sizes().end() - 1); |
| } |
| auto* out = Output(0, dims, at::dtype<float>()); |
| auto* out_ptr = out->template mutable_data<float>(); |
| |
| auto* logits_ptr = logits.data<float>(); |
| auto* targets_ptr = targets.data<float>(); |
| |
| auto in_idx = 0; |
| for (int i = 0; i < outer_size; ++i) { |
| float value = 0; |
| for (int j = 0; j < inner_size; ++j) { |
| if (unjoined_lr_loss_) { |
| value += unjoined_sigmoid_xent_forward( |
| logits_ptr[in_idx], targets_ptr[in_idx]); |
| } else { |
| value += |
| (log_D_trick_ ? sigmoid_xent_forward_with_log_d_trick( |
| logits_ptr[in_idx], targets_ptr[in_idx]) |
| : sigmoid_xent_forward( |
| logits_ptr[in_idx], targets_ptr[in_idx])); |
| } |
| ++in_idx; |
| } |
| out_ptr[i] = -value / inner_size; |
| } |
| return true; |
| } |
| |
| template <> |
| bool SigmoidCrossEntropyWithLogitsGradientOp<float, CPUContext>::RunOnDevice() { |
| auto& g = Input(0); |
| auto& logits = Input(1); |
| auto& targets = Input(2); |
| CAFFE_ENFORCE(logits.sizes() == targets.sizes()); |
| const auto inner_size = logits.dim() > 0 ? logits.sizes().back() : 1; |
| const auto outer_size = logits.numel() / inner_size; |
| CAFFE_ENFORCE(g.numel() == outer_size); |
| |
| auto* out = Output(0, logits.sizes(), at::dtype<float>()); |
| auto* out_ptr = out->template mutable_data<float>(); |
| |
| auto* logits_ptr = logits.data<float>(); |
| auto* targets_ptr = targets.data<float>(); |
| auto* g_ptr = g.data<float>(); |
| |
| auto in_idx = 0; |
| for (int i = 0; i < outer_size; ++i) { |
| auto g_factor = -g_ptr[i] / inner_size; |
| for (int j = 0; j < inner_size; ++j) { |
| if (unjoined_lr_loss_) { |
| out_ptr[in_idx] = g_factor * |
| unjoined_sigmoid_xent_backward( |
| logits_ptr[in_idx], targets_ptr[in_idx]); |
| } else { |
| out_ptr[in_idx] = g_factor * |
| (log_D_trick_ ? sigmoid_xent_backward_with_log_d_trick( |
| logits_ptr[in_idx], targets_ptr[in_idx]) |
| : sigmoid_xent_backward( |
| logits_ptr[in_idx], targets_ptr[in_idx])); |
| } |
| ++in_idx; |
| } |
| } |
| return true; |
| } |
| |
| template <> |
| bool WeightedSigmoidCrossEntropyWithLogitsOp<float, CPUContext>::RunOnDevice() { |
| auto& logits = Input(0); |
| auto& targets = Input(1); |
| auto& weights = Input(2); |
| CAFFE_ENFORCE(logits.sizes() == targets.sizes()); |
| CAFFE_ENFORCE(weights.sizes() == targets.sizes()); |
| const auto inner_size = logits.dim() > 0 ? logits.sizes().back() : 1; |
| const auto outer_size = logits.numel() / inner_size; |
| |
| std::vector<int64_t> dims; |
| if (logits.dim() != 0) { |
| dims = |
| std::vector<int64_t>(logits.sizes().begin(), logits.sizes().end() - 1); |
| } |
| |
| auto* out = Output(0, dims, at::dtype<float>()); |
| auto* out_ptr = out->template mutable_data<float>(); |
| |
| auto* logits_ptr = logits.data<float>(); |
| auto* targets_ptr = targets.data<float>(); |
| auto* weights_ptr = weights.data<float>(); |
| |
| auto in_idx = 0; |
| for (int i = 0; i < outer_size; ++i) { |
| float value = 0; |
| for (int j = 0; j < inner_size; ++j) { |
| value += sigmoid_xent_forward(logits_ptr[in_idx], targets_ptr[in_idx]) * |
| weights_ptr[in_idx]; |
| ++in_idx; |
| } |
| out_ptr[i] = -value / inner_size; |
| } |
| return true; |
| } |
| |
| template <> |
| bool WeightedSigmoidCrossEntropyWithLogitsGradientOp<float, CPUContext>:: |
| RunOnDevice() { |
| auto& g = Input(0); |
| auto& logits = Input(1); |
| auto& targets = Input(2); |
| auto& weights = Input(3); |
| CAFFE_ENFORCE(logits.sizes() == targets.sizes()); |
| CAFFE_ENFORCE(weights.sizes() == targets.sizes()); |
| const auto inner_size = logits.dim() > 0 ? logits.sizes().back() : 1; |
| const auto outer_size = logits.numel() / inner_size; |
| CAFFE_ENFORCE(g.numel() == outer_size); |
| |
| auto* out = Output(0, logits.sizes(), at::dtype<float>()); |
| auto* out_ptr = out->template mutable_data<float>(); |
| |
| auto* logits_ptr = logits.data<float>(); |
| auto* targets_ptr = targets.data<float>(); |
| auto* weights_ptr = weights.data<float>(); |
| auto* g_ptr = g.data<float>(); |
| |
| auto in_idx = 0; |
| for (int i = 0; i < outer_size; ++i) { |
| auto g_factor = -g_ptr[i] / inner_size; |
| for (int j = 0; j < inner_size; ++j) { |
| out_ptr[in_idx] = g_factor * |
| sigmoid_xent_backward(logits_ptr[in_idx], targets_ptr[in_idx]) * |
| weights_ptr[in_idx]; |
| ++in_idx; |
| } |
| } |
| return true; |
| } |
| |
| template <> |
| bool LabelCrossEntropyGradientOp<float, CPUContext>::RunOnDevice() { |
| auto& X = Input(0); |
| auto& label = Input(1); |
| auto& dY = Input(2); |
| |
| // NOLINTNEXTLINE(cppcoreguidelines-init-variables) |
| int N, D; |
| if (X.dim() > 1) { |
| N = X.dim32(0); |
| D = X.size_from_dim(1); |
| } else { |
| N = 1; |
| D = X.dim32(0); |
| } |
| CAFFE_ENFORCE( |
| (label.dim() == 1) || (label.dim() == 2 && label.dim32(1) == 1)); |
| CAFFE_ENFORCE_EQ(label.dim32(0), N); |
| CAFFE_ENFORCE_EQ(dY.dim(), 1); |
| CAFFE_ENFORCE_EQ(dY.dim32(0), N); |
| auto* dX = Output(0, X.sizes(), at::dtype<float>()); |
| math::Set<float, CPUContext>( |
| dX->numel(), 0.f, dX->template mutable_data<float>(), &context_); |
| const float* Xdata = X.data<float>(); |
| const float* dYdata = dY.data<float>(); |
| const int* labelData = label.data<int>(); |
| float* dXdata = dX->template mutable_data<float>(); |
| for (int i = 0; i < N; ++i) { |
| dXdata[i * D + labelData[i]] = |
| -dYdata[i] / std::max(Xdata[i * D + labelData[i]], kLOG_THRESHOLD()); |
| } |
| return true; |
| } |
| |
| template <> |
| bool MakeTwoClassOp<float, CPUContext>::RunOnDevice() { |
| auto& X = Input(0); |
| |
| auto shape = X.sizes().vec(); |
| shape.push_back(2); |
| int64_t N = X.numel(); |
| auto* Y = Output(0, shape, at::dtype<float>()); |
| const auto* Xdata = X.data<float>(); |
| auto* Ydata = Y->template mutable_data<float>(); |
| for (int64_t i = 0; i < N; ++i) { |
| TORCH_DCHECK_GE(Xdata[i], 0.0); |
| TORCH_DCHECK_LE(Xdata[i], 1.0); |
| Ydata[i * 2] = 1.0 - Xdata[i]; |
| Ydata[i * 2 + 1] = Xdata[i]; |
| } |
| return true; |
| } |
| |
| template <> |
| bool MakeTwoClassGradientOp<float, CPUContext>::RunOnDevice() { |
| auto& dY = Input(0); |
| |
| auto shape = dY.sizes().vec(); |
| CAFFE_ENFORCE_GE(shape.size(), 1); |
| CAFFE_ENFORCE_EQ(shape.back(), 2); |
| shape.pop_back(); |
| auto* dX = Output(0, shape, at::dtype<float>()); |
| const float* dYdata = dY.data<float>(); |
| float* dXdata = dX->template mutable_data<float>(); |
| int64_t N = dX->numel(); |
| // TODO: consider vectorizing this loop with Eigen. |
| for (int64_t i = 0; i < N; ++i) { |
| dXdata[i] = dYdata[i * 2 + 1] - dYdata[i * 2]; |
| } |
| return true; |
| } |
| |
| template <> |
| bool CrossEntropyOp<float, CPUContext>::RunOnDevice() { |
| auto& X = Input(0); |
| auto& label = Input(1); |
| |
| // NOLINTNEXTLINE(cppcoreguidelines-init-variables) |
| int N, D; |
| if (X.dim() > 1) { |
| N = X.dim32(0); |
| D = X.size_from_dim(1); |
| } else { |
| N = 1; |
| D = X.dim32(0); |
| } |
| CAFFE_ENFORCE( |
| (label.dim() == 1) || (label.dim() == 2 && label.dim32(1) == D)); |
| CAFFE_ENFORCE_EQ(label.dim32(0), N); |
| auto* Y = Output(0, vector<int64_t>{N}, at::dtype<float>()); |
| const float* Xdata = X.data<float>(); |
| const float* labelData = label.data<float>(); |
| auto* Ydata = Y->template mutable_data<float>(); |
| CAFFE_ENFORCE( |
| (ConstEigenArrayMap<float>(labelData, D, N) <= 1.0f).all() && |
| (ConstEigenArrayMap<float>(labelData, D, N) >= 0.0f).all(), |
| "Soft label seems incorrect: label value should be a probability ", |
| "between 0 and 1.0. You may be using the wrong cross entropy operator; ", |
| "use LabelCrossEntropy if the labels are integers whose values are at ", |
| "most the number of classes, ", |
| D, |
| "."); |
| EigenArrayMap<float>(Ydata, 1, N) = |
| -(ConstEigenArrayMap<float>(labelData, D, N) * |
| ConstEigenArrayMap<float>(Xdata, D, N).cwiseMax(kLOG_THRESHOLD()).log()) |
| .colwise() |
| .sum(); |
| return true; |
| } |
| |
| template <> |
| bool CrossEntropyGradientOp<float, CPUContext>::RunOnDevice() { |
| auto& X = Input(0); |
| auto& label = Input(1); |
| auto& dY = Input(2); |
| |
| // NOLINTNEXTLINE(cppcoreguidelines-init-variables) |
| int N, D; |
| if (X.dim() > 1) { |
| N = X.dim32(0); |
| D = X.size_from_dim(1); |
| } else { |
| N = 1; |
| D = X.dim32(0); |
| } |
| CAFFE_ENFORCE( |
| (label.dim() == 1) || (label.dim() == 2 && label.dim32(1) == D)); |
| CAFFE_ENFORCE_EQ(label.dim32(0), N); |
| CAFFE_ENFORCE_EQ(dY.dim(), 1); |
| CAFFE_ENFORCE_EQ(dY.dim32(0), N); |
| auto* dX = Output(0, X.sizes(), at::dtype<float>()); |
| math::Set<float, CPUContext>( |
| dX->numel(), 0.f, dX->template mutable_data<float>(), &context_); |
| const float* Xdata = X.data<float>(); |
| const float* dYdata = dY.data<float>(); |
| const float* labelData = label.data<float>(); |
| float* dXdata = dX->template mutable_data<float>(); |
| EigenArrayMap<float>(dXdata, D, N) = |
| (ConstEigenArrayMap<float>(labelData, D, N) / |
| ConstEigenArrayMap<float>(Xdata, D, N).cwiseMax(kLOG_THRESHOLD())) |
| .rowwise() * |
| (-ConstEigenVectorArrayMap<float>(dYdata, N).transpose()); |
| return true; |
| } |
| |
| REGISTER_CPU_OPERATOR( |
| LabelCrossEntropy, |
| LabelCrossEntropyOp<float, CPUContext>); |
| REGISTER_CPU_OPERATOR( |
| LabelCrossEntropyGradient, |
| LabelCrossEntropyGradientOp<float, CPUContext>); |
| |
| OPERATOR_SCHEMA(LabelCrossEntropy) |
| .NumInputs(2) |
| .NumOutputs(1) |
| .IdenticalTypeAndShapeOfInputDim(0, 0) |
| .SetDoc(R"DOC( |
| This operator computes the cross entropy between a $NxD$ dimensional input data tensor $X$ and a one dimensional input label tensor $label$. The op produces a single length $N$ output tensor $Y$. Here, $N$ is considered the batch size and $D$ is the size of each element in the batch. In practice, it is most commonly used at the end of models as a part of the loss computation, after the SoftMax operator and before the AveragedLoss operator. The cross entropy operation is defined as follows |
| |
| $$Y_i = -log(X_{ij})$$ |
| |
| where $i$ indexes the examples in the batch (of size $N$) and $j = label_i$ is the correct class for example $i$, so $X_{ij}$ is the classifier's predicted probability of the correct class. Each log has a lower bound for numerical stability. |
| |
| The difference between *LabelCrossEntropy* and *CrossEntropy* is how the labels are specified. Here, the labels are a length $N$ list of integers, whereas in CrossEntropy the labels are a $NxD$ dimensional matrix of one-hot label vectors. However, the results of the computation should be the same, as the examples for the two operators illustrate. |
| |
| Github Links: |
| - https://github.com/caffe2/caffe2/blob/master/caffe2/operators/cross_entropy_op.h |
| - https://github.com/caffe2/caffe2/blob/master/caffe2/operators/cross_entropy_op.cc |
| |
| <details> |
| |
| <summary> <b>Example</b> </summary> |
| |
| **Code** |
| |
| ``` |
| |
| from caffe2.python import core, workspace |
| import numpy as np |
| |
| workspace.ResetWorkspace() |
| |
| op = core.CreateOperator( |
| "LabelCrossEntropy", |
| ["X", "label"], |
| ["Y"] |
| ) |
| |
| # Create X: sample softmax output for a 5-class model |
| X = np.array([[.01, .05, .02, .02, .9],[.03, .1, .42, .05, .4]]) |
| print("X:\n",X) |
| |
| # Create label: sample integer ground-truth class indices |
| label = np.array([4,2]) |
| print("label:\n",label) |
| |
| # Feed X & label into workspace |
| workspace.FeedBlob("X", X.astype(np.float32)) |
| workspace.FeedBlob("label", label.astype(np.int32)) |
| |
| # Run op |
| workspace.RunOperatorOnce(op) |
| |
| # Collect output |
| print("Y:\n", workspace.FetchBlob("Y")) |
| |
| ``` |
| |
| **Result** |
| |
| ``` |
| |
| X: |
| [[0.01 0.05 0.02 0.02 0.9 ] |
| [0.03 0.1 0.42 0.05 0.4 ]] |
| label: |
| [4 2] |
| Y: |
| [0.10536055 0.8675006 ] |
| |
| ``` |
| |
| </details> |
| |
| |
| )DOC") |
| .Input( |
| 0, |
| "X", |
| "Input tensor which is almost always the result of a softmax operation. $X$ is a 2D array of size $NxD$, where $N$ is the batch size and $D$ is the number of classes.") |
| .Input( |
| 1, |
| "label", |
| "Blob containing the labels used to compare the input. $label$ is a length $N$ list of integers, where each element is the integer label for the $n$th element of the batch.") |
| .Output( |
| 0, |
| "Y", |
| "Output blob from the cross entropy computation. $Y$ is 1D length $N$ tensor."); |
| OPERATOR_SCHEMA(LabelCrossEntropyGradient).NumInputs(3).NumOutputs(1); |
| |
| class GetLabelCrossEntropyGradient : public GradientMakerBase { |
| using GradientMakerBase::GradientMakerBase; |
| vector<OperatorDef> GetGradientDefs() override { |
| return SingleGradientDef( |
| "LabelCrossEntropyGradient", |
| "", |
| vector<string>{I(0), I(1), GO(0)}, |
| vector<string>{GI(0)}); |
| } |
| }; |
| REGISTER_GRADIENT(LabelCrossEntropy, GetLabelCrossEntropyGradient); |
| |
| REGISTER_CPU_OPERATOR(MakeTwoClass, MakeTwoClassOp<float, CPUContext>); |
| REGISTER_CPU_OPERATOR( |
| MakeTwoClassGradient, |
| MakeTwoClassGradientOp<float, CPUContext>); |
| |
| REGISTER_CPU_OPERATOR( |
| SigmoidCrossEntropyWithLogits, |
| SigmoidCrossEntropyWithLogitsOp<float, CPUContext>); |
| REGISTER_CPU_OPERATOR( |
| SigmoidCrossEntropyWithLogitsGradient, |
| SigmoidCrossEntropyWithLogitsGradientOp<float, CPUContext>); |
| |
| REGISTER_CPU_OPERATOR( |
| WeightedSigmoidCrossEntropyWithLogits, |
| WeightedSigmoidCrossEntropyWithLogitsOp<float, CPUContext>); |
| REGISTER_CPU_OPERATOR( |
| WeightedSigmoidCrossEntropyWithLogitsGradient, |
| WeightedSigmoidCrossEntropyWithLogitsGradientOp<float, CPUContext>); |
| |
| OPERATOR_SCHEMA(MakeTwoClass) |
| .NumInputs(1) |
| .NumOutputs(1) |
| .TensorInferenceFunction([](const OperatorDef& /* unused */, |
| const vector<TensorShape>& in) { |
| vector<TensorShape> out(1); |
| out[0].add_dims(in[0].dims(0)); |
| out[0].add_dims(2); |
| return out; |
| }) |
| .SetDoc(R"DOC( |
| Given a vector of probabilities, this operator transforms it into a 2-column |
| matrix with complementary probabilities for binary classification. In explicit |
| terms, given the vector X, row i of the output Y is [1 - X[i], X[i]]. |
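| |
| A minimal usage sketch (illustrative values; assumes the standard caffe2.python |
| `core`/`workspace` and `numpy` setup used by the examples elsewhere in this file): |
| |
| ``` |
| |
| from caffe2.python import core, workspace |
| import numpy as np |
| |
| op = core.CreateOperator("MakeTwoClass", ["X"], ["Y"]) |
| workspace.FeedBlob("X", np.array([0.1, 0.9, 0.5]).astype(np.float32)) |
| workspace.RunOperatorOnce(op) |
| # Each row of Y is [1 - X[i], X[i]] |
| print("Y:\n", workspace.FetchBlob("Y")) |
| |
| ``` |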
| )DOC") |
| .Input(0, "X", "Input vector of probabilities") |
| .Output( |
| 0, |
| "Y", |
| "2-column matrix with complimentary probabilities of X for " |
| "binary classification"); |
| |
| OPERATOR_SCHEMA(MakeTwoClassGradient).NumInputs(1).NumOutputs(1); |
| |
| OPERATOR_SCHEMA(SigmoidCrossEntropyWithLogits) |
| .Arg("log_D_trick", R"DOC( |
| default is false; if enabled, will use the "log D" trick to avoid vanishing |
| gradients early in training; see Goodfellow et al. (2014) |
| )DOC") |
| .Arg("unjoined_lr_loss", R"DOC( |
| default is false; if enabled, the model is allowed to train on an unjoined |
| dataset, where some examples that are currently labeled as negatives may |
| later appear in the dataset as (true) positive examples. |
| )DOC") |
| .NumInputs(2) |
| .NumOutputs(1) |
| .IdenticalTypeAndShapeOfInputDim(0, 0) |
| .SetDoc(R"DOC( |
| Given two matrices, logits and targets, of the same shape |
| (batch_size, num_classes), computes the sigmoid cross entropy between the two. |
| Returns a tensor of shape (batch_size,) with the per-example loss, averaged over the classes. |
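| |
| A minimal usage sketch (illustrative values; assumes the same caffe2.python |
| `core`/`workspace` and `numpy` setup as the examples elsewhere in this file): |
| |
| ``` |
| |
| from caffe2.python import core, workspace |
| import numpy as np |
| |
| op = core.CreateOperator( |
| "SigmoidCrossEntropyWithLogits", |
| ["logits", "targets"], |
| ["xentropy"] |
| ) |
| |
| # Two examples, three classes: raw (pre-sigmoid) scores and {0, 1} targets |
| logits = np.array([[1.2, -0.5, 0.3], [-2.0, 0.7, 1.5]]) |
| targets = np.array([[1., 0., 0.], [0., 1., 1.]]) |
| workspace.FeedBlob("logits", logits.astype(np.float32)) |
| workspace.FeedBlob("targets", targets.astype(np.float32)) |
| |
| workspace.RunOperatorOnce(op) |
| print("xentropy:\n", workspace.FetchBlob("xentropy"))  # shape (2,) |
| |
| ``` |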
| )DOC") |
| .Input(0, "logits", "matrix of logits for each example and class.") |
| .Input(1, "targets", "matrix of targets, same shape as logits.") |
| .Output(0, "xentropy", "Vector with the total xentropy for each example."); |
| |
| OPERATOR_SCHEMA(SigmoidCrossEntropyWithLogitsGradient) |
| .NumInputs(3) |
| .NumOutputs(1); |
| |
| OPERATOR_SCHEMA(WeightedSigmoidCrossEntropyWithLogits) |
| .NumInputs(3) |
| .NumOutputs(1) |
| .IdenticalTypeAndShapeOfInputDim(0, 0) |
| .SetDoc(R"DOC( |
| Given three matrices: logits, targets, weights, all of the same shape, |
| (batch_size, num_classes), computes the weighted sigmoid cross entropy between |
| logits and targets. Specifically, at each position r,c, this computes |
| weights[r, c] * crossentropy(sigmoid(logits[r, c]), targets[r, c]), and then |
| averages over each row. |
| Returns a tensor of shape (batch_size,) of losses for each example. |
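| |
| A minimal usage sketch (illustrative values; same assumed setup as the |
| SigmoidCrossEntropyWithLogits example above): |
| |
| ``` |
| |
| from caffe2.python import core, workspace |
| import numpy as np |
| |
| op = core.CreateOperator( |
| "WeightedSigmoidCrossEntropyWithLogits", |
| ["logits", "targets", "weights"], |
| ["xentropy"] |
| ) |
| |
| workspace.FeedBlob("logits", np.array([[1.2, -0.5, 0.3], [-2.0, 0.7, 1.5]]).astype(np.float32)) |
| workspace.FeedBlob("targets", np.array([[1., 0., 0.], [0., 1., 1.]]).astype(np.float32)) |
| workspace.FeedBlob("weights", np.array([[1., 0.5, 2.], [1., 1., 0.]]).astype(np.float32)) |
| |
| workspace.RunOperatorOnce(op) |
| print("xentropy:\n", workspace.FetchBlob("xentropy"))  # shape (2,) |
| |
| ``` |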
| )DOC") |
| .Input(0, "logits", "matrix of logits for each example and class.") |
| .Input(1, "targets", "matrix of targets, same shape as logits.") |
| .Input(2, "weights", "matrix of weights, same shape as logits.") |
| .Output(0, "xentropy", "Vector with the total xentropy for each example."); |
| |
| OPERATOR_SCHEMA(WeightedSigmoidCrossEntropyWithLogitsGradient) |
| .NumInputs(4) |
| .NumOutputs(1); |
| |
| struct GetMakeTwoClassGradient : public GradientMakerBase { |
| using GradientMakerBase::GradientMakerBase; |
| vector<OperatorDef> GetGradientDefs() override { |
| return SingleGradientDef( |
| "MakeTwoClassGradient", |
| "", |
| vector<string>{GO(0)}, |
| vector<string>{GI(0)}); |
| } |
| }; |
| REGISTER_GRADIENT(MakeTwoClass, GetMakeTwoClassGradient); |
| |
| struct GetSigmoidCrossEntropyWithLogitsGradient : public GradientMakerBase { |
| using GradientMakerBase::GradientMakerBase; |
| vector<OperatorDef> GetGradientDefs() override { |
| return SingleGradientDef( |
| "SigmoidCrossEntropyWithLogitsGradient", |
| "", |
| vector<string>{GO(0), I(0), I(1)}, |
| vector<string>{GI(0)}); |
| } |
| }; |
| REGISTER_GRADIENT( |
| SigmoidCrossEntropyWithLogits, |
| GetSigmoidCrossEntropyWithLogitsGradient); |
| |
| struct GetWeightedSigmoidCrossEntropyWithLogitsGradient |
| : public GradientMakerBase { |
| using GradientMakerBase::GradientMakerBase; |
| vector<OperatorDef> GetGradientDefs() override { |
| return SingleGradientDef( |
| "WeightedSigmoidCrossEntropyWithLogitsGradient", |
| "", |
| vector<string>{GO(0), I(0), I(1), I(2)}, |
| vector<string>{GI(0)}); |
| } |
| }; |
| REGISTER_GRADIENT( |
| WeightedSigmoidCrossEntropyWithLogits, |
| GetWeightedSigmoidCrossEntropyWithLogitsGradient); |
| |
| REGISTER_CPU_OPERATOR(CrossEntropy, CrossEntropyOp<float, CPUContext>); |
| REGISTER_CPU_OPERATOR( |
| CrossEntropyGradient, |
| CrossEntropyGradientOp<float, CPUContext>); |
| |
| OPERATOR_SCHEMA(CrossEntropy) |
| .NumInputs(2) |
| .NumOutputs(1) |
| .IdenticalTypeAndShapeOfInputDim(0, 0) |
| .SetDoc(R"DOC( |
| This operator computes the cross entropy between a $NxD$ dimensional input data tensor $X$ and a $NxD$ dimensional input label tensor $label$. The op produces a single length $N$ output tensor $Y$. Here, $N$ is considered the batch size and $D$ is the size of each element in the batch. In practice, it is most commonly used at the end of models as a part of the loss computation, after the SoftMax operator and before the AveragedLoss operator. The cross entropy operation is defined as follows |
| |
| $$Y_i = -\sum_j (label_{ij} * log(X_{ij}))$$ |
| |
| where $i$ indexes the examples in the batch (of size $N$) and $j$ indexes the $D$ classes, so $X_{ij}$ is the predicted probability of class $j$ for example $i$ and $label_{ij}$ is the corresponding target probability. Each log has a lower bound for numerical stability. |
| |
| Github Links: |
| - https://github.com/caffe2/caffe2/blob/master/caffe2/operators/cross_entropy_op.h |
| - https://github.com/caffe2/caffe2/blob/master/caffe2/operators/cross_entropy_op.cc |
| |
| <details> |
| |
| <summary> <b>Example</b> </summary> |
| |
| **Code** |
| |
| ``` |
| |
| from caffe2.python import core, workspace |
| import numpy as np |
| |
| workspace.ResetWorkspace() |
| |
| op = core.CreateOperator( |
| "CrossEntropy", |
| ["X", "label"], |
| ["Y"] |
| ) |
| |
| # Create X: sample softmax output for a 5-class model |
| X = np.array([[.01, .05, .02, .02, .9],[.03, .1, .42, .05, .4]]) |
| print("X:\n",X) |
| |
| # Create label: sample one-hot ground truth label vectors |
| label = np.array([[0.,0.,0.,0.,1.],[0.,0.,1.,0.,0.]]) |
| print("label:\n",label) |
| |
| # Feed X & label into workspace |
| workspace.FeedBlob("X", X.astype(np.float32)) |
| workspace.FeedBlob("label", label.astype(np.float32)) |
| |
| # Run op |
| workspace.RunOperatorOnce(op) |
| |
| # Collect output |
| print("Y:\n", workspace.FetchBlob("Y")) |
| |
| ``` |
| |
| **Result** |
| |
| ``` |
| |
| X: |
| [[0.01 0.05 0.02 0.02 0.9 ] |
| [0.03 0.1 0.42 0.05 0.4 ]] |
| label: |
| [[0. 0. 0. 0. 1.] |
| [0. 0. 1. 0. 0.]] |
| Y: |
| [0.10536055 0.8675006 ] |
| |
| ``` |
| |
| </details> |
| |
| |
| )DOC") |
| .Input( |
| 0, |
| "X", |
| "Input tensor which is almost always the result of a softmax operation. $X$ is a 2D array of size $NxD$, where $N$ is the batch size and $D$ is the number of classes.") |
| .Input( |
| 1, |
| "label", |
| "Blob containing the labels used to compare the input. $label$ is the same shape as $X$.") |
| .Output( |
| 0, |
| "Y", |
| "Output blob from the cross entropy computation. $Y$ is 1D length $N$ tensor."); |
| OPERATOR_SCHEMA(CrossEntropyGradient).NumInputs(3).NumOutputs(1); |
| |
| class GetCrossEntropyGradient : public GradientMakerBase { |
| using GradientMakerBase::GradientMakerBase; |
| vector<OperatorDef> GetGradientDefs() override { |
| return SingleGradientDef( |
| "CrossEntropyGradient", |
| "", |
| vector<string>{I(0), I(1), GO(0)}, |
| vector<string>{GI(0)}); |
| } |
| }; |
| REGISTER_GRADIENT(CrossEntropy, GetCrossEntropyGradient); |
| |
| } // namespace caffe2 |