// blob: 3bc659d0033533c8bab3c905d421cdd6ff69ea2d [file] [log] [blame]
#include "caffe2/operators/reduction_ops.h"
namespace caffe2 {
// CPU registrations: each macro associates an operator name with the class
// template instantiation that implements it for CPUContext.

// Whole-tensor sum reductions, plus the backward operator of SumElements.
REGISTER_CPU_OPERATOR(SumElements, SumElementsOp<float, CPUContext>);
REGISTER_CPU_OPERATOR(
    SumElementsGradient,
    SumElementsGradientOp<float, CPUContext>);
REGISTER_CPU_OPERATOR(SumElementsInt, SumElementsIntOp<int, CPUContext>);
REGISTER_CPU_OPERATOR(SumSqrElements, SumSqrElementsOp<CPUContext>);

// Max reductions and their backward operators. The trailing bool template
// argument is true for the Rowwise* names and false for the Colwise* names.
REGISTER_CPU_OPERATOR(RowwiseMax, MaxReductionOp<float, CPUContext, true>);
REGISTER_CPU_OPERATOR(
    RowwiseMaxGradient,
    MaxReductionGradientOp<float, CPUContext, true>);
REGISTER_CPU_OPERATOR(ColwiseMax, MaxReductionOp<float, CPUContext, false>);
REGISTER_CPU_OPERATOR(
    ColwiseMaxGradient,
    MaxReductionGradientOp<float, CPUContext, false>);
// Schema for SumElements: exactly one input and one output, with the scalar
// type declared as FLOAT. The optional "average" argument switches the result
// from the sum to the mean of the elements.
OPERATOR_SCHEMA(SumElements)
    .NumInputs(1)
    .NumOutputs(1)
    .ScalarType(TensorProto::FLOAT)
    .SetDoc(R"DOC(
Sums the elements of the input tensor. Tensor type must be float32.
Github Links:
- https://github.com/pytorch/pytorch/blob/master/caffe2/operators/reduction_ops.cc
<details>
<summary> <b>Example</b> </summary>
**Code**
```
workspace.ResetWorkspace()
sum_op = core.CreateOperator(
"SumElements",
["X"],
["Y"]
)
avg_op = core.CreateOperator(
"SumElements",
["X"],
["Y"],
average=True
)
workspace.FeedBlob("X", np.random.randint(10, size=(3,3)).astype(np.float32))
print("X:\n", workspace.FetchBlob("X"))
workspace.RunOperatorOnce(sum_op)
print("Y (sum_op):", workspace.FetchBlob("Y"))
workspace.RunOperatorOnce(avg_op)
print("Y (avg_op):", workspace.FetchBlob("Y"))
```
**Result**
```
X:
[[7. 2. 5.]
[9. 4. 2.]
[1. 2. 5.]]
Y (sum_op): 37.0
Y (avg_op): 4.111111
```
</details>
)DOC")
    .Arg(
        "average",
        "(*bool*): set to True to compute the average of the elements rather than the sum")
    // The input is the tensor being reduced; the previous description
    // referred to a counter blob, which this operator does not use.
    .Input(
        0,
        "X",
        "(*Tensor`<float>`*): input tensor whose elements are summed (or averaged)")
    .Output(
        0,
        "sum",
        "(*Tensor`<float>`*): Scalar tensor containing the sum (or average)");
// Schema for SumElementsInt: one input, one output, scalar type INT32.
OPERATOR_SCHEMA(SumElementsInt)
    .SetDoc("Sums the integer elements of the input tensor.")
    .NumInputs(1)
    .NumOutputs(1)
    .ScalarType(TensorProto::INT32)
    .Input(0, "X", "Tensor to sum up")
    .Output(0, "sum", "Scalar sum");

// SumElementsInt is marked as an operator that should not have a gradient.
SHOULD_NOT_DO_GRADIENT(SumElementsInt);
// Schema for SumSqrElements: one input, one output, scalar type FLOAT, with
// an optional "average" argument.
OPERATOR_SCHEMA(SumSqrElements)
    .NumInputs(1)
    .NumOutputs(1)
    .ScalarType(TensorProto::FLOAT)
    .SetDoc("Sums the squares of the elements of the input tensor.")
    .Arg("average", "whether to average or not")
    .Input(0, "X", "Tensor to sum up")
    .Output(0, "sum", "Scalar sum of squares");
// Schema of the SumElementsGradient operator: two inputs, one output.
OPERATOR_SCHEMA(SumElementsGradient).NumOutputs(1).NumInputs(2);

// Gradient maker for SumElements. It emits a single SumElementsGradient
// operator that consumes I(0) and GO(0) and produces GI(0).
class GetSumElementsGradient : public GradientMakerBase {
  using GradientMakerBase::GradientMakerBase;

  vector<OperatorDef> GetGradientDefs() override {
    const vector<string> grad_op_inputs{I(0), GO(0)};
    const vector<string> grad_op_outputs{GI(0)};
    return SingleGradientDef(
        "SumElementsGradient", "", grad_op_inputs, grad_op_outputs);
  }
};

REGISTER_GRADIENT(SumElements, GetSumElementsGradient);
OPERATOR_SCHEMA(RowwiseMax)
.NumInputs(1)
.NumOutputs(1)
.SetDoc(R"DOC(
Compute row-wise max reduction of the input tensor. This op takes one input, $X$, of shape $BxMxN$, where $B$ is the batch size, $M$ is number of rows, and $N$ is number of columns. The output of this op, $Y$, is a matrix of shape $BxM$, with one row for each element of the batch, and the same number of columns as the number of rows of the input tensor.
Github Links:
- https://github.com/pytorch/pytorch/blob/master/caffe2/operators/reduction_ops.h
- https://github.com/pytorch/pytorch/blob/master/caffe2/operators/reduction_ops.cc
<details>
<summary> <b>Example</b> </summary>
**Code**
```
workspace.ResetWorkspace()
op = core.CreateOperator(
"RowwiseMax",
["X"],
["Y"]
)
// Create X, simulating a batch of 2, 4x4 matricies
X = np.random.randint(0,high=20,size=(2,4,4))
print("X:\n",X)
// Feed X into workspace
workspace.FeedBlob("X", X.astype(np.float32))
// Run op
workspace.RunOperatorOnce(op)
// Collect Output
print("Y:\n", workspace.FetchBlob("Y"))
```
**Result**
```
X:
[[[ 5 12 10 1]
[ 4 16 2 15]
[ 5 11 12 15]
[15 4 17 19]]
[[16 5 5 13]
[17 2 1 17]
[18 3 19 5]
[14 16 10 16]]]
Y:
[[12. 16. 15. 19.]
[16. 17. 19. 16.]]
```
</details>
)DOC")
.Input(
0,
"X",
"A tensor of dimensions $B x M x N$ to compute rowwise-max. Here, $B$ is batch size, and $M$ and $N$ are the number of rows and columns of each element of the batch, respectively.")
.Output(
0,
"Y",
"The output tensor of shape $B x M$, where each row represents the row-wise maximums for that element of the input batch.");
// Schema of the RowwiseMaxGradient operator: three inputs, one output.
OPERATOR_SCHEMA(RowwiseMaxGradient).NumOutputs(1).NumInputs(3);

// Gradient maker for RowwiseMax. It emits a single RowwiseMaxGradient
// operator that consumes I(0), O(0) and GO(0) and produces GI(0).
class GetRowwiseMaxGradient : public GradientMakerBase {
  using GradientMakerBase::GradientMakerBase;

  vector<OperatorDef> GetGradientDefs() override {
    const vector<string> grad_op_inputs{I(0), O(0), GO(0)};
    const vector<string> grad_op_outputs{GI(0)};
    return SingleGradientDef(
        "RowwiseMaxGradient", "", grad_op_inputs, grad_op_outputs);
  }
};

REGISTER_GRADIENT(RowwiseMax, GetRowwiseMaxGradient);
// Schema of the ColwiseMaxGradient operator. Its kernel reads three inputs
// (X, Y, dY) and writes one output (dX), and its gradient maker wires exactly
// that many blobs, so the arity is declared here just as it is for
// RowwiseMaxGradient.
OPERATOR_SCHEMA(ColwiseMaxGradient).NumInputs(3).NumOutputs(1);
// Schema for ColwiseMax: one input of shape B x M x N, one output of shape
// B x N. The Python example in the doc string uses '#' comments so that it is
// valid Python when copied out of the documentation.
OPERATOR_SCHEMA(ColwiseMax)
    .NumInputs(1)
    .NumOutputs(1)
    .SetDoc(R"DOC(
Compute column-wise max reduction of the input tensor. This op takes one input, $X$, of shape $BxMxN$, where $B$ is the batch size, $M$ is number of rows, and $N$ is number of columns. The output of this op, $Y$, is a matrix of shape $BxN$, with one row for each element of the batch, and the same number of columns as the input tensor.
Github Links:
- https://github.com/pytorch/pytorch/blob/master/caffe2/operators/reduction_ops.h
- https://github.com/pytorch/pytorch/blob/master/caffe2/operators/reduction_ops.cc
<details>
<summary> <b>Example</b> </summary>
**Code**
```
workspace.ResetWorkspace()
op = core.CreateOperator(
"ColwiseMax",
["X"],
["Y"]
)
# Create X, simulating a batch of 2, 4x4 matrices
X = np.random.randint(0,high=20,size=(2,4,4))
print("X:\n",X)
# Feed X into workspace
workspace.FeedBlob("X", X.astype(np.float32))
# Run op
workspace.RunOperatorOnce(op)
# Collect Output
print("Y:\n", workspace.FetchBlob("Y"))
```
**Result**
```
X:
[[[17 15 2 6]
[ 8 12 6 0]
[ 6 9 7 3]
[ 4 13 16 13]]
[[ 0 3 4 12]
[18 1 17 12]
[ 7 17 13 14]
[12 17 2 1]]]
Y:
[[17. 15. 16. 13.]
[18. 17. 17. 14.]]
```
</details>
)DOC")
    // Shape inference: a B x M x N input yields a B x N output of the same
    // data type.
    .TensorInferenceFunction([](const OperatorDef& /*unused*/,
                                const std::vector<TensorShape>& in) {
      // Without a rank-3 input shape the output shape cannot be derived, so
      // report it as unknown instead of indexing past the available dims.
      if (in.empty() || in[0].dims_size() != 3) {
        vector<TensorShape> unknown(1);
        unknown[0].set_unknown_shape(true);
        return unknown;
      }
      return vector<TensorShape>{CreateTensorShape(
          vector<int64_t>{in[0].dims(0), in[0].dims(2)}, in[0].data_type())};
    })
    .Input(
        0,
        "X",
        "A tensor of dimensions $B x M x N$ to compute columnwise-max. Here, $B$ is batch size, and $M$ and $N$ are the number of rows and columns of each element of the batch, respectively.")
    .Output(
        0,
        "Y",
        "The output tensor of shape $B x N$, where each row represents the column-wise maximums for that element of the input batch.");
// Declares a schema named ColumnMaxGradient with three inputs and one output.
// NOTE(review): no operator named ColumnMaxGradient is registered in this
// file; the registered kernel and the gradient maker below both use the name
// ColwiseMaxGradient, so this name looks like a typo. It cannot simply be
// renamed here, because a ColwiseMaxGradient schema is already declared
// earlier in the file -- confirm and clean up both declarations together.
OPERATOR_SCHEMA(ColumnMaxGradient).NumInputs(3).NumOutputs(1);
// Gradient maker for ColwiseMax. It emits a single ColwiseMaxGradient
// operator that consumes I(0), O(0) and GO(0) and produces GI(0).
class GetColwiseMaxGradient : public GradientMakerBase {
  using GradientMakerBase::GradientMakerBase;

  vector<OperatorDef> GetGradientDefs() override {
    const vector<string> grad_op_inputs{I(0), O(0), GO(0)};
    const vector<string> grad_op_outputs{GI(0)};
    return SingleGradientDef(
        "ColwiseMaxGradient", "", grad_op_inputs, grad_op_outputs);
  }
};

REGISTER_GRADIENT(ColwiseMax, GetColwiseMaxGradient);
// Backward pass of SumElements.
//
// Inputs:
//   0: X    - the forward input; only its shape and element count are used.
//   1: dSum - gradient of the reduced output; expected to hold exactly one
//             element (verified with TORCH_DCHECK_EQ only).
// Output:
//   0: dX   - same shape as X, with every element set to the dSum value,
//             multiplied by 1 / X.numel() when average_ is set.
//
// Always returns true.
template <typename T, class Context>
bool SumElementsGradientOp<T, Context>::RunOnDevice() {
  const auto& X = Input(0);
  // Build a CPU tensor from the incoming gradient so that its single value
  // can be read through a host pointer below.
  Tensor sum_grad(Input(1), CPU);
  auto* dX = Output(0, X.sizes(), at::dtype<T>());
  TORCH_DCHECK_EQ(sum_grad.numel(), 1);

  // An empty X leaves nothing to fill, and 1.0 / 0 would be a floating-point
  // division by zero, so the averaging factor is only formed for non-empty X.
  const auto num_elements = X.numel();
  const double scale =
      (average_ && num_elements > 0) ? 1.0 / num_elements : 1.0;

  math::Set<T, Context>(
      dX->numel(),
      static_cast<T>(sum_grad.template data<T>()[0] * scale),
      dX->template mutable_data<T>(),
      &context_);
  return true;
}
// Backward pass shared by RowwiseMax (ROWWISE == true) and ColwiseMax
// (ROWWISE == false).
//
// Inputs:
//   0: X  - forward input, required to be rank 3 (batch x M x N).
//   1: Y  - forward output, indexed here as batch x M when ROWWISE and
//           batch x N otherwise.
//   2: dY - gradient of Y, indexed the same way as Y.
// Output:
//   0: dX - same shape as X. An element receives the matching dY value when
//           the corresponding X element compares equal to the reduced maximum
//           in Y, and zero otherwise. Every element that ties with the
//           maximum receives the gradient.
//
// Always returns true; throws through CAFFE_ENFORCE_EQ if X is not rank 3.
template <typename T, class Context, bool ROWWISE>
bool MaxReductionGradientOp<T, Context, ROWWISE>::RunOnDevice() {
  const auto& X = Input(0);
  const auto& Y = Input(1);
  const auto& dY = Input(2);
  auto* dX = Output(0, X.sizes(), at::dtype<T>());

  CAFFE_ENFORCE_EQ(X.dim(), 3);
  const int batch_size = X.dim32(0);
  const int M = X.dim32(1);
  const int N = X.dim32(2);

  const T* Xdata = X.template data<T>();
  const T* Ydata = Y.template data<T>();
  const T* dYdata = dY.template data<T>();
  T* dXdata = dX->template mutable_data<T>();

  // Offsets are computed in 64 bits: M * N and batch * M * N can exceed the
  // range of int even though each individual dimension fits in 32 bits.
  const int64_t input_size = static_cast<int64_t>(M) * N;
  const int64_t reduced_size = ROWWISE ? M : N;

  for (int i = 0; i < batch_size; ++i) {
    const T* Xdata_i = Xdata + i * input_size;
    T* dXdata_i = dXdata + i * input_size;
    const T* Ydata_i = Ydata + i * reduced_size;
    const T* dYdata_i = dYdata + i * reduced_size;

    // Rows are walked in the outer loop for both variants so that X and dX
    // are traversed contiguously; each dX element is written independently,
    // so the visiting order does not affect the result.
    for (int m = 0; m < M; ++m) {
      const T* Xdata_m = Xdata_i + static_cast<int64_t>(m) * N;
      T* dXdata_m = dXdata_i + static_cast<int64_t>(m) * N;
      for (int n = 0; n < N; ++n) {
        // Index of the reduced value this element contributed to.
        const int reduced_idx = ROWWISE ? m : n;
        dXdata_m[n] = (Xdata_m[n] == Ydata_i[reduced_idx])
            ? dYdata_i[reduced_idx]
            : static_cast<T>(0);
      }
    }
  }
  return true;
}
} // namespace caffe2