blob: b950b2123a2ad6c8b5869dc152dcccedcf32974f [file] [log] [blame]
#include "caffe2/operators/prelu_op.h"
#include "caffe2/utils/eigen_utils.h"
#include "caffe2/utils/math.h"
#include "caffe2/core/types.h"
#include "caffe2/utils/cpu_neon.h"
namespace caffe2 {
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
namespace {
// Pointwise PReLU with a single shared slope `w`:
//   out[i] = in[i] > 0 ? in[i] : in[i] * w
// NEON-vectorized. `out` may be unaligned; the main loop uses aligned loads
// from `in` after a scalar prologue. Safe for in-place use (out == in) since
// each element is read before it is written.
void runNeonPrelu(float* out, const float* in, int size, float w) {
  float32x4_t vZero = vdupq_n_f32(0.0f);
  float32x4_t vW = vdupq_n_f32(w);
  // Number of floats in one NEON register (4).
  constexpr int kVecSizeInFloat = sizeof(float32x4_t) / sizeof(float);
  // Too small to fill even one vector: plain scalar loop.
  if (size < kVecSizeInFloat) {
    for (int i = 0; i < size; ++i) {
      float v = in[i];
      out[i] = v > 0 ? v : v * w;
    }
    return;
  }
  // We want to load aligned from the input, but assume the output is unaligned
  // NOTE(review): when `in` is already 16-byte aligned this yields
  // prologue == kVecSizeInFloat (4 scalar iterations) rather than 0 —
  // harmless, just slightly more scalar work than strictly needed.
  int prologue = kVecSizeInFloat -
      // remainder in floats
      (((uintptr_t)in) % (sizeof(float32x4_t))) / sizeof(float);
  int i = 0;
  // Prologue loop: scalar elements until `in + i` is 16-byte aligned.
  for (; i < prologue; ++i) {
    float v = in[i];
    out[i] = v > 0 ? v : v * w;
  }
  // The loop is manually unrolled by 6; seems to be the limit for
  // armv7 to avoid register spills
  constexpr int kUnroll = 6;
  constexpr int kFloatsPerLoop = kUnroll * kVecSizeInFloat;
  // Count of elements covered by whole unrolled iterations after the prologue.
  int remainder = size - prologue;
  int vectorizable = prologue + (remainder / kFloatsPerLoop) * kFloatsPerLoop;
  for (; i < vectorizable; i += kFloatsPerLoop) {
    // Aligned loads of 6 vectors (24 floats).
    float32x4_t v0 = vld1q_f32_aligned(in + i + 0);
    float32x4_t v1 = vld1q_f32_aligned(in + i + 4);
    float32x4_t v2 = vld1q_f32_aligned(in + i + 8);
    float32x4_t v3 = vld1q_f32_aligned(in + i + 12);
    float32x4_t v4 = vld1q_f32_aligned(in + i + 16);
    float32x4_t v5 = vld1q_f32_aligned(in + i + 20);
    // Per-lane mask: element > 0.
    uint32x4_t gz0 = vcgtq_f32(v0, vZero);
    uint32x4_t gz1 = vcgtq_f32(v1, vZero);
    uint32x4_t gz2 = vcgtq_f32(v2, vZero);
    uint32x4_t gz3 = vcgtq_f32(v3, vZero);
    uint32x4_t gz4 = vcgtq_f32(v4, vZero);
    uint32x4_t gz5 = vcgtq_f32(v5, vZero);
    // Negative-branch values (v * w), computed unconditionally.
    float32x4_t v0neg = vmulq_f32(v0, vW);
    float32x4_t v1neg = vmulq_f32(v1, vW);
    float32x4_t v2neg = vmulq_f32(v2, vW);
    float32x4_t v3neg = vmulq_f32(v3, vW);
    float32x4_t v4neg = vmulq_f32(v4, vW);
    float32x4_t v5neg = vmulq_f32(v5, vW);
    // v0 > 0 ? v0 : v0 * w  (bitwise select by the comparison mask)
    v0 = vbslq_f32(gz0, v0, v0neg);
    v1 = vbslq_f32(gz1, v1, v1neg);
    v2 = vbslq_f32(gz2, v2, v2neg);
    v3 = vbslq_f32(gz3, v3, v3neg);
    v4 = vbslq_f32(gz4, v4, v4neg);
    v5 = vbslq_f32(gz5, v5, v5neg);
    // Unaligned stores to the output.
    vst1q_f32(out + i + 0, v0);
    vst1q_f32(out + i + 4, v1);
    vst1q_f32(out + i + 8, v2);
    vst1q_f32(out + i + 12, v3);
    vst1q_f32(out + i + 16, v4);
    vst1q_f32(out + i + 20, v5);
  }
  // Epilogue: scalar tail for whatever the unrolled loop did not cover.
  for (; i < size; ++i) {
    float v = in[i];
    out[i] = v > 0 ? v : v * w;
  }
}
} // namespace
#endif // defined(__ARM_NEON__) || defined(__ARM_NEON)
template <>
bool PReluOp<float, CPUContext>::RunOnDevice() {
  // Forward PReLU: Y = X > 0 ? X : slope * X. The slope W is either a single
  // shared scalar (W.numel() == 1) or one value per channel.
  const auto& X = Input(0);
  const auto& W = Input(1);
  auto* Y = Output(0, X.sizes(), at::dtype<float>());
  const auto* Xdata = X.template data<float>();
  const auto* Wdata = W.template data<float>();
  auto* Ydata = Y->template mutable_data<float>();
  // Channel dimension: axis 1 for NCHW, the last axis for NHWC.
  const auto C = order_ == StorageOrder::NCHW ? X.size(1) : X.size(X.dim() - 1);
  const auto C_shared = (W.numel() == 1);
  if (!C_shared) {
    CAFFE_ENFORCE_EQ(C, W.numel());
  }
  if (C_shared) {
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
    // The function is completely pointwise.
    // Use numel() (the zero-arg size() is a deprecated alias), consistent
    // with the rest of this file.
    runNeonPrelu(Ydata, Xdata, X.numel(), Wdata[0]);
#else
    // max(x, 0) + min(x, 0) * w == (x > 0 ? x : x * w), elementwise.
    ConstEigenVectorMap<float> Xvec(Xdata, X.numel());
    EigenVectorMap<float> Yvec(Ydata, Y->numel());
    Yvec = Xvec.cwiseMax(0.f) + Xvec.cwiseMin(0.f) * Wdata[0];
#endif // defined(__ARM_NEON__) || defined(__ARM_NEON)
    return true;
  }
  // Per-channel slope.
  switch (order_) {
    case StorageOrder::NCHW: {
      const auto N = X.size(0);
      // Spatial size per (n, c) slice.
      const auto dim = X.size_from_dim(2);
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
      // Pointwise for each channel
      for (int n = 0; n < N; ++n) {
        for (int c = 0; c < C; ++c) {
          runNeonPrelu(
              Ydata + (n * C + c) * dim,
              Xdata + (n * C + c) * dim,
              dim,
              Wdata[c]);
        }
      }
#else
      int nc = 0;
      for (int n = 0; n < N; ++n) {
        for (int c = 0; c < C; ++c) {
          ConstEigenVectorMap<float> Xvec(Xdata + nc * dim, dim);
          EigenVectorMap<float>(Ydata + nc * dim, dim) =
              Xvec.cwiseMax(0.f) + Xvec.cwiseMin(0.f) * Wdata[c];
          nc++;
        }
      }
#endif
      break;
    }
    case StorageOrder::NHWC: {
      // Lay out matrix as (NHW, C) and multiply by C
      const auto NHW = X.numel() / C;
      ConstEigenArrayMap<float> Xmat(Xdata, C, NHW);
      ConstEigenVectorArrayMap<float> Wvec(Wdata, C);
      EigenArrayMap<float> Ymat(Ydata, C, NHW);
      // Broadcast the per-channel slope down each column.
      Ymat = (Xmat > 0).select(Xmat, Xmat.colwise() * Wvec);
      break;
    }
    default:
      CAFFE_THROW("Unknown storage order: ", order_);
  }
  return true;
}
template <>
bool PReluGradientOp<float, CPUContext>::RunOnDevice() {
  // Backward PReLU.
  // Inputs: Y (forward output), dY, X (forward input), W (slope).
  // Outputs: dX = dY where X > 0, else W * dY;
  //          dW accumulates dY * X over elements where X <= 0
  //          (summed per channel, or over everything if W is shared).
  auto& Y = Input(0);
  auto& dY = Input(1);
  auto& X = Input(2);
  auto& W = Input(3);
  CAFFE_ENFORCE(&Y != &X, "Cannot backpropagate through an in-place PReLU");
  TORCH_DCHECK_EQ(dY.numel(), Y.numel());
  auto* dX = Output(0, Y.sizes(), at::dtype<float>());
  auto* dW = Output(1, W.sizes(), at::dtype<float>());
  // Channel dimension: axis 1 for NCHW, the last axis for NHWC.
  const auto C = order_ == StorageOrder::NCHW ? X.size(1) : X.size(X.dim() - 1);
  const auto C_shared = (W.numel() == 1);
  // NOTE(review): Ydata/Ymat are not read below; kept because data<float>()
  // also enforces the tensor's dtype.
  const float* Ydata = Y.data<float>();
  const float* dYdata = dY.data<float>();
  const float* Xdata = X.data<float>();
  const float* Wdata = W.data<float>();
  float* dXdata = dX->template mutable_data<float>();
  float* dWdata = dW->template mutable_data<float>();
  switch (order_) {
    case StorageOrder::NCHW: {
      const auto dim = X.size_from_dim(2);
      // With a shared slope, every channel index collapses to 0.
      const auto div_factor = C_shared ? C : 1;
      // Hoist loop-invariant sizes out of the loop conditions.
      const auto W_size = W.numel();
      const auto Y_size = Y.numel();
      for (auto c = 0; c < W_size; ++c) {
        dWdata[c] = 0;
      }
      // Slope gradient: only elements on the negative branch contribute.
      for (int i = 0; i < Y_size; ++i) {
        if (Xdata[i] <= 0) {
          // NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
          int c = (i / dim) % C / div_factor;
          dWdata[c] += dYdata[i] * Xdata[i];
        }
      }
      // Input gradient: pass-through on the positive branch, scaled by the
      // channel's slope on the negative branch.
      for (int i = 0; i < Y_size; ++i) {
        if (Xdata[i] > 0) {
          dXdata[i] = dYdata[i];
        } else {
          // NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
          int c = (i / dim) % C / div_factor;
          dXdata[i] = Wdata[c] * dYdata[i];
        }
      }
      break;
    }
    case StorageOrder::NHWC: {
      // Lay out as (C, NHW) column-major so each column is one pixel.
      const auto NHW = X.numel() / C;
      ConstEigenVectorArrayMap<float> Wvec(Wdata, W.numel());
      EigenVectorArrayMap<float> dWvec(dWdata, dW->numel());
      ConstEigenArrayMap<float> Ymat(Ydata, C, NHW);
      ConstEigenArrayMap<float> dYmat(dYdata, C, NHW);
      ConstEigenArrayMap<float> Xmat(Xdata, C, NHW);
      EigenArrayMap<float> dXmat(dXdata, C, NHW);
      if (C_shared) {
        dXmat = (Xmat > 0).select(dYmat, dYmat * Wdata[0]);
        dWdata[0] =
            (Xmat > 0)
                .select(
                    Xmat.cwiseMin(0.0f), // zero gradients on the 'if' path.
                    dYmat * Xmat)
                .sum();
      } else {
        dXmat = (Xmat > 0).select(dYmat, dYmat.colwise() * Wvec);
        dWvec = (Xmat > 0)
                    .select(
                        Xmat.cwiseMin(0.0f), // zero gradients on the 'if' path.
                        dYmat * Xmat)
                    .rowwise()
                    .sum();
      }
      break;
    }
    default:
      CAFFE_THROW("Unknown storage order: ", order_);
  }
  return true;
}
// Register the CPU implementations of the forward and gradient operators.
REGISTER_CPU_OPERATOR(PRelu, PReluOp<float, CPUContext>);
REGISTER_CPU_GRADIENT_OPERATOR(
    PReluGradient,
    PReluGradientOp<float, CPUContext>);
// Input: X, Slope, output: Y
// Schema for the forward op. The only code change here is fixing the typo
// "Note, is slope is size 1" in the user-visible documentation string.
OPERATOR_SCHEMA(PRelu)
    .NumInputs(2)
    .NumOutputs(1)
    .AllowInplace({{0, 0}})
    .IdenticalTypeAndShapeOfInput(0)
    .SetDoc(R"DOC(
The *PRelu* op takes input data tensor $X$, an input slope tensor $slope$, and produces one output tensor $Y$ of the same shape as $X.$ The op performs the element wise *PRelu* operation, defined as
$$y=prelu(x) =\begin{cases}slope * x & x < 0\\x & otherwise\end{cases}$$
Note: if slope is size 1, the value is shared across the channels, otherwise $X$ and $slope$ must be the same shape. See [Delving Deep into Rectifiers: Surpassing Human-Level Performance on ImageNet Classification](https://arxiv.org/abs/1502.01852) for more information.
Github Links:
- https://github.com/pytorch/pytorch/blob/master/caffe2/operators/prelu_op.h
- https://github.com/pytorch/pytorch/blob/master/caffe2/operators/prelu_op.cc
<details>
<summary> <b>Example</b> </summary>
**Code**
```
workspace.ResetWorkspace()
op = core.CreateOperator(
    "PRelu",
    ["X","Slope"],
    ["Y"],
)
workspace.FeedBlob("X", np.random.randn(3, 3).astype(np.float32))
print("X:\n", workspace.FetchBlob("X"), "\n")
workspace.FeedBlob("Slope", np.array([0.1]).astype(np.float32))
print("Slope:\n", workspace.FetchBlob("Slope"), "\n")
workspace.RunOperatorOnce(op)
print("Y:\n", workspace.FetchBlob("Y"))
```
**Result**
```
X:
 [[ 0.3957382  -0.19725518 -0.26991343]
 [ 1.5513182  -0.27427664 -0.14584002]
 [-0.4121164   0.9292345   0.96426094]]
Slope:
 [0.1]
Y:
 [[ 0.3957382  -0.01972552 -0.02699134]
 [ 1.5513182  -0.02742766 -0.014584  ]
 [-0.04121164  0.9292345   0.96426094]]
```
</details>
)DOC")
    .Input(0, "X", "Input tensor of data to be operated on.")
    .Input(
        1,
        "Slope",
        "1D input slope tensor. If `Slope` is of size 1, the value is shared across different channels")
    .Output(0, "Y", "Output tensor, with same shape as $X$.")
    .InheritOnnxSchema();
// Input: Y, dY, X, Slope; output: dX, dW
// (the previous comment and doc listed only Y/dY and dX; the schema takes
// 4 inputs and produces 2 outputs — see NumInputs/NumOutputs below and the
// gradient maker, which wires up {O(0), GO(0), I(0), I(1)} -> {GI(0), GI(1)}.)
GRADIENT_OPERATOR_SCHEMA(PReluGradient)
    .NumInputs(4)
    .NumOutputs(2)
    .SetDoc(R"DOC(
PReluGradient takes Y, dY, X and the slope tensor, and computes dX and dW
according to the chain rule and the derivative of the PReLU function.
)DOC")
    .IdenticalTypeAndShapeOfMultipleInputs({2, 3});
// Gradient maker: emits a single "<op>Gradient" def wired to the forward op.
class GetPReluGradient : public GradientMakerBase {
  using GradientMakerBase::GradientMakerBase;
  vector<OperatorDef> GetGradientDefs() override {
    // Gradient inputs: forward output Y, its gradient dY, forward input X,
    // and the slope tensor. Gradient outputs: dX and dW.
    const vector<string> grad_inputs{O(0), GO(0), I(0), I(1)};
    const vector<string> grad_outputs{GI(0), GI(1)};
    return SingleGradientDef(
        def_.type() + "Gradient", "", grad_inputs, grad_outputs);
  }
};
REGISTER_GRADIENT(PRelu, GetPReluGradient);
} // namespace caffe2