blob: ffcb0c8f6bbcf140b85340d1970a7dfaf5884980 [file] [log] [blame]
#include "caffe2/operators/pool_op_util.h"
#include "caffe2/utils/eigen_utils.h"
namespace caffe2 {
namespace pool_op_util {
namespace {
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
// Vectorizes 4x4p0s0 average pooling for ARM NEON
void AvgPoolNeon4x4p0s0Plane(
int inputH,
int inputW,
const float* input,
float* output) {
constexpr int kKernelHeight = 4;
constexpr int kKernelWidth = 4;
constexpr float kDiv = (1.0f / ((float)kKernelHeight * (float)kKernelWidth));
// Handle portion that can be unrolled by 4
constexpr int kUnroll = 4;
constexpr int kLoadSizeFloat = (sizeof(float32x4_t) / sizeof(float));
constexpr int kLoadCols = kUnroll * kLoadSizeFloat;
if (inputW % kLoadCols == 0) {
//
// Manually unroll by 4 (kUnroll)
//
for (int h = 0; h < inputH; h += kKernelHeight) {
float* outputRow = output + (h / kKernelHeight) * (inputW / kKernelWidth);
const float* curInput = input + h * inputW;
for (int w = 0; w < inputW; w += kLoadCols) {
float32x4_t out = {};
{
float32x4_t v0_0 = vld1q_f32_aligned(curInput + 0 * inputW);
float32x4_t v0_1 = vld1q_f32_aligned(curInput + 1 * inputW);
float32x4_t v0_2 = vld1q_f32_aligned(curInput + 2 * inputW);
float32x4_t v0_3 = vld1q_f32_aligned(curInput + 3 * inputW);
float v0 = horizontal_sum_f32(v0_0, v0_1, v0_2, v0_3);
out = vsetq_lane_f32(v0, out, 0);
}
curInput += kLoadSizeFloat;
{
float32x4_t v0_0 = vld1q_f32_aligned(curInput + 0 * inputW);
float32x4_t v0_1 = vld1q_f32_aligned(curInput + 1 * inputW);
float32x4_t v0_2 = vld1q_f32_aligned(curInput + 2 * inputW);
float32x4_t v0_3 = vld1q_f32_aligned(curInput + 3 * inputW);
float v0 = horizontal_sum_f32(v0_0, v0_1, v0_2, v0_3);
out = vsetq_lane_f32(v0, out, 1);
}
curInput += kLoadSizeFloat;
{
float32x4_t v0_0 = vld1q_f32_aligned(curInput + 0 * inputW);
float32x4_t v0_1 = vld1q_f32_aligned(curInput + 1 * inputW);
float32x4_t v0_2 = vld1q_f32_aligned(curInput + 2 * inputW);
float32x4_t v0_3 = vld1q_f32_aligned(curInput + 3 * inputW);
float v0 = horizontal_sum_f32(v0_0, v0_1, v0_2, v0_3);
out = vsetq_lane_f32(v0, out, 2);
}
curInput += kLoadSizeFloat;
{
float32x4_t v0_0 = vld1q_f32_aligned(curInput + 0 * inputW);
float32x4_t v0_1 = vld1q_f32_aligned(curInput + 1 * inputW);
float32x4_t v0_2 = vld1q_f32_aligned(curInput + 2 * inputW);
float32x4_t v0_3 = vld1q_f32_aligned(curInput + 3 * inputW);
float v0 = horizontal_sum_f32(v0_0, v0_1, v0_2, v0_3);
out = vsetq_lane_f32(v0, out, 3);
}
curInput += kLoadSizeFloat;
out = vmulq_f32(out, vdupq_n_f32(kDiv));
vst1q_f32_aligned(&outputRow[w / kKernelWidth], out);
}
}
} else {
//
// Not unrolled
//
for (int h = 0; h < inputH; h += kKernelHeight) {
const float* inputRow = input + h * inputW;
float* outputRow = output + (h / kKernelHeight) * (inputW / kKernelWidth);
for (int w = 0; w < inputW; w += kKernelWidth) {
const float* curInput = inputRow + w;
float32x4_t v0_0 = vld1q_f32_aligned(curInput + 0 * inputW);
float32x4_t v0_1 = vld1q_f32_aligned(curInput + 1 * inputW);
float32x4_t v0_2 = vld1q_f32_aligned(curInput + 2 * inputW);
float32x4_t v0_3 = vld1q_f32_aligned(curInput + 3 * inputW);
float v0 = horizontal_sum_f32(v0_0, v0_1, v0_2, v0_3) * kDiv;
outputRow[w / kKernelWidth] = v0;
}
}
}
}
// Vectorizes 2x2p0s0 average pooling for ARM NEON
void MaxPoolNeon2x2p0s0Plane(
int inputH,
int inputW,
const float* input,
float* output) {
constexpr int kKernelHeight = 2;
constexpr int kKernelWidth = 2;
// Handle portion that can be unrolled by 4
constexpr int kUnroll = 4;
constexpr int kLoadSizeFloat = (sizeof(float32x4_t) / sizeof(float));
constexpr int kLoadCols = kUnroll * kLoadSizeFloat;
if (inputW % kLoadCols == 0) {
for (int h = 0; h < inputH; h += kKernelHeight) {
float* outputRow = output + (h / kKernelHeight) * (inputW / kKernelWidth);
const float* curInput = input + h * inputW;
for (int w = 0; w < inputW; w += kLoadCols) {
float32x2_t hmax_0, hmax_1, hmax_2, hmax_3;
{
float32x4_t v0_0 = vld1q_f32_aligned(curInput + 0 * inputW);
float32x4_t v0_1 = vld1q_f32_aligned(curInput + 1 * inputW);
float32x4_t vmax = vmaxq_f32(v0_0, v0_1);
hmax_0 = vpmax_f32(vget_low_f32(vmax), vget_high_f32(vmax));
}
curInput += kLoadSizeFloat;
{
float32x4_t v0_0 = vld1q_f32_aligned(curInput + 0 * inputW);
float32x4_t v0_1 = vld1q_f32_aligned(curInput + 1 * inputW);
float32x4_t vmax = vmaxq_f32(v0_0, v0_1);
hmax_1 = vpmax_f32(vget_low_f32(vmax), vget_high_f32(vmax));
}
curInput += kLoadSizeFloat;
{
float32x4_t v0_0 = vld1q_f32_aligned(curInput + 0 * inputW);
float32x4_t v0_1 = vld1q_f32_aligned(curInput + 1 * inputW);
float32x4_t vmax = vmaxq_f32(v0_0, v0_1);
hmax_2 = vpmax_f32(vget_low_f32(vmax), vget_high_f32(vmax));
}
curInput += kLoadSizeFloat;
{
float32x4_t v0_0 = vld1q_f32_aligned(curInput + 0 * inputW);
float32x4_t v0_1 = vld1q_f32_aligned(curInput + 1 * inputW);
float32x4_t vmax = vmaxq_f32(v0_0, v0_1);
hmax_3 = vpmax_f32(vget_low_f32(vmax), vget_high_f32(vmax));
}
curInput += kLoadSizeFloat;
float32x4_t out_0 = vcombine_f32(hmax_0, hmax_1);
float32x4_t out_1 = vcombine_f32(hmax_2, hmax_3);
vst1q_f32_aligned(&outputRow[w / kKernelWidth + 0], out_0);
vst1q_f32_aligned(&outputRow[w / kKernelWidth + 4], out_1);
}
}
} else {
// Not unrolled
for (int h = 0; h < inputH; h += kKernelHeight) {
const float* inputRow = input + h * inputW;
float* outputRow = output + (h / kKernelHeight) * (inputW / kKernelWidth);
for (int w = 0; w < inputW; w += kKernelWidth * 2) {
const float* curInput = inputRow + w;
float32x4_t v0_0 = vld1q_f32_aligned(curInput + 0 * inputW);
float32x4_t v0_1 = vld1q_f32_aligned(curInput + 1 * inputW);
float32x4_t vmax = vmaxq_f32(v0_0, v0_1);
float32x2_t hmax = vpmax_f32(vget_low_f32(vmax), vget_high_f32(vmax));
vst1_f32(&outputRow[w / kKernelWidth], hmax);
}
}
}
}
#endif
} // namespace
bool IsNeon4x4p0s0Eligible(
const int input_h,
const int input_w,
const int output_h,
const int output_w,
const int kh,
const int kw,
const int stride_h,
const int stride_w,
const int pad_t,
const int pad_l,
const int pad_b,
const int pad_r,
const int dilation_h,
const int dilation_w,
const float* X,
float* Y) {
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
// Use this kernel only if:
// 1. Kernel size is 4x4
// 2. Stride is 4x4
// 3. Padding is 0
// 4. Dilation is 1
// 5. Output width and height are even divisors of input width
// 6. Input width and height are divisible by 4 (should be implied by all of
// the above, but just check again)
// Input and output pointers are aligned by float32x4_t
const bool kernel_ok = (kh == 4) && (kw == 4);
const bool stride_ok = (stride_h == 4) && (stride_w == 4);
const bool pad_ok =
(pad_t == 0) && (pad_l == 0) && (pad_b == 0) && (pad_r == 0);
const bool dilation_ok = (dilation_h == 1) && (dilation_w == 1);
const bool output_ok = (input_h % output_h == 0) && (input_w % output_w == 0);
const bool input_ok = (input_w % 4 == 0) && (input_h % 4 == 0);
const bool align_ok = isPointerAligned(X, sizeof(float32x4_t)) &&
isPointerAligned(Y, sizeof(float32x4_t));
return kernel_ok && stride_ok && pad_ok && dilation_ok && output_ok &&
input_ok && align_ok;
#else
(void)input_h;
(void)input_w;
(void)output_h;
(void)output_w;
(void)kh;
(void)kw;
(void)stride_h;
(void)stride_w;
(void)pad_t;
(void)pad_l;
(void)pad_b;
(void)pad_r;
(void)dilation_h;
(void)dilation_w;
(void)X;
(void)Y;
return false;
#endif
}
bool IsNeon2x2p0s0Eligible(
const int input_h,
const int input_w,
const int output_h,
const int output_w,
const int kh,
const int kw,
const int stride_h,
const int stride_w,
const int pad_t,
const int pad_l,
const int pad_b,
const int pad_r,
const int dilation_h,
const int dilation_w,
const float* X,
float* Y) {
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
// Use this kernel only if:
// 1. Kernel size is 2x2
// 2. Stride is 2x2
// 3. Padding is 0
// 4. Dilation is 1
// 5. Output width and height are even divisors of input width
// 6. Input width and height are divisible by 4 (should be implied b all of
// the above, but just check again)
// Input and output pointers are aligned by float32x4_t
const bool kernel_ok = (kh == 2) && (kw == 2);
const bool stride_ok = (stride_h == 2) && (stride_w == 2);
const bool pad_ok =
(pad_t == 0) && (pad_l == 0) && (pad_b == 0) && (pad_r == 0);
const bool dilation_ok = (dilation_h == 1) && (dilation_w == 1);
const bool output_ok = (input_h % output_h == 0) && (input_w % output_w == 0);
const bool input_ok = (input_w % 4 == 0) && (input_h % 4 == 0);
const bool align_ok = isPointerAligned(X, sizeof(float32x4_t)) &&
isPointerAligned(Y, sizeof(float32x4_t));
return kernel_ok && stride_ok && pad_ok && dilation_ok && output_ok &&
input_ok && align_ok;
#else
(void)input_h;
(void)input_w;
(void)output_h;
(void)output_w;
(void)kh;
(void)kw;
(void)stride_h;
(void)stride_w;
(void)pad_t;
(void)pad_l;
(void)pad_b;
(void)pad_r;
(void)dilation_h;
(void)dilation_w;
(void)X;
(void)Y;
return false;
#endif
}
void RunNeonAveragePool4x4p0s0NCHW(
int N,
int C,
int H,
int W,
const float* X,
float* Y) {
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
const int X_stride = H * W;
const int Y_stride = (H / 4) * (W / 4);
const float* X_ptr = X;
float* Y_ptr = Y;
for (int i = 0; i < N; ++i) {
for (int j = 0; j < C; ++j) {
AvgPoolNeon4x4p0s0Plane(H, W, X_ptr, Y_ptr);
X_ptr += X_stride;
Y_ptr += Y_stride;
}
}
#else
(void)N;
(void)C;
(void)H;
(void)W;
(void)X;
(void)Y;
#endif
}
void RunNeonMaxPool2x2p0s0NCHW(
int N,
int C,
int H,
int W,
const float* X,
float* Y) {
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
const int X_stride = H * W;
const int Y_stride = (H / 2) * (W / 2);
const float* X_ptr = X;
float* Y_ptr = Y;
for (int i = 0; i < N; ++i) {
for (int j = 0; j < C; ++j) {
MaxPoolNeon2x2p0s0Plane(H, W, X_ptr, Y_ptr);
X_ptr += X_stride;
Y_ptr += Y_stride;
}
}
#else
(void)N;
(void)C;
(void)H;
(void)W;
(void)X;
(void)Y;
#endif
}
} // namespace pool_op_util
} // namespace caffe2