blob: 23e52a152eabc3327e913a2c5f225bd610436cfc [file] [log] [blame]
#include "caffe2/core/common.h"
#ifdef CAFFE2_USE_MKL
#include <mkl.h>
#endif
#include "caffe2/core/context.h"
#include "caffe2/core/logging.h"
#include "caffe2/core/operator.h"
#include "caffe2/operators/conv_pool_op_base.h"
#include "caffe2/operators/leaky_relu_op.h"
#include "caffe2/utils/cpuid.h"
#include "caffe2/utils/math.h"
#include "nnpack.h"
// Size of the NNPACK pthreadpool (used unless the MKL override below kicks in).
C10_DEFINE_int(
    caffe2_nnpack_num_threads,
    1,
    "The number of nnpack pthreadpool threads.");
// When Caffe2 is built with MKL, match NNPACK's thread count to MKL's so the
// two libraries do not oversubscribe cores; overrides the flag above.
C10_DEFINE_bool(
    caffe2_nnpack_use_mkl_num_threads,
    true,
    "If MKL is built, this sets nnpack to use the same number of threads as "
    "MKL does. This overrides caffe2_nnpack_num_threads if set.");
namespace caffe2 {
////////////////////////////////////////////////////////////////////////////////
// Helper Functions
////////////////////////////////////////////////////////////////////////////////
namespace {

// Returns true when NNPACK is usable on this machine.
// nnp_initialize is a noop after the first call so it's safe to invoke it
// repeatedly.
bool has_nnpack() {
  auto nnpack_status = nnp_initialize();
  return nnpack_status == nnp_status_success;
}

// Maps the "algo" operator argument to an NNPACK convolution algorithm.
// Unrecognized strings fall back to automatic selection.
nnp_convolution_algorithm get_nnp_convolution_algorithm(
    const std::string& algo) {
  if (algo == "AUTO") {
    return nnp_convolution_algorithm_auto;
  }
  if (algo == "WINOGRAD") {
    return nnp_convolution_algorithm_wt8x8;
  }
  if (algo == "FT16") {
    return nnp_convolution_algorithm_ft16x16;
  }
  if (algo == "FT8") {
    return nnp_convolution_algorithm_ft8x8;
  }
  return nnp_convolution_algorithm_auto;
}

// Maps the "kts" operator argument to an NNPACK kernel transform strategy.
// Unrecognized strings fall back to the block-based strategy.
nnp_convolution_transform_strategy get_nnp_convolution_transform_strategy(
    const std::string& kts) {
  if (kts == "BLOCK") {
    return nnp_convolution_transform_strategy_block_based;
  }
  if (kts == "TUPLE") {
    return nnp_convolution_transform_strategy_tuple_based;
  }
  return nnp_convolution_transform_strategy_block_based;
}

////////////////////////////////////////////////////////////////////////////////
// Thread Pool
////////////////////////////////////////////////////////////////////////////////

// Returns the process-wide pthreadpool shared by all NNPACK ops, creating it
// on first use. The previous implementation did an unsynchronized
// check-then-assign on a file-scope pointer, which is a data race (and could
// leak a pool) when several NNPACK ops are constructed or run concurrently;
// a C++11 function-local static guarantees thread-safe one-time init.
pthreadpool_t nnpack_threadpool() {
  static pthreadpool_t pool = []() -> pthreadpool_t {
    enum nnp_status nnpack_status = nnp_initialize();
    CAFFE_ENFORCE(
        nnpack_status == nnp_status_success, "NNPack is not supported here!");
    int num_threads = FLAGS_caffe2_nnpack_num_threads;
    if (FLAGS_caffe2_nnpack_use_mkl_num_threads) {
#ifdef CAFFE2_USE_MKL
      num_threads = mkl_get_max_threads();
#else
      VLOG(1) << "I am asked to use MKL num of threads for NNPACK but this "
                 "Caffe2 is not built with MKL. Skipping.";
#endif
    }
    return pthreadpool_create(num_threads);
  }();
  return pool;
}

} // namespace
////////////////////////////////////////////////////////////////////////////////
// NNPACK Ops
////////////////////////////////////////////////////////////////////////////////
// Convolution backed by NNPACK. NCHW order only, no dilation; the batched
// path (N > 1) additionally requires stride 1. Inputs: X (N, C, H, W),
// filter (M, C/group, kH, kW), bias (M).
class NNPACKConvOp final : public ConvPoolOpBase<CPUContext> {
 public:
  NNPACKConvOp(const OperatorDef& operator_def, Workspace* ws)
      : ConvPoolOpBase<CPUContext>(operator_def, ws),
        algo_(get_nnp_convolution_algorithm(
            OperatorBase::GetSingleArgument<std::string>("algo", "AUTO"))),
        kts_(get_nnp_convolution_transform_strategy(
            OperatorBase::GetSingleArgument<std::string>("kts", "TUPLE"))) {
    OPERATOR_NEEDS_FEATURE(
        this->order_ == StorageOrder::NCHW,
        "NNPack only supports NCHW order. Please consider adding "
        "TransposeOp with axes=[0, 3, 1, 2] before NNPack Conv.");
    OPERATOR_NEEDS_FEATURE(
        dilation_h() == 1 && dilation_w() == 1,
        "The NNPack convolution does not support dilation yet.");
    // NNPACK can be built with avx2 support only and might not be able to run
    // on a given machine.
    OPERATOR_NEEDS_FEATURE(has_nnpack(), "NNPack can't run here. No AVX2?");
  }

  bool RunOnDeviceWithOrderNCHW() override {
    auto& X = Input(0);
    auto& filter = Input(1);
    auto& bias = Input(2);
    auto* Y = Output(0);
    // Validate ranks before reading individual dimensions.
    CAFFE_ENFORCE(X.dim() == 4, "Input dim should be 4");
    // Fix: the original `CAFFE_ENFORCE(filter.dim(), 4)` only asserted that
    // filter.dim() was nonzero -- the literal 4 was parsed as the failure
    // message, not compared. Enforce the intended equality.
    CAFFE_ENFORCE_EQ(filter.dim(), 4, "Filter dim should be 4");
    const int N = X.dim32(0), C = X.dim32(1), H = X.dim32(2), W = X.dim32(3);
    const int M = filter.dim32(0);
    CAFFE_ENFORCE(C % this->group_ == 0, "");
    CAFFE_ENFORCE(M % this->group_ == 0, "");
    CAFFE_ENFORCE(filter.dim32(1) == C / this->group_, "");
    CAFFE_ENFORCE(filter.dim32(2) == this->kernel_h(), "");
    CAFFE_ENFORCE(filter.dim32(3) == this->kernel_w(), "");
    CAFFE_ENFORCE(bias.numel() == M, "");
    ConvPoolOpBase<CPUContext>::SetOutputSize(X, Y, filter.dim32(0));
    const int oH = Y->dim32(2), oW = Y->dim32(3);
    // The batched NNPACK path (nnp_convolution_output) only supports unit
    // stride; the single-image path handles arbitrary stride.
    if (N > 1) {
      CAFFE_ENFORCE_EQ(
          this->stride_h(),
          1,
          "NNPack only supports stride = 1 when doing batch feedforward");
      CAFFE_ENFORCE_EQ(
          this->stride_w(),
          1,
          "NNPack only supports stride = 1 when doing batch feedforward");
    }
    const nnp_size input_size = {
        .width = static_cast<size_t>(W), .height = static_cast<size_t>(H)};
    // filter is MCHW
    const nnp_size kernel_size = {
        .width = static_cast<size_t>(filter.dim32(3)),
        .height = static_cast<size_t>(filter.dim32(2))};
    const nnp_padding padding = {
        .top = static_cast<size_t>(this->pad_t()),
        .right = static_cast<size_t>(this->pad_r()),
        .bottom = static_cast<size_t>(this->pad_b()),
        .left = static_cast<size_t>(this->pad_l())};
    const nnp_size output_subsample = {
        .width = static_cast<size_t>(this->stride_w()),
        .height = static_cast<size_t>(this->stride_h())};
    if (N == 1) {
      VLOG(1) << "Running inference mode";
      // Run each group as an independent convolution, offsetting input,
      // filter, bias, and output pointers by the per-group sizes.
      for (auto g = 0; g < group_; ++g) {
        const auto status = nnp_convolution_inference(
            algo_,
            kts_,
            C / group_,
            M / group_,
            input_size,
            padding,
            kernel_size,
            output_subsample,
            X.template data<float>() + g * H * W * (C / group_),
            filter.template data<float>() + filter.numel() / group_ * g,
            bias.template data<float>() + bias.numel() / group_ * g,
            Y->template mutable_data<float>() + g * oH * oW * (M / group_),
            nnpack_threadpool(),
            nullptr);
        CAFFE_ENFORCE(nnp_status_success == status, "");
      }
    } else {
      VLOG(1) << "Running batched mode";
      for (auto g = 0; g < group_; ++g) {
        const auto status = nnp_convolution_output(
            algo_,
            N,
            C / group_,
            M / group_,
            input_size,
            padding,
            kernel_size,
            X.template data<float>() + g * H * W * (C / group_),
            filter.template data<float>() + filter.numel() / group_ * g,
            bias.template data<float>() + bias.numel() / group_ * g,
            Y->template mutable_data<float>() + g * oH * oW * (M / group_),
            nnpack_threadpool(),
            nullptr);
        CAFFE_ENFORCE(nnp_status_success == status, "");
      }
    }
    return true;
  }

 private:
  const nnp_convolution_algorithm algo_;
  const nnp_convolution_transform_strategy kts_;
};
// Max pooling backed by NNPACK. NNPACK's pooling kernel only matches Caffe2
// semantics for a 2x2 window, stride 2, and zero padding (NCHW order), so
// the constructor rejects everything else.
class NNPACKMaxPoolOp final : public ConvPoolOpBase<CPUContext> {
 public:
  NNPACKMaxPoolOp(const OperatorDef& operator_def, Workspace* ws)
      : ConvPoolOpBase<CPUContext>(operator_def, ws) {
    OPERATOR_NEEDS_FEATURE(
        this->order_ == StorageOrder::NCHW,
        "NNPack only supports NCHW order. Please consider add "
        "TransposeOp with axes=[0, 3, 1, 2] before NNPack Conv.");
    OPERATOR_NEEDS_FEATURE(
        this->kernel_h() == 2, "NNPack only supports MaxPool kernel size 2*2!");
    OPERATOR_NEEDS_FEATURE(
        this->kernel_w() == 2, "NNPack only supports MaxPool kernel size 2*2!");
    OPERATOR_NEEDS_FEATURE(
        this->stride_h() == 2, "NNPack only supports MaxPool stride size 2*2!");
    OPERATOR_NEEDS_FEATURE(
        this->stride_w() == 2, "NNPack only supports MaxPool stride size 2*2!");
    OPERATOR_NEEDS_FEATURE(
        this->pad_t() == 0,
        "NNPack Pooling differs from Caffe2 Pooling when pad > 0!");
    OPERATOR_NEEDS_FEATURE(
        this->pad_l() == 0,
        "NNPack Pooling differs from Caffe2 Pooling when pad > 0!");
    OPERATOR_NEEDS_FEATURE(
        this->pad_r() == 0,
        "NNPack Pooling differs from Caffe2 Pooling when pad > 0!");
    OPERATOR_NEEDS_FEATURE(
        this->pad_b() == 0,
        "NNPack Pooling differs from Caffe2 Pooling when pad > 0!");
    // NNPACK can be built with avx2 support only and might not be able to run
    // on a given machine.
    OPERATOR_NEEDS_FEATURE(has_nnpack(), "NNPack can't run here. No AVX2?");
  }

  bool RunOnDeviceWithOrderNCHW() override {
    const auto& input = Input(0);
    auto* output = Output(0);
    CAFFE_ENFORCE(input.dim() == 4, "");
    ConvPoolOpBase<CPUContext>::SetOutputSize(input, output, input.dim32(1));
    // Input is NCHW: dim 0 = batch, 1 = channels, 2 = height, 3 = width.
    const nnp_size image_size = {
        .width = static_cast<size_t>(input.dim32(3)),
        .height = static_cast<size_t>(input.dim32(2))};
    const nnp_size window_size = {
        .width = static_cast<size_t>(this->kernel_w()),
        .height = static_cast<size_t>(this->kernel_h())};
    const nnp_padding image_padding = {
        .top = static_cast<size_t>(this->pad_t()),
        .right = static_cast<size_t>(this->pad_r()),
        .bottom = static_cast<size_t>(this->pad_b()),
        .left = static_cast<size_t>(this->pad_l())};
    const nnp_size window_stride = {
        .width = static_cast<size_t>(this->stride_w()),
        .height = static_cast<size_t>(this->stride_h())};
    const auto rc = nnp_max_pooling_output(
        static_cast<size_t>(input.dim32(0)),
        static_cast<size_t>(input.dim32(1)),
        image_size,
        image_padding,
        window_size,
        window_stride,
        input.template data<float>(),
        output->template mutable_data<float>(),
        nnpack_threadpool());
    CAFFE_ENFORCE(nnp_status_success == rc, "");
    return true;
  }
};
// Relu backed by NNPACK: computed as a leaky relu with negative slope 0 over
// the flattened input.
class NNPACKReluOp final : public Operator<CPUContext> {
 public:
  NNPACKReluOp(const OperatorDef& operator_def, Workspace* ws)
      : Operator<CPUContext>(operator_def, ws) {
    // NNPACK can be built with avx2 support only and might not be able to run
    // on a given machine.
    OPERATOR_NEEDS_FEATURE(has_nnpack(), "NNPack can't run here. No AVX2?");
  }

  bool RunOnDevice() override {
    const auto& input = Input(0);
    auto* output = Output(0);
    // Treat the whole tensor as a single batch of input.numel() channels.
    const auto rc = nnp_relu_output(
        1,
        input.numel(),
        input.template data<float>(),
        output->template mutable_data<float>(),
        0.0,
        nnpack_threadpool());
    CAFFE_ENFORCE(nnp_status_success == rc, "");
    return true;
  }
};
// LeakyRelu backed by NNPACK, using the alpha_ slope parsed by the base
// LeakyReluOp as the negative slope of nnp_relu_output.
class NNPACKLeakyReluOp final : public LeakyReluOp<float, CPUContext> {
 public:
  NNPACKLeakyReluOp(const OperatorDef& operator_def, Workspace* ws)
      : LeakyReluOp<float, CPUContext>(operator_def, ws) {
    // NNPACK can be built with avx2 support only and might not be able to run
    // on a given machine.
    OPERATOR_NEEDS_FEATURE(has_nnpack(), "NNPack can't run here. No AVX2?");
  }

  bool RunOnDevice() override {
    const auto& input = Input(0);
    auto* output = Output(0);
    // Treat the whole tensor as a single batch of input.numel() channels.
    const auto rc = nnp_relu_output(
        1,
        input.numel(),
        input.template data<float>(),
        output->template mutable_data<float>(),
        alpha_,
        nnpack_threadpool());
    CAFFE_ENFORCE(nnp_status_success == rc, "");
    return true;
  }
};
// Register the NNPACK-backed implementations under the NNPACK engine for the
// corresponding CPU operators.
REGISTER_CPU_OPERATOR_WITH_ENGINE(Conv, NNPACK, NNPACKConvOp);
REGISTER_CPU_OPERATOR_WITH_ENGINE(MaxPool, NNPACK, NNPACKMaxPoolOp);
REGISTER_CPU_OPERATOR_WITH_ENGINE(Relu, NNPACK, NNPACKReluOp);
REGISTER_CPU_OPERATOR_WITH_ENGINE(LeakyRelu, NNPACK, NNPACKLeakyReluOp);
} // namespace caffe2