| #include "caffe2/core/common.h" |
| |
| #ifdef CAFFE2_USE_MKL |
| #include <mkl.h> |
| #endif |
| |
| #include "caffe2/core/context.h" |
| #include "caffe2/core/logging.h" |
| #include "caffe2/core/operator.h" |
| #include "caffe2/operators/conv_pool_op_base.h" |
| #include "caffe2/operators/leaky_relu_op.h" |
| #include "caffe2/utils/cpuid.h" |
| #include "caffe2/utils/math.h" |
| #include "nnpack.h" |
| |
| C10_DEFINE_int( |
| caffe2_nnpack_num_threads, |
| 1, |
| "The number of nnpack pthreadpool threads."); |
| C10_DEFINE_bool( |
| caffe2_nnpack_use_mkl_num_threads, |
| true, |
| "If MKL is built, this sets nnpack to use the same number of threads as " |
| "MKL does. This overrides caffe2_nnpack_num_threads if set."); |
| |
| namespace caffe2 { |
| //////////////////////////////////////////////////////////////////////////////// |
| // Helper Functions |
| //////////////////////////////////////////////////////////////////////////////// |
| |
| namespace { |
| |
| bool has_nnpack() { |
| // nnp_initialize is a noop after the first call so it's safe to invoke it |
| // repeatedly |
| auto nnpack_status = nnp_initialize(); |
| return nnpack_status == nnp_status_success; |
| } |
| |
| nnp_convolution_algorithm get_nnp_convolution_algorithm( |
| const std::string& algo) { |
| if (algo == "AUTO") { |
| return nnp_convolution_algorithm_auto; |
| } |
| if (algo == "WINOGRAD") { |
| return nnp_convolution_algorithm_wt8x8; |
| } |
| if (algo == "FT16") { |
| return nnp_convolution_algorithm_ft16x16; |
| } |
| if (algo == "FT8") { |
| return nnp_convolution_algorithm_ft8x8; |
| } |
| return nnp_convolution_algorithm_auto; |
| } |
| |
| nnp_convolution_transform_strategy get_nnp_convolution_transform_strategy( |
| const std::string& kts) { |
| if (kts == "BLOCK") { |
| return nnp_convolution_transform_strategy_block_based; |
| } |
| if (kts == "TUPLE") { |
| return nnp_convolution_transform_strategy_tuple_based; |
| } |
| return nnp_convolution_transform_strategy_block_based; |
| } |
| |
| //////////////////////////////////////////////////////////////////////////////// |
| // Thread Pool |
| //////////////////////////////////////////////////////////////////////////////// |
| |
| static pthreadpool_t nnpack_threadpool_ = nullptr; |
| |
| pthreadpool_t nnpack_threadpool() { |
| if (nnpack_threadpool_ == nullptr) { |
| enum nnp_status nnpack_status = nnp_initialize(); |
| CAFFE_ENFORCE( |
| nnpack_status == nnp_status_success, "NNPack is not supported here!"); |
| int num_threads = FLAGS_caffe2_nnpack_num_threads; |
| if (FLAGS_caffe2_nnpack_use_mkl_num_threads) { |
| #ifdef CAFFE2_USE_MKL |
| num_threads = mkl_get_max_threads(); |
| #else |
| VLOG(1) << "I am asked to use MKL num of threads for NNPACK but this " |
| "Caffe2 is not built with MKL. Skipping."; |
| #endif |
| } |
| nnpack_threadpool_ = pthreadpool_create(num_threads); |
| } |
| return nnpack_threadpool_; |
| } |
| } |
| |
| //////////////////////////////////////////////////////////////////////////////// |
| // NNPACK Ops |
| //////////////////////////////////////////////////////////////////////////////// |
| |
| class NNPACKConvOp final : public ConvPoolOpBase<CPUContext> { |
| public: |
| NNPACKConvOp(const OperatorDef& operator_def, Workspace* ws) |
| : ConvPoolOpBase<CPUContext>(operator_def, ws), |
| algo_(get_nnp_convolution_algorithm( |
| OperatorBase::GetSingleArgument<std::string>("algo", "AUTO"))), |
| kts_(get_nnp_convolution_transform_strategy( |
| OperatorBase::GetSingleArgument<std::string>("kts", "TUPLE"))) { |
| OPERATOR_NEEDS_FEATURE( |
| this->order_ == StorageOrder::NCHW, |
| "NNPack only supports NCHW order. Please consider adding " |
| "TransposeOp with axes=[0, 3, 1, 2] before NNPack Conv."); |
| OPERATOR_NEEDS_FEATURE( |
| dilation_h() == 1 && dilation_w() == 1, |
| "The NNPack convolution does not support dilation yet."); |
| // NNPACK can be built with avx2 support only and might not be able to run |
| // on a given machine. |
| OPERATOR_NEEDS_FEATURE(has_nnpack(), "NNPack can't run here. No AVX2?"); |
| } |
| |
| bool RunOnDeviceWithOrderNCHW() override { |
| auto& X = Input(0); |
| auto& filter = Input(1); |
| auto& bias = Input(2); |
| auto* Y = Output(0); |
| |
| const int N = X.dim32(0), C = X.dim32(1), H = X.dim32(2), W = X.dim32(3); |
| const int M = filter.dim32(0); |
| |
| CAFFE_ENFORCE(X.dim() == 4, "Input dim should be 4"); |
| CAFFE_ENFORCE(filter.dim(), 4); |
| CAFFE_ENFORCE(C % this->group_ == 0, ""); |
| CAFFE_ENFORCE(M % this->group_ == 0, ""); |
| CAFFE_ENFORCE(filter.dim32(1) == C / this->group_, ""); |
| CAFFE_ENFORCE(filter.dim32(2) == this->kernel_h(), ""); |
| CAFFE_ENFORCE(filter.dim32(3) == this->kernel_w(), ""); |
| CAFFE_ENFORCE(bias.numel() == M, ""); |
| |
| ConvPoolOpBase<CPUContext>::SetOutputSize(X, Y, filter.dim32(0)); |
| const int oH = Y->dim32(2), oW = Y->dim32(3); |
| |
| if (N > 1) { |
| CAFFE_ENFORCE_EQ( |
| this->stride_h(), |
| 1, |
| "NNPack only supports stride = 1 when doing batch feedforward"); |
| CAFFE_ENFORCE_EQ( |
| this->stride_w(), |
| 1, |
| "NNPack only supports stride = 1 when doing batch feedforward"); |
| } |
| std::vector<int> pads( |
| {this->pad_t(), this->pad_b(), this->pad_l(), this->pad_r()}); |
| std::vector<int> stride({this->stride_h(), this->stride_w()}); |
| |
| const size_t input_channels = X.dim32(1); |
| const size_t output_channels = Y->dim32(1); |
| const nnp_size input_size = {.width = static_cast<size_t>(X.dim32(3)), |
| .height = static_cast<size_t>(X.dim32(2))}; |
| // filter is MCHW |
| const nnp_size kernel_size = { |
| .width = static_cast<size_t>(filter.dim32(3)), |
| .height = static_cast<size_t>(filter.dim32(2))}; |
| // pad is tblr |
| const nnp_padding padding = {.top = static_cast<size_t>(pads[0]), |
| .right = static_cast<size_t>(pads[3]), |
| .bottom = static_cast<size_t>(pads[1]), |
| .left = static_cast<size_t>(pads[2])}; |
| |
| const nnp_size output_subsample = { |
| .width = static_cast<size_t>(stride[1]), |
| .height = static_cast<size_t>(stride[0])}; |
| if (N == 1) { |
| VLOG(1) << "Running inference mode"; |
| for (auto g = 0; g < group_; ++g) { |
| const auto status = nnp_convolution_inference( |
| algo_, |
| kts_, |
| C / group_, |
| M / group_, |
| input_size, |
| padding, |
| kernel_size, |
| output_subsample, |
| X.template data<float>() + g * H * W * (C / group_), |
| filter.template data<float>() + filter.numel() / group_ * g, |
| bias.template data<float>() + bias.numel() / group_ * g, |
| Y->template mutable_data<float>() + g * oH * oW * (M / group_), |
| nnpack_threadpool(), |
| nullptr); |
| CAFFE_ENFORCE(nnp_status_success == status, ""); |
| } |
| } else { |
| VLOG(1) << "Running batched mode"; |
| for (auto g = 0; g < group_; ++g) { |
| const auto status = nnp_convolution_output( |
| algo_, |
| N, |
| C / group_, |
| M / group_, |
| input_size, |
| padding, |
| kernel_size, |
| X.template data<float>() + g * H * W * (C / group_), |
| filter.template data<float>() + filter.numel() / group_ * g, |
| bias.template data<float>() + bias.numel() / group_ * g, |
| Y->template mutable_data<float>() + g * oH * oW * (M / group_), |
| nnpack_threadpool(), |
| nullptr); |
| CAFFE_ENFORCE(nnp_status_success == status, ""); |
| } |
| } |
| return true; |
| } |
| |
| private: |
| const nnp_convolution_algorithm algo_; |
| const nnp_convolution_transform_strategy kts_; |
| }; |
| |
| class NNPACKMaxPoolOp final : public ConvPoolOpBase<CPUContext> { |
| public: |
| NNPACKMaxPoolOp(const OperatorDef& operator_def, Workspace* ws) |
| : ConvPoolOpBase<CPUContext>(operator_def, ws) { |
| OPERATOR_NEEDS_FEATURE( |
| this->order_ == StorageOrder::NCHW, |
| "NNPack only supports NCHW order. Please consider add " |
| "TransposeOp with axes=[0, 3, 1, 2] before NNPack Conv."); |
| OPERATOR_NEEDS_FEATURE( |
| this->kernel_h() == 2, "NNPack only supports MaxPool kernel size 2*2!"); |
| OPERATOR_NEEDS_FEATURE( |
| this->kernel_w() == 2, "NNPack only supports MaxPool kernel size 2*2!"); |
| OPERATOR_NEEDS_FEATURE( |
| this->stride_h() == 2, "NNPack only supports MaxPool stride size 2*2!"); |
| OPERATOR_NEEDS_FEATURE( |
| this->stride_w() == 2, "NNPack only supports MaxPool stride size 2*2!"); |
| OPERATOR_NEEDS_FEATURE( |
| this->pad_t() == 0, |
| "NNPack Pooling differs from Caffe2 Pooling when pad > 0!"); |
| OPERATOR_NEEDS_FEATURE( |
| this->pad_l() == 0, |
| "NNPack Pooling differs from Caffe2 Pooling when pad > 0!"); |
| OPERATOR_NEEDS_FEATURE( |
| this->pad_r() == 0, |
| "NNPack Pooling differs from Caffe2 Pooling when pad > 0!"); |
| OPERATOR_NEEDS_FEATURE( |
| this->pad_b() == 0, |
| "NNPack Pooling differs from Caffe2 Pooling when pad > 0!"); |
| // NNPACK can be built with avx2 support only and might not be able to run |
| // on a given machine. |
| OPERATOR_NEEDS_FEATURE(has_nnpack(), "NNPack can't run here. No AVX2?"); |
| } |
| |
| bool RunOnDeviceWithOrderNCHW() override { |
| auto& X = Input(0); |
| auto* Y = Output(0); |
| CAFFE_ENFORCE(X.dim() == 4, ""); |
| const int H = X.dim32(2), W = X.dim32(3); |
| ConvPoolOpBase<CPUContext>::SetOutputSize(X, Y, X.dim32(1)); |
| std::vector<int> pads( |
| {this->pad_t(), this->pad_b(), this->pad_l(), this->pad_r()}); |
| std::vector<int> stride({this->stride_h(), this->stride_w()}); |
| std::vector<int> pooling({this->kernel_h(), this->kernel_w()}); |
| |
| // Input X is in NCHW order |
| const size_t batch_size = X.dim32(0); |
| const size_t input_channels = X.dim32(1); |
| const nnp_size input_size = {.width = static_cast<size_t>(X.dim32(3)), |
| .height = static_cast<size_t>(X.dim32(2))}; |
| // pooling kernel |
| const nnp_size pooling_size = {.width = static_cast<size_t>(pooling[1]), |
| .height = static_cast<size_t>(pooling[0])}; |
| // pad is tblr |
| const nnp_padding padding = {.top = static_cast<size_t>(pads[0]), |
| .right = static_cast<size_t>(pads[3]), |
| .bottom = static_cast<size_t>(pads[1]), |
| .left = static_cast<size_t>(pads[2])}; |
| |
| const nnp_size pooling_stride = {.width = static_cast<size_t>(stride[1]), |
| .height = static_cast<size_t>(stride[0])}; |
| const auto status = nnp_max_pooling_output( |
| batch_size, |
| input_channels, |
| input_size, |
| padding, |
| pooling_size, |
| pooling_stride, |
| X.template data<float>(), |
| Y->template mutable_data<float>(), |
| nnpack_threadpool()); |
| CAFFE_ENFORCE(nnp_status_success == status, ""); |
| return true; |
| } |
| |
| private: |
| }; |
| |
| class NNPACKReluOp final : public Operator<CPUContext> { |
| public: |
| NNPACKReluOp(const OperatorDef& operator_def, Workspace* ws) |
| : Operator<CPUContext>(operator_def, ws) { |
| // NNPACK can be built with avx2 support only and might not be able to run |
| // on a given machine. |
| OPERATOR_NEEDS_FEATURE(has_nnpack(), "NNPack can't run here. No AVX2?"); |
| } |
| |
| bool RunOnDevice() override { |
| auto& X = Input(0); |
| auto* Y = Output(0); |
| const auto status = nnp_relu_output( |
| 1, |
| X.numel(), |
| X.template data<float>(), |
| Y->template mutable_data<float>(), |
| 0.0, |
| nnpack_threadpool()); |
| CAFFE_ENFORCE(nnp_status_success == status, ""); |
| return true; |
| } |
| |
| private: |
| }; |
| |
| class NNPACKLeakyReluOp final : public LeakyReluOp<float, CPUContext> { |
| public: |
| NNPACKLeakyReluOp(const OperatorDef& operator_def, Workspace* ws) |
| : LeakyReluOp<float, CPUContext>(operator_def, ws) { |
| // NNPACK can be built with avx2 support only and might not be able to run |
| // on a given machine. |
| OPERATOR_NEEDS_FEATURE(has_nnpack(), "NNPack can't run here. No AVX2?"); |
| } |
| |
| bool RunOnDevice() override { |
| auto& X = Input(0); |
| auto* Y = Output(0); |
| const auto status = nnp_relu_output( |
| 1, |
| X.numel(), |
| X.template data<float>(), |
| Y->template mutable_data<float>(), |
| alpha_, |
| nnpack_threadpool()); |
| CAFFE_ENFORCE(nnp_status_success == status, ""); |
| return true; |
| } |
| |
| private: |
| }; |
| |
| REGISTER_CPU_OPERATOR_WITH_ENGINE(Conv, NNPACK, NNPACKConvOp); |
| REGISTER_CPU_OPERATOR_WITH_ENGINE(MaxPool, NNPACK, NNPACKMaxPoolOp); |
| REGISTER_CPU_OPERATOR_WITH_ENGINE(Relu, NNPACK, NNPACKReluOp); |
| REGISTER_CPU_OPERATOR_WITH_ENGINE(LeakyRelu, NNPACK, NNPACKLeakyReluOp); |
| |
| } // namespace caffe2 |