#pragma once

#include <fbgemm/Fbgemm.h>

#include "caffe2/operators/conv_op.h"
#include "caffe2/operators/conv_pool_op_base.h"
#include "caffe2/quantization/server/caffe2_dnnlowp_utils.h"
#include "caffe2/quantization/server/conv_pool_dnnlowp_op_base.h"
#include "caffe2/quantization/server/dnnlowp.h"
#include "caffe2/quantization/server/op_wrapper.h"

namespace caffe2 {

using ConvFp32Op = ConvOp<float, CPUContext>;

// Convolutional layer computed in integer arithmetic with quantization.
template <typename T, bool ReluFused = false>
class ConvDNNLowPOp : public ConvPoolDNNLowPOpBase<T, ConvFp32Op> {
 public:
  USE_CONV_POOL_BASE_FUNCTIONS(CPUContext);
  USE_CONV_POOL_DNNLOWP_OPERATOR_BASE_FUNCTIONS(T, ConvFp32Op);

  ConvDNNLowPOp(const OperatorDef& operator_def, Workspace* ws);
  virtual ~ConvDNNLowPOp();

 protected:
  bool RunOnDeviceWithOrderNCHW() override;
  bool RunOnDeviceWithOrderNHWC() override;

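  // Chooses quantization parameters for the input and output and quantizes
  // the weights and bias if that has not been done yet (or if the input
  // quantization parameters have changed).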
  bool GetQuantizationParameters_();

  /**
   * @return true if the convolution is effectively a single GEMM, i.e., a
   *         point-wise (e.g., 1x1) convolution with unit stride, no dilation,
   *         and no padding
   */
  bool IsConvGEMM_() const;
  bool NoIm2ColNHWC_();
  int KernelDim_();

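  // im2col for NHWC layout; fills col_buffer and returns a pointer to the
  // data used as the activation matrix of the GEMM.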
  const T* Im2ColNHWC_(Tensor* col_buffer);

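  // Per-group accessors: with group-wise quantization each group has its own
  // filter quantization and requantization parameters; otherwise a single
  // entry is shared by all groups.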
  dnnlowp::TensorQuantizationParams& FilterQuantizationParams(int group_id);
  dnnlowp::RequantizationParams& RequantizationParams(int group_id);

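  // Statically partitions a grouped NHWC convolution among nthreads threads:
  // thread thread_id is assigned groups [*group_begin, *group_end) and, within
  // each group, rows [*i_begin, *i_end) of the m output rows.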
  static void PartitionGroupedNHWCConv_(
      int* group_begin,
      int* group_end,
      int* i_begin,
      int* i_end,
      int num_groups,
      int m,
      int nthreads,
      int thread_id);

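  // Whether intermediate results are accumulated in 16 bits. This base
  // implementation accumulates in 32 bits; the acc16 subclass overrides this
  // to return true.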
  virtual bool Acc16() const {
    return false;
  }

  Tensor col_buffer_{CPU};
  Tensor img_shape_device_{CPU};
  Tensor col_buffer_shape_device_{CPU};

  // Input: X, W, b
  // Output: Y
  INPUT_TAGS(INPUT, FILTER, BIAS);

  // x86 only provides SIMD instructions that multiply a signed integer with an
  // unsigned integer. We use signed for weights.
  using T_signed = typename std::make_signed<T>::type;

  // used in slow path for T != uint8_t
  std::vector<T_signed> W_quantized_;

  // pre-computed biases and offsets
  std::shared_ptr<std::vector<std::int32_t>> column_offsets_;
  std::vector<std::int32_t> row_offsets_;
  const std::int32_t* b_quantized_data_{nullptr};

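  // Scratch buffer used by fbgemm when packing the activation (im2col) matrix.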
  std::vector<std::uint8_t> X_pack_buf_;

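  // Requantization epilogues: fold in the bias and the row/column offsets,
  // scale the 32-bit accumulators down to the quantized output type, and
  // optionally fuse ReLU.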
  void RunOnDeviceEpilogueNCHW_(
      const T* col_buffer_data,
      std::int32_t* Y_int32,
      T* Y_data,
      std::size_t i_offset,
      int group_id);
  void RunOnDeviceEpilogueNHWC_(
      const T* col_buffer_data,
      std::int32_t* Y_int32);

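  // 32-bit accumulation buffer plus per-group filter quantization state
  // (zero points and requantization multipliers) cached in the layout fbgemm
  // expects.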
  std::vector<std::int32_t> Y_int32_;
  std::vector<dnnlowp::TensorQuantizationParams> filter_qparams_;
  std::vector<std::int32_t> filter_zero_points_;
  std::vector<float> requantization_multipliers_;
  bool quantize_groupwise_;

 private:
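  // One-time preprocessing of the weights: quantize (and pack) the filter,
  // precompute the per-column offsets, and quantize the bias with the input
  // scale.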
  void QuantizeWeight_();
  void PreComputeRowColumnOffsets_();
  void QuantizeBias_();

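  // Return true when the corresponding specialized fbgemm kernel (3x3
  // depthwise, 3x3x3 depthwise, or small grouped conv) can be used for the
  // current shapes and quantization settings.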
  bool TakeDepthWise3x3FastPath_();
  bool TakeDepthWise3x3x3FastPath_();
  bool TakeGConvFastPath_();

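  // Runs the packed GEMM through fbgemmPacked with a requantization output
  // pipeline whose granularity (per-tensor or per-group/channel) is selected
  // by Q_GRAN.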
  template <typename PackAMatrix, fbgemm::QuantizationGranularity Q_GRAN>
  void DispatchFBGEMM_(
      PackAMatrix& packA,
      vector<std::int32_t>* Y_int32,
      uint8_t* Y_uint8_data);

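  // Core of the NHWC path: picks a depthwise/gconv fast path when possible,
  // otherwise falls back to im2col + GEMM, producing 32-bit accumulators in
  // Y_int32 (the fused fast paths write the quantized output directly).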
  void ConvNHWCCore_(const T* col_buffer_data, vector<std::int32_t>* Y_int32);

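  // Build the fbgemm conv_param_t descriptors for the current 2D / 3D
  // convolution shapes.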
  fbgemm::conv_param_t<> GetConvParam_();
  fbgemm::conv_param_t<3> GetConv3DParam_();

  std::vector<dnnlowp::RequantizationParams> requantization_params_;

  // used in fast path for T == uint8_t
  std::shared_ptr<fbgemm::PackBMatrix<std::int8_t>> Wq_packed_;

  // For depthwise conv
  std::shared_ptr<fbgemm::PackedDepthWiseConvMatrix> Wq_depthwise_packed_;

  // For small gconv
  std::shared_ptr<fbgemm::PackWeightMatrixForGConv<std::int8_t>>
      Wq_gconv_packed_;
  std::shared_ptr<
      fbgemm::PackWeightMatrixForGConv<std::int8_t, std::int32_t, 3>>
      Wq_gconv3d_packed_;

  // pre-computed biases and offsets
  std::shared_ptr<std::vector<std::int32_t>> b_quantized_;

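  // Input quantization parameters seen in the previous run; used to detect
  // changes that require requantizing the bias.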
  float in_qparams_scale_old_{0};
  std::int32_t in_qparams_zero_point_old_{0};
}; // class ConvDNNLowPOp
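
// Note: the implementation (.cc) typically registers this operator under the
// DNNLOWP engine, e.g.:
//   REGISTER_CPU_OPERATOR_WITH_ENGINE(Conv, DNNLOWP, ConvDNNLowPOp<uint8_t>);
//   REGISTER_CPU_OPERATOR_WITH_ENGINE(
//       ConvRelu, DNNLOWP, ConvDNNLowPOp<uint8_t, true>);
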
} // namespace caffe2