#include "caffe2_dnnlowp_utils.h"
#include "caffe2/core/tensor_int8.h"
#include "caffe2/quantization/server/sigmoid.h"
#include "caffe2/quantization/server/tanh.h"

#include <map>
#ifdef _OPENMP
#include <omp.h>
#endif

C10_DECLARE_int32(caffe2_dnnlowp_activation_quantization_precision);
C10_DECLARE_int32(caffe2_dnnlowp_weight_quantization_precision);
C10_DECLARE_int32(caffe2_dnnlowp_requantization_multiplier_precision);
C10_DECLARE_int32(caffe2_dnnlowp_eltwise_quantization_precision);
C10_DECLARE_bool(caffe2_dnnlowp_force_scale_power_of_two);
C10_DECLARE_bool(caffe2_dnnlowp_preserve_activation_sparsity);
C10_DECLARE_bool(caffe2_dnnlowp_preserve_weight_sparsity);
C10_DECLARE_string(caffe2_dnnlowp_activation_quantization_kind);
C10_DECLARE_string(caffe2_dnnlowp_weight_quantization_kind);
C10_DECLARE_double(caffe2_dnnlowp_weight_p99_threshold);
C10_DECLARE_double(caffe2_dnnlowp_activation_p99_threshold);

namespace dnnlowp {

using namespace std;
using namespace caffe2;
using int8::Int8TensorCPU;

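// Returns true iff the operator's engine name starts with "DNNLOWP"
// (this matches DNNLOWP itself as well as variants such as DNNLOWP_ACC16).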
static bool HasDNNLowPEngine_(const OperatorDef& op_def) {
  const string ENGINE_PREFIX = "DNNLOWP";
  return strncmp(
             op_def.engine().c_str(),
             ENGINE_PREFIX.c_str(),
             ENGINE_PREFIX.size()) == 0;
}

static bool HasDNNLowPEngine_(const OperatorBase& op) {
  return HasDNNLowPEngine_(op.debug_def());
}

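// Annotates the idx'th output of op with the given quantization parameters by
// writing scale and zero_point into the output Int8TensorCPU, so downstream
// quantized operators can read them back.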
void PropagateOutputTensorQuantizationParams(
    OperatorBase* op,
    int idx,
    const TensorQuantizationParams& qparams) {
  LOG_IF(WARNING, !HasDNNLowPEngine_(*op));
  Int8TensorCPU* output =
      op->Outputs()[idx]->template GetMutable<Int8TensorCPU>();
  output->scale = qparams.scale;
  output->zero_point = qparams.zero_point;
}

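// Returns the quantization parameters of the idx'th input of op. If the input
// is already an Int8TensorCPU, its stored scale/zero_point are returned;
// otherwise the input must be a float tensor, and parameters are chosen from
// its min/max via qfactory.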
TensorQuantizationParams GetInputTensorQuantizationParamsOf(
    OperatorBase* op,
    int idx,
    const QuantizationFactory* qfactory,
    bool is_weight /*=false*/) {
  LOG_IF(WARNING, !HasDNNLowPEngine_(*op));

  if (op->InputIsType<Int8TensorCPU>(idx)) {
    const Int8TensorCPU& int8_tensor = op->Input<Int8TensorCPU>(idx);
    // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init)
    TensorQuantizationParams qparams;
    qparams.scale = int8_tensor.scale;
    qparams.zero_point = int8_tensor.zero_point;
    qparams.precision = qfactory->GetActivationPrecision();
    return qparams;
  } else {
    const TensorCPU* tensor = &op->template Input<Tensor>(idx, CPU);
    CAFFE_ENFORCE(tensor->template IsType<float>());
    CAFFE_ENFORCE(tensor->numel() == 0 || tensor->template data<float>());

    // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
    float min, max;
    fbgemm::FindMinMax(
        tensor->template data<float>(), &min, &max, tensor->numel());
    auto activation_quantization_kind = qfactory->GetActivationKind();
    if (activation_quantization_kind !=
        QuantizationFactory::QuantizationKind::MIN_MAX_QUANTIZATION) {
      LOG(WARNING)
          << "DNNLOWP dynamic int8 FC supports only min_max as the activation "
             "quantization kind. Quantization parameters will be chosen from "
             "min/max regardless of the activation_quantization_kind argument.";
    }
    if (is_weight) {
      auto weight_quantization_kind = qfactory->GetWeightKind();
      if (weight_quantization_kind !=
          QuantizationFactory::QuantizationKind::MIN_MAX_QUANTIZATION) {
        LOG(WARNING)
            << "DNNLOWP dynamic int8 FC weight is not constant; quantization "
               "parameters will be chosen from min/max regardless of the "
               "weight_quantization_kind argument.";
      }
    }
    return qfactory->ChooseQuantizationParams(min, max, is_weight);
  }
}

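// Static quantization parameters for the idx'th output are stored as operator
// arguments named Y_scale/Y_zero_point for output 0, then Y2_scale/
// Y2_zero_point, Y3_scale/Y3_zero_point, ... for outputs 1, 2, ...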
static string OutputArgumentIdxString_(int idx) {
  return idx == 0 ? "" : to_string(idx + 1);
}

static string OutputScaleArgumentName(int idx) {
  return "Y" + OutputArgumentIdxString_(idx) + "_scale";
}

static string OutputZeroPointArgumentName(int idx) {
  return "Y" + OutputArgumentIdxString_(idx) + "_zero_point";
}

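// Records qparams as static quantization arguments (Y*_scale and
// Y*_zero_point) on op_def for the given output index.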
static void SetStaticQuantizationParams_(
    OperatorDef* op_def,
    int output_index,
    const TensorQuantizationParams& qparams) {
  AddArgument<float>(
      OutputScaleArgumentName(output_index), qparams.scale, op_def);
  AddArgument<int32_t>(
      OutputZeroPointArgumentName(output_index), qparams.zero_point, op_def);
}

void SetStaticQuantizationParams(
    OperatorBase* op,
    int output_index,
    const TensorQuantizationParams& qparams) {
  LOG_IF(WARNING, !HasDNNLowPEngine_(*op));
  auto op_def = make_shared<OperatorDef>();
  *op_def = op->debug_def();
  SetStaticQuantizationParams_(op_def.get(), output_index, qparams);
  op->set_debug_def(op_def);
}

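// An output counts as statically quantized when its Y*_scale argument is
// present; a Y*_zero_point argument alone is not sufficient.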
bool HasStaticQuantization(
    const caffe2::OperatorBase* op,
    int output_index /*=0*/) {
  LOG_IF(WARNING, !HasDNNLowPEngine_(*op));
  return op->HasSingleArgumentOfType<float>(
      OutputScaleArgumentName(output_index));
}

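// Reads the statically specified quantization parameters of the idx'th output
// back from the operator arguments. Callers should check
// HasStaticQuantization first; otherwise the scale silently defaults to 0.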
TensorQuantizationParams GetStaticQuantizationParamsOf(
    const caffe2::OperatorBase* op,
    int idx) {
  LOG_IF(WARNING, !HasDNNLowPEngine_(*op));
  unique_ptr<QuantizationFactory> qfactory = GetQuantizationFactoryOf(op);

  // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init)
  TensorQuantizationParams qparams;
  qparams.scale = op->GetSingleArgument<float>(OutputScaleArgumentName(idx), 0);
  qparams.zero_point =
      op->GetSingleArgument<int32_t>(OutputZeroPointArgumentName(idx), 0);
  qparams.precision = qfactory->GetActivationPrecision();

  return qparams;
}

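// Returns a pointer to the quantized data of the input at input_index. If the
// input is already quantized, its existing buffer is returned; otherwise the
// float input is quantized with qparams into temp, which must outlive any use
// of the returned pointer.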
template <typename T>
const T* QuantizeInputIfNeeded(
    OperatorBase* op,
    int input_index,
    const TensorQuantizationParams& qparams,
    vector<T>& temp) {
  if (op->InputIsType<int8::Int8TensorCPU>(input_index)) {
    // Already quantized
    return op->Input<int8::Int8TensorCPU>(input_index).t.data<T>();
  } else {
    // Need to quantize
    const TensorCPU& tensor = op->Input<Tensor>(input_index, CPU);
    temp.resize(tensor.numel());
    fbgemm::Quantize<T>(
        tensor.data<float>(), temp.data(), temp.size(), qparams);
    return temp.data();
  }
}

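// Row-wise variant of QuantizeInputIfNeeded: qparams supplies one set of
// quantization parameters per row, the row count is inferred from
// qparams.size(), and each row is quantized with its own scale/zero_point.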
template <typename T>
const T* RowWiseQuantizeInputIfNeeded(
    OperatorBase* op,
    int input_index,
    const std::vector<TensorQuantizationParams>& qparams,
    vector<T>& temp) {
  if (op->InputIsType<int8::Int8TensorCPU>(input_index)) {
    // Already quantized
    return op->Input<int8::Int8TensorCPU>(input_index).t.data<T>();
  } else {
    // Need to quantize
    const TensorCPU& tensor = op->Input<Tensor>(input_index, CPU);
    temp.resize(tensor.numel());
    // number of rows
    int N = qparams.size();
    int rowwidth = temp.size() / N;
    // quantize each row
    for (int i = 0; i < N; i++) {
      fbgemm::Quantize<T>(
          tensor.data<float>() + rowwidth * i,
          temp.data() + rowwidth * i,
          rowwidth,
          qparams[i]);
    }
    return temp.data();
  }
}

template const uint8_t* QuantizeInputIfNeeded<uint8_t>(
    OperatorBase* op,
    int input_index,
    const TensorQuantizationParams& qparams,
    vector<uint8_t>& temp);

template const int8_t* QuantizeInputIfNeeded<int8_t>(
    OperatorBase* op,
    int input_index,
    const TensorQuantizationParams& qparams,
    vector<int8_t>& temp);

template const uint16_t* QuantizeInputIfNeeded<uint16_t>(
    OperatorBase* op,
    int input_index,
    const TensorQuantizationParams& qparams,
    vector<uint16_t>& temp);

template const int16_t* QuantizeInputIfNeeded<int16_t>(
    OperatorBase* op,
    int input_index,
    const TensorQuantizationParams& qparams,
    vector<int16_t>& temp);

template const uint8_t* RowWiseQuantizeInputIfNeeded<uint8_t>(
    OperatorBase* op,
    int input_index,
    const std::vector<TensorQuantizationParams>& qparams,
    vector<uint8_t>& temp);

template const uint16_t* RowWiseQuantizeInputIfNeeded<uint16_t>(
    OperatorBase* op,
    int input_index,
    const std::vector<TensorQuantizationParams>& qparams,
    vector<uint16_t>& temp);

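// Accumulates error statistics between a quantized implementation (actual)
// and its fp32 reference (ref): the sum of squared reference values, the sum
// of squared errors, and the maximum absolute error together with the pair of
// values that produced it.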
void MeasureQuantizationError(
    const float* actual,
    const float* ref,
    size_t len,
    QuantizationErrorStats* stat) {
  // NOLINTNEXTLINE(clang-diagnostic-sign-compare)
  for (int i = 0; i < len; ++i) {
    stat->sum_sq += ref[i] * ref[i];
    float err = actual[i] - ref[i];
    stat->sum_err_sq += err * err;

    if (fabs(err) > stat->max_abs_err) {
      stat->max_abs_err = fabs(err);
      stat->max_err_actual = actual[i];
      stat->max_err_ref = ref[i];
    }
  }
  ++stat->measure_cnt;
}

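// Logs the accumulated statistics as an L2 relative error,
// sqrt(sum_err_sq / sum_sq), plus the maximum absolute error; when sum_sq is
// 0 the relative error is reported as nan instead of dividing by zero.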
void ReportQuantizationError(
    const OperatorBase* op,
    const QuantizationErrorStats& stat) {
  if (stat.sum_sq == 0) {
    LOG(INFO) << " output " << op->debug_def().output(0) << " of operator "
              << op << " with type " << op->debug_def().type() << " and engine "
              << op->debug_def().engine()
              << " has l2 relative error nan (stat.sum_err_sq "
              << stat.sum_err_sq << " stat.sum_sq 0)"
              << " and max abs error " << stat.max_abs_err << " (reference is "
              << stat.max_err_ref << " and actual is " << stat.max_err_actual
              << ")"
              << " sum_err_sq " << stat.sum_err_sq << " sum_sq " << stat.sum_sq
              << " cnt " << stat.measure_cnt;
  } else {
    LOG(INFO) << " output " << op->debug_def().output(0) << " of operator "
              << op << " with type " << op->debug_def().type() << " and engine "
              << op->debug_def().engine() << " has l2 relative error "
              << std::sqrt(stat.sum_err_sq) / std::sqrt(stat.sum_sq)
              << " and max abs error " << stat.max_abs_err << " (reference is "
              << stat.max_err_ref << " and actual is " << stat.max_err_actual
              << ")"
              << " sum_err_sq " << stat.sum_err_sq << " sum_sq " << stat.sum_sq
              << " cnt " << stat.measure_cnt;
  }
}

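// Builds a QuantizationFactory for op_def. Every knob (precisions, sparsity
// preservation, power-of-two scales, quantization kinds, and p99 thresholds)
// is taken from the per-operator argument when present, falling back to the
// corresponding caffe2_dnnlowp_* command-line flag otherwise.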
static unique_ptr<QuantizationFactory> GetQuantizationFactoryOf_(
    const OperatorDef& op_def) {
  int activation_precision =
      ArgumentHelper::GetSingleArgument<OperatorDef, int>(
          op_def,
          "activation_precision",
          FLAGS_caffe2_dnnlowp_activation_quantization_precision);
  int weight_precision = ArgumentHelper::GetSingleArgument<OperatorDef, int>(
      op_def,
      "weight_precision",
      FLAGS_caffe2_dnnlowp_weight_quantization_precision);
  int requantization_multiplier_precision =
      ArgumentHelper::GetSingleArgument<OperatorDef, int>(
          op_def,
          "requantization_multiplier_precision",
          FLAGS_caffe2_dnnlowp_requantization_multiplier_precision);
  int eltwise_quantization_precision =
      ArgumentHelper::GetSingleArgument<OperatorDef, int>(
          op_def,
          "eltwise_quantization_precision",
          FLAGS_caffe2_dnnlowp_eltwise_quantization_precision);
  bool preserve_activation_sparsity =
      ArgumentHelper::GetSingleArgument<OperatorDef, bool>(
          op_def,
          "preserve_activation_sparsity",
          FLAGS_caffe2_dnnlowp_preserve_activation_sparsity);
  bool preserve_weight_sparsity =
      ArgumentHelper::GetSingleArgument<OperatorDef, bool>(
          op_def,
          "preserve_weight_sparsity",
          FLAGS_caffe2_dnnlowp_preserve_weight_sparsity);
  bool force_scale_power_of_two =
      ArgumentHelper::GetSingleArgument<OperatorDef, bool>(
          op_def,
          "force_scale_power_of_two",
          FLAGS_caffe2_dnnlowp_force_scale_power_of_two);
  string activation_quantization_kind =
      ArgumentHelper::GetSingleArgument<OperatorDef, string>(
          op_def,
          "activation_quantization_kind",
          FLAGS_caffe2_dnnlowp_activation_quantization_kind);
  string weight_quantization_kind =
      ArgumentHelper::GetSingleArgument<OperatorDef, string>(
          op_def,
          "weight_quantization_kind",
          FLAGS_caffe2_dnnlowp_weight_quantization_kind);
  float weight_p99_threshold =
      ArgumentHelper::GetSingleArgument<OperatorDef, float>(
          op_def,
          "weight_p99_threshold",
          FLAGS_caffe2_dnnlowp_weight_p99_threshold);
  float activation_p99_threshold =
      ArgumentHelper::GetSingleArgument<OperatorDef, float>(
          op_def,
          "activation_p99_threshold",
          FLAGS_caffe2_dnnlowp_activation_p99_threshold);
  std::stringstream ss;
  ss << "Quantization method for op with output " << op_def.output(0)
     << " engine " << op_def.engine() << " activation_precision "
     << activation_precision << " weight_precision " << weight_precision
     << " requantization_multiplier_precision "
     << requantization_multiplier_precision
     << " eltwise_quantization_precision " << eltwise_quantization_precision
     << " preserve_activation_sparsity " << preserve_activation_sparsity
     << " preserve_weight_sparsity " << preserve_weight_sparsity
     << " force_scale_power_of_two " << force_scale_power_of_two
     << " activation_quantization_kind " << activation_quantization_kind
     << " weight_quantization_kind " << weight_quantization_kind;
  if (weight_quantization_kind == "p99" || weight_quantization_kind == "P99") {
    ss << " weight p99 threshold " << weight_p99_threshold;
  }
  if (activation_quantization_kind == "p99" ||
      activation_quantization_kind == "P99") {
    ss << " activation p99 threshold " << activation_p99_threshold;
  }
  VLOG(2) << ss.str();

  // NOLINTNEXTLINE(modernize-make-unique)
  return unique_ptr<QuantizationFactory>(new QuantizationFactory(
      activation_precision,
      weight_precision,
      requantization_multiplier_precision,
      eltwise_quantization_precision,
      preserve_activation_sparsity,
      preserve_weight_sparsity,
      force_scale_power_of_two,
      StringToKind(activation_quantization_kind),
      StringToKind(weight_quantization_kind),
      weight_p99_threshold,
      activation_p99_threshold));
}

unique_ptr<QuantizationFactory> GetQuantizationFactoryOf(
    const OperatorBase* op) {
  return GetQuantizationFactoryOf_(op->debug_def());
}

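// Tightens the output quantization parameters of op based on the operator
// consuming its output: Sigmoid and Tanh have fixed input quantization
// parameters, and a following Relu only observes the non-negative part of the
// range, so the output can be quantized more precisely.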
void AdjustOutputTensorQuantizationParamsWithFollowedBy(
    OperatorBase* op,
    const string& followed_by) {
  LOG_IF(WARNING, !HasDNNLowPEngine_(*op));

  auto op_def = make_shared<OperatorDef>();
  *op_def = op->debug_def();
  AddArgument<string>("followed_by", followed_by, op_def.get());
  op->set_debug_def(op_def);

  if (followed_by == "Sigmoid") {
    SetStaticQuantizationParams(
        op, 0, Sigmoid<uint8_t>().GetInputQuantizationParams());
  } else if (followed_by == "Tanh") {
    SetStaticQuantizationParams(
        op, 0, Tanh<uint8_t>().GetInputQuantizationParams());
  } else if (followed_by == "Relu") {
    if (HasStaticQuantization(op)) {
      unique_ptr<QuantizationFactory> qfactory = GetQuantizationFactoryOf(op);
      TensorQuantizationParams qparams = GetStaticQuantizationParamsOf(op, 0);
      qparams = qfactory->ChooseQuantizationParams(0, qparams.Max());
      SetStaticQuantizationParams(op, 0, qparams);
    }
  } else {
    LOG(WARNING) << "Unknown followed_by " << followed_by;
  }
}

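// Parses the DNNLOWP arguments shared by quantized operators
// (dequantize_output, measure_quantization_error, followed_by, and the static
// Y_scale/Y_zero_point pair) and sanity-checks their combinations.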
void ParseDNNLowPOperatorArguments(
    OperatorBase* op,
    bool* dequantize_output,
    bool* measure_quantization_error,
    string* followed_by) {
  // When exiting a quantized region, or when we're doing per-op quantization,
  // dequantize the outputs back to floats.
  if (dequantize_output) {
    *dequantize_output =
        op->GetSingleArgument<bool>("dequantize_output", false);
    if (*dequantize_output) {
      VLOG(2) << "Dequantize output " << op->debug_def().output(0)
              << " of operator type " << op->debug_def().type();
    }
  }

  // Measure quantization error by comparing against reference fp32 operators.
  if (measure_quantization_error) {
    *measure_quantization_error =
        op->GetSingleArgument<bool>("measure_quantization_error", false);
  }

  // The output scale and zero_point can be specified explicitly, typically
  // from activation distributions collected by profiling. This is recommended
  // for performance because it avoids selecting quantization parameters on
  // the fly.
  if (HasStaticQuantization(op)) {
    TensorQuantizationParams qparams = GetStaticQuantizationParamsOf(op, 0);
    unique_ptr<QuantizationFactory> qfactory = GetQuantizationFactoryOf(op);
    if (qparams.zero_point != (1 << (qfactory->GetActivationPrecision() - 1)) &&
        qparams.zero_point != 0 && qfactory->GetPreserveActivationSparsity()) {
      LOG(WARNING) << "Symmetric quantization is used for activation but "
                      "Y_zero_point is "
                   << qparams.zero_point << " for " << op->debug_def().output(0)
                   << " output activation of an operator with type "
                   << op->debug_def().type();
    }
  } else {
    if (op->HasSingleArgumentOfType<int>("Y_zero_point")) {
      LOG(WARNING) << "Y_zero_point without Y_scale for "
                   << op->debug_def().output(0)
                   << " (an output of operator type " << op->debug_def().type()
                   << ") doesn't make sense";
    }
  }

  // When an operator has only one consumer and that consumer only cares about
  // a limited range of values, we can quantize more precisely.
  if (op->HasSingleArgumentOfType<string>("followed_by")) {
    string followed_by_ = op->GetSingleArgument<string>("followed_by", "");
    VLOG(2) << "Operator with type " << op->debug_def().type() << " and output "
            << op->debug_def().output(0) << " is followed by " << followed_by_;

    AdjustOutputTensorQuantizationParamsWithFollowedBy(op, followed_by_);
    if (followed_by) {
      *followed_by = followed_by_;
    }
  }
}

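// Reads per-tensor histograms from histogram_file_name and attaches static
// quantization parameters to each qualifying DNNLOWP operator in net_def.
// Each histogram line describes one output tensor, in one of two formats:
//   new: op_index op_type output_index tensor_name min max nbins bin_0 ... bin_{nbins-1}
//   old: op_index output_index tensor_name min max nbins bin_0 ... bin_{nbins-1}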
NetDef AddScaleZeroOffsetArgumentsWithHistogram(
    NetDef net_def,
    const string& histogram_file_name) {
  ifstream f(histogram_file_name);

  // Check the format by looking at the first line.
  string first_line, word;
  getline(f, first_line);
  f.seekg(0, f.beg);
  istringstream ist(first_line);
  int nwords_first_line = 0;
  while (ist >> word) {
    ++nwords_first_line;
  }

  ist.str(first_line);
  ist.clear();

  bool new_format = true;
  // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
  int op_index, i, nbins;
  string op_type, tensor_name;
  // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
  float min, max;
  ist >> op_index >> op_type >> i >> tensor_name >> min >> max >> nbins;
  if (nwords_first_line != nbins + 7) {
    ist.str(first_line);
    ist.clear();
    ist >> op_index >> i >> tensor_name >> min >> max >> nbins;
    if (nwords_first_line == nbins + 6) {
      new_format = false;
    } else {
      LOG(WARNING) << "histogram file " << histogram_file_name
                   << " has an invalid format";
      return net_def;
    }
  }

  // Parse the input file.
  op_index = 0;
  for (auto& op_def : *net_def.mutable_op()) {
    ArgumentHelper arg_helper(op_def);

    for (i = 0; i < op_def.output().size(); ++i) {
      // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
      int op_index2, i2;

      if (new_format) {
        f >> op_index2 >> op_type >> i2 >> tensor_name >> min >> max >> nbins;
      } else {
        f >> op_index2 >> i2 >> tensor_name >> min >> max >> nbins;
      }
      LOG_IF(WARNING, op_index2 != op_index)
          << "op index " << op_index2 << " doesn't match with " << op_index;
      LOG_IF(WARNING, tensor_name != op_def.output(i))
          << tensor_name << " in histogram file line " << op_index
          << " doesn't match with operator def " << op_def.output(i);
      LOG_IF(WARNING, i2 != i)
          << "output tensor index " << i2 << " doesn't match with " << i;
      if (new_format) {
        LOG_IF(WARNING, op_type != op_def.type())
            << "operator type " << op_type << " in histogram file line "
            << op_index << " doesn't match with operator def "
            << op_def.type();
      }

      // Consume the histogram bins even for outputs we end up skipping, so
      // the stream stays in sync with the file's line structure.
      vector<uint64_t> bins;
      for (int j = 0; j < nbins; ++j) {
        // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
        uint64_t cnt;
        f >> cnt;
        bins.push_back(cnt);
      }

      if (!HasDNNLowPEngine_(op_def) ||
          arg_helper.GetSingleArgument<int>("dequantize_output", 0) != 0 ||
          i > 0) {
        LOG(INFO) << "Skip " << op_def.type() << " " << op_def.output(i);
        continue;
      }

      Histogram hist = Histogram(min, max, bins);

      unique_ptr<QuantizationFactory> qfactory =
          GetQuantizationFactoryOf_(op_def);
      TensorQuantizationParams qparams =
          qfactory->ChooseQuantizationParams(hist);

      SetStaticQuantizationParams_(&op_def, 0, qparams);
    }
    ++op_index;
  }

  return net_def;
}

} // namespace dnnlowp