caffe2/quantization/server/norm_minimization.cc - platform/external/pytorch - Git at Google

 #include "caffe2/core/logging.h"
 #include "caffe2/utils/cpuid.h"
 #include "l2_minimization.h"

 #include <cassert>
 #include <cmath>
 #include <limits>

 #include <immintrin.h>

 #include <c10/util/irange.h>

 using namespace std;

 namespace dnnlowp {

 #undef NDEBUG

 // Use fp16_min as the small scale cutoff because we don't want to use scales in fp16 subnormal range.
 // This is to be consistent with Glow and FakeLowP implementation for NNPI.
 constexpr float SMALL_SCALE_THRESHOLD = 6.1e-5f;

 // Returns the error contributed by values spread uniformly, with the given
 // density, over the offset interval [begin, end]. Callers in this file pass
 // offsets measured from the center of a destination bin.
 //
 // For NormMinimization::L2 the squared offset is integrated; for any other
 // kind the absolute offset is integrated.
 static float
 GetNorm(float begin, float end, float density, NormMinimization::Kind kind) {
   if (kind == NormMinimization::L2) {
     // density * integral_{begin}^{end} x^2 dx
     //   = density * (end^3 - begin^3) / 3
     // e.g. begin = -d/2 and end = d/2 gives d^3/12 before scaling by density.
     const float cubic_term = (end * end * end - begin * begin * begin) / 3;
     return density * cubic_term;
   }

   // density * integral_{begin}^{end} |x| dx, evaluated separately on the
   // non-positive and non-negative parts of the interval.
   float abs_term = 0;

   const float neg_from = std::min(0.0f, begin);
   const float neg_to = std::min(0.0f, end);
   assert(neg_from * neg_from >= neg_to * neg_to);
   abs_term += (neg_from * neg_from - neg_to * neg_to) / 2;

   const float pos_from = std::max(0.0f, begin);
   const float pos_to = std::max(0.0f, end);
   assert(pos_to * pos_to >= pos_from * pos_from);
   abs_term += (pos_to * pos_to - pos_from * pos_from) / 2;

   return density * abs_term;
 }

 // Filter out outliers in input distributions.
 // Exploit the input distributions for the quick search.
 //
 // Starting from the full histogram range, the search repeatedly trims either
 // the low or the high tail by a quantile step, recomputes the error norm of
 // mapping the trimmed range onto (1 << precision) destination bins, and stops
 // as soon as a trim makes the norm larger than the best one seen so far.
 //
 // hist              - histogram of the values to quantize
 // preserve_sparsity - when true, this search is skipped and
 //                     ChooseQuantizationParams (brute force) is used instead
 // precision         - bit width of the quantized representation
 //
 // Returns whatever QuantizationFactory::ChooseQuantizationParams produces for
 // the selected [min, max] range.
 TensorQuantizationParams NormMinimization::NonlinearQuantizationParamsSearch(
     const Histogram& hist,
     bool preserve_sparsity,
     int precision) {
   if (preserve_sparsity) {
     VLOG(2) << "l2_approx with symmetric quantization falls back to L2";
     return ChooseQuantizationParams(hist, preserve_sparsity, precision);
   }
   VLOG(2) << "Using the nonlinear quantile search";

   // min/max are expected to be filled in by adjust_hist_to_include_zero.
   // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
   float min, max;
   vector<float> bins_f(dnnlowp::adjust_hist_to_include_zero(hist, &min, &max));
   int nbins = bins_f.size();
   // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions)
   float bin_width = (max - min) / nbins;
   float scale = (max - min) / float((1 << precision) - 1);
   if (bin_width == 0 || scale < SMALL_SCALE_THRESHOLD) {
     // Degenerate or very narrow range: skip the search and use [min, max].
     QuantizationFactory* qfactory = QuantizationFactory::GetDefaultInstance();
     return qfactory->ChooseQuantizationParams(
         min, max, precision, preserve_sparsity);
   }
   const int dst_nbins = 1 << precision;
   const double last_dst_bin = dst_nbins - 1.;

   const float org_max = max;
   const float org_min = min;

   // Calculate the CDF in one pass. Each bin count is truncated to an integer
   // before it is accumulated.
   vector<uint64_t> CDF;
   CDF.reserve(bins_f.size());
   uint64_t total = 0;
   for (const float count : bins_f) {
     total += static_cast<uint64_t>(count);
     CDF.push_back(total);
   }

   const double stepsize = 0.00001; // experiment on the granularity
   double alpha = 0.0, beta = 1.0; // lowerbound and upperbound quantiles
   int start_bin = 0;
   int end_bin = nbins - 1;
   double norm_min = numeric_limits<double>::max();

   while (alpha < beta) {
     // find the next step
     const double next_alpha = alpha + stepsize;
     const double next_beta = beta - stepsize;

     // find the left and right bins between the quantile bounds
     int i = start_bin, j = end_bin;
     while (i < end_bin && CDF[i] < next_alpha * total) {
       i++;
     }
     while (j > start_bin && CDF[j] > next_beta * total) {
       j--;
     }

     // decide the next move: trim the side whose bound advanced over more bins
     int next_start_bin = start_bin, next_end_bin = end_bin;
     if ((i - start_bin) > (end_bin - j)) {
       // move the start_bin
       next_start_bin = i;
       alpha = next_alpha;
     } else {
       // move the end_bin
       next_end_bin = j;
       beta = next_beta;
     }

     // The quantile bound moved but the bin range did not: nothing to evaluate.
     if (next_start_bin == start_bin && next_end_bin == end_bin) {
       continue;
     }

     // calculate the norm
     double norm = 0;
     double dst_bin_width =
         // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions)
         bin_width * (next_end_bin - next_start_bin + 1) / dst_nbins;

     // go over each histogram bin and accumulate errors
     for (int src_bin = 0; src_bin < nbins; ++src_bin) {
       // distances from the beginning of first dst_bin to the beginning and
       // end of src_bin
       // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions)
       double src_bin_begin = (src_bin - next_start_bin) * bin_width;
       double src_bin_end = src_bin_begin + bin_width;

       // which dst_bins the beginning and end of src_bin belong to?
       // Indices are clamped to [0, dst_nbins - 1] so that source bins outside
       // the candidate range are charged to the first/last destination bin.
       int dst_bin_of_begin = std::min(
           last_dst_bin, std::max(0., floor(src_bin_begin / dst_bin_width)));
       int dst_bin_of_end = std::min(
           last_dst_bin, std::max(0., floor(src_bin_end / dst_bin_width)));

       double dst_bin_of_begin_center =
           dst_bin_of_begin * dst_bin_width + dst_bin_width / 2;
       double density = bins_f[src_bin] / bin_width;
       if (dst_bin_of_begin == dst_bin_of_end) {
         // if src_bin is entirely within 1 dst_bin
         double delta_begin = src_bin_begin - dst_bin_of_begin_center;
         double delta_end = src_bin_end - dst_bin_of_begin_center;
         // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions)
         norm += GetNorm(delta_begin, delta_end, density, kind_);
       } else {
         // src_bin spans several dst_bins: partial first bin, whole middle
         // bins, partial last bin.
         double delta_begin = src_bin_begin - dst_bin_of_begin_center;
         double delta_end = dst_bin_width / 2;
         // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions)
         norm += GetNorm(delta_begin, delta_end, density, kind_);

         // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions)
         norm += (dst_bin_of_end - dst_bin_of_begin - 1) *
             // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions)
             GetNorm(-dst_bin_width / 2, dst_bin_width / 2, density, kind_);

         double dst_bin_of_end_center =
             dst_bin_of_end * dst_bin_width + dst_bin_width / 2;
         delta_begin = -dst_bin_width / 2;
         delta_end = src_bin_end - dst_bin_of_end_center;
         // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions)
         norm += GetNorm(delta_begin, delta_end, density, kind_);
       }
     }

     // Stop at the first trim that increases the norm; keep the previous range.
     if (norm > norm_min) {
       break;
     }
     norm_min = norm;
     start_bin = next_start_bin;
     end_bin = next_end_bin;
   }
   VLOG(2) << "best quantization range " << start_bin << "," << end_bin + 1
           << "," << norm_min;

   double selected_sum = 0;
   for (int i = start_bin; i < end_bin + 1; ++i) {
     selected_sum += bins_f[i];
   }
   // Stream output does not interpret printf escapes, so a single '%' is used.
   VLOG(2) << "best quantization range covers " << selected_sum / total * 100
           << " %";

   // max must be derived first because it still needs the original min.
   // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions)
   max = min + bin_width * (end_bin + 1);
   // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions)
   min = min + bin_width * start_bin;

   VLOG(2) << "Org min " << org_min << " org max " << org_max << " found min "
           << min << " max " << max << " with minimal norm " << norm_min;
   QuantizationFactory* qfactory = QuantizationFactory::GetDefaultInstance();
   return qfactory->ChooseQuantizationParams(
       min, max, precision, preserve_sparsity);
 }

 // Brute-force search for the clipped quantization range with the smallest
 // error norm.
 //
 // Every (start_bin, nbins_selected) window of the histogram is mapped onto
 // (1 << precision) destination bins and scored. With preserve_sparsity only
 // one window per width is scored: the one starting at bin 0 when min is 0,
 // otherwise the one centered on the bin containing zero.
 //
 // hist              - histogram of the values to quantize
 // preserve_sparsity - restricts the candidate windows as described above and
 //                     is forwarded to QuantizationFactory
 // precision         - bit width of the quantized representation
 //
 // Returns whatever QuantizationFactory::ChooseQuantizationParams produces for
 // the selected [min, max] range.
 TensorQuantizationParams NormMinimization::ChooseQuantizationParams(
     const Histogram& hist,
     bool preserve_sparsity,
     int precision) {
   VLOG(2) << "Using the brute force search";
   // min/max are expected to be filled in by adjust_hist_to_include_zero.
   // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
   float min, max;
   vector<float> bins_f(dnnlowp::adjust_hist_to_include_zero(hist, &min, &max));
   int nbins = bins_f.size();
   // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions)
   float bin_width = (max - min) / nbins;

   float scale = (max - min) / float((1 << precision) - 1);
   if (bin_width == 0 || scale < SMALL_SCALE_THRESHOLD) {
     // Degenerate or very narrow range: skip the search and use [min, max].
     QuantizationFactory* qfactory = QuantizationFactory::GetDefaultInstance();
     return qfactory->ChooseQuantizationParams(
         min, max, precision, preserve_sparsity);
   }
   const int dst_nbins = 1 << precision;
   // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions)
   const float last_dst_bin = dst_nbins - 1.0f;

   // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions)
   int zero_bin = round(-min / bin_width);

   // The kernel choice does not depend on the search position, so decide it
   // once instead of once per candidate window.
   caffe2::CpuId cpuid = caffe2::GetCpuId();
   const bool use_avx2_kernel =
       kind_ == NormMinimization::L2 && cpuid.avx2() && cpuid.fma();

   // best_start_bins[w] = {best start bin, its norm} for window width w.
   // Index 0 is unused.
   vector<pair<int, float>> best_start_bins(nbins + 1);

   // Look at mapping [start_bin, start_bin + nbins_selected) to
   // [0, 1 << precision) for every (start_bin, nbins_selected) combination and
   // pick the one with smallest L2 quantization error
 #ifdef _OPENMP
 #pragma omp parallel for schedule(dynamic)
 #endif
   for (int nbins_selected = 1; nbins_selected <= nbins; ++nbins_selected) {
     float norm_min = numeric_limits<float>::max();
     int best_start_bin = 0;

     int start_bin_begin = 0, start_bin_end = nbins - nbins_selected + 1;
     if (preserve_sparsity) {
       // when preserving sparsity we only check the range
       // starting from 0 (when min is 0) or symmetric around 0.
       if (min == 0) {
         start_bin_begin = 0;
         start_bin_end = 1;
       } else {
         start_bin_begin = zero_bin - nbins_selected / 2;
         start_bin_end = start_bin_begin + 1;
       }
     }

     // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions)
     float dst_bin_width = bin_width * nbins_selected / dst_nbins;

     for (int start_bin = start_bin_begin; start_bin < start_bin_end;
          ++start_bin) {
       float norm = 0;

       // go over each histogram bin and accumulate errors
       if (use_avx2_kernel) {
         // Vectorized kernel defined elsewhere; presumably equivalent to the
         // scalar L2 loop below -- verify against l2_minimization.h.
         norm = internal::L2MinimizationKernelAVX2(
             precision,
             bins_f.data(),
             nbins,
             bin_width,
             dst_bin_width,
             start_bin);
       } else {
         for (int src_bin = 0; src_bin < nbins; ++src_bin) {
           // distances from the beginning of first dst_bin to the beginning and
           // end of src_bin
           // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions)
           float src_bin_begin = (src_bin - start_bin) * bin_width;
           float src_bin_end = src_bin_begin + bin_width;

           // which dst_bins the beginning and end of src_bin belong to?
           // Indices are clamped to [0, dst_nbins - 1].
           int dst_bin_of_begin = std::min(
               last_dst_bin,
               std::max(0.0f, floorf(src_bin_begin / dst_bin_width)));
           int dst_bin_of_end = std::min(
               last_dst_bin,
               std::max(0.0f, floorf(src_bin_end / dst_bin_width)));

           float dst_bin_of_begin_center =
               // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions)
               dst_bin_of_begin * dst_bin_width + dst_bin_width / 2;
           float density = bins_f[src_bin] / bin_width;
           float delta_begin = src_bin_begin - dst_bin_of_begin_center;
           if (dst_bin_of_begin == dst_bin_of_end) {
             // if src_bin is entirely within 1 dst_bin
             float delta_end = src_bin_end - dst_bin_of_begin_center;
             norm += GetNorm(delta_begin, delta_end, density, kind_);
           } else {
             // src_bin spans several dst_bins: partial first bin, whole middle
             // bins, partial last bin.
             float delta_end = dst_bin_width / 2;
             norm += GetNorm(delta_begin, delta_end, density, kind_);

             // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions)
             norm += (dst_bin_of_end - dst_bin_of_begin - 1) *
                 GetNorm(-dst_bin_width / 2, dst_bin_width / 2, density, kind_);

             float dst_bin_of_end_center =
                 // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions)
                 dst_bin_of_end * dst_bin_width + dst_bin_width / 2;
             delta_begin = -dst_bin_width / 2;
             delta_end = src_bin_end - dst_bin_of_end_center;
             norm += GetNorm(delta_begin, delta_end, density, kind_);
           }
         }
       }

       if (norm < norm_min) {
         norm_min = norm;
         best_start_bin = start_bin;
       }
     } // for each start_bin

     // Each iteration writes only its own slot.
     best_start_bins[nbins_selected] = {best_start_bin, norm_min};
   } // for each nbins_selected

   // Reduce the per-width winners to the overall best window.
   float norm_min = numeric_limits<float>::max();
   int best_nbins_selected = 1, best_start_bin = 0;
   for (int nbins_selected = 1; nbins_selected <= nbins; ++nbins_selected) {
     float norm = best_start_bins[nbins_selected].second;
     if (norm < norm_min) {
       norm_min = norm;
       best_start_bin = best_start_bins[nbins_selected].first;
       best_nbins_selected = nbins_selected;
     }
   }

   float total_sum = 0;
   for (const float count : bins_f) {
     total_sum += count;
   }
   // With preserve_sparsity the window may extend past the histogram, so the
   // indices are clamped before summing.
   float selected_sum = 0;
   int i_begin = std::max(0, best_start_bin);
   int i_end = std::min(nbins, best_start_bin + best_nbins_selected);
   for (int i = i_begin; i < i_end; ++i) {
     selected_sum += bins_f[i];
   }
   // Stream output does not interpret printf escapes, so a single '%' is used.
   VLOG(2) << "best quantization range covers " << selected_sum / total_sum * 100
           << " %";

   // max must be derived first because it still needs the original min.
   // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions)
   max = min + bin_width * (best_start_bin + best_nbins_selected);
   // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions)
   min = min + bin_width * (best_start_bin);

   QuantizationFactory* qfactory = QuantizationFactory::GetDefaultInstance();
   return qfactory->ChooseQuantizationParams(
       min, max, precision, preserve_sparsity);
 } // ChooseQuantizationParams

 } // namespace dnnlowp
	#include "caffe2/core/logging.h"
	#include "caffe2/utils/cpuid.h"
	#include "l2_minimization.h"

	#include <cassert>
	#include <cmath>
	#include <limits>

	#include <immintrin.h>

	#include <c10/util/irange.h>

	using namespace std;

	namespace dnnlowp {

	#undef NDEBUG

	// Use fp16_min as the small scale cutoff because we don't want to use scales in fp16 subnormal range.
	// This is to be consistent with Glow and FakeLowP implementation for NNPI.
	constexpr float SMALL_SCALE_THRESHOLD = 6.1e-5f;

	// Returns the error contributed by values spread uniformly, with the given
	// density, over the offset interval [begin, end]. Callers in this file pass
	// offsets measured from the center of a destination bin.
	//
	// For NormMinimization::L2 the squared offset is integrated; for any other
	// kind the absolute offset is integrated.
	static float
	GetNorm(float begin, float end, float density, NormMinimization::Kind kind) {
	  if (kind == NormMinimization::L2) {
	    // density * integral_{begin}^{end} x^2 dx
	    //   = density * (end^3 - begin^3) / 3
	    // e.g. begin = -d/2 and end = d/2 gives d^3/12 before scaling by density.
	    const float cubic_term = (end * end * end - begin * begin * begin) / 3;
	    return density * cubic_term;
	  }

	  // density * integral_{begin}^{end} |x| dx, evaluated separately on the
	  // non-positive and non-negative parts of the interval.
	  float abs_term = 0;

	  const float neg_from = std::min(0.0f, begin);
	  const float neg_to = std::min(0.0f, end);
	  assert(neg_from * neg_from >= neg_to * neg_to);
	  abs_term += (neg_from * neg_from - neg_to * neg_to) / 2;

	  const float pos_from = std::max(0.0f, begin);
	  const float pos_to = std::max(0.0f, end);
	  assert(pos_to * pos_to >= pos_from * pos_from);
	  abs_term += (pos_to * pos_to - pos_from * pos_from) / 2;

	  return density * abs_term;
	}

	// Filter out outliers in input distributions.
	// Exploit the input distributions for the quick search.
	//
	// Starting from the full histogram range, the search repeatedly trims either
	// the low or the high tail by a quantile step, recomputes the error norm of
	// mapping the trimmed range onto (1 << precision) destination bins, and stops
	// as soon as a trim makes the norm larger than the best one seen so far.
	//
	// hist              - histogram of the values to quantize
	// preserve_sparsity - when true, this search is skipped and
	//                     ChooseQuantizationParams (brute force) is used instead
	// precision         - bit width of the quantized representation
	//
	// Returns whatever QuantizationFactory::ChooseQuantizationParams produces for
	// the selected [min, max] range.
	TensorQuantizationParams NormMinimization::NonlinearQuantizationParamsSearch(
	    const Histogram& hist,
	    bool preserve_sparsity,
	    int precision) {
	  if (preserve_sparsity) {
	    VLOG(2) << "l2_approx with symmetric quantization falls back to L2";
	    return ChooseQuantizationParams(hist, preserve_sparsity, precision);
	  }
	  VLOG(2) << "Using the nonlinear quantile search";

	  // min/max are expected to be filled in by adjust_hist_to_include_zero.
	  // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
	  float min, max;
	  vector<float> bins_f(dnnlowp::adjust_hist_to_include_zero(hist, &min, &max));
	  int nbins = bins_f.size();
	  // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions)
	  float bin_width = (max - min) / nbins;
	  float scale = (max - min) / float((1 << precision) - 1);
	  if (bin_width == 0 || scale < SMALL_SCALE_THRESHOLD) {
	    // Degenerate or very narrow range: skip the search and use [min, max].
	    QuantizationFactory* qfactory = QuantizationFactory::GetDefaultInstance();
	    return qfactory->ChooseQuantizationParams(
	        min, max, precision, preserve_sparsity);
	  }
	  const int dst_nbins = 1 << precision;
	  const double last_dst_bin = dst_nbins - 1.;

	  const float org_max = max;
	  const float org_min = min;

	  // Calculate the CDF in one pass. Each bin count is truncated to an integer
	  // before it is accumulated.
	  vector<uint64_t> CDF;
	  CDF.reserve(bins_f.size());
	  uint64_t total = 0;
	  for (const float count : bins_f) {
	    total += static_cast<uint64_t>(count);
	    CDF.push_back(total);
	  }

	  const double stepsize = 0.00001; // experiment on the granularity
	  double alpha = 0.0, beta = 1.0; // lowerbound and upperbound quantiles
	  int start_bin = 0;
	  int end_bin = nbins - 1;
	  double norm_min = numeric_limits<double>::max();

	  while (alpha < beta) {
	    // find the next step
	    const double next_alpha = alpha + stepsize;
	    const double next_beta = beta - stepsize;

	    // find the left and right bins between the quantile bounds
	    int i = start_bin, j = end_bin;
	    while (i < end_bin && CDF[i] < next_alpha * total) {
	      i++;
	    }
	    while (j > start_bin && CDF[j] > next_beta * total) {
	      j--;
	    }

	    // decide the next move: trim the side whose bound advanced over more bins
	    int next_start_bin = start_bin, next_end_bin = end_bin;
	    if ((i - start_bin) > (end_bin - j)) {
	      // move the start_bin
	      next_start_bin = i;
	      alpha = next_alpha;
	    } else {
	      // move the end_bin
	      next_end_bin = j;
	      beta = next_beta;
	    }

	    // The quantile bound moved but the bin range did not: nothing to evaluate.
	    if (next_start_bin == start_bin && next_end_bin == end_bin) {
	      continue;
	    }

	    // calculate the norm
	    double norm = 0;
	    double dst_bin_width =
	        // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions)
	        bin_width * (next_end_bin - next_start_bin + 1) / dst_nbins;

	    // go over each histogram bin and accumulate errors
	    for (int src_bin = 0; src_bin < nbins; ++src_bin) {
	      // distances from the beginning of first dst_bin to the beginning and
	      // end of src_bin
	      // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions)
	      double src_bin_begin = (src_bin - next_start_bin) * bin_width;
	      double src_bin_end = src_bin_begin + bin_width;

	      // which dst_bins the beginning and end of src_bin belong to?
	      // Indices are clamped to [0, dst_nbins - 1] so that source bins outside
	      // the candidate range are charged to the first/last destination bin.
	      int dst_bin_of_begin = std::min(
	          last_dst_bin, std::max(0., floor(src_bin_begin / dst_bin_width)));
	      int dst_bin_of_end = std::min(
	          last_dst_bin, std::max(0., floor(src_bin_end / dst_bin_width)));

	      double dst_bin_of_begin_center =
	          dst_bin_of_begin * dst_bin_width + dst_bin_width / 2;
	      double density = bins_f[src_bin] / bin_width;
	      if (dst_bin_of_begin == dst_bin_of_end) {
	        // if src_bin is entirely within 1 dst_bin
	        double delta_begin = src_bin_begin - dst_bin_of_begin_center;
	        double delta_end = src_bin_end - dst_bin_of_begin_center;
	        // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions)
	        norm += GetNorm(delta_begin, delta_end, density, kind_);
	      } else {
	        // src_bin spans several dst_bins: partial first bin, whole middle
	        // bins, partial last bin.
	        double delta_begin = src_bin_begin - dst_bin_of_begin_center;
	        double delta_end = dst_bin_width / 2;
	        // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions)
	        norm += GetNorm(delta_begin, delta_end, density, kind_);

	        // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions)
	        norm += (dst_bin_of_end - dst_bin_of_begin - 1) *
	            // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions)
	            GetNorm(-dst_bin_width / 2, dst_bin_width / 2, density, kind_);

	        double dst_bin_of_end_center =
	            dst_bin_of_end * dst_bin_width + dst_bin_width / 2;
	        delta_begin = -dst_bin_width / 2;
	        delta_end = src_bin_end - dst_bin_of_end_center;
	        // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions)
	        norm += GetNorm(delta_begin, delta_end, density, kind_);
	      }
	    }

	    // Stop at the first trim that increases the norm; keep the previous range.
	    if (norm > norm_min) {
	      break;
	    }
	    norm_min = norm;
	    start_bin = next_start_bin;
	    end_bin = next_end_bin;
	  }
	  VLOG(2) << "best quantization range " << start_bin << "," << end_bin + 1
	          << "," << norm_min;

	  double selected_sum = 0;
	  for (int i = start_bin; i < end_bin + 1; ++i) {
	    selected_sum += bins_f[i];
	  }
	  // Stream output does not interpret printf escapes, so a single '%' is used.
	  VLOG(2) << "best quantization range covers " << selected_sum / total * 100
	          << " %";

	  // max must be derived first because it still needs the original min.
	  // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions)
	  max = min + bin_width * (end_bin + 1);
	  // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions)
	  min = min + bin_width * start_bin;

	  VLOG(2) << "Org min " << org_min << " org max " << org_max << " found min "
	          << min << " max " << max << " with minimal norm " << norm_min;
	  QuantizationFactory* qfactory = QuantizationFactory::GetDefaultInstance();
	  return qfactory->ChooseQuantizationParams(
	      min, max, precision, preserve_sparsity);
	}

	// Brute-force search for the clipped quantization range with the smallest
	// error norm.
	//
	// Every (start_bin, nbins_selected) window of the histogram is mapped onto
	// (1 << precision) destination bins and scored. With preserve_sparsity only
	// one window per width is scored: the one starting at bin 0 when min is 0,
	// otherwise the one centered on the bin containing zero.
	//
	// hist              - histogram of the values to quantize
	// preserve_sparsity - restricts the candidate windows as described above and
	//                     is forwarded to QuantizationFactory
	// precision         - bit width of the quantized representation
	//
	// Returns whatever QuantizationFactory::ChooseQuantizationParams produces for
	// the selected [min, max] range.
	TensorQuantizationParams NormMinimization::ChooseQuantizationParams(
	    const Histogram& hist,
	    bool preserve_sparsity,
	    int precision) {
	  VLOG(2) << "Using the brute force search";
	  // min/max are expected to be filled in by adjust_hist_to_include_zero.
	  // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
	  float min, max;
	  vector<float> bins_f(dnnlowp::adjust_hist_to_include_zero(hist, &min, &max));
	  int nbins = bins_f.size();
	  // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions)
	  float bin_width = (max - min) / nbins;

	  float scale = (max - min) / float((1 << precision) - 1);
	  if (bin_width == 0 || scale < SMALL_SCALE_THRESHOLD) {
	    // Degenerate or very narrow range: skip the search and use [min, max].
	    QuantizationFactory* qfactory = QuantizationFactory::GetDefaultInstance();
	    return qfactory->ChooseQuantizationParams(
	        min, max, precision, preserve_sparsity);
	  }
	  const int dst_nbins = 1 << precision;
	  // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions)
	  const float last_dst_bin = dst_nbins - 1.0f;

	  // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions)
	  int zero_bin = round(-min / bin_width);

	  // The kernel choice does not depend on the search position, so decide it
	  // once instead of once per candidate window.
	  caffe2::CpuId cpuid = caffe2::GetCpuId();
	  const bool use_avx2_kernel =
	      kind_ == NormMinimization::L2 && cpuid.avx2() && cpuid.fma();

	  // best_start_bins[w] = {best start bin, its norm} for window width w.
	  // Index 0 is unused.
	  vector<pair<int, float>> best_start_bins(nbins + 1);

	  // Look at mapping [start_bin, start_bin + nbins_selected) to
	  // [0, 1 << precision) for every (start_bin, nbins_selected) combination and
	  // pick the one with smallest L2 quantization error
	#ifdef _OPENMP
	#pragma omp parallel for schedule(dynamic)
	#endif
	  for (int nbins_selected = 1; nbins_selected <= nbins; ++nbins_selected) {
	    float norm_min = numeric_limits<float>::max();
	    int best_start_bin = 0;

	    int start_bin_begin = 0, start_bin_end = nbins - nbins_selected + 1;
	    if (preserve_sparsity) {
	      // when preserving sparsity we only check the range
	      // starting from 0 (when min is 0) or symmetric around 0.
	      if (min == 0) {
	        start_bin_begin = 0;
	        start_bin_end = 1;
	      } else {
	        start_bin_begin = zero_bin - nbins_selected / 2;
	        start_bin_end = start_bin_begin + 1;
	      }
	    }

	    // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions)
	    float dst_bin_width = bin_width * nbins_selected / dst_nbins;

	    for (int start_bin = start_bin_begin; start_bin < start_bin_end;
	         ++start_bin) {
	      float norm = 0;

	      // go over each histogram bin and accumulate errors
	      if (use_avx2_kernel) {
	        // Vectorized kernel defined elsewhere; presumably equivalent to the
	        // scalar L2 loop below -- verify against l2_minimization.h.
	        norm = internal::L2MinimizationKernelAVX2(
	            precision,
	            bins_f.data(),
	            nbins,
	            bin_width,
	            dst_bin_width,
	            start_bin);
	      } else {
	        for (int src_bin = 0; src_bin < nbins; ++src_bin) {
	          // distances from the beginning of first dst_bin to the beginning and
	          // end of src_bin
	          // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions)
	          float src_bin_begin = (src_bin - start_bin) * bin_width;
	          float src_bin_end = src_bin_begin + bin_width;

	          // which dst_bins the beginning and end of src_bin belong to?
	          // Indices are clamped to [0, dst_nbins - 1].
	          int dst_bin_of_begin = std::min(
	              last_dst_bin,
	              std::max(0.0f, floorf(src_bin_begin / dst_bin_width)));
	          int dst_bin_of_end = std::min(
	              last_dst_bin,
	              std::max(0.0f, floorf(src_bin_end / dst_bin_width)));

	          float dst_bin_of_begin_center =
	              // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions)
	              dst_bin_of_begin * dst_bin_width + dst_bin_width / 2;
	          float density = bins_f[src_bin] / bin_width;
	          float delta_begin = src_bin_begin - dst_bin_of_begin_center;
	          if (dst_bin_of_begin == dst_bin_of_end) {
	            // if src_bin is entirely within 1 dst_bin
	            float delta_end = src_bin_end - dst_bin_of_begin_center;
	            norm += GetNorm(delta_begin, delta_end, density, kind_);
	          } else {
	            // src_bin spans several dst_bins: partial first bin, whole middle
	            // bins, partial last bin.
	            float delta_end = dst_bin_width / 2;
	            norm += GetNorm(delta_begin, delta_end, density, kind_);

	            // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions)
	            norm += (dst_bin_of_end - dst_bin_of_begin - 1) *
	                GetNorm(-dst_bin_width / 2, dst_bin_width / 2, density, kind_);

	            float dst_bin_of_end_center =
	                // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions)
	                dst_bin_of_end * dst_bin_width + dst_bin_width / 2;
	            delta_begin = -dst_bin_width / 2;
	            delta_end = src_bin_end - dst_bin_of_end_center;
	            norm += GetNorm(delta_begin, delta_end, density, kind_);
	          }
	        }
	      }

	      if (norm < norm_min) {
	        norm_min = norm;
	        best_start_bin = start_bin;
	      }
	    } // for each start_bin

	    // Each iteration writes only its own slot.
	    best_start_bins[nbins_selected] = {best_start_bin, norm_min};
	  } // for each nbins_selected

	  // Reduce the per-width winners to the overall best window.
	  float norm_min = numeric_limits<float>::max();
	  int best_nbins_selected = 1, best_start_bin = 0;
	  for (int nbins_selected = 1; nbins_selected <= nbins; ++nbins_selected) {
	    float norm = best_start_bins[nbins_selected].second;
	    if (norm < norm_min) {
	      norm_min = norm;
	      best_start_bin = best_start_bins[nbins_selected].first;
	      best_nbins_selected = nbins_selected;
	    }
	  }

	  float total_sum = 0;
	  for (const float count : bins_f) {
	    total_sum += count;
	  }
	  // With preserve_sparsity the window may extend past the histogram, so the
	  // indices are clamped before summing.
	  float selected_sum = 0;
	  int i_begin = std::max(0, best_start_bin);
	  int i_end = std::min(nbins, best_start_bin + best_nbins_selected);
	  for (int i = i_begin; i < i_end; ++i) {
	    selected_sum += bins_f[i];
	  }
	  // Stream output does not interpret printf escapes, so a single '%' is used.
	  VLOG(2) << "best quantization range covers " << selected_sum / total_sum * 100
	          << " %";

	  // max must be derived first because it still needs the original min.
	  // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions)
	  max = min + bin_width * (best_start_bin + best_nbins_selected);
	  // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions)
	  min = min + bin_width * (best_start_bin);

	  QuantizationFactory* qfactory = QuantizationFactory::GetDefaultInstance();
	  return qfactory->ChooseQuantizationParams(
	      min, max, precision, preserve_sparsity);
	} // ChooseQuantizationParams

	} // namespace dnnlowp