| // Example code illustrating the theory exposed in doc/quantization.md |
| |
| /* Command line to build and run on x86: |
| |
c++ doc/quantization_example.cc -I . -std=c++11 -msse4.1 -lpthread \
| -o /tmp/quantization_example && \ |
| /tmp/quantization_example |
| |
| */ |
| |
#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <limits>
#include <random>
#include <tuple>
#include <vector>
| #include "../public/gemmlowp.h" |
| #include "../public/output_stages.h" |
| |
| // We will handle both float and quantized matrices, which we will |
| // represent as gemmlowp::MatrixMap. |
| // We will need to be able to print them. |
| |
| // Output a matrix to a std::ostream |
| template <typename tScalar, gemmlowp::MapOrder tOrder> |
| std::ostream& operator<<(std::ostream& s, |
| const gemmlowp::MatrixMap<tScalar, tOrder>& m) { |
| for (int i = 0; i < m.rows(); i++) { |
| for (int j = 0; j < m.cols(); j++) { |
| if (j) { |
| s << '\t'; |
| } |
| s << static_cast<float>(m(i, j)); |
| } |
| s << '\n'; |
| } |
| return s; |
| } |
| |
| // Find the min and max value in a float matrix. |
| template <gemmlowp::MapOrder tOrder> |
| void FindMinMax(const gemmlowp::MatrixMap<float, tOrder>& m, float* min, |
| float* max) { |
  assert(m.rows() > 0 && m.cols() > 0);
  *min = *max = m(0, 0);
| for (int i = 0; i < m.rows(); i++) { |
| for (int j = 0; j < m.cols(); j++) { |
| const float val = m(i, j); |
| *min = std::min(*min, val); |
| *max = std::max(*max, val); |
| } |
| } |
| } |
| |
// A structure to hold quantization parameters 'scale' and 'zero_point',
// as discussed in doc/quantization.md. As explained there, these values
// are the constants in the quantization equation
| // |
| // real_value = scale * (quantized_value - zero_point) |
| // |
| // In other words, 'zero_point' is the quantized value that corresponds |
| // to the real value 0, and 'scale' is the difference of real values |
| // corresponding to consecutive quantized values. |
| struct QuantizationParams { |
| float scale; |
| std::uint8_t zero_point; |
| }; |
| |
| // Given the min and max values of a float array, return |
| // reasonable quantization parameters to use for this array. |
| QuantizationParams ChooseQuantizationParams(float min, float max) { |
| // We extend the [min, max] interval to ensure that it contains 0. |
| // Otherwise, we would not meet the requirement that 0 be an exactly |
| // representable value. |
| min = std::min(min, 0.f); |
| max = std::max(max, 0.f); |
| |
| // the min and max quantized values, as floating-point values |
| const float qmin = 0; |
| const float qmax = 255; |
| |
| // First determine the scale. |
| const double scale = (max - min) / (qmax - qmin); |
| |
| // Zero-point computation. |
| // First the initial floating-point computation. The zero-point can be |
| // determined from solving an affine equation for any known pair |
| // (real value, corresponding quantized value). |
| // We know two such pairs: (rmin, qmin) and (rmax, qmax). |
| // Let's use the first one here. |
| const double initial_zero_point = qmin - min / scale; |
| |
| // Now we need to nudge the zero point to be an integer |
| // (our zero points are integer, and this is motivated by the requirement |
| // to be able to represent the real value "0" exactly as a quantized value, |
| // which is required in multiple places, for example in Im2col with SAME |
| // padding). |
| std::uint8_t nudged_zero_point = 0; |
| if (initial_zero_point < qmin) { |
| nudged_zero_point = qmin; |
| } else if (initial_zero_point > qmax) { |
| nudged_zero_point = qmax; |
| } else { |
| nudged_zero_point = |
| static_cast<std::uint8_t>(std::round(initial_zero_point)); |
| } |
| |
| QuantizationParams result; |
  result.scale = static_cast<float>(scale);
| result.zero_point = nudged_zero_point; |
| return result; |
| } |
| |
| template <gemmlowp::MapOrder tLhsOrder, gemmlowp::MapOrder tRhsOrder, |
| gemmlowp::MapOrder tResultOrder> |
| void FloatMatrixMultiplication( |
| const gemmlowp::MatrixMap<const float, tLhsOrder>& lhs, |
| const gemmlowp::MatrixMap<const float, tRhsOrder>& rhs, |
| gemmlowp::MatrixMap<float, tResultOrder>* result) { |
| assert(lhs.cols() == rhs.rows()); |
| assert(lhs.rows() == result->rows()); |
| assert(rhs.cols() == result->cols()); |
| for (int i = 0; i < lhs.rows(); i++) { |
| for (int k = 0; k < rhs.cols(); k++) { |
| (*result)(i, k) = 0; |
| for (int j = 0; j < lhs.cols(); j++) { |
| (*result)(i, k) += lhs(i, j) * rhs(j, k); |
| } |
| } |
| } |
| } |
| |
| void Quantize(const QuantizationParams& qparams, const std::vector<float>& src, |
| std::vector<std::uint8_t>* dst) { |
| assert(src.size() == dst->size()); |
| for (std::size_t i = 0; i < src.size(); i++) { |
| const float real_val = src[i]; |
| const float transformed_val = qparams.zero_point + real_val / qparams.scale; |
| const float clamped_val = std::max(0.f, std::min(255.f, transformed_val)); |
| (*dst)[i] = static_cast<std::uint8_t>(std::round(clamped_val)); |
| } |
| } |
| |
| void Dequantize(const QuantizationParams& qparams, |
| const std::vector<std::uint8_t>& src, std::vector<float>* dst) { |
| assert(src.size() == dst->size()); |
| for (std::size_t i = 0; i < src.size(); i++) { |
| const std::uint8_t quantized_val = src[i]; |
| (*dst)[i] = qparams.scale * (quantized_val - qparams.zero_point); |
| } |
| } |
| |
| template <typename tScalar, gemmlowp::MapOrder tOrder> |
| class MatrixWithStorage { |
| public: |
| MatrixWithStorage(int rows, int cols) |
| : storage(rows * cols), matrix_map(storage.data(), rows, cols) {} |
| void MakeRandom() { |
| static std::mt19937 random_engine; |
| std::uniform_real_distribution<float> distribution(-1, 1); |
| for (auto& x : storage) { |
| x = static_cast<tScalar>(distribution(random_engine)); |
| } |
| } |
| gemmlowp::MatrixMap<const tScalar, tOrder> ConstMap() const { |
| return gemmlowp::MatrixMap<const tScalar, tOrder>( |
| storage.data(), matrix_map.rows(), matrix_map.cols()); |
| } |
| gemmlowp::MatrixMap<tScalar, tOrder> Map() { |
| return gemmlowp::MatrixMap<tScalar, tOrder>( |
| storage.data(), matrix_map.rows(), matrix_map.cols()); |
| } |
| const std::vector<tScalar>& Storage() const { return storage; } |
| std::vector<tScalar>& Storage() { return storage; } |
| |
| private: |
| std::vector<tScalar> storage; |
| gemmlowp::MatrixMap<tScalar, tOrder> matrix_map; |
| }; |
| |
| template <typename tScalar, gemmlowp::MapOrder tOrder> |
| std::ostream& operator<<(std::ostream& s, |
| const MatrixWithStorage<tScalar, tOrder>& m) { |
| return s << m.ConstMap(); |
| } |
| |
| // Given a real_multiplier in the interval (0, 1), |
| // produces a pair (quantized_multiplier, right_shift) where |
| // quantized_multiplier is an int32 representing a fixed-point value |
| // in the interval [-1, 1) (in practice we only produce positive values) |
| // and right_shift is an amount to shift right by, so that the |
| // floating-point multiplication of some int32 input value by real_multiplier, |
| // |
| // return static_cast<int32>(int32_value * real_multiplier); |
| // |
| // is best approximated by the integer-arithmetic-only code |
| // |
| // return RoundingRightShift( |
| // FixedPointMultiplication(int32_value, quantized_multiplier), |
| // right_shift); |
| // |
| // This is how to obtain the fixed-point multiplier and right shift |
| // parameters to pass to |
| // OutputStageQuantizeDownInt32ByFixedPoint. |
| // |
// Note: all this code only needs to run offline to generate the quantized
// neural network workload, not at runtime on the device on which quantized
// neural networks need to run. So it's not performance-critical at all.
| void QuantizeMultiplierSmallerThanOne(float real_multiplier, |
| std::int32_t* quantized_multiplier, |
| int* right_shift) { |
| assert(real_multiplier > 0.f); |
| assert(real_multiplier < 1.f); |
| int s = 0; |
| // We want to bring the real multiplier into the interval [1/2, 1). |
| // We can do so by multiplying it by two, and recording how many times |
| // we multiplied by two so that we can compensate that by a right |
| // shift by the same amount. |
| while (real_multiplier < 0.5f) { |
| real_multiplier *= 2.0f; |
| s++; |
| } |
| // Now that the real multiplier is in [1/2, 1), we convert it |
| // into a fixed-point number. |
| std::int64_t q = |
| static_cast<std::int64_t>(std::round(real_multiplier * (1ll << 31))); |
| assert(q <= (1ll << 31)); |
| // Handle the special case when the real multiplier was so close to 1 |
  // that its fixed-point approximation was indistinguishable from 1.
| // We handle this by dividing it by two, and remembering to decrement |
| // the right shift amount. |
| if (q == (1ll << 31)) { |
| q /= 2; |
| s--; |
| } |
| assert(s >= 0); |
| assert(q <= std::numeric_limits<std::int32_t>::max()); |
| *quantized_multiplier = static_cast<std::int32_t>(q); |
| *right_shift = s; |
| } |
| |
| int main() { |
| std::cout.precision(3); |
| |
| const int rows = 2; |
| const int depth = 4; |
| const int cols = 3; |
| const auto kOrder = gemmlowp::MapOrder::ColMajor; |
| |
| std::cout << "First, let us make some float matrices LHS and RHS, " |
| << "and compute their product.\n" |
| << std::endl; |
| MatrixWithStorage<float, kOrder> float_lhs(rows, depth); |
| float_lhs.MakeRandom(); |
| MatrixWithStorage<float, kOrder> float_rhs(depth, cols); |
| float_rhs.MakeRandom(); |
| MatrixWithStorage<float, kOrder> reference_float_result(rows, cols); |
| auto reference_float_result_map = reference_float_result.Map(); |
| FloatMatrixMultiplication(float_lhs.ConstMap(), float_rhs.ConstMap(), |
| &reference_float_result_map); |
| std::cout << "Here is the float LHS matrix:\n" << float_lhs << std::endl; |
| std::cout << "Here is the float RHS matrix:\n" << float_rhs << std::endl; |
| std::cout << "Here is the float product (LHS * RHS) matrix obtained by " |
| << "ordinary float matrix multiplication, i.e. as far as we are " |
| << "concerned, the REFERENCE RESULT:\n" |
| << reference_float_result << std::endl; |
| |
| std::cout |
| << "Now we embark on reproducing this result using " |
| << "quantized arithmetic. The code below splits into two parts: " |
| << "quantization code that only needs to run offline (e.g. to " |
| << "generate a quantized neural network workload), and actual " |
| << "runtime quantized code, which is typically performance-critical " |
| << "and where we typically do not want to use any floating-point " |
| << "arithmetic. We want to clearly distinguish between the two.\n" |
| << std::endl; |
| |
| std::cout << "The below is OFFLINE QUANTIZATION CODE. We still use some " |
| << "floating-point arithmetic in the process of generating the " |
| << "quantized workload to be run on-device.\n" |
| << std::endl; |
| |
  std::cout
      << "Now, let us choose quantization parameters for these matrices. "
      << "You might ask, what good is quantization if we need to pick "
      << "quantization parameters for the result before we can run the "
      << "quantized computation to obtain the result? The idea is that we "
      << "target applications such as neural networks, where unknown results "
      << "are only allowed to vary within preexisting bounds. In practice, "
      << "the bounds for the results are typically learned during the "
      << "neural network training process. The min and max of the result do "
      << "not have to be exact. If they are too broad, we just get lower "
      << "quantization accuracy. If they are too narrow, we just get "
      << "clamping at the bounds.\n"
      << std::endl;
| |
| float lhs_min, lhs_max, rhs_min, rhs_max, result_min, result_max; |
| FindMinMax(float_lhs.Map(), &lhs_min, &lhs_max); |
| FindMinMax(float_rhs.Map(), &rhs_min, &rhs_max); |
| FindMinMax(reference_float_result.Map(), &result_min, &result_max); |
| const auto lhs_qparams = ChooseQuantizationParams(lhs_min, lhs_max); |
| const auto rhs_qparams = ChooseQuantizationParams(rhs_min, rhs_max); |
| const auto result_qparams = ChooseQuantizationParams(result_min, result_max); |
| |
| std::cout << "For LHS, we have min = " << lhs_min << ", max = " << lhs_max |
| << ", scale = " << lhs_qparams.scale |
| << ", zero_point = " << static_cast<float>(lhs_qparams.zero_point) |
| << std::endl; |
| std::cout << "For RHS, we have min = " << rhs_min << ", max = " << rhs_max |
| << ", scale = " << rhs_qparams.scale |
| << ", zero_point = " << static_cast<float>(rhs_qparams.zero_point) |
| << std::endl; |
| std::cout << "For the result, we have min = " << result_min |
| << ", max = " << result_max << ", scale = " << result_qparams.scale |
| << ", zero_point = " |
| << static_cast<float>(result_qparams.zero_point) << std::endl; |
| |
| std::cout << std::endl; |
| |
| MatrixWithStorage<std::uint8_t, kOrder> uint8_lhs(rows, depth); |
| MatrixWithStorage<std::uint8_t, kOrder> uint8_rhs(depth, cols); |
| MatrixWithStorage<std::uint8_t, kOrder> actual_uint8_result(rows, cols); |
| |
| Quantize(lhs_qparams, float_lhs.Storage(), &uint8_lhs.Storage()); |
| Quantize(rhs_qparams, float_rhs.Storage(), &uint8_rhs.Storage()); |
| |
| std::cout << "Quantized uint8 LHS matrix:\n" << uint8_lhs << std::endl; |
| std::cout << "Quantized uint8 RHS matrix:\n" << uint8_rhs << std::endl; |
| |
| const int lhs_offset = -lhs_qparams.zero_point; |
| const int rhs_offset = -rhs_qparams.zero_point; |
| const int result_offset = result_qparams.zero_point; |
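
  // Note the sign convention: gemmlowp adds lhs_offset and rhs_offset to
  // each entry of the corresponding matrix before multiplying, so passing
  // -zero_point turns each quantized entry into (quantized_value -
  // zero_point), i.e. the real value divided by the scale, per the
  // quantization equation. The result offset is instead added at the end of
  // the output pipeline, after the multiplier has been applied, so it is
  // +zero_point: it maps the scaled result back into the result's quantized
  // representation.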
| |
| const float real_multiplier = |
| lhs_qparams.scale * rhs_qparams.scale / result_qparams.scale; |
| std::int32_t quantized_multiplier; |
| int right_shift; |
| QuantizeMultiplierSmallerThanOne(real_multiplier, &quantized_multiplier, |
| &right_shift); |
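
  // A rough sanity check that the assertion real_multiplier < 1 holds here
  // (approximate, since the actual min/max are data-dependent): the LHS and
  // RHS entries are drawn from (-1, 1), so their scales are at most
  // 2/255 ~ 0.00784, while each result entry is a sum of depth = 4 such
  // products, so the result scale is on the order of a few times 1/255.
  // That puts real_multiplier on the order of a few thousandths,
  // comfortably inside (0, 1).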
| |
| std::cout << "End of OFFLINE QUANTIZATION CODE.\n" << std::endl; |
| |
| std::cout << "The below is ON-DEVICE RUNTIME QUANTIZED CODE. " |
| << "This is the part that is performance-critical and may only " |
| << "use quantized arithmetic.\n" |
| << std::endl; |
| |
  gemmlowp::OutputStageQuantizeDownInt32ByFixedPoint quantize_down_stage;
| quantize_down_stage.result_offset_after_shift = result_offset; |
| quantize_down_stage.result_fixedpoint_multiplier = quantized_multiplier; |
| quantize_down_stage.result_shift = right_shift; |
| gemmlowp::OutputStageSaturatingCastToUint8 saturating_cast_stage; |
| const auto& output_pipeline = |
| std::make_tuple(quantize_down_stage, saturating_cast_stage); |
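
  // Putting the two stages together: in exact arithmetic, this pipeline
  // computes
  //
  //   uint8_result = Saturate(result_offset + real_multiplier * int32_accum)
  //
  // where int32_accum is the raw int32 accumulator produced by the GEMM
  // below; the fixed-point multiplier and right shift approximate the
  // multiplication by real_multiplier using only integer arithmetic (see
  // QuantizeMultiplierSmallerThanOne above), and Saturate clamps to the
  // uint8 range [0, 255].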
| |
| auto actual_uint8_result_map = actual_uint8_result.Map(); |
| gemmlowp::GemmContext gemm_context; |
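  // GemmWithOutputPipeline computes, for each output entry, the int32
  // accumulator
  //
  //   sum over j of (lhs(i, j) + lhs_offset) * (rhs(j, k) + rhs_offset),
  //
  // and then feeds it through the output pipeline constructed above.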
| gemmlowp::GemmWithOutputPipeline<std::uint8_t, std::uint8_t, |
| gemmlowp::DefaultL8R8BitDepthParams>( |
| &gemm_context, uint8_lhs.ConstMap(), uint8_rhs.ConstMap(), |
| &actual_uint8_result_map, lhs_offset, rhs_offset, output_pipeline); |
| |
| std::cout << "Quantized uint8 result matrix obtained by quantized " |
| << "multiplication:\n" |
| << actual_uint8_result << std::endl; |
| |
| std::cout << "End of ON-DEVICE RUNTIME QUANTIZED CODE.\n" << std::endl; |
| |
| MatrixWithStorage<float, kOrder> actual_float_result(rows, cols); |
| Dequantize(result_qparams, actual_uint8_result.Storage(), |
| &actual_float_result.Storage()); |
| std::cout |
| << "Here is the actual float product (LHS * RHS) matrix obtained by " |
| << "dequantizing the above uint8 result, i.e. " |
| << "as far as we are concerned, the ACTUAL RESULT:\n" |
| << actual_float_result << std::endl; |
| |
| MatrixWithStorage<float, kOrder> diff_float_result(rows, cols); |
| for (int i = 0; i < rows; i++) { |
| for (int j = 0; j < cols; j++) { |
| diff_float_result.Map()(i, j) = |
| actual_float_result.Map()(i, j) - reference_float_result.Map()(i, j); |
| } |
| } |
| |
| std::cout << "Difference between ACTUAL and REFERENCE float results:\n" |
| << diff_float_result << std::endl; |
| } |