// internal/output.h - platform/external/gemmlowp - Git at Google

 // Copyright 2015 The Gemmlowp Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //     http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.

 // output.h: processing the 32-bit accumulators output by the unpack
 // stage, obtaining the final result matrix entries and storing them into
 // the destination matrix.

 #ifndef GEMMLOWP_INTERNAL_OUTPUT_H_
 #define GEMMLOWP_INTERNAL_OUTPUT_H_

 #include <cmath>
 #include <tuple>
 #include <type_traits>
 #include <typeinfo>

 #include "../fixedpoint/fixedpoint.h"
 #include "../public/output_stages.h"
 #include "simd_wrappers.h"

 namespace gemmlowp {

 // Primary template for the buffer-level evaluation of one output stage.
 // Usable (OutputStage, InputBufferType) combinations are provided by partial
 // specializations; instantiating this primary template means the requested
 // combination has no implementation and fails the build with the message
 // below.
 template <typename OutputStage, typename InputBufferType>
 struct OutputStageEvalBufferImpl {
   // This generic template body should never be hit.
   // The asserted condition depends on the template parameter InputBufferType.
   static_assert(
       std::is_same<InputBufferType, void>::value,
       "Unimplemented: missing implementation of this output pipeline stage "
       "for this data type. This would happen if some architecture-specific "
       "SIMD back-end (output_$arch.h) were incomplete.");
 };

 // Generic block-level evaluator for an output stage. It forwards the block's
 // underlying buffer to the matching OutputStageEvalBufferImpl and wraps the
 // returned buffer in a RegisterBlock with the same row/column dimensions.
 // The block position passed to Eval() is ignored by this generic version.
 template <typename OutputStage, typename InputType>
 struct OutputStageEvalImpl {
   using InputBufferType = typename InputType::BufferType;
   using BufferEvalImplType =
       OutputStageEvalBufferImpl<OutputStage, InputBufferType>;
   using OutputBufferType = typename BufferEvalImplType::OutputType;
   using OutputScalarType = typename OutputBufferType::ScalarType;

   static constexpr int kRows = InputType::kRows;
   static constexpr int kCols = InputType::kCols;
   using OutputType = RegisterBlock<OutputScalarType, kRows, kCols>;

   OutputStageEvalImpl(const OutputStage& s) : buffer_eval_impl(s) {}

   // Evaluates the stage on `input`; the two int arguments (row and column of
   // the block) are unused here.
   OutputType Eval(InputType input, int /*row*/, int /*col*/) const {
     OutputType result;
     result.buf = buffer_eval_impl.Eval(input.buf);
     return result;
   }

   const BufferEvalImplType buffer_eval_impl;
 };

 // Buffer-level evaluation of OutputStageQuantizeDownInt32ToUint8Scale.
 // For every register r of the input this computes
 //   RoundingDivideByPOT(Mul(Add(r, result_offset), result_mult_int),
 //                       result_shift)
 // using the parameters stored in the referenced output stage.
 template <int Size>
 struct OutputStageEvalBufferImpl<OutputStageQuantizeDownInt32ToUint8Scale,
                                  RegisterBuffer<std::int32_t, Size>> {
   using InputType = RegisterBuffer<std::int32_t, Size>;
   using OutputType = RegisterBuffer<std::int32_t, Size>;
   using OutputStage = OutputStageQuantizeDownInt32ToUint8Scale;

   OutputStageEvalBufferImpl(const OutputStage& s) : output_stage(s) {}

   OutputType Eval(InputType input) const {
     using RegisterType = typename InputType::RegisterType;
     // The offset is broadcast once; multiplier and shift stay scalar.
     const RegisterType offset = Dup<RegisterType>(output_stage.result_offset);
     const std::int32_t multiplier = output_stage.result_mult_int;
     const int shift = output_stage.result_shift;

     OutputType output;
     for (int i = 0; i < InputType::kRegisterCount; ++i) {
       const auto with_offset = Add(input.reg[i], offset);
       const auto scaled = Mul(with_offset, multiplier);
       output.reg[i] = RoundingDivideByPOT(scaled, shift);
     }
     return output;
   }

   // Held by reference: the stage object must outlive this evaluator.
   const OutputStage& output_stage;
 };

 // Block-level evaluation of OutputStageQuantizeDownInt32ToUint8ScalePC.
 // Offset and multiplier are vectors indexed by the block's row (when Shape is
 // VectorShape::Col) or by its column (otherwise); they are loaded at that
 // position and broadcast over the block before the shared result_shift is
 // applied through RoundingDivideByPOT.
 template <int Rows, int Cols, VectorShape Shape>
 struct OutputStageEvalImpl<OutputStageQuantizeDownInt32ToUint8ScalePC<Shape>,
                            RegisterBlock<std::int32_t, Rows, Cols>> {
   using InputType = RegisterBlock<std::int32_t, Rows, Cols>;
   using OutputType = RegisterBlock<std::int32_t, Rows, Cols>;
   using OutputStage = OutputStageQuantizeDownInt32ToUint8ScalePC<Shape>;

   OutputStageEvalImpl(const OutputStage& s) : output_stage(s) {}

   OutputType Eval(InputType input, int row, int col) const {
     const int pos = Shape == VectorShape::Col ? row : col;
     const int shift = output_stage.result_shift;

     const auto offset =
         LoadForBroadcasting<InputType>(output_stage.result_offset, pos);
     const auto multiplier =
         LoadForBroadcasting<InputType>(output_stage.result_mult_int, pos);

     const auto with_offset = BroadcastAdd<InputType>(input, offset);
     const auto dividend = BroadcastMul<InputType>(with_offset, multiplier);

     OutputType output;
     for (int i = 0; i < InputType::kRegisterCount; ++i) {
       output.buf.reg[i] = RoundingDivideByPOT(dividend.buf.reg[i], shift);
     }
     return output;
   }

   // Held by reference: the stage object must outlive this evaluator.
   const OutputStage& output_stage;
 };

 // Buffer-level evaluation of OutputStageQuantizeDownInt32ByFixedPoint.
 // Each input register goes through SaturatingRoundingDoublingHighMul with
 // result_fixedpoint_multiplier, then RoundingDivideByPOT with result_shift,
 // and finally has result_offset_after_shift added.
 template <int Size>
 struct OutputStageEvalBufferImpl<
     OutputStageQuantizeDownInt32ByFixedPoint,
     RegisterBuffer<std::int32_t, Size>> {
   using InputType = RegisterBuffer<std::int32_t, Size>;
   using OutputType = RegisterBuffer<std::int32_t, Size>;
   using OutputStage = OutputStageQuantizeDownInt32ByFixedPoint;

   OutputStageEvalBufferImpl(const OutputStage& s) : output_stage(s) {}

   OutputType Eval(InputType input) const {
     using RegisterType = typename InputType::RegisterType;
     const RegisterType offset =
         Dup<RegisterType>(output_stage.result_offset_after_shift);

     OutputType output;
     for (int i = 0; i < InputType::kRegisterCount; ++i) {
       const RegisterType scaled = SaturatingRoundingDoublingHighMul(
           input.reg[i], output_stage.result_fixedpoint_multiplier);
       const auto shifted =
           RoundingDivideByPOT(scaled, output_stage.result_shift);
       output.reg[i] = Add(shifted, offset);
     }
     return output;
   }

   // Held by reference: the stage object must outlive this evaluator.
   const OutputStage& output_stage;
 };

 // Buffer-level evaluation of OutputStageScaleInt32ByFixedPointAndExponent.
 // The signed result_exponent is split at construction into a non-negative
 // left shift (applied before the fixed-point multiplication) and a
 // non-negative right shift (applied afterwards via RoundingDivideByPOT).
 template <int Size>
 struct OutputStageEvalBufferImpl<OutputStageScaleInt32ByFixedPointAndExponent,
                                  RegisterBuffer<std::int32_t, Size>> {
   using InputType = RegisterBuffer<std::int32_t, Size>;
   using OutputType = RegisterBuffer<std::int32_t, Size>;
   using OutputStage = OutputStageScaleInt32ByFixedPointAndExponent;

   OutputStageEvalBufferImpl(const OutputStage& s)
       : output_stage(s),
         left_shift(std::max(0, s.result_exponent)),
         right_shift(std::max(0, -s.result_exponent)) {}

   OutputType Eval(InputType input) const {
     using RegisterType = typename InputType::RegisterType;
     const RegisterType offset =
         Dup<RegisterType>(output_stage.result_offset_after_shift);

     OutputType output;
     for (int i = 0; i < InputType::kRegisterCount; ++i) {
       const auto pre_shifted = ShiftLeft(input.reg[i], left_shift);
       const RegisterType scaled = SaturatingRoundingDoublingHighMul(
           pre_shifted, output_stage.result_fixedpoint_multiplier);
       const auto post_shifted = RoundingDivideByPOT(scaled, right_shift);
       output.reg[i] = Add(post_shifted, offset);
     }
     return output;
   }

   // Held by reference: the stage object must outlive this evaluator.
   const OutputStage& output_stage;
   // max(0, result_exponent) and max(0, -result_exponent) respectively.
   int left_shift;
   int right_shift;
 };

 // Block-level evaluation of OutputStageScaleInt32ByFixedPointAndExponentPC.
 // The multiplier and exponent are vectors, indexed by the block's column
 // (when Shape is VectorShape::Row) or by its row (otherwise). The exponent is
 // split into a non-negative left shift and a non-negative right shift, applied
 // before and after the fixed-point multiplication; result_offset_after_shift
 // is a single value added to every register at the end.
 template <int Rows, int Cols, VectorShape Shape>
 struct OutputStageEvalImpl<
     OutputStageScaleInt32ByFixedPointAndExponentPC<Shape>,
     RegisterBlock<std::int32_t, Rows, Cols>> {
   using InputType = RegisterBlock<std::int32_t, Rows, Cols>;
   using OutputType = RegisterBlock<std::int32_t, Rows, Cols>;
   using OutputStage = OutputStageScaleInt32ByFixedPointAndExponentPC<Shape>;

   OutputStageEvalImpl(const OutputStage& s) : output_stage(s) {}

   OutputType Eval(InputType input, int row, int col) const {
     using RegisterType = typename InputType::RegisterType;
     const int pos = Shape == VectorShape::Row ? col : row;

     // Both shift amounts start from the same loaded exponents and are then
     // clamped to be non-negative: max(e, 0) and max(-e, 0).
     auto left_shift =
         LoadForBroadcasting<InputType>(output_stage.result_exponent, pos);
     auto right_shift =
         LoadForBroadcasting<InputType>(output_stage.result_exponent, pos);
     for (int i = 0; i < decltype(left_shift)::kRegisterCount; ++i) {
       left_shift.buf.reg[i] = Max(left_shift.buf.reg[i], 0);
       right_shift.buf.reg[i] = Max(-right_shift.buf.reg[i], 0);
     }

     const auto multiplier = LoadForBroadcasting<InputType>(
         output_stage.result_fixedpoint_multiplier, pos);
     const auto pre_shifted = BroadcastShiftLeft(input, left_shift);
     const auto scaled =
         BroadcastSaturatingRoundingDoublingHighMul(pre_shifted, multiplier);
     const auto post_shifted =
         BroadcastRoundingDivideByPOT(scaled, right_shift);

     const RegisterType offset =
         Dup<RegisterType>(output_stage.result_offset_after_shift);
     OutputType output;
     for (int i = 0; i < InputType::kRegisterCount; ++i) {
       output.buf.reg[i] = Add(post_shifted.buf.reg[i], offset);
     }
     return output;
   }

   // Held by reference: the stage object must outlive this evaluator.
   const OutputStage& output_stage;
 };

 // Scalar implementation of OutputStageSaturatingCastToUint8: every int32
 // value is clamped to [0, 255] and then stored as uint8.
 template <int Size>
 struct OutputStageEvalBufferImpl<OutputStageSaturatingCastToUint8,
                                  RegisterBuffer<std::int32_t, Size>> {
   using InputType = RegisterBuffer<std::int32_t, Size>;
   using OutputType = RegisterBuffer<std::uint8_t, Size>;
   using OutputStage = OutputStageSaturatingCastToUint8;

   static_assert(InputType::kRegisterLanes == 1,
                 "This path is only for scalar values");

   // The stage carries no parameters, so nothing is stored.
   OutputStageEvalBufferImpl(const OutputStage&) {}

   OutputType Eval(InputType input) const {
     using OutputRegisterType = typename OutputType::RegisterType;
     OutputType output;
     for (int i = 0; i < InputType::kRegisterCount; ++i) {
       std::int32_t clamped = input.reg[i];
       if (clamped > 255) {
         clamped = 255;
       } else if (clamped < 0) {
         clamped = 0;
       }
       output.reg[i] = static_cast<OutputRegisterType>(clamped);
     }
     return output;
   }
 };

 // Scalar implementation of OutputStageSaturatingCastToInt8: every int32
 // value is clamped to [-128, 127] and then stored as int8.
 template <int Size>
 struct OutputStageEvalBufferImpl<OutputStageSaturatingCastToInt8,
                                  RegisterBuffer<std::int32_t, Size>> {
   using InputType = RegisterBuffer<std::int32_t, Size>;
   using OutputType = RegisterBuffer<std::int8_t, Size>;
   using OutputStage = OutputStageSaturatingCastToInt8;

   static_assert(InputType::kRegisterLanes == 1,
                 "This path is only for scalar values");

   // The stage carries no parameters, so nothing is stored.
   OutputStageEvalBufferImpl(const OutputStage&) {}

   OutputType Eval(InputType input) const {
     using OutputRegisterType = typename OutputType::RegisterType;
     OutputType output;
     for (int i = 0; i < InputType::kRegisterCount; ++i) {
       std::int32_t clamped = input.reg[i];
       if (clamped > 127) {
         clamped = 127;
       } else if (clamped < -128) {
         clamped = -128;
       }
       output.reg[i] = static_cast<OutputRegisterType>(clamped);
     }
     return output;
   }
 };

 // Scalar implementation of OutputStageSaturatingCastToInt16: every int32
 // value is clamped to [-32768, 32767] and then stored as int16.
 template <int Size>
 struct OutputStageEvalBufferImpl<OutputStageSaturatingCastToInt16,
                                  RegisterBuffer<std::int32_t, Size>> {
   using InputType = RegisterBuffer<std::int32_t, Size>;
   using OutputType = RegisterBuffer<std::int16_t, Size>;
   using OutputStage = OutputStageSaturatingCastToInt16;

   static_assert(InputType::kRegisterLanes == 1,
                 "This path is only for scalar values");

   // The stage carries no parameters, so nothing is stored.
   OutputStageEvalBufferImpl(const OutputStage&) {}

   OutputType Eval(InputType input) const {
     using OutputRegisterType = typename OutputType::RegisterType;
     OutputType output;
     for (int i = 0; i < InputType::kRegisterCount; ++i) {
       std::int32_t clamped = input.reg[i];
       if (clamped > 32767) {
         clamped = 32767;
       } else if (clamped < -32768) {
         clamped = -32768;
       }
       output.reg[i] = static_cast<OutputRegisterType>(clamped);
     }
     return output;
   }
 };

 // Scalar implementation of OutputStageTruncatingCastToUint8: every int32
 // value is converted to the uint8 output register type without any clamping.
 template <int Size>
 struct OutputStageEvalBufferImpl<OutputStageTruncatingCastToUint8,
                                  RegisterBuffer<std::int32_t, Size>> {
   using InputType = RegisterBuffer<std::int32_t, Size>;
   using OutputType = RegisterBuffer<std::uint8_t, Size>;
   using OutputStage = OutputStageTruncatingCastToUint8;

   static_assert(InputType::kRegisterLanes == 1,
                 "This path is only for scalar values");

   // The stage carries no parameters, so nothing is stored.
   OutputStageEvalBufferImpl(const OutputStage&) {}

   OutputType Eval(InputType input) const {
     using OutputRegisterType = typename OutputType::RegisterType;
     OutputType output;
     for (int i = 0; i < InputType::kRegisterCount; ++i) {
       // Plain narrowing conversion; out-of-range values are not saturated.
       output.reg[i] = static_cast<OutputRegisterType>(input.reg[i]);
     }
     return output;
   }
 };

 // Block-level evaluation of OutputStageBiasAddition: the bias vector is
 // loaded at the block's column (when VectorType::kShape is VectorShape::Row)
 // or at its row (otherwise) and added to the input with broadcasting.
 template <int Rows, int Cols, typename VectorType>
 struct OutputStageEvalImpl<OutputStageBiasAddition<VectorType>,
                            RegisterBlock<std::int32_t, Rows, Cols>> {
   using InputType = RegisterBlock<std::int32_t, Rows, Cols>;
   using OutputType = RegisterBlock<std::int32_t, Rows, Cols>;
   using OutputStage = OutputStageBiasAddition<VectorType>;

   OutputStageEvalImpl(const OutputStage& s) : output_stage(s) {}

   OutputType Eval(InputType input, int row, int col) const {
     const int pos = VectorType::kShape == VectorShape::Row ? col : row;
     const auto bias =
         LoadForBroadcasting<InputType>(output_stage.bias_vector, pos);
     return BroadcastAdd<InputType>(input, bias);
   }

   // Held by reference: the stage object must outlive this evaluator.
   const OutputStage& output_stage;
 };

 // Buffer-level evaluation of OutputStageClamp: each register is first raised
 // to at least output_stage.min via Max, then lowered to at most
 // output_stage.max via Min.
 template <int Size>
 struct OutputStageEvalBufferImpl<OutputStageClamp,
                                  RegisterBuffer<std::int32_t, Size>> {
   using InputType = RegisterBuffer<std::int32_t, Size>;
   using OutputType = RegisterBuffer<std::int32_t, Size>;
   using OutputStage = OutputStageClamp;

   OutputStageEvalBufferImpl(const OutputStage& s) : output_stage(s) {}

   OutputType Eval(InputType input) const {
     using RegisterType = typename InputType::RegisterType;
     const RegisterType lower = Dup<RegisterType>(output_stage.min);
     const RegisterType upper = Dup<RegisterType>(output_stage.max);

     OutputType output;
     for (int i = 0; i < InputType::kRegisterCount; ++i) {
       const auto raised = Max(input.reg[i], lower);
       output.reg[i] = Min(raised, upper);
     }
     return output;
   }

   // Held by reference: the stage object must outlive this evaluator.
   const OutputStage& output_stage;
 };

 // Buffer-level evaluation of OutputStageTanh on int32 registers.
 //
 // The constructor derives, from the stage's real_zero_as_int32 and
 // real_amplitude_as_int32, the saturation cutoffs and the fixed-point
 // constants used by Eval(). Eval() then centers each input on the zero point,
 // scales it by the inverse amplitude, applies the fixed-point tanh(), scales
 // the result back by the amplitude and re-adds the zero point.
 template <int Size>
 struct OutputStageEvalBufferImpl<OutputStageTanh,
                                  RegisterBuffer<std::int32_t, Size>> {
   typedef RegisterBuffer<std::int32_t, Size> InputType;
   typedef RegisterBuffer<std::int32_t, Size> OutputType;
   using RegisterType = typename InputType::RegisterType;
   typedef RegisterType DataType;
   typedef OutputStageTanh OutputStage;

   // Precomputes all constants that do not depend on the input values.
   // NOTE(review): the first normalization loop below never terminates when
   // real_amplitude_as_int32 is negative (a negative value stays below 0.5
   // when doubled), and a zero amplitude divides by zero. This looks like it
   // expects a strictly positive amplitude -- confirm against callers.
   OutputStageEvalBufferImpl(const OutputStage& s) : output_stage(s) {
     const std::int32_t real_zero_as_int32 = output_stage.real_zero_as_int32;
     const std::int32_t real_amplitude_as_int32 =
         output_stage.real_amplitude_as_int32;

     // Inputs at or beyond 8 amplitudes from the zero point are mapped
     // directly to the extreme outputs (zero point -/+ one amplitude) in Eval().
     input_cutoff_min = real_zero_as_int32 - 8 * real_amplitude_as_int32;
     input_cutoff_max = real_zero_as_int32 + 8 * real_amplitude_as_int32;
     output_min = real_zero_as_int32 - real_amplitude_as_int32;
     output_max = real_zero_as_int32 + real_amplitude_as_int32;

     // Express 1/amplitude as m * 2^-e by doubling until m >= 0.5, counting
     // the doublings in inverse_amplitude_neg_exponent.
     double inverse_amplitude_normalized_double = 1.0 / real_amplitude_as_int32;
     inverse_amplitude_neg_exponent = 0;
     while (inverse_amplitude_normalized_double < 0.5) {
       inverse_amplitude_normalized_double *= 2;
       inverse_amplitude_neg_exponent++;
     }
     inverse_amplitude_normalized = FixedPoint<DataType, 0>::FromDouble(
         inverse_amplitude_normalized_double);

     // Express the amplitude as m * 2^e by halving until m < 1.0, counting the
     // halvings in amplitude_exponent.
     double amplitude_normalized_double = real_amplitude_as_int32;
     amplitude_exponent = 0;
     while (amplitude_normalized_double >= 1.0) {
       amplitude_normalized_double *= 0.5;
       amplitude_exponent++;
     }
     amplitude_normalized =
         FixedPoint<DataType, 0>::FromDouble(amplitude_normalized_double);
   }

   // Applies the tanh stage to every register of `input` and returns the
   // resulting buffer.
   OutputType Eval(InputType input) const {
     const std::int32_t real_zero_as_int32 = output_stage.real_zero_as_int32;

     // Fixed-point types with 3 and 0 integer bits respectively.
     typedef FixedPoint<DataType, 3> F3;
     typedef FixedPoint<DataType, 0> F0;

     OutputType output;

     for (int i = 0; i < OutputType::kRegisterCount; i++) {
       // fixed-point affine transformation
       // (subtract the zero point, then multiply by the normalized inverse
       // amplitude)
       DataType input_centered =
           Sub(input.reg[i], Dup<DataType>(real_zero_as_int32));
       F3 fixedpoint_input =
           F3::FromRaw(input_centered) * inverse_amplitude_normalized;
       // left shift
       // (shift amount is reduced by the exponent counted in the constructor)
       fixedpoint_input.raw() = ShiftLeft(fixedpoint_input.raw(),
                                          28 - inverse_amplitude_neg_exponent);
       // fixed-point tanh and multiplication
       F0 fixedpoint_output = tanh(fixedpoint_input) * amplitude_normalized;
       // right shift
       // (brings the F0 result back to the int32 scale, then re-adds the zero
       // point)
       DataType int32_output =
           Add(Dup<DataType>(real_zero_as_int32),
               ShiftRight(fixedpoint_output.raw(), 31 - amplitude_exponent));

       // Masks selecting the lanes whose input lies at or beyond the cutoffs.
       DataType mask_if_below_cutoff_min =
           MaskIfLessThanOrEqual(input.reg[i], Dup<DataType>(input_cutoff_min));
       DataType mask_if_above_cutoff_max = MaskIfGreaterThanOrEqual(
           input.reg[i], Dup<DataType>(input_cutoff_max));

       // Saturated lanes take output_min / output_max; the others keep the
       // computed value. The below-cutoff mask takes precedence.
       output.reg[i] = SelectUsingMask(
           mask_if_below_cutoff_min, Dup<DataType>(output_min),
           SelectUsingMask(mask_if_above_cutoff_max, Dup<DataType>(output_max),
                           int32_output));
     }
     return output;
   }

   // Held by reference: the stage object must outlive this evaluator.
   const OutputStage& output_stage;
   // Input thresholds at zero point -/+ 8 amplitudes.
   std::int32_t input_cutoff_min, input_cutoff_max;
   // Output values used for saturated inputs: zero point -/+ one amplitude.
   std::int32_t output_min, output_max;
   // Normalized 1/amplitude and the number of doublings used to normalize it.
   FixedPoint<DataType, 0> inverse_amplitude_normalized;
   int inverse_amplitude_neg_exponent;
   // Normalized amplitude and the number of halvings used to normalize it.
   FixedPoint<DataType, 0> amplitude_normalized;
   int amplitude_exponent;
 };

 // OutputPipelineOutputType computes the data type produced by an output
 // pipeline for a given input data type. It is a recursive template: this
 // primary template peels off the stage at index FirstStage, determines that
 // stage's output type, and recurses on the remaining stages with it. See the
 // explanation on OutputPipelineEvalImpl below.
 template <typename OutputPipelineType, int FirstStage, typename InputType,
           bool StopRecursion =
               FirstStage == std::tuple_size<OutputPipelineType>::value>
 struct OutputPipelineOutputType {
   using FirstStageType =
       typename std::tuple_element<FirstStage, OutputPipelineType>::type;
   using FirstStageOutputType =
       typename OutputStageEvalImpl<FirstStageType, InputType>::OutputType;
   using Type =
       typename OutputPipelineOutputType<OutputPipelineType, FirstStage + 1,
                                         FirstStageOutputType>::Type;
 };

 // End of the recursion (StopRecursion == true): no stage is left to apply,
 // so the pipeline's output type is simply the type that was passed in.
 template <typename OutputPipelineType, int FirstStage, typename InputType>
 struct OutputPipelineOutputType<OutputPipelineType, FirstStage, InputType,
                                 true> {
   using Type = InputType;
 };

 // OutputPipelineEvalImpl is a helper to implement the evaluation of
 // the whole pipeline. It is a recursive template to implement compile-time
 // unrolling of the loop over all pipeline stages. The 'FirstStage' parameter
 // is how we implement recursion: each specialization implements only
 // evaluation starting at 'FirstStage'. The StopRecursion parameter is just a
 // helper to implement the termination of the recursion as a partial
 // specialization below.
 template <typename OutputPipelineType, int FirstStage, typename InputType,
           bool StopRecursion =
               FirstStage == std::tuple_size<OutputPipelineType>::value>
 struct OutputPipelineEvalImpl {
   typedef typename std::tuple_element<FirstStage, OutputPipelineType>::type
       FirstStageType;
   typedef typename OutputStageEvalImpl<FirstStageType, InputType>::OutputType
       FirstStageOutputType;
   typedef typename OutputPipelineOutputType<OutputPipelineType, FirstStage,
                                             InputType>::Type OutputType;

   OutputPipelineEvalImpl(const OutputPipelineType& output_pipeline)
       : head_impl(std::get<FirstStage>(output_pipeline)),
         tail_impl(output_pipeline) {}

   OutputType Eval(InputType input, int row, int col) const {
     // Evaluate the first stage.
     FirstStageOutputType first_stage_output = head_impl.Eval(input, row, col);
     // Recurse into the remaining stages.
     return tail_impl.Eval(first_stage_output, row, col);
   }

   const OutputStageEvalImpl<FirstStageType, InputType> head_impl;
   const OutputPipelineEvalImpl<OutputPipelineType, FirstStage + 1,
                                FirstStageOutputType>
       tail_impl;
 };

 // Partial specialization selected when 'StopRecursion' is true, i.e. once
 // every stage has been applied: it holds no state and acts as the identity.
 template <typename OutputPipelineType, int FirstStage, typename InputType>
 struct OutputPipelineEvalImpl<OutputPipelineType, FirstStage, InputType, true> {
   OutputPipelineEvalImpl(const OutputPipelineType&) {}

   // End of the recursion: the value is returned unchanged and the position
   // arguments are ignored.
   InputType Eval(InputType input, int /*row*/, int /*col*/) const {
     return input;
   }
 };

 // Primary template of the helper that stores a final register block into the
 // destination. Only specializations are meant to be instantiated; reaching
 // this body fails the build with the message below.
 template <typename RegisterBlockType, typename DstType>
 struct StoreFinalOutputImpl {
   // The asserted condition depends on the template parameter
   // RegisterBlockType.
   static_assert(std::is_same<RegisterBlockType, void>::value,
                 "This generic impl should never be hit");
 };

 // Element-by-element store of a Rows x Cols register block into `dst`.
 // The entry at block position (r, c) is read from src.buf.reg[r + c * Rows]
 // and written to the destination element at (row + r, col + c).
 template <typename ScalarType, int Rows, int Cols, typename DstType>
 struct StoreFinalOutputImpl<RegisterBlock<ScalarType, Rows, Cols>, DstType> {
   using RegisterBlockType = RegisterBlock<ScalarType, Rows, Cols>;

   // `row` / `col` give the destination position of the block's (0, 0) entry.
   static void Run(const RegisterBlockType& src, DstType* dst, int row,
                   int col) {
     for (int r = 0; r < Rows; ++r) {
       const int dst_row = row + r;
       for (int c = 0; c < Cols; ++c) {
         const int src_index = r + c * Rows;
         *dst->data(dst_row, col + c) = src.buf.reg[src_index];
       }
     }
   }
 };

 // StoreFinalOutput takes the final value at the end of the output pipeline and
 // stores it into the destination matrix. It can be specialized for different
 // data types; the generic implementation here is typically used only for plain
 // old scalar (not SIMD) types.
 template <typename RegisterBlockType, typename DstType>
 void StoreFinalOutput(RegisterBlockType src, DstType* dst, int row, int col) {
   StoreFinalOutputImpl<RegisterBlockType, DstType>::Run(src, dst, row, col);
 }

 template <typename OutputPipelineType, typename InputType>
 struct OutputPipelineExecutor {
   OutputPipelineExecutor(const OutputPipelineType& output_pipeline)
       : output_pipeline_eval_impl_(output_pipeline) {}

   // Execute is the entry point into the output pipeline evaluation
   // code. It should be the only thing that unpack code calls. It takes the
   // result
   // of the unpack stage and stores it into the destination matrix.
   template <typename DstType>
   void Execute(InputType input, DstType* dst, int src_global_row,
                int src_global_col, int dst_row, int dst_col) const {
     // Statically assert that the output pipeline matches the given destination
     // matrix's scalar type.
     typedef typename OutputPipelineOutputType<
         OutputPipelineType, 0, InputType>::Type::BufferType::ScalarType

         ScalarOutputType;
     typedef typename DstType::Scalar ScalarDstType;
     static_assert(std::is_same<ScalarOutputType, ScalarDstType>::value,
                   "mismatched destination scalar type and output pipeline");

     // Evaluate the output pipeline.
     auto output =
         output_pipeline_eval_impl_.Eval(input, src_global_row, src_global_col);
     // Store the result into the destination matrix.
     StoreFinalOutput(output, dst, dst_row, dst_col);
   }

   const OutputPipelineEvalImpl<OutputPipelineType, 0, InputType>
       output_pipeline_eval_impl_;
 };

 }  // namespace gemmlowp

 #ifdef GEMMLOWP_NEON
 #include "output_neon.h"
 #elif defined(GEMMLOWP_SSE4)
 #include "output_sse.h"
 #elif defined(GEMMLOWP_MSA)
 #include "output_msa.h"
 #endif

 #endif  // GEMMLOWP_INTERNAL_OUTPUT_H_
	// Copyright 2015 The Gemmlowp Authors. All Rights Reserved.
	//
	// Licensed under the Apache License, Version 2.0 (the "License");
	// you may not use this file except in compliance with the License.
	// You may obtain a copy of the License at
	//
	// http://www.apache.org/licenses/LICENSE-2.0
	//
	// Unless required by applicable law or agreed to in writing, software
	// distributed under the License is distributed on an "AS IS" BASIS,
	// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	// See the License for the specific language governing permissions and
	// limitations under the License.

	// output.h: processing the 32-bit accumulators output by the unpack
	// stage, obtaining the final result matrix entries and storing them into
	// the destination matrix.

	#ifndef GEMMLOWP_INTERNAL_OUTPUT_H_
	#define GEMMLOWP_INTERNAL_OUTPUT_H_

	#include <cmath>
	#include <tuple>
	#include <type_traits>
	#include <typeinfo>

	#include "../fixedpoint/fixedpoint.h"
	#include "../public/output_stages.h"
	#include "simd_wrappers.h"

	namespace gemmlowp {

	// Primary template for the buffer-level evaluation of one output stage.
	// Usable (OutputStage, InputBufferType) combinations are provided by partial
	// specializations; instantiating this primary template means the requested
	// combination has no implementation and fails the build with the message
	// below.
	template <typename OutputStage, typename InputBufferType>
	struct OutputStageEvalBufferImpl {
	// This generic template body should never be hit.
	// The asserted condition depends on the template parameter InputBufferType.
	static_assert(
	std::is_same<InputBufferType, void>::value,
	"Unimplemented: missing implementation of this output pipeline stage "
	"for this data type. This would happen if some architecture-specific "
	"SIMD back-end (output_$arch.h) were incomplete.");
	};

	// Generic block-level evaluator for an output stage. It forwards the block's
	// underlying buffer to the matching OutputStageEvalBufferImpl and wraps the
	// returned buffer in a RegisterBlock with the same row/column dimensions.
	// The block position passed to Eval() is ignored by this generic version.
	template <typename OutputStage, typename InputType>
	struct OutputStageEvalImpl {
	  using InputBufferType = typename InputType::BufferType;
	  using BufferEvalImplType =
	      OutputStageEvalBufferImpl<OutputStage, InputBufferType>;
	  using OutputBufferType = typename BufferEvalImplType::OutputType;
	  using OutputScalarType = typename OutputBufferType::ScalarType;

	  static constexpr int kRows = InputType::kRows;
	  static constexpr int kCols = InputType::kCols;
	  using OutputType = RegisterBlock<OutputScalarType, kRows, kCols>;

	  OutputStageEvalImpl(const OutputStage& s) : buffer_eval_impl(s) {}

	  // Evaluates the stage on `input`; the two int arguments (row and column of
	  // the block) are unused here.
	  OutputType Eval(InputType input, int /*row*/, int /*col*/) const {
	    OutputType result;
	    result.buf = buffer_eval_impl.Eval(input.buf);
	    return result;
	  }

	  const BufferEvalImplType buffer_eval_impl;
	};

	// Buffer-level evaluation of OutputStageQuantizeDownInt32ToUint8Scale.
	// For every register r of the input this computes
	//   RoundingDivideByPOT(Mul(Add(r, result_offset), result_mult_int),
	//                       result_shift)
	// using the parameters stored in the referenced output stage.
	template <int Size>
	struct OutputStageEvalBufferImpl<OutputStageQuantizeDownInt32ToUint8Scale,
	                                 RegisterBuffer<std::int32_t, Size>> {
	  using InputType = RegisterBuffer<std::int32_t, Size>;
	  using OutputType = RegisterBuffer<std::int32_t, Size>;
	  using OutputStage = OutputStageQuantizeDownInt32ToUint8Scale;

	  OutputStageEvalBufferImpl(const OutputStage& s) : output_stage(s) {}

	  OutputType Eval(InputType input) const {
	    using RegisterType = typename InputType::RegisterType;
	    // The offset is broadcast once; multiplier and shift stay scalar.
	    const RegisterType offset = Dup<RegisterType>(output_stage.result_offset);
	    const std::int32_t multiplier = output_stage.result_mult_int;
	    const int shift = output_stage.result_shift;

	    OutputType output;
	    for (int i = 0; i < InputType::kRegisterCount; ++i) {
	      const auto with_offset = Add(input.reg[i], offset);
	      const auto scaled = Mul(with_offset, multiplier);
	      output.reg[i] = RoundingDivideByPOT(scaled, shift);
	    }
	    return output;
	  }

	  // Held by reference: the stage object must outlive this evaluator.
	  const OutputStage& output_stage;
	};

	// Block-level evaluation of OutputStageQuantizeDownInt32ToUint8ScalePC.
	// Offset and multiplier are vectors indexed by the block's row (when Shape is
	// VectorShape::Col) or by its column (otherwise); they are loaded at that
	// position and broadcast over the block before the shared result_shift is
	// applied through RoundingDivideByPOT.
	template <int Rows, int Cols, VectorShape Shape>
	struct OutputStageEvalImpl<OutputStageQuantizeDownInt32ToUint8ScalePC<Shape>,
	                           RegisterBlock<std::int32_t, Rows, Cols>> {
	  using InputType = RegisterBlock<std::int32_t, Rows, Cols>;
	  using OutputType = RegisterBlock<std::int32_t, Rows, Cols>;
	  using OutputStage = OutputStageQuantizeDownInt32ToUint8ScalePC<Shape>;

	  OutputStageEvalImpl(const OutputStage& s) : output_stage(s) {}

	  OutputType Eval(InputType input, int row, int col) const {
	    const int pos = Shape == VectorShape::Col ? row : col;
	    const int shift = output_stage.result_shift;

	    const auto offset =
	        LoadForBroadcasting<InputType>(output_stage.result_offset, pos);
	    const auto multiplier =
	        LoadForBroadcasting<InputType>(output_stage.result_mult_int, pos);

	    const auto with_offset = BroadcastAdd<InputType>(input, offset);
	    const auto dividend = BroadcastMul<InputType>(with_offset, multiplier);

	    OutputType output;
	    for (int i = 0; i < InputType::kRegisterCount; ++i) {
	      output.buf.reg[i] = RoundingDivideByPOT(dividend.buf.reg[i], shift);
	    }
	    return output;
	  }

	  // Held by reference: the stage object must outlive this evaluator.
	  const OutputStage& output_stage;
	};

	// Buffer-level evaluation of OutputStageQuantizeDownInt32ByFixedPoint.
	// Each input register goes through SaturatingRoundingDoublingHighMul with
	// result_fixedpoint_multiplier, then RoundingDivideByPOT with result_shift,
	// and finally has result_offset_after_shift added.
	template <int Size>
	struct OutputStageEvalBufferImpl<
	    OutputStageQuantizeDownInt32ByFixedPoint,
	    RegisterBuffer<std::int32_t, Size>> {
	  using InputType = RegisterBuffer<std::int32_t, Size>;
	  using OutputType = RegisterBuffer<std::int32_t, Size>;
	  using OutputStage = OutputStageQuantizeDownInt32ByFixedPoint;

	  OutputStageEvalBufferImpl(const OutputStage& s) : output_stage(s) {}

	  OutputType Eval(InputType input) const {
	    using RegisterType = typename InputType::RegisterType;
	    const RegisterType offset =
	        Dup<RegisterType>(output_stage.result_offset_after_shift);

	    OutputType output;
	    for (int i = 0; i < InputType::kRegisterCount; ++i) {
	      const RegisterType scaled = SaturatingRoundingDoublingHighMul(
	          input.reg[i], output_stage.result_fixedpoint_multiplier);
	      const auto shifted =
	          RoundingDivideByPOT(scaled, output_stage.result_shift);
	      output.reg[i] = Add(shifted, offset);
	    }
	    return output;
	  }

	  // Held by reference: the stage object must outlive this evaluator.
	  const OutputStage& output_stage;
	};

	// Buffer-level evaluation of OutputStageScaleInt32ByFixedPointAndExponent.
	// The signed result_exponent is split at construction into a non-negative
	// left shift (applied before the fixed-point multiplication) and a
	// non-negative right shift (applied afterwards via RoundingDivideByPOT).
	template <int Size>
	struct OutputStageEvalBufferImpl<OutputStageScaleInt32ByFixedPointAndExponent,
	                                 RegisterBuffer<std::int32_t, Size>> {
	  using InputType = RegisterBuffer<std::int32_t, Size>;
	  using OutputType = RegisterBuffer<std::int32_t, Size>;
	  using OutputStage = OutputStageScaleInt32ByFixedPointAndExponent;

	  OutputStageEvalBufferImpl(const OutputStage& s)
	      : output_stage(s),
	        left_shift(std::max(0, s.result_exponent)),
	        right_shift(std::max(0, -s.result_exponent)) {}

	  OutputType Eval(InputType input) const {
	    using RegisterType = typename InputType::RegisterType;
	    const RegisterType offset =
	        Dup<RegisterType>(output_stage.result_offset_after_shift);

	    OutputType output;
	    for (int i = 0; i < InputType::kRegisterCount; ++i) {
	      const auto pre_shifted = ShiftLeft(input.reg[i], left_shift);
	      const RegisterType scaled = SaturatingRoundingDoublingHighMul(
	          pre_shifted, output_stage.result_fixedpoint_multiplier);
	      const auto post_shifted = RoundingDivideByPOT(scaled, right_shift);
	      output.reg[i] = Add(post_shifted, offset);
	    }
	    return output;
	  }

	  // Held by reference: the stage object must outlive this evaluator.
	  const OutputStage& output_stage;
	  // max(0, result_exponent) and max(0, -result_exponent) respectively.
	  int left_shift;
	  int right_shift;
	};

	// Block-level evaluation of OutputStageScaleInt32ByFixedPointAndExponentPC.
	// The multiplier and exponent are vectors, indexed by the block's column
	// (when Shape is VectorShape::Row) or by its row (otherwise). The exponent is
	// split into a non-negative left shift and a non-negative right shift, applied
	// before and after the fixed-point multiplication; result_offset_after_shift
	// is a single value added to every register at the end.
	template <int Rows, int Cols, VectorShape Shape>
	struct OutputStageEvalImpl<
	    OutputStageScaleInt32ByFixedPointAndExponentPC<Shape>,
	    RegisterBlock<std::int32_t, Rows, Cols>> {
	  using InputType = RegisterBlock<std::int32_t, Rows, Cols>;
	  using OutputType = RegisterBlock<std::int32_t, Rows, Cols>;
	  using OutputStage = OutputStageScaleInt32ByFixedPointAndExponentPC<Shape>;

	  OutputStageEvalImpl(const OutputStage& s) : output_stage(s) {}

	  OutputType Eval(InputType input, int row, int col) const {
	    using RegisterType = typename InputType::RegisterType;
	    const int pos = Shape == VectorShape::Row ? col : row;

	    // Both shift amounts start from the same loaded exponents and are then
	    // clamped to be non-negative: max(e, 0) and max(-e, 0).
	    auto left_shift =
	        LoadForBroadcasting<InputType>(output_stage.result_exponent, pos);
	    auto right_shift =
	        LoadForBroadcasting<InputType>(output_stage.result_exponent, pos);
	    for (int i = 0; i < decltype(left_shift)::kRegisterCount; ++i) {
	      left_shift.buf.reg[i] = Max(left_shift.buf.reg[i], 0);
	      right_shift.buf.reg[i] = Max(-right_shift.buf.reg[i], 0);
	    }

	    const auto multiplier = LoadForBroadcasting<InputType>(
	        output_stage.result_fixedpoint_multiplier, pos);
	    const auto pre_shifted = BroadcastShiftLeft(input, left_shift);
	    const auto scaled =
	        BroadcastSaturatingRoundingDoublingHighMul(pre_shifted, multiplier);
	    const auto post_shifted =
	        BroadcastRoundingDivideByPOT(scaled, right_shift);

	    const RegisterType offset =
	        Dup<RegisterType>(output_stage.result_offset_after_shift);
	    OutputType output;
	    for (int i = 0; i < InputType::kRegisterCount; ++i) {
	      output.buf.reg[i] = Add(post_shifted.buf.reg[i], offset);
	    }
	    return output;
	  }

	  // Held by reference: the stage object must outlive this evaluator.
	  const OutputStage& output_stage;
	};

	// Scalar implementation of OutputStageSaturatingCastToUint8: every int32
	// value is clamped to [0, 255] and then stored as uint8.
	template <int Size>
	struct OutputStageEvalBufferImpl<OutputStageSaturatingCastToUint8,
	                                 RegisterBuffer<std::int32_t, Size>> {
	  using InputType = RegisterBuffer<std::int32_t, Size>;
	  using OutputType = RegisterBuffer<std::uint8_t, Size>;
	  using OutputStage = OutputStageSaturatingCastToUint8;

	  static_assert(InputType::kRegisterLanes == 1,
	                "This path is only for scalar values");

	  // The stage carries no parameters, so nothing is stored.
	  OutputStageEvalBufferImpl(const OutputStage&) {}

	  OutputType Eval(InputType input) const {
	    using OutputRegisterType = typename OutputType::RegisterType;
	    OutputType output;
	    for (int i = 0; i < InputType::kRegisterCount; ++i) {
	      std::int32_t clamped = input.reg[i];
	      if (clamped > 255) {
	        clamped = 255;
	      } else if (clamped < 0) {
	        clamped = 0;
	      }
	      output.reg[i] = static_cast<OutputRegisterType>(clamped);
	    }
	    return output;
	  }
	};

	// Scalar implementation of OutputStageSaturatingCastToInt8: every int32
	// value is clamped to [-128, 127] and then stored as int8.
	template <int Size>
	struct OutputStageEvalBufferImpl<OutputStageSaturatingCastToInt8,
	                                 RegisterBuffer<std::int32_t, Size>> {
	  using InputType = RegisterBuffer<std::int32_t, Size>;
	  using OutputType = RegisterBuffer<std::int8_t, Size>;
	  using OutputStage = OutputStageSaturatingCastToInt8;

	  static_assert(InputType::kRegisterLanes == 1,
	                "This path is only for scalar values");

	  // The stage carries no parameters, so nothing is stored.
	  OutputStageEvalBufferImpl(const OutputStage&) {}

	  OutputType Eval(InputType input) const {
	    using OutputRegisterType = typename OutputType::RegisterType;
	    OutputType output;
	    for (int i = 0; i < InputType::kRegisterCount; ++i) {
	      std::int32_t clamped = input.reg[i];
	      if (clamped > 127) {
	        clamped = 127;
	      } else if (clamped < -128) {
	        clamped = -128;
	      }
	      output.reg[i] = static_cast<OutputRegisterType>(clamped);
	    }
	    return output;
	  }
	};

	// Scalar (one lane per register) implementation of
	// OutputStageSaturatingCastToInt16: each int32 entry is clamped to
	// [-32768, 32767] and then stored into the int16 output buffer.
	template <int Size>
	struct OutputStageEvalBufferImpl<OutputStageSaturatingCastToInt16,
	                                 RegisterBuffer<std::int32_t, Size>> {
	  using InputType = RegisterBuffer<std::int32_t, Size>;
	  using OutputType = RegisterBuffer<std::int16_t, Size>;
	  static_assert(InputType::kRegisterLanes == 1,
	                "This path is only for scalar values");

	  using OutputStage = OutputStageSaturatingCastToInt16;

	  // The stage argument is accepted for interface uniformity and ignored.
	  OutputStageEvalBufferImpl(const OutputStage&) {}

	  // Returns a buffer holding the saturated value of every input register.
	  OutputType Eval(InputType input) const {
	    OutputType output;
	    for (int i = 0; i < InputType::kRegisterCount; ++i) {
	      std::int32_t clamped = input.reg[i];
	      if (clamped > 32767) {
	        clamped = 32767;
	      } else if (clamped < -32768) {
	        clamped = -32768;
	      }
	      output.reg[i] = clamped;
	    }
	    return output;
	  }
	};

	// Scalar (one lane per register) implementation of
	// OutputStageTruncatingCastToUint8: each int32 entry is assigned to the
	// uint8 output buffer with no range check.
	template <int Size>
	struct OutputStageEvalBufferImpl<OutputStageTruncatingCastToUint8,
	                                 RegisterBuffer<std::int32_t, Size>> {
	  using InputType = RegisterBuffer<std::int32_t, Size>;
	  using OutputType = RegisterBuffer<std::uint8_t, Size>;
	  static_assert(InputType::kRegisterLanes == 1,
	                "This path is only for scalar values");

	  using OutputStage = OutputStageTruncatingCastToUint8;

	  // The stage argument is accepted for interface uniformity and ignored.
	  OutputStageEvalBufferImpl(const OutputStage&) {}

	  // Copies every input register into the output buffer, relying on the
	  // implicit conversion to the output register type.
	  OutputType Eval(InputType input) const {
	    OutputType output;
	    for (int reg_index = 0; reg_index < InputType::kRegisterCount;
	         ++reg_index) {
	      output.reg[reg_index] = input.reg[reg_index];
	    }
	    return output;
	  }
	};

	// Implementation of OutputStageBiasAddition for register blocks of int32
	// values: the slice of the stage's bias vector that lines up with the block
	// is loaded and broadcast-added to the block.
	template <int Rows, int Cols, typename VectorType>
	struct OutputStageEvalImpl<OutputStageBiasAddition<VectorType>,
	                           RegisterBlock<std::int32_t, Rows, Cols>> {
	  using InputType = RegisterBlock<std::int32_t, Rows, Cols>;
	  using OutputType = RegisterBlock<std::int32_t, Rows, Cols>;
	  using OutputStage = OutputStageBiasAddition<VectorType>;

	  OutputStageEvalImpl(const OutputStage& s) : output_stage(s) {}

	  // `row` and `col` locate the block; only the coordinate that runs along
	  // the bias vector is used.
	  OutputType Eval(InputType input, int row, int col) const {
	    const bool bias_is_row_vector = VectorType::kShape == VectorShape::Row;
	    const int pos = bias_is_row_vector ? col : row;
	    const auto bias =
	        LoadForBroadcasting<InputType>(output_stage.bias_vector, pos);
	    return BroadcastAdd<InputType>(input, bias);
	  }

	  // Held by reference: the stage object must outlive this evaluator.
	  const OutputStage& output_stage;
	};

	// Implementation of OutputStageClamp for int32 register buffers: every
	// register is first raised to at least output_stage.min and then lowered to
	// at most output_stage.max.
	template <int Size>
	struct OutputStageEvalBufferImpl<OutputStageClamp,
	                                 RegisterBuffer<std::int32_t, Size>> {
	  using InputType = RegisterBuffer<std::int32_t, Size>;
	  using OutputType = RegisterBuffer<std::int32_t, Size>;

	  using OutputStage = OutputStageClamp;

	  OutputStageEvalBufferImpl(const OutputStage& s) : output_stage(s) {}

	  OutputType Eval(InputType input) const {
	    using RegisterType = typename InputType::RegisterType;
	    // Bounds replicated into a full register, built once per call.
	    const RegisterType lower_bound = Dup<RegisterType>(output_stage.min);
	    const RegisterType upper_bound = Dup<RegisterType>(output_stage.max);
	    OutputType output;
	    for (int i = 0; i < InputType::kRegisterCount; ++i) {
	      const auto raised = Max(input.reg[i], lower_bound);
	      output.reg[i] = Min(raised, upper_bound);
	    }
	    return output;
	  }

	  // Held by reference: the stage object must outlive this evaluator.
	  const OutputStage& output_stage;
	};

	// Implementation of OutputStageTanh for int32 register buffers.
	// The constructor derives, from the stage's real_zero_as_int32 and
	// real_amplitude_as_int32, the saturation cutoffs and two normalized
	// fixed-point scale factors; Eval then applies them to every register.
	template <int Size>
	struct OutputStageEvalBufferImpl<OutputStageTanh,
	RegisterBuffer<std::int32_t, Size>> {
	typedef RegisterBuffer<std::int32_t, Size> InputType;
	typedef RegisterBuffer<std::int32_t, Size> OutputType;
	using RegisterType = typename InputType::RegisterType;
	typedef RegisterType DataType;
	typedef OutputStageTanh OutputStage;

	OutputStageEvalBufferImpl(const OutputStage& s) : output_stage(s) {
	const std::int32_t real_zero_as_int32 = output_stage.real_zero_as_int32;
	const std::int32_t real_amplitude_as_int32 =
	output_stage.real_amplitude_as_int32;

	// Inputs at or beyond zero -/+ 8 * amplitude are not run through tanh in
	// Eval; they map directly to zero -/+ amplitude.
	input_cutoff_min = real_zero_as_int32 - 8 * real_amplitude_as_int32;
	input_cutoff_max = real_zero_as_int32 + 8 * real_amplitude_as_int32;
	output_min = real_zero_as_int32 - real_amplitude_as_int32;
	output_max = real_zero_as_int32 + real_amplitude_as_int32;

	// Express 1 / amplitude as mantissa * 2^(-inverse_amplitude_neg_exponent):
	// double the value until it reaches at least 0.5, counting the doublings.
	// NOTE(review): this loop never ends if real_amplitude_as_int32 is negative
	// (the quotient stays below 0.5); presumably callers guarantee a positive
	// amplitude - confirm.
	double inverse_amplitude_normalized_double = 1.0 / real_amplitude_as_int32;
	inverse_amplitude_neg_exponent = 0;
	while (inverse_amplitude_normalized_double < 0.5) {
	inverse_amplitude_normalized_double *= 2;
	inverse_amplitude_neg_exponent++;
	}
	inverse_amplitude_normalized = FixedPoint<DataType, 0>::FromDouble(
	inverse_amplitude_normalized_double);

	// Express amplitude as mantissa * 2^amplitude_exponent: halve the value
	// until it drops below 1.0, counting the halvings.
	double amplitude_normalized_double = real_amplitude_as_int32;
	amplitude_exponent = 0;
	while (amplitude_normalized_double >= 1.0) {
	amplitude_normalized_double *= 0.5;
	amplitude_exponent++;
	}
	amplitude_normalized =
	FixedPoint<DataType, 0>::FromDouble(amplitude_normalized_double);
	}

	// Applies the tanh stage to each register of `input` and returns the result.
	OutputType Eval(InputType input) const {
	const std::int32_t real_zero_as_int32 = output_stage.real_zero_as_int32;

	// Fixed-point types with 3 and 0 integer bits respectively.
	typedef FixedPoint<DataType, 3> F3;
	typedef FixedPoint<DataType, 0> F0;

	OutputType output;

	for (int i = 0; i < OutputType::kRegisterCount; i++) {
	// fixed-point affine transformation
	DataType input_centered =
	Sub(input.reg[i], Dup<DataType>(real_zero_as_int32));
	F3 fixedpoint_input =
	F3::FromRaw(input_centered) * inverse_amplitude_normalized;
	// left shift
	// The shift amount shrinks by the number of doublings applied to
	// 1 / amplitude in the constructor.
	fixedpoint_input.raw() = ShiftLeft(fixedpoint_input.raw(),
	28 - inverse_amplitude_neg_exponent);
	// fixed-point tanh and multiplication
	F0 fixedpoint_output = tanh(fixedpoint_input) * amplitude_normalized;
	// right shift
	// The shift amount shrinks by the number of halvings applied to the
	// amplitude in the constructor; the result is re-centered on real zero.
	DataType int32_output =
	Add(Dup<DataType>(real_zero_as_int32),
	ShiftRight(fixedpoint_output.raw(), 31 - amplitude_exponent));

	// Masks flagging inputs at or beyond the cutoffs computed in the
	// constructor.
	DataType mask_if_below_cutoff_min =
	MaskIfLessThanOrEqual(input.reg[i], Dup<DataType>(input_cutoff_min));
	DataType mask_if_above_cutoff_max = MaskIfGreaterThanOrEqual(
	input.reg[i], Dup<DataType>(input_cutoff_max));

	// Presumably SelectUsingMask picks its second argument where the mask is
	// set and its third otherwise, so flagged inputs saturate to
	// output_min / output_max and the rest keep the computed value - confirm
	// against the fixedpoint library.
	output.reg[i] = SelectUsingMask(
	mask_if_below_cutoff_min, Dup<DataType>(output_min),
	SelectUsingMask(mask_if_above_cutoff_max, Dup<DataType>(output_max),
	int32_output));
	}
	return output;
	}

	// Held by reference: the stage object must outlive this evaluator.
	const OutputStage& output_stage;
	// Values precomputed by the constructor and read by Eval.
	std::int32_t input_cutoff_min, input_cutoff_max;
	std::int32_t output_min, output_max;
	FixedPoint<DataType, 0> inverse_amplitude_normalized;
	int inverse_amplitude_neg_exponent;
	FixedPoint<DataType, 0> amplitude_normalized;
	int amplitude_exponent;
	};

	// OutputPipelineOutputType computes, at compile time, the data type that an
	// output pipeline produces for a given input data type. It is a recursive
	// template; see the explanation on OutputPipelineEvalImpl below.
	template <typename OutputPipelineType, int FirstStage, typename InputType,
	          bool StopRecursion =
	              FirstStage == std::tuple_size<OutputPipelineType>::value>
	struct OutputPipelineOutputType {
	  // The stage found at index FirstStage of the pipeline tuple.
	  using FirstStageType =
	      typename std::tuple_element<FirstStage, OutputPipelineType>::type;
	  // What that stage yields when fed InputType.
	  using FirstStageOutputType =
	      typename OutputStageEvalImpl<FirstStageType, InputType>::OutputType;
	  // Result of running the remaining stages on the first stage's output.
	  using Type =
	      typename OutputPipelineOutputType<OutputPipelineType, FirstStage + 1,
	                                        FirstStageOutputType>::Type;
	};

	// Terminating case (StopRecursion == true): with no stage left to apply, the
	// pipeline's output type is whatever type was fed in.
	template <typename OutputPipelineType, int FirstStage, typename InputType>
	struct OutputPipelineOutputType<OutputPipelineType, FirstStage, InputType,
	                                true> {
	  using Type = InputType;
	};

	// OutputPipelineEvalImpl is a helper to implement the evaluation of
	// the whole pipeline. It is a recursive template to implement compile-time
	// unrolling of the loop over all pipeline stages. The 'FirstStage' parameter
	// is how we implement recursion: each specialization implements only
	// evaluation starting at 'FirstStage'. The StopRecursion parameter is just a
	// helper to implement the termination of the recursion as a partial
	// specialization below.
	template <typename OutputPipelineType, int FirstStage, typename InputType,
	bool StopRecursion =
	FirstStage == std::tuple_size<OutputPipelineType>::value>
	struct OutputPipelineEvalImpl {
	typedef typename std::tuple_element<FirstStage, OutputPipelineType>::type
	FirstStageType;
	typedef typename OutputStageEvalImpl<FirstStageType, InputType>::OutputType
	FirstStageOutputType;
	typedef typename OutputPipelineOutputType<OutputPipelineType, FirstStage,
	InputType>::Type OutputType;

	OutputPipelineEvalImpl(const OutputPipelineType& output_pipeline)
	: head_impl(std::get<FirstStage>(output_pipeline)),
	tail_impl(output_pipeline) {}

	OutputType Eval(InputType input, int row, int col) const {
	// Evaluate the first stage.
	FirstStageOutputType first_stage_output = head_impl.Eval(input, row, col);
	// Recurse into the remaining stages.
	return tail_impl.Eval(first_stage_output, row, col);
	}

	const OutputStageEvalImpl<FirstStageType, InputType> head_impl;
	const OutputPipelineEvalImpl<OutputPipelineType, FirstStage + 1,
	FirstStageOutputType>
	tail_impl;
	};

	// Specialization on 'StopRecursion' for terminating the recursion.
	// Selected when StopRecursion is true; it holds no stage evaluator and its
	// Eval is the identity.
	template <typename OutputPipelineType, int FirstStage, typename InputType>
	struct OutputPipelineEvalImpl<OutputPipelineType, FirstStage, InputType, true> {
	// The pipeline argument is accepted for interface uniformity and ignored.
	OutputPipelineEvalImpl(const OutputPipelineType&) {}

	// Returns `input` unchanged; the two int arguments (row and column in the
	// non-terminating case) are unused here.
	InputType Eval(InputType input, int, int) const {
	// Terminating the recursion.
	return input;
	}
	};

	// Primary template of StoreFinalOutputImpl. It only exists to produce a
	// readable compile error: the static_assert condition is false for every
	// RegisterBlockType other than void, so instantiating it means no
	// specialization matched the requested register block / destination pair.
	template <typename RegisterBlockType, typename DstType>
	struct StoreFinalOutputImpl {
	static_assert(std::is_same<RegisterBlockType, void>::value,
	"This generic impl should never be hit");
	};

	// StoreFinalOutputImpl for any RegisterBlock: copies the block entry by
	// entry into the destination, with the block's (0, 0) entry landing at
	// (row, col) of `dst`.
	//
	// The source is read at index r + c * Rows, i.e. block entries are taken
	// column after column with Rows entries per column.
	// dst->data(r, c) is dereferenced, so DstType::data must return a pointer to
	// the addressed destination entry.
	template <typename ScalarType, int Rows, int Cols, typename DstType>
	struct StoreFinalOutputImpl<RegisterBlock<ScalarType, Rows, Cols>, DstType> {
	  using RegisterBlockType = RegisterBlock<ScalarType, Rows, Cols>;
	  static void Run(const RegisterBlockType& src, DstType* dst, int row,
	                  int col) {
	    for (int r = 0; r < Rows; r++) {
	      for (int c = 0; c < Cols; c++) {
	        *dst->data(row + r, col + c) = src.buf.reg[r + c * Rows];
	      }
	    }
	  }
	};

	// StoreFinalOutput takes the final value at the end of the output pipeline and
	// stores it into the destination matrix. It can be specialized for different
	// data types; the generic implementation here is typically used only for plain
	// old scalar (not SIMD) types.
	template <typename RegisterBlockType, typename DstType>
	void StoreFinalOutput(RegisterBlockType src, DstType* dst, int row, int col) {
	StoreFinalOutputImpl<RegisterBlockType, DstType>::Run(src, dst, row, col);
	}

	template <typename OutputPipelineType, typename InputType>
	struct OutputPipelineExecutor {
	OutputPipelineExecutor(const OutputPipelineType& output_pipeline)
	: output_pipeline_eval_impl_(output_pipeline) {}

	// Execute is the entry point into the output pipeline evaluation
	// code. It should be the only thing that unpack code calls. It takes the
	// result
	// of the unpack stage and stores it into the destination matrix.
	template <typename DstType>
	void Execute(InputType input, DstType* dst, int src_global_row,
	int src_global_col, int dst_row, int dst_col) const {
	// Statically assert that the output pipeline matches the given destination
	// matrix's scalar type.
	typedef typename OutputPipelineOutputType<
	OutputPipelineType, 0, InputType>::Type::BufferType::ScalarType

	ScalarOutputType;
	typedef typename DstType::Scalar ScalarDstType;
	static_assert(std::is_same<ScalarOutputType, ScalarDstType>::value,
	"mismatched destination scalar type and output pipeline");

	// Evaluate the output pipeline.
	auto output =
	output_pipeline_eval_impl_.Eval(input, src_global_row, src_global_col);
	// Store the result into the destination matrix.
	StoreFinalOutput(output, dst, dst_row, dst_col);
	}

	const OutputPipelineEvalImpl<OutputPipelineType, 0, InputType>
	output_pipeline_eval_impl_;
	};

	} // namespace gemmlowp

	#ifdef GEMMLOWP_NEON
	#include "output_neon.h"
	#elif defined(GEMMLOWP_SSE4)
	#include "output_sse.h"
	#elif defined(GEMMLOWP_MSA)
	#include "output_msa.h"
	#endif

	#endif // GEMMLOWP_INTERNAL_OUTPUT_H_