| // Copyright 2015 The Gemmlowp Authors. All Rights Reserved. |
| // |
| // Licensed under the Apache License, Version 2.0 (the "License"); |
| // you may not use this file except in compliance with the License. |
| // You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, software |
| // distributed under the License is distributed on an "AS IS" BASIS, |
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| // See the License for the specific language governing permissions and |
| // limitations under the License. |
| |
| // output.h: processing the 32-bit accumulators output by the unpack |
| // stage, obtaining the final result matrix entries and storing them into |
| // the destination matrix. |
| |
| #ifndef GEMMLOWP_INTERNAL_OUTPUT_H_ |
| #define GEMMLOWP_INTERNAL_OUTPUT_H_ |
| |
| #include <cmath> |
| #include <tuple> |
| #include <type_traits> |
| #include <typeinfo> |
| |
| #include "../fixedpoint/fixedpoint.h" |
| #include "../public/output_stages.h" |
| #include "simd_wrappers.h" |
| |
| namespace gemmlowp { |
| |
| template <typename OutputStage, typename InputBufferType> |
| struct OutputStageEvalBufferImpl { |
| // This generic template body should never be hit. |
| static_assert( |
| std::is_same<InputBufferType, void>::value, |
| "Unimplemented: missing implementation of this output pipeline stage " |
| "for this data type. This would happen if some architecture-specific " |
| "SIMD back-end (output_$arch.h) were incomplete."); |
| }; |
| |
| template <typename OutputStage, typename InputType> |
| struct OutputStageEvalImpl { |
| static constexpr int kRows = InputType::kRows; |
| static constexpr int kCols = InputType::kCols; |
| using InputBufferType = typename InputType::BufferType; |
| using BufferEvalImplType = |
| OutputStageEvalBufferImpl<OutputStage, InputBufferType>; |
| using OutputBufferType = typename BufferEvalImplType::OutputType; |
| using OutputScalarType = typename OutputBufferType::ScalarType; |
| using OutputType = RegisterBlock<OutputScalarType, kRows, kCols>; |
| |
| OutputStageEvalImpl(const OutputStage& s) : buffer_eval_impl(s) {} |
| |
| OutputType Eval(InputType input, int, int) const { |
| OutputType output; |
| output.buf = buffer_eval_impl.Eval(input.buf); |
| return output; |
| } |
| |
| const BufferEvalImplType buffer_eval_impl; |
| }; |
| |
| template <int Size> |
| struct OutputStageEvalBufferImpl<OutputStageQuantizeDownInt32ToUint8Scale, |
| RegisterBuffer<std::int32_t, Size>> { |
| using InputType = RegisterBuffer<std::int32_t, Size>; |
| using OutputType = RegisterBuffer<std::int32_t, Size>; |
| |
| typedef OutputStageQuantizeDownInt32ToUint8Scale OutputStage; |
| |
| OutputStageEvalBufferImpl(const OutputStage& s) : output_stage(s) {} |
| |
| OutputType Eval(InputType input) const { |
| const int result_shift = output_stage.result_shift; |
| const std::int32_t result_mult_int = output_stage.result_mult_int; |
| using RegisterType = typename InputType::RegisterType; |
| const RegisterType result_offset = |
| Dup<RegisterType>(output_stage.result_offset); |
| OutputType output; |
| for (int i = 0; i < InputType::kRegisterCount; i++) { |
| output.reg[i] = RoundingDivideByPOT( |
| Mul(Add(input.reg[i], result_offset), result_mult_int), result_shift); |
| } |
| return output; |
| } |
| |
| const OutputStage& output_stage; |
| }; |
| |
| template <int Rows, int Cols, VectorShape Shape> |
| struct OutputStageEvalImpl<OutputStageQuantizeDownInt32ToUint8ScalePC<Shape>, |
| RegisterBlock<std::int32_t, Rows, Cols>> { |
| typedef RegisterBlock<std::int32_t, Rows, Cols> InputType; |
| typedef RegisterBlock<std::int32_t, Rows, Cols> OutputType; |
| typedef OutputStageQuantizeDownInt32ToUint8ScalePC<Shape> OutputStage; |
| |
| OutputStageEvalImpl(const OutputStage& s) : output_stage(s) {} |
| |
| OutputType Eval(InputType input, int row, int col) const { |
| OutputType output; |
| const int result_shift = output_stage.result_shift; |
| const int pos = Shape == VectorShape::Col ? row : col; |
| const auto result_mult_int = |
| LoadForBroadcasting<InputType>(output_stage.result_mult_int, pos); |
| const auto result_offset = |
| LoadForBroadcasting<InputType>(output_stage.result_offset, pos); |
| const auto dividend = BroadcastMul<InputType>( |
| BroadcastAdd<InputType>(input, result_offset), result_mult_int); |
| for (int i = 0; i < InputType::kRegisterCount; i++) { |
| output.buf.reg[i] = |
| RoundingDivideByPOT(dividend.buf.reg[i], result_shift); |
| } |
| return output; |
| } |
| |
| const OutputStage& output_stage; |
| }; |
| |
| template <int Size> |
| struct OutputStageEvalBufferImpl< |
| OutputStageQuantizeDownInt32ByFixedPoint, |
| RegisterBuffer<std::int32_t, Size>> { |
| typedef RegisterBuffer<std::int32_t, Size> InputType; |
| typedef RegisterBuffer<std::int32_t, Size> OutputType; |
| |
| typedef OutputStageQuantizeDownInt32ByFixedPoint OutputStage; |
| |
| OutputStageEvalBufferImpl(const OutputStage& s) : output_stage(s) {} |
| |
| OutputType Eval(InputType input) const { |
| OutputType output; |
| using RegisterType = typename InputType::RegisterType; |
| const RegisterType result_offset_after_shift = |
| Dup<RegisterType>(output_stage.result_offset_after_shift); |
| for (int i = 0; i < InputType::kRegisterCount; i++) { |
| const RegisterType mulhigh_val = SaturatingRoundingDoublingHighMul( |
| input.reg[i], output_stage.result_fixedpoint_multiplier); |
| output.reg[i] = |
| Add(RoundingDivideByPOT(mulhigh_val, output_stage.result_shift), |
| result_offset_after_shift); |
| } |
| return output; |
| } |
| |
| const OutputStage& output_stage; |
| }; |
| |
| template <int Size> |
| struct OutputStageEvalBufferImpl<OutputStageScaleInt32ByFixedPointAndExponent, |
| RegisterBuffer<std::int32_t, Size>> { |
| typedef RegisterBuffer<std::int32_t, Size> InputType; |
| typedef RegisterBuffer<std::int32_t, Size> OutputType; |
| |
| typedef OutputStageScaleInt32ByFixedPointAndExponent OutputStage; |
| |
| OutputStageEvalBufferImpl(const OutputStage& s) : output_stage(s) { |
| left_shift = std::max(0, output_stage.result_exponent); |
| right_shift = std::max(0, -output_stage.result_exponent); |
| } |
| |
| OutputType Eval(InputType input) const { |
| OutputType output; |
| using RegisterType = typename InputType::RegisterType; |
| const RegisterType result_offset_after_shift = |
| Dup<RegisterType>(output_stage.result_offset_after_shift); |
| for (int i = 0; i < InputType::kRegisterCount; i++) { |
| const RegisterType mulhigh_val = SaturatingRoundingDoublingHighMul( |
| ShiftLeft(input.reg[i], left_shift), |
| output_stage.result_fixedpoint_multiplier); |
| output.reg[i] = Add(RoundingDivideByPOT(mulhigh_val, right_shift), |
| result_offset_after_shift); |
| } |
| return output; |
| } |
| |
| const OutputStage& output_stage; |
| int left_shift; |
| int right_shift; |
| }; |
| |
| template <int Rows, int Cols, VectorShape Shape> |
| struct OutputStageEvalImpl< |
| OutputStageScaleInt32ByFixedPointAndExponentPC<Shape>, |
| RegisterBlock<std::int32_t, Rows, Cols>> { |
| typedef RegisterBlock<std::int32_t, Rows, Cols> InputType; |
| typedef RegisterBlock<std::int32_t, Rows, Cols> OutputType; |
| |
| typedef OutputStageScaleInt32ByFixedPointAndExponentPC<Shape> OutputStage; |
| |
| OutputStageEvalImpl(const OutputStage& s) : output_stage(s) {} |
| |
| OutputType Eval(InputType input, int row, int col) const { |
| OutputType output; |
| const int pos = Shape == VectorShape::Row ? col : row; |
| using RegisterType = typename InputType::RegisterType; |
| const RegisterType result_offset_after_shift = |
| Dup<RegisterType>(output_stage.result_offset_after_shift); |
| auto left_shift = |
| LoadForBroadcasting<InputType>(output_stage.result_exponent, pos); |
| auto right_shift = |
| LoadForBroadcasting<InputType>(output_stage.result_exponent, pos); |
| const auto result_fixedpoint_multiplier = LoadForBroadcasting<InputType>( |
| output_stage.result_fixedpoint_multiplier, pos); |
| for (int i = 0; i < decltype(left_shift)::kRegisterCount; i++) { |
| left_shift.buf.reg[i] = Max(left_shift.buf.reg[i], 0); |
| right_shift.buf.reg[i] = Max(-right_shift.buf.reg[i], 0); |
| } |
| const auto mulhigh_val = BroadcastSaturatingRoundingDoublingHighMul( |
| BroadcastShiftLeft(input, left_shift), result_fixedpoint_multiplier); |
| const auto rdpot_val = |
| BroadcastRoundingDivideByPOT(mulhigh_val, right_shift); |
| for (int i = 0; i < InputType::kRegisterCount; i++) { |
| output.buf.reg[i] = Add(rdpot_val.buf.reg[i], result_offset_after_shift); |
| } |
| return output; |
| } |
| |
| const OutputStage& output_stage; |
| }; |
| |
| // Implementation of OutputStageSaturatingCastToUint8 for scalar data. |
| template <int Size> |
| struct OutputStageEvalBufferImpl<OutputStageSaturatingCastToUint8, |
| RegisterBuffer<std::int32_t, Size>> { |
| typedef RegisterBuffer<std::int32_t, Size> InputType; |
| typedef RegisterBuffer<std::uint8_t, Size> OutputType; |
| static_assert(InputType::kRegisterLanes == 1, |
| "This path is only for scalar values"); |
| |
| typedef OutputStageSaturatingCastToUint8 OutputStage; |
| |
| OutputStageEvalBufferImpl(const OutputStage&) {} |
| |
| OutputType Eval(InputType input) const { |
| OutputType output; |
| for (int i = 0; i < InputType::kRegisterCount; i++) { |
| std::int32_t data = input.reg[i]; |
| output.reg[i] = data > 255 ? 255 : data < 0 ? 0 : data; |
| } |
| return output; |
| } |
| }; |
| |
| // Implementation of OutputStageSaturatingCastToInt8 for scalar data. |
| template <int Size> |
| struct OutputStageEvalBufferImpl<OutputStageSaturatingCastToInt8, |
| RegisterBuffer<std::int32_t, Size>> { |
| typedef RegisterBuffer<std::int32_t, Size> InputType; |
| typedef RegisterBuffer<std::int8_t, Size> OutputType; |
| static_assert(InputType::kRegisterLanes == 1, |
| "This path is only for scalar values"); |
| |
| typedef OutputStageSaturatingCastToInt8 OutputStage; |
| |
| OutputStageEvalBufferImpl(const OutputStage&) {} |
| |
| OutputType Eval(InputType input) const { |
| OutputType output; |
| for (int i = 0; i < InputType::kRegisterCount; i++) { |
| std::int32_t data = input.reg[i]; |
| output.reg[i] = data > 127 ? 127 : data < -128 ? -128 : data; |
| } |
| return output; |
| } |
| }; |
| |
| // Implementation of OutputStageSaturatingCastToInt16 for scalar data. |
| template <int Size> |
| struct OutputStageEvalBufferImpl<OutputStageSaturatingCastToInt16, |
| RegisterBuffer<std::int32_t, Size>> { |
| typedef RegisterBuffer<std::int32_t, Size> InputType; |
| typedef RegisterBuffer<std::int16_t, Size> OutputType; |
| static_assert(InputType::kRegisterLanes == 1, |
| "This path is only for scalar values"); |
| |
| typedef OutputStageSaturatingCastToInt16 OutputStage; |
| |
| OutputStageEvalBufferImpl(const OutputStage&) {} |
| |
| OutputType Eval(InputType input) const { |
| OutputType output; |
| for (int i = 0; i < InputType::kRegisterCount; i++) { |
| std::int32_t data = input.reg[i]; |
| output.reg[i] = data > 32767 ? 32767 : data < -32768 ? -32768 : data; |
| } |
| return output; |
| } |
| }; |
| |
| // Implementation of OutputStageTruncatingCastToUint8 for scalar data |
| template <int Size> |
| struct OutputStageEvalBufferImpl<OutputStageTruncatingCastToUint8, |
| RegisterBuffer<std::int32_t, Size>> { |
| typedef RegisterBuffer<std::int32_t, Size> InputType; |
| typedef RegisterBuffer<std::uint8_t, Size> OutputType; |
| static_assert(InputType::kRegisterLanes == 1, |
| "This path is only for scalar values"); |
| |
| typedef OutputStageTruncatingCastToUint8 OutputStage; |
| |
| OutputStageEvalBufferImpl(const OutputStage&) {} |
| |
| OutputType Eval(InputType input) const { |
| OutputType output; |
| for (int i = 0; i < InputType::kRegisterCount; i++) { |
| output.reg[i] = input.reg[i]; |
| } |
| return output; |
| } |
| }; |
| |
| template <int Rows, int Cols, typename VectorType> |
| struct OutputStageEvalImpl<OutputStageBiasAddition<VectorType>, |
| RegisterBlock<std::int32_t, Rows, Cols>> { |
| typedef RegisterBlock<std::int32_t, Rows, Cols> InputType; |
| typedef RegisterBlock<std::int32_t, Rows, Cols> OutputType; |
| typedef OutputStageBiasAddition<VectorType> OutputStage; |
| |
| OutputStageEvalImpl(const OutputStage& s) : output_stage(s) {} |
| |
| OutputType Eval(InputType input, int row, int col) const { |
| const int pos = VectorType::kShape == VectorShape::Row ? col : row; |
| return BroadcastAdd<InputType>( |
| input, LoadForBroadcasting<InputType>(output_stage.bias_vector, pos)); |
| } |
| |
| const OutputStage& output_stage; |
| }; |
| |
| template <int Size> |
| struct OutputStageEvalBufferImpl<OutputStageClamp, |
| RegisterBuffer<std::int32_t, Size>> { |
| typedef RegisterBuffer<std::int32_t, Size> InputType; |
| typedef RegisterBuffer<std::int32_t, Size> OutputType; |
| |
| typedef OutputStageClamp OutputStage; |
| |
| OutputStageEvalBufferImpl(const OutputStage& s) : output_stage(s) {} |
| |
| OutputType Eval(InputType input) const { |
| using RegisterType = typename InputType::RegisterType; |
| const RegisterType min = Dup<RegisterType>(output_stage.min); |
| const RegisterType max = Dup<RegisterType>(output_stage.max); |
| OutputType output; |
| for (int i = 0; i < InputType::kRegisterCount; i++) { |
| output.reg[i] = Min(Max(input.reg[i], min), max); |
| } |
| return output; |
| } |
| |
| const OutputStage& output_stage; |
| }; |
| |
| template <int Size> |
| struct OutputStageEvalBufferImpl<OutputStageTanh, |
| RegisterBuffer<std::int32_t, Size>> { |
| typedef RegisterBuffer<std::int32_t, Size> InputType; |
| typedef RegisterBuffer<std::int32_t, Size> OutputType; |
| using RegisterType = typename InputType::RegisterType; |
| typedef RegisterType DataType; |
| typedef OutputStageTanh OutputStage; |
| |
| OutputStageEvalBufferImpl(const OutputStage& s) : output_stage(s) { |
| const std::int32_t real_zero_as_int32 = output_stage.real_zero_as_int32; |
| const std::int32_t real_amplitude_as_int32 = |
| output_stage.real_amplitude_as_int32; |
| |
| input_cutoff_min = real_zero_as_int32 - 8 * real_amplitude_as_int32; |
| input_cutoff_max = real_zero_as_int32 + 8 * real_amplitude_as_int32; |
| output_min = real_zero_as_int32 - real_amplitude_as_int32; |
| output_max = real_zero_as_int32 + real_amplitude_as_int32; |
| |
| double inverse_amplitude_normalized_double = 1.0 / real_amplitude_as_int32; |
| inverse_amplitude_neg_exponent = 0; |
| while (inverse_amplitude_normalized_double < 0.5) { |
| inverse_amplitude_normalized_double *= 2; |
| inverse_amplitude_neg_exponent++; |
| } |
| inverse_amplitude_normalized = FixedPoint<DataType, 0>::FromDouble( |
| inverse_amplitude_normalized_double); |
| |
| double amplitude_normalized_double = real_amplitude_as_int32; |
| amplitude_exponent = 0; |
| while (amplitude_normalized_double >= 1.0) { |
| amplitude_normalized_double *= 0.5; |
| amplitude_exponent++; |
| } |
| amplitude_normalized = |
| FixedPoint<DataType, 0>::FromDouble(amplitude_normalized_double); |
| } |
| |
| OutputType Eval(InputType input) const { |
| const std::int32_t real_zero_as_int32 = output_stage.real_zero_as_int32; |
| |
| typedef FixedPoint<DataType, 3> F3; |
| typedef FixedPoint<DataType, 0> F0; |
| |
| OutputType output; |
| |
| for (int i = 0; i < OutputType::kRegisterCount; i++) { |
| // fixed-point affine transformation |
| DataType input_centered = |
| Sub(input.reg[i], Dup<DataType>(real_zero_as_int32)); |
| F3 fixedpoint_input = |
| F3::FromRaw(input_centered) * inverse_amplitude_normalized; |
| // left shift |
| fixedpoint_input.raw() = ShiftLeft(fixedpoint_input.raw(), |
| 28 - inverse_amplitude_neg_exponent); |
| // fixed-point tanh and multiplication |
| F0 fixedpoint_output = tanh(fixedpoint_input) * amplitude_normalized; |
| // right shift |
| DataType int32_output = |
| Add(Dup<DataType>(real_zero_as_int32), |
| ShiftRight(fixedpoint_output.raw(), 31 - amplitude_exponent)); |
| |
| DataType mask_if_below_cutoff_min = |
| MaskIfLessThanOrEqual(input.reg[i], Dup<DataType>(input_cutoff_min)); |
| DataType mask_if_above_cutoff_max = MaskIfGreaterThanOrEqual( |
| input.reg[i], Dup<DataType>(input_cutoff_max)); |
| |
| output.reg[i] = SelectUsingMask( |
| mask_if_below_cutoff_min, Dup<DataType>(output_min), |
| SelectUsingMask(mask_if_above_cutoff_max, Dup<DataType>(output_max), |
| int32_output)); |
| } |
| return output; |
| } |
| |
| const OutputStage& output_stage; |
| std::int32_t input_cutoff_min, input_cutoff_max; |
| std::int32_t output_min, output_max; |
| FixedPoint<DataType, 0> inverse_amplitude_normalized; |
| int inverse_amplitude_neg_exponent; |
| FixedPoint<DataType, 0> amplitude_normalized; |
| int amplitude_exponent; |
| }; |
| |
| // OutputPipelineOutputType is a helper to determine the output data type of a |
| // pipeline, for a |
| // given input data type. It is a recursive template; see the explanation on |
| // OutputPipelineEvalImpl below. |
| template <typename OutputPipelineType, int FirstStage, typename InputType, |
| bool StopRecursion = |
| FirstStage == std::tuple_size<OutputPipelineType>::value> |
| struct OutputPipelineOutputType { |
| typedef typename std::tuple_element<FirstStage, OutputPipelineType>::type |
| FirstStageType; |
| typedef typename OutputStageEvalImpl<FirstStageType, InputType>::OutputType |
| FirstStageOutputType; |
| typedef typename OutputPipelineOutputType<OutputPipelineType, FirstStage + 1, |
| FirstStageOutputType>::Type Type; |
| }; |
| |
| template <typename OutputPipelineType, int FirstStage, typename InputType> |
| struct OutputPipelineOutputType<OutputPipelineType, FirstStage, InputType, |
| true> { |
| typedef InputType Type; |
| }; |
| |
| // OutputPipelineEvalImpl is a helper to implement the evaluation of |
| // the whole pipeline. It is a recursive template to implement compile-time |
| // unrolling of the loop over all pipeline stages. The 'FirstStage' parameter |
| // is how we implement recursion: each specialization implements only |
| // evaluation starting at 'FirstStage'. The StopRecursion parameter is just a |
| // helper to implement the termination of the recursion as a partial |
| // specialization below. |
| template <typename OutputPipelineType, int FirstStage, typename InputType, |
| bool StopRecursion = |
| FirstStage == std::tuple_size<OutputPipelineType>::value> |
| struct OutputPipelineEvalImpl { |
| typedef typename std::tuple_element<FirstStage, OutputPipelineType>::type |
| FirstStageType; |
| typedef typename OutputStageEvalImpl<FirstStageType, InputType>::OutputType |
| FirstStageOutputType; |
| typedef typename OutputPipelineOutputType<OutputPipelineType, FirstStage, |
| InputType>::Type OutputType; |
| |
| OutputPipelineEvalImpl(const OutputPipelineType& output_pipeline) |
| : head_impl(std::get<FirstStage>(output_pipeline)), |
| tail_impl(output_pipeline) {} |
| |
| OutputType Eval(InputType input, int row, int col) const { |
| // Evaluate the first stage. |
| FirstStageOutputType first_stage_output = head_impl.Eval(input, row, col); |
| // Recurse into the remaining stages. |
| return tail_impl.Eval(first_stage_output, row, col); |
| } |
| |
| const OutputStageEvalImpl<FirstStageType, InputType> head_impl; |
| const OutputPipelineEvalImpl<OutputPipelineType, FirstStage + 1, |
| FirstStageOutputType> |
| tail_impl; |
| }; |
| |
| // Specialization on 'StopRecursion' for terminating the recursion. |
| template <typename OutputPipelineType, int FirstStage, typename InputType> |
| struct OutputPipelineEvalImpl<OutputPipelineType, FirstStage, InputType, true> { |
| OutputPipelineEvalImpl(const OutputPipelineType&) {} |
| |
| InputType Eval(InputType input, int, int) const { |
| // Terminating the recursion. |
| return input; |
| } |
| }; |
| |
| template <typename RegisterBlockType, typename DstType> |
| struct StoreFinalOutputImpl { |
| static_assert(std::is_same<RegisterBlockType, void>::value, |
| "This generic impl should never be hit"); |
| }; |
| |
| template <typename ScalarType, int Rows, int Cols, typename DstType> |
| struct StoreFinalOutputImpl<RegisterBlock<ScalarType, Rows, Cols>, DstType> { |
| using RegisterBlockType = RegisterBlock<ScalarType, Rows, Cols>; |
| static void Run(const RegisterBlockType& src, DstType* dst, int row, |
| int col) { |
| for (int r = 0; r < Rows; r++) { |
| for (int c = 0; c < Cols; c++) { |
| *dst->data(row + r, col + c) = src.buf.reg[r + c * Rows]; |
| } |
| } |
| } |
| }; |
| |
| // StoreFinalOutput takes the final value at the end of the output pipeline and |
| // stores it into the destination matrix. It can be specialized for different |
| // data types; the generic implementation here is typically used only for plain |
| // old scalar (not SIMD) types. |
| template <typename RegisterBlockType, typename DstType> |
| void StoreFinalOutput(RegisterBlockType src, DstType* dst, int row, int col) { |
| StoreFinalOutputImpl<RegisterBlockType, DstType>::Run(src, dst, row, col); |
| } |
| |
| template <typename OutputPipelineType, typename InputType> |
| struct OutputPipelineExecutor { |
| OutputPipelineExecutor(const OutputPipelineType& output_pipeline) |
| : output_pipeline_eval_impl_(output_pipeline) {} |
| |
| // Execute is the entry point into the output pipeline evaluation |
| // code. It should be the only thing that unpack code calls. It takes the |
| // result |
| // of the unpack stage and stores it into the destination matrix. |
| template <typename DstType> |
| void Execute(InputType input, DstType* dst, int src_global_row, |
| int src_global_col, int dst_row, int dst_col) const { |
| // Statically assert that the output pipeline matches the given destination |
| // matrix's scalar type. |
| typedef typename OutputPipelineOutputType< |
| OutputPipelineType, 0, InputType>::Type::BufferType::ScalarType |
| |
| ScalarOutputType; |
| typedef typename DstType::Scalar ScalarDstType; |
| static_assert(std::is_same<ScalarOutputType, ScalarDstType>::value, |
| "mismatched destination scalar type and output pipeline"); |
| |
| // Evaluate the output pipeline. |
| auto output = |
| output_pipeline_eval_impl_.Eval(input, src_global_row, src_global_col); |
| // Store the result into the destination matrix. |
| StoreFinalOutput(output, dst, dst_row, dst_col); |
| } |
| |
| const OutputPipelineEvalImpl<OutputPipelineType, 0, InputType> |
| output_pipeline_eval_impl_; |
| }; |
| |
| } // namespace gemmlowp |
| |
| #ifdef GEMMLOWP_NEON |
| #include "output_neon.h" |
| #elif defined(GEMMLOWP_SSE4) |
| #include "output_sse.h" |
| #elif defined(GEMMLOWP_MSA) |
| #include "output_msa.h" |
| #endif |
| |
| #endif // GEMMLOWP_INTERNAL_OUTPUT_H_ |