| // Copyright 2015 Google Inc. All Rights Reserved. |
| // |
| // Licensed under the Apache License, Version 2.0 (the "License"); |
| // you may not use this file except in compliance with the License. |
| // You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, software |
| // distributed under the License is distributed on an "AS IS" BASIS, |
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| // See the License for the specific language governing permissions and |
| // limitations under the License. |
| |
| // output_sse.h: optimized SSE4.2 specializations of the templates in output.h. |
| |
| #ifndef GEMMLOWP_INTERNAL_OUTPUT_SSE_H_ |
| #define GEMMLOWP_INTERNAL_OUTPUT_SSE_H_ |
| |
| #include "output.h" |
| |
| #include <smmintrin.h> |
| |
| namespace gemmlowp { |
| |
| template <> |
| struct OutputStageEvalBufferImpl<OutputStageSaturatingCastToUint8, |
| RegBufferInt32<4>> { |
| typedef RegBufferInt32<4> InputType; |
| typedef RegBufferUint8<4> OutputType; |
| |
| typedef OutputStageSaturatingCastToUint8 OutputStage; |
| |
| OutputStageEvalBufferImpl(const OutputStage&) {} |
| |
| OutputType Eval(InputType input) const { |
| OutputType output; |
| __m128i res_16 = _mm_packs_epi32(input.reg[0], input.reg[0]); |
| __m128i res_8 = _mm_packus_epi16(res_16, res_16); |
| output.reg[0] = _mm_cvtsi128_si32(res_8); |
| return output; |
| } |
| }; |
| |
| template <> |
| struct OutputStageEvalBufferImpl<OutputStageSaturatingCastToUint8, |
| RegBufferInt32<8>> { |
| typedef RegBufferInt32<8> InputType; |
| typedef RegBufferUint8<8> OutputType; |
| |
| typedef OutputStageSaturatingCastToUint8 OutputStage; |
| |
| OutputStageEvalBufferImpl(const OutputStage&) {} |
| |
| OutputType Eval(InputType input) const { |
| OutputType output; |
| __m128i res_16 = _mm_packs_epi32(input.reg[0], input.reg[1]); |
| __m128i res_8 = _mm_packus_epi16(res_16, res_16); |
| output.reg[0] = _mm_extract_epi32(res_8, 0); |
| output.reg[1] = _mm_extract_epi32(res_8, 1); |
| return output; |
| } |
| }; |
| |
| template <> |
| struct OutputStageEvalBufferImpl<OutputStageSaturatingCastToUint8, |
| RegBufferInt32<16>> { |
| typedef RegBufferInt32<16> InputType; |
| typedef RegBufferUint8<16> OutputType; |
| |
| typedef OutputStageSaturatingCastToUint8 OutputStage; |
| |
| OutputStageEvalBufferImpl(const OutputStage&) {} |
| |
| OutputType Eval(InputType input) const { |
| OutputType output; |
| __m128i res_16_0 = _mm_packs_epi32(input.reg[0], input.reg[1]); |
| __m128i res_16_1 = _mm_packs_epi32(input.reg[2], input.reg[3]); |
| output.reg[0] = _mm_packus_epi16(res_16_0, res_16_1); |
| return output; |
| } |
| }; |
| |
| template <> |
| struct OutputStageEvalBufferImpl<OutputStageSaturatingCastToUint8, |
| RegBufferInt32<32>> { |
| typedef RegBufferInt32<32> InputType; |
| typedef RegBufferUint8<32> OutputType; |
| |
| typedef OutputStageSaturatingCastToUint8 OutputStage; |
| |
| OutputStageEvalBufferImpl(const OutputStage&) {} |
| |
| OutputType Eval(InputType input) const { |
| OutputType output; |
| __m128i res_16_0 = _mm_packs_epi32(input.reg[0], input.reg[1]); |
| __m128i res_16_1 = _mm_packs_epi32(input.reg[2], input.reg[3]); |
| output.reg[0] = _mm_packus_epi16(res_16_0, res_16_1); |
| __m128i res_16_2 = _mm_packs_epi32(input.reg[4], input.reg[5]); |
| __m128i res_16_3 = _mm_packs_epi32(input.reg[6], input.reg[7]); |
| output.reg[1] = _mm_packus_epi16(res_16_2, res_16_3); |
| return output; |
| } |
| }; |
| |
| template <> |
| struct OutputStageEvalBufferImpl<OutputStageSaturatingCastToInt16, |
| RegBufferInt32<4>> { |
| typedef RegBufferInt32<4> InputType; |
| typedef RegBufferInt16<4> OutputType; |
| |
| typedef OutputStageSaturatingCastToInt16 OutputStage; |
| |
| OutputStageEvalBufferImpl(const OutputStage&) {} |
| |
| OutputType Eval(InputType input) const { |
| OutputType output; |
| __m128i res_16 = _mm_packs_epi32(input.reg[0], input.reg[0]); |
| output.reg[0] = _mm_extract_epi16(res_16, 0); |
| output.reg[1] = _mm_extract_epi16(res_16, 1); |
| output.reg[2] = _mm_extract_epi16(res_16, 2); |
| output.reg[3] = _mm_extract_epi16(res_16, 3); |
| return output; |
| } |
| }; |
| |
| template <> |
| struct OutputStageEvalBufferImpl<OutputStageSaturatingCastToInt16, |
| RegBufferInt32<8>> { |
| typedef RegBufferInt32<8> InputType; |
| typedef RegBufferInt16<8> OutputType; |
| |
| typedef OutputStageSaturatingCastToInt16 OutputStage; |
| |
| OutputStageEvalBufferImpl(const OutputStage&) {} |
| |
| OutputType Eval(InputType input) const { |
| OutputType output; |
| output.reg[0] = _mm_packs_epi32(input.reg[0], input.reg[1]); |
| return output; |
| } |
| }; |
| |
| template <> |
| struct OutputStageEvalBufferImpl<OutputStageSaturatingCastToInt16, |
| RegBufferInt32<16>> { |
| typedef RegBufferInt32<16> InputType; |
| typedef RegBufferInt16<16> OutputType; |
| |
| typedef OutputStageSaturatingCastToInt16 OutputStage; |
| |
| OutputStageEvalBufferImpl(const OutputStage&) {} |
| |
| OutputType Eval(InputType input) const { |
| OutputType output; |
| output.reg[0] = _mm_packs_epi32(input.reg[0], input.reg[1]); |
| output.reg[1] = _mm_packs_epi32(input.reg[2], input.reg[3]); |
| return output; |
| } |
| }; |
| |
| template <> |
| struct OutputStageEvalBufferImpl<OutputStageSaturatingCastToInt16, |
| RegBufferInt32<32>> { |
| typedef RegBufferInt32<32> InputType; |
| typedef RegBufferInt16<32> OutputType; |
| |
| typedef OutputStageSaturatingCastToInt16 OutputStage; |
| |
| OutputStageEvalBufferImpl(const OutputStage&) {} |
| |
| OutputType Eval(InputType input) const { |
| OutputType output; |
| output.reg[0] = _mm_packs_epi32(input.reg[0], input.reg[1]); |
| output.reg[1] = _mm_packs_epi32(input.reg[2], input.reg[3]); |
| output.reg[2] = _mm_packs_epi32(input.reg[4], input.reg[5]); |
| output.reg[3] = _mm_packs_epi32(input.reg[6], input.reg[7]); |
| return output; |
| } |
| }; |
| |
| template <typename DstType> |
| struct StoreFinalOutputImpl<RegBlockInt32<4, 1>, DstType> { |
| static void Run(const RegBlockInt32<4, 1>& src, DstType* dst, int row, |
| int col) { |
| if (DstType::kOrder == MapOrder::ColMajor) { |
| StoreInt32x4(dst->data(row, col), src.buf.reg[0]); |
| } else { |
| *dst->data(row + 0, col) = GetLane<0>(src.buf.reg[0]); |
| *dst->data(row + 1, col) = GetLane<1>(src.buf.reg[0]); |
| *dst->data(row + 2, col) = GetLane<2>(src.buf.reg[0]); |
| *dst->data(row + 3, col) = GetLane<3>(src.buf.reg[0]); |
| } |
| } |
| }; |
| |
| template <typename DstType> |
| struct StoreFinalOutputImpl<RegBlockInt32<8, 1>, DstType> { |
| static void Run(const RegBlockInt32<8, 1>& src, DstType* dst, int row, |
| int col) { |
| if (DstType::kOrder == MapOrder::ColMajor) { |
| StoreInt32x4(dst->data(row, col), src.buf.reg[0]); |
| StoreInt32x4(dst->data(row + 4, col), src.buf.reg[1]); |
| } else { |
| *dst->data(row + 0, col) = GetLane<0>(src.buf.reg[0]); |
| *dst->data(row + 1, col) = GetLane<1>(src.buf.reg[0]); |
| *dst->data(row + 2, col) = GetLane<2>(src.buf.reg[0]); |
| *dst->data(row + 3, col) = GetLane<3>(src.buf.reg[0]); |
| *dst->data(row + 4, col) = GetLane<0>(src.buf.reg[1]); |
| *dst->data(row + 5, col) = GetLane<1>(src.buf.reg[1]); |
| *dst->data(row + 6, col) = GetLane<2>(src.buf.reg[1]); |
| *dst->data(row + 7, col) = GetLane<3>(src.buf.reg[1]); |
| } |
| } |
| }; |
| |
| template <typename DstType> |
| struct StoreFinalOutputImpl<RegBlockInt16<4, 1>, DstType> { |
| static void Run(const RegBlockInt16<4, 1>& src, DstType* dst, int row, |
| int col) { |
| *dst->data(row + 0, col) = src.buf.reg[0]; |
| *dst->data(row + 1, col) = src.buf.reg[1]; |
| *dst->data(row + 2, col) = src.buf.reg[2]; |
| *dst->data(row + 3, col) = src.buf.reg[3]; |
| } |
| }; |
| |
| template <typename DstType> |
| struct StoreFinalOutputImpl<RegBlockInt16<8, 1>, DstType> { |
| static void Run(const RegBlockInt16<8, 1>& src, DstType* dst, int row, |
| int col) { |
| if (DstType::kOrder == MapOrder::ColMajor) { |
| StoreInt16x8(dst->data(row, col), src.buf.reg[0]); |
| } else { |
| *dst->data(row + 0, col) = _mm_extract_epi16(src.buf.reg[0], 0); |
| *dst->data(row + 1, col) = _mm_extract_epi16(src.buf.reg[0], 1); |
| *dst->data(row + 2, col) = _mm_extract_epi16(src.buf.reg[0], 2); |
| *dst->data(row + 3, col) = _mm_extract_epi16(src.buf.reg[0], 3); |
| *dst->data(row + 4, col) = _mm_extract_epi16(src.buf.reg[0], 4); |
| *dst->data(row + 5, col) = _mm_extract_epi16(src.buf.reg[0], 5); |
| *dst->data(row + 6, col) = _mm_extract_epi16(src.buf.reg[0], 6); |
| *dst->data(row + 7, col) = _mm_extract_epi16(src.buf.reg[0], 7); |
| } |
| } |
| }; |
| |
| inline RegBlockInt32<4, 4> Transpose(const RegBlockInt32<4, 4>& src) { |
| __m128i t0 = _mm_unpacklo_epi32(src.buf.reg[0], src.buf.reg[1]); |
| __m128i t1 = _mm_unpacklo_epi32(src.buf.reg[2], src.buf.reg[3]); |
| __m128i t2 = _mm_unpackhi_epi32(src.buf.reg[0], src.buf.reg[1]); |
| __m128i t3 = _mm_unpackhi_epi32(src.buf.reg[2], src.buf.reg[3]); |
| |
| RegBlockInt32<4, 4> result; |
| result.buf.reg[0] = _mm_unpacklo_epi64(t0, t1); |
| result.buf.reg[1] = _mm_unpackhi_epi64(t0, t1); |
| result.buf.reg[2] = _mm_unpacklo_epi64(t2, t3); |
| result.buf.reg[3] = _mm_unpackhi_epi64(t2, t3); |
| return result; |
| } |
| |
| template <typename DstType> |
| struct StoreFinalOutputImpl<RegBlockInt32<4, 4>, DstType> { |
| static void Run(const RegBlockInt32<4, 4>& src, DstType* dst, int row, |
| int col) { |
| if (DstType::kOrder == MapOrder::ColMajor) { |
| for (int i = 0; i < 4; i++) { |
| StoreInt32x4(dst->data(row, col + i), src.buf.reg[i]); |
| } |
| } else { |
| const auto transpose = Transpose(src); |
| for (int i = 0; i < 4; i++) { |
| StoreInt32x4(dst->data(row + i, col), transpose.buf.reg[i]); |
| } |
| } |
| } |
| }; |
| |
| template <typename DstType> |
| struct StoreFinalOutputImpl<RegBlockInt16<4, 4>, DstType> { |
| static void Run(const RegBlockInt16<4, 4>& src, DstType* dst, int row, |
| int col) { |
| std::int16_t buf[16]; |
| StoreInt16x8(buf + 0, src.buf.reg[0]); |
| StoreInt16x8(buf + 8, src.buf.reg[1]); |
| for (int i = 0; i < 4; i++) { |
| for (int j = 0; j < 4; j++) { |
| *dst->data(row + i, col + j) = buf[i + 4 * j]; |
| } |
| } |
| } |
| }; |
| |
| template <typename DstType> |
| struct StoreFinalOutputImpl<RegBlockInt32<8, 4>, DstType> { |
| static void Run(const RegBlockInt32<8, 4>& src, DstType* dst, int row, |
| int col) { |
| if (DstType::kOrder == MapOrder::ColMajor) { |
| for (int i = 0; i < 4; i++) { |
| StoreInt32x4(dst->data(row, col + i), src.buf.reg[2 * i]); |
| StoreInt32x4(dst->data(row + 4, col + i), src.buf.reg[2 * i + 1]); |
| } |
| } else { |
| RegBlockInt32<4, 4> top; |
| top.buf.reg[0] = src.buf.reg[0]; |
| top.buf.reg[1] = src.buf.reg[2]; |
| top.buf.reg[2] = src.buf.reg[4]; |
| top.buf.reg[3] = src.buf.reg[6]; |
| const auto transpose_top = Transpose(top); |
| for (int i = 0; i < 4; i++) { |
| StoreInt32x4(dst->data(row + i, col), transpose_top.buf.reg[i]); |
| } |
| RegBlockInt32<4, 4> bottom; |
| bottom.buf.reg[0] = src.buf.reg[1]; |
| bottom.buf.reg[1] = src.buf.reg[3]; |
| bottom.buf.reg[2] = src.buf.reg[5]; |
| bottom.buf.reg[3] = src.buf.reg[7]; |
| const auto transpose_bottom = Transpose(bottom); |
| for (int i = 0; i < 4; i++) { |
| StoreInt32x4(dst->data(row + 4 + i, col), transpose_bottom.buf.reg[i]); |
| } |
| } |
| } |
| }; |
| |
| template <typename DstType> |
| struct StoreFinalOutputImpl<RegBlockInt16<8, 4>, DstType> { |
| static void Run(const RegBlockInt16<8, 4>& src, DstType* dst, int row, |
| int col) { |
| if (DstType::kOrder == MapOrder::ColMajor) { |
| for (int i = 0; i < 4; i++) { |
| StoreInt16x8(dst->data(row, col + i), src.buf.reg[i]); |
| } |
| } else { |
| std::int16_t buf[32]; |
| StoreInt16x8(buf + 0, src.buf.reg[0]); |
| StoreInt16x8(buf + 8, src.buf.reg[1]); |
| StoreInt16x8(buf + 16, src.buf.reg[2]); |
| StoreInt16x8(buf + 24, src.buf.reg[3]); |
| for (int i = 0; i < 8; i++) { |
| for (int j = 0; j < 4; j++) { |
| *dst->data(row + i, col + j) = buf[i + 8 * j]; |
| } |
| } |
| } |
| } |
| }; |
| |
| template <typename DstType> |
| struct StoreFinalOutputImpl<RegBlockInt32<8, 8>, DstType> { |
| static void Run(const RegBlockInt32<8, 8>& src, DstType* dst, int row, |
| int col) { |
| if (DstType::kOrder == MapOrder::ColMajor) { |
| for (int i = 0; i < 8; i++) { |
| StoreInt32x4(dst->data(row, col + i), src.buf.reg[2 * i]); |
| StoreInt32x4(dst->data(row + 4, col + i), src.buf.reg[2 * i + 1]); |
| } |
| } else { |
| RegBlockInt32<4, 4> top_left; |
| top_left.buf.reg[0] = src.buf.reg[0]; |
| top_left.buf.reg[1] = src.buf.reg[2]; |
| top_left.buf.reg[2] = src.buf.reg[4]; |
| top_left.buf.reg[3] = src.buf.reg[6]; |
| const auto transpose_top_left = Transpose(top_left); |
| for (int i = 0; i < 4; i++) { |
| StoreInt32x4(dst->data(row + i, col), transpose_top_left.buf.reg[i]); |
| } |
| RegBlockInt32<4, 4> bottom_left; |
| bottom_left.buf.reg[0] = src.buf.reg[1]; |
| bottom_left.buf.reg[1] = src.buf.reg[3]; |
| bottom_left.buf.reg[2] = src.buf.reg[5]; |
| bottom_left.buf.reg[3] = src.buf.reg[7]; |
| const auto transpose_bottom_left = Transpose(bottom_left); |
| for (int i = 0; i < 4; i++) { |
| StoreInt32x4(dst->data(row + 4 + i, col), |
| transpose_bottom_left.buf.reg[i]); |
| } |
| RegBlockInt32<4, 4> top_right; |
| top_right.buf.reg[0] = src.buf.reg[8]; |
| top_right.buf.reg[1] = src.buf.reg[10]; |
| top_right.buf.reg[2] = src.buf.reg[12]; |
| top_right.buf.reg[3] = src.buf.reg[14]; |
| const auto transpose_top_right = Transpose(top_right); |
| for (int i = 0; i < 4; i++) { |
| StoreInt32x4(dst->data(row + i, col + 4), |
| transpose_top_right.buf.reg[i]); |
| } |
| RegBlockInt32<4, 4> bottom_right; |
| bottom_right.buf.reg[0] = src.buf.reg[9]; |
| bottom_right.buf.reg[1] = src.buf.reg[11]; |
| bottom_right.buf.reg[2] = src.buf.reg[13]; |
| bottom_right.buf.reg[3] = src.buf.reg[15]; |
| const auto transpose_bottom_right = Transpose(bottom_right); |
| for (int i = 0; i < 4; i++) { |
| StoreInt32x4(dst->data(row + 4 + i, col + 4), |
| transpose_bottom_right.buf.reg[i]); |
| } |
| } |
| } |
| }; |
| |
| template <typename DstType> |
| struct StoreFinalOutputImpl<RegBlockInt16<8, 8>, DstType> { |
| static void Run(const RegBlockInt16<8, 8>& src, DstType* dst, int row, |
| int col) { |
| if (DstType::kOrder == MapOrder::ColMajor) { |
| for (int i = 0; i < 8; i++) { |
| StoreInt16x8(dst->data(row, col + i), src.buf.reg[i]); |
| } |
| } else { |
| // top-left 4x4 |
| __m128i t0 = _mm_unpacklo_epi16(src.buf.reg[0], src.buf.reg[1]); |
| __m128i t1 = _mm_unpacklo_epi16(src.buf.reg[2], src.buf.reg[3]); |
| __m128i u0 = _mm_unpacklo_epi32(t0, t1); |
| __m128i u1 = _mm_unpackhi_epi32(t0, t1); |
| // top-right 4x4 |
| __m128i t2 = _mm_unpacklo_epi16(src.buf.reg[4], src.buf.reg[5]); |
| __m128i t3 = _mm_unpacklo_epi16(src.buf.reg[6], src.buf.reg[7]); |
| __m128i u2 = _mm_unpacklo_epi32(t2, t3); |
| __m128i u3 = _mm_unpackhi_epi32(t2, t3); |
| // bottom-left 4x4 |
| __m128i t4 = _mm_unpackhi_epi16(src.buf.reg[0], src.buf.reg[1]); |
| __m128i t5 = _mm_unpackhi_epi16(src.buf.reg[2], src.buf.reg[3]); |
| __m128i u4 = _mm_unpacklo_epi32(t4, t5); |
| __m128i u5 = _mm_unpackhi_epi32(t4, t5); |
| // bottom-right 4x4 |
| __m128i t6 = _mm_unpackhi_epi16(src.buf.reg[4], src.buf.reg[5]); |
| __m128i t7 = _mm_unpackhi_epi16(src.buf.reg[6], src.buf.reg[7]); |
| __m128i u6 = _mm_unpacklo_epi32(t6, t7); |
| __m128i u7 = _mm_unpackhi_epi32(t6, t7); |
| |
| StoreInt16x8(dst->data(row + 0, col), _mm_unpacklo_epi64(u0, u2)); |
| StoreInt16x8(dst->data(row + 1, col), _mm_unpackhi_epi64(u0, u2)); |
| StoreInt16x8(dst->data(row + 2, col), _mm_unpacklo_epi64(u1, u3)); |
| StoreInt16x8(dst->data(row + 3, col), _mm_unpackhi_epi64(u1, u3)); |
| StoreInt16x8(dst->data(row + 4, col), _mm_unpacklo_epi64(u4, u6)); |
| StoreInt16x8(dst->data(row + 5, col), _mm_unpackhi_epi64(u4, u6)); |
| StoreInt16x8(dst->data(row + 6, col), _mm_unpacklo_epi64(u5, u7)); |
| StoreInt16x8(dst->data(row + 7, col), _mm_unpackhi_epi64(u5, u7)); |
| } |
| } |
| }; |
| |
| template <typename DstType> |
| struct StoreFinalOutputImpl<RegBlockInt32<1, 4>, DstType> { |
| static void Run(const RegBlockInt32<1, 4>& src, DstType* dst, int row, |
| int col) { |
| if (DstType::kOrder == MapOrder::ColMajor) { |
| *dst->data(row, col + 0) = GetLane<0>(src.buf.reg[0]); |
| *dst->data(row, col + 1) = GetLane<1>(src.buf.reg[0]); |
| *dst->data(row, col + 2) = GetLane<2>(src.buf.reg[0]); |
| *dst->data(row, col + 3) = GetLane<3>(src.buf.reg[0]); |
| } else { |
| StoreInt32x4(dst->data(row, col), src.buf.reg[0]); |
| } |
| } |
| }; |
| |
| template <typename DstType> |
| struct StoreFinalOutputImpl<RegBlockUint8<4, 1>, DstType> { |
| static void Run(const RegBlockUint8<4, 1>& src, DstType* dst, int row, |
| int col) { |
| const std::uint32_t src_reg = src.buf.reg[0]; |
| for (int i = 0; i < 4; i++) { |
| *dst->data(row + i, col) = (src_reg >> (8 * i)); |
| } |
| } |
| }; |
| |
| template <typename DstType> |
| struct StoreFinalOutputImpl<RegBlockUint8<8, 1>, DstType> { |
| static void Run(const RegBlockUint8<8, 1>& src, DstType* dst, int row, |
| int col) { |
| for (int i = 0; i < 4; i++) { |
| *dst->data(row + i, col) = (src.buf.reg[0] >> (8 * i)); |
| } |
| for (int i = 0; i < 4; i++) { |
| *dst->data(row + 4 + i, col) = (src.buf.reg[1] >> (8 * i)); |
| } |
| } |
| }; |
| |
| template <typename DstType> |
| struct StoreFinalOutputImpl<RegBlockUint8<1, 4>, DstType> { |
| static void Run(const RegBlockUint8<1, 4>& src, DstType* dst, int row, |
| int col) { |
| for (int i = 0; i < 4; i++) { |
| *dst->data(row, col + i) = (src.buf.reg[0] >> (8 * i)); |
| } |
| } |
| }; |
| |
| template <typename DstType> |
| struct StoreFinalOutputImpl<RegBlockUint8<4, 4>, DstType> { |
| static void Run(const RegBlockUint8<4, 4>& src, DstType* dst, int row, |
| int col) { |
| std::uint8_t buf[16]; |
| StoreUint8x16(buf, src.buf.reg[0]); |
| for (int c = 0; c < 4; c++) { |
| for (int r = 0; r < 4; r++) { |
| *dst->data(row + r, col + c) = buf[r + 4 * c]; |
| } |
| } |
| } |
| }; |
| |
| template <typename DstType> |
| struct StoreFinalOutputImpl<RegBlockUint8<8, 4>, DstType> { |
| static void Run(const RegBlockUint8<8, 4>& src, DstType* dst, int row, |
| int col) { |
| std::uint8_t buf[32]; |
| StoreUint8x16(buf, src.buf.reg[0]); |
| StoreUint8x16(buf + 16, src.buf.reg[1]); |
| for (int c = 0; c < 4; c++) { |
| for (int r = 0; r < 8; r++) { |
| *dst->data(row + r, col + c) = buf[r + 8 * c]; |
| } |
| } |
| } |
| }; |
| |
| template <typename DstType> |
| struct StoreFinalOutputImpl<RegBlockUint8<8, 8>, DstType> { |
| static void Run(const RegBlockUint8<8, 8>& src, DstType* dst, int row, |
| int col) { |
| std::uint8_t buf[64]; |
| StoreUint8x16(buf, src.buf.reg[0]); |
| StoreUint8x16(buf + 16, src.buf.reg[1]); |
| StoreUint8x16(buf + 32, src.buf.reg[2]); |
| StoreUint8x16(buf + 48, src.buf.reg[3]); |
| for (int c = 0; c < 8; c++) { |
| for (int r = 0; r < 8; r++) { |
| *dst->data(row + r, col + c) = buf[r + 8 * c]; |
| } |
| } |
| } |
| }; |
| |
| // Specialization for MatrixMap, for performance. |
| template <typename tScalar, MapOrder tOrder> |
| struct StoreFinalOutputImpl<RegBlockUint8<8, 8>, MatrixMap<tScalar, tOrder>> { |
| static void Run(const RegBlockUint8<8, 8>& src, |
| MatrixMap<tScalar, tOrder>* dst, int row, int col) { |
| std::uint8_t buf[64]; |
| StoreUint8x16(buf, src.buf.reg[0]); |
| StoreUint8x16(buf + 16, src.buf.reg[1]); |
| StoreUint8x16(buf + 32, src.buf.reg[2]); |
| StoreUint8x16(buf + 48, src.buf.reg[3]); |
| // Make a local copy so that the compiler can prove that data_ does not |
| // alias &data_ or &stride_. |
| MatrixMap<tScalar, tOrder> local = *dst; |
| for (int c = 0; c < 8; c++) { |
| for (int r = 0; r < 8; r++) { |
| *local.data(row + r, col + c) = buf[r + 8 * c]; |
| } |
| } |
| } |
| }; |
| |
| } // namespace gemmlowp |
| |
| #endif // GEMMLOWP_INTERNAL_OUTPUT_SSE_H_ |