// Copyright 2015 The Gemmlowp Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
| |
// unpack.h: unpacking the result blocks computed by compute.h,
// storing them into the destination matrix.
| |
| #ifndef GEMMLOWP_INTERNAL_UNPACK_H_ |
| #define GEMMLOWP_INTERNAL_UNPACK_H_ |
| |
| #include "allocator.h" |
| #include "block_params.h" |
| #include "output.h" |
| #include "pack.h" |
| |
| #include <cmath> |
| |
| namespace gemmlowp { |
| |
| class PackedResult { |
| public: |
| PackedResult(Allocator* _allocator, const BlockParams& _block_params) |
| : allocator_(_allocator), block_params_(_block_params) { |
| matrix_handle_ = allocator_->Reserve<std::int32_t>(block_params_.l2_rows * |
| block_params_.l2_cols); |
| } |
| |
| ~PackedResult() {} |
| |
| MatrixMap<std::int32_t, MapOrder::ColMajor> Map() { |
| return MatrixMap<std::int32_t, MapOrder::ColMajor>( |
| allocator_->GetPointer<std::int32_t>(matrix_handle_), |
| block_params_.l2_rows, block_params_.l2_cols, block_params_.l2_rows); |
| } |
| |
| MatrixMap<const std::int32_t, MapOrder::ColMajor> Map() const { |
| return MatrixMap<const std::int32_t, MapOrder::ColMajor>( |
| allocator_->GetPointer<const std::int32_t>(matrix_handle_), |
| block_params_.l2_rows, block_params_.l2_cols, block_params_.l2_rows); |
| } |
| |
| private: |
| Allocator* allocator_; |
| Allocator::Handle matrix_handle_; |
| const BlockParams& block_params_; |
| }; |
| |
// Describes a rectangular sub-block of a matrix: the position of its first
// entry (start_row, start_col) and its extent (rows x cols).
struct MatrixBlockBounds {
  int start_row;  // Index of the first row covered by the block.
  int start_col;  // Index of the first column covered by the block.
  int rows;       // Number of rows in the block.
  int cols;       // Number of columns in the block.

  // Stores the four values as given; no validation is performed here.
  MatrixBlockBounds(int start_row_, int start_col_, int rows_, int cols_)
      : start_row{start_row_},
        start_col{start_col_},
        rows{rows_},
        cols{cols_} {}
};
| |
| template <int Rows, int Cols, typename SrcMapType> |
| void PrefetchResultBlock(const SrcMapType& src, |
| const VectorMap<const std::int32_t, VectorShape::Col>& |
| lhs_sums_of_each_slice, |
| int src_row, int src_col) { |
| const std::int32_t* src_data = src.data(src_row, src_col); |
| const int src_stride = src.stride(); |
| const std::int32_t* lhs_sums_data = lhs_sums_of_each_slice.data(src_row); |
| for (int r = 0; r < Rows; r += 4) { |
| Prefetch(lhs_sums_data + r); |
| } |
| for (int c = 0; c < Cols; c++) { |
| for (int r = 0; r < Rows; r += 4) { |
| Prefetch(src_data + r + c * src_stride); |
| } |
| } |
| } |
| |
| template <typename KernelFormat, typename RegisterBlockType, |
| typename SrcMapType, typename LhsOffset, typename RhsOffset, |
| typename OutputPipelineExecutorType, typename DstType> |
| void UnpackResultBlock(const SrcMapType& src, |
| const OutputPipelineExecutorType& executor, DstType* dst, |
| const VectorMap<const std::int32_t, VectorShape::Col>& |
| lhs_sums_of_each_slice, |
| const VectorMap<const std::int32_t, VectorShape::Row>& |
| rhs_sums_of_each_slice, |
| const LhsOffset& lhs_offset, const RhsOffset& rhs_offset, |
| int depth, int src_row, int src_col, int src_global_row, |
| int src_global_col, int dst_row, int dst_col) { |
| using KernelLhsInputScalar = typename KernelFormat::Lhs::InputScalar; |
| using KernelLhsScalar = typename KernelFormat::Lhs::Scalar; |
| using KernelRhsInputScalar = typename KernelFormat::Rhs::InputScalar; |
| using KernelRhsScalar = typename KernelFormat::Rhs::Scalar; |
| static constexpr int KernelLhsZeroPointInput = |
| ZeroPointInputValue<KernelLhsInputScalar, KernelLhsScalar>::kValue; |
| static constexpr int KernelRhsZeroPointInput = |
| ZeroPointInputValue<KernelRhsInputScalar, KernelRhsScalar>::kValue; |
| auto acc = Load<RegisterBlockType>(src, src_row, src_col); |
| const auto& lhs_sums_of_each_slice_block = |
| LoadForBroadcasting<RegisterBlockType>(lhs_sums_of_each_slice, src_row); |
| const auto& rhs_sums_of_each_slice_block = |
| LoadForBroadcasting<RegisterBlockType>(rhs_sums_of_each_slice, src_col); |
| auto lhs_offset_block = |
| LoadForBroadcasting<RegisterBlockType>(lhs_offset, src_row); |
| auto rhs_offset_block = |
| LoadForBroadcasting<RegisterBlockType>(rhs_offset, src_col); |
| AddConstant<KernelLhsZeroPointInput>(&lhs_offset_block); |
| AddConstant<KernelRhsZeroPointInput>(&rhs_offset_block); |
| BroadcastMulAdd(lhs_sums_of_each_slice_block, rhs_offset_block, &acc); |
| for (int i = 0; i < decltype(rhs_offset_block)::kRegisterCount; i++) { |
| rhs_offset_block.buf.reg[i] = Mul(rhs_offset_block.buf.reg[i], depth); |
| } |
| BroadcastMulAdd(BroadcastAdd(rhs_sums_of_each_slice_block, rhs_offset_block), |
| lhs_offset_block, &acc); |
| executor.Execute(acc, dst, src_global_row, src_global_col, dst_row, dst_col); |
| } |
| |
| template <typename KernelFormat, typename ResultBlockType, |
| typename PackedResultType, typename LhsOffset, typename RhsOffset, |
| typename OutputPipelineType> |
| void UnpackResult(ResultBlockType* dst, const MatrixBlockBounds& dst_block, |
| const PackedResultType& src, int depth, |
| const std::int32_t* lhs_sums_of_each_slice_ptr, |
| const std::int32_t* rhs_sums_of_each_slice_ptr, |
| const LhsOffset& lhs_offset, const RhsOffset& rhs_offset, |
| const OutputPipelineType& output_pipeline) { |
| ScopedProfilingLabel label(ResultBlockType::kOrder == MapOrder::ColMajor |
| ? "unpack to column-major" |
| : "unpack to row-major"); |
| assert(dst_block.start_row >= 0); |
| assert(dst_block.start_row + dst_block.rows <= dst->rows()); |
| assert(dst_block.start_col >= 0); |
| assert(dst_block.start_col + dst_block.cols <= dst->cols()); |
| const auto src_map = src.Map(); |
| const VectorMap<const std::int32_t, VectorShape::Col> lhs_sums_of_each_slice( |
| lhs_sums_of_each_slice_ptr, dst_block.rows); |
| const VectorMap<const std::int32_t, VectorShape::Row> rhs_sums_of_each_slice( |
| rhs_sums_of_each_slice_ptr, dst_block.cols); |
| using Int32x1x1 = RegisterBlock<std::int32_t, 1, 1>; |
| using Int32x4x1 = RegisterBlock<std::int32_t, 4, 1>; |
| using Int32x8x1 = RegisterBlock<std::int32_t, 8, 1>; |
| using Int32x1x4 = RegisterBlock<std::int32_t, 1, 4>; |
| using Int32x4x4 = RegisterBlock<std::int32_t, 4, 4>; |
| using Int32x8x4 = RegisterBlock<std::int32_t, 8, 4>; |
| |
| using DstScalarType = typename ResultBlockType::Scalar; |
| using DstScalarx8x8 = RegisterBlock<DstScalarType, 8, 8>; |
| |
| OutputPipelineExecutor<OutputPipelineType, Int32x1x1> |
| output_pipeline_executor_1x1(output_pipeline); |
| OutputPipelineExecutor<OutputPipelineType, Int32x4x1> |
| output_pipeline_executor_4x1(output_pipeline); |
| OutputPipelineExecutor<OutputPipelineType, Int32x8x1> |
| output_pipeline_executor_8x1(output_pipeline); |
| OutputPipelineExecutor<OutputPipelineType, Int32x1x4> |
| output_pipeline_executor_1x4(output_pipeline); |
| OutputPipelineExecutor<OutputPipelineType, Int32x4x4> |
| output_pipeline_executor_4x4(output_pipeline); |
| OutputPipelineExecutor<OutputPipelineType, Int32x8x4> |
| output_pipeline_executor_8x4(output_pipeline); |
| |
| int c8 = 0; |
| if (ResultBlockType::kOrder == MapOrder::RowMajor) { |
| for (; c8 <= dst_block.cols - 8; c8 += 8) { |
| PrefetchResultBlock<8, 8>(src_map, lhs_sums_of_each_slice, 0, c8); |
| int r = 0; |
| for (; r <= dst_block.rows - 8; r += 8) { |
| const int global_row = r + dst_block.start_row; |
| PrefetchResultBlock<8, 8>(src_map, lhs_sums_of_each_slice, r + 8, c8); |
| DstScalarType dst_colmajor_buf[64]; |
| MatrixMap<DstScalarType, MapOrder::ColMajor> dst_colmajor_map( |
| dst_colmajor_buf, 8, 8); |
| for (int cx = 0; cx < 8; cx += 4) { |
| const int c = c8 + cx; |
| const int global_col = c + dst_block.start_col; |
| UnpackResultBlock<KernelFormat, Int32x8x4>( |
| src_map, output_pipeline_executor_8x4, &dst_colmajor_map, |
| lhs_sums_of_each_slice, rhs_sums_of_each_slice, lhs_offset, |
| rhs_offset, depth, r, c, global_row, global_col, 0, cx); |
| } |
| StoreFinalOutput(LoadContiguous<DstScalarx8x8>(dst_colmajor_buf), dst, |
| r + dst_block.start_row, c8 + dst_block.start_col); |
| } |
| for (; r <= dst_block.rows - 4; r += 4) { |
| const int global_row = r + dst_block.start_row; |
| for (int cx = 0; cx < 8; cx += 4) { |
| const int c = c8 + cx; |
| const int global_col = c + dst_block.start_col; |
| UnpackResultBlock<KernelFormat, Int32x4x4>( |
| src_map, output_pipeline_executor_4x4, dst, |
| lhs_sums_of_each_slice, rhs_sums_of_each_slice, lhs_offset, |
| rhs_offset, depth, r, c, global_row, global_col, global_row, |
| global_col); |
| } |
| } |
| for (; r < dst_block.rows; r++) { |
| const int global_row = r + dst_block.start_row; |
| for (int cx = 0; cx < 8; cx += 4) { |
| const int c = c8 + cx; |
| const int global_col = c + dst_block.start_col; |
| UnpackResultBlock<KernelFormat, Int32x1x4>( |
| src_map, output_pipeline_executor_1x4, dst, |
| lhs_sums_of_each_slice, rhs_sums_of_each_slice, lhs_offset, |
| rhs_offset, depth, r, c, global_row, global_col, global_row, |
| global_col); |
| } |
| } |
| } |
| } |
| int c = c8; |
| for (; c <= dst_block.cols - 4; c += 4) { |
| const int global_col = c + dst_block.start_col; |
| PrefetchResultBlock<8, 4>(src_map, lhs_sums_of_each_slice, 0, c); |
| int r = 0; |
| for (; r <= dst_block.rows - 8; r += 8) { |
| const int global_row = r + dst_block.start_row; |
| PrefetchResultBlock<8, 4>(src_map, lhs_sums_of_each_slice, r + 8, c); |
| UnpackResultBlock<KernelFormat, Int32x8x4>( |
| src_map, output_pipeline_executor_8x4, dst, lhs_sums_of_each_slice, |
| rhs_sums_of_each_slice, lhs_offset, rhs_offset, depth, r, c, |
| global_row, global_col, global_row, global_col); |
| } |
| for (; r <= dst_block.rows - 4; r += 4) { |
| const int global_row = r + dst_block.start_row; |
| UnpackResultBlock<KernelFormat, Int32x4x4>( |
| src_map, output_pipeline_executor_4x4, dst, lhs_sums_of_each_slice, |
| rhs_sums_of_each_slice, lhs_offset, rhs_offset, depth, r, c, |
| global_row, global_col, global_row, global_col); |
| } |
| for (; r < dst_block.rows; r++) { |
| const int global_row = r + dst_block.start_row; |
| UnpackResultBlock<KernelFormat, Int32x1x4>( |
| src_map, output_pipeline_executor_1x4, dst, lhs_sums_of_each_slice, |
| rhs_sums_of_each_slice, lhs_offset, rhs_offset, depth, r, c, |
| global_row, global_col, global_row, global_col); |
| } |
| } |
| for (; c < dst_block.cols; c++) { |
| const int global_col = c + dst_block.start_col; |
| PrefetchResultBlock<8, 1>(src_map, lhs_sums_of_each_slice, 0, c); |
| int r = 0; |
| for (; r <= dst_block.rows - 8; r += 8) { |
| const int global_row = r + dst_block.start_row; |
| PrefetchResultBlock<8, 1>(src_map, lhs_sums_of_each_slice, r + 8, c); |
| UnpackResultBlock<KernelFormat, Int32x8x1>( |
| src_map, output_pipeline_executor_8x1, dst, lhs_sums_of_each_slice, |
| rhs_sums_of_each_slice, lhs_offset, rhs_offset, depth, r, c, |
| global_row, global_col, global_row, global_col); |
| } |
| for (; r <= dst_block.rows - 4; r += 4) { |
| const int global_row = r + dst_block.start_row; |
| UnpackResultBlock<KernelFormat, Int32x4x1>( |
| src_map, output_pipeline_executor_4x1, dst, lhs_sums_of_each_slice, |
| rhs_sums_of_each_slice, lhs_offset, rhs_offset, depth, r, c, |
| global_row, global_col, global_row, global_col); |
| } |
| for (; r < dst_block.rows; r++) { |
| const int global_row = r + dst_block.start_row; |
| UnpackResultBlock<KernelFormat, Int32x1x1>( |
| src_map, output_pipeline_executor_1x1, dst, lhs_sums_of_each_slice, |
| rhs_sums_of_each_slice, lhs_offset, rhs_offset, depth, r, c, |
| global_row, global_col, global_row, global_col); |
| } |
| } |
| } |
| |
| } // end namespace gemmlowp |
| |
| #endif // GEMMLOWP_INTERNAL_UNPACK_H_ |