| // Copyright 2015 The Gemmlowp Authors. All Rights Reserved. |
| // |
| // Licensed under the Apache License, Version 2.0 (the "License"); |
| // you may not use this file except in compliance with the License. |
| // You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, software |
| // distributed under the License is distributed on an "AS IS" BASIS, |
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| // See the License for the specific language governing permissions and |
| // limitations under the License. |
| |
| // single_thread_gemm.h: Single-threaded GEMM implementation. |
| // This is a good place to start reading code, as it shows the overall |
| // structure of a GEMM and is much simpler than multi_thread_gemm.h. |
| |
| #ifndef GEMMLOWP_INTERNAL_SINGLE_THREAD_GEMM_H_ |
| #define GEMMLOWP_INTERNAL_SINGLE_THREAD_GEMM_H_ |
| |
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <cstdio>

#include "../public/map.h"
#include "allocator.h"
#include "compute.h"
#include "kernel.h"
#include "pack.h"
#include "unpack.h"
| |
| #ifdef GEMMLOWP_PROFILING_SIZES |
| #ifndef GEMMLOWP_PROFILING |
| #error GEMMLOWP_PROFILING_SIZES without GEMMLOWP_PROFILING |
| #endif |
| #include <string> |
| #include <unordered_map> |
| #endif |
| |
| namespace gemmlowp { |
| |
| class SingleThreadGemmContext { |
| public: |
| Allocator* allocator() { return &allocator_; } |
| |
| void set_l1_bytes_to_use(int n) { l1_bytes_to_use_ = n; } |
| void set_l2_bytes_to_use(int n) { l2_bytes_to_use_ = n; } |
| void set_l2_rhs_factor(float n) { l2_rhs_factor_ = n; } |
| |
| int l1_bytes_to_use() const { return l1_bytes_to_use_; } |
| int l2_bytes_to_use() const { return l2_bytes_to_use_; } |
| float l2_rhs_factor() const { return l2_rhs_factor_; } |
| |
| protected: |
| Allocator allocator_; |
| |
| // The cache configurationt to use. |
| int l1_bytes_to_use_ = kDefaultL1CacheSize; |
| int l2_bytes_to_use_ = kDefaultL2CacheSize; |
| float l2_rhs_factor_ = kDefaultL2RhsFactor; |
| }; |
| |
| template <typename KernelFormat, typename InputScalar, typename OutputScalar, |
| typename BitDepthParams, MapOrder LhsOrder, MapOrder RhsOrder, |
| MapOrder ResultOrder, typename LhsOffset, typename RhsOffset, |
| typename OutputPipelineType> |
| void SingleThreadGemm(SingleThreadGemmContext* context, |
| const KernelBase& kernel, |
| const MatrixMap<const InputScalar, LhsOrder>& lhs, |
| const MatrixMap<const InputScalar, RhsOrder>& rhs, |
| MatrixMap<OutputScalar, ResultOrder>* result, |
| const LhsOffset& lhs_offset, const RhsOffset& rhs_offset, |
| const OutputPipelineType& output_pipeline) { |
| ScopedProfilingLabel label("gemmlowp::SingleThreadGemm"); |
| |
| assert(lhs.cols() == rhs.rows()); |
| |
| int rows = result->rows(); |
| int cols = result->cols(); |
| int depth = lhs.cols(); |
| |
| // zero sizes should have been caught earlier and early-returned. |
| assert(rows > 0); |
| assert(cols > 0); |
| assert(depth > 0); |
| |
| // The case of rows<cols should have been caught earlier and transposed. |
| assert(rows >= cols); |
| |
| Allocator* allocator = context->allocator(); |
| |
| BlockParams block_params; |
| block_params.Init<KernelFormat>( |
| rows, cols, depth, 1, context->l1_bytes_to_use(), |
| context->l2_bytes_to_use(), context->l2_rhs_factor()); |
| |
| #ifdef GEMMLOWP_PROFILING_SIZES |
| // Using a static map of label strings. Not reentrant at all! |
| static std::unordered_map<std::uint64_t, std::string> labels_map; |
| std::uint64_t sizes_hash = static_cast<std::uint64_t>(rows) ^ |
| (static_cast<std::uint64_t>(depth) << 16) ^ |
| (static_cast<std::uint64_t>(cols) << 32); |
| if (!labels_map.count(sizes_hash)) { |
| char label[256]; |
| snprintf(label, sizeof(label), |
| "(rows = %d, depth = %d, cols = %d, l2_rows = %d, l2_depth = %d, " |
| "l2_cols = %d, l1_rows = %d, l1_depth = %d, l1_cols = %d)", |
| rows, depth, cols, block_params.l2_rows, block_params.l2_depth, |
| block_params.l2_cols, block_params.l1_rows, block_params.l1_depth, |
| block_params.l1_cols); |
| labels_map[sizes_hash] = label; |
| } |
| ScopedProfilingLabel size_label(labels_map[sizes_hash].c_str()); |
| #endif |
| |
| PackedSideBlock<typename KernelFormat::Lhs> packed_lhs(Side::Lhs, allocator, |
| block_params); |
| PackedSideBlock<typename KernelFormat::Rhs> packed_rhs(Side::Rhs, allocator, |
| block_params); |
| |
| PackedResult packed_result(allocator, block_params); |
| |
| allocator->Commit(); |
| |
| const bool pack_rhs_once = block_params.l2_cols >= cols; |
| |
| if (pack_rhs_once) { |
| PackRhs(&packed_rhs, rhs); |
| } |
| |
| for (int r = 0; r < rows; r += block_params.l2_rows) { |
| int rs = std::min(block_params.l2_rows, rows - r); |
| |
| PackLhs(&packed_lhs, lhs.block(r, 0, rs, depth)); |
| |
| for (int c = 0; c < cols; c += block_params.l2_cols) { |
| int cs = std::min(block_params.l2_cols, cols - c); |
| |
| if (!pack_rhs_once) { |
| PackRhs(&packed_rhs, rhs.block(0, c, depth, cs)); |
| } |
| |
| Compute(kernel, block_params, &packed_result, packed_lhs, packed_rhs, |
| depth); |
| |
| UnpackResult<KernelFormat>( |
| result, MatrixBlockBounds(r, c, rs, cs), packed_result, depth, |
| packed_lhs.sums_of_each_slice(), packed_rhs.sums_of_each_slice(), |
| lhs_offset.block(r, rs), rhs_offset.block(c, cs), output_pipeline); |
| } |
| } |
| |
| allocator->Decommit(); |
| } |
| |
| } // namespace gemmlowp |
| |
| #endif // GEMMLOWP_INTERNAL_SINGLE_THREAD_GEMM_H_ |