| // Example command line to build on Android ARM64: |
| /* |
| ~/android/toolchains/r15c-aarch64/bin/aarch64-linux-android-clang++ \ |
| test/benchmark_all_sizes.cc -o /tmp/b -O3 --std=c++11 -fPIE -static \ |
| -DBENCHMARK_QUICK -DBENCHMARK_8bit |
| */ |
| |
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <ctime>
#include <iostream>
#include <map>
#include <random>
#include <set>
#include <tuple>
#include <vector>
| |
| #include "../public/gemmlowp.h" |
| |
#if defined GEMMLOWP_ANDROID && defined GEMMLOWP_ARM_32
// Compilation workaround: some older Android ARM32 toolchains don't
// declare ::round in namespace std.
namespace std {
using ::round;
}
#endif
| |
// Minimum duration of each benchmark measurement.
const double kBenchmarkSecs = 0.1;
| |
| // Sleep time before each benchmark. |
| const int kCooldownBeforeBenchmarkSecs = 0; |
| |
| // Number of benchmark passes. |
| const int kPasses = 4; |
| |
| #ifdef BENCHMARK_NUM_THREADS |
| const int kNumThreads = BENCHMARK_NUM_THREADS; |
| #else |
| const int kNumThreads = 1; |
| #endif |
| |
| namespace gemmlowp { |
| |
// gemmlowp itself doesn't have a Matrix class, only a MatrixMap class,
// since it only maps existing data buffers. In tests, though, we need to
// create and own our own matrices.
| template <typename tScalar, MapOrder tOrder> |
| class Matrix : public MatrixMap<tScalar, tOrder> { |
| public: |
| typedef MatrixMap<tScalar, tOrder> Map; |
| typedef MatrixMap<const tScalar, tOrder> ConstMap; |
| typedef typename Map::Scalar Scalar; |
| static const MapOrder Order = tOrder; |
| using Map::cols_; |
| using Map::data_; |
| using Map::kOrder; |
| using Map::rows_; |
| using Map::stride_; |
| |
| public: |
| Matrix() : Map(nullptr, 0, 0, 0) {} |
| |
| Matrix(int rows, int cols) : Map(nullptr, 0, 0, 0) { Resize(rows, cols); } |
| |
| Matrix(const Matrix& other) : Map(nullptr, 0, 0, 0) { *this = other; } |
| |
| Matrix& operator=(const Matrix& other) { |
| Resize(other.rows_, other.cols_); |
| std::memcpy(data_, other.data_, size() * sizeof(Scalar)); |
| return *this; |
| } |
| |
  friend bool operator==(const Matrix& a, const Matrix& b) {
    return a.rows_ == b.rows_ && a.cols_ == b.cols_ &&
           !std::memcmp(a.data_, b.data_, a.size() * sizeof(Scalar));
  }
| |
| void Resize(int rows, int cols) { |
| rows_ = rows; |
| cols_ = cols; |
| stride_ = kOrder == MapOrder::ColMajor ? rows : cols; |
| storage.resize(size()); |
| data_ = storage.data(); |
| } |
| |
| int size() const { return rows_ * cols_; } |
| |
| Map& map() { return *static_cast<Map*>(this); } |
| |
| ConstMap const_map() const { return ConstMap(data_, rows_, cols_, stride_); } |
| |
| protected: |
| std::vector<Scalar> storage; |
| }; |
| |
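// Fills a matrix with the value 128. With the quantization convention used
// throughout this benchmark (note the -128 lhs/rhs offsets passed to
// GemmWithOutputPipeline below), 128 is the quantized encoding of zero.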
| template <typename MatrixType> |
| void MakeZero(MatrixType* m) { |
| for (int c = 0; c < m->cols(); c++) { |
| for (int r = 0; r < m->rows(); r++) { |
| (*m)(r, c) = 128; |
| } |
| } |
| } |
| |
| } // end namespace gemmlowp |
| |
| template <typename BitDepthParams> |
| float benchmark_8bit(int rows, int depth, int cols) { |
| using namespace gemmlowp; |
| typedef Matrix<std::uint8_t, MapOrder::RowMajor> LhsType; |
| typedef Matrix<std::uint8_t, MapOrder::ColMajor> RhsType; |
| typedef Matrix<std::uint8_t, MapOrder::ColMajor> ResultType; |
| |
| LhsType lhs; |
| RhsType rhs; |
| ResultType result; |
| lhs.Resize(rows, depth); |
| rhs.Resize(depth, cols); |
| result.Resize(rows, cols); |
| MakeZero(&lhs); |
| MakeZero(&rhs); |
| MakeZero(&result); |
| |
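  // Output pipeline: quantize the int32 accumulators back down to uint8.
  // OutputStageQuantizeDownInt32ToUint8ScaleByFixedPoint computes
  //   RoundingDivideByPOT(
  //       SaturatingRoundingDoublingHighMul(acc, result_fixedpoint_multiplier),
  //       result_shift) + result_offset_after_shift,
  // and OutputStageSaturatingCastToUint8 clamps that to [0, 255]. The
  // particular constants used below are arbitrary; they only need to
  // exercise this code path.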
| gemmlowp::OutputStageQuantizeDownInt32ToUint8ScaleByFixedPoint |
| quantize_down_stage; |
| quantize_down_stage.result_offset_after_shift = 128; |
| quantize_down_stage.result_fixedpoint_multiplier = 1234567890; |
| quantize_down_stage.result_shift = 16; |
| gemmlowp::OutputStageSaturatingCastToUint8 saturating_cast_stage; |
| const auto output_pipeline = |
| std::make_tuple(quantize_down_stage, saturating_cast_stage); |
| GemmContext gemm_context; |
| gemm_context.set_max_num_threads(kNumThreads); |
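  // One untimed warm-up Gemm, so that one-time setup costs (thread pool
  // startup, packing buffer allocation) don't pollute the measurement.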
| gemmlowp::GemmWithOutputPipeline<std::uint8_t, std::uint8_t, BitDepthParams>( |
| &gemm_context, lhs.const_map(), rhs.const_map(), &result.map(), -128, |
| -128, output_pipeline); |
| |
| double time_start = real_time_in_seconds(); |
| double t = time_start; |
| int iters = 0; |
| int iters_at_a_time = 1; |
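  // Double the number of iterations between timer reads each time, so that
  // timer overhead is amortized even for very fast (small) shapes.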
| while (t - time_start < kBenchmarkSecs) { |
| for (int i = 0; i < iters_at_a_time; i++) { |
| gemmlowp::GemmWithOutputPipeline<std::uint8_t, std::uint8_t, |
| BitDepthParams>( |
| &gemm_context, lhs.const_map(), rhs.const_map(), &result.map(), -128, |
| -128, output_pipeline); |
| iters++; |
| } |
| iters_at_a_time *= 2; |
| t = real_time_in_seconds(); |
| } |
| return (t - time_start) / iters; |
| } |
| |
| template <typename BitDepthParams> |
| float benchmark_8bit_to_32bit(int rows, int depth, int cols) { |
| using namespace gemmlowp; |
| typedef Matrix<std::uint8_t, MapOrder::RowMajor> LhsType; |
| typedef Matrix<std::uint8_t, MapOrder::ColMajor> RhsType; |
| typedef Matrix<std::int32_t, MapOrder::ColMajor> ResultType; |
| |
| LhsType lhs; |
| RhsType rhs; |
| ResultType result; |
| lhs.Resize(rows, depth); |
| rhs.Resize(depth, cols); |
| result.Resize(rows, cols); |
| MakeZero(&lhs); |
| MakeZero(&rhs); |
| MakeZero(&result); |
| |
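  // An empty output pipeline yields the raw int32 accumulators, with no
  // downquantization stage.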
| typedef std::tuple<> EmptyPipeline; |
| GemmContext gemm_context; |
| gemm_context.set_max_num_threads(kNumThreads); |
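  // Untimed warm-up run, as in benchmark_8bit.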
| gemmlowp::GemmWithOutputPipeline<std::uint8_t, std::int32_t, BitDepthParams>( |
| &gemm_context, lhs.const_map(), rhs.const_map(), &result.map(), -128, |
| -128, EmptyPipeline()); |
| |
| double time_start = real_time_in_seconds(); |
| double t = time_start; |
| int iters = 0; |
| int iters_at_a_time = 1; |
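  // Same geometrically growing measurement loop as in benchmark_8bit.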
| while (t - time_start < kBenchmarkSecs) { |
| for (int i = 0; i < iters_at_a_time; i++) { |
| gemmlowp::GemmWithOutputPipeline<std::uint8_t, std::int32_t, |
| BitDepthParams>( |
| &gemm_context, lhs.const_map(), rhs.const_map(), &result.map(), -128, |
| -128, EmptyPipeline()); |
| iters++; |
| } |
| iters_at_a_time *= 2; |
| t = real_time_in_seconds(); |
| } |
| return (t - time_start) / iters; |
| } |
| |
| struct Shape { |
| int rows; |
| int depth; |
| int cols; |
| }; |
| |
| bool operator==(const Shape& s1, const Shape& s2) { |
| return s1.rows == s2.rows && s1.depth == s2.depth && s1.cols == s2.cols; |
| } |
| |
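// Order shapes by depth first, matching the depth,rows,cols column order of
// the final report.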
bool operator<(const Shape& shape1, const Shape& shape2) {
  return shape1.depth < shape2.depth ||
         (shape1.depth == shape2.depth &&
          (shape1.rows < shape2.rows ||
           (shape1.rows == shape2.rows && shape1.cols < shape2.cols)));
}
| |
#ifdef _WIN32
#include <windows.h>
#define sleep(t) Sleep(1000 * (t))  // Sleep() takes milliseconds.
#else
#include <unistd.h>
#endif
| |
| float benchmark(const Shape& shape) { |
| if (kCooldownBeforeBenchmarkSecs) { |
| sleep(kCooldownBeforeBenchmarkSecs); |
| } |
| #if defined BENCHMARK_8bit |
| // Benchmark the fast 8bit path, using L8R8WithLhsNonzeroBitDepthParams. |
| // This is the recommended thing to default to: it's what most applications |
| // want to use, as it's the fastest. |
| // The contract is that LHS must take values in [1, 255], while RHS can take |
| // any value in [0, 255]. |
| return benchmark_8bit<gemmlowp::L8R8WithLhsNonzeroBitDepthParams>( |
| shape.rows, shape.depth, shape.cols); |
| #elif defined BENCHMARK_8bit_wide |
| // Variant benchmarking the slower (mostly legacy) DefaultL8R8BitDepthParams. |
| // The only contract difference is that both LHS and RHS can take values in |
| // [0, 255]. |
| return benchmark_8bit<gemmlowp::DefaultL8R8BitDepthParams>( |
| shape.rows, shape.depth, shape.cols); |
| #elif defined BENCHMARK_8bit_to_32bit |
  // Variant of BENCHMARK_8bit where the user asks for the raw int32
  // accumulators instead of an 8bit-downscaled result.
| return benchmark_8bit_to_32bit<gemmlowp::L8R8WithLhsNonzeroBitDepthParams>( |
| shape.rows, shape.depth, shape.cols); |
| #elif defined BENCHMARK_8bit_to_32bit_wide |
  // Variant of BENCHMARK_8bit_wide where the user asks for the raw int32
  // accumulators instead of an 8bit-downscaled result.
| return benchmark_8bit_to_32bit<gemmlowp::DefaultL8R8BitDepthParams>( |
| shape.rows, shape.depth, shape.cols); |
| #elif defined BENCHMARK_float |
| return benchmark_float(shape.rows, shape.depth, shape.cols); |
| #else |
| #error What arithmetic path should we benchmark? (Suggestion: #define BENCHMARK_8bit) |
| #endif |
| } |
| |
std::set<int> all_sizes() {
  std::set<int> sizes;
  // Powers of two from 1 to 2048.
  for (int i = 1; i <= 2048; i *= 2) {
    sizes.insert(i);
  }
  // Geometric progression of ratio sqrt(2) from 8 to 2048.
  for (double x = 8; x <= 2048; x *= std::sqrt(2.)) {
    sizes.insert(static_cast<int>(std::round(x)));
  }
  // Finer-grained progression of ratio 2^(1/4) from 16 to 512.
  for (double x = 16; x <= 512; x *= std::pow(2., 1. / 4.)) {
    sizes.insert(static_cast<int>(std::round(x)));
  }
  return sizes;
}
| |
| std::mt19937& RandomEngine() { |
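  // Note: default-seeded, so the shuffled order of shapes is the same on
  // every run, making results reproducible.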
| static std::mt19937 engine; |
| return engine; |
| } |
| |
| std::vector<Shape> all_shapes_in_random_order() { |
| std::vector<Shape> shapes; |
| const std::set<int> sizes = all_sizes(); |
| #if defined BENCHMARK_ROWS |
| // Benchmark one specific shape |
| Shape shape; |
| shape.rows = BENCHMARK_ROWS; |
| shape.depth = BENCHMARK_DEPTH; |
| shape.cols = BENCHMARK_COLS; |
| shapes.push_back(shape); |
| #elif defined BENCHMARK_QUICK |
| // Benchmark an assortment of cubic shapes |
| for (int size : sizes) { |
| Shape shape; |
| shape.rows = size; |
| shape.depth = size; |
| shape.cols = size; |
| shapes.push_back(shape); |
| } |
| #elif defined BENCHMARK_EXHAUSTIVE |
| // Benchmark all sorts of shapes |
| for (int rows : sizes) { |
| for (int depth : sizes) { |
| for (int cols : sizes) { |
| Shape shape; |
| shape.rows = rows; |
| shape.depth = depth; |
| shape.cols = cols; |
| shapes.push_back(shape); |
| } |
| } |
| } |
| #else |
| #error What shapes should we benchmark? (Suggestion: #define BENCHMARK_QUICK) |
| #endif |
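  // Shuffle the shapes so that systematic drift over a long run (e.g.
  // thermal throttling) doesn't consistently bias any particular shape.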
| std::shuffle(std::begin(shapes), std::end(shapes), RandomEngine()); |
| return shapes; |
| } |
| |
| void run_benchmarks(std::map<Shape, float>* results) { |
| std::vector<Shape> shapes; |
| for (int pass = 0; pass < kPasses; pass++) { |
| const std::vector<Shape> pass_shapes = all_shapes_in_random_order(); |
| shapes.insert(std::end(shapes), std::begin(pass_shapes), |
| std::end(pass_shapes)); |
| } |
| |
| const double time_start = gemmlowp::real_time_in_seconds(); |
| for (std::size_t i = 0; i < shapes.size(); i++) { |
    const double ratio = static_cast<double>(i) / shapes.size();
    const double elapsed = gemmlowp::real_time_in_seconds() - time_start;
    const double elapsed_hours = elapsed / 3600.;
    // Guard against division by zero on the very first iteration.
    const double eta_hours =
        ratio > 0 ? elapsed_hours * (1. - ratio) / ratio : 0.;
| fprintf(stderr, |
| "Benchmarking: %.2f%% done, Elapsed: %.2f hours, ETA: %.2f " |
| "hours... \r", |
| 100. * ratio, elapsed_hours, eta_hours); |
| fflush(stderr); |
| const Shape& shape = shapes[i]; |
| float latency = benchmark(shape); |
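    // Each shape is benchmarked kPasses times (in different random orders);
    // keep the best (minimum) latency as the least noisy estimate.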
| if (results->count(shape)) { |
| (*results)[shape] = std::min(latency, (*results)[shape]); |
| } else { |
| (*results)[shape] = latency; |
| } |
| } |
| fprintf(stderr, "\n"); |
| } |
| |
| int main() { |
| std::map<Shape, float> results; |
| run_benchmarks(&results); |
| printf("Using %d thread(s)\n", kNumThreads); |
| printf("depth,rows,cols,latency(s),Gop/s\n"); |
| for (const auto& result : results) { |
| const Shape& shape = result.first; |
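    // Each of the rows * cols output elements takes depth multiply-adds,
    // i.e. 2 * depth ops, hence 2e-9 * depth * rows * cols / latency Gop/s.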
| printf("%d,%d,%d,%.4g,%.4g\n", shape.depth, shape.rows, shape.cols, |
| result.second, |
| 2e-9 * shape.depth * shape.rows * shape.cols / result.second); |
| } |
| } |