| // Example command line to build on Android ARM64: |
| /* |
| ~/android/toolchains/r15c-aarch64/bin/aarch64-linux-android-clang++ \ |
| test/benchmark_all_sizes.cc -o /tmp/b -O3 --std=c++11 -fPIE -static \ |
| -DBENCHMARK_QUICK -DBENCHMARK_8bit |
| */ |
| |
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <ctime>
#include <iostream>
#include <map>
#include <random>
#include <set>
#include <tuple>
#include <vector>
| |
| #include "../public/gemmlowp.h" |
| |
#if defined GEMMLOWP_ANDROID && defined GEMMLOWP_ARM_32
// Compilation workaround: some older Android ARM32 toolchains don't
// declare ::round in namespace std.
namespace std {
using ::round;
}
#endif
| |
// Minimum duration of each benchmark measurement.
const double kBenchmarkSecs = 0.1;
| |
| // Sleep time before each benchmark. |
| const int kCooldownBeforeBenchmarkSecs = 0; |
| |
| // Number of benchmark passes. |
| const int kPasses = 4; |
| |
| #ifdef BENCHMARK_NUM_THREADS |
| const int kNumThreads = BENCHMARK_NUM_THREADS; |
| #else |
| const int kNumThreads = 1; |
| #endif |
| |
| namespace gemmlowp { |
| |
// gemmlowp itself doesn't have a Matrix class, only a MatrixMap class,
// since it only maps existing data buffers. In tests, though, we need to
// create and own our own matrices.
| template <typename tScalar, MapOrder tOrder> |
| class Matrix : public MatrixMap<tScalar, tOrder> { |
| public: |
| typedef MatrixMap<tScalar, tOrder> Map; |
| typedef MatrixMap<const tScalar, tOrder> ConstMap; |
| typedef typename Map::Scalar Scalar; |
| static const MapOrder Order = tOrder; |
| using Map::cols_; |
| using Map::data_; |
| using Map::kOrder; |
| using Map::rows_; |
| using Map::stride_; |
| |
| public: |
| Matrix() : Map(nullptr, 0, 0, 0) {} |
| |
| Matrix(int rows, int cols) : Map(nullptr, 0, 0, 0) { Resize(rows, cols); } |
| |
| Matrix(const Matrix& other) : Map(nullptr, 0, 0, 0) { *this = other; } |
| |
| Matrix& operator=(const Matrix& other) { |
| Resize(other.rows_, other.cols_); |
| std::memcpy(data_, other.data_, size() * sizeof(Scalar)); |
| return *this; |
| } |
| |
  friend bool operator==(const Matrix& a, const Matrix& b) {
    return a.rows_ == b.rows_ && a.cols_ == b.cols_ &&
           !std::memcmp(a.data_, b.data_, a.size() * sizeof(Scalar));
  }
| |
| void Resize(int rows, int cols) { |
| rows_ = rows; |
| cols_ = cols; |
| stride_ = kOrder == MapOrder::ColMajor ? rows : cols; |
| storage.resize(size()); |
| data_ = storage.data(); |
| } |
| |
| int size() const { return rows_ * cols_; } |
| |
| Map& map() { return *static_cast<Map*>(this); } |
| |
| ConstMap const_map() const { return ConstMap(data_, rows_, cols_, stride_); } |
| |
| protected: |
| std::vector<Scalar> storage; |
| }; |
| |
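// Fills a matrix with the value 128. With the quantization convention used
// throughout this benchmark (note the -128 lhs/rhs offsets passed to
// GemmWithOutputPipeline below), 128 is the quantized encoding of zero.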
| template <typename MatrixType> |
| void MakeZero(MatrixType* m) { |
| for (int c = 0; c < m->cols(); c++) { |
| for (int r = 0; r < m->rows(); r++) { |
| (*m)(r, c) = 128; |
| } |
| } |
| } |
| |
| } // end namespace gemmlowp |
| |
| template <typename BitDepthParams> |
| float benchmark_8bit(int rows, int depth, int cols) { |
| using namespace gemmlowp; |
| typedef Matrix<std::uint8_t, MapOrder::RowMajor> LhsType; |
| typedef Matrix<std::uint8_t, MapOrder::ColMajor> RhsType; |
| typedef Matrix<std::uint8_t, MapOrder::ColMajor> ResultType; |
| |
| LhsType lhs; |
| RhsType rhs; |
| ResultType result; |
| lhs.Resize(rows, depth); |
| rhs.Resize(depth, cols); |
| result.Resize(rows, cols); |
| MakeZero(&lhs); |
| MakeZero(&rhs); |
| MakeZero(&result); |
| |
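  // Output pipeline: quantize the int32 accumulators back down to uint8.
  // OutputStageQuantizeDownInt32ToUint8ScaleByFixedPoint computes
  //   RoundingDivideByPOT(
  //       SaturatingRoundingDoublingHighMul(acc, result_fixedpoint_multiplier),
  //       result_shift) + result_offset_after_shift,
  // and OutputStageSaturatingCastToUint8 clamps that to [0, 255]. The
  // particular constants used below are arbitrary; they only need to
  // exercise this code path.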
| gemmlowp::OutputStageQuantizeDownInt32ToUint8ScaleByFixedPoint |
| quantize_down_stage; |
| quantize_down_stage.result_offset_after_shift = 128; |
| quantize_down_stage.result_fixedpoint_multiplier = 1234567890; |
| quantize_down_stage.result_shift = 16; |
| gemmlowp::OutputStageSaturatingCastToUint8 saturating_cast_stage; |
| const auto output_pipeline = |
| std::make_tuple(quantize_down_stage, saturating_cast_stage); |
| GemmContext gemm_context; |
| gemm_context.set_max_num_threads(kNumThreads); |
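  // One untimed warm-up Gemm, so that one-time setup costs (thread pool
  // startup, packing buffer allocation) don't pollute the measurement.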
| gemmlowp::GemmWithOutputPipeline<std::uint8_t, std::uint8_t, BitDepthParams>( |
| &gemm_context, lhs.const_map(), rhs.const_map(), &result.map(), -128, |
| -128, output_pipeline); |
| |
| double time_start = real_time_in_seconds(); |
| double t = time_start; |
| int iters = 0; |
| int iters_at_a_time = 1; |
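  // Double the number of iterations between timer reads each time, so that
  // timer overhead is amortized even for very fast (small) shapes.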
| while (t - time_start < kBenchmarkSecs) { |
| for (int i = 0; i < iters_at_a_time; i++) { |
| gemmlowp::GemmWithOutputPipeline<std::uint8_t, std::uint8_t, |
| BitDepthParams>( |
| &gemm_context, lhs.const_map(), rhs.const_map(), &result.map(), -128, |
| -128, output_pipeline); |
| iters++; |
| } |
| iters_at_a_time *= 2; |
| t = real_time_in_seconds(); |
| } |
| return (t - time_start) / iters; |
| } |
| |
| template <typename BitDepthParams> |
| float benchmark_8bit_to_32bit(int rows, int depth, int cols) { |
| using namespace gemmlowp; |
| typedef Matrix<std::uint8_t, MapOrder::RowMajor> LhsType; |
| typedef Matrix<std::uint8_t, MapOrder::ColMajor> RhsType; |
| typedef Matrix<std::int32_t, MapOrder::ColMajor> ResultType; |
| |
| LhsType lhs; |
| RhsType rhs; |
| ResultType result; |
| lhs.Resize(rows, depth); |
| rhs.Resize(depth, cols); |
| result.Resize(rows, cols); |
| MakeZero(&lhs); |
| MakeZero(&rhs); |
| MakeZero(&result); |
| |
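  // An empty output pipeline yields the raw int32 accumulators, with no
  // downquantization stage.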
| typedef std::tuple<> EmptyPipeline; |
| GemmContext gemm_context; |
| gemm_context.set_max_num_threads(kNumThreads); |
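  // Untimed warm-up run, as in benchmark_8bit.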
| gemmlowp::GemmWithOutputPipeline<std::uint8_t, std::int32_t, BitDepthParams>( |
| &gemm_context, lhs.const_map(), rhs.const_map(), &result.map(), -128, |
| -128, EmptyPipeline()); |
| |
| double time_start = real_time_in_seconds(); |
| double t = time_start; |
| int iters = 0; |
| int iters_at_a_time = 1; |
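  // Same geometrically growing measurement loop as in benchmark_8bit.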
| while (t - time_start < kBenchmarkSecs) { |
| for (int i = 0; i < iters_at_a_time; i++) { |
| gemmlowp::GemmWithOutputPipeline<std::uint8_t, std::int32_t, |
| BitDepthParams>( |
| &gemm_context, lhs.const_map(), rhs.const_map(), &result.map(), -128, |
| -128, EmptyPipeline()); |
| iters++; |
| } |
| iters_at_a_time *= 2; |
| t = real_time_in_seconds(); |
| } |
| return (t - time_start) / iters; |
| } |
| |
| struct Shape { |
| int rows; |
| int depth; |
| int cols; |
| }; |
| |
| bool operator==(const Shape& s1, const Shape& s2) { |
| return s1.rows == s2.rows && s1.depth == s2.depth && s1.cols == s2.cols; |
| } |
| |
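// Order shapes by depth first, matching the depth,rows,cols column order of
// the final report.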
bool operator<(const Shape& shape1, const Shape& shape2) {
  return shape1.depth < shape2.depth ||
         (shape1.depth == shape2.depth &&
          (shape1.rows < shape2.rows ||
           (shape1.rows == shape2.rows && shape1.cols < shape2.cols)));
}
| |
#ifdef _WIN32
#include <windows.h>
#define sleep(t) Sleep(1000 * (t))  // Sleep() takes milliseconds.
#else
#include <unistd.h>
#endif
| |
| float benchmark(const Shape& shape) { |
| if (kCooldownBeforeBenchmarkSecs) { |
| sleep(kCooldownBeforeBenchmarkSecs); |
| } |
| #if defined BENCHMARK_8bit |
| // Benchmark the fast 8bit path, using L8R8WithLhsNonzeroBitDepthParams. |
| // This is the recommended thing to default to: it's what most applications |
| // want to use, as it's the fastest. |
| // The contract is that LHS must take values in [1, 255], while RHS can take |
| // any value in [0, 255]. |
| return benchmark_8bit<gemmlowp::L8R8WithLhsNonzeroBitDepthParams>( |
| shape.rows, shape.depth, shape.cols); |
| #elif defined BENCHMARK_8bit_wide |
| // Variant benchmarking the slower (mostly legacy) DefaultL8R8BitDepthParams. |
| // The only contract difference is that both LHS and RHS can take values in |
| // [0, 255]. |
| return benchmark_8bit<gemmlowp::DefaultL8R8BitDepthParams>( |
| shape.rows, shape.depth, shape.cols); |
| #elif defined BENCHMARK_8bit_to_32bit |
  // Variant of BENCHMARK_8bit where the user asks for the raw int32
  // accumulators instead of an 8bit-downscaled result.
| return benchmark_8bit_to_32bit<gemmlowp::L8R8WithLhsNonzeroBitDepthParams>( |
| shape.rows, shape.depth, shape.cols); |
| #elif defined BENCHMARK_8bit_to_32bit_wide |
  // Variant of BENCHMARK_8bit_wide where the user asks for the raw int32
  // accumulators instead of an 8bit-downscaled result.
| return benchmark_8bit_to_32bit<gemmlowp::DefaultL8R8BitDepthParams>( |
| shape.rows, shape.depth, shape.cols); |
| #elif defined BENCHMARK_float |
| return benchmark_float(shape.rows, shape.depth, shape.cols); |
| #else |
| #error What arithmetic path should we benchmark? (Suggestion: #define BENCHMARK_8bit) |
| #endif |
| } |
| |
std::set<int> all_sizes() {
  std::set<int> sizes;
  // Powers of two from 1 to 2048.
  for (int i = 1; i <= 2048; i *= 2) {
    sizes.insert(i);
  }
  // Geometric progression of ratio sqrt(2) from 8 to 2048.
  for (double x = 8; x <= 2048; x *= std::sqrt(2.)) {
    sizes.insert(static_cast<int>(std::round(x)));
  }
  // Finer-grained progression of ratio 2^(1/4) from 16 to 512.
  for (double x = 16; x <= 512; x *= std::pow(2., 1. / 4.)) {
    sizes.insert(static_cast<int>(std::round(x)));
  }
  return sizes;
}
| |
| std::mt19937& RandomEngine() { |
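  // Note: default-seeded, so the shuffled order of shapes is the same on
  // every run, making results reproducible.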
| static std::mt19937 engine; |
| return engine; |
| } |
| |
| std::vector<Shape> all_shapes_in_random_order() { |
| std::vector<Shape> shapes; |
| const std::set<int> sizes = all_sizes(); |
| #if defined BENCHMARK_ROWS |
| // Benchmark one specific shape |
| Shape shape; |
| shape.rows = BENCHMARK_ROWS; |
| shape.depth = BENCHMARK_DEPTH; |
| shape.cols = BENCHMARK_COLS; |
| shapes.push_back(shape); |
| #elif defined BENCHMARK_QUICK |
| // Benchmark an assortment of cubic shapes |
| for (int size : sizes) { |
| Shape shape; |
| shape.rows = size; |
| shape.depth = size; |
| shape.cols = size; |
| shapes.push_back(shape); |
| } |
| #elif defined BENCHMARK_EXHAUSTIVE |
| // Benchmark all sorts of shapes |
| for (int rows : sizes) { |
| for (int depth : sizes) { |
| for (int cols : sizes) { |
| Shape shape; |
| shape.rows = rows; |
| shape.depth = depth; |
| shape.cols = cols; |
| shapes.push_back(shape); |
| } |
| } |
| } |
| #else |
| #error What shapes should we benchmark? (Suggestion: #define BENCHMARK_QUICK) |
| #endif |
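  // Shuffle the shapes so that systematic drift over a long run (e.g.
  // thermal throttling) doesn't consistently bias any particular shape.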
| std::shuffle(std::begin(shapes), std::end(shapes), RandomEngine()); |
| return shapes; |
| } |
| |
| void run_benchmarks(std::map<Shape, float>* results) { |
| std::vector<Shape> shapes; |
| for (int pass = 0; pass < kPasses; pass++) { |
| const std::vector<Shape> pass_shapes = all_shapes_in_random_order(); |
| shapes.insert(std::end(shapes), std::begin(pass_shapes), |
| std::end(pass_shapes)); |
| } |
| |
| const double time_start = gemmlowp::real_time_in_seconds(); |
| for (std::size_t i = 0; i < shapes.size(); i++) { |
    const double ratio = static_cast<double>(i) / shapes.size();
    const double elapsed = gemmlowp::real_time_in_seconds() - time_start;
    const double elapsed_hours = elapsed / 3600.;
    // Guard against division by zero on the very first iteration.
    const double eta_hours =
        ratio > 0 ? elapsed_hours * (1. - ratio) / ratio : 0.;
| fprintf(stderr, |
| "Benchmarking: %.2f%% done, Elapsed: %.2f hours, ETA: %.2f " |
| "hours... \r", |
| 100. * ratio, elapsed_hours, eta_hours); |
| fflush(stderr); |
| const Shape& shape = shapes[i]; |
| float latency = benchmark(shape); |
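    // Each shape is benchmarked kPasses times (in different random orders);
    // keep the best (minimum) latency as the least noisy estimate.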
| if (results->count(shape)) { |
| (*results)[shape] = std::min(latency, (*results)[shape]); |
| } else { |
| (*results)[shape] = latency; |
| } |
| } |
| fprintf(stderr, "\n"); |
| } |
| |
| int main() { |
| std::map<Shape, float> results; |
| run_benchmarks(&results); |
| printf("Using %d thread(s)\n", kNumThreads); |
| printf("depth,rows,cols,latency(s),Gop/s\n"); |
| for (const auto& result : results) { |
| const Shape& shape = result.first; |
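    // Each of the rows * cols output elements takes depth multiply-adds,
    // i.e. 2 * depth ops, hence 2e-9 * depth * rows * cols / latency Gop/s.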
| printf("%d,%d,%d,%.4g,%.4g\n", shape.depth, shape.rows, shape.cols, |
| result.second, |
| 2e-9 * shape.depth * shape.rows * shape.cols / result.second); |
| } |
| } |