test/benchmark.cc - platform/external/gemmlowp - Git at Google

 // Copyright 2015 The Gemmlowp Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //     http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.

 #ifdef __APPLE__
 #include <sys/time.h>
 #endif

 #include <cstdint>
 #include <cstdlib>
 #include <ctime>
 #include <iostream>
 #include <map>
 #include <vector>
 #ifdef __APPLE__
 #include <TargetConditionals.h>
 #endif

 #include "test.h"

 #ifndef GEMMLOWP_TEST_BIT_DEPTH_PARAMS
 #define GEMMLOWP_TEST_BIT_DEPTH_PARAMS DefaultL8R8BitDepthParams
 #endif

 #if defined(__arm__) && !defined(GEMMLOWP_NEON)
 #warning "Building without NEON support on ARM, check your compiler setup!"
 #endif

 #if defined(__mips) && !defined(GEMMLOWP_MSA)
 #warning "Building without MSA support on MIPS, check your compiler setup!"
 #endif

 #if defined(__AVX2__) && !defined(GEMMLOWP_AVX2)
 #warning \
     "Building without AVX2 support on AVX2 enabled machine, check your compiler setup!"
 #endif

 #if defined(__SSE4_2__) && !defined(GEMMLOWP_AVX2) && !defined(GEMMLOWP_SSE4)
 #warning \
     "Building without SSE4.2 support on SSE4.2 enabled machine, check your compiler setup!"
 #endif

 namespace gemmlowp {

 const double min_accurate_duration = 1e-1;
 const std::size_t min_working_set_size = 16 * 1024 * 1024;

 struct gemm_t {
   int rows, depth, cols;
   gemm_t() : rows(0), depth(0), cols(0) {}
   gemm_t(int r, int d, int c) : rows(r), depth(d), cols(c) {}
 };

 bool operator<(const gemm_t& a, const gemm_t& b) {
   return a.rows < b.rows ||
          (a.rows <= b.rows &&
           (a.depth < b.depth || (a.depth <= b.depth && (a.cols < b.cols))));
 }

 template <typename LhsType, typename RhsType, typename ResultType>
 double time_for_gemms(GemmContext* context, const std::vector<gemm_t>& gemms) {
   typedef std::uint8_t Scalar;

   // set up the matrix pool

   std::size_t combined_gemm_sizes = 0;
   for (auto gemm : gemms) {
     int rows = gemm.rows;
     int depth = gemm.depth;
     int cols = gemm.cols;
     combined_gemm_sizes +=
         sizeof(Scalar) * (rows * depth + depth * cols + rows * cols);
   }

   const std::size_t pool_size = 1 + min_working_set_size / combined_gemm_sizes;

   std::vector<LhsType> lhs(pool_size * gemms.size());
   std::vector<RhsType> rhs(pool_size * gemms.size());
   std::vector<ResultType> result(pool_size * gemms.size());

   for (std::size_t i = 0; i < pool_size; i++) {
     for (std::size_t j = 0; j < gemms.size(); j++) {
       int k = i * gemms.size() + j;
       lhs[k].Resize(gemms[j].rows, gemms[j].depth);
       MakeConstant(&lhs[k], 0);
       rhs[k].Resize(gemms[j].depth, gemms[j].cols);
       MakeConstant(&rhs[k], 0);
       result[k].Resize(gemms[j].rows, gemms[j].cols);
       MakeConstant(&result[k], 0);
     }
   }

   // main benchmark loop

   int iters_at_a_time = 1;
   float time_per_iter = 0.0f;
   std::size_t pool_index = 0;

   while (true) {
     double starttime = real_time_in_seconds();
     for (int i = 0; i < iters_at_a_time; i++) {
       for (size_t j = 0; j < gemms.size(); j++) {
         size_t k = pool_index * gemms.size() + j;
         Gemm<std::uint8_t, GEMMLOWP_TEST_BIT_DEPTH_PARAMS>(
             context, lhs[k].const_map(), rhs[k].const_map(), &result[k].map(),
             -75, -91, 74980, 123, 20);
       }
       pool_index++;
       if (pool_index == pool_size) {
         pool_index = 0;
       }
     }
     double endtime = real_time_in_seconds();

     const float timing = static_cast<float>(endtime - starttime);

     if (timing >= min_accurate_duration) {
       time_per_iter = timing / iters_at_a_time;
       break;
     }

     iters_at_a_time *= 2;
   }

   return time_per_iter;
 }

 template <typename LhsType, typename RhsType, typename ResultType>
 double gflops_for_gemms(GemmContext* context,
                         const std::vector<gemm_t>& gemms) {
   const double time_per_iter =
       time_for_gemms<LhsType, RhsType, ResultType>(context, gemms);
   double ops = 0;
   for (auto gemm : gemms) {
     ops += 2.0 * gemm.rows * gemm.depth * gemm.cols;
   }
   return 1e-9 * ops / time_per_iter;
 }

 void benchmark(GemmContext* context) {
   std::map<gemm_t, std::vector<double>> benchmark_results;

   std::vector<gemm_t> benchmark_gemms;
   benchmark_gemms.emplace_back(10, 10, 10);
   benchmark_gemms.emplace_back(20, 20, 20);
   benchmark_gemms.emplace_back(30, 30, 30);
   benchmark_gemms.emplace_back(40, 40, 40);
   benchmark_gemms.emplace_back(50, 50, 50);
   benchmark_gemms.emplace_back(60, 60, 60);
   benchmark_gemms.emplace_back(64, 256, 147);
   benchmark_gemms.emplace_back(100, 100, 1);
   benchmark_gemms.emplace_back(100, 100, 100);
   benchmark_gemms.emplace_back(100, 1000, 100);
   benchmark_gemms.emplace_back(1000, 1000, 1);
   benchmark_gemms.emplace_back(1000, 1000, 10);
   benchmark_gemms.emplace_back(1000, 1000, 100);
   benchmark_gemms.emplace_back(1000, 1000, 1000);

   const int repeat = 2;

   typedef Matrix<std::uint8_t, MapOrder::RowMajor> LhsType;
   typedef Matrix<std::uint8_t, MapOrder::ColMajor> RhsType;
   typedef Matrix<std::uint8_t, MapOrder::ColMajor> ResultType;

 #ifdef GEMMLOWP_TEST_PROFILE
   gemmlowp::RegisterCurrentThreadForProfiling();
   gemmlowp::StartProfiling();
 #endif

   // We don't record the first repetition, it's just warm-up.
   for (int r = 0; r < repeat + 1; r++) {
     std::cout << "repetition " << r + 1 << "/" << repeat + 1 << "...\r"
               << std::flush;
     for (auto gemm : benchmark_gemms) {
       double gflops = 0;
       std::vector<gemm_t> unique_gemm;
       unique_gemm.push_back(gemm);
       gflops =
           gflops_for_gemms<LhsType, RhsType, ResultType>(context, unique_gemm);
       if (r > 0) {
         benchmark_results[gemm].emplace_back(gflops);
       }
     }
   }

 #ifdef GEMMLOWP_TEST_PROFILE
   gemmlowp::FinishProfiling();
 #endif

   std::cout << "                                                \r"
             << std::flush;

   std::cout.precision(4);

   for (auto b : benchmark_results) {
     sort(b.second.begin(), b.second.end());
     std::cout << b.first.rows << "x" << b.first.depth << "x" << b.first.cols
               << " : " << b.second.back() << " GFlops/s" << std::endl;
   }
   std::cout << std::endl;
 }

 void benchmark_gemm_sizes(GemmContext* context,
                           const std::vector<gemm_t>& gemms, double mintime) {
   typedef Matrix<std::uint8_t, MapOrder::RowMajor> LhsType;
   typedef Matrix<std::uint8_t, MapOrder::ColMajor> RhsType;
   typedef Matrix<std::uint8_t, MapOrder::ColMajor> ResultType;

   std::vector<float> gemm_times;
   std::cout << "running for " << mintime << " seconds..." << std::endl;

 #ifdef GEMMLOWP_TEST_PROFILE
   gemmlowp::RegisterCurrentThreadForProfiling();
   gemmlowp::StartProfiling();
 #endif

   double starttime = real_time_in_seconds();
   while (real_time_in_seconds() < starttime + mintime) {
     gemm_times.push_back(
         time_for_gemms<LhsType, RhsType, ResultType>(context, gemms));
   }

 #ifdef GEMMLOWP_TEST_PROFILE
   gemmlowp::FinishProfiling();
 #endif

   std::sort(gemm_times.begin(), gemm_times.end());

   double sum_gemm_times = 0;
   double sum_gemm_times_trimmed = 0;
   int count_gemm_times_trimmed = 0;
   const float trim_ratio = 0.25;
   const size_t count_trimmed = gemm_times.size() * trim_ratio;
   double sum_gemm_times_best = 0;
   int count_gemm_times_best = 0;
   const float best_ratio = 0.1;
   const size_t count_best = gemm_times.size() * best_ratio;

   for (size_t i = 0; i < gemm_times.size(); i++) {
     sum_gemm_times += gemm_times[i];
     if (i >= count_trimmed && i < gemm_times.size() - count_trimmed) {
       sum_gemm_times_trimmed += gemm_times[i];
       count_gemm_times_trimmed++;
     }
     if (i < count_best) {
       sum_gemm_times_best += gemm_times[i];
       count_gemm_times_best++;
     }
   }

   const double min_latency = gemm_times.front();
   const double max_latency = gemm_times.back();
   const double mean_latency = sum_gemm_times / gemm_times.size();
   const double trimmed_mean_latency =
       sum_gemm_times_trimmed / count_gemm_times_trimmed;
   const double best_mean_latency = sum_gemm_times_best / count_gemm_times_best;

   std::cout << "Graph latency (over " << gemm_times.size()
             << " iterations):" << std::endl;
   std::cout << "  Best:             " << min_latency << "s" << std::endl;
   std::cout << "  Worst:            " << max_latency << "s" << std::endl;
   std::cout << "  Mean:             " << mean_latency << "s" << std::endl;
   std::cout << "  " << 100 * trim_ratio
             << "% trimmed mean: " << trimmed_mean_latency << "s" << std::endl;
   std::cout << "  Mean of " << 100 * best_ratio
             << "% best: " << best_mean_latency << "s" << std::endl;
 }

 void benchmark_googlenet(GemmContext* context) {
   // These are the m, n, k sizes for a typical GoogLeNet.
   const int googlenet_gemm_sizes[] = {
       12544, 64,  147, 3136, 64,   64,   3136, 192,  576,  784, 64,   192,
       784,   96,  192, 784,  128,  864,  784,  16,   192,  784, 32,   400,
       784,   32,  192, 784,  128,  256,  784,  128,  256,  784, 192,  1152,
       784,   32,  256, 784,  96,   800,  784,  64,   256,  196, 192,  480,
       196,   96,  480, 196,  204,  864,  196,  16,   480,  196, 48,   400,
       196,   64,  480, 196,  160,  508,  196,  112,  508,  196, 224,  1008,
       196,   24,  508, 196,  64,   600,  196,  64,   508,  196, 128,  512,
       196,   128, 512, 196,  256,  1152, 196,  24,   512,  196, 64,   600,
       196,   64,  512, 196,  112,  512,  196,  144,  512,  196, 288,  1296,
       196,   32,  512, 196,  64,   800,  196,  64,   512,  196, 256,  528,
       196,   160, 528, 196,  320,  1440, 196,  32,   528,  196, 128,  800,
       196,   128, 528, 49,   256,  832,  49,   160,  832,  49,  320,  1440,
       49,    48,  832, 49,   128,  1200, 49,   128,  832,  49,  384,  832,
       49,    192, 832, 49,   384,  1728, 49,   48,   832,  49,  128,  1200,
       49,    128, 832, 16,   128,  508,  1,    1024, 2048, 1,   1008, 1024,
       16,    128, 528, 1,    1024, 2048, 1,    1008, 1024, 1,   1008, 1024,
   };
   assert(sizeof(googlenet_gemm_sizes) % (3 * sizeof(googlenet_gemm_sizes[0])) ==
          0);
   const std::size_t num_googlenet_gemms =
       sizeof(googlenet_gemm_sizes) / (3 * sizeof(googlenet_gemm_sizes[0]));

   std::vector<gemm_t> googlenet_gemms(num_googlenet_gemms);
   for (std::size_t i = 0; i < num_googlenet_gemms; i++) {
     googlenet_gemms[i].rows = googlenet_gemm_sizes[3 * i + 1];
     googlenet_gemms[i].depth = googlenet_gemm_sizes[3 * i + 2];
     googlenet_gemms[i].cols = googlenet_gemm_sizes[3 * i + 0];
   }

   const double mintime = 20.0;
   benchmark_gemm_sizes(context, googlenet_gemms, mintime);
 }

 void benchmark_small_model(GemmContext* context) {
   // These are the m, n, k sizes for a small model with large batches.
   const int small_model_gemm_sizes[] = {
       29232, 16, 25, 7308, 6, 400, 203, 3002, 216,
   };
   assert(sizeof(small_model_gemm_sizes) %
              (3 * sizeof(small_model_gemm_sizes[0])) ==
          0);
   const std::size_t num_small_model_gemms =
       sizeof(small_model_gemm_sizes) / (3 * sizeof(small_model_gemm_sizes[0]));

   std::vector<gemm_t> small_model_gemms(num_small_model_gemms);
   for (std::size_t i = 0; i < num_small_model_gemms; i++) {
     small_model_gemms[i].rows = small_model_gemm_sizes[3 * i + 1];
     small_model_gemms[i].depth = small_model_gemm_sizes[3 * i + 2];
     small_model_gemms[i].cols = small_model_gemm_sizes[3 * i + 0];
   }

   const double mintime = 10.0;
   benchmark_gemm_sizes(context, small_model_gemms, mintime);
 }

 void benchmark_all() {
   {
     gemmlowp::GemmContext context;
     std::cout << "Benchmarking small model GEMMs..." << std::endl;
     gemmlowp::benchmark_small_model(&context);
   }

   {
     gemmlowp::GemmContext context;
     std::cout << "Benchmarking typical GoogLeNet GEMMs..." << std::endl;
     gemmlowp::benchmark_googlenet(&context);
   }

   {
     gemmlowp::GemmContext context;
     context.set_max_num_threads(0);
     std::cout << "Benchmarking multi-threaded mode..." << std::endl;
     gemmlowp::benchmark(&context);
   }

   {
     gemmlowp::GemmContext context;
     context.set_max_num_threads(1);
     std::cout << "Benchmarking single-threaded mode..." << std::endl;
     gemmlowp::benchmark(&context);
   }
 }

 }  // end namespace gemmlowp

 // For iOS, we need to define our own main(), so skip it here.
 #if !(defined(__APPLE__) && (TARGET_OS_IPHONE || TARGET_IPHONE_SIMULATOR))
 int main() { gemmlowp::benchmark_all(); }
 #endif
	// Copyright 2015 The Gemmlowp Authors. All Rights Reserved.
	//
	// Licensed under the Apache License, Version 2.0 (the "License");
	// you may not use this file except in compliance with the License.
	// You may obtain a copy of the License at
	//
	// http://www.apache.org/licenses/LICENSE-2.0
	//
	// Unless required by applicable law or agreed to in writing, software
	// distributed under the License is distributed on an "AS IS" BASIS,
	// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	// See the License for the specific language governing permissions and
	// limitations under the License.

	#ifdef __APPLE__
	#include <sys/time.h>
	#endif

	#include <cstdint>
	#include <cstdlib>
	#include <ctime>
	#include <iostream>
	#include <map>
	#include <vector>
	#ifdef __APPLE__
	#include <TargetConditionals.h>
	#endif

	#include "test.h"

	#ifndef GEMMLOWP_TEST_BIT_DEPTH_PARAMS
	#define GEMMLOWP_TEST_BIT_DEPTH_PARAMS DefaultL8R8BitDepthParams
	#endif

	#if defined(__arm__) && !defined(GEMMLOWP_NEON)
	#warning "Building without NEON support on ARM, check your compiler setup!"
	#endif

	#if defined(__mips) && !defined(GEMMLOWP_MSA)
	#warning "Building without MSA support on MIPS, check your compiler setup!"
	#endif

	#if defined(__AVX2__) && !defined(GEMMLOWP_AVX2)
	#warning \
	"Building without AVX2 support on AVX2 enabled machine, check your compiler setup!"
	#endif

	#if defined(__SSE4_2__) && !defined(GEMMLOWP_AVX2) && !defined(GEMMLOWP_SSE4)
	#warning \
	"Building without SSE4.2 support on SSE4.2 enabled machine, check your compiler setup!"
	#endif

	namespace gemmlowp {

	const double min_accurate_duration = 1e-1;
	const std::size_t min_working_set_size = 16 * 1024 * 1024;

	struct gemm_t {
	int rows, depth, cols;
	gemm_t() : rows(0), depth(0), cols(0) {}
	gemm_t(int r, int d, int c) : rows(r), depth(d), cols(c) {}
	};

	bool operator<(const gemm_t& a, const gemm_t& b) {
	return a.rows < b.rows \|\|
	(a.rows <= b.rows &&
	(a.depth < b.depth \|\| (a.depth <= b.depth && (a.cols < b.cols))));
	}

	template <typename LhsType, typename RhsType, typename ResultType>
	double time_for_gemms(GemmContext* context, const std::vector<gemm_t>& gemms) {
	typedef std::uint8_t Scalar;

	// set up the matrix pool

	std::size_t combined_gemm_sizes = 0;
	for (auto gemm : gemms) {
	int rows = gemm.rows;
	int depth = gemm.depth;
	int cols = gemm.cols;
	combined_gemm_sizes +=
	sizeof(Scalar) * (rows * depth + depth * cols + rows * cols);
	}

	const std::size_t pool_size = 1 + min_working_set_size / combined_gemm_sizes;

	std::vector<LhsType> lhs(pool_size * gemms.size());
	std::vector<RhsType> rhs(pool_size * gemms.size());
	std::vector<ResultType> result(pool_size * gemms.size());

	for (std::size_t i = 0; i < pool_size; i++) {
	for (std::size_t j = 0; j < gemms.size(); j++) {
	int k = i * gemms.size() + j;
	lhs[k].Resize(gemms[j].rows, gemms[j].depth);
	MakeConstant(&lhs[k], 0);
	rhs[k].Resize(gemms[j].depth, gemms[j].cols);
	MakeConstant(&rhs[k], 0);
	result[k].Resize(gemms[j].rows, gemms[j].cols);
	MakeConstant(&result[k], 0);
	}
	}

	// main benchmark loop

	int iters_at_a_time = 1;
	float time_per_iter = 0.0f;
	std::size_t pool_index = 0;

	while (true) {
	double starttime = real_time_in_seconds();
	for (int i = 0; i < iters_at_a_time; i++) {
	for (size_t j = 0; j < gemms.size(); j++) {
	size_t k = pool_index * gemms.size() + j;
	Gemm<std::uint8_t, GEMMLOWP_TEST_BIT_DEPTH_PARAMS>(
	context, lhs[k].const_map(), rhs[k].const_map(), &result[k].map(),
	-75, -91, 74980, 123, 20);
	}
	pool_index++;
	if (pool_index == pool_size) {
	pool_index = 0;
	}
	}
	double endtime = real_time_in_seconds();

	const float timing = static_cast<float>(endtime - starttime);

	if (timing >= min_accurate_duration) {
	time_per_iter = timing / iters_at_a_time;
	break;
	}

	iters_at_a_time *= 2;
	}

	return time_per_iter;
	}

	template <typename LhsType, typename RhsType, typename ResultType>
	double gflops_for_gemms(GemmContext* context,
	const std::vector<gemm_t>& gemms) {
	const double time_per_iter =
	time_for_gemms<LhsType, RhsType, ResultType>(context, gemms);
	double ops = 0;
	for (auto gemm : gemms) {
	ops += 2.0 * gemm.rows * gemm.depth * gemm.cols;
	}
	return 1e-9 * ops / time_per_iter;
	}

	void benchmark(GemmContext* context) {
	std::map<gemm_t, std::vector<double>> benchmark_results;

	std::vector<gemm_t> benchmark_gemms;
	benchmark_gemms.emplace_back(10, 10, 10);
	benchmark_gemms.emplace_back(20, 20, 20);
	benchmark_gemms.emplace_back(30, 30, 30);
	benchmark_gemms.emplace_back(40, 40, 40);
	benchmark_gemms.emplace_back(50, 50, 50);
	benchmark_gemms.emplace_back(60, 60, 60);
	benchmark_gemms.emplace_back(64, 256, 147);
	benchmark_gemms.emplace_back(100, 100, 1);
	benchmark_gemms.emplace_back(100, 100, 100);
	benchmark_gemms.emplace_back(100, 1000, 100);
	benchmark_gemms.emplace_back(1000, 1000, 1);
	benchmark_gemms.emplace_back(1000, 1000, 10);
	benchmark_gemms.emplace_back(1000, 1000, 100);
	benchmark_gemms.emplace_back(1000, 1000, 1000);

	const int repeat = 2;

	typedef Matrix<std::uint8_t, MapOrder::RowMajor> LhsType;
	typedef Matrix<std::uint8_t, MapOrder::ColMajor> RhsType;
	typedef Matrix<std::uint8_t, MapOrder::ColMajor> ResultType;

	#ifdef GEMMLOWP_TEST_PROFILE
	gemmlowp::RegisterCurrentThreadForProfiling();
	gemmlowp::StartProfiling();
	#endif

	// We don't record the first repetition, it's just warm-up.
	for (int r = 0; r < repeat + 1; r++) {
	std::cout << "repetition " << r + 1 << "/" << repeat + 1 << "...\r"
	<< std::flush;
	for (auto gemm : benchmark_gemms) {
	double gflops = 0;
	std::vector<gemm_t> unique_gemm;
	unique_gemm.push_back(gemm);
	gflops =
	gflops_for_gemms<LhsType, RhsType, ResultType>(context, unique_gemm);
	if (r > 0) {
	benchmark_results[gemm].emplace_back(gflops);
	}
	}
	}

	#ifdef GEMMLOWP_TEST_PROFILE
	gemmlowp::FinishProfiling();
	#endif

	std::cout << " \r"
	<< std::flush;

	std::cout.precision(4);

	for (auto b : benchmark_results) {
	sort(b.second.begin(), b.second.end());
	std::cout << b.first.rows << "x" << b.first.depth << "x" << b.first.cols
	<< " : " << b.second.back() << " GFlops/s" << std::endl;
	}
	std::cout << std::endl;
	}

	void benchmark_gemm_sizes(GemmContext* context,
	const std::vector<gemm_t>& gemms, double mintime) {
	typedef Matrix<std::uint8_t, MapOrder::RowMajor> LhsType;
	typedef Matrix<std::uint8_t, MapOrder::ColMajor> RhsType;
	typedef Matrix<std::uint8_t, MapOrder::ColMajor> ResultType;

	std::vector<float> gemm_times;
	std::cout << "running for " << mintime << " seconds..." << std::endl;

	#ifdef GEMMLOWP_TEST_PROFILE
	gemmlowp::RegisterCurrentThreadForProfiling();
	gemmlowp::StartProfiling();
	#endif

	double starttime = real_time_in_seconds();
	while (real_time_in_seconds() < starttime + mintime) {
	gemm_times.push_back(
	time_for_gemms<LhsType, RhsType, ResultType>(context, gemms));
	}

	#ifdef GEMMLOWP_TEST_PROFILE
	gemmlowp::FinishProfiling();
	#endif

	std::sort(gemm_times.begin(), gemm_times.end());

	double sum_gemm_times = 0;
	double sum_gemm_times_trimmed = 0;
	int count_gemm_times_trimmed = 0;
	const float trim_ratio = 0.25;
	const size_t count_trimmed = gemm_times.size() * trim_ratio;
	double sum_gemm_times_best = 0;
	int count_gemm_times_best = 0;
	const float best_ratio = 0.1;
	const size_t count_best = gemm_times.size() * best_ratio;

	for (size_t i = 0; i < gemm_times.size(); i++) {
	sum_gemm_times += gemm_times[i];
	if (i >= count_trimmed && i < gemm_times.size() - count_trimmed) {
	sum_gemm_times_trimmed += gemm_times[i];
	count_gemm_times_trimmed++;
	}
	if (i < count_best) {
	sum_gemm_times_best += gemm_times[i];
	count_gemm_times_best++;
	}
	}

	const double min_latency = gemm_times.front();
	const double max_latency = gemm_times.back();
	const double mean_latency = sum_gemm_times / gemm_times.size();
	const double trimmed_mean_latency =
	sum_gemm_times_trimmed / count_gemm_times_trimmed;
	const double best_mean_latency = sum_gemm_times_best / count_gemm_times_best;

	std::cout << "Graph latency (over " << gemm_times.size()
	<< " iterations):" << std::endl;
	std::cout << " Best: " << min_latency << "s" << std::endl;
	std::cout << " Worst: " << max_latency << "s" << std::endl;
	std::cout << " Mean: " << mean_latency << "s" << std::endl;
	std::cout << " " << 100 * trim_ratio
	<< "% trimmed mean: " << trimmed_mean_latency << "s" << std::endl;
	std::cout << " Mean of " << 100 * best_ratio
	<< "% best: " << best_mean_latency << "s" << std::endl;
	}

	void benchmark_googlenet(GemmContext* context) {
	// These are the m, n, k sizes for a typical GoogLeNet.
	const int googlenet_gemm_sizes[] = {
	12544, 64, 147, 3136, 64, 64, 3136, 192, 576, 784, 64, 192,
	784, 96, 192, 784, 128, 864, 784, 16, 192, 784, 32, 400,
	784, 32, 192, 784, 128, 256, 784, 128, 256, 784, 192, 1152,
	784, 32, 256, 784, 96, 800, 784, 64, 256, 196, 192, 480,
	196, 96, 480, 196, 204, 864, 196, 16, 480, 196, 48, 400,
	196, 64, 480, 196, 160, 508, 196, 112, 508, 196, 224, 1008,
	196, 24, 508, 196, 64, 600, 196, 64, 508, 196, 128, 512,
	196, 128, 512, 196, 256, 1152, 196, 24, 512, 196, 64, 600,
	196, 64, 512, 196, 112, 512, 196, 144, 512, 196, 288, 1296,
	196, 32, 512, 196, 64, 800, 196, 64, 512, 196, 256, 528,
	196, 160, 528, 196, 320, 1440, 196, 32, 528, 196, 128, 800,
	196, 128, 528, 49, 256, 832, 49, 160, 832, 49, 320, 1440,
	49, 48, 832, 49, 128, 1200, 49, 128, 832, 49, 384, 832,
	49, 192, 832, 49, 384, 1728, 49, 48, 832, 49, 128, 1200,
	49, 128, 832, 16, 128, 508, 1, 1024, 2048, 1, 1008, 1024,
	16, 128, 528, 1, 1024, 2048, 1, 1008, 1024, 1, 1008, 1024,
	};
	assert(sizeof(googlenet_gemm_sizes) % (3 * sizeof(googlenet_gemm_sizes[0])) ==
	0);
	const std::size_t num_googlenet_gemms =
	sizeof(googlenet_gemm_sizes) / (3 * sizeof(googlenet_gemm_sizes[0]));

	std::vector<gemm_t> googlenet_gemms(num_googlenet_gemms);
	for (std::size_t i = 0; i < num_googlenet_gemms; i++) {
	googlenet_gemms[i].rows = googlenet_gemm_sizes[3 * i + 1];
	googlenet_gemms[i].depth = googlenet_gemm_sizes[3 * i + 2];
	googlenet_gemms[i].cols = googlenet_gemm_sizes[3 * i + 0];
	}

	const double mintime = 20.0;
	benchmark_gemm_sizes(context, googlenet_gemms, mintime);
	}

	void benchmark_small_model(GemmContext* context) {
	// These are the m, n, k sizes for a small model with large batches.
	const int small_model_gemm_sizes[] = {
	29232, 16, 25, 7308, 6, 400, 203, 3002, 216,
	};
	assert(sizeof(small_model_gemm_sizes) %
	(3 * sizeof(small_model_gemm_sizes[0])) ==
	0);
	const std::size_t num_small_model_gemms =
	sizeof(small_model_gemm_sizes) / (3 * sizeof(small_model_gemm_sizes[0]));

	std::vector<gemm_t> small_model_gemms(num_small_model_gemms);
	for (std::size_t i = 0; i < num_small_model_gemms; i++) {
	small_model_gemms[i].rows = small_model_gemm_sizes[3 * i + 1];
	small_model_gemms[i].depth = small_model_gemm_sizes[3 * i + 2];
	small_model_gemms[i].cols = small_model_gemm_sizes[3 * i + 0];
	}

	const double mintime = 10.0;
	benchmark_gemm_sizes(context, small_model_gemms, mintime);
	}

	void benchmark_all() {
	{
	gemmlowp::GemmContext context;
	std::cout << "Benchmarking small model GEMMs..." << std::endl;
	gemmlowp::benchmark_small_model(&context);
	}

	{
	gemmlowp::GemmContext context;
	std::cout << "Benchmarking typical GoogLeNet GEMMs..." << std::endl;
	gemmlowp::benchmark_googlenet(&context);
	}

	{
	gemmlowp::GemmContext context;
	context.set_max_num_threads(0);
	std::cout << "Benchmarking multi-threaded mode..." << std::endl;
	gemmlowp::benchmark(&context);
	}

	{
	gemmlowp::GemmContext context;
	context.set_max_num_threads(1);
	std::cout << "Benchmarking single-threaded mode..." << std::endl;
	gemmlowp::benchmark(&context);
	}
	}

	} // end namespace gemmlowp

	// For iOS, we need to define our own main(), so skip it here.
	#if !(defined(__APPLE__) && (TARGET_OS_IPHONE \|\| TARGET_IPHONE_SIMULATOR))
	int main() { gemmlowp::benchmark_all(); }
	#endif