| #ifndef LLVM_LIBC_BENCHMARKS_LIBC_GPU_BENCHMARK_H |
| #define LLVM_LIBC_BENCHMARKS_LIBC_GPU_BENCHMARK_H |
| |
| #include "benchmarks/gpu/BenchmarkLogger.h" |
| #include "benchmarks/gpu/timing/timing.h" |
| #include "src/__support/CPP/array.h" |
| #include "src/__support/CPP/functional.h" |
| #include "src/__support/CPP/limits.h" |
| #include "src/__support/CPP/string_view.h" |
| #include "src/__support/CPP/type_traits.h" |
| #include "src/__support/FPUtil/FPBits.h" |
| #include "src/__support/macros/config.h" |
| #include "src/stdlib/rand.h" |
| #include "src/time/clock.h" |
| |
| #include <stdint.h> |
| |
| namespace LIBC_NAMESPACE_DECL { |
| |
| namespace benchmarks { |
| |
// Knobs for a benchmark run. Every field has a default, so a
// default-constructed BenchmarkOptions is directly usable.
// NOTE(review): how each limit is applied is decided by benchmark(), whose
// definition is not in this header - confirm the semantics there.
struct BenchmarkOptions {
  // Iteration count limits.
  uint32_t initial_iterations{1};
  uint32_t min_iterations{1};
  uint32_t max_iterations{10000000};

  // Sample count limits.
  uint32_t min_samples{4};
  uint32_t max_samples{1000};

  // Duration limits, expressed in nanoseconds.
  int64_t min_duration{500 * 1000};         // 500 us
  int64_t max_duration{1000 * 1000 * 1000}; // 1 second

  double epsilon{0.0001};
  double scaling_factor{1.4};
};
| |
// A single timed sample: the number of iterations that were executed and the
// number of cycles they took in total.
struct Measurement {
  uint32_t iterations = 0;
  uint64_t elapsed_cycles = 0;
};

// Running estimate of the cycles per iteration, refined with every
// measurement that is fed into it.
class RefinableRuntimeEstimation {
  uint64_t total_cycles = 0;
  // Kept in 64 bits: each Measurement contributes up to UINT32_MAX iterations,
  // so the sum over several samples does not fit in a uint32_t.
  uint64_t total_iterations = 0;

public:
  // Adds M to the running totals and returns total cycles divided by total
  // iterations (integer division). Returns 0 as long as no iteration has been
  // recorded, instead of dividing by zero.
  uint64_t update(const Measurement &M) {
    total_cycles += M.elapsed_cycles;
    total_iterations += M.iterations;
    if (total_iterations == 0)
      return 0;
    return total_cycles / total_iterations;
  }
};

// Tracks the progression of the runtime estimation
class RuntimeEstimationProgression {
  RefinableRuntimeEstimation rre;

public:
  // Most recent estimate returned by the underlying estimator.
  uint64_t current_estimation = 0;

  // Feeds M into the estimator, stores the new estimate in
  // current_estimation and returns the relative change
  //   |previous_estimation / new_estimation - 1|.
  // The result is always finite: when the new estimate is 0 the ratio is
  // undefined, so 0.0 is returned if the previous estimate was 0 as well
  // (nothing changed) and 1.0 otherwise.
  double compute_improvement(const Measurement &M) {
    const uint64_t new_estimation = rre.update(M);
    const uint64_t previous_estimation = current_estimation;
    current_estimation = new_estimation;

    if (new_estimation == 0)
      return previous_estimation == 0 ? 0.0 : 1.0;

    double ratio = (static_cast<double>(previous_estimation) /
                    static_cast<double>(new_estimation)) -
                   1.0;

    // Get absolute value
    if (ratio < 0)
      ratio *= -1;

    return ratio;
  }
};
| |
// Statistics reported for one benchmark. All counters start at zero except
// min, which starts at the largest representable value.
// NOTE(review): the code that fills these fields is outside this header;
// confirm the exact meaning of each field (e.g. whether cycles is a
// per-iteration figure) against it.
struct BenchmarkResult {
  uint64_t cycles{0};
  double standard_deviation{0};
  uint64_t min{UINT64_MAX}; // presumably so that any real sample lowers it
  uint64_t max{0};
  uint32_t samples{0};
  uint32_t total_iterations{0};
  clock_t total_time{0};
};
| |
| BenchmarkResult benchmark(const BenchmarkOptions &options, |
| cpp::function<uint64_t(void)> wrapper_func); |
| |
| class Benchmark { |
| const cpp::function<uint64_t(void)> func; |
| const cpp::string_view suite_name; |
| const cpp::string_view test_name; |
| const uint32_t num_threads; |
| |
| public: |
| Benchmark(cpp::function<uint64_t(void)> func, char const *suite_name, |
| char const *test_name, uint32_t num_threads) |
| : func(func), suite_name(suite_name), test_name(test_name), |
| num_threads(num_threads) { |
| add_benchmark(this); |
| } |
| |
| static void run_benchmarks(); |
| const cpp::string_view get_suite_name() const { return suite_name; } |
| const cpp::string_view get_test_name() const { return test_name; } |
| |
| protected: |
| static void add_benchmark(Benchmark *benchmark); |
| |
| private: |
| BenchmarkResult run() { |
| BenchmarkOptions options; |
| return benchmark(options, func); |
| } |
| }; |
| |
| // We want our random values to be approximately |
| // Output: a random number with the exponent field between min_exp and max_exp, |
| // i.e. 2^min_exp <= |real_value| < 2^(max_exp + 1), |
| // Caveats: |
| // -EXP_BIAS corresponding to denormal values, |
| // EXP_BIAS + 1 corresponding to inf or nan. |
| template <typename T> |
| static T |
| get_rand_input(int max_exp = LIBC_NAMESPACE::fputil::FPBits<T>::EXP_BIAS, |
| int min_exp = -LIBC_NAMESPACE::fputil::FPBits<T>::EXP_BIAS) { |
| using FPBits = LIBC_NAMESPACE::fputil::FPBits<T>; |
| |
| // Required to correctly instantiate FPBits for floats and doubles. |
| using RandType = typename cpp::conditional_t<(cpp::is_same_v<T, double>), |
| uint64_t, uint32_t>; |
| RandType bits; |
| if constexpr (cpp::is_same_v<T, uint64_t>) |
| bits = (static_cast<uint64_t>(LIBC_NAMESPACE::rand()) << 32) | |
| static_cast<uint64_t>(LIBC_NAMESPACE::rand()); |
| else |
| bits = LIBC_NAMESPACE::rand(); |
| double scale = |
| static_cast<double>(max_exp - min_exp + 1) / (2 * FPBits::EXP_BIAS + 1); |
| FPBits fp(bits); |
| fp.set_biased_exponent( |
| static_cast<uint32_t>(fp.get_biased_exponent() * scale + min_exp)); |
| return fp.get_val(); |
| } |
| |
| template <typename T> class MathPerf { |
| using FPBits = fputil::FPBits<T>; |
| using StorageType = typename FPBits::StorageType; |
| static constexpr StorageType UIntMax = |
| cpp::numeric_limits<StorageType>::max(); |
| |
| public: |
| template <size_t N = 1> |
| static uint64_t run_throughput_in_range(T f(T), int min_exp, int max_exp) { |
| cpp::array<T, N> inputs; |
| for (size_t i = 0; i < N; ++i) |
| inputs[i] = get_rand_input<T>(min_exp, max_exp); |
| |
| uint64_t total_time = LIBC_NAMESPACE::throughput(f, inputs); |
| |
| return total_time / N; |
| } |
| |
| // Throughput benchmarking for functions that take 2 inputs. |
| template <size_t N = 1> |
| static uint64_t run_throughput_in_range(T f(T, T), int arg1_min_exp, |
| int arg1_max_exp, int arg2_min_exp, |
| int arg2_max_exp) { |
| cpp::array<T, N> inputs1; |
| cpp::array<T, N> inputs2; |
| for (size_t i = 0; i < N; ++i) { |
| inputs1[i] = get_rand_input<T>(arg1_min_exp, arg1_max_exp); |
| inputs2[i] = get_rand_input<T>(arg2_min_exp, arg2_max_exp); |
| } |
| |
| uint64_t total_time = LIBC_NAMESPACE::throughput(f, inputs1, inputs2); |
| |
| return total_time / N; |
| } |
| }; |
| |
| } // namespace benchmarks |
| } // namespace LIBC_NAMESPACE_DECL |
| |
// Each macro below defines a Benchmark object named
// <SuiteName>_<TestName>_Instance; constructing that object adds the
// benchmark to the set of registered benchmarks.

// Thread count -1: run with as many threads as the user allocated in the
// benchmark's CMake.
#define BENCHMARK(SuiteName, TestName, Func)                                   \
  LIBC_NAMESPACE::benchmarks::Benchmark SuiteName##_##TestName##_Instance(     \
      Func, #SuiteName, #TestName, -1)

// Explicit thread count.
#define BENCHMARK_N_THREADS(SuiteName, TestName, Func, NumThreads)             \
  LIBC_NAMESPACE::benchmarks::Benchmark SuiteName##_##TestName##_Instance(     \
      Func, #SuiteName, #TestName, NumThreads)

// Exactly one thread.
#define SINGLE_THREADED_BENCHMARK(SuiteName, TestName, Func)                   \
  BENCHMARK_N_THREADS(SuiteName, TestName, Func, 1)

// As many threads as LIBC_NAMESPACE::gpu::get_lane_size() reports.
#define SINGLE_WAVE_BENCHMARK(SuiteName, TestName, Func)                       \
  BENCHMARK_N_THREADS(SuiteName, TestName, Func,                               \
                      LIBC_NAMESPACE::gpu::get_lane_size())
| #endif |