From 78c312c983255e15fc274de2368a2ec13ce81cbf Mon Sep 17 00:00:00 2001
From: Martin Ankerl
Date: Sat, 13 Jun 2020 09:37:27 +0200
Subject: Replace current benchmarking framework with nanobench

This replaces the current benchmarking framework with nanobench [1], an
MIT-licensed single-header benchmarking library, of which I am the
author.

This has in my opinion several advantages, especially on Linux:

* fast: Running all benchmarks takes ~6 seconds instead of 4m13s on an
  Intel i7-8700 CPU @ 3.20GHz.

* accurate: I ran e.g. the benchmark for SipHash_32b 10 times and
  calculated standard deviation / mean = coefficient of variation:

  * 0.57% CV for the old benchmarking framework
  * 0.20% CV for nanobench

  So the benchmark results with nanobench seem to vary less than with
  the old framework.

* It automatically determines the runtime based on clock precision, so
  there is no need to specify the number of evaluations.

* Measures instructions, cycles, branches, instructions per cycle, and
  branch misses (Linux only, when performance counters are available).

* Output in markdown table format.

* Warns about an unstable environment (frequency scaling, turbo, ...).

* For better profiling, it is possible to set the environment variable
  NANOBENCH_ENDLESS to force endless running of a particular benchmark
  without the need to recompile. This makes it possible to e.g. run
  "perf top" and look at hotspots.

Here is an example copy & pasted from the terminal output:

| ns/byte | byte/s | err% | ins/byte | cyc/byte | IPC | bra/byte | miss% | total | benchmark
|--------------------:|--------------------:|--------:|----------------:|----------------:|-------:|---------------:|--------:|----------:|:----------
| 2.52 | 396,529,415.94 | 0.6% | 25.42 | 8.02 | 3.169 | 0.06 | 0.0% | 0.03 | `bench/crypto_hash.cpp RIPEMD160`
| 1.87 | 535,161,444.83 | 0.3% | 21.36 | 5.95 | 3.589 | 0.06 | 0.0% | 0.02 | `bench/crypto_hash.cpp SHA1`
| 3.22 | 310,344,174.79 | 1.1% | 36.80 | 10.22 | 3.601 | 0.09 | 0.0% | 0.04 | `bench/crypto_hash.cpp SHA256`
| 2.01 | 496,375,796.23 | 0.0% | 18.72 | 6.43 | 2.911 | 0.01 | 1.0% | 0.00 | `bench/crypto_hash.cpp SHA256D64_1024`
| 7.23 | 138,263,519.35 | 0.1% | 82.66 | 23.11 | 3.577 | 1.63 | 0.1% | 0.00 | `bench/crypto_hash.cpp SHA256_32b`
| 3.04 | 328,780,166.40 | 0.3% | 35.82 | 9.69 | 3.696 | 0.03 | 0.0% | 0.03 | `bench/crypto_hash.cpp SHA512`

[1] https://github.com/martinus/nanobench

* Adds support for asymptotes

This adds support for calculating the asymptotic complexity of a
benchmark. This is similar to #17375, but currently only one asymptote
is supported, and I have added support in the benchmark `ComplexMemPool`
as an example.

Usage is e.g. like this:

```
./bench_bitcoin -filter=ComplexMemPool -asymptote=25,50,100,200,400,600,800
```

This runs the benchmark `ComplexMemPool` several times but with
different complexityN settings. The benchmark can extract that number
and use it accordingly. Here, it's used for `childTxs`.
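To illustrate how a benchmark can pick that number up, here is a rough
sketch of the pattern, not code copied from this patch: it assumes the
single-argument `BENCHMARK` registration macro, that `benchmark::Bench`
aliases nanobench's `Bench` (as used by `RunAll()` in the diff below),
and it elides the actual transaction setup:

```
#include <bench/bench.h>

// Sketch only: size the workload from the -asymptote value when one is set
// (RunAll() calls bench.complexityN(n) before invoking the benchmark),
// otherwise fall back to a fixed default.
static void ComplexMemPool(benchmark::Bench& bench)
{
    int childTxs = 800; // default when run without -asymptote
    if (bench.complexityN() > 1) {
        childTxs = static_cast<int>(bench.complexityN());
    }
    // ... build a package of `childTxs` dependent transactions here (elided) ...
    bench.run([&] {
        // measured work, proportional to childTxs, goes here
    });
}
BENCHMARK(ComplexMemPool);
```

With `-asymptote=25,50,...` the runner invokes the benchmark once per
value and fits the measurements to the complexity curves shown below.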
The output is this:

| complexityN | ns/op | op/s | err% | ins/op | cyc/op | IPC | total | benchmark
|------------:|--------------------:|--------------------:|--------:|----------------:|----------------:|-------:|----------:|:----------
| 25 | 1,064,241.00 | 939.64 | 1.4% | 3,960,279.00 | 2,829,708.00 | 1.400 | 0.01 | `ComplexMemPool`
| 50 | 1,579,530.00 | 633.10 | 1.0% | 6,231,810.00 | 4,412,674.00 | 1.412 | 0.02 | `ComplexMemPool`
| 100 | 4,022,774.00 | 248.58 | 0.6% | 16,544,406.00 | 11,889,535.00 | 1.392 | 0.04 | `ComplexMemPool`
| 200 | 15,390,986.00 | 64.97 | 0.2% | 63,904,254.00 | 47,731,705.00 | 1.339 | 0.17 | `ComplexMemPool`
| 400 | 69,394,711.00 | 14.41 | 0.1% | 272,602,461.00 | 219,014,691.00 | 1.245 | 0.76 | `ComplexMemPool`
| 600 | 168,977,165.00 | 5.92 | 0.1% | 639,108,082.00 | 535,316,887.00 | 1.194 | 1.86 | `ComplexMemPool`
| 800 | 310,109,077.00 | 3.22 | 0.1% | 1,149,134,246.00 | 984,620,812.00 | 1.167 | 3.41 | `ComplexMemPool`

| coefficient | err% | complexity
|--------------:|-------:|------------
| 4.78486e-07 | 4.5% | O(n^2)
| 6.38557e-10 | 21.7% | O(n^3)
| 3.42338e-05 | 38.0% | O(n log n)
| 0.000313914 | 46.9% | O(n)
| 0.0129823 | 114.4% | O(log n)
| 0.0815055 | 133.8% | O(1)

The best fitting curve is O(n^2), so the algorithm seems to scale
quadratically with `childTxs` in the range 25 to 800.
---
 src/bench/bench.cpp | 140 ++++++++++++++--------------------------------------
 1 file changed, 36 insertions(+), 104 deletions(-)

(limited to 'src/bench/bench.cpp')

diff --git a/src/bench/bench.cpp b/src/bench/bench.cpp
index 7b93ef688d..01466d0b6f 100644
--- a/src/bench/bench.cpp
+++ b/src/bench/bench.cpp
@@ -8,141 +8,73 @@
 #include
 #include
-#include
-#include
-#include
-#include
-#include
 #include

 const std::function<void(const std::string&)> G_TEST_LOG_FUN{};

-void benchmark::ConsolePrinter::header()
-{
-    std::cout << "# Benchmark, evals, iterations, total, min, max, median" << std::endl;
-}
+namespace {

-void benchmark::ConsolePrinter::result(const State& state)
+void GenerateTemplateResults(const std::vector<ankerl::nanobench::Result>& benchmarkResults, const std::string& filename, const char* tpl)
 {
-    auto results = state.m_elapsed_results;
-    std::sort(results.begin(), results.end());
-
-    double total = state.m_num_iters * std::accumulate(results.begin(), results.end(), 0.0);
-
-    double front = 0;
-    double back = 0;
-    double median = 0;
-
-    if (!results.empty()) {
-        front = results.front();
-        back = results.back();
-
-        size_t mid = results.size() / 2;
-        median = results[mid];
-        if (0 == results.size() % 2) {
-            median = (results[mid] + results[mid + 1]) / 2;
-        }
+    if (benchmarkResults.empty() || filename.empty()) {
+        // nothing to write, bail out
+        return;
     }
-
-    std::cout << std::setprecision(6);
-    std::cout << state.m_name << ", " << state.m_num_evals << ", " << state.m_num_iters << ", " << total << ", " << front << ", " << back << ", " << median << std::endl;
-}
-
-void benchmark::ConsolePrinter::footer() {}
-benchmark::PlotlyPrinter::PlotlyPrinter(std::string plotly_url, int64_t width, int64_t height)
-    : m_plotly_url(plotly_url), m_width(width), m_height(height)
-{
-}
-
-void benchmark::PlotlyPrinter::header()
-{
-    std::cout << ""
-              << ""
-              << ""
-              << "";
+    std::cout << "Created '" << filename << "'" << std::endl;
 }
+} // namespace

 benchmark::BenchRunner::BenchmarkMap& benchmark::BenchRunner::benchmarks()
 {
-    static std::map<std::string, Bench> benchmarks_map;
+    static std::map<std::string, BenchFunction> benchmarks_map;
     return benchmarks_map;
 }

-benchmark::BenchRunner::BenchRunner(std::string name, benchmark::BenchFunction func, uint64_t num_iters_for_one_second)
+benchmark::BenchRunner::BenchRunner(std::string name, benchmark::BenchFunction func)
 {
-    benchmarks().insert(std::make_pair(name, Bench{func, num_iters_for_one_second}));
+    benchmarks().insert(std::make_pair(name, func));
 }

-void benchmark::BenchRunner::RunAll(Printer& printer, uint64_t num_evals, double scaling, const std::string& filter, bool is_list_only)
+void benchmark::BenchRunner::RunAll(const Args& args)
 {
-    if (!std::ratio_less_equal::value) {
-        std::cerr << "WARNING: Clock precision is worse than microsecond - benchmarks may be less accurate!\n";
-    }
-#ifdef DEBUG
-    std::cerr << "WARNING: This is a debug build - may result in slower benchmarks.\n";
-#endif
-
-    std::regex reFilter(filter);
+    std::regex reFilter(args.regex_filter);
     std::smatch baseMatch;

-    printer.header();
-
+    std::vector<ankerl::nanobench::Result> benchmarkResults;
     for (const auto& p : benchmarks()) {
         if (!std::regex_match(p.first, baseMatch, reFilter)) {
             continue;
         }

-        uint64_t num_iters = static_cast<uint64_t>(p.second.num_iters_for_one_second * scaling);
-        if (0 == num_iters) {
-            num_iters = 1;
-        }
-        State state(p.first, num_evals, num_iters, printer);
-        if (!is_list_only) {
-            p.second.func(state);
+        if (args.is_list_only) {
+            std::cout << p.first << std::endl;
+            continue;
         }

-        printer.result(state);
-    }
-
-    printer.footer();
-}
-
-bool benchmark::State::UpdateTimer(const benchmark::time_point current_time)
-{
-    if (m_start_time != time_point()) {
-        std::chrono::duration diff = current_time - m_start_time;
-        m_elapsed_results.push_back(diff.count() / m_num_iters);
-        if (m_elapsed_results.size() == m_num_evals) {
-            return false;
+        Bench bench;
+        bench.name(p.first);
+        if (args.asymptote.empty()) {
+            p.second(bench);
+        } else {
+            for (auto n : args.asymptote) {
+                bench.complexityN(n);
+                p.second(bench);
+            }
+            std::cout << bench.complexityBigO() << std::endl;
         }
+        benchmarkResults.push_back(bench.results().back());
     }
-    m_num_iters_left = m_num_iters - 1;
-    return true;
+    GenerateTemplateResults(benchmarkResults, args.output_csv, "# Benchmark, evals, iterations, total, min, max, median\n"
+                                                               "{{#result}}{{name}}, {{epochs}}, {{average(iterations)}}, {{sumProduct(iterations, elapsed)}}, {{minimum(elapsed)}}, {{maximum(elapsed)}}\n"
+                                                               "{{/result}}");
+    GenerateTemplateResults(benchmarkResults, args.output_json, ankerl::nanobench::templates::json());
 }
--
cgit v1.2.3