1 files changed, 110 insertions, 72 deletions
diff --git a/src/bench/bench.cpp b/src/bench/bench.cpp
index 1482452814..edbad09ebd 100644
--- a/src/bench/bench.cpp
+++ b/src/bench/bench.cpp
@@ -8,98 +8,136 @@
 #include <assert.h>
 #include <iostream>
 #include <iomanip>
+#include <algorithm>
+#include <regex>
+#include <numeric>
 
-benchmark::BenchRunner::BenchmarkMap &benchmark::BenchRunner::benchmarks() {
-    static std::map<std::string, benchmark::BenchFunction> benchmarks_map;
-    return benchmarks_map;
+void benchmark::ConsolePrinter::header()
+{ // Writes the column-title line of the console report to stdout (std::endl also flushes).
+    std::cout << "# Benchmark, evals, iterations, total, min, max, median" << std::endl;
 }
 
-benchmark::BenchRunner::BenchRunner(std::string name, benchmark::BenchFunction func)
+void benchmark::ConsolePrinter::result(const State& state) // Prints one comma-separated row: name, evals, iterations, total, min, max, median.
 {
-    benchmarks().insert(std::make_pair(name, func));
+    auto results = state.m_elapsed_results; // work on a copy: `state` is const and the statistics need sorted samples
+    std::sort(results.begin(), results.end());
+
+    double total = state.m_num_iters * std::accumulate(results.begin(), results.end(), 0.0); // sum of the samples scaled by the iteration count
+
+    double front = 0;
+    double back = 0;
+    double median = 0;
+
+    if (!results.empty()) { // with no samples all three statistics stay 0
+        front = results.front(); // smallest sample
+        back = results.back();   // largest sample
+
+        size_t mid = results.size() / 2;
+        median = results[mid]; // odd count: the single middle sample
+        if (0 == results.size() % 2) {
+            median = (results[mid - 1] + results[mid]) / 2; // even count: mean of the two middle samples; mid >= 1 here, whereas mid + 1 could index past the end
+        }
+    }
+
+    std::cout << std::setprecision(6);
+    std::cout << state.m_name << ", " << state.m_num_evals << ", " << state.m_num_iters << ", " << total << ", " << front << ", " << back << ", " << median << std::endl;
 }
 
-void
-benchmark::BenchRunner::RunAll(benchmark::duration elapsedTimeForOne)
+void benchmark::ConsolePrinter::footer() {} // Nothing is printed after the result rows.
+benchmark::PlotlyPrinter::PlotlyPrinter(std::string plotly_url, int64_t width, int64_t height)
+    : m_plotly_url(plotly_url), m_width(width), m_height(height) // copies the script URL and the plot dimensions into the members
 {
-    perf_init();
-    if (std::ratio_less_equal<benchmark::clock::period, std::micro>::value) {
-        std::cerr << "WARNING: Clock precision is worse than microsecond - benchmarks may be less accurate!\n";
-    }
-    std::cout << "#Benchmark" << "," << "count" << "," << "min(ns)" << "," << "max(ns)" << "," << "average(ns)" << ","
-              << "min_cycles" << "," << "max_cycles" << "," << "average_cycles" << "\n";
+}
 
-    for (const auto &p: benchmarks()) {
-        State state(p.first, elapsedTimeForOne);
-        p.second(state);
-    }
-    perf_fini();
+void benchmark::PlotlyPrinter::header()
+{ // Writes the opening part of the HTML report to stdout.
+    std::cout << "<html><head>"
+              << "<script src=\"" << m_plotly_url << "\"></script>" // loads the script from m_plotly_url
+              << "</head><body><div id=\"myDiv\" style=\"width:" << m_width << "px; height:" << m_height << "px\"></div>" // m_width / m_height are emitted as pixel sizes
+              << "<script> var data = [" // leaves the JS `data` array literal open
+              << std::endl;
 }
 
-bool benchmark::State::KeepRunning()
+void benchmark::PlotlyPrinter::result(const State& state)
 {
-    if (count & countMask) {
-      ++count;
-      return true;
+    std::cout << "{ " << std::endl // emits one JS object literal: the benchmark name plus all elapsed samples as `y`
+              << "  name: '" << state.m_name << "', " << std::endl // NOTE(review): m_name is written unescaped inside single quotes
+              << "  y: [";
+
+    const char* prefix = ""; // empty before the first sample, ", " afterwards, so values are comma-separated
+    for (const auto& e : state.m_elapsed_results) {
+        std::cout << prefix << std::setprecision(6) << e;
+        prefix = ", ";
     }
-    time_point now;
+    std::cout << "]," << std::endl
+              << "  boxpoints: 'all', jitter: 0.3, pointpos: 0, type: 'box',"
+              << std::endl
+              << "}," << std::endl; // the trailing comma is emitted after every object, including the last
+}
+
+void benchmark::PlotlyPrinter::footer()
+{ // Writes the closing `];`, the layout object, the Plotly.newPlot call and the closing HTML tags.
+    std::cout << "]; var layout = { showlegend: false, yaxis: { rangemode: 'tozero', autorange: true } };"
+                 "Plotly.newPlot('myDiv', data, layout);"
+                 "</script></body></html>";
+}
 
-    uint64_t nowCycles;
-    if (count == 0) {
-        lastTime = beginTime = now = clock::now();
-        lastCycles = beginCycles = nowCycles = perf_cpucycles();
+
+benchmark::BenchRunner::BenchmarkMap& benchmark::BenchRunner::benchmarks()
+{ // Returns the function-local static map of registered benchmarks, keyed by name.
+    static std::map<std::string, Bench> registry;
+    return registry;
+}
+
+benchmark::BenchRunner::BenchRunner(std::string name, benchmark::BenchFunction func, uint64_t num_iters_for_one_second)
+{ // Registers the benchmark under `name`; an already-registered name is left untouched.
+    benchmarks().emplace(name, Bench{func, num_iters_for_one_second});
+}
+
+void benchmark::BenchRunner::RunAll(Printer& printer, uint64_t num_evals, double scaling, const std::string& filter, bool is_list_only)
+{ // Runs every registered benchmark whose name matches `filter` and reports each one through `printer`.
+    perf_init();
+    if (!std::ratio_less_equal<benchmark::clock::period, std::micro>::value) { // true when one clock tick is longer than a microsecond
+        std::cerr << "WARNING: Clock precision is worse than microsecond - benchmarks may be less accurate!\n";
     }
-    else {
-        now = clock::now();
-        auto elapsed = now - lastTime;
-        auto elapsedOne = elapsed / (countMask + 1);
-        if (elapsedOne < minTime) minTime = elapsedOne;
-        if (elapsedOne > maxTime) maxTime = elapsedOne;
-
-        // We only use relative values, so don't have to handle 64-bit wrap-around specially
-        nowCycles = perf_cpucycles();
-        uint64_t elapsedOneCycles = (nowCycles - lastCycles) / (countMask + 1);
-        if (elapsedOneCycles < minCycles) minCycles = elapsedOneCycles;
-        if (elapsedOneCycles > maxCycles) maxCycles = elapsedOneCycles;
-
-        if (elapsed*128 < maxElapsed) {
-          // If the execution was much too fast (1/128th of maxElapsed), increase the count mask by 8x and restart timing.
-          // The restart avoids including the overhead of this code in the measurement.
-          countMask = ((countMask<<3)|7) & ((1LL<<60)-1);
-          count = 0;
-          minTime = duration::max();
-          maxTime = duration::zero();
-          minCycles = std::numeric_limits<uint64_t>::max();
-          maxCycles = std::numeric_limits<uint64_t>::min();
-          return true;
+
+    std::regex reFilter(filter); // NOTE(review): throws std::regex_error on an invalid pattern, after perf_init() has already run
+    std::smatch baseMatch;
+
+    printer.header();
+
+    for (const auto& p : benchmarks()) {
+        if (!std::regex_match(p.first, baseMatch, reFilter)) { // regex_match requires the whole name to match, not a substring
+            continue;
+        }
+
+        uint64_t num_iters = static_cast<uint64_t>(p.second.num_iters_for_one_second * scaling); // scaled iteration count; the cast drops the fractional part
+        if (0 == num_iters) { // never run with zero iterations
+            num_iters = 1;
         }
-        if (elapsed*16 < maxElapsed) {
-          uint64_t newCountMask = ((countMask<<1)|1) & ((1LL<<60)-1);
-          if ((count & newCountMask)==0) {
-              countMask = newCountMask;
-          }
+        State state(p.first, num_evals, num_iters, printer);
+        if (!is_list_only) { // list-only mode skips execution; the entry is still reported below
+            p.second.func(state);
         }
+        printer.result(state);
     }
-    lastTime = now;
-    lastCycles = nowCycles;
-    ++count;
 
-    if (now - beginTime < maxElapsed) return true; // Keep going
+    printer.footer();
 
-    --count;
+    perf_fini();
+}
 
-    assert(count != 0 && "count == 0 => (now == 0 && beginTime == 0) => return above");
+bool benchmark::State::UpdateTimer(const benchmark::time_point current_time)
+{ // Records the elapsed time of the evaluation that just ended (if any); returns false once m_num_evals samples exist.
+    if (m_start_time != time_point()) { // nothing is recorded while m_start_time still equals a default-constructed time_point
+        std::chrono::duration<double> diff = current_time - m_start_time; // duration<double> with the default period, i.e. seconds
+        m_elapsed_results.push_back(diff.count() / m_num_iters); // stored sample is the time per iteration
 
-    // Output results
-    // Duration casts are only necessary here because hardware with sub-nanosecond clocks
-    // will lose precision.
-    int64_t min_elapsed = std::chrono::duration_cast<std::chrono::nanoseconds>(minTime).count();
-    int64_t max_elapsed = std::chrono::duration_cast<std::chrono::nanoseconds>(maxTime).count();
-    int64_t avg_elapsed = std::chrono::duration_cast<std::chrono::nanoseconds>((now-beginTime)/count).count();
-    int64_t averageCycles = (nowCycles-beginCycles)/count;
-    std::cout << std::fixed << std::setprecision(15) << name << "," << count << "," << min_elapsed << "," << max_elapsed << "," << avg_elapsed << ","
-              << minCycles << "," << maxCycles << "," << averageCycles << "\n";
-    std::cout.copyfmt(std::ios(nullptr));
+        if (m_elapsed_results.size() == m_num_evals) { // enough samples collected: tell the caller to stop
+            return false;
+        }
+    }
 
-    return false;
+    m_num_iters_left = m_num_iters - 1; // NOTE(review): m_start_time is not updated here; presumably the caller resets it and counts m_num_iters_left down — confirm in bench.h
+    return true;
 }