2 changes: 2 additions & 0 deletions benchmarks/cuda_bindings/README.md
@@ -47,12 +47,14 @@ To run the benchmarks, combine the environment and task:
```bash
# Run the Python benchmarks in the wheel environment
pixi run -e wheel bench
pixi run -e wheel bench --min-time 0.1

# Run the Python benchmarks in the source environment
pixi run -e source bench

# Run the C++ benchmarks
pixi run -e wheel bench-cpp
pixi run -e wheel bench-cpp --min-time 0.1
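# The other calibration flags added in this change can be passed the same way
# (illustrative invocation; assumes extra flags are forwarded to the C++ runner
# just like --min-time above).
pixi run -e wheel bench-cpp --min-time 0.1 --max-loops 200000000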
```

Both runners automatically save results to JSON files in the benchmarks
5 changes: 5 additions & 0 deletions benchmarks/cuda_bindings/benchmarks/cpp/bench_event.cpp
@@ -45,6 +45,11 @@ int main(int argc, char** argv) {
check_cu(cuStreamSynchronize(stream), "cuStreamSynchronize failed");

bench::BenchmarkSuite suite(options);
// Drain the persistent stream after calibration so event_record (which
// enqueues onto the stream) and event_synchronize start from a known state.
suite.set_post_calibrate([&]() {
check_cu(cuStreamSynchronize(stream), "post-calibrate sync failed");
});

// --- event_create_destroy ---
{
6 changes: 6 additions & 0 deletions benchmarks/cuda_bindings/benchmarks/cpp/bench_launch.cpp
@@ -238,6 +238,12 @@ int main(int argc, char** argv) {
void* struct_params[] = {&struct_2048B};

bench::BenchmarkSuite suite(options);
// After calibration, drain the persistent stream so the first measured
// sample does not start on a backlogged stream. Calibration for enqueue-
// style ops (kernel launches) may queue many thousands of operations.
suite.set_post_calibrate([&]() {
check_cu(cuStreamSynchronize(stream), "post-calibrate sync failed");
});

suite.run("launch.launch_empty_kernel", [&]() {
check_cu(cuLaunchKernel(empty_kernel, 1, 1, 1, 1, 1, 1, 0, stream, nullptr, nullptr),
5 changes: 5 additions & 0 deletions benchmarks/cuda_bindings/benchmarks/cpp/bench_memory.cpp
@@ -52,6 +52,11 @@ int main(int argc, char** argv) {
uint8_t host_dst[COPY_SIZE] = {};

bench::BenchmarkSuite suite(options);
// Drain the persistent stream after calibration so async benchmarks
// (mem_alloc_async_free_async) don't start measurement on a backlogged stream.
suite.set_post_calibrate([&]() {
check_cu(cuStreamSynchronize(stream), "post-calibrate sync failed");
});

// --- mem_alloc_free ---
{
6 changes: 6 additions & 0 deletions benchmarks/cuda_bindings/benchmarks/cpp/bench_stream.cpp
@@ -38,6 +38,12 @@ int main(int argc, char** argv) {
check_cu(cuStreamCreate(&stream, CU_STREAM_NON_BLOCKING), "cuStreamCreate failed");

bench::BenchmarkSuite suite(options);
// Drain the persistent stream after calibration for completeness.
// stream_create_destroy uses a local stream, but stream_query/synchronize
// observe the persistent one.
suite.set_post_calibrate([&]() {
check_cu(cuStreamSynchronize(stream), "post-calibrate sync failed");
});

// --- stream_create_destroy ---
{
166 changes: 160 additions & 6 deletions benchmarks/cuda_bindings/benchmarks/cpp/bench_support.hpp
@@ -6,10 +6,12 @@

#include <algorithm>
#include <chrono>
#include <cmath>
#include <cstdint>
#include <cstdlib>
#include <ctime>
#include <fstream>
#include <functional>
#include <iomanip>
#include <iostream>
#include <string>
@@ -22,6 +24,12 @@ struct Options {
std::uint64_t warmups = 5;
std::uint64_t values = 20;
std::uint64_t runs = 20;
double min_time_sec = 0.0;
// Safety cap for the calibration doubling loop. Set high enough that even
// sub-nanosecond ops can reach typical --min-time targets (e.g. 100ms).
// A warning is printed if calibration hits this cap before reaching min-time.
std::uint64_t max_loops = 100000000;
std::uint64_t calibrate_rounds = 3;
std::string output_path;
std::string benchmark_name;
};
@@ -46,6 +54,18 @@ inline Options parse_args(int argc, char** argv) {
options.warmups = std::strtoull(argv[++i], nullptr, 10);
continue;
}
if (arg == "--min-time" && i + 1 < argc) {
options.min_time_sec = std::strtod(argv[++i], nullptr);
continue;
}
if (arg == "--max-loops" && i + 1 < argc) {
options.max_loops = std::strtoull(argv[++i], nullptr, 10);
continue;
}
if (arg == "--calibrate-rounds" && i + 1 < argc) {
options.calibrate_rounds = std::strtoull(argv[++i], nullptr, 10);
continue;
}
if (arg == "--values" && i + 1 < argc) {
options.values = std::strtoull(argv[++i], nullptr, 10);
continue;
@@ -68,6 +88,9 @@
<< " --warmups N Warmup values per run (default: 5)\n"
<< " --values N Timed values per run (default: 20)\n"
<< " --runs N Number of runs (default: 20)\n"
<< " --min-time S Calibrate loops to reach S seconds per value\n"
<< " --max-loops N Safety cap for calibration loop count (default: 100000000)\n"
<< " --calibrate-rounds N Calibration passes (default: 3)\n"
<< " -o, --output F Write pyperf-compatible JSON to file\n"
<< " --name S Benchmark name (overrides default)\n";
std::exit(0);
@@ -93,6 +116,70 @@ inline std::string iso_now() {
return std::string(buf);
}

// Calibrate loop count to hit a minimum wall time per value.
// Returns the chosen loop count. If `capped_out` is non-null, it is set to
// true when calibration reached `max_loops` before hitting `min_time_sec`
// (meaning --min-time was NOT actually satisfied by the calibration).
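// Example (illustrative arithmetic, not from the source): a ~50 ns operation
// starting from 1 loop doubles up to 2^21 = 2,097,152 loops, since
// 2,097,152 * 50 ns ~= 0.105 s satisfies --min-time 0.1, whereas 2^20 loops
// (~0.052 s) does not.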
template <typename Fn>
std::uint64_t calibrate_loops(
const Options& options,
Fn&& fn,
bool* capped_out = nullptr,
double* last_elapsed_out = nullptr
) {
if (options.min_time_sec <= 0.0) {
if (capped_out) *capped_out = false;
if (last_elapsed_out) *last_elapsed_out = 0.0;
return options.loops;
}

// Allow callers (e.g. the explicit-loop overload) to request a minimum
// starting loop count via options.loops.
const std::uint64_t start_loops = std::max<std::uint64_t>(1, options.loops);
std::uint64_t best = start_loops;
const std::uint64_t max_loops = std::max<std::uint64_t>(start_loops, options.max_loops);
const std::uint64_t rounds = std::max<std::uint64_t>(1, options.calibrate_rounds);

bool capped = false;
double last_elapsed = 0.0;

for (std::uint64_t round = 0; round < rounds; ++round) {
std::uint64_t loops = start_loops;
double elapsed = 0.0;

while (true) {
const auto t0 = std::chrono::steady_clock::now();
for (std::uint64_t i = 0; i < loops; ++i) {
fn();
}
const auto t1 = std::chrono::steady_clock::now();
elapsed = std::chrono::duration<double>(t1 - t0).count();

if (elapsed >= options.min_time_sec) {
break;
}
if (loops >= max_loops) {
capped = true;
break;
}
if (loops > max_loops / 2) {
loops = max_loops;
} else {
loops *= 2;
}
}

if (loops > best) {
best = loops;
}
last_elapsed = elapsed;
}

if (capped_out) *capped_out = capped;
if (last_elapsed_out) *last_elapsed_out = last_elapsed;
return best;
}

// Run a benchmark function. The function signature is: void fn() — one call = one operation.
// The harness calls fn() in a tight loop `loops` times per value.
template <typename Fn>
@@ -235,22 +322,59 @@ class BenchmarkSuite {
public:
explicit BenchmarkSuite(Options options) : options_(std::move(options)) {}

// Post-calibration hook. If set, invoked after calibration and before the
// first measured warmup/value, for every benchmark in this suite. Intended
// for async benchmarks that need to drain state left behind by calibration
// (e.g. cuStreamSynchronize on a persistent stream). Can be overridden
// per-call via the `post_calibrate` parameter on `run()`.
void set_post_calibrate(std::function<void()> hook) {
post_calibrate_ = std::move(hook);
}

// Run a benchmark and record it. The name is used as the benchmark ID.
// If --min-time is set, loop count is auto-calibrated. `post_calibrate`,
// if provided, runs after calibration and before measurement.
template <typename Fn>
void run(const std::string& name, Fn&& fn) {
auto results = run_benchmark(options_, std::forward<Fn>(fn));
void run(
const std::string& name,
Fn&& fn,
std::function<void()> post_calibrate = {}
) {
std::uint64_t loops = options_.loops;
Options custom = options_;
if (options_.min_time_sec > 0.0) {
loops = calibrate_and_warn(name, options_, fn);
custom.loops = loops;
invoke_post_calibrate(post_calibrate);
}
auto results = run_benchmark(custom, std::forward<Fn>(fn));
print_summary(name, results);
entries_.push_back({name, options_.loops, std::move(results)});
entries_.push_back({name, loops, std::move(results)});
}

// Run a benchmark with a custom loop count (for slow operations like compilation).
// Run a benchmark with a custom loop count (used as a floor for fast ops
// or a fixed count for slow ops like compilation). When --min-time is set,
// calibration still runs but starts from `loops_override` as the minimum.
template <typename Fn>
void run(const std::string& name, std::uint64_t loops_override, Fn&& fn) {
void run(
const std::string& name,
std::uint64_t loops_override,
Fn&& fn,
std::function<void()> post_calibrate = {}
) {
std::uint64_t loops = loops_override;
Options custom = options_;
custom.loops = loops_override;
if (options_.min_time_sec > 0.0) {
Options calib_opts = options_;
calib_opts.loops = loops_override; // floor
loops = calibrate_and_warn(name, calib_opts, fn);
custom.loops = loops;
invoke_post_calibrate(post_calibrate);
}
auto results = run_benchmark(custom, std::forward<Fn>(fn));
print_summary(name, results);
entries_.push_back({name, loops_override, std::move(results)});
entries_.push_back({name, loops, std::move(results)});
}

// Write all collected benchmarks to the output file (if -o was given).
@@ -263,6 +387,36 @@
private:
Options options_;
std::vector<BenchmarkEntry> entries_;
std::function<void()> post_calibrate_;

void invoke_post_calibrate(const std::function<void()>& per_call) const {
if (per_call) {
per_call();
} else if (post_calibrate_) {
post_calibrate_();
}
}

template <typename Fn>
std::uint64_t calibrate_and_warn(
const std::string& name,
const Options& calib_opts,
Fn&& fn
) const {
bool capped = false;
double last_elapsed = 0.0;
std::uint64_t loops = calibrate_loops(
calib_opts, std::forward<Fn>(fn), &capped, &last_elapsed
);
if (capped) {
std::cerr << "WARNING: " << name
<< ": calibration hit --max-loops (" << calib_opts.max_loops
<< ") before reaching --min-time (" << calib_opts.min_time_sec
<< "s). Last sample: " << last_elapsed
<< "s. Raise --max-loops to satisfy --min-time for this benchmark.\n";
}
return loops;
}

static void write_multi_pyperf_json(
const std::string& output_path,
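Taken together, a benchmark binary would use the calibration-aware harness roughly as sketched below. This is an illustrative sketch, not part of the change: the benchmark names and lambda bodies are hypothetical, `bench::Options` and `bench::parse_args` are assumed to share the `bench` namespace with `BenchmarkSuite`, and the suite's JSON-writing call is left out because its name is truncated in the hunk above.

```cpp
// Hypothetical benchmark binary exercising the new calibration support.
#include <cstdint>

#include "bench_support.hpp"

int main(int argc, char** argv) {
    // Parse harness flags such as --min-time, --max-loops, --calibrate-rounds.
    bench::Options options = bench::parse_args(argc, argv);  // bench:: qualification assumed

    bench::BenchmarkSuite suite(options);

    // Suite-wide hook: runs after calibration and before measurement for every
    // benchmark below (the bench_*.cpp files use it to drain a persistent stream).
    suite.set_post_calibrate([]() {
        // e.g. cuStreamSynchronize(stream) in the real binaries; placeholder here
    });

    // Fast operation: the loop count is auto-calibrated when --min-time is set.
    suite.run("example.fast_op", []() {
        // one cheap operation per call (placeholder)
    });

    // Slow operation: 5 loops is a fixed count without --min-time, or a floor
    // with it. The explicit std::uint64_t keeps overload resolution unambiguous.
    suite.run("example.slow_op", std::uint64_t{5}, []() {
        // one expensive operation per call, e.g. a compilation (placeholder)
    });

    // Per-call hook: overrides the suite-wide post-calibration hook for this
    // benchmark only.
    suite.run(
        "example.async_op",
        []() { /* enqueue-style operation (placeholder) */ },
        []() { /* benchmark-specific drain after calibration (placeholder) */ });

    return 0;
}
```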
44 changes: 34 additions & 10 deletions benchmarks/cuda_bindings/compare.py
@@ -29,14 +29,27 @@ def load_benchmarks(path: Path) -> dict[str, list[float]]:
name = run.get("metadata", {}).get("name", "")
if name:
break
values = []
values: list[float] = []
for run in bench.get("runs", []):
values.extend(run.get("values", []))
if name and values:
results[name] = values
return results


def stats(values: list[float]) -> tuple[float, float, float, int]:
mean = statistics.mean(values)
stdev = statistics.pstdev(values) if len(values) > 1 else 0.0
rsd = (stdev / mean) if mean else 0.0
return mean, stdev, rsd, len(values)


def fmt_rsd(rsd: float | None) -> str:
if rsd is None:
return "-"
return f"{rsd * 100:.1f}%"


def fmt_ns(seconds: float) -> str:
ns = seconds * 1e9
if ns >= 1000:
@@ -79,13 +92,16 @@ def main() -> None:

# Header
if cpp_benchmarks:
header = f"{'Benchmark':<{name_width}} {'C++ (mean)':>12} {'Python (mean)':>14} {'Overhead':>10}"
header = (
f"{'Benchmark':<{name_width}} {'C++ (mean)':>12} {'C++ RSD':>8} "
f"{'Python (mean)':>14} {'Py RSD':>7} {'Overhead':>10}"
)
sep = "-" * len(header)
print(sep)
print(header)
print(sep)
else:
header = f"{'Benchmark':<{name_width}} {'Python (mean)':>14}"
header = f"{'Benchmark':<{name_width}} {'Python (mean)':>14} {'Py RSD':>7}"
sep = "-" * len(header)
print(sep)
print(header)
@@ -95,21 +111,29 @@
py_vals = py_benchmarks.get(name)
cpp_vals = cpp_benchmarks.get(name)

py_str = fmt_ns(statistics.mean(py_vals)) if py_vals else "-"
cpp_str = fmt_ns(statistics.mean(cpp_vals)) if cpp_vals else "-"
py_stats = stats(py_vals) if py_vals else None
cpp_stats = stats(cpp_vals) if cpp_vals else None

py_str = fmt_ns(py_stats[0]) if py_stats else "-"
cpp_str = fmt_ns(cpp_stats[0]) if cpp_stats else "-"
py_rsd = fmt_rsd(py_stats[2]) if py_stats else "-"
cpp_rsd = fmt_rsd(cpp_stats[2]) if cpp_stats else "-"

if py_vals and cpp_vals:
py_mean = statistics.mean(py_vals)
cpp_mean = statistics.mean(cpp_vals)
if py_stats and cpp_stats:
py_mean = py_stats[0]
cpp_mean = cpp_stats[0]
overhead_ns = (py_mean - cpp_mean) * 1e9
overhead_str = f"+{overhead_ns:.0f} ns"
else:
overhead_str = "-"

if cpp_benchmarks:
print(f"{name:<{name_width}} {cpp_str:>12} {py_str:>14} {overhead_str:>10}")
print(
f"{name:<{name_width}} {cpp_str:>12} {cpp_rsd:>8} "
f"{py_str:>14} {py_rsd:>7} {overhead_str:>10}"
)
else:
print(f"{name:<{name_width}} {py_str:>14}")
print(f"{name:<{name_width}} {py_str:>14} {py_rsd:>7}")

print(sep)
