diff --git a/benchmarks/cuda_bindings/README.md b/benchmarks/cuda_bindings/README.md
index f8d5ccf043..cffca57bef 100644
--- a/benchmarks/cuda_bindings/README.md
+++ b/benchmarks/cuda_bindings/README.md
@@ -47,12 +47,14 @@ To run the benchmarks combine the environment and task:
 ```bash
 # Run the Python benchmarks in the wheel environment
 pixi run -e wheel bench
+pixi run -e wheel bench --min-time 0.1
 
 # Run the Python benchmarks in the source environment
 pixi run -e source bench
 
 # Run the C++ benchmarks
 pixi run -e wheel bench-cpp
+pixi run -e wheel bench-cpp --min-time 0.1
 ```
 
 Both runners automatically save results to JSON files in the benchmarks
diff --git a/benchmarks/cuda_bindings/benchmarks/cpp/bench_event.cpp b/benchmarks/cuda_bindings/benchmarks/cpp/bench_event.cpp
index 44cd617778..c24aa98319 100644
--- a/benchmarks/cuda_bindings/benchmarks/cpp/bench_event.cpp
+++ b/benchmarks/cuda_bindings/benchmarks/cpp/bench_event.cpp
@@ -45,6 +45,11 @@ int main(int argc, char** argv) {
     check_cu(cuStreamSynchronize(stream), "cuStreamSynchronize failed");
 
     bench::BenchmarkSuite suite(options);
+    // Drain the persistent stream after calibration so event_record (which
+    // enqueues onto the stream) and event_synchronize start from a known state.
+    suite.set_post_calibrate([&]() {
+        check_cu(cuStreamSynchronize(stream), "post-calibrate sync failed");
+    });
 
     // --- event_create_destroy ---
     {
diff --git a/benchmarks/cuda_bindings/benchmarks/cpp/bench_launch.cpp b/benchmarks/cuda_bindings/benchmarks/cpp/bench_launch.cpp
index 984c82fcf3..4897859a61 100644
--- a/benchmarks/cuda_bindings/benchmarks/cpp/bench_launch.cpp
+++ b/benchmarks/cuda_bindings/benchmarks/cpp/bench_launch.cpp
@@ -238,6 +238,12 @@ int main(int argc, char** argv) {
     void* struct_params[] = {&struct_2048B};
 
     bench::BenchmarkSuite suite(options);
+    // After calibration, drain the persistent stream so the first measured
+    // sample does not start on a backlogged stream. Calibration for enqueue-
+    // style ops (kernel launches) may queue many thousands of operations.
+    suite.set_post_calibrate([&]() {
+        check_cu(cuStreamSynchronize(stream), "post-calibrate sync failed");
+    });
 
     suite.run("launch.launch_empty_kernel", [&]() {
         check_cu(cuLaunchKernel(empty_kernel, 1, 1, 1, 1, 1, 1, 0, stream, nullptr, nullptr),
diff --git a/benchmarks/cuda_bindings/benchmarks/cpp/bench_memory.cpp b/benchmarks/cuda_bindings/benchmarks/cpp/bench_memory.cpp
index 4e71b73fb5..803363be48 100644
--- a/benchmarks/cuda_bindings/benchmarks/cpp/bench_memory.cpp
+++ b/benchmarks/cuda_bindings/benchmarks/cpp/bench_memory.cpp
@@ -52,6 +52,11 @@ int main(int argc, char** argv) {
     uint8_t host_dst[COPY_SIZE] = {};
 
     bench::BenchmarkSuite suite(options);
+    // Drain the persistent stream after calibration so async benchmarks
+    // (mem_alloc_async_free_async) don't start measurement on a backlogged stream.
+    suite.set_post_calibrate([&]() {
+        check_cu(cuStreamSynchronize(stream), "post-calibrate sync failed");
+    });
 
     // --- mem_alloc_free ---
     {
diff --git a/benchmarks/cuda_bindings/benchmarks/cpp/bench_stream.cpp b/benchmarks/cuda_bindings/benchmarks/cpp/bench_stream.cpp
index 702e86aef0..95ad0790f9 100644
--- a/benchmarks/cuda_bindings/benchmarks/cpp/bench_stream.cpp
+++ b/benchmarks/cuda_bindings/benchmarks/cpp/bench_stream.cpp
@@ -38,6 +38,12 @@ int main(int argc, char** argv) {
     check_cu(cuStreamCreate(&stream, CU_STREAM_NON_BLOCKING), "cuStreamCreate failed");
 
     bench::BenchmarkSuite suite(options);
+    // Drain the persistent stream after calibration for completeness.
+    // stream_create_destroy uses a local stream, but stream_query/synchronize
+    // observe the persistent one.
+    suite.set_post_calibrate([&]() {
+        check_cu(cuStreamSynchronize(stream), "post-calibrate sync failed");
+    });
 
     // --- stream_create_destroy ---
     {
diff --git a/benchmarks/cuda_bindings/benchmarks/cpp/bench_support.hpp b/benchmarks/cuda_bindings/benchmarks/cpp/bench_support.hpp
index 8b54122866..131f69de54 100644
--- a/benchmarks/cuda_bindings/benchmarks/cpp/bench_support.hpp
+++ b/benchmarks/cuda_bindings/benchmarks/cpp/bench_support.hpp
@@ -6,10 +6,12 @@
 #include <chrono>
 #include <cstdint>
+#include <algorithm>
 #include <cstdlib>
 #include <cstring>
 #include <ctime>
 #include <fstream>
+#include <functional>
 #include <iostream>
 #include <string>
 #include <vector>
 
@@ -22,6 +24,12 @@ struct Options {
     std::uint64_t warmups = 5;
     std::uint64_t values = 20;
     std::uint64_t runs = 20;
+    double min_time_sec = 0.0;
+    // Safety cap for the calibration doubling loop. Set high enough that even
+    // sub-nanosecond ops can reach typical --min-time targets (e.g. 100ms).
+    // A warning is printed if calibration hits this cap before reaching min-time.
+    std::uint64_t max_loops = 100000000;
+    std::uint64_t calibrate_rounds = 3;
     std::string output_path;
     std::string benchmark_name;
 };
@@ -46,6 +54,18 @@ inline Options parse_args(int argc, char** argv) {
             options.warmups = std::strtoull(argv[++i], nullptr, 10);
             continue;
         }
+        if (arg == "--min-time" && i + 1 < argc) {
+            options.min_time_sec = std::strtod(argv[++i], nullptr);
+            continue;
+        }
+        if (arg == "--max-loops" && i + 1 < argc) {
+            options.max_loops = std::strtoull(argv[++i], nullptr, 10);
+            continue;
+        }
+        if (arg == "--calibrate-rounds" && i + 1 < argc) {
+            options.calibrate_rounds = std::strtoull(argv[++i], nullptr, 10);
+            continue;
+        }
         if (arg == "--values" && i + 1 < argc) {
             options.values = std::strtoull(argv[++i], nullptr, 10);
             continue;
@@ -68,6 +88,9 @@
                 << "  --warmups N            Warmup values per run (default: 5)\n"
                 << "  --values N             Timed values per run (default: 20)\n"
                 << "  --runs N               Number of runs (default: 20)\n"
+                << "  --min-time S           Calibrate loops to reach S seconds per value\n"
+                << "  --max-loops N          Safety cap for calibration loop count (default: 100000000)\n"
+                << "  --calibrate-rounds N   Calibration passes (default: 3)\n"
                 << "  -o, --output F         Write pyperf-compatible JSON to file\n"
                 << "  --name S               Benchmark name (overrides default)\n";
             std::exit(0);
@@ -93,6 +116,70 @@ inline std::string iso_now() {
     return std::string(buf);
 }
 
+// Calibrate loop count to hit a minimum wall time per value.
+// Returns the chosen loop count. If `capped_out` is non-null, it is set to
+// true when calibration reached `max_loops` before hitting `min_time_sec`
+// (meaning --min-time was NOT actually satisfied by the calibration).
+template <typename Fn>
+std::uint64_t calibrate_loops(
+    const Options& options,
+    Fn&& fn,
+    bool* capped_out = nullptr,
+    double* last_elapsed_out = nullptr
+) {
+    if (options.min_time_sec <= 0.0) {
+        if (capped_out) *capped_out = false;
+        if (last_elapsed_out) *last_elapsed_out = 0.0;
+        return options.loops;
+    }
+
+    // Allow callers (e.g. the explicit-loop overload) to request a minimum
+    // starting loop count via options.loops.
+    const std::uint64_t start_loops = std::max<std::uint64_t>(1, options.loops);
+    std::uint64_t best = start_loops;
+    const std::uint64_t max_loops = std::max(start_loops, options.max_loops);
+    const std::uint64_t rounds = std::max<std::uint64_t>(1, options.calibrate_rounds);
+
+    bool capped = false;
+    double last_elapsed = 0.0;
+
+    for (std::uint64_t round = 0; round < rounds; ++round) {
+        std::uint64_t loops = start_loops;
+        double elapsed = 0.0;
+
+        while (true) {
+            const auto t0 = std::chrono::steady_clock::now();
+            for (std::uint64_t i = 0; i < loops; ++i) {
+                fn();
+            }
+            const auto t1 = std::chrono::steady_clock::now();
+            elapsed = std::chrono::duration<double>(t1 - t0).count();
+
+            if (elapsed >= options.min_time_sec) {
+                break;
+            }
+            if (loops >= max_loops) {
+                capped = true;
+                break;
+            }
+            if (loops > max_loops / 2) {
+                loops = max_loops;
+            } else {
+                loops *= 2;
+            }
+        }
+
+        if (loops > best) {
+            best = loops;
+        }
+        last_elapsed = elapsed;
+    }
+
+    if (capped_out) *capped_out = capped;
+    if (last_elapsed_out) *last_elapsed_out = last_elapsed;
+    return best;
+}
+
 // Run a benchmark function. The function signature is: void fn() — one call = one operation.
 // The harness calls fn() in a tight loop `loops` times per value.
 template <typename Fn>
@@ -235,22 +322,59 @@ class BenchmarkSuite {
   public:
     explicit BenchmarkSuite(Options options) : options_(std::move(options)) {}
 
+    // Post-calibration hook. If set, invoked after calibration and before the
+    // first measured warmup/value, for every benchmark in this suite. Intended
+    // for async benchmarks that need to drain state left behind by calibration
+    // (e.g. cuStreamSynchronize on a persistent stream). Can be overridden
+    // per-call via the `post_calibrate` parameter on `run()`.
+    void set_post_calibrate(std::function<void()> hook) {
+        post_calibrate_ = std::move(hook);
+    }
+
     // Run a benchmark and record it. The name is used as the benchmark ID.
+    // If --min-time is set, loop count is auto-calibrated. `post_calibrate`,
+    // if provided, runs after calibration and before measurement.
     template <typename Fn>
-    void run(const std::string& name, Fn&& fn) {
-        auto results = run_benchmark(options_, std::forward<Fn>(fn));
+    void run(
+        const std::string& name,
+        Fn&& fn,
+        std::function<void()> post_calibrate = {}
+    ) {
+        std::uint64_t loops = options_.loops;
+        Options custom = options_;
+        if (options_.min_time_sec > 0.0) {
+            loops = calibrate_and_warn(name, options_, fn);
+            custom.loops = loops;
+            invoke_post_calibrate(post_calibrate);
+        }
+        auto results = run_benchmark(custom, std::forward<Fn>(fn));
         print_summary(name, results);
-        entries_.push_back({name, options_.loops, std::move(results)});
+        entries_.push_back({name, loops, std::move(results)});
     }
 
-    // Run a benchmark with a custom loop count (for slow operations like compilation).
+    // Run a benchmark with a custom loop count (used as a floor for fast ops
+    // or a fixed count for slow ops like compilation). When --min-time is set,
+    // calibration still runs but starts from `loops_override` as the minimum.
     template <typename Fn>
-    void run(const std::string& name, std::uint64_t loops_override, Fn&& fn) {
+    void run(
+        const std::string& name,
+        std::uint64_t loops_override,
+        Fn&& fn,
+        std::function<void()> post_calibrate = {}
+    ) {
+        std::uint64_t loops = loops_override;
         Options custom = options_;
         custom.loops = loops_override;
+        if (options_.min_time_sec > 0.0) {
+            Options calib_opts = options_;
+            calib_opts.loops = loops_override;  // floor
+            loops = calibrate_and_warn(name, calib_opts, fn);
+            custom.loops = loops;
+            invoke_post_calibrate(post_calibrate);
+        }
         auto results = run_benchmark(custom, std::forward<Fn>(fn));
         print_summary(name, results);
-        entries_.push_back({name, loops_override, std::move(results)});
+        entries_.push_back({name, loops, std::move(results)});
     }
 
     // Write all collected benchmarks to the output file (if -o was given).
@@ -263,6 +387,36 @@ class BenchmarkSuite {
   private:
     Options options_;
     std::vector<Entry> entries_;
+    std::function<void()> post_calibrate_;
+
+    // Prefer the per-call hook when given; otherwise fall back to the
+    // suite-wide hook. No-op when neither is set.
+    void invoke_post_calibrate(const std::function<void()>& per_call) const {
+        if (per_call) {
+            per_call();
+        } else if (post_calibrate_) {
+            post_calibrate_();
+        }
+    }
+
+    // Run calibration and emit a diagnostic when the --max-loops safety cap
+    // prevented --min-time from actually being reached.
+    template <typename Fn>
+    std::uint64_t calibrate_and_warn(
+        const std::string& name,
+        const Options& calib_opts,
+        Fn&& fn
+    ) const {
+        bool capped = false;
+        double last_elapsed = 0.0;
+        std::uint64_t loops = calibrate_loops(
+            calib_opts, std::forward<Fn>(fn), &capped, &last_elapsed
+        );
+        if (capped) {
+            std::cerr << "WARNING: " << name
+                      << ": calibration hit --max-loops (" << calib_opts.max_loops
+                      << ") before reaching --min-time (" << calib_opts.min_time_sec
+                      << "s). Last sample: " << last_elapsed
+                      << "s. Raise --max-loops to satisfy --min-time for this benchmark.\n";
+        }
+        return loops;
+    }
 
     static void write_multi_pyperf_json(
         const std::string& output_path,
diff --git a/benchmarks/cuda_bindings/compare.py b/benchmarks/cuda_bindings/compare.py
index 6a3e94f344..7dbc972e92 100644
--- a/benchmarks/cuda_bindings/compare.py
+++ b/benchmarks/cuda_bindings/compare.py
@@ -29,7 +29,7 @@ def load_benchmarks(path: Path) -> dict[str, list[float]]:
             name = run.get("metadata", {}).get("name", "")
             if name:
                 break
-        values = []
+        values: list[float] = []
         for run in bench.get("runs", []):
             values.extend(run.get("values", []))
         if name and values:
@@ -37,6 +37,19 @@ def load_benchmarks(path: Path) -> dict[str, list[float]]:
     return results
 
 
+def stats(values: list[float]) -> tuple[float, float, float, int]:
+    mean = statistics.mean(values)
+    stdev = statistics.pstdev(values) if len(values) > 1 else 0.0
+    rsd = (stdev / mean) if mean else 0.0
+    return mean, stdev, rsd, len(values)
+
+
+def fmt_rsd(rsd: float | None) -> str:
+    if rsd is None:
+        return "-"
+    return f"{rsd * 100:.1f}%"
+
+
 def fmt_ns(seconds: float) -> str:
     ns = seconds * 1e9
     if ns >= 1000:
@@ -79,13 +92,16 @@ def main() -> None:
     # Header
     if cpp_benchmarks:
-        header = f"{'Benchmark':<{name_width}} {'C++ (mean)':>12} {'Python (mean)':>14} {'Overhead':>10}"
+        header = (
+            f"{'Benchmark':<{name_width}} {'C++ (mean)':>12} {'C++ RSD':>8} "
+            f"{'Python (mean)':>14} {'Py RSD':>7} {'Overhead':>10}"
+        )
         sep = "-" * len(header)
         print(sep)
         print(header)
         print(sep)
     else:
-        header = f"{'Benchmark':<{name_width}} {'Python (mean)':>14}"
+        header = f"{'Benchmark':<{name_width}} {'Python (mean)':>14} {'Py RSD':>7}"
         sep = "-" * len(header)
         print(sep)
         print(header)
@@ -95,21 +111,29 @@ def main() -> None:
         py_vals = py_benchmarks.get(name)
         cpp_vals = cpp_benchmarks.get(name)
 
-        py_str = fmt_ns(statistics.mean(py_vals)) if py_vals else "-"
-        cpp_str = fmt_ns(statistics.mean(cpp_vals)) if cpp_vals else "-"
+        py_stats = stats(py_vals) if py_vals else None
+        cpp_stats = stats(cpp_vals) if cpp_vals else None
+
+        py_str = fmt_ns(py_stats[0]) if py_stats else "-"
+        cpp_str = fmt_ns(cpp_stats[0]) if cpp_stats else "-"
+        py_rsd = fmt_rsd(py_stats[2]) if py_stats else "-"
+        cpp_rsd = fmt_rsd(cpp_stats[2]) if cpp_stats else "-"
 
-        if py_vals and cpp_vals:
-            py_mean = statistics.mean(py_vals)
-            cpp_mean = statistics.mean(cpp_vals)
+        if py_stats and cpp_stats:
+            py_mean = py_stats[0]
+            cpp_mean = cpp_stats[0]
             overhead_ns = (py_mean - cpp_mean) * 1e9
             overhead_str = f"+{overhead_ns:.0f} ns"
         else:
            overhead_str = "-"
 
         if cpp_benchmarks:
-            print(f"{name:<{name_width}} {cpp_str:>12} {py_str:>14} {overhead_str:>10}")
+            print(
+                f"{name:<{name_width}} {cpp_str:>12} {cpp_rsd:>8} "
+                f"{py_str:>14} {py_rsd:>7} {overhead_str:>10}"
+            )
         else:
-            print(f"{name:<{name_width}} {py_str:>14}")
+            print(f"{name:<{name_width}} {py_str:>14} {py_rsd:>7}")
 
     print(sep)