2 changes: 2 additions & 0 deletions benchmarks/cuda_bindings/README.md
@@ -47,12 +47,14 @@ To run the benchmarks, combine the environment and task:
```bash
# Run the Python benchmarks in the wheel environment
pixi run -e wheel bench
pixi run -e wheel bench --min-time 0.1

# Run the Python benchmarks in the source environment
pixi run -e source bench

# Run the C++ benchmarks
pixi run -e wheel bench-cpp
pixi run -e wheel bench-cpp --min-time 0.1
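# The other calibration flags added in this change can be passed the same way
# (illustrative invocation; assumes extra flags are forwarded to the C++ runner
# just like --min-time above).
pixi run -e wheel bench-cpp --min-time 0.1 --max-loops 200000000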
```

Both runners automatically save results to JSON files in the benchmarks
5 changes: 5 additions & 0 deletions benchmarks/cuda_bindings/benchmarks/cpp/bench_event.cpp
@@ -45,6 +45,11 @@ int main(int argc, char** argv) {
check_cu(cuStreamSynchronize(stream), "cuStreamSynchronize failed");

bench::BenchmarkSuite suite(options);
// Drain the persistent stream after calibration so event_record (which
// enqueues onto the stream) and event_synchronize start from a known state.
suite.set_post_calibrate([&]() {
check_cu(cuStreamSynchronize(stream), "post-calibrate sync failed");
});

// --- event_create_destroy ---
{
6 changes: 6 additions & 0 deletions benchmarks/cuda_bindings/benchmarks/cpp/bench_launch.cpp
@@ -238,6 +238,12 @@ int main(int argc, char** argv) {
void* struct_params[] = {&struct_2048B};

bench::BenchmarkSuite suite(options);
// After calibration, drain the persistent stream so the first measured
// sample does not start on a backlogged stream. Calibration for enqueue-
// style ops (kernel launches) may queue many thousands of operations.
suite.set_post_calibrate([&]() {
check_cu(cuStreamSynchronize(stream), "post-calibrate sync failed");
});

suite.run("launch.launch_empty_kernel", [&]() {
check_cu(cuLaunchKernel(empty_kernel, 1, 1, 1, 1, 1, 1, 0, stream, nullptr, nullptr),
5 changes: 5 additions & 0 deletions benchmarks/cuda_bindings/benchmarks/cpp/bench_memory.cpp
@@ -52,6 +52,11 @@ int main(int argc, char** argv) {
uint8_t host_dst[COPY_SIZE] = {};

bench::BenchmarkSuite suite(options);
// Drain the persistent stream after calibration so async benchmarks
// (mem_alloc_async_free_async) don't start measurement on a backlogged stream.
suite.set_post_calibrate([&]() {
check_cu(cuStreamSynchronize(stream), "post-calibrate sync failed");
});

// --- mem_alloc_free ---
{
6 changes: 6 additions & 0 deletions benchmarks/cuda_bindings/benchmarks/cpp/bench_stream.cpp
@@ -38,6 +38,12 @@ int main(int argc, char** argv) {
check_cu(cuStreamCreate(&stream, CU_STREAM_NON_BLOCKING), "cuStreamCreate failed");

bench::BenchmarkSuite suite(options);
// Drain the persistent stream after calibration for completeness.
// stream_create_destroy uses a local stream, but stream_query/synchronize
// observe the persistent one.
suite.set_post_calibrate([&]() {
check_cu(cuStreamSynchronize(stream), "post-calibrate sync failed");
});

// --- stream_create_destroy ---
{
166 changes: 160 additions & 6 deletions benchmarks/cuda_bindings/benchmarks/cpp/bench_support.hpp
@@ -6,10 +6,12 @@

#include <algorithm>
#include <chrono>
#include <cmath>
#include <cstdint>
#include <cstdlib>
#include <ctime>
#include <fstream>
#include <functional>
#include <iomanip>
#include <iostream>
#include <string>
@@ -22,6 +24,12 @@ struct Options {
std::uint64_t warmups = 5;
std::uint64_t values = 20;
std::uint64_t runs = 20;
double min_time_sec = 0.0;
// Safety cap for the calibration doubling loop. Set high enough that even
// sub-nanosecond ops can reach typical --min-time targets (e.g. 100ms).
// A warning is printed if calibration hits this cap before reaching min-time.
std::uint64_t max_loops = 100000000;
std::uint64_t calibrate_rounds = 3;
std::string output_path;
std::string benchmark_name;
};
@@ -46,6 +54,18 @@ inline Options parse_args(int argc, char** argv) {
options.warmups = std::strtoull(argv[++i], nullptr, 10);
continue;
}
if (arg == "--min-time" && i + 1 < argc) {
options.min_time_sec = std::strtod(argv[++i], nullptr);
continue;
}
if (arg == "--max-loops" && i + 1 < argc) {
options.max_loops = std::strtoull(argv[++i], nullptr, 10);
continue;
}
if (arg == "--calibrate-rounds" && i + 1 < argc) {
options.calibrate_rounds = std::strtoull(argv[++i], nullptr, 10);
continue;
}
if (arg == "--values" && i + 1 < argc) {
options.values = std::strtoull(argv[++i], nullptr, 10);
continue;
@@ -68,6 +88,9 @@
<< " --warmups N Warmup values per run (default: 5)\n"
<< " --values N Timed values per run (default: 20)\n"
<< " --runs N Number of runs (default: 20)\n"
<< " --min-time S Calibrate loops to reach S seconds per value\n"
<< " --max-loops N Safety cap for calibration loop count (default: 100000000)\n"
<< " --calibrate-rounds N Calibration passes (default: 3)\n"
<< " -o, --output F Write pyperf-compatible JSON to file\n"
<< " --name S Benchmark name (overrides default)\n";
std::exit(0);
@@ -93,6 +116,70 @@ inline std::string iso_now() {
return std::string(buf);
}

// Calibrate loop count to hit a minimum wall time per value.
// Returns the chosen loop count. If `capped_out` is non-null, it is set to
// true when calibration reached `max_loops` before hitting `min_time_sec`
// (meaning --min-time was NOT actually satisfied by the calibration).
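// Example (illustrative arithmetic, not from the source): a ~50 ns operation
// starting from 1 loop doubles up to 2^21 = 2,097,152 loops, since
// 2,097,152 * 50 ns ~= 0.105 s satisfies --min-time 0.1, whereas 2^20 loops
// (~0.052 s) does not.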
template <typename Fn>
std::uint64_t calibrate_loops(
const Options& options,
Fn&& fn,
bool* capped_out = nullptr,
double* last_elapsed_out = nullptr
) {
if (options.min_time_sec <= 0.0) {
if (capped_out) *capped_out = false;
if (last_elapsed_out) *last_elapsed_out = 0.0;
return options.loops;
}

// Allow callers (e.g. the explicit-loop overload) to request a minimum
// starting loop count via options.loops.
const std::uint64_t start_loops = std::max<std::uint64_t>(1, options.loops);
std::uint64_t best = start_loops;
const std::uint64_t max_loops = std::max<std::uint64_t>(start_loops, options.max_loops);
const std::uint64_t rounds = std::max<std::uint64_t>(1, options.calibrate_rounds);

bool capped = false;
double last_elapsed = 0.0;

for (std::uint64_t round = 0; round < rounds; ++round) {
std::uint64_t loops = start_loops;
double elapsed = 0.0;

while (true) {
const auto t0 = std::chrono::steady_clock::now();
for (std::uint64_t i = 0; i < loops; ++i) {
fn();
}
const auto t1 = std::chrono::steady_clock::now();
elapsed = std::chrono::duration<double>(t1 - t0).count();

if (elapsed >= options.min_time_sec) {
break;
}
if (loops >= max_loops) {
capped = true;
break;
}
if (loops > max_loops / 2) {
loops = max_loops;
} else {
loops *= 2;
}
}

if (loops > best) {
best = loops;
}
last_elapsed = elapsed;
}

if (capped_out) *capped_out = capped;
if (last_elapsed_out) *last_elapsed_out = last_elapsed;
return best;
}

// Run a benchmark function. The function signature is: void fn() — one call = one operation.
// The harness calls fn() in a tight loop `loops` times per value.
template <typename Fn>
@@ -235,22 +322,59 @@ class BenchmarkSuite {
public:
explicit BenchmarkSuite(Options options) : options_(std::move(options)) {}

// Post-calibration hook. If set, invoked after calibration and before the
// first measured warmup/value, for every benchmark in this suite. Intended
// for async benchmarks that need to drain state left behind by calibration
// (e.g. cuStreamSynchronize on a persistent stream). Can be overridden
// per-call via the `post_calibrate` parameter on `run()`.
void set_post_calibrate(std::function<void()> hook) {
post_calibrate_ = std::move(hook);
}

// Run a benchmark and record it. The name is used as the benchmark ID.
// If --min-time is set, loop count is auto-calibrated. `post_calibrate`,
// if provided, runs after calibration and before measurement.
template <typename Fn>
void run(const std::string& name, Fn&& fn) {
auto results = run_benchmark(options_, std::forward<Fn>(fn));
void run(
const std::string& name,
Fn&& fn,
std::function<void()> post_calibrate = {}
) {
std::uint64_t loops = options_.loops;
Options custom = options_;
if (options_.min_time_sec > 0.0) {
loops = calibrate_and_warn(name, options_, fn);
custom.loops = loops;
invoke_post_calibrate(post_calibrate);
}
auto results = run_benchmark(custom, std::forward<Fn>(fn));
print_summary(name, results);
entries_.push_back({name, options_.loops, std::move(results)});
entries_.push_back({name, loops, std::move(results)});
}

// Run a benchmark with a custom loop count (for slow operations like compilation).
// Run a benchmark with a custom loop count (used as a floor for fast ops
// or a fixed count for slow ops like compilation). When --min-time is set,
// calibration still runs but starts from `loops_override` as the minimum.
template <typename Fn>
void run(const std::string& name, std::uint64_t loops_override, Fn&& fn) {
void run(
const std::string& name,
std::uint64_t loops_override,
Fn&& fn,
std::function<void()> post_calibrate = {}
) {
std::uint64_t loops = loops_override;
Options custom = options_;
custom.loops = loops_override;
if (options_.min_time_sec > 0.0) {
Options calib_opts = options_;
calib_opts.loops = loops_override; // floor
loops = calibrate_and_warn(name, calib_opts, fn);
custom.loops = loops;
invoke_post_calibrate(post_calibrate);
}
auto results = run_benchmark(custom, std::forward<Fn>(fn));
print_summary(name, results);
entries_.push_back({name, loops_override, std::move(results)});
entries_.push_back({name, loops, std::move(results)});
}

// Write all collected benchmarks to the output file (if -o was given).
@@ -263,6 +387,36 @@
private:
Options options_;
std::vector<BenchmarkEntry> entries_;
std::function<void()> post_calibrate_;

void invoke_post_calibrate(const std::function<void()>& per_call) const {
if (per_call) {
per_call();
} else if (post_calibrate_) {
post_calibrate_();
}
}

template <typename Fn>
std::uint64_t calibrate_and_warn(
const std::string& name,
const Options& calib_opts,
Fn&& fn
) const {
bool capped = false;
double last_elapsed = 0.0;
std::uint64_t loops = calibrate_loops(
calib_opts, std::forward<Fn>(fn), &capped, &last_elapsed
);
if (capped) {
std::cerr << "WARNING: " << name
<< ": calibration hit --max-loops (" << calib_opts.max_loops
<< ") before reaching --min-time (" << calib_opts.min_time_sec
<< "s). Last sample: " << last_elapsed
<< "s. Raise --max-loops to satisfy --min-time for this benchmark.\n";
}
return loops;
}

static void write_multi_pyperf_json(
const std::string& output_path,
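Taken together, a benchmark binary would use the calibration-aware harness roughly as sketched below. This is an illustrative sketch, not part of the change: the benchmark names and lambda bodies are hypothetical, `bench::Options` and `bench::parse_args` are assumed to share the `bench` namespace with `BenchmarkSuite`, and the suite's JSON-writing call is left out because its name is truncated in the hunk above.

```cpp
// Hypothetical benchmark binary exercising the new calibration support.
#include <cstdint>

#include "bench_support.hpp"

int main(int argc, char** argv) {
    // Parse harness flags such as --min-time, --max-loops, --calibrate-rounds.
    bench::Options options = bench::parse_args(argc, argv);  // bench:: qualification assumed

    bench::BenchmarkSuite suite(options);

    // Suite-wide hook: runs after calibration and before measurement for every
    // benchmark below (the bench_*.cpp files use it to drain a persistent stream).
    suite.set_post_calibrate([]() {
        // e.g. cuStreamSynchronize(stream) in the real binaries; placeholder here
    });

    // Fast operation: the loop count is auto-calibrated when --min-time is set.
    suite.run("example.fast_op", []() {
        // one cheap operation per call (placeholder)
    });

    // Slow operation: 5 loops is a fixed count without --min-time, or a floor
    // with it. The explicit std::uint64_t keeps overload resolution unambiguous.
    suite.run("example.slow_op", std::uint64_t{5}, []() {
        // one expensive operation per call, e.g. a compilation (placeholder)
    });

    // Per-call hook: overrides the suite-wide post-calibration hook for this
    // benchmark only.
    suite.run(
        "example.async_op",
        []() { /* enqueue-style operation (placeholder) */ },
        []() { /* benchmark-specific drain after calibration (placeholder) */ });

    return 0;
}
```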
44 changes: 34 additions & 10 deletions benchmarks/cuda_bindings/compare.py
@@ -29,14 +29,27 @@ def load_benchmarks(path: Path) -> dict[str, list[float]]:
name = run.get("metadata", {}).get("name", "")
if name:
break
values = []
values: list[float] = []
for run in bench.get("runs", []):
values.extend(run.get("values", []))
if name and values:
results[name] = values
return results


def stats(values: list[float]) -> tuple[float, float, float, int]:
mean = statistics.mean(values)
stdev = statistics.pstdev(values) if len(values) > 1 else 0.0
rsd = (stdev / mean) if mean else 0.0
return mean, stdev, rsd, len(values)


def fmt_rsd(rsd: float | None) -> str:
if rsd is None:
return "-"
return f"{rsd * 100:.1f}%"


def fmt_ns(seconds: float) -> str:
ns = seconds * 1e9
if ns >= 1000:
@@ -79,13 +92,16 @@ def main() -> None:

# Header
if cpp_benchmarks:
header = f"{'Benchmark':<{name_width}} {'C++ (mean)':>12} {'Python (mean)':>14} {'Overhead':>10}"
header = (
f"{'Benchmark':<{name_width}} {'C++ (mean)':>12} {'C++ RSD':>8} "
f"{'Python (mean)':>14} {'Py RSD':>7} {'Overhead':>10}"
)
sep = "-" * len(header)
print(sep)
print(header)
print(sep)
else:
header = f"{'Benchmark':<{name_width}} {'Python (mean)':>14}"
header = f"{'Benchmark':<{name_width}} {'Python (mean)':>14} {'Py RSD':>7}"
sep = "-" * len(header)
print(sep)
print(header)
@@ -95,21 +111,29 @@
py_vals = py_benchmarks.get(name)
cpp_vals = cpp_benchmarks.get(name)

py_str = fmt_ns(statistics.mean(py_vals)) if py_vals else "-"
cpp_str = fmt_ns(statistics.mean(cpp_vals)) if cpp_vals else "-"
py_stats = stats(py_vals) if py_vals else None
cpp_stats = stats(cpp_vals) if cpp_vals else None

py_str = fmt_ns(py_stats[0]) if py_stats else "-"
cpp_str = fmt_ns(cpp_stats[0]) if cpp_stats else "-"
py_rsd = fmt_rsd(py_stats[2]) if py_stats else "-"
cpp_rsd = fmt_rsd(cpp_stats[2]) if cpp_stats else "-"

if py_vals and cpp_vals:
py_mean = statistics.mean(py_vals)
cpp_mean = statistics.mean(cpp_vals)
if py_stats and cpp_stats:
py_mean = py_stats[0]
cpp_mean = cpp_stats[0]
overhead_ns = (py_mean - cpp_mean) * 1e9
overhead_str = f"+{overhead_ns:.0f} ns"
else:
overhead_str = "-"

if cpp_benchmarks:
print(f"{name:<{name_width}} {cpp_str:>12} {py_str:>14} {overhead_str:>10}")
print(
f"{name:<{name_width}} {cpp_str:>12} {cpp_rsd:>8} "
f"{py_str:>14} {py_rsd:>7} {overhead_str:>10}"
)
else:
print(f"{name:<{name_width}} {py_str:>14}")
print(f"{name:<{name_width}} {py_str:>14} {py_rsd:>7}")

print(sep)
