From 41453fb578fb77ef2c4803a11084cd532c9fb5e3 Mon Sep 17 00:00:00 2001 From: moudyellaz Date: Thu, 11 Jun 2026 03:02:19 +0200 Subject: [PATCH] feat(cycle_bench): add public-execution ms calibration for fee model --- docs/benchmarks/cycle_bench.md | 45 +++++-- tools/cycle_bench/src/main.rs | 225 ++++++++++++++++++++++++++++++++- 2 files changed, 250 insertions(+), 20 deletions(-) diff --git a/docs/benchmarks/cycle_bench.md b/docs/benchmarks/cycle_bench.md index 0e880070..2a8785f0 100644 --- a/docs/benchmarks/cycle_bench.md +++ b/docs/benchmarks/cycle_bench.md @@ -14,21 +14,37 @@ Per-program Risc0 cycle counts, prover wall time, PPE composition cost, and veri | Profile | release | | GPU acceleration | none | -## Executor cycles +## Executor cycles and public-execution ms -`SessionInfo::cycles()` per instruction. Deterministic across runs. Wall time is `best / mean ± stdev` over 5 timed iterations (1 warmup discarded). +`SessionInfo::cycles()` per instruction. Deterministic across runs. Wall time is `best / mean ± stdev` over the timed iterations (1 warmup discarded; `--exec-iters` sets the count, 50 below). `calib_ms` and `net_ms` are the public-execution time in milliseconds, on the same axis as the private `G_verify` so the fee model has one common unit for both paths. See the calibration block below for how they are derived. 
-| Program | Instruction | user_cycles | segments | exec_ms (best / mean ± stdev) | -|---|---|---:|---:|---| -| authenticated_transfer | Initialize | 43,642 | 1 | 18.86 / 19.41 ± 0.48 | -| authenticated_transfer | Transfer | 77,095 | 1 | 19.67 / 20.84 ± 1.16 | -| token | Burn | 116,546 | 1 | 24.86 / 25.46 ± 0.63 | -| token | Mint | 116,862 | 1 | 24.47 / 25.08 ± 0.42 | -| token | Transfer | 127,726 | 1 | 25.00 / 25.40 ± 0.29 | -| clock | Tick (no rollups) | 137,022 | 1 | 21.18 / 21.57 ± 0.41 | -| ata | Create | 175,056 | 1 | 23.64 / 24.94 ± 1.09 | -| amm | SwapExactInput | 508,634 | 1 | 34.21 / 34.77 ± 0.55 | -| amm | AddLiquidity | 642,774 | 1 | 37.59 / 37.87 ± 0.28 | +| Program | Instruction | user_cycles | segments | exec_ms (best / mean ± stdev) | calib_ms | net_ms | +|---|---|---:|---:|---|---:|---:| +| authenticated_transfer | Initialize | 43,818 | 1 | 30.69 / 31.93 ± 1.03 | 1.31 | 0.29 | +| authenticated_transfer | Transfer | 79,958 | 1 | 31.02 / 32.35 ± 0.59 | 2.38 | 0.61 | +| token | Burn | 116,546 | 1 | 36.08 / 37.18 ± 0.60 | 3.47 | 5.67 | +| token | Mint | 116,862 | 1 | 35.67 / 37.73 ± 2.54 | 3.48 | 5.26 | +| token | Transfer | 127,726 | 1 | 35.49 / 36.86 ± 0.90 | 3.81 | 5.08 | +| clock | Tick (no rollups) | 137,022 | 1 | 32.12 / 33.16 ± 0.89 | 4.08 | 1.72 | +| ata | Create | 174,515 | 1 | 35.41 / 36.49 ± 0.65 | 5.20 | 5.00 | +| amm | SwapExactInput | 508,904 | 1 | 46.71 / 48.06 ± 0.86 | 15.17 | 16.30 | +| amm | AddLiquidity | 643,464 | 1 | 48.57 / 50.28 ± 0.98 | 19.18 | 18.16 | + +### Public-execution ms calibration + +The binary fits `best_ms = intercept + slope · user_cycles` by ordinary least squares across the nine cases (best-of-N, not mean, so one OS scheduling spike cannot tilt the slope). 
On the machine above: + +| Field | Value | +|---|---| +| throughput (1 / slope) | 33,546 cycles/ms | +| fixed overhead (intercept) | 30.41 ms per call | +| R² | 0.935 | + +- `calib_ms = user_cycles / throughput` is the compute-only time, a pure function of the deterministic cycle count and the one pinned-hardware constant, so it reproduces run to run where raw wall-time does not. This is the number to put on the common public/private ms axis. +- `net_ms = best exec_ms − fixed overhead` is the measured compute with the host-side overhead stripped; it agrees with `calib_ms` to within the per-program overhead scatter (the intercept is an ELF-size-averaged constant, so this decomposition is first-order, not mechanistic). +- The `fixed overhead` is host-side per-call setup (ELF parse into a `MemoryImage`, `ExecutorEnv` build) that is outside the cycle count and does not scale with the instruction's work. + +The fixed overhead is paid per transaction in the current node, not amortized. The public-execution path at `lee/state_machine/src/program.rs:56-87` builds a fresh `ExecutorEnv` and calls `default_executor().execute(env, self.elf())` per call with the raw ELF bytes; no parsed image is cached across transactions. So today the real per-public-tx sequencer cost is the raw `exec_ms` (≈ 31 ms for the cheapest program), overhead-dominated. Caching the parsed `MemoryImage` per `ProgramId` would drop the per-tx cost to `calib_ms` (1–19 ms). Public execution is also cycle-capped at `MAX_NUM_CYCLES_PUBLIC_EXECUTION` (`program.rs:64`), which bounds the worst-case public-tx cost. ## Real proving (`--prove`) @@ -85,7 +101,8 @@ The corresponding `proof_bytes` (S_agg) for the bench receipt is captured by `-- ## Reproduce ```sh -cargo run --release -p cycle_bench +# Executor cycles + public-execution ms calibration (no proving). --exec-iters sets the sample count. 
+cargo run --release -p cycle_bench -- --exec-iters 50 cargo run --release -p cycle_bench --features prove -- --prove cargo run --release -p cycle_bench --features ppe -- --prove --ppe diff --git a/tools/cycle_bench/src/main.rs b/tools/cycle_bench/src/main.rs index 914d68c5..bed61f92 100644 --- a/tools/cycle_bench/src/main.rs +++ b/tools/cycle_bench/src/main.rs @@ -9,11 +9,14 @@ #![expect( clippy::arithmetic_side_effects, + clippy::as_conversions, + clippy::cast_precision_loss, clippy::float_arithmetic, clippy::missing_const_for_fn, clippy::non_ascii_literal, clippy::print_stderr, clippy::print_stdout, + clippy::suboptimal_flops, reason = "Bench tool: matches test-style fixture code" )] @@ -68,6 +71,13 @@ struct BenchResult { user_cycles: u64, segments: usize, exec_stats: Stats, + /// Compute-only execution time (ms): best-of-N executor wall-time minus the calibrated + /// host-side fixed per-call overhead. Filled after the calibration fit over all cases. + net_compute_ms: Option, + /// Deterministic model prediction of compute time (ms): `user_cycles * slope` from the + /// calibration fit. Pure function of the deterministic cycle count and the pinned-hardware + /// throughput, so it reproduces across re-runs where raw wall-time does not. + calibrated_ms: Option, /// Stats over prover.prove(env, elf) wall-clock samples. Only populated when --prove is set. /// Single-sample (n=1) when --prove is on without explicit repetition, since proving is slow. prove_stats: Option, @@ -81,6 +91,89 @@ struct BenchResult { prove_segments: Option, } +/// Linear calibration of executor wall-time against deterministic user cycles, +/// fitted across all standalone cases as `best_ms = intercept_ms + slope_ms_per_cycle * +/// user_cycles`. +/// +/// The intercept is the host-side fixed per-call cost (ELF parse, `ExecutorEnv` build) that is +/// outside the cycle count and does not scale with the instruction's work. 
The slope is the +/// per-cycle execution rate on the pinned box; its reciprocal is the throughput the tokenomics +/// fee model denominates public execution in, and is the public-side counterpart to the flat +/// `G_verify` verify cost. The intercept is an ELF-size-averaged constant, so `net_compute_ms` +/// is a first-order decomposition, not a mechanistic per-program overhead. +#[derive(Debug, Serialize, Clone, Copy)] +struct Calibration { + /// Cases the fit was computed over. + n: usize, + /// Slope: milliseconds of executor wall-time per user cycle. + slope_ms_per_cycle: f64, + /// Intercept: host-side fixed per-call overhead in milliseconds. + intercept_ms: f64, + /// Reciprocal of the slope: cycles executed per millisecond on the pinned box. + throughput_cycles_per_ms: f64, + /// Coefficient of determination of the fit (1.0 = perfect linear fit). + r2: f64, +} + +impl Calibration { + /// Ordinary least squares of `best_ms` (y) on `user_cycles` (x) across `results`. + /// The fit uses best-of-N rather than the mean so a single OS scheduling spike in one + /// case cannot tilt the slope; best-of-N is the per-case noise floor and reproduces + /// run-to-run, which is what a pinned-hardware throughput constant needs. + /// Returns `None` when there are fewer than two distinct cycle counts to fit a line. 
+    fn fit(results: &[BenchResult]) -> Option<Self> {
+        // A line needs at least two observations.
+        let n = results.len();
+        if n < 2 {
+            return None;
+        }
+        let nf = n as f64;
+
+        // x = deterministic user cycles, y = best-of-N executor wall time (ms).
+        let xs: Vec<f64> = results.iter().map(|r| r.user_cycles as f64).collect();
+        let ys: Vec<f64> = results.iter().map(|r| r.exec_stats.best_ms).collect();
+        let mean_x = xs.iter().sum::<f64>() / nf;
+        let mean_y = ys.iter().sum::<f64>() / nf;
+
+        // Two-pass (centred) least squares. The one-pass denominator `n·Σx² − (Σx)²`
+        // subtracts two nearly equal quantities of magnitude ~n²·x²; once x² no longer fits
+        // in the 53-bit f64 mantissa the two terms round independently, so the difference
+        // can come out non-zero even when every x is identical. Centring first keeps the
+        // operands small and makes the "no spread in x" case an exact zero.
+        let dxs: Vec<f64> = xs.iter().map(|x| x - mean_x).collect();
+        let dys: Vec<f64> = ys.iter().map(|y| y - mean_y).collect();
+        let s_xx: f64 = dxs.iter().map(|dx| dx * dx).sum();
+        let s_xy: f64 = dxs.iter().zip(&dys).map(|(dx, dy)| dx * dy).sum();
+        let ss_tot: f64 = dys.iter().map(|dy| dy * dy).sum();
+
+        // s_xx is a sum of squares, so it is never negative. Cycle counts are integers, so
+        // any genuine spread yields s_xx >= (n - 1) / n; a value below EPSILON therefore
+        // means all cycle counts coincide and the slope is undetermined.
+        if s_xx < f64::EPSILON {
+            return None;
+        }
+        let slope = s_xy / s_xx;
+        let intercept = mean_y - slope * mean_x;
+
+        // Residual of each point against the fitted line, written in centred coordinates:
+        // y - (intercept + slope * x) == (y - mean_y) - slope * (x - mean_x).
+        let ss_res: f64 = dxs
+            .iter()
+            .zip(&dys)
+            .map(|(dx, dy)| (dy - slope * dx).powi(2))
+            .sum();
+        // ss_tot ≈ 0 means every best_ms is identical; the ratio is 0/0. We report 1.0 (a flat
+        // line fits a flat cloud exactly). This is a degenerate guard, not a real-data path: the
+        // bench cases span a wide cycle range, so ss_tot is large in practice.
+        let r2 = if ss_tot.abs() < f64::EPSILON {
+            1.0
+        } else {
+            1.0 - ss_res / ss_tot
+        };
+        // A flat fit has no finite reciprocal; 0.0 is reported instead of dividing by ~0.
+        let throughput_cycles_per_ms = if slope.abs() < f64::EPSILON {
+            0.0
+        } else {
+            1.0 / slope
+        };
+        Some(Self {
+            n,
+            slope_ms_per_cycle: slope,
+            intercept_ms: intercept,
+            throughput_cycles_per_ms,
+            r2,
+        })
+    }
+
+    /// Compute-time prediction for a cycle count: `slope * user_cycles` (overhead excluded).
+ fn calibrated_ms(&self, user_cycles: u64) -> f64 { + self.slope_ms_per_cycle * user_cycles as f64 + } +} + struct Case { program: &'static str, instruction_label: &'static str, @@ -185,6 +278,8 @@ impl Case { user_cycles: info.cycles(), segments: info.segments.len(), exec_stats, + net_compute_ms: None, + calibrated_ms: None, prove_stats, prove_total_cycles, prove_user_cycles, @@ -495,12 +590,23 @@ fn main() -> Result<()> { )?, ]; - let results: Vec = cases + let mut results: Vec = cases .into_iter() .map(|c| c.run(prove, exec_iters)) .collect::>>()?; + let calibration = Calibration::fit(&results); + if let Some(cal) = calibration { + for r in &mut results { + r.calibrated_ms = Some(cal.calibrated_ms(r.user_cycles)); + r.net_compute_ms = Some(r.exec_stats.best_ms - cal.intercept_ms); + } + } + print_table(&results, prove); + if let Some(cal) = calibration { + print_calibration(&cal); + } #[cfg(feature = "ppe")] let ppe_results = if cli.ppe { ppe::run_all() } else { Vec::new() }; @@ -525,6 +631,7 @@ fn main() -> Result<()> { } let combined = serde_json::json!({ "standalone": results, + "calibration": calibration, "ppe": ppe_results, }); std::fs::write(&out_path, serde_json::to_string_pretty(&combined)?)?; @@ -533,6 +640,24 @@ fn main() -> Result<()> { Ok(()) } +fn print_calibration(cal: &Calibration) { + println!("\npublic-execution ms calibration (pinned hardware):"); + println!( + " fit: best_ms = {:.4} + {:.3e} * user_cycles (n={}, R²={:.4})", + cal.intercept_ms, cal.slope_ms_per_cycle, cal.n, cal.r2, + ); + println!( + " throughput: {:.0} cycles/ms", + cal.throughput_cycles_per_ms, + ); + println!( + " fixed overhead: {:.3} ms host-side per call (ELF parse + env build, off-cycle)", + cal.intercept_ms, + ); + println!(" calib_ms = user_cycles / throughput (compute only, overhead excluded)"); + println!(" net_ms = best exec_ms - fixed overhead (measured compute, overhead stripped)"); +} + fn print_table(results: &[BenchResult], prove: bool) { let pw = results 
.iter() @@ -555,15 +680,28 @@ fn print_table(results: &[BenchResult], prove: bool) { .unwrap_or(0) .max("exec_ms (best / mean ± stdev)".len()); + let dw = 10_usize; println!( - "{:cw$} {:>sw$} {:cw$} {:>sw$} {:dw$} {:>dw$}", + "program", + "instruction", + "user_cycles", + "segments", + "exec_ms (best / mean ± stdev)", + "calib_ms", + "net_ms", ); - println!("{}", "-".repeat(pw + iw + cw + sw + exec_w + 8)); + println!("{}", "-".repeat(pw + iw + cw + sw + exec_w + 2 * dw + 12)); for r in results { + let calib = r + .calibrated_ms + .map_or_else(|| "-".to_owned(), |v| format!("{v:.2}")); + let net = r + .net_compute_ms + .map_or_else(|| "-".to_owned(), |v| format!("{v:.2}")); println!( - "{:cw$} {:>sw$} {:cw$} {:>sw$} {:dw$} {:>dw$}", + r.program, r.instruction, r.user_cycles, r.segments, r.exec_stats, calib, net, ); } @@ -595,3 +733,78 @@ fn print_table(results: &[BenchResult], prove: bool) { } } } + +#[cfg(test)] +mod tests { + use cycle_bench::stats::Stats; + + use super::{BenchResult, Calibration}; + + /// Minimal `BenchResult` carrying only the fields the calibration fit reads: + /// `user_cycles` (x) and `exec_stats.best_ms` (y). + fn point(user_cycles: u64, best_ms: f64) -> BenchResult { + BenchResult { + program: "test", + instruction: "test", + user_cycles, + segments: 1, + exec_stats: Stats::from_samples(&[best_ms]), + net_compute_ms: None, + calibrated_ms: None, + prove_stats: None, + prove_total_cycles: None, + prove_user_cycles: None, + prove_paging_cycles: None, + prove_segments: None, + } + } + + fn close(a: f64, b: f64) -> bool { + (a - b).abs() < 1e-9 + } + + #[test] + fn fit_recovers_a_known_line() { + // best_ms = 10 + 0.001 * user_cycles -> slope 1e-3, intercept 10, throughput 1000. 
+ let results = [point(1000, 11.0), point(2000, 12.0), point(3000, 13.0)]; + let cal = Calibration::fit(&results).expect("fit over three points"); + + assert!( + close(cal.slope_ms_per_cycle, 0.001), + "slope {}", + cal.slope_ms_per_cycle + ); + assert!( + close(cal.intercept_ms, 10.0), + "intercept {}", + cal.intercept_ms + ); + assert!( + close(cal.throughput_cycles_per_ms, 1000.0), + "throughput {}", + cal.throughput_cycles_per_ms, + ); + assert!(close(cal.r2, 1.0), "r2 {}", cal.r2); + assert_eq!(cal.n, 3); + // calibrated_ms is the overhead-excluded compute prediction: slope * cycles. + assert!( + close(cal.calibrated_ms(2000), 2.0), + "calib {}", + cal.calibrated_ms(2000) + ); + } + + #[test] + fn fit_needs_at_least_two_points() { + assert!(Calibration::fit(&[]).is_none()); + assert!(Calibration::fit(&[point(1000, 11.0)]).is_none()); + } + + #[test] + fn fit_with_identical_cycle_counts_returns_none() { + // Zero spread in x leaves the slope undetermined; the fit must decline rather than divide + // by zero. + let results = [point(1000, 11.0), point(1000, 12.0)]; + assert!(Calibration::fit(&results).is_none()); + } +}