From 05f41f81e97973d7ed24209c88f5606baa92dca7 Mon Sep 17 00:00:00 2001
From: Moudy <m.ellaz@hotmail.com>
Date: Thu, 14 May 2026 18:14:37 +0200
Subject: [PATCH] feat: add cycle_bench tool for executor, prove, PPE, and
 verify cycle measurements

---
 Cargo.toml                     |   1 +
 docs/benchmarks/README.md      |  11 +
 docs/benchmarks/cycle_bench.md | 117 ++++++
 tools/cycle_bench/Cargo.toml   |  29 ++
 tools/cycle_bench/README.md    |  36 ++
 tools/cycle_bench/src/main.rs  | 639 +++++++++++++++++++++++++++++++++
 tools/cycle_bench/src/ppe.rs   | 307 ++++++++++++++++
 tools/cycle_bench/src/stats.rs |  54 +++
 8 files changed, 1194 insertions(+)
 create mode 100644 docs/benchmarks/README.md
 create mode 100644 docs/benchmarks/cycle_bench.md
 create mode 100644 tools/cycle_bench/Cargo.toml
 create mode 100644 tools/cycle_bench/README.md
 create mode 100644 tools/cycle_bench/src/main.rs
 create mode 100644 tools/cycle_bench/src/ppe.rs
 create mode 100644 tools/cycle_bench/src/stats.rs

diff --git a/Cargo.toml b/Cargo.toml
index 1bce967f..a69a71dd 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -38,6 +38,7 @@ members = [
   "examples/program_deployment/methods/guest",
   "testnet_initial_state",
   "indexer/ffi",
+  "tools/cycle_bench",
 ]
 
 [workspace.dependencies]
diff --git a/docs/benchmarks/README.md b/docs/benchmarks/README.md
new file mode 100644
index 00000000..18f22225
--- /dev/null
+++ b/docs/benchmarks/README.md
@@ -0,0 +1,11 @@
+# Benchmarks
+
+Bench tools live under `tools/` with READMEs for how to run each one.
+This directory holds the result write-ups: machine, raw tables, and
+short findings.
+
+| Bench | Doc |
+|---|---|
+| cycle_bench | [cycle_bench.md](cycle_bench.md) |
+
+All numbers are from a single M2 Pro dev box unless noted otherwise.
diff --git a/docs/benchmarks/cycle_bench.md b/docs/benchmarks/cycle_bench.md
new file mode 100644
index 00000000..62db9b1d
--- /dev/null
+++ b/docs/benchmarks/cycle_bench.md
@@ -0,0 +1,117 @@
+# cycle_bench
+
+Per-program Risc0 cycle counts, prover wall time, PPE composition cost,
+and verifier wall time for the built-in LEZ programs. Inputs for the
+fee model's `G_executor`, `G_prove`, `G_verify`, and `S_agg` parameters.
+
+## Machine
+
+| Field | Value |
+|---|---|
+| Chip | Apple M2 Pro (8P+4E) |
+| RAM | 16 GB |
+| OS | macOS 15.5 |
+| Rust | 1.94.0 |
+| Risc0 zkVM | 3.0.5 |
+| Profile | release |
+| GPU acceleration | none |
+
+## Executor cycles
+
+`SessionInfo::cycles()` per instruction. Deterministic across runs. Wall time
+is `best / mean ± stdev` over 5 timed iterations (1 warmup discarded).
+
+| Program | Instruction | user_cycles | segments | exec_ms (best / mean ± stdev) |
+|---|---|---:|---:|---|
+| authenticated_transfer | Initialize | 43,642 | 1 | 18.86 / 19.41 ± 0.48 |
+| authenticated_transfer | Transfer | 77,095 | 1 | 19.67 / 20.84 ± 1.16 |
+| token | Burn | 116,546 | 1 | 24.86 / 25.46 ± 0.63 |
+| token | Mint | 116,862 | 1 | 24.47 / 25.08 ± 0.42 |
+| token | Transfer | 127,726 | 1 | 25.00 / 25.40 ± 0.29 |
+| clock | Tick (no rollups) | 137,022 | 1 | 21.18 / 21.57 ± 0.41 |
+| ata | Create | 175,056 | 1 | 23.64 / 24.94 ± 1.09 |
+| amm | SwapExactInput | 508,634 | 1 | 34.21 / 34.77 ± 0.55 |
+| amm | AddLiquidity | 642,774 | 1 | 37.59 / 37.87 ± 0.28 |
+
+## Real proving (`--prove`)
+
+`prover.prove(env, elf)` wall time per program on CPU. `total_cycles` is `user_cycles`
+plus paging/continuation overhead, rounded up to a power of two (Risc0 padding).
+
+| Program | Instruction | total_cycles | prove_ms | prove_s |
+|---|---|---:|---:|---:|
+| authenticated_transfer | Initialize | 131,072 | 11,881 | 11.9 |
+| authenticated_transfer | Transfer | 131,072 | 13,705 | 13.7 |
+| token | Burn | 262,144 | 22,893 | 22.9 |
+| token | Mint | 262,144 | 23,927 | 23.9 |
+| token | Transfer | 262,144 | 27,178 | 27.2 |
+| clock | Tick | 262,144 | 23,486 | 23.5 |
+| ata | Create | 262,144 | 21,093 | 21.1 |
+| amm | AddLiquidity | 1,048,576 | 111,654 | 111.7 |
+| amm | SwapExactInput | 1,048,576 | 126,400 | 126.4 |
+
+Linear fit across po2 buckets: ≈ 100 µs per total cycle (≈ 10k cycles/s
+throughput on this CPU).
+
+## PPE composition + chain-call sweep (`--ppe`)
+
+Same `auth_transfer Transfer` instruction, standalone vs wrapped in the
+privacy circuit; plus the `chain_caller` test program with N chained
+`authenticated_transfer` calls. `proof_bytes` is the borsh-serialized
+InnerReceipt (S_agg in the fee model).
+
+| Case | prove_ms | prove_s | proof_bytes |
+|---|---:|---:|---:|
+| auth_transfer Transfer standalone | 13,705 | 13.7 | n/a |
+| auth_transfer Transfer in PPE | 61,486 | 61.5 | 223,551 |
+| chain_caller depth=1 | 122,590 | 122.6 | 223,551 |
+| chain_caller depth=3 | 231,974 | 232.0 | 223,551 |
+| chain_caller depth=5 | 372,123 | 372.1 | 223,551 |
+| chain_caller depth=9 | 544,280 | 544.3 | 223,551 |
+
+Linear fit depth=1..9: ≈ 53 s per additional chained call, intercept ≈ 79 s.
+Composition tax (single program PPE − standalone): ≈ 48 s. `proof_bytes` is
+constant: the outer succinct proof has fixed size; the journal carried
+alongside it scales with public state and is reported separately by `--verify`.
+
+## Verifier (`--verify`)
+
+One PPE receipt generated once (auth_transfer Transfer in PPE), then
+`Receipt::verify(PRIVACY_PRESERVING_CIRCUIT_ID)` measured over 1000 iterations.
+
+| Field | Value |
+|---|---|
+| case | auth_transfer Transfer in PPE |
+| proof_bytes (S_agg) | 223,551 |
+| journal_bytes | 412 |
+| verify_ms (best / mean ± stdev, n=1000) | 11.71 / 12.06 ± 1.99 |
+
+## Findings
+
+- Proving cost scales with po2-bucketed `total_cycles`, not raw `user_cycles`.
+  Trimming user_cycles only helps if it crosses a 2^N boundary.
+- Single-program PPE composition tax on M2 Pro CPU: ≈ 48 s (61.5 − 13.7).
+- Chained-call cost is linear at ≈ 53 s per call. A max-depth chain (10) would
+  take ≈ 600 s standalone on this CPU.
+- `G_verify` is ≈ 12 ms and roughly constant per outer receipt (1000-iter
+  stdev ≈ 2 ms). The succinct outer proof is fixed at 223,551 bytes (S_agg);
+  verify is not on the latency critical path.
+
+## Reproduce
+
+```sh
+cargo run --release -p cycle_bench
+cargo run --release -p cycle_bench --features prove -- --prove
+cargo run --release -p cycle_bench --features ppe -- --prove --ppe
+cargo run --release -p cycle_bench --features ppe -- --verify --verify-iters 1000
+```
+
+JSON output: `target/cycle_bench.json`.
+
+## Caveats
+
+- CPU-only proving on a dev laptop. Production prover hardware (GPU,
+  specialised CPU pipelines) will produce much smaller numbers; relative
+  ordering should be preserved.
+- Single-segment cases only; multi-segment programs would pay continuation
+  overhead not measured here.
diff --git a/tools/cycle_bench/Cargo.toml b/tools/cycle_bench/Cargo.toml
new file mode 100644
index 00000000..6847b0c5
--- /dev/null
+++ b/tools/cycle_bench/Cargo.toml
@@ -0,0 +1,29 @@
+[package]
+name = "cycle_bench"
+version = "0.1.0"
+edition = "2024"
+license = { workspace = true }
+publish = false
+
+[lints]
+workspace = true
+
+[features]
+default = []
+prove = ["nssa/prove", "risc0-zkvm/prove"]
+ppe = ["prove"]
+
+[dependencies]
+nssa = { workspace = true }
+nssa_core = { workspace = true, features = ["host"] }
+clock_core.workspace = true
+token_core.workspace = true
+amm_core.workspace = true
+ata_core.workspace = true
+
+risc0-zkvm.workspace = true
+borsh.workspace = true
+serde.workspace = true
+serde_json.workspace = true
+anyhow.workspace = true
+clap = { workspace = true }
diff --git a/tools/cycle_bench/README.md b/tools/cycle_bench/README.md
new file mode 100644
index 00000000..2bc5462f
--- /dev/null
+++ b/tools/cycle_bench/README.md
@@ -0,0 +1,36 @@
+# cycle_bench
+
+Per-program Risc0 cycle counts, prover wall time, PPE composition cost, and
+verifier wall time for the built-in LEZ programs. Feeds the fee model
+(`G_executor`, `G_prove`, `G_verify`, `S_agg`).
+
+## Run
+
+```sh
+# Executor cycles only (fast, ~seconds)
+cargo run --release -p cycle_bench
+
+# + real proving per program (slow, ~minutes)
+cargo run --release -p cycle_bench --features prove -- --prove
+
+# + PPE composition cases (very slow, ~hour)
+cargo run --release -p cycle_bench --features ppe -- --prove --ppe
+
+# + verifier microbench (G_verify): generates one PPE receipt, times verify x1000
+cargo run --release -p cycle_bench --features ppe -- --verify --verify-iters 1000
+```
+
+`RISC0_DEV_MODE=1` skips proving entirely and is only useful for the executor path.
+Combine flags freely; output is printed to stdout and written to
+`target/cycle_bench.json` for regression diffs.
+
+## What you'll see
+
+- Per-program executor cycles and segments, plus exec wall time as
+  `best / mean ± stdev (n=N)`.
+- With `--prove`: prover total cycles, segments, and wall time (paging cycles are in the JSON only).
+- With `--ppe`: end-to-end `execute_and_prove` wall time and S_agg
+  (the borsh-serialized InnerReceipt length) for one auth-transfer-in-PPE
+  case and a chain-caller depth sweep.
+- With `--verify`: verify wall time `best / mean ± stdev`, plus
+  `proof_bytes` and `journal_bytes`.
diff --git a/tools/cycle_bench/src/main.rs b/tools/cycle_bench/src/main.rs
new file mode 100644
index 00000000..6b4c0e57
--- /dev/null
+++ b/tools/cycle_bench/src/main.rs
@@ -0,0 +1,639 @@
+//! Measures Risc0 user cycles per built-in program instruction.
+//!
+//! Runs each guest ELF through the Risc0 executor with realistic inputs drawn from the
+//! existing per-program unit tests; `--prove`, `--ppe`, and `--verify` add the (slow) prover,
+//! PPE, and verifier benches. Prints tables and writes a JSON dump for regression comparison.
+//!
+//! Run with `cargo run --release -p cycle_bench`. `RISC0_DEV_MODE` has no effect on
+//! executor cycle counts.
+
+#![allow(
+    clippy::arithmetic_side_effects,
+    clippy::print_stdout,
+    clippy::print_stderr,
+    clippy::std_instead_of_alloc,
+    clippy::std_instead_of_core,
+    reason = "Bench tool: matches test-style fixture code"
+)]
+
+use std::{path::PathBuf, time::Instant};
+
+mod ppe;
+mod stats;
+
+use stats::Stats;
+
+use amm_core::{
+    PoolDefinition, compute_liquidity_token_pda, compute_pool_pda, compute_vault_pda,
+};
+use anyhow::Result;
+use ata_core::{compute_ata_seed, get_associated_token_account_id};
+use clap::Parser;
+use clock_core::{
+    CLOCK_01_PROGRAM_ACCOUNT_ID, CLOCK_10_PROGRAM_ACCOUNT_ID, CLOCK_50_PROGRAM_ACCOUNT_ID,
+    ClockAccountData,
+};
+use nssa::program_methods::{
+    AMM_ELF, ASSOCIATED_TOKEN_ACCOUNT_ELF, AUTHENTICATED_TRANSFER_ELF, CLOCK_ELF, TOKEN_ELF,
+};
+use nssa_core::{
+    Timestamp,
+    account::{Account, AccountId, AccountWithMetadata, Data},
+    program::{InstructionData, ProgramId},
+};
+use risc0_zkvm::{ExecutorEnv, default_executor, default_prover};
+use serde::Serialize;
+use token_core::{TokenDefinition, TokenHolding};
+
+#[derive(Parser, Debug)]
+#[command(about = "Per-program executor and (optionally) prover cycle measurements")]
+struct Cli {
+    /// Also run prover.prove for each case and report wall time + cycles. Slow.
+    #[arg(long)]
+    prove: bool,
+
+    /// Also run privacy-preserving execution circuit (PPE) composition cases:
+    /// (a) single auth_transfer Transfer through `execute_and_prove`, (b) chain_caller
+    /// with depth N=1,3,5,9. Requires --features ppe at build time. Very slow.
+    #[arg(long)]
+    ppe: bool,
+
+    /// After running --ppe-style proving once for auth_transfer-in-PPE, time
+    /// receipt.verify(PRIVACY_PRESERVING_CIRCUIT_ID) over many iterations.
+    /// Produces G_verify for the fee model. Requires --features ppe.
+    #[arg(long)]
+    verify: bool,
+
+    /// Iterations for --verify. Default matches the fee-model handoff target.
+    #[arg(long, default_value_t = 1000)]
+    verify_iters: usize,
+
+    /// Iterations for executor wall-time sampling per case. First iter is
+    /// discarded as warmup, remaining N feed the stats.
+    #[arg(long, default_value_t = 5)]
+    exec_iters: usize,
+}
+
+const AMM_PROGRAM_ID: ProgramId = [42; 8];
+const TOKEN_PROGRAM_ID: ProgramId = [15; 8];
+const ATA_PROGRAM_ID: ProgramId = [88; 8];
+const CLOCK_PROGRAM_ID: ProgramId = [13; 8];
+const AUTH_TRANSFER_PROGRAM_ID: ProgramId = [7; 8];
+
+#[derive(Debug, Serialize)]
+struct BenchResult {
+    program: &'static str,
+    instruction: &'static str,
+    user_cycles: u64,
+    segments: usize,
+    exec_stats: Stats,
+    /// Stats over prover.prove(env, elf) wall-clock samples. Only populated when --prove is set.
+    /// Always a single sample (n=1): `run_case` proves each case once, since proving is slow.
+    prove_stats: Option<Stats>,
+    /// Total cycles (with continuation overhead, paging, po2 padding) from ProveInfo.stats.
+    prove_total_cycles: Option<u64>,
+    /// User cycles from ProveInfo.stats (should match executor cycles).
+    prove_user_cycles: Option<u64>,
+    /// Paging cycles from ProveInfo.stats.
+    prove_paging_cycles: Option<u64>,
+    /// Segments from ProveInfo.stats.
+    prove_segments: Option<usize>,
+}
+
+fn run_case<I: Serialize>(
+    program: &'static str,
+    instruction_label: &'static str,
+    elf: &[u8],
+    self_program_id: ProgramId,
+    pre_states: Vec<AccountWithMetadata>,
+    instruction: &I,
+    prove: bool,
+    exec_iters: usize,
+) -> Result<BenchResult> {
+    let caller_program_id: Option<ProgramId> = None;
+    let instruction_words: InstructionData = risc0_zkvm::serde::to_vec(instruction)?;
+
+    // One warmup pass discarded, then `exec_iters` timed samples (`.max(2)` guarantees at
+    // least one sample even for `exec_iters == 0`). The executor has large per-call setup
+    // overhead (ELF parsing, env init); best-of-N plus mean ± stdev shows the jitter.
+    let mut samples: Vec<f64> = Vec::with_capacity(exec_iters);
+    let mut last_info = None;
+    let total = exec_iters.saturating_add(1).max(2);
+    for iter in 0..total {
+        let mut env_builder = ExecutorEnv::builder();
+        env_builder
+            .write(&self_program_id)?
+            .write(&caller_program_id)?
+            .write(&pre_states)?
+            .write(&instruction_words)?;
+        let env = env_builder.build()?;
+
+        let started = Instant::now();
+        let info = default_executor().execute(env, elf)?;
+        let elapsed_ms = started.elapsed().as_secs_f64() * 1_000.0;
+
+        if iter > 0 {
+            samples.push(elapsed_ms);
+        }
+        last_info = Some(info);
+    }
+    let info = last_info.expect("at least one iteration");
+    let exec_stats = Stats::from_samples(&samples);
+
+    let mut prove_stats = None;
+    let mut prove_total_cycles = None;
+    let mut prove_user_cycles = None;
+    let mut prove_paging_cycles = None;
+    let mut prove_segments = None;
+    if prove {
+        let mut env_builder = ExecutorEnv::builder();
+        env_builder
+            .write(&self_program_id)?
+            .write(&caller_program_id)?
+            .write(&pre_states)?
+            .write(&instruction_words)?;
+        let env = env_builder.build()?;
+
+        let started = Instant::now();
+        let prove_info = default_prover()
+            .prove(env, elf)
+            .map_err(|e| anyhow::anyhow!("prove failed: {e}"))?;
+        let prove_ms = started.elapsed().as_secs_f64() * 1_000.0;
+        prove_stats = Some(Stats::from_samples(&[prove_ms]));
+        prove_total_cycles = Some(prove_info.stats.total_cycles);
+        prove_user_cycles = Some(prove_info.stats.user_cycles);
+        prove_paging_cycles = Some(prove_info.stats.paging_cycles);
+        prove_segments = Some(prove_info.stats.segments);
+        eprintln!(
+            "  prove({program}/{instruction_label}): {prove_ms:.1} ms ({:.1}s), total_cycles={}, segments={}",
+            prove_ms / 1_000.0, prove_info.stats.total_cycles, prove_info.stats.segments,
+        );
+    }
+
+    Ok(BenchResult {
+        program,
+        instruction: instruction_label,
+        user_cycles: info.cycles(),
+        segments: info.segments.len(),
+        exec_stats,
+        prove_stats,
+        prove_total_cycles,
+        prove_user_cycles,
+        prove_paging_cycles,
+        prove_segments,
+    })
+}
+
+fn authenticated_transfer_init() -> Vec<AccountWithMetadata> {
+    vec![AccountWithMetadata {
+        account: Account::default(),
+        is_authorized: true,
+        account_id: AccountId::new([1; 32]),
+    }]
+}
+
+fn authenticated_transfer_transfer() -> Vec<AccountWithMetadata> {
+    let sender = AccountWithMetadata {
+        account: Account {
+            balance: 1_000_000,
+            ..Account::default()
+        },
+        is_authorized: true,
+        account_id: AccountId::new([1; 32]),
+    };
+    let recipient = AccountWithMetadata {
+        account: Account::default(),
+        is_authorized: false,
+        account_id: AccountId::new([2; 32]),
+    };
+    vec![sender, recipient]
+}
+
+fn token_holding(
+    definition_id: AccountId,
+    account_id: AccountId,
+    balance: u128,
+    is_authorized: bool,
+) -> AccountWithMetadata {
+    AccountWithMetadata {
+        account: Account {
+            program_owner: TOKEN_PROGRAM_ID,
+            balance: 0,
+            data: Data::from(&TokenHolding::Fungible {
+                definition_id,
+                balance,
+            }),
+            nonce: 0_u128.into(),
+        },
+        is_authorized,
+        account_id,
+    }
+}
+
+fn token_definition(
+    account_id: AccountId,
+    total_supply: u128,
+    is_authorized: bool,
+) -> AccountWithMetadata {
+    AccountWithMetadata {
+        account: Account {
+            program_owner: TOKEN_PROGRAM_ID,
+            balance: 0,
+            data: Data::from(&TokenDefinition::Fungible {
+                name: String::from("test"),
+                total_supply,
+                metadata_id: None,
+            }),
+            nonce: 0_u128.into(),
+        },
+        is_authorized,
+        account_id,
+    }
+}
+
+fn token_transfer_pre_states() -> Vec<AccountWithMetadata> {
+    let def = AccountId::new([15; 32]);
+    let sender = token_holding(def, AccountId::new([17; 32]), 100_000, true);
+    let recipient = token_holding(def, AccountId::new([42; 32]), 50_000, true);
+    vec![sender, recipient]
+}
+
+fn token_mint_pre_states() -> Vec<AccountWithMetadata> {
+    let def_id = AccountId::new([15; 32]);
+    let def = token_definition(def_id, 100_000, true);
+    let holding = token_holding(def_id, AccountId::new([17; 32]), 1_000, true);
+    vec![def, holding]
+}
+
+fn token_burn_pre_states() -> Vec<AccountWithMetadata> {
+    // Burn consumes exactly the fixture Mint uses: the 100_000-supply definition at
+    // [15; 32] followed by the authorized 1_000-balance holding at [17; 32].
+    // Delegating keeps the two cases from drifting apart.
+    token_mint_pre_states()
+}
+
+fn clock_account(account_id: AccountId, block_id: u64) -> AccountWithMetadata {
+    AccountWithMetadata {
+        account: Account {
+            program_owner: CLOCK_PROGRAM_ID,
+            balance: 0,
+            data: ClockAccountData {
+                block_id,
+                timestamp: Timestamp::from(0_u64),
+            }
+            .to_bytes()
+            .try_into()
+            .expect("ClockAccountData should fit in account data"),
+            nonce: 0_u128.into(),
+        },
+        is_authorized: false,
+        account_id,
+    }
+}
+
+fn clock_pre_states_tick_at(block_id: u64) -> Vec<AccountWithMetadata> {
+    vec![
+        clock_account(CLOCK_01_PROGRAM_ACCOUNT_ID, block_id),
+        clock_account(CLOCK_10_PROGRAM_ACCOUNT_ID, block_id),
+        clock_account(CLOCK_50_PROGRAM_ACCOUNT_ID, block_id),
+    ]
+}
+
+fn amm_token_a_def_id() -> AccountId {
+    AccountId::new([42; 32])
+}
+fn amm_token_b_def_id() -> AccountId {
+    AccountId::new([43; 32])
+}
+fn amm_pool_id() -> AccountId {
+    compute_pool_pda(AMM_PROGRAM_ID, amm_token_a_def_id(), amm_token_b_def_id())
+}
+fn amm_vault_a_id() -> AccountId {
+    compute_vault_pda(AMM_PROGRAM_ID, amm_pool_id(), amm_token_a_def_id())
+}
+fn amm_vault_b_id() -> AccountId {
+    compute_vault_pda(AMM_PROGRAM_ID, amm_pool_id(), amm_token_b_def_id())
+}
+fn amm_lp_def_id() -> AccountId {
+    compute_liquidity_token_pda(AMM_PROGRAM_ID, amm_pool_id())
+}
+
+/// Pool seeded with reserves 1_000 / 500, lp supply sqrt(1000*500) = 707.
+fn amm_pool_account() -> AccountWithMetadata {
+    let reserve_a: u128 = 1_000;
+    let reserve_b: u128 = 500;
+    let lp_supply: u128 = (reserve_a * reserve_b).isqrt();
+    AccountWithMetadata {
+        account: Account {
+            program_owner: AMM_PROGRAM_ID,
+            balance: 0,
+            data: Data::from(&PoolDefinition {
+                definition_token_a_id: amm_token_a_def_id(),
+                definition_token_b_id: amm_token_b_def_id(),
+                vault_a_id: amm_vault_a_id(),
+                vault_b_id: amm_vault_b_id(),
+                liquidity_pool_id: amm_lp_def_id(),
+                liquidity_pool_supply: lp_supply,
+                reserve_a,
+                reserve_b,
+                fees: 0,
+                active: true,
+            }),
+            nonce: 0_u128.into(),
+        },
+        is_authorized: true,
+        account_id: amm_pool_id(),
+    }
+}
+
+fn amm_swap_pre_states() -> Vec<AccountWithMetadata> {
+    let pool = amm_pool_account();
+    let vault_a = token_holding(amm_token_a_def_id(), amm_vault_a_id(), 1_000, true);
+    let vault_b = token_holding(amm_token_b_def_id(), amm_vault_b_id(), 500, true);
+    let user_a = token_holding(amm_token_a_def_id(), AccountId::new([45; 32]), 1_000, true);
+    let user_b = token_holding(amm_token_b_def_id(), AccountId::new([46; 32]), 500, false);
+    vec![pool, vault_a, vault_b, user_a, user_b]
+}
+
+fn amm_add_liquidity_pre_states() -> Vec<AccountWithMetadata> {
+    let pool = amm_pool_account();
+    let vault_a = token_holding(amm_token_a_def_id(), amm_vault_a_id(), 1_000, true);
+    let vault_b = token_holding(amm_token_b_def_id(), amm_vault_b_id(), 500, true);
+    let lp_supply: u128 = (1_000_u128 * 500_u128).isqrt();
+    let lp_def = token_definition(amm_lp_def_id(), lp_supply, true);
+    let user_a = token_holding(amm_token_a_def_id(), AccountId::new([45; 32]), 1_000, true);
+    let user_b = token_holding(amm_token_b_def_id(), AccountId::new([46; 32]), 500, true);
+    let user_lp = token_holding(amm_lp_def_id(), AccountId::new([47; 32]), 0, true);
+    vec![pool, vault_a, vault_b, lp_def, user_a, user_b, user_lp]
+}
+
+fn ata_create_pre_states() -> Vec<AccountWithMetadata> {
+    let owner_id = AccountId::new([91; 32]);
+    let definition_id = AccountId::new([15; 32]);
+    let owner = AccountWithMetadata {
+        account: Account::default(),
+        is_authorized: true,
+        account_id: owner_id,
+    };
+    let token_def = token_definition(definition_id, 100_000, false);
+    let seed = compute_ata_seed(owner_id, definition_id);
+    let ata_id = get_associated_token_account_id(&ATA_PROGRAM_ID, &seed);
+    let ata_account = AccountWithMetadata {
+        account: Account::default(),
+        is_authorized: false,
+        account_id: ata_id,
+    };
+    vec![owner, token_def, ata_account]
+}
+
+fn main() -> Result<()> {
+    let cli = Cli::parse();
+    let prove = cli.prove;
+    let exec_iters = cli.exec_iters.max(1);
+    if prove {
+        eprintln!("cycle_bench: prove mode ON, this will be slow (~minutes per program)");
+    }
+
+    let mut results: Vec<BenchResult> = Vec::new();
+
+    let transfer_amount: u128 = 5_000;
+    results.push(run_case(
+        "authenticated_transfer",
+        "Transfer",
+        AUTHENTICATED_TRANSFER_ELF,
+        AUTH_TRANSFER_PROGRAM_ID,
+        authenticated_transfer_transfer(),
+        &transfer_amount,
+        prove,
+        exec_iters,
+    )?);
+    let init_amount: u128 = 0;
+    results.push(run_case(
+        "authenticated_transfer",
+        "Initialize",
+        AUTHENTICATED_TRANSFER_ELF,
+        AUTH_TRANSFER_PROGRAM_ID,
+        authenticated_transfer_init(),
+        &init_amount,
+        prove,
+        exec_iters,
+    )?);
+
+    results.push(run_case(
+        "token",
+        "Transfer",
+        TOKEN_ELF,
+        TOKEN_PROGRAM_ID,
+        token_transfer_pre_states(),
+        &token_core::Instruction::Transfer {
+            amount_to_transfer: 5_000,
+        },
+        prove,
+        exec_iters,
+    )?);
+    results.push(run_case(
+        "token",
+        "Mint",
+        TOKEN_ELF,
+        TOKEN_PROGRAM_ID,
+        token_mint_pre_states(),
+        &token_core::Instruction::Mint {
+            amount_to_mint: 5_000,
+        },
+        prove,
+        exec_iters,
+    )?);
+    results.push(run_case(
+        "token",
+        "Burn",
+        TOKEN_ELF,
+        TOKEN_PROGRAM_ID,
+        token_burn_pre_states(),
+        &token_core::Instruction::Burn {
+            amount_to_burn: 500,
+        },
+        prove,
+        exec_iters,
+    )?);
+
+    let clock_timestamp = Timestamp::from(1_700_000_000_u64);
+    results.push(run_case(
+        "clock",
+        "Tick (block_id+1, no multiples)",
+        CLOCK_ELF,
+        CLOCK_PROGRAM_ID,
+        clock_pre_states_tick_at(0),
+        &clock_timestamp,
+        prove,
+        exec_iters,
+    )?);
+
+    results.push(run_case(
+        "amm",
+        "SwapExactInput",
+        AMM_ELF,
+        AMM_PROGRAM_ID,
+        amm_swap_pre_states(),
+        &amm_core::Instruction::SwapExactInput {
+            swap_amount_in: 200,
+            min_amount_out: 1,
+            token_definition_id_in: amm_token_a_def_id(),
+        },
+        prove,
+        exec_iters,
+    )?);
+    results.push(run_case(
+        "amm",
+        "AddLiquidity",
+        AMM_ELF,
+        AMM_PROGRAM_ID,
+        amm_add_liquidity_pre_states(),
+        &amm_core::Instruction::AddLiquidity {
+            min_amount_liquidity: 1,
+            max_amount_to_add_token_a: 400,
+            max_amount_to_add_token_b: 200,
+        },
+        prove,
+        exec_iters,
+    )?);
+
+    results.push(run_case(
+        "ata",
+        "Create",
+        ASSOCIATED_TOKEN_ACCOUNT_ELF,
+        ATA_PROGRAM_ID,
+        ata_create_pre_states(),
+        &ata_core::Instruction::Create {
+            ata_program_id: ATA_PROGRAM_ID,
+        },
+        prove,
+        exec_iters,
+    )?);
+
+    print_table(&results, prove);
+
+    #[cfg(feature = "ppe")]
+    let ppe_results = if cli.ppe {
+        ppe::run_all()?
+    } else {
+        Vec::new()
+    };
+    #[cfg(not(feature = "ppe"))]
+    let ppe_results: Vec<ppe::PpeBenchResult> = {
+        if cli.ppe {
+            eprintln!("cycle_bench: --ppe requires --features ppe at build time. Ignoring.");
+        }
+        Vec::new()
+    };
+    if !ppe_results.is_empty() {
+        ppe::print_table(&ppe_results);
+    }
+
+    #[cfg(feature = "ppe")]
+    let verify_result = if cli.verify {
+        Some(ppe::run_verify(cli.verify_iters)?)
+    } else {
+        None
+    };
+    #[cfg(not(feature = "ppe"))]
+    let verify_result: Option<ppe::VerifyBenchResult> = {
+        if cli.verify {
+            eprintln!("cycle_bench: --verify requires --features ppe at build time. Ignoring.");
+        }
+        None
+    };
+    if let Some(ref vr) = verify_result {
+        ppe::print_verify(vr);
+    }
+
+    let workspace_root = PathBuf::from(env!("CARGO_MANIFEST_DIR"))
+        .join("..")
+        .join("..")
+        .canonicalize()?;
+    let out_path = workspace_root.join("target").join("cycle_bench.json");
+    if let Some(parent) = out_path.parent() {
+        std::fs::create_dir_all(parent)?;
+    }
+    let combined = serde_json::json!({
+        "standalone": results,
+        "ppe": ppe_results,
+        "verify": verify_result,
+    });
+    std::fs::write(&out_path, serde_json::to_string_pretty(&combined)?)?;
+    println!("\nJSON written to {}", out_path.display());
+
+    Ok(())
+}
+
+/// Prints the executor table to stdout and, when `prove` is set, a second prover table
+/// in which cases without prover data are rendered as `-`.
+fn print_table(results: &[BenchResult], prove: bool) {
+    let pw = results
+        .iter()
+        .map(|r| r.program.len())
+        .max()
+        .unwrap_or(0)
+        .max("program".len());
+    let iw = results
+        .iter()
+        .map(|r| r.instruction.len())
+        .max()
+        .unwrap_or(0)
+        .max("instruction".len());
+    let cw = 12_usize;
+    let sw = 8_usize;
+    let exec_w = results
+        .iter()
+        .map(|r| r.exec_stats.format().len())
+        .max()
+        .unwrap_or(0)
+        .max("exec_ms (best / mean ± stdev)".len());
+
+    println!(
+        "{:<pw$}  {:<iw$}  {:>cw$}  {:>sw$}  {:<exec_w$}",
+        "program",
+        "instruction",
+        "user_cycles",
+        "segments",
+        "exec_ms (best / mean ± stdev)",
+    );
+    println!("{}", "-".repeat(pw + iw + cw + sw + exec_w + 8));
+    for r in results {
+        println!(
+            "{:<pw$}  {:<iw$}  {:>cw$}  {:>sw$}  {:<exec_w$}",
+            r.program,
+            r.instruction,
+            r.user_cycles,
+            r.segments,
+            r.exec_stats.format(),
+        );
+    }
+
+    if prove {
+        println!("\nprove():");
+        let pcw = 14_usize;
+        let pwallw = 24_usize;
+        let psw = 10_usize;
+        println!(
+            "{:<pw$}  {:<iw$}  {:>pcw$}  {:>pwallw$}  {:>psw$}",
+            "program", "instruction", "prove_total_c", "prove_ms (s)", "prove_segs",
+        );
+        println!("{}", "-".repeat(pw + iw + pcw + pwallw + psw + 8));
+        for r in results {
+            let total = r
+                .prove_total_cycles
+                .map_or_else(|| "-".to_owned(), |c| c.to_string());
+            let pms = r.prove_stats.as_ref().map_or_else(
+                || "-".to_owned(),
+                |s| format!("{:.1} ({:.1}s)", s.best_ms, s.best_ms / 1_000.0),
+            );
+            let psegs = r
+                .prove_segments
+                .map_or_else(|| "-".to_owned(), |s| s.to_string());
+            println!(
+                "{:<pw$}  {:<iw$}  {:>pcw$}  {:>pwallw$}  {:>psw$}",
+                r.program, r.instruction, total, pms, psegs,
+            );
+        }
+    }
+}
diff --git a/tools/cycle_bench/src/ppe.rs b/tools/cycle_bench/src/ppe.rs
new file mode 100644
index 00000000..f83f38bd
--- /dev/null
+++ b/tools/cycle_bench/src/ppe.rs
@@ -0,0 +1,307 @@
+//! Privacy-preserving execution (PPE) cases for cycle_bench.
+//!
+//! Composition cost is the delta between standalone `prover.prove(env, elf)` for
+//! a single program (measured in the main bench) and a full `execute_and_prove`
+//! that wraps the same program in the privacy circuit. Chained-call depth sweep
+//! uses the `chain_caller` test program (loaded from artifacts/) with N=1, 3, 5, 9.
+//!
+//! `run_verify` produces G_verify for the fee model: it generates one PPE
+//! receipt (auth_transfer Transfer in PPE) and times `Receipt::verify` over
+//! `iters` iterations. The proof bytes captured here are also the on-wire
+//! "outer proof" payload (S_agg in the fee model).
+
+#![allow(
+    dead_code,
+    reason = "Stubs are used when the `ppe` feature is disabled."
+)]
+
+use anyhow::Result;
+use serde::Serialize;
+
+use crate::stats::Stats;
+
+#[derive(Debug, Serialize, Clone)]
+pub struct PpeBenchResult { // one PPE prove case: timing, proof size, or the failure
+    pub label: String, // case name shown in the first table column
+    pub chain_depth: usize, // 0 for the auth_transfer case, N for `chain_caller depth=N`
+    pub prove_wall_ms: Option<f64>, // wall time around the prove call; None on failure
+    /// borsh-serialized InnerReceipt length (S_agg in the fee model).
+    pub proof_bytes: Option<usize>,
+    pub error: Option<String>, // stringified prove error; None on success
+}
+
+#[derive(Debug, Serialize, Clone)]
+pub struct VerifyBenchResult { // outcome of the `Receipt::verify` timing run
+    pub label: String, // name of the case whose receipt was verified
+    pub stats: Stats, // best / mean / stdev over the timed verify calls
+    pub proof_bytes: usize, // byte length of `proof.into_inner()`
+    pub journal_bytes: usize, // byte length of the circuit output's `to_bytes()`
+}
+
+/// Stub used when the `ppe` feature is off: reports no PPE cases.
+#[cfg(not(feature = "ppe"))]
+pub fn run_all() -> Result<Vec<PpeBenchResult>> {
+    Ok(vec![])
+}
+
+/// Proves the auth_transfer composition case, then `chain_caller` at depths
+/// 1, 3, 5 and 9, announcing each case on stderr before it starts.
+#[cfg(feature = "ppe")]
+pub fn run_all() -> Result<Vec<PpeBenchResult>> {
+    eprintln!("PPE: running composition cost (auth_transfer Transfer in PPE)");
+    let mut cases = vec![ppe_impl::run_auth_transfer_in_ppe()];
+
+    cases.extend([1_u32, 3, 5, 9].into_iter().map(|depth| {
+        eprintln!("PPE: running chain_caller depth={depth}");
+        ppe_impl::run_chain_caller(depth)
+    }));
+    Ok(cases)
+}
+
+#[cfg(not(feature = "ppe"))] // stub: without the `ppe` feature this always errors
+pub fn run_verify(_iters: usize) -> Result<VerifyBenchResult> {
+    anyhow::bail!("--verify requires --features ppe at build time")
+}
+
+#[cfg(feature = "ppe")] // real build: forwards to `ppe_impl::run_verify`
+pub fn run_verify(iters: usize) -> Result<VerifyBenchResult> {
+    ppe_impl::run_verify(iters)
+}
+
+/// Prints the PPE results as an aligned text table on stdout.
+///
+/// The label column is as wide as the longest label (never narrower than its
+/// heading). Missing timings and proof sizes print as "-", a missing error as
+/// an empty cell.
+pub fn print_table(results: &[PpeBenchResult]) {
+    const HEADING: &str = "label";
+    let label_width = results
+        .iter()
+        .fold(HEADING.len(), |widest, row| widest.max(row.label.len()));
+
+    println!(
+        "\n{:<label_width$}  {:>5}  {:>20}  {:>12}  {}",
+        HEADING, "depth", "prove_ms (s)", "proof_bytes", "error",
+    );
+
+    // Rule under the heading row.
+    println!("{}", "-".repeat(label_width + 60));
+    for row in results {
+        let prove_cell = match row.prove_wall_ms {
+            Some(ms) => format!("{ms:.1} ({:.1}s)", ms / 1_000.0),
+            None => "-".to_owned(),
+        };
+        let bytes_cell = row.proof_bytes.map_or_else(|| "-".to_owned(), |n| n.to_string());
+        let error_cell = row.error.as_deref().unwrap_or_default();
+        println!(
+            "{:<label_width$}  {:>5}  {:>20}  {:>12}  {}",
+            row.label, row.chain_depth, prove_cell, bytes_cell, error_cell,
+        );
+    }
+}
+
+/// Prints the verify benchmark summary (case, proof/journal sizes, timing) to stdout.
+pub fn print_verify(r: &VerifyBenchResult) {
+    let (label, timing) = (&r.label, r.stats.format());
+    println!("\nVerify (G_verify):\n  case          : {label}");
+    println!("  proof_bytes   : {} (borsh InnerReceipt, S_agg)", r.proof_bytes);
+    println!("  journal_bytes : {}\n  verify_ms     : {timing}", r.journal_bytes);
+}
+
+#[cfg(feature = "ppe")]
+mod ppe_impl {
+    use std::{collections::HashMap, time::Instant};
+
+    use nssa::{
+        execute_and_prove,
+        privacy_preserving_transaction::circuit::{Proof, ProgramWithDependencies},
+        program::Program,
+        program_methods::PRIVACY_PRESERVING_CIRCUIT_ID,
+    };
+    use nssa_core::{
+        InputAccountIdentity, PrivacyPreservingCircuitOutput,
+        account::{Account, AccountId, AccountWithMetadata},
+        program::ProgramId,
+    };
+    use risc0_zkvm::{InnerReceipt, Receipt, serde::to_vec};
+
+    use super::{PpeBenchResult, VerifyBenchResult};
+    use crate::stats::Stats;
+
+    const AUTH_TRANSFER_ID: ProgramId =
+        nssa::program_methods::AUTHENTICATED_TRANSFER_ID;
+    const AUTH_TRANSFER_ELF: &[u8] = nssa::program_methods::AUTHENTICATED_TRANSFER_ELF;
+
+    /// chain_caller bytecode shipped at artifacts/test_program_methods/chain_caller.bin.
+    /// Loaded at compile time so we don't need a dev-dependency on test_program_methods.
+    const CHAIN_CALLER_ELF: &[u8] =
+        include_bytes!("../../../artifacts/test_program_methods/chain_caller.bin");
+
+    /// Proves one auth_transfer Transfer inside the privacy circuit and reports
+    /// wall time and proof size. A failure is recorded in `error`, not returned.
+    ///
+    /// The timer wraps the whole `prove_auth_transfer_in_ppe` call, so building
+    /// the inputs counts towards `prove_wall_ms`.
+    pub fn run_auth_transfer_in_ppe() -> PpeBenchResult {
+        let started = Instant::now();
+        let outcome = prove_auth_transfer_in_ppe();
+        let elapsed_ms = started.elapsed().as_secs_f64() * 1_000.0;
+        let (prove_wall_ms, proof_bytes, error) = match outcome {
+            Ok((_out, proof)) => (Some(elapsed_ms), Some(proof.into_inner().len()), None),
+            // `{:#}` prints the whole anyhow cause chain; `to_string()` would keep
+            // only the outermost message and hide the root cause.
+            Err(err) => (None, None, Some(format!("{err:#}"))),
+        };
+        PpeBenchResult {
+            label: "auth_transfer Transfer in PPE".to_owned(),
+            chain_depth: 0,
+            prove_wall_ms,
+            proof_bytes,
+            error,
+        }
+    }
+
+    /// Builds the inputs for a single auth_transfer Transfer of 5_000 between two
+    /// public accounts and runs them through `execute_and_prove`, returning the
+    /// circuit output together with the proof.
+    fn prove_auth_transfer_in_ppe()
+    -> anyhow::Result<(PrivacyPreservingCircuitOutput, Proof)> {
+        let program_with_deps =
+            ProgramWithDependencies::from(Program::new(AUTH_TRANSFER_ELF.to_vec())?);
+
+        let pre_states = vec![
+            // Sender: already claimed by auth_transfer, which PPE requires before
+            // this program may decrement the account's balance.
+            AccountWithMetadata {
+                account: Account {
+                    program_owner: AUTH_TRANSFER_ID,
+                    balance: 1_000_000,
+                    ..Account::default()
+                },
+                is_authorized: true,
+                account_id: AccountId::new([1; 32]),
+            },
+            // Recipient: stays default-owned so the first call can claim it.
+            AccountWithMetadata {
+                account: Account::default(),
+                is_authorized: true,
+                account_id: AccountId::new([2; 32]),
+            },
+        ];
+
+        let amount: u128 = 5_000;
+        let instruction_data = to_vec(&amount)?;
+        let account_identities: Vec<_> =
+            pre_states.iter().map(|_| InputAccountIdentity::Public).collect();
+
+        execute_and_prove(pre_states, instruction_data, account_identities, &program_with_deps)
+            .map_err(anyhow::Error::from)
+    }
+
+    /// Proves `chain_caller` with `depth` chained calls inside the privacy circuit
+    /// and reports wall time and proof size. A failure is recorded in `error`.
+    ///
+    /// The timer wraps the whole `prove_chain_caller` call, so building the
+    /// inputs counts towards `prove_wall_ms`.
+    pub fn run_chain_caller(depth: u32) -> PpeBenchResult {
+        let started = Instant::now();
+        let outcome = prove_chain_caller(depth);
+        let elapsed_ms = started.elapsed().as_secs_f64() * 1_000.0;
+        let (prove_wall_ms, proof_bytes, error) = match outcome {
+            Ok((_out, proof)) => (Some(elapsed_ms), Some(proof.into_inner().len()), None),
+            // `{:#}` prints the whole anyhow cause chain; `to_string()` would keep
+            // only the outermost message and hide the root cause.
+            Err(err) => (None, None, Some(format!("{err:#}"))),
+        };
+        PpeBenchResult {
+            label: format!("chain_caller depth={depth}"),
+            chain_depth: depth as usize,
+            prove_wall_ms,
+            proof_bytes,
+            error,
+        }
+    }
+
+    /// Builds the `chain_caller` inputs (auth_transfer registered as its only
+    /// dependency, instruction tuple `(1, AUTH_TRANSFER_ID, num_chain_calls, None)`)
+    /// and runs them through `execute_and_prove`, returning the circuit output
+    /// together with the proof.
+    fn prove_chain_caller(
+        num_chain_calls: u32,
+    ) -> anyhow::Result<(PrivacyPreservingCircuitOutput, Proof)> {
+        let entry_program = Program::new(CHAIN_CALLER_ELF.to_vec())?;
+        let callee = Program::new(AUTH_TRANSFER_ELF.to_vec())?;
+        let dependencies = HashMap::from([(AUTH_TRANSFER_ID, callee)]);
+        let program_with_deps = ProgramWithDependencies::new(entry_program, dependencies);
+
+        // Both accounts start out claimed by auth_transfer: chain_caller doesn't
+        // track the recipient's post-claim program_owner, so a default recipient
+        // would cause a state mismatch on subsequent chained calls.
+        // chain_caller expects pre_states = [recipient, sender], in that order.
+        let pre_states = vec![
+            AccountWithMetadata {
+                account: Account {
+                    program_owner: AUTH_TRANSFER_ID,
+                    ..Account::default()
+                },
+                is_authorized: true,
+                account_id: AccountId::new([2; 32]),
+            },
+            AccountWithMetadata {
+                account: Account {
+                    program_owner: AUTH_TRANSFER_ID,
+                    balance: 1_000_000,
+                    ..Account::default()
+                },
+                is_authorized: true,
+                account_id: AccountId::new([1; 32]),
+            },
+        ];
+
+        let amount: u128 = 1;
+        let no_pda_seed: Option<nssa_core::program::PdaSeed> = None;
+        let instruction_data =
+            to_vec(&(amount, AUTH_TRANSFER_ID, num_chain_calls, no_pda_seed))?;
+        let account_identities: Vec<_> =
+            pre_states.iter().map(|_| InputAccountIdentity::Public).collect();
+
+        execute_and_prove(pre_states, instruction_data, account_identities, &program_with_deps)
+            .map_err(anyhow::Error::from)
+    }
+
+    /// Times `Receipt::verify` on one freshly proved auth_transfer PPE receipt.
+    ///
+    /// Proves once, rebuilds a `Receipt` from the proof bytes and the journal,
+    /// verifies it once untimed, then records `iters` timed verifications in ms.
+    pub fn run_verify(iters: usize) -> anyhow::Result<VerifyBenchResult> {
+        eprintln!("verify: generating PPE receipt for auth_transfer Transfer (~1 prove)");
+        let (output, proof) = prove_auth_transfer_in_ppe()?;
+        let (journal, encoded_proof) = (output.to_bytes(), proof.into_inner());
+        let (journal_bytes, proof_bytes) = (journal.len(), encoded_proof.len());
+
+        let receipt = borsh::from_slice::<InnerReceipt>(&encoded_proof)
+            .map(|inner| Receipt::new(inner, journal))
+            .map_err(|e| anyhow::anyhow!("InnerReceipt deserialize: {e}"))?;
+        // Untimed sanity check: a bad receipt fails here, before any timing starts.
+        receipt
+            .verify(PRIVACY_PRESERVING_CIRCUIT_ID)
+            .map_err(|e| anyhow::anyhow!("verify sanity check failed: {e}"))?;
+
+        eprintln!("verify: timing {iters} iters of receipt.verify(...)");
+        let mut samples = Vec::with_capacity(iters);
+        for _ in 0..iters {
+            let tick = Instant::now();
+            let verdict = receipt.verify(PRIVACY_PRESERVING_CIRCUIT_ID);
+            let elapsed_ms = tick.elapsed().as_secs_f64() * 1_000.0;
+            verdict.map_err(|e| anyhow::anyhow!("verify failed mid-loop: {e}"))?;
+            samples.push(elapsed_ms);
+        }
+
+        Ok(VerifyBenchResult {
+            label: "auth_transfer Transfer in PPE".to_owned(),
+            stats: Stats::from_samples(&samples),
+            proof_bytes,
+            journal_bytes,
+        })
+    }
+}
diff --git a/tools/cycle_bench/src/stats.rs b/tools/cycle_bench/src/stats.rs
new file mode 100644
index 00000000..b1e45b56
--- /dev/null
+++ b/tools/cycle_bench/src/stats.rs
@@ -0,0 +1,54 @@
+//! Small helper for best / mean / stdev over wall-time samples.
+//!
+//! We report both best-of-N (the figure that strips OS noise and matches what most
+//! bench READMEs print) and mean +/- stdev (the figure the fee model wants, since
+//! it cares about the steady-state cost not a single fastest sample).
+
+use serde::Serialize;
+
+#[derive(Debug, Serialize, Clone, Copy, Default)]
+pub struct Stats { // summary of one series of wall-time samples
+    pub n: usize, // number of samples
+    pub best_ms: f64, // smallest sample
+    pub mean_ms: f64, // arithmetic mean of the samples
+    pub stdev_ms: f64, // sample standard deviation (n - 1 divisor); 0.0 when n <= 1
+}
+
+impl Stats {
+    /// Summarise `samples` into count, minimum, mean and sample standard deviation.
+    ///
+    /// An empty slice yields `Stats::default()` (all zeros). The stdev divides by
+    /// `n - 1` and is reported as `0.0` when there is only one sample.
+    pub fn from_samples(samples: &[f64]) -> Self {
+        if samples.is_empty() {
+            return Self::default();
+        }
+
+        let n = samples.len();
+        let mean_ms = samples.iter().sum::<f64>() / n as f64;
+        // A single sample has no spread to estimate.
+        let stdev_ms = match n {
+            1 => 0.0,
+            _ => {
+                let squared_devs = samples.iter().map(|x| (x - mean_ms) * (x - mean_ms));
+                (squared_devs.sum::<f64>() / (n - 1) as f64).sqrt()
+            }
+        };
+        // Seeded with +inf so that any finite sample replaces it.
+        let best_ms = samples.iter().copied().fold(f64::INFINITY, f64::min);
+
+        Self {
+            n,
+            best_ms,
+            mean_ms,
+            stdev_ms,
+        }
+    }
+
+    /// Render as `best / mean ± stdev (n=N)` for table display.
+    /// The three millisecond values are printed with two decimals.
+    pub fn format(&self) -> String {
+        let Self { n, best_ms, mean_ms, stdev_ms } = *self;
+        format!("{:.2} / {:.2} ± {:.2} (n={})", best_ms, mean_ms, stdev_ms, n)
+    }
+}