Move benches to bins (#534)

* Copy recursion bench to bin

* Add command line arguments

* Allow ranges for inner_size

* Accept range for threads

* Log2 inner size

* CLI args for logging

* Update readme

* Use split_once

* Cleanup

* Correct inner proof size

* Shrink public surface

* Print stats on inner proofs
Remco Bloemen 2022-04-15 04:23:43 +02:00 committed by GitHub
parent 76c86c55b3
commit 9cff202e1a
5 changed files with 309 additions and 17 deletions


@@ -23,13 +23,12 @@ in the Plonky2 directory.
## Running
-To see recursion performance, one can run this test, which generates a chain of three recursion proofs:
+To see recursion performance, one can run this bench, which generates a chain of three recursion proofs:
```sh
-RUST_LOG=debug RUSTFLAGS=-Ctarget-cpu=native cargo test --release test_recursive_recursive_verifier
+RUSTFLAGS=-Ctarget-cpu=native cargo run --release --example bench_recursion -- -vv
```
## Jemalloc
By default, Plonky2 uses the [Jemalloc](http://jemalloc.net) memory allocator due to its superior performance. Currently, it changes the default allocator of any binary to which it is linked. You can disable this behavior by removing the corresponding lines in [`plonky2/src/lib.rs`](https://github.com/mir-protocol/plonky2/blob/main/plonky2/src/lib.rs).
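For reference, the opt-out described above amounts to deleting a global-allocator declaration. A minimal sketch of what such lines typically look like with the `jemallocator` crate (the exact contents of `lib.rs` may differ):

```rust
use jemallocator::Jemalloc;

// Route all heap allocations in the linked binary through jemalloc.
#[global_allocator]
static GLOBAL: Jemalloc = Jemalloc;
```

Removing or commenting out the `#[global_allocator]` item falls back to the system allocator.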


@@ -33,6 +33,8 @@ jemallocator = "0.3.2"
[dev-dependencies]
criterion = "0.3.5"
tynm = "0.1.6"
+structopt = "0.3.26"
+num_cpus = "1.13.1"
[[bench]]
name = "field_arithmetic"


@@ -0,0 +1,291 @@
// HACK: Ideally this would live in `benches/`, but `cargo bench` doesn't allow
// custom CLI argument parsing (even with harness disabled). We could also have
// put it in `src/bin/`, but then we wouldn't have access to
// `[dev-dependencies]`.
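// NOTE: `generic_const_exprs` is an unstable feature, so this example requires a nightly toolchain.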
#![feature(generic_const_exprs)]
use std::{num::ParseIntError, ops::RangeInclusive, str::FromStr};
use anyhow::{Context as _, Result, anyhow};
use log::{info, Level, LevelFilter};
use plonky2::{
gates::noop::NoopGate,
hash::hash_types::RichField,
iop::witness::{PartialWitness, Witness},
plonk::{
circuit_builder::CircuitBuilder,
circuit_data::{
CircuitConfig, CommonCircuitData, VerifierCircuitTarget, VerifierOnlyCircuitData,
},
config::{AlgebraicHasher, GenericConfig, Hasher, PoseidonGoldilocksConfig},
proof::{CompressedProofWithPublicInputs, ProofWithPublicInputs},
prover::prove,
},
util::timing::TimingTree,
};
use plonky2_field::extension_field::Extendable;
use rand::{rngs::OsRng, RngCore, SeedableRng};
use rand_chacha::ChaCha8Rng;
use structopt::StructOpt;
type ProofTuple<F, C, const D: usize> = (
ProofWithPublicInputs<F, C, D>,
VerifierOnlyCircuitData<C, D>,
CommonCircuitData<F, C, D>,
);
#[derive(Clone, StructOpt, Debug)]
#[structopt(name = "bench_recursion")]
struct Options {
/// Verbose mode (-v, -vv, -vvv, etc.)
#[structopt(short, long, parse(from_occurrences))]
verbose: usize,
/// Apply an env_filter compatible log filter
#[structopt(long, env, default_value)]
log_filter: String,
/// Random seed for deterministic runs.
    /// If not specified, a new seed is generated from OS entropy.
#[structopt(long, parse(try_from_str = parse_hex_u64))]
seed: Option<u64>,
    /// Number of compute threads to use. Defaults to the number of cores. Can be a
    /// single value or a Rust-style range.
#[structopt(long, parse(try_from_str = parse_range_usize))]
threads: Option<RangeInclusive<usize>>,
    /// Log2 gate count of the inner proof. Can be a single value or a Rust-style
    /// range.
#[structopt(long, default_value="14", parse(try_from_str = parse_range_usize))]
size: RangeInclusive<usize>,
}
/// Creates a dummy proof which should have `2 ** log2_size` rows.
fn dummy_proof<F: RichField + Extendable<D>, C: GenericConfig<D, F = F>, const D: usize>(
config: &CircuitConfig,
log2_size: usize,
) -> Result<ProofTuple<F, C, D>>
where
[(); C::Hasher::HASH_SIZE]:,
{
    // The 'size' is a degree (number of rows), but we want a number of noop gates. A
    // non-zero amount of padding will be added, and the size will be rounded up to the
    // next power of two. To hit the target size, we go just above the previous power of
    // two and hope the padding is less than half the proof.
let num_dummy_gates = match log2_size {
0 => return Err(anyhow!("size must be at least 1")),
1 => 0,
2 => 1,
n => (1 << (n - 1)) + 1,
};
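    // For example, `log2_size = 14` gives 2^13 + 1 = 8193 noop gates, which (together
    // with the builder's own padding) should round up to 2^14 = 16384 rows.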
info!("Constructing inner proof with {} gates", num_dummy_gates);
let mut builder = CircuitBuilder::<F, D>::new(config.clone());
for _ in 0..num_dummy_gates {
builder.add_gate(NoopGate, vec![]);
}
builder.print_gate_counts(0);
let data = builder.build::<C>();
let inputs = PartialWitness::new();
let mut timing = TimingTree::new("prove", Level::Debug);
let proof = prove(&data.prover_only, &data.common, inputs, &mut timing)?;
timing.print();
data.verify(proof.clone())?;
Ok((proof, data.verifier_only, data.common))
}
fn recursive_proof<
F: RichField + Extendable<D>,
C: GenericConfig<D, F = F>,
InnerC: GenericConfig<D, F = F>,
const D: usize,
>(
inner: &ProofTuple<F, InnerC, D>,
config: &CircuitConfig,
min_degree_bits: Option<usize>,
) -> Result<ProofTuple<F, C, D>>
where
InnerC::Hasher: AlgebraicHasher<F>,
[(); C::Hasher::HASH_SIZE]:,
{
let (inner_proof, inner_vd, inner_cd) = inner;
let mut builder = CircuitBuilder::<F, D>::new(config.clone());
let mut pw = PartialWitness::new();
let pt = builder.add_virtual_proof_with_pis(inner_cd);
pw.set_proof_with_pis_target(&pt, inner_proof);
let inner_data = VerifierCircuitTarget {
constants_sigmas_cap: builder.add_virtual_cap(inner_cd.config.fri_config.cap_height),
};
pw.set_cap_target(
&inner_data.constants_sigmas_cap,
&inner_vd.constants_sigmas_cap,
);
builder.verify_proof(pt, &inner_data, inner_cd);
builder.print_gate_counts(0);
if let Some(min_degree_bits) = min_degree_bits {
        // We don't want to pad all the way up to 2^min_degree_bits, as the builder will
        // add a few special gates afterward. So just pad to 2^(min_degree_bits - 1) + 1.
        // Then the builder will pad to the next power of two, 2^min_degree_bits.
let min_gates = (1 << (min_degree_bits - 1)) + 1;
for _ in builder.num_gates()..min_gates {
builder.add_gate(NoopGate, vec![]);
}
}
let data = builder.build::<C>();
let mut timing = TimingTree::new("prove", Level::Debug);
let proof = prove(&data.prover_only, &data.common, pw, &mut timing)?;
timing.print();
data.verify(proof.clone())?;
Ok((proof, data.verifier_only, data.common))
}
/// Test serialization and print some size info.
fn test_serialization<F: RichField + Extendable<D>, C: GenericConfig<D, F = F>, const D: usize>(
proof: &ProofWithPublicInputs<F, C, D>,
cd: &CommonCircuitData<F, C, D>,
) -> Result<()>
where
[(); C::Hasher::HASH_SIZE]:,
{
let proof_bytes = proof.to_bytes()?;
info!("Proof length: {} bytes", proof_bytes.len());
let proof_from_bytes = ProofWithPublicInputs::from_bytes(proof_bytes, cd)?;
assert_eq!(proof, &proof_from_bytes);
let now = std::time::Instant::now();
let compressed_proof = proof.clone().compress(cd)?;
let decompressed_compressed_proof = compressed_proof.clone().decompress(cd)?;
info!("{:.4}s to compress proof", now.elapsed().as_secs_f64());
assert_eq!(proof, &decompressed_compressed_proof);
let compressed_proof_bytes = compressed_proof.to_bytes()?;
info!(
"Compressed proof length: {} bytes",
compressed_proof_bytes.len()
);
let compressed_proof_from_bytes =
CompressedProofWithPublicInputs::from_bytes(compressed_proof_bytes, cd)?;
assert_eq!(compressed_proof, compressed_proof_from_bytes);
Ok(())
}
fn benchmark(config: &CircuitConfig, log2_inner_size: usize) -> Result<()> {
const D: usize = 2;
type C = PoseidonGoldilocksConfig;
type F = <C as GenericConfig<D>>::F;
// Start with a dummy proof of specified size
let inner = dummy_proof::<F, C, D>(config, log2_inner_size)?;
let (_, _, cd) = &inner;
info!(
"Initial proof degree {} = 2^{}",
cd.degree(),
cd.degree_bits
);
// Recursively verify the proof
let middle = recursive_proof::<F, C, C, D>(&inner, config, None)?;
let (_, _, cd) = &middle;
info!(
"Single recursion proof degree {} = 2^{}",
cd.degree(),
cd.degree_bits
);
// Add a second layer of recursion to shrink the proof size further
let outer = recursive_proof::<F, C, C, D>(&middle, config, None)?;
let (proof, _, cd) = &outer;
info!(
"Double recursion proof degree {} = 2^{}",
cd.degree(),
cd.degree_bits
);
test_serialization(proof, cd)?;
Ok(())
}
fn main() -> Result<()> {
// Parse command line arguments, see `--help` for details.
let options = Options::from_args_safe()?;
// Initialize logging
let mut builder = env_logger::Builder::from_default_env();
builder.parse_filters(&options.log_filter);
builder.format_timestamp(None);
match options.verbose {
0 => &mut builder,
1 => builder.filter_level(LevelFilter::Info),
2 => builder.filter_level(LevelFilter::Debug),
_ => builder.filter_level(LevelFilter::Trace),
};
builder.try_init()?;
// Initialize randomness source
let rng_seed = options.seed.unwrap_or_else(|| OsRng::default().next_u64());
info!("Using random seed {rng_seed:16x}");
let _rng = ChaCha8Rng::seed_from_u64(rng_seed);
// TODO: Use `rng` to create deterministic runs
let num_cpus = num_cpus::get();
let threads = options.threads.unwrap_or(num_cpus..=num_cpus);
let config = CircuitConfig::standard_recursion_config();
for log2_inner_size in options.size {
        // Since `size` is the one most likely to be an unbounded range, we make it the outer iterator.
for threads in threads.clone() {
rayon::ThreadPoolBuilder::new()
.num_threads(threads)
.build()
.context("Failed to build thread pool.")?
.install(|| {
info!(
"Using {} compute threads on {} cores",
rayon::current_num_threads(),
num_cpus
);
// Run the benchmark
benchmark(&config, log2_inner_size)
})?;
}
}
Ok(())
}
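/// Parses a `u64` from a hexadecimal string, with or without a leading `0x`.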
fn parse_hex_u64(src: &str) -> Result<u64, ParseIntError> {
let src = src.strip_prefix("0x").unwrap_or(src);
u64::from_str_radix(src, 16)
}
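/// Parses a single `usize` or a Rust-style range: `A..=B` is inclusive, `A..B` excludes
/// `B`, and `A..` extends to `usize::MAX`.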
fn parse_range_usize(src: &str) -> Result<RangeInclusive<usize>, ParseIntError> {
if let Some((left, right)) = src.split_once("..=") {
Ok(RangeInclusive::new(
usize::from_str(left)?,
usize::from_str(right)?,
))
} else if let Some((left, right)) = src.split_once("..") {
Ok(RangeInclusive::new(
usize::from_str(left)?,
if right.is_empty() {
usize::MAX
} else {
usize::from_str(right)?.saturating_sub(1)
},
))
} else {
let value = usize::from_str(src)?;
Ok(RangeInclusive::new(value, value))
}
}
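As a usage sketch, the range parsing above lets a single invocation sweep both parameters; the flag names follow the `Options` struct and the base command comes from the README:

```sh
# Sweep inner-proof sizes 2^14..=2^16 on 1..=8 threads, with debug-level logging (-vv).
RUSTFLAGS=-Ctarget-cpu=native cargo run --release --example bench_recursion -- \
    -vv --size 14..=16 --threads 1..=8
```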


@@ -104,9 +104,9 @@ impl CircuitConfig {
/// Circuit data required by the prover or the verifier.
pub struct CircuitData<F: RichField + Extendable<D>, C: GenericConfig<D, F = F>, const D: usize> {
-    pub(crate) prover_only: ProverOnlyCircuitData<F, C, D>,
-    pub(crate) verifier_only: VerifierOnlyCircuitData<C, D>,
-    pub(crate) common: CommonCircuitData<F, C, D>,
+    pub prover_only: ProverOnlyCircuitData<F, C, D>,
+    pub verifier_only: VerifierOnlyCircuitData<C, D>,
+    pub common: CommonCircuitData<F, C, D>,
}
impl<F: RichField + Extendable<D>, C: GenericConfig<D, F = F>, const D: usize>
@@ -154,8 +154,8 @@ pub struct ProverCircuitData<
C: GenericConfig<D, F = F>,
const D: usize,
> {
-    pub(crate) prover_only: ProverOnlyCircuitData<F, C, D>,
-    pub(crate) common: CommonCircuitData<F, C, D>,
+    pub prover_only: ProverOnlyCircuitData<F, C, D>,
+    pub common: CommonCircuitData<F, C, D>,
}
impl<F: RichField + Extendable<D>, C: GenericConfig<D, F = F>, const D: usize>
@@ -181,8 +181,8 @@ pub struct VerifierCircuitData<
C: GenericConfig<D, F = F>,
const D: usize,
> {
-    pub(crate) verifier_only: VerifierOnlyCircuitData<C, D>,
-    pub(crate) common: CommonCircuitData<F, C, D>,
+    pub verifier_only: VerifierOnlyCircuitData<C, D>,
+    pub common: CommonCircuitData<F, C, D>,
}
impl<F: RichField + Extendable<D>, C: GenericConfig<D, F = F>, const D: usize>
@@ -207,7 +207,7 @@ impl<F: RichField + Extendable<D>, C: GenericConfig<D, F = F>, const D: usize>
}
/// Circuit data required by the prover, but not the verifier.
-pub(crate) struct ProverOnlyCircuitData<
+pub struct ProverOnlyCircuitData<
F: RichField + Extendable<D>,
C: GenericConfig<D, F = F>,
const D: usize,
@@ -235,9 +235,9 @@ pub(crate) struct ProverOnlyCircuitData<
/// Circuit data required by the verifier, but not the prover.
#[derive(Debug)]
-pub(crate) struct VerifierOnlyCircuitData<C: GenericConfig<D>, const D: usize> {
+pub struct VerifierOnlyCircuitData<C: GenericConfig<D>, const D: usize> {
/// A commitment to each constant polynomial and each permutation polynomial.
-    pub(crate) constants_sigmas_cap: MerkleCap<C::F, C::Hasher>,
+    pub constants_sigmas_cap: MerkleCap<C::F, C::Hasher>,
}
/// Circuit data required by both the prover and the verifier.
@@ -247,11 +247,11 @@ pub struct CommonCircuitData<
C: GenericConfig<D, F = F>,
const D: usize,
> {
-    pub(crate) config: CircuitConfig,
+    pub config: CircuitConfig,
pub(crate) fri_params: FriParams,
-    pub(crate) degree_bits: usize,
+    pub degree_bits: usize,
/// The types of gates used in this circuit, along with their prefixes.
pub(crate) gates: Vec<GateRef<F, D>>,
@@ -434,5 +434,5 @@ impl<F: RichField + Extendable<D>, C: GenericConfig<D, F = F>, const D: usize>
/// dynamic, at least not without setting a maximum wire count and paying for the worst case.
pub struct VerifierCircuitTarget {
/// A commitment to each constant polynomial and each permutation polynomial.
-    pub(crate) constants_sigmas_cap: MerkleCapTarget,
+    pub constants_sigmas_cap: MerkleCapTarget,
}


@@ -26,7 +26,7 @@ use crate::util::partial_products::{partial_products_and_z_gx, quotient_chunk_pr
use crate::util::timing::TimingTree;
use crate::util::transpose;
-pub(crate) fn prove<F: RichField + Extendable<D>, C: GenericConfig<D, F = F>, const D: usize>(
+pub fn prove<F: RichField + Extendable<D>, C: GenericConfig<D, F = F>, const D: usize>(
prover_data: &ProverOnlyCircuitData<F, C, D>,
common_data: &CommonCircuitData<F, C, D>,
inputs: PartialWitness<F>,