Fix parallelization

Fix bench
Fix parallel feature tagging
2024-06-14 15:24:41 +02:00 · 2024-06-14 15:04:16 +02:00 · 2024-06-14 15:04:09 +02:00 · 2024-06-14 13:35:38 +02:00 · 2024-06-14 12:01:06 +02:00
5 changed files with 150 additions and 45 deletions
--- a/nomos-da/kzgrs/Cargo.toml
+++ b/nomos-da/kzgrs/Cargo.toml
@ -20,19 +20,25 @@ num-bigint = "0.4.4"
 thiserror = "1.0.58"
 num-traits = "0.2.18"
 rand = "0.8.5"
 rayon = { version = "1.10", optional = true }
 [dev-dependencies]
 divan = "0.1"
 rayon = "1.10"
 [[bench]]
 name = "kzg"
 harness = false
 [[bench]]
 name = "fft"
 harness = false
 [features]
 default = ["single"]
 single = []
 parallel = [
    "rayon",
    "ark-ff/parallel",
    "ark-ff/asm",
    "ark-ff/rayon",
--- a/nomos-da/kzgrs/benches/fft.rs
+++ b/nomos-da/kzgrs/benches/fft.rs
@ -0,0 +1,52 @@
 use ark_bls12_381::{Fr, G1Affine};
 use ark_ec::{AffineRepr, CurveGroup};
 use ark_ff::{BigInt, FftField, Field};
 use divan::counter::ItemsCount;
 use divan::{black_box, counter::BytesCount, AllocProfiler, Bencher};
 use kzgrs::fft::{fft_g1, ifft_g1};
 /// Benchmark entry point: delegates to the divan runner.
 fn main() {
    divan::main();
 }
 /// Benchmarks `fft_g1` on `size` G1 points, reporting `size` items per iteration.
 ///
 /// Inputs are rebuilt for every sample by `with_inputs`:
 /// * `roots_of_unity` holds the powers 1..=size of the root returned by
 ///   `get_root_of_unity(size)`.
 /// * `buff` holds the scalar multiples `i * G` (for `i` in `0..size`) of the group generator.
 #[divan::bench(args = [16, 32, 64, 128, 256, 512, 1024, 2048, 4096])]
 fn compute_fft_for_size(bencher: Bencher, size: usize) {
    bencher
        .with_inputs(|| {
            let primitive_root = <Fr as FftField>::get_root_of_unity(size as u64).unwrap();
            let roots_of_unity: Vec<_> = (1..=size)
                .map(|i| primitive_root.pow::<ark_ff::BigInt<4>>(BigInt::from(i as u64)))
                .collect();
            // Multiples of the generator, not of the identity: every multiple of the
            // identity is the point at infinity, which would make the benchmark measure
            // a degenerate, unrepresentative input.
            let buff: Vec<G1Affine> = (0..size)
                .map(|i| {
                    G1Affine::generator()
                        .mul_bigint(BigInt::<4>::from(i as u64))
                        .into_affine()
                })
                .collect();
            (buff, roots_of_unity)
        })
        .input_counter(move |_| ItemsCount::new(size))
        .bench_refs(|(buff, roots_of_unity)| black_box(fft_g1(buff, roots_of_unity)));
 }
 /// Benchmarks `ifft_g1` on `size` G1 points, reporting `size` items per iteration.
 ///
 /// Inputs are rebuilt for every sample by `with_inputs`:
 /// * `roots_of_unity` holds the powers 1..=size of the root returned by
 ///   `get_root_of_unity(size)`.
 /// * `buff` is the `fft_g1` transform of the scalar multiples `i * G`
 ///   (for `i` in `0..size`) of the group generator, so only the inverse
 ///   transform is timed.
 #[divan::bench(args = [16, 32, 64, 128, 256, 512, 1024, 2048, 4096])]
 fn compute_ifft_for_size(bencher: Bencher, size: usize) {
    bencher
        .with_inputs(|| {
            let primitive_root = <Fr as FftField>::get_root_of_unity(size as u64).unwrap();
            let roots_of_unity: Vec<_> = (1..=size)
                .map(|i| primitive_root.pow::<ark_ff::BigInt<4>>(BigInt::from(i as u64)))
                .collect();
            // Multiples of the generator, not of the identity: every multiple of the
            // identity is the point at infinity, which would make the benchmark measure
            // a degenerate, unrepresentative input.
            let buff: Vec<G1Affine> = (0..size)
                .map(|i| {
                    G1Affine::generator()
                        .mul_bigint(BigInt::<4>::from(i as u64))
                        .into_affine()
                })
                .collect();
            // The forward transform is part of the setup, not of the measured work.
            let buff = fft_g1(&buff, &roots_of_unity);
            (buff, roots_of_unity)
        })
        .input_counter(move |_| ItemsCount::new(size))
        .bench_refs(|(buff, roots_of_unity)| black_box(ifft_g1(buff, roots_of_unity)));
 }
--- a/nomos-da/kzgrs/benches/kzg.rs
+++ b/nomos-da/kzgrs/benches/kzg.rs
@ -6,7 +6,9 @@ use divan::counter::ItemsCount;
 use divan::{black_box, counter::BytesCount, AllocProfiler, Bencher};
 use once_cell::sync::Lazy;
 use rand::RngCore;
 #[cfg(feature = "parallel")]
 use rayon::iter::IntoParallelIterator;
 #[cfg(feature = "parallel")]
 use rayon::iter::ParallelIterator;
 use kzgrs::{common::bytes_to_polynomial_unchecked, kzg::*};
@ -46,6 +48,7 @@ fn commit_single_polynomial_with_element_count(bencher: Bencher, element_count:
        .bench_refs(|(_evals, poly)| black_box(commit_polynomial(poly, &GLOBAL_PARAMETERS)));
 }
 #[cfg(feature = "parallel")]
 #[allow(non_snake_case)]
 #[divan::bench(args = [16, 32, 64, 128, 256, 512, 1024, 2048, 4096])]
 fn commit_polynomial_with_element_count_parallelized(bencher: Bencher, element_count: usize) {
@ -114,6 +117,7 @@ fn compute_batch_proofs(bencher: Bencher, element_count: usize) {
 // This benchmark measures how the proof computation performs when wrapped in an outer rayon parallel iterator.
 // The ark libraries already use rayon underneath, so no great improvement is expected from this,
 // but it should help reuse the same thread pool for all jobs, saving a little time.
 #[cfg(feature = "parallel")]
 #[allow(non_snake_case)]
 #[divan::bench(args = [128, 256, 512, 1024], sample_count = 3, sample_size = 5)]
 fn compute_parallelize_batch_proofs(bencher: Bencher, element_count: usize) {
--- a/nomos-da/kzgrs/src/fft.rs
+++ b/nomos-da/kzgrs/src/fft.rs
@ -2,54 +2,89 @@ use ark_bls12_381::{Bls12_381, Fr, G1Affine};
 use ark_ec::pairing::Pairing;
 use ark_ec::{AffineRepr, CurveGroup};
 use ark_ff::{BigInt, BigInteger, FftField, Field, PrimeField};
-use blst::BLS12_381_G1;
+#[cfg(feature = "parallel")]
 use rayon::iter::{IndexedParallelIterator, IntoParallelIterator, ParallelIterator};
 pub fn fft_g1(vals: &[G1Affine], roots_of_unity: &[Fr]) -> Vec<G1Affine> {
    debug_assert_eq!(vals.len(), roots_of_unity.len());
-    if vals.len() == 1 {
+    let original_len = vals.len();
    if original_len == 1 {
        return vals.to_vec();
    }
    let half_roots: Vec<_> = roots_of_unity.iter().step_by(2).copied().collect();
-    let l = fft_g1(
+    let l = || {
-        vals.iter()
+        fft_g1(
-            .step_by(2)
+            vals.iter()
-            .copied()
+                .step_by(2)
-            .collect::<Vec<_>>()
+                .copied()
-            .as_slice(),
+                .collect::<Vec<_>>()
-        half_roots.as_slice(),
+                .as_slice(),
-    );
+            half_roots.as_slice(),
        )
    };
-    let r = fft_g1(
+    let r = || {
-        vals.iter()
+        fft_g1(
-            .skip(1)
+            vals.iter()
-            .step_by(2)
+                .skip(1)
-            .copied()
+                .step_by(2)
-            .collect::<Vec<_>>()
+                .copied()
-            .as_slice(),
+                .collect::<Vec<_>>()
-        half_roots.as_slice(),
+                .as_slice(),
-    );
+            half_roots.as_slice(),
        )
    };
-    let y_times_root = r
+    let [l, r]: [Vec<G1Affine>; 2] = {
-        .into_iter()
+        #[cfg(feature = "parallel")]
-        .cycle()
+        {
-        .enumerate()
+            let (l, r) = rayon::join(l, r);
-        .map(|(i, y)| (y * roots_of_unity[i % vals.len()]).into_affine());
+            [l, r]
        }
        #[cfg(not(feature = "parallel"))]
        {
            [l(), r()]
        }
    };
    // Repeat each half-length result up to the original length so the two can be zipped element-wise below
    let l: Vec<_> = l.into_iter().cycle().take(original_len).collect();
    let r: Vec<_> = r.into_iter().cycle().take(original_len).collect();
-    l.into_iter()
+    let y_times_root = {
-        .cycle()
+        #[cfg(feature = "parallel")]
-        .take(vals.len())
+        {
-        .zip(y_times_root)
+            r.into_par_iter()
-        .enumerate()
+        }
-        .map(|(i, (x, y_times_root))| {
+        #[cfg(not(feature = "parallel"))]
-            if i < vals.len() / 2 {
+        {
-                x + y_times_root
+            r.into_iter()
-            } else {
+        }
-                x - y_times_root
+    }
-            }
+    .enumerate()
-            .into_affine()
+    .map(|(i, y)| (y * roots_of_unity[i % vals.len()]).into_affine());
-        })
+
-        .collect()
+    {
        #[cfg(feature = "parallel")]
        {
            l.into_par_iter()
        }
        #[cfg(not(feature = "parallel"))]
        {
            l.into_iter()
        }
    }
    .zip(y_times_root)
    .enumerate()
    .map(|(i, (x, y_times_root))| {
        if i < vals.len() / 2 {
            x + y_times_root
        } else {
            x - y_times_root
        }
        .into_affine()
    })
    .collect()
 }
 pub fn ifft_g1(vals: &[G1Affine], roots_of_unity: &[Fr]) -> Vec<G1Affine> {
@ -57,10 +92,18 @@ pub fn ifft_g1(vals: &[G1Affine], roots_of_unity: &[Fr]) -> Vec<G1Affine> {
    let mut mod_min_2 = BigInt::new(<Fr as PrimeField>::MODULUS.0);
    mod_min_2.sub_with_borrow(&BigInt::<4>::from(2u64));
    let invlen = Fr::from(vals.len() as u64).pow(mod_min_2).into_bigint();
-    fft_g1(vals, roots_of_unity)
+    {
-        .into_iter()
+        #[cfg(feature = "parallel")]
-        .map(|g| g.mul_bigint(invlen).into_affine())
+        {
-        .collect()
+            fft_g1(vals, roots_of_unity).into_par_iter()
        }
        #[cfg(not(feature = "parallel"))]
        {
            fft_g1(vals, roots_of_unity).into_iter()
        }
    }
    .map(|g| g.mul_bigint(invlen).into_affine())
    .collect()
 }
 #[cfg(test)]
--- a/nomos-da/kzgrs/src/lib.rs
+++ b/nomos-da/kzgrs/src/lib.rs
@ -1,6 +1,6 @@
 pub mod common;
-mod fft;
+pub mod fft;
-mod fk20;
+pub mod fk20;
 pub mod global_parameters;
 pub mod kzg;
 pub mod rs;
Author	SHA1	Message	Date
danielsanchezq	bf83bc4403	Fix parallelization	2024-06-14 15:24:41 +02:00
danielsanchezq	c83638dd3a	Fix bench	2024-06-14 15:04:16 +02:00
danielsanchezq	a760e436ed	Fix parallel feature tagging	2024-06-14 15:04:09 +02:00
danielsanchezq	911505f5e3	Add i/fft benches	2024-06-14 13:35:38 +02:00
danielsanchezq	5eeb96271f	Parallelize i/fft	2024-06-14 12:01:06 +02:00