From 49e10423d0fbf24703e1b1cdb45be724cd55b362 Mon Sep 17 00:00:00 2001
From: Balazs Komuves <bkomuves@gmail.com>
Date: Sun, 25 Jan 2026 21:17:45 +0100
Subject: [PATCH] implement circom's Poseidon permutation and compression for
 t=2,3,4,5

---
 README.md                   |  79 +++++++++++++++--
 src/bin/testmain.rs         |  14 ++-
 src/poseidon/permutation.rs | 165 +++++++++++++++++++++++++++++++++---
 3 files changed, 241 insertions(+), 17 deletions(-)
diff --git a/README.md b/README.md
index 9f50a39..f587e9f 100644
--- a/README.md
+++ b/README.md
@@ -5,22 +5,91 @@ Self-contained (no external dependencies), pure Rust implementation of Poseidon
 and Poseidon2 hash functions over the BN254 curve's scalar field, using 32 bit 
 limbs internally.
 
-It's intended to be used on 32-bit platforms, eg. 32-bit RISC-V (rv32im)
-(though porting to 64 bits shouldn't be a big effort).
+It's primarily intended to be used on 32-bit platforms, e.g. 32-bit RISC-V (`rv32im`)
+(though porting to 64 bits shouldn't be a big effort; TODO).
 
 The algebra implementation is based on [`zikkurat-algebra`](https://github.com/faulhornlabs/zikkurat-algebra/)
 and [`staging-agda`](https://github.com/faulhornlabs/staging-agda/).
 
+### Compatibility
+
+The Poseidon implementation is compatible with [`circomlib`](https://github.com/iden3/circomlib/).
+
+The Poseidon2 implementation is compatible with [`zkfriendlyhashzoo`](https://extgit.isec.tugraz.at/krypto/zkfriendlyhashzoo).
+It _used to be_ compatible with the [HorizenLabs implementation](https://github.com/HorizenLabs/poseidon2),
+until they changed all their constants in [this commit](https://github.com/HorizenLabs/poseidon2/commit/bb476b9ca38198cf5092487283c8b8c5d4317c4e).
+We don't think it's worth the pain to follow this change.
+
 ### Status
 
-Currently, only Poseidon2 with `t=3` is implemented.
+Currently, only the following instances are implemented:
+
+- Poseidon permutation with `t=2,3,4,5` over BN254's scalar field
+- Poseidon2 permutation with `t=3` over BN254's scalar field
+
+I feel that larger states are unnecessary in practice. As a concrete example,
+[PSE's RLN circuit](https://github.com/Rate-Limiting-Nullifier/circom-rln) uses `t=2,3,4`.
+
+The proper way to handle larger inputs is to implement the sponge construction.
+
+### Usage
+
+There are three main types:
+
+- `BigInt<N>` is an unsigned big integer consisting of `N` words (so `32*N` or `64*N` bits);
+- `Felt`, short for "Field Element", is a prime field element in the standard representation
+  (integers modulo `p`);
+- `Mont` is a field element in the Montgomery representation. This is used internally 
+  for calculations, as multiplication is much faster this way.
+
+The core functionality of the Poseidon family of hash functions is the _permutation_, 
+which takes an array of `t` field elements, and returns the same:
+
+    fn permute( [Felt; t] ) -> [Felt; t]
+
+From this one can build all kinds of things, including a proper hash function (using
+the so-called "sponge construction"). The latter is not implemented in `circomlib`;
+instead, what they have is a compression function parametrized by `t`:
+
+    fn compress( [Felt; t-1] ) -> Felt
+
+This takes `t-1` field elements and returns one (which is interpreted as a hash).
+
+This is implemented by extending the input with a 0, applying the permutation, and
+taking the first element of the output vector (note: as in `circomlib`, the extra 0 is
+put at the beginning of the state, not at the end; the position doesn't matter, as long as it's consistent).
+
+Remark: That extra zero (called the "capacity") is _extremely important_; without 
+it, the whole construction would be totally insecure!
+
+### Speed
+
+Some approximate benchmark numbers below.
+
+#### 32-bit RISC-V 
+
+On RV32IM (the primary target as of now), we have approximately the following cycle counts:
+
+Poseidon2:
+
+- 350k cycles for a single `t=3` permutation
+
+#### Modern CPUs
+
+On modern 64-bit CPUs, the 64-bit version is preferred (TODO: implement it).
+
+32-bit version, running on an M2 MacBook Pro:
+
+- 155 msec for 10k `t=3` permutations
 
 ### TODO
 
 - [ ] optimize squaring to use less multiplications (?)
 - [ ] benchmark RISC-V cycles
 - [ ] add more Poseidon2 state widths (not just `t=3`)
-- [ ] implement `circomlib`-compatible Poseidon
-- [ ] add a test-suite; in particular, more complete testing of the field operations
+- [x] implement `circomlib`-compatible Poseidon
+- [ ] add a proper test-suite; in particular, more complete testing of the field operations
 - [ ] add a 64 bit version
 - [ ] further optimizations
+- [ ] implement the sponge construction
+
diff --git a/src/bin/testmain.rs b/src/bin/testmain.rs
index ef3f710..978a448 100644
--- a/src/bin/testmain.rs
+++ b/src/bin/testmain.rs
@@ -202,9 +202,21 @@ fn main() {
   println!("in dec = {}", Mont::to_decimal_string(&MONT1));
 */
 
+  let in1: Felt = Felt::from_u32(1);
+  let out1 = compress_1(in1);
+  println!("compress(1) = {}", Felt::to_decimal_string(&out1) );
+
   let in2: [Felt; 2] = [ Felt::from_u32(1) , Felt::from_u32(2) ];
-  let out2 = compress_felt_T3(in2);
+  let out2 = compress_2(in2);
   println!("compress(2) = {}", Felt::to_decimal_string(&out2) );
 
+  let in3: [Felt; 3] = [ Felt::from_u32(1) , Felt::from_u32(2) , Felt::from_u32(3) ];
+  let out3 = compress_3(in3);
+  println!("compress(3) = {}", Felt::to_decimal_string(&out3) );
+
+  let in4: [Felt; 4] = [ Felt::from_u32(1) , Felt::from_u32(2) , Felt::from_u32(3) , Felt::from_u32(4) ];
+  let out4 = compress_4(in4);
+  println!("compress(4) = {}", Felt::to_decimal_string(&out4) );
+
 }
 
diff --git a/src/poseidon/permutation.rs b/src/poseidon/permutation.rs
index fe906d8..208732a 100644
--- a/src/poseidon/permutation.rs
+++ b/src/poseidon/permutation.rs
@@ -92,7 +92,7 @@ fn external_round<const T: usize>(rcs: &[Mont], input: [Mont; T], mtx: [Mont; T*
 }
 
 //------------------------------------------------------------------------------
-// T = 3
+// TODO: can we somehow unify the different T cases????
 
 /*
 // debugging
@@ -104,16 +104,20 @@ fn printRound(text: &str, round: usize, state: &[Mont]) {
 }
 */
 
-pub fn permute_mont_T3(input: [Mont; 3]) -> [Mont; 3] {
-  const T:  usize = 3;
+//--------------------------------------
+// T = 2
+
+pub fn permute_mont_T2(input: [Mont; 2]) -> [Mont; 2] {
+  const T:  usize = 2;
+
   const TT: usize = 2*T-1;
   const NP: usize = INTERNAL_ROUND_COUNT[T-2];
-  const C:  [Mont;  81] = t3::CONST_C;
-  const M:  [Mont;   9] = t3::CONST_M;
-  const P:  [Mont;   9] = t3::CONST_P;
-  const S:  [Mont; 285] = t3::CONST_S;
-  let mut state: [Mont; 3] = input;
-  // printRound("input", 0, &state); 
+  const C:  [Mont;  72] = t2::CONST_C;
+  const M:  [Mont;   4] = t2::CONST_M;
+  const P:  [Mont;   4] = t2::CONST_P;
+  const S:  [Mont; 168] = t2::CONST_S;
+
+  let mut state: [Mont; T] = input;
   for j in 0..T { 
     state[j] = Mont::add( &state[j] , &C[j] );
   }
@@ -137,7 +141,126 @@ pub fn permute_mont_T3(input: [Mont; 3]) -> [Mont; 3] {
   state
 }
 
-pub fn compress_felt_T3(input: [Felt;2]) -> Felt {
+//--------------------------------------
+// T = 3
+pub fn permute_mont_T3(input: [Mont; 3]) -> [Mont; 3] { // Poseidon permutation for state width t=3, on Montgomery-form elements
+  const T:  usize = 3; // state width
+  // TT = length of the slice of S consumed by one internal round; NP = number of internal rounds
+  const TT: usize = 2*T-1;
+  const NP: usize = INTERNAL_ROUND_COUNT[T-2]; // the round-count table is indexed by T-2
+  const C:  [Mont;  81] = t3::CONST_C; // constants added to the state up front and handed to each round
+  const M:  [Mont;   9] = t3::CONST_M; // matrix passed to the external rounds
+  const P:  [Mont;   9] = t3::CONST_P; // matrix used in place of M in the 4th external round
+  const S:  [Mont; 285] = t3::CONST_S; // coefficients for the internal rounds, TT per round
+  // before any round: add the first T entries of C to the input
+  let mut state: [Mont; T] = input;
+  for j in 0..T { 
+    state[j] = Mont::add( &state[j] , &C[j] );
+  }
+  for i in 0..4  { // first four external rounds
+    let rcs: &[Mont] = &C[ ((i+1)*T) .. ((i+2)*T) ]; // round i is given C[(i+1)*T .. (i+2)*T]
+    let mat = if i<3 { M } else { P }; // rounds 0..2 use M, round 3 uses P
+    state = external_round::<T>( rcs , state , mat ); 
+    // printRound("initial round", i, &state); 
+  }
+  for i in 0..NP { // NP internal rounds
+    let rc: Mont = C[ i + 5*T ]; // one constant per internal round, starting at offset 5*T in C
+    let scoeffs: &[Mont]  = &S[ (i*TT) .. ((i+1)*TT) ]; // this round's TT-element slice of S
+    state = internal_round::<T>( rc , scoeffs , state );
+    // printRound("internal round", i, &state); 
+  }
+  for i in 4..8  { // last four external rounds, all with matrix M
+    let rcs: &[Mont] = if i<7  { &C[ (NP + (i+1)*T) .. (NP + (i+2)*T) ] } else { &[Mont::zero(); T] }; // rounds 4..6 read C past the NP internal constants; round 7 gets all zeros
+    state = external_round::<T>( rcs , state , M ); 
+    // printRound("final round", i, &state); 
+  }
+  state
+}
+
+//--------------------------------------
+// T = 4
+pub fn permute_mont_T4(input: [Mont; 4]) -> [Mont; 4] { // Poseidon permutation for state width t=4, on Montgomery-form elements
+  const T:  usize = 4; // state width
+  // TT = length of the slice of S consumed by one internal round; NP = number of internal rounds
+  const TT: usize = 2*T-1;
+  const NP: usize = INTERNAL_ROUND_COUNT[T-2]; // the round-count table is indexed by T-2
+  const C:  [Mont;  88] = t4::CONST_C; // constants added to the state up front and handed to each round
+  const M:  [Mont;  16] = t4::CONST_M; // matrix passed to the external rounds
+  const P:  [Mont;  16] = t4::CONST_P; // matrix used in place of M in the 4th external round
+  const S:  [Mont; 392] = t4::CONST_S; // coefficients for the internal rounds, TT per round
+  // before any round: add the first T entries of C to the input
+  let mut state: [Mont; T] = input;
+  for j in 0..T { 
+    state[j] = Mont::add( &state[j] , &C[j] );
+  }
+  for i in 0..4  { // first four external rounds
+    let rcs: &[Mont] = &C[ ((i+1)*T) .. ((i+2)*T) ]; // round i is given C[(i+1)*T .. (i+2)*T]
+    let mat = if i<3 { M } else { P }; // rounds 0..2 use M, round 3 uses P
+    state = external_round::<T>( rcs , state , mat ); 
+    // printRound("initial round", i, &state); 
+  }
+  for i in 0..NP { // NP internal rounds
+    let rc: Mont = C[ i + 5*T ]; // one constant per internal round, starting at offset 5*T in C
+    let scoeffs: &[Mont]  = &S[ (i*TT) .. ((i+1)*TT) ]; // this round's TT-element slice of S
+    state = internal_round::<T>( rc , scoeffs , state );
+    // printRound("internal round", i, &state); 
+  }
+  for i in 4..8  { // last four external rounds, all with matrix M
+    let rcs: &[Mont] = if i<7  { &C[ (NP + (i+1)*T) .. (NP + (i+2)*T) ] } else { &[Mont::zero(); T] }; // rounds 4..6 read C past the NP internal constants; round 7 gets all zeros
+    state = external_round::<T>( rcs , state , M ); 
+    // printRound("final round", i, &state); 
+  }
+  state
+}
+
+//--------------------------------------
+// T = 5
+pub fn permute_mont_T5(input: [Mont; 5]) -> [Mont; 5] { // Poseidon permutation for state width t=5, on Montgomery-form elements
+  const T:  usize = 5; // state width
+  // TT = length of the slice of S consumed by one internal round; NP = number of internal rounds
+  const TT: usize = 2*T-1;
+  const NP: usize = INTERNAL_ROUND_COUNT[T-2]; // the round-count table is indexed by T-2
+  const C:  [Mont; 100] = t5::CONST_C; // constants added to the state up front and handed to each round
+  const M:  [Mont;  25] = t5::CONST_M; // matrix passed to the external rounds
+  const P:  [Mont;  25] = t5::CONST_P; // matrix used in place of M in the 4th external round
+  const S:  [Mont; 540] = t5::CONST_S; // coefficients for the internal rounds, TT per round
+  // before any round: add the first T entries of C to the input
+  let mut state: [Mont; T] = input;
+  for j in 0..T { 
+    state[j] = Mont::add( &state[j] , &C[j] );
+  }
+  for i in 0..4  { // first four external rounds
+    let rcs: &[Mont] = &C[ ((i+1)*T) .. ((i+2)*T) ]; // round i is given C[(i+1)*T .. (i+2)*T]
+    let mat = if i<3 { M } else { P }; // rounds 0..2 use M, round 3 uses P
+    state = external_round::<T>( rcs , state , mat ); 
+    // printRound("initial round", i, &state); 
+  }
+  for i in 0..NP { // NP internal rounds
+    let rc: Mont = C[ i + 5*T ]; // one constant per internal round, starting at offset 5*T in C
+    let scoeffs: &[Mont]  = &S[ (i*TT) .. ((i+1)*TT) ]; // this round's TT-element slice of S
+    state = internal_round::<T>( rc , scoeffs , state );
+    // printRound("internal round", i, &state); 
+  }
+  for i in 4..8  { // last four external rounds, all with matrix M
+    let rcs: &[Mont] = if i<7  { &C[ (NP + (i+1)*T) .. (NP + (i+2)*T) ] } else { &[Mont::zero(); T] }; // rounds 4..6 read C past the NP internal constants; round 7 gets all zeros
+    state = external_round::<T>( rcs , state , M ); 
+    // printRound("final round", i, &state); 
+  }
+  state
+}
+
+//------------------------------------------------------------------------------
+
+pub fn compress_1(input: Felt) -> Felt { // compresses one field element into one, via the t=2 permutation
+  let mut state: [Mont; 2] = 
+    [ Mont::zero() // slot 0 is fixed to zero
+    , Felt::to_mont(&input) // the input, converted to Mont, fills slot 1
+    ]; 
+  state = permute_mont_T2(state);
+  Felt::from_mont(&state[0]) // the result is slot 0 of the permuted state, converted back to Felt
+}
+
+pub fn compress_2(input: [Felt;2]) -> Felt {
   let mut state: [Mont; 3] = 
     [ Mont::zero()
     , Felt::to_mont(&input[0])
@@ -147,7 +270,27 @@ pub fn compress_felt_T3(input: [Felt;2]) -> Felt {
   Felt::from_mont(&state[0])
 }
 
-//------------------------------------------------------------------------------
+pub fn compress_3(input: [Felt;3]) -> Felt { // compresses three field elements into one, via the t=4 permutation
+  let mut state: [Mont; 4] = 
+    [ Mont::zero() // slot 0 is fixed to zero
+    , Felt::to_mont(&input[0]) // the inputs, converted to Mont, fill slots 1..3 in order
+    , Felt::to_mont(&input[1])
+    , Felt::to_mont(&input[2])
+    ]; 
+  state = permute_mont_T4(state);
+  Felt::from_mont(&state[0]) // the result is slot 0 of the permuted state, converted back to Felt
+}
 
+pub fn compress_4(input: [Felt;4]) -> Felt { // compresses four field elements into one, via the t=5 permutation
+  let mut state: [Mont; 5] = 
+    [ Mont::zero() // slot 0 is fixed to zero
+    , Felt::to_mont(&input[0]) // the inputs, converted to Mont, fill slots 1..4 in order
+    , Felt::to_mont(&input[1])
+    , Felt::to_mont(&input[2])
+    , Felt::to_mont(&input[3])
+    ]; 
+  state = permute_mont_T5(state);
+  Felt::from_mont(&state[0]) // the result is slot 0 of the permuted state, converted back to Felt
+}
 
 //------------------------------------------------------------------------------