From d89a5cca7b3b567fca664971d1291eb4b3d41a5b Mon Sep 17 00:00:00 2001 From: Balazs Komuves Date: Thu, 3 Oct 2024 00:26:28 +0200 Subject: [PATCH] add Monolith hash implementation (C + Nim + tests) --- README.md | 2 +- cbits/goldilocks.c | 338 ++++++++++++------ cbits/goldilocks.h | 14 +- cbits/monolith_constants.inc | 71 ++++ cbits/monolith_conv_uint64.inc | 267 ++++++++++++++ goldilocks_hash/monolith.nim | 13 + goldilocks_hash/monolith/compress.nim | 14 + goldilocks_hash/monolith/merkle.nim | 76 ++++ goldilocks_hash/monolith/permutation.nim | 17 + goldilocks_hash/monolith/sponge.nim | 121 +++++++ reference/Common.hs | 6 + reference/TestGen/TestCompress.hs | 56 +++ .../monolith/compressTestCases.nim | 53 +++ .../goldilocks_hash/monolith/testCompress.nim | 38 ++ tests/goldilocks_hash/monolith/testMerkle.nim | 31 ++ .../monolith/testPermutation.nim | 60 ++++ tests/goldilocks_hash/monolith/testSponge.nim | 76 ++++ .../poseidon2/compressTestCases.nim | 53 +++ .../poseidon2/testCompress.nim | 46 +-- tests/test.nim | 6 + 20 files changed, 1200 insertions(+), 158 deletions(-) create mode 100644 cbits/monolith_constants.inc create mode 100644 cbits/monolith_conv_uint64.inc create mode 100644 goldilocks_hash/monolith.nim create mode 100644 goldilocks_hash/monolith/compress.nim create mode 100644 goldilocks_hash/monolith/merkle.nim create mode 100644 goldilocks_hash/monolith/permutation.nim create mode 100644 goldilocks_hash/monolith/sponge.nim create mode 100644 reference/TestGen/TestCompress.hs create mode 100644 tests/goldilocks_hash/monolith/compressTestCases.nim create mode 100644 tests/goldilocks_hash/monolith/testCompress.nim create mode 100644 tests/goldilocks_hash/monolith/testMerkle.nim create mode 100644 tests/goldilocks_hash/monolith/testPermutation.nim create mode 100644 tests/goldilocks_hash/monolith/testSponge.nim create mode 100644 tests/goldilocks_hash/poseidon2/compressTestCases.nim diff --git a/README.md b/README.md index a9122e3..55be15e 100644 --- a/README.md +++ b/README.md @@ -9,7 +9,7 @@ Hash functions supported ------------------------ - [x] Poseidon2 with `t=12` -- [ ] Monolith with `t=12` +- [x] Monolith with `t=12` - [ ] Tip4' with `t=12` The Poseidon2 implementation is compatible with [Horizen Lab's one][4]. diff --git a/cbits/goldilocks.c b/cbits/goldilocks.c index 0d8549c..822b2d9 100644 --- a/cbits/goldilocks.c +++ b/cbits/goldilocks.c @@ -26,43 +26,6 @@ uint64_t goldilocks_sub_safe(uint64_t x, uint64_t y) { return goldilocks_add( x , goldilocks_neg(y) ); } -/* - -// add together 3 field elements -uint64_t goldilocks_add3( uint64_t x0, uint64_t x1, uint64_t x2 ) { - uint64_t x01 = goldilocks_add( x0 , x1 ); - return goldilocks_add( x01, x2 ); -} - -//-------------------------------------- - -uint64_t goldilocks_div_by_2(uint64_t x) { - return (x & 1) ? 
(x/2 + 0x7fffffff80000001) : (x/2); -} - -uint64_t goldilocks_div_by_3(uint64_t x) { - uint64_t m = x % 3; - uint64_t r; - switch(m) { - case 0: - r = (x/3); - break; - case 1: - r = (x/3 + 0xaaaaaaaa00000001); // (x+2*p) / 3 = x/3 + (2*p+1)/3 - break; - case 2: - r = (x/3 + 0x5555555500000001); // (x+p) / 3 = x/3 + (p+1)/3 - break; - } - return r; -} - -uint64_t goldilocks_div_by_4(uint64_t x) { - return goldilocks_div_by_2(goldilocks_div_by_2(x)); -} - -*/ - //-------------------------------------- uint64_t goldilocks_rdc(__uint128_t x) { @@ -201,7 +164,6 @@ void goldilocks_poseidon2_internal_diffusion(uint64_t *inp, uint64_t *out) { s0 += inp[3]; s1 += inp[9]; s0 += inp[4]; s1 += inp[10]; s0 += inp[5]; s1 += inp[11]; -// uint64_t s = goldilocks_rdc_small( s0 + s1 ); __uint128_t s = s0 + s1; for(int i=0; i<12; i++) { @@ -211,8 +173,6 @@ void goldilocks_poseidon2_internal_diffusion(uint64_t *inp, uint64_t *out) { //-------------------------------------- -/* - // multiplies a vector of size 4 by the 4x4 MDS matrix on the left: // // [ 5 7 1 3 ] @@ -220,74 +180,6 @@ void goldilocks_poseidon2_internal_diffusion(uint64_t *inp, uint64_t *out) { // [ 1 3 5 7 ] // [ 1 1 4 6 ] // -void goldilocks_mul_by_M4(uint64_t *inp, uint64_t *out) { - uint64_t a = inp[0]; - uint64_t b = inp[1]; - uint64_t c = inp[2]; - uint64_t d = inp[3]; - - uint64_t a2 = goldilocks_add( a , a ); - uint64_t a4 = goldilocks_add( a2 , a2 ); - uint64_t a5 = goldilocks_add( a4 , a ); - - uint64_t b2 = goldilocks_add( b , b ); - uint64_t b3 = goldilocks_add( b2 , b ); - uint64_t b6 = goldilocks_add( b3 , b3 ); - uint64_t b7 = goldilocks_add( b6 , b ); - - uint64_t c2 = goldilocks_add( c , c ); - uint64_t c4 = goldilocks_add( c2 , c2 ); - uint64_t c5 = goldilocks_add( c4 , c ); - - uint64_t d2 = goldilocks_add( d , d ); - uint64_t d3 = goldilocks_add( d2 , d ); - uint64_t d6 = goldilocks_add( d3 , d3 ); - uint64_t d7 = goldilocks_add( d6 , d ); - - out[0] = goldilocks_add( goldilocks_add( a5 , b7 ) , goldilocks_add( c , d3 ) ); - out[1] = goldilocks_add( goldilocks_add( a4 , b6 ) , goldilocks_add( c , d ) ); - out[2] = goldilocks_add( goldilocks_add( a , b3 ) , goldilocks_add( c5 , d7 ) ); - out[3] = goldilocks_add( goldilocks_add( a , b ) , goldilocks_add( c4 , d6 ) ); -} - -// returns 2*a + b + c -uint64_t goldilocks_weighted_add_211(uint64_t a, uint64_t b, uint64_t c) { - uint64_t a2 = goldilocks_add( a , a ); - uint64_t bc = goldilocks_add( b , c ); - return goldilocks_add( a2 , bc ); -} - -// multiplies by 12x12 block-circulant matrix [2*M4, M4, M4] -void goldilocks_poseidon2_external_diffusion(uint64_t *inp, uint64_t *out) { - uint64_t us[4]; - uint64_t vs[4]; - uint64_t ws[4]; - - goldilocks_mul_by_M4( inp + 0 , us ); - goldilocks_mul_by_M4( inp + 4 , vs ); - goldilocks_mul_by_M4( inp + 8 , ws ); - - out[0] = goldilocks_weighted_add_211( us[0] , vs[0] , ws[0] ); - out[1] = goldilocks_weighted_add_211( us[1] , vs[1] , ws[1] ); - out[2] = goldilocks_weighted_add_211( us[2] , vs[2] , ws[2] ); - out[3] = goldilocks_weighted_add_211( us[3] , vs[3] , ws[3] ); - - out[4] = goldilocks_weighted_add_211( vs[0] , ws[0] , us[0] ); - out[5] = goldilocks_weighted_add_211( vs[1] , ws[1] , us[1] ); - out[6] = goldilocks_weighted_add_211( vs[2] , ws[2] , us[2] ); - out[7] = goldilocks_weighted_add_211( vs[3] , ws[3] , us[3] ); - - out[ 8] = goldilocks_weighted_add_211( ws[0] , us[0] , vs[0] ); - out[ 9] = goldilocks_weighted_add_211( ws[1] , us[1] , vs[1] ); - out[10] = goldilocks_weighted_add_211( ws[2] , us[2] , vs[2] ); - out[11] = 
goldilocks_weighted_add_211( ws[3] , us[3] , vs[3] ); -} - -*/ - -//-------------------------------------- - -// multiplies a vector of size 4 by the 4x4 MDS matrix on the left void uint64_mul_by_M4(uint64_t *inp, uint64_t *out) { uint64_t a = inp[0]; uint64_t b = inp[1]; @@ -524,4 +416,234 @@ void goldilocks_poseidon2_bytes_digest(int rate, int N, const uint8_t *input, ui for(int j=0; j<4; j++) { hash[j] = state[j]; } } +//============================================================================== +// *** Monolith hash *** +// +// compatible with +// + +/* +monolith test vector (permutation of [0..11]) +--------------------------------------------- +from + +0x516dd661e959f541 = 5867581605548782913 +0x082c137169707901 = 588867029099903233 +0x53dff3fd9f0a5beb = 6043817495575026667 +0x0b2ebaa261590650 = 805786589926590032 +0x89aadb57e2969cb6 = 9919982299747097782 +0x5d3d6905970259bd = 6718641691835914685 +0x6e5ac1a4c0cfa0fe = 7951881005429661950 +0xd674b7736abfc5ce = 15453177927755089358 +0x0d8697e1cd9a235f = 974633365445157727 +0x85fc4017c247136e = 9654662171963364206 +0x572bafd76e511424 = 6281307445101925412 +0xbec1638e28eae57f = 13745376999934453119 + +*/ + +//-------------------------------------- +// ** sbox layer + +// based on the reference implementation from +// +uint64_t goldilocks_monolith_single_bar(uint64_t x) { +// uint64_t y1 = ((x & 0x8080808080808080) >> 7) | ((x & 0x7F7F7F7F7F7F7F7F) << 1); +// uint64_t y2 = ((x & 0xC0C0C0C0C0C0C0C0) >> 6) | ((x & 0x3F3F3F3F3F3F3F3F) << 2); +// uint64_t y3 = ((x & 0xE0E0E0E0E0E0E0E0) >> 5) | ((x & 0x1F1F1F1F1F1F1F1F) << 3); +// uint64_t z = x ^ ((~y1) & y2 & y3); +// uint64_t r = ((z & 0x8080808080808080) >> 7) | ((z & 0x7F7F7F7F7F7F7F7F) << 1); + + const uint64_t mask80 = 0x8080808080808080; + const uint64_t mask7F = ~mask80; + uint64_t y1 = ((x & mask80) >> 7) | ((x & mask7F) << 1); + uint64_t y2 = ((y1 & mask80) >> 7) | ((y1 & mask7F) << 1); + uint64_t y3 = ((y2 & mask80) >> 7) | ((y2 & mask7F) << 1); + uint64_t z = x ^ ((~y1) & y2 & y3); + uint64_t r = ((z & mask80) >> 7) | ((z & mask7F) << 1); + return r; +} + +// the sbox-layer (note: it's only applied to the first 4 field elements!) +void goldilocks_monolith_bars(uint64_t *state) { + for(int j=0; j<4; j++) { state[j] = goldilocks_monolith_single_bar(state[j]); } +} + +//-------------------------------------- +// ** nonlinear layer + +// the nonlinear layer +// +// remark: since the next layer is always the linear diffusion, it's enough +// to reduce to 64 bit, don't have to reduce to [0..p-1]. +// As in the linear layer we split into two 32 bit words anyway. 
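// For comparison, a naive fully-reduced version of this layer would look like
// the sketch below (illustration only, not part of the original patch; it uses
// only goldilocks_add and goldilocks_rdc, which are defined earlier in this
// file). The actual implementation that follows fuses the square-and-add into
// goldilocks_sqr_add_to_uint64 and skips the final reduction.
void goldilocks_monolith_bricks_naive(uint64_t *state) {
  // iterate downwards so that each element still sees the old value of its left neighbour
  for(int i=11; i>0; i--) {
    __uint128_t sq = (__uint128_t)state[i-1] * state[i-1];       // x_{i-1}^2 as a 128-bit product
    state[i] = goldilocks_add( state[i] , goldilocks_rdc(sq) );  // x_i + x_{i-1}^2  (mod p)
  }
}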
+void goldilocks_monolith_bricks(uint64_t *state) { + for(int i=11; i>0; i--) state[i] = goldilocks_sqr_add_to_uint64( state[i-1] , state[i] ); +} + +//-------------------------------------- +// ** fast diffusion layer + +#include "monolith_conv_uint64.inc" + +// we split the input to low and high 32 bit words +// do circular convolution on them, which safe because there is no overflow in 64 bit words +// but should be much faster as there are no modulo operations just 64-bit machine word ops +// then reconstruct and reduce at the end +void goldilocks_monolith_concrete(uint64_t *state) { + uint64_t lo[12]; + uint64_t hi[12]; + + for(int i=0; i<12; i++) { + uint64_t x = state[i]; + lo[i] = x & 0xffffffff; + hi[i] = x >> 32; + } + + uint64_circular_conv_12_with( lo , lo ); + uint64_circular_conv_12_with( hi , hi ); + + for(int i=0; i<12; i++) { + __uint128_t x = (((__uint128_t)hi[i]) << 32) + lo[i]; + state[i] = goldilocks_rdc_small(x); + } +} + +void goldilocks_monolith_concrete_rc(uint64_t *state, const uint64_t *rc) { + uint64_t lo[12]; + uint64_t hi[12]; + + for(int i=0; i<12; i++) { + uint64_t x = state[i]; + lo[i] = x & 0xffffffff; + hi[i] = x >> 32; + } + + uint64_circular_conv_12_with( lo , lo ); + uint64_circular_conv_12_with( hi , hi ); + + for(int i=0; i<12; i++) { + __uint128_t x = (((__uint128_t)hi[i]) << 32) + lo[i] + rc[i]; + state[i] = goldilocks_rdc_small(x); + } +} + +//-------------------------------------- +// ** rounds + +#include "monolith_constants.inc" + +void goldilocks_monolith_round(int round_idx, uint64_t *state) { + goldilocks_monolith_bars (state); + goldilocks_monolith_bricks (state); + goldilocks_monolith_concrete_rc(state , &(monolith_t12_round_constants[round_idx][0]) ); +} + +void goldilocks_monolith_permutation(uint64_t *state) { + // initial layer + goldilocks_monolith_concrete(state); + // five rounds with RC + for(int r=0; r<5; r++) { + goldilocks_monolith_round(r, state); + } + // last round, no RC + goldilocks_monolith_bars (state); + goldilocks_monolith_bricks (state); + goldilocks_monolith_concrete(state); +} + +//------------------------------------------------------------------------------ + +// compression function: input is two 4-element vector of field elements, +// and the output is a vector of 4 field elements +void goldilocks_monolith_keyed_compress(const uint64_t *x, const uint64_t *y, uint64_t key, uint64_t *out) { + uint64_t state[12]; + for(int i=0; i<4; i++) { + state[i ] = x[i]; + state[i+4] = y[i]; + state[i+8] = 0; + } + state[8] = key; + goldilocks_monolith_permutation(state); + for(int i=0; i<4; i++) { + out[i] = state[i]; + } +} + +void goldilocks_monolith_compress(const uint64_t *x, const uint64_t *y, uint64_t *out) { + goldilocks_monolith_keyed_compress(x, y, 0, out); +} + +//------------------------------------------------------------------------------ + +// hash a sequence of field elements into a digest of 4 field elements +void goldilocks_monolith_felts_digest(int rate, int N, const uint64_t *input, uint64_t *hash) { + + assert( (rate >= 1) && (rate <= 8) ); + + uint64_t domsep = rate + 256*12 + 65536*63; + uint64_t state[12]; + for(int i=0; i<12; i++) state[i] = 0; + state[8] = domsep; + + int nchunks = (N + rate) / rate; // 10* padding + const uint64_t *ptr = input; + for(int k=0; k>2); // 31 or 62 + int nchunks = (N + rate_in_bytes) / rate_in_bytes; // 10* padding + const uint8_t *ptr = input; + for(int k=0; k + +const uint64_t monolith_t12_round_constants[5][12] = + { { 0xbcaf2516e5926dcf + , 0x4ec5a76bce1e7676 + , 
0x9d804725bebb56ab + , 0x2ec05fca215a5be3 + , 0xe16274e4acab86a0 + , 0x80b0fddcc3c4380f + , 0xc87c769ad77ffece + , 0x37f85ec9117d287c + , 0x3b8d825b014c458d + , 0xb7a01d0cb850d75e + , 0x1333b751bac704bd + , 0x7b7ef14183d47b6f + } + , { 0x2114517643e3b286 + , 0x542d15ea3cd12ade + , 0xe847d363f17a93e9 + , 0x24f0421c6ff41c56 + , 0x66e3eda93e2ca216 + , 0xfb88d475279cb568 + , 0x7f421c6269938a22 + , 0xdbb973acce857401 + , 0xe172409cb1563a6a + , 0x996f729f6340447d + , 0x925c579738b6fa4a + , 0x752e9ec9e0b34686 + } + , { 0xdb419e0bd38469bd + , 0xba41cee828bd26d8 + , 0xd6630f8f0969db39 + , 0x2340e955ae2f0d94 + , 0x282f553d35872e2e + , 0x77f7c3ff1ae496b3 + , 0xf5f2efab64bc5eef + , 0x47b23a00830284f4 + , 0xe18a2d2242486fa + , 0x3d101838a773dab0 + , 0x47d686fd16856524 + , 0x3eb2d254189b3534 + } + , { 0xfe886e291ca8c5bd + , 0xb97ec74df1e4b0b6 + , 0x574fdef3a600e370 + , 0x8ad61c6f132d4feb + , 0x41e69ca4ecc7e8c7 + , 0x151ad562e1f90ca4 + , 0x747c051439a5603c + , 0x990151d3e52d502c + , 0x532c7f258282ea12 + , 0x65e62cb34275dd5 + , 0x5288008954f5d0b2 + , 0xee7c3407cf3d6e02 + } + , { 0xda07029808bad5de + , 0x7bebdf38dcc7a673 + , 0x20a3f252688c312d + , 0x9c5248f7bbf8d188 + , 0xcf1cf778994382d4 + , 0x8c434b1738b8338c + , 0xfe504398813b67a8 + , 0xe879562fdef813b9 + , 0xd4666793b2a2f191 + , 0xd9096b87de22de01 + , 0xcaf4cea5f22abf34 + , 0x3128d1e75d0204fa + } + }; + diff --git a/cbits/monolith_conv_uint64.inc b/cbits/monolith_conv_uint64.inc new file mode 100644 index 0000000..2a688bd --- /dev/null +++ b/cbits/monolith_conv_uint64.inc @@ -0,0 +1,267 @@ + +// +// circular convolution with the vector [7,8,21,22,6,7,9,10,13,26,8,23] algorithms in uint64_t +// the idea is that we can split field elements into (lo + 2^32*hi) +// apply the convolution separately (it won't overflow) +// then combine and reduce +// +// based on the book: +// +// Nussbaumer: "Fast Fourier Transform and Convolution Algorithms" +// + +/* + +our coefficient vectors: + + [7,8,21,22,6,7,9,10,13,26,8,23] + +in CRT rectangle format: + + +----------+ + | 7 6 13 | + | 26 8 7 | + | 9 8 21 | + | 22 10 23 | + +----------+ + +*/ + +#include + +//------------------------------------------------------------------------------ + +// convolves with: b2 = { 64 , 32 , 64 }; +// tgt[0] = 64*x + 64*y + 32*z +// tgt[1] = 32*x + 64*y + 64*z +// tgt[2] = 64*x + 32*y + 64*z +void uint64_convolve_with_B2(uint64_t *src, uint64_t *tgt) { + uint64_t x = src[0]; + uint64_t y = src[1]; + uint64_t z = src[2]; + + uint64_t x32 = x << 5; + uint64_t y32 = y << 5; + uint64_t z32 = z << 5; + + uint64_t s64 = (x32 + y32 + z32) << 1; + + tgt[0] = s64 - z32; + tgt[1] = s64 - x32; + tgt[2] = s64 - y32; +} + + +// convolves with: b3 = { -32 , -4 , 4 }; +// tgt[0] = -32*x + 4*y - 4*z +// tgt[1] = -4*x - 32*y + 64*z +// tgt[2] = 4*x - 4*y - 32*z +void uint64_convolve_with_B3(uint64_t *src, uint64_t *tgt) { + uint64_t x = src[0]; + uint64_t y = src[1]; + uint64_t z = src[2]; + + uint64_t x4 = x << 2; + uint64_t y4 = y << 2; + uint64_t z4 = z << 2; + + uint64_t x32 = x4 << 3; + uint64_t y32 = y4 << 3; + uint64_t z32 = z4 << 3; + + tgt[0] = - x32 + y4 - z4; + tgt[1] = - x4 - y32 + z4; + tgt[2] = x4 - y4 - z32; +} + +// convolves with: b4 = { -6 , 0 , 8 }; +// tgt[0] = - 6*x + 8*y +// tgt[1] = - 6*y + 8*z +// tgt[2] = 8*x - 6*z +void uint64_convolve_with_B4(uint64_t *src, uint64_t *tgt) { + uint64_t x = src[0]; + uint64_t y = src[1]; + uint64_t z = src[2]; + + uint64_t x8 = x << 3; + uint64_t y8 = y << 3; + uint64_t z8 = z << 3; + + uint64_t x6 = x8 - (x + x); + uint64_t y6 = y8 - (y + 
y); + uint64_t z6 = z8 - (z + z); + + tgt[0] = - x6 + y8; + tgt[1] = - y6 + z8; + tgt[2] = - z6 + x8; +} + +// convolves with: b5 = { 2 , -4 , -24 }; +// tgt[0] = 2*x - 24*y - 4*z +// tgt[1] = -4*x + 2*y - 24*z +// tgt[2] = -24*x - 4*y + 2*z +void uint64_convolve_with_B5(uint64_t *src, uint64_t *tgt) { + uint64_t x = src[0]; + uint64_t y = src[1]; + uint64_t z = src[2]; + + uint64_t x2 = x << 1; + uint64_t y2 = y << 1; + uint64_t z2 = z << 1; + + uint64_t x4 = x2 << 1; + uint64_t y4 = y2 << 1; + uint64_t z4 = z2 << 1; + + uint64_t x24 = x4*6; // (x4 + x4 + x4) << 1; + uint64_t y24 = y4*6; // (y4 + y4 + y4) << 1; + uint64_t z24 = z4*6; // (z4 + z4 + z4) << 1; + + tgt[0] = x2 - y24 - z4 ; + tgt[1] = - x4 + y2 - z24; + tgt[2] = - x24 - y4 + z2 ; +} + +// convolves with: b6 = { -2 , -2 , -8 }; +// tgt[0] = - ( 2*x + 8*y + 2*z ) +// tgt[1] = - ( 2*x + 2*y + 8*z ) +// tgt[2] = - ( 8*x + 2*y + 2*z ) +void uint64_convolve_with_B6(uint64_t *src, uint64_t *tgt) { + uint64_t x = src[0]; + uint64_t y = src[1]; + uint64_t z = src[2]; + + uint64_t x3 = (x << 2) - x ; + uint64_t y3 = (y << 2) - y ; + uint64_t z3 = (z << 2) - z ; + + uint64_t s = x + y + z; + + tgt[0] = - ( (s + y3) << 1 ); + tgt[1] = - ( (s + z3) << 1 ); + tgt[2] = - ( (s + x3) << 1 ); +} + +//------------------------------------------------------------------------------ + +void uint64_naive_circular_conv( int n, uint64_t *input, uint64_t *coeffs, uint64_t *output ) { + for(int k=0; k> 2; + x1[i] = ( u1[i] + 2*u3[i] ) >> 2; + x2[i] = ( u0[i] - 2*u2[i] ) >> 2; + x3[i] = ( u1[i] - 2*u3[i] ) >> 2; + } + + for(int k=0; k<12; k++) { + output[k] = input_rect[k%4][k%3]; + } +} + +//------------------------------------------------------------------------------ + +/* + +void uint64_test_short_conv_with() { + + printf("test short convolution algos for uint64\n"); + + uint64_t input [12]; + uint64_t coeffs [12] = {7,8,21,22,6,7,9,10,13,26,8,23}; + uint64_t output [12]; + uint64_t reference[12]; + + // generate some "random-looking" numbers + uint64_t a=123459; + uint64_t b=789013; + for(int i=0;i<12;i++) { + uint64_t c = (a*b) ^ (a - 12345); + uint64_t d = (c*a) ^ (b + 67891); + input [i] = c & 0x0fffffff; // WE WANT NO OVERFLOW! 
+ a = b + c + 1; + b = 3*a - 5*c + d - 3; + } + + for(int i=0; i<12; i++) { + printf("x[%d] = %016llx ; h[%d] = %016llx\n" , i, input[i], i, coeffs[i] ); + } + + // -----------[ length = 12 ]----------- + + printf("\n"); + printf("length = 12\n"); + + uint64_naive_circular_conv ( 12, input, coeffs, reference ); + uint64_circular_conv_12_with ( input, output ); + + for(int i=0; i<12; i++) { + printf("out[%d] = %016llx ; ref[%d] = %016llx\n" , i, output[i], i, reference[i] ); + } +} + +*/ + +//------------------------------------------------------------------------------ + diff --git a/goldilocks_hash/monolith.nim b/goldilocks_hash/monolith.nim new file mode 100644 index 0000000..a508ef5 --- /dev/null +++ b/goldilocks_hash/monolith.nim @@ -0,0 +1,13 @@ + +import goldilocks_hash/types +import goldilocks_hash/goldilocks + +import goldilocks_hash/monolith/sponge +import goldilocks_hash/monolith/merkle +# import goldilocks_hash/monolith/permutation +# import goldilocks_hash/monolith/compress + +export types +export goldilocks +export sponge +export merkle diff --git a/goldilocks_hash/monolith/compress.nim b/goldilocks_hash/monolith/compress.nim new file mode 100644 index 0000000..36858d3 --- /dev/null +++ b/goldilocks_hash/monolith/compress.nim @@ -0,0 +1,14 @@ + +import ../types + +proc c_compress(a, b: var Digest, key: uint64, output: var Digest) {. header: "../cbits/goldilocks.h", importc: "goldilocks_monolith_keyed_compress", cdecl .} + +# keyed compression function +func compress*(a, b: Digest, key: uint64 = 0) : Digest = + var x: Digest = a + var y: Digest = b + var output: Digest + c_compress(x,y,key,output) + return output + + diff --git a/goldilocks_hash/monolith/merkle.nim b/goldilocks_hash/monolith/merkle.nim new file mode 100644 index 0000000..7f89536 --- /dev/null +++ b/goldilocks_hash/monolith/merkle.nim @@ -0,0 +1,76 @@ + +# binary merkle trees, where the nodes and leaves are four-tuples of field elements +# +# we use a custom "safe" merkle tree API, so that: +# +# - there is no collision between different input lengths +# - there is no collision if you remove the bottommost layer (or several layers) +# - the merkle root of the singleton is not itself + +import ../types +import ./compress +#import ./io + +#------------------------------------------------------------------------------- + +const KeyNone : uint64 = 0x0 +const KeyBottomLayer : uint64 = 0x1 +const KeyOdd : uint64 = 0x2 +const KeyOddAndBottomLayer : uint64 = 0x3 + +#------------------------------------------------------------------------------- + +type Merkle* = object + todo : seq[Digest] # nodes that haven't been combined yet + width: int # width of the current subtree + leafs: int # amount of leafs processed + +func init*(_: type Merkle): Merkle = + Merkle(width: 2) + +func internalCompress(merkle: var Merkle, odd: static bool) = + when odd: + let a = merkle.todo.pop() + let b = zeroDigest + let key = if merkle.width == 2: KeyOddAndBottomLayer else: KeyOdd + merkle.todo.add(compress(a, b, key = key)) + merkle.leafs += merkle.width div 2 # zero node represents this many leafs + else: + let b = merkle.todo.pop() + let a = merkle.todo.pop() + let key = if merkle.width == 2: KeyBottomLayer else: KeyNone + merkle.todo.add(compress(a, b, key = key)) + merkle.width *= 2 + +func update*(merkle: var Merkle, element: Digest) = + merkle.todo.add(element) + inc merkle.leafs + merkle.width = 2 + while merkle.width <= merkle.leafs and merkle.leafs mod merkle.width == 0: + merkle.internalCompress(odd = false) + +func 
finish*(merkle: var Merkle): Digest = + assert merkle.todo.len > 0, "merkle root of empty sequence is not defined" + + if merkle.leafs == 1: + merkle.internalCompress(odd = true) + + while merkle.todo.len > 1: + if merkle.leafs mod merkle.width == 0: + merkle.internalCompress(odd = false) + else: + merkle.internalCompress(odd = true) + + return merkle.todo[0] + +func digest*(_: type Merkle, elements: openArray[Digest]): Digest = + var merkle = Merkle.init() + for element in elements: + merkle.update(element) + return merkle.finish() + +func merkleRoot*(elements: openArray[Digest]): Digest = Merkle.digest(elements) + +#------------------------------------------------------------------------------- + + diff --git a/goldilocks_hash/monolith/permutation.nim b/goldilocks_hash/monolith/permutation.nim new file mode 100644 index 0000000..b97f1df --- /dev/null +++ b/goldilocks_hash/monolith/permutation.nim @@ -0,0 +1,17 @@ + +import ../types + +# the Monolith permutation (mutable, in-place version) +proc permInPlace* (state: var State) {. header: "../cbits/goldilocks.h", importc: "goldilocks_monolith_permutation", cdecl .} +proc permInPlaceF12*(state: var F12 ) {. header: "../cbits/goldilocks.h", importc: "goldilocks_monolith_permutation", cdecl .} + +# the Monolith permutation (pure version) +func perm*(state: State): State = + var tmp = state + permInPlace(tmp) + return tmp + +func permF12*(state: F12): F12 = + var tmp = state + permInPlaceF12(tmp) + return tmp diff --git a/goldilocks_hash/monolith/sponge.nim b/goldilocks_hash/monolith/sponge.nim new file mode 100644 index 0000000..34bb925 --- /dev/null +++ b/goldilocks_hash/monolith/sponge.nim @@ -0,0 +1,121 @@ + +# sponge construction for linear hashing. +# +# we recommend to use rate=8 +# (note that we have the state width fixed to t=12) +# +# we use the 10* padding strategy (that is, always append an 1, and append +# as many zeros as required so that the final length is divisible by the rate) +# both when hashing bytes and when hashing field elements + +#import std/assertions # on 1.6.18 with an M2 i got "cannot open file: std/assertions" .... 
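For readers following the C side, the construction implemented by this module corresponds to the sketch below, written directly against cbits/goldilocks.c (illustration only; monolith_sponge_sketch is a made-up name, and it hard-codes rate = 8 and field-element input, i.e. nbits = 63, so its IV matches the one used by goldilocks_monolith_felts_digest):

#include <stdint.h>
#include "goldilocks.h"   // goldilocks_add, goldilocks_monolith_permutation

// Minimal rate-8 sponge over field elements, mirroring the Nim Sponge below.
// The real entry points are goldilocks_monolith_felts_digest and the Nim API.
void monolith_sponge_sketch(int n, const uint64_t *input, uint64_t *hash) {
    const int rate = 8;
    uint64_t state[12] = {0};
    state[8] = 65536*63 + 256*12 + rate;          // domain separation IV: 65536*nbits + 256*t + rate

    int pos = 0;
    for(int i=0; i<n; i++) {                      // absorb: add into the rate part of the state
        state[pos] = goldilocks_add(state[pos], input[i]);
        if (++pos == rate) { goldilocks_monolith_permutation(state); pos = 0; }
    }
    state[pos] = goldilocks_add(state[pos], 1);   // 10* padding: append a single 1 ...
    goldilocks_monolith_permutation(state);       // ... the zeros up to the rate boundary are no-ops
    for(int j=0; j<4; j++) hash[j] = state[j];    // squeeze a 4-element digest
}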
+ +import ../types +import ../goldilocks +import ./permutation +#import ./io + +#------------------------------------------------------------------------------- + +# "import std/assertions" does not work +# i got the error: "cannot open file: std/assertions" +proc fakeAssert( cond: bool, msg: string ) = + if not cond: + raise newException(AssertionDefect, msg) + +#------------------------------------------------------------------------------- + +type + Sponge*[T: static typedesc, rate: static int] = object + state: F12 + lenModRate: uint + +func numberOfBits(T: static typedesc): int = + if T is F: return 63 + if T is byte: return 8 + if T is bool: return 1 + fakeAssert( false , "unsupported input type for sponge construction" ) + +func initialize[T: static typedesc, rate: static int](sponge: var Sponge[T,rate]) = + fakeAssert( rate >= 1 and rate <= 8 , "with t=12, rate must be at most 8 (and positive)" ) + let nbits = numberOfBits(T) + let IV = toF( 0x10000*uint64(nbits) + 0x100*12 + uint64(rate) ) # domain separation IV := (65536*nbits + 256*t + r) + sponge.state[8] = IV; + +#--------------------------------------- + +func extractDigestF4[T: static typedesc, rate: static int](sponge: var Sponge[T,rate]) : F4 = + var digest : F4 + for i in 0..<4: digest[i] = sponge.state[i] + return digest + +func extractDigest[T: static typedesc, rate: static int](sponge: var Sponge[T,rate]) : Digest = + return toDigest(sponge.extractDigestF4()) + +#--------------------------------------- + +func update*[rate: static int](sponge: var Sponge[typedesc[F],rate], x: F) = + sponge.state[sponge.lenModRate] += x + sponge.lenModRate = (sponge.lenModRate + 1) mod rate + if (sponge.lenModRate == 0): + permInPlaceF12( sponge.state ); + +func finish*[T: static typedesc, rate:static int](sponge: var Sponge[T,rate]): Digest = + # padding + sponge.update(one) + while( sponge.lenModRate != 0): + sponge.update(zero) + return sponge.extractDigest() + +#------------------------------------------------------------------------------- + +# # _: type Sponge, +#func init*( _: type Sponge, T: static typedesc, rate: static int = 8): Sponge[T,rate] = +# when (rate < 1 or rate > 8): +# {.error: "only rates between 1 and 8 are supported".} +# var sponge: Sponge[T,rate] +# initialize[T,rate](sponge) +# return sponge + +func newSponge*[T: static typedesc, rate: static int = 8](): Sponge[T,rate] = + when (rate < 1 or rate > 8): + {.error: "only rates between 1 and 8 are supported".} + var sponge: Sponge[T,rate] + initialize[T,rate](sponge) + return sponge + +#--------------------------------------- + +# digest a sequence of field elements +func digestNim*(rate: static int = 8, elements: openArray[F]): Digest = + var sponge : Sponge[typedesc[F],rate] = newSponge[typedesc[F],rate]() + for element in elements: + sponge.update(element) + return sponge.finish() + +# # digest a sequence of bytes +#func digestNim*(rate: static int = 8, bytes: openArray[byte],): F = +# var sponge = Sponge.init(nbits=8, rate) +# for element in bytes.elements(F): +# sponge.update(element) +# return sponge.finish() + +#--------------------------------------- + +proc digestFeltsRawC(rate: int, len: int, input: ptr UncheckedArray[F ], hash: var F4) {. header: "../cbits/goldilocks.h", importc: "goldilocks_monolith_felts_digest", cdecl .} +proc digestBytesRawC(rate: int, len: int, input: ptr UncheckedArray[byte], hash: var F4) {. 
header: "../cbits/goldilocks.h", importc: "goldilocks_monolith_bytes_digest", cdecl .} + +func digestFeltsC*(rate: static int = 8, felts: openArray[F]): Digest = + var digest : F4 + let input = cast[ptr UncheckedArray[F]]( felts.unsafeAddr ) + digestFeltsRawC(rate, felts.len, input, digest) + return toDigest(digest) + +func digestBytesC*(rate: static int = 8, bytes: openArray[byte]): Digest = + var digest : F4 + let input = cast[ptr UncheckedArray[byte]]( bytes.unsafeAddr ) + digestBytesRawC(rate, bytes.len, input, digest) + return toDigest(digest) + +#------------------------------------------------------------------------------- + diff --git a/reference/Common.hs b/reference/Common.hs index 630e4c8..add43e1 100644 --- a/reference/Common.hs +++ b/reference/Common.hs @@ -40,6 +40,12 @@ extractDigest :: State -> Digest extractDigest state = case elems state of (a:b:c:d:_) -> MkDigest a b c d +listToDigest :: [F] -> Digest +listToDigest [a,b,c,d] = MkDigest a b c d + +digestToList :: Digest -> [F] +digestToList (MkDigest a b c d) = [a,b,c,d] + -------------------------------------------------------------------------------- digestToWord64s :: Digest -> [Word64] diff --git a/reference/TestGen/TestCompress.hs b/reference/TestGen/TestCompress.hs new file mode 100644 index 0000000..21e1f2b --- /dev/null +++ b/reference/TestGen/TestCompress.hs @@ -0,0 +1,56 @@ + +-- | Generate test cases for Nim + +module TestGen.TestCompress where + +-------------------------------------------------------------------------------- + +import Data.Array +import Data.List + +import System.IO + +import Merkle +import Goldilocks +import Common + +import TestGen.Shared + +-------------------------------------------------------------------------------- + +printArray :: String -> [F] -> String +printArray varname xs = unlines (header : stuff ++ footer) where + header = "const " ++ varname ++ "* : array[" ++ show (length xs) ++ ", F] = " + footer = [" ]",""] + stuff = showListWith nimShowF xs + +-------------------------------------------------------------------------------- + +left = map toF [1..4] +right = map toF [5..8] + +compress_ :: Hash -> Int -> [F] -> [F] -> [F] +compress_ hash key as bs = digestToList $ keyedCompress hash key (listToDigest as) (listToDigest bs) + +-------------------------------------------------------------------------------- + +printTests :: Hash -> IO () +printTests hash = hPrintTests stdout hash + +hPrintTests :: Handle -> Hash -> IO () +hPrintTests h hash = hPutStrLn h $ unlines + [ printArray "refInp1" left + , printArray "refInp2" right + , printArray "refOutKey0" (compress_ hash 0 left right) + , printArray "refOutKey1" (compress_ hash 1 left right) + , printArray "refOutKey2" (compress_ hash 2 left right) + , printArray "refOutKey3" (compress_ hash 3 left right) + ] + +writeTests :: Hash -> IO () +writeTests hash = withFile "compressTestCases.nim" WriteMode $ \h -> do + hPutStrLn h "# generated by TestGen/TestCompress.hs\n" + hPutStrLn h "import goldilocks_hash/types\n" + hPrintTests h hash + +-------------------------------------------------------------------------------- diff --git a/tests/goldilocks_hash/monolith/compressTestCases.nim b/tests/goldilocks_hash/monolith/compressTestCases.nim new file mode 100644 index 0000000..8628562 --- /dev/null +++ b/tests/goldilocks_hash/monolith/compressTestCases.nim @@ -0,0 +1,53 @@ +# generated by TestGen/TestCompress.hs + +import goldilocks_hash/types + +const refInp1* : array[4, F] = + [ toF( 0x0000000000000001'u64 ) + , toF( 
0x0000000000000002'u64 ) + , toF( 0x0000000000000003'u64 ) + , toF( 0x0000000000000004'u64 ) + ] + + +const refInp2* : array[4, F] = + [ toF( 0x0000000000000005'u64 ) + , toF( 0x0000000000000006'u64 ) + , toF( 0x0000000000000007'u64 ) + , toF( 0x0000000000000008'u64 ) + ] + + +const refOutKey0* : array[4, F] = + [ toF( 0x794c4b4308cb8286'u64 ) + , toF( 0xe6ca7b9c49970427'u64 ) + , toF( 0x89b2e0614bc0af93'u64 ) + , toF( 0xd0f63984b0d43850'u64 ) + ] + + +const refOutKey1* : array[4, F] = + [ toF( 0xe29e85f8f1782476'u64 ) + , toF( 0xd32a5179356e274f'u64 ) + , toF( 0x00fd4b778d2a019e'u64 ) + , toF( 0x060ca2a006f4815a'u64 ) + ] + + +const refOutKey2* : array[4, F] = + [ toF( 0xd3b556e546fe9ea5'u64 ) + , toF( 0x5d99e5d70188e012'u64 ) + , toF( 0x6bd1f2c0940918f4'u64 ) + , toF( 0xe25b659a26b33f27'u64 ) + ] + + +const refOutKey3* : array[4, F] = + [ toF( 0x12b810db565f56db'u64 ) + , toF( 0x25f66032a99e4e52'u64 ) + , toF( 0x3ceca3fb262075b4'u64 ) + , toF( 0x77602ef03231a802'u64 ) + ] + + + diff --git a/tests/goldilocks_hash/monolith/testCompress.nim b/tests/goldilocks_hash/monolith/testCompress.nim new file mode 100644 index 0000000..eb190b4 --- /dev/null +++ b/tests/goldilocks_hash/monolith/testCompress.nim @@ -0,0 +1,38 @@ + +import std/unittest + +import goldilocks_hash/types +import goldilocks_hash/monolith/compress + +import ./compressTestCases + +#------------------------------------------------------------------------------- + +suite "monolith compression": + + test "compression of [1..4] and [5..8] with key=0": + let input1 : Digest = toDigest(refInp1) + let input2 : Digest = toDigest(refInp2) + let output : Digest = compress(input1, input2) + check ( fromDigest(output) == refOutKey0 ) + + test "compression of [1..4] and [5..8] with key=1": + let input1 : Digest = toDigest(refInp1) + let input2 : Digest = toDigest(refInp2) + let output : Digest = compress(input1, input2, key=1) + check ( fromDigest(output) == refOutKey1 ) + + test "compression of [1..4] and [5..8] with key=2": + let input1 : Digest = toDigest(refInp1) + let input2 : Digest = toDigest(refInp2) + let output : Digest = compress(input1, input2, key=2) + check ( fromDigest(output) == refOutKey2 ) + + test "compression of [1..4] and [5..8] with key=3": + let input1 : Digest = toDigest(refInp1) + let input2 : Digest = toDigest(refInp2) + let output : Digest = compress(input1, input2, key=3) + check ( fromDigest(output) == refOutKey3 ) + +#------------------------------------------------------------------------------- + diff --git a/tests/goldilocks_hash/monolith/testMerkle.nim b/tests/goldilocks_hash/monolith/testMerkle.nim new file mode 100644 index 0000000..c7074bd --- /dev/null +++ b/tests/goldilocks_hash/monolith/testMerkle.nim @@ -0,0 +1,31 @@ + +import std/unittest +# import std/sequtils + +import goldilocks_hash/types +import goldilocks_hash/monolith/merkle + +import ./merkleTestCases + +#------------------------------------------------------------------------------- + +func digestSeq(n: int): seq[Digest] = + var input : seq[Digest] = newSeq[Digest](n) + for i in 0..
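A usage sketch of the keyed compression together with the Merkle domain-separation keys defined in goldilocks_hash/monolith/merkle.nim (illustration only; merkle_key_examples is a made-up name, and it assumes zeroDigest is the all-zero digest). These two cases can be read off directly from the Nim Merkle code: a two-leaf tree is a single bottom-layer compression, and a singleton tree pairs the leaf with the zero digest under the odd-and-bottom-layer key, so the root of a singleton is never the leaf itself:

#include <stdint.h>
#include "goldilocks.h"   // goldilocks_monolith_keyed_compress

// Keys mirror merkle.nim: 0 = none, 1 = bottom layer, 2 = odd, 3 = odd + bottom layer.
void merkle_key_examples(const uint64_t leaf1[4], const uint64_t leaf2[4],
                         uint64_t root_of_two[4], uint64_t root_of_one[4]) {
    const uint64_t zero[4] = {0, 0, 0, 0};

    // root of a two-leaf tree: a single compression with the bottom-layer key
    goldilocks_monolith_keyed_compress(leaf1, leaf2, /*key=*/1, root_of_two);

    // root of a one-leaf tree: the odd leaf is paired with the zero digest
    // under key 3 (odd AND bottom layer), so the root differs from the leaf
    goldilocks_monolith_keyed_compress(leaf1, zero, /*key=*/3, root_of_one);
}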