fast C implementation for Goldilocks field + tests for field implementations

2026-02-20 21:53:10 +00:00 · 2025-10-14 18:09:30 +02:00 · 2025-10-14 18:09:30 +02:00 · 58756dd824
commit 58756dd824
parent bf75c153b2
17 changed files with 1532 additions and 4 deletions
--- a/reference/src/Field/Class.hs
+++ b/reference/src/Field/Class.hs
@ -0,0 +1,55 @@
+
+module Field.Class where
+
+--------------------------------------------------------------------------------
+
+import Data.Proxy
+
+import System.Random
+
+import qualified Field.Goldilocks           as Goldi   
+import qualified Field.Goldilocks.Extension as GoldiExt
+
+--------------------------------------------------------------------------------
+
+-- | Common interface of the finite fields used by the generic property tests.
+--
+-- Arithmetic comes from the 'Num' and 'Fractional' superclasses; the methods
+-- below add constants, predicates, exponentiation and random sampling.
+class (Show a, Eq a, Num a, Fractional a) => Field a where
+  fieldSize :: Proxy a -> Integer   -- ^ number of elements of the field
+  zero      :: a                    -- ^ additive identity
+  one       :: a                    -- ^ multiplicative identity
+  isZero    :: a -> Bool            -- ^ test against 'zero'
+  isOne     :: a -> Bool            -- ^ test against 'one'
+  square    :: a -> a               -- ^ @square x == x*x@
+  power     :: a -> Integer -> a    -- ^ exponentiation with an arbitrary-size exponent
+  power_    :: a -> Int     -> a    -- ^ exponentiation with a machine-size exponent
+  rndIO     :: IO a                 -- ^ sample an element in 'IO'
+
+-- | Multiplicative inverse; a field-flavoured name for 'recip'.
+inverse :: Field a => a -> a
+inverse x = recip x
+
+--------------------------------------------------------------------------------
+
+instance Field Goldi.F where
+  fieldSize _ = Goldi.goldilocksPrime
+  zero        = Goldi.zero
+  one         = Goldi.one
+  isZero      = Goldi.isZero
+  isOne       = Goldi.isOne
+  square      = Goldi.sqr
+  power       = Goldi.pow
+  power_      = Goldi.pow_
+  rndIO       = randomIO
+
+--------------------------------------------------------------------------------
+
+instance Field GoldiExt.FExt where
+  fieldSize _ = (Goldi.goldilocksPrime ^ 2)
+  zero        = GoldiExt.zero
+  one         = GoldiExt.one
+  isZero      = GoldiExt.isZero
+  isOne       = GoldiExt.isOne
+  square      = GoldiExt.sqr
+  power       = GoldiExt.pow
+  power_      = GoldiExt.pow_
+  rndIO       = randomIO
+
+--------------------------------------------------------------------------------
--- a/reference/src/Field/Goldilocks.hs
+++ b/reference/src/Field/Goldilocks.hs
@ -1,8 +1,14 @@

-module Field.Goldilocks 
-  ( module Field.Goldilocks.Slow
-  )
-  where
+{-# LANGUAGE CPP #-}

+#ifdef USE_NAIVE_HASKELL
+
+module Field.Goldilocks ( module Field.Goldilocks.Slow ) where
 import Field.Goldilocks.Slow

+#else
+
+module Field.Goldilocks ( module Field.Goldilocks.Fast ) where
+import Field.Goldilocks.Fast
+
+#endif
--- a/reference/src/Field/Goldilocks/Extension.hs
+++ b/reference/src/Field/Goldilocks/Extension.hs
@ -17,6 +17,7 @@ import System.Random
 import Data.Binary

 import Field.Goldilocks ( F )
+import qualified Field.Goldilocks as Goldi

 --------------------------------------------------------------------------------

@ -58,6 +59,17 @@ instance Random F2 where

 --------------------------------------------------------------------------------

+-- | The base-field constants 0, 1 and 2 embedded into the extension
+-- (second component zero).
+zero :: F2
+zero = F2 Goldi.zero Goldi.zero
+
+one :: F2
+one = F2 Goldi.one Goldi.zero
+
+two :: F2
+two = F2 Goldi.two Goldi.zero
+
+-- | Both components must vanish.
+isZero :: F2 -> Bool
+isZero (F2 re im)
+  | Goldi.isZero re = Goldi.isZero im
+  | otherwise       = False
+
+-- | First component is one and the second component vanishes.
+isOne :: F2 -> Bool
+isOne (F2 re im)
+  | Goldi.isOne re = Goldi.isZero im
+  | otherwise      = False
+
+--------------------------------------------------------------------------------
+
 inj :: F -> F2
 inj r = F2 r 0 

--- a/reference/src/Field/Goldilocks/Fast.hs
+++ b/reference/src/Field/Goldilocks/Fast.hs
@ -0,0 +1,153 @@
+
+-- | Bindings to a C implementation of the Goldilocks prime field
+
+{-# LANGUAGE ForeignFunctionInterface, BangPatterns, NumericUnderscores #-}
+module Field.Goldilocks.Fast where
+
+--------------------------------------------------------------------------------
+
+import Prelude hiding ( div )
+import qualified Prelude
+
+import Data.Bits
+import Data.Word
+import Data.Ratio
+
+import Foreign.C
+
+import System.Random
+
+import Data.Binary
+import Data.Binary.Get ( getWord64le )
+import Data.Binary.Put ( putWord64le )
+
+import Text.Printf
+
+--------------------------------------------------------------------------------
+
+type F = Goldilocks
+
+-- | The underlying machine word of a field element.
+fromF :: F -> Word64
+fromF (MkGoldilocks w) = w
+
+-- | Wrap a word as-is; no reduction modulo the prime is performed.
+unsafeToF :: Word64 -> F
+unsafeToF w = MkGoldilocks w
+
+-- | Convert a word to a field element, reducing modulo the prime.
+toF :: Word64 -> F
+toF w = mkGoldilocks (toInteger w)
+
+-- | Convert a machine integer to a field element, reducing modulo the prime.
+intToF :: Int -> F
+intToF k = mkGoldilocks (toInteger k)
+
+-- | Serialised as one little-endian 64-bit word; decoding goes through 'toF',
+-- so the decoded word is reduced modulo the prime.
+instance Binary F where
+  put = putWord64le . fromF
+  get = fmap toF getWord64le
+
+--------------------------------------------------------------------------------
+
+-- | An element of the Goldilocks prime field, stored as a single 'Word64'.
+--
+-- NOTE(review): equality is the derived word equality, so values are presumably
+-- expected to be canonical (@< p@); the raw constructor does not enforce this.
+newtype Goldilocks 
+  = MkGoldilocks Word64
+  deriving Eq
+
+-- | Shown as a zero-padded 16-digit hexadecimal word.
+instance Show Goldilocks where
+  show = printf "0x%016x" . fromF
+
+-- | Small field constants.
+zero :: Goldilocks
+zero = MkGoldilocks 0
+
+one :: Goldilocks
+one = MkGoldilocks 1
+
+two :: Goldilocks
+two = MkGoldilocks 2
+
+-- | Comparison of the stored word against 0.
+isZero :: Goldilocks -> Bool
+isZero = (== zero)
+
+-- | Comparison of the stored word against 1.
+isOne :: Goldilocks -> Bool
+isOne = (== one)
+
+--------------------------------------------------------------------------------
+
+-- | Ring operations, delegated to the C-backed wrappers defined in this module.
+instance Num Goldilocks where
+  fromInteger = mkGoldilocks     -- reduces modulo the prime
+  negate = neg
+  (+)    = add
+  (-)    = sub
+  (*)    = mul
+  abs    = id                    -- no ordering on the field: 'abs' is the identity...
+  signum _ = MkGoldilocks 1      -- ...and 'signum' is constantly 1, so @abs x * signum x == x@
+
+-- | Field division and inversion; a rational is mapped to numerator divided by
+-- denominator, both reduced into the field first.
+instance Fractional Goldilocks where
+  fromRational q = div (mkGoldilocks (numerator q)) (mkGoldilocks (denominator q))
+  recip x        = inv x
+  x / y          = div x y
+
+-- | Sampling draws a word from the inclusive range @[0, p-1]@; ranged sampling
+-- is deliberately unsupported.
+instance Random Goldilocks where
+  random g = (MkGoldilocks w, g')
+    where
+      (w, g') = randomR (0, goldilocksPrimeWord64 - 1) g
+  randomR = error "randomR/Goldilocks: doesn't make much sense"
+
+--------------------------------------------------------------------------------
+
+-- | The Goldilocks prime @p = 2^64 - 2^32 + 1 = 0xffff_ffff_0000_0001@
+goldilocksPrime :: Integer
+goldilocksPrime = 2 ^ (64 :: Int) - 2 ^ (32 :: Int) + 1
+
+-- | The same prime as a machine word (it fits into 64 bits)
+goldilocksPrimeWord64 :: Word64
+goldilocksPrimeWord64 = fromInteger goldilocksPrime
+
+-- | Reduce an integer into the range @[0, p-1]@.
+modp :: Integer -> Integer
+modp n = n `mod` goldilocksPrime
+
+-- | Build a field element from an arbitrary integer, reducing modulo the prime.
+mkGoldilocks :: Integer -> Goldilocks
+mkGoldilocks n = MkGoldilocks (fromInteger (modp n))
+
+-- | A fixed generator of the multiplicative subgroup of the field
+theMultiplicativeGenerator :: Goldilocks
+theMultiplicativeGenerator = mkGoldilocks 7
+
+--------------------------------------------------------------------------------
+
+-- Raw bindings to the C field arithmetic. Each symbol is imported with an
+-- @unsafe@ call and a pure type, i.e. as a plain function on 'Word64'.
+-- NOTE(review): the operands are presumably expected to be canonical (@< p@);
+-- confirm against the C sources.
+foreign import ccall unsafe "goldilocks_neg" c_goldilocks_neg :: Word64 -> Word64
+foreign import ccall unsafe "goldilocks_add" c_goldilocks_add :: Word64 -> Word64 -> Word64
+foreign import ccall unsafe "goldilocks_sub" c_goldilocks_sub :: Word64 -> Word64 -> Word64
+foreign import ccall unsafe "goldilocks_sqr" c_goldilocks_sqr :: Word64 -> Word64
+foreign import ccall unsafe "goldilocks_mul" c_goldilocks_mul :: Word64 -> Word64 -> Word64
+foreign import ccall unsafe "goldilocks_inv" c_goldilocks_inv :: Word64 -> Word64
+foreign import ccall unsafe "goldilocks_div" c_goldilocks_div :: Word64 -> Word64 -> Word64
+-- the exponent is a C @int@, hence 'CInt' rather than a Haskell 'Int'
+foreign import ccall unsafe "goldilocks_pow" c_goldilocks_pow :: Word64 -> CInt   -> Word64
+
+-- | Additive inverse.
+neg :: Goldilocks -> Goldilocks
+neg = MkGoldilocks . c_goldilocks_neg . fromF
+
+-- | Field addition.
+add :: Goldilocks -> Goldilocks -> Goldilocks
+add x y = MkGoldilocks (c_goldilocks_add (fromF x) (fromF y))
+
+-- | Field subtraction.
+sub :: Goldilocks -> Goldilocks -> Goldilocks
+sub x y = MkGoldilocks (c_goldilocks_sub (fromF x) (fromF y))
+
+-- | Squaring.
+sqr :: Goldilocks -> Goldilocks
+sqr = MkGoldilocks . c_goldilocks_sqr . fromF
+
+-- | Field multiplication.
+mul :: Goldilocks -> Goldilocks -> Goldilocks
+mul x y = MkGoldilocks (c_goldilocks_mul (fromF x) (fromF y))
+
+-- | Multiplicative inverse.
+inv :: Goldilocks -> Goldilocks
+inv = MkGoldilocks . c_goldilocks_inv . fromF
+
+-- | Field division.
+div :: Goldilocks -> Goldilocks -> Goldilocks
+div x y = MkGoldilocks (c_goldilocks_div (fromF x) (fromF y))
+
+--------------------------------------------------------------------------------
+
+-- | Exponentiation with a machine-size exponent.
+--
+-- The C routine takes a C @int@, which is narrower than a 64-bit Haskell 'Int'.
+-- Exponents that fit are passed to C; anything else is handled by the
+-- arbitrary-precision 'pow' instead of being silently truncated.
+-- @minBound :: CInt@ is also excluded from the fast path, because negating it on
+-- the C side is not representable.
+pow_ :: Goldilocks -> Int -> Goldilocks
+pow_ x@(MkGoldilocks w) e
+  | lo < e && e <= hi = MkGoldilocks $ c_goldilocks_pow w (fromIntegral e :: CInt)
+  | otherwise         = pow x (toInteger e)
+  where
+    lo, hi :: Int
+    lo = fromIntegral (minBound :: CInt)
+    hi = fromIntegral (maxBound :: CInt)
+
+-- | Exponentiation with an arbitrary-size exponent (square-and-multiply).
+-- Negative exponents are handled by inverting the base first.
+pow :: Goldilocks -> Integer -> Goldilocks
+pow x e = case compare e 0 of
+  EQ -> 1
+  LT -> pow (inv x) (negate e)
+  GT -> loop 1 x e
+  where
+    -- result so far, current power-of-two power of the base, remaining exponent
+    loop !result _     0  = result
+    loop !result !base !k
+      | odd k     = loop (result * base) (sqr base) (k `shiftR` 1)
+      | otherwise = loop result          (sqr base) (k `shiftR` 1)
+
+--------------------------------------------------------------------------------
+
--- a/reference/src/Field/Goldilocks/Slow.hs
+++ b/reference/src/Field/Goldilocks/Slow.hs
@ -47,6 +47,15 @@ newtype Goldilocks
 instance Show Goldilocks where
  show (MkGoldilocks k) = printf "0x%016x" k

+-- | Small field constants.
+zero :: Goldilocks
+zero = MkGoldilocks 0
+
+one :: Goldilocks
+one = MkGoldilocks 1
+
+two :: Goldilocks
+two = MkGoldilocks 2
+
+-- | Test of the stored value against 0.
+isZero :: Goldilocks -> Bool
+isZero (MkGoldilocks k) = k == 0
+
+-- | Test of the stored value against 1.
+isOne :: Goldilocks -> Bool
+isOne (MkGoldilocks k) = k == 1
+
+
 --------------------------------------------------------------------------------

 instance Num Goldilocks where
--- a/reference/src/Field/Properties.hs
+++ b/reference/src/Field/Properties.hs
@ -0,0 +1,321 @@
+
+
+-- | Property tests for rings and fields
+
+{-# LANGUAGE ScopedTypeVariables, Rank2Types, TypeApplications, FlexibleInstances, ConstraintKinds #-}
+module Field.Properties where
+
+--------------------------------------------------------------------------------
+
+import Data.Proxy
+import Data.IORef
+
+import Control.Monad
+import System.IO
+import System.Random
+
+import Field.Class
+
+--------------------------------------------------------------------------------
+-- compatibility hacks
+
+type Ring a = Field a
+
+--------------------------------------------------------------------------------
+
+-- | Run the ring properties followed by the field-only properties.
+runFieldTests :: forall a. Field a => IORef Bool -> Int -> Proxy a -> IO ()
+runFieldTests okflag n pxy =
+  runRingTests okflag n pxy >> runFieldOnlyTests okflag n pxy
+
+
+-- | Run every entry of 'ringProps' @n@ times on random elements of the field
+-- selected by the proxy; failures are recorded in the flag by 'doTests'.
+runRingTests :: forall a. Ring a => IORef Bool -> Int -> Proxy a -> IO ()
+runRingTests okflag n pxy = mapM_ check ringProps
+  where
+    sample :: IO a
+    sample = rndIO
+
+    check :: RingProp -> IO Bool
+    check (RingProp1 test name) = doTests okflag n name (test <$> sample)
+    check (RingProp2 test name) = doTests okflag n name (test <$> sample <*> sample)
+    check (RingProp3 test name) = doTests okflag n name (test <$> sample <*> sample <*> sample)
+
+-- | Run every entry of 'fieldOnlyProps' @n@ times on random inputs; the
+-- exponent-taking properties get an exponent drawn from @[-1000,1000]@.
+runFieldOnlyTests :: forall a. Field a => IORef Bool -> Int -> Proxy a -> IO ()
+runFieldOnlyTests okflag n pxy = mapM_ check fieldOnlyProps
+  where
+    sample :: IO a
+    sample = rndIO
+
+    check :: FieldProp -> IO Bool
+    check (FieldProp1 test name) = doTests okflag n name (test <$> sample)
+    check (FieldProp2 test name) = doTests okflag n name (test <$> sample <*> sample)
+    check (FieldProp3 test name) = doTests okflag n name (test <$> sample <*> sample <*> sample)
+    check (FieldPropE test name) = doTests okflag n name (test <$> sample <*> randomRIO (-1000,1000::Int))
+
+--------------------------------------------------------------------------------
+
+-- | Run a test action @n@ times, print a one-line report padded to a fixed
+-- column, and clear the shared flag if any run failed. Returns whether all
+-- runs passed.
+doTests :: IORef Bool -> Int -> String -> IO Bool -> IO Bool
+doTests okflag n name testAction = do
+  let label   = " - " ++ name ++ "... "
+      padding = replicate (30 - length label) ' '
+  putStr (label ++ padding)
+  hFlush stdout
+  results <- replicateM n testAction
+  let passed   = and results
+      failures = length (filter not results)
+  if passed
+    then putStrLn ("ok (passed " ++ show n ++ " tests)")
+    else do
+      writeIORef okflag False
+      putStrLn ("FAILED!! (FAILED " ++ show failures ++ " tests!)")
+  return passed
+
+--------------------------------------------------------------------------------
+
+-- | A named ring property, polymorphic in the ring, taking 1, 2 or 3 elements.
+data RingProp
+  = RingProp1  (forall a. Ring a  => a -> Bool          ) String
+  | RingProp2  (forall a. Ring a  => a -> a -> Bool     ) String
+  | RingProp3  (forall a. Ring a  => a -> a -> a -> Bool) String
+
+-- | A named field property, polymorphic in the field, taking 1, 2 or 3
+-- elements, or (@FieldPropE@) one element and an 'Int' exponent.
+data FieldProp
+  = FieldProp1 (forall a. Field a => a -> Bool          ) String
+  | FieldProp2 (forall a. Field a => a -> a -> Bool     ) String
+  | FieldProp3 (forall a. Field a => a -> a -> a -> Bool) String
+  | FieldPropE (forall a. Field a => a -> Int -> Bool   ) String
+
+--------------------------------------------------------------------------------
+
+ringProps :: [RingProp]
+ringProps = 
+  [ RingProp1 prop_add_left_unit               "add left unit"
+  , RingProp1 prop_add_right_unit              "add right unit"
+  , RingProp1 prop_add_left_inv                "add left inv"
+  , RingProp1 prop_add_right_inv               "add right inv"
+  , RingProp2 prop_add_commutative             "add comm"
+  , RingProp3 prop_add_associative             "add assoc"
+  , RingProp2 prop_sub_def                     "sub def"
+  , RingProp3 prop_add_sub_associative_1       "add-sub assoc /1"
+  , RingProp3 prop_add_sub_associative_2       "add-sub assoc /2"
+  , RingProp3 prop_add_sub_associative_3       "add-sub assoc /3"
+  , RingProp1 prop_is_zero                     "is zero"
+  , RingProp1 prop_is_one                      "is one"
+  , RingProp1 prop_is_equal                    "is equal"
+  , RingProp1 prop_mul_left_unit               "mul left unit"
+  , RingProp1 prop_mul_right_unit              "mul right unit"
+  , RingProp2 prop_mul_commutative             "mul comm"
+  , RingProp3 prop_mul_associative             "mul assoc"
+  , RingProp1 prop_square_def                  "square def"
+  , RingProp2 prop_square_distrib              "square distributive"
+  , RingProp3 prop_add_mul_left_distributive   "add+mul left distr"
+  , RingProp3 prop_add_mul_right_distributive  "add+mul right distr"
+  , RingProp3 prop_sub_mul_left_distributive   "sub+mul left distr"
+  , RingProp3 prop_sub_mul_right_distributive  "sub+mul right distr"
+  , RingProp1 prop_power_0                     "0-th power"
+  , RingProp1 prop_power_1                     "1-th power"
+  , RingProp1 prop_power_2                     "2-th power"
+  , RingProp1 prop_power_3                     "3-th power"
+  , RingProp1 prop_power_4                     "4-th power"
+  , RingProp1 prop_power_5                     "5-th power"
+  ]
+
+-- | The properties which only make sense in a field (they involve inversion
+-- or division), each paired with the label printed by the test runner.
+fieldOnlyProps :: [FieldProp]
+fieldOnlyProps = 
+  [ FieldProp1 prop_mul_left_inv               "mul left inv"
+  , FieldProp1 prop_mul_right_inv              "mul right inv"
+  , FieldProp2 prop_div_def                    "div def"
+  , FieldProp1 prop_inv_def                    "inv def"
+  , FieldProp2 prop_div_test                   "div defining prop."
+  , FieldProp1 prop_inv_fermat                 "inv == fermat"
+  , FieldProp1 prop_fermat_1                   "fermat/1"
+  , FieldProp1 prop_fermat_2                   "fermat/2"
+  , FieldPropE prop_power_vs_power_            "power vs. power_"
+  , FieldProp1 prop_power_neg                  "negative power"
+  , FieldProp3 prop_mul_div_associative_1      "mul-div assoc /1"
+  , FieldProp3 prop_mul_div_associative_2      "mul-div assoc /2"
+  , FieldProp3 prop_mul_div_associative_3      "mul-div assoc /3"
+--  , FieldProp3 prop_batch_inverse              "batch inverse"
+--  , FieldProp1 prop_frobenius                  "frobenius == frobeniusNaive"
+  ]
+
+--------------------------------------------------------------------------------
+-- * Ring properties
+
+prop_add_left_unit :: Ring a => a -> Bool
+prop_add_left_unit x = zero + x == x
+
+prop_add_right_unit :: Ring a => a -> Bool
+prop_add_right_unit x = x + zero == x
+
+prop_add_left_inv :: Ring a => a -> Bool
+prop_add_left_inv x = (negate x) + x == zero
+
+prop_add_right_inv :: Ring a => a -> Bool
+prop_add_right_inv x = x + (negate x) == zero
+
+prop_add_commutative :: Ring a => a -> a -> Bool
+prop_add_commutative x y = (x + y == y + x)
+
+prop_add_associative :: Ring a => a -> a -> a -> Bool
+prop_add_associative x y z = ((x + y) + z) == (x + (y + z))
+
+prop_sub_def :: Ring a => a -> a -> Bool
+prop_sub_def x y = (x + (negate y) == x - y)
+
+prop_add_sub_associative_1 :: Ring a => a -> a -> a -> Bool
+prop_add_sub_associative_1 x y z = ((x + y) - z) == (x + (y - z))
+
+prop_add_sub_associative_2 :: Ring a => a -> a -> a -> Bool
+prop_add_sub_associative_2 x y z = ((x - y) + z) == (x - (y - z))
+
+prop_add_sub_associative_3 :: Ring a => a -> a -> a -> Bool
+prop_add_sub_associative_3 x y z = ((x - y) - z) == (x - (y + z))
+
+----------------------------------------
+
+prop_is_zero :: forall a. Ring a => a -> Bool
+prop_is_zero x = isZero (zero @a) && isZero x == (x == 0)
+
+prop_is_one :: forall a. Ring a => a -> Bool
+prop_is_one x = isOne (one @a) && isOne x == (x == 1)
+
+prop_is_equal :: forall a. Ring a => a -> Bool
+prop_is_equal x = and
+  [ zero  == zero @a
+  , zero  /= one  @a
+  , one   /= zero @a
+  , one   == one  @a
+  , x     == x 
+  , (x+1) /= x    
+  , x     /= (x+1)
+  ]
+
+----------------------------------------
+
+prop_mul_left_unit :: Ring a => a -> Bool
+prop_mul_left_unit x = (one * x == x)
+
+prop_mul_right_unit :: Ring a => a -> Bool
+prop_mul_right_unit x = (x * one == x)
+
+prop_mul_commutative :: Ring a => a -> a -> Bool
+prop_mul_commutative x y = (x * y == y * x)
+
+prop_mul_associative :: Ring a => a -> a -> a -> Bool
+prop_mul_associative x y z = ((x * y) * z) == (x * (y * z))
+
+prop_square_def :: Ring a => a -> Bool
+prop_square_def x = (square x == x*x)
+
+prop_square_distrib :: Ring a => a -> a -> Bool
+prop_square_distrib x y =  (square (x+y) == square x + 2*x*y + square y)
+                        && (square (x-y) == square x - 2*x*y + square y)
+
+----------------------------------------
+
+prop_add_mul_left_distributive :: Ring a => a -> a -> a -> Bool
+prop_add_mul_left_distributive x y z = (x + y) * z ==  x*z + y*z
+
+prop_add_mul_right_distributive :: Ring a => a -> a -> a -> Bool
+prop_add_mul_right_distributive x y z = x * (y + z) ==  x*y + x*z
+
+prop_sub_mul_left_distributive :: Ring a => a -> a -> a -> Bool
+prop_sub_mul_left_distributive x y z = (x - y) * z ==  x*z - y*z
+
+prop_sub_mul_right_distributive :: Ring a => a -> a -> a -> Bool
+prop_sub_mul_right_distributive x y z = x * (y - z) ==  x*y - x*z
+
+--------------------------------------------------------------------------------
+
+-- | The zeroth power is always one, including @0^0@ (empty product). This is
+-- what @pow@ in "Field.Goldilocks.Fast" computes: it returns 1 for a zero
+-- exponent whatever the base is.
+prop_power_0 :: Ring a => a -> Bool
+prop_power_0 x = power x 0 == one
+
+prop_power_1 :: Ring a => a -> Bool
+prop_power_1 x = power x 1 == x
+
+prop_power_2 :: Ring a => a -> Bool
+prop_power_2 x = power x 2 == x *x
+
+prop_power_3 :: Ring a => a -> Bool
+prop_power_3 x = power x 3 == x*x*x
+
+prop_power_4 :: Ring a => a -> Bool
+prop_power_4 x = power x 4 == x*x*x*x
+
+prop_power_5 :: Ring a => a -> Bool
+prop_power_5 x = power x 5 == x*x*x*x*x
+
+--------------------------------------------------------------------------------
+-- * Field properties
+
+prop_mul_left_inv :: Field a => a -> Bool
+prop_mul_left_inv x = isZero x || (inverse x) * x == one
+
+prop_mul_right_inv :: Field a => a -> Bool
+prop_mul_right_inv x = isZero x || x * (inverse x) == one 
+
+prop_div_def :: Field a => a -> a -> Bool
+prop_div_def x y = (x * (inverse y) == x / y)
+
+prop_inv_def :: Field a => a -> Bool
+prop_inv_def x = (inverse x == 1 / x)
+
+prop_div_test :: Field a => a -> a -> Bool
+prop_div_test x y = isZero y || (x/y)*y == x
+
+prop_inv_fermat :: forall a. Field a => a -> Bool
+prop_inv_fermat x = (inverse x) == power x (p - 2) where p = fieldSize (Proxy @a)
+
+prop_fermat_1 :: forall a. Field a => a -> Bool
+prop_fermat_1 x = power x p == x where p = fieldSize (Proxy @a)
+
+prop_fermat_2 :: forall a. Field a => a -> Bool
+prop_fermat_2 x = power x (p - 1) == one where p = fieldSize (Proxy @a)
+
+prop_power_vs_power_ :: forall a. Field a => a -> Int -> Bool
+prop_power_vs_power_ x e = power x (fromIntegral e) == power_ x e
+
+prop_power_neg :: forall a. Field a => a -> Bool
+prop_power_neg x = power x (-1) == inverse x
+
+prop_mul_div_associative_1 :: Field a => a -> a -> a -> Bool
+prop_mul_div_associative_1 x y z = ((x * y) / z) == (x * (y / z))
+
+prop_mul_div_associative_2 :: Field a => a -> a -> a -> Bool
+prop_mul_div_associative_2 x y z = ((x / y) * z) == (x / (y / z))
+
+prop_mul_div_associative_3 :: Field a => a -> a -> a -> Bool
+prop_mul_div_associative_3 x y z = ((x / y) / z) == (x / (y * z))
+
+-- prop_batch_inverse :: Field a => a -> a -> a -> Bool
+-- prop_batch_inverse x y z = any (==0) as || (map recip as == bs) where
+--   as = [ x,y,z, x+y, y+z, z+x, x+y+z ]
+--   bs = batchInverse as
+
+-- prop_frobenius :: Field a => a -> Bool
+-- prop_frobenius x = (frobenius x == frobeniusNaive x)
+
+--------------------------------------------------------------------------------
--- a/reference/src/Field/Tests.hs
+++ b/reference/src/Field/Tests.hs
@ -0,0 +1,47 @@
+
+module Field.Tests where
+
+--------------------------------------------------------------------------------
+
+import Control.Monad
+
+import Data.Proxy
+import Data.IORef
+
+import Field.Class
+import Field.Properties
+
+import Field.Goldilocks           ( F    )
+import Field.Goldilocks.Extension ( FExt )
+
+--------------------------------------------------------------------------------
+
+-- | Number of random samples per property.
+nn :: Int
+nn = 1000
+
+-- | Run both test suites (always both, in this order) and report whether
+-- everything passed.
+runMyFieldTests :: IO Bool
+runMyFieldTests = (&&) <$> runGoldilocksTests <*> runGoldilocksExtensionTests
+
+--------------------------------------------------------------------------------
+
+-- | Print a banner, run all field tests for the base field, and return the
+-- final state of the success flag.
+runGoldilocksTests :: IO Bool
+runGoldilocksTests = do
+  mapM_ putStrLn
+    [ "\nTests for the Goldilocks field:"
+    , "==============================="
+    ]
+  okflag <- newIORef True
+  runFieldTests okflag nn (Proxy @F)
+  readIORef okflag
+
+-- | Print a banner, run all field tests for the quadratic extension, and
+-- return the final state of the success flag.
+runGoldilocksExtensionTests :: IO Bool
+runGoldilocksExtensionTests = do
+  mapM_ putStrLn
+    [ "\nTests for the Goldilocks quadratic extension field:"
+    , "==================================================="
+    ]
+  okflag <- newIORef True
+  runFieldTests okflag nn (Proxy @FExt)
+  readIORef okflag
+
+--------------------------------------------------------------------------------
+
+
--- a/reference/src/Outsource/Types.hs
+++ b/reference/src/Outsource/Types.hs
@ -0,0 +1,44 @@
+
+{-# LANGUAGE StrictData, RecordWildCards #-}
+module Outsource.Types where
+
+--------------------------------------------------------------------------------
+
+import FRI.Types
+
+--------------------------------------------------------------------------------
+
+-- | The type parameter is only there, because in the proof, we don't want to
+-- repeat the FRI configuration (which is already included in the FRI proof). 
+--
+-- It's a bit ugly, but hey this is just a prototype anyway!
+--
+data OutsourceConfig' friconfig = MkOutsourceConfig
+  { outsrcFriConfig   :: friconfig     -- ^ the FRI protocol configuration
+  , outsrcKeepParity  :: Log2          -- ^ how much parity data to keep: Original data size times @2^(-k)@
+  }
+  deriving (Eq,Show)
+
+type OutsourceConfigFull = OutsourceConfig' FriConfig
+type OutsourceConfig_    = OutsourceConfig' ()
+
+-- | Row size, i.e. the number of columns of the data matrix, as recorded in
+-- the embedded FRI configuration.
+outSrcNColumns :: OutsourceConfigFull -> Int
+outSrcNColumns cfg = friNColumns (outsrcFriConfig cfg)
+
+--------------------------------------------------------------------------------
+
+-- | Proof that the outsourcing of Reed-Solomon is done correctly.
+--
+-- This is checked against the original data Merkle root and RS-encoded Merkle root
+data OutsourceProof = MkOutsourceProof
+  { outsrcConfig     :: OutsourceConfig' ()      -- ^ we don't want to repeat the FRI configuration...
+  , outsrcFriProof   :: FriProof                 -- ^ ...which is already included in the FRI proof
+  , outsrcConnection :: ConnectionProof          -- ^ connect the original data to the parity data
+  }
+  deriving (Eq,Show)
+
+-- | Placeholder for the proof connecting the original data to the parity data.
+--
+-- The 'Eq' and 'Show' instances are needed by the derived instances of
+-- 'OutsourceProof', which has a field of this type.
+data ConnectionProof = MkConnectionProof
+  -- TODO
+  deriving (Eq,Show)
+
+--------------------------------------------------------------------------------
--- a/reference/src/cbits/compile.sh
+++ b/reference/src/cbits/compile.sh
@ -0,0 +1,4 @@
+#!/bin/bash
+
+gcc -c -O2 goldilocks.c
+gcc -c -O2 monolith.c
--- a/reference/src/cbits/goldilocks.c
+++ b/reference/src/cbits/goldilocks.c
@ -0,0 +1,250 @@
+
+#include <stdint.h>
+#include <stdio.h>      // for testing only
+#include <assert.h>
+
+#include "goldilocks.h"
+
+//------------------------------------------------------------------------------
+
+#define GOLDILOCKS_HALFPRIME_PLUS1 0x7fffffff80000001
+
+//------------------------------------------------------------------------------
+// *** Goldilocks field ***
+
+// Returns 1 when `x` is strictly below the prime, 0 otherwise.
+int goldilocks_isvalid(uint64_t x) {
+  if (x >= GOLDILOCKS_PRIME) { return 0; }
+  return 1;
+}
+
+// Additive inverse: 0 maps to 0, anything else to `p - x`.
+uint64_t goldilocks_neg(uint64_t x) {
+  if (x == 0) { return 0; }
+  return GOLDILOCKS_PRIME - x;
+}
+
+// Modular addition: subtract the prime once if the 64-bit sum wrapped around
+// or landed at or above the prime.
+uint64_t goldilocks_add(uint64_t x, uint64_t y) {
+  uint64_t sum = x + y;
+  int wrapped  = (sum < x);
+  if (wrapped || (sum >= GOLDILOCKS_PRIME)) { sum -= GOLDILOCKS_PRIME; }
+  return sum;
+}
+
+// Addition that only corrects a 64-bit wrap-around; the result is not reduced
+// further, so it may still be at or above the prime.
+uint64_t goldilocks_add_to_uint64(uint64_t x, uint64_t y) {
+  uint64_t sum = x + y;
+  if (sum < x) { sum -= GOLDILOCKS_PRIME; }
+  return sum;
+}
+
+// Modular subtraction: when the difference borrows (x < y), add the prime back.
+uint64_t goldilocks_sub(uint64_t x, uint64_t y) {
+  uint64_t diff = x - y;
+  if (x < y) { diff += GOLDILOCKS_PRIME; }
+  return diff;
+}
+
+// Subtraction expressed as addition of the negated second operand.
+uint64_t goldilocks_sub_safe(uint64_t x, uint64_t y) {
+  uint64_t minus_y = goldilocks_neg(y);
+  return goldilocks_add(x, minus_y);
+}
+
+//--------------------------------------
+
+// Reduce a 128-bit value modulo the prime.
+//
+// Write x = lo + 2^64 * mid32 + 2^96 * top32. Modulo p = 2^64 - 2^32 + 1 we have
+// 2^64 == 2^32 - 1 and 2^96 == -1, so x == lo + (2^32 - 1) * mid32 - top32.
+uint64_t goldilocks_rdc(__uint128_t x) {
+  const uint64_t lo    = (uint64_t) x;
+  const uint64_t hi    = (uint64_t)(x >> 64);
+  const uint64_t mid32 = hi & 0xffffffff;
+  const uint64_t top32 = hi >> 32;
+
+  // add (2^32 - 1) * mid32, correcting a 64-bit wrap-around
+  uint64_t acc = lo + ((mid32 << 32) - mid32);
+  if (acc < lo) { acc -= GOLDILOCKS_PRIME; }
+
+  // subtract top32, correcting a borrow
+  const int borrow = (acc < top32);
+  acc -= top32;
+  if (borrow) { acc += GOLDILOCKS_PRIME; }
+
+  // final conditional subtraction
+  if (acc >= GOLDILOCKS_PRIME) { acc -= GOLDILOCKS_PRIME; }
+  return acc;
+}
+
+// Same folding as the full reduction, but without the final conditional
+// subtraction: the result fits in 64 bits yet may still be >= `p`.
+uint64_t goldilocks_rdc_to_uint64(__uint128_t x) {
+  const uint64_t lo    = (uint64_t) x;
+  const uint64_t hi    = (uint64_t)(x >> 64);
+  const uint64_t mid32 = hi & 0xffffffff;      // bits 64..95
+  const uint64_t top32 = hi >> 32;             // bits 96..127
+
+  // add (2^32 - 1) * mid32, correcting a 64-bit wrap-around
+  uint64_t acc = lo + ((mid32 << 32) - mid32);
+  if (acc < lo) { acc -= GOLDILOCKS_PRIME; }
+
+  // subtract top32, correcting a borrow
+  const int borrow = (acc < top32);
+  acc -= top32;
+  if (borrow) { acc += GOLDILOCKS_PRIME; }
+
+  return acc;
+}
+
+// Reduction for inputs assumed to be below 2^96: only the low 64 bits and the
+// word above them are folded in.
+uint64_t goldilocks_rdc_small(__uint128_t x) {
+  const uint64_t lo = (uint64_t) x;
+  const uint64_t hi = (uint64_t)(x >> 64);
+
+  // add (2^32 - 1) * hi, correcting a 64-bit wrap-around
+  uint64_t acc = lo + ((hi << 32) - hi);
+  if (acc < lo) { acc -= GOLDILOCKS_PRIME; }
+
+  if (acc >= GOLDILOCKS_PRIME) { acc -= GOLDILOCKS_PRIME; }
+  return acc;
+}
+
+//--------------------------------------
+
+// Field multiplication: full 128-bit product followed by a full reduction.
+uint64_t goldilocks_mul(uint64_t x, uint64_t y) {
+  return goldilocks_rdc( (__uint128_t)x * y );
+}
+
+// Multiplication with the partial reduction only (result may be >= `p`).
+uint64_t goldilocks_mul_to_uint64(uint64_t x, uint64_t y) {
+  return goldilocks_rdc_to_uint64( (__uint128_t)x * y );
+}
+
+// Computes `x*y + z` in 128-bit arithmetic, then reduces.
+// NOTE(review): the 128-bit sum is not checked for overflow; presumably callers
+// keep `z` small enough — confirm.
+uint64_t goldilocks_mul_add128(uint64_t x, uint64_t y, __uint128_t z) {
+  const __uint128_t product = (__uint128_t)x * y;
+  return goldilocks_rdc( product + z );
+}
+
+// Field squaring: 128-bit square followed by a full reduction.
+uint64_t goldilocks_sqr(uint64_t x) {
+  return goldilocks_rdc( (__uint128_t)x * x );
+}
+
+// Computes `x*x + y` in 128-bit arithmetic, then reduces fully.
+uint64_t goldilocks_sqr_add(uint64_t x, uint64_t y) {
+  const __uint128_t square = (__uint128_t)x * x;
+  return goldilocks_rdc( square + y );
+}
+
+// Computes `x*x + y` with the partial reduction only: the result fits in
+// 64 bits but is not necessarily in [0..p-1].
+uint64_t goldilocks_sqr_add_to_uint64(uint64_t x, uint64_t y) {
+  const __uint128_t square = (__uint128_t)x * x;
+  return goldilocks_rdc_to_uint64( square + y );
+}
+
+// Multiplication by a 32-bit factor; the product is below 2^96, so the cheaper
+// small reduction applies.
+uint64_t goldilocks_mul_small(uint64_t x, uint32_t y) {
+  return goldilocks_rdc_small( (__uint128_t)x * y );
+}
+
+//------------------------------------------------------------------------------
+
+// Binary extended Euclidean algorithm, used for division modulo the prime.
+//
+// When called as `goldilocks_euclid(a, 0, b, p)` (see `goldilocks_div`), the
+// loop keeps  x*b == a*u  and  y*b == a*v  (mod p):
+//  - halving an even `u` (resp. `v`) is mirrored by halving `x` (resp. `y`) mod p,
+//  - subtracting `u` and `v` from each other is mirrored by the same modular
+//    subtraction on `x` and `y`.
+// Hence once `u` (or `v`) has been reduced to 1, `x` (or `y`) holds a/b.
+//
+// NOTE(review): with `u0 == 0` the first inner loop never terminates (zero stays
+// even forever), so this function must not be called with a zero `u0`.
+// NOTE(review): `x0`, `y0` are presumably expected to be below the prime — confirm.
+uint64_t goldilocks_euclid(uint64_t x0, uint64_t y0, uint64_t u0, uint64_t v0) {
+
+  uint64_t x = x0;
+  uint64_t y = y0;
+  uint64_t u = u0;
+  uint64_t v = v0;
+
+  while( ( (u!=1) && (v!=1) ) ) {
+
+    // make `u` odd; each halving of `u` halves `x` modulo p:
+    // for odd x, (x + p)/2 == (x >> 1) + (p + 1)/2
+    while (!(u & 1ull)) {
+      u = u >> 1;
+      int odd = x & 1ull;
+      x = x >> 1;
+      if (odd) { x += GOLDILOCKS_HALFPRIME_PLUS1; }
+    }
+
+    // same for the pair (v, y)
+    while (!(v & 1ull)) {
+      v = v >> 1;
+      int odd = y & 1ull;
+      y = y >> 1;
+      if (odd) { y += GOLDILOCKS_HALFPRIME_PLUS1; }
+    }
+
+    // subtract the smaller of (u, v) from the larger, and mirror it on (x, y)
+    if (u < v) {
+      // u-v < 0, that is, u < v
+      v = v - u;
+      y = goldilocks_sub(y , x);
+    }
+    else {
+      // u-v >= 0, that is, u >= v
+      u = u - v;
+      x = goldilocks_sub(x , y);
+    }
+  
+  }
+
+  // whichever of u, v reached 1 selects the result
+  if (u == 1) { 
+    return x;
+  } 
+  else { 
+    return y;
+  }
+}
+
+// Field division `a / b` modulo the Goldilocks prime.
+//
+// Division by zero is defined to return 0 (the same value as the Fermat-style
+// inverse 0^(p-2)). Without this guard the Euclidean loop would never
+// terminate, because a zero divisor can never be reduced to 1.
+uint64_t goldilocks_div(uint64_t a, uint64_t b) {
+  if (b == 0) { return 0; }
+  return goldilocks_euclid(a,0,b,GOLDILOCKS_PRIME);
+}
+
+// Multiplicative inverse, computed as the field division `1 / a`.
+uint64_t goldilocks_inv(uint64_t a) {
+  return goldilocks_div(1, a);
+}
+
+//------------------------------------------------------------------------------
+
+// Exponentiation `base^expo` by square-and-multiply.
+//
+// A zero exponent yields 1 for every base. A negative exponent is handled by
+// inverting the base (via `goldilocks_inv`) and using the magnitude of the
+// exponent. The magnitude is computed in unsigned arithmetic, so `INT_MIN` is
+// handled too (negating it as a signed `int` would overflow).
+uint64_t goldilocks_pow(uint64_t base, int expo) {
+  if (expo == 0) { return 1; }
+
+  uint64_t     sq = base;
+  unsigned int e  = (unsigned int)expo;
+  if (expo < 0) {
+    sq = goldilocks_inv(base);
+    e  = 0u - e;                 // |expo|, well-defined modulo 2^N
+  }
+
+  uint64_t acc = 1;
+  while (e != 0) {
+    if ((e & 1u) != 0) {
+      acc = goldilocks_mul( acc, sq );
+    }
+    e = e >> 1;
+    if (e != 0) {                // skip the squaring nobody will consume
+      sq = goldilocks_mul( sq , sq );
+    }
+  }
+
+  return acc;
+}
+
+//==============================================================================
+// *** debugging ***
+
+// Debug helper: prints a separator, the message, and then each of the `n`
+// words of `state` in hexadecimal and decimal.
+void debug_print_state(const char *msg, int n, uint64_t *state) {
+  printf("-----------------\n");
+  printf("%s\n",msg);
+  for(int i=0;i<n;i++) {
+    // `uint64_t` need not be `unsigned long long` (it is `unsigned long` on LP64
+    // targets); cast so the argument matches the `%llx` / `%llu` conversions.
+    const unsigned long long v = (unsigned long long) state[i];
+    printf(" - 0x%016llx = %llu\n",v,v);
+  }
+}
+
+//------------------------------------------------------------------------------
+
+#define MASK 0x3fffffffffffffffULL
+
+// Reads the 8 bytes starting at `p` as a little-endian 64-bit word.
+// Assembling the word byte by byte avoids dereferencing a misaligned
+// `uint64_t*` and gives the same result on any host byte order.
+static inline uint64_t goldilocks_load_le64(const uint8_t *p) {
+  uint64_t w = 0;
+  for (int i = 7; i >= 0; i--) { w = (w << 8) | (uint64_t)p[i]; }
+  return w;
+}
+
+// Splits 31 bytes (248 bits, read as a little-endian bit string) into four
+// consecutive 62-bit chunks, one per output word. Every output is below 2^62
+// and therefore below the prime.
+void goldilocks_convert_31_bytes_to_4_field_elements(const uint8_t *ptr, uint64_t *felts) {
+  const uint64_t q0  = goldilocks_load_le64(ptr   );
+  const uint64_t q7  = goldilocks_load_le64(ptr+ 7);
+  const uint64_t q15 = goldilocks_load_le64(ptr+15);
+  const uint64_t q23 = goldilocks_load_le64(ptr+23);
+
+  felts[0] =   q0 & MASK;                                            // bits   0..61
+  felts[1] = ( q7  >> 6) | ((uint64_t)(ptr[15] & 0x0f) << 58);       // bits  62..123
+  felts[2] = ( q15 >> 4) | ((uint64_t)(ptr[23] & 0x03) << 60);       // bits 124..185
+  felts[3] = ( q23 >> 2);                                            // bits 186..247
+}
+
+// Converts `rate/4` consecutive 31-byte chunks into 4 field elements each.
+// Only the rates 4 and 8 are supported; any other rate trips the assertion.
+void goldilocks_convert_bytes_to_field_elements(int rate, const uint8_t *ptr, uint64_t *felts) {
+  if (rate == 4) {
+    goldilocks_convert_31_bytes_to_4_field_elements(ptr, felts);
+  }
+  else if (rate == 8) {
+    goldilocks_convert_31_bytes_to_4_field_elements(ptr   , felts  );
+    goldilocks_convert_31_bytes_to_4_field_elements(ptr+31, felts+4);
+  }
+  else {
+    assert( 0 );
+  }
+}
+
+//------------------------------------------------------------------------------
--- a/reference/src/cbits/goldilocks.h
+++ b/reference/src/cbits/goldilocks.h
@ -0,0 +1,39 @@
+
+// Public interface of the C implementation of the Goldilocks prime field
+// (p = 2^64 - 2^32 + 1).
+
+#ifndef GOLDILOCKS_H
+#define GOLDILOCKS_H
+
+#include <stdint.h>
+
+//------------------------------------------------------------------------------
+
+#define GOLDILOCKS_PRIME 0xffffffff00000001
+
+//------------------------------------------------------------------------------
+// basic field operations
+
+int goldilocks_isvalid(uint64_t x);
+
+uint64_t goldilocks_neg(uint64_t x);
+uint64_t goldilocks_add(uint64_t x, uint64_t y);
+uint64_t goldilocks_sub(uint64_t x, uint64_t y);
+uint64_t goldilocks_sqr(uint64_t x);
+uint64_t goldilocks_mul(uint64_t x, uint64_t y);
+uint64_t goldilocks_mul_small(uint64_t x, uint32_t y);
+uint64_t goldilocks_inv(uint64_t a);
+uint64_t goldilocks_div(uint64_t a, uint64_t b);
+uint64_t goldilocks_pow(uint64_t b, int e);
+
+//------------------------------------------------------------------------------
+// reductions of 128-bit values, and fused multiply / square variants
+
+uint64_t goldilocks_rdc          (__uint128_t x);
+uint64_t goldilocks_rdc_to_uint64(__uint128_t x);
+uint64_t goldilocks_rdc_small    (__uint128_t x);
+
+uint64_t goldilocks_mul_to_uint64    (uint64_t x, uint64_t y);
+uint64_t goldilocks_mul_add128       (uint64_t x, uint64_t y, __uint128_t z);
+uint64_t goldilocks_sqr_add          (uint64_t x, uint64_t y);
+uint64_t goldilocks_sqr_add_to_uint64(uint64_t x, uint64_t y);
+
+//------------------------------------------------------------------------------
+// byte string -> field element conversion
+
+void goldilocks_convert_31_bytes_to_4_field_elements (      const uint8_t *ptr, uint64_t *felts );
+void goldilocks_convert_bytes_to_field_elements ( int rate, const uint8_t *ptr, uint64_t *felts );
+
+//------------------------------------------------------------------------------
+
+#endif // GOLDILOCKS_H
--- a/reference/src/cbits/goldilocks.o
+++ b/reference/src/cbits/goldilocks.o
--- a/reference/src/cbits/monolith.c
+++ b/reference/src/cbits/monolith.c
@ -0,0 +1,238 @@
+
+#include <assert.h>
+
+#include "goldilocks.h"
+#include "monolith.h"
+
+//==============================================================================
+// *** Monolith hash ***
+//
+// compatible with <https://extgit.iaik.tugraz.at/krypto/zkfriendlyhashzoo>
+//
+
+/* 
+monolith test vector (permutation of [0..11]) 
+---------------------------------------------
+from <https://extgit.iaik.tugraz.at/krypto/zkfriendlyhashzoo/-/blob/master/plain_impls/src/monolith_64/monolith_64.rs?ref_type=heads#L653>
+
+0x516dd661e959f541 = 5867581605548782913
+0x082c137169707901 = 588867029099903233
+0x53dff3fd9f0a5beb = 6043817495575026667
+0x0b2ebaa261590650 = 805786589926590032
+0x89aadb57e2969cb6 = 9919982299747097782
+0x5d3d6905970259bd = 6718641691835914685
+0x6e5ac1a4c0cfa0fe = 7951881005429661950
+0xd674b7736abfc5ce = 15453177927755089358
+0x0d8697e1cd9a235f = 974633365445157727
+0x85fc4017c247136e = 9654662171963364206
+0x572bafd76e511424 = 6281307445101925412
+0xbec1638e28eae57f = 13745376999934453119
+
+*/
+
+//--------------------------------------
+// ** sbox layer
+
+// one "bar" of the sbox layer, applied to a single 64-bit word;
+// based on the reference implementation from <https://extgit.iaik.tugraz.at/krypto/zkfriendlyhashzoo>
+uint64_t goldilocks_monolith_single_bar(uint64_t x) {
+  // all rotations below act on each of the 8 bytes of the word independently
+  const uint64_t top_bits = 0x8080808080808080;    // highest bit of every byte
+  const uint64_t low_bits = 0x7F7F7F7F7F7F7F7F;    // lower 7 bits of every byte
+
+  // rot[k] = x with every byte rotated left by k+1 bits (obtained by repeated 1-bit rotations)
+  uint64_t rot[3];
+  uint64_t cur = x;
+  for(int k=0; k<3; k++) {
+    cur    = ((cur & top_bits) >> 7) | ((cur & low_bits) << 1);
+    rot[k] = cur;
+  }
+
+  // combine, then rotate every byte left by one more bit
+  uint64_t mixed = x ^ ( (~rot[0]) & rot[1] & rot[2] );
+  return ((mixed & top_bits) >> 7) | ((mixed & low_bits) << 1);
+}
+
+// the sbox-layer: the bar is applied to state[0..3] only, the other state words are not touched
+void goldilocks_monolith_bars(uint64_t *state) {
+  for(uint64_t *p = state; p != state + 4; p++) { *p = goldilocks_monolith_single_bar(*p); }
+}
+
+//--------------------------------------
+// ** nonlinear layer
+
+// the nonlinear layer:  state[i] <- goldilocks_sqr_add_to_uint64( state[i-1] , state[i] )
+// for i = 11 down to 1 (descending, so that each call still sees the old state[i-1])
+//
+// remark: the next layer is always the linear diffusion, which splits its inputs into
+// two 32 bit words anyway; so reducing to 64 bits (and not to [0..p-1]) is enough here
+void goldilocks_monolith_bricks(uint64_t *state) {
+  for(uint64_t *cur = state + 11; cur != state; cur--) *cur = goldilocks_sqr_add_to_uint64( cur[-1] , *cur );
+}
+
+//--------------------------------------
+// ** fast diffusion layer
+
+#include "monolith_conv_uint64.inc"
+
+// linear diffusion layer: every state word is split into its low and high 32 bit halves,
+// and the two vectors of halves are convolved (circularly) separately. This is safe because
+// there is no overflow in the 64 bit words, and it needs only machine word operations, no modulo;
+// the halves are recombined and reduced only once, at the very end
+void goldilocks_monolith_concrete(uint64_t *state) {
+  uint64_t lo[12], hi[12];
+  // split:  state[i] = lo[i] + 2^32 * hi[i]
+  for(int i=0; i<12; i++) {
+    lo[i] = state[i] & 0xffffffff;
+    hi[i] = state[i] >> 32;
+  }
+
+  // in-place convolution of both vectors of halves
+  uint64_circular_conv_12_with( lo , lo );
+  uint64_circular_conv_12_with( hi , hi );
+
+  // recombine in 128 bits (the convolved halves can be longer than 32 bits), then reduce
+  for(int i=0; i<12; i++) {
+    __uint128_t wide = (__uint128_t)hi[i];
+    state[i] = goldilocks_rdc_small( (wide << 32) + lo[i] );
+  }
+}
+
+void goldilocks_monolith_concrete_rc(uint64_t *state, const uint64_t *rc) {
+  // same as goldilocks_monolith_concrete, but rc[i] is added to element i before the final reduction
+  uint64_t lo[12], hi[12];
+  // split:  state[i] = lo[i] + 2^32 * hi[i]
+  for(int i=0; i<12; i++) {
+    lo[i] = state[i] & 0xffffffff;
+    hi[i] = state[i] >> 32;
+  }
+
+  uint64_circular_conv_12_with( lo , lo );
+  uint64_circular_conv_12_with( hi , hi );
+
+  for(int i=0; i<12; i++) {
+    __uint128_t wide = (__uint128_t)hi[i];
+    wide = (wide << 32) + lo[i] + rc[i];          // the whole sum is computed in 128 bits
+    state[i] = goldilocks_rdc_small( wide );
+  }
+}
+
+//--------------------------------------
+// ** rounds
+
+#include "monolith_constants.inc"
+
+void goldilocks_monolith_round(int round_idx, uint64_t *state) {    // one round, in place: bars, bricks, then concrete with round constants
+  goldilocks_monolith_bars       (state);
+  goldilocks_monolith_bricks     (state);
+  goldilocks_monolith_concrete_rc(state , &(monolith_t12_round_constants[round_idx][0]) );    // round_idx selects a row of the [5][12] table; it is not range-checked
+}
+
+void goldilocks_monolith_permutation(uint64_t *state) {     // permutes state[0..11] in place
+  // initial layer: a linear diffusion without round constants
+  goldilocks_monolith_concrete(state);
+  // five rounds with round constants (round r uses row r of monolith_t12_round_constants)
+  for(int r=0; r<5; r++) {
+    goldilocks_monolith_round(r, state);
+  }
+  // last round: the same three layers, but no round constants are added
+  goldilocks_monolith_bars    (state);
+  goldilocks_monolith_bricks  (state);
+  goldilocks_monolith_concrete(state);
+}
+
+//------------------------------------------------------------------------------
+
+// keyed compression function: x and y are two vectors of 4 field elements each,
+// and the output is a vector of 4 field elements, written to out
+void goldilocks_monolith_keyed_compress(const uint64_t *x, const uint64_t *y, uint64_t key, uint64_t *out) {
+  // initial state:  [ x[0..3] | y[0..3] | key, 0, 0, 0 ]
+  uint64_t state[12] =
+    { x[0] , x[1] , x[2] , x[3]
+    , y[0] , y[1] , y[2] , y[3]
+    , key  , 0    , 0    , 0
+    };
+
+  goldilocks_monolith_permutation(state);
+
+  // the result is the first 4 words of the permuted state
+  for(int j=0; j<4; j++) out[j] = state[j];
+}
+
+void goldilocks_monolith_compress(const uint64_t *x, const uint64_t *y, uint64_t *out) {     // unkeyed compression of two 4-element vectors
+  goldilocks_monolith_keyed_compress(x, y, 0, out);      // the keyed version, with key = 0
+}
+
+//------------------------------------------------------------------------------
+
+// hash a sequence of N >= 0 field elements into a digest of 4 field elements; `rate` (1..8) elements are absorbed per permutation
+void goldilocks_monolith_felts_digest(int rate, int N, const uint64_t *input, uint64_t *hash) {
+  // argument checks (asserts only, so they are compiled out under NDEBUG)
+  assert( (rate >= 1) && (rate <= 8) );
+  assert( N >= 0 );                      // a negative N can make `ofs` below negative, and then last[ofs] is out of bounds
+
+  uint64_t domsep = rate + 256*12 + 65536*63;     // domain separation value, stored into state[8]
+  uint64_t state[12] = { 0 };
+  state[8] = domsep;
+
+  int nchunks = (N + rate) / rate;       // 10* padding: there is always a (padded) last block, even if rate divides N
+  const uint64_t *ptr = input;
+  for(int k=0; k<nchunks-1; k++) {       // the full blocks: add them to the state, then permute
+    for(int j=0; j<rate; j++) { state[j] = goldilocks_add( state[j] , ptr[j] ); }
+    goldilocks_monolith_permutation( state );
+    ptr += rate;
+  }
+
+  int rem = nchunks*rate - N;       // 0 < rem <= rate
+  int ofs = rate - rem;             // number of input elements left over for the last block
+
+  // the last block, with padding: the leftover inputs, then a single 1, then zeros
+  uint64_t last[8];
+  for(int i=0    ; i<ofs ; i++) last[i] = ptr[i];
+  for(int i=ofs+1; i<rate; i++) last[i] = 0;
+  last[ofs] = 0x01;
+  for(int j=0; j<rate; j++) { state[j] = goldilocks_add( state[j] , last[j] ); }
+  goldilocks_monolith_permutation( state );
+
+  for(int j=0; j<4; j++) { hash[j] = state[j]; }     // the digest is the first 4 state words
+}
+
+//--------------------------------------
+
+void goldilocks_monolith_bytes_digest(int rate, int N, const uint8_t *input, uint64_t *hash) {
+  // hash N >= 0 bytes into a digest of 4 field elements; rate must be 4 or 8 (asserts vanish under NDEBUG)
+  assert( (rate == 4) || (rate == 8) );
+  assert( N >= 0 );                      // a negative N can make `ofs` below negative, and then last[ofs] is out of bounds
+
+  uint64_t domsep = rate + 256*12 + 65536*8;      // domain separation value, stored into state[8]
+  uint64_t state[12] = { 0 };
+  state[8] = domsep;
+
+  uint64_t felts[8];                     // the current block, converted to `rate` field elements
+
+  int rate_in_bytes  = 31 * (rate>>2);                   // 31 or 62
+  int nchunks = (N + rate_in_bytes) / rate_in_bytes;     // 10* padding: there is always a (padded) last block
+  const uint8_t *ptr = input;
+  for(int k=0; k<nchunks-1; k++) {                       // the full blocks: convert, add to the state, permute
+    goldilocks_convert_bytes_to_field_elements(rate, ptr, felts);
+    for(int j=0; j<rate; j++) { state[j] = goldilocks_add( state[j] , felts[j] ); }
+    goldilocks_monolith_permutation( state );
+    ptr += rate_in_bytes;
+  }
+
+  int rem = nchunks*rate_in_bytes - N;       // 0 < rem <= rate_in_bytes 
+  int ofs = rate_in_bytes - rem;             // number of input bytes left over for the last block
+  uint8_t last[62];
+
+  // last block, with padding: the leftover bytes, then a single 0x01 byte, then zero bytes
+  for(int i=0    ; i<ofs          ; i++) last[i] = ptr[i];
+  for(int i=ofs+1; i<rate_in_bytes; i++) last[i] = 0;
+  last[ofs] = 0x01;
+  goldilocks_convert_bytes_to_field_elements(rate, last, felts);
+  for(int j=0; j<rate; j++) { state[j] = goldilocks_add( state[j] ,felts[j] ); }
+  goldilocks_monolith_permutation( state );
+
+  for(int j=0; j<4; j++) { hash[j] = state[j]; }     // the digest is the first 4 state words
+}
+
+//------------------------------------------------------------------------------
--- a/reference/src/cbits/monolith.h
+++ b/reference/src/cbits/monolith.h
@ -0,0 +1,12 @@
+#ifndef MONOLITH_H
+#define MONOLITH_H
+
+#include <stdint.h>
+
+// Monolith permutation, compression and digests; states, inputs and outputs are arrays of uint64_t words
+void goldilocks_monolith_permutation   (uint64_t *state);
+void goldilocks_monolith_keyed_compress(const uint64_t *x, const uint64_t *y, uint64_t key, uint64_t *out);
+void goldilocks_monolith_compress      (const uint64_t *x, const uint64_t *y,               uint64_t *out);
+void goldilocks_monolith_bytes_digest  (int rate, int N, const uint8_t  *input, uint64_t *hash);
+void goldilocks_monolith_felts_digest  (int rate, int N, const uint64_t *input, uint64_t *hash);
+#endif // MONOLITH_H
--- a/reference/src/cbits/monolith.o
+++ b/reference/src/cbits/monolith.o
--- a/reference/src/cbits/monolith_constants.inc
+++ b/reference/src/cbits/monolith_constants.inc
@ -0,0 +1,71 @@
+
+#include <stdint.h>
+
+const uint64_t monolith_t12_round_constants[5][12] =     // [round_idx][state index]; row r is passed to goldilocks_monolith_concrete_rc in round r
+  { { 0xbcaf2516e5926dcf        // round_idx = 0
+    , 0x4ec5a76bce1e7676
+    , 0x9d804725bebb56ab
+    , 0x2ec05fca215a5be3
+    , 0xe16274e4acab86a0
+    , 0x80b0fddcc3c4380f
+    , 0xc87c769ad77ffece
+    , 0x37f85ec9117d287c
+    , 0x3b8d825b014c458d
+    , 0xb7a01d0cb850d75e
+    , 0x1333b751bac704bd
+    , 0x7b7ef14183d47b6f
+    }
+  , { 0x2114517643e3b286        // round_idx = 1
+    , 0x542d15ea3cd12ade
+    , 0xe847d363f17a93e9
+    , 0x24f0421c6ff41c56
+    , 0x66e3eda93e2ca216
+    , 0xfb88d475279cb568
+    , 0x7f421c6269938a22
+    , 0xdbb973acce857401
+    , 0xe172409cb1563a6a
+    , 0x996f729f6340447d
+    , 0x925c579738b6fa4a
+    , 0x752e9ec9e0b34686
+    }
+  , { 0xdb419e0bd38469bd        // round_idx = 2
+    , 0xba41cee828bd26d8
+    , 0xd6630f8f0969db39
+    , 0x2340e955ae2f0d94
+    , 0x282f553d35872e2e
+    , 0x77f7c3ff1ae496b3
+    , 0xf5f2efab64bc5eef
+    , 0x47b23a00830284f4
+    , 0xe18a2d2242486fa
+    , 0x3d101838a773dab0
+    , 0x47d686fd16856524
+    , 0x3eb2d254189b3534
+    }
+  , { 0xfe886e291ca8c5bd        // round_idx = 3
+    , 0xb97ec74df1e4b0b6
+    , 0x574fdef3a600e370
+    , 0x8ad61c6f132d4feb
+    , 0x41e69ca4ecc7e8c7
+    , 0x151ad562e1f90ca4
+    , 0x747c051439a5603c
+    , 0x990151d3e52d502c
+    , 0x532c7f258282ea12
+    , 0x65e62cb34275dd5
+    , 0x5288008954f5d0b2
+    , 0xee7c3407cf3d6e02
+    }
+  , { 0xda07029808bad5de        // round_idx = 4
+    , 0x7bebdf38dcc7a673
+    , 0x20a3f252688c312d
+    , 0x9c5248f7bbf8d188
+    , 0xcf1cf778994382d4
+    , 0x8c434b1738b8338c
+    , 0xfe504398813b67a8
+    , 0xe879562fdef813b9
+    , 0xd4666793b2a2f191
+    , 0xd9096b87de22de01
+    , 0xcaf4cea5f22abf34
+    , 0x3128d1e75d0204fa
+    }
+  };
+
--- a/reference/src/cbits/monolith_conv_uint64.inc
+++ b/reference/src/cbits/monolith_conv_uint64.inc
@ -0,0 +1,267 @@
+
+//
+// algorithms for circular convolution with the fixed vector [7,8,21,22,6,7,9,10,13,26,8,23], in uint64_t arithmetic
+// the idea is that we can split field elements into (lo + 2^32*hi)
+// apply the convolution separately (it won't overflow)
+// then combine and reduce
+//
+// based on the book:
+//
+// Nussbaumer: "Fast Fourier Transform and Convolution Algorithms"
+//
+
+/*
+
+our coefficient vector:
+
+  [7,8,21,22,6,7,9,10,13,26,8,23]
+
+in CRT rectangle format:
+
+  +----------+
+  |  7  6 13 |
+  |  26 8  7 | 
+  |  9  8 21 |
+  | 22 10 23 |
+  +----------+
+
+*/
+
+#include <stdint.h>
+
+//------------------------------------------------------------------------------
+
+// circular convolution of (x,y,z) = src[0..2] with  b2 = { 64 , 32 , 64 }, modulo 2^64:
+//   tgt[0] = 64*x + 64*y + 32*z
+//   tgt[1] = 32*x + 64*y + 64*z
+//   tgt[2] = 64*x + 32*y + 64*z
+void uint64_convolve_with_B2(uint64_t *src, uint64_t *tgt) {
+
+  // 32 times each of the inputs, via shifts
+  const uint64_t a = src[0] << 5;
+  const uint64_t b = src[1] << 5;
+  const uint64_t c = src[2] << 5;
+
+  // total = 64*(x+y+z); every output is this total
+  // minus 32 times the one input whose coefficient is 32
+  const uint64_t total = (a + b + c) << 1;
+
+  tgt[0] = total - c;
+  tgt[1] = total - a;
+  tgt[2] = total - b;
+}
+
+
+// circular convolution of (x,y,z) = src[0..2] with  b3 = { -32 , -4 , 4 }, modulo 2^64:
+//   tgt[0] = -32*x +  4*y -  4*z
+//   tgt[1] =  -4*x - 32*y +  4*z
+//   tgt[2] =   4*x -  4*y - 32*z
+void uint64_convolve_with_B3(uint64_t *src, uint64_t *tgt) {
+  uint64_t quad[3];      // quad[i] =  4 * src[i]
+  uint64_t big [3];      // big [i] = 32 * src[i]
+  for(int i=0; i<3; i++) {
+    quad[i] = src[i] << 2;
+    big [i] = src[i] << 5;
+  }
+
+  // unsigned arithmetic: the negative terms simply wrap around modulo 2^64
+  const uint64_t t0 = quad[1] - quad[2] - big[0];
+  const uint64_t t1 = quad[2] - quad[0] - big[1];
+  const uint64_t t2 = quad[0] - quad[1] - big[2];
+
+  tgt[0] = t0;
+  tgt[1] = t1;
+  tgt[2] = t2;
+}
+
+// circular convolution of (x,y,z) = src[0..2] with  b4 = { -6 , 0 , 8 }, modulo 2^64:
+//   tgt[0] = - 6*x + 8*y 
+//   tgt[1] =       - 6*y + 8*z
+//   tgt[2] =   8*x       - 6*z
+void uint64_convolve_with_B4(uint64_t *src, uint64_t *tgt) {
+  uint64_t times8[3];
+  uint64_t times6[3];
+
+  for(int i=0; i<3; i++) {
+    const uint64_t v = src[i];
+    times8[i] = v << 3;
+    times6[i] = (v << 3) - (v << 1);       // 6*v = 8*v - 2*v
+  }
+
+  // tgt[k] = 8*src[k+1] - 6*src[k]   (indices taken modulo 3)
+  const uint64_t t0 = times8[1] - times6[0];
+  const uint64_t t1 = times8[2] - times6[1];
+  const uint64_t t2 = times8[0] - times6[2];
+
+  tgt[0] = t0;  tgt[1] = t1;  tgt[2] = t2;
+}
+
+// circular convolution of (x,y,z) = src[0..2] with  b5 = { 2 , -4 , -24 }, modulo 2^64:
+//   tgt[0] =   2*x - 24*y -  4*z 
+//   tgt[1] =  -4*x +  2*y - 24*z
+//   tgt[2] = -24*x -  4*y +  2*z
+void uint64_convolve_with_B5(uint64_t *src, uint64_t *tgt) {
+  uint64_t dbl [3];     // dbl [i] =  2 * src[i]
+  uint64_t quad[3];     // quad[i] =  4 * src[i]
+  uint64_t big [3];     // big [i] = 24 * src[i]
+
+  for(int i=0; i<3; i++) {
+    dbl [i] = src[i] << 1;
+    quad[i] = src[i] << 2;
+    big [i] = quad[i] * 6;
+  }
+
+  // tgt[k] = 2*src[k] - 24*src[k+1] - 4*src[k+2]   (indices taken modulo 3);
+  // unsigned arithmetic, so the negative terms wrap around modulo 2^64
+  const uint64_t t0 = dbl[0] - big[1] - quad[2];
+  const uint64_t t1 = dbl[1] - big[2] - quad[0];
+  const uint64_t t2 = dbl[2] - big[0] - quad[1];
+
+  tgt[0] = t0;
+  tgt[1] = t1;
+  tgt[2] = t2;
+}
+
+// circular convolution of (x,y,z) = src[0..2] with  b6 = { -2 , -2 , -8 }, modulo 2^64:
+//   tgt[0] = - ( 2*x + 8*y + 2*z ) 
+//   tgt[1] = - ( 2*x + 2*y + 8*z )
+//   tgt[2] = - ( 8*x + 2*y + 2*z )
+void uint64_convolve_with_B6(uint64_t *src, uint64_t *tgt) {
+  const uint64_t x = src[0], y = src[1], z = src[2];
+
+  // 2*(x+y+z): the part shared by all three outputs
+  const uint64_t twice_sum = (x + y + z) << 1;
+
+  // the input with coefficient 8 contributes 6 times itself on top of that
+  const uint64_t extra_x = (x << 3) - (x << 1);
+  const uint64_t extra_y = (y << 3) - (y << 1);
+  const uint64_t extra_z = (z << 3) - (z << 1);
+
+  tgt[0] = - ( twice_sum + extra_y );
+  tgt[1] = - ( twice_sum + extra_z );
+  tgt[2] = - ( twice_sum + extra_x );
+}
+
+//------------------------------------------------------------------------------
+
+void uint64_naive_circular_conv( int n, uint64_t *input, uint64_t *coeffs, uint64_t *output ) {
+  // quadratic reference version:  output[k] = sum over j of  input[j] * coeffs[(k-j) mod n],  modulo 2^64
+  for(int k=0; k<n; k++) {
+    uint64_t sum = 0;
+    int      idx = k;                          // idx = (k-j) mod n, always kept inside [0,n)
+    for(int j=0; j<n; j++) { sum += input[j] * coeffs[idx]; idx = (idx == 0) ? (n-1) : (idx-1); }
+    output[k] = sum;
+  }
+}
+
+//------------------------------------------------------------------------------
+
+void uint64_add_vec3(uint64_t *xs, uint64_t *ys, uint64_t *zs) {
+  zs[0] = xs[0] + ys[0];  zs[1] = xs[1] + ys[1];  zs[2] = xs[2] + ys[2];     // zs = xs + ys, elementwise, modulo 2^64
+}
+
+void uint64_sub_vec3(uint64_t *xs, uint64_t *ys, uint64_t *zs) {
+  zs[0] = xs[0] - ys[0];  zs[1] = xs[1] - ys[1];  zs[2] = xs[2] - ys[2];     // zs = xs - ys, elementwise, modulo 2^64
+}
+
+//------------------------------------------------------------------------------
+
+// cyclic convolution of 12 terms via the Agarwal-Cooley algorithm
+// with the fixed vector [7,8,21,22,6,7,9,10,13,26,8,23]
+// (input and output may be the same array: the input is copied out before the output is written)
+void uint64_circular_conv_12_with( uint64_t *input , uint64_t *output ) {
+  // re-indexing: element k of the input goes to row k%4, column k%3 of a 4x3 rectangle
+  uint64_t input_rect[4][3];       // first index is the outer, second the inner
+
+  for(int k=0; k<12; k++) {
+    input_rect[k%4][k%3] = input [k];
+  }
+  // x0..x3 below point to the 4 rows (3 elements each) of the rectangle
+  uint64_t *input_ptr = (uint64_t*) input_rect;
+
+  uint64_t *x0 = input_ptr    ;
+  uint64_t *x1 = input_ptr + 3;
+  uint64_t *x2 = input_ptr + 6;
+  uint64_t *x3 = input_ptr + 9;
+  // pre-additions: sums and differences of the rows (all arithmetic is modulo 2^64)
+  uint64_t a0[3], a1[3], a2[3], a3[3], a4[3], a5[3], a6[3];
+  for(int j=0; j<3; j++)  {
+    a0[j] = x0[j] + x2[j]; 
+    a1[j] = x1[j] + x3[j]; 
+    a2[j] = a0[j] + a1[j]; 
+    a3[j] = a0[j] - a1[j]; 
+    a4[j] = x0[j] - x2[j]; 
+    a5[j] = x1[j] - x3[j]; 
+    a6[j] = a4[j] + a5[j]; 
+  }
+  // five length-3 circular convolutions with the fixed kernels b2..b6
+  uint64_t m0[3], m1[3], m2[3], m3[3], m4[3];
+  uint64_convolve_with_B2( a2 , m0 );            // uint64_naive_circular_conv( 3 , a2 , b2 , m0 );
+  uint64_convolve_with_B3( a3 , m1 );            // uint64_naive_circular_conv( 3 , a3 , b3 , m1 );
+  uint64_convolve_with_B4( a4 , m2 );            // uint64_naive_circular_conv( 3 , a4 , b4 , m2 );
+  uint64_convolve_with_B5( a5 , m3 );            // uint64_naive_circular_conv( 3 , a5 , b5 , m3 );
+  uint64_convolve_with_B6( a6 , m4 );            // uint64_naive_circular_conv( 3 , a6 , b6 , m4 );
+  // post-additions
+  uint64_t u0[3], u1[3], u2[3], u3[3];
+  uint64_add_vec3( m0 , m1 , u0 );
+  uint64_sub_vec3( m0 , m1 , u1 );
+  uint64_sub_vec3( m4 , m3 , u2 );
+  uint64_sub_vec3( m4 , m2 , u3 );
+  // recombine and divide by 4 (NOTE(review): assumes the sums are exact multiples of 4); the rows of input_rect are overwritten
+  for(int i=0; i<3; i++) {
+    x0[i] = ( u0[i] + 2*u2[i] ) >> 2;
+    x1[i] = ( u1[i] + 2*u3[i] ) >> 2;
+    x2[i] = ( u0[i] - 2*u2[i] ) >> 2;
+    x3[i] = ( u1[i] - 2*u3[i] ) >> 2;
+  }
+  // undo the re-indexing: read the results back out of the rectangle
+  for(int k=0; k<12; k++) {
+    output[k] = input_rect[k%4][k%3];
+  }
+}
+
+//------------------------------------------------------------------------------
+
+/*
+
+void uint64_test_short_conv_with() {
+  
+  printf("test short convolution algos for uint64\n");
+  
+  uint64_t input    [12];
+  uint64_t coeffs   [12] = {7,8,21,22,6,7,9,10,13,26,8,23};
+  uint64_t output   [12];
+  uint64_t reference[12];
+
+  // generate some "random-looking" numbers
+  uint64_t a=123459;
+  uint64_t b=789013;
+  for(int i=0;i<12;i++) {
+    uint64_t c = (a*b) ^ (a - 12345);
+    uint64_t d = (c*a) ^ (b + 67891);
+    input [i] = c & 0x0fffffff;             // WE WANT NO OVERFLOW!
+    a = b   + c       + 1;
+    b = 3*a - 5*c + d - 3;
+  }
+
+  for(int i=0; i<12; i++) {
+    printf("x[%d] = %016llx  ;  h[%d] = %016llx\n" , i, input[i], i, coeffs[i] );
+  }
+
+  // -----------[ length = 12 ]----------- 
+
+  printf("\n");
+  printf("length = 12\n");
+
+  uint64_naive_circular_conv   ( 12, input, coeffs, reference );
+  uint64_circular_conv_12_with (     input,         output    );
+
+  for(int i=0; i<12; i++) {
+    printf("out[%d] = %016llx  ;  ref[%d] = %016llx\n" , i, output[i], i, reference[i] );
+  }
+}
+
+*/
+
+//------------------------------------------------------------------------------
+