diff --git a/reference/README.md b/reference/README.md
index b3b4891..d33a6fa 100644
--- a/reference/README.md
+++ b/reference/README.md
@@ -16,18 +16,26 @@ We could significantly improve the speed of the Haskell implementation by bindin
 for some of the critical routines: Goldilocks field and extension, hashing, fast Fourier 
 transform.
 
+The switch between the simple but intentionally naive (and very slow) Haskell 
+implementation and the significantly faster C bindings is controlled by by the 
+C preprocessor flag `-DUSE_NAIVE_HASKELL` (so the faster one is the default).
+
 ### Implementation status
 
+- [ ] cabalize
 - [x] FRI prover
 - [x] FRI verifier
 - [x] proof serialization
 - [ ] serious testing of the FRI verifier
 - [ ] full outsourcing protocol
-- [ ] command line interface
 - [x] faster Goldilocks field operations via C FFI
-- [ ] quadratic field extension in C too
+- [x] quadratic field extension in C too (useful for the folding prover?)
 - [x] faster hashing via C FFI
 - [ ] faster NTT via C FFI
+- [ ] disk layout optimization
+- [ ] end-to-end workflow with input/output data in files
+- [ ] command line interface
+- [ ] even more detailed documentation of the protocol
 
 ### References
 
diff --git a/reference/src/Field/Goldilocks/Extension.hs b/reference/src/Field/Goldilocks/Extension.hs
index 73ea096..02254ac 100644
--- a/reference/src/Field/Goldilocks/Extension.hs
+++ b/reference/src/Field/Goldilocks/Extension.hs
@@ -1,176 +1,14 @@
 
--- | Quadratic extension over the Goldilocks field
---
--- We use the irreducble polynomial @x^2 - 7@ to be compatible with Plonky3
+{-# LANGUAGE CPP #-}
 
-module Field.Goldilocks.Extension where
+#ifdef USE_NAIVE_HASKELL
 
---------------------------------------------------------------------------------
+module Field.Goldilocks.Extension ( module Field.Goldilocks.Extension.Haskell ) where
+import Field.Goldilocks.Extension.Haskell
 
-import Prelude hiding ( div )
+#else
 
-import Data.Bits
-import Data.Ratio
+module Field.Goldilocks.Extension ( module Field.Goldilocks.Extension.BindC ) where
+import Field.Goldilocks.Extension.BindC
 
-import System.Random
-
-import Foreign.Ptr
-import Foreign.Storable
-
-import Data.Binary
-
-import Field.Goldilocks ( F )
-import qualified Field.Goldilocks as Goldi
-
---------------------------------------------------------------------------------
-
-type FExt = F2
-
-data F2 = F2 
-  { real :: !F 
-  , imag :: !F 
-  }
-  deriving (Eq)
-
-instance Binary F2 where
-  put (F2 x y) = put x >> put y
-  get = F2 <$> get <*> get
-  
-instance Show F2 where
-  show (F2 r i) = "(" ++ show r ++ " + j * " ++ show i ++ ")"
-
-instance Num F2 where
-  fromInteger = inj . fromIntegral
-  negate = neg
-  (+)    = add
-  (-)    = sub
-  (*)    = mul
-  abs    = id
-  signum _ = inj 1
-
-instance Fractional F2 where
-  fromRational y = fromInteger (numerator y) `div` fromInteger (denominator y)
-  recip  = inv
-  (/)    = div
-
-instance Random F2 where
-  -- random :: RandomGen g => g -> (a, g) 
-  random  g = let (x,g' ) = random g
-                  (y,g'') = random g'
-              in  (F2 x y, g'')
-  randomR = error "randomR/F2: doesn't make any sense"
-
-instance Storable F2 where
-  peek ptr = do
-    r <- peek (castPtr ptr) 
-    i <- peek (castPtr ptr `plusPtr` 8)
-    return (F2 r i)
-  poke ptr (F2 r i) = do
-    poke (castPtr ptr)             r
-    poke (castPtr ptr `plusPtr` 8) i
-  sizeOf    _ = 16
-  alignment _ = 8
-
---------------------------------------------------------------------------------
-
-zero, one, two :: F2
-zero = F2 Goldi.zero Goldi.zero 
-one  = F2 Goldi.one  Goldi.zero 
-two  = F2 Goldi.two  Goldi.zero 
-
-isZero, isOne :: F2 -> Bool
-isZero (F2 r i) = Goldi.isZero r && Goldi.isZero i
-isOne  (F2 r i) = Goldi.isOne  r && Goldi.isZero i
-
---------------------------------------------------------------------------------
-
-inj :: F -> F2
-inj r = F2 r 0 
-
-neg :: F2 -> F2
-neg (F2 r i) = F2 (negate r) (negate i)
-
-add :: F2 -> F2 -> F2
-add (F2 r1 i1) (F2 r2 i2) = F2 (r1 + r2) (i1 + i2)
-
-sub :: F2 -> F2 -> F2
-sub (F2 r1 i1) (F2 r2 i2) = F2 (r1 - r2) (i1 - i2)
-
-scl :: F -> F2 -> F2
-scl s (F2 r i) = F2 (s * r) (s * i)
-
-sqrNaive :: F2 -> F2
-sqrNaive (F2 r i) = F2 r3 i3 where
-  r3 = r*r + 7 * i*i
-  i3 = 2 * r*i
-
-mulNaive :: F2 -> F2 -> F2
-mulNaive (F2 r1 i1) (F2 r2 i2) = F2 r3 i3 where
-  r3 = r1 * r2 + 7 * i1 * i2
-  i3 = r1 * i2 +     i1 * r2
-
--- uses Karatsuba trick to have one less multiplications
-mulKaratsuba :: F2 -> F2 -> F2
-mulKaratsuba (F2 r1 i1) (F2 r2 i2) = F2 r3 i3 where
-  u = r1*r2
-  w = i1*i2
-  v = (r1+i1)*(r2+i2)
-  r3 = u + 7*w
-  i3 = v - u - w
-
-sqr :: F2 -> F2
-sqr = sqrNaive
-
-mul :: F2 -> F2 -> F2
-mul = mulKaratsuba
-
---------------------------------------------------------------------------------
--- * inverse and division
-
--- | We can solve the equation explicitly.
---
--- > irred = x^2 + p*x + q
--- > (a*x + b) * (c*x + d) = (a*c)*x^2 + (a*d+b*c)*x + (b*d)
--- >                       = (a*d + b*c - a*c*p)*x + (b*d - a*c*q)
---
--- and then we want to solve
---
--- > b*d       - a*c*q == 1
--- > a*d + b*c - a*c*p == 0
---
--- which has the solution:
---
--- > c = - a       / (b^2 - a*b*p + a^2*q)  
--- > d = (b - a*p) / (b^2 - a*b*p + a^2*q)
---
--- Remark: It seems the denominator being zero would mean that our
--- defining polynomial is not irreducible.
---
--- Note: we can optimize for the common case p=0; and also for q=1.
---
-inv :: F2 -> F2
-inv (F2 b a) = F2 d c where
-  denom = b*b - 7*a*a
-  c = - a / denom
-  d =   b / denom
-
-div :: F2 -> F2 -> F2
-div u v = mul u (inv v)
-
---------------------------------------------------------------------------------
-
-pow_ :: F2 -> Int -> F2
-pow_ x e = pow x (fromIntegral e)
-
-pow :: F2 -> Integer -> F2
-pow x e 
-  | e == 0    = 1
-  | e <  0    = pow (inv x) (negate e)
-  | otherwise = go 1 x e
-  where
-    go !acc _  0     = acc
-    go !acc !s !expo = case expo .&. 1 of
-      0 -> go acc     (sqr s) (shiftR expo 1)
-      _ -> go (acc*s) (sqr s) (shiftR expo 1)
-
---------------------------------------------------------------------------------
+#endif
diff --git a/reference/src/Field/Goldilocks/Extension/BindC.hs b/reference/src/Field/Goldilocks/Extension/BindC.hs
new file mode 100644
index 0000000..f85ec96
--- /dev/null
+++ b/reference/src/Field/Goldilocks/Extension/BindC.hs
@@ -0,0 +1,193 @@
+
+-- | Bindings to a C implementation of the quadratic extension over the Goldilocks prime field
+--
+-- It's probably not significantly (if at all) faster than the naive Haskell combined 
+-- with the fast Goldilocks base field operations, but the C versions should be useful 
+-- for the vector operations, and this way we can test them easily.
+
+{-# LANGUAGE ForeignFunctionInterface, BangPatterns, NumericUnderscores #-}
+module Field.Goldilocks.Extension.BindC where
+
+--------------------------------------------------------------------------------
+
+import Prelude hiding ( div )
+import qualified Prelude
+
+import Data.Bits
+import Data.Word
+import Data.Ratio
+
+import Foreign.C
+import Foreign.Ptr
+import Foreign.Storable
+import Foreign.Marshal
+
+import System.Random
+import System.IO.Unsafe
+
+import Data.Binary
+import Data.Binary.Get ( getWord64le )
+import Data.Binary.Put ( putWord64le )
+
+import Text.Printf
+
+import Field.Goldilocks ( F , Goldilocks(..) )
+import qualified Field.Goldilocks as Goldi
+
+import Data.Flat
+
+--------------------------------------------------------------------------------
+
+type FExt = F2
+
+data F2 = F2 
+  { real :: !F 
+  , imag :: !F 
+  }
+  deriving (Eq)
+
+--------------------------------------------------------------------------------
+
+instance Binary F2 where
+  put (F2 x y) = put x >> put y
+  get = F2 <$> get <*> get
+  
+instance Storable F2 where
+  peek ptr = do
+    r <- peek (castPtr ptr) 
+    i <- peek (castPtr ptr `plusPtr` 8)
+    return (F2 r i)
+  poke ptr (F2 r i) = do
+    poke (castPtr ptr)             r
+    poke (castPtr ptr `plusPtr` 8) i
+  sizeOf    _ = 16
+  alignment _ = 8
+
+instance Flat F2 where
+  sizeInBytes  _ = 16
+  sizeInQWords _ = 2
+
+  withFlat (F2 x y) action = allocaBytesAligned 16 8 $ \ptr -> do
+    poke (castPtr ptr            ) x
+    poke (castPtr ptr `plusPtr` 8) y
+    action ptr 
+
+  makeFlat ptr = do
+    x <- peek (castPtr ptr            )
+    y <- peek (castPtr ptr `plusPtr` 8)
+    return (F2 x y)
+
+--------------------------------------------------------------------------------
+
+instance Show F2 where
+  show (F2 r i) = "(" ++ show r ++ " + j * " ++ show i ++ ")"
+
+instance Num F2 where
+  fromInteger = inj . fromIntegral
+  negate = neg
+  (+)    = add
+  (-)    = sub
+  (*)    = mul
+  abs    = id
+  signum _ = inj 1
+
+instance Fractional F2 where
+  fromRational y = fromInteger (numerator y) `div` fromInteger (denominator y)
+  recip  = inv
+  (/)    = div
+
+instance Random F2 where
+  -- random :: RandomGen g => g -> (a, g) 
+  random  g = let (x,g' ) = random g
+                  (y,g'') = random g'
+              in  (F2 x y, g'')
+  randomR = error "randomR/F2: doesn't make any sense"
+
+--------------------------------------------------------------------------------
+
+zero, one, two :: F2
+zero = F2 Goldi.zero Goldi.zero 
+one  = F2 Goldi.one  Goldi.zero 
+two  = F2 Goldi.two  Goldi.zero 
+
+isZero, isOne :: F2 -> Bool
+isZero (F2 r i) = Goldi.isZero r && Goldi.isZero i
+isOne  (F2 r i) = Goldi.isOne  r && Goldi.isZero i
+
+--------------------------------------------------------------------------------
+
+{-# NOINLINE unaryOpIO #-}
+unaryOpIO :: (Ptr Word64 -> Ptr Word64 -> IO ()) -> F2 -> IO F2
+unaryOpIO c_action x = allocaBytesAligned 32 8 $ \ptr1 -> do
+  let ptr2 = plusPtr ptr1 16
+  poke (castPtr ptr1) x
+  c_action ptr1 ptr2
+  peek (castPtr ptr2)
+
+{-# NOINLINE binaryOpIO #-}
+binaryOpIO :: (Ptr Word64 -> Ptr Word64 -> Ptr Word64 -> IO ()) -> F2 -> F2 -> IO F2
+binaryOpIO c_action x y = allocaBytesAligned 48 8 $ \ptr1 -> do
+  let ptr2 = plusPtr ptr1 16
+  let ptr3 = plusPtr ptr1 32
+  poke (castPtr ptr1) x
+  poke (castPtr ptr2) y
+  c_action ptr1 ptr2 ptr3
+  peek (castPtr ptr3)
+
+--------------------------------------------------------------------------------
+
+inj :: F -> F2
+inj r = F2 r 0 
+
+neg, sqr, inv :: F2 -> F2
+neg x = unsafePerformIO (unaryOpIO c_goldilocks_ext_neg x)
+sqr x = unsafePerformIO (unaryOpIO c_goldilocks_ext_sqr x)
+inv x = unsafePerformIO (unaryOpIO c_goldilocks_ext_inv x)
+
+add, sub, mul, div  :: F2 -> F2 -> F2
+add x y = unsafePerformIO (binaryOpIO c_goldilocks_ext_add x y)
+sub x y = unsafePerformIO (binaryOpIO c_goldilocks_ext_sub x y)
+mul x y = unsafePerformIO (binaryOpIO c_goldilocks_ext_mul x y)
+div x y = unsafePerformIO (binaryOpIO c_goldilocks_ext_div x y)
+
+{-# NOINLINE sclIO #-}
+sclIO :: F -> F2 -> IO F2
+sclIO (MkGoldilocks s) x  = allocaBytesAligned 32 8 $ \ptr1 -> do
+  let ptr2 = plusPtr ptr1 16
+  poke (castPtr ptr1) x
+  c_goldilocks_ext_scl s ptr1 ptr2
+  peek (castPtr ptr2)
+
+{-# NOINLINE powIO #-}
+powIO :: F2 -> Int -> IO F2
+powIO base expo = allocaBytesAligned 32 8 $ \ptr1 -> do
+  let ptr2 = plusPtr ptr1 16
+  poke (castPtr ptr1) base
+  c_goldilocks_ext_pow ptr1 (fromIntegral expo :: CInt) ptr2
+  peek (castPtr ptr2)
+
+scl :: F -> F2 -> F2
+scl s x = unsafePerformIO (sclIO s x)
+
+pow_ :: F2 -> Int -> F2
+pow_ b e = unsafePerformIO (powIO b e)
+
+-- | NOTE: this is technically incorrect (it only works for small exponents), 
+-- but we don't really care
+pow :: F2 -> Integer -> F2
+pow b e = pow_ b (fromInteger e)
+
+--------------------------------------------------------------------------------
+
+foreign import ccall unsafe "goldilocks_ext_neg" c_goldilocks_ext_neg :: Ptr Word64               -> Ptr Word64 -> IO ()
+foreign import ccall unsafe "goldilocks_ext_add" c_goldilocks_ext_add :: Ptr Word64 -> Ptr Word64 -> Ptr Word64 -> IO ()
+foreign import ccall unsafe "goldilocks_ext_sub" c_goldilocks_ext_sub :: Ptr Word64 -> Ptr Word64 -> Ptr Word64 -> IO ()
+foreign import ccall unsafe "goldilocks_ext_scl" c_goldilocks_ext_scl ::     Word64 -> Ptr Word64 -> Ptr Word64 -> IO ()
+foreign import ccall unsafe "goldilocks_ext_sqr" c_goldilocks_ext_sqr :: Ptr Word64               -> Ptr Word64 -> IO ()
+foreign import ccall unsafe "goldilocks_ext_mul" c_goldilocks_ext_mul :: Ptr Word64 -> Ptr Word64 -> Ptr Word64 -> IO ()
+foreign import ccall unsafe "goldilocks_ext_inv" c_goldilocks_ext_inv :: Ptr Word64               -> Ptr Word64 -> IO ()
+foreign import ccall unsafe "goldilocks_ext_div" c_goldilocks_ext_div :: Ptr Word64 -> Ptr Word64 -> Ptr Word64 -> IO ()
+foreign import ccall unsafe "goldilocks_ext_pow" c_goldilocks_ext_pow :: Ptr Word64 -> CInt       -> Ptr Word64 -> IO ()
+
+--------------------------------------------------------------------------------
+
diff --git a/reference/src/Field/Goldilocks/Extension/Haskell.hs b/reference/src/Field/Goldilocks/Extension/Haskell.hs
new file mode 100644
index 0000000..fa7c586
--- /dev/null
+++ b/reference/src/Field/Goldilocks/Extension/Haskell.hs
@@ -0,0 +1,197 @@
+
+-- | Quadratic extension over the Goldilocks field
+--
+-- We use the irreducble polynomial @x^2 - 7@ to be compatible with Plonky3
+
+module Field.Goldilocks.Extension.Haskell where
+
+--------------------------------------------------------------------------------
+
+import Prelude hiding ( div )
+
+import Data.Bits
+import Data.Ratio
+
+import System.Random
+
+import Foreign.Ptr
+import Foreign.Storable
+import Foreign.Marshal
+
+import Data.Binary
+
+import Data.Flat
+
+import Field.Goldilocks ( F )
+import qualified Field.Goldilocks as Goldi
+
+--------------------------------------------------------------------------------
+
+type FExt = F2
+
+data F2 = F2 
+  { real :: !F 
+  , imag :: !F 
+  }
+  deriving (Eq)
+
+--------------------------------------------------------------------------------
+
+instance Binary F2 where
+  put (F2 x y) = put x >> put y
+  get = F2 <$> get <*> get
+  
+instance Storable F2 where
+  peek ptr = do
+    r <- peek (castPtr ptr) 
+    i <- peek (castPtr ptr `plusPtr` 8)
+    return (F2 r i)
+  poke ptr (F2 r i) = do
+    poke (castPtr ptr)             r
+    poke (castPtr ptr `plusPtr` 8) i
+  sizeOf    _ = 16
+  alignment _ = 8
+
+instance Flat F2 where
+  sizeInBytes  _ = 16
+  sizeInQWords _ = 2
+
+  withFlat (F2 x y) action = allocaBytesAligned 16 8 $ \ptr -> do
+    poke (castPtr ptr            ) x
+    poke (castPtr ptr `plusPtr` 8) y
+    action ptr 
+
+  makeFlat ptr = do
+    x <- peek (castPtr ptr            )
+    y <- peek (castPtr ptr `plusPtr` 8)
+    return (F2 x y)
+
+--------------------------------------------------------------------------------
+
+instance Show F2 where
+  show (F2 r i) = "(" ++ show r ++ " + j * " ++ show i ++ ")"
+
+instance Num F2 where
+  fromInteger = inj . fromIntegral
+  negate = neg
+  (+)    = add
+  (-)    = sub
+  (*)    = mul
+  abs    = id
+  signum _ = inj 1
+
+instance Fractional F2 where
+  fromRational y = fromInteger (numerator y) `div` fromInteger (denominator y)
+  recip  = inv
+  (/)    = div
+
+instance Random F2 where
+  -- random :: RandomGen g => g -> (a, g) 
+  random  g = let (x,g' ) = random g
+                  (y,g'') = random g'
+              in  (F2 x y, g'')
+  randomR = error "randomR/F2: doesn't make any sense"
+
+--------------------------------------------------------------------------------
+
+zero, one, two :: F2
+zero = F2 Goldi.zero Goldi.zero 
+one  = F2 Goldi.one  Goldi.zero 
+two  = F2 Goldi.two  Goldi.zero 
+
+isZero, isOne :: F2 -> Bool
+isZero (F2 r i) = Goldi.isZero r && Goldi.isZero i
+isOne  (F2 r i) = Goldi.isOne  r && Goldi.isZero i
+
+--------------------------------------------------------------------------------
+
+inj :: F -> F2
+inj r = F2 r 0 
+
+neg :: F2 -> F2
+neg (F2 r i) = F2 (negate r) (negate i)
+
+add :: F2 -> F2 -> F2
+add (F2 r1 i1) (F2 r2 i2) = F2 (r1 + r2) (i1 + i2)
+
+sub :: F2 -> F2 -> F2
+sub (F2 r1 i1) (F2 r2 i2) = F2 (r1 - r2) (i1 - i2)
+
+scl :: F -> F2 -> F2
+scl s (F2 r i) = F2 (s * r) (s * i)
+
+sqrNaive :: F2 -> F2
+sqrNaive (F2 r i) = F2 r3 i3 where
+  r3 = r*r + 7 * i*i
+  i3 = 2 * r*i
+
+mulNaive :: F2 -> F2 -> F2
+mulNaive (F2 r1 i1) (F2 r2 i2) = F2 r3 i3 where
+  r3 = r1 * r2 + 7 * i1 * i2
+  i3 = r1 * i2 +     i1 * r2
+
+-- uses Karatsuba trick to have one less multiplications
+mulKaratsuba :: F2 -> F2 -> F2
+mulKaratsuba (F2 r1 i1) (F2 r2 i2) = F2 r3 i3 where
+  u = r1*r2
+  w = i1*i2
+  v = (r1+i1)*(r2+i2)
+  r3 = u + 7*w
+  i3 = v - u - w
+
+sqr :: F2 -> F2
+sqr = sqrNaive
+
+mul :: F2 -> F2 -> F2
+mul = mulKaratsuba
+
+--------------------------------------------------------------------------------
+-- * inverse and division
+
+-- | We can solve the equation explicitly.
+--
+-- > irred = x^2 + p*x + q
+-- > (a*x + b) * (c*x + d) = (a*c)*x^2 + (a*d+b*c)*x + (b*d)
+-- >                       = (a*d + b*c - a*c*p)*x + (b*d - a*c*q)
+--
+-- and then we want to solve
+--
+-- > b*d       - a*c*q == 1
+-- > a*d + b*c - a*c*p == 0
+--
+-- which has the solution:
+--
+-- > c = - a       / (b^2 - a*b*p + a^2*q)  
+-- > d = (b - a*p) / (b^2 - a*b*p + a^2*q)
+--
+-- Remark: It seems the denominator being zero would mean that our
+-- defining polynomial is not irreducible.
+--
+-- Note: we can optimize for the common case p=0; and also for q=1.
+--
+inv :: F2 -> F2
+inv (F2 b a) = F2 d c where
+  denom = b*b - 7*a*a
+  c = - a / denom
+  d =   b / denom
+
+div :: F2 -> F2 -> F2
+div u v = mul u (inv v)
+
+--------------------------------------------------------------------------------
+
+pow_ :: F2 -> Int -> F2
+pow_ x e = pow x (fromIntegral e)
+
+pow :: F2 -> Integer -> F2
+pow x e 
+  | e == 0    = 1
+  | e <  0    = pow (inv x) (negate e)
+  | otherwise = go 1 x e
+  where
+    go !acc _  0     = acc
+    go !acc !s !expo = case expo .&. 1 of
+      0 -> go acc     (sqr s) (shiftR expo 1)
+      _ -> go (acc*s) (sqr s) (shiftR expo 1)
+
+--------------------------------------------------------------------------------
diff --git a/reference/src/Field/Goldilocks/Fast.hs b/reference/src/Field/Goldilocks/Fast.hs
index 09b0e1e..e04b80a 100644
--- a/reference/src/Field/Goldilocks/Fast.hs
+++ b/reference/src/Field/Goldilocks/Fast.hs
@@ -16,6 +16,7 @@ import Data.Ratio
 import Foreign.C
 import Foreign.Ptr
 import Foreign.Storable
+import Foreign.Marshal
 
 import System.Random
 
@@ -25,6 +26,8 @@ import Data.Binary.Put ( putWord64le )
 
 import Text.Printf
 
+import Data.Flat
+
 --------------------------------------------------------------------------------
 
 type F = Goldilocks
@@ -51,6 +54,12 @@ instance Storable F where
   sizeOf    _ = 8
   alignment _ = 8
 
+instance Flat Goldilocks where
+  sizeInBytes  _ = 8
+  sizeInQWords _ = 1
+  withFlat (MkGoldilocks x) action = alloca $ \ptr -> poke ptr x >> action ptr
+  makeFlat ptr = MkGoldilocks <$> peek ptr
+
 --------------------------------------------------------------------------------
 
 newtype Goldilocks 
diff --git a/reference/src/Field/Goldilocks/Slow.hs b/reference/src/Field/Goldilocks/Slow.hs
index 1692f6a..ef400a6 100644
--- a/reference/src/Field/Goldilocks/Slow.hs
+++ b/reference/src/Field/Goldilocks/Slow.hs
@@ -13,6 +13,10 @@ import Data.Bits
 import Data.Word
 import Data.Ratio
 
+import Foreign.Ptr
+import Foreign.Storable
+import Foreign.Marshal
+
 import System.Random
 
 import Data.Binary
@@ -21,6 +25,8 @@ import Data.Binary.Put ( putWord64le )
 
 import Text.Printf
 
+import Data.Flat
+
 --------------------------------------------------------------------------------
 
 type F = Goldilocks
@@ -34,10 +40,24 @@ toF = mkGoldilocks . fromIntegral
 intToF :: Int -> F
 intToF = mkGoldilocks . fromIntegral
 
+--------------------------------------------------------------------------------
+
 instance Binary F where
   put x = putWord64le (fromF x)
   get = toF <$> getWord64le
 
+instance Storable F where
+  peek ptr                   = (MkGoldilocks . fromIntegral) <$> peek (castPtr ptr :: Ptr Word64)
+  poke ptr  (MkGoldilocks x) = poke (castPtr ptr :: Ptr Word64) (fromInteger x :: Word64)
+  sizeOf    _ = 8
+  alignment _ = 8
+
+instance Flat Goldilocks where
+  sizeInBytes  _ = 8
+  sizeInQWords _ = 1
+  withFlat (MkGoldilocks x) action = alloca $ \ptr -> poke ptr (fromInteger x :: Word64) >> action ptr
+  makeFlat ptr = (MkGoldilocks . fromIntegral) <$> peek ptr
+
 --------------------------------------------------------------------------------
 
 newtype Goldilocks 
diff --git a/reference/src/cbits/compile.sh b/reference/src/cbits/compile.sh
index 628824a..c52bddd 100755
--- a/reference/src/cbits/compile.sh
+++ b/reference/src/cbits/compile.sh
@@ -1,4 +1,7 @@
 #!/bin/bash
 
 gcc -c -O2 goldilocks.c
+gcc -c -O2 goldilocks_ext.c
 gcc -c -O2 monolith.c
+gcc -c -O2 ntt.c
+
diff --git a/reference/src/cbits/goldilocks.c b/reference/src/cbits/goldilocks.c
index c404fba..b8ed831 100644
--- a/reference/src/cbits/goldilocks.c
+++ b/reference/src/cbits/goldilocks.c
@@ -1,4 +1,6 @@
 
+// the "Goldilocks" prime field of size `p = 2^64 - 2^32 + 1`
+
 #include <stdint.h>
 #include <stdio.h>      // for testing only
 #include <assert.h>
@@ -126,6 +128,22 @@ uint64_t goldilocks_mul_small(uint64_t x, uint32_t y) {
 
 //------------------------------------------------------------------------------
 
+uint64_t goldilocks_mul_by_2(uint64_t x) {
+  return goldilocks_add( x , x ); 
+}
+
+uint64_t goldilocks_a_plus_7b(uint64_t a, uint64_t b) {
+  __uint128_t z = (__uint128_t)a + 7 * (__uint128_t)b ;
+  return goldilocks_rdc_small( z );
+}
+
+uint64_t goldilocks_a_minus_7b(uint64_t a, uint64_t b) {
+  uint64_t b7 = goldilocks_rdc_small ( 7 * (__uint128_t)b ) ;
+  return goldilocks_sub( a , b7 );
+}
+
+//------------------------------------------------------------------------------
+
 uint64_t goldilocks_euclid(uint64_t x0, uint64_t y0, uint64_t u0, uint64_t v0) {
 
   uint64_t x = x0;
@@ -178,6 +196,15 @@ uint64_t goldilocks_inv(uint64_t a) {
   return goldilocks_div(1, a);
 }
 
+uint64_t goldilocks_div_by_2(uint64_t a) {
+  if (a & 1) {
+    return ((a >> 1) + GOLDILOCKS_HALFPRIME_PLUS1);
+  }
+  else {
+    return (a >> 1);
+  }
+}
+
 //------------------------------------------------------------------------------
 
 uint64_t goldilocks_pow(uint64_t base, int expo) {
@@ -193,7 +220,7 @@ uint64_t goldilocks_pow(uint64_t base, int expo) {
       acc = goldilocks_mul( acc, sq );
     } 
     if (e > 0) {
-      sq = goldilocks_mul( sq , sq );
+      sq = goldilocks_sqr( sq );
       e  = e >> 1;
     }
   }
diff --git a/reference/src/cbits/goldilocks.h b/reference/src/cbits/goldilocks.h
index c53af1e..11a9bfe 100644
--- a/reference/src/cbits/goldilocks.h
+++ b/reference/src/cbits/goldilocks.h
@@ -1,4 +1,6 @@
 
+// the "Goldilocks" prime field of size `p = 2^64 - 2^32 + 1`
+
 #include <stdint.h>
 
 //------------------------------------------------------------------------------
@@ -19,6 +21,11 @@ uint64_t goldilocks_inv(uint64_t a);
 uint64_t goldilocks_div(uint64_t a, uint64_t b);
 uint64_t goldilocks_pow(uint64_t b, int e);
 
+uint64_t goldilocks_mul_by_2  (uint64_t x);
+uint64_t goldilocks_div_by_2  (uint64_t a);
+uint64_t goldilocks_a_plus_7b (uint64_t a, uint64_t b);
+uint64_t goldilocks_a_minus_7b(uint64_t a, uint64_t b);
+
 //------------------------------------------------------------------------------
 
 uint64_t goldilocks_rdc          (__uint128_t x);
diff --git a/reference/src/cbits/goldilocks_ext.c b/reference/src/cbits/goldilocks_ext.c
new file mode 100644
index 0000000..22c6cce
--- /dev/null
+++ b/reference/src/cbits/goldilocks_ext.c
@@ -0,0 +1,147 @@
+
+// quadratic field extension F[x] = F(x) / (x^2 - 7) over the Goldilocks field
+
+#include "goldilocks.h"
+#include "goldilocks_ext.h"
+
+//------------------------------------------------------------------------------
+
+void goldilocks_ext_inj( uint64_t input , uint64_t *out ) {
+  out[0] = input;
+  out[1] = 0;
+}
+
+void goldilocks_ext_set_one( uint64_t *out ) {
+  out[0] = 1;
+  out[1] = 0;
+}
+
+//------------------------------------------------------------------------------
+
+void goldilocks_ext_neg(const uint64_t *x , uint64_t *out) {
+  out[0] = goldilocks_neg( x[0] );
+  out[1] = goldilocks_neg( x[1] );
+}
+
+void goldilocks_ext_add(const uint64_t *x , const uint64_t *y , uint64_t *out) {
+  out[0] = goldilocks_add( x[0] , y[0] );
+  out[1] = goldilocks_add( x[1] , y[1] );
+}
+
+void goldilocks_ext_sub(const uint64_t *x , const uint64_t *y , uint64_t *out) {
+  out[0] = goldilocks_sub( x[0] , y[0] );
+  out[1] = goldilocks_sub( x[1] , y[1] );
+}
+
+void goldilocks_ext_scl(uint64_t s , const uint64_t *x , uint64_t *out) {
+  out[0] = goldilocks_mul( s , x[0] );
+  out[1] = goldilocks_mul( s , x[1] );
+}
+
+//------------------------------------------------------------------------------
+
+/*
+sqrNaive :: F2 -> F2
+sqrNaive (F2 r i) = F2 r3 i3 where
+  r3 = r*r + 7 * i*i
+  i3 = 2 * r*i
+
+-- uses Karatsuba trick to have one less multiplications
+mulKaratsuba :: F2 -> F2 -> F2
+mulKaratsuba (F2 r1 i1) (F2 r2 i2) = F2 r3 i3 where
+  u = r1*r2
+  w = i1*i2
+  v = (r1+i1)*(r2+i2)
+  r3 = u + 7*w
+  i3 = v - u - w
+*/
+
+void goldilocks_ext_sqr(const uint64_t *x , uint64_t *out) {
+  uint64_t u = goldilocks_a_plus_7b( goldilocks_sqr(x[0]) , goldilocks_sqr( x[1] )) ;
+  uint64_t v = goldilocks_mul_by_2( goldilocks_mul( x[0] , x[1] ) );
+  out[0] = u;   
+  out[1] = v;     // because we want inplace to work too
+}
+
+// uses Karatsuba trick to have one less multiplications
+void goldilocks_ext_mul(const uint64_t *x , const uint64_t *y , uint64_t *out) {
+  uint64_t u = goldilocks_mul( x[0] , y[0] );
+  uint64_t w = goldilocks_mul( x[1] , y[1] );
+  uint64_t v = goldilocks_mul( goldilocks_add( x[0] , x[1] ) , goldilocks_add( y[0] , y[1] ) );
+  out[0] = goldilocks_a_plus_7b( u , w );
+  out[1] = goldilocks_sub( v , goldilocks_add( u , w ) );
+}
+
+//------------------------------------------------------------------------------
+
+// We can solve the equation explicitly.
+//
+// > irred = x^2 + p*x + q
+// > (a*x + b) * (c*x + d) = (a*c)*x^2 + (a*d+b*c)*x + (b*d)
+// >                       = (a*d + b*c - a*c*p)*x + (b*d - a*c*q)
+//
+// and then we want to solve
+//
+// > b*d       - a*c*q == 1
+// > a*d + b*c - a*c*p == 0
+//
+// which has the solution:
+//
+// > c = - a       / (b^2 - a*b*p + a^2*q)  
+// > d = (b - a*p) / (b^2 - a*b*p + a^2*q)
+//
+// Remark: It seems the denominator being zero would mean that our
+// defining polynomial is not irreducible.
+//
+// Note: we can optimize for the common case p=0; and also for q=1.
+//
+void goldilocks_ext_inv(const uint64_t *ptr , uint64_t *out) {
+  uint64_t b = ptr[0];
+  uint64_t a = ptr[1];   // (a*x + b)
+  uint64_t denom = goldilocks_a_minus_7b( goldilocks_sqr(b) , goldilocks_sqr(a) );
+  uint64_t denom_inv = goldilocks_inv( denom );
+  out[0] =                 goldilocks_mul( b , denom_inv )  ;
+  out[1] = goldilocks_neg( goldilocks_mul( a , denom_inv ) );  
+}
+
+void goldilocks_ext_div(const uint64_t *x , const uint64_t *y , uint64_t *out) {
+  uint64_t invy[2];
+  goldilocks_ext_inv( y , invy );
+  goldilocks_ext_mul( x , invy , out );
+}
+
+//------------------------------------------------------------------------------
+
+void goldilocks_ext_pow(const uint64_t *base , int expo , uint64_t *out) {
+
+  if (expo == 0) { 
+    goldilocks_ext_set_one(out);
+    return;
+  }
+
+  if (expo <  0) { 
+    uint64_t base_inv[2];
+    goldilocks_ext_inv( base , base_inv );
+    goldilocks_ext_pow( base_inv , -expo , out ); 
+    return;
+  }
+
+  int      e   = expo;
+  uint64_t sq [2] ; sq[0] = base[0] ; sq[1] = base[1] ;
+  uint64_t acc[2] ; goldilocks_ext_set_one( acc );
+
+  while (e != 0) { 
+    if ((e & 1) != 0) {
+      goldilocks_ext_mul( acc , sq , acc );
+    } 
+    if (e > 0) {
+      goldilocks_ext_sqr( sq , sq );
+      e  = e >> 1;
+    }
+  }
+
+  out[0] = acc[0];
+  out[1] = acc[1];
+}
+
+//------------------------------------------------------------------------------
diff --git a/reference/src/cbits/goldilocks_ext.h b/reference/src/cbits/goldilocks_ext.h
new file mode 100644
index 0000000..ab02a0d
--- /dev/null
+++ b/reference/src/cbits/goldilocks_ext.h
@@ -0,0 +1,23 @@
+
+// quadratic field extension F[x] = F(x) / (x^2 - 7) over the Goldilocks field
+
+#include <stdint.h>
+#include "goldilocks.h"
+
+//------------------------------------------------------------------------------
+
+void goldilocks_ext_inj( uint64_t input , uint64_t *out );
+void goldilocks_ext_set_one( uint64_t *out );
+
+void goldilocks_ext_neg(const uint64_t *x                     , uint64_t *out);
+void goldilocks_ext_add(const uint64_t *x , const uint64_t *y , uint64_t *out);
+void goldilocks_ext_sub(const uint64_t *x , const uint64_t *y , uint64_t *out);
+void goldilocks_ext_scl(       uint64_t s , const uint64_t *x , uint64_t *out);
+void goldilocks_ext_sqr(const uint64_t *x                     , uint64_t *out);
+void goldilocks_ext_mul(const uint64_t *x , const uint64_t *y , uint64_t *out);
+void goldilocks_ext_inv(const uint64_t *x                     , uint64_t *out);
+void goldilocks_ext_div(const uint64_t *x , const uint64_t *y , uint64_t *out);
+void goldilocks_ext_pow(const uint64_t *b , int e             , uint64_t *out);
+
+//------------------------------------------------------------------------------
+
diff --git a/reference/src/runi.sh b/reference/src/runi.sh
index f1fc46f..51aabbb 100755
--- a/reference/src/runi.sh
+++ b/reference/src/runi.sh
@@ -1,3 +1,3 @@
 #!/bin/bash
 
-ghci testMain.hs cbits/goldilocks.o cbits/monolith.o 
\ No newline at end of file
+ghci testMain.hs cbits/goldilocks.o cbits/goldilocks_ext.o cbits/monolith.o cbits/ntt.o