add quadratic field extension C implementation (will be useful for vector operations?)

2026-02-20 13:43:08 +00:00 · 2025-10-21 13:14:07 +02:00 · 2025-10-21 13:14:07 +02:00 · 4d65a0b042
commit 4d65a0b042
parent b2d8b577f7
12 changed files with 646 additions and 174 deletions
--- a/reference/README.md
+++ b/reference/README.md
@ -16,18 +16,26 @@ We could significantly improve the speed of the Haskell implementation by bindin
 for some of the critical routines: Goldilocks field and extension, hashing, fast Fourier 
 transform.
 The switch between the simple but intentionally naive (and very slow) Haskell 
 implementation and the significantly faster C bindings is controlled by by the 
 C preprocessor flag `-DUSE_NAIVE_HASKELL` (so the faster one is the default).
 ### Implementation status
 - [ ] cabalize
 - [x] FRI prover
 - [x] FRI verifier
 - [x] proof serialization
 - [ ] serious testing of the FRI verifier
 - [ ] full outsourcing protocol
 - [ ] command line interface
 - [x] faster Goldilocks field operations via C FFI
- [ ] quadratic field extension in C too
+- [x] quadratic field extension in C too (useful for the folding prover?)
 - [x] faster hashing via C FFI
 - [ ] faster NTT via C FFI
 - [ ] disk layout optimization
 - [ ] end-to-end workflow with input/output data in files
 - [ ] command line interface
 - [ ] even more detailed documentation of the protocol
 ### References
--- a/reference/src/Field/Goldilocks/Extension.hs
+++ b/reference/src/Field/Goldilocks/Extension.hs
@ -1,176 +1,14 @@
-- | Quadratic extension over the Goldilocks field
+{-# LANGUAGE CPP #-}
 --
 -- We use the irreducble polynomial @x^2 - 7@ to be compatible with Plonky3
-module Field.Goldilocks.Extension where
+#ifdef USE_NAIVE_HASKELL
--------------------------------------------------------------------------------
+module Field.Goldilocks.Extension ( module Field.Goldilocks.Extension.Haskell ) where
 import Field.Goldilocks.Extension.Haskell
-import Prelude hiding ( div )
+#else
-import Data.Bits
+module Field.Goldilocks.Extension ( module Field.Goldilocks.Extension.BindC ) where
-import Data.Ratio
+import Field.Goldilocks.Extension.BindC
-import System.Random
+#endif
 import Foreign.Ptr
 import Foreign.Storable
 import Data.Binary
 import Field.Goldilocks ( F )
 import qualified Field.Goldilocks as Goldi
 --------------------------------------------------------------------------------
 type FExt = F2
 data F2 = F2 
  { real :: !F 
  , imag :: !F 
  }
  deriving (Eq)
 instance Binary F2 where
  put (F2 x y) = put x >> put y
  get = F2 <$> get <*> get
 instance Show F2 where
  show (F2 r i) = "(" ++ show r ++ " + j * " ++ show i ++ ")"
 instance Num F2 where
  fromInteger = inj . fromIntegral
  negate = neg
  (+)    = add
  (-)    = sub
  (*)    = mul
  abs    = id
  signum _ = inj 1
 instance Fractional F2 where
  fromRational y = fromInteger (numerator y) `div` fromInteger (denominator y)
  recip  = inv
  (/)    = div
 instance Random F2 where
  -- random :: RandomGen g => g -> (a, g) 
  random  g = let (x,g' ) = random g
                  (y,g'') = random g'
              in  (F2 x y, g'')
  randomR = error "randomR/F2: doesn't make any sense"
 instance Storable F2 where
  peek ptr = do
    r <- peek (castPtr ptr) 
    i <- peek (castPtr ptr `plusPtr` 8)
    return (F2 r i)
  poke ptr (F2 r i) = do
    poke (castPtr ptr)             r
    poke (castPtr ptr `plusPtr` 8) i
  sizeOf    _ = 16
  alignment _ = 8
 --------------------------------------------------------------------------------
 zero, one, two :: F2
 zero = F2 Goldi.zero Goldi.zero 
 one  = F2 Goldi.one  Goldi.zero 
 two  = F2 Goldi.two  Goldi.zero 
 isZero, isOne :: F2 -> Bool
 isZero (F2 r i) = Goldi.isZero r && Goldi.isZero i
 isOne  (F2 r i) = Goldi.isOne  r && Goldi.isZero i
 --------------------------------------------------------------------------------
 inj :: F -> F2
 inj r = F2 r 0 
 neg :: F2 -> F2
 neg (F2 r i) = F2 (negate r) (negate i)
 add :: F2 -> F2 -> F2
 add (F2 r1 i1) (F2 r2 i2) = F2 (r1 + r2) (i1 + i2)
 sub :: F2 -> F2 -> F2
 sub (F2 r1 i1) (F2 r2 i2) = F2 (r1 - r2) (i1 - i2)
 scl :: F -> F2 -> F2
 scl s (F2 r i) = F2 (s * r) (s * i)
 sqrNaive :: F2 -> F2
 sqrNaive (F2 r i) = F2 r3 i3 where
  r3 = r*r + 7 * i*i
  i3 = 2 * r*i
 mulNaive :: F2 -> F2 -> F2
 mulNaive (F2 r1 i1) (F2 r2 i2) = F2 r3 i3 where
  r3 = r1 * r2 + 7 * i1 * i2
  i3 = r1 * i2 +     i1 * r2
 -- uses Karatsuba trick to have one less multiplications
 mulKaratsuba :: F2 -> F2 -> F2
 mulKaratsuba (F2 r1 i1) (F2 r2 i2) = F2 r3 i3 where
  u = r1*r2
  w = i1*i2
  v = (r1+i1)*(r2+i2)
  r3 = u + 7*w
  i3 = v - u - w
 sqr :: F2 -> F2
 sqr = sqrNaive
 mul :: F2 -> F2 -> F2
 mul = mulKaratsuba
 --------------------------------------------------------------------------------
 -- * inverse and division
 -- | We can solve the equation explicitly.
 --
 -- > irred = x^2 + p*x + q
 -- > (a*x + b) * (c*x + d) = (a*c)*x^2 + (a*d+b*c)*x + (b*d)
 -- >                       = (a*d + b*c - a*c*p)*x + (b*d - a*c*q)
 --
 -- and then we want to solve
 --
 -- > b*d       - a*c*q == 1
 -- > a*d + b*c - a*c*p == 0
 --
 -- which has the solution:
 --
 -- > c = - a       / (b^2 - a*b*p + a^2*q)  
 -- > d = (b - a*p) / (b^2 - a*b*p + a^2*q)
 --
 -- Remark: It seems the denominator being zero would mean that our
 -- defining polynomial is not irreducible.
 --
 -- Note: we can optimize for the common case p=0; and also for q=1.
 --
 inv :: F2 -> F2
 inv (F2 b a) = F2 d c where
  denom = b*b - 7*a*a
  c = - a / denom
  d =   b / denom
 div :: F2 -> F2 -> F2
 div u v = mul u (inv v)
 --------------------------------------------------------------------------------
 pow_ :: F2 -> Int -> F2
 pow_ x e = pow x (fromIntegral e)
 pow :: F2 -> Integer -> F2
 pow x e 
  | e == 0    = 1
  | e <  0    = pow (inv x) (negate e)
  | otherwise = go 1 x e
  where
    go !acc _  0     = acc
    go !acc !s !expo = case expo .&. 1 of
      0 -> go acc     (sqr s) (shiftR expo 1)
      _ -> go (acc*s) (sqr s) (shiftR expo 1)
 --------------------------------------------------------------------------------
--- a/reference/src/Field/Goldilocks/Extension/BindC.hs
+++ b/reference/src/Field/Goldilocks/Extension/BindC.hs
@ -0,0 +1,193 @@
 -- | Bindings to a C implementation of the quadratic extension over the Goldilocks prime field
 --
 -- It's probably not significantly (if at all) faster than the naive Haskell combined 
 -- with the fast Goldilocks base field operations, but the C versions should be useful 
 -- for the vector operations, and this way we can test them easily.
 {-# LANGUAGE ForeignFunctionInterface, BangPatterns, NumericUnderscores #-}
 module Field.Goldilocks.Extension.BindC where
 --------------------------------------------------------------------------------
 import Prelude hiding ( div )
 import qualified Prelude
 import Data.Bits
 import Data.Word
 import Data.Ratio
 import Foreign.C
 import Foreign.Ptr
 import Foreign.Storable
 import Foreign.Marshal
 import System.Random
 import System.IO.Unsafe
 import Data.Binary
 import Data.Binary.Get ( getWord64le )
 import Data.Binary.Put ( putWord64le )
 import Text.Printf
 import Field.Goldilocks ( F , Goldilocks(..) )
 import qualified Field.Goldilocks as Goldi
 import Data.Flat
 --------------------------------------------------------------------------------
 type FExt = F2
 data F2 = F2 
  { real :: !F 
  , imag :: !F 
  }
  deriving (Eq)
 --------------------------------------------------------------------------------
 instance Binary F2 where
  put (F2 x y) = put x >> put y
  get = F2 <$> get <*> get
 instance Storable F2 where
  peek ptr = do
    r <- peek (castPtr ptr) 
    i <- peek (castPtr ptr `plusPtr` 8)
    return (F2 r i)
  poke ptr (F2 r i) = do
    poke (castPtr ptr)             r
    poke (castPtr ptr `plusPtr` 8) i
  sizeOf    _ = 16
  alignment _ = 8
 instance Flat F2 where
  sizeInBytes  _ = 16
  sizeInQWords _ = 2
  withFlat (F2 x y) action = allocaBytesAligned 16 8 $ \ptr -> do
    poke (castPtr ptr            ) x
    poke (castPtr ptr `plusPtr` 8) y
    action ptr 
  makeFlat ptr = do
    x <- peek (castPtr ptr            )
    y <- peek (castPtr ptr `plusPtr` 8)
    return (F2 x y)
 --------------------------------------------------------------------------------
 instance Show F2 where
  show (F2 r i) = "(" ++ show r ++ " + j * " ++ show i ++ ")"
 instance Num F2 where
  fromInteger = inj . fromIntegral
  negate = neg
  (+)    = add
  (-)    = sub
  (*)    = mul
  abs    = id
  signum _ = inj 1
 instance Fractional F2 where
  fromRational y = fromInteger (numerator y) `div` fromInteger (denominator y)
  recip  = inv
  (/)    = div
 instance Random F2 where
  -- random :: RandomGen g => g -> (a, g) 
  random  g = let (x,g' ) = random g
                  (y,g'') = random g'
              in  (F2 x y, g'')
  randomR = error "randomR/F2: doesn't make any sense"
 --------------------------------------------------------------------------------
 zero, one, two :: F2
 zero = F2 Goldi.zero Goldi.zero 
 one  = F2 Goldi.one  Goldi.zero 
 two  = F2 Goldi.two  Goldi.zero 
 isZero, isOne :: F2 -> Bool
 isZero (F2 r i) = Goldi.isZero r && Goldi.isZero i
 isOne  (F2 r i) = Goldi.isOne  r && Goldi.isZero i
 --------------------------------------------------------------------------------
 {-# NOINLINE unaryOpIO #-}
 unaryOpIO :: (Ptr Word64 -> Ptr Word64 -> IO ()) -> F2 -> IO F2
 unaryOpIO c_action x = allocaBytesAligned 32 8 $ \ptr1 -> do
  let ptr2 = plusPtr ptr1 16
  poke (castPtr ptr1) x
  c_action ptr1 ptr2
  peek (castPtr ptr2)
 {-# NOINLINE binaryOpIO #-}
 binaryOpIO :: (Ptr Word64 -> Ptr Word64 -> Ptr Word64 -> IO ()) -> F2 -> F2 -> IO F2
 binaryOpIO c_action x y = allocaBytesAligned 48 8 $ \ptr1 -> do
  let ptr2 = plusPtr ptr1 16
  let ptr3 = plusPtr ptr1 32
  poke (castPtr ptr1) x
  poke (castPtr ptr2) y
  c_action ptr1 ptr2 ptr3
  peek (castPtr ptr3)
 --------------------------------------------------------------------------------
 inj :: F -> F2
 inj r = F2 r 0 
 neg, sqr, inv :: F2 -> F2
 neg x = unsafePerformIO (unaryOpIO c_goldilocks_ext_neg x)
 sqr x = unsafePerformIO (unaryOpIO c_goldilocks_ext_sqr x)
 inv x = unsafePerformIO (unaryOpIO c_goldilocks_ext_inv x)
 add, sub, mul, div  :: F2 -> F2 -> F2
 add x y = unsafePerformIO (binaryOpIO c_goldilocks_ext_add x y)
 sub x y = unsafePerformIO (binaryOpIO c_goldilocks_ext_sub x y)
 mul x y = unsafePerformIO (binaryOpIO c_goldilocks_ext_mul x y)
 div x y = unsafePerformIO (binaryOpIO c_goldilocks_ext_div x y)
 {-# NOINLINE sclIO #-}
 sclIO :: F -> F2 -> IO F2
 sclIO (MkGoldilocks s) x  = allocaBytesAligned 32 8 $ \ptr1 -> do
  let ptr2 = plusPtr ptr1 16
  poke (castPtr ptr1) x
  c_goldilocks_ext_scl s ptr1 ptr2
  peek (castPtr ptr2)
 {-# NOINLINE powIO #-}
 powIO :: F2 -> Int -> IO F2
 powIO base expo = allocaBytesAligned 32 8 $ \ptr1 -> do
  let ptr2 = plusPtr ptr1 16
  poke (castPtr ptr1) base
  c_goldilocks_ext_pow ptr1 (fromIntegral expo :: CInt) ptr2
  peek (castPtr ptr2)
 scl :: F -> F2 -> F2
 scl s x = unsafePerformIO (sclIO s x)
 pow_ :: F2 -> Int -> F2
 pow_ b e = unsafePerformIO (powIO b e)
 -- | NOTE: this is technically incorrect (it only works for small exponents), 
 -- but we don't really care
 pow :: F2 -> Integer -> F2
 pow b e = pow_ b (fromInteger e)
 --------------------------------------------------------------------------------
 foreign import ccall unsafe "goldilocks_ext_neg" c_goldilocks_ext_neg :: Ptr Word64               -> Ptr Word64 -> IO ()
 foreign import ccall unsafe "goldilocks_ext_add" c_goldilocks_ext_add :: Ptr Word64 -> Ptr Word64 -> Ptr Word64 -> IO ()
 foreign import ccall unsafe "goldilocks_ext_sub" c_goldilocks_ext_sub :: Ptr Word64 -> Ptr Word64 -> Ptr Word64 -> IO ()
 foreign import ccall unsafe "goldilocks_ext_scl" c_goldilocks_ext_scl ::     Word64 -> Ptr Word64 -> Ptr Word64 -> IO ()
 foreign import ccall unsafe "goldilocks_ext_sqr" c_goldilocks_ext_sqr :: Ptr Word64               -> Ptr Word64 -> IO ()
 foreign import ccall unsafe "goldilocks_ext_mul" c_goldilocks_ext_mul :: Ptr Word64 -> Ptr Word64 -> Ptr Word64 -> IO ()
 foreign import ccall unsafe "goldilocks_ext_inv" c_goldilocks_ext_inv :: Ptr Word64               -> Ptr Word64 -> IO ()
 foreign import ccall unsafe "goldilocks_ext_div" c_goldilocks_ext_div :: Ptr Word64 -> Ptr Word64 -> Ptr Word64 -> IO ()
 foreign import ccall unsafe "goldilocks_ext_pow" c_goldilocks_ext_pow :: Ptr Word64 -> CInt       -> Ptr Word64 -> IO ()
 --------------------------------------------------------------------------------
--- a/reference/src/Field/Goldilocks/Extension/Haskell.hs
+++ b/reference/src/Field/Goldilocks/Extension/Haskell.hs
@ -0,0 +1,197 @@
 -- | Quadratic extension over the Goldilocks field
 --
 -- We use the irreducble polynomial @x^2 - 7@ to be compatible with Plonky3
 module Field.Goldilocks.Extension.Haskell where
 --------------------------------------------------------------------------------
 import Prelude hiding ( div )
 import Data.Bits
 import Data.Ratio
 import System.Random
 import Foreign.Ptr
 import Foreign.Storable
 import Foreign.Marshal
 import Data.Binary
 import Data.Flat
 import Field.Goldilocks ( F )
 import qualified Field.Goldilocks as Goldi
 --------------------------------------------------------------------------------
 type FExt = F2
 data F2 = F2 
  { real :: !F 
  , imag :: !F 
  }
  deriving (Eq)
 --------------------------------------------------------------------------------
 instance Binary F2 where
  put (F2 x y) = put x >> put y
  get = F2 <$> get <*> get
 instance Storable F2 where
  peek ptr = do
    r <- peek (castPtr ptr) 
    i <- peek (castPtr ptr `plusPtr` 8)
    return (F2 r i)
  poke ptr (F2 r i) = do
    poke (castPtr ptr)             r
    poke (castPtr ptr `plusPtr` 8) i
  sizeOf    _ = 16
  alignment _ = 8
 instance Flat F2 where
  sizeInBytes  _ = 16
  sizeInQWords _ = 2
  withFlat (F2 x y) action = allocaBytesAligned 16 8 $ \ptr -> do
    poke (castPtr ptr            ) x
    poke (castPtr ptr `plusPtr` 8) y
    action ptr 
  makeFlat ptr = do
    x <- peek (castPtr ptr            )
    y <- peek (castPtr ptr `plusPtr` 8)
    return (F2 x y)
 --------------------------------------------------------------------------------
 instance Show F2 where
  show (F2 r i) = "(" ++ show r ++ " + j * " ++ show i ++ ")"
 instance Num F2 where
  fromInteger = inj . fromIntegral
  negate = neg
  (+)    = add
  (-)    = sub
  (*)    = mul
  abs    = id
  signum _ = inj 1
 instance Fractional F2 where
  fromRational y = fromInteger (numerator y) `div` fromInteger (denominator y)
  recip  = inv
  (/)    = div
 instance Random F2 where
  -- random :: RandomGen g => g -> (a, g) 
  random  g = let (x,g' ) = random g
                  (y,g'') = random g'
              in  (F2 x y, g'')
  randomR = error "randomR/F2: doesn't make any sense"
 --------------------------------------------------------------------------------
 zero, one, two :: F2
 zero = F2 Goldi.zero Goldi.zero 
 one  = F2 Goldi.one  Goldi.zero 
 two  = F2 Goldi.two  Goldi.zero 
 isZero, isOne :: F2 -> Bool
 isZero (F2 r i) = Goldi.isZero r && Goldi.isZero i
 isOne  (F2 r i) = Goldi.isOne  r && Goldi.isZero i
 --------------------------------------------------------------------------------
 inj :: F -> F2
 inj r = F2 r 0 
 neg :: F2 -> F2
 neg (F2 r i) = F2 (negate r) (negate i)
 add :: F2 -> F2 -> F2
 add (F2 r1 i1) (F2 r2 i2) = F2 (r1 + r2) (i1 + i2)
 sub :: F2 -> F2 -> F2
 sub (F2 r1 i1) (F2 r2 i2) = F2 (r1 - r2) (i1 - i2)
 scl :: F -> F2 -> F2
 scl s (F2 r i) = F2 (s * r) (s * i)
 sqrNaive :: F2 -> F2
 sqrNaive (F2 r i) = F2 r3 i3 where
  r3 = r*r + 7 * i*i
  i3 = 2 * r*i
 mulNaive :: F2 -> F2 -> F2
 mulNaive (F2 r1 i1) (F2 r2 i2) = F2 r3 i3 where
  r3 = r1 * r2 + 7 * i1 * i2
  i3 = r1 * i2 +     i1 * r2
 -- uses Karatsuba trick to have one less multiplications
 mulKaratsuba :: F2 -> F2 -> F2
 mulKaratsuba (F2 r1 i1) (F2 r2 i2) = F2 r3 i3 where
  u = r1*r2
  w = i1*i2
  v = (r1+i1)*(r2+i2)
  r3 = u + 7*w
  i3 = v - u - w
 sqr :: F2 -> F2
 sqr = sqrNaive
 mul :: F2 -> F2 -> F2
 mul = mulKaratsuba
 --------------------------------------------------------------------------------
 -- * inverse and division
 -- | We can solve the equation explicitly.
 --
 -- > irred = x^2 + p*x + q
 -- > (a*x + b) * (c*x + d) = (a*c)*x^2 + (a*d+b*c)*x + (b*d)
 -- >                       = (a*d + b*c - a*c*p)*x + (b*d - a*c*q)
 --
 -- and then we want to solve
 --
 -- > b*d       - a*c*q == 1
 -- > a*d + b*c - a*c*p == 0
 --
 -- which has the solution:
 --
 -- > c = - a       / (b^2 - a*b*p + a^2*q)  
 -- > d = (b - a*p) / (b^2 - a*b*p + a^2*q)
 --
 -- Remark: It seems the denominator being zero would mean that our
 -- defining polynomial is not irreducible.
 --
 -- Note: we can optimize for the common case p=0; and also for q=1.
 --
 inv :: F2 -> F2
 inv (F2 b a) = F2 d c where
  denom = b*b - 7*a*a
  c = - a / denom
  d =   b / denom
 div :: F2 -> F2 -> F2
 div u v = mul u (inv v)
 --------------------------------------------------------------------------------
 pow_ :: F2 -> Int -> F2
 pow_ x e = pow x (fromIntegral e)
 pow :: F2 -> Integer -> F2
 pow x e 
  | e == 0    = 1
  | e <  0    = pow (inv x) (negate e)
  | otherwise = go 1 x e
  where
    go !acc _  0     = acc
    go !acc !s !expo = case expo .&. 1 of
      0 -> go acc     (sqr s) (shiftR expo 1)
      _ -> go (acc*s) (sqr s) (shiftR expo 1)
 --------------------------------------------------------------------------------
--- a/reference/src/Field/Goldilocks/Fast.hs
+++ b/reference/src/Field/Goldilocks/Fast.hs
@ -16,6 +16,7 @@ import Data.Ratio
 import Foreign.C
 import Foreign.Ptr
 import Foreign.Storable
 import Foreign.Marshal
 import System.Random
@ -25,6 +26,8 @@ import Data.Binary.Put ( putWord64le )
 import Text.Printf
 import Data.Flat
 --------------------------------------------------------------------------------
 type F = Goldilocks
@ -51,6 +54,12 @@ instance Storable F where
  sizeOf    _ = 8
  alignment _ = 8
 instance Flat Goldilocks where
  sizeInBytes  _ = 8
  sizeInQWords _ = 1
  withFlat (MkGoldilocks x) action = alloca $ \ptr -> poke ptr x >> action ptr
  makeFlat ptr = MkGoldilocks <$> peek ptr
 --------------------------------------------------------------------------------
 newtype Goldilocks 
--- a/reference/src/Field/Goldilocks/Slow.hs
+++ b/reference/src/Field/Goldilocks/Slow.hs
@ -13,6 +13,10 @@ import Data.Bits
 import Data.Word
 import Data.Ratio
 import Foreign.Ptr
 import Foreign.Storable
 import Foreign.Marshal
 import System.Random
 import Data.Binary
@ -21,6 +25,8 @@ import Data.Binary.Put ( putWord64le )
 import Text.Printf
 import Data.Flat
 --------------------------------------------------------------------------------
 type F = Goldilocks
@ -34,10 +40,24 @@ toF = mkGoldilocks . fromIntegral
 intToF :: Int -> F
 intToF = mkGoldilocks . fromIntegral
 --------------------------------------------------------------------------------
 instance Binary F where
  put x = putWord64le (fromF x)
  get = toF <$> getWord64le
 instance Storable F where
  peek ptr                   = (MkGoldilocks . fromIntegral) <$> peek (castPtr ptr :: Ptr Word64)
  poke ptr  (MkGoldilocks x) = poke (castPtr ptr :: Ptr Word64) (fromInteger x :: Word64)
  sizeOf    _ = 8
  alignment _ = 8
 instance Flat Goldilocks where
  sizeInBytes  _ = 8
  sizeInQWords _ = 1
  withFlat (MkGoldilocks x) action = alloca $ \ptr -> poke ptr (fromInteger x :: Word64) >> action ptr
  makeFlat ptr = (MkGoldilocks . fromIntegral) <$> peek ptr
 --------------------------------------------------------------------------------
 newtype Goldilocks 
--- a/reference/src/cbits/compile.sh
+++ b/reference/src/cbits/compile.sh
@ -1,4 +1,7 @@
 #!/bin/bash
 gcc -c -O2 goldilocks.c
 gcc -c -O2 goldilocks_ext.c
 gcc -c -O2 monolith.c
 gcc -c -O2 ntt.c
--- a/reference/src/cbits/goldilocks.c
+++ b/reference/src/cbits/goldilocks.c
@ -1,4 +1,6 @@
 // the "Goldilocks" prime field of size `p = 2^64 - 2^32 + 1`
 #include <stdint.h>
 #include <stdio.h>      // for testing only
 #include <assert.h>
@ -126,6 +128,22 @@ uint64_t goldilocks_mul_small(uint64_t x, uint32_t y) {
 //------------------------------------------------------------------------------
 uint64_t goldilocks_mul_by_2(uint64_t x) {
  return goldilocks_add( x , x ); 
 }
 uint64_t goldilocks_a_plus_7b(uint64_t a, uint64_t b) {
  __uint128_t z = (__uint128_t)a + 7 * (__uint128_t)b ;
  return goldilocks_rdc_small( z );
 }
 uint64_t goldilocks_a_minus_7b(uint64_t a, uint64_t b) {
  uint64_t b7 = goldilocks_rdc_small ( 7 * (__uint128_t)b ) ;
  return goldilocks_sub( a , b7 );
 }
 //------------------------------------------------------------------------------
 uint64_t goldilocks_euclid(uint64_t x0, uint64_t y0, uint64_t u0, uint64_t v0) {
  uint64_t x = x0;
@ -178,6 +196,15 @@ uint64_t goldilocks_inv(uint64_t a) {
  return goldilocks_div(1, a);
 }
 uint64_t goldilocks_div_by_2(uint64_t a) {
  if (a & 1) {
    return ((a >> 1) + GOLDILOCKS_HALFPRIME_PLUS1);
  }
  else {
    return (a >> 1);
  }
 }
 //------------------------------------------------------------------------------
 uint64_t goldilocks_pow(uint64_t base, int expo) {
@ -193,7 +220,7 @@ uint64_t goldilocks_pow(uint64_t base, int expo) {
      acc = goldilocks_mul( acc, sq );
    } 
    if (e > 0) {
-      sq = goldilocks_mul( sq , sq );
+      sq = goldilocks_sqr( sq );
      e  = e >> 1;
    }
  }
--- a/reference/src/cbits/goldilocks.h
+++ b/reference/src/cbits/goldilocks.h
@ -1,4 +1,6 @@
 // the "Goldilocks" prime field of size `p = 2^64 - 2^32 + 1`
 #include <stdint.h>
 //------------------------------------------------------------------------------
@ -19,6 +21,11 @@ uint64_t goldilocks_inv(uint64_t a);
 uint64_t goldilocks_div(uint64_t a, uint64_t b);
 uint64_t goldilocks_pow(uint64_t b, int e);
 uint64_t goldilocks_mul_by_2  (uint64_t x);
 uint64_t goldilocks_div_by_2  (uint64_t a);
 uint64_t goldilocks_a_plus_7b (uint64_t a, uint64_t b);
 uint64_t goldilocks_a_minus_7b(uint64_t a, uint64_t b);
 //------------------------------------------------------------------------------
 uint64_t goldilocks_rdc          (__uint128_t x);
--- a/reference/src/cbits/goldilocks_ext.c
+++ b/reference/src/cbits/goldilocks_ext.c
@ -0,0 +1,147 @@
 // quadratic field extension F[x] = F(x) / (x^2 - 7) over the Goldilocks field
 #include "goldilocks.h"
 #include "goldilocks_ext.h"
 //------------------------------------------------------------------------------
 void goldilocks_ext_inj( uint64_t input , uint64_t *out ) {
  out[0] = input;
  out[1] = 0;
 }
 void goldilocks_ext_set_one( uint64_t *out ) {
  out[0] = 1;
  out[1] = 0;
 }
 //------------------------------------------------------------------------------
 void goldilocks_ext_neg(const uint64_t *x , uint64_t *out) {
  out[0] = goldilocks_neg( x[0] );
  out[1] = goldilocks_neg( x[1] );
 }
 void goldilocks_ext_add(const uint64_t *x , const uint64_t *y , uint64_t *out) {
  out[0] = goldilocks_add( x[0] , y[0] );
  out[1] = goldilocks_add( x[1] , y[1] );
 }
 void goldilocks_ext_sub(const uint64_t *x , const uint64_t *y , uint64_t *out) {
  out[0] = goldilocks_sub( x[0] , y[0] );
  out[1] = goldilocks_sub( x[1] , y[1] );
 }
 void goldilocks_ext_scl(uint64_t s , const uint64_t *x , uint64_t *out) {
  out[0] = goldilocks_mul( s , x[0] );
  out[1] = goldilocks_mul( s , x[1] );
 }
 //------------------------------------------------------------------------------
 /*
 sqrNaive :: F2 -> F2
 sqrNaive (F2 r i) = F2 r3 i3 where
  r3 = r*r + 7 * i*i
  i3 = 2 * r*i
 -- uses Karatsuba trick to have one less multiplications
 mulKaratsuba :: F2 -> F2 -> F2
 mulKaratsuba (F2 r1 i1) (F2 r2 i2) = F2 r3 i3 where
  u = r1*r2
  w = i1*i2
  v = (r1+i1)*(r2+i2)
  r3 = u + 7*w
  i3 = v - u - w
 */
 void goldilocks_ext_sqr(const uint64_t *x , uint64_t *out) {
  uint64_t u = goldilocks_a_plus_7b( goldilocks_sqr(x[0]) , goldilocks_sqr( x[1] )) ;
  uint64_t v = goldilocks_mul_by_2( goldilocks_mul( x[0] , x[1] ) );
  out[0] = u;   
  out[1] = v;     // because we want inplace to work too
 }
 // uses Karatsuba trick to have one less multiplications
 void goldilocks_ext_mul(const uint64_t *x , const uint64_t *y , uint64_t *out) {
  uint64_t u = goldilocks_mul( x[0] , y[0] );
  uint64_t w = goldilocks_mul( x[1] , y[1] );
  uint64_t v = goldilocks_mul( goldilocks_add( x[0] , x[1] ) , goldilocks_add( y[0] , y[1] ) );
  out[0] = goldilocks_a_plus_7b( u , w );
  out[1] = goldilocks_sub( v , goldilocks_add( u , w ) );
 }
 //------------------------------------------------------------------------------
 // We can solve the equation explicitly.
 //
 // > irred = x^2 + p*x + q
 // > (a*x + b) * (c*x + d) = (a*c)*x^2 + (a*d+b*c)*x + (b*d)
 // >                       = (a*d + b*c - a*c*p)*x + (b*d - a*c*q)
 //
 // and then we want to solve
 //
 // > b*d       - a*c*q == 1
 // > a*d + b*c - a*c*p == 0
 //
 // which has the solution:
 //
 // > c = - a       / (b^2 - a*b*p + a^2*q)  
 // > d = (b - a*p) / (b^2 - a*b*p + a^2*q)
 //
 // Remark: It seems the denominator being zero would mean that our
 // defining polynomial is not irreducible.
 //
 // Note: we can optimize for the common case p=0; and also for q=1.
 //
 void goldilocks_ext_inv(const uint64_t *ptr , uint64_t *out) {
  uint64_t b = ptr[0];
  uint64_t a = ptr[1];   // (a*x + b)
  uint64_t denom = goldilocks_a_minus_7b( goldilocks_sqr(b) , goldilocks_sqr(a) );
  uint64_t denom_inv = goldilocks_inv( denom );
  out[0] =                 goldilocks_mul( b , denom_inv )  ;
  out[1] = goldilocks_neg( goldilocks_mul( a , denom_inv ) );  
 }
 void goldilocks_ext_div(const uint64_t *x , const uint64_t *y , uint64_t *out) {
  uint64_t invy[2];
  goldilocks_ext_inv( y , invy );
  goldilocks_ext_mul( x , invy , out );
 }
 //------------------------------------------------------------------------------
 void goldilocks_ext_pow(const uint64_t *base , int expo , uint64_t *out) {
  if (expo == 0) { 
    goldilocks_ext_set_one(out);
    return;
  }
  if (expo <  0) { 
    uint64_t base_inv[2];
    goldilocks_ext_inv( base , base_inv );
    goldilocks_ext_pow( base_inv , -expo , out ); 
    return;
  }
  int      e   = expo;
  uint64_t sq [2] ; sq[0] = base[0] ; sq[1] = base[1] ;
  uint64_t acc[2] ; goldilocks_ext_set_one( acc );
  while (e != 0) { 
    if ((e & 1) != 0) {
      goldilocks_ext_mul( acc , sq , acc );
    } 
    if (e > 0) {
      goldilocks_ext_sqr( sq , sq );
      e  = e >> 1;
    }
  }
  out[0] = acc[0];
  out[1] = acc[1];
 }
 //------------------------------------------------------------------------------
--- a/reference/src/cbits/goldilocks_ext.h
+++ b/reference/src/cbits/goldilocks_ext.h
@ -0,0 +1,23 @@
 // quadratic field extension F[x] = F(x) / (x^2 - 7) over the Goldilocks field
 #include <stdint.h>
 #include "goldilocks.h"
 //------------------------------------------------------------------------------
 void goldilocks_ext_inj( uint64_t input , uint64_t *out );
 void goldilocks_ext_set_one( uint64_t *out );
 void goldilocks_ext_neg(const uint64_t *x                     , uint64_t *out);
 void goldilocks_ext_add(const uint64_t *x , const uint64_t *y , uint64_t *out);
 void goldilocks_ext_sub(const uint64_t *x , const uint64_t *y , uint64_t *out);
 void goldilocks_ext_scl(       uint64_t s , const uint64_t *x , uint64_t *out);
 void goldilocks_ext_sqr(const uint64_t *x                     , uint64_t *out);
 void goldilocks_ext_mul(const uint64_t *x , const uint64_t *y , uint64_t *out);
 void goldilocks_ext_inv(const uint64_t *x                     , uint64_t *out);
 void goldilocks_ext_div(const uint64_t *x , const uint64_t *y , uint64_t *out);
 void goldilocks_ext_pow(const uint64_t *b , int e             , uint64_t *out);
 //------------------------------------------------------------------------------
--- a/reference/src/runi.sh
+++ b/reference/src/runi.sh
@ -1,3 +1,3 @@
 #!/bin/bash
-ghci testMain.hs cbits/goldilocks.o cbits/monolith.o 
+ghci testMain.hs cbits/goldilocks.o cbits/goldilocks_ext.o cbits/monolith.o cbits/ntt.o
`@ -1,3 +1,3 @@`
	`#!/bin/bash`	`#!/bin/bash`

	`ghci testMain.hs cbits/goldilocks.o cbits/monolith.o`	`ghci testMain.hs cbits/goldilocks.o cbits/goldilocks_ext.o cbits/monolith.o cbits/ntt.o`