From c312210878588df1f3a3e92de119e5dda6cd2a03 Mon Sep 17 00:00:00 2001
From: Mamy Ratsimbazafy <mamy_github@numforge.co>
Date: Sat, 6 Feb 2021 16:28:38 +0100
Subject: [PATCH] Rework towering (#148)

* naive removal of out-of-place mul by non residue

* Use {.inline.} in a consistent manner across the codebase

* Handle aliasing for quadratic multiplication

* reorg optimization

* Handle aliasing for quadratic squaring

* handle aliasing in mul_sparse_complex_by_0y

* Rework multiplication by nonresidue, assume tower and twist use same non-residue

* continue rework

* continue on non-residues

* Remove "NonResidue *" calls

* handle aliasing in Chung-Hasan SQR2

* Handla aliasing in Chung-Hasan SQR3

* Use one less temporary in Chung Hasan sqr2

* handle aliasing in cubic extensions

* merge extension tower in the same file to reduce duplicate proc and allow better inlining

* handle aliasing in cubic inversion

* drop out-of-place proc from BigInt and finite fields as well

* less copies in line_projective

* remove a copy in fp12 by lines
---
 constantine/arithmetic/bigints.nim            |   9 +-
 constantine/arithmetic/finite_fields.nim      | 111 +-
 .../arithmetic/finite_fields_double_width.nim |  21 +-
 .../arithmetic/finite_fields_inversion.nim    |  13 +-
 .../arithmetic/finite_fields_square_root.nim  |  45 +-
 constantine/arithmetic/limbs.nim              |   2 +-
 ...imbs_double_width.nim => limbs_extmul.nim} |  60 +-
 constantine/arithmetic/limbs_modular.nim      |   2 +-
 constantine/arithmetic/limbs_montgomery.nim   |  48 +-
 constantine/config/curves_declaration.nim     |  25 +-
 constantine/config/curves_parser.nim          |  27 +-
 constantine/config/curves_prop_core.nim       |  28 +-
 .../elliptic/ec_shortweierstrass_affine.nim   |   2 +
 constantine/isogeny/frobenius.nim             |  42 -
 constantine/pairing/lines_common.nim          |  16 +
 constantine/pairing/lines_projective.nim      |  44 +-
 constantine/pairing/mul_fp12_by_lines.nim     |  13 +-
 constantine/pairing/pairing_bls12.nim         |   3 +-
 constantine/pairing/pairing_bn.nim            |   3 +-
 .../cubic_extensions.nim                      | 228 ----
 .../exponentiations.nim                       |   4 +-
 .../extension_fields.nim                      | 977 ++++++++++++++++++
 .../quadratic_extensions.nim                  | 365 -------
 .../tower_field_extensions/tower_common.nim   | 263 -----
 .../tower_instantiation.nim                   | 300 +++---
 docs/optimizations.md                         |  21 +-
 helpers/prng_unsafe.nim                       |   2 +-
 tests/t_finite_fields.nim                     |  28 +-
 28 files changed, 1417 insertions(+), 1285 deletions(-)
 rename constantine/arithmetic/{limbs_double_width.nim => limbs_extmul.nim} (82%)
 delete mode 100644 constantine/tower_field_extensions/cubic_extensions.nim
 create mode 100644 constantine/tower_field_extensions/extension_fields.nim
 delete mode 100644 constantine/tower_field_extensions/quadratic_extensions.nim
 delete mode 100644 constantine/tower_field_extensions/tower_common.nim

diff --git a/constantine/arithmetic/bigints.nim b/constantine/arithmetic/bigints.nim
index c7a35d8..9236076 100644
--- a/constantine/arithmetic/bigints.nim
+++ b/constantine/arithmetic/bigints.nim
@@ -10,7 +10,7 @@ import
   ../config/[common, type_bigint],
   ../primitives,
   ./limbs,
-  ./limbs_double_width,
+  ./limbs_extmul,
   ./limbs_modular,
   ./limbs_montgomery
 
@@ -347,7 +347,7 @@ func bit0*(a: BigInt): Ct[uint8] =
 # Multiplication by small cosntants
 # ------------------------------------------------------------
 
-func `*=`*(a: var BigInt, b: static int) {.inline.} =
+func `*=`*(a: var BigInt, b: static int) =
   ## Multiplication by a small integer known at compile-time
   # Implementation:
   #
@@ -420,11 +420,6 @@ func `*=`*(a: var BigInt, b: static int) {.inline.} =
   else:
     {.error: "Multiplication by this small int not implemented".}
 
-func `*`*(b: static int, a: BigInt): BigInt {.noinit, inline.} =
-  ## Multiplication by a small integer known at compile-time
-  result = a
-  result *= b
-
 # Division by constants
 # ------------------------------------------------------------
 
diff --git a/constantine/arithmetic/finite_fields.nim b/constantine/arithmetic/finite_fields.nim
index b3d0209..3ed4c12 100644
--- a/constantine/arithmetic/finite_fields.nim
+++ b/constantine/arithmetic/finite_fields.nim
@@ -43,6 +43,7 @@ export Fp, Fr, FF
 
 # No exceptions allowed
 {.push raises: [].}
+{.push inline.}
 
 # ############################################################
 #
@@ -50,14 +51,14 @@ export Fp, Fr, FF
 #
 # ############################################################
 
-func fromBig*(dst: var FF, src: BigInt) {.inline.}=
+func fromBig*(dst: var FF, src: BigInt) =
   ## Convert a BigInt to its Montgomery form
   when nimvm:
     dst.mres.montyResidue_precompute(src, FF.fieldMod(), FF.getR2modP(), FF.getNegInvModWord())
   else:
     dst.mres.montyResidue(src, FF.fieldMod(), FF.getR2modP(), FF.getNegInvModWord(), FF.canUseNoCarryMontyMul())
 
-func fromBig*[C: static Curve](T: type FF[C], src: BigInt): FF[C] {.noInit, inline.} =
+func fromBig*[C: static Curve](T: type FF[C], src: BigInt): FF[C] {.noInit.} =
   ## Convert a BigInt to its Montgomery form
   result.fromBig(src)
 
@@ -70,14 +71,14 @@ func toBig*(src: FF): auto {.noInit, inline.} =
 # Copy
 # ------------------------------------------------------------
 
-func ccopy*(a: var FF, b: FF, ctl: SecretBool) {.inline, meter.} =
+func ccopy*(a: var FF, b: FF, ctl: SecretBool) {.meter.} =
   ## Constant-time conditional copy
   ## If ctl is true: b is copied into a
   ## if ctl is false: b is not copied and a is unmodified
   ## Time and memory accesses are the same whether a copy occurs or not
   ccopy(a.mres, b.mres, ctl)
 
-func cswap*(a, b: var FF, ctl: CTBool) {.inline, meter.} =
+func cswap*(a, b: var FF, ctl: CTBool) {.meter.} =
   ## Swap ``a`` and ``b`` if ``ctl`` is true
   ##
   ## Constant-time:
@@ -105,34 +106,34 @@ func cswap*(a, b: var FF, ctl: CTBool) {.inline, meter.} =
 # In practice I'm not aware of such prime being using in elliptic curves.
 # 2^127 - 1 and 2^521 - 1 are used but 127 and 521 are not multiple of 32/64
 
-func `==`*(a, b: FF): SecretBool {.inline.} =
+func `==`*(a, b: FF): SecretBool =
   ## Constant-time equality check
   a.mres == b.mres
 
-func isZero*(a: FF): SecretBool {.inline.} =
+func isZero*(a: FF): SecretBool =
   ## Constant-time check if zero
   a.mres.isZero()
 
-func isOne*(a: FF): SecretBool {.inline.} =
+func isOne*(a: FF): SecretBool =
   ## Constant-time check if one
   a.mres == FF.getMontyOne()
 
-func isMinusOne*(a: FF): SecretBool {.inline.} =
+func isMinusOne*(a: FF): SecretBool =
   ## Constant-time check if -1 (mod p)
   a.mres == FF.getMontyPrimeMinus1()
 
-func setZero*(a: var FF) {.inline.} =
+func setZero*(a: var FF) =
   ## Set ``a`` to zero
   a.mres.setZero()
 
-func setOne*(a: var FF) {.inline.} =
+func setOne*(a: var FF) =
   ## Set ``a`` to one
   # Note: we need 1 in Montgomery residue form
   # TODO: Nim codegen is not optimal it uses a temporary
   #       Check if the compiler optimizes it away
   a.mres = FF.getMontyOne()
 
-func `+=`*(a: var FF, b: FF) {.inline, meter.} =
+func `+=`*(a: var FF, b: FF) {.meter.} =
   ## In-place addition modulo p
   when UseASM_X86_64 and a.mres.limbs.len <= 6: # TODO: handle spilling
     addmod_asm(a.mres.limbs, a.mres.limbs, b.mres.limbs, FF.fieldMod().limbs)
@@ -141,7 +142,7 @@ func `+=`*(a: var FF, b: FF) {.inline, meter.} =
     overflowed = overflowed or not(a.mres < FF.fieldMod())
     discard csub(a.mres, FF.fieldMod(), overflowed)
 
-func `-=`*(a: var FF, b: FF) {.inline, meter.} =
+func `-=`*(a: var FF, b: FF) {.meter.} =
   ## In-place substraction modulo p
   when UseASM_X86_64 and a.mres.limbs.len <= 6: # TODO: handle spilling
     submod_asm(a.mres.limbs, a.mres.limbs, b.mres.limbs, FF.fieldMod().limbs)
@@ -149,7 +150,7 @@ func `-=`*(a: var FF, b: FF) {.inline, meter.} =
     let underflowed = sub(a.mres, b.mres)
     discard cadd(a.mres, FF.fieldMod(), underflowed)
 
-func double*(a: var FF) {.inline, meter.} =
+func double*(a: var FF) {.meter.} =
   ## Double ``a`` modulo p
   when UseASM_X86_64 and a.mres.limbs.len <= 6: # TODO: handle spilling
     addmod_asm(a.mres.limbs, a.mres.limbs, a.mres.limbs, FF.fieldMod().limbs)
@@ -158,7 +159,7 @@ func double*(a: var FF) {.inline, meter.} =
     overflowed = overflowed or not(a.mres < FF.fieldMod())
     discard csub(a.mres, FF.fieldMod(), overflowed)
 
-func sum*(r: var FF, a, b: FF) {.inline, meter.} =
+func sum*(r: var FF, a, b: FF) {.meter.} =
   ## Sum ``a`` and ``b`` into ``r`` modulo p
   ## r is initialized/overwritten
   when UseASM_X86_64 and a.mres.limbs.len <= 6: # TODO: handle spilling
@@ -168,11 +169,11 @@ func sum*(r: var FF, a, b: FF) {.inline, meter.} =
     overflowed = overflowed or not(r.mres < FF.fieldMod())
     discard csub(r.mres, FF.fieldMod(), overflowed)
 
-func sumNoReduce*(r: var FF, a, b: FF) {.inline, meter.} =
+func sumNoReduce*(r: var FF, a, b: FF) {.meter.} =
   ## Sum ``a`` and ``b`` into ``r`` without reduction
   discard r.mres.sum(a.mres, b.mres)
 
-func diff*(r: var FF, a, b: FF) {.inline, meter.} =
+func diff*(r: var FF, a, b: FF) {.meter.} =
   ## Substract `b` from `a` and store the result into `r`.
   ## `r` is initialized/overwritten
   ## Requires r != b
@@ -182,12 +183,12 @@ func diff*(r: var FF, a, b: FF) {.inline, meter.} =
     var underflowed = r.mres.diff(a.mres, b.mres)
     discard cadd(r.mres, FF.fieldMod(), underflowed)
 
-func diffNoReduce*(r: var FF, a, b: FF) {.inline, meter.} =
+func diffNoReduce*(r: var FF, a, b: FF) {.meter.} =
   ## Substract `b` from `a` and store the result into `r`
   ## without reduction
   discard r.mres.diff(a.mres, b.mres)
 
-func double*(r: var FF, a: FF) {.inline, meter.} =
+func double*(r: var FF, a: FF) {.meter.} =
   ## Double ``a`` into ``r``
   ## `r` is initialized/overwritten
   when UseASM_X86_64 and a.mres.limbs.len <= 6: # TODO: handle spilling
@@ -197,16 +198,16 @@ func double*(r: var FF, a: FF) {.inline, meter.} =
     overflowed = overflowed or not(r.mres < FF.fieldMod())
     discard csub(r.mres, FF.fieldMod(), overflowed)
 
-func prod*(r: var FF, a, b: FF) {.inline, meter.} =
+func prod*(r: var FF, a, b: FF) {.meter.} =
   ## Store the product of ``a`` by ``b`` modulo p into ``r``
   ## ``r`` is initialized / overwritten
   r.mres.montyMul(a.mres, b.mres, FF.fieldMod(), FF.getNegInvModWord(), FF.canUseNoCarryMontyMul())
 
-func square*(r: var FF, a: FF) {.inline, meter.} =
+func square*(r: var FF, a: FF) {.meter.} =
   ## Squaring modulo p
   r.mres.montySquare(a.mres, FF.fieldMod(), FF.getNegInvModWord(), FF.canUseNoCarryMontySquare())
 
-func neg*(r: var FF, a: FF) {.inline, meter.} =
+func neg*(r: var FF, a: FF) {.meter.} =
   ## Negate modulo p
   when UseASM_X86_64:
     negmod_asm(r.mres.limbs, a.mres.limbs, FF.fieldMod().limbs)
@@ -221,11 +222,11 @@ func neg*(r: var FF, a: FF) {.inline, meter.} =
     t.mres.czero(isZero)
     r = t
 
-func neg*(a: var FF) {.inline, meter.} =
+func neg*(a: var FF) {.meter.} =
   ## Negate modulo p
   a.neg(a)
 
-func div2*(a: var FF) {.inline, meter.} =
+func div2*(a: var FF) {.meter.} =
   ## Modular division by 2
   a.mres.div2_modular(FF.getPrimePlus1div2())
 
@@ -269,7 +270,7 @@ func csub*(a: var FF, b: FF, ctl: SecretBool) {.meter.} =
 #
 # Internally those procedures will allocate extra scratchspace on the stack
 
-func pow*(a: var FF, exponent: BigInt) {.inline.} =
+func pow*(a: var FF, exponent: BigInt) =
   ## Exponentiation modulo p
   ## ``a``: a field element to be exponentiated
   ## ``exponent``: a big integer
@@ -282,7 +283,7 @@ func pow*(a: var FF, exponent: BigInt) {.inline.} =
     FF.canUseNoCarryMontySquare()
   )
 
-func pow*(a: var FF, exponent: openarray[byte]) {.inline.} =
+func pow*(a: var FF, exponent: openarray[byte]) =
   ## Exponentiation modulo p
   ## ``a``: a field element to be exponentiated
   ## ``exponent``: a big integer in canonical big endian representation
@@ -295,7 +296,7 @@ func pow*(a: var FF, exponent: openarray[byte]) {.inline.} =
     FF.canUseNoCarryMontySquare()
   )
 
-func powUnsafeExponent*(a: var FF, exponent: BigInt) {.inline.} =
+func powUnsafeExponent*(a: var FF, exponent: BigInt) =
   ## Exponentiation modulo p
   ## ``a``: a field element to be exponentiated
   ## ``exponent``: a big integer
@@ -315,7 +316,7 @@ func powUnsafeExponent*(a: var FF, exponent: BigInt) {.inline.} =
     FF.canUseNoCarryMontySquare()
   )
 
-func powUnsafeExponent*(a: var FF, exponent: openarray[byte]) {.inline.} =
+func powUnsafeExponent*(a: var FF, exponent: openarray[byte]) =
   ## Exponentiation modulo p
   ## ``a``: a field element to be exponentiated
   ## ``exponent``: a big integer a big integer in canonical big endian representation
@@ -342,47 +343,27 @@ func powUnsafeExponent*(a: var FF, exponent: openarray[byte]) {.inline.} =
 # ############################################################
 #
 # This implements extra primitives for ergonomics.
-# The in-place ones should be preferred as they avoid copies on assignment
-# Two kinds:
-# - Those that return a field element
-# - Those that internally allocate a temporary field element
 
-func `+`*(a, b: FF): FF {.noInit, inline, meter.} =
-  ## Addition modulo p
-  result.sum(a, b)
-
-func `-`*(a, b: FF): FF {.noInit, inline, meter.} =
-  ## Substraction modulo p
-  result.diff(a, b)
-
-func `*`*(a, b: FF): FF {.noInit, inline, meter.} =
-  ## Multiplication modulo p
-  ##
-  ## It is recommended to assign with {.noInit.}
-  ## as FF elements are usually large and this
-  ## routine will zero init internally the result.
-  result.prod(a, b)
-
-func `*=`*(a: var FF, b: FF) {.inline, meter.} =
+func `*=`*(a: var FF, b: FF) {.meter.} =
   ## Multiplication modulo p
   a.prod(a, b)
 
-func square*(a: var FF) {.inline, meter.} =
+func square*(a: var FF) {.meter.} =
   ## Squaring modulo p
   a.mres.montySquare(a.mres, FF.fieldMod(), FF.getNegInvModWord(), FF.canUseNoCarryMontySquare())
 
-func square_repeated*(r: var FF, num: int) {.inline, meter.} =
+func square_repeated*(r: var FF, num: int) {.meter.} =
   ## Repeated squarings
   for _ in 0 ..< num:
     r.square()
 
-func square_repeated*(r: var FF, a: FF, num: int) {.inline, meter.} =
+func square_repeated*(r: var FF, a: FF, num: int) {.meter.} =
   ## Repeated squarings
   r.square(a)
   for _ in 1 ..< num:
     r.square()
 
-func `*=`*(a: var FF, b: static int) {.inline.} =
+func `*=`*(a: var FF, b: static int) =
   ## Multiplication by a small integer known at compile-time
   # Implementation:
   # We don't want to go convert the integer to the Montgomery domain (O(n²))
@@ -464,7 +445,27 @@ func `*=`*(a: var FF, b: static int) {.inline.} =
   else:
     {.error: "Multiplication by this small int not implemented".}
 
-func `*`*(b: static int, a: FF): FF {.noinit, inline.} =
+func prod*(r: var FF, a: FF, b: static int) =
   ## Multiplication by a small integer known at compile-time
-  result = a
-  result *= b
+  const negate = b < 0
+  const b = if negate: -b
+            else: b
+  when negate:
+    r.neg(a)
+  else:
+    r = a
+  r *= b
+
+template mulCheckSparse*(a: var Fp, b: Fp) =
+  ## Multiplication with optimization for sparse inputs
+  when b.isOne().bool:
+    discard
+  elif b.isZero().bool:
+    a.setZero()
+  elif b.isMinusOne().bool:
+    a.neg()
+  else:
+    a *= b
+
+{.pop.} # inline
+{.pop.} # raises no exceptions
diff --git a/constantine/arithmetic/finite_fields_double_width.nim b/constantine/arithmetic/finite_fields_double_width.nim
index 1762c0e..a9a94db 100644
--- a/constantine/arithmetic/finite_fields_double_width.nim
+++ b/constantine/arithmetic/finite_fields_double_width.nim
@@ -12,7 +12,7 @@ import
   ./bigints,
   ./finite_fields,
   ./limbs,
-  ./limbs_double_width,
+  ./limbs_extmul,
   ./limbs_montgomery
 
 when UseASM_X86_64:
@@ -28,18 +28,22 @@ template doubleWidth*(T: typedesc[Fp]): typedesc =
   ## Return the double-width type matching with Fp
   FpDbl[T.C]
 
-func `==`*(a, b: FpDbl): SecretBool {.inline.} =
+# No exceptions allowed
+{.push raises: [].}
+{.push inline.}
+
+func `==`*(a, b: FpDbl): SecretBool =
   a.limbs2x == b.limbs2x
 
-func mulNoReduce*(r: var FpDbl, a, b: Fp) {.inline.} =
+func mulNoReduce*(r: var FpDbl, a, b: Fp) =
   ## Store the product of ``a`` by ``b`` into ``r``
   r.limbs2x.prod(a.mres.limbs, b.mres.limbs)
 
-func squareNoReduce*(r: var FpDbl, a: Fp) {.inline.} =
+func squareNoReduce*(r: var FpDbl, a: Fp) =
   ## Store the square of ``a`` into ``r``
   r.limbs2x.square(a.mres.limbs)
 
-func reduce*(r: var Fp, a: FpDbl) {.inline.} =
+func reduce*(r: var Fp, a: FpDbl) =
   ## Reduce a double-width field element into r
   const N = r.mres.limbs.len
   montyRed(
@@ -54,7 +58,7 @@ func diffNoReduce*(r: var FpDbl, a, b: FpDbl) =
   ## Double-width substraction without reduction
   discard r.limbs2x.diff(a.limbs2x, b.limbs2x)
 
-func diff*(r: var FpDbl, a, b: FpDbl) {.inline.}=
+func diff*(r: var FpDbl, a, b: FpDbl) =
   ## Double-width modular substraction
   when UseASM_X86_64:
     sub2x_asm(r.limbs2x, a.limbs2x, b.limbs2x, FpDbl.C.Mod.limbs)
@@ -69,6 +73,9 @@ func diff*(r: var FpDbl, a, b: FpDbl) {.inline.}=
       addC(carry, sum, r.limbs2x[i+N], M.limbs[i], carry)
       underflowed.ccopy(r.limbs2x[i+N], sum)
 
-func `-=`*(a: var FpDbl, b: FpDbl) {.inline.}=
+func `-=`*(a: var FpDbl, b: FpDbl) =
   ## Double-width modular substraction
   a.diff(a, b)
+
+{.pop.} # inline
+{.pop.} # raises no exceptions
diff --git a/constantine/arithmetic/finite_fields_inversion.nim b/constantine/arithmetic/finite_fields_inversion.nim
index f26c803..6f04bb9 100644
--- a/constantine/arithmetic/finite_fields_inversion.nim
+++ b/constantine/arithmetic/finite_fields_inversion.nim
@@ -19,13 +19,17 @@ export zoo_inversions
 #
 # ############################################################
 
-func inv_euclid*(r: var Fp, a: Fp) {.inline.} =
+# No exceptions allowed
+{.push raises: [].}
+{.push inline.}
+
+func inv_euclid*(r: var Fp, a: Fp) =
   ## Inversion modulo p via
   ## Niels Moller constant-time version of
   ## Stein's GCD derived from extended binary Euclid algorithm
   r.mres.steinsGCD(a.mres, Fp.getR2modP(), Fp.C.Mod, Fp.getPrimePlus1div2())
 
-func inv*(r: var Fp, a: Fp) {.inline.} =
+func inv*(r: var Fp, a: Fp) =
   ## Inversion modulo p
   ##
   ## The inverse of 0 is 0.
@@ -41,7 +45,7 @@ func inv*(r: var Fp, a: Fp) {.inline.} =
   else:
     r.inv_euclid(a)
 
-func inv*(a: var Fp) {.inline.} =
+func inv*(a: var Fp) =
   ## Inversion modulo p
   ##
   ## The inverse of 0 is 0.
@@ -52,3 +56,6 @@ func inv*(a: var Fp) {.inline.} =
     a.inv_addchain(a)
   else:
     a.inv_euclid(a)
+
+{.pop.} # inline
+{.pop.} # raises no exceptions
diff --git a/constantine/arithmetic/finite_fields_square_root.nim b/constantine/arithmetic/finite_fields_square_root.nim
index 5c66803..2428151 100644
--- a/constantine/arithmetic/finite_fields_square_root.nim
+++ b/constantine/arithmetic/finite_fields_square_root.nim
@@ -18,10 +18,14 @@ import
 #
 # ############################################################
 
+# No exceptions allowed
+{.push raises: [].}
+{.push inline.}
+
 # Legendre symbol / Euler's Criterion / Kronecker's symbol
 # ------------------------------------------------------------
 
-func isSquare*(a: Fp): SecretBool {.inline.} =
+func isSquare*(a: Fp): SecretBool =
   ## Returns true if ``a`` is a square (quadratic residue) in 𝔽p
   ##
   ## Assumes that the prime modulus ``p`` is public.
@@ -50,7 +54,7 @@ func hasP3mod4_primeModulus(C: static Curve): static bool =
   ## Returns true iff p ≡ 3 (mod 4)
   (BaseType(C.Mod.limbs[0]) and 3) == 3
 
-func sqrt_p3mod4(a: var Fp) {.inline.} =
+func sqrt_p3mod4(a: var Fp) =
   ## Compute the square root of ``a``
   ##
   ## This requires ``a`` to be a square
@@ -64,7 +68,7 @@ func sqrt_p3mod4(a: var Fp) {.inline.} =
   static: doAssert BaseType(Fp.C.Mod.limbs[0]) mod 4 == 3
   a.powUnsafeExponent(Fp.getPrimePlus1div4_BE())
 
-func sqrt_invsqrt_p3mod4(sqrt, invsqrt: var Fp, a: Fp) {.inline.} =
+func sqrt_invsqrt_p3mod4(sqrt, invsqrt: var Fp, a: Fp) =
   ## If ``a`` is a square, compute the square root of ``a`` in sqrt
   ## and the inverse square root of a in invsqrt
   ##
@@ -85,7 +89,7 @@ func sqrt_invsqrt_p3mod4(sqrt, invsqrt: var Fp, a: Fp) {.inline.} =
   # √a ≡ a * 1/√a ≡ a^((p+1)/4) (mod p)
   sqrt.prod(invsqrt, a)
 
-func sqrt_invsqrt_if_square_p3mod4(sqrt, invsqrt: var Fp, a: Fp): SecretBool {.inline.} =
+func sqrt_invsqrt_if_square_p3mod4(sqrt, invsqrt: var Fp, a: Fp): SecretBool =
   ## If ``a`` is a square, compute the square root of ``a`` in sqrt
   ## and the inverse square root of a in invsqrt
   ##
@@ -97,7 +101,7 @@ func sqrt_invsqrt_if_square_p3mod4(sqrt, invsqrt: var Fp, a: Fp): SecretBool {.i
   test.square(sqrt)
   result = test == a
 
-func sqrt_if_square_p3mod4*(a: var Fp): SecretBool {.inline.} =
+func sqrt_if_square_p3mod4*(a: var Fp): SecretBool =
   ## If ``a`` is a square, compute the square root of ``a``
   ## if not, ``a`` is unmodified.
   ##
@@ -115,7 +119,7 @@ func sqrt_if_square_p3mod4*(a: var Fp): SecretBool {.inline.} =
 # Specialized routines for addchain-based square roots
 # ------------------------------------------------------------
 
-func sqrt_addchain(a: var Fp) {.inline.} =
+func sqrt_addchain(a: var Fp) =
   ## Compute the square root of ``a``
   ##
   ## This requires ``a`` to be a square
@@ -128,13 +132,13 @@ func sqrt_addchain(a: var Fp) {.inline.} =
   invsqrt.invsqrt_addchain(a)
   a *= invsqrt
 
-func sqrt_invsqrt_addchain(sqrt, invsqrt: var Fp, a: Fp) {.inline.} =
+func sqrt_invsqrt_addchain(sqrt, invsqrt: var Fp, a: Fp) =
   ## If ``a`` is a square, compute the square root of ``a`` in sqrt
   ## and the inverse square root of a in invsqrt
   invsqrt.invsqrt_addchain(a)
   sqrt.prod(invsqrt, a)
 
-func sqrt_invsqrt_if_square_addchain(sqrt, invsqrt: var Fp, a: Fp): SecretBool {.inline.} =
+func sqrt_invsqrt_if_square_addchain(sqrt, invsqrt: var Fp, a: Fp): SecretBool =
   ## If ``a`` is a square, compute the square root of ``a`` in sqrt
   ## and the inverse square root of a in invsqrt
   ##
@@ -144,7 +148,7 @@ func sqrt_invsqrt_if_square_addchain(sqrt, invsqrt: var Fp, a: Fp): SecretBool {
   test.square(sqrt)
   result = test == a
 
-func sqrt_if_square_addchain*(a: var Fp): SecretBool {.inline.} =
+func sqrt_if_square_addchain*(a: var Fp): SecretBool =
   ## If ``a`` is a square, compute the square root of ``a``
   ## if not, ``a`` is unmodified.
   ##
@@ -155,6 +159,8 @@ func sqrt_if_square_addchain*(a: var Fp): SecretBool {.inline.} =
   result = sqrt_invsqrt_if_square_addchain(sqrt, invsqrt, a)
   a.ccopy(sqrt, result)
 
+{.pop.} # inline
+
 # Tonelli Shanks for any prime
 # ------------------------------------------------------------
 
@@ -228,7 +234,7 @@ func sqrt_invsqrt_tonelli_shanks_pre(
 
 # ----------------------------------------------
 
-func sqrt_tonelli_shanks(a: var Fp, useAddChain: static bool) {.inline.} =
+func sqrt_tonelli_shanks(a: var Fp, useAddChain: static bool) =
   ## Compute the square root of ``a``
   ##
   ## This requires ``a`` to be a square
@@ -244,7 +250,7 @@ func sqrt_tonelli_shanks(a: var Fp, useAddChain: static bool) {.inline.} =
   sqrt_invsqrt_tonelli_shanks_pre(sqrt, invsqrt, a, a_pre_exp)
   a = sqrt
 
-func sqrt_invsqrt_tonelli_shanks(sqrt, invsqrt: var Fp, a: Fp, useAddChain: static bool) {.inline.} =
+func sqrt_invsqrt_tonelli_shanks(sqrt, invsqrt: var Fp, a: Fp, useAddChain: static bool) =
   ## Compute the square root and inverse square root of ``a``
   ##
   ## This requires ``a`` to be a square
@@ -258,7 +264,7 @@ func sqrt_invsqrt_tonelli_shanks(sqrt, invsqrt: var Fp, a: Fp, useAddChain: stat
   a_pre_exp.precompute_tonelli_shanks(a, useAddChain)
   sqrt_invsqrt_tonelli_shanks_pre(sqrt, invsqrt, a, a_pre_exp)
 
-func sqrt_invsqrt_if_square_tonelli_shanks(sqrt, invsqrt: var Fp, a: Fp, useAddChain: static bool): SecretBool  {.inline.} =
+func sqrt_invsqrt_if_square_tonelli_shanks(sqrt, invsqrt: var Fp, a: Fp, useAddChain: static bool): SecretBool =
   ## Compute the square root and ivnerse square root of ``a``
   ##
   ## This returns true if ``a`` is square and sqrt/invsqrt contains the square root/inverse square root
@@ -274,7 +280,7 @@ func sqrt_invsqrt_if_square_tonelli_shanks(sqrt, invsqrt: var Fp, a: Fp, useAddC
   sqrt_invsqrt_tonelli_shanks_pre(sqrt, invsqrt, a, a_pre_exp)
   a = sqrt
 
-func sqrt_if_square_tonelli_shanks*(a: var Fp, useAddChain: static bool): SecretBool {.inline.} =
+func sqrt_if_square_tonelli_shanks*(a: var Fp, useAddChain: static bool): SecretBool =
   ## If ``a`` is a square, compute the square root of ``a``
   ## if not, ``a`` is unmodified.
   ##
@@ -293,7 +299,9 @@ func sqrt_if_square_tonelli_shanks*(a: var Fp, useAddChain: static bool): Secret
 # Note: we export the inner sqrt_invsqrt_IMPL
 #       for benchmarking purposes.
 
-func sqrt*[C](a: var Fp[C]) {.inline.} =
+{.push inline.}
+
+func sqrt*[C](a: var Fp[C]) =
   ## Compute the square root of ``a``
   ##
   ## This requires ``a`` to be a square
@@ -311,7 +319,7 @@ func sqrt*[C](a: var Fp[C]) {.inline.} =
   else:
     sqrt_tonelli_shanks(a, useAddChain = C.hasTonelliShanksAddchain())
 
-func sqrt_invsqrt*[C](sqrt, invsqrt: var Fp[C], a: Fp[C]) {.inline.} =
+func sqrt_invsqrt*[C](sqrt, invsqrt: var Fp[C], a: Fp[C]) =
   ## Compute the square root and inverse square root of ``a``
   ##
   ## This requires ``a`` to be a square
@@ -328,7 +336,7 @@ func sqrt_invsqrt*[C](sqrt, invsqrt: var Fp[C], a: Fp[C]) {.inline.} =
   else:
     sqrt_invsqrt_tonelli_shanks(sqrt, invsqrt, a, useAddChain = C.hasTonelliShanksAddchain())
 
-func sqrt_invsqrt_if_square*[C](sqrt, invsqrt: var Fp[C], a: Fp[C]): SecretBool  {.inline.} =
+func sqrt_invsqrt_if_square*[C](sqrt, invsqrt: var Fp[C], a: Fp[C]): SecretBool  =
   ## Compute the square root and ivnerse square root of ``a``
   ##
   ## This returns true if ``a`` is square and sqrt/invsqrt contains the square root/inverse square root
@@ -345,7 +353,7 @@ func sqrt_invsqrt_if_square*[C](sqrt, invsqrt: var Fp[C], a: Fp[C]): SecretBool
   else:
     result = sqrt_invsqrt_if_square_tonelli_shanks(sqrt, invsqrt, a, useAddChain = C.hasTonelliShanksAddchain())
 
-func sqrt_if_square*[C](a: var Fp[C]): SecretBool {.inline.} =
+func sqrt_if_square*[C](a: var Fp[C]): SecretBool =
   ## If ``a`` is a square, compute the square root of ``a``
   ## if not, ``a`` is unmodified.
   ##
@@ -359,3 +367,6 @@ func sqrt_if_square*[C](a: var Fp[C]): SecretBool {.inline.} =
     result = sqrt_if_square_p3mod4(a)
   else:
     result = sqrt_if_square_tonelli_shanks(a, useAddChain = C.hasTonelliShanksAddchain())
+
+{.pop.} # inline
+{.pop.} # raises no exceptions
diff --git a/constantine/arithmetic/limbs.nim b/constantine/arithmetic/limbs.nim
index 6b81111..70ab953 100644
--- a/constantine/arithmetic/limbs.nim
+++ b/constantine/arithmetic/limbs.nim
@@ -174,7 +174,7 @@ func isEven*(a: Limbs): SecretBool =
 # Bit manipulation
 # ------------------------------------------------------------
 
-func shiftRight*(a: var Limbs, k: int) {.inline.}=
+func shiftRight*(a: var Limbs, k: int) =
   ## Shift right by k.
   ##
   ## k MUST be less than the base word size (2^32 or 2^64)
diff --git a/constantine/arithmetic/limbs_double_width.nim b/constantine/arithmetic/limbs_extmul.nim
similarity index 82%
rename from constantine/arithmetic/limbs_double_width.nim
rename to constantine/arithmetic/limbs_extmul.nim
index d58a7bb..a674cb8 100644
--- a/constantine/arithmetic/limbs_double_width.nim
+++ b/constantine/arithmetic/limbs_extmul.nim
@@ -21,13 +21,46 @@ when UseASM_X86_64:
 #
 # ############################################################
 
+# Inlining note:
+# - The public dispatch functions are inlined.
+#   This allows the compiler to check CPU features only once
+#   in the high-level proc and also removes an intermediate function call
+#   to the wrapper function
+# - The ASM procs or internal fallbacks are not inlined
+#   to save on code size.
+
 # No exceptions allowed
 {.push raises: [].}
 
 # Multiplication
 # ------------------------------------------------------------
 
-func prod*[rLen, aLen, bLen: static int](r: var Limbs[rLen], a: Limbs[aLen], b: Limbs[bLen]) =
+func prod_comba[rLen, aLen, bLen: static int](r: var Limbs[rLen], a: Limbs[aLen], b: Limbs[bLen]) =
+  ## Extended precision multiplication
+  # We use Product Scanning / Comba multiplication
+  var t, u, v = Zero
+  const stopEx = min(a.len+b.len, r.len)
+
+  staticFor i, 0, stopEx:
+    # Invariant for product scanning:
+    # if we multiply accumulate by a[k1] * b[k2]
+    # we have k1+k2 == i
+    const ib = min(b.len-1, i)
+    const ia = i - ib
+    staticFor j, 0, min(a.len - ia, ib+1):
+      mulAcc(t, u, v, a[ia+j], b[ib-j])
+
+    r[i] = v
+    when i < stopEx-1:
+      v = u
+      u = t
+      t = Zero
+
+  if aLen+bLen < rLen:
+    for i in aLen+bLen ..< rLen:
+      r[i] = Zero
+
+func prod*[rLen, aLen, bLen: static int](r: var Limbs[rLen], a: Limbs[aLen], b: Limbs[bLen]) {.inline.} =
   ## Multi-precision multiplication
   ## r <- a*b
   ##
@@ -46,28 +79,7 @@ func prod*[rLen, aLen, bLen: static int](r: var Limbs[rLen], a: Limbs[aLen], b:
   elif UseASM_X86_64:
     mul_asm(r, a, b)
   else:
-    # We use Product Scanning / Comba multiplication
-    var t, u, v = Zero
-    const stopEx = min(a.len+b.len, r.len)
-
-    staticFor i, 0, stopEx:
-      # Invariant for product scanning:
-      # if we multiply accumulate by a[k1] * b[k2]
-      # we have k1+k2 == i
-      const ib = min(b.len-1, i)
-      const ia = i - ib
-      staticFor j, 0, min(a.len - ia, ib+1):
-        mulAcc(t, u, v, a[ia+j], b[ib-j])
-
-      r[i] = v
-      when i < stopEx-1:
-        v = u
-        u = t
-        t = Zero
-
-    if aLen+bLen < rLen:
-      for i in aLen+bLen ..< rLen:
-        r[i] = Zero
+    prod_comba(r, a, b)
 
 func prod_high_words*[rLen, aLen, bLen](
        r: var Limbs[rLen],
@@ -178,7 +190,7 @@ func square_operandScan[rLen, aLen](
 
 func square*[rLen, aLen](
        r: var Limbs[rLen],
-       a: Limbs[aLen]) =
+       a: Limbs[aLen]) {.inline.} =
   ## Multi-precision squaring
   ## r <- a²
   ##
diff --git a/constantine/arithmetic/limbs_modular.nim b/constantine/arithmetic/limbs_modular.nim
index 82f6d80..99ec325 100644
--- a/constantine/arithmetic/limbs_modular.nim
+++ b/constantine/arithmetic/limbs_modular.nim
@@ -20,7 +20,7 @@ import
 #
 # ############################################################
 
-func div2_modular*(a: var Limbs, mp1div2: Limbs) {.inline.}=
+func div2_modular*(a: var Limbs, mp1div2: Limbs) {.inline.} =
   ## Modular Division by 2
   ## `a` will be divided in-place
   ## `mp1div2` is the modulus (M+1)/2
diff --git a/constantine/arithmetic/limbs_montgomery.nim b/constantine/arithmetic/limbs_montgomery.nim
index cae6e09..e45964c 100644
--- a/constantine/arithmetic/limbs_montgomery.nim
+++ b/constantine/arithmetic/limbs_montgomery.nim
@@ -40,7 +40,13 @@ when UseASM_X86_64:
 #       - pairing final exponentiation
 #       are bottlenecked by Montgomery multiplications or squarings
 #
-# Hence we use inline assembly where possible
+# Inlining note:
+# - The public dispatch functions are inlined.
+#   This allows the compiler to check CPU features only once
+#   in the high-level proc and also removes an intermediate function call
+#   to the wrapper function
+# - The ASM procs or internal fallbacks are not inlined
+#   to save on code size.
 
 # No exceptions allowed
 {.push raises: [].}
@@ -381,31 +387,12 @@ func montyRed_Comba[N: static int](
   discard z.csub(M, SecretBool(carry) or not(z < M))
   r = z
 
-# TODO upstream, using Limbs[N] breaks semcheck
-func montyRed*[N: static int](
-       r: var array[N, SecretWord],
-       a: array[N*2, SecretWord],
-       M: array[N, SecretWord],
-       m0ninv: BaseType, canUseNoCarryMontyMul: static bool) =
-  ## Montgomery reduce a double-width bigint modulo M
-  when UseASM_X86_64 and r.len <= 6:
-    if ({.noSideEffect.}: hasBmi2()) and ({.noSideEffect.}: hasAdx()):
-      montRed_asm_adx_bmi2(r, a, M, m0ninv, canUseNoCarryMontyMul)
-    else:
-      montRed_asm(r, a, M, m0ninv, canUseNoCarryMontyMul)
-  elif UseASM_X86_32 and r.len <= 6:
-    # TODO: Assembly faster than GCC but slower than Clang
-    montRed_asm(r, a, M, m0ninv, canUseNoCarryMontyMul)
-  else:
-    montyRed_CIOS(r, a, M, m0ninv)
-    # montyRed_Comba(r, a, M, m0ninv)
-
 # Exported API
 # ------------------------------------------------------------
 
 func montyMul*(
         r: var Limbs, a, b, M: Limbs,
-        m0ninv: static BaseType, canUseNoCarryMontyMul: static bool) =
+        m0ninv: static BaseType, canUseNoCarryMontyMul: static bool) {.inline.} =
   ## Compute r <- a*b (mod M) in the Montgomery domain
   ## `m0ninv` = -1/M (mod SecretWord). Our words are 2^32 or 2^64
   ##
@@ -471,6 +458,25 @@ func montySquare*(r: var Limbs, a, M: Limbs,
   #   # montySquare_CIOS(r, a, M, m0ninv) # TODO <--- Fix this
   #   montyMul_FIPS(r, a, a, M, m0ninv)
 
+# TODO upstream, using Limbs[N] breaks semcheck
+func montyRed*[N: static int](
+       r: var array[N, SecretWord],
+       a: array[N*2, SecretWord],
+       M: array[N, SecretWord],
+       m0ninv: BaseType, canUseNoCarryMontyMul: static bool) {.inline.} =
+  ## Montgomery reduce a double-width bigint modulo M
+  when UseASM_X86_64 and r.len <= 6:
+    if ({.noSideEffect.}: hasBmi2()) and ({.noSideEffect.}: hasAdx()):
+      montRed_asm_adx_bmi2(r, a, M, m0ninv, canUseNoCarryMontyMul)
+    else:
+      montRed_asm(r, a, M, m0ninv, canUseNoCarryMontyMul)
+  elif UseASM_X86_32 and r.len <= 6:
+    # TODO: Assembly faster than GCC but slower than Clang
+    montRed_asm(r, a, M, m0ninv, canUseNoCarryMontyMul)
+  else:
+    montyRed_CIOS(r, a, M, m0ninv)
+    # montyRed_Comba(r, a, M, m0ninv)
+
 func redc*(r: var Limbs, a, one, M: Limbs,
            m0ninv: static BaseType, canUseNoCarryMontyMul: static bool) =
   ## Transform a bigint ``a`` from it's Montgomery N-residue representation (mod N)
diff --git a/constantine/config/curves_declaration.nim b/constantine/config/curves_declaration.nim
index 33b14e3..cf8f0a0 100644
--- a/constantine/config/curves_declaration.nim
+++ b/constantine/config/curves_declaration.nim
@@ -83,12 +83,11 @@ declareCurves:
     eq_form: ShortWeierstrass
     coef_a: 0
     coef_b: 2
-    nonresidue_quad_fp: -1       #      -1   is not a square in 𝔽p
-    nonresidue_cube_fp2: (1, 1)  # 1+𝑖   1+𝑖  is not a cube in 𝔽p²
+    nonresidue_fp: -1       #      -1   is not a square in 𝔽p
+    nonresidue_fp2: (1, 1)  # 1+𝑖   1+𝑖  is not a square or cube in 𝔽p²
 
     embedding_degree: 12
     sexticTwist: D_Twist
-    sexticNonResidue_fp2: (1, 1) # 1+𝑖
 
   curve BN254_Snarks: # Zero-Knowledge proofs curve (SNARKS, STARKS, Ethereum)
     bitwidth: 254
@@ -103,12 +102,11 @@ declareCurves:
     eq_form: ShortWeierstrass
     coef_a: 0
     coef_b: 3
-    nonresidue_quad_fp: -1       #      -1   is not a square in 𝔽p
-    nonresidue_cube_fp2: (9, 1)  # 9+𝑖   9+𝑖  is not a cube in 𝔽p²
+    nonresidue_fp: -1       #      -1   is not a square in 𝔽p
+    nonresidue_fp2: (9, 1)  # 9+𝑖   9+𝑖  is not a square or cube in 𝔽p²
 
     embedding_degree: 12
     sexticTwist: D_Twist
-    sexticNonResidue_fp2: (9, 1) # 9+𝑖
 
   curve Curve25519: # Bernstein curve
     bitwidth: 255
@@ -136,12 +134,11 @@ declareCurves:
     eq_form: ShortWeierstrass
     coef_a: 0
     coef_b: 1
-    nonresidue_quad_fp: -5       #      -5   is not a square in 𝔽p
-    nonresidue_cube_fp2: (0, 1)  # √-5  √-5  is not a cube in 𝔽p²
+    nonresidue_fp: -5       #      -5   is not a square in 𝔽p
+    nonresidue_fp2: (0, 1)  # √-5  √-5  is not a square or cube in 𝔽p²
 
     embedding_degree: 12
     sexticTwist: D_Twist
-    sexticNonResidue_fp2: (0, 1) # √-5
 
   curve BLS12_381:
     bitwidth: 381
@@ -157,12 +154,11 @@ declareCurves:
     eq_form: ShortWeierstrass
     coef_a: 0
     coef_b: 4
-    nonresidue_quad_fp: -1       #      -1   is not a square in 𝔽p
-    nonresidue_cube_fp2: (1, 1)  # 1+𝑖   1+𝑖  is not a cube in 𝔽p²
+    nonresidue_fp: -1       #      -1   is not a square in 𝔽p
+    nonresidue_fp2: (1, 1)  # 1+𝑖   1+𝑖  is not a square or cube in 𝔽p²
 
     embedding_degree: 12
     sexticTwist: M_Twist
-    sexticNonResidue_fp2: (1, 1) # 1+𝑖
 
   curve BW6_761:
     bitwidth: 761
@@ -182,9 +178,8 @@ declareCurves:
     coef_b: -1
 
     # TODO: rework the quad/cube/sextic non residue declaration
-    nonresidue_quad_fp: -4       # -4   is not a square in 𝔽p
-    nonresidue_cube_fp2: (0, 1)  # -4   is not a cube in 𝔽p²
+    nonresidue_fp: -4       # -4   is not a square or cube in 𝔽p
+    nonresidue_fp2: (0, 1)  # -4   is not a cube in 𝔽p²
 
     embedding_degree: 6
     sexticTwist: M_Twist
-    sexticNonResidue_fp2: (0, 1)  # -4
diff --git a/constantine/config/curves_parser.nim b/constantine/config/curves_parser.nim
index bc06aee..56c1b9a 100644
--- a/constantine/config/curves_parser.nim
+++ b/constantine/config/curves_parser.nim
@@ -96,8 +96,8 @@ type
     modulus: NimNode  # nnkStrLit (hex)
 
     # Towering
-    nonresidue_quad_fp: NimNode # nnkIntLit
-    nonresidue_cube_fp2: NimNode # nnkPar(nnkIntLit, nnkIntLit)
+    nonresidue_fp: NimNode # nnkIntLit
+    nonresidue_fp2: NimNode # nnkPar(nnkIntLit, nnkIntLit)
 
     # Curve parameters
     eq_form: CurveEquationForm
@@ -108,7 +108,6 @@ type
 
     embedding_degree: int
     sexticTwist: SexticTwist
-    sexticNonResidue_fp2: NimNode # nnkPar(nnkIntLit, nnkIntLit)
 
     family: CurveFamily
 
@@ -198,17 +197,15 @@ proc parseCurveDecls(defs: var seq[CurveParams], curves: NimNode) =
         params.orderBitwidth = sectionVal
       elif sectionId.eqIdent"cofactor":
         discard "TODO"
-      elif sectionId.eqIdent"nonresidue_quad_fp":
-        params.nonresidue_quad_fp = sectionVal
-      elif sectionId.eqIdent"nonresidue_cube_fp2":
-        params.nonresidue_cube_fp2 = sectionVal
+      elif sectionId.eqIdent"nonresidue_fp":
+        params.nonresidue_fp = sectionVal
+      elif sectionId.eqIdent"nonresidue_fp2":
+        params.nonresidue_fp2 = sectionVal
 
       elif sectionId.eqIdent"embedding_degree":
         params.embedding_degree = sectionVal.intVal.int
       elif sectionId.eqIdent"sexticTwist":
         params.sexticTwist = parseEnum[SexticTwist]($sectionVal)
-      elif sectionId.eqIdent"sexticNonResidue_fp2":
-        params.sexticNonResidue_fp2 = sectionVal
       else:
         error "Invalid section: \n", curveParams[i].toStrLit()
 
@@ -323,12 +320,12 @@ proc genMainConstants(defs: var seq[CurveParams]): NimNode =
       # Towering
       # -----------------------------------------------
       curveEllipticStmts.add newConstStmt(
-        exported($curve & "_nonresidue_quad_fp"),
-        curveDef.nonresidue_quad_fp
+        exported($curve & "_nonresidue_fp"),
+        curveDef.nonresidue_fp
       )
       curveEllipticStmts.add newConstStmt(
-        exported($curve & "_nonresidue_cube_fp2"),
-        curveDef.nonresidue_cube_fp2
+        exported($curve & "_nonresidue_fp2"),
+        curveDef.nonresidue_fp2
       )
 
       # Pairing
@@ -341,10 +338,6 @@ proc genMainConstants(defs: var seq[CurveParams]): NimNode =
         exported($curve & "_sexticTwist"),
         newLit curveDef.sexticTwist
       )
-      curveEllipticStmts.add newConstStmt(
-        exported($curve & "_sexticNonResidue_fp2"),
-        curveDef.sexticNonResidue_fp2
-      )
 
   # end for ---------------------------------------------------
 
diff --git a/constantine/config/curves_prop_core.nim b/constantine/config/curves_prop_core.nim
index e214f9c..759fe15 100644
--- a/constantine/config/curves_prop_core.nim
+++ b/constantine/config/curves_prop_core.nim
@@ -82,21 +82,27 @@ macro getCoefB*(C: static Curve): untyped =
   ## or a bigInt depending on the curve
   result = bindSym($C & "_coef_B")
 
-macro get_QNR_Fp*(C: static Curve): untyped =
-  ## Returns the tower extension quadratic non-residue in 𝔽p
-  ## i.e. a number that is not a square in 𝔽p
-  result = bindSym($C & "_nonresidue_quad_fp")
+macro getNonResidueFp*(C: static Curve): untyped =
+  ## Returns the tower extension (and twist) non-residue for 𝔽p
+  ## Depending on the curve it might be:
+  ## - not a square (quadratic non-residue to construct Fp2)
+  ## - not a cube (cubic non-residue to construct Fp3)
+  ## - neither a square or cube (sextic non-residue to construct Fp2, Fp3 or Fp6)
+  result = bindSym($C & "_nonresidue_fp")
 
-macro get_CNR_Fp2*(C: static Curve): untyped =
-  ## Returns the tower extension cubic non-residue 𝔽p²
-  ## i.e. a number that is not a cube in 𝔽p²
+macro getNonResidueFp2*(C: static Curve): untyped =
+  ## Returns the tower extension (and twist) non-residue for 𝔽p²
+  ## Depending on the curve it might be:
+  ## - not a square (quadratic non-residue to construct Fp4)
+  ## - not a cube (cubic non-residue to construct Fp6)
+  ## - neither a square or cube (sextic non-residue to construct Fp4, Fp6 or Fp12)
   ##
   ## The return value is a tuple (a, b)
   ## that corresponds to the number a + b𝑗
   ## with 𝑗 choosen for 𝑗² - QNR_Fp == 0
   ## i.e. if -1 is chosen as a quadratic non-residue 𝑗 = √-1
   ##      if -2 is chosen as a quadratic non-residue 𝑗 = √-2
-  result = bindSym($C & "_nonresidue_cube_fp2")
+  result = bindSym($C & "_nonresidue_fp2")
 
 macro getEmbeddingDegree*(C: static Curve): untyped =
   ## Returns the prime embedding degree,
@@ -108,9 +114,3 @@ macro getEmbeddingDegree*(C: static Curve): untyped =
 macro getSexticTwist*(C: static Curve): untyped =
   ## Returns if D-Twist or M-Twist
   result = bindSym($C & "_sexticTwist")
-
-macro get_SNR_Fp2*(C: static Curve): untyped =
-  ## Returns the sextic non-residue in 𝔽p²
-  ## choosen to build the twisted curve E'(𝔽p²)
-  ## i.e. a number µ so that x⁶ - µ is irreducible
-  result = bindSym($C & "_sexticNonResidue_fp2")
diff --git a/constantine/elliptic/ec_shortweierstrass_affine.nim b/constantine/elliptic/ec_shortweierstrass_affine.nim
index e8e4d6f..45d86f8 100644
--- a/constantine/elliptic/ec_shortweierstrass_affine.nim
+++ b/constantine/elliptic/ec_shortweierstrass_affine.nim
@@ -32,6 +32,8 @@ type
     ## over a field F
     x*, y*: F
 
+  SexticNonResidue* = NonResidue
+
 func isInf*(P: ECP_ShortW_Aff): SecretBool =
   ## Returns true if P is an infinity point
   ## and false otherwise
diff --git a/constantine/isogeny/frobenius.nim b/constantine/isogeny/frobenius.nim
index 7e1ee42..f596097 100644
--- a/constantine/isogeny/frobenius.nim
+++ b/constantine/isogeny/frobenius.nim
@@ -49,48 +49,6 @@ func frobenius_map*(r: var Fp2, a: Fp2, k: static int = 1) {.inline.} =
   else:
     r = a
 
-template mulCheckSparse(a: var Fp, b: Fp) =
-  when b.isOne().bool:
-    discard
-  elif b.isZero().bool:
-    a.setZero()
-  elif b.isMinusOne().bool:
-    a.neg()
-  else:
-    a *= b
-
-template mulCheckSparse(a: var Fp2, b: Fp2) =
-  when b.isOne().bool:
-    discard
-  elif b.isMinusOne().bool:
-    a.neg()
-  elif b.c0.isZero().bool and b.c1.isOne().bool:
-    var t {.noInit.}: type(a.c0)
-    when fromComplexExtension(b):
-      t.neg(a.c1)
-      a.c1 = a.c0
-      a.c0 = t
-    else:
-      t = NonResidue * a.c1
-      a.c1 = a.c0
-      a.c0 = t
-  elif b.c0.isZero().bool and b.c1.isMinusOne().bool:
-    var t {.noInit.}: type(a.c0)
-    when fromComplexExtension(b):
-      t = a.c1
-      a.c1.neg(a.c0)
-      a.c0 = t
-    else:
-      t = NonResidue * a.c1
-      a.c1.neg(a.c0)
-      a.c0.neg(t)
-  elif b.c0.isZero().bool:
-    a.mul_sparse_by_0y(b)
-  elif b.c1.isZero().bool:
-    a.mul_sparse_by_x0(b)
-  else:
-    a *= b
-
 # Frobenius map - on extension fields
 # -----------------------------------------------------------------
 
diff --git a/constantine/pairing/lines_common.nim b/constantine/pairing/lines_common.nim
index 6b9eea8..fcac103 100644
--- a/constantine/pairing/lines_common.nim
+++ b/constantine/pairing/lines_common.nim
@@ -35,6 +35,22 @@ type
     ##   with Fp2 coordinates: xyz000
     x*, y*, z*: F
 
+  SexticNonResidue* = NonResidue
+    ## The Sextic non-residue to build
+    ## 𝔽p2 -> 𝔽p12 towering and the G2 sextic twist
+    ## or
+    ## 𝔽p -> 𝔽p6 towering and the G2 sextic twist
+    ##
+    ## Note:
+    ## while the non-residues for
+    ## - 𝔽p2 -> 𝔽p4
+    ## - 𝔽p2 -> 𝔽p6
+    ## are also sextic non-residues by construction.
+    ## the non-residues for
+    ## - 𝔽p4 -> 𝔽p12
+    ## - 𝔽p6 -> 𝔽p12
+    ## are not.
+
 func toHex*(line: Line, order: static Endianness = bigEndian): string =
   result = static($line.typeof.genericHead() & '(')
   for fieldName, fieldValue in fieldPairs(line):
diff --git a/constantine/pairing/lines_projective.nim b/constantine/pairing/lines_projective.nim
index 31904ed..e9790c8 100644
--- a/constantine/pairing/lines_projective.nim
+++ b/constantine/pairing/lines_projective.nim
@@ -92,15 +92,10 @@ func line_eval_double[F](
   template B: untyped = line.y
   template C: untyped = line.z
 
-  A = T.y               # A = Y
-  v = T.y               # v = Y
-  B = T.z               # B = Z
-  C = T.x               # C = X
-
-  A *= B                # A = Y.Z
-  C.square()            # C = X²
-  v.square()            # v = Y²
-  B.square()            # B = Z²
+  A.prod(T.y, T.z)      # A = Y.Z
+  C.square(T.x)         # C = X²
+  v.square(T.y)         # v = Y²
+  B.square(T.z)         # B = Z²
 
   A.double()            # A =  2 Y.Z
   A.neg()               # A = -2 Y.Z
@@ -148,28 +143,21 @@ func line_eval_add[F](
   template B: untyped = line.y
   template C: untyped = line.z
 
-  A = T.x     # A = X₁
-  v = T.z     # v = Z₁
-  B = T.z     # B = Z₁
-  C = T.y     # C = Y₁
+  v.prod(T.z, Q.y) # v = Z₁Y₂
+  B.prod(T.z, Q.x) # B = Z₁X₂
 
-  v *= Q.y    # v = Z₁Y₂
-  B *= Q.x    # B = Z₁X₂
+  A.diff(T.x, B)   # A = X₁-Z₁X₂
+  C.diff(T.y, v)   # C = Y₁-Z₁Y₂
 
-  A -= B      # A = X₁-Z₁X₂
-  C -= v      # C = Y₁-Z₁Y₂
+  v.prod(A, Q.y)   # v = (X₁-Z₁X₂) Y₂
+  B.prod(C, Q.x)   # B = (Y₁-Z₁Y₂) X₂
+  B -= v           # B = (Y₁-Z₁Y₂) X₂ - (X₁-Z₁X₂) Y₂
+
+  C.neg()          # C = -(Y₁-Z₁Y₂)
 
-  v = A       # v = X₁-Z₁X₂
   when F.C.getSexticTwist() == M_Twist:
     A *= SexticNonResidue # A = ξ (X₁ - Z₁X₂)
 
-  v *= Q.y    # v = (X₁-Z₁X₂) Y₂
-  B = C       # B = Y₁-Z₁Y₂
-  B *= Q.x    # B = (Y₁-Z₁Y₂) X₂
-  B -= v      # B = (Y₁-Z₁Y₂) X₂ - (X₁-Z₁X₂) Y₂
-
-  C.neg()     # C = -(Y₁-Z₁Y₂)
-
 func line_eval_fused_double[F](
        line: var Line[F],
        T: var ECP_ShortW_Proj[F, OnTwist]) =
@@ -193,13 +181,11 @@ func line_eval_fused_double[F](
   when F.C.getSexticTwist() == D_Twist:
     snrB *= SexticNonResidue
 
-  E = C
-  E *= b3
+  E.prod(C, b3)
   when F.C.getSexticTwist() == M_Twist:
     E *= SexticNonResidue # E = 3b'Z² = 3bξ Z²
 
-  F = E
-  F *= 3            # F = 3E = 9bZ²
+  F.prod(E, 3)      # F = 3E = 9bZ²
   G.sum(snrB, F)
   G.div2()          # G = (B+F)/2
   H.sum(T.y, T.z)
diff --git a/constantine/pairing/mul_fp12_by_lines.nim b/constantine/pairing/mul_fp12_by_lines.nim
index 0a23c28..065db1a 100644
--- a/constantine/pairing/mul_fp12_by_lines.nim
+++ b/constantine/pairing/mul_fp12_by_lines.nim
@@ -56,7 +56,7 @@ func mul_by_line_xy0*[C: static Curve](
   v0.prod(a.c0, b.x)
   v1.prod(a.c1, b.y)
   r.c0.prod(a.c2, b.y)
-  r.c0 *= ξ
+  r.c0 *= SexticNonResidue
   r.c0 += v0
 
   r.c1.sum(a.c0, a.c1) # Error when r and a alias as r.c0 was updated
@@ -93,10 +93,9 @@ func mul_sparse_by_line_xy00z0*[C: static Curve](
   f.c1 += f.c0
   v3.mul_by_line_xy0(f.c1, v2)
   v3 -= v0
-  v3 -= v1
-  f.c1 = v3
+  f.c1.diff(v3, v1)
 
-  v3.c0 = ξ * v1.c2
+  v3.c0.prod(v1.c2, SexticNonResidue)
   v3.c0 += v0.c0
   v3.c1.sum(v0.c1, v1.c0)
   v3.c2.sum(v0.c2, v1.c1)
@@ -150,7 +149,7 @@ func mul_sparse_by_line_xyz000*[C: static Curve](
 
   # r0 = ξ a2 b1 + v0
   f.c0.mul_sparse_by_y0(f.c2, l.z)
-  f.c0 *= NonResidue
+  f.c0 *= SexticNonResidue
   f.c0 += v0
 
   # r2 = a2 b0 + v1
@@ -201,12 +200,12 @@ func mul_sparse_by_line_xy000z*[C: static Curve](
 
   # r0 = ξ a1 b2 + v0
   f.c0.mul_sparse_by_0y(f.c1, l.z)
-  f.c0 *= NonResidue
+  f.c0 *= SexticNonResidue
   f.c0 += v0
 
   # r1 = a1 b0 + ξ v2
   f.c1 *= b0
-  v2 *= NonResidue
+  v2 *= SexticNonResidue
   f.c1 += v2
 
 func mul*[C](f: var Fp12[C], line: Line[Fp2[C]]) {.inline.} =
diff --git a/constantine/pairing/pairing_bls12.nim b/constantine/pairing/pairing_bls12.nim
index 8252beb..23f53ff 100644
--- a/constantine/pairing/pairing_bls12.nim
+++ b/constantine/pairing/pairing_bls12.nim
@@ -89,7 +89,8 @@ func millerLoopGenericBLS12*[C](
   f.setOne()
 
   template u: untyped = C.pairing(ate_param)
-  let u3 = 3*C.pairing(ate_param)
+  var u3 = C.pairing(ate_param)
+  u3 *= 3
   for i in countdown(u3.bits - 2, 1):
     f.square()
     line.line_double(T, P)
diff --git a/constantine/pairing/pairing_bn.nim b/constantine/pairing/pairing_bn.nim
index cb03959..e0a6eeb 100644
--- a/constantine/pairing/pairing_bn.nim
+++ b/constantine/pairing/pairing_bn.nim
@@ -87,7 +87,8 @@ func millerLoopGenericBN*[C](
   f.setOne()
 
   template u: untyped = C.pairing(ate_param)
-  let u3 = 3*C.pairing(ate_param)
+  var u3 = C.pairing(ate_param)
+  u3 *= 3
   for i in countdown(u3.bits - 2, 1):
     f.square()
     line.line_double(T, P)
diff --git a/constantine/tower_field_extensions/cubic_extensions.nim b/constantine/tower_field_extensions/cubic_extensions.nim
deleted file mode 100644
index 091fa5b..0000000
--- a/constantine/tower_field_extensions/cubic_extensions.nim
+++ /dev/null
@@ -1,228 +0,0 @@
-# Constantine
-# Copyright (c) 2018-2019    Status Research & Development GmbH
-# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
-# Licensed and distributed under either of
-#   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
-#   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
-# at your option. This file may not be copied, modified, or distributed except according to those terms.
-
-import
-  ../arithmetic,
-  ../primitives,
-  ./tower_common
-
-# Commutative ring implementation for Cubic Extension Fields
-# -------------------------------------------------------------------
-# Cubic extensions can use specific squaring procedures
-# beyond Schoolbook and Karatsuba:
-# - Chung-Hasan (3 different algorithms)
-# - Toom-Cook-3x
-#
-# Chung-Hasan papers
-# http://cacr.uwaterloo.ca/techreports/2006/cacr2006-24.pdf
-# https://www.lirmm.fr/arith18/papers/Chung-Squaring.pdf
-#
-# The papers focus on polynomial squaring, they have been adapted
-# to towered extension fields with the relevant costs in
-#
-# - Multiplication and Squaring on Pairing-Friendly Fields
-#   Augusto Jun Devegili and Colm Ó hÉigeartaigh and Michael Scott and Ricardo Dahab, 2006
-#   https://eprint.iacr.org/2006/471
-#
-# Costs in the underlying field
-# M: Mul, S: Square, A: Add/Sub, B: Mul by non-residue
-#
-# | Method      | > Linear | Linear            |
-# |-------------|----------|-------------------|
-# | Schoolbook  | 3M + 3S  | 6A + 2B           |
-# | Karatsuba   | 6S       | 13A + 2B          |
-# | Tom-Cook-3x | 5S       | 33A + 2B          |
-# | CH-SQR1     | 3M + 2S  | 11A + 2B          |
-# | CH-SQR2     | 2M + 3S  | 10A + 2B          |
-# | CH-SQR3     | 1M + 4S  | 11A + 2B + 1 Div2 |
-# | CH-SQR3x    | 1M + 4S  | 14A + 2B          |
-
-func square_Chung_Hasan_SQR2(r: var CubicExt, a: CubicExt) {.used.}=
-  ## Returns r = a²
-  mixin prod, square, sum
-  var v3{.noInit.}, v4{.noInit.}, v5{.noInit.}: typeof(r.c0)
-
-  v4.prod(a.c0, a.c1)
-  v4.double()
-  v5.square(a.c2)
-  r.c1 = NonResidue * v5
-  r.c1 += v4
-  r.c2.diff(v4, v5)
-  v3.square(a.c0)
-  v4.diff(a.c0, a.c1)
-  v4 += a.c2
-  v5.prod(a.c1, a.c2)
-  v5.double()
-  v4.square()
-  r.c0 = NonResidue * v5
-  r.c0 += v3
-  r.c2 += v4
-  r.c2 += v5
-  r.c2 -= v3
-
-func square_Chung_Hasan_SQR3(r: var CubicExt, a: CubicExt) =
-  ## Returns r = a²
-  mixin prod, square, sum
-  var v0{.noInit.}, v2{.noInit.}: typeof(r.c0)
-
-  r.c1.sum(a.c0, a.c2)    # r1 = a0 + a2
-  v2.diff(r.c1, a.c1)     # v2 = a0 - a1 + a2
-  r.c1 += a.c1            # r1 = a0 + a1 + a2
-  r.c1.square()           # r1 = (a0 + a1 + a2)²
-  v2.square()             # v2 = (a0 - a1 + a2)²
-
-  r.c2.sum(r.c1, v2)      # r2 = (a0 + a1 + a2)² + (a0 - a1 + a2)²
-  r.c2.div2()             # r2 = ((a0 + a1 + a2)² + (a0 - a1 + a2)²)/2
-
-  r.c0.prod(a.c1, a.c2)   # r0 = a1 a2
-  r.c0.double()           # r0 = 2 a1 a2
-
-  v2.square(a.c2)         # v2 = a2²
-  r.c1 += NonResidue * v2 # r1 = (a0 + a1 + a2)² + β a2²
-  r.c1 -= r.c0            # r1 = (a0 + a1 + a2)² - 2 a1 a2 + β a2²
-  r.c1 -= r.c2            # r1 = (a0 + a1 + a2)² - 2 a1 a2 - ((a0 + a1 + a2)² + (a0 - a1 + a2)²)/2 + β a2²
-
-  v0.square(a.c0)         # v0 = a0²
-  r.c0 *= NonResidue      # r0 = β 2 a1 a2
-  r.c0 += v0              # r0 = a0² + β 2 a1 a2
-
-  r.c2 -= v0              # r2 = ((a0 + a1 + a2)² + (a0 - a1 + a2)²)/2 - a0²
-  r.c2 -= v2              # r2 = ((a0 + a1 + a2)² + (a0 - a1 + a2)²)/2 - a0² - a2²
-
-func square*(r: var CubicExt, a: CubicExt) {.inline.} =
-  ## Returns r = a²
-  square_Chung_Hasan_SQR3(r, a)
-
-func prod*(r: var CubicExt, a, b: CubicExt) =
-  ## Returns r = a * b
-  ##
-  ## r MUST not share a buffer with a
-  # Algorithm is Karatsuba
-  var v0{.noInit.}, v1{.noInit.}, v2{.noInit.}, t{.noInit.}: typeof(r.c0)
-
-  v0.prod(a.c0, b.c0)
-  v1.prod(a.c1, b.c1)
-  v2.prod(a.c2, b.c2)
-
-  # r.c0 = β ((a.c1 + a.c2) * (b.c1 + b.c2) - v1 - v2) + v0
-  r.c0.sum(a.c1, a.c2)
-  t.sum(b.c1, b.c2)
-  r.c0 *= t
-  r.c0 -= v1
-  r.c0 -= v2
-  r.c0 *= NonResidue
-  r.c0 += v0
-
-  # r.c1 = (a.c0 + a.c1) * (b.c0 + b.c1) - v0 - v1 + β v2
-  r.c1.sum(a.c0, a.c1)
-  t.sum(b.c0, b.c1)
-  r.c1 *= t
-  r.c1 -= v0
-  r.c1 -= v1
-  r.c1 += NonResidue * v2
-
-  # r.c2 = (a.c0 + a.c2) * (b.c0 + b.c2) - v0 - v2 + v1
-  r.c2.sum(a.c0, a.c2)
-  t.sum(b.c0, b.c2)
-  r.c2 *= t
-  r.c2 -= v0
-  r.c2 -= v2
-  r.c2 += v1
-
-func inv*(r: var CubicExt, a: CubicExt) =
-  ## Compute the multiplicative inverse of ``a``
-  ##
-  ## The inverse of 0 is 0.
-  ## Incidentally this avoids extra check
-  ## to convert Jacobian and Projective coordinates
-  ## to affine for elliptic curve
-  #
-  # Algorithm 5.23
-  #
-  # Arithmetic of Finite Fields
-  # Chapter 5 of Guide to Pairing-Based Cryptography
-  # Jean Luc Beuchat, Luis J. Dominguez Perez, Sylvain Duquesne, Nadia El Mrabet, Laura Fuentes-Castañeda, Francisco Rodríguez-Henríquez, 2017\
-  # https://www.researchgate.net/publication/319538235_Arithmetic_of_Finite_Fields
-  #
-  # We optimize for stack usage and use 4 temporaries (+r as temporary)
-  # instead of 9, because 5 * 2 (𝔽p2) * Bitsize would be:
-  # - ~2540 bits for BN254
-  # - ~3810 bits for BLS12-381
-  var v1 {.noInit.}, v2 {.noInit.}, v3 {.noInit.}: typeof(r.c0)
-
-  # A in r0
-  # A <- a0² - β a1 a2
-  r.c0.square(a.c0)
-  v1.prod(a.c1, a.c2)
-  v1 *= NonResidue
-  r.c0 -= v1
-
-  # B in v1
-  # B <- β a2² - a0 a1
-  v1.square(a.c2)
-  v1 *= NonResidue
-  v2.prod(a.c0, a.c1)
-  v1 -= v2
-
-  # C in v2
-  # C <- a1² - a0 a2
-  v2.square(a.c1)
-  v3.prod(a.c0, a.c2)
-  v2 -= v3
-
-  # F in v3
-  # F <- β a1 C + a0 A + β a2 B
-  r.c1.prod(v1, NonResidue * a.c2)
-  r.c2.prod(v2, NonResidue * a.c1)
-  v3.prod(r.c0, a.c0)
-  v3 += r.c1
-  v3 += r.c2
-
-  let t = v3 # TODO, support aliasing in all primitives
-  v3.inv(t)
-
-  # (a0 + a1 v + a2 v²)^-1 = (A + B v + C v²) / F
-  r.c0 *= v3
-  r.c1.prod(v1, v3)
-  r.c2.prod(v2, v3)
-
-func inv*(a: var CubicExt) =
-  ## Compute the multiplicative inverse of ``a``
-  ##
-  ## The inverse of 0 is 0.
-  ## Incidentally this avoids extra check
-  ## to convert Jacobian and Projective coordinates
-  ## to affine for elliptic curve
-  let t = a
-  a.inv(t)
-
-func `*=`*(a: var CubicExt, b: CubicExt) {.inline.} =
-  ## In-place multiplication
-  # On higher extension field like 𝔽p6,
-  # if `prod` is called on shared in and out buffer, the result is wrong
-  let t = a
-  a.prod(t, b)
-
-func conj*(a: var CubicExt) {.inline.} =
-  ## Computes the conjugate in-place
-  mixin conj, conjneg
-  a.c0.conj()
-  a.c1.conjneg()
-  a.c2.conj()
-
-func conj*(r: var CubicExt, a: CubicExt) {.inline.} =
-  ## Computes the conjugate out-of-place
-  mixin conj, conjneg
-  r.c0.conj(a.c0)
-  r.c1.conjneg(a.c1)
-  r.c2.conj(a.c2)
-
-func square*(a: var CubicExt) {.inline.} =
-  ## In-place squaring
-  let t = a
-  a.square(t)
diff --git a/constantine/tower_field_extensions/exponentiations.nim b/constantine/tower_field_extensions/exponentiations.nim
index a5e18b1..8a60d5c 100644
--- a/constantine/tower_field_extensions/exponentiations.nim
+++ b/constantine/tower_field_extensions/exponentiations.nim
@@ -12,9 +12,7 @@ import
   ../config/common,
   ../primitives,
   ../io/io_bigints,
-  ./tower_common,
-  ./quadratic_extensions,
-  ./cubic_extensions
+  ./extension_fields
 
 # ############################################################
 #
diff --git a/constantine/tower_field_extensions/extension_fields.nim b/constantine/tower_field_extensions/extension_fields.nim
new file mode 100644
index 0000000..257f46e
--- /dev/null
+++ b/constantine/tower_field_extensions/extension_fields.nim
@@ -0,0 +1,977 @@
+# Constantine
+# Copyright (c) 2018-2019    Status Research & Development GmbH
+# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
+# Licensed and distributed under either of
+#   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
+#   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
+# at your option. This file may not be copied, modified, or distributed except according to those terms.
+
+import
+  ../config/common,
+  ../primitives,
+  ../arithmetic
+
+# Note: to avoid burdening the Nim compiler, we rely on generic extension
+# to complain if the base field procedures don't exist
+
+# No exceptions allowed
+{.push raises: [].}
+{.push inline.}
+
+# ############################################################
+#                                                            #
+#                    Extension Fields                        #
+#                                                            #
+# ############################################################
+
+type
+  NonResidue* = object
+    ## Non-Residue
+    ##
+    ## Placeholder for the appropriate quadratic, cubic or sectic non-residue
+
+  CubicExt* = concept x
+    ## Cubic Extension field concept
+    type BaseField = auto
+    x.c0 is BaseField
+    x.c1 is BaseField
+    x.c2 is BaseField
+
+  QuadraticExt* = concept x
+    ## Quadratic Extension field concept
+    not(x is CubicExt)
+
+    type BaseField = auto
+    x.c0 is BaseField
+    x.c1 is BaseField
+
+  ExtensionField* = QuadraticExt or CubicExt
+
+# Initialization
+# -------------------------------------------------------------------
+
+func setZero*(a: var ExtensionField) =
+  ## Set ``a`` to 0 in the extension field
+  for field in fields(a):
+    field.setZero()
+
+func setOne*(a: var ExtensionField) =
+  ## Set ``a`` to 1 in the extension field
+  for fieldName, fA in fieldPairs(a):
+    when fieldName == "c0":
+      fA.setOne()
+    else:
+      fA.setZero()
+
+func fromBig*(a: var ExtensionField, src: BigInt) =
+  ## Set ``a`` to the bigint value in the extension field
+  for fieldName, fA in fieldPairs(a):
+    when fieldName == "c0":
+      fA.fromBig(src)
+    else:
+      fA.setZero()
+
+# Comparison
+# -------------------------------------------------------------------
+
+func `==`*(a, b: ExtensionField): SecretBool =
+  ## Constant-time equality check
+  result = CtTrue
+  for fA, fB in fields(a, b):
+    result = result and (fA == fB)
+
+func isZero*(a: ExtensionField): SecretBool =
+  ## Constant-time check if zero
+  result = CtTrue
+  for fA in fields(a):
+    result = result and fA.isZero()
+
+func isOne*(a: ExtensionField): SecretBool =
+  ## Constant-time check if one
+  result = CtTrue
+  for fieldName, fA in fieldPairs(a):
+    when fieldName == "c0":
+      result = result and fA.isOne()
+    else:
+      result = result and fA.isZero()
+
+func isMinusOne*(a: ExtensionField): SecretBool =
+  ## Constant-time check if -1
+  result = CtTrue
+  for fieldName, fA in fieldPairs(a):
+    when fieldName == "c0":
+      result = result and fA.isMinusOne()
+    else:
+      result = result and fA.isZero()
+
+# Copies
+# -------------------------------------------------------------------
+
+func ccopy*(a: var ExtensionField, b: ExtensionField, ctl: SecretBool) =
+  ## Constant-time conditional copy
+  ## If ctl is true: b is copied into a
+  ## if ctl is false: b is not copied and a is unmodified
+  ## Time and memory accesses are the same whether a copy occurs or not
+  for fA, fB in fields(a, b):
+    ccopy(fA, fB, ctl)
+
+# Abelian group
+# -------------------------------------------------------------------
+
+func neg*(r: var ExtensionField, a: ExtensionField) =
+  ## Field out-of-place negation
+  for fR, fA in fields(r, a):
+    fR.neg(fA)
+
+func neg*(a: var ExtensionField) =
+  ## Field in-place negation
+  for fA in fields(a):
+    fA.neg()
+
+func `+=`*(a: var ExtensionField, b: ExtensionField) =
+  ## Addition in the extension field
+  for fA, fB in fields(a, b):
+    fA += fB
+
+func `-=`*(a: var ExtensionField, b: ExtensionField) =
+  ## Substraction in the extension field
+  for fA, fB in fields(a, b):
+    fA -= fB
+
+func double*(r: var ExtensionField, a: ExtensionField) =
+  ## Field out-of-place doubling
+  for fR, fA in fields(r, a):
+    fR.double(fA)
+
+func double*(a: var ExtensionField) =
+  ## Field in-place doubling
+  for fA in fields(a):
+    fA.double()
+
+func div2*(a: var ExtensionField) =
+  ## Field in-place division by 2
+  for fA in fields(a):
+    fA.div2()
+
+func sum*(r: var QuadraticExt, a, b: QuadraticExt) =
+  ## Sum ``a`` and ``b`` into ``r``
+  r.c0.sum(a.c0, b.c0)
+  r.c1.sum(a.c1, b.c1)
+
+func sum*(r: var CubicExt, a, b: CubicExt) =
+  ## Sum ``a`` and ``b`` into ``r``
+  r.c0.sum(a.c0, b.c0)
+  r.c1.sum(a.c1, b.c1)
+  r.c2.sum(a.c2, b.c2)
+
+func diff*(r: var QuadraticExt, a, b: QuadraticExt) =
+  ## Diff ``a`` and ``b`` into ``r``
+  r.c0.diff(a.c0, b.c0)
+  r.c1.diff(a.c1, b.c1)
+
+func diff*(r: var CubicExt, a, b: CubicExt) =
+  ## Diff ``a`` and ``b`` into ``r``
+  r.c0.diff(a.c0, b.c0)
+  r.c1.diff(a.c1, b.c1)
+  r.c2.diff(a.c2, b.c2)
+
+func conj*(a: var QuadraticExt) =
+  ## Computes the conjugate in-place
+  a.c1.neg()
+
+func conj*(r: var QuadraticExt, a: QuadraticExt) =
+  ## Computes the conjugate out-of-place
+  r.c0 = a.c0
+  r.c1.neg(a.c1)
+
+func conjneg*(a: var QuadraticExt) =
+  ## Computes the negated conjugate in-place
+  a.c0.neg()
+
+func conjneg*(r: var QuadraticExt, a: QuadraticExt) =
+  ## Computes the negated conjugate out-of-place
+  r.c0.neg(a.c0)
+  r.c1 = a.c1
+
+func conj*(a: var CubicExt) =
+  ## Computes the conjugate in-place
+  a.c0.conj()
+  a.c1.conjneg()
+  a.c2.conj()
+
+func conj*(r: var CubicExt, a: CubicExt) =
+  ## Computes the conjugate out-of-place
+  r.c0.conj(a.c0)
+  r.c1.conjneg(a.c1)
+  r.c2.conj(a.c2)
+
+# Conditional arithmetic
+# -------------------------------------------------------------------
+
+func cneg*(a: var ExtensionField, ctl: SecretBool) =
+  ## Constant-time in-place conditional negation
+  ## Only negate if ctl is true
+  for fA in fields(a):
+    fA.cneg(ctl)
+
+func cadd*(a: var ExtensionField, b: ExtensionField, ctl: SecretBool) =
+  ## Constant-time in-place conditional addition
+  for fA, fB in fields(a, b):
+    fA.cadd(fB, ctl)
+
+func csub*(a: var ExtensionField, b: ExtensionField, ctl: SecretBool) =
+  ## Constant-time in-place conditional substraction
+  for fA, fB in fields(a, b):
+    fA.csub(fB, ctl)
+
+# Multiplication by a small integer known at compile-time
+# -------------------------------------------------------------------
+
+func `*=`*(a: var ExtensionField, b: static int) =
+  ## Multiplication by a small integer known at compile-time
+
+  const negate = b < 0
+  const b = if negate: -b
+            else: b
+  when negate:
+    a.neg(a)
+  when b == 0:
+    a.setZero()
+  elif b == 1:
+    return
+  elif b == 2:
+    a.double()
+  elif b == 3:
+    let t1 = a
+    a.double()
+    a += t1
+  elif b == 4:
+    a.double()
+    a.double()
+  elif b == 5:
+    let t1 = a
+    a.double()
+    a.double()
+    a += t1
+  elif b == 6:
+    a.double()
+    let t2 = a
+    a.double() # 4
+    a += t2
+  elif b == 7:
+    let t1 = a
+    a.double()
+    let t2 = a
+    a.double() # 4
+    a += t2
+    a += t1
+  elif b == 8:
+    a.double()
+    a.double()
+    a.double()
+  elif b == 9:
+    let t1 = a
+    a.double()
+    a.double()
+    a.double() # 8
+    a += t1
+  elif b == 10:
+    a.double()
+    let t2 = a
+    a.double()
+    a.double() # 8
+    a += t2
+  elif b == 11:
+    let t1 = a
+    a.double()
+    let t2 = a
+    a.double()
+    a.double() # 8
+    a += t2
+    a += t1
+  elif b == 12:
+    a.double()
+    a.double() # 4
+    let t4 = a
+    a.double() # 8
+    a += t4
+  else:
+    {.error: "Multiplication by this small int not implemented".}
+
+func prod*(r: var ExtensionField, a: ExtensionField, b: static int) =
+  ## Multiplication by a small integer known at compile-time
+  const negate = b < 0
+  const b = if negate: -b
+            else: b
+  when negate:
+    r.neg(a)
+  else:
+    r = a
+  r *= b
+
+{.pop.} # inline
+
+# ############################################################
+#                                                            #
+#                Quadratic extensions                        #
+#                                                            #
+# ############################################################
+
+# Commutative ring implementation for complex quadratic extension fields
+# ----------------------------------------------------------------------
+
+func square_complex(r: var QuadraticExt, a: QuadraticExt) =
+  ## Return a² in 𝔽p2 = 𝔽p[𝑖] in ``r``
+  ## ``r`` is initialized/overwritten
+  ##
+  ## Requires a complex extension field
+  # (c0, c1)² => (c0 + c1𝑖)²
+  #           => c0² + 2 c0 c1𝑖 + (c1𝑖)²
+  #           => c0²-c1² + 2 c0 c1𝑖
+  #           => (c0²-c1², 2 c0 c1)
+  #           or
+  #           => ((c0-c1)(c0+c1), 2 c0 c1)
+  #           => ((c0-c1)(c0-c1 + 2 c1), c0 * 2 c1)
+  #
+  # Costs (naive implementation)
+  # - 1 Multiplication 𝔽p
+  # - 2 Squarings 𝔽p
+  # - 1 Doubling 𝔽p
+  # - 1 Substraction 𝔽p
+  # Stack: 4 * ModulusBitSize (4x 𝔽p element)
+  #
+  # Or (with 1 less Mul/Squaring at the cost of 1 addition and extra 2 𝔽p stack space)
+  #
+  # - 2 Multiplications 𝔽p
+  # - 1 Addition 𝔽p
+  # - 1 Doubling 𝔽p
+  # - 1 Substraction 𝔽p
+  #
+  # To handle aliasing between r and a, we need
+  # r to be used only when a is unneeded
+  # so we can't use r fields as a temporary
+  mixin fromComplexExtension
+  static: doAssert r.fromComplexExtension()
+
+  var v0 {.noInit.}, v1 {.noInit.}: typeof(r.c0)
+  v0.diff(a.c0, a.c1)    # v0 = c0 - c1               [1 Sub]
+  v1.sum(a.c0, a.c1)     # v1 = c0 + c1               [1 Dbl, 1 Sub]
+  r.c1.prod(a.c0, a.c1)  # r.c1 = c0 c1               [1 Mul, 1 Dbl, 1 Sub]
+  # aliasing: a unneeded now
+  r.c1.double()          # r.c1 = 2 c0 c1             [1 Mul, 2 Dbl, 1 Sub]
+  r.c0.prod(v0, v1)      # r.c0 = (c0 + c1)(c0 - c1)  [2 Mul, 2 Dbl, 1 Sub]
+
+func prod_complex(r: var QuadraticExt, a, b: QuadraticExt) =
+  ## Return a * b in 𝔽p2 = 𝔽p[𝑖] in ``r``
+  ## ``r`` is initialized/overwritten
+  ##
+  ## Requires a complex extension field
+  # (a0, a1) (b0, b1) => (a0 + a1𝑖) (b0 + b1𝑖)
+  #                   => (a0 b0 - a1 b1) + (a0 b1 + a1 b0) 𝑖
+  #
+  # In Fp, multiplication has cost O(n²) with n the number of limbs
+  # while addition has cost O(3n) (n for addition, n for overflow, n for conditional substraction)
+  # and substraction has cost O(2n) (n for substraction + underflow, n for conditional addition)
+  #
+  # Even for 256-bit primes, we are looking at always a minimum of n=5 limbs (with 2^63 words)
+  # where addition/substraction are significantly cheaper than multiplication
+  #
+  # So we always reframe the imaginary part using Karatsuba approach to save a multiplication
+  # (a0, a1) (b0, b1) => (a0 b0 - a1 b1) + 𝑖( (a0 + a1)(b0 + b1) - a0 b0 - a1 b1 )
+  #
+  # Costs (naive implementation)
+  # - 4 Multiplications 𝔽p
+  # - 1 Addition 𝔽p
+  # - 1 Substraction 𝔽p
+  # Stack: 6 * ModulusBitSize (4x 𝔽p element + 2x named temporaries)
+  #
+  # Costs (Karatsuba)
+  # - 3 Multiplications 𝔽p
+  # - 3 Substraction 𝔽p (2 are fused)
+  # - 2 Addition 𝔽p
+  # Stack: 6 * ModulusBitSize (4x 𝔽p element + 2x named temporaries + 1 in-place multiplication temporary)
+  mixin fromComplexExtension
+  static: doAssert r.fromComplexExtension()
+
+  # TODO: GCC is adding an unexplainable 30 cycles tax to this function (~10% slow down)
+  #       for seemingly no reason
+
+  when false: # Single-width implementation - BLS12-381
+              # Clang 348 cycles on i9-9980XE @3.9 GHz
+    var a0b0 {.noInit.}, a1b1 {.noInit.}: typeof(r.c0)
+    a0b0.prod(a.c0, b.c0)                                         # [1 Mul]
+    a1b1.prod(a.c1, b.c1)                                         # [2 Mul]
+
+    r.c0.sum(a.c0, a.c1)  # r0 = (a0 + a1)                        # [2 Mul, 1 Add]
+    r.c1.sum(b.c0, b.c1)  # r1 = (b0 + b1)                        # [2 Mul, 2 Add]
+    # aliasing: a and b unneeded now
+    r.c1 *= r.c0          # r1 = (b0 + b1)(a0 + a1)               # [3 Mul, 2 Add] - 𝔽p temporary
+
+    r.c0.diff(a0b0, a1b1) # r0 = a0 b0 - a1 b1                    # [3 Mul, 2 Add, 1 Sub]
+    r.c1 -= a0b0          # r1 = (b0 + b1)(a0 + a1) - a0b0        # [3 Mul, 2 Add, 2 Sub]
+    r.c1 -= a1b1          # r1 = (b0 + b1)(a0 + a1) - a0b0 - a1b1 # [3 Mul, 2 Add, 3 Sub]
+
+  else: # Double-width implementation with lazy reduction
+        # Clang 341 cycles on i9-9980XE @3.9 GHz
+    var a0b0 {.noInit.}, a1b1 {.noInit.}: doubleWidth(typeof(r.c0))
+    var d {.noInit.}: doubleWidth(typeof(r.c0))
+    const msbSet = r.c0.typeof.canUseNoCarryMontyMul()
+
+    a0b0.mulNoReduce(a.c0, b.c0)     # 44 cycles - cumul 44
+    a1b1.mulNoReduce(a.c1, b.c1)     # 44 cycles - cumul 88
+    when msbSet:
+      r.c0.sum(a.c0, a.c1)
+      r.c1.sum(b.c0, b.c1)
+    else:
+      r.c0.sumNoReduce(a.c0, a.c1)   # 5 cycles  - cumul 93
+      r.c1.sumNoReduce(b.c0, b.c1)   # 5 cycles  - cumul 98
+    # aliasing: a and b unneeded now
+    d.mulNoReduce(r.c0, r.c1)        # 44 cycles - cumul 142
+    when msbSet:
+      d -= a0b0
+      d -= a1b1
+    else:
+      d.diffNoReduce(d, a0b0)        # 11 cycles - cumul 153
+      d.diffNoReduce(d, a1b1)        # 11 cycles - cumul 164
+    a0b0.diff(a0b0, a1b1)            # 19 cycles - cumul 183
+    r.c0.reduce(a0b0)                # 50 cycles - cumul 233
+    r.c1.reduce(d)                   # 50 cycles - cumul 288
+
+  # Single-width [3 Mul, 2 Add, 3 Sub]
+  #    3*88 + 2*14 + 3*14 = 334 theoretical cycles
+  #    348 measured
+  # Double-Width
+  #    288 theoretical cycles
+  #    329 measured
+  #    Unexplained 40 cycles diff between theo and measured
+  #    and unexplained 30 cycles between Clang and GCC
+  #    - Function calls?
+  #    - push/pop stack?
+
+func mul_sparse_complex_by_0y(
+       r: var QuadraticExt, a: QuadraticExt,
+       sparseB: auto) =
+  ## Multiply `a` by `b` with sparse coordinates (0, y)
+  ## On a complex quadratic extension field 𝔽p2 = 𝔽p[𝑖]
+  #
+  # r0 = a0 b0 - a1 b1
+  # r1 = (a0 + a1) (b0 + b1) - a0 b0 - a1 b1 (Karatsuba)
+  #
+  # with b0 = 0, hence
+  #
+  # r0 = - a1 b1
+  # r1 = (a0 + a1) b1 - a1 b1 = a0 b1
+  mixin fromComplexExtension
+  static: doAssert r.fromComplexExtension()
+
+  when typeof(sparseB) is typeof(a):
+    template b(): untyped = sparseB.c1
+  elif typeof(sparseB) is typeof(a.c0):
+    template b(): untyped = sparseB
+  else:
+    {.error: "sparseB type is " & $typeof(sparseB) &
+      " which does not match with either a (" & $typeof(a) &
+      ") or a.c0 (" & $typeof(a.c0) & ")".}
+
+  var t{.noInit.}: typeof(a.c0)
+  t.prod(a.c1, b)
+  r.c1.prod(a.c0, b)
+  r.c0.neg(t)
+
+# Commutative ring implementation for generic quadratic extension fields
+# ----------------------------------------------------------------------
+
+func square_generic(r: var QuadraticExt, a: QuadraticExt) =
+  ## Return a² in ``r``
+  ## ``r`` is initialized/overwritten
+  # Algorithm (with β the non-residue in the base field)
+  #
+  # (c0, c1)² => (c0 + c1 w)²
+  #           => c0² + 2 c0 c1 w + c1²w²
+  #           => c0² + β c1² + 2 c0 c1 w
+  #           => (c0² + β c1², 2 c0 c1)
+  # We have 2 squarings and 1 multiplication in the base field
+  # which are significantly more costly than additions.
+  # For example when construction 𝔽p12 from 𝔽p6:
+  # - 4 limbs like BN254:     multiplication is 20x slower than addition/substraction
+  # - 6 limbs like BLS12-381: multiplication is 28x slower than addition/substraction
+  #
+  # We can save operations with one of the following expressions
+  # of c0² + β c1² and noticing that c0c1 is already computed for the "y" coordinate
+  #
+  # Alternative 1:
+  #   c0² + β c1² <=> (c0 - c1)(c0 - β c1) + β c0c1 + c0c1
+  #
+  # Alternative 2:
+  #   c0² + β c1² <=> (c0 + c1)(c0 + β c1) - β c0c1 - c0c1
+  mixin prod
+  var v0 {.noInit.}, v1 {.noInit.}: typeof(r.c0)
+
+  # v1 <- (c0 + β c1)
+  v1.prod(a.c1, NonResidue)
+  v1 += a.c0
+
+  # v0 <- (c0 + c1)(c0 + β c1)
+  v0.sum(a.c0, a.c1)
+  v0 *= v1
+
+  # v1 <- c0 c1
+  v1.prod(a.c0, a.c1)
+
+  # aliasing: a unneeded now
+
+  # r0 = (c0 + c1)(c0 + β c1) - c0c1
+  v0 -= v1
+
+  # r1 = 2 c0c1
+  r.c1.double(v1)
+
+  # r0 = (c0 + c1)(c0 + β c1) - c0c1 - β c0c1
+  v1 *= NonResidue
+  r.c0.diff(v0, v1)
+
+func prod_generic(r: var QuadraticExt, a, b: QuadraticExt) =
+  ## Returns r = a * b
+  # Algorithm (with β the non-residue in the base field)
+  #
+  # r0 = a0 b0 + β a1 b1
+  # r1 = (a0 + a1) (b0 + b1) - a0 b0 - a1 b1 (Karatsuba)
+  mixin prod
+  var v0 {.noInit.}, v1 {.noInit.}, v2 {.noInit.}: typeof(r.c0)
+
+  # v2 <- (a0 + a1)(b0 + b1)
+  v0.sum(a.c0, a.c1)
+  v1.sum(b.c0, b.c1)
+  v2.prod(v0, v1)
+
+  # v0 <- a0 b0
+  # v1 <- a1 b1
+  v0.prod(a.c0, b.c0)
+  v1.prod(a.c1, b.c1)
+
+  # aliasing: a and b unneeded now
+
+  # r1 <- (a0 + a1)(b0 + b1) - a0 b0 - a1 b1
+  r.c1.diff(v2, v1)
+  r.c1 -= v0
+
+  # r0 <- a0 b0 + β a1 b1
+  v1 *= NonResidue
+  r.c0.sum(v0, v1)
+
+func mul_sparse_generic_by_x0(r: var QuadraticExt, a, sparseB: QuadraticExt) =
+  ## Multiply `a` by `b` with sparse coordinates (x, 0)
+  ## On a generic quadratic extension field
+  # Algorithm (with β the non-residue in the base field)
+  #
+  # r0 = a0 b0 + β a1 b1
+  # r1 = (a0 + a1) (b0 + b1) - a0 b0 - a1 b1 (Karatsuba)
+  #
+  # with b1 = 0, hence
+  #
+  # r0 = a0 b0
+  # r1 = (a0 + a1) b0 - a0 b0 = a1 b0
+  mixin prod
+  template b(): untyped = sparseB
+
+  r.c0.prod(a.c0, b.c0)
+  r.c1.prod(a.c1, b.c0)
+
+func mul_sparse_generic_by_0y(
+       r: var QuadraticExt, a: QuadraticExt,
+       sparseB: auto) =
+  ## Multiply `a` by `b` with sparse coordinates (0, y)
+  ## On a generic quadratic extension field
+  # Algorithm (with β the non-residue in the base field)
+  #
+  # r0 = a0 b0 + β a1 b1
+  # r1 = (a0 + a1) (b0 + b1) - a0 b0 - a1 b1 (Karatsuba)
+  #
+  # with b0 = 0, hence
+  #
+  # r0 = β a1 b1
+  # r1 = (a0 + a1) b1 - a1 b1 = a0 b1
+  when typeof(sparseB) is typeof(a):
+    template b(): untyped = sparseB.c1
+  elif typeof(sparseB) is typeof(a.c0):
+    template b(): untyped = sparseB
+  else:
+    {.error: "sparseB type is " & $typeof(sparseB) &
+      " which does not match with either a (" & $typeof(a) &
+      ") or a.c0 (" & $typeof(a.c0) & ")".}
+
+  var t{.noInit.}: typeof(a.c0)
+
+  t.prod(a.c1, b)
+  t *= NonResidue
+  r.c1.prod(a.c0, b)
+  # aliasing: a unneeded now
+  r.c0 = t
+
+func mul_sparse_generic_by_0y(
+       r: var QuadraticExt, a: QuadraticExt,
+       sparseB: static int) =
+  ## Multiply `a` by `b` with sparse coordinates (0, y)
+  ## On a generic quadratic extension field
+  # Algorithm (with β the non-residue in the base field)
+  #
+  # r0 = a0 b0 + β a1 b1
+  # r1 = (a0 + a1) (b0 + b1) - a0 b0 - a1 b1 (Karatsuba)
+  #
+  # with b0 = 0, hence
+  #
+  # r0 = β a1 b1
+  # r1 = (a0 + a1) b1 - a1 b1 = a0 b1
+  template b(): untyped = sparseB
+
+  var t{.noInit.}: typeof(a.c0)
+
+  t.prod(a.c1, b)
+  t *= NonResidue
+  r.c1.prod(a.c0, b)
+  # aliasing: a unneeded now
+  r.c0 = t
+
+func invImpl(r: var QuadraticExt, a: QuadraticExt) =
+  ## Compute the multiplicative inverse of ``a``
+  ##
+  ## The inverse of 0 is 0.
+  #
+  # Algorithm:
+  #
+  # 1 / (a0 + a1 w) <=> (a0 - a1 w) / (a0 + a1 w)(a0 - a1 w)
+  #                 <=> (a0 - a1 w) / (a0² - a1² w²)
+  # with w being our coordinate system and β the quadratic non-residue
+  # we have w² = β
+  # So the inverse is (a0 - a1 w) / (a0² - β a1²)
+  mixin fromComplexExtension
+
+  # [2 Sqr, 1 Add]
+  var v0 {.noInit.}, v1 {.noInit.}: typeof(r.c0)
+  v0.square(a.c0)
+  v1.square(a.c1)
+  when r.fromComplexExtension():
+    v0 += v1
+  else:
+    v1 *= NonResidue
+    v0 -= v1              # v0 = a0² - β a1² (the norm / squared magnitude of a)
+
+  # [1 Inv, 2 Sqr, 1 Add]
+  v1.inv(v0)              # v1 = 1 / (a0² - β a1²)
+
+  # [1 Inv, 2 Mul, 2 Sqr, 1 Add, 1 Neg]
+  r.c0.prod(a.c0, v1)     # r0 = a0 / (a0² - β a1²)
+  v0.neg(v1)              # v0 = -1 / (a0² - β a1²)
+  r.c1.prod(a.c1, v0)     # r1 = -a1 / (a0² - β a1²)
+
+# Exported quadratic symbols
+# -------------------------------------------------------------------
+
+{.push inline.}
+
+func square*(r: var QuadraticExt, a: QuadraticExt) =
+  mixin fromComplexExtension
+  when r.fromComplexExtension():
+    r.square_complex(a)
+  else:
+    r.square_generic(a)
+
+func prod*(r: var QuadraticExt, a, b: QuadraticExt) =
+  mixin fromComplexExtension
+  when r.fromComplexExtension():
+    r.prod_complex(a, b)
+  else:
+    r.prod_generic(a, b)
+
+func inv*(r: var QuadraticExt, a: QuadraticExt) =
+  ## Compute the multiplicative inverse of ``a``
+  ##
+  ## The inverse of 0 is 0.
+  ## Incidentally this avoids extra check
+  ## to convert Jacobian and Projective coordinates
+  ## to affine for elliptic curve
+  r.invImpl(a)
+
+func inv*(a: var QuadraticExt) =
+  ## Compute the multiplicative inverse of ``a``
+  ##
+  ## The inverse of 0 is 0.
+  ## Incidentally this avoids extra check
+  ## to convert Jacobian and Projective coordinates
+  ## to affine for elliptic curve
+  a.invImpl(a)
+
+func `*=`*(a: var QuadraticExt, b: QuadraticExt) =
+  ## In-place multiplication
+  a.prod(a, b)
+
+func square*(a: var QuadraticExt) =
+  ## In-place squaring
+  a.square(a)
+
+func mul_sparse_by_0y*(r: var QuadraticExt, a: QuadraticExt, sparseB: auto) =
+  ## Sparse multiplication
+  mixin fromComplexExtension
+  when a.fromComplexExtension():
+    r.mul_sparse_complex_by_0y(a, sparseB)
+  else:
+    r.mul_sparse_generic_by_0y(a, sparseB)
+
+func mul_sparse_by_0y*(r: var QuadraticExt, a: QuadraticExt, sparseB: static int) =
+  ## Sparse multiplication
+  mixin fromComplexExtension
+  when a.fromComplexExtension():
+    {.error: "Not implemented.".}
+  else:
+    r.mul_sparse_generic_by_0y(a, sparseB)
+
+func mul_sparse_by_0y*(a: var QuadraticExt, sparseB: auto) =
+  ## Sparse in-place multiplication
+  a.mul_sparse_by_0y(a, sparseB)
+
+func mul_sparse_by_x0*(a: var QuadraticExt, sparseB: QuadraticExt) =
+  ## Sparse in-place multiplication
+  a.mul_sparse_generic_by_x0(a, sparseB)
+
+{.pop.} # inline
+
+# ############################################################
+#                                                            #
+#                    Cubic extensions                        #
+#                                                            #
+# ############################################################
+
+# Commutative ring implementation for Cubic Extension Fields
+# -------------------------------------------------------------------
+# Cubic extensions can use specific squaring procedures
+# beyond Schoolbook and Karatsuba:
+# - Chung-Hasan (3 different algorithms)
+# - Toom-Cook-3x
+#
+# Chung-Hasan papers
+# http://cacr.uwaterloo.ca/techreports/2006/cacr2006-24.pdf
+# https://www.lirmm.fr/arith18/papers/Chung-Squaring.pdf
+#
+# The papers focus on polynomial squaring, they have been adapted
+# to towered extension fields with the relevant costs in
+#
+# - Multiplication and Squaring on Pairing-Friendly Fields
+#   Augusto Jun Devegili and Colm Ó hÉigeartaigh and Michael Scott and Ricardo Dahab, 2006
+#   https://eprint.iacr.org/2006/471
+#
+# Costs in the underlying field
+# M: Mul, S: Square, A: Add/Sub, B: Mul by non-residue
+#
+# | Method      | > Linear | Linear            |
+# |-------------|----------|-------------------|
+# | Schoolbook  | 3M + 3S  | 6A + 2B           |
+# | Karatsuba   | 6S       | 13A + 2B          |
+# | Tom-Cook-3x | 5S       | 33A + 2B          |
+# | CH-SQR1     | 3M + 2S  | 11A + 2B          |
+# | CH-SQR2     | 2M + 3S  | 10A + 2B          |
+# | CH-SQR3     | 1M + 4S  | 11A + 2B + 1 Div2 |
+# | CH-SQR3x    | 1M + 4S  | 14A + 2B          |
+
+func square_Chung_Hasan_SQR2(r: var CubicExt, a: CubicExt) {.used.}=
+  ## Returns r = a²
+  mixin prod, square, sum
+  var s0{.noInit.}, m01{.noInit.}, m12{.noInit.}: typeof(r.c0)
+
+  # precomputations that use a
+  m01.prod(a.c0, a.c1)
+  m01.double()
+  m12.prod(a.c1, a.c2)
+  m12.double()
+  s0.square(a.c2)
+  # aliasing: a₂ unused
+
+  # r₂ = (a₀ - a₁ + a₂)²
+  r.c2.sum(a.c2, a.c0)
+  r.c2 -= a.c1
+  r.c2.square()
+  # aliasing, a almost unneeded now
+
+  # r₂ = (a₀ - a₁ + a₂)² + 2a₀a₁ + 2a₁a₂ - a₂²
+  r.c2 += m01
+  r.c2 += m12
+  r.c2 -= s0
+
+  # r₁ = 2a₀a₁ + β a₂²
+  r.c1.prod(s0, NonResidue)
+  r.c1 += m01
+
+  # r₂ = (a₀ - a₁ + a₂)² + 2a₀a₁ + 2a₁a₂ - a₀² - a₂²
+  s0.square(a.c0)
+  r.c2 -= s0
+
+  # r₀ = a₀² + β 2a₁a₂
+  r.c0.prod(m12, NonResidue)
+  r.c0 += s0
+
+func square_Chung_Hasan_SQR3(r: var CubicExt, a: CubicExt) =
+  ## Returns r = a²
+  mixin prod, square, sum
+  var s0{.noInit.}, t{.noInit.}, m12{.noInit.}: typeof(r.c0)
+
+  # s₀ = (a₀ + a₁ + a₂)²
+  # t = ((a₀ + a₁ + a₂)² + (a₀ - a₁ + a₂)²) / 2
+  s0.sum(a.c0, a.c2)
+  t.diff(s0, a.c1)
+  s0 += a.c1
+  s0.square()
+  t.square()
+  t += s0
+  t.div2()
+
+  # m12 = 2a₁a₂ and r₁ = a₂²
+  # then a₁ and a₂ are unused for aliasing
+  m12.prod(a.c1, a.c2)
+  m12.double()
+  r.c1.square(a.c2)       # r₁ = a₂²
+
+  r.c2.diff(t, r.c1)      # r₂ = t - a₂²
+  r.c1 *= NonResidue      # r₁ = β a₂²
+  r.c1 += s0              # r₁ = (a₀ + a₁ + a₂)² + β a₂²
+  r.c1 -= m12             # r₁ = (a₀ + a₁ + a₂)² - 2a₁a₂ + β a₂²
+  r.c1 -= t               # r₁ = (a₀ + a₁ + a₂)² - 2a₁a₂ - t + β a₂²
+
+  s0.square(a.c0)
+  # aliasing: a₀ unused
+
+  r.c2 -= s0
+  r.c0.prod(m12, NonResidue)
+  r.c0 += s0
+
+func prodImpl(r: var CubicExt, a, b: CubicExt) =
+  ## Returns r = a * b  # Algorithm is Karatsuba
+  var v0{.noInit.}, v1{.noInit.}, v2{.noInit.}: typeof(r.c0)
+  var t0{.noInit.}, t1{.noInit.}, t2{.noInit.}: typeof(r.c0)
+
+  v0.prod(a.c0, b.c0)
+  v1.prod(a.c1, b.c1)
+  v2.prod(a.c2, b.c2)
+
+  # r₀ = β ((a₁ + a₂)(b₁ + b₂) - v₁ - v₂) + v₀
+  t0.sum(a.c1, a.c2)
+  t1.sum(b.c1, b.c2)
+  t0 *= t1
+  t0 -= v1
+  t0 -= v2
+  t0 *= NonResidue
+  # r₀ = t₀ + v₀ at the end to handle aliasing
+
+  # r₁ = (a₀ + a₁) * (b₀ + b₁) - v₀ - v₁ + β v₂
+  t1.sum(a.c0, a.c1)
+  t2.sum(b.c0, b.c1)
+  r.c1.prod(t1, t2)
+  r.c1 -= v0
+  r.c1 -= v1
+  t1.prod(v2, NonResidue)
+  r.c1 += t1
+
+  # r₂ = (a₀ + a₂) * (b₀ + b₂) - v₀ - v₂ + v₁
+  t1.sum(a.c0, a.c2)
+  t2.sum(b.c0, b.c2)
+  r.c2.prod(t1, t2)
+  r.c2 -= v0
+  r.c2 -= v2
+  r.c2 += v1
+
+  # Finish r₀
+  r.c0.sum(t0, v0)
+
+func invImpl(r: var CubicExt, a: CubicExt) =
+  ## Compute the multiplicative inverse of ``a``
+  ##
+  ## The inverse of 0 is 0.
+  ## Incidentally this avoids extra check
+  ## to convert Jacobian and Projective coordinates
+  ## to affine for elliptic curve
+  #
+  # Algorithm 5.23
+  #
+  # Arithmetic of Finite Fields
+  # Chapter 5 of Guide to Pairing-Based Cryptography
+  # Jean Luc Beuchat, Luis J. Dominguez Perez, Sylvain Duquesne, Nadia El Mrabet, Laura Fuentes-Castañeda, Francisco Rodríguez-Henríquez, 2017\
+  # https://www.researchgate.net/publication/319538235_Arithmetic_of_Finite_Fields
+  #
+  # We optimize for stack usage and use 4 temporaries
+  # instead of 9, because 4 * 2 (𝔽p2) * Bitsize would be:
+  # - ~2032 bits for BN254
+  # - ~3048 bits for BLS12-381
+  var A {.noInit.}, B {.noInit.}, C {.noInit.}: typeof(r.c0)
+  var t {.noInit.}: typeof(r.c0)
+
+  # A <- a₀² - β a₁ a₂
+  A.square(a.c0)
+  t.prod(a.c1, a.c2)
+  t *= NonResidue
+  A -= t
+
+  # B <- β a₂² - a₀ a₁
+  B.square(a.c2)
+  B *= NonResidue
+  t.prod(a.c0, a.c1)
+  B -= t
+
+  # C <- a₁² - a₀ a₂
+  C.square(a.c1)
+  t.prod(a.c0, a.c2)
+  C -= t
+
+  # F in t
+  # F <- β a₁ C + a₀ A + β a₂ B
+  t.prod(a.c1, C)
+  r.c2.prod(a.c2, B) # aliasing: last use of a₂, destroy r₂
+  t += r.c2
+  t *= NonResidue
+  r.c0.prod(a.c0, A) # aliasing: last use of a₀, destroy r₀
+  t += r.c0
+
+  t.inv()
+
+  # (a0 + a1 v + a2 v²)^-1 = (A + B v + C v²) / F
+  r.c0.prod(A, t)
+  r.c1.prod(B, t)
+  r.c2.prod(C, t)
+
+# Exported cubic symbols
+# -------------------------------------------------------------------
+{.push inline.}
+
+func square*(r: var CubicExt, a: CubicExt) =
+  ## Returns r = a²
+  square_Chung_Hasan_SQR3(r, a)
+
+func square*(a: var CubicExt) =
+  ## In-place squaring
+  a.square(a)
+
+func prod*(r: var CubicExt, a, b: CubicExt) =
+  ## In-place multiplication
+  r.prodImpl(a, b)
+
+func `*=`*(a: var CubicExt, b: CubicExt) =
+  ## In-place multiplication
+  a.prodImpl(a, b)
+
+func inv*(r: var CubicExt, a: CubicExt) =
+  ## Compute the multiplicative inverse of ``a``
+  ##
+  ## The inverse of 0 is 0.
+  ## Incidentally this avoids extra check
+  ## to convert Jacobian and Projective coordinates
+  ## to affine for elliptic curve
+  r.invImpl(a)
+
+func inv*(a: var CubicExt) =
+  ## Compute the multiplicative inverse of ``a``
+  ##
+  ## The inverse of 0 is 0.
+  ## Incidentally this avoids extra check
+  ## to convert Jacobian and Projective coordinates
+  ## to affine for elliptic curve
+  a.invImpl(a)
+
+{.pop.} # inline
+{.pop.} # raises no exceptions
diff --git a/constantine/tower_field_extensions/quadratic_extensions.nim b/constantine/tower_field_extensions/quadratic_extensions.nim
deleted file mode 100644
index e761bfd..0000000
--- a/constantine/tower_field_extensions/quadratic_extensions.nim
+++ /dev/null
@@ -1,365 +0,0 @@
-# Constantine
-# Copyright (c) 2018-2019    Status Research & Development GmbH
-# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
-# Licensed and distributed under either of
-#   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
-#   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
-# at your option. This file may not be copied, modified, or distributed except according to those terms.
-
-import
-  ../arithmetic,
-  ../primitives,
-  ./tower_common
-
-# Commutative ring implementation for complex extension fields
-# -------------------------------------------------------------------
-
-func square_complex(r: var QuadraticExt, a: QuadraticExt) =
-  ## Return a² in 𝔽p2 = 𝔽p[𝑖] in ``r``
-  ## ``r`` is initialized/overwritten
-  ##
-  ## Requires a complex extension field
-  # (c0, c1)² => (c0 + c1𝑖)²
-  #           => c0² + 2 c0 c1𝑖 + (c1𝑖)²
-  #           => c0²-c1² + 2 c0 c1𝑖
-  #           => (c0²-c1², 2 c0 c1)
-  #           or
-  #           => ((c0-c1)(c0+c1), 2 c0 c1)
-  #           => ((c0-c1)(c0-c1 + 2 c1), c0 * 2 c1)
-  #
-  # Costs (naive implementation)
-  # - 1 Multiplication 𝔽p
-  # - 2 Squarings 𝔽p
-  # - 1 Doubling 𝔽p
-  # - 1 Substraction 𝔽p
-  # Stack: 4 * ModulusBitSize (4x 𝔽p element)
-  #
-  # Or (with 1 less Mul/Squaring at the cost of 1 addition and extra 2 𝔽p stack space)
-  #
-  # - 2 Multiplications 𝔽p
-  # - 1 Addition 𝔽p
-  # - 1 Doubling 𝔽p
-  # - 1 Substraction 𝔽p
-  # Stack: 6 * ModulusBitSize (4x 𝔽p element + 1 named temporaries + 1 in-place multiplication temporary)
-  # as in-place multiplications require a (shared) internal temporary
-  mixin fromComplexExtension
-  static: doAssert r.fromComplexExtension()
-
-  var c0mc1 {.noInit.}: typeof(r.c0)
-  c0mc1.diff(a.c0, a.c1) # c0mc1 = c0 - c1                            [1 Sub]
-  r.c1.double(a.c1)      # result.c1 = 2 c1                           [1 Dbl, 1 Sub]
-  r.c0.sum(c0mc1, r.c1)  # result.c0 = c0 - c1 + 2 c1                 [1 Add, 1 Dbl, 1 Sub]
-  r.c0 *= c0mc1          # result.c0 = (c0 + c1)(c0 - c1) = c0² - c1² [1 Mul, 1 Add, 1 Dbl, 1 Sub] - 𝔽p temporary
-  r.c1 *= a.c0           # result.c1 = 2 c1 c0                        [2 Mul, 1 Add, 1 Dbl, 1 Sub] - 𝔽p temporary
-
-func prod_complex(r: var QuadraticExt, a, b: QuadraticExt) =
-  ## Return a * b in 𝔽p2 = 𝔽p[𝑖] in ``r``
-  ## ``r`` is initialized/overwritten
-  ##
-  ## Requires a complex extension field
-  # (a0, a1) (b0, b1) => (a0 + a1𝑖) (b0 + b1𝑖)
-  #                   => (a0 b0 - a1 b1) + (a0 b1 + a1 b0) 𝑖
-  #
-  # In Fp, multiplication has cost O(n²) with n the number of limbs
-  # while addition has cost O(3n) (n for addition, n for overflow, n for conditional substraction)
-  # and substraction has cost O(2n) (n for substraction + underflow, n for conditional addition)
-  #
-  # Even for 256-bit primes, we are looking at always a minimum of n=5 limbs (with 2^63 words)
-  # where addition/substraction are significantly cheaper than multiplication
-  #
-  # So we always reframe the imaginary part using Karatsuba approach to save a multiplication
-  # (a0, a1) (b0, b1) => (a0 b0 - a1 b1) + 𝑖( (a0 + a1)(b0 + b1) - a0 b0 - a1 b1 )
-  #
-  # Costs (naive implementation)
-  # - 4 Multiplications 𝔽p
-  # - 1 Addition 𝔽p
-  # - 1 Substraction 𝔽p
-  # Stack: 6 * ModulusBitSize (4x 𝔽p element + 2x named temporaries)
-  #
-  # Costs (Karatsuba)
-  # - 3 Multiplications 𝔽p
-  # - 3 Substraction 𝔽p (2 are fused)
-  # - 2 Addition 𝔽p
-  # Stack: 6 * ModulusBitSize (4x 𝔽p element + 2x named temporaries + 1 in-place multiplication temporary)
-  mixin fromComplexExtension
-  static: doAssert r.fromComplexExtension()
-
-  # TODO: GCC is adding an unexplainable 30 cycles tax to this function (~10% slow down)
-  #       for seemingly no reason
-
-  when false: # Single-width implementation - BLS12-381
-              # Clang 348 cycles on i9-9980XE @3.9 GHz
-    var a0b0 {.noInit.}, a1b1 {.noInit.}: typeof(r.c0)
-    a0b0.prod(a.c0, b.c0)                                         # [1 Mul]
-    a1b1.prod(a.c1, b.c1)                                         # [2 Mul]
-
-    r.c0.sum(a.c0, a.c1)  # r0 = (a0 + a1)                        # [2 Mul, 1 Add]
-    r.c1.sum(b.c0, b.c1)  # r1 = (b0 + b1)                        # [2 Mul, 2 Add]
-    r.c1 *= r.c0          # r1 = (b0 + b1)(a0 + a1)               # [3 Mul, 2 Add] - 𝔽p temporary
-
-    r.c0.diff(a0b0, a1b1) # r0 = a0 b0 - a1 b1                    # [3 Mul, 2 Add, 1 Sub]
-    r.c1 -= a0b0          # r1 = (b0 + b1)(a0 + a1) - a0b0        # [3 Mul, 2 Add, 2 Sub]
-    r.c1 -= a1b1          # r1 = (b0 + b1)(a0 + a1) - a0b0 - a1b1 # [3 Mul, 2 Add, 3 Sub]
-
-  else: # Double-width implementation with lazy reduction
-        # Clang 341 cycles on i9-9980XE @3.9 GHz
-    var a0b0 {.noInit.}, a1b1 {.noInit.}: doubleWidth(typeof(r.c0))
-    var d {.noInit.}: doubleWidth(typeof(r.c0))
-    const msbSet = r.c0.typeof.canUseNoCarryMontyMul()
-
-    a0b0.mulNoReduce(a.c0, b.c0)     # 44 cycles - cumul 44
-    a1b1.mulNoReduce(a.c1, b.c1)     # 44 cycles - cumul 88
-    when msbSet:
-      r.c0.sum(a.c0, a.c1)
-      r.c1.sum(b.c0, b.c1)
-    else:
-      r.c0.sumNoReduce(a.c0, a.c1)   # 5 cycles  - cumul 93
-      r.c1.sumNoReduce(b.c0, b.c1)   # 5 cycles  - cumul 98
-    d.mulNoReduce(r.c0, r.c1)        # 44 cycles - cumul 142
-    when msbSet:
-      d -= a0b0
-      d -= a1b1
-    else:
-      d.diffNoReduce(d, a0b0)        # 11 cycles - cumul 153
-      d.diffNoReduce(d, a1b1)        # 11 cycles - cumul 164
-    a0b0.diff(a0b0, a1b1)            # 19 cycles - cumul 183
-    r.c0.reduce(a0b0)                # 50 cycles - cumul 233
-    r.c1.reduce(d)                   # 50 cycles - cumul 288
-
-  # Single-width [3 Mul, 2 Add, 3 Sub]
-  #    3*88 + 2*14 + 3*14 = 334 theoretical cycles
-  #    348 measured
-  # Double-Width
-  #    288 theoretical cycles
-  #    329 measured
-  #    Unexplained 40 cycles diff between theo and measured
-  #    and unexplained 30 cycles between Clang and GCC
-  #    - Function calls?
-  #    - push/pop stack?
-
-func mul_sparse_complex_by_0y(r: var QuadraticExt, a, sparseB: QuadraticExt) =
-  ## Multiply `a` by `b` with sparse coordinates (0, y)
-  ## On a complex quadratic extension field 𝔽p2 = 𝔽p[𝑖]
-  #
-  # r0 = a0 b0 - a1 b1
-  # r1 = (a0 + a1) (b0 + b1) - a0 b0 - a1 b1 (Karatsuba)
-  #
-  # with b0 = 0, hence
-  #
-  # r0 = - a1 b1
-  # r1 = (a0 + a1) b1 - a1 b1 = a0 b1
-  mixin fromComplexExtension
-  static: doAssert r.fromComplexExtension()
-
-  template b(): untyped = sparseB
-
-  r.c0.prod(a.c1, b.c1)
-  r.c0.neg(r.c0)
-  r.c1.prod(a.c0, b.c1)
-
-# Commutative ring implementation for generic quadratic extension fields
-# -------------------------------------------------------------------
-
-func square_generic(r: var QuadraticExt, a: QuadraticExt) =
-  ## Return a² in ``r``
-  ## ``r`` is initialized/overwritten
-  # Algorithm (with β the non-residue in the base field)
-  #
-  # (c0, c1)² => (c0 + c1 w)²
-  #           => c0² + 2 c0 c1 w + c1²w²
-  #           => c0² + β c1² + 2 c0 c1 w
-  #           => (c0² + β c1², 2 c0 c1)
-  # We have 2 squarings and 1 multiplication in the base field
-  # which are significantly more costly than additions.
-  # For example when construction 𝔽p12 from 𝔽p6:
-  # - 4 limbs like BN254:     multiplication is 20x slower than addition/substraction
-  # - 6 limbs like BLS12-381: multiplication is 28x slower than addition/substraction
-  #
-  # We can save operations with one of the following expressions
-  # of c0² + β c1² and noticing that c0c1 is already computed for the "y" coordinate
-  #
-  # Alternative 1:
-  #   c0² + β c1² <=> (c0 - c1)(c0 - β c1) + β c0c1 + c0c1
-  #
-  # Alternative 2:
-  #   c0² + β c1² <=> (c0 + c1)(c0 + β c1) - β c0c1 - c0c1
-  mixin prod
-
-  # r0 <- (c0 + c1)(c0 + β c1)
-  r.c0.sum(a.c0, a.c1)
-  r.c1.sum(a.c0, NonResidue * a.c1)
-  r.c0 *= r.c1
-
-  # r1 <- c0 c1
-  r.c1.prod(a.c0, a.c1)
-
-  # r0 = (c0 + c1)(c0 + β c1) - β c0c1 - c0c1
-  r.c0 -= NonResidue * r.c1
-  r.c0 -= r.c1
-
-  # r1 = 2 c0c1
-  r.c1.double()
-
-func prod_generic(r: var QuadraticExt, a, b: QuadraticExt) =
-  ## Returns r = a * b
-  # Algorithm (with β the non-residue in the base field)
-  #
-  # r0 = a0 b0 + β a1 b1
-  # r1 = (a0 + a1) (b0 + b1) - a0 b0 - a1 b1 (Karatsuba)
-  mixin prod
-  var t {.noInit.}: typeof(r.c0)
-
-  # r1 <- (a0 + a1)(b0 + b1)
-  r.c0.sum(a.c0, a.c1)
-  t.sum(b.c0, b.c1)
-  r.c1.prod(r.c0, t)
-
-  # r0 <- a0 b0
-  # r1 <- (a0 + a1)(b0 + b1) - a0 b0 - a1 b1
-  r.c0.prod(a.c0, b.c0)
-  t.prod(a.c1, b.c1)
-  r.c1 -= r.c0
-  r.c1 -= t
-
-  # r0 <- a0 b0 + β a1 b1
-  r.c0 += NonResidue * t
-
-func mul_sparse_generic_by_x0(r: var QuadraticExt, a, sparseB: QuadraticExt) =
-  ## Multiply `a` by `b` with sparse coordinates (x, 0)
-  ## On a generic quadratic extension field
-  # Algorithm (with β the non-residue in the base field)
-  #
-  # r0 = a0 b0 + β a1 b1
-  # r1 = (a0 + a1) (b0 + b1) - a0 b0 - a1 b1 (Karatsuba)
-  #
-  # with b1 = 0, hence
-  #
-  # r0 = a0 b0
-  # r1 = (a0 + a1) b0 - a0 b0 = a1 b0
-  mixin prod
-  template b(): untyped = sparseB
-
-  r.c0.prod(a.c0, b.c0)
-  r.c1.prod(a.c1, b.c0)
-
-func mul_sparse_generic_by_0y(r: var QuadraticExt, a, sparseB: QuadraticExt) =
-  ## Multiply `a` by `b` with sparse coordinates (0, y)
-  ## On a generic quadratic extension field
-  # Algorithm (with β the non-residue in the base field)
-  #
-  # r0 = a0 b0 + β a1 b1
-  # r1 = (a0 + a1) (b0 + b1) - a0 b0 - a1 b1 (Karatsuba)
-  #
-  # with b0 = 0, hence
-  #
-  # r0 = β a1 b1
-  # r1 = (a0 + a1) b1 - a1 b1 = a0 b1
-  template b(): untyped = sparseB
-
-  r.c0.prod(a.c1, b.c1)
-  r.c0 *= NonResidue
-  r.c1.prod(a.c0, b.c1)
-
-# Exported symbols
-# -------------------------------------------------------------------
-
-func conj*(a: var QuadraticExt) {.inline.} =
-  ## Computes the conjugate in-place
-  a.c1.neg()
-
-func conj*(r: var QuadraticExt, a: QuadraticExt) {.inline.} =
-  ## Computes the conjugate out-of-place
-  r.c0 = a.c0
-  r.c1.neg(a.c1)
-
-func conjneg*(a: var QuadraticExt) {.inline.} =
-  ## Computes the negated conjugate in-place
-  a.c0.neg()
-
-func conjneg*(r: var QuadraticExt, a: QuadraticExt) {.inline.} =
-  ## Computes the negated conjugate out-of-place
-  r.c0.neg(a.c0)
-  r.c1 = a.c1
-
-func square*(r: var QuadraticExt, a: QuadraticExt) {.inline.} =
-  mixin fromComplexExtension
-  when r.fromComplexExtension():
-    r.square_complex(a)
-  else:
-    r.square_generic(a)
-
-func prod*(r: var QuadraticExt, a, b: QuadraticExt) {.inline.} =
-  mixin fromComplexExtension
-  when r.fromComplexExtension():
-    r.prod_complex(a, b)
-  else:
-    r.prod_generic(a, b)
-
-func inv*(r: var QuadraticExt, a: QuadraticExt) =
-  ## Compute the multiplicative inverse of ``a``
-  ##
-  ## The inverse of 0 is 0.
-  ## Incidentally this avoids extra check
-  ## to convert Jacobian and Projective coordinates
-  ## to affine for elliptic curve
-  #
-  # Algorithm:
-  #
-  # 1 / (a0 + a1 w) <=> (a0 - a1 w) / (a0 + a1 w)(a0 - a1 w)
-  #                 <=> (a0 - a1 w) / (a0² - a1² w²)
-  # with w being our coordinate system and β the quadratic non-residue
-  # we have w² = β
-  # So the inverse is (a0 - a1 w) / (a0² - β a1²)
-  mixin fromComplexExtension
-
-  # [2 Sqr, 1 Add]
-  var v0 {.noInit.}, v1 {.noInit.}: typeof(r.c0)
-  v0.square(a.c0)
-  v1.square(a.c1)
-  when r.fromComplexExtension():
-    v0 += v1
-  else:
-    v0 -= NonResidue * v1 # v0 = a0² - β a1² (the norm / squared magnitude of a)
-
-  # [1 Inv, 2 Sqr, 1 Add]
-  v1.inv(v0)              # v1 = 1 / (a0² - β a1²)
-
-  # [1 Inv, 2 Mul, 2 Sqr, 1 Add, 1 Neg]
-  r.c0.prod(a.c0, v1)     # r0 = a0 / (a0² - β a1²)
-  v0.neg(v1)              # v0 = -1 / (a0² - β a1²)
-  r.c1.prod(a.c1, v0)     # r1 = -a1 / (a0² - β a1²)
-
-func inv*(a: var QuadraticExt) =
-  ## Compute the multiplicative inverse of ``a``
-  ##
-  ## The inverse of 0 is 0.
-  ## Incidentally this avoids extra check
-  ## to convert Jacobian and Projective coordinates
-  ## to affine for elliptic curve
-  a.inv(a)
-
-func `*=`*(a: var QuadraticExt, b: QuadraticExt) {.inline.} =
-  ## In-place multiplication
-  # On higher extension field like 𝔽p12,
-  # if `prod` is called on shared in and out buffer, the result is wrong
-  let t = a
-  a.prod(t, b)
-
-func square*(a: var QuadraticExt) {.inline.} =
-  let t = a
-  a.square(t)
-
-func mul_sparse_by_0y*(a: var QuadraticExt, sparseB: QuadraticExt) {.inline.} =
-  ## Sparse in-place multiplication
-  mixin fromComplexExtension
-  when a.fromComplexExtension():
-    let t = a
-    a.mul_sparse_complex_by_0y(t, sparseB)
-  else:
-    let t = a
-    a.mul_sparse_generic_by_0y(t, sparseB)
-
-func mul_sparse_by_x0*(a: var QuadraticExt, sparseB: QuadraticExt) {.inline.} =
-  ## Sparse in-place multiplication
-  let t = a
-  a.mul_sparse_generic_by_x0(t, sparseB)
diff --git a/constantine/tower_field_extensions/tower_common.nim b/constantine/tower_field_extensions/tower_common.nim
deleted file mode 100644
index 6fdb63d..0000000
--- a/constantine/tower_field_extensions/tower_common.nim
+++ /dev/null
@@ -1,263 +0,0 @@
-# Constantine
-# Copyright (c) 2018-2019    Status Research & Development GmbH
-# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
-# Licensed and distributed under either of
-#   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
-#   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
-# at your option. This file may not be copied, modified, or distributed except according to those terms.
-
-import
-  ../config/common,
-  ../primitives,
-  ../arithmetic
-
-# Note: to avoid burdening the Nim compiler, we rely on generic extension
-# to complain if the base field procedures don't exist
-
-# Common type definition
-# -------------------------------------------------------------------
-
-type
-  NonResidue* = object
-    ## Non-Residue
-    ##
-    ## Placeholder for the appropriate quadratic, cubic or sectic non-residue
-
-  CubicExt* = concept x
-    ## Cubic Extension field concept
-    type BaseField = auto
-    x.c0 is BaseField
-    x.c1 is BaseField
-    x.c2 is BaseField
-
-  QuadraticExt* = concept x
-    ## Quadratic Extension field concept
-    not(x is CubicExt)
-
-    type BaseField = auto
-    x.c0 is BaseField
-    x.c1 is BaseField
-
-  ExtensionField* = QuadraticExt or CubicExt
-
-# Initialization
-# -------------------------------------------------------------------
-
-func setZero*(a: var ExtensionField) =
-  ## Set ``a`` to 0 in the extension field
-  for field in fields(a):
-    field.setZero()
-
-func setOne*(a: var ExtensionField) =
-  ## Set ``a`` to 1 in the extension field
-  for fieldName, fA in fieldPairs(a):
-    when fieldName == "c0":
-      fA.setOne()
-    else:
-      fA.setZero()
-
-func fromBig*(a: var ExtensionField, src: BigInt) =
-  ## Set ``a`` to the bigint value in the extension field
-  for fieldName, fA in fieldPairs(a):
-    when fieldName == "c0":
-      fA.fromBig(src)
-    else:
-      fA.setZero()
-
-# Comparison
-# -------------------------------------------------------------------
-
-func `==`*(a, b: ExtensionField): SecretBool =
-  ## Constant-time equality check
-  result = CtTrue
-  for fA, fB in fields(a, b):
-    result = result and (fA == fB)
-
-func isZero*(a: ExtensionField): SecretBool =
-  ## Constant-time check if zero
-  result = CtTrue
-  for fA in fields(a):
-    result = result and fA.isZero()
-
-func isOne*(a: ExtensionField): SecretBool =
-  ## Constant-time check if one
-  result = CtTrue
-  for fieldName, fA in fieldPairs(a):
-    when fieldName == "c0":
-      result = result and fA.isOne()
-    else:
-      result = result and fA.isZero()
-
-func isMinusOne*(a: ExtensionField): SecretBool =
-  ## Constant-time check if -1
-  result = CtTrue
-  for fieldName, fA in fieldPairs(a):
-    when fieldName == "c0":
-      result = result and fA.isMinusOne()
-    else:
-      result = result and fA.isZero()
-
-# Copies
-# -------------------------------------------------------------------
-
-func ccopy*(a: var ExtensionField, b: ExtensionField, ctl: SecretBool) =
-  ## Constant-time conditional copy
-  ## If ctl is true: b is copied into a
-  ## if ctl is false: b is not copied and a is unmodified
-  ## Time and memory accesses are the same whether a copy occurs or not
-  for fA, fB in fields(a, b):
-    ccopy(fA, fB, ctl)
-
-# Abelian group
-# -------------------------------------------------------------------
-
-func neg*(r: var ExtensionField, a: ExtensionField) =
-  ## Field out-of-place negation
-  for fR, fA in fields(r, a):
-    fR.neg(fA)
-
-func neg*(a: var ExtensionField) =
-  ## Field in-place negation
-  for fA in fields(a):
-    fA.neg()
-
-func `+=`*(a: var ExtensionField, b: ExtensionField) =
-  ## Addition in the extension field
-  for fA, fB in fields(a, b):
-    fA += fB
-
-func `-=`*(a: var ExtensionField, b: ExtensionField) =
-  ## Substraction in the extension field
-  for fA, fB in fields(a, b):
-    fA -= fB
-
-func double*(r: var ExtensionField, a: ExtensionField) =
-  ## Field out-of-place doubling
-  for fR, fA in fields(r, a):
-    fR.double(fA)
-
-func double*(a: var ExtensionField) =
-  ## Field in-place doubling
-  for fA in fields(a):
-    fA.double()
-
-func div2*(a: var ExtensionField) =
-  ## Field in-place division by 2
-  for fA in fields(a):
-    fA.div2()
-
-func sum*(r: var QuadraticExt, a, b: QuadraticExt) =
-  ## Sum ``a`` and ``b`` into ``r``
-  r.c0.sum(a.c0, b.c0)
-  r.c1.sum(a.c1, b.c1)
-
-func sum*(r: var CubicExt, a, b: CubicExt) =
-  ## Sum ``a`` and ``b`` into ``r``
-  r.c0.sum(a.c0, b.c0)
-  r.c1.sum(a.c1, b.c1)
-  r.c2.sum(a.c2, b.c2)
-
-func diff*(r: var QuadraticExt, a, b: QuadraticExt) =
-  ## Diff ``a`` and ``b`` into ``r``
-  r.c0.diff(a.c0, b.c0)
-  r.c1.diff(a.c1, b.c1)
-
-func diff*(r: var CubicExt, a, b: CubicExt) =
-  ## Diff ``a`` and ``b`` into ``r``
-  r.c0.diff(a.c0, b.c0)
-  r.c1.diff(a.c1, b.c1)
-  r.c2.diff(a.c2, b.c2)
-
-
-# Conditional arithmetic
-# -------------------------------------------------------------------
-
-func cneg*(a: var ExtensionField, ctl: SecretBool) =
-  ## Constant-time in-place conditional negation
-  ## Only negate if ctl is true
-  for fA in fields(a):
-    fA.cneg(ctl)
-
-func cadd*(a: var ExtensionField, b: ExtensionField, ctl: SecretBool) =
-  ## Constant-time in-place conditional addition
-  for fA, fB in fields(a, b):
-    fA.cadd(fB, ctl)
-
-func csub*(a: var ExtensionField, b: ExtensionField, ctl: SecretBool) =
-  ## Constant-time in-place conditional substraction
-  for fA, fB in fields(a, b):
-    fA.csub(fB, ctl)
-
-# Multiplication by a small integer known at compile-time
-# -------------------------------------------------------------------
-
-func `*=`*(a: var ExtensionField, b: static int) {.inline.} =
-  ## Multiplication by a small integer known at compile-time
-
-  const negate = b < 0
-  const b = if negate: -b
-            else: b
-  when negate:
-    a.neg(a)
-  when b == 0:
-    a.setZero()
-  elif b == 1:
-    return
-  elif b == 2:
-    a.double()
-  elif b == 3:
-    let t1 = a
-    a.double()
-    a += t1
-  elif b == 4:
-    a.double()
-    a.double()
-  elif b == 5:
-    let t1 = a
-    a.double()
-    a.double()
-    a += t1
-  elif b == 6:
-    a.double()
-    let t2 = a
-    a.double() # 4
-    a += t2
-  elif b == 7:
-    let t1 = a
-    a.double()
-    let t2 = a
-    a.double() # 4
-    a += t2
-    a += t1
-  elif b == 8:
-    a.double()
-    a.double()
-    a.double()
-  elif b == 9:
-    let t1 = a
-    a.double()
-    a.double()
-    a.double() # 8
-    a += t1
-  elif b == 10:
-    a.double()
-    let t2 = a
-    a.double()
-    a.double() # 8
-    a += t2
-  elif b == 11:
-    let t1 = a
-    a.double()
-    let t2 = a
-    a.double()
-    a.double() # 8
-    a += t2
-    a += t1
-  elif b == 12:
-    a.double()
-    a.double() # 4
-    let t4 = a
-    a.double() # 8
-    a += t4
-  else:
-    {.error: "Multiplication by this small int not implemented".}
diff --git a/constantine/tower_field_extensions/tower_instantiation.nim b/constantine/tower_field_extensions/tower_instantiation.nim
index 8502f58..f654d04 100644
--- a/constantine/tower_field_extensions/tower_instantiation.nim
+++ b/constantine/tower_field_extensions/tower_instantiation.nim
@@ -12,14 +12,32 @@
 
 import
   ../arithmetic,
-  ../config/curves,
+  ../config/[common, curves],
   ../io/io_fields,
-  ./tower_common,
-  ./quadratic_extensions,
-  ./cubic_extensions,
+  ./extension_fields,
   ./exponentiations
 
-export tower_common, quadratic_extensions, cubic_extensions, exponentiations
+export extension_fields, exponentiations
+
+# We assume that the sextic non-residues used to construct
+# the elliptic curve twists
+# match with the quadratic and cubic non-residues
+# chosen to construct the tower of extension fields.
+
+# 𝔽p
+# ----------------------------------------------------------------
+
+func `*=`*(a: var Fp, _: type NonResidue) {.inline.} =
+  ## Multiply an element of 𝔽p by the quadratic non-residue
+  ## chosen to construct 𝔽p2
+  static: doAssert Fp.C.getNonResidueFp() != -1, "𝔽p2 should be specialized for complex extension"
+  a *= Fp.C.getNonResidueFp()
+
+func prod*(r: var Fp, a: Fp, _: type NonResidue){.inline.} =
+  ## Multiply an element of 𝔽p by the quadratic non-residue
+  ## chosen to construct 𝔽p2
+  static: doAssert Fp.C.getNonResidueFp() != -1, "𝔽p2 should be specialized for complex extension"
+  r.prod(a, Fp.C.getNonResidueFp())
 
 # 𝔽p2
 # ----------------------------------------------------------------
@@ -28,53 +46,58 @@ type
   Fp2*[C: static Curve] = object
     c0*, c1*: Fp[C]
 
-  β = NonResidue
-    # Quadratic or Cubic non-residue
-
-  SexticNonResidue* = object
-
 template fromComplexExtension*[F](elem: F): static bool =
   ## Returns true if the input is a complex extension
   ## i.e. the irreducible polynomial chosen is
   ##   x² - µ with µ = -1
   ## and so 𝔽p2 = 𝔽p[x] / x² - µ = 𝔽p[𝑖]
-  when F is Fp2 and F.C.get_QNR_Fp() == -1:
+  when F is Fp2 and F.C.getNonResidueFp() == -1:
     true
   else:
     false
 
-func `*=`*(a: var Fp, _: typedesc[β]) {.inline.} =
-  ## Multiply an element of 𝔽p by the quadratic non-residue
-  ## chosen to construct 𝔽p2
-  static: doAssert Fp.C.get_QNR_Fp() != -1, "𝔽p2 should be specialized for complex extension"
-  a *= Fp.C.get_QNR_Fp()
+template mulCheckSparse*(a: var Fp2, b: Fp2) =
+  when b.isOne().bool:
+    discard
+  elif b.isMinusOne().bool:
+    a.neg()
+  elif b.c0.isZero().bool and b.c1.isOne().bool:
+    var t {.noInit.}: type(a.c0)
+    when fromComplexExtension(b):
+      t.neg(a.c1)
+      a.c1 = a.c0
+      a.c0 = t
+    else:
+      t.prod(a.c1, NonResidue)
+      a.c1 = a.c0
+      a.c0 = t
+  elif b.c0.isZero().bool and b.c1.isMinusOne().bool:
+    var t {.noInit.}: type(a.c0)
+    when fromComplexExtension(b):
+      t = a.c1
+      a.c1.neg(a.c0)
+      a.c0 = t
+    else:
+      t.prod(a.c1, NonResidue)
+      a.c1.neg(a.c0)
+      a.c0.neg(t)
+  elif b.c0.isZero().bool:
+    a.mul_sparse_by_0y(b)
+  elif b.c1.isZero().bool:
+    a.mul_sparse_by_x0(b)
+  else:
+    a *= b
 
-func `*`*(_: typedesc[β], a: Fp): Fp {.inline, noInit.} =
-  ## Multiply an element of 𝔽p by the quadratic non-residue
-  ## chosen to construct 𝔽p2
-  result = a
-  result *= β
-
-# TODO: rework the quad/cube/sextic non residue declaration
-
-func `*=`*(a: var Fp, _: typedesc[SexticNonResidue]) {.inline.} =
-  ## Multiply an element of 𝔽p by the sextic non-residue
-  ## chosen to construct 𝔽p6
-  a *= Fp.C.get_QNR_Fp()
-
-func `*`*(_: typedesc[SexticNonResidue], a: Fp): Fp {.inline, noInit.} =
-  ## Multiply an element of 𝔽p by the sextic non-residue
-  ## chosen to construct 𝔽p6
-  result = a
-  result *= SexticNonResidue
-
-func `*=`*(a: var Fp2, _: typedesc[SexticNonResidue]) {.inline.} =
-  ## Multiply an element of 𝔽p2 by the sextic non-residue
-  ## chosen to construct the sextic twist
+func prod*(r: var Fp2, a: Fp2, _: type NonResidue) {.inline.} =
+  ## Multiply an element of 𝔽p2 by the non-residue
+  ## chosen to construct the next extension or the twist:
+  ## - if quadratic non-residue: 𝔽p4
+  ## - if cubic non-residue: 𝔽p6
+  ## - if sextic non-residue: 𝔽p4, 𝔽p6 or 𝔽p12
   # Yet another const tuple unpacking bug
-  const u = Fp2.C.get_SNR_Fp2()[0] # Sextic non-residue to construct 𝔽p12
-  const v = Fp2.C.get_SNR_Fp2()[1]
-  const Beta {.used.} = Fp2.C.get_QNR_Fp()  # Quadratic non-residue to construct 𝔽p2
+  const u = Fp2.C.getNonResidueFp2()[0]
+  const v = Fp2.C.getNonResidueFp2()[1]
+  const Beta {.used.} = Fp2.C.getNonResidueFp()
   # ξ = u + v x
   # and x² = β
   #
@@ -82,24 +105,49 @@ func `*=`*(a: var Fp2, _: typedesc[SexticNonResidue]) {.inline.} =
   #                       => u c0 + β v c1 + (v c0 + u c1) x
   when a.fromComplexExtension() and u == 1 and v == 1:
     let t = a.c0
-    a.c0 -= a.c1
-    a.c1 += t
+    r.c0.diff(t, a.c1)
+    r.c1.sum(t, a.c1)
   else:
-    let a0 = a.c0
-    let a1 = a.c1
-    when a.fromComplexExtension():
-      a.c0.diff(u * a0, v * a1)
+    # Case:
+    # - BN254_Snarks, QNR_Fp: -1, SNR_Fp2: 9+1𝑖  (𝑖 = √-1)
+    # - BLS12_377, QNR_Fp: -5, SNR_Fp2: 0+1j    (j = √-5)
+    # - BW6_761, SNR_Fp: -4, CNR_Fp2: 0+1j      (j = √-4)
+    when u == 0:
+      # BLS12_377 and BW6_761, use small addition chain
+      r.mul_sparse_by_0y(a, v)
     else:
-      a.c0.sum(u * a0, (Beta * v) * a1)
-    a.c1.sum(v * a0, u * a1)
+      # BN254_Snarks, u = 9
+      # Full 𝔽p2 multiplication is cheaper than addition chains
+      # for u*c0 and u*c1
+      static:
+        doAssert u >= 0 and uint64(u) <= uint64(high(BaseType))
+        doAssert v >= 0 and uint64(v) <= uint64(high(BaseType))
+      # TODO: compile-time
+      var NR {.noInit.}: Fp2
+      NR.c0.fromUint(uint u)
+      NR.c1.fromUint(uint v)
+      r.prod(a, NR)
 
-func `/=`*(a: var Fp2, _: typedesc[SexticNonResidue]) {.inline.} =
-  ## Multiply an element of 𝔽p by the quadratic non-residue
-  ## chosen to construct sextic twist
+func `*=`*(a: var Fp2, _: type NonResidue) {.inline.} =
+  ## Multiply an element of 𝔽p2 by the non-residue
+  ## chosen to construct the next extension or the twist:
+  ## - if quadratic non-residue: 𝔽p4
+  ## - if cubic non-residue: 𝔽p6
+  ## - if sextic non-residue: 𝔽p4, 𝔽p6 or 𝔽p12
   # Yet another const tuple unpacking bug
-  const u = Fp2.C.get_SNR_Fp2()[0] # Sextic non-residue to construct 𝔽p12
-  const v = Fp2.C.get_SNR_Fp2()[1]
-  const Beta = Fp2.C.get_QNR_Fp()  # Quadratic non-residue to construct 𝔽p2
+  a.prod(a, NonResidue)
+
+func `/=`*(a: var Fp2, _: type NonResidue) {.inline.} =
+  ## Divide an element of 𝔽p by the non-residue
+  ## chosen to construct the next extension or the twist:
+  ## - if quadratic non-residue: 𝔽p4
+  ## - if cubic non-residue: 𝔽p6
+  ## - if sextic non-residue: 𝔽p4, 𝔽p6 or 𝔽p12
+  # Yet another const tuple unpacking bug
+  # Yet another const tuple unpacking bug
+  const u = Fp2.C.getNonresidueFp2()[0] # Sextic non-residue to construct 𝔽p12
+  const v = Fp2.C.getNonresidueFp2()[1]
+  const Beta = Fp2.C.getNonResidueFp()  # Quadratic non-residue to construct 𝔽p2
   # ξ = u + v x
   # and x² = β
   #
@@ -118,7 +166,7 @@ func `/=`*(a: var Fp2, _: typedesc[SexticNonResidue]) {.inline.} =
     a.c1 -= t
     a.div2()
   else:
-    let a0 = a.c0
+    var a0 = a.c0
     let a1 = a.c1
     const u2v2 = u*u - Beta*v*v # (u² - βv²)
     # TODO can be precomputed (or precompute b/µ the twist coefficient)
@@ -127,12 +175,18 @@ func `/=`*(a: var Fp2, _: typedesc[SexticNonResidue]) {.inline.} =
     u2v2inv.fromUint(u2v2)
     u2v2inv.inv()
 
-    a.c0.diff(u * a0, (Beta * v) * a1)
-    a.c1.diff(u * a1, v * a0)
+    a.c0 *= u
+    a.c1 *= Beta * v
+    a.c0 -= a.c1
+
+    a.c1 = a1
+    a.c1 *= u
+    a0 *= v
+    a.c1 -= a0
     a.c0 *= u2v2inv
     a.c1 *= u2v2inv
 
-# 𝔽p6
+# 𝔽p4 & 𝔽p6
 # ----------------------------------------------------------------
 
 type
@@ -142,53 +196,58 @@ type
   Fp6*[C: static Curve] = object
     c0*, c1*, c2*: Fp2[C]
 
-  ξ* = NonResidue
-    # We call the non-residue ξ on 𝔽p4/𝔽p6 to avoid confusion
-    # between non-residue
-    # of different tower level
+func prod*(r: var Fp4, a: Fp4, _: type NonResidue) =
+  ## Multiply an element of 𝔽p4 by the non-residue
+  ## chosen to construct the next extension or the twist:
+  ## - if quadratic non-residue: 𝔽p8
+  ## - if cubic non-residue: 𝔽p12
+  ## - if sextic non-residue: 𝔽p8, 𝔽p12 or 𝔽p24
+  ##
+  ## Assumes that it is sqrt(NonResidue_Fp2)
+  let t = a.c0
+  r.c0.prod(a.c1, NonResidue)
+  r.c1 = t
 
-func `*`*(_: typedesc[ξ], a: Fp2): Fp2 {.inline, noInit.} =
-  ## Multiply an element of 𝔽p2 by the quadratic and cubic non-residue
-  ## chosen to construct 𝔽p4/𝔽p6
-  # Yet another const tuple unpacking bug
-  const u = Fp2.C.get_CNR_Fp2()[0]         # Quadratic & Cubic non-residue to construct 𝔽p4/𝔽p6
-  const v = Fp2.C.get_CNR_Fp2()[1]
-  const Beta {.used.} = Fp2.C.get_QNR_Fp() # Quadratic non-residue to construct 𝔽p2
-  # ξ = u + v x
-  # and x² = β
-  #
-  # (c0 + c1 x) (u + v x) => u c0 + (u c0 + u c1)x + v c1 x²
-  #                       => u c0 + β v c1 + (v c0 + u c1) x
+func `*=`*(a: var Fp4, _: type NonResidue) {.inline.} =
+  ## Multiply an element of 𝔽p4 by the non-residue
+  ## chosen to construct the next extension or the twist:
+  ## - if quadratic non-residue: 𝔽p8
+  ## - if cubic non-residue: 𝔽p12
+  ## - if sextic non-residue: 𝔽p8, 𝔽p12 or 𝔽p24
+  ##
+  ## Assumes that it is sqrt(NonResidue_Fp2)
+  a.prod(a, NonResidue)
 
-  # TODO: check code generated when ξ = 1 + 𝑖
-  #       The mul by constant are inline but
-  #       since we don't have __restrict tag
-  #       and we use arrays (which decay into pointer)
-  #       the compiler might not elide the temporary
-  when a.fromComplexExtension():
-    result.c0.diff(u * a.c0, v * a.c1)
-  else:
-    result.c0.sum(u * a.c0, (Beta * v) * a.c1)
-  result.c1.sum(v * a.c0, u * a.c1 )
+func prod*(r: var Fp6, a: Fp6, _: type NonResidue) {.inline.} =
+  ## Multiply an element of 𝔽p4 by the non-residue
+  ## chosen to construct the next extension or the twist:
+  ## - if quadratic non-residue: 𝔽p12
+  ## - if cubic non-residue: 𝔽p18
+  ## - if sextic non-residue: 𝔽p12, 𝔽p18 or 𝔽p36
+  ##
+  ## Assumes that it is cube_root(NonResidue_Fp2)
+  ##
+  ## For all curves γ = v with v the factor for 𝔽p6 coordinate
+  ## and v³ = ξ
+  ## (c0 + c1 v + c2 v²) v => ξ c2 + c0 v + c1 v²
+  let t = a.c2
+  r.c1 = a.c0
+  r.c2 = a.c1
+  t.c0.prod(t, NonResidue)
 
-func `*=`*(a: var Fp2, _: typedesc[ξ]) {.inline.} =
-  ## Multiply an element of 𝔽p2 by the quadratic non-residue
-  ## chosen to construct 𝔽p6
-  # Yet another const tuple unpacking bug
-  const u = Fp2.C.get_CNR_Fp2()[0] # Cubic non-residue to construct 𝔽p6
-  const v = Fp2.C.get_CNR_Fp2()[1]
-  const Beta {.used.} = Fp2.C.get_QNR_Fp()  # Quadratic non-residue to construct 𝔽p2
-  # ξ = u + v x
-  # and x² = β
-  #
-  # (c0 + c1 x) (u + v x) => u c0 + (u c0 + u c1)x + v c1 x²
-  #                       => u c0 + β v c1 + (v c0 + u c1) x
-  when a.fromComplexExtension() and u == 1 and v == 1:
-    let t = a.c0
-    a.c0 -= a.c1
-    a.c1 += t
-  else: # TODO: optim for inline
-    a = ξ * a
+func `*=`*(a: var Fp6, _: type NonResidue) {.inline.} =
+  ## Multiply an element of 𝔽p4 by the non-residue
+  ## chosen to construct the next extension or the twist:
+  ## - if quadratic non-residue: 𝔽p12
+  ## - if cubic non-residue: 𝔽p18
+  ## - if sextic non-residue: 𝔽p12, 𝔽p18 or 𝔽p36
+  ##
+  ## Assumes that it is cube_root(NonResidue_Fp2)
+  ##
+  ## For all curves γ = v with v the factor for 𝔽p6 coordinate
+  ## and v³ = ξ
+  ## (c0 + c1 v + c2 v²) v => ξ c2 + c0 v + c1 v²
+  a.prod(a, NonResidue)
 
 # 𝔽p12
 # ----------------------------------------------------------------
@@ -198,32 +257,6 @@ type
     c0*, c1*, c2*: Fp4[C]
     # c0*, c1*: Fp6[C]
 
-  γ = NonResidue
-    # We call the non-residue γ (Gamma) on 𝔽p6 to avoid confusion between non-residue
-    # of different tower level
-
-func `*`*(_: typedesc[γ], a: Fp4): Fp4 {.noInit, inline.} =
-  ## Multiply an element of 𝔽p4 by the sextic non-residue
-  ## chosen to construct 𝔽p12
-  result.c0 = ξ * a.c1
-  result.c1 = a.c0
-
-func `*=`*(a: var Fp4, _: typedesc[γ]) {.inline.} =
-  a = γ * a
-
-func `*`*(_: typedesc[γ], a: Fp6): Fp6 {.noInit, inline.} =
-  ## Multiply an element of 𝔽p6 by the cubic non-residue
-  ## chosen to construct 𝔽p12
-  ## For all curves γ = v with v the factor for 𝔽p6 coordinate
-  ## and v³ = ξ
-  ## (c0 + c1 v + c2 v²) v => ξ c2 + c0 v + c1 v²
-  result.c0 = ξ * a.c2
-  result.c1 = a.c0
-  result.c2 = a.c1
-
-func `*=`*(a: var Fp6, _: typedesc[γ]) {.inline.} =
-  a = γ * a
-
 # Sparse functions
 # ----------------------------------------------------------------
 
@@ -238,13 +271,6 @@ func mul_sparse_by_y0*[C: static Curve](r: var Fp4[C], a: Fp4[C], b: Fp2[C]) =
   r.c0.prod(a.c0, b)
   r.c1.prod(a.c1, b)
 
-func mul_sparse_by_0y*[C: static Curve](r: var Fp4[C], a: Fp4[C], b: Fp2[C]) =
-  ## Sparse multiplication of an Fp4 element
-  ## with coordinates (a₀, a₁) by (0, b₁)
-  r.c0.prod(a.c1, b)
-  r.c0 *= NonResidue
-  r.c1.prod(a.c0, b)
-
 func mul_sparse_by_0y0*[C: static Curve](r: var Fp6[C], a: Fp6[C], b: Fp2[C]) =
   ## Sparse multiplication of an Fp6 element
   ## with coordinates (a₀, a₁, a₂) by (0, b₁, 0)
@@ -265,6 +291,6 @@ func mul_sparse_by_0y0*[C: static Curve](r: var Fp6[C], a: Fp6[C], b: Fp2[C]) =
   #    = a1 b1
 
   r.c0.prod(a.c2, b)
-  r.c0 *= ξ
+  r.c0 *= NonResidue
   r.c1.prod(a.c0, b)
   r.c2.prod(a.c1, b)
diff --git a/docs/optimizations.md b/docs/optimizations.md
index eeba714..023119b 100644
--- a/docs/optimizations.md
+++ b/docs/optimizations.md
@@ -75,6 +75,9 @@ The optimizations can be of algebraic, algorithmic or "implementation details" n
   - [ ] x86: MULX, ADCX, ADOX instructions
   - [ ] no-carry optimization for CIOS (Coarsely Integrated Operand Scanning)
 
+- Addition chains
+  - [ ] unreduced squarings/multiplications in addition chains
+
 - Exponentiation
   - [x] variable-time exponentiation
   - [x] fixed window optimization _(sliding windows are not constant-time)_
@@ -158,19 +161,25 @@ The optimizations can be of algebraic, algorithmic or "implementation details" n
 - Line functions
   - [x] Homogeneous projective coordinates
     - [x] D-Twist
+      - [x] Fused line add + elliptic curve add
+      - [x] Fused line double + elliptic curve double
     - [x] M-Twist
-    - [x] Fused line add + elliptic curve add
-    - [x] Fused line double + elliptic curve double
+      - [x] Fused line add + elliptic curve add
+      - [x] Fused line double + elliptic curve double
   - [ ] Jacobian projective coordinates
     - [ ] D-Twist
+      - [ ] Fused line add + elliptic curve add
+      - [ ] Fused line double + elliptic curve double
     - [ ] M-Twist
-    - [ ] Fused line add + elliptic curve add
-    - [ ] Fused line double + elliptic curve double
+      - [ ] Fused line add + elliptic curve add
+      - [ ] Fused line double + elliptic curve double
   - [x] Sparse multiplication line * Gₜ element
-    - [x] 6-way sparse
-    - [ ] Pseudo 8-sparse
     - [x] D-Twist
+      - [x] 6-way sparse
+      - [ ] Pseudo 8-sparse
     - [x] M-Twist
+      - [x] 6-way sparse
+      - [ ] Pseudo 8-sparse
 
 - Miller Loop
   - [x] NAF recoding
diff --git a/helpers/prng_unsafe.nim b/helpers/prng_unsafe.nim
index b156d79..4845957 100644
--- a/helpers/prng_unsafe.nim
+++ b/helpers/prng_unsafe.nim
@@ -15,7 +15,7 @@ import
     ec_shortweierstrass_projective,
     ec_shortweierstrass_jacobian],
   ../constantine/io/io_bigints,
-  ../constantine/tower_field_extensions/tower_common
+  ../constantine/tower_field_extensions/extension_fields
 
 # ############################################################
 #
diff --git a/tests/t_finite_fields.nim b/tests/t_finite_fields.nim
index 37b5362..29d9bc0 100644
--- a/tests/t_finite_fields.nim
+++ b/tests/t_finite_fields.nim
@@ -25,7 +25,6 @@ proc main() =
         y.fromUint(10'u32)
         z.fromUint(90'u32)
 
-        let u = x + y
         x += y
 
         var x_bytes: array[8, byte]
@@ -34,7 +33,6 @@ proc main() =
         check:
           # Check equality in the Montgomery domain
           bool(z == x)
-          bool(z == u)
           # Check equality when converting back to natural domain
           90'u64 == cast[uint64](x_bytes)
 
@@ -45,7 +43,6 @@ proc main() =
         y.fromUint(21'u32)
         z.fromUint(0'u32)
 
-        let u = x + y
         x += y
 
         var x_bytes: array[8, byte]
@@ -54,7 +51,6 @@ proc main() =
         check:
           # Check equality in the Montgomery domain
           bool(z == x)
-          bool(z == u)
           # Check equality when converting back to natural domain
           0'u64 == cast[uint64](x_bytes)
 
@@ -65,7 +61,6 @@ proc main() =
         y.fromUint(22'u32)
         z.fromUint(1'u32)
 
-        let u = x + y
         x += y
 
         var x_bytes: array[8, byte]
@@ -74,7 +69,6 @@ proc main() =
         check:
           # Check equality in the Montgomery domain
           bool(z == x)
-          bool(z == u)
           # Check equality when converting back to natural domain
           1'u64 == cast[uint64](x_bytes)
 
@@ -86,7 +80,6 @@ proc main() =
         y.fromUint(10'u32)
         z.fromUint(70'u32)
 
-        let u = x - y
         x -= y
 
         var x_bytes: array[8, byte]
@@ -95,7 +88,6 @@ proc main() =
         check:
           # Check equality in the Montgomery domain
           bool(z == x)
-          bool(z == u)
           # Check equality when converting back to natural domain
           70'u64 == cast[uint64](x_bytes)
 
@@ -106,7 +98,6 @@ proc main() =
         y.fromUint(80'u32)
         z.fromUint(0'u32)
 
-        let u = x - y
         x -= y
 
         var x_bytes: array[8, byte]
@@ -115,7 +106,6 @@ proc main() =
         check:
           # Check equality in the Montgomery domain
           bool(z == x)
-          bool(z == u)
           # Check equality when converting back to natural domain
           0'u64 == cast[uint64](x_bytes)
 
@@ -126,7 +116,6 @@ proc main() =
         y.fromUint(81'u32)
         z.fromUint(100'u32)
 
-        let u = x - y
         x -= y
 
         var x_bytes: array[8, byte]
@@ -135,19 +124,18 @@ proc main() =
         check:
           # Check equality in the Montgomery domain
           bool(z == x)
-          bool(z == u)
           # Check equality when converting back to natural domain
           100'u64 == cast[uint64](x_bytes)
 
     test "Multiplication mod 101":
       block:
-        var x, y, z: Fp[Fake101]
+        var x, y, z, r: Fp[Fake101]
 
         x.fromUint(10'u32)
         y.fromUint(10'u32)
         z.fromUint(100'u32)
 
-        let r = x * y
+        r.prod(x, y)
 
         var r_bytes: array[8, byte]
         r_bytes.exportRawUint(r, cpuEndian)
@@ -159,13 +147,13 @@ proc main() =
           100'u64 == cast[uint64](r_bytes)
 
       block:
-        var x, y, z: Fp[Fake101]
+        var x, y, z, r: Fp[Fake101]
 
         x.fromUint(10'u32)
         y.fromUint(11'u32)
         z.fromUint(9'u32)
 
-        let r = x * y
+        r.prod(x, y)
 
         var r_bytes: array[8, byte]
         r_bytes.exportRawUint(r, cpuEndian)
@@ -275,13 +263,13 @@ proc main() =
 
     test "Multiplication mod 2^61 - 1":
       block:
-        var x, y, z: Fp[Mersenne61]
+        var x, y, z, r: Fp[Mersenne61]
 
         x.fromUint(10'u32)
         y.fromUint(10'u32)
         z.fromUint(100'u32)
 
-        let r = x * y
+        r.prod(x, y)
 
         var r_bytes: array[8, byte]
         r_bytes.exportRawUint(r, cpuEndian)
@@ -294,13 +282,13 @@ proc main() =
           cast[uint64](r_bytes) == 100'u64
 
       block:
-        var x, y, z: Fp[Mersenne61]
+        var x, y, z, r: Fp[Mersenne61]
 
         x.fromUint(1'u32 shl 31)
         y.fromUint(1'u32 shl 31)
         z.fromUint(2'u32)
 
-        let r = x * y
+        r.prod(x, y)
 
         var r_bytes: array[8, byte]
         r_bytes.exportRawUint(r, cpuEndian)