Fast modular inversion (#172)

* split modular inversion in its own file * Stash fast GCD inversion https://eprint.iacr.org/2020/972.pdf * Stash Pornin's bingcd -> issue with inner modular reduction * Implement Bernstein-Yang inversion * Avoid Nim checks on signed integers (32-bit runtime issue) * cleanup: remove old inversion impls * cleanup: static moduli, move div2 * small comments (skip ci) * comment cleanup (skip ci) * fix total iterations on 32-bit * Add batch conversion to affine coordinates using simultaneous inversion trick * fix conditional setZero and batchAffine conversion * cleanup unneeded branches following affine conversion unification * Fix batchAffine with zero inputs and add fuzz failure to test suite
2025-02-04 15:03:36 +00:00 · 2022-02-10 14:05:07 +01:00 · 2022-02-10 14:05:07 +01:00 · 53c4db7ead
commit 53c4db7ead
parent c02e6bdf84
65 changed files with 1655 additions and 1679 deletions
--- a/benchmarks/bench_elliptic_template.nim
+++ b/benchmarks/bench_elliptic_template.nim
@ -68,10 +68,7 @@ proc mixedAddBench*(T: typedesc, iters: int) =
  let P = rng.random_unsafe(T)
  let Q = rng.random_unsafe(T)
  var Qaff: ECP_ShortW_Aff[T.F, T.G]
-  when Q is ECP_ShortW_Prj:
-    Qaff.affineFromProjective(Q)
-  else:
-    Qaff.affineFromJacobian(Q)
+  Qaff.affine(Q)
  bench("EC Mixed Addition " & G1_or_G2, T, iters):
    r.madd(P, Qaff)

@ -87,14 +84,14 @@ proc affFromProjBench*(T: typedesc, iters: int) =
  var r {.noInit.}: ECP_ShortW_Aff[T.F, T.G]
  let P = rng.random_unsafe(T)
  bench("EC Projective to Affine " & G1_or_G2, T, iters):
-    r.affineFromProjective(P)
+    r.affine(P)

 proc affFromJacBench*(T: typedesc, iters: int) =
  const G1_or_G2 = when T.F is Fp: "G1" else: "G2"
  var r {.noInit.}: ECP_ShortW_Aff[T.F, T.G]
  let P = rng.random_unsafe(T)
  bench("EC Jacobian to Affine " & G1_or_G2, T, iters):
-    r.affineFromJacobian(P)
+    r.affine(P)

 proc scalarMulGenericBench*(T: typedesc, window: static int, iters: int) =
  const bits = T.F.C.getCurveOrderBitwidth()
--- a/benchmarks/bench_fields_template.nim
+++ b/benchmarks/bench_fields_template.nim
@ -95,30 +95,9 @@ proc invBench*(T: typedesc, iters: int) =
  var r: T
  let x = rng.random_unsafe(T)
  preventOptimAway(r)
-  bench("Inversion (constant-time default impl)", T, iters):
+  bench("Inversion (constant-time)", T, iters):
    r.inv(x)

-proc invEuclidBench*(T: typedesc, iters: int) =
-  var r: T
-  let x = rng.random_unsafe(T)
-  preventOptimAway(r)
-  bench("Inversion (constant-time Euclid)", T, iters):
-    r.inv_euclid(x)
-
-proc invPowFermatBench*(T: typedesc, iters: int) =
-  let x = rng.random_unsafe(T)
-  const exponent = T.getInvModExponent()
-  bench("Inversion (exponentiation p-2, Little Fermat)", T, iters):
-    var r = x
-    r.powUnsafeExponent(exponent)
-
-proc invAddChainBench*(T: typedesc, iters: int) =
-  var r: T
-  let x = rng.random_unsafe(T)
-  preventOptimAway(r)
-  bench("Inversion (addition chain)", T, iters):
-    r.inv_addchain(x)
-
 proc sqrtBench*(T: typedesc, iters: int) =
  let x = rng.random_unsafe(T)

--- a/benchmarks/bench_fp.nim
+++ b/benchmarks/bench_fp.nim
@ -11,7 +11,7 @@ import
  ../constantine/config/[curves, common],
  ../constantine/arithmetic,
  ../constantine/io/io_bigints,
-  ../constantine/curves/[zoo_inversions, zoo_square_roots],
+  ../constantine/curves/zoo_square_roots,
  # Helpers
  ../helpers/static_for,
  ./bench_fields_template
@ -50,8 +50,7 @@ proc main() =
    mulBench(Fp[curve], Iters)
    sqrBench(Fp[curve], Iters)
    smallSeparator()
-    invEuclidBench(Fp[curve], ExponentIters)
-    invPowFermatBench(Fp[curve], ExponentIters)
+    invBench(Fp[curve], ExponentIters)
    sqrtBench(Fp[curve], ExponentIters)
    sqrtRatioBench(Fp[curve], ExponentIters)
    # Exponentiation by a "secret" of size ~the curve order
--- a/benchmarks/bench_hash_to_curve.nim
+++ b/benchmarks/bench_hash_to_curve.nim
@ -64,7 +64,7 @@ proc bench_BLS12_381_proj_aff_conversion(iters: int) =
  )

  bench("Proj->Affine conversion (for pairing)", BLS12_381, iters):
-    Paff.affineFromProjective(P)
+    Paff.affine(P)

 const Iters = 1000

--- a/benchmarks/bench_pairing_template.nim
+++ b/benchmarks/bench_pairing_template.nim
@ -51,9 +51,9 @@ func clearCofactor[F; G: static Subgroup](
       ec: var ECP_ShortW_Aff[F, G]) =
  # For now we don't have any affine operation defined
  var t {.noInit.}: ECP_ShortW_Prj[F, G]
-  t.projectiveFromAffine(ec)
+  t.fromAffine(ec)
  t.clearCofactor()
-  ec.affineFromProjective(t)
+  ec.affine(t)

 func random_point*(rng: var RngState, EC: typedesc): EC {.noInit.} =
  result = rng.random_unsafe(EC)
--- a/benchmarks/bench_summary_template.nim
+++ b/benchmarks/bench_summary_template.nim
@ -86,9 +86,9 @@ func clearCofactorReference[F; G: static Subgroup](
       ec: var ECP_ShortW_Aff[F, G]) =
  # For now we don't have any affine operation defined
  var t {.noInit.}: ECP_ShortW_Prj[F, G]
-  t.projectiveFromAffine(ec)
+  t.fromAffine(ec)
  t.clearCofactorReference()
-  ec.affineFromProjective(t)
+  ec.affine(t)

 func random_point*(rng: var RngState, EC: typedesc): EC {.noInit.} =
  result = rng.random_unsafe(EC)
@ -136,10 +136,7 @@ proc mixedAddBench*(T: typedesc, iters: int) =
  let P = rng.random_unsafe(T)
  let Q = rng.random_unsafe(T)
  var Qaff: ECP_ShortW_Aff[T.F, T.G]
-  when Q is ECP_ShortW_Prj:
-    Qaff.affineFromProjective(Q)
-  else:
-    Qaff.affineFromJacobian(Q)
+  Qaff.affine(Q)
  bench("EC Mixed Addition " & G1_or_G2, T, iters):
    r.madd(P, Qaff)

--- a/constantine.nimble
+++ b/constantine.nimble
@ -28,6 +28,7 @@ const testDesc: seq[tuple[path: string, useGMP: bool]] = @[
  # Big ints
  # ----------------------------------------------------------
  ("tests/t_io_bigints.nim", false),
+  ("tests/t_io_unsaturated.nim", false),
  ("tests/t_bigints.nim", false),
  ("tests/t_bigints_multimod.nim", false),
  ("tests/t_bigints_mod_vs_gmp.nim", true),
@ -64,6 +65,11 @@ const testDesc: seq[tuple[path: string, useGMP: bool]] = @[
  ("tests/t_fp4_frobenius.nim", false),
  ("tests/t_fp6_frobenius.nim", false),
  ("tests/t_fp12_frobenius.nim", false),
+
+  # Elliptic curve arithmetic 
+  # ----------------------------------------------------------
+  ("tests/t_ec_conversion.nim", false),
+
  # Elliptic curve arithmetic G1
  # ----------------------------------------------------------
  # ("tests/t_ec_shortw_prj_g1_add_double.nim", false),
--- a/constantine/arithmetic.nim
+++ b/constantine/arithmetic.nim
@ -10,7 +10,6 @@ import
  arithmetic/[bigints, bigints_montgomery],
  arithmetic/[
    finite_fields,
-    finite_fields_inversion,
    finite_fields_square_root,
    finite_fields_double_precision
  ]
@ -19,6 +18,5 @@ export
  bigints,
  bigints_montgomery,
  finite_fields,
-  finite_fields_inversion,
  finite_fields_square_root,
  finite_fields_double_precision
--- a/constantine/arithmetic/assembly/limbs_asm_montmul_x86_adx_bmi2.nim
+++ b/constantine/arithmetic/assembly/limbs_asm_montmul_x86_adx_bmi2.nim
@ -147,7 +147,7 @@ proc partialRedx(
    ctx.comment "  Reduction"
    ctx.comment "  m = t[0] * m0ninv mod 2^w"
    ctx.mov  rdx, t[0]
-    ctx.mulx S, rdx, m0ninv, rdx # (S, RDX) <- m0ninv * RDX
+    ctx.imul rdx, m0ninv

    # Clear carry flags - TODO: necessary?
    ctx.`xor` S, S
--- a/constantine/arithmetic/bigints.nim
+++ b/constantine/arithmetic/bigints.nim
@ -11,8 +11,8 @@ import
  ../primitives,
  ./limbs,
  ./limbs_extmul,
-  ./limbs_modular,
-  ./limbs_montgomery
+  ./limbs_invmod,
+  ./limbs_division

 export BigInt

@ -79,9 +79,9 @@ func setUint*(a: var BigInt, n: SomeUnsignedInt) =
  ## Set a BigInt to a machine-sized integer ``n``
  a.limbs.setUint(n)

-func czero*(a: var BigInt, ctl: SecretBool) =
+func csetZero*(a: var BigInt, ctl: SecretBool) =
  ## Set ``a`` to 0 if ``ctl`` is true
-  a.limbs.czero(ctl)
+  a.limbs.csetZero(ctl)

 # Copy
 # ------------------------------------------------------------
@ -450,35 +450,36 @@ func reduce*[aBits, mBits](r: var BigInt[mBits], a: BigInt[aBits], M: BigInt[mBi
  # pass a pointer+length to a fixed session of the BSS.
  reduce(r.limbs, a.limbs, aBits, M.limbs, mBits)

-func div2_modular*[bits](a: var BigInt[bits], mp1div2: BigInt[bits]) =
-  ## Compute a <- a/2 (mod M)
-  ## `mp1div2` is the modulus (M+1)/2
-  ##
-  ## Normally if `a` is odd we add the modulus before dividing by 2
-  ## but this may overflow and we might lose a bit before shifting.
-  ## Instead we shift first and then add half the modulus rounded up
-  ##
-  ## Assuming M is odd, `mp1div2` can be precomputed without
-  ## overflowing the "Limbs" by dividing by 2 first
-  ## and add 1
-  ## Otherwise `mp1div2` should be M/2
-  a.limbs.div2_modular(mp1div2.limbs)
-
-func steinsGCD*[bits](r: var BigInt[bits], a, F, M, mp1div2: BigInt[bits]) =
-  ## Compute F multiplied the modular inverse of ``a`` modulo M
-  ## r ≡ F . a^-1 (mod M)
+func invmod*[bits](
+       r: var BigInt[bits],
+       a, F, M: BigInt[bits]) =
+  ## Compute the modular inverse of ``a`` modulo M
+  ## r ≡ F.a⁻¹ (mod M)
  ##
  ## M MUST be odd, M does not need to be prime.
  ## ``a`` MUST be less than M.
-  r.limbs.steinsGCD(a.limbs, F.limbs, M.limbs, bits, mp1div2.limbs)
+  r.limbs.invmod(a.limbs, F.limbs, M.limbs, bits)

-func invmod*[bits](r: var BigInt[bits], a, M, mp1div2: BigInt[bits]) =
+func invmod*[bits](
+       r: var BigInt[bits],
+       a: BigInt[bits],
+       F, M: static BigInt[bits]) =
+  ## Compute the modular inverse of ``a`` modulo M
+  ## r ≡ F.a⁻¹ (mod M)
+  ## 
+  ## with F and M known at compile-time
+  ##
+  ## M MUST be odd, M does not need to be prime.
+  ## ``a`` MUST be less than M.
+  r.limbs.invmod(a.limbs, F.limbs, M.limbs, bits)
+
+func invmod*[bits](r: var BigInt[bits], a, M: BigInt[bits]) =
  ## Compute the modular inverse of ``a`` modulo M
  ##
  ## The modulus ``M`` MUST be odd
  var one {.noInit.}: BigInt[bits]
  one.setOne()
-  r.steinsGCD(a, one, M, mp1div2)
+  r.invmod(a, one, M)

 {.pop.} # inline
 {.pop.} # raises no exceptions
--- a/constantine/arithmetic/bigints_montgomery.nim
+++ b/constantine/arithmetic/bigints_montgomery.nim
@ -11,7 +11,6 @@ import
  ../primitives,
  ../io/io_bigints,
  ./limbs,
-  ./limbs_modular,
  ./limbs_montgomery,
  ./bigints

--- a/constantine/arithmetic/finite_fields.nim
+++ b/constantine/arithmetic/finite_fields.nim
@ -231,25 +231,29 @@ func neg*(r: var FF, a: FF) {.meter.} =
    var t {.noInit.}: FF
    let isZero = a.isZero()
    discard t.mres.diff(FF.fieldMod(), a.mres)
-    t.mres.czero(isZero)
+    t.mres.csetZero(isZero)
    r = t

 func neg*(a: var FF) {.meter.} =
  ## Negate modulo p
  a.neg(a)

-func div2*(a: var FF) {.meter.} =
-  ## Modular division by 2
-  a.mres.div2_modular(FF.getPrimePlus1div2())
-
 # ############################################################
 #
 #                Field arithmetic conditional
 #
 # ############################################################

+func csetZero*(a: var FF, ctl: SecretBool) =
+  ## Set ``a`` to 0 if ``ctl`` is true
+  a.mres.csetZero(ctl)
+
+func csetOne*(a: var FF, ctl: SecretBool) =
+  ## Set ``a`` to 1 if ``ctl`` is true
+  a.mres.ccopy(FF.getMontyOne(), ctl)
+
 func cneg*(r: var FF, a: FF, ctl: SecretBool) {.meter.} =
-  ## Constant-time in-place conditional negation
+  ## Constant-time out-of-place conditional negation
  ## The negation is only performed if ctl is "true"
  r.neg(a)
  r.ccopy(a, not ctl)
@ -261,19 +265,68 @@ func cneg*(a: var FF, ctl: SecretBool) {.meter.} =
  a.cneg(t, ctl)

 func cadd*(a: var FF, b: FF, ctl: SecretBool) {.meter.} =
-  ## Constant-time in-place conditional addition
+  ## Constant-time out-place conditional addition
  ## The addition is only performed if ctl is "true"
  var t = a
  t += b
  a.ccopy(t, ctl)

 func csub*(a: var FF, b: FF, ctl: SecretBool) {.meter.} =
-  ## Constant-time in-place conditional substraction
+  ## Constant-time out-place conditional substraction
  ## The substraction is only performed if ctl is "true"
  var t = a
  t -= b
  a.ccopy(t, ctl)

+# ############################################################
+#
+#           Field arithmetic division and inversion
+#
+# ############################################################
+
+func div2*(a: var FF) {.meter.} =
+  ## Modular division by 2
+  ## `a` will be divided in-place
+  #
+  # Normally if `a` is odd we add the modulus before dividing by 2
+  # but this may overflow and we might lose a bit before shifting.
+  # Instead we shift first and then add half the modulus rounded up
+  #
+  # Assuming M is odd, `mp1div2` can be precomputed without
+  # overflowing the "Limbs" by dividing by 2 first
+  # and add 1
+  # Otherwise `mp1div2` should be M/2
+
+  # if a.isOdd:
+  #   a += M
+  # a = a shr 1
+  let wasOdd = a.mres.isOdd()
+  a.mres.shiftRight(1)
+  let carry {.used.} = a.mres.cadd(FF.getPrimePlus1div2(), wasOdd)
+  
+  # a < M -> a/2 <= M/2:
+  #   a/2 + M/2 < M if a is odd
+  #   a/2       < M if a is even
+  debug: doAssert not carry.bool
+
+func inv*(r: var FF, a: FF) =
+  ## Inversion modulo p
+  ##
+  ## The inverse of 0 is 0.
+  ## Incidentally this avoids extra check
+  ## to convert Jacobian and Projective coordinates
+  ## to affine for elliptic curve
+  r.mres.invmod(a.mres, FF.getR2modP(), FF.fieldMod())
+
+func inv*(a: var FF) =
+  ## Inversion modulo p
+  ##
+  ## The inverse of 0 is 0.
+  ## Incidentally this avoids extra check
+  ## to convert Jacobian and Projective coordinates
+  ## to affine for elliptic curve
+  a.inv(a)
+
 # ############################################################
 #
 #               Field arithmetic exponentiation
--- a/constantine/arithmetic/finite_fields_double_precision.nim
+++ b/constantine/arithmetic/finite_fields_double_precision.nim
@ -162,7 +162,7 @@ func neg2xMod*(r: var FpDbl, a: FpDbl) =
      subB(borrow, t.limbs2x[i], M.limbs[i-N], a.limbs2x[i], borrow)

    # Zero the result if input was zero
-    t.limbs2x.czero(isZero)
+    t.limbs2x.csetZero(isZero)
    r = t

 func prod2xImpl(
--- a/constantine/arithmetic/finite_fields_inversion.nim
+++ b/constantine/arithmetic/finite_fields_inversion.nim
@ -1,64 +0,0 @@
-# Constantine
-# Copyright (c) 2018-2019    Status Research & Development GmbH
-# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
-# Licensed and distributed under either of
-#   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
-#   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
-# at your option. This file may not be copied, modified, or distributed except according to those terms.
-
-import
-  ../config/[curves, type_ff],
-  ./bigints,
-  ../curves/zoo_inversions
-
-export zoo_inversions
-
-# ############################################################
-#
-#                 Finite field inversion
-#
-# ############################################################
-
-# No exceptions allowed
-{.push raises: [].}
-{.push inline.}
-
-func inv_euclid*(r: var FF, a: FF) =
-  ## Inversion modulo p via
-  ## Niels Moller constant-time version of
-  ## Stein's GCD derived from extended binary Euclid algorithm
-  r.mres.steinsGCD(a.mres, FF.getR2modP(), FF.fieldMod(), FF.getPrimePlus1div2())
-
-func inv*(r: var FF, a: FF) =
-  ## Inversion modulo p
-  ##
-  ## The inverse of 0 is 0.
-  ## Incidentally this avoids extra check
-  ## to convert Jacobian and Projective coordinates
-  ## to affine for elliptic curve
-  # For now we don't activate the addition chains
-  # Performance is slower than Euclid-based inversion on newer CPUs
-  #
-  # - Montgomery multiplication/squaring can skip the final substraction
-  # - For generalized Mersenne Prime curves, modular reduction can be made extremely cheap.
-  # - For BW6-761 the addition chain is over 2x slower than Euclid-based inversion
-  #   due to multiplication being so costly with 12 limbs (grows quadratically)
-  #   while Euclid costs grows linearly.
-  when false and
-       FF is Fp and FF.C.hasInversionAddchain() and
-       FF.C notin {BW6_761}:
-    r.inv_addchain(a)
-  else:
-    r.inv_euclid(a)
-
-func inv*(a: var FF) =
-  ## Inversion modulo p
-  ##
-  ## The inverse of 0 is 0.
-  ## Incidentally this avoids extra check
-  ## to convert Jacobian and Projective coordinates
-  ## to affine for elliptic curve
-  a.inv(a)
-
-{.pop.} # inline
-{.pop.} # raises no exceptions
--- a/constantine/arithmetic/limbs.nim
+++ b/constantine/arithmetic/limbs.nim
@ -85,12 +85,11 @@ func setUint*(a: var Limbs, n: SomeUnsignedInt) =
    when a.len > 2:
      zeroMem(a[2].addr, (a.len - 2) * sizeof(SecretWord))

-func czero*(a: var Limbs, ctl: SecretBool) =
+func csetZero*(a: var Limbs, ctl: SecretBool) =
  ## Set ``a`` to 0 if ``ctl`` is true
-  # Only used for FF neg in pure Nim fallback
-  # so no need for assembly
+  let mask = -(SecretWord(ctl) xor One)
  for i in 0 ..< a.len:
-    ctl.ccopy(a[i], Zero)
+    a[i] = a[i] and mask

 # Copy
 # ------------------------------------------------------------
@ -330,6 +329,26 @@ func cneg*(a: var Limbs, ctl: CTBool) =
    carry = SecretWord(t < carry)    # Carry on
    a[i] = t

+func cneg*(r: var Limbs, a: Limbs, ctl: CTBool) =
+  ## Conditional negation.
+  ## Negate if ``ctl`` is true
+
+  # Algorithm:
+  # In two-complement representation
+  #  -x <=> not(x) + 1 <=> x xor 0xFF... + 1
+  # and
+  #   x <=> x xor 0x00...<=> x xor 0x00... + 0
+  #
+  # So we need to xor all words and then add 1
+  # The "+1" might carry
+  # So we fuse the 2 steps
+  let mask = -SecretWord(ctl)        # Obtain a 0xFF... or 0x00... mask
+  var carry = SecretWord(ctl)
+  for i in 0 ..< a.len:
+    let t = (a[i] xor mask) + carry  # XOR with mask and add 0x01 or 0x00 respectively
+    carry = SecretWord(t < carry)    # Carry on
+    r[i] = t
+
 {.pop.} # inline

 # Division
--- a/constantine/arithmetic/limbs_division.nim
+++ b/constantine/arithmetic/limbs_division.nim
@ -8,144 +8,14 @@

 import
  ../config/common,
-  ../primitives,
-  ./limbs
+  ../primitives

 # No exceptions allowed
 {.push raises: [].}

 # ############################################################
 #
-#                  Modular division by 2
-#
-# ############################################################
-
-func div2_modular*(a: var Limbs, mp1div2: Limbs) {.inline.} =
-  ## Modular Division by 2
-  ## `a` will be divided in-place
-  ## `mp1div2` is the modulus (M+1)/2
-  ##
-  ## Normally if `a` is odd we add the modulus before dividing by 2
-  ## but this may overflow and we might lose a bit before shifting.
-  ## Instead we shift first and then add half the modulus rounded up
-  ##
-  ## Assuming M is odd, `mp1div2` can be precomputed without
-  ## overflowing the "Limbs" by dividing by 2 first
-  ## and add 1
-  ## Otherwise `mp1div2` should be M/2
-
-  # if a.isOdd:
-  #   a += M
-  # a = a shr 1
-  let wasOdd = a.isOdd()
-  a.shiftRight(1)
-  let carry {.used.} = a.cadd(mp1div2, wasOdd)
-  debug: doAssert not carry.bool
-
-# ############################################################
-#
-#                    Modular inversion
-#
-# ############################################################
-
-# Generic (odd-only modulus)
-# ------------------------------------------------------------
-# Algorithm by Niels Möller,
-# a modified version of Stein's Algorithm (binary Extended Euclid GCD)
-#
-# Algorithm 5 in
-# Fast Software Polynomial Multiplication on ARM Processors Using the NEON Engine
-# Danilo Camara, Conrado P. L. Gouvea, Julio Lopez, and Ricardo Dahab
-# https://link.springer.com/content/pdf/10.1007%2F978-3-642-40588-4_10.pdf
-#
-# Input: integer x, odd integer n, x < n
-# Output: x−1 (mod n)
-# 1:   function ModInv(x, n)
-# 2:   (a, b, u, v) ← (x, n, 1, 1)
-# 3:   ℓ ← ⌊log2 n⌋ + 1            ⮚ number of bits in n
-# 4:   for i ← 0 to 2ℓ − 1 do
-# 5:     odd ← a & 1
-# 6:     if odd and a ≥ b then
-# 7:       a ← a − b
-# 8:     else if odd and a < b then
-# 9:       (a, b, u, v) ← (b − a, a, v, u)
-# 10:    a ← a >> 1
-# 11:    if odd then u ← u − v
-# 12:    if u < 0 then u ← u + n
-# 13:    if u & 1 then u ← u + n
-# 14:    u ← u >> 1
-# 15:  return v
-#
-# Warning ⚠️: v should be 0 at initialization
-#
-# We modify it to return F . a^-1
-# So that we can pass an adjustment factor F
-# And directly compute modular division or Montgomery inversion
-
-func steinsGCD*(v: var Limbs, a: Limbs, F, M: Limbs, bits: int, mp1div2: Limbs) =
-  ## Compute F multiplied the modular inverse of ``a`` modulo M
-  ## r ≡ F . a^-1 (mod M)
-  ##
-  ## M MUST be odd, M does not need to be prime.
-  ## ``a`` MUST be less than M.
-  ##
-  ## No information about ``a`` in particular its actual length in bits is leaked.
-  ##
-  ## This takes (M+1)/2 (mp1div2) as a precomputed parameter as a slight optimization
-  ## in stack size and speed.
-  ##
-  ## The inverse of 0 is 0.
-
-  # Ideally we need registers for a, b, u, v
-  # but:
-  #   - Even with ~256-bit primes, that's 4 limbs = 4*4 => 16 registers
-  #   - x86_64 only has 16 general purposes registers
-  #   - Registers are needed for the loop counter and comparison results
-  #   - CMOV is reg <- RM so can move registers/memory into registers
-  #     but cannot move into memory.
-  # so we choose to keep "v" from the algorithm in memory as `r`
-
-  # TODO: the inlining of primitives like `csub` is bad for codesize
-  #       but there is a 80% slowdown without it.
-
-  var a = a
-  var b = M
-  var u = F
-  v.setZero()
-
-  for i in 0 ..< 2 * bits:
-    debug: doAssert bool(b.isOdd)
-    let isOddA = a.isOdd()
-
-    # if isOddA: a -= b
-    let aLessThanB = isOddA and (SecretBool) a.csub(b, isOddA)
-    # if a < b and the sub was processed
-    # in that case, b <- a = a - b + b
-    discard b.cadd(a, aLessThanB)
-    # and a <- -new_a = (b-a)
-    a.cneg(aLessThanB)
-    debug: doAssert not bool(a.isOdd)
-    a.shiftRight(1)
-
-    # Swap u and v is a < b
-    u.cswap(v, aLessThanB)
-    # if isOddA: u -= v (mod M)
-    let neg = isOddA and (SecretBool) u.csub(v, isOddA)
-    discard u.cadd(M, neg)
-
-    # u = u/2 (mod M)
-    u.div2_modular(mp1div2)
-
-  debug:
-    doAssert bool a.isZero()
-    # GCD exist (always true if a and M are relatively prime)
-    doAssert bool b.isOne() or
-      # or not (on prime fields iff input was zero) and no GCD fallback output is zero
-      v.isZero()
-
-# ############################################################
-#
-#                   Modular Reduction
+#              Division and Modular Reduction
 #
 # ############################################################
 #
--- a/constantine/arithmetic/limbs_invmod.nim
+++ b/constantine/arithmetic/limbs_invmod.nim
@ -0,0 +1,453 @@
+# Constantine
+# Copyright (c) 2018-2019    Status Research & Development GmbH
+# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
+# Licensed and distributed under either of
+#   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
+#   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
+# at your option. This file may not be copied, modified, or distributed except according to those terms.
+
+import
+  ../config/common,
+  ../primitives,
+  ./limbs, ./limbs_unsaturated
+
+# No exceptions allowed
+{.push raises: [].}
+
+# ############################################################
+#
+#                  Modular inversion (mod 2ᵏ)
+#
+# ############################################################
+
+func invModBitwidth(a: BaseType): BaseType =
+  # Modular inverse algorithm:
+  # Explanation p11 "Dumas iterations" based on Newton-Raphson:
+  # - Cetin Kaya Koc (2017), https://eprint.iacr.org/2017/411
+  # - Jean-Guillaume Dumas (2012), https://arxiv.org/pdf/1209.6626v2.pdf
+  # - Colin Plumb (1994), http://groups.google.com/groups?selm=1994Apr6.093116.27805%40mnemosyne.cs.du.edu
+  # Other sources:
+  # - https://crypto.stackexchange.com/questions/47493/how-to-determine-the-multiplicative-inverse-modulo-64-or-other-power-of-two
+  # - https://mumble.net/~campbell/2015/01/21/inverse-mod-power-of-two
+  # - http://marc-b-reynolds.github.io/math/2017/09/18/ModInverse.html
+
+  # We are in a special case
+  # where m = 2^WordBitWidth.
+  # For a and m to be coprimes, a must be odd.
+  #
+  # We have the following relation
+  # ax ≡ 1 (mod 2ᵏ) <=> ax(2 - ax) ≡ 1 (mod 2²ᵏ)
+  # which grows in O(log(log(a)))
+  debug: doAssert (a and 1) == 1, "a must be odd"
+
+  const k = log2_vartime(a.sizeof() * 8)
+  result = a                 # Start from an inverse of M0 modulo 2, M0 is odd and it's own inverse
+  for _ in 0 ..< k:          # at each iteration we get the inverse mod(2^2k)
+    result *= 2 - a * result # x' = x(2 - ax)
+
+func invMod2powK(M0: BaseType, k: static BaseType): BaseType =
+  ## Compute 1/M mod 2ᵏ
+  ## from M[0]
+  # Algorithm: Cetin Kaya Koc (2017) p11, https://eprint.iacr.org/2017/411
+  # Once you have a modular inverse (mod 2ˢ) you can reduce
+  # (mod 2ᵏ) to have the modular inverse (mod 2ᵏ)
+  static: doAssert k <= WordBitWidth
+  const maskMod = (1 shl k)-1
+  M0.invModBitwidth() and maskMod
+
+# ###############################################################
+#
+#                   Modular inversion 
+#
+# ###############################################################
+
+# Algorithm by Bernstein-Yang
+# ------------------------------------------------------------
+#
+# - Original Bernstein-Yang paper, https://eprint.iacr.org/2019/266
+# - Executable spec and description by Dettman-Wuille, https://github.com/bitcoin-core/secp256k1/blob/85b00a1/doc/safegcd_implementation.md
+# - Formal bound verification by Wuille, https://github.com/sipa/safegcd-bounds
+# - Formal verification by Hvass-Aranha-Spitters, https://eprint.iacr.org/2021/549
+#
+# We implement the half-delta divstep variant
+
+type TransitionMatrix = object
+  ## Bernstein-Yang Jumpdivstep transition matrix
+  ##     [u v]
+  ## t = [q r]
+  ## It it is scaled by 2ᵏ
+  u, v, q, r: SignedSecretWord
+
+debug:
+  # Debugging helpers
+
+  func checkDeterminant(t: TransitionMatrix, u, v, q, r: SignedSecretWord, k, numIters: int): bool =
+    # The determinant of t must be a power of two. This guarantees that multiplication with t
+    # does not change the gcd of f and g, apart from adding a power-of-2 factor to it (which
+    # will be divided out again).
+    # Each divstep's individual matrix has determinant 2⁻¹,
+    # the aggregate of numIters of them will have determinant 2ⁿᵘᵐᴵᵗᵉʳˢ. Multiplying with the initial
+    # 2ᵏ*identity (which has determinant 2²ᵏ) means the result has determinant 2²ᵏ⁻ⁿᵘᵐᴵᵗᵉʳˢ.
+    let
+      u = SecretWord u
+      v = SecretWord v
+      q = SecretWord q
+      r = SecretWord r
+
+    var a, b: array[2, SecretWord]
+    var e: array[2, SecretWord]
+    smul(a[1], a[0], u, r)
+    smul(b[1], b[0], v, q)
+
+    var borrow: Borrow
+    subB(borrow, a[0], a[0], b[0], Borrow(0))
+    subB(borrow, a[1], a[1], b[1], borrow)
+
+    let d = 2*k - numIters
+    b[0] = Zero; b[1] = Zero
+    b[d div WordBitwidth] = One shl (d mod WordBitwidth)
+
+    return bool(a == b)
+
+func canonicalize(
+       a: var LimbsUnsaturated,
+       signMask: SignedSecretWord,
+       M: LimbsUnsaturated
+     ) =
+  ## Compute a = sign*a (mod M)
+  ## 
+  ## with a in range (-2*M, M)
+  ## result in range [0, M)
+  
+  const
+    UnsatBitWidth = WordBitWidth - a.Excess
+    Max = SignedSecretWord(MaxWord shr a.Excess)
+  
+  # Operate in registers
+  var z = a
+  
+  # Add M if `z` is negative
+  # -> range (-M, M)
+  z.cadd(M, z.isNegMask())
+  # Negate if sign is negative
+  # -> range (-M, M)
+  z.cneg(signMask)
+  # Normalize words to range (-2^UnsatBitwidth, 2^UnsatBitwidth)
+  for i in 0 ..< z.words.len-1:
+    z[i+1] = z[i+1] + z[i].ashr(UnsatBitWidth)
+    z[i] = z[i] and Max
+
+  # Add M if `z` is negative
+  # -> range (0, M)
+  z.cadd(M, z.isNegMask())
+  # Normalize words to range (-2^UnsatBitwidth, 2^UnsatBitwidth)
+  for i in 0 ..< z.words.len-1:
+    z[i+1] = z[i+1] + z[i].ashr(UnsatBitWidth)
+    z[i] = z[i] and Max
+
+  a = z
+
+proc partitionDivsteps(bits, wordBitWidth: int): tuple[totalIters, numChunks, chunkSize, cutoff: int] =
+  # Given the field modulus number of bits
+  # and the effective word size  
+  # Returns:
+  # - the total number of iterations that guarantees GCD convergence
+  # - the number of chunks of divsteps to compute
+  # - the base number of divsteps per chunk
+  # - a cutoff chunk,
+  #     before this chunk ID, the number of divsteps is "base number + 1"
+  #     afterward it's "base number"
+  let totalIters =           
+    if bits == 256:
+      # https://github.com/sipa/safegcd-bounds/tree/master/coq
+      # For 256-bit inputs, 590 divsteps are sufficient with hddivstep variant (half-delta divstep)
+      # for gcd(f, g) with 0 <= g <= f <= Modulus (inversion g == 1)
+      # The generic formula reports 591
+      590
+    else:
+      # https://github.com/sipa/safegcd-bounds/blob/master/genproofhd.md
+      # For any input, for gcd(f, g) with 0 <= g <= f <= Modulus with hddivstep variant (half-delta divstep)
+      # (inversion g == 1)
+      (45907*bits + 26313) div 19929
+  let numChunks = (totalIters + wordBitWidth-1) div wordBitWidth
+  let chunkSize = totalIters div numChunks
+  let cutoff = totalIters mod numChunks
+  return (totalIters, numChunks, chunkSize, cutoff)
+
+func batchedDivsteps(
+       t: var TransitionMatrix,
+       theta: SignedSecretWord,
+       f0, g0: SignedSecretWord,
+       numIters: int,
+       k: static int
+     ): SignedSecretWord =
+  ## Bernstein-Yang half-delta (theta) batch of divsteps
+  ## 
+  ## Output:
+  ## - return theta for the next batch of divsteps
+  ## - mutate t, the transition matrix to apply `numIters` divsteps at once
+  ##   t is scaled by 2ᵏ
+  ## 
+  ## Input:
+  ## - f0, bottom limb of f
+  ## - g0, bottom limb of g
+  ## - numIters, number of iterations requested in this batch of divsteps
+  ## - k, the maximum batch size, transition matrix is scaled by 2ᵏ
+  var
+    u = SignedSecretWord(1 shl k)
+    v = SignedSecretWord(0)
+    q = SignedSecretWord(0)
+    r = SignedSecretWord(1 shl k)
+    f = f0
+    g = g0
+
+    theta = theta
+
+  for i in k-numIters ..< k:
+    debug:
+      func reportLoop() =
+        debugEcho "  iterations: [", k-numIters, ", ", k, ")", " (", numIters, " iterations in total)"
+        debugEcho "  i: ", i, ", theta: ", int(theta)
+        # debugEcho "    f: 0b", BiggestInt(f).toBin(64), ", g: 0b", BiggestInt(g).toBin(64), " | f: ", int(f), ", g: ", int(g)
+        # debugEcho "    u: 0b", BiggestInt(u).toBin(64), ", v: 0b", BiggestInt(v).toBin(64), " | u: ", int(u), ", v: ", int(v)
+        # debugEcho "    q: 0b", BiggestInt(q).toBin(64), ", r: 0b", BiggestInt(r).toBin(64), " | q: ", int(q), ", r: ", int(r)
+
+      doAssert (BaseType(f) and 1) == 1, (reportLoop(); "f must be odd)")
+      doAssert bool(not(uint(u or v or q or r) and (if i == 0: high(uint) else: high(uint) shr (i - 1)))), (reportLoop(); "Min trailing zeros count decreases at each iteration")
+      doAssert bool(u.ashr(k-i)*f0 + v.ashr(k-i)*g0 == f.lshl(i)), (reportLoop(); "Applying the transition matrix to (f₀, g₀) returns current (f, g)")
+      doAssert bool(q.ashr(k-i)*f0 + r.ashr(k-i)*g0 == g.lshl(i)), (reportLoop(); "Applying the transition matrix to (f₀, g₀) returns current (f, g)")
+
+    # Conditional masks for (theta < 0) and g odd
+    let c1 = theta.isNegMask()
+    let c2 = g.isOddMask()
+    # x, y, z, conditional complement of f, u, v
+    let x = f xor c1
+    let y = u xor c1
+    let z = v xor c1
+    # conditional substraction from g, q, r
+    g.csub(x, c2)
+    q.csub(y, c2)
+    r.csub(z, c2)
+    # c3 = (theta >= 0) and g odd
+    let c3 = c2 and not c1
+    # theta = -theta or theta+1
+    theta = (theta xor c3) + SignedSecretWord(1)
+    # Conditional rollback substraction
+    f.cadd(g, c3)
+    u.cadd(q, c3)
+    v.cadd(r, c3)
+    # Shifts
+    g = g.lshr(1)
+    q = q.ashr(1)
+    r = r.ashr(1)
+
+  t.u = u
+  t.v = v
+  t.q = q
+  t.r = r
+  debug:
+    doAssert bool(u*f0 + v*g0 == f.lshl(k)), "Applying the final matrix to (f₀, g₀) gives the final (f, g)"
+    doAssert bool(q*f0 + r*g0 == g.lshl(k)), "Applying the final matrix to (f₀, g₀) gives the final (f, g)"
+    doAssert checkDeterminant(t, u, v, q, r, k, numIters)
+
+  return theta
+
+func matVecMul_shr_k_mod_M[N, E: static int](
+       t: TransitionMatrix,
+       d, e: var LimbsUnsaturated[N, E],
+       k: static int,
+       M: LimbsUnsaturated[N, E],
+       invMod2powK: SecretWord
+  ) =
+  ## Compute
+  ##      
+  ## [u v]    [d] 
+  ## [q r]/2ᵏ.[e] mod M
+  ##
+  ## d, e will be in range (-2*modulus,modulus)
+  ## and output limbs in (-2ᵏ, 2ᵏ)
+  
+  static: doAssert k == WordBitWidth - E
+  const Max = SignedSecretWord(MaxWord shr E)
+
+  let
+    u = t.u
+    v = t.v
+    q = t.q
+    r = t.r
+
+  let sign_d = d.isNegMask()
+  let sign_e = e.isNegMask()
+
+  # Double-signed-word carries
+  var cd, ce: DSWord
+
+  # First iteration of [u v] [d] 
+  #                    [q r].[e]
+  cd.slincombAccNoCarry(u, d[0], v, e[0])
+  ce.slincombAccNoCarry(q, d[0], r, e[0])
+
+  # Compute me and md, multiples of M
+  # such as the bottom k bits if d and e are 0
+  # This allows fusing division by 2ᵏ
+  # i.e. (mx * M) mod 2ᵏ = x mod 2ᵏ
+  var md, me = SignedSecretWord(0)
+  md.cadd(u, sign_d)
+  md.cadd(v, sign_e)
+  me.cadd(q, sign_d)
+  me.cadd(r, sign_e)
+  
+  md = md - (SignedSecretWord(invMod2powK * SecretWord(cd.lo) + SecretWord(md)) and Max)
+  me = me - (SignedSecretWord(invMod2powK * SecretWord(ce.lo) + SecretWord(me)) and Max)
+
+  # First iteration of [u v] [d]   [md]
+  #                    [q r].[e] + [me].M[0]
+  # k bottom bits are 0
+  cd.smulAccNoCarry(md, M[0])
+  ce.smulAccNoCarry(me, M[0])
+  cd.ashr(k)
+  ce.ashr(k)
+
+  for i in 1 ..< N:
+    cd.slincombAccNoCarry(u, d[i], v, e[i])
+    ce.slincombAccNoCarry(q, d[i], r, e[i])
+    cd.smulAccNoCarry(md, M[i])
+    ce.smulAccNoCarry(me, M[i])
+    d[i-1] = cd.lo and Max
+    e[i-1] = ce.lo and Max
+    cd.ashr(k)
+    ce.ashr(k)
+  
+  d[N-1] = cd.lo
+  e[N-1] = ce.lo
+
+func matVecMul_shr_k[N, E: static int](
+       t: TransitionMatrix,
+       f, g: var LimbsUnsaturated[N, E],
+       k: static int     
+  ) =
+  ## Compute
+  ##      
+  ## [u v] [f] 
+  ## [q r].[g] / 2ᵏ
+
+  static: doAssert k == WordBitWidth - E
+  const Max = SignedSecretWord(MaxWord shr E)
+
+  let
+    u = t.u
+    v = t.v
+    q = t.q
+    r = t.r
+
+  # Double-signed-word carries
+  var cf, cg: DSWord
+  
+  # First iteration of [u v] [f] 
+  #                    [q r].[g]
+  cf.slincombAccNoCarry(u, f[0], v, g[0])
+  cg.slincombAccNoCarry(q, f[0], r, g[0])
+  # bottom k bits are zero by construction
+  debug:
+    doAssert BaseType(cf.lo and Max) == 0, "bottom k bits should be 0, cf.lo: " & $BaseType(cf.lo)
+    doAssert BaseType(cg.lo and Max) == 0, "bottom k bits should be 0, cg.lo: " & $BaseType(cg.lo)
+
+  cf.ashr(k)
+  cg.ashr(k)
+
+  for i in 1 ..< N:
+    cf.slincombAccNoCarry(u, f[i], v, g[i])
+    cg.slincombAccNoCarry(q, f[i], r, g[i])
+    f[i-1] = cf.lo and Max
+    g[i-1] = cg.lo and Max
+    cf.ashr(k)
+    cg.ashr(k)
+  
+  f[N-1] = cf.lo
+  g[N-1] = cg.lo
+
+func invmodImpl[N, E](
+       a: var LimbsUnsaturated[N, E],
+       F, M: LimbsUnsaturated[N, E],
+       invMod2powK: SecretWord,
+       k, bits: static int) =
+  ## Modular inversion using Bernstein-Yang algorithm
+  ## r ≡ F.a⁻¹ (mod M)
+
+  # theta = delta-1/2, delta starts at 1/2 for the half-delta variant
+  var theta = SignedSecretWord(0)
+  var d{.noInit.}, e{.noInit.}: LimbsUnsaturated[N, E]
+  var f{.noInit.}, g{.noInit.}: LimbsUnsaturated[N, E]
+
+  d.setZero()
+  e = F
+
+  # g < f for partitioning / iteration count formula
+  f = M
+  g = a
+  const partition = partitionDivsteps(bits, k)
+
+  for i in 0 ..< partition.numChunks:
+    var t{.noInit.}: TransitionMatrix
+    let numIters = partition.chunkSize + int(i < partition.cutoff)
+    # Compute transition matrix and next theta
+    theta = t.batchedDivsteps(theta, f[0], g[0], numIters, k)
+    # Apply the transition matrix
+    # [u v]    [d] 
+    # [q r]/2ᵏ.[e]  mod M
+    t.matVecMul_shr_k_mod_M(d, e, k, M, invMod2powK)
+    # [u v]    [f] 
+    # [q r]/2ᵏ.[g] 
+    t.matVecMul_shr_k(f, g, k)
+
+  d.canonicalize(signMask = f.isNegMask(), M)
+  a = d
+
+func invmod*(
+       r: var Limbs, a: Limbs,
+       F, M: Limbs, bits: static int) =
+  ## Compute the scaled modular inverse of ``a`` modulo M
+  ## r ≡ F.a⁻¹ (mod M)
+  ##
+  ## M MUST be odd, M does not need to be prime.
+  ## ``a`` MUST be less than M.
+  const Excess = 2
+  const k = WordBitwidth - Excess
+  const NumUnsatWords = (bits + k - 1) div k
+
+  # Convert values to unsaturated repr
+  var m2 {.noInit.}: LimbsUnsaturated[NumUnsatWords, Excess]
+  var factor {.noInit.}: LimbsUnsaturated[NumUnsatWords, Excess]
+  m2.fromPackedRepr(M)
+  factor.fromPackedRepr(F)
+  let m0invK = SecretWord invMod2powK(BaseType M[0], k)
+
+  var a2 {.noInit.}: LimbsUnsaturated[NumUnsatWords, Excess]
+  a2.fromPackedRepr(a)
+  a2.invmodImpl(factor, m2, m0invK, k, bits)
+  r.fromUnsatRepr(a2)
+
+func invmod*(
+       r: var Limbs, a: Limbs,
+       F, M: static Limbs, bits: static int) =
+  ## Compute the scaled modular inverse of ``a`` modulo M
+  ## r ≡ F.a⁻¹ (mod M)
+  ## 
+  ## with F and M known at compile-time
+  ##
+  ## M MUST be odd, M does not need to be prime.
+  ## ``a`` MUST be less than M.
+  
+  const Excess = 2
+  const k = WordBitwidth - Excess
+  const NumUnsatWords = (bits + k - 1) div k
+
+  # Convert values to unsaturated repr
+  const m2 = LimbsUnsaturated[NumUnsatWords, Excess].fromPackedRepr(M)
+  const factor = LimbsUnsaturated[NumUnsatWords, Excess].fromPackedRepr(F)
+  const m0invK = SecretWord invMod2powK(BaseType M[0], k)
+
+  var a2 {.noInit.}: LimbsUnsaturated[NumUnsatWords, Excess]
+  a2.fromPackedRepr(a)
+  a2.invmodImpl(factor, m2, m0invK, k, bits)
+  r.fromUnsatRepr(a2)
--- a/constantine/arithmetic/limbs_unsaturated.nim
+++ b/constantine/arithmetic/limbs_unsaturated.nim
@ -0,0 +1,385 @@
+# Constantine
+# Copyright (c) 2018-2019    Status Research & Development GmbH
+# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
+# Licensed and distributed under either of
+#   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
+#   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
+# at your option. This file may not be copied, modified, or distributed except according to those terms.
+
+import
+  ../config/common,
+  ../primitives
+
+type
+  SignedSecretWord* = distinct SecretWord
+
+  LimbsUnsaturated*[N, Excess: static int] = object
+    ## An array of signed secret words
+    ## with each word having their top Excess bits unused between function calls
+    ## This allows efficient handling of carries and signs without intrinsics or assembly.
+    #
+    # Comparison with packed representation:
+    # 
+    # Packed representation
+    # - pro: uses less words (important for multiplication which is O(n²) with n the number of words)
+    # - pro: less "mental overhead" to keep track (clear/shift) excess bits
+    # - con: addition-with-carry require compiler intrinsics or inline assembly.
+    #        Compiler codegen may be incredibly bad (GCC).
+    # - con: addition-with-carry cannot use instruction-level-parallelism or SIMD vectorization
+    #        due to the dependency chain.
+    # - con: on x86 addition-with-carry dependency chain has high latency
+    #
+    # Unsaturated representation:
+    # - pro: portable, addition-with-carry can be implemented without compiler support or inline assembly
+    # - con: "mental overhead" to keep track of used free bits in algorithms, shifts, multiplication, division, ...
+    # - con: substraction-with-borrow internally requires signed integers
+    #        or immediate canonicalization
+    #        or adding a multiple of the modulus to avoid underflows
+    # - con: multiplication requires immediate canonicalization
+    # - con: require arithmetic right-shift if using signed integers
+    # - con: more memory usage
+    # - con: may be slower due to using more words
+    # - con: multiple representation of integers
+    # - pro: can do many additions before having to canonicalize
+    #
+    # Constantine used to have an unsaturated representation by default
+    # before refactor PR: https://github.com/mratsim/constantine/pull/17
+    # Unsaturated representation has been advocated (in particular for generalized mersenne primes)
+    # by:
+    # - https://bearssl.org/bigint.html
+    # - https://cryptojedi.org/peter/data/pairing-20131122.pdf
+    # - https://milagro.apache.org/docs/amcl-overview/#representation
+    # - https://eprint.iacr.org/2017/437
+    #
+    # In practice:
+    # - Addition is not a bottleneck in cryptography, multiplication is.
+    #   Naive asymptotic analysis, makes it 4x~6x more costly
+    #   on 256-bit (4-limbs) inputs to 384-bit (6-limbs) inputs
+    # - CPUs have significantly reduce carry latencies.
+    #   https://www.agner.org/optimize/instruction_tables.pdf
+    #   - Nehalem (Intel 2008) used to have 0.33 ADD (reg, reg), 1 ADD (reg, mem)  and 2 ADC reciprcal throughput (ADD 6x faster than ADC)
+    #   - Ice Lake (Intel 2019) is 0.25 ADD (reg, reg), 0.5 ADD (reg, mem) and 1 ADC reciprocal throughput
+    #   - Zen 3 (AMD 2020) is 0.25 ADD (reg, reg/mem) and 1 ADC reciprocal throughput
+    # - Introducing inline assembly is easier than dealing with signed integers
+    # - Inline assembly is required to ensure constant-time properties
+    #   as compilers get smarters
+    # - Canonicalization has an overhead, it just delays (and potentially batches) paying for the dependencing chain.
+    words*: array[N, SignedSecretWord]
+
+# ############################################################
+#
+#                      Accessors
+#
+# ############################################################
+
+template `[]`*(a: LimbsUnsaturated, idx: int): SignedSecretWord =
+  a.words[idx]
+
+template `[]=`*(a: LimbsUnsaturated, idx: int, val: SignedSecretWord) =
+  a.words[idx] = val
+
+# ############################################################
+#
+#                        Conversion
+#
+# ############################################################
+
+# {.push checks:off.} # Avoid IndexDefect and Int overflows
+func fromPackedRepr*[LU, E, LP: static int](
+       dst: var LimbsUnsaturated[LU, E],
+       src: Limbs[LP]) =
+  ## Converts from an packed representation to an unsaturated representation  
+  const UnsatBitWidth = WordBitWidth-E
+  const Max = MaxWord shr E
+
+  static:
+    # Destination and Source size are consistent
+    doAssert (LU-1) * UnsatBitWidth <= WordBitwidth * LP, block:
+      "\n  (LU-1) * UnsatBitWidth: " & $(LU-1) & " * " & $UnsatBitWidth & " = " & $((LU-1) * UnsatBitWidth) &
+      "\n  WordBitwidth * LP: " & $WordBitwidth & " * " & $LP & " = " & $(WordBitwidth * LP)
+
+  var
+    srcIdx, dstIdx = 0
+    hi, lo = Zero
+    accLen = 0
+  
+  while srcIdx < src.len:
+    # Form a 2-word buffer (hi, lo)
+    let w = if src_idx < src.len: src[srcIdx]
+            else: Zero
+    inc srcIdx
+ 
+    if accLen == 0:
+      lo = w and Max
+      hi = w shr UnsatBitWidth
+    else:
+      lo = (lo or (w shl accLen)) and Max
+      hi = w shr (UnsatBitWidth - accLen)
+
+    accLen += WordBitWidth
+
+    while accLen >= UnsatBitWidth:
+      let s = min(accLen, UnsatBitWidth)
+
+      dst[dstIdx] = SignedSecretWord lo
+      dstIdx += 1
+      accLen -= s
+      lo = ((lo shr s) or (hi shl (UnsatBitWidth - s))) and Max
+      hi = hi shr s
+      
+  if dstIdx < dst.words.len:
+    dst[dstIdx] = SignedSecretWord lo
+
+  for i in dst_idx + 1 ..< dst.words.len:
+    dst[i] = SignedSecretWord Zero
+
+func fromPackedRepr*(T: type LimbsUnsaturated, src: Limbs): T =
+  ## Converts from an packed representation to an unsaturated representation
+  result.fromPackedRepr(src)
+
+func fromUnsatRepr*[LU, E, LP: static int](
+       dst: var Limbs[LP],
+       src: LimbsUnsaturated[LU, E]) =
+  ## Converts from an packed representation to an unsaturated representation  
+  const UnsatBitWidth = WordBitWidth-E
+
+  static:
+    # Destination and Source size are consistent
+    doAssert (LU-1) * UnsatBitWidth <= WordBitwidth * LP, block:
+      "\n  (LU-1) * UnsatBitWidth: " & $(LU-1) & " * " & $UnsatBitWidth & " = " & $((LU-1) * UnsatBitWidth) &
+      "\n  WordBitwidth * LP: " & $WordBitwidth & " * " & $LP & " = " & $(WordBitwidth * LP)
+
+  var
+    srcIdx, dstIdx = 0
+    acc = Zero
+    accLen = 0
+
+  for src_idx in 0 ..< src.words.len:
+    let nextWord = SecretWord src[srcIdx]
+
+    # buffer reads
+    acc = acc or (nextWord shl accLen)
+    accLen += UnsatBitWidth
+
+    # if full, dump
+    if accLen >= WordBitWidth:
+      dst[dstIdx] = acc
+      inc dstIdx
+      accLen -= WordBitWidth
+      acc = nextWord shr (UnsatBitWidth - accLen)
+  
+  if dst_idx < dst.len:
+    dst[dst_idx] = acc
+
+  for i in dst_idx + 1 ..< dst.len:
+    dst[i] = Zero
+
+# {.pop.}
+
+# ############################################################
+#
+#                      Initialization
+#
+# ############################################################
+
+func setZero*(a: var LimbsUnsaturated) =
+  ## Set ``a`` to 0
+  for i in 0 ..< a.words.len:
+    a[i] = SignedSecretWord(0)
+
+func setOne*(a: var LimbsUnsaturated) =
+  ## Set ``a`` to 1
+  a[0] = SignedSecretWord(1)
+  for i in 1 ..< a.words.len:
+    a[i] = SignedSecretWord(0)
+
+# ############################################################
+#
+#                      Arithmetic
+#
+# ############################################################
+
+# Workaround bug
+func `xor`*(x,y: SecretWord): SecretWord {.inline.} =
+  # For some reason the template defined in constant_time.nim isn't found
+  SecretWord(x.BaseType xor y.BaseType)
+
+when sizeof(int) == 8 and not defined(Constantine32):
+  type
+    SignedBaseType* = int64
+else:
+  type
+    SignedBaseType* = int32
+
+template fmap(x: SignedSecretWord, op: untyped, y: SignedSecretWord): SignedSecretWord =
+  ## Unwrap x and y from their distinct type
+  ## Apply op, and rewrap them
+  SignedSecretWord(op(SecretWord(x), SecretWord(y)))
+
+template fmapAsgn(x: SignedSecretWord, op: untyped, y: SignedSecretWord) =
+  ## Unwrap x and y from their distinct type
+  ## Apply assignment op, and rewrap them
+  op(SecretWord(x), SecretWord(y))
+
+template `and`*(x, y: SignedSecretWord): SignedSecretWord    = fmap(x, `and`, y)
+template `or`*(x, y: SignedSecretWord): SignedSecretWord     = fmap(x, `or`, y)
+template `xor`*(x, y: SignedSecretWord): SignedSecretWord    = SignedSecretWord(BaseType(x) xor BaseType(y))
+template `not`*(x: SignedSecretWord): SignedSecretWord       = SignedSecretWord(not SecretWord(x))
+template `+`*(x, y: SignedSecretWord): SignedSecretWord      = fmap(x, `+`, y)
+template `+=`*(x: var SignedSecretWord, y: SignedSecretWord) = fmapAsgn(x, `+=`, y)
+template `-`*(x, y: SignedSecretWord): SignedSecretWord      = fmap(x, `-`, y)
+template `-=`*(x: var SignedSecretWord, y: SignedSecretWord) = fmapAsgn(x, `-=`, y)
+
+template `-`*(x: SignedSecretWord): SignedSecretWord =
+  # We don't use Nim signed integers to avoid range checks
+  SignedSecretWord(-SecretWord(x))
+
+template `*`*(x, y: SignedSecretWord): SignedSecretWord =
+  # Warning ⚠️ : We assume that mul hardware multiplication is constant time
+  # but this is not always true, especially on ARMv7 and ARMv9
+  fmap(x, `*`, y)
+
+# shifts
+template ashr*(x: SignedSecretWord, y: SomeNumber): SignedSecretWord =
+  ## Arithmetic right shift
+  # We need to cast to Nim ints without Nim checks
+  SignedSecretWord(cast[SignedBaseType](x).ashr(y))
+
+template lshr*(x: SignedSecretWord, y: SomeNumber): SignedSecretWord =
+  ## Logical right shift
+  SignedSecretWord(SecretWord(x) shr y)
+
+template lshl*(x: SignedSecretWord, y: SomeNumber): SignedSecretWord =
+  ## Logical left shift
+  SignedSecretWord(SecretWord(x) shl y)
+
+# ############################################################
+#
+#             Hardened Boolean primitives
+#
+# ############################################################
+
+template `==`*(x, y: SignedSecretWord): SecretBool =
+  SecretWord(x) == SecretWord(y)
+
+# ############################################################
+#
+#                Conditional arithmetic
+#
+# ############################################################
+
+# SignedSecretWord
+# ----------------
+
+func isNegMask*(a: SignedSecretWord): SignedSecretWord {.inline.} =
+  ## Produce the -1 mask if a is negative
+  ## and 0 otherwise
+  a.ashr(WordBitWidth-1)
+
+func isOddMask*(a: SignedSecretWord): SignedSecretWord {.inline.} =
+  ## Produce the -1 mask if a is odd
+  ## and 0 otherwise
+  -(a and SignedSecretWord(1))
+
+func cneg*(
+       a: SignedSecretWord,
+       mask: SignedSecretWord): SignedSecretWord {.inline.} =
+  ## Conditionally negate `a` 
+  ## mask must be 0 (0x00000...0000) (no negation)
+  ## or -1 (0xFFFF...FFFF) (negation)
+  (a xor mask) - mask
+
+func cadd*(
+       a: var SignedSecretWord,
+       b: SignedSecretWord,
+       mask: SignedSecretWord) {.inline.} =
+  ## Conditionally add `b` to `a` 
+  ## mask must be 0 (0x00000...0000) (no addition)
+  ## or -1 (0xFFFF...FFFF) (addition)
+  a = a + (b and mask)
+
+func csub*(
+       a: var SignedSecretWord,
+       b: SignedSecretWord,
+       mask: SignedSecretWord) {.inline.} =
+  ## Conditionally substract `b` from `a` 
+  ## mask must be 0 (0x00000...0000) (no substraction)
+  ## or -1 (0xFFFF...FFFF) (substraction)
+  a = a - (b and mask)
+
+# UnsaturatedLimbs
+# ----------------
+
+func isNegMask*(a: LimbsUnsaturated): SignedSecretWord {.inline.} =
+  ## Produce the -1 mask if a is negative
+  ## and 0 otherwise
+  a[a.words.len-1].ashr(WordBitWidth - a.Excess + 1)
+
+func cneg*(
+       a: var LimbsUnsaturated,
+       mask: SignedSecretWord) {.inline.} =
+  ## Conditionally negate `a` 
+  ## mask must be 0 (0x00000...0000) (no negation)
+  ## or -1 (0xFFFF...FFFF) (negation)
+  ## 
+  ## Carry propagation is deferred
+  for i in 0 ..< a.words.len:
+    a[i] = a[i].cneg(mask)
+
+func cadd*(
+       a: var LimbsUnsaturated,
+       b: LimbsUnsaturated,
+       mask: SignedSecretWord) {.inline.} =
+  ## Conditionally add `b` to `a` 
+  ## mask must be 0 (0x00000...0000) (no addition)
+  ## or -1 (0xFFFF...FFFF) (addition)
+  ## 
+  ## Carry propagation is deferred
+  for i in 0 ..< a.words.len:
+    a[i].cadd(b[i], mask)
+
+# ############################################################
+#
+#                Double-Width signed arithmetic
+#
+# ############################################################
+
+type DSWord* = object
+  lo*, hi*: SignedSecretWord
+
+func smulAccNoCarry*(r: var DSWord, a, b: SignedSecretWord) {.inline.}=
+  ## Signed accumulated multiplication
+  ## (_, hi, lo) += a*b
+  ## This assumes no overflowing
+  var UV: array[2, SecretWord]
+  var carry: Carry
+  smul(UV[1], UV[0], SecretWord a, SecretWord b)
+  addC(carry, UV[0], UV[0], SecretWord r.lo, Carry(0))
+  addC(carry, UV[1], UV[1], SecretWord r.hi, carry)
+
+  r.lo = SignedSecretWord UV[0]
+  r.hi = SignedSecretWord UV[1]
+
+func slincombAccNoCarry*(r: var DSWord, a, u, b, v: SignedSecretWord) {.inline.}=
+  ## Accumulated linear combination
+  ## (_, hi, lo) += a*u + b*v
+  ## This assumes no overflowing
+  var carry: Carry
+  var x1, x0, y1, y0: SecretWord
+  smul(x1, x0, SecretWord a, SecretWord u)
+  addC(carry, x0, x0, SecretWord r.lo, Carry(0))
+  addC(carry, x1, x1, SecretWord r.hi, carry)
+  smul(y1, y0, SecretWord b, SecretWord v)
+  addC(carry, x0, x0, y0, Carry(0))
+  addC(carry, x1, x1, y1, carry)
+
+  r.lo = SignedSecretWord x0
+  r.hi = SignedSecretWord x1
+
+func ashr*(
+       r: var DSWord,
+       k: SomeInteger) {.inline.} =
+  ## Arithmetic right-shift of a double-word
+  ## This does not normalize the excess bits
+  r.lo = r.lo.lshr(k) or r.hi.lshl(WordBitWidth - k)
+  r.hi = r.hi.ashr(k)
--- a/constantine/curves/bls12_377_precomputed_params.nim
+++ b/constantine/curves/bls12_377_precomputed_params.nim
--- a/constantine/curves/bls12_377_inversion.nim
+++ b/constantine/curves/bls12_377_inversion.nim
@ -1,204 +0,0 @@
-# Constantine
-# Copyright (c) 2018-2019    Status Research & Development GmbH
-# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
-# Licensed and distributed under either of
-#   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
-#   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
-# at your option. This file may not be copied, modified, or distributed except according to those terms.
-
-import
-  ../config/curves,
-  ../arithmetic/finite_fields
-
-# ############################################################
-#
-#           Specialized inversion for BLS12-377
-#
-# ############################################################
-
-func inv_addchain*(r: var Fp[BLS12_377], a: Fp[BLS12_377]) =
-  let a = a # ensure a.inv_addchain(a) is OK
-
-  var
-    x10       {.noInit.}: Fp[BLS12_377]
-    x11       {.noInit.}: Fp[BLS12_377]
-    x100      {.noInit.}: Fp[BLS12_377]
-    x101      {.noInit.}: Fp[BLS12_377]
-    x111      {.noInit.}: Fp[BLS12_377]
-    x1001     {.noInit.}: Fp[BLS12_377]
-    x1011     {.noInit.}: Fp[BLS12_377]
-    x1111     {.noInit.}: Fp[BLS12_377]
-    x10001    {.noInit.}: Fp[BLS12_377]
-    x10011    {.noInit.}: Fp[BLS12_377]
-    x10111    {.noInit.}: Fp[BLS12_377]
-    x11011    {.noInit.}: Fp[BLS12_377]
-    x11101    {.noInit.}: Fp[BLS12_377]
-    x11111    {.noInit.}: Fp[BLS12_377]
-    x110100   {.noInit.}: Fp[BLS12_377]
-    x11010000 {.noInit.}: Fp[BLS12_377]
-    x11010111 {.noInit.}: Fp[BLS12_377]
-
-  x10       .square(a)
-  x11       .prod(a, x10)
-  x100      .prod(a, x11)
-  x101      .prod(a, x100)
-  x111      .prod(x10, x101)
-  x1001     .prod(x10, x111)
-  x1011     .prod(x10, x1001)
-  x1111     .prod(x100, x1011)
-  x10001    .prod(x10, x1111)
-  x10011    .prod(x10, x10001)
-  x10111    .prod(x100, x10011)
-  x11011    .prod(x100, x10111)
-  x11101    .prod(x10, x11011)
-  x11111    .prod(x10, x11101)
-  x110100   .prod(x10111, x11101)
-  x11010000 .square_repeated(x110100, 2)
-  x11010111 .prod(x111, x11010000)
-  # 18 operations
-
-  # TODO: we can accumulate in a partially reduced
-  #       doubled-size `r` to avoid the final substractions.
-  #       and only reduce at the end.
-  #       This requires the number of op to be less than log2(p) == 381
-
-  # 18 + 18 = 36 operations
-  r.square_repeated(x11010111, 8)
-  r *= x11101
-  r.square_repeated(7)
-  r *= x10001
-  r.square()
-
-  # 36 + 14 = 50 operations
-  r *= a
-  r.square_repeated(9)
-  r *= x10111
-  r.square_repeated(2)
-  r *= x11
-
-  # 50 + 21 = 71 operations
-  r.square_repeated(6)
-  r *= x101
-  r.square_repeated(4)
-  r *= a
-  r.square_repeated(9)
-
-  # 71 + 13 = 84 operations
-  r *= x11101
-  r.square_repeated(5)
-  r *= x1011
-  r.square_repeated(5)
-  r *= x11
-
-  # 84 + 21 = 105 operations
-  r.square_repeated(8)
-  r *= x11101
-  r.square()
-  r *= a
-  r.square_repeated(10)
-
-  # 105 + 20 = 125 operations
-  r *= x10111
-  r.square_repeated(12)
-  r *= x11011
-  r.square_repeated(5)
-  r *= x101
-
-  # 125 + 22 = 147 operations
-  r.square_repeated(7)
-  r *= x101
-  r.square_repeated(6)
-  r *= x1001
-  r.square_repeated(7)
-
-  # 147 + 11 = 158 operations
-  r *= x11101
-  r.square_repeated(5)
-  r *= x10001
-  r.square_repeated(3)
-  r *= x101
-
-  # 158 + 23 = 181 operations
-  r.square_repeated(8)
-  r *= x10001
-  r.square_repeated(6)
-  r *= x11011
-  r.square_repeated(7)
-
-  # 181 + 19 = 200 operations
-  r *= x11111
-  r.square_repeated(4)
-  r *= x11
-  r.square_repeated(12)
-  r *= x1111
-
-  # 200 + 19 = 219 operations
-  r.square_repeated(4)
-  r *= x101
-  r.square_repeated(8)
-  r *= x10011
-  r.square_repeated(5)
-
-  # 219 + 13 = 232 operations
-  r *= x10001
-  r.square_repeated(3)
-  r *= x111
-  r.square_repeated(7)
-  r *= x1111
-
-  # 232 + 22 = 254 operations
-  r.square_repeated(5)
-  r *= x1111
-  r.square_repeated(7)
-  r *= x11011
-  r.square_repeated(8)
-
-  # 254 + 13 = 269 operations
-  r *= x10001
-  r.square_repeated(6)
-  r *= x11111
-  r.square_repeated(6)
-  r *= x11101
-
-  # 269 + 35 = 304 operations
-  r.square_repeated(9)
-  r *= x1001
-  r.square_repeated(5)
-  r *= x1001
-  r.square_repeated(19)
-
-  # 304 + 17 = 321 operations
-  r *= x10111
-  r.square_repeated(8)
-  r *= x1011
-  r.square_repeated(6)
-  r *= x10111
-
-  # 321 + 16 = 337 operations
-  r.square_repeated(4)
-  r *= x101
-  r.square_repeated(4)
-  r *= a
-  r.square_repeated(6)
-
-  # 337 + 29 = 376 operations
-  r *= x11
-  r.square_repeated(29)
-  r *= a
-  r.square_repeated(7)
-  r *= x101
-
-  # 376 + 16 = 392 operations
-  r.square_repeated(9)
-  r *= x10001
-  r.square_repeated(6)
-
-  # 392 + 8*6 = 440 operations
-  for _ in 0 ..< 8:
-    r *= x11111
-    r.square_repeated(5)
-
-  r *= x11111
-  r.square()
-  r *= a
-  # Total 443 operations
--- a/constantine/curves/bls12_381_precomputed_params.nim
+++ b/constantine/curves/bls12_381_precomputed_params.nim
--- a/constantine/curves/bls12_381_inversion.nim
+++ b/constantine/curves/bls12_381_inversion.nim
@ -1,220 +0,0 @@
-# Constantine
-# Copyright (c) 2018-2019    Status Research & Development GmbH
-# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
-# Licensed and distributed under either of
-#   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
-#   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
-# at your option. This file may not be copied, modified, or distributed except according to those terms.
-
-import
-  ../config/curves,
-  ../arithmetic/finite_fields
-
-# ############################################################
-#
-#           Specialized inversion for BLS12-381
-#
-# ############################################################
-
-func inv_addchain*(r: var Fp[BLS12_381], a: Fp[BLS12_381]) =
-  var
-    x10       {.noinit.}: Fp[BLS12_381]
-    x100      {.noinit.}: Fp[BLS12_381]
-    x1000     {.noinit.}: Fp[BLS12_381]
-    x1001     {.noinit.}: Fp[BLS12_381]
-    x1011     {.noinit.}: Fp[BLS12_381]
-    x1101     {.noinit.}: Fp[BLS12_381]
-    x10001    {.noinit.}: Fp[BLS12_381]
-    x10100    {.noinit.}: Fp[BLS12_381]
-    x11001    {.noinit.}: Fp[BLS12_381]
-    x11010    {.noinit.}: Fp[BLS12_381]
-    x110100   {.noinit.}: Fp[BLS12_381]
-    x110110   {.noinit.}: Fp[BLS12_381]
-    x110111   {.noinit.}: Fp[BLS12_381]
-    x1001101  {.noinit.}: Fp[BLS12_381]
-    x1001111  {.noinit.}: Fp[BLS12_381]
-    x1010101  {.noinit.}: Fp[BLS12_381]
-    x1011101  {.noinit.}: Fp[BLS12_381]
-    x1100111  {.noinit.}: Fp[BLS12_381]
-    x1101001  {.noinit.}: Fp[BLS12_381]
-    x1110111  {.noinit.}: Fp[BLS12_381]
-    x1111011  {.noinit.}: Fp[BLS12_381]
-    x10001001 {.noinit.}: Fp[BLS12_381]
-    x10010101 {.noinit.}: Fp[BLS12_381]
-    x10010111 {.noinit.}: Fp[BLS12_381]
-    x10101001 {.noinit.}: Fp[BLS12_381]
-    x10110001 {.noinit.}: Fp[BLS12_381]
-    x10111111 {.noinit.}: Fp[BLS12_381]
-    x11000011 {.noinit.}: Fp[BLS12_381]
-    x11010000 {.noinit.}: Fp[BLS12_381]
-    x11010111 {.noinit.}: Fp[BLS12_381]
-    x11100001 {.noinit.}: Fp[BLS12_381]
-    x11100101 {.noinit.}: Fp[BLS12_381]
-    x11101011 {.noinit.}: Fp[BLS12_381]
-    x11110101 {.noinit.}: Fp[BLS12_381]
-    x11111111 {.noinit.}: Fp[BLS12_381]
-
-  x10       .square(a)
-  x100      .square(x10)
-  x1000     .square(x100)
-  x1001     .prod(a, x1000)
-  x1011     .prod(x10, x1001)
-  x1101     .prod(x10, x1011)
-  x10001    .prod(x100, x1101)
-  x10100    .prod(x1001, x1011)
-  x11001    .prod(x1000, x10001)
-  x11010    .prod(a, x11001)
-  x110100   .square(x11010)
-  x110110   .prod(x10, x110100)
-  x110111   .prod(a, x110110)
-  x1001101  .prod(x11001, x110100)
-  x1001111  .prod(x10, x1001101)
-  x1010101  .prod(x1000, x1001101)
-  x1011101  .prod(x1000, x1010101)
-  x1100111  .prod(x11010, x1001101)
-  x1101001  .prod(x10, x1100111)
-  x1110111  .prod(x11010, x1011101)
-  x1111011  .prod(x100, x1110111)
-  x10001001 .prod(x110100, x1010101)
-  x10010101 .prod(x11010, x1111011)
-  x10010111 .prod(x10, x10010101)
-  x10101001 .prod(x10100, x10010101)
-  x10110001 .prod(x1000, x10101001)
-  x10111111 .prod(x110110, x10001001)
-  x11000011 .prod(x100, x10111111)
-  x11010000 .prod(x1101, x11000011)
-  x11010111 .prod(x10100, x11000011)
-  x11100001 .prod(x10001, x11010000)
-  x11100101 .prod(x100, x11100001)
-  x11101011 .prod(x10100, x11010111)
-  x11110101 .prod(x10100, x11100001)
-  x11111111 .prod(x10100, x11101011)
-  # 35 operations
-
-  # TODO: we can accumulate in a partially reduced
-  #       doubled-size `r` to avoid the final substractions.
-  #       and only reduce at the end.
-  #       This requires the number of op to be less than log2(p) == 381
-
-  # 35 + 22 = 57 operations
-  r.prod(x10111111, x11100001)
-  r.square_repeated(8)
-  r *= x10001
-  r.square_repeated(11)
-  r *= x11110101
-
-  # 57 + 28 = 85 operations
-  r.square_repeated(11)
-  r *= x11100101
-  r.square_repeated(8)
-  r *= x11111111
-  r.square_repeated(7)
-
-  # 85 + 22 = 107 operations
-  r *= x1001101
-  r.square_repeated(9)
-  r *= x1101001
-  r.square_repeated(10)
-  r *= x10110001
-
-  # 107+24 = 131 operations
-  r.square_repeated(7)
-  r *= x1011101
-  r.square_repeated(9)
-  r *= x1111011
-  r.square_repeated(6)
-
-  # 131+23 = 154 operations
-  r *= x11001
-  r.square_repeated(11)
-  r *= x1101001
-  r.square_repeated(9)
-  r *= x11101011
-
-  # 154+28 = 182 operations
-  r.square_repeated(10)
-  r *= x11010111
-  r.square_repeated(6)
-  r *= x11001
-  r.square_repeated(10)
-
-  # 182+23 = 205 operations
-  r *= x1110111
-  r.square_repeated(9)
-  r *= x10010111
-  r.square_repeated(11)
-  r *= x1001111
-
-  # 205+30 = 235 operations
-  r.square_repeated(10)
-  r *= x11100001
-  r.square_repeated(9)
-  r *= x10001001
-  r.square_repeated(9)
-
-  # 235+21 = 256 operations
-  r *= x10111111
-  r.square_repeated(8)
-  r *= x1100111
-  r.square_repeated(10)
-  r *= x11000011
-
-  # 256+28 = 284 operations
-  r.square_repeated(9)
-  r *= x10010101
-  r.square_repeated(12)
-  r *= x1111011
-  r.square_repeated(5)
-
-  # 284 + 21 = 305 operations
-  r *= x1011
-  r.square_repeated(11)
-  r *= x1111011
-  r.square_repeated(7)
-  r *= x1001
-
-  # 305+32 = 337 operations
-  r.square_repeated(13)
-  r *= x11110101
-  r.square_repeated(9)
-  r *= x10111111
-  r.square_repeated(8)
-
-  # 337+22 = 359 operations
-  r *= x11111111
-  r.square_repeated(8)
-  r *= x11101011
-  r.square_repeated(11)
-  r *= x10101001
-
-  # 359+24 = 383 operations
-  r.square_repeated(8)
-  r *= x11111111
-  r.square_repeated(8)
-  r *= x11111111
-  r.square_repeated(6)
-
-  # 383+22 = 405 operations
-  r *= x110111
-  r.square_repeated(10)
-  r *= x11111111
-  r.square_repeated(9)
-  r *= x11111111
-
-  # 405+26 = 431 operations
-  r.square_repeated(8)
-  r *= x11111111
-  r.square_repeated(8)
-  r *= x11111111
-  r.square_repeated(8)
-
-  # 431+19 = 450 operations
-  r *= x11111111
-  r.square_repeated(7)
-  r *= x1010101
-  r.square_repeated(9)
-  r *= x10101001
-
-  # Total 450 operations:
-  # - 74 multiplications
-  # - 376 squarings
--- a/constantine/curves/bn254_nogami_precomputed_params.nim
+++ b/constantine/curves/bn254_nogami_precomputed_params.nim
--- a/constantine/curves/bn254_nogami_inversion.nim
+++ b/constantine/curves/bn254_nogami_inversion.nim
@ -1,98 +0,0 @@
-# Constantine
-# Copyright (c) 2018-2019    Status Research & Development GmbH
-# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
-# Licensed and distributed under either of
-#   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
-#   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
-# at your option. This file may not be copied, modified, or distributed except according to those terms.
-
-import
-  ../config/curves,
-  ../arithmetic/finite_fields
-
-# ############################################################
-#
-#           Specialized inversion for BN254-Nogami
-#
-# ############################################################
-
-func inv_addchain*(r: var Fp[BN254_Nogami], a: Fp[BN254_Nogami]) =
-  var
-    x100     {.noInit.}: Fp[BN254_Nogami]
-    x1000    {.noInit.}: Fp[BN254_Nogami]
-    x1100    {.noInit.}: Fp[BN254_Nogami]
-    x1101    {.noInit.}: Fp[BN254_Nogami]
-    x10001   {.noInit.}: Fp[BN254_Nogami]
-    x100010  {.noInit.}: Fp[BN254_Nogami]
-    x1000100 {.noInit.}: Fp[BN254_Nogami]
-    x1010101 {.noInit.}: Fp[BN254_Nogami]
-
-  x100     .square_repeated(a, 2)
-  x1000    .square(x100)
-  x1100    .prod(x100, x1000)
-  x1101    .prod(a, x1100)
-  x10001   .prod(x100, x1101)
-  x100010  .square(x10001)
-  x1000100 .square(x100010)
-  x1010101 .prod(x10001, x1000100)
-  # 9 operations
-
-  var
-    r13      {.noInit.}: Fp[BN254_Nogami]
-    r17      {.noInit.}: Fp[BN254_Nogami]
-    r18      {.noInit.}: Fp[BN254_Nogami]
-    r23      {.noInit.}: Fp[BN254_Nogami]
-    r26      {.noInit.}: Fp[BN254_Nogami]
-    r27      {.noInit.}: Fp[BN254_Nogami]
-    r28      {.noInit.}: Fp[BN254_Nogami]
-    r36      {.noInit.}: Fp[BN254_Nogami]
-    r38      {.noInit.}: Fp[BN254_Nogami]
-    r39      {.noInit.}: Fp[BN254_Nogami]
-    r40      {.noInit.}: Fp[BN254_Nogami]
-
-  r13.square_repeated(x1010101, 2)
-  r13 *= x100010
-  r13 *= x1101
-
-  r17.square(r13)
-  r17 *= r13
-  r17.square_repeated(2)
-
-  r18.prod(r13, r17)
-
-  r23.square_repeated(r18, 3)
-  r23 *= r18
-  r23 *= r17
-
-  r26.square_repeated(r23, 2)
-  r26 *= r23
-
-  r27.prod(r23, r26)
-  r28.prod(r26, r27)
-
-  r36.square(r28)
-  r36 *= r28
-  r36.square_repeated(2)
-  r36 *= r28
-  r36.square_repeated(3)
-
-  r38.prod(r28, r36)
-  r38 *= r27
-  r39.square(r38)
-  r40.prod(r38, r39)
-
-  r.prod(r39, r40)
-  r.square_repeated(3)
-  r *= r40
-  r.square_repeated(55)
-  r *= r38
-
-  r.square_repeated(55)
-  r *= r28
-  r.square_repeated(56)
-  r *= r18
-  r.square_repeated(56)
-
-  r *= x10001
-
-  # Total 271 operations
--- a/constantine/curves/bn254_snarks_precomputed_params.nim
+++ b/constantine/curves/bn254_snarks_precomputed_params.nim
--- a/constantine/curves/bn254_snarks_inversion.nim
+++ b/constantine/curves/bn254_snarks_inversion.nim
@ -1,158 +0,0 @@
-# Constantine
-# Copyright (c) 2018-2019    Status Research & Development GmbH
-# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
-# Licensed and distributed under either of
-#   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
-#   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
-# at your option. This file may not be copied, modified, or distributed except according to those terms.
-
-import
-  ../config/curves,
-  ../arithmetic/finite_fields
-
-# ############################################################
-#
-#           Specialized inversion for BN254-Snarks
-#
-# ############################################################
-
-func inv_addchain*(r: var Fp[BN254_Snarks], a: Fp[BN254_Snarks]) =
-  var
-    x10       {.noInit.}: Fp[BN254_Snarks]
-    x11       {.noInit.}: Fp[BN254_Snarks]
-    x101      {.noInit.}: Fp[BN254_Snarks]
-    x110      {.noInit.}: Fp[BN254_Snarks]
-    x1000     {.noInit.}: Fp[BN254_Snarks]
-    x1101     {.noInit.}: Fp[BN254_Snarks]
-    x10010    {.noInit.}: Fp[BN254_Snarks]
-    x10011    {.noInit.}: Fp[BN254_Snarks]
-    x10100    {.noInit.}: Fp[BN254_Snarks]
-    x10111    {.noInit.}: Fp[BN254_Snarks]
-    x11100    {.noInit.}: Fp[BN254_Snarks]
-    x100000   {.noInit.}: Fp[BN254_Snarks]
-    x100011   {.noInit.}: Fp[BN254_Snarks]
-    x101011   {.noInit.}: Fp[BN254_Snarks]
-    x101111   {.noInit.}: Fp[BN254_Snarks]
-    x1000001  {.noInit.}: Fp[BN254_Snarks]
-    x1010011  {.noInit.}: Fp[BN254_Snarks]
-    x1011011  {.noInit.}: Fp[BN254_Snarks]
-    x1100001  {.noInit.}: Fp[BN254_Snarks]
-    x1110101  {.noInit.}: Fp[BN254_Snarks]
-    x10010001 {.noInit.}: Fp[BN254_Snarks]
-    x10010101 {.noInit.}: Fp[BN254_Snarks]
-    x10110101 {.noInit.}: Fp[BN254_Snarks]
-    x10111011 {.noInit.}: Fp[BN254_Snarks]
-    x11000001 {.noInit.}: Fp[BN254_Snarks]
-    x11000011 {.noInit.}: Fp[BN254_Snarks]
-    x11010011 {.noInit.}: Fp[BN254_Snarks]
-    x11100001 {.noInit.}: Fp[BN254_Snarks]
-    x11100011 {.noInit.}: Fp[BN254_Snarks]
-    x11100111 {.noInit.}: Fp[BN254_Snarks]
-
-  x10       .square(a)
-  x11       .prod(x10, a)
-  x101      .prod(x10, x11)
-  x110      .prod(x101, a)
-  x1000     .prod(x10, x110)
-  x1101     .prod(x101, x1000)
-  x10010    .prod(x101, x1101)
-  x10011    .prod(x10010, a)
-  x10100    .prod(x10011, a)
-  x10111    .prod(x11, x10100)
-  x11100    .prod(x101, x10111)
-  x100000   .prod(x1101, x10011)
-  x100011   .prod(x11, x100000)
-  x101011   .prod(x1000, x100011)
-  x101111   .prod(x10011, x11100)
-  x1000001  .prod(x10010, x101111)
-  x1010011  .prod(x10010, x1000001)
-  x1011011  .prod(x1000, x1010011)
-  x1100001  .prod(x110, x1011011)
-  x1110101  .prod(x10100, x1100001)
-  x10010001 .prod(x11100, x1110101)
-  x10010101 .prod(x100000, x1110101)
-  x10110101 .prod(x100000, x10010101)
-  x10111011 .prod(x110, x10110101)
-  x11000001 .prod(x110, x10111011)
-  x11000011 .prod(x10, x11000001)
-  x11010011 .prod(x10010, x11000001)
-  x11100001 .prod(x100000, x11000001)
-  x11100011 .prod(x10, x11100001)
-  x11100111 .prod(x110, x11100001) # 30 operations
-
-  # 30 + 27 = 57 operations
-  r.square(x11000001)
-  r.square_repeated(7)
-  r *= x10010001
-  r.square_repeated(10)
-  r *= x11100111
-  r.square_repeated(7)
-
-  # 57 + 19 = 76 operations
-  r *= x10111
-  r.square_repeated(9)
-  r *= x10011
-  r.square_repeated(7)
-  r *= x1101
-
-  # 76 + 33 = 109 operations
-  r.square_repeated(14)
-  r *= x1010011
-  r.square_repeated(9)
-  r *= x11100001
-  r.square_repeated(8)
-
-  # 109 + 18 = 127 operations
-  r *= x1000001
-  r.square_repeated(10)
-  r *= x1011011
-  r.square_repeated(5)
-  r *= x1101
-
-  # 127 + 34 = 161 operations
-  r.square_repeated(8)
-  r *= x11
-  r.square_repeated(12)
-  r *= x101011
-  r.square_repeated(12)
-
-  # 161 + 25 = 186 operations
-  r *= x10111011
-  r.square_repeated(8)
-  r *= x101111
-  r.square_repeated(14)
-  r *= x10110101
-
-  # 186 + 28 = 214
-  r.square_repeated(9)
-  r *= x10010001
-  r.square_repeated(5)
-  r *= x1101
-  r.square_repeated(12)
-
-  # 214 + 22 = 236
-  r *= x11100011
-  r.square_repeated(8)
-  r *= x10010101
-  r.square_repeated(11)
-  r *= x11010011
-
-  # 236 + 32 = 268
-  r.square_repeated(7)
-  r *= x1100001
-  r.square_repeated(11)
-  r *= x100011
-  r.square_repeated(12)
-
-  # 268 + 20 = 288
-  r *= x1011011
-  r.square_repeated(9)
-  r *= x11000011
-  r.square_repeated(8)
-  r *= x11100111
-
-  # 288 + 15 = 303
-  r.square_repeated(7)
-  r *= x1110101
-  r.square_repeated(6)
-  r *= x101
--- a/constantine/curves/bw6_761_precomputed_params.nim
+++ b/constantine/curves/bw6_761_precomputed_params.nim
--- a/constantine/curves/bw6_761_inversion.nim
+++ b/constantine/curves/bw6_761_inversion.nim
@ -1,376 +0,0 @@
-# Constantine
-# Copyright (c) 2018-2019    Status Research & Development GmbH
-# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
-# Licensed and distributed under either of
-#   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
-#   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
-# at your option. This file may not be copied, modified, or distributed except according to those terms.
-
-import
-  ../config/curves,
-  ../arithmetic/finite_fields
-
-# ############################################################
-#
-#           Specialized inversion for BW6-761
-#
-# ############################################################
-
-func inv_addchain*(r: var Fp[BW6_761], a: Fp[BW6_761]) =
-  let a = a # ensure a.inv_addchain(a) is OK
-
-  var
-    x10       {.noInit.}: Fp[BW6_761]
-    x11       {.noInit.}: Fp[BW6_761]
-    x101      {.noInit.}: Fp[BW6_761]
-    x111      {.noInit.}: Fp[BW6_761]
-    x1001     {.noInit.}: Fp[BW6_761]
-    x1011     {.noInit.}: Fp[BW6_761]
-    x1101     {.noInit.}: Fp[BW6_761]
-    x1111     {.noInit.}: Fp[BW6_761]
-    x10001    {.noInit.}: Fp[BW6_761]
-    x10010    {.noInit.}: Fp[BW6_761]
-    x10011    {.noInit.}: Fp[BW6_761]
-    x10111    {.noInit.}: Fp[BW6_761]
-    x11001    {.noInit.}: Fp[BW6_761]
-    x11011    {.noInit.}: Fp[BW6_761]
-    x11101    {.noInit.}: Fp[BW6_761]
-    x11111    {.noInit.}: Fp[BW6_761]
-    x100001   {.noInit.}: Fp[BW6_761]
-    x100011   {.noInit.}: Fp[BW6_761]
-    x100101   {.noInit.}: Fp[BW6_761]
-    x100111   {.noInit.}: Fp[BW6_761]
-    x101001   {.noInit.}: Fp[BW6_761]
-    x101011   {.noInit.}: Fp[BW6_761]
-    x101101   {.noInit.}: Fp[BW6_761]
-    x101111   {.noInit.}: Fp[BW6_761]
-    x110001   {.noInit.}: Fp[BW6_761]
-    x110011   {.noInit.}: Fp[BW6_761]
-    x110101   {.noInit.}: Fp[BW6_761]
-    x110111   {.noInit.}: Fp[BW6_761]
-    x111001   {.noInit.}: Fp[BW6_761]
-    x111011   {.noInit.}: Fp[BW6_761]
-    x111101   {.noInit.}: Fp[BW6_761]
-    x1111010  {.noInit.}: Fp[BW6_761]
-    x1111111  {.noInit.}: Fp[BW6_761]
-    x11111110 {.noInit.}: Fp[BW6_761]
-    x11111111 {.noInit.}: Fp[BW6_761]
-
-  x10       .square(a)
-  x11       .prod(a, x10)
-  x101      .prod(x10, x11)
-  x111      .prod(x10, x101)
-  x1001     .prod(x10, x111)
-  x1011     .prod(x10, x1001)
-  x1101     .prod(x10, x1011)
-  x1111     .prod(x10, x1101)
-  x10001    .prod(x10, x1111)
-  x10010    .prod(a, x10001)
-  x10011    .prod(a, x10010)
-  x10111    .prod(x101, x10010)
-  x11001    .prod(x10, x10111)
-  x11011    .prod(x10, x11001)
-  x11101    .prod(x10, x11011)
-  x11111    .prod(x10, x11101)
-  x100001   .prod(x10, x11111)
-  x100011   .prod(x10, x100001)
-  x100101   .prod(x10, x100011)
-  x100111   .prod(x10, x100101)
-  x101001   .prod(x10, x100111)
-  x101011   .prod(x10, x101001)
-  x101101   .prod(x10, x101011)
-  x101111   .prod(x10, x101101)
-  x110001   .prod(x10, x101111)
-  x110011   .prod(x10, x110001)
-  x110101   .prod(x10, x110011)
-  x110111   .prod(x10, x110101)
-  x111001   .prod(x10, x110111)
-  x111011   .prod(x10, x111001)
-  x111101   .prod(x10, x111011)
-  x1111010  .square(x111101)
-  x1111111  .prod(x101, x1111010)
-  x11111110 .square(x1111111)
-  x11111111 .prod(a, x11111110)
-  # 35 operations
-
-  # TODO: we can accumulate in a partially reduced
-  #       doubled-size `r` to avoid the final substractions.
-  #       and only reduce at the end.
-  #       This requires the number of op to be less than log2(p) == 381
-
-  # 35 + 8 = 43 operations
-  r.prod(x100001, x11111111)
-  r.square_repeated(3)
-  r *= x10111
-  r.square_repeated(2)
-  r *= a
-
-  # 43 + 22 = 65 operations
-  r.square_repeated(9)
-  r *= x1001
-  r.square_repeated(7)
-  r *= x11111
-  r.square_repeated(4)
-
-  # 65 + 17 = 82 operations
-  r *= x111
-  r.square_repeated(9)
-  r *= x1111
-  r.square_repeated(5)
-  r *= x111
-
-  # 82 + 29 = 111 operations
-  r.square_repeated(11)
-  r *= x101011
-  r.square_repeated(7)
-  r *= x100011
-  r.square_repeated(9)
-
-  # 111 + 28 = 139 operations
-  r *= x11111
-  r.square_repeated(8)
-  r *= x100101
-  r.square_repeated(17)
-  r *= x100111
-
-  # 139 + 22 = 161 operations
-  r.square_repeated(4)
-  r *= x1101
-  r.square_repeated(9)
-  r *= x11111111
-  r.square_repeated(7)
-
-  # 161 + 15 = 176 operations
-  r *= x11111
-  r.square_repeated(6)
-  r *= x10111
-  r.square_repeated(6)
-  r *= x1001
-
-  # 176 + 22 = 198 operations
-  r.square_repeated(4)
-  r *= x11
-  r.square_repeated(6)
-  r *= x11
-  r.square_repeated(10)
-
-  # 198 + 16 = 214 operations
-  r *= x110101
-  r.square_repeated(2)
-  r *= a
-  r.square_repeated(11)
-  r *= x11101
-
-  # 214 + 28 = 238 operations
-  r.square_repeated(6)
-  r *= x101
-  r.square_repeated(7)
-  r *= x1101
-  r.square_repeated(9)
-
-  # 238 + 21 = 259 operations
-  r *= x100001
-  r.square_repeated(7)
-  r *= x100101
-  r.square_repeated(11)
-  r *= x100111
-
-  # 259 + 28 = 287 operations
-  r.square_repeated(7)
-  r *= x101111
-  r.square_repeated(6)
-  r *= x11111
-  r.square_repeated(13)
-
-  # 287 + 25 = 302 operations
-  r *= x100001
-  r.square_repeated(6)
-  r *= x111011
-  r.square_repeated(6)
-  r *= x111001
-
-  # 302 + 27 = 329 operations
-  r.square_repeated(10)
-  r *= x10111
-  r.square_repeated(11)
-  r *= x111101
-  r.square_repeated(4)
-
-  # 329 + 17 = 346 operations
-  r *= x1101
-  r.square_repeated(8)
-  r *= x110001
-  r.square_repeated(6)
-  r *= x110001
-
-  # 346 + 20 = 366 operations
-  r.square_repeated(5)
-  r *= x11001
-  r.square_repeated(3)
-  r *= x11
-  r.square_repeated(10)
-
-  # 366 + 16 = 382 operations
-  r *= x100111
-  r.square_repeated(5)
-  r *= x1001
-  r.square_repeated(8)
-  r *= x11001
-
-  # 382 + 25 = 407 operations
-  r.square_repeated(10)
-  r *= x1111
-  r.square_repeated(7)
-  r *= x11101
-  r.square_repeated(6)
-
-  # 407 + 20 = 427 operations
-  r *= x11101
-  r.square_repeated(9)
-  r *= x11111111
-  r.square_repeated(8)
-  r *= x100101
-
-  # 427 + 27 = 454 operations
-  r.square_repeated(6)
-  r *= x101101
-  r.square_repeated(10)
-  r *= x100011
-  r.square_repeated(9)
-
-  # 454 + 20 = 474 operations
-  r *= x1001
-  r.square_repeated(8)
-  r *= x1101
-  r.square_repeated(9)
-  r *= x100111
-
-  # 474 + 25 = 499 operations
-  r.square_repeated(8)
-  r *= x100011
-  r.square_repeated(6)
-  r *= x101101
-  r.square_repeated(9)
-
-  # 499 + 16 = 515 operations
-  r *= x100101
-  r.square_repeated(4)
-  r *= x1111
-  r.square_repeated(9)
-  r *= x1111111
-
-  # 515 + 25 = 540 operations
-  r.square_repeated(6)
-  r *= x11001
-  r.square_repeated(8)
-  r *= x111
-  r.square_repeated(9)
-
-  # 540 + 15 = 555 operations
-  r *= x111011
-  r.square_repeated(5)
-  r *= x10011
-  r.square_repeated(7)
-  r *= x100111
-
-  # 555 + 22 = 577 operations
-  r.square_repeated(5)
-  r *= x10111
-  r.square_repeated(9)
-  r *= x111001
-  r.square_repeated(6)
-
-  # 577 + 14 = 591 operations
-  r *= x111101
-  r.square_repeated(9)
-  r *= x11111111
-  r.square_repeated(2)
-  r *= x11
-
-  # 591 + 21 = 612 operations
-  r.square_repeated(7)
-  r *= x10111
-  r.square_repeated(6)
-  r *= x10011
-  r.square_repeated(6)
-
-  # 612 + 18 = 630 operations
-  r *= x101
-  r.square_repeated(9)
-  r *= x10001
-  r.square_repeated(6)
-  r *= x11011
-
-  # 630 + 27 = 657 operations
-  r.square_repeated(10)
-  r *= x100101
-  r.square_repeated(7)
-  r *= x110011
-  r.square_repeated(8)
-
-  # 657 + 13 = 670 operations
-  r *= x111101
-  r.square_repeated(7)
-  r *= x100011
-  r.square_repeated(3)
-  r *= x111
-
-  # 670 + 26 = 696 operations
-  r.square_repeated(10)
-  r *= x1011
-  r.square_repeated(11)
-  r *= x110011
-  r.square_repeated(3)
-
-  # 696 + 17 = 713 operations
-  r *= x111
-  r.square_repeated(9)
-  r *= x101011
-  r.square_repeated(5)
-  r *= x10111
-
-  # 713 + 21 = 734 operations
-  r.square_repeated(7)
-  r *= x101011
-  r.square_repeated(2)
-  r *= x11
-  r.square_repeated(10)
-
-  # 734 + 19 = 753 operations
-  r *= x101001
-  r.square_repeated(10)
-  r *= x110111
-  r.square_repeated(6)
-  r *= x111001
-
-  # 753 + 23 = 776 operations
-  r.square_repeated(6)
-  r *= x101001
-  r.square_repeated(9)
-  r *= x100111
-  r.square_repeated(6)
-
-  # 776 + 12 = 788 operations
-  r *= x110011
-  r.square_repeated(7)
-  r *= x100001
-  r.square_repeated(2)
-  r *= x11
-
-  # 788 + 39 = 827 operations
-  r.square_repeated(21)
-  r *= a
-  r.square_repeated(11)
-  r *= x101111
-  r.square_repeated(5)
-
-  # 827 + 55 = 882 operations
-  r *= x1001
-  r.square_repeated(7)
-  r *= x11101
-  r.square_repeated(45)
-  r *= x10001
-
-  # 882 + 4 = 886 operations
-  r.square_repeated(3)
-  r *= a
--- a/constantine/curves/curve25519_inversion.nim
+++ b/constantine/curves/curve25519_inversion.nim
@ -1,60 +0,0 @@
-# Constantine
-# Copyright (c) 2018-2019    Status Research & Development GmbH
-# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
-# Licensed and distributed under either of
-#   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
-#   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
-# at your option. This file may not be copied, modified, or distributed except according to those terms.
-
-import
-  ../config/curves,
-  ../arithmetic/finite_fields
-
-# ############################################################
-#
-#           Specialized inversion for BLS12-381
-#
-# ############################################################
-
-func inv_addchain*(r: var Fp[Curve25519], a: Fp[Curve25519]) =
-  var
-    x10       {.noinit.}: Fp[Curve25519]
-    x1001     {.noinit.}: Fp[Curve25519]
-    x1011     {.noinit.}: Fp[Curve25519]
-
-  x10       .square(a)               # 2
-  x1001     .square_repeated(x10, 2) # 8
-  x1001     *= a                     # 9
-  x1011     .prod(x10, x1001)        # 11
-  # 5 operations
-
-  # TODO: we can accumulate in a partially reduced
-  #       doubled-size `r` to avoid the final substractions.
-  #       and only reduce at the end.
-  #       This requires the number of op to be less than log2(p) == 255
-
-  template t: untyped = x10
-  
-  t.square(x1011)              # 22
-  r.prod(t, x1001)             # 31 = 2⁵-1
-
-  template u: untyped = x1001
-
-  t.square_repeated(r, 5)
-  r *= t                       # 2¹⁰-1
-  t.square_repeated(r, 10)
-  t *= r                       # 2²⁰-1
-
-  u.square_repeated(t, 20)
-  t *= u                       # 2⁴⁰-1
-  t.square_repeated(10)
-  t *= r                       # 2⁵⁰-1
-  r.square_repeated(t, 50)
-  r *= t                       # 2¹⁰⁰-1
-  
-  u.square_repeated(r, 100)
-  r *= u                       # 2²⁰⁰-1
-  r.square_repeated(50)
-  r *= t                       # 2²⁵⁰-1
-  r.square_repeated(5)
-  r *= x1011                   # 2²⁵⁵-21 (note: 11 = 2⁵-21)
--- a/constantine/curves/secp256k1_inversion.nim
+++ b/constantine/curves/secp256k1_inversion.nim
@ -1,96 +0,0 @@
-# Constantine
-# Copyright (c) 2018-2019    Status Research & Development GmbH
-# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
-# Licensed and distributed under either of
-#   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
-#   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
-# at your option. This file may not be copied, modified, or distributed except according to those terms.
-
-import
-  ../config/curves,
-  ../arithmetic/finite_fields
-
-# ############################################################
-#
-#           Specialized inversion for Secp256k1
-#
-# ############################################################
-
-func inv_addchain*(r: var Fp[Secp256k1], a: Fp[Secp256k1]) {.used.}=
-  ## We invert via Little Fermat's theorem
-  ## a^(-1) ≡ a^(p-2) (mod p)
-  ## with p = "0xFFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFE FFFFFC2F"
-  ## We take advantage of the prime special form to hardcode
-  ## the sequence of squarings and multiplications for the modular exponentiation
-  ##
-  ## See libsecp256k1
-  ##
-  ## The binary representation of (p - 2) has 5 blocks of 1s, with lengths in
-  ## { 1, 2, 22, 223 }. Use an addition chain to calculate 2^n - 1 for each block:
-  ## [1], [2], 3, 6, 9, 11, [22], 44, 88, 176, 220, [223]
-
-  var
-    x2{.noInit.}: Fp[Secp256k1]
-    x3{.noInit.}: Fp[Secp256k1]
-    x6{.noInit.}: Fp[Secp256k1]
-    x9{.noInit.}: Fp[Secp256k1]
-    x11{.noInit.}: Fp[Secp256k1]
-    x22{.noInit.}: Fp[Secp256k1]
-    x44{.noInit.}: Fp[Secp256k1]
-    x88{.noInit.}: Fp[Secp256k1]
-    x176{.noInit.}: Fp[Secp256k1]
-    x220{.noInit.}: Fp[Secp256k1]
-    x223{.noInit.}: Fp[Secp256k1]
-
-  x2.square(a)
-  x2 *= a
-
-  x3.square(x2)
-  x3 *= a
-
-  x6 = x3
-  x6.square_repeated(3)
-  x6 *= x3
-
-  x9 = x6
-  x9.square_repeated(3)
-  x9 *= x3
-
-  x11 = x9
-  x11.square_repeated(2)
-  x11 *= x2
-
-  x22 = x11
-  x22.square_repeated(11)
-  x22 *= x11
-
-  x44 = x22
-  x44.square_repeated(22)
-  x44 *= x22
-
-  x88 = x44
-  x88.square_repeated(44)
-  x88 *= x44
-
-  x176 = x88
-  x88.square_repeated(88)
-  x176 *= x88
-
-  x220 = x176
-  x220.square_repeated(44)
-  x220 *= x44
-
-  x223 = x220
-  x223.square_repeated(3)
-  x223 *= x3
-
-  # The final result is then assembled using a sliding window over the blocks
-  r = x223
-  r.square_repeated(23)
-  r *= x22
-  r.square_repeated(5)
-  r *= a
-  r.square_repeated(3)
-  r *= x2
-  r.square_repeated(2)
-  r *= a
--- a/constantine/curves/zoo_precomputed_params.nim
+++ b/constantine/curves/zoo_precomputed_params.nim
@ -9,11 +9,11 @@
 import
  std/macros,
  ../config/curves,
-  ./bls12_377_precomputed_params,
-  ./bls12_381_precomputed_params,
-  ./bn254_nogami_precomputed_params,
-  ./bn254_snarks_precomputed_params,
-  ./bw6_761_precomputed_params
+  ./bls12_377_g2_params,
+  ./bls12_381_g2_params,
+  ./bn254_nogami_g2_params,
+  ./bn254_snarks_g2_params,
+  ./bw6_761_g2_params

 {.experimental: "dynamicBindSym".}

--- a/constantine/curves/zoo_inversions.nim
+++ b/constantine/curves/zoo_inversions.nim
@ -1,35 +0,0 @@
-# Constantine
-# Copyright (c) 2018-2019    Status Research & Development GmbH
-# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
-# Licensed and distributed under either of
-#   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
-#   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
-# at your option. This file may not be copied, modified, or distributed except according to those terms.
-
-import
-  ../config/curves,
-  ./bls12_377_inversion,
-  ./bls12_381_inversion,
-  ./bn254_nogami_inversion,
-  ./bn254_snarks_inversion,
-  ./bw6_761_inversion,
-  ./secp256k1_inversion,
-  ./curve25519_inversion
-
-
-export
-  bls12_377_inversion,
-  bls12_381_inversion,
-  bn254_nogami_inversion,
-  bn254_snarks_inversion,
-  bw6_761_inversion,
-  secp256k1_inversion,
-  curve25519_inversion
-
-func hasInversionAddchain*(C: static Curve): static bool =
-  ## Is an inversion addition chain implemented for the curve.
-  ## Note: the addition chain might be slower than Euclid-based inversion.
-  when C in {BN254_Nogami, BN254_Snarks, BLS12_377, BLS12_381, BW6_761, Curve25519, Secp256k1}:
-    true
-  else:
-    false
--- a/constantine/elliptic/ec_endomorphism_accel.nim
+++ b/constantine/elliptic/ec_endomorphism_accel.nim
@ -218,10 +218,10 @@ func nDimMultiScalarRecoding[M, L: static int](
      k[j].div2()
      k[j] += SecretWord (bji and b[0][i])

-func buildLookupTable[M: static int, EC](
+func buildLookupTable[M: static int, EC, ECaff](
       P: EC,
       endomorphisms: array[M-1, EC],
-       lut: var array[1 shl (M-1), EC],
+       lut: var array[1 shl (M-1), ECaff],
     ) =
  ## Build the lookup table from the base point P
  ## and the curve endomorphism
@ -251,11 +251,17 @@ func buildLookupTable[M: static int, EC](
  #
  #   This scheme ensures 1 addition per table entry instead of a number
  #   of addition dependent on `u` Hamming Weight
-  lut[0] = P
+
+  # Step 1. Create the lookup-table in alternative coordinates
+  var tab {.noInit.}: array[1 shl (M-1), EC]
+  tab[0] = P
  for u in 1'u32 ..< 1 shl (M-1):
    # The recoding allows usage of 2^(n-1) table instead of the usual 2^n with NAF
    let msb = u.log2_vartime() # No undefined, u != 0
-    lut[u].sum(lut[u.clearBit(msb)], endomorphisms[msb])
+    tab[u].sum(tab[u.clearBit(msb)], endomorphisms[msb])
+
+  # Step 2. Convert to affine coordinates to benefit from mixed-addition
+  lut.batchAffine(tab)

 func tableIndex(glv: GLV_SAC, bit: int): SecretWord =
  ## Compose the secret table index from
@ -286,12 +292,14 @@ func scalarMulEndo*[scalBits; EC](
  ## Requires:
  ## - Cofactor to be cleared
  ## - 0 <= scalar < curve order
+  mixin affine
+  type ECaff = affine(EC)
  const C = P.F.C # curve
  static: doAssert scalBits <= C.getCurveOrderBitwidth(), "Do not use endomorphism to multiply beyond the curve order"
  when P.F is Fp:
    const M = 2
    # 1. Compute endomorphisms
-    var endomorphisms {.noInit.}: array[M-1, typeof(P)]
+    var endomorphisms {.noInit.}: array[M-1, EC]
    when P.G == G1:
      endomorphisms[0] = P
      endomorphisms[0].x *= C.getCubicRootOfUnity_mod_p()
@ -301,7 +309,7 @@ func scalarMulEndo*[scalBits; EC](
  elif P.F is Fp2:
    const M = 4
    # 1. Compute endomorphisms
-    var endomorphisms {.noInit.}: array[M-1, typeof(P)]
+    var endomorphisms {.noInit.}: array[M-1, EC]
    endomorphisms[0].frobenius_psi(P)
    endomorphisms[1].frobenius_psi(P, 2)
    endomorphisms[2].frobenius_psi(P, 3)
@ -331,10 +339,8 @@ func scalarMulEndo*[scalBits; EC](
      endomorphisms[i-1].cneg(negatePoints[i])

  # 4. Precompute lookup table
-  var lut {.noInit.}: array[1 shl (M-1), typeof(P)]
+  var lut {.noInit.}: array[1 shl (M-1), ECaff]
  buildLookupTable(P, endomorphisms, lut)
-  # TODO: Montgomery simultaneous inversion (or other simultaneous inversion techniques)
-  #       so that we use mixed addition formulas in the main loop

  # 5. Recode the miniscalars
  #    we need the base miniscalar (that encodes the sign)
@ -346,12 +352,13 @@ func scalarMulEndo*[scalBits; EC](
  recoded.nDimMultiScalarRecoding(miniScalars)

  # 6. Proceed to GLV accelerated scalar multiplication
-  var Q {.noInit.}: typeof(P)
-  Q.secretLookup(lut, recoded.tableIndex(L-1))
+  var Q {.noInit.}: EC
+  var tmp {.noInit.}: ECaff
+  tmp.secretLookup(lut, recoded.tableIndex(L-1))
+  Q.fromAffine(tmp)

  for i in countdown(L-2, 0):
    Q.double()
-    var tmp {.noInit.}: typeof(Q)
    tmp.secretLookup(lut, recoded.tableIndex(i))
    tmp.cneg(SecretBool recoded[0][i])
    Q += tmp
@ -392,34 +399,40 @@ func scalarMulEndo*[scalBits; EC](
 #   -  0t10   ->  0b10  is  2
 #   -  0t11   ->  0b11  is  3

-func buildLookupTable_m2w2[EC](
+func buildLookupTable_m2w2[EC, Ecaff](
       P0: EC,
       P1: EC,
-       lut: var array[8, EC],
+       lut: var array[8, Ecaff],
     ) =
  ## Build a lookup table for GLV with 2-dimensional decomposition
  ## and window of size 2

+  # Step 1. Create the lookup-table in alternative coordinates
+  var tab {.noInit.}: array[8, EC]
+
  # with [k0, k1] the mini-scalars with digits of size 2-bit
  #
  # 4 = 0b100 - encodes [0b01, 0b00] ≡ P0
-  lut[4] = P0
+  tab[4] = P0
  # 5 = 0b101 - encodes [0b01, 0b01] ≡ P0 - P1
-  lut[5].diff(lut[4], P1)
+  tab[5].diff(tab[4], P1)
  # 7 = 0b111 - encodes [0b01, 0b11] ≡ P0 + P1
-  lut[7].sum(lut[4], P1)
+  tab[7].sum(tab[4], P1)
  # 6 = 0b110 - encodes [0b01, 0b10] ≡ P0 + 2P1
-  lut[6].sum(lut[7], P1)
+  tab[6].sum(tab[7], P1)

  # 0 = 0b000 - encodes [0b00, 0b00] ≡ 3P0
-  lut[0].double(lut[4])
-  lut[0] += lut[4]
+  tab[0].double(tab[4])
+  tab[0] += tab[4]
  # 1 = 0b001 - encodes [0b00, 0b01] ≡ 3P0 + P1
-  lut[1].sum(lut[0], P1)
+  tab[1].sum(tab[0], P1)
  # 2 = 0b010 - encodes [0b00, 0b10] ≡ 3P0 + 2P1
-  lut[2].sum(lut[1], P1)
+  tab[2].sum(tab[1], P1)
  # 3 = 0b011 - encodes [0b00, 0b11] ≡ 3P0 + 3P1
-  lut[3].sum(lut[2], P1)
+  tab[3].sum(tab[2], P1)
+
+  # Step 2. Convert to affine coordinates to benefit from mixed-addition
+  lut.batchAffine(tab)

 func w2Get(recoding: Recoded,
          digitIdx: int): uint8 {.inline.}=
@ -477,6 +490,8 @@ func scalarMulGLV_m2w2*[scalBits; EC](
  ## Requires:
  ## - Cofactor to be cleared
  ## - 0 <= scalar < curve order
+  mixin affine
+  type ECaff = affine(EC)
  const C = P0.F.C # curve
  static: doAssert: scalBits == C.getCurveOrderBitwidth()

@ -485,7 +500,7 @@ func scalarMulGLV_m2w2*[scalBits; EC](
    var P1 = P0
    P1.x *= C.getCubicRootOfUnity_mod_p()
  else:
-    var P1 {.noInit.}: typeof(P0)
+    var P1 {.noInit.}: EC
    P1.frobenius_psi(P0, 2)

  # 2. Decompose scalar into mini-scalars
@ -503,10 +518,8 @@ func scalarMulGLV_m2w2*[scalBits; EC](
    P1.cneg(negatePoints[1])

  # 4. Precompute lookup table
-  var lut {.noInit.}: array[8, EC]
+  var lut {.noInit.}: array[8, ECaff]
  buildLookupTable_m2w2(P0, P1, lut)
-  # TODO: Montgomery simultaneous inversion (or other simultaneous inversion techniques)
-  #       so that we use mixed addition formulas in the main loop

  # 5. Recode the miniscalars
  #    we need the base miniscalar (that encodes the sign)
@ -518,15 +531,16 @@ func scalarMulGLV_m2w2*[scalBits; EC](
  recoded.nDimMultiScalarRecoding(miniScalars)

  # 6. Proceed to GLV accelerated scalar multiplication
-  var Q {.noInit.}: typeof(P0)
+  var Q {.noInit.}: EC
+  var tmp {.noInit.}: ECaff
  var isNeg: SecretBool

-  Q.secretLookup(lut, recoded.w2TableIndex((L div 2) - 1, isNeg))
+  tmp.secretLookup(lut, recoded.w2TableIndex((L div 2) - 1, isNeg))
+  Q.fromAffine(tmp)

  for i in countdown((L div 2) - 2, 0):
    Q.double()
    Q.double()
-    var tmp {.noInit.}: typeof(Q)
    tmp.secretLookup(lut, recoded.w2TableIndex(i, isNeg))
    tmp.cneg(isNeg)
    Q += tmp
--- a/constantine/elliptic/ec_shortweierstrass_affine.nim
+++ b/constantine/elliptic/ec_shortweierstrass_affine.nim
@ -12,7 +12,7 @@ import
  ../arithmetic,
  ../towers,
  ../io/[io_fields, io_towers],
-  ../curves/zoo_precomputed_params
+  ../curves/zoo_g2_params

 # ############################################################
 #
@ -45,6 +45,14 @@ func isInf*(P: ECP_ShortW_Aff): SecretBool =
  ## and false otherwise
  result = P.x.isZero() and P.y.isZero()

+func ccopy*(P: var ECP_ShortW_Aff, Q: ECP_ShortW_Aff, ctl: SecretBool) {.inline.} =
+  ## Constant-time conditional copy
+  ## If ctl is true: Q is copied into P
+  ## if ctl is false: Q is not copied and P is unmodified
+  ## Time and memory accesses are the same whether a copy occurs or not
+  for fP, fQ in fields(P, Q):
+    ccopy(fP, fQ, ctl)
+
 func curve_eq_rhs*[F](y2: var F, x: F, G: static Subgroup) =
  ## Compute the curve equation right-hand-side from field element `x`
  ## i.e.  `y²` in `y² = x³ + a x + b`
--- a/constantine/elliptic/ec_shortweierstrass_jacobian.nim
+++ b/constantine/elliptic/ec_shortweierstrass_jacobian.nim
@ -34,6 +34,10 @@ type ECP_ShortW_Jac*[F; G: static Subgroup] = object
  ## Note that jacobian coordinates are not unique
  x*, y*, z*: F

+template affine*[F, G](_: type ECP_ShortW_Jac[F, G]): typedesc =
+  ## Returns the affine type that corresponds to the Jacobian type input
+  ECP_ShortW_Aff[F, G]
+
 func `==`*(P, Q: ECP_ShortW_Jac): SecretBool =
  ## Constant-time equality check
  ## This is a costly operation
@ -379,6 +383,8 @@ func madd*[F; G: static Subgroup](
  ## with p in Jacobian coordinates and Q in affine coordinates
  ##
  ##   R = P + Q
+  ## 
+  ## TODO: ⚠️ cannot handle P == Q
  # "madd-2007-bl" mixed addition formula - https://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#addition-madd-2007-bl
  # with conditional copies to handle infinity points
  #  Assumptions: Z2=1.
@ -516,6 +522,14 @@ func `+=`*(P: var ECP_ShortW_Jac, Q: ECP_ShortW_Jac) {.inline.} =
  ## In-place point addition
  P.sum(P, Q)

+func `+=`*(P: var ECP_ShortW_Jac, Q: ECP_ShortW_Aff) {.inline.} =
+  ## In-place mixed point addition
+  ## 
+  ## TODO: ⚠️ cannot handle P == Q
+  var t{.noInit.}: typeof(P)
+  t.madd(P, Q)
+  P = t
+
 func double*(P: var ECP_ShortW_Jac) {.inline.} =
  ## In-place point doubling
  P.double(P)
@ -528,7 +542,7 @@ func diff*(r: var ECP_ShortW_Jac,
  nQ.neg(Q)
  r.sum(P, nQ)

-func affineFromJacobian*[F; G](
+func affine*[F; G](
       aff: var ECP_ShortW_Aff[F, G],
       jac: ECP_ShortW_Jac[F, G]) =
  var invZ {.noInit.}, invZ2{.noInit.}: F
@ -536,16 +550,76 @@ func affineFromJacobian*[F; G](
  invZ2.square(invZ)

  aff.x.prod(jac.x, invZ2)
+  invZ *= invZ2
  aff.y.prod(jac.y, invZ)
-  aff.y *= invZ2

-func jacobianFromAffine*[F; G](
+func fromAffine*[F; G](
       jac: var ECP_ShortW_Jac[F, G],
       aff: ECP_ShortW_Aff[F, G]) {.inline.} =
  jac.x = aff.x
  jac.y = aff.y
  jac.z.setOne()

+func batchAffine*[N: static int, F, G](
+       affs: var array[N, ECP_ShortW_Aff[F, G]],
+       jacs: array[N, ECP_ShortW_Jac[F, G]]) =
+  # Algorithm: Montgomery's batch inversion
+  # - Speeding the Pollard and Elliptic Curve Methods of Factorization
+  #   Section 10.3.1
+  #   Peter L. Montgomery
+  #   https://www.ams.org/journals/mcom/1987-48-177/S0025-5718-1987-0866113-7/S0025-5718-1987-0866113-7.pdf
+  # - Modern Computer Arithmetic
+  #   Section 2.5.1 Several inversions at once
+  #   Richard P. Brent and Paul Zimmermann
+  #   https://members.loria.fr/PZimmermann/mca/mca-cup-0.5.9.pdf
+
+  # To avoid temporaries, we store partial accumulations
+  # in affs[i].x and whether z == 0 in affs[i].y
+  var zeroes: array[N, SecretBool]
+  affs[0].x  = jacs[0].z
+  zeroes[0] = affs[0].x.isZero()
+  affs[0].x.csetOne(zeroes[0])
+
+  for i in 1 ..< N:
+    # Skip zero z-coordinates (infinity points)
+    var z = jacs[i].z
+    zeroes[i] = z.isZero()
+    z.csetOne(zeroes[i])
+
+    affs[i].x.prod(affs[i-1].x, z)
+  
+  var accInv {.noInit.}: F
+  accInv.inv(affs[N-1].x)
+
+  for i in countdown(N-1, 1):
+    # Skip zero z-coordinates (infinity points)
+    var z = affs[i].x
+
+    # Extract 1/Pᵢ
+    var invi {.noInit.}: F
+    invi.prod(accInv, affs[i-1].x)
+    invi.csetZero(zeroes[i])
+
+    # Now convert Pᵢ to affine
+    var invi2 {.noinit.}: F
+    invi2.square(invi)
+    affs[i].x.prod(jacs[i].x, invi2)
+    invi *= invi2
+    affs[i].y.prod(jacs[i].y, invi)
+
+    # next iteration
+    invi = jacs[i].z
+    invi.csetOne(zeroes[i])
+    accInv *= invi
+
+  block: # tail
+    var invi2 {.noinit.}: F
+    accInv.csetZero(zeroes[0])
+    invi2.square(accInv)
+    affs[0].x.prod(jacs[0].x, invi2)
+    accInv *= invi2
+    affs[0].y.prod(jacs[0].y, accInv)
+
 # ############################################################
 #                                                            #
 #      Deriving efficient and complete Jacobian formulae     #
--- a/constantine/elliptic/ec_shortweierstrass_projective.nim
+++ b/constantine/elliptic/ec_shortweierstrass_projective.nim
@ -34,6 +34,10 @@ type ECP_ShortW_Prj*[F; G: static Subgroup] = object
  ## Note that projective coordinates are not unique
  x*, y*, z*: F

+template affine*[F, G](_: type ECP_ShortW_Prj[F, G]): typedesc =
+  ## Returns the affine type that corresponds to the Jacobian type input
+  ECP_ShortW_Aff[F, G]
+
 func `==`*(P, Q: ECP_ShortW_Prj): SecretBool =
  ## Constant-time equality check
  ## This is a costly operation
@ -393,7 +397,6 @@ func `+=`*(P: var ECP_ShortW_Prj, Q: ECP_ShortW_Prj) {.inline.} =

 func `+=`*(P: var ECP_ShortW_Prj, Q: ECP_ShortW_Aff) {.inline.} =
  ## In-place mixed point addition
-  # used in line_addition
  P.madd(P, Q)

 func double*(P: var ECP_ShortW_Prj) {.inline.} =
@ -409,7 +412,7 @@ func diff*(r: var ECP_ShortW_Prj,
  nQ.neg(Q)
  r.sum(P, nQ)

-func affineFromProjective*[F, G](
+func affine*[F, G](
       aff: var ECP_ShortW_Aff[F, G],
       proj: ECP_ShortW_Prj[F, G]) =
  var invZ {.noInit.}: F
@ -418,9 +421,63 @@ func affineFromProjective*[F, G](
  aff.x.prod(proj.x, invZ)
  aff.y.prod(proj.y, invZ)

-func projectiveFromAffine*[F, G](
+func fromAffine*[F, G](
       proj: var ECP_ShortW_Prj[F, G],
       aff: ECP_ShortW_Aff[F, G]) {.inline.} =
  proj.x = aff.x
  proj.y = aff.y
  proj.z.setOne()
+
+func batchAffine*[N: static int, F, G](
+       affs: var array[N, ECP_ShortW_Aff[F, G]],
+       projs: array[N, ECP_ShortW_Prj[F, G]]) =
+  # Algorithm: Montgomery's batch inversion
+  # - Speeding the Pollard and Elliptic Curve Methods of Factorization
+  #   Section 10.3.1
+  #   Peter L. Montgomery
+  #   https://www.ams.org/journals/mcom/1987-48-177/S0025-5718-1987-0866113-7/S0025-5718-1987-0866113-7.pdf
+  # - Modern Computer Arithmetic
+  #   Section 2.5.1 Several inversions at once
+  #   Richard P. Brent and Paul Zimmermann
+  #   https://members.loria.fr/PZimmermann/mca/mca-cup-0.5.9.pdf
+
+  # To avoid temporaries, we store partial accumulations
+  # in affs[i].x
+  var zeroes: array[N, SecretBool]
+  affs[0].x = projs[0].z
+  zeroes[0] = affs[0].x.isZero()
+  affs[0].x.csetOne(zeroes[0])
+
+  for i in 1 ..< N:
+    # Skip zero z-coordinates (infinity points)
+    var z = projs[i].z
+    zeroes[i] = z.isZero()
+    z.csetOne(zeroes[i])
+
+    affs[i].x.prod(affs[i-1].x, z)
+  
+  var accInv {.noInit.}: F
+  accInv.inv(affs[N-1].x)
+
+  for i in countdown(N-1, 1):
+    # Skip zero z-coordinates (infinity points)
+    var z = affs[i].x
+
+    # Extract 1/Pᵢ
+    var invi {.noInit.}: F
+    invi.prod(accInv, affs[i-1].x)
+    invi.csetZero(zeroes[i])
+
+    # Now convert Pᵢ to affine
+    affs[i].x.prod(projs[i].x, invi)
+    affs[i].y.prod(projs[i].y, invi)
+
+    # next iteration
+    invi = projs[i].z
+    invi.csetOne(zeroes[i])
+    accInv *= invi
+  
+  block: # tail
+    accInv.csetZero(zeroes[0])
+    affs[0].x.prod(projs[0].x, accInv)
+    affs[0].y.prod(projs[0].y, accInv)
--- a/constantine/elliptic/ec_twistededwards_projective.nim
+++ b/constantine/elliptic/ec_twistededwards_projective.nim
@ -289,7 +289,7 @@ func diff*(r: var ECP_TwEdwards_Prj,
  nQ.neg(Q)
  r.sum(P, nQ)

-func affineFromProjective*[F](
+func affine*[F](
       aff: var ECP_TwEdwards_Prj[F],
       proj: ECP_TwEdwards_Prj[F]) =
  var invZ {.noInit.}: F
@ -298,7 +298,7 @@ func affineFromProjective*[F](
  aff.x.prod(proj.x, invZ)
  aff.y.prod(proj.y, invZ)

-func projectiveFromAffine*[F](
+func projective*[F](
       proj: var ECP_TwEdwards_Prj[F],
       aff: ECP_TwEdwards_Prj[F]) {.inline.} =
  proj.x = aff.x
--- a/constantine/io/io_ec.nim
+++ b/constantine/io/io_ec.nim
@ -10,6 +10,7 @@ import
  ./io_bigints, ./io_fields, ./io_towers,
  ../arithmetic,
  ../towers,
+  ../primitives,
  ../elliptic/[
    ec_shortweierstrass_affine,
    ec_shortweierstrass_projective,
@ -26,7 +27,12 @@ import
 #
 # ############################################################

-func toHex*[EC: ECP_ShortW_Prj or ECP_ShortW_Jac or ECP_ShortW_Aff](P: EC): string =
+proc spaces(num: int): string =
+  result = newString(num)
+  for i in 0 ..< num:
+    result[i] = ' '
+
+func toHex*[EC: ECP_ShortW_Prj or ECP_ShortW_Jac or ECP_ShortW_Aff](P: EC, indent: static int = 0): string =
  ## Stringify an elliptic curve point to Hex
  ## Note. Leading zeros are not removed.
  ## Result is prefixed with 0x
@ -39,29 +45,31 @@ func toHex*[EC: ECP_ShortW_Prj or ECP_ShortW_Jac or ECP_ShortW_Aff](P: EC): stri
  ## This proc output may change format in the future

  var aff {.noInit.}: ECP_ShortW_Aff[EC.F, EC.G]
-  when EC is ECP_ShortW_Prj:
-    aff.affineFromProjective(P)
-  elif EC is ECP_ShortW_Jac:
-    aff.affineFromJacobian(P)
+  when EC isnot ECP_ShortW_Aff:
+    aff.affine(P)
  else:
    aff = P

-  result = "ECP[" & $aff.F & "](\n  x: "
+  const sp = spaces(indent)
+
+  result = sp & $EC & "(\n" & sp & "  x: "
  result.appendHex(aff.x, bigEndian)
-  result &= ",\n  y: "
+  result &= ",\n" & sp & "  y: "
  result.appendHex(aff.y, bigEndian)
-  result &= "\n)"
+  result &= "\n" & sp & ")"

 func fromHex*(dst: var (ECP_ShortW_Prj or ECP_ShortW_Jac), x, y: string): bool {.raises: [ValueError].}=
  ## Convert hex strings to a G1 curve point
-  ## Returns `false`
-  ## if there is no point with coordinates (`x`, `y`) on the curve
+  ## Returns true if point exist or if input is the point at infinity (all 0)
+  ## Returns `false` if there is no point with coordinates (`x`, `y`) on the curve
  ## In that case, `dst` content is undefined.
  static: doAssert dst.F is Fp, "dst must be on G1, an elliptic curve over 𝔽p"
  dst.x.fromHex(x)
  dst.y.fromHex(y)
  dst.z.setOne()
-  return bool(isOnCurve(dst.x, dst.y, dst.G))
+  let isInf = dst.x.isZero() and dst.y.isZero()
+  dst.z.csetZero(isInf)
+  return bool(isOnCurve(dst.x, dst.y, dst.G) or isInf)

 func fromHex*(dst: var (ECP_ShortW_Prj or ECP_ShortW_Jac), x0, x1, y0, y1: string): bool {.raises: [ValueError].}=
  ## Convert hex strings to a G2 curve point
@ -72,24 +80,34 @@ func fromHex*(dst: var (ECP_ShortW_Prj or ECP_ShortW_Jac), x0, x1, y0, y1: strin
  dst.x.fromHex(x0, x1)
  dst.y.fromHex(y0, y1)
  dst.z.setOne()
-  return bool(isOnCurve(dst.x, dst.y, dst.G))
+  let isInf = dst.x.isZero() and dst.y.isZero()
+  dst.z.csetZero(isInf)
+  return bool(isOnCurve(dst.x, dst.y, dst.G) or isInf)

 func fromHex*(dst: var ECP_ShortW_Aff, x, y: string): bool {.raises: [ValueError].}=
  ## Convert hex strings to a G1 curve point
-  ## Returns `false`
-  ## if there is no point with coordinates (`x`, `y`) on the curve
+  ## Returns true if point exist or if input is the point at infinity (all 0)
+  ## Returns `false` if there is no point with coordinates (`x`, `y`) on the curve
  ## In that case, `dst` content is undefined.
  static: doAssert dst.F is Fp, "dst must be on G1, an elliptic curve over 𝔽p"
  dst.x.fromHex(x)
  dst.y.fromHex(y)
-  return bool(isOnCurve(dst.x, dst.y, dst.G))
+  return bool(isOnCurve(dst.x, dst.y, dst.G) or dst.isInf())

 func fromHex*(dst: var ECP_ShortW_Aff, x0, x1, y0, y1: string): bool {.raises: [ValueError].}=
  ## Convert hex strings to a G2 curve point
-  ## Returns `false`
-  ## if there is no point with coordinates (`x`, `y`) on the curve
+  ## Returns true if point exist or if input is the point at infinity (all 0)
+  ## Returns `false` if there is no point with coordinates (`x`, `y`) on the curve
  ## In that case, `dst` content is undefined.
  static: doAssert dst.F is Fp2, "dst must be on G2, an elliptic curve over 𝔽p2"
  dst.x.fromHex(x0, x1)
  dst.y.fromHex(y0, y1)
-  return bool(isOnCurve(dst.x, dst.y, dst.G))
+  return bool(isOnCurve(dst.x, dst.y, dst.G) or dst.isInf())
+
+func fromHex*[EC: ECP_ShortW_Prj or ECP_ShortW_Jac or ECP_ShortW_Aff](
+       _: type EC, x, y: string): EC {.raises: [ValueError].} =
+  doAssert result.fromHex(x, y)
+
+func fromHex*[EC: ECP_ShortW_Prj or ECP_ShortW_Jac or ECP_ShortW_Aff](
+       _: type EC, x0, x1, y0, y1: string): EC {.raises: [ValueError].} =
+  doAssert result.fromHex(x0, x1, y0, y1)
--- a/constantine/pairing/miller_loops.nim
+++ b/constantine/pairing/miller_loops.nim
@ -141,7 +141,7 @@ func miller_init_double_then_add*[FT, F1, F2](

  # First step: 0b10, T <- Q, f = 1 (mod p¹²), f *= line
  # ----------------------------------------------------
-  T.projectiveFromAffine(Q)
+  T.fromAffine(Q)

  # f.square() -> square(1)
  line.line_double(T, P)
@ -292,7 +292,7 @@ func miller_init_double_then_add*[N: static int, FT, F1, F2](
  # First step: T <- Q, f = 1 (mod p¹²), f *= line
  # ----------------------------------------------
  for i in 0 ..< N:
-    Ts[i].projectiveFromAffine(Qs[i])
+    Ts[i].fromAffine(Qs[i])

  line0.line_double(Ts[0], Ps[0])
  when N >= 2:
--- a/constantine/pairing/pairing_bls12.nim
+++ b/constantine/pairing/pairing_bls12.nim
@ -63,7 +63,7 @@ func millerLoopGenericBLS12*[C](
    line {.noInit.}: Line[Fp2[C]]
    nQ{.noInit.}: typeof(Q)

-  T.projectiveFromAffine(Q)
+  T.fromAffine(Q)
  nQ.neg(Q)

  basicMillerLoop(
--- a/constantine/pairing/pairing_bn.nim
+++ b/constantine/pairing/pairing_bn.nim
@ -61,7 +61,7 @@ func millerLoopGenericBN*[C](
    line {.noInit.}: Line[Fp2[C]]
    nQ{.noInit.}: typeof(Q)

-  T.projectiveFromAffine(Q)
+  T.fromAffine(Q)
  nQ.neg(Q)

  basicMillerLoop(
--- a/constantine/pairing/pairing_bw6_761.nim
+++ b/constantine/pairing/pairing_bw6_761.nim
@ -45,7 +45,7 @@ func millerLoopBW6_761_naive[C](
    line {.noInit.}: Line[Fp[C]]
    nQ{.noInit.}: typeof(Q)

-  T.projectiveFromAffine(Q)
+  T.fromAffine(Q)
  nQ.neg(Q)

  basicMillerLoop(
@ -55,7 +55,7 @@ func millerLoopBW6_761_naive[C](
  )

  var f2 {.noInit.}: typeof(f)
-  T.projectiveFromAffine(Q)
+  T.fromAffine(Q)

  basicMillerLoop(
    f2, T, line,
@ -93,7 +93,7 @@ func millerLoopBW6_761_opt_to_debug[C](
    T {.noInit.}: ECP_ShortW_Prj[Fp[C], G2]
    line {.noInit.}: Line[Fp[C]]

-  T.projectiveFromAffine(Q)
+  T.fromAffine(Q)
  f.setOne()

  template u: untyped = pairing(C, ate_param_1_opt)
@ -114,7 +114,7 @@ func millerLoopBW6_761_opt_to_debug[C](

  mu = f
  minvu.inv(f)
-  Qu.affineFromProjective(T)
+  Qu.affine(T)
  nQu.neg(Qu)

  # Drop the vertical line
@ -125,7 +125,7 @@ func millerLoopBW6_761_opt_to_debug[C](
  # 2nd part: f_{u²-u-1,Q}(P)
  # ------------------------------
  # We restart from `f` and `T`
-  T.projectiveFromAffine(Qu)
+  T.fromAffine(Qu)

  template u: untyped = pairing(C, ate_param_2_opt)
  var u3 = pairing(C, ate_param_2_opt)
--- a/constantine/primitives/bithacks.nim
+++ b/constantine/primitives/bithacks.nim
@ -115,4 +115,4 @@ func isPowerOf2_vartime*(n: SomeUnsignedInt): bool {.inline.} =
 func nextPowerOf2_vartime*(n: uint64): uint64 {.inline.} =
  ## Returns x if x is a power of 2
  ## or the next biggest power of 2
-  1'u64 shl (log2_vartime(n-1) + 1)
+  1'u64 shl (log2_vartime(n-1) + 1)
--- a/constantine/primitives/constant_time.nim
+++ b/constantine/primitives/constant_time.nim
@ -124,7 +124,7 @@ template `-`*[T: Ct](x: T): T =
 #
 # ############################################################

-template isMsbSet[T: Ct](x: T): CTBool[T] =
+template isMsbSet*[T: Ct](x: T): CTBool[T] =
  ## Returns the most significant bit of an integer
  const msb_pos = T.sizeof * 8 - 1
  (CTBool[T])(x shr msb_pos)
@ -163,6 +163,16 @@ template `<=`*[T: Ct](x, y: T): CTBool[T] =
 template `xor`*[T: Ct](x, y: CTBool[T]): CTBool[T] =
  CTBool[T](noteq(T(x), T(y)))

+# ############################################################
+#
+#                       Conditionals
+#
+# ############################################################
+
+template cneg*[T: Ct](x: T, ctl: CTBool[T]): T =
+  # Conditional negate if ctl is true
+  (x xor -T(ctl)) + T(ctl) 
+
 # ############################################################
 #
 #         Workaround system.nim `!=` template
--- a/constantine/primitives/constant_time_types.nim
+++ b/constantine/primitives/constant_time_types.nim
@ -39,3 +39,4 @@ const X86* = defined(amd64) or defined(i386)
 when sizeof(int) == 8 and GCC_Compatible:
  type
    uint128*{.importc: "unsigned __int128".} = object
+    int128*{.importc: "__int128".} = object
--- a/constantine/primitives/extended_precision.nim
+++ b/constantine/primitives/extended_precision.nim
@ -75,6 +75,19 @@ func muladd2*(hi, lo: var Ct[uint32], a, b, c1, c2: Ct[uint32]) {.inline.}=
  lo = (Ct[uint32])(dblPrec)
  hi = (Ct[uint32])(dblPrec shr 32)

+func smul*(hi, lo: var Ct[uint32], a, b: Ct[uint32]) {.inline.} =
+  ## Signed extended precision multiplication
+  ## (hi, lo) <- a*b
+  ##
+  ## Inputs are intentionally unsigned
+  ## as we use their unchecked raw representation for cryptography
+  ##
+  ## This is constant-time on most hardware
+  ## See: https://www.bearssl.org/ctmul.html
+  let dblPrec = int64(cast[int32](a)) * int64(cast[int32](b)) # Sign-extension via cast+conversion to wider int
+  lo = cast[Ct[uint32]](dblPrec)
+  hi = cast[Ct[uint32]](dblPrec shr 32)
+
 # ############################################################
 #
 #                     64-bit words
@ -83,15 +96,15 @@ func muladd2*(hi, lo: var Ct[uint32], a, b, c1, c2: Ct[uint32]) {.inline.}=

 when sizeof(int) == 8:
  when defined(vcc):
-    from ./extended_precision_x86_64_msvc import unsafeDiv2n1n, mul, muladd1, muladd2
+    from ./extended_precision_x86_64_msvc import unsafeDiv2n1n, mul, muladd1, muladd2, smul
  elif GCCCompatible:
    # TODO: constant-time div2n1n
    when X86:
      from ./extended_precision_x86_64_gcc import unsafeDiv2n1n
-      from ./extended_precision_64bit_uint128 import mul, muladd1, muladd2
+      from ./extended_precision_64bit_uint128 import mul, muladd1, muladd2, smul
    else:
-      from ./extended_precision_64bit_uint128 import unsafeDiv2n1n, mul, muladd1, muladd2
-  export unsafeDiv2n1n, mul, muladd1, muladd2
+      from ./extended_precision_64bit_uint128 import unsafeDiv2n1n, mul, muladd1, muladd2, smul
+  export unsafeDiv2n1n, mul, muladd1, muladd2, smul

 # ############################################################
 #
--- a/constantine/primitives/extended_precision_64bit_uint128.nim
+++ b/constantine/primitives/extended_precision_64bit_uint128.nim
@ -97,3 +97,25 @@ func muladd2*(hi, lo: var Ct[uint64], a, b, c1, c2: Ct[uint64]) {.inline.}=
    else:
      {.emit:["*",hi, " = (NU64)(", dblPrec," >> ", 64'u64, ");"].}
      {.emit:["*",lo, " = (NU64)", dblPrec,";"].}
+
+func smul*(hi, lo: var Ct[uint64], a, b: Ct[uint64]) {.inline.} =
+  ## Extended precision multiplication
+  ## (hi, lo) <- a*b
+  ##
+  ## Inputs are intentionally unsigned
+  ## as we use their unchecked raw representation for cryptography
+  ## 
+  ## This is constant-time on most hardware
+  ## See: https://www.bearssl.org/ctmul.html
+  block:
+    var dblPrec {.noInit.}: int128
+    # We need to cast to int64 then sign-extended to int128
+    {.emit:[dblPrec, " = (__int128)", cast[int64](a)," * (__int128)", cast[int64](b),";"].}
+
+    # Don't forget to dereference the var param in C mode
+    when defined(cpp):
+      {.emit:[hi, " = (NU64)(", dblPrec," >> ", 64'u64, ");"].}
+      {.emit:[lo, " = (NU64)", dblPrec,";"].}
+    else:
+      {.emit:["*",hi, " = (NU64)(", dblPrec," >> ", 64'u64, ");"].}
+      {.emit:["*",lo, " = (NU64)", dblPrec,";"].}
--- a/constantine/primitives/extended_precision_x86_64_msvc.nim
+++ b/constantine/primitives/extended_precision_x86_64_msvc.nim
@ -29,6 +29,7 @@ func udiv128(highDividend, lowDividend, divisor: Ct[uint64], remainder: var Ct[u
  ##   - if n_hi > d result is undefined

 func umul128(a, b: Ct[uint64], hi: var Ct[uint64]): Ct[uint64] {.importc:"_umul128", header:"<intrin.h>", nodecl.}
+  ## Unsigned extended precision multiplication
  ## (hi, lo) <-- a * b
  ## Return value is the low word

@ -84,3 +85,22 @@ func muladd2*(hi, lo: var Ct[uint64], a, b, c1, c2: Ct[uint64]) {.inline.}=
  # Carry chain 2
  addC(carry2, lo, lo, c2, Carry(0))
  addC(carry2, hi, hi, 0, carry2)
+
+func smul128(a, b: Ct[uint64], hi: var Ct[uint64]): Ct[uint64] {.importc:"_mul128", header:"<intrin.h>", nodecl.}
+  ## Signed extended precision multiplication
+  ## (hi, lo) <-- a * b
+  ## Return value is the low word
+  ##
+  ## Inputs are intentionally unsigned
+  ## as we use their unchecked raw representation for cryptography
+
+func smul*(hi, lo: var Ct[uint64], a, b: Ct[uint64]) {.inline.} =
+  ## Extended precision multiplication
+  ## (hi, lo) <- a*b
+  ##
+  ## Inputs are intentionally unsigned
+  ## as we use their unchecked raw representation for cryptography
+  ## 
+  ## This is constant-time on most hardware
+  ## See: https://www.bearssl.org/ctmul.html
+  lo = smul128(a, b, hi)
--- a/constantine/protocols/ethereum_evm_precompiles.nim
+++ b/constantine/protocols/ethereum_evm_precompiles.nim
@ -133,7 +133,7 @@ func eth_evm_ecadd*(

  R.sum(P, Q)
  var aff{.noInit.}: ECP_ShortW_Aff[Fp[BN254_Snarks], G1]
-  aff.affineFromProjective(R)
+  aff.affine(R)

  r.toOpenArray(0, 31).exportRawUint(
    aff.x, bigEndian
@ -207,7 +207,7 @@ func eth_evm_ecmul*(
    P.scalarMul(s)

  var aff{.noInit.}: ECP_ShortW_Aff[Fp[BN254_Snarks], G1]
-  aff.affineFromProjective(P)
+  aff.affine(P)

  r.toOpenArray(0, 31).exportRawUint(
    aff.x, bigEndian
@ -221,7 +221,7 @@ func subgroupCheck(P: ECP_ShortW_Aff[Fp2[BN254_Snarks], G2]): bool =
  ## that point may not be in the correct cyclic subgroup.
  ## If we are on the subgroup of order r then [r]P = 0
  var Q{.noInit.}: ECP_ShortW_Prj[Fp2[BN254_Snarks], G2]
-  Q.projectiveFromAffine(P)
+  Q.fromAffine(P)
  return bool(Q.isInSubgroup())

 func fromRawCoords(
--- a/constantine/tower_field_extensions/extension_fields.nim
+++ b/constantine/tower_field_extensions/extension_fields.nim
@ -219,6 +219,17 @@ func conj*(r: var CubicExt, a: CubicExt) =
 # Conditional arithmetic
 # -------------------------------------------------------------------

+func csetZero*(a: var ExtensionField, ctl: SecretBool) =
+  ## Set ``a`` to 0 if ``ctl`` is true
+  staticFor i, 0, a.coords.len:
+    a.coords[i].csetZero(ctl)
+
+func csetOne*(a: var ExtensionField, ctl: SecretBool) =
+  ## Set ``a`` to 1 if ``ctl`` is true
+  a.coords[0].csetOne(ctl)
+  staticFor i, 1, a.coords.len:
+    a.coords[i].csetZero(ctl)
+
 func cneg*(a: var ExtensionField, ctl: SecretBool) =
  ## Constant-time in-place conditional negation
  ## Only negate if ctl is true
--- a/docs/optimizations.md
+++ b/docs/optimizations.md
@ -28,16 +28,16 @@ The optimizations can be of algebraic, algorithmic or "implementation details" n
 - Squaring
  - [x] Dedicated squaring functions
  - [x] int128
-  - [ ] loop unrolling
+  - [x] loop unrolling
  - [x] x86: Full Assembly implementation
-  - [ ] x86: MULX, ADCX, ADOX instructions
+  - [x] x86: MULX, ADCX, ADOX instructions

 ## Finite Fields & Modular Arithmetic

 - Representation
  - [x] Montgomery Representation
  - [ ] Barret Reduction
-  - [ ] Unsaturated Representation
+  - [x] Unsaturated Representation
    - [ ] Mersenne Prime (2ᵏ - 1),
    - [ ] Generalized Mersenne Prime (NIST Prime P256: 2^256 - 2^224 + 2^192 + 2^96 - 1)
    - [ ] Pseudo-Mersenne Prime (2^m - k for example Curve25519: 2^255 - 19)
@ -91,9 +91,10 @@ The optimizations can be of algebraic, algorithmic or "implementation details" n
 - Inversion (constant-time baseline, Little-Fermat inversion via a^(p-2))
  - [x] Constant-time binary GCD algorithm by Möller, algorithm 5 in https://link.springer.com/content/pdf/10.1007%2F978-3-642-40588-4_10.pdf
  - [x] Addition-chain for a^(p-2)
-  - [ ] Constant-time binary GCD algorithm by Bernstein-Yang, https://eprint.iacr.org/2019/266
+  - [x] Constant-time binary GCD algorithm by Bernstein-Yang, https://eprint.iacr.org/2019/266
  - [ ] Constant-time binary GCD algorithm by Pornin, https://eprint.iacr.org/2020/972
-  - [ ] Simultaneous inversion
+  - [x] Constant-time binary GCD algorithm by BY with half-delta optimization by libsecp256k1, formally verified, https://eprint.iacr.org/2021/549
+  - [x] Simultaneous inversion

 - Square Root (constant-time)
  - [x] baseline sqrt via Little-Fermat for `p ≡ 3 (mod 4)`
@ -193,7 +194,7 @@ The optimizations can be of algebraic, algorithmic or "implementation details" n
  - [ ] BLS curves: Ghamman, Fouotsa
  - [x] BLS curves: Hayashida, Hayasaka, Teruya

- [ ] Multi-pairing
+- [x] Multi-pairing
  - [ ] Line accumulation
  - [ ] Parallel Multi-Pairing

@ -204,7 +205,7 @@ The optimizations can be of algebraic, algorithmic or "implementation details" n
  - [ ] BLS G2: Scott et al
  - [ ] BLS G2: Fuentes-Castañeda
  - [x] BLS G2: Budroni et al, endomorphism accelerated
-  - [ ] BN G2
+  - [x] BN G2: Fuentes-Castañeda
  - [ ] BW6-761 G1
  - [ ] BW6-761 G2

--- a/research/kzg_poly_commit/fft_g1.nim
+++ b/research/kzg_poly_commit/fft_g1.nim
@ -222,7 +222,7 @@ when isMainModule:
  proc roundtrip() =
    let fftDesc = FFTDescriptor[G1].init(maxScale = 4)
    var data = newSeq[G1](fftDesc.maxWidth)
-    data[0].projectiveFromAffine(Generator1)
+    data[0].fromAffine(Generator1)
    for i in 1 ..< fftDesc.maxWidth:
      data[i].madd(data[i-1], Generator1)

@ -273,7 +273,7 @@ when isMainModule:

      let desc = FFTDescriptor[G1].init(uint8 scale)
      var data = newSeq[G1](desc.maxWidth)
-      data[0].projectiveFromAffine(Generator1)
+      data[0].fromAffine(Generator1)
      for i in 1 ..< desc.maxWidth:
        data[i].madd(data[i-1], Generator1)

--- a/research/kzg_poly_commit/kzg_single_proofs.nim
+++ b/research/kzg_poly_commit/kzg_single_proofs.nim
@ -64,7 +64,7 @@ proc checkProofSingle(
     ): bool =
  ## Check a proof for a Kate commitment for an evaluation f(x) = y
  var xG2, g2: G2
-  g2.projectiveFromAffine(Generator2)
+  g2.fromAffine(Generator2)
  xG2 = g2
  xG2.scalarMul(x.toBig())

@ -72,7 +72,7 @@ proc checkProofSingle(
  s_minus_x.diff(kzg.secretG2[1], xG2)

  var yG1: G1
-  yG1.projectiveFromAffine(Generator1)
+  yG1.fromAffine(Generator1)
  yG1.scalarMul(y.toBig())

  var commitment_minus_y: G1
--- a/research/kzg_poly_commit/polynomials.nim
+++ b/research/kzg_poly_commit/polynomials.nim
@ -47,10 +47,10 @@ func pair_verify*(
  var P1a, P2a: G1aff
  var Q1a, Q2a: G2aff

-  P1a.affineFromProjective(P1)
-  Q1a.affineFromProjective(Q1)
-  P2a.affineFromProjective(P2)
-  Q2a.affineFromProjective(Q2)
+  P1a.affine(P1)
+  Q1a.affine(Q1)
+  P2a.affine(P2)
+  Q2a.affine(Q2)

  # To verify if e(P1, Q1) == e(P2, Q2)
  # we can do e(P1, Q1) / e(P2, Q2) == 1
--- a/sage/precompute_params.sage
+++ b/sage/precompute_params.sage
--- a/tests/t_bigints.nim
+++ b/tests/t_bigints.nim
@ -555,14 +555,10 @@ proc mainModularInverse() =
        let a = BigInt[16].fromUint(42'u16)
        let M = BigInt[16].fromUint(2017'u16)

-        var mp1div2 = M
-        discard mp1div2.add(One)
-        mp1div2.shiftRight(1)
-
        let expected = BigInt[16].fromUint(1969'u16)
        var r = canary(BigInt[16])

-        r.invmod(a, M, mp1div2)
+        r.invmod(a, M)

        check: bool(r == expected)

@ -570,14 +566,10 @@ proc mainModularInverse() =
        let a = BigInt[381].fromUint(42'u16)
        let M = BigInt[381].fromUint(2017'u16)

-        var mp1div2 = M
-        discard mp1div2.add(One)
-        mp1div2.shiftRight(1)
-
        let expected = BigInt[381].fromUint(1969'u16)
        var r = canary(BigInt[381])

-        r.invmod(a, M, mp1div2)
+        r.invmod(a, M)

        check: bool(r == expected)

@ -586,14 +578,10 @@ proc mainModularInverse() =
        let a = BigInt[16].fromUint(271'u16)
        let M = BigInt[16].fromUint(383'u16)

-        var mp1div2 = M
-        discard mp1div2.add(One)
-        mp1div2.shiftRight(1)
-
        let expected = BigInt[16].fromUint(106'u16)
        var r = canary(BigInt[16])

-        r.invmod(a, M, mp1div2)
+        r.invmod(a, M)

        check: bool(r == expected)

@ -601,14 +589,10 @@ proc mainModularInverse() =
        let a = BigInt[381].fromUint(271'u16)
        let M = BigInt[381].fromUint(383'u16)

-        var mp1div2 = M
-        discard mp1div2.add(One)
-        mp1div2.shiftRight(1)
-
        let expected = BigInt[381].fromUint(106'u16)
        var r = canary(BigInt[381])

-        r.invmod(a, M, mp1div2)
+        r.invmod(a, M)

        check: bool(r == expected)

@ -616,14 +600,10 @@ proc mainModularInverse() =
      let a = BigInt[381].fromHex("0x30644e72e131a029b85045b68181585d97816a916871ca8d3c208c16d87cfd47")
      let M = BigInt[381].fromHex("0x1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffffffffaaab")

-      var mp1div2 = M
-      discard mp1div2.add(One)
-      mp1div2.shiftRight(1)
-
      let expected = BigInt[381].fromHex("0x0636759a0f3034fa47174b2c0334902f11e9915b7bd89c6a2b3082b109abbc9837da17201f6d8286fe6203caa1b9d4c8")

      var r = canary(BigInt[381])
-      r.invmod(a, M, mp1div2)
+      r.invmod(a, M)

      check: bool(r == expected)

@ -632,14 +612,10 @@ proc mainModularInverse() =
        let a = BigInt[16].fromUint(0'u16)
        let M = BigInt[16].fromUint(2017'u16)

-        var mp1div2 = M
-        mp1div2.shiftRight(1)
-        discard mp1div2.add(One)
-
        let expected = BigInt[16].fromUint(0'u16)
        var r = canary(BigInt[16])

-        r.invmod(a, M, mp1div2)
+        r.invmod(a, M)

        check: bool(r == expected)

@ -647,14 +623,10 @@ proc mainModularInverse() =
        let a = BigInt[381].fromUint(0'u16)
        let M = BigInt[381].fromHex("0x1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffffffffaaab")

-        var mp1div2 = M
-        mp1div2.shiftRight(1)
-        discard mp1div2.add(One)
-
        let expected = BigInt[381].fromUint(0'u16)
        var r = canary(BigInt[381])

-        r.invmod(a, M, mp1div2)
+        r.invmod(a, M)

        check: bool(r == expected)

--- a/tests/t_ec_conversion.nim
+++ b/tests/t_ec_conversion.nim
@ -0,0 +1,63 @@
+# Constantine
+# Copyright (c) 2018-2019    Status Research & Development GmbH
+# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
+# Licensed and distributed under either of
+#   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
+#   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
+# at your option. This file may not be copied, modified, or distributed except according to those terms.
+
+import
+  # Internals
+  ../constantine/config/[type_ff, curves],
+  ../constantine/elliptic/[ec_shortweierstrass_jacobian, ec_shortweierstrass_projective],
+  ../constantine/towers,
+  # Test utilities
+  ./t_ec_template
+
+const
+  Iters = 8
+
+run_EC_conversion_failures(
+  moduleName = "test_ec_conversion_fuzzing_failures"
+)
+
+run_EC_affine_conversion(
+    ec = ECP_ShortW_Jac[Fp[BN254_Snarks], G1],
+    Iters = Iters,
+    moduleName = "test_ec_conversion_shortw_affine_jacobian_g1_" & $BN254_Snarks
+  )
+run_EC_affine_conversion(
+    ec = ECP_ShortW_Prj[Fp[BN254_Snarks], G1],
+    Iters = Iters,
+    moduleName = "test_ec_conversion_shortw_affine_projective_g1_" & $BN254_Snarks
+  )
+run_EC_affine_conversion(
+    ec = ECP_ShortW_Jac[Fp2[BN254_Snarks], G2],
+    Iters = Iters,
+    moduleName = "test_ec_conversion_shortw_affine_jacobian_g2_" & $BN254_Snarks
+  )
+run_EC_affine_conversion(
+    ec = ECP_ShortW_Prj[Fp2[BN254_Snarks], G2],
+    Iters = Iters,
+    moduleName = "test_ec_conversion_shortw_affine_projective_g2_" & $BN254_Snarks
+  )
+run_EC_affine_conversion(
+    ec = ECP_ShortW_Jac[Fp[BLS12_381], G1],
+    Iters = Iters,
+    moduleName = "test_ec_conversion_shortw_affine_jacobian_g1_" & $BLS12_381
+  )
+run_EC_affine_conversion(
+    ec = ECP_ShortW_Prj[Fp[BLS12_381], G1],
+    Iters = Iters,
+    moduleName = "test_ec_conversion_shortw_affine_projective_g1_" & $BLS12_381
+  )
+run_EC_affine_conversion(
+    ec = ECP_ShortW_Jac[Fp2[BLS12_381], G2],
+    Iters = Iters,
+    moduleName = "test_ec_conversion_shortw_affine_jacobian_g2_" & $BLS12_381
+  )
+run_EC_affine_conversion(
+    ec = ECP_ShortW_Prj[Fp2[BLS12_381], G2],
+    Iters = Iters,
+    moduleName = "test_ec_conversion_shortw_affine_projective_g2_" & $BLS12_381
+  )
--- a/tests/t_ec_sage_template.nim
+++ b/tests/t_ec_sage_template.nim
@ -203,12 +203,8 @@ proc run_scalar_mul_test_vs_sage*(
          reference {.noInit.}: EC
          endo {.noInit.}: EC

-        when EC is ECP_ShortW_Prj:
-          P.projectiveFromAffine(vec.vectors[i].P)
-          Q.projectiveFromAffine(vec.vectors[i].Q)
-        else:
-          P.jacobianFromAffine(vec.vectors[i].P)
-          Q.jacobianFromAffine(vec.vectors[i].Q)
+        P.fromAffine(vec.vectors[i].P)
+        Q.fromAffine(vec.vectors[i].Q)
        impl = P
        reference = P
        endo = P
--- a/tests/t_ec_template.nim
+++ b/tests/t_ec_template.nim
@ -441,10 +441,7 @@ proc run_EC_mixed_add_impl*(
          let a = rng.random_point(EC, randZ, gen)
          let b = rng.random_point(EC, randZ, gen)
          var bAff: ECP_ShortW_Aff[EC.F, EC.G]
-          when b is ECP_ShortW_Prj:
-            bAff.affineFromProjective(b)
-          else:
-            bAff.affineFromJacobian(b)
+          bAff.affine(b)

          var r_generic, r_mixed: EC

@ -535,4 +532,169 @@ proc run_EC_subgroups_cofactors_impl*(
      test(ec, bits = ec.F.C.getCurveOrderBitwidth(), randZ = true, gen = Long01Sequence)
    
      echo "    [SUCCESS] Test finished with ", inSubgroup, " points in ", G1_or_G2, " subgroup and ",
-              offSubgroup, " points on curve but not in subgroup (before cofactor clearing)"
+              offSubgroup, " points on curve but not in subgroup (before cofactor clearing)"
+
+proc run_EC_affine_conversion*(
+       ec: typedesc,
+       Iters: static int,
+       moduleName: string
+     ) =
+
+  # Random seed for reproducibility
+  var rng: RngState
+  let seed = uint32(getTime().toUnix() and (1'i64 shl 32 - 1)) # unixTime mod 2^32
+  rng.seed(seed)
+  echo "\n------------------------------------------------------\n"
+  echo moduleName, " xoshiro512** seed: ", seed
+
+  const G1_or_G2 = pairingGroup(ec)
+
+  const testSuiteDesc = "Elliptic curve in " & $ec.F.C.getEquationForm() & " form"
+
+  suite testSuiteDesc & " - " & $ec & " - [" & $WordBitwidth & "-bit mode]":
+    test "EC " & G1_or_G2 & " batchAffine is consistent with single affine conversion":
+      proc test(EC: typedesc, gen: RandomGen) =
+        const batchSize = 10
+        for _ in 0 ..< Iters:
+          var Ps: array[batchSize, EC]
+          for i in 0 ..< batchSize:
+            Ps[i] = rng.random_point(EC, randZ = true, gen)
+
+          var Qs, Rs: array[batchSize, affine(EC)]
+          for i in 0 ..< batchSize:
+            Qs[i].affine(Ps[i])
+          Rs.batchAffine(Ps)
+
+          for i in countdown(batchSize-1, 0):
+            doAssert bool(Qs[i] == Rs[i]), block:
+              var s: string
+              s &= "Mismatch on iteration " & $i
+              s &= "\nFailing batch for " & $EC & " (" & $WordBitwidth & "-bit)"
+              s &= "\n  ["
+              for i in 0 ..< batchSize:
+                s &= "\n" & Ps[i].toHex(indent = 4)
+                if i != batchSize-1: s &= ","
+              s &= "\n  ]"
+              s &= "\nFailing inversions for " & $EC & " (" & $WordBitwidth & "-bit)"
+              s &= "\n  ["
+              for i in 0 ..< batchSize:
+                s &= "\n" & Rs[i].toHex(indent = 4)
+                if i != batchSize-1: s &= ","
+              s &= "\n  ]"
+              s &= "\nExpected inversions for " & $EC & " (" & $WordBitwidth & "-bit)"
+              s &= "\n  ["
+              for i in 0 ..< batchSize:
+                s &= "\n" & Qs[i].toHex(indent = 4)
+                if i != batchSize-1: s &= ","
+              s &= "\n  ]"
+              s
+
+      test(ec, gen = Uniform)
+      test(ec, gen = HighHammingWeight)
+      test(ec, gen = Long01Sequence)
+
+proc run_EC_conversion_failures*(
+       moduleName: string
+     ) =
+
+  echo "\n------------------------------------------------------\n"
+  echo moduleName
+
+  suite moduleName & " - [" & $WordBitwidth & "-bit mode]":
+    test "EC batchAffine fuzzing failures ":
+      proc test_bn254_snarks_g1(ECP: type) =
+        type ECP_Aff = ECP_ShortW_Aff[Fp[BN254_Snarks], G1]
+
+        let Ps = [
+          ECP.fromHex(
+            x = "0x0e0a76c19a07e01fe56f246f7878652c0b39eb28f5c60b3dd43e438dc50e0d9d",
+            y = "0x04e6da44bc7f802fab3df34ce45d86857327663bc24ff574da48ee2b01a4932e"
+          ),
+          ECP.fromHex(
+            x = "0x2036a21a3d9cc09d8f5f7491fe7e4f44cffd2addf01c6ae587bee7d24f060571",
+            y = "0x2b5f1cc6f1cdb4a6dbaf3c88b9c02ccf984aecbba4830d5aeb33f940cb632d8a"
+          ),
+          ECP.fromHex(
+            x = "0x2fd314a75c6b1f82d70f2edc7b7bf6e7397bc04bc6aaa0584b9e5bbb7689082a",
+            y = "0x111b3b4a697e7a990400eb39f09a9bb559748cea6699535bd114ffb3dcc0b4d1"
+          ),
+          ECP.fromHex(
+            x = "0x0000000000000000000000000000000000000000000000000000000000000000",
+            y = "0x0000000000000000000000000000000000000000000000000000000000000000"
+          ),
+          ECP.fromHex(
+            x = "0x0e0a77c199ffdf2f686ea36f7879462c0a74eb28f5e70b3dd31d438dc58f0d9d",
+            y = "0x0b3938a732020d98793510be6aa312651a5f5369ebbbe41d7fda8fd914b7f264"
+          ),
+          ECP.fromHex(
+            x = "0x0000000007ffffffffffffff80000000000007ffe000000000ffffffffffffff",
+            y = "0x1d9db0f30e3395ee33a70674a31e2854de0665292dd545c10fb3da579d7df916"
+          ),
+          ECP.fromHex(
+            x = "0x000000000000000fffffffffe0000000000007ffffffffffffffffffffffffff",
+            y = "0x2a5c6df4d24efa9ffcf4003e35801dc202d820b59d67ecc65d57cfdf53b4bbc6"
+          ),
+          ECP.fromHex(
+            x = "0x00000000000000000003ffffffc00000000000000c000000000000003ffffffe",
+            y = "0x09f811f84207472ccd6ca00bb1ec3e6132a1c9206adc9ed768871f0005f0d358"
+          ),
+          ECP.fromHex(
+            x = "0x0e0979b99d07df30656ea36f7879462c097beb28f5c8083dd25d448dc58f0ca4",
+            y = "0x1799b22d8780c917ab1c4e15da718c243babc1c51225b5f8298aa570b5029796"
+          ),
+          ECP.fromHex(
+            x = "0x0e0a76c29a07e02f666ea36e806a462c0a78eb25f5c70b3dd35c4b8dc58f0d9c",
+            y = "0x0529cb1ad2552c7979a900ff59551d5dc1f8680c3a4f20d3b9cdcf68b69ec61c"
+          )
+        ]
+
+        let Qs = [
+          ECP_Aff.fromHex(
+            x = "0x0e0a76c19a07e01fe56f246f7878652c0b39eb28f5c60b3dd43e438dc50e0d9d",
+            y = "0x04e6da44bc7f802fab3df34ce45d86857327663bc24ff574da48ee2b01a4932e"
+          ),
+          ECP_Aff.fromHex(
+            x = "0x2036a21a3d9cc09d8f5f7491fe7e4f44cffd2addf01c6ae587bee7d24f060571",
+            y = "0x2b5f1cc6f1cdb4a6dbaf3c88b9c02ccf984aecbba4830d5aeb33f940cb632d8a"
+          ),
+          ECP_Aff.fromHex(
+            x = "0x2fd314a75c6b1f82d70f2edc7b7bf6e7397bc04bc6aaa0584b9e5bbb7689082a",
+            y = "0x111b3b4a697e7a990400eb39f09a9bb559748cea6699535bd114ffb3dcc0b4d1"
+          ),
+          ECP_Aff.fromHex(
+            x = "0x0000000000000000000000000000000000000000000000000000000000000000",
+            y = "0x0000000000000000000000000000000000000000000000000000000000000000"
+          ),
+          ECP_Aff.fromHex(
+            x = "0x0e0a77c199ffdf2f686ea36f7879462c0a74eb28f5e70b3dd31d438dc58f0d9d",
+            y = "0x0b3938a732020d98793510be6aa312651a5f5369ebbbe41d7fda8fd914b7f264"
+          ),
+          ECP_Aff.fromHex(
+            x = "0x0000000007ffffffffffffff80000000000007ffe000000000ffffffffffffff",
+            y = "0x1d9db0f30e3395ee33a70674a31e2854de0665292dd545c10fb3da579d7df916"
+          ),
+          ECP_Aff.fromHex(
+            x = "0x000000000000000fffffffffe0000000000007ffffffffffffffffffffffffff",
+            y = "0x2a5c6df4d24efa9ffcf4003e35801dc202d820b59d67ecc65d57cfdf53b4bbc6"
+          ),
+          ECP_Aff.fromHex(
+            x = "0x00000000000000000003ffffffc00000000000000c000000000000003ffffffe",
+            y = "0x09f811f84207472ccd6ca00bb1ec3e6132a1c9206adc9ed768871f0005f0d358"
+          ),
+          ECP_Aff.fromHex(
+            x = "0x0e0979b99d07df30656ea36f7879462c097beb28f5c8083dd25d448dc58f0ca4",
+            y = "0x1799b22d8780c917ab1c4e15da718c243babc1c51225b5f8298aa570b5029796"
+          ),
+          ECP_Aff.fromHex(
+            x = "0x0e0a76c29a07e02f666ea36e806a462c0a78eb25f5c70b3dd35c4b8dc58f0d9c",
+            y = "0x0529cb1ad2552c7979a900ff59551d5dc1f8680c3a4f20d3b9cdcf68b69ec61c"
+          )
+        ]
+
+        var Rs: array[10, ECP_Aff]
+        Rs.batchAffine(Ps)
+        for i in 0 ..< 10:
+          doAssert bool(Qs[i] == Rs[i])
+
+      test_bn254_snarks_g1(ECP_ShortW_Prj[Fp[BN254_Snarks], G1])
+      test_bn254_snarks_g1(ECP_ShortW_Jac[Fp[BN254_Snarks], G1])
--- a/tests/t_hash_to_curve.nim
+++ b/tests/t_hash_to_curve.nim
@ -132,7 +132,7 @@ proc run_hash_to_curve_test(
        )

        var P_ref: EC
-        P_ref.projectiveFromAffine(vec.vectors[i].P)
+        P_ref.fromAffine(vec.vectors[i].P)

        doAssert: bool(P == P_ref)

--- a/tests/t_io_unsaturated.nim
+++ b/tests/t_io_unsaturated.nim
@ -0,0 +1,89 @@
+# Constantine
+# Copyright (c) 2018-2019    Status Research & Development GmbH
+# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
+# Licensed and distributed under either of
+#   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
+#   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
+# at your option. This file may not be copied, modified, or distributed except according to those terms.
+
+import  std/[unittest,times],
+        ../constantine/config/[common, curves],
+        ../constantine/arithmetic,
+        ../constantine/arithmetic/limbs_unsaturated,
+        ../constantine/io/io_bigints,
+        ../helpers/prng_unsafe
+
+# Random seed for reproducibility
+var rng: RngState
+let seed = uint32(getTime().toUnix() and (1'i64 shl 32 - 1)) # unixTime mod 2^32
+rng.seed(seed)
+echo "\n------------------------------------------------------\n"
+echo "test_io_unsaturated xoshiro512** seed: ", seed
+
+type
+  RandomGen = enum
+    Uniform
+    HighHammingWeight
+    Long01Sequence
+
+func random_bigint*(rng: var RngState, curve: static Curve, gen: static RandomGen): auto =
+  when gen == Uniform:
+    rng.random_unsafe(matchingBigInt(curve))
+  elif gen == HighHammingWeight:
+    rng.random_highHammingWeight(matchingBigInt(curve))
+  else:
+    rng.random_long01Seq(matchingBigInt(curve))
+
+# debug
+import std/strutils
+
+proc testRoundtrip(curve: static Curve, gen: static RandomGen) =
+  const bits = curve.getCurveBitwidth()
+  const Excess = 2
+  const UnsatBitwidth = WordBitWidth - Excess
+  const N = (bits + UnsatBitwidth-1) div UnsatBitwidth
+
+  let a = rng.random_bigint(curve, gen)
+  var u: LimbsUnsaturated[N, Excess]
+  var b: typeof(a)
+
+  u.fromPackedRepr(a.limbs)
+  b.limbs.fromUnsatRepr(u)
+
+  doAssert bool(a == b), block:
+    "\n  a: " & a.toHex() &
+    "\n  b: " & b.toHex()
+
+proc main() =
+  suite "Packed <-> Unsaturated limbs roundtrips" & " [" & $WordBitwidth & "-bit mode]":
+    const Iters = 10000
+    test "BN254_Snarks":
+      for _ in 0 ..< Iters:
+        testRoundtrip(BN254_Snarks, Uniform)
+      for _ in 0 ..< Iters:
+        testRoundtrip(BN254_Snarks, HighHammingWeight)
+      for _ in 0 ..< Iters:
+        testRoundtrip(BN254_Snarks, Long01Sequence)
+    test "Curve25519":
+      for _ in 0 ..< Iters:
+        testRoundtrip(Curve25519, Uniform)
+      for _ in 0 ..< Iters:
+        testRoundtrip(Curve25519, HighHammingWeight)
+      for _ in 0 ..< Iters:
+        testRoundtrip(Curve25519, Long01Sequence)
+    test "secp256k1":
+      for _ in 0 ..< Iters:
+        testRoundtrip(Secp256k1, Uniform)
+      for _ in 0 ..< Iters:
+        testRoundtrip(Secp256k1, HighHammingWeight)
+      for _ in 0 ..< Iters:
+        testRoundtrip(Secp256k1, Long01Sequence)
+    test "BLS12-381":
+      for _ in 0 ..< Iters:
+        testRoundtrip(BLS12_381, Uniform)
+      for _ in 0 ..< Iters:
+        testRoundtrip(BLS12_381, HighHammingWeight)
+      for _ in 0 ..< Iters:
+        testRoundtrip(BLS12_381, Long01Sequence)
+
+main()
--- a/tests/t_pairing_bls12_377_line_functions.nim
+++ b/tests/t_pairing_bls12_377_line_functions.nim
@ -100,7 +100,7 @@ suite "Pairing - Line Functions on BLS12-377" & " [" & $WordBitwidth & "-bit mod
        TQ.sum(T, Q)

        var Qaff{.noInit.}: ECP_ShortW_Aff[Fp2[C], G2]
-        Qaff.affineFromProjective(Q)
+        Qaff.affine(Q)
        l.line_add(T, Qaff, P)

        doAssert: bool(T == TQ)
--- a/tests/t_pairing_bls12_381_line_functions.nim
+++ b/tests/t_pairing_bls12_381_line_functions.nim
@ -100,7 +100,7 @@ suite "Pairing - Line Functions on BLS12-381" & " [" & $WordBitwidth & "-bit mod
        TQ.sum(T, Q)

        var Qaff{.noInit.}: ECP_ShortW_Aff[Fp2[C], G2]
-        Qaff.affineFromProjective(Q)
+        Qaff.affine(Q)
        l.line_add(T, Qaff, P)

        doAssert: bool(T == TQ)
--- a/tests/t_pairing_template.nim
+++ b/tests/t_pairing_template.nim
@ -43,9 +43,9 @@ func clearCofactor[F; G: static Subgroup](
       ec: var ECP_ShortW_Aff[F, G]) =
  # For now we don't have any affine operation defined
  var t {.noInit.}: ECP_ShortW_Prj[F, G]
-  t.projectiveFromAffine(ec)
+  t.fromAffine(ec)
  t.clearCofactor()
-  ec.affineFromProjective(t)
+  ec.affine(t)

 func random_point*(rng: var RngState, EC: typedesc, randZ: bool, gen: RandomGen): EC {.noInit.} =
  if gen == Uniform:
@ -82,10 +82,10 @@ template runPairingTests*(Iters: static int, C: static Curve, G1, G2, GT: typede
      var Pa {.noInit.}, Pa2 {.noInit.}: affineType(P)
      var Qa {.noInit.}, Qa2 {.noInit.}: affineType(Q)

-      Pa.affineFromProjective(P)
-      Pa2.affineFromProjective(P2)
-      Qa.affineFromProjective(Q)
-      Qa2.affineFromProjective(Q2)
+      Pa.affine(P)
+      Pa2.affine(P2)
+      Qa.affine(Q)
+      Qa2.affine(Q2)

      r.pairing_fn(Pa, Qa)
      r.square()
--- a/tests/t_sig_bls_lowlevel.nim
+++ b/tests/t_sig_bls_lowlevel.nim
@ -85,9 +85,9 @@ func publicKeyG1(
       seckey: Fr[BLS12_381]
     ) =
  var t: ECP_ShortW_Prj[Fp[BLS12_381], G1]
-  t.projectiveFromAffine(BLS12_381_G1_generator)
+  t.fromAffine(BLS12_381_G1_generator)
  t.scalarMul(seckey.toBig())
-  pubkey.affineFromprojective(t)
+  pubkey.affine(t)
  doAssert not bool pubkey.isInf()

 func signG2[T: byte|char](
@ -105,7 +105,7 @@ func signG2[T: byte|char](
    domainSepTag = DomainSepTag
  )
  t.scalarMul(secretKey.toBig())
-  signature.affineFromprojective(t)
+  signature.affine(t)
  doAssert not bool signature.isInf()

 func verifyG2[T: byte|char](
@ -125,7 +125,7 @@ func verifyG2[T: byte|char](
    message = message,
    domainSepTag = DomainSepTag
  )
-  Q.affineFromprojective(Qprj)
+  Q.affine(Qprj)

  var e0{.noInit.}, e1{.noInit.}: Fp12[BLS12_381]
  e0.pairing_bls12(pubkey, Q)
@ -154,7 +154,7 @@ func verifyG2_multi[T: byte|char](
  var G1s: array[2, ECP_ShortW_Aff[Fp[BLS12_381], G1]]

  G1s[0] = pubkey
-  G2s[0].affineFromprojective(Qprj)
+  G2s[0].affine(Qprj)

  G1s[1].neg(BLS12_381_G1_generator)
  G2s[1] = signature