Square Root & Inversion addition chains - 20% perf increase (#132)

* Addition chain for sqrt BLS12-381: 20% perf improvement * sqrt addchain for BN254_Snarks - 20% perf improvement as well * Fix operation count [skip ci] * BLS12-377 sqrt - 10% perf improvement * sqrt addition chain for BW6-761 - 6% speedup * BN254_Nogami inversion addchain * sqrt addchain for BN254_Nogami * Inversion addchain for BLS12-377 * inversion addition chain for BW6-761
2021-01-23 20:55:40 +01:00 · 2021-01-23 20:55:40 +01:00 · 82819b1b10
parent a02dd19d36
commit 82819b1b10
19 changed files with 1988 additions and 97 deletions
--- a/benchmarks/bench_fields_template.nim
+++ b/benchmarks/bench_fields_template.nim
@ -22,15 +22,15 @@ import
  ./bench_blueprint
 export notes
-proc separator*() = separator(145)
+proc separator*() = separator(165)
 proc report(op, field: string, start, stop: MonoTime, startClk, stopClk: int64, iters: int) =
  let ns = inNanoseconds((stop-start) div iters)
  let throughput = 1e9 / float64(ns)
  when SupportsGetTicks:
-    echo &"{op:<50} {field:<18} {throughput:>15.3f} ops/s     {ns:>9} ns/op     {(stopClk - startClk) div iters:>9} CPU cycles (approx)"
+    echo &"{op:<70} {field:<18} {throughput:>15.3f} ops/s     {ns:>9} ns/op     {(stopClk - startClk) div iters:>9} CPU cycles (approx)"
  else:
-    echo &"{op:<50} {field:<18} {throughput:>15.3f} ops/s     {ns:>9} ns/op"
+    echo &"{op:<70} {field:<18} {throughput:>15.3f} ops/s     {ns:>9} ns/op"
 macro fixFieldDisplay(T: typedesc): untyped =
  # At compile-time, enums are integers and their display is buggy
@ -93,20 +93,20 @@ proc invBench*(T: typedesc, iters: int) =
  var r: T
  let x = rng.random_unsafe(T)
  preventOptimAway(r)
-  bench("Inversion (constant-time default method)", T, iters):
+  bench("Inversion (constant-time default impl)", T, iters):
    r.inv(x)
 proc invEuclidBench*(T: typedesc, iters: int) =
  var r: T
  let x = rng.random_unsafe(T)
  preventOptimAway(r)
-  bench("Inversion via constant-time Euclid", T, iters):
+  bench("Inversion (constant-time Euclid)", T, iters):
    r.inv_euclid(x)
 proc invPowFermatBench*(T: typedesc, iters: int) =
  let x = rng.random_unsafe(T)
  const exponent = T.getInvModExponent()
-  bench("Inversion via exponentiation p-2 (Little Fermat)", T, iters):
+  bench("Inversion (exponentiation p-2, Little Fermat)", T, iters):
    var r = x
    r.powUnsafeExponent(exponent)
@ -114,15 +114,39 @@ proc invAddChainBench*(T: typedesc, iters: int) =
  var r: T
  let x = rng.random_unsafe(T)
  preventOptimAway(r)
-  bench("Inversion via addition chain", T, iters):
+  bench("Inversion (addition chain)", T, iters):
    r.inv_addchain(x)
 proc sqrtBench*(T: typedesc, iters: int) =
  let x = rng.random_unsafe(T)
-  bench("Square Root + square check (constant-time)", T, iters):
+  bench("Square Root + isSquare (constant-time default impl)", T, iters):
    var r = x
    discard r.sqrt_if_square()
 proc sqrtP3mod4Bench*(T: typedesc, iters: int) =
  ## Benchmark ``sqrt_if_square_p3mod4`` on a random element of ``T``.
  ##
  ## ``iters`` is forwarded to ``bench`` together with the report label.
  ## NOTE(review): only meaningful for fields whose modulus is ≡ 3 (mod 4);
  ## the caller is expected to guard on that — confirm at the call site.
  let x = rng.random_unsafe(T)
  bench("SquareRoot + isSquare (p ≡ 3 (mod 4) exponentiation)", T, iters):
    # Work on a copy so every run starts from the same input
    var r = x
    discard r.sqrt_if_square_p3mod4()
 proc sqrtAddChainBench*(T: typedesc, iters: int) =
  ## Benchmark ``sqrt_if_square_addchain`` on a random element of ``T``.
  ##
  ## ``iters`` is forwarded to ``bench`` together with the report label.
  ## NOTE(review): requires a curve-specific addition chain to exist for ``T``;
  ## the caller is expected to guard on that — confirm at the call site.
  let x = rng.random_unsafe(T)
  bench("SquareRoot + isSquare (addition chain)", T, iters):
    # Work on a copy so every run starts from the same input
    var r = x
    discard r.sqrt_if_square_addchain()
 proc sqrtTonelliBench*(T: typedesc, iters: int) =
  ## Benchmark ``sqrt_if_square_tonelli_shanks`` on a random element of ``T``
  ## with ``useAddChain = false``.
  ##
  ## ``iters`` is forwarded to ``bench`` together with the report label.
  let x = rng.random_unsafe(T)
  bench("SquareRoot + isSquare (constant-time Tonelli-Shanks exponentiation)", T, iters):
    # Work on a copy so every run starts from the same input
    var r = x
    discard r.sqrt_if_square_tonelli_shanks(useAddChain = false)
 proc sqrtTonelliAddChainBench*(T: typedesc, iters: int) =
  ## Benchmark ``sqrt_if_square_tonelli_shanks`` on a random element of ``T``
  ## with ``useAddChain = true``.
  ##
  ## ``iters`` is forwarded to ``bench`` together with the report label.
  let x = rng.random_unsafe(T)
  bench("SquareRoot + isSquare (constant-time Tonelli-Shanks addchain)", T, iters):
    # Work on a copy so every run starts from the same input
    var r = x
    discard r.sqrt_if_square_tonelli_shanks(useAddChain = true)
 proc powBench*(T: typedesc, iters: int) =
  let x = rng.random_unsafe(T)
  let exponent = rng.random_unsafe(BigInt[T.C.getCurveOrderBitwidth()])
--- a/benchmarks/bench_fp.nim
+++ b/benchmarks/bench_fp.nim
@ -8,9 +8,10 @@
 import
  # Internals
-  ../constantine/config/curves,
+  ../constantine/config/[curves, common],
  ../constantine/arithmetic,
  ../constantine/io/io_bigints,
  ../constantine/curves/[zoo_inversions, zoo_square_roots],
  # Helpers
  ../helpers/static_for,
  ./bench_fields_template,
@ -24,8 +25,8 @@ import
 # ############################################################
-const Iters = 1_000_000
+const Iters = 100_000
-const ExponentIters = 1000
+const ExponentIters = 100
 const AvailableCurves = [
  # P224,
  BN254_Nogami,
@ -35,6 +36,7 @@ const AvailableCurves = [
  # Secp256k1,
  BLS12_377,
  BLS12_381,
  BW6_761
 ]
 proc main() =
@ -50,9 +52,15 @@ proc main() =
    sqrBench(Fp[curve], Iters)
    invEuclidBench(Fp[curve], ExponentIters)
    invPowFermatBench(Fp[curve], ExponentIters)
-    when curve in {BN254_Snarks, BLS12_381}:
+    when curve.hasInversionAddchain():
      invAddChainBench(Fp[curve], ExponentIters)
-    sqrtBench(Fp[curve], ExponentIters)
+    when (BaseType(curve.Mod.limbs[0]) and 3) == 3:
      sqrtP3mod4Bench(Fp[curve], ExponentIters)
    when curve.hasSqrtAddchain():
      sqrtAddChainBench(Fp[curve], ExponentIters)
    when curve in {BLS12_377}:
      sqrtTonelliBench(Fp[curve], ExponentIters)
      sqrtTonelliAddChainBench(Fp[curve], ExponentIters)
    # Exponentiation by a "secret" of size ~the curve order
    powBench(Fp[curve], ExponentIters)
    powUnsafeBench(Fp[curve], ExponentIters)
--- a/constantine.nimble
+++ b/constantine.nimble
@ -218,6 +218,7 @@ proc test(flags, path: string, commandFile = false) =
    exec command
  else:
    exec "echo \'" & command & "\' >> " & buildParallel
    exec "echo \"------------------------------------------------------\""
 proc buildBench(benchName: string, compiler = "", useAsm = true, run = false) =
  if not dirExists "build":
--- a/constantine/arithmetic/finite_fields.nim
+++ b/constantine/arithmetic/finite_fields.nim
@ -386,6 +386,12 @@ func square_repeated*(r: var FF, num: int) {.inline.} =
  for _ in 0 ..< num:
    r.square()
 func square_repeated*(r: var FF, a: FF, num: int) {.inline.} =
  ## Repeated squarings: stores in ``r`` the value of ``a`` squared ``num`` times.
  ##
  ## The first squaring is unconditional, so ``num`` is expected to be >= 1;
  ## with ``num`` < 1 the loop below does not run and ``r`` still ends up as ``a``².
  r.square(a)           # 1st squaring, out-of-place: r <- a²
  for _ in 1 ..< num:   # remaining num-1 squarings, in-place
    r.square()
 func `*=`*(a: var FF, b: static int) {.inline.} =
  ## Multiplication by a small integer known at compile-time
  # Implementation:
--- a/constantine/arithmetic/finite_fields_inversion.nim
+++ b/constantine/arithmetic/finite_fields_inversion.nim
@ -36,7 +36,7 @@ func inv*(r: var Fp, a: Fp) {.inline.} =
  # neither for Secp256k1 nor BN curves
  # Performance is slower than GCD
  # To be revisited with faster squaring/multiplications
-  when Fp.C in {BN254_Snarks, BLS12_381}:
+  when Fp.C.hasInversionAddchain():
    r.inv_addchain(a)
  else:
    r.inv_euclid(a)
@ -48,10 +48,7 @@ func inv*(a: var Fp) {.inline.} =
  ## Incidentally this avoids extra check
  ## to convert Jacobian and Projective coordinates
  ## to affine for elliptic curve
-  # For now we don't activate the addition chains
+  when Fp.C.hasInversionAddchain():
  # for Secp256k1 nor BN curves
  # Performance is slower than GCD
  when Fp.C in {BN254_Snarks, BLS12_381}:
    a.inv_addchain(a)
  else:
    a.inv_euclid(a)
--- a/constantine/arithmetic/finite_fields_square_root.nim
+++ b/constantine/arithmetic/finite_fields_square_root.nim
@ -46,6 +46,10 @@ func isSquare*(a: Fp): SecretBool {.inline.} =
 # Specialized routine for p ≡ 3 (mod 4)
 # ------------------------------------------------------------
 func hasP3mod4_primeModulus(C: static Curve): static bool =
  ## Returns true iff p ≡ 3 (mod 4)
  ##
  ## Only the two lowest bits of the first limb of the modulus are inspected.
  # NOTE(review): assumes limbs[0] is the least-significant limb — TODO confirm
  (BaseType(C.Mod.limbs[0]) and 3) == 3
 func sqrt_p3mod4(a: var Fp) {.inline.} =
  ## Compute the square root of ``a``
  ##
@ -93,7 +97,7 @@ func sqrt_invsqrt_if_square_p3mod4(sqrt, invsqrt: var Fp, a: Fp): SecretBool {.i
  test.square(sqrt)
  result = test == a
-func sqrt_if_square_p3mod4(a: var Fp): SecretBool {.inline.} =
+func sqrt_if_square_p3mod4*(a: var Fp): SecretBool {.inline.} =
  ## If ``a`` is a square, compute the square root of ``a``
  ## if not, ``a`` is unmodified.
  ##
@ -108,14 +112,60 @@ func sqrt_if_square_p3mod4(a: var Fp): SecretBool {.inline.} =
  result = sqrt_invsqrt_if_square_p3mod4(sqrt, invsqrt, a)
  a.ccopy(sqrt, result)
 # Specialized routines for addchain-based square roots
 # ------------------------------------------------------------
 func sqrt_addchain(a: var Fp) {.inline.} =
  ## Compute the square root of ``a`` in-place.
  ##
  ## This requires ``a`` to be a square.
  ## The result is undefined otherwise.
  ##
  ## The square root, if it exists, is multivalued,
  ## i.e. both x² == (-x)²
  ## This procedure returns a deterministic result.
  var invsqrt {.noInit.}: Fp
  # Curve-specific routine defined elsewhere;
  # presumably computes 1/√a via an addition chain — verify against its definition.
  invsqrt.invsqrt_addchain(a)
  # a * (1/√a) == √a
  a *= invsqrt
 func sqrt_invsqrt_addchain(sqrt, invsqrt: var Fp, a: Fp) {.inline.} =
  ## If ``a`` is a square, compute the square root of ``a`` in ``sqrt``
  ## and the inverse square root of ``a`` in ``invsqrt``.
  ##
  ## No squareness check is done here.
  # invsqrt first (curve-specific addition chain defined elsewhere),
  # then sqrt = invsqrt * a
  invsqrt.invsqrt_addchain(a)
  sqrt.prod(invsqrt, a)
 func sqrt_invsqrt_if_square_addchain(sqrt, invsqrt: var Fp, a: Fp): SecretBool {.inline.} =
  ## If ``a`` is a square, compute the square root of ``a`` in ``sqrt``
  ## and the inverse square root of ``a`` in ``invsqrt``.
  ##
  ## Returns true iff ``sqrt``² == ``a``.
  ##
  ## If ``a`` is not a square, ``sqrt`` and ``invsqrt`` are undefined.
  sqrt_invsqrt_addchain(sqrt, invsqrt, a)
  # Squareness is checked after the fact by squaring the candidate root
  var test {.noInit.}: Fp
  test.square(sqrt)
  result = test == a
 func sqrt_if_square_addchain*(a: var Fp): SecretBool {.inline.} =
  ## If ``a`` is a square, compute the square root of ``a`` in-place;
  ## if not, ``a`` is unmodified.
  ##
  ## Returns the squareness flag produced by ``sqrt_invsqrt_if_square_addchain``.
  ##
  ## The square root, if it exists, is multivalued,
  ## i.e. both x² == (-x)²
  ## This procedure returns a deterministic result.
  var sqrt {.noInit.}, invsqrt {.noInit.}: Fp
  result = sqrt_invsqrt_if_square_addchain(sqrt, invsqrt, a)
  # Conditional copy: ``a`` only receives ``sqrt`` when ``result`` is true
  a.ccopy(sqrt, result)
 # Tonelli Shanks for any prime
 # ------------------------------------------------------------
 func precompute_tonelli_shanks(
       a_pre_exp: var Fp,
-       a: Fp) =
+       a: Fp, useAddChain: static bool) =
  a_pre_exp = a
-  a_pre_exp.powUnsafeExponent(Fp.C.tonelliShanks(exponent))
+  when useAddChain:
    a_pre_exp.precompute_tonelli_shanks_addchain(a)
  else:
    a_pre_exp.powUnsafeExponent(Fp.C.tonelliShanks(exponent))
 func isSquare_tonelli_shanks(
       a, a_pre_exp: Fp): SecretBool =
@ -126,10 +176,9 @@ func isSquare_tonelli_shanks(
  ## a^((p-1-2^e)/(2*2^e))
  const e = Fp.C.tonelliShanks(twoAdicity)
  var r {.noInit.}: Fp
-  r.square(a_pre_exp) # a^(2(q-1-2^e)/(2*2^e)) = a^((q-1)/2^e - 1)
+  r.square(a_pre_exp)    # a^(2(q-1-2^e)/(2*2^e)) = a^((q-1)/2^e - 1)
-  r *= a              # a^((q-1)/2^e)
+  r *= a                 # a^((q-1)/2^e)
-  for _ in 0 ..< e-1:
+  r.square_repeated(e-1) # a^((q-1)/2)
    r.square()        # a^((q-1)/2)
  result = not(r.isMinusOne())
  # r can be:
@ -143,14 +192,14 @@ func isSquare_tonelli_shanks(
      r.isMinusOne()
    )
-func sqrt_invsqrt_tonelli_shanks(
+func sqrt_invsqrt_tonelli_shanks_pre(
       sqrt, invsqrt: var Fp,
       a, a_pre_exp: Fp) =
  ## Compute the square_root and inverse_square_root
  ## of `a` via constant-time Tonelli-Shanks
  ##
  ## a_pre_exp is a precomputation a^((p-1-2^e)/(2*2^e))
-  ## ThItat is shared with the simultaneous isSquare routine
+  ## That is shared with the simultaneous isSquare routine
  template z: untyped = a_pre_exp
  template r: untyped = invsqrt
  var t {.noInit.}: Fp
@ -165,8 +214,7 @@ func sqrt_invsqrt_tonelli_shanks(
  var buf {.noInit.}: Fp
  for i in countdown(e, 2, 1):
-    for j in 1 .. i-2:
+    b.square_repeated(i-2)
      b.square()
    let bNotOne = not b.isOne()
    buf.prod(r, root)
@ -178,8 +226,72 @@ func sqrt_invsqrt_tonelli_shanks(
  sqrt.prod(invsqrt, a)
 # ----------------------------------------------
 func sqrt_tonelli_shanks(a: var Fp, useAddChain: static bool) {.inline.} =
  ## Compute the square root of ``a`` in-place via Tonelli-Shanks.
  ##
  ## ``useAddChain`` is forwarded to ``precompute_tonelli_shanks`` to select
  ## how the shared pre-exponentiation is computed.
  ##
  ## This requires ``a`` to be a square.
  ##
  ## The result is undefined otherwise.
  ##
  ## The square root, if it exists, is multivalued,
  ## i.e. both x² == (-x)²
  ## This procedure returns a deterministic result.
  ## This procedure is constant-time.
  var a_pre_exp{.noInit.}, sqrt{.noInit.}, invsqrt{.noInit.}: Fp
  a_pre_exp.precompute_tonelli_shanks(a, useAddChain)
  sqrt_invsqrt_tonelli_shanks_pre(sqrt, invsqrt, a, a_pre_exp)
  # Unconditional overwrite: no squareness check is performed here
  a = sqrt
 func sqrt_invsqrt_tonelli_shanks(sqrt, invsqrt: var Fp, a: Fp, useAddChain: static bool) {.inline.} =
  ## Compute the square root and inverse square root of ``a``
  ## into ``sqrt`` and ``invsqrt`` via Tonelli-Shanks.
  ##
  ## ``useAddChain`` is forwarded to ``precompute_tonelli_shanks`` to select
  ## how the shared pre-exponentiation is computed.
  ##
  ## This requires ``a`` to be a square.
  ##
  ## The result is undefined otherwise.
  ##
  ## The square root, if it exists, is multivalued,
  ## i.e. both x² == (-x)²
  ## This procedure returns a deterministic result.
  var a_pre_exp{.noInit.}: Fp
  a_pre_exp.precompute_tonelli_shanks(a, useAddChain)
  sqrt_invsqrt_tonelli_shanks_pre(sqrt, invsqrt, a, a_pre_exp)
 func sqrt_invsqrt_if_square_tonelli_shanks(sqrt, invsqrt: var Fp, a: Fp, useAddChain: static bool): SecretBool  {.inline.} =
  ## Compute the square root and inverse square root of ``a``
  ## into ``sqrt`` and ``invsqrt`` via Tonelli-Shanks.
  ##
  ## ``useAddChain`` is forwarded to ``precompute_tonelli_shanks`` to select
  ## how the shared pre-exponentiation is computed.
  ##
  ## This returns true if ``a`` is square and sqrt/invsqrt contains the square root/inverse square root
  ##
  ## The result is undefined otherwise
  ##
  ## The square root, if it exists, is multivalued,
  ## i.e. both x² == (-x)²
  ## This procedure returns a deterministic result
  var a_pre_exp{.noInit.}: Fp
  # The pre-exponentiation is shared between the squareness test and the root computation
  a_pre_exp.precompute_tonelli_shanks(a, useAddChain)
  result = isSquare_tonelli_shanks(a, a_pre_exp)
  sqrt_invsqrt_tonelli_shanks_pre(sqrt, invsqrt, a, a_pre_exp)
  # ``a`` is an immutable input: the outputs live in ``sqrt`` and ``invsqrt`` only.
 func sqrt_if_square_tonelli_shanks*(a: var Fp, useAddChain: static bool): SecretBool {.inline.} =
  ## If ``a`` is a square, compute the square root of ``a`` in-place;
  ## if not, ``a`` is unmodified.
  ##
  ## ``useAddChain`` is forwarded to ``precompute_tonelli_shanks`` to select
  ## how the shared pre-exponentiation is computed.
  ##
  ## Returns the squareness flag produced by ``isSquare_tonelli_shanks``.
  ##
  ## The square root, if it exists, is multivalued,
  ## i.e. both x² == (-x)²
  ## This procedure returns a deterministic result
  ## This procedure is constant-time
  var a_pre_exp{.noInit.}, sqrt{.noInit.}, invsqrt{.noInit.}: Fp
  # The pre-exponentiation is shared between the squareness test and the root computation
  a_pre_exp.precompute_tonelli_shanks(a, useAddChain)
  result = isSquare_tonelli_shanks(a, a_pre_exp)
  sqrt_invsqrt_tonelli_shanks_pre(sqrt, invsqrt, a, a_pre_exp)
  # Conditional copy, like the p ≡ 3 (mod 4) and addchain variants:
  # ``a`` only receives ``sqrt`` when ``result`` is true, as documented.
  a.ccopy(sqrt, result)
 # Public routines
 # ------------------------------------------------------------
 # Note: we export the inner sqrt_invsqrt_IMPL
 #       for benchmarking purposes.
 func sqrt*[C](a: var Fp[C]) {.inline.} =
  ## Compute the square root of ``a``
@ -192,30 +304,12 @@ func sqrt*[C](a: var Fp[C]) {.inline.} =
  ## i.e. both x² == (-x)²
  ## This procedure returns a deterministic result
  ## This procedure is constant-time
-  when (BaseType(C.Mod.limbs[0]) and 3) == 3:
+  when C.hasSqrtAddchain():
    sqrt_addchain(a)
  elif C.hasP3mod4_primeModulus():
    sqrt_p3mod4(a)
  else:
-    var a_pre_exp{.noInit.}, sqrt{.noInit.}, invsqrt{.noInit.}: Fp[C]
+    sqrt_tonelli_shanks(a, useAddChain = C.hasTonelliShanksAddchain())
    a_pre_exp.precompute_tonelli_shanks(a)
    sqrt_invsqrt_tonelli_shanks(sqrt, invsqrt, a, a_pre_exp)
    a = sqrt
 func sqrt_if_square*[C](a: var Fp[C]): SecretBool {.inline.} =
  ## If ``a`` is a square, compute the square root of ``a``
  ## if not, ``a`` is unmodified.
  ##
  ## The square root, if it exist is multivalued,
  ## i.e. both x² == (-x)²
  ## This procedure returns a deterministic result
  ## This procedure is constant-time
  when (BaseType(C.Mod.limbs[0]) and 3) == 3:
    result = sqrt_if_square_p3mod4(a)
  else:
    var a_pre_exp{.noInit.}, sqrt{.noInit.}, invsqrt{.noInit.}: Fp[C]
    a_pre_exp.precompute_tonelli_shanks(a)
    result = isSquare_tonelli_shanks(a, a_pre_exp)
    sqrt_invsqrt_tonelli_shanks(sqrt, invsqrt, a, a_pre_exp)
    a = sqrt
 func sqrt_invsqrt*[C](sqrt, invsqrt: var Fp[C], a: Fp[C]) {.inline.} =
  ## Compute the square root and inverse square root of ``a``
@ -227,12 +321,12 @@ func sqrt_invsqrt*[C](sqrt, invsqrt: var Fp[C], a: Fp[C]) {.inline.} =
  ## The square root, if it exist is multivalued,
  ## i.e. both x² == (-x)²
  ## This procedure returns a deterministic result
-  when (BaseType(C.Mod.limbs[0]) and 3) == 3:
+  when C.hasSqrtAddchain():
    sqrt_invsqrt_addchain(sqrt, invsqrt, a)
  elif C.hasP3mod4_primeModulus():
    sqrt_invsqrt_p3mod4(sqrt, invsqrt, a)
  else:
-    var a_pre_exp{.noInit.}: Fp[C]
+    sqrt_invsqrt_tonelli_shanks(sqrt, invsqrt, a, useAddChain = C.hasTonelliShanksAddchain())
    a_pre_exp.precompute_tonelli_shanks(a)
    sqrt_invsqrt_tonelli_shanks(sqrt, invsqrt, a, a_pre_exp)
 func sqrt_invsqrt_if_square*[C](sqrt, invsqrt: var Fp[C], a: Fp[C]): SecretBool  {.inline.} =
  ## Compute the square root and inverse square root of ``a``
@ -244,11 +338,24 @@ func sqrt_invsqrt_if_square*[C](sqrt, invsqrt: var Fp[C], a: Fp[C]): SecretBool
  ## The square root, if it exist is multivalued,
  ## i.e. both x² == (-x)²
  ## This procedure returns a deterministic result
-  when (BaseType(C.Mod.limbs[0]) and 3) == 3:
+  when C.hasSqrtAddchain():
    result = sqrt_invsqrt_if_square_addchain(sqrt, invsqrt, a)
  elif C.hasP3mod4_primeModulus():
    result = sqrt_invsqrt_if_square_p3mod4(sqrt, invsqrt, a)
  else:
-    var a_pre_exp{.noInit.}: Fp[C]
+    result = sqrt_invsqrt_if_square_tonelli_shanks(sqrt, invsqrt, a, useAddChain = C.hasTonelliShanksAddchain())
-    a_pre_exp.precompute_tonelli_shanks(a)
+
-    result = isSquare_tonelli_shanks(a, a_pre_exp)
+func sqrt_if_square*[C](a: var Fp[C]): SecretBool {.inline.} =
-    sqrt_invsqrt_tonelli_shanks(sqrt, invsqrt, a, a_pre_exp)
+  ## If ``a`` is a square, compute the square root of ``a``
-    a = sqrt
+  ## if not, ``a`` is unmodified.
  ##
  ## The square root, if it exist is multivalued,
  ## i.e. both x² == (-x)²
  ## This procedure returns a deterministic result
  ## This procedure is constant-time
  when C.hasSqrtAddchain():
    result = sqrt_if_square_addchain(a)
  elif C.hasP3mod4_primeModulus():
    result = sqrt_if_square_p3mod4(a)
  else:
    result = sqrt_if_square_tonelli_shanks(a, useAddChain = C.hasTonelliShanksAddchain())
--- a/constantine/curves/bls12_377_inversion.nim
+++ b/constantine/curves/bls12_377_inversion.nim
@ -0,0 +1,204 @@
 # Constantine
 # Copyright (c) 2018-2019    Status Research & Development GmbH
 # Copyright (c) 2020-Present Mamy André-Ratsimbazafy
 # Licensed and distributed under either of
 #   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
 #   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
 # at your option. This file may not be copied, modified, or distributed except according to those terms.
 import
  ../config/curves,
  ../arithmetic/finite_fields
 # ############################################################
 #
 #           Specialized inversion for BLS12-377
 #
 # ############################################################
 func inv_addchain*(r: var Fp[BLS12_377], a: Fp[BLS12_377]) =
  let a = a # ensure a.inv_addchain(a) is OK
  var
    x10       {.noInit.}: Fp[BLS12_377]
    x11       {.noInit.}: Fp[BLS12_377]
    x100      {.noInit.}: Fp[BLS12_377]
    x101      {.noInit.}: Fp[BLS12_377]
    x111      {.noInit.}: Fp[BLS12_377]
    x1001     {.noInit.}: Fp[BLS12_377]
    x1011     {.noInit.}: Fp[BLS12_377]
    x1111     {.noInit.}: Fp[BLS12_377]
    x10001    {.noInit.}: Fp[BLS12_377]
    x10011    {.noInit.}: Fp[BLS12_377]
    x10111    {.noInit.}: Fp[BLS12_377]
    x11011    {.noInit.}: Fp[BLS12_377]
    x11101    {.noInit.}: Fp[BLS12_377]
    x11111    {.noInit.}: Fp[BLS12_377]
    x110100   {.noInit.}: Fp[BLS12_377]
    x11010000 {.noInit.}: Fp[BLS12_377]
    x11010111 {.noInit.}: Fp[BLS12_377]
  x10       .square(a)
  x11       .prod(a, x10)
  x100      .prod(a, x11)
  x101      .prod(a, x100)
  x111      .prod(x10, x101)
  x1001     .prod(x10, x111)
  x1011     .prod(x10, x1001)
  x1111     .prod(x100, x1011)
  x10001    .prod(x10, x1111)
  x10011    .prod(x10, x10001)
  x10111    .prod(x100, x10011)
  x11011    .prod(x100, x10111)
  x11101    .prod(x10, x11011)
  x11111    .prod(x10, x11101)
  x110100   .prod(x10111, x11101)
  x11010000 .square_repeated(x110100, 2)
  x11010111 .prod(x111, x11010000)
  # 18 operations
  # TODO: we can accumulate in a partially reduced
  #       doubled-size `r` to avoid the final subtractions.
  #       and only reduce at the end.
  #       This requires the number of op to be less than log2(p) == 377
  # 18 + 18 = 36 operations
  r.square_repeated(x11010111, 8)
  r *= x11101
  r.square_repeated(7)
  r *= x10001
  r.square()
  # 36 + 14 = 50 operations
  r *= a
  r.square_repeated(9)
  r *= x10111
  r.square_repeated(2)
  r *= x11
  # 50 + 21 = 71 operations
  r.square_repeated(6)
  r *= x101
  r.square_repeated(4)
  r *= a
  r.square_repeated(9)
  # 71 + 13 = 84 operations
  r *= x11101
  r.square_repeated(5)
  r *= x1011
  r.square_repeated(5)
  r *= x11
  # 84 + 21 = 105 operations
  r.square_repeated(8)
  r *= x11101
  r.square()
  r *= a
  r.square_repeated(10)
  # 105 + 20 = 125 operations
  r *= x10111
  r.square_repeated(12)
  r *= x11011
  r.square_repeated(5)
  r *= x101
  # 125 + 22 = 147 operations
  r.square_repeated(7)
  r *= x101
  r.square_repeated(6)
  r *= x1001
  r.square_repeated(7)
  # 147 + 11 = 158 operations
  r *= x11101
  r.square_repeated(5)
  r *= x10001
  r.square_repeated(3)
  r *= x101
  # 158 + 23 = 181 operations
  r.square_repeated(8)
  r *= x10001
  r.square_repeated(6)
  r *= x11011
  r.square_repeated(7)
  # 181 + 19 = 200 operations
  r *= x11111
  r.square_repeated(4)
  r *= x11
  r.square_repeated(12)
  r *= x1111
  # 200 + 19 = 219 operations
  r.square_repeated(4)
  r *= x101
  r.square_repeated(8)
  r *= x10011
  r.square_repeated(5)
  # 219 + 13 = 232 operations
  r *= x10001
  r.square_repeated(3)
  r *= x111
  r.square_repeated(7)
  r *= x1111
  # 232 + 22 = 254 operations
  r.square_repeated(5)
  r *= x1111
  r.square_repeated(7)
  r *= x11011
  r.square_repeated(8)
  # 254 + 15 = 269 operations
  r *= x10001
  r.square_repeated(6)
  r *= x11111
  r.square_repeated(6)
  r *= x11101
  # 269 + 35 = 304 operations
  r.square_repeated(9)
  r *= x1001
  r.square_repeated(5)
  r *= x1001
  r.square_repeated(19)
  # 304 + 17 = 321 operations
  r *= x10111
  r.square_repeated(8)
  r *= x1011
  r.square_repeated(6)
  r *= x10111
  # 321 + 16 = 337 operations
  r.square_repeated(4)
  r *= x101
  r.square_repeated(4)
  r *= a
  r.square_repeated(6)
  # 337 + 39 = 376 operations
  r *= x11
  r.square_repeated(29)
  r *= a
  r.square_repeated(7)
  r *= x101
  # 376 + 16 = 392 operations
  r.square_repeated(9)
  r *= x10001
  r.square_repeated(6)
  # 392 + 8*6 = 440 operations
  for _ in 0 ..< 8:
    r *= x11111
    r.square_repeated(5)
  r *= x11111
  r.square()
  r *= a
  # Total 443 operations
--- a/constantine/curves/bls12_377_sqrt.nim
+++ b/constantine/curves/bls12_377_sqrt.nim
@ -8,7 +8,8 @@
 import
  ../config/[curves, type_bigint, type_ff],
-  ../io/[io_bigints, io_fields]
+  ../io/[io_bigints, io_fields],
  ../arithmetic/finite_fields
 const
  # with e = 2adicity
@ -18,3 +19,188 @@ const
  BLS12_377_TonelliShanks_exponent* = BigInt[330].fromHex"0x35c748c2f8a21d58c760b80d94292763445b3e601ea271e3de6c45f741290002e16ba88600000010a11"
  BLS12_377_TonelliShanks_twoAdicity* = 46
  BLS12_377_TonelliShanks_root_of_unity* = Fp[BLS12_377].fromHex"0x382d3d99cdbc5d8fe9dee6aa914b0ad14fcaca7022110ec6eaa2bc56228ac41ea03d28cc795186ba6b5ef26b00bbe8"
 # ############################################################
 #
 #       Specialized Tonelli-Shanks for BLS12-377
 #
 # ############################################################
 func precompute_tonelli_shanks_addchain*(
       r: var Fp[BLS12_377],
       a: Fp[BLS12_377]) =
  ## Does a^BLS12_377_TonelliShanks_exponent
  ## via an addition-chain
  var
    x10       {.noInit.}: Fp[BLS12_377]
    x11       {.noInit.}: Fp[BLS12_377]
    x100      {.noInit.}: Fp[BLS12_377]
    x101      {.noInit.}: Fp[BLS12_377]
    x111      {.noInit.}: Fp[BLS12_377]
    x1001     {.noInit.}: Fp[BLS12_377]
    x1011     {.noInit.}: Fp[BLS12_377]
    x1111     {.noInit.}: Fp[BLS12_377]
    x10001    {.noInit.}: Fp[BLS12_377]
    x10011    {.noInit.}: Fp[BLS12_377]
    x10111    {.noInit.}: Fp[BLS12_377]
    x11011    {.noInit.}: Fp[BLS12_377]
    x11101    {.noInit.}: Fp[BLS12_377]
    x11111    {.noInit.}: Fp[BLS12_377]
    x110100   {.noInit.}: Fp[BLS12_377]
    x11010000 {.noInit.}: Fp[BLS12_377]
    x11010111 {.noInit.}: Fp[BLS12_377]
  x10       .square(a)
  x11       .prod(a, x10)
  x100      .prod(a, x11)
  x101      .prod(a, x100)
  x111      .prod(x10, x101)
  x1001     .prod(x10, x111)
  x1011     .prod(x10, x1001)
  x1111     .prod(x100, x1011)
  x10001    .prod(x10, x1111)
  x10011    .prod(x10, x10001)
  x10111    .prod(x100, x10011)
  x11011    .prod(x100, x10111)
  x11101    .prod(x10, x11011)
  x11111    .prod(x10, x11101)
  x110100   .prod(x10111, x11101)
  x11010000 .square_repeated(x110100, 2)
  x11010111 .prod(x111, x11010000)
  # 18 operations
  # TODO: we can accumulate in a partially reduced
  #       doubled-size `r` to avoid the final subtractions.
  #       and only reduce at the end.
  #       This requires the number of op to be less than log2(p) == 377
  # 18 + 18 = 36 operations
  r.square_repeated(x11010111, 8)
  r *= x11101
  r.square_repeated(7)
  r *= x10001
  r.square()
  # 36 + 14 = 50 operations
  r *= a
  r.square_repeated(9)
  r *= x10111
  r.square_repeated(2)
  r *= x11
  # 50 + 21 = 71 operations
  r.square_repeated(6)
  r *= x101
  r.square_repeated(4)
  r *= a
  r.square_repeated(9)
  # 71 + 13 = 84 operations
  r *= x11101
  r.square_repeated(5)
  r *= x1011
  r.square_repeated(5)
  r *= x11
  # 84 + 21 = 105 operations
  r.square_repeated(8)
  r *= x11101
  r.square()
  r *= a
  r.square_repeated(10)
  # 105 + 20 = 125 operations
  r *= x10111
  r.square_repeated(12)
  r *= x11011
  r.square_repeated(5)
  r *= x101
  # 125 + 22 = 147 operations
  r.square_repeated(7)
  r *= x101
  r.square_repeated(6)
  r *= x1001
  r.square_repeated(7)
  # 147 + 11 = 158 operations
  r *= x11101
  r.square_repeated(5)
  r *= x10001
  r.square_repeated(3)
  r *= x101
  # 158 + 23 = 181 operations
  r.square_repeated(8)
  r *= x10001
  r.square_repeated(6)
  r *= x11011
  r.square_repeated(7)
  # 181 + 19 = 200 operations
  r *= x11111
  r.square_repeated(4)
  r *= x11
  r.square_repeated(12)
  r *= x1111
  # 200 + 19 = 219 operations
  r.square_repeated(4)
  r *= x101
  r.square_repeated(8)
  r *= x10011
  r.square_repeated(5)
  # 219 + 13 = 232 operations
  r *= x10001
  r.square_repeated(3)
  r *= x111
  r.square_repeated(7)
  r *= x1111
  # 232 + 22 = 254 operations
  r.square_repeated(5)
  r *= x1111
  r.square_repeated(7)
  r *= x11011
  r.square_repeated(8)
  # 254 + 15 = 269 operations
  r *= x10001
  r.square_repeated(6)
  r *= x11111
  r.square_repeated(6)
  r *= x11101
  # 269 + 35 = 304 operations
  r.square_repeated(9)
  r *= x1001
  r.square_repeated(5)
  r *= x1001
  r.square_repeated(19)
  # 304 + 17 = 321 operations
  r *= x10111
  r.square_repeated(8)
  r *= x1011
  r.square_repeated(6)
  r *= x10111
  # 321 + 16 = 337 operations
  r.square_repeated(4)
  r *= x101
  r.square_repeated(4)
  r *= a
  r.square_repeated(6)
  # 337 + 39 = 376 operations
  r *= x11
  r.square_repeated(29)
  r *= a
  r.square_repeated(7)
  r *= x101
  # 376 + 10 = 386 operations
  r.square_repeated(9)
  r *= x10001
--- a/constantine/curves/bls12_381_inversion.nim
+++ b/constantine/curves/bls12_381_inversion.nim
@ -88,7 +88,8 @@ func inv_addchain*(r: var Fp[BLS12_381], a: Fp[BLS12_381]) =
  x11100101 .prod(x100, x11100001)
  x11101011 .prod(x10100, x11010111)
  x11110101 .prod(x10100, x11100001)
-  x11111111 .prod(x10100, x11101011) # 35 operations
+  x11111111 .prod(x10100, x11101011)
  # 35 operations
  # TODO: we can accumulate in a partially reduced
  #       doubled-size `r` to avoid the final subtractions.
@ -109,7 +110,7 @@ func inv_addchain*(r: var Fp[BLS12_381], a: Fp[BLS12_381]) =
  r *= x11111111
  r.square_repeated(7)
-  # 88 + 22 = 107 operations
+  # 85 + 22 = 107 operations
  r *= x1001101
  r.square_repeated(9)
  r *= x1101001
--- a/constantine/curves/bls12_381_sqrt.nim
+++ b/constantine/curves/bls12_381_sqrt.nim
@ -0,0 +1,223 @@
 # Constantine
 # Copyright (c) 2018-2019    Status Research & Development GmbH
 # Copyright (c) 2020-Present Mamy André-Ratsimbazafy
 # Licensed and distributed under either of
 #   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
 #   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
 # at your option. This file may not be copied, modified, or distributed except according to those terms.
 import
  ../config/curves,
  ../arithmetic/finite_fields
 # ############################################################
 #
 #           Specialized invsqrt for BLS12-381
 #
 # ############################################################
 func invsqrt_addchain*(r: var Fp[BLS12_381], a: Fp[BLS12_381]) =
  ## Raise `a` to a fixed, hard-coded exponent with an addition chain
  ## and store the result in `r`.
  ##
  ## NOTE(review): going by the function name and the file header, the
  ## exponent is presumably the one that yields 1/sqrt(a) - confirm against
  ## the tool/parameters that generated this chain.
  ##
  ## Structure:
  ## 1. A table of small powers is built from `a`. Each `x<bits>` temporary
  ##    holds `a` raised to the exponent whose binary writing is `<bits>`
  ##    (e.g. `x10` = a^2, `x1001` = a * a^8 = a^9).
  ## 2. `r` is then driven by alternating runs of squarings
  ##    (`square_repeated(n)` counts as `n` operations) and
  ##    multiplications by table entries.
  ##
  ## `a` is only read while the table is built, before `r` is written
  ## for the first time.
  ##
  ## The `# x + y = z operations` comments are a running operation count;
  ## each one accounts for the group of statements that follows it.
  var
    x10       {.noInit.}: Fp[BLS12_381]
    x100      {.noInit.}: Fp[BLS12_381]
    x1000     {.noInit.}: Fp[BLS12_381]
    x1001     {.noInit.}: Fp[BLS12_381]
    x1011     {.noInit.}: Fp[BLS12_381]
    x1101     {.noInit.}: Fp[BLS12_381]
    x10001    {.noInit.}: Fp[BLS12_381]
    x10100    {.noInit.}: Fp[BLS12_381]
    x10101    {.noInit.}: Fp[BLS12_381]
    x11001    {.noInit.}: Fp[BLS12_381]
    x11010    {.noInit.}: Fp[BLS12_381]
    x110100   {.noInit.}: Fp[BLS12_381]
    x110110   {.noInit.}: Fp[BLS12_381]
    x110111   {.noInit.}: Fp[BLS12_381]
    x1001101  {.noInit.}: Fp[BLS12_381]
    x1001111  {.noInit.}: Fp[BLS12_381]
    x1010101  {.noInit.}: Fp[BLS12_381]
    x1011101  {.noInit.}: Fp[BLS12_381]
    x1100111  {.noInit.}: Fp[BLS12_381]
    x1101001  {.noInit.}: Fp[BLS12_381]
    x1110111  {.noInit.}: Fp[BLS12_381]
    x1111011  {.noInit.}: Fp[BLS12_381]
    x10001001 {.noInit.}: Fp[BLS12_381]
    x10010101 {.noInit.}: Fp[BLS12_381]
    x10010111 {.noInit.}: Fp[BLS12_381]
    x10101001 {.noInit.}: Fp[BLS12_381]
    x10110001 {.noInit.}: Fp[BLS12_381]
    x10111111 {.noInit.}: Fp[BLS12_381]
    x11000011 {.noInit.}: Fp[BLS12_381]
    x11010000 {.noInit.}: Fp[BLS12_381]
    x11010111 {.noInit.}: Fp[BLS12_381]
    x11100001 {.noInit.}: Fp[BLS12_381]
    x11100101 {.noInit.}: Fp[BLS12_381]
    x11101011 {.noInit.}: Fp[BLS12_381]
    x11110101 {.noInit.}: Fp[BLS12_381]
    x11111111 {.noInit.}: Fp[BLS12_381]
  # Table of small powers of `a`: one field operation per line.
  x10       .square(a)
  x100      .square(x10)
  x1000     .square(x100)
  x1001     .prod(a, x1000)
  x1011     .prod(x10, x1001)
  x1101     .prod(x10, x1011)
  x10001    .prod(x100, x1101)
  x10100    .prod(x1001, x1011)
  x10101    .prod(a, x10100)
  x11001    .prod(x100, x10101)
  x11010    .prod(a, x11001)
  x110100   .square(x11010)
  x110110   .prod(x10, x110100)
  x110111   .prod(a, x110110)
  x1001101  .prod(x11001, x110100)
  x1001111  .prod(x10, x1001101)
  x1010101  .prod(x1000, x1001101)
  x1011101  .prod(x1000, x1010101)
  x1100111  .prod(x11010, x1001101)
  x1101001  .prod(x10, x1100111)
  x1110111  .prod(x11010, x1011101)
  x1111011  .prod(x100, x1110111)
  x10001001 .prod(x110100, x1010101)
  x10010101 .prod(x11010, x1111011)
  x10010111 .prod(x10, x10010101)
  x10101001 .prod(x10100, x10010101)
  x10110001 .prod(x1000, x10101001)
  x10111111 .prod(x110110, x10001001)
  x11000011 .prod(x100, x10111111)
  x11010000 .prod(x1101, x11000011)
  x11010111 .prod(x10100, x11000011)
  x11100001 .prod(x10001, x11010000)
  x11100101 .prod(x100, x11100001)
  x11101011 .prod(x10100, x11010111)
  x11110101 .prod(x10100, x11100001)
  x11111111 .prod(x10100, x11101011)
  # 36 operations
  # TODO: we can accumulate in a partially reduced
  #       doubled-size `r` to avoid the final subtractions.
  #       and only reduce at the end.
  #       This requires the number of op to be less than log2(p) == 381
  # 36 + 22 = 58 operations
  r.prod(x10111111, x11100001)
  r.square_repeated(8)
  r *= x10001
  r.square_repeated(11)
  r *= x11110101
  # 58 + 28 = 86 operations
  r.square_repeated(11)
  r *= x11100101
  r.square_repeated(8)
  r *= x11111111
  r.square_repeated(7)
  # 86 + 22 = 108 operations
  r *= x1001101
  r.square_repeated(9)
  r *= x1101001
  r.square_repeated(10)
  r *= x10110001
  # 108+24 = 132 operations
  r.square_repeated(7)
  r *= x1011101
  r.square_repeated(9)
  r *= x1111011
  r.square_repeated(6)
  # 132+23 = 155 operations
  r *= x11001
  r.square_repeated(11)
  r *= x1101001
  r.square_repeated(9)
  r *= x11101011
  # 155+28 = 183 operations
  r.square_repeated(10)
  r *= x11010111
  r.square_repeated(6)
  r *= x11001
  r.square_repeated(10)
  # 183+23 = 206 operations
  r *= x1110111
  r.square_repeated(9)
  r *= x10010111
  r.square_repeated(11)
  r *= x1001111
  # 206+30 = 236 operations
  r.square_repeated(10)
  r *= x11100001
  r.square_repeated(9)
  r *= x10001001
  r.square_repeated(9)
  # 236+21 = 257 operations
  r *= x10111111
  r.square_repeated(8)
  r *= x1100111
  r.square_repeated(10)
  r *= x11000011
  # 257+28 = 285 operations
  r.square_repeated(9)
  r *= x10010101
  r.square_repeated(12)
  r *= x1111011
  r.square_repeated(5)
  # 285 + 21 = 306 operations
  r *= x1011
  r.square_repeated(11)
  r *= x1111011
  r.square_repeated(7)
  r *= x1001
  # 306+32 = 338 operations
  r.square_repeated(13)
  r *= x11110101
  r.square_repeated(9)
  r *= x10111111
  r.square_repeated(8)
  # 338+22 = 360 operations
  r *= x11111111
  r.square_repeated(8)
  r *= x11101011
  r.square_repeated(11)
  r *= x10101001
  # 360+24 = 384 operations
  r.square_repeated(8)
  r *= x11111111
  r.square_repeated(8)
  r *= x11111111
  r.square_repeated(6)
  # 384+22 = 406 operations
  r *= x110111
  r.square_repeated(10)
  r *= x11111111
  r.square_repeated(9)
  r *= x11111111
  # 406+26 = 432 operations
  r.square_repeated(8)
  r *= x11111111
  r.square_repeated(8)
  r *= x11111111
  r.square_repeated(8)
  # 432+17 = 449 operations
  r *= x11111111
  r.square_repeated(7)
  r *= x1010101
  r.square_repeated(6)
  r *= x10101
  r.square()
  # Total 449 operations:
  # - 75 multiplications
  # - 374 squarings
--- a/constantine/curves/bn254_nogami_inversion.nim
+++ b/constantine/curves/bn254_nogami_inversion.nim
@ -0,0 +1,98 @@
 # Constantine
 # Copyright (c) 2018-2019    Status Research & Development GmbH
 # Copyright (c) 2020-Present Mamy André-Ratsimbazafy
 # Licensed and distributed under either of
 #   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
 #   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
 # at your option. This file may not be copied, modified, or distributed except according to those terms.
 import
  ../config/curves,
  ../arithmetic/finite_fields
 # ############################################################
 #
 #           Specialized inversion for BN254-Nogami
 #
 # ############################################################
 func inv_addchain*(r: var Fp[BN254_Nogami], a: Fp[BN254_Nogami]) =
  ## Raise `a` to a fixed, hard-coded exponent with an addition chain
  ## and store the result in `r`.
  ##
  ## NOTE(review): going by the function name and the file header, the
  ## exponent is presumably the one that yields the modular inverse of `a`
  ## - confirm against the tool/parameters that generated this chain.
  ##
  ## Naming of the temporaries:
  ## - `x<bits>` holds `a` raised to the exponent whose binary writing is
  ##   `<bits>` (e.g. `x100` = a^4, `x1101` = a * a^12 = a^13).
  ## - `r<N>` is named after the running operation count at which it is
  ##   complete (e.g. `r13` is ready after the 9 table operations plus
  ##   2 squarings and 2 multiplications).
  ##
  ## `square_repeated(n)` counts as `n` operations.
  ## `a` is only read while the `x<bits>` table is built, before `r` is
  ## written for the first time.
  var
    x100     {.noInit.}: Fp[BN254_Nogami]
    x1000    {.noInit.}: Fp[BN254_Nogami]
    x1100    {.noInit.}: Fp[BN254_Nogami]
    x1101    {.noInit.}: Fp[BN254_Nogami]
    x10001   {.noInit.}: Fp[BN254_Nogami]
    x100010  {.noInit.}: Fp[BN254_Nogami]
    x1000100 {.noInit.}: Fp[BN254_Nogami]
    x1010101 {.noInit.}: Fp[BN254_Nogami]
  # Table of small powers of `a` (the first line is 2 squarings).
  x100     .square_repeated(a, 2)
  x1000    .square(x100)
  x1100    .prod(x100, x1000)
  x1101    .prod(a, x1100)
  x10001   .prod(x100, x1101)
  x100010  .square(x10001)
  x1000100 .square(x100010)
  x1010101 .prod(x10001, x1000100)
  # 9 operations
  var
    r13      {.noInit.}: Fp[BN254_Nogami]
    r17      {.noInit.}: Fp[BN254_Nogami]
    r18      {.noInit.}: Fp[BN254_Nogami]
    r23      {.noInit.}: Fp[BN254_Nogami]
    r26      {.noInit.}: Fp[BN254_Nogami]
    r27      {.noInit.}: Fp[BN254_Nogami]
    r28      {.noInit.}: Fp[BN254_Nogami]
    r36      {.noInit.}: Fp[BN254_Nogami]
    r38      {.noInit.}: Fp[BN254_Nogami]
    r39      {.noInit.}: Fp[BN254_Nogami]
    r40      {.noInit.}: Fp[BN254_Nogami]
  # Intermediate values that are reused later in the chain.
  r13.square_repeated(x1010101, 2)
  r13 *= x100010
  r13 *= x1101
  r17.square(r13)
  r17 *= r13
  r17.square_repeated(2)
  r18.prod(r13, r17)
  r23.square_repeated(r18, 3)
  r23 *= r18
  r23 *= r17
  r26.square_repeated(r23, 2)
  r26 *= r23
  r27.prod(r23, r26)
  r28.prod(r26, r27)
  r36.square(r28)
  r36 *= r28
  r36.square_repeated(2)
  r36 *= r28
  r36.square_repeated(3)
  r38.prod(r28, r36)
  r38 *= r27
  r39.square(r38)
  r40.prod(r38, r39)
  # 40 operations so far; from here on only `r` is updated.
  r.prod(r39, r40)
  r.square_repeated(3)
  r *= r40
  r.square_repeated(55)
  r *= r38
  r.square_repeated(55)
  r *= r28
  r.square_repeated(56)
  r *= r18
  r.square_repeated(56)
  r *= x10001
  # Total 271 operations
--- a/constantine/curves/bn254_nogami_sqrt.nim
+++ b/constantine/curves/bn254_nogami_sqrt.nim
@ -0,0 +1,89 @@
 # Constantine
 # Copyright (c) 2018-2019    Status Research & Development GmbH
 # Copyright (c) 2020-Present Mamy André-Ratsimbazafy
 # Licensed and distributed under either of
 #   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
 #   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
 # at your option. This file may not be copied, modified, or distributed except according to those terms.
 import
  ../config/curves,
  ../arithmetic/finite_fields
 # ############################################################
 #
 #           Specialized invsqrt for BN254-Nogami
 #
 # ############################################################
 func invsqrt_addchain*(r: var Fp[BN254_Nogami], a: Fp[BN254_Nogami]) =
  ## Raise `a` to a fixed, hard-coded exponent with an addition chain
  ## and store the result in `r`.
  ##
  ## NOTE(review): going by the function name, the exponent is presumably
  ## the one that yields 1/sqrt(a) - confirm against the tool/parameters
  ## that generated this chain.
  ##
  ## Naming of the temporaries:
  ## - `x<bits>` holds `a` raised to the exponent whose binary writing is
  ##   `<bits>` (`x10` = a^2, `x11` = a^3).
  ## - `r<N>` is named after the running operation count at which it is
  ##   complete (e.g. `r10` is ready after 2 + 7 squarings + 1 product).
  ##
  ## `square_repeated(n)` counts as `n` operations.
  ## `r` may alias `a`: the input is copied first because it is still
  ## needed by the last multiplication, after `r` has been overwritten.
  let a = a # local copy so that `r.invsqrt_addchain(r)` reads the original input
  var
    x10 {.noInit.}: Fp[BN254_Nogami]
    x11 {.noInit.}: Fp[BN254_Nogami]
  x10 .square(a)
  x11 .prod(a, x10)
  # 2 operations
  var
    r10  {.noInit.}: Fp[BN254_Nogami]
    r14  {.noInit.}: Fp[BN254_Nogami]
    r15  {.noInit.}: Fp[BN254_Nogami]
    r20  {.noInit.}: Fp[BN254_Nogami]
    r23  {.noInit.}: Fp[BN254_Nogami]
    r24  {.noInit.}: Fp[BN254_Nogami]
    r25  {.noInit.}: Fp[BN254_Nogami]
    r33  {.noInit.}: Fp[BN254_Nogami]
    r35  {.noInit.}: Fp[BN254_Nogami]
    r36  {.noInit.}: Fp[BN254_Nogami]
    r37  {.noInit.}: Fp[BN254_Nogami]
  # Intermediate values that are reused later in the chain.
  r10.square_repeated(x11, 7)
  r10 *= x11
  r14.square(r10)
  r14 *= r10
  r14.square_repeated(2)
  r15.prod(r10, r14)
  r20.square_repeated(r15, 3)
  r20 *= r15
  r20 *= r14
  r23.square_repeated(r20, 2)
  r23 *= r20
  r24.prod(r20, r23)
  r25.prod(r23, r24)
  r33.square(r25)
  r33 *= r25
  r33.square_repeated(2)
  r33 *= r25
  r33.square_repeated(3)
  r35.prod(r25, r33)
  r35 *= r24
  r36.square(r35)
  r37.prod(r35, r36)
  # 37 operations so far; from here on only `r` is updated.
  r.prod(r36, r37)
  r.square_repeated(3)
  r *= r37
  r.square_repeated(55)
  r *= r35
  r.square_repeated(55)
  r *= r25
  r.square_repeated(56)
  r *= r15
  r.square_repeated(52)
  r *= a
  r.square_repeated(2)
  # Total 266 operations
--- a/constantine/curves/bn254_snarks_pairing.nim
+++ b/constantine/curves/bn254_snarks_pairing.nim
@ -71,44 +71,44 @@ func pow_u*(r: var Fp12[BN254_Snarks], a: Fp12[BN254_Snarks], invert = BN254_Sna
    x10001110 .prod(x10110, x1111000)
    var
-      i15 {.noInit.}: Fp12[BN254_Snarks]
+      r15 {.noInit.}: Fp12[BN254_Snarks]
-      i16 {.noInit.}: Fp12[BN254_Snarks]
+      r16 {.noInit.}: Fp12[BN254_Snarks]
-      i17 {.noInit.}: Fp12[BN254_Snarks]
+      r17 {.noInit.}: Fp12[BN254_Snarks]
-      i18 {.noInit.}: Fp12[BN254_Snarks]
+      r18 {.noInit.}: Fp12[BN254_Snarks]
-      i20 {.noInit.}: Fp12[BN254_Snarks]
+      r20 {.noInit.}: Fp12[BN254_Snarks]
-      i21 {.noInit.}: Fp12[BN254_Snarks]
+      r21 {.noInit.}: Fp12[BN254_Snarks]
-      i22 {.noInit.}: Fp12[BN254_Snarks]
+      r22 {.noInit.}: Fp12[BN254_Snarks]
-      i26 {.noInit.}: Fp12[BN254_Snarks]
+      r26 {.noInit.}: Fp12[BN254_Snarks]
-      i27 {.noInit.}: Fp12[BN254_Snarks]
+      r27 {.noInit.}: Fp12[BN254_Snarks]
-      i61 {.noInit.}: Fp12[BN254_Snarks]
+      r61 {.noInit.}: Fp12[BN254_Snarks]
-    i15.cyclotomic_square(x10001110)
+    r15.cyclotomic_square(x10001110)
-    i15 *= x1001010
+    r15 *= x1001010
-    i16.prod(x10001110, i15)
+    r16.prod(x10001110, r15)
-    i17.prod(x1111, i16)
+    r17.prod(x1111, r16)
-    i18.prod(i16, i17)
+    r18.prod(r16, r17)
-    i20.cyclotomic_square(i18)
+    r20.cyclotomic_square(r18)
-    i20 *= i17
+    r20 *= r17
-    i21.prod(x1111000, i20)
+    r21.prod(x1111000, r20)
-    i22.prod(i15, i21)
+    r22.prod(r15, r21)
-    i26.cyclotomic_square(i22)
+    r26.cyclotomic_square(r22)
-    i26.cyclotomic_square()
+    r26.cyclotomic_square()
-    i26 *= i22
+    r26 *= r22
-    i26 *= i18
+    r26 *= r18
-    i27.prod(i22, i26)
+    r27.prod(r22, r26)
-    i61.prod(i26, i27)
+    r61.prod(r26, r27)
-    i61.cycl_sqr_repeated(17)
+    r61.cycl_sqr_repeated(17)
-    i61 *= i27
+    r61 *= r27
-    i61.cycl_sqr_repeated(14)
+    r61.cycl_sqr_repeated(14)
-    i61 *= i21
+    r61 *= r21
-    r = i61
+    r = r61
    r.cycl_sqr_repeated(16)
-    r *= i20
+    r *= r20
    if invert:
      r.cyclotomic_inv()
--- a/constantine/curves/bn254_snarks_sqrt.nim
+++ b/constantine/curves/bn254_snarks_sqrt.nim
@ -0,0 +1,158 @@
 # Constantine
 # Copyright (c) 2018-2019    Status Research & Development GmbH
 # Copyright (c) 2020-Present Mamy André-Ratsimbazafy
 # Licensed and distributed under either of
 #   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
 #   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
 # at your option. This file may not be copied, modified, or distributed except according to those terms.
 import
  ../config/curves,
  ../arithmetic/finite_fields
 # ############################################################
 #
 #           Specialized invsqrt for BN254-Snarks
 #
 # ############################################################
 func invsqrt_addchain*(r: var Fp[BN254_Snarks], a: Fp[BN254_Snarks]) =
  ## Raise `a` to a fixed, hard-coded exponent with an addition chain
  ## and store the result in `r`.
  ##
  ## NOTE(review): going by the function name, the exponent is presumably
  ## the one that yields 1/sqrt(a) - confirm against the tool/parameters
  ## that generated this chain.
  ##
  ## Structure:
  ## 1. A table of small powers is built from `a`. Each `x<bits>` temporary
  ##    holds `a` raised to the exponent whose binary writing is `<bits>`
  ##    (e.g. `x10` = a^2, `x101` = a^2 * a^3 = a^5).
  ## 2. `r` is then driven by alternating runs of squarings
  ##    (`square_repeated(n)` counts as `n` operations) and
  ##    multiplications by table entries.
  ##
  ## The `# x + y = z` comments are a running operation count; each one
  ## accounts for the group of statements that follows it.
  ##
  ## `r` may alias `a`: the input is copied first because it is still
  ## needed by the very last multiplication, after `r` has been overwritten.
  let a = a # local copy so that `r.invsqrt_addchain(r)` reads the original input
  var
    x10       {.noInit.}: Fp[BN254_Snarks]
    x11       {.noInit.}: Fp[BN254_Snarks]
    x101      {.noInit.}: Fp[BN254_Snarks]
    x110      {.noInit.}: Fp[BN254_Snarks]
    x1000     {.noInit.}: Fp[BN254_Snarks]
    x1101     {.noInit.}: Fp[BN254_Snarks]
    x10010    {.noInit.}: Fp[BN254_Snarks]
    x10011    {.noInit.}: Fp[BN254_Snarks]
    x10100    {.noInit.}: Fp[BN254_Snarks]
    x10111    {.noInit.}: Fp[BN254_Snarks]
    x11100    {.noInit.}: Fp[BN254_Snarks]
    x100000   {.noInit.}: Fp[BN254_Snarks]
    x100011   {.noInit.}: Fp[BN254_Snarks]
    x101011   {.noInit.}: Fp[BN254_Snarks]
    x101111   {.noInit.}: Fp[BN254_Snarks]
    x1000001  {.noInit.}: Fp[BN254_Snarks]
    x1010011  {.noInit.}: Fp[BN254_Snarks]
    x1011011  {.noInit.}: Fp[BN254_Snarks]
    x1100001  {.noInit.}: Fp[BN254_Snarks]
    x1110101  {.noInit.}: Fp[BN254_Snarks]
    x10010001 {.noInit.}: Fp[BN254_Snarks]
    x10010101 {.noInit.}: Fp[BN254_Snarks]
    x10110101 {.noInit.}: Fp[BN254_Snarks]
    x10111011 {.noInit.}: Fp[BN254_Snarks]
    x11000001 {.noInit.}: Fp[BN254_Snarks]
    x11000011 {.noInit.}: Fp[BN254_Snarks]
    x11010011 {.noInit.}: Fp[BN254_Snarks]
    x11100001 {.noInit.}: Fp[BN254_Snarks]
    x11100011 {.noInit.}: Fp[BN254_Snarks]
    x11100111 {.noInit.}: Fp[BN254_Snarks]
  # Table of small powers of `a`: one field operation per line.
  x10       .square(a)
  x11       .prod(x10, a)
  x101      .prod(x10, x11)
  x110      .prod(x101, a)
  x1000     .prod(x10, x110)
  x1101     .prod(x101, x1000)
  x10010    .prod(x101, x1101)
  x10011    .prod(x10010, a)
  x10100    .prod(x10011, a)
  x10111    .prod(x11, x10100)
  x11100    .prod(x101, x10111)
  x100000   .prod(x1101, x10011)
  x100011   .prod(x11, x100000)
  x101011   .prod(x1000, x100011)
  x101111   .prod(x10011, x11100)
  x1000001  .prod(x10010, x101111)
  x1010011  .prod(x10010, x1000001)
  x1011011  .prod(x1000, x1010011)
  x1100001  .prod(x110, x1011011)
  x1110101  .prod(x10100, x1100001)
  x10010001 .prod(x11100, x1110101)
  x10010101 .prod(x100000, x1110101)
  x10110101 .prod(x100000, x10010101)
  x10111011 .prod(x110, x10110101)
  x11000001 .prod(x110, x10111011)
  x11000011 .prod(x10, x11000001)
  x11010011 .prod(x10010, x11000001)
  x11100001 .prod(x100000, x11000001)
  x11100011 .prod(x10, x11100001)
  x11100111 .prod(x110, x11100001) # 30 operations
  # 30 + 27 = 57 operations
  r.square(x11000001)
  r.square_repeated(7)
  r *= x10010001
  r.square_repeated(10)
  r *= x11100111
  r.square_repeated(7)
  # 57 + 19 = 76 operations
  r *= x10111
  r.square_repeated(9)
  r *= x10011
  r.square_repeated(7)
  r *= x1101
  # 76 + 33 = 109 operations
  r.square_repeated(14)
  r *= x1010011
  r.square_repeated(9)
  r *= x11100001
  r.square_repeated(8)
  # 109 + 18 = 127 operations
  r *= x1000001
  r.square_repeated(10)
  r *= x1011011
  r.square_repeated(5)
  r *= x1101
  # 127 + 34 = 161 operations
  r.square_repeated(8)
  r *= x11
  r.square_repeated(12)
  r *= x101011
  r.square_repeated(12)
  # 161 + 25 = 186 operations
  r *= x10111011
  r.square_repeated(8)
  r *= x101111
  r.square_repeated(14)
  r *= x10110101
  # 186 + 28 = 214 operations
  r.square_repeated(9)
  r *= x10010001
  r.square_repeated(5)
  r *= x1101
  r.square_repeated(12)
  # 214 + 22 = 236 operations
  r *= x11100011
  r.square_repeated(8)
  r *= x10010101
  r.square_repeated(11)
  r *= x11010011
  # 236 + 32 = 268 operations
  r.square_repeated(7)
  r *= x1100001
  r.square_repeated(11)
  r *= x100011
  r.square_repeated(12)
  # 268 + 20 = 288 operations
  r *= x1011011
  r.square_repeated(9)
  r *= x11000011
  r.square_repeated(8)
  r *= x11100111
  # 288 + 13 = 301 operations
  r.square_repeated(7)
  r *= x1110101
  r.square_repeated(4)
  r *= a
--- a/constantine/curves/bw6_761_inversion.nim
+++ b/constantine/curves/bw6_761_inversion.nim
@ -0,0 +1,376 @@
 # Constantine
 # Copyright (c) 2018-2019    Status Research & Development GmbH
 # Copyright (c) 2020-Present Mamy André-Ratsimbazafy
 # Licensed and distributed under either of
 #   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
 #   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
 # at your option. This file may not be copied, modified, or distributed except according to those terms.
 import
  ../config/curves,
  ../arithmetic/finite_fields
 # ############################################################
 #
 #           Specialized inversion for BW6-761
 #
 # ############################################################
 func inv_addchain*(r: var Fp[BW6_761], a: Fp[BW6_761]) =
  ## Raise `a` to a fixed, hard-coded exponent with an addition chain
  ## and store the result in `r`.
  ##
  ## NOTE(review): going by the function name and the file header, the
  ## exponent is presumably the one that yields the modular inverse of `a`
  ## - confirm against the tool/parameters that generated this chain.
  ##
  ## Structure:
  ## 1. A table of small powers is built from `a`. Each `x<bits>` temporary
  ##    holds `a` raised to the exponent whose binary writing is `<bits>`
  ##    (e.g. `x10` = a^2, `x101` = a^2 * a^3 = a^5).
  ## 2. `r` is then driven by alternating runs of squarings
  ##    (`square_repeated(n)` counts as `n` operations) and
  ##    multiplications by table entries or by `a` itself.
  ##
  ## The `# x + y = z operations` comments are a running operation count;
  ## each one accounts for the group of statements that follows it.
  ## The chain totals 886 operations.
  let a = a # ensure a.inv_addchain(a) is OK
  var
    x10       {.noInit.}: Fp[BW6_761]
    x11       {.noInit.}: Fp[BW6_761]
    x101      {.noInit.}: Fp[BW6_761]
    x111      {.noInit.}: Fp[BW6_761]
    x1001     {.noInit.}: Fp[BW6_761]
    x1011     {.noInit.}: Fp[BW6_761]
    x1101     {.noInit.}: Fp[BW6_761]
    x1111     {.noInit.}: Fp[BW6_761]
    x10001    {.noInit.}: Fp[BW6_761]
    x10010    {.noInit.}: Fp[BW6_761]
    x10011    {.noInit.}: Fp[BW6_761]
    x10111    {.noInit.}: Fp[BW6_761]
    x11001    {.noInit.}: Fp[BW6_761]
    x11011    {.noInit.}: Fp[BW6_761]
    x11101    {.noInit.}: Fp[BW6_761]
    x11111    {.noInit.}: Fp[BW6_761]
    x100001   {.noInit.}: Fp[BW6_761]
    x100011   {.noInit.}: Fp[BW6_761]
    x100101   {.noInit.}: Fp[BW6_761]
    x100111   {.noInit.}: Fp[BW6_761]
    x101001   {.noInit.}: Fp[BW6_761]
    x101011   {.noInit.}: Fp[BW6_761]
    x101101   {.noInit.}: Fp[BW6_761]
    x101111   {.noInit.}: Fp[BW6_761]
    x110001   {.noInit.}: Fp[BW6_761]
    x110011   {.noInit.}: Fp[BW6_761]
    x110101   {.noInit.}: Fp[BW6_761]
    x110111   {.noInit.}: Fp[BW6_761]
    x111001   {.noInit.}: Fp[BW6_761]
    x111011   {.noInit.}: Fp[BW6_761]
    x111101   {.noInit.}: Fp[BW6_761]
    x1111010  {.noInit.}: Fp[BW6_761]
    x1111111  {.noInit.}: Fp[BW6_761]
    x11111110 {.noInit.}: Fp[BW6_761]
    x11111111 {.noInit.}: Fp[BW6_761]
  # Table of small powers of `a`: one field operation per line.
  x10       .square(a)
  x11       .prod(a, x10)
  x101      .prod(x10, x11)
  x111      .prod(x10, x101)
  x1001     .prod(x10, x111)
  x1011     .prod(x10, x1001)
  x1101     .prod(x10, x1011)
  x1111     .prod(x10, x1101)
  x10001    .prod(x10, x1111)
  x10010    .prod(a, x10001)
  x10011    .prod(a, x10010)
  x10111    .prod(x101, x10010)
  x11001    .prod(x10, x10111)
  x11011    .prod(x10, x11001)
  x11101    .prod(x10, x11011)
  x11111    .prod(x10, x11101)
  x100001   .prod(x10, x11111)
  x100011   .prod(x10, x100001)
  x100101   .prod(x10, x100011)
  x100111   .prod(x10, x100101)
  x101001   .prod(x10, x100111)
  x101011   .prod(x10, x101001)
  x101101   .prod(x10, x101011)
  x101111   .prod(x10, x101101)
  x110001   .prod(x10, x101111)
  x110011   .prod(x10, x110001)
  x110101   .prod(x10, x110011)
  x110111   .prod(x10, x110101)
  x111001   .prod(x10, x110111)
  x111011   .prod(x10, x111001)
  x111101   .prod(x10, x111011)
  x1111010  .square(x111101)
  x1111111  .prod(x101, x1111010)
  x11111110 .square(x1111111)
  x11111111 .prod(a, x11111110)
  # 35 operations
  # TODO: we can accumulate in a partially reduced
  #       doubled-size `r` to avoid the final subtractions.
  #       and only reduce at the end.
  #       This requires the number of op to be less than log2(p)
  #       (761 bits for BW6-761)
  # 35 + 8 = 43 operations
  r.prod(x100001, x11111111)
  r.square_repeated(3)
  r *= x10111
  r.square_repeated(2)
  r *= a
  # 43 + 22 = 65 operations
  r.square_repeated(9)
  r *= x1001
  r.square_repeated(7)
  r *= x11111
  r.square_repeated(4)
  # 65 + 17 = 82 operations
  r *= x111
  r.square_repeated(9)
  r *= x1111
  r.square_repeated(5)
  r *= x111
  # 82 + 29 = 111 operations
  r.square_repeated(11)
  r *= x101011
  r.square_repeated(7)
  r *= x100011
  r.square_repeated(9)
  # 111 + 28 = 139 operations
  r *= x11111
  r.square_repeated(8)
  r *= x100101
  r.square_repeated(17)
  r *= x100111
  # 139 + 22 = 161 operations
  r.square_repeated(4)
  r *= x1101
  r.square_repeated(9)
  r *= x11111111
  r.square_repeated(7)
  # 161 + 15 = 176 operations
  r *= x11111
  r.square_repeated(6)
  r *= x10111
  r.square_repeated(6)
  r *= x1001
  # 176 + 22 = 198 operations
  r.square_repeated(4)
  r *= x11
  r.square_repeated(6)
  r *= x11
  r.square_repeated(10)
  # 198 + 16 = 214 operations
  r *= x110101
  r.square_repeated(2)
  r *= a
  r.square_repeated(11)
  r *= x11101
  # 214 + 24 = 238 operations
  r.square_repeated(6)
  r *= x101
  r.square_repeated(7)
  r *= x1101
  r.square_repeated(9)
  # 238 + 21 = 259 operations
  r *= x100001
  r.square_repeated(7)
  r *= x100101
  r.square_repeated(11)
  r *= x100111
  # 259 + 28 = 287 operations
  r.square_repeated(7)
  r *= x101111
  r.square_repeated(6)
  r *= x11111
  r.square_repeated(13)
  # 287 + 15 = 302 operations
  r *= x100001
  r.square_repeated(6)
  r *= x111011
  r.square_repeated(6)
  r *= x111001
  # 302 + 27 = 329 operations
  r.square_repeated(10)
  r *= x10111
  r.square_repeated(11)
  r *= x111101
  r.square_repeated(4)
  # 329 + 17 = 346 operations
  r *= x1101
  r.square_repeated(8)
  r *= x110001
  r.square_repeated(6)
  r *= x110001
  # 346 + 20 = 366 operations
  r.square_repeated(5)
  r *= x11001
  r.square_repeated(3)
  r *= x11
  r.square_repeated(10)
  # 366 + 16 = 382 operations
  r *= x100111
  r.square_repeated(5)
  r *= x1001
  r.square_repeated(8)
  r *= x11001
  # 382 + 25 = 407 operations
  r.square_repeated(10)
  r *= x1111
  r.square_repeated(7)
  r *= x11101
  r.square_repeated(6)
  # 407 + 20 = 427 operations
  r *= x11101
  r.square_repeated(9)
  r *= x11111111
  r.square_repeated(8)
  r *= x100101
  # 427 + 27 = 454 operations
  r.square_repeated(6)
  r *= x101101
  r.square_repeated(10)
  r *= x100011
  r.square_repeated(9)
  # 454 + 20 = 474 operations
  r *= x1001
  r.square_repeated(8)
  r *= x1101
  r.square_repeated(9)
  r *= x100111
  # 474 + 25 = 499 operations
  r.square_repeated(8)
  r *= x100011
  r.square_repeated(6)
  r *= x101101
  r.square_repeated(9)
  # 499 + 16 = 515 operations
  r *= x100101
  r.square_repeated(4)
  r *= x1111
  r.square_repeated(9)
  r *= x1111111
  # 515 + 25 = 540 operations
  r.square_repeated(6)
  r *= x11001
  r.square_repeated(8)
  r *= x111
  r.square_repeated(9)
  # 540 + 15 = 555 operations
  r *= x111011
  r.square_repeated(5)
  r *= x10011
  r.square_repeated(7)
  r *= x100111
  # 555 + 22 = 577 operations
  r.square_repeated(5)
  r *= x10111
  r.square_repeated(9)
  r *= x111001
  r.square_repeated(6)
  # 577 + 14 = 591 operations
  r *= x111101
  r.square_repeated(9)
  r *= x11111111
  r.square_repeated(2)
  r *= x11
  # 591 + 21 = 612 operations
  r.square_repeated(7)
  r *= x10111
  r.square_repeated(6)
  r *= x10011
  r.square_repeated(6)
  # 612 + 18 = 630 operations
  r *= x101
  r.square_repeated(9)
  r *= x10001
  r.square_repeated(6)
  r *= x11011
  # 630 + 27 = 657 operations
  r.square_repeated(10)
  r *= x100101
  r.square_repeated(7)
  r *= x110011
  r.square_repeated(8)
  # 657 + 13 = 670 operations
  r *= x111101
  r.square_repeated(7)
  r *= x100011
  r.square_repeated(3)
  r *= x111
  # 670 + 26 = 696 operations
  r.square_repeated(10)
  r *= x1011
  r.square_repeated(11)
  r *= x110011
  r.square_repeated(3)
  # 696 + 17 = 713 operations
  r *= x111
  r.square_repeated(9)
  r *= x101011
  r.square_repeated(5)
  r *= x10111
  # 713 + 21 = 734 operations
  r.square_repeated(7)
  r *= x101011
  r.square_repeated(2)
  r *= x11
  r.square_repeated(10)
  # 734 + 19 = 753 operations
  r *= x101001
  r.square_repeated(10)
  r *= x110111
  r.square_repeated(6)
  r *= x111001
  # 753 + 23 = 776 operations
  r.square_repeated(6)
  r *= x101001
  r.square_repeated(9)
  r *= x100111
  r.square_repeated(6)
  # 776 + 12 = 788 operations
  r *= x110011
  r.square_repeated(7)
  r *= x100001
  r.square_repeated(2)
  r *= x11
  # 788 + 39 = 827 operations
  r.square_repeated(21)
  r *= a
  r.square_repeated(11)
  r *= x101111
  r.square_repeated(5)
  # 827 + 55 = 882 operations
  r *= x1001
  r.square_repeated(7)
  r *= x11101
  r.square_repeated(45)
  r *= x10001
  # 882 + 4 = 886 operations
  r.square_repeated(3)
  r *= a
--- a/constantine/curves/bw6_761_sqrt.nim
+++ b/constantine/curves/bw6_761_sqrt.nim
@ -0,0 +1,373 @@
 # Constantine
 # Copyright (c) 2018-2019    Status Research & Development GmbH
 # Copyright (c) 2020-Present Mamy André-Ratsimbazafy
 # Licensed and distributed under either of
 #   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
 #   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
 # at your option. This file may not be copied, modified, or distributed except according to those terms.
 import
  ../config/curves,
  ../arithmetic/finite_fields
 # ############################################################
 #
 #           Specialized invsqrt for BW6-761
 #
 # ############################################################
 func invsqrt_addchain*(r: var Fp[BW6_761], a: Fp[BW6_761]) =
  ## Raise `a` to a fixed, hard-coded exponent with an addition chain
  ## and store the result in `r`.
  ##
  ## NOTE(review): going by the function name and the file header, the
  ## exponent is presumably the one that yields 1/sqrt(a) - confirm against
  ## the tool/parameters that generated this chain.
  ##
  ## Structure:
  ## 1. A table of small powers is built from `a`. Each `x<bits>` temporary
  ##    holds `a` raised to the exponent whose binary writing is `<bits>`
  ##    (e.g. `x10` = a^2, `x101` = a^2 * a^3 = a^5).
  ## 2. `r` is then driven by alternating runs of squarings
  ##    (`square_repeated(n)` counts as `n` operations) and
  ##    multiplications by table entries or by `a` itself.
  ##
  ## The `# x + y = z operations` comments are a running operation count;
  ## each one accounts for the group of statements that follows it.
  ## The chain totals 883 operations.
  ##
  ## `r` may alias `a`: the input is copied first because it is multiplied
  ## into `r` several times after `r` has been overwritten.
  let a = a # local copy so that `r.invsqrt_addchain(r)` reads the original input
  var
    x10       {.noInit.}: Fp[BW6_761]
    x11       {.noInit.}: Fp[BW6_761]
    x101      {.noInit.}: Fp[BW6_761]
    x111      {.noInit.}: Fp[BW6_761]
    x1001     {.noInit.}: Fp[BW6_761]
    x1011     {.noInit.}: Fp[BW6_761]
    x1101     {.noInit.}: Fp[BW6_761]
    x1111     {.noInit.}: Fp[BW6_761]
    x10001    {.noInit.}: Fp[BW6_761]
    x10010    {.noInit.}: Fp[BW6_761]
    x10011    {.noInit.}: Fp[BW6_761]
    x10111    {.noInit.}: Fp[BW6_761]
    x11001    {.noInit.}: Fp[BW6_761]
    x11011    {.noInit.}: Fp[BW6_761]
    x11101    {.noInit.}: Fp[BW6_761]
    x11111    {.noInit.}: Fp[BW6_761]
    x100001   {.noInit.}: Fp[BW6_761]
    x100011   {.noInit.}: Fp[BW6_761]
    x100101   {.noInit.}: Fp[BW6_761]
    x100111   {.noInit.}: Fp[BW6_761]
    x101001   {.noInit.}: Fp[BW6_761]
    x101011   {.noInit.}: Fp[BW6_761]
    x101101   {.noInit.}: Fp[BW6_761]
    x101111   {.noInit.}: Fp[BW6_761]
    x110001   {.noInit.}: Fp[BW6_761]
    x110011   {.noInit.}: Fp[BW6_761]
    x110101   {.noInit.}: Fp[BW6_761]
    x110111   {.noInit.}: Fp[BW6_761]
    x111001   {.noInit.}: Fp[BW6_761]
    x111011   {.noInit.}: Fp[BW6_761]
    x111101   {.noInit.}: Fp[BW6_761]
    x1111010  {.noInit.}: Fp[BW6_761]
    x1111111  {.noInit.}: Fp[BW6_761]
    x11111110 {.noInit.}: Fp[BW6_761]
    x11111111 {.noInit.}: Fp[BW6_761]
  # Table of small powers of `a`: one field operation per line.
  x10       .square(a)
  x11       .prod(a, x10)
  x101      .prod(x10, x11)
  x111      .prod(x10, x101)
  x1001     .prod(x10, x111)
  x1011     .prod(x10, x1001)
  x1101     .prod(x10, x1011)
  x1111     .prod(x10, x1101)
  x10001    .prod(x10, x1111)
  x10010    .prod(a, x10001)
  x10011    .prod(a, x10010)
  x10111    .prod(x101, x10010)
  x11001    .prod(x10, x10111)
  x11011    .prod(x10, x11001)
  x11101    .prod(x10, x11011)
  x11111    .prod(x10, x11101)
  x100001   .prod(x10, x11111)
  x100011   .prod(x10, x100001)
  x100101   .prod(x10, x100011)
  x100111   .prod(x10, x100101)
  x101001   .prod(x10, x100111)
  x101011   .prod(x10, x101001)
  x101101   .prod(x10, x101011)
  x101111   .prod(x10, x101101)
  x110001   .prod(x10, x101111)
  x110011   .prod(x10, x110001)
  x110101   .prod(x10, x110011)
  x110111   .prod(x10, x110101)
  x111001   .prod(x10, x110111)
  x111011   .prod(x10, x111001)
  x111101   .prod(x10, x111011)
  x1111010  .square(x111101)
  x1111111  .prod(x101, x1111010)
  x11111110 .square(x1111111)
  x11111111 .prod(a, x11111110)
  # 35 operations
  # TODO: we can accumulate in a partially reduced
  #       doubled-size `r` to avoid the final subtractions.
  #       and only reduce at the end.
  #       This requires the number of op to be less than log2(p)
  #       (761 bits for BW6-761)
  # 35 + 8 = 43 operations
  r.prod(x100001, x11111111)
  r.square_repeated(3)
  r *= x10111
  r.square_repeated(2)
  r *= a
  # 43 + 22 = 65 operations
  r.square_repeated(9)
  r *= x1001
  r.square_repeated(7)
  r *= x11111
  r.square_repeated(4)
  # 65 + 17 = 82 operations
  r *= x111
  r.square_repeated(9)
  r *= x1111
  r.square_repeated(5)
  r *= x111
  # 82 + 29 = 111 operations
  r.square_repeated(11)
  r *= x101011
  r.square_repeated(7)
  r *= x100011
  r.square_repeated(9)
  # 111 + 28 = 139 operations
  r *= x11111
  r.square_repeated(8)
  r *= x100101
  r.square_repeated(17)
  r *= x100111
  # 139 + 22 = 161 operations
  r.square_repeated(4)
  r *= x1101
  r.square_repeated(9)
  r *= x11111111
  r.square_repeated(7)
  # 161 + 15 = 176 operations
  r *= x11111
  r.square_repeated(6)
  r *= x10111
  r.square_repeated(6)
  r *= x1001
  # 176 + 22 = 198 operations
  r.square_repeated(4)
  r *= x11
  r.square_repeated(6)
  r *= x11
  r.square_repeated(10)
  # 198 + 16 = 214 operations
  r *= x110101
  r.square_repeated(2)
  r *= a
  r.square_repeated(11)
  r *= x11101
  # 214 + 24 = 238 operations
  r.square_repeated(6)
  r *= x101
  r.square_repeated(7)
  r *= x1101
  r.square_repeated(9)
  # 238 + 21 = 259 operations
  r *= x100001
  r.square_repeated(7)
  r *= x100101
  r.square_repeated(11)
  r *= x100111
  # 259 + 28 = 287 operations
  r.square_repeated(7)
  r *= x101111
  r.square_repeated(6)
  r *= x11111
  r.square_repeated(13)
  # 287 + 15 = 302 operations
  r *= x100001
  r.square_repeated(6)
  r *= x111011
  r.square_repeated(6)
  r *= x111001
  # 302 + 27 = 329 operations
  r.square_repeated(10)
  r *= x10111
  r.square_repeated(11)
  r *= x111101
  r.square_repeated(4)
  # 329 + 17 = 346 operations
  r *= x1101
  r.square_repeated(8)
  r *= x110001
  r.square_repeated(6)
  r *= x110001
  # 346 + 20 = 366 operations
  r.square_repeated(5)
  r *= x11001
  r.square_repeated(3)
  r *= x11
  r.square_repeated(10)
  # 366 + 16 = 382 operations
  r *= x100111
  r.square_repeated(5)
  r *= x1001
  r.square_repeated(8)
  r *= x11001
  # 382 + 25 = 407 operations
  r.square_repeated(10)
  r *= x1111
  r.square_repeated(7)
  r *= x11101
  r.square_repeated(6)
  # 407 + 20 = 427 operations
  r *= x11101
  r.square_repeated(9)
  r *= x11111111
  r.square_repeated(8)
  r *= x100101
  # 427 + 27 = 454 operations
  r.square_repeated(6)
  r *= x101101
  r.square_repeated(10)
  r *= x100011
  r.square_repeated(9)
  # 454 + 20 = 474 operations
  r *= x1001
  r.square_repeated(8)
  r *= x1101
  r.square_repeated(9)
  r *= x100111
  # 474 + 25 = 499 operations
  r.square_repeated(8)
  r *= x100011
  r.square_repeated(6)
  r *= x101101
  r.square_repeated(9)
  # 499 + 16 = 515 operations
  r *= x100101
  r.square_repeated(4)
  r *= x1111
  r.square_repeated(9)
  r *= x1111111
  # 515 + 25 = 540 operations
  r.square_repeated(6)
  r *= x11001
  r.square_repeated(8)
  r *= x111
  r.square_repeated(9)
  # 540 + 15 = 555 operations
  r *= x111011
  r.square_repeated(5)
  r *= x10011
  r.square_repeated(7)
  r *= x100111
  # 555 + 22 = 577 operations
  r.square_repeated(5)
  r *= x10111
  r.square_repeated(9)
  r *= x111001
  r.square_repeated(6)
  # 577 + 14 = 591 operations
  r *= x111101
  r.square_repeated(9)
  r *= x11111111
  r.square_repeated(2)
  r *= x11
  # 591 + 21 = 612 operations
  r.square_repeated(7)
  r *= x10111
  r.square_repeated(6)
  r *= x10011
  r.square_repeated(6)
  # 612 + 18 = 630 operations
  r *= x101
  r.square_repeated(9)
  r *= x10001
  r.square_repeated(6)
  r *= x11011
  # 630 + 27 = 657 operations
  r.square_repeated(10)
  r *= x100101
  r.square_repeated(7)
  r *= x110011
  r.square_repeated(8)
  # 657 + 13 = 670 operations
  r *= x111101
  r.square_repeated(7)
  r *= x100011
  r.square_repeated(3)
  r *= x111
  # 670 + 26 = 696 operations
  r.square_repeated(10)
  r *= x1011
  r.square_repeated(11)
  r *= x110011
  r.square_repeated(3)
  # 696 + 17 = 713 operations
  r *= x111
  r.square_repeated(9)
  r *= x101011
  r.square_repeated(5)
  r *= x10111
  # 713 + 21 = 734 operations
  r.square_repeated(7)
  r *= x101011
  r.square_repeated(2)
  r *= x11
  r.square_repeated(10)
  # 734 + 19 = 753 operations
  r *= x101001
  r.square_repeated(10)
  r *= x110111
  r.square_repeated(6)
  r *= x111001
  # 753 + 23 = 776 operations
  r.square_repeated(6)
  r *= x101001
  r.square_repeated(9)
  r *= x100111
  r.square_repeated(6)
  # 776 + 12 = 788 operations
  r *= x110011
  r.square_repeated(7)
  r *= x100001
  r.square_repeated(2)
  r *= x11
  # 788 + 39 = 827 operations
  r.square_repeated(21)
  r *= a
  r.square_repeated(11)
  r *= x101111
  r.square_repeated(5)
  # 827 + 55 = 882 operations
  r *= x1001
  r.square_repeated(7)
  r *= x11101
  r.square_repeated(45)
  r *= x10001
  # 882 + 1 = 883 operations
  r.square()
--- a/constantine/curves/zoo_inversions.nim
+++ b/constantine/curves/zoo_inversions.nim
@ -7,11 +7,27 @@
 # at your option. This file may not be copied, modified, or distributed except according to those terms.
 import
  ../config/[curves, type_ff],
  ./bls12_377_inversion,
  ./bls12_381_inversion,
  ./bn254_nogami_inversion,
  ./bn254_snarks_inversion,
  ./bw6_761_inversion,
  ./secp256k1_inversion
 export
  bls12_377_inversion,
  bls12_381_inversion,
  bn254_nogami_inversion,
  bn254_snarks_inversion,
  bw6_761_inversion,
  secp256k1_inversion
 func hasInversionAddchain*(C: static Curve): static bool =
  ## Compile-time predicate: `true` when curve `C` is one of the curves
  ## for which the inversion addition chain is enabled, `false` otherwise.
  #
  # TODO: Secp256k1 is deliberately left out of the enabled set for now:
  #       its addition chain is slower than the GCD-based inversion.
  const enabledCurves = {BN254_Nogami, BN254_Snarks, BLS12_377, BLS12_381, BW6_761}
  when C notin enabledCurves:
    false
  else:
    true
--- a/constantine/curves/zoo_square_roots.nim
+++ b/constantine/curves/zoo_square_roots.nim
@ -8,11 +8,34 @@
 import
  std/macros,
-  ../config/curves,
+  ../config/[curves, type_ff],
-  ./bls12_377_sqrt
+  ./bls12_377_sqrt,
  ./bls12_381_sqrt,
  ./bn254_nogami_sqrt,
  ./bn254_snarks_sqrt,
  ./bw6_761_sqrt
 export
  bls12_377_sqrt,
  bls12_381_sqrt,
  bn254_nogami_sqrt,
  bn254_snarks_sqrt,
  bw6_761_sqrt
 func hasSqrtAddchain*(C: static Curve): static bool =
  ## Compile-time predicate: `true` when curve `C` belongs to the set of
  ## curves flagged as having a square-root addition chain.
  const sqrtChainCurves = {BLS12_381, BN254_Nogami, BN254_Snarks, BW6_761}
  when C notin sqrtChainCurves:
    false
  else:
    true
 {.experimental: "dynamicBindSym".}
 macro tonelliShanks*(C: static Curve, value: untyped): untyped =
  ## Resolve a Tonelli-Shanks related constant used for square roots.
  ##
  ## The symbol looked up is named `<C>_TonelliShanks_<value>`, where
  ## `<C>` is the stringified curve and `<value>` the stringified
  ## `value` argument. Binding a computed name relies on the
  ## `dynamicBindSym` experimental switch enabled just above.
  let symbolName = $C & "_TonelliShanks_" & $value
  result = bindSym(symbolName)
 func hasTonelliShanksAddchain*(C: static Curve): static bool =
  ## Compile-time predicate: `true` only for BLS12_377, the single curve
  ## currently flagged as having a Tonelli-Shanks addition chain.
  when C == BLS12_377:
    true
  else:
    false
--- a/tests/t_finite_fields_sqrt.nim
+++ b/tests/t_finite_fields_sqrt.nim
@ -125,6 +125,7 @@ proc main() =
    randomSqrtCheck BN254_Snarks
    randomSqrtCheck BLS12_377 # p ≢ 3 (mod 4)
    randomSqrtCheck BLS12_381
    randomSqrtCheck BW6_761
  suite "Modular square root - 32-bit bugs highlighted by property-based testing " & " [" & $WordBitwidth & "-bit mode]":
    # test "FKM12_447 - #30": - Deactivated, we don't support the curve as no one uses it.