Square Root & Inversion addition chains - 20% perf increase (#132)

* Addition chain for sqrt BLS12-381: 20% perf improvement * sqrt addchain for BN254_Snarks - 20% perf improvement as well * Fix operation count [skip ci] * BLS12-377 sqrt - 10% perf improvement * sqrt addition chain for BW6-761 - 6% speedup * BN254_Nogami inversion addchain * sqrt addchain for BN254_Nogami * Inversion addchain for BLS12-377 * inversion addition chain for BW6-761
2021-01-23 20:55:40 +01:00 · 2021-01-23 20:55:40 +01:00 · 82819b1b10
parent a02dd19d36
commit 82819b1b10
19 changed files with 1988 additions and 97 deletions
--- a/benchmarks/bench_fields_template.nim
+++ b/benchmarks/bench_fields_template.nim
@ -22,15 +22,15 @@ import
  ./bench_blueprint

 export notes
-proc separator*() = separator(145)
+proc separator*() = separator(165)

 proc report(op, field: string, start, stop: MonoTime, startClk, stopClk: int64, iters: int) =
  let ns = inNanoseconds((stop-start) div iters)
  let throughput = 1e9 / float64(ns)
  when SupportsGetTicks:
-    echo &"{op:<50} {field:<18} {throughput:>15.3f} ops/s     {ns:>9} ns/op     {(stopClk - startClk) div iters:>9} CPU cycles (approx)"
+    echo &"{op:<70} {field:<18} {throughput:>15.3f} ops/s     {ns:>9} ns/op     {(stopClk - startClk) div iters:>9} CPU cycles (approx)"
  else:
-    echo &"{op:<50} {field:<18} {throughput:>15.3f} ops/s     {ns:>9} ns/op"
+    echo &"{op:<70} {field:<18} {throughput:>15.3f} ops/s     {ns:>9} ns/op"

 macro fixFieldDisplay(T: typedesc): untyped =
  # At compile-time, enums are integers and their display is buggy
@ -93,20 +93,20 @@ proc invBench*(T: typedesc, iters: int) =
  var r: T
  let x = rng.random_unsafe(T)
  preventOptimAway(r)
-  bench("Inversion (constant-time default method)", T, iters):
+  bench("Inversion (constant-time default impl)", T, iters):
    r.inv(x)

 proc invEuclidBench*(T: typedesc, iters: int) =
  var r: T
  let x = rng.random_unsafe(T)
  preventOptimAway(r)
-  bench("Inversion via constant-time Euclid", T, iters):
+  bench("Inversion (constant-time Euclid)", T, iters):
    r.inv_euclid(x)

 proc invPowFermatBench*(T: typedesc, iters: int) =
  let x = rng.random_unsafe(T)
  const exponent = T.getInvModExponent()
-  bench("Inversion via exponentiation p-2 (Little Fermat)", T, iters):
+  bench("Inversion (exponentiation p-2, Little Fermat)", T, iters):
    var r = x
    r.powUnsafeExponent(exponent)

@ -114,15 +114,39 @@ proc invAddChainBench*(T: typedesc, iters: int) =
  var r: T
  let x = rng.random_unsafe(T)
  preventOptimAway(r)
-  bench("Inversion via addition chain", T, iters):
+  bench("Inversion (addition chain)", T, iters):
    r.inv_addchain(x)

 proc sqrtBench*(T: typedesc, iters: int) =
  let x = rng.random_unsafe(T)
-  bench("Square Root + square check (constant-time)", T, iters):
+  bench("Square Root + isSquare (constant-time default impl)", T, iters):
    var r = x
    discard r.sqrt_if_square()

+proc sqrtP3mod4Bench*(T: typedesc, iters: int) =
+  let x = rng.random_unsafe(T)
+  bench("SquareRoot + isSquare (p ≡ 3 (mod 4) exponentiation)", T, iters):
+    var r = x
+    discard r.sqrt_if_square_p3mod4()
+
+proc sqrtAddChainBench*(T: typedesc, iters: int) =
+  let x = rng.random_unsafe(T)
+  bench("SquareRoot + isSquare (addition chain)", T, iters):
+    var r = x
+    discard r.sqrt_if_square_addchain()
+
+proc sqrtTonelliBench*(T: typedesc, iters: int) =
+  let x = rng.random_unsafe(T)
+  bench("SquareRoot + isSquare (constant-time Tonelli-Shanks exponentiation)", T, iters):
+    var r = x
+    discard r.sqrt_if_square_tonelli_shanks(useAddChain = false)
+
+proc sqrtTonelliAddChainBench*(T: typedesc, iters: int) =
+  let x = rng.random_unsafe(T)
+  bench("SquareRoot + isSquare (constant-time Tonelli-Shanks addchain)", T, iters):
+    var r = x
+    discard r.sqrt_if_square_tonelli_shanks(useAddChain = true)
+
 proc powBench*(T: typedesc, iters: int) =
  let x = rng.random_unsafe(T)
  let exponent = rng.random_unsafe(BigInt[T.C.getCurveOrderBitwidth()])
--- a/benchmarks/bench_fp.nim
+++ b/benchmarks/bench_fp.nim
@ -8,9 +8,10 @@

 import
  # Internals
-  ../constantine/config/curves,
+  ../constantine/config/[curves, common],
  ../constantine/arithmetic,
  ../constantine/io/io_bigints,
+  ../constantine/curves/[zoo_inversions, zoo_square_roots],
  # Helpers
  ../helpers/static_for,
  ./bench_fields_template,
@ -24,8 +25,8 @@ import
 # ############################################################


-const Iters = 1_000_000
-const ExponentIters = 1000
+const Iters = 100_000
+const ExponentIters = 100
 const AvailableCurves = [
  # P224,
  BN254_Nogami,
@ -35,6 +36,7 @@ const AvailableCurves = [
  # Secp256k1,
  BLS12_377,
  BLS12_381,
+  BW6_761
 ]

 proc main() =
@ -50,9 +52,15 @@ proc main() =
    sqrBench(Fp[curve], Iters)
    invEuclidBench(Fp[curve], ExponentIters)
    invPowFermatBench(Fp[curve], ExponentIters)
-    when curve in {BN254_Snarks, BLS12_381}:
+    when curve.hasInversionAddchain():
      invAddChainBench(Fp[curve], ExponentIters)
-    sqrtBench(Fp[curve], ExponentIters)
+    when (BaseType(curve.Mod.limbs[0]) and 3) == 3:
+      sqrtP3mod4Bench(Fp[curve], ExponentIters)
+    when curve.hasSqrtAddchain():
+      sqrtAddChainBench(Fp[curve], ExponentIters)
+    when curve in {BLS12_377}:
+      sqrtTonelliBench(Fp[curve], ExponentIters)
+      sqrtTonelliAddChainBench(Fp[curve], ExponentIters)
    # Exponentiation by a "secret" of size ~the curve order
    powBench(Fp[curve], ExponentIters)
    powUnsafeBench(Fp[curve], ExponentIters)
--- a/constantine.nimble
+++ b/constantine.nimble
@ -218,6 +218,7 @@ proc test(flags, path: string, commandFile = false) =
    exec command
  else:
    exec "echo \'" & command & "\' >> " & buildParallel
+    exec "echo \"------------------------------------------------------\""

 proc buildBench(benchName: string, compiler = "", useAsm = true, run = false) =
  if not dirExists "build":
--- a/constantine/arithmetic/finite_fields.nim
+++ b/constantine/arithmetic/finite_fields.nim
@ -386,6 +386,12 @@ func square_repeated*(r: var FF, num: int) {.inline.} =
  for _ in 0 ..< num:
    r.square()

+func square_repeated*(r: var FF, a: FF, num: int) {.inline.} =
+  ## Repeated squarings: r <- a^(2^num). Always squares at least once, so callers must pass num >= 1
+  r.square(a)
+  for _ in 1 ..< num:
+    r.square()
+
 func `*=`*(a: var FF, b: static int) {.inline.} =
  ## Multiplication by a small integer known at compile-time
  # Implementation:
--- a/constantine/arithmetic/finite_fields_inversion.nim
+++ b/constantine/arithmetic/finite_fields_inversion.nim
@ -36,7 +36,7 @@ func inv*(r: var Fp, a: Fp) {.inline.} =
  # neither for Secp256k1 nor BN curves
  # Performance is slower than GCD
  # To be revisited with faster squaring/multiplications
-  when Fp.C in {BN254_Snarks, BLS12_381}:
+  when Fp.C.hasInversionAddchain():
    r.inv_addchain(a)
  else:
    r.inv_euclid(a)
@ -48,10 +48,7 @@ func inv*(a: var Fp) {.inline.} =
  ## Incidentally this avoids extra check
  ## to convert Jacobian and Projective coordinates
  ## to affine for elliptic curve
-  # For now we don't activate the addition chains
-  # for Secp256k1 nor BN curves
-  # Performance is slower than GCD
-  when Fp.C in {BN254_Snarks, BLS12_381}:
+  when Fp.C.hasInversionAddchain():
    a.inv_addchain(a)
  else:
    a.inv_euclid(a)
--- a/constantine/arithmetic/finite_fields_square_root.nim
+++ b/constantine/arithmetic/finite_fields_square_root.nim
@ -46,6 +46,10 @@ func isSquare*(a: Fp): SecretBool {.inline.} =
 # Specialized routine for p ≡ 3 (mod 4)
 # ------------------------------------------------------------

+func hasP3mod4_primeModulus(C: static Curve): static bool =
+  ## Returns true iff p ≡ 3 (mod 4)
+  (BaseType(C.Mod.limbs[0]) and 3) == 3
+
 func sqrt_p3mod4(a: var Fp) {.inline.} =
  ## Compute the square root of ``a``
  ##
@ -93,7 +97,7 @@ func sqrt_invsqrt_if_square_p3mod4(sqrt, invsqrt: var Fp, a: Fp): SecretBool {.i
  test.square(sqrt)
  result = test == a

-func sqrt_if_square_p3mod4(a: var Fp): SecretBool {.inline.} =
+func sqrt_if_square_p3mod4*(a: var Fp): SecretBool {.inline.} =
  ## If ``a`` is a square, compute the square root of ``a``
  ## if not, ``a`` is unmodified.
  ##
@ -108,14 +112,60 @@ func sqrt_if_square_p3mod4(a: var Fp): SecretBool {.inline.} =
  result = sqrt_invsqrt_if_square_p3mod4(sqrt, invsqrt, a)
  a.ccopy(sqrt, result)

+# Specialized routines for addchain-based square roots
+# ------------------------------------------------------------
+
+func sqrt_addchain(a: var Fp) {.inline.} =
+  ## Compute the square root of ``a``
+  ##
+  ## This requires ``a`` to be a square
+  ## The result is undefined otherwise
+  ##
+  ## The square root, if it exist is multivalued,
+  ## i.e. both x² == (-x)²
+  ## This procedure returns a deterministic result
+  var invsqrt {.noInit.}: Fp
+  invsqrt.invsqrt_addchain(a)
+  a *= invsqrt
+
+func sqrt_invsqrt_addchain(sqrt, invsqrt: var Fp, a: Fp) {.inline.} =
+  ## If ``a`` is a square, compute the square root of ``a`` in sqrt
+  ## and the inverse square root of a in invsqrt
+  invsqrt.invsqrt_addchain(a)
+  sqrt.prod(invsqrt, a)
+
+func sqrt_invsqrt_if_square_addchain(sqrt, invsqrt: var Fp, a: Fp): SecretBool {.inline.} =
+  ## If ``a`` is a square, compute the square root of ``a`` in sqrt
+  ## and the inverse square root of a in invsqrt
+  ##
+  ## If a is not square, sqrt and invsqrt are undefined
+  sqrt_invsqrt_addchain(sqrt, invsqrt, a)
+  var test {.noInit.}: Fp
+  test.square(sqrt)
+  result = test == a
+
+func sqrt_if_square_addchain*(a: var Fp): SecretBool {.inline.} =
+  ## If ``a`` is a square, compute the square root of ``a``
+  ## if not, ``a`` is unmodified.
+  ##
+  ## The square root, if it exist is multivalued,
+  ## i.e. both x² == (-x)²
+  ## This procedure returns a deterministic result
+  var sqrt {.noInit.}, invsqrt {.noInit.}: Fp
+  result = sqrt_invsqrt_if_square_addchain(sqrt, invsqrt, a)
+  a.ccopy(sqrt, result)
+
 # Tonelli Shanks for any prime
 # ------------------------------------------------------------

 func precompute_tonelli_shanks(
       a_pre_exp: var Fp,
-       a: Fp) =
+       a: Fp, useAddChain: static bool) =
  a_pre_exp = a
-  a_pre_exp.powUnsafeExponent(Fp.C.tonelliShanks(exponent))
+  when useAddChain:
+    a_pre_exp.precompute_tonelli_shanks_addchain(a)
+  else:
+    a_pre_exp.powUnsafeExponent(Fp.C.tonelliShanks(exponent))

 func isSquare_tonelli_shanks(
       a, a_pre_exp: Fp): SecretBool =
@ -126,10 +176,9 @@ func isSquare_tonelli_shanks(
  ## a^((p-1-2^e)/(2*2^e))
  const e = Fp.C.tonelliShanks(twoAdicity)
  var r {.noInit.}: Fp
-  r.square(a_pre_exp) # a^(2(q-1-2^e)/(2*2^e)) = a^((q-1)/2^e - 1)
-  r *= a              # a^((q-1)/2^e)
-  for _ in 0 ..< e-1:
-    r.square()        # a^((q-1)/2)
+  r.square(a_pre_exp)    # a^(2(q-1-2^e)/(2*2^e)) = a^((q-1)/2^e - 1)
+  r *= a                 # a^((q-1)/2^e)
+  r.square_repeated(e-1) # a^((q-1)/2)

  result = not(r.isMinusOne())
  # r can be:
@ -143,14 +192,14 @@ func isSquare_tonelli_shanks(
      r.isMinusOne()
    )

-func sqrt_invsqrt_tonelli_shanks(
+func sqrt_invsqrt_tonelli_shanks_pre(
       sqrt, invsqrt: var Fp,
       a, a_pre_exp: Fp) =
  ## Compute the square_root and inverse_square_root
  ## of `a` via constant-time Tonelli-Shanks
  ##
  ## a_pre_exp is a precomputation a^((p-1-2^e)/(2*2^e))
-  ## ThItat is shared with the simultaneous isSquare routine
+  ## That is shared with the simultaneous isSquare routine
  template z: untyped = a_pre_exp
  template r: untyped = invsqrt
  var t {.noInit.}: Fp
@ -165,8 +214,7 @@ func sqrt_invsqrt_tonelli_shanks(
  var buf {.noInit.}: Fp

  for i in countdown(e, 2, 1):
-    for j in 1 .. i-2:
-      b.square()
+    b.square_repeated(i-2)

    let bNotOne = not b.isOne()
    buf.prod(r, root)
@ -178,8 +226,72 @@ func sqrt_invsqrt_tonelli_shanks(

  sqrt.prod(invsqrt, a)

+# ----------------------------------------------
+
+func sqrt_tonelli_shanks(a: var Fp, useAddChain: static bool) {.inline.} =
+  ## Compute the square root of ``a``
+  ##
+  ## This requires ``a`` to be a square
+  ##
+  ## The result is undefined otherwise
+  ##
+  ## The square root, if it exist is multivalued,
+  ## i.e. both x² == (-x)²
+  ## This procedure returns a deterministic result
+  ## This procedure is constant-time
+  var a_pre_exp{.noInit.}, sqrt{.noInit.}, invsqrt{.noInit.}: Fp
+  a_pre_exp.precompute_tonelli_shanks(a, useAddChain)
+  sqrt_invsqrt_tonelli_shanks_pre(sqrt, invsqrt, a, a_pre_exp)
+  a = sqrt
+
+func sqrt_invsqrt_tonelli_shanks(sqrt, invsqrt: var Fp, a: Fp, useAddChain: static bool) {.inline.} =
+  ## Compute the square root and inverse square root of ``a``
+  ##
+  ## This requires ``a`` to be a square
+  ##
+  ## The result is undefined otherwise
+  ##
+  ## The square root, if it exist is multivalued,
+  ## i.e. both x² == (-x)²
+  ## This procedure returns a deterministic result
+  var a_pre_exp{.noInit.}: Fp
+  a_pre_exp.precompute_tonelli_shanks(a, useAddChain)
+  sqrt_invsqrt_tonelli_shanks_pre(sqrt, invsqrt, a, a_pre_exp)
+
+func sqrt_invsqrt_if_square_tonelli_shanks(sqrt, invsqrt: var Fp, a: Fp, useAddChain: static bool): SecretBool  {.inline.} =
+  ## Compute the square root and inverse square root of ``a``
+  ##
+  ## This returns true if ``a`` is square and sqrt/invsqrt contains the square root/inverse square root
+  ##
+  ## The result is undefined otherwise
+  ##
+  ## The square root, if it exist is multivalued,
+  ## i.e. both x² == (-x)²
+  ## This procedure returns a deterministic result
+  var a_pre_exp{.noInit.}: Fp
+  a_pre_exp.precompute_tonelli_shanks(a, useAddChain)
+  result = isSquare_tonelli_shanks(a, a_pre_exp)
+  # ``a`` is an immutable input: outputs are written only to ``sqrt`` and ``invsqrt``
+  sqrt_invsqrt_tonelli_shanks_pre(sqrt, invsqrt, a, a_pre_exp)
+
+func sqrt_if_square_tonelli_shanks*(a: var Fp, useAddChain: static bool): SecretBool {.inline.} =
+  ## If ``a`` is a square, compute the square root of ``a``
+  ## if not, ``a`` is unmodified.
+  ##
+  ## The square root, if it exist is multivalued,
+  ## i.e. both x² == (-x)²
+  ## This procedure returns a deterministic result
+  ## This procedure is constant-time
+  var a_pre_exp{.noInit.}, sqrt{.noInit.}, invsqrt{.noInit.}: Fp
+  a_pre_exp.precompute_tonelli_shanks(a, useAddChain)
+  result = isSquare_tonelli_shanks(a, a_pre_exp)
+  sqrt_invsqrt_tonelli_shanks_pre(sqrt, invsqrt, a, a_pre_exp)
+  a.ccopy(sqrt, result) # conditional copy: ``a`` is only overwritten when it is a square
+
 # Public routines
 # ------------------------------------------------------------
+# Note: we export the inner sqrt_if_square_IMPL
+#       for benchmarking purposes.

 func sqrt*[C](a: var Fp[C]) {.inline.} =
  ## Compute the square root of ``a``
@ -192,30 +304,12 @@ func sqrt*[C](a: var Fp[C]) {.inline.} =
  ## i.e. both x² == (-x)²
  ## This procedure returns a deterministic result
  ## This procedure is constant-time
-  when (BaseType(C.Mod.limbs[0]) and 3) == 3:
+  when C.hasSqrtAddchain():
+    sqrt_addchain(a)
+  elif C.hasP3mod4_primeModulus():
    sqrt_p3mod4(a)
  else:
-    var a_pre_exp{.noInit.}, sqrt{.noInit.}, invsqrt{.noInit.}: Fp[C]
-    a_pre_exp.precompute_tonelli_shanks(a)
-    sqrt_invsqrt_tonelli_shanks(sqrt, invsqrt, a, a_pre_exp)
-    a = sqrt
-
-func sqrt_if_square*[C](a: var Fp[C]): SecretBool {.inline.} =
-  ## If ``a`` is a square, compute the square root of ``a``
-  ## if not, ``a`` is unmodified.
-  ##
-  ## The square root, if it exist is multivalued,
-  ## i.e. both x² == (-x)²
-  ## This procedure returns a deterministic result
-  ## This procedure is constant-time
-  when (BaseType(C.Mod.limbs[0]) and 3) == 3:
-    result = sqrt_if_square_p3mod4(a)
-  else:
-    var a_pre_exp{.noInit.}, sqrt{.noInit.}, invsqrt{.noInit.}: Fp[C]
-    a_pre_exp.precompute_tonelli_shanks(a)
-    result = isSquare_tonelli_shanks(a, a_pre_exp)
-    sqrt_invsqrt_tonelli_shanks(sqrt, invsqrt, a, a_pre_exp)
-    a = sqrt
+    sqrt_tonelli_shanks(a, useAddChain = C.hasTonelliShanksAddchain())

 func sqrt_invsqrt*[C](sqrt, invsqrt: var Fp[C], a: Fp[C]) {.inline.} =
  ## Compute the square root and inverse square root of ``a``
@ -227,12 +321,12 @@ func sqrt_invsqrt*[C](sqrt, invsqrt: var Fp[C], a: Fp[C]) {.inline.} =
  ## The square root, if it exist is multivalued,
  ## i.e. both x² == (-x)²
  ## This procedure returns a deterministic result
-  when (BaseType(C.Mod.limbs[0]) and 3) == 3:
+  when C.hasSqrtAddchain():
+    sqrt_invsqrt_addchain(sqrt, invsqrt, a)
+  elif C.hasP3mod4_primeModulus():
    sqrt_invsqrt_p3mod4(sqrt, invsqrt, a)
  else:
-    var a_pre_exp{.noInit.}: Fp[C]
-    a_pre_exp.precompute_tonelli_shanks(a)
-    sqrt_invsqrt_tonelli_shanks(sqrt, invsqrt, a, a_pre_exp)
+    sqrt_invsqrt_tonelli_shanks(sqrt, invsqrt, a, useAddChain = C.hasTonelliShanksAddchain())

 func sqrt_invsqrt_if_square*[C](sqrt, invsqrt: var Fp[C], a: Fp[C]): SecretBool  {.inline.} =
  ## Compute the square root and ivnerse square root of ``a``
@ -244,11 +338,24 @@ func sqrt_invsqrt_if_square*[C](sqrt, invsqrt: var Fp[C], a: Fp[C]): SecretBool
  ## The square root, if it exist is multivalued,
  ## i.e. both x² == (-x)²
  ## This procedure returns a deterministic result
-  when (BaseType(C.Mod.limbs[0]) and 3) == 3:
+  when C.hasSqrtAddchain():
+    result = sqrt_invsqrt_if_square_addchain(sqrt, invsqrt, a)
+  elif C.hasP3mod4_primeModulus():
    result = sqrt_invsqrt_if_square_p3mod4(sqrt, invsqrt, a)
  else:
-    var a_pre_exp{.noInit.}: Fp[C]
-    a_pre_exp.precompute_tonelli_shanks(a)
-    result = isSquare_tonelli_shanks(a, a_pre_exp)
-    sqrt_invsqrt_tonelli_shanks(sqrt, invsqrt, a, a_pre_exp)
-    a = sqrt
+    result = sqrt_invsqrt_if_square_tonelli_shanks(sqrt, invsqrt, a, useAddChain = C.hasTonelliShanksAddchain())
+
+func sqrt_if_square*[C](a: var Fp[C]): SecretBool {.inline.} =
+  ## If ``a`` is a square, compute the square root of ``a``
+  ## if not, ``a`` is unmodified.
+  ##
+  ## The square root, if it exist is multivalued,
+  ## i.e. both x² == (-x)²
+  ## This procedure returns a deterministic result
+  ## This procedure is constant-time
+  when C.hasSqrtAddchain():
+    result = sqrt_if_square_addchain(a)
+  elif C.hasP3mod4_primeModulus():
+    result = sqrt_if_square_p3mod4(a)
+  else:
+    result = sqrt_if_square_tonelli_shanks(a, useAddChain = C.hasTonelliShanksAddchain())
--- a/constantine/curves/bls12_377_inversion.nim
+++ b/constantine/curves/bls12_377_inversion.nim
@ -0,0 +1,204 @@
+# Constantine
+# Copyright (c) 2018-2019    Status Research & Development GmbH
+# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
+# Licensed and distributed under either of
+#   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
+#   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
+# at your option. This file may not be copied, modified, or distributed except according to those terms.
+
+import
+  ../config/curves,
+  ../arithmetic/finite_fields
+
+# ############################################################
+#
+#           Specialized inversion for BLS12-377
+#
+# ############################################################
+
+func inv_addchain*(r: var Fp[BLS12_377], a: Fp[BLS12_377]) =
+  let a = a # ensure a.inv_addchain(a) is OK
+
+  var
+    x10       {.noInit.}: Fp[BLS12_377]
+    x11       {.noInit.}: Fp[BLS12_377]
+    x100      {.noInit.}: Fp[BLS12_377]
+    x101      {.noInit.}: Fp[BLS12_377]
+    x111      {.noInit.}: Fp[BLS12_377]
+    x1001     {.noInit.}: Fp[BLS12_377]
+    x1011     {.noInit.}: Fp[BLS12_377]
+    x1111     {.noInit.}: Fp[BLS12_377]
+    x10001    {.noInit.}: Fp[BLS12_377]
+    x10011    {.noInit.}: Fp[BLS12_377]
+    x10111    {.noInit.}: Fp[BLS12_377]
+    x11011    {.noInit.}: Fp[BLS12_377]
+    x11101    {.noInit.}: Fp[BLS12_377]
+    x11111    {.noInit.}: Fp[BLS12_377]
+    x110100   {.noInit.}: Fp[BLS12_377]
+    x11010000 {.noInit.}: Fp[BLS12_377]
+    x11010111 {.noInit.}: Fp[BLS12_377]
+
+  x10       .square(a)
+  x11       .prod(a, x10)
+  x100      .prod(a, x11)
+  x101      .prod(a, x100)
+  x111      .prod(x10, x101)
+  x1001     .prod(x10, x111)
+  x1011     .prod(x10, x1001)
+  x1111     .prod(x100, x1011)
+  x10001    .prod(x10, x1111)
+  x10011    .prod(x10, x10001)
+  x10111    .prod(x100, x10011)
+  x11011    .prod(x100, x10111)
+  x11101    .prod(x10, x11011)
+  x11111    .prod(x10, x11101)
+  x110100   .prod(x10111, x11101)
+  x11010000 .square_repeated(x110100, 2)
+  x11010111 .prod(x111, x11010000)
+  # 18 operations
+
+  # TODO: we can accumulate in a partially reduced
+  #       doubled-size `r` to avoid the final subtractions.
+  #       and only reduce at the end.
+  #       This requires the number of op to be less than log2(p) == 377
+
+  # 18 + 18 = 36 operations
+  r.square_repeated(x11010111, 8)
+  r *= x11101
+  r.square_repeated(7)
+  r *= x10001
+  r.square()
+
+  # 36 + 14 = 50 operations
+  r *= a
+  r.square_repeated(9)
+  r *= x10111
+  r.square_repeated(2)
+  r *= x11
+
+  # 50 + 21 = 71 operations
+  r.square_repeated(6)
+  r *= x101
+  r.square_repeated(4)
+  r *= a
+  r.square_repeated(9)
+
+  # 71 + 13 = 84 operations
+  r *= x11101
+  r.square_repeated(5)
+  r *= x1011
+  r.square_repeated(5)
+  r *= x11
+
+  # 84 + 21 = 105 operations
+  r.square_repeated(8)
+  r *= x11101
+  r.square()
+  r *= a
+  r.square_repeated(10)
+
+  # 105 + 20 = 125 operations
+  r *= x10111
+  r.square_repeated(12)
+  r *= x11011
+  r.square_repeated(5)
+  r *= x101
+
+  # 125 + 22 = 147 operations
+  r.square_repeated(7)
+  r *= x101
+  r.square_repeated(6)
+  r *= x1001
+  r.square_repeated(7)
+
+  # 147 + 11 = 158 operations
+  r *= x11101
+  r.square_repeated(5)
+  r *= x10001
+  r.square_repeated(3)
+  r *= x101
+
+  # 158 + 23 = 181 operations
+  r.square_repeated(8)
+  r *= x10001
+  r.square_repeated(6)
+  r *= x11011
+  r.square_repeated(7)
+
+  # 181 + 19 = 200 operations
+  r *= x11111
+  r.square_repeated(4)
+  r *= x11
+  r.square_repeated(12)
+  r *= x1111
+
+  # 200 + 19 = 219 operations
+  r.square_repeated(4)
+  r *= x101
+  r.square_repeated(8)
+  r *= x10011
+  r.square_repeated(5)
+
+  # 219 + 13 = 232 operations
+  r *= x10001
+  r.square_repeated(3)
+  r *= x111
+  r.square_repeated(7)
+  r *= x1111
+
+  # 232 + 22 = 254 operations
+  r.square_repeated(5)
+  r *= x1111
+  r.square_repeated(7)
+  r *= x11011
+  r.square_repeated(8)
+
+  # 254 + 15 = 269 operations
+  r *= x10001
+  r.square_repeated(6)
+  r *= x11111
+  r.square_repeated(6)
+  r *= x11101
+
+  # 269 + 35 = 304 operations
+  r.square_repeated(9)
+  r *= x1001
+  r.square_repeated(5)
+  r *= x1001
+  r.square_repeated(19)
+
+  # 304 + 17 = 321 operations
+  r *= x10111
+  r.square_repeated(8)
+  r *= x1011
+  r.square_repeated(6)
+  r *= x10111
+
+  # 321 + 16 = 337 operations
+  r.square_repeated(4)
+  r *= x101
+  r.square_repeated(4)
+  r *= a
+  r.square_repeated(6)
+
+  # 337 + 39 = 376 operations
+  r *= x11
+  r.square_repeated(29)
+  r *= a
+  r.square_repeated(7)
+  r *= x101
+
+  # 376 + 16 = 392 operations
+  r.square_repeated(9)
+  r *= x10001
+  r.square_repeated(6)
+
+  # 392 + 8*6 = 440 operations
+  for _ in 0 ..< 8:
+    r *= x11111
+    r.square_repeated(5)
+
+  r *= x11111
+  r.square()
+  r *= a
+  # Total 443 operations
--- a/constantine/curves/bls12_377_sqrt.nim
+++ b/constantine/curves/bls12_377_sqrt.nim
@ -8,7 +8,8 @@

 import
  ../config/[curves, type_bigint, type_ff],
-  ../io/[io_bigints, io_fields]
+  ../io/[io_bigints, io_fields],
+  ../arithmetic/finite_fields

 const
  # with e = 2adicity
@ -18,3 +19,188 @@ const
  BLS12_377_TonelliShanks_exponent* = BigInt[330].fromHex"0x35c748c2f8a21d58c760b80d94292763445b3e601ea271e3de6c45f741290002e16ba88600000010a11"
  BLS12_377_TonelliShanks_twoAdicity* = 46
  BLS12_377_TonelliShanks_root_of_unity* = Fp[BLS12_377].fromHex"0x382d3d99cdbc5d8fe9dee6aa914b0ad14fcaca7022110ec6eaa2bc56228ac41ea03d28cc795186ba6b5ef26b00bbe8"
+
+# ############################################################
+#
+#       Specialized Tonelli-Shanks for BLS12-377
+#
+# ############################################################
+
+func precompute_tonelli_shanks_addchain*(
+       r: var Fp[BLS12_377],
+       a: Fp[BLS12_377]) =
+  ## Does a^BLS12_377_TonelliShanks_exponent
+  ## via an addition-chain
+
+  var
+    x10       {.noInit.}: Fp[BLS12_377]
+    x11       {.noInit.}: Fp[BLS12_377]
+    x100      {.noInit.}: Fp[BLS12_377]
+    x101      {.noInit.}: Fp[BLS12_377]
+    x111      {.noInit.}: Fp[BLS12_377]
+    x1001     {.noInit.}: Fp[BLS12_377]
+    x1011     {.noInit.}: Fp[BLS12_377]
+    x1111     {.noInit.}: Fp[BLS12_377]
+    x10001    {.noInit.}: Fp[BLS12_377]
+    x10011    {.noInit.}: Fp[BLS12_377]
+    x10111    {.noInit.}: Fp[BLS12_377]
+    x11011    {.noInit.}: Fp[BLS12_377]
+    x11101    {.noInit.}: Fp[BLS12_377]
+    x11111    {.noInit.}: Fp[BLS12_377]
+    x110100   {.noInit.}: Fp[BLS12_377]
+    x11010000 {.noInit.}: Fp[BLS12_377]
+    x11010111 {.noInit.}: Fp[BLS12_377]
+
+  x10       .square(a)
+  x11       .prod(a, x10)
+  x100      .prod(a, x11)
+  x101      .prod(a, x100)
+  x111      .prod(x10, x101)
+  x1001     .prod(x10, x111)
+  x1011     .prod(x10, x1001)
+  x1111     .prod(x100, x1011)
+  x10001    .prod(x10, x1111)
+  x10011    .prod(x10, x10001)
+  x10111    .prod(x100, x10011)
+  x11011    .prod(x100, x10111)
+  x11101    .prod(x10, x11011)
+  x11111    .prod(x10, x11101)
+  x110100   .prod(x10111, x11101)
+  x11010000 .square_repeated(x110100, 2)
+  x11010111 .prod(x111, x11010000)
+  # 18 operations
+
+  # TODO: we can accumulate in a partially reduced
+  #       doubled-size `r` to avoid the final subtractions.
+  #       and only reduce at the end.
+  #       This requires the number of op to be less than log2(p) == 377
+
+  # 18 + 18 = 36 operations
+  r.square_repeated(x11010111, 8)
+  r *= x11101
+  r.square_repeated(7)
+  r *= x10001
+  r.square()
+
+  # 36 + 14 = 50 operations
+  r *= a
+  r.square_repeated(9)
+  r *= x10111
+  r.square_repeated(2)
+  r *= x11
+
+  # 50 + 21 = 71 operations
+  r.square_repeated(6)
+  r *= x101
+  r.square_repeated(4)
+  r *= a
+  r.square_repeated(9)
+
+  # 71 + 13 = 84 operations
+  r *= x11101
+  r.square_repeated(5)
+  r *= x1011
+  r.square_repeated(5)
+  r *= x11
+
+  # 84 + 21 = 105 operations
+  r.square_repeated(8)
+  r *= x11101
+  r.square()
+  r *= a
+  r.square_repeated(10)
+
+  # 105 + 20 = 125 operations
+  r *= x10111
+  r.square_repeated(12)
+  r *= x11011
+  r.square_repeated(5)
+  r *= x101
+
+  # 125 + 22 = 147 operations
+  r.square_repeated(7)
+  r *= x101
+  r.square_repeated(6)
+  r *= x1001
+  r.square_repeated(7)
+
+  # 147 + 11 = 158 operations
+  r *= x11101
+  r.square_repeated(5)
+  r *= x10001
+  r.square_repeated(3)
+  r *= x101
+
+  # 158 + 23 = 181 operations
+  r.square_repeated(8)
+  r *= x10001
+  r.square_repeated(6)
+  r *= x11011
+  r.square_repeated(7)
+
+  # 181 + 19 = 200 operations
+  r *= x11111
+  r.square_repeated(4)
+  r *= x11
+  r.square_repeated(12)
+  r *= x1111
+
+  # 200 + 19 = 219 operations
+  r.square_repeated(4)
+  r *= x101
+  r.square_repeated(8)
+  r *= x10011
+  r.square_repeated(5)
+
+  # 219 + 13 = 232 operations
+  r *= x10001
+  r.square_repeated(3)
+  r *= x111
+  r.square_repeated(7)
+  r *= x1111
+
+  # 232 + 22 = 254 operations
+  r.square_repeated(5)
+  r *= x1111
+  r.square_repeated(7)
+  r *= x11011
+  r.square_repeated(8)
+
+  # 254 + 15 = 269 operations
+  r *= x10001
+  r.square_repeated(6)
+  r *= x11111
+  r.square_repeated(6)
+  r *= x11101
+
+  # 269 + 35 = 304 operations
+  r.square_repeated(9)
+  r *= x1001
+  r.square_repeated(5)
+  r *= x1001
+  r.square_repeated(19)
+
+  # 304 + 17 = 321 operations
+  r *= x10111
+  r.square_repeated(8)
+  r *= x1011
+  r.square_repeated(6)
+  r *= x10111
+
+  # 321 + 16 = 337 operations
+  r.square_repeated(4)
+  r *= x101
+  r.square_repeated(4)
+  r *= a
+  r.square_repeated(6)
+
+  # 337 + 39 = 376 operations
+  r *= x11
+  r.square_repeated(29)
+  r *= a
+  r.square_repeated(7)
+  r *= x101
+
+  # 376 + 10 = 386 operations
+  r.square_repeated(9)
+  r *= x10001
--- a/constantine/curves/bls12_381_inversion.nim
+++ b/constantine/curves/bls12_381_inversion.nim
@ -88,7 +88,8 @@ func inv_addchain*(r: var Fp[BLS12_381], a: Fp[BLS12_381]) =
  x11100101 .prod(x100, x11100001)
  x11101011 .prod(x10100, x11010111)
  x11110101 .prod(x10100, x11100001)
-  x11111111 .prod(x10100, x11101011) # 35 operations
+  x11111111 .prod(x10100, x11101011)
+  # 35 operations

  # TODO: we can accumulate in a partially reduced
  #       doubled-size `r` to avoid the final substractions.
@ -109,7 +110,7 @@ func inv_addchain*(r: var Fp[BLS12_381], a: Fp[BLS12_381]) =
  r *= x11111111
  r.square_repeated(7)

-  # 88 + 22 = 107 operations
+  # 85 + 22 = 107 operations
  r *= x1001101
  r.square_repeated(9)
  r *= x1101001
--- a/constantine/curves/bls12_381_sqrt.nim
+++ b/constantine/curves/bls12_381_sqrt.nim
@ -0,0 +1,223 @@
+# Constantine
+# Copyright (c) 2018-2019    Status Research & Development GmbH
+# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
+# Licensed and distributed under either of
+#   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
+#   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
+# at your option. This file may not be copied, modified, or distributed except according to those terms.
+
+import
+  ../config/curves,
+  ../arithmetic/finite_fields
+
+# ############################################################
+#
+#           Specialized invsqrt for BLS12-381
+#
+# ############################################################
+
+func invsqrt_addchain*(r: var Fp[BLS12_381], a: Fp[BLS12_381]) =
+  var
+    x10       {.noInit.}: Fp[BLS12_381]
+    x100      {.noInit.}: Fp[BLS12_381]
+    x1000     {.noInit.}: Fp[BLS12_381]
+    x1001     {.noInit.}: Fp[BLS12_381]
+    x1011     {.noInit.}: Fp[BLS12_381]
+    x1101     {.noInit.}: Fp[BLS12_381]
+    x10001    {.noInit.}: Fp[BLS12_381]
+    x10100    {.noInit.}: Fp[BLS12_381]
+    x10101    {.noInit.}: Fp[BLS12_381]
+    x11001    {.noInit.}: Fp[BLS12_381]
+    x11010    {.noInit.}: Fp[BLS12_381]
+    x110100   {.noInit.}: Fp[BLS12_381]
+    x110110   {.noInit.}: Fp[BLS12_381]
+    x110111   {.noInit.}: Fp[BLS12_381]
+    x1001101  {.noInit.}: Fp[BLS12_381]
+    x1001111  {.noInit.}: Fp[BLS12_381]
+    x1010101  {.noInit.}: Fp[BLS12_381]
+    x1011101  {.noInit.}: Fp[BLS12_381]
+    x1100111  {.noInit.}: Fp[BLS12_381]
+    x1101001  {.noInit.}: Fp[BLS12_381]
+    x1110111  {.noInit.}: Fp[BLS12_381]
+    x1111011  {.noInit.}: Fp[BLS12_381]
+    x10001001 {.noInit.}: Fp[BLS12_381]
+    x10010101 {.noInit.}: Fp[BLS12_381]
+    x10010111 {.noInit.}: Fp[BLS12_381]
+    x10101001 {.noInit.}: Fp[BLS12_381]
+    x10110001 {.noInit.}: Fp[BLS12_381]
+    x10111111 {.noInit.}: Fp[BLS12_381]
+    x11000011 {.noInit.}: Fp[BLS12_381]
+    x11010000 {.noInit.}: Fp[BLS12_381]
+    x11010111 {.noInit.}: Fp[BLS12_381]
+    x11100001 {.noInit.}: Fp[BLS12_381]
+    x11100101 {.noInit.}: Fp[BLS12_381]
+    x11101011 {.noInit.}: Fp[BLS12_381]
+    x11110101 {.noInit.}: Fp[BLS12_381]
+    x11111111 {.noInit.}: Fp[BLS12_381]
+
+  x10       .square(a)
+  x100      .square(x10)
+  x1000     .square(x100)
+  x1001     .prod(a, x1000)
+  x1011     .prod(x10, x1001)
+  x1101     .prod(x10, x1011)
+  x10001    .prod(x100, x1101)
+  x10100    .prod(x1001, x1011)
+  x10101    .prod(a, x10100)
+  x11001    .prod(x100, x10101)
+  x11010    .prod(a, x11001)
+  x110100   .square(x11010)
+  x110110   .prod(x10, x110100)
+  x110111   .prod(a, x110110)
+  x1001101  .prod(x11001, x110100)
+  x1001111  .prod(x10, x1001101)
+  x1010101  .prod(x1000, x1001101)
+  x1011101  .prod(x1000, x1010101)
+  x1100111  .prod(x11010, x1001101)
+  x1101001  .prod(x10, x1100111)
+  x1110111  .prod(x11010, x1011101)
+  x1111011  .prod(x100, x1110111)
+  x10001001 .prod(x110100, x1010101)
+  x10010101 .prod(x11010, x1111011)
+  x10010111 .prod(x10, x10010101)
+  x10101001 .prod(x10100, x10010101)
+  x10110001 .prod(x1000, x10101001)
+  x10111111 .prod(x110110, x10001001)
+  x11000011 .prod(x100, x10111111)
+  x11010000 .prod(x1101, x11000011)
+  x11010111 .prod(x10100, x11000011)
+  x11100001 .prod(x10001, x11010000)
+  x11100101 .prod(x100, x11100001)
+  x11101011 .prod(x10100, x11010111)
+  x11110101 .prod(x10100, x11100001)
+  x11111111 .prod(x10100, x11101011)
+  # 36 operations
+
+  # TODO: we can accumulate in a partially reduced
+  #       doubled-size `r` to avoid the final subtractions.
+  #       and only reduce at the end.
+  #       This requires the number of op to be less than log2(p) == 381
+
+  # 36 + 22 = 58 operations
+  r.prod(x10111111, x11100001)
+  r.square_repeated(8)
+  r *= x10001
+  r.square_repeated(11)
+  r *= x11110101
+
+  # 58 + 28 = 86 operations
+  r.square_repeated(11)
+  r *= x11100101
+  r.square_repeated(8)
+  r *= x11111111
+  r.square_repeated(7)
+
+  # 86 + 22 = 108 operations
+  r *= x1001101
+  r.square_repeated(9)
+  r *= x1101001
+  r.square_repeated(10)
+  r *= x10110001
+
+  # 108+24 = 132 operations
+  r.square_repeated(7)
+  r *= x1011101
+  r.square_repeated(9)
+  r *= x1111011
+  r.square_repeated(6)
+
+  # 132+23 = 155 operations
+  r *= x11001
+  r.square_repeated(11)
+  r *= x1101001
+  r.square_repeated(9)
+  r *= x11101011
+
+  # 155+28 = 183 operations
+  r.square_repeated(10)
+  r *= x11010111
+  r.square_repeated(6)
+  r *= x11001
+  r.square_repeated(10)
+
+  # 183+23 = 206 operations
+  r *= x1110111
+  r.square_repeated(9)
+  r *= x10010111
+  r.square_repeated(11)
+  r *= x1001111
+
+  # 206+30 = 236 operations
+  r.square_repeated(10)
+  r *= x11100001
+  r.square_repeated(9)
+  r *= x10001001
+  r.square_repeated(9)
+
+  # 236+21 = 257 operations
+  r *= x10111111
+  r.square_repeated(8)
+  r *= x1100111
+  r.square_repeated(10)
+  r *= x11000011
+
+  # 257+28 = 285 operations
+  r.square_repeated(9)
+  r *= x10010101
+  r.square_repeated(12)
+  r *= x1111011
+  r.square_repeated(5)
+
+  # 285 + 21 = 306 operations
+  r *= x1011
+  r.square_repeated(11)
+  r *= x1111011
+  r.square_repeated(7)
+  r *= x1001
+
+  # 306+32 = 338 operations
+  r.square_repeated(13)
+  r *= x11110101
+  r.square_repeated(9)
+  r *= x10111111
+  r.square_repeated(8)
+
+  # 338+22 = 360 operations
+  r *= x11111111
+  r.square_repeated(8)
+  r *= x11101011
+  r.square_repeated(11)
+  r *= x10101001
+
+  # 360+24 = 384 operations
+  r.square_repeated(8)
+  r *= x11111111
+  r.square_repeated(8)
+  r *= x11111111
+  r.square_repeated(6)
+
+  # 384+22 = 406 operations
+  r *= x110111
+  r.square_repeated(10)
+  r *= x11111111
+  r.square_repeated(9)
+  r *= x11111111
+
+  # 406+26 = 432 operations
+  r.square_repeated(8)
+  r *= x11111111
+  r.square_repeated(8)
+  r *= x11111111
+  r.square_repeated(8)
+
+  # 432+17 = 449 operations
+  r *= x11111111
+  r.square_repeated(7)
+  r *= x1010101
+  r.square_repeated(6)
+  r *= x10101
+  r.square()
+
+  # Total 449 operations:
+  # - 75 multiplications
+  # - 374 squarings
--- a/constantine/curves/bn254_nogami_inversion.nim
+++ b/constantine/curves/bn254_nogami_inversion.nim
@ -0,0 +1,98 @@
+# Constantine
+# Copyright (c) 2018-2019    Status Research & Development GmbH
+# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
+# Licensed and distributed under either of
+#   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
+#   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
+# at your option. This file may not be copied, modified, or distributed except according to those terms.
+
+import
+  ../config/curves,
+  ../arithmetic/finite_fields
+
+# ############################################################
+#
+#           Specialized inversion for BN254-Nogami
+#
+# ############################################################
+
+func inv_addchain*(r: var Fp[BN254_Nogami], a: Fp[BN254_Nogami]) =
+  var
+    x100     {.noInit.}: Fp[BN254_Nogami]
+    x1000    {.noInit.}: Fp[BN254_Nogami]
+    x1100    {.noInit.}: Fp[BN254_Nogami]
+    x1101    {.noInit.}: Fp[BN254_Nogami]
+    x10001   {.noInit.}: Fp[BN254_Nogami]
+    x100010  {.noInit.}: Fp[BN254_Nogami]
+    x1000100 {.noInit.}: Fp[BN254_Nogami]
+    x1010101 {.noInit.}: Fp[BN254_Nogami]
+
+  x100     .square_repeated(a, 2)
+  x1000    .square(x100)
+  x1100    .prod(x100, x1000)
+  x1101    .prod(a, x1100)
+  x10001   .prod(x100, x1101)
+  x100010  .square(x10001)
+  x1000100 .square(x100010)
+  x1010101 .prod(x10001, x1000100)
+  # 9 operations
+
+  var
+    r13      {.noInit.}: Fp[BN254_Nogami]
+    r17      {.noInit.}: Fp[BN254_Nogami]
+    r18      {.noInit.}: Fp[BN254_Nogami]
+    r23      {.noInit.}: Fp[BN254_Nogami]
+    r26      {.noInit.}: Fp[BN254_Nogami]
+    r27      {.noInit.}: Fp[BN254_Nogami]
+    r28      {.noInit.}: Fp[BN254_Nogami]
+    r36      {.noInit.}: Fp[BN254_Nogami]
+    r38      {.noInit.}: Fp[BN254_Nogami]
+    r39      {.noInit.}: Fp[BN254_Nogami]
+    r40      {.noInit.}: Fp[BN254_Nogami]
+
+  r13.square_repeated(x1010101, 2)
+  r13 *= x100010
+  r13 *= x1101
+
+  r17.square(r13)
+  r17 *= r13
+  r17.square_repeated(2)
+
+  r18.prod(r13, r17)
+
+  r23.square_repeated(r18, 3)
+  r23 *= r18
+  r23 *= r17
+
+  r26.square_repeated(r23, 2)
+  r26 *= r23
+
+  r27.prod(r23, r26)
+  r28.prod(r26, r27)
+
+  r36.square(r28)
+  r36 *= r28
+  r36.square_repeated(2)
+  r36 *= r28
+  r36.square_repeated(3)
+
+  r38.prod(r28, r36)
+  r38 *= r27
+  r39.square(r38)
+  r40.prod(r38, r39)
+
+  r.prod(r39, r40)
+  r.square_repeated(3)
+  r *= r40
+  r.square_repeated(55)
+  r *= r38
+
+  r.square_repeated(55)
+  r *= r28
+  r.square_repeated(56)
+  r *= r18
+  r.square_repeated(56)
+
+  r *= x10001
+
+  # Total 271 operations
--- a/constantine/curves/bn254_nogami_sqrt.nim
+++ b/constantine/curves/bn254_nogami_sqrt.nim
@ -0,0 +1,89 @@
+# Constantine
+# Copyright (c) 2018-2019    Status Research & Development GmbH
+# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
+# Licensed and distributed under either of
+#   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
+#   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
+# at your option. This file may not be copied, modified, or distributed except according to those terms.
+
+import
+  ../config/curves,
+  ../arithmetic/finite_fields
+
+# ############################################################
+#
+#           Specialized invsqrt for BN254-Nogami
+#
+# ############################################################
+
+func invsqrt_addchain*(r: var Fp[BN254_Nogami], a: Fp[BN254_Nogami]) =
+  var
+    x10 {.noInit.}: Fp[BN254_Nogami]
+    x11 {.noInit.}: Fp[BN254_Nogami]
+
+  x10 .square(a)
+  x11 .prod(a, x10)
+  # 2 operations
+
+  var
+    r10  {.noInit.}: Fp[BN254_Nogami]
+    r14  {.noInit.}: Fp[BN254_Nogami]
+    r15  {.noInit.}: Fp[BN254_Nogami]
+    r20  {.noInit.}: Fp[BN254_Nogami]
+    r23  {.noInit.}: Fp[BN254_Nogami]
+    r24  {.noInit.}: Fp[BN254_Nogami]
+    r25  {.noInit.}: Fp[BN254_Nogami]
+    r33  {.noInit.}: Fp[BN254_Nogami]
+    r35  {.noInit.}: Fp[BN254_Nogami]
+    r36  {.noInit.}: Fp[BN254_Nogami]
+    r37  {.noInit.}: Fp[BN254_Nogami]
+    r98  {.noInit.}: Fp[BN254_Nogami]
+    r263 {.noInit.}: Fp[BN254_Nogami]
+
+  r10.square_repeated(x11, 7)
+  r10 *= x11
+
+  r14.square(r10)
+  r14 *= r10
+  r14.square_repeated(2)
+
+  r15.prod(r10, r14)
+
+  r20.square_repeated(r15, 3)
+  r20 *= r15
+  r20 *= r14
+
+  r23.square_repeated(r20, 2)
+  r23 *= r20
+
+  r24.prod(r20, r23)
+  r25.prod(r23, r24)
+
+  r33.square(r25)
+  r33 *= r25
+  r33.square_repeated(2)
+  r33 *= r25
+  r33.square_repeated(3)
+
+  r35.prod(r25, r33)
+  r35 *= r24
+
+  r36.square(r35)
+  r37.prod(r35, r36)
+
+  r.prod(r36, r37)
+  r.square_repeated(3)
+  r *= r37
+  r.square_repeated(55)
+  r *= r35
+
+  r.square_repeated(55)
+  r *= r25
+  r.square_repeated(56)
+  r *= r15
+  r.square_repeated(52)
+
+  r *= a
+  r.square_repeated(2)
+
+  # Total 266 operations
--- a/constantine/curves/bn254_snarks_pairing.nim
+++ b/constantine/curves/bn254_snarks_pairing.nim
@ -71,44 +71,44 @@ func pow_u*(r: var Fp12[BN254_Snarks], a: Fp12[BN254_Snarks], invert = BN254_Sna
    x10001110 .prod(x10110, x1111000)

    var
-      i15 {.noInit.}: Fp12[BN254_Snarks]
-      i16 {.noInit.}: Fp12[BN254_Snarks]
-      i17 {.noInit.}: Fp12[BN254_Snarks]
-      i18 {.noInit.}: Fp12[BN254_Snarks]
-      i20 {.noInit.}: Fp12[BN254_Snarks]
-      i21 {.noInit.}: Fp12[BN254_Snarks]
-      i22 {.noInit.}: Fp12[BN254_Snarks]
-      i26 {.noInit.}: Fp12[BN254_Snarks]
-      i27 {.noInit.}: Fp12[BN254_Snarks]
-      i61 {.noInit.}: Fp12[BN254_Snarks]
+      r15 {.noInit.}: Fp12[BN254_Snarks]
+      r16 {.noInit.}: Fp12[BN254_Snarks]
+      r17 {.noInit.}: Fp12[BN254_Snarks]
+      r18 {.noInit.}: Fp12[BN254_Snarks]
+      r20 {.noInit.}: Fp12[BN254_Snarks]
+      r21 {.noInit.}: Fp12[BN254_Snarks]
+      r22 {.noInit.}: Fp12[BN254_Snarks]
+      r26 {.noInit.}: Fp12[BN254_Snarks]
+      r27 {.noInit.}: Fp12[BN254_Snarks]
+      r61 {.noInit.}: Fp12[BN254_Snarks]

-    i15.cyclotomic_square(x10001110)
-    i15 *= x1001010
-    i16.prod(x10001110, i15)
-    i17.prod(x1111, i16)
-    i18.prod(i16, i17)
+    r15.cyclotomic_square(x10001110)
+    r15 *= x1001010
+    r16.prod(x10001110, r15)
+    r17.prod(x1111, r16)
+    r18.prod(r16, r17)

-    i20.cyclotomic_square(i18)
-    i20 *= i17
-    i21.prod(x1111000, i20)
-    i22.prod(i15, i21)
+    r20.cyclotomic_square(r18)
+    r20 *= r17
+    r21.prod(x1111000, r20)
+    r22.prod(r15, r21)

-    i26.cyclotomic_square(i22)
-    i26.cyclotomic_square()
-    i26 *= i22
-    i26 *= i18
+    r26.cyclotomic_square(r22)
+    r26.cyclotomic_square()
+    r26 *= r22
+    r26 *= r18

-    i27.prod(i22, i26)
+    r27.prod(r22, r26)

-    i61.prod(i26, i27)
-    i61.cycl_sqr_repeated(17)
-    i61 *= i27
-    i61.cycl_sqr_repeated(14)
-    i61 *= i21
+    r61.prod(r26, r27)
+    r61.cycl_sqr_repeated(17)
+    r61 *= r27
+    r61.cycl_sqr_repeated(14)
+    r61 *= r21

-    r = i61
+    r = r61
    r.cycl_sqr_repeated(16)
-    r *= i20
+    r *= r20

    if invert:
      r.cyclotomic_inv()
--- a/constantine/curves/bn254_snarks_sqrt.nim
+++ b/constantine/curves/bn254_snarks_sqrt.nim
@ -0,0 +1,158 @@
+# Constantine
+# Copyright (c) 2018-2019    Status Research & Development GmbH
+# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
+# Licensed and distributed under either of
+#   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
+#   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
+# at your option. This file may not be copied, modified, or distributed except according to those terms.
+
+import
+  ../config/curves,
+  ../arithmetic/finite_fields
+
+# ############################################################
+#
+#           Specialized invsqrt for BN254-Snarks
+#
+# ############################################################
+
+func invsqrt_addchain*(r: var Fp[BN254_Snarks], a: Fp[BN254_Snarks]) =
+  var
+    x10       {.noInit.}: Fp[BN254_Snarks]
+    x11       {.noInit.}: Fp[BN254_Snarks]
+    x101      {.noInit.}: Fp[BN254_Snarks]
+    x110      {.noInit.}: Fp[BN254_Snarks]
+    x1000     {.noInit.}: Fp[BN254_Snarks]
+    x1101     {.noInit.}: Fp[BN254_Snarks]
+    x10010    {.noInit.}: Fp[BN254_Snarks]
+    x10011    {.noInit.}: Fp[BN254_Snarks]
+    x10100    {.noInit.}: Fp[BN254_Snarks]
+    x10111    {.noInit.}: Fp[BN254_Snarks]
+    x11100    {.noInit.}: Fp[BN254_Snarks]
+    x100000   {.noInit.}: Fp[BN254_Snarks]
+    x100011   {.noInit.}: Fp[BN254_Snarks]
+    x101011   {.noInit.}: Fp[BN254_Snarks]
+    x101111   {.noInit.}: Fp[BN254_Snarks]
+    x1000001  {.noInit.}: Fp[BN254_Snarks]
+    x1010011  {.noInit.}: Fp[BN254_Snarks]
+    x1011011  {.noInit.}: Fp[BN254_Snarks]
+    x1100001  {.noInit.}: Fp[BN254_Snarks]
+    x1110101  {.noInit.}: Fp[BN254_Snarks]
+    x10010001 {.noInit.}: Fp[BN254_Snarks]
+    x10010101 {.noInit.}: Fp[BN254_Snarks]
+    x10110101 {.noInit.}: Fp[BN254_Snarks]
+    x10111011 {.noInit.}: Fp[BN254_Snarks]
+    x11000001 {.noInit.}: Fp[BN254_Snarks]
+    x11000011 {.noInit.}: Fp[BN254_Snarks]
+    x11010011 {.noInit.}: Fp[BN254_Snarks]
+    x11100001 {.noInit.}: Fp[BN254_Snarks]
+    x11100011 {.noInit.}: Fp[BN254_Snarks]
+    x11100111 {.noInit.}: Fp[BN254_Snarks]
+
+  x10       .square(a)
+  x11       .prod(x10, a)
+  x101      .prod(x10, x11)
+  x110      .prod(x101, a)
+  x1000     .prod(x10, x110)
+  x1101     .prod(x101, x1000)
+  x10010    .prod(x101, x1101)
+  x10011    .prod(x10010, a)
+  x10100    .prod(x10011, a)
+  x10111    .prod(x11, x10100)
+  x11100    .prod(x101, x10111)
+  x100000   .prod(x1101, x10011)
+  x100011   .prod(x11, x100000)
+  x101011   .prod(x1000, x100011)
+  x101111   .prod(x10011, x11100)
+  x1000001  .prod(x10010, x101111)
+  x1010011  .prod(x10010, x1000001)
+  x1011011  .prod(x1000, x1010011)
+  x1100001  .prod(x110, x1011011)
+  x1110101  .prod(x10100, x1100001)
+  x10010001 .prod(x11100, x1110101)
+  x10010101 .prod(x100000, x1110101)
+  x10110101 .prod(x100000, x10010101)
+  x10111011 .prod(x110, x10110101)
+  x11000001 .prod(x110, x10111011)
+  x11000011 .prod(x10, x11000001)
+  x11010011 .prod(x10010, x11000001)
+  x11100001 .prod(x100000, x11000001)
+  x11100011 .prod(x10, x11100001)
+  x11100111 .prod(x110, x11100001) # 30 operations
+
+  # 30 + 27 = 57 operations
+  r.square(x11000001)
+  r.square_repeated(7)
+  r *= x10010001
+  r.square_repeated(10)
+  r *= x11100111
+  r.square_repeated(7)
+
+  # 57 + 19 = 76 operations
+  r *= x10111
+  r.square_repeated(9)
+  r *= x10011
+  r.square_repeated(7)
+  r *= x1101
+
+  # 76 + 33 = 109 operations
+  r.square_repeated(14)
+  r *= x1010011
+  r.square_repeated(9)
+  r *= x11100001
+  r.square_repeated(8)
+
+  # 109 + 18 = 127 operations
+  r *= x1000001
+  r.square_repeated(10)
+  r *= x1011011
+  r.square_repeated(5)
+  r *= x1101
+
+  # 127 + 34 = 161 operations
+  r.square_repeated(8)
+  r *= x11
+  r.square_repeated(12)
+  r *= x101011
+  r.square_repeated(12)
+
+  # 161 + 25 = 186 operations
+  r *= x10111011
+  r.square_repeated(8)
+  r *= x101111
+  r.square_repeated(14)
+  r *= x10110101
+
+  # 186 + 28 = 214
+  r.square_repeated(9)
+  r *= x10010001
+  r.square_repeated(5)
+  r *= x1101
+  r.square_repeated(12)
+
+  # 214 + 22 = 236
+  r *= x11100011
+  r.square_repeated(8)
+  r *= x10010101
+  r.square_repeated(11)
+  r *= x11010011
+
+  # 236 + 32 = 268
+  r.square_repeated(7)
+  r *= x1100001
+  r.square_repeated(11)
+  r *= x100011
+  r.square_repeated(12)
+
+  # 268 + 20 = 288
+  r *= x1011011
+  r.square_repeated(9)
+  r *= x11000011
+  r.square_repeated(8)
+  r *= x11100111
+
+  # 288 + 13 = 301
+  r.square_repeated(7)
+  r *= x1110101
+  r.square_repeated(4)
+  r *= a
--- a/constantine/curves/bw6_761_inversion.nim
+++ b/constantine/curves/bw6_761_inversion.nim
@ -0,0 +1,376 @@
+# Constantine
+# Copyright (c) 2018-2019    Status Research & Development GmbH
+# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
+# Licensed and distributed under either of
+#   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
+#   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
+# at your option. This file may not be copied, modified, or distributed except according to those terms.
+
+import
+  ../config/curves,
+  ../arithmetic/finite_fields
+
+# ############################################################
+#
+#           Specialized inversion for BW6-761
+#
+# ############################################################
+
+func inv_addchain*(r: var Fp[BW6_761], a: Fp[BW6_761]) =
+  let a = a # ensure a.inv_addchain(a) is OK
+
+  var
+    x10       {.noInit.}: Fp[BW6_761]
+    x11       {.noInit.}: Fp[BW6_761]
+    x101      {.noInit.}: Fp[BW6_761]
+    x111      {.noInit.}: Fp[BW6_761]
+    x1001     {.noInit.}: Fp[BW6_761]
+    x1011     {.noInit.}: Fp[BW6_761]
+    x1101     {.noInit.}: Fp[BW6_761]
+    x1111     {.noInit.}: Fp[BW6_761]
+    x10001    {.noInit.}: Fp[BW6_761]
+    x10010    {.noInit.}: Fp[BW6_761]
+    x10011    {.noInit.}: Fp[BW6_761]
+    x10111    {.noInit.}: Fp[BW6_761]
+    x11001    {.noInit.}: Fp[BW6_761]
+    x11011    {.noInit.}: Fp[BW6_761]
+    x11101    {.noInit.}: Fp[BW6_761]
+    x11111    {.noInit.}: Fp[BW6_761]
+    x100001   {.noInit.}: Fp[BW6_761]
+    x100011   {.noInit.}: Fp[BW6_761]
+    x100101   {.noInit.}: Fp[BW6_761]
+    x100111   {.noInit.}: Fp[BW6_761]
+    x101001   {.noInit.}: Fp[BW6_761]
+    x101011   {.noInit.}: Fp[BW6_761]
+    x101101   {.noInit.}: Fp[BW6_761]
+    x101111   {.noInit.}: Fp[BW6_761]
+    x110001   {.noInit.}: Fp[BW6_761]
+    x110011   {.noInit.}: Fp[BW6_761]
+    x110101   {.noInit.}: Fp[BW6_761]
+    x110111   {.noInit.}: Fp[BW6_761]
+    x111001   {.noInit.}: Fp[BW6_761]
+    x111011   {.noInit.}: Fp[BW6_761]
+    x111101   {.noInit.}: Fp[BW6_761]
+    x1111010  {.noInit.}: Fp[BW6_761]
+    x1111111  {.noInit.}: Fp[BW6_761]
+    x11111110 {.noInit.}: Fp[BW6_761]
+    x11111111 {.noInit.}: Fp[BW6_761]
+
+  x10       .square(a)
+  x11       .prod(a, x10)
+  x101      .prod(x10, x11)
+  x111      .prod(x10, x101)
+  x1001     .prod(x10, x111)
+  x1011     .prod(x10, x1001)
+  x1101     .prod(x10, x1011)
+  x1111     .prod(x10, x1101)
+  x10001    .prod(x10, x1111)
+  x10010    .prod(a, x10001)
+  x10011    .prod(a, x10010)
+  x10111    .prod(x101, x10010)
+  x11001    .prod(x10, x10111)
+  x11011    .prod(x10, x11001)
+  x11101    .prod(x10, x11011)
+  x11111    .prod(x10, x11101)
+  x100001   .prod(x10, x11111)
+  x100011   .prod(x10, x100001)
+  x100101   .prod(x10, x100011)
+  x100111   .prod(x10, x100101)
+  x101001   .prod(x10, x100111)
+  x101011   .prod(x10, x101001)
+  x101101   .prod(x10, x101011)
+  x101111   .prod(x10, x101101)
+  x110001   .prod(x10, x101111)
+  x110011   .prod(x10, x110001)
+  x110101   .prod(x10, x110011)
+  x110111   .prod(x10, x110101)
+  x111001   .prod(x10, x110111)
+  x111011   .prod(x10, x111001)
+  x111101   .prod(x10, x111011)
+  x1111010  .square(x111101)
+  x1111111  .prod(x101, x1111010)
+  x11111110 .square(x1111111)
+  x11111111 .prod(a, x11111110)
+  # 35 operations
+
+  # TODO: we can accumulate in a partially reduced
+  #       doubled-size `r` to avoid the final subtractions.
+  #       and only reduce at the end.
+  #       This requires the number of op to be less than log2(p) == 761
+
+  # 35 + 8 = 43 operations
+  r.prod(x100001, x11111111)
+  r.square_repeated(3)
+  r *= x10111
+  r.square_repeated(2)
+  r *= a
+
+  # 43 + 22 = 65 operations
+  r.square_repeated(9)
+  r *= x1001
+  r.square_repeated(7)
+  r *= x11111
+  r.square_repeated(4)
+
+  # 65 + 17 = 82 operations
+  r *= x111
+  r.square_repeated(9)
+  r *= x1111
+  r.square_repeated(5)
+  r *= x111
+
+  # 82 + 29 = 111 operations
+  r.square_repeated(11)
+  r *= x101011
+  r.square_repeated(7)
+  r *= x100011
+  r.square_repeated(9)
+
+  # 111 + 28 = 139 operations
+  r *= x11111
+  r.square_repeated(8)
+  r *= x100101
+  r.square_repeated(17)
+  r *= x100111
+
+  # 139 + 22 = 161 operations
+  r.square_repeated(4)
+  r *= x1101
+  r.square_repeated(9)
+  r *= x11111111
+  r.square_repeated(7)
+
+  # 161 + 15 = 176 operations
+  r *= x11111
+  r.square_repeated(6)
+  r *= x10111
+  r.square_repeated(6)
+  r *= x1001
+
+  # 176 + 22 = 198 operations
+  r.square_repeated(4)
+  r *= x11
+  r.square_repeated(6)
+  r *= x11
+  r.square_repeated(10)
+
+  # 198 + 16 = 214 operations
+  r *= x110101
+  r.square_repeated(2)
+  r *= a
+  r.square_repeated(11)
+  r *= x11101
+
+  # 214 + 24 = 238 operations
+  r.square_repeated(6)
+  r *= x101
+  r.square_repeated(7)
+  r *= x1101
+  r.square_repeated(9)
+
+  # 238 + 21 = 259 operations
+  r *= x100001
+  r.square_repeated(7)
+  r *= x100101
+  r.square_repeated(11)
+  r *= x100111
+
+  # 259 + 28 = 287 operations
+  r.square_repeated(7)
+  r *= x101111
+  r.square_repeated(6)
+  r *= x11111
+  r.square_repeated(13)
+
+  # 287 + 15 = 302 operations
+  r *= x100001
+  r.square_repeated(6)
+  r *= x111011
+  r.square_repeated(6)
+  r *= x111001
+
+  # 302 + 27 = 329 operations
+  r.square_repeated(10)
+  r *= x10111
+  r.square_repeated(11)
+  r *= x111101
+  r.square_repeated(4)
+
+  # 329 + 17 = 346 operations
+  r *= x1101
+  r.square_repeated(8)
+  r *= x110001
+  r.square_repeated(6)
+  r *= x110001
+
+  # 346 + 20 = 366 operations
+  r.square_repeated(5)
+  r *= x11001
+  r.square_repeated(3)
+  r *= x11
+  r.square_repeated(10)
+
+  # 366 + 16 = 382 operations
+  r *= x100111
+  r.square_repeated(5)
+  r *= x1001
+  r.square_repeated(8)
+  r *= x11001
+
+  # 382 + 25 = 407 operations
+  r.square_repeated(10)
+  r *= x1111
+  r.square_repeated(7)
+  r *= x11101
+  r.square_repeated(6)
+
+  # 407 + 20 = 427 operations
+  r *= x11101
+  r.square_repeated(9)
+  r *= x11111111
+  r.square_repeated(8)
+  r *= x100101
+
+  # 427 + 27 = 454 operations
+  r.square_repeated(6)
+  r *= x101101
+  r.square_repeated(10)
+  r *= x100011
+  r.square_repeated(9)
+
+  # 454 + 20 = 474 operations
+  r *= x1001
+  r.square_repeated(8)
+  r *= x1101
+  r.square_repeated(9)
+  r *= x100111
+
+  # 474 + 25 = 499 operations
+  r.square_repeated(8)
+  r *= x100011
+  r.square_repeated(6)
+  r *= x101101
+  r.square_repeated(9)
+
+  # 499 + 16 = 515 operations
+  r *= x100101
+  r.square_repeated(4)
+  r *= x1111
+  r.square_repeated(9)
+  r *= x1111111
+
+  # 515 + 25 = 540 operations
+  r.square_repeated(6)
+  r *= x11001
+  r.square_repeated(8)
+  r *= x111
+  r.square_repeated(9)
+
+  # 540 + 15 = 555 operations
+  r *= x111011
+  r.square_repeated(5)
+  r *= x10011
+  r.square_repeated(7)
+  r *= x100111
+
+  # 555 + 22 = 577 operations
+  r.square_repeated(5)
+  r *= x10111
+  r.square_repeated(9)
+  r *= x111001
+  r.square_repeated(6)
+
+  # 577 + 14 = 591 operations
+  r *= x111101
+  r.square_repeated(9)
+  r *= x11111111
+  r.square_repeated(2)
+  r *= x11
+
+  # 591 + 21 = 612 operations
+  r.square_repeated(7)
+  r *= x10111
+  r.square_repeated(6)
+  r *= x10011
+  r.square_repeated(6)
+
+  # 612 + 18 = 630 operations
+  r *= x101
+  r.square_repeated(9)
+  r *= x10001
+  r.square_repeated(6)
+  r *= x11011
+
+  # 630 + 27 = 657 operations
+  r.square_repeated(10)
+  r *= x100101
+  r.square_repeated(7)
+  r *= x110011
+  r.square_repeated(8)
+
+  # 657 + 13 = 670 operations
+  r *= x111101
+  r.square_repeated(7)
+  r *= x100011
+  r.square_repeated(3)
+  r *= x111
+
+  # 670 + 26 = 696 operations
+  r.square_repeated(10)
+  r *= x1011
+  r.square_repeated(11)
+  r *= x110011
+  r.square_repeated(3)
+
+  # 696 + 17 = 713 operations
+  r *= x111
+  r.square_repeated(9)
+  r *= x101011
+  r.square_repeated(5)
+  r *= x10111
+
+  # 713 + 21 = 734 operations
+  r.square_repeated(7)
+  r *= x101011
+  r.square_repeated(2)
+  r *= x11
+  r.square_repeated(10)
+
+  # 734 + 19 = 753 operations
+  r *= x101001
+  r.square_repeated(10)
+  r *= x110111
+  r.square_repeated(6)
+  r *= x111001
+
+  # 753 + 23 = 776 operations
+  r.square_repeated(6)
+  r *= x101001
+  r.square_repeated(9)
+  r *= x100111
+  r.square_repeated(6)
+
+  # 776 + 12 = 788 operations
+  r *= x110011
+  r.square_repeated(7)
+  r *= x100001
+  r.square_repeated(2)
+  r *= x11
+
+  # 788 + 39 = 827 operations
+  r.square_repeated(21)
+  r *= a
+  r.square_repeated(11)
+  r *= x101111
+  r.square_repeated(5)
+
+  # 827 + 55 = 882 operations
+  r *= x1001
+  r.square_repeated(7)
+  r *= x11101
+  r.square_repeated(45)
+  r *= x10001
+
+  # 882 + 4 = 886 operations
+  r.square_repeated(3)
+  r *= a
--- a/constantine/curves/bw6_761_sqrt.nim
+++ b/constantine/curves/bw6_761_sqrt.nim
@ -0,0 +1,373 @@
+# Constantine
+# Copyright (c) 2018-2019    Status Research & Development GmbH
+# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
+# Licensed and distributed under either of
+#   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
+#   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
+# at your option. This file may not be copied, modified, or distributed except according to those terms.
+
+import
+  ../config/curves,
+  ../arithmetic/finite_fields
+
+# ############################################################
+#
+#           Specialized invsqrt for BW6-761
+#
+# ############################################################
+
+func invsqrt_addchain*(r: var Fp[BW6_761], a: Fp[BW6_761]) =
+  var
+    x10       {.noInit.}: Fp[BW6_761]
+    x11       {.noInit.}: Fp[BW6_761]
+    x101      {.noInit.}: Fp[BW6_761]
+    x111      {.noInit.}: Fp[BW6_761]
+    x1001     {.noInit.}: Fp[BW6_761]
+    x1011     {.noInit.}: Fp[BW6_761]
+    x1101     {.noInit.}: Fp[BW6_761]
+    x1111     {.noInit.}: Fp[BW6_761]
+    x10001    {.noInit.}: Fp[BW6_761]
+    x10010    {.noInit.}: Fp[BW6_761]
+    x10011    {.noInit.}: Fp[BW6_761]
+    x10111    {.noInit.}: Fp[BW6_761]
+    x11001    {.noInit.}: Fp[BW6_761]
+    x11011    {.noInit.}: Fp[BW6_761]
+    x11101    {.noInit.}: Fp[BW6_761]
+    x11111    {.noInit.}: Fp[BW6_761]
+    x100001   {.noInit.}: Fp[BW6_761]
+    x100011   {.noInit.}: Fp[BW6_761]
+    x100101   {.noInit.}: Fp[BW6_761]
+    x100111   {.noInit.}: Fp[BW6_761]
+    x101001   {.noInit.}: Fp[BW6_761]
+    x101011   {.noInit.}: Fp[BW6_761]
+    x101101   {.noInit.}: Fp[BW6_761]
+    x101111   {.noInit.}: Fp[BW6_761]
+    x110001   {.noInit.}: Fp[BW6_761]
+    x110011   {.noInit.}: Fp[BW6_761]
+    x110101   {.noInit.}: Fp[BW6_761]
+    x110111   {.noInit.}: Fp[BW6_761]
+    x111001   {.noInit.}: Fp[BW6_761]
+    x111011   {.noInit.}: Fp[BW6_761]
+    x111101   {.noInit.}: Fp[BW6_761]
+    x1111010  {.noInit.}: Fp[BW6_761]
+    x1111111  {.noInit.}: Fp[BW6_761]
+    x11111110 {.noInit.}: Fp[BW6_761]
+    x11111111 {.noInit.}: Fp[BW6_761]
+
+  x10       .square(a)
+  x11       .prod(a, x10)
+  x101      .prod(x10, x11)
+  x111      .prod(x10, x101)
+  x1001     .prod(x10, x111)
+  x1011     .prod(x10, x1001)
+  x1101     .prod(x10, x1011)
+  x1111     .prod(x10, x1101)
+  x10001    .prod(x10, x1111)
+  x10010    .prod(a, x10001)
+  x10011    .prod(a, x10010)
+  x10111    .prod(x101, x10010)
+  x11001    .prod(x10, x10111)
+  x11011    .prod(x10, x11001)
+  x11101    .prod(x10, x11011)
+  x11111    .prod(x10, x11101)
+  x100001   .prod(x10, x11111)
+  x100011   .prod(x10, x100001)
+  x100101   .prod(x10, x100011)
+  x100111   .prod(x10, x100101)
+  x101001   .prod(x10, x100111)
+  x101011   .prod(x10, x101001)
+  x101101   .prod(x10, x101011)
+  x101111   .prod(x10, x101101)
+  x110001   .prod(x10, x101111)
+  x110011   .prod(x10, x110001)
+  x110101   .prod(x10, x110011)
+  x110111   .prod(x10, x110101)
+  x111001   .prod(x10, x110111)
+  x111011   .prod(x10, x111001)
+  x111101   .prod(x10, x111011)
+  x1111010  .square(x111101)
+  x1111111  .prod(x101, x1111010)
+  x11111110 .square(x1111111)
+  x11111111 .prod(a, x11111110)
+  # 35 operations: x<bits> = a^(0b<bits>), the windows multiplied into r below
+
+  # TODO: we can accumulate in a partially reduced doubled-size `r`
+  #       to avoid the final subtractions, and only reduce at the end.
+  #       This requires the number of op to be less than log2(p) == 761
+  #       NOTE(review): this chain takes 883 ops, above that bound - confirm.
+
+  # 35 + 8 = 43 operations
+  r.prod(x100001, x11111111)
+  r.square_repeated(3)
+  r *= x10111
+  r.square_repeated(2)
+  r *= a
+
+  # 43 + 22 = 65 operations
+  r.square_repeated(9)
+  r *= x1001
+  r.square_repeated(7)
+  r *= x11111
+  r.square_repeated(4)
+
+  # 65 + 17 = 82 operations
+  r *= x111
+  r.square_repeated(9)
+  r *= x1111
+  r.square_repeated(5)
+  r *= x111
+
+  # 82 + 29 = 111 operations
+  r.square_repeated(11)
+  r *= x101011
+  r.square_repeated(7)
+  r *= x100011
+  r.square_repeated(9)
+
+  # 111 + 28 = 139 operations
+  r *= x11111
+  r.square_repeated(8)
+  r *= x100101
+  r.square_repeated(17)
+  r *= x100111
+
+  # 139 + 22 = 161 operations
+  r.square_repeated(4)
+  r *= x1101
+  r.square_repeated(9)
+  r *= x11111111
+  r.square_repeated(7)
+
+  # 161 + 15 = 176 operations
+  r *= x11111
+  r.square_repeated(6)
+  r *= x10111
+  r.square_repeated(6)
+  r *= x1001
+
+  # 176 + 22 = 198 operations
+  r.square_repeated(4)
+  r *= x11
+  r.square_repeated(6)
+  r *= x11
+  r.square_repeated(10)
+
+  # 198 + 16 = 214 operations
+  r *= x110101
+  r.square_repeated(2)
+  r *= a
+  r.square_repeated(11)
+  r *= x11101
+
+  # 214 + 24 = 238 operations
+  r.square_repeated(6)
+  r *= x101
+  r.square_repeated(7)
+  r *= x1101
+  r.square_repeated(9)
+
+  # 238 + 21 = 259 operations
+  r *= x100001
+  r.square_repeated(7)
+  r *= x100101
+  r.square_repeated(11)
+  r *= x100111
+
+  # 259 + 28 = 287 operations
+  r.square_repeated(7)
+  r *= x101111
+  r.square_repeated(6)
+  r *= x11111
+  r.square_repeated(13)
+
+  # 287 + 15 = 302 operations
+  r *= x100001
+  r.square_repeated(6)
+  r *= x111011
+  r.square_repeated(6)
+  r *= x111001
+
+  # 302 + 27 = 329 operations
+  r.square_repeated(10)
+  r *= x10111
+  r.square_repeated(11)
+  r *= x111101
+  r.square_repeated(4)
+
+  # 329 + 17 = 346 operations
+  r *= x1101
+  r.square_repeated(8)
+  r *= x110001
+  r.square_repeated(6)
+  r *= x110001
+
+  # 346 + 20 = 366 operations
+  r.square_repeated(5)
+  r *= x11001
+  r.square_repeated(3)
+  r *= x11
+  r.square_repeated(10)
+
+  # 366 + 16 = 382 operations
+  r *= x100111
+  r.square_repeated(5)
+  r *= x1001
+  r.square_repeated(8)
+  r *= x11001
+
+  # 382 + 25 = 407 operations
+  r.square_repeated(10)
+  r *= x1111
+  r.square_repeated(7)
+  r *= x11101
+  r.square_repeated(6)
+
+  # 407 + 20 = 427 operations
+  r *= x11101
+  r.square_repeated(9)
+  r *= x11111111
+  r.square_repeated(8)
+  r *= x100101
+
+  # 427 + 27 = 454 operations
+  r.square_repeated(6)
+  r *= x101101
+  r.square_repeated(10)
+  r *= x100011
+  r.square_repeated(9)
+
+  # 454 + 20 = 474 operations
+  r *= x1001
+  r.square_repeated(8)
+  r *= x1101
+  r.square_repeated(9)
+  r *= x100111
+
+  # 474 + 25 = 499 operations
+  r.square_repeated(8)
+  r *= x100011
+  r.square_repeated(6)
+  r *= x101101
+  r.square_repeated(9)
+
+  # 499 + 16 = 515 operations
+  r *= x100101
+  r.square_repeated(4)
+  r *= x1111
+  r.square_repeated(9)
+  r *= x1111111
+
+  # 515 + 25 = 540 operations
+  r.square_repeated(6)
+  r *= x11001
+  r.square_repeated(8)
+  r *= x111
+  r.square_repeated(9)
+
+  # 540 + 15 = 555 operations
+  r *= x111011
+  r.square_repeated(5)
+  r *= x10011
+  r.square_repeated(7)
+  r *= x100111
+
+  # 555 + 22 = 577 operations
+  r.square_repeated(5)
+  r *= x10111
+  r.square_repeated(9)
+  r *= x111001
+  r.square_repeated(6)
+
+  # 577 + 14 = 591 operations
+  r *= x111101
+  r.square_repeated(9)
+  r *= x11111111
+  r.square_repeated(2)
+  r *= x11
+
+  # 591 + 21 = 612 operations
+  r.square_repeated(7)
+  r *= x10111
+  r.square_repeated(6)
+  r *= x10011
+  r.square_repeated(6)
+
+  # 612 + 18 = 630 operations
+  r *= x101
+  r.square_repeated(9)
+  r *= x10001
+  r.square_repeated(6)
+  r *= x11011
+
+  # 630 + 27 = 657 operations
+  r.square_repeated(10)
+  r *= x100101
+  r.square_repeated(7)
+  r *= x110011
+  r.square_repeated(8)
+
+  # 657 + 13 = 670 operations
+  r *= x111101
+  r.square_repeated(7)
+  r *= x100011
+  r.square_repeated(3)
+  r *= x111
+
+  # 670 + 26 = 696 operations
+  r.square_repeated(10)
+  r *= x1011
+  r.square_repeated(11)
+  r *= x110011
+  r.square_repeated(3)
+
+  # 696 + 17 = 713 operations
+  r *= x111
+  r.square_repeated(9)
+  r *= x101011
+  r.square_repeated(5)
+  r *= x10111
+
+  # 713 + 21 = 734 operations
+  r.square_repeated(7)
+  r *= x101011
+  r.square_repeated(2)
+  r *= x11
+  r.square_repeated(10)
+
+  # 734 + 19 = 753 operations
+  r *= x101001
+  r.square_repeated(10)
+  r *= x110111
+  r.square_repeated(6)
+  r *= x111001
+
+  # 753 + 23 = 776 operations
+  r.square_repeated(6)
+  r *= x101001
+  r.square_repeated(9)
+  r *= x100111
+  r.square_repeated(6)
+
+  # 776 + 12 = 788 operations
+  r *= x110011
+  r.square_repeated(7)
+  r *= x100001
+  r.square_repeated(2)
+  r *= x11
+
+  # 788 + 39 = 827 operations
+  r.square_repeated(21)
+  r *= a
+  r.square_repeated(11)
+  r *= x101111
+  r.square_repeated(5)
+
+  # 827 + 55 = 882 operations
+  r *= x1001
+  r.square_repeated(7)
+  r *= x11101
+  r.square_repeated(45)
+  r *= x10001
+
+  # 882 + 1 = 883 operations
+  r.square()
--- a/constantine/curves/zoo_inversions.nim
+++ b/constantine/curves/zoo_inversions.nim
@ -7,11 +7,27 @@
 # at your option. This file may not be copied, modified, or distributed except according to those terms.

 import
+  ../config/[curves, type_ff],
+  ./bls12_377_inversion,
  ./bls12_381_inversion,
+  ./bn254_nogami_inversion,
  ./bn254_snarks_inversion,
+  ./bw6_761_inversion,
  ./secp256k1_inversion

 export
+  bls12_377_inversion,
  bls12_381_inversion,
+  bn254_nogami_inversion,
  bn254_snarks_inversion,
+  bw6_761_inversion,
  secp256k1_inversion
+
+func hasInversionAddchain*(C: static Curve): static bool =
+  ## Compile-time flag: true only for the curves in the set below.
+  # TODO: Secp256k1 is left out for now although its inversion module
+  #       is imported: its addition chain is slower than GCD.
+  when C in {BN254_Nogami, BN254_Snarks, BLS12_377, BLS12_381, BW6_761}:
+    true
+  else:
+    false
--- a/constantine/curves/zoo_square_roots.nim
+++ b/constantine/curves/zoo_square_roots.nim
@ -8,11 +8,34 @@

 import
  std/macros,
-  ../config/curves,
-  ./bls12_377_sqrt
+  ../config/[curves, type_ff],
+  ./bls12_377_sqrt,
+  ./bls12_381_sqrt,
+  ./bn254_nogami_sqrt,
+  ./bn254_snarks_sqrt,
+  ./bw6_761_sqrt
+
+export
+  bls12_377_sqrt,
+  bls12_381_sqrt,
+  bn254_nogami_sqrt,
+  bn254_snarks_sqrt,
+  bw6_761_sqrt
+
+func hasSqrtAddchain*(C: static Curve): static bool =
+  when C in {BLS12_381, BN254_Nogami, BN254_Snarks, BW6_761}:
+    true  # curve is listed as having a dedicated sqrt addition chain
+  else:
+    false # any other curve (e.g. BLS12_377) is not flagged here

 {.experimental: "dynamicBindSym".}

 macro tonelliShanks*(C: static Curve, value: untyped): untyped =
   ## Get Square Root via Tonelli-Shanks related constants.
   ## Binds the symbol named `<C>_TonelliShanks_<value>`; the name is built
   ## at compile time, presumably why `dynamicBindSym` is enabled above.
   return bindSym($C & "_TonelliShanks_" & $value)
+
+func hasTonelliShanksAddchain*(C: static Curve): static bool =
+  when C in {BLS12_377}:
+    true  # only BLS12_377 is flagged for a Tonelli-Shanks addition chain
+  else:
+    false # every other curve
--- a/tests/t_finite_fields_sqrt.nim
+++ b/tests/t_finite_fields_sqrt.nim
@ -125,6 +125,7 @@ proc main() =
    randomSqrtCheck BN254_Snarks
    randomSqrtCheck BLS12_377 # p ≢ 3 (mod 4)
    randomSqrtCheck BLS12_381
+    randomSqrtCheck BW6_761

  suite "Modular square root - 32-bit bugs highlighted by property-based testing " & " [" & $WordBitwidth & "-bit mode]":
    # test "FKM12_447 - #30": - Deactivated, we don't support the curve as no one uses it.