30% faster constant-time inversion

2026-02-16 11:13:07 +00:00 · 2020-03-20 23:03:52 +01:00 · 2020-03-20 23:03:52 +01:00 · bde619155b
commit bde619155b
parent 1958356a09
39 changed files with 1467 additions and 1066 deletions
--- a/benchmarks/bench_eth_curves.nim
+++ b/benchmarks/bench_eth_curves.nim
@ -1,6 +0,0 @@
-# Benchmark of Ethereum 1 and Ethereum 2 elliptic curves
-
-import
-  ./secp256k1_fp,
-  ./bn254_fp,
-  ./bls12_381_fp
--- a/benchmarks/bench_finite_fields.nim
+++ b/benchmarks/bench_finite_fields.nim
@ -0,0 +1,157 @@
+# Constantine
+# Copyright (c) 2018-2019    Status Research & Development GmbH
+# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
+# Licensed and distributed under either of
+#   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
+#   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
+# at your option. This file may not be copied, modified, or distributed except according to those terms.
+
+# ############################################################
+#
+#             Benchmark of finite fields
+#
+# ############################################################
+
+import
+  # Internals
+  ../constantine/config/[common, curves],
+  ../constantine/arithmetic,
+  ../constantine/io/[io_bigints, io_fields],
+  ../constantine/primitives,
+  # Helpers
+  ../helpers/[timers, prng, static_for],
+  # Standard library
+  std/[monotimes, times, strformat, strutils, macros]
+
+const Iters = 1_000_000
+const InvIters = 1000
+const AvailableCurves = [
+  P224,
+  BN254,
+  P256,
+  Secp256k1,
+  BLS12_381
+]
+
+var rng: RngState
+let seed = uint32(getTime().toUnix() and (1'i64 shl 32 - 1)) # unixTime mod 2^32
+rng.seed(seed)
+echo "bench_finite_field xoshiro512** seed: ", seed
+
+# warmup
+proc warmup*() =
+  # Warmup - make sure cpu is on max perf
+  let start = cpuTime()
+  var foo = 123
+  for i in 0 ..< 300_000_000:
+    foo += i*i mod 456
+    foo = foo mod 789
+
+  # Compiler shouldn't optimize away the results as cpuTime rely on sideeffects
+  let stop = cpuTime()
+  echo &"Warmup: {stop - start:>4.4f} s, result {foo} (displayed to avoid compiler optimizing warmup away)\n"
+
+warmup()
+
+echo "\n⚠️ Measurements are approximate and use the CPU nominal clock: Turbo-Boost and overclocking will skew them."
+echo "==========================================================================================================\n"
+echo "All benchmarks are using constant-time implementations to protect against side-channel attacks."
+when defined(gcc):
+  echo "\nCompiled with GCC"
+elif defined(clang):
+  echo "\nCompiled with Clang"
+elif defined(vcc):
+  echo "\nCompiled with MSVC"
+elif defined(icc):
+  echo "\nCompiled with ICC"
+else:
+  echo "\nCompiled with an unknown compiler"
+
+when defined(i386) or defined(amd64):
+  import ../helpers/x86
+  echo "Running on ", cpuName(), "\n\n"
+
+proc report(op, field: string, start, stop: MonoTime, startClk, stopClk: int64, iters: int) =
+  echo &"{op:<15} {field:<15} {inNanoseconds((stop-start) div iters):>9} ns {(stopClk - startClk) div iters:>9} cycles"
+
+macro fixFieldDisplay(T: typedesc): untyped =
+  # At compile-time, enums are integers and their display is buggy
+  # we get the Curve ID instead of the curve name.
+  let instantiated = T.getTypeInst()
+  var name = $instantiated[1][0] # Fp
+  name.add "[" & $Curve(instantiated[1][1].intVal) & "]"
+  result = newLit name
+
+# Compilers are smart with dead code (but not with multiprecision arithmetic :/)
+var globalsAreNotOptimizedAway: Word
+
+template bench(op: string, T: typedesc, iters: int, body: untyped): untyped =
+  let start = getMonotime()
+  let startClk = getTicks()
+  for _ in 0 ..< iters:
+    body
+  let stopClk = getTicks()
+  let stop = getMonotime()
+
+  report(op, fixFieldDisplay(T), start, stop, startClk, stopClk, iters)
+
+proc addBench(T: typedesc) =
+  var x = rng.random(T)
+  let y = rng.random(T)
+  bench("Addition", T, Iters):
+    x += y
+  globalsAreNotOptimizedAway += x.mres.limbs[^1]
+
+proc subBench(T: typedesc) =
+  var x = rng.random(T)
+  let y = rng.random(T)
+  preventOptimAway(x)
+  bench("Substraction", T, Iters):
+    x -= y
+  globalsAreNotOptimizedAway += x.mres.limbs[^1]
+
+proc negBench(T: typedesc) =
+  var r: T
+  let x = rng.random(T)
+  bench("Negation", T, Iters):
+    r.neg(x)
+  globalsAreNotOptimizedAway += r.mres.limbs[^1]
+
+proc mulBench(T: typedesc) =
+  var r: T
+  let x = rng.random(T)
+  let y = rng.random(T)
+  preventOptimAway(r)
+  bench("Multiplication", T, Iters):
+    r.prod(x, y)
+
+proc sqrBench(T: typedesc) =
+  var r: T
+  let x = rng.random(T)
+  preventOptimAway(r)
+  bench("Squaring", T, Iters):
+    r.square(x)
+
+proc invBench(T: typedesc) =
+  var r: T
+  let x = rng.random(T)
+  preventOptimAway(r)
+  bench("Inversion", T, InvIters):
+    r.inv(x)
+
+proc main() =
+  echo "-".repeat(80)
+  staticFor i, 0, AvailableCurves.len:
+    const curve = AvailableCurves[i]
+    addBench(Fp[curve])
+    subBench(Fp[curve])
+    negBench(Fp[curve])
+    mulBench(Fp[curve])
+    sqrBench(Fp[curve])
+    invBench(Fp[curve])
+    echo "-".repeat(80)
+
+main()
+
+echo "Notes:"
+echo "  GCC is significantly slower than Clang on multiprecision arithmetic."
--- a/benchmarks/big_to_fq.nim
+++ b/benchmarks/big_to_fq.nim
@ -1,48 +0,0 @@
-# Constantine
-# Copyright (c) 2018-2019    Status Research & Development GmbH
-# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
-# Licensed and distributed under either of
-#   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
-#   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
-# at your option. This file may not be copied, modified, or distributed except according to those terms.
-
-# ############################################################
-#
-#        Benchmark of the conversion from Big Int to Fq
-#
-# ############################################################
-
-# 2 implementations are possible
-# - 1 based on Montgomery Multiplication
-# - 1 based on modular left shift which involves multiple divisions
-
-import
-  ../constantine/config/[common, curves],
-  ../constantine/arithmetic/[bigints_checked, finite_fields],
-  random, std/monotimes, times, strformat
-
-const Iters = 1_000_000
-
-randomize(1234)
-
-proc main() =
-  var x: BigInt[381]
-  x.setInternalBitLength()
-  for i in 0 ..< x.limbs.len - 1:
-    # Set x to a random value guaranteed below the prime
-    x.limbs[i] = Word(rand(BaseType.high.int))
-
-
-  let start = getMonotime()
-  for _ in 0 ..< Iters:
-    let y = Fq[BLS12_381].fromBig(x)
-  let stop = getMonotime()
-
-  echo &"Time for {Iters} iterations: {inMilliseconds(stop-start)} ms"
-
-
-main()
-# 1_000_000 iterations with -d:danger on i9-9980XE all-core turbo 4.1GHz
-# Montgomery Multiplication based: 254ms
-# shlAddMod based (using assembly div2n1n!!): 907 ms
-# Note: shlAddMod will be even slower when division is made constant-time
--- a/benchmarks/bls12_381_fp.nim
+++ b/benchmarks/bls12_381_fp.nim
@ -1,155 +0,0 @@
-# Constantine
-# Copyright (c) 2018-2019    Status Research & Development GmbH
-# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
-# Licensed and distributed under either of
-#   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
-#   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
-# at your option. This file may not be copied, modified, or distributed except according to those terms.
-
-# ############################################################
-#
-#           Benchmark of modular exponentiation
-#
-# ############################################################
-
-# 2 implementations are available
-# - 1 is constant time
-# - 1 exposes the exponent bits to:
-#   timing attack,
-#   memory access analysis,
-#   power analysis (i.e. oscilloscopes on embedded)
-#   It is suitable for public exponents for example
-#   to compute modular inversion via the Fermat method
-
-import
-  ../constantine/config/[common, curves],
-  ../constantine/arithmetic/[bigints, finite_fields],
-  ../constantine/io/[io_bigints, io_fields],
-  random, std/monotimes, times, strformat,
-  ./timers
-
-const Iters = 1_000_000
-const InvIters = 1000
-
-randomize(1234)
-
-# warmup
-proc warmup*() =
-  # Warmup - make sure cpu is on max perf
-  let start = cpuTime()
-  var foo = 123
-  for i in 0 ..< 300_000_000:
-    foo += i*i mod 456
-    foo = foo mod 789
-
-  # Compiler shouldn't optimize away the results as cpuTime rely on sideeffects
-  let stop = cpuTime()
-  echo &"\n\nWarmup: {stop - start:>4.4f} s, result {foo} (displayed to avoid compiler optimizing warmup away)\n"
-
-warmup()
-
-echo "\n⚠️ Measurements are approximate and use the CPU nominal clock: Turbo-Boost and overclocking will skew them."
-echo "==========================================================================================================\n"
-
-proc report(op, field: string, start, stop: MonoTime, startClk, stopClk: int64, iters: int) =
-  echo &"{op:<15} {field:<15} {inNanoseconds((stop-start) div iters):>9} ns {(stopClk - startClk) div iters:>9} cycles"
-
-proc addBench() =
-  var x, y: Fp[BLS12_381]
-  # BN254 field modulus
-  x.fromHex("0x30644e72e131a029b85045b68181585d97816a916871ca8d3c208c16d87cfd47")
-  # BLS12-381 prime - 2
-  y.fromHex("0x1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffffffffaaa9")
-
-  let start = getMonotime()
-  let startClk = getTicks()
-  for _ in 0 ..< Iters:
-    x += y
-  let stopClk = getTicks()
-  let stop = getMonotime()
-  report("Addition", "Fp[BLS12_381]", start, stop, startClk, stopClk, Iters)
-
-
-addBench()
-
-proc subBench() =
-  var x, y: Fp[BLS12_381]
-  # BN254 field modulus
-  x.fromHex("0x30644e72e131a029b85045b68181585d97816a916871ca8d3c208c16d87cfd47")
-  # BLS12-381 prime - 2
-  y.fromHex("0x1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffffffffaaa9")
-
-  let start = getMonotime()
-  let startClk = getTicks()
-  for _ in 0 ..< Iters:
-    x -= y
-  let stopClk = getTicks()
-  let stop = getMonotime()
-  report("Substraction", "Fp[BLS12_381]", start, stop, startClk, stopClk, Iters)
-
-subBench()
-
-proc negBench() =
-  var r, x: Fp[BLS12_381]
-  # BN254 field modulus
-  x.fromHex("0x30644e72e131a029b85045b68181585d97816a916871ca8d3c208c16d87cfd47")
-
-  let start = getMonotime()
-  let startClk = getTicks()
-  for _ in 0 ..< Iters:
-    r.neg(x)
-  let stopClk = getTicks()
-  let stop = getMonotime()
-  report("Negation", "Fp[BLS12_381]", start, stop, startClk, stopClk, Iters)
-
-negBench()
-
-proc mulBench() =
-  var r, x, y: Fp[BLS12_381]
-  # BN254 field modulus
-  x.fromHex("0x30644e72e131a029b85045b68181585d97816a916871ca8d3c208c16d87cfd47")
-  # BLS12-381 prime - 2
-  y.fromHex("0x1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffffffffaaa9")
-
-  let start = getMonotime()
-  let startClk = getTicks()
-  for _ in 0 ..< Iters:
-    r.prod(x, y)
-  let stopClk = getTicks()
-  let stop = getMonotime()
-  report("Multiplication", "Fp[BLS12_381]", start, stop, startClk, stopClk, Iters)
-
-mulBench()
-
-proc sqrBench() =
-  var r, x: Fp[BLS12_381]
-  # BN254 field modulus
-  x.fromHex("0x30644e72e131a029b85045b68181585d97816a916871ca8d3c208c16d87cfd47")
-
-  let start = getMonotime()
-  let startClk = getTicks()
-  for _ in 0 ..< Iters:
-    r.square(x)
-  let stopClk = getTicks()
-  let stop = getMonotime()
-  report("Squaring", "Fp[BLS12_381]", start, stop, startClk, stopClk, Iters)
-
-sqrBench()
-
-proc invBench() =
-  # TODO: having x on the stack triggers stack smashing detection. To be investigated
-  var x: ref Fp[BLS12_381]
-  new x
-  # BN254 field modulus
-  x[].fromHex("0x30644e72e131a029b85045b68181585d97816a916871ca8d3c208c16d87cfd47")
-
-  let start = getMonotime()
-  let startClk = getTicks()
-  for _ in 0 ..< InvIters:
-    # Note: we don't copy the original x so x is alterning between x and x^-1
-    inv(x[])
-  let stopClk = getTicks()
-  let stop = getMonotime()
-  report("Inversion", "Fp[BLS12_381]", start, stop, startClk, stopClk, InvIters)
-
-invBench()
--- a/benchmarks/bn254_fp.nim
+++ b/benchmarks/bn254_fp.nim
@ -1,155 +0,0 @@
-# Constantine
-# Copyright (c) 2018-2019    Status Research & Development GmbH
-# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
-# Licensed and distributed under either of
-#   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
-#   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
-# at your option. This file may not be copied, modified, or distributed except according to those terms.
-
-# ############################################################
-#
-#           Benchmark of modular exponentiation
-#
-# ############################################################
-
-# 2 implementations are available
-# - 1 is constant time
-# - 1 exposes the exponent bits to:
-#   timing attack,
-#   memory access analysis,
-#   power analysis (i.e. oscilloscopes on embedded)
-#   It is suitable for public exponents for example
-#   to compute modular inversion via the Fermat method
-
-import
-  ../constantine/config/[common, curves],
-  ../constantine/arithmetic/[bigints, finite_fields],
-  ../constantine/io/[io_bigints, io_fields],
-  random, std/monotimes, times, strformat,
-  ./timers
-
-const Iters = 1_000_000
-const InvIters = 1000
-
-randomize(1234)
-
-# warmup
-proc warmup*() =
-  # Warmup - make sure cpu is on max perf
-  let start = cpuTime()
-  var foo = 123
-  for i in 0 ..< 300_000_000:
-    foo += i*i mod 456
-    foo = foo mod 789
-
-  # Compiler shouldn't optimize away the results as cpuTime rely on sideeffects
-  let stop = cpuTime()
-  echo &"\n\nWarmup: {stop - start:>4.4f} s, result {foo} (displayed to avoid compiler optimizing warmup away)\n"
-
-warmup()
-
-echo "\n⚠️ Measurements are approximate and use the CPU nominal clock: Turbo-Boost and overclocking will skew them."
-echo "==========================================================================================================\n"
-
-proc report(op, field: string, start, stop: MonoTime, startClk, stopClk: int64, iters: int) =
-  echo &"{op:<15} {field:<15} {inNanoseconds((stop-start) div iters):>9} ns {(stopClk - startClk) div iters:>9} cycles"
-
-proc addBench() =
-  var x, y: Fp[BN254]
-  # BN254 field modulus
-  x.fromHex("0x30644e72e131a029b85045b68181585d97816a916871ca8d3c208c16d87cfd47")
-  # Truncated BLS12-381 prime
-  y.fromHex("0x1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f624")
-
-  let start = getMonotime()
-  let startClk = getTicks()
-  for _ in 0 ..< Iters:
-    x += y
-  let stopClk = getTicks()
-  let stop = getMonotime()
-  report("Addition", "Fp[BN254]", start, stop, startClk, stopClk, Iters)
-
-
-addBench()
-
-proc subBench() =
-  var x, y: Fp[BN254]
-  # BN254 field modulus
-  x.fromHex("0x30644e72e131a029b85045b68181585d97816a916871ca8d3c208c16d87cfd47")
-  # Truncated BLS12-381 prime
-  y.fromHex("0x1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f624")
-
-  let start = getMonotime()
-  let startClk = getTicks()
-  for _ in 0 ..< Iters:
-    x -= y
-  let stopClk = getTicks()
-  let stop = getMonotime()
-  report("Substraction", "Fp[BN254]", start, stop, startClk, stopClk, Iters)
-
-subBench()
-
-proc negBench() =
-  var r, x: Fp[BN254]
-  # BN254 field modulus
-  x.fromHex("0x30644e72e131a029b85045b68181585d97816a916871ca8d3c208c16d87cfd47")
-
-  let start = getMonotime()
-  let startClk = getTicks()
-  for _ in 0 ..< Iters:
-    r.neg(x)
-  let stopClk = getTicks()
-  let stop = getMonotime()
-  report("Negation", "Fp[BN254]", start, stop, startClk, stopClk, Iters)
-
-negBench()
-
-proc mulBench() =
-  var r, x, y: Fp[BN254]
-  # BN254 field modulus
-  x.fromHex("0x30644e72e131a029b85045b68181585d97816a916871ca8d3c208c16d87cfd47")
-  # Truncated BLS12-381 prime
-  y.fromHex("0x1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f624")
-
-  let start = getMonotime()
-  let startClk = getTicks()
-  for _ in 0 ..< Iters:
-    r.prod(x, y)
-  let stopClk = getTicks()
-  let stop = getMonotime()
-  report("Multiplication", "Fp[BN254]", start, stop, startClk, stopClk, Iters)
-
-mulBench()
-
-proc sqrBench() =
-  var r, x: Fp[BN254]
-  # BN254 field modulus
-  x.fromHex("0x30644e72e131a029b85045b68181585d97816a916871ca8d3c208c16d87cfd47")
-
-  let start = getMonotime()
-  let startClk = getTicks()
-  for _ in 0 ..< Iters:
-    r.square(x)
-  let stopClk = getTicks()
-  let stop = getMonotime()
-  report("Squaring", "Fp[BN254]", start, stop, startClk, stopClk, Iters)
-
-sqrBench()
-
-proc invBench() =
-  # TODO: having x on the stack triggers stack smashing detection. To be investigated
-  var x: ref Fp[BN254]
-  new x
-  # BN254 field modulus
-  x[].fromHex("0x30644e72e131a029b85045b68181585d97816a916871ca8d3c208c16d87cfd47")
-
-  let start = getMonotime()
-  let startClk = getTicks()
-  for _ in 0 ..< InvIters:
-    # Note: we don't copy the original x so x is alterning between x and x^-1
-    inv(x[])
-  let stopClk = getTicks()
-  let stop = getMonotime()
-  report("Inversion", "Fp[BN254]", start, stop, startClk, stopClk, InvIters)
-
-invBench()
--- a/benchmarks/powmod.nim
+++ b/benchmarks/powmod.nim
@ -1,69 +0,0 @@
-# Constantine
-# Copyright (c) 2018-2019    Status Research & Development GmbH
-# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
-# Licensed and distributed under either of
-#   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
-#   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
-# at your option. This file may not be copied, modified, or distributed except according to those terms.
-
-# ############################################################
-#
-#           Benchmark of modular exponentiation
-#
-# ############################################################
-
-# 2 implementations are available
-# - 1 is constant time
-# - 1 exposes the exponent bits to:
-#   timing attack,
-#   memory access analysis,
-#   power analysis (i.e. oscilloscopes on embedded)
-#   It is suitable for public exponents for example
-#   to compute modular inversion via the Fermat method
-
-import
-  ../constantine/config/[common, curves],
-  ../constantine/arithmetic/[bigints_checked, finite_fields],
-  ../constantine/io/[io_bigints, io_fields],
-  random, std/monotimes, times, strformat
-
-const Iters = 10_000
-
-randomize(1234)
-
-proc mainCT() =
-  var x: Fp[BLS12_381]
-  # BN254 field modulus
-  x.fromHex("0x30644e72e131a029b85045b68181585d97816a916871ca8d3c208c16d87cfd47")
-  # BLS12-381 prime - 2
-  let exponent = BigInt[381].fromHex("0x1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffffffffaaa9")
-
-  let start = getMonotime()
-  for _ in 0 ..< Iters:
-    var y = x
-    x.pow(exponent)
-  let stop = getMonotime()
-
-  echo &"Time for {Iters} iterations (constant-time 381-bit): {inMilliseconds(stop-start)} ms"
-
-proc mainUnsafe() =
-  var x: Fp[BLS12_381]
-  # BN254 field modulus
-  x.fromHex("0x30644e72e131a029b85045b68181585d97816a916871ca8d3c208c16d87cfd47")
-  # BLS12-381 prime - 2
-  let exponent = BigInt[381].fromHex("0x1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffffffffaaa9")
-
-  let start = getMonotime()
-  for _ in 0 ..< Iters:
-    var y = x
-    x.powUnsafeExponent(exponent)
-  let stop = getMonotime()
-
-  echo &"Time for {Iters} iterations (unsafe 381-bit): {inMilliseconds(stop-start)} ms"
-
-mainCT()
-mainUnsafe()
-# 10_000 iterations with -d:danger on i9-9980XE all-core turbo 4.1GHz
-
-# Time for 10000 iterations (constant-time 381-bit): 1349 ms
-# Time for 10000 iterations (unsafe 381-bit): 1226 ms
--- a/benchmarks/secp256k1_fp.nim
+++ b/benchmarks/secp256k1_fp.nim
@ -1,155 +0,0 @@
-# Constantine
-# Copyright (c) 2018-2019    Status Research & Development GmbH
-# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
-# Licensed and distributed under either of
-#   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
-#   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
-# at your option. This file may not be copied, modified, or distributed except according to those terms.
-
-# ############################################################
-#
-#           Benchmark of modular exponentiation
-#
-# ############################################################
-
-# 2 implementations are available
-# - 1 is constant time
-# - 1 exposes the exponent bits to:
-#   timing attack,
-#   memory access analysis,
-#   power analysis (i.e. oscilloscopes on embedded)
-#   It is suitable for public exponents for example
-#   to compute modular inversion via the Fermat method
-
-import
-  ../constantine/config/[common, curves],
-  ../constantine/arithmetic/[bigints, finite_fields],
-  ../constantine/io/[io_bigints, io_fields],
-  random, std/monotimes, times, strformat,
-  ./timers
-
-const Iters = 1_000_000
-const InvIters = 1000
-
-randomize(1234)
-
-# warmup
-proc warmup*() =
-  # Warmup - make sure cpu is on max perf
-  let start = cpuTime()
-  var foo = 123
-  for i in 0 ..< 300_000_000:
-    foo += i*i mod 456
-    foo = foo mod 789
-
-  # Compiler shouldn't optimize away the results as cpuTime rely on sideeffects
-  let stop = cpuTime()
-  echo &"\n\nWarmup: {stop - start:>4.4f} s, result {foo} (displayed to avoid compiler optimizing warmup away)\n"
-
-warmup()
-
-echo "\n⚠️ Measurements are approximate and use the CPU nominal clock: Turbo-Boost and overclocking will skew them."
-echo "==========================================================================================================\n"
-
-proc report(op, field: string, start, stop: MonoTime, startClk, stopClk: int64, iters: int) =
-  echo &"{op:<15} {field:<15} {inNanoseconds((stop-start) div iters):>9} ns {(stopClk - startClk) div iters:>9} cycles"
-
-proc addBench() =
-  var x, y: Fp[Secp256k1]
-  # BN254 field modulus
-  x.fromHex("0x30644e72e131a029b85045b68181585d97816a916871ca8d3c208c16d87cfd47")
-  # Truncated BLS12-381 prime
-  y.fromHex("0x1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f624")
-
-  let start = getMonotime()
-  let startClk = getTicks()
-  for _ in 0 ..< Iters:
-    x += y
-  let stopClk = getTicks()
-  let stop = getMonotime()
-  report("Addition", "Fp[Secp256k1]", start, stop, startClk, stopClk, Iters)
-
-
-addBench()
-
-proc subBench() =
-  var x, y: Fp[Secp256k1]
-  # BN254 field modulus
-  x.fromHex("0x30644e72e131a029b85045b68181585d97816a916871ca8d3c208c16d87cfd47")
-  # Truncated BLS12-381 prime
-  y.fromHex("0x1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f624")
-
-  let start = getMonotime()
-  let startClk = getTicks()
-  for _ in 0 ..< Iters:
-    x -= y
-  let stopClk = getTicks()
-  let stop = getMonotime()
-  report("Substraction", "Fp[Secp256k1]", start, stop, startClk, stopClk, Iters)
-
-subBench()
-
-proc negBench() =
-  var r, x: Fp[Secp256k1]
-  # BN254 field modulus
-  x.fromHex("0x30644e72e131a029b85045b68181585d97816a916871ca8d3c208c16d87cfd47")
-
-  let start = getMonotime()
-  let startClk = getTicks()
-  for _ in 0 ..< Iters:
-    r.neg(x)
-  let stopClk = getTicks()
-  let stop = getMonotime()
-  report("Negation", "Fp[Secp256k1]", start, stop, startClk, stopClk, Iters)
-
-negBench()
-
-proc mulBench() =
-  var r, x, y: Fp[Secp256k1]
-  # BN254 field modulus
-  x.fromHex("0x30644e72e131a029b85045b68181585d97816a916871ca8d3c208c16d87cfd47")
-  # Truncated BLS12-381 prime
-  y.fromHex("0x1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f624")
-
-  let start = getMonotime()
-  let startClk = getTicks()
-  for _ in 0 ..< Iters:
-    r.prod(x, y)
-  let stopClk = getTicks()
-  let stop = getMonotime()
-  report("Multiplication", "Fp[Secp256k1]", start, stop, startClk, stopClk, Iters)
-
-mulBench()
-
-proc sqrBench() =
-  var r, x: Fp[Secp256k1]
-  # BN254 field modulus
-  x.fromHex("0x30644e72e131a029b85045b68181585d97816a916871ca8d3c208c16d87cfd47")
-
-  let start = getMonotime()
-  let startClk = getTicks()
-  for _ in 0 ..< Iters:
-    r.square(x)
-  let stopClk = getTicks()
-  let stop = getMonotime()
-  report("Squaring", "Fp[Secp256k1]", start, stop, startClk, stopClk, Iters)
-
-sqrBench()
-
-proc invBench() =
-  # TODO: having x on the stack triggers stack smashing detection. To be investigated
-  var x: ref Fp[Secp256k1]
-  new x
-  # BN254 field modulus
-  x[].fromHex("0x30644e72e131a029b85045b68181585d97816a916871ca8d3c208c16d87cfd47")
-
-  let start = getMonotime()
-  let startClk = getTicks()
-  for _ in 0 ..< InvIters:
-    # Note: we don't copy the original x so x is alterning between x and x^-1
-    inv(x[])
-  let stopClk = getTicks()
-  let stop = getMonotime()
-  report("Inversion", "Fp[Secp256k1]", start, stop, startClk, stopClk, InvIters)
-
-invBench()
--- a/benchmarks/timers.nim
+++ b/benchmarks/timers.nim
@ -1,49 +0,0 @@
-when defined(i386) or defined(amd64):
-  # From Linux
-  #
-  # The RDTSC instruction is not ordered relative to memory
-  # access.  The Intel SDM and the AMD APM are both vague on this
-  # point, but empirically an RDTSC instruction can be
-  # speculatively executed before prior loads.  An RDTSC
-  # immediately after an appropriate barrier appears to be
-  # ordered as a normal load, that is, it provides the same
-  # ordering guarantees as reading from a global memory location
-  # that some other imaginary CPU is updating continuously with a
-  # time stamp.
-  #
-  # From Intel SDM
-  # https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/ia-32-ia-64-benchmark-code-execution-paper.pdf
-  when not defined(vcc):
-    when defined(amd64):
-      proc getTicks*(): int64 {.inline.} =
-        var lo, hi: int64
-        # TODO: Provide a compile-time flag for RDTSCP support
-        #       and use it instead of lfence + RDTSC
-        {.emit: """asm volatile(
-          "lfence\n"
-          "rdtsc\n"
-          : "=a"(`lo`), "=d"(`hi`)
-          :
-          : "memory"
-        );""".}
-        return (hi shl 32) or lo
-    else:
-      proc getTicks*(): int64 {.inline.} =
-        # TODO: Provide a compile-time flag for RDTSCP support
-        #       and use it instead of lfence + RDTSC
-        {.emit: """asm volatile(
-          "lfence\n"
-          "rdtsc\n"
-          : "=a"(`result`)
-          :
-          : "memory"
-        );""".}
-  else:
-    proc rdtsc(): int64 {.sideeffect, importc: "__rdtsc", header: "<intrin.h>".}
-    proc lfence() {.importc: "__mm_lfence", header: "<intrin.h>".}
-
-    proc getTicks*(): int64 {.inline.} =
-      lfence()
-      return rdtsc()
-else:
-  {.error: "getticks is not supported on this CPU architecture".}
--- a/constantine.nimble
+++ b/constantine.nimble
@ -110,3 +110,18 @@ task test_no_gmp, "Run tests that don't require GMP":

    # 𝔽p2
    test "", "tests/test_fp2.nim"
+
+task bench, "Run benchmark with your default compiler":
+  if not dirExists "build":
+    mkDir "build"
+  exec "nim c -d:danger --verbosity:0 -o:build/bench_finite_fields_default -r --hints:off --warnings:off benchmarks/bench_finite_fields"
+
+task bench_gcc, "Run benchmark with gcc":
+  if not dirExists "build":
+    mkDir "build"
+  exec "nim c --cc:gcc -d:danger --verbosity:0 -o:build/bench_finite_fields_gcc -r --hints:off --warnings:off benchmarks/bench_finite_fields"
+
+task bench_clang, "Run benchmark with clang":
+  if not dirExists "build":
+    mkDir "build"
+  exec "nim c --cc:clang -d:danger --verbosity:0 -o:build/bench_finite_fields_clang -r --hints:off --warnings:off benchmarks/bench_finite_fields"
--- a/constantine/arithmetic.nim
+++ b/constantine/arithmetic.nim
@ -0,0 +1,17 @@
+# Constantine
+# Copyright (c) 2018-2019    Status Research & Development GmbH
+# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
+# Licensed and distributed under either of
+#   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
+#   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
+# at your option. This file may not be copied, modified, or distributed except according to those terms.
+
+import
+  arithmetic/[limbs, limbs_modular, limbs_montgomery],
+  arithmetic/bigints,
+  arithmetic/[finite_fields, finite_fields_inversion]
+
+export
+  limbs, limbs_modular, limbs_montgomery,
+  bigints,
+  finite_fields, finite_fields_inversion
--- a/constantine/arithmetic/bigints.nim
+++ b/constantine/arithmetic/bigints.nim
@ -9,8 +9,7 @@
 import
  ../config/common,
  ../primitives,
-  ./limbs,
-  ./montgomery
+  ./limbs, ./limbs_montgomery, ./limbs_modular

 # ############################################################
 #
@ -87,25 +86,16 @@ debug:
  func `$`*(a: BigInt): string =
    result = "BigInt["
    result.add $BigInt.bits
-    result.add "](limbs: ["
-    result.add $BaseType(a.limbs[0]) & " (0x" & toHex(BaseType(a.limbs[0])) & ')'
-    for i in 1 ..< a.limbs.len:
-      result.add ", "
-      result.add $BaseType(a.limbs[i]) & " (0x" & toHex(BaseType(a.limbs[i])) & ')'
-    result.add "])"
+    result.add "](limbs: "
+    result.add a.limbs.toString()
+    result.add ")"

 # No exceptions allowed
 {.push raises: [].}
 {.push inline.}

-func `==`*(a, b: BigInt): CTBool[Word] =
-  ## Returns true if 2 big ints are equal
-  ## Comparison is constant-time
-  a.limbs == b.limbs
-
-func isZero*(a: BigInt): CTBool[Word] =
-  ## Returns true if a big int is equal to zero
-  a.limbs.isZero
+# Initialization
+# ------------------------------------------------------------

 func setZero*(a: var BigInt) =
  ## Set a BigInt to 0
@ -115,6 +105,51 @@ func setOne*(a: var BigInt) =
  ## Set a BigInt to 1
  a.limbs.setOne()

+# Copy
+# ------------------------------------------------------------
+
+func ccopy*(a: var BigInt, b: BigInt, ctl: CTBool[Word]) =
+  ## Constant-time conditional copy
+  ## If ctl is true: b is copied into a
+  ## if ctl is false: b is not copied and a is untouched
+  ## Time and memory accesses are the same whether a copy occurs or not
+  ccopy(a.limbs, b.limbs, ctl)
+
+func cswap*(a, b: var BigInt, ctl: CTBool) =
+  ## Swap ``a`` and ``b`` if ``ctl`` is true
+  ##
+  ## Constant-time:
+  ## Whether ``ctl`` is true or not, the same
+  ## memory accesses are done (unless the compiler tries to be clever)
+  cswap(a.limbs, b.limbs, ctl)
+
+# Comparison
+# ------------------------------------------------------------
+
+func `==`*(a, b: BigInt): CTBool[Word] =
+  ## Returns true if 2 big ints are equal
+  ## Comparison is constant-time
+  a.limbs == b.limbs
+
+func `<`*(a, b: BigInt): CTBool[Word] =
+  ## Returns true if a < b
+  a.limbs < b.limbs
+
+func `<=`*(a, b: BigInt): CTBool[Word] =
+  ## Returns true if a <= b
+  a.limbs <= b.limbs
+
+func isZero*(a: BigInt): CTBool[Word] =
+  ## Returns true if a big int is equal to zero
+  a.limbs.isZero
+
+func isOdd*(a: BigInt): CTBool[Word] =
+  ## Returns true if a is odd
+  a.limbs.isOdd
+
+# Arithmetic
+# ------------------------------------------------------------
+
 func cadd*(a: var BigInt, b: BigInt, ctl: CTBool[Word]): CTBool[Word] =
  ## Constant-time in-place conditional addition
  ## The addition is only performed if ctl is "true"
@ -133,19 +168,16 @@ func cdouble*(a: var BigInt, ctl: CTBool[Word]): CTBool[Word] =
  ## The result carry is always computed.
  (CTBool[Word]) cadd(a.limbs, a.limbs, ctl)

-# ############################################################
-#
-#          BigInt Primitives Optimized for speed
-#
-# ############################################################
-#
-# TODO: fallback to cadd / csub with a "size" compile-option
-
 func add*(a: var BigInt, b: BigInt): CTBool[Word] =
  ## Constant-time in-place addition
  ## Returns the carry
  (CTBool[Word]) add(a.limbs, b.limbs)

+func add*(a: var BigInt, b: Word): CTBool[Word] =
+  ## Constant-time in-place addition
+  ## Returns the carry
+  (CTBool[Word]) add(a.limbs, b)
+
 func sub*(a: var BigInt, b: BigInt): CTBool[Word] =
  ## Constant-time in-place substraction
  ## Returns the borrow
@ -177,19 +209,23 @@ func double*(r: var BigInt, a: BigInt): CTBool[Word] =
  ## Returns the carry
  (CTBool[Word]) sum(r.limbs, a.limbs, a.limbs)

-# ############################################################
-#
-#                    Comparisons
-#
-# ############################################################
+func div2*(a: var BigInt) =
+  ## In-place divide ``a`` by 2
+  a.limbs.shiftRight(1)

-func `<`*(a, b: BigInt): CTBool[Word] =
-  ## Returns true if a < b
-  a.limbs < b.limbs
+func cneg*(a: var BigInt, ctl: CTBool) =
+  ## Conditional negation.
+  ## Negate if ``ctl`` is true
+  a.limbs.cneg(ctl)

-func `<=`*(a, b: BigInt): CTBool[Word] =
-  ## Returns true if a <= b
-  a.limbs <= b.limbs
+# Bit Manipulation
+# ------------------------------------------------------------
+
+func shiftRight*(a: var BigInt, k: int) =
+  ## Shift right by k.
+  ##
+  ## k MUST be less than the base word size (2^31 or 2^63)
+  a.limbs.shiftRight(k)

 # ############################################################
 #
@ -209,6 +245,22 @@ func reduce*[aBits, mBits](r: var BigInt[mBits], a: BigInt[aBits], M: BigInt[mBi
  # pass a pointer+length to a fixed session of the BSS.
  reduce(r.limbs, a.limbs, aBits, M.limbs, mBits)

+func steinsGCD*[bits](r: var BigInt[bits], a, F, M, mp1div2: BigInt[bits]) =
+  ## Compute F multiplied the modular inverse of ``a`` modulo M
+  ## r ≡ F . a^-1 (mod M)
+  ##
+  ## M MUST be odd, M does not need to be prime.
+  ## ``a`` MUST be less than M.
+  r.limbs.steinsGCD(a.limbs, F.limbs, M.limbs, bits, mp1div2.limbs)
+
+func invmod*[bits](r: var BigInt[bits], a, M, mp1div2: BigInt[bits]) =
+  ## Compute the modular inverse of ``a`` modulo M
+  ##
+  ## The modulus ``M`` MUST be odd
+  var one {.noInit.}: BigInt[bits]
+  one.setOne()
+  r.steinsGCD(a, one, M, mp1div2)
+
 # ############################################################
 #
 #                 Montgomery Arithmetic
@ -335,3 +387,6 @@ func montyPowUnsafeExponent*[mBits: static int](
                     else: (1 shl windowSize) + 1
  var scratchSpace {.noInit.}: array[scratchLen, Limbs[mBits.wordsRequired]]
  montyPowUnsafeExponent(a.limbs, exponent, M.limbs, one.limbs, negInvModWord, scratchSpace, canUseNoCarryMontyMul)
+
+{.pop.} # inline
+{.pop.} # raises no exceptions
--- a/constantine/arithmetic/finite_fields.nim
+++ b/constantine/arithmetic/finite_fields.nim
@ -27,7 +27,7 @@
 import
  ../primitives,
  ../config/[common, curves],
-  ./bigints, ./montgomery
+  ./bigints, ./limbs_montgomery

 # type
 #   `Fp`*[C: static Curve] = object
@ -64,7 +64,7 @@ func fromBig*[C: static Curve](dst: var Fp[C], src: BigInt) {.noInit.} =
  dst.mres.montyResidue(src, C.Mod.mres, C.getR2modP(), C.getNegInvModWord(), C.canUseNoCarryMontyMul())

 func toBig*(src: Fp): auto {.noInit.} =
-  ## Convert a finite-field element to a BigInt in natral representation
+  ## Convert a finite-field element to a BigInt in natural representation
  var r {.noInit.}: typeof(src.mres)
  r.redc(src.mres, Fp.C.Mod.mres, Fp.C.getNegInvModWord(), Fp.C.canUseNoCarryMontyMul())
  return r
@ -193,17 +193,6 @@ func powUnsafeExponent*(a: var Fp, exponent: BigInt) =
    Fp.C.canUseNoCarryMontyMul()
  )

-func inv*(a: var Fp) =
-  ## Inversion modulo p
-  ## Warning ⚠️ :
-  ##   - This assumes that `Fp` is a prime field
-  const windowSize = 5 # TODO: find best window size for each curves
-  a.mres.montyPowUnsafeExponent(
-    Fp.C.getInvModExponent(),
-    Fp.C.Mod.mres, Fp.C.getMontyOne(),
-    Fp.C.getNegInvModWord(), windowSize,
-    Fp.C.canUseNoCarryMontyMul()
-  )

 # ############################################################
 #
@ -235,12 +224,8 @@ func `*`*(a, b: Fp): Fp {.noInit.} =

 func `*=`*(a: var Fp, b: Fp) =
  ## Multiplication modulo p
-  ##
-  ## Implementation note:
-  ## - This requires a temporary field element
-  ##
-  ## Cost
-  ## Stack: 1 * ModulusBitSize
-  var tmp{.noInit.}: Fp
-  tmp.prod(a, b)
-  a = tmp
+  a.prod(a, b)
+
+func square*(a: var Fp) =
+  ## Squaring modulo p
+  a.mres.montySquare(a.mres, Fp.C.Mod.mres, Fp.C.getNegInvModWord(), Fp.C.canUseNoCarryMontySquare())
--- a/constantine/arithmetic/finite_fields_inversion.nim
+++ b/constantine/arithmetic/finite_fields_inversion.nim
@ -0,0 +1,133 @@
+# Constantine
+# Copyright (c) 2018-2019    Status Research & Development GmbH
+# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
+# Licensed and distributed under either of
+#   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
+#   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
+# at your option. This file may not be copied, modified, or distributed except according to those terms.
+
+import
+  ../config/[common, curves],
+  ./finite_fields
+
+# ############################################################
+#
+#                  Specialized inversions
+#
+# ############################################################
+
+# Field-specific inversion routines
+template repeat(num: int, body: untyped) =
+  for _ in 0 ..< num:
+    body
+
+# Secp256k1
+# ------------------------------------------------------------
+func invmod_addchain(r: var Fp[Secp256k1], a: Fp[Secp256k1]) =
+  ## We invert via Little Fermat's theorem
+  ## a^(-1) ≡ a^(p-2) (mod p)
+  ## with p = "0xFFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFE FFFFFC2F"
+  ## We take advantage of the prime special form to hardcode
+  ## the sequence of squarings and multiplications for the modular exponentiation
+  ##
+  ## See libsecp256k1
+  ##
+  ## The binary representation of (p - 2) has 5 blocks of 1s, with lengths in
+  ## { 1, 2, 22, 223 }. Use an addition chain to calculate 2^n - 1 for each block:
+  ## [1], [2], 3, 6, 9, 11, [22], 44, 88, 176, 220, [223]
+
+  var
+    x2{.noInit.}: Fp[Secp256k1]
+    x3{.noInit.}: Fp[Secp256k1]
+    x6{.noInit.}: Fp[Secp256k1]
+    x9{.noInit.}: Fp[Secp256k1]
+    x11{.noInit.}: Fp[Secp256k1]
+    x22{.noInit.}: Fp[Secp256k1]
+    x44{.noInit.}: Fp[Secp256k1]
+    x88{.noInit.}: Fp[Secp256k1]
+    x176{.noInit.}: Fp[Secp256k1]
+    x220{.noInit.}: Fp[Secp256k1]
+    x223{.noInit.}: Fp[Secp256k1]
+
+  x2.square(a)
+  x2 *= a
+
+  x3.square(x2)
+  x3 *= a
+
+  x6 = x3
+  repeat 3: x6.square()
+  x6 *= x3
+
+  x9 = x6
+  repeat 3: x9.square()
+  x9 *= x3
+
+  x11 = x9
+  repeat 2: x11.square()
+  x11 *= x2
+
+  x22 = x11
+  repeat 11: x22.square()
+  x22 *= x11
+
+  x44 = x22
+  repeat 22: x44.square()
+  x44 *= x22
+
+  x88 = x44
+  repeat 44: x88.square()
+  x88 *= x44
+
+  x176 = x88
+  repeat 88: x88.square()
+  x176 *= x88
+
+  x220 = x176
+  repeat 44: x220.square()
+  x220 *= x44
+
+  x223 = x220
+  repeat 3: x223.square()
+  x223 *= x3
+
+  # The final result is then assembled using a sliding window over the blocks
+  r = x223
+  repeat 23: r.square()
+  r *= x22
+  repeat 5: r.square()
+  r *= a
+  repeat 3: r.square()
+  r *= x2
+  repeat 2: r.square()
+  r *= a
+
+# BN Curves
+# ------------------------------------------------------------
+# Efficient Pairings and ECC for Embedded Systems
+# Thomas Unterluggauer and Erich Wenger
+# https://eprint.iacr.org/2014/800.pdf
+#
+# BN curve field modulus are of the form:
+#   p = 36u^4 + 36u^3 + 24u^2 + 6u + 1
+#
+# We construct the following multiplication-squaring chain
+# a^-1 mod p = a^(p-2) mod p                 (Little Fermat Theorem)
+#            = a^(36 u^4 + 36 u^3 + 24 u^2 + 6u + 1 - 2) mod p
+#            = a^(36 u^4) . a^(36 u^3) . a^(24 u^2) . a^(6u-1) mod p
+#
+# Note: it only works for u positive, in particular BN254 doesn't work :/
+#       Is there a way to only use a^-u or even powers?
+
+# ############################################################
+#
+#                         Dispatch
+#
+# ############################################################
+
+func inv*(r: var Fp, a: Fp) =
+  ## Inversion modulo p
+  # For now we don't activate the addition chain.
+  # Performance is equal to GCD and it does not pass test on 𝔽p2
+  # We need faster squaring/multiplications
+  r.mres.steinsGCD(a.mres, Fp.C.getR2modP(), Fp.C.Mod.mres, Fp.C.getPrimePlus1div2())
--- a/constantine/arithmetic/limbs.nim
+++ b/constantine/arithmetic/limbs.nim
@ -43,6 +43,22 @@ type Limbs*[N: static int] = array[N, Word]
  ##
  ## but for unknown reason, it prevents semchecking `bits`

+debug:
+  import strutils
+
+  func toString*(a: Limbs): string =
+    result = "["
+    result.add $BaseType(a[0]) & " (0x" & toHex(BaseType(a[0])) & ')'
+    for i in 1 ..< a.len:
+      result.add ", "
+      result.add $BaseType(a[i]) & " (0x" & toHex(BaseType(a[i])) & ')'
+    result.add "])"
+
+  func toHex*(a: Limbs): string =
+    result = "0x"
+    for i in countdown(a.len-1, 0):
+      result.add toHex(BaseType(a[i]))
+
 # No exceptions allowed
 {.push raises: [].}

@ -82,20 +98,8 @@ type Limbs*[N: static int] = array[N, Word]
 # as we avoid the parmeter packing/unpacking ceremony at function entry/exit
 # and unrolling overhead is minimal.

-func `==`*(a, b: Limbs): CTBool[Word] =
-  ## Returns true if 2 limbs are equal
-  ## Comparison is constant-time
-  var accum = Zero
-  for i in 0 ..< a.len:
-    accum = accum or (a[i] xor b[i])
-  result = accum.isZero()
-
-func isZero*(a: Limbs): CTBool[Word] =
-  ## Returns true if ``a`` is equal to zero
-  var accum = Zero
-  for i in 0 ..< a.len:
-    accum = accum or a[i]
-  result = accum.isZero()
+# Initialization
+# ------------------------------------------------------------

 func setZero*(a: var Limbs) =
  ## Set ``a`` to 0
@ -107,6 +111,9 @@ func setOne*(a: var Limbs) =
  when a.len > 1:
    zeroMem(a[1].addr, (a.len - 1) * sizeof(Word))

+# Copy
+# ------------------------------------------------------------
+
 func ccopy*(a: var Limbs, b: Limbs, ctl: CTBool[Word]) =
  ## Constant-time conditional copy
  ## If ctl is true: b is copied into a
@ -118,6 +125,65 @@ func ccopy*(a: var Limbs, b: Limbs, ctl: CTBool[Word]) =
  for i in 0 ..< a.len:
    ctl.ccopy(a[i], b[i])

+func cswap*(a, b: var Limbs, ctl: CTBool) =
+  ## Swap ``a`` and ``b`` if ``ctl`` is true
+  ##
+  ## Constant-time:
+  ## Whether ``ctl`` is true or not, the same
+  ## memory accesses are done (unless the compiler tries to be clever)
+
+  var mask = -(Word ctl)
+  for i in 0 ..< a.len:
+    let t = mask and (a[i] xor b[i])
+    a[i] = a[i] xor t
+    b[i] = b[i] xor t
+
+# Comparison
+# ------------------------------------------------------------
+
+func `==`*(a, b: Limbs): CTBool[Word] =
+  ## Returns true if 2 limbs are equal
+  ## Comparison is constant-time
+  var accum = Zero
+  for i in 0 ..< a.len:
+    accum = accum or (a[i] xor b[i])
+  result = accum.isZero()
+
+func `<`*(a, b: Limbs): CTBool[Word] =
+  ## Returns true if a < b
+  ## Comparison is constant-time
+  var diff: Word
+  var borrow: Borrow
+  for i in 0 ..< a.len:
+    subB(borrow, diff, a[i], b[i], borrow)
+
+  result = (CTBool[Word])(borrow)
+
+func `<=`*(a, b: Limbs): CTBool[Word] =
+  ## Returns true if a <= b
+  ## Comparison is constant-time
+  not(b < a)
+
+func isZero*(a: Limbs): CTBool[Word] =
+  ## Returns true if ``a`` is equal to zero
+  var accum = Zero
+  for i in 0 ..< a.len:
+    accum = accum or a[i]
+  result = accum.isZero()
+
+func isOne*(a: Limbs): CTBool[Word] =
+  ## Returns true if ``a`` is equal to one
+  result = a[0] == Word(1)
+  for i in 1 ..< a.len:
+    result = result and a[i].isZero()
+
+func isOdd*(a: Limbs): CTBool[Word] =
+  ## Returns true if a is odd
+  CTBool[Word](a[0] and Word(1))
+
+# Arithmetic
+# ------------------------------------------------------------
+
 func add*(a: var Limbs, b: Limbs): Carry =
  ## Limbs addition
  ## Returns the carry
@ -125,6 +191,14 @@ func add*(a: var Limbs, b: Limbs): Carry =
  for i in 0 ..< a.len:
    addC(result, a[i], a[i], b[i], result)

+func add*(a: var Limbs, w: Word): Carry =
+  ## Limbs addition, add a number that fits in a word
+  ## Returns the carry
+  result = Carry(0)
+  addC(result, a[0], a[0], w, result)
+  for i in 1 ..< a.len:
+    addC(result, a[i], a[i], Zero, result)
+
 func cadd*(a: var Limbs, b: Limbs, ctl: CTBool[Word]): Carry =
  ## Limbs conditional addition
  ## Returns the carry
@ -180,266 +254,46 @@ func diff*(r: var Limbs, a, b: Limbs): Borrow =
  for i in 0 ..< a.len:
    subB(result, r[i], a[i], b[i], result)

-func `<`*(a, b: Limbs): CTBool[Word] =
-  ## Returns true if a < b
-  ## Comparison is constant-time
-  var diff: Word
-  var borrow: Borrow
+func cneg*(a: var Limbs, ctl: CTBool) =
+  ## Conditional negation.
+  ## Negate if ``ctl`` is true
+
+  # Algorithm:
+  # In two-complement representation
+  #  -x <=> not(x) + 1 <=> x xor 0xFF... + 1
+  # and
+  #   x <=> x xor 0x00...<=> x xor 0x00... + 0
+  #
+  # So we need to xor all words and then add 1
+  # The "+1" might carry
+  # So we fuse the 2 steps
+  let mask = -Word(ctl)              # Obtain a 0xFF... or 0x00... mask
+  var carry = Word(ctl)
  for i in 0 ..< a.len:
-    subB(borrow, diff, a[i], b[i], borrow)
+    let t = (a[i] xor mask) + carry  # XOR with mask and add 0x01 or 0x00 respectively
+    carry = Word(t < carry)          # Carry on
+    a[i] = t

-  result = (CTBool[Word])(borrow)
+# Bit manipulation
+# ------------------------------------------------------------

-func `<=`*(a, b: Limbs): CTBool[Word] =
-  ## Returns true if a <= b
-  ## Comparison is constant-time
-  not(b < a)
+func shiftRight*(a: var Limbs, k: int) =
+  ## Shift right by k.
+  ##
+  ## k MUST be less than the base word size (2^32 or 2^64)
+  # We don't reuse shr as this is an in-place operation
+  # Do we need to return the shifted out part?
+  #
+  # Note: for speed, loading a[i] and a[i+1]
+  #       instead of a[i-1] and a[i]
+  #       is probably easier to parallelize for the compiler
+  #       (antidependence WAR vs loop-carried dependence RAW)
+
+  # checkWordShift(k)
+
+  for i in 0 ..< a.len-1:
+    a[i] = (a[i] shr k) or (a[i+1] shl (WordBitWidth - k))
+  a[a.len-1] = a[a.len-1] shr k

 {.pop.} # inline
-
-# ############################################################
-#
-#                   Modular BigInt
-#
-# ############################################################
-#
-# To avoid code-size explosion due to monomorphization
-# and given that reductions are not in hot path in Constantine
-# we use type-erased procedures, instead of instantiating
-# one per number of limbs combination
-
-# Type-erasure
-# ------------------------------------------------------------
-
-type
-  LimbsView = ptr UncheckedArray[Word]
-    ## Type-erased fixed-precision limbs
-    ##
-    ## This type mirrors the Limb type and is used
-    ## for some low-level computation API
-    ## This design
-    ## - avoids code bloat due to generic monomorphization
-    ##   otherwise limbs routines would have an instantiation for
-    ##   each number of words.
-    ##
-    ## Accesses should be done via BigIntViewConst / BigIntViewConst
-    ## to have the compiler check for mutability
-
-  # "Indirection" to enforce pointer types deep immutability
-  LimbsViewConst = distinct LimbsView
-    ## Immutable view into the limbs of a BigInt
-  LimbsViewMut = distinct LimbsView
-    ## Mutable view into a BigInt
-  LimbsViewAny = LimbsViewConst or LimbsViewMut
-
-# Deep Mutability safety
-# ------------------------------------------------------------
-
-template view(a: Limbs): LimbsViewConst =
-  ## Returns a borrowed type-erased immutable view to a bigint
-  LimbsViewConst(cast[LimbsView](a.unsafeAddr))
-
-template view(a: var Limbs): LimbsViewMut =
-  ## Returns a borrowed type-erased mutable view to a mutable bigint
-  LimbsViewMut(cast[LimbsView](a.addr))
-
-template `[]`*(v: LimbsViewConst, limbIdx: int): Word =
-  LimbsView(v)[limbIdx]
-
-template `[]`*(v: LimbsViewMut, limbIdx: int): var Word =
-  LimbsView(v)[limbIdx]
-
-template `[]=`*(v: LimbsViewMut, limbIdx: int, val: Word) =
-  LimbsView(v)[limbIdx] = val
-
-# Type-erased add-sub
-# ------------------------------------------------------------
-
-func cadd(a: LimbsViewMut, b: LimbsViewAny, ctl: CTBool[Word], len: int): Carry =
-  ## Type-erased conditional addition
-  ## Returns the carry
-  ##
-  ## if ctl is true: a <- a + b
-  ## if ctl is false: a <- a
-  ## The carry is always computed whether ctl is true or false
-  ##
-  ## Time and memory accesses are the same whether a copy occurs or not
-  result = Carry(0)
-  var sum: Word
-  for i in 0 ..< len:
-    addC(result, sum, a[i], b[i], result)
-    ctl.ccopy(a[i], sum)
-
-func csub(a: LimbsViewMut, b: LimbsViewAny, ctl: CTBool[Word], len: int): Borrow =
-  ## Type-erased conditional addition
-  ## Returns the borrow
-  ##
-  ## if ctl is true: a <- a - b
-  ## if ctl is false: a <- a
-  ## The borrow is always computed whether ctl is true or false
-  ##
-  ## Time and memory accesses are the same whether a copy occurs or not
-  result = Borrow(0)
-  var diff: Word
-  for i in 0 ..< len:
-    subB(result, diff, a[i], b[i], result)
-    ctl.ccopy(a[i], diff)
-
-# Modular reduction
-# ------------------------------------------------------------
-
-func numWordsFromBits(bits: int): int {.inline.} =
-  const divShiftor = log2(uint32(WordBitWidth))
-  result = (bits + WordBitWidth - 1) shr divShiftor
-
-func shlAddMod_estimate(a: LimbsViewMut, aLen: int,
-                        c: Word, M: LimbsViewConst, mBits: int
-                      ): tuple[neg, tooBig: CTBool[Word]] =
-  ## Estimate a <- a shl 2^w + c (mod M)
-  ##
-  ## with w the base word width, usually 32 on 32-bit platforms and 64 on 64-bit platforms
-  ##
-  ## Updates ``a`` and returns ``neg`` and ``tooBig``
-  ## If ``neg``, the estimate in ``a`` is negative and ``M`` must be added to it.
-  ## If ``tooBig``, the estimate in ``a`` overflowed and ``M`` must be substracted from it.
-
-  # Aliases
-  # ----------------------------------------------------------------------
-  let MLen = numWordsFromBits(mBits)
-
-  # Captures aLen and MLen
-  template `[]`(v: untyped, limbIdxFromEnd: BackwardsIndex): Word {.dirty.}=
-    v[`v Len` - limbIdxFromEnd.int]
-
-  # ----------------------------------------------------------------------
-                                                          # Assuming 64-bit words
-  let hi = a[^1]                                          # Save the high word to detect carries
-  let R = mBits and (WordBitWidth - 1)                    # R = mBits mod 64
-
-  var a0, a1, m0: Word
-  if R == 0:                                              # If the number of mBits is a multiple of 64
-    a0 = a[^1]                                            #
-    moveMem(a[1].addr, a[0].addr, (aLen-1) * Word.sizeof) # we can just shift words
-    a[0] = c                                              # and replace the first one by c
-    a1 = a[^1]
-    m0 = M[^1]
-  else:                                                   # Else: need to deal with partial word shifts at the edge.
-    a0 = (a[^1] shl (WordBitWidth-R)) or (a[^2] shr R)
-    moveMem(a[1].addr, a[0].addr, (aLen-1) * Word.sizeof)
-    a[0] = c
-    a1 = (a[^1] shl (WordBitWidth-R)) or (a[^2] shr R)
-    m0 = (M[^1] shl (WordBitWidth-R)) or (M[^2] shr R)
-
-  # m0 has its high bit set. (a0, a1)/p0 fits in a limb.
-  # Get a quotient q, at most we will be 2 iterations off
-  # from the true quotient
-  var q, r: Word
-  unsafeDiv2n1n(q, r, a0, a1, m0)                # Estimate quotient
-  q = mux(                                       # If n_hi == divisor
-        a0 == m0, MaxWord,                       # Quotient == MaxWord (0b1111...1111)
-        mux(
-          q.isZero, Zero,                        # elif q == 0, true quotient = 0
-          q - One                                # else instead of being of by 0, 1 or 2
-        )                                        # we returning q-1 to be off by -1, 0 or 1
-      )
-
-  # Now substract a*2^64 - q*p
-  var carry = Zero
-  var over_p = CtTrue                            # Track if quotient greater than the modulus
-
-  for i in 0 ..< MLen:
-    var qp_lo: Word
-
-    block: # q*p
-      # q * p + carry (doubleword) carry from previous limb
-      muladd1(carry, qp_lo, q, M[i], Word carry)
-
-    block: # a*2^64 - q*p
-      var borrow: Borrow
-      subB(borrow, a[i], a[i], qp_lo, Borrow(0))
-      carry += Word(borrow) # Adjust if borrow
-
-    over_p = mux(
-              a[i] == M[i], over_p,
-              a[i] > M[i]
-            )
-
-  # Fix quotient, the true quotient is either q-1, q or q+1
-  #
-  # if carry < q or carry == q and over_p we must do "a -= p"
-  # if carry > hi (negative result) we must do "a += p"
-
-  result.neg = Word(carry) > hi
-  result.tooBig = not(result.neg) and (over_p or (Word(carry) < hi))
-
-func shlAddMod(a: LimbsViewMut, aLen: int,
-               c: Word, M: LimbsViewConst, mBits: int) =
-  ## Fused modular left-shift + add
-  ## Shift input `a` by a word and add `c` modulo `M`
-  ##
-  ## With a word W = 2^WordBitSize and a modulus M
-  ## Does a <- a * W + c (mod M)
-  ##
-  ## The modulus `M` most-significant bit at `mBits` MUST be set.
-  if mBits <= WordBitWidth:
-    # If M fits in a single limb
-
-    # We normalize M with R so that the MSB is set
-    # And normalize (a * 2^64 + c) by R as well to maintain the result
-    # This ensures that (a0, a1)/p0 fits in a limb.
-    let R = mBits and (WordBitWidth - 1)
-
-    # (hi, lo) = a * 2^64 + c
-    let hi = (a[0] shl (WordBitWidth-R)) or (c shr R)
-    let lo = c shl (WordBitWidth-R)
-    let m0 = M[0] shl (WordBitWidth-R)
-
-    var q, r: Word
-    unsafeDiv2n1n(q, r, hi, lo, m0)  # (hi, lo) mod M
-
-    a[0] = r shr (WordBitWidth-R)
-
-  else:
-    ## Multiple limbs
-    let (neg, tooBig) = shlAddMod_estimate(a, aLen, c, M, mBits)
-    discard a.cadd(M, ctl = neg, aLen)
-    discard a.csub(M, ctl = tooBig, aLen)
-
-func reduce(r: LimbsViewMut,
-            a: LimbsViewAny, aBits: int,
-            M: LimbsViewConst, mBits: int) =
-  ## Reduce `a` modulo `M` and store the result in `r`
-  let aLen = numWordsFromBits(aBits)
-  let mLen = numWordsFromBits(mBits)
-  let rLen = mLen
-
-  if aBits < mBits:
-    # if a uses less bits than the modulus,
-    # it is guaranteed < modulus.
-    # This relies on the precondition that the modulus uses all declared bits
-    copyMem(r[0].addr, a[0].unsafeAddr, aLen * sizeof(Word))
-    for i in aLen ..< mLen:
-      r[i] = Zero
-  else:
-    # a length i at least equal to the modulus.
-    # we can copy modulus.limbs-1 words
-    # and modular shift-left-add the rest
-    let aOffset = aLen - mLen
-    copyMem(r[0].addr, a[aOffset+1].unsafeAddr, (mLen-1) * sizeof(Word))
-    r[rLen - 1] = Zero
-    # Now shift-left the copied words while adding the new word modulo M
-    for i in countdown(aOffset, 0):
-      shlAddMod(r, rLen, a[i], M, mBits)
-
-func reduce*[aLen, mLen](r: var Limbs[mLen],
-                         a: Limbs[aLen], aBits: static int,
-                         M: Limbs[mLen], mBits: static int
-                        ) {.inline.} =
-  ## Reduce `a` modulo `M` and store the result in `r`
-  ##
-  ## Warning ⚠: At the moment this is NOT constant-time
-  ##            as it relies on hardware division.
-  # This is implemented via type-erased indirection to avoid
-  # a significant amount of code duplication if instantiated for
-  # varying bitwidth.
-  reduce(r.view(), a.view(), aBits, M.view(), mBits)
+{.pop.} # raises no exceptions
--- a/constantine/arithmetic/limbs_modular.nim
+++ b/constantine/arithmetic/limbs_modular.nim
@ -0,0 +1,371 @@
+# Constantine
+# Copyright (c) 2018-2019    Status Research & Development GmbH
+# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
+# Licensed and distributed under either of
+#   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
+#   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
+# at your option. This file may not be copied, modified, or distributed except according to those terms.
+
+import
+  ../config/common,
+  ../primitives,
+  ./limbs
+
+# No exceptions allowed
+{.push raises: [].}
+
+# ############################################################
+#
+#                    Modular inversion
+#
+# ############################################################
+
+# Generic (odd-only modulus)
+# ------------------------------------------------------------
+# Algorithm by Niels Möller,
+# a modified version of Stein's Algorithm (binary Extended Euclid GCD)
+#
+# Algorithm 5 in
+# Fast Software Polynomial Multiplication on ARM Processors Using the NEON Engine
+# Danilo Camara, Conrado P. L. Gouvea, Julio Lopez, and Ricardo Dahab
+# https://link.springer.com/content/pdf/10.1007%2F978-3-642-40588-4_10.pdf
+#
+# Input: integer x, odd integer n, x < n
+# Output: x−1 (mod n)
+# 1:   function ModInv(x, n)
+# 2:   (a, b, u, v) ← (x, n, 1, 1)
+# 3:   ℓ ← ⌊log2 n⌋ + 1            ⮚ number of bits in n
+# 4:   for i ← 0 to 2ℓ − 1 do
+# 5:     odd ← a & 1
+# 6:     if odd and a ≥ b then
+# 7:       a ← a − b
+# 8:     else if odd and a < b then
+# 9:       (a, b, u, v) ← (b − a, a, v, u)
+# 10:    a ← a >> 1
+# 11:    if odd then u ← u − v
+# 12:    if u < 0 then u ← u + n
+# 13:    if u & 1 then u ← u + n
+# 14:    u ← u >> 1
+# 15:  return v
+#
+# Warning ⚠️: v should be 0 at initialization
+#
+# We modify it to return F . a^-1
+# So that we can pass an adjustment factor F
+# And directly compute modular division or Montgomery inversion
+
+func steinsGCD*(v: var Limbs, a: Limbs, F, M: Limbs, bits: int, mp1div2: Limbs) =
+  ## Compute F multiplied the modular inverse of ``a`` modulo M
+  ## r ≡ F . a^-1 (mod M)
+  ##
+  ## M MUST be odd, M does not need to be prime.
+  ## ``a`` MUST be less than M.
+  ##
+  ## No information about ``a`` in particular its actual length in bits is leaked.
+  ##
+  ## This takes (M+1)/2 (mp1div2) as a precomputed parameter as a slight optimization
+  ## in stack size and speed.
+
+  # Ideally we need registers for a, b, u, v
+  # but:
+  #   - Even with ~256-bit primes, that's 4 limbs = 4*4 => 16 registers
+  #   - x86_64 only has 16 general purposes registers
+  #   - Registers are needed for the loop counter and comparison results
+  #   - CMOV is reg <- RM so can move registers/memory into registers
+  #     but cannot move into memory.
+  # so we choose to keep "v" from the algorithm in memory as `r`
+
+  # TODO: the inlining of primitives like `csub` is bad for codesize
+  #       but there is a 80% slowdown without it.
+  # TODO: The `cmov` in `cadd` and `csub` always retest the condition
+  #       which is probably costly here given how many we have.
+
+  var a = a
+  var b = M
+  var u = F
+  v.setZero()
+
+  for i in 0 ..< 2 * bits:
+    debug: doAssert bool(b.isOdd)
+    let isOddA = a.isOdd()
+
+    # if isOddA: a -= b
+    let aLessThanB = isOddA and (CTBool[Word]) a.csub(b, isOddA)
+    # if a < b and the sub was processed
+    # in that case, b <- a = a - b + b
+    discard b.cadd(a, aLessThanB)
+    # and a <- -new_a = (b-a)
+    a.cneg(aLessThanB)
+    debug: doAssert not bool(a.isOdd)
+    a.shiftRight(1)
+
+    # Swap u and v is a < b
+    u.cswap(v, aLessThanB)
+    # if isOddA: u -= v (mod M)
+    let neg = isOddA and (CTBool[Word]) u.csub(v, isOddA)
+    let corrected = u.cadd(M, neg)
+
+    let isOddU = u.isOdd()
+    # if u.isOdd:
+    #   u += n
+    # u = u shr 1
+    #
+    # Warning ⚠️: u += n will overflow the BigInt
+    #   and we might lose a bit on the next shift
+    #   Instead we shift first and then add hald the modulus rounded up
+    u.shiftRight(1)
+    let carry = u.cadd(mp1div2, isOddU)
+    debug: doAssert not carry.bool
+
+  debug:
+    doAssert bool a.isZero()
+    doAssert bool b.isOne() # GCD always exist if a and M are relatively prime
+
+# ############################################################
+#
+#                   Modular Reduction
+#
+# ############################################################
+#
+# To avoid code-size explosion due to monomorphization
+# and given that reductions are not in hot path in Constantine
+# we use type-erased procedures, instead of instantiating
+# one per number of limbs combination
+
+# Type-erasure
+# ------------------------------------------------------------
+
+type
+  LimbsView = ptr UncheckedArray[Word]
+    ## Type-erased fixed-precision limbs
+    ##
+    ## This type mirrors the Limb type and is used
+    ## for some low-level computation API
+    ## This design
+    ## - avoids code bloat due to generic monomorphization
+    ##   otherwise limbs routines would have an instantiation for
+    ##   each number of words.
+    ##
+    ## Accesses should be done via BigIntViewConst / BigIntViewConst
+    ## to have the compiler check for mutability
+
+  # "Indirection" to enforce pointer types deep immutability
+  LimbsViewConst = distinct LimbsView
+    ## Immutable view into the limbs of a BigInt
+  LimbsViewMut = distinct LimbsView
+    ## Mutable view into a BigInt
+  LimbsViewAny = LimbsViewConst or LimbsViewMut
+
+# Deep Mutability safety
+# ------------------------------------------------------------
+
+template view(a: Limbs): LimbsViewConst =
+  ## Returns a borrowed type-erased immutable view to a bigint
+  LimbsViewConst(cast[LimbsView](a.unsafeAddr))
+
+template view(a: var Limbs): LimbsViewMut =
+  ## Returns a borrowed type-erased mutable view to a mutable bigint
+  LimbsViewMut(cast[LimbsView](a.addr))
+
+template `[]`*(v: LimbsViewConst, limbIdx: int): Word =
+  LimbsView(v)[limbIdx]
+
+template `[]`*(v: LimbsViewMut, limbIdx: int): var Word =
+  LimbsView(v)[limbIdx]
+
+template `[]=`*(v: LimbsViewMut, limbIdx: int, val: Word) =
+  LimbsView(v)[limbIdx] = val
+
+# Type-erased add-sub
+# ------------------------------------------------------------
+
+func cadd(a: LimbsViewMut, b: LimbsViewAny, ctl: CTBool[Word], len: int): Carry =
+  ## Type-erased conditional addition
+  ## Returns the carry
+  ##
+  ## if ctl is true: a <- a + b
+  ## if ctl is false: a <- a
+  ## The carry is always computed whether ctl is true or false
+  ##
+  ## Time and memory accesses are the same whether a copy occurs or not
+  result = Carry(0)
+  var sum: Word
+  for i in 0 ..< len:
+    addC(result, sum, a[i], b[i], result)
+    ctl.ccopy(a[i], sum)
+
+func csub(a: LimbsViewMut, b: LimbsViewAny, ctl: CTBool[Word], len: int): Borrow =
+  ## Type-erased conditional addition
+  ## Returns the borrow
+  ##
+  ## if ctl is true: a <- a - b
+  ## if ctl is false: a <- a
+  ## The borrow is always computed whether ctl is true or false
+  ##
+  ## Time and memory accesses are the same whether a copy occurs or not
+  result = Borrow(0)
+  var diff: Word
+  for i in 0 ..< len:
+    subB(result, diff, a[i], b[i], result)
+    ctl.ccopy(a[i], diff)
+
+# Modular reduction
+# ------------------------------------------------------------
+
+func numWordsFromBits(bits: int): int {.inline.} =
+  const divShiftor = log2(uint32(WordBitWidth))
+  result = (bits + WordBitWidth - 1) shr divShiftor
+
+func shlAddMod_estimate(a: LimbsViewMut, aLen: int,
+                        c: Word, M: LimbsViewConst, mBits: int
+                      ): tuple[neg, tooBig: CTBool[Word]] =
+  ## Estimate a <- a shl 2^w + c (mod M)
+  ##
+  ## with w the base word width, usually 32 on 32-bit platforms and 64 on 64-bit platforms
+  ##
+  ## Updates ``a`` and returns ``neg`` and ``tooBig``
+  ## If ``neg``, the estimate in ``a`` is negative and ``M`` must be added to it.
+  ## If ``tooBig``, the estimate in ``a`` overflowed and ``M`` must be substracted from it.
+
+  # Aliases
+  # ----------------------------------------------------------------------
+  let MLen = numWordsFromBits(mBits)
+
+  # Captures aLen and MLen
+  template `[]`(v: untyped, limbIdxFromEnd: BackwardsIndex): Word {.dirty.}=
+    v[`v Len` - limbIdxFromEnd.int]
+
+  # ----------------------------------------------------------------------
+                                                          # Assuming 64-bit words
+  let hi = a[^1]                                          # Save the high word to detect carries
+  let R = mBits and (WordBitWidth - 1)                    # R = mBits mod 64
+
+  var a0, a1, m0: Word
+  if R == 0:                                              # If the number of mBits is a multiple of 64
+    a0 = a[^1]                                            #
+    moveMem(a[1].addr, a[0].addr, (aLen-1) * Word.sizeof) # we can just shift words
+    a[0] = c                                              # and replace the first one by c
+    a1 = a[^1]
+    m0 = M[^1]
+  else:                                                   # Else: need to deal with partial word shifts at the edge.
+    a0 = (a[^1] shl (WordBitWidth-R)) or (a[^2] shr R)
+    moveMem(a[1].addr, a[0].addr, (aLen-1) * Word.sizeof)
+    a[0] = c
+    a1 = (a[^1] shl (WordBitWidth-R)) or (a[^2] shr R)
+    m0 = (M[^1] shl (WordBitWidth-R)) or (M[^2] shr R)
+
+  # m0 has its high bit set. (a0, a1)/p0 fits in a limb.
+  # Get a quotient q, at most we will be 2 iterations off
+  # from the true quotient
+  var q, r: Word
+  unsafeDiv2n1n(q, r, a0, a1, m0)                # Estimate quotient
+  q = mux(                                       # If n_hi == divisor
+        a0 == m0, MaxWord,                       # Quotient == MaxWord (0b1111...1111)
+        mux(
+          q.isZero, Zero,                        # elif q == 0, true quotient = 0
+          q - One                                # else instead of being of by 0, 1 or 2
+        )                                        # we returning q-1 to be off by -1, 0 or 1
+      )
+
+  # Now substract a*2^64 - q*p
+  var carry = Zero
+  var over_p = CtTrue                            # Track if quotient greater than the modulus
+
+  for i in 0 ..< MLen:
+    var qp_lo: Word
+
+    block: # q*p
+      # q * p + carry (doubleword) carry from previous limb
+      muladd1(carry, qp_lo, q, M[i], Word carry)
+
+    block: # a*2^64 - q*p
+      var borrow: Borrow
+      subB(borrow, a[i], a[i], qp_lo, Borrow(0))
+      carry += Word(borrow) # Adjust if borrow
+
+    over_p = mux(
+              a[i] == M[i], over_p,
+              a[i] > M[i]
+            )
+
+  # Fix quotient, the true quotient is either q-1, q or q+1
+  #
+  # if carry < q or carry == q and over_p we must do "a -= p"
+  # if carry > hi (negative result) we must do "a += p"
+
+  result.neg = Word(carry) > hi
+  result.tooBig = not(result.neg) and (over_p or (Word(carry) < hi))
+
+func shlAddMod(a: LimbsViewMut, aLen: int,
+               c: Word, M: LimbsViewConst, mBits: int) =
+  ## Fused modular left-shift + add
+  ## Shift input `a` by a word and add `c` modulo `M`
+  ##
+  ## With a word W = 2^WordBitSize and a modulus M
+  ## Does a <- a * W + c (mod M)
+  ##
+  ## The modulus `M` most-significant bit at `mBits` MUST be set.
+  if mBits <= WordBitWidth:
+    # If M fits in a single limb
+
+    # We normalize M with R so that the MSB is set
+    # And normalize (a * 2^64 + c) by R as well to maintain the result
+    # This ensures that (a0, a1)/p0 fits in a limb.
+    let R = mBits and (WordBitWidth - 1)
+
+    # (hi, lo) = a * 2^64 + c
+    let hi = (a[0] shl (WordBitWidth-R)) or (c shr R)
+    let lo = c shl (WordBitWidth-R)
+    let m0 = M[0] shl (WordBitWidth-R)
+
+    var q, r: Word
+    unsafeDiv2n1n(q, r, hi, lo, m0)  # (hi, lo) mod M
+
+    a[0] = r shr (WordBitWidth-R)
+
+  else:
+    ## Multiple limbs
+    let (neg, tooBig) = shlAddMod_estimate(a, aLen, c, M, mBits)
+    discard a.cadd(M, ctl = neg, aLen)
+    discard a.csub(M, ctl = tooBig, aLen)
+
+func reduce(r: LimbsViewMut,
+            a: LimbsViewAny, aBits: int,
+            M: LimbsViewConst, mBits: int) =
+  ## Reduce `a` modulo `M` and store the result in `r`
+  let aLen = numWordsFromBits(aBits)
+  let mLen = numWordsFromBits(mBits)
+  let rLen = mLen
+
+  if aBits < mBits:
+    # if a uses less bits than the modulus,
+    # it is guaranteed < modulus.
+    # This relies on the precondition that the modulus uses all declared bits
+    copyMem(r[0].addr, a[0].unsafeAddr, aLen * sizeof(Word))
+    for i in aLen ..< mLen:
+      r[i] = Zero
+  else:
+    # a length i at least equal to the modulus.
+    # we can copy modulus.limbs-1 words
+    # and modular shift-left-add the rest
+    let aOffset = aLen - mLen
+    copyMem(r[0].addr, a[aOffset+1].unsafeAddr, (mLen-1) * sizeof(Word))
+    r[rLen - 1] = Zero
+    # Now shift-left the copied words while adding the new word modulo M
+    for i in countdown(aOffset, 0):
+      shlAddMod(r, rLen, a[i], M, mBits)
+
+func reduce*[aLen, mLen](r: var Limbs[mLen],
+                         a: Limbs[aLen], aBits: static int,
+                         M: Limbs[mLen], mBits: static int
+                        ) {.inline.} =
+  ## Reduce `a` modulo `M` and store the result in `r`
+  ##
+  ## Warning ⚠: At the moment this is NOT constant-time
+  ##            as it relies on hardware division.
+  # This is implemented via type-erased indirection to avoid
+  # a significant amount of code duplication if instantiated for
+  # varying bitwidth.
+  reduce(r.view(), a.view(), aBits, M.view(), mBits)
+
+{.pop.} # raises no exceptions
--- a/constantine/arithmetic/limbs_montgomery.nim
+++ b/constantine/arithmetic/limbs_montgomery.nim
@ -21,7 +21,7 @@ import
 # Note: Montgomery multiplications and squarings are the biggest bottlenecks
 #       of an elliptic curve library, asymptotically 100% of the costly algorithms:
 #       - field exponentiation
-#       - field inversion
+#       - field inversion via Little Fermat
 #       - extension towers multiplication, squarings, inversion
 #       - elliptic curve point addition
 #       - elliptic curve point doubling
@ -80,6 +80,9 @@ macro staticFor(idx: untyped{nkIdent}, start, stopEx: static int, body: untyped)
      body.replaceNodes(idx, newLit i)
    )

+# No exceptions allowed
+{.push raises: [].}
+
 # Implementation
 # ------------------------------------------------------------
 # Note: the low-level implementations should not use static parameter
@ -565,3 +568,5 @@ func montyPowUnsafeExponent*(
        # scratchspace[1] holds the original `a`
        scratchspace[0].montyMul(a, scratchspace[1], M, negInvModWord, canUseNoCarryMontyMul)
      a = scratchspace[0]
+
+{.pop.} # raises no exceptions
--- a/constantine/arithmetic/precomputed.nim
+++ b/constantine/arithmetic/precomputed.nim
@ -67,6 +67,19 @@ func subB(bOut, diff: var BaseType, a, b, bIn: BaseType) =
  bOut = BaseType(noBorrowHi == 0)
  diff = merge(rHi, rLo)

+func add(a: var BigInt, w: BaseType): bool =
+  ## Limbs addition, add a number that fits in a word
+  ## Returns the carry
+  var carry, sum: BaseType
+  addC(carry, sum, BaseType(a.limbs[0]), w, carry)
+  a.limbs[0] = Word(sum)
+  for i in 1 ..< a.limbs.len:
+    let ai = BaseType(a.limbs[i])
+    addC(carry, sum, ai, 0, carry)
+    a.limbs[i] = Word(sum)
+
+  result = bool(carry)
+
 func dbl(a: var BigInt): bool =
  ## In-place multiprecision double
  ##   a -> 2a
@ -246,9 +259,24 @@ func primeMinus2_BE*[bits: static int](
     ): array[(bits+7) div 8, byte] {.noInit.} =
  ## Compute an input prime-2
  ## and return the result as a canonical byte array / octet string
-  ## For use to precompute modular inverse exponent.
+  ## For use to precompute modular inverse exponent
+  ## when using inversion by Little Fermat Theorem a^-1 = a^(p-2) mod p

  var tmp = P
  discard tmp.csub(BigInt[bits].fromRawUint([byte 2], bigEndian), true)

  result.exportRawUint(tmp, bigEndian)
+
+func primePlus1div2*(P: BigInt): BigInt =
+  ## Compute (P+1)/2, assumes P is odd
+  ## For use in constant-time modular inversion
+  checkOddModulus(P)
+
+  # (P+1)/2 = P/2 + 1 if P is odd,
+  # this avoids overflowing if the prime uses all bits
+  # i.e. in the form (2^64)^w - 1 or (2^32)^w - 1
+
+  result = P
+  result.shiftRight(1)
+  let carry = result.add(1)
+  doAssert not carry
--- a/constantine/config/curves.nim
+++ b/constantine/config/curves.nim
@ -13,6 +13,7 @@ import
  ./curves_parser, ./common,
  ../arithmetic/[precomputed, bigints]

+{.push used.}

 # ############################################################
 #
@ -40,27 +41,23 @@ import
 #   which returns the field modulus of the curve
 when not defined(testingCurves):
  declareCurves:
-    # Barreto-Naehrig curve, pairing-friendly, Prime 254 bit, ~100-bit security
-    # https://eprint.iacr.org/2013/879.pdf
-    # Usage: Zero-Knowledge Proofs / zkSNARKs in ZCash and Ethereum 1
-    #        https://eips.ethereum.org/EIPS/eip-196
+    curve P224: # NIST P-224
+      bitsize: 224
+      modulus: "0xffffffff_ffffffff_ffffffff_ffffffff_00000000_00000000_00000001"
    curve BN254:
      bitsize: 254
      modulus: "0x30644e72e131a029b85045b68181585d97816a916871ca8d3c208c16d87cfd47"
      # Equation: Y^2 = X^3 + 3
-    curve BLS12_381:
-      bitsize: 381
-      modulus: "0x1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffffffffaaab"
-      # Equation: y^2 = x^3 + 4
-    curve P224: # NIST P-224
-      bitsize: 224
-      modulus: "0xffffffff_ffffffff_ffffffff_ffffffff_00000000_00000000_00000001"
    curve P256: # secp256r1 / NIST P-256
      bitsize: 256
      modulus: "0xffffffff00000001000000000000000000000000ffffffffffffffffffffffff"
    curve Secp256k1: # Bitcoin curve
      bitsize: 256
      modulus: "0xFFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFE FFFFFC2F"
+    curve BLS12_381:
+      bitsize: 381
+      modulus: "0x1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffffffffaaab"
+      # Equation: y^2 = x^3 + 4
 else:
  # Fake curve for testing field arithmetic
  declareCurves:
@ -76,12 +73,75 @@ else:
    curve P224: # NIST P-224
      bitsize: 224
      modulus: "0xffffffff_ffffffff_ffffffff_ffffffff_00000000_00000000_00000001"
+    curve BN254: # Zero-Knowledge proofs curve (SNARKS, STARKS)
+      bitsize: 254
+      modulus: "0x30644e72e131a029b85045b68181585d97816a916871ca8d3c208c16d87cfd47"
+      # Equation: Y^2 = X^3 + 3
    curve P256: # secp256r1 / NIST P-256
      bitsize: 256
      modulus: "0xffffffff00000001000000000000000000000000ffffffffffffffffffffffff"
+    curve Secp256k1: # Bitcoin curve
+      bitsize: 256
+      modulus: "0xFFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFE FFFFFC2F"
    curve BLS12_381:
      bitsize: 381
      modulus: "0x1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffffffffaaab"
+      # Equation: y^2 = x^3 + 4
+
+# ############################################################
+#
+#                   Curve Families
+#
+# ############################################################
+type CurveFamily = enum
+  None
+  BN   # Barreto-Naehrig
+  BLS  # Barreto-Lynn-Scott
+
+func family*(curve: Curve): CurveFamily =
+  case curve
+  of BN254:
+    BN
+  of BLS12_381:
+    BLS
+  else:
+    None
+
+# ############################################################
+#
+#              Curve Specific Parameters
+#
+# ############################################################
+#
+# In the form CurveXXX_ParameterName where CurveXXX is the curve name + number of bits
+# of the field modulus
+
+# BN Curves
+# ------------------------------------------------------------
+# See https://tools.ietf.org/id/draft-yonezawa-pairing-friendly-curves-00.html
+#
+# The prime p and order r are primes and of the form
+# p = 36u^4 + 36u^3 + 24u^2 + 6u + 1
+# r = 36u^4 + 36u^3 + 18u^2 + 6u + 1
+#
+# https://eprint.iacr.org/2010/429.pdf
+# https://eprint.iacr.org/2013/879.pdf
+# Usage: Zero-Knowledge Proofs / zkSNARKs in ZCash and Ethereum 1
+#        https://eips.ethereum.org/EIPS/eip-196
+
+# BLS Curves
+# ------------------------------------------------------------
+# See https://tools.ietf.org/id/draft-yonezawa-pairing-friendly-curves-00.html
+#
+# BLS12 curves
+#   The prime p and order r are primes and of the form
+#   p = (u - 1)^2 (u^4 - u^2 + 1)/3 + u
+#   r = u^4 - u^2 + 1
+#
+# BLS48 curves
+#   The prime p and order r are primes and of the form
+#   p = (u - 1)^2 (u^16 - u^8 + 1)/3 + u
+#   r = u^16 - u^8 + 1

 # ############################################################
 #
@ -147,6 +207,7 @@ macro genMontyMagics(T: typed): untyped =
        )
      )
    )
+
    # const MyCurve_NegInvModWord = negInvModWord(MyCurve_Modulus)
    result.add newConstStmt(
      ident($curve & "_NegInvModWord"), newCall(
@ -177,6 +238,16 @@ macro genMontyMagics(T: typed): untyped =
        )
      )
    )
+    # const MyCurve_PrimePlus1div2 = primePlus1div2(MyCurve_Modulus)
+    result.add newConstStmt(
+      ident($curve & "_PrimePlus1div2"), newCall(
+        bindSym"primePlus1div2",
+        nnkDotExpr.newTree(
+          bindSym($curve & "_Modulus"),
+          ident"mres"
+        )
+      )
+    )

  # echo result.toStrLit

@ -208,6 +279,10 @@ macro getInvModExponent*(C: static Curve): untyped =
  ## Get modular inversion exponent (Modulus-2 in canonical representation)
  result = bindSym($C & "_InvModExponent")

+macro getPrimePlus1div2*(C: static Curve): untyped =
+  ## Get (P+1) / 2 for an odd prime
+  result = bindSym($C & "_PrimePlus1div2")
+
 # ############################################################
 #
 #                Debug info printed at compile-time
@ -234,5 +309,5 @@ macro debugConsts(): untyped =
  result.add quote do:
    echo "----------------------------------------------------------------------------"

-debug:
-  debugConsts()
+# debug:
+#   debugConsts()
--- a/constantine/tower_field_extensions/abelian_groups.nim
+++ b/constantine/tower_field_extensions/abelian_groups.nim
@ -7,7 +7,7 @@
 # at your option. This file may not be copied, modified, or distributed except according to those terms.

 import
-  ../arithmetic/finite_fields,
+  ../arithmetic,
  ../config/common,
  ../primitives

--- a/constantine/tower_field_extensions/fp2_complex.nim
+++ b/constantine/tower_field_extensions/fp2_complex.nim
@ -44,7 +44,7 @@
 # TODO: Clarify some assumptions about the prime p ≡ 3 (mod 4)

 import
-  ../arithmetic/finite_fields,
+  ../arithmetic,
  ../config/curves,
  ./abelian_groups

@ -151,7 +151,7 @@ func inv*(r: var Fp2, a: Fp2) =
  t0 += t1             # t0 = a0² + a1² (the norm / squared magnitude of a)

  # [1 Inv, 2 Sqr, 1 Add]
-  inv(t0)              # t0 = 1 / (a0² + a1²)
+  t0.inv(t0)           # t0 = 1 / (a0² + a1²)

  # [1 Inv, 2 Mul, 2 Sqr, 1 Add, 1 Neg]
  r.c0.prod(a.c0, t0)  # r0 = a0 / (a0² + a1²)
--- a/constantine/tower_field_extensions/fp2_sqrt_minus2.nim
+++ b/constantine/tower_field_extensions/fp2_sqrt_minus2.nim
@ -33,7 +33,7 @@


 import
-  ../arithmetic/finite_fields,
+  ../arithmetic,
  ../config/curves,
  ./abelian_groups

--- a/constantine/tower_field_extensions/fp2_sqrt_minus5.nim
+++ b/constantine/tower_field_extensions/fp2_sqrt_minus5.nim
@ -32,7 +32,7 @@
 #     https://eprint.iacr.org/2010/354

 import
-  ../arithmetic/finite_fields,
+  ../arithmetic,
  ../config/curves,
  ./abelian_groups

--- a/formal_verification/bls12_381_q_64.nim
+++ b/formal_verification/bls12_381_q_64.nim
@ -128,7 +128,7 @@ func fromHex(output: var openArray[byte], hexStr: string, order: static[Endianne
 # -------------------------------------------------------------------------

 when isMainModule:
-  import random, std/monotimes, times, strformat, ./timers
+  import random, std/monotimes, times, strformat, ../helpers/timers

  const Iters = 1_000_000
  const InvIters = 1000
--- a/formal_verification/timers.nim
+++ b/formal_verification/timers.nim
@ -1,49 +0,0 @@
-when defined(i386) or defined(amd64):
-  # From Linux
-  #
-  # The RDTSC instruction is not ordered relative to memory
-  # access.  The Intel SDM and the AMD APM are both vague on this
-  # point, but empirically an RDTSC instruction can be
-  # speculatively executed before prior loads.  An RDTSC
-  # immediately after an appropriate barrier appears to be
-  # ordered as a normal load, that is, it provides the same
-  # ordering guarantees as reading from a global memory location
-  # that some other imaginary CPU is updating continuously with a
-  # time stamp.
-  #
-  # From Intel SDM
-  # https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/ia-32-ia-64-benchmark-code-execution-paper.pdf
-  when not defined(vcc):
-    when defined(amd64):
-      proc getTicks*(): int64 {.inline.} =
-        var lo, hi: int64
-        # TODO: Provide a compile-time flag for RDTSCP support
-        #       and use it instead of lfence + RDTSC
-        {.emit: """asm volatile(
-          "lfence\n"
-          "rdtsc\n"
-          : "=a"(`lo`), "=d"(`hi`)
-          :
-          : "memory"
-        );""".}
-        return (hi shl 32) or lo
-    else:
-      proc getTicks*(): int64 {.inline.} =
-        # TODO: Provide a compile-time flag for RDTSCP support
-        #       and use it instead of lfence + RDTSC
-        {.emit: """asm volatile(
-          "lfence\n"
-          "rdtsc\n"
-          : "=a"(`result`)
-          :
-          : "memory"
-        );""".}
-  else:
-    proc rdtsc(): int64 {.sideeffect, importc: "__rdtsc", header: "<intrin.h>".}
-    proc lfence() {.importc: "__mm_lfence", header: "<intrin.h>".}
-
-    proc getTicks*(): int64 {.inline.} =
-      lfence()
-      return rdtsc()
-else:
-  {.error: "getticks is not supported on this CPU architecture".}
--- a/helpers/README.md
+++ b/helpers/README.md
@ -0,0 +1,3 @@
+# Helpers, utilities, tools, miscellaneous
+
+This folder holds helper functions that are not part of Constantine but facilitates testing and benchmarking.
--- a/helpers/prng.nim
+++ b/helpers/prng.nim
@ -98,5 +98,5 @@ func random[T](rng: var RngState, a: var T, C: static Curve) {.noInit.}=
      rng.random(field, C)

 func random*(rng: var RngState, T: typedesc): T =
-  ## Create a random Field or Extension FIeld Element
+  ## Create a random Field or Extension Field Element
  rng.random(result, T.C)
--- a/helpers/static_for.nim
+++ b/helpers/static_for.nim
@ -0,0 +1,28 @@
+import macros
+
+proc replaceNodes(ast: NimNode, what: NimNode, by: NimNode): NimNode =
+  # Replace "what" ident node by "by"
+  proc inspect(node: NimNode): NimNode =
+    case node.kind:
+    of {nnkIdent, nnkSym}:
+      if node.eqIdent(what):
+        return by
+      return node
+    of nnkEmpty:
+      return node
+    of nnkLiterals:
+      return node
+    else:
+      var rTree = node.kind.newTree()
+      for child in node:
+        rTree.add inspect(child)
+      return rTree
+  result = inspect(ast)
+
+macro staticFor*(idx: untyped{nkIdent}, start, stopEx: static int, body: untyped): untyped =
+  result = newStmtList()
+  for i in start ..< stopEx:
+    result.add nnkBlockStmt.newTree(
+      ident("unrolledIter_" & $idx & $i),
+      body.replaceNodes(idx, newLit i)
+    )
--- a/helpers/timers.nim
+++ b/helpers/timers.nim
@ -0,0 +1,14 @@
+when defined(i386) or defined(amd64):
+  import x86
+  export getTicks
+
+# This doesn't always work unfortunately ...
+
+proc volatilize(x: ptr byte) {.codegenDecl: "$# $#(char const volatile *x)", inline.} =
+  discard
+
+template preventOptimAway*[T](x: var T) =
+  volatilize(cast[ptr byte](unsafeAddr x))
+
+template preventOptimAway*[T](x: T) =
+  volatilize(cast[ptr byte](x))
--- a/helpers/x86.nim
+++ b/helpers/x86.nim
@ -0,0 +1,74 @@
+# Cpu Name
+# -------------------------------------------------------
+
+{.passC:"-std=gnu99".} # TODO may conflict with milagro "-std=c99"
+
+proc cpuID(eaxi, ecxi: int32): tuple[eax, ebx, ecx, edx: int32] =
+  when defined(vcc):
+    proc cpuidVcc(cpuInfo: ptr int32; functionID: int32)
+      {.importc: "__cpuidex", header: "intrin.h".}
+    cpuidVcc(addr result.eax, eaxi, ecxi)
+  else:
+    var (eaxr, ebxr, ecxr, edxr) = (0'i32, 0'i32, 0'i32, 0'i32)
+    asm """
+      cpuid
+      :"=a"(`eaxr`), "=b"(`ebxr`), "=c"(`ecxr`), "=d"(`edxr`)
+      :"a"(`eaxi`), "c"(`ecxi`)"""
+    (eaxr, ebxr, ecxr, edxr)
+
+proc cpuName*(): string =
+  var leaves {.global.} = cast[array[48, char]]([
+    cpuID(eaxi = 0x80000002'i32, ecxi = 0),
+    cpuID(eaxi = 0x80000003'i32, ecxi = 0),
+    cpuID(eaxi = 0x80000004'i32, ecxi = 0)])
+  result = $cast[cstring](addr leaves[0])
+
+# Counting cycles
+# -------------------------------------------------------
+
+# From Linux
+#
+# The RDTSC instruction is not ordered relative to memory
+# access.  The Intel SDM and the AMD APM are both vague on this
+# point, but empirically an RDTSC instruction can be
+# speculatively executed before prior loads.  An RDTSC
+# immediately after an appropriate barrier appears to be
+# ordered as a normal load, that is, it provides the same
+# ordering guarantees as reading from a global memory location
+# that some other imaginary CPU is updating continuously with a
+# time stamp.
+#
+# From Intel SDM
+# https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/ia-32-ia-64-benchmark-code-execution-paper.pdf
+
+proc getTicks*(): int64 {.inline.} =
+  when defined(vcc):
+    proc rdtsc(): int64 {.sideeffect, importc: "__rdtsc", header: "<intrin.h>".}
+    proc lfence() {.importc: "__mm_lfence", header: "<intrin.h>".}
+
+    lfence()
+    return rdtsc()
+
+  else:
+    when defined(amd64):
+      var lo, hi: int64
+      # TODO: Provide a compile-time flag for RDTSCP support
+      #       and use it instead of lfence + RDTSC
+      {.emit: """asm volatile(
+        "lfence\n"
+        "rdtsc\n"
+        : "=a"(`lo`), "=d"(`hi`)
+        :
+        : "memory"
+      );""".}
+      return (hi shl 32) or lo
+    else: # 32-bit x86
+      # TODO: Provide a compile-time flag for RDTSCP support
+      #       and use it instead of lfence + RDTSC
+      {.emit: """asm volatile(
+        "lfence\n"
+        "rdtsc\n"
+        : "=a"(`result`)
+        :
+        : "memory"
+      );""".}
--- a/tests/test_bigints.nim
+++ b/tests/test_bigints.nim
@ -8,11 +8,11 @@

 import  unittest,
        ../constantine/io/io_bigints,
-        ../constantine/arithmetic/bigints,
+        ../constantine/arithmetic,
        ../constantine/config/common,
        ../constantine/primitives

-proc main() =
+proc mainArith() =
  suite "isZero":
    test "isZero for zero":
      var x: BigInt[128]
@ -128,6 +128,16 @@ proc main() =
          bool(a == c)
          not bool(carry)

+  suite "BigInt + Word":
+    test "Addition limbs carry":
+      block: # P256 / 2
+        var a = BigInt[256].fromhex"0x7fffffff800000008000000000000000000000007fffffffffffffffffffffff"
+
+        let expected = BigInt[256].fromHex"7fffffff80000000800000000000000000000000800000000000000000000000"
+
+        discard a.add(Word 1)
+        check: bool(a == expected)
+
  suite "Modular operations - small modulus":
    # Vectors taken from Stint - https://github.com/status-im/nim-stint
    test "100 mod 13":
@ -205,4 +215,249 @@ proc main() =
      check:
        bool(r == BigInt[40].fromUint(u mod v))

-main()
+proc mainNeg() =
+  suite "Conditional negation":
+    test "Conditional negation":
+      block:
+        var a = fromHex(BigInt[128], "0x12345678_FF11FFAA_00321321_CAFECAFE")
+        var b = fromHex(BigInt[128], "0xDEADBEEF_DEADBEEF_DEADBEEF_DEADBEEF")
+
+        let a2 = a
+        let b2 = b
+
+        a.cneg(CtTrue)
+        b.cneg(CtTrue)
+
+        discard a.add(a2)
+        discard b.add(b2)
+
+        check:
+          bool(a.isZero)
+          bool(b.isZero)
+
+      block:
+        var a = fromHex(BigInt[128], "0x12345678_FF11FFAA_00321321_CAFECAFE")
+        var b = fromHex(BigInt[128], "0xDEADBEEF_DEADBEEF_DEADBEEF_DEADBEEF")
+
+        let a2 = a
+        let b2 = b
+
+        a.cneg(CtFalse)
+        b.cneg(CtFalse)
+
+        check:
+          bool(a == a2)
+          bool(b == b2)
+
+    test "Conditional negation with carries":
+      block:
+        var a = fromHex(BigInt[128], "0x12345678_FF11FFAA_00321321_FFFFFFFF")
+        var b = fromHex(BigInt[128], "0xFFFFFFFF_FFFFFFFF_00000000_00000000")
+
+        let a2 = a
+        let b2 = b
+
+        a.cneg(CtTrue)
+        b.cneg(CtTrue)
+
+        discard a.add(a2)
+        discard b.add(b2)
+
+        check:
+          bool(a.isZero)
+          bool(b.isZero)
+
+      block:
+        var a = fromHex(BigInt[128], "0x12345678_00000000_00321321_FFFFFFFF")
+        var b = fromHex(BigInt[128], "0xFFFFFFFF_FFFFFFFF_00000000_00000000")
+
+        let a2 = a
+        let b2 = b
+
+        a.cneg(CtFalse)
+        b.cneg(CtFalse)
+
+        check:
+          bool(a == a2)
+          bool(b == b2)
+
+    test "Conditional all-zero bit or all-one bit":
+      block:
+        var a = fromHex(BigInt[128], "0x00000000_00000000_00000000_00000000")
+        var b = fromHex(BigInt[128], "0xFFFFFFFF_FFFFFFFF_FFFFFFFF_FFFFFFFF")
+
+        let a2 = a
+        let b2 = b
+
+        a.cneg(CtTrue)
+        b.cneg(CtTrue)
+
+        discard a.add(a2)
+        discard b.add(b2)
+
+        check:
+          bool(a.isZero)
+          bool(b.isZero)
+
+      block:
+        var a = fromHex(BigInt[128], "0x00000000_00000000_00000000_00000000")
+        var b = fromHex(BigInt[128], "0xFFFFFFFF_FFFFFFFF_FFFFFFFF_FFFFFFFF")
+
+        let a2 = a
+        let b2 = b
+
+        a.cneg(CtFalse)
+        b.cneg(CtFalse)
+
+        check:
+          bool(a == a2)
+          bool(b == b2)
+
+proc mainCopySwap() =
+  suite "Copy and Swap":
+    test "Conditional copy":
+      block:
+        var a = fromHex(BigInt[128], "0x12345678_FF11FFAA_00321321_CAFECAFE")
+        let b = fromHex(BigInt[128], "0xDEADBEEF_DEADBEEF_DEADBEEF_DEADBEEF")
+
+        var expected = a
+        a.ccopy(b, CtFalse)
+
+        check: bool(expected == a)
+
+      block:
+        var a = fromHex(BigInt[128], "0x00000000_FFFFFFFF_FFFFFFFF_FFFFFFFF")
+        let b = fromHex(BigInt[128], "0x00000000_00000000_00000000_00000001")
+
+        var expected = b
+        a.ccopy(b, CtTrue)
+
+        check: bool(expected == b)
+
+    test "Conditional swap":
+      block:
+        var a = fromHex(BigInt[128], "0x12345678_FF11FFAA_00321321_CAFECAFE")
+        var b = fromHex(BigInt[128], "0xDEADBEEF_DEADBEEF_DEADBEEF_DEADBEEF")
+
+        let eA = a
+        let eB = b
+
+        a.cswap(b, CtFalse)
+        check:
+          bool(eA == a)
+          bool(eB == b)
+
+      block:
+        var a = fromHex(BigInt[128], "0x00000000_FFFFFFFF_FFFFFFFF_FFFFFFFF")
+        var b = fromHex(BigInt[128], "0x00000000_00000000_00000000_00000001")
+
+        let eA = b
+        let eB = a
+
+        a.cswap(b, CtTrue)
+        check:
+          bool(eA == a)
+          bool(eB == b)
+
+proc mainModularInverse() =
+  suite "Modular Inverse (with odd modulus)":
+    # Note: We don't define multi-precision multiplication
+    #       because who needs it when you have Montgomery?
+    #       ¯\_(ツ)_/¯
+    test "42^-1 (mod 2017) = 1969":
+      block: # small int
+        let a = BigInt[16].fromUint(42'u16)
+        let M = BigInt[16].fromUint(2017'u16)
+
+        var mp1div2 = M
+        discard mp1div2.add(Word 1)
+        mp1div2.shiftRight(1)
+
+        let expected = BigInt[16].fromUint(1969'u16)
+        var r {.noInit.}: BigInt[16]
+
+        r.invmod(a, M, mp1div2)
+
+        check: bool(r == expected)
+
+      block: # huge int
+        let a = BigInt[381].fromUint(42'u16)
+        let M = BigInt[381].fromUint(2017'u16)
+
+        var mp1div2 = M
+        discard mp1div2.add(Word 1)
+        mp1div2.shiftRight(1)
+
+        let expected = BigInt[381].fromUint(1969'u16)
+        var r {.noInit.}: BigInt[381]
+
+        r.invmod(a, M, mp1div2)
+
+        check: bool(r == expected)
+
+    test "271^-1 (mod 383) = 106":
+      block: # small int
+        let a = BigInt[16].fromUint(271'u16)
+        let M = BigInt[16].fromUint(383'u16)
+
+        var mp1div2 = M
+        discard mp1div2.add(Word 1)
+        mp1div2.shiftRight(1)
+
+        let expected = BigInt[16].fromUint(106'u16)
+        var r {.noInit.}: BigInt[16]
+
+        r.invmod(a, M, mp1div2)
+
+        check: bool(r == expected)
+
+      block: # huge int
+        let a = BigInt[381].fromUint(271'u16)
+        let M = BigInt[381].fromUint(383'u16)
+
+        var mp1div2 = M
+        discard mp1div2.add(Word 1)
+        mp1div2.shiftRight(1)
+
+        let expected = BigInt[381].fromUint(106'u16)
+        var r {.noInit.}: BigInt[381]
+
+        r.invmod(a, M, mp1div2)
+
+        check: bool(r == expected)
+
+    test "BN254_Modulus^-1 (mod BLS12_381)":
+      let a = BigInt[381].fromHex("0x30644e72e131a029b85045b68181585d97816a916871ca8d3c208c16d87cfd47")
+      let M = BigInt[381].fromHex("0x1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffffffffaaab")
+
+      var mp1div2 = M
+      discard mp1div2.add(Word 1)
+      mp1div2.shiftRight(1)
+
+      let expected = BigInt[381].fromHex("0x0636759a0f3034fa47174b2c0334902f11e9915b7bd89c6a2b3082b109abbc9837da17201f6d8286fe6203caa1b9d4c8")
+
+      var r {.noInit.}: BigInt[381]
+      r.invmod(a, M, mp1div2)
+
+      check: bool(r == expected)
+
+    test "0^-1 (mod 0) = 0 (need for tower of extension fields)":
+      let a = BigInt[16].fromUint(0'u16)
+      let M = BigInt[16].fromUint(2017'u16)
+
+      var mp1div2 = M
+      mp1div2.shiftRight(1)
+      discard mp1div2.add(Word 1)
+
+      let expected = BigInt[16].fromUint(0'u16)
+      var r {.noInit.}: BigInt[16]
+
+      r.invmod(a, M, mp1div2)
+
+      check: bool(r == expected)
+
+
+mainArith()
+mainNeg()
+mainCopySwap()
+mainModularInverse()
--- a/tests/test_bigints_multimod.nim
+++ b/tests/test_bigints_multimod.nim
@ -11,7 +11,7 @@ import
  unittest,
  # Third-party
  ../constantine/io/io_bigints,
-  ../constantine/arithmetic/bigints,
+  ../constantine/arithmetic,
  ../constantine/primitives

 proc main() =
--- a/tests/test_bigints_vs_gmp.nim
+++ b/tests/test_bigints_vs_gmp.nim
@ -13,8 +13,8 @@ import
  gmp, stew/byteutils,
  # Internal
  ../constantine/io/io_bigints,
-  ../constantine/arithmetic/bigints,
-  ../constantine/primitives/constant_time
+  ../constantine/arithmetic,
+  ../constantine/primitives

 # We test up to 1024-bit, more is really slow

--- a/tests/test_finite_fields.nim
+++ b/tests/test_finite_fields.nim
@ -7,7 +7,7 @@
 # at your option. This file may not be copied, modified, or distributed except according to those terms.

 import  unittest,
-        ../constantine/arithmetic/finite_fields,
+        ../constantine/arithmetic,
        ../constantine/io/io_fields,
        ../constantine/config/curves

--- a/tests/test_finite_fields_mulsquare.nim
+++ b/tests/test_finite_fields_mulsquare.nim
@ -7,11 +7,11 @@
 # at your option. This file may not be copied, modified, or distributed except according to those terms.

 import  std/unittest, std/times,
-        ../constantine/arithmetic/[bigints, finite_fields],
+        ../constantine/arithmetic,
        ../constantine/io/[io_bigints, io_fields],
        ../constantine/config/curves,
        # Test utilities
-        ./prng
+        ../helpers/prng

 const Iters = 128

@ -25,17 +25,22 @@ static: doAssert defined(testingCurves), "This modules requires the -d:testingCu
 import ../constantine/config/common

 proc sanity(C: static Curve) =
-  test "Squaring 0,1,2 with "& $C & " [FastSquaring = " & $Fake101.canUseNoCarryMontySquare & "]":
+  test "Squaring 0,1,2 with "& $Curve(C) & " [FastSquaring = " & $C.canUseNoCarryMontySquare & "]":
        block: # 0² mod
          var n: Fp[C]

          n.fromUint(0'u32)
          let expected = n

+          # Out-of-place
          var r: Fp[C]
          r.square(n)
+          # In-place
+          n.square()

-          check: bool(r == expected)
+          check:
+            bool(r == expected)
+            bool(n == expected)

        block: # 1² mod
          var n: Fp[C]
@ -43,10 +48,15 @@ proc sanity(C: static Curve) =
          n.fromUint(1'u32)
          let expected = n

+          # Out-of-place
          var r: Fp[C]
          r.square(n)
+          # In-place
+          n.square()

-          check: bool(r == expected)
+          check:
+            bool(r == expected)
+            bool(n == expected)

        block: # 2² mod
          var n, expected: Fp[C]
@ -54,10 +64,15 @@ proc sanity(C: static Curve) =
          n.fromUint(2'u32)
          expected.fromUint(4'u32)

+          # Out-of-place
          var r: Fp[C]
          r.square(n)
+          # In-place
+          n.square()

-          check: bool(r == expected)
+          check:
+            bool(r == expected)
+            bool(n == expected)

 proc mainSanity() =
  suite "Modular squaring is consistent with multiplication on special elements":
--- a/tests/test_finite_fields_powinv.nim
+++ b/tests/test_finite_fields_powinv.nim
@ -7,7 +7,7 @@
 # at your option. This file may not be copied, modified, or distributed except according to those terms.

 import  unittest,
-        ../constantine/arithmetic/[bigints, finite_fields],
+        ../constantine/arithmetic,
        ../constantine/io/[io_bigints, io_fields],
        ../constantine/config/curves

@ -144,14 +144,14 @@ proc main() =

  suite "Modular inversion over prime fields":
    test "x^(-1) mod p":
-        var x: Fp[BLS12_381]
+        var r, x: Fp[BLS12_381]

        # BN254 field modulus
        x.fromHex("0x30644e72e131a029b85045b68181585d97816a916871ca8d3c208c16d87cfd47")

        let expected = "0x0636759a0f3034fa47174b2c0334902f11e9915b7bd89c6a2b3082b109abbc9837da17201f6d8286fe6203caa1b9d4c8"
-        x.inv()
-        let computed = x.toHex()
+        r.inv(x)
+        let computed = r.toHex()

        check:
          computed == expected
--- a/tests/test_finite_fields_vs_gmp.nim
+++ b/tests/test_finite_fields_vs_gmp.nim
@ -13,18 +13,21 @@ import
  gmp, stew/byteutils,
  # Internal
  ../constantine/io/[io_bigints, io_fields],
-  ../constantine/arithmetic/[finite_fields, bigints],
-  ../constantine/primitives/constant_time,
+  ../constantine/arithmetic,
+  ../constantine/primitives,
  ../constantine/config/curves

-# We test up to 1024-bit, more is really slow
-
 var RNG {.compileTime.} = initRand(1234)
 const CurveParams = [
+  P224: (224, "0xffffffffffffffffffffffffffffffff000000000000000000000001"),
  BN254: (254, "0x30644e72e131a029b85045b68181585d97816a916871ca8d3c208c16d87cfd47"),
+  P256: (256, "0xffffffff00000001000000000000000000000000ffffffffffffffffffffffff"),
+  Secp256k1: (256, "0xFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFEFFFFFC2F"),
  BLS12_381: (381, "0x1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffffffffaaab")
 ]

+const AvailableCurves = [P224, BN254, P256, Secp256k1, BLS12_381]
+
 const # https://gmplib.org/manual/Integer-Import-and-Export.html
  GMP_WordLittleEndian = -1'i32
  GMP_WordNativeEndian = 0'i32
@ -53,7 +56,7 @@ proc binary_prologue[C: static Curve, N: static int](
  mpz_urandomb(b, gmpRng, uint bits)
  # Set modulus to curve modulus
  let err = mpz_set_str(p, CurveParams[C][1], 0)
-  doAssert err == 0
+  doAssert err == 0, "Error on prime for curve " & $Curve(C)

  #########################################################
  # Conversion buffers
@ -189,8 +192,8 @@ proc invTests(gmpRng: var gmp_randstate_t, a, b, p, r: var mpz_t, C: static Curv
  let exist = mpz_invert(r, a, p)
  doAssert exist != 0

-  var rTest = aTest
-  rTest.inv()
+  var rTest {.noInit.}: Fp[C]
+  rTest.inv(aTest)

  binary_epilogue(r, a, b, rTest, aBuf, bBuf, "Inversion (b is unused)")

@ -206,7 +209,7 @@ macro randomTests(numTests: static int, curveSym, body: untyped): untyped =
  result = newStmtList()

  for _ in 0 ..< numTests:
-    let curve = RNG.rand([BN254, BLS12_381])
+    let curve = RNG.rand(AvailableCurves)

    result.add quote do:
      block:
--- a/tests/test_fp2.nim
+++ b/tests/test_fp2.nim
@ -12,9 +12,9 @@ import
  # Internals
  ../constantine/tower_field_extensions/[abelian_groups, fp2_complex],
  ../constantine/config/[common, curves],
-  ../constantine/arithmetic/bigints,
+  ../constantine/arithmetic,
  # Test utilities
-  ./prng
+  ../helpers/prng

 const Iters = 128

--- a/tests/test_io_bigints.nim
+++ b/tests/test_io_bigints.nim
@ -9,7 +9,7 @@
 import  unittest, random,
        ../constantine/io/io_bigints,
        ../constantine/config/common,
-        ../constantine/arithmetic/bigints
+        ../constantine/arithmetic

 randomize(0xDEADBEEF) # Random seed for reproducibility
 type T = BaseType
--- a/tests/test_io_fields.nim
+++ b/tests/test_io_fields.nim
@ -10,7 +10,7 @@ import  unittest, random,
        ../constantine/io/[io_bigints, io_fields],
        ../constantine/config/curves,
        ../constantine/config/common,
-        ../constantine/arithmetic/[bigints, finite_fields]
+        ../constantine/arithmetic

 randomize(0xDEADBEEF) # Random seed for reproducibility
 type T = BaseType