30% faster constant-time inversion

2026-01-04 06:03:08 +00:00 · 2020-03-20 23:03:52 +01:00 · 2020-03-20 23:03:52 +01:00 · bde619155b
commit bde619155b
parent 1958356a09
39 changed files with 1467 additions and 1066 deletions
--- a/benchmarks/bench_eth_curves.nim
+++ b/benchmarks/bench_eth_curves.nim
@ -1,6 +0,0 @@
 # Benchmark of Ethereum 1 and Ethereum 2 elliptic curves
 import
  ./secp256k1_fp,
  ./bn254_fp,
  ./bls12_381_fp
--- a/benchmarks/bench_finite_fields.nim
+++ b/benchmarks/bench_finite_fields.nim
@ -0,0 +1,157 @@
 # Constantine
 # Copyright (c) 2018-2019    Status Research & Development GmbH
 # Copyright (c) 2020-Present Mamy André-Ratsimbazafy
 # Licensed and distributed under either of
 #   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
 #   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
 # at your option. This file may not be copied, modified, or distributed except according to those terms.
 # ############################################################
 #
 #             Benchmark of finite fields
 #
 # ############################################################
 import
  # Internals
  ../constantine/config/[common, curves],
  ../constantine/arithmetic,
  ../constantine/io/[io_bigints, io_fields],
  ../constantine/primitives,
  # Helpers
  ../helpers/[timers, prng, static_for],
  # Standard library
  std/[monotimes, times, strformat, strutils, macros]
 const Iters = 1_000_000
 const InvIters = 1000
 const AvailableCurves = [
  P224,
  BN254,
  P256,
  Secp256k1,
  BLS12_381
 ]
 var rng: RngState
 let seed = uint32(getTime().toUnix() and (1'i64 shl 32 - 1)) # unixTime mod 2^32
 rng.seed(seed)
 echo "bench_finite_field xoshiro512** seed: ", seed
 # warmup
 proc warmup*() =
  # Warmup - make sure cpu is on max perf
  let start = cpuTime()
  var foo = 123
  for i in 0 ..< 300_000_000:
    foo += i*i mod 456
    foo = foo mod 789
  # Compiler shouldn't optimize away the results as cpuTime rely on sideeffects
  let stop = cpuTime()
  echo &"Warmup: {stop - start:>4.4f} s, result {foo} (displayed to avoid compiler optimizing warmup away)\n"
 warmup()
 echo "\n⚠️ Measurements are approximate and use the CPU nominal clock: Turbo-Boost and overclocking will skew them."
 echo "==========================================================================================================\n"
 echo "All benchmarks are using constant-time implementations to protect against side-channel attacks."
 when defined(gcc):
  echo "\nCompiled with GCC"
 elif defined(clang):
  echo "\nCompiled with Clang"
 elif defined(vcc):
  echo "\nCompiled with MSVC"
 elif defined(icc):
  echo "\nCompiled with ICC"
 else:
  echo "\nCompiled with an unknown compiler"
 when defined(i386) or defined(amd64):
  import ../helpers/x86
  echo "Running on ", cpuName(), "\n\n"
 proc report(op, field: string, start, stop: MonoTime, startClk, stopClk: int64, iters: int) =
  echo &"{op:<15} {field:<15} {inNanoseconds((stop-start) div iters):>9} ns {(stopClk - startClk) div iters:>9} cycles"
 macro fixFieldDisplay(T: typedesc): untyped =
  # At compile-time, enums are integers and their display is buggy
  # we get the Curve ID instead of the curve name.
  let instantiated = T.getTypeInst()
  var name = $instantiated[1][0] # Fp
  name.add "[" & $Curve(instantiated[1][1].intVal) & "]"
  result = newLit name
 # Compilers are smart with dead code (but not with multiprecision arithmetic :/)
 var globalsAreNotOptimizedAway: Word
 template bench(op: string, T: typedesc, iters: int, body: untyped): untyped =
  let start = getMonotime()
  let startClk = getTicks()
  for _ in 0 ..< iters:
    body
  let stopClk = getTicks()
  let stop = getMonotime()
  report(op, fixFieldDisplay(T), start, stop, startClk, stopClk, iters)
 proc addBench(T: typedesc) =
  var x = rng.random(T)
  let y = rng.random(T)
  bench("Addition", T, Iters):
    x += y
  globalsAreNotOptimizedAway += x.mres.limbs[^1]
 proc subBench(T: typedesc) =
  var x = rng.random(T)
  let y = rng.random(T)
  preventOptimAway(x)
  bench("Substraction", T, Iters):
    x -= y
  globalsAreNotOptimizedAway += x.mres.limbs[^1]
 proc negBench(T: typedesc) =
  var r: T
  let x = rng.random(T)
  bench("Negation", T, Iters):
    r.neg(x)
  globalsAreNotOptimizedAway += r.mres.limbs[^1]
 proc mulBench(T: typedesc) =
  var r: T
  let x = rng.random(T)
  let y = rng.random(T)
  preventOptimAway(r)
  bench("Multiplication", T, Iters):
    r.prod(x, y)
 proc sqrBench(T: typedesc) =
  var r: T
  let x = rng.random(T)
  preventOptimAway(r)
  bench("Squaring", T, Iters):
    r.square(x)
 proc invBench(T: typedesc) =
  var r: T
  let x = rng.random(T)
  preventOptimAway(r)
  bench("Inversion", T, InvIters):
    r.inv(x)
 proc main() =
  echo "-".repeat(80)
  staticFor i, 0, AvailableCurves.len:
    const curve = AvailableCurves[i]
    addBench(Fp[curve])
    subBench(Fp[curve])
    negBench(Fp[curve])
    mulBench(Fp[curve])
    sqrBench(Fp[curve])
    invBench(Fp[curve])
    echo "-".repeat(80)
 main()
 echo "Notes:"
 echo "  GCC is significantly slower than Clang on multiprecision arithmetic."
--- a/benchmarks/big_to_fq.nim
+++ b/benchmarks/big_to_fq.nim
@ -1,48 +0,0 @@
 # Constantine
 # Copyright (c) 2018-2019    Status Research & Development GmbH
 # Copyright (c) 2020-Present Mamy André-Ratsimbazafy
 # Licensed and distributed under either of
 #   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
 #   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
 # at your option. This file may not be copied, modified, or distributed except according to those terms.
 # ############################################################
 #
 #        Benchmark of the conversion from Big Int to Fq
 #
 # ############################################################
 # 2 implementations are possible
 # - 1 based on Montgomery Multiplication
 # - 1 based on modular left shift which involves multiple divisions
 import
  ../constantine/config/[common, curves],
  ../constantine/arithmetic/[bigints_checked, finite_fields],
  random, std/monotimes, times, strformat
 const Iters = 1_000_000
 randomize(1234)
 proc main() =
  var x: BigInt[381]
  x.setInternalBitLength()
  for i in 0 ..< x.limbs.len - 1:
    # Set x to a random value guaranteed below the prime
    x.limbs[i] = Word(rand(BaseType.high.int))
  let start = getMonotime()
  for _ in 0 ..< Iters:
    let y = Fq[BLS12_381].fromBig(x)
  let stop = getMonotime()
  echo &"Time for {Iters} iterations: {inMilliseconds(stop-start)} ms"
 main()
 # 1_000_000 iterations with -d:danger on i9-9980XE all-core turbo 4.1GHz
 # Montgomery Multiplication based: 254ms
 # shlAddMod based (using assembly div2n1n!!): 907 ms
 # Note: shlAddMod will be even slower when division is made constant-time
--- a/benchmarks/bls12_381_fp.nim
+++ b/benchmarks/bls12_381_fp.nim
@ -1,155 +0,0 @@
 # Constantine
 # Copyright (c) 2018-2019    Status Research & Development GmbH
 # Copyright (c) 2020-Present Mamy André-Ratsimbazafy
 # Licensed and distributed under either of
 #   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
 #   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
 # at your option. This file may not be copied, modified, or distributed except according to those terms.
 # ############################################################
 #
 #           Benchmark of modular exponentiation
 #
 # ############################################################
 # 2 implementations are available
 # - 1 is constant time
 # - 1 exposes the exponent bits to:
 #   timing attack,
 #   memory access analysis,
 #   power analysis (i.e. oscilloscopes on embedded)
 #   It is suitable for public exponents for example
 #   to compute modular inversion via the Fermat method
 import
  ../constantine/config/[common, curves],
  ../constantine/arithmetic/[bigints, finite_fields],
  ../constantine/io/[io_bigints, io_fields],
  random, std/monotimes, times, strformat,
  ./timers
 const Iters = 1_000_000
 const InvIters = 1000
 randomize(1234)
 # warmup
 proc warmup*() =
  # Warmup - make sure cpu is on max perf
  let start = cpuTime()
  var foo = 123
  for i in 0 ..< 300_000_000:
    foo += i*i mod 456
    foo = foo mod 789
  # Compiler shouldn't optimize away the results as cpuTime rely on sideeffects
  let stop = cpuTime()
  echo &"\n\nWarmup: {stop - start:>4.4f} s, result {foo} (displayed to avoid compiler optimizing warmup away)\n"
 warmup()
 echo "\n⚠️ Measurements are approximate and use the CPU nominal clock: Turbo-Boost and overclocking will skew them."
 echo "==========================================================================================================\n"
 proc report(op, field: string, start, stop: MonoTime, startClk, stopClk: int64, iters: int) =
  echo &"{op:<15} {field:<15} {inNanoseconds((stop-start) div iters):>9} ns {(stopClk - startClk) div iters:>9} cycles"
 proc addBench() =
  var x, y: Fp[BLS12_381]
  # BN254 field modulus
  x.fromHex("0x30644e72e131a029b85045b68181585d97816a916871ca8d3c208c16d87cfd47")
  # BLS12-381 prime - 2
  y.fromHex("0x1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffffffffaaa9")
  let start = getMonotime()
  let startClk = getTicks()
  for _ in 0 ..< Iters:
    x += y
  let stopClk = getTicks()
  let stop = getMonotime()
  report("Addition", "Fp[BLS12_381]", start, stop, startClk, stopClk, Iters)
 addBench()
 proc subBench() =
  var x, y: Fp[BLS12_381]
  # BN254 field modulus
  x.fromHex("0x30644e72e131a029b85045b68181585d97816a916871ca8d3c208c16d87cfd47")
  # BLS12-381 prime - 2
  y.fromHex("0x1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffffffffaaa9")
  let start = getMonotime()
  let startClk = getTicks()
  for _ in 0 ..< Iters:
    x -= y
  let stopClk = getTicks()
  let stop = getMonotime()
  report("Substraction", "Fp[BLS12_381]", start, stop, startClk, stopClk, Iters)
 subBench()
 proc negBench() =
  var r, x: Fp[BLS12_381]
  # BN254 field modulus
  x.fromHex("0x30644e72e131a029b85045b68181585d97816a916871ca8d3c208c16d87cfd47")
  let start = getMonotime()
  let startClk = getTicks()
  for _ in 0 ..< Iters:
    r.neg(x)
  let stopClk = getTicks()
  let stop = getMonotime()
  report("Negation", "Fp[BLS12_381]", start, stop, startClk, stopClk, Iters)
 negBench()
 proc mulBench() =
  var r, x, y: Fp[BLS12_381]
  # BN254 field modulus
  x.fromHex("0x30644e72e131a029b85045b68181585d97816a916871ca8d3c208c16d87cfd47")
  # BLS12-381 prime - 2
  y.fromHex("0x1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffffffffaaa9")
  let start = getMonotime()
  let startClk = getTicks()
  for _ in 0 ..< Iters:
    r.prod(x, y)
  let stopClk = getTicks()
  let stop = getMonotime()
  report("Multiplication", "Fp[BLS12_381]", start, stop, startClk, stopClk, Iters)
 mulBench()
 proc sqrBench() =
  var r, x: Fp[BLS12_381]
  # BN254 field modulus
  x.fromHex("0x30644e72e131a029b85045b68181585d97816a916871ca8d3c208c16d87cfd47")
  let start = getMonotime()
  let startClk = getTicks()
  for _ in 0 ..< Iters:
    r.square(x)
  let stopClk = getTicks()
  let stop = getMonotime()
  report("Squaring", "Fp[BLS12_381]", start, stop, startClk, stopClk, Iters)
 sqrBench()
 proc invBench() =
  # TODO: having x on the stack triggers stack smashing detection. To be investigated
  var x: ref Fp[BLS12_381]
  new x
  # BN254 field modulus
  x[].fromHex("0x30644e72e131a029b85045b68181585d97816a916871ca8d3c208c16d87cfd47")
  let start = getMonotime()
  let startClk = getTicks()
  for _ in 0 ..< InvIters:
    # Note: we don't copy the original x so x is alterning between x and x^-1
    inv(x[])
  let stopClk = getTicks()
  let stop = getMonotime()
  report("Inversion", "Fp[BLS12_381]", start, stop, startClk, stopClk, InvIters)
 invBench()
--- a/benchmarks/bn254_fp.nim
+++ b/benchmarks/bn254_fp.nim
@ -1,155 +0,0 @@
 # Constantine
 # Copyright (c) 2018-2019    Status Research & Development GmbH
 # Copyright (c) 2020-Present Mamy André-Ratsimbazafy
 # Licensed and distributed under either of
 #   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
 #   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
 # at your option. This file may not be copied, modified, or distributed except according to those terms.
 # ############################################################
 #
 #           Benchmark of modular exponentiation
 #
 # ############################################################
 # 2 implementations are available
 # - 1 is constant time
 # - 1 exposes the exponent bits to:
 #   timing attack,
 #   memory access analysis,
 #   power analysis (i.e. oscilloscopes on embedded)
 #   It is suitable for public exponents for example
 #   to compute modular inversion via the Fermat method
 import
  ../constantine/config/[common, curves],
  ../constantine/arithmetic/[bigints, finite_fields],
  ../constantine/io/[io_bigints, io_fields],
  random, std/monotimes, times, strformat,
  ./timers
 const Iters = 1_000_000
 const InvIters = 1000
 randomize(1234)
 # warmup
 proc warmup*() =
  # Warmup - make sure cpu is on max perf
  let start = cpuTime()
  var foo = 123
  for i in 0 ..< 300_000_000:
    foo += i*i mod 456
    foo = foo mod 789
  # Compiler shouldn't optimize away the results as cpuTime rely on sideeffects
  let stop = cpuTime()
  echo &"\n\nWarmup: {stop - start:>4.4f} s, result {foo} (displayed to avoid compiler optimizing warmup away)\n"
 warmup()
 echo "\n⚠️ Measurements are approximate and use the CPU nominal clock: Turbo-Boost and overclocking will skew them."
 echo "==========================================================================================================\n"
 proc report(op, field: string, start, stop: MonoTime, startClk, stopClk: int64, iters: int) =
  echo &"{op:<15} {field:<15} {inNanoseconds((stop-start) div iters):>9} ns {(stopClk - startClk) div iters:>9} cycles"
 proc addBench() =
  var x, y: Fp[BN254]
  # BN254 field modulus
  x.fromHex("0x30644e72e131a029b85045b68181585d97816a916871ca8d3c208c16d87cfd47")
  # Truncated BLS12-381 prime
  y.fromHex("0x1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f624")
  let start = getMonotime()
  let startClk = getTicks()
  for _ in 0 ..< Iters:
    x += y
  let stopClk = getTicks()
  let stop = getMonotime()
  report("Addition", "Fp[BN254]", start, stop, startClk, stopClk, Iters)
 addBench()
 proc subBench() =
  var x, y: Fp[BN254]
  # BN254 field modulus
  x.fromHex("0x30644e72e131a029b85045b68181585d97816a916871ca8d3c208c16d87cfd47")
  # Truncated BLS12-381 prime
  y.fromHex("0x1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f624")
  let start = getMonotime()
  let startClk = getTicks()
  for _ in 0 ..< Iters:
    x -= y
  let stopClk = getTicks()
  let stop = getMonotime()
  report("Substraction", "Fp[BN254]", start, stop, startClk, stopClk, Iters)
 subBench()
 proc negBench() =
  var r, x: Fp[BN254]
  # BN254 field modulus
  x.fromHex("0x30644e72e131a029b85045b68181585d97816a916871ca8d3c208c16d87cfd47")
  let start = getMonotime()
  let startClk = getTicks()
  for _ in 0 ..< Iters:
    r.neg(x)
  let stopClk = getTicks()
  let stop = getMonotime()
  report("Negation", "Fp[BN254]", start, stop, startClk, stopClk, Iters)
 negBench()
 proc mulBench() =
  var r, x, y: Fp[BN254]
  # BN254 field modulus
  x.fromHex("0x30644e72e131a029b85045b68181585d97816a916871ca8d3c208c16d87cfd47")
  # Truncated BLS12-381 prime
  y.fromHex("0x1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f624")
  let start = getMonotime()
  let startClk = getTicks()
  for _ in 0 ..< Iters:
    r.prod(x, y)
  let stopClk = getTicks()
  let stop = getMonotime()
  report("Multiplication", "Fp[BN254]", start, stop, startClk, stopClk, Iters)
 mulBench()
 proc sqrBench() =
  var r, x: Fp[BN254]
  # BN254 field modulus
  x.fromHex("0x30644e72e131a029b85045b68181585d97816a916871ca8d3c208c16d87cfd47")
  let start = getMonotime()
  let startClk = getTicks()
  for _ in 0 ..< Iters:
    r.square(x)
  let stopClk = getTicks()
  let stop = getMonotime()
  report("Squaring", "Fp[BN254]", start, stop, startClk, stopClk, Iters)
 sqrBench()
 proc invBench() =
  # TODO: having x on the stack triggers stack smashing detection. To be investigated
  var x: ref Fp[BN254]
  new x
  # BN254 field modulus
  x[].fromHex("0x30644e72e131a029b85045b68181585d97816a916871ca8d3c208c16d87cfd47")
  let start = getMonotime()
  let startClk = getTicks()
  for _ in 0 ..< InvIters:
    # Note: we don't copy the original x so x is alterning between x and x^-1
    inv(x[])
  let stopClk = getTicks()
  let stop = getMonotime()
  report("Inversion", "Fp[BN254]", start, stop, startClk, stopClk, InvIters)
 invBench()
--- a/benchmarks/powmod.nim
+++ b/benchmarks/powmod.nim
@ -1,69 +0,0 @@
 # Constantine
 # Copyright (c) 2018-2019    Status Research & Development GmbH
 # Copyright (c) 2020-Present Mamy André-Ratsimbazafy
 # Licensed and distributed under either of
 #   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
 #   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
 # at your option. This file may not be copied, modified, or distributed except according to those terms.
 # ############################################################
 #
 #           Benchmark of modular exponentiation
 #
 # ############################################################
 # 2 implementations are available
 # - 1 is constant time
 # - 1 exposes the exponent bits to:
 #   timing attack,
 #   memory access analysis,
 #   power analysis (i.e. oscilloscopes on embedded)
 #   It is suitable for public exponents for example
 #   to compute modular inversion via the Fermat method
 import
  ../constantine/config/[common, curves],
  ../constantine/arithmetic/[bigints_checked, finite_fields],
  ../constantine/io/[io_bigints, io_fields],
  random, std/monotimes, times, strformat
 const Iters = 10_000
 randomize(1234)
 proc mainCT() =
  var x: Fp[BLS12_381]
  # BN254 field modulus
  x.fromHex("0x30644e72e131a029b85045b68181585d97816a916871ca8d3c208c16d87cfd47")
  # BLS12-381 prime - 2
  let exponent = BigInt[381].fromHex("0x1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffffffffaaa9")
  let start = getMonotime()
  for _ in 0 ..< Iters:
    var y = x
    x.pow(exponent)
  let stop = getMonotime()
  echo &"Time for {Iters} iterations (constant-time 381-bit): {inMilliseconds(stop-start)} ms"
 proc mainUnsafe() =
  var x: Fp[BLS12_381]
  # BN254 field modulus
  x.fromHex("0x30644e72e131a029b85045b68181585d97816a916871ca8d3c208c16d87cfd47")
  # BLS12-381 prime - 2
  let exponent = BigInt[381].fromHex("0x1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffffffffaaa9")
  let start = getMonotime()
  for _ in 0 ..< Iters:
    var y = x
    x.powUnsafeExponent(exponent)
  let stop = getMonotime()
  echo &"Time for {Iters} iterations (unsafe 381-bit): {inMilliseconds(stop-start)} ms"
 mainCT()
 mainUnsafe()
 # 10_000 iterations with -d:danger on i9-9980XE all-core turbo 4.1GHz
 # Time for 10000 iterations (constant-time 381-bit): 1349 ms
 # Time for 10000 iterations (unsafe 381-bit): 1226 ms
--- a/benchmarks/secp256k1_fp.nim
+++ b/benchmarks/secp256k1_fp.nim
@ -1,155 +0,0 @@
 # Constantine
 # Copyright (c) 2018-2019    Status Research & Development GmbH
 # Copyright (c) 2020-Present Mamy André-Ratsimbazafy
 # Licensed and distributed under either of
 #   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
 #   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
 # at your option. This file may not be copied, modified, or distributed except according to those terms.
 # ############################################################
 #
 #           Benchmark of modular exponentiation
 #
 # ############################################################
 # 2 implementations are available
 # - 1 is constant time
 # - 1 exposes the exponent bits to:
 #   timing attack,
 #   memory access analysis,
 #   power analysis (i.e. oscilloscopes on embedded)
 #   It is suitable for public exponents for example
 #   to compute modular inversion via the Fermat method
 import
  ../constantine/config/[common, curves],
  ../constantine/arithmetic/[bigints, finite_fields],
  ../constantine/io/[io_bigints, io_fields],
  random, std/monotimes, times, strformat,
  ./timers
 const Iters = 1_000_000
 const InvIters = 1000
 randomize(1234)
 # warmup
 proc warmup*() =
  # Warmup - make sure cpu is on max perf
  let start = cpuTime()
  var foo = 123
  for i in 0 ..< 300_000_000:
    foo += i*i mod 456
    foo = foo mod 789
  # Compiler shouldn't optimize away the results as cpuTime rely on sideeffects
  let stop = cpuTime()
  echo &"\n\nWarmup: {stop - start:>4.4f} s, result {foo} (displayed to avoid compiler optimizing warmup away)\n"
 warmup()
 echo "\n⚠️ Measurements are approximate and use the CPU nominal clock: Turbo-Boost and overclocking will skew them."
 echo "==========================================================================================================\n"
 proc report(op, field: string, start, stop: MonoTime, startClk, stopClk: int64, iters: int) =
  echo &"{op:<15} {field:<15} {inNanoseconds((stop-start) div iters):>9} ns {(stopClk - startClk) div iters:>9} cycles"
 proc addBench() =
  var x, y: Fp[Secp256k1]
  # BN254 field modulus
  x.fromHex("0x30644e72e131a029b85045b68181585d97816a916871ca8d3c208c16d87cfd47")
  # Truncated BLS12-381 prime
  y.fromHex("0x1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f624")
  let start = getMonotime()
  let startClk = getTicks()
  for _ in 0 ..< Iters:
    x += y
  let stopClk = getTicks()
  let stop = getMonotime()
  report("Addition", "Fp[Secp256k1]", start, stop, startClk, stopClk, Iters)
 addBench()
 proc subBench() =
  var x, y: Fp[Secp256k1]
  # BN254 field modulus
  x.fromHex("0x30644e72e131a029b85045b68181585d97816a916871ca8d3c208c16d87cfd47")
  # Truncated BLS12-381 prime
  y.fromHex("0x1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f624")
  let start = getMonotime()
  let startClk = getTicks()
  for _ in 0 ..< Iters:
    x -= y
  let stopClk = getTicks()
  let stop = getMonotime()
  report("Substraction", "Fp[Secp256k1]", start, stop, startClk, stopClk, Iters)
 subBench()
 proc negBench() =
  var r, x: Fp[Secp256k1]
  # BN254 field modulus
  x.fromHex("0x30644e72e131a029b85045b68181585d97816a916871ca8d3c208c16d87cfd47")
  let start = getMonotime()
  let startClk = getTicks()
  for _ in 0 ..< Iters:
    r.neg(x)
  let stopClk = getTicks()
  let stop = getMonotime()
  report("Negation", "Fp[Secp256k1]", start, stop, startClk, stopClk, Iters)
 negBench()
 proc mulBench() =
  var r, x, y: Fp[Secp256k1]
  # BN254 field modulus
  x.fromHex("0x30644e72e131a029b85045b68181585d97816a916871ca8d3c208c16d87cfd47")
  # Truncated BLS12-381 prime
  y.fromHex("0x1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f624")
  let start = getMonotime()
  let startClk = getTicks()
  for _ in 0 ..< Iters:
    r.prod(x, y)
  let stopClk = getTicks()
  let stop = getMonotime()
  report("Multiplication", "Fp[Secp256k1]", start, stop, startClk, stopClk, Iters)
 mulBench()
 proc sqrBench() =
  var r, x: Fp[Secp256k1]
  # BN254 field modulus
  x.fromHex("0x30644e72e131a029b85045b68181585d97816a916871ca8d3c208c16d87cfd47")
  let start = getMonotime()
  let startClk = getTicks()
  for _ in 0 ..< Iters:
    r.square(x)
  let stopClk = getTicks()
  let stop = getMonotime()
  report("Squaring", "Fp[Secp256k1]", start, stop, startClk, stopClk, Iters)
 sqrBench()
 proc invBench() =
  # TODO: having x on the stack triggers stack smashing detection. To be investigated
  var x: ref Fp[Secp256k1]
  new x
  # BN254 field modulus
  x[].fromHex("0x30644e72e131a029b85045b68181585d97816a916871ca8d3c208c16d87cfd47")
  let start = getMonotime()
  let startClk = getTicks()
  for _ in 0 ..< InvIters:
    # Note: we don't copy the original x so x is alterning between x and x^-1
    inv(x[])
  let stopClk = getTicks()
  let stop = getMonotime()
  report("Inversion", "Fp[Secp256k1]", start, stop, startClk, stopClk, InvIters)
 invBench()
--- a/benchmarks/timers.nim
+++ b/benchmarks/timers.nim
@ -1,49 +0,0 @@
 when defined(i386) or defined(amd64):
  # From Linux
  #
  # The RDTSC instruction is not ordered relative to memory
  # access.  The Intel SDM and the AMD APM are both vague on this
  # point, but empirically an RDTSC instruction can be
  # speculatively executed before prior loads.  An RDTSC
  # immediately after an appropriate barrier appears to be
  # ordered as a normal load, that is, it provides the same
  # ordering guarantees as reading from a global memory location
  # that some other imaginary CPU is updating continuously with a
  # time stamp.
  #
  # From Intel SDM
  # https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/ia-32-ia-64-benchmark-code-execution-paper.pdf
  when not defined(vcc):
    when defined(amd64):
      proc getTicks*(): int64 {.inline.} =
        var lo, hi: int64
        # TODO: Provide a compile-time flag for RDTSCP support
        #       and use it instead of lfence + RDTSC
        {.emit: """asm volatile(
          "lfence\n"
          "rdtsc\n"
          : "=a"(`lo`), "=d"(`hi`)
          :
          : "memory"
        );""".}
        return (hi shl 32) or lo
    else:
      proc getTicks*(): int64 {.inline.} =
        # TODO: Provide a compile-time flag for RDTSCP support
        #       and use it instead of lfence + RDTSC
        {.emit: """asm volatile(
          "lfence\n"
          "rdtsc\n"
          : "=a"(`result`)
          :
          : "memory"
        );""".}
  else:
    proc rdtsc(): int64 {.sideeffect, importc: "__rdtsc", header: "<intrin.h>".}
    proc lfence() {.importc: "__mm_lfence", header: "<intrin.h>".}
    proc getTicks*(): int64 {.inline.} =
      lfence()
      return rdtsc()
 else:
  {.error: "getticks is not supported on this CPU architecture".}
--- a/constantine.nimble
+++ b/constantine.nimble
@ -110,3 +110,18 @@ task test_no_gmp, "Run tests that don't require GMP":
    # 𝔽p2
    test "", "tests/test_fp2.nim"
 task bench, "Run benchmark with your default compiler":
  if not dirExists "build":
    mkDir "build"
  exec "nim c -d:danger --verbosity:0 -o:build/bench_finite_fields_default -r --hints:off --warnings:off benchmarks/bench_finite_fields"
 task bench_gcc, "Run benchmark with gcc":
  if not dirExists "build":
    mkDir "build"
  exec "nim c --cc:gcc -d:danger --verbosity:0 -o:build/bench_finite_fields_gcc -r --hints:off --warnings:off benchmarks/bench_finite_fields"
 task bench_clang, "Run benchmark with clang":
  if not dirExists "build":
    mkDir "build"
  exec "nim c --cc:clang -d:danger --verbosity:0 -o:build/bench_finite_fields_clang -r --hints:off --warnings:off benchmarks/bench_finite_fields"
--- a/constantine/arithmetic.nim
+++ b/constantine/arithmetic.nim
@ -0,0 +1,17 @@
 # Constantine
 # Copyright (c) 2018-2019    Status Research & Development GmbH
 # Copyright (c) 2020-Present Mamy André-Ratsimbazafy
 # Licensed and distributed under either of
 #   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
 #   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
 # at your option. This file may not be copied, modified, or distributed except according to those terms.
 import
  arithmetic/[limbs, limbs_modular, limbs_montgomery],
  arithmetic/bigints,
  arithmetic/[finite_fields, finite_fields_inversion]
 export
  limbs, limbs_modular, limbs_montgomery,
  bigints,
  finite_fields, finite_fields_inversion
--- a/constantine/arithmetic/bigints.nim
+++ b/constantine/arithmetic/bigints.nim
@ -9,8 +9,7 @@
 import
  ../config/common,
  ../primitives,
-  ./limbs,
+  ./limbs, ./limbs_montgomery, ./limbs_modular
  ./montgomery
 # ############################################################
 #
@ -87,25 +86,16 @@ debug:
  func `$`*(a: BigInt): string =
    result = "BigInt["
    result.add $BigInt.bits
-    result.add "](limbs: ["
+    result.add "](limbs: "
-    result.add $BaseType(a.limbs[0]) & " (0x" & toHex(BaseType(a.limbs[0])) & ')'
+    result.add a.limbs.toString()
-    for i in 1 ..< a.limbs.len:
+    result.add ")"
      result.add ", "
      result.add $BaseType(a.limbs[i]) & " (0x" & toHex(BaseType(a.limbs[i])) & ')'
    result.add "])"
 # No exceptions allowed
 {.push raises: [].}
 {.push inline.}
-func `==`*(a, b: BigInt): CTBool[Word] =
+# Initialization
-  ## Returns true if 2 big ints are equal
+# ------------------------------------------------------------
  ## Comparison is constant-time
  a.limbs == b.limbs
 func isZero*(a: BigInt): CTBool[Word] =
  ## Returns true if a big int is equal to zero
  a.limbs.isZero
 func setZero*(a: var BigInt) =
  ## Set a BigInt to 0
@ -115,6 +105,51 @@ func setOne*(a: var BigInt) =
  ## Set a BigInt to 1
  a.limbs.setOne()
 # Copy
 # ------------------------------------------------------------
 func ccopy*(a: var BigInt, b: BigInt, ctl: CTBool[Word]) =
  ## Constant-time conditional copy
  ## If ctl is true: b is copied into a
  ## if ctl is false: b is not copied and a is untouched
  ## Time and memory accesses are the same whether a copy occurs or not
  ccopy(a.limbs, b.limbs, ctl)
 func cswap*(a, b: var BigInt, ctl: CTBool) =
  ## Swap ``a`` and ``b`` if ``ctl`` is true
  ##
  ## Constant-time:
  ## Whether ``ctl`` is true or not, the same
  ## memory accesses are done (unless the compiler tries to be clever)
  cswap(a.limbs, b.limbs, ctl)
 # Comparison
 # ------------------------------------------------------------
 func `==`*(a, b: BigInt): CTBool[Word] =
  ## Returns true if 2 big ints are equal
  ## Comparison is constant-time
  a.limbs == b.limbs
 func `<`*(a, b: BigInt): CTBool[Word] =
  ## Returns true if a < b
  a.limbs < b.limbs
 func `<=`*(a, b: BigInt): CTBool[Word] =
  ## Returns true if a <= b
  a.limbs <= b.limbs
 func isZero*(a: BigInt): CTBool[Word] =
  ## Returns true if a big int is equal to zero
  a.limbs.isZero
 func isOdd*(a: BigInt): CTBool[Word] =
  ## Returns true if a is odd
  a.limbs.isOdd
 # Arithmetic
 # ------------------------------------------------------------
 func cadd*(a: var BigInt, b: BigInt, ctl: CTBool[Word]): CTBool[Word] =
  ## Constant-time in-place conditional addition
  ## The addition is only performed if ctl is "true"
@ -133,19 +168,16 @@ func cdouble*(a: var BigInt, ctl: CTBool[Word]): CTBool[Word] =
  ## The result carry is always computed.
  (CTBool[Word]) cadd(a.limbs, a.limbs, ctl)
 # ############################################################
 #
 #          BigInt Primitives Optimized for speed
 #
 # ############################################################
 #
 # TODO: fallback to cadd / csub with a "size" compile-option
 func add*(a: var BigInt, b: BigInt): CTBool[Word] =
  ## Constant-time in-place addition
  ## Returns the carry
  (CTBool[Word]) add(a.limbs, b.limbs)
 func add*(a: var BigInt, b: Word): CTBool[Word] =
  ## Constant-time in-place addition
  ## Returns the carry
  (CTBool[Word]) add(a.limbs, b)
 func sub*(a: var BigInt, b: BigInt): CTBool[Word] =
  ## Constant-time in-place substraction
  ## Returns the borrow
@ -177,19 +209,23 @@ func double*(r: var BigInt, a: BigInt): CTBool[Word] =
  ## Returns the carry
  (CTBool[Word]) sum(r.limbs, a.limbs, a.limbs)
-# ############################################################
+func div2*(a: var BigInt) =
-#
+  ## In-place divide ``a`` by 2
-#                    Comparisons
+  a.limbs.shiftRight(1)
 #
 # ############################################################
-func `<`*(a, b: BigInt): CTBool[Word] =
+func cneg*(a: var BigInt, ctl: CTBool) =
-  ## Returns true if a < b
+  ## Conditional negation.
-  a.limbs < b.limbs
+  ## Negate if ``ctl`` is true
  a.limbs.cneg(ctl)
-func `<=`*(a, b: BigInt): CTBool[Word] =
+# Bit Manipulation
-  ## Returns true if a <= b
+# ------------------------------------------------------------
-  a.limbs <= b.limbs
+
 func shiftRight*(a: var BigInt, k: int) =
  ## Shift right by k.
  ##
  ## k MUST be less than the base word size (2^31 or 2^63)
  a.limbs.shiftRight(k)
 # ############################################################
 #
@ -209,6 +245,22 @@ func reduce*[aBits, mBits](r: var BigInt[mBits], a: BigInt[aBits], M: BigInt[mBi
  # pass a pointer+length to a fixed session of the BSS.
  reduce(r.limbs, a.limbs, aBits, M.limbs, mBits)
 func steinsGCD*[bits](r: var BigInt[bits], a, F, M, mp1div2: BigInt[bits]) =
  ## Compute F multiplied the modular inverse of ``a`` modulo M
  ## r ≡ F . a^-1 (mod M)
  ##
  ## M MUST be odd, M does not need to be prime.
  ## ``a`` MUST be less than M.
  r.limbs.steinsGCD(a.limbs, F.limbs, M.limbs, bits, mp1div2.limbs)
 func invmod*[bits](r: var BigInt[bits], a, M, mp1div2: BigInt[bits]) =
  ## Compute the modular inverse of ``a`` modulo M
  ##
  ## The modulus ``M`` MUST be odd
  var one {.noInit.}: BigInt[bits]
  one.setOne()
  r.steinsGCD(a, one, M, mp1div2)
 # ############################################################
 #
 #                 Montgomery Arithmetic
@ -335,3 +387,6 @@ func montyPowUnsafeExponent*[mBits: static int](
                     else: (1 shl windowSize) + 1
  var scratchSpace {.noInit.}: array[scratchLen, Limbs[mBits.wordsRequired]]
  montyPowUnsafeExponent(a.limbs, exponent, M.limbs, one.limbs, negInvModWord, scratchSpace, canUseNoCarryMontyMul)
 {.pop.} # inline
 {.pop.} # raises no exceptions
--- a/constantine/arithmetic/finite_fields.nim
+++ b/constantine/arithmetic/finite_fields.nim
@ -27,7 +27,7 @@
 import
  ../primitives,
  ../config/[common, curves],
-  ./bigints, ./montgomery
+  ./bigints, ./limbs_montgomery
 # type
 #   `Fp`*[C: static Curve] = object
@ -64,7 +64,7 @@ func fromBig*[C: static Curve](dst: var Fp[C], src: BigInt) {.noInit.} =
  dst.mres.montyResidue(src, C.Mod.mres, C.getR2modP(), C.getNegInvModWord(), C.canUseNoCarryMontyMul())
 func toBig*(src: Fp): auto {.noInit.} =
-  ## Convert a finite-field element to a BigInt in natral representation
+  ## Convert a finite-field element to a BigInt in natural representation
  var r {.noInit.}: typeof(src.mres)
  r.redc(src.mres, Fp.C.Mod.mres, Fp.C.getNegInvModWord(), Fp.C.canUseNoCarryMontyMul())
  return r
@ -193,17 +193,6 @@ func powUnsafeExponent*(a: var Fp, exponent: BigInt) =
    Fp.C.canUseNoCarryMontyMul()
  )
 func inv*(a: var Fp) =
  ## Inversion modulo p
  ## Warning ⚠️ :
  ##   - This assumes that `Fp` is a prime field
  const windowSize = 5 # TODO: find best window size for each curves
  a.mres.montyPowUnsafeExponent(
    Fp.C.getInvModExponent(),
    Fp.C.Mod.mres, Fp.C.getMontyOne(),
    Fp.C.getNegInvModWord(), windowSize,
    Fp.C.canUseNoCarryMontyMul()
  )
 # ############################################################
 #
@ -235,12 +224,8 @@ func `*`*(a, b: Fp): Fp {.noInit.} =
 func `*=`*(a: var Fp, b: Fp) =
  ## Multiplication modulo p
-  ##
+  a.prod(a, b)
-  ## Implementation note:
+
-  ## - This requires a temporary field element
+func square*(a: var Fp) =
-  ##
+  ## Squaring modulo p
-  ## Cost
+  a.mres.montySquare(a.mres, Fp.C.Mod.mres, Fp.C.getNegInvModWord(), Fp.C.canUseNoCarryMontySquare())
  ## Stack: 1 * ModulusBitSize
  var tmp{.noInit.}: Fp
  tmp.prod(a, b)
  a = tmp
--- a/constantine/arithmetic/finite_fields_inversion.nim
+++ b/constantine/arithmetic/finite_fields_inversion.nim
@ -0,0 +1,133 @@
 # Constantine
 # Copyright (c) 2018-2019    Status Research & Development GmbH
 # Copyright (c) 2020-Present Mamy André-Ratsimbazafy
 # Licensed and distributed under either of
 #   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
 #   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
 # at your option. This file may not be copied, modified, or distributed except according to those terms.
 import
  ../config/[common, curves],
  ./finite_fields
 # ############################################################
 #
 #                  Specialized inversions
 #
 # ############################################################
 # Field-specific inversion routines
 template repeat(num: int, body: untyped) =
  for _ in 0 ..< num:
    body
 # Secp256k1
 # ------------------------------------------------------------
 func invmod_addchain(r: var Fp[Secp256k1], a: Fp[Secp256k1]) =
  ## We invert via Little Fermat's theorem
  ## a^(-1) ≡ a^(p-2) (mod p)
  ## with p = "0xFFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFE FFFFFC2F"
  ## We take advantage of the prime special form to hardcode
  ## the sequence of squarings and multiplications for the modular exponentiation
  ##
  ## See libsecp256k1
  ##
  ## The binary representation of (p - 2) has 5 blocks of 1s, with lengths in
  ## { 1, 2, 22, 223 }. Use an addition chain to calculate 2^n - 1 for each block:
  ## [1], [2], 3, 6, 9, 11, [22], 44, 88, 176, 220, [223]
  var
    x2{.noInit.}: Fp[Secp256k1]
    x3{.noInit.}: Fp[Secp256k1]
    x6{.noInit.}: Fp[Secp256k1]
    x9{.noInit.}: Fp[Secp256k1]
    x11{.noInit.}: Fp[Secp256k1]
    x22{.noInit.}: Fp[Secp256k1]
    x44{.noInit.}: Fp[Secp256k1]
    x88{.noInit.}: Fp[Secp256k1]
    x176{.noInit.}: Fp[Secp256k1]
    x220{.noInit.}: Fp[Secp256k1]
    x223{.noInit.}: Fp[Secp256k1]
  x2.square(a)
  x2 *= a
  x3.square(x2)
  x3 *= a
  x6 = x3
  repeat 3: x6.square()
  x6 *= x3
  x9 = x6
  repeat 3: x9.square()
  x9 *= x3
  x11 = x9
  repeat 2: x11.square()
  x11 *= x2
  x22 = x11
  repeat 11: x22.square()
  x22 *= x11
  x44 = x22
  repeat 22: x44.square()
  x44 *= x22
  x88 = x44
  repeat 44: x88.square()
  x88 *= x44
  x176 = x88
  repeat 88: x88.square()
  x176 *= x88
  x220 = x176
  repeat 44: x220.square()
  x220 *= x44
  x223 = x220
  repeat 3: x223.square()
  x223 *= x3
  # The final result is then assembled using a sliding window over the blocks
  r = x223
  repeat 23: r.square()
  r *= x22
  repeat 5: r.square()
  r *= a
  repeat 3: r.square()
  r *= x2
  repeat 2: r.square()
  r *= a
 # BN Curves
 # ------------------------------------------------------------
 # Efficient Pairings and ECC for Embedded Systems
 # Thomas Unterluggauer and Erich Wenger
 # https://eprint.iacr.org/2014/800.pdf
 #
 # BN curve field modulus are of the form:
 #   p = 36u^4 + 36u^3 + 24u^2 + 6u + 1
 #
 # We construct the following multiplication-squaring chain
 # a^-1 mod p = a^(p-2) mod p                 (Little Fermat Theorem)
 #            = a^(36 u^4 + 36 u^3 + 24 u^2 + 6u + 1 - 2) mod p
 #            = a^(36 u^4) . a^(36 u^3) . a^(24 u^2) . a^(6u-1) mod p
 #
 # Note: it only works for u positive, in particular BN254 doesn't work :/
 #       Is there a way to only use a^-u or even powers?
 # ############################################################
 #
 #                         Dispatch
 #
 # ############################################################
 func inv*(r: var Fp, a: Fp) =
  ## Inversion modulo p
  # For now we don't activate the addition chain.
  # Performance is equal to GCD and it does not pass test on 𝔽p2
  # We need faster squaring/multiplications
  r.mres.steinsGCD(a.mres, Fp.C.getR2modP(), Fp.C.Mod.mres, Fp.C.getPrimePlus1div2())
--- a/constantine/arithmetic/limbs.nim
+++ b/constantine/arithmetic/limbs.nim
@ -43,6 +43,22 @@ type Limbs*[N: static int] = array[N, Word]
  ##
  ## but for unknown reason, it prevents semchecking `bits`
 debug:
  import strutils
  func toString*(a: Limbs): string =
    result = "["
    result.add $BaseType(a[0]) & " (0x" & toHex(BaseType(a[0])) & ')'
    for i in 1 ..< a.len:
      result.add ", "
      result.add $BaseType(a[i]) & " (0x" & toHex(BaseType(a[i])) & ')'
    result.add "])"
  func toHex*(a: Limbs): string =
    result = "0x"
    for i in countdown(a.len-1, 0):
      result.add toHex(BaseType(a[i]))
 # No exceptions allowed
 {.push raises: [].}
@ -82,20 +98,8 @@ type Limbs*[N: static int] = array[N, Word]
 # as we avoid the parmeter packing/unpacking ceremony at function entry/exit
 # and unrolling overhead is minimal.
-func `==`*(a, b: Limbs): CTBool[Word] =
+# Initialization
-  ## Returns true if 2 limbs are equal
+# ------------------------------------------------------------
  ## Comparison is constant-time
  var accum = Zero
  for i in 0 ..< a.len:
    accum = accum or (a[i] xor b[i])
  result = accum.isZero()
 func isZero*(a: Limbs): CTBool[Word] =
  ## Returns true if ``a`` is equal to zero
  var accum = Zero
  for i in 0 ..< a.len:
    accum = accum or a[i]
  result = accum.isZero()
 func setZero*(a: var Limbs) =
  ## Set ``a`` to 0
@ -107,6 +111,9 @@ func setOne*(a: var Limbs) =
  when a.len > 1:
    zeroMem(a[1].addr, (a.len - 1) * sizeof(Word))
 # Copy
 # ------------------------------------------------------------
 func ccopy*(a: var Limbs, b: Limbs, ctl: CTBool[Word]) =
  ## Constant-time conditional copy
  ## If ctl is true: b is copied into a
@ -118,6 +125,65 @@ func ccopy*(a: var Limbs, b: Limbs, ctl: CTBool[Word]) =
  for i in 0 ..< a.len:
    ctl.ccopy(a[i], b[i])
 func cswap*(a, b: var Limbs, ctl: CTBool) =
  ## Swap ``a`` and ``b`` if ``ctl`` is true
  ##
  ## Constant-time:
  ## Whether ``ctl`` is true or not, the same
  ## memory accesses are done (unless the compiler tries to be clever)
  var mask = -(Word ctl)
  for i in 0 ..< a.len:
    let t = mask and (a[i] xor b[i])
    a[i] = a[i] xor t
    b[i] = b[i] xor t
 # Comparison
 # ------------------------------------------------------------
 func `==`*(a, b: Limbs): CTBool[Word] =
  ## Returns true if 2 limbs are equal
  ## Comparison is constant-time
  var accum = Zero
  for i in 0 ..< a.len:
    accum = accum or (a[i] xor b[i])
  result = accum.isZero()
 func `<`*(a, b: Limbs): CTBool[Word] =
  ## Returns true if a < b
  ## Comparison is constant-time
  var diff: Word
  var borrow: Borrow
  for i in 0 ..< a.len:
    subB(borrow, diff, a[i], b[i], borrow)
  result = (CTBool[Word])(borrow)
 func `<=`*(a, b: Limbs): CTBool[Word] =
  ## Returns true if a <= b
  ## Comparison is constant-time
  not(b < a)
 func isZero*(a: Limbs): CTBool[Word] =
  ## Returns true if ``a`` is equal to zero
  var accum = Zero
  for i in 0 ..< a.len:
    accum = accum or a[i]
  result = accum.isZero()
 func isOne*(a: Limbs): CTBool[Word] =
  ## Returns true if ``a`` is equal to one
  result = a[0] == Word(1)
  for i in 1 ..< a.len:
    result = result and a[i].isZero()
 func isOdd*(a: Limbs): CTBool[Word] =
  ## Returns true if a is odd
  CTBool[Word](a[0] and Word(1))
 # Arithmetic
 # ------------------------------------------------------------
 func add*(a: var Limbs, b: Limbs): Carry =
  ## Limbs addition
  ## Returns the carry
@ -125,6 +191,14 @@ func add*(a: var Limbs, b: Limbs): Carry =
  for i in 0 ..< a.len:
    addC(result, a[i], a[i], b[i], result)
 func add*(a: var Limbs, w: Word): Carry =
  ## Limbs addition, add a number that fits in a word
  ## Returns the carry
  result = Carry(0)
  addC(result, a[0], a[0], w, result)
  for i in 1 ..< a.len:
    addC(result, a[i], a[i], Zero, result)
 func cadd*(a: var Limbs, b: Limbs, ctl: CTBool[Word]): Carry =
  ## Limbs conditional addition
  ## Returns the carry
@ -180,266 +254,46 @@ func diff*(r: var Limbs, a, b: Limbs): Borrow =
  for i in 0 ..< a.len:
    subB(result, r[i], a[i], b[i], result)
-func `<`*(a, b: Limbs): CTBool[Word] =
+func cneg*(a: var Limbs, ctl: CTBool) =
-  ## Returns true if a < b
+  ## Conditional negation.
-  ## Comparison is constant-time
+  ## Negate if ``ctl`` is true
-  var diff: Word
+
-  var borrow: Borrow
+  # Algorithm:
  # In two-complement representation
  #  -x <=> not(x) + 1 <=> x xor 0xFF... + 1
  # and
  #   x <=> x xor 0x00...<=> x xor 0x00... + 0
  #
  # So we need to xor all words and then add 1
  # The "+1" might carry
  # So we fuse the 2 steps
  let mask = -Word(ctl)              # Obtain a 0xFF... or 0x00... mask
  var carry = Word(ctl)
  for i in 0 ..< a.len:
-    subB(borrow, diff, a[i], b[i], borrow)
+    let t = (a[i] xor mask) + carry  # XOR with mask and add 0x01 or 0x00 respectively
    carry = Word(t < carry)          # Carry on
    a[i] = t
-  result = (CTBool[Word])(borrow)
+# Bit manipulation
 # ------------------------------------------------------------
-func `<=`*(a, b: Limbs): CTBool[Word] =
+func shiftRight*(a: var Limbs, k: int) =
-  ## Returns true if a <= b
+  ## Shift right by k.
-  ## Comparison is constant-time
+  ##
-  not(b < a)
+  ## k MUST be less than the base word size (2^32 or 2^64)
  # We don't reuse shr as this is an in-place operation
  # Do we need to return the shifted out part?
  #
  # Note: for speed, loading a[i] and a[i+1]
  #       instead of a[i-1] and a[i]
  #       is probably easier to parallelize for the compiler
  #       (antidependence WAR vs loop-carried dependence RAW)
  # checkWordShift(k)
  for i in 0 ..< a.len-1:
    a[i] = (a[i] shr k) or (a[i+1] shl (WordBitWidth - k))
  a[a.len-1] = a[a.len-1] shr k
 {.pop.} # inline
-
+{.pop.} # raises no exceptions
 # ############################################################
 #
 #                   Modular BigInt
 #
 # ############################################################
 #
 # To avoid code-size explosion due to monomorphization
 # and given that reductions are not in hot path in Constantine
 # we use type-erased procedures, instead of instantiating
 # one per number of limbs combination
 # Type-erasure
 # ------------------------------------------------------------
 type
  LimbsView = ptr UncheckedArray[Word]
    ## Type-erased fixed-precision limbs
    ##
    ## This type mirrors the Limb type and is used
    ## for some low-level computation API
    ## This design
    ## - avoids code bloat due to generic monomorphization
    ##   otherwise limbs routines would have an instantiation for
    ##   each number of words.
    ##
    ## Accesses should be done via BigIntViewConst / BigIntViewConst
    ## to have the compiler check for mutability
  # "Indirection" to enforce pointer types deep immutability
  LimbsViewConst = distinct LimbsView
    ## Immutable view into the limbs of a BigInt
  LimbsViewMut = distinct LimbsView
    ## Mutable view into a BigInt
  LimbsViewAny = LimbsViewConst or LimbsViewMut
 # Deep Mutability safety
 # ------------------------------------------------------------
 template view(a: Limbs): LimbsViewConst =
  ## Returns a borrowed type-erased immutable view to a bigint
  LimbsViewConst(cast[LimbsView](a.unsafeAddr))
 template view(a: var Limbs): LimbsViewMut =
  ## Returns a borrowed type-erased mutable view to a mutable bigint
  LimbsViewMut(cast[LimbsView](a.addr))
 template `[]`*(v: LimbsViewConst, limbIdx: int): Word =
  LimbsView(v)[limbIdx]
 template `[]`*(v: LimbsViewMut, limbIdx: int): var Word =
  LimbsView(v)[limbIdx]
 template `[]=`*(v: LimbsViewMut, limbIdx: int, val: Word) =
  LimbsView(v)[limbIdx] = val
 # Type-erased add-sub
 # ------------------------------------------------------------
 func cadd(a: LimbsViewMut, b: LimbsViewAny, ctl: CTBool[Word], len: int): Carry =
  ## Type-erased conditional addition
  ## Returns the carry
  ##
  ## if ctl is true: a <- a + b
  ## if ctl is false: a <- a
  ## The carry is always computed whether ctl is true or false
  ##
  ## Time and memory accesses are the same whether a copy occurs or not
  result = Carry(0)
  var sum: Word
  for i in 0 ..< len:
    addC(result, sum, a[i], b[i], result)
    ctl.ccopy(a[i], sum)
 func csub(a: LimbsViewMut, b: LimbsViewAny, ctl: CTBool[Word], len: int): Borrow =
  ## Type-erased conditional addition
  ## Returns the borrow
  ##
  ## if ctl is true: a <- a - b
  ## if ctl is false: a <- a
  ## The borrow is always computed whether ctl is true or false
  ##
  ## Time and memory accesses are the same whether a copy occurs or not
  result = Borrow(0)
  var diff: Word
  for i in 0 ..< len:
    subB(result, diff, a[i], b[i], result)
    ctl.ccopy(a[i], diff)
 # Modular reduction
 # ------------------------------------------------------------
 func numWordsFromBits(bits: int): int {.inline.} =
  const divShiftor = log2(uint32(WordBitWidth))
  result = (bits + WordBitWidth - 1) shr divShiftor
 func shlAddMod_estimate(a: LimbsViewMut, aLen: int,
                        c: Word, M: LimbsViewConst, mBits: int
                      ): tuple[neg, tooBig: CTBool[Word]] =
  ## Estimate a <- a shl 2^w + c (mod M)
  ##
  ## with w the base word width, usually 32 on 32-bit platforms and 64 on 64-bit platforms
  ##
  ## Updates ``a`` and returns ``neg`` and ``tooBig``
  ## If ``neg``, the estimate in ``a`` is negative and ``M`` must be added to it.
  ## If ``tooBig``, the estimate in ``a`` overflowed and ``M`` must be substracted from it.
  # Aliases
  # ----------------------------------------------------------------------
  let MLen = numWordsFromBits(mBits)
  # Captures aLen and MLen
  template `[]`(v: untyped, limbIdxFromEnd: BackwardsIndex): Word {.dirty.}=
    v[`v Len` - limbIdxFromEnd.int]
  # ----------------------------------------------------------------------
                                                          # Assuming 64-bit words
  let hi = a[^1]                                          # Save the high word to detect carries
  let R = mBits and (WordBitWidth - 1)                    # R = mBits mod 64
  var a0, a1, m0: Word
  if R == 0:                                              # If the number of mBits is a multiple of 64
    a0 = a[^1]                                            #
    moveMem(a[1].addr, a[0].addr, (aLen-1) * Word.sizeof) # we can just shift words
    a[0] = c                                              # and replace the first one by c
    a1 = a[^1]
    m0 = M[^1]
  else:                                                   # Else: need to deal with partial word shifts at the edge.
    a0 = (a[^1] shl (WordBitWidth-R)) or (a[^2] shr R)
    moveMem(a[1].addr, a[0].addr, (aLen-1) * Word.sizeof)
    a[0] = c
    a1 = (a[^1] shl (WordBitWidth-R)) or (a[^2] shr R)
    m0 = (M[^1] shl (WordBitWidth-R)) or (M[^2] shr R)
  # m0 has its high bit set. (a0, a1)/p0 fits in a limb.
  # Get a quotient q, at most we will be 2 iterations off
  # from the true quotient
  var q, r: Word
  unsafeDiv2n1n(q, r, a0, a1, m0)                # Estimate quotient
  q = mux(                                       # If n_hi == divisor
        a0 == m0, MaxWord,                       # Quotient == MaxWord (0b1111...1111)
        mux(
          q.isZero, Zero,                        # elif q == 0, true quotient = 0
          q - One                                # else instead of being of by 0, 1 or 2
        )                                        # we returning q-1 to be off by -1, 0 or 1
      )
  # Now substract a*2^64 - q*p
  var carry = Zero
  var over_p = CtTrue                            # Track if quotient greater than the modulus
  for i in 0 ..< MLen:
    var qp_lo: Word
    block: # q*p
      # q * p + carry (doubleword) carry from previous limb
      muladd1(carry, qp_lo, q, M[i], Word carry)
    block: # a*2^64 - q*p
      var borrow: Borrow
      subB(borrow, a[i], a[i], qp_lo, Borrow(0))
      carry += Word(borrow) # Adjust if borrow
    over_p = mux(
              a[i] == M[i], over_p,
              a[i] > M[i]
            )
  # Fix quotient, the true quotient is either q-1, q or q+1
  #
  # if carry < q or carry == q and over_p we must do "a -= p"
  # if carry > hi (negative result) we must do "a += p"
  result.neg = Word(carry) > hi
  result.tooBig = not(result.neg) and (over_p or (Word(carry) < hi))
 func shlAddMod(a: LimbsViewMut, aLen: int,
               c: Word, M: LimbsViewConst, mBits: int) =
  ## Fused modular left-shift + add
  ## Shift input `a` by a word and add `c` modulo `M`
  ##
  ## With a word W = 2^WordBitSize and a modulus M
  ## Does a <- a * W + c (mod M)
  ##
  ## The modulus `M` most-significant bit at `mBits` MUST be set.
  if mBits <= WordBitWidth:
    # If M fits in a single limb
    # We normalize M with R so that the MSB is set
    # And normalize (a * 2^64 + c) by R as well to maintain the result
    # This ensures that (a0, a1)/p0 fits in a limb.
    let R = mBits and (WordBitWidth - 1)
    # (hi, lo) = a * 2^64 + c
    let hi = (a[0] shl (WordBitWidth-R)) or (c shr R)
    let lo = c shl (WordBitWidth-R)
    let m0 = M[0] shl (WordBitWidth-R)
    var q, r: Word
    unsafeDiv2n1n(q, r, hi, lo, m0)  # (hi, lo) mod M
    a[0] = r shr (WordBitWidth-R)
  else:
    ## Multiple limbs
    let (neg, tooBig) = shlAddMod_estimate(a, aLen, c, M, mBits)
    discard a.cadd(M, ctl = neg, aLen)
    discard a.csub(M, ctl = tooBig, aLen)
 func reduce(r: LimbsViewMut,
            a: LimbsViewAny, aBits: int,
            M: LimbsViewConst, mBits: int) =
  ## Reduce `a` modulo `M` and store the result in `r`
  let aLen = numWordsFromBits(aBits)
  let mLen = numWordsFromBits(mBits)
  let rLen = mLen
  if aBits < mBits:
    # if a uses less bits than the modulus,
    # it is guaranteed < modulus.
    # This relies on the precondition that the modulus uses all declared bits
    copyMem(r[0].addr, a[0].unsafeAddr, aLen * sizeof(Word))
    for i in aLen ..< mLen:
      r[i] = Zero
  else:
    # a length i at least equal to the modulus.
    # we can copy modulus.limbs-1 words
    # and modular shift-left-add the rest
    let aOffset = aLen - mLen
    copyMem(r[0].addr, a[aOffset+1].unsafeAddr, (mLen-1) * sizeof(Word))
    r[rLen - 1] = Zero
    # Now shift-left the copied words while adding the new word modulo M
    for i in countdown(aOffset, 0):
      shlAddMod(r, rLen, a[i], M, mBits)
 func reduce*[aLen, mLen](r: var Limbs[mLen],
                         a: Limbs[aLen], aBits: static int,
                         M: Limbs[mLen], mBits: static int
                        ) {.inline.} =
  ## Reduce `a` modulo `M` and store the result in `r`
  ##
  ## Warning ⚠: At the moment this is NOT constant-time
  ##            as it relies on hardware division.
  # This is implemented via type-erased indirection to avoid
  # a significant amount of code duplication if instantiated for
  # varying bitwidth.
  reduce(r.view(), a.view(), aBits, M.view(), mBits)
--- a/constantine/arithmetic/limbs_modular.nim
+++ b/constantine/arithmetic/limbs_modular.nim
@ -0,0 +1,371 @@
 # Constantine
 # Copyright (c) 2018-2019    Status Research & Development GmbH
 # Copyright (c) 2020-Present Mamy André-Ratsimbazafy
 # Licensed and distributed under either of
 #   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
 #   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
 # at your option. This file may not be copied, modified, or distributed except according to those terms.
 import
  ../config/common,
  ../primitives,
  ./limbs
 # No exceptions allowed
 {.push raises: [].}
 # ############################################################
 #
 #                    Modular inversion
 #
 # ############################################################
 # Generic (odd-only modulus)
 # ------------------------------------------------------------
 # Algorithm by Niels Möller,
 # a modified version of Stein's Algorithm (binary Extended Euclid GCD)
 #
 # Algorithm 5 in
 # Fast Software Polynomial Multiplication on ARM Processors Using the NEON Engine
 # Danilo Camara, Conrado P. L. Gouvea, Julio Lopez, and Ricardo Dahab
 # https://link.springer.com/content/pdf/10.1007%2F978-3-642-40588-4_10.pdf
 #
 # Input: integer x, odd integer n, x < n
 # Output: x−1 (mod n)
 # 1:   function ModInv(x, n)
 # 2:   (a, b, u, v) ← (x, n, 1, 1)
 # 3:   ℓ ← ⌊log2 n⌋ + 1            ⮚ number of bits in n
 # 4:   for i ← 0 to 2ℓ − 1 do
 # 5:     odd ← a & 1
 # 6:     if odd and a ≥ b then
 # 7:       a ← a − b
 # 8:     else if odd and a < b then
 # 9:       (a, b, u, v) ← (b − a, a, v, u)
 # 10:    a ← a >> 1
 # 11:    if odd then u ← u − v
 # 12:    if u < 0 then u ← u + n
 # 13:    if u & 1 then u ← u + n
 # 14:    u ← u >> 1
 # 15:  return v
 #
 # Warning ⚠️: v should be 0 at initialization
 #
 # We modify it to return F . a^-1
 # So that we can pass an adjustment factor F
 # And directly compute modular division or Montgomery inversion
 func steinsGCD*(v: var Limbs, a: Limbs, F, M: Limbs, bits: int, mp1div2: Limbs) =
  ## Compute F multiplied the modular inverse of ``a`` modulo M
  ## r ≡ F . a^-1 (mod M)
  ##
  ## M MUST be odd, M does not need to be prime.
  ## ``a`` MUST be less than M.
  ##
  ## No information about ``a`` in particular its actual length in bits is leaked.
  ##
  ## This takes (M+1)/2 (mp1div2) as a precomputed parameter as a slight optimization
  ## in stack size and speed.
  # Ideally we need registers for a, b, u, v
  # but:
  #   - Even with ~256-bit primes, that's 4 limbs = 4*4 => 16 registers
  #   - x86_64 only has 16 general purposes registers
  #   - Registers are needed for the loop counter and comparison results
  #   - CMOV is reg <- RM so can move registers/memory into registers
  #     but cannot move into memory.
  # so we choose to keep "v" from the algorithm in memory as `r`
  # TODO: the inlining of primitives like `csub` is bad for codesize
  #       but there is a 80% slowdown without it.
  # TODO: The `cmov` in `cadd` and `csub` always retest the condition
  #       which is probably costly here given how many we have.
  var a = a
  var b = M
  var u = F
  v.setZero()
  for i in 0 ..< 2 * bits:
    debug: doAssert bool(b.isOdd)
    let isOddA = a.isOdd()
    # if isOddA: a -= b
    let aLessThanB = isOddA and (CTBool[Word]) a.csub(b, isOddA)
    # if a < b and the sub was processed
    # in that case, b <- a = a - b + b
    discard b.cadd(a, aLessThanB)
    # and a <- -new_a = (b-a)
    a.cneg(aLessThanB)
    debug: doAssert not bool(a.isOdd)
    a.shiftRight(1)
    # Swap u and v is a < b
    u.cswap(v, aLessThanB)
    # if isOddA: u -= v (mod M)
    let neg = isOddA and (CTBool[Word]) u.csub(v, isOddA)
    let corrected = u.cadd(M, neg)
    let isOddU = u.isOdd()
    # if u.isOdd:
    #   u += n
    # u = u shr 1
    #
    # Warning ⚠️: u += n will overflow the BigInt
    #   and we might lose a bit on the next shift
    #   Instead we shift first and then add hald the modulus rounded up
    u.shiftRight(1)
    let carry = u.cadd(mp1div2, isOddU)
    debug: doAssert not carry.bool
  debug:
    doAssert bool a.isZero()
    doAssert bool b.isOne() # GCD always exist if a and M are relatively prime
 # ############################################################
 #
 #                   Modular Reduction
 #
 # ############################################################
 #
 # To avoid code-size explosion due to monomorphization
 # and given that reductions are not in hot path in Constantine
 # we use type-erased procedures, instead of instantiating
 # one per number of limbs combination
 # Type-erasure
 # ------------------------------------------------------------
 type
  LimbsView = ptr UncheckedArray[Word]
    ## Type-erased fixed-precision limbs
    ##
    ## This type mirrors the Limb type and is used
    ## for some low-level computation API
    ## This design
    ## - avoids code bloat due to generic monomorphization
    ##   otherwise limbs routines would have an instantiation for
    ##   each number of words.
    ##
    ## Accesses should be done via BigIntViewConst / BigIntViewConst
    ## to have the compiler check for mutability
  # "Indirection" to enforce pointer types deep immutability
  LimbsViewConst = distinct LimbsView
    ## Immutable view into the limbs of a BigInt
  LimbsViewMut = distinct LimbsView
    ## Mutable view into a BigInt
  LimbsViewAny = LimbsViewConst or LimbsViewMut
 # Deep Mutability safety
 # ------------------------------------------------------------
 template view(a: Limbs): LimbsViewConst =
  ## Returns a borrowed type-erased immutable view to a bigint
  LimbsViewConst(cast[LimbsView](a.unsafeAddr))
 template view(a: var Limbs): LimbsViewMut =
  ## Returns a borrowed type-erased mutable view to a mutable bigint
  LimbsViewMut(cast[LimbsView](a.addr))
 template `[]`*(v: LimbsViewConst, limbIdx: int): Word =
  LimbsView(v)[limbIdx]
 template `[]`*(v: LimbsViewMut, limbIdx: int): var Word =
  LimbsView(v)[limbIdx]
 template `[]=`*(v: LimbsViewMut, limbIdx: int, val: Word) =
  LimbsView(v)[limbIdx] = val
 # Type-erased add-sub
 # ------------------------------------------------------------
 func cadd(a: LimbsViewMut, b: LimbsViewAny, ctl: CTBool[Word], len: int): Carry =
  ## Type-erased conditional addition
  ## Returns the carry
  ##
  ## if ctl is true: a <- a + b
  ## if ctl is false: a <- a
  ## The carry is always computed whether ctl is true or false
  ##
  ## Time and memory accesses are the same whether a copy occurs or not
  result = Carry(0)
  var sum: Word
  for i in 0 ..< len:
    addC(result, sum, a[i], b[i], result)
    ctl.ccopy(a[i], sum)
 func csub(a: LimbsViewMut, b: LimbsViewAny, ctl: CTBool[Word], len: int): Borrow =
  ## Type-erased conditional addition
  ## Returns the borrow
  ##
  ## if ctl is true: a <- a - b
  ## if ctl is false: a <- a
  ## The borrow is always computed whether ctl is true or false
  ##
  ## Time and memory accesses are the same whether a copy occurs or not
  result = Borrow(0)
  var diff: Word
  for i in 0 ..< len:
    subB(result, diff, a[i], b[i], result)
    ctl.ccopy(a[i], diff)
 # Modular reduction
 # ------------------------------------------------------------
 func numWordsFromBits(bits: int): int {.inline.} =
  const divShiftor = log2(uint32(WordBitWidth))
  result = (bits + WordBitWidth - 1) shr divShiftor
 func shlAddMod_estimate(a: LimbsViewMut, aLen: int,
                        c: Word, M: LimbsViewConst, mBits: int
                      ): tuple[neg, tooBig: CTBool[Word]] =
  ## Estimate a <- a shl 2^w + c (mod M)
  ##
  ## with w the base word width, usually 32 on 32-bit platforms and 64 on 64-bit platforms
  ##
  ## Updates ``a`` and returns ``neg`` and ``tooBig``
  ## If ``neg``, the estimate in ``a`` is negative and ``M`` must be added to it.
  ## If ``tooBig``, the estimate in ``a`` overflowed and ``M`` must be substracted from it.
  # Aliases
  # ----------------------------------------------------------------------
  let MLen = numWordsFromBits(mBits)
  # Captures aLen and MLen
  template `[]`(v: untyped, limbIdxFromEnd: BackwardsIndex): Word {.dirty.}=
    v[`v Len` - limbIdxFromEnd.int]
  # ----------------------------------------------------------------------
                                                          # Assuming 64-bit words
  let hi = a[^1]                                          # Save the high word to detect carries
  let R = mBits and (WordBitWidth - 1)                    # R = mBits mod 64
  var a0, a1, m0: Word
  if R == 0:                                              # If the number of mBits is a multiple of 64
    a0 = a[^1]                                            #
    moveMem(a[1].addr, a[0].addr, (aLen-1) * Word.sizeof) # we can just shift words
    a[0] = c                                              # and replace the first one by c
    a1 = a[^1]
    m0 = M[^1]
  else:                                                   # Else: need to deal with partial word shifts at the edge.
    a0 = (a[^1] shl (WordBitWidth-R)) or (a[^2] shr R)
    moveMem(a[1].addr, a[0].addr, (aLen-1) * Word.sizeof)
    a[0] = c
    a1 = (a[^1] shl (WordBitWidth-R)) or (a[^2] shr R)
    m0 = (M[^1] shl (WordBitWidth-R)) or (M[^2] shr R)
  # m0 has its high bit set. (a0, a1)/p0 fits in a limb.
  # Get a quotient q, at most we will be 2 iterations off
  # from the true quotient
  var q, r: Word
  unsafeDiv2n1n(q, r, a0, a1, m0)                # Estimate quotient
  q = mux(                                       # If n_hi == divisor
        a0 == m0, MaxWord,                       # Quotient == MaxWord (0b1111...1111)
        mux(
          q.isZero, Zero,                        # elif q == 0, true quotient = 0
          q - One                                # else instead of being of by 0, 1 or 2
        )                                        # we returning q-1 to be off by -1, 0 or 1
      )
  # Now substract a*2^64 - q*p
  var carry = Zero
  var over_p = CtTrue                            # Track if quotient greater than the modulus
  for i in 0 ..< MLen:
    var qp_lo: Word
    block: # q*p
      # q * p + carry (doubleword) carry from previous limb
      muladd1(carry, qp_lo, q, M[i], Word carry)
    block: # a*2^64 - q*p
      var borrow: Borrow
      subB(borrow, a[i], a[i], qp_lo, Borrow(0))
      carry += Word(borrow) # Adjust if borrow
    over_p = mux(
              a[i] == M[i], over_p,
              a[i] > M[i]
            )
  # Fix quotient, the true quotient is either q-1, q or q+1
  #
  # if carry < q or carry == q and over_p we must do "a -= p"
  # if carry > hi (negative result) we must do "a += p"
  result.neg = Word(carry) > hi
  result.tooBig = not(result.neg) and (over_p or (Word(carry) < hi))
 func shlAddMod(a: LimbsViewMut, aLen: int,
               c: Word, M: LimbsViewConst, mBits: int) =
  ## Fused modular left-shift + add
  ## Shift input `a` by a word and add `c` modulo `M`
  ##
  ## With a word W = 2^WordBitSize and a modulus M
  ## Does a <- a * W + c (mod M)
  ##
  ## The modulus `M` most-significant bit at `mBits` MUST be set.
  if mBits <= WordBitWidth:
    # If M fits in a single limb
    # We normalize M with R so that the MSB is set
    # And normalize (a * 2^64 + c) by R as well to maintain the result
    # This ensures that (a0, a1)/p0 fits in a limb.
    let R = mBits and (WordBitWidth - 1)
    # (hi, lo) = a * 2^64 + c
    let hi = (a[0] shl (WordBitWidth-R)) or (c shr R)
    let lo = c shl (WordBitWidth-R)
    let m0 = M[0] shl (WordBitWidth-R)
    var q, r: Word
    unsafeDiv2n1n(q, r, hi, lo, m0)  # (hi, lo) mod M
    a[0] = r shr (WordBitWidth-R)
  else:
    ## Multiple limbs
    let (neg, tooBig) = shlAddMod_estimate(a, aLen, c, M, mBits)
    discard a.cadd(M, ctl = neg, aLen)
    discard a.csub(M, ctl = tooBig, aLen)
 func reduce(r: LimbsViewMut,
            a: LimbsViewAny, aBits: int,
            M: LimbsViewConst, mBits: int) =
  ## Reduce `a` modulo `M` and store the result in `r`
  let aLen = numWordsFromBits(aBits)
  let mLen = numWordsFromBits(mBits)
  let rLen = mLen
  if aBits < mBits:
    # if a uses less bits than the modulus,
    # it is guaranteed < modulus.
    # This relies on the precondition that the modulus uses all declared bits
    copyMem(r[0].addr, a[0].unsafeAddr, aLen * sizeof(Word))
    for i in aLen ..< mLen:
      r[i] = Zero
  else:
    # a length i at least equal to the modulus.
    # we can copy modulus.limbs-1 words
    # and modular shift-left-add the rest
    let aOffset = aLen - mLen
    copyMem(r[0].addr, a[aOffset+1].unsafeAddr, (mLen-1) * sizeof(Word))
    r[rLen - 1] = Zero
    # Now shift-left the copied words while adding the new word modulo M
    for i in countdown(aOffset, 0):
      shlAddMod(r, rLen, a[i], M, mBits)
 func reduce*[aLen, mLen](r: var Limbs[mLen],
                         a: Limbs[aLen], aBits: static int,
                         M: Limbs[mLen], mBits: static int
                        ) {.inline.} =
  ## Reduce `a` modulo `M` and store the result in `r`
  ##
  ## Warning ⚠: At the moment this is NOT constant-time
  ##            as it relies on hardware division.
  # This is implemented via type-erased indirection to avoid
  # a significant amount of code duplication if instantiated for
  # varying bitwidth.
  reduce(r.view(), a.view(), aBits, M.view(), mBits)
 {.pop.} # raises no exceptions
--- a/constantine/arithmetic/limbs_montgomery.nim
+++ b/constantine/arithmetic/limbs_montgomery.nim
@ -21,7 +21,7 @@ import
 # Note: Montgomery multiplications and squarings are the biggest bottlenecks
 #       of an elliptic curve library, asymptotically 100% of the costly algorithms:
 #       - field exponentiation
-#       - field inversion
+#       - field inversion via Little Fermat
 #       - extension towers multiplication, squarings, inversion
 #       - elliptic curve point addition
 #       - elliptic curve point doubling
@ -80,6 +80,9 @@ macro staticFor(idx: untyped{nkIdent}, start, stopEx: static int, body: untyped)
      body.replaceNodes(idx, newLit i)
    )
 # No exceptions allowed
 {.push raises: [].}
 # Implementation
 # ------------------------------------------------------------
 # Note: the low-level implementations should not use static parameter
@ -565,3 +568,5 @@ func montyPowUnsafeExponent*(
        # scratchspace[1] holds the original `a`
        scratchspace[0].montyMul(a, scratchspace[1], M, negInvModWord, canUseNoCarryMontyMul)
      a = scratchspace[0]
 {.pop.} # raises no exceptions
--- a/constantine/arithmetic/precomputed.nim
+++ b/constantine/arithmetic/precomputed.nim
@ -67,6 +67,19 @@ func subB(bOut, diff: var BaseType, a, b, bIn: BaseType) =
  bOut = BaseType(noBorrowHi == 0)
  diff = merge(rHi, rLo)
 func add(a: var BigInt, w: BaseType): bool =
  ## Limbs addition, add a number that fits in a word
  ## Returns the carry
  var carry, sum: BaseType
  addC(carry, sum, BaseType(a.limbs[0]), w, carry)
  a.limbs[0] = Word(sum)
  for i in 1 ..< a.limbs.len:
    let ai = BaseType(a.limbs[i])
    addC(carry, sum, ai, 0, carry)
    a.limbs[i] = Word(sum)
  result = bool(carry)
 func dbl(a: var BigInt): bool =
  ## In-place multiprecision double
  ##   a -> 2a
@ -246,9 +259,24 @@ func primeMinus2_BE*[bits: static int](
     ): array[(bits+7) div 8, byte] {.noInit.} =
  ## Compute an input prime-2
  ## and return the result as a canonical byte array / octet string
-  ## For use to precompute modular inverse exponent.
+  ## For use to precompute modular inverse exponent
  ## when using inversion by Little Fermat Theorem a^-1 = a^(p-2) mod p
  var tmp = P
  discard tmp.csub(BigInt[bits].fromRawUint([byte 2], bigEndian), true)
  result.exportRawUint(tmp, bigEndian)
 func primePlus1div2*(P: BigInt): BigInt =
  ## Compute (P+1)/2, assumes P is odd
  ## For use in constant-time modular inversion
  checkOddModulus(P)
  # (P+1)/2 = P/2 + 1 if P is odd,
  # this avoids overflowing if the prime uses all bits
  # i.e. in the form (2^64)^w - 1 or (2^32)^w - 1
  result = P
  result.shiftRight(1)
  let carry = result.add(1)
  doAssert not carry
--- a/constantine/config/curves.nim
+++ b/constantine/config/curves.nim
@ -13,6 +13,7 @@ import
  ./curves_parser, ./common,
  ../arithmetic/[precomputed, bigints]
 {.push used.}
 # ############################################################
 #
@ -40,27 +41,23 @@ import
 #   which returns the field modulus of the curve
 when not defined(testingCurves):
  declareCurves:
-    # Barreto-Naehrig curve, pairing-friendly, Prime 254 bit, ~100-bit security
+    curve P224: # NIST P-224
-    # https://eprint.iacr.org/2013/879.pdf
+      bitsize: 224
-    # Usage: Zero-Knowledge Proofs / zkSNARKs in ZCash and Ethereum 1
+      modulus: "0xffffffff_ffffffff_ffffffff_ffffffff_00000000_00000000_00000001"
    #        https://eips.ethereum.org/EIPS/eip-196
    curve BN254:
      bitsize: 254
      modulus: "0x30644e72e131a029b85045b68181585d97816a916871ca8d3c208c16d87cfd47"
      # Equation: Y^2 = X^3 + 3
    curve BLS12_381:
      bitsize: 381
      modulus: "0x1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffffffffaaab"
      # Equation: y^2 = x^3 + 4
    curve P224: # NIST P-224
      bitsize: 224
      modulus: "0xffffffff_ffffffff_ffffffff_ffffffff_00000000_00000000_00000001"
    curve P256: # secp256r1 / NIST P-256
      bitsize: 256
      modulus: "0xffffffff00000001000000000000000000000000ffffffffffffffffffffffff"
    curve Secp256k1: # Bitcoin curve
      bitsize: 256
      modulus: "0xFFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFE FFFFFC2F"
    curve BLS12_381:
      bitsize: 381
      modulus: "0x1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffffffffaaab"
      # Equation: y^2 = x^3 + 4
 else:
  # Fake curve for testing field arithmetic
  declareCurves:
@ -76,12 +73,75 @@ else:
    curve P224: # NIST P-224
      bitsize: 224
      modulus: "0xffffffff_ffffffff_ffffffff_ffffffff_00000000_00000000_00000001"
    curve BN254: # Zero-Knowledge proofs curve (SNARKS, STARKS)
      bitsize: 254
      modulus: "0x30644e72e131a029b85045b68181585d97816a916871ca8d3c208c16d87cfd47"
      # Equation: Y^2 = X^3 + 3
    curve P256: # secp256r1 / NIST P-256
      bitsize: 256
      modulus: "0xffffffff00000001000000000000000000000000ffffffffffffffffffffffff"
    curve Secp256k1: # Bitcoin curve
      bitsize: 256
      modulus: "0xFFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFE FFFFFC2F"
    curve BLS12_381:
      bitsize: 381
      modulus: "0x1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffffffffaaab"
      # Equation: y^2 = x^3 + 4
 # ############################################################
 #
 #                   Curve Families
 #
 # ############################################################
 type CurveFamily = enum
  None
  BN   # Barreto-Naehrig
  BLS  # Barreto-Lynn-Scott
 func family*(curve: Curve): CurveFamily =
  case curve
  of BN254:
    BN
  of BLS12_381:
    BLS
  else:
    None
 # ############################################################
 #
 #              Curve Specific Parameters
 #
 # ############################################################
 #
 # In the form CurveXXX_ParameterName where CurveXXX is the curve name + number of bits
 # of the field modulus
 # BN Curves
 # ------------------------------------------------------------
 # See https://tools.ietf.org/id/draft-yonezawa-pairing-friendly-curves-00.html
 #
 # The prime p and order r are primes and of the form
 # p = 36u^4 + 36u^3 + 24u^2 + 6u + 1
 # r = 36u^4 + 36u^3 + 18u^2 + 6u + 1
 #
 # https://eprint.iacr.org/2010/429.pdf
 # https://eprint.iacr.org/2013/879.pdf
 # Usage: Zero-Knowledge Proofs / zkSNARKs in ZCash and Ethereum 1
 #        https://eips.ethereum.org/EIPS/eip-196
 # BLS Curves
 # ------------------------------------------------------------
 # See https://tools.ietf.org/id/draft-yonezawa-pairing-friendly-curves-00.html
 #
 # BLS12 curves
 #   The prime p and order r are primes and of the form
 #   p = (u - 1)^2 (u^4 - u^2 + 1)/3 + u
 #   r = u^4 - u^2 + 1
 #
 # BLS48 curves
 #   The prime p and order r are primes and of the form
 #   p = (u - 1)^2 (u^16 - u^8 + 1)/3 + u
 #   r = u^16 - u^8 + 1
 # ############################################################
 #
@ -147,6 +207,7 @@ macro genMontyMagics(T: typed): untyped =
        )
      )
    )
    # const MyCurve_NegInvModWord = negInvModWord(MyCurve_Modulus)
    result.add newConstStmt(
      ident($curve & "_NegInvModWord"), newCall(
@ -177,6 +238,16 @@ macro genMontyMagics(T: typed): untyped =
        )
      )
    )
    # const MyCurve_PrimePlus1div2 = primePlus1div2(MyCurve_Modulus)
    result.add newConstStmt(
      ident($curve & "_PrimePlus1div2"), newCall(
        bindSym"primePlus1div2",
        nnkDotExpr.newTree(
          bindSym($curve & "_Modulus"),
          ident"mres"
        )
      )
    )
  # echo result.toStrLit
@ -208,6 +279,10 @@ macro getInvModExponent*(C: static Curve): untyped =
  ## Get modular inversion exponent (Modulus-2 in canonical representation)
  result = bindSym($C & "_InvModExponent")
 macro getPrimePlus1div2*(C: static Curve): untyped =
  ## Get (P+1) / 2 for an odd prime
  result = bindSym($C & "_PrimePlus1div2")
 # ############################################################
 #
 #                Debug info printed at compile-time
@ -234,5 +309,5 @@ macro debugConsts(): untyped =
  result.add quote do:
    echo "----------------------------------------------------------------------------"
-debug:
+# debug:
-  debugConsts()
+#   debugConsts()
--- a/constantine/tower_field_extensions/abelian_groups.nim
+++ b/constantine/tower_field_extensions/abelian_groups.nim
@ -7,7 +7,7 @@
 # at your option. This file may not be copied, modified, or distributed except according to those terms.
 import
-  ../arithmetic/finite_fields,
+  ../arithmetic,
  ../config/common,
  ../primitives
--- a/constantine/tower_field_extensions/fp2_complex.nim
+++ b/constantine/tower_field_extensions/fp2_complex.nim
@ -44,7 +44,7 @@
 # TODO: Clarify some assumptions about the prime p ≡ 3 (mod 4)
 import
-  ../arithmetic/finite_fields,
+  ../arithmetic,
  ../config/curves,
  ./abelian_groups
@ -151,7 +151,7 @@ func inv*(r: var Fp2, a: Fp2) =
  t0 += t1             # t0 = a0² + a1² (the norm / squared magnitude of a)
  # [1 Inv, 2 Sqr, 1 Add]
-  inv(t0)              # t0 = 1 / (a0² + a1²)
+  t0.inv(t0)           # t0 = 1 / (a0² + a1²)
  # [1 Inv, 2 Mul, 2 Sqr, 1 Add, 1 Neg]
  r.c0.prod(a.c0, t0)  # r0 = a0 / (a0² + a1²)
--- a/constantine/tower_field_extensions/fp2_sqrt_minus2.nim
+++ b/constantine/tower_field_extensions/fp2_sqrt_minus2.nim
@ -33,7 +33,7 @@
 import
-  ../arithmetic/finite_fields,
+  ../arithmetic,
  ../config/curves,
  ./abelian_groups
--- a/constantine/tower_field_extensions/fp2_sqrt_minus5.nim
+++ b/constantine/tower_field_extensions/fp2_sqrt_minus5.nim
@ -32,7 +32,7 @@
 #     https://eprint.iacr.org/2010/354
 import
-  ../arithmetic/finite_fields,
+  ../arithmetic,
  ../config/curves,
  ./abelian_groups
--- a/formal_verification/bls12_381_q_64.nim
+++ b/formal_verification/bls12_381_q_64.nim
@ -128,7 +128,7 @@ func fromHex(output: var openArray[byte], hexStr: string, order: static[Endianne
 # -------------------------------------------------------------------------
 when isMainModule:
-  import random, std/monotimes, times, strformat, ./timers
+  import random, std/monotimes, times, strformat, ../helpers/timers
  const Iters = 1_000_000
  const InvIters = 1000
--- a/formal_verification/timers.nim
+++ b/formal_verification/timers.nim
@ -1,49 +0,0 @@
 when defined(i386) or defined(amd64):
  # From Linux
  #
  # The RDTSC instruction is not ordered relative to memory
  # access.  The Intel SDM and the AMD APM are both vague on this
  # point, but empirically an RDTSC instruction can be
  # speculatively executed before prior loads.  An RDTSC
  # immediately after an appropriate barrier appears to be
  # ordered as a normal load, that is, it provides the same
  # ordering guarantees as reading from a global memory location
  # that some other imaginary CPU is updating continuously with a
  # time stamp.
  #
  # From Intel SDM
  # https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/ia-32-ia-64-benchmark-code-execution-paper.pdf
  when not defined(vcc):
    when defined(amd64):
      proc getTicks*(): int64 {.inline.} =
        var lo, hi: int64
        # TODO: Provide a compile-time flag for RDTSCP support
        #       and use it instead of lfence + RDTSC
        {.emit: """asm volatile(
          "lfence\n"
          "rdtsc\n"
          : "=a"(`lo`), "=d"(`hi`)
          :
          : "memory"
        );""".}
        return (hi shl 32) or lo
    else:
      proc getTicks*(): int64 {.inline.} =
        # TODO: Provide a compile-time flag for RDTSCP support
        #       and use it instead of lfence + RDTSC
        {.emit: """asm volatile(
          "lfence\n"
          "rdtsc\n"
          : "=a"(`result`)
          :
          : "memory"
        );""".}
  else:
    proc rdtsc(): int64 {.sideeffect, importc: "__rdtsc", header: "<intrin.h>".}
    proc lfence() {.importc: "__mm_lfence", header: "<intrin.h>".}
    proc getTicks*(): int64 {.inline.} =
      lfence()
      return rdtsc()
 else:
  {.error: "getticks is not supported on this CPU architecture".}
--- a/helpers/README.md
+++ b/helpers/README.md
@ -0,0 +1,3 @@
 # Helpers, utilities, tools, miscellaneous
 This folder holds helper functions that are not part of Constantine but facilitates testing and benchmarking.
--- a/helpers/prng.nim
+++ b/helpers/prng.nim
@ -98,5 +98,5 @@ func random[T](rng: var RngState, a: var T, C: static Curve) {.noInit.}=
      rng.random(field, C)
 func random*(rng: var RngState, T: typedesc): T =
-  ## Create a random Field or Extension FIeld Element
+  ## Create a random Field or Extension Field Element
  rng.random(result, T.C)
--- a/helpers/static_for.nim
+++ b/helpers/static_for.nim
@ -0,0 +1,28 @@
 import macros
 proc replaceNodes(ast: NimNode, what: NimNode, by: NimNode): NimNode =
  # Replace "what" ident node by "by"
  proc inspect(node: NimNode): NimNode =
    case node.kind:
    of {nnkIdent, nnkSym}:
      if node.eqIdent(what):
        return by
      return node
    of nnkEmpty:
      return node
    of nnkLiterals:
      return node
    else:
      var rTree = node.kind.newTree()
      for child in node:
        rTree.add inspect(child)
      return rTree
  result = inspect(ast)
 macro staticFor*(idx: untyped{nkIdent}, start, stopEx: static int, body: untyped): untyped =
  result = newStmtList()
  for i in start ..< stopEx:
    result.add nnkBlockStmt.newTree(
      ident("unrolledIter_" & $idx & $i),
      body.replaceNodes(idx, newLit i)
    )
--- a/helpers/timers.nim
+++ b/helpers/timers.nim
@ -0,0 +1,14 @@
 when defined(i386) or defined(amd64):
  import x86
  export getTicks
 # This doesn't always work unfortunately ...
 proc volatilize(x: ptr byte) {.codegenDecl: "$# $#(char const volatile *x)", inline.} =
  discard
 template preventOptimAway*[T](x: var T) =
  volatilize(cast[ptr byte](unsafeAddr x))
 template preventOptimAway*[T](x: T) =
  volatilize(cast[ptr byte](x))
--- a/helpers/x86.nim
+++ b/helpers/x86.nim
@ -0,0 +1,74 @@
 # Cpu Name
 # -------------------------------------------------------
 {.passC:"-std=gnu99".} # TODO may conflict with milagro "-std=c99"
 proc cpuID(eaxi, ecxi: int32): tuple[eax, ebx, ecx, edx: int32] =
  when defined(vcc):
    proc cpuidVcc(cpuInfo: ptr int32; functionID: int32)
      {.importc: "__cpuidex", header: "intrin.h".}
    cpuidVcc(addr result.eax, eaxi, ecxi)
  else:
    var (eaxr, ebxr, ecxr, edxr) = (0'i32, 0'i32, 0'i32, 0'i32)
    asm """
      cpuid
      :"=a"(`eaxr`), "=b"(`ebxr`), "=c"(`ecxr`), "=d"(`edxr`)
      :"a"(`eaxi`), "c"(`ecxi`)"""
    (eaxr, ebxr, ecxr, edxr)
 proc cpuName*(): string =
  var leaves {.global.} = cast[array[48, char]]([
    cpuID(eaxi = 0x80000002'i32, ecxi = 0),
    cpuID(eaxi = 0x80000003'i32, ecxi = 0),
    cpuID(eaxi = 0x80000004'i32, ecxi = 0)])
  result = $cast[cstring](addr leaves[0])
 # Counting cycles
 # -------------------------------------------------------
 # From Linux
 #
 # The RDTSC instruction is not ordered relative to memory
 # access.  The Intel SDM and the AMD APM are both vague on this
 # point, but empirically an RDTSC instruction can be
 # speculatively executed before prior loads.  An RDTSC
 # immediately after an appropriate barrier appears to be
 # ordered as a normal load, that is, it provides the same
 # ordering guarantees as reading from a global memory location
 # that some other imaginary CPU is updating continuously with a
 # time stamp.
 #
 # From Intel SDM
 # https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/ia-32-ia-64-benchmark-code-execution-paper.pdf
 proc getTicks*(): int64 {.inline.} =
  when defined(vcc):
    proc rdtsc(): int64 {.sideeffect, importc: "__rdtsc", header: "<intrin.h>".}
    proc lfence() {.importc: "__mm_lfence", header: "<intrin.h>".}
    lfence()
    return rdtsc()
  else:
    when defined(amd64):
      var lo, hi: int64
      # TODO: Provide a compile-time flag for RDTSCP support
      #       and use it instead of lfence + RDTSC
      {.emit: """asm volatile(
        "lfence\n"
        "rdtsc\n"
        : "=a"(`lo`), "=d"(`hi`)
        :
        : "memory"
      );""".}
      return (hi shl 32) or lo
    else: # 32-bit x86
      # TODO: Provide a compile-time flag for RDTSCP support
      #       and use it instead of lfence + RDTSC
      {.emit: """asm volatile(
        "lfence\n"
        "rdtsc\n"
        : "=a"(`result`)
        :
        : "memory"
      );""".}
--- a/tests/test_bigints.nim
+++ b/tests/test_bigints.nim
@ -8,11 +8,11 @@
 import  unittest,
        ../constantine/io/io_bigints,
-        ../constantine/arithmetic/bigints,
+        ../constantine/arithmetic,
        ../constantine/config/common,
        ../constantine/primitives
-proc main() =
+proc mainArith() =
  suite "isZero":
    test "isZero for zero":
      var x: BigInt[128]
@ -128,6 +128,16 @@ proc main() =
          bool(a == c)
          not bool(carry)
  suite "BigInt + Word":
    test "Addition limbs carry":
      block: # P256 / 2
        var a = BigInt[256].fromhex"0x7fffffff800000008000000000000000000000007fffffffffffffffffffffff"
        let expected = BigInt[256].fromHex"7fffffff80000000800000000000000000000000800000000000000000000000"
        discard a.add(Word 1)
        check: bool(a == expected)
  suite "Modular operations - small modulus":
    # Vectors taken from Stint - https://github.com/status-im/nim-stint
    test "100 mod 13":
@ -205,4 +215,249 @@ proc main() =
      check:
        bool(r == BigInt[40].fromUint(u mod v))
-main()
+proc mainNeg() =
  suite "Conditional negation":
    test "Conditional negation":
      block:
        var a = fromHex(BigInt[128], "0x12345678_FF11FFAA_00321321_CAFECAFE")
        var b = fromHex(BigInt[128], "0xDEADBEEF_DEADBEEF_DEADBEEF_DEADBEEF")
        let a2 = a
        let b2 = b
        a.cneg(CtTrue)
        b.cneg(CtTrue)
        discard a.add(a2)
        discard b.add(b2)
        check:
          bool(a.isZero)
          bool(b.isZero)
      block:
        var a = fromHex(BigInt[128], "0x12345678_FF11FFAA_00321321_CAFECAFE")
        var b = fromHex(BigInt[128], "0xDEADBEEF_DEADBEEF_DEADBEEF_DEADBEEF")
        let a2 = a
        let b2 = b
        a.cneg(CtFalse)
        b.cneg(CtFalse)
        check:
          bool(a == a2)
          bool(b == b2)
    test "Conditional negation with carries":
      block:
        var a = fromHex(BigInt[128], "0x12345678_FF11FFAA_00321321_FFFFFFFF")
        var b = fromHex(BigInt[128], "0xFFFFFFFF_FFFFFFFF_00000000_00000000")
        let a2 = a
        let b2 = b
        a.cneg(CtTrue)
        b.cneg(CtTrue)
        discard a.add(a2)
        discard b.add(b2)
        check:
          bool(a.isZero)
          bool(b.isZero)
      block:
        var a = fromHex(BigInt[128], "0x12345678_00000000_00321321_FFFFFFFF")
        var b = fromHex(BigInt[128], "0xFFFFFFFF_FFFFFFFF_00000000_00000000")
        let a2 = a
        let b2 = b
        a.cneg(CtFalse)
        b.cneg(CtFalse)
        check:
          bool(a == a2)
          bool(b == b2)
    test "Conditional all-zero bit or all-one bit":
      block:
        var a = fromHex(BigInt[128], "0x00000000_00000000_00000000_00000000")
        var b = fromHex(BigInt[128], "0xFFFFFFFF_FFFFFFFF_FFFFFFFF_FFFFFFFF")
        let a2 = a
        let b2 = b
        a.cneg(CtTrue)
        b.cneg(CtTrue)
        discard a.add(a2)
        discard b.add(b2)
        check:
          bool(a.isZero)
          bool(b.isZero)
      block:
        var a = fromHex(BigInt[128], "0x00000000_00000000_00000000_00000000")
        var b = fromHex(BigInt[128], "0xFFFFFFFF_FFFFFFFF_FFFFFFFF_FFFFFFFF")
        let a2 = a
        let b2 = b
        a.cneg(CtFalse)
        b.cneg(CtFalse)
        check:
          bool(a == a2)
          bool(b == b2)
 proc mainCopySwap() =
  suite "Copy and Swap":
    test "Conditional copy":
      block:
        var a = fromHex(BigInt[128], "0x12345678_FF11FFAA_00321321_CAFECAFE")
        let b = fromHex(BigInt[128], "0xDEADBEEF_DEADBEEF_DEADBEEF_DEADBEEF")
        var expected = a
        a.ccopy(b, CtFalse)
        check: bool(expected == a)
      block:
        var a = fromHex(BigInt[128], "0x00000000_FFFFFFFF_FFFFFFFF_FFFFFFFF")
        let b = fromHex(BigInt[128], "0x00000000_00000000_00000000_00000001")
        var expected = b
        a.ccopy(b, CtTrue)
        check: bool(expected == b)
    test "Conditional swap":
      block:
        var a = fromHex(BigInt[128], "0x12345678_FF11FFAA_00321321_CAFECAFE")
        var b = fromHex(BigInt[128], "0xDEADBEEF_DEADBEEF_DEADBEEF_DEADBEEF")
        let eA = a
        let eB = b
        a.cswap(b, CtFalse)
        check:
          bool(eA == a)
          bool(eB == b)
      block:
        var a = fromHex(BigInt[128], "0x00000000_FFFFFFFF_FFFFFFFF_FFFFFFFF")
        var b = fromHex(BigInt[128], "0x00000000_00000000_00000000_00000001")
        let eA = b
        let eB = a
        a.cswap(b, CtTrue)
        check:
          bool(eA == a)
          bool(eB == b)
 proc mainModularInverse() =
  suite "Modular Inverse (with odd modulus)":
    # Note: We don't define multi-precision multiplication
    #       because who needs it when you have Montgomery?
    #       ¯\_(ツ)_/¯
    test "42^-1 (mod 2017) = 1969":
      block: # small int
        let a = BigInt[16].fromUint(42'u16)
        let M = BigInt[16].fromUint(2017'u16)
        var mp1div2 = M
        discard mp1div2.add(Word 1)
        mp1div2.shiftRight(1)
        let expected = BigInt[16].fromUint(1969'u16)
        var r {.noInit.}: BigInt[16]
        r.invmod(a, M, mp1div2)
        check: bool(r == expected)
      block: # huge int
        let a = BigInt[381].fromUint(42'u16)
        let M = BigInt[381].fromUint(2017'u16)
        var mp1div2 = M
        discard mp1div2.add(Word 1)
        mp1div2.shiftRight(1)
        let expected = BigInt[381].fromUint(1969'u16)
        var r {.noInit.}: BigInt[381]
        r.invmod(a, M, mp1div2)
        check: bool(r == expected)
    test "271^-1 (mod 383) = 106":
      block: # small int
        let a = BigInt[16].fromUint(271'u16)
        let M = BigInt[16].fromUint(383'u16)
        var mp1div2 = M
        discard mp1div2.add(Word 1)
        mp1div2.shiftRight(1)
        let expected = BigInt[16].fromUint(106'u16)
        var r {.noInit.}: BigInt[16]
        r.invmod(a, M, mp1div2)
        check: bool(r == expected)
      block: # huge int
        let a = BigInt[381].fromUint(271'u16)
        let M = BigInt[381].fromUint(383'u16)
        var mp1div2 = M
        discard mp1div2.add(Word 1)
        mp1div2.shiftRight(1)
        let expected = BigInt[381].fromUint(106'u16)
        var r {.noInit.}: BigInt[381]
        r.invmod(a, M, mp1div2)
        check: bool(r == expected)
    test "BN254_Modulus^-1 (mod BLS12_381)":
      let a = BigInt[381].fromHex("0x30644e72e131a029b85045b68181585d97816a916871ca8d3c208c16d87cfd47")
      let M = BigInt[381].fromHex("0x1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffffffffaaab")
      var mp1div2 = M
      discard mp1div2.add(Word 1)
      mp1div2.shiftRight(1)
      let expected = BigInt[381].fromHex("0x0636759a0f3034fa47174b2c0334902f11e9915b7bd89c6a2b3082b109abbc9837da17201f6d8286fe6203caa1b9d4c8")
      var r {.noInit.}: BigInt[381]
      r.invmod(a, M, mp1div2)
      check: bool(r == expected)
    test "0^-1 (mod 0) = 0 (need for tower of extension fields)":
      let a = BigInt[16].fromUint(0'u16)
      let M = BigInt[16].fromUint(2017'u16)
      var mp1div2 = M
      mp1div2.shiftRight(1)
      discard mp1div2.add(Word 1)
      let expected = BigInt[16].fromUint(0'u16)
      var r {.noInit.}: BigInt[16]
      r.invmod(a, M, mp1div2)
      check: bool(r == expected)
 mainArith()
 mainNeg()
 mainCopySwap()
 mainModularInverse()
--- a/tests/test_bigints_multimod.nim
+++ b/tests/test_bigints_multimod.nim
@ -11,7 +11,7 @@ import
  unittest,
  # Third-party
  ../constantine/io/io_bigints,
-  ../constantine/arithmetic/bigints,
+  ../constantine/arithmetic,
  ../constantine/primitives
 proc main() =
--- a/tests/test_bigints_vs_gmp.nim
+++ b/tests/test_bigints_vs_gmp.nim
@ -13,8 +13,8 @@ import
  gmp, stew/byteutils,
  # Internal
  ../constantine/io/io_bigints,
-  ../constantine/arithmetic/bigints,
+  ../constantine/arithmetic,
-  ../constantine/primitives/constant_time
+  ../constantine/primitives
 # We test up to 1024-bit, more is really slow
--- a/tests/test_finite_fields.nim
+++ b/tests/test_finite_fields.nim
@ -7,7 +7,7 @@
 # at your option. This file may not be copied, modified, or distributed except according to those terms.
 import  unittest,
-        ../constantine/arithmetic/finite_fields,
+        ../constantine/arithmetic,
        ../constantine/io/io_fields,
        ../constantine/config/curves
--- a/tests/test_finite_fields_mulsquare.nim
+++ b/tests/test_finite_fields_mulsquare.nim
@ -7,11 +7,11 @@
 # at your option. This file may not be copied, modified, or distributed except according to those terms.
 import  std/unittest, std/times,
-        ../constantine/arithmetic/[bigints, finite_fields],
+        ../constantine/arithmetic,
        ../constantine/io/[io_bigints, io_fields],
        ../constantine/config/curves,
        # Test utilities
-        ./prng
+        ../helpers/prng
 const Iters = 128
@ -25,17 +25,22 @@ static: doAssert defined(testingCurves), "This modules requires the -d:testingCu
 import ../constantine/config/common
 proc sanity(C: static Curve) =
-  test "Squaring 0,1,2 with "& $C & " [FastSquaring = " & $Fake101.canUseNoCarryMontySquare & "]":
+  test "Squaring 0,1,2 with "& $Curve(C) & " [FastSquaring = " & $C.canUseNoCarryMontySquare & "]":
        block: # 0² mod
          var n: Fp[C]
          n.fromUint(0'u32)
          let expected = n
          # Out-of-place
          var r: Fp[C]
          r.square(n)
          # In-place
          n.square()
-          check: bool(r == expected)
+          check:
            bool(r == expected)
            bool(n == expected)
        block: # 1² mod
          var n: Fp[C]
@ -43,10 +48,15 @@ proc sanity(C: static Curve) =
          n.fromUint(1'u32)
          let expected = n
          # Out-of-place
          var r: Fp[C]
          r.square(n)
          # In-place
          n.square()
-          check: bool(r == expected)
+          check:
            bool(r == expected)
            bool(n == expected)
        block: # 2² mod
          var n, expected: Fp[C]
@ -54,10 +64,15 @@ proc sanity(C: static Curve) =
          n.fromUint(2'u32)
          expected.fromUint(4'u32)
          # Out-of-place
          var r: Fp[C]
          r.square(n)
          # In-place
          n.square()
-          check: bool(r == expected)
+          check:
            bool(r == expected)
            bool(n == expected)
 proc mainSanity() =
  suite "Modular squaring is consistent with multiplication on special elements":
--- a/tests/test_finite_fields_powinv.nim
+++ b/tests/test_finite_fields_powinv.nim
@ -7,7 +7,7 @@
 # at your option. This file may not be copied, modified, or distributed except according to those terms.
 import  unittest,
-        ../constantine/arithmetic/[bigints, finite_fields],
+        ../constantine/arithmetic,
        ../constantine/io/[io_bigints, io_fields],
        ../constantine/config/curves
@ -144,14 +144,14 @@ proc main() =
  suite "Modular inversion over prime fields":
    test "x^(-1) mod p":
-        var x: Fp[BLS12_381]
+        var r, x: Fp[BLS12_381]
        # BN254 field modulus
        x.fromHex("0x30644e72e131a029b85045b68181585d97816a916871ca8d3c208c16d87cfd47")
        let expected = "0x0636759a0f3034fa47174b2c0334902f11e9915b7bd89c6a2b3082b109abbc9837da17201f6d8286fe6203caa1b9d4c8"
-        x.inv()
+        r.inv(x)
-        let computed = x.toHex()
+        let computed = r.toHex()
        check:
          computed == expected
--- a/tests/test_finite_fields_vs_gmp.nim
+++ b/tests/test_finite_fields_vs_gmp.nim
@ -13,18 +13,21 @@ import
  gmp, stew/byteutils,
  # Internal
  ../constantine/io/[io_bigints, io_fields],
-  ../constantine/arithmetic/[finite_fields, bigints],
+  ../constantine/arithmetic,
-  ../constantine/primitives/constant_time,
+  ../constantine/primitives,
  ../constantine/config/curves
 # We test up to 1024-bit, more is really slow
 var RNG {.compileTime.} = initRand(1234)
 const CurveParams = [
  P224: (224, "0xffffffffffffffffffffffffffffffff000000000000000000000001"),
  BN254: (254, "0x30644e72e131a029b85045b68181585d97816a916871ca8d3c208c16d87cfd47"),
  P256: (256, "0xffffffff00000001000000000000000000000000ffffffffffffffffffffffff"),
  Secp256k1: (256, "0xFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFEFFFFFC2F"),
  BLS12_381: (381, "0x1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffffffffaaab")
 ]
 const AvailableCurves = [P224, BN254, P256, Secp256k1, BLS12_381]
 const # https://gmplib.org/manual/Integer-Import-and-Export.html
  GMP_WordLittleEndian = -1'i32
  GMP_WordNativeEndian = 0'i32
@ -53,7 +56,7 @@ proc binary_prologue[C: static Curve, N: static int](
  mpz_urandomb(b, gmpRng, uint bits)
  # Set modulus to curve modulus
  let err = mpz_set_str(p, CurveParams[C][1], 0)
-  doAssert err == 0
+  doAssert err == 0, "Error on prime for curve " & $Curve(C)
  #########################################################
  # Conversion buffers
@ -189,8 +192,8 @@ proc invTests(gmpRng: var gmp_randstate_t, a, b, p, r: var mpz_t, C: static Curv
  let exist = mpz_invert(r, a, p)
  doAssert exist != 0
-  var rTest = aTest
+  var rTest {.noInit.}: Fp[C]
-  rTest.inv()
+  rTest.inv(aTest)
  binary_epilogue(r, a, b, rTest, aBuf, bBuf, "Inversion (b is unused)")
@ -206,7 +209,7 @@ macro randomTests(numTests: static int, curveSym, body: untyped): untyped =
  result = newStmtList()
  for _ in 0 ..< numTests:
-    let curve = RNG.rand([BN254, BLS12_381])
+    let curve = RNG.rand(AvailableCurves)
    result.add quote do:
      block:
--- a/tests/test_fp2.nim
+++ b/tests/test_fp2.nim
@ -12,9 +12,9 @@ import
  # Internals
  ../constantine/tower_field_extensions/[abelian_groups, fp2_complex],
  ../constantine/config/[common, curves],
-  ../constantine/arithmetic/bigints,
+  ../constantine/arithmetic,
  # Test utilities
-  ./prng
+  ../helpers/prng
 const Iters = 128
--- a/tests/test_io_bigints.nim
+++ b/tests/test_io_bigints.nim
@ -9,7 +9,7 @@
 import  unittest, random,
        ../constantine/io/io_bigints,
        ../constantine/config/common,
-        ../constantine/arithmetic/bigints
+        ../constantine/arithmetic
 randomize(0xDEADBEEF) # Random seed for reproducibility
 type T = BaseType
--- a/tests/test_io_fields.nim
+++ b/tests/test_io_fields.nim
@ -10,7 +10,7 @@ import  unittest, random,
        ../constantine/io/[io_bigints, io_fields],
        ../constantine/config/curves,
        ../constantine/config/common,
-        ../constantine/arithmetic/[bigints, finite_fields]
+        ../constantine/arithmetic
 randomize(0xDEADBEEF) # Random seed for reproducibility
 type T = BaseType
		`@ -0,0 +1,3 @@`
							`# Helpers, utilities, tools, miscellaneous`

							`This folder holds helper functions that are not part of Constantine but facilitates testing and benchmarking.`