diff --git a/.travis.yml b/.travis.yml index 5612fe8..3b9ea56 100644 --- a/.travis.yml +++ b/.travis.yml @@ -13,6 +13,7 @@ matrix: # Build and test on both x86-64 and ARM64 # Ubuntu Bionic (18.04) is needed, it includes # GCC 7 codegen fixes to addcarry_u64. + # Clang 9 (and GCC-6) are needed for inline assembly "flag output constraints" - dist: bionic arch: amd64 env: @@ -33,9 +34,19 @@ matrix: - ARCH=amd64 - CHANNEL=devel compiler: clang + # addons: + # apt: + # sources: + # - ubuntu-toolchain-r-test + # - llvm-toolchain-bionic-9.0 # LLVM 9 repo is disallowed + # packages: + # - clang-9.0 + # env: + # - MATRIX_EVAL="CC=clang-9.0 && CXX=clang++-9.0" # On OSX we only test against clang (gcc is mapped to clang by default) - os: osx + osx_image: xcode11.5 # Need xcode 11.4.2 min for Clang 9 arch: amd64 env: - ARCH=amd64 @@ -98,9 +109,17 @@ before_script: script: - nimble refresh - nimble install gmp stew - - nimble test_parallel - - if [[ "$ARCH" != "arm64" ]]; then - nimble test_parallel_no_assembler; + # Installing Clang9.0 or later is a pain in Travis + # for inline assembly "flag output constraint" + # Also MacOS build is timing out with 2 series of tests. + - | + if [[ "$TRAVIS_COMPILER" == "clang" ]]; then + nimble test_parallel_no_assembler + else + nimble test_parallel + if [[ "$ARCH" != "arm64" ]]; then + nimble test_parallel_no_assembler + fi fi branches: except: diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 8817707..201f251 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -36,7 +36,7 @@ strategy: # TEST_LANG: c MacOS_devel_64bit: - VM: 'macOS-10.14' + VM: 'macOS-10.15' UCPU: amd64 CHANNEL: devel TEST_LANG: c diff --git a/benchmarks/bench_elliptic_template.nim b/benchmarks/bench_elliptic_template.nim index fb746ef..f66c281 100644 --- a/benchmarks/bench_elliptic_template.nim +++ b/benchmarks/bench_elliptic_template.nim @@ -61,7 +61,7 @@ echo "Optimization level => " echo " no optimization: ", not defined(release) echo " release: ", defined(release) echo " danger: ", defined(danger) -echo " inline assembly: ", UseX86ASM +echo " inline assembly: ", UseASM_X86_64 when (sizeof(int) == 4) or defined(Constantine32): echo "⚠️ Warning: using Constantine with 32-bit limbs" diff --git a/benchmarks/bench_fields_template.nim b/benchmarks/bench_fields_template.nim index e17bc4a..afbd5be 100644 --- a/benchmarks/bench_fields_template.nim +++ b/benchmarks/bench_fields_template.nim @@ -58,7 +58,7 @@ echo "Optimization level => " echo " no optimization: ", not defined(release) echo " release: ", defined(release) echo " danger: ", defined(danger) -echo " inline assembly: ", UseX86ASM +echo " inline assembly: ", UseASM_X86_64 when (sizeof(int) == 4) or defined(Constantine32): echo "⚠️ Warning: using Constantine with 32-bit limbs" diff --git a/benchmarks/bench_fp_double_width.nim b/benchmarks/bench_fp_double_width.nim new file mode 100644 index 0000000..bbb958a --- /dev/null +++ b/benchmarks/bench_fp_double_width.nim @@ -0,0 +1,196 @@ +# Constantine +# Copyright (c) 2018-2019 Status Research & Development GmbH +# Copyright (c) 2020-Present Mamy André-Ratsimbazafy +# Licensed and distributed under either of +# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT). +# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0). +# at your option. This file may not be copied, modified, or distributed except according to those terms. 
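
For context on the "flag output constraints" mentioned in the Travis comment above: GCC 6+ and Clang 9+ can bind an x86 status flag directly to an output operand of an extended-asm block, which is what the generated inline assembly in this patch relies on. A minimal sketch of the compiler feature follows - it is not part of the patch, assumes x86-64 with a GCC-compatible backend, and Constantine's real assembly is produced by its own assembler DSL, not hand-written like this:

when defined(amd64) and (defined(gcc) or defined(clang)):
  proc addCarryDemo(a, b: uint64): tuple[sum: uint64, carry: bool] =
    ## Adds `b` to `a` and exposes the x86 carry flag directly
    ## through the "=@ccc" flag output constraint (GCC 6+, Clang 9+).
    var s = a
    var c: bool
    {.emit: """
    __asm__("addq %[b], %[s]"
            : [s] "+r" (`s`), [c] "=@ccc" (`c`)
            : [b] "r" (`b`));
    """.}
    (s, c)

Because installing Clang 9 on Travis is impractical (see the script change above), the clang jobs there fall back to the no-assembler test suite instead of exercising these kernels.
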
+ +# ############################################################ +# +# Benchmark of finite fields +# +# ############################################################ + +import + # Internals + ../constantine/config/[curves, common], + ../constantine/arithmetic, + ../constantine/towers, + # Helpers + ../helpers/[prng_unsafe, static_for], + ./platforms, + # Standard library + std/[monotimes, times, strformat, strutils, macros] + +var rng: RngState +let seed = uint32(getTime().toUnix() and (1'i64 shl 32 - 1)) # unixTime mod 2^32 +rng.seed(seed) +echo "bench xoshiro512** seed: ", seed + +# warmup +proc warmup*() = + # Warmup - make sure cpu is on max perf + let start = cpuTime() + var foo = 123 + for i in 0 ..< 300_000_000: + foo += i*i mod 456 + foo = foo mod 789 + + # Compiler shouldn't optimize away the results as cpuTime rely on sideeffects + let stop = cpuTime() + echo &"Warmup: {stop - start:>4.4f} s, result {foo} (displayed to avoid compiler optimizing warmup away)\n" + +warmup() + +when defined(gcc): + echo "\nCompiled with GCC" +elif defined(clang): + echo "\nCompiled with Clang" +elif defined(vcc): + echo "\nCompiled with MSVC" +elif defined(icc): + echo "\nCompiled with ICC" +else: + echo "\nCompiled with an unknown compiler" + +echo "Optimization level => " +echo " no optimization: ", not defined(release) +echo " release: ", defined(release) +echo " danger: ", defined(danger) +echo " inline assembly: ", UseASM_X86_64 + +when (sizeof(int) == 4) or defined(Constantine32): + echo "⚠️ Warning: using Constantine with 32-bit limbs" +else: + echo "Using Constantine with 64-bit limbs" + +when SupportsCPUName: + echo "Running on ", cpuName(), "" + +when SupportsGetTicks: + echo "\n⚠️ Cycles measurements are approximate and use the CPU nominal clock: Turbo-Boost and overclocking will skew them." + echo "i.e. a 20% overclock will be about 20% off (assuming no dynamic frequency scaling)" + +echo "\n=================================================================================================================\n" + +proc separator*() = + echo "-".repeat(145) + +proc report(op, field: string, start, stop: MonoTime, startClk, stopClk: int64, iters: int) = + let ns = inNanoseconds((stop-start) div iters) + let throughput = 1e9 / float64(ns) + when SupportsGetTicks: + echo &"{op:<28} {field:<40} {throughput:>15.3f} ops/s {ns:>9} ns/op {(stopClk - startClk) div iters:>9} CPU cycles (approx)" + else: + echo &"{op:<28} {field:<40} {throughput:>15.3f} ops/s {ns:>9} ns/op" + +proc notes*() = + echo "Notes:" + echo " - Compilers:" + echo " Compilers are severely limited on multiprecision arithmetic." + echo " Inline Assembly is used by default (nimble bench_fp)." + echo " Bench without assembly can use \"nimble bench_fp_gcc\" or \"nimble bench_fp_clang\"." + echo " GCC is significantly slower than Clang on multiprecision arithmetic due to catastrophic handling of carries." + echo " - The simplest operations might be optimized away by the compiler." + echo " - Fast Squaring and Fast Multiplication are possible if there are spare bits in the prime representation (i.e. 
the prime uses 254 bits out of 256 bits)" + +template bench(op: string, desc: string, iters: int, body: untyped): untyped = + let start = getMonotime() + when SupportsGetTicks: + let startClk = getTicks() + for _ in 0 ..< iters: + body + when SupportsGetTicks: + let stopClk = getTicks() + let stop = getMonotime() + + when not SupportsGetTicks: + let startClk = -1'i64 + let stopClk = -1'i64 + + report(op, desc, start, stop, startClk, stopClk, iters) + +func random_unsafe(rng: var RngState, a: var FpDbl, Base: typedesc) = + ## Initialize a standalone Double-Width field element + ## we don't reduce it modulo p², this is only used for benchmark + let aHi = rng.random_unsafe(Base) + let aLo = rng.random_unsafe(Base) + for i in 0 ..< aLo.mres.limbs.len: + a.limbs2x[i] = aLo.mres.limbs[i] + for i in 0 ..< aHi.mres.limbs.len: + a.limbs2x[aLo.mres.limbs.len+i] = aHi.mres.limbs[i] + +proc sumNoReduce(T: typedesc, iters: int) = + var r: T + let a = rng.random_unsafe(T) + let b = rng.random_unsafe(T) + bench("Addition no reduce", $T, iters): + r.sumNoReduce(a, b) + +proc sum(T: typedesc, iters: int) = + var r: T + let a = rng.random_unsafe(T) + let b = rng.random_unsafe(T) + bench("Addition", $T, iters): + r.sum(a, b) + +proc diffNoReduce(T: typedesc, iters: int) = + var r: T + let a = rng.random_unsafe(T) + let b = rng.random_unsafe(T) + bench("Substraction no reduce", $T, iters): + r.diffNoReduce(a, b) + +proc diff(T: typedesc, iters: int) = + var r: T + let a = rng.random_unsafe(T) + let b = rng.random_unsafe(T) + bench("Substraction", $T, iters): + r.diff(a, b) + +proc diff2xNoReduce(T: typedesc, iters: int) = + var r, a, b: doubleWidth(T) + rng.random_unsafe(r, T) + rng.random_unsafe(a, T) + rng.random_unsafe(b, T) + bench("Substraction 2x no reduce", $doubleWidth(T), iters): + r.diffNoReduce(a, b) + +proc diff2x(T: typedesc, iters: int) = + var r, a, b: doubleWidth(T) + rng.random_unsafe(r, T) + rng.random_unsafe(a, T) + rng.random_unsafe(b, T) + bench("Substraction 2x", $doubleWidth(T), iters): + r.diff(a, b) + +proc mul2xBench*(rLen, aLen, bLen: static int, iters: int) = + var r: BigInt[rLen] + let a = rng.random_unsafe(BigInt[aLen]) + let b = rng.random_unsafe(BigInt[bLen]) + bench("Multiplication", $rLen & " <- " & $aLen & " x " & $bLen, iters): + r.prod(a, b) + +proc reduce2x*(T: typedesc, iters: int) = + var r: T + var t: doubleWidth(T) + rng.random_unsafe(t, T) + + bench("Reduce 2x-width", $T & " <- " & $doubleWidth(T), iters): + r.reduce(t) + +proc main() = + separator() + sumNoReduce(Fp[BLS12_381], iters = 10_000_000) + diffNoReduce(Fp[BLS12_381], iters = 10_000_000) + sum(Fp[BLS12_381], iters = 10_000_000) + diff(Fp[BLS12_381], iters = 10_000_000) + diff2x(Fp[BLS12_381], iters = 10_000_000) + diff2xNoReduce(Fp[BLS12_381], iters = 10_000_000) + mul2xBench(768, 384, 384, iters = 10_000_000) + reduce2x(Fp[BLS12_381], iters = 10_000_000) + separator() + +main() +notes() diff --git a/constantine.nimble b/constantine.nimble index ae409e9..471fbbb 100644 --- a/constantine.nimble +++ b/constantine.nimble @@ -35,6 +35,8 @@ const testDesc: seq[tuple[path: string, useGMP: bool]] = @[ ("tests/t_finite_fields_vs_gmp.nim", true), # Precompute ("tests/t_precomputed", false), + # Double-width finite fields + ("tests/t_finite_fields_double_width.nim", false), # Towers of extension fields ("tests/t_fp2.nim", false), ("tests/t_fp2_sqrt.nim", false), @@ -100,13 +102,15 @@ proc test(flags, path: string, commandFile = false) = # commandFile.writeLine command exec "echo \'" & command & "\' >> " & 
buildParallel -proc runBench(benchName: string, compiler = "") = +proc runBench(benchName: string, compiler = "", useAsm = true) = if not dirExists "build": mkDir "build" var cc = "" if compiler != "": - cc = "--cc:" & compiler & " -d:ConstantineASM=false" + cc = "--cc:" & compiler + if not useAsm: + cc &= " -d:ConstantineASM=false" exec "nim c " & cc & " -d:danger --verbosity:0 -o:build/" & benchName & "_" & compiler & " -r --hints:off --warnings:off benchmarks/" & benchName & ".nim" @@ -298,6 +302,27 @@ task bench_fp_gcc, "Run benchmark 𝔽p with gcc": task bench_fp_clang, "Run benchmark 𝔽p with clang": runBench("bench_fp", "clang") +task bench_fp_gcc_noasm, "Run benchmark 𝔽p with gcc - no Assembly": + runBench("bench_fp", "gcc", useAsm = false) + +task bench_fp_clang_noasm, "Run benchmark 𝔽p with clang - no Assembly": + runBench("bench_fp", "clang", useAsm = false) + +task bench_fpdbl, "Run benchmark 𝔽pDbl with your default compiler": + runBench("bench_fp_double_width") + +task bench_fpdbl_gcc, "Run benchmark 𝔽p with gcc": + runBench("bench_fp_double_width", "gcc") + +task bench_fpdbl_clang, "Run benchmark 𝔽p with clang": + runBench("bench_fp_double_width", "clang") + +task bench_fpdbl_gcc_noasm, "Run benchmark 𝔽p with gcc - no Assembly": + runBench("bench_fp_double_width", "gcc", useAsm = false) + +task bench_fpdbl_clang_noasm, "Run benchmark 𝔽p with clang - no Assembly": + runBench("bench_fp_double_width", "clang", useAsm = false) + task bench_fp2, "Run benchmark with 𝔽p2 your default compiler": runBench("bench_fp2") @@ -307,6 +332,12 @@ task bench_fp2_gcc, "Run benchmark 𝔽p2 with gcc": task bench_fp2_clang, "Run benchmark 𝔽p2 with clang": runBench("bench_fp2", "clang") +task bench_fp2_gcc_noasm, "Run benchmark 𝔽p2 with gcc - no Assembly": + runBench("bench_fp2", "gcc", useAsm = false) + +task bench_fp2_clang_noasm, "Run benchmark 𝔽p2 with clang - no Assembly": + runBench("bench_fp2", "clang", useAsm = false) + task bench_fp6, "Run benchmark with 𝔽p6 your default compiler": runBench("bench_fp6") @@ -316,6 +347,12 @@ task bench_fp6_gcc, "Run benchmark 𝔽p6 with gcc": task bench_fp6_clang, "Run benchmark 𝔽p6 with clang": runBench("bench_fp6", "clang") +task bench_fp6_gcc_noasm, "Run benchmark 𝔽p6 with gcc - no Assembly": + runBench("bench_fp6", "gcc", useAsm = false) + +task bench_fp6_clang_noasm, "Run benchmark 𝔽p6 with clang - no Assembly": + runBench("bench_fp6", "clang", useAsm = false) + task bench_fp12, "Run benchmark with 𝔽p12 your default compiler": runBench("bench_fp12") @@ -325,6 +362,12 @@ task bench_fp12_gcc, "Run benchmark 𝔽p12 with gcc": task bench_fp12_clang, "Run benchmark 𝔽p12 with clang": runBench("bench_fp12", "clang") +task bench_fp12_gcc_noasm, "Run benchmark 𝔽p12 with gcc - no Assembly": + runBench("bench_fp12", "gcc", useAsm = false) + +task bench_fp12_clang_noasm, "Run benchmark 𝔽p12 with clang - no Assembly": + runBench("bench_fp12", "clang", useAsm = false) + task bench_ec_g1, "Run benchmark on Elliptic Curve group 𝔾1 - Short Weierstrass with Projective Coordinates - GCC": runBench("bench_ec_g1") @@ -334,6 +377,12 @@ task bench_ec_g1_gcc, "Run benchmark on Elliptic Curve group 𝔾1 - Short Weier task bench_ec_g1_clang, "Run benchmark on Elliptic Curve group 𝔾1 - Short Weierstrass with Projective Coordinates - Clang": runBench("bench_ec_g1", "clang") +task bench_ec_g1_gcc_noasm, "Run benchmark on Elliptic Curve group 𝔾1 - Short Weierstrass with Projective Coordinates - GCC no Assembly": + runBench("bench_ec_g1", "gcc", useAsm = false) + +task bench_ec_g1_clang_noasm, 
"Run benchmark on Elliptic Curve group 𝔾1 - Short Weierstrass with Projective Coordinates - Clang no Assembly": + runBench("bench_ec_g1", "clang", useAsm = false) + task bench_ec_g2, "Run benchmark on Elliptic Curve group 𝔾2 - Short Weierstrass with Projective Coordinates - GCC": runBench("bench_ec_g2") @@ -342,3 +391,9 @@ task bench_ec_g2_gcc, "Run benchmark on Elliptic Curve group 𝔾2 - Short Weier task bench_ec_g2_clang, "Run benchmark on Elliptic Curve group 𝔾2 - Short Weierstrass with Projective Coordinates - Clang": runBench("bench_ec_g2", "clang") + +task bench_ec_g2_gcc_noasm, "Run benchmark on Elliptic Curve group 𝔾2 - Short Weierstrass with Projective Coordinates - GCC no Assembly": + runBench("bench_ec_g2", "gcc", useAsm = false) + +task bench_ec_g2_clang_noasm, "Run benchmark on Elliptic Curve group 𝔾2 - Short Weierstrass with Projective Coordinates - Clang no Assembly": + runBench("bench_ec_g2", "clang", useAsm = false) diff --git a/constantine/arithmetic.nim b/constantine/arithmetic.nim index 5a85716..0a2f9fe 100644 --- a/constantine/arithmetic.nim +++ b/constantine/arithmetic.nim @@ -8,8 +8,8 @@ import arithmetic/bigints, - arithmetic/[finite_fields, finite_fields_inversion] + arithmetic/[finite_fields, finite_fields_inversion, finite_fields_double_width] export bigints, - finite_fields, finite_fields_inversion + finite_fields, finite_fields_inversion, finite_fields_double_width diff --git a/constantine/arithmetic/bigints.nim b/constantine/arithmetic/bigints.nim index 302e3a1..d4bc8da 100644 --- a/constantine/arithmetic/bigints.nim +++ b/constantine/arithmetic/bigints.nim @@ -9,13 +9,10 @@ import ../config/[common, type_bigint], ../primitives, - ./limbs_generic, + ./limbs, ./limbs_generic_modular, ./limbs_montgomery -when UseX86ASM: - import ./limbs_asm_x86 - export BigInt # ############################################################ @@ -85,10 +82,7 @@ func ccopy*(a: var BigInt, b: BigInt, ctl: SecretBool) = ## If ctl is true: b is copied into a ## if ctl is false: b is not copied and a is untouched ## Time and memory accesses are the same whether a copy occurs or not - when UseX86ASM: - ccopy_asm(a.limbs, b.limbs, ctl) - else: - ccopy(a.limbs, b.limbs, ctl) + ccopy(a.limbs, b.limbs, ctl) func cswap*(a, b: var BigInt, ctl: CTBool) = ## Swap ``a`` and ``b`` if ``ctl`` is true @@ -245,6 +239,14 @@ func prod*[rBits, aBits, bBits](r: var BigInt[rBits], a: BigInt[aBits], b: BigIn ## It will be truncated if it cannot fit in r limbs. 
r.limbs.prod(a.limbs, b.limbs) +func mul*[aBits, bBits](a: var BigInt[aBits], b: BigInt[bBits]) = + ## Multi-precision multiplication + ## a <- a*b + ## `a`, `b`, can have different sizes + var t{.noInit.}: typeof(a) + t.limbs.prod(a.limbs, b.limbs) + a = t + func prod_high_words*[rBits, aBits, bBits](r: var BigInt[rBits], a: BigInt[aBits], b: BigInt[bBits], lowestWordIndex: static int) = ## Multi-precision multiplication keeping only high words ## r <- a*b >> (2^WordBitWidth)^lowestWordIndex diff --git a/constantine/arithmetic/finite_fields.nim b/constantine/arithmetic/finite_fields.nim index ce157d8..e68a936 100644 --- a/constantine/arithmetic/finite_fields.nim +++ b/constantine/arithmetic/finite_fields.nim @@ -29,7 +29,7 @@ import ../config/[common, type_fp, curves], ./bigints, ./limbs_montgomery -when UseX86ASM: +when UseASM_X86_64: import ./limbs_asm_modular_x86 export Fp @@ -120,7 +120,7 @@ func setOne*(a: var Fp) {.inline.} = func `+=`*(a: var Fp, b: Fp) {.inline.} = ## In-place addition modulo p - when UseX86ASM and a.mres.limbs.len <= 6: # TODO: handle spilling + when UseASM_X86_64 and a.mres.limbs.len <= 6: # TODO: handle spilling addmod_asm(a.mres.limbs, b.mres.limbs, Fp.C.Mod.limbs) else: var overflowed = add(a.mres, b.mres) @@ -129,7 +129,7 @@ func `+=`*(a: var Fp, b: Fp) {.inline.} = func `-=`*(a: var Fp, b: Fp) {.inline.} = ## In-place substraction modulo p - when UseX86ASM and a.mres.limbs.len <= 6: # TODO: handle spilling + when UseASM_X86_64 and a.mres.limbs.len <= 6: # TODO: handle spilling submod_asm(a.mres.limbs, b.mres.limbs, Fp.C.Mod.limbs) else: let underflowed = sub(a.mres, b.mres) @@ -137,7 +137,7 @@ func `-=`*(a: var Fp, b: Fp) {.inline.} = func double*(a: var Fp) {.inline.} = ## Double ``a`` modulo p - when UseX86ASM and a.mres.limbs.len <= 6: # TODO: handle spilling + when UseASM_X86_64 and a.mres.limbs.len <= 6: # TODO: handle spilling addmod_asm(a.mres.limbs, a.mres.limbs, Fp.C.Mod.limbs) else: var overflowed = double(a.mres) @@ -145,9 +145,9 @@ func double*(a: var Fp) {.inline.} = discard csub(a.mres, Fp.C.Mod, overflowed) func sum*(r: var Fp, a, b: Fp) {.inline.} = - ## Sum ``a`` and ``b`` into ``r`` module p + ## Sum ``a`` and ``b`` into ``r`` modulo p ## r is initialized/overwritten - when UseX86ASM and a.mres.limbs.len <= 6: # TODO: handle spilling + when UseASM_X86_64 and a.mres.limbs.len <= 6: # TODO: handle spilling r = a addmod_asm(r.mres.limbs, b.mres.limbs, Fp.C.Mod.limbs) else: @@ -155,21 +155,42 @@ func sum*(r: var Fp, a, b: Fp) {.inline.} = overflowed = overflowed or not(r.mres < Fp.C.Mod) discard csub(r.mres, Fp.C.Mod, overflowed) +func sumNoReduce*(r: var Fp, a, b: Fp) {.inline.} = + ## Sum ``a`` and ``b`` into ``r`` without reduction + discard r.mres.sum(a.mres, b.mres) + func diff*(r: var Fp, a, b: Fp) {.inline.} = ## Substract `b` from `a` and store the result into `r`. ## `r` is initialized/overwritten - when UseX86ASM and a.mres.limbs.len <= 6: # TODO: handle spilling - var t = a # Handle aliasing r == b + ## Requires r != b + when UseASM_X86_64 and a.mres.limbs.len <= 6: # TODO: handle spilling + r = a + submod_asm(r.mres.limbs, b.mres.limbs, Fp.C.Mod.limbs) + else: + var underflowed = r.mres.diff(a.mres, b.mres) + discard cadd(r.mres, Fp.C.Mod, underflowed) + +func diffAlias*(r: var Fp, a, b: Fp) {.inline.} = + ## Substract `b` from `a` and store the result into `r`. 
+ ## `r` is initialized/overwritten + ## Handles r == b + when UseASM_X86_64 and a.mres.limbs.len <= 6: # TODO: handle spilling + var t = a submod_asm(t.mres.limbs, b.mres.limbs, Fp.C.Mod.limbs) r = t else: var underflowed = r.mres.diff(a.mres, b.mres) discard cadd(r.mres, Fp.C.Mod, underflowed) +func diffNoReduce*(r: var Fp, a, b: Fp) {.inline.} = + ## Substract `b` from `a` and store the result into `r` + ## without reduction + discard r.mres.diff(a.mres, b.mres) + func double*(r: var Fp, a: Fp) {.inline.} = ## Double ``a`` into ``r`` ## `r` is initialized/overwritten - when UseX86ASM and a.mres.limbs.len <= 6: # TODO: handle spilling + when UseASM_X86_64 and a.mres.limbs.len <= 6: # TODO: handle spilling r = a addmod_asm(r.mres.limbs, a.mres.limbs, Fp.C.Mod.limbs) else: @@ -188,7 +209,7 @@ func square*(r: var Fp, a: Fp) {.inline.} = func neg*(r: var Fp, a: Fp) {.inline.} = ## Negate modulo p - when UseX86ASM and defined(gcc): + when UseASM_X86_64 and defined(gcc): # Clang and every compiler besides GCC # can cleanly optimized this # especially on Fp2 diff --git a/constantine/arithmetic/finite_fields_double_width.nim b/constantine/arithmetic/finite_fields_double_width.nim new file mode 100644 index 0000000..523c744 --- /dev/null +++ b/constantine/arithmetic/finite_fields_double_width.nim @@ -0,0 +1,72 @@ +# Constantine +# Copyright (c) 2018-2019 Status Research & Development GmbH +# Copyright (c) 2020-Present Mamy André-Ratsimbazafy +# Licensed and distributed under either of +# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT). +# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0). +# at your option. This file may not be copied, modified, or distributed except according to those terms. 
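
A small usage sketch of the aliasing rule introduced above (not part of the patch; the repo-relative imports mirror the test suite): `diff` now assumes its destination does not alias the subtrahend, while `diffAlias` is the variant that tolerates r == b.

import ../constantine/config/curves, ../constantine/arithmetic

var a, b, r: Fp[BLS12_381]
a.setOne()
b.setOne()

r.diff(a, b)       # fine: r does not alias b
b.diffAlias(a, b)  # computing b <- a - b needs the alias-tolerant variant

The new sumNoReduce/diffNoReduce variants skip the final conditional reduction entirely and are meant for lazy-reduction code paths such as the double-width arithmetic exercised by the new benchmark.
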
+ +import + ../config/[common, curves, type_bigint, type_fp], + ../primitives, + ./bigints, + ./finite_fields, + ./limbs, + ./limbs_double_width + +when UseASM_X86_64: + import limbs_asm_modular_dbl_width_x86 + +type FpDbl*[C: static Curve] = object + ## Double-width Fp element + ## This allows saving on reductions + # We directly work with double the number of limbs + limbs2x*: matchingLimbs2x(C) + +template doubleWidth*(T: typedesc[Fp]): typedesc = + ## Return the double-width type matching with Fp + FpDbl[T.C] + +func mulNoReduce*(r: var FpDbl, a, b: Fp) {.inline.} = + ## Store the product of ``a`` by ``b`` into ``r`` + r.limbs2x.prod(a.mres.limbs, b.mres.limbs) + +func reduce*(r: var Fp, a: FpDbl) {.inline.} = + ## Reduce a double-width field element into r + const N = r.mres.limbs.len + montyRed( + r.mres.limbs, + a.limbs2x, + Fp.C.Mod.limbs, + Fp.C.getNegInvModWord(), + Fp.C.canUseNoCarryMontyMul() + ) + +func diffNoInline(r: var FpDbl, a, b: FpDbl): Borrow = + r.limbs2x.diff(a.limbs2x, b.limbs2x) + +func diffNoReduce*(r: var FpDbl, a, b: FpDbl) = + ## Double-width substraction without reduction + discard diffNoInline(r, a, b) + +func diff*(r: var FpDbl, a, b: FpDbl) = + ## Double-width modular substraction + when false: # TODO slower + r = a + sub2x_asm(r.limbs2x, b.limbs2x, FpDbl.C.Mod.limbs) + else: + var underflowed = SecretBool diffNoInline(r, a, b) + + const N = r.limbs2x.len div 2 + const M = FpDbl.C.Mod + var carry = Carry(0) + var sum: SecretWord + for i in 0 ..< N: + addC(carry, sum, r.limbs2x[i+N], M.limbs[i], carry) + underflowed.ccopy(r.limbs2x[i+N], sum) + +func `-=`*(a: var FpDbl, b: FpDbl) = + when false: # TODO slower + sub2x_asm(a.limbs2x, b.limbs2x, FpDbl.C.Mod.limbs) + else: + a.diff(a, b) diff --git a/constantine/arithmetic/limbs_generic.nim b/constantine/arithmetic/limbs.nim similarity index 84% rename from constantine/arithmetic/limbs_generic.nim rename to constantine/arithmetic/limbs.nim index ded7c5a..8f7be16 100644 --- a/constantine/arithmetic/limbs_generic.nim +++ b/constantine/arithmetic/limbs.nim @@ -8,8 +8,13 @@ import ../config/common, - ../primitives, - ../../helpers/static_for + ../primitives + +when UseASM_X86_32: + import ./limbs_asm_x86 +when UseASM_X86_64: + import ./limbs_asm_mul_x86 + import ./limbs_asm_mul_x86_adx_bmi2 # ############################################################ # @@ -37,39 +42,9 @@ import # The limb-endianess is little-endian, less significant limb is at index 0. # The word-endianness is native-endian. -type Limbs*[N: static int] = array[N, SecretWord] - ## Limbs-type - ## Should be distinct type to avoid builtins to use non-constant time - ## implementation, for example for comparison. 
- ## - ## but for unknown reason, it prevents semchecking `bits` - # No exceptions allowed {.push raises: [].} -# ############################################################ -# -# Accessors -# -# ############################################################ -# -# Commented out since we don't use a distinct type - -# template `[]`[N](v: Limbs[N], idx: int): SecretWord = -# (array[N, SecretWord])(v)[idx] -# -# template `[]`[N](v: var Limbs[N], idx: int): var SecretWord = -# (array[N, SecretWord])(v)[idx] -# -# template `[]=`[N](v: Limbs[N], idx: int, val: SecretWord) = -# (array[N, SecretWord])(v)[idx] = val - -# ############################################################ -# -# Checks and debug/test only primitives -# -# ############################################################ - # ############################################################ # # Limbs Primitives @@ -104,8 +79,11 @@ func ccopy*(a: var Limbs, b: Limbs, ctl: SecretBool) = ## If ctl is true: b is copied into a ## if ctl is false: b is not copied and a is untouched ## Time and memory accesses are the same whether a copy occurs or not - for i in 0 ..< a.len: - ctl.ccopy(a[i], b[i]) + when UseASM_X86_32: + ccopy_asm(a, b, ctl) + else: + for i in 0 ..< a.len: + ctl.ccopy(a[i], b[i]) func cswap*(a, b: var Limbs, ctl: CTBool) = ## Swap ``a`` and ``b`` if ``ctl`` is true @@ -190,9 +168,12 @@ func shiftRight*(a: var Limbs, k: int) {.inline.}= func add*(a: var Limbs, b: Limbs): Carry = ## Limbs addition ## Returns the carry - result = Carry(0) - for i in 0 ..< a.len: - addC(result, a[i], a[i], b[i], result) + when UseASM_X86_32: + result = add_asm(a, a, b) + else: + result = Carry(0) + for i in 0 ..< a.len: + addC(result, a[i], a[i], b[i], result) func add*(a: var Limbs, w: SecretWord): Carry = ## Limbs addition, add a number that fits in a word @@ -222,16 +203,22 @@ func sum*(r: var Limbs, a, b: Limbs): Carry = ## `r` is initialized/overwritten ## ## Returns the carry - result = Carry(0) - for i in 0 ..< a.len: - addC(result, r[i], a[i], b[i], result) + when UseASM_X86_32: + result = add_asm(r, a, b) + else: + result = Carry(0) + for i in 0 ..< a.len: + addC(result, r[i], a[i], b[i], result) func sub*(a: var Limbs, b: Limbs): Borrow = ## Limbs substraction ## Returns the borrow - result = Borrow(0) - for i in 0 ..< a.len: - subB(result, a[i], a[i], b[i], result) + when UseASM_X86_32: + result = sub_asm(a, a, b) + else: + result = Borrow(0) + for i in 0 ..< a.len: + subB(result, a[i], a[i], b[i], result) func sub*(a: var Limbs, w: SecretWord): Borrow = ## Limbs substraction, sub a number that fits in a word @@ -272,9 +259,12 @@ func diff*(r: var Limbs, a, b: Limbs): Borrow = ## `r` is initialized/overwritten ## ## Returns the borrow - result = Borrow(0) - for i in 0 ..< a.len: - subB(result, r[i], a[i], b[i], result) + when UseASM_X86_32: + result = sub_asm(r, a, b) + else: + result = Borrow(0) + for i in 0 ..< a.len: + subB(result, r[i], a[i], b[i], result) func cneg*(a: var Limbs, ctl: CTBool) = ## Conditional negation. 
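
Stepping back from the limb-level changes for a moment: the double-width field API added in finite_fields_double_width.nim above is meant to be used in a multiply-then-reduce-once pattern. A rough usage sketch, not part of the patch, with repo-relative imports as in the benchmarks:

import ../constantine/config/curves, ../constantine/arithmetic

var a, b, r: Fp[BLS12_381]
a.setOne()
b.setOne()

var wide: doubleWidth(Fp[BLS12_381])   # FpDbl[BLS12_381]: twice the limbs of Fp
wide.mulNoReduce(a, b)                 # full double-width (768-bit) product, no reduction
r.reduce(wide)                         # single Montgomery reduction at the end

Keeping intermediates in FpDbl lets sums of products share one reduction instead of reducing after every multiplication, which is the "saving on reductions" the FpDbl doc comment refers to.
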
@@ -301,7 +291,7 @@ func cneg*(a: var Limbs, ctl: CTBool) = # Multiplication # ------------------------------------------------------------ -func prod*[rLen, aLen, bLen](r: var Limbs[rLen], a: Limbs[aLen], b: Limbs[bLen]) = +func prod*[rLen, aLen, bLen: static int](r: var Limbs[rLen], a: Limbs[aLen], b: Limbs[bLen]) = ## Multi-precision multiplication ## r <- a*b ## @@ -309,23 +299,34 @@ func prod*[rLen, aLen, bLen](r: var Limbs[rLen], a: Limbs[aLen], b: Limbs[bLen]) ## if `r`.limbs.len < a.limbs.len + b.limbs.len ## The result will be truncated, i.e. it will be ## a * b (mod (2^WordBitwidth)^r.limbs.len) + ## + ## `r` must not alias ``a`` or ``b`` - # We use Product Scanning / Comba multiplication - var t, u, v = SecretWord(0) - var z: Limbs[rLen] # zero-init, ensure on stack and removes in-place problems + when UseASM_X86_64 and aLen <= 6: + if ({.noSideEffect.}: hasBmi2()) and ({.noSideEffect.}: hasAdx()): + mul_asm_adx_bmi2(r, a, b) + else: + mul_asm(r, a, b) + elif UseASM_X86_64: + mul_asm(r, a, b) + else: + # We use Product Scanning / Comba multiplication + var t, u, v = SecretWord(0) - staticFor i, 0, min(a.len+b.len, r.len): - const ib = min(b.len-1, i) - const ia = i - ib - staticFor j, 0, min(a.len - ia, ib+1): - mulAcc(t, u, v, a[ia+j], b[ib-j]) + staticFor i, 0, min(a.len+b.len, r.len): + const ib = min(b.len-1, i) + const ia = i - ib + staticFor j, 0, min(a.len - ia, ib+1): + mulAcc(t, u, v, a[ia+j], b[ib-j]) - z[i] = v - v = u - u = t - t = SecretWord(0) + r[i] = v + v = u + u = t + t = SecretWord(0) - r = z + if aLen+bLen < rLen: + for i in aLen+bLen ..< rLen: + r[i] = SecretWord 0 func prod_high_words*[rLen, aLen, bLen]( r: var Limbs[rLen], diff --git a/constantine/arithmetic/limbs_asm_modular_dbl_width_x86.nim b/constantine/arithmetic/limbs_asm_modular_dbl_width_x86.nim new file mode 100644 index 0000000..96ee717 --- /dev/null +++ b/constantine/arithmetic/limbs_asm_modular_dbl_width_x86.nim @@ -0,0 +1,84 @@ +# Constantine +# Copyright (c) 2018-2019 Status Research & Development GmbH +# Copyright (c) 2020-Present Mamy André-Ratsimbazafy +# Licensed and distributed under either of +# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT). +# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0). +# at your option. This file may not be copied, modified, or distributed except according to those terms. 
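
The double-width subtraction kernel that follows, like the existing addmod/submod kernels, uses the same branchless correction: turn the borrow into an all-ones/all-zeros mask, AND the modulus with it, and add the result back. In portable (non-secret-typed) Nim the idea looks roughly like this - an illustrative sketch only, not code from the patch:

func addModulusIfBorrowed(a: var array[6, uint64], M: array[6, uint64], borrowed: bool) =
  ## If `borrowed`, add M back to `a`; otherwise add 0. No branch on `borrowed`.
  let mask = 0'u64 - uint64(borrowed)   # all ones if borrowed, else all zeros
  var carry = 0'u64
  for i in 0 ..< 6:
    let m = M[i] and mask               # masked modulus limb
    let s = a[i] + m
    let c1 = uint64(s < a[i])           # carry out of a[i] + m
    let r = s + carry
    let c2 = uint64(r < s)              # carry out of adding the previous carry
    a[i] = r
    carry = c1 + c2                     # c1 and c2 cannot both be 1

The assembly version interleaves the mask construction (sbb reg, reg) and the masked adds with the main carry chain to hide SBB latency, but the arithmetic is the same.
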
+ +import + # Standard library + std/macros, + # Internal + ../config/common, + ../primitives + +# ############################################################ +# +# Assembly implementation of FpDbl +# +# ############################################################ + +static: doAssert UseASM_X86_64 +{.localPassC:"-fomit-frame-pointer".} # Needed so that the compiler finds enough registers + +# TODO slower than intrinsics + +# Substraction +# ------------------------------------------------------------ + +macro sub2x_gen[N: static int](a: var Limbs[N], b: Limbs[N], M: Limbs[N div 2]): untyped = + ## Generate an optimized out-of-place double-width substraction kernel + + result = newStmtList() + + var ctx = init(Assembler_x86, BaseType) + let + N2 = N div 2 + + arrA = init(OperandArray, nimSymbol = a, N, PointerInReg, InputOutput) + # We reuse the reg used for B for overflow detection + arrB = init(OperandArray, nimSymbol = b, N, PointerInReg, InputOutput) + # We could force M as immediate by specializing per moduli + arrM = init(OperandArray, nimSymbol = M, N, PointerInReg, Input) + # If N is too big, we need to spill registers. TODO. + arrT = init(OperandArray, nimSymbol = ident"t", N2, ElemsInReg, Output_EarlyClobber) + arrTadd = init(OperandArray, nimSymbol = ident"tadd", N2, ElemsInReg, Output_EarlyClobber) + + # Substraction + for i in 0 ..< N: + ctx.mov arrT[i mod N2], arrA[i] + if i == 0: + ctx.sub arrT[0], arrB[0] + else: + ctx.sbb arrT[i mod N2], arrB[i] + ctx.mov arrA[i], arrT[i mod N2] + # Interleaved copy the modulus to hide SBB latencies + if i < N2: + ctx.mov arrTadd[i], arrM[i] + + # Mask: underflowed contains 0xFFFF or 0x0000 + let underflowed = arrB.reuseRegister() + ctx.sbb underflowed, underflowed + + # Now mask the adder, with 0 or the modulus limbs + for i in 0 ..< N2: + ctx.`and` arrTadd[i], underflowed + + # Add the masked modulus + for i in 0 ..< N2: + if i == 0: + ctx.add arrT[0], arrTadd[0] + else: + ctx.adc arrT[i], arrTadd[i] + ctx.mov arrA[i+N2], arrT[i] + + let t = arrT.nimSymbol + let tadd = arrTadd.nimSymbol + result.add quote do: + var `t`{.noinit.}, `tadd` {.noInit.}: typeof(`a`) + result.add ctx.generate + +func sub2x_asm*[N: static int](a: var Limbs[N], b: Limbs[N], M: Limbs[N div 2]) = + ## Constant-time double-width substraction + sub2x_gen(a, b, M) diff --git a/constantine/arithmetic/limbs_asm_modular_x86.nim b/constantine/arithmetic/limbs_asm_modular_x86.nim index 40108c0..05c0aff 100644 --- a/constantine/arithmetic/limbs_asm_modular_x86.nim +++ b/constantine/arithmetic/limbs_asm_modular_x86.nim @@ -12,7 +12,7 @@ import # Internal ../config/common, ../primitives, - ./limbs_generic + ./limbs # ############################################################ # @@ -25,7 +25,7 @@ import # They are nice to let the compiler deals with mov # but too constraining so we move things ourselves. -static: doAssert UseX86ASM +static: doAssert UseASM_X86_64 {.localPassC:"-fomit-frame-pointer".} # Needed so that the compiler finds enough registers @@ -64,6 +64,7 @@ macro addmod_gen[N: static int](a: var Limbs[N], b, M: Limbs[N]): untyped = ctx.mov arrTsub[i], arrT[i] # Mask: overflowed contains 0xFFFF or 0x0000 + # TODO: unnecessary if MSB never set, i.e. 
"canUseNoCarryMontyMul" let overflowed = arrB.reuseRegister() ctx.sbb overflowed, overflowed @@ -118,7 +119,7 @@ macro submod_gen[N: static int](a: var Limbs[N], b, M: Limbs[N]): untyped = arrT = init(OperandArray, nimSymbol = ident"t", N, ElemsInReg, Output_EarlyClobber) arrTadd = init(OperandArray, nimSymbol = ident"tadd", N, ElemsInReg, Output_EarlyClobber) - # Addition + # Substraction for i in 0 ..< N: ctx.mov arrT[i], arrA[i] if i == 0: diff --git a/constantine/arithmetic/limbs_asm_montmul_x86.nim b/constantine/arithmetic/limbs_asm_montmul_x86.nim index 7be67c8..8a50584 100644 --- a/constantine/arithmetic/limbs_asm_montmul_x86.nim +++ b/constantine/arithmetic/limbs_asm_montmul_x86.nim @@ -12,7 +12,8 @@ import # Internal ../config/common, ../primitives, - ./limbs_generic + ./limbs, + ./limbs_asm_montred_x86 # ############################################################ # @@ -25,7 +26,7 @@ import # They are nice to let the compiler deals with mov # but too constraining so we move things ourselves. -static: doAssert UseX86ASM +static: doAssert UseASM_X86_64 # Necessary for the compiler to find enough registers (enabled at -O1) {.localPassC:"-fomit-frame-pointer".} @@ -33,28 +34,6 @@ static: doAssert UseX86ASM # Montgomery multiplication # ------------------------------------------------------------ # Fallback when no ADX and BMI2 support (MULX, ADCX, ADOX) - -proc finalSub*( - ctx: var Assembler_x86, - r: Operand or OperandArray, - t, M, scratch: OperandArray - ) = - ## Reduce `t` into `r` modulo `M` - let N = M.len - ctx.comment "Final substraction" - for i in 0 ..< N: - ctx.mov scratch[i], t[i] - if i == 0: - ctx.sub scratch[i], M[i] - else: - ctx.sbb scratch[i], M[i] - - # If we borrowed it means that we were smaller than - # the modulus and we don't need "scratch" - for i in 0 ..< N: - ctx.cmovnc t[i], scratch[i] - ctx.mov r[i], t[i] - macro montMul_CIOS_nocarry_gen[N: static int](r_MM: var Limbs[N], a_MM, b_MM, M_MM: Limbs[N], m0ninv_MM: BaseType): untyped = ## Generate an optimized Montgomery Multiplication kernel ## using the CIOS method @@ -211,7 +190,7 @@ macro montMul_CIOS_nocarry_gen[N: static int](r_MM: var Limbs[N], a_MM, b_MM, M_ ctx.mov rRDX, r let r2 = rRDX.asArrayAddr(len = N) - ctx.finalSub( + ctx.finalSubNoCarry( r2, t, M, scratch ) diff --git a/constantine/arithmetic/limbs_asm_montmul_x86_adx_bmi2.nim b/constantine/arithmetic/limbs_asm_montmul_x86_adx_bmi2.nim index f9d7d57..db6b85a 100644 --- a/constantine/arithmetic/limbs_asm_montmul_x86_adx_bmi2.nim +++ b/constantine/arithmetic/limbs_asm_montmul_x86_adx_bmi2.nim @@ -12,8 +12,8 @@ import # Internal ../config/common, ../primitives, - ./limbs_generic, - ./limbs_asm_montmul_x86 + ./limbs, + ./limbs_asm_montred_x86 # ############################################################ # @@ -26,7 +26,7 @@ import # They are nice to let the compiler deals with mov # but too constraining so we move things ourselves. -static: doAssert UseX86ASM +static: doAssert UseASM_X86_64 # MULX/ADCX/ADOX {.localPassC:"-madx -mbmi2".} @@ -37,53 +37,58 @@ static: doAssert UseX86ASM # ------------------------------------------------------------ proc mulx_by_word( ctx: var Assembler_x86, - C: Operand, + hi: Operand, t: OperandArray, a: Operand, # Pointer in scratchspace - word: Operand, - S, rRDX: Operand + word0: Operand, + lo, rRDX: Operand ) = ## Multiply the `a[0..= 2, "The Assembly-optimized montgomery multiplication requires at least 2 limbs." ctx.comment " Outer loop i = 0" - ctx.`xor` rRDX, rRDX # Clear flags - TODO: necessary? 
- ctx.mov rRDX, word # for j=0 to N-1 # (C,t[j]) := t[j] + a[j]*b[i] + C # First limb - ctx.mulx t[1], t[0], a[0], rdx + ctx.mov rRDX, word0 + if N > 1: + ctx.mulx t[1], t[0], a[0], rdx + ctx.`xor` hi, hi # Clear flags - TODO: necessary? + else: + ctx.mulx hi, t[0], a[0], rdx + return # Steady state for j in 1 ..< N-1: - ctx.mulx t[j+1], S, a[j], rdx - ctx.adox t[j], S # TODO, we probably can use ADC here + ctx.mulx t[j+1], lo, a[j], rdx + if j == 1: + ctx.add t[j], lo + else: + ctx.adc t[j], lo # Last limb - ctx.mulx C, S, a[N-1], rdx - ctx.adox t[N-1], S + ctx.comment " Outer loop i = 0, last limb" + ctx.mulx hi, lo, a[N-1], rdx + ctx.adc t[N-1], lo # Final carries - ctx.comment " Mul carries i = 0" - ctx.mov rRDX, 0 # Set to 0 without clearing flags - ctx.adcx C, rRDX - ctx.adox C, rRDX + ctx.comment " Accumulate last carries in hi word" + ctx.adc hi, 0 proc mulaccx_by_word( ctx: var Assembler_x86, - C: Operand, + hi: Operand, t: OperandArray, a: Operand, # Pointer in scratchspace i: int, word: Operand, - S, rRDX: Operand + lo, rRDX: Operand ) = ## Multiply the `a[0..= 2, "The Assembly-optimized montgomery multiplication requires at least 2 limbs." doAssert i != 0 - ctx.comment " Outer loop i = " & $i - ctx.`xor` rRDX, rRDX # Clear flags - TODO: necessary? + ctx.comment " Outer loop i = " & $i & ", j in [0, " & $N & ")" ctx.mov rRDX, word + ctx.`xor` hi, hi # Clear flags - TODO: necessary? # for j=0 to N-1 # (C,t[j]) := t[j] + a[j]*b[i] + C # Steady state for j in 0 ..< N-1: - ctx.mulx C, S, a[j], rdx - ctx.adox t[j], S - ctx.adcx t[j+1], C + ctx.mulx hi, lo, a[j], rdx + ctx.adox t[j], lo + ctx.adcx t[j+1], hi # Last limb - ctx.mulx C, S, a[N-1], rdx - ctx.adox t[N-1], S + ctx.comment " Outer loop i = " & $i & ", last limb" + ctx.mulx hi, lo, a[N-1], rdx + ctx.adox t[N-1], lo # Final carries - ctx.comment " Mul carries i = " & $i + ctx.comment " Accumulate last carries in hi word" ctx.mov rRDX, 0 # Set to 0 without clearing flags - ctx.adcx C, rRDX - ctx.adox C, rRDX + ctx.adcx hi, rRDX + ctx.adox hi, rRDX proc partialRedx( ctx: var Assembler_x86, @@ -163,7 +168,7 @@ proc partialRedx( ctx.adox t[j-1], S # Last carries - # t[N-1} = S + C + # t[N-1] = S + C ctx.comment " Reduction carry " ctx.mov S, 0 ctx.adcx t[N-1], S @@ -182,13 +187,13 @@ macro montMul_CIOS_nocarry_adx_bmi2_gen[N: static int](r_MM: var Limbs[N], a_MM, let scratchSlots = max(N, 6) - r = init(OperandArray, nimSymbol = r_MM, N, PointerInReg, InputOutput) + r = init(OperandArray, nimSymbol = r_MM, N, PointerInReg, InputOutput_EnsureClobber) # We could force M as immediate by specializing per moduli M = init(OperandArray, nimSymbol = M_MM, N, PointerInReg, Input) # If N is too big, we need to spill registers. TODO. 
t = init(OperandArray, nimSymbol = ident"t", N, ElemsInReg, Output_EarlyClobber) # MultiPurpose Register slots - scratch = init(OperandArray, nimSymbol = ident"scratch", scratchSlots, ElemsInReg, InputOutput) + scratch = init(OperandArray, nimSymbol = ident"scratch", scratchSlots, ElemsInReg, InputOutput_EnsureClobber) # MULX requires RDX rRDX = Operand( @@ -270,7 +275,7 @@ macro montMul_CIOS_nocarry_adx_bmi2_gen[N: static int](r_MM: var Limbs[N], a_MM, lo, C, rRDX ) - ctx.finalSub( + ctx.finalSubNoCarry( r, t, M, scratch ) diff --git a/constantine/arithmetic/limbs_asm_montred_x86.nim b/constantine/arithmetic/limbs_asm_montred_x86.nim new file mode 100644 index 0000000..a696812 --- /dev/null +++ b/constantine/arithmetic/limbs_asm_montred_x86.nim @@ -0,0 +1,255 @@ +# Constantine +# Copyright (c) 2018-2019 Status Research & Development GmbH +# Copyright (c) 2020-Present Mamy André-Ratsimbazafy +# Licensed and distributed under either of +# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT). +# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0). +# at your option. This file may not be copied, modified, or distributed except according to those terms. + +import + # Standard library + std/macros, + # Internal + ../config/common, + ../primitives, + ./limbs + +# ############################################################ +# +# Assembly implementation of finite fields +# +# ############################################################ + + +static: doAssert UseASM_X86_32 + +# Necessary for the compiler to find enough registers (enabled at -O1) +{.localPassC:"-fomit-frame-pointer".} + +proc finalSubNoCarry*( + ctx: var Assembler_x86, + r: Operand or OperandArray, + t, M, scratch: OperandArray + ) = + ## Reduce `t` into `r` modulo `M` + let N = M.len + ctx.comment "Final substraction (no carry)" + for i in 0 ..< N: + ctx.mov scratch[i], t[i] + if i == 0: + ctx.sub scratch[i], M[i] + else: + ctx.sbb scratch[i], M[i] + + # If we borrowed it means that we were smaller than + # the modulus and we don't need "scratch" + for i in 0 ..< N: + ctx.cmovnc t[i], scratch[i] + ctx.mov r[i], t[i] + +proc finalSubCanOverflow*( + ctx: var Assembler_x86, + r: Operand or OperandArray, + t, M, scratch: OperandArray, + overflowReg: Operand + ) = + ## Reduce `t` into `r` modulo `M` + ## To be used when the final substraction can + ## also depend on the carry flag + ## This is in particular possible when the MSB + ## is set for the prime modulus + ## `overflowReg` should be a register that will be used + ## to store the carry flag + + ctx.sbb overflowReg, overflowReg + + let N = M.len + ctx.comment "Final substraction (may carry)" + for i in 0 ..< N: + ctx.mov scratch[i], t[i] + if i == 0: + ctx.sub scratch[i], M[i] + else: + ctx.sbb scratch[i], M[i] + + ctx.sbb overflowReg, 0 + + # If we borrowed it means that we were smaller than + # the modulus and we don't need "scratch" + for i in 0 ..< N: + ctx.cmovnc t[i], scratch[i] + ctx.mov r[i], t[i] + + +# Montgomery reduction +# ------------------------------------------------------------ + +macro montyRed_gen[N: static int]( + r_MR: var array[N, SecretWord], + t_MR: array[N*2, SecretWord], + M_MR: array[N, SecretWord], + m0ninv_MR: BaseType, + canUseNoCarryMontyMul: static bool + ) = + # TODO, slower than Clang, in particular due to the shadowing + + result = newStmtList() + + var ctx = init(Assembler_x86, BaseType) + let + # We could force M as immediate by specializing per 
moduli + M = init(OperandArray, nimSymbol = M_MR, N, PointerInReg, Input) + + # MUL requires RAX and RDX + rRAX = Operand( + desc: OperandDesc( + asmId: "[rax]", + nimSymbol: ident"rax", + rm: RAX, + constraint: InputOutput_EnsureClobber, + cEmit: "rax" + ) + ) + + rRDX = Operand( + desc: OperandDesc( + asmId: "[rdx]", + nimSymbol: ident"rdx", + rm: RDX, + constraint: Output_EarlyClobber, + cEmit: "rdx" + ) + ) + + m0ninv = Operand( + desc: OperandDesc( + asmId: "[m0ninv]", + nimSymbol: m0ninv_MR, + rm: Reg, + constraint: Input, + cEmit: "m0ninv" + ) + ) + + + let scratchSlots = N+2 + var scratch = init(OperandArray, nimSymbol = ident"scratch", scratchSlots, ElemsInReg, InputOutput_EnsureClobber) + + # Prologue + let eax = rRAX.desc.nimSymbol + let edx = rRDX.desc.nimSymbol + let scratchSym = scratch.nimSymbol + result.add quote do: + static: doAssert: sizeof(SecretWord) == sizeof(ByteAddress) + + var `eax`{.noInit.}, `edx`{.noInit.}: BaseType + var `scratchSym` {.noInit.}: Limbs[`scratchSlots`] + + # Algorithm + # --------------------------------------------------------- + # for i in 0 .. n-1: + # hi <- 0 + # m <- t[i] * m0ninv mod 2^w (i.e. simple multiplication) + # for j in 0 .. n-1: + # (hi, lo) <- t[i+j] + m * M[j] + hi + # t[i+j] <- lo + # t[i+n] += hi + # for i in 0 .. n-1: + # r[i] = t[i+n] + # if r >= M: + # r -= M + + # No register spilling handling + doAssert N <= 6, "The Assembly-optimized montgomery multiplication requires at most 6 limbs." + + result.add quote do: + `eax` = BaseType `t_MR`[0] + `scratchSym`[1 .. `N`-1] = `t_MR`.toOpenArray(1, `N`-1) + + ctx.mov scratch[N], rRAX + ctx.imul rRAX, m0ninv # m <- t[i] * m0ninv mod 2^w + ctx.mov scratch[0], rRAX + + # scratch: [t[0] * m0, t[1], t[2], t[3], t[0]] for 4 limbs + + for i in 0 ..< N: + ctx.comment "" + let hi = scratch[N] + let next = scratch[N+1] + + ctx.mul rdx, rax, M[0], rax + ctx.add hi, rRAX # Guaranteed to be zero + ctx.mov rRAX, scratch[0] + ctx.adc hi, rRDX + + for j in 1 ..< N-1: + ctx.comment "" + ctx.mul rdx, rax, M[j], rax + ctx.add scratch[j], rRAX + ctx.mov rRAX, scratch[0] + ctx.adc rRDX, 0 + ctx.add scratch[j], hi + ctx.adc rRDX, 0 + ctx.mov hi, rRDX + + # Next load + if i < N-1: + ctx.comment "" + ctx.mov next, scratch[1] + ctx.imul scratch[1], m0ninv + ctx.comment "" + + # Last limb + ctx.comment "" + ctx.mul rdx, rax, M[N-1], rax + ctx.add scratch[N-1], rRAX + ctx.mov rRAX, scratch[1] # Contains next * m0 + ctx.adc rRDX, 0 + ctx.add scratch[N-1], hi + ctx.adc rRDX, 0 + ctx.mov hi, rRDX + + scratch.rotateLeft() + + # Code generation + result.add ctx.generate() + + # New codegen + ctx = init(Assembler_x86, BaseType) + + let r = init(OperandArray, nimSymbol = r_MR, N, PointerInReg, InputOutput_EnsureClobber) + let t = init(OperandArray, nimSymbol = t_MR, N*2, PointerInReg, Input) + let extraRegNeeded = N-2 + let tsub = init(OperandArray, nimSymbol = ident"tsub", extraRegNeeded, ElemsInReg, InputOutput_EnsureClobber) + let tsubsym = tsub.nimSymbol + result.add quote do: + var `tsubsym` {.noInit.}: Limbs[`extraRegNeeded`] + + # This does t[i+n] += hi + # but in a separate carry chain, fused with the + # copy "r[i] = t[i+n]" + for i in 0 ..< N: + if i == 0: + ctx.add scratch[i], t[i+N] + else: + ctx.adc scratch[i], t[i+N] + + let reuse = repackRegisters(tsub, scratch[N], scratch[N+1]) + + if canUseNoCarryMontyMul: + ctx.finalSubNoCarry(r, scratch, M, reuse) + else: + ctx.finalSubCanOverflow(r, scratch, M, reuse, rRAX) + + # Code generation + result.add ctx.generate() + +func montRed_asm*[N: static int]( + 
r: var array[N, SecretWord], + t: array[N*2, SecretWord], + M: array[N, SecretWord], + m0ninv: BaseType, + canUseNoCarryMontyMul: static bool + ) = + ## Constant-time Montgomery reduction + montyRed_gen(r, t, M, m0ninv, canUseNoCarryMontyMul) diff --git a/constantine/arithmetic/limbs_asm_montred_x86_adx_bmi2.nim b/constantine/arithmetic/limbs_asm_montred_x86_adx_bmi2.nim new file mode 100644 index 0000000..442cd34 --- /dev/null +++ b/constantine/arithmetic/limbs_asm_montred_x86_adx_bmi2.nim @@ -0,0 +1,191 @@ +# Constantine +# Copyright (c) 2018-2019 Status Research & Development GmbH +# Copyright (c) 2020-Present Mamy André-Ratsimbazafy +# Licensed and distributed under either of +# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT). +# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0). +# at your option. This file may not be copied, modified, or distributed except according to those terms. + +import + # Standard library + std/macros, + # Internal + ../config/common, + ../primitives, + ./limbs, + ./limbs_asm_montred_x86 + +# ############################################################ +# +# Assembly implementation of finite fields +# +# ############################################################ + +# TODO, MCL has an implementation about 14% faster + +static: doAssert UseASM_X86_64 + +# MULX/ADCX/ADOX +{.localPassC:"-madx -mbmi2".} +# Necessary for the compiler to find enough registers (enabled at -O1) +{.localPassC:"-fomit-frame-pointer".} + +# Montgomery reduction +# ------------------------------------------------------------ + +macro montyRedx_gen[N: static int]( + r_MR: var array[N, SecretWord], + t_MR: array[N*2, SecretWord], + M_MR: array[N, SecretWord], + m0ninv_MR: BaseType, + canUseNoCarryMontyMul: static bool + ) = + # TODO, slower than Clang, in particular due to the shadowing + + result = newStmtList() + + var ctx = init(Assembler_x86, BaseType) + let + # We could force M as immediate by specializing per moduli + M = init(OperandArray, nimSymbol = M_MR, N, PointerInReg, Input) + + hi = Operand( + desc: OperandDesc( + asmId: "[hi]", + nimSymbol: ident"hi", + rm: Reg, + constraint: Output_EarlyClobber, + cEmit: "hi" + ) + ) + + lo = Operand( + desc: OperandDesc( + asmId: "[lo]", + nimSymbol: ident"lo", + rm: Reg, + constraint: Output_EarlyClobber, + cEmit: "lo" + ) + ) + + rRDX = Operand( + desc: OperandDesc( + asmId: "[rdx]", + nimSymbol: ident"rdx", + rm: RDX, + constraint: InputOutput_EnsureClobber, + cEmit: "rdx" + ) + ) + + m0ninv = Operand( + desc: OperandDesc( + asmId: "[m0ninv]", + nimSymbol: m0ninv_MR, + rm: Reg, + constraint: Input, + cEmit: "m0ninv" + ) + ) + + let scratchSlots = N+1 + var scratch = init(OperandArray, nimSymbol = ident"scratch", scratchSlots, ElemsInReg, InputOutput_EnsureClobber) + + # Prologue + let edx = rRDX.desc.nimSymbol + let hisym = hi.desc.nimSymbol + let losym = lo.desc.nimSymbol + let scratchSym = scratch.nimSymbol + result.add quote do: + static: doAssert: sizeof(SecretWord) == sizeof(ByteAddress) + + var `hisym`{.noInit.}, `losym`{.noInit.}, `edx`{.noInit.}: BaseType + var `scratchSym` {.noInit.}: Limbs[`scratchSlots`] + + # Algorithm + # --------------------------------------------------------- + # for i in 0 .. n-1: + # hi <- 0 + # m <- t[i] * m0ninv mod 2^w (i.e. simple multiplication) + # for j in 0 .. n-1: + # (hi, lo) <- t[i+j] + m * M[j] + hi + # t[i+j] <- lo + # t[i+n] += hi + # for i in 0 .. 
n-1: + # r[i] = t[i+n] + # if r >= M: + # r -= M + + # No register spilling handling + doAssert N <= 6, "The Assembly-optimized montgomery multiplication requires at most 6 limbs." + + result.add quote do: + `edx` = BaseType(`m0ninv_MR`) + `scratchSym`[0 .. `N`-1] = `t_MR`.toOpenArray(0, `N`-1) + + for i in 0 ..< N: + # RDX contains m0ninv at the start of each loop + ctx.comment "" + ctx.imul rRDX, scratch[0] # m <- t[i] * m0ninv mod 2^w + ctx.comment "---- Reduction " & $i + ctx.`xor` scratch[N], scratch[N] + + for j in 0 ..< N-1: + ctx.comment "" + ctx.mulx hi, lo, M[j], rdx + ctx.adcx scratch[j], lo + ctx.adox scratch[j+1], hi + + # Last limb + ctx.comment "" + ctx.mulx hi, lo, M[N-1], rdx + ctx.mov rRDX, m0ninv # Reload m0ninv for next iter + ctx.adcx scratch[N-1], lo + ctx.adox hi, scratch[N] + ctx.adcx scratch[N], hi + + scratch.rotateLeft() + + # Code generation + result.add ctx.generate() + + # New codegen + ctx = init(Assembler_x86, BaseType) + + let r = init(OperandArray, nimSymbol = r_MR, N, PointerInReg, InputOutput_EnsureClobber) + let t = init(OperandArray, nimSymbol = t_MR, N*2, PointerInReg, Input) + let extraRegNeeded = N-1 + let tsub = init(OperandArray, nimSymbol = ident"tsub", extraRegNeeded, ElemsInReg, InputOutput_EnsureClobber) + let tsubsym = tsub.nimSymbol + result.add quote do: + var `tsubsym` {.noInit.}: Limbs[`extraRegNeeded`] + + # This does t[i+n] += hi + # but in a separate carry chain, fused with the + # copy "r[i] = t[i+n]" + for i in 0 ..< N: + if i == 0: + ctx.add scratch[i], t[i+N] + else: + ctx.adc scratch[i], t[i+N] + + let reuse = repackRegisters(tsub, scratch[N]) + + if canUseNoCarryMontyMul: + ctx.finalSubNoCarry(r, scratch, M, reuse) + else: + ctx.finalSubCanOverflow(r, scratch, M, reuse, hi) + + # Code generation + result.add ctx.generate() + +func montRed_asm_adx_bmi2*[N: static int]( + r: var array[N, SecretWord], + t: array[N*2, SecretWord], + M: array[N, SecretWord], + m0ninv: BaseType, + canUseNoCarryMontyMul: static bool + ) = + ## Constant-time Montgomery reduction + montyRedx_gen(r, t, M, m0ninv, canUseNoCarryMontyMul) diff --git a/constantine/arithmetic/limbs_asm_mul_x86.nim b/constantine/arithmetic/limbs_asm_mul_x86.nim new file mode 100644 index 0000000..64a12e3 --- /dev/null +++ b/constantine/arithmetic/limbs_asm_mul_x86.nim @@ -0,0 +1,146 @@ +# Constantine +# Copyright (c) 2018-2019 Status Research & Development GmbH +# Copyright (c) 2020-Present Mamy André-Ratsimbazafy +# Licensed and distributed under either of +# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT). +# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0). +# at your option. This file may not be copied, modified, or distributed except according to those terms. + +import + # Standard library + std/macros, + # Internal + ../config/common, + ../primitives + +# ############################################################ +# +# Assembly implementation of bigint multiplication +# +# ############################################################ + +# Note: We can refer to at most 30 registers in inline assembly +# and "InputOutput" registers count double +# They are nice to let the compiler deals with mov +# but too constraining so we move things ourselves. + +# TODO: verify that assembly generated works for small arrays +# that are passed by values + +static: doAssert UseASM_X86_64 # Need 8 registers just for mul + # and 32-bit only has 8 max. 
+ +macro mul_gen[rLen, aLen, bLen: static int](r: var Limbs[rLen], a: Limbs[aLen], b: Limbs[bLen]) = + ## Comba multiplication generator + ## `a`, `b`, `r` can have a different number of limbs + ## if `r`.limbs.len < a.limbs.len + b.limbs.len + ## The result will be truncated, i.e. it will be + ## a * b (mod (2^WordBitwidth)^r.limbs.len) + ## + ## Assumes r doesn't aliases a or b + + result = newStmtList() + + var ctx = init(Assembler_x86, BaseType) + let + arrR = init(OperandArray, nimSymbol = r, rLen, PointerInReg, InputOutput_EnsureClobber) + arrA = init(OperandArray, nimSymbol = a, aLen, PointerInReg, Input) + arrB = init(OperandArray, nimSymbol = b, bLen, PointerInReg, Input) + + t = Operand( + desc: OperandDesc( + asmId: "[t]", + nimSymbol: ident"t", + rm: Reg, + constraint: Output_EarlyClobber, + cEmit: "t" + ) + ) + + u = Operand( + desc: OperandDesc( + asmId: "[u]", + nimSymbol: ident"u", + rm: Reg, + constraint: Output_EarlyClobber, + cEmit: "u" + ) + ) + + v = Operand( + desc: OperandDesc( + asmId: "[v]", + nimSymbol: ident"v", + rm: Reg, + constraint: Output_EarlyClobber, + cEmit: "v" + ) + ) + + # MUL requires RAX and RDX + rRAX = Operand( + desc: OperandDesc( + asmId: "[rax]", + nimSymbol: ident"rax", + rm: RAX, + constraint: Output_EarlyClobber, + cEmit: "rax" + ) + ) + + rRDX = Operand( + desc: OperandDesc( + asmId: "[rdx]", + nimSymbol: ident"rdx", + rm: RDX, + constraint: Output_EarlyClobber, + cEmit: "rdx" + ) + ) + + + # Prologue + let tsym = t.desc.nimSymbol + let usym = u.desc.nimSymbol + let vsym = v.desc.nimSymbol + let eax = rRAX.desc.nimSymbol + let edx = rRDX.desc.nimSymbol + result.add quote do: + var `tsym`{.noInit.}, `usym`{.noInit.}, `vsym`{.noInit.}: BaseType # zero-init + var `eax`{.noInit.}, `edx`{.noInit.}: BaseType + + # Algorithm + ctx.`xor` u, u + ctx.`xor` v, v + ctx.`xor` t, t + + for i in 0 ..< min(aLen+bLen, rLen): + let ib = min(bLen-1, i) + let ia = i - ib + for j in 0 ..< min(aLen - ia, ib+1): + # (t, u, v) <- (t, u, v) + a[ia+j] * b[ib-j] + ctx.mov rRAX, arrB[ib-j] + ctx.mul rdx, rax, arrA[ia+j], rax + ctx.add v, rRAX + ctx.adc u, rRDX + ctx.adc t, 0 + + ctx.mov arrR[i], v + + if i != min(aLen+bLen, rLen) - 1: + ctx.mov v, u + ctx.mov u, t + ctx.`xor` t, t + + if aLen+bLen < rLen: + ctx.`xor` rRAX, rRAX + for i in aLen+bLen ..< rLen: + ctx.mov arrR[i], rRAX + + # Codegen + result.add ctx.generate + +func mul_asm*[rLen, aLen, bLen: static int](r: var Limbs[rLen], a: Limbs[aLen], b: Limbs[bLen]) = + ## Multi-precision Multiplication + ## Assumes r doesn't alias a or b + mul_gen(r, a, b) diff --git a/constantine/arithmetic/limbs_asm_mul_x86_adx_bmi2.nim b/constantine/arithmetic/limbs_asm_mul_x86_adx_bmi2.nim new file mode 100644 index 0000000..dcc9042 --- /dev/null +++ b/constantine/arithmetic/limbs_asm_mul_x86_adx_bmi2.nim @@ -0,0 +1,197 @@ +# Constantine +# Copyright (c) 2018-2019 Status Research & Development GmbH +# Copyright (c) 2020-Present Mamy André-Ratsimbazafy +# Licensed and distributed under either of +# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT). +# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0). +# at your option. This file may not be copied, modified, or distributed except according to those terms. 
+ +import + # Standard library + std/macros, + # Internal + ../config/common, + ../primitives + +# ############################################################ +# +# Assembly implementation of finite fields +# +# ############################################################ + +# Note: We can refer to at most 30 registers in inline assembly +# and "InputOutput" registers count double +# They are nice to let the compiler deals with mov +# but too constraining so we move things ourselves. + +static: doAssert UseASM_X86_64 + +# MULX/ADCX/ADOX +{.localPassC:"-madx -mbmi2".} +# Necessary for the compiler to find enough registers (enabled at -O1) +# {.localPassC:"-fomit-frame-pointer".} + +# Multiplication +# ------------------------------------------------------------ +proc mulx_by_word( + ctx: var Assembler_x86, + r0: Operand, + a, t: OperandArray, + word0: Operand, + rRAX, rRDX: Operand + ) = + ## Multiply the `a[0..= M: + # r -= M + # + # Important note: `t[i+n] += C` should propagate the carry + # to the higher limb if any, thank you "implementation detail" + # missing from paper. + when UseASM_X86_64 and r.len <= 6: + if ({.noSideEffect.}: hasBmi2()) and ({.noSideEffect.}: hasAdx()): + montRed_asm_adx_bmi2(r, t, M, m0ninv, canUseNoCarryMontyMul) + else: + montRed_asm(r, t, M, m0ninv, canUseNoCarryMontyMul) + elif UseASM_X86_32 and r.len <= 6: + # TODO: Assembly faster than GCC but slower than Clang + montRed_asm(r, t, M, m0ninv, canUseNoCarryMontyMul) + else: + var t = t # Copy "t" for mutation and ensure on stack + var res: typeof(r) # Accumulator + staticFor i, 0, N: + var C = Zero + let m = t[i] * SecretWord(m0ninv) + staticFor j, 0, N: + muladd2(C, t[i+j], m, M[j], t[i+j], C) + res[i] = C + + # This does t[i+n] += C + # but in a separate carry chain, fused with the + # copy "r[i] = t[i+n]" + var carry = Carry(0) + staticFor i, 0, N: + addC(carry, res[i], t[i+N], res[i], carry) + + # Final substraction + discard res.csub(M, SecretWord(carry).isNonZero() or not(res < M)) + r = res diff --git a/constantine/arithmetic/limbs_generic_modular.nim b/constantine/arithmetic/limbs_generic_modular.nim index ac73d6c..c0b0586 100644 --- a/constantine/arithmetic/limbs_generic_modular.nim +++ b/constantine/arithmetic/limbs_generic_modular.nim @@ -9,7 +9,7 @@ import ../config/common, ../primitives, - ./limbs_generic + ./limbs # No exceptions allowed {.push raises: [].} diff --git a/constantine/arithmetic/limbs_montgomery.nim b/constantine/arithmetic/limbs_montgomery.nim index 93d8396..c6e53f4 100644 --- a/constantine/arithmetic/limbs_montgomery.nim +++ b/constantine/arithmetic/limbs_montgomery.nim @@ -12,9 +12,9 @@ import # Internal ../config/common, ../primitives, - ./limbs_generic + ./limbs -when UseX86ASM: +when UseASM_X86_64: import ./limbs_asm_montmul_x86, ./limbs_asm_montmul_x86_adx_bmi2 @@ -37,55 +37,7 @@ when UseX86ASM: # - pairing final exponentiation # are bottlenecked by Montgomery multiplications or squarings # -# Unfortunately, the fastest implementation of Montgomery Multiplication -# on x86 is impossible without resorting to assembly (probably 15~30% faster) -# -# It requires implementing 2 parallel pipelines of carry-chains (via instruction-level parallelism) -# of MULX, ADCX and ADOX instructions, according to Intel paper: -# https://www.intel.cn/content/dam/www/public/us/en/documents/white-papers/ia-large-integer-arithmetic-paper.pdf -# and the code generation of MCL -# https://github.com/herumi/mcl -# -# A generic implementation would require implementing a mini-compiler as macro -# 
significantly sacrificing code readability, portability, auditability and maintainability. -# -# This would however save significant hardware or cloud resources. -# An example inline assembly compiler for add-with-carry is available in -# primitives/research/addcarry_subborrow_compiler.nim -# -# Instead we follow the optimized high-level implementation of Goff -# which skips a significant amount of additions for moduli -# that have their the most significant bit unset. - -# Loop unroller -# ------------------------------------------------------------ - -proc replaceNodes(ast: NimNode, what: NimNode, by: NimNode): NimNode = - # Replace "what" ident node by "by" - proc inspect(node: NimNode): NimNode = - case node.kind: - of {nnkIdent, nnkSym}: - if node.eqIdent(what): - return by - return node - of nnkEmpty: - return node - of nnkLiterals: - return node - else: - var rTree = node.kind.newTree() - for child in node: - rTree.add inspect(child) - return rTree - result = inspect(ast) - -macro staticFor(idx: untyped{nkIdent}, start, stopEx: static int, body: untyped): untyped = - result = newStmtList() - for i in start ..< stopEx: - result.add nnkBlockStmt.newTree( - ident("unrolledIter_" & $idx & $i), - body.replaceNodes(idx, newLit i) - ) +# Hence we use inline assembly where possible # No exceptions allowed {.push raises: [].} @@ -348,7 +300,7 @@ func montyMul*( # - specialize/duplicate code for m0ninv == 1 (especially if only 1 curve is needed) # - keep it generic and optimize code size when canUseNoCarryMontyMul: - when UseX86ASM and a.len in {2 .. 6}: # TODO: handle spilling + when UseASM_X86_64 and a.len in {2 .. 6}: # TODO: handle spilling if ({.noSideEffect.}: hasBmi2()) and ({.noSideEffect.}: hasAdx()): montMul_CIOS_nocarry_asm_adx_bmi2(r, a, b, M, m0ninv) else: diff --git a/constantine/config/common.nim b/constantine/config/common.nim index 4707845..050d7a1 100644 --- a/constantine/config/common.nim +++ b/constantine/config/common.nim @@ -30,6 +30,14 @@ type SecretBool* = CTBool[SecretWord] + + Limbs*[N: static int] = array[N, SecretWord] + ## Limbs-type + ## Should be distinct type to avoid builtins to use non-constant time + ## implementation, for example for comparison. 
+ ## + ## but for unknown reason, it prevents semchecking `bits` + const WordBitWidth* = sizeof(SecretWord) * 8 ## Logical word size @@ -44,7 +52,8 @@ const # TODO, we restrict assembly to 64-bit words # We need to support register spills for large limbs const ConstantineASM {.booldefine.} = true -const UseX86ASM* = WordBitWidth == 64 and ConstantineASM and X86 and GCC_Compatible +const UseASM_X86_32* = ConstantineASM and X86 and GCC_Compatible +const UseASM_X86_64* = WordBitWidth == 64 and UseASM_X86_32 # ############################################################ # diff --git a/constantine/config/curves.nim b/constantine/config/curves.nim index cc6ecdb..a2debf6 100644 --- a/constantine/config/curves.nim +++ b/constantine/config/curves.nim @@ -27,15 +27,19 @@ macro Mod*(C: static Curve): untyped = ## Get the Modulus associated to a curve result = bindSym($C & "_Modulus") -func getCurveBitwidth*(C: static Curve): static int = +template getCurveBitwidth*(C: Curve): int = ## Returns the number of bits taken by the curve modulus - result = static(CurveBitWidth[C]) + CurveBitWidth[C] template matchingBigInt*(C: static Curve): untyped = BigInt[CurveBitWidth[C]] -func family*(C: static Curve): CurveFamily = - result = static(CurveFamilies[C]) +template family*(C: Curve): CurveFamily = + CurveFamilies[C] + +template matchingLimbs2x*(C: Curve): untyped = + const N2 = wordsRequired(getCurveBitwidth(C)) * 2 # TODO upstream, not precomputing N2 breaks semcheck + array[N2, SecretWord] # TODO upstream, using Limbs[N2] breaks semcheck # ############################################################ # diff --git a/constantine/config/type_bigint.nim b/constantine/config/type_bigint.nim index 340cfff..4579545 100644 --- a/constantine/config/type_bigint.nim +++ b/constantine/config/type_bigint.nim @@ -31,8 +31,6 @@ type debug: import std/strutils - type Limbs[N: static int] = array[N, SecretWord] - func toString*(a: Limbs): string = result = "[" result.add " 0x" & toHex(BaseType(a[0])) diff --git a/constantine/elliptic/ec_endomorphism_accel.nim b/constantine/elliptic/ec_endomorphism_accel.nim index eece904..bcaae31 100644 --- a/constantine/elliptic/ec_endomorphism_accel.nim +++ b/constantine/elliptic/ec_endomorphism_accel.nim @@ -10,7 +10,6 @@ import # Standard Library std/typetraits, # Internal - ../../helpers/static_for, ../primitives, ../config/[common, curves, type_bigint], ../arithmetic, diff --git a/constantine/elliptic/ec_endomorphism_params.nim b/constantine/elliptic/ec_endomorphism_params.nim index 1f00a75..8f430b3 100644 --- a/constantine/elliptic/ec_endomorphism_params.nim +++ b/constantine/elliptic/ec_endomorphism_params.nim @@ -8,7 +8,6 @@ import # Internal - ../../helpers/static_for, ../primitives, ../config/[common, curves, type_bigint], ../arithmetic, diff --git a/constantine/elliptic/ec_weierstrass_projective.nim b/constantine/elliptic/ec_weierstrass_projective.nim index 7c27fcf..3bdcd51 100644 --- a/constantine/elliptic/ec_weierstrass_projective.nim +++ b/constantine/elliptic/ec_weierstrass_projective.nim @@ -190,7 +190,7 @@ func sum*[F]( r.y.sum(Q.x, Q.z) # 15. Y3 <- X2 + Z2 r.x *= r.y # 16. X3 <- X3 Y3 X3 = (X1 Z1)(X2 Z2) r.y.sum(t0, t2) # 17. Y3 <- t0 + t2 Y3 = X1 X2 + Z1 Z2 - r.y.diff(r.x, r.y) # 18. Y3 <- X3 - Y3 Y3 = (X1 + Z1)(X2 + Z2) - (X1 X2 + Z1 Z2) = X1 Z2 + X2 Z1 + r.y.diffAlias(r.x, r.y) # 18. 
Y3 <- X3 - Y3 Y3 = (X1 + Z1)(X2 + Z2) - (X1 X2 + Z1 Z2) = X1 Z2 + X2 Z1 when F is Fp2 and F.C.getSexticTwist() == D_Twist: t0 *= SexticNonResidue t1 *= SexticNonResidue @@ -206,7 +206,7 @@ func sum*[F]( r.y *= SexticNonResidue r.x.prod(t4, r.y) # 25. X3 <- t4 Y3 X3 = 3b(Y1 Z2 + Y2 Z1)(X1 Z2 + X2 Z1) t2.prod(t3, t1) # 26. t2 <- t3 t1 t2 = (X1 Y2 + X2 Y1) (Y1 Y2 - 3b Z1 Z2) - r.x.diff(t2, r.x) # 27. X3 <- t2 - X3 X3 = (X1 Y2 + X2 Y1) (Y1 Y2 - 3b Z1 Z2) - 3b(Y1 Z2 + Y2 Z1)(X1 Z2 + X2 Z1) + r.x.diffAlias(t2, r.x) # 27. X3 <- t2 - X3 X3 = (X1 Y2 + X2 Y1) (Y1 Y2 - 3b Z1 Z2) - 3b(Y1 Z2 + Y2 Z1)(X1 Z2 + X2 Z1) r.y *= t0 # 28. Y3 <- Y3 t0 Y3 = 9b X1 X2 (X1 Z2 + X2 Z1) t1 *= r.z # 29. t1 <- t1 Z3 t1 = (Y1 Y2 - 3b Z1 Z2)(Y1 Y2 + 3b Z1 Z2) r.y += t1 # 30. Y3 <- t1 + Y3 Y3 = (Y1 Y2 + 3b Z1 Z2)(Y1 Y2 - 3b Z1 Z2) + 9b X1 X2 (X1 Z2 + X2 Z1) diff --git a/constantine/primitives.nim b/constantine/primitives.nim index 0db7d07..7bea2a9 100644 --- a/constantine/primitives.nim +++ b/constantine/primitives.nim @@ -12,7 +12,8 @@ import primitives/multiplexers, primitives/addcarry_subborrow, primitives/extended_precision, - primitives/bithacks + primitives/bithacks, + primitives/static_for export constant_time_types, @@ -20,7 +21,8 @@ export multiplexers, addcarry_subborrow, extended_precision, - bithacks + bithacks, + staticFor when X86 and GCC_Compatible: import primitives/[cpuinfo_x86, macro_assembler_x86] diff --git a/constantine/primitives/macro_assembler_x86.nim b/constantine/primitives/macro_assembler_x86.nim index f9630f7..96faf1c 100644 --- a/constantine/primitives/macro_assembler_x86.nim +++ b/constantine/primitives/macro_assembler_x86.nim @@ -6,7 +6,7 @@ # * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0). # at your option. This file may not be copied, modified, or distributed except according to those terms. 
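# ------------------------------------------------------------------------------
# For illustration only: the r.y.diffAlias(r.x, r.y) calls introduced above in
# ec_weierstrass_projective.nim exist because the regular `diff` does not
# promise to handle the destination aliasing the subtrahend `b`. A minimal,
# hedged sketch of an aliasing-tolerant subtraction simply stages the result in
# a temporary; `diffAlias_sketch` is a hypothetical name, and the extension-field
# diffAlias added further down in tower_common.nim instead delegates to a
# per-coordinate diffAlias.
func diffAlias_sketch[F](r: var F, a, b: F) =
  var t {.noInit.}: F
  t.diff(a, b)   # `t` cannot alias `a` or `b`, so the plain diff is safe here
  r = t
# ------------------------------------------------------------------------------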
-import std/[macros, strutils, sets, hashes] +import std/[macros, strutils, sets, hashes, algorithm] # A compile-time inline assembler @@ -33,6 +33,9 @@ type RAX = "a" + # Flags + CarryFlag = "@ccc" + Register* = enum rbx, rdx, r8, rax @@ -98,6 +101,12 @@ func hash(od: OperandDesc): Hash = func len*(opArray: OperandArray): int = opArray.buf.len +func len*(opArray: Operand): int = + opArray.buf.len + +func rotateLeft*(opArray: var OperandArray) = + opArray.buf.rotateLeft(1) + proc `[]`*(opArray: OperandArray, index: int): Operand = opArray.buf[index] @@ -181,6 +190,25 @@ func asArrayAddr*(op: Operand, len: int): Operand = # Code generation # ------------------------------------------------------------------------------------------------------------ +func setToCarryFlag*(a: var Assembler_x86, carry: NimNode) = + + # We need to dereference the hidden pointer of var param + let isHiddenDeref = carry.kind == nnkHiddenDeref + let nimSymbol = if isHiddenDeref: carry[0] + else: carry + {.noSideEffect.}: + let symStr = $nimSymbol + + let desc = OperandDesc( + asmId: "", + nimSymbol: ident(symStr), + rm: CarryFlag, + constraint: Output_Overwrite, + cEmit: symStr + ) + + a.operands.incl(desc) + func generate*(a: Assembler_x86): NimNode = ## Generate the inline assembly code from ## the desired instruction @@ -527,7 +555,7 @@ func cmovc*(a: var Assembler_x86, dst, src: Operand) = func cmovnc*(a: var Assembler_x86, dst, src: Operand) = ## Does: dst <- src if the carry flag is not set doAssert dst.desc.rm in {Reg, ElemsInReg}, "The destination operand must be a register: " & $dst.repr - doAssert dst.desc.constraint in {Output_EarlyClobber, InputOutput, Output_Overwrite}, $dst.repr + doAssert dst.desc.constraint in OutputReg, $dst.repr a.codeFragment("cmovnc", src, dst) # No clobber @@ -566,7 +594,7 @@ func mul*(a: var Assembler_x86, dHi, dLo: Register, src0: Operand, src1: Registe func imul*(a: var Assembler_x86, dst, src: Operand) = ## Does dst <- dst * src, keeping only the low half - doAssert dst.desc.rm in {Reg, ElemsInReg}, "The destination operand must be a register: " & $dst.repr + doAssert dst.desc.rm in {Reg, ElemsInReg}+SpecificRegisters, "The destination operand must be a register: " & $dst.repr doAssert dst.desc.constraint in OutputReg, $dst.repr a.codeFragment("imul", src, dst) @@ -574,9 +602,9 @@ func imul*(a: var Assembler_x86, dst, src: Operand) = func mulx*(a: var Assembler_x86, dHi, dLo, src0: Operand, src1: Register) = ## Does (dHi, dLo) <- src0 * src1 doAssert src1 == rdx, "MULX requires the RDX register" - doAssert dHi.desc.rm in {Reg, ElemsInReg} or dHi.desc.rm in SpecificRegisters, + doAssert dHi.desc.rm in {Reg, ElemsInReg}+SpecificRegisters, "The destination operand must be a register " & $dHi.repr - doAssert dLo.desc.rm in {Reg, ElemsInReg} or dLo.desc.rm in SpecificRegisters, + doAssert dLo.desc.rm in {Reg, ElemsInReg}+SpecificRegisters, "The destination operand must be a register " & $dLo.repr doAssert dHi.desc.constraint in OutputReg doAssert dLo.desc.constraint in OutputReg @@ -595,7 +623,7 @@ func adcx*(a: var Assembler_x86, dst, src: Operand) = ## Does: dst <- dst + src + carry ## and only sets the carry flag doAssert dst.desc.constraint in OutputReg, $dst.repr - doAssert dst.desc.rm in {Reg, ElemsInReg}, "The destination operand must be a register: " & $dst.repr + doAssert dst.desc.rm in {Reg, ElemsInReg}+SpecificRegisters, "The destination operand must be a register: " & $dst.repr a.codeFragment("adcx", src, dst) a.areFlagsClobbered = true @@ -603,7 +631,7 @@ func 
adox*(a: var Assembler_x86, dst, src: Operand) = ## Does: dst <- dst + src + overflow ## and only sets the overflow flag doAssert dst.desc.constraint in OutputReg, $dst.repr - doAssert dst.desc.rm in {Reg, ElemsInReg}, "The destination operand must be a register: " & $dst.repr + doAssert dst.desc.rm in {Reg, ElemsInReg}+SpecificRegisters, "The destination operand must be a register: " & $dst.repr a.codeFragment("adox", src, dst) a.areFlagsClobbered = true diff --git a/constantine/primitives/static_for.nim b/constantine/primitives/static_for.nim new file mode 100644 index 0000000..af531ee --- /dev/null +++ b/constantine/primitives/static_for.nim @@ -0,0 +1,36 @@ +# Constantine +# Copyright (c) 2018-2019 Status Research & Development GmbH +# Copyright (c) 2020-Present Mamy André-Ratsimbazafy +# Licensed and distributed under either of +# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT). +# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0). +# at your option. This file may not be copied, modified, or distributed except according to those terms. + +import std/macros + +proc replaceNodes(ast: NimNode, what: NimNode, by: NimNode): NimNode = + # Replace "what" ident node by "by" + proc inspect(node: NimNode): NimNode = + case node.kind: + of {nnkIdent, nnkSym}: + if node.eqIdent(what): + return by + return node + of nnkEmpty: + return node + of nnkLiterals: + return node + else: + var rTree = node.kind.newTree() + for child in node: + rTree.add inspect(child) + return rTree + result = inspect(ast) + +macro staticFor*(idx: untyped{nkIdent}, start, stopEx: static int, body: untyped): untyped = + result = newStmtList() + for i in start ..< stopEx: + result.add nnkBlockStmt.newTree( + ident("unrolledIter_" & $idx & $i), + body.replaceNodes(idx, newLit i) + ) diff --git a/constantine/tower_field_extensions/quadratic_extensions.nim b/constantine/tower_field_extensions/quadratic_extensions.nim index 48b5892..48767c7 100644 --- a/constantine/tower_field_extensions/quadratic_extensions.nim +++ b/constantine/tower_field_extensions/quadratic_extensions.nim @@ -84,17 +84,59 @@ func prod_complex(r: var QuadraticExt, a, b: QuadraticExt) = mixin fromComplexExtension static: doAssert r.fromComplexExtension() - var a0b0 {.noInit.}, a1b1 {.noInit.}: typeof(r.c0) - a0b0.prod(a.c0, b.c0) # [1 Mul] - a1b1.prod(a.c1, b.c1) # [2 Mul] + # TODO: GCC is adding an unexplainable 30 cycles tax to this function (~10% slow down) + # for seemingly no reason - r.c0.sum(a.c0, a.c1) # r0 = (a0 + a1) # [2 Mul, 1 Add] - r.c1.sum(b.c0, b.c1) # r1 = (b0 + b1) # [2 Mul, 2 Add] - r.c1 *= r.c0 # r1 = (b0 + b1)(a0 + a1) # [3 Mul, 2 Add] - 𝔽p temporary + when true: # Single-width implementation + # Clang 330 cycles on i9-9980XE @4.1 GHz + var a0b0 {.noInit.}, a1b1 {.noInit.}: typeof(r.c0) + a0b0.prod(a.c0, b.c0) # [1 Mul] + a1b1.prod(a.c1, b.c1) # [2 Mul] - r.c0.diff(a0b0, a1b1) # r0 = a0 b0 - a1 b1 # [3 Mul, 2 Add, 1 Sub] - r.c1 -= a0b0 # r1 = (b0 + b1)(a0 + a1) - a0b0 # [3 Mul, 2 Add, 2 Sub] - r.c1 -= a1b1 # r1 = (b0 + b1)(a0 + a1) - a0b0 - a1b1 # [3 Mul, 2 Add, 3 Sub] + r.c0.sum(a.c0, a.c1) # r0 = (a0 + a1) # [2 Mul, 1 Add] + r.c1.sum(b.c0, b.c1) # r1 = (b0 + b1) # [2 Mul, 2 Add] + r.c1 *= r.c0 # r1 = (b0 + b1)(a0 + a1) # [3 Mul, 2 Add] - 𝔽p temporary + + r.c0.diff(a0b0, a1b1) # r0 = a0 b0 - a1 b1 # [3 Mul, 2 Add, 1 Sub] + r.c1 -= a0b0 # r1 = (b0 + b1)(a0 + a1) - a0b0 # [3 Mul, 2 Add, 2 Sub] + r.c1 -= a1b1 # r1 = (b0 + b1)(a0 + a1) - 
a0b0 - a1b1 # [3 Mul, 2 Add, 3 Sub] + + else: # Double-width implementation with lazy reduction + # Deactivated for now Clang 360 cycles on i9-9980XE @4.1 GHz + var a0b0 {.noInit.}, a1b1 {.noInit.}: doubleWidth(typeof(r.c0)) + var d {.noInit.}: doubleWidth(typeof(r.c0)) + const msbSet = r.c0.typeof.C.canUseNoCarryMontyMul() + + a0b0.mulNoReduce(a.c0, b.c0) # 44 cycles - cumul 44 + a1b1.mulNoReduce(a.c1, b.c1) # 44 cycles - cumul 88 + when msbSet: + r.c0.sum(a.c0, a.c1) + r.c1.sum(b.c0, b.c1) + else: + r.c0.sumNoReduce(a.c0, a.c1) # 5 cycles - cumul 93 + r.c1.sumNoReduce(b.c0, b.c1) # 5 cycles - cumul 98 + d.mulNoReduce(r.c0, r.c1) # 44 cycles - cumul 142 + when msbSet: + d -= a0b0 + d -= a1b1 + else: + d.diffNoReduce(d, a0b0) # 10 cycles - cumul 152 + d.diffNoReduce(d, a1b1) # 10 cycles - cumul 162 + a0b0.diff(a0b0, a1b1) # 18 cycles - cumul 170 + r.c0.reduce(a0b0) # 68 cycles - cumul 248 + r.c1.reduce(d) # 68 cycles - cumul 316 + + # Single-width [3 Mul, 2 Add, 3 Sub] + # 3*81 + 2*14 + 3*12 = 307 theoretical cycles + # 330 measured + # Double-Width + # 316 theoretical cycles + # 365 measured + # Reductions can be 2x10 faster using MCL algorithm + # but there are still unexplained 50 cycles diff between theo and measured + # and unexplained 30 cycles between Clang and GCC + # - Function calls? + # - push/pop stack? # Commutative ring implementation for generic quadratic extension fields # ------------------------------------------------------------------- diff --git a/constantine/tower_field_extensions/tower_common.nim b/constantine/tower_field_extensions/tower_common.nim index 88716cd..c05f01d 100644 --- a/constantine/tower_field_extensions/tower_common.nim +++ b/constantine/tower_field_extensions/tower_common.nim @@ -154,6 +154,19 @@ func diff*(r: var CubicExt, a, b: CubicExt) = r.c1.diff(a.c1, b.c1) r.c2.diff(a.c2, b.c2) +func diffAlias*(r: var QuadraticExt, a, b: QuadraticExt) = + ## Diff ``a`` and ``b`` into ``r`` + ## Handles r and b aliasing + r.c0.diffAlias(a.c0, b.c0) + r.c1.diffAlias(a.c1, b.c1) + +func diffAlias*(r: var CubicExt, a, b: CubicExt) = + ## Diff ``a`` and ``b`` into ``r`` + ## Handles r and b aliasing + r.c0.diffAlias(a.c0, b.c0) + r.c1.diffAlias(a.c1, b.c1) + r.c2.diffAlias(a.c2, b.c2) + # Multiplication by a small integer known at compile-time # ------------------------------------------------------------------- diff --git a/tests/t_bigints.nim b/tests/t_bigints.nim index 7136650..be43128 100644 --- a/tests/t_bigints.nim +++ b/tests/t_bigints.nim @@ -145,6 +145,7 @@ proc mainArith() = discard a.add(SecretWord 1) check: bool(a == expected) +proc mainMul() = suite "Multi-precision multiplication" & " [" & $WordBitwidth & "-bit mode]": test "Same size operand into double size result": block: @@ -185,6 +186,7 @@ proc mainArith() = r.prod(b, a) check: bool(r == expected) +proc mainMulHigh() = suite "Multi-precision multiplication keeping only high words" & " [" & $WordBitwidth & "-bit mode]": test "Same size operand into double size result - discard first word": block: @@ -270,6 +272,7 @@ proc mainArith() = r.prod_high_words(b, a, 2) check: bool(r == expected) +proc mainModular() = suite "Modular operations - small modulus" & " [" & $WordBitwidth & "-bit mode]": # Vectors taken from Stint - https://github.com/status-im/nim-stint test "100 mod 13": @@ -619,6 +622,9 @@ proc mainModularInverse() = check: bool(r == expected) mainArith() +mainMul() +mainMulHigh() +mainModular() mainNeg() mainCopySwap() mainModularInverse() diff --git a/tests/t_finite_fields_double_width.nim 
b/tests/t_finite_fields_double_width.nim new file mode 100644 index 0000000..da9951f --- /dev/null +++ b/tests/t_finite_fields_double_width.nim @@ -0,0 +1,97 @@ +# Constantine +# Copyright (c) 2018-2019 Status Research & Development GmbH +# Copyright (c) 2020-Present Mamy André-Ratsimbazafy +# Licensed and distributed under either of +# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT). +# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0). +# at your option. This file may not be copied, modified, or distributed except according to those terms. + +import + # Standard library + std/[unittest, times], + # Internal + ../constantine/arithmetic, + ../constantine/io/[io_bigints, io_fields], + ../constantine/config/[curves, common, type_bigint], + # Test utilities + ../helpers/prng_unsafe + +const Iters = 128 + +var rng: RngState +let seed = uint32(getTime().toUnix() and (1'i64 shl 32 - 1)) # unixTime mod 2^32 +rng.seed(seed) +echo "\n------------------------------------------------------\n" +echo "test_finite_fields_double_width xoshiro512** seed: ", seed + +proc randomCurve(C: static Curve) = + let a = rng.random_unsafe(Fp[C]) + let b = rng.random_unsafe(Fp[C]) + + var r_fp, r_fpDbl: Fp[C] + var tmpDbl: FpDbl[C] + + r_fp.prod(a, b) + tmpDbl.mulNoReduce(a, b) + r_fpDbl.reduce(tmpDbl) + + doAssert bool(r_fp == r_fpDbl) + +proc randomHighHammingWeight(C: static Curve) = + let a = rng.random_highHammingWeight(Fp[C]) + let b = rng.random_highHammingWeight(Fp[C]) + + var r_fp, r_fpDbl: Fp[C] + var tmpDbl: FpDbl[C] + + r_fp.prod(a, b) + tmpDbl.mulNoReduce(a, b) + r_fpDbl.reduce(tmpDbl) + + doAssert bool(r_fp == r_fpDbl) + +proc random_long01Seq(C: static Curve) = + let a = rng.random_long01Seq(Fp[C]) + let b = rng.random_long01Seq(Fp[C]) + + var r_fp, r_fpDbl: Fp[C] + var tmpDbl: FpDbl[C] + + r_fp.prod(a, b) + tmpDbl.mulNoReduce(a, b) + r_fpDbl.reduce(tmpDbl) + + doAssert bool(r_fp == r_fpDbl) + +suite "Field Multiplication via double-width field elements is consistent with single-width." & " [" & $WordBitwidth & "-bit mode]": + test "With P-224 field modulus": + for _ in 0 ..< Iters: + randomCurve(P224) + for _ in 0 ..< Iters: + randomHighHammingWeight(P224) + for _ in 0 ..< Iters: + random_long01Seq(P224) + + test "With P-256 field modulus": + for _ in 0 ..< Iters: + randomCurve(P256) + for _ in 0 ..< Iters: + randomHighHammingWeight(P256) + for _ in 0 ..< Iters: + random_long01Seq(P256) + + test "With BN254_Snarks field modulus": + for _ in 0 ..< Iters: + randomCurve(BN254_Snarks) + for _ in 0 ..< Iters: + randomHighHammingWeight(BN254_Snarks) + for _ in 0 ..< Iters: + random_long01Seq(BN254_Snarks) + + test "With BLS12_381 field modulus": + for _ in 0 ..< Iters: + randomCurve(BLS12_381) + for _ in 0 ..< Iters: + randomHighHammingWeight(BLS12_381) + for _ in 0 ..< Iters: + random_long01Seq(BLS12_381) diff --git a/tests/t_precomputed.nim b/tests/t_precomputed.nim index 01c4591..0518937 100644 --- a/tests/t_precomputed.nim +++ b/tests/t_precomputed.nim @@ -25,7 +25,7 @@ proc checkCubeRootOfUnity(curve: static Curve) = test $curve & " cube root of unity (mod r)": var cru: BigInt[3 * curve.getCurveOrderBitwidth()] cru.prod(curve.getCubicRootOfUnity_mod_r(), curve.getCubicRootOfUnity_mod_r()) - cru.prod(cru, curve.getCubicRootOfUnity_mod_r()) + cru.mul(curve.getCubicRootOfUnity_mod_r()) var r: BigInt[curve.getCurveOrderBitwidth()] r.reduce(cru, curve.getCurveOrder)
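# ------------------------------------------------------------------------------
# For illustration only: the property the new t_finite_fields_double_width.nim
# suite above exercises, factored into a single helper. `FpDbl`, `mulNoReduce`
# and `reduce` are the double-width APIs used by the test;
# `checkDblMulConsistency` is a hypothetical name for this sketch only.
proc checkDblMulConsistency[C: static Curve](a, b: Fp[C]): bool =
  var single  {.noInit.}: Fp[C]
  var dbl     {.noInit.}: FpDbl[C]
  var reduced {.noInit.}: Fp[C]
  single.prod(a, b)        # single-width multiplication, result already reduced
  dbl.mulNoReduce(a, b)    # double-width product, reduction deferred
  reduced.reduce(dbl)      # deferred reduction back to a single-width element
  result = bool(single == reduced)

# Usage sketch (with an RNG seeded as in the test file):
#   doAssert checkDblMulConsistency(rng.random_unsafe(Fp[BLS12_381]),
#                                   rng.random_unsafe(Fp[BLS12_381]))
# ------------------------------------------------------------------------------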