FpDbl revisited (#144) - 7% perf improvement everywhere, up to 30% in double-width primitives

* reorg mul -> limbs_double_width, rename ConstantineASM -> CttASM

* Implement a specialized scalar squaring path (22% faster than mul)

* Implement "portable" assembly for squaring

* stash part of the changes

* Reorg Montgomery reduction - prepare to introduce Comba optimization

* Implement Comba Montgomery reduce (but it's slower!)

* rename t -> a

* 30% performance improvement by avoiding toOpenArray!

* variable renaming

* Fix 32-bit imports

* slightly better assembly for sub2x

* There is an annoying bottleneck

* use out-of-place Fp assembly instead of in-place

* diffAlias is unneeded now

* cosmetic

* speedup fpDbl sub by 20%

* Fix Fp2 -> Fp6 -> Fp12 towering. It seems 5% faster

* Stash ADCX/ADOX squaring
Mamy Ratsimbazafy 2021-02-01 03:52:27 +01:00 committed by GitHub
parent d12d5faf21
commit 83dcd988b3
28 changed files with 1046 additions and 468 deletions


@ -42,7 +42,7 @@ generated incorrect add-with-carry code.
On x86-64, inline assembly is used to workaround compilers having issues optimizing large integer arithmetic,
and also ensure constant-time code.
This can be deactivated with `"-d:ConstantineASM=false"`:
This can be deactivated with `"-d:CttASM=false"`:
- at a significant performance cost with GCC (~50% slower than Clang).
- at a missed opportunity on recent CPUs that support MULX/ADCX/ADOX instructions (~60% faster than Clang).
- There is a 2.4x perf ratio between using plain GCC vs GCC with inline assembly.


@ -172,6 +172,12 @@ proc mul2xBench*(rLen, aLen, bLen: static int, iters: int) =
bench("Multiplication", $rLen & " <- " & $aLen & " x " & $bLen, iters):
r.prod(a, b)
proc square2xBench*(rLen, aLen: static int, iters: int) =
var r: BigInt[rLen]
let a = rng.random_unsafe(BigInt[aLen])
bench("Squaring", $rLen & " <- " & $aLen & "²", iters):
r.square(a)
proc reduce2x*(T: typedesc, iters: int) =
var r: T
var t: doubleWidth(T)
@ -189,6 +195,7 @@ proc main() =
diff2x(Fp[BLS12_381], iters = 10_000_000)
diff2xNoReduce(Fp[BLS12_381], iters = 10_000_000)
mul2xBench(768, 384, 384, iters = 10_000_000)
square2xBench(768, 384, iters = 10_000_000)
reduce2x(Fp[BLS12_381], iters = 10_000_000)
separator()


@ -232,7 +232,7 @@ proc buildBench(benchName: string, compiler = "", useAsm = true, run = false) =
if compiler != "":
cc = "--cc:" & compiler
if not useAsm:
cc &= " -d:ConstantineASM=false"
cc &= " -d:CttASM=false"
exec "nim c " & cc &
" -d:danger --verbosity:0 -o:build/bench/" & benchName & "_" & compiler & "_" & (if useAsm: "useASM" else: "noASM") &
" --nimcache:nimcache/" & benchName & "_" & compiler & "_" & (if useAsm: "useASM" else: "noASM") &
@ -246,7 +246,7 @@ proc runTests(requireGMP: bool, dumpCmdFile = false, test32bit = false, testASM
if not(td.useGMP and not requireGMP):
var flags = ""
if not testASM:
flags &= " -d:ConstantineASM=false"
flags &= " -d:CttASM=false"
if test32bit:
flags &= " -d:Constantine32"
if td.path in useDebug:


@ -27,58 +27,67 @@ static: doAssert UseASM_X86_64
# Subtraction
# ------------------------------------------------------------
macro sub2x_gen[N: static int](a: var Limbs[N], b: Limbs[N], M: Limbs[N div 2]): untyped =
macro sub2x_gen[N: static int](R: var Limbs[N], A, B: Limbs[N], m: Limbs[N div 2]): untyped =
## Generate an optimized out-of-place double-width subtraction kernel
result = newStmtList()
var ctx = init(Assembler_x86, BaseType)
let
N2 = N div 2
H = N div 2
arrA = init(OperandArray, nimSymbol = a, N, PointerInReg, InputOutput)
# We reuse the reg used for B for overflow detection
arrB = init(OperandArray, nimSymbol = b, N, PointerInReg, InputOutput)
# We could force M as immediate by specializing per moduli
arrM = init(OperandArray, nimSymbol = M, N, PointerInReg, Input)
r = init(OperandArray, nimSymbol = R, N, PointerInReg, InputOutput)
# We reuse the reg used for b for overflow detection
b = init(OperandArray, nimSymbol = B, N, PointerInReg, InputOutput)
# We could force m as immediate by specializing per moduli
M = init(OperandArray, nimSymbol = m, N, PointerInReg, Input)
# If N is too big, we need to spill registers. TODO.
arrT = init(OperandArray, nimSymbol = ident"t", N2, ElemsInReg, Output_EarlyClobber)
arrTadd = init(OperandArray, nimSymbol = ident"tadd", N2, ElemsInReg, Output_EarlyClobber)
u = init(OperandArray, nimSymbol = ident"U", H, ElemsInReg, InputOutput)
v = init(OperandArray, nimSymbol = ident"V", H, ElemsInReg, InputOutput)
let usym = u.nimSymbol
let vsym = v.nimSymbol
result.add quote do:
var `usym`{.noinit.}, `vsym` {.noInit.}: typeof(`A`)
staticFor i, 0, `H`:
`usym`[i] = `A`[i]
staticFor i, `H`, `N`:
`vsym`[i-`H`] = `A`[i]
# Subtraction
for i in 0 ..< N:
ctx.mov arrT[i mod N2], arrA[i]
# u = a[0..<H] - b[0..<H], v = a[H..<N]
for i in 0 ..< H:
if i == 0:
ctx.sub arrT[0], arrB[0]
ctx.sub u[0], b[0]
else:
ctx.sbb arrT[i mod N2], arrB[i]
ctx.mov arrA[i], arrT[i mod N2]
# Interleaved copy the modulus to hide SBB latencies
if i < N2:
ctx.mov arrTadd[i], arrM[i]
ctx.sbb u[i], b[i]
# Everything should be hot in cache now so movs are cheap;
# we can interleave 2 of them per SBB
# v = a[H..<N] - b[H..<N], r[0..<H] = u, u = M
for i in H ..< N:
ctx.mov r[i-H], u[i-H]
ctx.sbb v[i-H], b[i]
ctx.mov u[i-H], M[i-H] # TODO, bottleneck 17% perf: prefetch or inline modulus?
# Mask: underflowed contains 0xFFFF or 0x0000
let underflowed = arrB.reuseRegister()
let underflowed = b.reuseRegister()
ctx.sbb underflowed, underflowed
# Now mask the adder, with 0 or the modulus limbs
for i in 0 ..< N2:
ctx.`and` arrTadd[i], underflowed
for i in 0 ..< H:
ctx.`and` u[i], underflowed
# Add the masked modulus
for i in 0 ..< N2:
for i in 0 ..< H:
if i == 0:
ctx.add arrT[0], arrTadd[0]
ctx.add u[0], v[0]
else:
ctx.adc arrT[i], arrTadd[i]
ctx.mov arrA[i+N2], arrT[i]
ctx.adc u[i], v[i]
ctx.mov r[i+H], u[i]
let t = arrT.nimSymbol
let tadd = arrTadd.nimSymbol
result.add quote do:
var `t`{.noinit.}, `tadd` {.noInit.}: typeof(`a`)
result.add ctx.generate
func sub2x_asm*[N: static int](a: var Limbs[N], b: Limbs[N], M: Limbs[N div 2]) =
func sub2x_asm*[N: static int](r: var Limbs[N], a, b: Limbs[N], M: Limbs[N div 2]) =
## Constant-time double-width subtraction
sub2x_gen(a, b, M)
sub2x_gen(r, a, b, M)
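For intuition, here is a toy model of what sub2x computes (plain Nim with an 8-bit limb and illustrative values, not the Constantine API). Operands are double-width and only partially reduced, below M*B^H, so an underflow is repaired by adding the modulus to the high half only:

const B = 256'u32                       # limb base, H = 1 high limb
const M = 251'u32                       # modulus, M < B
let a = 5'u32
let b = 10'u32
var r = (a - b) and (B*B - 1)           # subtraction mod B², borrow dropped
if a < b:                               # the kernel does this branchlessly
  r = (r + M*B) and (B*B - 1)           # add M shifted into the high limb
doAssert r mod M == (a + M - b) mod M   # r ≡ a - b (mod M)
doAssert r < M*B                        # still partially reduced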


@ -31,10 +31,10 @@ static: doAssert UseASM_X86_64
# Field addition
# ------------------------------------------------------------
macro addmod_gen[N: static int](a: var Limbs[N], b, M: Limbs[N]): untyped =
macro addmod_gen[N: static int](R: var Limbs[N], A, B, m: Limbs[N]): untyped =
## Generate an optimized modular addition kernel
# Register pressure note:
# We could generate a kernel per modulus M by hardocing it as immediate
# We could generate a kernel per modulus m by hardcoding it as immediate
# however this requires
# - duplicating the kernel and also
# - 64-bit immediate encoding is quite large
@ -43,64 +43,66 @@ macro addmod_gen[N: static int](a: var Limbs[N], b, M: Limbs[N]): untyped =
var ctx = init(Assembler_x86, BaseType)
let
arrA = init(OperandArray, nimSymbol = a, N, PointerInReg, InputOutput)
# We reuse the reg used for B for overflow detection
arrB = init(OperandArray, nimSymbol = b, N, PointerInReg, InputOutput)
# We could force M as immediate by specializing per moduli
arrM = init(OperandArray, nimSymbol = M, N, PointerInReg, Input)
r = init(OperandArray, nimSymbol = R, N, PointerInReg, InputOutput)
# We reuse the reg used for b for overflow detection
b = init(OperandArray, nimSymbol = B, N, PointerInReg, InputOutput)
# We could force m as immediate by specializing per moduli
M = init(OperandArray, nimSymbol = m, N, PointerInReg, Input)
# If N is too big, we need to spill registers. TODO.
arrT = init(OperandArray, nimSymbol = ident"t", N, ElemsInReg, Output_EarlyClobber)
arrTsub = init(OperandArray, nimSymbol = ident"tsub", N, ElemsInReg, Output_EarlyClobber)
u = init(OperandArray, nimSymbol = ident"u", N, ElemsInReg, InputOutput)
v = init(OperandArray, nimSymbol = ident"v", N, ElemsInReg, Output_EarlyClobber)
let usym = u.nimSymbol
let vsym = v.nimSymbol
result.add quote do:
var `usym`{.noinit.}, `vsym` {.noInit.}: typeof(`A`)
staticFor i, 0, `N`:
`usym`[i] = `A`[i]
# Addition
for i in 0 ..< N:
ctx.mov arrT[i], arrA[i]
if i == 0:
ctx.add arrT[0], arrB[0]
ctx.add u[0], b[0]
else:
ctx.adc arrT[i], arrB[i]
ctx.adc u[i], b[i]
# Interleaved copy in a second buffer as well
ctx.mov arrTsub[i], arrT[i]
ctx.mov v[i], u[i]
# Mask: overflowed contains 0xFFFF or 0x0000
# TODO: unnecessary if MSB never set, i.e. "canUseNoCarryMontyMul"
let overflowed = arrB.reuseRegister()
let overflowed = b.reuseRegister()
ctx.sbb overflowed, overflowed
# Now substract the modulus
for i in 0 ..< N:
if i == 0:
ctx.sub arrTsub[0], arrM[0]
ctx.sub v[0], M[0]
else:
ctx.sbb arrTsub[i], arrM[i]
ctx.sbb v[i], M[i]
# If it overflows here, it means that it was
# smaller than the modulus and we don't need arrTsub
# smaller than the modulus and we don't need v
ctx.sbb overflowed, 0
# Conditional mov and store result
for i in 0 ..< N:
ctx.cmovnc arrT[i], arrTsub[i]
ctx.mov arrA[i], arrT[i]
ctx.cmovnc u[i], v[i]
ctx.mov r[i], u[i]
let t = arrT.nimSymbol
let tsub = arrTsub.nimSymbol
result.add quote do:
var `t`{.noinit.}, `tsub` {.noInit.}: typeof(`a`)
result.add ctx.generate
func addmod_asm*(a: var Limbs, b, M: Limbs) =
func addmod_asm*(r: var Limbs, a, b, m: Limbs) =
## Constant-time modular addition
addmod_gen(a, b, M)
addmod_gen(r, a, b, m)
# Field subtraction
# ------------------------------------------------------------
macro submod_gen[N: static int](a: var Limbs[N], b, M: Limbs[N]): untyped =
macro submod_gen[N: static int](R: var Limbs[N], A, B, m: Limbs[N]): untyped =
## Generate an optimized modular subtraction kernel
# Register pressure note:
# We could generate a kernel per modulus M by hardocing it as immediate
# We could generate a kernel per modulus m by hardcoding it as immediate
# however this requires
# - duplicating the kernel and also
# - 64-bit immediate encoding is quite large
@ -109,119 +111,121 @@ macro submod_gen[N: static int](a: var Limbs[N], b, M: Limbs[N]): untyped =
var ctx = init(Assembler_x86, BaseType)
let
arrA = init(OperandArray, nimSymbol = a, N, PointerInReg, InputOutput)
# We reuse the reg used for B for overflow detection
arrB = init(OperandArray, nimSymbol = b, N, PointerInReg, InputOutput)
# We could force M as immediate by specializing per moduli
arrM = init(OperandArray, nimSymbol = M, N, PointerInReg, Input)
r = init(OperandArray, nimSymbol = R, N, PointerInReg, InputOutput)
# We reuse the reg used for b for overflow detection
b = init(OperandArray, nimSymbol = B, N, PointerInReg, InputOutput)
# We could force m as immediate by specializing per moduli
M = init(OperandArray, nimSymbol = m, N, PointerInReg, Input)
# If N is too big, we need to spill registers. TODO.
arrT = init(OperandArray, nimSymbol = ident"t", N, ElemsInReg, Output_EarlyClobber)
arrTadd = init(OperandArray, nimSymbol = ident"tadd", N, ElemsInReg, Output_EarlyClobber)
u = init(OperandArray, nimSymbol = ident"U", N, ElemsInReg, InputOutput)
v = init(OperandArray, nimSymbol = ident"V", N, ElemsInReg, Output_EarlyClobber)
let usym = u.nimSymbol
let vsym = v.nimSymbol
result.add quote do:
var `usym`{.noinit.}, `vsym` {.noInit.}: typeof(`A`)
staticFor i, 0, `N`:
`usym`[i] = `A`[i]
# Subtraction
for i in 0 ..< N:
ctx.mov arrT[i], arrA[i]
if i == 0:
ctx.sub arrT[0], arrB[0]
ctx.sub u[0], b[0]
else:
ctx.sbb arrT[i], arrB[i]
ctx.sbb u[i], b[i]
# Interleaved copy the modulus to hide SBB latencies
ctx.mov arrTadd[i], arrM[i]
ctx.mov v[i], M[i]
# Mask: underflowed contains 0xFFFF or 0x0000
let underflowed = arrB.reuseRegister()
let underflowed = b.reuseRegister()
ctx.sbb underflowed, underflowed
# Now mask the adder, with 0 or the modulus limbs
for i in 0 ..< N:
ctx.`and` arrTadd[i], underflowed
ctx.`and` v[i], underflowed
# Add the masked modulus
for i in 0 ..< N:
if i == 0:
ctx.add arrT[0], arrTadd[0]
ctx.add u[0], v[0]
else:
ctx.adc arrT[i], arrTadd[i]
ctx.mov arrA[i], arrT[i]
ctx.adc u[i], v[i]
ctx.mov r[i], u[i]
let t = arrT.nimSymbol
let tadd = arrTadd.nimSymbol
result.add quote do:
var `t`{.noinit.}, `tadd` {.noInit.}: typeof(`a`)
result.add ctx.generate
func submod_asm*(a: var Limbs, b, M: Limbs) =
func submod_asm*(r: var Limbs, a, b, M: Limbs) =
## Constant-time modular subtraction
## Warning, does not handle aliasing of a and b
submod_gen(a, b, M)
submod_gen(r, a, b, M)
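The kernel above is branchless: the borrow of a - b is expanded into an all-zero/all-one mask that selects between adding 0 or adding the modulus, so no branch ever depends on secret data. A single-word model of the trick (plain Nim, illustrative values):

const M = 251'u32
let a = 5'u32
let b = 10'u32
let d = a - b                      # wraps on underflow
let borrow = uint32(a < b)         # 0 or 1 (asm: sbb x, x yields the mask directly)
let mask = 0'u32 - borrow          # 0x00000000 or 0xFFFFFFFF
let r = d + (M and mask)           # add M only when we underflowed
doAssert r == 246                  # 246 ≡ 5 - 10 (mod 251)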
# Field negation
# ------------------------------------------------------------
macro negmod_gen[N: static int](r: var Limbs[N], a, M: Limbs[N]): untyped =
macro negmod_gen[N: static int](R: var Limbs[N], A, m: Limbs[N]): untyped =
## Generate an optimized modular negation kernel
result = newStmtList()
var ctx = init(Assembler_x86, BaseType)
let
arrA = init(OperandArray, nimSymbol = a, N, PointerInReg, Input)
arrR = init(OperandArray, nimSymbol = r, N, PointerInReg, InputOutput)
arrT = init(OperandArray, nimSymbol = ident"t", N, ElemsInReg, Output_EarlyClobber)
# We could force M as immediate by specializing per moduli
# We reuse the reg used for M for overflow detection
arrM = init(OperandArray, nimSymbol = M, N, PointerInReg, InputOutput)
a = init(OperandArray, nimSymbol = A, N, PointerInReg, Input)
r = init(OperandArray, nimSymbol = R, N, PointerInReg, InputOutput)
u = init(OperandArray, nimSymbol = ident"U", N, ElemsInReg, Output_EarlyClobber)
# We could force m as immediate by specializing per moduli
# We reuse the reg used for m for overflow detection
M = init(OperandArray, nimSymbol = m, N, PointerInReg, InputOutput)
# Subtraction M - a
# Subtraction m - a
for i in 0 ..< N:
ctx.mov arrT[i], arrM[i]
ctx.mov u[i], M[i]
if i == 0:
ctx.sub arrT[0], arrA[0]
ctx.sub u[0], a[0]
else:
ctx.sbb arrT[i], arrA[i]
ctx.sbb u[i], a[i]
# Deal with a == 0
let isZero = arrM.reuseRegister()
ctx.mov isZero, arrA[0]
let isZero = M.reuseRegister()
ctx.mov isZero, a[0]
for i in 1 ..< N:
ctx.`or` isZero, arrA[i]
ctx.`or` isZero, a[i]
# Zero result if a == 0
for i in 0 ..< N:
ctx.cmovz arrT[i], isZero
ctx.mov arrR[i], arrT[i]
ctx.cmovz u[i], isZero
ctx.mov r[i], u[i]
let t = arrT.nimSymbol
let usym = u.nimSymbol
result.add quote do:
var `t`{.noinit.}: typeof(`a`)
var `usym`{.noinit.}: typeof(`A`)
result.add ctx.generate
func negmod_asm*(r: var Limbs, a, M: Limbs) =
func negmod_asm*(r: var Limbs, a, m: Limbs) =
## Constant-time modular negation
negmod_gen(r, a, M)
negmod_gen(r, a, m)
# Sanity checks
# ----------------------------------------------------------
when isMainModule:
import ../config/type_bigint, algorithm, strutils
import ../../config/type_bigint, algorithm, strutils
proc mainAdd() =
var a = [SecretWord 0xE3DF60E8F6D0AF9A'u64, SecretWord 0x7B2665C2258A7625'u64, SecretWord 0x68FC9A1D0977C8E0'u64, SecretWord 0xF3DC61ED7DE76883'u64]
var b = [SecretWord 0x78E9C2EF58BB6B78'u64, SecretWord 0x547F65BD19014254'u64, SecretWord 0x556A115819EAD4B5'u64, SecretWord 0x8CA844A546935DC3'u64]
var M = [SecretWord 0xFFFFFFFF00000001'u64, SecretWord 0x0000000000000000'u64, SecretWord 0x00000000FFFFFFFF'u64, SecretWord 0xFFFFFFFFFFFFFFFF'u64]
var m = [SecretWord 0xFFFFFFFF00000001'u64, SecretWord 0x0000000000000000'u64, SecretWord 0x00000000FFFFFFFF'u64, SecretWord 0xFFFFFFFFFFFFFFFF'u64]
var s = "0x5cc923d94f8c1b11cfa5cb7f3e8bb879be66ab7423629d968084a692c47ac647"
a.reverse()
b.reverse()
M.reverse()
m.reverse()
debugecho "--------------------------------"
debugecho "before:"
debugecho " a: ", a.toHex()
debugecho " b: ", b.toHex()
debugecho " m: ", M.toHex()
addmod_asm(a, b, M)
debugecho " m: ", m.toHex()
addmod_asm(a, a, b, m)
debugecho "after:"
debugecho " a: ", a.toHex().tolower
debugecho " s: ", s
@ -229,19 +233,19 @@ when isMainModule:
a = [SecretWord 0x00935a991ca215a6'u64, SecretWord 0x5fbdac6294679337'u64, SecretWord 0x1e41793877b80f12'u64, SecretWord 0x5724cd93cb32932d'u64]
b = [SecretWord 0x19dd4ecfda64ef80'u64, SecretWord 0x92deeb1532169c3d'u64, SecretWord 0x69ce4ee28421cd30'u64, SecretWord 0x4d90ab5a40295321'u64]
M = [SecretWord 0x2523648240000001'u64, SecretWord 0xba344d8000000008'u64, SecretWord 0x6121000000000013'u64, SecretWord 0xa700000000000013'u64]
m = [SecretWord 0x2523648240000001'u64, SecretWord 0xba344d8000000008'u64, SecretWord 0x6121000000000013'u64, SecretWord 0xa700000000000013'u64]
s = "0x1a70a968f7070526f29c9777c67e2f74880fc81afbd9dc42a4b578ee0b5be64e"
a.reverse()
b.reverse()
M.reverse()
m.reverse()
debugecho "--------------------------------"
debugecho "before:"
debugecho " a: ", a.toHex()
debugecho " b: ", b.toHex()
debugecho " m: ", M.toHex()
addmod_asm(a, b, M)
debugecho " m: ", m.toHex()
addmod_asm(a, a, b, m)
debugecho "after:"
debugecho " a: ", a.toHex().tolower
debugecho " s: ", s
@ -249,19 +253,19 @@ when isMainModule:
a = [SecretWord 0x1c7d810f37fc6e0b'u64, SecretWord 0xb91aba4ce339cea3'u64, SecretWord 0xd9f5571ccc4dfd1a'u64, SecretWord 0xf5906ee9df91f554'u64]
b = [SecretWord 0x18394ffe94874c9f'u64, SecretWord 0x6e8a8ad032fc5f15'u64, SecretWord 0x7533a2b46b7e9530'u64, SecretWord 0x2849996b4bb61b48'u64]
M = [SecretWord 0x2523648240000001'u64, SecretWord 0xba344d8000000008'u64, SecretWord 0x6121000000000013'u64, SecretWord 0xa700000000000013'u64]
m = [SecretWord 0x2523648240000001'u64, SecretWord 0xba344d8000000008'u64, SecretWord 0x6121000000000013'u64, SecretWord 0xa700000000000013'u64]
s = "0x0f936c8b8c83baa96d70f79d16362db0ee07f9d137cc923776da08552b481089"
a.reverse()
b.reverse()
M.reverse()
m.reverse()
debugecho "--------------------------"
debugecho "before:"
debugecho " a: ", a.toHex()
debugecho " b: ", b.toHex()
debugecho " m: ", M.toHex()
addmod_asm(a, b, M)
debugecho " m: ", m.toHex()
addmod_asm(a, a, b, m)
debugecho "after:"
debugecho " a: ", a.toHex().tolower
debugecho " s: ", s
@ -269,19 +273,19 @@ when isMainModule:
a = [SecretWord 0xe9d55643'u64, SecretWord 0x580ec4cc3f91cef3'u64, SecretWord 0x11ecbb7d35b36449'u64, SecretWord 0x35535ca31c5dc2ba'u64]
b = [SecretWord 0x97f7ed94'u64, SecretWord 0xbad96eb98204a622'u64, SecretWord 0xbba94400f9a061d6'u64, SecretWord 0x60d3521a0d3dd9eb'u64]
M = [SecretWord 0xffffffff'u64, SecretWord 0xffffffffffffffff'u64, SecretWord 0xffffffff00000000'u64, SecretWord 0x0000000000000001'u64]
m = [SecretWord 0xffffffff'u64, SecretWord 0xffffffffffffffff'u64, SecretWord 0xffffffff00000000'u64, SecretWord 0x0000000000000001'u64]
s = "0x0000000081cd43d812e83385c1967515cd95ff7f2f53c61f9626aebd299b9ca4"
a.reverse()
b.reverse()
M.reverse()
m.reverse()
debugecho "--------------------------"
debugecho "before:"
debugecho " a: ", a.toHex()
debugecho " b: ", b.toHex()
debugecho " m: ", M.toHex()
addmod_asm(a, b, M)
debugecho " m: ", m.toHex()
addmod_asm(a, a, b, m)
debugecho "after:"
debugecho " a: ", a.toHex().tolower
debugecho " s: ", s
@ -292,22 +296,47 @@ when isMainModule:
proc mainSub() =
var a = [SecretWord 0xf9c32e89b80b17bd'u64, SecretWord 0xdbd3069d4ca0e1c3'u64, SecretWord 0x980d4c70d39d5e17'u64, SecretWord 0xd9f0252845f18c3a'u64]
var b = [SecretWord 0x215075604bfd64de'u64, SecretWord 0x36dc488149fc5d3e'u64, SecretWord 0x91fff665385d20fd'u64, SecretWord 0xe980a5a203b43179'u64]
var M = [SecretWord 0xFFFFFFFFFFFFFFFF'u64, SecretWord 0xFFFFFFFFFFFFFFFF'u64, SecretWord 0xFFFFFFFFFFFFFFFF'u64, SecretWord 0xFFFFFFFEFFFFFC2F'u64]
var m = [SecretWord 0xFFFFFFFFFFFFFFFF'u64, SecretWord 0xFFFFFFFFFFFFFFFF'u64, SecretWord 0xFFFFFFFFFFFFFFFF'u64, SecretWord 0xFFFFFFFEFFFFFC2F'u64]
var s = "0xd872b9296c0db2dfa4f6be1c02a48485060d560b9b403d19f06f7f86423d5ac1"
a.reverse()
b.reverse()
M.reverse()
m.reverse()
debugecho "--------------------------------"
debugecho "before:"
debugecho " a: ", a.toHex()
debugecho " b: ", b.toHex()
debugecho " m: ", M.toHex()
submod_asm(a, b, M)
debugecho " m: ", m.toHex()
submod_asm(a, a, b, m)
debugecho "after:"
debugecho " a: ", a.toHex().tolower
debugecho " s: ", s
debugecho " ok: ", a.toHex().tolower == s
mainSub()
proc mainSubOutplace() =
var a = [SecretWord 0xf9c32e89b80b17bd'u64, SecretWord 0xdbd3069d4ca0e1c3'u64, SecretWord 0x980d4c70d39d5e17'u64, SecretWord 0xd9f0252845f18c3a'u64]
var b = [SecretWord 0x215075604bfd64de'u64, SecretWord 0x36dc488149fc5d3e'u64, SecretWord 0x91fff665385d20fd'u64, SecretWord 0xe980a5a203b43179'u64]
var m = [SecretWord 0xFFFFFFFFFFFFFFFF'u64, SecretWord 0xFFFFFFFFFFFFFFFF'u64, SecretWord 0xFFFFFFFFFFFFFFFF'u64, SecretWord 0xFFFFFFFEFFFFFC2F'u64]
var s = "0xd872b9296c0db2dfa4f6be1c02a48485060d560b9b403d19f06f7f86423d5ac1"
a.reverse()
b.reverse()
m.reverse()
var r: typeof(a)
debugecho "--------------------------------"
debugecho "before:"
debugecho " a: ", a.toHex()
debugecho " b: ", b.toHex()
debugecho " m: ", m.toHex()
submod_asm(r, a, b, m)
debugecho "after:"
debugecho " r: ", r.toHex().tolower
debugecho " s: ", s
debugecho " ok: ", r.toHex().tolower == s
mainSubOutplace()


@ -28,31 +28,31 @@ static: doAssert UseASM_X86_32
proc finalSubNoCarry*(
ctx: var Assembler_x86,
r: Operand or OperandArray,
t, M, scratch: OperandArray
a, M, scratch: OperandArray
) =
## Reduce `t` into `r` modulo `M`
## Reduce `a` into `r` modulo `M`
let N = M.len
ctx.comment "Final substraction (no carry)"
for i in 0 ..< N:
ctx.mov scratch[i], t[i]
ctx.mov scratch[i], a[i]
if i == 0:
ctx.sub scratch[i], M[i]
else:
ctx.sbb scratch[i], M[i]
# If we borrowed it means that we were smaller than
# the modulus and we don't need "scratch"
for i in 0 ..< N:
ctx.cmovnc t[i], scratch[i]
ctx.mov r[i], t[i]
ctx.cmovnc a[i], scratch[i]
ctx.mov r[i], a[i]
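finalSubNoCarry speculatively subtracts the modulus and keeps the result only if the subtraction did not borrow. A plain-Nim model of the selection (the `if` here is variable-time; the kernel uses cmovnc so the choice never branches on secret data):

const M = 251'u32
var a = 260'u32                 # partially reduced input, in [0, 2*M)
let scratch = a - M             # speculative subtraction, wraps if a < M
if a >= M:                      # asm: borrow flag of the SBB chain + cmovnc
  a = scratch
doAssert a == 9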
proc finalSubCanOverflow*(
ctx: var Assembler_x86,
r: Operand or OperandArray,
t, M, scratch: OperandArray,
a, M, scratch: OperandArray,
overflowReg: Operand
) =
## Reduce `t` into `r` modulo `M`
## Reduce `a` into `r` modulo `M`
## To be used when the final subtraction can
## also depend on the carry flag
## This is in particular possible when the MSB
@ -65,7 +65,7 @@ proc finalSubCanOverflow*(
let N = M.len
ctx.comment "Final substraction (may carry)"
for i in 0 ..< N:
ctx.mov scratch[i], t[i]
ctx.mov scratch[i], a[i]
if i == 0:
ctx.sub scratch[i], M[i]
else:
@ -74,10 +74,10 @@ proc finalSubCanOverflow*(
ctx.sbb overflowReg, 0
# If we borrowed it means that we were smaller than
# the modulus and we don't need "scratch"
for i in 0 ..< N:
ctx.cmovnc t[i], scratch[i]
ctx.mov r[i], t[i]
ctx.cmovnc a[i], scratch[i]
ctx.mov r[i], a[i]
# Montgomery reduction
@ -85,7 +85,7 @@ proc finalSubCanOverflow*(
macro montyRed_gen[N: static int](
r_MR: var array[N, SecretWord],
t_MR: array[N*2, SecretWord],
a_MR: array[N*2, SecretWord],
M_MR: array[N, SecretWord],
m0ninv_MR: BaseType,
canUseNoCarryMontyMul: static bool
@ -148,13 +148,13 @@ macro montyRed_gen[N: static int](
# ---------------------------------------------------------
# for i in 0 .. n-1:
# hi <- 0
# m <- t[i] * m0ninv mod 2^w (i.e. simple multiplication)
# m <- a[i] * m0ninv mod 2^w (i.e. simple multiplication)
# for j in 0 .. n-1:
# (hi, lo) <- t[i+j] + m * M[j] + hi
# t[i+j] <- lo
# t[i+n] += hi
# (hi, lo) <- a[i+j] + m * M[j] + hi
# a[i+j] <- lo
# a[i+n] += hi
# for i in 0 .. n-1:
# r[i] = t[i+n]
# r[i] = a[i+n]
# if r >= M:
# r -= M
@ -162,14 +162,15 @@ macro montyRed_gen[N: static int](
doAssert N <= 6, "The Assembly-optimized montgomery multiplication requires at most 6 limbs."
result.add quote do:
`eax` = BaseType `t_MR`[0]
`scratchSym`[1 .. `N`-1] = `t_MR`.toOpenArray(1, `N`-1)
`eax` = BaseType `a_MR`[0]
staticFor i, 0, `N`: # Do NOT use Nim slice/toOpenArray, they are not inlined
`scratchSym`[i] = `a_MR`[i]
ctx.mov scratch[N], rRAX
ctx.imul rRAX, m0ninv # m <- t[i] * m0ninv mod 2^w
ctx.imul rRAX, m0ninv # m <- a[i] * m0ninv mod 2^w
ctx.mov scratch[0], rRAX
# scratch: [t[0] * m0, t[1], t[2], t[3], t[0]] for 4 limbs
# scratch: [a[0] * m0, a[1], a[2], a[3], a[0]] for 4 limbs
for i in 0 ..< N:
ctx.comment ""
@ -217,23 +218,23 @@ macro montyRed_gen[N: static int](
ctx = init(Assembler_x86, BaseType)
let r = init(OperandArray, nimSymbol = r_MR, N, PointerInReg, InputOutput_EnsureClobber)
let t = init(OperandArray, nimSymbol = t_MR, N*2, PointerInReg, Input)
let a = init(OperandArray, nimSymbol = a_MR, N*2, PointerInReg, Input)
let extraRegNeeded = N-2
let tsub = init(OperandArray, nimSymbol = ident"tsub", extraRegNeeded, ElemsInReg, InputOutput_EnsureClobber)
let tsubsym = tsub.nimSymbol
let t = init(OperandArray, nimSymbol = ident"t", extraRegNeeded, ElemsInReg, InputOutput_EnsureClobber)
let tsym = t.nimSymbol
result.add quote do:
var `tsubsym` {.noInit.}: Limbs[`extraRegNeeded`]
var `tsym` {.noInit.}: Limbs[`extraRegNeeded`]
# This does t[i+n] += hi
# This does a[i+n] += hi
# but in a separate carry chain, fused with the
# copy "r[i] = t[i+n]"
# copy "r[i] = a[i+n]"
for i in 0 ..< N:
if i == 0:
ctx.add scratch[i], t[i+N]
ctx.add scratch[i], a[i+N]
else:
ctx.adc scratch[i], t[i+N]
ctx.adc scratch[i], a[i+N]
let reuse = repackRegisters(tsub, scratch[N], scratch[N+1])
let reuse = repackRegisters(t, scratch[N], scratch[N+1])
if canUseNoCarryMontyMul:
ctx.finalSubNoCarry(r, scratch, M, reuse)
@ -245,10 +246,10 @@ macro montyRed_gen[N: static int](
func montRed_asm*[N: static int](
r: var array[N, SecretWord],
t: array[N*2, SecretWord],
a: array[N*2, SecretWord],
M: array[N, SecretWord],
m0ninv: BaseType,
canUseNoCarryMontyMul: static bool
) =
## Constant-time Montgomery reduction
montyRed_gen(r, t, M, m0ninv, canUseNoCarryMontyMul)
montyRed_gen(r, a, M, m0ninv, canUseNoCarryMontyMul)
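The `staticFor` copy above (replacing `toOpenArray`) is where the 30% gain from the commit message comes from: the slice assignment goes through an openArray fat pointer that the C compiler did not inline. A standalone sketch of the two patterns (plain Nim, illustrative size and data):

const N = 4
let a = [1'u64, 2, 3, 4, 5, 6, 7, 8]        # stand-in for the double-width input
var scratch: array[N, uint64]
# Before: slice assignment through toOpenArray (hidden, non-inlined copy)
scratch[0 .. N-1] = a.toOpenArray(0, N-1)
# After: an explicit per-element copy the compiler can fully unroll
# (Constantine's staticFor forces the unroll at compile time)
for i in 0 ..< N:
  scratch[i] = a[i]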


@ -37,7 +37,7 @@ static: doAssert UseASM_X86_64
macro montyRedx_gen[N: static int](
r_MR: var array[N, SecretWord],
t_MR: array[N*2, SecretWord],
a_MR: array[N*2, SecretWord],
M_MR: array[N, SecretWord],
m0ninv_MR: BaseType,
canUseNoCarryMontyMul: static bool
@ -109,13 +109,13 @@ macro montyRedx_gen[N: static int](
# ---------------------------------------------------------
# for i in 0 .. n-1:
# hi <- 0
# m <- t[i] * m0ninv mod 2^w (i.e. simple multiplication)
# m <- a[i] * m0ninv mod 2^w (i.e. simple multiplication)
# for j in 0 .. n-1:
# (hi, lo) <- t[i+j] + m * M[j] + hi
# t[i+j] <- lo
# t[i+n] += hi
# (hi, lo) <- a[i+j] + m * M[j] + hi
# a[i+j] <- lo
# a[i+n] += hi
# for i in 0 .. n-1:
# r[i] = t[i+n]
# r[i] = a[i+n]
# if r >= M:
# r -= M
@ -124,12 +124,13 @@ macro montyRedx_gen[N: static int](
result.add quote do:
`edx` = BaseType(`m0ninv_MR`)
`scratchSym`[0 .. `N`-1] = `t_MR`.toOpenArray(0, `N`-1)
staticFor i, 0, `N`: # Do NOT use Nim slice/toOpenArray, they are not inlined
`scratchSym`[i] = `a_MR`[i]
for i in 0 ..< N:
# RDX contains m0ninv at the start of each loop
ctx.comment ""
ctx.imul rRDX, scratch[0] # m <- t[i] * m0ninv mod 2^w
ctx.imul rRDX, scratch[0] # m <- a[i] * m0ninv mod 2^w
ctx.comment "---- Reduction " & $i
ctx.`xor` scratch[N], scratch[N]
@ -156,23 +157,23 @@ macro montyRedx_gen[N: static int](
ctx = init(Assembler_x86, BaseType)
let r = init(OperandArray, nimSymbol = r_MR, N, PointerInReg, InputOutput_EnsureClobber)
let t = init(OperandArray, nimSymbol = t_MR, N*2, PointerInReg, Input)
let a = init(OperandArray, nimSymbol = a_MR, N*2, PointerInReg, Input)
let extraRegNeeded = N-1
let tsub = init(OperandArray, nimSymbol = ident"tsub", extraRegNeeded, ElemsInReg, InputOutput_EnsureClobber)
let tsubsym = tsub.nimSymbol
let t = init(OperandArray, nimSymbol = ident"t", extraRegNeeded, ElemsInReg, InputOutput_EnsureClobber)
let tsym = t.nimSymbol
result.add quote do:
var `tsubsym` {.noInit.}: Limbs[`extraRegNeeded`]
var `tsym` {.noInit.}: Limbs[`extraRegNeeded`]
# This does t[i+n] += hi
# This does a[i+n] += hi
# but in a separate carry chain, fused with the
# copy "r[i] = t[i+n]"
# copy "r[i] = a[i+n]"
for i in 0 ..< N:
if i == 0:
ctx.add scratch[i], t[i+N]
ctx.add scratch[i], a[i+N]
else:
ctx.adc scratch[i], t[i+N]
ctx.adc scratch[i], a[i+N]
let reuse = repackRegisters(tsub, scratch[N])
let reuse = repackRegisters(t, scratch[N])
if canUseNoCarryMontyMul:
ctx.finalSubNoCarry(r, scratch, M, reuse)
@ -184,10 +185,10 @@ macro montyRedx_gen[N: static int](
func montRed_asm_adx_bmi2*[N: static int](
r: var array[N, SecretWord],
t: array[N*2, SecretWord],
a: array[N*2, SecretWord],
M: array[N, SecretWord],
m0ninv: BaseType,
canUseNoCarryMontyMul: static bool
) =
## Constant-time Montgomery reduction
montyRedx_gen(r, t, M, m0ninv, canUseNoCarryMontyMul)
montyRedx_gen(r, a, M, m0ninv, canUseNoCarryMontyMul)


@ -30,6 +30,9 @@ import
static: doAssert UseASM_X86_64 # Need 8 registers just for mul
# and 32-bit only has 8 max.
# Multiplication
# -----------------------------------------------------------------------------------------------
macro mul_gen[rLen, aLen, bLen: static int](r: var Limbs[rLen], a: Limbs[aLen], b: Limbs[bLen]) =
## Comba multiplication generator
## `a`, `b`, `r` can have a different number of limbs
@ -114,7 +117,12 @@ macro mul_gen[rLen, aLen, bLen: static int](r: var Limbs[rLen], a: Limbs[aLen],
ctx.`xor` v, v
ctx.`xor` t, t
for i in 0 ..< min(aLen+bLen, rLen):
let stopEx = min(aLen+bLen, rLen)
for i in 0 ..< stopEx:
# Invariant for product scanning:
# if we multiply accumulate by a[k1] * b[k2]
# we have k1+k2 == i
let ib = min(bLen-1, i)
let ia = i - ib
for j in 0 ..< min(aLen - ia, ib+1):
@ -127,7 +135,7 @@ macro mul_gen[rLen, aLen, bLen: static int](r: var Limbs[rLen], a: Limbs[aLen],
ctx.mov arrR[i], v
if i != min(aLen+bLen, rLen) - 1:
if i != stopEx - 1:
ctx.mov v, u
ctx.mov u, t
ctx.`xor` t, t
@ -144,3 +152,140 @@ func mul_asm*[rLen, aLen, bLen: static int](r: var Limbs[rLen], a: Limbs[aLen],
## Multi-precision Multiplication
## Assumes r doesn't alias a or b
mul_gen(r, a, b)
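The index bookkeeping above (`ib`, `ia` and the inner `j` loop) enumerates, for each result column `i`, exactly the pairs (k1, k2) with k1 + k2 == i that exist in both operands. A quick standalone check of that invariant for aLen = 3, bLen = 2 (plain Nim):

const aLen = 3
const bLen = 2
for i in 0 ..< aLen + bLen - 1:            # every product column
  let ib = min(bLen - 1, i)
  let ia = i - ib
  for j in 0 ..< min(aLen - ia, ib + 1):
    let (k1, k2) = (ia + j, ib - j)
    doAssert k1 + k2 == i                  # the product-scanning invariant
    echo "column ", i, ": a[", k1, "] * b[", k2, "]"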
# Squaring
# -----------------------------------------------------------------------------------------------
macro square_gen[rLen, aLen: static int](r: var Limbs[rLen], a: Limbs[aLen]) =
## Comba squaring generator
## `a` and `r` can have a different number of limbs
## if `r`.limbs.len < a.limbs.len * 2
## The result will be truncated, i.e. it will be
## a² (mod (2^WordBitwidth)^r.limbs.len)
##
## Assumes r doesn't alias a
result = newStmtList()
var ctx = init(Assembler_x86, BaseType)
let
arrR = init(OperandArray, nimSymbol = r, rLen, PointerInReg, InputOutput_EnsureClobber)
arrA = init(OperandArray, nimSymbol = a, aLen, PointerInReg, Input)
t = Operand(
desc: OperandDesc(
asmId: "[t]",
nimSymbol: ident"t",
rm: Reg,
constraint: Output_EarlyClobber,
cEmit: "t"
)
)
u = Operand(
desc: OperandDesc(
asmId: "[u]",
nimSymbol: ident"u",
rm: Reg,
constraint: Output_EarlyClobber,
cEmit: "u"
)
)
v = Operand(
desc: OperandDesc(
asmId: "[v]",
nimSymbol: ident"v",
rm: Reg,
constraint: Output_EarlyClobber,
cEmit: "v"
)
)
# MUL requires RAX and RDX
rRAX = Operand(
desc: OperandDesc(
asmId: "[rax]",
nimSymbol: ident"rax",
rm: RAX,
constraint: Output_EarlyClobber,
cEmit: "rax"
)
)
rRDX = Operand(
desc: OperandDesc(
asmId: "[rdx]",
nimSymbol: ident"rdx",
rm: RDX,
constraint: Output_EarlyClobber,
cEmit: "rdx"
)
)
# Prologue
let tsym = t.desc.nimSymbol
let usym = u.desc.nimSymbol
let vsym = v.desc.nimSymbol
let eax = rRAX.desc.nimSymbol
let edx = rRDX.desc.nimSymbol
result.add quote do:
var `tsym`{.noInit.}, `usym`{.noInit.}, `vsym`{.noInit.}: BaseType
var `eax`{.noInit.}, `edx`{.noInit.}: BaseType
# Algorithm
ctx.`xor` u, u
ctx.`xor` v, v
ctx.`xor` t, t
let stopEx = min(aLen*2, rLen)
for i in 0 ..< stopEx:
# Invariant for product scanning:
# if we multiply accumulate by a[k1] * a[k2]
# we have k1+k2 == i
let ib = min(aLen-1, i)
let ia = i - ib
for j in 0 ..< min(aLen - ia, ib+1):
let k1 = ia+j
let k2 = ib-j
if k1 < k2:
# (t, u, v) <- (t, u, v) + 2 * a[k1] * a[k2]
ctx.mov rRAX, arrA[k2]
ctx.mul rdx, rax, arrA[k1], rax
ctx.add rRAX, rRAX
ctx.adc rRDX, rRDX
ctx.adc t, 0
ctx.add v, rRAX
ctx.adc u, rRDX
ctx.adc t, 0
elif k1 == k2:
# (t, u, v) <- (t, u, v) + a[k1] * a[k2]
ctx.mov rRAX, arrA[k2]
ctx.mul rdx, rax, arrA[k1], rax
ctx.add v, rRAX
ctx.adc u, rRDX
ctx.adc t, 0
else:
discard
ctx.mov arrR[i], v
if i != stopEx - 1:
ctx.mov v, u
ctx.mov u, t
ctx.`xor` t, t
if aLen*2 < rLen:
ctx.`xor` rRAX, rRAX
for i in aLen*2 ..< rLen:
ctx.mov arrR[i], rRAX
# Codegen
result.add ctx.generate
func square_asm*[rLen, aLen: static int](r: var Limbs[rLen], a: Limbs[aLen]) =
## Multi-precision Squaring
## Assumes r doesn't alias a
square_gen(r, a)
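The k1 < k2 branch doubles the cross product because a[k1]*a[k2] and a[k2]*a[k1] land in the same column, while each diagonal a[k]² appears once. A standalone check of the identity with two 32-bit limbs held in a uint64 (illustrative, not the Constantine types):

# a = a1*B + a0, so a² = a0² + 2*a0*a1*B + a1²*B²: cross terms are doubled
let a0 = 0xDEADBEEF'u64
let a1 = 0x12345678'u64
const B = 1'u64 shl 32
let a = a1 * B + a0
# column accumulation as square_gen does, everything mod 2^64
let viaColumns = a0*a0 + 2*a0*a1*B + a1*a1*B*B
doAssert viaColumns == a * a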


@ -195,3 +195,135 @@ func mul_asm_adx_bmi2*[rLen, aLen, bLen: static int](r: var Limbs[rLen], a: Limb
## Multi-precision Multiplication
## Assumes r doesn't alias a or b
mulx_gen(r, a, b)
# Squaring
# -----------------------------------------------------------------------------------------------
# TODO: We use 16 registers but GCC/Clang still complain :/
# macro sqrx_gen[rLen, aLen: static int](rx: var Limbs[rLen], ax: Limbs[aLen]) =
# ## Squaring
# ## `a` and `r` can have a different number of limbs
# ## if `r`.limbs.len < a.limbs.len * 2
# ## The result will be truncated, i.e. it will be
# ## a² (mod (2^WordBitwidth)^r.limbs.len)
# ##
# ## Assumes r doesn't alias a
# result = newStmtList()
#
# var ctx = init(Assembler_x86, BaseType)
# let
# # Register count with 6 limbs:
# # r + a + rax + rdx = 4
# # t = 2 * a.len = 12
# # We use the full x86 register set.
#
# r = init(OperandArray, nimSymbol = rx, rLen, PointerInReg, InputOutput)
# a = init(OperandArray, nimSymbol = ax, aLen, PointerInReg, Input)
#
# N = a.len
# tSlots = a.len * 2
# # If aLen is too big, we need to spill registers. TODO.
# t = init(OperandArray, nimSymbol = ident"t", tSlots, ElemsInReg, Output_EarlyClobber)
#
# # MULX requires RDX
# rRDX = Operand(
# desc: OperandDesc(
# asmId: "[rdx]",
# nimSymbol: ident"rdx",
# rm: RDX,
# constraint: Output_EarlyClobber,
# cEmit: "rdx"
# )
# )
#
# # Scratch spaces for carries
# rRAX = Operand(
# desc: OperandDesc(
# asmId: "[rax]",
# nimSymbol: ident"rax",
# rm: RAX,
# constraint: Output_EarlyClobber,
# cEmit: "rax"
# )
# )
#
# # Prologue
# # -------------------------------
# let tsym = t.nimSymbol
# let eax = rRAX.desc.nimSymbol
# let edx = rRDX.desc.nimSymbol
# result.add quote do:
# var `tsym`{.noInit.}: array[`N`, BaseType]
# var `eax`{.noInit.}, `edx`{.noInit.}: BaseType
#
# # Algorithm
# # -------------------------------
#
# block: # Triangle
# # i = 0
# # ----------------
# ctx.mov rRDX, a[0]
# # Put a[1..<N] in unused registers, 4 mov per cycle on x86
# for i in 1 ..< N:
# ctx.mov t[i+N], a[i]
#
# let # Carry handlers
# hi = r.reuseRegister()
# lo = rRAX
#
# for j in 1 ..< N:
# # (carry, t[j]) <- a[j] * a[0] with a[j] in t[j+N]
# ctx.mulx t[j], rRAX, t[j+N], rdx
# if j == 1:
# ctx.add t[j-1], rRAX
# else:
# ctx.adc t[j-1], rRAX
# ctx.adc t[N-1], 0
#
# for i in 1 ..< N-1:
# ctx.comment " Process squaring triangle " & $i
# ctx.mov rRDX, a[i]
# ctx.`xor` t[i+N], t[i+N] # Clear flags
# for j in i+1 ..< N:
# ctx.mulx hi, lo, t[j+N], rdx
# ctx.adox t[i+j], lo
# if j == N-1:
# break
# ctx.adcx t[i+j+1], hi
#
# ctx.comment " Accumulate last carries in i+N word"
# # t[i+N] is already 0
# ctx.adcx hi, t[i+N]
# ctx.adox t[i+N], hi
#
# block:
# ctx.comment "Finish: (t[2*i+1], t[2*i]) <- 2*t[2*i] + a[i]*a[i]"
#
# # Restore result
# ctx.mov r.reuseRegister(), xmm0
#
# ctx.mov rRDX, a[0]
#
# # (t[2*i+1], t[2*i]) <- 2*t[2*i] + a[i]*a[i]
# for i in 0 ..< N:
# ctx.mulx rRAX, rRDX, a[i], rdx
# ctx.add t[2*i], t[2*i]
# ctx.adc t[2*i+1], 0
# ctx.add t[2*i], rRDX
# if i != N - 1:
# ctx.mov rRDX, a[i+1]
# ctx.adc t[2*i+1], rRAX
# ctx.mov r[i], t[i]
#
# # Move the rest
# for i in N ..< min(rLen, 2*N):
# ctx.mov r[i], t[i]
#
# # Codegen
# result.add ctx.generate
#
# func square_asm_adx_bmi2*[rLen, aLen: static int](r: var Limbs[rLen], a: Limbs[aLen]) =
# ## Multi-precision Squaring
# ## Assumes r doesn't alias a
# sqrx_gen(r, a)


@ -10,7 +10,9 @@ import
../config/[common, type_bigint],
../primitives,
./limbs,
./limbs_modular
./limbs_double_width,
./limbs_modular,
./limbs_montgomery
export BigInt
@ -269,6 +271,10 @@ func prod*[rBits, aBits, bBits](r: var BigInt[rBits], a: BigInt[aBits], b: BigIn
## if r.bits < a.bits + b.bits
## the multiplication will overflow.
## It will be truncated if it cannot fit in r limbs.
##
## Truncation is at the limb level, NOT the bit level
## It is recommended to only use
## rBits >= aBits + bBits unless you know what you are doing.
r.limbs.prod(a.limbs, b.limbs)
func mul*[aBits, bBits](a: var BigInt[aBits], b: BigInt[bBits]) =
@ -298,6 +304,19 @@ func prod_high_words*[rBits, aBits, bBits](r: var BigInt[rBits], a: BigInt[aBits
# i.e. prod_high_words(result, P, a, w)
r.limbs.prod_high_words(a.limbs, b.limbs, lowestWordIndex)
func square*[rBits, aBits](r: var BigInt[rBits], a: BigInt[aBits]) =
## Multi-precision squaring
## r <- a²
## `a`, `r` can have different sizes
## if r.bits < a.bits * 2
## the multiplication will overflow.
## It will be truncated if it cannot fit in r limbs.
##
## Truncation is at the limb level, NOT the bit level
## It is recommended to only use
## rBits >= aBits * 2 unless you know what you are doing.
r.limbs.square(a.limbs)
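"Truncation at the limb level" means the result is kept mod (2^WordBitWidth)^r.limbs.len: whole low limbs survive and the high limbs of the full product are dropped. A one-limb illustration (plain Nim, not the BigInt API):

let a = 0xFFFF_FFFF_FFFF_FFFF'u64
let b = 3'u64
let r = a * b                     # (2^64 - 1) * 3 mod 2^64: high limb dropped
doAssert r == 0xFFFF_FFFF_FFFF_FFFD'u64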
# Bit Manipulation
# ------------------------------------------------------------


@ -135,7 +135,7 @@ func setOne*(a: var FF) {.inline.} =
func `+=`*(a: var FF, b: FF) {.inline, meter.} =
## In-place addition modulo p
when UseASM_X86_64 and a.mres.limbs.len <= 6: # TODO: handle spilling
addmod_asm(a.mres.limbs, b.mres.limbs, FF.fieldMod().limbs)
addmod_asm(a.mres.limbs, a.mres.limbs, b.mres.limbs, FF.fieldMod().limbs)
else:
var overflowed = add(a.mres, b.mres)
overflowed = overflowed or not(a.mres < FF.fieldMod())
@ -144,7 +144,7 @@ func `+=`*(a: var FF, b: FF) {.inline, meter.} =
func `-=`*(a: var FF, b: FF) {.inline, meter.} =
## In-place substraction modulo p
when UseASM_X86_64 and a.mres.limbs.len <= 6: # TODO: handle spilling
submod_asm(a.mres.limbs, b.mres.limbs, FF.fieldMod().limbs)
submod_asm(a.mres.limbs, a.mres.limbs, b.mres.limbs, FF.fieldMod().limbs)
else:
let underflowed = sub(a.mres, b.mres)
discard cadd(a.mres, FF.fieldMod(), underflowed)
@ -152,7 +152,7 @@ func `-=`*(a: var FF, b: FF) {.inline, meter.} =
func double*(a: var FF) {.inline, meter.} =
## Double ``a`` modulo p
when UseASM_X86_64 and a.mres.limbs.len <= 6: # TODO: handle spilling
addmod_asm(a.mres.limbs, a.mres.limbs, FF.fieldMod().limbs)
addmod_asm(a.mres.limbs, a.mres.limbs, a.mres.limbs, FF.fieldMod().limbs)
else:
var overflowed = double(a.mres)
overflowed = overflowed or not(a.mres < FF.fieldMod())
@ -162,8 +162,7 @@ func sum*(r: var FF, a, b: FF) {.inline, meter.} =
## Sum ``a`` and ``b`` into ``r`` modulo p
## r is initialized/overwritten
when UseASM_X86_64 and a.mres.limbs.len <= 6: # TODO: handle spilling
r = a
addmod_asm(r.mres.limbs, b.mres.limbs, FF.fieldMod().limbs)
addmod_asm(r.mres.limbs, a.mres.limbs, b.mres.limbs, FF.fieldMod().limbs)
else:
var overflowed = r.mres.sum(a.mres, b.mres)
overflowed = overflowed or not(r.mres < FF.fieldMod())
@ -178,20 +177,7 @@ func diff*(r: var FF, a, b: FF) {.inline, meter.} =
## `r` is initialized/overwritten
## Requires r != b
when UseASM_X86_64 and a.mres.limbs.len <= 6: # TODO: handle spilling
r = a
submod_asm(r.mres.limbs, b.mres.limbs, FF.fieldMod().limbs)
else:
var underflowed = r.mres.diff(a.mres, b.mres)
discard cadd(r.mres, FF.fieldMod(), underflowed)
func diffAlias*(r: var FF, a, b: FF) {.inline, meter.} =
## Substract `b` from `a` and store the result into `r`.
## `r` is initialized/overwritten
## Handles r == b
when UseASM_X86_64 and a.mres.limbs.len <= 6: # TODO: handle spilling
var t = a
submod_asm(t.mres.limbs, b.mres.limbs, FF.fieldMod().limbs)
r = t
submod_asm(r.mres.limbs, a.mres.limbs, b.mres.limbs, FF.fieldMod().limbs)
else:
var underflowed = r.mres.diff(a.mres, b.mres)
discard cadd(r.mres, FF.fieldMod(), underflowed)
@ -205,8 +191,7 @@ func double*(r: var FF, a: FF) {.inline, meter.} =
## Double ``a`` into ``r``
## `r` is initialized/overwritten
when UseASM_X86_64 and a.mres.limbs.len <= 6: # TODO: handle spilling
r = a
addmod_asm(r.mres.limbs, a.mres.limbs, FF.fieldMod().limbs)
addmod_asm(r.mres.limbs, a.mres.limbs, a.mres.limbs, FF.fieldMod().limbs)
else:
var overflowed = r.mres.double(a.mres)
overflowed = overflowed or not(r.mres < FF.fieldMod())
@ -223,10 +208,7 @@ func square*(r: var FF, a: FF) {.inline, meter.} =
func neg*(r: var FF, a: FF) {.inline, meter.} =
## Negate modulo p
when UseASM_X86_64 and defined(gcc):
# Clang and every compiler besides GCC
# can cleanly optimized this
# especially on FF2
when UseASM_X86_64:
negmod_asm(r.mres.limbs, a.mres.limbs, FF.fieldMod().limbs)
else:
# If a = 0 we need r = 0 and not r = M


@ -12,7 +12,8 @@ import
./bigints,
./finite_fields,
./limbs,
./limbs_double_width
./limbs_double_width,
./limbs_montgomery
when UseASM_X86_64:
import assembly/limbs_asm_modular_dbl_width_x86
@ -27,10 +28,17 @@ template doubleWidth*(T: typedesc[Fp]): typedesc =
## Return the double-width type matching with Fp
FpDbl[T.C]
func `==`*(a, b: FpDbl): SecretBool {.inline.} =
a.limbs2x == b.limbs2x
func mulNoReduce*(r: var FpDbl, a, b: Fp) {.inline.} =
## Store the product of ``a`` by ``b`` into ``r``
r.limbs2x.prod(a.mres.limbs, b.mres.limbs)
func squareNoReduce*(r: var FpDbl, a: Fp) {.inline.} =
## Store the square of ``a`` into ``r``
r.limbs2x.square(a.mres.limbs)
func reduce*(r: var Fp, a: FpDbl) {.inline.} =
## Reduce a double-width field element into r
const N = r.mres.limbs.len
@ -42,20 +50,16 @@ func reduce*(r: var Fp, a: FpDbl) {.inline.} =
Fp.canUseNoCarryMontyMul()
)
func diffNoInline(r: var FpDbl, a, b: FpDbl): Borrow =
r.limbs2x.diff(a.limbs2x, b.limbs2x)
func diffNoReduce*(r: var FpDbl, a, b: FpDbl) =
## Double-width substraction without reduction
discard diffNoInline(r, a, b)
discard r.limbs2x.diff(a.limbs2x, b.limbs2x)
func diff*(r: var FpDbl, a, b: FpDbl) =
func diff*(r: var FpDbl, a, b: FpDbl) {.inline.} =
## Double-width modular substraction
when false: # TODO slower
r = a
sub2x_asm(r.limbs2x, b.limbs2x, FpDbl.C.Mod.limbs)
when UseASM_X86_64:
sub2x_asm(r.limbs2x, a.limbs2x, b.limbs2x, FpDbl.C.Mod.limbs)
else:
var underflowed = SecretBool diffNoInline(r, a, b)
var underflowed = SecretBool r.limbs2x.diff(a.limbs2x, b.limbs2x)
const N = r.limbs2x.len div 2
const M = FpDbl.C.Mod
@ -65,8 +69,6 @@ func diff*(r: var FpDbl, a, b: FpDbl) =
addC(carry, sum, r.limbs2x[i+N], M.limbs[i], carry)
underflowed.ccopy(r.limbs2x[i+N], sum)
func `-=`*(a: var FpDbl, b: FpDbl) =
when false: # TODO slower
sub2x_asm(a.limbs2x, b.limbs2x, FpDbl.C.Mod.limbs)
else:
a.diff(a, b)
func `-=`*(a: var FpDbl, b: FpDbl) {.inline.} =
## Double-width modular substraction
a.diff(a, b)
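The point of the NoReduce/reduce split is lazy reduction: accumulate several double-width results and pay for a single Montgomery reduction. A sketch against the API in this diff (the curve choice and the wrapper proc are illustrative, imports omitted):

# r = a*b - c*d (mod p) with one reduction instead of two
proc mulSub(r: var Fp[BLS12_381], a, b, c, d: Fp[BLS12_381]) =
  var ab {.noInit.}, cd {.noInit.}: FpDbl[BLS12_381]
  ab.mulNoReduce(a, b)            # double-width product, no reduction
  cd.mulNoReduce(c, d)
  ab.diff(ab, cd)                 # double-width modular subtraction (sub2x path)
  r.reduce(ab)                    # single Montgomery reduction at the end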


@ -12,9 +12,6 @@ import
when UseASM_X86_32:
import ./assembly/limbs_asm_x86
when UseASM_X86_64:
import ./assembly/limbs_asm_mul_x86
import ./assembly/limbs_asm_mul_x86_adx_bmi2
# ############################################################
#
@ -335,89 +332,6 @@ func cneg*(a: var Limbs, ctl: CTBool) =
{.pop.} # inline
# Multiplication
# ------------------------------------------------------------
func prod*[rLen, aLen, bLen: static int](r: var Limbs[rLen], a: Limbs[aLen], b: Limbs[bLen]) =
## Multi-precision multiplication
## r <- a*b
##
## `a`, `b`, `r` can have a different number of limbs
## if `r`.limbs.len < a.limbs.len + b.limbs.len
## The result will be truncated, i.e. it will be
## a * b (mod (2^WordBitwidth)^r.limbs.len)
##
## `r` must not alias ``a`` or ``b``
when UseASM_X86_64 and aLen <= 6:
if ({.noSideEffect.}: hasBmi2()) and ({.noSideEffect.}: hasAdx()):
mul_asm_adx_bmi2(r, a, b)
else:
mul_asm(r, a, b)
elif UseASM_X86_64:
mul_asm(r, a, b)
else:
# We use Product Scanning / Comba multiplication
var t, u, v = Zero
staticFor i, 0, min(a.len+b.len, r.len):
const ib = min(b.len-1, i)
const ia = i - ib
staticFor j, 0, min(a.len - ia, ib+1):
mulAcc(t, u, v, a[ia+j], b[ib-j])
r[i] = v
v = u
u = t
t = Zero
if aLen+bLen < rLen:
for i in aLen+bLen ..< rLen:
r[i] = Zero
func prod_high_words*[rLen, aLen, bLen](
r: var Limbs[rLen],
a: Limbs[aLen], b: Limbs[bLen],
lowestWordIndex: static int) =
## Multi-precision multiplication keeping only high words
## r <- a*b >> (2^WordBitWidth)^lowestWordIndex
##
## `a`, `b`, `r` can have a different number of limbs
## if `r`.limbs.len < a.limbs.len + b.limbs.len - lowestWordIndex
## The result will be truncated, i.e. it will be
## a * b >> (2^WordBitWidth)^lowestWordIndex (mod (2^WordBitwidth)^r.limbs.len)
#
# This is useful for
# - Barret reduction
# - Approximating multiplication by a fractional constant in the form f(a) = K/C * a
# with K and C known at compile-time.
# We can instead find a well chosen M = (2^WordBitWidth)^w, with M > C (i.e. M is a power of 2 bigger than C)
# Precompute P = K*M/C at compile-time
# and at runtime do P*a/M <=> P*a >> (WordBitWidth*w)
# i.e. prod_high_words(result, P, a, w)
# We use Product Scanning / Comba multiplication
var t, u, v = Zero # Will raise warning on empty iterations
var z: Limbs[rLen] # zero-init, ensure on stack and removes in-place problems
# The previous 2 columns can affect the lowest word due to carries
# but not the ones before (we accumulate in 3 words (t, u, v))
const w = lowestWordIndex - 2
staticFor i, max(0, w), min(a.len+b.len, r.len+lowestWordIndex):
const ib = min(b.len-1, i)
const ia = i - ib
staticFor j, 0, min(a.len - ia, ib+1):
mulAcc(t, u, v, a[ia+j], b[ib-j])
when i >= lowestWordIndex:
z[i-lowestWordIndex] = v
v = u
u = t
t = Zero
r = z
# Division
# ------------------------------------------------------------


@ -11,10 +11,9 @@ import
../primitives,
./limbs
when UseASM_X86_32:
import ./assembly/limbs_asm_montred_x86
when UseASM_X86_64:
import ./assembly/limbs_asm_montred_x86_adx_bmi2
import ./assembly/limbs_asm_mul_x86
import ./assembly/limbs_asm_mul_x86_adx_bmi2
# ############################################################
#
@ -25,77 +24,170 @@ when UseASM_X86_64:
# No exceptions allowed
{.push raises: [].}
# Montgomery Reduction
# Multiplication
# ------------------------------------------------------------
# This is the reduction part of SOS (Separated Operand Scanning) modular multiplication technique
# TODO upstream, using Limbs[N] breaks semcheck
func montyRed*[N: static int](
r: var array[N, SecretWord],
t: array[N*2, SecretWord],
M: array[N, SecretWord],
m0ninv: BaseType, canUseNoCarryMontyMul: static bool) =
## Montgomery reduce a double-width bigint modulo M
# - Analyzing and Comparing Montgomery Multiplication Algorithms
# Cetin Kaya Koc and Tolga Acar and Burton S. Kaliski Jr.
# http://pdfs.semanticscholar.org/5e39/41ff482ec3ee41dc53c3298f0be085c69483.pdf
#
# - Arithmetic of Finite Fields
# Chapter 5 of Guide to Pairing-Based Cryptography
# Jean Luc Beuchat, Luis J. Dominguez Perez, Sylvain Duquesne, Nadia El Mrabet, Laura Fuentes-Castañeda, Francisco Rodríguez-Henríquez, 2017
# https://www.researchgate.net/publication/319538235_Arithmetic_of_Finite_Fields
#
# Algorithm
# Inputs:
# - N number of limbs
# - t[0 ..< 2N] (double-width input to reduce)
# - M[0 ..< N] The field modulus (must be odd for Montgomery reduction)
# - m0ninv: Montgomery Reduction magic number = -1/M[0]
# Output:
# - r[0 ..< N], in the Montgomery domain
# Parameters:
# - w, the word width usually 64 on 64-bit platforms or 32 on 32-bit
#
# for i in 0 .. n-1:
# C <- 0
# m <- t[i] * m0ninv mod 2^w (i.e. simple multiplication)
# for j in 0 .. n-1:
# (C, S) <- t[i+j] + m * M[j] + C
# t[i+j] <- S
# t[i+n] += C
# for i in 0 .. n-1:
# r[i] = t[i+n]
# if r >= M:
# r -= M
#
# Important note: `t[i+n] += C` should propagate the carry
# to the higher limb if any, thank you "implementation detail"
# missing from paper.
when UseASM_X86_64 and r.len <= 6:
func prod*[rLen, aLen, bLen: static int](r: var Limbs[rLen], a: Limbs[aLen], b: Limbs[bLen]) =
## Multi-precision multiplication
## r <- a*b
##
## `a`, `b`, `r` can have a different number of limbs
## if `r`.limbs.len < a.limbs.len + b.limbs.len
## The result will be truncated, i.e. it will be
## a * b (mod (2^WordBitwidth)^r.limbs.len)
##
## `r` must not alias ``a`` or ``b``
when UseASM_X86_64 and aLen <= 6:
if ({.noSideEffect.}: hasBmi2()) and ({.noSideEffect.}: hasAdx()):
montRed_asm_adx_bmi2(r, t, M, m0ninv, canUseNoCarryMontyMul)
mul_asm_adx_bmi2(r, a, b)
else:
montRed_asm(r, t, M, m0ninv, canUseNoCarryMontyMul)
elif UseASM_X86_32 and r.len <= 6:
# TODO: Assembly faster than GCC but slower than Clang
montRed_asm(r, t, M, m0ninv, canUseNoCarryMontyMul)
mul_asm(r, a, b)
elif UseASM_X86_64:
mul_asm(r, a, b)
else:
var t = t # Copy "t" for mutation and ensure on stack
var res: typeof(r) # Accumulator
staticFor i, 0, N:
var C = Zero
let m = t[i] * SecretWord(m0ninv)
staticFor j, 0, N:
muladd2(C, t[i+j], m, M[j], t[i+j], C)
res[i] = C
# We use Product Scanning / Comba multiplication
var t, u, v = Zero
const stopEx = min(a.len+b.len, r.len)
# This does t[i+n] += C
# but in a separate carry chain, fused with the
# copy "r[i] = t[i+n]"
var carry = Carry(0)
staticFor i, 0, N:
addC(carry, res[i], t[i+N], res[i], carry)
staticFor i, 0, stopEx:
# Invariant for product scanning:
# if we multiply accumulate by a[k1] * b[k2]
# we have k1+k2 == i
const ib = min(b.len-1, i)
const ia = i - ib
staticFor j, 0, min(a.len - ia, ib+1):
mulAcc(t, u, v, a[ia+j], b[ib-j])
# Final substraction
discard res.csub(M, SecretWord(carry).isNonZero() or not(res < M))
r = res
r[i] = v
when i < stopEx-1:
v = u
u = t
t = Zero
if aLen+bLen < rLen:
for i in aLen+bLen ..< rLen:
r[i] = Zero
func prod_high_words*[rLen, aLen, bLen](
r: var Limbs[rLen],
a: Limbs[aLen], b: Limbs[bLen],
lowestWordIndex: static int) =
## Multi-precision multiplication keeping only high words
## r <- a*b >> (2^WordBitWidth)^lowestWordIndex
##
## `a`, `b`, `r` can have a different number of limbs
## if `r`.limbs.len < a.limbs.len + b.limbs.len - lowestWordIndex
## The result will be truncated, i.e. it will be
## a * b >> (2^WordBitWidth)^lowestWordIndex (mod (2^WordBitwidth)^r.limbs.len)
#
# This is useful for
# - Barret reduction
# - Approximating multiplication by a fractional constant in the form f(a) = K/C * a
# with K and C known at compile-time.
# We can instead find a well chosen M = (2^WordBitWidth)^w, with M > C (i.e. M is a power of 2 bigger than C)
# Precompute P = K*M/C at compile-time
# and at runtime do P*a/M <=> P*a >> (WordBitWidth*w)
# i.e. prod_high_words(result, P, a, w)
# We use Product Scanning / Comba multiplication
var t, u, v = Zero # Will raise warning on empty iterations
var z: Limbs[rLen] # zero-init, ensure on stack and removes in-place problems
# The previous 2 columns can affect the lowest word due to carries
# but not the ones before (we accumulate in 3 words (t, u, v))
const w = lowestWordIndex - 2
const stopEx = min(a.len+b.len, r.len+lowestWordIndex)
staticFor i, max(0, w), stopEx:
const ib = min(b.len-1, i)
const ia = i - ib
staticFor j, 0, min(a.len - ia, ib+1):
mulAcc(t, u, v, a[ia+j], b[ib-j])
when i >= lowestWordIndex:
z[i-lowestWordIndex] = v
when i < stopEx-1:
v = u
u = t
t = Zero
r = z
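The comment block above replaces division by a compile-time constant C with a multiply and a shift: pick M = (2^WordBitWidth)^w > C, precompute P = K*M/C, and then K/C * a ≈ (P*a) >> log2(M). A single-word version of the arithmetic (illustrative values; in general P needs enough guard bits for the approximation to be exact over the input range):

const K = 3'u64
const C = 7'u64
const M = 1'u64 shl 32            # power of two > C
const P = K * M div C             # precomputed at compile time
let a = 1_000_000'u64
doAssert (a * P) shr 32 == a * K div C   # both sides are 428571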
func square_Comba[rLen, aLen](
r: var Limbs[rLen],
a: Limbs[aLen]) =
## Multi-precision squaring using Comba / Product Scanning
var t, u, v = Zero
const stopEx = min(a.len * 2, r.len)
staticFor i, 0, stopEx:
# Invariant for product scanning:
# if we multiply accumulate by a[k1] * a[k2]
# we have k1+k2 == i
const ib = min(a.len-1, i)
const ia = i - ib
staticFor j, 0, min(a.len - ia, ib+1):
const k1 = ia+j
const k2 = ib-j
when k1 < k2:
mulDoubleAcc(t, u, v, a[k1], a[k2])
elif k1 == k2:
mulAcc(t, u, v, a[k1], a[k2])
else:
discard
r[i] = v
when i < stopEx-1:
v = u
u = t
t = Zero
if aLen*2 < rLen:
for i in aLen*2 ..< rLen:
r[i] = Zero
func square_operandScan[rLen, aLen](
r: var Limbs[rLen],
a: Limbs[aLen]) =
## Multi-precision squaring using Operand Scanning
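## Note: currently unused: `square*` below dispatches to
## square_asm / square_Comba.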
const stopEx = min(a.len * 2, r.len)
var t: typeof(r) # zero-init, ensure on stack
var C = Zero
static: doAssert aLen * 2 == rLen, "Truncated square operand scanning is not implemented"
staticFor i, 0, stopEx:
staticFor j, i+1, stopEx:
muladd2(C, t[i+j], a[j], a[i], t[i+j], C)
t[i+stopEx] = C
staticFor i, 0, aLen:
# (t[2*i+1], t[2*i]) <- 2*t[2*i] + a[i]*a[i]
var u, v = Zero
var carry: Carry
# a[i] * a[i]
mul(u, v, a[i], a[i])
# 2*t[2*i]
addC(carry, t[2*i], t[2*i], t[2*i], Carry(0))
addC(carry, t[2*i+1], Zero, Zero, carry)
# 2*t[2*i] + a[i] * a[i]
addC(carry, t[2*i], t[2*i], u, Carry(0))
addC(carry, t[2*i+1], Zero, v, carry)
r = t
func square*[rLen, aLen](
r: var Limbs[rLen],
a: Limbs[aLen]) =
## Multi-precision squaring
## r <- a²
##
## if `r`.limbs.len < a.limbs.len * 2
## The result will be truncated, i.e. it will be
## a² (mod (2^WordBitwidth)^r.limbs.len)
##
## `r` must not alias ``a``
when UseASM_X86_64:
square_asm(r, a)
else:
square_comba(r, a)


@ -14,10 +14,13 @@ import
../primitives,
./limbs
when UseASM_X86_32:
import ./assembly/limbs_asm_montred_x86
when UseASM_X86_64:
import
./assembly/limbs_asm_montmul_x86,
./assembly/limbs_asm_montmul_x86_adx_bmi2
./assembly/limbs_asm_montmul_x86_adx_bmi2,
./assembly/limbs_asm_montred_x86_adx_bmi2
# ############################################################
#
@ -42,7 +45,7 @@ when UseASM_X86_64:
# No exceptions allowed
{.push raises: [].}
# Implementation
# Montgomery Multiplication
# ------------------------------------------------------------
func montyMul_CIOS_nocarry(r: var Limbs, a, b, M: Limbs, m0ninv: BaseType) =
@ -162,6 +165,9 @@ func montyMul_FIPS(r: var Limbs, a, b, M: Limbs, m0ninv: BaseType) =
discard z.csub(M, v.isNonZero() or not(z < M))
r = z
# Montgomery Squaring
# ------------------------------------------------------------
func montySquare_CIOS_nocarry(r: var Limbs, a, M: Limbs, m0ninv: BaseType) {.used.}=
## Montgomery Squaring using Coarse Grained Operand Scanning (CIOS)
## and no-carry optimization.
@ -267,6 +273,133 @@ func montySquare_CIOS(r: var Limbs, a, M: Limbs, m0ninv: BaseType) {.used.}=
discard t.csub(M, tN.isNonZero() or not(t < M)) # TODO: (t >= M) is unnecessary for prime in the form (2^64)^w
r = t
# Montgomery Reduction
# ------------------------------------------------------------
func montyRed_CIOS[N: static int](
r: var array[N, SecretWord],
a: array[N*2, SecretWord],
M: array[N, SecretWord],
m0ninv: BaseType) =
## Montgomery reduce a double-width bigint modulo M
# - Analyzing and Comparing Montgomery Multiplication Algorithms
# Cetin Kaya Koc and Tolga Acar and Burton S. Kaliski Jr.
# http://pdfs.semanticscholar.org/5e39/41ff482ec3ee41dc53c3298f0be085c69483.pdf
#
# - Arithmetic of Finite Fields
# Chapter 5 of Guide to Pairing-Based Cryptography
# Jean Luc Beuchat, Luis J. Dominguez Perez, Sylvain Duquesne, Nadia El Mrabet, Laura Fuentes-Castañeda, Francisco Rodríguez-Henríquez, 2017
# https://www.researchgate.net/publication/319538235_Arithmetic_of_Finite_Fields
#
# Algorithm
# Inputs:
# - N number of limbs
# - a[0 ..< 2N] (double-width input to reduce)
# - M[0 ..< N] The field modulus (must be odd for Montgomery reduction)
# - m0ninv: Montgomery Reduction magic number = -1/M[0]
# Output:
# - r[0 ..< N], in the Montgomery domain
# Parameters:
# - w, the word width usually 64 on 64-bit platforms or 32 on 32-bit
#
# for i in 0 .. n-1:
# C <- 0
# m <- a[i] * m0ninv mod 2^w (i.e. simple multiplication)
# for j in 0 .. n-1:
# (C, S) <- a[i+j] + m * M[j] + C
# a[i+j] <- S
# a[i+n] += C
# for i in 0 .. n-1:
# r[i] = a[i+n]
# if r >= M:
# r -= M
#
# Important note: `a[i+n] += C` should propagate the carry
# to the higher limb if any, thank you "implementation detail"
# missing from paper.
var a = a # Copy "a" for mutation and ensure it is on the stack
var res: typeof(r) # Accumulator
staticFor i, 0, N:
var C = Zero
let m = a[i] * SecretWord(m0ninv)
staticFor j, 0, N:
muladd2(C, a[i+j], m, M[j], a[i+j], C)
res[i] = C
# This does a[i+n] += C
# but in a separate carry chain, fused with the
# copy "r[i] = a[i+n]"
var carry = Carry(0)
staticFor i, 0, N:
addC(carry, res[i], a[i+N], res[i], carry)
# Final substraction
discard res.csub(M, SecretWord(carry).isNonZero() or not(res < M))
r = res
func montyRed_Comba[N: static int](
r: var array[N, SecretWord],
a: array[N*2, SecretWord],
M: array[N, SecretWord],
m0ninv: BaseType) =
## Montgomery reduce a double-width bigint modulo M
# We use Product Scanning / Comba multiplication
var t, u, v = Zero
var carry: Carry
var z: typeof(r) # zero-init, ensure on stack and removes in-place problems in tower fields
staticFor i, 0, N:
staticFor j, 0, i:
mulAcc(t, u, v, z[j], M[i-j])
addC(carry, v, v, a[i], Carry(0))
addC(carry, u, u, Zero, carry)
addC(carry, t, t, Zero, carry)
z[i] = v * SecretWord(m0ninv)
mulAcc(t, u, v, z[i], M[0])
v = u
u = t
t = Zero
staticFor i, N, 2*N-1:
staticFor j, i-N+1, N:
mulAcc(t, u, v, z[j], M[i-j])
addC(carry, v, v, a[i], Carry(0))
addC(carry, u, u, Zero, carry)
addC(carry, t, t, Zero, carry)
z[i-N] = v
v = u
u = t
t = Zero
addC(carry, z[N-1], v, a[2*N-1], Carry(0))
# Final substraction
discard z.csub(M, SecretBool(carry) or not(z < M))
r = z
# TODO upstream, using Limbs[N] breaks semcheck
func montyRed*[N: static int](
r: var array[N, SecretWord],
a: array[N*2, SecretWord],
M: array[N, SecretWord],
m0ninv: BaseType, canUseNoCarryMontyMul: static bool) =
## Montgomery reduce a double-width bigint modulo M
when UseASM_X86_64 and r.len <= 6:
if ({.noSideEffect.}: hasBmi2()) and ({.noSideEffect.}: hasAdx()):
montRed_asm_adx_bmi2(r, a, M, m0ninv, canUseNoCarryMontyMul)
else:
montRed_asm(r, a, M, m0ninv, canUseNoCarryMontyMul)
elif UseASM_X86_32 and r.len <= 6:
# TODO: this assembly is faster than GCC's codegen but slower than Clang's
montRed_asm(r, a, M, m0ninv, canUseNoCarryMontyMul)
else:
montyRed_CIOS(r, a, M, m0ninv)
# montyRed_Comba(r, a, M, m0ninv) # Comba variant, measured slower than CIOS
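# Caller-side pattern that this reduction enables (a sketch; mulNoReduce and
# reduce are the double-width helpers exercised by this commit's tests):
#
#   var t {.noInit.}: FpDbl[BLS12_381]
#   t.mulNoReduce(a, b)   # 2N-limb product, no modular reduction
#   r.reduce(t)           # a single Montgomery reduction at the end
#
# Several unreduced double-width terms can be added or subtracted before the
# final reduce, which is what the tower-field multiplication below exploits.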
# Exported API
# ------------------------------------------------------------

View File

@ -54,8 +54,8 @@ const
# TODO: we restrict assembly to 64-bit words for now;
# register spills are needed to support larger limb counts
const ConstantineASM {.booldefine.} = true
const UseASM_X86_32* = ConstantineASM and X86 and GCC_Compatible
const CttASM {.booldefine.} = true
const UseASM_X86_32* = CttASM and X86 and GCC_Compatible
const UseASM_X86_64* = WordBitWidth == 64 and UseASM_X86_32
# ############################################################

View File

@ -90,7 +90,7 @@ func curve_eq_rhs*[F](y2: var F, x: F, Tw: static Twisted) =
else:
{.error: "Only twisted curves are supported on extension field 𝔽".}
y2.diffAlias(t, y2)
y2.diff(t, y2)
when F.C.getCoefA() != 0:
t = x

View File

@ -198,7 +198,7 @@ func sum*[F; Tw: static Twisted](
r.y.sum(Q.x, Q.z) # 15. Y₃ <- X₂ + Z₂
r.x *= r.y # 16. X₃ <- X₃ Y₃ X₃ = (X₁Z₁)(X₂Z₂)
r.y.sum(t0, t2) # 17. Y₃ <- t₀ + t₂ Y₃ = X₁ X₂ + Z₁ Z₂
r.y.diffAlias(r.x, r.y) # 18. Y₃ <- X₃ - Y₃ Y₃ = (X₁ + Z₁)(X₂ + Z₂) - (X₁ X₂ + Z₁ Z₂) = X₁Z₂ + X₂Z₁
r.y.diff(r.x, r.y) # 18. Y₃ <- X₃ - Y₃ Y₃ = (X₁ + Z₁)(X₂ + Z₂) - (X₁ X₂ + Z₁ Z₂) = X₁Z₂ + X₂Z₁
when Tw == OnTwist and F.C.getSexticTwist() == D_Twist:
t0 *= SexticNonResidue
t1 *= SexticNonResidue
@ -214,7 +214,7 @@ func sum*[F; Tw: static Twisted](
r.y *= SexticNonResidue
r.x.prod(t4, r.y) # 25. X₃ <- t₄ Y₃ X₃ = 3b(Y₁Z₂ + Y₂Z₁)(X₁Z₂ + X₂Z₁)
t2.prod(t3, t1) # 26. t₂ <- t₃ t₁ t₂ = (X₁Y₂ + X₂Y₁) (Y₁Y₂ - 3bZ₁Z₂)
r.x.diffAlias(t2, r.x) # 27. X₃ <- t₂ - X₃ X₃ = (X₁Y₂ + X₂Y₁) (Y₁Y₂ - 3bZ₁Z₂) - 3b(Y₁Z₂ + Y₂Z₁)(X₁Z₂ + X₂Z₁)
r.x.diff(t2, r.x) # 27. X₃ <- t₂ - X₃ X₃ = (X₁Y₂ + X₂Y₁) (Y₁Y₂ - 3bZ₁Z₂) - 3b(Y₁Z₂ + Y₂Z₁)(X₁Z₂ + X₂Z₁)
r.y *= t0 # 28. Y₃ <- Y₃ t₀ Y₃ = 9bX₁X₂ (X₁Z₂ + X₂Z₁)
t1 *= r.z # 29. t₁ <- t₁ Z₃ t₁ = (Y₁Y₂ - 3bZ₁Z₂)(Y₁Y₂ + 3bZ₁Z₂)
r.y += t1 # 30. Y₃ <- t₁ + Y₃ Y₃ = (Y₁Y₂ + 3bZ₁Z₂)(Y₁Y₂ - 3bZ₁Z₂) + 9bX₁X₂ (X₁Z₂ + X₂Z₁)
@ -277,7 +277,7 @@ func madd*[F; Tw: static Twisted](
r.y *= SexticNonResidue
r.x.prod(t4, r.y) # 18. X₃ <- t₄ Y₃, X₃ = (Y₁ + Y₂Z₁) 3b(X₁ + X₂Z₁)
t2.prod(t3, t1) # 19. t₂ <- t₃ t₁, t₂ = (X₁Y₂ + X₂Y₁)(Y₁Y₂ - 3bZ₁)
r.x.diffAlias(t2, r.x) # 20. X₃ <- t₂ - X₃, X₃ = (X₁Y₂ + X₂Y₁)(Y₁Y₂ - 3bZ₁) - 3b(Y₁ + Y₂Z₁)(X₁ + X₂Z₁)
r.x.diff(t2, r.x) # 20. X₃ <- t₂ - X₃, X₃ = (X₁Y₂ + X₂Y₁)(Y₁Y₂ - 3bZ₁) - 3b(Y₁ + Y₂Z₁)(X₁ + X₂Z₁)
r.y *= t0 # 21. Y₃ <- Y₃ t₀, Y₃ = 9bX₁X₂ (X₁ + X₂Z₁)
t1 *= r.z # 22. t₁ <- t₁ Z₃, t₁ = (Y₁Y₂ - 3bZ₁)(Y₁Y₂ + 3bZ₁)
r.y += t1 # 23. Y₃ <- t₁ + Y₃, Y₃ = (Y₁Y₂ + 3bZ₁)(Y₁Y₂ - 3bZ₁) + 9bX₁X₂ (X₁ + X₂Z₁)

View File

@ -270,7 +270,7 @@ func line_eval_fused_add[F](
F.prod(T.z, C)
G.prod(T.x, D)
H.double(G)
H.diffAlias(F, H)
H.diff(F, H)
H += E
I.prod(T.y, E)

View File

@ -138,3 +138,17 @@ func mulAcc*[T: Ct[uint32]|Ct[uint64]](t, u, v: var T, a, b: T) {.inline.} =
addC(carry, v, v, UV[0], Carry(0))
addC(carry, u, u, UV[1], carry)
t += T(carry)
func mulDoubleAcc*[T: Ct[uint32]|Ct[uint64]](t, u, v: var T, a, b: T) {.inline.} =
## (t, u, v) <- (t, u, v) + 2 * a * b
var UV: array[2, T]
var carry: Carry
mul(UV[1], UV[0], a, b)
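# In a squaring, each cross product a[i]·a[j] with i ≠ j occurs twice,
# hence the partial product is doubled below before accumulation.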
addC(carry, UV[0], UV[0], UV[0], Carry(0))
addC(carry, UV[1], UV[1], UV[1], carry)
t += T(carry)
addC(carry, v, v, UV[0], Carry(0))
addC(carry, u, u, UV[1], carry)
t += T(carry)

View File

@ -40,7 +40,7 @@ type
CarryFlag = "@ccc"
Register* = enum
rbx, rdx, r8, rax
rbx, rdx, r8, rax, xmm0
Constraint* = enum
## GCC extended assembly modifier
@ -369,6 +369,26 @@ func codeFragment(a: var Assembler_x86, instr: string, imm: int, op: Operand) =
a.operands.incl op.desc
func codeFragment(a: var Assembler_x86, instr: string, reg: Register, op: OperandReuse) =
# Generate a code fragment
# ⚠️ Warning:
# The caller must order the destination/source operands
# to match GNU assembly (AT&T) syntax
if a.wordBitWidth == 64:
a.code &= instr & "q %%" & $reg & ", %" & $op.asmId & '\n'
else:
a.code &= instr & "l %%" & $reg & ", %" & $op.asmId & '\n'
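# For example, on a 64-bit target, ("mov", rdx, op) renders as
#   movq %%rdx, %<op.asmId>
# with <op.asmId> standing for the operand's identifier inside the
# extended-asm template (illustrative rendering).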
func codeFragment(a: var Assembler_x86, instr: string, op: OperandReuse, reg: Register) =
# Generate a code fragment
# ⚠️ Warning:
# The caller must order the destination/source operands
# to match GNU assembly (AT&T) syntax
if a.wordBitWidth == 64:
a.code &= instr & "q %" & $op.asmId & ", %%" & $reg & '\n'
else:
a.code &= instr & "l %" & $op.asmId & ", %%" & $reg & '\n'
func codeFragment(a: var Assembler_x86, instr: string, imm: int, reg: Register) =
# Generate a code fragment
# ⚠️ Warning:
@ -605,6 +625,17 @@ func mov*(a: var Assembler_x86, dst: Operand, imm: int) =
a.codeFragment("mov", imm, dst)
# No clobber
func mov*(a: var Assembler_x86, dst: Register, src: OperandReuse) =
## Does: dst <- src with dst a fixed register
a.codeFragment("mov", src, dst)
# No clobber
func mov*(a: var Assembler_x86, dst: OperandReuse, src: Register) =
## Does: dst <- src with src a fixed register
# doAssert dst.desc.constraint in OutputReg, $dst.repr
a.codeFragment("mov", src, dst)
# No clobber
func cmovc*(a: var Assembler_x86, dst, src: Operand) =
## Does: dst <- src if the carry flag is set
doAssert dst.desc.rm in {Reg, ElemsInReg}, "The destination operand must be a register: " & $dst.repr
@ -696,19 +727,38 @@ func mulx*(a: var Assembler_x86, dHi, dLo, src0: Operand, src1: Register) =
a.operands.incl src0.desc
func adcx*(a: var Assembler_x86, dst, src: Operand) =
func mulx*(a: var Assembler_x86, dHi: OperandReuse, dLo, src0: Operand, src1: Register) =
## Does (dHi, dLo) <- src0 * src1
doAssert src1 == rdx, "MULX requires the RDX register"
doAssert dLo.desc.rm in {Reg, ElemsInReg}+SpecificRegisters,
"The destination operand must be a register " & $dLo.repr
doAssert dLo.desc.constraint in OutputReg
let off0 = a.getStrOffset(src0)
# Annoying AT&T syntax: operand order is reversed vs Intel
if a.wordBitWidth == 64:
a.code &= "mulxq " & off0 & ", %" & $dLo.desc.asmId & ", %" & $dHi.asmId & '\n'
else:
a.code &= "mulxl " & off0 & ", %" & $dLo.desc.asmId & ", %" & $dHi.asmId & '\n'
a.operands.incl src0.desc
func adcx*(a: var Assembler_x86, dst: Operand|OperandReuse, src: Operand|OperandReuse) =
## Does: dst <- dst + src + carry
## and only sets the carry flag
doAssert dst.desc.constraint in OutputReg, $dst.repr
doAssert dst.desc.rm in {Reg, ElemsInReg}+SpecificRegisters, "The destination operand must be a register: " & $dst.repr
when dst is Operand:
doAssert dst.desc.constraint in OutputReg, $dst.repr
doAssert dst.desc.rm in {Reg, ElemsInReg}+SpecificRegisters, "The destination operand must be a register: " & $dst.repr
a.codeFragment("adcx", src, dst)
a.areFlagsClobbered = true
func adox*(a: var Assembler_x86, dst, src: Operand) =
func adox*(a: var Assembler_x86, dst: Operand|OperandReuse, src: Operand|OperandReuse) =
## Does: dst <- dst + src + overflow
## and only sets the overflow flag
doAssert dst.desc.constraint in OutputReg, $dst.repr
doAssert dst.desc.rm in {Reg, ElemsInReg}+SpecificRegisters, "The destination operand must be a register: " & $dst.repr
when dst is Operand:
doAssert dst.desc.constraint in OutputReg, $dst.repr
doAssert dst.desc.rm in {Reg, ElemsInReg}+SpecificRegisters, "The destination operand must be a register: " & $dst.repr
a.codeFragment("adox", src, dst)
a.areFlagsClobbered = true
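# adcx and adox use disjoint flags (CF vs OF), and MULX touches no flags at
# all: this is what lets two independent carry chains run interleaved.
# A sketch with this file's DSL (operand names are made up):
#   ctx.mulx(hi, lo, a[1], rdx)   # (hi, lo) <- a[1] * rdx
#   ctx.adcx(sumLo, lo)           # chain 1 accumulates via the carry flag
#   ctx.adox(sumHi, hi)           # chain 2 accumulates via the overflow flag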

View File

@ -87,8 +87,8 @@ func prod_complex(r: var QuadraticExt, a, b: QuadraticExt) =
# TODO: GCC adds an unexplained ~30 cycle overhead to this function
# (~10% slowdown), with no identified cause
when true: # Single-width implementation
# Clang 330 cycles on i9-9980XE @4.1 GHz
when false: # Single-width implementation - BLS12-381
# Clang 348 cycles on i9-9980XE @3.9 GHz
var a0b0 {.noInit.}, a1b1 {.noInit.}: typeof(r.c0)
a0b0.prod(a.c0, b.c0) # [1 Mul]
a1b1.prod(a.c1, b.c1) # [2 Mul]
@ -102,7 +102,7 @@ func prod_complex(r: var QuadraticExt, a, b: QuadraticExt) =
r.c1 -= a1b1 # r1 = (b0 + b1)(a0 + a1) - a0b0 - a1b1 # [3 Mul, 2 Add, 3 Sub]
else: # Double-width implementation with lazy reduction
# Deactivated for now Clang 360 cycles on i9-9980XE @4.1 GHz
# Clang 341 cycles on i9-9980XE @3.9 GHz
var a0b0 {.noInit.}, a1b1 {.noInit.}: doubleWidth(typeof(r.c0))
var d {.noInit.}: doubleWidth(typeof(r.c0))
const msbSet = r.c0.typeof.canUseNoCarryMontyMul()
@ -120,20 +120,19 @@ func prod_complex(r: var QuadraticExt, a, b: QuadraticExt) =
d -= a0b0
d -= a1b1
else:
d.diffNoReduce(d, a0b0) # 10 cycles - cumul 152
d.diffNoReduce(d, a1b1) # 10 cycles - cumul 162
a0b0.diff(a0b0, a1b1) # 18 cycles - cumul 170
r.c0.reduce(a0b0) # 68 cycles - cumul 248
r.c1.reduce(d) # 68 cycles - cumul 316
d.diffNoReduce(d, a0b0) # 11 cycles - cumul 153
d.diffNoReduce(d, a1b1) # 11 cycles - cumul 164
a0b0.diff(a0b0, a1b1) # 19 cycles - cumul 183
r.c0.reduce(a0b0) # 50 cycles - cumul 233
r.c1.reduce(d) # 50 cycles - cumul 288
# Single-width [3 Mul, 2 Add, 3 Sub]
# 3*81 + 2*14 + 3*12 = 307 theoretical cycles
# 330 measured
# 3*88 + 2*14 + 3*14 = 334 theoretical cycles
# 348 measured
# Double-Width
# 316 theoretical cycles
# 365 measured
# Reductions can be 2x10 faster using MCL algorithm
# but there are still unexplained 50 cycles diff between theo and measured
# 288 theoretical cycles
# 329 measured
# Unexplained 40 cycle gap between theoretical and measured,
# and an unexplained 30 cycle gap between Clang and GCC:
# - Function calls?
# - push/pop stack?

View File

@ -168,18 +168,6 @@ func diff*(r: var CubicExt, a, b: CubicExt) =
r.c1.diff(a.c1, b.c1)
r.c2.diff(a.c2, b.c2)
func diffAlias*(r: var QuadraticExt, a, b: QuadraticExt) =
## Diff ``a`` and ``b`` into ``r``
## Handles r and b aliasing
r.c0.diffAlias(a.c0, b.c0)
r.c1.diffAlias(a.c1, b.c1)
func diffAlias*(r: var CubicExt, a, b: CubicExt) =
## Diff ``a`` and ``b`` into ``r``
## Handles r and b aliasing
r.c0.diffAlias(a.c0, b.c0)
r.c1.diffAlias(a.c1, b.c1)
r.c2.diffAlias(a.c2, b.c2)
# Conditional arithmetic
# -------------------------------------------------------------------

View File

@ -196,21 +196,32 @@ func `*=`*(a: var Fp2, _: typedesc[ξ]) {.inline.} =
type
Fp12*[C: static Curve] = object
c0*, c1*, c2*: Fp4[C]
# c0*, c1*: Fp6[C]
γ = NonResidue
# We call the non-residue γ (Gamma) on 𝔽p6 to avoid confusion between the non-residues
# of different tower levels
func `*`*(_: typedesc[γ], a: Fp4): Fp4 {.noInit, inline.} =
## Multiply an element of 𝔽p4 by the sextic non-residue
## chosen to construct 𝔽p12
result.c0 = ξ * a.c1
result.c1 = a.c0
func `*=`*(a: var Fp4, _: typedesc[γ]) {.inline.} =
a = γ * a
func `*`*(_: typedesc[γ], a: Fp6): Fp6 {.noInit, inline.} =
## Multiply an element of 𝔽p6 by the cubic non-residue
## chosen to construct 𝔽p12
## For all curves γ = v with v the factor for 𝔽p6 coordinate
## and v³ = ξ
## (c0 + c1 v + c2 v²) v => ξ c2 + c0 v + c1 v²
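## Derivation: (c0 + c1 v + c2 v²)·v = c0 v + c1 v² + c2 v³, and v³ = ξ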
result.c0 = ξ * a.c1
result.c0 = ξ * a.c2
result.c1 = a.c0
result.c2 = a.c1
func `*=`*(a: var Fp4, _: typedesc[γ]) {.inline.} =
func `*=`*(a: var Fp6, _: typedesc[γ]) {.inline.} =
a = γ * a
# Sparse functions

View File

@ -71,7 +71,7 @@ The CPU Cycle Count is indicative only. It cannot be used to compare across syst
|double* | 7212| 2404000000000.000| 0.003| 0.000|
|sum* | 21058| 7019333333333.333| 0.003| 0.000|
|diff* | 8884| 2961333333333.333| 0.003| 0.000|
|diffAlias* | 10| inf| 0.000| 0.000|
|diff* | 10| inf| 0.000| 0.000|
|double* | 4186| inf| 0.000| 0.000|
|prod* | 14486| 1609555555555.555| 0.009| 0.000|
|square* | 16| inf| 0.000| 0.000|

View File

@ -287,6 +287,28 @@ proc mainMulHigh() =
r.prod_high_words(b, a, 2)
check: bool(r == expected)
proc mainSquare() =
suite "Multi-precision multiplication" & " [" & $WordBitwidth & "-bit mode]":
test "Squaring is consistent with multiplication (rBits = 2*aBits)":
block:
let a = BigInt[200].fromHex"0xDEADBEEF_DEADBEEF_DEADBEEF_DEADBEEF_DEADBEEF_DEADBEEF_DE"
var r_mul, r_sqr: BigInt[400]
r_mul.prod(a, a)
r_sqr.square(a)
check: bool(r_mul == r_sqr)
test "Squaring is consistent with multiplication (truncated)":
block:
let a = BigInt[200].fromHex"0xDEADBEEF_DEADBEEF_DEADBEEF_DEADBEEF_DEADBEEF_DEADBEEF_DE"
var r_mul, r_sqr: BigInt[256]
r_mul.prod(a, a)
r_sqr.square(a)
check: bool(r_mul == r_sqr)
proc mainModular() =
suite "Modular operations - small modulus" & " [" & $WordBitwidth & "-bit mode]":
# Vectors taken from Stint - https://github.com/status-im/nim-stint
@ -637,9 +659,10 @@ proc mainModularInverse() =
check: bool(r == expected)
mainArith()
# mainMul()
# mainMulHigh()
# mainModular()
# mainNeg()
# mainCopySwap()
# mainModularInverse()
mainMul()
mainMulHigh()
mainSquare()
mainModular()
mainNeg()
mainCopySwap()
mainModularInverse()

View File

@ -24,74 +24,100 @@ rng.seed(seed)
echo "\n------------------------------------------------------\n"
echo "test_finite_fields_double_width xoshiro512** seed: ", seed
proc randomCurve(C: static Curve) =
let a = rng.random_unsafe(Fp[C])
let b = rng.random_unsafe(Fp[C])
template mulTest(rng_gen: untyped): untyped =
proc `mul _ rng_gen`(C: static Curve) =
let a = rng_gen(rng, Fp[C])
let b = rng.random_unsafe(Fp[C])
var r_fp, r_fpDbl: Fp[C]
var tmpDbl: FpDbl[C]
var r_fp{.noInit.}, r_fpDbl{.noInit.}: Fp[C]
var tmpDbl{.noInit.}: FpDbl[C]
r_fp.prod(a, b)
tmpDbl.mulNoReduce(a, b)
r_fpDbl.reduce(tmpDbl)
r_fp.prod(a, b)
tmpDbl.mulNoReduce(a, b)
r_fpDbl.reduce(tmpDbl)
doAssert bool(r_fp == r_fpDbl)
doAssert bool(r_fp == r_fpDbl)
proc randomHighHammingWeight(C: static Curve) =
let a = rng.random_highHammingWeight(Fp[C])
let b = rng.random_highHammingWeight(Fp[C])
template sqrTest(rng_gen: untyped): untyped =
proc `sqr _ rng_gen`(C: static Curve) =
let a = rng_gen(rng, Fp[C])
var r_fp, r_fpDbl: Fp[C]
var tmpDbl: FpDbl[C]
var mulDbl{.noInit.}, sqrDbl{.noInit.}: FpDbl[C]
r_fp.prod(a, b)
tmpDbl.mulNoReduce(a, b)
r_fpDbl.reduce(tmpDbl)
mulDbl.mulNoReduce(a, a)
sqrDbl.squareNoReduce(a)
doAssert bool(r_fp == r_fpDbl)
doAssert bool(mulDbl == sqrDbl)
proc random_long01Seq(C: static Curve) =
let a = rng.random_long01Seq(Fp[C])
let b = rng.random_long01Seq(Fp[C])
var r_fp, r_fpDbl: Fp[C]
var tmpDbl: FpDbl[C]
r_fp.prod(a, b)
tmpDbl.mulNoReduce(a, b)
r_fpDbl.reduce(tmpDbl)
doAssert bool(r_fp == r_fpDbl)
mulTest(random_unsafe)
mulTest(randomHighHammingWeight)
mulTest(random_long01Seq)
sqrTest(random_unsafe)
sqrTest(randomHighHammingWeight)
sqrTest(random_long01Seq)
suite "Field Multiplication via double-width field elements is consistent with single-width." & " [" & $WordBitwidth & "-bit mode]":
test "With P-224 field modulus":
for _ in 0 ..< Iters:
randomCurve(P224)
mul_random_unsafe(P224)
for _ in 0 ..< Iters:
randomHighHammingWeight(P224)
mul_randomHighHammingWeight(P224)
for _ in 0 ..< Iters:
random_long01Seq(P224)
mul_random_long01Seq(P224)
test "With P-256 field modulus":
for _ in 0 ..< Iters:
randomCurve(P256)
mul_random_unsafe(P256)
for _ in 0 ..< Iters:
randomHighHammingWeight(P256)
mul_randomHighHammingWeight(P256)
for _ in 0 ..< Iters:
random_long01Seq(P256)
mul_random_long01Seq(P256)
test "With BN254_Snarks field modulus":
for _ in 0 ..< Iters:
randomCurve(BN254_Snarks)
mul_random_unsafe(BN254_Snarks)
for _ in 0 ..< Iters:
randomHighHammingWeight(BN254_Snarks)
mul_randomHighHammingWeight(BN254_Snarks)
for _ in 0 ..< Iters:
random_long01Seq(BN254_Snarks)
mul_random_long01Seq(BN254_Snarks)
test "With BLS12_381 field modulus":
for _ in 0 ..< Iters:
randomCurve(BLS12_381)
mul_random_unsafe(BLS12_381)
for _ in 0 ..< Iters:
randomHighHammingWeight(BLS12_381)
mul_randomHighHammingWeight(BLS12_381)
for _ in 0 ..< Iters:
random_long01Seq(BLS12_381)
mul_random_long01Seq(BLS12_381)
suite "Field Squaring via double-width field elements is consistent with single-width." & " [" & $WordBitwidth & "-bit mode]":
test "With P-224 field modulus":
for _ in 0 ..< Iters:
sqr_random_unsafe(P224)
for _ in 0 ..< Iters:
sqr_randomHighHammingWeight(P224)
for _ in 0 ..< Iters:
sqr_random_long01Seq(P224)
test "With P-256 field modulus":
for _ in 0 ..< Iters:
sqr_random_unsafe(P256)
for _ in 0 ..< Iters:
sqr_randomHighHammingWeight(P256)
for _ in 0 ..< Iters:
sqr_random_long01Seq(P256)
test "With BN254_Snarks field modulus":
for _ in 0 ..< Iters:
sqr_random_unsafe(BN254_Snarks)
for _ in 0 ..< Iters:
sqr_randomHighHammingWeight(BN254_Snarks)
for _ in 0 ..< Iters:
sqr_random_long01Seq(BN254_Snarks)
test "With BLS12_381 field modulus":
for _ in 0 ..< Iters:
sqr_random_unsafe(BLS12_381)
for _ in 0 ..< Iters:
sqr_randomHighHammingWeight(BLS12_381)
for _ in 0 ..< Iters:
sqr_random_long01Seq(BLS12_381)

View File

@ -155,8 +155,9 @@ proc subTests(gmpRng: var gmp_randstate_t, a, b, p, r: var mpz_t, C: static Curv
var r2Test = aTest
r2Test -= bTest
# Subtraction with r and b aliasing
var r3Test = bTest
r3Test.diffAlias(aTest, r3Test)
r3Test.diff(aTest, r3Test)
binary_epilogue(r, a, b, rTest, aBuf, bBuf, "Subtraction (with result)")
binary_epilogue(r, a, b, r2Test, aBuf, bBuf, "Subtraction (in-place)")