From 8918cabb56569f37a741f900c3583e0545f70c82 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mamy=20Andr=C3=A9-Ratsimbazafy?= <mamy_github@numforge.co>
Date: Mon, 15 Feb 2021 20:38:12 +0100
Subject: [PATCH] Cleanup: introduce clobbered registers, remove explicit rax,
 rdx for multiplication (minus 30-50 lines for related assembly files)

---
 .../assembly/limbs_asm_montmul_x86.nim        |  61 ++--
 .../limbs_asm_montmul_x86_adx_bmi2.nim        |  41 +--
 .../assembly/limbs_asm_montred_x86.nim        | 143 ++++------
 .../limbs_asm_montred_x86_adx_bmi2.nim        | 121 +++-----
 .../arithmetic/assembly/limbs_asm_mul_x86.nim |  78 +-----
 .../assembly/limbs_asm_mul_x86_adx_bmi2.nim   |  66 ++---
 .../primitives/macro_assembler_x86.nim        | 263 +++++++++++++++---
 7 files changed, 393 insertions(+), 380 deletions(-)

diff --git a/constantine/arithmetic/assembly/limbs_asm_montmul_x86.nim b/constantine/arithmetic/assembly/limbs_asm_montmul_x86.nim
index 9280f16..0af7fba 100644
--- a/constantine/arithmetic/assembly/limbs_asm_montmul_x86.nim
+++ b/constantine/arithmetic/assembly/limbs_asm_montmul_x86.nim
@@ -57,25 +57,6 @@ macro montMul_CIOS_nocarry_gen[N: static int](r_MM: var Limbs[N], a_MM, b_MM, M_
     scratch = init(OperandArray, nimSymbol = ident"scratch", scratchSlots, ElemsInReg, InputOutput_EnsureClobber)
 
     # MUL requires RAX and RDX
-    rRAX = Operand(
-      desc: OperandDesc(
-        asmId: "[rax]",
-        nimSymbol: ident"rax",
-        rm: RAX,
-        constraint: Output_EarlyClobber,
-        cEmit: "rax"
-      )
-    )
-
-    rRDX = Operand(
-      desc: OperandDesc(
-        asmId: "[rdx]",
-        nimSymbol: ident"rdx",
-        rm: RDX,
-        constraint: Output_EarlyClobber,
-        cEmit: "rdx"
-      )
-    )
 
     m0ninv = Operand(
                desc: OperandDesc(
@@ -109,16 +90,12 @@ macro montMul_CIOS_nocarry_gen[N: static int](r_MM: var Limbs[N], a_MM, b_MM, M_
 
   let tsym = t.nimSymbol
   let scratchSym = scratch.nimSymbol
-  let eax = rRAX.desc.nimSymbol
-  let edx = rRDX.desc.nimSymbol
   result.add quote do:
     static: doAssert: sizeof(SecretWord) == sizeof(ByteAddress)
 
     var `tsym`: typeof(`r_MM`) # zero init
     # Assumes 64-bit limbs on 64-bit arch (or you can't store an address)
     var `scratchSym` {.noInit.}: Limbs[`scratchSlots`]
-    var `eax`{.noInit.}, `edx`{.noInit.}: BaseType
-
     `scratchSym`[0] = cast[SecretWord](`a_MM`[0].unsafeAddr)
     `scratchSym`[1] = cast[SecretWord](`b_MM`[0].unsafeAddr)
     `scratchSym`[5] = cast[SecretWord](`r_MM`[0].unsafeAddr)
@@ -140,14 +117,14 @@ macro montMul_CIOS_nocarry_gen[N: static int](r_MM: var Limbs[N], a_MM, b_MM, M_
 
   for i in 0 ..< N:
     # (A, t[0]) <- a[0] * b[i] + t[0]
-    ctx.mov rRAX, a[0]
+    ctx.mov rax, a[0]
     ctx.mul rdx, rax, b[i], rax
     if i == 0: # overwrite t[0]
-      ctx.mov t[0], rRAX
+      ctx.mov t[0], rax
     else:      # Accumulate in t[0]
-      ctx.add t[0], rRAX
-      ctx.adc rRDX, 0
-    ctx.mov A, rRDX
+      ctx.add t[0], rax
+      ctx.adc rdx, 0
+    ctx.mov A, rdx
 
     # m        <- (t[0] * m0ninv) mod 2^w
     ctx.mov m, m0ninv
@@ -155,39 +132,39 @@ macro montMul_CIOS_nocarry_gen[N: static int](r_MM: var Limbs[N], a_MM, b_MM, M_
 
     # (C, _)    <- m * M[0] + t[0]
     ctx.`xor` C, C
-    ctx.mov rRAX, M[0]
+    ctx.mov rax, M[0]
     ctx.mul rdx, rax, m, rax
-    ctx.add rRAX, t[0]
-    ctx.adc C, rRDX
+    ctx.add rax, t[0]
+    ctx.adc C, rdx
 
     for j in 1 ..< N:
       # (A, t[j])   <- a[j] * b[i] + A + t[j]
-      ctx.mov rRAX, a[j]
+      ctx.mov rax, a[j]
       ctx.mul rdx, rax, b[i], rax
       if i == 0:
         ctx.mov t[j], A
       else:
         ctx.add t[j], A
-        ctx.adc rRDX, 0
+        ctx.adc rdx, 0
       ctx.`xor` A, A
-      ctx.add t[j], rRAX
-      ctx.adc A, rRDX
+      ctx.add t[j], rax
+      ctx.adc A, rdx
 
       # (C, t[j-1]) <- m * M[j] + C + t[j]
-      ctx.mov rRAX, M[j]
+      ctx.mov rax, M[j]
       ctx.mul rdx, rax, m, rax
       ctx.add C, t[j]
-      ctx.adc rRDX, 0
-      ctx.add C, rRAX
-      ctx.adc rRDX, 0
+      ctx.adc rdx, 0
+      ctx.add C, rax
+      ctx.adc rdx, 0
       ctx.mov t[j-1], C
-      ctx.mov C, rRDX
+      ctx.mov C, rdx
 
     ctx.add A, C
     ctx.mov t[N-1], A
 
-  ctx.mov rRDX, r
-  let r2 = rRDX.asArrayAddr(len = N)
+  ctx.mov rdx, r
+  let r2 = rdx.asArrayAddr(len = N)
 
   ctx.finalSubNoCarry(
     r2, t, M,
diff --git a/constantine/arithmetic/assembly/limbs_asm_montmul_x86_adx_bmi2.nim b/constantine/arithmetic/assembly/limbs_asm_montmul_x86_adx_bmi2.nim
index 9d058df..c32dc90 100644
--- a/constantine/arithmetic/assembly/limbs_asm_montmul_x86_adx_bmi2.nim
+++ b/constantine/arithmetic/assembly/limbs_asm_montmul_x86_adx_bmi2.nim
@@ -40,7 +40,7 @@ proc mulx_by_word(
        t: OperandArray,
        a: Operand, # Pointer in scratchspace
        word0: Operand,
-       lo, rRDX: Operand
+       lo: Operand
      ) =
   ## Multiply the `a[0..<N]` by `word` and store in `t[0..<N]`
   ## and carry register `C` (t[N])
@@ -55,7 +55,7 @@ proc mulx_by_word(
   #  (C,t[j])  := t[j] + a[j]*b[i] + C
 
   # First limb
-  ctx.mov rRDX, word0
+  ctx.mov rdx, word0
   if N > 1:
     ctx.mulx t[1], t[0], a[0], rdx
     ctx.`xor` hi, hi # Clear flags - TODO: necessary?
@@ -87,20 +87,19 @@ proc mulaccx_by_word(
        a: Operand, # Pointer in scratchspace
        i: int,
        word: Operand,
-       lo, rRDX: Operand
+       lo: Operand
      ) =
   ## Multiply the `a[0..<N]` by `word`
   ## and accumulate in `t[0..<N]`
   ## and carry register `C` (t[N])
   ## `t` and `C` are multiply-accumulated
   ## `S` is a scratchspace register
-  ## `rRDX` is the RDX register descriptor
   let N = min(a.len, t.len)
 
   doAssert i != 0
 
   ctx.comment "  Outer loop i = " & $i & ", j in [0, " & $N & ")"
-  ctx.mov rRDX, word
+  ctx.mov rdx, word
   ctx.`xor` hi, hi # Clear flags - TODO: necessary?
 
   # for j=0 to N-1
@@ -119,9 +118,9 @@ proc mulaccx_by_word(
 
   # Final carries
   ctx.comment "  Accumulate last carries in hi word"
-  ctx.mov  rRDX, 0 # Set to 0 without clearing flags
-  ctx.adcx hi, rRDX
-  ctx.adox hi, rRDX
+  ctx.mov  rdx, 0 # Set to 0 without clearing flags
+  ctx.adcx hi, rdx
+  ctx.adox hi, rdx
 
 proc partialRedx(
        ctx: var Assembler_x86,
@@ -129,7 +128,7 @@ proc partialRedx(
        t: OperandArray,
        M: OperandArray,
        m0ninv: Operand,
-       lo, S, rRDX: Operand
+       lo, S: Operand
      ) =
     ## Partial Montgomery reduction
     ## For CIOS method
@@ -145,8 +144,8 @@ proc partialRedx(
     # m = t[0] * m0ninv mod 2^w
     ctx.comment "  Reduction"
     ctx.comment "  m = t[0] * m0ninv mod 2^w"
-    ctx.mov  rRDX, t[0]
-    ctx.mulx S, rRDX, m0ninv, rdx # (S, RDX) <- m0ninv * RDX
+    ctx.mov  rdx, t[0]
+    ctx.mulx S, rdx, m0ninv, rdx # (S, RDX) <- m0ninv * RDX
 
     # Clear carry flags - TODO: necessary?
     ctx.`xor` S, S
@@ -194,16 +193,7 @@ macro montMul_CIOS_nocarry_adx_bmi2_gen[N: static int](r_MM: var Limbs[N], a_MM,
     # MultiPurpose Register slots
     scratch = init(OperandArray, nimSymbol = ident"scratch", scratchSlots, ElemsInReg, InputOutput_EnsureClobber)
 
-    # MULX requires RDX
-    rRDX = Operand(
-      desc: OperandDesc(
-        asmId: "[rdx]",
-        nimSymbol: ident"rdx",
-        rm: RDX,
-        constraint: Output_EarlyClobber,
-        cEmit: "rdx"
-      )
-    )
+    # MULX requires RDX as well
 
     a = scratch[0].asArrayAddr(len = N) # Store the `a` operand
     b = scratch[1].asArrayAddr(len = N) # Store the `b` operand
@@ -225,15 +215,12 @@ macro montMul_CIOS_nocarry_adx_bmi2_gen[N: static int](r_MM: var Limbs[N], a_MM,
 
   let tsym = t.nimSymbol
   let scratchSym = scratch.nimSymbol
-  let edx = rRDX.desc.nimSymbol
   result.add quote do:
     static: doAssert: sizeof(SecretWord) == sizeof(ByteAddress)
 
     var `tsym`: typeof(`r_MM`) # zero init
     # Assumes 64-bit limbs on 64-bit arch (or you can't store an address)
     var `scratchSym` {.noInit.}: Limbs[`scratchSlots`]
-    var `edx`{.noInit.}: BaseType
-
     `scratchSym`[0] = cast[SecretWord](`a_MM`[0].unsafeAddr)
     `scratchSym`[1] = cast[SecretWord](`b_MM`[0].unsafeAddr)
     `scratchSym`[4] = SecretWord `m0ninv_MM`
@@ -258,20 +245,20 @@ macro montMul_CIOS_nocarry_adx_bmi2_gen[N: static int](r_MM: var Limbs[N], a_MM,
         A, t,
         a,
         b[0],
-        C, rRDX
+        C
       )
     else:
       ctx.mulaccx_by_word(
         A, t,
         a, i,
         b[i],
-        C, rRDX
+        C
       )
 
     ctx.partialRedx(
       A, t,
       M, m0ninv,
-      lo, C, rRDX
+      lo, C
     )
 
   ctx.finalSubNoCarry(
diff --git a/constantine/arithmetic/assembly/limbs_asm_montred_x86.nim b/constantine/arithmetic/assembly/limbs_asm_montred_x86.nim
index 8e48de7..e8da3cc 100644
--- a/constantine/arithmetic/assembly/limbs_asm_montred_x86.nim
+++ b/constantine/arithmetic/assembly/limbs_asm_montred_x86.nim
@@ -41,7 +41,7 @@ proc finalSubNoCarry*(
       ctx.sbb scratch[i], M[i]
 
   # If we borrowed it means that we were smaller than
-  # the modulus and we don'a need "scratch"
+  # the modulus and we don't need "scratch"
   for i in 0 ..< N:
     ctx.cmovnc a[i], scratch[i]
     ctx.mov r[i], a[i]
@@ -50,7 +50,7 @@ proc finalSubCanOverflow*(
        ctx: var Assembler_x86,
        r: Operand or OperandArray,
        a, M, scratch: OperandArray,
-       overflowReg: Operand
+       overflowReg: Operand or Register
      ) =
   ## Reduce `a` into `r` modulo `M`
   ## To be used when the final substraction can
@@ -74,7 +74,7 @@ proc finalSubCanOverflow*(
   ctx.sbb overflowReg, 0
 
   # If we borrowed it means that we were smaller than
-  # the modulus and we don'a need "scratch"
+  # the modulus and we don't need "scratch"
   for i in 0 ..< N:
     ctx.cmovnc a[i], scratch[i]
     ctx.mov r[i], a[i]
@@ -90,59 +90,37 @@ macro montyRedc2x_gen[N: static int](
        m0ninv_MR: BaseType,
        spareBits: static int
       ) =
-  # TODO, slower than Clang, in particular due to the shadowing
-
   result = newStmtList()
 
   var ctx = init(Assembler_x86, BaseType)
+  # On x86, compilers only let us use 15 out of 16 registers
+  # RAX and RDX are de facto used due to the MUL instructions
+  # so we store everything in scratchspaces restoring as needed
   let
     # We could force M as immediate by specializing per moduli
     M = init(OperandArray, nimSymbol = M_MR, N, PointerInReg, Input)
-
     # MUL requires RAX and RDX
-    rRAX = Operand(
-      desc: OperandDesc(
-        asmId: "[rax]",
-        nimSymbol: ident"rax",
-        rm: RAX,
-        constraint: InputOutput_EnsureClobber,
-        cEmit: "rax"
-      )
-    )
 
-    rRDX = Operand(
-      desc: OperandDesc(
-        asmId: "[rdx]",
-        nimSymbol: ident"rdx",
-        rm: RDX,
-        constraint: Output_EarlyClobber,
-        cEmit: "rdx"
-      )
-    )
+  let uSlots = N+2
+  let vSlots = max(N-2, 3)
 
-    m0ninv = Operand(
-      desc: OperandDesc(
-        asmId: "[m0ninv]",
-        nimSymbol: m0ninv_MR,
-        rm: Reg,
-        constraint: Input,
-        cEmit: "m0ninv"
-      )
-    )
-
-
-  let scratchSlots = N+2
-  var scratch = init(OperandArray, nimSymbol = ident"scratch", scratchSlots, ElemsInReg, InputOutput_EnsureClobber)
+  var # Scratchspaces
+    u = init(OperandArray, nimSymbol = ident"U", uSlots, ElemsInReg, InputOutput_EnsureClobber)
+    v = init(OperandArray, nimSymbol = ident"V", vSlots, ElemsInReg, InputOutput_EnsureClobber)
 
   # Prologue
-  let eax = rRAX.desc.nimSymbol
-  let edx = rRDX.desc.nimSymbol
-  let scratchSym = scratch.nimSymbol
+  let usym = u.nimSymbol
+  let vsym = v.nimSymbol
   result.add quote do:
-    static: doAssert: sizeof(SecretWord) == sizeof(ByteAddress)
+    var `usym`{.noinit.}: Limbs[`uSlots`]
+    var `vsym` {.noInit.}: Limbs[`vSlots`]
+    `vsym`[0] = cast[SecretWord](`r_MR`[0].unsafeAddr)
+    `vsym`[1] = cast[SecretWord](`a_MR`[0].unsafeAddr)
+    `vsym`[2] = SecretWord(`m0ninv_MR`)
 
-    var `eax`{.noInit.}, `edx`{.noInit.}: BaseType
-    var `scratchSym` {.noInit.}: Limbs[`scratchSlots`]
+  let r_temp = v[0].asArrayAddr(len = N)
+  let a = v[1].asArrayAddr(len = 2*N)
+  let m0ninv = v[2]
 
   # Algorithm
   # ---------------------------------------------------------
@@ -161,85 +139,76 @@ macro montyRedc2x_gen[N: static int](
   # No register spilling handling
   doAssert N <= 6, "The Assembly-optimized montgomery multiplication requires at most 6 limbs."
 
-  result.add quote do:
-    `eax` = BaseType `a_MR`[0]
-    staticFor i, 0, `N`: # Do NOT use Nim slice/toOpenArray, they are not inlined
-      `scratchSym`[i] = `a_MR`[i]
+  for i in 0 ..< N:
+    ctx.mov u[i], a[i]
 
-  ctx.mov scratch[N], rRAX
-  ctx.imul rRAX, m0ninv    # m <- a[i] * m0ninv mod 2^w
-  ctx.mov scratch[0], rRAX
+  ctx.mov u[N], u[0]
+  ctx.imul u[0], m0ninv    # m <- a[i] * m0ninv mod 2^w
+  ctx.mov rax, u[0]
 
   # scratch: [a[0] * m0, a[1], a[2], a[3], a[0]] for 4 limbs
 
   for i in 0 ..< N:
     ctx.comment ""
-    let hi = scratch[N]
-    let next = scratch[N+1]
+    let hi = u[N]
+    let next = u[N+1]
 
     ctx.mul rdx, rax, M[0], rax
-    ctx.add hi, rRAX # Guaranteed to be zero
-    ctx.mov rRAX, scratch[0]
-    ctx.adc hi, rRDX
+    ctx.add hi, rax # Guaranteed to be zero
+    ctx.mov rax, u[0]
+    ctx.adc hi, rdx
 
     for j in 1 ..< N-1:
       ctx.comment ""
       ctx.mul rdx, rax, M[j], rax
-      ctx.add scratch[j], rRAX
-      ctx.mov rRAX, scratch[0]
-      ctx.adc rRDX, 0
-      ctx.add scratch[j], hi
-      ctx.adc rRDX, 0
-      ctx.mov hi, rRDX
+      ctx.add u[j], rax
+      ctx.mov rax, u[0]
+      ctx.adc rdx, 0
+      ctx.add u[j], hi
+      ctx.adc rdx, 0
+      ctx.mov hi, rdx
 
     # Next load
     if i < N-1:
       ctx.comment ""
-      ctx.mov next, scratch[1]
-      ctx.imul scratch[1], m0ninv
+      ctx.mov next, u[1]
+      ctx.imul u[1], m0ninv
       ctx.comment ""
 
     # Last limb
     ctx.comment ""
     ctx.mul rdx, rax, M[N-1], rax
-    ctx.add scratch[N-1], rRAX
-    ctx.mov rRAX, scratch[1] # Contains next * m0
-    ctx.adc rRDX, 0
-    ctx.add scratch[N-1], hi
-    ctx.adc rRDX, 0
-    ctx.mov hi, rRDX
+    ctx.add u[N-1], rax
+    ctx.mov rax, u[1] # Contains next * m0
+    ctx.adc rdx, 0
+    ctx.add u[N-1], hi
+    ctx.adc rdx, 0
+    ctx.mov hi, rdx
 
-    scratch.rotateLeft()
+    u.rotateLeft()
 
-  # Code generation
-  result.add ctx.generate()
+  # Second part - Final subtraction
+  # ---------------------------------------------
 
-  # New codegen
-  ctx = init(Assembler_x86, BaseType)
-
-  let r = init(OperandArray, nimSymbol = r_MR, N, PointerInReg, InputOutput_EnsureClobber)
-  let a = init(OperandArray, nimSymbol = a_MR, N*2, PointerInReg, Input)
-  let extraRegNeeded = N-2
-  let t = init(OperandArray, nimSymbol = ident"t", extraRegNeeded, ElemsInReg, InputOutput_EnsureClobber)
-  let tsym = t.nimSymbol
-  result.add quote do:
-    var `tsym` {.noInit.}: Limbs[`extraRegNeeded`]
+  ctx.mov rdx, r_temp
+  let r = rdx.asArrayAddr(len = N)
 
   # This does a[i+n] += hi
   # but in a separate carry chain, fused with the
   # copy "r[i] = a[i+n]"
   for i in 0 ..< N:
     if i == 0:
-      ctx.add scratch[i], a[i+N]
+      ctx.add u[i], a[i+N]
     else:
-      ctx.adc scratch[i], a[i+N]
+      ctx.adc u[i], a[i+N]
 
-  let reuse = repackRegisters(t, scratch[N], scratch[N+1])
+  let t = repackRegisters(v, u[N], u[N+1])
 
+  # v is invalidated
   if spareBits >= 1:
-    ctx.finalSubNoCarry(r, scratch, M, reuse)
+    ctx.finalSubNoCarry(r, u, M, t)
   else:
-    ctx.finalSubCanOverflow(r, scratch, M, reuse, rRAX)
+    ctx.finalSubCanOverflow(r, u, M, t, rax)
 
   # Code generation
   result.add ctx.generate()
diff --git a/constantine/arithmetic/assembly/limbs_asm_montred_x86_adx_bmi2.nim b/constantine/arithmetic/assembly/limbs_asm_montred_x86_adx_bmi2.nim
index c4e6520..bd4bf68 100644
--- a/constantine/arithmetic/assembly/limbs_asm_montred_x86_adx_bmi2.nim
+++ b/constantine/arithmetic/assembly/limbs_asm_montred_x86_adx_bmi2.nim
@@ -35,15 +35,13 @@ static: doAssert UseASM_X86_64
 # Montgomery reduction
 # ------------------------------------------------------------
 
-macro montyRedc2xx_gen[N: static int](
+macro montyRedc2x_gen[N: static int](
        r_MR: var array[N, SecretWord],
        a_MR: array[N*2, SecretWord],
        M_MR: array[N, SecretWord],
        m0ninv_MR: BaseType,
        spareBits: static int
       ) =
-  # TODO, slower than Clang, in particular due to the shadowing
-
   result = newStmtList()
 
   var ctx = init(Assembler_x86, BaseType)
@@ -51,59 +49,29 @@ macro montyRedc2xx_gen[N: static int](
     # We could force M as immediate by specializing per moduli
     M = init(OperandArray, nimSymbol = M_MR, N, PointerInReg, Input)
 
-    hi = Operand(
-      desc: OperandDesc(
-        asmId: "[hi]",
-        nimSymbol: ident"hi",
-        rm: Reg,
-        constraint: Output_EarlyClobber,
-        cEmit: "hi"
-      )
-    )
+  let uSlots = N+1
+  let vSlots = max(N-1, 5)
 
-    lo = Operand(
-      desc: OperandDesc(
-        asmId: "[lo]",
-        nimSymbol: ident"lo",
-        rm: Reg,
-        constraint: Output_EarlyClobber,
-        cEmit: "lo"
-      )
-    )
-
-    rRDX = Operand(
-      desc: OperandDesc(
-        asmId: "[rdx]",
-        nimSymbol: ident"rdx",
-        rm: RDX,
-        constraint: InputOutput_EnsureClobber,
-        cEmit: "rdx"
-      )
-    )
-
-    m0ninv = Operand(
-      desc: OperandDesc(
-        asmId: "[m0ninv]",
-        nimSymbol: m0ninv_MR,
-        rm: Reg,
-        constraint: Input,
-        cEmit: "m0ninv"
-      )
-    )
-
-  let scratchSlots = N+1
-  var scratch = init(OperandArray, nimSymbol = ident"scratch", scratchSlots, ElemsInReg, InputOutput_EnsureClobber)
+  var # Scratchspaces
+    u = init(OperandArray, nimSymbol = ident"U", uSlots, ElemsInReg, InputOutput_EnsureClobber)
+    v = init(OperandArray, nimSymbol = ident"V", vSlots, ElemsInReg, InputOutput_EnsureClobber)
 
   # Prologue
-  let edx = rRDX.desc.nimSymbol
-  let hisym = hi.desc.nimSymbol
-  let losym = lo.desc.nimSymbol
-  let scratchSym = scratch.nimSymbol
+  let usym = u.nimSymbol
+  let vsym = v.nimSymbol
   result.add quote do:
     static: doAssert: sizeof(SecretWord) == sizeof(ByteAddress)
+    var `usym`{.noinit.}: Limbs[`uSlots`]
+    var `vsym` {.noInit.}: Limbs[`vSlots`]
+    `vsym`[0] = cast[SecretWord](`r_MR`[0].unsafeAddr)
+    `vsym`[1] = cast[SecretWord](`a_MR`[0].unsafeAddr)
+    `vsym`[2] = SecretWord(`m0ninv_MR`)
 
-    var `hisym`{.noInit.}, `losym`{.noInit.}, `edx`{.noInit.}: BaseType
-    var `scratchSym` {.noInit.}: Limbs[`scratchSlots`]
+  let r_temp = v[0].asArrayAddr(len = N)
+  let a = v[1].asArrayAddr(len = 2*N)
+  let m0ninv = v[2]
+  let lo = v[3]
+  let hi = v[4]
 
   # Algorithm
   # ---------------------------------------------------------
@@ -122,63 +90,52 @@ macro montyRedc2xx_gen[N: static int](
   # No register spilling handling
   doAssert N <= 6, "The Assembly-optimized montgomery multiplication requires at most 6 limbs."
 
-  result.add quote do:
-    `edx` = BaseType(`m0ninv_MR`)
-    staticFor i, 0, `N`: # Do NOT use Nim slice/toOpenArray, they are not inlined
-      `scratchSym`[i] = `a_MR`[i]
+  ctx.mov rdx, m0ninv
+
+  for i in 0 ..< N:
+    ctx.mov u[i], a[i]
 
   for i in 0 ..< N:
     # RDX contains m0ninv at the start of each loop
     ctx.comment ""
-    ctx.imul rRDX, scratch[0] # m <- a[i] * m0ninv mod 2^w
+    ctx.imul rdx, u[0] # m <- a[i] * m0ninv mod 2^w
     ctx.comment "---- Reduction " & $i
-    ctx.`xor` scratch[N], scratch[N]
+    ctx.`xor` u[N], u[N]
 
     for j in 0 ..< N-1:
       ctx.comment ""
       ctx.mulx hi, lo, M[j], rdx
-      ctx.adcx scratch[j], lo
-      ctx.adox scratch[j+1], hi
+      ctx.adcx u[j], lo
+      ctx.adox u[j+1], hi
 
     # Last limb
     ctx.comment ""
     ctx.mulx hi, lo, M[N-1], rdx
-    ctx.mov rRDX, m0ninv # Reload m0ninv for next iter
-    ctx.adcx scratch[N-1], lo
-    ctx.adox hi, scratch[N]
-    ctx.adcx scratch[N], hi
+    ctx.mov rdx, m0ninv # Reload m0ninv for next iter
+    ctx.adcx u[N-1], lo
+    ctx.adox hi, u[N]
+    ctx.adcx u[N], hi
 
-    scratch.rotateLeft()
+    u.rotateLeft()
 
-  # Code generation
-  result.add ctx.generate()
-
-  # New codegen
-  ctx = init(Assembler_x86, BaseType)
-
-  let r = init(OperandArray, nimSymbol = r_MR, N, PointerInReg, InputOutput_EnsureClobber)
-  let a = init(OperandArray, nimSymbol = a_MR, N*2, PointerInReg, Input)
-  let extraRegNeeded = N-1
-  let t = init(OperandArray, nimSymbol = ident"t", extraRegNeeded, ElemsInReg, InputOutput_EnsureClobber)
-  let tsym = t.nimSymbol
-  result.add quote do:
-    var `tsym` {.noInit.}: Limbs[`extraRegNeeded`]
+  ctx.mov rdx, r_temp
+  let r = rdx.asArrayAddr(len = N)
 
   # This does a[i+n] += hi
   # but in a separate carry chain, fused with the
   # copy "r[i] = a[i+n]"
   for i in 0 ..< N:
     if i == 0:
-      ctx.add scratch[i], a[i+N]
+      ctx.add u[i], a[i+N]
     else:
-      ctx.adc scratch[i], a[i+N]
+      ctx.adc u[i], a[i+N]
 
-  let reuse = repackRegisters(t, scratch[N])
+  let t = repackRegisters(v, u[N])
 
   if spareBits >= 1:
-    ctx.finalSubNoCarry(r, scratch, M, reuse)
+    ctx.finalSubNoCarry(r, u, M, t)
   else:
-    ctx.finalSubCanOverflow(r, scratch, M, reuse, hi)
+    ctx.finalSubCanOverflow(r, u, M, t, hi)
 
   # Code generation
   result.add ctx.generate()
@@ -191,4 +148,4 @@ func montRed_asm_adx_bmi2*[N: static int](
        spareBits: static int
       ) =
   ## Constant-time Montgomery reduction
-  montyRedc2xx_gen(r, a, M, m0ninv, spareBits)
+  montyRedc2x_gen(r, a, M, m0ninv, spareBits)
diff --git a/constantine/arithmetic/assembly/limbs_asm_mul_x86.nim b/constantine/arithmetic/assembly/limbs_asm_mul_x86.nim
index 547a5f7..0863155 100644
--- a/constantine/arithmetic/assembly/limbs_asm_mul_x86.nim
+++ b/constantine/arithmetic/assembly/limbs_asm_mul_x86.nim
@@ -81,36 +81,13 @@ macro mul_gen[rLen, aLen, bLen: static int](r: var Limbs[rLen], a: Limbs[aLen],
     )
 
     # MUL requires RAX and RDX
-    rRAX = Operand(
-      desc: OperandDesc(
-        asmId: "[rax]",
-        nimSymbol: ident"rax",
-        rm: RAX,
-        constraint: Output_EarlyClobber,
-        cEmit: "rax"
-      )
-    )
-
-    rRDX = Operand(
-      desc: OperandDesc(
-        asmId: "[rdx]",
-        nimSymbol: ident"rdx",
-        rm: RDX,
-        constraint: Output_EarlyClobber,
-        cEmit: "rdx"
-      )
-    )
-
 
   # Prologue
   let tsym = t.desc.nimSymbol
   let usym = u.desc.nimSymbol
   let vsym = v.desc.nimSymbol
-  let eax = rRAX.desc.nimSymbol
-  let edx = rRDX.desc.nimSymbol
   result.add quote do:
     var `tsym`{.noInit.}, `usym`{.noInit.}, `vsym`{.noInit.}: BaseType # zero-init
-    var `eax`{.noInit.}, `edx`{.noInit.}: BaseType
 
   # Algorithm
   ctx.`xor` u, u
@@ -127,10 +104,10 @@ macro mul_gen[rLen, aLen, bLen: static int](r: var Limbs[rLen], a: Limbs[aLen],
     let ia = i - ib
     for j in 0 ..< min(aLen - ia, ib+1):
       # (t, u, v) <- (t, u, v) + a[ia+j] * b[ib-j]
-      ctx.mov rRAX, arrB[ib-j]
+      ctx.mov rax, arrB[ib-j]
       ctx.mul rdx, rax, arrA[ia+j], rax
-      ctx.add v, rRAX
-      ctx.adc u, rRDX
+      ctx.add v, rax
+      ctx.adc u, rdx
       ctx.adc t, 0
 
     ctx.mov arrR[i], v
@@ -141,9 +118,9 @@ macro mul_gen[rLen, aLen, bLen: static int](r: var Limbs[rLen], a: Limbs[aLen],
       ctx.`xor` t, t
 
   if aLen+bLen < rLen:
-    ctx.`xor` rRAX, rRAX
+    ctx.`xor` rax, rax
     for i in aLen+bLen ..< rLen:
-      ctx.mov arrR[i], rRAX
+      ctx.mov arrR[i], rax
 
   # Codegen
   result.add ctx.generate
@@ -202,37 +179,12 @@ macro square_gen[rLen, aLen: static int](r: var Limbs[rLen], a: Limbs[aLen]) =
       )
     )
 
-    # MUL requires RAX and RDX
-    rRAX = Operand(
-      desc: OperandDesc(
-        asmId: "[rax]",
-        nimSymbol: ident"rax",
-        rm: RAX,
-        constraint: Output_EarlyClobber,
-        cEmit: "rax"
-      )
-    )
-
-    rRDX = Operand(
-      desc: OperandDesc(
-        asmId: "[rdx]",
-        nimSymbol: ident"rdx",
-        rm: RDX,
-        constraint: Output_EarlyClobber,
-        cEmit: "rdx"
-      )
-    )
-
-
   # Prologue
   let tsym = t.desc.nimSymbol
   let usym = u.desc.nimSymbol
   let vsym = v.desc.nimSymbol
-  let eax = rRAX.desc.nimSymbol
-  let edx = rRDX.desc.nimSymbol
   result.add quote do:
     var `tsym`{.noInit.}, `usym`{.noInit.}, `vsym`{.noInit.}: BaseType # zero-init
-    var `eax`{.noInit.}, `edx`{.noInit.}: BaseType
 
   # Algorithm
   ctx.`xor` u, u
@@ -252,20 +204,20 @@ macro square_gen[rLen, aLen: static int](r: var Limbs[rLen], a: Limbs[aLen]) =
       let k2 = ib-j
       if k1 < k2:
         # (t, u, v) <- (t, u, v) + 2 * a[k1] * a[k2]
-        ctx.mov rRAX, arrA[k2]
+        ctx.mov rax, arrA[k2]
         ctx.mul rdx, rax, arrA[k1], rax
-        ctx.add rRAX, rRAX
-        ctx.adc rRDX, rRDX
+        ctx.add rax, rax
+        ctx.adc rdx, rdx
         ctx.adc t, 0
-        ctx.add v, rRAX
-        ctx.adc u, rRDX
+        ctx.add v, rax
+        ctx.adc u, rdx
         ctx.adc t, 0
       elif k1 == k2:
         # (t, u, v) <- (t, u, v) + a[k1] * a[k2]
-        ctx.mov rRAX, arrA[k2]
+        ctx.mov rax, arrA[k2]
         ctx.mul rdx, rax, arrA[k1], rax
-        ctx.add v, rRAX
-        ctx.adc u, rRDX
+        ctx.add v, rax
+        ctx.adc u, rdx
         ctx.adc t, 0
       else:
         discard
@@ -278,9 +230,9 @@ macro square_gen[rLen, aLen: static int](r: var Limbs[rLen], a: Limbs[aLen]) =
       ctx.`xor` t, t
 
   if aLen*2 < rLen:
-    ctx.`xor` rRAX, rRAX
+    ctx.`xor` rax, rax
     for i in aLen*2 ..< rLen:
-      ctx.mov arrR[i], rRAX
+      ctx.mov arrR[i], rax
 
   # Codegen
   result.add ctx.generate
diff --git a/constantine/arithmetic/assembly/limbs_asm_mul_x86_adx_bmi2.nim b/constantine/arithmetic/assembly/limbs_asm_mul_x86_adx_bmi2.nim
index f56fcdd..7f91b27 100644
--- a/constantine/arithmetic/assembly/limbs_asm_mul_x86_adx_bmi2.nim
+++ b/constantine/arithmetic/assembly/limbs_asm_mul_x86_adx_bmi2.nim
@@ -37,8 +37,7 @@ proc mulx_by_word(
        ctx: var Assembler_x86,
        r0: Operand,
        a, t: OperandArray,
-       word0: Operand,
-       rRAX, rRDX: Operand
+       word0: Operand
      ) =
   ## Multiply the `a[0..<N]` by `word`
   ## and store in `[t:r0]`
@@ -52,18 +51,18 @@ proc mulx_by_word(
   #  (C,t[j])  := t[j] + a[j]*b[i] + C
 
   # First limb
-  ctx.mov rRDX, word0
-  ctx.`xor` rRAX, rRAX # Clear flags (important if steady state is skipped)
-  ctx.mulx t[0], rRAX, a[0], rdx
-  ctx.mov r0, rRAX
+  ctx.mov rdx, word0
+  ctx.`xor` rax, rax # Clear flags (important if steady state is skipped)
+  ctx.mulx t[0], rax, a[0], rdx
+  ctx.mov r0, rax
 
   # Steady state
   for j in 1 ..< N:
-    ctx.mulx t[j], rRAX, a[j], rdx
+    ctx.mulx t[j], rax, a[j], rdx
     if j == 1:
-      ctx.add t[j-1], rRAX
+      ctx.add t[j-1], rax
     else:
-      ctx.adc t[j-1], rRAX
+      ctx.adc t[j-1], rax
 
   # Final carries
   ctx.comment "  Accumulate last carries in hi word"
@@ -74,8 +73,7 @@ proc mulaccx_by_word(
        r: OperandArray,
        i: int,
        a, t: OperandArray,
-       word: Operand,
-       rRAX, rRDX: Operand
+       word: Operand
      ) =
   ## Multiply the `a[0..<N]` by `word`
   ## and store in `[t:r0]`
@@ -87,16 +85,16 @@ proc mulaccx_by_word(
   doAssert i != 0
 
   ctx.comment "  Outer loop i = " & $i & ", j in [0, " & $N & ")"
-  ctx.mov rRDX, word
-  ctx.`xor` rRAX, rRAX # Clear flags
+  ctx.mov rdx, word
+  ctx.`xor` rax, rax # Clear flags
 
   # for j=0 to N-1
   #  (C,t[j])  := t[j] + a[j]*b[i] + C
 
   # Steady state
   for j in 0 ..< N:
-    ctx.mulx hi, rRAX, a[j], rdx
-    ctx.adox t[j], rRAX
+    ctx.mulx hi, rax, a[j], rdx
+    ctx.adox t[j], rax
     if j == 0:
       ctx.mov r[i], t[j]
     if j == N-1:
@@ -105,9 +103,9 @@ proc mulaccx_by_word(
 
   # Final carries
   ctx.comment "  Accumulate last carries in hi word"
-  ctx.mov  rRDX, 0 # Set to 0 without clearing flags
-  ctx.adcx hi, rRDX
-  ctx.adox hi, rRDX
+  ctx.mov  rdx, 0 # Set to 0 without clearing flags
+  ctx.adcx hi, rdx
+  ctx.adox hi, rdx
 
 macro mulx_gen[rLen, aLen, bLen: static int](rx: var Limbs[rLen], ax: Limbs[aLen], bx: Limbs[bLen]) =
   ## `a`, `b`, `r` can have a different number of limbs
@@ -126,25 +124,6 @@ macro mulx_gen[rLen, aLen, bLen: static int](rx: var Limbs[rLen], ax: Limbs[aLen
     b = init(OperandArray, nimSymbol = bx, bLen, PointerInReg, Input)
 
     # MULX requires RDX
-    rRDX = Operand(
-      desc: OperandDesc(
-        asmId: "[rdx]",
-        nimSymbol: ident"rdx",
-        rm: RDX,
-        constraint: Output_EarlyClobber,
-        cEmit: "rdx"
-      )
-    )
-
-    rRAX = Operand(
-      desc: OperandDesc(
-        asmId: "[rax]",
-        nimSymbol: ident"rax",
-        rm: RAX,
-        constraint: Output_EarlyClobber,
-        cEmit: "rax"
-      )
-    )
 
     tSlots = aLen+1 # Extra for high word
 
@@ -154,26 +133,21 @@ macro mulx_gen[rLen, aLen, bLen: static int](rx: var Limbs[rLen], ax: Limbs[aLen
 
   # Prologue
   let tsym = t.nimSymbol
-  let eax = rRAX.desc.nimSymbol
-  let edx = rRDX.desc.nimSymbol
   result.add quote do:
     var `tsym`{.noInit.}: array[`tSlots`, BaseType]
-    var `edx`{.noInit.}, `eax`{.noInit.}: BaseType
 
   for i in 0 ..< min(rLen, bLen):
     if i == 0:
       ctx.mulx_by_word(
         r[0],
         a, t,
-        b[0],
-        rRAX, rRDX,
+        b[0]
       )
     else:
       ctx.mulaccx_by_word(
         r, i,
         a, t,
-        b[i],
-        rRAX, rRDX
+        b[i]
       )
 
       t.rotateLeft()
@@ -184,9 +158,9 @@ macro mulx_gen[rLen, aLen, bLen: static int](rx: var Limbs[rLen], ax: Limbs[aLen
 
   # Zero the extra
   if aLen+bLen < rLen:
-    ctx.`xor` rRAX, rRAX
+    ctx.`xor` rax, rax
     for i in aLen+bLen ..< rLen:
-      ctx.mov r[i], rRAX
+      ctx.mov r[i], rax
 
   # Codegen
   result.add ctx.generate
diff --git a/constantine/primitives/macro_assembler_x86.nim b/constantine/primitives/macro_assembler_x86.nim
index e9c09ca..8ba1421 100644
--- a/constantine/primitives/macro_assembler_x86.nim
+++ b/constantine/primitives/macro_assembler_x86.nim
@@ -39,6 +39,9 @@ type
     # Flags
     CarryFlag      = "@ccc"
 
+    # Clobbered register
+    ClobberedReg
+
   Register* = enum
     rbx, rdx, r8, rax, xmm0
 
@@ -50,6 +53,7 @@ type
     Output_EarlyClobber = "=&"
     InputOutput         = "+"
     InputOutput_EnsureClobber = "+&" # For register InputOutput, clang needs "+&" bug?
+    ClobberedRegister
 
   OpKind = enum
     kRegister
@@ -88,11 +92,12 @@ type
     wordSize: int
     areFlagsClobbered: bool
     isStackClobbered: bool
+    regClobbers: set[Register]
 
   Stack* = object
 
 const SpecificRegisters = {RCX, RDX, R8, RAX}
-const OutputReg = {Output_EarlyClobber, InputOutput, InputOutput_EnsureClobber, Output_Overwrite}
+const OutputReg = {Output_EarlyClobber, InputOutput, InputOutput_EnsureClobber, Output_Overwrite, ClobberedRegister}
 
 func hash(od: OperandDesc): Hash =
   {.noSideEffect.}:
@@ -195,6 +200,24 @@ func asArrayAddr*(op: Operand, len: int): Operand =
       offset: i
     )
 
+func asArrayAddr*(op: Register, len: int): Operand =
+  ## Use the value stored in a clobbered register as an array address
+  result = Operand(
+    kind: kArrayAddr,
+    desc: nil,
+    buf: newSeq[Operand](len)
+  )
+  for i in 0 ..< len:
+    result.buf[i] = Operand(
+      desc: OperandDesc(
+        asmId: $op,
+        rm: ClobberedReg,
+        constraint: ClobberedRegister
+      ),
+      kind: kFromArray,
+      offset: i
+    )
+
 # Code generation
 # ------------------------------------------------------------------------------------------------------------
 
@@ -268,6 +291,12 @@ func generate*(a: Assembler_x86): NimNode =
       else:
         clobberList.add ", \"" & str & '\"'
 
+  for reg in a.regClobbers:
+    if clobberList.len == 2:
+      clobberList.add "\"" & $reg & '\"'
+    else:
+      clobberList.add ", \"" & $reg & '\"'
+
   params.add clobberList
 
   # GCC will optimize ASM away if there are no
@@ -293,7 +322,15 @@ func generate*(a: Assembler_x86): NimNode =
 
 func getStrOffset(a: Assembler_x86, op: Operand): string =
   if op.kind != kFromArray:
-    return "%" & op.desc.asmId
+    if op.kind == kArrayAddr:
+      # We are operating on an array pointer
+      # instead of array elements
+      if op.buf[0].desc.constraint == ClobberedRegister:
+        return "%%" & op.buf[0].desc.asmId
+      else:
+        return "%" & op.buf[0].desc.asmId
+    else:
+      return "%" & op.desc.asmId
 
   # Beware GCC / Clang differences with array offsets
   # https://lists.llvm.org/pipermail/llvm-dev/2017-August/116202.html
@@ -315,12 +352,16 @@ func getStrOffset(a: Assembler_x86, op: Operand): string =
        op.desc.rm in SpecificRegisters or
        (op.desc.rm == ElemsInReg and op.kind == kFromArray):
     if op.offset == 0:
-      return "(%" & $op.desc.asmId & ')'
+      return "(%" & op.desc.asmId & ')'
     # GCC & Clang seemed to disagree on pointer indexing
     # in the past and required different codegen
     # if defined(gcc):
-    #   return $(op.offset * a.wordSize) & "+(%" & $op.desc.asmId & ')'
-    return $(op.offset * a.wordSize) & "(%" & $op.desc.asmId & ')'
+    #   return $(op.offset * a.wordSize) & "+(%" & op.desc.asmId & ')'
+    return $(op.offset * a.wordSize) & "(%" & op.desc.asmId & ')'
+  elif op.desc.rm == ClobberedReg: # Array in clobbered register
+    if op.offset == 0:
+      return "(%%" & op.desc.asmId & ')'
+    return $(op.offset * a.wordSize) & "(%%" & op.desc.asmId & ')'
   else:
     error "Unsupported: " & $op.desc.rm.ord
 
@@ -335,7 +376,8 @@ func codeFragment(a: var Assembler_x86, instr: string, op: Operand) =
   else:
     error "Unsupported bitwidth: " & $a.wordBitWidth
 
-  a.operands.incl op.desc
+  if op.desc.constraint != ClobberedRegister:
+    a.operands.incl op.desc
 
 func codeFragment(a: var Assembler_x86, instr: string, op0, op1: Operand) =
   # Generate a code fragment
@@ -352,8 +394,56 @@ func codeFragment(a: var Assembler_x86, instr: string, op0, op1: Operand) =
   else:
     error "Unsupported bitwidth: " & $a.wordBitWidth
 
-  a.operands.incl op0.desc
-  a.operands.incl op1.desc
+  if op0.desc.constraint != ClobberedRegister:
+    a.operands.incl op0.desc
+  if op1.desc.constraint != ClobberedRegister:
+    a.operands.incl op1.desc
+
+func codeFragment(a: var Assembler_x86, instr: string, op: Operand, reg: Register) =
+  # Emit `instr op, %%reg` and record `reg` as a clobbered register
+  # ⚠️ Warning:
+  # The caller should deal with destination/source operand
+  # so that it fits GNU Assembly
+  let off = a.getStrOffset(op)
+
+  if a.wordBitWidth == 64:
+    a.code &= instr & "q " & off & ", %%" & $reg & '\n' # "q" suffix for 64-bit words
+  else:
+    a.code &= instr & "l " & off & ", %%" & $reg & '\n' # "l" suffix otherwise
+
+  # op.desc can be nil for renamed registers (using asArrayAddr)
+  if not op.desc.isNil and op.desc.constraint != ClobberedRegister:
+    a.operands.incl op.desc
+  a.regClobbers.incl reg
+
+func codeFragment(a: var Assembler_x86, instr: string, reg: Register, op: Operand) =
+  # Emit `instr %%reg, op` and record `reg` as a clobbered register
+  # ⚠️ Warning: the caller should deal with destination/source operand
+  # so that it fits GNU Assembly
+  # op.desc can be nil for array addresses (using asArrayAddr): check before dereferencing
+  let off = a.getStrOffset(op)
+
+  if a.wordBitWidth == 64:
+    a.code &= instr & "q %%" & $reg & ", " & off & '\n'
+  else:
+    a.code &= instr & "l %%" & $reg & ", " & off & '\n'
+
+  if not op.desc.isNil and op.desc.constraint != ClobberedRegister:
+    a.operands.incl op.desc
+  a.regClobbers.incl reg
+
+func codeFragment(a: var Assembler_x86, instr: string, reg0, reg1: Register) =
+  # Emit `instr %%reg0, %%reg1` between two fixed registers
+  # ⚠️ Warning:
+  # The caller should deal with destination/source operand
+  # so that it fits GNU Assembly
+  if a.wordBitWidth == 64:
+    a.code &= instr & "q %%" & $reg0 & ", %%" & $reg1 & '\n'
+  else:
+    a.code &= instr & "l %%" & $reg0 & ", %%" & $reg1 & '\n'
+
+  a.regClobbers.incl reg0 # both registers are recorded as clobbered, whichever one is written
+  a.regClobbers.incl reg1
 
 func codeFragment(a: var Assembler_x86, instr: string, imm: int, op: Operand) =
   # Generate a code fragment
@@ -367,7 +457,8 @@ func codeFragment(a: var Assembler_x86, instr: string, imm: int, op: Operand) =
   else:
     a.code &= instr & "l $" & $imm & ", " & off & '\n'
 
-  a.operands.incl op.desc
+  if op.desc.constraint != ClobberedRegister:
+    a.operands.incl op.desc
 
 func codeFragment(a: var Assembler_x86, instr: string, reg: Register, op: OperandReuse) =
   # Generate a code fragment
@@ -378,6 +469,7 @@ func codeFragment(a: var Assembler_x86, instr: string, reg: Register, op: Operan
     a.code &= instr & "q %%" & $reg & ", %" & $op.asmId & '\n'
   else:
     a.code &= instr & "l %%" & $reg & ", %" & $op.asmId & '\n'
+  a.regClobbers.incl reg
 
 func codeFragment(a: var Assembler_x86, instr: string, op: OperandReuse, reg: Register) =
   # Generate a code fragment
@@ -388,6 +480,7 @@ func codeFragment(a: var Assembler_x86, instr: string, op: OperandReuse, reg: Re
     a.code &= instr & "q %" & $op.asmId & ", %%" & $reg & '\n'
   else:
     a.code &= instr & "l %" & $op.asmId & ", %%" & $reg & '\n'
+  a.regClobbers.incl reg
 
 func codeFragment(a: var Assembler_x86, instr: string, imm: int, reg: Register) =
   # Generate a code fragment
@@ -398,16 +491,7 @@ func codeFragment(a: var Assembler_x86, instr: string, imm: int, reg: Register)
     a.code &= instr & "q $" & $imm & ", %%" & $reg & '\n'
   else:
     a.code &= instr & "l $" & $imm & ", %%" & $reg & '\n'
-
-func codeFragment(a: var Assembler_x86, instr: string, reg0, reg1: Register) =
-  # Generate a code fragment
-  # ⚠️ Warning:
-  # The caller should deal with destination/source operand
-  # so that it fits GNU Assembly
-  if a.wordBitWidth == 64:
-    a.code &= instr & "q %%" & $reg0 & ", %%" & $reg1 & '\n'
-  else:
-    a.code &= instr & "l %%" & $reg0 & ", %%" & $reg1 & '\n'
+  a.regClobbers.incl reg
 
 func codeFragment(a: var Assembler_x86, instr: string, imm: int, reg: OperandReuse) =
   # Generate a code fragment
@@ -429,33 +513,35 @@ func codeFragment(a: var Assembler_x86, instr: string, reg0, reg1: OperandReuse)
   else:
     a.code &= instr & "l %" & $reg0.asmId & ", %" & $reg1.asmId & '\n'
 
-func codeFragment(a: var Assembler_x86, instr: string, reg0: OperandReuse, reg1: Operand) =
+func codeFragment(a: var Assembler_x86, instr: string, op0: OperandReuse, op1: Operand) =
   # Generate a code fragment
   # ⚠️ Warning:
   # The caller should deal with destination/source operand
   # so that it fits GNU Assembly
-  let off1 = a.getStrOffset(reg1)
+  let off1 = a.getStrOffset(op1) # assembly text that addresses op1
 
   if a.wordBitWidth == 64:
-    a.code &= instr & "q %" & $reg0.asmId & ", " & off1 & '\n'
+    a.code &= instr & "q %" & $op0.asmId & ", " & off1 & '\n'
   else:
-    a.code &= instr & "l %" & $reg0.asmId & ", " & off1 & '\n'
+    a.code &= instr & "l %" & $op0.asmId & ", " & off1 & '\n'
 
-  a.operands.incl reg1.desc
+  if op1.desc.constraint != ClobberedRegister: # clobbered-register operands are not declared as asm operands
+    a.operands.incl op1.desc
 
-func codeFragment(a: var Assembler_x86, instr: string, reg0: Operand, reg1: OperandReuse) =
+func codeFragment(a: var Assembler_x86, instr: string, op0: Operand, op1: OperandReuse) =
   # Generate a code fragment
   # ⚠️ Warning:
   # The caller should deal with destination/source operand
   # so that it fits GNU Assembly
-  let off0 = a.getStrOffset(reg0)
+  let off0 = a.getStrOffset(op0) # assembly text that addresses op0
 
   if a.wordBitWidth == 64:
-    a.code &= instr & "q " & off0 & ", %" & $reg1.asmId & '\n'
+    a.code &= instr & "q " & off0 & ", %" & $op1.asmId & '\n'
   else:
-    a.code &= instr & "l " & off0 & ", %" & $reg1.asmId & '\n'
+    a.code &= instr & "l " & off0 & ", %" & $op1.asmId & '\n'
 
-  a.operands.incl reg0.desc
+  if op0.desc.constraint != ClobberedRegister: # clobbered-register operands are not declared as asm operands
+    a.operands.incl op0.desc
 
 func reuseRegister*(reg: OperandArray): OperandReuse =
   # TODO: disable the reg input
@@ -481,6 +567,22 @@ func add*(a: var Assembler_x86, dst, src: Operand) =
   a.codeFragment("add", src, dst)
   a.areFlagsClobbered = true
 
+func add*(a: var Assembler_x86, dst, src: Register) =
+  ## Does: dst <- dst + src, with dst and src fixed registers
+  a.codeFragment("add", src, dst) # source is passed first, destination last
+  a.areFlagsClobbered = true
+
+func add*(a: var Assembler_x86, dst: Operand, src: Register) =
+  ## Does: dst <- dst + src, with src a fixed register
+  doAssert dst.desc.constraint in OutputReg # dst must be declared writable
+  a.codeFragment("add", src, dst) # source is passed first, destination last
+  a.areFlagsClobbered = true
+
+func add*(a: var Assembler_x86, dst: Register, src: Operand) =
+  ## Does: dst <- dst + src, with dst a fixed register
+  a.codeFragment("add", src, dst) # source is passed first, destination last
+  a.areFlagsClobbered = true
+
 func adc*(a: var Assembler_x86, dst, src: Operand) =
   ## Does: dst <- dst + src + carry
   doAssert dst.desc.constraint in OutputReg
@@ -490,6 +592,11 @@ func adc*(a: var Assembler_x86, dst, src: Operand) =
   if dst.desc.rm in {Mem, MemOffsettable, AnyRegOrMem}:
     {.warning: "Using addcarry with a memory destination, this incurs significant performance penalties.".}
 
+func adc*(a: var Assembler_x86, dst, src: Register) =
+  ## Does: dst <- dst + src + carry, with dst and src fixed registers
+  a.codeFragment("adc", src, dst) # source is passed first, destination last
+  a.areFlagsClobbered = true
+
 func adc*(a: var Assembler_x86, dst: Operand, imm: int) =
   ## Does: dst <- dst + imm + borrow
   doAssert dst.desc.constraint in OutputReg
@@ -499,6 +606,17 @@ func adc*(a: var Assembler_x86, dst: Operand, imm: int) =
   if dst.desc.rm in {Mem, MemOffsettable, AnyRegOrMem}:
     {.warning: "Using addcarry with a memory destination, this incurs significant performance penalties.".}
 
+func adc*(a: var Assembler_x86, dst: Operand, src: Register) =
+  ## Does: dst <- dst + src + carry, with src a fixed register
+  doAssert dst.desc.constraint in OutputReg # dst must be declared writable
+  a.codeFragment("adc", src, dst) # source is passed first, destination last
+  a.areFlagsClobbered = true
+
+func adc*(a: var Assembler_x86, dst: Register, imm: int) =
+  ## Does: dst <- dst + imm + carry, with dst a fixed register
+  a.codeFragment("adc", imm, dst) # immediate is passed first, destination last
+  a.areFlagsClobbered = true
+
 func sub*(a: var Assembler_x86, dst, src: Operand) =
   ## Does: dst <- dst - src
   doAssert dst.desc.constraint in OutputReg
@@ -597,6 +715,12 @@ func `xor`*(a: var Assembler_x86, dst, src: Operand) =
   a.codeFragment("xor", src, dst)
   a.areFlagsClobbered = true
 
+func `xor`*(a: var Assembler_x86, dst, src: Register) =
+  ## Does: dst <- dst xor src, with dst and src fixed registers,
+  ## and marks the flags as clobbered
+  a.codeFragment("xor", src, dst) # source is passed first, destination last
+  a.areFlagsClobbered = true
+
 func mov*(a: var Assembler_x86, dst, src: Operand) =
   ## Does: dst <- src
   doAssert dst.desc.constraint in OutputReg, $dst.repr
@@ -625,16 +749,26 @@ func mov*(a: var Assembler_x86, dst: Operand, imm: int) =
   a.codeFragment("mov", imm, dst)
   # No clobber
 
+func mov*(a: var Assembler_x86, dst: Register, imm: int) =
+  ## Does: dst <- imm with dst a fixed register
+  a.codeFragment("mov", imm, dst) # immediate is passed first, destination last
+
+func mov*(a: var Assembler_x86, dst: Register, src: Operand) =
+  ## Does: dst <- src with dst a fixed register
+  a.codeFragment("mov", src, dst) # source is passed first, destination last
+
+func mov*(a: var Assembler_x86, dst: Operand, src: Register) =
+  ## Does: dst <- src with src a fixed register
+  a.codeFragment("mov", src, dst) # source is passed first, destination last
+
 func mov*(a: var Assembler_x86, dst: Register, src: OperandReuse) =
   ## Does: dst <- src with dst a fixed register
   a.codeFragment("mov", src, dst)
-  # No clobber
 
 func mov*(a: var Assembler_x86, dst: OperandReuse, src: Register) =
   ## Does: dst <- imm
   # doAssert dst.desc.constraint in OutputReg, $dst.repr
   a.codeFragment("mov", src, dst)
-  # No clobber
 
 func cmovc*(a: var Assembler_x86, dst, src: Operand) =
   ## Does: dst <- src if the carry flag is set
@@ -697,6 +831,8 @@ func mul*(a: var Assembler_x86, dHi, dLo: Register, src0: Operand, src1: Registe
   doAssert src1 == rax, "MUL requires the RAX register"
   doAssert dHi == rdx,  "MUL requires the RDX register"
   doAssert dLo == rax,   "MUL requires the RAX register"
+  a.regClobbers.incl rax
+  a.regClobbers.incl rdx
 
   a.codeFragment("mul", src0)
 
@@ -707,9 +843,15 @@ func imul*(a: var Assembler_x86, dst, src: Operand) =
 
   a.codeFragment("imul", src, dst)
 
+func imul*(a: var Assembler_x86, dst: Register, src: Operand) =
+  ## Does dst <- dst * src, keeping only the low half, with dst a fixed register
+  a.codeFragment("imul", src, dst) # source is passed first, destination last
+
 func mulx*(a: var Assembler_x86, dHi, dLo, src0: Operand, src1: Register) =
   ## Does (dHi, dLo) <- src0 * src1
   doAssert src1 == rdx, "MULX requires the RDX register"
+  a.regClobbers.incl rdx
+
   doAssert dHi.desc.rm in {Reg, ElemsInReg}+SpecificRegisters,
     "The destination operand must be a register " & $dHi.repr
   doAssert dLo.desc.rm in {Reg, ElemsInReg}+SpecificRegisters,
@@ -727,9 +869,31 @@ func mulx*(a: var Assembler_x86, dHi, dLo, src0: Operand, src1: Register) =
 
   a.operands.incl src0.desc
 
+func mulx*(a: var Assembler_x86, dHi: Operand, dLo: Register, src0: Operand, src1: Register) =
+  ## Does (dHi, dLo) <- src0 * src1, with dLo a fixed register
+  doAssert src1 == rdx, "MULX requires the RDX register"
+  a.regClobbers.incl rdx
+
+  doAssert dHi.desc.rm in {Reg, ElemsInReg}+SpecificRegisters,
+    "The destination operand must be a register " & $dHi.repr
+  doAssert dHi.desc.constraint in OutputReg
+
+  let off0 = a.getStrOffset(src0)
+
+  # AT&T syntax, emitted order: source, low destination, high destination
+  if a.wordBitWidth == 64:
+    a.code &= "mulxq " & off0 & ", %%" & $dLo & ", %" & $dHi.desc.asmId & '\n'
+  else:
+    a.code &= "mulxl " & off0 & ", %%" & $dLo & ", %" & $dHi.desc.asmId & '\n'
+
+  a.operands.incl src0.desc # NOTE(review): no nil/ClobberedRegister guard here, unlike codeFragment - confirm src0 is always a declared operand
+  a.regClobbers.incl dLo
+
 func mulx*(a: var Assembler_x86, dHi: OperandReuse, dLo, src0: Operand, src1: Register) =
   ## Does (dHi, dLo) <- src0 * src1
   doAssert src1 == rdx, "MULX requires the RDX register"
+  a.regClobbers.incl rdx
+
   doAssert dLo.desc.rm in {Reg, ElemsInReg}+SpecificRegisters,
     "The destination operand must be a register " & $dLo.repr
   doAssert dLo.desc.constraint in OutputReg
@@ -744,7 +908,40 @@ func mulx*(a: var Assembler_x86, dHi: OperandReuse, dLo, src0: Operand, src1: Re
 
   a.operands.incl src0.desc
 
-func adcx*(a: var Assembler_x86, dst: Operand|OperandReuse, src: Operand|OperandReuse) =
+func mulx*(a: var Assembler_x86, dHi: OperandReuse, dLo: Register, src0: Operand, src1: Register) =
+  ## Does (dHi, dLo) <- src0 * src1, with dHi a reused operand and dLo a fixed register
+  doAssert src1 == rdx, "MULX requires the RDX register"
+  a.regClobbers.incl rdx
+
+  let off0 = a.getStrOffset(src0)
+
+  # AT&T syntax, emitted order: source, low destination, high destination
+  if a.wordBitWidth == 64:
+    a.code &= "mulxq " & off0 & ", %%" & $dLo & ", %" & $dHi.asmId & '\n'
+  else:
+    a.code &= "mulxl " & off0 & ", %%" & $dLo & ", %" & $dHi.asmId & '\n'
+
+  a.operands.incl src0.desc # NOTE(review): no nil/ClobberedRegister guard here, unlike codeFragment - confirm src0 is always a declared operand
+  a.regClobbers.incl dLo
+
+func mulx*(a: var Assembler_x86, dHi, dLo: Register, src0: Operand, src1: Register) =
+  ## Does (dHi, dLo) <- src0 * src1, with dHi and dLo fixed registers
+  doAssert src1 == rdx, "MULX requires the RDX register"
+  a.regClobbers.incl rdx
+
+  let off0 = a.getStrOffset(src0)
+
+  # AT&T syntax, emitted order: source, low destination, high destination
+  if a.wordBitWidth == 64:
+    a.code &= "mulxq " & off0 & ", %%" & $dLo & ", %%" & $dHi & '\n'
+  else:
+    a.code &= "mulxl " & off0 & ", %%" & $dLo & ", %%" & $dHi & '\n'
+
+  a.operands.incl src0.desc # NOTE(review): no nil/ClobberedRegister guard here, unlike codeFragment - confirm src0 is always a declared operand
+  a.regClobbers.incl dHi
+  a.regClobbers.incl dLo
+
+func adcx*(a: var Assembler_x86, dst: Operand|OperandReuse, src: Operand|OperandReuse|Register) =
   ## Does: dst <- dst + src + carry
   ## and only sets the carry flag
   when dst is Operand:
@@ -753,7 +950,7 @@ func adcx*(a: var Assembler_x86, dst: Operand|OperandReuse, src: Operand|Operand
   a.codeFragment("adcx", src, dst)
   a.areFlagsClobbered = true
 
-func adox*(a: var Assembler_x86, dst: Operand|OperandReuse, src: Operand|OperandReuse) =
+func adox*(a: var Assembler_x86, dst: Operand|OperandReuse, src: Operand|OperandReuse|Register) =
   ## Does: dst <- dst + src + overflow
   ## and only sets the overflow flag
   when dst is Operand: