Cleanup: introduce clobbered registers, remove explicit rax, rdx for multiplication (minus 30-50 lines for related assembly files)

2025-03-04 04:10:40 +00:00 · 2021-02-15 20:38:12 +01:00 · 2021-02-15 20:38:12 +01:00 · 8918cabb56
commit 8918cabb56
parent 18069e54d3
7 changed files with 393 additions and 380 deletions
--- a/constantine/arithmetic/assembly/limbs_asm_montmul_x86.nim
+++ b/constantine/arithmetic/assembly/limbs_asm_montmul_x86.nim
@ -57,25 +57,6 @@ macro montMul_CIOS_nocarry_gen[N: static int](r_MM: var Limbs[N], a_MM, b_MM, M_
    scratch = init(OperandArray, nimSymbol = ident"scratch", scratchSlots, ElemsInReg, InputOutput_EnsureClobber)

    # MUL requires RAX and RDX
-    rRAX = Operand(
-      desc: OperandDesc(
-        asmId: "[rax]",
-        nimSymbol: ident"rax",
-        rm: RAX,
-        constraint: Output_EarlyClobber,
-        cEmit: "rax"
-      )
-    )
-
-    rRDX = Operand(
-      desc: OperandDesc(
-        asmId: "[rdx]",
-        nimSymbol: ident"rdx",
-        rm: RDX,
-        constraint: Output_EarlyClobber,
-        cEmit: "rdx"
-      )
-    )

    m0ninv = Operand(
               desc: OperandDesc(
@ -109,16 +90,12 @@ macro montMul_CIOS_nocarry_gen[N: static int](r_MM: var Limbs[N], a_MM, b_MM, M_

  let tsym = t.nimSymbol
  let scratchSym = scratch.nimSymbol
-  let eax = rRAX.desc.nimSymbol
-  let edx = rRDX.desc.nimSymbol
  result.add quote do:
    static: doAssert: sizeof(SecretWord) == sizeof(ByteAddress)

    var `tsym`: typeof(`r_MM`) # zero init
    # Assumes 64-bit limbs on 64-bit arch (or you can't store an address)
    var `scratchSym` {.noInit.}: Limbs[`scratchSlots`]
-    var `eax`{.noInit.}, `edx`{.noInit.}: BaseType
-
    `scratchSym`[0] = cast[SecretWord](`a_MM`[0].unsafeAddr)
    `scratchSym`[1] = cast[SecretWord](`b_MM`[0].unsafeAddr)
    `scratchSym`[5] = cast[SecretWord](`r_MM`[0].unsafeAddr)
@ -140,14 +117,14 @@ macro montMul_CIOS_nocarry_gen[N: static int](r_MM: var Limbs[N], a_MM, b_MM, M_

  for i in 0 ..< N:
    # (A, t[0]) <- a[0] * b[i] + t[0]
-    ctx.mov rRAX, a[0]
+    ctx.mov rax, a[0]
    ctx.mul rdx, rax, b[i], rax
    if i == 0: # overwrite t[0]
-      ctx.mov t[0], rRAX
+      ctx.mov t[0], rax
    else:      # Accumulate in t[0]
-      ctx.add t[0], rRAX
-      ctx.adc rRDX, 0
-    ctx.mov A, rRDX
+      ctx.add t[0], rax
+      ctx.adc rdx, 0
+    ctx.mov A, rdx

    # m        <- (t[0] * m0ninv) mod 2^w
    ctx.mov m, m0ninv
@ -155,39 +132,39 @@ macro montMul_CIOS_nocarry_gen[N: static int](r_MM: var Limbs[N], a_MM, b_MM, M_

    # (C, _)    <- m * M[0] + t[0]
    ctx.`xor` C, C
-    ctx.mov rRAX, M[0]
+    ctx.mov rax, M[0]
    ctx.mul rdx, rax, m, rax
-    ctx.add rRAX, t[0]
-    ctx.adc C, rRDX
+    ctx.add rax, t[0]
+    ctx.adc C, rdx

    for j in 1 ..< N:
      # (A, t[j])   <- a[j] * b[i] + A + t[j]
-      ctx.mov rRAX, a[j]
+      ctx.mov rax, a[j]
      ctx.mul rdx, rax, b[i], rax
      if i == 0:
        ctx.mov t[j], A
      else:
        ctx.add t[j], A
-        ctx.adc rRDX, 0
+        ctx.adc rdx, 0
      ctx.`xor` A, A
-      ctx.add t[j], rRAX
-      ctx.adc A, rRDX
+      ctx.add t[j], rax
+      ctx.adc A, rdx

      # (C, t[j-1]) <- m * M[j] + C + t[j]
-      ctx.mov rRAX, M[j]
+      ctx.mov rax, M[j]
      ctx.mul rdx, rax, m, rax
      ctx.add C, t[j]
-      ctx.adc rRDX, 0
-      ctx.add C, rRAX
-      ctx.adc rRDX, 0
+      ctx.adc rdx, 0
+      ctx.add C, rax
+      ctx.adc rdx, 0
      ctx.mov t[j-1], C
-      ctx.mov C, rRDX
+      ctx.mov C, rdx

    ctx.add A, C
    ctx.mov t[N-1], A

-  ctx.mov rRDX, r
-  let r2 = rRDX.asArrayAddr(len = N)
+  ctx.mov rdx, r
+  let r2 = rdx.asArrayAddr(len = N)

  ctx.finalSubNoCarry(
    r2, t, M,
--- a/constantine/arithmetic/assembly/limbs_asm_montmul_x86_adx_bmi2.nim
+++ b/constantine/arithmetic/assembly/limbs_asm_montmul_x86_adx_bmi2.nim
@ -40,7 +40,7 @@ proc mulx_by_word(
       t: OperandArray,
       a: Operand, # Pointer in scratchspace
       word0: Operand,
-       lo, rRDX: Operand
+       lo: Operand
     ) =
  ## Multiply the `a[0..<N]` by `word` and store in `t[0..<N]`
  ## and carry register `C` (t[N])
@ -55,7 +55,7 @@ proc mulx_by_word(
  #  (C,t[j])  := t[j] + a[j]*b[i] + C

  # First limb
-  ctx.mov rRDX, word0
+  ctx.mov rdx, word0
  if N > 1:
    ctx.mulx t[1], t[0], a[0], rdx
    ctx.`xor` hi, hi # Clear flags - TODO: necessary?
@ -87,20 +87,19 @@ proc mulaccx_by_word(
       a: Operand, # Pointer in scratchspace
       i: int,
       word: Operand,
-       lo, rRDX: Operand
+       lo: Operand
     ) =
  ## Multiply the `a[0..<N]` by `word`
  ## and accumulate in `t[0..<N]`
  ## and carry register `C` (t[N])
  ## `t` and `C` are multiply-accumulated
  ## `S` is a scratchspace register
-  ## `rRDX` is the RDX register descriptor
  let N = min(a.len, t.len)

  doAssert i != 0

  ctx.comment "  Outer loop i = " & $i & ", j in [0, " & $N & ")"
-  ctx.mov rRDX, word
+  ctx.mov rdx, word
  ctx.`xor` hi, hi # Clear flags - TODO: necessary?

  # for j=0 to N-1
@ -119,9 +118,9 @@ proc mulaccx_by_word(

  # Final carries
  ctx.comment "  Accumulate last carries in hi word"
-  ctx.mov  rRDX, 0 # Set to 0 without clearing flags
-  ctx.adcx hi, rRDX
-  ctx.adox hi, rRDX
+  ctx.mov  rdx, 0 # Set to 0 without clearing flags
+  ctx.adcx hi, rdx
+  ctx.adox hi, rdx

 proc partialRedx(
       ctx: var Assembler_x86,
@ -129,7 +128,7 @@ proc partialRedx(
       t: OperandArray,
       M: OperandArray,
       m0ninv: Operand,
-       lo, S, rRDX: Operand
+       lo, S: Operand
     ) =
    ## Partial Montgomery reduction
    ## For CIOS method
@ -145,8 +144,8 @@ proc partialRedx(
    # m = t[0] * m0ninv mod 2^w
    ctx.comment "  Reduction"
    ctx.comment "  m = t[0] * m0ninv mod 2^w"
-    ctx.mov  rRDX, t[0]
-    ctx.mulx S, rRDX, m0ninv, rdx # (S, RDX) <- m0ninv * RDX
+    ctx.mov  rdx, t[0]
+    ctx.mulx S, rdx, m0ninv, rdx # (S, RDX) <- m0ninv * RDX

    # Clear carry flags - TODO: necessary?
    ctx.`xor` S, S
@ -194,16 +193,7 @@ macro montMul_CIOS_nocarry_adx_bmi2_gen[N: static int](r_MM: var Limbs[N], a_MM,
    # MultiPurpose Register slots
    scratch = init(OperandArray, nimSymbol = ident"scratch", scratchSlots, ElemsInReg, InputOutput_EnsureClobber)

-    # MULX requires RDX
-    rRDX = Operand(
-      desc: OperandDesc(
-        asmId: "[rdx]",
-        nimSymbol: ident"rdx",
-        rm: RDX,
-        constraint: Output_EarlyClobber,
-        cEmit: "rdx"
-      )
-    )
+    # MULX requires RDX as well

    a = scratch[0].asArrayAddr(len = N) # Store the `a` operand
    b = scratch[1].asArrayAddr(len = N) # Store the `b` operand
@ -225,15 +215,12 @@ macro montMul_CIOS_nocarry_adx_bmi2_gen[N: static int](r_MM: var Limbs[N], a_MM,

  let tsym = t.nimSymbol
  let scratchSym = scratch.nimSymbol
-  let edx = rRDX.desc.nimSymbol
  result.add quote do:
    static: doAssert: sizeof(SecretWord) == sizeof(ByteAddress)

    var `tsym`: typeof(`r_MM`) # zero init
    # Assumes 64-bit limbs on 64-bit arch (or you can't store an address)
    var `scratchSym` {.noInit.}: Limbs[`scratchSlots`]
-    var `edx`{.noInit.}: BaseType
-
    `scratchSym`[0] = cast[SecretWord](`a_MM`[0].unsafeAddr)
    `scratchSym`[1] = cast[SecretWord](`b_MM`[0].unsafeAddr)
    `scratchSym`[4] = SecretWord `m0ninv_MM`
@ -258,20 +245,20 @@ macro montMul_CIOS_nocarry_adx_bmi2_gen[N: static int](r_MM: var Limbs[N], a_MM,
        A, t,
        a,
        b[0],
-        C, rRDX
+        C
      )
    else:
      ctx.mulaccx_by_word(
        A, t,
        a, i,
        b[i],
-        C, rRDX
+        C
      )

    ctx.partialRedx(
      A, t,
      M, m0ninv,
-      lo, C, rRDX
+      lo, C
    )

  ctx.finalSubNoCarry(
--- a/constantine/arithmetic/assembly/limbs_asm_montred_x86.nim
+++ b/constantine/arithmetic/assembly/limbs_asm_montred_x86.nim
@ -41,7 +41,7 @@ proc finalSubNoCarry*(
      ctx.sbb scratch[i], M[i]

  # If we borrowed it means that we were smaller than
-  # the modulus and we don'a need "scratch"
+  # the modulus and we don't need "scratch"
  for i in 0 ..< N:
    ctx.cmovnc a[i], scratch[i]
    ctx.mov r[i], a[i]
@ -50,7 +50,7 @@ proc finalSubCanOverflow*(
       ctx: var Assembler_x86,
       r: Operand or OperandArray,
       a, M, scratch: OperandArray,
-       overflowReg: Operand
+       overflowReg: Operand or Register
     ) =
  ## Reduce `a` into `r` modulo `M`
  ## To be used when the final substraction can
@ -74,7 +74,7 @@ proc finalSubCanOverflow*(
  ctx.sbb overflowReg, 0

  # If we borrowed it means that we were smaller than
-  # the modulus and we don'a need "scratch"
+  # the modulus and we don't need "scratch"
  for i in 0 ..< N:
    ctx.cmovnc a[i], scratch[i]
    ctx.mov r[i], a[i]
@ -90,59 +90,37 @@ macro montyRedc2x_gen[N: static int](
       m0ninv_MR: BaseType,
       spareBits: static int
      ) =
-  # TODO, slower than Clang, in particular due to the shadowing
-
  result = newStmtList()

  var ctx = init(Assembler_x86, BaseType)
+  # On x86, compilers only let us use 15 out of 16 registers.
+  # RAX and RDX are de facto taken by the MUL instructions,
+  # so we keep everything in scratchspaces, restoring values as needed.
  let
    # We could force M as immediate by specializing per moduli
    M = init(OperandArray, nimSymbol = M_MR, N, PointerInReg, Input)
-
    # MUL requires RAX and RDX
-    rRAX = Operand(
-      desc: OperandDesc(
-        asmId: "[rax]",
-        nimSymbol: ident"rax",
-        rm: RAX,
-        constraint: InputOutput_EnsureClobber,
-        cEmit: "rax"
-      )
-    )

-    rRDX = Operand(
-      desc: OperandDesc(
-        asmId: "[rdx]",
-        nimSymbol: ident"rdx",
-        rm: RDX,
-        constraint: Output_EarlyClobber,
-        cEmit: "rdx"
-      )
-    )
+  let uSlots = N+2
+  let vSlots = max(N-2, 3)

-    m0ninv = Operand(
-      desc: OperandDesc(
-        asmId: "[m0ninv]",
-        nimSymbol: m0ninv_MR,
-        rm: Reg,
-        constraint: Input,
-        cEmit: "m0ninv"
-      )
-    )
-
-
-  let scratchSlots = N+2
-  var scratch = init(OperandArray, nimSymbol = ident"scratch", scratchSlots, ElemsInReg, InputOutput_EnsureClobber)
+  var # Scratchspaces
+    u = init(OperandArray, nimSymbol = ident"U", uSlots, ElemsInReg, InputOutput_EnsureClobber)
+    v = init(OperandArray, nimSymbol = ident"V", vSlots, ElemsInReg, InputOutput_EnsureClobber)

  # Prologue
-  let eax = rRAX.desc.nimSymbol
-  let edx = rRDX.desc.nimSymbol
-  let scratchSym = scratch.nimSymbol
+  let usym = u.nimSymbol
+  let vsym = v.nimSymbol
  result.add quote do:
-    static: doAssert: sizeof(SecretWord) == sizeof(ByteAddress)
+    var `usym`{.noinit.}: Limbs[`uSlots`]
+    var `vsym` {.noInit.}: Limbs[`vSlots`]
+    `vsym`[0] = cast[SecretWord](`r_MR`[0].unsafeAddr)
+    `vsym`[1] = cast[SecretWord](`a_MR`[0].unsafeAddr)
+    `vsym`[2] = SecretWord(`m0ninv_MR`)

-    var `eax`{.noInit.}, `edx`{.noInit.}: BaseType
-    var `scratchSym` {.noInit.}: Limbs[`scratchSlots`]
+  let r_temp = v[0].asArrayAddr(len = N)
+  let a = v[1].asArrayAddr(len = 2*N)
+  let m0ninv = v[2]

  # Algorithm
  # ---------------------------------------------------------
@ -161,85 +139,76 @@ macro montyRedc2x_gen[N: static int](
  # No register spilling handling
  doAssert N <= 6, "The Assembly-optimized montgomery multiplication requires at most 6 limbs."

-  result.add quote do:
-    `eax` = BaseType `a_MR`[0]
-    staticFor i, 0, `N`: # Do NOT use Nim slice/toOpenArray, they are not inlined
-      `scratchSym`[i] = `a_MR`[i]
+  for i in 0 ..< N:
+    ctx.mov u[i], a[i]

-  ctx.mov scratch[N], rRAX
-  ctx.imul rRAX, m0ninv    # m <- a[i] * m0ninv mod 2^w
-  ctx.mov scratch[0], rRAX
+  ctx.mov u[N], u[0]
+  ctx.imul u[0], m0ninv    # m <- a[i] * m0ninv mod 2^w
+  ctx.mov rax, u[0]

  # scratch: [a[0] * m0, a[1], a[2], a[3], a[0]] for 4 limbs

  for i in 0 ..< N:
    ctx.comment ""
-    let hi = scratch[N]
-    let next = scratch[N+1]
+    let hi = u[N]
+    let next = u[N+1]

    ctx.mul rdx, rax, M[0], rax
-    ctx.add hi, rRAX # Guaranteed to be zero
-    ctx.mov rRAX, scratch[0]
-    ctx.adc hi, rRDX
+    ctx.add hi, rax # Guaranteed to be zero
+    ctx.mov rax, u[0]
+    ctx.adc hi, rdx

    for j in 1 ..< N-1:
      ctx.comment ""
      ctx.mul rdx, rax, M[j], rax
-      ctx.add scratch[j], rRAX
-      ctx.mov rRAX, scratch[0]
-      ctx.adc rRDX, 0
-      ctx.add scratch[j], hi
-      ctx.adc rRDX, 0
-      ctx.mov hi, rRDX
+      ctx.add u[j], rax
+      ctx.mov rax, u[0]
+      ctx.adc rdx, 0
+      ctx.add u[j], hi
+      ctx.adc rdx, 0
+      ctx.mov hi, rdx

    # Next load
    if i < N-1:
      ctx.comment ""
-      ctx.mov next, scratch[1]
-      ctx.imul scratch[1], m0ninv
+      ctx.mov next, u[1]
+      ctx.imul u[1], m0ninv
      ctx.comment ""

    # Last limb
    ctx.comment ""
    ctx.mul rdx, rax, M[N-1], rax
-    ctx.add scratch[N-1], rRAX
-    ctx.mov rRAX, scratch[1] # Contains next * m0
-    ctx.adc rRDX, 0
-    ctx.add scratch[N-1], hi
-    ctx.adc rRDX, 0
-    ctx.mov hi, rRDX
+    ctx.add u[N-1], rax
+    ctx.mov rax, u[1] # Contains next * m0
+    ctx.adc rdx, 0
+    ctx.add u[N-1], hi
+    ctx.adc rdx, 0
+    ctx.mov hi, rdx

-    scratch.rotateLeft()
+    u.rotateLeft()

-  # Code generation
-  result.add ctx.generate()
+  # Second part - Final subtraction
+  # ---------------------------------------------

-  # New codegen
-  ctx = init(Assembler_x86, BaseType)
-
-  let r = init(OperandArray, nimSymbol = r_MR, N, PointerInReg, InputOutput_EnsureClobber)
-  let a = init(OperandArray, nimSymbol = a_MR, N*2, PointerInReg, Input)
-  let extraRegNeeded = N-2
-  let t = init(OperandArray, nimSymbol = ident"t", extraRegNeeded, ElemsInReg, InputOutput_EnsureClobber)
-  let tsym = t.nimSymbol
-  result.add quote do:
-    var `tsym` {.noInit.}: Limbs[`extraRegNeeded`]
+  ctx.mov rdx, r_temp
+  let r = rdx.asArrayAddr(len = N)

  # This does a[i+n] += hi
  # but in a separate carry chain, fused with the
  # copy "r[i] = a[i+n]"
  for i in 0 ..< N:
    if i == 0:
-      ctx.add scratch[i], a[i+N]
+      ctx.add u[i], a[i+N]
    else:
-      ctx.adc scratch[i], a[i+N]
+      ctx.adc u[i], a[i+N]

-  let reuse = repackRegisters(t, scratch[N], scratch[N+1])
+  let t = repackRegisters(v, u[N], u[N+1])

+  # v is invalidated
  if spareBits >= 1:
-    ctx.finalSubNoCarry(r, scratch, M, reuse)
+    ctx.finalSubNoCarry(r, u, M, t)
  else:
-    ctx.finalSubCanOverflow(r, scratch, M, reuse, rRAX)
+    ctx.finalSubCanOverflow(r, u, M, t, rax)

  # Code generation
  result.add ctx.generate()
--- a/constantine/arithmetic/assembly/limbs_asm_montred_x86_adx_bmi2.nim
+++ b/constantine/arithmetic/assembly/limbs_asm_montred_x86_adx_bmi2.nim
@ -35,15 +35,13 @@ static: doAssert UseASM_X86_64
 # Montgomery reduction
 # ------------------------------------------------------------

-macro montyRedc2xx_gen[N: static int](
+macro montyRedc2x_gen[N: static int](
       r_MR: var array[N, SecretWord],
       a_MR: array[N*2, SecretWord],
       M_MR: array[N, SecretWord],
       m0ninv_MR: BaseType,
       spareBits: static int
      ) =
-  # TODO, slower than Clang, in particular due to the shadowing
-
  result = newStmtList()

  var ctx = init(Assembler_x86, BaseType)
@ -51,59 +49,29 @@ macro montyRedc2xx_gen[N: static int](
    # We could force M as immediate by specializing per moduli
    M = init(OperandArray, nimSymbol = M_MR, N, PointerInReg, Input)

-    hi = Operand(
-      desc: OperandDesc(
-        asmId: "[hi]",
-        nimSymbol: ident"hi",
-        rm: Reg,
-        constraint: Output_EarlyClobber,
-        cEmit: "hi"
-      )
-    )
+  let uSlots = N+1
+  let vSlots = max(N-1, 5)

-    lo = Operand(
-      desc: OperandDesc(
-        asmId: "[lo]",
-        nimSymbol: ident"lo",
-        rm: Reg,
-        constraint: Output_EarlyClobber,
-        cEmit: "lo"
-      )
-    )
-
-    rRDX = Operand(
-      desc: OperandDesc(
-        asmId: "[rdx]",
-        nimSymbol: ident"rdx",
-        rm: RDX,
-        constraint: InputOutput_EnsureClobber,
-        cEmit: "rdx"
-      )
-    )
-
-    m0ninv = Operand(
-      desc: OperandDesc(
-        asmId: "[m0ninv]",
-        nimSymbol: m0ninv_MR,
-        rm: Reg,
-        constraint: Input,
-        cEmit: "m0ninv"
-      )
-    )
-
-  let scratchSlots = N+1
-  var scratch = init(OperandArray, nimSymbol = ident"scratch", scratchSlots, ElemsInReg, InputOutput_EnsureClobber)
+  var # Scratchspaces
+    u = init(OperandArray, nimSymbol = ident"U", uSlots, ElemsInReg, InputOutput_EnsureClobber)
+    v = init(OperandArray, nimSymbol = ident"V", vSlots, ElemsInReg, InputOutput_EnsureClobber)

  # Prologue
-  let edx = rRDX.desc.nimSymbol
-  let hisym = hi.desc.nimSymbol
-  let losym = lo.desc.nimSymbol
-  let scratchSym = scratch.nimSymbol
+  let usym = u.nimSymbol
+  let vsym = v.nimSymbol
  result.add quote do:
    static: doAssert: sizeof(SecretWord) == sizeof(ByteAddress)
+    var `usym`{.noinit.}: Limbs[`uSlots`]
+    var `vsym` {.noInit.}: Limbs[`vSlots`]
+    `vsym`[0] = cast[SecretWord](`r_MR`[0].unsafeAddr)
+    `vsym`[1] = cast[SecretWord](`a_MR`[0].unsafeAddr)
+    `vsym`[2] = SecretWord(`m0ninv_MR`)

-    var `hisym`{.noInit.}, `losym`{.noInit.}, `edx`{.noInit.}: BaseType
-    var `scratchSym` {.noInit.}: Limbs[`scratchSlots`]
+  let r_temp = v[0].asArrayAddr(len = N)
+  let a = v[1].asArrayAddr(len = 2*N)
+  let m0ninv = v[2]
+  let lo = v[3]
+  let hi = v[4]

  # Algorithm
  # ---------------------------------------------------------
@ -122,63 +90,52 @@ macro montyRedc2xx_gen[N: static int](
  # No register spilling handling
  doAssert N <= 6, "The Assembly-optimized montgomery multiplication requires at most 6 limbs."

-  result.add quote do:
-    `edx` = BaseType(`m0ninv_MR`)
-    staticFor i, 0, `N`: # Do NOT use Nim slice/toOpenArray, they are not inlined
-      `scratchSym`[i] = `a_MR`[i]
+  ctx.mov rdx, m0ninv
+
+  for i in 0 ..< N:
+    ctx.mov u[i], a[i]

  for i in 0 ..< N:
    # RDX contains m0ninv at the start of each loop
    ctx.comment ""
-    ctx.imul rRDX, scratch[0] # m <- a[i] * m0ninv mod 2^w
+    ctx.imul rdx, u[0] # m <- a[i] * m0ninv mod 2^w
    ctx.comment "---- Reduction " & $i
-    ctx.`xor` scratch[N], scratch[N]
+    ctx.`xor` u[N], u[N]

    for j in 0 ..< N-1:
      ctx.comment ""
      ctx.mulx hi, lo, M[j], rdx
-      ctx.adcx scratch[j], lo
-      ctx.adox scratch[j+1], hi
+      ctx.adcx u[j], lo
+      ctx.adox u[j+1], hi

    # Last limb
    ctx.comment ""
    ctx.mulx hi, lo, M[N-1], rdx
-    ctx.mov rRDX, m0ninv # Reload m0ninv for next iter
-    ctx.adcx scratch[N-1], lo
-    ctx.adox hi, scratch[N]
-    ctx.adcx scratch[N], hi
+    ctx.mov rdx, m0ninv # Reload m0ninv for next iter
+    ctx.adcx u[N-1], lo
+    ctx.adox hi, u[N]
+    ctx.adcx u[N], hi

-    scratch.rotateLeft()
+    u.rotateLeft()

-  # Code generation
-  result.add ctx.generate()
-
-  # New codegen
-  ctx = init(Assembler_x86, BaseType)
-
-  let r = init(OperandArray, nimSymbol = r_MR, N, PointerInReg, InputOutput_EnsureClobber)
-  let a = init(OperandArray, nimSymbol = a_MR, N*2, PointerInReg, Input)
-  let extraRegNeeded = N-1
-  let t = init(OperandArray, nimSymbol = ident"t", extraRegNeeded, ElemsInReg, InputOutput_EnsureClobber)
-  let tsym = t.nimSymbol
-  result.add quote do:
-    var `tsym` {.noInit.}: Limbs[`extraRegNeeded`]
+  ctx.mov rdx, r_temp
+  let r = rdx.asArrayAddr(len = N)

  # This does a[i+n] += hi
  # but in a separate carry chain, fused with the
  # copy "r[i] = a[i+n]"
  for i in 0 ..< N:
    if i == 0:
-      ctx.add scratch[i], a[i+N]
+      ctx.add u[i], a[i+N]
    else:
-      ctx.adc scratch[i], a[i+N]
+      ctx.adc u[i], a[i+N]

-  let reuse = repackRegisters(t, scratch[N])
+  let t = repackRegisters(v, u[N])

  if spareBits >= 1:
-    ctx.finalSubNoCarry(r, scratch, M, reuse)
+    ctx.finalSubNoCarry(r, u, M, t)
  else:
-    ctx.finalSubCanOverflow(r, scratch, M, reuse, hi)
+    ctx.finalSubCanOverflow(r, u, M, t, hi)

  # Code generation
  result.add ctx.generate()
@ -191,4 +148,4 @@ func montRed_asm_adx_bmi2*[N: static int](
       spareBits: static int
      ) =
  ## Constant-time Montgomery reduction
-  montyRedc2xx_gen(r, a, M, m0ninv, spareBits)
+  montyRedc2x_gen(r, a, M, m0ninv, spareBits)
--- a/constantine/arithmetic/assembly/limbs_asm_mul_x86.nim
+++ b/constantine/arithmetic/assembly/limbs_asm_mul_x86.nim
@ -81,36 +81,13 @@ macro mul_gen[rLen, aLen, bLen: static int](r: var Limbs[rLen], a: Limbs[aLen],
    )

    # MUL requires RAX and RDX
-    rRAX = Operand(
-      desc: OperandDesc(
-        asmId: "[rax]",
-        nimSymbol: ident"rax",
-        rm: RAX,
-        constraint: Output_EarlyClobber,
-        cEmit: "rax"
-      )
-    )
-
-    rRDX = Operand(
-      desc: OperandDesc(
-        asmId: "[rdx]",
-        nimSymbol: ident"rdx",
-        rm: RDX,
-        constraint: Output_EarlyClobber,
-        cEmit: "rdx"
-      )
-    )
-

  # Prologue
  let tsym = t.desc.nimSymbol
  let usym = u.desc.nimSymbol
  let vsym = v.desc.nimSymbol
-  let eax = rRAX.desc.nimSymbol
-  let edx = rRDX.desc.nimSymbol
  result.add quote do:
    var `tsym`{.noInit.}, `usym`{.noInit.}, `vsym`{.noInit.}: BaseType # zero-init
-    var `eax`{.noInit.}, `edx`{.noInit.}: BaseType

  # Algorithm
  ctx.`xor` u, u
@ -127,10 +104,10 @@ macro mul_gen[rLen, aLen, bLen: static int](r: var Limbs[rLen], a: Limbs[aLen],
    let ia = i - ib
    for j in 0 ..< min(aLen - ia, ib+1):
      # (t, u, v) <- (t, u, v) + a[ia+j] * b[ib-j]
-      ctx.mov rRAX, arrB[ib-j]
+      ctx.mov rax, arrB[ib-j]
      ctx.mul rdx, rax, arrA[ia+j], rax
-      ctx.add v, rRAX
-      ctx.adc u, rRDX
+      ctx.add v, rax
+      ctx.adc u, rdx
      ctx.adc t, 0

    ctx.mov arrR[i], v
@ -141,9 +118,9 @@ macro mul_gen[rLen, aLen, bLen: static int](r: var Limbs[rLen], a: Limbs[aLen],
      ctx.`xor` t, t

  if aLen+bLen < rLen:
-    ctx.`xor` rRAX, rRAX
+    ctx.`xor` rax, rax
    for i in aLen+bLen ..< rLen:
-      ctx.mov arrR[i], rRAX
+      ctx.mov arrR[i], rax

  # Codegen
  result.add ctx.generate
@ -202,37 +179,12 @@ macro square_gen[rLen, aLen: static int](r: var Limbs[rLen], a: Limbs[aLen]) =
      )
    )

-    # MUL requires RAX and RDX
-    rRAX = Operand(
-      desc: OperandDesc(
-        asmId: "[rax]",
-        nimSymbol: ident"rax",
-        rm: RAX,
-        constraint: Output_EarlyClobber,
-        cEmit: "rax"
-      )
-    )
-
-    rRDX = Operand(
-      desc: OperandDesc(
-        asmId: "[rdx]",
-        nimSymbol: ident"rdx",
-        rm: RDX,
-        constraint: Output_EarlyClobber,
-        cEmit: "rdx"
-      )
-    )
-
-
  # Prologue
  let tsym = t.desc.nimSymbol
  let usym = u.desc.nimSymbol
  let vsym = v.desc.nimSymbol
-  let eax = rRAX.desc.nimSymbol
-  let edx = rRDX.desc.nimSymbol
  result.add quote do:
    var `tsym`{.noInit.}, `usym`{.noInit.}, `vsym`{.noInit.}: BaseType # zero-init
-    var `eax`{.noInit.}, `edx`{.noInit.}: BaseType

  # Algorithm
  ctx.`xor` u, u
@ -252,20 +204,20 @@ macro square_gen[rLen, aLen: static int](r: var Limbs[rLen], a: Limbs[aLen]) =
      let k2 = ib-j
      if k1 < k2:
        # (t, u, v) <- (t, u, v) + 2 * a[k1] * a[k2]
-        ctx.mov rRAX, arrA[k2]
+        ctx.mov rax, arrA[k2]
        ctx.mul rdx, rax, arrA[k1], rax
-        ctx.add rRAX, rRAX
-        ctx.adc rRDX, rRDX
+        ctx.add rax, rax
+        ctx.adc rdx, rdx
        ctx.adc t, 0
-        ctx.add v, rRAX
-        ctx.adc u, rRDX
+        ctx.add v, rax
+        ctx.adc u, rdx
        ctx.adc t, 0
      elif k1 == k2:
        # (t, u, v) <- (t, u, v) + a[k1] * a[k2]
-        ctx.mov rRAX, arrA[k2]
+        ctx.mov rax, arrA[k2]
        ctx.mul rdx, rax, arrA[k1], rax
-        ctx.add v, rRAX
-        ctx.adc u, rRDX
+        ctx.add v, rax
+        ctx.adc u, rdx
        ctx.adc t, 0
      else:
        discard
@ -278,9 +230,9 @@ macro square_gen[rLen, aLen: static int](r: var Limbs[rLen], a: Limbs[aLen]) =
      ctx.`xor` t, t

  if aLen*2 < rLen:
-    ctx.`xor` rRAX, rRAX
+    ctx.`xor` rax, rax
    for i in aLen*2 ..< rLen:
-      ctx.mov arrR[i], rRAX
+      ctx.mov arrR[i], rax

  # Codegen
  result.add ctx.generate
--- a/constantine/arithmetic/assembly/limbs_asm_mul_x86_adx_bmi2.nim
+++ b/constantine/arithmetic/assembly/limbs_asm_mul_x86_adx_bmi2.nim
@ -37,8 +37,7 @@ proc mulx_by_word(
       ctx: var Assembler_x86,
       r0: Operand,
       a, t: OperandArray,
-       word0: Operand,
-       rRAX, rRDX: Operand
+       word0: Operand
     ) =
  ## Multiply the `a[0..<N]` by `word`
  ## and store in `[t:r0]`
@ -52,18 +51,18 @@ proc mulx_by_word(
  #  (C,t[j])  := t[j] + a[j]*b[i] + C

  # First limb
-  ctx.mov rRDX, word0
-  ctx.`xor` rRAX, rRAX # Clear flags (important if steady state is skipped)
-  ctx.mulx t[0], rRAX, a[0], rdx
-  ctx.mov r0, rRAX
+  ctx.mov rdx, word0
+  ctx.`xor` rax, rax # Clear flags (important if steady state is skipped)
+  ctx.mulx t[0], rax, a[0], rdx
+  ctx.mov r0, rax

  # Steady state
  for j in 1 ..< N:
-    ctx.mulx t[j], rRAX, a[j], rdx
+    ctx.mulx t[j], rax, a[j], rdx
    if j == 1:
-      ctx.add t[j-1], rRAX
+      ctx.add t[j-1], rax
    else:
-      ctx.adc t[j-1], rRAX
+      ctx.adc t[j-1], rax

  # Final carries
  ctx.comment "  Accumulate last carries in hi word"
@ -74,8 +73,7 @@ proc mulaccx_by_word(
       r: OperandArray,
       i: int,
       a, t: OperandArray,
-       word: Operand,
-       rRAX, rRDX: Operand
+       word: Operand
     ) =
  ## Multiply the `a[0..<N]` by `word`
  ## and store in `[t:r0]`
@ -87,16 +85,16 @@ proc mulaccx_by_word(
  doAssert i != 0

  ctx.comment "  Outer loop i = " & $i & ", j in [0, " & $N & ")"
-  ctx.mov rRDX, word
-  ctx.`xor` rRAX, rRAX # Clear flags
+  ctx.mov rdx, word
+  ctx.`xor` rax, rax # Clear flags

  # for j=0 to N-1
  #  (C,t[j])  := t[j] + a[j]*b[i] + C

  # Steady state
  for j in 0 ..< N:
-    ctx.mulx hi, rRAX, a[j], rdx
-    ctx.adox t[j], rRAX
+    ctx.mulx hi, rax, a[j], rdx
+    ctx.adox t[j], rax
    if j == 0:
      ctx.mov r[i], t[j]
    if j == N-1:
@ -105,9 +103,9 @@ proc mulaccx_by_word(

  # Final carries
  ctx.comment "  Accumulate last carries in hi word"
-  ctx.mov  rRDX, 0 # Set to 0 without clearing flags
-  ctx.adcx hi, rRDX
-  ctx.adox hi, rRDX
+  ctx.mov  rdx, 0 # Set to 0 without clearing flags
+  ctx.adcx hi, rdx
+  ctx.adox hi, rdx

 macro mulx_gen[rLen, aLen, bLen: static int](rx: var Limbs[rLen], ax: Limbs[aLen], bx: Limbs[bLen]) =
  ## `a`, `b`, `r` can have a different number of limbs
@ -126,25 +124,6 @@ macro mulx_gen[rLen, aLen, bLen: static int](rx: var Limbs[rLen], ax: Limbs[aLen
    b = init(OperandArray, nimSymbol = bx, bLen, PointerInReg, Input)

    # MULX requires RDX
-    rRDX = Operand(
-      desc: OperandDesc(
-        asmId: "[rdx]",
-        nimSymbol: ident"rdx",
-        rm: RDX,
-        constraint: Output_EarlyClobber,
-        cEmit: "rdx"
-      )
-    )
-
-    rRAX = Operand(
-      desc: OperandDesc(
-        asmId: "[rax]",
-        nimSymbol: ident"rax",
-        rm: RAX,
-        constraint: Output_EarlyClobber,
-        cEmit: "rax"
-      )
-    )

    tSlots = aLen+1 # Extra for high word

@ -154,26 +133,21 @@ macro mulx_gen[rLen, aLen, bLen: static int](rx: var Limbs[rLen], ax: Limbs[aLen

  # Prologue
  let tsym = t.nimSymbol
-  let eax = rRAX.desc.nimSymbol
-  let edx = rRDX.desc.nimSymbol
  result.add quote do:
    var `tsym`{.noInit.}: array[`tSlots`, BaseType]
-    var `edx`{.noInit.}, `eax`{.noInit.}: BaseType

  for i in 0 ..< min(rLen, bLen):
    if i == 0:
      ctx.mulx_by_word(
        r[0],
        a, t,
-        b[0],
-        rRAX, rRDX,
+        b[0]
      )
    else:
      ctx.mulaccx_by_word(
        r, i,
        a, t,
-        b[i],
-        rRAX, rRDX
+        b[i]
      )

      t.rotateLeft()
@ -184,9 +158,9 @@ macro mulx_gen[rLen, aLen, bLen: static int](rx: var Limbs[rLen], ax: Limbs[aLen

  # Zero the extra
  if aLen+bLen < rLen:
-    ctx.`xor` rRAX, rRAX
+    ctx.`xor` rax, rax
    for i in aLen+bLen ..< rLen:
-      ctx.mov r[i], rRAX
+      ctx.mov r[i], rax

  # Codegen
  result.add ctx.generate
--- a/constantine/primitives/macro_assembler_x86.nim
+++ b/constantine/primitives/macro_assembler_x86.nim
@ -39,6 +39,9 @@ type
    # Flags
    CarryFlag      = "@ccc"

+    # Clobbered register
+    ClobberedReg
+
  Register* = enum
    rbx, rdx, r8, rax, xmm0

@ -50,6 +53,7 @@ type
    Output_EarlyClobber = "=&"
    InputOutput         = "+"
    InputOutput_EnsureClobber = "+&" # For register InputOutput, clang needs "+&" bug?
+    ClobberedRegister

  OpKind = enum
    kRegister
@ -88,11 +92,12 @@ type
    wordSize: int
    areFlagsClobbered: bool
    isStackClobbered: bool
+    regClobbers: set[Register]

  Stack* = object

 const SpecificRegisters = {RCX, RDX, R8, RAX}
-const OutputReg = {Output_EarlyClobber, InputOutput, InputOutput_EnsureClobber, Output_Overwrite}
+const OutputReg = {Output_EarlyClobber, InputOutput, InputOutput_EnsureClobber, Output_Overwrite, ClobberedRegister}

 func hash(od: OperandDesc): Hash =
  {.noSideEffect.}:
@ -195,6 +200,24 @@ func asArrayAddr*(op: Operand, len: int): Operand =
      offset: i
    )

+func asArrayAddr*(op: Register, len: int): Operand =
+  ## Use the value held in the hardcoded register `op` as the base address of an array of `len` elements
+  result = Operand(
+    kind: kArrayAddr,
+    desc: nil,  # the array operand itself carries no descriptor; each element in `buf` has its own
+    buf: newSeq[Operand](len)
+  )
+  for i in 0 ..< len:
+    result.buf[i] = Operand(
+      desc: OperandDesc(
+        asmId: $op,  # the register's enum name, e.g. "rdx"
+        rm: ClobberedReg,
+        constraint: ClobberedRegister
+      ),
+      kind: kFromArray,
+      offset: i  # element index; getStrOffset multiplies it by the word size
+    )
+
 # Code generation
 # ------------------------------------------------------------------------------------------------------------

@ -268,6 +291,12 @@ func generate*(a: Assembler_x86): NimNode =
      else:
        clobberList.add ", \"" & str & '\"'

+  for reg in a.regClobbers:
+    if clobberList.len == 2:
+      clobberList.add "\"" & $reg & '\"'
+    else:
+      clobberList.add ", \"" & $reg & '\"'
+
  params.add clobberList

  # GCC will optimize ASM away if there are no
@ -293,7 +322,15 @@ func generate*(a: Assembler_x86): NimNode =

 func getStrOffset(a: Assembler_x86, op: Operand): string =
  if op.kind != kFromArray:
-    return "%" & op.desc.asmId
+    if op.kind == kArrayAddr:
+      # We are operating on an array pointer
+      # instead of array elements
+      if op.buf[0].desc.constraint == ClobberedRegister:
+        return "%%" & op.buf[0].desc.asmId
+      else:
+        return "%" & op.buf[0].desc.asmId
+    else:
+      return "%" & op.desc.asmId

  # Beware GCC / Clang differences with array offsets
  # https://lists.llvm.org/pipermail/llvm-dev/2017-August/116202.html
@ -315,12 +352,16 @@ func getStrOffset(a: Assembler_x86, op: Operand): string =
       op.desc.rm in SpecificRegisters or
       (op.desc.rm == ElemsInReg and op.kind == kFromArray):
    if op.offset == 0:
-      return "(%" & $op.desc.asmId & ')'
+      return "(%" & op.desc.asmId & ')'
    # GCC & Clang seemed to disagree on pointer indexing
    # in the past and required different codegen
    # if defined(gcc):
-    #   return $(op.offset * a.wordSize) & "+(%" & $op.desc.asmId & ')'
-    return $(op.offset * a.wordSize) & "(%" & $op.desc.asmId & ')'
+    #   return $(op.offset * a.wordSize) & "+(%" & op.desc.asmId & ')'
+    return $(op.offset * a.wordSize) & "(%" & op.desc.asmId & ')'
+  elif op.desc.rm == ClobberedReg: # Array in clobbered register
+    if op.offset == 0:
+      return "(%%" & op.desc.asmId & ')'
+    return $(op.offset * a.wordSize) & "(%%" & op.desc.asmId & ')'
  else:
    error "Unsupported: " & $op.desc.rm.ord

@ -335,7 +376,8 @@ func codeFragment(a: var Assembler_x86, instr: string, op: Operand) =
  else:
    error "Unsupported bitwidth: " & $a.wordBitWidth

-  a.operands.incl op.desc
+  if op.desc.constraint != ClobberedRegister:
+    a.operands.incl op.desc

 func codeFragment(a: var Assembler_x86, instr: string, op0, op1: Operand) =
  # Generate a code fragment
@ -352,8 +394,56 @@ func codeFragment(a: var Assembler_x86, instr: string, op0, op1: Operand) =
  else:
    error "Unsupported bitwidth: " & $a.wordBitWidth

-  a.operands.incl op0.desc
-  a.operands.incl op1.desc
+  if op0.desc.constraint != ClobberedRegister:
+    a.operands.incl op0.desc
+  if op1.desc.constraint != ClobberedRegister:
+    a.operands.incl op1.desc
+
+func codeFragment(a: var Assembler_x86, instr: string, op: Operand, reg: Register) =
+  # Append `instr<q|l> <op>, %%<reg>` to the generated assembly.
+  # ⚠️ Warning:
+  # The caller must pass the operands in GNU Assembly (AT&T) order,
+  # i.e. source first and destination last. `reg` is recorded as clobbered.
+  let off = a.getStrOffset(op)
+
+  if a.wordBitWidth == 64:
+    a.code &= instr & "q " & off & ", %%" & $reg & '\n'
+  else:
+    a.code &= instr & "l " & off & ", %%" & $reg & '\n'
+
+  # op.desc can be nil for renamed registers (using asArrayAddr): skip operand registration then
+  if not op.desc.isNil and op.desc.constraint != ClobberedRegister:
+    a.operands.incl op.desc
+  a.regClobbers.incl reg
+
+func codeFragment(a: var Assembler_x86, instr: string, reg: Register, op: Operand) =
+  # Append `instr<q|l> %%<reg>, <op>` to the assembly; the caller must pass
+  # operands in GNU Assembly (AT&T) order, i.e. source first, destination last.
+  # `reg` is recorded as clobbered. `op.desc` can be nil for renamed registers
+  # (using asArrayAddr), in which case no operand is registered.
+  let off = a.getStrOffset(op)
+
+  if a.wordBitWidth == 64:
+    a.code &= instr & "q %%" & $reg & ", " & off & '\n'
+  else:
+    a.code &= instr & "l %%" & $reg & ", " & off & '\n'
+
+  if not op.desc.isNil and op.desc.constraint != ClobberedRegister:
+    a.operands.incl op.desc
+  a.regClobbers.incl reg
+
+func codeFragment(a: var Assembler_x86, instr: string, reg0, reg1: Register) =
+  # Append `instr<q|l> %%<reg0>, %%<reg1>` to the generated assembly.
+  # ⚠️ Warning:
+  # The caller must pass the registers in GNU Assembly (AT&T) order,
+  # i.e. source first and destination last. Both registers are recorded as clobbered.
+  if a.wordBitWidth == 64:
+    a.code &= instr & "q %%" & $reg0 & ", %%" & $reg1 & '\n'
+  else:
+    a.code &= instr & "l %%" & $reg0 & ", %%" & $reg1 & '\n'
+
+  a.regClobbers.incl reg0
+  a.regClobbers.incl reg1

 func codeFragment(a: var Assembler_x86, instr: string, imm: int, op: Operand) =
  # Generate a code fragment
@ -367,7 +457,8 @@ func codeFragment(a: var Assembler_x86, instr: string, imm: int, op: Operand) =
  else:
    a.code &= instr & "l $" & $imm & ", " & off & '\n'

-  a.operands.incl op.desc
+  if op.desc.constraint != ClobberedRegister:
+    a.operands.incl op.desc

 func codeFragment(a: var Assembler_x86, instr: string, reg: Register, op: OperandReuse) =
  # Generate a code fragment
@ -378,6 +469,7 @@ func codeFragment(a: var Assembler_x86, instr: string, reg: Register, op: Operan
    a.code &= instr & "q %%" & $reg & ", %" & $op.asmId & '\n'
  else:
    a.code &= instr & "l %%" & $reg & ", %" & $op.asmId & '\n'
+  a.regClobbers.incl reg

 func codeFragment(a: var Assembler_x86, instr: string, op: OperandReuse, reg: Register) =
  # Generate a code fragment
@ -388,6 +480,7 @@ func codeFragment(a: var Assembler_x86, instr: string, op: OperandReuse, reg: Re
    a.code &= instr & "q %" & $op.asmId & ", %%" & $reg & '\n'
  else:
    a.code &= instr & "l %" & $op.asmId & ", %%" & $reg & '\n'
+  a.regClobbers.incl reg

 func codeFragment(a: var Assembler_x86, instr: string, imm: int, reg: Register) =
  # Generate a code fragment
@ -398,16 +491,7 @@ func codeFragment(a: var Assembler_x86, instr: string, imm: int, reg: Register)
    a.code &= instr & "q $" & $imm & ", %%" & $reg & '\n'
  else:
    a.code &= instr & "l $" & $imm & ", %%" & $reg & '\n'
-
-func codeFragment(a: var Assembler_x86, instr: string, reg0, reg1: Register) =
-  # Generate a code fragment
-  # ⚠️ Warning:
-  # The caller should deal with destination/source operand
-  # so that it fits GNU Assembly
-  if a.wordBitWidth == 64:
-    a.code &= instr & "q %%" & $reg0 & ", %%" & $reg1 & '\n'
-  else:
-    a.code &= instr & "l %%" & $reg0 & ", %%" & $reg1 & '\n'
+  a.regClobbers.incl reg

 func codeFragment(a: var Assembler_x86, instr: string, imm: int, reg: OperandReuse) =
  # Generate a code fragment
@ -429,33 +513,35 @@ func codeFragment(a: var Assembler_x86, instr: string, reg0, reg1: OperandReuse)
  else:
    a.code &= instr & "l %" & $reg0.asmId & ", %" & $reg1.asmId & '\n'

-func codeFragment(a: var Assembler_x86, instr: string, reg0: OperandReuse, reg1: Operand) =
+func codeFragment(a: var Assembler_x86, instr: string, op0: OperandReuse, op1: Operand) =
   # Append `instr<q|l> %<op0>, <op1>` to the generated assembly.
   # ⚠️ Warning:
   # The caller must pass the operands in GNU Assembly (AT&T) order,
   # i.e. source first and destination last.
-  let off1 = a.getStrOffset(reg1)
+  let off1 = a.getStrOffset(op1)

   if a.wordBitWidth == 64:
-    a.code &= instr & "q %" & $reg0.asmId & ", " & off1 & '\n'
+    a.code &= instr & "q %" & $op0.asmId & ", " & off1 & '\n'
   else:
-    a.code &= instr & "l %" & $reg0.asmId & ", " & off1 & '\n'
+    a.code &= instr & "l %" & $op0.asmId & ", " & off1 & '\n'

-  a.operands.incl reg1.desc
+  if op1.desc.constraint != ClobberedRegister:
+    a.operands.incl op1.desc

-func codeFragment(a: var Assembler_x86, instr: string, reg0: Operand, reg1: OperandReuse) =
+func codeFragment(a: var Assembler_x86, instr: string, op0: Operand, op1: OperandReuse) =
   # Append `instr<q|l> <op0>, %<op1>` to the generated assembly.
   # ⚠️ Warning:
   # The caller must pass the operands in GNU Assembly (AT&T) order,
   # i.e. source first and destination last.
-  let off0 = a.getStrOffset(reg0)
+  let off0 = a.getStrOffset(op0)

   if a.wordBitWidth == 64:
-    a.code &= instr & "q " & off0 & ", %" & $reg1.asmId & '\n'
+    a.code &= instr & "q " & off0 & ", %" & $op1.asmId & '\n'
   else:
-    a.code &= instr & "l " & off0 & ", %" & $reg1.asmId & '\n'
+    a.code &= instr & "l " & off0 & ", %" & $op1.asmId & '\n'

-  a.operands.incl reg0.desc
+  if op0.desc.constraint != ClobberedRegister:
+    a.operands.incl op0.desc

 func reuseRegister*(reg: OperandArray): OperandReuse =
  # TODO: disable the reg input
@ -481,6 +567,22 @@ func add*(a: var Assembler_x86, dst, src: Operand) =
  a.codeFragment("add", src, dst)
  a.areFlagsClobbered = true

+func add*(a: var Assembler_x86, dst, src: Register) =
+  ## Does: dst <- dst + src, with dst and src fixed registers
+  a.codeFragment("add", src, dst)
+  a.areFlagsClobbered = true
+
+func add*(a: var Assembler_x86, dst: Operand, src: Register) =
+  ## Does: dst <- dst + src, with src a fixed register
+  doAssert dst.desc.constraint in OutputReg
+  a.codeFragment("add", src, dst)
+  a.areFlagsClobbered = true
+
+func add*(a: var Assembler_x86, dst: Register, src: Operand) =
+  ## Does: dst <- dst + src, with dst a fixed register
+  a.codeFragment("add", src, dst)
+  a.areFlagsClobbered = true
+
 func adc*(a: var Assembler_x86, dst, src: Operand) =
  ## Does: dst <- dst + src + carry
  doAssert dst.desc.constraint in OutputReg
@ -490,6 +592,11 @@ func adc*(a: var Assembler_x86, dst, src: Operand) =
  if dst.desc.rm in {Mem, MemOffsettable, AnyRegOrMem}:
    {.warning: "Using addcarry with a memory destination, this incurs significant performance penalties.".}

+func adc*(a: var Assembler_x86, dst, src: Register) =
+  ## Does: dst <- dst + src + carry, with dst and src fixed registers
+  a.codeFragment("adc", src, dst)
+  a.areFlagsClobbered = true
+
 func adc*(a: var Assembler_x86, dst: Operand, imm: int) =
  ## Does: dst <- dst + imm + borrow
  doAssert dst.desc.constraint in OutputReg
@ -499,6 +606,17 @@ func adc*(a: var Assembler_x86, dst: Operand, imm: int) =
  if dst.desc.rm in {Mem, MemOffsettable, AnyRegOrMem}:
    {.warning: "Using addcarry with a memory destination, this incurs significant performance penalties.".}

+func adc*(a: var Assembler_x86, dst: Operand, src: Register) =
+  ## Does: dst <- dst + src + carry, with src a fixed register
+  doAssert dst.desc.constraint in OutputReg
+  a.codeFragment("adc", src, dst)
+  a.areFlagsClobbered = true
+
+func adc*(a: var Assembler_x86, dst: Register, imm: int) =
+  ## Does: dst <- dst + imm + carry, with dst a fixed register
+  a.codeFragment("adc", imm, dst)
+  a.areFlagsClobbered = true
+
 func sub*(a: var Assembler_x86, dst, src: Operand) =
  ## Does: dst <- dst - src
  doAssert dst.desc.constraint in OutputReg
@ -597,6 +715,12 @@ func `xor`*(a: var Assembler_x86, dst, src: Operand) =
  a.codeFragment("xor", src, dst)
  a.areFlagsClobbered = true

+func `xor`*(a: var Assembler_x86, dst, src: Register) =
+  ## Does: dst <- dst xor src, with dst and src fixed registers,
+  ## and records that the flags are clobbered
+  a.codeFragment("xor", src, dst)
+  a.areFlagsClobbered = true
+
 func mov*(a: var Assembler_x86, dst, src: Operand) =
  ## Does: dst <- src
  doAssert dst.desc.constraint in OutputReg, $dst.repr
@ -625,16 +749,26 @@ func mov*(a: var Assembler_x86, dst: Operand, imm: int) =
  a.codeFragment("mov", imm, dst)
  # No clobber

+func mov*(a: var Assembler_x86, dst: Register, imm: int) =
+  ## Does: dst <- imm with dst a fixed register
+  a.codeFragment("mov", imm, dst)
+
+func mov*(a: var Assembler_x86, dst: Register, src: Operand) =
+  ## Does: dst <- src with dst a fixed register and src an operand
+  a.codeFragment("mov", src, dst)
+
+func mov*(a: var Assembler_x86, dst: Operand, src: Register) =
+  ## Does: dst <- src with src a fixed register
+  a.codeFragment("mov", src, dst)
+
 func mov*(a: var Assembler_x86, dst: Register, src: OperandReuse) =
   ## Does: dst <- src with dst a fixed register and src a reused operand
   a.codeFragment("mov", src, dst)
-  # No clobber

 func mov*(a: var Assembler_x86, dst: OperandReuse, src: Register) =
   ## Does: dst <- src with src a fixed register
   # doAssert dst.desc.constraint in OutputReg, $dst.repr
   a.codeFragment("mov", src, dst)
-  # No clobber

 func cmovc*(a: var Assembler_x86, dst, src: Operand) =
  ## Does: dst <- src if the carry flag is set
@ -697,6 +831,8 @@ func mul*(a: var Assembler_x86, dHi, dLo: Register, src0: Operand, src1: Registe
  doAssert src1 == rax, "MUL requires the RAX register"
  doAssert dHi == rdx,  "MUL requires the RDX register"
  doAssert dLo == rax,   "MUL requires the RAX register"
+  a.regClobbers.incl rax
+  a.regClobbers.incl rdx

  a.codeFragment("mul", src0)

@ -707,9 +843,15 @@ func imul*(a: var Assembler_x86, dst, src: Operand) =

  a.codeFragment("imul", src, dst)

+func imul*(a: var Assembler_x86, dst: Register, src: Operand) =
+  ## Does dst <- dst * src, keeping only the low half, with dst a fixed register
+  a.codeFragment("imul", src, dst)
+
 func mulx*(a: var Assembler_x86, dHi, dLo, src0: Operand, src1: Register) =
  ## Does (dHi, dLo) <- src0 * src1
  doAssert src1 == rdx, "MULX requires the RDX register"
+  a.regClobbers.incl rdx
+
  doAssert dHi.desc.rm in {Reg, ElemsInReg}+SpecificRegisters,
    "The destination operand must be a register " & $dHi.repr
  doAssert dLo.desc.rm in {Reg, ElemsInReg}+SpecificRegisters,
@ -727,9 +869,31 @@ func mulx*(a: var Assembler_x86, dHi, dLo, src0: Operand, src1: Register) =

  a.operands.incl src0.desc

+func mulx*(a: var Assembler_x86, dHi: Operand, dLo: Register, src0: Operand, src1: Register) =
+  ## Does (dHi, dLo) <- src0 * src1, with dLo a fixed register; src1 must be RDX
+  doAssert src1 == rdx, "MULX requires the RDX register"
+  a.regClobbers.incl rdx
+
+  doAssert dHi.desc.rm in {Reg, ElemsInReg}+SpecificRegisters,
+    "The destination operand must be a register " & $dHi.repr
+  doAssert dHi.desc.constraint in OutputReg
+
+  let off0 = a.getStrOffset(src0)
+
+  # AT&T syntax: source first, then the low destination, then the high destination
+  if a.wordBitWidth == 64:
+    a.code &= "mulxq " & off0 & ", %%" & $dLo & ", %" & $dHi.desc.asmId & '\n'
+  else:
+    a.code &= "mulxl " & off0 & ", %%" & $dLo & ", %" & $dHi.desc.asmId & '\n'
+
+  a.operands.incl src0.desc
+  a.regClobbers.incl dLo
+
 func mulx*(a: var Assembler_x86, dHi: OperandReuse, dLo, src0: Operand, src1: Register) =
  ## Does (dHi, dLo) <- src0 * src1
  doAssert src1 == rdx, "MULX requires the RDX register"
+  a.regClobbers.incl rdx
+
  doAssert dLo.desc.rm in {Reg, ElemsInReg}+SpecificRegisters,
    "The destination operand must be a register " & $dLo.repr
  doAssert dLo.desc.constraint in OutputReg
@ -744,7 +908,40 @@ func mulx*(a: var Assembler_x86, dHi: OperandReuse, dLo, src0: Operand, src1: Re

  a.operands.incl src0.desc

-func adcx*(a: var Assembler_x86, dst: Operand|OperandReuse, src: Operand|OperandReuse) =
+func mulx*(a: var Assembler_x86, dHi: OperandReuse, dLo: Register, src0: Operand, src1: Register) =
+  ## Does (dHi, dLo) <- src0 * src1, with dHi a reused operand and dLo a fixed register; src1 must be RDX
+  doAssert src1 == rdx, "MULX requires the RDX register"
+  a.regClobbers.incl rdx
+
+  let off0 = a.getStrOffset(src0)
+
+  # AT&T syntax: source first, then the low destination, then the high destination
+  if a.wordBitWidth == 64:
+    a.code &= "mulxq " & off0 & ", %%" & $dLo & ", %" & $dHi.asmId & '\n'
+  else:
+    a.code &= "mulxl " & off0 & ", %%" & $dLo & ", %" & $dHi.asmId & '\n'
+
+  a.operands.incl src0.desc
+  a.regClobbers.incl dLo
+
+func mulx*(a: var Assembler_x86, dHi, dLo: Register, src0: Operand, src1: Register) =
+  ## Does (dHi, dLo) <- src0 * src1, with dHi and dLo fixed registers; src1 must be RDX
+  doAssert src1 == rdx, "MULX requires the RDX register"
+  a.regClobbers.incl rdx
+
+  let off0 = a.getStrOffset(src0)
+
+  # AT&T syntax: source first, then the low destination, then the high destination
+  if a.wordBitWidth == 64:
+    a.code &= "mulxq " & off0 & ", %%" & $dLo & ", %%" & $dHi & '\n'
+  else:
+    a.code &= "mulxl " & off0 & ", %%" & $dLo & ", %%" & $dHi & '\n'
+
+  a.operands.incl src0.desc
+  a.regClobbers.incl dHi
+  a.regClobbers.incl dLo
+
+func adcx*(a: var Assembler_x86, dst: Operand|OperandReuse, src: Operand|OperandReuse|Register) =
  ## Does: dst <- dst + src + carry
  ## and only sets the carry flag
  when dst is Operand:
@ -753,7 +950,7 @@ func adcx*(a: var Assembler_x86, dst: Operand|OperandReuse, src: Operand|Operand
  a.codeFragment("adcx", src, dst)
  a.areFlagsClobbered = true

-func adox*(a: var Assembler_x86, dst: Operand|OperandReuse, src: Operand|OperandReuse) =
+func adox*(a: var Assembler_x86, dst: Operand|OperandReuse, src: Operand|OperandReuse|Register) =
  ## Does: dst <- dst + src + overflow
  ## and only sets the overflow flag
  when dst is Operand: