mirror of
https://github.com/logos-storage/constantine.git
synced 2026-01-05 22:53:12 +00:00
Cleanup: introduce clobbered registers, remove explicit rax, rdx for multiplication (minus 30-50 lines for related assembly files)
This commit is contained in:
parent
18069e54d3
commit
8918cabb56
@ -57,25 +57,6 @@ macro montMul_CIOS_nocarry_gen[N: static int](r_MM: var Limbs[N], a_MM, b_MM, M_
|
|||||||
scratch = init(OperandArray, nimSymbol = ident"scratch", scratchSlots, ElemsInReg, InputOutput_EnsureClobber)
|
scratch = init(OperandArray, nimSymbol = ident"scratch", scratchSlots, ElemsInReg, InputOutput_EnsureClobber)
|
||||||
|
|
||||||
# MUL requires RAX and RDX
|
# MUL requires RAX and RDX
|
||||||
rRAX = Operand(
|
|
||||||
desc: OperandDesc(
|
|
||||||
asmId: "[rax]",
|
|
||||||
nimSymbol: ident"rax",
|
|
||||||
rm: RAX,
|
|
||||||
constraint: Output_EarlyClobber,
|
|
||||||
cEmit: "rax"
|
|
||||||
)
|
|
||||||
)
|
|
||||||
|
|
||||||
rRDX = Operand(
|
|
||||||
desc: OperandDesc(
|
|
||||||
asmId: "[rdx]",
|
|
||||||
nimSymbol: ident"rdx",
|
|
||||||
rm: RDX,
|
|
||||||
constraint: Output_EarlyClobber,
|
|
||||||
cEmit: "rdx"
|
|
||||||
)
|
|
||||||
)
|
|
||||||
|
|
||||||
m0ninv = Operand(
|
m0ninv = Operand(
|
||||||
desc: OperandDesc(
|
desc: OperandDesc(
|
||||||
@ -109,16 +90,12 @@ macro montMul_CIOS_nocarry_gen[N: static int](r_MM: var Limbs[N], a_MM, b_MM, M_
|
|||||||
|
|
||||||
let tsym = t.nimSymbol
|
let tsym = t.nimSymbol
|
||||||
let scratchSym = scratch.nimSymbol
|
let scratchSym = scratch.nimSymbol
|
||||||
let eax = rRAX.desc.nimSymbol
|
|
||||||
let edx = rRDX.desc.nimSymbol
|
|
||||||
result.add quote do:
|
result.add quote do:
|
||||||
static: doAssert: sizeof(SecretWord) == sizeof(ByteAddress)
|
static: doAssert: sizeof(SecretWord) == sizeof(ByteAddress)
|
||||||
|
|
||||||
var `tsym`: typeof(`r_MM`) # zero init
|
var `tsym`: typeof(`r_MM`) # zero init
|
||||||
# Assumes 64-bit limbs on 64-bit arch (or you can't store an address)
|
# Assumes 64-bit limbs on 64-bit arch (or you can't store an address)
|
||||||
var `scratchSym` {.noInit.}: Limbs[`scratchSlots`]
|
var `scratchSym` {.noInit.}: Limbs[`scratchSlots`]
|
||||||
var `eax`{.noInit.}, `edx`{.noInit.}: BaseType
|
|
||||||
|
|
||||||
`scratchSym`[0] = cast[SecretWord](`a_MM`[0].unsafeAddr)
|
`scratchSym`[0] = cast[SecretWord](`a_MM`[0].unsafeAddr)
|
||||||
`scratchSym`[1] = cast[SecretWord](`b_MM`[0].unsafeAddr)
|
`scratchSym`[1] = cast[SecretWord](`b_MM`[0].unsafeAddr)
|
||||||
`scratchSym`[5] = cast[SecretWord](`r_MM`[0].unsafeAddr)
|
`scratchSym`[5] = cast[SecretWord](`r_MM`[0].unsafeAddr)
|
||||||
@ -140,14 +117,14 @@ macro montMul_CIOS_nocarry_gen[N: static int](r_MM: var Limbs[N], a_MM, b_MM, M_
|
|||||||
|
|
||||||
for i in 0 ..< N:
|
for i in 0 ..< N:
|
||||||
# (A, t[0]) <- a[0] * b[i] + t[0]
|
# (A, t[0]) <- a[0] * b[i] + t[0]
|
||||||
ctx.mov rRAX, a[0]
|
ctx.mov rax, a[0]
|
||||||
ctx.mul rdx, rax, b[i], rax
|
ctx.mul rdx, rax, b[i], rax
|
||||||
if i == 0: # overwrite t[0]
|
if i == 0: # overwrite t[0]
|
||||||
ctx.mov t[0], rRAX
|
ctx.mov t[0], rax
|
||||||
else: # Accumulate in t[0]
|
else: # Accumulate in t[0]
|
||||||
ctx.add t[0], rRAX
|
ctx.add t[0], rax
|
||||||
ctx.adc rRDX, 0
|
ctx.adc rdx, 0
|
||||||
ctx.mov A, rRDX
|
ctx.mov A, rdx
|
||||||
|
|
||||||
# m <- (t[0] * m0ninv) mod 2^w
|
# m <- (t[0] * m0ninv) mod 2^w
|
||||||
ctx.mov m, m0ninv
|
ctx.mov m, m0ninv
|
||||||
@ -155,39 +132,39 @@ macro montMul_CIOS_nocarry_gen[N: static int](r_MM: var Limbs[N], a_MM, b_MM, M_
|
|||||||
|
|
||||||
# (C, _) <- m * M[0] + t[0]
|
# (C, _) <- m * M[0] + t[0]
|
||||||
ctx.`xor` C, C
|
ctx.`xor` C, C
|
||||||
ctx.mov rRAX, M[0]
|
ctx.mov rax, M[0]
|
||||||
ctx.mul rdx, rax, m, rax
|
ctx.mul rdx, rax, m, rax
|
||||||
ctx.add rRAX, t[0]
|
ctx.add rax, t[0]
|
||||||
ctx.adc C, rRDX
|
ctx.adc C, rdx
|
||||||
|
|
||||||
for j in 1 ..< N:
|
for j in 1 ..< N:
|
||||||
# (A, t[j]) <- a[j] * b[i] + A + t[j]
|
# (A, t[j]) <- a[j] * b[i] + A + t[j]
|
||||||
ctx.mov rRAX, a[j]
|
ctx.mov rax, a[j]
|
||||||
ctx.mul rdx, rax, b[i], rax
|
ctx.mul rdx, rax, b[i], rax
|
||||||
if i == 0:
|
if i == 0:
|
||||||
ctx.mov t[j], A
|
ctx.mov t[j], A
|
||||||
else:
|
else:
|
||||||
ctx.add t[j], A
|
ctx.add t[j], A
|
||||||
ctx.adc rRDX, 0
|
ctx.adc rdx, 0
|
||||||
ctx.`xor` A, A
|
ctx.`xor` A, A
|
||||||
ctx.add t[j], rRAX
|
ctx.add t[j], rax
|
||||||
ctx.adc A, rRDX
|
ctx.adc A, rdx
|
||||||
|
|
||||||
# (C, t[j-1]) <- m * M[j] + C + t[j]
|
# (C, t[j-1]) <- m * M[j] + C + t[j]
|
||||||
ctx.mov rRAX, M[j]
|
ctx.mov rax, M[j]
|
||||||
ctx.mul rdx, rax, m, rax
|
ctx.mul rdx, rax, m, rax
|
||||||
ctx.add C, t[j]
|
ctx.add C, t[j]
|
||||||
ctx.adc rRDX, 0
|
ctx.adc rdx, 0
|
||||||
ctx.add C, rRAX
|
ctx.add C, rax
|
||||||
ctx.adc rRDX, 0
|
ctx.adc rdx, 0
|
||||||
ctx.mov t[j-1], C
|
ctx.mov t[j-1], C
|
||||||
ctx.mov C, rRDX
|
ctx.mov C, rdx
|
||||||
|
|
||||||
ctx.add A, C
|
ctx.add A, C
|
||||||
ctx.mov t[N-1], A
|
ctx.mov t[N-1], A
|
||||||
|
|
||||||
ctx.mov rRDX, r
|
ctx.mov rdx, r
|
||||||
let r2 = rRDX.asArrayAddr(len = N)
|
let r2 = rdx.asArrayAddr(len = N)
|
||||||
|
|
||||||
ctx.finalSubNoCarry(
|
ctx.finalSubNoCarry(
|
||||||
r2, t, M,
|
r2, t, M,
|
||||||
|
|||||||
@ -40,7 +40,7 @@ proc mulx_by_word(
|
|||||||
t: OperandArray,
|
t: OperandArray,
|
||||||
a: Operand, # Pointer in scratchspace
|
a: Operand, # Pointer in scratchspace
|
||||||
word0: Operand,
|
word0: Operand,
|
||||||
lo, rRDX: Operand
|
lo: Operand
|
||||||
) =
|
) =
|
||||||
## Multiply the `a[0..<N]` by `word` and store in `t[0..<N]`
|
## Multiply the `a[0..<N]` by `word` and store in `t[0..<N]`
|
||||||
## and carry register `C` (t[N])
|
## and carry register `C` (t[N])
|
||||||
@ -55,7 +55,7 @@ proc mulx_by_word(
|
|||||||
# (C,t[j]) := t[j] + a[j]*b[i] + C
|
# (C,t[j]) := t[j] + a[j]*b[i] + C
|
||||||
|
|
||||||
# First limb
|
# First limb
|
||||||
ctx.mov rRDX, word0
|
ctx.mov rdx, word0
|
||||||
if N > 1:
|
if N > 1:
|
||||||
ctx.mulx t[1], t[0], a[0], rdx
|
ctx.mulx t[1], t[0], a[0], rdx
|
||||||
ctx.`xor` hi, hi # Clear flags - TODO: necessary?
|
ctx.`xor` hi, hi # Clear flags - TODO: necessary?
|
||||||
@ -87,20 +87,19 @@ proc mulaccx_by_word(
|
|||||||
a: Operand, # Pointer in scratchspace
|
a: Operand, # Pointer in scratchspace
|
||||||
i: int,
|
i: int,
|
||||||
word: Operand,
|
word: Operand,
|
||||||
lo, rRDX: Operand
|
lo: Operand
|
||||||
) =
|
) =
|
||||||
## Multiply the `a[0..<N]` by `word`
|
## Multiply the `a[0..<N]` by `word`
|
||||||
## and accumulate in `t[0..<N]`
|
## and accumulate in `t[0..<N]`
|
||||||
## and carry register `C` (t[N])
|
## and carry register `C` (t[N])
|
||||||
## `t` and `C` are multiply-accumulated
|
## `t` and `C` are multiply-accumulated
|
||||||
## `S` is a scratchspace register
|
## `S` is a scratchspace register
|
||||||
## `rRDX` is the RDX register descriptor
|
|
||||||
let N = min(a.len, t.len)
|
let N = min(a.len, t.len)
|
||||||
|
|
||||||
doAssert i != 0
|
doAssert i != 0
|
||||||
|
|
||||||
ctx.comment " Outer loop i = " & $i & ", j in [0, " & $N & ")"
|
ctx.comment " Outer loop i = " & $i & ", j in [0, " & $N & ")"
|
||||||
ctx.mov rRDX, word
|
ctx.mov rdx, word
|
||||||
ctx.`xor` hi, hi # Clear flags - TODO: necessary?
|
ctx.`xor` hi, hi # Clear flags - TODO: necessary?
|
||||||
|
|
||||||
# for j=0 to N-1
|
# for j=0 to N-1
|
||||||
@ -119,9 +118,9 @@ proc mulaccx_by_word(
|
|||||||
|
|
||||||
# Final carries
|
# Final carries
|
||||||
ctx.comment " Accumulate last carries in hi word"
|
ctx.comment " Accumulate last carries in hi word"
|
||||||
ctx.mov rRDX, 0 # Set to 0 without clearing flags
|
ctx.mov rdx, 0 # Set to 0 without clearing flags
|
||||||
ctx.adcx hi, rRDX
|
ctx.adcx hi, rdx
|
||||||
ctx.adox hi, rRDX
|
ctx.adox hi, rdx
|
||||||
|
|
||||||
proc partialRedx(
|
proc partialRedx(
|
||||||
ctx: var Assembler_x86,
|
ctx: var Assembler_x86,
|
||||||
@ -129,7 +128,7 @@ proc partialRedx(
|
|||||||
t: OperandArray,
|
t: OperandArray,
|
||||||
M: OperandArray,
|
M: OperandArray,
|
||||||
m0ninv: Operand,
|
m0ninv: Operand,
|
||||||
lo, S, rRDX: Operand
|
lo, S: Operand
|
||||||
) =
|
) =
|
||||||
## Partial Montgomery reduction
|
## Partial Montgomery reduction
|
||||||
## For CIOS method
|
## For CIOS method
|
||||||
@ -145,8 +144,8 @@ proc partialRedx(
|
|||||||
# m = t[0] * m0ninv mod 2^w
|
# m = t[0] * m0ninv mod 2^w
|
||||||
ctx.comment " Reduction"
|
ctx.comment " Reduction"
|
||||||
ctx.comment " m = t[0] * m0ninv mod 2^w"
|
ctx.comment " m = t[0] * m0ninv mod 2^w"
|
||||||
ctx.mov rRDX, t[0]
|
ctx.mov rdx, t[0]
|
||||||
ctx.mulx S, rRDX, m0ninv, rdx # (S, RDX) <- m0ninv * RDX
|
ctx.mulx S, rdx, m0ninv, rdx # (S, RDX) <- m0ninv * RDX
|
||||||
|
|
||||||
# Clear carry flags - TODO: necessary?
|
# Clear carry flags - TODO: necessary?
|
||||||
ctx.`xor` S, S
|
ctx.`xor` S, S
|
||||||
@ -194,16 +193,7 @@ macro montMul_CIOS_nocarry_adx_bmi2_gen[N: static int](r_MM: var Limbs[N], a_MM,
|
|||||||
# MultiPurpose Register slots
|
# MultiPurpose Register slots
|
||||||
scratch = init(OperandArray, nimSymbol = ident"scratch", scratchSlots, ElemsInReg, InputOutput_EnsureClobber)
|
scratch = init(OperandArray, nimSymbol = ident"scratch", scratchSlots, ElemsInReg, InputOutput_EnsureClobber)
|
||||||
|
|
||||||
# MULX requires RDX
|
# MULX requires RDX as well
|
||||||
rRDX = Operand(
|
|
||||||
desc: OperandDesc(
|
|
||||||
asmId: "[rdx]",
|
|
||||||
nimSymbol: ident"rdx",
|
|
||||||
rm: RDX,
|
|
||||||
constraint: Output_EarlyClobber,
|
|
||||||
cEmit: "rdx"
|
|
||||||
)
|
|
||||||
)
|
|
||||||
|
|
||||||
a = scratch[0].asArrayAddr(len = N) # Store the `a` operand
|
a = scratch[0].asArrayAddr(len = N) # Store the `a` operand
|
||||||
b = scratch[1].asArrayAddr(len = N) # Store the `b` operand
|
b = scratch[1].asArrayAddr(len = N) # Store the `b` operand
|
||||||
@ -225,15 +215,12 @@ macro montMul_CIOS_nocarry_adx_bmi2_gen[N: static int](r_MM: var Limbs[N], a_MM,
|
|||||||
|
|
||||||
let tsym = t.nimSymbol
|
let tsym = t.nimSymbol
|
||||||
let scratchSym = scratch.nimSymbol
|
let scratchSym = scratch.nimSymbol
|
||||||
let edx = rRDX.desc.nimSymbol
|
|
||||||
result.add quote do:
|
result.add quote do:
|
||||||
static: doAssert: sizeof(SecretWord) == sizeof(ByteAddress)
|
static: doAssert: sizeof(SecretWord) == sizeof(ByteAddress)
|
||||||
|
|
||||||
var `tsym`: typeof(`r_MM`) # zero init
|
var `tsym`: typeof(`r_MM`) # zero init
|
||||||
# Assumes 64-bit limbs on 64-bit arch (or you can't store an address)
|
# Assumes 64-bit limbs on 64-bit arch (or you can't store an address)
|
||||||
var `scratchSym` {.noInit.}: Limbs[`scratchSlots`]
|
var `scratchSym` {.noInit.}: Limbs[`scratchSlots`]
|
||||||
var `edx`{.noInit.}: BaseType
|
|
||||||
|
|
||||||
`scratchSym`[0] = cast[SecretWord](`a_MM`[0].unsafeAddr)
|
`scratchSym`[0] = cast[SecretWord](`a_MM`[0].unsafeAddr)
|
||||||
`scratchSym`[1] = cast[SecretWord](`b_MM`[0].unsafeAddr)
|
`scratchSym`[1] = cast[SecretWord](`b_MM`[0].unsafeAddr)
|
||||||
`scratchSym`[4] = SecretWord `m0ninv_MM`
|
`scratchSym`[4] = SecretWord `m0ninv_MM`
|
||||||
@ -258,20 +245,20 @@ macro montMul_CIOS_nocarry_adx_bmi2_gen[N: static int](r_MM: var Limbs[N], a_MM,
|
|||||||
A, t,
|
A, t,
|
||||||
a,
|
a,
|
||||||
b[0],
|
b[0],
|
||||||
C, rRDX
|
C
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
ctx.mulaccx_by_word(
|
ctx.mulaccx_by_word(
|
||||||
A, t,
|
A, t,
|
||||||
a, i,
|
a, i,
|
||||||
b[i],
|
b[i],
|
||||||
C, rRDX
|
C
|
||||||
)
|
)
|
||||||
|
|
||||||
ctx.partialRedx(
|
ctx.partialRedx(
|
||||||
A, t,
|
A, t,
|
||||||
M, m0ninv,
|
M, m0ninv,
|
||||||
lo, C, rRDX
|
lo, C
|
||||||
)
|
)
|
||||||
|
|
||||||
ctx.finalSubNoCarry(
|
ctx.finalSubNoCarry(
|
||||||
|
|||||||
@ -41,7 +41,7 @@ proc finalSubNoCarry*(
|
|||||||
ctx.sbb scratch[i], M[i]
|
ctx.sbb scratch[i], M[i]
|
||||||
|
|
||||||
# If we borrowed it means that we were smaller than
|
# If we borrowed it means that we were smaller than
|
||||||
# the modulus and we don'a need "scratch"
|
# the modulus and we don't need "scratch"
|
||||||
for i in 0 ..< N:
|
for i in 0 ..< N:
|
||||||
ctx.cmovnc a[i], scratch[i]
|
ctx.cmovnc a[i], scratch[i]
|
||||||
ctx.mov r[i], a[i]
|
ctx.mov r[i], a[i]
|
||||||
@ -50,7 +50,7 @@ proc finalSubCanOverflow*(
|
|||||||
ctx: var Assembler_x86,
|
ctx: var Assembler_x86,
|
||||||
r: Operand or OperandArray,
|
r: Operand or OperandArray,
|
||||||
a, M, scratch: OperandArray,
|
a, M, scratch: OperandArray,
|
||||||
overflowReg: Operand
|
overflowReg: Operand or Register
|
||||||
) =
|
) =
|
||||||
## Reduce `a` into `r` modulo `M`
|
## Reduce `a` into `r` modulo `M`
|
||||||
## To be used when the final subtraction can
|
## To be used when the final subtraction can
|
||||||
@ -74,7 +74,7 @@ proc finalSubCanOverflow*(
|
|||||||
ctx.sbb overflowReg, 0
|
ctx.sbb overflowReg, 0
|
||||||
|
|
||||||
# If we borrowed it means that we were smaller than
|
# If we borrowed it means that we were smaller than
|
||||||
# the modulus and we don'a need "scratch"
|
# the modulus and we don't need "scratch"
|
||||||
for i in 0 ..< N:
|
for i in 0 ..< N:
|
||||||
ctx.cmovnc a[i], scratch[i]
|
ctx.cmovnc a[i], scratch[i]
|
||||||
ctx.mov r[i], a[i]
|
ctx.mov r[i], a[i]
|
||||||
@ -90,59 +90,37 @@ macro montyRedc2x_gen[N: static int](
|
|||||||
m0ninv_MR: BaseType,
|
m0ninv_MR: BaseType,
|
||||||
spareBits: static int
|
spareBits: static int
|
||||||
) =
|
) =
|
||||||
# TODO, slower than Clang, in particular due to the shadowing
|
|
||||||
|
|
||||||
result = newStmtList()
|
result = newStmtList()
|
||||||
|
|
||||||
var ctx = init(Assembler_x86, BaseType)
|
var ctx = init(Assembler_x86, BaseType)
|
||||||
|
# On x86, compilers only let us use 15 out of 16 registers
|
||||||
|
# RAX and RDX are de facto used due to the MUL instructions
|
||||||
|
# so we store everything in scratchspaces restoring as needed
|
||||||
let
|
let
|
||||||
# We could force M as immediate by specializing per moduli
|
# We could force M as immediate by specializing per moduli
|
||||||
M = init(OperandArray, nimSymbol = M_MR, N, PointerInReg, Input)
|
M = init(OperandArray, nimSymbol = M_MR, N, PointerInReg, Input)
|
||||||
|
|
||||||
# MUL requires RAX and RDX
|
# MUL requires RAX and RDX
|
||||||
rRAX = Operand(
|
|
||||||
desc: OperandDesc(
|
|
||||||
asmId: "[rax]",
|
|
||||||
nimSymbol: ident"rax",
|
|
||||||
rm: RAX,
|
|
||||||
constraint: InputOutput_EnsureClobber,
|
|
||||||
cEmit: "rax"
|
|
||||||
)
|
|
||||||
)
|
|
||||||
|
|
||||||
rRDX = Operand(
|
let uSlots = N+2
|
||||||
desc: OperandDesc(
|
let vSlots = max(N-2, 3)
|
||||||
asmId: "[rdx]",
|
|
||||||
nimSymbol: ident"rdx",
|
|
||||||
rm: RDX,
|
|
||||||
constraint: Output_EarlyClobber,
|
|
||||||
cEmit: "rdx"
|
|
||||||
)
|
|
||||||
)
|
|
||||||
|
|
||||||
m0ninv = Operand(
|
var # Scratchspaces
|
||||||
desc: OperandDesc(
|
u = init(OperandArray, nimSymbol = ident"U", uSlots, ElemsInReg, InputOutput_EnsureClobber)
|
||||||
asmId: "[m0ninv]",
|
v = init(OperandArray, nimSymbol = ident"V", vSlots, ElemsInReg, InputOutput_EnsureClobber)
|
||||||
nimSymbol: m0ninv_MR,
|
|
||||||
rm: Reg,
|
|
||||||
constraint: Input,
|
|
||||||
cEmit: "m0ninv"
|
|
||||||
)
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
let scratchSlots = N+2
|
|
||||||
var scratch = init(OperandArray, nimSymbol = ident"scratch", scratchSlots, ElemsInReg, InputOutput_EnsureClobber)
|
|
||||||
|
|
||||||
# Prologue
|
# Prologue
|
||||||
let eax = rRAX.desc.nimSymbol
|
let usym = u.nimSymbol
|
||||||
let edx = rRDX.desc.nimSymbol
|
let vsym = v.nimSymbol
|
||||||
let scratchSym = scratch.nimSymbol
|
|
||||||
result.add quote do:
|
result.add quote do:
|
||||||
static: doAssert: sizeof(SecretWord) == sizeof(ByteAddress)
|
var `usym`{.noinit.}: Limbs[`uSlots`]
|
||||||
|
var `vsym` {.noInit.}: Limbs[`vSlots`]
|
||||||
|
`vsym`[0] = cast[SecretWord](`r_MR`[0].unsafeAddr)
|
||||||
|
`vsym`[1] = cast[SecretWord](`a_MR`[0].unsafeAddr)
|
||||||
|
`vsym`[2] = SecretWord(`m0ninv_MR`)
|
||||||
|
|
||||||
var `eax`{.noInit.}, `edx`{.noInit.}: BaseType
|
let r_temp = v[0].asArrayAddr(len = N)
|
||||||
var `scratchSym` {.noInit.}: Limbs[`scratchSlots`]
|
let a = v[1].asArrayAddr(len = 2*N)
|
||||||
|
let m0ninv = v[2]
|
||||||
|
|
||||||
# Algorithm
|
# Algorithm
|
||||||
# ---------------------------------------------------------
|
# ---------------------------------------------------------
|
||||||
@ -161,85 +139,76 @@ macro montyRedc2x_gen[N: static int](
|
|||||||
# No register spilling handling
|
# No register spilling handling
|
||||||
doAssert N <= 6, "The Assembly-optimized montgomery multiplication requires at most 6 limbs."
|
doAssert N <= 6, "The Assembly-optimized montgomery multiplication requires at most 6 limbs."
|
||||||
|
|
||||||
result.add quote do:
|
for i in 0 ..< N:
|
||||||
`eax` = BaseType `a_MR`[0]
|
ctx.mov u[i], a[i]
|
||||||
staticFor i, 0, `N`: # Do NOT use Nim slice/toOpenArray, they are not inlined
|
|
||||||
`scratchSym`[i] = `a_MR`[i]
|
|
||||||
|
|
||||||
ctx.mov scratch[N], rRAX
|
ctx.mov u[N], u[0]
|
||||||
ctx.imul rRAX, m0ninv # m <- a[i] * m0ninv mod 2^w
|
ctx.imul u[0], m0ninv # m <- a[i] * m0ninv mod 2^w
|
||||||
ctx.mov scratch[0], rRAX
|
ctx.mov rax, u[0]
|
||||||
|
|
||||||
# scratch: [a[0] * m0, a[1], a[2], a[3], a[0]] for 4 limbs
|
# scratch: [a[0] * m0, a[1], a[2], a[3], a[0]] for 4 limbs
|
||||||
|
|
||||||
for i in 0 ..< N:
|
for i in 0 ..< N:
|
||||||
ctx.comment ""
|
ctx.comment ""
|
||||||
let hi = scratch[N]
|
let hi = u[N]
|
||||||
let next = scratch[N+1]
|
let next = u[N+1]
|
||||||
|
|
||||||
ctx.mul rdx, rax, M[0], rax
|
ctx.mul rdx, rax, M[0], rax
|
||||||
ctx.add hi, rRAX # Guaranteed to be zero
|
ctx.add hi, rax # Guaranteed to be zero
|
||||||
ctx.mov rRAX, scratch[0]
|
ctx.mov rax, u[0]
|
||||||
ctx.adc hi, rRDX
|
ctx.adc hi, rdx
|
||||||
|
|
||||||
for j in 1 ..< N-1:
|
for j in 1 ..< N-1:
|
||||||
ctx.comment ""
|
ctx.comment ""
|
||||||
ctx.mul rdx, rax, M[j], rax
|
ctx.mul rdx, rax, M[j], rax
|
||||||
ctx.add scratch[j], rRAX
|
ctx.add u[j], rax
|
||||||
ctx.mov rRAX, scratch[0]
|
ctx.mov rax, u[0]
|
||||||
ctx.adc rRDX, 0
|
ctx.adc rdx, 0
|
||||||
ctx.add scratch[j], hi
|
ctx.add u[j], hi
|
||||||
ctx.adc rRDX, 0
|
ctx.adc rdx, 0
|
||||||
ctx.mov hi, rRDX
|
ctx.mov hi, rdx
|
||||||
|
|
||||||
# Next load
|
# Next load
|
||||||
if i < N-1:
|
if i < N-1:
|
||||||
ctx.comment ""
|
ctx.comment ""
|
||||||
ctx.mov next, scratch[1]
|
ctx.mov next, u[1]
|
||||||
ctx.imul scratch[1], m0ninv
|
ctx.imul u[1], m0ninv
|
||||||
ctx.comment ""
|
ctx.comment ""
|
||||||
|
|
||||||
# Last limb
|
# Last limb
|
||||||
ctx.comment ""
|
ctx.comment ""
|
||||||
ctx.mul rdx, rax, M[N-1], rax
|
ctx.mul rdx, rax, M[N-1], rax
|
||||||
ctx.add scratch[N-1], rRAX
|
ctx.add u[N-1], rax
|
||||||
ctx.mov rRAX, scratch[1] # Contains next * m0
|
ctx.mov rax, u[1] # Contains next * m0
|
||||||
ctx.adc rRDX, 0
|
ctx.adc rdx, 0
|
||||||
ctx.add scratch[N-1], hi
|
ctx.add u[N-1], hi
|
||||||
ctx.adc rRDX, 0
|
ctx.adc rdx, 0
|
||||||
ctx.mov hi, rRDX
|
ctx.mov hi, rdx
|
||||||
|
|
||||||
scratch.rotateLeft()
|
u.rotateLeft()
|
||||||
|
|
||||||
# Code generation
|
# Second part - Final subtraction
|
||||||
result.add ctx.generate()
|
# ---------------------------------------------
|
||||||
|
|
||||||
# New codegen
|
ctx.mov rdx, r_temp
|
||||||
ctx = init(Assembler_x86, BaseType)
|
let r = rdx.asArrayAddr(len = N)
|
||||||
|
|
||||||
let r = init(OperandArray, nimSymbol = r_MR, N, PointerInReg, InputOutput_EnsureClobber)
|
|
||||||
let a = init(OperandArray, nimSymbol = a_MR, N*2, PointerInReg, Input)
|
|
||||||
let extraRegNeeded = N-2
|
|
||||||
let t = init(OperandArray, nimSymbol = ident"t", extraRegNeeded, ElemsInReg, InputOutput_EnsureClobber)
|
|
||||||
let tsym = t.nimSymbol
|
|
||||||
result.add quote do:
|
|
||||||
var `tsym` {.noInit.}: Limbs[`extraRegNeeded`]
|
|
||||||
|
|
||||||
# This does a[i+n] += hi
|
# This does a[i+n] += hi
|
||||||
# but in a separate carry chain, fused with the
|
# but in a separate carry chain, fused with the
|
||||||
# copy "r[i] = a[i+n]"
|
# copy "r[i] = a[i+n]"
|
||||||
for i in 0 ..< N:
|
for i in 0 ..< N:
|
||||||
if i == 0:
|
if i == 0:
|
||||||
ctx.add scratch[i], a[i+N]
|
ctx.add u[i], a[i+N]
|
||||||
else:
|
else:
|
||||||
ctx.adc scratch[i], a[i+N]
|
ctx.adc u[i], a[i+N]
|
||||||
|
|
||||||
let reuse = repackRegisters(t, scratch[N], scratch[N+1])
|
let t = repackRegisters(v, u[N], u[N+1])
|
||||||
|
|
||||||
|
# v is invalidated
|
||||||
if spareBits >= 1:
|
if spareBits >= 1:
|
||||||
ctx.finalSubNoCarry(r, scratch, M, reuse)
|
ctx.finalSubNoCarry(r, u, M, t)
|
||||||
else:
|
else:
|
||||||
ctx.finalSubCanOverflow(r, scratch, M, reuse, rRAX)
|
ctx.finalSubCanOverflow(r, u, M, t, rax)
|
||||||
|
|
||||||
# Code generation
|
# Code generation
|
||||||
result.add ctx.generate()
|
result.add ctx.generate()
|
||||||
|
|||||||
@ -35,15 +35,13 @@ static: doAssert UseASM_X86_64
|
|||||||
# Montgomery reduction
|
# Montgomery reduction
|
||||||
# ------------------------------------------------------------
|
# ------------------------------------------------------------
|
||||||
|
|
||||||
macro montyRedc2xx_gen[N: static int](
|
macro montyRedc2x_gen[N: static int](
|
||||||
r_MR: var array[N, SecretWord],
|
r_MR: var array[N, SecretWord],
|
||||||
a_MR: array[N*2, SecretWord],
|
a_MR: array[N*2, SecretWord],
|
||||||
M_MR: array[N, SecretWord],
|
M_MR: array[N, SecretWord],
|
||||||
m0ninv_MR: BaseType,
|
m0ninv_MR: BaseType,
|
||||||
spareBits: static int
|
spareBits: static int
|
||||||
) =
|
) =
|
||||||
# TODO, slower than Clang, in particular due to the shadowing
|
|
||||||
|
|
||||||
result = newStmtList()
|
result = newStmtList()
|
||||||
|
|
||||||
var ctx = init(Assembler_x86, BaseType)
|
var ctx = init(Assembler_x86, BaseType)
|
||||||
@ -51,59 +49,29 @@ macro montyRedc2xx_gen[N: static int](
|
|||||||
# We could force M as immediate by specializing per moduli
|
# We could force M as immediate by specializing per moduli
|
||||||
M = init(OperandArray, nimSymbol = M_MR, N, PointerInReg, Input)
|
M = init(OperandArray, nimSymbol = M_MR, N, PointerInReg, Input)
|
||||||
|
|
||||||
hi = Operand(
|
let uSlots = N+1
|
||||||
desc: OperandDesc(
|
let vSlots = max(N-1, 5)
|
||||||
asmId: "[hi]",
|
|
||||||
nimSymbol: ident"hi",
|
|
||||||
rm: Reg,
|
|
||||||
constraint: Output_EarlyClobber,
|
|
||||||
cEmit: "hi"
|
|
||||||
)
|
|
||||||
)
|
|
||||||
|
|
||||||
lo = Operand(
|
var # Scratchspaces
|
||||||
desc: OperandDesc(
|
u = init(OperandArray, nimSymbol = ident"U", uSlots, ElemsInReg, InputOutput_EnsureClobber)
|
||||||
asmId: "[lo]",
|
v = init(OperandArray, nimSymbol = ident"V", vSlots, ElemsInReg, InputOutput_EnsureClobber)
|
||||||
nimSymbol: ident"lo",
|
|
||||||
rm: Reg,
|
|
||||||
constraint: Output_EarlyClobber,
|
|
||||||
cEmit: "lo"
|
|
||||||
)
|
|
||||||
)
|
|
||||||
|
|
||||||
rRDX = Operand(
|
|
||||||
desc: OperandDesc(
|
|
||||||
asmId: "[rdx]",
|
|
||||||
nimSymbol: ident"rdx",
|
|
||||||
rm: RDX,
|
|
||||||
constraint: InputOutput_EnsureClobber,
|
|
||||||
cEmit: "rdx"
|
|
||||||
)
|
|
||||||
)
|
|
||||||
|
|
||||||
m0ninv = Operand(
|
|
||||||
desc: OperandDesc(
|
|
||||||
asmId: "[m0ninv]",
|
|
||||||
nimSymbol: m0ninv_MR,
|
|
||||||
rm: Reg,
|
|
||||||
constraint: Input,
|
|
||||||
cEmit: "m0ninv"
|
|
||||||
)
|
|
||||||
)
|
|
||||||
|
|
||||||
let scratchSlots = N+1
|
|
||||||
var scratch = init(OperandArray, nimSymbol = ident"scratch", scratchSlots, ElemsInReg, InputOutput_EnsureClobber)
|
|
||||||
|
|
||||||
# Prologue
|
# Prologue
|
||||||
let edx = rRDX.desc.nimSymbol
|
let usym = u.nimSymbol
|
||||||
let hisym = hi.desc.nimSymbol
|
let vsym = v.nimSymbol
|
||||||
let losym = lo.desc.nimSymbol
|
|
||||||
let scratchSym = scratch.nimSymbol
|
|
||||||
result.add quote do:
|
result.add quote do:
|
||||||
static: doAssert: sizeof(SecretWord) == sizeof(ByteAddress)
|
static: doAssert: sizeof(SecretWord) == sizeof(ByteAddress)
|
||||||
|
var `usym`{.noinit.}: Limbs[`uSlots`]
|
||||||
|
var `vsym` {.noInit.}: Limbs[`vSlots`]
|
||||||
|
`vsym`[0] = cast[SecretWord](`r_MR`[0].unsafeAddr)
|
||||||
|
`vsym`[1] = cast[SecretWord](`a_MR`[0].unsafeAddr)
|
||||||
|
`vsym`[2] = SecretWord(`m0ninv_MR`)
|
||||||
|
|
||||||
var `hisym`{.noInit.}, `losym`{.noInit.}, `edx`{.noInit.}: BaseType
|
let r_temp = v[0].asArrayAddr(len = N)
|
||||||
var `scratchSym` {.noInit.}: Limbs[`scratchSlots`]
|
let a = v[1].asArrayAddr(len = 2*N)
|
||||||
|
let m0ninv = v[2]
|
||||||
|
let lo = v[3]
|
||||||
|
let hi = v[4]
|
||||||
|
|
||||||
# Algorithm
|
# Algorithm
|
||||||
# ---------------------------------------------------------
|
# ---------------------------------------------------------
|
||||||
@ -122,63 +90,52 @@ macro montyRedc2xx_gen[N: static int](
|
|||||||
# No register spilling handling
|
# No register spilling handling
|
||||||
doAssert N <= 6, "The Assembly-optimized montgomery multiplication requires at most 6 limbs."
|
doAssert N <= 6, "The Assembly-optimized montgomery multiplication requires at most 6 limbs."
|
||||||
|
|
||||||
result.add quote do:
|
ctx.mov rdx, m0ninv
|
||||||
`edx` = BaseType(`m0ninv_MR`)
|
|
||||||
staticFor i, 0, `N`: # Do NOT use Nim slice/toOpenArray, they are not inlined
|
for i in 0 ..< N:
|
||||||
`scratchSym`[i] = `a_MR`[i]
|
ctx.mov u[i], a[i]
|
||||||
|
|
||||||
for i in 0 ..< N:
|
for i in 0 ..< N:
|
||||||
# RDX contains m0ninv at the start of each loop
|
# RDX contains m0ninv at the start of each loop
|
||||||
ctx.comment ""
|
ctx.comment ""
|
||||||
ctx.imul rRDX, scratch[0] # m <- a[i] * m0ninv mod 2^w
|
ctx.imul rdx, u[0] # m <- a[i] * m0ninv mod 2^w
|
||||||
ctx.comment "---- Reduction " & $i
|
ctx.comment "---- Reduction " & $i
|
||||||
ctx.`xor` scratch[N], scratch[N]
|
ctx.`xor` u[N], u[N]
|
||||||
|
|
||||||
for j in 0 ..< N-1:
|
for j in 0 ..< N-1:
|
||||||
ctx.comment ""
|
ctx.comment ""
|
||||||
ctx.mulx hi, lo, M[j], rdx
|
ctx.mulx hi, lo, M[j], rdx
|
||||||
ctx.adcx scratch[j], lo
|
ctx.adcx u[j], lo
|
||||||
ctx.adox scratch[j+1], hi
|
ctx.adox u[j+1], hi
|
||||||
|
|
||||||
# Last limb
|
# Last limb
|
||||||
ctx.comment ""
|
ctx.comment ""
|
||||||
ctx.mulx hi, lo, M[N-1], rdx
|
ctx.mulx hi, lo, M[N-1], rdx
|
||||||
ctx.mov rRDX, m0ninv # Reload m0ninv for next iter
|
ctx.mov rdx, m0ninv # Reload m0ninv for next iter
|
||||||
ctx.adcx scratch[N-1], lo
|
ctx.adcx u[N-1], lo
|
||||||
ctx.adox hi, scratch[N]
|
ctx.adox hi, u[N]
|
||||||
ctx.adcx scratch[N], hi
|
ctx.adcx u[N], hi
|
||||||
|
|
||||||
scratch.rotateLeft()
|
u.rotateLeft()
|
||||||
|
|
||||||
# Code generation
|
ctx.mov rdx, r_temp
|
||||||
result.add ctx.generate()
|
let r = rdx.asArrayAddr(len = N)
|
||||||
|
|
||||||
# New codegen
|
|
||||||
ctx = init(Assembler_x86, BaseType)
|
|
||||||
|
|
||||||
let r = init(OperandArray, nimSymbol = r_MR, N, PointerInReg, InputOutput_EnsureClobber)
|
|
||||||
let a = init(OperandArray, nimSymbol = a_MR, N*2, PointerInReg, Input)
|
|
||||||
let extraRegNeeded = N-1
|
|
||||||
let t = init(OperandArray, nimSymbol = ident"t", extraRegNeeded, ElemsInReg, InputOutput_EnsureClobber)
|
|
||||||
let tsym = t.nimSymbol
|
|
||||||
result.add quote do:
|
|
||||||
var `tsym` {.noInit.}: Limbs[`extraRegNeeded`]
|
|
||||||
|
|
||||||
# This does a[i+n] += hi
|
# This does a[i+n] += hi
|
||||||
# but in a separate carry chain, fused with the
|
# but in a separate carry chain, fused with the
|
||||||
# copy "r[i] = a[i+n]"
|
# copy "r[i] = a[i+n]"
|
||||||
for i in 0 ..< N:
|
for i in 0 ..< N:
|
||||||
if i == 0:
|
if i == 0:
|
||||||
ctx.add scratch[i], a[i+N]
|
ctx.add u[i], a[i+N]
|
||||||
else:
|
else:
|
||||||
ctx.adc scratch[i], a[i+N]
|
ctx.adc u[i], a[i+N]
|
||||||
|
|
||||||
let reuse = repackRegisters(t, scratch[N])
|
let t = repackRegisters(v, u[N])
|
||||||
|
|
||||||
if spareBits >= 1:
|
if spareBits >= 1:
|
||||||
ctx.finalSubNoCarry(r, scratch, M, reuse)
|
ctx.finalSubNoCarry(r, u, M, t)
|
||||||
else:
|
else:
|
||||||
ctx.finalSubCanOverflow(r, scratch, M, reuse, hi)
|
ctx.finalSubCanOverflow(r, u, M, t, hi)
|
||||||
|
|
||||||
# Code generation
|
# Code generation
|
||||||
result.add ctx.generate()
|
result.add ctx.generate()
|
||||||
@ -191,4 +148,4 @@ func montRed_asm_adx_bmi2*[N: static int](
|
|||||||
spareBits: static int
|
spareBits: static int
|
||||||
) =
|
) =
|
||||||
## Constant-time Montgomery reduction
|
## Constant-time Montgomery reduction
|
||||||
montyRedc2xx_gen(r, a, M, m0ninv, spareBits)
|
montyRedc2x_gen(r, a, M, m0ninv, spareBits)
|
||||||
|
|||||||
@ -81,36 +81,13 @@ macro mul_gen[rLen, aLen, bLen: static int](r: var Limbs[rLen], a: Limbs[aLen],
|
|||||||
)
|
)
|
||||||
|
|
||||||
# MUL requires RAX and RDX
|
# MUL requires RAX and RDX
|
||||||
rRAX = Operand(
|
|
||||||
desc: OperandDesc(
|
|
||||||
asmId: "[rax]",
|
|
||||||
nimSymbol: ident"rax",
|
|
||||||
rm: RAX,
|
|
||||||
constraint: Output_EarlyClobber,
|
|
||||||
cEmit: "rax"
|
|
||||||
)
|
|
||||||
)
|
|
||||||
|
|
||||||
rRDX = Operand(
|
|
||||||
desc: OperandDesc(
|
|
||||||
asmId: "[rdx]",
|
|
||||||
nimSymbol: ident"rdx",
|
|
||||||
rm: RDX,
|
|
||||||
constraint: Output_EarlyClobber,
|
|
||||||
cEmit: "rdx"
|
|
||||||
)
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
# Prologue
|
# Prologue
|
||||||
let tsym = t.desc.nimSymbol
|
let tsym = t.desc.nimSymbol
|
||||||
let usym = u.desc.nimSymbol
|
let usym = u.desc.nimSymbol
|
||||||
let vsym = v.desc.nimSymbol
|
let vsym = v.desc.nimSymbol
|
||||||
let eax = rRAX.desc.nimSymbol
|
|
||||||
let edx = rRDX.desc.nimSymbol
|
|
||||||
result.add quote do:
|
result.add quote do:
|
||||||
var `tsym`{.noInit.}, `usym`{.noInit.}, `vsym`{.noInit.}: BaseType # zero-init
|
var `tsym`{.noInit.}, `usym`{.noInit.}, `vsym`{.noInit.}: BaseType # zero-init
|
||||||
var `eax`{.noInit.}, `edx`{.noInit.}: BaseType
|
|
||||||
|
|
||||||
# Algorithm
|
# Algorithm
|
||||||
ctx.`xor` u, u
|
ctx.`xor` u, u
|
||||||
@ -127,10 +104,10 @@ macro mul_gen[rLen, aLen, bLen: static int](r: var Limbs[rLen], a: Limbs[aLen],
|
|||||||
let ia = i - ib
|
let ia = i - ib
|
||||||
for j in 0 ..< min(aLen - ia, ib+1):
|
for j in 0 ..< min(aLen - ia, ib+1):
|
||||||
# (t, u, v) <- (t, u, v) + a[ia+j] * b[ib-j]
|
# (t, u, v) <- (t, u, v) + a[ia+j] * b[ib-j]
|
||||||
ctx.mov rRAX, arrB[ib-j]
|
ctx.mov rax, arrB[ib-j]
|
||||||
ctx.mul rdx, rax, arrA[ia+j], rax
|
ctx.mul rdx, rax, arrA[ia+j], rax
|
||||||
ctx.add v, rRAX
|
ctx.add v, rax
|
||||||
ctx.adc u, rRDX
|
ctx.adc u, rdx
|
||||||
ctx.adc t, 0
|
ctx.adc t, 0
|
||||||
|
|
||||||
ctx.mov arrR[i], v
|
ctx.mov arrR[i], v
|
||||||
@ -141,9 +118,9 @@ macro mul_gen[rLen, aLen, bLen: static int](r: var Limbs[rLen], a: Limbs[aLen],
|
|||||||
ctx.`xor` t, t
|
ctx.`xor` t, t
|
||||||
|
|
||||||
if aLen+bLen < rLen:
|
if aLen+bLen < rLen:
|
||||||
ctx.`xor` rRAX, rRAX
|
ctx.`xor` rax, rax
|
||||||
for i in aLen+bLen ..< rLen:
|
for i in aLen+bLen ..< rLen:
|
||||||
ctx.mov arrR[i], rRAX
|
ctx.mov arrR[i], rax
|
||||||
|
|
||||||
# Codegen
|
# Codegen
|
||||||
result.add ctx.generate
|
result.add ctx.generate
|
||||||
@ -202,37 +179,12 @@ macro square_gen[rLen, aLen: static int](r: var Limbs[rLen], a: Limbs[aLen]) =
|
|||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
# MUL requires RAX and RDX
|
|
||||||
rRAX = Operand(
|
|
||||||
desc: OperandDesc(
|
|
||||||
asmId: "[rax]",
|
|
||||||
nimSymbol: ident"rax",
|
|
||||||
rm: RAX,
|
|
||||||
constraint: Output_EarlyClobber,
|
|
||||||
cEmit: "rax"
|
|
||||||
)
|
|
||||||
)
|
|
||||||
|
|
||||||
rRDX = Operand(
|
|
||||||
desc: OperandDesc(
|
|
||||||
asmId: "[rdx]",
|
|
||||||
nimSymbol: ident"rdx",
|
|
||||||
rm: RDX,
|
|
||||||
constraint: Output_EarlyClobber,
|
|
||||||
cEmit: "rdx"
|
|
||||||
)
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
# Prologue
|
# Prologue
|
||||||
let tsym = t.desc.nimSymbol
|
let tsym = t.desc.nimSymbol
|
||||||
let usym = u.desc.nimSymbol
|
let usym = u.desc.nimSymbol
|
||||||
let vsym = v.desc.nimSymbol
|
let vsym = v.desc.nimSymbol
|
||||||
let eax = rRAX.desc.nimSymbol
|
|
||||||
let edx = rRDX.desc.nimSymbol
|
|
||||||
result.add quote do:
|
result.add quote do:
|
||||||
var `tsym`{.noInit.}, `usym`{.noInit.}, `vsym`{.noInit.}: BaseType # zero-init
|
var `tsym`{.noInit.}, `usym`{.noInit.}, `vsym`{.noInit.}: BaseType # zero-init
|
||||||
var `eax`{.noInit.}, `edx`{.noInit.}: BaseType
|
|
||||||
|
|
||||||
# Algorithm
|
# Algorithm
|
||||||
ctx.`xor` u, u
|
ctx.`xor` u, u
|
||||||
@ -252,20 +204,20 @@ macro square_gen[rLen, aLen: static int](r: var Limbs[rLen], a: Limbs[aLen]) =
|
|||||||
let k2 = ib-j
|
let k2 = ib-j
|
||||||
if k1 < k2:
|
if k1 < k2:
|
||||||
# (t, u, v) <- (t, u, v) + 2 * a[k1] * a[k2]
|
# (t, u, v) <- (t, u, v) + 2 * a[k1] * a[k2]
|
||||||
ctx.mov rRAX, arrA[k2]
|
ctx.mov rax, arrA[k2]
|
||||||
ctx.mul rdx, rax, arrA[k1], rax
|
ctx.mul rdx, rax, arrA[k1], rax
|
||||||
ctx.add rRAX, rRAX
|
ctx.add rax, rax
|
||||||
ctx.adc rRDX, rRDX
|
ctx.adc rdx, rdx
|
||||||
ctx.adc t, 0
|
ctx.adc t, 0
|
||||||
ctx.add v, rRAX
|
ctx.add v, rax
|
||||||
ctx.adc u, rRDX
|
ctx.adc u, rdx
|
||||||
ctx.adc t, 0
|
ctx.adc t, 0
|
||||||
elif k1 == k2:
|
elif k1 == k2:
|
||||||
# (t, u, v) <- (t, u, v) + a[k1] * a[k2]
|
# (t, u, v) <- (t, u, v) + a[k1] * a[k2]
|
||||||
ctx.mov rRAX, arrA[k2]
|
ctx.mov rax, arrA[k2]
|
||||||
ctx.mul rdx, rax, arrA[k1], rax
|
ctx.mul rdx, rax, arrA[k1], rax
|
||||||
ctx.add v, rRAX
|
ctx.add v, rax
|
||||||
ctx.adc u, rRDX
|
ctx.adc u, rdx
|
||||||
ctx.adc t, 0
|
ctx.adc t, 0
|
||||||
else:
|
else:
|
||||||
discard
|
discard
|
||||||
@ -278,9 +230,9 @@ macro square_gen[rLen, aLen: static int](r: var Limbs[rLen], a: Limbs[aLen]) =
|
|||||||
ctx.`xor` t, t
|
ctx.`xor` t, t
|
||||||
|
|
||||||
if aLen*2 < rLen:
|
if aLen*2 < rLen:
|
||||||
ctx.`xor` rRAX, rRAX
|
ctx.`xor` rax, rax
|
||||||
for i in aLen*2 ..< rLen:
|
for i in aLen*2 ..< rLen:
|
||||||
ctx.mov arrR[i], rRAX
|
ctx.mov arrR[i], rax
|
||||||
|
|
||||||
# Codegen
|
# Codegen
|
||||||
result.add ctx.generate
|
result.add ctx.generate
|
||||||
|
|||||||
@ -37,8 +37,7 @@ proc mulx_by_word(
|
|||||||
ctx: var Assembler_x86,
|
ctx: var Assembler_x86,
|
||||||
r0: Operand,
|
r0: Operand,
|
||||||
a, t: OperandArray,
|
a, t: OperandArray,
|
||||||
word0: Operand,
|
word0: Operand
|
||||||
rRAX, rRDX: Operand
|
|
||||||
) =
|
) =
|
||||||
## Multiply the `a[0..<N]` by `word`
|
## Multiply the `a[0..<N]` by `word`
|
||||||
## and store in `[t:r0]`
|
## and store in `[t:r0]`
|
||||||
@ -52,18 +51,18 @@ proc mulx_by_word(
|
|||||||
# (C,t[j]) := t[j] + a[j]*b[i] + C
|
# (C,t[j]) := t[j] + a[j]*b[i] + C
|
||||||
|
|
||||||
# First limb
|
# First limb
|
||||||
ctx.mov rRDX, word0
|
ctx.mov rdx, word0
|
||||||
ctx.`xor` rRAX, rRAX # Clear flags (important if steady state is skipped)
|
ctx.`xor` rax, rax # Clear flags (important if steady state is skipped)
|
||||||
ctx.mulx t[0], rRAX, a[0], rdx
|
ctx.mulx t[0], rax, a[0], rdx
|
||||||
ctx.mov r0, rRAX
|
ctx.mov r0, rax
|
||||||
|
|
||||||
# Steady state
|
# Steady state
|
||||||
for j in 1 ..< N:
|
for j in 1 ..< N:
|
||||||
ctx.mulx t[j], rRAX, a[j], rdx
|
ctx.mulx t[j], rax, a[j], rdx
|
||||||
if j == 1:
|
if j == 1:
|
||||||
ctx.add t[j-1], rRAX
|
ctx.add t[j-1], rax
|
||||||
else:
|
else:
|
||||||
ctx.adc t[j-1], rRAX
|
ctx.adc t[j-1], rax
|
||||||
|
|
||||||
# Final carries
|
# Final carries
|
||||||
ctx.comment " Accumulate last carries in hi word"
|
ctx.comment " Accumulate last carries in hi word"
|
||||||
@ -74,8 +73,7 @@ proc mulaccx_by_word(
|
|||||||
r: OperandArray,
|
r: OperandArray,
|
||||||
i: int,
|
i: int,
|
||||||
a, t: OperandArray,
|
a, t: OperandArray,
|
||||||
word: Operand,
|
word: Operand
|
||||||
rRAX, rRDX: Operand
|
|
||||||
) =
|
) =
|
||||||
## Multiply the `a[0..<N]` by `word`
|
## Multiply the `a[0..<N]` by `word`
|
||||||
## and store in `[t:r0]`
|
## and store in `[t:r0]`
|
||||||
@ -87,16 +85,16 @@ proc mulaccx_by_word(
|
|||||||
doAssert i != 0
|
doAssert i != 0
|
||||||
|
|
||||||
ctx.comment " Outer loop i = " & $i & ", j in [0, " & $N & ")"
|
ctx.comment " Outer loop i = " & $i & ", j in [0, " & $N & ")"
|
||||||
ctx.mov rRDX, word
|
ctx.mov rdx, word
|
||||||
ctx.`xor` rRAX, rRAX # Clear flags
|
ctx.`xor` rax, rax # Clear flags
|
||||||
|
|
||||||
# for j=0 to N-1
|
# for j=0 to N-1
|
||||||
# (C,t[j]) := t[j] + a[j]*b[i] + C
|
# (C,t[j]) := t[j] + a[j]*b[i] + C
|
||||||
|
|
||||||
# Steady state
|
# Steady state
|
||||||
for j in 0 ..< N:
|
for j in 0 ..< N:
|
||||||
ctx.mulx hi, rRAX, a[j], rdx
|
ctx.mulx hi, rax, a[j], rdx
|
||||||
ctx.adox t[j], rRAX
|
ctx.adox t[j], rax
|
||||||
if j == 0:
|
if j == 0:
|
||||||
ctx.mov r[i], t[j]
|
ctx.mov r[i], t[j]
|
||||||
if j == N-1:
|
if j == N-1:
|
||||||
@ -105,9 +103,9 @@ proc mulaccx_by_word(
|
|||||||
|
|
||||||
# Final carries
|
# Final carries
|
||||||
ctx.comment " Accumulate last carries in hi word"
|
ctx.comment " Accumulate last carries in hi word"
|
||||||
ctx.mov rRDX, 0 # Set to 0 without clearing flags
|
ctx.mov rdx, 0 # Set to 0 without clearing flags
|
||||||
ctx.adcx hi, rRDX
|
ctx.adcx hi, rdx
|
||||||
ctx.adox hi, rRDX
|
ctx.adox hi, rdx
|
||||||
|
|
||||||
macro mulx_gen[rLen, aLen, bLen: static int](rx: var Limbs[rLen], ax: Limbs[aLen], bx: Limbs[bLen]) =
|
macro mulx_gen[rLen, aLen, bLen: static int](rx: var Limbs[rLen], ax: Limbs[aLen], bx: Limbs[bLen]) =
|
||||||
## `a`, `b`, `r` can have a different number of limbs
|
## `a`, `b`, `r` can have a different number of limbs
|
||||||
@ -126,25 +124,6 @@ macro mulx_gen[rLen, aLen, bLen: static int](rx: var Limbs[rLen], ax: Limbs[aLen
|
|||||||
b = init(OperandArray, nimSymbol = bx, bLen, PointerInReg, Input)
|
b = init(OperandArray, nimSymbol = bx, bLen, PointerInReg, Input)
|
||||||
|
|
||||||
# MULX requires RDX
|
# MULX requires RDX
|
||||||
rRDX = Operand(
|
|
||||||
desc: OperandDesc(
|
|
||||||
asmId: "[rdx]",
|
|
||||||
nimSymbol: ident"rdx",
|
|
||||||
rm: RDX,
|
|
||||||
constraint: Output_EarlyClobber,
|
|
||||||
cEmit: "rdx"
|
|
||||||
)
|
|
||||||
)
|
|
||||||
|
|
||||||
rRAX = Operand(
|
|
||||||
desc: OperandDesc(
|
|
||||||
asmId: "[rax]",
|
|
||||||
nimSymbol: ident"rax",
|
|
||||||
rm: RAX,
|
|
||||||
constraint: Output_EarlyClobber,
|
|
||||||
cEmit: "rax"
|
|
||||||
)
|
|
||||||
)
|
|
||||||
|
|
||||||
tSlots = aLen+1 # Extra for high word
|
tSlots = aLen+1 # Extra for high word
|
||||||
|
|
||||||
@ -154,26 +133,21 @@ macro mulx_gen[rLen, aLen, bLen: static int](rx: var Limbs[rLen], ax: Limbs[aLen
|
|||||||
|
|
||||||
# Prologue
|
# Prologue
|
||||||
let tsym = t.nimSymbol
|
let tsym = t.nimSymbol
|
||||||
let eax = rRAX.desc.nimSymbol
|
|
||||||
let edx = rRDX.desc.nimSymbol
|
|
||||||
result.add quote do:
|
result.add quote do:
|
||||||
var `tsym`{.noInit.}: array[`tSlots`, BaseType]
|
var `tsym`{.noInit.}: array[`tSlots`, BaseType]
|
||||||
var `edx`{.noInit.}, `eax`{.noInit.}: BaseType
|
|
||||||
|
|
||||||
for i in 0 ..< min(rLen, bLen):
|
for i in 0 ..< min(rLen, bLen):
|
||||||
if i == 0:
|
if i == 0:
|
||||||
ctx.mulx_by_word(
|
ctx.mulx_by_word(
|
||||||
r[0],
|
r[0],
|
||||||
a, t,
|
a, t,
|
||||||
b[0],
|
b[0]
|
||||||
rRAX, rRDX,
|
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
ctx.mulaccx_by_word(
|
ctx.mulaccx_by_word(
|
||||||
r, i,
|
r, i,
|
||||||
a, t,
|
a, t,
|
||||||
b[i],
|
b[i]
|
||||||
rRAX, rRDX
|
|
||||||
)
|
)
|
||||||
|
|
||||||
t.rotateLeft()
|
t.rotateLeft()
|
||||||
@ -184,9 +158,9 @@ macro mulx_gen[rLen, aLen, bLen: static int](rx: var Limbs[rLen], ax: Limbs[aLen
|
|||||||
|
|
||||||
# Zero the extra
|
# Zero the extra
|
||||||
if aLen+bLen < rLen:
|
if aLen+bLen < rLen:
|
||||||
ctx.`xor` rRAX, rRAX
|
ctx.`xor` rax, rax
|
||||||
for i in aLen+bLen ..< rLen:
|
for i in aLen+bLen ..< rLen:
|
||||||
ctx.mov r[i], rRAX
|
ctx.mov r[i], rax
|
||||||
|
|
||||||
# Codegen
|
# Codegen
|
||||||
result.add ctx.generate
|
result.add ctx.generate
|
||||||
|
|||||||
@ -39,6 +39,9 @@ type
|
|||||||
# Flags
|
# Flags
|
||||||
CarryFlag = "@ccc"
|
CarryFlag = "@ccc"
|
||||||
|
|
||||||
|
# Clobbered register
|
||||||
|
ClobberedReg
|
||||||
|
|
||||||
Register* = enum
|
Register* = enum
|
||||||
rbx, rdx, r8, rax, xmm0
|
rbx, rdx, r8, rax, xmm0
|
||||||
|
|
||||||
@ -50,6 +53,7 @@ type
|
|||||||
Output_EarlyClobber = "=&"
|
Output_EarlyClobber = "=&"
|
||||||
InputOutput = "+"
|
InputOutput = "+"
|
||||||
InputOutput_EnsureClobber = "+&" # For register InputOutput, clang needs "+&" bug?
|
InputOutput_EnsureClobber = "+&" # For register InputOutput, clang needs "+&" bug?
|
||||||
|
ClobberedRegister
|
||||||
|
|
||||||
OpKind = enum
|
OpKind = enum
|
||||||
kRegister
|
kRegister
|
||||||
@ -88,11 +92,12 @@ type
|
|||||||
wordSize: int
|
wordSize: int
|
||||||
areFlagsClobbered: bool
|
areFlagsClobbered: bool
|
||||||
isStackClobbered: bool
|
isStackClobbered: bool
|
||||||
|
regClobbers: set[Register]
|
||||||
|
|
||||||
Stack* = object
|
Stack* = object
|
||||||
|
|
||||||
const SpecificRegisters = {RCX, RDX, R8, RAX}
|
const SpecificRegisters = {RCX, RDX, R8, RAX}
|
||||||
const OutputReg = {Output_EarlyClobber, InputOutput, InputOutput_EnsureClobber, Output_Overwrite}
|
const OutputReg = {Output_EarlyClobber, InputOutput, InputOutput_EnsureClobber, Output_Overwrite, ClobberedRegister}
|
||||||
|
|
||||||
func hash(od: OperandDesc): Hash =
|
func hash(od: OperandDesc): Hash =
|
||||||
{.noSideEffect.}:
|
{.noSideEffect.}:
|
||||||
@ -195,6 +200,24 @@ func asArrayAddr*(op: Operand, len: int): Operand =
|
|||||||
offset: i
|
offset: i
|
||||||
)
|
)
|
||||||
|
|
||||||
|
func asArrayAddr*(op: Register, len: int): Operand =
|
||||||
|
## Use the value stored in a fixed register as an array address
|
||||||
|
result = Operand(
|
||||||
|
kind: kArrayAddr,
|
||||||
|
desc: nil,
|
||||||
|
buf: newSeq[Operand](len)
|
||||||
|
)
|
||||||
|
for i in 0 ..< len:
|
||||||
|
result.buf[i] = Operand(
|
||||||
|
desc: OperandDesc(
|
||||||
|
asmId: $op,
|
||||||
|
rm: ClobberedReg,
|
||||||
|
constraint: ClobberedRegister
|
||||||
|
),
|
||||||
|
kind: kFromArray,
|
||||||
|
offset: i
|
||||||
|
)
|
||||||
|
|
||||||
# Code generation
|
# Code generation
|
||||||
# ------------------------------------------------------------------------------------------------------------
|
# ------------------------------------------------------------------------------------------------------------
|
||||||
|
|
||||||
@ -268,6 +291,12 @@ func generate*(a: Assembler_x86): NimNode =
|
|||||||
else:
|
else:
|
||||||
clobberList.add ", \"" & str & '\"'
|
clobberList.add ", \"" & str & '\"'
|
||||||
|
|
||||||
|
for reg in a.regClobbers:
|
||||||
|
if clobberList.len == 2:
|
||||||
|
clobberList.add "\"" & $reg & '\"'
|
||||||
|
else:
|
||||||
|
clobberList.add ", \"" & $reg & '\"'
|
||||||
|
|
||||||
params.add clobberList
|
params.add clobberList
|
||||||
|
|
||||||
# GCC will optimize ASM away if there are no
|
# GCC will optimize ASM away if there are no
|
||||||
@ -293,7 +322,15 @@ func generate*(a: Assembler_x86): NimNode =
|
|||||||
|
|
||||||
func getStrOffset(a: Assembler_x86, op: Operand): string =
|
func getStrOffset(a: Assembler_x86, op: Operand): string =
|
||||||
if op.kind != kFromArray:
|
if op.kind != kFromArray:
|
||||||
return "%" & op.desc.asmId
|
if op.kind == kArrayAddr:
|
||||||
|
# We are operating on an array pointer
|
||||||
|
# instead of array elements
|
||||||
|
if op.buf[0].desc.constraint == ClobberedRegister:
|
||||||
|
return "%%" & op.buf[0].desc.asmId
|
||||||
|
else:
|
||||||
|
return "%" & op.buf[0].desc.asmId
|
||||||
|
else:
|
||||||
|
return "%" & op.desc.asmId
|
||||||
|
|
||||||
# Beware GCC / Clang differences with array offsets
|
# Beware GCC / Clang differences with array offsets
|
||||||
# https://lists.llvm.org/pipermail/llvm-dev/2017-August/116202.html
|
# https://lists.llvm.org/pipermail/llvm-dev/2017-August/116202.html
|
||||||
@ -315,12 +352,16 @@ func getStrOffset(a: Assembler_x86, op: Operand): string =
|
|||||||
op.desc.rm in SpecificRegisters or
|
op.desc.rm in SpecificRegisters or
|
||||||
(op.desc.rm == ElemsInReg and op.kind == kFromArray):
|
(op.desc.rm == ElemsInReg and op.kind == kFromArray):
|
||||||
if op.offset == 0:
|
if op.offset == 0:
|
||||||
return "(%" & $op.desc.asmId & ')'
|
return "(%" & op.desc.asmId & ')'
|
||||||
# GCC & Clang seemed to disagree on pointer indexing
|
# GCC & Clang seemed to disagree on pointer indexing
|
||||||
# in the past and required different codegen
|
# in the past and required different codegen
|
||||||
# if defined(gcc):
|
# if defined(gcc):
|
||||||
# return $(op.offset * a.wordSize) & "+(%" & $op.desc.asmId & ')'
|
# return $(op.offset * a.wordSize) & "+(%" & op.desc.asmId & ')'
|
||||||
return $(op.offset * a.wordSize) & "(%" & $op.desc.asmId & ')'
|
return $(op.offset * a.wordSize) & "(%" & op.desc.asmId & ')'
|
||||||
|
elif op.desc.rm == ClobberedReg: # Array in clobbered register
|
||||||
|
if op.offset == 0:
|
||||||
|
return "(%%" & op.desc.asmId & ')'
|
||||||
|
return $(op.offset * a.wordSize) & "(%%" & op.desc.asmId & ')'
|
||||||
else:
|
else:
|
||||||
error "Unsupported: " & $op.desc.rm.ord
|
error "Unsupported: " & $op.desc.rm.ord
|
||||||
|
|
||||||
@ -335,7 +376,8 @@ func codeFragment(a: var Assembler_x86, instr: string, op: Operand) =
|
|||||||
else:
|
else:
|
||||||
error "Unsupported bitwidth: " & $a.wordBitWidth
|
error "Unsupported bitwidth: " & $a.wordBitWidth
|
||||||
|
|
||||||
a.operands.incl op.desc
|
if op.desc.constraint != ClobberedRegister:
|
||||||
|
a.operands.incl op.desc
|
||||||
|
|
||||||
func codeFragment(a: var Assembler_x86, instr: string, op0, op1: Operand) =
|
func codeFragment(a: var Assembler_x86, instr: string, op0, op1: Operand) =
|
||||||
# Generate a code fragment
|
# Generate a code fragment
|
||||||
@ -352,8 +394,56 @@ func codeFragment(a: var Assembler_x86, instr: string, op0, op1: Operand) =
|
|||||||
else:
|
else:
|
||||||
error "Unsupported bitwidth: " & $a.wordBitWidth
|
error "Unsupported bitwidth: " & $a.wordBitWidth
|
||||||
|
|
||||||
a.operands.incl op0.desc
|
if op0.desc.constraint != ClobberedRegister:
|
||||||
a.operands.incl op1.desc
|
a.operands.incl op0.desc
|
||||||
|
if op1.desc.constraint != ClobberedRegister:
|
||||||
|
a.operands.incl op1.desc
|
||||||
|
|
||||||
|
func codeFragment(a: var Assembler_x86, instr: string, op: Operand, reg: Register) =
|
||||||
|
# Generate a code fragment
|
||||||
|
# ⚠️ Warning:
|
||||||
|
# The caller should deal with destination/source operand
|
||||||
|
# so that it fits GNU Assembly
|
||||||
|
let off = a.getStrOffset(op)
|
||||||
|
|
||||||
|
if a.wordBitWidth == 64:
|
||||||
|
a.code &= instr & "q " & off & ", %%" & $reg & '\n'
|
||||||
|
else:
|
||||||
|
a.code &= instr & "l " & off & ", %%" & $reg & '\n'
|
||||||
|
|
||||||
|
# op.desc can be nil for renamed registers (using asArrayAddr)
|
||||||
|
if not op.desc.isNil and op.desc.constraint != ClobberedRegister:
|
||||||
|
a.operands.incl op.desc
|
||||||
|
a.regClobbers.incl reg
|
||||||
|
|
||||||
|
func codeFragment(a: var Assembler_x86, instr: string, reg: Register, op: Operand) =
|
||||||
|
# Generate a code fragment
|
||||||
|
# ⚠️ Warning:
|
||||||
|
# The caller should deal with destination/source operand
|
||||||
|
# so that it fits GNU Assembly
|
||||||
|
let off = a.getStrOffset(op)
|
||||||
|
|
||||||
|
if a.wordBitWidth == 64:
|
||||||
|
a.code &= instr & "q %%" & $reg & ", " & off & '\n'
|
||||||
|
else:
|
||||||
|
a.code &= instr & "l %%" & $reg & ", " & off & '\n'
|
||||||
|
|
||||||
|
if op.desc.constraint != ClobberedRegister:
|
||||||
|
a.operands.incl op.desc
|
||||||
|
a.regClobbers.incl reg
|
||||||
|
|
||||||
|
func codeFragment(a: var Assembler_x86, instr: string, reg0, reg1: Register) =
|
||||||
|
# Generate a code fragment
|
||||||
|
# ⚠️ Warning:
|
||||||
|
# The caller should deal with destination/source operand
|
||||||
|
# so that it fits GNU Assembly
|
||||||
|
if a.wordBitWidth == 64:
|
||||||
|
a.code &= instr & "q %%" & $reg0 & ", %%" & $reg1 & '\n'
|
||||||
|
else:
|
||||||
|
a.code &= instr & "l %%" & $reg0 & ", %%" & $reg1 & '\n'
|
||||||
|
|
||||||
|
a.regClobbers.incl reg0
|
||||||
|
a.regClobbers.incl reg1
|
||||||
|
|
||||||
func codeFragment(a: var Assembler_x86, instr: string, imm: int, op: Operand) =
|
func codeFragment(a: var Assembler_x86, instr: string, imm: int, op: Operand) =
|
||||||
# Generate a code fragment
|
# Generate a code fragment
|
||||||
@ -367,7 +457,8 @@ func codeFragment(a: var Assembler_x86, instr: string, imm: int, op: Operand) =
|
|||||||
else:
|
else:
|
||||||
a.code &= instr & "l $" & $imm & ", " & off & '\n'
|
a.code &= instr & "l $" & $imm & ", " & off & '\n'
|
||||||
|
|
||||||
a.operands.incl op.desc
|
if op.desc.constraint != ClobberedRegister:
|
||||||
|
a.operands.incl op.desc
|
||||||
|
|
||||||
func codeFragment(a: var Assembler_x86, instr: string, reg: Register, op: OperandReuse) =
|
func codeFragment(a: var Assembler_x86, instr: string, reg: Register, op: OperandReuse) =
|
||||||
# Generate a code fragment
|
# Generate a code fragment
|
||||||
@ -378,6 +469,7 @@ func codeFragment(a: var Assembler_x86, instr: string, reg: Register, op: Operan
|
|||||||
a.code &= instr & "q %%" & $reg & ", %" & $op.asmId & '\n'
|
a.code &= instr & "q %%" & $reg & ", %" & $op.asmId & '\n'
|
||||||
else:
|
else:
|
||||||
a.code &= instr & "l %%" & $reg & ", %" & $op.asmId & '\n'
|
a.code &= instr & "l %%" & $reg & ", %" & $op.asmId & '\n'
|
||||||
|
a.regClobbers.incl reg
|
||||||
|
|
||||||
func codeFragment(a: var Assembler_x86, instr: string, op: OperandReuse, reg: Register) =
|
func codeFragment(a: var Assembler_x86, instr: string, op: OperandReuse, reg: Register) =
|
||||||
# Generate a code fragment
|
# Generate a code fragment
|
||||||
@ -388,6 +480,7 @@ func codeFragment(a: var Assembler_x86, instr: string, op: OperandReuse, reg: Re
|
|||||||
a.code &= instr & "q %" & $op.asmId & ", %%" & $reg & '\n'
|
a.code &= instr & "q %" & $op.asmId & ", %%" & $reg & '\n'
|
||||||
else:
|
else:
|
||||||
a.code &= instr & "l %" & $op.asmId & ", %%" & $reg & '\n'
|
a.code &= instr & "l %" & $op.asmId & ", %%" & $reg & '\n'
|
||||||
|
a.regClobbers.incl reg
|
||||||
|
|
||||||
func codeFragment(a: var Assembler_x86, instr: string, imm: int, reg: Register) =
|
func codeFragment(a: var Assembler_x86, instr: string, imm: int, reg: Register) =
|
||||||
# Generate a code fragment
|
# Generate a code fragment
|
||||||
@ -398,16 +491,7 @@ func codeFragment(a: var Assembler_x86, instr: string, imm: int, reg: Register)
|
|||||||
a.code &= instr & "q $" & $imm & ", %%" & $reg & '\n'
|
a.code &= instr & "q $" & $imm & ", %%" & $reg & '\n'
|
||||||
else:
|
else:
|
||||||
a.code &= instr & "l $" & $imm & ", %%" & $reg & '\n'
|
a.code &= instr & "l $" & $imm & ", %%" & $reg & '\n'
|
||||||
|
a.regClobbers.incl reg
|
||||||
func codeFragment(a: var Assembler_x86, instr: string, reg0, reg1: Register) =
|
|
||||||
# Generate a code fragment
|
|
||||||
# ⚠️ Warning:
|
|
||||||
# The caller should deal with destination/source operand
|
|
||||||
# so that it fits GNU Assembly
|
|
||||||
if a.wordBitWidth == 64:
|
|
||||||
a.code &= instr & "q %%" & $reg0 & ", %%" & $reg1 & '\n'
|
|
||||||
else:
|
|
||||||
a.code &= instr & "l %%" & $reg0 & ", %%" & $reg1 & '\n'
|
|
||||||
|
|
||||||
func codeFragment(a: var Assembler_x86, instr: string, imm: int, reg: OperandReuse) =
|
func codeFragment(a: var Assembler_x86, instr: string, imm: int, reg: OperandReuse) =
|
||||||
# Generate a code fragment
|
# Generate a code fragment
|
||||||
@ -429,33 +513,35 @@ func codeFragment(a: var Assembler_x86, instr: string, reg0, reg1: OperandReuse)
|
|||||||
else:
|
else:
|
||||||
a.code &= instr & "l %" & $reg0.asmId & ", %" & $reg1.asmId & '\n'
|
a.code &= instr & "l %" & $reg0.asmId & ", %" & $reg1.asmId & '\n'
|
||||||
|
|
||||||
func codeFragment(a: var Assembler_x86, instr: string, reg0: OperandReuse, reg1: Operand) =
|
func codeFragment(a: var Assembler_x86, instr: string, op0: OperandReuse, op1: Operand) =
|
||||||
# Generate a code fragment
|
# Generate a code fragment
|
||||||
# ⚠️ Warning:
|
# ⚠️ Warning:
|
||||||
# The caller should deal with destination/source operand
|
# The caller should deal with destination/source operand
|
||||||
# so that it fits GNU Assembly
|
# so that it fits GNU Assembly
|
||||||
let off1 = a.getStrOffset(reg1)
|
let off1 = a.getStrOffset(op1)
|
||||||
|
|
||||||
if a.wordBitWidth == 64:
|
if a.wordBitWidth == 64:
|
||||||
a.code &= instr & "q %" & $reg0.asmId & ", " & off1 & '\n'
|
a.code &= instr & "q %" & $op0.asmId & ", " & off1 & '\n'
|
||||||
else:
|
else:
|
||||||
a.code &= instr & "l %" & $reg0.asmId & ", " & off1 & '\n'
|
a.code &= instr & "l %" & $op0.asmId & ", " & off1 & '\n'
|
||||||
|
|
||||||
a.operands.incl reg1.desc
|
if op1.desc.constraint != ClobberedRegister:
|
||||||
|
a.operands.incl op1.desc
|
||||||
|
|
||||||
func codeFragment(a: var Assembler_x86, instr: string, reg0: Operand, reg1: OperandReuse) =
|
func codeFragment(a: var Assembler_x86, instr: string, op0: Operand, op1: OperandReuse) =
|
||||||
# Generate a code fragment
|
# Generate a code fragment
|
||||||
# ⚠️ Warning:
|
# ⚠️ Warning:
|
||||||
# The caller should deal with destination/source operand
|
# The caller should deal with destination/source operand
|
||||||
# so that it fits GNU Assembly
|
# so that it fits GNU Assembly
|
||||||
let off0 = a.getStrOffset(reg0)
|
let off0 = a.getStrOffset(op0)
|
||||||
|
|
||||||
if a.wordBitWidth == 64:
|
if a.wordBitWidth == 64:
|
||||||
a.code &= instr & "q " & off0 & ", %" & $reg1.asmId & '\n'
|
a.code &= instr & "q " & off0 & ", %" & $op1.asmId & '\n'
|
||||||
else:
|
else:
|
||||||
a.code &= instr & "l " & off0 & ", %" & $reg1.asmId & '\n'
|
a.code &= instr & "l " & off0 & ", %" & $op1.asmId & '\n'
|
||||||
|
|
||||||
a.operands.incl reg0.desc
|
if op0.desc.constraint != ClobberedRegister:
|
||||||
|
a.operands.incl op0.desc
|
||||||
|
|
||||||
func reuseRegister*(reg: OperandArray): OperandReuse =
|
func reuseRegister*(reg: OperandArray): OperandReuse =
|
||||||
# TODO: disable the reg input
|
# TODO: disable the reg input
|
||||||
@ -481,6 +567,22 @@ func add*(a: var Assembler_x86, dst, src: Operand) =
|
|||||||
a.codeFragment("add", src, dst)
|
a.codeFragment("add", src, dst)
|
||||||
a.areFlagsClobbered = true
|
a.areFlagsClobbered = true
|
||||||
|
|
||||||
|
func add*(a: var Assembler_x86, dst, src: Register) =
|
||||||
|
## Does: dst <- dst + src
|
||||||
|
a.codeFragment("add", src, dst)
|
||||||
|
a.areFlagsClobbered = true
|
||||||
|
|
||||||
|
func add*(a: var Assembler_x86, dst: Operand, src: Register) =
|
||||||
|
## Does: dst <- dst + src
|
||||||
|
doAssert dst.desc.constraint in OutputReg
|
||||||
|
a.codeFragment("add", src, dst)
|
||||||
|
a.areFlagsClobbered = true
|
||||||
|
|
||||||
|
func add*(a: var Assembler_x86, dst: Register, src: Operand) =
|
||||||
|
## Does: dst <- dst + src
|
||||||
|
a.codeFragment("add", src, dst)
|
||||||
|
a.areFlagsClobbered = true
|
||||||
|
|
||||||
func adc*(a: var Assembler_x86, dst, src: Operand) =
|
func adc*(a: var Assembler_x86, dst, src: Operand) =
|
||||||
## Does: dst <- dst + src + carry
|
## Does: dst <- dst + src + carry
|
||||||
doAssert dst.desc.constraint in OutputReg
|
doAssert dst.desc.constraint in OutputReg
|
||||||
@ -490,6 +592,11 @@ func adc*(a: var Assembler_x86, dst, src: Operand) =
|
|||||||
if dst.desc.rm in {Mem, MemOffsettable, AnyRegOrMem}:
|
if dst.desc.rm in {Mem, MemOffsettable, AnyRegOrMem}:
|
||||||
{.warning: "Using addcarry with a memory destination, this incurs significant performance penalties.".}
|
{.warning: "Using addcarry with a memory destination, this incurs significant performance penalties.".}
|
||||||
|
|
||||||
|
func adc*(a: var Assembler_x86, dst, src: Register) =
|
||||||
|
## Does: dst <- dst + src + carry
|
||||||
|
a.codeFragment("adc", src, dst)
|
||||||
|
a.areFlagsClobbered = true
|
||||||
|
|
||||||
func adc*(a: var Assembler_x86, dst: Operand, imm: int) =
|
func adc*(a: var Assembler_x86, dst: Operand, imm: int) =
|
||||||
## Does: dst <- dst + imm + carry
|
## Does: dst <- dst + imm + carry
|
||||||
doAssert dst.desc.constraint in OutputReg
|
doAssert dst.desc.constraint in OutputReg
|
||||||
@ -499,6 +606,17 @@ func adc*(a: var Assembler_x86, dst: Operand, imm: int) =
|
|||||||
if dst.desc.rm in {Mem, MemOffsettable, AnyRegOrMem}:
|
if dst.desc.rm in {Mem, MemOffsettable, AnyRegOrMem}:
|
||||||
{.warning: "Using addcarry with a memory destination, this incurs significant performance penalties.".}
|
{.warning: "Using addcarry with a memory destination, this incurs significant performance penalties.".}
|
||||||
|
|
||||||
|
func adc*(a: var Assembler_x86, dst: Operand, src: Register) =
|
||||||
|
## Does: dst <- dst + src + carry
|
||||||
|
doAssert dst.desc.constraint in OutputReg
|
||||||
|
a.codeFragment("adc", src, dst)
|
||||||
|
a.areFlagsClobbered = true
|
||||||
|
|
||||||
|
func adc*(a: var Assembler_x86, dst: Register, imm: int) =
|
||||||
|
## Does: dst <- dst + imm + carry
|
||||||
|
a.codeFragment("adc", imm, dst)
|
||||||
|
a.areFlagsClobbered = true
|
||||||
|
|
||||||
func sub*(a: var Assembler_x86, dst, src: Operand) =
|
func sub*(a: var Assembler_x86, dst, src: Operand) =
|
||||||
## Does: dst <- dst - src
|
## Does: dst <- dst - src
|
||||||
doAssert dst.desc.constraint in OutputReg
|
doAssert dst.desc.constraint in OutputReg
|
||||||
@ -597,6 +715,12 @@ func `xor`*(a: var Assembler_x86, dst, src: Operand) =
|
|||||||
a.codeFragment("xor", src, dst)
|
a.codeFragment("xor", src, dst)
|
||||||
a.areFlagsClobbered = true
|
a.areFlagsClobbered = true
|
||||||
|
|
||||||
|
func `xor`*(a: var Assembler_x86, dst, src: Register) =
|
||||||
|
## Compute the bitwise xor of dst and src, store it in dst and
|
||||||
|
## reset all flags
|
||||||
|
a.codeFragment("xor", src, dst)
|
||||||
|
a.areFlagsClobbered = true
|
||||||
|
|
||||||
func mov*(a: var Assembler_x86, dst, src: Operand) =
|
func mov*(a: var Assembler_x86, dst, src: Operand) =
|
||||||
## Does: dst <- src
|
## Does: dst <- src
|
||||||
doAssert dst.desc.constraint in OutputReg, $dst.repr
|
doAssert dst.desc.constraint in OutputReg, $dst.repr
|
||||||
@ -625,16 +749,26 @@ func mov*(a: var Assembler_x86, dst: Operand, imm: int) =
|
|||||||
a.codeFragment("mov", imm, dst)
|
a.codeFragment("mov", imm, dst)
|
||||||
# No clobber
|
# No clobber
|
||||||
|
|
||||||
|
func mov*(a: var Assembler_x86, dst: Register, imm: int) =
|
||||||
|
## Does: dst <- imm with dst a fixed register
|
||||||
|
a.codeFragment("mov", imm, dst)
|
||||||
|
|
||||||
|
func mov*(a: var Assembler_x86, dst: Register, src: Operand) =
|
||||||
|
## Does: dst <- src with dst a fixed register
|
||||||
|
a.codeFragment("mov", src, dst)
|
||||||
|
|
||||||
|
func mov*(a: var Assembler_x86, dst: Operand, src: Register) =
|
||||||
|
## Does: dst <- src with src a fixed register
|
||||||
|
a.codeFragment("mov", src, dst)
|
||||||
|
|
||||||
func mov*(a: var Assembler_x86, dst: Register, src: OperandReuse) =
|
func mov*(a: var Assembler_x86, dst: Register, src: OperandReuse) =
|
||||||
## Does: dst <- src with dst a fixed register
|
## Does: dst <- src with dst a fixed register
|
||||||
a.codeFragment("mov", src, dst)
|
a.codeFragment("mov", src, dst)
|
||||||
# No clobber
|
|
||||||
|
|
||||||
func mov*(a: var Assembler_x86, dst: OperandReuse, src: Register) =
|
func mov*(a: var Assembler_x86, dst: OperandReuse, src: Register) =
|
||||||
## Does: dst <- src with src a fixed register
|
## Does: dst <- src with src a fixed register
|
||||||
# doAssert dst.desc.constraint in OutputReg, $dst.repr
|
# doAssert dst.desc.constraint in OutputReg, $dst.repr
|
||||||
a.codeFragment("mov", src, dst)
|
a.codeFragment("mov", src, dst)
|
||||||
# No clobber
|
|
||||||
|
|
||||||
func cmovc*(a: var Assembler_x86, dst, src: Operand) =
|
func cmovc*(a: var Assembler_x86, dst, src: Operand) =
|
||||||
## Does: dst <- src if the carry flag is set
|
## Does: dst <- src if the carry flag is set
|
||||||
@ -697,6 +831,8 @@ func mul*(a: var Assembler_x86, dHi, dLo: Register, src0: Operand, src1: Registe
|
|||||||
doAssert src1 == rax, "MUL requires the RAX register"
|
doAssert src1 == rax, "MUL requires the RAX register"
|
||||||
doAssert dHi == rdx, "MUL requires the RDX register"
|
doAssert dHi == rdx, "MUL requires the RDX register"
|
||||||
doAssert dLo == rax, "MUL requires the RAX register"
|
doAssert dLo == rax, "MUL requires the RAX register"
|
||||||
|
a.regClobbers.incl rax
|
||||||
|
a.regClobbers.incl rdx
|
||||||
|
|
||||||
a.codeFragment("mul", src0)
|
a.codeFragment("mul", src0)
|
||||||
|
|
||||||
@ -707,9 +843,15 @@ func imul*(a: var Assembler_x86, dst, src: Operand) =
|
|||||||
|
|
||||||
a.codeFragment("imul", src, dst)
|
a.codeFragment("imul", src, dst)
|
||||||
|
|
||||||
|
func imul*(a: var Assembler_x86, dst: Register, src: Operand) =
|
||||||
|
## Does dst <- dst * src, keeping only the low half
|
||||||
|
a.codeFragment("imul", src, dst)
|
||||||
|
|
||||||
func mulx*(a: var Assembler_x86, dHi, dLo, src0: Operand, src1: Register) =
|
func mulx*(a: var Assembler_x86, dHi, dLo, src0: Operand, src1: Register) =
|
||||||
## Does (dHi, dLo) <- src0 * src1
|
## Does (dHi, dLo) <- src0 * src1
|
||||||
doAssert src1 == rdx, "MULX requires the RDX register"
|
doAssert src1 == rdx, "MULX requires the RDX register"
|
||||||
|
a.regClobbers.incl rdx
|
||||||
|
|
||||||
doAssert dHi.desc.rm in {Reg, ElemsInReg}+SpecificRegisters,
|
doAssert dHi.desc.rm in {Reg, ElemsInReg}+SpecificRegisters,
|
||||||
"The destination operand must be a register " & $dHi.repr
|
"The destination operand must be a register " & $dHi.repr
|
||||||
doAssert dLo.desc.rm in {Reg, ElemsInReg}+SpecificRegisters,
|
doAssert dLo.desc.rm in {Reg, ElemsInReg}+SpecificRegisters,
|
||||||
@ -727,9 +869,31 @@ func mulx*(a: var Assembler_x86, dHi, dLo, src0: Operand, src1: Register) =
|
|||||||
|
|
||||||
a.operands.incl src0.desc
|
a.operands.incl src0.desc
|
||||||
|
|
||||||
|
func mulx*(a: var Assembler_x86, dHi: Operand, dLo: Register, src0: Operand, src1: Register) =
|
||||||
|
## Does (dHi, dLo) <- src0 * src1
|
||||||
|
doAssert src1 == rdx, "MULX requires the RDX register"
|
||||||
|
a.regClobbers.incl rdx
|
||||||
|
|
||||||
|
doAssert dHi.desc.rm in {Reg, ElemsInReg}+SpecificRegisters,
|
||||||
|
"The destination operand must be a register " & $dHi.repr
|
||||||
|
doAssert dHi.desc.constraint in OutputReg
|
||||||
|
|
||||||
|
let off0 = a.getStrOffset(src0)
|
||||||
|
|
||||||
|
# Annoying AT&T syntax
|
||||||
|
if a.wordBitWidth == 64:
|
||||||
|
a.code &= "mulxq " & off0 & ", %%" & $dLo & ", %" & $dHi.desc.asmId & '\n'
|
||||||
|
else:
|
||||||
|
a.code &= "mulxl " & off0 & ", %%" & $dLo & ", %" & $dHi.desc.asmId & '\n'
|
||||||
|
|
||||||
|
a.operands.incl src0.desc
|
||||||
|
a.regClobbers.incl dLo
|
||||||
|
|
||||||
func mulx*(a: var Assembler_x86, dHi: OperandReuse, dLo, src0: Operand, src1: Register) =
|
func mulx*(a: var Assembler_x86, dHi: OperandReuse, dLo, src0: Operand, src1: Register) =
|
||||||
## Does (dHi, dLo) <- src0 * src1
|
## Does (dHi, dLo) <- src0 * src1
|
||||||
doAssert src1 == rdx, "MULX requires the RDX register"
|
doAssert src1 == rdx, "MULX requires the RDX register"
|
||||||
|
a.regClobbers.incl rdx
|
||||||
|
|
||||||
doAssert dLo.desc.rm in {Reg, ElemsInReg}+SpecificRegisters,
|
doAssert dLo.desc.rm in {Reg, ElemsInReg}+SpecificRegisters,
|
||||||
"The destination operand must be a register " & $dLo.repr
|
"The destination operand must be a register " & $dLo.repr
|
||||||
doAssert dLo.desc.constraint in OutputReg
|
doAssert dLo.desc.constraint in OutputReg
|
||||||
@ -744,7 +908,40 @@ func mulx*(a: var Assembler_x86, dHi: OperandReuse, dLo, src0: Operand, src1: Re
|
|||||||
|
|
||||||
a.operands.incl src0.desc
|
a.operands.incl src0.desc
|
||||||
|
|
||||||
func adcx*(a: var Assembler_x86, dst: Operand|OperandReuse, src: Operand|OperandReuse) =
|
func mulx*(
       a: var Assembler_x86,
       dHi: OperandReuse, dLo: Register,
       src0: Operand, src1: Register) =
  ## Emit a MULX computing (dHi, dLo) <- src0 * src1
  ##
  ## - `dHi`: reused operand receiving the high part, referenced in the
  ##   emitted text through its `asmId`.
  ## - `dLo`: destination for the low part, named directly as a hardware
  ##   register; it is added to the clobbered-register set.
  ## - `src0`: explicit multiplicand, referenced through `getStrOffset`
  ##   and added to the assembler's operand set.
  ## - `src1`: must be `rdx` (asserted); `rdx` is also added to the
  ##   clobbered-register set.
  ##
  ## The emitted line is `mulxq` when `a.wordBitWidth == 64` and `mulxl`
  ## otherwise, written as `<mnemonic> src0, %%dLo, %dHi`.
  doAssert src1 == rdx, "MULX requires the RDX register"
  a.regClobbers.incl rdx

  let srcOffset = a.getStrOffset(src0)

  # Only the size suffix of the mnemonic depends on the word width;
  # the operand list (AT&T order: source first, destinations last)
  # is the same in both cases.
  let mnemonic =
    if a.wordBitWidth == 64: "mulxq "
    else: "mulxl "
  a.code &= mnemonic & srcOffset & ", %%" & $dLo & ", %" & $dHi.asmId & '\n'

  a.operands.incl src0.desc
  a.regClobbers.incl dLo
|
||||||
|
|
||||||
|
func mulx*(
       a: var Assembler_x86,
       dHi, dLo: Register,
       src0: Operand, src1: Register) =
  ## Emit a MULX computing (dHi, dLo) <- src0 * src1
  ##
  ## - `dHi`, `dLo`: destinations for the high and low parts, both named
  ##   directly as hardware registers; both are added to the
  ##   clobbered-register set.
  ## - `src0`: explicit multiplicand, referenced through `getStrOffset`
  ##   and added to the assembler's operand set.
  ## - `src1`: must be `rdx` (asserted); `rdx` is also added to the
  ##   clobbered-register set.
  ##
  ## The emitted line is `mulxq` when `a.wordBitWidth == 64` and `mulxl`
  ## otherwise, written as `<mnemonic> src0, %%dLo, %%dHi`.
  doAssert src1 == rdx, "MULX requires the RDX register"
  a.regClobbers.incl rdx

  let srcOffset = a.getStrOffset(src0)

  # Only the size suffix of the mnemonic depends on the word width;
  # the operand list (AT&T order: source first, destinations last)
  # is the same in both cases.
  let mnemonic =
    if a.wordBitWidth == 64: "mulxq "
    else: "mulxl "
  a.code &= mnemonic & srcOffset & ", %%" & $dLo & ", %%" & $dHi & '\n'

  a.operands.incl src0.desc
  a.regClobbers.incl dHi
  a.regClobbers.incl dLo
|
||||||
|
|
||||||
|
func adcx*(a: var Assembler_x86, dst: Operand|OperandReuse, src: Operand|OperandReuse|Register) =
|
||||||
## Does: dst <- dst + src + carry
|
## Does: dst <- dst + src + carry
|
||||||
## and only sets the carry flag
|
## and only sets the carry flag
|
||||||
when dst is Operand:
|
when dst is Operand:
|
||||||
@ -753,7 +950,7 @@ func adcx*(a: var Assembler_x86, dst: Operand|OperandReuse, src: Operand|Operand
|
|||||||
a.codeFragment("adcx", src, dst)
|
a.codeFragment("adcx", src, dst)
|
||||||
a.areFlagsClobbered = true
|
a.areFlagsClobbered = true
|
||||||
|
|
||||||
func adox*(a: var Assembler_x86, dst: Operand|OperandReuse, src: Operand|OperandReuse) =
|
func adox*(a: var Assembler_x86, dst: Operand|OperandReuse, src: Operand|OperandReuse|Register) =
|
||||||
## Does: dst <- dst + src + overflow
|
## Does: dst <- dst + src + overflow
|
||||||
## and only sets the overflow flag
|
## and only sets the overflow flag
|
||||||
when dst is Operand:
|
when dst is Operand:
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user