diff --git a/constantine/arithmetic/assembly/limbs_asm_montmul_x86.nim b/constantine/arithmetic/assembly/limbs_asm_montmul_x86.nim index 9280f16..0af7fba 100644 --- a/constantine/arithmetic/assembly/limbs_asm_montmul_x86.nim +++ b/constantine/arithmetic/assembly/limbs_asm_montmul_x86.nim @@ -57,25 +57,6 @@ macro montMul_CIOS_nocarry_gen[N: static int](r_MM: var Limbs[N], a_MM, b_MM, M_ scratch = init(OperandArray, nimSymbol = ident"scratch", scratchSlots, ElemsInReg, InputOutput_EnsureClobber) # MUL requires RAX and RDX - rRAX = Operand( - desc: OperandDesc( - asmId: "[rax]", - nimSymbol: ident"rax", - rm: RAX, - constraint: Output_EarlyClobber, - cEmit: "rax" - ) - ) - - rRDX = Operand( - desc: OperandDesc( - asmId: "[rdx]", - nimSymbol: ident"rdx", - rm: RDX, - constraint: Output_EarlyClobber, - cEmit: "rdx" - ) - ) m0ninv = Operand( desc: OperandDesc( @@ -109,16 +90,12 @@ macro montMul_CIOS_nocarry_gen[N: static int](r_MM: var Limbs[N], a_MM, b_MM, M_ let tsym = t.nimSymbol let scratchSym = scratch.nimSymbol - let eax = rRAX.desc.nimSymbol - let edx = rRDX.desc.nimSymbol result.add quote do: static: doAssert: sizeof(SecretWord) == sizeof(ByteAddress) var `tsym`: typeof(`r_MM`) # zero init # Assumes 64-bit limbs on 64-bit arch (or you can't store an address) var `scratchSym` {.noInit.}: Limbs[`scratchSlots`] - var `eax`{.noInit.}, `edx`{.noInit.}: BaseType - `scratchSym`[0] = cast[SecretWord](`a_MM`[0].unsafeAddr) `scratchSym`[1] = cast[SecretWord](`b_MM`[0].unsafeAddr) `scratchSym`[5] = cast[SecretWord](`r_MM`[0].unsafeAddr) @@ -140,14 +117,14 @@ macro montMul_CIOS_nocarry_gen[N: static int](r_MM: var Limbs[N], a_MM, b_MM, M_ for i in 0 ..< N: # (A, t[0]) <- a[0] * b[i] + t[0] - ctx.mov rRAX, a[0] + ctx.mov rax, a[0] ctx.mul rdx, rax, b[i], rax if i == 0: # overwrite t[0] - ctx.mov t[0], rRAX + ctx.mov t[0], rax else: # Accumulate in t[0] - ctx.add t[0], rRAX - ctx.adc rRDX, 0 - ctx.mov A, rRDX + ctx.add t[0], rax + ctx.adc rdx, 0 + ctx.mov A, rdx # m <- (t[0] * m0ninv) mod 2^w ctx.mov m, m0ninv @@ -155,39 +132,39 @@ macro montMul_CIOS_nocarry_gen[N: static int](r_MM: var Limbs[N], a_MM, b_MM, M_ # (C, _) <- m * M[0] + t[0] ctx.`xor` C, C - ctx.mov rRAX, M[0] + ctx.mov rax, M[0] ctx.mul rdx, rax, m, rax - ctx.add rRAX, t[0] - ctx.adc C, rRDX + ctx.add rax, t[0] + ctx.adc C, rdx for j in 1 ..< N: # (A, t[j]) <- a[j] * b[i] + A + t[j] - ctx.mov rRAX, a[j] + ctx.mov rax, a[j] ctx.mul rdx, rax, b[i], rax if i == 0: ctx.mov t[j], A else: ctx.add t[j], A - ctx.adc rRDX, 0 + ctx.adc rdx, 0 ctx.`xor` A, A - ctx.add t[j], rRAX - ctx.adc A, rRDX + ctx.add t[j], rax + ctx.adc A, rdx # (C, t[j-1]) <- m * M[j] + C + t[j] - ctx.mov rRAX, M[j] + ctx.mov rax, M[j] ctx.mul rdx, rax, m, rax ctx.add C, t[j] - ctx.adc rRDX, 0 - ctx.add C, rRAX - ctx.adc rRDX, 0 + ctx.adc rdx, 0 + ctx.add C, rax + ctx.adc rdx, 0 ctx.mov t[j-1], C - ctx.mov C, rRDX + ctx.mov C, rdx ctx.add A, C ctx.mov t[N-1], A - ctx.mov rRDX, r - let r2 = rRDX.asArrayAddr(len = N) + ctx.mov rdx, r + let r2 = rdx.asArrayAddr(len = N) ctx.finalSubNoCarry( r2, t, M, diff --git a/constantine/arithmetic/assembly/limbs_asm_montmul_x86_adx_bmi2.nim b/constantine/arithmetic/assembly/limbs_asm_montmul_x86_adx_bmi2.nim index 9d058df..c32dc90 100644 --- a/constantine/arithmetic/assembly/limbs_asm_montmul_x86_adx_bmi2.nim +++ b/constantine/arithmetic/assembly/limbs_asm_montmul_x86_adx_bmi2.nim @@ -40,7 +40,7 @@ proc mulx_by_word( t: OperandArray, a: Operand, # Pointer in scratchspace word0: Operand, - lo, rRDX: Operand + lo: Operand ) = ## Multiply the `a[0.. 1: ctx.mulx t[1], t[0], a[0], rdx ctx.`xor` hi, hi # Clear flags - TODO: necessary? @@ -87,20 +87,19 @@ proc mulaccx_by_word( a: Operand, # Pointer in scratchspace i: int, word: Operand, - lo, rRDX: Operand + lo: Operand ) = ## Multiply the `a[0..= 1: - ctx.finalSubNoCarry(r, scratch, M, reuse) + ctx.finalSubNoCarry(r, u, M, t) else: - ctx.finalSubCanOverflow(r, scratch, M, reuse, rRAX) + ctx.finalSubCanOverflow(r, u, M, t, rax) # Code generation result.add ctx.generate() diff --git a/constantine/arithmetic/assembly/limbs_asm_montred_x86_adx_bmi2.nim b/constantine/arithmetic/assembly/limbs_asm_montred_x86_adx_bmi2.nim index c4e6520..bd4bf68 100644 --- a/constantine/arithmetic/assembly/limbs_asm_montred_x86_adx_bmi2.nim +++ b/constantine/arithmetic/assembly/limbs_asm_montred_x86_adx_bmi2.nim @@ -35,15 +35,13 @@ static: doAssert UseASM_X86_64 # Montgomery reduction # ------------------------------------------------------------ -macro montyRedc2xx_gen[N: static int]( +macro montyRedc2x_gen[N: static int]( r_MR: var array[N, SecretWord], a_MR: array[N*2, SecretWord], M_MR: array[N, SecretWord], m0ninv_MR: BaseType, spareBits: static int ) = - # TODO, slower than Clang, in particular due to the shadowing - result = newStmtList() var ctx = init(Assembler_x86, BaseType) @@ -51,59 +49,29 @@ macro montyRedc2xx_gen[N: static int]( # We could force M as immediate by specializing per moduli M = init(OperandArray, nimSymbol = M_MR, N, PointerInReg, Input) - hi = Operand( - desc: OperandDesc( - asmId: "[hi]", - nimSymbol: ident"hi", - rm: Reg, - constraint: Output_EarlyClobber, - cEmit: "hi" - ) - ) + let uSlots = N+1 + let vSlots = max(N-1, 5) - lo = Operand( - desc: OperandDesc( - asmId: "[lo]", - nimSymbol: ident"lo", - rm: Reg, - constraint: Output_EarlyClobber, - cEmit: "lo" - ) - ) - - rRDX = Operand( - desc: OperandDesc( - asmId: "[rdx]", - nimSymbol: ident"rdx", - rm: RDX, - constraint: InputOutput_EnsureClobber, - cEmit: "rdx" - ) - ) - - m0ninv = Operand( - desc: OperandDesc( - asmId: "[m0ninv]", - nimSymbol: m0ninv_MR, - rm: Reg, - constraint: Input, - cEmit: "m0ninv" - ) - ) - - let scratchSlots = N+1 - var scratch = init(OperandArray, nimSymbol = ident"scratch", scratchSlots, ElemsInReg, InputOutput_EnsureClobber) + var # Scratchspaces + u = init(OperandArray, nimSymbol = ident"U", uSlots, ElemsInReg, InputOutput_EnsureClobber) + v = init(OperandArray, nimSymbol = ident"V", vSlots, ElemsInReg, InputOutput_EnsureClobber) # Prologue - let edx = rRDX.desc.nimSymbol - let hisym = hi.desc.nimSymbol - let losym = lo.desc.nimSymbol - let scratchSym = scratch.nimSymbol + let usym = u.nimSymbol + let vsym = v.nimSymbol result.add quote do: static: doAssert: sizeof(SecretWord) == sizeof(ByteAddress) + var `usym`{.noinit.}: Limbs[`uSlots`] + var `vsym` {.noInit.}: Limbs[`vSlots`] + `vsym`[0] = cast[SecretWord](`r_MR`[0].unsafeAddr) + `vsym`[1] = cast[SecretWord](`a_MR`[0].unsafeAddr) + `vsym`[2] = SecretWord(`m0ninv_MR`) - var `hisym`{.noInit.}, `losym`{.noInit.}, `edx`{.noInit.}: BaseType - var `scratchSym` {.noInit.}: Limbs[`scratchSlots`] + let r_temp = v[0].asArrayAddr(len = N) + let a = v[1].asArrayAddr(len = 2*N) + let m0ninv = v[2] + let lo = v[3] + let hi = v[4] # Algorithm # --------------------------------------------------------- @@ -122,63 +90,52 @@ macro montyRedc2xx_gen[N: static int]( # No register spilling handling doAssert N <= 6, "The Assembly-optimized montgomery multiplication requires at most 6 limbs." - result.add quote do: - `edx` = BaseType(`m0ninv_MR`) - staticFor i, 0, `N`: # Do NOT use Nim slice/toOpenArray, they are not inlined - `scratchSym`[i] = `a_MR`[i] + ctx.mov rdx, m0ninv + + for i in 0 ..< N: + ctx.mov u[i], a[i] for i in 0 ..< N: # RDX contains m0ninv at the start of each loop ctx.comment "" - ctx.imul rRDX, scratch[0] # m <- a[i] * m0ninv mod 2^w + ctx.imul rdx, u[0] # m <- a[i] * m0ninv mod 2^w ctx.comment "---- Reduction " & $i - ctx.`xor` scratch[N], scratch[N] + ctx.`xor` u[N], u[N] for j in 0 ..< N-1: ctx.comment "" ctx.mulx hi, lo, M[j], rdx - ctx.adcx scratch[j], lo - ctx.adox scratch[j+1], hi + ctx.adcx u[j], lo + ctx.adox u[j+1], hi # Last limb ctx.comment "" ctx.mulx hi, lo, M[N-1], rdx - ctx.mov rRDX, m0ninv # Reload m0ninv for next iter - ctx.adcx scratch[N-1], lo - ctx.adox hi, scratch[N] - ctx.adcx scratch[N], hi + ctx.mov rdx, m0ninv # Reload m0ninv for next iter + ctx.adcx u[N-1], lo + ctx.adox hi, u[N] + ctx.adcx u[N], hi - scratch.rotateLeft() + u.rotateLeft() - # Code generation - result.add ctx.generate() - - # New codegen - ctx = init(Assembler_x86, BaseType) - - let r = init(OperandArray, nimSymbol = r_MR, N, PointerInReg, InputOutput_EnsureClobber) - let a = init(OperandArray, nimSymbol = a_MR, N*2, PointerInReg, Input) - let extraRegNeeded = N-1 - let t = init(OperandArray, nimSymbol = ident"t", extraRegNeeded, ElemsInReg, InputOutput_EnsureClobber) - let tsym = t.nimSymbol - result.add quote do: - var `tsym` {.noInit.}: Limbs[`extraRegNeeded`] + ctx.mov rdx, r_temp + let r = rdx.asArrayAddr(len = N) # This does a[i+n] += hi # but in a separate carry chain, fused with the # copy "r[i] = a[i+n]" for i in 0 ..< N: if i == 0: - ctx.add scratch[i], a[i+N] + ctx.add u[i], a[i+N] else: - ctx.adc scratch[i], a[i+N] + ctx.adc u[i], a[i+N] - let reuse = repackRegisters(t, scratch[N]) + let t = repackRegisters(v, u[N]) if spareBits >= 1: - ctx.finalSubNoCarry(r, scratch, M, reuse) + ctx.finalSubNoCarry(r, u, M, t) else: - ctx.finalSubCanOverflow(r, scratch, M, reuse, hi) + ctx.finalSubCanOverflow(r, u, M, t, hi) # Code generation result.add ctx.generate() @@ -191,4 +148,4 @@ func montRed_asm_adx_bmi2*[N: static int]( spareBits: static int ) = ## Constant-time Montgomery reduction - montyRedc2xx_gen(r, a, M, m0ninv, spareBits) + montyRedc2x_gen(r, a, M, m0ninv, spareBits) diff --git a/constantine/arithmetic/assembly/limbs_asm_mul_x86.nim b/constantine/arithmetic/assembly/limbs_asm_mul_x86.nim index 547a5f7..0863155 100644 --- a/constantine/arithmetic/assembly/limbs_asm_mul_x86.nim +++ b/constantine/arithmetic/assembly/limbs_asm_mul_x86.nim @@ -81,36 +81,13 @@ macro mul_gen[rLen, aLen, bLen: static int](r: var Limbs[rLen], a: Limbs[aLen], ) # MUL requires RAX and RDX - rRAX = Operand( - desc: OperandDesc( - asmId: "[rax]", - nimSymbol: ident"rax", - rm: RAX, - constraint: Output_EarlyClobber, - cEmit: "rax" - ) - ) - - rRDX = Operand( - desc: OperandDesc( - asmId: "[rdx]", - nimSymbol: ident"rdx", - rm: RDX, - constraint: Output_EarlyClobber, - cEmit: "rdx" - ) - ) - # Prologue let tsym = t.desc.nimSymbol let usym = u.desc.nimSymbol let vsym = v.desc.nimSymbol - let eax = rRAX.desc.nimSymbol - let edx = rRDX.desc.nimSymbol result.add quote do: var `tsym`{.noInit.}, `usym`{.noInit.}, `vsym`{.noInit.}: BaseType # zero-init - var `eax`{.noInit.}, `edx`{.noInit.}: BaseType # Algorithm ctx.`xor` u, u @@ -127,10 +104,10 @@ macro mul_gen[rLen, aLen, bLen: static int](r: var Limbs[rLen], a: Limbs[aLen], let ia = i - ib for j in 0 ..< min(aLen - ia, ib+1): # (t, u, v) <- (t, u, v) + a[ia+j] * b[ib-j] - ctx.mov rRAX, arrB[ib-j] + ctx.mov rax, arrB[ib-j] ctx.mul rdx, rax, arrA[ia+j], rax - ctx.add v, rRAX - ctx.adc u, rRDX + ctx.add v, rax + ctx.adc u, rdx ctx.adc t, 0 ctx.mov arrR[i], v @@ -141,9 +118,9 @@ macro mul_gen[rLen, aLen, bLen: static int](r: var Limbs[rLen], a: Limbs[aLen], ctx.`xor` t, t if aLen+bLen < rLen: - ctx.`xor` rRAX, rRAX + ctx.`xor` rax, rax for i in aLen+bLen ..< rLen: - ctx.mov arrR[i], rRAX + ctx.mov arrR[i], rax # Codegen result.add ctx.generate @@ -202,37 +179,12 @@ macro square_gen[rLen, aLen: static int](r: var Limbs[rLen], a: Limbs[aLen]) = ) ) - # MUL requires RAX and RDX - rRAX = Operand( - desc: OperandDesc( - asmId: "[rax]", - nimSymbol: ident"rax", - rm: RAX, - constraint: Output_EarlyClobber, - cEmit: "rax" - ) - ) - - rRDX = Operand( - desc: OperandDesc( - asmId: "[rdx]", - nimSymbol: ident"rdx", - rm: RDX, - constraint: Output_EarlyClobber, - cEmit: "rdx" - ) - ) - - # Prologue let tsym = t.desc.nimSymbol let usym = u.desc.nimSymbol let vsym = v.desc.nimSymbol - let eax = rRAX.desc.nimSymbol - let edx = rRDX.desc.nimSymbol result.add quote do: var `tsym`{.noInit.}, `usym`{.noInit.}, `vsym`{.noInit.}: BaseType # zero-init - var `eax`{.noInit.}, `edx`{.noInit.}: BaseType # Algorithm ctx.`xor` u, u @@ -252,20 +204,20 @@ macro square_gen[rLen, aLen: static int](r: var Limbs[rLen], a: Limbs[aLen]) = let k2 = ib-j if k1 < k2: # (t, u, v) <- (t, u, v) + 2 * a[k1] * a[k2] - ctx.mov rRAX, arrA[k2] + ctx.mov rax, arrA[k2] ctx.mul rdx, rax, arrA[k1], rax - ctx.add rRAX, rRAX - ctx.adc rRDX, rRDX + ctx.add rax, rax + ctx.adc rdx, rdx ctx.adc t, 0 - ctx.add v, rRAX - ctx.adc u, rRDX + ctx.add v, rax + ctx.adc u, rdx ctx.adc t, 0 elif k1 == k2: # (t, u, v) <- (t, u, v) + a[k1] * a[k2] - ctx.mov rRAX, arrA[k2] + ctx.mov rax, arrA[k2] ctx.mul rdx, rax, arrA[k1], rax - ctx.add v, rRAX - ctx.adc u, rRDX + ctx.add v, rax + ctx.adc u, rdx ctx.adc t, 0 else: discard @@ -278,9 +230,9 @@ macro square_gen[rLen, aLen: static int](r: var Limbs[rLen], a: Limbs[aLen]) = ctx.`xor` t, t if aLen*2 < rLen: - ctx.`xor` rRAX, rRAX + ctx.`xor` rax, rax for i in aLen*2 ..< rLen: - ctx.mov arrR[i], rRAX + ctx.mov arrR[i], rax # Codegen result.add ctx.generate diff --git a/constantine/arithmetic/assembly/limbs_asm_mul_x86_adx_bmi2.nim b/constantine/arithmetic/assembly/limbs_asm_mul_x86_adx_bmi2.nim index f56fcdd..7f91b27 100644 --- a/constantine/arithmetic/assembly/limbs_asm_mul_x86_adx_bmi2.nim +++ b/constantine/arithmetic/assembly/limbs_asm_mul_x86_adx_bmi2.nim @@ -37,8 +37,7 @@ proc mulx_by_word( ctx: var Assembler_x86, r0: Operand, a, t: OperandArray, - word0: Operand, - rRAX, rRDX: Operand + word0: Operand ) = ## Multiply the `a[0..