constantine/constantine/arithmetic/limbs_asm_montmul_x86.nim
Mamy André-Ratsimbazafy 5e8b1870a6
Rename files
2020-07-24 23:08:00 +02:00

224 lines
6.8 KiB
Nim

# Constantine
# Copyright (c) 2018-2019 Status Research & Development GmbH
# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
# Licensed and distributed under either of
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
# at your option. This file may not be copied, modified, or distributed except according to those terms.
import
# Standard library
std/macros,
# Internal
../config/common,
../primitives,
./limbs_generic
# ############################################################
#
# Assembly implementation of finite fields
#
# ############################################################
# Note: We can refer to at most 30 registers in inline assembly
# and "InputOutput" registers count double
# They are nice to let the compiler deals with mov
# but too constraining so we move things ourselves.
static: doAssert UseX86ASM
# Necessary for the compiler to find enough registers (enabled at -O1)
{.localPassC:"-fomit-frame-pointer".}
# Montgomery multiplication
# ------------------------------------------------------------
# Fallback when no ADX and BMI2 support (MULX, ADCX, ADOX)
proc finalSub*(
ctx: var Assembler_x86,
r: Operand or OperandArray,
t, M, scratch: OperandArray
) =
## Reduce `t` into `r` modulo `M`
let N = M.len
ctx.comment "Final substraction"
for i in 0 ..< N:
ctx.mov scratch[i], t[i]
if i == 0:
ctx.sub scratch[i], M[i]
else:
ctx.sbb scratch[i], M[i]
# If we borrowed it means that we were smaller than
# the modulus and we don't need "scratch"
for i in 0 ..< N:
ctx.cmovnc t[i], scratch[i]
ctx.mov r[i], t[i]
macro montMul_CIOS_nocarry_gen[N: static int](r_MM: var Limbs[N], a_MM, b_MM, M_MM: Limbs[N], m0ninv_MM: BaseType): untyped =
## Generate an optimized Montgomery Multiplication kernel
## using the CIOS method
##
## The multiplication and reduction are further merged in the same loop
##
## This requires the most significant word of the Modulus
## M[^1] < high(SecretWord) shr 2 (i.e. less than 0b00111...1111)
## https://hackmd.io/@zkteam/modular_multiplication
result = newStmtList()
var ctx = init(Assembler_x86, BaseType)
let
scratchSlots = max(N, 6)
# We could force M as immediate by specializing per moduli
M = init(OperandArray, nimSymbol = M_MM, N, PointerInReg, Input)
# If N is too big, we need to spill registers. TODO.
t = init(OperandArray, nimSymbol = ident"t", N, ElemsInReg, Output_EarlyClobber)
# MultiPurpose Register slots
scratch = init(OperandArray, nimSymbol = ident"scratch", scratchSlots, ElemsInReg, InputOutput_EnsureClobber)
# MUL requires RAX and RDX
rRAX = Operand(
desc: OperandDesc(
asmId: "[rax]",
nimSymbol: ident"rax",
rm: RAX,
constraint: Output_EarlyClobber,
cEmit: "rax"
)
)
rRDX = Operand(
desc: OperandDesc(
asmId: "[rdx]",
nimSymbol: ident"rdx",
rm: RDX,
constraint: Output_EarlyClobber,
cEmit: "rdx"
)
)
m0ninv = Operand(
desc: OperandDesc(
asmId: "[m0ninv]",
nimSymbol: m0ninv_MM,
rm: MemOffsettable,
constraint: Input,
cEmit: "&" & $m0ninv_MM
)
)
# We're really constrained by register and somehow setting as memory doesn't help
# So we store the result `r` in the scratch space and then reload it in RDX
# before the scratchspace is used in final substraction
a = scratch[0].asArrayAddr(len = N) # Store the `a` operand
b = scratch[1].asArrayAddr(len = N) # Store the `b` operand
A = scratch[2] # High part of extended precision multiplication
C = scratch[3]
m = scratch[4] # Stores (t[0] * m0ninv) mod 2^w
r = scratch[5] # Stores the `r` operand
# Registers used:
# - 1 for `M`
# - 6 for `t` (at most)
# - 6 for `scratch`
# - 2 for RAX and RDX
# Total 15 out of 16
# We can save 1 by hardcoding M as immediate (and m0ninv)
# but this prevent reusing the same code for multiple curves like BLS12-377 and BLS12-381
# We might be able to save registers by having `r` and `M` be memory operand as well
let tsym = t.nimSymbol
let scratchSym = scratch.nimSymbol
let eax = rRAX.desc.nimSymbol
let edx = rRDX.desc.nimSymbol
result.add quote do:
static: doAssert: sizeof(SecretWord) == sizeof(ByteAddress)
var `tsym`: typeof(`r_MM`) # zero init
# Assumes 64-bit limbs on 64-bit arch (or you can't store an address)
var `scratchSym` {.noInit.}: Limbs[`scratchSlots`]
var `eax`{.noInit.}, `edx`{.noInit.}: BaseType
`scratchSym`[0] = cast[SecretWord](`a_MM`[0].unsafeAddr)
`scratchSym`[1] = cast[SecretWord](`b_MM`[0].unsafeAddr)
`scratchSym`[5] = cast[SecretWord](`r_MM`[0].unsafeAddr)
# Algorithm
# -----------------------------------------
# for i=0 to N-1
# (A, t[0]) <- a[0] * b[i] + t[0]
# m <- (t[0] * m0ninv) mod 2^w
# (C, _) <- m * M[0] + t[0]
# for j=1 to N-1
# (A, t[j]) <- a[j] * b[i] + A + t[j]
# (C, t[j-1]) <- m * M[j] + C + t[j]
#
# t[N-1] = C + A
# No register spilling handling
doAssert N <= 6, "The Assembly-optimized montgomery multiplication requires at most 6 limbs."
for i in 0 ..< N:
# (A, t[0]) <- a[0] * b[i] + t[0]
ctx.mov rRAX, a[0]
ctx.mul rdx, rax, b[i], rax
if i == 0: # overwrite t[0]
ctx.mov t[0], rRAX
else: # Accumulate in t[0]
ctx.add t[0], rRAX
ctx.adc rRDX, 0
ctx.mov A, rRDX
# m <- (t[0] * m0ninv) mod 2^w
ctx.mov m, m0ninv
ctx.imul m, t[0]
# (C, _) <- m * M[0] + t[0]
ctx.`xor` C, C
ctx.mov rRAX, M[0]
ctx.mul rdx, rax, m, rax
ctx.add rRAX, t[0]
ctx.adc C, rRDX
for j in 1 ..< N:
# (A, t[j]) <- a[j] * b[i] + A + t[j]
ctx.mov rRAX, a[j]
ctx.mul rdx, rax, b[i], rax
if i == 0:
ctx.mov t[j], A
else:
ctx.add t[j], A
ctx.adc rRDX, 0
ctx.`xor` A, A
ctx.add t[j], rRAX
ctx.adc A, rRDX
# (C, t[j-1]) <- m * M[j] + C + t[j]
ctx.mov rRAX, M[j]
ctx.mul rdx, rax, m, rax
ctx.add C, t[j]
ctx.adc rRDX, 0
ctx.add C, rRAX
ctx.adc rRDX, 0
ctx.mov t[j-1], C
ctx.mov C, rRDX
ctx.add A, C
ctx.mov t[N-1], A
ctx.mov rRDX, r
let r2 = rRDX.asArrayAddr(len = N)
ctx.finalSub(
r2, t, M,
scratch
)
result.add ctx.generate
func montMul_CIOS_nocarry_asm*(r: var Limbs, a, b, M: Limbs, m0ninv: BaseType) =
## Constant-time modular multiplication
montMul_CIOS_nocarry_gen(r, a, b, M, m0ninv)