Low-level refactoring (#175)

* Add specific fromMont conversion routine. Rename montyResidue to getMont

* missed test file

* Add x86_64 ASM for fromMont

* Add x86_64 MULX/ADCX/ADOX for fromMont

* rework Montgomery Multiplication with prefetch/latency hiding techniques

* Fix ADX autodetection, closes #174. Rollback faster mul_mont attempt, no improvement and debug pain.

* finalSub in fromMont & adx_bmi -> adx

* Some {.noInit.} to avoid Nim zeroMem (which should be optimized away but who knows)

* Uniformize name 'op+domain': mulmod -> mulmont

* Fix asm codegen bug "0x0000555555565930 <+896>:   sbb    0x20(%r8),%r8" with Clang in the final subtraction

* Prepare for skipping the final subtraction

* Don't forget to copy the result when we skip the final subtraction

* Seems like we need to stash the idea of skipping the final subtraction for now; it needs bounds analysis https://eprint.iacr.org/2017/1057.pdf

* fix condition for ASM 32-bit

* Optimize modular addition when a spare bit is available
Mamy Ratsimbazafy 2022-02-14 00:16:55 +01:00 committed by GitHub
parent 53c4db7ead
commit 14af7e8724
26 changed files with 986 additions and 557 deletions

View File

@ -91,6 +91,20 @@ proc sqrBench*(T: typedesc, iters: int) =
bench("Squaring", T, iters):
r.square(x)
proc toBigBench*(T: typedesc, iters: int) =
var r: matchingBigInt(T.C)
let x = rng.random_unsafe(T)
preventOptimAway(r)
bench("BigInt <- field conversion", T, iters):
r.fromField(x)
proc toFieldBench*(T: typedesc, iters: int) =
var r: T
let x = rng.random_unsafe(matchingBigInt(T.C))
preventOptimAway(r)
bench("BigInt -> field conversion", T, iters):
r.fromBig(x)
proc invBench*(T: typedesc, iters: int) =
var r: T
let x = rng.random_unsafe(T)

View File

@ -50,6 +50,9 @@ proc main() =
mulBench(Fp[curve], Iters)
sqrBench(Fp[curve], Iters)
smallSeparator()
toBigBench(Fp[curve], Iters)
toFieldBench(Fp[curve], Iters)
smallSeparator()
invBench(Fp[curve], ExponentIters)
sqrtBench(Fp[curve], ExponentIters)
sqrtRatioBench(Fp[curve], ExponentIters)

View File

@ -24,14 +24,111 @@ import
# They are nice to let the compiler deals with mov
# but too constraining so we move things ourselves.
static: doAssert UseASM_X86_64
static: doAssert UseASM_X86_32
{.localPassC:"-fomit-frame-pointer".} # Needed so that the compiler finds enough registers
proc finalSubNoCarryImpl*(
ctx: var Assembler_x86,
r: Operand or OperandArray,
a, M, scratch: OperandArray
) =
## Reduce `a` into `r` modulo `M`
##
## r, a, scratch, scratchReg are mutated
## M is read-only
let N = M.len
ctx.comment "Final substraction (no carry)"
for i in 0 ..< N:
ctx.mov scratch[i], a[i]
if i == 0:
ctx.sub scratch[i], M[i]
else:
ctx.sbb scratch[i], M[i]
# If we borrowed it means that we were smaller than
# the modulus and we don't need "scratch"
for i in 0 ..< N:
ctx.cmovnc a[i], scratch[i]
ctx.mov r[i], a[i]
proc finalSubMayCarryImpl*(
ctx: var Assembler_x86,
r: Operand or OperandArray,
a, M, scratch: OperandArray,
scratchReg: Operand or Register or OperandReuse
) =
## Reduce `a` into `r` modulo `M`
## To be used when the final substraction can
## also depend on the carry flag
##
## r, a, scratch, scratchReg are mutated
## M is read-only
ctx.comment "Final substraction (may carry)"
# Mask: scratchReg contains 0xFFFF or 0x0000
ctx.sbb scratchReg, scratchReg
# Now substract the modulus to test a < p
let N = M.len
for i in 0 ..< N:
ctx.mov scratch[i], a[i]
if i == 0:
ctx.sub scratch[i], M[i]
else:
ctx.sbb scratch[i], M[i]
# If it overflows here, it means that it was
# smaller than the modulus and we don't need `scratch`
ctx.sbb scratchReg, 0
# If we borrowed it means that we were smaller than
# the modulus and we don't need "scratch"
for i in 0 ..< N:
ctx.cmovnc a[i], scratch[i]
ctx.mov r[i], a[i]
macro finalSub_gen*[N: static int](
r_PIR: var array[N, SecretWord],
a_EIR, M_PIR: array[N, SecretWord],
scratch_EIR: var array[N, SecretWord],
mayCarry: static bool): untyped =
## Returns:
## a-M if a > M
## a otherwise
##
## - r_PIR is a pointer to the result array, mutated,
## - a_EIR is an array of registers, mutated,
## - M_PIR is a pointer to an array, read-only,
## - scratch_EIR is an array of registers, mutated
## - mayCarry is set to true when the carry flag also needs to be read
result = newStmtList()
var ctx = init(Assembler_x86, BaseType)
let
r = init(OperandArray, nimSymbol = r_PIR, N, PointerInReg, InputOutput)
# We reuse the reg used for b for overflow detection
a = init(OperandArray, nimSymbol = a_EIR, N, ElemsInReg, InputOutput)
# We could force m as immediate by specializing per moduli
M = init(OperandArray, nimSymbol = M_PIR, N, PointerInReg, Input)
t = init(OperandArray, nimSymbol = scratch_EIR, N, ElemsInReg, Output_EarlyClobber)
if mayCarry:
ctx.finalSubMayCarryImpl(
r, a, M, t, rax
)
else:
ctx.finalSubNoCarryImpl(
r, a, M, t
)
result.add ctx.generate()
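
For readers following the generator: a plain-Nim sketch of the no-carry path, on illustrative base-256 limbs held in `int` (the limb count and base are assumptions, and the branch below is only for readability; the assembly above keeps this constant-time with the SUB/SBB chain and CMOVNC). The may-carry variant only differs in that the incoming carry flag is folded into the borrow decision.

proc finalSubNoCarryRef(a, M: array[4, int]): array[4, int] =
  ## Reduce a (assumed < 2M) into [0, M): compute a - M with a borrow chain,
  ## then keep `a` if the subtraction borrowed, the difference otherwise.
  var scratch: array[4, int]
  var borrow = 0
  for i in 0 ..< 4:
    var d = a[i] - M[i] - borrow            # the sub/sbb chain
    borrow = if d < 0: 1 else: 0
    if d < 0: d += 256
    scratch[i] = d
  result = if borrow == 1: a else: scratch  # cmovnc in the assembly
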
# Field addition
# ------------------------------------------------------------
macro addmod_gen[N: static int](R: var Limbs[N], A, B, m: Limbs[N]): untyped =
macro addmod_gen[N: static int](R: var Limbs[N], A, B, m: Limbs[N], hasSpareBit: static bool): untyped =
## Generate an optimized modular addition kernel
# Register pressure note:
# We could generate a kernel per modulus m by hardcoding it as immediate
@ -68,33 +165,18 @@ macro addmod_gen[N: static int](R: var Limbs[N], A, B, m: Limbs[N]): untyped =
# Interleaved copy in a second buffer as well
ctx.mov v[i], u[i]
# Mask: overflowed contains 0xFFFF or 0x0000
# TODO: unnecessary if MSB never set, i.e. "Field.getSpareBits >= 1"
let overflowed = b.reuseRegister()
ctx.sbb overflowed, overflowed
# Now substract the modulus to test a < p
for i in 0 ..< N:
if i == 0:
ctx.sub v[0], M[0]
if hasSpareBit:
ctx.finalSubNoCarryImpl(r, u, M, v)
else:
ctx.sbb v[i], M[i]
ctx.finalSubMayCarryImpl(
r, u, M, v, b.reuseRegister()
)
# If it overflows here, it means that it was
# smaller than the modulus and we don't need V
ctx.sbb overflowed, 0
result.add ctx.generate()
# Conditional Mov and
# and store result
for i in 0 ..< N:
ctx.cmovnc u[i], v[i]
ctx.mov r[i], u[i]
result.add ctx.generate
func addmod_asm*(r: var Limbs, a, b, m: Limbs) =
func addmod_asm*(r: var Limbs, a, b, m: Limbs, hasSpareBit: static bool) =
## Constant-time modular addition
addmod_gen(r, a, b, m)
addmod_gen(r, a, b, m, hasSpareBit)
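
What the new `hasSpareBit` switch changes, sketched on a single illustrative 8-bit limb (the word size and the wrapping `uint8` arithmetic are assumptions of the sketch; the real kernel makes the same selection branchlessly):

proc addmodRef(a, b, M: uint8, hasSpareBit: static bool): uint8 =
  ## Toy single-limb model of addmod_asm; inputs are assumed < M.
  let s = a + b                       # wraps modulo 2^8, like the limb addition
  when hasSpareBit:
    # M < 2^7, so a+b can never wrap: comparing against M is enough
    result = if s >= M: s - M else: s
  else:
    # the addition may carry out of the limb; that carry must also
    # trigger the subtraction (the finalSubMayCarryImpl path)
    let carried = s < a
    result = if carried or s >= M: s - M else: s
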
# Field substraction
# ------------------------------------------------------------
@ -225,7 +307,7 @@ when isMainModule:
debugecho " a: ", a.toHex()
debugecho " b: ", b.toHex()
debugecho " m: ", m.toHex()
addmod_asm(a, a, b, m)
addmod_asm(a, a, b, m, hasSpareBit = false)
debugecho "after:"
debugecho " a: ", a.toHex().tolower
debugecho " s: ", s
@ -245,7 +327,7 @@ when isMainModule:
debugecho " a: ", a.toHex()
debugecho " b: ", b.toHex()
debugecho " m: ", m.toHex()
addmod_asm(a, a, b, m)
addmod_asm(a, a, b, m, hasSpareBit = false)
debugecho "after:"
debugecho " a: ", a.toHex().tolower
debugecho " s: ", s
@ -265,7 +347,7 @@ when isMainModule:
debugecho " a: ", a.toHex()
debugecho " b: ", b.toHex()
debugecho " m: ", m.toHex()
addmod_asm(a, a, b, m)
addmod_asm(a, a, b, m, hasSpareBit = false)
debugecho "after:"
debugecho " a: ", a.toHex().tolower
debugecho " s: ", s
@ -285,7 +367,7 @@ when isMainModule:
debugecho " a: ", a.toHex()
debugecho " b: ", b.toHex()
debugecho " m: ", m.toHex()
addmod_asm(a, a, b, m)
addmod_asm(a, a, b, m, hasSpareBit = false)
debugecho "after:"
debugecho " a: ", a.toHex().tolower
debugecho " s: ", s
@ -308,7 +390,7 @@ when isMainModule:
debugecho " a: ", a.toHex()
debugecho " b: ", b.toHex()
debugecho " m: ", m.toHex()
submod_asm(a, a, b, m)
submod_asm(a, a, b, m, hasSpareBit = false)
debugecho "after:"
debugecho " a: ", a.toHex().tolower
debugecho " s: ", s
@ -333,7 +415,7 @@ when isMainModule:
debugecho " a: ", a.toHex()
debugecho " b: ", b.toHex()
debugecho " m: ", m.toHex()
submod_asm(r, a, b, m)
submod_asm(r, a, b, m, hasSpareBit = false)
debugecho "after:"
debugecho " r: ", r.toHex().tolower
debugecho " s: ", s

View File

@ -1,160 +0,0 @@
# Constantine
# Copyright (c) 2018-2019 Status Research & Development GmbH
# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
# Licensed and distributed under either of
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
# at your option. This file may not be copied, modified, or distributed except according to those terms.
import
# Standard library
std/macros,
# Internal
../../config/common,
../../primitives,
./limbs_asm_montred_x86
# ############################################################
#
# Assembly implementation of finite fields
#
# ############################################################
static: doAssert UseASM_X86_64
# MULX/ADCX/ADOX
{.localPassC:"-madx -mbmi2".}
# Necessary for the compiler to find enough registers (enabled at -O1)
{.localPassC:"-fomit-frame-pointer".}
# No exceptions allowed
{.push raises: [].}
# Montgomery reduction
# ------------------------------------------------------------
macro montyRedc2x_adx_gen*[N: static int](
r_MR: var array[N, SecretWord],
a_MR: array[N*2, SecretWord],
M_MR: array[N, SecretWord],
m0ninv_MR: BaseType,
hasSpareBit: static bool
) =
result = newStmtList()
var ctx = init(Assembler_x86, BaseType)
let
# We could force M as immediate by specializing per moduli
M = init(OperandArray, nimSymbol = M_MR, N, PointerInReg, Input)
let uSlots = N+1
let vSlots = max(N-1, 5)
var # Scratchspaces
u = init(OperandArray, nimSymbol = ident"U", uSlots, ElemsInReg, InputOutput_EnsureClobber)
v = init(OperandArray, nimSymbol = ident"V", vSlots, ElemsInReg, InputOutput_EnsureClobber)
# Prologue
let usym = u.nimSymbol
let vsym = v.nimSymbol
result.add quote do:
static: doAssert: sizeof(SecretWord) == sizeof(ByteAddress)
var `usym`{.noinit.}: Limbs[`uSlots`]
var `vsym` {.noInit.}: Limbs[`vSlots`]
`vsym`[0] = cast[SecretWord](`r_MR`[0].unsafeAddr)
`vsym`[1] = cast[SecretWord](`a_MR`[0].unsafeAddr)
`vsym`[2] = SecretWord(`m0ninv_MR`)
let r_temp = v[0].asArrayAddr(len = N)
let a = v[1].asArrayAddr(len = 2*N)
let m0ninv = v[2]
let lo = v[3]
let hi = v[4]
# Algorithm
# ---------------------------------------------------------
# for i in 0 .. n-1:
# hi <- 0
# m <- a[i] * m0ninv mod 2^w (i.e. simple multiplication)
# for j in 0 .. n-1:
# (hi, lo) <- a[i+j] + m * M[j] + hi
# a[i+j] <- lo
# a[i+n] += hi
# for i in 0 .. n-1:
# r[i] = a[i+n]
# if r >= M:
# r -= M
# No register spilling handling
doAssert N <= 6, "The Assembly-optimized montgomery multiplication requires at most 6 limbs."
ctx.mov rdx, m0ninv
for i in 0 ..< N:
ctx.mov u[i], a[i]
for i in 0 ..< N:
# RDX contains m0ninv at the start of each loop
ctx.comment ""
ctx.imul rdx, u[0] # m <- a[i] * m0ninv mod 2^w
ctx.comment "---- Reduction " & $i
ctx.`xor` u[N], u[N]
for j in 0 ..< N-1:
ctx.comment ""
ctx.mulx hi, lo, M[j], rdx
ctx.adcx u[j], lo
ctx.adox u[j+1], hi
# Last limb
ctx.comment ""
ctx.mulx hi, lo, M[N-1], rdx
ctx.mov rdx, m0ninv # Reload m0ninv for next iter
ctx.adcx u[N-1], lo
ctx.adox hi, u[N]
ctx.adcx u[N], hi
u.rotateLeft()
ctx.mov rdx, r_temp
let r = rdx.asArrayAddr(len = N)
# This does a[i+n] += hi
# but in a separate carry chain, fused with the
# copy "r[i] = a[i+n]"
for i in 0 ..< N:
if i == 0:
ctx.add u[i], a[i+N]
else:
ctx.adc u[i], a[i+N]
let t = repackRegisters(v, u[N])
if hasSpareBit:
ctx.finalSubNoCarry(r, u, M, t)
else:
ctx.finalSubCanOverflow(r, u, M, t, hi)
# Code generation
result.add ctx.generate()
func montRed_asm_adx_bmi2_impl*[N: static int](
r: var array[N, SecretWord],
a: array[N*2, SecretWord],
M: array[N, SecretWord],
m0ninv: BaseType,
hasSpareBit: static bool
) =
## Constant-time Montgomery reduction
## Inline-version
montyRedc2x_adx_gen(r, a, M, m0ninv, hasSpareBit)
func montRed_asm_adx_bmi2*[N: static int](
r: var array[N, SecretWord],
a: array[N*2, SecretWord],
M: array[N, SecretWord],
m0ninv: BaseType,
hasSpareBit: static bool
) =
## Constant-time Montgomery reduction
montRed_asm_adx_bmi2_impl(r, a, M, m0ninv, hasSpareBit)

View File

@ -12,7 +12,8 @@ import
# Internal
../../config/common,
../../primitives,
./limbs_asm_montred_x86,
./limbs_asm_modular_x86,
./limbs_asm_redc_mont_x86,
./limbs_asm_mul_x86
# ############################################################
@ -34,7 +35,11 @@ static: doAssert UseASM_X86_64
# Montgomery multiplication
# ------------------------------------------------------------
# Fallback when no ADX and BMI2 support (MULX, ADCX, ADOX)
macro montMul_CIOS_sparebit_gen[N: static int](r_MM: var Limbs[N], a_MM, b_MM, M_MM: Limbs[N], m0ninv_MM: BaseType): untyped =
macro mulMont_CIOS_sparebit_gen[N: static int](
r_PIR: var Limbs[N], a_PIR, b_PIR,
M_PIR: Limbs[N], m0ninv_REG: BaseType,
skipReduction: static bool
): untyped =
## Generate an optimized Montgomery Multiplication kernel
## using the CIOS method
##
@ -44,14 +49,17 @@ macro montMul_CIOS_sparebit_gen[N: static int](r_MM: var Limbs[N], a_MM, b_MM, M
## M[^1] < high(SecretWord) shr 2 (i.e. less than 0b00111...1111)
## https://hackmd.io/@zkteam/modular_multiplication
# No register spilling handling
doAssert N <= 6, "The Assembly-optimized montgomery multiplication requires at most 6 limbs."
result = newStmtList()
var ctx = init(Assembler_x86, BaseType)
let
scratchSlots = max(N, 6)
scratchSlots = 6
# We could force M as immediate by specializing per moduli
M = init(OperandArray, nimSymbol = M_MM, N, PointerInReg, Input)
M = init(OperandArray, nimSymbol = M_PIR, N, PointerInReg, Input)
# If N is too big, we need to spill registers. TODO.
t = init(OperandArray, nimSymbol = ident"t", N, ElemsInReg, Output_EarlyClobber)
# MultiPurpose Register slots
@ -62,10 +70,10 @@ macro montMul_CIOS_sparebit_gen[N: static int](r_MM: var Limbs[N], a_MM, b_MM, M
m0ninv = Operand(
desc: OperandDesc(
asmId: "[m0ninv]",
nimSymbol: m0ninv_MM,
nimSymbol: m0ninv_REG,
rm: MemOffsettable,
constraint: Input,
cEmit: "&" & $m0ninv_MM
cEmit: "&" & $m0ninv_REG
)
)
@ -76,7 +84,7 @@ macro montMul_CIOS_sparebit_gen[N: static int](r_MM: var Limbs[N], a_MM, b_MM, M
b = scratch[1].asArrayAddr(len = N) # Store the `b` operand
A = scratch[2] # High part of extended precision multiplication
C = scratch[3]
m = scratch[4] # Stores (t[0] * m0ninv) mod 2^w
m = scratch[4] # Stores (t[0] * m0ninv) mod 2ʷ
r = scratch[5] # Stores the `r` operand
# Registers used:
@ -94,18 +102,18 @@ macro montMul_CIOS_sparebit_gen[N: static int](r_MM: var Limbs[N], a_MM, b_MM, M
result.add quote do:
static: doAssert: sizeof(SecretWord) == sizeof(ByteAddress)
var `tsym`: typeof(`r_MM`) # zero init
var `tsym`: typeof(`r_PIR`) # zero init
# Assumes 64-bit limbs on 64-bit arch (or you can't store an address)
var `scratchSym` {.noInit.}: Limbs[`scratchSlots`]
`scratchSym`[0] = cast[SecretWord](`a_MM`[0].unsafeAddr)
`scratchSym`[1] = cast[SecretWord](`b_MM`[0].unsafeAddr)
`scratchSym`[5] = cast[SecretWord](`r_MM`[0].unsafeAddr)
`scratchSym`[0] = cast[SecretWord](`a_PIR`[0].unsafeAddr)
`scratchSym`[1] = cast[SecretWord](`b_PIR`[0].unsafeAddr)
`scratchSym`[5] = cast[SecretWord](`r_PIR`[0].unsafeAddr)
# Algorithm
# -----------------------------------------
# for i=0 to N-1
# (A, t[0]) <- a[0] * b[i] + t[0]
# m <- (t[0] * m0ninv) mod 2^w
# m <- (t[0] * m0ninv) mod 2ʷ
# (C, _) <- m * M[0] + t[0]
# for j=1 to N-1
# (A, t[j]) <- a[j] * b[i] + A + t[j]
@ -127,7 +135,7 @@ macro montMul_CIOS_sparebit_gen[N: static int](r_MM: var Limbs[N], a_MM, b_MM, M
ctx.adc rdx, 0
ctx.mov A, rdx
# m <- (t[0] * m0ninv) mod 2^w
# m <- (t[0] * m0ninv) mod 2ʷ
ctx.mov m, m0ninv
ctx.imul m, t[0]
@ -164,19 +172,27 @@ macro montMul_CIOS_sparebit_gen[N: static int](r_MM: var Limbs[N], a_MM, b_MM, M
ctx.add A, C
ctx.mov t[N-1], A
ctx.mov rdx, r
let r2 = rdx.asArrayAddr(len = N)
ctx.mov rax, r # move r away from scratchspace that will be used for final substraction
let r2 = rax.asArrayAddr(len = N)
ctx.finalSubNoCarry(
if skipReduction:
for i in 0 ..< N:
ctx.mov r2[i], t[i]
else:
ctx.finalSubNoCarryImpl(
r2, t, M,
scratch
)
result.add ctx.generate()
result.add ctx.generate
func montMul_CIOS_sparebit_asm*(r: var Limbs, a, b, M: Limbs, m0ninv: BaseType) =
## Constant-time modular multiplication
montMul_CIOS_sparebit_gen(r, a, b, M, m0ninv)
func mulMont_CIOS_sparebit_asm*(r: var Limbs, a, b, M: Limbs, m0ninv: BaseType, skipReduction: static bool = false) =
## Constant-time Montgomery multiplication
## If "skipReduction" is set
## the result is in the range [0, 2M)
## otherwise the result is in the range [0, M)
##
## This procedure can only be called if the modulus doesn't use the full bitwidth of its underlying representation
r.mulMont_CIOS_sparebit_gen(a, b, M, m0ninv, skipReduction)
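
For reference, the CIOS recurrence that the generator above schedules, written as plain Nim over toy 8-bit words so every intermediate fits in an `int` (the limb count, word size and value-level final subtraction are simplifications; this is the textbook algorithm, not the exact register scheduling):

const W = 8                             # toy word width
const B = 1 shl W                       # base 2^W

proc mulMontCIOSRef(a, b, M: array[4, int], m0ninv: int): array[4, int] =
  ## Textbook CIOS: returns a*b*R⁻¹ mod M with R = B⁴, for inputs < M.
  var t: array[6, int]                  # N+2 scratch words
  for i in 0 ..< 4:
    var c = 0
    for j in 0 ..< 4:                   # t += a * b[i]
      let s = t[j] + a[j]*b[i] + c
      t[j] = s mod B
      c = s div B
    t[5] = (t[4] + c) div B
    t[4] = (t[4] + c) mod B
    let m = (t[0] * m0ninv) mod B       # m <- (t[0] * m0ninv) mod 2ʷ
    c = (t[0] + m*M[0]) div B           # low word is 0 by construction
    for j in 1 ..< 4:                   # fold m*M into t and shift down one word
      let s = t[j] + m*M[j] + c
      t[j-1] = s mod B
      c = s div B
    let s = t[4] + c
    t[3] = s mod B
    t[4] = t[5] + s div B
    t[5] = 0
  # Final conditional subtraction, at the value level for brevity
  var v = 0
  for j in countdown(4, 0): v = v*B + t[j]
  var mv = 0
  for j in countdown(3, 0): mv = mv*B + M[j]
  if v >= mv: v -= mv
  for j in 0 ..< 4:
    result[j] = v mod B
    v = v div B

# Toy check with M = 239 packed into the least-significant limb
# (m0ninv = -M⁻¹ mod 256 = 241): 123 * 45 * R⁻¹ mod 239 = 235.
doAssert mulMontCIOSRef([123, 0, 0, 0], [45, 0, 0, 0], [239, 0, 0, 0], 241) == [235, 0, 0, 0]
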
# Montgomery Squaring
# ------------------------------------------------------------
@ -189,25 +205,12 @@ func square_asm_inline[rLen, aLen: static int](r: var Limbs[rLen], a: Limbs[aLen
## but not for stack variables
sqr_gen(r, a)
func montRed_asm_inline[N: static int](
r: var array[N, SecretWord],
a: array[N*2, SecretWord],
M: array[N, SecretWord],
m0ninv: BaseType,
hasSpareBit: static bool
) {.inline.} =
## Constant-time Montgomery reduction
## Extra indirection as the generator assumes that
## arrays are pointers, which is true for parameters
## but not for stack variables
montyRedc2x_gen(r, a, M, m0ninv, hasSpareBit)
func montSquare_CIOS_asm*[N](
func squareMont_CIOS_asm*[N](
r: var Limbs[N],
a, M: Limbs[N],
m0ninv: BaseType,
hasSpareBit: static bool) =
hasSpareBit, skipReduction: static bool) =
## Constant-time modular squaring
var r2x {.noInit.}: Limbs[2*N]
r2x.square_asm_inline(a)
r.montRed_asm_inline(r2x, M, m0ninv, hasSpareBit)
r.redcMont_asm_inline(r2x, M, m0ninv, hasSpareBit, skipReduction)

View File

@ -8,12 +8,12 @@
import
# Standard library
std/macros,
std/[macros, algorithm],
# Internal
../../config/common,
../../primitives,
./limbs_asm_montred_x86,
./limbs_asm_montred_x86_adx_bmi2,
./limbs_asm_modular_x86,
./limbs_asm_redc_mont_x86_adx_bmi2,
./limbs_asm_mul_x86_adx_bmi2
# ############################################################
@ -36,6 +36,7 @@ static: doAssert UseASM_X86_64
# Montgomery Multiplication
# ------------------------------------------------------------
proc mulx_by_word(
ctx: var Assembler_x86,
hi: Operand,
@ -149,7 +150,7 @@ proc partialRedx(
ctx.mov rdx, t[0]
ctx.imul rdx, m0ninv
# Clear carry flags - TODO: necessary?
# Clear carry flags
ctx.`xor` S, S
# S,_ := t[0] + m*M[0]
@ -158,6 +159,8 @@ proc partialRedx(
ctx.adcx lo, t[0] # set the carry flag for the future ADCX
ctx.mov t[0], S
ctx.mov lo, 0
# for j=1 to N-1
# (S,t[j-1]) := t[j] + m*M[j] + S
ctx.comment " for j=1 to N-1"
@ -170,26 +173,31 @@ proc partialRedx(
# Last carries
# t[N-1] = S + C
ctx.comment " Reduction carry "
ctx.mov S, 0
ctx.adcx t[N-1], S
ctx.adox t[N-1], C
ctx.adcx lo, C # lo contains 0 so C += S
ctx.adox t[N-1], lo
macro montMul_CIOS_sparebit_adx_bmi2_gen[N: static int](r_MM: var Limbs[N], a_MM, b_MM, M_MM: Limbs[N], m0ninv_MM: BaseType): untyped =
macro mulMont_CIOS_sparebit_adx_gen[N: static int](
r_PIR: var Limbs[N], a_PIR, b_PIR,
M_PIR: Limbs[N], m0ninv_REG: BaseType,
skipReduction: static bool): untyped =
## Generate an optimized Montgomery Multiplication kernel
## using the CIOS method
## This requires the most significant word of the Modulus
## M[^1] < high(SecretWord) shr 2 (i.e. less than 0b00111...1111)
## https://hackmd.io/@zkteam/modular_multiplication
# No register spilling handling
doAssert N <= 6, "The Assembly-optimized montgomery multiplication requires at most 6 limbs."
result = newStmtList()
var ctx = init(Assembler_x86, BaseType)
let
scratchSlots = max(N, 6)
scratchSlots = 6
r = init(OperandArray, nimSymbol = r_MM, N, PointerInReg, InputOutput_EnsureClobber)
r = init(OperandArray, nimSymbol = r_PIR, N, PointerInReg, InputOutput_EnsureClobber)
# We could force M as immediate by specializing per moduli
M = init(OperandArray, nimSymbol = M_MM, N, PointerInReg, Input)
M = init(OperandArray, nimSymbol = M_PIR, N, PointerInReg, Input)
# If N is too big, we need to spill registers. TODO.
t = init(OperandArray, nimSymbol = ident"t", N, ElemsInReg, Output_EarlyClobber)
# MultiPurpose Register slots
@ -220,12 +228,12 @@ macro montMul_CIOS_sparebit_adx_bmi2_gen[N: static int](r_MM: var Limbs[N], a_MM
result.add quote do:
static: doAssert: sizeof(SecretWord) == sizeof(ByteAddress)
var `tsym`: typeof(`r_MM`) # zero init
var `tsym`{.noInit.}: typeof(`r_PIR`) # zero init
# Assumes 64-bit limbs on 64-bit arch (or you can't store an address)
var `scratchSym` {.noInit.}: Limbs[`scratchSlots`]
`scratchSym`[0] = cast[SecretWord](`a_MM`[0].unsafeAddr)
`scratchSym`[1] = cast[SecretWord](`b_MM`[0].unsafeAddr)
`scratchSym`[4] = SecretWord `m0ninv_MM`
`scratchSym`[0] = cast[SecretWord](`a_PIR`[0].unsafeAddr)
`scratchSym`[1] = cast[SecretWord](`b_PIR`[0].unsafeAddr)
`scratchSym`[4] = SecretWord `m0ninv_REG`
# Algorithm
# -----------------------------------------
@ -238,9 +246,6 @@ macro montMul_CIOS_sparebit_adx_bmi2_gen[N: static int](r_MM: var Limbs[N], a_MM
# (C,t[j-1]) := t[j] + m*M[j] + C
# t[N-1] = C + A
# No register spilling handling
doAssert N <= 6, "The Assembly-optimized montgomery multiplication requires at most 6 limbs."
for i in 0 ..< N:
if i == 0:
ctx.mulx_by_word(
@ -263,47 +268,35 @@ macro montMul_CIOS_sparebit_adx_bmi2_gen[N: static int](r_MM: var Limbs[N], a_MM
lo, C
)
ctx.finalSubNoCarry(
if skipReduction:
for i in 0 ..< N:
ctx.mov r[i], t[i]
else:
ctx.finalSubNoCarryImpl(
r, t, M,
scratch
)
result.add ctx.generate
func montMul_CIOS_sparebit_asm_adx_bmi2*(r: var Limbs, a, b, M: Limbs, m0ninv: BaseType) =
## Constant-time modular multiplication
## Requires the prime modulus to have a spare bit in the representation. (Hence if using 64-bit words and 4 words, to be at most 255-bit)
montMul_CIOS_sparebit_adx_bmi2_gen(r, a, b, M, m0ninv)
func mulMont_CIOS_sparebit_asm_adx*(r: var Limbs, a, b, M: Limbs, m0ninv: BaseType, skipReduction: static bool = false) =
## Constant-time Montgomery multiplication
## If "skipReduction" is set
## the result is in the range [0, 2M)
## otherwise the result is in the range [0, M)
##
## This procedure can only be called if the modulus doesn't use the full bitwidth of its underlying representation
r.mulMont_CIOS_sparebit_adx_gen(a, b, M, m0ninv, skipReduction)
# Montgomery Squaring
# ------------------------------------------------------------
func square_asm_adx_bmi2_inline[rLen, aLen: static int](r: var Limbs[rLen], a: Limbs[aLen]) {.inline.} =
## Multi-precision Squaring
## Extra indirection as the generator assumes that
## arrays are pointers, which is true for parameters
## but not for stack variables.
sqrx_gen(r, a)
func montRed_asm_adx_bmi2_inline[N: static int](
r: var array[N, SecretWord],
a: array[N*2, SecretWord],
M: array[N, SecretWord],
m0ninv: BaseType,
hasSpareBit: static bool
) {.inline.} =
## Constant-time Montgomery reduction
## Extra indirection as the generator assumes that
## arrays are pointers, which is true for parameters
## but not for stack variables.
montyRedc2x_adx_gen(r, a, M, m0ninv, hasSpareBit)
func montSquare_CIOS_asm_adx_bmi2*[N](
func squareMont_CIOS_asm_adx*[N](
r: var Limbs[N],
a, M: Limbs[N],
m0ninv: BaseType,
hasSpareBit: static bool) =
hasSpareBit, skipReduction: static bool) =
## Constant-time modular squaring
var r2x {.noInit.}: Limbs[2*N]
r2x.square_asm_adx_bmi2_inline(a)
r.montRed_asm_adx_bmi2_inline(r2x, M, m0ninv, hasSpareBit)
r2x.square_asm_adx_inline(a)
r.redcMont_asm_adx(r2x, M, m0ninv, hasSpareBit, skipReduction)

View File

@ -111,7 +111,7 @@ proc mulaccx_by_word(
ctx.adcx hi, rdx
ctx.adox hi, rdx
macro mulx_gen[rLen, aLen, bLen: static int](rx: var Limbs[rLen], ax: Limbs[aLen], bx: Limbs[bLen]) =
macro mulx_gen[rLen, aLen, bLen: static int](r_PIR: var Limbs[rLen], a_PIR: Limbs[aLen], b_PIR: Limbs[bLen]) =
## `a`, `b`, `r` can have a different number of limbs
## if `r`.limbs.len < a.limbs.len + b.limbs.len
## The result will be truncated, i.e. it will be
@ -123,9 +123,9 @@ macro mulx_gen[rLen, aLen, bLen: static int](rx: var Limbs[rLen], ax: Limbs[aLen
var ctx = init(Assembler_x86, BaseType)
let
r = init(OperandArray, nimSymbol = rx, rLen, PointerInReg, InputOutput_EnsureClobber)
a = init(OperandArray, nimSymbol = ax, aLen, PointerInReg, Input)
b = init(OperandArray, nimSymbol = bx, bLen, PointerInReg, Input)
r = init(OperandArray, nimSymbol = r_PIR, rLen, PointerInReg, InputOutput_EnsureClobber)
a = init(OperandArray, nimSymbol = a_PIR, aLen, PointerInReg, Input)
b = init(OperandArray, nimSymbol = b_PIR, bLen, PointerInReg, Input)
# MULX requires RDX
@ -168,24 +168,24 @@ macro mulx_gen[rLen, aLen, bLen: static int](rx: var Limbs[rLen], ax: Limbs[aLen
# Codegen
result.add ctx.generate
func mul_asm_adx_bmi2_impl*[rLen, aLen, bLen: static int](
func mul_asm_adx_inline*[rLen, aLen, bLen: static int](
r: var Limbs[rLen], a: Limbs[aLen], b: Limbs[bLen]) {.inline.} =
## Multi-precision Multiplication
## Assumes r doesn't alias a or b
## Inline version
mulx_gen(r, a, b)
func mul_asm_adx_bmi2*[rLen, aLen, bLen: static int](
func mul_asm_adx*[rLen, aLen, bLen: static int](
r: var Limbs[rLen], a: Limbs[aLen], b: Limbs[bLen]) =
## Multi-precision Multiplication
## Assumes r doesn't alias a or b
mul_asm_adx_bmi2_impl(r, a, b)
mul_asm_adx_inline(r, a, b)
# Squaring
# -----------------------------------------------------------------------------------------------
#
# Strategy:
# We want to use the same scheduling as mul_asm_adx_bmi2
# We want to use the same scheduling as mul_asm_adx
# and so process `a[0..<N]` by `word`
# and store the intermediate result in `[t[n..1]:r0]`
#
@ -561,7 +561,7 @@ func sqrx_gen6L(ctx: var Assembler_x86, r, a: OperandArray, t: var OperandArray)
merge_diag_and_partsum(r, a, hi1, lo1, zero, 4)
merge_diag_and_partsum(r, a, hi2, lo2, zero, 5)
macro sqrx_gen*[rLen, aLen: static int](rx: var Limbs[rLen], ax: Limbs[aLen]) =
macro sqrx_gen*[rLen, aLen: static int](r_PIR: var Limbs[rLen], a_PIR: Limbs[aLen]) =
## Squaring
## `a` and `r` can have a different number of limbs
## if `r`.limbs.len < a.limbs.len * 2
@ -578,8 +578,8 @@ macro sqrx_gen*[rLen, aLen: static int](rx: var Limbs[rLen], ax: Limbs[aLen]) =
# t = 2 * a.len = 12
# We use the full x86 register set.
r = init(OperandArray, nimSymbol = rx, rLen, PointerInReg, InputOutput)
a = init(OperandArray, nimSymbol = ax, aLen, PointerInReg, Input)
r = init(OperandArray, nimSymbol = r_PIR, rLen, PointerInReg, InputOutput)
a = init(OperandArray, nimSymbol = a_PIR, aLen, PointerInReg, Input)
# MULX requires RDX
@ -604,10 +604,15 @@ macro sqrx_gen*[rLen, aLen: static int](rx: var Limbs[rLen], ax: Limbs[aLen]) =
# Codegen
result.add ctx.generate
func square_asm_adx_bmi2*[rLen, aLen: static int](r: var Limbs[rLen], a: Limbs[aLen]) =
func square_asm_adx_inline*[rLen, aLen: static int](r: var Limbs[rLen], a: Limbs[aLen]) {.inline.} =
## Multi-precision Squaring
## inline version
sqrx_gen(r, a)
func square_asm_adx*[rLen, aLen: static int](r: var Limbs[rLen], a: Limbs[aLen]) =
## Multi-precision Squaring
## Assumes r doesn't alias a
sqrx_gen(r, a)
square_asm_adx_inline(r, a)
# Sanity checks
@ -628,8 +633,8 @@ when isMainModule:
var a2x, expected: Limbs[8]
a2x.square_asm_adx_bmi2(a)
expected.mul_asm_adx_bmi2(a, a)
a2x.square_asm_adx(a)
expected.mul_asm_adx(a, a)
debugecho "--------------------------------"
debugecho "before:"
debugecho " a : ", a.toString()
@ -644,8 +649,8 @@ when isMainModule:
var a2x, expected: Limbs[8]
a2x.square_asm_adx_bmi2(a)
expected.mul_asm_adx_bmi2(a, a)
a2x.square_asm_adx(a)
expected.mul_asm_adx(a, a)
debugecho "--------------------------------"
debugecho "before:"
debugecho " a : ", a.toString()
@ -667,8 +672,8 @@ when isMainModule:
var a2x, expected: Limbs[12]
a2x.square_asm_adx_bmi2(a)
expected.mul_asm_adx_bmi2(a, a)
a2x.square_asm_adx(a)
expected.mul_asm_adx(a, a)
debugecho "--------------------------------"
debugecho "before:"
debugecho " a : ", a.toString()
@ -683,8 +688,8 @@ when isMainModule:
var a2x, expected: Limbs[12]
a2x.square_asm_adx_bmi2(a)
expected.mul_asm_adx_bmi2(a, a)
a2x.square_asm_adx(a)
expected.mul_asm_adx(a, a)
debugecho "--------------------------------"
debugecho "before:"
debugecho " a : ", a.toString()

View File

@ -11,7 +11,8 @@ import
std/macros,
# Internal
../../config/common,
../../primitives
../../primitives,
./limbs_asm_modular_x86
# ############################################################
#
@ -25,76 +26,21 @@ static: doAssert UseASM_X86_32
# Necessary for the compiler to find enough registers (enabled at -O1)
{.localPassC:"-fomit-frame-pointer".}
proc finalSubNoCarry*(
ctx: var Assembler_x86,
r: Operand or OperandArray,
a, M, scratch: OperandArray
) =
## Reduce `a` into `r` modulo `M`
let N = M.len
ctx.comment "Final substraction (no carry)"
for i in 0 ..< N:
ctx.mov scratch[i], a[i]
if i == 0:
ctx.sub scratch[i], M[i]
else:
ctx.sbb scratch[i], M[i]
# If we borrowed it means that we were smaller than
# the modulus and we don't need "scratch"
for i in 0 ..< N:
ctx.cmovnc a[i], scratch[i]
ctx.mov r[i], a[i]
proc finalSubCanOverflow*(
ctx: var Assembler_x86,
r: Operand or OperandArray,
a, M, scratch: OperandArray,
overflowReg: Operand or Register
) =
## Reduce `a` into `r` modulo `M`
## To be used when the final substraction can
## also depend on the carry flag
## This is in particular possible when the MSB
## is set for the prime modulus
## `overflowReg` should be a register that will be used
## to store the carry flag
ctx.comment "Final substraction (may carry)"
# Mask: overflowed contains 0xFFFF or 0x0000
ctx.sbb overflowReg, overflowReg
# Now substract the modulus to test a < p
let N = M.len
for i in 0 ..< N:
ctx.mov scratch[i], a[i]
if i == 0:
ctx.sub scratch[i], M[i]
else:
ctx.sbb scratch[i], M[i]
# If it overflows here, it means that it was
# smaller than the modulus and we don't need `scratch`
ctx.sbb overflowReg, 0
# If we borrowed it means that we were smaller than
# the modulus and we don't need "scratch"
for i in 0 ..< N:
ctx.cmovnc a[i], scratch[i]
ctx.mov r[i], a[i]
# Montgomery reduction
# ------------------------------------------------------------
macro montyRedc2x_gen*[N: static int](
r_MR: var array[N, SecretWord],
a_MR: array[N*2, SecretWord],
M_MR: array[N, SecretWord],
m0ninv_MR: BaseType,
hasSpareBit: static bool
macro redc2xMont_gen*[N: static int](
r_PIR: var array[N, SecretWord],
a_PIR: array[N*2, SecretWord],
M_PIR: array[N, SecretWord],
m0ninv_REG: BaseType,
hasSpareBit, skipReduction: static bool
) =
# No register spilling handling
doAssert N > 2, "The Assembly-optimized montgomery reduction requires a minimum of 2 limbs."
doAssert N <= 6, "The Assembly-optimized montgomery reduction requires at most 6 limbs."
result = newStmtList()
var ctx = init(Assembler_x86, BaseType)
@ -103,7 +49,7 @@ macro montyRedc2x_gen*[N: static int](
# so we store everything in scratchspaces restoring as needed
let
# We could force M as immediate by specializing per moduli
M = init(OperandArray, nimSymbol = M_MR, N, PointerInReg, Input)
M = init(OperandArray, nimSymbol = M_PIR, N, PointerInReg, Input)
# MUL requires RAX and RDX
let uSlots = N+2
@ -119,9 +65,9 @@ macro montyRedc2x_gen*[N: static int](
result.add quote do:
var `usym`{.noinit.}: Limbs[`uSlots`]
var `vsym` {.noInit.}: Limbs[`vSlots`]
`vsym`[0] = cast[SecretWord](`r_MR`[0].unsafeAddr)
`vsym`[1] = cast[SecretWord](`a_MR`[0].unsafeAddr)
`vsym`[2] = SecretWord(`m0ninv_MR`)
`vsym`[0] = cast[SecretWord](`r_PIR`[0].unsafeAddr)
`vsym`[1] = cast[SecretWord](`a_PIR`[0].unsafeAddr)
`vsym`[2] = SecretWord(`m0ninv_REG`)
let r_temp = v[0].asArrayAddr(len = N)
let a = v[1].asArrayAddr(len = 2*N)
@ -131,7 +77,7 @@ macro montyRedc2x_gen*[N: static int](
# ---------------------------------------------------------
# for i in 0 .. n-1:
# hi <- 0
# m <- a[i] * m0ninv mod 2^w (i.e. simple multiplication)
# m <- a[i] * m0ninv mod 2ʷ (i.e. simple multiplication)
# for j in 0 .. n-1:
# (hi, lo) <- a[i+j] + m * M[j] + hi
# a[i+j] <- lo
@ -141,15 +87,11 @@ macro montyRedc2x_gen*[N: static int](
# if r >= M:
# r -= M
# No register spilling handling
doAssert N > 2, "The Assembly-optimized montgomery reduction requires a minimum of 2 limbs."
doAssert N <= 6, "The Assembly-optimized montgomery reduction requires at most 6 limbs."
for i in 0 ..< N:
ctx.mov u[i], a[i]
ctx.mov u[N], u[0]
ctx.imul u[0], m0ninv # m <- a[i] * m0ninv mod 2^w
ctx.imul u[0], m0ninv # m <- a[i] * m0ninv mod 2ʷ
ctx.mov rax, u[0]
# scratch: [a[0] * m0, a[1], a[2], a[3], a[0]] for 4 limbs
@ -208,27 +150,138 @@ macro montyRedc2x_gen*[N: static int](
else:
ctx.adc u[i], a[i+N]
# v is invalidated from now on
let t = repackRegisters(v, u[N], u[N+1])
# v is invalidated
if hasSpareBit:
ctx.finalSubNoCarry(r, u, M, t)
if hasSpareBit and skipReduction:
for i in 0 ..< N:
ctx.mov r[i], t[i]
elif hasSpareBit:
ctx.finalSubNoCarryImpl(r, u, M, t)
else:
ctx.finalSubCanOverflow(r, u, M, t, rax)
ctx.finalSubMayCarryImpl(r, u, M, t, rax)
# Code generation
result.add ctx.generate()
func montRed_asm*[N: static int](
func redcMont_asm_inline*[N: static int](
r: var array[N, SecretWord],
a: array[N*2, SecretWord],
M: array[N, SecretWord],
m0ninv: BaseType,
hasSpareBit: static bool
hasSpareBit: static bool,
skipReduction: static bool = false
) {.inline.} =
## Constant-time Montgomery reduction
## Inline-version
redc2xMont_gen(r, a, M, m0ninv, hasSpareBit, skipReduction)
func redcMont_asm*[N: static int](
r: var array[N, SecretWord],
a: array[N*2, SecretWord],
M: array[N, SecretWord],
m0ninv: BaseType,
hasSpareBit, skipReduction: static bool
) =
## Constant-time Montgomery reduction
static: doAssert UseASM_X86_64, "This requires x86-64."
montyRedc2x_gen(r, a, M, m0ninv, hasSpareBit)
redcMont_asm_inline(r, a, M, m0ninv, hasSpareBit, skipReduction)
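
The contract of the reduction above, pinned down on a toy single-word instance (the modulus, word size and m0ninv are illustrative assumptions; the macro does the same thing limb by limb on double-width inputs):

const M = 239                    # toy odd modulus, one 8-bit word
const R = 256                    # R = 2^8
const m0ninv = 241               # -M⁻¹ mod R, since 239*15 ≡ 1 (mod 256)

proc redcRef(a: int): int =
  ## Montgomery reduction of a double-width value a < M*R: returns a*R⁻¹ mod M
  let m = (a * m0ninv) mod R     # m <- a[0] * m0ninv mod 2ʷ
  var u = (a + m * M) div R      # the low word cancels, the division is exact
  if u >= M: u -= M              # final subtraction: [0, 2M) -> [0, M)
  result = u

doAssert redcRef(12345) == 206   # 12345 * R⁻¹ mod 239, with R⁻¹ = 225
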
# Montgomery conversion
# ----------------------------------------------------------
macro mulMont_by_1_gen[N: static int](
t_EIR: var array[N, SecretWord],
M_PIR: array[N, SecretWord],
m0ninv_REG: BaseType) =
# No register spilling handling
doAssert N <= 6, "The Assembly-optimized montgomery reduction requires at most 6 limbs."
result = newStmtList()
var ctx = init(Assembler_x86, BaseType)
# On x86, compilers only let us use 15 out of 16 registers
# RAX and RDX are defacto used due to the MUL instructions
# so we store everything in scratchspaces restoring as needed
let
scratchSlots = 2
t = init(OperandArray, nimSymbol = t_EIR, N, ElemsInReg, InputOutput_EnsureClobber)
# We could force M as immediate by specializing per moduli
M = init(OperandArray, nimSymbol = M_PIR, N, PointerInReg, Input)
# MultiPurpose Register slots
scratch = init(OperandArray, nimSymbol = ident"scratch", scratchSlots, ElemsInReg, InputOutput_EnsureClobber)
# MUL requires RAX and RDX
m0ninv = Operand(
desc: OperandDesc(
asmId: "[m0ninv]",
nimSymbol: m0ninv_REG,
rm: MemOffsettable,
constraint: Input,
cEmit: "&" & $m0ninv_REG
)
)
C = scratch[0] # Stores the high part of the multiplication

m = scratch[1] # Stores (t[0] * m0ninv) mod 2ʷ
let scratchSym = scratch.nimSymbol
# Copy a in t
result.add quote do:
var `scratchSym` {.noInit.}: Limbs[`scratchSlots`]
# Algorithm
# ---------------------------------------------------------
# for i in 0 .. n-1:
# m <- t[0] * m0ninv mod 2ʷ (i.e. simple multiplication)
# C, _ = t[0] + m * M[0]
# for j in 1 .. n-1:
# (C, t[j-1]) <- r[j] + m*M[j] + C
# t[n-1] = C
ctx.comment "for i in 0 ..< N:"
for i in 0 ..< N:
ctx.comment " m <- t[0] * m0ninv mod 2ʷ"
ctx.mov m, m0ninv
ctx.imul m, t[0]
ctx.comment " C, _ = t[0] + m * M[0]"
ctx.`xor` C, C
ctx.mov rax, M[0]
ctx.mul rdx, rax, m, rax
ctx.add rax, t[0]
ctx.adc C, rdx
ctx.comment " for j in 1 .. n-1:"
for j in 1 ..< N:
ctx.comment " (C, t[j-1]) <- r[j] + m*M[j] + C"
ctx.mov rax, M[j]
ctx.mul rdx, rax, m, rax
ctx.add C, t[j]
ctx.adc rdx, 0
ctx.add C, rax
ctx.adc rdx, 0
ctx.mov t[j-1], C
ctx.mov C, rdx
ctx.comment " final carry"
ctx.mov t[N-1], C
result.add ctx.generate()
func fromMont_asm*(r: var Limbs, a, M: Limbs, m0ninv: BaseType) =
## Constant-time Montgomery residue form to BigInt conversion
var t{.noInit.} = a
block:
t.mulMont_by_1_gen(M, m0ninv)
block: # Map from [0, 2p) to [0, p)
var workspace{.noInit.}: typeof(r)
r.finalSub_gen(t, M, workspace, mayCarry = false)
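
A plain-Nim rendering of the by-one Montgomery multiplication loop that fromMont_asm drives, on two illustrative 8-bit limbs (word size, limb count and the toy constants are assumptions; the trailing finalSub_gen above then maps the [0, 2M) output back to [0, M)):

const W = 8
const B = 1 shl W                  # toy word base

proc mulMontBy1Ref(t: var array[2, int], M: array[2, int], m0ninv: int) =
  ## t <- t * B⁻² mod M (up to one final subtraction), mirroring the loop above
  const N = 2
  for i in 0 ..< N:
    let m = (t[0] * m0ninv) mod B  # m <- t[0] * m0ninv mod 2ʷ
    var C = (t[0] + m * M[0]) div B
    for j in 1 ..< N:
      let s = t[j] + m * M[j] + C
      t[j-1] = s mod B
      C = s div B
    t[N-1] = C                     # final carry

# Toy check: M = 200*256 + 239 = 51439 (odd), m0ninv = -M⁻¹ mod 256 = 241,
# t = 12345 in base-256 limbs; the result 23843 satisfies 23843 * 2¹⁶ ≡ 12345 (mod M).
var t = [57, 48]
mulMontBy1Ref(t, [239, 200], 241)
doAssert t == [35, 93]             # 23843, already < M so no subtraction was needed
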
# Sanity checks
# ----------------------------------------------------------
@ -242,7 +295,7 @@ when isMainModule:
# TODO: Properly handle low number of limbs
func montyRedc2x_Comba[N: static int](
func redc2xMont_Comba[N: static int](
r: var array[N, SecretWord],
a: array[N*2, SecretWord],
M: array[N, SecretWord],
@ -298,10 +351,10 @@ when isMainModule:
var a_sqr{.noInit.}, na_sqr{.noInit.}: Limbs[2]
var a_sqr_comba{.noInit.}, na_sqr_comba{.noInit.}: Limbs[2]
a_sqr.montRed_asm(adbl_sqr, M, 1, hasSpareBit = false)
na_sqr.montRed_asm(nadbl_sqr, M, 1, hasSpareBit = false)
a_sqr_comba.montyRedc2x_Comba(adbl_sqr, M, 1)
na_sqr_comba.montyRedc2x_Comba(nadbl_sqr, M, 1)
a_sqr.redcMont_asm(adbl_sqr, M, 1, hasSpareBit = false, skipReduction = false)
na_sqr.redcMont_asm(nadbl_sqr, M, 1, hasSpareBit = false, skipReduction = false)
a_sqr_comba.redc2xMont_Comba(adbl_sqr, M, 1)
na_sqr_comba.redc2xMont_Comba(nadbl_sqr, M, 1)
debugecho "--------------------------------"
debugecho "after:"

View File

@ -0,0 +1,269 @@
# Constantine
# Copyright (c) 2018-2019 Status Research & Development GmbH
# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
# Licensed and distributed under either of
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
# at your option. This file may not be copied, modified, or distributed except according to those terms.
import
# Standard library
std/macros,
# Internal
../../config/common,
../../primitives,
./limbs_asm_modular_x86
# ############################################################
#
# Assembly implementation of finite fields
#
# ############################################################
static: doAssert UseASM_X86_64
# MULX/ADCX/ADOX
{.localPassC:"-madx -mbmi2".}
# Necessary for the compiler to find enough registers (enabled at -O1)
{.localPassC:"-fomit-frame-pointer".}
# No exceptions allowed
{.push raises: [].}
# Montgomery reduction
# ------------------------------------------------------------
macro redc2xMont_adx_gen[N: static int](
r_PIR: var array[N, SecretWord],
a_PIR: array[N*2, SecretWord],
M_PIR: array[N, SecretWord],
m0ninv_REG: BaseType,
hasSpareBit, skipReduction: static bool
) =
# No register spilling handling
doAssert N <= 6, "The Assembly-optimized montgomery multiplication requires at most 6 limbs."
result = newStmtList()
var ctx = init(Assembler_x86, BaseType)
let
# We could force M as immediate by specializing per moduli
M = init(OperandArray, nimSymbol = M_PIR, N, PointerInReg, Input)
let uSlots = N+1
let vSlots = max(N-1, 5)
var # Scratchspaces
u = init(OperandArray, nimSymbol = ident"U", uSlots, ElemsInReg, InputOutput_EnsureClobber)
v = init(OperandArray, nimSymbol = ident"V", vSlots, ElemsInReg, InputOutput_EnsureClobber)
# Prologue
let usym = u.nimSymbol
let vsym = v.nimSymbol
result.add quote do:
static: doAssert: sizeof(SecretWord) == sizeof(ByteAddress)
var `usym`{.noinit.}: Limbs[`uSlots`]
var `vsym` {.noInit.}: Limbs[`vSlots`]
`vsym`[0] = cast[SecretWord](`r_PIR`[0].unsafeAddr)
`vsym`[1] = cast[SecretWord](`a_PIR`[0].unsafeAddr)
`vsym`[2] = SecretWord(`m0ninv_REG`)
let r_temp = v[0].asArrayAddr(len = N)
let a = v[1].asArrayAddr(len = 2*N)
let m0ninv = v[2]
let lo = v[3]
let hi = v[4]
# Algorithm
# ---------------------------------------------------------
# for i in 0 .. n-1:
# hi <- 0
# m <- a[i] * m0ninv mod 2ʷ (i.e. simple multiplication)
# for j in 0 .. n-1:
# (hi, lo) <- a[i+j] + m * M[j] + hi
# a[i+j] <- lo
# a[i+n] += hi
# for i in 0 .. n-1:
# r[i] = a[i+n]
# if r >= M:
# r -= M
ctx.mov rdx, m0ninv
for i in 0 ..< N:
ctx.mov u[i], a[i]
for i in 0 ..< N:
# RDX contains m0ninv at the start of each loop
ctx.comment ""
ctx.imul rdx, u[0] # m <- a[i] * m0ninv mod 2ʷ
ctx.comment "---- Reduction " & $i
ctx.`xor` u[N], u[N]
for j in 0 ..< N-1:
ctx.comment ""
ctx.mulx hi, lo, M[j], rdx
ctx.adcx u[j], lo
ctx.adox u[j+1], hi
# Last limb
ctx.comment ""
ctx.mulx hi, lo, M[N-1], rdx
ctx.mov rdx, m0ninv # Reload m0ninv for next iter
ctx.adcx u[N-1], lo
ctx.adox hi, u[N]
ctx.adcx u[N], hi
u.rotateLeft()
ctx.mov rdx, r_temp
let r = rdx.asArrayAddr(len = N)
# This does a[i+n] += hi
# but in a separate carry chain, fused with the
# copy "r[i] = a[i+n]"
for i in 0 ..< N:
if i == 0:
ctx.add u[i], a[i+N]
else:
ctx.adc u[i], a[i+N]
let t = repackRegisters(v, u[N])
if hasSpareBit and skipReduction:
for i in 0 ..< N:
ctx.mov r[i], t[i]
elif hasSpareBit:
ctx.finalSubNoCarryImpl(r, u, M, t)
else:
ctx.finalSubMayCarryImpl(r, u, M, t, hi)
# Code generation
result.add ctx.generate()
func redcMont_asm_adx_inline*[N: static int](
r: var array[N, SecretWord],
a: array[N*2, SecretWord],
M: array[N, SecretWord],
m0ninv: BaseType,
hasSpareBit: static bool,
skipReduction: static bool = false
) {.inline.} =
## Constant-time Montgomery reduction
## Inline-version
redc2xMont_adx_gen(r, a, M, m0ninv, hasSpareBit, skipReduction)
func redcMont_asm_adx*[N: static int](
r: var array[N, SecretWord],
a: array[N*2, SecretWord],
M: array[N, SecretWord],
m0ninv: BaseType,
hasSpareBit: static bool,
skipReduction: static bool = false
) =
## Constant-time Montgomery reduction
redcMont_asm_adx_inline(r, a, M, m0ninv, hasSpareBit, skipReduction)
# Montgomery conversion
# ----------------------------------------------------------
macro mulMont_by_1_adx_gen[N: static int](
t_EIR: var array[N, SecretWord],
M_PIR: array[N, SecretWord],
m0ninv_REG: BaseType) =
# No register spilling handling
doAssert N <= 6, "The Assembly-optimized montgomery reduction requires at most 6 limbs."
result = newStmtList()
var ctx = init(Assembler_x86, BaseType)
# On x86, compilers only let us use 15 out of 16 registers
# RAX and RDX are defacto used due to the MUL instructions
# so we store everything in scratchspaces restoring as needed
let
scratchSlots = 1
t = init(OperandArray, nimSymbol = t_EIR, N, ElemsInReg, InputOutput_EnsureClobber)
# We could force M as immediate by specializing per moduli
M = init(OperandArray, nimSymbol = M_PIR, N, PointerInReg, Input)
# MultiPurpose Register slots
scratch = init(OperandArray, nimSymbol = ident"scratch", scratchSlots, ElemsInReg, InputOutput_EnsureClobber)
# MUL requires RAX and RDX
m0ninv = Operand(
desc: OperandDesc(
asmId: "[m0ninv]",
nimSymbol: m0ninv_REG,
rm: MemOffsettable,
constraint: Input,
cEmit: "&" & $m0ninv_REG
)
)
C = scratch[0] # Stores the high part of the multiplication
let scratchSym = scratch.nimSymbol
# Copy a in t
result.add quote do:
var `scratchSym` {.noInit.}: Limbs[`scratchSlots`]
# Algorithm
# ---------------------------------------------------------
# for i in 0 .. n-1:
# m <- t[0] * m0ninv mod 2ʷ (i.e. simple multiplication)
# C, _ = t[0] + m * M[0]
# for j in 1 .. n-1:
# (C, t[j-1]) <- r[j] + m*M[j] + C
# t[n-1] = C
# Low-level optimizations
# https://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-optimization-manual.pdf
# Section 3.5.1.8 xor'ing a reg with itself is free (except for instruction code size)
ctx.comment "for i in 0 ..< N:"
for i in 0 ..< N:
ctx.comment " m <- t[0] * m0ninv mod 2ʷ"
ctx.mov rdx, m0ninv
ctx.imul rdx, t[0]
# Create 2 parallel carry-chains for adcx and adox
# We need to clear the carry/overflow flags first for ADCX/ADOX
# with the smallest instruction if possible (xor rax, rax)
# to reduce instruction-cache miss
ctx.comment " C, _ = t[0] + m * M[0]"
ctx.`xor` rax, rax
ctx.mulx C, rax, M[0], rdx
ctx.adcx rax, t[0] # Set C', the carry flag for future adcx, but don't accumulate in C yet
ctx.mov t[0], C
# for j=1 to N-1
# (S,t[j-1]) := t[j] + m*M[j] + S
ctx.comment " for j=1 to N-1"
ctx.comment " (C,t[j-1]) := t[j] + m*M[j] + C"
for j in 1 ..< N:
ctx.adcx t[j-1], t[j]
ctx.mulx t[j], C, M[j], rdx
ctx.adox t[j-1], C
ctx.comment " final carries"
ctx.mov rax, 0
ctx.adcx t[N-1], rax
ctx.adox t[N-1], rax
result.add ctx.generate()
func fromMont_asm_adx*(r: var Limbs, a, M: Limbs, m0ninv: BaseType) =
## Constant-time Montgomery residue form to BigInt conversion
## Requires ADX and BMI2 instruction set
var t{.noInit.} = a
block:
t.mulMont_by_1_adx_gen(M, m0ninv)
block: # Map from [0, 2p) to [0, p)
var workspace{.noInit.}: typeof(r)
r.finalSub_gen(t, M, workspace, mayCarry = false)

View File

@ -298,7 +298,7 @@ func prod_high_words*[rBits, aBits, bBits](r: var BigInt[rBits], a: BigInt[aBits
# - Barret reduction
# - Approximating multiplication by a fractional constant in the form f(a) = K/C * a
# with K and C known at compile-time.
# We can instead find a well chosen M = (2^WordBitWidth)^w, with M > C (i.e. M is a power of 2 bigger than C)
# We can instead find a well chosen M = (2^WordBitWidth)ʷ, with M > C (i.e. M is a power of 2 bigger than C)
# Precompute P = K*M/C at compile-time
# and at runtime do P*a/M <=> P*a >> WordBitWidth*w
# i.e. prod_high_words(result, P, a, w)
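
A toy instance of the trick described above, with assumed constants K = 1, C = 3 and a single 16-bit word so the shift stays visible (the real routine works over multi-limb products, and the rounding used when precomputing P depends on the caller; it is rounded up here so the toy quotient comes out exact):

const K = 1
const C = 3
const shift = 16                         # WordBitWidth * w for one 16-bit word
const P = ((K shl shift) + C - 1) div C  # P = K*M/C with M = 2^16, i.e. 21846

let a = 300
echo((P * a) shr shift)                  # 100 = a * K/C, with no runtime division
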

View File

@ -24,9 +24,9 @@ import
#
# ############################################################
func montyResidue*(mres: var BigInt, a, N, r2modM: BigInt, m0ninv: static BaseType, spareBits: static int) =
func getMont*(mres: var BigInt, a, N, r2modM: BigInt, m0ninv: static BaseType, spareBits: static int) =
## Convert a BigInt from its natural representation
## to the Montgomery n-residue form
## to the Montgomery residue form
##
## `mres` is overwritten. Its bit length must be properly set before calling this procedure.
##
@ -39,37 +39,35 @@ func montyResidue*(mres: var BigInt, a, N, r2modM: BigInt, m0ninv: static BaseTy
## - `r2modM` is R² (mod M)
## with W = M.len
## and R = (2^WordBitWidth)^W
montyResidue(mres.limbs, a.limbs, N.limbs, r2modM.limbs, m0ninv, spareBits)
getMont(mres.limbs, a.limbs, N.limbs, r2modM.limbs, m0ninv, spareBits)
func redc*[mBits](r: var BigInt[mBits], a, M: BigInt[mBits], m0ninv: static BaseType, spareBits: static int) =
## Convert a BigInt from its Montgomery n-residue form
func fromMont*[mBits](r: var BigInt[mBits], a, M: BigInt[mBits], m0ninv: static BaseType, spareBits: static int) =
## Convert a BigInt from its Montgomery residue form
## to the natural representation
##
## `mres` is modified in-place
##
## Caller must take care of properly switching between
## the natural and montgomery domain.
let one = block:
var one {.noInit.}: BigInt[mBits]
one.setOne()
one
redc(r.limbs, a.limbs, one.limbs, M.limbs, m0ninv, spareBits)
fromMont(r.limbs, a.limbs, M.limbs, m0ninv, spareBits)
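
The relation between the two conversions above, checked on a toy single-word example (the modulus and word size are illustrative assumptions):

const M = 239                        # toy odd modulus
const R = 256                        # R = 2^w for one toy 8-bit word

# getMont maps a to its Montgomery residue a*R mod M;
# fromMont maps the residue back (it is a Montgomery multiplication by 1).
let a = 123
let aMont = (a * R) mod M            # 179
let back = (aMont * 225) mod M       # 225 = R⁻¹ mod 239, i.e. what fromMont computes
doAssert back == a
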
func montyMul*(r: var BigInt, a, b, M: BigInt, negInvModWord: static BaseType, spareBits: static int) =
func mulMont*(r: var BigInt, a, b, M: BigInt, negInvModWord: static BaseType,
spareBits: static int, skipReduction: static bool = false) =
## Compute r <- a*b (mod M) in the Montgomery domain
##
## This resets r to zero before processing. Use {.noInit.}
## to avoid duplicating with Nim zero-init policy
montyMul(r.limbs, a.limbs, b.limbs, M.limbs, negInvModWord, spareBits)
mulMont(r.limbs, a.limbs, b.limbs, M.limbs, negInvModWord, spareBits, skipReduction)
func montySquare*(r: var BigInt, a, M: BigInt, negInvModWord: static BaseType, spareBits: static int) =
func squareMont*(r: var BigInt, a, M: BigInt, negInvModWord: static BaseType,
spareBits: static int, skipReduction: static bool = false) =
## Compute r <- a^2 (mod M) in the Montgomery domain
##
## This resets r to zero before processing. Use {.noInit.}
## to avoid duplicating with Nim zero-init policy
montySquare(r.limbs, a.limbs, M.limbs, negInvModWord, spareBits)
squareMont(r.limbs, a.limbs, M.limbs, negInvModWord, spareBits, skipReduction)
func montyPow*[mBits: static int](
func powMont*[mBits: static int](
a: var BigInt[mBits], exponent: openarray[byte],
M, one: BigInt[mBits], negInvModWord: static BaseType, windowSize: static int,
spareBits: static int
@ -91,9 +89,9 @@ func montyPow*[mBits: static int](
const scratchLen = if windowSize == 1: 2
else: (1 shl windowSize) + 1
var scratchSpace {.noInit.}: array[scratchLen, Limbs[mBits.wordsRequired]]
montyPow(a.limbs, exponent, M.limbs, one.limbs, negInvModWord, scratchSpace, spareBits)
powMont(a.limbs, exponent, M.limbs, one.limbs, negInvModWord, scratchSpace, spareBits)
func montyPowUnsafeExponent*[mBits: static int](
func powMontUnsafeExponent*[mBits: static int](
a: var BigInt[mBits], exponent: openarray[byte],
M, one: BigInt[mBits], negInvModWord: static BaseType, windowSize: static int,
spareBits: static int
@ -115,12 +113,12 @@ func montyPowUnsafeExponent*[mBits: static int](
const scratchLen = if windowSize == 1: 2
else: (1 shl windowSize) + 1
var scratchSpace {.noInit.}: array[scratchLen, Limbs[mBits.wordsRequired]]
montyPowUnsafeExponent(a.limbs, exponent, M.limbs, one.limbs, negInvModWord, scratchSpace, spareBits)
powMontUnsafeExponent(a.limbs, exponent, M.limbs, one.limbs, negInvModWord, scratchSpace, spareBits)
from ../io/io_bigints import exportRawUint
# Workaround recursive dependencies
func montyPow*[mBits, eBits: static int](
func powMont*[mBits, eBits: static int](
a: var BigInt[mBits], exponent: BigInt[eBits],
M, one: BigInt[mBits], negInvModWord: static BaseType, windowSize: static int,
spareBits: static int
@ -137,9 +135,9 @@ func montyPow*[mBits, eBits: static int](
var expBE {.noInit.}: array[(ebits + 7) div 8, byte]
expBE.exportRawUint(exponent, bigEndian)
montyPow(a, expBE, M, one, negInvModWord, windowSize, spareBits)
powMont(a, expBE, M, one, negInvModWord, windowSize, spareBits)
func montyPowUnsafeExponent*[mBits, eBits: static int](
func powMontUnsafeExponent*[mBits, eBits: static int](
a: var BigInt[mBits], exponent: BigInt[eBits],
M, one: BigInt[mBits], negInvModWord: static BaseType, windowSize: static int,
spareBits: static int
@ -160,7 +158,7 @@ func montyPowUnsafeExponent*[mBits, eBits: static int](
var expBE {.noInit.}: array[(ebits + 7) div 8, byte]
expBE.exportRawUint(exponent, bigEndian)
montyPowUnsafeExponent(a, expBE, M, one, negInvModWord, windowSize, spareBits)
powMontUnsafeExponent(a, expBE, M, one, negInvModWord, windowSize, spareBits)
{.pop.} # inline
{.pop.} # raises no exceptions

View File

@ -56,16 +56,20 @@ func fromBig*(dst: var FF, src: BigInt) =
when nimvm:
dst.mres.montyResidue_precompute(src, FF.fieldMod(), FF.getR2modP(), FF.getNegInvModWord())
else:
dst.mres.montyResidue(src, FF.fieldMod(), FF.getR2modP(), FF.getNegInvModWord(), FF.getSpareBits())
dst.mres.getMont(src, FF.fieldMod(), FF.getR2modP(), FF.getNegInvModWord(), FF.getSpareBits())
func fromBig*[C: static Curve](T: type FF[C], src: BigInt): FF[C] {.noInit.} =
## Convert a BigInt to its Montgomery form
result.fromBig(src)
func fromField*(dst: var BigInt, src: FF) {.noInit, inline.} =
## Convert a finite-field element to a BigInt in natural representation
dst.fromMont(src.mres, FF.fieldMod(), FF.getNegInvModWord(), FF.getSpareBits())
func toBig*(src: FF): auto {.noInit, inline.} =
## Convert a finite-field element to a BigInt in natural representation
var r {.noInit.}: typeof(src.mres)
r.redc(src.mres, FF.fieldMod(), FF.getNegInvModWord(), FF.getSpareBits())
r.fromField(src)
return r
# Copy
@ -102,7 +106,7 @@ func cswap*(a, b: var FF, ctl: CTBool) {.meter.} =
# Note: for `+=`, double, sum
# not(a.mres < FF.fieldMod()) is unnecessary if the prime has the form
# (2^64)^w - 1 (if using uint64 words).
# (2^64)ʷ - 1 (if using uint64 words).
# In practice I'm not aware of such prime being using in elliptic curves.
# 2^127 - 1 and 2^521 - 1 are used but 127 and 521 are not multiple of 32/64
@ -148,7 +152,7 @@ func setMinusOne*(a: var FF) =
func `+=`*(a: var FF, b: FF) {.meter.} =
## In-place addition modulo p
when UseASM_X86_64 and a.mres.limbs.len <= 6: # TODO: handle spilling
addmod_asm(a.mres.limbs, a.mres.limbs, b.mres.limbs, FF.fieldMod().limbs)
addmod_asm(a.mres.limbs, a.mres.limbs, b.mres.limbs, FF.fieldMod().limbs, FF.getSpareBits() >= 1)
else:
var overflowed = add(a.mres, b.mres)
overflowed = overflowed or not(a.mres < FF.fieldMod())
@ -165,7 +169,7 @@ func `-=`*(a: var FF, b: FF) {.meter.} =
func double*(a: var FF) {.meter.} =
## Double ``a`` modulo p
when UseASM_X86_64 and a.mres.limbs.len <= 6: # TODO: handle spilling
addmod_asm(a.mres.limbs, a.mres.limbs, a.mres.limbs, FF.fieldMod().limbs)
addmod_asm(a.mres.limbs, a.mres.limbs, a.mres.limbs, FF.fieldMod().limbs, FF.getSpareBits() >= 1)
else:
var overflowed = double(a.mres)
overflowed = overflowed or not(a.mres < FF.fieldMod())
@ -175,7 +179,7 @@ func sum*(r: var FF, a, b: FF) {.meter.} =
## Sum ``a`` and ``b`` into ``r`` modulo p
## r is initialized/overwritten
when UseASM_X86_64 and a.mres.limbs.len <= 6: # TODO: handle spilling
addmod_asm(r.mres.limbs, a.mres.limbs, b.mres.limbs, FF.fieldMod().limbs)
addmod_asm(r.mres.limbs, a.mres.limbs, b.mres.limbs, FF.fieldMod().limbs, FF.getSpareBits() >= 1)
else:
var overflowed = r.mres.sum(a.mres, b.mres)
overflowed = overflowed or not(r.mres < FF.fieldMod())
@ -204,20 +208,20 @@ func double*(r: var FF, a: FF) {.meter.} =
## Double ``a`` into ``r``
## `r` is initialized/overwritten
when UseASM_X86_64 and a.mres.limbs.len <= 6: # TODO: handle spilling
addmod_asm(r.mres.limbs, a.mres.limbs, a.mres.limbs, FF.fieldMod().limbs)
addmod_asm(r.mres.limbs, a.mres.limbs, a.mres.limbs, FF.fieldMod().limbs, FF.getSpareBits() >= 1)
else:
var overflowed = r.mres.double(a.mres)
overflowed = overflowed or not(r.mres < FF.fieldMod())
discard csub(r.mres, FF.fieldMod(), overflowed)
func prod*(r: var FF, a, b: FF) {.meter.} =
func prod*(r: var FF, a, b: FF, skipReduction: static bool = false) {.meter.} =
## Store the product of ``a`` by ``b`` modulo p into ``r``
## ``r`` is initialized / overwritten
r.mres.montyMul(a.mres, b.mres, FF.fieldMod(), FF.getNegInvModWord(), FF.getSpareBits())
r.mres.mulMont(a.mres, b.mres, FF.fieldMod(), FF.getNegInvModWord(), FF.getSpareBits(), skipReduction)
func square*(r: var FF, a: FF) {.meter.} =
func square*(r: var FF, a: FF, skipReduction: static bool = false) {.meter.} =
## Squaring modulo p
r.mres.montySquare(a.mres, FF.fieldMod(), FF.getNegInvModWord(), FF.getSpareBits())
r.mres.squareMont(a.mres, FF.fieldMod(), FF.getNegInvModWord(), FF.getSpareBits(), skipReduction)
func neg*(r: var FF, a: FF) {.meter.} =
## Negate modulo p
@ -261,20 +265,20 @@ func cneg*(r: var FF, a: FF, ctl: SecretBool) {.meter.} =
func cneg*(a: var FF, ctl: SecretBool) {.meter.} =
## Constant-time in-place conditional negation
## The negation is only performed if ctl is "true"
var t = a
var t {.noInit.} = a
a.cneg(t, ctl)
func cadd*(a: var FF, b: FF, ctl: SecretBool) {.meter.} =
## Constant-time out-place conditional addition
## The addition is only performed if ctl is "true"
var t = a
var t {.noInit.} = a
t += b
a.ccopy(t, ctl)
func csub*(a: var FF, b: FF, ctl: SecretBool) {.meter.} =
## Constant-time out-place conditional substraction
## The substraction is only performed if ctl is "true"
var t = a
var t {.noInit.} = a
t -= b
a.ccopy(t, ctl)
@ -340,7 +344,7 @@ func pow*(a: var FF, exponent: BigInt) =
## ``a``: a field element to be exponentiated
## ``exponent``: a big integer
const windowSize = 5 # TODO: find best window size for each curves
a.mres.montyPow(
a.mres.powMont(
exponent,
FF.fieldMod(), FF.getMontyOne(),
FF.getNegInvModWord(), windowSize,
@ -352,7 +356,7 @@ func pow*(a: var FF, exponent: openarray[byte]) =
## ``a``: a field element to be exponentiated
## ``exponent``: a big integer in canonical big endian representation
const windowSize = 5 # TODO: find best window size for each curves
a.mres.montyPow(
a.mres.powMont(
exponent,
FF.fieldMod(), FF.getMontyOne(),
FF.getNegInvModWord(), windowSize,
@ -371,7 +375,7 @@ func powUnsafeExponent*(a: var FF, exponent: BigInt) =
## - power analysis
## - timing analysis
const windowSize = 5 # TODO: find best window size for each curves
a.mres.montyPowUnsafeExponent(
a.mres.powMontUnsafeExponent(
exponent,
FF.fieldMod(), FF.getMontyOne(),
FF.getNegInvModWord(), windowSize,
@ -390,7 +394,7 @@ func powUnsafeExponent*(a: var FF, exponent: openarray[byte]) =
## - power analysis
## - timing analysis
const windowSize = 5 # TODO: find best window size for each curves
a.mres.montyPowUnsafeExponent(
a.mres.powMontUnsafeExponent(
exponent,
FF.fieldMod(), FF.getMontyOne(),
FF.getNegInvModWord(), windowSize,
@ -409,21 +413,39 @@ func `*=`*(a: var FF, b: FF) {.meter.} =
## Multiplication modulo p
a.prod(a, b)
func square*(a: var FF) {.meter.} =
func square*(a: var FF, skipReduction: static bool = false) {.meter.} =
## Squaring modulo p
a.mres.montySquare(a.mres, FF.fieldMod(), FF.getNegInvModWord(), FF.getSpareBits())
a.square(a, skipReduction)
func square_repeated*(r: var FF, num: int) {.meter.} =
func square_repeated*(a: var FF, num: int, skipReduction: static bool = false) {.meter.} =
## Repeated squarings
# Except in Tonelli-Shanks, num is always known at compile-time
# and square repeated is inlined, so the compiler should optimize the branches away.
# TODO: understand the conditions to avoid the final subtraction
for _ in 0 ..< num:
r.square()
a.square(skipReduction = false)
func square_repeated*(r: var FF, a: FF, num: int) {.meter.} =
func square_repeated*(r: var FF, a: FF, num: int, skipReduction: static bool = false) {.meter.} =
## Repeated squarings
# TODO: understand the conditions to avoid the final subtraction
r.square(a)
for _ in 1 ..< num:
r.square()
func square_repeated_then_mul*(a: var FF, num: int, b: FF, skipReduction: static bool = false) {.meter.} =
## Square `a`, `num` times and then multiply by b
## Assumes at least 1 squaring
a.square_repeated(num, skipReduction = false)
a.prod(a, b, skipReduction = skipReduction)
func square_repeated_then_mul*(r: var FF, a: FF, num: int, b: FF, skipReduction: static bool = false) {.meter.} =
## Square `a`, `num` times and then multiply by b
## Assumes at least 1 squaring
r.square_repeated(a, num, skipReduction = false)
r.prod(r, b, skipReduction = skipReduction)
func `*=`*(a: var FF, b: static int) =
## Multiplication by a small integer known at compile-time
# Implementation:

View File

@ -73,7 +73,7 @@ func redc2x*(r: var Fp, a: FpDbl) =
## Reduce a double-precision field element into r
## from [0, 2ⁿp) range to [0, p) range
const N = r.mres.limbs.len
montyRedc2x(
redc2xMont(
r.mres.limbs,
a.limbs2x,
Fp.C.Mod.limbs,

View File

@ -128,7 +128,7 @@ func numWordsFromBits(bits: int): int {.inline.} =
func shlAddMod_estimate(a: LimbsViewMut, aLen: int,
c: SecretWord, M: LimbsViewConst, mBits: int
): tuple[neg, tooBig: SecretBool] =
## Estimate a <- a shl 2^w + c (mod M)
## Estimate a <- a shl 2ʷ + c (mod M)
##
## with w the base word width, usually 32 on 32-bit platforms and 64 on 64-bit platforms
##

View File

@ -74,7 +74,7 @@ func prod*[rLen, aLen, bLen: static int](r: var Limbs[rLen], a: Limbs[aLen], b:
when UseASM_X86_64 and aLen <= 6:
# ADX implies BMI2
if ({.noSideEffect.}: hasAdx()):
mul_asm_adx_bmi2(r, a, b)
mul_asm_adx(r, a, b)
else:
mul_asm(r, a, b)
elif UseASM_X86_64:
@ -98,7 +98,7 @@ func prod_high_words*[rLen, aLen, bLen](
# - Barret reduction
# - Approximating multiplication by a fractional constant in the form f(a) = K/C * a
# with K and C known at compile-time.
# We can instead find a well chosen M = (2^WordBitWidth)^w, with M > C (i.e. M is a power of 2 bigger than C)
# We can instead find a well chosen M = (2^WordBitWidth)ʷ, with M > C (i.e. M is a power of 2 bigger than C)
# Precompute P = K*M/C at compile-time
# and at runtime do P*a/M <=> P*a >> (WordBitWidth*w)
# i.e. prod_high_words(result, P, a, w)
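# Illustrative sketch (not part of this change): the same trick at word scale.
# To compute a * K/C with K = 1, C = 3, precompute P = ceil(K*M/C) for a power
# of two M = 2³³ > C, then keep only the high bits of a*P. `divBy3` is a
# hypothetical example, not a Constantine API.
func divBy3(a: uint32): uint32 =
  const P = 0xAAAA_AAAB'u64               # ceil(2³³ / 3)
  result = uint32((uint64(a) * P) shr 33) # "high words" of the product a*P

when isMainModule:
  for a in [0'u32, 1, 2, 3, 100, 0xFFFF_FFFE'u32, high(uint32)]:
    doAssert divBy3(a) == a div 3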
@ -203,7 +203,7 @@ func square*[rLen, aLen](
when UseASM_X86_64 and aLen in {4, 6} and rLen == 2*aLen:
# ADX implies BMI2
if ({.noSideEffect.}: hasAdx()):
square_asm_adx_bmi2(r, a)
square_asm_adx(r, a)
else:
square_asm(r, a)
elif UseASM_X86_64:

View File

@ -15,12 +15,12 @@ import
./limbs, ./limbs_extmul
when UseASM_X86_32:
import ./assembly/limbs_asm_montred_x86
import ./assembly/limbs_asm_redc_mont_x86
when UseASM_X86_64:
import
./assembly/limbs_asm_montmul_x86,
./assembly/limbs_asm_montmul_x86_adx_bmi2,
./assembly/limbs_asm_montred_x86_adx_bmi2
./assembly/limbs_asm_mul_mont_x86,
./assembly/limbs_asm_mul_mont_x86_adx_bmi2,
./assembly/limbs_asm_redc_mont_x86_adx_bmi2
# ############################################################
#
@ -53,12 +53,19 @@ when UseASM_X86_64:
# Montgomery Reduction
# ------------------------------------------------------------
func montyRedc2x_CIOS[N: static int](
func redc2xMont_CIOS[N: static int](
r: var array[N, SecretWord],
a: array[N*2, SecretWord],
M: array[N, SecretWord],
m0ninv: BaseType) =
m0ninv: BaseType, skipReduction: static bool = false) =
## Montgomery reduce a double-precision bigint modulo M
##
## This maps
## - [0, 4p²) -> [0, 2p) with skipReduction
## - [0, 4p²) -> [0, p) without
##
## SkipReduction skips the final subtraction step.
## For skipReduction, M needs to have a spare bit in its representation, i.e. an unused MSB.
# - Analyzing and Comparing Montgomery Multiplication Algorithms
# Cetin Kaya Koc and Tolga Acar and Burton S. Kaliski Jr.
# http://pdfs.semanticscholar.org/5e39/41ff482ec3ee41dc53c3298f0be085c69483.pdf
@ -81,7 +88,7 @@ func montyRedc2x_CIOS[N: static int](
#
# for i in 0 .. n-1:
# C <- 0
# m <- a[i] * m0ninv mod 2^w (i.e. simple multiplication)
# m <- a[i] * m0ninv mod 2ʷ (i.e. simple multiplication)
# for j in 0 .. n-1:
# (C, S) <- a[i+j] + m * M[j] + C
# a[i+j] <- S
@ -95,8 +102,8 @@ func montyRedc2x_CIOS[N: static int](
# to the higher limb if any, thank you "implementation detail"
# missing from paper.
var a = a # Copy "t" for mutation and ensure on stack
var res: typeof(r) # Accumulator
var a {.noInit.} = a # Copy "t" for mutation and ensure on stack
var res {.noInit.}: typeof(r) # Accumulator
staticFor i, 0, N:
var C = Zero
let m = a[i] * SecretWord(m0ninv)
@ -112,15 +119,23 @@ func montyRedc2x_CIOS[N: static int](
addC(carry, res[i], a[i+N], res[i], carry)
# Final subtraction
when not skipReduction:
discard res.csub(M, SecretWord(carry).isNonZero() or not(res < M))
r = res
func montyRedc2x_Comba[N: static int](
func redc2xMont_Comba[N: static int](
r: var array[N, SecretWord],
a: array[N*2, SecretWord],
M: array[N, SecretWord],
m0ninv: BaseType) =
m0ninv: BaseType, skipReduction: static bool = false) =
## Montgomery reduce a double-precision bigint modulo M
##
## This maps
## - [0, 4p²) -> [0, 2p) with skipReduction
## - [0, 4p²) -> [0, p) without
##
## SkipReduction skips the final subtraction step.
## For skipReduction, M needs to have a spare bit in its representation, i.e. an unused MSB.
# We use Product Scanning / Comba multiplication
var t, u, v = Zero
var carry: Carry
@ -156,18 +171,25 @@ func montyRedc2x_Comba[N: static int](
addC(carry, z[N-1], v, a[2*N-1], Carry(0))
# Final subtraction
when not skipReduction:
discard z.csub(M, SecretBool(carry) or not(z < M))
r = z
# Montgomery Multiplication
# ------------------------------------------------------------
func montyMul_CIOS_sparebit(r: var Limbs, a, b, M: Limbs, m0ninv: BaseType) =
func mulMont_CIOS_sparebit(r: var Limbs, a, b, M: Limbs, m0ninv: BaseType, skipReduction: static bool = false) =
## Montgomery Multiplication using Coarse Grained Operand Scanning (CIOS)
## and no-carry optimization.
## This requires the most significant word of the Modulus
## M[^1] < high(SecretWord) shr 1 (i.e. less than 0b01111...1111)
## https://hackmd.io/@zkteam/modular_multiplication
## https://hackmd.io/@gnark/modular_multiplication
##
## This maps
## - [0, 2p) -> [0, 2p) with skipReduction
## - [0, 2p) -> [0, p) without
##
## SkipReduction skips the final subtraction step.
# We want all the computation to be kept in registers
# hence we use a temporary `t`, hoping that the compiler does it.
@ -175,7 +197,7 @@ func montyMul_CIOS_sparebit(r: var Limbs, a, b, M: Limbs, m0ninv: BaseType) =
const N = t.len
staticFor i, 0, N:
# (A, t[0]) <- a[0] * b[i] + t[0]
# m <- (t[0] * m0ninv) mod 2^w
# m <- (t[0] * m0ninv) mod 2ʷ
# (C, _) <- m * M[0] + t[0]
var A: SecretWord
muladd1(A, t[0], a[0], b[i], t[0])
@ -191,10 +213,11 @@ func montyMul_CIOS_sparebit(r: var Limbs, a, b, M: Limbs, m0ninv: BaseType) =
t[N-1] = C + A
when not skipReduction:
discard t.csub(M, not(t < M))
r = t
func montyMul_CIOS(r: var Limbs, a, b, M: Limbs, m0ninv: BaseType) {.used.} =
func mulMont_CIOS(r: var Limbs, a, b, M: Limbs, m0ninv: BaseType) {.used.} =
## Montgomery Multiplication using Coarse Grained Operand Scanning (CIOS)
# - Analyzing and Comparing Montgomery Multiplication Algorithms
# Cetin Kaya Koc and Tolga Acar and Burton S. Kaliski Jr.
@ -221,7 +244,7 @@ func montyMul_CIOS(r: var Limbs, a, b, M: Limbs, m0ninv: BaseType) {.used.} =
addC(tNp1, tN, tN, A, Carry(0))
# Reduction
# m <- (t[0] * m0ninv) mod 2^w
# m <- (t[0] * m0ninv) mod 2ʷ
# (C, _) <- m * M[0] + t[0]
var C, lo = Zero
let m = t[0] * SecretWord(m0ninv)
@ -239,11 +262,18 @@ func montyMul_CIOS(r: var Limbs, a, b, M: Limbs, m0ninv: BaseType) {.used.} =
# t[N+1] can only be non-zero in the intermediate computation
# since it is immediately reduced to t[N] at the end of each "i" iteration
# However if t[N] is non-zero we have t > M
discard t.csub(M, tN.isNonZero() or not(t < M)) # TODO: (t >= M) is unnecessary for prime in the form (2^64)^w
discard t.csub(M, tN.isNonZero() or not(t < M)) # TODO: (t >= M) is unnecessary for prime in the form (2^64)ʷ
r = t
func montyMul_FIPS(r: var Limbs, a, b, M: Limbs, m0ninv: BaseType) =
func mulMont_FIPS(r: var Limbs, a, b, M: Limbs, m0ninv: BaseType, skipReduction: static bool = false) =
## Montgomery Multiplication using Finely Integrated Product Scanning (FIPS)
##
## This maps
## - [0, 2p) -> [0, 2p) with skipReduction
## - [0, 2p) -> [0, p) without
##
## SkipReduction skips the final subtraction step.
## For skipReduction, M needs to have a spare bit in its representation, i.e. an unused MSB.
# - Architectural Enhancements for Montgomery
# Multiplication on Embedded RISC Processors
# Johann Großschädl and Guy-Armand Kamendje, 2003
@ -276,6 +306,7 @@ func montyMul_FIPS(r: var Limbs, a, b, M: Limbs, m0ninv: BaseType) =
u = t
t = Zero
when not skipReduction:
discard z.csub(M, v.isNonZero() or not(z < M))
r = z
@ -313,36 +344,81 @@ func montyMul_FIPS(r: var Limbs, a, b, M: Limbs, m0ninv: BaseType) =
# ...
# for j in 1 ..< N: # <- Montgomery reduce.
# Montgomery Conversion
# ------------------------------------------------------------
#
# In Montgomery form, inputs are scaled by a constant R
# so a' = aR (mod p) and b' = bR (mod p)
#
# A classic multiplication would do a'*b' = abR² (mod p)
# we then need to remove the extra R, hence:
# - Montgomery reduction (redc) does 1/R (mod p) to map abR² (mod p) -> abR (mod p)
# - Montgomery multiplication directly compute mulMont(aR, bR) = abR (mod p)
#
# So to convert a to a' = aR (mod p), we can do mulMont(a, R²) = aR (mod p)
# and to convert a' to a = aR / R (mod p) we can do:
# - redc(aR) = a
# - or mulMont(aR, 1) = a
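# Illustrative sketch (not part of this change): the identities above at a toy
# 16-bit word size with p = 65521 and R = 2¹⁶. The `toy*` procs are
# hypothetical stand-ins for mulMont/getMont/fromMont, only meant to show
# mulMont(a, R²) = aR (mod p) and mulMont(aR, 1) = a (mod p).
const
  P16 = 65521'u64                  # toy odd modulus, fits one 16-bit "word"
  R16 = 1'u64 shl 16               # Montgomery radix
  R16sq = (R16 * R16) mod P16      # R² mod p, normally a per-curve constant

proc toyNegInvModWord(): uint64 =
  ## -p⁻¹ mod R by Hensel lifting (valid because p is odd)
  var inv = 1'u64
  for _ in 0 ..< 5:                # precision doubles each round: 1 -> 32 bits
    inv = (inv * (2'u64 - P16 * inv)) and (R16 - 1)
  result = (R16 - inv) and (R16 - 1)

let m0ninv16 = toyNegInvModWord()

proc toyMulMont(a, b: uint64): uint64 =
  ## computes a*b*R⁻¹ (mod p) for single-word inputs < p
  let t = a * b
  let m = (t * m0ninv16) and (R16 - 1) # m <- t * (-p⁻¹) mod R
  result = (t + m * P16) shr 16        # t + m*p ≡ 0 (mod R): exact division
  if result >= P16: result -= P16      # final subtraction, lands in [0, p)

proc toyGetMont(a: uint64): uint64 = toyMulMont(a, R16sq)  # a  -> aR (mod p)
proc toyFromMont(aR: uint64): uint64 = toyMulMont(aR, 1)   # aR -> a  (mod p)

when isMainModule:
  let x = 12345'u64
  doAssert toyGetMont(x) == (x * R16) mod P16
  doAssert toyFromMont(toyGetMont(x)) == x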
func fromMont_CIOS(r: var Limbs, a, M: Limbs, m0ninv: BaseType) =
## Convert from Montgomery form to canonical BigInt form
# for i in 0 .. n-1:
# m <- t[0] * m0ninv mod 2ʷ (i.e. simple multiplication)
# (C, _) <- t[0] + m * M[0]
# for j in 1 .. n-1:
# (C, t[j-1]) <- t[j] + m*M[j] + C
# t[n-1] = C
const N = a.len
var t {.noInit.} = a # Ensure working in registers
staticFor i, 0, N:
let m = t[0] * SecretWord(m0ninv)
var C, lo: SecretWord
muladd1(C, lo, m, M[0], t[0])
staticFor j, 1, N:
muladd2(C, t[j-1], m, M[j], C, t[j])
t[N-1] = C
discard t.csub(M, not(t < M))
r = t
# Exported API
# ------------------------------------------------------------
# TODO upstream, using Limbs[N] breaks semcheck
func montyRedc2x*[N: static int](
func redc2xMont*[N: static int](
r: var array[N, SecretWord],
a: array[N*2, SecretWord],
M: array[N, SecretWord],
m0ninv: BaseType, spareBits: static int) {.inline.} =
m0ninv: BaseType,
spareBits: static int, skipReduction: static bool = false) {.inline.} =
## Montgomery reduce a double-precision bigint modulo M
const skipReduction = skipReduction and spareBits >= 1
when UseASM_X86_64 and r.len <= 6:
# ADX implies BMI2
if ({.noSideEffect.}: hasAdx()):
montRed_asm_adx_bmi2(r, a, M, m0ninv, spareBits >= 1)
redcMont_asm_adx(r, a, M, m0ninv, spareBits >= 1, skipReduction)
else:
when r.len in {3..6}:
montRed_asm(r, a, M, m0ninv, spareBits >= 1)
redcMont_asm(r, a, M, m0ninv, spareBits >= 1, skipReduction)
else:
montyRedc2x_CIOS(r, a, M, m0ninv)
# montyRedc2x_Comba(r, a, M, m0ninv)
redc2xMont_CIOS(r, a, M, m0ninv, skipReduction)
# redc2xMont_Comba(r, a, M, m0ninv)
elif UseASM_X86_64 and r.len in {3..6}:
# TODO: Assembly faster than GCC but slower than Clang
montRed_asm(r, a, M, m0ninv, spareBits >= 1)
redcMont_asm(r, a, M, m0ninv, spareBits >= 1, skipReduction)
else:
montyRedc2x_CIOS(r, a, M, m0ninv)
# montyRedc2x_Comba(r, a, M, m0ninv)
redc2xMont_CIOS(r, a, M, m0ninv, skipReduction)
# redc2xMont_Comba(r, a, M, m0ninv, skipReduction)
func montyMul*(
func mulMont*(
r: var Limbs, a, b, M: Limbs,
m0ninv: static BaseType, spareBits: static int) {.inline.} =
m0ninv: BaseType,
spareBits: static int,
skipReduction: static bool = false) {.inline.} =
## Compute r <- a*b (mod M) in the Montgomery domain
## `m0ninv` = -1/M (mod SecretWord). Our words are 2^32 or 2^64
##
@ -369,43 +445,50 @@ func montyMul*(
# The implementation is visible from here, the compiler can decide whether to:
# - specialize/duplicate code for m0ninv == 1 (especially if only 1 curve is needed)
# - keep it generic and optimize code size
const skipReduction = skipReduction and spareBits >= 1
when spareBits >= 1:
when UseASM_X86_64 and a.len in {2 .. 6}: # TODO: handle spilling
# ADX implies BMI2
if ({.noSideEffect.}: hasAdx()):
montMul_CIOS_sparebit_asm_adx_bmi2(r, a, b, M, m0ninv)
mulMont_CIOS_sparebit_asm_adx(r, a, b, M, m0ninv, skipReduction)
else:
montMul_CIOS_sparebit_asm(r, a, b, M, m0ninv)
mulMont_CIOS_sparebit_asm(r, a, b, M, m0ninv, skipReduction)
else:
montyMul_CIOS_sparebit(r, a, b, M, m0ninv)
mulMont_CIOS_sparebit(r, a, b, M, m0ninv, skipReduction)
else:
montyMul_FIPS(r, a, b, M, m0ninv)
mulMont_FIPS(r, a, b, M, m0ninv, skipReduction)
func montySquare*[N](r: var Limbs[N], a, M: Limbs[N],
m0ninv: static BaseType, spareBits: static int) {.inline.} =
func squareMont*[N](r: var Limbs[N], a, M: Limbs[N],
m0ninv: BaseType,
spareBits: static int,
skipReduction: static bool = false) {.inline.} =
## Compute r <- a^2 (mod M) in the Montgomery domain
## `m0ninv` = -1/M (mod SecretWord). Our words are 2^32 or 2^64
const skipReduction = skipReduction and spareBits >= 1
when UseASM_X86_64 and a.len in {4, 6}:
# ADX implies BMI2
if ({.noSideEffect.}: hasAdx()):
# With ADX and spare bit, montSquare_CIOS_asm_adx_bmi2
# With ADX and spare bit, squareMont_CIOS_asm_adx
# which uses unfused squaring then Montgomery reduction
# is slightly slower than fused Montgomery multiplication
when spareBits >= 1:
montMul_CIOS_sparebit_asm_adx_bmi2(r, a, a, M, m0ninv)
mulMont_CIOS_sparebit_asm_adx(r, a, a, M, m0ninv, skipReduction)
else:
montSquare_CIOS_asm_adx_bmi2(r, a, M, m0ninv, spareBits >= 1)
squareMont_CIOS_asm_adx(r, a, M, m0ninv, spareBits >= 1, skipReduction)
else:
montSquare_CIOS_asm(r, a, M, m0ninv, spareBits >= 1)
squareMont_CIOS_asm(r, a, M, m0ninv, spareBits >= 1, skipReduction)
elif UseASM_X86_64:
var r2x {.noInit.}: Limbs[2*N]
r2x.square(a)
r.montyRedc2x(r2x, M, m0ninv, spareBits)
r.redc2xMont(r2x, M, m0ninv, spareBits, skipReduction)
else:
montyMul(r, a, a, M, m0ninv, spareBits)
mulMont(r, a, a, M, m0ninv, spareBits, skipReduction)
func redc*(r: var Limbs, a, one, M: Limbs,
func fromMont*(r: var Limbs, a, M: Limbs,
m0ninv: static BaseType, spareBits: static int) =
## Transform a bigint ``a`` from its Montgomery N-residue representation (mod N)
## to the regular natural representation (mod N)
@ -424,10 +507,16 @@ func redc*(r: var Limbs, a, one, M: Limbs,
# - https://en.wikipedia.org/wiki/Montgomery_modular_multiplication#Montgomery_arithmetic_on_multiprecision_(variable-radix)_integers
# - http://langevin.univ-tln.fr/cours/MLC/extra/montgomery.pdf
# Montgomery original paper
#
montyMul(r, a, one, M, m0ninv, spareBits)
when UseASM_X86_64 and a.len in {2 .. 6}:
# ADX implies BMI2
if ({.noSideEffect.}: hasAdx()):
fromMont_asm_adx(r, a, M, m0ninv)
else:
fromMont_asm(r, a, M, m0ninv)
else:
fromMont_CIOS(r, a, M, m0ninv)
func montyResidue*(r: var Limbs, a, M, r2modM: Limbs,
func getMont*(r: var Limbs, a, M, r2modM: Limbs,
m0ninv: static BaseType, spareBits: static int) =
## Transform a bigint ``a`` from its natural representation (mod N)
## to the Montgomery n-residue representation
@ -446,7 +535,7 @@ func montyResidue*(r: var Limbs, a, M, r2modM: Limbs,
## Important: `r` is overwritten
## The result `r` buffer size MUST be at least the size of `M` buffer
# Reference: https://eprint.iacr.org/2017/1057.pdf
montyMul(r, a, r2ModM, M, m0ninv, spareBits)
mulMont(r, a, r2ModM, M, m0ninv, spareBits)
# Montgomery Modular Exponentiation
# ------------------------------------------
@ -489,7 +578,7 @@ func getWindowLen(bufLen: int): uint =
while (1 shl result) + 1 > bufLen:
dec result
func montyPowPrologue(
func powMontPrologue(
a: var Limbs, M, one: Limbs,
m0ninv: static BaseType,
scratchspace: var openarray[Limbs],
@ -507,12 +596,12 @@ func montyPowPrologue(
else:
scratchspace[2] = a
for k in 2 ..< 1 shl result:
scratchspace[k+1].montyMul(scratchspace[k], a, M, m0ninv, spareBits)
scratchspace[k+1].mulMont(scratchspace[k], a, M, m0ninv, spareBits)
# Set a to one
a = one
func montyPowSquarings(
func powMontSquarings(
a: var Limbs,
exponent: openarray[byte],
M: Limbs,
@ -557,12 +646,11 @@ func montyPowSquarings(
# We have k bits and can do k squaring
for i in 0 ..< k:
tmp.montySquare(a, M, m0ninv, spareBits)
a = tmp
a.squareMont(a, M, m0ninv, spareBits)
return (k, bits)
func montyPow*(
func powMont*(
a: var Limbs,
exponent: openarray[byte],
M, one: Limbs,
@ -596,7 +684,7 @@ func montyPow*(
## A window of size 5 requires (2^5 + 1)*(381 + 7)/8 = 33 * 48 bytes = 1584 bytes
## of scratchspace (on the stack).
let window = montyPowPrologue(a, M, one, m0ninv, scratchspace, spareBits)
let window = powMontPrologue(a, M, one, m0ninv, scratchspace, spareBits)
# We process bits from most to least significant.
# At each loop iteration we have acc_len bits in acc.
@ -607,7 +695,7 @@ func montyPow*(
acc, acc_len: uint
e = 0
while acc_len > 0 or e < exponent.len:
let (k, bits) = montyPowSquarings(
let (k, bits) = powMontSquarings(
a, exponent, M, m0ninv,
scratchspace[0], window,
acc, acc_len, e,
@ -626,10 +714,10 @@ func montyPow*(
# Multiply with the looked-up value
# we keep the product only if the exponent bits are not all zeroes
scratchspace[0].montyMul(a, scratchspace[1], M, m0ninv, spareBits)
scratchspace[0].mulMont(a, scratchspace[1], M, m0ninv, spareBits)
a.ccopy(scratchspace[0], SecretWord(bits).isNonZero())
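# Illustrative sketch (not part of this change): the fixed-window idea above at
# word scale, without the Montgomery form and without the constant-time table
# scan/ccopy. `powWindow2` and `powRef` are hypothetical helpers; the toy
# modulus stays below 2³² so every product fits in a uint64.
func powRef(a, e, p: uint64): uint64 =
  ## plain square-and-multiply reference
  var (base, exp) = (a mod p, e)
  result = 1
  while exp > 0:
    if (exp and 1) == 1: result = result * base mod p
    base = base * base mod p
    exp = exp shr 1

func powWindow2(a, e, p: uint64): uint64 =
  ## 2-bit fixed window: precompute a⁰..a³, then for each window do
  ## 2 squarings and 1 table multiplication (powMont uses ~5-bit windows)
  let a = a mod p
  var table: array[4, uint64]
  table[0] = 1
  for i in 1 .. 3: table[i] = table[i-1] * a mod p
  result = 1
  var i = 62
  while i >= 0:                          # windows from most significant down
    result = result * result mod p
    result = result * result mod p
    result = result * table[int((e shr i) and 3)] mod p
    i -= 2

when isMainModule:
  let p = 4294967291'u64                 # the prime 2³² - 5
  doAssert powWindow2(5, 131, p) == powRef(5, 131, p)
  doAssert powWindow2(123456789, 0xDEADBEEF'u64, p) == powRef(123456789, 0xDEADBEEF'u64, p)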
func montyPowUnsafeExponent*(
func powMontUnsafeExponent*(
a: var Limbs,
exponent: openarray[byte],
M, one: Limbs,
@ -649,13 +737,13 @@ func montyPowUnsafeExponent*(
# TODO: scratchspace[1] is unused when window > 1
let window = montyPowPrologue(a, M, one, m0ninv, scratchspace, spareBits)
let window = powMontPrologue(a, M, one, m0ninv, scratchspace, spareBits)
var
acc, acc_len: uint
e = 0
while acc_len > 0 or e < exponent.len:
let (_, bits) = montyPowSquarings(
let (_, bits) = powMontSquarings(
a, exponent, M, m0ninv,
scratchspace[0], window,
acc, acc_len, e,
@ -665,10 +753,10 @@ func montyPowUnsafeExponent*(
## Warning ⚠️: Exposes the exponent bits
if bits != 0:
if window > 1:
scratchspace[0].montyMul(a, scratchspace[1+bits], M, m0ninv, spareBits)
scratchspace[0].mulMont(a, scratchspace[1+bits], M, m0ninv, spareBits)
else:
# scratchspace[1] holds the original `a`
scratchspace[0].montyMul(a, scratchspace[1], M, m0ninv, spareBits)
scratchspace[0].mulMont(a, scratchspace[1], M, m0ninv, spareBits)
a = scratchspace[0]
{.pop.} # raises no exceptions

View File

@ -355,7 +355,7 @@ func r3mod*(M: BigInt): BigInt =
## This is used in hash-to-curve to
## reduce a double-sized bigint mod M
## and map it to the Montgomery domain
## with just redc2x + montyMul
## with just redc2x + mulMont
r_powmod(3, M)
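# Illustrative note (not part of this change), why R³ does the job:
#   redc2x(t)          = t·R⁻¹                (mod M)
#   mulMont(t·R⁻¹, R³) = t·R⁻¹·R³·R⁻¹ = t·R   (mod M)
# i.e. the double-sized input t ends up reduced and already in Montgomery form.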
func montyOne*(M: BigInt): BigInt =
@ -391,7 +391,7 @@ func primePlus1div2*(P: BigInt): BigInt =
# (P+1)/2 = P/2 + 1 if P is odd,
# this avoids overflowing if the prime uses all bits
# i.e. in the form (2^64)^w - 1 or (2^32)^w - 1
# i.e. in the form (2^64)ʷ - 1 or (2^32)ʷ - 1
result = P
result.shiftRight(1)
@ -491,7 +491,7 @@ func toCanonicalIntRepr*[bits: static int](
# ############################################################
# This is needed to avoid recursive dependencies
func montyMul_precompute(r: var BigInt, a, b, M: BigInt, m0ninv: BaseType) =
func mulMont_precompute(r: var BigInt, a, b, M: BigInt, m0ninv: BaseType) =
## Montgomery Multiplication using Coarse Grained Operand Scanning (CIOS)
var t: typeof(M) # zero-init
const N = t.limbs.len
@ -527,4 +527,4 @@ func montyResidue_precompute*(r: var BigInt, a, M, r2modM: BigInt,
## Transform a bigint ``a`` from its natural representation (mod N)
## to the Montgomery n-residue representation
## This is intended for compile-time precomputations-only
montyMul_precompute(r, a, r2ModM, M, m0ninv)
mulMont_precompute(r, a, r2ModM, M, m0ninv)

View File

@ -154,15 +154,15 @@ func expandMessageXMD*[B1, B2, B3: byte|char, len_in_bytes: static int](
return
func redc2x[FF](r: var FF, big2x: BigInt) {.inline.} =
r.mres.limbs.montyRedc2x(
r.mres.limbs.redc2xMont(
big2x.limbs,
FF.fieldMod().limbs,
FF.getNegInvModWord(),
FF.getSpareBits()
)
func montyMul(r: var BigInt, a, b: BigInt, FF: type) {.inline.} =
r.limbs.montyMul(
func mulMont(r: var BigInt, a, b: BigInt, FF: type) {.inline.} =
r.limbs.mulMont(
a.limbs, b.limbs,
FF.fieldMod().limbs,
FF.getNegInvModWord(),
@ -228,14 +228,14 @@ func hashToField*[Field; B1, B2, B3: byte|char, count: static int](
# Reduces modulo p and output in Montgomery domain
when m == 1:
output[i].redc2x(big2x)
output[i].mres.montyMul(
output[i].mres.mulMont(
output[i].mres,
Fp[Field.C].getR3ModP(),
Fp[Field.C])
else:
output[i].coords[j].redc2x(big2x)
output[i].coords[j].mres.montyMul(
output[i].coords[j].mres.mulMont(
output[i].coords[j].mres,
Fp[Field.C].getR3ModP(),
Fp[Field.C])

View File

@ -249,7 +249,7 @@ let
hasBmi2Impl = testX86Feature(Bmi2)
hasTsxHleImpl = testX86Feature(TsxHle)
hasTsxRtmImpl = testX86Feature(TsxRtm)
hasAdxImpl = testX86Feature(TsxHle)
hasAdxImpl = testX86Feature(Adx)
hasSgxImpl = testX86Feature(Sgx)
hasGfniImpl = testX86Feature(Gfni)
hasAesImpl = testX86Feature(Aes)

View File

@ -337,6 +337,10 @@ func generate*(a: Assembler_x86): NimNode =
)
)
)
result = nnkBlockStmt.newTree(
newEmptyNode(),
result
)
func getStrOffset(a: Assembler_x86, op: Operand): string =
if op.kind != kFromArray:

View File

@ -195,8 +195,8 @@ func eth_evm_ecmul*(
var sprime{.noInit.}: typeof(smod.mres)
# Due to mismatch between the BigInt[256] input and the rest being BigInt[254]
# we use the low-level montyResidue instead of 'fromBig'
montyResidue(smod.mres.limbs, s.limbs,
# we use the low-level getMont instead of 'fromBig'
getMont(smod.mres.limbs, s.limbs,
Fr[BN254_Snarks].fieldMod().limbs,
Fr[BN254_Snarks].getR2modP().limbs,
Fr[BN254_Snarks].getNegInvModWord(),

View File

@ -13,8 +13,8 @@ import
../../arithmetic,
../../arithmetic/assembly/[
limbs_asm_mul_x86_adx_bmi2,
limbs_asm_montmul_x86_adx_bmi2,
limbs_asm_montred_x86_adx_bmi2
limbs_asm_mul_mont_x86_adx_bmi2,
limbs_asm_redc_mont_x86_adx_bmi2
]
@ -50,7 +50,7 @@ func has2extraBits(F: type Fp): bool =
# 𝔽p2 squaring
# ------------------------------------------------------------
func sqrx2x_complex_asm_adx_bmi2*(
func sqrx2x_complex_asm_adx*(
r: var array[2, FpDbl],
a: array[2, Fp]
) =
@ -67,11 +67,11 @@ func sqrx2x_complex_asm_adx_bmi2*(
t0.double(a.c1)
t1.sum(a.c0, a.c1)
r.c1.mul_asm_adx_bmi2_impl(t0, a.c0)
r.c1.mul_asm_adx_inline(t0, a.c0)
t0.diff(a.c0, a.c1)
r.c0.mul_asm_adx_bmi2_impl(t0, t1)
r.c0.mul_asm_adx_inline(t0, t1)
func sqrx_complex_sparebit_asm_adx_bmi2*(
func sqrx_complex_sparebit_asm_adx*(
r: var array[2, Fp],
a: array[2, Fp]
) =
@ -85,15 +85,15 @@ func sqrx_complex_sparebit_asm_adx_bmi2*(
var v0 {.noInit.}, v1 {.noInit.}: typeof(r.c0)
v0.diff(a.c0, a.c1)
v1.sum(a.c0, a.c1)
r.c1.mres.limbs.montMul_CIOS_sparebit_asm_adx_bmi2(a.c0.mres.limbs, a.c1.mres.limbs, Fp.fieldMod().limbs, Fp.getNegInvModWord())
r.c1.mres.limbs.mulMont_CIOS_sparebit_asm_adx(a.c0.mres.limbs, a.c1.mres.limbs, Fp.fieldMod().limbs, Fp.getNegInvModWord())
# aliasing: a unneeded now
r.c1.double()
r.c0.mres.limbs.montMul_CIOS_sparebit_asm_adx_bmi2(v0.mres.limbs, v1.mres.limbs, Fp.fieldMod().limbs, Fp.getNegInvModWord())
r.c0.mres.limbs.mulMont_CIOS_sparebit_asm_adx(v0.mres.limbs, v1.mres.limbs, Fp.fieldMod().limbs, Fp.getNegInvModWord())
# 𝔽p2 multiplication
# ------------------------------------------------------------
func mulx2x_complex_asm_adx_bmi2*(
func mul2x_fp2_complex_asm_adx*(
r: var array[2, FpDbl],
a, b: array[2, Fp]
) =
@ -101,15 +101,15 @@ func mulx2x_complex_asm_adx_bmi2*(
var D {.noInit.}: typeof(r.c0)
var t0 {.noInit.}, t1 {.noInit.}: typeof(a.c0)
r.c0.limbs2x.mul_asm_adx_bmi2_impl(a.c0.mres.limbs, b.c0.mres.limbs)
D.limbs2x.mul_asm_adx_bmi2_impl(a.c1.mres.limbs, b.c1.mres.limbs)
r.c0.limbs2x.mul_asm_adx_inline(a.c0.mres.limbs, b.c0.mres.limbs)
D.limbs2x.mul_asm_adx_inline(a.c1.mres.limbs, b.c1.mres.limbs)
when Fp.has1extraBit():
t0.sumUnr(a.c0, a.c1)
t1.sumUnr(b.c0, b.c1)
else:
t0.sum(a.c0, a.c1)
t1.sum(b.c0, b.c1)
r.c1.limbs2x.mul_asm_adx_bmi2_impl(t0.mres.limbs, t1.mres.limbs)
r.c1.limbs2x.mul_asm_adx_inline(t0.mres.limbs, t1.mres.limbs)
when Fp.has1extraBit():
r.c1.diff2xUnr(r.c1, r.c0)
r.c1.diff2xUnr(r.c1, D)
@ -118,20 +118,20 @@ func mulx2x_complex_asm_adx_bmi2*(
r.c1.diff2xMod(r.c1, D)
r.c0.diff2xMod(r.c0, D)
func mulx_complex_asm_adx_bmi2*(
func mul_fp2_complex_asm_adx*(
r: var array[2, Fp],
a, b: array[2, Fp]
) =
## Complex multiplication on 𝔽p2
var d {.noInit.}: array[2,doublePrec(Fp)]
d.mulx2x_complex_asm_adx_bmi2(a, b)
r.c0.mres.limbs.montRed_asm_adx_bmi2_impl(
d.mul2x_fp2_complex_asm_adx(a, b)
r.c0.mres.limbs.redcMont_asm_adx_inline(
d.c0.limbs2x,
Fp.fieldMod().limbs,
Fp.getNegInvModWord(),
Fp.has1extraBit()
)
r.c1.mres.limbs.montRed_asm_adx_bmi2_impl(
r.c1.mres.limbs.redcMont_asm_adx_inline(
d.c1.limbs2x,
Fp.fieldMod().limbs,
Fp.getNegInvModWord(),

View File

@ -454,7 +454,7 @@ func prod*[C: static Curve](
# (c0 + c1 x) (u + v x) => u c0 + (u c0 + u c1)x + v c1 x²
# => u c0 + β v c1 + (v c0 + u c1) x
when a.fromComplexExtension() and u == 1 and v == 1:
let t = a.c0
let t {.noInit.} = a.c0
r.c0.diff(t, a.c1)
r.c1.sum(t, a.c1)
else:
@ -504,7 +504,7 @@ func prod2x*[C: static Curve](
const Beta {.used.} = C.getNonResidueFp()
when complex and U == 1 and V == 1:
let a1 = a.c1
let a1 {.noInit.} = a.c1
r.c1.sum2xMod(a.c0, a1)
r.c0.diff2xMod(a.c0, a1)
else:
@ -567,13 +567,13 @@ func `/=`*[C: static Curve](a: var Fp2[C], _: type NonResidue) =
# 1/2 * (c0 + c1, c1 - c0)
when a.fromComplexExtension() and u == 1 and v == 1:
let t = a.c0
let t {.noInit.} = a.c0
a.c0 += a.c1
a.c1 -= t
a.div2()
else:
var a0 = a.c0
let a1 = a.c1
var a0 {.noInit.} = a.c0
let a1 {.noInit.} = a.c1
const u2v2 = u*u - Beta*v*v # (u² - βv²)
# TODO can be precomputed to avoid costly inversion.
var u2v2inv {.noInit.}: a.c0.typeof
@ -605,7 +605,7 @@ func prod*(r: var QuadraticExt, a: QuadraticExt, _: type NonResidue) =
## - if sextic non-residue: 𝔽p8, 𝔽p12 or 𝔽p24
##
## Assumes that the non-residue is sqrt(lower extension non-residue)
let t = a.c0
let t {.noInit.} = a.c0
r.c0.prod(a.c1, NonResidue)
r.c1 = t
@ -628,7 +628,7 @@ func prod2x*(
_: type NonResidue) =
## Multiplication by non-residue
static: doAssert not(r.c0 is FpDbl), "Wrong dispatch, there is a specific non-residue multiplication for the base extension."
let t = a.c0
let t {.noInit.} = a.c0
r.c0.prod2x(a.c1, NonResidue)
`=`(r.c1, t) # "r.c1 = t", is refused by the compiler.
@ -650,7 +650,7 @@ func prod*(r: var CubicExt, a: CubicExt, _: type NonResidue) =
## For all curves γ = v with v the factor for the cubic extension coordinate
## and v³ = ξ
## (c0 + c1 v + c2 v²) v => ξ c2 + c0 v + c1 v²
let t = a.c2
let t {.noInit.} = a.c2
r.c1 = a.c0
r.c2 = a.c1
r.c0.prod(t, NonResidue)
@ -677,7 +677,7 @@ func prod2x*(
## For all curves γ = v with v the factor for cubic extension coordinate
## and v³ = ξ
## (c0 + c1 v + c2 v²) v => ξ c2 + c0 v + c1 v²
let t = a.c2
let t {.noInit.} = a.c2
r.c1 = a.c0
r.c2 = a.c1
r.c0.prod2x(t, NonResidue)
@ -1243,7 +1243,7 @@ func square*(r: var QuadraticExt, a: QuadraticExt) =
when true:
when UseASM_X86_64 and a.c0.mres.limbs.len <= 6 and r.typeof.has1extraBit():
if ({.noSideEffect.}: hasAdx()):
r.coords.sqrx_complex_sparebit_asm_adx_bmi2(a.coords)
r.coords.sqrx_complex_sparebit_asm_adx(a.coords)
else:
r.square_complex(a)
else:
@ -1281,7 +1281,7 @@ func prod*(r: var QuadraticExt, a, b: QuadraticExt) =
else: # faster
when UseASM_X86_64 and a.c0.mres.limbs.len <= 6:
if ({.noSideEffect.}: hasAdx()):
r.coords.mulx_complex_asm_adx_bmi2(a.coords, b.coords)
r.coords.mul_fp2_complex_asm_adx(a.coords, b.coords)
else:
var d {.noInit.}: doublePrec(typeof(r))
d.prod2x_complex(a, b)
@ -1318,7 +1318,7 @@ func prod2x*(r: var QuadraticExt2x, a, b: QuadraticExt) =
when a.fromComplexExtension():
when UseASM_X86_64 and a.c0.mres.limbs.len <= 6:
if ({.noSideEffect.}: hasAdx()):
r.coords.mulx2x_complex_asm_adx_bmi2(a.coords, b.coords)
r.coords.mul2x_fp2_complex_asm_adx(a.coords, b.coords)
else:
r.prod2x_complex(a, b)
else:
@ -1591,7 +1591,7 @@ func prod2xImpl(r: var CubicExt2x, a, b: CubicExt) =
V2.prod2x(a.c2, b.c2)
# r₀ = β ((a₁ + a₂)(b₁ + b₂) - v₁ - v₂) + v₀
when false: # CubicExt.has1extraBit():
when a.c0 is Fp and CubicExt.has1extraBit():
t0.sumUnr(a.c1, a.c2)
t1.sumUnr(b.c1, b.c2)
else:
@ -1604,7 +1604,7 @@ func prod2xImpl(r: var CubicExt2x, a, b: CubicExt) =
r.c0.sum2xMod(r.c0, V0)
# r₁ = (a₀ + a₁) * (b₀ + b₁) - v₀ - v₁ + β v₂
when false: # CubicExt.has1extraBit():
when a.c0 is Fp and CubicExt.has1extraBit():
t0.sumUnr(a.c0, a.c1)
t1.sumUnr(b.c0, b.c1)
else:
@ -1617,7 +1617,7 @@ func prod2xImpl(r: var CubicExt2x, a, b: CubicExt) =
r.c1.sum2xMod(r.c1, r.c2)
# r₂ = (a₀ + a₂) * (b₀ + b₂) - v₀ - v₂ + v₁
when false: # CubicExt.has1extraBit():
when a.c0 is Fp and CubicExt.has1extraBit():
t0.sumUnr(a.c0, a.c2)
t1.sumUnr(b.c0, b.c2)
else:

View File

@ -84,7 +84,11 @@ The optimizations can be of algebraic, algorithmic or "implementation details" n
- [ ] NAF recoding
- [ ] windowed-NAF recoding
- [ ] SIMD vectorized select in window algorithm
- [ ] Almost Montgomery Multiplication, https://eprint.iacr.org/2011/239.pdf
- [ ] Montgomery Multiplication with no final subtraction,
- Bos and Montgomery, https://eprint.iacr.org/2017/1057.pdf
- Colin D Walter, https://colinandmargaret.co.uk/Research/CDW_ELL_99.pdf
- Hachez and Quisquater, https://link.springer.com/content/pdf/10.1007%2F3-540-44499-8_23.pdf
- Gueron, https://eprint.iacr.org/2011/239.pdf
- [ ] Pippenger multi-exponentiation (variable-time)
- [ ] parallelized Pippenger

View File

@ -148,7 +148,7 @@ func random_unsafe(rng: var RngState, a: var FF) =
# Note: a simple modulo will be biased but it's simple and "fast"
reduced.reduce(unreduced, FF.fieldMod())
a.mres.montyResidue(reduced, FF.fieldMod(), FF.getR2modP(), FF.getNegInvModWord(), FF.getSpareBits())
a.mres.getMont(reduced, FF.fieldMod(), FF.getR2modP(), FF.getNegInvModWord(), FF.getSpareBits())
func random_unsafe(rng: var RngState, a: var ExtensionField) =
## Recursively initialize an extension Field element
@ -179,7 +179,7 @@ func random_highHammingWeight(rng: var RngState, a: var FF) =
# Note: a simple modulo will be biased but it's simple and "fast"
reduced.reduce(unreduced, FF.fieldMod())
a.mres.montyResidue(reduced, FF.fieldMod(), FF.getR2modP(), FF.getNegInvModWord(), FF.getSpareBits())
a.mres.getMont(reduced, FF.fieldMod(), FF.getR2modP(), FF.getNegInvModWord(), FF.getSpareBits())
func random_highHammingWeight(rng: var RngState, a: var ExtensionField) =
## Recursively initialize an extension Field element
@ -224,7 +224,7 @@ func random_long01Seq(rng: var RngState, a: var FF) =
# Note: a simple modulo will be biased but it's simple and "fast"
reduced.reduce(unreduced, FF.fieldMod())
a.mres.montyResidue(reduced, FF.fieldMod(), FF.getR2modP(), FF.getNegInvModWord(), FF.getSpareBits())
a.mres.getMont(reduced, FF.fieldMod(), FF.getR2modP(), FF.getNegInvModWord(), FF.getSpareBits())
func random_long01Seq(rng: var RngState, a: var ExtensionField) =
## Recursively initialize an extension Field element

View File

@ -8,7 +8,8 @@
import std/unittest,
../constantine/arithmetic,
../constantine/io/io_fields,
../constantine/arithmetic/limbs_montgomery,
../constantine/io/[io_bigints, io_fields],
../constantine/config/curves
static: doAssert defined(testingCurves), "This modules requires the -d:testingCurves compile option"
@ -317,4 +318,54 @@ proc largeField() =
check: bool r.isZero()
test "fromMont doesn't need a final substraction with 256-bit prime (full word used)":
block:
var a: Fp[Secp256k1]
a.mres = Fp[Secp256k1].getMontyPrimeMinus1()
let expected = BigInt[256].fromHex"0xFFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFE FFFFFC2E"
var r: BigInt[256]
r.fromField(a)
check: bool(r == expected)
block:
var a: Fp[Secp256k1]
var d: FpDbl[Secp256k1]
# Set Montgomery repr to the largest field element in Montgomery Residue form
a.mres = BigInt[256].fromHex"0xFFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFE FFFFFC2E"
d.limbs2x = (BigInt[512].fromHex"0xFFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFE FFFFFC2E").limbs
var r, expected: BigInt[256]
r.fromField(a)
expected.limbs.redc2xMont(d.limbs2x, Secp256k1.Mod().limbs, Fp[Secp256k1].getNegInvModWord(), Fp[Secp256k1].getSpareBits())
check: bool(r == expected)
test "fromMont doesn't need a final substraction with 255-bit prime (1 spare bit)":
block:
var a: Fp[Curve25519]
a.mres = Fp[Curve25519].getMontyPrimeMinus1()
let expected = BigInt[255].fromHex"0x7fffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffec"
var r: BigInt[255]
r.fromField(a)
check: bool(r == expected)
block:
var a: Fp[Curve25519]
var d: FpDbl[Curve25519]
# Set Montgomery repr to the largest field element in Montgomery Residue form
a.mres = BigInt[255].fromHex"0x7fffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffec"
d.limbs2x = (BigInt[512].fromHex"0x7fffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffec").limbs
var r, expected: BigInt[255]
r.fromField(a)
expected.limbs.redc2xMont(d.limbs2x, Curve25519.Mod().limbs, Fp[Curve25519].getNegInvModWord(), Fp[Curve25519].getSpareBits())
check: bool(r == expected)
largeField()