Low-level refactoring (#175)
* Add specific fromMont conversion routine. Rename montyResidue to getMont * missed test file * Add x86_64 ASM for fromMont * Add x86_64 MULX/ADCX/ADOX for fromMont * rework Montgomery Multiplication with prefetch/latency hiding techniques * Fix ADX autodetection, closes #174. Rollback faster mul_mont attempt, no improvement and debug pain. * finalSub in fromMont & adx_bmi -> adx * Some {.noInit.} to avoid Nim zeroMem (which should be optimized away but who knows) * Uniformize name 'op+domain': mulmod - mulmont * Fix asm codegen bug "0x0000555555565930 <+896>: sbb 0x20(%r8),%r8" with Clang in final substraction * Prepare for skipping final substraction * Don't forget to copy the result when we skip the final substraction * Seems like we need to stash the idea of skipping the final substraction for now, needs bounds analysis https://eprint.iacr.org/2017/1057.pdf * fix condition for ASM 32-bit * optim modular addition when sparebit is available
This commit is contained in:
parent
53c4db7ead
commit
14af7e8724
|
@ -91,6 +91,20 @@ proc sqrBench*(T: typedesc, iters: int) =
|
||||||
bench("Squaring", T, iters):
|
bench("Squaring", T, iters):
|
||||||
r.square(x)
|
r.square(x)
|
||||||
|
|
||||||
|
proc toBigBench*(T: typedesc, iters: int) =
|
||||||
|
var r: matchingBigInt(T.C)
|
||||||
|
let x = rng.random_unsafe(T)
|
||||||
|
preventOptimAway(r)
|
||||||
|
bench("BigInt <- field conversion", T, iters):
|
||||||
|
r.fromField(x)
|
||||||
|
|
||||||
|
proc toFieldBench*(T: typedesc, iters: int) =
|
||||||
|
var r: T
|
||||||
|
let x = rng.random_unsafe(matchingBigInt(T.C))
|
||||||
|
preventOptimAway(r)
|
||||||
|
bench("BigInt -> field conversion", T, iters):
|
||||||
|
r.fromBig(x)
|
||||||
|
|
||||||
proc invBench*(T: typedesc, iters: int) =
|
proc invBench*(T: typedesc, iters: int) =
|
||||||
var r: T
|
var r: T
|
||||||
let x = rng.random_unsafe(T)
|
let x = rng.random_unsafe(T)
|
||||||
|
|
|
@ -50,6 +50,9 @@ proc main() =
|
||||||
mulBench(Fp[curve], Iters)
|
mulBench(Fp[curve], Iters)
|
||||||
sqrBench(Fp[curve], Iters)
|
sqrBench(Fp[curve], Iters)
|
||||||
smallSeparator()
|
smallSeparator()
|
||||||
|
toBigBench(Fp[curve], Iters)
|
||||||
|
toFieldBench(Fp[curve], Iters)
|
||||||
|
smallSeparator()
|
||||||
invBench(Fp[curve], ExponentIters)
|
invBench(Fp[curve], ExponentIters)
|
||||||
sqrtBench(Fp[curve], ExponentIters)
|
sqrtBench(Fp[curve], ExponentIters)
|
||||||
sqrtRatioBench(Fp[curve], ExponentIters)
|
sqrtRatioBench(Fp[curve], ExponentIters)
|
||||||
|
|
|
@ -24,14 +24,111 @@ import
|
||||||
# They are nice to let the compiler deals with mov
|
# They are nice to let the compiler deals with mov
|
||||||
# but too constraining so we move things ourselves.
|
# but too constraining so we move things ourselves.
|
||||||
|
|
||||||
static: doAssert UseASM_X86_64
|
static: doAssert UseASM_X86_32
|
||||||
|
|
||||||
{.localPassC:"-fomit-frame-pointer".} # Needed so that the compiler finds enough registers
|
{.localPassC:"-fomit-frame-pointer".} # Needed so that the compiler finds enough registers
|
||||||
|
|
||||||
|
proc finalSubNoCarryImpl*(
|
||||||
|
ctx: var Assembler_x86,
|
||||||
|
r: Operand or OperandArray,
|
||||||
|
a, M, scratch: OperandArray
|
||||||
|
) =
|
||||||
|
## Reduce `a` into `r` modulo `M`
|
||||||
|
##
|
||||||
|
## r, a, scratch, scratchReg are mutated
|
||||||
|
## M is read-only
|
||||||
|
let N = M.len
|
||||||
|
ctx.comment "Final substraction (no carry)"
|
||||||
|
for i in 0 ..< N:
|
||||||
|
ctx.mov scratch[i], a[i]
|
||||||
|
if i == 0:
|
||||||
|
ctx.sub scratch[i], M[i]
|
||||||
|
else:
|
||||||
|
ctx.sbb scratch[i], M[i]
|
||||||
|
|
||||||
|
# If we borrowed it means that we were smaller than
|
||||||
|
# the modulus and we don't need "scratch"
|
||||||
|
for i in 0 ..< N:
|
||||||
|
ctx.cmovnc a[i], scratch[i]
|
||||||
|
ctx.mov r[i], a[i]
|
||||||
|
|
||||||
|
proc finalSubMayCarryImpl*(
|
||||||
|
ctx: var Assembler_x86,
|
||||||
|
r: Operand or OperandArray,
|
||||||
|
a, M, scratch: OperandArray,
|
||||||
|
scratchReg: Operand or Register or OperandReuse
|
||||||
|
) =
|
||||||
|
## Reduce `a` into `r` modulo `M`
|
||||||
|
## To be used when the final substraction can
|
||||||
|
## also depend on the carry flag
|
||||||
|
##
|
||||||
|
## r, a, scratch, scratchReg are mutated
|
||||||
|
## M is read-only
|
||||||
|
|
||||||
|
ctx.comment "Final substraction (may carry)"
|
||||||
|
|
||||||
|
# Mask: scratchReg contains 0xFFFF or 0x0000
|
||||||
|
ctx.sbb scratchReg, scratchReg
|
||||||
|
|
||||||
|
# Now substract the modulus to test a < p
|
||||||
|
let N = M.len
|
||||||
|
for i in 0 ..< N:
|
||||||
|
ctx.mov scratch[i], a[i]
|
||||||
|
if i == 0:
|
||||||
|
ctx.sub scratch[i], M[i]
|
||||||
|
else:
|
||||||
|
ctx.sbb scratch[i], M[i]
|
||||||
|
|
||||||
|
# If it overflows here, it means that it was
|
||||||
|
# smaller than the modulus and we don't need `scratch`
|
||||||
|
ctx.sbb scratchReg, 0
|
||||||
|
|
||||||
|
# If we borrowed it means that we were smaller than
|
||||||
|
# the modulus and we don't need "scratch"
|
||||||
|
for i in 0 ..< N:
|
||||||
|
ctx.cmovnc a[i], scratch[i]
|
||||||
|
ctx.mov r[i], a[i]
|
||||||
|
|
||||||
|
macro finalSub_gen*[N: static int](
|
||||||
|
r_PIR: var array[N, SecretWord],
|
||||||
|
a_EIR, M_PIR: array[N, SecretWord],
|
||||||
|
scratch_EIR: var array[N, SecretWord],
|
||||||
|
mayCarry: static bool): untyped =
|
||||||
|
## Returns:
|
||||||
|
## a-M if a > M
|
||||||
|
## a otherwise
|
||||||
|
##
|
||||||
|
## - r_PIR is a pointer to the result array, mutated,
|
||||||
|
## - a_EIR is an array of registers, mutated,
|
||||||
|
## - M_PIR is a pointer to an array, read-only,
|
||||||
|
## - scratch_EIR is an array of registers, mutated
|
||||||
|
## - mayCarry is set to true when the carry flag also needs to be read
|
||||||
|
result = newStmtList()
|
||||||
|
|
||||||
|
var ctx = init(Assembler_x86, BaseType)
|
||||||
|
let
|
||||||
|
r = init(OperandArray, nimSymbol = r_PIR, N, PointerInReg, InputOutput)
|
||||||
|
# We reuse the reg used for b for overflow detection
|
||||||
|
a = init(OperandArray, nimSymbol = a_EIR, N, ElemsInReg, InputOutput)
|
||||||
|
# We could force m as immediate by specializing per moduli
|
||||||
|
M = init(OperandArray, nimSymbol = M_PIR, N, PointerInReg, Input)
|
||||||
|
t = init(OperandArray, nimSymbol = scratch_EIR, N, ElemsInReg, Output_EarlyClobber)
|
||||||
|
|
||||||
|
if mayCarry:
|
||||||
|
ctx.finalSubMayCarryImpl(
|
||||||
|
r, a, M, t, rax
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
ctx.finalSubNoCarryImpl(
|
||||||
|
r, a, M, t
|
||||||
|
)
|
||||||
|
|
||||||
|
result.add ctx.generate()
|
||||||
|
|
||||||
# Field addition
|
# Field addition
|
||||||
# ------------------------------------------------------------
|
# ------------------------------------------------------------
|
||||||
|
|
||||||
macro addmod_gen[N: static int](R: var Limbs[N], A, B, m: Limbs[N]): untyped =
|
macro addmod_gen[N: static int](R: var Limbs[N], A, B, m: Limbs[N], hasSpareBit: static bool): untyped =
|
||||||
## Generate an optimized modular addition kernel
|
## Generate an optimized modular addition kernel
|
||||||
# Register pressure note:
|
# Register pressure note:
|
||||||
# We could generate a kernel per modulus m by hardcoding it as immediate
|
# We could generate a kernel per modulus m by hardcoding it as immediate
|
||||||
|
@ -68,33 +165,18 @@ macro addmod_gen[N: static int](R: var Limbs[N], A, B, m: Limbs[N]): untyped =
|
||||||
# Interleaved copy in a second buffer as well
|
# Interleaved copy in a second buffer as well
|
||||||
ctx.mov v[i], u[i]
|
ctx.mov v[i], u[i]
|
||||||
|
|
||||||
# Mask: overflowed contains 0xFFFF or 0x0000
|
if hasSparebit:
|
||||||
# TODO: unnecessary if MSB never set, i.e. "Field.getSpareBits >= 1"
|
ctx.finalSubNoCarryImpl(r, u, M, v)
|
||||||
let overflowed = b.reuseRegister()
|
|
||||||
ctx.sbb overflowed, overflowed
|
|
||||||
|
|
||||||
# Now substract the modulus to test a < p
|
|
||||||
for i in 0 ..< N:
|
|
||||||
if i == 0:
|
|
||||||
ctx.sub v[0], M[0]
|
|
||||||
else:
|
else:
|
||||||
ctx.sbb v[i], M[i]
|
ctx.finalSubMayCarryImpl(
|
||||||
|
r, u, M, v, b.reuseRegister()
|
||||||
|
)
|
||||||
|
|
||||||
# If it overflows here, it means that it was
|
result.add ctx.generate()
|
||||||
# smaller than the modulus and we don't need V
|
|
||||||
ctx.sbb overflowed, 0
|
|
||||||
|
|
||||||
# Conditional Mov and
|
func addmod_asm*(r: var Limbs, a, b, m: Limbs, hasSpareBit: static bool) =
|
||||||
# and store result
|
|
||||||
for i in 0 ..< N:
|
|
||||||
ctx.cmovnc u[i], v[i]
|
|
||||||
ctx.mov r[i], u[i]
|
|
||||||
|
|
||||||
result.add ctx.generate
|
|
||||||
|
|
||||||
func addmod_asm*(r: var Limbs, a, b, m: Limbs) =
|
|
||||||
## Constant-time modular addition
|
## Constant-time modular addition
|
||||||
addmod_gen(r, a, b, m)
|
addmod_gen(r, a, b, m, hasSpareBit)
|
||||||
|
|
||||||
# Field substraction
|
# Field substraction
|
||||||
# ------------------------------------------------------------
|
# ------------------------------------------------------------
|
||||||
|
@ -225,7 +307,7 @@ when isMainModule:
|
||||||
debugecho " a: ", a.toHex()
|
debugecho " a: ", a.toHex()
|
||||||
debugecho " b: ", b.toHex()
|
debugecho " b: ", b.toHex()
|
||||||
debugecho " m: ", m.toHex()
|
debugecho " m: ", m.toHex()
|
||||||
addmod_asm(a, a, b, m)
|
addmod_asm(a, a, b, m, hasSpareBit = false)
|
||||||
debugecho "after:"
|
debugecho "after:"
|
||||||
debugecho " a: ", a.toHex().tolower
|
debugecho " a: ", a.toHex().tolower
|
||||||
debugecho " s: ", s
|
debugecho " s: ", s
|
||||||
|
@ -245,7 +327,7 @@ when isMainModule:
|
||||||
debugecho " a: ", a.toHex()
|
debugecho " a: ", a.toHex()
|
||||||
debugecho " b: ", b.toHex()
|
debugecho " b: ", b.toHex()
|
||||||
debugecho " m: ", m.toHex()
|
debugecho " m: ", m.toHex()
|
||||||
addmod_asm(a, a, b, m)
|
addmod_asm(a, a, b, m, hasSpareBit = false)
|
||||||
debugecho "after:"
|
debugecho "after:"
|
||||||
debugecho " a: ", a.toHex().tolower
|
debugecho " a: ", a.toHex().tolower
|
||||||
debugecho " s: ", s
|
debugecho " s: ", s
|
||||||
|
@ -265,7 +347,7 @@ when isMainModule:
|
||||||
debugecho " a: ", a.toHex()
|
debugecho " a: ", a.toHex()
|
||||||
debugecho " b: ", b.toHex()
|
debugecho " b: ", b.toHex()
|
||||||
debugecho " m: ", m.toHex()
|
debugecho " m: ", m.toHex()
|
||||||
addmod_asm(a, a, b, m)
|
addmod_asm(a, a, b, m, hasSpareBit = false)
|
||||||
debugecho "after:"
|
debugecho "after:"
|
||||||
debugecho " a: ", a.toHex().tolower
|
debugecho " a: ", a.toHex().tolower
|
||||||
debugecho " s: ", s
|
debugecho " s: ", s
|
||||||
|
@ -285,7 +367,7 @@ when isMainModule:
|
||||||
debugecho " a: ", a.toHex()
|
debugecho " a: ", a.toHex()
|
||||||
debugecho " b: ", b.toHex()
|
debugecho " b: ", b.toHex()
|
||||||
debugecho " m: ", m.toHex()
|
debugecho " m: ", m.toHex()
|
||||||
addmod_asm(a, a, b, m)
|
addmod_asm(a, a, b, m, hasSpareBit = false)
|
||||||
debugecho "after:"
|
debugecho "after:"
|
||||||
debugecho " a: ", a.toHex().tolower
|
debugecho " a: ", a.toHex().tolower
|
||||||
debugecho " s: ", s
|
debugecho " s: ", s
|
||||||
|
@ -308,7 +390,7 @@ when isMainModule:
|
||||||
debugecho " a: ", a.toHex()
|
debugecho " a: ", a.toHex()
|
||||||
debugecho " b: ", b.toHex()
|
debugecho " b: ", b.toHex()
|
||||||
debugecho " m: ", m.toHex()
|
debugecho " m: ", m.toHex()
|
||||||
submod_asm(a, a, b, m)
|
submod_asm(a, a, b, m, hasSpareBit = false)
|
||||||
debugecho "after:"
|
debugecho "after:"
|
||||||
debugecho " a: ", a.toHex().tolower
|
debugecho " a: ", a.toHex().tolower
|
||||||
debugecho " s: ", s
|
debugecho " s: ", s
|
||||||
|
@ -333,7 +415,7 @@ when isMainModule:
|
||||||
debugecho " a: ", a.toHex()
|
debugecho " a: ", a.toHex()
|
||||||
debugecho " b: ", b.toHex()
|
debugecho " b: ", b.toHex()
|
||||||
debugecho " m: ", m.toHex()
|
debugecho " m: ", m.toHex()
|
||||||
submod_asm(r, a, b, m)
|
submod_asm(r, a, b, m, hasSpareBit = false)
|
||||||
debugecho "after:"
|
debugecho "after:"
|
||||||
debugecho " r: ", r.toHex().tolower
|
debugecho " r: ", r.toHex().tolower
|
||||||
debugecho " s: ", s
|
debugecho " s: ", s
|
||||||
|
|
|
@ -1,160 +0,0 @@
|
||||||
# Constantine
|
|
||||||
# Copyright (c) 2018-2019 Status Research & Development GmbH
|
|
||||||
# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
|
|
||||||
# Licensed and distributed under either of
|
|
||||||
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
|
|
||||||
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
|
|
||||||
# at your option. This file may not be copied, modified, or distributed except according to those terms.
|
|
||||||
|
|
||||||
import
|
|
||||||
# Standard library
|
|
||||||
std/macros,
|
|
||||||
# Internal
|
|
||||||
../../config/common,
|
|
||||||
../../primitives,
|
|
||||||
./limbs_asm_montred_x86
|
|
||||||
|
|
||||||
# ############################################################
|
|
||||||
#
|
|
||||||
# Assembly implementation of finite fields
|
|
||||||
#
|
|
||||||
# ############################################################
|
|
||||||
|
|
||||||
static: doAssert UseASM_X86_64
|
|
||||||
|
|
||||||
# MULX/ADCX/ADOX
|
|
||||||
{.localPassC:"-madx -mbmi2".}
|
|
||||||
# Necessary for the compiler to find enough registers (enabled at -O1)
|
|
||||||
{.localPassC:"-fomit-frame-pointer".}
|
|
||||||
|
|
||||||
# No exceptions allowed
|
|
||||||
{.push raises: [].}
|
|
||||||
|
|
||||||
# Montgomery reduction
|
|
||||||
# ------------------------------------------------------------
|
|
||||||
|
|
||||||
macro montyRedc2x_adx_gen*[N: static int](
|
|
||||||
r_MR: var array[N, SecretWord],
|
|
||||||
a_MR: array[N*2, SecretWord],
|
|
||||||
M_MR: array[N, SecretWord],
|
|
||||||
m0ninv_MR: BaseType,
|
|
||||||
hasSpareBit: static bool
|
|
||||||
) =
|
|
||||||
result = newStmtList()
|
|
||||||
|
|
||||||
var ctx = init(Assembler_x86, BaseType)
|
|
||||||
let
|
|
||||||
# We could force M as immediate by specializing per moduli
|
|
||||||
M = init(OperandArray, nimSymbol = M_MR, N, PointerInReg, Input)
|
|
||||||
|
|
||||||
let uSlots = N+1
|
|
||||||
let vSlots = max(N-1, 5)
|
|
||||||
|
|
||||||
var # Scratchspaces
|
|
||||||
u = init(OperandArray, nimSymbol = ident"U", uSlots, ElemsInReg, InputOutput_EnsureClobber)
|
|
||||||
v = init(OperandArray, nimSymbol = ident"V", vSlots, ElemsInReg, InputOutput_EnsureClobber)
|
|
||||||
|
|
||||||
# Prologue
|
|
||||||
let usym = u.nimSymbol
|
|
||||||
let vsym = v.nimSymbol
|
|
||||||
result.add quote do:
|
|
||||||
static: doAssert: sizeof(SecretWord) == sizeof(ByteAddress)
|
|
||||||
var `usym`{.noinit.}: Limbs[`uSlots`]
|
|
||||||
var `vsym` {.noInit.}: Limbs[`vSlots`]
|
|
||||||
`vsym`[0] = cast[SecretWord](`r_MR`[0].unsafeAddr)
|
|
||||||
`vsym`[1] = cast[SecretWord](`a_MR`[0].unsafeAddr)
|
|
||||||
`vsym`[2] = SecretWord(`m0ninv_MR`)
|
|
||||||
|
|
||||||
let r_temp = v[0].asArrayAddr(len = N)
|
|
||||||
let a = v[1].asArrayAddr(len = 2*N)
|
|
||||||
let m0ninv = v[2]
|
|
||||||
let lo = v[3]
|
|
||||||
let hi = v[4]
|
|
||||||
|
|
||||||
# Algorithm
|
|
||||||
# ---------------------------------------------------------
|
|
||||||
# for i in 0 .. n-1:
|
|
||||||
# hi <- 0
|
|
||||||
# m <- a[i] * m0ninv mod 2^w (i.e. simple multiplication)
|
|
||||||
# for j in 0 .. n-1:
|
|
||||||
# (hi, lo) <- a[i+j] + m * M[j] + hi
|
|
||||||
# a[i+j] <- lo
|
|
||||||
# a[i+n] += hi
|
|
||||||
# for i in 0 .. n-1:
|
|
||||||
# r[i] = a[i+n]
|
|
||||||
# if r >= M:
|
|
||||||
# r -= M
|
|
||||||
|
|
||||||
# No register spilling handling
|
|
||||||
doAssert N <= 6, "The Assembly-optimized montgomery multiplication requires at most 6 limbs."
|
|
||||||
|
|
||||||
ctx.mov rdx, m0ninv
|
|
||||||
|
|
||||||
for i in 0 ..< N:
|
|
||||||
ctx.mov u[i], a[i]
|
|
||||||
|
|
||||||
for i in 0 ..< N:
|
|
||||||
# RDX contains m0ninv at the start of each loop
|
|
||||||
ctx.comment ""
|
|
||||||
ctx.imul rdx, u[0] # m <- a[i] * m0ninv mod 2^w
|
|
||||||
ctx.comment "---- Reduction " & $i
|
|
||||||
ctx.`xor` u[N], u[N]
|
|
||||||
|
|
||||||
for j in 0 ..< N-1:
|
|
||||||
ctx.comment ""
|
|
||||||
ctx.mulx hi, lo, M[j], rdx
|
|
||||||
ctx.adcx u[j], lo
|
|
||||||
ctx.adox u[j+1], hi
|
|
||||||
|
|
||||||
# Last limb
|
|
||||||
ctx.comment ""
|
|
||||||
ctx.mulx hi, lo, M[N-1], rdx
|
|
||||||
ctx.mov rdx, m0ninv # Reload m0ninv for next iter
|
|
||||||
ctx.adcx u[N-1], lo
|
|
||||||
ctx.adox hi, u[N]
|
|
||||||
ctx.adcx u[N], hi
|
|
||||||
|
|
||||||
u.rotateLeft()
|
|
||||||
|
|
||||||
ctx.mov rdx, r_temp
|
|
||||||
let r = rdx.asArrayAddr(len = N)
|
|
||||||
|
|
||||||
# This does a[i+n] += hi
|
|
||||||
# but in a separate carry chain, fused with the
|
|
||||||
# copy "r[i] = a[i+n]"
|
|
||||||
for i in 0 ..< N:
|
|
||||||
if i == 0:
|
|
||||||
ctx.add u[i], a[i+N]
|
|
||||||
else:
|
|
||||||
ctx.adc u[i], a[i+N]
|
|
||||||
|
|
||||||
let t = repackRegisters(v, u[N])
|
|
||||||
|
|
||||||
if hasSpareBit:
|
|
||||||
ctx.finalSubNoCarry(r, u, M, t)
|
|
||||||
else:
|
|
||||||
ctx.finalSubCanOverflow(r, u, M, t, hi)
|
|
||||||
|
|
||||||
# Code generation
|
|
||||||
result.add ctx.generate()
|
|
||||||
|
|
||||||
func montRed_asm_adx_bmi2_impl*[N: static int](
|
|
||||||
r: var array[N, SecretWord],
|
|
||||||
a: array[N*2, SecretWord],
|
|
||||||
M: array[N, SecretWord],
|
|
||||||
m0ninv: BaseType,
|
|
||||||
hasSpareBit: static bool
|
|
||||||
) =
|
|
||||||
## Constant-time Montgomery reduction
|
|
||||||
## Inline-version
|
|
||||||
montyRedc2x_adx_gen(r, a, M, m0ninv, hasSpareBit)
|
|
||||||
|
|
||||||
func montRed_asm_adx_bmi2*[N: static int](
|
|
||||||
r: var array[N, SecretWord],
|
|
||||||
a: array[N*2, SecretWord],
|
|
||||||
M: array[N, SecretWord],
|
|
||||||
m0ninv: BaseType,
|
|
||||||
hasSpareBit: static bool
|
|
||||||
) =
|
|
||||||
## Constant-time Montgomery reduction
|
|
||||||
montRed_asm_adx_bmi2_impl(r, a, M, m0ninv, hasSpareBit)
|
|
|
@ -12,7 +12,8 @@ import
|
||||||
# Internal
|
# Internal
|
||||||
../../config/common,
|
../../config/common,
|
||||||
../../primitives,
|
../../primitives,
|
||||||
./limbs_asm_montred_x86,
|
./limbs_asm_modular_x86,
|
||||||
|
./limbs_asm_redc_mont_x86,
|
||||||
./limbs_asm_mul_x86
|
./limbs_asm_mul_x86
|
||||||
|
|
||||||
# ############################################################
|
# ############################################################
|
||||||
|
@ -34,7 +35,11 @@ static: doAssert UseASM_X86_64
|
||||||
# Montgomery multiplication
|
# Montgomery multiplication
|
||||||
# ------------------------------------------------------------
|
# ------------------------------------------------------------
|
||||||
# Fallback when no ADX and BMI2 support (MULX, ADCX, ADOX)
|
# Fallback when no ADX and BMI2 support (MULX, ADCX, ADOX)
|
||||||
macro montMul_CIOS_sparebit_gen[N: static int](r_MM: var Limbs[N], a_MM, b_MM, M_MM: Limbs[N], m0ninv_MM: BaseType): untyped =
|
macro mulMont_CIOS_sparebit_gen[N: static int](
|
||||||
|
r_PIR: var Limbs[N], a_PIR, b_PIR,
|
||||||
|
M_PIR: Limbs[N], m0ninv_REG: BaseType,
|
||||||
|
skipReduction: static bool
|
||||||
|
): untyped =
|
||||||
## Generate an optimized Montgomery Multiplication kernel
|
## Generate an optimized Montgomery Multiplication kernel
|
||||||
## using the CIOS method
|
## using the CIOS method
|
||||||
##
|
##
|
||||||
|
@ -44,14 +49,17 @@ macro montMul_CIOS_sparebit_gen[N: static int](r_MM: var Limbs[N], a_MM, b_MM, M
|
||||||
## M[^1] < high(SecretWord) shr 2 (i.e. less than 0b00111...1111)
|
## M[^1] < high(SecretWord) shr 2 (i.e. less than 0b00111...1111)
|
||||||
## https://hackmd.io/@zkteam/modular_multiplication
|
## https://hackmd.io/@zkteam/modular_multiplication
|
||||||
|
|
||||||
|
# No register spilling handling
|
||||||
|
doAssert N <= 6, "The Assembly-optimized montgomery multiplication requires at most 6 limbs."
|
||||||
|
|
||||||
result = newStmtList()
|
result = newStmtList()
|
||||||
|
|
||||||
var ctx = init(Assembler_x86, BaseType)
|
var ctx = init(Assembler_x86, BaseType)
|
||||||
let
|
let
|
||||||
scratchSlots = max(N, 6)
|
scratchSlots = 6
|
||||||
|
|
||||||
# We could force M as immediate by specializing per moduli
|
# We could force M as immediate by specializing per moduli
|
||||||
M = init(OperandArray, nimSymbol = M_MM, N, PointerInReg, Input)
|
M = init(OperandArray, nimSymbol = M_PIR, N, PointerInReg, Input)
|
||||||
# If N is too big, we need to spill registers. TODO.
|
# If N is too big, we need to spill registers. TODO.
|
||||||
t = init(OperandArray, nimSymbol = ident"t", N, ElemsInReg, Output_EarlyClobber)
|
t = init(OperandArray, nimSymbol = ident"t", N, ElemsInReg, Output_EarlyClobber)
|
||||||
# MultiPurpose Register slots
|
# MultiPurpose Register slots
|
||||||
|
@ -62,10 +70,10 @@ macro montMul_CIOS_sparebit_gen[N: static int](r_MM: var Limbs[N], a_MM, b_MM, M
|
||||||
m0ninv = Operand(
|
m0ninv = Operand(
|
||||||
desc: OperandDesc(
|
desc: OperandDesc(
|
||||||
asmId: "[m0ninv]",
|
asmId: "[m0ninv]",
|
||||||
nimSymbol: m0ninv_MM,
|
nimSymbol: m0ninv_REG,
|
||||||
rm: MemOffsettable,
|
rm: MemOffsettable,
|
||||||
constraint: Input,
|
constraint: Input,
|
||||||
cEmit: "&" & $m0ninv_MM
|
cEmit: "&" & $m0ninv_REG
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@ -76,7 +84,7 @@ macro montMul_CIOS_sparebit_gen[N: static int](r_MM: var Limbs[N], a_MM, b_MM, M
|
||||||
b = scratch[1].asArrayAddr(len = N) # Store the `b` operand
|
b = scratch[1].asArrayAddr(len = N) # Store the `b` operand
|
||||||
A = scratch[2] # High part of extended precision multiplication
|
A = scratch[2] # High part of extended precision multiplication
|
||||||
C = scratch[3]
|
C = scratch[3]
|
||||||
m = scratch[4] # Stores (t[0] * m0ninv) mod 2^w
|
m = scratch[4] # Stores (t[0] * m0ninv) mod 2ʷ
|
||||||
r = scratch[5] # Stores the `r` operand
|
r = scratch[5] # Stores the `r` operand
|
||||||
|
|
||||||
# Registers used:
|
# Registers used:
|
||||||
|
@ -94,18 +102,18 @@ macro montMul_CIOS_sparebit_gen[N: static int](r_MM: var Limbs[N], a_MM, b_MM, M
|
||||||
result.add quote do:
|
result.add quote do:
|
||||||
static: doAssert: sizeof(SecretWord) == sizeof(ByteAddress)
|
static: doAssert: sizeof(SecretWord) == sizeof(ByteAddress)
|
||||||
|
|
||||||
var `tsym`: typeof(`r_MM`) # zero init
|
var `tsym`: typeof(`r_PIR`) # zero init
|
||||||
# Assumes 64-bit limbs on 64-bit arch (or you can't store an address)
|
# Assumes 64-bit limbs on 64-bit arch (or you can't store an address)
|
||||||
var `scratchSym` {.noInit.}: Limbs[`scratchSlots`]
|
var `scratchSym` {.noInit.}: Limbs[`scratchSlots`]
|
||||||
`scratchSym`[0] = cast[SecretWord](`a_MM`[0].unsafeAddr)
|
`scratchSym`[0] = cast[SecretWord](`a_PIR`[0].unsafeAddr)
|
||||||
`scratchSym`[1] = cast[SecretWord](`b_MM`[0].unsafeAddr)
|
`scratchSym`[1] = cast[SecretWord](`b_PIR`[0].unsafeAddr)
|
||||||
`scratchSym`[5] = cast[SecretWord](`r_MM`[0].unsafeAddr)
|
`scratchSym`[5] = cast[SecretWord](`r_PIR`[0].unsafeAddr)
|
||||||
|
|
||||||
# Algorithm
|
# Algorithm
|
||||||
# -----------------------------------------
|
# -----------------------------------------
|
||||||
# for i=0 to N-1
|
# for i=0 to N-1
|
||||||
# (A, t[0]) <- a[0] * b[i] + t[0]
|
# (A, t[0]) <- a[0] * b[i] + t[0]
|
||||||
# m <- (t[0] * m0ninv) mod 2^w
|
# m <- (t[0] * m0ninv) mod 2ʷ
|
||||||
# (C, _) <- m * M[0] + t[0]
|
# (C, _) <- m * M[0] + t[0]
|
||||||
# for j=1 to N-1
|
# for j=1 to N-1
|
||||||
# (A, t[j]) <- a[j] * b[i] + A + t[j]
|
# (A, t[j]) <- a[j] * b[i] + A + t[j]
|
||||||
|
@ -127,7 +135,7 @@ macro montMul_CIOS_sparebit_gen[N: static int](r_MM: var Limbs[N], a_MM, b_MM, M
|
||||||
ctx.adc rdx, 0
|
ctx.adc rdx, 0
|
||||||
ctx.mov A, rdx
|
ctx.mov A, rdx
|
||||||
|
|
||||||
# m <- (t[0] * m0ninv) mod 2^w
|
# m <- (t[0] * m0ninv) mod 2ʷ
|
||||||
ctx.mov m, m0ninv
|
ctx.mov m, m0ninv
|
||||||
ctx.imul m, t[0]
|
ctx.imul m, t[0]
|
||||||
|
|
||||||
|
@ -164,19 +172,27 @@ macro montMul_CIOS_sparebit_gen[N: static int](r_MM: var Limbs[N], a_MM, b_MM, M
|
||||||
ctx.add A, C
|
ctx.add A, C
|
||||||
ctx.mov t[N-1], A
|
ctx.mov t[N-1], A
|
||||||
|
|
||||||
ctx.mov rdx, r
|
ctx.mov rax, r # move r away from scratchspace that will be used for final substraction
|
||||||
let r2 = rdx.asArrayAddr(len = N)
|
let r2 = rax.asArrayAddr(len = N)
|
||||||
|
|
||||||
ctx.finalSubNoCarry(
|
if skipReduction:
|
||||||
|
for i in 0 ..< N:
|
||||||
|
ctx.mov r2[i], t[i]
|
||||||
|
else:
|
||||||
|
ctx.finalSubNoCarryImpl(
|
||||||
r2, t, M,
|
r2, t, M,
|
||||||
scratch
|
scratch
|
||||||
)
|
)
|
||||||
|
result.add ctx.generate()
|
||||||
|
|
||||||
result.add ctx.generate
|
func mulMont_CIOS_sparebit_asm*(r: var Limbs, a, b, M: Limbs, m0ninv: BaseType, skipReduction: static bool = false) =
|
||||||
|
## Constant-time Montgomery multiplication
|
||||||
func montMul_CIOS_sparebit_asm*(r: var Limbs, a, b, M: Limbs, m0ninv: BaseType) =
|
## If "skipReduction" is set
|
||||||
## Constant-time modular multiplication
|
## the result is in the range [0, 2M)
|
||||||
montMul_CIOS_sparebit_gen(r, a, b, M, m0ninv)
|
## otherwise the result is in the range [0, M)
|
||||||
|
##
|
||||||
|
## This procedure can only be called if the modulus doesn't use the full bitwidth of its underlying representation
|
||||||
|
r.mulMont_CIOS_sparebit_gen(a, b, M, m0ninv, skipReduction)
|
||||||
|
|
||||||
# Montgomery Squaring
|
# Montgomery Squaring
|
||||||
# ------------------------------------------------------------
|
# ------------------------------------------------------------
|
||||||
|
@ -189,25 +205,12 @@ func square_asm_inline[rLen, aLen: static int](r: var Limbs[rLen], a: Limbs[aLen
|
||||||
## but not for stack variables
|
## but not for stack variables
|
||||||
sqr_gen(r, a)
|
sqr_gen(r, a)
|
||||||
|
|
||||||
func montRed_asm_inline[N: static int](
|
func squareMont_CIOS_asm*[N](
|
||||||
r: var array[N, SecretWord],
|
|
||||||
a: array[N*2, SecretWord],
|
|
||||||
M: array[N, SecretWord],
|
|
||||||
m0ninv: BaseType,
|
|
||||||
hasSpareBit: static bool
|
|
||||||
) {.inline.} =
|
|
||||||
## Constant-time Montgomery reduction
|
|
||||||
## Extra indirection as the generator assumes that
|
|
||||||
## arrays are pointers, which is true for parameters
|
|
||||||
## but not for stack variables
|
|
||||||
montyRedc2x_gen(r, a, M, m0ninv, hasSpareBit)
|
|
||||||
|
|
||||||
func montSquare_CIOS_asm*[N](
|
|
||||||
r: var Limbs[N],
|
r: var Limbs[N],
|
||||||
a, M: Limbs[N],
|
a, M: Limbs[N],
|
||||||
m0ninv: BaseType,
|
m0ninv: BaseType,
|
||||||
hasSpareBit: static bool) =
|
hasSpareBit, skipReduction: static bool) =
|
||||||
## Constant-time modular squaring
|
## Constant-time modular squaring
|
||||||
var r2x {.noInit.}: Limbs[2*N]
|
var r2x {.noInit.}: Limbs[2*N]
|
||||||
r2x.square_asm_inline(a)
|
r2x.square_asm_inline(a)
|
||||||
r.montRed_asm_inline(r2x, M, m0ninv, hasSpareBit)
|
r.redcMont_asm_inline(r2x, M, m0ninv, hasSpareBit, skipReduction)
|
|
@ -8,12 +8,12 @@
|
||||||
|
|
||||||
import
|
import
|
||||||
# Standard library
|
# Standard library
|
||||||
std/macros,
|
std/[macros, algorithm],
|
||||||
# Internal
|
# Internal
|
||||||
../../config/common,
|
../../config/common,
|
||||||
../../primitives,
|
../../primitives,
|
||||||
./limbs_asm_montred_x86,
|
./limbs_asm_modular_x86,
|
||||||
./limbs_asm_montred_x86_adx_bmi2,
|
./limbs_asm_redc_mont_x86_adx_bmi2,
|
||||||
./limbs_asm_mul_x86_adx_bmi2
|
./limbs_asm_mul_x86_adx_bmi2
|
||||||
|
|
||||||
# ############################################################
|
# ############################################################
|
||||||
|
@ -36,6 +36,7 @@ static: doAssert UseASM_X86_64
|
||||||
|
|
||||||
# Montgomery Multiplication
|
# Montgomery Multiplication
|
||||||
# ------------------------------------------------------------
|
# ------------------------------------------------------------
|
||||||
|
|
||||||
proc mulx_by_word(
|
proc mulx_by_word(
|
||||||
ctx: var Assembler_x86,
|
ctx: var Assembler_x86,
|
||||||
hi: Operand,
|
hi: Operand,
|
||||||
|
@ -149,7 +150,7 @@ proc partialRedx(
|
||||||
ctx.mov rdx, t[0]
|
ctx.mov rdx, t[0]
|
||||||
ctx.imul rdx, m0ninv
|
ctx.imul rdx, m0ninv
|
||||||
|
|
||||||
# Clear carry flags - TODO: necessary?
|
# Clear carry flags
|
||||||
ctx.`xor` S, S
|
ctx.`xor` S, S
|
||||||
|
|
||||||
# S,_ := t[0] + m*M[0]
|
# S,_ := t[0] + m*M[0]
|
||||||
|
@ -158,6 +159,8 @@ proc partialRedx(
|
||||||
ctx.adcx lo, t[0] # set the carry flag for the future ADCX
|
ctx.adcx lo, t[0] # set the carry flag for the future ADCX
|
||||||
ctx.mov t[0], S
|
ctx.mov t[0], S
|
||||||
|
|
||||||
|
ctx.mov lo, 0
|
||||||
|
|
||||||
# for j=1 to N-1
|
# for j=1 to N-1
|
||||||
# (S,t[j-1]) := t[j] + m*M[j] + S
|
# (S,t[j-1]) := t[j] + m*M[j] + S
|
||||||
ctx.comment " for j=1 to N-1"
|
ctx.comment " for j=1 to N-1"
|
||||||
|
@ -170,26 +173,31 @@ proc partialRedx(
|
||||||
# Last carries
|
# Last carries
|
||||||
# t[N-1] = S + C
|
# t[N-1] = S + C
|
||||||
ctx.comment " Reduction carry "
|
ctx.comment " Reduction carry "
|
||||||
ctx.mov S, 0
|
ctx.adcx lo, C # lo contains 0 so C += S
|
||||||
ctx.adcx t[N-1], S
|
ctx.adox t[N-1], lo
|
||||||
ctx.adox t[N-1], C
|
|
||||||
|
|
||||||
macro montMul_CIOS_sparebit_adx_bmi2_gen[N: static int](r_MM: var Limbs[N], a_MM, b_MM, M_MM: Limbs[N], m0ninv_MM: BaseType): untyped =
|
macro mulMont_CIOS_sparebit_adx_gen[N: static int](
|
||||||
|
r_PIR: var Limbs[N], a_PIR, b_PIR,
|
||||||
|
M_PIR: Limbs[N], m0ninv_REG: BaseType,
|
||||||
|
skipReduction: static bool): untyped =
|
||||||
## Generate an optimized Montgomery Multiplication kernel
|
## Generate an optimized Montgomery Multiplication kernel
|
||||||
## using the CIOS method
|
## using the CIOS method
|
||||||
## This requires the most significant word of the Modulus
|
## This requires the most significant word of the Modulus
|
||||||
## M[^1] < high(SecretWord) shr 2 (i.e. less than 0b00111...1111)
|
## M[^1] < high(SecretWord) shr 2 (i.e. less than 0b00111...1111)
|
||||||
## https://hackmd.io/@zkteam/modular_multiplication
|
## https://hackmd.io/@zkteam/modular_multiplication
|
||||||
|
|
||||||
|
# No register spilling handling
|
||||||
|
doAssert N <= 6, "The Assembly-optimized montgomery multiplication requires at most 6 limbs."
|
||||||
|
|
||||||
result = newStmtList()
|
result = newStmtList()
|
||||||
|
|
||||||
var ctx = init(Assembler_x86, BaseType)
|
var ctx = init(Assembler_x86, BaseType)
|
||||||
let
|
let
|
||||||
scratchSlots = max(N, 6)
|
scratchSlots = 6
|
||||||
|
|
||||||
r = init(OperandArray, nimSymbol = r_MM, N, PointerInReg, InputOutput_EnsureClobber)
|
r = init(OperandArray, nimSymbol = r_PIR, N, PointerInReg, InputOutput_EnsureClobber)
|
||||||
# We could force M as immediate by specializing per moduli
|
# We could force M as immediate by specializing per moduli
|
||||||
M = init(OperandArray, nimSymbol = M_MM, N, PointerInReg, Input)
|
M = init(OperandArray, nimSymbol = M_PIR, N, PointerInReg, Input)
|
||||||
# If N is too big, we need to spill registers. TODO.
|
# If N is too big, we need to spill registers. TODO.
|
||||||
t = init(OperandArray, nimSymbol = ident"t", N, ElemsInReg, Output_EarlyClobber)
|
t = init(OperandArray, nimSymbol = ident"t", N, ElemsInReg, Output_EarlyClobber)
|
||||||
# MultiPurpose Register slots
|
# MultiPurpose Register slots
|
||||||
|
@ -220,12 +228,12 @@ macro montMul_CIOS_sparebit_adx_bmi2_gen[N: static int](r_MM: var Limbs[N], a_MM
|
||||||
result.add quote do:
|
result.add quote do:
|
||||||
static: doAssert: sizeof(SecretWord) == sizeof(ByteAddress)
|
static: doAssert: sizeof(SecretWord) == sizeof(ByteAddress)
|
||||||
|
|
||||||
var `tsym`: typeof(`r_MM`) # zero init
|
var `tsym`{.noInit.}: typeof(`r_PIR`) # zero init
|
||||||
# Assumes 64-bit limbs on 64-bit arch (or you can't store an address)
|
# Assumes 64-bit limbs on 64-bit arch (or you can't store an address)
|
||||||
var `scratchSym` {.noInit.}: Limbs[`scratchSlots`]
|
var `scratchSym` {.noInit.}: Limbs[`scratchSlots`]
|
||||||
`scratchSym`[0] = cast[SecretWord](`a_MM`[0].unsafeAddr)
|
`scratchSym`[0] = cast[SecretWord](`a_PIR`[0].unsafeAddr)
|
||||||
`scratchSym`[1] = cast[SecretWord](`b_MM`[0].unsafeAddr)
|
`scratchSym`[1] = cast[SecretWord](`b_PIR`[0].unsafeAddr)
|
||||||
`scratchSym`[4] = SecretWord `m0ninv_MM`
|
`scratchSym`[4] = SecretWord `m0ninv_REG`
|
||||||
|
|
||||||
# Algorithm
|
# Algorithm
|
||||||
# -----------------------------------------
|
# -----------------------------------------
|
||||||
|
@ -238,9 +246,6 @@ macro montMul_CIOS_sparebit_adx_bmi2_gen[N: static int](r_MM: var Limbs[N], a_MM
|
||||||
# (C,t[j-1]) := t[j] + m*M[j] + C
|
# (C,t[j-1]) := t[j] + m*M[j] + C
|
||||||
# t[N-1] = C + A
|
# t[N-1] = C + A
|
||||||
|
|
||||||
# No register spilling handling
|
|
||||||
doAssert N <= 6, "The Assembly-optimized montgomery multiplication requires at most 6 limbs."
|
|
||||||
|
|
||||||
for i in 0 ..< N:
|
for i in 0 ..< N:
|
||||||
if i == 0:
|
if i == 0:
|
||||||
ctx.mulx_by_word(
|
ctx.mulx_by_word(
|
||||||
|
@ -263,47 +268,35 @@ macro montMul_CIOS_sparebit_adx_bmi2_gen[N: static int](r_MM: var Limbs[N], a_MM
|
||||||
lo, C
|
lo, C
|
||||||
)
|
)
|
||||||
|
|
||||||
ctx.finalSubNoCarry(
|
if skipReduction:
|
||||||
|
for i in 0 ..< N:
|
||||||
|
ctx.mov r[i], t[i]
|
||||||
|
else:
|
||||||
|
ctx.finalSubNoCarryImpl(
|
||||||
r, t, M,
|
r, t, M,
|
||||||
scratch
|
scratch
|
||||||
)
|
)
|
||||||
|
|
||||||
result.add ctx.generate
|
result.add ctx.generate
|
||||||
|
|
||||||
func montMul_CIOS_sparebit_asm_adx_bmi2*(r: var Limbs, a, b, M: Limbs, m0ninv: BaseType) =
|
func mulMont_CIOS_sparebit_asm_adx*(r: var Limbs, a, b, M: Limbs, m0ninv: BaseType, skipReduction: static bool = false) =
|
||||||
## Constant-time modular multiplication
|
## Constant-time Montgomery multiplication
|
||||||
## Requires the prime modulus to have a spare bit in the representation. (Hence if using 64-bit words and 4 words, to be at most 255-bit)
|
## If "skipReduction" is set
|
||||||
montMul_CIOS_sparebit_adx_bmi2_gen(r, a, b, M, m0ninv)
|
## the result is in the range [0, 2M)
|
||||||
|
## otherwise the result is in the range [0, M)
|
||||||
|
##
|
||||||
|
## This procedure can only be called if the modulus doesn't use the full bitwidth of its underlying representation
|
||||||
|
r.mulMont_CIOS_sparebit_adx_gen(a, b, M, m0ninv, skipReduction)
|
||||||
|
|
||||||
# Montgomery Squaring
|
# Montgomery Squaring
|
||||||
# ------------------------------------------------------------
|
# ------------------------------------------------------------
|
||||||
|
|
||||||
func square_asm_adx_bmi2_inline[rLen, aLen: static int](r: var Limbs[rLen], a: Limbs[aLen]) {.inline.} =
|
func squareMont_CIOS_asm_adx*[N](
|
||||||
## Multi-precision Squaring
|
|
||||||
## Extra indirection as the generator assumes that
|
|
||||||
## arrays are pointers, which is true for parameters
|
|
||||||
## but not for stack variables.
|
|
||||||
sqrx_gen(r, a)
|
|
||||||
|
|
||||||
func montRed_asm_adx_bmi2_inline[N: static int](
|
|
||||||
r: var array[N, SecretWord],
|
|
||||||
a: array[N*2, SecretWord],
|
|
||||||
M: array[N, SecretWord],
|
|
||||||
m0ninv: BaseType,
|
|
||||||
hasSpareBit: static bool
|
|
||||||
) {.inline.} =
|
|
||||||
## Constant-time Montgomery reduction
|
|
||||||
## Extra indirection as the generator assumes that
|
|
||||||
## arrays are pointers, which is true for parameters
|
|
||||||
## but not for stack variables.
|
|
||||||
montyRedc2x_adx_gen(r, a, M, m0ninv, hasSpareBit)
|
|
||||||
|
|
||||||
func montSquare_CIOS_asm_adx_bmi2*[N](
|
|
||||||
r: var Limbs[N],
|
r: var Limbs[N],
|
||||||
a, M: Limbs[N],
|
a, M: Limbs[N],
|
||||||
m0ninv: BaseType,
|
m0ninv: BaseType,
|
||||||
hasSpareBit: static bool) =
|
hasSpareBit, skipReduction: static bool) =
|
||||||
## Constant-time modular squaring
|
## Constant-time modular squaring
|
||||||
var r2x {.noInit.}: Limbs[2*N]
|
var r2x {.noInit.}: Limbs[2*N]
|
||||||
r2x.square_asm_adx_bmi2_inline(a)
|
r2x.square_asm_adx_inline(a)
|
||||||
r.montRed_asm_adx_bmi2_inline(r2x, M, m0ninv, hasSpareBit)
|
r.redcMont_asm_adx(r2x, M, m0ninv, hasSpareBit, skipReduction)
|
|
@ -111,7 +111,7 @@ proc mulaccx_by_word(
|
||||||
ctx.adcx hi, rdx
|
ctx.adcx hi, rdx
|
||||||
ctx.adox hi, rdx
|
ctx.adox hi, rdx
|
||||||
|
|
||||||
macro mulx_gen[rLen, aLen, bLen: static int](rx: var Limbs[rLen], ax: Limbs[aLen], bx: Limbs[bLen]) =
|
macro mulx_gen[rLen, aLen, bLen: static int](r_PIR: var Limbs[rLen], a_PIR: Limbs[aLen], b_PIR: Limbs[bLen]) =
|
||||||
## `a`, `b`, `r` can have a different number of limbs
|
## `a`, `b`, `r` can have a different number of limbs
|
||||||
## if `r`.limbs.len < a.limbs.len + b.limbs.len
|
## if `r`.limbs.len < a.limbs.len + b.limbs.len
|
||||||
## The result will be truncated, i.e. it will be
|
## The result will be truncated, i.e. it will be
|
||||||
|
@ -123,9 +123,9 @@ macro mulx_gen[rLen, aLen, bLen: static int](rx: var Limbs[rLen], ax: Limbs[aLen
|
||||||
|
|
||||||
var ctx = init(Assembler_x86, BaseType)
|
var ctx = init(Assembler_x86, BaseType)
|
||||||
let
|
let
|
||||||
r = init(OperandArray, nimSymbol = rx, rLen, PointerInReg, InputOutput_EnsureClobber)
|
r = init(OperandArray, nimSymbol = r_PIR, rLen, PointerInReg, InputOutput_EnsureClobber)
|
||||||
a = init(OperandArray, nimSymbol = ax, aLen, PointerInReg, Input)
|
a = init(OperandArray, nimSymbol = a_PIR, aLen, PointerInReg, Input)
|
||||||
b = init(OperandArray, nimSymbol = bx, bLen, PointerInReg, Input)
|
b = init(OperandArray, nimSymbol = b_PIR, bLen, PointerInReg, Input)
|
||||||
|
|
||||||
# MULX requires RDX
|
# MULX requires RDX
|
||||||
|
|
||||||
|
@ -168,24 +168,24 @@ macro mulx_gen[rLen, aLen, bLen: static int](rx: var Limbs[rLen], ax: Limbs[aLen
|
||||||
# Codegen
|
# Codegen
|
||||||
result.add ctx.generate
|
result.add ctx.generate
|
||||||
|
|
||||||
func mul_asm_adx_bmi2_impl*[rLen, aLen, bLen: static int](
|
func mul_asm_adx_inline*[rLen, aLen, bLen: static int](
|
||||||
r: var Limbs[rLen], a: Limbs[aLen], b: Limbs[bLen]) {.inline.} =
|
r: var Limbs[rLen], a: Limbs[aLen], b: Limbs[bLen]) {.inline.} =
|
||||||
## Multi-precision Multiplication
|
## Multi-precision Multiplication
|
||||||
## Assumes r doesn't alias a or b
|
## Assumes r doesn't alias a or b
|
||||||
## Inline version
|
## Inline version
|
||||||
mulx_gen(r, a, b)
|
mulx_gen(r, a, b)
|
||||||
|
|
||||||
func mul_asm_adx_bmi2*[rLen, aLen, bLen: static int](
|
func mul_asm_adx*[rLen, aLen, bLen: static int](
|
||||||
r: var Limbs[rLen], a: Limbs[aLen], b: Limbs[bLen]) =
|
r: var Limbs[rLen], a: Limbs[aLen], b: Limbs[bLen]) =
|
||||||
## Multi-precision Multiplication
|
## Multi-precision Multiplication
|
||||||
## Assumes r doesn't alias a or b
|
## Assumes r doesn't alias a or b
|
||||||
mul_asm_adx_bmi2_impl(r, a, b)
|
mul_asm_adx_inline(r, a, b)
|
||||||
|
|
||||||
# Squaring
|
# Squaring
|
||||||
# -----------------------------------------------------------------------------------------------
|
# -----------------------------------------------------------------------------------------------
|
||||||
#
|
#
|
||||||
# Strategy:
|
# Strategy:
|
||||||
# We want to use the same scheduling as mul_asm_adx_bmi2
|
# We want to use the same scheduling as mul_asm_adx
|
||||||
# and so process `a[0..<N]` by `word`
|
# and so process `a[0..<N]` by `word`
|
||||||
# and store the intermediate result in `[t[n..1]:r0]`
|
# and store the intermediate result in `[t[n..1]:r0]`
|
||||||
#
|
#
|
||||||
|
@ -561,7 +561,7 @@ func sqrx_gen6L(ctx: var Assembler_x86, r, a: OperandArray, t: var OperandArray)
|
||||||
merge_diag_and_partsum(r, a, hi1, lo1, zero, 4)
|
merge_diag_and_partsum(r, a, hi1, lo1, zero, 4)
|
||||||
merge_diag_and_partsum(r, a, hi2, lo2, zero, 5)
|
merge_diag_and_partsum(r, a, hi2, lo2, zero, 5)
|
||||||
|
|
||||||
macro sqrx_gen*[rLen, aLen: static int](rx: var Limbs[rLen], ax: Limbs[aLen]) =
|
macro sqrx_gen*[rLen, aLen: static int](r_PIR: var Limbs[rLen], a_PIR: Limbs[aLen]) =
|
||||||
## Squaring
|
## Squaring
|
||||||
## `a` and `r` can have a different number of limbs
|
## `a` and `r` can have a different number of limbs
|
||||||
## if `r`.limbs.len < a.limbs.len * 2
|
## if `r`.limbs.len < a.limbs.len * 2
|
||||||
|
@ -578,8 +578,8 @@ macro sqrx_gen*[rLen, aLen: static int](rx: var Limbs[rLen], ax: Limbs[aLen]) =
|
||||||
# t = 2 * a.len = 12
|
# t = 2 * a.len = 12
|
||||||
# We use the full x86 register set.
|
# We use the full x86 register set.
|
||||||
|
|
||||||
r = init(OperandArray, nimSymbol = rx, rLen, PointerInReg, InputOutput)
|
r = init(OperandArray, nimSymbol = r_PIR, rLen, PointerInReg, InputOutput)
|
||||||
a = init(OperandArray, nimSymbol = ax, aLen, PointerInReg, Input)
|
a = init(OperandArray, nimSymbol = a_PIR, aLen, PointerInReg, Input)
|
||||||
|
|
||||||
# MULX requires RDX
|
# MULX requires RDX
|
||||||
|
|
||||||
|
@ -604,10 +604,15 @@ macro sqrx_gen*[rLen, aLen: static int](rx: var Limbs[rLen], ax: Limbs[aLen]) =
|
||||||
# Codegen
|
# Codegen
|
||||||
result.add ctx.generate
|
result.add ctx.generate
|
||||||
|
|
||||||
func square_asm_adx_bmi2*[rLen, aLen: static int](r: var Limbs[rLen], a: Limbs[aLen]) =
|
func square_asm_adx_inline*[rLen, aLen: static int](r: var Limbs[rLen], a: Limbs[aLen]) {.inline.} =
|
||||||
|
## Multi-precision Squaring
|
||||||
|
## inline version
|
||||||
|
sqrx_gen(r, a)
|
||||||
|
|
||||||
|
func square_asm_adx*[rLen, aLen: static int](r: var Limbs[rLen], a: Limbs[aLen]) =
|
||||||
## Multi-precision Squaring
|
## Multi-precision Squaring
|
||||||
## Assumes r doesn't alias a
|
## Assumes r doesn't alias a
|
||||||
sqrx_gen(r, a)
|
square_asm_adx_inline(r, a)
|
||||||
|
|
||||||
|
|
||||||
# Sanity checks
|
# Sanity checks
|
||||||
|
@ -628,8 +633,8 @@ when isMainModule:
|
||||||
|
|
||||||
var a2x, expected: Limbs[8]
|
var a2x, expected: Limbs[8]
|
||||||
|
|
||||||
a2x.square_asm_adx_bmi2(a)
|
a2x.square_asm_adx(a)
|
||||||
expected.mul_asm_adx_bmi2(a, a)
|
expected.mul_asm_adx(a, a)
|
||||||
debugecho "--------------------------------"
|
debugecho "--------------------------------"
|
||||||
debugecho "before:"
|
debugecho "before:"
|
||||||
debugecho " a : ", a.toString()
|
debugecho " a : ", a.toString()
|
||||||
|
@ -644,8 +649,8 @@ when isMainModule:
|
||||||
|
|
||||||
var a2x, expected: Limbs[8]
|
var a2x, expected: Limbs[8]
|
||||||
|
|
||||||
a2x.square_asm_adx_bmi2(a)
|
a2x.square_asm_adx(a)
|
||||||
expected.mul_asm_adx_bmi2(a, a)
|
expected.mul_asm_adx(a, a)
|
||||||
debugecho "--------------------------------"
|
debugecho "--------------------------------"
|
||||||
debugecho "before:"
|
debugecho "before:"
|
||||||
debugecho " a : ", a.toString()
|
debugecho " a : ", a.toString()
|
||||||
|
@ -667,8 +672,8 @@ when isMainModule:
|
||||||
|
|
||||||
var a2x, expected: Limbs[12]
|
var a2x, expected: Limbs[12]
|
||||||
|
|
||||||
a2x.square_asm_adx_bmi2(a)
|
a2x.square_asm_adx(a)
|
||||||
expected.mul_asm_adx_bmi2(a, a)
|
expected.mul_asm_adx(a, a)
|
||||||
debugecho "--------------------------------"
|
debugecho "--------------------------------"
|
||||||
debugecho "before:"
|
debugecho "before:"
|
||||||
debugecho " a : ", a.toString()
|
debugecho " a : ", a.toString()
|
||||||
|
@ -683,8 +688,8 @@ when isMainModule:
|
||||||
|
|
||||||
var a2x, expected: Limbs[12]
|
var a2x, expected: Limbs[12]
|
||||||
|
|
||||||
a2x.square_asm_adx_bmi2(a)
|
a2x.square_asm_adx(a)
|
||||||
expected.mul_asm_adx_bmi2(a, a)
|
expected.mul_asm_adx(a, a)
|
||||||
debugecho "--------------------------------"
|
debugecho "--------------------------------"
|
||||||
debugecho "before:"
|
debugecho "before:"
|
||||||
debugecho " a : ", a.toString()
|
debugecho " a : ", a.toString()
|
||||||
|
|
|
@ -11,7 +11,8 @@ import
|
||||||
std/macros,
|
std/macros,
|
||||||
# Internal
|
# Internal
|
||||||
../../config/common,
|
../../config/common,
|
||||||
../../primitives
|
../../primitives,
|
||||||
|
./limbs_asm_modular_x86
|
||||||
|
|
||||||
# ############################################################
|
# ############################################################
|
||||||
#
|
#
|
||||||
|
@ -25,76 +26,21 @@ static: doAssert UseASM_X86_32
|
||||||
# Necessary for the compiler to find enough registers (enabled at -O1)
|
# Necessary for the compiler to find enough registers (enabled at -O1)
|
||||||
{.localPassC:"-fomit-frame-pointer".}
|
{.localPassC:"-fomit-frame-pointer".}
|
||||||
|
|
||||||
proc finalSubNoCarry*(
|
|
||||||
ctx: var Assembler_x86,
|
|
||||||
r: Operand or OperandArray,
|
|
||||||
a, M, scratch: OperandArray
|
|
||||||
) =
|
|
||||||
## Reduce `a` into `r` modulo `M`
|
|
||||||
let N = M.len
|
|
||||||
ctx.comment "Final substraction (no carry)"
|
|
||||||
for i in 0 ..< N:
|
|
||||||
ctx.mov scratch[i], a[i]
|
|
||||||
if i == 0:
|
|
||||||
ctx.sub scratch[i], M[i]
|
|
||||||
else:
|
|
||||||
ctx.sbb scratch[i], M[i]
|
|
||||||
|
|
||||||
# If we borrowed it means that we were smaller than
|
|
||||||
# the modulus and we don't need "scratch"
|
|
||||||
for i in 0 ..< N:
|
|
||||||
ctx.cmovnc a[i], scratch[i]
|
|
||||||
ctx.mov r[i], a[i]
|
|
||||||
|
|
||||||
proc finalSubCanOverflow*(
|
|
||||||
ctx: var Assembler_x86,
|
|
||||||
r: Operand or OperandArray,
|
|
||||||
a, M, scratch: OperandArray,
|
|
||||||
overflowReg: Operand or Register
|
|
||||||
) =
|
|
||||||
## Reduce `a` into `r` modulo `M`
|
|
||||||
## To be used when the final substraction can
|
|
||||||
## also depend on the carry flag
|
|
||||||
## This is in particular possible when the MSB
|
|
||||||
## is set for the prime modulus
|
|
||||||
## `overflowReg` should be a register that will be used
|
|
||||||
## to store the carry flag
|
|
||||||
|
|
||||||
ctx.comment "Final substraction (may carry)"
|
|
||||||
|
|
||||||
# Mask: overflowed contains 0xFFFF or 0x0000
|
|
||||||
ctx.sbb overflowReg, overflowReg
|
|
||||||
|
|
||||||
# Now substract the modulus to test a < p
|
|
||||||
let N = M.len
|
|
||||||
for i in 0 ..< N:
|
|
||||||
ctx.mov scratch[i], a[i]
|
|
||||||
if i == 0:
|
|
||||||
ctx.sub scratch[i], M[i]
|
|
||||||
else:
|
|
||||||
ctx.sbb scratch[i], M[i]
|
|
||||||
|
|
||||||
# If it overflows here, it means that it was
|
|
||||||
# smaller than the modulus and we don't need `scratch`
|
|
||||||
ctx.sbb overflowReg, 0
|
|
||||||
|
|
||||||
# If we borrowed it means that we were smaller than
|
|
||||||
# the modulus and we don't need "scratch"
|
|
||||||
for i in 0 ..< N:
|
|
||||||
ctx.cmovnc a[i], scratch[i]
|
|
||||||
ctx.mov r[i], a[i]
|
|
||||||
|
|
||||||
|
|
||||||
# Montgomery reduction
|
# Montgomery reduction
|
||||||
# ------------------------------------------------------------
|
# ------------------------------------------------------------
|
||||||
|
|
||||||
macro montyRedc2x_gen*[N: static int](
|
macro redc2xMont_gen*[N: static int](
|
||||||
r_MR: var array[N, SecretWord],
|
r_PIR: var array[N, SecretWord],
|
||||||
a_MR: array[N*2, SecretWord],
|
a_PIR: array[N*2, SecretWord],
|
||||||
M_MR: array[N, SecretWord],
|
M_PIR: array[N, SecretWord],
|
||||||
m0ninv_MR: BaseType,
|
m0ninv_REG: BaseType,
|
||||||
hasSpareBit: static bool
|
hasSpareBit, skipReduction: static bool
|
||||||
) =
|
) =
|
||||||
|
|
||||||
|
# No register spilling handling
|
||||||
|
doAssert N > 2, "The Assembly-optimized montgomery reduction requires a minimum of 2 limbs."
|
||||||
|
doAssert N <= 6, "The Assembly-optimized montgomery reduction requires at most 6 limbs."
|
||||||
|
|
||||||
result = newStmtList()
|
result = newStmtList()
|
||||||
|
|
||||||
var ctx = init(Assembler_x86, BaseType)
|
var ctx = init(Assembler_x86, BaseType)
|
||||||
|
@ -103,7 +49,7 @@ macro montyRedc2x_gen*[N: static int](
|
||||||
# so we store everything in scratchspaces restoring as needed
|
# so we store everything in scratchspaces restoring as needed
|
||||||
let
|
let
|
||||||
# We could force M as immediate by specializing per moduli
|
# We could force M as immediate by specializing per moduli
|
||||||
M = init(OperandArray, nimSymbol = M_MR, N, PointerInReg, Input)
|
M = init(OperandArray, nimSymbol = M_PIR, N, PointerInReg, Input)
|
||||||
# MUL requires RAX and RDX
|
# MUL requires RAX and RDX
|
||||||
|
|
||||||
let uSlots = N+2
|
let uSlots = N+2
|
||||||
|
@ -119,9 +65,9 @@ macro montyRedc2x_gen*[N: static int](
|
||||||
result.add quote do:
|
result.add quote do:
|
||||||
var `usym`{.noinit.}: Limbs[`uSlots`]
|
var `usym`{.noinit.}: Limbs[`uSlots`]
|
||||||
var `vsym` {.noInit.}: Limbs[`vSlots`]
|
var `vsym` {.noInit.}: Limbs[`vSlots`]
|
||||||
`vsym`[0] = cast[SecretWord](`r_MR`[0].unsafeAddr)
|
`vsym`[0] = cast[SecretWord](`r_PIR`[0].unsafeAddr)
|
||||||
`vsym`[1] = cast[SecretWord](`a_MR`[0].unsafeAddr)
|
`vsym`[1] = cast[SecretWord](`a_PIR`[0].unsafeAddr)
|
||||||
`vsym`[2] = SecretWord(`m0ninv_MR`)
|
`vsym`[2] = SecretWord(`m0ninv_REG`)
|
||||||
|
|
||||||
let r_temp = v[0].asArrayAddr(len = N)
|
let r_temp = v[0].asArrayAddr(len = N)
|
||||||
let a = v[1].asArrayAddr(len = 2*N)
|
let a = v[1].asArrayAddr(len = 2*N)
|
||||||
|
@ -131,7 +77,7 @@ macro montyRedc2x_gen*[N: static int](
|
||||||
# ---------------------------------------------------------
|
# ---------------------------------------------------------
|
||||||
# for i in 0 .. n-1:
|
# for i in 0 .. n-1:
|
||||||
# hi <- 0
|
# hi <- 0
|
||||||
# m <- a[i] * m0ninv mod 2^w (i.e. simple multiplication)
|
# m <- a[i] * m0ninv mod 2ʷ (i.e. simple multiplication)
|
||||||
# for j in 0 .. n-1:
|
# for j in 0 .. n-1:
|
||||||
# (hi, lo) <- a[i+j] + m * M[j] + hi
|
# (hi, lo) <- a[i+j] + m * M[j] + hi
|
||||||
# a[i+j] <- lo
|
# a[i+j] <- lo
|
||||||
|
@ -141,15 +87,11 @@ macro montyRedc2x_gen*[N: static int](
|
||||||
# if r >= M:
|
# if r >= M:
|
||||||
# r -= M
|
# r -= M
|
||||||
|
|
||||||
# No register spilling handling
|
|
||||||
doAssert N > 2, "The Assembly-optimized montgomery reduction requires a minimum of 2 limbs."
|
|
||||||
doAssert N <= 6, "The Assembly-optimized montgomery reduction requires at most 6 limbs."
|
|
||||||
|
|
||||||
for i in 0 ..< N:
|
for i in 0 ..< N:
|
||||||
ctx.mov u[i], a[i]
|
ctx.mov u[i], a[i]
|
||||||
|
|
||||||
ctx.mov u[N], u[0]
|
ctx.mov u[N], u[0]
|
||||||
ctx.imul u[0], m0ninv # m <- a[i] * m0ninv mod 2^w
|
ctx.imul u[0], m0ninv # m <- a[i] * m0ninv mod 2ʷ
|
||||||
ctx.mov rax, u[0]
|
ctx.mov rax, u[0]
|
||||||
|
|
||||||
# scratch: [a[0] * m0, a[1], a[2], a[3], a[0]] for 4 limbs
|
# scratch: [a[0] * m0, a[1], a[2], a[3], a[0]] for 4 limbs
|
||||||
|
@ -208,27 +150,138 @@ macro montyRedc2x_gen*[N: static int](
|
||||||
else:
|
else:
|
||||||
ctx.adc u[i], a[i+N]
|
ctx.adc u[i], a[i+N]
|
||||||
|
|
||||||
|
# v is invalidated from now on
|
||||||
let t = repackRegisters(v, u[N], u[N+1])
|
let t = repackRegisters(v, u[N], u[N+1])
|
||||||
|
|
||||||
# v is invalidated
|
if hasSpareBit and skipReduction:
|
||||||
if hasSpareBit:
|
for i in 0 ..< N:
|
||||||
ctx.finalSubNoCarry(r, u, M, t)
|
ctx.mov r[i], t[i]
|
||||||
|
elif hasSpareBit:
|
||||||
|
ctx.finalSubNoCarryImpl(r, u, M, t)
|
||||||
else:
|
else:
|
||||||
ctx.finalSubCanOverflow(r, u, M, t, rax)
|
ctx.finalSubMayCarryImpl(r, u, M, t, rax)
|
||||||
|
|
||||||
# Code generation
|
# Code generation
|
||||||
result.add ctx.generate()
|
result.add ctx.generate()
|
||||||
|
|
||||||
func montRed_asm*[N: static int](
|
func redcMont_asm_inline*[N: static int](
|
||||||
r: var array[N, SecretWord],
|
r: var array[N, SecretWord],
|
||||||
a: array[N*2, SecretWord],
|
a: array[N*2, SecretWord],
|
||||||
M: array[N, SecretWord],
|
M: array[N, SecretWord],
|
||||||
m0ninv: BaseType,
|
m0ninv: BaseType,
|
||||||
hasSpareBit: static bool
|
hasSpareBit: static bool,
|
||||||
|
skipReduction: static bool = false
|
||||||
|
) {.inline.} =
|
||||||
|
## Constant-time Montgomery reduction
|
||||||
|
## Inline-version
|
||||||
|
redc2xMont_gen(r, a, M, m0ninv, hasSpareBit, skipReduction)
|
||||||
|
|
||||||
|
func redcMont_asm*[N: static int](
|
||||||
|
r: var array[N, SecretWord],
|
||||||
|
a: array[N*2, SecretWord],
|
||||||
|
M: array[N, SecretWord],
|
||||||
|
m0ninv: BaseType,
|
||||||
|
hasSpareBit, skipReduction: static bool
|
||||||
) =
|
) =
|
||||||
## Constant-time Montgomery reduction
|
## Constant-time Montgomery reduction
|
||||||
static: doAssert UseASM_X86_64, "This requires x86-64."
|
static: doAssert UseASM_X86_64, "This requires x86-64."
|
||||||
montyRedc2x_gen(r, a, M, m0ninv, hasSpareBit)
|
redcMont_asm_inline(r, a, M, m0ninv, hasSpareBit, skipReduction)
|
||||||
|
|
||||||
|
# Montgomery conversion
|
||||||
|
# ----------------------------------------------------------
|
||||||
|
|
||||||
|
macro mulMont_by_1_gen[N: static int](
|
||||||
|
t_EIR: var array[N, SecretWord],
|
||||||
|
M_PIR: array[N, SecretWord],
|
||||||
|
m0ninv_REG: BaseType) =
|
||||||
|
|
||||||
|
# No register spilling handling
|
||||||
|
doAssert N <= 6, "The Assembly-optimized montgomery reduction requires at most 6 limbs."
|
||||||
|
|
||||||
|
result = newStmtList()
|
||||||
|
|
||||||
|
var ctx = init(Assembler_x86, BaseType)
|
||||||
|
# On x86, compilers only let us use 15 out of 16 registers
|
||||||
|
# RAX and RDX are defacto used due to the MUL instructions
|
||||||
|
# so we store everything in scratchspaces restoring as needed
|
||||||
|
let
|
||||||
|
scratchSlots = 2
|
||||||
|
|
||||||
|
t = init(OperandArray, nimSymbol = t_EIR, N, ElemsInReg, InputOutput_EnsureClobber)
|
||||||
|
# We could force M as immediate by specializing per moduli
|
||||||
|
M = init(OperandArray, nimSymbol = M_PIR, N, PointerInReg, Input)
|
||||||
|
# MultiPurpose Register slots
|
||||||
|
scratch = init(OperandArray, nimSymbol = ident"scratch", scratchSlots, ElemsInReg, InputOutput_EnsureClobber)
|
||||||
|
|
||||||
|
# MUL requires RAX and RDX
|
||||||
|
|
||||||
|
m0ninv = Operand(
|
||||||
|
desc: OperandDesc(
|
||||||
|
asmId: "[m0ninv]",
|
||||||
|
nimSymbol: m0ninv_REG,
|
||||||
|
rm: MemOffsettable,
|
||||||
|
constraint: Input,
|
||||||
|
cEmit: "&" & $m0ninv_REG
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
|
C = scratch[0] # Stores the high-part of muliplication
|
||||||
|
m = scratch[1] # Stores (t[0] * m0ninv) mod 2ʷ
|
||||||
|
|
||||||
|
let scratchSym = scratch.nimSymbol
|
||||||
|
|
||||||
|
# Copy a in t
|
||||||
|
result.add quote do:
|
||||||
|
var `scratchSym` {.noInit.}: Limbs[`scratchSlots`]
|
||||||
|
|
||||||
|
# Algorithm
|
||||||
|
# ---------------------------------------------------------
|
||||||
|
# for i in 0 .. n-1:
|
||||||
|
# m <- t[0] * m0ninv mod 2ʷ (i.e. simple multiplication)
|
||||||
|
# C, _ = t[0] + m * M[0]
|
||||||
|
# for j in 1 .. n-1:
|
||||||
|
# (C, t[j-1]) <- r[j] + m*M[j] + C
|
||||||
|
# t[n-1] = C
|
||||||
|
|
||||||
|
ctx.comment "for i in 0 ..< N:"
|
||||||
|
for i in 0 ..< N:
|
||||||
|
ctx.comment " m <- t[0] * m0ninv mod 2ʷ"
|
||||||
|
ctx.mov m, m0ninv
|
||||||
|
ctx.imul m, t[0]
|
||||||
|
|
||||||
|
ctx.comment " C, _ = t[0] + m * M[0]"
|
||||||
|
ctx.`xor` C, C
|
||||||
|
ctx.mov rax, M[0]
|
||||||
|
ctx.mul rdx, rax, m, rax
|
||||||
|
ctx.add rax, t[0]
|
||||||
|
ctx.adc C, rdx
|
||||||
|
|
||||||
|
ctx.comment " for j in 1 .. n-1:"
|
||||||
|
for j in 1 ..< N:
|
||||||
|
ctx.comment " (C, t[j-1]) <- r[j] + m*M[j] + C"
|
||||||
|
ctx.mov rax, M[j]
|
||||||
|
ctx.mul rdx, rax, m, rax
|
||||||
|
ctx.add C, t[j]
|
||||||
|
ctx.adc rdx, 0
|
||||||
|
ctx.add C, rax
|
||||||
|
ctx.adc rdx, 0
|
||||||
|
ctx.mov t[j-1], C
|
||||||
|
ctx.mov C, rdx
|
||||||
|
|
||||||
|
ctx.comment " final carry"
|
||||||
|
ctx.mov t[N-1], C
|
||||||
|
|
||||||
|
result.add ctx.generate()
|
||||||
|
|
||||||
|
func fromMont_asm*(r: var Limbs, a, M: Limbs, m0ninv: BaseType) =
|
||||||
|
## Constant-time Montgomery residue form to BigInt conversion
|
||||||
|
var t{.noInit.} = a
|
||||||
|
block:
|
||||||
|
t.mulMont_by_1_gen(M, m0ninv)
|
||||||
|
|
||||||
|
block: # Map from [0, 2p) to [0, p)
|
||||||
|
var workspace{.noInit.}: typeof(r)
|
||||||
|
r.finalSub_gen(t, M, workspace, mayCarry = false)
|
||||||
|
|
||||||
# Sanity checks
|
# Sanity checks
|
||||||
# ----------------------------------------------------------
|
# ----------------------------------------------------------
|
||||||
|
@ -242,7 +295,7 @@ when isMainModule:
|
||||||
|
|
||||||
# TODO: Properly handle low number of limbs
|
# TODO: Properly handle low number of limbs
|
||||||
|
|
||||||
func montyRedc2x_Comba[N: static int](
|
func redc2xMont_Comba[N: static int](
|
||||||
r: var array[N, SecretWord],
|
r: var array[N, SecretWord],
|
||||||
a: array[N*2, SecretWord],
|
a: array[N*2, SecretWord],
|
||||||
M: array[N, SecretWord],
|
M: array[N, SecretWord],
|
||||||
|
@ -298,10 +351,10 @@ when isMainModule:
|
||||||
var a_sqr{.noInit.}, na_sqr{.noInit.}: Limbs[2]
|
var a_sqr{.noInit.}, na_sqr{.noInit.}: Limbs[2]
|
||||||
var a_sqr_comba{.noInit.}, na_sqr_comba{.noInit.}: Limbs[2]
|
var a_sqr_comba{.noInit.}, na_sqr_comba{.noInit.}: Limbs[2]
|
||||||
|
|
||||||
a_sqr.montRed_asm(adbl_sqr, M, 1, hasSpareBit = false)
|
a_sqr.redcMont_asm(adbl_sqr, M, 1, hasSpareBit = false, skipReduction = false)
|
||||||
na_sqr.montRed_asm(nadbl_sqr, M, 1, hasSpareBit = false)
|
na_sqr.redcMont_asm(nadbl_sqr, M, 1, hasSpareBit = false, skipReduction = false)
|
||||||
a_sqr_comba.montyRedc2x_Comba(adbl_sqr, M, 1)
|
a_sqr_comba.redc2xMont_Comba(adbl_sqr, M, 1)
|
||||||
na_sqr_comba.montyRedc2x_Comba(nadbl_sqr, M, 1)
|
na_sqr_comba.redc2xMont_Comba(nadbl_sqr, M, 1)
|
||||||
|
|
||||||
debugecho "--------------------------------"
|
debugecho "--------------------------------"
|
||||||
debugecho "after:"
|
debugecho "after:"
|
|
@ -0,0 +1,269 @@
|
||||||
|
# Constantine
|
||||||
|
# Copyright (c) 2018-2019 Status Research & Development GmbH
|
||||||
|
# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
|
||||||
|
# Licensed and distributed under either of
|
||||||
|
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
|
||||||
|
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
|
||||||
|
# at your option. This file may not be copied, modified, or distributed except according to those terms.
|
||||||
|
|
||||||
|
import
|
||||||
|
# Standard library
|
||||||
|
std/macros,
|
||||||
|
# Internal
|
||||||
|
../../config/common,
|
||||||
|
../../primitives,
|
||||||
|
./limbs_asm_modular_x86
|
||||||
|
|
||||||
|
# ############################################################
|
||||||
|
#
|
||||||
|
# Assembly implementation of finite fields
|
||||||
|
#
|
||||||
|
# ############################################################
|
||||||
|
|
||||||
|
static: doAssert UseASM_X86_64
|
||||||
|
|
||||||
|
# MULX/ADCX/ADOX
|
||||||
|
{.localPassC:"-madx -mbmi2".}
|
||||||
|
# Necessary for the compiler to find enough registers (enabled at -O1)
|
||||||
|
{.localPassC:"-fomit-frame-pointer".}
|
||||||
|
|
||||||
|
# No exceptions allowed
|
||||||
|
{.push raises: [].}
|
||||||
|
|
||||||
|
# Montgomery reduction
|
||||||
|
# ------------------------------------------------------------
|
||||||
|
|
||||||
|
macro redc2xMont_adx_gen[N: static int](
|
||||||
|
r_PIR: var array[N, SecretWord],
|
||||||
|
a_PIR: array[N*2, SecretWord],
|
||||||
|
M_PIR: array[N, SecretWord],
|
||||||
|
m0ninv_REG: BaseType,
|
||||||
|
hasSpareBit, skipReduction: static bool
|
||||||
|
) =
|
||||||
|
|
||||||
|
# No register spilling handling
|
||||||
|
doAssert N <= 6, "The Assembly-optimized montgomery multiplication requires at most 6 limbs."
|
||||||
|
|
||||||
|
result = newStmtList()
|
||||||
|
|
||||||
|
var ctx = init(Assembler_x86, BaseType)
|
||||||
|
let
|
||||||
|
# We could force M as immediate by specializing per moduli
|
||||||
|
M = init(OperandArray, nimSymbol = M_PIR, N, PointerInReg, Input)
|
||||||
|
|
||||||
|
let uSlots = N+1
|
||||||
|
let vSlots = max(N-1, 5)
|
||||||
|
|
||||||
|
var # Scratchspaces
|
||||||
|
u = init(OperandArray, nimSymbol = ident"U", uSlots, ElemsInReg, InputOutput_EnsureClobber)
|
||||||
|
v = init(OperandArray, nimSymbol = ident"V", vSlots, ElemsInReg, InputOutput_EnsureClobber)
|
||||||
|
|
||||||
|
# Prologue
|
||||||
|
let usym = u.nimSymbol
|
||||||
|
let vsym = v.nimSymbol
|
||||||
|
result.add quote do:
|
||||||
|
static: doAssert: sizeof(SecretWord) == sizeof(ByteAddress)
|
||||||
|
var `usym`{.noinit.}: Limbs[`uSlots`]
|
||||||
|
var `vsym` {.noInit.}: Limbs[`vSlots`]
|
||||||
|
`vsym`[0] = cast[SecretWord](`r_PIR`[0].unsafeAddr)
|
||||||
|
`vsym`[1] = cast[SecretWord](`a_PIR`[0].unsafeAddr)
|
||||||
|
`vsym`[2] = SecretWord(`m0ninv_REG`)
|
||||||
|
|
||||||
|
let r_temp = v[0].asArrayAddr(len = N)
|
||||||
|
let a = v[1].asArrayAddr(len = 2*N)
|
||||||
|
let m0ninv = v[2]
|
||||||
|
let lo = v[3]
|
||||||
|
let hi = v[4]
|
||||||
|
|
||||||
|
# Algorithm
|
||||||
|
# ---------------------------------------------------------
|
||||||
|
# for i in 0 .. n-1:
|
||||||
|
# hi <- 0
|
||||||
|
# m <- a[i] * m0ninv mod 2ʷ (i.e. simple multiplication)
|
||||||
|
# for j in 0 .. n-1:
|
||||||
|
# (hi, lo) <- a[i+j] + m * M[j] + hi
|
||||||
|
# a[i+j] <- lo
|
||||||
|
# a[i+n] += hi
|
||||||
|
# for i in 0 .. n-1:
|
||||||
|
# r[i] = a[i+n]
|
||||||
|
# if r >= M:
|
||||||
|
# r -= M
|
||||||
|
|
||||||
|
ctx.mov rdx, m0ninv
|
||||||
|
|
||||||
|
for i in 0 ..< N:
|
||||||
|
ctx.mov u[i], a[i]
|
||||||
|
|
||||||
|
for i in 0 ..< N:
|
||||||
|
# RDX contains m0ninv at the start of each loop
|
||||||
|
ctx.comment ""
|
||||||
|
ctx.imul rdx, u[0] # m <- a[i] * m0ninv mod 2ʷ
|
||||||
|
ctx.comment "---- Reduction " & $i
|
||||||
|
ctx.`xor` u[N], u[N]
|
||||||
|
|
||||||
|
for j in 0 ..< N-1:
|
||||||
|
ctx.comment ""
|
||||||
|
ctx.mulx hi, lo, M[j], rdx
|
||||||
|
ctx.adcx u[j], lo
|
||||||
|
ctx.adox u[j+1], hi
|
||||||
|
|
||||||
|
# Last limb
|
||||||
|
ctx.comment ""
|
||||||
|
ctx.mulx hi, lo, M[N-1], rdx
|
||||||
|
ctx.mov rdx, m0ninv # Reload m0ninv for next iter
|
||||||
|
ctx.adcx u[N-1], lo
|
||||||
|
ctx.adox hi, u[N]
|
||||||
|
ctx.adcx u[N], hi
|
||||||
|
|
||||||
|
u.rotateLeft()
|
||||||
|
|
||||||
|
ctx.mov rdx, r_temp
|
||||||
|
let r = rdx.asArrayAddr(len = N)
|
||||||
|
|
||||||
|
# This does a[i+n] += hi
|
||||||
|
# but in a separate carry chain, fused with the
|
||||||
|
# copy "r[i] = a[i+n]"
|
||||||
|
for i in 0 ..< N:
|
||||||
|
if i == 0:
|
||||||
|
ctx.add u[i], a[i+N]
|
||||||
|
else:
|
||||||
|
ctx.adc u[i], a[i+N]
|
||||||
|
|
||||||
|
let t = repackRegisters(v, u[N])
|
||||||
|
|
||||||
|
if hasSpareBit and skipReduction:
|
||||||
|
for i in 0 ..< N:
|
||||||
|
ctx.mov r[i], t[i]
|
||||||
|
elif hasSpareBit:
|
||||||
|
ctx.finalSubNoCarryImpl(r, u, M, t)
|
||||||
|
else:
|
||||||
|
ctx.finalSubMayCarryImpl(r, u, M, t, hi)
|
||||||
|
|
||||||
|
# Code generation
|
||||||
|
result.add ctx.generate()
|
||||||
|
|
||||||
|
func redcMont_asm_adx_inline*[N: static int](
|
||||||
|
r: var array[N, SecretWord],
|
||||||
|
a: array[N*2, SecretWord],
|
||||||
|
M: array[N, SecretWord],
|
||||||
|
m0ninv: BaseType,
|
||||||
|
hasSpareBit: static bool,
|
||||||
|
skipReduction: static bool = false
|
||||||
|
) {.inline.} =
|
||||||
|
## Constant-time Montgomery reduction
|
||||||
|
## Inline-version
|
||||||
|
redc2xMont_adx_gen(r, a, M, m0ninv, hasSpareBit, skipReduction)
|
||||||
|
|
||||||
|
func redcMont_asm_adx*[N: static int](
|
||||||
|
r: var array[N, SecretWord],
|
||||||
|
a: array[N*2, SecretWord],
|
||||||
|
M: array[N, SecretWord],
|
||||||
|
m0ninv: BaseType,
|
||||||
|
hasSpareBit: static bool,
|
||||||
|
skipReduction: static bool = false
|
||||||
|
) =
|
||||||
|
## Constant-time Montgomery reduction
|
||||||
|
redcMont_asm_adx_inline(r, a, M, m0ninv, hasSpareBit, skipReduction)
|
||||||
|
|
||||||
|
|
||||||
|
# Montgomery conversion
|
||||||
|
# ----------------------------------------------------------
|
||||||
|
|
||||||
|
macro mulMont_by_1_adx_gen[N: static int](
|
||||||
|
t_EIR: var array[N, SecretWord],
|
||||||
|
M_PIR: array[N, SecretWord],
|
||||||
|
m0ninv_REG: BaseType) =
|
||||||
|
|
||||||
|
# No register spilling handling
|
||||||
|
doAssert N <= 6, "The Assembly-optimized montgomery reduction requires at most 6 limbs."
|
||||||
|
|
||||||
|
result = newStmtList()
|
||||||
|
|
||||||
|
var ctx = init(Assembler_x86, BaseType)
|
||||||
|
# On x86, compilers only let us use 15 out of 16 registers
|
||||||
|
# RAX and RDX are defacto used due to the MUL instructions
|
||||||
|
# so we store everything in scratchspaces restoring as needed
|
||||||
|
let
|
||||||
|
scratchSlots = 1
|
||||||
|
|
||||||
|
t = init(OperandArray, nimSymbol = t_EIR, N, ElemsInReg, InputOutput_EnsureClobber)
|
||||||
|
# We could force M as immediate by specializing per moduli
|
||||||
|
M = init(OperandArray, nimSymbol = M_PIR, N, PointerInReg, Input)
|
||||||
|
# MultiPurpose Register slots
|
||||||
|
scratch = init(OperandArray, nimSymbol = ident"scratch", scratchSlots, ElemsInReg, InputOutput_EnsureClobber)
|
||||||
|
|
||||||
|
# MUL requires RAX and RDX
|
||||||
|
|
||||||
|
m0ninv = Operand(
|
||||||
|
desc: OperandDesc(
|
||||||
|
asmId: "[m0ninv]",
|
||||||
|
nimSymbol: m0ninv_REG,
|
||||||
|
rm: MemOffsettable,
|
||||||
|
constraint: Input,
|
||||||
|
cEmit: "&" & $m0ninv_REG
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
|
C = scratch[0] # Stores the high-part of muliplication
|
||||||
|
|
||||||
|
let scratchSym = scratch.nimSymbol
|
||||||
|
|
||||||
|
# Copy a in t
|
||||||
|
result.add quote do:
|
||||||
|
var `scratchSym` {.noInit.}: Limbs[`scratchSlots`]
|
||||||
|
|
||||||
|
# Algorithm
|
||||||
|
# ---------------------------------------------------------
|
||||||
|
# for i in 0 .. n-1:
|
||||||
|
# m <- t[0] * m0ninv mod 2ʷ (i.e. simple multiplication)
|
||||||
|
# C, _ = t[0] + m * M[0]
|
||||||
|
# for j in 1 .. n-1:
|
||||||
|
# (C, t[j-1]) <- r[j] + m*M[j] + C
|
||||||
|
# t[n-1] = C
|
||||||
|
|
||||||
|
# Low-level optimizations
|
||||||
|
# https://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-optimization-manual.pdf
|
||||||
|
# Section 3.5.1.8 xor'ing a reg with itself is free (except for instruction code size)
|
||||||
|
|
||||||
|
ctx.comment "for i in 0 ..< N:"
|
||||||
|
for i in 0 ..< N:
|
||||||
|
ctx.comment " m <- t[0] * m0ninv mod 2ʷ"
|
||||||
|
ctx.mov rdx, m0ninv
|
||||||
|
ctx.imul rdx, t[0]
|
||||||
|
|
||||||
|
# Create 2 parallel carry-chains for adcx and adox
|
||||||
|
# We need to clear the carry/overflow flags first for ADCX/ADOX
|
||||||
|
# with the smallest instruction if possible (xor rax, rax)
|
||||||
|
# to reduce instruction-cache miss
|
||||||
|
ctx.comment " C, _ = t[0] + m * M[0]"
|
||||||
|
ctx.`xor` rax, rax
|
||||||
|
ctx.mulx C, rax, M[0], rdx
|
||||||
|
ctx.adcx rax, t[0] # Set C', the carry flag for future adcx, but don't accumulate in C yet
|
||||||
|
ctx.mov t[0], C
|
||||||
|
|
||||||
|
# for j=1 to N-1
|
||||||
|
# (S,t[j-1]) := t[j] + m*M[j] + S
|
||||||
|
ctx.comment " for j=1 to N-1"
|
||||||
|
ctx.comment " (C,t[j-1]) := t[j] + m*M[j] + C"
|
||||||
|
for j in 1 ..< N:
|
||||||
|
ctx.adcx t[j-1], t[j]
|
||||||
|
ctx.mulx t[j], C, M[j], rdx
|
||||||
|
ctx.adox t[j-1], C
|
||||||
|
|
||||||
|
ctx.comment " final carries"
|
||||||
|
ctx.mov rax, 0
|
||||||
|
ctx.adcx t[N-1], rax
|
||||||
|
ctx.adox t[N-1], rax
|
||||||
|
|
||||||
|
result.add ctx.generate()
|
||||||
|
|
||||||
|
func fromMont_asm_adx*(r: var Limbs, a, M: Limbs, m0ninv: BaseType) =
|
||||||
|
## Constant-time Montgomery residue form to BigInt conversion
|
||||||
|
## Requires ADX and BMI2 instruction set
|
||||||
|
var t{.noInit.} = a
|
||||||
|
block:
|
||||||
|
t.mulMont_by_1_adx_gen(M, m0ninv)
|
||||||
|
|
||||||
|
block: # Map from [0, 2p) to [0, p)
|
||||||
|
var workspace{.noInit.}: typeof(r)
|
||||||
|
r.finalSub_gen(t, M, workspace, mayCarry = false)
|
|
@ -298,7 +298,7 @@ func prod_high_words*[rBits, aBits, bBits](r: var BigInt[rBits], a: BigInt[aBits
|
||||||
# - Barret reduction
|
# - Barret reduction
|
||||||
# - Approximating multiplication by a fractional constant in the form f(a) = K/C * a
|
# - Approximating multiplication by a fractional constant in the form f(a) = K/C * a
|
||||||
# with K and C known at compile-time.
|
# with K and C known at compile-time.
|
||||||
# We can instead find a well chosen M = (2^WordBitWidth)^w, with M > C (i.e. M is a power of 2 bigger than C)
|
# We can instead find a well chosen M = (2^WordBitWidth)ʷ, with M > C (i.e. M is a power of 2 bigger than C)
|
||||||
# Precompute P = K*M/C at compile-time
|
# Precompute P = K*M/C at compile-time
|
||||||
# and at runtime do P*a/M <=> P*a >> WordBitWidth*w
|
# and at runtime do P*a/M <=> P*a >> WordBitWidth*w
|
||||||
# i.e. prod_high_words(result, P, a, w)
|
# i.e. prod_high_words(result, P, a, w)
|
||||||
|
|
|
@ -24,9 +24,9 @@ import
|
||||||
#
|
#
|
||||||
# ############################################################
|
# ############################################################
|
||||||
|
|
||||||
func montyResidue*(mres: var BigInt, a, N, r2modM: BigInt, m0ninv: static BaseType, spareBits: static int) =
|
func getMont*(mres: var BigInt, a, N, r2modM: BigInt, m0ninv: static BaseType, spareBits: static int) =
|
||||||
## Convert a BigInt from its natural representation
|
## Convert a BigInt from its natural representation
|
||||||
## to the Montgomery n-residue form
|
## to the Montgomery residue form
|
||||||
##
|
##
|
||||||
## `mres` is overwritten. It's bitlength must be properly set before calling this procedure.
|
## `mres` is overwritten. It's bitlength must be properly set before calling this procedure.
|
||||||
##
|
##
|
||||||
|
@ -39,37 +39,35 @@ func montyResidue*(mres: var BigInt, a, N, r2modM: BigInt, m0ninv: static BaseTy
|
||||||
## - `r2modM` is R² (mod M)
|
## - `r2modM` is R² (mod M)
|
||||||
## with W = M.len
|
## with W = M.len
|
||||||
## and R = (2^WordBitWidth)^W
|
## and R = (2^WordBitWidth)^W
|
||||||
montyResidue(mres.limbs, a.limbs, N.limbs, r2modM.limbs, m0ninv, spareBits)
|
getMont(mres.limbs, a.limbs, N.limbs, r2modM.limbs, m0ninv, spareBits)
|
||||||
|
|
||||||
func redc*[mBits](r: var BigInt[mBits], a, M: BigInt[mBits], m0ninv: static BaseType, spareBits: static int) =
|
func fromMont*[mBits](r: var BigInt[mBits], a, M: BigInt[mBits], m0ninv: static BaseType, spareBits: static int) =
|
||||||
## Convert a BigInt from its Montgomery n-residue form
|
## Convert a BigInt from its Montgomery residue form
|
||||||
## to the natural representation
|
## to the natural representation
|
||||||
##
|
##
|
||||||
## `mres` is modified in-place
|
## `mres` is modified in-place
|
||||||
##
|
##
|
||||||
## Caller must take care of properly switching between
|
## Caller must take care of properly switching between
|
||||||
## the natural and montgomery domain.
|
## the natural and montgomery domain.
|
||||||
let one = block:
|
fromMont(r.limbs, a.limbs, M.limbs, m0ninv, spareBits)
|
||||||
var one {.noInit.}: BigInt[mBits]
|
|
||||||
one.setOne()
|
|
||||||
one
|
|
||||||
redc(r.limbs, a.limbs, one.limbs, M.limbs, m0ninv, spareBits)
|
|
||||||
|
|
||||||
func montyMul*(r: var BigInt, a, b, M: BigInt, negInvModWord: static BaseType, spareBits: static int) =
|
func mulMont*(r: var BigInt, a, b, M: BigInt, negInvModWord: static BaseType,
|
||||||
|
spareBits: static int, skipReduction: static bool = false) =
|
||||||
## Compute r <- a*b (mod M) in the Montgomery domain
|
## Compute r <- a*b (mod M) in the Montgomery domain
|
||||||
##
|
##
|
||||||
## This resets r to zero before processing. Use {.noInit.}
|
## This resets r to zero before processing. Use {.noInit.}
|
||||||
## to avoid duplicating with Nim zero-init policy
|
## to avoid duplicating with Nim zero-init policy
|
||||||
montyMul(r.limbs, a.limbs, b.limbs, M.limbs, negInvModWord, spareBits)
|
mulMont(r.limbs, a.limbs, b.limbs, M.limbs, negInvModWord, spareBits, skipReduction)
|
||||||
|
|
||||||
func montySquare*(r: var BigInt, a, M: BigInt, negInvModWord: static BaseType, spareBits: static int) =
|
func squareMont*(r: var BigInt, a, M: BigInt, negInvModWord: static BaseType,
|
||||||
|
spareBits: static int, skipReduction: static bool = false) =
|
||||||
## Compute r <- a^2 (mod M) in the Montgomery domain
|
## Compute r <- a^2 (mod M) in the Montgomery domain
|
||||||
##
|
##
|
||||||
## This resets r to zero before processing. Use {.noInit.}
|
## This resets r to zero before processing. Use {.noInit.}
|
||||||
## to avoid duplicating with Nim zero-init policy
|
## to avoid duplicating with Nim zero-init policy
|
||||||
montySquare(r.limbs, a.limbs, M.limbs, negInvModWord, spareBits)
|
squareMont(r.limbs, a.limbs, M.limbs, negInvModWord, spareBits, skipReduction)
|
||||||
|
|
||||||
func montyPow*[mBits: static int](
|
func powMont*[mBits: static int](
|
||||||
a: var BigInt[mBits], exponent: openarray[byte],
|
a: var BigInt[mBits], exponent: openarray[byte],
|
||||||
M, one: BigInt[mBits], negInvModWord: static BaseType, windowSize: static int,
|
M, one: BigInt[mBits], negInvModWord: static BaseType, windowSize: static int,
|
||||||
spareBits: static int
|
spareBits: static int
|
||||||
|
@ -91,9 +89,9 @@ func montyPow*[mBits: static int](
|
||||||
const scratchLen = if windowSize == 1: 2
|
const scratchLen = if windowSize == 1: 2
|
||||||
else: (1 shl windowSize) + 1
|
else: (1 shl windowSize) + 1
|
||||||
var scratchSpace {.noInit.}: array[scratchLen, Limbs[mBits.wordsRequired]]
|
var scratchSpace {.noInit.}: array[scratchLen, Limbs[mBits.wordsRequired]]
|
||||||
montyPow(a.limbs, exponent, M.limbs, one.limbs, negInvModWord, scratchSpace, spareBits)
|
powMont(a.limbs, exponent, M.limbs, one.limbs, negInvModWord, scratchSpace, spareBits)
|
||||||
|
|
||||||
func montyPowUnsafeExponent*[mBits: static int](
|
func powMontUnsafeExponent*[mBits: static int](
|
||||||
a: var BigInt[mBits], exponent: openarray[byte],
|
a: var BigInt[mBits], exponent: openarray[byte],
|
||||||
M, one: BigInt[mBits], negInvModWord: static BaseType, windowSize: static int,
|
M, one: BigInt[mBits], negInvModWord: static BaseType, windowSize: static int,
|
||||||
spareBits: static int
|
spareBits: static int
|
||||||
|
@ -115,12 +113,12 @@ func montyPowUnsafeExponent*[mBits: static int](
|
||||||
const scratchLen = if windowSize == 1: 2
|
const scratchLen = if windowSize == 1: 2
|
||||||
else: (1 shl windowSize) + 1
|
else: (1 shl windowSize) + 1
|
||||||
var scratchSpace {.noInit.}: array[scratchLen, Limbs[mBits.wordsRequired]]
|
var scratchSpace {.noInit.}: array[scratchLen, Limbs[mBits.wordsRequired]]
|
||||||
montyPowUnsafeExponent(a.limbs, exponent, M.limbs, one.limbs, negInvModWord, scratchSpace, spareBits)
|
powMontUnsafeExponent(a.limbs, exponent, M.limbs, one.limbs, negInvModWord, scratchSpace, spareBits)
|
||||||
|
|
||||||
from ../io/io_bigints import exportRawUint
|
from ../io/io_bigints import exportRawUint
|
||||||
# Workaround recursive dependencies
|
# Workaround recursive dependencies
|
||||||
|
|
||||||
func montyPow*[mBits, eBits: static int](
|
func powMont*[mBits, eBits: static int](
|
||||||
a: var BigInt[mBits], exponent: BigInt[eBits],
|
a: var BigInt[mBits], exponent: BigInt[eBits],
|
||||||
M, one: BigInt[mBits], negInvModWord: static BaseType, windowSize: static int,
|
M, one: BigInt[mBits], negInvModWord: static BaseType, windowSize: static int,
|
||||||
spareBits: static int
|
spareBits: static int
|
||||||
|
@ -137,9 +135,9 @@ func montyPow*[mBits, eBits: static int](
|
||||||
var expBE {.noInit.}: array[(ebits + 7) div 8, byte]
|
var expBE {.noInit.}: array[(ebits + 7) div 8, byte]
|
||||||
expBE.exportRawUint(exponent, bigEndian)
|
expBE.exportRawUint(exponent, bigEndian)
|
||||||
|
|
||||||
montyPow(a, expBE, M, one, negInvModWord, windowSize, spareBits)
|
powMont(a, expBE, M, one, negInvModWord, windowSize, spareBits)
|
||||||
|
|
||||||
func montyPowUnsafeExponent*[mBits, eBits: static int](
|
func powMontUnsafeExponent*[mBits, eBits: static int](
|
||||||
a: var BigInt[mBits], exponent: BigInt[eBits],
|
a: var BigInt[mBits], exponent: BigInt[eBits],
|
||||||
M, one: BigInt[mBits], negInvModWord: static BaseType, windowSize: static int,
|
M, one: BigInt[mBits], negInvModWord: static BaseType, windowSize: static int,
|
||||||
spareBits: static int
|
spareBits: static int
|
||||||
|
@ -160,7 +158,7 @@ func montyPowUnsafeExponent*[mBits, eBits: static int](
|
||||||
var expBE {.noInit.}: array[(ebits + 7) div 8, byte]
|
var expBE {.noInit.}: array[(ebits + 7) div 8, byte]
|
||||||
expBE.exportRawUint(exponent, bigEndian)
|
expBE.exportRawUint(exponent, bigEndian)
|
||||||
|
|
||||||
montyPowUnsafeExponent(a, expBE, M, one, negInvModWord, windowSize, spareBits)
|
powMontUnsafeExponent(a, expBE, M, one, negInvModWord, windowSize, spareBits)
|
||||||
|
|
||||||
{.pop.} # inline
|
{.pop.} # inline
|
||||||
{.pop.} # raises no exceptions
|
{.pop.} # raises no exceptions
|
||||||
|
|
|
@ -56,16 +56,20 @@ func fromBig*(dst: var FF, src: BigInt) =
|
||||||
when nimvm:
|
when nimvm:
|
||||||
dst.mres.montyResidue_precompute(src, FF.fieldMod(), FF.getR2modP(), FF.getNegInvModWord())
|
dst.mres.montyResidue_precompute(src, FF.fieldMod(), FF.getR2modP(), FF.getNegInvModWord())
|
||||||
else:
|
else:
|
||||||
dst.mres.montyResidue(src, FF.fieldMod(), FF.getR2modP(), FF.getNegInvModWord(), FF.getSpareBits())
|
dst.mres.getMont(src, FF.fieldMod(), FF.getR2modP(), FF.getNegInvModWord(), FF.getSpareBits())
|
||||||
|
|
||||||
func fromBig*[C: static Curve](T: type FF[C], src: BigInt): FF[C] {.noInit.} =
|
func fromBig*[C: static Curve](T: type FF[C], src: BigInt): FF[C] {.noInit.} =
|
||||||
## Convert a BigInt to its Montgomery form
|
## Convert a BigInt to its Montgomery form
|
||||||
result.fromBig(src)
|
result.fromBig(src)
|
||||||
|
|
||||||
|
func fromField*(dst: var BigInt, src: FF) {.noInit, inline.} =
|
||||||
|
## Convert a finite-field element to a BigInt in natural representation
|
||||||
|
dst.fromMont(src.mres, FF.fieldMod(), FF.getNegInvModWord(), FF.getSpareBits())
|
||||||
|
|
||||||
func toBig*(src: FF): auto {.noInit, inline.} =
|
func toBig*(src: FF): auto {.noInit, inline.} =
|
||||||
## Convert a finite-field element to a BigInt in natural representation
|
## Convert a finite-field element to a BigInt in natural representation
|
||||||
var r {.noInit.}: typeof(src.mres)
|
var r {.noInit.}: typeof(src.mres)
|
||||||
r.redc(src.mres, FF.fieldMod(), FF.getNegInvModWord(), FF.getSpareBits())
|
r.fromField(src)
|
||||||
return r
|
return r
|
||||||
|
|
||||||
# Copy
|
# Copy
|
||||||
|
@ -102,7 +106,7 @@ func cswap*(a, b: var FF, ctl: CTBool) {.meter.} =
|
||||||
|
|
||||||
# Note: for `+=`, double, sum
|
# Note: for `+=`, double, sum
|
||||||
# not(a.mres < FF.fieldMod()) is unnecessary if the prime has the form
|
# not(a.mres < FF.fieldMod()) is unnecessary if the prime has the form
|
||||||
# (2^64)^w - 1 (if using uint64 words).
|
# (2^64)ʷ - 1 (if using uint64 words).
|
||||||
# In practice I'm not aware of such prime being using in elliptic curves.
|
# In practice I'm not aware of such prime being using in elliptic curves.
|
||||||
# 2^127 - 1 and 2^521 - 1 are used but 127 and 521 are not multiple of 32/64
|
# 2^127 - 1 and 2^521 - 1 are used but 127 and 521 are not multiple of 32/64
|
||||||
|
|
||||||
|
@ -148,7 +152,7 @@ func setMinusOne*(a: var FF) =
|
||||||
func `+=`*(a: var FF, b: FF) {.meter.} =
|
func `+=`*(a: var FF, b: FF) {.meter.} =
|
||||||
## In-place addition modulo p
|
## In-place addition modulo p
|
||||||
when UseASM_X86_64 and a.mres.limbs.len <= 6: # TODO: handle spilling
|
when UseASM_X86_64 and a.mres.limbs.len <= 6: # TODO: handle spilling
|
||||||
addmod_asm(a.mres.limbs, a.mres.limbs, b.mres.limbs, FF.fieldMod().limbs)
|
addmod_asm(a.mres.limbs, a.mres.limbs, b.mres.limbs, FF.fieldMod().limbs, FF.getSpareBits() >= 1)
|
||||||
else:
|
else:
|
||||||
var overflowed = add(a.mres, b.mres)
|
var overflowed = add(a.mres, b.mres)
|
||||||
overflowed = overflowed or not(a.mres < FF.fieldMod())
|
overflowed = overflowed or not(a.mres < FF.fieldMod())
|
||||||
|
@ -165,7 +169,7 @@ func `-=`*(a: var FF, b: FF) {.meter.} =
|
||||||
func double*(a: var FF) {.meter.} =
|
func double*(a: var FF) {.meter.} =
|
||||||
## Double ``a`` modulo p
|
## Double ``a`` modulo p
|
||||||
when UseASM_X86_64 and a.mres.limbs.len <= 6: # TODO: handle spilling
|
when UseASM_X86_64 and a.mres.limbs.len <= 6: # TODO: handle spilling
|
||||||
addmod_asm(a.mres.limbs, a.mres.limbs, a.mres.limbs, FF.fieldMod().limbs)
|
addmod_asm(a.mres.limbs, a.mres.limbs, a.mres.limbs, FF.fieldMod().limbs, FF.getSpareBits() >= 1)
|
||||||
else:
|
else:
|
||||||
var overflowed = double(a.mres)
|
var overflowed = double(a.mres)
|
||||||
overflowed = overflowed or not(a.mres < FF.fieldMod())
|
overflowed = overflowed or not(a.mres < FF.fieldMod())
|
||||||
|
@ -175,7 +179,7 @@ func sum*(r: var FF, a, b: FF) {.meter.} =
|
||||||
## Sum ``a`` and ``b`` into ``r`` modulo p
|
## Sum ``a`` and ``b`` into ``r`` modulo p
|
||||||
## r is initialized/overwritten
|
## r is initialized/overwritten
|
||||||
when UseASM_X86_64 and a.mres.limbs.len <= 6: # TODO: handle spilling
|
when UseASM_X86_64 and a.mres.limbs.len <= 6: # TODO: handle spilling
|
||||||
addmod_asm(r.mres.limbs, a.mres.limbs, b.mres.limbs, FF.fieldMod().limbs)
|
addmod_asm(r.mres.limbs, a.mres.limbs, b.mres.limbs, FF.fieldMod().limbs, FF.getSpareBits() >= 1)
|
||||||
else:
|
else:
|
||||||
var overflowed = r.mres.sum(a.mres, b.mres)
|
var overflowed = r.mres.sum(a.mres, b.mres)
|
||||||
overflowed = overflowed or not(r.mres < FF.fieldMod())
|
overflowed = overflowed or not(r.mres < FF.fieldMod())
|
||||||
|
@ -204,20 +208,20 @@ func double*(r: var FF, a: FF) {.meter.} =
|
||||||
## Double ``a`` into ``r``
|
## Double ``a`` into ``r``
|
||||||
## `r` is initialized/overwritten
|
## `r` is initialized/overwritten
|
||||||
when UseASM_X86_64 and a.mres.limbs.len <= 6: # TODO: handle spilling
|
when UseASM_X86_64 and a.mres.limbs.len <= 6: # TODO: handle spilling
|
||||||
addmod_asm(r.mres.limbs, a.mres.limbs, a.mres.limbs, FF.fieldMod().limbs)
|
addmod_asm(r.mres.limbs, a.mres.limbs, a.mres.limbs, FF.fieldMod().limbs, FF.getSpareBits() >= 1)
|
||||||
else:
|
else:
|
||||||
var overflowed = r.mres.double(a.mres)
|
var overflowed = r.mres.double(a.mres)
|
||||||
overflowed = overflowed or not(r.mres < FF.fieldMod())
|
overflowed = overflowed or not(r.mres < FF.fieldMod())
|
||||||
discard csub(r.mres, FF.fieldMod(), overflowed)
|
discard csub(r.mres, FF.fieldMod(), overflowed)
|
||||||
|
|
||||||
func prod*(r: var FF, a, b: FF) {.meter.} =
|
func prod*(r: var FF, a, b: FF, skipReduction: static bool = false) {.meter.} =
|
||||||
## Store the product of ``a`` by ``b`` modulo p into ``r``
|
## Store the product of ``a`` by ``b`` modulo p into ``r``
|
||||||
## ``r`` is initialized / overwritten
|
## ``r`` is initialized / overwritten
|
||||||
r.mres.montyMul(a.mres, b.mres, FF.fieldMod(), FF.getNegInvModWord(), FF.getSpareBits())
|
r.mres.mulMont(a.mres, b.mres, FF.fieldMod(), FF.getNegInvModWord(), FF.getSpareBits(), skipReduction)
|
||||||
|
|
||||||
func square*(r: var FF, a: FF) {.meter.} =
|
func square*(r: var FF, a: FF, skipReduction: static bool = false) {.meter.} =
|
||||||
## Squaring modulo p
|
## Squaring modulo p
|
||||||
r.mres.montySquare(a.mres, FF.fieldMod(), FF.getNegInvModWord(), FF.getSpareBits())
|
r.mres.squareMont(a.mres, FF.fieldMod(), FF.getNegInvModWord(), FF.getSpareBits(), skipReduction)
|
||||||
|
|
||||||
func neg*(r: var FF, a: FF) {.meter.} =
|
func neg*(r: var FF, a: FF) {.meter.} =
|
||||||
## Negate modulo p
|
## Negate modulo p
|
||||||
|
@ -261,20 +265,20 @@ func cneg*(r: var FF, a: FF, ctl: SecretBool) {.meter.} =
|
||||||
func cneg*(a: var FF, ctl: SecretBool) {.meter.} =
|
func cneg*(a: var FF, ctl: SecretBool) {.meter.} =
|
||||||
## Constant-time in-place conditional negation
|
## Constant-time in-place conditional negation
|
||||||
## The negation is only performed if ctl is "true"
|
## The negation is only performed if ctl is "true"
|
||||||
var t = a
|
var t {.noInit.} = a
|
||||||
a.cneg(t, ctl)
|
a.cneg(t, ctl)
|
||||||
|
|
||||||
func cadd*(a: var FF, b: FF, ctl: SecretBool) {.meter.} =
|
func cadd*(a: var FF, b: FF, ctl: SecretBool) {.meter.} =
|
||||||
## Constant-time out-place conditional addition
|
## Constant-time out-place conditional addition
|
||||||
## The addition is only performed if ctl is "true"
|
## The addition is only performed if ctl is "true"
|
||||||
var t = a
|
var t {.noInit.} = a
|
||||||
t += b
|
t += b
|
||||||
a.ccopy(t, ctl)
|
a.ccopy(t, ctl)
|
||||||
|
|
||||||
func csub*(a: var FF, b: FF, ctl: SecretBool) {.meter.} =
|
func csub*(a: var FF, b: FF, ctl: SecretBool) {.meter.} =
|
||||||
## Constant-time out-place conditional substraction
|
## Constant-time out-place conditional substraction
|
||||||
## The substraction is only performed if ctl is "true"
|
## The substraction is only performed if ctl is "true"
|
||||||
var t = a
|
var t {.noInit.} = a
|
||||||
t -= b
|
t -= b
|
||||||
a.ccopy(t, ctl)
|
a.ccopy(t, ctl)
|
||||||
|
|
||||||
|
@ -340,7 +344,7 @@ func pow*(a: var FF, exponent: BigInt) =
|
||||||
## ``a``: a field element to be exponentiated
|
## ``a``: a field element to be exponentiated
|
||||||
## ``exponent``: a big integer
|
## ``exponent``: a big integer
|
||||||
const windowSize = 5 # TODO: find best window size for each curves
|
const windowSize = 5 # TODO: find best window size for each curves
|
||||||
a.mres.montyPow(
|
a.mres.powMont(
|
||||||
exponent,
|
exponent,
|
||||||
FF.fieldMod(), FF.getMontyOne(),
|
FF.fieldMod(), FF.getMontyOne(),
|
||||||
FF.getNegInvModWord(), windowSize,
|
FF.getNegInvModWord(), windowSize,
|
||||||
|
@ -352,7 +356,7 @@ func pow*(a: var FF, exponent: openarray[byte]) =
|
||||||
## ``a``: a field element to be exponentiated
|
## ``a``: a field element to be exponentiated
|
||||||
## ``exponent``: a big integer in canonical big endian representation
|
## ``exponent``: a big integer in canonical big endian representation
|
||||||
const windowSize = 5 # TODO: find best window size for each curves
|
const windowSize = 5 # TODO: find best window size for each curves
|
||||||
a.mres.montyPow(
|
a.mres.powMont(
|
||||||
exponent,
|
exponent,
|
||||||
FF.fieldMod(), FF.getMontyOne(),
|
FF.fieldMod(), FF.getMontyOne(),
|
||||||
FF.getNegInvModWord(), windowSize,
|
FF.getNegInvModWord(), windowSize,
|
||||||
|
@ -371,7 +375,7 @@ func powUnsafeExponent*(a: var FF, exponent: BigInt) =
|
||||||
## - power analysis
|
## - power analysis
|
||||||
## - timing analysis
|
## - timing analysis
|
||||||
const windowSize = 5 # TODO: find best window size for each curves
|
const windowSize = 5 # TODO: find best window size for each curves
|
||||||
a.mres.montyPowUnsafeExponent(
|
a.mres.powMontUnsafeExponent(
|
||||||
exponent,
|
exponent,
|
||||||
FF.fieldMod(), FF.getMontyOne(),
|
FF.fieldMod(), FF.getMontyOne(),
|
||||||
FF.getNegInvModWord(), windowSize,
|
FF.getNegInvModWord(), windowSize,
|
||||||
|
@ -390,7 +394,7 @@ func powUnsafeExponent*(a: var FF, exponent: openarray[byte]) =
|
||||||
## - power analysis
|
## - power analysis
|
||||||
## - timing analysis
|
## - timing analysis
|
||||||
const windowSize = 5 # TODO: find best window size for each curves
|
const windowSize = 5 # TODO: find best window size for each curves
|
||||||
a.mres.montyPowUnsafeExponent(
|
a.mres.powMontUnsafeExponent(
|
||||||
exponent,
|
exponent,
|
||||||
FF.fieldMod(), FF.getMontyOne(),
|
FF.fieldMod(), FF.getMontyOne(),
|
||||||
FF.getNegInvModWord(), windowSize,
|
FF.getNegInvModWord(), windowSize,
|
||||||
|
@ -409,21 +413,39 @@ func `*=`*(a: var FF, b: FF) {.meter.} =
|
||||||
## Multiplication modulo p
|
## Multiplication modulo p
|
||||||
a.prod(a, b)
|
a.prod(a, b)
|
||||||
|
|
||||||
func square*(a: var FF) {.meter.} =
|
func square*(a: var FF, skipReduction: static bool = false) {.meter.} =
|
||||||
## Squaring modulo p
|
## Squaring modulo p
|
||||||
a.mres.montySquare(a.mres, FF.fieldMod(), FF.getNegInvModWord(), FF.getSpareBits())
|
a.square(a, skipReduction)
|
||||||
|
|
||||||
func square_repeated*(r: var FF, num: int) {.meter.} =
|
func square_repeated*(a: var FF, num: int, skipReduction: static bool = false) {.meter.} =
|
||||||
## Repeated squarings
|
## Repeated squarings
|
||||||
|
# Except in Tonelli-Shanks, num is always known at compile-time
|
||||||
|
# and square repeated is inlined, so the compiler should optimize the branches away.
|
||||||
|
|
||||||
|
# TODO: understand the conditions to avoid the final substraction
|
||||||
for _ in 0 ..< num:
|
for _ in 0 ..< num:
|
||||||
r.square()
|
a.square(skipReduction = false)
|
||||||
|
|
||||||
func square_repeated*(r: var FF, a: FF, num: int) {.meter.} =
|
func square_repeated*(r: var FF, a: FF, num: int, skipReduction: static bool = false) {.meter.} =
|
||||||
## Repeated squarings
|
## Repeated squarings
|
||||||
|
|
||||||
|
# TODO: understand the conditions to avoid the final substraction
|
||||||
r.square(a)
|
r.square(a)
|
||||||
for _ in 1 ..< num:
|
for _ in 1 ..< num:
|
||||||
r.square()
|
r.square()
|
||||||
|
|
||||||
|
func square_repeated_then_mul*(a: var FF, num: int, b: FF, skipReduction: static bool = false) {.meter.} =
|
||||||
|
## Square `a`, `num` times and then multiply by b
|
||||||
|
## Assumes at least 1 squaring
|
||||||
|
a.square_repeated(num, skipReduction = false)
|
||||||
|
a.prod(a, b, skipReduction = skipReduction)
|
||||||
|
|
||||||
|
func square_repeated_then_mul*(r: var FF, a: FF, num: int, b: FF, skipReduction: static bool = false) {.meter.} =
|
||||||
|
## Square `a`, `num` times and then multiply by b
|
||||||
|
## Assumes at least 1 squaring
|
||||||
|
r.square_repeated(a, num, skipReduction = false)
|
||||||
|
r.prod(r, b, skipReduction = skipReduction)
|
||||||
|
|
||||||
func `*=`*(a: var FF, b: static int) =
|
func `*=`*(a: var FF, b: static int) =
|
||||||
## Multiplication by a small integer known at compile-time
|
## Multiplication by a small integer known at compile-time
|
||||||
# Implementation:
|
# Implementation:
|
||||||
|
|
|
@ -73,7 +73,7 @@ func redc2x*(r: var Fp, a: FpDbl) =
|
||||||
## Reduce a double-precision field element into r
|
## Reduce a double-precision field element into r
|
||||||
## from [0, 2ⁿp) range to [0, p) range
|
## from [0, 2ⁿp) range to [0, p) range
|
||||||
const N = r.mres.limbs.len
|
const N = r.mres.limbs.len
|
||||||
montyRedc2x(
|
redc2xMont(
|
||||||
r.mres.limbs,
|
r.mres.limbs,
|
||||||
a.limbs2x,
|
a.limbs2x,
|
||||||
Fp.C.Mod.limbs,
|
Fp.C.Mod.limbs,
|
||||||
|
|
|
@ -128,7 +128,7 @@ func numWordsFromBits(bits: int): int {.inline.} =
|
||||||
func shlAddMod_estimate(a: LimbsViewMut, aLen: int,
|
func shlAddMod_estimate(a: LimbsViewMut, aLen: int,
|
||||||
c: SecretWord, M: LimbsViewConst, mBits: int
|
c: SecretWord, M: LimbsViewConst, mBits: int
|
||||||
): tuple[neg, tooBig: SecretBool] =
|
): tuple[neg, tooBig: SecretBool] =
|
||||||
## Estimate a <- a shl 2^w + c (mod M)
|
## Estimate a <- a shl 2ʷ + c (mod M)
|
||||||
##
|
##
|
||||||
## with w the base word width, usually 32 on 32-bit platforms and 64 on 64-bit platforms
|
## with w the base word width, usually 32 on 32-bit platforms and 64 on 64-bit platforms
|
||||||
##
|
##
|
||||||
|
|
|
@ -74,7 +74,7 @@ func prod*[rLen, aLen, bLen: static int](r: var Limbs[rLen], a: Limbs[aLen], b:
|
||||||
when UseASM_X86_64 and aLen <= 6:
|
when UseASM_X86_64 and aLen <= 6:
|
||||||
# ADX implies BMI2
|
# ADX implies BMI2
|
||||||
if ({.noSideEffect.}: hasAdx()):
|
if ({.noSideEffect.}: hasAdx()):
|
||||||
mul_asm_adx_bmi2(r, a, b)
|
mul_asm_adx(r, a, b)
|
||||||
else:
|
else:
|
||||||
mul_asm(r, a, b)
|
mul_asm(r, a, b)
|
||||||
elif UseASM_X86_64:
|
elif UseASM_X86_64:
|
||||||
|
@ -98,7 +98,7 @@ func prod_high_words*[rLen, aLen, bLen](
|
||||||
# - Barret reduction
|
# - Barret reduction
|
||||||
# - Approximating multiplication by a fractional constant in the form f(a) = K/C * a
|
# - Approximating multiplication by a fractional constant in the form f(a) = K/C * a
|
||||||
# with K and C known at compile-time.
|
# with K and C known at compile-time.
|
||||||
# We can instead find a well chosen M = (2^WordBitWidth)^w, with M > C (i.e. M is a power of 2 bigger than C)
|
# We can instead find a well chosen M = (2^WordBitWidth)ʷ, with M > C (i.e. M is a power of 2 bigger than C)
|
||||||
# Precompute P = K*M/C at compile-time
|
# Precompute P = K*M/C at compile-time
|
||||||
# and at runtime do P*a/M <=> P*a >> (WordBitWidth*w)
|
# and at runtime do P*a/M <=> P*a >> (WordBitWidth*w)
|
||||||
# i.e. prod_high_words(result, P, a, w)
|
# i.e. prod_high_words(result, P, a, w)
|
||||||
|
@ -203,7 +203,7 @@ func square*[rLen, aLen](
|
||||||
when UseASM_X86_64 and aLen in {4, 6} and rLen == 2*aLen:
|
when UseASM_X86_64 and aLen in {4, 6} and rLen == 2*aLen:
|
||||||
# ADX implies BMI2
|
# ADX implies BMI2
|
||||||
if ({.noSideEffect.}: hasAdx()):
|
if ({.noSideEffect.}: hasAdx()):
|
||||||
square_asm_adx_bmi2(r, a)
|
square_asm_adx(r, a)
|
||||||
else:
|
else:
|
||||||
square_asm(r, a)
|
square_asm(r, a)
|
||||||
elif UseASM_X86_64:
|
elif UseASM_X86_64:
|
||||||
|
|
|
@ -15,12 +15,12 @@ import
|
||||||
./limbs, ./limbs_extmul
|
./limbs, ./limbs_extmul
|
||||||
|
|
||||||
when UseASM_X86_32:
|
when UseASM_X86_32:
|
||||||
import ./assembly/limbs_asm_montred_x86
|
import ./assembly/limbs_asm_redc_mont_x86
|
||||||
when UseASM_X86_64:
|
when UseASM_X86_64:
|
||||||
import
|
import
|
||||||
./assembly/limbs_asm_montmul_x86,
|
./assembly/limbs_asm_mul_mont_x86,
|
||||||
./assembly/limbs_asm_montmul_x86_adx_bmi2,
|
./assembly/limbs_asm_mul_mont_x86_adx_bmi2,
|
||||||
./assembly/limbs_asm_montred_x86_adx_bmi2
|
./assembly/limbs_asm_redc_mont_x86_adx_bmi2
|
||||||
|
|
||||||
# ############################################################
|
# ############################################################
|
||||||
#
|
#
|
||||||
|
@ -53,12 +53,19 @@ when UseASM_X86_64:
|
||||||
|
|
||||||
# Montgomery Reduction
|
# Montgomery Reduction
|
||||||
# ------------------------------------------------------------
|
# ------------------------------------------------------------
|
||||||
func montyRedc2x_CIOS[N: static int](
|
func redc2xMont_CIOS[N: static int](
|
||||||
r: var array[N, SecretWord],
|
r: var array[N, SecretWord],
|
||||||
a: array[N*2, SecretWord],
|
a: array[N*2, SecretWord],
|
||||||
M: array[N, SecretWord],
|
M: array[N, SecretWord],
|
||||||
m0ninv: BaseType) =
|
m0ninv: BaseType, skipReduction: static bool = false) =
|
||||||
## Montgomery reduce a double-precision bigint modulo M
|
## Montgomery reduce a double-precision bigint modulo M
|
||||||
|
##
|
||||||
|
## This maps
|
||||||
|
## - [0, 4p²) -> [0, 2p) with skipReduction
|
||||||
|
## - [0, 4p²) -> [0, p) without
|
||||||
|
##
|
||||||
|
## SkipReduction skips the final substraction step.
|
||||||
|
## For skipReduction, M needs to have a spare bit in it's representation i.e. unused MSB.
|
||||||
# - Analyzing and Comparing Montgomery Multiplication Algorithms
|
# - Analyzing and Comparing Montgomery Multiplication Algorithms
|
||||||
# Cetin Kaya Koc and Tolga Acar and Burton S. Kaliski Jr.
|
# Cetin Kaya Koc and Tolga Acar and Burton S. Kaliski Jr.
|
||||||
# http://pdfs.semanticscholar.org/5e39/41ff482ec3ee41dc53c3298f0be085c69483.pdf
|
# http://pdfs.semanticscholar.org/5e39/41ff482ec3ee41dc53c3298f0be085c69483.pdf
|
||||||
|
@ -81,7 +88,7 @@ func montyRedc2x_CIOS[N: static int](
|
||||||
#
|
#
|
||||||
# for i in 0 .. n-1:
|
# for i in 0 .. n-1:
|
||||||
# C <- 0
|
# C <- 0
|
||||||
# m <- a[i] * m0ninv mod 2^w (i.e. simple multiplication)
|
# m <- a[i] * m0ninv mod 2ʷ (i.e. simple multiplication)
|
||||||
# for j in 0 .. n-1:
|
# for j in 0 .. n-1:
|
||||||
# (C, S) <- a[i+j] + m * M[j] + C
|
# (C, S) <- a[i+j] + m * M[j] + C
|
||||||
# a[i+j] <- S
|
# a[i+j] <- S
|
||||||
|
@ -95,8 +102,8 @@ func montyRedc2x_CIOS[N: static int](
|
||||||
# to the higher limb if any, thank you "implementation detail"
|
# to the higher limb if any, thank you "implementation detail"
|
||||||
# missing from paper.
|
# missing from paper.
|
||||||
|
|
||||||
var a = a # Copy "t" for mutation and ensure on stack
|
var a {.noInit.} = a # Copy "t" for mutation and ensure on stack
|
||||||
var res: typeof(r) # Accumulator
|
var res {.noInit.}: typeof(r) # Accumulator
|
||||||
staticFor i, 0, N:
|
staticFor i, 0, N:
|
||||||
var C = Zero
|
var C = Zero
|
||||||
let m = a[i] * SecretWord(m0ninv)
|
let m = a[i] * SecretWord(m0ninv)
|
||||||
|
@ -112,15 +119,23 @@ func montyRedc2x_CIOS[N: static int](
|
||||||
addC(carry, res[i], a[i+N], res[i], carry)
|
addC(carry, res[i], a[i+N], res[i], carry)
|
||||||
|
|
||||||
# Final substraction
|
# Final substraction
|
||||||
|
when not skipReduction:
|
||||||
discard res.csub(M, SecretWord(carry).isNonZero() or not(res < M))
|
discard res.csub(M, SecretWord(carry).isNonZero() or not(res < M))
|
||||||
r = res
|
r = res
|
||||||
|
|
||||||
func montyRedc2x_Comba[N: static int](
|
func redc2xMont_Comba[N: static int](
|
||||||
r: var array[N, SecretWord],
|
r: var array[N, SecretWord],
|
||||||
a: array[N*2, SecretWord],
|
a: array[N*2, SecretWord],
|
||||||
M: array[N, SecretWord],
|
M: array[N, SecretWord],
|
||||||
m0ninv: BaseType) =
|
m0ninv: BaseType, skipReduction: static bool = false) =
|
||||||
## Montgomery reduce a double-precision bigint modulo M
|
## Montgomery reduce a double-precision bigint modulo M
|
||||||
|
##
|
||||||
|
## This maps
|
||||||
|
## - [0, 4p²) -> [0, 2p) with skipReduction
|
||||||
|
## - [0, 4p²) -> [0, p) without
|
||||||
|
##
|
||||||
|
## SkipReduction skips the final substraction step.
|
||||||
|
## For skipReduction, M needs to have a spare bit in it's representation i.e. unused MSB.
|
||||||
# We use Product Scanning / Comba multiplication
|
# We use Product Scanning / Comba multiplication
|
||||||
var t, u, v = Zero
|
var t, u, v = Zero
|
||||||
var carry: Carry
|
var carry: Carry
|
||||||
|
@ -156,18 +171,25 @@ func montyRedc2x_Comba[N: static int](
|
||||||
addC(carry, z[N-1], v, a[2*N-1], Carry(0))
|
addC(carry, z[N-1], v, a[2*N-1], Carry(0))
|
||||||
|
|
||||||
# Final substraction
|
# Final substraction
|
||||||
|
when not skipReduction:
|
||||||
discard z.csub(M, SecretBool(carry) or not(z < M))
|
discard z.csub(M, SecretBool(carry) or not(z < M))
|
||||||
r = z
|
r = z
|
||||||
|
|
||||||
# Montgomery Multiplication
|
# Montgomery Multiplication
|
||||||
# ------------------------------------------------------------
|
# ------------------------------------------------------------
|
||||||
|
|
||||||
func montyMul_CIOS_sparebit(r: var Limbs, a, b, M: Limbs, m0ninv: BaseType) =
|
func mulMont_CIOS_sparebit(r: var Limbs, a, b, M: Limbs, m0ninv: BaseType, skipReduction: static bool = false) =
|
||||||
## Montgomery Multiplication using Coarse Grained Operand Scanning (CIOS)
|
## Montgomery Multiplication using Coarse Grained Operand Scanning (CIOS)
|
||||||
## and no-carry optimization.
|
## and no-carry optimization.
|
||||||
## This requires the most significant word of the Modulus
|
## This requires the most significant word of the Modulus
|
||||||
## M[^1] < high(SecretWord) shr 1 (i.e. less than 0b01111...1111)
|
## M[^1] < high(SecretWord) shr 1 (i.e. less than 0b01111...1111)
|
||||||
## https://hackmd.io/@zkteam/modular_multiplication
|
## https://hackmd.io/@gnark/modular_multiplication
|
||||||
|
##
|
||||||
|
## This maps
|
||||||
|
## - [0, 2p) -> [0, 2p) with skipReduction
|
||||||
|
## - [0, 2p) -> [0, p) without
|
||||||
|
##
|
||||||
|
## SkipReduction skips the final substraction step.
|
||||||
|
|
||||||
# We want all the computation to be kept in registers
|
# We want all the computation to be kept in registers
|
||||||
# hence we use a temporary `t`, hoping that the compiler does it.
|
# hence we use a temporary `t`, hoping that the compiler does it.
|
||||||
|
@ -175,7 +197,7 @@ func montyMul_CIOS_sparebit(r: var Limbs, a, b, M: Limbs, m0ninv: BaseType) =
|
||||||
const N = t.len
|
const N = t.len
|
||||||
staticFor i, 0, N:
|
staticFor i, 0, N:
|
||||||
# (A, t[0]) <- a[0] * b[i] + t[0]
|
# (A, t[0]) <- a[0] * b[i] + t[0]
|
||||||
# m <- (t[0] * m0ninv) mod 2^w
|
# m <- (t[0] * m0ninv) mod 2ʷ
|
||||||
# (C, _) <- m * M[0] + t[0]
|
# (C, _) <- m * M[0] + t[0]
|
||||||
var A: SecretWord
|
var A: SecretWord
|
||||||
muladd1(A, t[0], a[0], b[i], t[0])
|
muladd1(A, t[0], a[0], b[i], t[0])
|
||||||
|
@ -191,10 +213,11 @@ func montyMul_CIOS_sparebit(r: var Limbs, a, b, M: Limbs, m0ninv: BaseType) =
|
||||||
|
|
||||||
t[N-1] = C + A
|
t[N-1] = C + A
|
||||||
|
|
||||||
|
when not skipReduction:
|
||||||
discard t.csub(M, not(t < M))
|
discard t.csub(M, not(t < M))
|
||||||
r = t
|
r = t
|
||||||
|
|
||||||
func montyMul_CIOS(r: var Limbs, a, b, M: Limbs, m0ninv: BaseType) {.used.} =
|
func mulMont_CIOS(r: var Limbs, a, b, M: Limbs, m0ninv: BaseType) {.used.} =
|
||||||
## Montgomery Multiplication using Coarse Grained Operand Scanning (CIOS)
|
## Montgomery Multiplication using Coarse Grained Operand Scanning (CIOS)
|
||||||
# - Analyzing and Comparing Montgomery Multiplication Algorithms
|
# - Analyzing and Comparing Montgomery Multiplication Algorithms
|
||||||
# Cetin Kaya Koc and Tolga Acar and Burton S. Kaliski Jr.
|
# Cetin Kaya Koc and Tolga Acar and Burton S. Kaliski Jr.
|
||||||
|
@ -221,7 +244,7 @@ func montyMul_CIOS(r: var Limbs, a, b, M: Limbs, m0ninv: BaseType) {.used.} =
|
||||||
addC(tNp1, tN, tN, A, Carry(0))
|
addC(tNp1, tN, tN, A, Carry(0))
|
||||||
|
|
||||||
# Reduction
|
# Reduction
|
||||||
# m <- (t[0] * m0ninv) mod 2^w
|
# m <- (t[0] * m0ninv) mod 2ʷ
|
||||||
# (C, _) <- m * M[0] + t[0]
|
# (C, _) <- m * M[0] + t[0]
|
||||||
var C, lo = Zero
|
var C, lo = Zero
|
||||||
let m = t[0] * SecretWord(m0ninv)
|
let m = t[0] * SecretWord(m0ninv)
|
||||||
|
@ -239,11 +262,18 @@ func montyMul_CIOS(r: var Limbs, a, b, M: Limbs, m0ninv: BaseType) {.used.} =
|
||||||
# t[N+1] can only be non-zero in the intermediate computation
|
# t[N+1] can only be non-zero in the intermediate computation
|
||||||
# since it is immediately reduce to t[N] at the end of each "i" iteration
|
# since it is immediately reduce to t[N] at the end of each "i" iteration
|
||||||
# However if t[N] is non-zero we have t > M
|
# However if t[N] is non-zero we have t > M
|
||||||
discard t.csub(M, tN.isNonZero() or not(t < M)) # TODO: (t >= M) is unnecessary for prime in the form (2^64)^w
|
discard t.csub(M, tN.isNonZero() or not(t < M)) # TODO: (t >= M) is unnecessary for prime in the form (2^64)ʷ
|
||||||
r = t
|
r = t
|
||||||
|
|
||||||
func montyMul_FIPS(r: var Limbs, a, b, M: Limbs, m0ninv: BaseType) =
|
func mulMont_FIPS(r: var Limbs, a, b, M: Limbs, m0ninv: BaseType, skipReduction: static bool = false) =
|
||||||
## Montgomery Multiplication using Finely Integrated Product Scanning (FIPS)
|
## Montgomery Multiplication using Finely Integrated Product Scanning (FIPS)
|
||||||
|
##
|
||||||
|
## This maps
|
||||||
|
## - [0, 2p) -> [0, 2p) with skipReduction
|
||||||
|
## - [0, 2p) -> [0, p) without
|
||||||
|
##
|
||||||
|
## SkipReduction skips the final substraction step.
|
||||||
|
## For skipReduction, M needs to have a spare bit in it's representation i.e. unused MSB.
|
||||||
# - Architectural Enhancements for Montgomery
|
# - Architectural Enhancements for Montgomery
|
||||||
# Multiplication on Embedded RISC Processors
|
# Multiplication on Embedded RISC Processors
|
||||||
# Johann Großschädl and Guy-Armand Kamendje, 2003
|
# Johann Großschädl and Guy-Armand Kamendje, 2003
|
||||||
|
@ -276,6 +306,7 @@ func montyMul_FIPS(r: var Limbs, a, b, M: Limbs, m0ninv: BaseType) =
|
||||||
u = t
|
u = t
|
||||||
t = Zero
|
t = Zero
|
||||||
|
|
||||||
|
when not skipReduction:
|
||||||
discard z.csub(M, v.isNonZero() or not(z < M))
|
discard z.csub(M, v.isNonZero() or not(z < M))
|
||||||
r = z
|
r = z
|
||||||
|
|
||||||
|
@ -313,36 +344,81 @@ func montyMul_FIPS(r: var Limbs, a, b, M: Limbs, m0ninv: BaseType) =
|
||||||
# ...
|
# ...
|
||||||
# for j in 1 ..< N: # <- Montgomery reduce.
|
# for j in 1 ..< N: # <- Montgomery reduce.
|
||||||
|
|
||||||
|
# Montgomery Conversion
|
||||||
|
# ------------------------------------------------------------
|
||||||
|
#
|
||||||
|
# In Montgomery form, inputs are scaled by a constant R
|
||||||
|
# so a' = aR (mod p) and b' = bR (mod p)
|
||||||
|
#
|
||||||
|
# A classic multiplication would do a'*b' = abR² (mod p)
|
||||||
|
# we then need to remove the extra R, hence:
|
||||||
|
# - Montgomery reduction (redc) does 1/R (mod p) to map abR² (mod p) -> abR (mod p)
|
||||||
|
# - Montgomery multiplication directly compute mulMont(aR, bR) = abR (mod p)
|
||||||
|
#
|
||||||
|
# So to convert a to a' = aR (mod p), we can do mulMont(a, R²) = aR (mod p)
|
||||||
|
# and to convert a' to a = aR / R (mod p) we can do:
|
||||||
|
# - redc(aR) = a
|
||||||
|
# - or mulMont(aR, 1) = a
|
||||||
|
|
||||||
|
func fromMont_CIOS(r: var Limbs, a, M: Limbs, m0ninv: BaseType) =
|
||||||
|
## Convert from Montgomery form to canonical BigInt form
|
||||||
|
# for i in 0 .. n-1:
|
||||||
|
# m <- t[0] * m0ninv mod 2ʷ (i.e. simple multiplication)
|
||||||
|
# C, _ = t[0] + m * M[0]
|
||||||
|
# for j in 1 ..n-1:
|
||||||
|
# (C, t[j-1]) <- r[j] + m*M[j] + C
|
||||||
|
# t[n-1] = C
|
||||||
|
|
||||||
|
const N = a.len
|
||||||
|
var t {.noInit.} = a # Ensure working in registers
|
||||||
|
|
||||||
|
staticFor i, 0, N:
|
||||||
|
let m = t[0] * SecretWord(m0ninv)
|
||||||
|
var C, lo: SecretWord
|
||||||
|
muladd1(C, lo, m, M[0], t[0])
|
||||||
|
staticFor j, 1, N:
|
||||||
|
muladd2(C, t[j-1], m, M[j], C, t[j])
|
||||||
|
t[N-1] = C
|
||||||
|
|
||||||
|
discard t.csub(M, not(t < M))
|
||||||
|
r = t
|
||||||
|
|
||||||
# Exported API
|
# Exported API
|
||||||
# ------------------------------------------------------------
|
# ------------------------------------------------------------
|
||||||
|
|
||||||
# TODO upstream, using Limbs[N] breaks semcheck
|
# TODO upstream, using Limbs[N] breaks semcheck
|
||||||
func montyRedc2x*[N: static int](
|
func redc2xMont*[N: static int](
|
||||||
r: var array[N, SecretWord],
|
r: var array[N, SecretWord],
|
||||||
a: array[N*2, SecretWord],
|
a: array[N*2, SecretWord],
|
||||||
M: array[N, SecretWord],
|
M: array[N, SecretWord],
|
||||||
m0ninv: BaseType, spareBits: static int) {.inline.} =
|
m0ninv: BaseType,
|
||||||
|
spareBits: static int, skipReduction: static bool = false) {.inline.} =
|
||||||
## Montgomery reduce a double-precision bigint modulo M
|
## Montgomery reduce a double-precision bigint modulo M
|
||||||
|
|
||||||
|
const skipReduction = skipReduction and spareBits >= 1
|
||||||
|
|
||||||
when UseASM_X86_64 and r.len <= 6:
|
when UseASM_X86_64 and r.len <= 6:
|
||||||
# ADX implies BMI2
|
# ADX implies BMI2
|
||||||
if ({.noSideEffect.}: hasAdx()):
|
if ({.noSideEffect.}: hasAdx()):
|
||||||
montRed_asm_adx_bmi2(r, a, M, m0ninv, spareBits >= 1)
|
redcMont_asm_adx(r, a, M, m0ninv, spareBits >= 1, skipReduction)
|
||||||
else:
|
else:
|
||||||
when r.len in {3..6}:
|
when r.len in {3..6}:
|
||||||
montRed_asm(r, a, M, m0ninv, spareBits >= 1)
|
redcMont_asm(r, a, M, m0ninv, spareBits >= 1, skipReduction)
|
||||||
else:
|
else:
|
||||||
montyRedc2x_CIOS(r, a, M, m0ninv)
|
redc2xMont_CIOS(r, a, M, m0ninv, skipReduction)
|
||||||
# montyRedc2x_Comba(r, a, M, m0ninv)
|
# redc2xMont_Comba(r, a, M, m0ninv)
|
||||||
elif UseASM_X86_64 and r.len in {3..6}:
|
elif UseASM_X86_64 and r.len in {3..6}:
|
||||||
# TODO: Assembly faster than GCC but slower than Clang
|
# TODO: Assembly faster than GCC but slower than Clang
|
||||||
montRed_asm(r, a, M, m0ninv, spareBits >= 1)
|
redcMont_asm(r, a, M, m0ninv, spareBits >= 1, skipReduction)
|
||||||
else:
|
else:
|
||||||
montyRedc2x_CIOS(r, a, M, m0ninv)
|
redc2xMont_CIOS(r, a, M, m0ninv, skipReduction)
|
||||||
# montyRedc2x_Comba(r, a, M, m0ninv)
|
# redc2xMont_Comba(r, a, M, m0ninv, skipReduction)
|
||||||
|
|
||||||
func montyMul*(
|
func mulMont*(
|
||||||
r: var Limbs, a, b, M: Limbs,
|
r: var Limbs, a, b, M: Limbs,
|
||||||
m0ninv: static BaseType, spareBits: static int) {.inline.} =
|
m0ninv: BaseType,
|
||||||
|
spareBits: static int,
|
||||||
|
skipReduction: static bool = false) {.inline.} =
|
||||||
## Compute r <- a*b (mod M) in the Montgomery domain
|
## Compute r <- a*b (mod M) in the Montgomery domain
|
||||||
## `m0ninv` = -1/M (mod SecretWord). Our words are 2^32 or 2^64
|
## `m0ninv` = -1/M (mod SecretWord). Our words are 2^32 or 2^64
|
||||||
##
|
##
|
||||||
|
@ -369,43 +445,50 @@ func montyMul*(
|
||||||
# The implementation is visible from here, the compiler can make decision whether to:
|
# The implementation is visible from here, the compiler can make decision whether to:
|
||||||
# - specialize/duplicate code for m0ninv == 1 (especially if only 1 curve is needed)
|
# - specialize/duplicate code for m0ninv == 1 (especially if only 1 curve is needed)
|
||||||
# - keep it generic and optimize code size
|
# - keep it generic and optimize code size
|
||||||
|
|
||||||
|
const skipReduction = skipReduction and spareBits >= 1
|
||||||
|
|
||||||
when spareBits >= 1:
|
when spareBits >= 1:
|
||||||
when UseASM_X86_64 and a.len in {2 .. 6}: # TODO: handle spilling
|
when UseASM_X86_64 and a.len in {2 .. 6}: # TODO: handle spilling
|
||||||
# ADX implies BMI2
|
# ADX implies BMI2
|
||||||
if ({.noSideEffect.}: hasAdx()):
|
if ({.noSideEffect.}: hasAdx()):
|
||||||
montMul_CIOS_sparebit_asm_adx_bmi2(r, a, b, M, m0ninv)
|
mulMont_CIOS_sparebit_asm_adx(r, a, b, M, m0ninv, skipReduction)
|
||||||
else:
|
else:
|
||||||
montMul_CIOS_sparebit_asm(r, a, b, M, m0ninv)
|
mulMont_CIOS_sparebit_asm(r, a, b, M, m0ninv, skipReduction)
|
||||||
else:
|
else:
|
||||||
montyMul_CIOS_sparebit(r, a, b, M, m0ninv)
|
mulMont_CIOS_sparebit(r, a, b, M, m0ninv, skipReduction)
|
||||||
else:
|
else:
|
||||||
montyMul_FIPS(r, a, b, M, m0ninv)
|
mulMont_FIPS(r, a, b, M, m0ninv, skipReduction)
|
||||||
|
|
||||||
func montySquare*[N](r: var Limbs[N], a, M: Limbs[N],
|
func squareMont*[N](r: var Limbs[N], a, M: Limbs[N],
|
||||||
m0ninv: static BaseType, spareBits: static int) {.inline.} =
|
m0ninv: BaseType,
|
||||||
|
spareBits: static int,
|
||||||
|
skipReduction: static bool = false) {.inline.} =
|
||||||
## Compute r <- a^2 (mod M) in the Montgomery domain
|
## Compute r <- a^2 (mod M) in the Montgomery domain
|
||||||
## `m0ninv` = -1/M (mod SecretWord). Our words are 2^31 or 2^63
|
## `m0ninv` = -1/M (mod SecretWord). Our words are 2^31 or 2^63
|
||||||
|
|
||||||
|
const skipReduction = skipReduction and spareBits >= 1
|
||||||
|
|
||||||
when UseASM_X86_64 and a.len in {4, 6}:
|
when UseASM_X86_64 and a.len in {4, 6}:
|
||||||
# ADX implies BMI2
|
# ADX implies BMI2
|
||||||
if ({.noSideEffect.}: hasAdx()):
|
if ({.noSideEffect.}: hasAdx()):
|
||||||
# With ADX and spare bit, montSquare_CIOS_asm_adx_bmi2
|
# With ADX and spare bit, squareMont_CIOS_asm_adx
|
||||||
# which uses unfused squaring then Montgomery reduction
|
# which uses unfused squaring then Montgomery reduction
|
||||||
# is slightly slower than fused Montgomery multiplication
|
# is slightly slower than fused Montgomery multiplication
|
||||||
when spareBits >= 1:
|
when spareBits >= 1:
|
||||||
montMul_CIOS_sparebit_asm_adx_bmi2(r, a, a, M, m0ninv)
|
mulMont_CIOS_sparebit_asm_adx(r, a, a, M, m0ninv, skipReduction)
|
||||||
else:
|
else:
|
||||||
montSquare_CIOS_asm_adx_bmi2(r, a, M, m0ninv, spareBits >= 1)
|
squareMont_CIOS_asm_adx(r, a, M, m0ninv, spareBits >= 1, skipReduction)
|
||||||
else:
|
else:
|
||||||
montSquare_CIOS_asm(r, a, M, m0ninv, spareBits >= 1)
|
squareMont_CIOS_asm(r, a, M, m0ninv, spareBits >= 1, skipReduction)
|
||||||
elif UseASM_X86_64:
|
elif UseASM_X86_64:
|
||||||
var r2x {.noInit.}: Limbs[2*N]
|
var r2x {.noInit.}: Limbs[2*N]
|
||||||
r2x.square(a)
|
r2x.square(a)
|
||||||
r.montyRedc2x(r2x, M, m0ninv, spareBits)
|
r.redc2xMont(r2x, M, m0ninv, spareBits, skipReduction)
|
||||||
else:
|
else:
|
||||||
montyMul(r, a, a, M, m0ninv, spareBits)
|
mulMont(r, a, a, M, m0ninv, spareBits, skipReduction)
|
||||||
|
|
||||||
func redc*(r: var Limbs, a, one, M: Limbs,
|
func fromMont*(r: var Limbs, a, M: Limbs,
|
||||||
m0ninv: static BaseType, spareBits: static int) =
|
m0ninv: static BaseType, spareBits: static int) =
|
||||||
## Transform a bigint ``a`` from it's Montgomery N-residue representation (mod N)
|
## Transform a bigint ``a`` from it's Montgomery N-residue representation (mod N)
|
||||||
## to the regular natural representation (mod N)
|
## to the regular natural representation (mod N)
|
||||||
|
@ -424,10 +507,16 @@ func redc*(r: var Limbs, a, one, M: Limbs,
|
||||||
# - https://en.wikipedia.org/wiki/Montgomery_modular_multiplication#Montgomery_arithmetic_on_multiprecision_(variable-radix)_integers
|
# - https://en.wikipedia.org/wiki/Montgomery_modular_multiplication#Montgomery_arithmetic_on_multiprecision_(variable-radix)_integers
|
||||||
# - http://langevin.univ-tln.fr/cours/MLC/extra/montgomery.pdf
|
# - http://langevin.univ-tln.fr/cours/MLC/extra/montgomery.pdf
|
||||||
# Montgomery original paper
|
# Montgomery original paper
|
||||||
#
|
when UseASM_X86_64 and a.len in {2 .. 6}:
|
||||||
montyMul(r, a, one, M, m0ninv, spareBits)
|
# ADX implies BMI2
|
||||||
|
if ({.noSideEffect.}: hasAdx()):
|
||||||
|
fromMont_asm_adx(r, a, M, m0ninv)
|
||||||
|
else:
|
||||||
|
fromMont_asm(r, a, M, m0ninv)
|
||||||
|
else:
|
||||||
|
fromMont_CIOS(r, a, M, m0ninv)
|
||||||
|
|
||||||
func montyResidue*(r: var Limbs, a, M, r2modM: Limbs,
|
func getMont*(r: var Limbs, a, M, r2modM: Limbs,
|
||||||
m0ninv: static BaseType, spareBits: static int) =
|
m0ninv: static BaseType, spareBits: static int) =
|
||||||
## Transform a bigint ``a`` from it's natural representation (mod N)
|
## Transform a bigint ``a`` from it's natural representation (mod N)
|
||||||
## to a the Montgomery n-residue representation
|
## to a the Montgomery n-residue representation
|
||||||
|
@ -446,7 +535,7 @@ func montyResidue*(r: var Limbs, a, M, r2modM: Limbs,
|
||||||
## Important: `r` is overwritten
|
## Important: `r` is overwritten
|
||||||
## The result `r` buffer size MUST be at least the size of `M` buffer
|
## The result `r` buffer size MUST be at least the size of `M` buffer
|
||||||
# Reference: https://eprint.iacr.org/2017/1057.pdf
|
# Reference: https://eprint.iacr.org/2017/1057.pdf
|
||||||
montyMul(r, a, r2ModM, M, m0ninv, spareBits)
|
mulMont(r, a, r2ModM, M, m0ninv, spareBits)
|
||||||
|
|
||||||
# Montgomery Modular Exponentiation
|
# Montgomery Modular Exponentiation
|
||||||
# ------------------------------------------
|
# ------------------------------------------
|
||||||
|
@ -489,7 +578,7 @@ func getWindowLen(bufLen: int): uint =
|
||||||
while (1 shl result) + 1 > bufLen:
|
while (1 shl result) + 1 > bufLen:
|
||||||
dec result
|
dec result
|
||||||
|
|
||||||
func montyPowPrologue(
|
func powMontPrologue(
|
||||||
a: var Limbs, M, one: Limbs,
|
a: var Limbs, M, one: Limbs,
|
||||||
m0ninv: static BaseType,
|
m0ninv: static BaseType,
|
||||||
scratchspace: var openarray[Limbs],
|
scratchspace: var openarray[Limbs],
|
||||||
|
@ -507,12 +596,12 @@ func montyPowPrologue(
|
||||||
else:
|
else:
|
||||||
scratchspace[2] = a
|
scratchspace[2] = a
|
||||||
for k in 2 ..< 1 shl result:
|
for k in 2 ..< 1 shl result:
|
||||||
scratchspace[k+1].montyMul(scratchspace[k], a, M, m0ninv, spareBits)
|
scratchspace[k+1].mulMont(scratchspace[k], a, M, m0ninv, spareBits)
|
||||||
|
|
||||||
# Set a to one
|
# Set a to one
|
||||||
a = one
|
a = one
|
||||||
|
|
||||||
func montyPowSquarings(
|
func powMontSquarings(
|
||||||
a: var Limbs,
|
a: var Limbs,
|
||||||
exponent: openarray[byte],
|
exponent: openarray[byte],
|
||||||
M: Limbs,
|
M: Limbs,
|
||||||
|
@ -557,12 +646,11 @@ func montyPowSquarings(
|
||||||
|
|
||||||
# We have k bits and can do k squaring
|
# We have k bits and can do k squaring
|
||||||
for i in 0 ..< k:
|
for i in 0 ..< k:
|
||||||
tmp.montySquare(a, M, m0ninv, spareBits)
|
a.squareMont(a, M, m0ninv, spareBits)
|
||||||
a = tmp
|
|
||||||
|
|
||||||
return (k, bits)
|
return (k, bits)
|
||||||
|
|
||||||
func montyPow*(
|
func powMont*(
|
||||||
a: var Limbs,
|
a: var Limbs,
|
||||||
exponent: openarray[byte],
|
exponent: openarray[byte],
|
||||||
M, one: Limbs,
|
M, one: Limbs,
|
||||||
|
@ -596,7 +684,7 @@ func montyPow*(
|
||||||
## A window of size 5 requires (2^5 + 1)*(381 + 7)/8 = 33 * 48 bytes = 1584 bytes
|
## A window of size 5 requires (2^5 + 1)*(381 + 7)/8 = 33 * 48 bytes = 1584 bytes
|
||||||
## of scratchspace (on the stack).
|
## of scratchspace (on the stack).
|
||||||
|
|
||||||
let window = montyPowPrologue(a, M, one, m0ninv, scratchspace, spareBits)
|
let window = powMontPrologue(a, M, one, m0ninv, scratchspace, spareBits)
|
||||||
|
|
||||||
# We process bits with from most to least significant.
|
# We process bits with from most to least significant.
|
||||||
# At each loop iteration with have acc_len bits in acc.
|
# At each loop iteration with have acc_len bits in acc.
|
||||||
|
@ -607,7 +695,7 @@ func montyPow*(
|
||||||
acc, acc_len: uint
|
acc, acc_len: uint
|
||||||
e = 0
|
e = 0
|
||||||
while acc_len > 0 or e < exponent.len:
|
while acc_len > 0 or e < exponent.len:
|
||||||
let (k, bits) = montyPowSquarings(
|
let (k, bits) = powMontSquarings(
|
||||||
a, exponent, M, m0ninv,
|
a, exponent, M, m0ninv,
|
||||||
scratchspace[0], window,
|
scratchspace[0], window,
|
||||||
acc, acc_len, e,
|
acc, acc_len, e,
|
||||||
|
@ -626,10 +714,10 @@ func montyPow*(
|
||||||
|
|
||||||
# Multiply with the looked-up value
|
# Multiply with the looked-up value
|
||||||
# we keep the product only if the exponent bits are not all zeroes
|
# we keep the product only if the exponent bits are not all zeroes
|
||||||
scratchspace[0].montyMul(a, scratchspace[1], M, m0ninv, spareBits)
|
scratchspace[0].mulMont(a, scratchspace[1], M, m0ninv, spareBits)
|
||||||
a.ccopy(scratchspace[0], SecretWord(bits).isNonZero())
|
a.ccopy(scratchspace[0], SecretWord(bits).isNonZero())
|
||||||
|
|
||||||
func montyPowUnsafeExponent*(
|
func powMontUnsafeExponent*(
|
||||||
a: var Limbs,
|
a: var Limbs,
|
||||||
exponent: openarray[byte],
|
exponent: openarray[byte],
|
||||||
M, one: Limbs,
|
M, one: Limbs,
|
||||||
|
@ -649,13 +737,13 @@ func montyPowUnsafeExponent*(
|
||||||
|
|
||||||
# TODO: scratchspace[1] is unused when window > 1
|
# TODO: scratchspace[1] is unused when window > 1
|
||||||
|
|
||||||
let window = montyPowPrologue(a, M, one, m0ninv, scratchspace, spareBits)
|
let window = powMontPrologue(a, M, one, m0ninv, scratchspace, spareBits)
|
||||||
|
|
||||||
var
|
var
|
||||||
acc, acc_len: uint
|
acc, acc_len: uint
|
||||||
e = 0
|
e = 0
|
||||||
while acc_len > 0 or e < exponent.len:
|
while acc_len > 0 or e < exponent.len:
|
||||||
let (_, bits) = montyPowSquarings(
|
let (_, bits) = powMontSquarings(
|
||||||
a, exponent, M, m0ninv,
|
a, exponent, M, m0ninv,
|
||||||
scratchspace[0], window,
|
scratchspace[0], window,
|
||||||
acc, acc_len, e,
|
acc, acc_len, e,
|
||||||
|
@ -665,10 +753,10 @@ func montyPowUnsafeExponent*(
|
||||||
## Warning ⚠️: Exposes the exponent bits
|
## Warning ⚠️: Exposes the exponent bits
|
||||||
if bits != 0:
|
if bits != 0:
|
||||||
if window > 1:
|
if window > 1:
|
||||||
scratchspace[0].montyMul(a, scratchspace[1+bits], M, m0ninv, spareBits)
|
scratchspace[0].mulMont(a, scratchspace[1+bits], M, m0ninv, spareBits)
|
||||||
else:
|
else:
|
||||||
# scratchspace[1] holds the original `a`
|
# scratchspace[1] holds the original `a`
|
||||||
scratchspace[0].montyMul(a, scratchspace[1], M, m0ninv, spareBits)
|
scratchspace[0].mulMont(a, scratchspace[1], M, m0ninv, spareBits)
|
||||||
a = scratchspace[0]
|
a = scratchspace[0]
|
||||||
|
|
||||||
{.pop.} # raises no exceptions
|
{.pop.} # raises no exceptions
|
||||||
|
|
|
@ -355,7 +355,7 @@ func r3mod*(M: BigInt): BigInt =
|
||||||
## This is used in hash-to-curve to
|
## This is used in hash-to-curve to
|
||||||
## reduce a double-sized bigint mod M
|
## reduce a double-sized bigint mod M
|
||||||
## and map it to the Montgomery domain
|
## and map it to the Montgomery domain
|
||||||
## with just redc2x + montyMul
|
## with just redc2x + mulMont
|
||||||
r_powmod(3, M)
|
r_powmod(3, M)
|
||||||
|
|
||||||
func montyOne*(M: BigInt): BigInt =
|
func montyOne*(M: BigInt): BigInt =
|
||||||
|
@ -391,7 +391,7 @@ func primePlus1div2*(P: BigInt): BigInt =
|
||||||
|
|
||||||
# (P+1)/2 = P/2 + 1 if P is odd,
|
# (P+1)/2 = P/2 + 1 if P is odd,
|
||||||
# this avoids overflowing if the prime uses all bits
|
# this avoids overflowing if the prime uses all bits
|
||||||
# i.e. in the form (2^64)^w - 1 or (2^32)^w - 1
|
# i.e. in the form (2^64)ʷ - 1 or (2^32)ʷ - 1
|
||||||
|
|
||||||
result = P
|
result = P
|
||||||
result.shiftRight(1)
|
result.shiftRight(1)
|
||||||
|
@ -491,7 +491,7 @@ func toCanonicalIntRepr*[bits: static int](
|
||||||
# ############################################################
|
# ############################################################
|
||||||
# This is needed to avoid recursive dependencies
|
# This is needed to avoid recursive dependencies
|
||||||
|
|
||||||
func montyMul_precompute(r: var BigInt, a, b, M: BigInt, m0ninv: BaseType) =
|
func mulMont_precompute(r: var BigInt, a, b, M: BigInt, m0ninv: BaseType) =
|
||||||
## Montgomery Multiplication using Coarse Grained Operand Scanning (CIOS)
|
## Montgomery Multiplication using Coarse Grained Operand Scanning (CIOS)
|
||||||
var t: typeof(M) # zero-init
|
var t: typeof(M) # zero-init
|
||||||
const N = t.limbs.len
|
const N = t.limbs.len
|
||||||
|
@ -527,4 +527,4 @@ func montyResidue_precompute*(r: var BigInt, a, M, r2modM: BigInt,
|
||||||
## Transform a bigint ``a`` from it's natural representation (mod N)
|
## Transform a bigint ``a`` from it's natural representation (mod N)
|
||||||
## to a the Montgomery n-residue representation
|
## to a the Montgomery n-residue representation
|
||||||
## This is intended for compile-time precomputations-only
|
## This is intended for compile-time precomputations-only
|
||||||
montyMul_precompute(r, a, r2ModM, M, m0ninv)
|
mulMont_precompute(r, a, r2ModM, M, m0ninv)
|
||||||
|
|
|
@ -154,15 +154,15 @@ func expandMessageXMD*[B1, B2, B3: byte|char, len_in_bytes: static int](
|
||||||
return
|
return
|
||||||
|
|
||||||
func redc2x[FF](r: var FF, big2x: BigInt) {.inline.} =
|
func redc2x[FF](r: var FF, big2x: BigInt) {.inline.} =
|
||||||
r.mres.limbs.montyRedc2x(
|
r.mres.limbs.redc2xMont(
|
||||||
big2x.limbs,
|
big2x.limbs,
|
||||||
FF.fieldMod().limbs,
|
FF.fieldMod().limbs,
|
||||||
FF.getNegInvModWord(),
|
FF.getNegInvModWord(),
|
||||||
FF.getSpareBits()
|
FF.getSpareBits()
|
||||||
)
|
)
|
||||||
|
|
||||||
func montyMul(r: var BigInt, a, b: BigInt, FF: type) {.inline.} =
|
func mulMont(r: var BigInt, a, b: BigInt, FF: type) {.inline.} =
|
||||||
r.limbs.montyMul(
|
r.limbs.mulMont(
|
||||||
a.limbs, b.limbs,
|
a.limbs, b.limbs,
|
||||||
FF.fieldMod().limbs,
|
FF.fieldMod().limbs,
|
||||||
FF.getNegInvModWord(),
|
FF.getNegInvModWord(),
|
||||||
|
@ -228,14 +228,14 @@ func hashToField*[Field; B1, B2, B3: byte|char, count: static int](
|
||||||
# Reduces modulo p and output in Montgomery domain
|
# Reduces modulo p and output in Montgomery domain
|
||||||
when m == 1:
|
when m == 1:
|
||||||
output[i].redc2x(big2x)
|
output[i].redc2x(big2x)
|
||||||
output[i].mres.montyMul(
|
output[i].mres.mulMont(
|
||||||
output[i].mres,
|
output[i].mres,
|
||||||
Fp[Field.C].getR3ModP(),
|
Fp[Field.C].getR3ModP(),
|
||||||
Fp[Field.C])
|
Fp[Field.C])
|
||||||
|
|
||||||
else:
|
else:
|
||||||
output[i].coords[j].redc2x(big2x)
|
output[i].coords[j].redc2x(big2x)
|
||||||
output[i].coords[j].mres.montyMul(
|
output[i].coords[j].mres.mulMont(
|
||||||
output[i].coords[j].mres,
|
output[i].coords[j].mres,
|
||||||
Fp[Field.C].getR3ModP(),
|
Fp[Field.C].getR3ModP(),
|
||||||
Fp[Field.C])
|
Fp[Field.C])
|
||||||
|
|
|
@ -249,7 +249,7 @@ let
|
||||||
hasBmi2Impl = testX86Feature(Bmi2)
|
hasBmi2Impl = testX86Feature(Bmi2)
|
||||||
hasTsxHleImpl = testX86Feature(TsxHle)
|
hasTsxHleImpl = testX86Feature(TsxHle)
|
||||||
hasTsxRtmImpl = testX86Feature(TsxRtm)
|
hasTsxRtmImpl = testX86Feature(TsxRtm)
|
||||||
hasAdxImpl = testX86Feature(TsxHle)
|
hasAdxImpl = testX86Feature(Adx)
|
||||||
hasSgxImpl = testX86Feature(Sgx)
|
hasSgxImpl = testX86Feature(Sgx)
|
||||||
hasGfniImpl = testX86Feature(Gfni)
|
hasGfniImpl = testX86Feature(Gfni)
|
||||||
hasAesImpl = testX86Feature(Aes)
|
hasAesImpl = testX86Feature(Aes)
|
||||||
|
|
|
@ -337,6 +337,10 @@ func generate*(a: Assembler_x86): NimNode =
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
result = nnkBlockStmt.newTree(
|
||||||
|
newEmptyNode(),
|
||||||
|
result
|
||||||
|
)
|
||||||
|
|
||||||
func getStrOffset(a: Assembler_x86, op: Operand): string =
|
func getStrOffset(a: Assembler_x86, op: Operand): string =
|
||||||
if op.kind != kFromArray:
|
if op.kind != kFromArray:
|
||||||
|
|
|
@ -195,8 +195,8 @@ func eth_evm_ecmul*(
|
||||||
|
|
||||||
var sprime{.noInit.}: typeof(smod.mres)
|
var sprime{.noInit.}: typeof(smod.mres)
|
||||||
# Due to mismatch between the BigInt[256] input and the rest being BigInt[254]
|
# Due to mismatch between the BigInt[256] input and the rest being BigInt[254]
|
||||||
# we use the low-level montyResidue instead of 'fromBig'
|
# we use the low-level getMont instead of 'fromBig'
|
||||||
montyResidue(smod.mres.limbs, s.limbs,
|
getMont(smod.mres.limbs, s.limbs,
|
||||||
Fr[BN254_Snarks].fieldMod().limbs,
|
Fr[BN254_Snarks].fieldMod().limbs,
|
||||||
Fr[BN254_Snarks].getR2modP().limbs,
|
Fr[BN254_Snarks].getR2modP().limbs,
|
||||||
Fr[BN254_Snarks].getNegInvModWord(),
|
Fr[BN254_Snarks].getNegInvModWord(),
|
||||||
|
|
|
@ -13,8 +13,8 @@ import
|
||||||
../../arithmetic,
|
../../arithmetic,
|
||||||
../../arithmetic/assembly/[
|
../../arithmetic/assembly/[
|
||||||
limbs_asm_mul_x86_adx_bmi2,
|
limbs_asm_mul_x86_adx_bmi2,
|
||||||
limbs_asm_montmul_x86_adx_bmi2,
|
limbs_asm_mul_mont_x86_adx_bmi2,
|
||||||
limbs_asm_montred_x86_adx_bmi2
|
limbs_asm_redc_mont_x86_adx_bmi2
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
|
@ -50,7 +50,7 @@ func has2extraBits(F: type Fp): bool =
|
||||||
# 𝔽p2 squaring
|
# 𝔽p2 squaring
|
||||||
# ------------------------------------------------------------
|
# ------------------------------------------------------------
|
||||||
|
|
||||||
func sqrx2x_complex_asm_adx_bmi2*(
|
func sqrx2x_complex_asm_adx*(
|
||||||
r: var array[2, FpDbl],
|
r: var array[2, FpDbl],
|
||||||
a: array[2, Fp]
|
a: array[2, Fp]
|
||||||
) =
|
) =
|
||||||
|
@ -67,11 +67,11 @@ func sqrx2x_complex_asm_adx_bmi2*(
|
||||||
t0.double(a.c1)
|
t0.double(a.c1)
|
||||||
t1.sum(a.c0, a.c1)
|
t1.sum(a.c0, a.c1)
|
||||||
|
|
||||||
r.c1.mul_asm_adx_bmi2_impl(t0, a.c0)
|
r.c1.mul_asm_adx_inline(t0, a.c0)
|
||||||
t0.diff(a.c0, a.c1)
|
t0.diff(a.c0, a.c1)
|
||||||
r.c0.mul_asm_adx_bmi2_impl(t0, t1)
|
r.c0.mul_asm_adx_inline(t0, t1)
|
||||||
|
|
||||||
func sqrx_complex_sparebit_asm_adx_bmi2*(
|
func sqrx_complex_sparebit_asm_adx*(
|
||||||
r: var array[2, Fp],
|
r: var array[2, Fp],
|
||||||
a: array[2, Fp]
|
a: array[2, Fp]
|
||||||
) =
|
) =
|
||||||
|
@ -85,15 +85,15 @@ func sqrx_complex_sparebit_asm_adx_bmi2*(
|
||||||
var v0 {.noInit.}, v1 {.noInit.}: typeof(r.c0)
|
var v0 {.noInit.}, v1 {.noInit.}: typeof(r.c0)
|
||||||
v0.diff(a.c0, a.c1)
|
v0.diff(a.c0, a.c1)
|
||||||
v1.sum(a.c0, a.c1)
|
v1.sum(a.c0, a.c1)
|
||||||
r.c1.mres.limbs.montMul_CIOS_sparebit_asm_adx_bmi2(a.c0.mres.limbs, a.c1.mres.limbs, Fp.fieldMod().limbs, Fp.getNegInvModWord())
|
r.c1.mres.limbs.mulMont_CIOS_sparebit_asm_adx(a.c0.mres.limbs, a.c1.mres.limbs, Fp.fieldMod().limbs, Fp.getNegInvModWord())
|
||||||
# aliasing: a unneeded now
|
# aliasing: a unneeded now
|
||||||
r.c1.double()
|
r.c1.double()
|
||||||
r.c0.mres.limbs.montMul_CIOS_sparebit_asm_adx_bmi2(v0.mres.limbs, v1.mres.limbs, Fp.fieldMod().limbs, Fp.getNegInvModWord())
|
r.c0.mres.limbs.mulMont_CIOS_sparebit_asm_adx(v0.mres.limbs, v1.mres.limbs, Fp.fieldMod().limbs, Fp.getNegInvModWord())
|
||||||
|
|
||||||
# 𝔽p2 multiplication
|
# 𝔽p2 multiplication
|
||||||
# ------------------------------------------------------------
|
# ------------------------------------------------------------
|
||||||
|
|
||||||
func mulx2x_complex_asm_adx_bmi2*(
|
func mul2x_fp2_complex_asm_adx*(
|
||||||
r: var array[2, FpDbl],
|
r: var array[2, FpDbl],
|
||||||
a, b: array[2, Fp]
|
a, b: array[2, Fp]
|
||||||
) =
|
) =
|
||||||
|
@ -101,15 +101,15 @@ func mulx2x_complex_asm_adx_bmi2*(
|
||||||
var D {.noInit.}: typeof(r.c0)
|
var D {.noInit.}: typeof(r.c0)
|
||||||
var t0 {.noInit.}, t1 {.noInit.}: typeof(a.c0)
|
var t0 {.noInit.}, t1 {.noInit.}: typeof(a.c0)
|
||||||
|
|
||||||
r.c0.limbs2x.mul_asm_adx_bmi2_impl(a.c0.mres.limbs, b.c0.mres.limbs)
|
r.c0.limbs2x.mul_asm_adx_inline(a.c0.mres.limbs, b.c0.mres.limbs)
|
||||||
D.limbs2x.mul_asm_adx_bmi2_impl(a.c1.mres.limbs, b.c1.mres.limbs)
|
D.limbs2x.mul_asm_adx_inline(a.c1.mres.limbs, b.c1.mres.limbs)
|
||||||
when Fp.has1extraBit():
|
when Fp.has1extraBit():
|
||||||
t0.sumUnr(a.c0, a.c1)
|
t0.sumUnr(a.c0, a.c1)
|
||||||
t1.sumUnr(b.c0, b.c1)
|
t1.sumUnr(b.c0, b.c1)
|
||||||
else:
|
else:
|
||||||
t0.sum(a.c0, a.c1)
|
t0.sum(a.c0, a.c1)
|
||||||
t1.sum(b.c0, b.c1)
|
t1.sum(b.c0, b.c1)
|
||||||
r.c1.limbs2x.mul_asm_adx_bmi2_impl(t0.mres.limbs, t1.mres.limbs)
|
r.c1.limbs2x.mul_asm_adx_inline(t0.mres.limbs, t1.mres.limbs)
|
||||||
when Fp.has1extraBit():
|
when Fp.has1extraBit():
|
||||||
r.c1.diff2xUnr(r.c1, r.c0)
|
r.c1.diff2xUnr(r.c1, r.c0)
|
||||||
r.c1.diff2xUnr(r.c1, D)
|
r.c1.diff2xUnr(r.c1, D)
|
||||||
|
@ -118,20 +118,20 @@ func mulx2x_complex_asm_adx_bmi2*(
|
||||||
r.c1.diff2xMod(r.c1, D)
|
r.c1.diff2xMod(r.c1, D)
|
||||||
r.c0.diff2xMod(r.c0, D)
|
r.c0.diff2xMod(r.c0, D)
|
||||||
|
|
||||||
func mulx_complex_asm_adx_bmi2*(
|
func mul_fp2_complex_asm_adx*(
|
||||||
r: var array[2, Fp],
|
r: var array[2, Fp],
|
||||||
a, b: array[2, Fp]
|
a, b: array[2, Fp]
|
||||||
) =
|
) =
|
||||||
## Complex multiplication on 𝔽p2
|
## Complex multiplication on 𝔽p2
|
||||||
var d {.noInit.}: array[2,doublePrec(Fp)]
|
var d {.noInit.}: array[2,doublePrec(Fp)]
|
||||||
d.mulx2x_complex_asm_adx_bmi2(a, b)
|
d.mul2x_fp2_complex_asm_adx(a, b)
|
||||||
r.c0.mres.limbs.montRed_asm_adx_bmi2_impl(
|
r.c0.mres.limbs.redcMont_asm_adx_inline(
|
||||||
d.c0.limbs2x,
|
d.c0.limbs2x,
|
||||||
Fp.fieldMod().limbs,
|
Fp.fieldMod().limbs,
|
||||||
Fp.getNegInvModWord(),
|
Fp.getNegInvModWord(),
|
||||||
Fp.has1extraBit()
|
Fp.has1extraBit()
|
||||||
)
|
)
|
||||||
r.c1.mres.limbs.montRed_asm_adx_bmi2_impl(
|
r.c1.mres.limbs.redcMont_asm_adx_inline(
|
||||||
d.c1.limbs2x,
|
d.c1.limbs2x,
|
||||||
Fp.fieldMod().limbs,
|
Fp.fieldMod().limbs,
|
||||||
Fp.getNegInvModWord(),
|
Fp.getNegInvModWord(),
|
||||||
|
|
|
@ -454,7 +454,7 @@ func prod*[C: static Curve](
|
||||||
# (c0 + c1 x) (u + v x) => u c0 + (u c0 + u c1)x + v c1 x²
|
# (c0 + c1 x) (u + v x) => u c0 + (u c0 + u c1)x + v c1 x²
|
||||||
# => u c0 + β v c1 + (v c0 + u c1) x
|
# => u c0 + β v c1 + (v c0 + u c1) x
|
||||||
when a.fromComplexExtension() and u == 1 and v == 1:
|
when a.fromComplexExtension() and u == 1 and v == 1:
|
||||||
let t = a.c0
|
let t {.noInit.} = a.c0
|
||||||
r.c0.diff(t, a.c1)
|
r.c0.diff(t, a.c1)
|
||||||
r.c1.sum(t, a.c1)
|
r.c1.sum(t, a.c1)
|
||||||
else:
|
else:
|
||||||
|
@ -504,7 +504,7 @@ func prod2x*[C: static Curve](
|
||||||
const Beta {.used.} = C.getNonResidueFp()
|
const Beta {.used.} = C.getNonResidueFp()
|
||||||
|
|
||||||
when complex and U == 1 and V == 1:
|
when complex and U == 1 and V == 1:
|
||||||
let a1 = a.c1
|
let a1 {.noInit.} = a.c1
|
||||||
r.c1.sum2xMod(a.c0, a1)
|
r.c1.sum2xMod(a.c0, a1)
|
||||||
r.c0.diff2xMod(a.c0, a1)
|
r.c0.diff2xMod(a.c0, a1)
|
||||||
else:
|
else:
|
||||||
|
@ -567,13 +567,13 @@ func `/=`*[C: static Curve](a: var Fp2[C], _: type NonResidue) =
|
||||||
# 1/2 * (c0 + c1, c1 - c0)
|
# 1/2 * (c0 + c1, c1 - c0)
|
||||||
|
|
||||||
when a.fromComplexExtension() and u == 1 and v == 1:
|
when a.fromComplexExtension() and u == 1 and v == 1:
|
||||||
let t = a.c0
|
let t {.noInit.} = a.c0
|
||||||
a.c0 += a.c1
|
a.c0 += a.c1
|
||||||
a.c1 -= t
|
a.c1 -= t
|
||||||
a.div2()
|
a.div2()
|
||||||
else:
|
else:
|
||||||
var a0 = a.c0
|
var a0 {.noInit.} = a.c0
|
||||||
let a1 = a.c1
|
let a1 {.noInit.} = a.c1
|
||||||
const u2v2 = u*u - Beta*v*v # (u² - βv²)
|
const u2v2 = u*u - Beta*v*v # (u² - βv²)
|
||||||
# TODO can be precomputed to avoid costly inversion.
|
# TODO can be precomputed to avoid costly inversion.
|
||||||
var u2v2inv {.noInit.}: a.c0.typeof
|
var u2v2inv {.noInit.}: a.c0.typeof
|
||||||
|
@ -605,7 +605,7 @@ func prod*(r: var QuadraticExt, a: QuadraticExt, _: type NonResidue) =
|
||||||
## - if sextic non-residue: 𝔽p8, 𝔽p12 or 𝔽p24
|
## - if sextic non-residue: 𝔽p8, 𝔽p12 or 𝔽p24
|
||||||
##
|
##
|
||||||
## Assumes that the non-residue is sqrt(lower extension non-residue)
|
## Assumes that the non-residue is sqrt(lower extension non-residue)
|
||||||
let t = a.c0
|
let t {.noInit.} = a.c0
|
||||||
r.c0.prod(a.c1, NonResidue)
|
r.c0.prod(a.c1, NonResidue)
|
||||||
r.c1 = t
|
r.c1 = t
|
||||||
|
|
||||||
|
@ -628,7 +628,7 @@ func prod2x*(
|
||||||
_: type NonResidue) =
|
_: type NonResidue) =
|
||||||
## Multiplication by non-residue
|
## Multiplication by non-residue
|
||||||
static: doAssert not(r.c0 is FpDbl), "Wrong dispatch, there is a specific non-residue multiplication for the base extension."
|
static: doAssert not(r.c0 is FpDbl), "Wrong dispatch, there is a specific non-residue multiplication for the base extension."
|
||||||
let t = a.c0
|
let t {.noInit.} = a.c0
|
||||||
r.c0.prod2x(a.c1, NonResidue)
|
r.c0.prod2x(a.c1, NonResidue)
|
||||||
`=`(r.c1, t) # "r.c1 = t", is refused by the compiler.
|
`=`(r.c1, t) # "r.c1 = t", is refused by the compiler.
|
||||||
|
|
||||||
|
@ -650,7 +650,7 @@ func prod*(r: var CubicExt, a: CubicExt, _: type NonResidue) =
|
||||||
## For all curves γ = v with v the factor for the cubic extension coordinate
|
## For all curves γ = v with v the factor for the cubic extension coordinate
|
||||||
## and v³ = ξ
|
## and v³ = ξ
|
||||||
## (c0 + c1 v + c2 v²) v => ξ c2 + c0 v + c1 v²
|
## (c0 + c1 v + c2 v²) v => ξ c2 + c0 v + c1 v²
|
||||||
let t = a.c2
|
let t {.noInit.} = a.c2
|
||||||
r.c1 = a.c0
|
r.c1 = a.c0
|
||||||
r.c2 = a.c1
|
r.c2 = a.c1
|
||||||
r.c0.prod(t, NonResidue)
|
r.c0.prod(t, NonResidue)
|
||||||
|
@ -677,7 +677,7 @@ func prod2x*(
|
||||||
## For all curves γ = v with v the factor for cubic extension coordinate
|
## For all curves γ = v with v the factor for cubic extension coordinate
|
||||||
## and v³ = ξ
|
## and v³ = ξ
|
||||||
## (c0 + c1 v + c2 v²) v => ξ c2 + c0 v + c1 v²
|
## (c0 + c1 v + c2 v²) v => ξ c2 + c0 v + c1 v²
|
||||||
let t = a.c2
|
let t {.noInit.} = a.c2
|
||||||
r.c1 = a.c0
|
r.c1 = a.c0
|
||||||
r.c2 = a.c1
|
r.c2 = a.c1
|
||||||
r.c0.prod2x(t, NonResidue)
|
r.c0.prod2x(t, NonResidue)
|
||||||
|
@ -1243,7 +1243,7 @@ func square*(r: var QuadraticExt, a: QuadraticExt) =
|
||||||
when true:
|
when true:
|
||||||
when UseASM_X86_64 and a.c0.mres.limbs.len <= 6 and r.typeof.has1extraBit():
|
when UseASM_X86_64 and a.c0.mres.limbs.len <= 6 and r.typeof.has1extraBit():
|
||||||
if ({.noSideEffect.}: hasAdx()):
|
if ({.noSideEffect.}: hasAdx()):
|
||||||
r.coords.sqrx_complex_sparebit_asm_adx_bmi2(a.coords)
|
r.coords.sqrx_complex_sparebit_asm_adx(a.coords)
|
||||||
else:
|
else:
|
||||||
r.square_complex(a)
|
r.square_complex(a)
|
||||||
else:
|
else:
|
||||||
|
@ -1281,7 +1281,7 @@ func prod*(r: var QuadraticExt, a, b: QuadraticExt) =
|
||||||
else: # faster
|
else: # faster
|
||||||
when UseASM_X86_64 and a.c0.mres.limbs.len <= 6:
|
when UseASM_X86_64 and a.c0.mres.limbs.len <= 6:
|
||||||
if ({.noSideEffect.}: hasAdx()):
|
if ({.noSideEffect.}: hasAdx()):
|
||||||
r.coords.mulx_complex_asm_adx_bmi2(a.coords, b.coords)
|
r.coords.mul_fp2_complex_asm_adx(a.coords, b.coords)
|
||||||
else:
|
else:
|
||||||
var d {.noInit.}: doublePrec(typeof(r))
|
var d {.noInit.}: doublePrec(typeof(r))
|
||||||
d.prod2x_complex(a, b)
|
d.prod2x_complex(a, b)
|
||||||
|
@ -1318,7 +1318,7 @@ func prod2x*(r: var QuadraticExt2x, a, b: QuadraticExt) =
|
||||||
when a.fromComplexExtension():
|
when a.fromComplexExtension():
|
||||||
when UseASM_X86_64 and a.c0.mres.limbs.len <= 6:
|
when UseASM_X86_64 and a.c0.mres.limbs.len <= 6:
|
||||||
if ({.noSideEffect.}: hasAdx()):
|
if ({.noSideEffect.}: hasAdx()):
|
||||||
r.coords.mulx2x_complex_asm_adx_bmi2(a.coords, b.coords)
|
r.coords.mul2x_fp2_complex_asm_adx(a.coords, b.coords)
|
||||||
else:
|
else:
|
||||||
r.prod2x_complex(a, b)
|
r.prod2x_complex(a, b)
|
||||||
else:
|
else:
|
||||||
|
@ -1591,7 +1591,7 @@ func prod2xImpl(r: var CubicExt2x, a, b: CubicExt) =
|
||||||
V2.prod2x(a.c2, b.c2)
|
V2.prod2x(a.c2, b.c2)
|
||||||
|
|
||||||
# r₀ = β ((a₁ + a₂)(b₁ + b₂) - v₁ - v₂) + v₀
|
# r₀ = β ((a₁ + a₂)(b₁ + b₂) - v₁ - v₂) + v₀
|
||||||
when false: # CubicExt.has1extraBit():
|
when a.c0 is Fp and CubicExt.has1extraBit():
|
||||||
t0.sumUnr(a.c1, a.c2)
|
t0.sumUnr(a.c1, a.c2)
|
||||||
t1.sumUnr(b.c1, b.c2)
|
t1.sumUnr(b.c1, b.c2)
|
||||||
else:
|
else:
|
||||||
|
@ -1604,7 +1604,7 @@ func prod2xImpl(r: var CubicExt2x, a, b: CubicExt) =
|
||||||
r.c0.sum2xMod(r.c0, V0)
|
r.c0.sum2xMod(r.c0, V0)
|
||||||
|
|
||||||
# r₁ = (a₀ + a₁) * (b₀ + b₁) - v₀ - v₁ + β v₂
|
# r₁ = (a₀ + a₁) * (b₀ + b₁) - v₀ - v₁ + β v₂
|
||||||
when false: # CubicExt.has1extraBit():
|
when a.c0 is Fp and CubicExt.has1extraBit():
|
||||||
t0.sumUnr(a.c0, a.c1)
|
t0.sumUnr(a.c0, a.c1)
|
||||||
t1.sumUnr(b.c0, b.c1)
|
t1.sumUnr(b.c0, b.c1)
|
||||||
else:
|
else:
|
||||||
|
@ -1617,7 +1617,7 @@ func prod2xImpl(r: var CubicExt2x, a, b: CubicExt) =
|
||||||
r.c1.sum2xMod(r.c1, r.c2)
|
r.c1.sum2xMod(r.c1, r.c2)
|
||||||
|
|
||||||
# r₂ = (a₀ + a₂) * (b₀ + b₂) - v₀ - v₂ + v₁
|
# r₂ = (a₀ + a₂) * (b₀ + b₂) - v₀ - v₂ + v₁
|
||||||
when false: # CubicExt.has1extraBit():
|
when a.c0 is Fp and CubicExt.has1extraBit():
|
||||||
t0.sumUnr(a.c0, a.c2)
|
t0.sumUnr(a.c0, a.c2)
|
||||||
t1.sumUnr(b.c0, b.c2)
|
t1.sumUnr(b.c0, b.c2)
|
||||||
else:
|
else:
|
||||||
|
|
|
@ -84,7 +84,11 @@ The optimizations can be of algebraic, algorithmic or "implementation details" n
|
||||||
- [ ] NAF recoding
|
- [ ] NAF recoding
|
||||||
- [ ] windowed-NAF recoding
|
- [ ] windowed-NAF recoding
|
||||||
- [ ] SIMD vectorized select in window algorithm
|
- [ ] SIMD vectorized select in window algorithm
|
||||||
- [ ] Almost Montgomery Multiplication, https://eprint.iacr.org/2011/239.pdf
|
- [ ] Montgomery Multiplication with no final substraction,
|
||||||
|
- Bos and Montgomery, https://eprint.iacr.org/2017/1057.pdf
|
||||||
|
- Colin D Walter, https://colinandmargaret.co.uk/Research/CDW_ELL_99.pdf
|
||||||
|
- Hachez and Quisquater, https://link.springer.com/content/pdf/10.1007%2F3-540-44499-8_23.pdf
|
||||||
|
- Gueron, https://eprint.iacr.org/2011/239.pdf
|
||||||
- [ ] Pippenger multi-exponentiation (variable-time)
|
- [ ] Pippenger multi-exponentiation (variable-time)
|
||||||
- [ ] parallelized Pippenger
|
- [ ] parallelized Pippenger
|
||||||
|
|
||||||
|
|
|
@ -148,7 +148,7 @@ func random_unsafe(rng: var RngState, a: var FF) =
|
||||||
|
|
||||||
# Note: a simple modulo will be biaised but it's simple and "fast"
|
# Note: a simple modulo will be biaised but it's simple and "fast"
|
||||||
reduced.reduce(unreduced, FF.fieldMod())
|
reduced.reduce(unreduced, FF.fieldMod())
|
||||||
a.mres.montyResidue(reduced, FF.fieldMod(), FF.getR2modP(), FF.getNegInvModWord(), FF.getSpareBits())
|
a.mres.getMont(reduced, FF.fieldMod(), FF.getR2modP(), FF.getNegInvModWord(), FF.getSpareBits())
|
||||||
|
|
||||||
func random_unsafe(rng: var RngState, a: var ExtensionField) =
|
func random_unsafe(rng: var RngState, a: var ExtensionField) =
|
||||||
## Recursively initialize an extension Field element
|
## Recursively initialize an extension Field element
|
||||||
|
@ -179,7 +179,7 @@ func random_highHammingWeight(rng: var RngState, a: var FF) =
|
||||||
|
|
||||||
# Note: a simple modulo will be biaised but it's simple and "fast"
|
# Note: a simple modulo will be biaised but it's simple and "fast"
|
||||||
reduced.reduce(unreduced, FF.fieldMod())
|
reduced.reduce(unreduced, FF.fieldMod())
|
||||||
a.mres.montyResidue(reduced, FF.fieldMod(), FF.getR2modP(), FF.getNegInvModWord(), FF.getSpareBits())
|
a.mres.getMont(reduced, FF.fieldMod(), FF.getR2modP(), FF.getNegInvModWord(), FF.getSpareBits())
|
||||||
|
|
||||||
func random_highHammingWeight(rng: var RngState, a: var ExtensionField) =
|
func random_highHammingWeight(rng: var RngState, a: var ExtensionField) =
|
||||||
## Recursively initialize an extension Field element
|
## Recursively initialize an extension Field element
|
||||||
|
@ -224,7 +224,7 @@ func random_long01Seq(rng: var RngState, a: var FF) =
|
||||||
|
|
||||||
# Note: a simple modulo will be biaised but it's simple and "fast"
|
# Note: a simple modulo will be biaised but it's simple and "fast"
|
||||||
reduced.reduce(unreduced, FF.fieldMod())
|
reduced.reduce(unreduced, FF.fieldMod())
|
||||||
a.mres.montyResidue(reduced, FF.fieldMod(), FF.getR2modP(), FF.getNegInvModWord(), FF.getSpareBits())
|
a.mres.getMont(reduced, FF.fieldMod(), FF.getR2modP(), FF.getNegInvModWord(), FF.getSpareBits())
|
||||||
|
|
||||||
func random_long01Seq(rng: var RngState, a: var ExtensionField) =
|
func random_long01Seq(rng: var RngState, a: var ExtensionField) =
|
||||||
## Recursively initialize an extension Field element
|
## Recursively initialize an extension Field element
|
||||||
|
|
|
@ -8,7 +8,8 @@
|
||||||
|
|
||||||
import std/unittest,
|
import std/unittest,
|
||||||
../constantine/arithmetic,
|
../constantine/arithmetic,
|
||||||
../constantine/io/io_fields,
|
../constantine/arithmetic/limbs_montgomery,
|
||||||
|
../constantine/io/[io_bigints, io_fields],
|
||||||
../constantine/config/curves
|
../constantine/config/curves
|
||||||
|
|
||||||
static: doAssert defined(testingCurves), "This modules requires the -d:testingCurves compile option"
|
static: doAssert defined(testingCurves), "This modules requires the -d:testingCurves compile option"
|
||||||
|
@ -317,4 +318,54 @@ proc largeField() =
|
||||||
|
|
||||||
check: bool r.isZero()
|
check: bool r.isZero()
|
||||||
|
|
||||||
|
test "fromMont doesn't need a final substraction with 256-bit prime (full word used)":
|
||||||
|
block:
|
||||||
|
var a: Fp[Secp256k1]
|
||||||
|
a.mres = Fp[Secp256k1].getMontyPrimeMinus1()
|
||||||
|
let expected = BigInt[256].fromHex"0xFFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFE FFFFFC2E"
|
||||||
|
|
||||||
|
var r: BigInt[256]
|
||||||
|
r.fromField(a)
|
||||||
|
|
||||||
|
check: bool(r == expected)
|
||||||
|
block:
|
||||||
|
var a: Fp[Secp256k1]
|
||||||
|
var d: FpDbl[Secp256k1]
|
||||||
|
|
||||||
|
# Set Montgomery repr to the largest field element in Montgomery Residue form
|
||||||
|
a.mres = BigInt[256].fromHex"0xFFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFE FFFFFC2E"
|
||||||
|
d.limbs2x = (BigInt[512].fromHex"0xFFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFE FFFFFC2E").limbs
|
||||||
|
|
||||||
|
var r, expected: BigInt[256]
|
||||||
|
|
||||||
|
r.fromField(a)
|
||||||
|
expected.limbs.redc2xMont(d.limbs2x, Secp256k1.Mod().limbs, Fp[Secp256k1].getNegInvModWord(), Fp[Secp256k1].getSpareBits())
|
||||||
|
|
||||||
|
check: bool(r == expected)
|
||||||
|
|
||||||
|
test "fromMont doesn't need a final substraction with 255-bit prime (1 spare bit)":
|
||||||
|
block:
|
||||||
|
var a: Fp[Curve25519]
|
||||||
|
a.mres = Fp[Curve25519].getMontyPrimeMinus1()
|
||||||
|
let expected = BigInt[255].fromHex"0x7fffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffec"
|
||||||
|
|
||||||
|
var r: BigInt[255]
|
||||||
|
r.fromField(a)
|
||||||
|
|
||||||
|
check: bool(r == expected)
|
||||||
|
block:
|
||||||
|
var a: Fp[Curve25519]
|
||||||
|
var d: FpDbl[Curve25519]
|
||||||
|
|
||||||
|
# Set Montgomery repr to the largest field element in Montgomery Residue form
|
||||||
|
a.mres = BigInt[255].fromHex"0x7fffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffec"
|
||||||
|
d.limbs2x = (BigInt[512].fromHex"0x7fffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffec").limbs
|
||||||
|
|
||||||
|
var r, expected: BigInt[255]
|
||||||
|
|
||||||
|
r.fromField(a)
|
||||||
|
expected.limbs.redc2xMont(d.limbs2x, Curve25519.Mod().limbs, Fp[Curve25519].getNegInvModWord(), Fp[Curve25519].getSpareBits())
|
||||||
|
|
||||||
|
check: bool(r == expected)
|
||||||
|
|
||||||
largeField()
|
largeField()
|
||||||
|
|
Loading…
Reference in New Issue