Low-level refactoring (#175)
* Add specific fromMont conversion routine. Rename montyResidue to getMont
* missed test file
* Add x86_64 ASM for fromMont
* Add x86_64 MULX/ADCX/ADOX for fromMont
* rework Montgomery Multiplication with prefetch/latency hiding techniques
* Fix ADX autodetection, closes #174. Rollback faster mul_mont attempt, no improvement and debug pain.
* finalSub in fromMont & adx_bmi -> adx
* Some {.noInit.} to avoid Nim zeroMem (which should be optimized away but who knows)
* Uniformize name 'op+domain': mulmod - mulmont
* Fix asm codegen bug "0x0000555555565930 <+896>: sbb 0x20(%r8),%r8" with Clang in final substraction
* Prepare for skipping final substraction
* Don't forget to copy the result when we skip the final substraction
* Seems like we need to stash the idea of skipping the final substraction for now, needs bounds analysis https://eprint.iacr.org/2017/1057.pdf
* fix condition for ASM 32-bit
* optim modular addition when sparebit is available
Parent: 53c4db7ead
Commit: 14af7e8724
@@ -91,6 +91,20 @@ proc sqrBench*(T: typedesc, iters: int) =
  bench("Squaring", T, iters):
    r.square(x)

proc toBigBench*(T: typedesc, iters: int) =
  var r: matchingBigInt(T.C)
  let x = rng.random_unsafe(T)
  preventOptimAway(r)
  bench("BigInt <- field conversion", T, iters):
    r.fromField(x)

proc toFieldBench*(T: typedesc, iters: int) =
  var r: T
  let x = rng.random_unsafe(matchingBigInt(T.C))
  preventOptimAway(r)
  bench("BigInt -> field conversion", T, iters):
    r.fromBig(x)

proc invBench*(T: typedesc, iters: int) =
  var r: T
  let x = rng.random_unsafe(T)
@@ -50,6 +50,9 @@ proc main() =
    mulBench(Fp[curve], Iters)
    sqrBench(Fp[curve], Iters)
    smallSeparator()
    toBigBench(Fp[curve], Iters)
    toFieldBench(Fp[curve], Iters)
    smallSeparator()
    invBench(Fp[curve], ExponentIters)
    sqrtBench(Fp[curve], ExponentIters)
    sqrtRatioBench(Fp[curve], ExponentIters)
@@ -24,14 +24,111 @@ import
# They are nice to let the compiler deals with mov
# but too constraining so we move things ourselves.

static: doAssert UseASM_X86_64
static: doAssert UseASM_X86_32

{.localPassC:"-fomit-frame-pointer".} # Needed so that the compiler finds enough registers

proc finalSubNoCarryImpl*(
       ctx: var Assembler_x86,
       r: Operand or OperandArray,
       a, M, scratch: OperandArray
     ) =
  ## Reduce `a` into `r` modulo `M`
  ##
  ## r, a, scratch, scratchReg are mutated
  ## M is read-only
  let N = M.len
  ctx.comment "Final substraction (no carry)"
  for i in 0 ..< N:
    ctx.mov scratch[i], a[i]
    if i == 0:
      ctx.sub scratch[i], M[i]
    else:
      ctx.sbb scratch[i], M[i]

  # If we borrowed it means that we were smaller than
  # the modulus and we don't need "scratch"
  for i in 0 ..< N:
    ctx.cmovnc a[i], scratch[i]
    ctx.mov r[i], a[i]

proc finalSubMayCarryImpl*(
       ctx: var Assembler_x86,
       r: Operand or OperandArray,
       a, M, scratch: OperandArray,
       scratchReg: Operand or Register or OperandReuse
     ) =
  ## Reduce `a` into `r` modulo `M`
  ## To be used when the final substraction can
  ## also depend on the carry flag
  ##
  ## r, a, scratch, scratchReg are mutated
  ## M is read-only

  ctx.comment "Final substraction (may carry)"

  # Mask: scratchReg contains 0xFFFF or 0x0000
  ctx.sbb scratchReg, scratchReg

  # Now substract the modulus to test a < p
  let N = M.len
  for i in 0 ..< N:
    ctx.mov scratch[i], a[i]
    if i == 0:
      ctx.sub scratch[i], M[i]
    else:
      ctx.sbb scratch[i], M[i]

  # If it overflows here, it means that it was
  # smaller than the modulus and we don't need `scratch`
  ctx.sbb scratchReg, 0

  # If we borrowed it means that we were smaller than
  # the modulus and we don't need "scratch"
  for i in 0 ..< N:
    ctx.cmovnc a[i], scratch[i]
    ctx.mov r[i], a[i]

macro finalSub_gen*[N: static int](
       r_PIR: var array[N, SecretWord],
       a_EIR, M_PIR: array[N, SecretWord],
       scratch_EIR: var array[N, SecretWord],
       mayCarry: static bool): untyped =
  ## Returns:
  ##   a-M if a > M
  ##   a otherwise
  ##
  ## - r_PIR is a pointer to the result array, mutated,
  ## - a_EIR is an array of registers, mutated,
  ## - M_PIR is a pointer to an array, read-only,
  ## - scratch_EIR is an array of registers, mutated
  ## - mayCarry is set to true when the carry flag also needs to be read
  result = newStmtList()

  var ctx = init(Assembler_x86, BaseType)
  let
    r = init(OperandArray, nimSymbol = r_PIR, N, PointerInReg, InputOutput)
    # We reuse the reg used for b for overflow detection
    a = init(OperandArray, nimSymbol = a_EIR, N, ElemsInReg, InputOutput)
    # We could force m as immediate by specializing per moduli
    M = init(OperandArray, nimSymbol = M_PIR, N, PointerInReg, Input)
    t = init(OperandArray, nimSymbol = scratch_EIR, N, ElemsInReg, Output_EarlyClobber)

  if mayCarry:
    ctx.finalSubMayCarryImpl(
      r, a, M, t, rax
    )
  else:
    ctx.finalSubNoCarryImpl(
      r, a, M, t
    )

  result.add ctx.generate()

# Field addition
# ------------------------------------------------------------

macro addmod_gen[N: static int](R: var Limbs[N], A, B, m: Limbs[N]): untyped =
macro addmod_gen[N: static int](R: var Limbs[N], A, B, m: Limbs[N], hasSpareBit: static bool): untyped =
  ## Generate an optimized modular addition kernel
  # Register pressure note:
  #   We could generate a kernel per modulus m by hardcoding it as immediate
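For readers who want to relate the two final-subtraction helpers above (finalSubNoCarryImpl / finalSubMayCarryImpl) to ordinary code, here is a minimal plain-Nim sketch of the no-carry case. It is illustrative only: the helper name is made up, the borrow is computed with comparisons instead of the CPU carry flag, and the real kernels stay branchless by using sub/sbb and cmovnc.

# Illustrative sketch (not the library API): conditionally subtract M from a
# value `a` in [0, 2M), mirroring the sub/sbb + cmovnc sequence emitted above.
func finalSubSketch[N: static int](a: var array[N, uint64], M: array[N, uint64]) =
  var scratch: array[N, uint64]
  var borrow = 0'u64
  for i in 0 ..< N:
    scratch[i] = a[i] - M[i] - borrow     # wraps around on borrow, like sub/sbb
    borrow = (if a[i] < M[i] or (a[i] == M[i] and borrow == 1): 1'u64 else: 0'u64)
  # If we borrowed, `a` was already below M: keep it. Otherwise keep a - M.
  # The assembly selects with cmovnc; the mask below is the same idea.
  let keepA = 0'u64 - borrow              # all-ones if a < M, zero otherwise
  for i in 0 ..< N:
    a[i] = (a[i] and keepA) or (scratch[i] and not keepA)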
@@ -68,33 +165,18 @@ macro addmod_gen[N: static int](R: var Limbs[N], A, B, m: Limbs[N]): untyped =
    # Interleaved copy in a second buffer as well
    ctx.mov v[i], u[i]

  # Mask: overflowed contains 0xFFFF or 0x0000
  # TODO: unnecessary if MSB never set, i.e. "Field.getSpareBits >= 1"
  let overflowed = b.reuseRegister()
  ctx.sbb overflowed, overflowed

  # Now substract the modulus to test a < p
  for i in 0 ..< N:
    if i == 0:
      ctx.sub v[0], M[0]
  if hasSparebit:
    ctx.finalSubNoCarryImpl(r, u, M, v)
    else:
      ctx.sbb v[i], M[i]
  else:
    ctx.finalSubMayCarryImpl(
      r, u, M, v, b.reuseRegister()
    )

  # If it overflows here, it means that it was
  # smaller than the modulus and we don't need V
  ctx.sbb overflowed, 0
  result.add ctx.generate()

  # Conditional Mov and
  # and store result
  for i in 0 ..< N:
    ctx.cmovnc u[i], v[i]
    ctx.mov r[i], u[i]

  result.add ctx.generate

func addmod_asm*(r: var Limbs, a, b, m: Limbs) =
func addmod_asm*(r: var Limbs, a, b, m: Limbs, hasSpareBit: static bool) =
  ## Constant-time modular addition
  addmod_gen(r, a, b, m)
  addmod_gen(r, a, b, m, hasSpareBit)

# Field substraction
# ------------------------------------------------------------
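The hasSpareBit specialization above is worth spelling out: when the modulus leaves its top bit unused, a + b for operands below M cannot overflow the N-word representation, so only the borrow of the trial subtraction matters and finalSubNoCarryImpl suffices; without a spare bit the carry out of the addition must be folded in as well, hence finalSubMayCarryImpl. A rough word-level sketch in plain Nim with made-up names, reusing finalSubSketch from the earlier sketch:

# Sketch of r <- (a + b) mod M for a, b already in [0, M), assuming M has at
# least one spare top bit so the final word-level carry is provably zero.
func addmodSparebitSketch[N: static int](
       r: var array[N, uint64], a, b, M: array[N, uint64]) =
  var carry = 0'u64
  for i in 0 ..< N:
    let t = a[i] + b[i]                   # may wrap
    let s = t + carry
    carry = (if t < a[i]: 1'u64 else: 0'u64) + (if s < t: 1'u64 else: 0'u64)
    r[i] = s
  # With a spare bit, a + b < 2^(64*N), so `carry` is 0 here and the mapping
  # back to [0, M) is a single borrow-only conditional subtraction.
  finalSubSketch(r, M)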
@@ -225,7 +307,7 @@ when isMainModule:
    debugecho " a: ", a.toHex()
    debugecho " b: ", b.toHex()
    debugecho " m: ", m.toHex()
    addmod_asm(a, a, b, m)
    addmod_asm(a, a, b, m, hasSpareBit = false)
    debugecho "after:"
    debugecho " a: ", a.toHex().tolower
    debugecho " s: ", s

@@ -245,7 +327,7 @@ when isMainModule:
    debugecho " a: ", a.toHex()
    debugecho " b: ", b.toHex()
    debugecho " m: ", m.toHex()
    addmod_asm(a, a, b, m)
    addmod_asm(a, a, b, m, hasSpareBit = false)
    debugecho "after:"
    debugecho " a: ", a.toHex().tolower
    debugecho " s: ", s

@@ -265,7 +347,7 @@ when isMainModule:
    debugecho " a: ", a.toHex()
    debugecho " b: ", b.toHex()
    debugecho " m: ", m.toHex()
    addmod_asm(a, a, b, m)
    addmod_asm(a, a, b, m, hasSpareBit = false)
    debugecho "after:"
    debugecho " a: ", a.toHex().tolower
    debugecho " s: ", s

@@ -285,7 +367,7 @@ when isMainModule:
    debugecho " a: ", a.toHex()
    debugecho " b: ", b.toHex()
    debugecho " m: ", m.toHex()
    addmod_asm(a, a, b, m)
    addmod_asm(a, a, b, m, hasSpareBit = false)
    debugecho "after:"
    debugecho " a: ", a.toHex().tolower
    debugecho " s: ", s

@@ -308,7 +390,7 @@ when isMainModule:
    debugecho " a: ", a.toHex()
    debugecho " b: ", b.toHex()
    debugecho " m: ", m.toHex()
    submod_asm(a, a, b, m)
    submod_asm(a, a, b, m, hasSpareBit = false)
    debugecho "after:"
    debugecho " a: ", a.toHex().tolower
    debugecho " s: ", s

@@ -333,7 +415,7 @@ when isMainModule:
    debugecho " a: ", a.toHex()
    debugecho " b: ", b.toHex()
    debugecho " m: ", m.toHex()
    submod_asm(r, a, b, m)
    submod_asm(r, a, b, m, hasSpareBit = false)
    debugecho "after:"
    debugecho " r: ", r.toHex().tolower
    debugecho " s: ", s
@@ -1,160 +0,0 @@
# Constantine
# Copyright (c) 2018-2019 Status Research & Development GmbH
# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
# Licensed and distributed under either of
#   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
#   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
# at your option. This file may not be copied, modified, or distributed except according to those terms.

import
  # Standard library
  std/macros,
  # Internal
  ../../config/common,
  ../../primitives,
  ./limbs_asm_montred_x86

# ############################################################
#
#        Assembly implementation of finite fields
#
# ############################################################

static: doAssert UseASM_X86_64

# MULX/ADCX/ADOX
{.localPassC:"-madx -mbmi2".}
# Necessary for the compiler to find enough registers (enabled at -O1)
{.localPassC:"-fomit-frame-pointer".}

# No exceptions allowed
{.push raises: [].}

# Montgomery reduction
# ------------------------------------------------------------

macro montyRedc2x_adx_gen*[N: static int](
       r_MR: var array[N, SecretWord],
       a_MR: array[N*2, SecretWord],
       M_MR: array[N, SecretWord],
       m0ninv_MR: BaseType,
       hasSpareBit: static bool
     ) =
  result = newStmtList()

  var ctx = init(Assembler_x86, BaseType)
  let
    # We could force M as immediate by specializing per moduli
    M = init(OperandArray, nimSymbol = M_MR, N, PointerInReg, Input)

  let uSlots = N+1
  let vSlots = max(N-1, 5)

  var # Scratchspaces
    u = init(OperandArray, nimSymbol = ident"U", uSlots, ElemsInReg, InputOutput_EnsureClobber)
    v = init(OperandArray, nimSymbol = ident"V", vSlots, ElemsInReg, InputOutput_EnsureClobber)

  # Prologue
  let usym = u.nimSymbol
  let vsym = v.nimSymbol
  result.add quote do:
    static: doAssert: sizeof(SecretWord) == sizeof(ByteAddress)
    var `usym`{.noinit.}: Limbs[`uSlots`]
    var `vsym` {.noInit.}: Limbs[`vSlots`]
    `vsym`[0] = cast[SecretWord](`r_MR`[0].unsafeAddr)
    `vsym`[1] = cast[SecretWord](`a_MR`[0].unsafeAddr)
    `vsym`[2] = SecretWord(`m0ninv_MR`)

  let r_temp = v[0].asArrayAddr(len = N)
  let a = v[1].asArrayAddr(len = 2*N)
  let m0ninv = v[2]
  let lo = v[3]
  let hi = v[4]

  # Algorithm
  # ---------------------------------------------------------
  # for i in 0 .. n-1:
  #   hi <- 0
  #   m <- a[i] * m0ninv mod 2^w (i.e. simple multiplication)
  #   for j in 0 .. n-1:
  #     (hi, lo) <- a[i+j] + m * M[j] + hi
  #     a[i+j] <- lo
  #   a[i+n] += hi
  # for i in 0 .. n-1:
  #   r[i] = a[i+n]
  # if r >= M:
  #   r -= M

  # No register spilling handling
  doAssert N <= 6, "The Assembly-optimized montgomery multiplication requires at most 6 limbs."

  ctx.mov rdx, m0ninv

  for i in 0 ..< N:
    ctx.mov u[i], a[i]

  for i in 0 ..< N:
    # RDX contains m0ninv at the start of each loop
    ctx.comment ""
    ctx.imul rdx, u[0] # m <- a[i] * m0ninv mod 2^w
    ctx.comment "---- Reduction " & $i
    ctx.`xor` u[N], u[N]

    for j in 0 ..< N-1:
      ctx.comment ""
      ctx.mulx hi, lo, M[j], rdx
      ctx.adcx u[j], lo
      ctx.adox u[j+1], hi

    # Last limb
    ctx.comment ""
    ctx.mulx hi, lo, M[N-1], rdx
    ctx.mov rdx, m0ninv # Reload m0ninv for next iter
    ctx.adcx u[N-1], lo
    ctx.adox hi, u[N]
    ctx.adcx u[N], hi

    u.rotateLeft()

  ctx.mov rdx, r_temp
  let r = rdx.asArrayAddr(len = N)

  # This does a[i+n] += hi
  # but in a separate carry chain, fused with the
  # copy "r[i] = a[i+n]"
  for i in 0 ..< N:
    if i == 0:
      ctx.add u[i], a[i+N]
    else:
      ctx.adc u[i], a[i+N]

  let t = repackRegisters(v, u[N])

  if hasSpareBit:
    ctx.finalSubNoCarry(r, u, M, t)
  else:
    ctx.finalSubCanOverflow(r, u, M, t, hi)

  # Code generation
  result.add ctx.generate()

func montRed_asm_adx_bmi2_impl*[N: static int](
       r: var array[N, SecretWord],
       a: array[N*2, SecretWord],
       M: array[N, SecretWord],
       m0ninv: BaseType,
       hasSpareBit: static bool
     ) =
  ## Constant-time Montgomery reduction
  ## Inline-version
  montyRedc2x_adx_gen(r, a, M, m0ninv, hasSpareBit)

func montRed_asm_adx_bmi2*[N: static int](
       r: var array[N, SecretWord],
       a: array[N*2, SecretWord],
       M: array[N, SecretWord],
       m0ninv: BaseType,
       hasSpareBit: static bool
     ) =
  ## Constant-time Montgomery reduction
  montRed_asm_adx_bmi2_impl(r, a, M, m0ninv, hasSpareBit)
@@ -12,7 +12,8 @@ import
  # Internal
  ../../config/common,
  ../../primitives,
  ./limbs_asm_montred_x86,
  ./limbs_asm_modular_x86,
  ./limbs_asm_redc_mont_x86,
  ./limbs_asm_mul_x86

# ############################################################

@@ -34,7 +35,11 @@ static: doAssert UseASM_X86_64
# Montgomery multiplication
# ------------------------------------------------------------
# Fallback when no ADX and BMI2 support (MULX, ADCX, ADOX)
macro montMul_CIOS_sparebit_gen[N: static int](r_MM: var Limbs[N], a_MM, b_MM, M_MM: Limbs[N], m0ninv_MM: BaseType): untyped =
macro mulMont_CIOS_sparebit_gen[N: static int](
       r_PIR: var Limbs[N], a_PIR, b_PIR,
       M_PIR: Limbs[N], m0ninv_REG: BaseType,
       skipReduction: static bool
     ): untyped =
  ## Generate an optimized Montgomery Multiplication kernel
  ## using the CIOS method
  ##

@@ -44,14 +49,17 @@ macro montMul_CIOS_sparebit_gen[N: static int](r_MM: var Limbs[N], a_MM, b_MM, M
  ##   M[^1] < high(SecretWord) shr 2 (i.e. less than 0b00111...1111)
  ## https://hackmd.io/@zkteam/modular_multiplication

  # No register spilling handling
  doAssert N <= 6, "The Assembly-optimized montgomery multiplication requires at most 6 limbs."

  result = newStmtList()

  var ctx = init(Assembler_x86, BaseType)
  let
    scratchSlots = max(N, 6)
    scratchSlots = 6

    # We could force M as immediate by specializing per moduli
    M = init(OperandArray, nimSymbol = M_MM, N, PointerInReg, Input)
    M = init(OperandArray, nimSymbol = M_PIR, N, PointerInReg, Input)
    # If N is too big, we need to spill registers. TODO.
    t = init(OperandArray, nimSymbol = ident"t", N, ElemsInReg, Output_EarlyClobber)
    # MultiPurpose Register slots

@@ -62,10 +70,10 @@ macro montMul_CIOS_sparebit_gen[N: static int](r_MM: var Limbs[N], a_MM, b_MM, M
    m0ninv = Operand(
      desc: OperandDesc(
        asmId: "[m0ninv]",
        nimSymbol: m0ninv_MM,
        nimSymbol: m0ninv_REG,
        rm: MemOffsettable,
        constraint: Input,
        cEmit: "&" & $m0ninv_MM
        cEmit: "&" & $m0ninv_REG
      )
    )

@@ -76,7 +84,7 @@ macro montMul_CIOS_sparebit_gen[N: static int](r_MM: var Limbs[N], a_MM, b_MM, M
    b = scratch[1].asArrayAddr(len = N) # Store the `b` operand
    A = scratch[2]                      # High part of extended precision multiplication
    C = scratch[3]
    m = scratch[4]                      # Stores (t[0] * m0ninv) mod 2^w
    m = scratch[4]                      # Stores (t[0] * m0ninv) mod 2ʷ
    r = scratch[5]                      # Stores the `r` operand

  # Registers used:

@@ -94,18 +102,18 @@ macro montMul_CIOS_sparebit_gen[N: static int](r_MM: var Limbs[N], a_MM, b_MM, M
  result.add quote do:
    static: doAssert: sizeof(SecretWord) == sizeof(ByteAddress)

    var `tsym`: typeof(`r_MM`) # zero init
    var `tsym`: typeof(`r_PIR`) # zero init
    # Assumes 64-bit limbs on 64-bit arch (or you can't store an address)
    var `scratchSym` {.noInit.}: Limbs[`scratchSlots`]
    `scratchSym`[0] = cast[SecretWord](`a_MM`[0].unsafeAddr)
    `scratchSym`[1] = cast[SecretWord](`b_MM`[0].unsafeAddr)
    `scratchSym`[5] = cast[SecretWord](`r_MM`[0].unsafeAddr)
    `scratchSym`[0] = cast[SecretWord](`a_PIR`[0].unsafeAddr)
    `scratchSym`[1] = cast[SecretWord](`b_PIR`[0].unsafeAddr)
    `scratchSym`[5] = cast[SecretWord](`r_PIR`[0].unsafeAddr)

  # Algorithm
  # -----------------------------------------
  # for i=0 to N-1
  #   (A, t[0]) <- a[0] * b[i] + t[0]
  #   m <- (t[0] * m0ninv) mod 2^w
  #   m <- (t[0] * m0ninv) mod 2ʷ
  #   (C, _) <- m * M[0] + t[0]
  #   for j=1 to N-1
  #     (A, t[j]) <- a[j] * b[i] + A + t[j]
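As a companion to the algorithm comment above, here is a straight-line reference version of CIOS Montgomery multiplication in plain Nim. To keep every product inside a native word it uses 32-bit digits stored in uint64, and it assumes a spare bit in the modulus so the running total fits in N digits plus a small top digit; the names and the digit size are illustrative, not the library's. The generated kernel does the same thing with 64-bit limbs and interleaved carry chains.

const W = 32                              # digit size chosen so products fit in uint64
const DigitMask = (1'u64 shl W) - 1

# Reference CIOS: computes a*b*2^(-W*N) mod M, still possibly in [M, 2M)
# before the final conditional subtraction, with m0ninv = -M[0]^-1 mod 2^W.
func mulMontCIOSSketch[N: static int](
       a, b, M: array[N, uint64], m0ninv: uint64): array[N, uint64] =
  var t: array[N+1, uint64]               # t[N] holds the running top digit
  for i in 0 ..< N:
    # t <- t + a * b[i]
    var carry = 0'u64
    for j in 0 ..< N:
      let acc = t[j] + a[j] * b[i] + carry
      t[j] = acc and DigitMask
      carry = acc shr W
    t[N] += carry
    # t <- (t + m * M) / 2^W, with m chosen so the lowest digit cancels
    let m = (t[0] * m0ninv) and DigitMask
    var acc = t[0] + m * M[0]             # low digit becomes 0 by construction
    carry = acc shr W
    for j in 1 ..< N:
      acc = t[j] + m * M[j] + carry
      t[j-1] = acc and DigitMask
      carry = acc shr W
    let top = t[N] + carry
    t[N-1] = top and DigitMask
    t[N] = top shr W
  for i in 0 ..< N:
    result[i] = t[i]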
@@ -127,7 +135,7 @@ macro montMul_CIOS_sparebit_gen[N: static int](r_MM: var Limbs[N], a_MM, b_MM, M
    ctx.adc rdx, 0
    ctx.mov A, rdx

    # m <- (t[0] * m0ninv) mod 2^w
    # m <- (t[0] * m0ninv) mod 2ʷ
    ctx.mov m, m0ninv
    ctx.imul m, t[0]

@@ -164,19 +172,27 @@ macro montMul_CIOS_sparebit_gen[N: static int](r_MM: var Limbs[N], a_MM, b_MM, M
    ctx.add A, C
    ctx.mov t[N-1], A

  ctx.mov rdx, r
  let r2 = rdx.asArrayAddr(len = N)
  ctx.mov rax, r # move r away from scratchspace that will be used for final substraction
  let r2 = rax.asArrayAddr(len = N)

  ctx.finalSubNoCarry(
  if skipReduction:
    for i in 0 ..< N:
      ctx.mov r2[i], t[i]
  else:
    ctx.finalSubNoCarryImpl(
      r2, t, M,
      scratch
    )
    result.add ctx.generate()

  result.add ctx.generate

func montMul_CIOS_sparebit_asm*(r: var Limbs, a, b, M: Limbs, m0ninv: BaseType) =
  ## Constant-time modular multiplication
  montMul_CIOS_sparebit_gen(r, a, b, M, m0ninv)
func mulMont_CIOS_sparebit_asm*(r: var Limbs, a, b, M: Limbs, m0ninv: BaseType, skipReduction: static bool = false) =
  ## Constant-time Montgomery multiplication
  ## If "skipReduction" is set
  ## the result is in the range [0, 2M)
  ## otherwise the result is in the range [0, M)
  ##
  ## This procedure can only be called if the modulus doesn't use the full bitwidth of its underlying representation
  r.mulMont_CIOS_sparebit_gen(a, b, M, m0ninv, skipReduction)

# Montgomery Squaring
# ------------------------------------------------------------

@@ -189,25 +205,12 @@ func square_asm_inline[rLen, aLen: static int](r: var Limbs[rLen], a: Limbs[aLen
  ## but not for stack variables
  sqr_gen(r, a)

func montRed_asm_inline[N: static int](
       r: var array[N, SecretWord],
       a: array[N*2, SecretWord],
       M: array[N, SecretWord],
       m0ninv: BaseType,
       hasSpareBit: static bool
     ) {.inline.} =
  ## Constant-time Montgomery reduction
  ## Extra indirection as the generator assumes that
  ## arrays are pointers, which is true for parameters
  ## but not for stack variables
  montyRedc2x_gen(r, a, M, m0ninv, hasSpareBit)

func montSquare_CIOS_asm*[N](
func squareMont_CIOS_asm*[N](
       r: var Limbs[N],
       a, M: Limbs[N],
       m0ninv: BaseType,
       hasSpareBit: static bool) =
       hasSpareBit, skipReduction: static bool) =
  ## Constant-time modular squaring
  var r2x {.noInit.}: Limbs[2*N]
  r2x.square_asm_inline(a)
  r.montRed_asm_inline(r2x, M, m0ninv, hasSpareBit)
  r.redcMont_asm_inline(r2x, M, m0ninv, hasSpareBit, skipReduction)
@@ -8,12 +8,12 @@

import
  # Standard library
  std/macros,
  std/[macros, algorithm],
  # Internal
  ../../config/common,
  ../../primitives,
  ./limbs_asm_montred_x86,
  ./limbs_asm_montred_x86_adx_bmi2,
  ./limbs_asm_modular_x86,
  ./limbs_asm_redc_mont_x86_adx_bmi2,
  ./limbs_asm_mul_x86_adx_bmi2

# ############################################################

@@ -36,6 +36,7 @@ static: doAssert UseASM_X86_64

# Montgomery Multiplication
# ------------------------------------------------------------

proc mulx_by_word(
       ctx: var Assembler_x86,
       hi: Operand,

@@ -149,7 +150,7 @@ proc partialRedx(
    ctx.mov rdx, t[0]
    ctx.imul rdx, m0ninv

    # Clear carry flags - TODO: necessary?
    # Clear carry flags
    ctx.`xor` S, S

    # S,_ := t[0] + m*M[0]

@@ -158,6 +159,8 @@ proc partialRedx(
    ctx.adcx lo, t[0] # set the carry flag for the future ADCX
    ctx.mov t[0], S

    ctx.mov lo, 0

    # for j=1 to N-1
    #   (S,t[j-1]) := t[j] + m*M[j] + S
    ctx.comment "  for j=1 to N-1"

@@ -170,26 +173,31 @@ proc partialRedx(
    # Last carries
    # t[N-1] = S + C
    ctx.comment "  Reduction carry "
    ctx.mov S, 0
    ctx.adcx t[N-1], S
    ctx.adox t[N-1], C
    ctx.adcx lo, C # lo contains 0 so C += S
    ctx.adox t[N-1], lo

macro montMul_CIOS_sparebit_adx_bmi2_gen[N: static int](r_MM: var Limbs[N], a_MM, b_MM, M_MM: Limbs[N], m0ninv_MM: BaseType): untyped =
macro mulMont_CIOS_sparebit_adx_gen[N: static int](
       r_PIR: var Limbs[N], a_PIR, b_PIR,
       M_PIR: Limbs[N], m0ninv_REG: BaseType,
       skipReduction: static bool): untyped =
  ## Generate an optimized Montgomery Multiplication kernel
  ## using the CIOS method
  ## This requires the most significant word of the Modulus
  ##   M[^1] < high(SecretWord) shr 2 (i.e. less than 0b00111...1111)
  ## https://hackmd.io/@zkteam/modular_multiplication

  # No register spilling handling
  doAssert N <= 6, "The Assembly-optimized montgomery multiplication requires at most 6 limbs."

  result = newStmtList()

  var ctx = init(Assembler_x86, BaseType)
  let
    scratchSlots = max(N, 6)
    scratchSlots = 6

    r = init(OperandArray, nimSymbol = r_MM, N, PointerInReg, InputOutput_EnsureClobber)
    r = init(OperandArray, nimSymbol = r_PIR, N, PointerInReg, InputOutput_EnsureClobber)
    # We could force M as immediate by specializing per moduli
    M = init(OperandArray, nimSymbol = M_MM, N, PointerInReg, Input)
    M = init(OperandArray, nimSymbol = M_PIR, N, PointerInReg, Input)
    # If N is too big, we need to spill registers. TODO.
    t = init(OperandArray, nimSymbol = ident"t", N, ElemsInReg, Output_EarlyClobber)
    # MultiPurpose Register slots

@@ -220,12 +228,12 @@ macro montMul_CIOS_sparebit_adx_bmi2_gen[N: static int](r_MM: var Limbs[N], a_MM
  result.add quote do:
    static: doAssert: sizeof(SecretWord) == sizeof(ByteAddress)

    var `tsym`: typeof(`r_MM`) # zero init
    var `tsym`{.noInit.}: typeof(`r_PIR`) # zero init
    # Assumes 64-bit limbs on 64-bit arch (or you can't store an address)
    var `scratchSym` {.noInit.}: Limbs[`scratchSlots`]
    `scratchSym`[0] = cast[SecretWord](`a_MM`[0].unsafeAddr)
    `scratchSym`[1] = cast[SecretWord](`b_MM`[0].unsafeAddr)
    `scratchSym`[4] = SecretWord `m0ninv_MM`
    `scratchSym`[0] = cast[SecretWord](`a_PIR`[0].unsafeAddr)
    `scratchSym`[1] = cast[SecretWord](`b_PIR`[0].unsafeAddr)
    `scratchSym`[4] = SecretWord `m0ninv_REG`

  # Algorithm
  # -----------------------------------------

@@ -238,9 +246,6 @@ macro montMul_CIOS_sparebit_adx_bmi2_gen[N: static int](r_MM: var Limbs[N], a_MM
  #     (C,t[j-1]) := t[j] + m*M[j] + C
  #   t[N-1] = C + A

  # No register spilling handling
  doAssert N <= 6, "The Assembly-optimized montgomery multiplication requires at most 6 limbs."

  for i in 0 ..< N:
    if i == 0:
      ctx.mulx_by_word(

@@ -263,47 +268,35 @@ macro montMul_CIOS_sparebit_adx_bmi2_gen[N: static int](r_MM: var Limbs[N], a_MM
        lo, C
      )

  ctx.finalSubNoCarry(
  if skipReduction:
    for i in 0 ..< N:
      ctx.mov r[i], t[i]
  else:
    ctx.finalSubNoCarryImpl(
      r, t, M,
      scratch
    )

  result.add ctx.generate

func montMul_CIOS_sparebit_asm_adx_bmi2*(r: var Limbs, a, b, M: Limbs, m0ninv: BaseType) =
  ## Constant-time modular multiplication
  ## Requires the prime modulus to have a spare bit in the representation. (Hence if using 64-bit words and 4 words, to be at most 255-bit)
  montMul_CIOS_sparebit_adx_bmi2_gen(r, a, b, M, m0ninv)
func mulMont_CIOS_sparebit_asm_adx*(r: var Limbs, a, b, M: Limbs, m0ninv: BaseType, skipReduction: static bool = false) =
  ## Constant-time Montgomery multiplication
  ## If "skipReduction" is set
  ## the result is in the range [0, 2M)
  ## otherwise the result is in the range [0, M)
  ##
  ## This procedure can only be called if the modulus doesn't use the full bitwidth of its underlying representation
  r.mulMont_CIOS_sparebit_adx_gen(a, b, M, m0ninv, skipReduction)

# Montgomery Squaring
# ------------------------------------------------------------

func square_asm_adx_bmi2_inline[rLen, aLen: static int](r: var Limbs[rLen], a: Limbs[aLen]) {.inline.} =
  ## Multi-precision Squaring
  ## Extra indirection as the generator assumes that
  ## arrays are pointers, which is true for parameters
  ## but not for stack variables.
  sqrx_gen(r, a)

func montRed_asm_adx_bmi2_inline[N: static int](
       r: var array[N, SecretWord],
       a: array[N*2, SecretWord],
       M: array[N, SecretWord],
       m0ninv: BaseType,
       hasSpareBit: static bool
     ) {.inline.} =
  ## Constant-time Montgomery reduction
  ## Extra indirection as the generator assumes that
  ## arrays are pointers, which is true for parameters
  ## but not for stack variables.
  montyRedc2x_adx_gen(r, a, M, m0ninv, hasSpareBit)

func montSquare_CIOS_asm_adx_bmi2*[N](
func squareMont_CIOS_asm_adx*[N](
       r: var Limbs[N],
       a, M: Limbs[N],
       m0ninv: BaseType,
       hasSpareBit: static bool) =
       hasSpareBit, skipReduction: static bool) =
  ## Constant-time modular squaring
  var r2x {.noInit.}: Limbs[2*N]
  r2x.square_asm_adx_bmi2_inline(a)
  r.montRed_asm_adx_bmi2_inline(r2x, M, m0ninv, hasSpareBit)
  r2x.square_asm_adx_inline(a)
  r.redcMont_asm_adx(r2x, M, m0ninv, hasSpareBit, skipReduction)
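Two practical notes on the sparebit / skipReduction parameters that both multiplication kernels now take. First, the spare-bit precondition (M[^1] < high(SecretWord) shr 2) holds for the usual pairing curves: BN254 uses 254 bits of a 4×64-bit representation and BLS12-381 uses 381 bits of 6×64, so both leave at least two unused top bits. Second, with skipReduction the kernel stops after the carry propagation, so the output is only guaranteed to lie in [0, 2M) rather than [0, M); as the commit message notes, safely chaining such lazily-reduced values still needs the bounds analysis of https://eprint.iacr.org/2017/1057.pdf, which is why callers do not exploit the option yet.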
@@ -111,7 +111,7 @@ proc mulaccx_by_word(
    ctx.adcx hi, rdx
    ctx.adox hi, rdx

macro mulx_gen[rLen, aLen, bLen: static int](rx: var Limbs[rLen], ax: Limbs[aLen], bx: Limbs[bLen]) =
macro mulx_gen[rLen, aLen, bLen: static int](r_PIR: var Limbs[rLen], a_PIR: Limbs[aLen], b_PIR: Limbs[bLen]) =
  ## `a`, `b`, `r` can have a different number of limbs
  ## if `r`.limbs.len < a.limbs.len + b.limbs.len
  ## The result will be truncated, i.e. it will be

@@ -123,9 +123,9 @@ macro mulx_gen[rLen, aLen, bLen: static int](rx: var Limbs[rLen], ax: Limbs[aLen

  var ctx = init(Assembler_x86, BaseType)
  let
    r = init(OperandArray, nimSymbol = rx, rLen, PointerInReg, InputOutput_EnsureClobber)
    a = init(OperandArray, nimSymbol = ax, aLen, PointerInReg, Input)
    b = init(OperandArray, nimSymbol = bx, bLen, PointerInReg, Input)
    r = init(OperandArray, nimSymbol = r_PIR, rLen, PointerInReg, InputOutput_EnsureClobber)
    a = init(OperandArray, nimSymbol = a_PIR, aLen, PointerInReg, Input)
    b = init(OperandArray, nimSymbol = b_PIR, bLen, PointerInReg, Input)

    # MULX requires RDX

@@ -168,24 +168,24 @@ macro mulx_gen[rLen, aLen, bLen: static int](rx: var Limbs[rLen], ax: Limbs[aLen
  # Codegen
  result.add ctx.generate

func mul_asm_adx_bmi2_impl*[rLen, aLen, bLen: static int](
func mul_asm_adx_inline*[rLen, aLen, bLen: static int](
       r: var Limbs[rLen], a: Limbs[aLen], b: Limbs[bLen]) {.inline.} =
  ## Multi-precision Multiplication
  ## Assumes r doesn't alias a or b
  ## Inline version
  mulx_gen(r, a, b)

func mul_asm_adx_bmi2*[rLen, aLen, bLen: static int](
func mul_asm_adx*[rLen, aLen, bLen: static int](
       r: var Limbs[rLen], a: Limbs[aLen], b: Limbs[bLen]) =
  ## Multi-precision Multiplication
  ## Assumes r doesn't alias a or b
  mul_asm_adx_bmi2_impl(r, a, b)
  mul_asm_adx_inline(r, a, b)

# Squaring
# -----------------------------------------------------------------------------------------------
#
# Strategy:
# We want to use the same scheduling as mul_asm_adx_bmi2
# We want to use the same scheduling as mul_asm_adx
# and so process `a[0..<N]` by `word`
# and store the intermediate result in `[t[n..1]:r0]`
#

@@ -561,7 +561,7 @@ func sqrx_gen6L(ctx: var Assembler_x86, r, a: OperandArray, t: var OperandArray)
  merge_diag_and_partsum(r, a, hi1, lo1, zero, 4)
  merge_diag_and_partsum(r, a, hi2, lo2, zero, 5)

macro sqrx_gen*[rLen, aLen: static int](rx: var Limbs[rLen], ax: Limbs[aLen]) =
macro sqrx_gen*[rLen, aLen: static int](r_PIR: var Limbs[rLen], a_PIR: Limbs[aLen]) =
  ## Squaring
  ## `a` and `r` can have a different number of limbs
  ## if `r`.limbs.len < a.limbs.len * 2

@@ -578,8 +578,8 @@ macro sqrx_gen*[rLen, aLen: static int](rx: var Limbs[rLen], ax: Limbs[aLen]) =
    #   t = 2 * a.len = 12
    # We use the full x86 register set.

    r = init(OperandArray, nimSymbol = rx, rLen, PointerInReg, InputOutput)
    a = init(OperandArray, nimSymbol = ax, aLen, PointerInReg, Input)
    r = init(OperandArray, nimSymbol = r_PIR, rLen, PointerInReg, InputOutput)
    a = init(OperandArray, nimSymbol = a_PIR, aLen, PointerInReg, Input)

    # MULX requires RDX

@@ -604,10 +604,15 @@ macro sqrx_gen*[rLen, aLen: static int](rx: var Limbs[rLen], ax: Limbs[aLen]) =
  # Codegen
  result.add ctx.generate

func square_asm_adx_bmi2*[rLen, aLen: static int](r: var Limbs[rLen], a: Limbs[aLen]) =
func square_asm_adx_inline*[rLen, aLen: static int](r: var Limbs[rLen], a: Limbs[aLen]) {.inline.} =
  ## Multi-precision Squaring
  ## inline version
  sqrx_gen(r, a)

func square_asm_adx*[rLen, aLen: static int](r: var Limbs[rLen], a: Limbs[aLen]) =
  ## Multi-precision Squaring
  ## Assumes r doesn't alias a
  sqrx_gen(r, a)
  square_asm_adx_inline(r, a)


# Sanity checks

@@ -628,8 +633,8 @@ when isMainModule:

    var a2x, expected: Limbs[8]

    a2x.square_asm_adx_bmi2(a)
    expected.mul_asm_adx_bmi2(a, a)
    a2x.square_asm_adx(a)
    expected.mul_asm_adx(a, a)
    debugecho "--------------------------------"
    debugecho "before:"
    debugecho "  a : ", a.toString()

@@ -644,8 +649,8 @@ when isMainModule:

    var a2x, expected: Limbs[8]

    a2x.square_asm_adx_bmi2(a)
    expected.mul_asm_adx_bmi2(a, a)
    a2x.square_asm_adx(a)
    expected.mul_asm_adx(a, a)
    debugecho "--------------------------------"
    debugecho "before:"
    debugecho "  a : ", a.toString()

@@ -667,8 +672,8 @@ when isMainModule:

    var a2x, expected: Limbs[12]

    a2x.square_asm_adx_bmi2(a)
    expected.mul_asm_adx_bmi2(a, a)
    a2x.square_asm_adx(a)
    expected.mul_asm_adx(a, a)
    debugecho "--------------------------------"
    debugecho "before:"
    debugecho "  a : ", a.toString()

@@ -683,8 +688,8 @@ when isMainModule:

    var a2x, expected: Limbs[12]

    a2x.square_asm_adx_bmi2(a)
    expected.mul_asm_adx_bmi2(a, a)
    a2x.square_asm_adx(a)
    expected.mul_asm_adx(a, a)
    debugecho "--------------------------------"
    debugecho "before:"
    debugecho "  a : ", a.toString()
@@ -11,7 +11,8 @@ import
  std/macros,
  # Internal
  ../../config/common,
  ../../primitives
  ../../primitives,
  ./limbs_asm_modular_x86

# ############################################################
#

@@ -25,76 +26,21 @@ static: doAssert UseASM_X86_32
# Necessary for the compiler to find enough registers (enabled at -O1)
{.localPassC:"-fomit-frame-pointer".}

proc finalSubNoCarry*(
       ctx: var Assembler_x86,
       r: Operand or OperandArray,
       a, M, scratch: OperandArray
     ) =
  ## Reduce `a` into `r` modulo `M`
  let N = M.len
  ctx.comment "Final substraction (no carry)"
  for i in 0 ..< N:
    ctx.mov scratch[i], a[i]
    if i == 0:
      ctx.sub scratch[i], M[i]
    else:
      ctx.sbb scratch[i], M[i]

  # If we borrowed it means that we were smaller than
  # the modulus and we don't need "scratch"
  for i in 0 ..< N:
    ctx.cmovnc a[i], scratch[i]
    ctx.mov r[i], a[i]

proc finalSubCanOverflow*(
       ctx: var Assembler_x86,
       r: Operand or OperandArray,
       a, M, scratch: OperandArray,
       overflowReg: Operand or Register
     ) =
  ## Reduce `a` into `r` modulo `M`
  ## To be used when the final substraction can
  ## also depend on the carry flag
  ## This is in particular possible when the MSB
  ## is set for the prime modulus
  ## `overflowReg` should be a register that will be used
  ## to store the carry flag

  ctx.comment "Final substraction (may carry)"

  # Mask: overflowed contains 0xFFFF or 0x0000
  ctx.sbb overflowReg, overflowReg

  # Now substract the modulus to test a < p
  let N = M.len
  for i in 0 ..< N:
    ctx.mov scratch[i], a[i]
    if i == 0:
      ctx.sub scratch[i], M[i]
    else:
      ctx.sbb scratch[i], M[i]

  # If it overflows here, it means that it was
  # smaller than the modulus and we don't need `scratch`
  ctx.sbb overflowReg, 0

  # If we borrowed it means that we were smaller than
  # the modulus and we don't need "scratch"
  for i in 0 ..< N:
    ctx.cmovnc a[i], scratch[i]
    ctx.mov r[i], a[i]


# Montgomery reduction
# ------------------------------------------------------------

macro montyRedc2x_gen*[N: static int](
       r_MR: var array[N, SecretWord],
       a_MR: array[N*2, SecretWord],
       M_MR: array[N, SecretWord],
       m0ninv_MR: BaseType,
       hasSpareBit: static bool
macro redc2xMont_gen*[N: static int](
       r_PIR: var array[N, SecretWord],
       a_PIR: array[N*2, SecretWord],
       M_PIR: array[N, SecretWord],
       m0ninv_REG: BaseType,
       hasSpareBit, skipReduction: static bool
     ) =

  # No register spilling handling
  doAssert N > 2, "The Assembly-optimized montgomery reduction requires a minimum of 2 limbs."
  doAssert N <= 6, "The Assembly-optimized montgomery reduction requires at most 6 limbs."

  result = newStmtList()

  var ctx = init(Assembler_x86, BaseType)

@@ -103,7 +49,7 @@ macro montyRedc2x_gen*[N: static int](
  # so we store everything in scratchspaces restoring as needed
  let
    # We could force M as immediate by specializing per moduli
    M = init(OperandArray, nimSymbol = M_MR, N, PointerInReg, Input)
    M = init(OperandArray, nimSymbol = M_PIR, N, PointerInReg, Input)
    # MUL requires RAX and RDX

  let uSlots = N+2

@@ -119,9 +65,9 @@ macro montyRedc2x_gen*[N: static int](
  result.add quote do:
    var `usym`{.noinit.}: Limbs[`uSlots`]
    var `vsym` {.noInit.}: Limbs[`vSlots`]
    `vsym`[0] = cast[SecretWord](`r_MR`[0].unsafeAddr)
    `vsym`[1] = cast[SecretWord](`a_MR`[0].unsafeAddr)
    `vsym`[2] = SecretWord(`m0ninv_MR`)
    `vsym`[0] = cast[SecretWord](`r_PIR`[0].unsafeAddr)
    `vsym`[1] = cast[SecretWord](`a_PIR`[0].unsafeAddr)
    `vsym`[2] = SecretWord(`m0ninv_REG`)

  let r_temp = v[0].asArrayAddr(len = N)
  let a = v[1].asArrayAddr(len = 2*N)

@@ -131,7 +77,7 @@ macro montyRedc2x_gen*[N: static int](
  # ---------------------------------------------------------
  # for i in 0 .. n-1:
  #   hi <- 0
  #   m <- a[i] * m0ninv mod 2^w (i.e. simple multiplication)
  #   m <- a[i] * m0ninv mod 2ʷ (i.e. simple multiplication)
  #   for j in 0 .. n-1:
  #     (hi, lo) <- a[i+j] + m * M[j] + hi
  #     a[i+j] <- lo

@@ -141,15 +87,11 @@ macro montyRedc2x_gen*[N: static int](
  # if r >= M:
  #   r -= M

  # No register spilling handling
  doAssert N > 2, "The Assembly-optimized montgomery reduction requires a minimum of 2 limbs."
  doAssert N <= 6, "The Assembly-optimized montgomery reduction requires at most 6 limbs."

  for i in 0 ..< N:
    ctx.mov u[i], a[i]

  ctx.mov u[N], u[0]
  ctx.imul u[0], m0ninv # m <- a[i] * m0ninv mod 2^w
  ctx.imul u[0], m0ninv # m <- a[i] * m0ninv mod 2ʷ
  ctx.mov rax, u[0]

  # scratch: [a[0] * m0, a[1], a[2], a[3], a[0]] for 4 limbs
@@ -208,27 +150,138 @@ macro montyRedc2x_gen*[N: static int](
    else:
      ctx.adc u[i], a[i+N]

  # v is invalidated from now on
  let t = repackRegisters(v, u[N], u[N+1])

  # v is invalidated
  if hasSpareBit:
    ctx.finalSubNoCarry(r, u, M, t)
  if hasSpareBit and skipReduction:
    for i in 0 ..< N:
      ctx.mov r[i], t[i]
  elif hasSpareBit:
    ctx.finalSubNoCarryImpl(r, u, M, t)
  else:
    ctx.finalSubCanOverflow(r, u, M, t, rax)
    ctx.finalSubMayCarryImpl(r, u, M, t, rax)

  # Code generation
  result.add ctx.generate()

func montRed_asm*[N: static int](
func redcMont_asm_inline*[N: static int](
       r: var array[N, SecretWord],
       a: array[N*2, SecretWord],
       M: array[N, SecretWord],
       m0ninv: BaseType,
       hasSpareBit: static bool
       hasSpareBit: static bool,
       skipReduction: static bool = false
     ) {.inline.} =
  ## Constant-time Montgomery reduction
  ## Inline-version
  redc2xMont_gen(r, a, M, m0ninv, hasSpareBit, skipReduction)

func redcMont_asm*[N: static int](
       r: var array[N, SecretWord],
       a: array[N*2, SecretWord],
       M: array[N, SecretWord],
       m0ninv: BaseType,
       hasSpareBit, skipReduction: static bool
     ) =
  ## Constant-time Montgomery reduction
  static: doAssert UseASM_X86_64, "This requires x86-64."
  montyRedc2x_gen(r, a, M, m0ninv, hasSpareBit)
  redcMont_asm_inline(r, a, M, m0ninv, hasSpareBit, skipReduction)

# Montgomery conversion
# ----------------------------------------------------------

macro mulMont_by_1_gen[N: static int](
       t_EIR: var array[N, SecretWord],
       M_PIR: array[N, SecretWord],
       m0ninv_REG: BaseType) =

  # No register spilling handling
  doAssert N <= 6, "The Assembly-optimized montgomery reduction requires at most 6 limbs."

  result = newStmtList()

  var ctx = init(Assembler_x86, BaseType)
  # On x86, compilers only let us use 15 out of 16 registers
  # RAX and RDX are defacto used due to the MUL instructions
  # so we store everything in scratchspaces restoring as needed
  let
    scratchSlots = 2

    t = init(OperandArray, nimSymbol = t_EIR, N, ElemsInReg, InputOutput_EnsureClobber)
    # We could force M as immediate by specializing per moduli
    M = init(OperandArray, nimSymbol = M_PIR, N, PointerInReg, Input)
    # MultiPurpose Register slots
    scratch = init(OperandArray, nimSymbol = ident"scratch", scratchSlots, ElemsInReg, InputOutput_EnsureClobber)

    # MUL requires RAX and RDX

    m0ninv = Operand(
      desc: OperandDesc(
        asmId: "[m0ninv]",
        nimSymbol: m0ninv_REG,
        rm: MemOffsettable,
        constraint: Input,
        cEmit: "&" & $m0ninv_REG
      )
    )

    C = scratch[0] # Stores the high-part of muliplication
    m = scratch[1] # Stores (t[0] * m0ninv) mod 2ʷ

  let scratchSym = scratch.nimSymbol

  # Copy a in t
  result.add quote do:
    var `scratchSym` {.noInit.}: Limbs[`scratchSlots`]

  # Algorithm
  # ---------------------------------------------------------
  # for i in 0 .. n-1:
  #   m <- t[0] * m0ninv mod 2ʷ (i.e. simple multiplication)
  #   C, _ = t[0] + m * M[0]
  #   for j in 1 .. n-1:
  #     (C, t[j-1]) <- r[j] + m*M[j] + C
  #   t[n-1] = C

  ctx.comment "for i in 0 ..< N:"
  for i in 0 ..< N:
    ctx.comment "  m <- t[0] * m0ninv mod 2ʷ"
    ctx.mov m, m0ninv
    ctx.imul m, t[0]

    ctx.comment "  C, _ = t[0] + m * M[0]"
    ctx.`xor` C, C
    ctx.mov rax, M[0]
    ctx.mul rdx, rax, m, rax
    ctx.add rax, t[0]
    ctx.adc C, rdx

    ctx.comment "  for j in 1 .. n-1:"
    for j in 1 ..< N:
      ctx.comment "    (C, t[j-1]) <- r[j] + m*M[j] + C"
      ctx.mov rax, M[j]
      ctx.mul rdx, rax, m, rax
      ctx.add C, t[j]
      ctx.adc rdx, 0
      ctx.add C, rax
      ctx.adc rdx, 0
      ctx.mov t[j-1], C
      ctx.mov C, rdx

    ctx.comment "  final carry"
    ctx.mov t[N-1], C

  result.add ctx.generate()

func fromMont_asm*(r: var Limbs, a, M: Limbs, m0ninv: BaseType) =
  ## Constant-time Montgomery residue form to BigInt conversion
  var t{.noInit.} = a
  block:
    t.mulMont_by_1_gen(M, m0ninv)

  block: # Map from [0, 2p) to [0, p)
    var workspace{.noInit.}: typeof(r)
    r.finalSub_gen(t, M, workspace, mayCarry = false)

# Sanity checks
# ----------------------------------------------------------
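To connect the redc2xMont kernels above with their algorithm comment, here is a plain-Nim reference of the word-by-word Montgomery reduction. As in the CIOS sketch earlier it uses 32-bit digits held in uint64 so every product fits in a native word, and it assumes a spare bit in the modulus so the pre-reduction result fits in N digits; the function name and digit size are illustrative only, not the library's.

const W = 32                              # same digit convention as the CIOS sketch
const DigitMask = (1'u64 shl W) - 1

# Reference REDC: given a < M*M spread over 2N digits and m0ninv = -M[0]^-1 mod 2^W,
# returns a * 2^(-W*N) mod M, possibly still in [M, 2M) before the final subtraction.
func redcSketch[N: static int](
       a: array[2*N, uint64], M: array[N, uint64], m0ninv: uint64): array[N, uint64] =
  var t = a
  for i in 0 ..< N:
    let m = (t[i] * m0ninv) and DigitMask # m <- a[i] * m0ninv mod 2^W
    var hi = 0'u64
    for j in 0 ..< N:
      let acc = t[i+j] + m * M[j] + hi    # fits in uint64 because digits are 32 bits
      t[i+j] = acc and DigitMask
      hi = acc shr W
    # a[i+n] += hi, propagated into the upper half
    var k = i + N
    while hi != 0 and k < 2*N:
      let acc = t[k] + hi
      t[k] = acc and DigitMask
      hi = acc shr W
      inc k
  for i in 0 ..< N:
    result[i] = t[i+N]                    # r[i] = a[i+n]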
@@ -242,7 +295,7 @@ when isMainModule:

  # TODO: Properly handle low number of limbs

  func montyRedc2x_Comba[N: static int](
  func redc2xMont_Comba[N: static int](
       r: var array[N, SecretWord],
       a: array[N*2, SecretWord],
       M: array[N, SecretWord],

@@ -298,10 +351,10 @@ when isMainModule:
    var a_sqr{.noInit.}, na_sqr{.noInit.}: Limbs[2]
    var a_sqr_comba{.noInit.}, na_sqr_comba{.noInit.}: Limbs[2]

    a_sqr.montRed_asm(adbl_sqr, M, 1, hasSpareBit = false)
    na_sqr.montRed_asm(nadbl_sqr, M, 1, hasSpareBit = false)
    a_sqr_comba.montyRedc2x_Comba(adbl_sqr, M, 1)
    na_sqr_comba.montyRedc2x_Comba(nadbl_sqr, M, 1)
    a_sqr.redcMont_asm(adbl_sqr, M, 1, hasSpareBit = false, skipReduction = false)
    na_sqr.redcMont_asm(nadbl_sqr, M, 1, hasSpareBit = false, skipReduction = false)
    a_sqr_comba.redc2xMont_Comba(adbl_sqr, M, 1)
    na_sqr_comba.redc2xMont_Comba(nadbl_sqr, M, 1)

    debugecho "--------------------------------"
    debugecho "after:"
@@ -0,0 +1,269 @@
# Constantine
# Copyright (c) 2018-2019 Status Research & Development GmbH
# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
# Licensed and distributed under either of
#   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
#   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
# at your option. This file may not be copied, modified, or distributed except according to those terms.

import
  # Standard library
  std/macros,
  # Internal
  ../../config/common,
  ../../primitives,
  ./limbs_asm_modular_x86

# ############################################################
#
#        Assembly implementation of finite fields
#
# ############################################################

static: doAssert UseASM_X86_64

# MULX/ADCX/ADOX
{.localPassC:"-madx -mbmi2".}
# Necessary for the compiler to find enough registers (enabled at -O1)
{.localPassC:"-fomit-frame-pointer".}

# No exceptions allowed
{.push raises: [].}

# Montgomery reduction
# ------------------------------------------------------------

macro redc2xMont_adx_gen[N: static int](
       r_PIR: var array[N, SecretWord],
       a_PIR: array[N*2, SecretWord],
       M_PIR: array[N, SecretWord],
       m0ninv_REG: BaseType,
       hasSpareBit, skipReduction: static bool
     ) =

  # No register spilling handling
  doAssert N <= 6, "The Assembly-optimized montgomery multiplication requires at most 6 limbs."

  result = newStmtList()

  var ctx = init(Assembler_x86, BaseType)
  let
    # We could force M as immediate by specializing per moduli
    M = init(OperandArray, nimSymbol = M_PIR, N, PointerInReg, Input)

  let uSlots = N+1
  let vSlots = max(N-1, 5)

  var # Scratchspaces
    u = init(OperandArray, nimSymbol = ident"U", uSlots, ElemsInReg, InputOutput_EnsureClobber)
    v = init(OperandArray, nimSymbol = ident"V", vSlots, ElemsInReg, InputOutput_EnsureClobber)

  # Prologue
  let usym = u.nimSymbol
  let vsym = v.nimSymbol
  result.add quote do:
    static: doAssert: sizeof(SecretWord) == sizeof(ByteAddress)
    var `usym`{.noinit.}: Limbs[`uSlots`]
    var `vsym` {.noInit.}: Limbs[`vSlots`]
    `vsym`[0] = cast[SecretWord](`r_PIR`[0].unsafeAddr)
    `vsym`[1] = cast[SecretWord](`a_PIR`[0].unsafeAddr)
    `vsym`[2] = SecretWord(`m0ninv_REG`)

  let r_temp = v[0].asArrayAddr(len = N)
  let a = v[1].asArrayAddr(len = 2*N)
  let m0ninv = v[2]
  let lo = v[3]
  let hi = v[4]

  # Algorithm
  # ---------------------------------------------------------
  # for i in 0 .. n-1:
  #   hi <- 0
  #   m <- a[i] * m0ninv mod 2ʷ (i.e. simple multiplication)
  #   for j in 0 .. n-1:
  #     (hi, lo) <- a[i+j] + m * M[j] + hi
  #     a[i+j] <- lo
  #   a[i+n] += hi
  # for i in 0 .. n-1:
  #   r[i] = a[i+n]
  # if r >= M:
  #   r -= M

  ctx.mov rdx, m0ninv

  for i in 0 ..< N:
    ctx.mov u[i], a[i]

  for i in 0 ..< N:
    # RDX contains m0ninv at the start of each loop
    ctx.comment ""
    ctx.imul rdx, u[0] # m <- a[i] * m0ninv mod 2ʷ
    ctx.comment "---- Reduction " & $i
    ctx.`xor` u[N], u[N]

    for j in 0 ..< N-1:
      ctx.comment ""
      ctx.mulx hi, lo, M[j], rdx
      ctx.adcx u[j], lo
      ctx.adox u[j+1], hi

    # Last limb
    ctx.comment ""
    ctx.mulx hi, lo, M[N-1], rdx
    ctx.mov rdx, m0ninv # Reload m0ninv for next iter
    ctx.adcx u[N-1], lo
    ctx.adox hi, u[N]
    ctx.adcx u[N], hi

    u.rotateLeft()

  ctx.mov rdx, r_temp
  let r = rdx.asArrayAddr(len = N)

  # This does a[i+n] += hi
  # but in a separate carry chain, fused with the
  # copy "r[i] = a[i+n]"
  for i in 0 ..< N:
    if i == 0:
      ctx.add u[i], a[i+N]
    else:
      ctx.adc u[i], a[i+N]

  let t = repackRegisters(v, u[N])

  if hasSpareBit and skipReduction:
    for i in 0 ..< N:
      ctx.mov r[i], t[i]
  elif hasSpareBit:
    ctx.finalSubNoCarryImpl(r, u, M, t)
  else:
    ctx.finalSubMayCarryImpl(r, u, M, t, hi)

  # Code generation
  result.add ctx.generate()

func redcMont_asm_adx_inline*[N: static int](
       r: var array[N, SecretWord],
       a: array[N*2, SecretWord],
       M: array[N, SecretWord],
       m0ninv: BaseType,
       hasSpareBit: static bool,
       skipReduction: static bool = false
     ) {.inline.} =
  ## Constant-time Montgomery reduction
  ## Inline-version
  redc2xMont_adx_gen(r, a, M, m0ninv, hasSpareBit, skipReduction)

func redcMont_asm_adx*[N: static int](
       r: var array[N, SecretWord],
       a: array[N*2, SecretWord],
       M: array[N, SecretWord],
       m0ninv: BaseType,
       hasSpareBit: static bool,
       skipReduction: static bool = false
     ) =
  ## Constant-time Montgomery reduction
  redcMont_asm_adx_inline(r, a, M, m0ninv, hasSpareBit, skipReduction)


# Montgomery conversion
# ----------------------------------------------------------

macro mulMont_by_1_adx_gen[N: static int](
       t_EIR: var array[N, SecretWord],
       M_PIR: array[N, SecretWord],
       m0ninv_REG: BaseType) =

  # No register spilling handling
  doAssert N <= 6, "The Assembly-optimized montgomery reduction requires at most 6 limbs."

  result = newStmtList()

  var ctx = init(Assembler_x86, BaseType)
  # On x86, compilers only let us use 15 out of 16 registers
  # RAX and RDX are defacto used due to the MUL instructions
  # so we store everything in scratchspaces restoring as needed
  let
    scratchSlots = 1

    t = init(OperandArray, nimSymbol = t_EIR, N, ElemsInReg, InputOutput_EnsureClobber)
    # We could force M as immediate by specializing per moduli
    M = init(OperandArray, nimSymbol = M_PIR, N, PointerInReg, Input)
    # MultiPurpose Register slots
    scratch = init(OperandArray, nimSymbol = ident"scratch", scratchSlots, ElemsInReg, InputOutput_EnsureClobber)

    # MUL requires RAX and RDX

    m0ninv = Operand(
      desc: OperandDesc(
        asmId: "[m0ninv]",
        nimSymbol: m0ninv_REG,
        rm: MemOffsettable,
        constraint: Input,
        cEmit: "&" & $m0ninv_REG
      )
    )

    C = scratch[0] # Stores the high-part of muliplication

  let scratchSym = scratch.nimSymbol

  # Copy a in t
  result.add quote do:
    var `scratchSym` {.noInit.}: Limbs[`scratchSlots`]

  # Algorithm
  # ---------------------------------------------------------
  # for i in 0 .. n-1:
  #   m <- t[0] * m0ninv mod 2ʷ (i.e. simple multiplication)
  #   C, _ = t[0] + m * M[0]
  #   for j in 1 .. n-1:
  #     (C, t[j-1]) <- r[j] + m*M[j] + C
  #   t[n-1] = C

  # Low-level optimizations
  # https://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-optimization-manual.pdf
  # Section 3.5.1.8 xor'ing a reg with itself is free (except for instruction code size)

  ctx.comment "for i in 0 ..< N:"
  for i in 0 ..< N:
    ctx.comment "  m <- t[0] * m0ninv mod 2ʷ"
    ctx.mov rdx, m0ninv
    ctx.imul rdx, t[0]

    # Create 2 parallel carry-chains for adcx and adox
    # We need to clear the carry/overflow flags first for ADCX/ADOX
    # with the smallest instruction if possible (xor rax, rax)
    # to reduce instruction-cache miss
    ctx.comment "  C, _ = t[0] + m * M[0]"
    ctx.`xor` rax, rax
    ctx.mulx C, rax, M[0], rdx
    ctx.adcx rax, t[0] # Set C', the carry flag for future adcx, but don't accumulate in C yet
    ctx.mov t[0], C

    # for j=1 to N-1
    #   (S,t[j-1]) := t[j] + m*M[j] + S
    ctx.comment "  for j=1 to N-1"
    ctx.comment "    (C,t[j-1]) := t[j] + m*M[j] + C"
    for j in 1 ..< N:
      ctx.adcx t[j-1], t[j]
      ctx.mulx t[j], C, M[j], rdx
      ctx.adox t[j-1], C

    ctx.comment "  final carries"
    ctx.mov rax, 0
    ctx.adcx t[N-1], rax
    ctx.adox t[N-1], rax

  result.add ctx.generate()

func fromMont_asm_adx*(r: var Limbs, a, M: Limbs, m0ninv: BaseType) =
  ## Constant-time Montgomery residue form to BigInt conversion
  ## Requires ADX and BMI2 instruction set
  var t{.noInit.} = a
  block:
    t.mulMont_by_1_adx_gen(M, m0ninv)

  block: # Map from [0, 2p) to [0, p)
    var workspace{.noInit.}: typeof(r)
    r.finalSub_gen(t, M, workspace, mayCarry = false)
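For intuition about the mulMont_by_1 / fromMont path, a toy example with a single 4-bit digit (so R = 16) and M = 13: the precomputed constant is m0ninv = -13⁻¹ mod 16 = 11, and the Montgomery form of a = 7 is a·R mod M = 112 mod 13 = 8. One reduction step computes m = 8·11 mod 16 = 8, then (8 + 8·13) / 16 = 112 / 16 = 7, recovering a; the value is already below M, so the trailing finalSub_gen(..., mayCarry = false) leaves it unchanged. (Toy numbers for illustration only; the real code works on 64-bit limbs.)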
@@ -298,7 +298,7 @@ func prod_high_words*[rBits, aBits, bBits](r: var BigInt[rBits], a: BigInt[aBits
  # - Barret reduction
  # - Approximating multiplication by a fractional constant in the form f(a) = K/C * a
  #   with K and C known at compile-time.
  #   We can instead find a well chosen M = (2^WordBitWidth)^w, with M > C (i.e. M is a power of 2 bigger than C)
  #   We can instead find a well chosen M = (2^WordBitWidth)ʷ, with M > C (i.e. M is a power of 2 bigger than C)
  #   Precompute P = K*M/C at compile-time
  #   and at runtime do P*a/M <=> P*a >> WordBitWidth*w
  #   i.e. prod_high_words(result, P, a, w)
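A worked instance of the prod_high_words trick described above: to approximate multiplication by 1/3 on one 64-bit word (K = 1, C = 3, w = 1, so M = 2⁶⁴), precompute P = (2⁶⁴ − 1)/3 = 0x5555555555555555. For a = 10 the full product P·a sits just below (10/3)·2⁶⁴, so its high word, which is what prod_high_words keeps, is 3 = ⌊10/3⌋. Because P is truncated, the estimate can come out one too small for some inputs (exact multiples of C are the edge case), so callers must tolerate or correct a small under-approximation.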
@@ -24,9 +24,9 @@ import
#
# ############################################################

func montyResidue*(mres: var BigInt, a, N, r2modM: BigInt, m0ninv: static BaseType, spareBits: static int) =
func getMont*(mres: var BigInt, a, N, r2modM: BigInt, m0ninv: static BaseType, spareBits: static int) =
  ## Convert a BigInt from its natural representation
  ## to the Montgomery n-residue form
  ## to the Montgomery residue form
  ##
  ## `mres` is overwritten. It's bitlength must be properly set before calling this procedure.
  ##

@@ -39,37 +39,35 @@ func montyResidue*(mres: var BigInt, a, N, r2modM: BigInt, m0ninv: static BaseTy
  ## - `r2modM` is R² (mod M)
  ## with W = M.len
  ## and R = (2^WordBitWidth)^W
  montyResidue(mres.limbs, a.limbs, N.limbs, r2modM.limbs, m0ninv, spareBits)
  getMont(mres.limbs, a.limbs, N.limbs, r2modM.limbs, m0ninv, spareBits)

func redc*[mBits](r: var BigInt[mBits], a, M: BigInt[mBits], m0ninv: static BaseType, spareBits: static int) =
  ## Convert a BigInt from its Montgomery n-residue form
func fromMont*[mBits](r: var BigInt[mBits], a, M: BigInt[mBits], m0ninv: static BaseType, spareBits: static int) =
  ## Convert a BigInt from its Montgomery residue form
  ## to the natural representation
  ##
  ## `mres` is modified in-place
  ##
  ## Caller must take care of properly switching between
  ## the natural and montgomery domain.
  let one = block:
    var one {.noInit.}: BigInt[mBits]
    one.setOne()
    one
  redc(r.limbs, a.limbs, one.limbs, M.limbs, m0ninv, spareBits)
  fromMont(r.limbs, a.limbs, M.limbs, m0ninv, spareBits)

func montyMul*(r: var BigInt, a, b, M: BigInt, negInvModWord: static BaseType, spareBits: static int) =
func mulMont*(r: var BigInt, a, b, M: BigInt, negInvModWord: static BaseType,
              spareBits: static int, skipReduction: static bool = false) =
  ## Compute r <- a*b (mod M) in the Montgomery domain
  ##
  ## This resets r to zero before processing. Use {.noInit.}
  ## to avoid duplicating with Nim zero-init policy
  montyMul(r.limbs, a.limbs, b.limbs, M.limbs, negInvModWord, spareBits)
  mulMont(r.limbs, a.limbs, b.limbs, M.limbs, negInvModWord, spareBits, skipReduction)

func montySquare*(r: var BigInt, a, M: BigInt, negInvModWord: static BaseType, spareBits: static int) =
func squareMont*(r: var BigInt, a, M: BigInt, negInvModWord: static BaseType,
                 spareBits: static int, skipReduction: static bool = false) =
  ## Compute r <- a^2 (mod M) in the Montgomery domain
  ##
  ## This resets r to zero before processing. Use {.noInit.}
  ## to avoid duplicating with Nim zero-init policy
  montySquare(r.limbs, a.limbs, M.limbs, negInvModWord, spareBits)
  squareMont(r.limbs, a.limbs, M.limbs, negInvModWord, spareBits, skipReduction)

func montyPow*[mBits: static int](
func powMont*[mBits: static int](
       a: var BigInt[mBits], exponent: openarray[byte],
       M, one: BigInt[mBits], negInvModWord: static BaseType, windowSize: static int,
       spareBits: static int
@ -91,9 +89,9 @@ func montyPow*[mBits: static int](
|
|||
const scratchLen = if windowSize == 1: 2
|
||||
else: (1 shl windowSize) + 1
|
||||
var scratchSpace {.noInit.}: array[scratchLen, Limbs[mBits.wordsRequired]]
|
||||
montyPow(a.limbs, exponent, M.limbs, one.limbs, negInvModWord, scratchSpace, spareBits)
|
||||
powMont(a.limbs, exponent, M.limbs, one.limbs, negInvModWord, scratchSpace, spareBits)
|
||||
|
||||
func montyPowUnsafeExponent*[mBits: static int](
|
||||
func powMontUnsafeExponent*[mBits: static int](
|
||||
a: var BigInt[mBits], exponent: openarray[byte],
|
||||
M, one: BigInt[mBits], negInvModWord: static BaseType, windowSize: static int,
|
||||
spareBits: static int
|
||||
|
@ -115,12 +113,12 @@ func montyPowUnsafeExponent*[mBits: static int](
|
|||
const scratchLen = if windowSize == 1: 2
|
||||
else: (1 shl windowSize) + 1
|
||||
var scratchSpace {.noInit.}: array[scratchLen, Limbs[mBits.wordsRequired]]
|
||||
montyPowUnsafeExponent(a.limbs, exponent, M.limbs, one.limbs, negInvModWord, scratchSpace, spareBits)
|
||||
powMontUnsafeExponent(a.limbs, exponent, M.limbs, one.limbs, negInvModWord, scratchSpace, spareBits)
|
||||
|
||||
from ../io/io_bigints import exportRawUint
|
||||
# Workaround recursive dependencies
|
||||
|
||||
func montyPow*[mBits, eBits: static int](
|
||||
func powMont*[mBits, eBits: static int](
|
||||
a: var BigInt[mBits], exponent: BigInt[eBits],
|
||||
M, one: BigInt[mBits], negInvModWord: static BaseType, windowSize: static int,
|
||||
spareBits: static int
|
||||
|
@ -137,9 +135,9 @@ func montyPow*[mBits, eBits: static int](
|
|||
var expBE {.noInit.}: array[(ebits + 7) div 8, byte]
|
||||
expBE.exportRawUint(exponent, bigEndian)
|
||||
|
||||
montyPow(a, expBE, M, one, negInvModWord, windowSize, spareBits)
|
||||
powMont(a, expBE, M, one, negInvModWord, windowSize, spareBits)
|
||||
|
||||
func montyPowUnsafeExponent*[mBits, eBits: static int](
|
||||
func powMontUnsafeExponent*[mBits, eBits: static int](
|
||||
a: var BigInt[mBits], exponent: BigInt[eBits],
|
||||
M, one: BigInt[mBits], negInvModWord: static BaseType, windowSize: static int,
|
||||
spareBits: static int
|
||||
|
@ -160,7 +158,7 @@ func montyPowUnsafeExponent*[mBits, eBits: static int](
|
|||
var expBE {.noInit.}: array[(ebits + 7) div 8, byte]
|
||||
expBE.exportRawUint(exponent, bigEndian)
|
||||
|
||||
montyPowUnsafeExponent(a, expBE, M, one, negInvModWord, windowSize, spareBits)
|
||||
powMontUnsafeExponent(a, expBE, M, one, negInvModWord, windowSize, spareBits)
|
||||
|
||||
{.pop.} # inline
|
||||
{.pop.} # raises no exceptions
|
||||
|
|
|
@ -56,16 +56,20 @@ func fromBig*(dst: var FF, src: BigInt) =
|
|||
when nimvm:
|
||||
dst.mres.montyResidue_precompute(src, FF.fieldMod(), FF.getR2modP(), FF.getNegInvModWord())
|
||||
else:
|
||||
dst.mres.montyResidue(src, FF.fieldMod(), FF.getR2modP(), FF.getNegInvModWord(), FF.getSpareBits())
|
||||
dst.mres.getMont(src, FF.fieldMod(), FF.getR2modP(), FF.getNegInvModWord(), FF.getSpareBits())
|
||||
|
||||
func fromBig*[C: static Curve](T: type FF[C], src: BigInt): FF[C] {.noInit.} =
|
||||
## Convert a BigInt to its Montgomery form
|
||||
result.fromBig(src)
|
||||
|
||||
func fromField*(dst: var BigInt, src: FF) {.noInit, inline.} =
|
||||
## Convert a finite-field element to a BigInt in natural representation
|
||||
dst.fromMont(src.mres, FF.fieldMod(), FF.getNegInvModWord(), FF.getSpareBits())
|
||||
|
||||
func toBig*(src: FF): auto {.noInit, inline.} =
|
||||
## Convert a finite-field element to a BigInt in natural representation
|
||||
var r {.noInit.}: typeof(src.mres)
|
||||
r.redc(src.mres, FF.fieldMod(), FF.getNegInvModWord(), FF.getSpareBits())
|
||||
r.fromField(src)
|
||||
return r
|
||||
|
||||
# Copy
|
||||
|
@ -102,7 +106,7 @@ func cswap*(a, b: var FF, ctl: CTBool) {.meter.} =
|
|||
|
||||
# Note: for `+=`, double, sum
|
||||
# not(a.mres < FF.fieldMod()) is unnecessary if the prime has the form
|
||||
# (2^64)^w - 1 (if using uint64 words).
|
||||
# (2^64)ʷ - 1 (if using uint64 words).
|
||||
# In practice I'm not aware of such prime being using in elliptic curves.
|
||||
# 2^127 - 1 and 2^521 - 1 are used but 127 and 521 are not multiple of 32/64
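
The generic branch in the additions below boils down to "add, detect wrap-around, then conditionally subtract the modulus". A single-word Nim sketch of that pattern (a made-up toy, branchy rather than constant-time; with a spare bit in the modulus the sum can never wrap, so the carry check disappears and only the comparison remains):

proc addmodToy(a, b, p: uint64): uint64 =
  ## a + b (mod p) for a, b < p, with a final conditional subtraction.
  let s = a + b                        # may wrap if p uses the full word
  let overflowed = s < a               # carry out of the word
  if overflowed or s >= p:
    result = s - p                     # wraps back to the correct residue when `overflowed`
  else:
    result = s

when isMainModule:
  doAssert addmodToy(6, 5, 7) == 4
  doAssert addmodToy(0xFFFF_FFFF_FFFF_FFFE'u64, 3'u64, 0xFFFF_FFFF_FFFF_FFFF'u64) == 2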

@ -148,7 +152,7 @@ func setMinusOne*(a: var FF) =
func `+=`*(a: var FF, b: FF) {.meter.} =
## In-place addition modulo p
when UseASM_X86_64 and a.mres.limbs.len <= 6: # TODO: handle spilling
addmod_asm(a.mres.limbs, a.mres.limbs, b.mres.limbs, FF.fieldMod().limbs)
addmod_asm(a.mres.limbs, a.mres.limbs, b.mres.limbs, FF.fieldMod().limbs, FF.getSpareBits() >= 1)
else:
var overflowed = add(a.mres, b.mres)
overflowed = overflowed or not(a.mres < FF.fieldMod())

@ -165,7 +169,7 @@ func `-=`*(a: var FF, b: FF) {.meter.} =
func double*(a: var FF) {.meter.} =
## Double ``a`` modulo p
when UseASM_X86_64 and a.mres.limbs.len <= 6: # TODO: handle spilling
addmod_asm(a.mres.limbs, a.mres.limbs, a.mres.limbs, FF.fieldMod().limbs)
addmod_asm(a.mres.limbs, a.mres.limbs, a.mres.limbs, FF.fieldMod().limbs, FF.getSpareBits() >= 1)
else:
var overflowed = double(a.mres)
overflowed = overflowed or not(a.mres < FF.fieldMod())

@ -175,7 +179,7 @@ func sum*(r: var FF, a, b: FF) {.meter.} =
## Sum ``a`` and ``b`` into ``r`` modulo p
## r is initialized/overwritten
when UseASM_X86_64 and a.mres.limbs.len <= 6: # TODO: handle spilling
addmod_asm(r.mres.limbs, a.mres.limbs, b.mres.limbs, FF.fieldMod().limbs)
addmod_asm(r.mres.limbs, a.mres.limbs, b.mres.limbs, FF.fieldMod().limbs, FF.getSpareBits() >= 1)
else:
var overflowed = r.mres.sum(a.mres, b.mres)
overflowed = overflowed or not(r.mres < FF.fieldMod())

@ -204,20 +208,20 @@ func double*(r: var FF, a: FF) {.meter.} =
## Double ``a`` into ``r``
## `r` is initialized/overwritten
when UseASM_X86_64 and a.mres.limbs.len <= 6: # TODO: handle spilling
addmod_asm(r.mres.limbs, a.mres.limbs, a.mres.limbs, FF.fieldMod().limbs)
addmod_asm(r.mres.limbs, a.mres.limbs, a.mres.limbs, FF.fieldMod().limbs, FF.getSpareBits() >= 1)
else:
var overflowed = r.mres.double(a.mres)
overflowed = overflowed or not(r.mres < FF.fieldMod())
discard csub(r.mres, FF.fieldMod(), overflowed)

func prod*(r: var FF, a, b: FF) {.meter.} =
func prod*(r: var FF, a, b: FF, skipReduction: static bool = false) {.meter.} =
## Store the product of ``a`` by ``b`` modulo p into ``r``
## ``r`` is initialized / overwritten
r.mres.montyMul(a.mres, b.mres, FF.fieldMod(), FF.getNegInvModWord(), FF.getSpareBits())
r.mres.mulMont(a.mres, b.mres, FF.fieldMod(), FF.getNegInvModWord(), FF.getSpareBits(), skipReduction)

func square*(r: var FF, a: FF) {.meter.} =
func square*(r: var FF, a: FF, skipReduction: static bool = false) {.meter.} =
## Squaring modulo p
r.mres.montySquare(a.mres, FF.fieldMod(), FF.getNegInvModWord(), FF.getSpareBits())
r.mres.squareMont(a.mres, FF.fieldMod(), FF.getNegInvModWord(), FF.getSpareBits(), skipReduction)

func neg*(r: var FF, a: FF) {.meter.} =
## Negate modulo p

@ -261,20 +265,20 @@ func cneg*(r: var FF, a: FF, ctl: SecretBool) {.meter.} =
func cneg*(a: var FF, ctl: SecretBool) {.meter.} =
## Constant-time in-place conditional negation
## The negation is only performed if ctl is "true"
var t = a
var t {.noInit.} = a
a.cneg(t, ctl)

func cadd*(a: var FF, b: FF, ctl: SecretBool) {.meter.} =
## Constant-time out-place conditional addition
## The addition is only performed if ctl is "true"
var t = a
var t {.noInit.} = a
t += b
a.ccopy(t, ctl)

func csub*(a: var FF, b: FF, ctl: SecretBool) {.meter.} =
## Constant-time out-place conditional substraction
## The substraction is only performed if ctl is "true"
var t = a
var t {.noInit.} = a
t -= b
a.ccopy(t, ctl)

@ -340,7 +344,7 @@ func pow*(a: var FF, exponent: BigInt) =
## ``a``: a field element to be exponentiated
## ``exponent``: a big integer
const windowSize = 5 # TODO: find best window size for each curves
a.mres.montyPow(
a.mres.powMont(
exponent,
FF.fieldMod(), FF.getMontyOne(),
FF.getNegInvModWord(), windowSize,

@ -352,7 +356,7 @@ func pow*(a: var FF, exponent: openarray[byte]) =
## ``a``: a field element to be exponentiated
## ``exponent``: a big integer in canonical big endian representation
const windowSize = 5 # TODO: find best window size for each curves
a.mres.montyPow(
a.mres.powMont(
exponent,
FF.fieldMod(), FF.getMontyOne(),
FF.getNegInvModWord(), windowSize,

@ -371,7 +375,7 @@ func powUnsafeExponent*(a: var FF, exponent: BigInt) =
## - power analysis
## - timing analysis
const windowSize = 5 # TODO: find best window size for each curves
a.mres.montyPowUnsafeExponent(
a.mres.powMontUnsafeExponent(
exponent,
FF.fieldMod(), FF.getMontyOne(),
FF.getNegInvModWord(), windowSize,

@ -390,7 +394,7 @@ func powUnsafeExponent*(a: var FF, exponent: openarray[byte]) =
## - power analysis
## - timing analysis
const windowSize = 5 # TODO: find best window size for each curves
a.mres.montyPowUnsafeExponent(
a.mres.powMontUnsafeExponent(
exponent,
FF.fieldMod(), FF.getMontyOne(),
FF.getNegInvModWord(), windowSize,

@ -409,21 +413,39 @@ func `*=`*(a: var FF, b: FF) {.meter.} =
## Multiplication modulo p
a.prod(a, b)

func square*(a: var FF) {.meter.} =
func square*(a: var FF, skipReduction: static bool = false) {.meter.} =
## Squaring modulo p
a.mres.montySquare(a.mres, FF.fieldMod(), FF.getNegInvModWord(), FF.getSpareBits())
a.square(a, skipReduction)

func square_repeated*(r: var FF, num: int) {.meter.} =
func square_repeated*(a: var FF, num: int, skipReduction: static bool = false) {.meter.} =
## Repeated squarings
# Except in Tonelli-Shanks, num is always known at compile-time
# and square repeated is inlined, so the compiler should optimize the branches away.

# TODO: understand the conditions to avoid the final substraction
for _ in 0 ..< num:
r.square()
a.square(skipReduction = false)

func square_repeated*(r: var FF, a: FF, num: int) {.meter.} =
func square_repeated*(r: var FF, a: FF, num: int, skipReduction: static bool = false) {.meter.} =
## Repeated squarings

# TODO: understand the conditions to avoid the final substraction
r.square(a)
for _ in 1 ..< num:
r.square()

func square_repeated_then_mul*(a: var FF, num: int, b: FF, skipReduction: static bool = false) {.meter.} =
## Square `a`, `num` times and then multiply by b
## Assumes at least 1 squaring
a.square_repeated(num, skipReduction = false)
a.prod(a, b, skipReduction = skipReduction)

func square_repeated_then_mul*(r: var FF, a: FF, num: int, b: FF, skipReduction: static bool = false) {.meter.} =
## Square `a`, `num` times and then multiply by b
## Assumes at least 1 squaring
r.square_repeated(a, num, skipReduction = false)
r.prod(r, b, skipReduction = skipReduction)

func `*=`*(a: var FF, b: static int) =
## Multiplication by a small integer known at compile-time
# Implementation:
@ -73,7 +73,7 @@ func redc2x*(r: var Fp, a: FpDbl) =
## Reduce a double-precision field element into r
## from [0, 2ⁿp) range to [0, p) range
const N = r.mres.limbs.len
montyRedc2x(
redc2xMont(
r.mres.limbs,
a.limbs2x,
Fp.C.Mod.limbs,

@ -128,7 +128,7 @@ func numWordsFromBits(bits: int): int {.inline.} =
func shlAddMod_estimate(a: LimbsViewMut, aLen: int,
c: SecretWord, M: LimbsViewConst, mBits: int
): tuple[neg, tooBig: SecretBool] =
## Estimate a <- a shl 2^w + c (mod M)
## Estimate a <- a shl 2ʷ + c (mod M)
##
## with w the base word width, usually 32 on 32-bit platforms and 64 on 64-bit platforms
##

@ -74,7 +74,7 @@ func prod*[rLen, aLen, bLen: static int](r: var Limbs[rLen], a: Limbs[aLen], b:
when UseASM_X86_64 and aLen <= 6:
# ADX implies BMI2
if ({.noSideEffect.}: hasAdx()):
mul_asm_adx_bmi2(r, a, b)
mul_asm_adx(r, a, b)
else:
mul_asm(r, a, b)
elif UseASM_X86_64:

@ -98,7 +98,7 @@ func prod_high_words*[rLen, aLen, bLen](
# - Barret reduction
# - Approximating multiplication by a fractional constant in the form f(a) = K/C * a
# with K and C known at compile-time.
# We can instead find a well chosen M = (2^WordBitWidth)^w, with M > C (i.e. M is a power of 2 bigger than C)
# We can instead find a well chosen M = (2^WordBitWidth)ʷ, with M > C (i.e. M is a power of 2 bigger than C)
# Precompute P = K*M/C at compile-time
# and at runtime do P*a/M <=> P*a >> (WordBitWidth*w)
# i.e. prod_high_words(result, P, a, w)

@ -203,7 +203,7 @@ func square*[rLen, aLen](
when UseASM_X86_64 and aLen in {4, 6} and rLen == 2*aLen:
# ADX implies BMI2
if ({.noSideEffect.}: hasAdx()):
square_asm_adx_bmi2(r, a)
square_asm_adx(r, a)
else:
square_asm(r, a)
elif UseASM_X86_64:

@ -15,12 +15,12 @@ import
./limbs, ./limbs_extmul

when UseASM_X86_32:
import ./assembly/limbs_asm_montred_x86
import ./assembly/limbs_asm_redc_mont_x86
when UseASM_X86_64:
import
./assembly/limbs_asm_montmul_x86,
./assembly/limbs_asm_montmul_x86_adx_bmi2,
./assembly/limbs_asm_montred_x86_adx_bmi2
./assembly/limbs_asm_mul_mont_x86,
./assembly/limbs_asm_mul_mont_x86_adx_bmi2,
./assembly/limbs_asm_redc_mont_x86_adx_bmi2

# ############################################################
#

@ -53,12 +53,19 @@ when UseASM_X86_64:

# Montgomery Reduction
# ------------------------------------------------------------
func montyRedc2x_CIOS[N: static int](
func redc2xMont_CIOS[N: static int](
r: var array[N, SecretWord],
a: array[N*2, SecretWord],
M: array[N, SecretWord],
m0ninv: BaseType) =
m0ninv: BaseType, skipReduction: static bool = false) =
## Montgomery reduce a double-precision bigint modulo M
##
## This maps
## - [0, 4p²) -> [0, 2p) with skipReduction
## - [0, 4p²) -> [0, p) without
##
## SkipReduction skips the final substraction step.
## For skipReduction, M needs to have a spare bit in it's representation i.e. unused MSB.
# - Analyzing and Comparing Montgomery Multiplication Algorithms
# Cetin Kaya Koc and Tolga Acar and Burton S. Kaliski Jr.
# http://pdfs.semanticscholar.org/5e39/41ff482ec3ee41dc53c3298f0be085c69483.pdf

@ -81,7 +88,7 @@ func montyRedc2x_CIOS[N: static int](
#
# for i in 0 .. n-1:
# C <- 0
# m <- a[i] * m0ninv mod 2^w (i.e. simple multiplication)
# m <- a[i] * m0ninv mod 2ʷ (i.e. simple multiplication)
# for j in 0 .. n-1:
# (C, S) <- a[i+j] + m * M[j] + C
# a[i+j] <- S

@ -95,8 +102,8 @@ func montyRedc2x_CIOS[N: static int](
# to the higher limb if any, thank you "implementation detail"
# missing from paper.

var a = a # Copy "t" for mutation and ensure on stack
var res: typeof(r) # Accumulator
var a {.noInit.} = a # Copy "t" for mutation and ensure on stack
var res {.noInit.}: typeof(r) # Accumulator
staticFor i, 0, N:
var C = Zero
let m = a[i] * SecretWord(m0ninv)

@ -112,15 +119,23 @@ func montyRedc2x_CIOS[N: static int](
addC(carry, res[i], a[i+N], res[i], carry)

# Final substraction
when not skipReduction:
discard res.csub(M, SecretWord(carry).isNonZero() or not(res < M))
r = res

func montyRedc2x_Comba[N: static int](
func redc2xMont_Comba[N: static int](
r: var array[N, SecretWord],
a: array[N*2, SecretWord],
M: array[N, SecretWord],
m0ninv: BaseType) =
m0ninv: BaseType, skipReduction: static bool = false) =
## Montgomery reduce a double-precision bigint modulo M
##
## This maps
## - [0, 4p²) -> [0, 2p) with skipReduction
## - [0, 4p²) -> [0, p) without
##
## SkipReduction skips the final substraction step.
## For skipReduction, M needs to have a spare bit in it's representation i.e. unused MSB.
# We use Product Scanning / Comba multiplication
var t, u, v = Zero
var carry: Carry

@ -156,18 +171,25 @@ func montyRedc2x_Comba[N: static int](
addC(carry, z[N-1], v, a[2*N-1], Carry(0))

# Final substraction
when not skipReduction:
discard z.csub(M, SecretBool(carry) or not(z < M))
r = z

# Montgomery Multiplication
# ------------------------------------------------------------

func montyMul_CIOS_sparebit(r: var Limbs, a, b, M: Limbs, m0ninv: BaseType) =
func mulMont_CIOS_sparebit(r: var Limbs, a, b, M: Limbs, m0ninv: BaseType, skipReduction: static bool = false) =
## Montgomery Multiplication using Coarse Grained Operand Scanning (CIOS)
## and no-carry optimization.
## This requires the most significant word of the Modulus
## M[^1] < high(SecretWord) shr 1 (i.e. less than 0b01111...1111)
## https://hackmd.io/@zkteam/modular_multiplication
## https://hackmd.io/@gnark/modular_multiplication
##
## This maps
## - [0, 2p) -> [0, 2p) with skipReduction
## - [0, 2p) -> [0, p) without
##
## SkipReduction skips the final substraction step.

# We want all the computation to be kept in registers
# hence we use a temporary `t`, hoping that the compiler does it.

@ -175,7 +197,7 @@ func montyMul_CIOS_sparebit(r: var Limbs, a, b, M: Limbs, m0ninv: BaseType) =
const N = t.len
staticFor i, 0, N:
# (A, t[0]) <- a[0] * b[i] + t[0]
# m <- (t[0] * m0ninv) mod 2^w
# m <- (t[0] * m0ninv) mod 2ʷ
# (C, _) <- m * M[0] + t[0]
var A: SecretWord
muladd1(A, t[0], a[0], b[i], t[0])

@ -191,10 +213,11 @@ func montyMul_CIOS_sparebit(r: var Limbs, a, b, M: Limbs, m0ninv: BaseType) =

t[N-1] = C + A

when not skipReduction:
discard t.csub(M, not(t < M))
r = t

func montyMul_CIOS(r: var Limbs, a, b, M: Limbs, m0ninv: BaseType) {.used.} =
func mulMont_CIOS(r: var Limbs, a, b, M: Limbs, m0ninv: BaseType) {.used.} =
## Montgomery Multiplication using Coarse Grained Operand Scanning (CIOS)
# - Analyzing and Comparing Montgomery Multiplication Algorithms
# Cetin Kaya Koc and Tolga Acar and Burton S. Kaliski Jr.

@ -221,7 +244,7 @@ func montyMul_CIOS(r: var Limbs, a, b, M: Limbs, m0ninv: BaseType) {.used.} =
addC(tNp1, tN, tN, A, Carry(0))

# Reduction
# m <- (t[0] * m0ninv) mod 2^w
# m <- (t[0] * m0ninv) mod 2ʷ
# (C, _) <- m * M[0] + t[0]
var C, lo = Zero
let m = t[0] * SecretWord(m0ninv)

@ -239,11 +262,18 @@ func montyMul_CIOS(r: var Limbs, a, b, M: Limbs, m0ninv: BaseType) {.used.} =
# t[N+1] can only be non-zero in the intermediate computation
# since it is immediately reduce to t[N] at the end of each "i" iteration
# However if t[N] is non-zero we have t > M
discard t.csub(M, tN.isNonZero() or not(t < M)) # TODO: (t >= M) is unnecessary for prime in the form (2^64)^w
discard t.csub(M, tN.isNonZero() or not(t < M)) # TODO: (t >= M) is unnecessary for prime in the form (2^64)ʷ
r = t

func montyMul_FIPS(r: var Limbs, a, b, M: Limbs, m0ninv: BaseType) =
func mulMont_FIPS(r: var Limbs, a, b, M: Limbs, m0ninv: BaseType, skipReduction: static bool = false) =
## Montgomery Multiplication using Finely Integrated Product Scanning (FIPS)
##
## This maps
## - [0, 2p) -> [0, 2p) with skipReduction
## - [0, 2p) -> [0, p) without
##
## SkipReduction skips the final substraction step.
## For skipReduction, M needs to have a spare bit in it's representation i.e. unused MSB.
# - Architectural Enhancements for Montgomery
# Multiplication on Embedded RISC Processors
# Johann Großschädl and Guy-Armand Kamendje, 2003

@ -276,6 +306,7 @@ func montyMul_FIPS(r: var Limbs, a, b, M: Limbs, m0ninv: BaseType) =
u = t
t = Zero

when not skipReduction:
discard z.csub(M, v.isNonZero() or not(z < M))
r = z

@ -313,36 +344,81 @@ func montyMul_FIPS(r: var Limbs, a, b, M: Limbs, m0ninv: BaseType) =
# ...
# for j in 1 ..< N: # <- Montgomery reduce.

# Montgomery Conversion
# ------------------------------------------------------------
#
# In Montgomery form, inputs are scaled by a constant R
# so a' = aR (mod p) and b' = bR (mod p)
#
# A classic multiplication would do a'*b' = abR² (mod p)
# we then need to remove the extra R, hence:
# - Montgomery reduction (redc) does 1/R (mod p) to map abR² (mod p) -> abR (mod p)
# - Montgomery multiplication directly compute mulMont(aR, bR) = abR (mod p)
#
# So to convert a to a' = aR (mod p), we can do mulMont(a, R²) = aR (mod p)
# and to convert a' to a = aR / R (mod p) we can do:
# - redc(aR) = a
# - or mulMont(aR, 1) = a

func fromMont_CIOS(r: var Limbs, a, M: Limbs, m0ninv: BaseType) =
## Convert from Montgomery form to canonical BigInt form
# for i in 0 .. n-1:
#   m <- t[0] * m0ninv mod 2ʷ (i.e. simple multiplication)
#   C, _ = t[0] + m * M[0]
#   for j in 1 ..n-1:
#     (C, t[j-1]) <- r[j] + m*M[j] + C
#   t[n-1] = C

const N = a.len
var t {.noInit.} = a # Ensure working in registers

staticFor i, 0, N:
let m = t[0] * SecretWord(m0ninv)
var C, lo: SecretWord
muladd1(C, lo, m, M[0], t[0])
staticFor j, 1, N:
muladd2(C, t[j-1], m, M[j], C, t[j])
t[N-1] = C

discard t.csub(M, not(t < M))
r = t
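
These conversion identities are easy to check end to end with a toy one-word Montgomery setup. The sketch below uses a made-up 16-bit word and modulus and plain uint64 arithmetic (nothing from Constantine's Limbs machinery), and verifies getMont via mulMont(a, R²) and fromMont via both redc and mulMont(aR, 1):

const
  WordBits = 16
  R = 65536'u64                          # 2^WordBits
  p = 65521'u64                          # largest prime below 2^16, a made-up toy modulus

proc negInvModWord(): uint64 =
  ## Brute-force -p⁻¹ mod 2^16 (the real code derives this constant differently).
  for cand in 1'u64 ..< R:
    if ((p * cand) and (R - 1)) == R - 1:
      return cand

proc redcToy(t, m0ninv: uint64): uint64 =
  ## Montgomery reduction of a double-wide t < p*R: returns t * R⁻¹ (mod p).
  let m = ((t and (R - 1)) * m0ninv) and (R - 1)
  result = (t + m * p) shr WordBits
  if result >= p: result -= p

proc mulMontToy(a, b, m0ninv: uint64): uint64 = redcToy(a * b, m0ninv)

when isMainModule:
  let m0ninv = negInvModWord()
  let r2 = (R * R) mod p                 # R² mod p
  let a = 12345'u64
  let aMont = mulMontToy(a, r2, m0ninv)  # getMont: a·R (mod p)
  doAssert aMont == (a * R) mod p
  doAssert redcToy(aMont, m0ninv) == a           # fromMont via redc
  doAssert mulMontToy(aMont, 1'u64, m0ninv) == a # fromMont via mulMont(aR, 1)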

# Exported API
# ------------------------------------------------------------

# TODO upstream, using Limbs[N] breaks semcheck
func montyRedc2x*[N: static int](
func redc2xMont*[N: static int](
r: var array[N, SecretWord],
a: array[N*2, SecretWord],
M: array[N, SecretWord],
m0ninv: BaseType, spareBits: static int) {.inline.} =
m0ninv: BaseType,
spareBits: static int, skipReduction: static bool = false) {.inline.} =
## Montgomery reduce a double-precision bigint modulo M

const skipReduction = skipReduction and spareBits >= 1

when UseASM_X86_64 and r.len <= 6:
# ADX implies BMI2
if ({.noSideEffect.}: hasAdx()):
montRed_asm_adx_bmi2(r, a, M, m0ninv, spareBits >= 1)
redcMont_asm_adx(r, a, M, m0ninv, spareBits >= 1, skipReduction)
else:
when r.len in {3..6}:
montRed_asm(r, a, M, m0ninv, spareBits >= 1)
redcMont_asm(r, a, M, m0ninv, spareBits >= 1, skipReduction)
else:
montyRedc2x_CIOS(r, a, M, m0ninv)
# montyRedc2x_Comba(r, a, M, m0ninv)
redc2xMont_CIOS(r, a, M, m0ninv, skipReduction)
# redc2xMont_Comba(r, a, M, m0ninv)
elif UseASM_X86_64 and r.len in {3..6}:
# TODO: Assembly faster than GCC but slower than Clang
montRed_asm(r, a, M, m0ninv, spareBits >= 1)
redcMont_asm(r, a, M, m0ninv, spareBits >= 1, skipReduction)
else:
montyRedc2x_CIOS(r, a, M, m0ninv)
# montyRedc2x_Comba(r, a, M, m0ninv)
redc2xMont_CIOS(r, a, M, m0ninv, skipReduction)
# redc2xMont_Comba(r, a, M, m0ninv, skipReduction)

func montyMul*(
func mulMont*(
r: var Limbs, a, b, M: Limbs,
m0ninv: static BaseType, spareBits: static int) {.inline.} =
m0ninv: BaseType,
spareBits: static int,
skipReduction: static bool = false) {.inline.} =
## Compute r <- a*b (mod M) in the Montgomery domain
## `m0ninv` = -1/M (mod SecretWord). Our words are 2^32 or 2^64
##

@ -369,43 +445,50 @@ func montyMul*(
# The implementation is visible from here, the compiler can make decision whether to:
# - specialize/duplicate code for m0ninv == 1 (especially if only 1 curve is needed)
# - keep it generic and optimize code size

const skipReduction = skipReduction and spareBits >= 1

when spareBits >= 1:
when UseASM_X86_64 and a.len in {2 .. 6}: # TODO: handle spilling
# ADX implies BMI2
if ({.noSideEffect.}: hasAdx()):
montMul_CIOS_sparebit_asm_adx_bmi2(r, a, b, M, m0ninv)
mulMont_CIOS_sparebit_asm_adx(r, a, b, M, m0ninv, skipReduction)
else:
montMul_CIOS_sparebit_asm(r, a, b, M, m0ninv)
mulMont_CIOS_sparebit_asm(r, a, b, M, m0ninv, skipReduction)
else:
montyMul_CIOS_sparebit(r, a, b, M, m0ninv)
mulMont_CIOS_sparebit(r, a, b, M, m0ninv, skipReduction)
else:
montyMul_FIPS(r, a, b, M, m0ninv)
mulMont_FIPS(r, a, b, M, m0ninv, skipReduction)

func montySquare*[N](r: var Limbs[N], a, M: Limbs[N],
m0ninv: static BaseType, spareBits: static int) {.inline.} =
func squareMont*[N](r: var Limbs[N], a, M: Limbs[N],
m0ninv: BaseType,
spareBits: static int,
skipReduction: static bool = false) {.inline.} =
## Compute r <- a^2 (mod M) in the Montgomery domain
## `m0ninv` = -1/M (mod SecretWord). Our words are 2^31 or 2^63

const skipReduction = skipReduction and spareBits >= 1

when UseASM_X86_64 and a.len in {4, 6}:
# ADX implies BMI2
if ({.noSideEffect.}: hasAdx()):
# With ADX and spare bit, montSquare_CIOS_asm_adx_bmi2
# With ADX and spare bit, squareMont_CIOS_asm_adx
# which uses unfused squaring then Montgomery reduction
# is slightly slower than fused Montgomery multiplication
when spareBits >= 1:
montMul_CIOS_sparebit_asm_adx_bmi2(r, a, a, M, m0ninv)
mulMont_CIOS_sparebit_asm_adx(r, a, a, M, m0ninv, skipReduction)
else:
montSquare_CIOS_asm_adx_bmi2(r, a, M, m0ninv, spareBits >= 1)
squareMont_CIOS_asm_adx(r, a, M, m0ninv, spareBits >= 1, skipReduction)
else:
montSquare_CIOS_asm(r, a, M, m0ninv, spareBits >= 1)
squareMont_CIOS_asm(r, a, M, m0ninv, spareBits >= 1, skipReduction)
elif UseASM_X86_64:
var r2x {.noInit.}: Limbs[2*N]
r2x.square(a)
r.montyRedc2x(r2x, M, m0ninv, spareBits)
r.redc2xMont(r2x, M, m0ninv, spareBits, skipReduction)
else:
montyMul(r, a, a, M, m0ninv, spareBits)
mulMont(r, a, a, M, m0ninv, spareBits, skipReduction)

func redc*(r: var Limbs, a, one, M: Limbs,
func fromMont*(r: var Limbs, a, M: Limbs,
m0ninv: static BaseType, spareBits: static int) =
## Transform a bigint ``a`` from it's Montgomery N-residue representation (mod N)
## to the regular natural representation (mod N)

@ -424,10 +507,16 @@ func redc*(r: var Limbs, a, one, M: Limbs,
# - https://en.wikipedia.org/wiki/Montgomery_modular_multiplication#Montgomery_arithmetic_on_multiprecision_(variable-radix)_integers
# - http://langevin.univ-tln.fr/cours/MLC/extra/montgomery.pdf
#   Montgomery original paper
#
montyMul(r, a, one, M, m0ninv, spareBits)
when UseASM_X86_64 and a.len in {2 .. 6}:
# ADX implies BMI2
if ({.noSideEffect.}: hasAdx()):
fromMont_asm_adx(r, a, M, m0ninv)
else:
fromMont_asm(r, a, M, m0ninv)
else:
fromMont_CIOS(r, a, M, m0ninv)

func montyResidue*(r: var Limbs, a, M, r2modM: Limbs,
func getMont*(r: var Limbs, a, M, r2modM: Limbs,
m0ninv: static BaseType, spareBits: static int) =
## Transform a bigint ``a`` from it's natural representation (mod N)
## to a the Montgomery n-residue representation

@ -446,7 +535,7 @@ func montyResidue*(r: var Limbs, a, M, r2modM: Limbs,
## Important: `r` is overwritten
## The result `r` buffer size MUST be at least the size of `M` buffer
# Reference: https://eprint.iacr.org/2017/1057.pdf
montyMul(r, a, r2ModM, M, m0ninv, spareBits)
mulMont(r, a, r2ModM, M, m0ninv, spareBits)

# Montgomery Modular Exponentiation
# ------------------------------------------

@ -489,7 +578,7 @@ func getWindowLen(bufLen: int): uint =
while (1 shl result) + 1 > bufLen:
dec result

func montyPowPrologue(
func powMontPrologue(
a: var Limbs, M, one: Limbs,
m0ninv: static BaseType,
scratchspace: var openarray[Limbs],

@ -507,12 +596,12 @@ func montyPowPrologue(
else:
scratchspace[2] = a
for k in 2 ..< 1 shl result:
scratchspace[k+1].montyMul(scratchspace[k], a, M, m0ninv, spareBits)
scratchspace[k+1].mulMont(scratchspace[k], a, M, m0ninv, spareBits)

# Set a to one
a = one

func montyPowSquarings(
func powMontSquarings(
a: var Limbs,
exponent: openarray[byte],
M: Limbs,

@ -557,12 +646,11 @@ func montyPowSquarings(

# We have k bits and can do k squaring
for i in 0 ..< k:
tmp.montySquare(a, M, m0ninv, spareBits)
a = tmp
a.squareMont(a, M, m0ninv, spareBits)

return (k, bits)

func montyPow*(
func powMont*(
a: var Limbs,
exponent: openarray[byte],
M, one: Limbs,

@ -596,7 +684,7 @@ func montyPow*(
## A window of size 5 requires (2^5 + 1)*(381 + 7)/8 = 33 * 48 bytes = 1584 bytes
## of scratchspace (on the stack).

let window = montyPowPrologue(a, M, one, m0ninv, scratchspace, spareBits)
let window = powMontPrologue(a, M, one, m0ninv, scratchspace, spareBits)

# We process bits with from most to least significant.
# At each loop iteration with have acc_len bits in acc.
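
As a quick sanity check of that scratchspace figure (plain arithmetic on the numbers quoted in the comment above, not a library call):

proc scratchBytes(windowSize, fieldBits: int): int =
  ## (2^windowSize + 1) temporaries of (fieldBits + 7) div 8 bytes each.
  ((1 shl windowSize) + 1) * ((fieldBits + 7) div 8)

when isMainModule:
  doAssert scratchBytes(5, 381) == 1584   # the 33 * 48 = 1584 bytes quoted above
  echo scratchBytes(4, 255)               # 17 * 32 = 544 bytes for a 255-bit modulus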

@ -607,7 +695,7 @@ func montyPow*(
acc, acc_len: uint
e = 0
while acc_len > 0 or e < exponent.len:
let (k, bits) = montyPowSquarings(
let (k, bits) = powMontSquarings(
a, exponent, M, m0ninv,
scratchspace[0], window,
acc, acc_len, e,

@ -626,10 +714,10 @@ func montyPow*(

# Multiply with the looked-up value
# we keep the product only if the exponent bits are not all zeroes
scratchspace[0].montyMul(a, scratchspace[1], M, m0ninv, spareBits)
scratchspace[0].mulMont(a, scratchspace[1], M, m0ninv, spareBits)
a.ccopy(scratchspace[0], SecretWord(bits).isNonZero())

func montyPowUnsafeExponent*(
func powMontUnsafeExponent*(
a: var Limbs,
exponent: openarray[byte],
M, one: Limbs,

@ -649,13 +737,13 @@ func montyPowUnsafeExponent*(

# TODO: scratchspace[1] is unused when window > 1

let window = montyPowPrologue(a, M, one, m0ninv, scratchspace, spareBits)
let window = powMontPrologue(a, M, one, m0ninv, scratchspace, spareBits)

var
acc, acc_len: uint
e = 0
while acc_len > 0 or e < exponent.len:
let (_, bits) = montyPowSquarings(
let (_, bits) = powMontSquarings(
a, exponent, M, m0ninv,
scratchspace[0], window,
acc, acc_len, e,

@ -665,10 +753,10 @@ func montyPowUnsafeExponent*(
## Warning ⚠️: Exposes the exponent bits
if bits != 0:
if window > 1:
scratchspace[0].montyMul(a, scratchspace[1+bits], M, m0ninv, spareBits)
scratchspace[0].mulMont(a, scratchspace[1+bits], M, m0ninv, spareBits)
else:
# scratchspace[1] holds the original `a`
scratchspace[0].montyMul(a, scratchspace[1], M, m0ninv, spareBits)
scratchspace[0].mulMont(a, scratchspace[1], M, m0ninv, spareBits)
a = scratchspace[0]

{.pop.} # raises no exceptions

@ -355,7 +355,7 @@ func r3mod*(M: BigInt): BigInt =
## This is used in hash-to-curve to
## reduce a double-sized bigint mod M
## and map it to the Montgomery domain
## with just redc2x + montyMul
## with just redc2x + mulMont
r_powmod(3, M)
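
Assuming the conventions described earlier in this diff (redc2x maps a double-width u to u·R⁻¹ mod M, and mulMont(x, y) computes x·y·R⁻¹ mod M), the choice of R³ works out as:
mulMont(redc2x(u), R³ mod M) = u·R⁻¹ · R³ · R⁻¹ = u·R (mod M)
i.e. u reduced modulo M and already mapped to the Montgomery domain, which is exactly what hash-to-curve needs here.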

func montyOne*(M: BigInt): BigInt =

@ -391,7 +391,7 @@ func primePlus1div2*(P: BigInt): BigInt =

# (P+1)/2 = P/2 + 1 if P is odd,
# this avoids overflowing if the prime uses all bits
# i.e. in the form (2^64)^w - 1 or (2^32)^w - 1
# i.e. in the form (2^64)ʷ - 1 or (2^32)ʷ - 1

result = P
result.shiftRight(1)

@ -491,7 +491,7 @@ func toCanonicalIntRepr*[bits: static int](
# ############################################################
# This is needed to avoid recursive dependencies

func montyMul_precompute(r: var BigInt, a, b, M: BigInt, m0ninv: BaseType) =
func mulMont_precompute(r: var BigInt, a, b, M: BigInt, m0ninv: BaseType) =
## Montgomery Multiplication using Coarse Grained Operand Scanning (CIOS)
var t: typeof(M) # zero-init
const N = t.limbs.len

@ -527,4 +527,4 @@ func montyResidue_precompute*(r: var BigInt, a, M, r2modM: BigInt,
## Transform a bigint ``a`` from it's natural representation (mod N)
## to a the Montgomery n-residue representation
## This is intended for compile-time precomputations-only
montyMul_precompute(r, a, r2ModM, M, m0ninv)
mulMont_precompute(r, a, r2ModM, M, m0ninv)

@ -154,15 +154,15 @@ func expandMessageXMD*[B1, B2, B3: byte|char, len_in_bytes: static int](
return

func redc2x[FF](r: var FF, big2x: BigInt) {.inline.} =
r.mres.limbs.montyRedc2x(
r.mres.limbs.redc2xMont(
big2x.limbs,
FF.fieldMod().limbs,
FF.getNegInvModWord(),
FF.getSpareBits()
)

func montyMul(r: var BigInt, a, b: BigInt, FF: type) {.inline.} =
r.limbs.montyMul(
func mulMont(r: var BigInt, a, b: BigInt, FF: type) {.inline.} =
r.limbs.mulMont(
a.limbs, b.limbs,
FF.fieldMod().limbs,
FF.getNegInvModWord(),

@ -228,14 +228,14 @@ func hashToField*[Field; B1, B2, B3: byte|char, count: static int](
# Reduces modulo p and output in Montgomery domain
when m == 1:
output[i].redc2x(big2x)
output[i].mres.montyMul(
output[i].mres.mulMont(
output[i].mres,
Fp[Field.C].getR3ModP(),
Fp[Field.C])

else:
output[i].coords[j].redc2x(big2x)
output[i].coords[j].mres.montyMul(
output[i].coords[j].mres.mulMont(
output[i].coords[j].mres,
Fp[Field.C].getR3ModP(),
Fp[Field.C])

@ -249,7 +249,7 @@ let
hasBmi2Impl = testX86Feature(Bmi2)
hasTsxHleImpl = testX86Feature(TsxHle)
hasTsxRtmImpl = testX86Feature(TsxRtm)
hasAdxImpl = testX86Feature(TsxHle)
hasAdxImpl = testX86Feature(Adx)
hasSgxImpl = testX86Feature(Sgx)
hasGfniImpl = testX86Feature(Gfni)
hasAesImpl = testX86Feature(Aes)

@ -337,6 +337,10 @@ func generate*(a: Assembler_x86): NimNode =
)
)
)
result = nnkBlockStmt.newTree(
newEmptyNode(),
result
)

func getStrOffset(a: Assembler_x86, op: Operand): string =
if op.kind != kFromArray:

@ -195,8 +195,8 @@ func eth_evm_ecmul*(

var sprime{.noInit.}: typeof(smod.mres)
# Due to mismatch between the BigInt[256] input and the rest being BigInt[254]
# we use the low-level montyResidue instead of 'fromBig'
montyResidue(smod.mres.limbs, s.limbs,
# we use the low-level getMont instead of 'fromBig'
getMont(smod.mres.limbs, s.limbs,
Fr[BN254_Snarks].fieldMod().limbs,
Fr[BN254_Snarks].getR2modP().limbs,
Fr[BN254_Snarks].getNegInvModWord(),

@ -13,8 +13,8 @@ import
../../arithmetic,
../../arithmetic/assembly/[
limbs_asm_mul_x86_adx_bmi2,
limbs_asm_montmul_x86_adx_bmi2,
limbs_asm_montred_x86_adx_bmi2
limbs_asm_mul_mont_x86_adx_bmi2,
limbs_asm_redc_mont_x86_adx_bmi2
]

@ -50,7 +50,7 @@ func has2extraBits(F: type Fp): bool =
# 𝔽p2 squaring
# ------------------------------------------------------------

func sqrx2x_complex_asm_adx_bmi2*(
func sqrx2x_complex_asm_adx*(
r: var array[2, FpDbl],
a: array[2, Fp]
) =

@ -67,11 +67,11 @@ func sqrx2x_complex_asm_adx_bmi2*(
t0.double(a.c1)
t1.sum(a.c0, a.c1)

r.c1.mul_asm_adx_bmi2_impl(t0, a.c0)
r.c1.mul_asm_adx_inline(t0, a.c0)
t0.diff(a.c0, a.c1)
r.c0.mul_asm_adx_bmi2_impl(t0, t1)
r.c0.mul_asm_adx_inline(t0, t1)

func sqrx_complex_sparebit_asm_adx_bmi2*(
func sqrx_complex_sparebit_asm_adx*(
r: var array[2, Fp],
a: array[2, Fp]
) =

@ -85,15 +85,15 @@ func sqrx_complex_sparebit_asm_adx_bmi2*(
var v0 {.noInit.}, v1 {.noInit.}: typeof(r.c0)
v0.diff(a.c0, a.c1)
v1.sum(a.c0, a.c1)
r.c1.mres.limbs.montMul_CIOS_sparebit_asm_adx_bmi2(a.c0.mres.limbs, a.c1.mres.limbs, Fp.fieldMod().limbs, Fp.getNegInvModWord())
r.c1.mres.limbs.mulMont_CIOS_sparebit_asm_adx(a.c0.mres.limbs, a.c1.mres.limbs, Fp.fieldMod().limbs, Fp.getNegInvModWord())
# aliasing: a unneeded now
r.c1.double()
r.c0.mres.limbs.montMul_CIOS_sparebit_asm_adx_bmi2(v0.mres.limbs, v1.mres.limbs, Fp.fieldMod().limbs, Fp.getNegInvModWord())
r.c0.mres.limbs.mulMont_CIOS_sparebit_asm_adx(v0.mres.limbs, v1.mres.limbs, Fp.fieldMod().limbs, Fp.getNegInvModWord())
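
The two multiplications above rely on the complex-squaring identity (a₀ + i·a₁)² = (a₀ − a₁)(a₀ + a₁) + 2·a₀·a₁·i. A tiny self-contained check over a made-up prime, using plain modular integers rather than Montgomery form or the library types:

const q = 1000003'u64   # made-up prime with q ≡ 3 (mod 4), so x² + 1 is irreducible over Fq

proc mulmod(a, b: uint64): uint64 = (a * b) mod q
proc addmod(a, b: uint64): uint64 = (a + b) mod q
proc submod(a, b: uint64): uint64 = (a + q - b) mod q

proc sqrComplex(a0, a1: uint64): (uint64, uint64) =
  ## (a0 + i·a1)² with only 2 multiplications
  let c0 = mulmod(submod(a0, a1), addmod(a0, a1))   # a0² - a1²
  let c1 = mulmod(mulmod(2'u64, a0), a1)            # 2·a0·a1
  (c0, c1)

proc sqrComplexSchoolbook(a0, a1: uint64): (uint64, uint64) =
  (submod(mulmod(a0, a0), mulmod(a1, a1)),
   addmod(mulmod(a0, a1), mulmod(a1, a0)))

when isMainModule:
  doAssert sqrComplex(123456'u64, 789'u64) == sqrComplexSchoolbook(123456'u64, 789'u64)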

# 𝔽p2 multiplication
# ------------------------------------------------------------

func mulx2x_complex_asm_adx_bmi2*(
func mul2x_fp2_complex_asm_adx*(
r: var array[2, FpDbl],
a, b: array[2, Fp]
) =

@ -101,15 +101,15 @@ func mulx2x_complex_asm_adx_bmi2*(
var D {.noInit.}: typeof(r.c0)
var t0 {.noInit.}, t1 {.noInit.}: typeof(a.c0)

r.c0.limbs2x.mul_asm_adx_bmi2_impl(a.c0.mres.limbs, b.c0.mres.limbs)
D.limbs2x.mul_asm_adx_bmi2_impl(a.c1.mres.limbs, b.c1.mres.limbs)
r.c0.limbs2x.mul_asm_adx_inline(a.c0.mres.limbs, b.c0.mres.limbs)
D.limbs2x.mul_asm_adx_inline(a.c1.mres.limbs, b.c1.mres.limbs)
when Fp.has1extraBit():
t0.sumUnr(a.c0, a.c1)
t1.sumUnr(b.c0, b.c1)
else:
t0.sum(a.c0, a.c1)
t1.sum(b.c0, b.c1)
r.c1.limbs2x.mul_asm_adx_bmi2_impl(t0.mres.limbs, t1.mres.limbs)
r.c1.limbs2x.mul_asm_adx_inline(t0.mres.limbs, t1.mres.limbs)
when Fp.has1extraBit():
r.c1.diff2xUnr(r.c1, r.c0)
r.c1.diff2xUnr(r.c1, D)

@ -118,20 +118,20 @@ func mulx2x_complex_asm_adx_bmi2*(
r.c1.diff2xMod(r.c1, D)
r.c0.diff2xMod(r.c0, D)

func mulx_complex_asm_adx_bmi2*(
func mul_fp2_complex_asm_adx*(
r: var array[2, Fp],
a, b: array[2, Fp]
) =
## Complex multiplication on 𝔽p2
var d {.noInit.}: array[2,doublePrec(Fp)]
d.mulx2x_complex_asm_adx_bmi2(a, b)
r.c0.mres.limbs.montRed_asm_adx_bmi2_impl(
d.mul2x_fp2_complex_asm_adx(a, b)
r.c0.mres.limbs.redcMont_asm_adx_inline(
d.c0.limbs2x,
Fp.fieldMod().limbs,
Fp.getNegInvModWord(),
Fp.has1extraBit()
)
r.c1.mres.limbs.montRed_asm_adx_bmi2_impl(
r.c1.mres.limbs.redcMont_asm_adx_inline(
d.c1.limbs2x,
Fp.fieldMod().limbs,
Fp.getNegInvModWord(),

@ -454,7 +454,7 @@ func prod*[C: static Curve](
# (c0 + c1 x) (u + v x) => u c0 + (u c0 + u c1)x + v c1 x²
#                       => u c0 + β v c1 + (v c0 + u c1) x
when a.fromComplexExtension() and u == 1 and v == 1:
let t = a.c0
let t {.noInit.} = a.c0
r.c0.diff(t, a.c1)
r.c1.sum(t, a.c1)
else:

@ -504,7 +504,7 @@ func prod2x*[C: static Curve](
const Beta {.used.} = C.getNonResidueFp()

when complex and U == 1 and V == 1:
let a1 = a.c1
let a1 {.noInit.} = a.c1
r.c1.sum2xMod(a.c0, a1)
r.c0.diff2xMod(a.c0, a1)
else:

@ -567,13 +567,13 @@ func `/=`*[C: static Curve](a: var Fp2[C], _: type NonResidue) =
# 1/2 * (c0 + c1, c1 - c0)

when a.fromComplexExtension() and u == 1 and v == 1:
let t = a.c0
let t {.noInit.} = a.c0
a.c0 += a.c1
a.c1 -= t
a.div2()
else:
var a0 = a.c0
let a1 = a.c1
var a0 {.noInit.} = a.c0
let a1 {.noInit.} = a.c1
const u2v2 = u*u - Beta*v*v # (u² - βv²)
# TODO can be precomputed to avoid costly inversion.
var u2v2inv {.noInit.}: a.c0.typeof

@ -605,7 +605,7 @@ func prod*(r: var QuadraticExt, a: QuadraticExt, _: type NonResidue) =
## - if sextic non-residue: 𝔽p8, 𝔽p12 or 𝔽p24
##
## Assumes that the non-residue is sqrt(lower extension non-residue)
let t = a.c0
let t {.noInit.} = a.c0
r.c0.prod(a.c1, NonResidue)
r.c1 = t

@ -628,7 +628,7 @@ func prod2x*(
_: type NonResidue) =
## Multiplication by non-residue
static: doAssert not(r.c0 is FpDbl), "Wrong dispatch, there is a specific non-residue multiplication for the base extension."
let t = a.c0
let t {.noInit.} = a.c0
r.c0.prod2x(a.c1, NonResidue)
`=`(r.c1, t) # "r.c1 = t", is refused by the compiler.

@ -650,7 +650,7 @@ func prod*(r: var CubicExt, a: CubicExt, _: type NonResidue) =
## For all curves γ = v with v the factor for the cubic extension coordinate
## and v³ = ξ
## (c0 + c1 v + c2 v²) v => ξ c2 + c0 v + c1 v²
let t = a.c2
let t {.noInit.} = a.c2
r.c1 = a.c0
r.c2 = a.c1
r.c0.prod(t, NonResidue)
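
The same coordinate rotation written out on a plain array, so the ξ factor picked up by the wrapped-around coefficient is explicit (toy modulus and non-residue values, not the extension-tower types):

const
  q  = 101'u64    # made-up prime
  xi = 5'u64      # made-up non-residue ξ = v³

proc mulByV(c: array[3, uint64]): array[3, uint64] =
  ## (c0 + c1·v + c2·v²) · v with v³ = ξ: rotate coefficients, multiply the wrapped one by ξ.
  [(c[2] * xi) mod q,   # ξ·c2 becomes the new degree-0 coefficient
   c[0],                # c0 moves up to the v coefficient
   c[1]]                # c1 moves up to the v² coefficient

when isMainModule:
  doAssert mulByV([1'u64, 2'u64, 3'u64]) == [15'u64, 1'u64, 2'u64]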

@ -677,7 +677,7 @@ func prod2x*(
## For all curves γ = v with v the factor for cubic extension coordinate
## and v³ = ξ
## (c0 + c1 v + c2 v²) v => ξ c2 + c0 v + c1 v²
let t = a.c2
let t {.noInit.} = a.c2
r.c1 = a.c0
r.c2 = a.c1
r.c0.prod2x(t, NonResidue)

@ -1243,7 +1243,7 @@ func square*(r: var QuadraticExt, a: QuadraticExt) =
when true:
when UseASM_X86_64 and a.c0.mres.limbs.len <= 6 and r.typeof.has1extraBit():
if ({.noSideEffect.}: hasAdx()):
r.coords.sqrx_complex_sparebit_asm_adx_bmi2(a.coords)
r.coords.sqrx_complex_sparebit_asm_adx(a.coords)
else:
r.square_complex(a)
else:

@ -1281,7 +1281,7 @@ func prod*(r: var QuadraticExt, a, b: QuadraticExt) =
else: # faster
when UseASM_X86_64 and a.c0.mres.limbs.len <= 6:
if ({.noSideEffect.}: hasAdx()):
r.coords.mulx_complex_asm_adx_bmi2(a.coords, b.coords)
r.coords.mul_fp2_complex_asm_adx(a.coords, b.coords)
else:
var d {.noInit.}: doublePrec(typeof(r))
d.prod2x_complex(a, b)

@ -1318,7 +1318,7 @@ func prod2x*(r: var QuadraticExt2x, a, b: QuadraticExt) =
when a.fromComplexExtension():
when UseASM_X86_64 and a.c0.mres.limbs.len <= 6:
if ({.noSideEffect.}: hasAdx()):
r.coords.mulx2x_complex_asm_adx_bmi2(a.coords, b.coords)
r.coords.mul2x_fp2_complex_asm_adx(a.coords, b.coords)
else:
r.prod2x_complex(a, b)
else:

@ -1591,7 +1591,7 @@ func prod2xImpl(r: var CubicExt2x, a, b: CubicExt) =
V2.prod2x(a.c2, b.c2)

# r₀ = β ((a₁ + a₂)(b₁ + b₂) - v₁ - v₂) + v₀
when false: # CubicExt.has1extraBit():
when a.c0 is Fp and CubicExt.has1extraBit():
t0.sumUnr(a.c1, a.c2)
t1.sumUnr(b.c1, b.c2)
else:

@ -1604,7 +1604,7 @@ func prod2xImpl(r: var CubicExt2x, a, b: CubicExt) =
r.c0.sum2xMod(r.c0, V0)

# r₁ = (a₀ + a₁) * (b₀ + b₁) - v₀ - v₁ + β v₂
when false: # CubicExt.has1extraBit():
when a.c0 is Fp and CubicExt.has1extraBit():
t0.sumUnr(a.c0, a.c1)
t1.sumUnr(b.c0, b.c1)
else:

@ -1617,7 +1617,7 @@ func prod2xImpl(r: var CubicExt2x, a, b: CubicExt) =
r.c1.sum2xMod(r.c1, r.c2)

# r₂ = (a₀ + a₂) * (b₀ + b₂) - v₀ - v₂ + v₁
when false: # CubicExt.has1extraBit():
when a.c0 is Fp and CubicExt.has1extraBit():
t0.sumUnr(a.c0, a.c2)
t1.sumUnr(b.c0, b.c2)
else:

@ -84,7 +84,11 @@ The optimizations can be of algebraic, algorithmic or "implementation details" nature.
- [ ] NAF recoding
- [ ] windowed-NAF recoding
- [ ] SIMD vectorized select in window algorithm
- [ ] Almost Montgomery Multiplication, https://eprint.iacr.org/2011/239.pdf
- [ ] Montgomery Multiplication with no final substraction,
  - Bos and Montgomery, https://eprint.iacr.org/2017/1057.pdf
  - Colin D Walter, https://colinandmargaret.co.uk/Research/CDW_ELL_99.pdf
  - Hachez and Quisquater, https://link.springer.com/content/pdf/10.1007%2F3-540-44499-8_23.pdf
  - Gueron, https://eprint.iacr.org/2011/239.pdf
- [ ] Pippenger multi-exponentiation (variable-time)
- [ ] parallelized Pippenger
@ -148,7 +148,7 @@ func random_unsafe(rng: var RngState, a: var FF) =

# Note: a simple modulo will be biaised but it's simple and "fast"
reduced.reduce(unreduced, FF.fieldMod())
a.mres.montyResidue(reduced, FF.fieldMod(), FF.getR2modP(), FF.getNegInvModWord(), FF.getSpareBits())
a.mres.getMont(reduced, FF.fieldMod(), FF.getR2modP(), FF.getNegInvModWord(), FF.getSpareBits())

func random_unsafe(rng: var RngState, a: var ExtensionField) =
## Recursively initialize an extension Field element

@ -179,7 +179,7 @@ func random_highHammingWeight(rng: var RngState, a: var FF) =

# Note: a simple modulo will be biaised but it's simple and "fast"
reduced.reduce(unreduced, FF.fieldMod())
a.mres.montyResidue(reduced, FF.fieldMod(), FF.getR2modP(), FF.getNegInvModWord(), FF.getSpareBits())
a.mres.getMont(reduced, FF.fieldMod(), FF.getR2modP(), FF.getNegInvModWord(), FF.getSpareBits())

func random_highHammingWeight(rng: var RngState, a: var ExtensionField) =
## Recursively initialize an extension Field element

@ -224,7 +224,7 @@ func random_long01Seq(rng: var RngState, a: var FF) =

# Note: a simple modulo will be biaised but it's simple and "fast"
reduced.reduce(unreduced, FF.fieldMod())
a.mres.montyResidue(reduced, FF.fieldMod(), FF.getR2modP(), FF.getNegInvModWord(), FF.getSpareBits())
a.mres.getMont(reduced, FF.fieldMod(), FF.getR2modP(), FF.getNegInvModWord(), FF.getSpareBits())

func random_long01Seq(rng: var RngState, a: var ExtensionField) =
## Recursively initialize an extension Field element

@ -8,7 +8,8 @@

import std/unittest,
../constantine/arithmetic,
../constantine/io/io_fields,
../constantine/arithmetic/limbs_montgomery,
../constantine/io/[io_bigints, io_fields],
../constantine/config/curves

static: doAssert defined(testingCurves), "This modules requires the -d:testingCurves compile option"

@ -317,4 +318,54 @@ proc largeField() =

check: bool r.isZero()

test "fromMont doesn't need a final substraction with 256-bit prime (full word used)":
block:
var a: Fp[Secp256k1]
a.mres = Fp[Secp256k1].getMontyPrimeMinus1()
let expected = BigInt[256].fromHex"0xFFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFE FFFFFC2E"

var r: BigInt[256]
r.fromField(a)

check: bool(r == expected)
block:
var a: Fp[Secp256k1]
var d: FpDbl[Secp256k1]

# Set Montgomery repr to the largest field element in Montgomery Residue form
a.mres = BigInt[256].fromHex"0xFFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFE FFFFFC2E"
d.limbs2x = (BigInt[512].fromHex"0xFFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFE FFFFFC2E").limbs

var r, expected: BigInt[256]

r.fromField(a)
expected.limbs.redc2xMont(d.limbs2x, Secp256k1.Mod().limbs, Fp[Secp256k1].getNegInvModWord(), Fp[Secp256k1].getSpareBits())

check: bool(r == expected)

test "fromMont doesn't need a final substraction with 255-bit prime (1 spare bit)":
block:
var a: Fp[Curve25519]
a.mres = Fp[Curve25519].getMontyPrimeMinus1()
let expected = BigInt[255].fromHex"0x7fffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffec"

var r: BigInt[255]
r.fromField(a)

check: bool(r == expected)
block:
var a: Fp[Curve25519]
var d: FpDbl[Curve25519]

# Set Montgomery repr to the largest field element in Montgomery Residue form
a.mres = BigInt[255].fromHex"0x7fffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffec"
d.limbs2x = (BigInt[512].fromHex"0x7fffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffec").limbs

var r, expected: BigInt[255]

r.fromField(a)
expected.limbs.redc2xMont(d.limbs2x, Curve25519.Mod().limbs, Fp[Curve25519].getNegInvModWord(), Fp[Curve25519].getSpareBits())

check: bool(r == expected)

largeField()