mirror of
https://github.com/logos-storage/constantine.git
synced 2026-01-02 13:13:07 +00:00
FpDbl revisited (#144) - 7% perf improvement everywhere, up to 30% in double-width primitives
* reorg mul -> limbs_double_width, ConstantineASM CttASM * Implement squaring specialized scalar path (22% faster than mul) * Implement "portable" assembly for squaring * stash part of the changes * Reorg montgomery reduction - prepare to introduce Comba optimization * Implement comba Montgomery reduce (but it's slower!) * rename t -> a * 30% performance improvement by avoiding toOpenArray! * variable renaming * Fix 32-bit imports * slightly better assembly for sub2x * There is an annoying bottleneck * use out-of-place Fp assembly instead of in-place * diffAlias is unneeded now * cosmetic * speedup fpDbl sub by 20% * Fix Fp2 -> Fp6 -> Fp12 towering. It seems 5% faster * Stash ADCX/ADOX squaring
This commit is contained in:
parent
d12d5faf21
commit
83dcd988b3
@ -42,7 +42,7 @@ generated incorrect add-with-carry code.
|
||||
|
||||
On x86-64, inline assembly is used to workaround compilers having issues optimizing large integer arithmetic,
|
||||
and also ensure constant-time code.
|
||||
This can be deactivated with `"-d:ConstantineASM=false"`:
|
||||
This can be deactivated with `"-d:CttASM=false"`:
|
||||
- at a significant performance cost with GCC (~50% slower than Clang).
|
||||
- at misssed opportunity on recent CPUs that support MULX/ADCX/ADOX instructions (~60% faster than Clang).
|
||||
- There is a 2.4x perf ratio between using plain GCC vs GCC with inline assembly.
|
||||
|
||||
@ -172,6 +172,12 @@ proc mul2xBench*(rLen, aLen, bLen: static int, iters: int) =
|
||||
bench("Multiplication", $rLen & " <- " & $aLen & " x " & $bLen, iters):
|
||||
r.prod(a, b)
|
||||
|
||||
proc square2xBench*(rLen, aLen: static int, iters: int) =
|
||||
var r: BigInt[rLen]
|
||||
let a = rng.random_unsafe(BigInt[aLen])
|
||||
bench("Squaring", $rLen & " <- " & $aLen & "²", iters):
|
||||
r.square(a)
|
||||
|
||||
proc reduce2x*(T: typedesc, iters: int) =
|
||||
var r: T
|
||||
var t: doubleWidth(T)
|
||||
@ -189,6 +195,7 @@ proc main() =
|
||||
diff2x(Fp[BLS12_381], iters = 10_000_000)
|
||||
diff2xNoReduce(Fp[BLS12_381], iters = 10_000_000)
|
||||
mul2xBench(768, 384, 384, iters = 10_000_000)
|
||||
square2xBench(768, 384, iters = 10_000_000)
|
||||
reduce2x(Fp[BLS12_381], iters = 10_000_000)
|
||||
separator()
|
||||
|
||||
|
||||
@ -232,7 +232,7 @@ proc buildBench(benchName: string, compiler = "", useAsm = true, run = false) =
|
||||
if compiler != "":
|
||||
cc = "--cc:" & compiler
|
||||
if not useAsm:
|
||||
cc &= " -d:ConstantineASM=false"
|
||||
cc &= " -d:CttASM=false"
|
||||
exec "nim c " & cc &
|
||||
" -d:danger --verbosity:0 -o:build/bench/" & benchName & "_" & compiler & "_" & (if useAsm: "useASM" else: "noASM") &
|
||||
" --nimcache:nimcache/" & benchName & "_" & compiler & "_" & (if useAsm: "useASM" else: "noASM") &
|
||||
@ -246,7 +246,7 @@ proc runTests(requireGMP: bool, dumpCmdFile = false, test32bit = false, testASM
|
||||
if not(td.useGMP and not requireGMP):
|
||||
var flags = ""
|
||||
if not testASM:
|
||||
flags &= " -d:ConstantineASM=false"
|
||||
flags &= " -d:CttASM=false"
|
||||
if test32bit:
|
||||
flags &= " -d:Constantine32"
|
||||
if td.path in useDebug:
|
||||
|
||||
@ -27,58 +27,67 @@ static: doAssert UseASM_X86_64
|
||||
# Substraction
|
||||
# ------------------------------------------------------------
|
||||
|
||||
macro sub2x_gen[N: static int](a: var Limbs[N], b: Limbs[N], M: Limbs[N div 2]): untyped =
|
||||
macro sub2x_gen[N: static int](R: var Limbs[N], A, B: Limbs[N], m: Limbs[N div 2]): untyped =
|
||||
## Generate an optimized out-of-place double-width substraction kernel
|
||||
|
||||
result = newStmtList()
|
||||
|
||||
var ctx = init(Assembler_x86, BaseType)
|
||||
let
|
||||
N2 = N div 2
|
||||
H = N div 2
|
||||
|
||||
arrA = init(OperandArray, nimSymbol = a, N, PointerInReg, InputOutput)
|
||||
# We reuse the reg used for B for overflow detection
|
||||
arrB = init(OperandArray, nimSymbol = b, N, PointerInReg, InputOutput)
|
||||
# We could force M as immediate by specializing per moduli
|
||||
arrM = init(OperandArray, nimSymbol = M, N, PointerInReg, Input)
|
||||
r = init(OperandArray, nimSymbol = R, N, PointerInReg, InputOutput)
|
||||
# We reuse the reg used for b for overflow detection
|
||||
b = init(OperandArray, nimSymbol = B, N, PointerInReg, InputOutput)
|
||||
# We could force m as immediate by specializing per moduli
|
||||
M = init(OperandArray, nimSymbol = m, N, PointerInReg, Input)
|
||||
# If N is too big, we need to spill registers. TODO.
|
||||
arrT = init(OperandArray, nimSymbol = ident"t", N2, ElemsInReg, Output_EarlyClobber)
|
||||
arrTadd = init(OperandArray, nimSymbol = ident"tadd", N2, ElemsInReg, Output_EarlyClobber)
|
||||
u = init(OperandArray, nimSymbol = ident"U", H, ElemsInReg, InputOutput)
|
||||
v = init(OperandArray, nimSymbol = ident"V", H, ElemsInReg, InputOutput)
|
||||
|
||||
let usym = u.nimSymbol
|
||||
let vsym = v.nimSymbol
|
||||
result.add quote do:
|
||||
var `usym`{.noinit.}, `vsym` {.noInit.}: typeof(`A`)
|
||||
staticFor i, 0, `H`:
|
||||
`usym`[i] = `A`[i]
|
||||
staticFor i, `H`, `N`:
|
||||
`vsym`[i-`H`] = `A`[i]
|
||||
|
||||
# Substraction
|
||||
for i in 0 ..< N:
|
||||
ctx.mov arrT[i mod N2], arrA[i]
|
||||
# u = a[0..<H] - b[0..<H], v = a[H..<N]
|
||||
for i in 0 ..< H:
|
||||
if i == 0:
|
||||
ctx.sub arrT[0], arrB[0]
|
||||
ctx.sub u[0], b[0]
|
||||
else:
|
||||
ctx.sbb arrT[i mod N2], arrB[i]
|
||||
ctx.mov arrA[i], arrT[i mod N2]
|
||||
# Interleaved copy the modulus to hide SBB latencies
|
||||
if i < N2:
|
||||
ctx.mov arrTadd[i], arrM[i]
|
||||
ctx.sbb u[i], b[i]
|
||||
|
||||
# Everything should be hot in cache now so movs are cheaper
|
||||
# we can try using 2 per SBB
|
||||
# v = a[H..<N] - b[H..<N], a[0..<H] = u, u = M
|
||||
for i in H ..< N:
|
||||
ctx.mov r[i-H], u[i-H]
|
||||
ctx.sbb v[i-H], b[i]
|
||||
ctx.mov u[i-H], M[i-H] # TODO, bottleneck 17% perf: prefetch or inline modulus?
|
||||
|
||||
# Mask: underflowed contains 0xFFFF or 0x0000
|
||||
let underflowed = arrB.reuseRegister()
|
||||
let underflowed = b.reuseRegister()
|
||||
ctx.sbb underflowed, underflowed
|
||||
|
||||
# Now mask the adder, with 0 or the modulus limbs
|
||||
for i in 0 ..< N2:
|
||||
ctx.`and` arrTadd[i], underflowed
|
||||
for i in 0 ..< H:
|
||||
ctx.`and` u[i], underflowed
|
||||
|
||||
# Add the masked modulus
|
||||
for i in 0 ..< N2:
|
||||
for i in 0 ..< H:
|
||||
if i == 0:
|
||||
ctx.add arrT[0], arrTadd[0]
|
||||
ctx.add u[0], v[0]
|
||||
else:
|
||||
ctx.adc arrT[i], arrTadd[i]
|
||||
ctx.mov arrA[i+N2], arrT[i]
|
||||
ctx.adc u[i], v[i]
|
||||
ctx.mov r[i+H], u[i]
|
||||
|
||||
let t = arrT.nimSymbol
|
||||
let tadd = arrTadd.nimSymbol
|
||||
result.add quote do:
|
||||
var `t`{.noinit.}, `tadd` {.noInit.}: typeof(`a`)
|
||||
result.add ctx.generate
|
||||
|
||||
func sub2x_asm*[N: static int](a: var Limbs[N], b: Limbs[N], M: Limbs[N div 2]) =
|
||||
func sub2x_asm*[N: static int](r: var Limbs[N], a, b: Limbs[N], M: Limbs[N div 2]) =
|
||||
## Constant-time double-width substraction
|
||||
sub2x_gen(a, b, M)
|
||||
sub2x_gen(r, a, b, M)
|
||||
|
||||
@ -31,10 +31,10 @@ static: doAssert UseASM_X86_64
|
||||
# Field addition
|
||||
# ------------------------------------------------------------
|
||||
|
||||
macro addmod_gen[N: static int](a: var Limbs[N], b, M: Limbs[N]): untyped =
|
||||
macro addmod_gen[N: static int](R: var Limbs[N], A, B, m: Limbs[N]): untyped =
|
||||
## Generate an optimized modular addition kernel
|
||||
# Register pressure note:
|
||||
# We could generate a kernel per modulus M by hardocing it as immediate
|
||||
# We could generate a kernel per modulus m by hardcoding it as immediate
|
||||
# however this requires
|
||||
# - duplicating the kernel and also
|
||||
# - 64-bit immediate encoding is quite large
|
||||
@ -43,64 +43,66 @@ macro addmod_gen[N: static int](a: var Limbs[N], b, M: Limbs[N]): untyped =
|
||||
|
||||
var ctx = init(Assembler_x86, BaseType)
|
||||
let
|
||||
arrA = init(OperandArray, nimSymbol = a, N, PointerInReg, InputOutput)
|
||||
# We reuse the reg used for B for overflow detection
|
||||
arrB = init(OperandArray, nimSymbol = b, N, PointerInReg, InputOutput)
|
||||
# We could force M as immediate by specializing per moduli
|
||||
arrM = init(OperandArray, nimSymbol = M, N, PointerInReg, Input)
|
||||
r = init(OperandArray, nimSymbol = R, N, PointerInReg, InputOutput)
|
||||
# We reuse the reg used for b for overflow detection
|
||||
b = init(OperandArray, nimSymbol = B, N, PointerInReg, InputOutput)
|
||||
# We could force m as immediate by specializing per moduli
|
||||
M = init(OperandArray, nimSymbol = m, N, PointerInReg, Input)
|
||||
# If N is too big, we need to spill registers. TODO.
|
||||
arrT = init(OperandArray, nimSymbol = ident"t", N, ElemsInReg, Output_EarlyClobber)
|
||||
arrTsub = init(OperandArray, nimSymbol = ident"tsub", N, ElemsInReg, Output_EarlyClobber)
|
||||
u = init(OperandArray, nimSymbol = ident"u", N, ElemsInReg, InputOutput)
|
||||
v = init(OperandArray, nimSymbol = ident"v", N, ElemsInReg, Output_EarlyClobber)
|
||||
|
||||
let usym = u.nimSymbol
|
||||
let vsym = v.nimSymbol
|
||||
result.add quote do:
|
||||
var `usym`{.noinit.}, `vsym` {.noInit.}: typeof(`A`)
|
||||
staticFor i, 0, `N`:
|
||||
`usym`[i] = `A`[i]
|
||||
|
||||
# Addition
|
||||
for i in 0 ..< N:
|
||||
ctx.mov arrT[i], arrA[i]
|
||||
if i == 0:
|
||||
ctx.add arrT[0], arrB[0]
|
||||
ctx.add u[0], b[0]
|
||||
else:
|
||||
ctx.adc arrT[i], arrB[i]
|
||||
ctx.adc u[i], b[i]
|
||||
# Interleaved copy in a second buffer as well
|
||||
ctx.mov arrTsub[i], arrT[i]
|
||||
ctx.mov v[i], u[i]
|
||||
|
||||
# Mask: overflowed contains 0xFFFF or 0x0000
|
||||
# TODO: unnecessary if MSB never set, i.e. "canUseNoCarryMontyMul"
|
||||
let overflowed = arrB.reuseRegister()
|
||||
let overflowed = b.reuseRegister()
|
||||
ctx.sbb overflowed, overflowed
|
||||
|
||||
# Now substract the modulus
|
||||
for i in 0 ..< N:
|
||||
if i == 0:
|
||||
ctx.sub arrTsub[0], arrM[0]
|
||||
ctx.sub v[0], M[0]
|
||||
else:
|
||||
ctx.sbb arrTsub[i], arrM[i]
|
||||
ctx.sbb v[i], M[i]
|
||||
|
||||
# If it overflows here, it means that it was
|
||||
# smaller than the modulus and we don't need arrTsub
|
||||
# smaller than the modulus and we don'u need V
|
||||
ctx.sbb overflowed, 0
|
||||
|
||||
# Conditional Mov and
|
||||
# and store result
|
||||
for i in 0 ..< N:
|
||||
ctx.cmovnc arrT[i], arrTsub[i]
|
||||
ctx.mov arrA[i], arrT[i]
|
||||
ctx.cmovnc u[i], v[i]
|
||||
ctx.mov r[i], u[i]
|
||||
|
||||
let t = arrT.nimSymbol
|
||||
let tsub = arrTsub.nimSymbol
|
||||
result.add quote do:
|
||||
var `t`{.noinit.}, `tsub` {.noInit.}: typeof(`a`)
|
||||
result.add ctx.generate
|
||||
|
||||
func addmod_asm*(a: var Limbs, b, M: Limbs) =
|
||||
func addmod_asm*(r: var Limbs, a, b, m: Limbs) =
|
||||
## Constant-time modular addition
|
||||
addmod_gen(a, b, M)
|
||||
addmod_gen(r, a, b, m)
|
||||
|
||||
# Field substraction
|
||||
# ------------------------------------------------------------
|
||||
|
||||
macro submod_gen[N: static int](a: var Limbs[N], b, M: Limbs[N]): untyped =
|
||||
macro submod_gen[N: static int](R: var Limbs[N], A, B, m: Limbs[N]): untyped =
|
||||
## Generate an optimized modular addition kernel
|
||||
# Register pressure note:
|
||||
# We could generate a kernel per modulus M by hardocing it as immediate
|
||||
# We could generate a kernel per modulus m by hardocing it as immediate
|
||||
# however this requires
|
||||
# - duplicating the kernel and also
|
||||
# - 64-bit immediate encoding is quite large
|
||||
@ -109,119 +111,121 @@ macro submod_gen[N: static int](a: var Limbs[N], b, M: Limbs[N]): untyped =
|
||||
|
||||
var ctx = init(Assembler_x86, BaseType)
|
||||
let
|
||||
arrA = init(OperandArray, nimSymbol = a, N, PointerInReg, InputOutput)
|
||||
# We reuse the reg used for B for overflow detection
|
||||
arrB = init(OperandArray, nimSymbol = b, N, PointerInReg, InputOutput)
|
||||
# We could force M as immediate by specializing per moduli
|
||||
arrM = init(OperandArray, nimSymbol = M, N, PointerInReg, Input)
|
||||
r = init(OperandArray, nimSymbol = R, N, PointerInReg, InputOutput)
|
||||
# We reuse the reg used for b for overflow detection
|
||||
b = init(OperandArray, nimSymbol = B, N, PointerInReg, InputOutput)
|
||||
# We could force m as immediate by specializing per moduli
|
||||
M = init(OperandArray, nimSymbol = m, N, PointerInReg, Input)
|
||||
# If N is too big, we need to spill registers. TODO.
|
||||
arrT = init(OperandArray, nimSymbol = ident"t", N, ElemsInReg, Output_EarlyClobber)
|
||||
arrTadd = init(OperandArray, nimSymbol = ident"tadd", N, ElemsInReg, Output_EarlyClobber)
|
||||
u = init(OperandArray, nimSymbol = ident"U", N, ElemsInReg, InputOutput)
|
||||
v = init(OperandArray, nimSymbol = ident"V", N, ElemsInReg, Output_EarlyClobber)
|
||||
|
||||
let usym = u.nimSymbol
|
||||
let vsym = v.nimSymbol
|
||||
result.add quote do:
|
||||
var `usym`{.noinit.}, `vsym` {.noInit.}: typeof(`A`)
|
||||
staticFor i, 0, `N`:
|
||||
`usym`[i] = `A`[i]
|
||||
|
||||
# Substraction
|
||||
for i in 0 ..< N:
|
||||
ctx.mov arrT[i], arrA[i]
|
||||
if i == 0:
|
||||
ctx.sub arrT[0], arrB[0]
|
||||
ctx.sub u[0], b[0]
|
||||
else:
|
||||
ctx.sbb arrT[i], arrB[i]
|
||||
ctx.sbb u[i], b[i]
|
||||
# Interleaved copy the modulus to hide SBB latencies
|
||||
ctx.mov arrTadd[i], arrM[i]
|
||||
ctx.mov v[i], M[i]
|
||||
|
||||
# Mask: underflowed contains 0xFFFF or 0x0000
|
||||
let underflowed = arrB.reuseRegister()
|
||||
let underflowed = b.reuseRegister()
|
||||
ctx.sbb underflowed, underflowed
|
||||
|
||||
# Now mask the adder, with 0 or the modulus limbs
|
||||
for i in 0 ..< N:
|
||||
ctx.`and` arrTadd[i], underflowed
|
||||
ctx.`and` v[i], underflowed
|
||||
|
||||
# Add the masked modulus
|
||||
for i in 0 ..< N:
|
||||
if i == 0:
|
||||
ctx.add arrT[0], arrTadd[0]
|
||||
ctx.add u[0], v[0]
|
||||
else:
|
||||
ctx.adc arrT[i], arrTadd[i]
|
||||
ctx.mov arrA[i], arrT[i]
|
||||
ctx.adc u[i], v[i]
|
||||
ctx.mov r[i], u[i]
|
||||
|
||||
let t = arrT.nimSymbol
|
||||
let tadd = arrTadd.nimSymbol
|
||||
result.add quote do:
|
||||
var `t`{.noinit.}, `tadd` {.noInit.}: typeof(`a`)
|
||||
result.add ctx.generate
|
||||
|
||||
func submod_asm*(a: var Limbs, b, M: Limbs) =
|
||||
func submod_asm*(r: var Limbs, a, b, M: Limbs) =
|
||||
## Constant-time modular substraction
|
||||
## Warning, does not handle aliasing of a and b
|
||||
submod_gen(a, b, M)
|
||||
submod_gen(r, a, b, M)
|
||||
|
||||
# Field negation
|
||||
# ------------------------------------------------------------
|
||||
|
||||
macro negmod_gen[N: static int](r: var Limbs[N], a, M: Limbs[N]): untyped =
|
||||
macro negmod_gen[N: static int](R: var Limbs[N], A, m: Limbs[N]): untyped =
|
||||
## Generate an optimized modular negation kernel
|
||||
|
||||
result = newStmtList()
|
||||
|
||||
var ctx = init(Assembler_x86, BaseType)
|
||||
let
|
||||
arrA = init(OperandArray, nimSymbol = a, N, PointerInReg, Input)
|
||||
arrR = init(OperandArray, nimSymbol = r, N, PointerInReg, InputOutput)
|
||||
arrT = init(OperandArray, nimSymbol = ident"t", N, ElemsInReg, Output_EarlyClobber)
|
||||
# We could force M as immediate by specializing per moduli
|
||||
# We reuse the reg used for M for overflow detection
|
||||
arrM = init(OperandArray, nimSymbol = M, N, PointerInReg, InputOutput)
|
||||
a = init(OperandArray, nimSymbol = A, N, PointerInReg, Input)
|
||||
r = init(OperandArray, nimSymbol = R, N, PointerInReg, InputOutput)
|
||||
u = init(OperandArray, nimSymbol = ident"U", N, ElemsInReg, Output_EarlyClobber)
|
||||
# We could force m as immediate by specializing per moduli
|
||||
# We reuse the reg used for m for overflow detection
|
||||
M = init(OperandArray, nimSymbol = m, N, PointerInReg, InputOutput)
|
||||
|
||||
# Substraction M - a
|
||||
# Substraction m - a
|
||||
for i in 0 ..< N:
|
||||
ctx.mov arrT[i], arrM[i]
|
||||
ctx.mov u[i], M[i]
|
||||
if i == 0:
|
||||
ctx.sub arrT[0], arrA[0]
|
||||
ctx.sub u[0], a[0]
|
||||
else:
|
||||
ctx.sbb arrT[i], arrA[i]
|
||||
ctx.sbb u[i], a[i]
|
||||
|
||||
# Deal with a == 0
|
||||
let isZero = arrM.reuseRegister()
|
||||
ctx.mov isZero, arrA[0]
|
||||
let isZero = M.reuseRegister()
|
||||
ctx.mov isZero, a[0]
|
||||
for i in 1 ..< N:
|
||||
ctx.`or` isZero, arrA[i]
|
||||
ctx.`or` isZero, a[i]
|
||||
|
||||
# Zero result if a == 0
|
||||
for i in 0 ..< N:
|
||||
ctx.cmovz arrT[i], isZero
|
||||
ctx.mov arrR[i], arrT[i]
|
||||
ctx.cmovz u[i], isZero
|
||||
ctx.mov r[i], u[i]
|
||||
|
||||
let t = arrT.nimSymbol
|
||||
let usym = u.nimSymbol
|
||||
result.add quote do:
|
||||
var `t`{.noinit.}: typeof(`a`)
|
||||
var `usym`{.noinit.}: typeof(`A`)
|
||||
result.add ctx.generate
|
||||
|
||||
func negmod_asm*(r: var Limbs, a, M: Limbs) =
|
||||
func negmod_asm*(r: var Limbs, a, m: Limbs) =
|
||||
## Constant-time modular negation
|
||||
negmod_gen(r, a, M)
|
||||
negmod_gen(r, a, m)
|
||||
|
||||
# Sanity checks
|
||||
# ----------------------------------------------------------
|
||||
|
||||
when isMainModule:
|
||||
import ../config/type_bigint, algorithm, strutils
|
||||
import ../../config/type_bigint, algorithm, strutils
|
||||
|
||||
proc mainAdd() =
|
||||
var a = [SecretWord 0xE3DF60E8F6D0AF9A'u64, SecretWord 0x7B2665C2258A7625'u64, SecretWord 0x68FC9A1D0977C8E0'u64, SecretWord 0xF3DC61ED7DE76883'u64]
|
||||
var b = [SecretWord 0x78E9C2EF58BB6B78'u64, SecretWord 0x547F65BD19014254'u64, SecretWord 0x556A115819EAD4B5'u64, SecretWord 0x8CA844A546935DC3'u64]
|
||||
var M = [SecretWord 0xFFFFFFFF00000001'u64, SecretWord 0x0000000000000000'u64, SecretWord 0x00000000FFFFFFFF'u64, SecretWord 0xFFFFFFFFFFFFFFFF'u64]
|
||||
var m = [SecretWord 0xFFFFFFFF00000001'u64, SecretWord 0x0000000000000000'u64, SecretWord 0x00000000FFFFFFFF'u64, SecretWord 0xFFFFFFFFFFFFFFFF'u64]
|
||||
var s = "0x5cc923d94f8c1b11cfa5cb7f3e8bb879be66ab7423629d968084a692c47ac647"
|
||||
|
||||
a.reverse()
|
||||
b.reverse()
|
||||
M.reverse()
|
||||
m.reverse()
|
||||
|
||||
debugecho "--------------------------------"
|
||||
debugecho "before:"
|
||||
debugecho " a: ", a.toHex()
|
||||
debugecho " b: ", b.toHex()
|
||||
debugecho " m: ", M.toHex()
|
||||
addmod_asm(a, b, M)
|
||||
debugecho " m: ", m.toHex()
|
||||
addmod_asm(a, a, b, m)
|
||||
debugecho "after:"
|
||||
debugecho " a: ", a.toHex().tolower
|
||||
debugecho " s: ", s
|
||||
@ -229,19 +233,19 @@ when isMainModule:
|
||||
|
||||
a = [SecretWord 0x00935a991ca215a6'u64, SecretWord 0x5fbdac6294679337'u64, SecretWord 0x1e41793877b80f12'u64, SecretWord 0x5724cd93cb32932d'u64]
|
||||
b = [SecretWord 0x19dd4ecfda64ef80'u64, SecretWord 0x92deeb1532169c3d'u64, SecretWord 0x69ce4ee28421cd30'u64, SecretWord 0x4d90ab5a40295321'u64]
|
||||
M = [SecretWord 0x2523648240000001'u64, SecretWord 0xba344d8000000008'u64, SecretWord 0x6121000000000013'u64, SecretWord 0xa700000000000013'u64]
|
||||
m = [SecretWord 0x2523648240000001'u64, SecretWord 0xba344d8000000008'u64, SecretWord 0x6121000000000013'u64, SecretWord 0xa700000000000013'u64]
|
||||
s = "0x1a70a968f7070526f29c9777c67e2f74880fc81afbd9dc42a4b578ee0b5be64e"
|
||||
|
||||
a.reverse()
|
||||
b.reverse()
|
||||
M.reverse()
|
||||
m.reverse()
|
||||
|
||||
debugecho "--------------------------------"
|
||||
debugecho "before:"
|
||||
debugecho " a: ", a.toHex()
|
||||
debugecho " b: ", b.toHex()
|
||||
debugecho " m: ", M.toHex()
|
||||
addmod_asm(a, b, M)
|
||||
debugecho " m: ", m.toHex()
|
||||
addmod_asm(a, a, b, m)
|
||||
debugecho "after:"
|
||||
debugecho " a: ", a.toHex().tolower
|
||||
debugecho " s: ", s
|
||||
@ -249,19 +253,19 @@ when isMainModule:
|
||||
|
||||
a = [SecretWord 0x1c7d810f37fc6e0b'u64, SecretWord 0xb91aba4ce339cea3'u64, SecretWord 0xd9f5571ccc4dfd1a'u64, SecretWord 0xf5906ee9df91f554'u64]
|
||||
b = [SecretWord 0x18394ffe94874c9f'u64, SecretWord 0x6e8a8ad032fc5f15'u64, SecretWord 0x7533a2b46b7e9530'u64, SecretWord 0x2849996b4bb61b48'u64]
|
||||
M = [SecretWord 0x2523648240000001'u64, SecretWord 0xba344d8000000008'u64, SecretWord 0x6121000000000013'u64, SecretWord 0xa700000000000013'u64]
|
||||
m = [SecretWord 0x2523648240000001'u64, SecretWord 0xba344d8000000008'u64, SecretWord 0x6121000000000013'u64, SecretWord 0xa700000000000013'u64]
|
||||
s = "0x0f936c8b8c83baa96d70f79d16362db0ee07f9d137cc923776da08552b481089"
|
||||
|
||||
a.reverse()
|
||||
b.reverse()
|
||||
M.reverse()
|
||||
m.reverse()
|
||||
|
||||
debugecho "--------------------------"
|
||||
debugecho "before:"
|
||||
debugecho " a: ", a.toHex()
|
||||
debugecho " b: ", b.toHex()
|
||||
debugecho " m: ", M.toHex()
|
||||
addmod_asm(a, b, M)
|
||||
debugecho " m: ", m.toHex()
|
||||
addmod_asm(a, a, b, m)
|
||||
debugecho "after:"
|
||||
debugecho " a: ", a.toHex().tolower
|
||||
debugecho " s: ", s
|
||||
@ -269,19 +273,19 @@ when isMainModule:
|
||||
|
||||
a = [SecretWord 0xe9d55643'u64, SecretWord 0x580ec4cc3f91cef3'u64, SecretWord 0x11ecbb7d35b36449'u64, SecretWord 0x35535ca31c5dc2ba'u64]
|
||||
b = [SecretWord 0x97f7ed94'u64, SecretWord 0xbad96eb98204a622'u64, SecretWord 0xbba94400f9a061d6'u64, SecretWord 0x60d3521a0d3dd9eb'u64]
|
||||
M = [SecretWord 0xffffffff'u64, SecretWord 0xffffffffffffffff'u64, SecretWord 0xffffffff00000000'u64, SecretWord 0x0000000000000001'u64]
|
||||
m = [SecretWord 0xffffffff'u64, SecretWord 0xffffffffffffffff'u64, SecretWord 0xffffffff00000000'u64, SecretWord 0x0000000000000001'u64]
|
||||
s = "0x0000000081cd43d812e83385c1967515cd95ff7f2f53c61f9626aebd299b9ca4"
|
||||
|
||||
a.reverse()
|
||||
b.reverse()
|
||||
M.reverse()
|
||||
m.reverse()
|
||||
|
||||
debugecho "--------------------------"
|
||||
debugecho "before:"
|
||||
debugecho " a: ", a.toHex()
|
||||
debugecho " b: ", b.toHex()
|
||||
debugecho " m: ", M.toHex()
|
||||
addmod_asm(a, b, M)
|
||||
debugecho " m: ", m.toHex()
|
||||
addmod_asm(a, a, b, m)
|
||||
debugecho "after:"
|
||||
debugecho " a: ", a.toHex().tolower
|
||||
debugecho " s: ", s
|
||||
@ -292,22 +296,47 @@ when isMainModule:
|
||||
proc mainSub() =
|
||||
var a = [SecretWord 0xf9c32e89b80b17bd'u64, SecretWord 0xdbd3069d4ca0e1c3'u64, SecretWord 0x980d4c70d39d5e17'u64, SecretWord 0xd9f0252845f18c3a'u64]
|
||||
var b = [SecretWord 0x215075604bfd64de'u64, SecretWord 0x36dc488149fc5d3e'u64, SecretWord 0x91fff665385d20fd'u64, SecretWord 0xe980a5a203b43179'u64]
|
||||
var M = [SecretWord 0xFFFFFFFFFFFFFFFF'u64, SecretWord 0xFFFFFFFFFFFFFFFF'u64, SecretWord 0xFFFFFFFFFFFFFFFF'u64, SecretWord 0xFFFFFFFEFFFFFC2F'u64]
|
||||
var m = [SecretWord 0xFFFFFFFFFFFFFFFF'u64, SecretWord 0xFFFFFFFFFFFFFFFF'u64, SecretWord 0xFFFFFFFFFFFFFFFF'u64, SecretWord 0xFFFFFFFEFFFFFC2F'u64]
|
||||
var s = "0xd872b9296c0db2dfa4f6be1c02a48485060d560b9b403d19f06f7f86423d5ac1"
|
||||
|
||||
a.reverse()
|
||||
b.reverse()
|
||||
M.reverse()
|
||||
m.reverse()
|
||||
|
||||
debugecho "--------------------------------"
|
||||
debugecho "before:"
|
||||
debugecho " a: ", a.toHex()
|
||||
debugecho " b: ", b.toHex()
|
||||
debugecho " m: ", M.toHex()
|
||||
submod_asm(a, b, M)
|
||||
debugecho " m: ", m.toHex()
|
||||
submod_asm(a, a, b, m)
|
||||
debugecho "after:"
|
||||
debugecho " a: ", a.toHex().tolower
|
||||
debugecho " s: ", s
|
||||
debugecho " ok: ", a.toHex().tolower == s
|
||||
|
||||
mainSub()
|
||||
|
||||
proc mainSubOutplace() =
|
||||
var a = [SecretWord 0xf9c32e89b80b17bd'u64, SecretWord 0xdbd3069d4ca0e1c3'u64, SecretWord 0x980d4c70d39d5e17'u64, SecretWord 0xd9f0252845f18c3a'u64]
|
||||
var b = [SecretWord 0x215075604bfd64de'u64, SecretWord 0x36dc488149fc5d3e'u64, SecretWord 0x91fff665385d20fd'u64, SecretWord 0xe980a5a203b43179'u64]
|
||||
var m = [SecretWord 0xFFFFFFFFFFFFFFFF'u64, SecretWord 0xFFFFFFFFFFFFFFFF'u64, SecretWord 0xFFFFFFFFFFFFFFFF'u64, SecretWord 0xFFFFFFFEFFFFFC2F'u64]
|
||||
var s = "0xd872b9296c0db2dfa4f6be1c02a48485060d560b9b403d19f06f7f86423d5ac1"
|
||||
|
||||
a.reverse()
|
||||
b.reverse()
|
||||
m.reverse()
|
||||
|
||||
var r: typeof(a)
|
||||
|
||||
debugecho "--------------------------------"
|
||||
debugecho "before:"
|
||||
debugecho " a: ", a.toHex()
|
||||
debugecho " b: ", b.toHex()
|
||||
debugecho " m: ", m.toHex()
|
||||
submod_asm(r, a, b, m)
|
||||
debugecho "after:"
|
||||
debugecho " r: ", r.toHex().tolower
|
||||
debugecho " s: ", s
|
||||
debugecho " ok: ", r.toHex().tolower == s
|
||||
|
||||
mainSubOutplace()
|
||||
|
||||
@ -28,31 +28,31 @@ static: doAssert UseASM_X86_32
|
||||
proc finalSubNoCarry*(
|
||||
ctx: var Assembler_x86,
|
||||
r: Operand or OperandArray,
|
||||
t, M, scratch: OperandArray
|
||||
a, M, scratch: OperandArray
|
||||
) =
|
||||
## Reduce `t` into `r` modulo `M`
|
||||
## Reduce `a` into `r` modulo `M`
|
||||
let N = M.len
|
||||
ctx.comment "Final substraction (no carry)"
|
||||
for i in 0 ..< N:
|
||||
ctx.mov scratch[i], t[i]
|
||||
ctx.mov scratch[i], a[i]
|
||||
if i == 0:
|
||||
ctx.sub scratch[i], M[i]
|
||||
else:
|
||||
ctx.sbb scratch[i], M[i]
|
||||
|
||||
# If we borrowed it means that we were smaller than
|
||||
# the modulus and we don't need "scratch"
|
||||
# the modulus and we don'a need "scratch"
|
||||
for i in 0 ..< N:
|
||||
ctx.cmovnc t[i], scratch[i]
|
||||
ctx.mov r[i], t[i]
|
||||
ctx.cmovnc a[i], scratch[i]
|
||||
ctx.mov r[i], a[i]
|
||||
|
||||
proc finalSubCanOverflow*(
|
||||
ctx: var Assembler_x86,
|
||||
r: Operand or OperandArray,
|
||||
t, M, scratch: OperandArray,
|
||||
a, M, scratch: OperandArray,
|
||||
overflowReg: Operand
|
||||
) =
|
||||
## Reduce `t` into `r` modulo `M`
|
||||
## Reduce `a` into `r` modulo `M`
|
||||
## To be used when the final substraction can
|
||||
## also depend on the carry flag
|
||||
## This is in particular possible when the MSB
|
||||
@ -65,7 +65,7 @@ proc finalSubCanOverflow*(
|
||||
let N = M.len
|
||||
ctx.comment "Final substraction (may carry)"
|
||||
for i in 0 ..< N:
|
||||
ctx.mov scratch[i], t[i]
|
||||
ctx.mov scratch[i], a[i]
|
||||
if i == 0:
|
||||
ctx.sub scratch[i], M[i]
|
||||
else:
|
||||
@ -74,10 +74,10 @@ proc finalSubCanOverflow*(
|
||||
ctx.sbb overflowReg, 0
|
||||
|
||||
# If we borrowed it means that we were smaller than
|
||||
# the modulus and we don't need "scratch"
|
||||
# the modulus and we don'a need "scratch"
|
||||
for i in 0 ..< N:
|
||||
ctx.cmovnc t[i], scratch[i]
|
||||
ctx.mov r[i], t[i]
|
||||
ctx.cmovnc a[i], scratch[i]
|
||||
ctx.mov r[i], a[i]
|
||||
|
||||
|
||||
# Montgomery reduction
|
||||
@ -85,7 +85,7 @@ proc finalSubCanOverflow*(
|
||||
|
||||
macro montyRed_gen[N: static int](
|
||||
r_MR: var array[N, SecretWord],
|
||||
t_MR: array[N*2, SecretWord],
|
||||
a_MR: array[N*2, SecretWord],
|
||||
M_MR: array[N, SecretWord],
|
||||
m0ninv_MR: BaseType,
|
||||
canUseNoCarryMontyMul: static bool
|
||||
@ -148,13 +148,13 @@ macro montyRed_gen[N: static int](
|
||||
# ---------------------------------------------------------
|
||||
# for i in 0 .. n-1:
|
||||
# hi <- 0
|
||||
# m <- t[i] * m0ninv mod 2^w (i.e. simple multiplication)
|
||||
# m <- a[i] * m0ninv mod 2^w (i.e. simple multiplication)
|
||||
# for j in 0 .. n-1:
|
||||
# (hi, lo) <- t[i+j] + m * M[j] + hi
|
||||
# t[i+j] <- lo
|
||||
# t[i+n] += hi
|
||||
# (hi, lo) <- a[i+j] + m * M[j] + hi
|
||||
# a[i+j] <- lo
|
||||
# a[i+n] += hi
|
||||
# for i in 0 .. n-1:
|
||||
# r[i] = t[i+n]
|
||||
# r[i] = a[i+n]
|
||||
# if r >= M:
|
||||
# r -= M
|
||||
|
||||
@ -162,14 +162,15 @@ macro montyRed_gen[N: static int](
|
||||
doAssert N <= 6, "The Assembly-optimized montgomery multiplication requires at most 6 limbs."
|
||||
|
||||
result.add quote do:
|
||||
`eax` = BaseType `t_MR`[0]
|
||||
`scratchSym`[1 .. `N`-1] = `t_MR`.toOpenArray(1, `N`-1)
|
||||
`eax` = BaseType `a_MR`[0]
|
||||
staticFor i, 0, `N`: # Do NOT use Nim slice/toOpenArray, they are not inlined
|
||||
`scratchSym`[i] = `a_MR`[i]
|
||||
|
||||
ctx.mov scratch[N], rRAX
|
||||
ctx.imul rRAX, m0ninv # m <- t[i] * m0ninv mod 2^w
|
||||
ctx.imul rRAX, m0ninv # m <- a[i] * m0ninv mod 2^w
|
||||
ctx.mov scratch[0], rRAX
|
||||
|
||||
# scratch: [t[0] * m0, t[1], t[2], t[3], t[0]] for 4 limbs
|
||||
# scratch: [a[0] * m0, a[1], a[2], a[3], a[0]] for 4 limbs
|
||||
|
||||
for i in 0 ..< N:
|
||||
ctx.comment ""
|
||||
@ -217,23 +218,23 @@ macro montyRed_gen[N: static int](
|
||||
ctx = init(Assembler_x86, BaseType)
|
||||
|
||||
let r = init(OperandArray, nimSymbol = r_MR, N, PointerInReg, InputOutput_EnsureClobber)
|
||||
let t = init(OperandArray, nimSymbol = t_MR, N*2, PointerInReg, Input)
|
||||
let a = init(OperandArray, nimSymbol = a_MR, N*2, PointerInReg, Input)
|
||||
let extraRegNeeded = N-2
|
||||
let tsub = init(OperandArray, nimSymbol = ident"tsub", extraRegNeeded, ElemsInReg, InputOutput_EnsureClobber)
|
||||
let tsubsym = tsub.nimSymbol
|
||||
let t = init(OperandArray, nimSymbol = ident"t", extraRegNeeded, ElemsInReg, InputOutput_EnsureClobber)
|
||||
let tsym = t.nimSymbol
|
||||
result.add quote do:
|
||||
var `tsubsym` {.noInit.}: Limbs[`extraRegNeeded`]
|
||||
var `tsym` {.noInit.}: Limbs[`extraRegNeeded`]
|
||||
|
||||
# This does t[i+n] += hi
|
||||
# This does a[i+n] += hi
|
||||
# but in a separate carry chain, fused with the
|
||||
# copy "r[i] = t[i+n]"
|
||||
# copy "r[i] = a[i+n]"
|
||||
for i in 0 ..< N:
|
||||
if i == 0:
|
||||
ctx.add scratch[i], t[i+N]
|
||||
ctx.add scratch[i], a[i+N]
|
||||
else:
|
||||
ctx.adc scratch[i], t[i+N]
|
||||
ctx.adc scratch[i], a[i+N]
|
||||
|
||||
let reuse = repackRegisters(tsub, scratch[N], scratch[N+1])
|
||||
let reuse = repackRegisters(t, scratch[N], scratch[N+1])
|
||||
|
||||
if canUseNoCarryMontyMul:
|
||||
ctx.finalSubNoCarry(r, scratch, M, reuse)
|
||||
@ -245,10 +246,10 @@ macro montyRed_gen[N: static int](
|
||||
|
||||
func montRed_asm*[N: static int](
|
||||
r: var array[N, SecretWord],
|
||||
t: array[N*2, SecretWord],
|
||||
a: array[N*2, SecretWord],
|
||||
M: array[N, SecretWord],
|
||||
m0ninv: BaseType,
|
||||
canUseNoCarryMontyMul: static bool
|
||||
) =
|
||||
## Constant-time Montgomery reduction
|
||||
montyRed_gen(r, t, M, m0ninv, canUseNoCarryMontyMul)
|
||||
montyRed_gen(r, a, M, m0ninv, canUseNoCarryMontyMul)
|
||||
|
||||
@ -37,7 +37,7 @@ static: doAssert UseASM_X86_64
|
||||
|
||||
macro montyRedx_gen[N: static int](
|
||||
r_MR: var array[N, SecretWord],
|
||||
t_MR: array[N*2, SecretWord],
|
||||
a_MR: array[N*2, SecretWord],
|
||||
M_MR: array[N, SecretWord],
|
||||
m0ninv_MR: BaseType,
|
||||
canUseNoCarryMontyMul: static bool
|
||||
@ -109,13 +109,13 @@ macro montyRedx_gen[N: static int](
|
||||
# ---------------------------------------------------------
|
||||
# for i in 0 .. n-1:
|
||||
# hi <- 0
|
||||
# m <- t[i] * m0ninv mod 2^w (i.e. simple multiplication)
|
||||
# m <- a[i] * m0ninv mod 2^w (i.e. simple multiplication)
|
||||
# for j in 0 .. n-1:
|
||||
# (hi, lo) <- t[i+j] + m * M[j] + hi
|
||||
# t[i+j] <- lo
|
||||
# t[i+n] += hi
|
||||
# (hi, lo) <- a[i+j] + m * M[j] + hi
|
||||
# a[i+j] <- lo
|
||||
# a[i+n] += hi
|
||||
# for i in 0 .. n-1:
|
||||
# r[i] = t[i+n]
|
||||
# r[i] = a[i+n]
|
||||
# if r >= M:
|
||||
# r -= M
|
||||
|
||||
@ -124,12 +124,13 @@ macro montyRedx_gen[N: static int](
|
||||
|
||||
result.add quote do:
|
||||
`edx` = BaseType(`m0ninv_MR`)
|
||||
`scratchSym`[0 .. `N`-1] = `t_MR`.toOpenArray(0, `N`-1)
|
||||
staticFor i, 0, `N`: # Do NOT use Nim slice/toOpenArray, they are not inlined
|
||||
`scratchSym`[i] = `a_MR`[i]
|
||||
|
||||
for i in 0 ..< N:
|
||||
# RDX contains m0ninv at the start of each loop
|
||||
ctx.comment ""
|
||||
ctx.imul rRDX, scratch[0] # m <- t[i] * m0ninv mod 2^w
|
||||
ctx.imul rRDX, scratch[0] # m <- a[i] * m0ninv mod 2^w
|
||||
ctx.comment "---- Reduction " & $i
|
||||
ctx.`xor` scratch[N], scratch[N]
|
||||
|
||||
@ -156,23 +157,23 @@ macro montyRedx_gen[N: static int](
|
||||
ctx = init(Assembler_x86, BaseType)
|
||||
|
||||
let r = init(OperandArray, nimSymbol = r_MR, N, PointerInReg, InputOutput_EnsureClobber)
|
||||
let t = init(OperandArray, nimSymbol = t_MR, N*2, PointerInReg, Input)
|
||||
let a = init(OperandArray, nimSymbol = a_MR, N*2, PointerInReg, Input)
|
||||
let extraRegNeeded = N-1
|
||||
let tsub = init(OperandArray, nimSymbol = ident"tsub", extraRegNeeded, ElemsInReg, InputOutput_EnsureClobber)
|
||||
let tsubsym = tsub.nimSymbol
|
||||
let t = init(OperandArray, nimSymbol = ident"t", extraRegNeeded, ElemsInReg, InputOutput_EnsureClobber)
|
||||
let tsym = t.nimSymbol
|
||||
result.add quote do:
|
||||
var `tsubsym` {.noInit.}: Limbs[`extraRegNeeded`]
|
||||
var `tsym` {.noInit.}: Limbs[`extraRegNeeded`]
|
||||
|
||||
# This does t[i+n] += hi
|
||||
# This does a[i+n] += hi
|
||||
# but in a separate carry chain, fused with the
|
||||
# copy "r[i] = t[i+n]"
|
||||
# copy "r[i] = a[i+n]"
|
||||
for i in 0 ..< N:
|
||||
if i == 0:
|
||||
ctx.add scratch[i], t[i+N]
|
||||
ctx.add scratch[i], a[i+N]
|
||||
else:
|
||||
ctx.adc scratch[i], t[i+N]
|
||||
ctx.adc scratch[i], a[i+N]
|
||||
|
||||
let reuse = repackRegisters(tsub, scratch[N])
|
||||
let reuse = repackRegisters(t, scratch[N])
|
||||
|
||||
if canUseNoCarryMontyMul:
|
||||
ctx.finalSubNoCarry(r, scratch, M, reuse)
|
||||
@ -184,10 +185,10 @@ macro montyRedx_gen[N: static int](
|
||||
|
||||
func montRed_asm_adx_bmi2*[N: static int](
|
||||
r: var array[N, SecretWord],
|
||||
t: array[N*2, SecretWord],
|
||||
a: array[N*2, SecretWord],
|
||||
M: array[N, SecretWord],
|
||||
m0ninv: BaseType,
|
||||
canUseNoCarryMontyMul: static bool
|
||||
) =
|
||||
## Constant-time Montgomery reduction
|
||||
montyRedx_gen(r, t, M, m0ninv, canUseNoCarryMontyMul)
|
||||
montyRedx_gen(r, a, M, m0ninv, canUseNoCarryMontyMul)
|
||||
|
||||
@ -30,6 +30,9 @@ import
|
||||
static: doAssert UseASM_X86_64 # Need 8 registers just for mul
|
||||
# and 32-bit only has 8 max.
|
||||
|
||||
# Multiplication
|
||||
# -----------------------------------------------------------------------------------------------
|
||||
|
||||
macro mul_gen[rLen, aLen, bLen: static int](r: var Limbs[rLen], a: Limbs[aLen], b: Limbs[bLen]) =
|
||||
## Comba multiplication generator
|
||||
## `a`, `b`, `r` can have a different number of limbs
|
||||
@ -114,7 +117,12 @@ macro mul_gen[rLen, aLen, bLen: static int](r: var Limbs[rLen], a: Limbs[aLen],
|
||||
ctx.`xor` v, v
|
||||
ctx.`xor` t, t
|
||||
|
||||
for i in 0 ..< min(aLen+bLen, rLen):
|
||||
let stopEx = min(aLen+bLen, rLen)
|
||||
|
||||
for i in 0 ..< stopEx:
|
||||
# Invariant for product scanning:
|
||||
# if we multiply accumulate by a[k1] * b[k2]
|
||||
# we have k1+k2 == i
|
||||
let ib = min(bLen-1, i)
|
||||
let ia = i - ib
|
||||
for j in 0 ..< min(aLen - ia, ib+1):
|
||||
@ -127,7 +135,7 @@ macro mul_gen[rLen, aLen, bLen: static int](r: var Limbs[rLen], a: Limbs[aLen],
|
||||
|
||||
ctx.mov arrR[i], v
|
||||
|
||||
if i != min(aLen+bLen, rLen) - 1:
|
||||
if i != stopEx - 1:
|
||||
ctx.mov v, u
|
||||
ctx.mov u, t
|
||||
ctx.`xor` t, t
|
||||
@ -144,3 +152,140 @@ func mul_asm*[rLen, aLen, bLen: static int](r: var Limbs[rLen], a: Limbs[aLen],
|
||||
## Multi-precision Multiplication
|
||||
## Assumes r doesn't alias a or b
|
||||
mul_gen(r, a, b)
|
||||
|
||||
# Squaring
|
||||
# -----------------------------------------------------------------------------------------------
|
||||
|
||||
macro square_gen[rLen, aLen: static int](r: var Limbs[rLen], a: Limbs[aLen]) =
|
||||
## Comba squaring generator
|
||||
## `a` and `r` can have a different number of limbs
|
||||
## if `r`.limbs.len < a.limbs.len * 2
|
||||
## The result will be truncated, i.e. it will be
|
||||
## a² (mod (2^WordBitwidth)^r.limbs.len)
|
||||
##
|
||||
## Assumes r doesn't aliases a
|
||||
|
||||
result = newStmtList()
|
||||
|
||||
var ctx = init(Assembler_x86, BaseType)
|
||||
let
|
||||
arrR = init(OperandArray, nimSymbol = r, rLen, PointerInReg, InputOutput_EnsureClobber)
|
||||
arrA = init(OperandArray, nimSymbol = a, aLen, PointerInReg, Input)
|
||||
|
||||
t = Operand(
|
||||
desc: OperandDesc(
|
||||
asmId: "[t]",
|
||||
nimSymbol: ident"t",
|
||||
rm: Reg,
|
||||
constraint: Output_EarlyClobber,
|
||||
cEmit: "t"
|
||||
)
|
||||
)
|
||||
|
||||
u = Operand(
|
||||
desc: OperandDesc(
|
||||
asmId: "[u]",
|
||||
nimSymbol: ident"u",
|
||||
rm: Reg,
|
||||
constraint: Output_EarlyClobber,
|
||||
cEmit: "u"
|
||||
)
|
||||
)
|
||||
|
||||
v = Operand(
|
||||
desc: OperandDesc(
|
||||
asmId: "[v]",
|
||||
nimSymbol: ident"v",
|
||||
rm: Reg,
|
||||
constraint: Output_EarlyClobber,
|
||||
cEmit: "v"
|
||||
)
|
||||
)
|
||||
|
||||
# MUL requires RAX and RDX
|
||||
rRAX = Operand(
|
||||
desc: OperandDesc(
|
||||
asmId: "[rax]",
|
||||
nimSymbol: ident"rax",
|
||||
rm: RAX,
|
||||
constraint: Output_EarlyClobber,
|
||||
cEmit: "rax"
|
||||
)
|
||||
)
|
||||
|
||||
rRDX = Operand(
|
||||
desc: OperandDesc(
|
||||
asmId: "[rdx]",
|
||||
nimSymbol: ident"rdx",
|
||||
rm: RDX,
|
||||
constraint: Output_EarlyClobber,
|
||||
cEmit: "rdx"
|
||||
)
|
||||
)
|
||||
|
||||
|
||||
# Prologue
|
||||
let tsym = t.desc.nimSymbol
|
||||
let usym = u.desc.nimSymbol
|
||||
let vsym = v.desc.nimSymbol
|
||||
let eax = rRAX.desc.nimSymbol
|
||||
let edx = rRDX.desc.nimSymbol
|
||||
result.add quote do:
|
||||
var `tsym`{.noInit.}, `usym`{.noInit.}, `vsym`{.noInit.}: BaseType # zero-init
|
||||
var `eax`{.noInit.}, `edx`{.noInit.}: BaseType
|
||||
|
||||
# Algorithm
|
||||
ctx.`xor` u, u
|
||||
ctx.`xor` v, v
|
||||
ctx.`xor` t, t
|
||||
|
||||
let stopEx = min(aLen*2, rLen)
|
||||
|
||||
for i in 0 ..< stopEx:
|
||||
# Invariant for product scanning:
|
||||
# if we multiply accumulate by a[k1] * b[k2]
|
||||
# we have k1+k2 == i
|
||||
let ib = min(aLen-1, i)
|
||||
let ia = i - ib
|
||||
for j in 0 ..< min(aLen - ia, ib+1):
|
||||
let k1 = ia+j
|
||||
let k2 = ib-j
|
||||
if k1 < k2:
|
||||
# (t, u, v) <- (t, u, v) + 2 * a[k1] * a[k2]
|
||||
ctx.mov rRAX, arrA[k2]
|
||||
ctx.mul rdx, rax, arrA[k1], rax
|
||||
ctx.add rRAX, rRAX
|
||||
ctx.adc rRDX, rRDX
|
||||
ctx.adc t, 0
|
||||
ctx.add v, rRAX
|
||||
ctx.adc u, rRDX
|
||||
ctx.adc t, 0
|
||||
elif k1 == k2:
|
||||
# (t, u, v) <- (t, u, v) + a[k1] * a[k2]
|
||||
ctx.mov rRAX, arrA[k2]
|
||||
ctx.mul rdx, rax, arrA[k1], rax
|
||||
ctx.add v, rRAX
|
||||
ctx.adc u, rRDX
|
||||
ctx.adc t, 0
|
||||
else:
|
||||
discard
|
||||
|
||||
ctx.mov arrR[i], v
|
||||
|
||||
if i != stopEx - 1:
|
||||
ctx.mov v, u
|
||||
ctx.mov u, t
|
||||
ctx.`xor` t, t
|
||||
|
||||
if aLen*2 < rLen:
|
||||
ctx.`xor` rRAX, rRAX
|
||||
for i in aLen*2 ..< rLen:
|
||||
ctx.mov arrR[i], rRAX
|
||||
|
||||
# Codegen
|
||||
result.add ctx.generate
|
||||
|
||||
func square_asm*[rLen, aLen: static int](r: var Limbs[rLen], a: Limbs[aLen]) =
|
||||
## Multi-precision Squaring
|
||||
## Assumes r doesn't alias a
|
||||
square_gen(r, a)
|
||||
|
||||
@ -195,3 +195,135 @@ func mul_asm_adx_bmi2*[rLen, aLen, bLen: static int](r: var Limbs[rLen], a: Limb
|
||||
## Multi-precision Multiplication
|
||||
## Assumes r doesn't alias a or b
|
||||
mulx_gen(r, a, b)
|
||||
|
||||
|
||||
# Squaring
|
||||
# -----------------------------------------------------------------------------------------------
|
||||
# TODO: We use 16 registers but GCC/Clang still complain :/
|
||||
|
||||
# macro sqrx_gen[rLen, aLen: static int](rx: var Limbs[rLen], ax: Limbs[aLen]) =
|
||||
# ## Squaring
|
||||
# ## `a` and `r` can have a different number of limbs
|
||||
# ## if `r`.limbs.len < a.limbs.len * 2
|
||||
# ## The result will be truncated, i.e. it will be
|
||||
# ## a² (mod (2^WordBitwidth)^r.limbs.len)
|
||||
# ##
|
||||
# ## Assumes r doesn't aliases a
|
||||
# result = newStmtList()
|
||||
#
|
||||
# var ctx = init(Assembler_x86, BaseType)
|
||||
# let
|
||||
# # Register count with 6 limbs:
|
||||
# # r + a + rax + rdx = 4
|
||||
# # t = 2 * a.len = 12
|
||||
# # We use the full x86 register set.
|
||||
#
|
||||
# r = init(OperandArray, nimSymbol = rx, rLen, PointerInReg, InputOutput)
|
||||
# a = init(OperandArray, nimSymbol = ax, aLen, PointerInReg, Input)
|
||||
#
|
||||
# N = a.len
|
||||
# tSlots = a.len * 2
|
||||
# # If aLen is too big, we need to spill registers. TODO.
|
||||
# t = init(OperandArray, nimSymbol = ident"t", tSlots, ElemsInReg, Output_EarlyClobber)
|
||||
#
|
||||
# # MULX requires RDX
|
||||
# rRDX = Operand(
|
||||
# desc: OperandDesc(
|
||||
# asmId: "[rdx]",
|
||||
# nimSymbol: ident"rdx",
|
||||
# rm: RDX,
|
||||
# constraint: Output_EarlyClobber,
|
||||
# cEmit: "rdx"
|
||||
# )
|
||||
# )
|
||||
#
|
||||
# # Scratch spaces for carries
|
||||
# rRAX = Operand(
|
||||
# desc: OperandDesc(
|
||||
# asmId: "[rax]",
|
||||
# nimSymbol: ident"rax",
|
||||
# rm: RAX,
|
||||
# constraint: Output_EarlyClobber,
|
||||
# cEmit: "rax"
|
||||
# )
|
||||
# )
|
||||
#
|
||||
# # Prologue
|
||||
# # -------------------------------
|
||||
# let tsym = t.nimSymbol
|
||||
# let eax = rRAX.desc.nimSymbol
|
||||
# let edx = rRDX.desc.nimSymbol
|
||||
# result.add quote do:
|
||||
# var `tsym`{.noInit.}: array[`N`, BaseType]
|
||||
# var `eax`{.noInit.}, `edx`{.noInit.}: BaseType
|
||||
#
|
||||
# # Algorithm
|
||||
# # -------------------------------
|
||||
#
|
||||
# block: # Triangle
|
||||
# # i = 0
|
||||
# # ----------------
|
||||
# ctx.mov rRDX, a[0]
|
||||
# # Put a[1..<N] in unused registers, 4 mov per cycle on x86
|
||||
# for i in 1 ..< N:
|
||||
# ctx.mov t[i+N], a[i]
|
||||
#
|
||||
# let # Carry handlers
|
||||
# hi = r.reuseRegister()
|
||||
# lo = rRAX
|
||||
#
|
||||
# for j in 1 ..< N:
|
||||
# # (carry, t[j]) <- a[j] * a[0] with a[j] in t[j+N]
|
||||
# ctx.mulx t[j], rRAX, t[j+N], rdx
|
||||
# if j == 1:
|
||||
# ctx.add t[j-1], rRAX
|
||||
# else:
|
||||
# ctx.adc t[j-1], rRAX
|
||||
# ctx.adc t[N-1], 0
|
||||
#
|
||||
# for i in 1 ..< N-1:
|
||||
# ctx.comment " Process squaring triangle " & $i
|
||||
# ctx.mov rRDX, a[i]
|
||||
# ctx.`xor` t[i+N], t[i+N] # Clear flags
|
||||
# for j in i+1 ..< N:
|
||||
# ctx.mulx hi, lo, t[j+N], rdx
|
||||
# ctx.adox t[i+j], lo
|
||||
# if j == N-1:
|
||||
# break
|
||||
# ctx.adcx t[i+j+1], hi
|
||||
#
|
||||
# ctx.comment " Accumulate last carries in i+N word"
|
||||
# # t[i+N] is already 0
|
||||
# ctx.adcx hi, t[i+N]
|
||||
# ctx.adox t[i+N], hi
|
||||
#
|
||||
# block:
|
||||
# ctx.comment "Finish: (t[2*i+1], t[2*i]) <- 2*t[2*i] + a[i]*a[i]"
|
||||
#
|
||||
# # Restore result
|
||||
# ctx.mov r.reuseRegister(), xmm0
|
||||
#
|
||||
# ctx.mov rRDX, a[0]
|
||||
#
|
||||
# # (t[2*i+1], t[2*i]) <- 2*t[2*i] + a[i]*a[i]
|
||||
# for i in 0 ..< N:
|
||||
# ctx.mulx rRAX, rRDX, a[i], rdx
|
||||
# ctx.add t[2*i], t[2*i]
|
||||
# ctx.adc t[2*i+1], 0
|
||||
# ctx.add t[2*i], rRDX
|
||||
# if i != N - 1:
|
||||
# ctx.mov rRDX, a[i+1]
|
||||
# ctx.adc t[2*i+1], rRAX
|
||||
# ctx.mov r[i], t[i]
|
||||
#
|
||||
# # Move the rest
|
||||
# for i in N ..< min(rLen, 2*N):
|
||||
# ctx.mov r[i], t[i]
|
||||
#
|
||||
# # Codegen
|
||||
# result.add ctx.generate
|
||||
#
|
||||
# func square_asm_adx_bmi2*[rLen, aLen: static int](r: var Limbs[rLen], a: Limbs[aLen]) =
|
||||
# ## Multi-precision Squaring
|
||||
# ## Assumes r doesn't alias a
|
||||
# sqrx_gen(r, a)
|
||||
|
||||
@ -10,7 +10,9 @@ import
|
||||
../config/[common, type_bigint],
|
||||
../primitives,
|
||||
./limbs,
|
||||
./limbs_modular
|
||||
./limbs_double_width,
|
||||
./limbs_modular,
|
||||
./limbs_montgomery
|
||||
|
||||
export BigInt
|
||||
|
||||
@ -269,6 +271,10 @@ func prod*[rBits, aBits, bBits](r: var BigInt[rBits], a: BigInt[aBits], b: BigIn
|
||||
## if r.bits < a.bits + b.bits
|
||||
## the multiplication will overflow.
|
||||
## It will be truncated if it cannot fit in r limbs.
|
||||
##
|
||||
## Truncation is at limb-level NOT bitlevel
|
||||
## It is recommended to only use
|
||||
## rBits >= aBits + bBits unless you know what you are doing.
|
||||
r.limbs.prod(a.limbs, b.limbs)
|
||||
|
||||
func mul*[aBits, bBits](a: var BigInt[aBits], b: BigInt[bBits]) =
|
||||
@ -298,6 +304,19 @@ func prod_high_words*[rBits, aBits, bBits](r: var BigInt[rBits], a: BigInt[aBits
|
||||
# i.e. prod_high_words(result, P, a, w)
|
||||
r.limbs.prod_high_words(a.limbs, b.limbs, lowestWordIndex)
|
||||
|
||||
func square*[rBits, aBits](r: var BigInt[rBits], a: BigInt[aBits]) =
|
||||
## Multi-precision squaring
|
||||
## r <- a²
|
||||
## `a`, `r` can have different sizes
|
||||
## if r.bits < a.bits * 2
|
||||
## the multiplication will overflow.
|
||||
## It will be truncated if it cannot fit in r limbs.
|
||||
##
|
||||
## Truncation is at limb-level NOT bitlevel
|
||||
## It is recommended to only use
|
||||
## rBits >= aBits * 2 unless you know what you are doing.
|
||||
r.limbs.square(a.limbs)
|
||||
|
||||
# Bit Manipulation
|
||||
# ------------------------------------------------------------
|
||||
|
||||
|
||||
@ -135,7 +135,7 @@ func setOne*(a: var FF) {.inline.} =
|
||||
func `+=`*(a: var FF, b: FF) {.inline, meter.} =
|
||||
## In-place addition modulo p
|
||||
when UseASM_X86_64 and a.mres.limbs.len <= 6: # TODO: handle spilling
|
||||
addmod_asm(a.mres.limbs, b.mres.limbs, FF.fieldMod().limbs)
|
||||
addmod_asm(a.mres.limbs, a.mres.limbs, b.mres.limbs, FF.fieldMod().limbs)
|
||||
else:
|
||||
var overflowed = add(a.mres, b.mres)
|
||||
overflowed = overflowed or not(a.mres < FF.fieldMod())
|
||||
@ -144,7 +144,7 @@ func `+=`*(a: var FF, b: FF) {.inline, meter.} =
|
||||
func `-=`*(a: var FF, b: FF) {.inline, meter.} =
|
||||
## In-place substraction modulo p
|
||||
when UseASM_X86_64 and a.mres.limbs.len <= 6: # TODO: handle spilling
|
||||
submod_asm(a.mres.limbs, b.mres.limbs, FF.fieldMod().limbs)
|
||||
submod_asm(a.mres.limbs, a.mres.limbs, b.mres.limbs, FF.fieldMod().limbs)
|
||||
else:
|
||||
let underflowed = sub(a.mres, b.mres)
|
||||
discard cadd(a.mres, FF.fieldMod(), underflowed)
|
||||
@ -152,7 +152,7 @@ func `-=`*(a: var FF, b: FF) {.inline, meter.} =
|
||||
func double*(a: var FF) {.inline, meter.} =
|
||||
## Double ``a`` modulo p
|
||||
when UseASM_X86_64 and a.mres.limbs.len <= 6: # TODO: handle spilling
|
||||
addmod_asm(a.mres.limbs, a.mres.limbs, FF.fieldMod().limbs)
|
||||
addmod_asm(a.mres.limbs, a.mres.limbs, a.mres.limbs, FF.fieldMod().limbs)
|
||||
else:
|
||||
var overflowed = double(a.mres)
|
||||
overflowed = overflowed or not(a.mres < FF.fieldMod())
|
||||
@ -162,8 +162,7 @@ func sum*(r: var FF, a, b: FF) {.inline, meter.} =
|
||||
## Sum ``a`` and ``b`` into ``r`` modulo p
|
||||
## r is initialized/overwritten
|
||||
when UseASM_X86_64 and a.mres.limbs.len <= 6: # TODO: handle spilling
|
||||
r = a
|
||||
addmod_asm(r.mres.limbs, b.mres.limbs, FF.fieldMod().limbs)
|
||||
addmod_asm(r.mres.limbs, a.mres.limbs, b.mres.limbs, FF.fieldMod().limbs)
|
||||
else:
|
||||
var overflowed = r.mres.sum(a.mres, b.mres)
|
||||
overflowed = overflowed or not(r.mres < FF.fieldMod())
|
||||
@ -178,20 +177,7 @@ func diff*(r: var FF, a, b: FF) {.inline, meter.} =
|
||||
## `r` is initialized/overwritten
|
||||
## Requires r != b
|
||||
when UseASM_X86_64 and a.mres.limbs.len <= 6: # TODO: handle spilling
|
||||
r = a
|
||||
submod_asm(r.mres.limbs, b.mres.limbs, FF.fieldMod().limbs)
|
||||
else:
|
||||
var underflowed = r.mres.diff(a.mres, b.mres)
|
||||
discard cadd(r.mres, FF.fieldMod(), underflowed)
|
||||
|
||||
func diffAlias*(r: var FF, a, b: FF) {.inline, meter.} =
|
||||
## Substract `b` from `a` and store the result into `r`.
|
||||
## `r` is initialized/overwritten
|
||||
## Handles r == b
|
||||
when UseASM_X86_64 and a.mres.limbs.len <= 6: # TODO: handle spilling
|
||||
var t = a
|
||||
submod_asm(t.mres.limbs, b.mres.limbs, FF.fieldMod().limbs)
|
||||
r = t
|
||||
submod_asm(r.mres.limbs, a.mres.limbs, b.mres.limbs, FF.fieldMod().limbs)
|
||||
else:
|
||||
var underflowed = r.mres.diff(a.mres, b.mres)
|
||||
discard cadd(r.mres, FF.fieldMod(), underflowed)
|
||||
@ -205,8 +191,7 @@ func double*(r: var FF, a: FF) {.inline, meter.} =
|
||||
## Double ``a`` into ``r``
|
||||
## `r` is initialized/overwritten
|
||||
when UseASM_X86_64 and a.mres.limbs.len <= 6: # TODO: handle spilling
|
||||
r = a
|
||||
addmod_asm(r.mres.limbs, a.mres.limbs, FF.fieldMod().limbs)
|
||||
addmod_asm(r.mres.limbs, a.mres.limbs, a.mres.limbs, FF.fieldMod().limbs)
|
||||
else:
|
||||
var overflowed = r.mres.double(a.mres)
|
||||
overflowed = overflowed or not(r.mres < FF.fieldMod())
|
||||
@ -223,10 +208,7 @@ func square*(r: var FF, a: FF) {.inline, meter.} =
|
||||
|
||||
func neg*(r: var FF, a: FF) {.inline, meter.} =
|
||||
## Negate modulo p
|
||||
when UseASM_X86_64 and defined(gcc):
|
||||
# Clang and every compiler besides GCC
|
||||
# can cleanly optimized this
|
||||
# especially on FF2
|
||||
when UseASM_X86_64:
|
||||
negmod_asm(r.mres.limbs, a.mres.limbs, FF.fieldMod().limbs)
|
||||
else:
|
||||
# If a = 0 we need r = 0 and not r = M
|
||||
|
||||
@ -12,7 +12,8 @@ import
|
||||
./bigints,
|
||||
./finite_fields,
|
||||
./limbs,
|
||||
./limbs_double_width
|
||||
./limbs_double_width,
|
||||
./limbs_montgomery
|
||||
|
||||
when UseASM_X86_64:
|
||||
import assembly/limbs_asm_modular_dbl_width_x86
|
||||
@ -27,10 +28,17 @@ template doubleWidth*(T: typedesc[Fp]): typedesc =
|
||||
## Return the double-width type matching with Fp
|
||||
FpDbl[T.C]
|
||||
|
||||
func `==`*(a, b: FpDbl): SecretBool {.inline.} =
|
||||
a.limbs2x == b.limbs2x
|
||||
|
||||
func mulNoReduce*(r: var FpDbl, a, b: Fp) {.inline.} =
|
||||
## Store the product of ``a`` by ``b`` into ``r``
|
||||
r.limbs2x.prod(a.mres.limbs, b.mres.limbs)
|
||||
|
||||
func squareNoReduce*(r: var FpDbl, a: Fp) {.inline.} =
|
||||
## Store the square of ``a`` into ``r``
|
||||
r.limbs2x.square(a.mres.limbs)
|
||||
|
||||
func reduce*(r: var Fp, a: FpDbl) {.inline.} =
|
||||
## Reduce a double-width field element into r
|
||||
const N = r.mres.limbs.len
|
||||
@ -42,20 +50,16 @@ func reduce*(r: var Fp, a: FpDbl) {.inline.} =
|
||||
Fp.canUseNoCarryMontyMul()
|
||||
)
|
||||
|
||||
func diffNoInline(r: var FpDbl, a, b: FpDbl): Borrow =
|
||||
r.limbs2x.diff(a.limbs2x, b.limbs2x)
|
||||
|
||||
func diffNoReduce*(r: var FpDbl, a, b: FpDbl) =
|
||||
## Double-width substraction without reduction
|
||||
discard diffNoInline(r, a, b)
|
||||
discard r.limbs2x.diff(a.limbs2x, b.limbs2x)
|
||||
|
||||
func diff*(r: var FpDbl, a, b: FpDbl) =
|
||||
func diff*(r: var FpDbl, a, b: FpDbl) {.inline.}=
|
||||
## Double-width modular substraction
|
||||
when false: # TODO slower
|
||||
r = a
|
||||
sub2x_asm(r.limbs2x, b.limbs2x, FpDbl.C.Mod.limbs)
|
||||
when UseASM_X86_64:
|
||||
sub2x_asm(r.limbs2x, a.limbs2x, b.limbs2x, FpDbl.C.Mod.limbs)
|
||||
else:
|
||||
var underflowed = SecretBool diffNoInline(r, a, b)
|
||||
var underflowed = SecretBool r.limbs2x.diff(a.limbs2x, b.limbs2x)
|
||||
|
||||
const N = r.limbs2x.len div 2
|
||||
const M = FpDbl.C.Mod
|
||||
@ -65,8 +69,6 @@ func diff*(r: var FpDbl, a, b: FpDbl) =
|
||||
addC(carry, sum, r.limbs2x[i+N], M.limbs[i], carry)
|
||||
underflowed.ccopy(r.limbs2x[i+N], sum)
|
||||
|
||||
func `-=`*(a: var FpDbl, b: FpDbl) =
|
||||
when false: # TODO slower
|
||||
sub2x_asm(a.limbs2x, b.limbs2x, FpDbl.C.Mod.limbs)
|
||||
else:
|
||||
a.diff(a, b)
|
||||
func `-=`*(a: var FpDbl, b: FpDbl) {.inline.}=
|
||||
## Double-width modular substraction
|
||||
a.diff(a, b)
|
||||
|
||||
@ -12,9 +12,6 @@ import
|
||||
|
||||
when UseASM_X86_32:
|
||||
import ./assembly/limbs_asm_x86
|
||||
when UseASM_X86_64:
|
||||
import ./assembly/limbs_asm_mul_x86
|
||||
import ./assembly/limbs_asm_mul_x86_adx_bmi2
|
||||
|
||||
# ############################################################
|
||||
#
|
||||
@ -335,89 +332,6 @@ func cneg*(a: var Limbs, ctl: CTBool) =
|
||||
|
||||
{.pop.} # inline
|
||||
|
||||
# Multiplication
|
||||
# ------------------------------------------------------------
|
||||
|
||||
func prod*[rLen, aLen, bLen: static int](r: var Limbs[rLen], a: Limbs[aLen], b: Limbs[bLen]) =
|
||||
## Multi-precision multiplication
|
||||
## r <- a*b
|
||||
##
|
||||
## `a`, `b`, `r` can have a different number of limbs
|
||||
## if `r`.limbs.len < a.limbs.len + b.limbs.len
|
||||
## The result will be truncated, i.e. it will be
|
||||
## a * b (mod (2^WordBitwidth)^r.limbs.len)
|
||||
##
|
||||
## `r` must not alias ``a`` or ``b``
|
||||
|
||||
when UseASM_X86_64 and aLen <= 6:
|
||||
if ({.noSideEffect.}: hasBmi2()) and ({.noSideEffect.}: hasAdx()):
|
||||
mul_asm_adx_bmi2(r, a, b)
|
||||
else:
|
||||
mul_asm(r, a, b)
|
||||
elif UseASM_X86_64:
|
||||
mul_asm(r, a, b)
|
||||
else:
|
||||
# We use Product Scanning / Comba multiplication
|
||||
var t, u, v = Zero
|
||||
|
||||
staticFor i, 0, min(a.len+b.len, r.len):
|
||||
const ib = min(b.len-1, i)
|
||||
const ia = i - ib
|
||||
staticFor j, 0, min(a.len - ia, ib+1):
|
||||
mulAcc(t, u, v, a[ia+j], b[ib-j])
|
||||
|
||||
r[i] = v
|
||||
v = u
|
||||
u = t
|
||||
t = Zero
|
||||
|
||||
if aLen+bLen < rLen:
|
||||
for i in aLen+bLen ..< rLen:
|
||||
r[i] = Zero
|
||||
|
||||
func prod_high_words*[rLen, aLen, bLen](
|
||||
r: var Limbs[rLen],
|
||||
a: Limbs[aLen], b: Limbs[bLen],
|
||||
lowestWordIndex: static int) =
|
||||
## Multi-precision multiplication keeping only high words
|
||||
## r <- a*b >> (2^WordBitWidth)^lowestWordIndex
|
||||
##
|
||||
## `a`, `b`, `r` can have a different number of limbs
|
||||
## if `r`.limbs.len < a.limbs.len + b.limbs.len - lowestWordIndex
|
||||
## The result will be truncated, i.e. it will be
|
||||
## a * b >> (2^WordBitWidth)^lowestWordIndex (mod (2^WordBitwidth)^r.limbs.len)
|
||||
#
|
||||
# This is useful for
|
||||
# - Barret reduction
|
||||
# - Approximating multiplication by a fractional constant in the form f(a) = K/C * a
|
||||
# with K and C known at compile-time.
|
||||
# We can instead find a well chosen M = (2^WordBitWidth)^w, with M > C (i.e. M is a power of 2 bigger than C)
|
||||
# Precompute P = K*M/C at compile-time
|
||||
# and at runtime do P*a/M <=> P*a >> (WordBitWidth*w)
|
||||
# i.e. prod_high_words(result, P, a, w)
|
||||
|
||||
# We use Product Scanning / Comba multiplication
|
||||
var t, u, v = Zero # Will raise warning on empty iterations
|
||||
var z: Limbs[rLen] # zero-init, ensure on stack and removes in-place problems
|
||||
|
||||
# The previous 2 columns can affect the lowest word due to carries
|
||||
# but not the ones before (we accumulate in 3 words (t, u, v))
|
||||
const w = lowestWordIndex - 2
|
||||
|
||||
staticFor i, max(0, w), min(a.len+b.len, r.len+lowestWordIndex):
|
||||
const ib = min(b.len-1, i)
|
||||
const ia = i - ib
|
||||
staticFor j, 0, min(a.len - ia, ib+1):
|
||||
mulAcc(t, u, v, a[ia+j], b[ib-j])
|
||||
|
||||
when i >= lowestWordIndex:
|
||||
z[i-lowestWordIndex] = v
|
||||
v = u
|
||||
u = t
|
||||
t = Zero
|
||||
|
||||
r = z
|
||||
|
||||
# Division
|
||||
# ------------------------------------------------------------
|
||||
|
||||
|
||||
@@ -11,10 +11,9 @@ import
  ../primitives,
  ./limbs

when UseASM_X86_32:
  import ./assembly/limbs_asm_montred_x86
when UseASM_X86_64:
  import ./assembly/limbs_asm_montred_x86_adx_bmi2
  import ./assembly/limbs_asm_mul_x86
  import ./assembly/limbs_asm_mul_x86_adx_bmi2

# ############################################################
#
@@ -25,77 +24,170 @@ when UseASM_X86_64:
# No exceptions allowed
{.push raises: [].}

# Montgomery Reduction
# Multiplication
# ------------------------------------------------------------
# This is the reduction part of SOS (Separated Operand Scanning) modular multiplication technique

# TODO upstream, using Limbs[N] breaks semcheck
func montyRed*[N: static int](
       r: var array[N, SecretWord],
       t: array[N*2, SecretWord],
       M: array[N, SecretWord],
       m0ninv: BaseType, canUseNoCarryMontyMul: static bool) =
  ## Montgomery reduce a double-width bigint modulo M
  # - Analyzing and Comparing Montgomery Multiplication Algorithms
  #   Cetin Kaya Koc and Tolga Acar and Burton S. Kaliski Jr.
  #   http://pdfs.semanticscholar.org/5e39/41ff482ec3ee41dc53c3298f0be085c69483.pdf
  #
  # - Arithmetic of Finite Fields
  #   Chapter 5 of Guide to Pairing-Based Cryptography
  #   Jean Luc Beuchat, Luis J. Dominguez Perez, Sylvain Duquesne, Nadia El Mrabet, Laura Fuentes-Castañeda, Francisco Rodríguez-Henríquez, 2017
  #   https://www.researchgate.net/publication/319538235_Arithmetic_of_Finite_Fields
  #
  # Algorithm
  # Inputs:
  # - N number of limbs
  # - t[0 ..< 2N] (double-width input to reduce)
  # - M[0 ..< N] The field modulus (must be odd for Montgomery reduction)
  # - m0ninv: Montgomery Reduction magic number = -1/M[0]
  # Output:
  # - r[0 ..< N], in the Montgomery domain
  # Parameters:
  # - w, the word width usually 64 on 64-bit platforms or 32 on 32-bit
  #
  # for i in 0 .. n-1:
  #   C <- 0
  #   m <- t[i] * m0ninv mod 2^w (i.e. simple multiplication)
  #   for j in 0 .. n-1:
  #     (C, S) <- t[i+j] + m * M[j] + C
  #     t[i+j] <- S
  #   t[i+n] += C
  # for i in 0 .. n-1:
  #   r[i] = t[i+n]
  # if r >= M:
  #   r -= M
  #
  # Important note: `t[i+n] += C` should propagate the carry
  # to the higher limb if any, thank you "implementation detail"
  # missing from paper.
  when UseASM_X86_64 and r.len <= 6:
func prod*[rLen, aLen, bLen: static int](r: var Limbs[rLen], a: Limbs[aLen], b: Limbs[bLen]) =
  ## Multi-precision multiplication
  ## r <- a*b
  ##
  ## `a`, `b`, `r` can have a different number of limbs
  ## if `r`.limbs.len < a.limbs.len + b.limbs.len
  ## The result will be truncated, i.e. it will be
  ## a * b (mod (2^WordBitWidth)^r.limbs.len)
  ##
  ## `r` must not alias ``a`` or ``b``

  when UseASM_X86_64 and aLen <= 6:
    if ({.noSideEffect.}: hasBmi2()) and ({.noSideEffect.}: hasAdx()):
      montRed_asm_adx_bmi2(r, t, M, m0ninv, canUseNoCarryMontyMul)
      mul_asm_adx_bmi2(r, a, b)
    else:
      montRed_asm(r, t, M, m0ninv, canUseNoCarryMontyMul)
  elif UseASM_X86_32 and r.len <= 6:
    # TODO: Assembly faster than GCC but slower than Clang
    montRed_asm(r, t, M, m0ninv, canUseNoCarryMontyMul)
      mul_asm(r, a, b)
  elif UseASM_X86_64:
    mul_asm(r, a, b)
  else:
    var t = t # Copy "t" for mutation and ensure on stack
    var res: typeof(r) # Accumulator
    staticFor i, 0, N:
      var C = Zero
      let m = t[i] * SecretWord(m0ninv)
      staticFor j, 0, N:
        muladd2(C, t[i+j], m, M[j], t[i+j], C)
      res[i] = C
    # We use Product Scanning / Comba multiplication
    var t, u, v = Zero
    const stopEx = min(a.len+b.len, r.len)

    # This does t[i+n] += C
    # but in a separate carry chain, fused with the
    # copy "r[i] = t[i+n]"
    var carry = Carry(0)
    staticFor i, 0, N:
      addC(carry, res[i], t[i+N], res[i], carry)
    staticFor i, 0, stopEx:
      # Invariant for product scanning:
      # if we multiply accumulate by a[k1] * b[k2]
      # we have k1+k2 == i
      const ib = min(b.len-1, i)
      const ia = i - ib
      staticFor j, 0, min(a.len - ia, ib+1):
        mulAcc(t, u, v, a[ia+j], b[ib-j])

    # Final subtraction
    discard res.csub(M, SecretWord(carry).isNonZero() or not(res < M))
    r = res
      r[i] = v
      when i < stopEx-1:
        v = u
        u = t
        t = Zero

    if aLen+bLen < rLen:
      for i in aLen+bLen ..< rLen:
        r[i] = Zero

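# Illustrative sketch (not from this commit): the product-scanning / Comba
# loop above, with 32-bit words emulated in uint64 so it runs as plain Nim.
# All names and values below are illustrative only.
when isMainModule:
  const mask = 0xFFFF_FFFF'u64
  let a = [0xDEADBEEF'u64, 0x12345678'u64]   # little-endian 32-bit limbs
  let b = [0xCAFEBABE'u64, 0x00C0FFEE'u64]
  var r: array[4, uint64]
  var t, u, v = 0'u64                        # triple-word accumulator
  for i in 0 ..< 4:                          # column i sums a[k1]*b[k2] with k1+k2 == i
    let ib = min(b.len-1, i)
    let ia = i - ib
    for j in 0 ..< min(a.len - ia, ib+1):
      let p = a[ia+j] * b[ib-j]
      v += p and mask                        # low word of the partial product
      u += (p shr 32) + (v shr 32)           # high word plus carry out of v
      v = v and mask
      t += u shr 32                          # carry out of u
      u = u and mask
    r[i] = v
    v = u; u = t; t = 0                      # shift the accumulator window
  let wa = a[0] or (a[1] shl 32)
  let wb = b[0] or (b[1] shl 32)
  doAssert (r[0] or (r[1] shl 32)) == wa * wb  # low 64 bits match the schoolbook product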
func prod_high_words*[rLen, aLen, bLen](
       r: var Limbs[rLen],
       a: Limbs[aLen], b: Limbs[bLen],
       lowestWordIndex: static int) =
  ## Multi-precision multiplication keeping only high words
  ## r <- a*b >> (2^WordBitWidth)^lowestWordIndex
  ##
  ## `a`, `b`, `r` can have a different number of limbs
  ## if `r`.limbs.len < a.limbs.len + b.limbs.len - lowestWordIndex
  ## The result will be truncated, i.e. it will be
  ## a * b >> (2^WordBitWidth)^lowestWordIndex (mod (2^WordBitWidth)^r.limbs.len)
  #
  # This is useful for
  # - Barrett reduction
  # - Approximating multiplication by a fractional constant in the form f(a) = K/C * a
  #   with K and C known at compile-time.
  #   We can instead find a well chosen M = (2^WordBitWidth)^w, with M > C (i.e. M is a power of 2 bigger than C)
  #   Precompute P = K*M/C at compile-time
  #   and at runtime do P*a/M <=> P*a >> (WordBitWidth*w)
  #   i.e. prod_high_words(result, P, a, w)

  # We use Product Scanning / Comba multiplication
  var t, u, v = Zero # Will raise warning on empty iterations
  var z: Limbs[rLen] # zero-init, ensure on stack and removes in-place problems

  # The previous 2 columns can affect the lowest word due to carries
  # but not the ones before (we accumulate in 3 words (t, u, v))
  const w = lowestWordIndex - 2
  const stopEx = min(a.len+b.len, r.len+lowestWordIndex)

  staticFor i, max(0, w), stopEx:
    const ib = min(b.len-1, i)
    const ia = i - ib
    staticFor j, 0, min(a.len - ia, ib+1):
      mulAcc(t, u, v, a[ia+j], b[ib-j])

    when i >= lowestWordIndex:
      z[i-lowestWordIndex] = v
    when i < stopEx-1:
      v = u
      u = t
      t = Zero

  r = z

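# Illustrative sketch (not from this commit): the fractional-constant trick
# documented above, shrunk to a single word with w = 1 (one 32-bit limb
# shifted out). The constants are illustrative.
when isMainModule:
  const K = 1'u64
  const C = 3'u64                    # approximate f(a) = a/3
  const M = 1'u64 shl 32             # M = 2^32 > C
  const P = K * M div C              # precomputed at compile-time
  let a = 300_000'u64
  let approx = (P * a) shr 32        # P*a/M, i.e. prod_high_words with w = 1
  doAssert approx in [a div C - 1, a div C]  # exact up to one unit in the last place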
func square_Comba[rLen, aLen](
       r: var Limbs[rLen],
       a: Limbs[aLen]) =
  ## Multi-precision squaring using Comba / Product Scanning
  var t, u, v = Zero
  const stopEx = min(a.len * 2, r.len)

  staticFor i, 0, stopEx:
    # Invariant for product scanning:
    # if we multiply accumulate by a[k1] * a[k2]
    # we have k1+k2 == i
    const ib = min(a.len-1, i)
    const ia = i - ib
    staticFor j, 0, min(a.len - ia, ib+1):
      const k1 = ia+j
      const k2 = ib-j
      when k1 < k2:
        mulDoubleAcc(t, u, v, a[k1], a[k2])
      elif k1 == k2:
        mulAcc(t, u, v, a[k1], a[k2])
      else:
        discard

    r[i] = v
    when i < stopEx-1:
      v = u
      u = t
      t = Zero

  if aLen*2 < rLen:
    for i in aLen*2 ..< rLen:
      r[i] = Zero

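# Illustrative sketch (not from this commit): the doubling trick above on two
# 32-bit limbs. Each cross product a[k1]*a[k2] with k1 < k2 is accumulated
# once, doubled, instead of twice. The limbs are chosen so 2*a[0]*a[1]
# stays below 2^64.
when isMainModule:
  const mask = 0xFFFF_FFFF'u64
  let a = [0xDEADBEEF'u64, 0x12345678'u64]  # little-endian 32-bit limbs
  var col: array[4, uint64]                 # one accumulator per column, carries deferred
  col[0] = a[0]*a[0]                        # k1 == k2: accumulated once
  col[1] = 2*a[0]*a[1]                      # k1 < k2: doubled cross product
  col[2] = a[1]*a[1]
  var r: array[4, uint64]
  var carry = 0'u64
  for i in 0 ..< 4:                         # propagate carries, 32 bits per limb
    let s = col[i] + carry
    r[i] = s and mask
    carry = s shr 32
  let wide = a[0] or (a[1] shl 32)          # the same value as one 64-bit word
  doAssert (r[0] or (r[1] shl 32)) == wide * wide  # low 64 bits of the square agree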
func square_operandScan[rLen, aLen](
       r: var Limbs[rLen],
       a: Limbs[aLen]) =
  ## Multi-precision squaring using Operand Scanning
  const stopEx = min(a.len * 2, r.len)
  var t: typeof(r) # zero-init, ensure on stack
  var C = Zero
  static: doAssert aLen * 2 == rLen, "Truncated square operand scanning is not implemented"

  staticFor i, 0, stopEx:
    staticFor j, i+1, stopEx:
      muladd2(C, t[i+j], a[j], a[i], t[i+j], C)
    t[i+stopEx] = C

  staticFor i, 0, aLen:
    # (t[2*i+1], t[2*i]) <- 2*t[2*i] + a[i]*a[i]
    var u, v = Zero
    var carry: Carry
    # a[i] * a[i]
    mul(u, v, a[i], a[i])
    # 2*t[2*i]
    addC(carry, t[2*i], t[2*i], t[2*i], Carry(0))
    addC(carry, t[2*i+1], Zero, Zero, carry)
    # 2*t[2*i] + a[i] * a[i]
    addC(carry, t[2*i], t[2*i], u, Carry(0))
    addC(carry, t[2*i+1], Zero, v, carry)

  r = t

func square*[rLen, aLen](
       r: var Limbs[rLen],
       a: Limbs[aLen]) =
  ## Multi-precision squaring
  ## r <- a²
  ##
  ## if `r`.limbs.len < a.limbs.len * 2
  ## The result will be truncated, i.e. it will be
  ## a² (mod (2^WordBitWidth)^r.limbs.len)
  ##
  ## `r` must not alias ``a``
  when UseASM_X86_64:
    square_asm(r, a)
  else:
    square_comba(r, a)

@@ -14,10 +14,13 @@ import
  ../primitives,
  ./limbs

when UseASM_X86_32:
  import ./assembly/limbs_asm_montred_x86
when UseASM_X86_64:
  import
    ./assembly/limbs_asm_montmul_x86,
    ./assembly/limbs_asm_montmul_x86_adx_bmi2
    ./assembly/limbs_asm_montmul_x86_adx_bmi2,
    ./assembly/limbs_asm_montred_x86_adx_bmi2

# ############################################################
#
@@ -42,7 +45,7 @@ when UseASM_X86_64:
# No exceptions allowed
{.push raises: [].}

# Implementation
# Montgomery Multiplication
# ------------------------------------------------------------

func montyMul_CIOS_nocarry(r: var Limbs, a, b, M: Limbs, m0ninv: BaseType) =
@@ -162,6 +165,9 @@ func montyMul_FIPS(r: var Limbs, a, b, M: Limbs, m0ninv: BaseType) =
  discard z.csub(M, v.isNonZero() or not(z < M))
  r = z

# Montgomery Squaring
# ------------------------------------------------------------

func montySquare_CIOS_nocarry(r: var Limbs, a, M: Limbs, m0ninv: BaseType) {.used.}=
  ## Montgomery Multiplication using Coarse Grained Operand Scanning (CIOS)
  ## and no-carry optimization.
@@ -267,6 +273,133 @@ func montySquare_CIOS(r: var Limbs, a, M: Limbs, m0ninv: BaseType) {.used.}=
  discard t.csub(M, tN.isNonZero() or not(t < M)) # TODO: (t >= M) is unnecessary for prime in the form (2^64)^w
  r = t

# Montgomery Reduction
# ------------------------------------------------------------
func montyRed_CIOS[N: static int](
       r: var array[N, SecretWord],
       a: array[N*2, SecretWord],
       M: array[N, SecretWord],
       m0ninv: BaseType) =
  ## Montgomery reduce a double-width bigint modulo M
  # - Analyzing and Comparing Montgomery Multiplication Algorithms
  #   Cetin Kaya Koc and Tolga Acar and Burton S. Kaliski Jr.
  #   http://pdfs.semanticscholar.org/5e39/41ff482ec3ee41dc53c3298f0be085c69483.pdf
  #
  # - Arithmetic of Finite Fields
  #   Chapter 5 of Guide to Pairing-Based Cryptography
  #   Jean Luc Beuchat, Luis J. Dominguez Perez, Sylvain Duquesne, Nadia El Mrabet, Laura Fuentes-Castañeda, Francisco Rodríguez-Henríquez, 2017
  #   https://www.researchgate.net/publication/319538235_Arithmetic_of_Finite_Fields
  #
  # Algorithm
  # Inputs:
  # - N number of limbs
  # - a[0 ..< 2N] (double-width input to reduce)
  # - M[0 ..< N] The field modulus (must be odd for Montgomery reduction)
  # - m0ninv: Montgomery Reduction magic number = -1/M[0]
  # Output:
  # - r[0 ..< N], in the Montgomery domain
  # Parameters:
  # - w, the word width usually 64 on 64-bit platforms or 32 on 32-bit
  #
  # for i in 0 .. n-1:
  #   C <- 0
  #   m <- a[i] * m0ninv mod 2^w (i.e. simple multiplication)
  #   for j in 0 .. n-1:
  #     (C, S) <- a[i+j] + m * M[j] + C
  #     a[i+j] <- S
  #   a[i+n] += C
  # for i in 0 .. n-1:
  #   r[i] = a[i+n]
  # if r >= M:
  #   r -= M
  #
  # Important note: `a[i+n] += C` should propagate the carry
  # to the higher limb if any, thank you "implementation detail"
  # missing from paper.

  var a = a # Copy "a" for mutation and ensure on stack
  var res: typeof(r) # Accumulator
  staticFor i, 0, N:
    var C = Zero
    let m = a[i] * SecretWord(m0ninv)
    staticFor j, 0, N:
      muladd2(C, a[i+j], m, M[j], a[i+j], C)
    res[i] = C

  # This does a[i+n] += C
  # but in a separate carry chain, fused with the
  # copy "r[i] = a[i+n]"
  var carry = Carry(0)
  staticFor i, 0, N:
    addC(carry, res[i], a[i+N], res[i], carry)

  # Final subtraction
  discard res.csub(M, SecretWord(carry).isNonZero() or not(res < M))
  r = res

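# Illustrative sketch (not from this commit): the per-limb cancellation at the
# heart of the reduction above, with one 32-bit limb emulated in uint64.
# Modulus and input are illustrative.
when isMainModule:
  const mask = 0xFFFF_FFFF'u64
  const M = 0xFFFF_FFFB'u64            # an odd 32-bit modulus
  var inv = 1'u64                      # Newton iteration for 1/M mod 2^32
  for _ in 0 ..< 5:                    # precision doubles each step: 2, 4, 8, 16, 32 bits
    inv = (inv * (2'u64 - M * inv)) and mask
  let m0ninv = (0'u64 - inv) and mask  # the magic constant -1/M[0] mod 2^32
  let a = 0x1234_5678_9ABC_DEF0'u64    # double-width input to reduce
  let m = ((a and mask) * m0ninv) and mask   # m <- a[i] * m0ninv mod 2^w
  # Adding m*M cancels the low word, so the division by 2^32 is exact:
  doAssert ((a + m * M) and mask) == 0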
func montyRed_Comba[N: static int](
       r: var array[N, SecretWord],
       a: array[N*2, SecretWord],
       M: array[N, SecretWord],
       m0ninv: BaseType) =
  ## Montgomery reduce a double-width bigint modulo M
  # We use Product Scanning / Comba multiplication
  var t, u, v = Zero
  var carry: Carry
  var z: typeof(r) # zero-init, ensure on stack and removes in-place problems in tower fields
  staticFor i, 0, N:
    staticFor j, 0, i:
      mulAcc(t, u, v, z[j], M[i-j])

    addC(carry, v, v, a[i], Carry(0))
    addC(carry, u, u, Zero, carry)
    addC(carry, t, t, Zero, carry)

    z[i] = v * SecretWord(m0ninv)
    mulAcc(t, u, v, z[i], M[0])
    v = u
    u = t
    t = Zero

  staticFor i, N, 2*N-1:
    staticFor j, i-N+1, N:
      mulAcc(t, u, v, z[j], M[i-j])

    addC(carry, v, v, a[i], Carry(0))
    addC(carry, u, u, Zero, carry)
    addC(carry, t, t, Zero, carry)

    z[i-N] = v

    v = u
    u = t
    t = Zero

  addC(carry, z[N-1], v, a[2*N-1], Carry(0))

  # Final subtraction
  discard z.csub(M, SecretBool(carry) or not(z < M))
  r = z

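# Illustrative sketch (not from this commit) of the branchless conditional
# subtraction (csub) used for the final reduction step above.
when isMainModule:
  let M = 97'u64
  var z = 130'u64
  let needSub = z >= M                  # the real code derives this in constant time
  let mask = 0'u64 - uint64(needSub)    # all-ones if subtracting, all-zeros otherwise
  z = z - (M and mask)                  # subtracts M or 0, same instructions either way
  doAssert z == 33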
# TODO upstream, using Limbs[N] breaks semcheck
func montyRed*[N: static int](
       r: var array[N, SecretWord],
       a: array[N*2, SecretWord],
       M: array[N, SecretWord],
       m0ninv: BaseType, canUseNoCarryMontyMul: static bool) =
  ## Montgomery reduce a double-width bigint modulo M
  when UseASM_X86_64 and r.len <= 6:
    if ({.noSideEffect.}: hasBmi2()) and ({.noSideEffect.}: hasAdx()):
      montRed_asm_adx_bmi2(r, a, M, m0ninv, canUseNoCarryMontyMul)
    else:
      montRed_asm(r, a, M, m0ninv, canUseNoCarryMontyMul)
  elif UseASM_X86_32 and r.len <= 6:
    # TODO: Assembly faster than GCC but slower than Clang
    montRed_asm(r, a, M, m0ninv, canUseNoCarryMontyMul)
  else:
    montyRed_CIOS(r, a, M, m0ninv)
    # montyRed_Comba(r, a, M, m0ninv)

# Exported API
# ------------------------------------------------------------


@@ -54,8 +54,8 @@ const

# TODO, we restrict assembly to 64-bit words
# We need to support register spills for large limbs
const ConstantineASM {.booldefine.} = true
const UseASM_X86_32* = ConstantineASM and X86 and GCC_Compatible
const CttASM {.booldefine.} = true
const UseASM_X86_32* = CttASM and X86 and GCC_Compatible
const UseASM_X86_64* = WordBitWidth == 64 and UseASM_X86_32

# ############################################################

@@ -90,7 +90,7 @@ func curve_eq_rhs*[F](y2: var F, x: F, Tw: static Twisted) =
  else:
    {.error: "Only twisted curves are supported on extension field 𝔽p²".}

  y2.diffAlias(t, y2)
  y2.diff(t, y2)

  when F.C.getCoefA() != 0:
    t = x
@@ -198,7 +198,7 @@ func sum*[F; Tw: static Twisted](
  r.y.sum(Q.x, Q.z)       # 15. Y₃ <- X₂ + Z₂
  r.x *= r.y              # 16. X₃ <- X₃ Y₃     X₃ = (X₁Z₁)(X₂Z₂)
  r.y.sum(t0, t2)         # 17. Y₃ <- t₀ + t₂   Y₃ = X₁ X₂ + Z₁ Z₂
  r.y.diffAlias(r.x, r.y) # 18. Y₃ <- X₃ - Y₃   Y₃ = (X₁ + Z₁)(X₂ + Z₂) - (X₁ X₂ + Z₁ Z₂) = X₁Z₂ + X₂Z₁
  r.y.diff(r.x, r.y)      # 18. Y₃ <- X₃ - Y₃   Y₃ = (X₁ + Z₁)(X₂ + Z₂) - (X₁ X₂ + Z₁ Z₂) = X₁Z₂ + X₂Z₁
  when Tw == OnTwist and F.C.getSexticTwist() == D_Twist:
    t0 *= SexticNonResidue
    t1 *= SexticNonResidue
@@ -214,7 +214,7 @@ func sum*[F; Tw: static Twisted](
    r.y *= SexticNonResidue
  r.x.prod(t4, r.y)       # 25. X₃ <- t₄ Y₃     X₃ = 3b(Y₁Z₂ + Y₂Z₁)(X₁Z₂ + X₂Z₁)
  t2.prod(t3, t1)         # 26. t₂ <- t₃ t₁     t₂ = (X₁Y₂ + X₂Y₁) (Y₁Y₂ - 3bZ₁Z₂)
  r.x.diffAlias(t2, r.x)  # 27. X₃ <- t₂ - X₃   X₃ = (X₁Y₂ + X₂Y₁) (Y₁Y₂ - 3bZ₁Z₂) - 3b(Y₁Z₂ + Y₂Z₁)(X₁Z₂ + X₂Z₁)
  r.x.diff(t2, r.x)       # 27. X₃ <- t₂ - X₃   X₃ = (X₁Y₂ + X₂Y₁) (Y₁Y₂ - 3bZ₁Z₂) - 3b(Y₁Z₂ + Y₂Z₁)(X₁Z₂ + X₂Z₁)
  r.y *= t0               # 28. Y₃ <- Y₃ t₀     Y₃ = 9bX₁X₂ (X₁Z₂ + X₂Z₁)
  t1 *= r.z               # 29. t₁ <- t₁ Z₃     t₁ = (Y₁Y₂ - 3bZ₁Z₂)(Y₁Y₂ + 3bZ₁Z₂)
  r.y += t1               # 30. Y₃ <- t₁ + Y₃   Y₃ = (Y₁Y₂ + 3bZ₁Z₂)(Y₁Y₂ - 3bZ₁Z₂) + 9bX₁X₂ (X₁Z₂ + X₂Z₁)
@@ -277,7 +277,7 @@ func madd*[F; Tw: static Twisted](
    r.y *= SexticNonResidue
  r.x.prod(t4, r.y)       # 18. X₃ <- t₄ Y₃,    X₃ = (Y₁ + Y₂Z₁) 3b(X₁ + X₂Z₁)
  t2.prod(t3, t1)         # 19. t₂ <- t₃ t₁,    t₂ = (X₁Y₂ + X₂Y₁)(Y₁Y₂ - 3bZ₁)
  r.x.diffAlias(t2, r.x)  # 20. X₃ <- t₂ - X₃,  X₃ = (X₁Y₂ + X₂Y₁)(Y₁Y₂ - 3bZ₁) - 3b(Y₁ + Y₂Z₁)(X₁ + X₂Z₁)
  r.x.diff(t2, r.x)       # 20. X₃ <- t₂ - X₃,  X₃ = (X₁Y₂ + X₂Y₁)(Y₁Y₂ - 3bZ₁) - 3b(Y₁ + Y₂Z₁)(X₁ + X₂Z₁)
  r.y *= t0               # 21. Y₃ <- Y₃ t₀,    Y₃ = 9bX₁X₂ (X₁ + X₂Z₁)
  t1 *= r.z               # 22. t₁ <- t₁ Z₃,    t₁ = (Y₁Y₂ - 3bZ₁)(Y₁Y₂ + 3bZ₁)
  r.y += t1               # 23. Y₃ <- t₁ + Y₃,  Y₃ = (Y₁Y₂ + 3bZ₁)(Y₁Y₂ - 3bZ₁) + 9bX₁X₂ (X₁ + X₂Z₁)

@@ -270,7 +270,7 @@ func line_eval_fused_add[F](
  F.prod(T.z, C)
  G.prod(T.x, D)
  H.double(G)
  H.diffAlias(F, H)
  H.diff(F, H)
  H += E
  I.prod(T.y, E)


@@ -138,3 +138,17 @@ func mulAcc*[T: Ct[uint32]|Ct[uint64]](t, u, v: var T, a, b: T) {.inline.} =
  addC(carry, v, v, UV[0], Carry(0))
  addC(carry, u, u, UV[1], carry)
  t += T(carry)

func mulDoubleAcc*[T: Ct[uint32]|Ct[uint64]](t, u, v: var T, a, b: T) {.inline.} =
  ## (t, u, v) <- (t, u, v) + 2 * a * b
  var UV: array[2, T]
  var carry: Carry
  mul(UV[1], UV[0], a, b)

  # Double the partial product a*b, carrying into t
  addC(carry, UV[0], UV[0], UV[0], Carry(0))
  addC(carry, UV[1], UV[1], UV[1], carry)
  t += T(carry)

  # Accumulate into the (t, u, v) triple-word accumulator
  addC(carry, v, v, UV[0], Carry(0))
  addC(carry, u, u, UV[1], carry)
  t += T(carry)

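# Illustrative sketch (not from this commit) of the (t, u, v) accumulator
# contract behind mulAcc/mulDoubleAcc, with 32-bit words emulated in uint64.
when isMainModule:
  const mask = 0xFFFF_FFFF'u64
  var (t, u, v) = (0'u64, 0'u64, 0'u64)
  proc mulAccDemo(t, u, v: var uint64, a, b: uint64) =
    let p = a * b                      # 64-bit product of two 32-bit words
    v += p and mask                    # low word into v
    u += (p shr 32) + (v shr 32)       # high word plus carry out of v
    v = v and mask
    t += u shr 32                      # carry out of u
    u = u and mask
  mulAccDemo(t, u, v, mask, mask)      # accumulate (2^32-1)² twice
  mulAccDemo(t, u, v, mask, mask)
  # (t, u, v) holds 2*(2^32-1)² across three words; the top bit sits in t
  doAssert t == 1 and (v + (u shl 32)) == 2'u64 * mask * mask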
@@ -40,7 +40,7 @@ type
    CarryFlag = "@ccc"

  Register* = enum
    rbx, rdx, r8, rax
    rbx, rdx, r8, rax, xmm0

  Constraint* = enum
    ## GCC extended assembly modifier
@@ -369,6 +369,26 @@ func codeFragment(a: var Assembler_x86, instr: string, imm: int, op: Operand) =

  a.operands.incl op.desc

func codeFragment(a: var Assembler_x86, instr: string, reg: Register, op: OperandReuse) =
  # Generate a code fragment
  # ⚠️ Warning:
  # The caller should deal with destination/source operand
  # so that it fits GNU Assembly
  if a.wordBitWidth == 64:
    a.code &= instr & "q %%" & $reg & ", %" & $op.asmId & '\n'
  else:
    a.code &= instr & "l %%" & $reg & ", %" & $op.asmId & '\n'

func codeFragment(a: var Assembler_x86, instr: string, op: OperandReuse, reg: Register) =
  # Generate a code fragment
  # ⚠️ Warning:
  # The caller should deal with destination/source operand
  # so that it fits GNU Assembly
  if a.wordBitWidth == 64:
    a.code &= instr & "q %" & $op.asmId & ", %%" & $reg & '\n'
  else:
    a.code &= instr & "l %" & $op.asmId & ", %%" & $reg & '\n'

func codeFragment(a: var Assembler_x86, instr: string, imm: int, reg: Register) =
  # Generate a code fragment
  # ⚠️ Warning:
@@ -605,6 +625,17 @@ func mov*(a: var Assembler_x86, dst: Operand, imm: int) =
  a.codeFragment("mov", imm, dst)
  # No clobber

func mov*(a: var Assembler_x86, dst: Register, src: OperandReuse) =
  ## Does: dst <- src with dst a fixed register
  a.codeFragment("mov", src, dst)
  # No clobber

func mov*(a: var Assembler_x86, dst: OperandReuse, src: Register) =
  ## Does: dst <- src with src a fixed register
  # doAssert dst.desc.constraint in OutputReg, $dst.repr
  a.codeFragment("mov", src, dst)
  # No clobber

func cmovc*(a: var Assembler_x86, dst, src: Operand) =
  ## Does: dst <- src if the carry flag is set
  doAssert dst.desc.rm in {Reg, ElemsInReg}, "The destination operand must be a register: " & $dst.repr
@@ -696,19 +727,38 @@ func mulx*(a: var Assembler_x86, dHi, dLo, src0: Operand, src1: Register) =

  a.operands.incl src0.desc

func adcx*(a: var Assembler_x86, dst, src: Operand) =
func mulx*(a: var Assembler_x86, dHi: OperandReuse, dLo, src0: Operand, src1: Register) =
  ## Does (dHi, dLo) <- src0 * src1
  doAssert src1 == rdx, "MULX requires the RDX register"
  doAssert dLo.desc.rm in {Reg, ElemsInReg}+SpecificRegisters,
    "The destination operand must be a register " & $dLo.repr
  doAssert dLo.desc.constraint in OutputReg

  let off0 = a.getStrOffset(src0)

  # Annoying AT&T syntax
  if a.wordBitWidth == 64:
    a.code &= "mulxq " & off0 & ", %" & $dLo.desc.asmId & ", %" & $dHi.asmId & '\n'
  else:
    a.code &= "mulxl " & off0 & ", %" & $dLo.desc.asmId & ", %" & $dHi.asmId & '\n'

  a.operands.incl src0.desc

func adcx*(a: var Assembler_x86, dst: Operand|OperandReuse, src: Operand|OperandReuse) =
  ## Does: dst <- dst + src + carry
  ## and only sets the carry flag
  doAssert dst.desc.constraint in OutputReg, $dst.repr
  doAssert dst.desc.rm in {Reg, ElemsInReg}+SpecificRegisters, "The destination operand must be a register: " & $dst.repr
  when dst is Operand:
    doAssert dst.desc.constraint in OutputReg, $dst.repr
    doAssert dst.desc.rm in {Reg, ElemsInReg}+SpecificRegisters, "The destination operand must be a register: " & $dst.repr
  a.codeFragment("adcx", src, dst)
  a.areFlagsClobbered = true

func adox*(a: var Assembler_x86, dst, src: Operand) =
func adox*(a: var Assembler_x86, dst: Operand|OperandReuse, src: Operand|OperandReuse) =
  ## Does: dst <- dst + src + overflow
  ## and only sets the overflow flag
  doAssert dst.desc.constraint in OutputReg, $dst.repr
  doAssert dst.desc.rm in {Reg, ElemsInReg}+SpecificRegisters, "The destination operand must be a register: " & $dst.repr
  when dst is Operand:
    doAssert dst.desc.constraint in OutputReg, $dst.repr
    doAssert dst.desc.rm in {Reg, ElemsInReg}+SpecificRegisters, "The destination operand must be a register: " & $dst.repr
  a.codeFragment("adox", src, dst)
  a.areFlagsClobbered = true


@@ -87,8 +87,8 @@ func prod_complex(r: var QuadraticExt, a, b: QuadraticExt) =
  # TODO: GCC is adding an unexplained 30 cycles tax to this function (~10% slowdown)
  # for seemingly no reason

  when true: # Single-width implementation
    # Clang 330 cycles on i9-9980XE @4.1 GHz
  when false: # Single-width implementation - BLS12-381
    # Clang 348 cycles on i9-9980XE @3.9 GHz
    var a0b0 {.noInit.}, a1b1 {.noInit.}: typeof(r.c0)
    a0b0.prod(a.c0, b.c0)                                         # [1 Mul]
    a1b1.prod(a.c1, b.c1)                                         # [2 Mul]
@@ -102,7 +102,7 @@ func prod_complex(r: var QuadraticExt, a, b: QuadraticExt) =
    r.c1 -= a1b1        # r1 = (b0 + b1)(a0 + a1) - a0b0 - a1b1   # [3 Mul, 2 Add, 3 Sub]

  else: # Double-width implementation with lazy reduction
    # Deactivated for now Clang 360 cycles on i9-9980XE @4.1 GHz
    # Clang 341 cycles on i9-9980XE @3.9 GHz
    var a0b0 {.noInit.}, a1b1 {.noInit.}: doubleWidth(typeof(r.c0))
    var d {.noInit.}: doubleWidth(typeof(r.c0))
    const msbSet = r.c0.typeof.canUseNoCarryMontyMul()
@@ -120,20 +120,19 @@ func prod_complex(r: var QuadraticExt, a, b: QuadraticExt) =
      d -= a0b0
      d -= a1b1
    else:
      d.diffNoReduce(d, a0b0)   # 10 cycles - cumul 152
      d.diffNoReduce(d, a1b1)   # 10 cycles - cumul 162
      a0b0.diff(a0b0, a1b1)     # 18 cycles - cumul 170
    r.c0.reduce(a0b0)           # 68 cycles - cumul 248
    r.c1.reduce(d)              # 68 cycles - cumul 316
      d.diffNoReduce(d, a0b0)   # 11 cycles - cumul 153
      d.diffNoReduce(d, a1b1)   # 11 cycles - cumul 164
      a0b0.diff(a0b0, a1b1)     # 19 cycles - cumul 183
    r.c0.reduce(a0b0)           # 50 cycles - cumul 233
    r.c1.reduce(d)              # 50 cycles - cumul 288

  # Single-width [3 Mul, 2 Add, 3 Sub]
  # 3*81 + 2*14 + 3*12 = 307 theoretical cycles
  # 330 measured
  # 3*88 + 2*14 + 3*14 = 334 theoretical cycles
  # 348 measured
  # Double-Width
  # 316 theoretical cycles
  # 365 measured
  # Reductions can be 2x10 cycles faster using the MCL algorithm
  # but there are still unexplained 50 cycles diff between theory and measured
  # 288 theoretical cycles
  # 329 measured
  # Unexplained 40 cycles diff between theory and measured
  # and unexplained 30 cycles between Clang and GCC
  # - Function calls?
  # - push/pop stack?

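# Illustrative sketch (not from this commit): the 3-multiplication complex
# product that prod_complex implements, on plain integers with i² = -1.
when isMainModule:
  let (a0, a1) = (3, 5)                 # a = a0 + a1·i
  let (b0, b1) = (7, 2)                 # b = b0 + b1·i
  let a0b0 = a0*b0
  let a1b1 = a1*b1
  let s = (a0 + a1)*(b0 + b1)           # one extra addition replaces a multiplication
  let r0 = a0b0 - a1b1                  # real part
  let r1 = s - a0b0 - a1b1              # imaginary part: a0·b1 + a1·b0
  doAssert r0 == a0*b0 - a1*b1 and r1 == a0*b1 + a1*b0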
@@ -168,18 +168,6 @@ func diff*(r: var CubicExt, a, b: CubicExt) =
  r.c1.diff(a.c1, b.c1)
  r.c2.diff(a.c2, b.c2)

func diffAlias*(r: var QuadraticExt, a, b: QuadraticExt) =
  ## Diff ``a`` and ``b`` into ``r``
  ## Handles r and b aliasing
  r.c0.diffAlias(a.c0, b.c0)
  r.c1.diffAlias(a.c1, b.c1)

func diffAlias*(r: var CubicExt, a, b: CubicExt) =
  ## Diff ``a`` and ``b`` into ``r``
  ## Handles r and b aliasing
  r.c0.diffAlias(a.c0, b.c0)
  r.c1.diffAlias(a.c1, b.c1)
  r.c2.diffAlias(a.c2, b.c2)

# Conditional arithmetic
# -------------------------------------------------------------------

@@ -196,21 +196,32 @@ func `*=`*(a: var Fp2, _: typedesc[ξ]) {.inline.} =
type
  Fp12*[C: static Curve] = object
    c0*, c1*, c2*: Fp4[C]
    # c0*, c1*: Fp6[C]

  γ = NonResidue
    # We call the non-residue γ (Gamma) on 𝔽p6 to avoid confusion
    # between non-residues of different tower levels

func `*`*(_: typedesc[γ], a: Fp4): Fp4 {.noInit, inline.} =
  ## Multiply an element of 𝔽p4 by the sextic non-residue
  ## chosen to construct 𝔽p12
  result.c0 = ξ * a.c1
  result.c1 = a.c0

func `*=`*(a: var Fp4, _: typedesc[γ]) {.inline.} =
  a = γ * a

func `*`*(_: typedesc[γ], a: Fp6): Fp6 {.noInit, inline.} =
  ## Multiply an element of 𝔽p6 by the cubic non-residue
  ## chosen to construct 𝔽p12
  ## For all curves γ = v with v the factor for 𝔽p6 coordinate
  ## and v³ = ξ
  ## (c0 + c1 v + c2 v²) v => ξ c2 + c0 v + c1 v²
  result.c0 = ξ * a.c1
  result.c0 = ξ * a.c2
  result.c1 = a.c0
  result.c2 = a.c1

func `*=`*(a: var Fp4, _: typedesc[γ]) {.inline.} =
func `*=`*(a: var Fp6, _: typedesc[γ]) {.inline.} =
  a = γ * a

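# Illustrative sketch (not from this commit): the coefficient rotation above,
# (c0 + c1 v + c2 v²)·v = ξ·c2 + c0·v + c1·v² since v³ = ξ, checked on plain
# integers with an illustrative non-residue.
when isMainModule:
  const xi = 5                          # stand-in for the non-residue ξ
  let c = [1, 2, 3]                     # coefficients of c0 + c1 v + c2 v²
  let rotated = [xi * c[2], c[0], c[1]] # multiply by v: c2 wraps around through ξ
  doAssert rotated == [15, 1, 2]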
# Sparse functions

@@ -71,7 +71,7 @@ The CPU Cycle Count is indicative only. It cannot be used to compare across syst
|double*     | 7212| 2404000000000.000| 0.003| 0.000|
|sum*        | 21058| 7019333333333.333| 0.003| 0.000|
|diff*       | 8884| 2961333333333.333| 0.003| 0.000|
|diffAlias*  | 10| inf| 0.000| 0.000|
|diff*       | 10| inf| 0.000| 0.000|
|double*     | 4186| inf| 0.000| 0.000|
|prod*       | 14486| 1609555555555.555| 0.009| 0.000|
|square*     | 16| inf| 0.000| 0.000|

@@ -287,6 +287,28 @@ proc mainMulHigh() =
      r.prod_high_words(b, a, 2)
      check: bool(r == expected)

proc mainSquare() =
  suite "Multi-precision multiplication" & " [" & $WordBitwidth & "-bit mode]":
    test "Squaring is consistent with multiplication (rBits = 2*aBits)":
      block:
        let a = BigInt[200].fromHex"0xDEADBEEF_DEADBEEF_DEADBEEF_DEADBEEF_DEADBEEF_DEADBEEF_DE"

        var r_mul, r_sqr: BigInt[400]

        r_mul.prod(a, a)
        r_sqr.square(a)
        check: bool(r_mul == r_sqr)

    test "Squaring is consistent with multiplication (truncated)":
      block:
        let a = BigInt[200].fromHex"0xDEADBEEF_DEADBEEF_DEADBEEF_DEADBEEF_DEADBEEF_DEADBEEF_DE"

        var r_mul, r_sqr: BigInt[256]

        r_mul.prod(a, a)
        r_sqr.square(a)
        check: bool(r_mul == r_sqr)

proc mainModular() =
  suite "Modular operations - small modulus" & " [" & $WordBitwidth & "-bit mode]":
    # Vectors taken from Stint - https://github.com/status-im/nim-stint
@@ -637,9 +659,10 @@ proc mainModularInverse() =
    check: bool(r == expected)

mainArith()
# mainMul()
# mainMulHigh()
# mainModular()
# mainNeg()
# mainCopySwap()
# mainModularInverse()
mainMul()
mainMulHigh()
mainSquare()
mainModular()
mainNeg()
mainCopySwap()
mainModularInverse()

@@ -24,74 +24,100 @@ rng.seed(seed)
echo "\n------------------------------------------------------\n"
echo "test_finite_fields_double_width xoshiro512** seed: ", seed

proc randomCurve(C: static Curve) =
  let a = rng.random_unsafe(Fp[C])
  let b = rng.random_unsafe(Fp[C])
template mulTest(rng_gen: untyped): untyped =
  proc `mul _ rng_gen`(C: static Curve) =
    let a = rng_gen(rng, Fp[C])
    let b = rng.random_unsafe(Fp[C])

  var r_fp, r_fpDbl: Fp[C]
  var tmpDbl: FpDbl[C]
    var r_fp{.noInit.}, r_fpDbl{.noInit.}: Fp[C]
    var tmpDbl{.noInit.}: FpDbl[C]

  r_fp.prod(a, b)
  tmpDbl.mulNoReduce(a, b)
  r_fpDbl.reduce(tmpDbl)
    r_fp.prod(a, b)
    tmpDbl.mulNoReduce(a, b)
    r_fpDbl.reduce(tmpDbl)

  doAssert bool(r_fp == r_fpDbl)
    doAssert bool(r_fp == r_fpDbl)

proc randomHighHammingWeight(C: static Curve) =
  let a = rng.random_highHammingWeight(Fp[C])
  let b = rng.random_highHammingWeight(Fp[C])
template sqrTest(rng_gen: untyped): untyped =
  proc `sqr _ rng_gen`(C: static Curve) =
    let a = rng_gen(rng, Fp[C])

  var r_fp, r_fpDbl: Fp[C]
  var tmpDbl: FpDbl[C]
    var mulDbl{.noInit.}, sqrDbl{.noInit.}: FpDbl[C]

  r_fp.prod(a, b)
  tmpDbl.mulNoReduce(a, b)
  r_fpDbl.reduce(tmpDbl)
    mulDbl.mulNoReduce(a, a)
    sqrDbl.squareNoReduce(a)

  doAssert bool(r_fp == r_fpDbl)
    doAssert bool(mulDbl == sqrDbl)

proc random_long01Seq(C: static Curve) =
  let a = rng.random_long01Seq(Fp[C])
  let b = rng.random_long01Seq(Fp[C])

  var r_fp, r_fpDbl: Fp[C]
  var tmpDbl: FpDbl[C]

  r_fp.prod(a, b)
  tmpDbl.mulNoReduce(a, b)
  r_fpDbl.reduce(tmpDbl)

  doAssert bool(r_fp == r_fpDbl)
mulTest(random_unsafe)
mulTest(randomHighHammingWeight)
mulTest(random_long01Seq)
sqrTest(random_unsafe)
sqrTest(randomHighHammingWeight)
sqrTest(random_long01Seq)

suite "Field Multiplication via double-width field elements is consistent with single-width." & " [" & $WordBitwidth & "-bit mode]":
  test "With P-224 field modulus":
    for _ in 0 ..< Iters:
      randomCurve(P224)
      mul_random_unsafe(P224)
    for _ in 0 ..< Iters:
      randomHighHammingWeight(P224)
      mul_randomHighHammingWeight(P224)
    for _ in 0 ..< Iters:
      random_long01Seq(P224)
      mul_random_long01Seq(P224)

  test "With P-256 field modulus":
    for _ in 0 ..< Iters:
      randomCurve(P256)
      mul_random_unsafe(P256)
    for _ in 0 ..< Iters:
      randomHighHammingWeight(P256)
      mul_randomHighHammingWeight(P256)
    for _ in 0 ..< Iters:
      random_long01Seq(P256)
      mul_random_long01Seq(P256)

  test "With BN254_Snarks field modulus":
    for _ in 0 ..< Iters:
      randomCurve(BN254_Snarks)
      mul_random_unsafe(BN254_Snarks)
    for _ in 0 ..< Iters:
      randomHighHammingWeight(BN254_Snarks)
      mul_randomHighHammingWeight(BN254_Snarks)
    for _ in 0 ..< Iters:
      random_long01Seq(BN254_Snarks)
      mul_random_long01Seq(BN254_Snarks)

  test "With BLS12_381 field modulus":
    for _ in 0 ..< Iters:
      randomCurve(BLS12_381)
      mul_random_unsafe(BLS12_381)
    for _ in 0 ..< Iters:
      randomHighHammingWeight(BLS12_381)
      mul_randomHighHammingWeight(BLS12_381)
    for _ in 0 ..< Iters:
      random_long01Seq(BLS12_381)
      mul_random_long01Seq(BLS12_381)

suite "Field Squaring via double-width field elements is consistent with single-width." & " [" & $WordBitwidth & "-bit mode]":
  test "With P-224 field modulus":
    for _ in 0 ..< Iters:
      sqr_random_unsafe(P224)
    for _ in 0 ..< Iters:
      sqr_randomHighHammingWeight(P224)
    for _ in 0 ..< Iters:
      sqr_random_long01Seq(P224)

  test "With P-256 field modulus":
    for _ in 0 ..< Iters:
      sqr_random_unsafe(P256)
    for _ in 0 ..< Iters:
      sqr_randomHighHammingWeight(P256)
    for _ in 0 ..< Iters:
      sqr_random_long01Seq(P256)

  test "With BN254_Snarks field modulus":
    for _ in 0 ..< Iters:
      sqr_random_unsafe(BN254_Snarks)
    for _ in 0 ..< Iters:
      sqr_randomHighHammingWeight(BN254_Snarks)
    for _ in 0 ..< Iters:
      sqr_random_long01Seq(BN254_Snarks)

  test "With BLS12_381 field modulus":
    for _ in 0 ..< Iters:
      sqr_random_unsafe(BLS12_381)
    for _ in 0 ..< Iters:
      sqr_randomHighHammingWeight(BLS12_381)
    for _ in 0 ..< Iters:
      sqr_random_long01Seq(BLS12_381)

@@ -155,8 +155,9 @@ proc subTests(gmpRng: var gmp_randstate_t, a, b, p, r: var mpz_t, C: static Curv
  var r2Test = aTest
  r2Test -= bTest

  # Subtraction with r and b aliasing
  var r3Test = bTest
  r3Test.diffAlias(aTest, r3Test)
  r3Test.diff(aTest, r3Test)

  binary_epilogue(r, a, b, rTest, aBuf, bBuf, "Subtraction (with result)")
  binary_epilogue(r, a, b, r2Test, aBuf, bBuf, "Subtraction (in-place)")
