FpDbl revisited (#144) - 7% perf improvement everywhere, up to 30% in double-width primitives

* reorg mul -> limbs_double_width, rename ConstantineASM -> CttASM

* Implement a specialized scalar squaring path (22% faster than mul)

* Implement "portable" assembly for squaring

* stash part of the changes

* Reorg Montgomery reduction - prepare to introduce Comba optimization

* Implement Comba Montgomery reduce (but it's slower!)

* rename t -> a

* 30% performance improvement by avoiding toOpenArray!

* variable renaming

* Fix 32-bit imports

* slightly better assembly for sub2x

* There is an annoying bottleneck

* use out-of-place Fp assembly instead of in-place

* diffAlias is unneeded now

* cosmetic

* speedup fpDbl sub by 20%

* Fix Fp2 -> Fp6 -> Fp12 towering. It seems 5% faster

* Stash ADCX/ADOX squaring
Mamy Ratsimbazafy 2021-02-01 03:52:27 +01:00 committed by GitHub
parent d12d5faf21
commit 83dcd988b3
28 changed files with 1046 additions and 468 deletions


@ -42,7 +42,7 @@ generated incorrect add-with-carry code.
On x86-64, inline assembly is used to workaround compilers having issues optimizing large integer arithmetic,
and also ensure constant-time code.
This can be deactivated with `"-d:ConstantineASM=false"`:
This can be deactivated with `"-d:CttASM=false"`:
- at a significant performance cost with GCC (~50% slower than Clang).
- at a missed opportunity on recent CPUs that support MULX/ADCX/ADOX instructions (~60% faster than Clang).
- There is a 2.4x perf ratio between using plain GCC vs GCC with inline assembly.


@ -172,6 +172,12 @@ proc mul2xBench*(rLen, aLen, bLen: static int, iters: int) =
bench("Multiplication", $rLen & " <- " & $aLen & " x " & $bLen, iters):
r.prod(a, b)
proc square2xBench*(rLen, aLen: static int, iters: int) =
var r: BigInt[rLen]
let a = rng.random_unsafe(BigInt[aLen])
bench("Squaring", $rLen & " <- " & $aLen & "²", iters):
r.square(a)
proc reduce2x*(T: typedesc, iters: int) =
var r: T
var t: doubleWidth(T)
@ -189,6 +195,7 @@ proc main() =
diff2x(Fp[BLS12_381], iters = 10_000_000)
diff2xNoReduce(Fp[BLS12_381], iters = 10_000_000)
mul2xBench(768, 384, 384, iters = 10_000_000)
square2xBench(768, 384, iters = 10_000_000)
reduce2x(Fp[BLS12_381], iters = 10_000_000)
separator()


@ -232,7 +232,7 @@ proc buildBench(benchName: string, compiler = "", useAsm = true, run = false) =
if compiler != "":
cc = "--cc:" & compiler
if not useAsm:
cc &= " -d:ConstantineASM=false"
cc &= " -d:CttASM=false"
exec "nim c " & cc &
" -d:danger --verbosity:0 -o:build/bench/" & benchName & "_" & compiler & "_" & (if useAsm: "useASM" else: "noASM") &
" --nimcache:nimcache/" & benchName & "_" & compiler & "_" & (if useAsm: "useASM" else: "noASM") &
@ -246,7 +246,7 @@ proc runTests(requireGMP: bool, dumpCmdFile = false, test32bit = false, testASM
if not(td.useGMP and not requireGMP):
var flags = ""
if not testASM:
flags &= " -d:ConstantineASM=false"
flags &= " -d:CttASM=false"
if test32bit:
flags &= " -d:Constantine32"
if td.path in useDebug:


@ -27,58 +27,67 @@ static: doAssert UseASM_X86_64
# Subtraction
# ------------------------------------------------------------
macro sub2x_gen[N: static int](a: var Limbs[N], b: Limbs[N], M: Limbs[N div 2]): untyped =
macro sub2x_gen[N: static int](R: var Limbs[N], A, B: Limbs[N], m: Limbs[N div 2]): untyped =
## Generate an optimized out-of-place double-width subtraction kernel
result = newStmtList()
var ctx = init(Assembler_x86, BaseType)
let
N2 = N div 2
H = N div 2
arrA = init(OperandArray, nimSymbol = a, N, PointerInReg, InputOutput)
# We reuse the reg used for B for overflow detection
arrB = init(OperandArray, nimSymbol = b, N, PointerInReg, InputOutput)
# We could force M as immediate by specializing per moduli
arrM = init(OperandArray, nimSymbol = M, N, PointerInReg, Input)
r = init(OperandArray, nimSymbol = R, N, PointerInReg, InputOutput)
# We reuse the reg used for b for overflow detection
b = init(OperandArray, nimSymbol = B, N, PointerInReg, InputOutput)
# We could force m as immediate by specializing per moduli
M = init(OperandArray, nimSymbol = m, N, PointerInReg, Input)
# If N is too big, we need to spill registers. TODO.
arrT = init(OperandArray, nimSymbol = ident"t", N2, ElemsInReg, Output_EarlyClobber)
arrTadd = init(OperandArray, nimSymbol = ident"tadd", N2, ElemsInReg, Output_EarlyClobber)
u = init(OperandArray, nimSymbol = ident"U", H, ElemsInReg, InputOutput)
v = init(OperandArray, nimSymbol = ident"V", H, ElemsInReg, InputOutput)
let usym = u.nimSymbol
let vsym = v.nimSymbol
result.add quote do:
var `usym`{.noinit.}, `vsym` {.noInit.}: typeof(`A`)
staticFor i, 0, `H`:
`usym`[i] = `A`[i]
staticFor i, `H`, `N`:
`vsym`[i-`H`] = `A`[i]
# Subtraction
for i in 0 ..< N:
ctx.mov arrT[i mod N2], arrA[i]
# u = a[0..<H] - b[0..<H], v = a[H..<N]
for i in 0 ..< H:
if i == 0:
ctx.sub arrT[0], arrB[0]
ctx.sub u[0], b[0]
else:
ctx.sbb arrT[i mod N2], arrB[i]
ctx.mov arrA[i], arrT[i mod N2]
# Interleaved copy the modulus to hide SBB latencies
if i < N2:
ctx.mov arrTadd[i], arrM[i]
ctx.sbb u[i], b[i]
# Everything should be hot in cache now so movs are cheap;
# we can interleave 2 of them per SBB
# v = a[H..<N] - b[H..<N], r[0..<H] = u, u = M
for i in H ..< N:
ctx.mov r[i-H], u[i-H]
ctx.sbb v[i-H], b[i]
ctx.mov u[i-H], M[i-H] # TODO, bottleneck 17% perf: prefetch or inline modulus?
# Mask: underflowed contains 0xFFFF or 0x0000
let underflowed = arrB.reuseRegister()
let underflowed = b.reuseRegister()
ctx.sbb underflowed, underflowed
# Now mask the adder, with 0 or the modulus limbs
for i in 0 ..< N2:
ctx.`and` arrTadd[i], underflowed
for i in 0 ..< H:
ctx.`and` u[i], underflowed
# Add the masked modulus
for i in 0 ..< N2:
for i in 0 ..< H:
if i == 0:
ctx.add arrT[0], arrTadd[0]
ctx.add u[0], v[0]
else:
ctx.adc arrT[i], arrTadd[i]
ctx.mov arrA[i+N2], arrT[i]
ctx.adc u[i], v[i]
ctx.mov r[i+H], u[i]
let t = arrT.nimSymbol
let tadd = arrTadd.nimSymbol
result.add quote do:
var `t`{.noinit.}, `tadd` {.noInit.}: typeof(`a`)
result.add ctx.generate
func sub2x_asm*[N: static int](a: var Limbs[N], b: Limbs[N], M: Limbs[N div 2]) =
func sub2x_asm*[N: static int](r: var Limbs[N], a, b: Limbs[N], M: Limbs[N div 2]) =
## Constant-time double-width subtraction
sub2x_gen(a, b, M)
sub2x_gen(r, a, b, M)
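For intuition, here is a toy model of what sub2x computes (plain Nim with an 8-bit limb and illustrative values, not the Constantine API). Operands are double-width and only partially reduced, below M*B^H, so an underflow is repaired by adding the modulus to the high half only:

const B = 256'u32                       # limb base, H = 1 high limb
const M = 251'u32                       # modulus, M < B
let a = 5'u32
let b = 10'u32
var r = (a - b) and (B*B - 1)           # subtraction mod B², borrow dropped
if a < b:                               # the kernel does this branchlessly
  r = (r + M*B) and (B*B - 1)           # add M shifted into the high limb
doAssert r mod M == (a + M - b) mod M   # r ≡ a - b (mod M)
doAssert r < M*B                        # still partially reduced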


@ -31,10 +31,10 @@ static: doAssert UseASM_X86_64
# Field addition
# ------------------------------------------------------------
macro addmod_gen[N: static int](a: var Limbs[N], b, M: Limbs[N]): untyped =
macro addmod_gen[N: static int](R: var Limbs[N], A, B, m: Limbs[N]): untyped =
## Generate an optimized modular addition kernel
# Register pressure note:
# We could generate a kernel per modulus M by hardocing it as immediate
# We could generate a kernel per modulus m by hardcoding it as immediate
# however this requires
# - duplicating the kernel and also
# - 64-bit immediate encoding is quite large
@ -43,64 +43,66 @@ macro addmod_gen[N: static int](a: var Limbs[N], b, M: Limbs[N]): untyped =
var ctx = init(Assembler_x86, BaseType)
let
arrA = init(OperandArray, nimSymbol = a, N, PointerInReg, InputOutput)
# We reuse the reg used for B for overflow detection
arrB = init(OperandArray, nimSymbol = b, N, PointerInReg, InputOutput)
# We could force M as immediate by specializing per moduli
arrM = init(OperandArray, nimSymbol = M, N, PointerInReg, Input)
r = init(OperandArray, nimSymbol = R, N, PointerInReg, InputOutput)
# We reuse the reg used for b for overflow detection
b = init(OperandArray, nimSymbol = B, N, PointerInReg, InputOutput)
# We could force m as immediate by specializing per moduli
M = init(OperandArray, nimSymbol = m, N, PointerInReg, Input)
# If N is too big, we need to spill registers. TODO.
arrT = init(OperandArray, nimSymbol = ident"t", N, ElemsInReg, Output_EarlyClobber)
arrTsub = init(OperandArray, nimSymbol = ident"tsub", N, ElemsInReg, Output_EarlyClobber)
u = init(OperandArray, nimSymbol = ident"u", N, ElemsInReg, InputOutput)
v = init(OperandArray, nimSymbol = ident"v", N, ElemsInReg, Output_EarlyClobber)
let usym = u.nimSymbol
let vsym = v.nimSymbol
result.add quote do:
var `usym`{.noinit.}, `vsym` {.noInit.}: typeof(`A`)
staticFor i, 0, `N`:
`usym`[i] = `A`[i]
# Addition
for i in 0 ..< N:
ctx.mov arrT[i], arrA[i]
if i == 0:
ctx.add arrT[0], arrB[0]
ctx.add u[0], b[0]
else:
ctx.adc arrT[i], arrB[i]
ctx.adc u[i], b[i]
# Interleaved copy in a second buffer as well
ctx.mov arrTsub[i], arrT[i]
ctx.mov v[i], u[i]
# Mask: overflowed contains 0xFFFF or 0x0000
# TODO: unnecessary if MSB never set, i.e. "canUseNoCarryMontyMul"
let overflowed = arrB.reuseRegister()
let overflowed = b.reuseRegister()
ctx.sbb overflowed, overflowed
# Now substract the modulus
for i in 0 ..< N:
if i == 0:
ctx.sub arrTsub[0], arrM[0]
ctx.sub v[0], M[0]
else:
ctx.sbb arrTsub[i], arrM[i]
ctx.sbb v[i], M[i]
# If it overflows here, it means that it was
# smaller than the modulus and we don't need arrTsub
# smaller than the modulus and we don't need v
ctx.sbb overflowed, 0
# Conditional mov and store result
for i in 0 ..< N:
ctx.cmovnc arrT[i], arrTsub[i]
ctx.mov arrA[i], arrT[i]
ctx.cmovnc u[i], v[i]
ctx.mov r[i], u[i]
let t = arrT.nimSymbol
let tsub = arrTsub.nimSymbol
result.add quote do:
var `t`{.noinit.}, `tsub` {.noInit.}: typeof(`a`)
result.add ctx.generate
func addmod_asm*(a: var Limbs, b, M: Limbs) =
func addmod_asm*(r: var Limbs, a, b, m: Limbs) =
## Constant-time modular addition
addmod_gen(a, b, M)
addmod_gen(r, a, b, m)
# Field subtraction
# ------------------------------------------------------------
macro submod_gen[N: static int](a: var Limbs[N], b, M: Limbs[N]): untyped =
macro submod_gen[N: static int](R: var Limbs[N], A, B, m: Limbs[N]): untyped =
## Generate an optimized modular subtraction kernel
# Register pressure note:
# We could generate a kernel per modulus M by hardocing it as immediate
# We could generate a kernel per modulus m by hardcoding it as immediate
# however this requires
# - duplicating the kernel and also
# - 64-bit immediate encoding is quite large
@ -109,119 +111,121 @@ macro submod_gen[N: static int](a: var Limbs[N], b, M: Limbs[N]): untyped =
var ctx = init(Assembler_x86, BaseType)
let
arrA = init(OperandArray, nimSymbol = a, N, PointerInReg, InputOutput)
# We reuse the reg used for B for overflow detection
arrB = init(OperandArray, nimSymbol = b, N, PointerInReg, InputOutput)
# We could force M as immediate by specializing per moduli
arrM = init(OperandArray, nimSymbol = M, N, PointerInReg, Input)
r = init(OperandArray, nimSymbol = R, N, PointerInReg, InputOutput)
# We reuse the reg used for b for overflow detection
b = init(OperandArray, nimSymbol = B, N, PointerInReg, InputOutput)
# We could force m as immediate by specializing per moduli
M = init(OperandArray, nimSymbol = m, N, PointerInReg, Input)
# If N is too big, we need to spill registers. TODO.
arrT = init(OperandArray, nimSymbol = ident"t", N, ElemsInReg, Output_EarlyClobber)
arrTadd = init(OperandArray, nimSymbol = ident"tadd", N, ElemsInReg, Output_EarlyClobber)
u = init(OperandArray, nimSymbol = ident"U", N, ElemsInReg, InputOutput)
v = init(OperandArray, nimSymbol = ident"V", N, ElemsInReg, Output_EarlyClobber)
let usym = u.nimSymbol
let vsym = v.nimSymbol
result.add quote do:
var `usym`{.noinit.}, `vsym` {.noInit.}: typeof(`A`)
staticFor i, 0, `N`:
`usym`[i] = `A`[i]
# Subtraction
for i in 0 ..< N:
ctx.mov arrT[i], arrA[i]
if i == 0:
ctx.sub arrT[0], arrB[0]
ctx.sub u[0], b[0]
else:
ctx.sbb arrT[i], arrB[i]
ctx.sbb u[i], b[i]
# Interleaved copy the modulus to hide SBB latencies
ctx.mov arrTadd[i], arrM[i]
ctx.mov v[i], M[i]
# Mask: underflowed contains 0xFFFF or 0x0000
let underflowed = arrB.reuseRegister()
let underflowed = b.reuseRegister()
ctx.sbb underflowed, underflowed
# Now mask the adder, with 0 or the modulus limbs
for i in 0 ..< N:
ctx.`and` arrTadd[i], underflowed
ctx.`and` v[i], underflowed
# Add the masked modulus
for i in 0 ..< N:
if i == 0:
ctx.add arrT[0], arrTadd[0]
ctx.add u[0], v[0]
else:
ctx.adc arrT[i], arrTadd[i]
ctx.mov arrA[i], arrT[i]
ctx.adc u[i], v[i]
ctx.mov r[i], u[i]
let t = arrT.nimSymbol
let tadd = arrTadd.nimSymbol
result.add quote do:
var `t`{.noinit.}, `tadd` {.noInit.}: typeof(`a`)
result.add ctx.generate
func submod_asm*(a: var Limbs, b, M: Limbs) =
func submod_asm*(r: var Limbs, a, b, M: Limbs) =
## Constant-time modular subtraction
## Warning, does not handle aliasing of a and b
submod_gen(a, b, M)
submod_gen(r, a, b, M)
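The kernel above is branchless: the borrow of a - b is expanded into an all-zero/all-one mask that selects between adding 0 or adding the modulus, so no branch ever depends on secret data. A single-word model of the trick (plain Nim, illustrative values):

const M = 251'u32
let a = 5'u32
let b = 10'u32
let d = a - b                      # wraps on underflow
let borrow = uint32(a < b)         # 0 or 1 (asm: sbb x, x yields the mask directly)
let mask = 0'u32 - borrow          # 0x00000000 or 0xFFFFFFFF
let r = d + (M and mask)           # add M only when we underflowed
doAssert r == 246                  # 246 ≡ 5 - 10 (mod 251)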
# Field negation
# ------------------------------------------------------------
macro negmod_gen[N: static int](r: var Limbs[N], a, M: Limbs[N]): untyped =
macro negmod_gen[N: static int](R: var Limbs[N], A, m: Limbs[N]): untyped =
## Generate an optimized modular negation kernel
result = newStmtList()
var ctx = init(Assembler_x86, BaseType)
let
arrA = init(OperandArray, nimSymbol = a, N, PointerInReg, Input)
arrR = init(OperandArray, nimSymbol = r, N, PointerInReg, InputOutput)
arrT = init(OperandArray, nimSymbol = ident"t", N, ElemsInReg, Output_EarlyClobber)
# We could force M as immediate by specializing per moduli
# We reuse the reg used for M for overflow detection
arrM = init(OperandArray, nimSymbol = M, N, PointerInReg, InputOutput)
a = init(OperandArray, nimSymbol = A, N, PointerInReg, Input)
r = init(OperandArray, nimSymbol = R, N, PointerInReg, InputOutput)
u = init(OperandArray, nimSymbol = ident"U", N, ElemsInReg, Output_EarlyClobber)
# We could force m as immediate by specializing per moduli
# We reuse the reg used for m for overflow detection
M = init(OperandArray, nimSymbol = m, N, PointerInReg, InputOutput)
# Subtraction M - a
# Subtraction m - a
for i in 0 ..< N:
ctx.mov arrT[i], arrM[i]
ctx.mov u[i], M[i]
if i == 0:
ctx.sub arrT[0], arrA[0]
ctx.sub u[0], a[0]
else:
ctx.sbb arrT[i], arrA[i]
ctx.sbb u[i], a[i]
# Deal with a == 0
let isZero = arrM.reuseRegister()
ctx.mov isZero, arrA[0]
let isZero = M.reuseRegister()
ctx.mov isZero, a[0]
for i in 1 ..< N:
ctx.`or` isZero, arrA[i]
ctx.`or` isZero, a[i]
# Zero result if a == 0
for i in 0 ..< N:
ctx.cmovz arrT[i], isZero
ctx.mov arrR[i], arrT[i]
ctx.cmovz u[i], isZero
ctx.mov r[i], u[i]
let t = arrT.nimSymbol
let usym = u.nimSymbol
result.add quote do:
var `t`{.noinit.}: typeof(`a`)
var `usym`{.noinit.}: typeof(`A`)
result.add ctx.generate
func negmod_asm*(r: var Limbs, a, M: Limbs) =
func negmod_asm*(r: var Limbs, a, m: Limbs) =
## Constant-time modular negation
negmod_gen(r, a, M)
negmod_gen(r, a, m)
# Sanity checks
# ----------------------------------------------------------
when isMainModule:
import ../config/type_bigint, algorithm, strutils
import ../../config/type_bigint, algorithm, strutils
proc mainAdd() =
var a = [SecretWord 0xE3DF60E8F6D0AF9A'u64, SecretWord 0x7B2665C2258A7625'u64, SecretWord 0x68FC9A1D0977C8E0'u64, SecretWord 0xF3DC61ED7DE76883'u64]
var b = [SecretWord 0x78E9C2EF58BB6B78'u64, SecretWord 0x547F65BD19014254'u64, SecretWord 0x556A115819EAD4B5'u64, SecretWord 0x8CA844A546935DC3'u64]
var M = [SecretWord 0xFFFFFFFF00000001'u64, SecretWord 0x0000000000000000'u64, SecretWord 0x00000000FFFFFFFF'u64, SecretWord 0xFFFFFFFFFFFFFFFF'u64]
var m = [SecretWord 0xFFFFFFFF00000001'u64, SecretWord 0x0000000000000000'u64, SecretWord 0x00000000FFFFFFFF'u64, SecretWord 0xFFFFFFFFFFFFFFFF'u64]
var s = "0x5cc923d94f8c1b11cfa5cb7f3e8bb879be66ab7423629d968084a692c47ac647"
a.reverse()
b.reverse()
M.reverse()
m.reverse()
debugecho "--------------------------------"
debugecho "before:"
debugecho " a: ", a.toHex()
debugecho " b: ", b.toHex()
debugecho " m: ", M.toHex()
addmod_asm(a, b, M)
debugecho " m: ", m.toHex()
addmod_asm(a, a, b, m)
debugecho "after:"
debugecho " a: ", a.toHex().tolower
debugecho " s: ", s
@ -229,19 +233,19 @@ when isMainModule:
a = [SecretWord 0x00935a991ca215a6'u64, SecretWord 0x5fbdac6294679337'u64, SecretWord 0x1e41793877b80f12'u64, SecretWord 0x5724cd93cb32932d'u64]
b = [SecretWord 0x19dd4ecfda64ef80'u64, SecretWord 0x92deeb1532169c3d'u64, SecretWord 0x69ce4ee28421cd30'u64, SecretWord 0x4d90ab5a40295321'u64]
M = [SecretWord 0x2523648240000001'u64, SecretWord 0xba344d8000000008'u64, SecretWord 0x6121000000000013'u64, SecretWord 0xa700000000000013'u64]
m = [SecretWord 0x2523648240000001'u64, SecretWord 0xba344d8000000008'u64, SecretWord 0x6121000000000013'u64, SecretWord 0xa700000000000013'u64]
s = "0x1a70a968f7070526f29c9777c67e2f74880fc81afbd9dc42a4b578ee0b5be64e"
a.reverse()
b.reverse()
M.reverse()
m.reverse()
debugecho "--------------------------------"
debugecho "before:"
debugecho " a: ", a.toHex()
debugecho " b: ", b.toHex()
debugecho " m: ", M.toHex()
addmod_asm(a, b, M)
debugecho " m: ", m.toHex()
addmod_asm(a, a, b, m)
debugecho "after:"
debugecho " a: ", a.toHex().tolower
debugecho " s: ", s
@ -249,19 +253,19 @@ when isMainModule:
a = [SecretWord 0x1c7d810f37fc6e0b'u64, SecretWord 0xb91aba4ce339cea3'u64, SecretWord 0xd9f5571ccc4dfd1a'u64, SecretWord 0xf5906ee9df91f554'u64]
b = [SecretWord 0x18394ffe94874c9f'u64, SecretWord 0x6e8a8ad032fc5f15'u64, SecretWord 0x7533a2b46b7e9530'u64, SecretWord 0x2849996b4bb61b48'u64]
M = [SecretWord 0x2523648240000001'u64, SecretWord 0xba344d8000000008'u64, SecretWord 0x6121000000000013'u64, SecretWord 0xa700000000000013'u64]
m = [SecretWord 0x2523648240000001'u64, SecretWord 0xba344d8000000008'u64, SecretWord 0x6121000000000013'u64, SecretWord 0xa700000000000013'u64]
s = "0x0f936c8b8c83baa96d70f79d16362db0ee07f9d137cc923776da08552b481089"
a.reverse()
b.reverse()
M.reverse()
m.reverse()
debugecho "--------------------------"
debugecho "before:"
debugecho " a: ", a.toHex()
debugecho " b: ", b.toHex()
debugecho " m: ", M.toHex()
addmod_asm(a, b, M)
debugecho " m: ", m.toHex()
addmod_asm(a, a, b, m)
debugecho "after:"
debugecho " a: ", a.toHex().tolower
debugecho " s: ", s
@ -269,19 +273,19 @@ when isMainModule:
a = [SecretWord 0xe9d55643'u64, SecretWord 0x580ec4cc3f91cef3'u64, SecretWord 0x11ecbb7d35b36449'u64, SecretWord 0x35535ca31c5dc2ba'u64]
b = [SecretWord 0x97f7ed94'u64, SecretWord 0xbad96eb98204a622'u64, SecretWord 0xbba94400f9a061d6'u64, SecretWord 0x60d3521a0d3dd9eb'u64]
M = [SecretWord 0xffffffff'u64, SecretWord 0xffffffffffffffff'u64, SecretWord 0xffffffff00000000'u64, SecretWord 0x0000000000000001'u64]
m = [SecretWord 0xffffffff'u64, SecretWord 0xffffffffffffffff'u64, SecretWord 0xffffffff00000000'u64, SecretWord 0x0000000000000001'u64]
s = "0x0000000081cd43d812e83385c1967515cd95ff7f2f53c61f9626aebd299b9ca4"
a.reverse()
b.reverse()
M.reverse()
m.reverse()
debugecho "--------------------------"
debugecho "before:"
debugecho " a: ", a.toHex()
debugecho " b: ", b.toHex()
debugecho " m: ", M.toHex()
addmod_asm(a, b, M)
debugecho " m: ", m.toHex()
addmod_asm(a, a, b, m)
debugecho "after:"
debugecho " a: ", a.toHex().tolower
debugecho " s: ", s
@ -292,22 +296,47 @@ when isMainModule:
proc mainSub() =
var a = [SecretWord 0xf9c32e89b80b17bd'u64, SecretWord 0xdbd3069d4ca0e1c3'u64, SecretWord 0x980d4c70d39d5e17'u64, SecretWord 0xd9f0252845f18c3a'u64]
var b = [SecretWord 0x215075604bfd64de'u64, SecretWord 0x36dc488149fc5d3e'u64, SecretWord 0x91fff665385d20fd'u64, SecretWord 0xe980a5a203b43179'u64]
var M = [SecretWord 0xFFFFFFFFFFFFFFFF'u64, SecretWord 0xFFFFFFFFFFFFFFFF'u64, SecretWord 0xFFFFFFFFFFFFFFFF'u64, SecretWord 0xFFFFFFFEFFFFFC2F'u64]
var m = [SecretWord 0xFFFFFFFFFFFFFFFF'u64, SecretWord 0xFFFFFFFFFFFFFFFF'u64, SecretWord 0xFFFFFFFFFFFFFFFF'u64, SecretWord 0xFFFFFFFEFFFFFC2F'u64]
var s = "0xd872b9296c0db2dfa4f6be1c02a48485060d560b9b403d19f06f7f86423d5ac1"
a.reverse()
b.reverse()
M.reverse()
m.reverse()
debugecho "--------------------------------"
debugecho "before:"
debugecho " a: ", a.toHex()
debugecho " b: ", b.toHex()
debugecho " m: ", M.toHex()
submod_asm(a, b, M)
debugecho " m: ", m.toHex()
submod_asm(a, a, b, m)
debugecho "after:"
debugecho " a: ", a.toHex().tolower
debugecho " s: ", s
debugecho " ok: ", a.toHex().tolower == s
mainSub()
proc mainSubOutplace() =
var a = [SecretWord 0xf9c32e89b80b17bd'u64, SecretWord 0xdbd3069d4ca0e1c3'u64, SecretWord 0x980d4c70d39d5e17'u64, SecretWord 0xd9f0252845f18c3a'u64]
var b = [SecretWord 0x215075604bfd64de'u64, SecretWord 0x36dc488149fc5d3e'u64, SecretWord 0x91fff665385d20fd'u64, SecretWord 0xe980a5a203b43179'u64]
var m = [SecretWord 0xFFFFFFFFFFFFFFFF'u64, SecretWord 0xFFFFFFFFFFFFFFFF'u64, SecretWord 0xFFFFFFFFFFFFFFFF'u64, SecretWord 0xFFFFFFFEFFFFFC2F'u64]
var s = "0xd872b9296c0db2dfa4f6be1c02a48485060d560b9b403d19f06f7f86423d5ac1"
a.reverse()
b.reverse()
m.reverse()
var r: typeof(a)
debugecho "--------------------------------"
debugecho "before:"
debugecho " a: ", a.toHex()
debugecho " b: ", b.toHex()
debugecho " m: ", m.toHex()
submod_asm(r, a, b, m)
debugecho "after:"
debugecho " r: ", r.toHex().tolower
debugecho " s: ", s
debugecho " ok: ", r.toHex().tolower == s
mainSubOutplace()


@ -28,31 +28,31 @@ static: doAssert UseASM_X86_32
proc finalSubNoCarry*(
ctx: var Assembler_x86,
r: Operand or OperandArray,
t, M, scratch: OperandArray
a, M, scratch: OperandArray
) =
## Reduce `t` into `r` modulo `M`
## Reduce `a` into `r` modulo `M`
let N = M.len
ctx.comment "Final substraction (no carry)"
for i in 0 ..< N:
ctx.mov scratch[i], t[i]
ctx.mov scratch[i], a[i]
if i == 0:
ctx.sub scratch[i], M[i]
else:
ctx.sbb scratch[i], M[i]
# If we borrowed it means that we were smaller than
# the modulus and we don't need "scratch"
for i in 0 ..< N:
ctx.cmovnc t[i], scratch[i]
ctx.mov r[i], t[i]
ctx.cmovnc a[i], scratch[i]
ctx.mov r[i], a[i]
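finalSubNoCarry speculatively subtracts the modulus and keeps the result only if the subtraction did not borrow. A plain-Nim model of the selection (the `if` here is variable-time; the kernel uses cmovnc so the choice never branches on secret data):

const M = 251'u32
var a = 260'u32                 # partially reduced input, in [0, 2*M)
let scratch = a - M             # speculative subtraction, wraps if a < M
if a >= M:                      # asm: borrow flag of the SBB chain + cmovnc
  a = scratch
doAssert a == 9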
proc finalSubCanOverflow*(
ctx: var Assembler_x86,
r: Operand or OperandArray,
t, M, scratch: OperandArray,
a, M, scratch: OperandArray,
overflowReg: Operand
) =
## Reduce `t` into `r` modulo `M`
## Reduce `a` into `r` modulo `M`
## To be used when the final subtraction can
## also depend on the carry flag
## This is in particular possible when the MSB
@ -65,7 +65,7 @@ proc finalSubCanOverflow*(
let N = M.len
ctx.comment "Final substraction (may carry)"
for i in 0 ..< N:
ctx.mov scratch[i], t[i]
ctx.mov scratch[i], a[i]
if i == 0:
ctx.sub scratch[i], M[i]
else:
@ -74,10 +74,10 @@ proc finalSubCanOverflow*(
ctx.sbb overflowReg, 0
# If we borrowed it means that we were smaller than
# the modulus and we don't need "scratch"
for i in 0 ..< N:
ctx.cmovnc t[i], scratch[i]
ctx.mov r[i], t[i]
ctx.cmovnc a[i], scratch[i]
ctx.mov r[i], a[i]
# Montgomery reduction
@ -85,7 +85,7 @@ proc finalSubCanOverflow*(
macro montyRed_gen[N: static int](
r_MR: var array[N, SecretWord],
t_MR: array[N*2, SecretWord],
a_MR: array[N*2, SecretWord],
M_MR: array[N, SecretWord],
m0ninv_MR: BaseType,
canUseNoCarryMontyMul: static bool
@ -148,13 +148,13 @@ macro montyRed_gen[N: static int](
# ---------------------------------------------------------
# for i in 0 .. n-1:
# hi <- 0
# m <- t[i] * m0ninv mod 2^w (i.e. simple multiplication)
# m <- a[i] * m0ninv mod 2^w (i.e. simple multiplication)
# for j in 0 .. n-1:
# (hi, lo) <- t[i+j] + m * M[j] + hi
# t[i+j] <- lo
# t[i+n] += hi
# (hi, lo) <- a[i+j] + m * M[j] + hi
# a[i+j] <- lo
# a[i+n] += hi
# for i in 0 .. n-1:
# r[i] = t[i+n]
# r[i] = a[i+n]
# if r >= M:
# r -= M
@ -162,14 +162,15 @@ macro montyRed_gen[N: static int](
doAssert N <= 6, "The Assembly-optimized montgomery multiplication requires at most 6 limbs."
result.add quote do:
`eax` = BaseType `t_MR`[0]
`scratchSym`[1 .. `N`-1] = `t_MR`.toOpenArray(1, `N`-1)
`eax` = BaseType `a_MR`[0]
staticFor i, 0, `N`: # Do NOT use Nim slice/toOpenArray, they are not inlined
`scratchSym`[i] = `a_MR`[i]
ctx.mov scratch[N], rRAX
ctx.imul rRAX, m0ninv # m <- t[i] * m0ninv mod 2^w
ctx.imul rRAX, m0ninv # m <- a[i] * m0ninv mod 2^w
ctx.mov scratch[0], rRAX
# scratch: [t[0] * m0, t[1], t[2], t[3], t[0]] for 4 limbs
# scratch: [a[0] * m0, a[1], a[2], a[3], a[0]] for 4 limbs
for i in 0 ..< N:
ctx.comment ""
@ -217,23 +218,23 @@ macro montyRed_gen[N: static int](
ctx = init(Assembler_x86, BaseType)
let r = init(OperandArray, nimSymbol = r_MR, N, PointerInReg, InputOutput_EnsureClobber)
let t = init(OperandArray, nimSymbol = t_MR, N*2, PointerInReg, Input)
let a = init(OperandArray, nimSymbol = a_MR, N*2, PointerInReg, Input)
let extraRegNeeded = N-2
let tsub = init(OperandArray, nimSymbol = ident"tsub", extraRegNeeded, ElemsInReg, InputOutput_EnsureClobber)
let tsubsym = tsub.nimSymbol
let t = init(OperandArray, nimSymbol = ident"t", extraRegNeeded, ElemsInReg, InputOutput_EnsureClobber)
let tsym = t.nimSymbol
result.add quote do:
var `tsubsym` {.noInit.}: Limbs[`extraRegNeeded`]
var `tsym` {.noInit.}: Limbs[`extraRegNeeded`]
# This does t[i+n] += hi
# This does a[i+n] += hi
# but in a separate carry chain, fused with the
# copy "r[i] = t[i+n]"
# copy "r[i] = a[i+n]"
for i in 0 ..< N:
if i == 0:
ctx.add scratch[i], t[i+N]
ctx.add scratch[i], a[i+N]
else:
ctx.adc scratch[i], t[i+N]
ctx.adc scratch[i], a[i+N]
let reuse = repackRegisters(tsub, scratch[N], scratch[N+1])
let reuse = repackRegisters(t, scratch[N], scratch[N+1])
if canUseNoCarryMontyMul:
ctx.finalSubNoCarry(r, scratch, M, reuse)
@ -245,10 +246,10 @@ macro montyRed_gen[N: static int](
func montRed_asm*[N: static int](
r: var array[N, SecretWord],
t: array[N*2, SecretWord],
a: array[N*2, SecretWord],
M: array[N, SecretWord],
m0ninv: BaseType,
canUseNoCarryMontyMul: static bool
) =
## Constant-time Montgomery reduction
montyRed_gen(r, t, M, m0ninv, canUseNoCarryMontyMul)
montyRed_gen(r, a, M, m0ninv, canUseNoCarryMontyMul)
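The `staticFor` copy above (replacing `toOpenArray`) is where the 30% gain from the commit message comes from: the slice assignment goes through an openArray fat pointer that the C compiler did not inline. A standalone sketch of the two patterns (plain Nim, illustrative size and data):

const N = 4
let a = [1'u64, 2, 3, 4, 5, 6, 7, 8]        # stand-in for the double-width input
var scratch: array[N, uint64]
# Before: slice assignment through toOpenArray (hidden, non-inlined copy)
scratch[0 .. N-1] = a.toOpenArray(0, N-1)
# After: an explicit per-element copy the compiler can fully unroll
# (Constantine's staticFor forces the unroll at compile time)
for i in 0 ..< N:
  scratch[i] = a[i]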


@ -37,7 +37,7 @@ static: doAssert UseASM_X86_64
macro montyRedx_gen[N: static int](
r_MR: var array[N, SecretWord],
t_MR: array[N*2, SecretWord],
a_MR: array[N*2, SecretWord],
M_MR: array[N, SecretWord],
m0ninv_MR: BaseType,
canUseNoCarryMontyMul: static bool
@ -109,13 +109,13 @@ macro montyRedx_gen[N: static int](
# ---------------------------------------------------------
# for i in 0 .. n-1:
# hi <- 0
# m <- t[i] * m0ninv mod 2^w (i.e. simple multiplication)
# m <- a[i] * m0ninv mod 2^w (i.e. simple multiplication)
# for j in 0 .. n-1:
# (hi, lo) <- t[i+j] + m * M[j] + hi
# t[i+j] <- lo
# t[i+n] += hi
# (hi, lo) <- a[i+j] + m * M[j] + hi
# a[i+j] <- lo
# a[i+n] += hi
# for i in 0 .. n-1:
# r[i] = t[i+n]
# r[i] = a[i+n]
# if r >= M:
# r -= M
@ -124,12 +124,13 @@ macro montyRedx_gen[N: static int](
result.add quote do:
`edx` = BaseType(`m0ninv_MR`)
`scratchSym`[0 .. `N`-1] = `t_MR`.toOpenArray(0, `N`-1)
staticFor i, 0, `N`: # Do NOT use Nim slice/toOpenArray, they are not inlined
`scratchSym`[i] = `a_MR`[i]
for i in 0 ..< N:
# RDX contains m0ninv at the start of each loop
ctx.comment ""
ctx.imul rRDX, scratch[0] # m <- t[i] * m0ninv mod 2^w
ctx.imul rRDX, scratch[0] # m <- a[i] * m0ninv mod 2^w
ctx.comment "---- Reduction " & $i
ctx.`xor` scratch[N], scratch[N]
@ -156,23 +157,23 @@ macro montyRedx_gen[N: static int](
ctx = init(Assembler_x86, BaseType)
let r = init(OperandArray, nimSymbol = r_MR, N, PointerInReg, InputOutput_EnsureClobber)
let t = init(OperandArray, nimSymbol = t_MR, N*2, PointerInReg, Input)
let a = init(OperandArray, nimSymbol = a_MR, N*2, PointerInReg, Input)
let extraRegNeeded = N-1
let tsub = init(OperandArray, nimSymbol = ident"tsub", extraRegNeeded, ElemsInReg, InputOutput_EnsureClobber)
let tsubsym = tsub.nimSymbol
let t = init(OperandArray, nimSymbol = ident"t", extraRegNeeded, ElemsInReg, InputOutput_EnsureClobber)
let tsym = t.nimSymbol
result.add quote do:
var `tsubsym` {.noInit.}: Limbs[`extraRegNeeded`]
var `tsym` {.noInit.}: Limbs[`extraRegNeeded`]
# This does t[i+n] += hi
# This does a[i+n] += hi
# but in a separate carry chain, fused with the
# copy "r[i] = t[i+n]"
# copy "r[i] = a[i+n]"
for i in 0 ..< N:
if i == 0:
ctx.add scratch[i], t[i+N]
ctx.add scratch[i], a[i+N]
else:
ctx.adc scratch[i], t[i+N]
ctx.adc scratch[i], a[i+N]
let reuse = repackRegisters(tsub, scratch[N])
let reuse = repackRegisters(t, scratch[N])
if canUseNoCarryMontyMul:
ctx.finalSubNoCarry(r, scratch, M, reuse)
@ -184,10 +185,10 @@ macro montyRedx_gen[N: static int](
func montRed_asm_adx_bmi2*[N: static int](
r: var array[N, SecretWord],
t: array[N*2, SecretWord],
a: array[N*2, SecretWord],
M: array[N, SecretWord],
m0ninv: BaseType,
canUseNoCarryMontyMul: static bool
) =
## Constant-time Montgomery reduction
montyRedx_gen(r, t, M, m0ninv, canUseNoCarryMontyMul)
montyRedx_gen(r, a, M, m0ninv, canUseNoCarryMontyMul)


@ -30,6 +30,9 @@ import
static: doAssert UseASM_X86_64 # Need 8 registers just for mul
# and 32-bit only has 8 max.
# Multiplication
# -----------------------------------------------------------------------------------------------
macro mul_gen[rLen, aLen, bLen: static int](r: var Limbs[rLen], a: Limbs[aLen], b: Limbs[bLen]) =
## Comba multiplication generator
## `a`, `b`, `r` can have a different number of limbs
@ -114,7 +117,12 @@ macro mul_gen[rLen, aLen, bLen: static int](r: var Limbs[rLen], a: Limbs[aLen],
ctx.`xor` v, v
ctx.`xor` t, t
for i in 0 ..< min(aLen+bLen, rLen):
let stopEx = min(aLen+bLen, rLen)
for i in 0 ..< stopEx:
# Invariant for product scanning:
# if we multiply accumulate by a[k1] * b[k2]
# we have k1+k2 == i
let ib = min(bLen-1, i)
let ia = i - ib
for j in 0 ..< min(aLen - ia, ib+1):
@ -127,7 +135,7 @@ macro mul_gen[rLen, aLen, bLen: static int](r: var Limbs[rLen], a: Limbs[aLen],
ctx.mov arrR[i], v
if i != min(aLen+bLen, rLen) - 1:
if i != stopEx - 1:
ctx.mov v, u
ctx.mov u, t
ctx.`xor` t, t
@ -144,3 +152,140 @@ func mul_asm*[rLen, aLen, bLen: static int](r: var Limbs[rLen], a: Limbs[aLen],
## Multi-precision Multiplication
## Assumes r doesn't alias a or b
mul_gen(r, a, b)
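The index bookkeeping above (`ib`, `ia` and the inner `j` loop) enumerates, for each result column `i`, exactly the pairs (k1, k2) with k1 + k2 == i that exist in both operands. A quick standalone check of that invariant for aLen = 3, bLen = 2 (plain Nim):

const aLen = 3
const bLen = 2
for i in 0 ..< aLen + bLen - 1:            # every product column
  let ib = min(bLen - 1, i)
  let ia = i - ib
  for j in 0 ..< min(aLen - ia, ib + 1):
    let (k1, k2) = (ia + j, ib - j)
    doAssert k1 + k2 == i                  # the product-scanning invariant
    echo "column ", i, ": a[", k1, "] * b[", k2, "]"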
# Squaring
# -----------------------------------------------------------------------------------------------
macro square_gen[rLen, aLen: static int](r: var Limbs[rLen], a: Limbs[aLen]) =
## Comba squaring generator
## `a` and `r` can have a different number of limbs
## if `r`.limbs.len < a.limbs.len * 2
## The result will be truncated, i.e. it will be
## a² (mod (2^WordBitwidth)^r.limbs.len)
##
## Assumes r doesn't alias a
result = newStmtList()
var ctx = init(Assembler_x86, BaseType)
let
arrR = init(OperandArray, nimSymbol = r, rLen, PointerInReg, InputOutput_EnsureClobber)
arrA = init(OperandArray, nimSymbol = a, aLen, PointerInReg, Input)
t = Operand(
desc: OperandDesc(
asmId: "[t]",
nimSymbol: ident"t",
rm: Reg,
constraint: Output_EarlyClobber,
cEmit: "t"
)
)
u = Operand(
desc: OperandDesc(
asmId: "[u]",
nimSymbol: ident"u",
rm: Reg,
constraint: Output_EarlyClobber,
cEmit: "u"
)
)
v = Operand(
desc: OperandDesc(
asmId: "[v]",
nimSymbol: ident"v",
rm: Reg,
constraint: Output_EarlyClobber,
cEmit: "v"
)
)
# MUL requires RAX and RDX
rRAX = Operand(
desc: OperandDesc(
asmId: "[rax]",
nimSymbol: ident"rax",
rm: RAX,
constraint: Output_EarlyClobber,
cEmit: "rax"
)
)
rRDX = Operand(
desc: OperandDesc(
asmId: "[rdx]",
nimSymbol: ident"rdx",
rm: RDX,
constraint: Output_EarlyClobber,
cEmit: "rdx"
)
)
# Prologue
let tsym = t.desc.nimSymbol
let usym = u.desc.nimSymbol
let vsym = v.desc.nimSymbol
let eax = rRAX.desc.nimSymbol
let edx = rRDX.desc.nimSymbol
result.add quote do:
var `tsym`{.noInit.}, `usym`{.noInit.}, `vsym`{.noInit.}: BaseType
var `eax`{.noInit.}, `edx`{.noInit.}: BaseType
# Algorithm
ctx.`xor` u, u
ctx.`xor` v, v
ctx.`xor` t, t
let stopEx = min(aLen*2, rLen)
for i in 0 ..< stopEx:
# Invariant for product scanning:
# if we multiply accumulate by a[k1] * a[k2]
# we have k1+k2 == i
let ib = min(aLen-1, i)
let ia = i - ib
for j in 0 ..< min(aLen - ia, ib+1):
let k1 = ia+j
let k2 = ib-j
if k1 < k2:
# (t, u, v) <- (t, u, v) + 2 * a[k1] * a[k2]
ctx.mov rRAX, arrA[k2]
ctx.mul rdx, rax, arrA[k1], rax
ctx.add rRAX, rRAX
ctx.adc rRDX, rRDX
ctx.adc t, 0
ctx.add v, rRAX
ctx.adc u, rRDX
ctx.adc t, 0
elif k1 == k2:
# (t, u, v) <- (t, u, v) + a[k1] * a[k2]
ctx.mov rRAX, arrA[k2]
ctx.mul rdx, rax, arrA[k1], rax
ctx.add v, rRAX
ctx.adc u, rRDX
ctx.adc t, 0
else:
discard
ctx.mov arrR[i], v
if i != stopEx - 1:
ctx.mov v, u
ctx.mov u, t
ctx.`xor` t, t
if aLen*2 < rLen:
ctx.`xor` rRAX, rRAX
for i in aLen*2 ..< rLen:
ctx.mov arrR[i], rRAX
# Codegen
result.add ctx.generate
func square_asm*[rLen, aLen: static int](r: var Limbs[rLen], a: Limbs[aLen]) =
## Multi-precision Squaring
## Assumes r doesn't alias a
square_gen(r, a)
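The k1 < k2 branch doubles the cross product because a[k1]*a[k2] and a[k2]*a[k1] land in the same column, while each diagonal a[k]² appears once. A standalone check of the identity with two 32-bit limbs held in a uint64 (illustrative, not the Constantine types):

# a = a1*B + a0, so a² = a0² + 2*a0*a1*B + a1²*B²: cross terms are doubled
let a0 = 0xDEADBEEF'u64
let a1 = 0x12345678'u64
const B = 1'u64 shl 32
let a = a1 * B + a0
# column accumulation as square_gen does, everything mod 2^64
let viaColumns = a0*a0 + 2*a0*a1*B + a1*a1*B*B
doAssert viaColumns == a * a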


@ -195,3 +195,135 @@ func mul_asm_adx_bmi2*[rLen, aLen, bLen: static int](r: var Limbs[rLen], a: Limb
## Multi-precision Multiplication
## Assumes r doesn't alias a or b
mulx_gen(r, a, b)
# Squaring
# -----------------------------------------------------------------------------------------------
# TODO: We use 16 registers but GCC/Clang still complain :/
# macro sqrx_gen[rLen, aLen: static int](rx: var Limbs[rLen], ax: Limbs[aLen]) =
# ## Squaring
# ## `a` and `r` can have a different number of limbs
# ## if `r`.limbs.len < a.limbs.len * 2
# ## The result will be truncated, i.e. it will be
# ## a² (mod (2^WordBitwidth)^r.limbs.len)
# ##
# ## Assumes r doesn't alias a
# result = newStmtList()
#
# var ctx = init(Assembler_x86, BaseType)
# let
# # Register count with 6 limbs:
# # r + a + rax + rdx = 4
# # t = 2 * a.len = 12
# # We use the full x86 register set.
#
# r = init(OperandArray, nimSymbol = rx, rLen, PointerInReg, InputOutput)
# a = init(OperandArray, nimSymbol = ax, aLen, PointerInReg, Input)
#
# N = a.len
# tSlots = a.len * 2
# # If aLen is too big, we need to spill registers. TODO.
# t = init(OperandArray, nimSymbol = ident"t", tSlots, ElemsInReg, Output_EarlyClobber)
#
# # MULX requires RDX
# rRDX = Operand(
# desc: OperandDesc(
# asmId: "[rdx]",
# nimSymbol: ident"rdx",
# rm: RDX,
# constraint: Output_EarlyClobber,
# cEmit: "rdx"
# )
# )
#
# # Scratch spaces for carries
# rRAX = Operand(
# desc: OperandDesc(
# asmId: "[rax]",
# nimSymbol: ident"rax",
# rm: RAX,
# constraint: Output_EarlyClobber,
# cEmit: "rax"
# )
# )
#
# # Prologue
# # -------------------------------
# let tsym = t.nimSymbol
# let eax = rRAX.desc.nimSymbol
# let edx = rRDX.desc.nimSymbol
# result.add quote do:
# var `tsym`{.noInit.}: array[`N`, BaseType]
# var `eax`{.noInit.}, `edx`{.noInit.}: BaseType
#
# # Algorithm
# # -------------------------------
#
# block: # Triangle
# # i = 0
# # ----------------
# ctx.mov rRDX, a[0]
# # Put a[1..<N] in unused registers, 4 mov per cycle on x86
# for i in 1 ..< N:
# ctx.mov t[i+N], a[i]
#
# let # Carry handlers
# hi = r.reuseRegister()
# lo = rRAX
#
# for j in 1 ..< N:
# # (carry, t[j]) <- a[j] * a[0] with a[j] in t[j+N]
# ctx.mulx t[j], rRAX, t[j+N], rdx
# if j == 1:
# ctx.add t[j-1], rRAX
# else:
# ctx.adc t[j-1], rRAX
# ctx.adc t[N-1], 0
#
# for i in 1 ..< N-1:
# ctx.comment " Process squaring triangle " & $i
# ctx.mov rRDX, a[i]
# ctx.`xor` t[i+N], t[i+N] # Clear flags
# for j in i+1 ..< N:
# ctx.mulx hi, lo, t[j+N], rdx
# ctx.adox t[i+j], lo
# if j == N-1:
# break
# ctx.adcx t[i+j+1], hi
#
# ctx.comment " Accumulate last carries in i+N word"
# # t[i+N] is already 0
# ctx.adcx hi, t[i+N]
# ctx.adox t[i+N], hi
#
# block:
# ctx.comment "Finish: (t[2*i+1], t[2*i]) <- 2*t[2*i] + a[i]*a[i]"
#
# # Restore result
# ctx.mov r.reuseRegister(), xmm0
#
# ctx.mov rRDX, a[0]
#
# # (t[2*i+1], t[2*i]) <- 2*t[2*i] + a[i]*a[i]
# for i in 0 ..< N:
# ctx.mulx rRAX, rRDX, a[i], rdx
# ctx.add t[2*i], t[2*i]
# ctx.adc t[2*i+1], 0
# ctx.add t[2*i], rRDX
# if i != N - 1:
# ctx.mov rRDX, a[i+1]
# ctx.adc t[2*i+1], rRAX
# ctx.mov r[i], t[i]
#
# # Move the rest
# for i in N ..< min(rLen, 2*N):
# ctx.mov r[i], t[i]
#
# # Codegen
# result.add ctx.generate
#
# func square_asm_adx_bmi2*[rLen, aLen: static int](r: var Limbs[rLen], a: Limbs[aLen]) =
# ## Multi-precision Squaring
# ## Assumes r doesn't alias a
# sqrx_gen(r, a)


@ -10,7 +10,9 @@ import
../config/[common, type_bigint],
../primitives,
./limbs,
./limbs_modular
./limbs_double_width,
./limbs_modular,
./limbs_montgomery
export BigInt
@ -269,6 +271,10 @@ func prod*[rBits, aBits, bBits](r: var BigInt[rBits], a: BigInt[aBits], b: BigIn
## if r.bits < a.bits + b.bits
## the multiplication will overflow.
## It will be truncated if it cannot fit in r limbs.
##
## Truncation is at the limb level, NOT the bit level
## It is recommended to only use
## rBits >= aBits + bBits unless you know what you are doing.
r.limbs.prod(a.limbs, b.limbs)
func mul*[aBits, bBits](a: var BigInt[aBits], b: BigInt[bBits]) =
@ -298,6 +304,19 @@ func prod_high_words*[rBits, aBits, bBits](r: var BigInt[rBits], a: BigInt[aBits
# i.e. prod_high_words(result, P, a, w)
r.limbs.prod_high_words(a.limbs, b.limbs, lowestWordIndex)
func square*[rBits, aBits](r: var BigInt[rBits], a: BigInt[aBits]) =
## Multi-precision squaring
## r <- a²
## `a`, `r` can have different sizes
## if r.bits < a.bits * 2
## the multiplication will overflow.
## It will be truncated if it cannot fit in r limbs.
##
## Truncation is at the limb level, NOT the bit level
## It is recommended to only use
## rBits >= aBits * 2 unless you know what you are doing.
r.limbs.square(a.limbs)
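"Truncation at the limb level" means the result is kept mod (2^WordBitWidth)^r.limbs.len: whole low limbs survive and the high limbs of the full product are dropped. A one-limb illustration (plain Nim, not the BigInt API):

let a = 0xFFFF_FFFF_FFFF_FFFF'u64
let b = 3'u64
let r = a * b                     # (2^64 - 1) * 3 mod 2^64: high limb dropped
doAssert r == 0xFFFF_FFFF_FFFF_FFFD'u64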
# Bit Manipulation
# ------------------------------------------------------------


@ -135,7 +135,7 @@ func setOne*(a: var FF) {.inline.} =
func `+=`*(a: var FF, b: FF) {.inline, meter.} =
## In-place addition modulo p
when UseASM_X86_64 and a.mres.limbs.len <= 6: # TODO: handle spilling
addmod_asm(a.mres.limbs, b.mres.limbs, FF.fieldMod().limbs)
addmod_asm(a.mres.limbs, a.mres.limbs, b.mres.limbs, FF.fieldMod().limbs)
else:
var overflowed = add(a.mres, b.mres)
overflowed = overflowed or not(a.mres < FF.fieldMod())
@ -144,7 +144,7 @@ func `+=`*(a: var FF, b: FF) {.inline, meter.} =
func `-=`*(a: var FF, b: FF) {.inline, meter.} =
## In-place substraction modulo p
when UseASM_X86_64 and a.mres.limbs.len <= 6: # TODO: handle spilling
submod_asm(a.mres.limbs, b.mres.limbs, FF.fieldMod().limbs)
submod_asm(a.mres.limbs, a.mres.limbs, b.mres.limbs, FF.fieldMod().limbs)
else:
let underflowed = sub(a.mres, b.mres)
discard cadd(a.mres, FF.fieldMod(), underflowed)
@ -152,7 +152,7 @@ func `-=`*(a: var FF, b: FF) {.inline, meter.} =
func double*(a: var FF) {.inline, meter.} =
## Double ``a`` modulo p
when UseASM_X86_64 and a.mres.limbs.len <= 6: # TODO: handle spilling
addmod_asm(a.mres.limbs, a.mres.limbs, FF.fieldMod().limbs)
addmod_asm(a.mres.limbs, a.mres.limbs, a.mres.limbs, FF.fieldMod().limbs)
else:
var overflowed = double(a.mres)
overflowed = overflowed or not(a.mres < FF.fieldMod())
@ -162,8 +162,7 @@ func sum*(r: var FF, a, b: FF) {.inline, meter.} =
## Sum ``a`` and ``b`` into ``r`` modulo p
## r is initialized/overwritten
when UseASM_X86_64 and a.mres.limbs.len <= 6: # TODO: handle spilling
r = a
addmod_asm(r.mres.limbs, b.mres.limbs, FF.fieldMod().limbs)
addmod_asm(r.mres.limbs, a.mres.limbs, b.mres.limbs, FF.fieldMod().limbs)
else:
var overflowed = r.mres.sum(a.mres, b.mres)
overflowed = overflowed or not(r.mres < FF.fieldMod())
@ -178,20 +177,7 @@ func diff*(r: var FF, a, b: FF) {.inline, meter.} =
## `r` is initialized/overwritten
## Requires r != b
when UseASM_X86_64 and a.mres.limbs.len <= 6: # TODO: handle spilling
r = a
submod_asm(r.mres.limbs, b.mres.limbs, FF.fieldMod().limbs)
else:
var underflowed = r.mres.diff(a.mres, b.mres)
discard cadd(r.mres, FF.fieldMod(), underflowed)
func diffAlias*(r: var FF, a, b: FF) {.inline, meter.} =
## Substract `b` from `a` and store the result into `r`.
## `r` is initialized/overwritten
## Handles r == b
when UseASM_X86_64 and a.mres.limbs.len <= 6: # TODO: handle spilling
var t = a
submod_asm(t.mres.limbs, b.mres.limbs, FF.fieldMod().limbs)
r = t
submod_asm(r.mres.limbs, a.mres.limbs, b.mres.limbs, FF.fieldMod().limbs)
else:
var underflowed = r.mres.diff(a.mres, b.mres)
discard cadd(r.mres, FF.fieldMod(), underflowed)
@ -205,8 +191,7 @@ func double*(r: var FF, a: FF) {.inline, meter.} =
## Double ``a`` into ``r``
## `r` is initialized/overwritten
when UseASM_X86_64 and a.mres.limbs.len <= 6: # TODO: handle spilling
r = a
addmod_asm(r.mres.limbs, a.mres.limbs, FF.fieldMod().limbs)
addmod_asm(r.mres.limbs, a.mres.limbs, a.mres.limbs, FF.fieldMod().limbs)
else:
var overflowed = r.mres.double(a.mres)
overflowed = overflowed or not(r.mres < FF.fieldMod())
@ -223,10 +208,7 @@ func square*(r: var FF, a: FF) {.inline, meter.} =
func neg*(r: var FF, a: FF) {.inline, meter.} =
## Negate modulo p
when UseASM_X86_64 and defined(gcc):
# Clang and every compiler besides GCC
# can cleanly optimized this
# especially on FF2
when UseASM_X86_64:
negmod_asm(r.mres.limbs, a.mres.limbs, FF.fieldMod().limbs)
else:
# If a = 0 we need r = 0 and not r = M


@ -12,7 +12,8 @@ import
./bigints,
./finite_fields,
./limbs,
./limbs_double_width
./limbs_double_width,
./limbs_montgomery
when UseASM_X86_64:
import assembly/limbs_asm_modular_dbl_width_x86
@ -27,10 +28,17 @@ template doubleWidth*(T: typedesc[Fp]): typedesc =
## Return the double-width type matching with Fp
FpDbl[T.C]
func `==`*(a, b: FpDbl): SecretBool {.inline.} =
a.limbs2x == b.limbs2x
func mulNoReduce*(r: var FpDbl, a, b: Fp) {.inline.} =
## Store the product of ``a`` by ``b`` into ``r``
r.limbs2x.prod(a.mres.limbs, b.mres.limbs)
func squareNoReduce*(r: var FpDbl, a: Fp) {.inline.} =
## Store the square of ``a`` into ``r``
r.limbs2x.square(a.mres.limbs)
func reduce*(r: var Fp, a: FpDbl) {.inline.} =
## Reduce a double-width field element into r
const N = r.mres.limbs.len
@ -42,20 +50,16 @@ func reduce*(r: var Fp, a: FpDbl) {.inline.} =
Fp.canUseNoCarryMontyMul()
)
func diffNoInline(r: var FpDbl, a, b: FpDbl): Borrow =
r.limbs2x.diff(a.limbs2x, b.limbs2x)
func diffNoReduce*(r: var FpDbl, a, b: FpDbl) =
## Double-width substraction without reduction
discard diffNoInline(r, a, b)
discard r.limbs2x.diff(a.limbs2x, b.limbs2x)
func diff*(r: var FpDbl, a, b: FpDbl) =
func diff*(r: var FpDbl, a, b: FpDbl) {.inline.} =
## Double-width modular substraction
when false: # TODO slower
r = a
sub2x_asm(r.limbs2x, b.limbs2x, FpDbl.C.Mod.limbs)
when UseASM_X86_64:
sub2x_asm(r.limbs2x, a.limbs2x, b.limbs2x, FpDbl.C.Mod.limbs)
else:
var underflowed = SecretBool diffNoInline(r, a, b)
var underflowed = SecretBool r.limbs2x.diff(a.limbs2x, b.limbs2x)
const N = r.limbs2x.len div 2
const M = FpDbl.C.Mod
@ -65,8 +69,6 @@ func diff*(r: var FpDbl, a, b: FpDbl) =
addC(carry, sum, r.limbs2x[i+N], M.limbs[i], carry)
underflowed.ccopy(r.limbs2x[i+N], sum)
func `-=`*(a: var FpDbl, b: FpDbl) =
when false: # TODO slower
sub2x_asm(a.limbs2x, b.limbs2x, FpDbl.C.Mod.limbs)
else:
a.diff(a, b)
func `-=`*(a: var FpDbl, b: FpDbl) {.inline.} =
## Double-width modular substraction
a.diff(a, b)
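The point of the NoReduce/reduce split is lazy reduction: accumulate several double-width results and pay for a single Montgomery reduction. A sketch against the API in this diff (the curve choice and the wrapper proc are illustrative, imports omitted):

# r = a*b - c*d (mod p) with one reduction instead of two
proc mulSub(r: var Fp[BLS12_381], a, b, c, d: Fp[BLS12_381]) =
  var ab {.noInit.}, cd {.noInit.}: FpDbl[BLS12_381]
  ab.mulNoReduce(a, b)            # double-width product, no reduction
  cd.mulNoReduce(c, d)
  ab.diff(ab, cd)                 # double-width modular subtraction (sub2x path)
  r.reduce(ab)                    # single Montgomery reduction at the end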


@ -12,9 +12,6 @@ import
when UseASM_X86_32:
import ./assembly/limbs_asm_x86
when UseASM_X86_64:
import ./assembly/limbs_asm_mul_x86
import ./assembly/limbs_asm_mul_x86_adx_bmi2
# ############################################################
#
@ -335,89 +332,6 @@ func cneg*(a: var Limbs, ctl: CTBool) =
{.pop.} # inline
# Multiplication
# ------------------------------------------------------------
func prod*[rLen, aLen, bLen: static int](r: var Limbs[rLen], a: Limbs[aLen], b: Limbs[bLen]) =
## Multi-precision multiplication
## r <- a*b
##
## `a`, `b`, `r` can have a different number of limbs
## if `r`.limbs.len < a.limbs.len + b.limbs.len
## The result will be truncated, i.e. it will be
## a * b (mod (2^WordBitwidth)^r.limbs.len)
##
## `r` must not alias ``a`` or ``b``
when UseASM_X86_64 and aLen <= 6:
if ({.noSideEffect.}: hasBmi2()) and ({.noSideEffect.}: hasAdx()):
mul_asm_adx_bmi2(r, a, b)
else:
mul_asm(r, a, b)
elif UseASM_X86_64:
mul_asm(r, a, b)
else:
# We use Product Scanning / Comba multiplication
var t, u, v = Zero
staticFor i, 0, min(a.len+b.len, r.len):
const ib = min(b.len-1, i)
const ia = i - ib
staticFor j, 0, min(a.len - ia, ib+1):
mulAcc(t, u, v, a[ia+j], b[ib-j])
r[i] = v
v = u
u = t
t = Zero
if aLen+bLen < rLen:
for i in aLen+bLen ..< rLen:
r[i] = Zero
func prod_high_words*[rLen, aLen, bLen](
r: var Limbs[rLen],
a: Limbs[aLen], b: Limbs[bLen],
lowestWordIndex: static int) =
## Multi-precision multiplication keeping only high words
## r <- a*b >> (2^WordBitWidth)^lowestWordIndex
##
## `a`, `b`, `r` can have a different number of limbs
## if `r`.limbs.len < a.limbs.len + b.limbs.len - lowestWordIndex
## The result will be truncated, i.e. it will be
## a * b >> (2^WordBitWidth)^lowestWordIndex (mod (2^WordBitwidth)^r.limbs.len)
#
# This is useful for
# - Barret reduction
# - Approximating multiplication by a fractional constant in the form f(a) = K/C * a
# with K and C known at compile-time.
# We can instead find a well chosen M = (2^WordBitWidth)^w, with M > C (i.e. M is a power of 2 bigger than C)
# Precompute P = K*M/C at compile-time
# and at runtime do P*a/M <=> P*a >> (WordBitWidth*w)
# i.e. prod_high_words(result, P, a, w)
# We use Product Scanning / Comba multiplication
var t, u, v = Zero # Will raise warning on empty iterations
var z: Limbs[rLen] # zero-init, ensure on stack and removes in-place problems
# The previous 2 columns can affect the lowest word due to carries
# but not the ones before (we accumulate in 3 words (t, u, v))
const w = lowestWordIndex - 2
staticFor i, max(0, w), min(a.len+b.len, r.len+lowestWordIndex):
const ib = min(b.len-1, i)
const ia = i - ib
staticFor j, 0, min(a.len - ia, ib+1):
mulAcc(t, u, v, a[ia+j], b[ib-j])
when i >= lowestWordIndex:
z[i-lowestWordIndex] = v
v = u
u = t
t = Zero
r = z
# Division
# ------------------------------------------------------------


@ -11,10 +11,9 @@ import
../primitives,
./limbs
when UseASM_X86_32:
import ./assembly/limbs_asm_montred_x86
when UseASM_X86_64:
import ./assembly/limbs_asm_montred_x86_adx_bmi2
import ./assembly/limbs_asm_mul_x86
import ./assembly/limbs_asm_mul_x86_adx_bmi2
# ############################################################
#
@ -25,77 +24,170 @@ when UseASM_X86_64:
# No exceptions allowed
{.push raises: [].}
# Montgomery Reduction
# Multiplication
# ------------------------------------------------------------
# This is the reduction part of SOS (Separated Operand Scanning) modular multiplication technique
# TODO upstream, using Limbs[N] breaks semcheck
func montyRed*[N: static int](
r: var array[N, SecretWord],
t: array[N*2, SecretWord],
M: array[N, SecretWord],
m0ninv: BaseType, canUseNoCarryMontyMul: static bool) =
## Montgomery reduce a double-width bigint modulo M
# - Analyzing and Comparing Montgomery Multiplication Algorithms
# Cetin Kaya Koc and Tolga Acar and Burton S. Kaliski Jr.
# http://pdfs.semanticscholar.org/5e39/41ff482ec3ee41dc53c3298f0be085c69483.pdf
#
# - Arithmetic of Finite Fields
# Chapter 5 of Guide to Pairing-Based Cryptography
# Jean Luc Beuchat, Luis J. Dominguez Perez, Sylvain Duquesne, Nadia El Mrabet, Laura Fuentes-Castañeda, Francisco Rodríguez-Henríquez, 2017
# https://www.researchgate.net/publication/319538235_Arithmetic_of_Finite_Fields
#
# Algorithm
# Inputs:
# - N number of limbs
# - t[0 ..< 2N] (double-width input to reduce)
# - M[0 ..< N] The field modulus (must be odd for Montgomery reduction)
# - m0ninv: Montgomery Reduction magic number = -1/M[0]
# Output:
# - r[0 ..< N], in the Montgomery domain
# Parameters:
# - w, the word width usually 64 on 64-bit platforms or 32 on 32-bit
#
# for i in 0 .. n-1:
# C <- 0
# m <- t[i] * m0ninv mod 2^w (i.e. simple multiplication)
# for j in 0 .. n-1:
# (C, S) <- t[i+j] + m * M[j] + C
# t[i+j] <- S
# t[i+n] += C
# for i in 0 .. n-1:
# r[i] = t[i+n]
# if r >= M:
# r -= M
#
# Important note: `t[i+n] += C` should propagate the carry
# to the higher limb if any, thank you "implementation detail"
# missing from paper.
when UseASM_X86_64 and r.len <= 6:
func prod*[rLen, aLen, bLen: static int](r: var Limbs[rLen], a: Limbs[aLen], b: Limbs[bLen]) =
## Multi-precision multiplication
## r <- a*b
##
## `a`, `b`, `r` can have a different number of limbs
## if `r`.limbs.len < a.limbs.len + b.limbs.len
## The result will be truncated, i.e. it will be
## a * b (mod (2^WordBitwidth)^r.limbs.len)
##
## `r` must not alias ``a`` or ``b``
when UseASM_X86_64 and aLen <= 6:
if ({.noSideEffect.}: hasBmi2()) and ({.noSideEffect.}: hasAdx()):
montRed_asm_adx_bmi2(r, t, M, m0ninv, canUseNoCarryMontyMul)
mul_asm_adx_bmi2(r, a, b)
else:
montRed_asm(r, t, M, m0ninv, canUseNoCarryMontyMul)
elif UseASM_X86_32 and r.len <= 6:
# TODO: Assembly faster than GCC but slower than Clang
montRed_asm(r, t, M, m0ninv, canUseNoCarryMontyMul)
mul_asm(r, a, b)
elif UseASM_X86_64:
mul_asm(r, a, b)
else:
var t = t # Copy "t" for mutation and ensure on stack
var res: typeof(r) # Accumulator
staticFor i, 0, N:
var C = Zero
let m = t[i] * SecretWord(m0ninv)
staticFor j, 0, N:
muladd2(C, t[i+j], m, M[j], t[i+j], C)
res[i] = C
# We use Product Scanning / Comba multiplication
var t, u, v = Zero
const stopEx = min(a.len+b.len, r.len)
# This does t[i+n] += C
# but in a separate carry chain, fused with the
# copy "r[i] = t[i+n]"
var carry = Carry(0)
staticFor i, 0, N:
addC(carry, res[i], t[i+N], res[i], carry)
staticFor i, 0, stopEx:
# Invariant for product scanning:
# if we multiply accumulate by a[k1] * b[k2]
# we have k1+k2 == i
const ib = min(b.len-1, i)
const ia = i - ib
staticFor j, 0, min(a.len - ia, ib+1):
mulAcc(t, u, v, a[ia+j], b[ib-j])
# Final substraction
discard res.csub(M, SecretWord(carry).isNonZero() or not(res < M))
r = res
r[i] = v
when i < stopEx-1:
v = u
u = t
t = Zero
if aLen+bLen < rLen:
for i in aLen+bLen ..< rLen:
r[i] = Zero
func prod_high_words*[rLen, aLen, bLen](
r: var Limbs[rLen],
a: Limbs[aLen], b: Limbs[bLen],
lowestWordIndex: static int) =
## Multi-precision multiplication keeping only high words
## r <- a*b >> (2^WordBitWidth)^lowestWordIndex
##
## `a`, `b`, `r` can have a different number of limbs
## if `r`.limbs.len < a.limbs.len + b.limbs.len - lowestWordIndex
## The result will be truncated, i.e. it will be
## a * b >> (2^WordBitWidth)^lowestWordIndex (mod (2^WordBitwidth)^r.limbs.len)
#
# This is useful for
# - Barret reduction
# - Approximating multiplication by a fractional constant in the form f(a) = K/C * a
# with K and C known at compile-time.
# We can instead find a well chosen M = (2^WordBitWidth)^w, with M > C (i.e. M is a power of 2 bigger than C)
# Precompute P = K*M/C at compile-time
# and at runtime do P*a/M <=> P*a >> (WordBitWidth*w)
# i.e. prod_high_words(result, P, a, w)
# We use Product Scanning / Comba multiplication
var t, u, v = Zero # Will raise warning on empty iterations
var z: Limbs[rLen] # zero-init, ensure on stack and removes in-place problems
# The previous 2 columns can affect the lowest word due to carries
# but not the ones before (we accumulate in 3 words (t, u, v))
const w = lowestWordIndex - 2
const stopEx = min(a.len+b.len, r.len+lowestWordIndex)
staticFor i, max(0, w), stopEx:
const ib = min(b.len-1, i)
const ia = i - ib
staticFor j, 0, min(a.len - ia, ib+1):
mulAcc(t, u, v, a[ia+j], b[ib-j])
when i >= lowestWordIndex:
z[i-lowestWordIndex] = v
when i < stopEx-1:
v = u
u = t
t = Zero
r = z
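The comment block above replaces division by a compile-time constant C with a multiply and a shift: pick M = (2^WordBitWidth)^w > C, precompute P = K*M/C, and then K/C * a ≈ (P*a) >> log2(M). A single-word version of the arithmetic (illustrative values; in general P needs enough guard bits for the approximation to be exact over the input range):

const K = 3'u64
const C = 7'u64
const M = 1'u64 shl 32            # power of two > C
const P = K * M div C             # precomputed at compile time
let a = 1_000_000'u64
doAssert (a * P) shr 32 == a * K div C   # both sides are 428571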
func square_Comba[rLen, aLen](
r: var Limbs[rLen],
a: Limbs[aLen]) =
## Multi-precision squaring using Comba / Product Scanning
var t, u, v = Zero
const stopEx = min(a.len * 2, r.len)
staticFor i, 0, stopEx:
# Invariant for product scanning:
# if we multiply accumulate by a[k1] * a[k2]
# we have k1+k2 == i
const ib = min(a.len-1, i)
const ia = i - ib
staticFor j, 0, min(a.len - ia, ib+1):
const k1 = ia+j
const k2 = ib-j
when k1 < k2:
mulDoubleAcc(t, u, v, a[k1], a[k2])
elif k1 == k2:
mulAcc(t, u, v, a[k1], a[k2])
else:
discard
r[i] = v
when i < stopEx-1:
v = u
u = t
t = Zero
if aLen*2 < rLen:
for i in aLen*2 ..< rLen:
r[i] = Zero
func square_operandScan[rLen, aLen](
r: var Limbs[rLen],
a: Limbs[aLen]) =
## Multi-precision squaring using Operand Scanning
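## Note: currently unused: `square*` below dispatches to
## square_asm / square_Comba.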
const stopEx = min(a.len * 2, r.len)
var t: typeof(r) # zero-init, ensure on stack
var C = Zero
static: doAssert aLen * 2 == rLen, "Truncated square operand scanning is not implemented"
staticFor i, 0, stopEx:
staticFor j, i+1, stopEx:
muladd2(C, t[i+j], a[j], a[i], t[i+j], C)
t[i+stopEx] = C
staticFor i, 0, aLen:
# (t[2*i+1], t[2*i]) <- 2*t[2*i] + a[i]*a[i]
var u, v = Zero
var carry: Carry
# a[i] * a[i]
mul(u, v, a[i], a[i])
# 2*t[2*i]
addC(carry, t[2*i], t[2*i], t[2*i], Carry(0))
addC(carry, t[2*i+1], Zero, Zero, carry)
# 2*t[2*i] + a[i] * a[i]
addC(carry, t[2*i], t[2*i], u, Carry(0))
addC(carry, t[2*i+1], Zero, v, carry)
r = t
func square*[rLen, aLen](
r: var Limbs[rLen],
a: Limbs[aLen]) =
## Multi-precision squaring
## r <- a²
##
## if `r`.limbs.len < a.limbs.len * 2
## The result will be truncated, i.e. it will be
## a² (mod (2^WordBitwidth)^r.limbs.len)
##
## `r` must not alias ``a``
when UseASM_X86_64:
square_asm(r, a)
else:
square_comba(r, a)


@ -14,10 +14,13 @@ import
../primitives,
./limbs
when UseASM_X86_32:
import ./assembly/limbs_asm_montred_x86
when UseASM_X86_64:
import
./assembly/limbs_asm_montmul_x86,
./assembly/limbs_asm_montmul_x86_adx_bmi2
./assembly/limbs_asm_montmul_x86_adx_bmi2,
./assembly/limbs_asm_montred_x86_adx_bmi2
# ############################################################
#
@ -42,7 +45,7 @@ when UseASM_X86_64:
# No exceptions allowed
{.push raises: [].}
# Implementation
# Montgomery Multiplication
# ------------------------------------------------------------
func montyMul_CIOS_nocarry(r: var Limbs, a, b, M: Limbs, m0ninv: BaseType) =
@ -162,6 +165,9 @@ func montyMul_FIPS(r: var Limbs, a, b, M: Limbs, m0ninv: BaseType) =
discard z.csub(M, v.isNonZero() or not(z < M))
r = z
# Montgomery Squaring
# ------------------------------------------------------------
func montySquare_CIOS_nocarry(r: var Limbs, a, M: Limbs, m0ninv: BaseType) {.used.}=
## Montgomery Squaring using Coarse Grained Operand Scanning (CIOS)
## and no-carry optimization.
@ -267,6 +273,133 @@ func montySquare_CIOS(r: var Limbs, a, M: Limbs, m0ninv: BaseType) {.used.}=
discard t.csub(M, tN.isNonZero() or not(t < M)) # TODO: (t >= M) is unnecessary for prime in the form (2^64)^w
r = t
# Montgomery Reduction
# ------------------------------------------------------------
func montyRed_CIOS[N: static int](
r: var array[N, SecretWord],
a: array[N*2, SecretWord],
M: array[N, SecretWord],
m0ninv: BaseType) =
## Montgomery reduce a double-width bigint modulo M
# - Analyzing and Comparing Montgomery Multiplication Algorithms
# Cetin Kaya Koc and Tolga Acar and Burton S. Kaliski Jr.
# http://pdfs.semanticscholar.org/5e39/41ff482ec3ee41dc53c3298f0be085c69483.pdf
#
# - Arithmetic of Finite Fields
# Chapter 5 of Guide to Pairing-Based Cryptography
# Jean Luc Beuchat, Luis J. Dominguez Perez, Sylvain Duquesne, Nadia El Mrabet, Laura Fuentes-Castañeda, Francisco Rodríguez-Henríquez, 2017
# https://www.researchgate.net/publication/319538235_Arithmetic_of_Finite_Fields
#
# Algorithm
# Inputs:
# - N number of limbs
# - a[0 ..< 2N] (double-width input to reduce)
# - M[0 ..< N] The field modulus (must be odd for Montgomery reduction)
# - m0ninv: Montgomery Reduction magic number = -1/M[0]
# Output:
# - r[0 ..< N], in the Montgomery domain
# Parameters:
# - w, the word width usually 64 on 64-bit platforms or 32 on 32-bit
#
# for i in 0 .. n-1:
# C <- 0
# m <- a[i] * m0ninv mod 2^w (i.e. simple multiplication)
# for j in 0 .. n-1:
# (C, S) <- a[i+j] + m * M[j] + C
# a[i+j] <- S
# a[i+n] += C
# for i in 0 .. n-1:
# r[i] = a[i+n]
# if r >= M:
# r -= M
#
# Important note: `a[i+n] += C` should propagate the carry
# to the higher limb if any, thank you "implementation detail"
# missing from paper.
var a = a # Copy "a" for mutation and ensure it is on the stack
var res: typeof(r) # Accumulator
staticFor i, 0, N:
var C = Zero
let m = a[i] * SecretWord(m0ninv)
staticFor j, 0, N:
muladd2(C, a[i+j], m, M[j], a[i+j], C)
res[i] = C
# This does a[i+n] += C
# but in a separate carry chain, fused with the
# copy "r[i] = a[i+n]"
var carry = Carry(0)
staticFor i, 0, N:
addC(carry, res[i], a[i+N], res[i], carry)
# Final substraction
discard res.csub(M, SecretWord(carry).isNonZero() or not(res < M))
r = res
func montyRed_Comba[N: static int](
r: var array[N, SecretWord],
a: array[N*2, SecretWord],
M: array[N, SecretWord],
m0ninv: BaseType) =
## Montgomery reduce a double-width bigint modulo M
# We use Product Scanning / Comba multiplication
var t, u, v = Zero
var carry: Carry
var z: typeof(r) # zero-init, ensure on stack and removes in-place problems in tower fields
staticFor i, 0, N:
staticFor j, 0, i:
mulAcc(t, u, v, z[j], M[i-j])
addC(carry, v, v, a[i], Carry(0))
addC(carry, u, u, Zero, carry)
addC(carry, t, t, Zero, carry)
z[i] = v * SecretWord(m0ninv)
mulAcc(t, u, v, z[i], M[0])
v = u
u = t
t = Zero
staticFor i, N, 2*N-1:
staticFor j, i-N+1, N:
mulAcc(t, u, v, z[j], M[i-j])
addC(carry, v, v, a[i], Carry(0))
addC(carry, u, u, Zero, carry)
addC(carry, t, t, Zero, carry)
z[i-N] = v
v = u
u = t
t = Zero
addC(carry, z[N-1], v, a[2*N-1], Carry(0))
# Final substraction
discard z.csub(M, SecretBool(carry) or not(z < M))
r = z
# TODO upstream, using Limbs[N] breaks semcheck
func montyRed*[N: static int](
r: var array[N, SecretWord],
a: array[N*2, SecretWord],
M: array[N, SecretWord],
m0ninv: BaseType, canUseNoCarryMontyMul: static bool) =
## Montgomery reduce a double-width bigint modulo M
when UseASM_X86_64 and r.len <= 6:
if ({.noSideEffect.}: hasBmi2()) and ({.noSideEffect.}: hasAdx()):
montRed_asm_adx_bmi2(r, a, M, m0ninv, canUseNoCarryMontyMul)
else:
montRed_asm(r, a, M, m0ninv, canUseNoCarryMontyMul)
elif UseASM_X86_32 and r.len <= 6:
# TODO: this assembly is faster than GCC's codegen but slower than Clang's
montRed_asm(r, a, M, m0ninv, canUseNoCarryMontyMul)
else:
montyRed_CIOS(r, a, M, m0ninv)
# montyRed_Comba(r, a, M, m0ninv) # Comba variant, measured slower than CIOS
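# Caller-side pattern that this reduction enables (a sketch; mulNoReduce and
# reduce are the double-width helpers exercised by this commit's tests):
#
#   var t {.noInit.}: FpDbl[BLS12_381]
#   t.mulNoReduce(a, b)   # 2N-limb product, no modular reduction
#   r.reduce(t)           # a single Montgomery reduction at the end
#
# Several unreduced double-width terms can be added or subtracted before the
# final reduce, which is what the tower-field multiplication below exploits.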
# Exported API
# ------------------------------------------------------------

View File

@ -54,8 +54,8 @@ const
# TODO: we restrict assembly to 64-bit words for now;
# register spills are needed to support larger limb counts
const ConstantineASM {.booldefine.} = true
const UseASM_X86_32* = ConstantineASM and X86 and GCC_Compatible
const CttASM {.booldefine.} = true
const UseASM_X86_32* = CttASM and X86 and GCC_Compatible
const UseASM_X86_64* = WordBitWidth == 64 and UseASM_X86_32
# ############################################################

View File

@ -90,7 +90,7 @@ func curve_eq_rhs*[F](y2: var F, x: F, Tw: static Twisted) =
else:
{.error: "Only twisted curves are supported on extension field 𝔽".}
y2.diffAlias(t, y2)
y2.diff(t, y2)
when F.C.getCoefA() != 0:
t = x

View File

@ -198,7 +198,7 @@ func sum*[F; Tw: static Twisted](
r.y.sum(Q.x, Q.z) # 15. Y₃ <- X₂ + Z₂
r.x *= r.y # 16. X₃ <- X₃ Y₃ X₃ = (X₁Z₁)(X₂Z₂)
r.y.sum(t0, t2) # 17. Y₃ <- t₀ + t₂ Y₃ = X₁ X₂ + Z₁ Z₂
r.y.diffAlias(r.x, r.y) # 18. Y₃ <- X₃ - Y₃ Y₃ = (X₁ + Z₁)(X₂ + Z₂) - (X₁ X₂ + Z₁ Z₂) = X₁Z₂ + X₂Z₁
r.y.diff(r.x, r.y) # 18. Y₃ <- X₃ - Y₃ Y₃ = (X₁ + Z₁)(X₂ + Z₂) - (X₁ X₂ + Z₁ Z₂) = X₁Z₂ + X₂Z₁
when Tw == OnTwist and F.C.getSexticTwist() == D_Twist:
t0 *= SexticNonResidue
t1 *= SexticNonResidue
@ -214,7 +214,7 @@ func sum*[F; Tw: static Twisted](
r.y *= SexticNonResidue
r.x.prod(t4, r.y) # 25. X₃ <- t₄ Y₃ X₃ = 3b(Y₁Z₂ + Y₂Z₁)(X₁Z₂ + X₂Z₁)
t2.prod(t3, t1) # 26. t₂ <- t₃ t₁ t₂ = (X₁Y₂ + X₂Y₁) (Y₁Y₂ - 3bZ₁Z₂)
r.x.diffAlias(t2, r.x) # 27. X₃ <- t₂ - X₃ X₃ = (X₁Y₂ + X₂Y₁) (Y₁Y₂ - 3bZ₁Z₂) - 3b(Y₁Z₂ + Y₂Z₁)(X₁Z₂ + X₂Z₁)
r.x.diff(t2, r.x) # 27. X₃ <- t₂ - X₃ X₃ = (X₁Y₂ + X₂Y₁) (Y₁Y₂ - 3bZ₁Z₂) - 3b(Y₁Z₂ + Y₂Z₁)(X₁Z₂ + X₂Z₁)
r.y *= t0 # 28. Y₃ <- Y₃ t₀ Y₃ = 9bX₁X₂ (X₁Z₂ + X₂Z₁)
t1 *= r.z # 29. t₁ <- t₁ Z₃ t₁ = (Y₁Y₂ - 3bZ₁Z₂)(Y₁Y₂ + 3bZ₁Z₂)
r.y += t1 # 30. Y₃ <- t₁ + Y₃ Y₃ = (Y₁Y₂ + 3bZ₁Z₂)(Y₁Y₂ - 3bZ₁Z₂) + 9bX₁X₂ (X₁Z₂ + X₂Z₁)
@ -277,7 +277,7 @@ func madd*[F; Tw: static Twisted](
r.y *= SexticNonResidue
r.x.prod(t4, r.y) # 18. X₃ <- t₄ Y₃, X₃ = (Y₁ + Y₂Z₁) 3b(X₁ + X₂Z₁)
t2.prod(t3, t1) # 19. t₂ <- t₃ t₁, t₂ = (X₁Y₂ + X₂Y₁)(Y₁Y₂ - 3bZ₁)
r.x.diffAlias(t2, r.x) # 20. X₃ <- t₂ - X₃, X₃ = (X₁Y₂ + X₂Y₁)(Y₁Y₂ - 3bZ₁) - 3b(Y₁ + Y₂Z₁)(X₁ + X₂Z₁)
r.x.diff(t2, r.x) # 20. X₃ <- t₂ - X₃, X₃ = (X₁Y₂ + X₂Y₁)(Y₁Y₂ - 3bZ₁) - 3b(Y₁ + Y₂Z₁)(X₁ + X₂Z₁)
r.y *= t0 # 21. Y₃ <- Y₃ t₀, Y₃ = 9bX₁X₂ (X₁ + X₂Z₁)
t1 *= r.z # 22. t₁ <- t₁ Z₃, t₁ = (Y₁Y₂ - 3bZ₁)(Y₁Y₂ + 3bZ₁)
r.y += t1 # 23. Y₃ <- t₁ + Y₃, Y₃ = (Y₁Y₂ + 3bZ₁)(Y₁Y₂ - 3bZ₁) + 9bX₁X₂ (X₁ + X₂Z₁)

View File

@ -270,7 +270,7 @@ func line_eval_fused_add[F](
F.prod(T.z, C)
G.prod(T.x, D)
H.double(G)
H.diffAlias(F, H)
H.diff(F, H)
H += E
I.prod(T.y, E)

View File

@ -138,3 +138,17 @@ func mulAcc*[T: Ct[uint32]|Ct[uint64]](t, u, v: var T, a, b: T) {.inline.} =
addC(carry, v, v, UV[0], Carry(0))
addC(carry, u, u, UV[1], carry)
t += T(carry)
func mulDoubleAcc*[T: Ct[uint32]|Ct[uint64]](t, u, v: var T, a, b: T) {.inline.} =
## (t, u, v) <- (t, u, v) + 2 * a * b
var UV: array[2, T]
var carry: Carry
mul(UV[1], UV[0], a, b)
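# In a squaring, each cross product a[i]·a[j] with i ≠ j occurs twice,
# hence the partial product is doubled below before accumulation.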
addC(carry, UV[0], UV[0], UV[0], Carry(0))
addC(carry, UV[1], UV[1], UV[1], carry)
t += T(carry)
addC(carry, v, v, UV[0], Carry(0))
addC(carry, u, u, UV[1], carry)
t += T(carry)

View File

@ -40,7 +40,7 @@ type
CarryFlag = "@ccc"
Register* = enum
rbx, rdx, r8, rax
rbx, rdx, r8, rax, xmm0
Constraint* = enum
## GCC extended assembly modifier
@ -369,6 +369,26 @@ func codeFragment(a: var Assembler_x86, instr: string, imm: int, op: Operand) =
a.operands.incl op.desc
func codeFragment(a: var Assembler_x86, instr: string, reg: Register, op: OperandReuse) =
# Generate a code fragment
# ⚠️ Warning:
# The caller must order the destination/source operands
# to match GNU assembly (AT&T) syntax
if a.wordBitWidth == 64:
a.code &= instr & "q %%" & $reg & ", %" & $op.asmId & '\n'
else:
a.code &= instr & "l %%" & $reg & ", %" & $op.asmId & '\n'
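# For example, on a 64-bit target, ("mov", rdx, op) renders as
#   movq %%rdx, %<op.asmId>
# with <op.asmId> standing for the operand's identifier inside the
# extended-asm template (illustrative rendering).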
func codeFragment(a: var Assembler_x86, instr: string, op: OperandReuse, reg: Register) =
# Generate a code fragment
# ⚠️ Warning:
# The caller must order the destination/source operands
# to match GNU assembly (AT&T) syntax
if a.wordBitWidth == 64:
a.code &= instr & "q %" & $op.asmId & ", %%" & $reg & '\n'
else:
a.code &= instr & "l %" & $op.asmId & ", %%" & $reg & '\n'
func codeFragment(a: var Assembler_x86, instr: string, imm: int, reg: Register) =
# Generate a code fragment
# ⚠️ Warning:
@ -605,6 +625,17 @@ func mov*(a: var Assembler_x86, dst: Operand, imm: int) =
a.codeFragment("mov", imm, dst)
# No clobber
func mov*(a: var Assembler_x86, dst: Register, src: OperandReuse) =
## Does: dst <- src with dst a fixed register
a.codeFragment("mov", src, dst)
# No clobber
func mov*(a: var Assembler_x86, dst: OperandReuse, src: Register) =
## Does: dst <- src with src a fixed register
# doAssert dst.desc.constraint in OutputReg, $dst.repr
a.codeFragment("mov", src, dst)
# No clobber
func cmovc*(a: var Assembler_x86, dst, src: Operand) =
## Does: dst <- src if the carry flag is set
doAssert dst.desc.rm in {Reg, ElemsInReg}, "The destination operand must be a register: " & $dst.repr
@ -696,19 +727,38 @@ func mulx*(a: var Assembler_x86, dHi, dLo, src0: Operand, src1: Register) =
a.operands.incl src0.desc
func adcx*(a: var Assembler_x86, dst, src: Operand) =
func mulx*(a: var Assembler_x86, dHi: OperandReuse, dLo, src0: Operand, src1: Register) =
## Does (dHi, dLo) <- src0 * src1
doAssert src1 == rdx, "MULX requires the RDX register"
doAssert dLo.desc.rm in {Reg, ElemsInReg}+SpecificRegisters,
"The destination operand must be a register " & $dLo.repr
doAssert dLo.desc.constraint in OutputReg
let off0 = a.getStrOffset(src0)
# Annoying AT&T syntax: operand order is reversed vs Intel
if a.wordBitWidth == 64:
a.code &= "mulxq " & off0 & ", %" & $dLo.desc.asmId & ", %" & $dHi.asmId & '\n'
else:
a.code &= "mulxl " & off0 & ", %" & $dLo.desc.asmId & ", %" & $dHi.asmId & '\n'
a.operands.incl src0.desc
func adcx*(a: var Assembler_x86, dst: Operand|OperandReuse, src: Operand|OperandReuse) =
## Does: dst <- dst + src + carry
## and only sets the carry flag
doAssert dst.desc.constraint in OutputReg, $dst.repr
doAssert dst.desc.rm in {Reg, ElemsInReg}+SpecificRegisters, "The destination operand must be a register: " & $dst.repr
when dst is Operand:
doAssert dst.desc.constraint in OutputReg, $dst.repr
doAssert dst.desc.rm in {Reg, ElemsInReg}+SpecificRegisters, "The destination operand must be a register: " & $dst.repr
a.codeFragment("adcx", src, dst)
a.areFlagsClobbered = true
func adox*(a: var Assembler_x86, dst, src: Operand) =
func adox*(a: var Assembler_x86, dst: Operand|OperandReuse, src: Operand|OperandReuse) =
## Does: dst <- dst + src + overflow
## and only sets the overflow flag
doAssert dst.desc.constraint in OutputReg, $dst.repr
doAssert dst.desc.rm in {Reg, ElemsInReg}+SpecificRegisters, "The destination operand must be a register: " & $dst.repr
when dst is Operand:
doAssert dst.desc.constraint in OutputReg, $dst.repr
doAssert dst.desc.rm in {Reg, ElemsInReg}+SpecificRegisters, "The destination operand must be a register: " & $dst.repr
a.codeFragment("adox", src, dst)
a.areFlagsClobbered = true
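# adcx and adox use disjoint flags (CF vs OF), and MULX touches no flags at
# all: this is what lets two independent carry chains run interleaved.
# A sketch with this file's DSL (operand names are made up):
#   ctx.mulx(hi, lo, a[1], rdx)   # (hi, lo) <- a[1] * rdx
#   ctx.adcx(sumLo, lo)           # chain 1 accumulates via the carry flag
#   ctx.adox(sumHi, hi)           # chain 2 accumulates via the overflow flag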

View File

@ -87,8 +87,8 @@ func prod_complex(r: var QuadraticExt, a, b: QuadraticExt) =
# TODO: GCC adds an unexplained ~30 cycle overhead to this function
# (~10% slowdown), with no identified cause
when true: # Single-width implementation
# Clang 330 cycles on i9-9980XE @4.1 GHz
when false: # Single-width implementation - BLS12-381
# Clang 348 cycles on i9-9980XE @3.9 GHz
var a0b0 {.noInit.}, a1b1 {.noInit.}: typeof(r.c0)
a0b0.prod(a.c0, b.c0) # [1 Mul]
a1b1.prod(a.c1, b.c1) # [2 Mul]
@ -102,7 +102,7 @@ func prod_complex(r: var QuadraticExt, a, b: QuadraticExt) =
r.c1 -= a1b1 # r1 = (b0 + b1)(a0 + a1) - a0b0 - a1b1 # [3 Mul, 2 Add, 3 Sub]
else: # Double-width implementation with lazy reduction
# Deactivated for now Clang 360 cycles on i9-9980XE @4.1 GHz
# Clang 341 cycles on i9-9980XE @3.9 GHz
var a0b0 {.noInit.}, a1b1 {.noInit.}: doubleWidth(typeof(r.c0))
var d {.noInit.}: doubleWidth(typeof(r.c0))
const msbSet = r.c0.typeof.canUseNoCarryMontyMul()
@ -120,20 +120,19 @@ func prod_complex(r: var QuadraticExt, a, b: QuadraticExt) =
d -= a0b0
d -= a1b1
else:
d.diffNoReduce(d, a0b0) # 10 cycles - cumul 152
d.diffNoReduce(d, a1b1) # 10 cycles - cumul 162
a0b0.diff(a0b0, a1b1) # 18 cycles - cumul 170
r.c0.reduce(a0b0) # 68 cycles - cumul 248
r.c1.reduce(d) # 68 cycles - cumul 316
d.diffNoReduce(d, a0b0) # 11 cycles - cumul 153
d.diffNoReduce(d, a1b1) # 11 cycles - cumul 164
a0b0.diff(a0b0, a1b1) # 19 cycles - cumul 183
r.c0.reduce(a0b0) # 50 cycles - cumul 233
r.c1.reduce(d) # 50 cycles - cumul 288
# Single-width [3 Mul, 2 Add, 3 Sub]
# 3*81 + 2*14 + 3*12 = 307 theoretical cycles
# 330 measured
# 3*88 + 2*14 + 3*14 = 334 theoretical cycles
# 348 measured
# Double-Width
# 316 theoretical cycles
# 365 measured
# Reductions can be 2x10 faster using MCL algorithm
# but there are still unexplained 50 cycles diff between theo and measured
# 288 theoretical cycles
# 329 measured
# Unexplained 40 cycle gap between theoretical and measured,
# and an unexplained 30 cycle gap between Clang and GCC:
# - Function calls?
# - push/pop stack?

View File

@ -168,18 +168,6 @@ func diff*(r: var CubicExt, a, b: CubicExt) =
r.c1.diff(a.c1, b.c1)
r.c2.diff(a.c2, b.c2)
func diffAlias*(r: var QuadraticExt, a, b: QuadraticExt) =
## Diff ``a`` and ``b`` into ``r``
## Handles r and b aliasing
r.c0.diffAlias(a.c0, b.c0)
r.c1.diffAlias(a.c1, b.c1)
func diffAlias*(r: var CubicExt, a, b: CubicExt) =
## Diff ``a`` and ``b`` into ``r``
## Handles r and b aliasing
r.c0.diffAlias(a.c0, b.c0)
r.c1.diffAlias(a.c1, b.c1)
r.c2.diffAlias(a.c2, b.c2)
# Conditional arithmetic
# -------------------------------------------------------------------

View File

@ -196,21 +196,32 @@ func `*=`*(a: var Fp2, _: typedesc[ξ]) {.inline.} =
type
Fp12*[C: static Curve] = object
c0*, c1*, c2*: Fp4[C]
# c0*, c1*: Fp6[C]
γ = NonResidue
# We call the non-residue γ (Gamma) on 𝔽p6 to avoid confusion between the non-residues
# of different tower levels
func `*`*(_: typedesc[γ], a: Fp4): Fp4 {.noInit, inline.} =
## Multiply an element of 𝔽p4 by the sextic non-residue
## chosen to construct 𝔽p12
result.c0 = ξ * a.c1
result.c1 = a.c0
func `*=`*(a: var Fp4, _: typedesc[γ]) {.inline.} =
a = γ * a
func `*`*(_: typedesc[γ], a: Fp6): Fp6 {.noInit, inline.} =
## Multiply an element of 𝔽p6 by the cubic non-residue
## chosen to construct 𝔽p12
## For all curves γ = v with v the factor for 𝔽p6 coordinate
## and v³ = ξ
## (c0 + c1 v + c2 v²) v => ξ c2 + c0 v + c1 v²
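## Derivation: (c0 + c1 v + c2 v²)·v = c0 v + c1 v² + c2 v³, and v³ = ξ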
result.c0 = ξ * a.c1
result.c0 = ξ * a.c2
result.c1 = a.c0
result.c2 = a.c1
func `*=`*(a: var Fp4, _: typedesc[γ]) {.inline.} =
func `*=`*(a: var Fp6, _: typedesc[γ]) {.inline.} =
a = γ * a
# Sparse functions

View File

@ -71,7 +71,7 @@ The CPU Cycle Count is indicative only. It cannot be used to compare across syst
|double* | 7212| 2404000000000.000| 0.003| 0.000|
|sum* | 21058| 7019333333333.333| 0.003| 0.000|
|diff* | 8884| 2961333333333.333| 0.003| 0.000|
|diffAlias* | 10| inf| 0.000| 0.000|
|diff* | 10| inf| 0.000| 0.000|
|double* | 4186| inf| 0.000| 0.000|
|prod* | 14486| 1609555555555.555| 0.009| 0.000|
|square* | 16| inf| 0.000| 0.000|

View File

@ -287,6 +287,28 @@ proc mainMulHigh() =
r.prod_high_words(b, a, 2)
check: bool(r == expected)
proc mainSquare() =
suite "Multi-precision multiplication" & " [" & $WordBitwidth & "-bit mode]":
test "Squaring is consistent with multiplication (rBits = 2*aBits)":
block:
let a = BigInt[200].fromHex"0xDEADBEEF_DEADBEEF_DEADBEEF_DEADBEEF_DEADBEEF_DEADBEEF_DE"
var r_mul, r_sqr: BigInt[400]
r_mul.prod(a, a)
r_sqr.square(a)
check: bool(r_mul == r_sqr)
test "Squaring is consistent with multiplication (truncated)":
block:
let a = BigInt[200].fromHex"0xDEADBEEF_DEADBEEF_DEADBEEF_DEADBEEF_DEADBEEF_DEADBEEF_DE"
var r_mul, r_sqr: BigInt[256]
r_mul.prod(a, a)
r_sqr.square(a)
check: bool(r_mul == r_sqr)
proc mainModular() =
suite "Modular operations - small modulus" & " [" & $WordBitwidth & "-bit mode]":
# Vectors taken from Stint - https://github.com/status-im/nim-stint
@ -637,9 +659,10 @@ proc mainModularInverse() =
check: bool(r == expected)
mainArith()
# mainMul()
# mainMulHigh()
# mainModular()
# mainNeg()
# mainCopySwap()
# mainModularInverse()
mainMul()
mainMulHigh()
mainSquare()
mainModular()
mainNeg()
mainCopySwap()
mainModularInverse()

View File

@ -24,74 +24,100 @@ rng.seed(seed)
echo "\n------------------------------------------------------\n"
echo "test_finite_fields_double_width xoshiro512** seed: ", seed
proc randomCurve(C: static Curve) =
let a = rng.random_unsafe(Fp[C])
let b = rng.random_unsafe(Fp[C])
template mulTest(rng_gen: untyped): untyped =
proc `mul _ rng_gen`(C: static Curve) =
let a = rng_gen(rng, Fp[C])
let b = rng.random_unsafe(Fp[C])
var r_fp, r_fpDbl: Fp[C]
var tmpDbl: FpDbl[C]
var r_fp{.noInit.}, r_fpDbl{.noInit.}: Fp[C]
var tmpDbl{.noInit.}: FpDbl[C]
r_fp.prod(a, b)
tmpDbl.mulNoReduce(a, b)
r_fpDbl.reduce(tmpDbl)
r_fp.prod(a, b)
tmpDbl.mulNoReduce(a, b)
r_fpDbl.reduce(tmpDbl)
doAssert bool(r_fp == r_fpDbl)
doAssert bool(r_fp == r_fpDbl)
proc randomHighHammingWeight(C: static Curve) =
let a = rng.random_highHammingWeight(Fp[C])
let b = rng.random_highHammingWeight(Fp[C])
template sqrTest(rng_gen: untyped): untyped =
proc `sqr _ rng_gen`(C: static Curve) =
let a = rng_gen(rng, Fp[C])
var r_fp, r_fpDbl: Fp[C]
var tmpDbl: FpDbl[C]
var mulDbl{.noInit.}, sqrDbl{.noInit.}: FpDbl[C]
r_fp.prod(a, b)
tmpDbl.mulNoReduce(a, b)
r_fpDbl.reduce(tmpDbl)
mulDbl.mulNoReduce(a, a)
sqrDbl.squareNoReduce(a)
doAssert bool(r_fp == r_fpDbl)
doAssert bool(mulDbl == sqrDbl)
proc random_long01Seq(C: static Curve) =
let a = rng.random_long01Seq(Fp[C])
let b = rng.random_long01Seq(Fp[C])
var r_fp, r_fpDbl: Fp[C]
var tmpDbl: FpDbl[C]
r_fp.prod(a, b)
tmpDbl.mulNoReduce(a, b)
r_fpDbl.reduce(tmpDbl)
doAssert bool(r_fp == r_fpDbl)
mulTest(random_unsafe)
mulTest(randomHighHammingWeight)
mulTest(random_long01Seq)
sqrTest(random_unsafe)
sqrTest(randomHighHammingWeight)
sqrTest(random_long01Seq)
suite "Field Multiplication via double-width field elements is consistent with single-width." & " [" & $WordBitwidth & "-bit mode]":
test "With P-224 field modulus":
for _ in 0 ..< Iters:
randomCurve(P224)
mul_random_unsafe(P224)
for _ in 0 ..< Iters:
randomHighHammingWeight(P224)
mul_randomHighHammingWeight(P224)
for _ in 0 ..< Iters:
random_long01Seq(P224)
mul_random_long01Seq(P224)
test "With P-256 field modulus":
for _ in 0 ..< Iters:
randomCurve(P256)
mul_random_unsafe(P256)
for _ in 0 ..< Iters:
randomHighHammingWeight(P256)
mul_randomHighHammingWeight(P256)
for _ in 0 ..< Iters:
random_long01Seq(P256)
mul_random_long01Seq(P256)
test "With BN254_Snarks field modulus":
for _ in 0 ..< Iters:
randomCurve(BN254_Snarks)
mul_random_unsafe(BN254_Snarks)
for _ in 0 ..< Iters:
randomHighHammingWeight(BN254_Snarks)
mul_randomHighHammingWeight(BN254_Snarks)
for _ in 0 ..< Iters:
random_long01Seq(BN254_Snarks)
mul_random_long01Seq(BN254_Snarks)
test "With BLS12_381 field modulus":
for _ in 0 ..< Iters:
randomCurve(BLS12_381)
mul_random_unsafe(BLS12_381)
for _ in 0 ..< Iters:
randomHighHammingWeight(BLS12_381)
mul_randomHighHammingWeight(BLS12_381)
for _ in 0 ..< Iters:
random_long01Seq(BLS12_381)
mul_random_long01Seq(BLS12_381)
suite "Field Squaring via double-width field elements is consistent with single-width." & " [" & $WordBitwidth & "-bit mode]":
test "With P-224 field modulus":
for _ in 0 ..< Iters:
sqr_random_unsafe(P224)
for _ in 0 ..< Iters:
sqr_randomHighHammingWeight(P224)
for _ in 0 ..< Iters:
sqr_random_long01Seq(P224)
test "With P-256 field modulus":
for _ in 0 ..< Iters:
sqr_random_unsafe(P256)
for _ in 0 ..< Iters:
sqr_randomHighHammingWeight(P256)
for _ in 0 ..< Iters:
sqr_random_long01Seq(P256)
test "With BN254_Snarks field modulus":
for _ in 0 ..< Iters:
sqr_random_unsafe(BN254_Snarks)
for _ in 0 ..< Iters:
sqr_randomHighHammingWeight(BN254_Snarks)
for _ in 0 ..< Iters:
sqr_random_long01Seq(BN254_Snarks)
test "With BLS12_381 field modulus":
for _ in 0 ..< Iters:
sqr_random_unsafe(BLS12_381)
for _ in 0 ..< Iters:
sqr_randomHighHammingWeight(BLS12_381)
for _ in 0 ..< Iters:
sqr_random_long01Seq(BLS12_381)

View File

@ -155,8 +155,9 @@ proc subTests(gmpRng: var gmp_randstate_t, a, b, p, r: var mpz_t, C: static Curv
var r2Test = aTest
r2Test -= bTest
# Subtraction with r and b aliasing
var r3Test = bTest
r3Test.diffAlias(aTest, r3Test)
r3Test.diff(aTest, r3Test)
binary_epilogue(r, a, b, rTest, aBuf, bBuf, "Subtraction (with result)")
binary_epilogue(r, a, b, r2Test, aBuf, bBuf, "Subtraction (in-place)")