From 83dcd988b323237fab10da4c5690e6eebd572db3 Mon Sep 17 00:00:00 2001 From: Mamy Ratsimbazafy Date: Mon, 1 Feb 2021 03:52:27 +0100 Subject: [PATCH] FpDbl revisited (#144) - 7% perf improvement everywhere, up to 30% in double-width primitives * reorg mul -> limbs_double_width, ConstantineASM CttASM * Implement squaring specialized scalar path (22% faster than mul) * Implement "portable" assembly for squaring * stash part of the changes * Reorg montgomery reduction - prepare to introduce Comba optimization * Implement comba Montgomery reduce (but it's slower!) * rename t -> a * 30% performance improvement by avoiding toOpenArray! * variable renaming * Fix 32-bit imports * slightly better assembly for sub2x * There is an annoying bottleneck * use out-of-place Fp assembly instead of in-place * diffAlias is unneeded now * cosmetic * speedup fpDbl sub by 20% * Fix Fp2 -> Fp6 -> Fp12 towering. It seems 5% faster * Stash ADCX/ADOX squaring --- README.md | 2 +- benchmarks/bench_fp_double_width.nim | 7 + constantine.nimble | 4 +- .../limbs_asm_modular_dbl_width_x86.nim | 69 +++--- .../assembly/limbs_asm_modular_x86.nim | 209 +++++++++------- .../assembly/limbs_asm_montred_x86.nim | 67 ++--- .../limbs_asm_montred_x86_adx_bmi2.nim | 39 +-- .../arithmetic/assembly/limbs_asm_mul_x86.nim | 149 ++++++++++- .../assembly/limbs_asm_mul_x86_adx_bmi2.nim | 132 ++++++++++ constantine/arithmetic/bigints.nim | 21 +- constantine/arithmetic/finite_fields.nim | 32 +-- .../arithmetic/finite_fields_double_width.nim | 32 +-- constantine/arithmetic/limbs.nim | 86 ------- constantine/arithmetic/limbs_double_width.nim | 232 ++++++++++++------ constantine/arithmetic/limbs_montgomery.nim | 137 ++++++++++- constantine/config/common.nim | 4 +- .../elliptic/ec_shortweierstrass_affine.nim | 2 +- .../ec_shortweierstrass_projective.nim | 6 +- constantine/pairing/lines_projective.nim | 2 +- constantine/primitives/extended_precision.nim | 14 ++ .../primitives/macro_assembler_x86.nim | 64 ++++- .../quadratic_extensions.nim | 27 +- .../tower_field_extensions/tower_common.nim | 12 - .../tower_instantiation.nim | 15 +- metering/README.md | 2 +- tests/t_bigints.nim | 35 ++- tests/t_finite_fields_double_width.nim | 110 +++++---- tests/t_finite_fields_vs_gmp.nim | 3 +- 28 files changed, 1046 insertions(+), 468 deletions(-) diff --git a/README.md b/README.md index 9191508..c4c176f 100644 --- a/README.md +++ b/README.md @@ -42,7 +42,7 @@ generated incorrect add-with-carry code. On x86-64, inline assembly is used to workaround compilers having issues optimizing large integer arithmetic, and also ensure constant-time code. -This can be deactivated with `"-d:ConstantineASM=false"`: +This can be deactivated with `"-d:CttASM=false"`: - at a significant performance cost with GCC (~50% slower than Clang). - at misssed opportunity on recent CPUs that support MULX/ADCX/ADOX instructions (~60% faster than Clang). - There is a 2.4x perf ratio between using plain GCC vs GCC with inline assembly. 
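A note on the "squaring specialized scalar path" mentioned in the commit message: both the portable fallback (`square_Comba`) and the assembly generator (`square_gen`) introduced by this patch use product scanning, where result column i collects every product a[k1]*a[k2] with k1 + k2 == i and the off-diagonal products are counted twice. A minimal self-contained sketch of that idea, assuming plain uint32 limbs and a hypothetical `combaSquare` name, with none of Constantine's SecretWord types or constant-time guarantees (illustration only, not the patch's code):

func combaSquare(a: openArray[uint32]): seq[uint32] =
  ## Squaring by product scanning: result column i sums every a[k1]*a[k2]
  ## with k1 + k2 == i, counting off-diagonal products twice.
  let n = a.len
  result = newSeq[uint32](2*n)
  if n == 0: return
  var t, u, v = 0'u64                      # 3-word column accumulator (t:u:v)
  for i in 0 ..< (2*n - 1):
    for k1 in max(0, i - n + 1) .. i div 2:
      let k2 = i - k1
      let p = uint64(a[k1]) * uint64(a[k2])
      let reps = if k1 == k2: 1 else: 2    # a[k1]*a[k2] and a[k2]*a[k1]
      for _ in 1 .. reps:
        v += p and 0xFFFF_FFFF'u64
        u += (p shr 32) + (v shr 32)
        t += u shr 32
        v = v and 0xFFFF_FFFF'u64
        u = u and 0xFFFF_FFFF'u64
    result[i] = uint32(v)
    v = u                                  # shift the accumulator to the next column
    u = t
    t = 0
  result[2*n - 1] = uint32(v)

when isMainModule:
  # (2^64 - 1)² = 0xFFFFFFFF_FFFFFFFE_00000000_00000001, as little-endian 32-bit limbs
  doAssert combaSquare([0xFFFF_FFFF'u32, 0xFFFF_FFFF'u32]) ==
           @[1'u32, 0'u32, 0xFFFF_FFFE'u32, 0xFFFF_FFFF'u32]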
diff --git a/benchmarks/bench_fp_double_width.nim b/benchmarks/bench_fp_double_width.nim index bbb958a..2315de7 100644 --- a/benchmarks/bench_fp_double_width.nim +++ b/benchmarks/bench_fp_double_width.nim @@ -172,6 +172,12 @@ proc mul2xBench*(rLen, aLen, bLen: static int, iters: int) = bench("Multiplication", $rLen & " <- " & $aLen & " x " & $bLen, iters): r.prod(a, b) +proc square2xBench*(rLen, aLen: static int, iters: int) = + var r: BigInt[rLen] + let a = rng.random_unsafe(BigInt[aLen]) + bench("Squaring", $rLen & " <- " & $aLen & "²", iters): + r.square(a) + proc reduce2x*(T: typedesc, iters: int) = var r: T var t: doubleWidth(T) @@ -189,6 +195,7 @@ proc main() = diff2x(Fp[BLS12_381], iters = 10_000_000) diff2xNoReduce(Fp[BLS12_381], iters = 10_000_000) mul2xBench(768, 384, 384, iters = 10_000_000) + square2xBench(768, 384, iters = 10_000_000) reduce2x(Fp[BLS12_381], iters = 10_000_000) separator() diff --git a/constantine.nimble b/constantine.nimble index 9a7236d..99eb565 100644 --- a/constantine.nimble +++ b/constantine.nimble @@ -232,7 +232,7 @@ proc buildBench(benchName: string, compiler = "", useAsm = true, run = false) = if compiler != "": cc = "--cc:" & compiler if not useAsm: - cc &= " -d:ConstantineASM=false" + cc &= " -d:CttASM=false" exec "nim c " & cc & " -d:danger --verbosity:0 -o:build/bench/" & benchName & "_" & compiler & "_" & (if useAsm: "useASM" else: "noASM") & " --nimcache:nimcache/" & benchName & "_" & compiler & "_" & (if useAsm: "useASM" else: "noASM") & @@ -246,7 +246,7 @@ proc runTests(requireGMP: bool, dumpCmdFile = false, test32bit = false, testASM if not(td.useGMP and not requireGMP): var flags = "" if not testASM: - flags &= " -d:ConstantineASM=false" + flags &= " -d:CttASM=false" if test32bit: flags &= " -d:Constantine32" if td.path in useDebug: diff --git a/constantine/arithmetic/assembly/limbs_asm_modular_dbl_width_x86.nim b/constantine/arithmetic/assembly/limbs_asm_modular_dbl_width_x86.nim index ed49566..54f6b4c 100644 --- a/constantine/arithmetic/assembly/limbs_asm_modular_dbl_width_x86.nim +++ b/constantine/arithmetic/assembly/limbs_asm_modular_dbl_width_x86.nim @@ -27,58 +27,67 @@ static: doAssert UseASM_X86_64 # Substraction # ------------------------------------------------------------ -macro sub2x_gen[N: static int](a: var Limbs[N], b: Limbs[N], M: Limbs[N div 2]): untyped = +macro sub2x_gen[N: static int](R: var Limbs[N], A, B: Limbs[N], m: Limbs[N div 2]): untyped = ## Generate an optimized out-of-place double-width substraction kernel result = newStmtList() var ctx = init(Assembler_x86, BaseType) let - N2 = N div 2 + H = N div 2 - arrA = init(OperandArray, nimSymbol = a, N, PointerInReg, InputOutput) - # We reuse the reg used for B for overflow detection - arrB = init(OperandArray, nimSymbol = b, N, PointerInReg, InputOutput) - # We could force M as immediate by specializing per moduli - arrM = init(OperandArray, nimSymbol = M, N, PointerInReg, Input) + r = init(OperandArray, nimSymbol = R, N, PointerInReg, InputOutput) + # We reuse the reg used for b for overflow detection + b = init(OperandArray, nimSymbol = B, N, PointerInReg, InputOutput) + # We could force m as immediate by specializing per moduli + M = init(OperandArray, nimSymbol = m, N, PointerInReg, Input) # If N is too big, we need to spill registers. TODO. 
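The semantics this out-of-place kernel implements (and that the portable `FpDbl.diff` fallback later in the patch mirrors) are: subtract over the full 2N limbs, then on borrow add the modulus to the upper N limbs, i.e. add M·2^(N·w). A rough portable sketch under those assumptions, using plain uint32 limbs, a hypothetical `sub2x_reference` name, and an ordinary branch where the real code uses a masked, constant-time conditional add (illustration only):

func sub2x_reference(r: var openArray[uint32],
                     a, b: openArray[uint32],
                     M: openArray[uint32]) =
  ## r <- a - b over 2N limbs; on borrow, add M*2^(N*32),
  ## i.e. add the modulus to the upper half only.
  let N = M.len                            # a, b, r hold 2*N limbs
  var borrow = 0'u64
  for i in 0 ..< 2*N:
    let d = uint64(a[i]) - uint64(b[i]) - borrow
    r[i] = uint32(d and 0xFFFF_FFFF'u64)
    borrow = d shr 63                      # 1 exactly when the subtraction underflowed
  if borrow == 1:                          # real code: constant-time conditional add
    var carry = 0'u64
    for i in 0 ..< N:
      let s = uint64(r[i + N]) + uint64(M[i]) + carry
      r[i + N] = uint32(s and 0xFFFF_FFFF'u64)
      carry = s shr 32                     # the final carry out cancels the borrow out above

when isMainModule:
  # (3*2^32 + 10) - (3*2^32 + 20) ≡ 97*2^32 - 10  (mod 97*2^32)
  var r = [0'u32, 0'u32]
  sub2x_reference(r, [10'u32, 3'u32], [20'u32, 3'u32], [97'u32])
  doAssert r == [0xFFFF_FF F6'u32, 96'u32]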
- arrT = init(OperandArray, nimSymbol = ident"t", N2, ElemsInReg, Output_EarlyClobber) - arrTadd = init(OperandArray, nimSymbol = ident"tadd", N2, ElemsInReg, Output_EarlyClobber) + u = init(OperandArray, nimSymbol = ident"U", H, ElemsInReg, InputOutput) + v = init(OperandArray, nimSymbol = ident"V", H, ElemsInReg, InputOutput) + + let usym = u.nimSymbol + let vsym = v.nimSymbol + result.add quote do: + var `usym`{.noinit.}, `vsym` {.noInit.}: typeof(`A`) + staticFor i, 0, `H`: + `usym`[i] = `A`[i] + staticFor i, `H`, `N`: + `vsym`[i-`H`] = `A`[i] # Substraction - for i in 0 ..< N: - ctx.mov arrT[i mod N2], arrA[i] + # u = a[0..= M: # r -= M @@ -162,14 +162,15 @@ macro montyRed_gen[N: static int]( doAssert N <= 6, "The Assembly-optimized montgomery multiplication requires at most 6 limbs." result.add quote do: - `eax` = BaseType `t_MR`[0] - `scratchSym`[1 .. `N`-1] = `t_MR`.toOpenArray(1, `N`-1) + `eax` = BaseType `a_MR`[0] + staticFor i, 0, `N`: # Do NOT use Nim slice/toOpenArray, they are not inlined + `scratchSym`[i] = `a_MR`[i] ctx.mov scratch[N], rRAX - ctx.imul rRAX, m0ninv # m <- t[i] * m0ninv mod 2^w + ctx.imul rRAX, m0ninv # m <- a[i] * m0ninv mod 2^w ctx.mov scratch[0], rRAX - # scratch: [t[0] * m0, t[1], t[2], t[3], t[0]] for 4 limbs + # scratch: [a[0] * m0, a[1], a[2], a[3], a[0]] for 4 limbs for i in 0 ..< N: ctx.comment "" @@ -217,23 +218,23 @@ macro montyRed_gen[N: static int]( ctx = init(Assembler_x86, BaseType) let r = init(OperandArray, nimSymbol = r_MR, N, PointerInReg, InputOutput_EnsureClobber) - let t = init(OperandArray, nimSymbol = t_MR, N*2, PointerInReg, Input) + let a = init(OperandArray, nimSymbol = a_MR, N*2, PointerInReg, Input) let extraRegNeeded = N-2 - let tsub = init(OperandArray, nimSymbol = ident"tsub", extraRegNeeded, ElemsInReg, InputOutput_EnsureClobber) - let tsubsym = tsub.nimSymbol + let t = init(OperandArray, nimSymbol = ident"t", extraRegNeeded, ElemsInReg, InputOutput_EnsureClobber) + let tsym = t.nimSymbol result.add quote do: - var `tsubsym` {.noInit.}: Limbs[`extraRegNeeded`] + var `tsym` {.noInit.}: Limbs[`extraRegNeeded`] - # This does t[i+n] += hi + # This does a[i+n] += hi # but in a separate carry chain, fused with the - # copy "r[i] = t[i+n]" + # copy "r[i] = a[i+n]" for i in 0 ..< N: if i == 0: - ctx.add scratch[i], t[i+N] + ctx.add scratch[i], a[i+N] else: - ctx.adc scratch[i], t[i+N] + ctx.adc scratch[i], a[i+N] - let reuse = repackRegisters(tsub, scratch[N], scratch[N+1]) + let reuse = repackRegisters(t, scratch[N], scratch[N+1]) if canUseNoCarryMontyMul: ctx.finalSubNoCarry(r, scratch, M, reuse) @@ -245,10 +246,10 @@ macro montyRed_gen[N: static int]( func montRed_asm*[N: static int]( r: var array[N, SecretWord], - t: array[N*2, SecretWord], + a: array[N*2, SecretWord], M: array[N, SecretWord], m0ninv: BaseType, canUseNoCarryMontyMul: static bool ) = ## Constant-time Montgomery reduction - montyRed_gen(r, t, M, m0ninv, canUseNoCarryMontyMul) + montyRed_gen(r, a, M, m0ninv, canUseNoCarryMontyMul) diff --git a/constantine/arithmetic/assembly/limbs_asm_montred_x86_adx_bmi2.nim b/constantine/arithmetic/assembly/limbs_asm_montred_x86_adx_bmi2.nim index 9108d4d..afc26a1 100644 --- a/constantine/arithmetic/assembly/limbs_asm_montred_x86_adx_bmi2.nim +++ b/constantine/arithmetic/assembly/limbs_asm_montred_x86_adx_bmi2.nim @@ -37,7 +37,7 @@ static: doAssert UseASM_X86_64 macro montyRedx_gen[N: static int]( r_MR: var array[N, SecretWord], - t_MR: array[N*2, SecretWord], + a_MR: array[N*2, SecretWord], M_MR: array[N, SecretWord], m0ninv_MR: 
BaseType, canUseNoCarryMontyMul: static bool @@ -109,13 +109,13 @@ macro montyRedx_gen[N: static int]( # --------------------------------------------------------- # for i in 0 .. n-1: # hi <- 0 - # m <- t[i] * m0ninv mod 2^w (i.e. simple multiplication) + # m <- a[i] * m0ninv mod 2^w (i.e. simple multiplication) # for j in 0 .. n-1: - # (hi, lo) <- t[i+j] + m * M[j] + hi - # t[i+j] <- lo - # t[i+n] += hi + # (hi, lo) <- a[i+j] + m * M[j] + hi + # a[i+j] <- lo + # a[i+n] += hi # for i in 0 .. n-1: - # r[i] = t[i+n] + # r[i] = a[i+n] # if r >= M: # r -= M @@ -124,12 +124,13 @@ macro montyRedx_gen[N: static int]( result.add quote do: `edx` = BaseType(`m0ninv_MR`) - `scratchSym`[0 .. `N`-1] = `t_MR`.toOpenArray(0, `N`-1) + staticFor i, 0, `N`: # Do NOT use Nim slice/toOpenArray, they are not inlined + `scratchSym`[i] = `a_MR`[i] for i in 0 ..< N: # RDX contains m0ninv at the start of each loop ctx.comment "" - ctx.imul rRDX, scratch[0] # m <- t[i] * m0ninv mod 2^w + ctx.imul rRDX, scratch[0] # m <- a[i] * m0ninv mod 2^w ctx.comment "---- Reduction " & $i ctx.`xor` scratch[N], scratch[N] @@ -156,23 +157,23 @@ macro montyRedx_gen[N: static int]( ctx = init(Assembler_x86, BaseType) let r = init(OperandArray, nimSymbol = r_MR, N, PointerInReg, InputOutput_EnsureClobber) - let t = init(OperandArray, nimSymbol = t_MR, N*2, PointerInReg, Input) + let a = init(OperandArray, nimSymbol = a_MR, N*2, PointerInReg, Input) let extraRegNeeded = N-1 - let tsub = init(OperandArray, nimSymbol = ident"tsub", extraRegNeeded, ElemsInReg, InputOutput_EnsureClobber) - let tsubsym = tsub.nimSymbol + let t = init(OperandArray, nimSymbol = ident"t", extraRegNeeded, ElemsInReg, InputOutput_EnsureClobber) + let tsym = t.nimSymbol result.add quote do: - var `tsubsym` {.noInit.}: Limbs[`extraRegNeeded`] + var `tsym` {.noInit.}: Limbs[`extraRegNeeded`] - # This does t[i+n] += hi + # This does a[i+n] += hi # but in a separate carry chain, fused with the - # copy "r[i] = t[i+n]" + # copy "r[i] = a[i+n]" for i in 0 ..< N: if i == 0: - ctx.add scratch[i], t[i+N] + ctx.add scratch[i], a[i+N] else: - ctx.adc scratch[i], t[i+N] + ctx.adc scratch[i], a[i+N] - let reuse = repackRegisters(tsub, scratch[N]) + let reuse = repackRegisters(t, scratch[N]) if canUseNoCarryMontyMul: ctx.finalSubNoCarry(r, scratch, M, reuse) @@ -184,10 +185,10 @@ macro montyRedx_gen[N: static int]( func montRed_asm_adx_bmi2*[N: static int]( r: var array[N, SecretWord], - t: array[N*2, SecretWord], + a: array[N*2, SecretWord], M: array[N, SecretWord], m0ninv: BaseType, canUseNoCarryMontyMul: static bool ) = ## Constant-time Montgomery reduction - montyRedx_gen(r, t, M, m0ninv, canUseNoCarryMontyMul) + montyRedx_gen(r, a, M, m0ninv, canUseNoCarryMontyMul) diff --git a/constantine/arithmetic/assembly/limbs_asm_mul_x86.nim b/constantine/arithmetic/assembly/limbs_asm_mul_x86.nim index 72ff595..547a5f7 100644 --- a/constantine/arithmetic/assembly/limbs_asm_mul_x86.nim +++ b/constantine/arithmetic/assembly/limbs_asm_mul_x86.nim @@ -30,6 +30,9 @@ import static: doAssert UseASM_X86_64 # Need 8 registers just for mul # and 32-bit only has 8 max. 
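The multiplication and squaring generators that follow (and the portable `prod`/`square_Comba` fallbacks later in this patch) share the product-scanning index invariant noted in their comments: for result column `i`, every visited pair of indices sums to `i`, with `ib = min(bLen-1, i)`, `ia = i - ib` and `j` walking the anti-diagonal. A tiny standalone snippet that only prints which (a, b) index pairs each column visits, using the same bookkeeping (illustration only, not part of the patch):

when isMainModule:
  const aLen = 3
  const bLen = 2
  for i in 0 ..< (aLen + bLen):            # one pass per result column
    let ib = min(bLen - 1, i)
    let ia = i - ib
    var pairs: seq[(int, int)]
    for j in 0 ..< min(aLen - ia, ib + 1):
      pairs.add((ia + j, ib - j))          # (index into a, index into b); they always sum to i
    echo "column ", i, ": ", pairs         # the top column collects no products, only carries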
+# Multiplication +# ----------------------------------------------------------------------------------------------- + macro mul_gen[rLen, aLen, bLen: static int](r: var Limbs[rLen], a: Limbs[aLen], b: Limbs[bLen]) = ## Comba multiplication generator ## `a`, `b`, `r` can have a different number of limbs @@ -114,7 +117,12 @@ macro mul_gen[rLen, aLen, bLen: static int](r: var Limbs[rLen], a: Limbs[aLen], ctx.`xor` v, v ctx.`xor` t, t - for i in 0 ..< min(aLen+bLen, rLen): + let stopEx = min(aLen+bLen, rLen) + + for i in 0 ..< stopEx: + # Invariant for product scanning: + # if we multiply accumulate by a[k1] * b[k2] + # we have k1+k2 == i let ib = min(bLen-1, i) let ia = i - ib for j in 0 ..< min(aLen - ia, ib+1): @@ -127,7 +135,7 @@ macro mul_gen[rLen, aLen, bLen: static int](r: var Limbs[rLen], a: Limbs[aLen], ctx.mov arrR[i], v - if i != min(aLen+bLen, rLen) - 1: + if i != stopEx - 1: ctx.mov v, u ctx.mov u, t ctx.`xor` t, t @@ -144,3 +152,140 @@ func mul_asm*[rLen, aLen, bLen: static int](r: var Limbs[rLen], a: Limbs[aLen], ## Multi-precision Multiplication ## Assumes r doesn't alias a or b mul_gen(r, a, b) + +# Squaring +# ----------------------------------------------------------------------------------------------- + +macro square_gen[rLen, aLen: static int](r: var Limbs[rLen], a: Limbs[aLen]) = + ## Comba squaring generator + ## `a` and `r` can have a different number of limbs + ## if `r`.limbs.len < a.limbs.len * 2 + ## The result will be truncated, i.e. it will be + ## a² (mod (2^WordBitwidth)^r.limbs.len) + ## + ## Assumes r doesn't aliases a + + result = newStmtList() + + var ctx = init(Assembler_x86, BaseType) + let + arrR = init(OperandArray, nimSymbol = r, rLen, PointerInReg, InputOutput_EnsureClobber) + arrA = init(OperandArray, nimSymbol = a, aLen, PointerInReg, Input) + + t = Operand( + desc: OperandDesc( + asmId: "[t]", + nimSymbol: ident"t", + rm: Reg, + constraint: Output_EarlyClobber, + cEmit: "t" + ) + ) + + u = Operand( + desc: OperandDesc( + asmId: "[u]", + nimSymbol: ident"u", + rm: Reg, + constraint: Output_EarlyClobber, + cEmit: "u" + ) + ) + + v = Operand( + desc: OperandDesc( + asmId: "[v]", + nimSymbol: ident"v", + rm: Reg, + constraint: Output_EarlyClobber, + cEmit: "v" + ) + ) + + # MUL requires RAX and RDX + rRAX = Operand( + desc: OperandDesc( + asmId: "[rax]", + nimSymbol: ident"rax", + rm: RAX, + constraint: Output_EarlyClobber, + cEmit: "rax" + ) + ) + + rRDX = Operand( + desc: OperandDesc( + asmId: "[rdx]", + nimSymbol: ident"rdx", + rm: RDX, + constraint: Output_EarlyClobber, + cEmit: "rdx" + ) + ) + + + # Prologue + let tsym = t.desc.nimSymbol + let usym = u.desc.nimSymbol + let vsym = v.desc.nimSymbol + let eax = rRAX.desc.nimSymbol + let edx = rRDX.desc.nimSymbol + result.add quote do: + var `tsym`{.noInit.}, `usym`{.noInit.}, `vsym`{.noInit.}: BaseType # zero-init + var `eax`{.noInit.}, `edx`{.noInit.}: BaseType + + # Algorithm + ctx.`xor` u, u + ctx.`xor` v, v + ctx.`xor` t, t + + let stopEx = min(aLen*2, rLen) + + for i in 0 ..< stopEx: + # Invariant for product scanning: + # if we multiply accumulate by a[k1] * b[k2] + # we have k1+k2 == i + let ib = min(aLen-1, i) + let ia = i - ib + for j in 0 ..< min(aLen - ia, ib+1): + let k1 = ia+j + let k2 = ib-j + if k1 < k2: + # (t, u, v) <- (t, u, v) + 2 * a[k1] * a[k2] + ctx.mov rRAX, arrA[k2] + ctx.mul rdx, rax, arrA[k1], rax + ctx.add rRAX, rRAX + ctx.adc rRDX, rRDX + ctx.adc t, 0 + ctx.add v, rRAX + ctx.adc u, rRDX + ctx.adc t, 0 + elif k1 == k2: + # (t, u, v) <- (t, u, v) + a[k1] * a[k2] + ctx.mov 
rRAX, arrA[k2] + ctx.mul rdx, rax, arrA[k1], rax + ctx.add v, rRAX + ctx.adc u, rRDX + ctx.adc t, 0 + else: + discard + + ctx.mov arrR[i], v + + if i != stopEx - 1: + ctx.mov v, u + ctx.mov u, t + ctx.`xor` t, t + + if aLen*2 < rLen: + ctx.`xor` rRAX, rRAX + for i in aLen*2 ..< rLen: + ctx.mov arrR[i], rRAX + + # Codegen + result.add ctx.generate + +func square_asm*[rLen, aLen: static int](r: var Limbs[rLen], a: Limbs[aLen]) = + ## Multi-precision Squaring + ## Assumes r doesn't alias a + square_gen(r, a) diff --git a/constantine/arithmetic/assembly/limbs_asm_mul_x86_adx_bmi2.nim b/constantine/arithmetic/assembly/limbs_asm_mul_x86_adx_bmi2.nim index c5c2aa7..f56fcdd 100644 --- a/constantine/arithmetic/assembly/limbs_asm_mul_x86_adx_bmi2.nim +++ b/constantine/arithmetic/assembly/limbs_asm_mul_x86_adx_bmi2.nim @@ -195,3 +195,135 @@ func mul_asm_adx_bmi2*[rLen, aLen, bLen: static int](r: var Limbs[rLen], a: Limb ## Multi-precision Multiplication ## Assumes r doesn't alias a or b mulx_gen(r, a, b) + + +# Squaring +# ----------------------------------------------------------------------------------------------- +# TODO: We use 16 registers but GCC/Clang still complain :/ + +# macro sqrx_gen[rLen, aLen: static int](rx: var Limbs[rLen], ax: Limbs[aLen]) = +# ## Squaring +# ## `a` and `r` can have a different number of limbs +# ## if `r`.limbs.len < a.limbs.len * 2 +# ## The result will be truncated, i.e. it will be +# ## a² (mod (2^WordBitwidth)^r.limbs.len) +# ## +# ## Assumes r doesn't aliases a +# result = newStmtList() +# +# var ctx = init(Assembler_x86, BaseType) +# let +# # Register count with 6 limbs: +# # r + a + rax + rdx = 4 +# # t = 2 * a.len = 12 +# # We use the full x86 register set. +# +# r = init(OperandArray, nimSymbol = rx, rLen, PointerInReg, InputOutput) +# a = init(OperandArray, nimSymbol = ax, aLen, PointerInReg, Input) +# +# N = a.len +# tSlots = a.len * 2 +# # If aLen is too big, we need to spill registers. TODO. +# t = init(OperandArray, nimSymbol = ident"t", tSlots, ElemsInReg, Output_EarlyClobber) +# +# # MULX requires RDX +# rRDX = Operand( +# desc: OperandDesc( +# asmId: "[rdx]", +# nimSymbol: ident"rdx", +# rm: RDX, +# constraint: Output_EarlyClobber, +# cEmit: "rdx" +# ) +# ) +# +# # Scratch spaces for carries +# rRAX = Operand( +# desc: OperandDesc( +# asmId: "[rax]", +# nimSymbol: ident"rax", +# rm: RAX, +# constraint: Output_EarlyClobber, +# cEmit: "rax" +# ) +# ) +# +# # Prologue +# # ------------------------------- +# let tsym = t.nimSymbol +# let eax = rRAX.desc.nimSymbol +# let edx = rRDX.desc.nimSymbol +# result.add quote do: +# var `tsym`{.noInit.}: array[`N`, BaseType] +# var `eax`{.noInit.}, `edx`{.noInit.}: BaseType +# +# # Algorithm +# # ------------------------------- +# +# block: # Triangle +# # i = 0 +# # ---------------- +# ctx.mov rRDX, a[0] +# # Put a[1..= aBits + bBits unless you know what you are doing. r.limbs.prod(a.limbs, b.limbs) func mul*[aBits, bBits](a: var BigInt[aBits], b: BigInt[bBits]) = @@ -298,6 +304,19 @@ func prod_high_words*[rBits, aBits, bBits](r: var BigInt[rBits], a: BigInt[aBits # i.e. prod_high_words(result, P, a, w) r.limbs.prod_high_words(a.limbs, b.limbs, lowestWordIndex) +func square*[rBits, aBits](r: var BigInt[rBits], a: BigInt[aBits]) = + ## Multi-precision squaring + ## r <- a² + ## `a`, `r` can have different sizes + ## if r.bits < a.bits * 2 + ## the multiplication will overflow. + ## It will be truncated if it cannot fit in r limbs. 
+ ## + ## Truncation is at limb-level NOT bitlevel + ## It is recommended to only use + ## rBits >= aBits * 2 unless you know what you are doing. + r.limbs.square(a.limbs) + # Bit Manipulation # ------------------------------------------------------------ diff --git a/constantine/arithmetic/finite_fields.nim b/constantine/arithmetic/finite_fields.nim index 40171c7..b3d0209 100644 --- a/constantine/arithmetic/finite_fields.nim +++ b/constantine/arithmetic/finite_fields.nim @@ -135,7 +135,7 @@ func setOne*(a: var FF) {.inline.} = func `+=`*(a: var FF, b: FF) {.inline, meter.} = ## In-place addition modulo p when UseASM_X86_64 and a.mres.limbs.len <= 6: # TODO: handle spilling - addmod_asm(a.mres.limbs, b.mres.limbs, FF.fieldMod().limbs) + addmod_asm(a.mres.limbs, a.mres.limbs, b.mres.limbs, FF.fieldMod().limbs) else: var overflowed = add(a.mres, b.mres) overflowed = overflowed or not(a.mres < FF.fieldMod()) @@ -144,7 +144,7 @@ func `+=`*(a: var FF, b: FF) {.inline, meter.} = func `-=`*(a: var FF, b: FF) {.inline, meter.} = ## In-place substraction modulo p when UseASM_X86_64 and a.mres.limbs.len <= 6: # TODO: handle spilling - submod_asm(a.mres.limbs, b.mres.limbs, FF.fieldMod().limbs) + submod_asm(a.mres.limbs, a.mres.limbs, b.mres.limbs, FF.fieldMod().limbs) else: let underflowed = sub(a.mres, b.mres) discard cadd(a.mres, FF.fieldMod(), underflowed) @@ -152,7 +152,7 @@ func `-=`*(a: var FF, b: FF) {.inline, meter.} = func double*(a: var FF) {.inline, meter.} = ## Double ``a`` modulo p when UseASM_X86_64 and a.mres.limbs.len <= 6: # TODO: handle spilling - addmod_asm(a.mres.limbs, a.mres.limbs, FF.fieldMod().limbs) + addmod_asm(a.mres.limbs, a.mres.limbs, a.mres.limbs, FF.fieldMod().limbs) else: var overflowed = double(a.mres) overflowed = overflowed or not(a.mres < FF.fieldMod()) @@ -162,8 +162,7 @@ func sum*(r: var FF, a, b: FF) {.inline, meter.} = ## Sum ``a`` and ``b`` into ``r`` modulo p ## r is initialized/overwritten when UseASM_X86_64 and a.mres.limbs.len <= 6: # TODO: handle spilling - r = a - addmod_asm(r.mres.limbs, b.mres.limbs, FF.fieldMod().limbs) + addmod_asm(r.mres.limbs, a.mres.limbs, b.mres.limbs, FF.fieldMod().limbs) else: var overflowed = r.mres.sum(a.mres, b.mres) overflowed = overflowed or not(r.mres < FF.fieldMod()) @@ -178,20 +177,7 @@ func diff*(r: var FF, a, b: FF) {.inline, meter.} = ## `r` is initialized/overwritten ## Requires r != b when UseASM_X86_64 and a.mres.limbs.len <= 6: # TODO: handle spilling - r = a - submod_asm(r.mres.limbs, b.mres.limbs, FF.fieldMod().limbs) - else: - var underflowed = r.mres.diff(a.mres, b.mres) - discard cadd(r.mres, FF.fieldMod(), underflowed) - -func diffAlias*(r: var FF, a, b: FF) {.inline, meter.} = - ## Substract `b` from `a` and store the result into `r`. 
- ## `r` is initialized/overwritten - ## Handles r == b - when UseASM_X86_64 and a.mres.limbs.len <= 6: # TODO: handle spilling - var t = a - submod_asm(t.mres.limbs, b.mres.limbs, FF.fieldMod().limbs) - r = t + submod_asm(r.mres.limbs, a.mres.limbs, b.mres.limbs, FF.fieldMod().limbs) else: var underflowed = r.mres.diff(a.mres, b.mres) discard cadd(r.mres, FF.fieldMod(), underflowed) @@ -205,8 +191,7 @@ func double*(r: var FF, a: FF) {.inline, meter.} = ## Double ``a`` into ``r`` ## `r` is initialized/overwritten when UseASM_X86_64 and a.mres.limbs.len <= 6: # TODO: handle spilling - r = a - addmod_asm(r.mres.limbs, a.mres.limbs, FF.fieldMod().limbs) + addmod_asm(r.mres.limbs, a.mres.limbs, a.mres.limbs, FF.fieldMod().limbs) else: var overflowed = r.mres.double(a.mres) overflowed = overflowed or not(r.mres < FF.fieldMod()) @@ -223,10 +208,7 @@ func square*(r: var FF, a: FF) {.inline, meter.} = func neg*(r: var FF, a: FF) {.inline, meter.} = ## Negate modulo p - when UseASM_X86_64 and defined(gcc): - # Clang and every compiler besides GCC - # can cleanly optimized this - # especially on FF2 + when UseASM_X86_64: negmod_asm(r.mres.limbs, a.mres.limbs, FF.fieldMod().limbs) else: # If a = 0 we need r = 0 and not r = M diff --git a/constantine/arithmetic/finite_fields_double_width.nim b/constantine/arithmetic/finite_fields_double_width.nim index 0d176b7..1762c0e 100644 --- a/constantine/arithmetic/finite_fields_double_width.nim +++ b/constantine/arithmetic/finite_fields_double_width.nim @@ -12,7 +12,8 @@ import ./bigints, ./finite_fields, ./limbs, - ./limbs_double_width + ./limbs_double_width, + ./limbs_montgomery when UseASM_X86_64: import assembly/limbs_asm_modular_dbl_width_x86 @@ -27,10 +28,17 @@ template doubleWidth*(T: typedesc[Fp]): typedesc = ## Return the double-width type matching with Fp FpDbl[T.C] +func `==`*(a, b: FpDbl): SecretBool {.inline.} = + a.limbs2x == b.limbs2x + func mulNoReduce*(r: var FpDbl, a, b: Fp) {.inline.} = ## Store the product of ``a`` by ``b`` into ``r`` r.limbs2x.prod(a.mres.limbs, b.mres.limbs) +func squareNoReduce*(r: var FpDbl, a: Fp) {.inline.} = + ## Store the square of ``a`` into ``r`` + r.limbs2x.square(a.mres.limbs) + func reduce*(r: var Fp, a: FpDbl) {.inline.} = ## Reduce a double-width field element into r const N = r.mres.limbs.len @@ -42,20 +50,16 @@ func reduce*(r: var Fp, a: FpDbl) {.inline.} = Fp.canUseNoCarryMontyMul() ) -func diffNoInline(r: var FpDbl, a, b: FpDbl): Borrow = - r.limbs2x.diff(a.limbs2x, b.limbs2x) - func diffNoReduce*(r: var FpDbl, a, b: FpDbl) = ## Double-width substraction without reduction - discard diffNoInline(r, a, b) + discard r.limbs2x.diff(a.limbs2x, b.limbs2x) -func diff*(r: var FpDbl, a, b: FpDbl) = +func diff*(r: var FpDbl, a, b: FpDbl) {.inline.}= ## Double-width modular substraction - when false: # TODO slower - r = a - sub2x_asm(r.limbs2x, b.limbs2x, FpDbl.C.Mod.limbs) + when UseASM_X86_64: + sub2x_asm(r.limbs2x, a.limbs2x, b.limbs2x, FpDbl.C.Mod.limbs) else: - var underflowed = SecretBool diffNoInline(r, a, b) + var underflowed = SecretBool r.limbs2x.diff(a.limbs2x, b.limbs2x) const N = r.limbs2x.len div 2 const M = FpDbl.C.Mod @@ -65,8 +69,6 @@ func diff*(r: var FpDbl, a, b: FpDbl) = addC(carry, sum, r.limbs2x[i+N], M.limbs[i], carry) underflowed.ccopy(r.limbs2x[i+N], sum) -func `-=`*(a: var FpDbl, b: FpDbl) = - when false: # TODO slower - sub2x_asm(a.limbs2x, b.limbs2x, FpDbl.C.Mod.limbs) - else: - a.diff(a, b) +func `-=`*(a: var FpDbl, b: FpDbl) {.inline.}= + ## Double-width modular substraction + a.diff(a, 
b) diff --git a/constantine/arithmetic/limbs.nim b/constantine/arithmetic/limbs.nim index 01d3342..6b81111 100644 --- a/constantine/arithmetic/limbs.nim +++ b/constantine/arithmetic/limbs.nim @@ -12,9 +12,6 @@ import when UseASM_X86_32: import ./assembly/limbs_asm_x86 -when UseASM_X86_64: - import ./assembly/limbs_asm_mul_x86 - import ./assembly/limbs_asm_mul_x86_adx_bmi2 # ############################################################ # @@ -335,89 +332,6 @@ func cneg*(a: var Limbs, ctl: CTBool) = {.pop.} # inline -# Multiplication -# ------------------------------------------------------------ - -func prod*[rLen, aLen, bLen: static int](r: var Limbs[rLen], a: Limbs[aLen], b: Limbs[bLen]) = - ## Multi-precision multiplication - ## r <- a*b - ## - ## `a`, `b`, `r` can have a different number of limbs - ## if `r`.limbs.len < a.limbs.len + b.limbs.len - ## The result will be truncated, i.e. it will be - ## a * b (mod (2^WordBitwidth)^r.limbs.len) - ## - ## `r` must not alias ``a`` or ``b`` - - when UseASM_X86_64 and aLen <= 6: - if ({.noSideEffect.}: hasBmi2()) and ({.noSideEffect.}: hasAdx()): - mul_asm_adx_bmi2(r, a, b) - else: - mul_asm(r, a, b) - elif UseASM_X86_64: - mul_asm(r, a, b) - else: - # We use Product Scanning / Comba multiplication - var t, u, v = Zero - - staticFor i, 0, min(a.len+b.len, r.len): - const ib = min(b.len-1, i) - const ia = i - ib - staticFor j, 0, min(a.len - ia, ib+1): - mulAcc(t, u, v, a[ia+j], b[ib-j]) - - r[i] = v - v = u - u = t - t = Zero - - if aLen+bLen < rLen: - for i in aLen+bLen ..< rLen: - r[i] = Zero - -func prod_high_words*[rLen, aLen, bLen]( - r: var Limbs[rLen], - a: Limbs[aLen], b: Limbs[bLen], - lowestWordIndex: static int) = - ## Multi-precision multiplication keeping only high words - ## r <- a*b >> (2^WordBitWidth)^lowestWordIndex - ## - ## `a`, `b`, `r` can have a different number of limbs - ## if `r`.limbs.len < a.limbs.len + b.limbs.len - lowestWordIndex - ## The result will be truncated, i.e. it will be - ## a * b >> (2^WordBitWidth)^lowestWordIndex (mod (2^WordBitwidth)^r.limbs.len) - # - # This is useful for - # - Barret reduction - # - Approximating multiplication by a fractional constant in the form f(a) = K/C * a - # with K and C known at compile-time. - # We can instead find a well chosen M = (2^WordBitWidth)^w, with M > C (i.e. M is a power of 2 bigger than C) - # Precompute P = K*M/C at compile-time - # and at runtime do P*a/M <=> P*a >> (WordBitWidth*w) - # i.e. 
prod_high_words(result, P, a, w) - - # We use Product Scanning / Comba multiplication - var t, u, v = Zero # Will raise warning on empty iterations - var z: Limbs[rLen] # zero-init, ensure on stack and removes in-place problems - - # The previous 2 columns can affect the lowest word due to carries - # but not the ones before (we accumulate in 3 words (t, u, v)) - const w = lowestWordIndex - 2 - - staticFor i, max(0, w), min(a.len+b.len, r.len+lowestWordIndex): - const ib = min(b.len-1, i) - const ia = i - ib - staticFor j, 0, min(a.len - ia, ib+1): - mulAcc(t, u, v, a[ia+j], b[ib-j]) - - when i >= lowestWordIndex: - z[i-lowestWordIndex] = v - v = u - u = t - t = Zero - - r = z - # Division # ------------------------------------------------------------ diff --git a/constantine/arithmetic/limbs_double_width.nim b/constantine/arithmetic/limbs_double_width.nim index 098852c..d58a7bb 100644 --- a/constantine/arithmetic/limbs_double_width.nim +++ b/constantine/arithmetic/limbs_double_width.nim @@ -11,10 +11,9 @@ import ../primitives, ./limbs -when UseASM_X86_32: - import ./assembly/limbs_asm_montred_x86 when UseASM_X86_64: - import ./assembly/limbs_asm_montred_x86_adx_bmi2 + import ./assembly/limbs_asm_mul_x86 + import ./assembly/limbs_asm_mul_x86_adx_bmi2 # ############################################################ # @@ -25,77 +24,170 @@ when UseASM_X86_64: # No exceptions allowed {.push raises: [].} -# Montgomery Reduction +# Multiplication # ------------------------------------------------------------ -# This is the reduction part of SOS (Separated Operand Scanning) modular multiplication technique -# TODO upstream, using Limbs[N] breaks semcheck -func montyRed*[N: static int]( - r: var array[N, SecretWord], - t: array[N*2, SecretWord], - M: array[N, SecretWord], - m0ninv: BaseType, canUseNoCarryMontyMul: static bool) = - ## Montgomery reduce a double-width bigint modulo M - # - Analyzing and Comparing Montgomery Multiplication Algorithms - # Cetin Kaya Koc and Tolga Acar and Burton S. Kaliski Jr. - # http://pdfs.semanticscholar.org/5e39/41ff482ec3ee41dc53c3298f0be085c69483.pdf - # - # - Arithmetic of Finite Fields - # Chapter 5 of Guide to Pairing-Based Cryptography - # Jean Luc Beuchat, Luis J. Dominguez Perez, Sylvain Duquesne, Nadia El Mrabet, Laura Fuentes-Castañeda, Francisco Rodríguez-Henríquez, 2017 - # https://www.researchgate.net/publication/319538235_Arithmetic_of_Finite_Fields - # - # Algorithm - # Inputs: - # - N number of limbs - # - t[0 ..< 2N] (double-width input to reduce) - # - M[0 ..< N] The field modulus (must be odd for Montgomery reduction) - # - m0ninv: Montgomery Reduction magic number = -1/M[0] - # Output: - # - r[0 ..< N], in the Montgomery domain - # Parameters: - # - w, the word width usually 64 on 64-bit platforms or 32 on 32-bit - # - # for i in 0 .. n-1: - # C <- 0 - # m <- t[i] * m0ninv mod 2^w (i.e. simple multiplication) - # for j in 0 .. n-1: - # (C, S) <- t[i+j] + m * M[j] + C - # t[i+j] <- S - # t[i+n] += C - # for i in 0 .. n-1: - # r[i] = t[i+n] - # if r >= M: - # r -= M - # - # Important note: `t[i+n] += C` should propagate the carry - # to the higher limb if any, thank you "implementation detail" - # missing from paper. - when UseASM_X86_64 and r.len <= 6: +func prod*[rLen, aLen, bLen: static int](r: var Limbs[rLen], a: Limbs[aLen], b: Limbs[bLen]) = + ## Multi-precision multiplication + ## r <- a*b + ## + ## `a`, `b`, `r` can have a different number of limbs + ## if `r`.limbs.len < a.limbs.len + b.limbs.len + ## The result will be truncated, i.e. 
it will be + ## a * b (mod (2^WordBitwidth)^r.limbs.len) + ## + ## `r` must not alias ``a`` or ``b`` + + when UseASM_X86_64 and aLen <= 6: if ({.noSideEffect.}: hasBmi2()) and ({.noSideEffect.}: hasAdx()): - montRed_asm_adx_bmi2(r, t, M, m0ninv, canUseNoCarryMontyMul) + mul_asm_adx_bmi2(r, a, b) else: - montRed_asm(r, t, M, m0ninv, canUseNoCarryMontyMul) - elif UseASM_X86_32 and r.len <= 6: - # TODO: Assembly faster than GCC but slower than Clang - montRed_asm(r, t, M, m0ninv, canUseNoCarryMontyMul) + mul_asm(r, a, b) + elif UseASM_X86_64: + mul_asm(r, a, b) else: - var t = t # Copy "t" for mutation and ensure on stack - var res: typeof(r) # Accumulator - staticFor i, 0, N: - var C = Zero - let m = t[i] * SecretWord(m0ninv) - staticFor j, 0, N: - muladd2(C, t[i+j], m, M[j], t[i+j], C) - res[i] = C + # We use Product Scanning / Comba multiplication + var t, u, v = Zero + const stopEx = min(a.len+b.len, r.len) - # This does t[i+n] += C - # but in a separate carry chain, fused with the - # copy "r[i] = t[i+n]" - var carry = Carry(0) - staticFor i, 0, N: - addC(carry, res[i], t[i+N], res[i], carry) + staticFor i, 0, stopEx: + # Invariant for product scanning: + # if we multiply accumulate by a[k1] * b[k2] + # we have k1+k2 == i + const ib = min(b.len-1, i) + const ia = i - ib + staticFor j, 0, min(a.len - ia, ib+1): + mulAcc(t, u, v, a[ia+j], b[ib-j]) - # Final substraction - discard res.csub(M, SecretWord(carry).isNonZero() or not(res < M)) - r = res + r[i] = v + when i < stopEx-1: + v = u + u = t + t = Zero + + if aLen+bLen < rLen: + for i in aLen+bLen ..< rLen: + r[i] = Zero + +func prod_high_words*[rLen, aLen, bLen]( + r: var Limbs[rLen], + a: Limbs[aLen], b: Limbs[bLen], + lowestWordIndex: static int) = + ## Multi-precision multiplication keeping only high words + ## r <- a*b >> (2^WordBitWidth)^lowestWordIndex + ## + ## `a`, `b`, `r` can have a different number of limbs + ## if `r`.limbs.len < a.limbs.len + b.limbs.len - lowestWordIndex + ## The result will be truncated, i.e. it will be + ## a * b >> (2^WordBitWidth)^lowestWordIndex (mod (2^WordBitwidth)^r.limbs.len) + # + # This is useful for + # - Barret reduction + # - Approximating multiplication by a fractional constant in the form f(a) = K/C * a + # with K and C known at compile-time. + # We can instead find a well chosen M = (2^WordBitWidth)^w, with M > C (i.e. M is a power of 2 bigger than C) + # Precompute P = K*M/C at compile-time + # and at runtime do P*a/M <=> P*a >> (WordBitWidth*w) + # i.e. 
prod_high_words(result, P, a, w) + + # We use Product Scanning / Comba multiplication + var t, u, v = Zero # Will raise warning on empty iterations + var z: Limbs[rLen] # zero-init, ensure on stack and removes in-place problems + + # The previous 2 columns can affect the lowest word due to carries + # but not the ones before (we accumulate in 3 words (t, u, v)) + const w = lowestWordIndex - 2 + const stopEx = min(a.len+b.len, r.len+lowestWordIndex) + + staticFor i, max(0, w), stopEx: + const ib = min(b.len-1, i) + const ia = i - ib + staticFor j, 0, min(a.len - ia, ib+1): + mulAcc(t, u, v, a[ia+j], b[ib-j]) + + when i >= lowestWordIndex: + z[i-lowestWordIndex] = v + when i < stopEx-1: + v = u + u = t + t = Zero + + r = z + +func square_Comba[rLen, aLen]( + r: var Limbs[rLen], + a: Limbs[aLen]) = + ## Multi-precision squaring using Comba / Product Scanning + var t, u, v = Zero + const stopEx = min(a.len * 2, r.len) + + staticFor i, 0, stopEx: + # Invariant for product scanning: + # if we multiply accumulate by a[k1] * a[k2] + # we have k1+k2 == i + const ib = min(a.len-1, i) + const ia = i - ib + staticFor j, 0, min(a.len - ia, ib+1): + const k1 = ia+j + const k2 = ib-j + when k1 < k2: + mulDoubleAcc(t, u, v, a[k1], a[k2]) + elif k1 == k2: + mulAcc(t, u, v, a[k1], a[k2]) + else: + discard + + r[i] = v + when i < stopEx-1: + v = u + u = t + t = Zero + + if aLen*2 < rLen: + for i in aLen*2 ..< rLen: + r[i] = Zero + +func square_operandScan[rLen, aLen]( + r: var Limbs[rLen], + a: Limbs[aLen]) = + ## Multi-precision squaring using Operand Scanning + const stopEx = min(a.len * 2, r.len) + var t: typeof(r) # zero-init, ensure on stack + var C = Zero + static: doAssert aLen * 2 == rLen, "Truncated square operand scanning is not implemented" + + staticFor i, 0, stopEx: + staticFor j, i+1, stopEx: + muladd2(C, t[i+j], a[j], a[i], t[i+j], C) + t[i+stopEx] = C + + staticFor i, 0, aLen: + # (t[2*i+1], t[2*i]) <- 2*t[2*i] + a[i]*a[i] + var u, v = Zero + var carry: Carry + # a[i] * a[i] + mul(u, v, a[i], a[i]) + # 2*t[2*i] + addC(carry, t[2*i], t[2*i], t[2*i], Carry(0)) + addC(carry, t[2*i+1], Zero, Zero, carry) + # 2*t[2*i] + a[i] * a[i] + addC(carry, t[2*i], t[2*i], u, Carry(0)) + addC(carry, t[2*i+1], Zero, v, carry) + + r = t + +func square*[rLen, aLen]( + r: var Limbs[rLen], + a: Limbs[aLen]) = + ## Multi-precision squaring + ## r <- a² + ## + ## if `r`.limbs.len < a.limbs.len * 2 + ## The result will be truncated, i.e. 
it will be + ## a² (mod (2^WordBitwidth)^r.limbs.len) + ## + ## `r` must not alias ``a`` or ``b`` + when UseASM_X86_64: + square_asm(r, a) + else: + square_comba(r, a) diff --git a/constantine/arithmetic/limbs_montgomery.nim b/constantine/arithmetic/limbs_montgomery.nim index b8afe32..cae6e09 100644 --- a/constantine/arithmetic/limbs_montgomery.nim +++ b/constantine/arithmetic/limbs_montgomery.nim @@ -14,10 +14,13 @@ import ../primitives, ./limbs +when UseASM_X86_32: + import ./assembly/limbs_asm_montred_x86 when UseASM_X86_64: import ./assembly/limbs_asm_montmul_x86, - ./assembly/limbs_asm_montmul_x86_adx_bmi2 + ./assembly/limbs_asm_montmul_x86_adx_bmi2, + ./assembly/limbs_asm_montred_x86_adx_bmi2 # ############################################################ # @@ -42,7 +45,7 @@ when UseASM_X86_64: # No exceptions allowed {.push raises: [].} -# Implementation +# Montgomery Multiplication # ------------------------------------------------------------ func montyMul_CIOS_nocarry(r: var Limbs, a, b, M: Limbs, m0ninv: BaseType) = @@ -162,6 +165,9 @@ func montyMul_FIPS(r: var Limbs, a, b, M: Limbs, m0ninv: BaseType) = discard z.csub(M, v.isNonZero() or not(z < M)) r = z +# Montgomery Squaring +# ------------------------------------------------------------ + func montySquare_CIOS_nocarry(r: var Limbs, a, M: Limbs, m0ninv: BaseType) {.used.}= ## Montgomery Multiplication using Coarse Grained Operand Scanning (CIOS) ## and no-carry optimization. @@ -267,6 +273,133 @@ func montySquare_CIOS(r: var Limbs, a, M: Limbs, m0ninv: BaseType) {.used.}= discard t.csub(M, tN.isNonZero() or not(t < M)) # TODO: (t >= M) is unnecessary for prime in the form (2^64)^w r = t +# Montgomery Reduction +# ------------------------------------------------------------ +func montyRed_CIOS[N: static int]( + r: var array[N, SecretWord], + a: array[N*2, SecretWord], + M: array[N, SecretWord], + m0ninv: BaseType) = + ## Montgomery reduce a double-width bigint modulo M + # - Analyzing and Comparing Montgomery Multiplication Algorithms + # Cetin Kaya Koc and Tolga Acar and Burton S. Kaliski Jr. + # http://pdfs.semanticscholar.org/5e39/41ff482ec3ee41dc53c3298f0be085c69483.pdf + # + # - Arithmetic of Finite Fields + # Chapter 5 of Guide to Pairing-Based Cryptography + # Jean Luc Beuchat, Luis J. Dominguez Perez, Sylvain Duquesne, Nadia El Mrabet, Laura Fuentes-Castañeda, Francisco Rodríguez-Henríquez, 2017 + # https://www.researchgate.net/publication/319538235_Arithmetic_of_Finite_Fields + # + # Algorithm + # Inputs: + # - N number of limbs + # - a[0 ..< 2N] (double-width input to reduce) + # - M[0 ..< N] The field modulus (must be odd for Montgomery reduction) + # - m0ninv: Montgomery Reduction magic number = -1/M[0] + # Output: + # - r[0 ..< N], in the Montgomery domain + # Parameters: + # - w, the word width usually 64 on 64-bit platforms or 32 on 32-bit + # + # for i in 0 .. n-1: + # C <- 0 + # m <- a[i] * m0ninv mod 2^w (i.e. simple multiplication) + # for j in 0 .. n-1: + # (C, S) <- a[i+j] + m * M[j] + C + # a[i+j] <- S + # a[i+n] += C + # for i in 0 .. n-1: + # r[i] = a[i+n] + # if r >= M: + # r -= M + # + # Important note: `a[i+n] += C` should propagate the carry + # to the higher limb if any, thank you "implementation detail" + # missing from paper. 
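The algorithm comment above can be checked on a toy single-limb instance. A minimal sketch, assuming plain uint32/uint64, a modulus below 2^31 so the intermediate sum fits in 64 bits, and hypothetical helper names (`negInvModWord`, `montRedOneWord`, not Constantine's API, no constant-time guarantees); it shows why the low word cancels (m·M ≡ -a mod 2^w) and why a single final conditional subtraction suffices:

func negInvModWord(M: uint32): uint32 =
  ## -1/M mod 2^32 for odd M, via Newton/Hensel iteration.
  var x = M                                # M*M ≡ 1 (mod 8): 3 correct bits
  for _ in 0 ..< 4:                        # 3 -> 6 -> 12 -> 24 -> 48 correct bits
    x = x * (2'u32 - M * x)                # unsigned arithmetic wraps mod 2^32
  result = 0'u32 - x

func montRedOneWord(a: uint64, M, m0ninv: uint32): uint32 =
  ## One-limb Montgomery reduction: returns a * 2^-32 mod M.
  ## Assumes M odd, M < 2^31 (so the sum below cannot overflow) and a < M * 2^32.
  let m = uint32(a and 0xFFFF_FFFF'u64) * m0ninv   # m <- a[0] * m0ninv mod 2^w
  let t = (a + uint64(m) * uint64(M)) shr 32       # a + m*M ≡ 0 (mod 2^32): the low word cancels
  result = uint32(t)                               # t < 2*M
  if result >= M:
    result -= M                                    # single final conditional subtraction

when isMainModule:
  const M = 97'u32                                 # toy odd modulus
  let m0ninv = negInvModWord(M)
  doAssert montRedOneWord(5'u64 shl 32, M, m0ninv) == 5'u32   # (5*R) * R^-1 mod M = 5
  doAssert montRedOneWord(35'u64, M, m0ninv) == 1'u32         # 2^32 mod 97 = 35, so 35 * R^-1 = 1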
+ + var a = a # Copy "t" for mutation and ensure on stack + var res: typeof(r) # Accumulator + staticFor i, 0, N: + var C = Zero + let m = a[i] * SecretWord(m0ninv) + staticFor j, 0, N: + muladd2(C, a[i+j], m, M[j], a[i+j], C) + res[i] = C + + # This does t[i+n] += C + # but in a separate carry chain, fused with the + # copy "r[i] = t[i+n]" + var carry = Carry(0) + staticFor i, 0, N: + addC(carry, res[i], a[i+N], res[i], carry) + + # Final substraction + discard res.csub(M, SecretWord(carry).isNonZero() or not(res < M)) + r = res + +func montyRed_Comba[N: static int]( + r: var array[N, SecretWord], + a: array[N*2, SecretWord], + M: array[N, SecretWord], + m0ninv: BaseType) = + ## Montgomery reduce a double-width bigint modulo M + # We use Product Scanning / Comba multiplication + var t, u, v = Zero + var carry: Carry + var z: typeof(r) # zero-init, ensure on stack and removes in-place problems in tower fields + staticFor i, 0, N: + staticFor j, 0, i: + mulAcc(t, u, v, z[j], M[i-j]) + + addC(carry, v, v, a[i], Carry(0)) + addC(carry, u, u, Zero, carry) + addC(carry, t, t, Zero, carry) + + z[i] = v * SecretWord(m0ninv) + mulAcc(t, u, v, z[i], M[0]) + v = u + u = t + t = Zero + + staticFor i, N, 2*N-1: + staticFor j, i-N+1, N: + mulAcc(t, u, v, z[j], M[i-j]) + + addC(carry, v, v, a[i], Carry(0)) + addC(carry, u, u, Zero, carry) + addC(carry, t, t, Zero, carry) + + z[i-N] = v + + v = u + u = t + t = Zero + + addC(carry, z[N-1], v, a[2*N-1], Carry(0)) + + # Final substraction + discard z.csub(M, SecretBool(carry) or not(z < M)) + r = z + +# TODO upstream, using Limbs[N] breaks semcheck +func montyRed*[N: static int]( + r: var array[N, SecretWord], + a: array[N*2, SecretWord], + M: array[N, SecretWord], + m0ninv: BaseType, canUseNoCarryMontyMul: static bool) = + ## Montgomery reduce a double-width bigint modulo M + when UseASM_X86_64 and r.len <= 6: + if ({.noSideEffect.}: hasBmi2()) and ({.noSideEffect.}: hasAdx()): + montRed_asm_adx_bmi2(r, a, M, m0ninv, canUseNoCarryMontyMul) + else: + montRed_asm(r, a, M, m0ninv, canUseNoCarryMontyMul) + elif UseASM_X86_32 and r.len <= 6: + # TODO: Assembly faster than GCC but slower than Clang + montRed_asm(r, a, M, m0ninv, canUseNoCarryMontyMul) + else: + montyRed_CIOS(r, a, M, m0ninv) + # montyRed_Comba(r, a, M, m0ninv) + # Exported API # ------------------------------------------------------------ diff --git a/constantine/config/common.nim b/constantine/config/common.nim index 333a698..8acee3f 100644 --- a/constantine/config/common.nim +++ b/constantine/config/common.nim @@ -54,8 +54,8 @@ const # TODO, we restrict assembly to 64-bit words # We need to support register spills for large limbs -const ConstantineASM {.booldefine.} = true -const UseASM_X86_32* = ConstantineASM and X86 and GCC_Compatible +const CttASM {.booldefine.} = true +const UseASM_X86_32* = CttASM and X86 and GCC_Compatible const UseASM_X86_64* = WordBitWidth == 64 and UseASM_X86_32 # ############################################################ diff --git a/constantine/elliptic/ec_shortweierstrass_affine.nim b/constantine/elliptic/ec_shortweierstrass_affine.nim index 71f7fc4..e8e4d6f 100644 --- a/constantine/elliptic/ec_shortweierstrass_affine.nim +++ b/constantine/elliptic/ec_shortweierstrass_affine.nim @@ -90,7 +90,7 @@ func curve_eq_rhs*[F](y2: var F, x: F, Tw: static Twisted) = else: {.error: "Only twisted curves are supported on extension field 𝔽p²".} - y2.diffAlias(t, y2) + y2.diff(t, y2) when F.C.getCoefA() != 0: t = x diff --git 
a/constantine/elliptic/ec_shortweierstrass_projective.nim b/constantine/elliptic/ec_shortweierstrass_projective.nim index 4b3fd6e..c45d3e8 100644 --- a/constantine/elliptic/ec_shortweierstrass_projective.nim +++ b/constantine/elliptic/ec_shortweierstrass_projective.nim @@ -198,7 +198,7 @@ func sum*[F; Tw: static Twisted]( r.y.sum(Q.x, Q.z) # 15. Y₃ <- X₂ + Z₂ r.x *= r.y # 16. X₃ <- X₃ Y₃ X₃ = (X₁Z₁)(X₂Z₂) r.y.sum(t0, t2) # 17. Y₃ <- t₀ + t₂ Y₃ = X₁ X₂ + Z₁ Z₂ - r.y.diffAlias(r.x, r.y) # 18. Y₃ <- X₃ - Y₃ Y₃ = (X₁ + Z₁)(X₂ + Z₂) - (X₁ X₂ + Z₁ Z₂) = X₁Z₂ + X₂Z₁ + r.y.diff(r.x, r.y) # 18. Y₃ <- X₃ - Y₃ Y₃ = (X₁ + Z₁)(X₂ + Z₂) - (X₁ X₂ + Z₁ Z₂) = X₁Z₂ + X₂Z₁ when Tw == OnTwist and F.C.getSexticTwist() == D_Twist: t0 *= SexticNonResidue t1 *= SexticNonResidue @@ -214,7 +214,7 @@ func sum*[F; Tw: static Twisted]( r.y *= SexticNonResidue r.x.prod(t4, r.y) # 25. X₃ <- t₄ Y₃ X₃ = 3b(Y₁Z₂ + Y₂Z₁)(X₁Z₂ + X₂Z₁) t2.prod(t3, t1) # 26. t₂ <- t₃ t₁ t₂ = (X₁Y₂ + X₂Y₁) (Y₁Y₂ - 3bZ₁Z₂) - r.x.diffAlias(t2, r.x) # 27. X₃ <- t₂ - X₃ X₃ = (X₁Y₂ + X₂Y₁) (Y₁Y₂ - 3bZ₁Z₂) - 3b(Y₁Z₂ + Y₂Z₁)(X₁Z₂ + X₂Z₁) + r.x.diff(t2, r.x) # 27. X₃ <- t₂ - X₃ X₃ = (X₁Y₂ + X₂Y₁) (Y₁Y₂ - 3bZ₁Z₂) - 3b(Y₁Z₂ + Y₂Z₁)(X₁Z₂ + X₂Z₁) r.y *= t0 # 28. Y₃ <- Y₃ t₀ Y₃ = 9bX₁X₂ (X₁Z₂ + X₂Z₁) t1 *= r.z # 29. t₁ <- t₁ Z₃ t₁ = (Y₁Y₂ - 3bZ₁Z₂)(Y₁Y₂ + 3bZ₁Z₂) r.y += t1 # 30. Y₃ <- t₁ + Y₃ Y₃ = (Y₁Y₂ + 3bZ₁Z₂)(Y₁Y₂ - 3bZ₁Z₂) + 9bX₁X₂ (X₁Z₂ + X₂Z₁) @@ -277,7 +277,7 @@ func madd*[F; Tw: static Twisted]( r.y *= SexticNonResidue r.x.prod(t4, r.y) # 18. X₃ <- t₄ Y₃, X₃ = (Y₁ + Y₂Z₁) 3b(X₁ + X₂Z₁) t2.prod(t3, t1) # 19. t₂ <- t₃ t₁, t₂ = (X₁Y₂ + X₂Y₁)(Y₁Y₂ - 3bZ₁) - r.x.diffAlias(t2, r.x) # 20. X₃ <- t₂ - X₃, X₃ = (X₁Y₂ + X₂Y₁)(Y₁Y₂ - 3bZ₁) - 3b(Y₁ + Y₂Z₁)(X₁ + X₂Z₁) + r.x.diff(t2, r.x) # 20. X₃ <- t₂ - X₃, X₃ = (X₁Y₂ + X₂Y₁)(Y₁Y₂ - 3bZ₁) - 3b(Y₁ + Y₂Z₁)(X₁ + X₂Z₁) r.y *= t0 # 21. Y₃ <- Y₃ t₀, Y₃ = 9bX₁X₂ (X₁ + X₂Z₁) t1 *= r.z # 22. t₁ <- t₁ Z₃, t₁ = (Y₁Y₂ - 3bZ₁)(Y₁Y₂ + 3bZ₁) r.y += t1 # 23. 
Y₃ <- t₁ + Y₃, Y₃ = (Y₁Y₂ + 3bZ₁)(Y₁Y₂ - 3bZ₁) + 9bX₁X₂ (X₁ + X₂Z₁) diff --git a/constantine/pairing/lines_projective.nim b/constantine/pairing/lines_projective.nim index 58e7bef..31904ed 100644 --- a/constantine/pairing/lines_projective.nim +++ b/constantine/pairing/lines_projective.nim @@ -270,7 +270,7 @@ func line_eval_fused_add[F]( F.prod(T.z, C) G.prod(T.x, D) H.double(G) - H.diffAlias(F, H) + H.diff(F, H) H += E I.prod(T.y, E) diff --git a/constantine/primitives/extended_precision.nim b/constantine/primitives/extended_precision.nim index ae0bc6f..ac98b75 100644 --- a/constantine/primitives/extended_precision.nim +++ b/constantine/primitives/extended_precision.nim @@ -138,3 +138,17 @@ func mulAcc*[T: Ct[uint32]|Ct[uint64]](t, u, v: var T, a, b: T) {.inline.} = addC(carry, v, v, UV[0], Carry(0)) addC(carry, u, u, UV[1], carry) t += T(carry) + +func mulDoubleAcc*[T: Ct[uint32]|Ct[uint64]](t, u, v: var T, a, b: T) {.inline.} = + ## (t, u, v) <- (t, u, v) + 2 * a * b + var UV: array[2, T] + var carry: Carry + mul(UV[1], UV[0], a, b) + + addC(carry, UV[0], UV[0], UV[0], Carry(0)) + addC(carry, UV[1], UV[1], UV[1], carry) + t += T(carry) + + addC(carry, v, v, UV[0], Carry(0)) + addC(carry, u, u, UV[1], carry) + t += T(carry) diff --git a/constantine/primitives/macro_assembler_x86.nim b/constantine/primitives/macro_assembler_x86.nim index ae17ba9..e9c09ca 100644 --- a/constantine/primitives/macro_assembler_x86.nim +++ b/constantine/primitives/macro_assembler_x86.nim @@ -40,7 +40,7 @@ type CarryFlag = "@ccc" Register* = enum - rbx, rdx, r8, rax + rbx, rdx, r8, rax, xmm0 Constraint* = enum ## GCC extended assembly modifier @@ -369,6 +369,26 @@ func codeFragment(a: var Assembler_x86, instr: string, imm: int, op: Operand) = a.operands.incl op.desc +func codeFragment(a: var Assembler_x86, instr: string, reg: Register, op: OperandReuse) = + # Generate a code fragment + # ⚠️ Warning: + # The caller should deal with destination/source operand + # so that it fits GNU Assembly + if a.wordBitWidth == 64: + a.code &= instr & "q %%" & $reg & ", %" & $op.asmId & '\n' + else: + a.code &= instr & "l %%" & $reg & ", %" & $op.asmId & '\n' + +func codeFragment(a: var Assembler_x86, instr: string, op: OperandReuse, reg: Register) = + # Generate a code fragment + # ⚠️ Warning: + # The caller should deal with destination/source operand + # so that it fits GNU Assembly + if a.wordBitWidth == 64: + a.code &= instr & "q %" & $op.asmId & ", %%" & $reg & '\n' + else: + a.code &= instr & "l %" & $op.asmId & ", %%" & $reg & '\n' + func codeFragment(a: var Assembler_x86, instr: string, imm: int, reg: Register) = # Generate a code fragment # ⚠️ Warning: @@ -605,6 +625,17 @@ func mov*(a: var Assembler_x86, dst: Operand, imm: int) = a.codeFragment("mov", imm, dst) # No clobber +func mov*(a: var Assembler_x86, dst: Register, src: OperandReuse) = + ## Does: dst <- src with dst a fixed register + a.codeFragment("mov", src, dst) + # No clobber + +func mov*(a: var Assembler_x86, dst: OperandReuse, src: Register) = + ## Does: dst <- imm + # doAssert dst.desc.constraint in OutputReg, $dst.repr + a.codeFragment("mov", src, dst) + # No clobber + func cmovc*(a: var Assembler_x86, dst, src: Operand) = ## Does: dst <- src if the carry flag is set doAssert dst.desc.rm in {Reg, ElemsInReg}, "The destination operand must be a register: " & $dst.repr @@ -696,19 +727,38 @@ func mulx*(a: var Assembler_x86, dHi, dLo, src0: Operand, src1: Register) = a.operands.incl src0.desc -func adcx*(a: var Assembler_x86, dst, src: Operand) = +func 
mulx*(a: var Assembler_x86, dHi: OperandReuse, dLo, src0: Operand, src1: Register) = + ## Does (dHi, dLo) <- src0 * src1 + doAssert src1 == rdx, "MULX requires the RDX register" + doAssert dLo.desc.rm in {Reg, ElemsInReg}+SpecificRegisters, + "The destination operand must be a register " & $dLo.repr + doAssert dLo.desc.constraint in OutputReg + + let off0 = a.getStrOffset(src0) + + # Annoying AT&T syntax + if a.wordBitWidth == 64: + a.code &= "mulxq " & off0 & ", %" & $dLo.desc.asmId & ", %" & $dHi.asmId & '\n' + else: + a.code &= "mulxl " & off0 & ", %" & $dLo.desc.asmId & ", %" & $dHi.asmId & '\n' + + a.operands.incl src0.desc + +func adcx*(a: var Assembler_x86, dst: Operand|OperandReuse, src: Operand|OperandReuse) = ## Does: dst <- dst + src + carry ## and only sets the carry flag - doAssert dst.desc.constraint in OutputReg, $dst.repr - doAssert dst.desc.rm in {Reg, ElemsInReg}+SpecificRegisters, "The destination operand must be a register: " & $dst.repr + when dst is Operand: + doAssert dst.desc.constraint in OutputReg, $dst.repr + doAssert dst.desc.rm in {Reg, ElemsInReg}+SpecificRegisters, "The destination operand must be a register: " & $dst.repr a.codeFragment("adcx", src, dst) a.areFlagsClobbered = true -func adox*(a: var Assembler_x86, dst, src: Operand) = +func adox*(a: var Assembler_x86, dst: Operand|OperandReuse, src: Operand|OperandReuse) = ## Does: dst <- dst + src + overflow ## and only sets the overflow flag - doAssert dst.desc.constraint in OutputReg, $dst.repr - doAssert dst.desc.rm in {Reg, ElemsInReg}+SpecificRegisters, "The destination operand must be a register: " & $dst.repr + when dst is Operand: + doAssert dst.desc.constraint in OutputReg, $dst.repr + doAssert dst.desc.rm in {Reg, ElemsInReg}+SpecificRegisters, "The destination operand must be a register: " & $dst.repr a.codeFragment("adox", src, dst) a.areFlagsClobbered = true diff --git a/constantine/tower_field_extensions/quadratic_extensions.nim b/constantine/tower_field_extensions/quadratic_extensions.nim index 8b70efe..e761bfd 100644 --- a/constantine/tower_field_extensions/quadratic_extensions.nim +++ b/constantine/tower_field_extensions/quadratic_extensions.nim @@ -87,8 +87,8 @@ func prod_complex(r: var QuadraticExt, a, b: QuadraticExt) = # TODO: GCC is adding an unexplainable 30 cycles tax to this function (~10% slow down) # for seemingly no reason - when true: # Single-width implementation - # Clang 330 cycles on i9-9980XE @4.1 GHz + when false: # Single-width implementation - BLS12-381 + # Clang 348 cycles on i9-9980XE @3.9 GHz var a0b0 {.noInit.}, a1b1 {.noInit.}: typeof(r.c0) a0b0.prod(a.c0, b.c0) # [1 Mul] a1b1.prod(a.c1, b.c1) # [2 Mul] @@ -102,7 +102,7 @@ func prod_complex(r: var QuadraticExt, a, b: QuadraticExt) = r.c1 -= a1b1 # r1 = (b0 + b1)(a0 + a1) - a0b0 - a1b1 # [3 Mul, 2 Add, 3 Sub] else: # Double-width implementation with lazy reduction - # Deactivated for now Clang 360 cycles on i9-9980XE @4.1 GHz + # Clang 341 cycles on i9-9980XE @3.9 GHz var a0b0 {.noInit.}, a1b1 {.noInit.}: doubleWidth(typeof(r.c0)) var d {.noInit.}: doubleWidth(typeof(r.c0)) const msbSet = r.c0.typeof.canUseNoCarryMontyMul() @@ -120,20 +120,19 @@ func prod_complex(r: var QuadraticExt, a, b: QuadraticExt) = d -= a0b0 d -= a1b1 else: - d.diffNoReduce(d, a0b0) # 10 cycles - cumul 152 - d.diffNoReduce(d, a1b1) # 10 cycles - cumul 162 - a0b0.diff(a0b0, a1b1) # 18 cycles - cumul 170 - r.c0.reduce(a0b0) # 68 cycles - cumul 248 - r.c1.reduce(d) # 68 cycles - cumul 316 + d.diffNoReduce(d, a0b0) # 11 cycles - cumul 153 + 
d.diffNoReduce(d, a1b1) # 11 cycles - cumul 164 + a0b0.diff(a0b0, a1b1) # 19 cycles - cumul 183 + r.c0.reduce(a0b0) # 50 cycles - cumul 233 + r.c1.reduce(d) # 50 cycles - cumul 288 # Single-width [3 Mul, 2 Add, 3 Sub] - # 3*81 + 2*14 + 3*12 = 307 theoretical cycles - # 330 measured + # 3*88 + 2*14 + 3*14 = 334 theoretical cycles + # 348 measured # Double-Width - # 316 theoretical cycles - # 365 measured - # Reductions can be 2x10 faster using MCL algorithm - # but there are still unexplained 50 cycles diff between theo and measured + # 288 theoretical cycles + # 329 measured + # Unexplained 40 cycles diff between theo and measured # and unexplained 30 cycles between Clang and GCC # - Function calls? # - push/pop stack? diff --git a/constantine/tower_field_extensions/tower_common.nim b/constantine/tower_field_extensions/tower_common.nim index 0e1f17e..6fdb63d 100644 --- a/constantine/tower_field_extensions/tower_common.nim +++ b/constantine/tower_field_extensions/tower_common.nim @@ -168,18 +168,6 @@ func diff*(r: var CubicExt, a, b: CubicExt) = r.c1.diff(a.c1, b.c1) r.c2.diff(a.c2, b.c2) -func diffAlias*(r: var QuadraticExt, a, b: QuadraticExt) = - ## Diff ``a`` and ``b`` into ``r`` - ## Handles r and b aliasing - r.c0.diffAlias(a.c0, b.c0) - r.c1.diffAlias(a.c1, b.c1) - -func diffAlias*(r: var CubicExt, a, b: CubicExt) = - ## Diff ``a`` and ``b`` into ``r`` - ## Handles r and b aliasing - r.c0.diffAlias(a.c0, b.c0) - r.c1.diffAlias(a.c1, b.c1) - r.c2.diffAlias(a.c2, b.c2) # Conditional arithmetic # ------------------------------------------------------------------- diff --git a/constantine/tower_field_extensions/tower_instantiation.nim b/constantine/tower_field_extensions/tower_instantiation.nim index 39c688a..8502f58 100644 --- a/constantine/tower_field_extensions/tower_instantiation.nim +++ b/constantine/tower_field_extensions/tower_instantiation.nim @@ -196,21 +196,32 @@ func `*=`*(a: var Fp2, _: typedesc[ξ]) {.inline.} = type Fp12*[C: static Curve] = object c0*, c1*, c2*: Fp4[C] + # c0*, c1*: Fp6[C] γ = NonResidue # We call the non-residue γ (Gamma) on 𝔽p6 to avoid confusion between non-residue # of different tower level func `*`*(_: typedesc[γ], a: Fp4): Fp4 {.noInit, inline.} = + ## Multiply an element of 𝔽p4 by the sextic non-residue + ## chosen to construct 𝔽p12 + result.c0 = ξ * a.c1 + result.c1 = a.c0 + +func `*=`*(a: var Fp4, _: typedesc[γ]) {.inline.} = + a = γ * a + +func `*`*(_: typedesc[γ], a: Fp6): Fp6 {.noInit, inline.} = ## Multiply an element of 𝔽p6 by the cubic non-residue ## chosen to construct 𝔽p12 ## For all curves γ = v with v the factor for 𝔽p6 coordinate ## and v³ = ξ ## (c0 + c1 v + c2 v²) v => ξ c2 + c0 v + c1 v² - result.c0 = ξ * a.c1 + result.c0 = ξ * a.c2 result.c1 = a.c0 + result.c2 = a.c1 -func `*=`*(a: var Fp4, _: typedesc[γ]) {.inline.} = +func `*=`*(a: var Fp6, _: typedesc[γ]) {.inline.} = a = γ * a # Sparse functions diff --git a/metering/README.md b/metering/README.md index 53253fb..7456289 100644 --- a/metering/README.md +++ b/metering/README.md @@ -71,7 +71,7 @@ The CPU Cycle Count is indicative only. 
It cannot be used to compare across syst |double* | 7212| 2404000000000.000| 0.003| 0.000| |sum* | 21058| 7019333333333.333| 0.003| 0.000| |diff* | 8884| 2961333333333.333| 0.003| 0.000| -|diffAlias* | 10| inf| 0.000| 0.000| +|diff* | 10| inf| 0.000| 0.000| |double* | 4186| inf| 0.000| 0.000| |prod* | 14486| 1609555555555.555| 0.009| 0.000| |square* | 16| inf| 0.000| 0.000| diff --git a/tests/t_bigints.nim b/tests/t_bigints.nim index 212df6c..0d6db16 100644 --- a/tests/t_bigints.nim +++ b/tests/t_bigints.nim @@ -287,6 +287,28 @@ proc mainMulHigh() = r.prod_high_words(b, a, 2) check: bool(r == expected) +proc mainSquare() = + suite "Multi-precision multiplication" & " [" & $WordBitwidth & "-bit mode]": + test "Squaring is consistent with multiplication (rBits = 2*aBits)": + block: + let a = BigInt[200].fromHex"0xDEADBEEF_DEADBEEF_DEADBEEF_DEADBEEF_DEADBEEF_DEADBEEF_DE" + + var r_mul, r_sqr: BigInt[400] + + r_mul.prod(a, a) + r_sqr.square(a) + check: bool(r_mul == r_sqr) + + test "Squaring is consistent with multiplication (truncated)": + block: + let a = BigInt[200].fromHex"0xDEADBEEF_DEADBEEF_DEADBEEF_DEADBEEF_DEADBEEF_DEADBEEF_DE" + + var r_mul, r_sqr: BigInt[256] + + r_mul.prod(a, a) + r_sqr.square(a) + check: bool(r_mul == r_sqr) + proc mainModular() = suite "Modular operations - small modulus" & " [" & $WordBitwidth & "-bit mode]": # Vectors taken from Stint - https://github.com/status-im/nim-stint @@ -637,9 +659,10 @@ proc mainModularInverse() = check: bool(r == expected) mainArith() -# mainMul() -# mainMulHigh() -# mainModular() -# mainNeg() -# mainCopySwap() -# mainModularInverse() +mainMul() +mainMulHigh() +mainSquare() +mainModular() +mainNeg() +mainCopySwap() +mainModularInverse() diff --git a/tests/t_finite_fields_double_width.nim b/tests/t_finite_fields_double_width.nim index 9934281..0ca027a 100644 --- a/tests/t_finite_fields_double_width.nim +++ b/tests/t_finite_fields_double_width.nim @@ -24,74 +24,100 @@ rng.seed(seed) echo "\n------------------------------------------------------\n" echo "test_finite_fields_double_width xoshiro512** seed: ", seed -proc randomCurve(C: static Curve) = - let a = rng.random_unsafe(Fp[C]) - let b = rng.random_unsafe(Fp[C]) +template mulTest(rng_gen: untyped): untyped = + proc `mul _ rng_gen`(C: static Curve) = + let a = rng_gen(rng, Fp[C]) + let b = rng.random_unsafe(Fp[C]) - var r_fp, r_fpDbl: Fp[C] - var tmpDbl: FpDbl[C] + var r_fp{.noInit.}, r_fpDbl{.noInit.}: Fp[C] + var tmpDbl{.noInit.}: FpDbl[C] - r_fp.prod(a, b) - tmpDbl.mulNoReduce(a, b) - r_fpDbl.reduce(tmpDbl) + r_fp.prod(a, b) + tmpDbl.mulNoReduce(a, b) + r_fpDbl.reduce(tmpDbl) - doAssert bool(r_fp == r_fpDbl) + doAssert bool(r_fp == r_fpDbl) -proc randomHighHammingWeight(C: static Curve) = - let a = rng.random_highHammingWeight(Fp[C]) - let b = rng.random_highHammingWeight(Fp[C]) +template sqrTest(rng_gen: untyped): untyped = + proc `sqr _ rng_gen`(C: static Curve) = + let a = rng_gen(rng, Fp[C]) - var r_fp, r_fpDbl: Fp[C] - var tmpDbl: FpDbl[C] + var mulDbl{.noInit.}, sqrDbl{.noInit.}: FpDbl[C] - r_fp.prod(a, b) - tmpDbl.mulNoReduce(a, b) - r_fpDbl.reduce(tmpDbl) + mulDbl.mulNoReduce(a, a) + sqrDbl.squareNoReduce(a) - doAssert bool(r_fp == r_fpDbl) + doAssert bool(mulDbl == sqrDbl) -proc random_long01Seq(C: static Curve) = - let a = rng.random_long01Seq(Fp[C]) - let b = rng.random_long01Seq(Fp[C]) - - var r_fp, r_fpDbl: Fp[C] - var tmpDbl: FpDbl[C] - - r_fp.prod(a, b) - tmpDbl.mulNoReduce(a, b) - r_fpDbl.reduce(tmpDbl) - - doAssert bool(r_fp == r_fpDbl) +mulTest(random_unsafe) 
+mulTest(randomHighHammingWeight) +mulTest(random_long01Seq) +sqrTest(random_unsafe) +sqrTest(randomHighHammingWeight) +sqrTest(random_long01Seq) suite "Field Multiplication via double-width field elements is consistent with single-width." & " [" & $WordBitwidth & "-bit mode]": test "With P-224 field modulus": for _ in 0 ..< Iters: - randomCurve(P224) + mul_random_unsafe(P224) for _ in 0 ..< Iters: - randomHighHammingWeight(P224) + mul_randomHighHammingWeight(P224) for _ in 0 ..< Iters: - random_long01Seq(P224) + mul_random_long01Seq(P224) test "With P-256 field modulus": for _ in 0 ..< Iters: - randomCurve(P256) + mul_random_unsafe(P256) for _ in 0 ..< Iters: - randomHighHammingWeight(P256) + mul_randomHighHammingWeight(P256) for _ in 0 ..< Iters: - random_long01Seq(P256) + mul_random_long01Seq(P256) test "With BN254_Snarks field modulus": for _ in 0 ..< Iters: - randomCurve(BN254_Snarks) + mul_random_unsafe(BN254_Snarks) for _ in 0 ..< Iters: - randomHighHammingWeight(BN254_Snarks) + mul_randomHighHammingWeight(BN254_Snarks) for _ in 0 ..< Iters: - random_long01Seq(BN254_Snarks) + mul_random_long01Seq(BN254_Snarks) test "With BLS12_381 field modulus": for _ in 0 ..< Iters: - randomCurve(BLS12_381) + mul_random_unsafe(BLS12_381) for _ in 0 ..< Iters: - randomHighHammingWeight(BLS12_381) + mul_randomHighHammingWeight(BLS12_381) for _ in 0 ..< Iters: - random_long01Seq(BLS12_381) + mul_random_long01Seq(BLS12_381) + +suite "Field Squaring via double-width field elements is consistent with single-width." & " [" & $WordBitwidth & "-bit mode]": + test "With P-224 field modulus": + for _ in 0 ..< Iters: + sqr_random_unsafe(P224) + for _ in 0 ..< Iters: + sqr_randomHighHammingWeight(P224) + for _ in 0 ..< Iters: + sqr_random_long01Seq(P224) + + test "With P-256 field modulus": + for _ in 0 ..< Iters: + sqr_random_unsafe(P256) + for _ in 0 ..< Iters: + sqr_randomHighHammingWeight(P256) + for _ in 0 ..< Iters: + sqr_random_long01Seq(P256) + + test "With BN254_Snarks field modulus": + for _ in 0 ..< Iters: + sqr_random_unsafe(BN254_Snarks) + for _ in 0 ..< Iters: + sqr_randomHighHammingWeight(BN254_Snarks) + for _ in 0 ..< Iters: + sqr_random_long01Seq(BN254_Snarks) + + test "With BLS12_381 field modulus": + for _ in 0 ..< Iters: + sqr_random_unsafe(BLS12_381) + for _ in 0 ..< Iters: + sqr_randomHighHammingWeight(BLS12_381) + for _ in 0 ..< Iters: + sqr_random_long01Seq(BLS12_381) diff --git a/tests/t_finite_fields_vs_gmp.nim b/tests/t_finite_fields_vs_gmp.nim index e9d8ef4..30b4ab9 100644 --- a/tests/t_finite_fields_vs_gmp.nim +++ b/tests/t_finite_fields_vs_gmp.nim @@ -155,8 +155,9 @@ proc subTests(gmpRng: var gmp_randstate_t, a, b, p, r: var mpz_t, C: static Curv var r2Test = aTest r2Test -= bTest + # Substraction with r and b aliasing var r3Test = bTest - r3Test.diffAlias(aTest, r3Test) + r3Test.diff(aTest, r3Test) binary_epilogue(r, a, b, rTest, aBuf, bBuf, "Substraction (with result)") binary_epilogue(r, a, b, r2Test, aBuf, bBuf, "Substraction (in-place)")
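A side note on the reworked double-width test file above: `mulTest(random_unsafe)` and friends rely on Nim's identifier construction in templates, so each instantiation declares a proc such as `mul_random_unsafe(C: static Curve)` that the test suites then call by name. A minimal standalone illustration of that mechanism, with hypothetical names that are not part of the patch:

template makeDoubler(name: untyped): untyped =
  # `double _ name` concatenates the pieces into a new identifier,
  # so makeDoubler(three) declares a proc named double_three.
  proc `double _ name`(x: int): int = 2 * x

makeDoubler(three)
doAssert double_three(21) == 42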