From 928f5155827252c01ee0c7a5c58c8b9deecc928f Mon Sep 17 00:00:00 2001 From: Mamy Ratsimbazafy Date: Sat, 29 Oct 2022 22:43:40 +0200 Subject: [PATCH] Batch additions (#207) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Batch elliptic curve addition * accelerate chained muls * jac mixed add handle doubling. jac additions handle aliasing when adding infinity * properly skip sanitizer on BLS signature test * properly skip sanitizer² on BLS signature test --- benchmarks/bench_ec_g1_batch.nim | 68 ++++ benchmarks/bench_elliptic_template.nim | 123 +++---- constantine.nimble | 71 ++-- constantine/math/elliptic/README.md | 2 +- .../elliptic/ec_shortweierstrass_affine.nim | 3 + .../ec_shortweierstrass_batch_ops.nim | 303 ++++++++++++++++++ .../elliptic/ec_shortweierstrass_jacobian.nim | 294 +++++++++++------ .../ec_shortweierstrass_projective.nim | 37 ++- .../elliptic/ec_twistededwards_projective.nim | 4 + constantine/math/extension_fields/towers.nim | 24 ++ constantine/platforms/allocs.nim | 32 ++ helpers/pararun.nim | 2 +- tests/math/t_ec_shortw_jac_g1_batch_add.nim | 30 ++ tests/math/t_ec_shortw_prj_g1_batch_add.nim | 30 ++ tests/math/t_ec_template.nim | 223 +++++++++++-- 15 files changed, 1027 insertions(+), 219 deletions(-) create mode 100644 benchmarks/bench_ec_g1_batch.nim create mode 100644 constantine/math/elliptic/ec_shortweierstrass_batch_ops.nim create mode 100644 constantine/platforms/allocs.nim create mode 100644 tests/math/t_ec_shortw_jac_g1_batch_add.nim create mode 100644 tests/math/t_ec_shortw_prj_g1_batch_add.nim diff --git a/benchmarks/bench_ec_g1_batch.nim b/benchmarks/bench_ec_g1_batch.nim new file mode 100644 index 0000000..8e1ebbd --- /dev/null +++ b/benchmarks/bench_ec_g1_batch.nim @@ -0,0 +1,68 @@ +# Constantine +# Copyright (c) 2018-2019 Status Research & Development GmbH +# Copyright (c) 2020-Present Mamy André-Ratsimbazafy +# Licensed and distributed under either of +# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT). +# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0). +# at your option. This file may not be copied, modified, or distributed except according to those terms. + +import + # Internals + ../constantine/math/config/curves, + ../constantine/math/arithmetic, + ../constantine/math/elliptic/[ + ec_shortweierstrass_affine, + ec_shortweierstrass_projective, + ec_shortweierstrass_jacobian], + # Helpers + ../helpers/static_for, + ./bench_elliptic_template, + # Standard library + std/strutils + +# ############################################################ +# +# Benchmark of the G1 group of +# Short Weierstrass elliptic curves +# in (homogeneous) projective coordinates +# +# ############################################################ + + +const Iters = 10_000 +const AvailableCurves = [ + # BN254_Snarks, + BLS12_381, +] + +proc main() = + separator() + staticFor i, 0, AvailableCurves.len: + const curve = AvailableCurves[i] + addBench(ECP_ShortW_Prj[Fp[curve], G1], Iters) + addBench(ECP_ShortW_Jac[Fp[curve], G1], Iters) + mixedAddBench(ECP_ShortW_Prj[Fp[curve], G1], Iters) + mixedAddBench(ECP_ShortW_Jac[Fp[curve], G1], Iters) + doublingBench(ECP_ShortW_Prj[Fp[curve], G1], Iters) + doublingBench(ECP_ShortW_Jac[Fp[curve], G1], Iters) + separator() + for numPoints in [10, 100, 1000, 10000, 100000, 1000000]: + let batchIters = max(1, Iters div numPoints) + multiAddBench(ECP_ShortW_Prj[Fp[curve], G1], numPoints, useBatching = false, batchIters) + separator() + for numPoints in [10, 100, 1000, 10000, 100000, 1000000]: + let batchIters = max(1, Iters div numPoints) + multiAddBench(ECP_ShortW_Prj[Fp[curve], G1], numPoints, useBatching = true, batchIters) + separator() + for numPoints in [10, 100, 1000, 10000, 100000, 1000000]: + let batchIters = max(1, Iters div numPoints) + multiAddBench(ECP_ShortW_Jac[Fp[curve], G1], numPoints, useBatching = false, batchIters) + separator() + for numPoints in [10, 100, 1000, 10000, 100000, 1000000]: + let batchIters = max(1, Iters div numPoints) + multiAddBench(ECP_ShortW_Jac[Fp[curve], G1], numPoints, useBatching = true, batchIters) + separator() + separator() + +main() +notes() diff --git a/benchmarks/bench_elliptic_template.nim b/benchmarks/bench_elliptic_template.nim index fd83738..4ee293b 100644 --- a/benchmarks/bench_elliptic_template.nim +++ b/benchmarks/bench_elliptic_template.nim @@ -22,6 +22,7 @@ import ec_shortweierstrass_affine, ec_shortweierstrass_projective, ec_shortweierstrass_jacobian, + ec_shortweierstrass_batch_ops, ec_scalar_mul, ec_endomorphism_accel], # Helpers ../helpers/[prng_unsafe, static_for], @@ -35,10 +36,10 @@ export abstractions # generic sandwich on SecretBool and SecretBool in Jacobian proc separator*() = separator(177) -macro fixEllipticDisplay(T: typedesc): untyped = +macro fixEllipticDisplay(EC: typedesc): untyped = # At compile-time, enums are integers and their display is buggy # we get the Curve ID instead of the curve name. - let instantiated = T.getTypeInst() + let instantiated = EC.getTypeInst() var name = $instantiated[1][0] # EllipticEquationFormCoordinates let fieldName = $instantiated[1][1][0] let curveName = $Curve(instantiated[1][1][1].intVal) @@ -53,100 +54,108 @@ proc report(op, elliptic: string, start, stop: MonoTime, startClk, stopClk: int6 else: echo &"{op:<60} {elliptic:<40} {throughput:>15.3f} ops/s {ns:>9} ns/op" -template bench(op: string, T: typedesc, iters: int, body: untyped): untyped = +template bench(op: string, EC: typedesc, iters: int, body: untyped): untyped = measure(iters, startTime, stopTime, startClk, stopClk, body) - report(op, fixEllipticDisplay(T), startTime, stopTime, startClk, stopClk, iters) + report(op, fixEllipticDisplay(EC), startTime, stopTime, startClk, stopClk, iters) -proc addBench*(T: typedesc, iters: int) = - const G1_or_G2 = when T.F is Fp: "G1" else: "G2" - var r {.noInit.}: T - let P = rng.random_unsafe(T) - let Q = rng.random_unsafe(T) - bench("EC Add " & G1_or_G2, T, iters): +proc addBench*(EC: typedesc, iters: int) = + var r {.noInit.}: EC + let P = rng.random_unsafe(EC) + let Q = rng.random_unsafe(EC) + bench("EC Add " & $EC.G, EC, iters): r.sum(P, Q) -proc mixedAddBench*(T: typedesc, iters: int) = - const G1_or_G2 = when T.F is Fp: "G1" else: "G2" - var r {.noInit.}: T - let P = rng.random_unsafe(T) - let Q = rng.random_unsafe(T) - var Qaff: ECP_ShortW_Aff[T.F, T.G] +proc mixedAddBench*(EC: typedesc, iters: int) = + var r {.noInit.}: EC + let P = rng.random_unsafe(EC) + let Q = rng.random_unsafe(EC) + var Qaff: ECP_ShortW_Aff[EC.F, EC.G] Qaff.affine(Q) - bench("EC Mixed Addition " & G1_or_G2, T, iters): + bench("EC Mixed Addition " & $EC.G, EC, iters): r.madd(P, Qaff) -proc doublingBench*(T: typedesc, iters: int) = - const G1_or_G2 = when T.F is Fp: "G1" else: "G2" - var r {.noInit.}: T - let P = rng.random_unsafe(T) - bench("EC Double " & G1_or_G2, T, iters): +proc doublingBench*(EC: typedesc, iters: int) = + var r {.noInit.}: EC + let P = rng.random_unsafe(EC) + bench("EC Double " & $EC.G, EC, iters): r.double(P) -proc affFromProjBench*(T: typedesc, iters: int) = - const G1_or_G2 = when T.F is Fp: "G1" else: "G2" - var r {.noInit.}: ECP_ShortW_Aff[T.F, T.G] - let P = rng.random_unsafe(T) - bench("EC Projective to Affine " & G1_or_G2, T, iters): +proc affFromProjBench*(EC: typedesc, iters: int) = + var r {.noInit.}: ECP_ShortW_Aff[EC.F, EC.G] + let P = rng.random_unsafe(EC) + bench("EC Projective to Affine " & $EC.G, EC, iters): r.affine(P) -proc affFromJacBench*(T: typedesc, iters: int) = - const G1_or_G2 = when T.F is Fp: "G1" else: "G2" - var r {.noInit.}: ECP_ShortW_Aff[T.F, T.G] - let P = rng.random_unsafe(T) - bench("EC Jacobian to Affine " & G1_or_G2, T, iters): +proc affFromJacBench*(EC: typedesc, iters: int) = + var r {.noInit.}: ECP_ShortW_Aff[EC.F, EC.G] + let P = rng.random_unsafe(EC) + bench("EC Jacobian to Affine " & $EC.G, EC, iters): r.affine(P) -proc scalarMulGenericBench*(T: typedesc, window: static int, iters: int) = - const bits = T.F.C.getCurveOrderBitwidth() - const G1_or_G2 = when T.F is Fp: "G1" else: "G2" +proc scalarMulGenericBench*(EC: typedesc, window: static int, iters: int) = + const bits = EC.F.C.getCurveOrderBitwidth() - var r {.noInit.}: T - let P = rng.random_unsafe(T) # TODO: clear cofactor + var r {.noInit.}: EC + let P = rng.random_unsafe(EC) # TODO: clear cofactor let exponent = rng.random_unsafe(BigInt[bits]) - bench("EC ScalarMul " & $bits & "-bit " & G1_or_G2 & " (window-" & $window & ", generic)", T, iters): + bench("EC ScalarMul " & $bits & "-bit " & $EC.G & " (window-" & $window & ", generic)", EC, iters): r = P r.scalarMulGeneric(exponent, window) -proc scalarMulEndo*(T: typedesc, iters: int) = - const bits = T.F.C.getCurveOrderBitwidth() - const G1_or_G2 = when T.F is Fp: "G1" else: "G2" +proc scalarMulEndo*(EC: typedesc, iters: int) = + const bits = EC.F.C.getCurveOrderBitwidth() - var r {.noInit.}: T - let P = rng.random_unsafe(T) # TODO: clear cofactor + var r {.noInit.}: EC + let P = rng.random_unsafe(EC) # TODO: clear cofactor let exponent = rng.random_unsafe(BigInt[bits]) - bench("EC ScalarMul " & $bits & "-bit " & G1_or_G2 & " (endomorphism accelerated)", T, iters): + bench("EC ScalarMul " & $bits & "-bit " & $EC.G & " (endomorphism accelerated)", EC, iters): r = P r.scalarMulEndo(exponent) -proc scalarMulEndoWindow*(T: typedesc, iters: int) = - const bits = T.F.C.getCurveOrderBitwidth() - const G1_or_G2 = when T.F is Fp: "G1" else: "G2" +proc scalarMulEndoWindow*(EC: typedesc, iters: int) = + const bits = EC.F.C.getCurveOrderBitwidth() - var r {.noInit.}: T - let P = rng.random_unsafe(T) # TODO: clear cofactor + var r {.noInit.}: EC + let P = rng.random_unsafe(EC) # TODO: clear cofactor let exponent = rng.random_unsafe(BigInt[bits]) - bench("EC ScalarMul " & $bits & "-bit " & G1_or_G2 & " (window-2, endomorphism accelerated)", T, iters): + bench("EC ScalarMul " & $bits & "-bit " & $EC.G & " (window-2, endomorphism accelerated)", EC, iters): r = P - when T.F is Fp: + when EC.F is Fp: r.scalarMulGLV_m2w2(exponent) else: {.error: "Not implemented".} -proc scalarMulUnsafeDoubleAddBench*(T: typedesc, iters: int) = - const bits = T.F.C.getCurveOrderBitwidth() - const G1_or_G2 = when T.F is Fp: "G1" else: "G2" +proc scalarMulUnsafeDoubleAddBench*(EC: typedesc, iters: int) = + const bits = EC.F.C.getCurveOrderBitwidth() - var r {.noInit.}: T - let P = rng.random_unsafe(T) # TODO: clear cofactor + var r {.noInit.}: EC + let P = rng.random_unsafe(EC) # TODO: clear cofactor let exponent = rng.random_unsafe(BigInt[bits]) - bench("EC ScalarMul " & $bits & "-bit " & G1_or_G2 & " (unsafe reference DoubleAdd)", T, iters): + bench("EC ScalarMul " & $bits & "-bit " & $EC.G & " (unsafe reference DoubleAdd)", EC, iters): r = P r.unsafe_ECmul_double_add(exponent) + +proc multiAddBench*(EC: typedesc, numPoints: int, useBatching: bool, iters: int) = + var points = newSeq[ECP_ShortW_Aff[EC.F, EC.G]](numPoints) + + for i in 0 ..< numPoints: + points[i] = rng.random_unsafe(ECP_ShortW_Aff[EC.F, EC.G]) + + var r{.noInit.}: EC + + if useBatching: + bench("EC Multi-Addition batched " & $EC.G & " (" & $numPoints & " points)", EC, iters): + r.sum_batch_vartime(points) + else: + bench("EC Multi-Addition unbatched mixed add " & $EC.G & " (" & $numPoints & " points)", EC, iters): + r.setInf() + for i in 0 ..< numPoints: + r += points[i] \ No newline at end of file diff --git a/constantine.nimble b/constantine.nimble index c003a9f..e279bc5 100644 --- a/constantine.nimble +++ b/constantine.nimble @@ -96,13 +96,13 @@ const testDesc: seq[tuple[path: string, useGMP: bool]] = @[ # Elliptic curve arithmetic G1 # ---------------------------------------------------------- - # ("tests/math/t_ec_shortw_prj_g1_add_double.nim", false), + ("tests/math/t_ec_shortw_prj_g1_add_double.nim", false), # ("tests/math/t_ec_shortw_prj_g1_mul_sanity.nim", false), # ("tests/math/t_ec_shortw_prj_g1_mul_distri.nim", false), ("tests/math/t_ec_shortw_prj_g1_mul_vs_ref.nim", false), ("tests/math/t_ec_shortw_prj_g1_mixed_add.nim", false), - # ("tests/math/t_ec_shortw_jac_g1_add_double.nim", false), + ("tests/math/t_ec_shortw_jac_g1_add_double.nim", false), # ("tests/math/t_ec_shortw_jac_g1_mul_sanity.nim", false), # ("tests/math/t_ec_shortw_jac_g1_mul_distri.nim", false), ("tests/math/t_ec_shortw_jac_g1_mul_vs_ref.nim", false), @@ -115,49 +115,49 @@ const testDesc: seq[tuple[path: string, useGMP: bool]] = @[ # Elliptic curve arithmetic G2 # ---------------------------------------------------------- - # ("tests/math/t_ec_shortw_prj_g2_add_double_bn254_snarks.nim", false), + ("tests/math/t_ec_shortw_prj_g2_add_double_bn254_snarks.nim", false), # ("tests/math/t_ec_shortw_prj_g2_mul_sanity_bn254_snarks.nim", false), # ("tests/math/t_ec_shortw_prj_g2_mul_distri_bn254_snarks.nim", false), ("tests/math/t_ec_shortw_prj_g2_mul_vs_ref_bn254_snarks.nim", false), ("tests/math/t_ec_shortw_prj_g2_mixed_add_bn254_snarks.nim", false), - # ("tests/math/t_ec_shortw_prj_g2_add_double_bls12_381.nim", false), + ("tests/math/t_ec_shortw_prj_g2_add_double_bls12_381.nim", false), # ("tests/math/t_ec_shortw_prj_g2_mul_sanity_bls12_381.nim", false), # ("tests/math/t_ec_shortw_prj_g2_mul_distri_bls12_381.nim", false), ("tests/math/t_ec_shortw_prj_g2_mul_vs_ref_bls12_381.nim", false), ("tests/math/t_ec_shortw_prj_g2_mixed_add_bls12_381.nim", false), - # ("tests/math/t_ec_shortw_prj_g2_add_double_bls12_377.nim", false), + ("tests/math/t_ec_shortw_prj_g2_add_double_bls12_377.nim", false), # ("tests/math/t_ec_shortw_prj_g2_mul_sanity_bls12_377.nim", false), # ("tests/math/t_ec_shortw_prj_g2_mul_distri_bls12_377.nim", false), ("tests/math/t_ec_shortw_prj_g2_mul_vs_ref_bls12_377.nim", false), ("tests/math/t_ec_shortw_prj_g2_mixed_add_bls12_377.nim", false), - # ("tests/math/t_ec_shortw_prj_g2_add_double_bw6_761.nim", false), + ("tests/math/t_ec_shortw_prj_g2_add_double_bw6_761.nim", false), # ("tests/math/t_ec_shortw_prj_g2_mul_sanity_bw6_761.nim", false), # ("tests/math/t_ec_shortw_prj_g2_mul_distri_bw6_761.nim", false), ("tests/math/t_ec_shortw_prj_g2_mul_vs_ref_bw6_761.nim", false), ("tests/math/t_ec_shortw_prj_g2_mixed_add_bw6_761.nim", false), - # ("tests/math/t_ec_shortw_jac_g2_add_double_bn254_snarks.nim", false), + ("tests/math/t_ec_shortw_jac_g2_add_double_bn254_snarks.nim", false), # ("tests/math/t_ec_shortw_jac_g2_mul_sanity_bn254_snarks.nim", false), # ("tests/math/t_ec_shortw_jac_g2_mul_distri_bn254_snarks.nim", false), ("tests/math/t_ec_shortw_jac_g2_mul_vs_ref_bn254_snarks.nim", false), ("tests/math/t_ec_shortw_jac_g2_mixed_add_bn254_snarks.nim", false), - # ("tests/math/t_ec_shortw_jac_g2_add_double_bls12_381.nim", false), + ("tests/math/t_ec_shortw_jac_g2_add_double_bls12_381.nim", false), # ("tests/math/t_ec_shortw_jac_g2_mul_sanity_bls12_381.nim", false), # ("tests/math/t_ec_shortw_jac_g2_mul_distri_bls12_381.nim", false), ("tests/math/t_ec_shortw_jac_g2_mul_vs_ref_bls12_381.nim", false), ("tests/math/t_ec_shortw_jac_g2_mixed_add_bls12_381.nim", false), - # ("tests/math/t_ec_shortw_jac_g2_add_double_bls12_377.nim", false), + ("tests/math/t_ec_shortw_jac_g2_add_double_bls12_377.nim", false), # ("tests/math/t_ec_shortw_jac_g2_mul_sanity_bls12_377.nim", false), # ("tests/math/t_ec_shortw_jac_g2_mul_distri_bls12_377.nim", false), ("tests/math/t_ec_shortw_jac_g2_mul_vs_ref_bls12_377.nim", false), ("tests/math/t_ec_shortw_jac_g2_mixed_add_bls12_377.nim", false), - # ("tests/math/t_ec_shortw_jac_g2_add_double_bw6_761.nim", false), + ("tests/math/t_ec_shortw_jac_g2_add_double_bw6_761.nim", false), # ("tests/math/t_ec_shortw_jac_g2_mul_sanity_bw6_761.nim", false), # ("tests/math/t_ec_shortw_jac_g2_mul_distri_bw6_761.nim", false), ("tests/math/t_ec_shortw_jac_g2_mul_vs_ref_bw6_761.nim", false), @@ -177,6 +177,11 @@ const testDesc: seq[tuple[path: string, useGMP: bool]] = @[ # ---------------------------------------------------------- ("tests/math/t_ec_shortw_prj_edge_cases.nim", false), + # Elliptic curve arithmetic - batch computation + # ---------------------------------------------------------- + ("tests/math/t_ec_shortw_prj_g1_batch_add.nim", false), + ("tests/math/t_ec_shortw_jac_g1_batch_add.nim", false), + # Subgroups and cofactors # ---------------------------------------------------------- ("tests/math/t_ec_subgroups_bn254_nogami.nim", false), @@ -232,6 +237,7 @@ const benchDesc = [ "bench_fp6", "bench_fp12", "bench_ec_g1", + "bench_ec_g1_batch", "bench_ec_g2", "bench_pairing_bls12_377", "bench_pairing_bls12_381", @@ -262,13 +268,14 @@ const skipSanitizers = [ "tests/math/t_ec_sage_bn254_snarks.nim", "tests/math/t_ec_sage_bls12_377.nim", "tests/math/t_ec_sage_bls12_381.nim", + "tests/t_blssig_pop_on_bls12381_g2.nim", "tests/t_hash_to_field.nim", "tests/t_hash_to_curve.nim", "tests/t_hash_to_curve_random.nim", "tests/t_mac_poly1305.nim", "tests/t_mac_hmac.nim", "tests/t_kdf_hkdf.nim", - "tests/t_ethereum_eip2333_bls12381_key_derivation" + "tests/t_ethereum_eip2333_bls12381_key_derivation.nim" ] when defined(windows): @@ -302,8 +309,8 @@ template setupCommand(): untyped {.dirty.} = var flags = flags when not defined(windows): # Not available in MinGW https://github.com/libressl-portable/portable/issues/54 - flags &= " --passC:-fstack-protector-all" - let command = "nim " & lang & cc & " " & flags & + flags &= " --passC:-fstack-protector-strong" + let command = "nim " & lang & cc & " -d:release " & flags & " --verbosity:0 --outdir:build/testsuite -r --hints:off --warnings:off " & " --nimcache:nimcache/" & path & " " & path @@ -672,37 +679,55 @@ task bench_fp12_clang_noasm, "Run benchmark 𝔽p12 with clang - no Assembly": # Elliptic curve G1 # ------------------------------------------ -task bench_ec_g1, "Run benchmark on Elliptic Curve group 𝔾1 - Short Weierstrass with Projective Coordinates - Default compiler": +task bench_ec_g1, "Run benchmark on Elliptic Curve group 𝔾1 - Default compiler": runBench("bench_ec_g1") -task bench_ec_g1_gcc, "Run benchmark on Elliptic Curve group 𝔾1 - Short Weierstrass with Projective Coordinates - GCC": +task bench_ec_g1_gcc, "Run benchmark on Elliptic Curve group 𝔾1 - GCC": runBench("bench_ec_g1", "gcc") -task bench_ec_g1_clang, "Run benchmark on Elliptic Curve group 𝔾1 - Short Weierstrass with Projective Coordinates - Clang": +task bench_ec_g1_clang, "Run benchmark on Elliptic Curve group 𝔾1 - Clang": runBench("bench_ec_g1", "clang") -task bench_ec_g1_gcc_noasm, "Run benchmark on Elliptic Curve group 𝔾1 - Short Weierstrass with Projective Coordinates - GCC no Assembly": +task bench_ec_g1_gcc_noasm, "Run benchmark on Elliptic Curve group 𝔾1 - GCC no Assembly": runBench("bench_ec_g1", "gcc", useAsm = false) -task bench_ec_g1_clang_noasm, "Run benchmark on Elliptic Curve group 𝔾1 - Short Weierstrass with Projective Coordinates - Clang no Assembly": +task bench_ec_g1_clang_noasm, "Run benchmark on Elliptic Curve group 𝔾1 - Clang no Assembly": + runBench("bench_ec_g1", "clang", useAsm = false) + +# Elliptic curve G1 - batch operations +# ------------------------------------------ + +task bench_ec_g1_batch, "Run benchmark on Elliptic Curve group 𝔾1 (batch ops) - Default compiler": + runBench("bench_ec_g1_batch") + +task bench_ec_g1_batch_gcc, "Run benchmark on Elliptic Curve group 𝔾1 (batch ops) - GCC": + runBench("bench_ec_g1_batch", "gcc") + +task bench_ec_g1_batch_clang, "Run benchmark on Elliptic Curve group 𝔾1 (batch ops) - Clang": + runBench("bench_ec_g1_batch", "clang") + +task bench_ec_g1_batch_gcc_noasm, "Run benchmark on Elliptic Curve group 𝔾1 (batch ops) - GCC no Assembly": + runBench("bench_ec_g1_batch", "gcc", useAsm = false) + +task bench_ec_g1_batch_clang_noasm, "Run benchmark on Elliptic Curve group 𝔾1 (batch ops) - Clang no Assembly": runBench("bench_ec_g1", "clang", useAsm = false) # Elliptic curve G2 # ------------------------------------------ -task bench_ec_g2, "Run benchmark on Elliptic Curve group 𝔾2 - Short Weierstrass with Projective Coordinates - Default compiler": +task bench_ec_g2, "Run benchmark on Elliptic Curve group 𝔾2 - Default compiler": runBench("bench_ec_g2") -task bench_ec_g2_gcc, "Run benchmark on Elliptic Curve group 𝔾2 - Short Weierstrass with Projective Coordinates - GCC": +task bench_ec_g2_gcc, "Run benchmark on Elliptic Curve group 𝔾2 - GCC": runBench("bench_ec_g2", "gcc") -task bench_ec_g2_clang, "Run benchmark on Elliptic Curve group 𝔾2 - Short Weierstrass with Projective Coordinates - Clang": +task bench_ec_g2_clang, "Run benchmark on Elliptic Curve group 𝔾2 - Clang": runBench("bench_ec_g2", "clang") -task bench_ec_g2_gcc_noasm, "Run benchmark on Elliptic Curve group 𝔾2 - Short Weierstrass with Projective Coordinates - GCC no Assembly": +task bench_ec_g2_gcc_noasm, "Run benchmark on Elliptic Curve group 𝔾2 - GCC no Assembly": runBench("bench_ec_g2", "gcc", useAsm = false) -task bench_ec_g2_clang_noasm, "Run benchmark on Elliptic Curve group 𝔾2 - Short Weierstrass with Projective Coordinates - Clang no Assembly": +task bench_ec_g2_clang_noasm, "Run benchmark on Elliptic Curve group 𝔾2 - Clang no Assembly": runBench("bench_ec_g2", "clang", useAsm = false) # Pairings diff --git a/constantine/math/elliptic/README.md b/constantine/math/elliptic/README.md index ba7e7d7..101ea11 100644 --- a/constantine/math/elliptic/README.md +++ b/constantine/math/elliptic/README.md @@ -62,7 +62,7 @@ Ry = λ(Px - Rx) - Py ``` but in the case of addition ``` -λ = (Qy - Py) / (Px - Qx) +λ = (Qy - Py) / (Qx - Px) ``` which is undefined for P == Q or P == -Q (as `-(x, y) = (x, -y)`) diff --git a/constantine/math/elliptic/ec_shortweierstrass_affine.nim b/constantine/math/elliptic/ec_shortweierstrass_affine.nim index e06afa7..c01a7fc 100644 --- a/constantine/math/elliptic/ec_shortweierstrass_affine.nim +++ b/constantine/math/elliptic/ec_shortweierstrass_affine.nim @@ -14,6 +14,9 @@ import ../io/[io_fields, io_extfields], ../constants/zoo_constants +# No exceptions allowed +{.push raises: [].} + # ############################################################ # # Elliptic Curve in Short Weierstrass form diff --git a/constantine/math/elliptic/ec_shortweierstrass_batch_ops.nim b/constantine/math/elliptic/ec_shortweierstrass_batch_ops.nim new file mode 100644 index 0000000..5e2a087 --- /dev/null +++ b/constantine/math/elliptic/ec_shortweierstrass_batch_ops.nim @@ -0,0 +1,303 @@ +# Constantine +# Copyright (c) 2018-2019 Status Research & Development GmbH +# Copyright (c) 2020-Present Mamy André-Ratsimbazafy +# Licensed and distributed under either of +# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT). +# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0). +# at your option. This file may not be copied, modified, or distributed except according to those terms. + +import + ../../platforms/[abstractions, allocs], + ../arithmetic, + ../extension_fields, + ./ec_shortweierstrass_affine, + ./ec_shortweierstrass_jacobian, + ./ec_shortweierstrass_projective + +# No exceptions allowed +{.push raises: [].} + +# ############################################################ +# +# Elliptic Curve in Short Weierstrass form +# Batch addition +# +# ############################################################ + +# Affine primitives +# ------------------------------------------------------------ +# +# The equation for elliptic curve addition is in affine (x, y) coordinates: +# +# P + Q = R +# (Px, Py) + (Qx, Qy) = (Rx, Ry) +# +# with +# Rx = λ² - Px - Qx +# Ry = λ(Px - Rx) - Py +# +# in the case of addition +# λ = (Qy - Py) / (Qx - Px) +# +# which is undefined for P == Q or P == -Q as -(x, y) = (x, -y) +# +# if P = Q, the doubling formula uses the slope of the tangent at the limit +# λ = (3 Px² + a) / (2 Px) +# +# if P = -Q, the sum is the point at infinity +# +# ~~~~ +# +# Those formulas require +# addition: 2M + 1S + 1I +# doubling: 2M + 2S + 1I +# +# Inversion is very expensive: +# 119.5x multiplications (with ADX) for BN254 +# 98.4x multiplications (with ADX) for BLS12-381 +# +# However, n inversions can use Montgomery's batch inversion +# at the cost of 3(n-1)M + 1I +# +# Hence batch addition can have an asymptotic cost of +# 5M + 1S +# Compared to +# Jacobian addition: 12M + 4S +# Jacobian mixed addition: 7M + 4S +# Projective addition: 12M (for curves in the form y² = x³ + b) +# Projective mixed addition: 11M (for curves in the form y² = x³ + b) + +func lambdaAdd[F; G: static Subgroup](lambda_num, lambda_den: var F, P, Q: ECP_ShortW_Aff[F, G]) = + ## Compute the slope of the line (PQ) + lambda_num.diff(Q.y, P.y) + lambda_den.diff(Q.x, P.x) + +func lambdaDouble[F; G: static Subgroup](lambda_num, lambda_den: var F, P: ECP_ShortW_Aff[F, G]) = + ## Compute the tangent at P + lambda_num.square(P.x) + lambda_num *= 3 + when F.C.getCoefA() != 0: + t += F.C.getCoefA() + + lambda_den.double(P.y) + +func affineAdd[F; G: static Subgroup]( + r: var ECP_ShortW_Aff[F, G], + lambda: var F, + P, Q: ECP_ShortW_Aff[F, G]) = + + r.x.square(lambda) + r.x -= P.x + r.x -= Q.x + + r.y.diff(P.x, r.x) + r.y *= lambda + r.y -= P.y + +{.push checks:off.} +func accum_half_vartime[F; G: static Subgroup]( + points: ptr UncheckedArray[ECP_ShortW_Aff[F, G]], + lambdas: ptr UncheckedArray[tuple[num, den: F]], + len: uint) {.noinline.} = + ## Affine accumulation of half the points into the other half + ## Warning ⚠️ : variable-time + ## + ## Accumulate `len` points pairwise into `len/2` + ## + ## Input/output: + ## - points: `len/2` affine points to add (must be even) + ## Partial sums are stored in [0, len/2) + ## [len/2, len) data has been destroyed + ## + ## Scratchspace: + ## - Lambdas + ## + ## Output: + ## - r + ## + ## Warning ⚠️ : cannot be inlined if used in loop due to the use of alloca + + debug: doAssert len and 1 == 0, "There must be an even number of points" + + let N = len div 2 + + # Step 1: Compute numerators and denominators of λᵢ = λᵢ_num / λᵢ_den + for i in 0 ..< N: + let p = i + let q = i+N + let q_prev = i-1+N + + # As we can't divide by 0 in normal cases, λᵢ_den != 0, + # so we use it to indicate special handling. + template markSpecialCase(): untyped {.dirty.} = + # we use Qy as an accumulator, so we save Qy in λᵢ_num + lambdas[i].num = points[q].y + # Mark for special handling + lambdas[i].den.setZero() + + # Step 2: Accumulate denominators in Qy, which is not used anymore. + if i == 0: + points[q].y.setOne() + else: + points[q].y = points[q_prev].y + + # Special case 1: infinity points have affine coordinates (0, 0) by convention + # it doesn't match the y²=x³+ax+b equation so slope formula need special handling + if points[p].isInf().bool() or points[q].isInf().bool(): + markSpecialCase() + continue + + # Special case 2: λ = (Qy-Py)/(Qx-Px) which is undefined when Px == Qx + # This happens when P == Q or P == -Q + if bool(points[p].x == points[q].x): + if bool(points[p].y == points[q].y): + lambdaDouble(lambdas[i].num, lambdas[i].den, points[p]) + else: # P = -Q, so P+Q = inf + markSpecialCase() + continue + else: + lambdaAdd(lambdas[i].num, lambdas[i].den, points[p], points[q]) + + # Step 2: Accumulate denominators in Qy, which is not used anymore. + if i == 0: + points[q].y = lambdas[i].den + else: + points[q].y.prod(points[q_prev].y, lambdas[i].den, skipFinalSub = true) + + # Step 3: batch invert + var accInv {.noInit.}: F + accInv.setZero() + points[len-1].y += accInv # Undo skipFinalSub, ensure that the last accum is in canonical form, before inversion + accInv.inv(points[len-1].y) + + # Step 4: Compute the partial sums + + template recallSpecialCase(i, p, q): untyped {.dirty.} = + # As Qy is used as an accumulator, we saved Qy in λᵢ_num + # For special caseshandling, restore it. + points[q].y = lambdas[i].num + if points[p].isInf().bool(): + points[i] = points[q] + elif points[q].x.isZero().bool() and lambdas[i].num.isZero().bool(): + discard "points[i] = points[p]" # i == p + else: + points[i].setInf() + + for i in countdown(N-1, 1): + let p = i + let q = i+N + let q_prev = i-1+N + + if lambdas[i].den.isZero().bool(): + recallSpecialCase(i, p, q) + continue + + # Compute lambda + points[q].y.prod(accInv, points[q_prev].y, skipFinalSub = true) + points[q].y.prod(points[q].y, lambdas[i].num, skipFinalSub = true) + + # Compute EC addition + var r{.noInit.}: ECP_ShortW_Aff[F, G] + r.affineAdd(lambda = points[q].y, points[p], points[q]) + + # Store result + points[i] = r + + # Next iteration + accInv.prod(accInv, lambdas[i].den, skipFinalSub = true) + + block: # Tail + let i = 0 + let p = 0 + let q = N + + if lambdas[0].den.isZero().bool(): + recallSpecialCase(i, p, q) + else: + # Compute lambda + points[q].y.prod(lambdas[0].num, accInv, skipFinalSub = true) + + # Compute EC addition + var r{.noInit.}: ECP_ShortW_Aff[F, G] + r.affineAdd(lambda = points[q].y, points[p], points[q]) + + # Store result + points[0] = r + +{.pop.} + +# Batch addition: jacobian +# ------------------------------------------------------------ + +{.push checks:off.} +func accumSum_chunk_vartime[F; G: static Subgroup]( + r: var (ECP_ShortW_Jac[F, G] or ECP_ShortW_Prj[F, G]), + points: ptr UncheckedArray[ECP_ShortW_Aff[F, G]], + lambdas: ptr UncheckedArray[tuple[num, den: F]], + len: uint) = + ## Accumulate `points` into r. + ## `r` is NOT overwritten + ## r += ∑ points + + const ChunkThreshold = 16 + var n = len + + while n >= ChunkThreshold: + if (n and 1) == 1: # odd number of points + ## Accumulate the last + r += points[n-1] + n -= 1 + + # Compute [0, n/2) += [n/2, n) + accum_half_vartime(points, lambdas, n) + + # Next chunk + n = n div 2 + + # Tail + for i in 0'u ..< n: + r += points[i] +{.pop.} + +{.push checks:off.} +func sum_batch_vartime*[F; G: static Subgroup]( + r: var (ECP_ShortW_Jac[F, G] or ECP_ShortW_Prj[F, G]), + points: openArray[ECP_ShortW_Aff[F, G]]) = + ## Batch addition of `points` into `r` + ## `r` is overwritten + + # We chunk the addition to limit memory usage + # especially as we allocate on the stack. + + # From experience in high-performance computing, + # here are the constraints we want to optimize for + # 1. MSVC limits stack to 1MB by default, we want to use a fraction of that. + # 2. We want to use a large fraction of L2 cache, but not more. + # 3. We want to use a large fraction of the memory addressable by the TLB. + # 4. We optimize for hyperthreading with 2 sibling threads (Xeon Phi hyperthreads have 4 siblings). + # Meaning we want to use less than half the L2 cache so that if run on siblings threads (same physical core), + # the chunks don't evict each other. + # + # Hardware: + # - a Raspberry Pi 4 (2019, Cortex A72) has 1MB L2 cache size + # - Intel Ice Lake (2019, Core 11XXX) and AMD Zen 2 (2019, Ryzen 3XXX) have 512kB L2 cache size + # + # After one chunk is processed we are well within all 64-bit CPU L2 cache bounds + # as we halve after each chunk. + + r.setInf() + + const maxChunkSize = 262144 # 2¹⁸ = 262144 + const maxStride = maxChunkSize div sizeof(ECP_ShortW_Aff[F, G]) + + let n = min(maxStride, points.len) + let accumulators = alloca(ECP_ShortW_Aff[F, G], n) + let lambdas = alloca(tuple[num, den: F], n) + + for i in countup(0, points.len-1, maxStride): + let n = min(maxStride, points.len - i) + let size = n * sizeof(ECP_ShortW_Aff[F, G]) + copyMem(accumulators[0].addr, points[i].unsafeAddr, size) + r.accumSum_chunk_vartime(accumulators, lambdas, uint n) + +{.pop.} \ No newline at end of file diff --git a/constantine/math/elliptic/ec_shortweierstrass_jacobian.nim b/constantine/math/elliptic/ec_shortweierstrass_jacobian.nim index c36c735..ca41fc0 100644 --- a/constantine/math/elliptic/ec_shortweierstrass_jacobian.nim +++ b/constantine/math/elliptic/ec_shortweierstrass_jacobian.nim @@ -15,6 +15,9 @@ import export Subgroup +# No exceptions allowed +{.push raises: [].} + # ############################################################ # # Elliptic Curve in Short Weierstrass form @@ -47,15 +50,15 @@ func `==`*(P, Q: ECP_ShortW_Jac): SecretBool = var z1z1 {.noInit.}, z2z2 {.noInit.}: F var a{.noInit.}, b{.noInit.}: F - z1z1.square(P.z) - z2z2.square(Q.z) + z1z1.square(P.z, skipFinalSub = true) + z2z2.square(Q.z, skipFinalSub = true) a.prod(P.x, z2z2) b.prod(Q.x, z1z1) result = a == b - a.prod(P.y, Q.z) - b.prod(Q.y, P.z) + a.prod(P.y, Q.z, skipFinalSub = true) + b.prod(Q.y, P.z, skipFinalSub = true) a *= z2z2 b *= z1z1 result = result and a == b @@ -108,9 +111,9 @@ func trySetFromCoordsXandZ*[F; G]( result = sqrt_if_square(P.y) var z2 {.noInit.}: F - z2.square(z) + z2.square(z, skipFinalSub = true) P.x.prod(x, z2) - P.y *= z2 + P.y.prod(P.y, z2, skipFinalSub = true) P.y *= z P.z = z @@ -209,13 +212,13 @@ template sumImpl[F; G: static Subgroup]( block: # Addition-only, check for exceptional cases var Z2Z2 {.noInit.}, U2 {.noInit.}, S2 {.noInit.}: F - Z2Z2.square(Q.z) - S1.prod(Q.z, Z2Z2) + Z2Z2.square(Q.z, skipFinalSub = true) + S1.prod(Q.z, Z2Z2, skipFinalSub = true) S1 *= P.y # S₁ = Y₁*Z₂³ U1.prod(P.x, Z2Z2) # U₁ = X₁*Z₂² - Z1Z1.square(P.z) - S2.prod(P.z, Z1Z1) + Z1Z1.square(P.z, skipFinalSub = true) + S2.prod(P.z, Z1Z1, skipFinalSub = true) S2 *= Q.y # S₂ = Y₂*Z₁³ U2.prod(Q.x, Z1Z1) # U₂ = X₂*Z₁² @@ -285,10 +288,10 @@ template sumImpl[F; G: static Subgroup]( var b{.noInit.} = HH_or_YY a.ccopy(P.x, isDbl) b.ccopy(P.x, isDbl) - HHH_or_Mpre.prod(a, b) # HHH or X₁² + HHH_or_Mpre.prod(a, b, true) # HHH or X₁² # Assuming doubling path - a.square(HHH_or_Mpre) + a.square(HHH_or_Mpre, skipFinalSub = true) a *= HHH_or_Mpre # a = 3X₁² b.square(Z1Z1) # b.mulCheckSparse(CoefA) # TODO: broken static compile-time type inference @@ -302,31 +305,33 @@ template sumImpl[F; G: static Subgroup]( # - R_or_M is set with R (add) or M (dbl) # - HHH_or_Mpre contains HHH (add) or garbage precomputation (dbl) # - V_or_S is set with V = U₁*HH (add) or S = X₁*YY (dbl) + var o {.noInit.}: typeof(r) block: # Finishing line - # we can start using r, while carefully handling r and P or Q aliasing var t {.noInit.}: F t.double(V_or_S) - r.x.square(R_or_M) - r.x -= t # X₃ = R²-2*V (add) or M²-2*S (dbl) - r.x.csub(HHH_or_Mpre, not isDbl) # X₃ = R²-HHH-2*V (add) or M²-2*S (dbl) + o.x.square(R_or_M) + o.x -= t # X₃ = R²-2*V (add) or M²-2*S (dbl) + o.x.csub(HHH_or_Mpre, not isDbl) # X₃ = R²-HHH-2*V (add) or M²-2*S (dbl) - V_or_S -= r.x # V-X₃ (add) or S-X₃ (dbl) - r.y.prod(R_or_M, V_or_S) # Y₃ = R(V-X₃) (add) or M(S-X₃) (dbl) + V_or_S -= o.x # V-X₃ (add) or S-X₃ (dbl) + o.y.prod(R_or_M, V_or_S) # Y₃ = R(V-X₃) (add) or M(S-X₃) (dbl) HHH_or_Mpre.ccopy(HH_or_YY, isDbl) # HHH (add) or YY (dbl) S1.ccopy(HH_or_YY, isDbl) # S1 (add) or YY (dbl) HHH_or_Mpre *= S1 # HHH*S1 (add) or YY² (dbl) - r.y -= HHH_or_Mpre # Y₃ = R(V-X₃)-S₁*HHH (add) or M(S-X₃)-YY² (dbl) + o.y -= HHH_or_Mpre # Y₃ = R(V-X₃)-S₁*HHH (add) or M(S-X₃)-YY² (dbl) t = Q.z t.ccopy(H_or_Y, isDbl) # Z₂ (add) or Y₁ (dbl) - t *= P.z # Z₁Z₂ (add) or Y₁Z₁ (dbl) - r.z.prod(t, H_or_Y) # Z₁Z₂H (add) or garbage (dbl) - r.z.ccopy(t, isDbl) # Z₁Z₂H (add) or Y₁Z₁ (dbl) + t.prod(t, P.z, true) # Z₁Z₂ (add) or Y₁Z₁ (dbl) + o.z.prod(t, H_or_Y) # Z₁Z₂H (add) or garbage (dbl) + o.z.ccopy(t, isDbl) # Z₁Z₂H (add) or Y₁Z₁ (dbl) # if P or R were infinity points they would have spread 0 with Z₁Z₂ block: # Infinity points - r.ccopy(Q, P.isInf()) - r.ccopy(P, Q.isInf()) + o.ccopy(Q, P.isInf()) + o.ccopy(P, Q.isInf()) + + r = o func sum*[F; G: static Subgroup]( r: var ECP_ShortW_Jac[F, G], @@ -379,81 +384,176 @@ func madd*[F; G: static Subgroup]( P: ECP_ShortW_Jac[F, G], Q: ECP_ShortW_Aff[F, G] ) = - ## Elliptic curve mixed addition for Short Weierstrass curves - ## with p in Jacobian coordinates and Q in affine coordinates + ## Elliptic curve mixed addition for Short Weierstrass curves in Jacobian coordinates + ## with the curve ``a`` being a parameter for summing on isogenous curves. ## ## R = P + Q - ## - ## TODO: ⚠️ cannot handle P == Q - # "madd-2007-bl" mixed addition formula - https://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#addition-madd-2007-bl - # with conditional copies to handle infinity points - # Assumptions: Z2=1. - # Cost: 7M + 4S + 9add + 3*2 + 1*4. - # Source: 2007 Bernstein–Lange. - # Explicit formulas: + ## + ## Short Weierstrass curves have the following equation in Jacobian coordinates + ## Y² = X³ + aXZ⁴ + bZ⁶ + ## from the affine equation + ## y² = x³ + a x + b + ## + ## ``r`` is initialized/overwritten with the sum + ## ``CoefA`` allows fast path for curve with a == 0 or a == -3 + ## and also allows summing on curve isogenies. + ## + ## Implementation is constant-time, in particular it will not expose + ## that P == Q or P == -Q or P or Q are the infinity points + ## to simple side-channel attacks (SCA) + ## This is done by using a "complete" or "exception-free" addition law. # - # Z1Z1 = Z1² - # U2 = X2*Z1Z1 - # S2 = Y2*Z1*Z1Z1 - # H = U2-X1 - # HH = H2 - # I = 4*HH - # J = H*I - # r = 2*(S2-Y1) - # V = X1*I - # X3 = r²-J-2*V - # Y3 = r*(V-X3)-2*Y1*J - # Z3 = (Z1+H)²-Z1Z1-HH - var Z1Z1 {.noInit.}, H {.noInit.}, HH {.noInit.}, I{.noInit.}, J {.noInit.}: F + # Implementation, see write-up at the bottom. + # We fuse addition and doubling with condition copy by swapping + # terms with the following table + # + # | Addition, Cohen et al, 1998 | Doubling, Cohen et al, 1998 | Doubling = -3 | Doubling a = 0 | + # | 12M + 4S + 6add + 1*2 | 3M + 6S + 1*a + 4add + 1*2 + 1*3 + 1half | | | + # | ----------------------------- | -----------------------------------------| ----------------- | -------------- | + # | Z₁Z₁ = Z₁² | Z₁Z₁ = Z₁² | | | + # | Z₂Z₂ = Z₂² | | | | + # | | | | | + # | U₁ = X₁*Z₂Z₂ | | | | + # | U₂ = X₂*Z₁Z₁ | | | | + # | S₁ = Y₁*Z₂*Z₂Z₂ | | | | + # | S₂ = Y₂*Z₁*Z₁Z₁ | | | | + # | H = U₂-U₁ # P=-Q, P=Inf, P=Q | | | | + # | R = S₂-S₁ # Q=Inf | | | | + # | | | | | + # | HH = H² | YY = Y₁² | | | + # | V = U₁*HH | S = X₁*YY | | | + # | HHH = H*HH | M = (3*X₁²+a*ZZ²)/2 | 3(X₁-ZZ)(X₁+ZZ)/2 | 3X₁²/2 | + # | | | | | + # | X₃ = R²-HHH-2*V | X₃ = M²-2*S | | | + # | Y₃ = R*(V-X₃)-S₁*HHH | Y₃ = M*(S-X₃)-YY*YY | | | + # | Z₃ = Z₁*Z₂*H | Z₃ = Y₁*Z₁ | | | + # + # For mixed adddition we just set Z₂ = 1 + var Z1Z1 {.noInit.}, U1 {.noInit.}, S1 {.noInit.}, H {.noInit.}, R {.noinit.}: F - # Preload P and Q in cache - let pIsInf = P.isInf() - let qIsInf = Q.isInf() + block: # Addition-only, check for exceptional cases + var U2 {.noInit.}, S2 {.noInit.}: F + U1 = P.x + S1 = P.y - Z1Z1.square(P.z) # Z₁Z₁ = Z₁² - r.z.prod(P.z, Z1Z1) # P.Z is hot in cache, keep it in same register. - r.z *= Q.y # S₂ = Y₂Z₁Z₁Z₁ -- r.z used as S₂ + Z1Z1.square(P.z, skipFinalSub = true) + S2.prod(P.z, Z1Z1, skipFinalSub = true) + S2 *= Q.y # S₂ = Y₂*Z₁³ + U2.prod(Q.x, Z1Z1) # U₂ = X₂*Z₁² - H.prod(Q.x, Z1Z1) # U₂ = X₂Z₁Z₁ - H -= P.x # H = U₂ - X₁ + H.diff(U2, U1) # H = U₂-U₁ + R.diff(S2, S1) # R = S₂-S₁ - HH.square(H) # HH = H² + # Exceptional cases + # Expressing H as affine, if H == 0, P == Q or -Q + # H = U₂-U₁ = X₂*Z₁² - X₁*Z₂² = x₂*Z₂²*Z₁² - x₁*Z₁²*Z₂² + # if H == 0 && R == 0, P = Q -> doubling + # if only H == 0, P = -Q -> infinity, implied in Z₃ = Z₁*Z₂*H = 0 + # if only R == 0, P and Q are related by the cubic root endomorphism + let isDbl = H.isZero() and R.isZero() - I.double(HH) - I.double() # I = 4HH + # Rename buffers under the form (add_or_dbl) + template R_or_M: untyped = R + template H_or_Y: untyped = H + template V_or_S: untyped = U1 + var HH_or_YY {.noInit.}: F + var HHH_or_Mpre {.noInit.}: F - J.prod(H, I) # J = H*I - r.y.prod(P.x, I) # V = X₁*I -- r.y used as V + H_or_Y.ccopy(P.y, isDbl) # H (add) or Y₁ (dbl) + HH_or_YY.square(H_or_Y) # H² (add) or Y₁² (dbl) - r.z -= P.y # - r.z.double() # r = 2*(S₂-Y₁) -- r.z used as r + V_or_S.ccopy(P.x, isDbl) # U₁ (add) or X₁ (dbl) + V_or_S *= HH_or_YY # V = U₁*HH (add) or S = X₁*YY (dbl) - r.x.square(r.z) # r² - r.x -= J - r.x -= r.y - r.x -= r.y # X₃ = r²-J-2*V -- r.x computed + block: # Compute M for doubling + # "when" static evaluation doesn't shortcut booleans :/ + # which causes issues when CoefA isn't an int but Fp or Fp2 + const CoefA = F.C.getCoefA() + when CoefA is int: + const CoefA_eq_zero = CoefA == 0 + const CoefA_eq_minus3 {.used.} = CoefA == -3 + else: + const CoefA_eq_zero = false + const CoefA_eq_minus3 = false - r.y -= r.x # V-X₃ - r.y *= r.z # r*(V-X₃) + when CoefA_eq_zero: + var a {.noInit.} = H + var b {.noInit.} = HH_or_YY + a.ccopy(P.x, isDbl) # H or X₁ + b.ccopy(P.x, isDbl) # HH or X₁ + HHH_or_Mpre.prod(a, b) # HHH or X₁² - J *= P.y # Y₁J -- J reused as Y₁J - r.y -= J - r.y -= J # Y₃ = r*(V-X₃) - 2*Y₁J -- r.y computed + var M{.noInit.} = HHH_or_Mpre # Assuming on doubling path + M.div2() # X₁²/2 + M += HHH_or_Mpre # 3X₁²/2 + R_or_M.ccopy(M, isDbl) - r.z.sum(P.z, H) # Z₁ + H - r.z.square() - r.z -= Z1Z1 - r.z -= HH # Z₃ = (Z1+H)²-Z1Z1-HH + elif CoefA_eq_minus3: + var a{.noInit.}, b{.noInit.}: F + a.sum(P.x, Z1Z1) + b.diff(P.z, Z1Z1) + a.ccopy(H_or_Y, not isDbl) # H or X₁+ZZ + b.ccopy(HH_or_YY, not isDbl) # HH or X₁-ZZ + HHH_or_Mpre.prod(a, b) # HHH or X₁²-ZZ² - # Now handle points at infinity - proc one(): F = - result.setOne() + var M{.noInit.} = HHH_or_Mpre # Assuming on doubling path + M.div2() # (X₁²-ZZ²)/2 + M += HHH_or_Mpre # 3(X₁²-ZZ²)/2 + R_or_M.ccopy(M, isDbl) - r.x.ccopy(Q.x, pIsInf) - r.y.ccopy(Q.y, pIsInf) - r.z.ccopy(static(one()), pIsInf) + else: + # TODO: Costly `a` coefficients can be computed + # by merging their computation with Z₃ = Z₁*Z₂*H (add) or Z₃ = Y₁*Z₁ (dbl) + var a{.noInit.} = H + var b{.noInit.} = HH_or_YY + a.ccopy(P.x, isDbl) + b.ccopy(P.x, isDbl) + HHH_or_Mpre.prod(a, b, true) # HHH or X₁² - r.ccopy(P, qIsInf) + # Assuming doubling path + a.square(HHH_or_Mpre, skipFinalSub = true) + a *= HHH_or_Mpre # a = 3X₁² + b.square(Z1Z1) + # b.mulCheckSparse(CoefA) # TODO: broken static compile-time type inference + b *= CoefA # b = αZZ, with α the "a" coefficient of the curve + + a += b + a.div2() + R_or_M.ccopy(a, isDbl) # (3X₁² - αZZ)/2 + + # Let's count our horses, at this point: + # - R_or_M is set with R (add) or M (dbl) + # - HHH_or_Mpre contains HHH (add) or garbage precomputation (dbl) + # - V_or_S is set with V = U₁*HH (add) or S = X₁*YY (dbl) + var o {.noInit.}: typeof(r) + block: # Finishing line + var t {.noInit.}: F + t.double(V_or_S) + o.x.square(R_or_M) + o.x -= t # X₃ = R²-2*V (add) or M²-2*S (dbl) + o.x.csub(HHH_or_Mpre, not isDbl) # X₃ = R²-HHH-2*V (add) or M²-2*S (dbl) + + V_or_S -= o.x # V-X₃ (add) or S-X₃ (dbl) + o.y.prod(R_or_M, V_or_S) # Y₃ = R(V-X₃) (add) or M(S-X₃) (dbl) + HHH_or_Mpre.ccopy(HH_or_YY, isDbl) # HHH (add) or YY (dbl) + S1.ccopy(HH_or_YY, isDbl) # S1 (add) or YY (dbl) + HHH_or_Mpre *= S1 # HHH*S1 (add) or YY² (dbl) + o.y -= HHH_or_Mpre # Y₃ = R(V-X₃)-S₁*HHH (add) or M(S-X₃)-YY² (dbl) + + t.setOne() + t.ccopy(H_or_Y, isDbl) # Z₂ (add) or Y₁ (dbl) + t.prod(t, P.z, true) # Z₁Z₂ (add) or Y₁Z₁ (dbl) + o.z.prod(t, H_or_Y) # Z₁Z₂H (add) or garbage (dbl) + o.z.ccopy(t, isDbl) # Z₁Z₂H (add) or Y₁Z₁ (dbl) + + block: # Infinity points + o.x.ccopy(Q.x, P.isInf()) + o.y.ccopy(Q.y, P.isInf()) + o.z.csetOne(P.isInf()) + + o.ccopy(P, Q.isInf()) + + r = o func double*[F; G: static Subgroup]( r: var ECP_ShortW_Jac[F, G], @@ -524,11 +624,7 @@ func `+=`*(P: var ECP_ShortW_Jac, Q: ECP_ShortW_Jac) {.inline.} = func `+=`*(P: var ECP_ShortW_Jac, Q: ECP_ShortW_Aff) {.inline.} = ## In-place mixed point addition - ## - ## TODO: ⚠️ cannot handle P == Q - var t{.noInit.}: typeof(P) - t.madd(P, Q) - P = t + P.madd(P, Q) func double*(P: var ECP_ShortW_Jac) {.inline.} = ## In-place point doubling @@ -547,10 +643,10 @@ func affine*[F; G]( jac: ECP_ShortW_Jac[F, G]) = var invZ {.noInit.}, invZ2{.noInit.}: F invZ.inv(jac.z) - invZ2.square(invZ) + invZ2.square(invZ, skipFinalSub = true) aff.x.prod(jac.x, invZ2) - invZ *= invZ2 + invZ.prod(invZ, invZ2, skipFinalSub = true) aff.y.prod(jac.y, invZ) func fromAffine*[F; G]( @@ -586,38 +682,38 @@ func batchAffine*[N: static int, F, G]( zeroes[i] = z.isZero() z.csetOne(zeroes[i]) - affs[i].x.prod(affs[i-1].x, z) + if i != N-1: + affs[i].x.prod(affs[i-1].x, z, skipFinalSub = true) + else: + affs[i].x.prod(affs[i-1].x, z, skipFinalSub = false) var accInv {.noInit.}: F accInv.inv(affs[N-1].x) for i in countdown(N-1, 1): - # Skip zero z-coordinates (infinity points) - var z = affs[i].x - # Extract 1/Pᵢ var invi {.noInit.}: F - invi.prod(accInv, affs[i-1].x) + invi.prod(accInv, affs[i-1].x, skipFinalSub = true) invi.csetZero(zeroes[i]) # Now convert Pᵢ to affine var invi2 {.noinit.}: F - invi2.square(invi) + invi2.square(invi, skipFinalSub = true) affs[i].x.prod(jacs[i].x, invi2) - invi *= invi2 + invi.prod(invi, invi2, skipFinalSub = true) affs[i].y.prod(jacs[i].y, invi) # next iteration invi = jacs[i].z invi.csetOne(zeroes[i]) - accInv *= invi + accInv.prod(accInv, invi, skipFinalSub = true) block: # tail var invi2 {.noinit.}: F accInv.csetZero(zeroes[0]) - invi2.square(accInv) + invi2.square(accInv, skipFinalSub = true) affs[0].x.prod(jacs[0].x, invi2) - accInv *= invi2 + accInv.prod(accInv, invi2, skipFinalSub = true) affs[0].y.prod(jacs[0].y, accInv) # ############################################################ diff --git a/constantine/math/elliptic/ec_shortweierstrass_projective.nim b/constantine/math/elliptic/ec_shortweierstrass_projective.nim index 7a20006..36e826d 100644 --- a/constantine/math/elliptic/ec_shortweierstrass_projective.nim +++ b/constantine/math/elliptic/ec_shortweierstrass_projective.nim @@ -15,6 +15,9 @@ import export Subgroup +# No exceptions allowed +{.push raises: [].} + # ############################################################ # # Elliptic Curve in Short Weierstrass form @@ -261,7 +264,6 @@ func madd*[F; G: static Subgroup]( ## ``r`` may alias P # TODO: static doAssert odd order - when F.C.getCoefA() == 0: var t0 {.noInit.}, t1 {.noInit.}, t2 {.noInit.}, t3 {.noInit.}, t4 {.noInit.}: F var x3 {.noInit.}, y3 {.noInit.}, z3 {.noInit.}: F @@ -273,6 +275,10 @@ func madd*[F; G: static Subgroup]( # Y₃ = (Y₁Y₂ + 3bZ₁)(Y₁Y₂ − 3bZ₁) # + 9bX₁X₂ (X₁ + X₂Z₁) # Z₃= (Y₁ + Y₂Z₁)(Y₁Y₂ + 3bZ₁) + 3 X₁X₂ (X₁Y₂ + X₂Y₁) + # + # Note¹⁰ mentions that due to Qz = 1, cannot be + # the point at infinity. + # We solve that by conditional copies. t0.prod(P.x, Q.x) # 1. t₀ <- X₁ X₂ t1.prod(P.y, Q.y) # 2. t₁ <- Y₁ Y₂ t3.sum(P.x, P.y) # 3. t₃ <- X₁ + Y₁ ! error in paper @@ -304,13 +310,24 @@ func madd*[F; G: static Subgroup]( y3 *= SexticNonResidue x3.prod(t4, y3) # 18. X₃ <- t₄ Y₃, X₃ = (Y₁ + Y₂Z₁) 3b(X₁ + X₂Z₁) t2.prod(t3, t1) # 19. t₂ <- t₃ t₁, t₂ = (X₁Y₂ + X₂Y₁)(Y₁Y₂ - 3bZ₁) - r.x.diff(t2, x3) # 20. X₃ <- t₂ - X₃, X₃ = (X₁Y₂ + X₂Y₁)(Y₁Y₂ - 3bZ₁) - 3b(Y₁ + Y₂Z₁)(X₁ + X₂Z₁) + x3.diff(t2, x3) # 20. X₃ <- t₂ - X₃, X₃ = (X₁Y₂ + X₂Y₁)(Y₁Y₂ - 3bZ₁) - 3b(Y₁ + Y₂Z₁)(X₁ + X₂Z₁) y3 *= t0 # 21. Y₃ <- Y₃ t₀, Y₃ = 9bX₁X₂ (X₁ + X₂Z₁) t1 *= z3 # 22. t₁ <- t₁ Z₃, t₁ = (Y₁Y₂ - 3bZ₁)(Y₁Y₂ + 3bZ₁) - r.y.sum(y3, t1) # 23. Y₃ <- t₁ + Y₃, Y₃ = (Y₁Y₂ + 3bZ₁)(Y₁Y₂ - 3bZ₁) + 9bX₁X₂ (X₁ + X₂Z₁) + y3 += t1 # 23. Y₃ <- t₁ + Y₃, Y₃ = (Y₁Y₂ + 3bZ₁)(Y₁Y₂ - 3bZ₁) + 9bX₁X₂ (X₁ + X₂Z₁) t0 *= t3 # 31. t₀ <- t₀ t₃, t₀ = 3X₁X₂ (X₁Y₂ + X₂Y₁) z3 *= t4 # 32. Z₃ <- Z₃ t₄, Z₃ = (Y₁Y₂ + 3bZ₁)(Y₁ + Y₂Z₁) - r.z.sum(z3, t0) # 33. Z₃ <- Z₃ + t₀, Z₃ = (Y₁ + Y₂Z₁)(Y₁Y₂ + 3bZ₁) + 3X₁X₂ (X₁Y₂ + X₂Y₁) + z3 += t0 # 33. Z₃ <- Z₃ + t₀, Z₃ = (Y₁ + Y₂Z₁)(Y₁Y₂ + 3bZ₁) + 3X₁X₂ (X₁Y₂ + X₂Y₁) + + # Deal with infinity point. r and P might alias. + let inf = Q.isInf() + x3.ccopy(P.x, inf) + y3.ccopy(P.y, inf) + z3.ccopy(P.z, inf) + + r.x = x3 + r.y = y3 + r.z = z3 + else: {.error: "Not implemented.".} @@ -454,18 +471,18 @@ func batchAffine*[N: static int, F, G]( zeroes[i] = z.isZero() z.csetOne(zeroes[i]) - affs[i].x.prod(affs[i-1].x, z) + if i != N-1: + affs[i].x.prod(affs[i-1].x, z, skipFinalSub = true) + else: + affs[i].x.prod(affs[i-1].x, z, skipFinalSub = false) var accInv {.noInit.}: F accInv.inv(affs[N-1].x) for i in countdown(N-1, 1): - # Skip zero z-coordinates (infinity points) - var z = affs[i].x - # Extract 1/Pᵢ var invi {.noInit.}: F - invi.prod(accInv, affs[i-1].x) + invi.prod(accInv, affs[i-1].x, skipFinalSub = true) invi.csetZero(zeroes[i]) # Now convert Pᵢ to affine @@ -475,7 +492,7 @@ func batchAffine*[N: static int, F, G]( # next iteration invi = projs[i].z invi.csetOne(zeroes[i]) - accInv *= invi + accInv.prod(accInv, invi, skipFinalSub = true) block: # tail accInv.csetZero(zeroes[0]) diff --git a/constantine/math/elliptic/ec_twistededwards_projective.nim b/constantine/math/elliptic/ec_twistededwards_projective.nim index 1398948..9e5cbaf 100644 --- a/constantine/math/elliptic/ec_twistededwards_projective.nim +++ b/constantine/math/elliptic/ec_twistededwards_projective.nim @@ -276,6 +276,10 @@ func double*[Field]( E -= D # C stores E-D r.y *= E +func `+=`*(P: var ECP_TwEdwards_Prj, Q: ECP_TwEdwards_Prj) {.inline.} = + ## In-place point addition + P.sum(P, Q) + func double*(P: var ECP_TwEdwards_Prj) {.inline.} = ## In-place EC doubling P.double(P) diff --git a/constantine/math/extension_fields/towers.nim b/constantine/math/extension_fields/towers.nim index 5e13c20..bc9f51e 100644 --- a/constantine/math/extension_fields/towers.nim +++ b/constantine/math/extension_fields/towers.nim @@ -2156,5 +2156,29 @@ func inv*(a: var CubicExt) = ## to affine for elliptic curve a.invImpl(a) +# Convenience functions +# ---------------------------------------------------------------------- + +template square*(a: var ExtensionField, skipFinalSub: static bool) = + # Square alias, + # this allows using the same code for + # the base field and its extensions while benefitting from skipping + # the final substraction on Fp + a.square() + +template square*(r: var ExtensionField, a: ExtensionField, skipFinalSub: static bool) = + # Square alias, + # this allows using the same code for + # the base field and its extensions while benefitting from skipping + # the final substraction on Fp + r.square(a) + +template prod*(r: var ExtensionField, a, b: ExtensionField, skipFinalSub: static bool) = + # Prod alias, + # this allows using the same code for + # the base field and its extensions while benefitting from skipping + # the final substraction on Fp + r.prod(a, b) + {.pop.} # inline {.pop.} # raises no exceptions diff --git a/constantine/platforms/allocs.nim b/constantine/platforms/allocs.nim new file mode 100644 index 0000000..16e4675 --- /dev/null +++ b/constantine/platforms/allocs.nim @@ -0,0 +1,32 @@ +# Constantine +# Copyright (c) 2018-2019 Status Research & Development GmbH +# Copyright (c) 2020-Present Mamy André-Ratsimbazafy +# Licensed and distributed under either of +# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT). +# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0). +# at your option. This file may not be copied, modified, or distributed except according to those terms. + +# ############################################################ +# +# Allocators +# +# ############################################################ + +# Due to the following constraints: +# - No dynamic allocation in single-threaded codepaths (for compatibility with embedded devices like TPM or secure hardware) +# - Avoiding cryptographic material in third-party libraries (like a memory allocator) +# - Giving full control of the library user on allocation strategy +# - Performance, especially for long-running processes (fragmentation, multithreaded allocation...) +# +# stack allocation is strongly preferred where necessary. + +when defined(windows): + proc alloca(size: int): pointer {.header: "".} +else: + proc alloca(size: int): pointer {.header: "".} + +template alloca*(T: typedesc): ptr T = + cast[ptr T](alloca(sizeof(T))) + +template alloca*(T: typedesc, len: Natural): ptr UncheckedArray[T] = + cast[ptr UncheckedArray[T]](alloca(sizeof(T) * len)) \ No newline at end of file diff --git a/helpers/pararun.nim b/helpers/pararun.nim index bacd14c..663fd71 100644 --- a/helpers/pararun.nim +++ b/helpers/pararun.nim @@ -13,7 +13,7 @@ import # Pararun is a parallel shell command runner # ------------------------------------------ -# Usage: pararun # AsyncSemaphore # ---------------------------------------------------------------- diff --git a/tests/math/t_ec_shortw_jac_g1_batch_add.nim b/tests/math/t_ec_shortw_jac_g1_batch_add.nim new file mode 100644 index 0000000..3c00158 --- /dev/null +++ b/tests/math/t_ec_shortw_jac_g1_batch_add.nim @@ -0,0 +1,30 @@ +# Constantine +# Copyright (c) 2018-2019 Status Research & Development GmbH +# Copyright (c) 2020-Present Mamy André-Ratsimbazafy +# Licensed and distributed under either of +# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT). +# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0). +# at your option. This file may not be copied, modified, or distributed except according to those terms. + +import + # Internals + ../../constantine/math/config/curves, + ../../constantine/math/elliptic/ec_shortweierstrass_jacobian, + ../../constantine/math/arithmetic, + # Test utilities + ./t_ec_template + +const + numPoints = [1, 2, 8, 16, 128, 1024, 2048, 16384, 32768] # 262144, 1048576] + +run_EC_batch_add_impl( + ec = ECP_ShortW_Jac[Fp[BN254_Snarks], G1], + numPoints = numPoints, + moduleName = "test_ec_shortweierstrass_jacobian_batch_add_" & $BN254_Snarks + ) + +run_EC_batch_add_impl( + ec = ECP_ShortW_Jac[Fp[BLS12_381], G1], + numPoints = numPoints, + moduleName = "test_ec_shortweierstrass_jacobian_batch_add_" & $BLS12_381 + ) diff --git a/tests/math/t_ec_shortw_prj_g1_batch_add.nim b/tests/math/t_ec_shortw_prj_g1_batch_add.nim new file mode 100644 index 0000000..44d3ba9 --- /dev/null +++ b/tests/math/t_ec_shortw_prj_g1_batch_add.nim @@ -0,0 +1,30 @@ +# Constantine +# Copyright (c) 2018-2019 Status Research & Development GmbH +# Copyright (c) 2020-Present Mamy André-Ratsimbazafy +# Licensed and distributed under either of +# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT). +# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0). +# at your option. This file may not be copied, modified, or distributed except according to those terms. + +import + # Internals + ../../constantine/math/config/curves, + ../../constantine/math/elliptic/ec_shortweierstrass_projective, + ../../constantine/math/arithmetic, + # Test utilities + ./t_ec_template + +const + numPoints = [1, 2, 8, 16, 128, 1024, 2048, 16384, 32768] # 262144, 1048576] + +run_EC_batch_add_impl( + ec = ECP_ShortW_Prj[Fp[BN254_Snarks], G1], + numPoints = numPoints, + moduleName = "test_ec_shortweierstrass_projective_batch_add_" & $BN254_Snarks + ) + +run_EC_batch_add_impl( + ec = ECP_ShortW_Prj[Fp[BLS12_381], G1], + numPoints = numPoints, + moduleName = "test_ec_shortweierstrass_projective_batch_add_" & $BLS12_381 + ) diff --git a/tests/math/t_ec_template.nim b/tests/math/t_ec_template.nim index fcd3479..c53870b 100644 --- a/tests/math/t_ec_template.nim +++ b/tests/math/t_ec_template.nim @@ -23,6 +23,7 @@ import ec_shortweierstrass_affine, ec_shortweierstrass_jacobian, ec_shortweierstrass_projective, + ec_shortweierstrass_batch_ops, ec_twistededwards_affine, ec_twistededwards_projective, ec_scalar_mul], @@ -41,7 +42,7 @@ type Long01Sequence func random_point*(rng: var RngState, EC: typedesc, randZ: bool, gen: RandomGen): EC {.noInit.} = - if not randZ: + when EC is ECP_ShortW_Aff: if gen == Uniform: result = rng.random_unsafe(EC) elif gen == HighHammingWeight: @@ -49,12 +50,20 @@ func random_point*(rng: var RngState, EC: typedesc, randZ: bool, gen: RandomGen) else: result = rng.random_long01Seq(EC) else: - if gen == Uniform: - result = rng.random_unsafe_with_randZ(EC) - elif gen == HighHammingWeight: - result = rng.random_highHammingWeight_with_randZ(EC) + if not randZ: + if gen == Uniform: + result = rng.random_unsafe(EC) + elif gen == HighHammingWeight: + result = rng.random_highHammingWeight(EC) + else: + result = rng.random_long01Seq(EC) else: - result = rng.random_long01Seq_with_randZ(EC) + if gen == Uniform: + result = rng.random_unsafe_with_randZ(EC) + elif gen == HighHammingWeight: + result = rng.random_highHammingWeight_with_randZ(EC) + else: + result = rng.random_long01Seq_with_randZ(EC) template pairingGroup(EC: typedesc): string = when EC is (ECP_ShortW_Aff or ECP_ShortW_Prj or ECP_ShortW_Jac): @@ -99,6 +108,15 @@ proc run_EC_addition_tests*( r.sum(inf, P) check: bool(r == P) + # Aliasing tests + r = P + r += inf + check: bool(r == P) + + r.setInf() + r += P + check: bool(r == P) + test(ec, randZ = false, gen = Uniform) test(ec, randZ = true, gen = Uniform) test(ec, randZ = false, gen = HighHammingWeight) @@ -438,7 +456,7 @@ proc run_EC_mixed_add_impl*( suite testSuiteDesc & " - " & $ec & " - [" & $WordBitwidth & "-bit mode]": test "EC " & G1_or_G2 & " mixed addition is consistent with general addition": - proc test(EC: typedesc, bits: static int, randZ: bool, gen: RandomGen) = + proc test(EC: typedesc, randZ: bool, gen: RandomGen) = for _ in 0 ..< Iters: let a = rng.random_point(EC, randZ, gen) let b = rng.random_point(EC, randZ, gen) @@ -452,12 +470,83 @@ proc run_EC_mixed_add_impl*( check: bool(r_generic == r_mixed) - test(ec, bits = ec.F.C.getCurveOrderBitwidth(), randZ = false, gen = Uniform) - test(ec, bits = ec.F.C.getCurveOrderBitwidth(), randZ = true, gen = Uniform) - test(ec, bits = ec.F.C.getCurveOrderBitwidth(), randZ = false, gen = HighHammingWeight) - test(ec, bits = ec.F.C.getCurveOrderBitwidth(), randZ = true, gen = HighHammingWeight) - test(ec, bits = ec.F.C.getCurveOrderBitwidth(), randZ = false, gen = Long01Sequence) - test(ec, bits = ec.F.C.getCurveOrderBitwidth(), randZ = true, gen = Long01Sequence) + test(ec, randZ = false, gen = Uniform) + test(ec, randZ = true, gen = Uniform) + test(ec, randZ = false, gen = HighHammingWeight) + test(ec, randZ = true, gen = HighHammingWeight) + test(ec, randZ = false, gen = Long01Sequence) + test(ec, randZ = true, gen = Long01Sequence) + + test "EC " & G1_or_G2 & " mixed addition - doubling": + proc test(EC: typedesc, randZ: bool, gen: RandomGen) = + for _ in 0 ..< Iters: + let a = rng.random_point(EC, randZ, gen) + var aAff: ECP_ShortW_Aff[EC.F, EC.G] + aAff.affine(a) + + var r_generic, r_mixed: EC + + r_generic.double(a) + r_mixed.madd(a, aAff) + check: bool(r_generic == r_mixed) + + # Aliasing test + r_mixed = a + r_mixed += aAff + check: bool(r_generic == r_mixed) + + test(ec, randZ = false, gen = Uniform) + test(ec, randZ = true, gen = Uniform) + test(ec, randZ = false, gen = HighHammingWeight) + test(ec, randZ = true, gen = HighHammingWeight) + test(ec, randZ = false, gen = Long01Sequence) + test(ec, randZ = true, gen = Long01Sequence) + + test "EC " & G1_or_G2 & " mixed addition - adding infinity LHS": + proc test(EC: typedesc, randZ: bool, gen: RandomGen) = + for _ in 0 ..< Iters: + var a{.noInit.}: EC + a.setInf() + let bAff = rng.random_point(ECP_ShortW_Aff[EC.F, EC.G], randZ = false, gen) + + var r_mixed{.noInit.}: EC + r_mixed.madd(a, bAff) + + var r{.noInit.}: ECP_ShortW_Aff[EC.F, EC.G] + r.affine(r_mixed) + + a += bAff + + check: + bool(r == bAff) + bool(a == r_mixed) + + test(ec, randZ = false, gen = Uniform) + test(ec, randZ = false, gen = HighHammingWeight) + test(ec, randZ = false, gen = Long01Sequence) + + test "EC " & G1_or_G2 & " mixed addition - adding infinity RHS": + proc test(EC: typedesc, randZ: bool, gen: RandomGen) = + for _ in 0 ..< Iters: + let a = rng.random_point(EC, randZ, gen) + var bAff{.noInit.}: ECP_ShortW_Aff[EC.F, EC.G] + bAff.setInf() + + var r{.noInit.}: EC + r.madd(a, bAff) + + check: bool(r == a) + + r = a + r += bAff + check: bool(r == a) + + test(ec, randZ = false, gen = Uniform) + test(ec, randZ = true, gen = Uniform) + test(ec, randZ = false, gen = HighHammingWeight) + test(ec, randZ = true, gen = HighHammingWeight) + test(ec, randZ = false, gen = Long01Sequence) + test(ec, randZ = true, gen = Long01Sequence) proc run_EC_subgroups_cofactors_impl*( ec: typedesc, @@ -480,7 +569,7 @@ proc run_EC_subgroups_cofactors_impl*( suite testSuiteDesc & " - " & $ec & " - [" & $WordBitwidth & "-bit mode]": test "Effective cofactor matches accelerated cofactor clearing" & " - " & $ec & " - [" & $WordBitwidth & "-bit mode]": - proc test(EC: typedesc, bits: static int, randZ: bool, gen: RandomGen) = + proc test(EC: typedesc, randZ: bool, gen: RandomGen) = for _ in 0 ..< ItersMul: let P = rng.random_point(EC, randZ, gen) var cPeff = P @@ -491,17 +580,17 @@ proc run_EC_subgroups_cofactors_impl*( check: bool(cPeff == cPfast) - test(ec, bits = ec.F.C.getCurveOrderBitwidth(), randZ = false, gen = Uniform) - test(ec, bits = ec.F.C.getCurveOrderBitwidth(), randZ = true, gen = Uniform) - test(ec, bits = ec.F.C.getCurveOrderBitwidth(), randZ = false, gen = HighHammingWeight) - test(ec, bits = ec.F.C.getCurveOrderBitwidth(), randZ = true, gen = HighHammingWeight) - test(ec, bits = ec.F.C.getCurveOrderBitwidth(), randZ = false, gen = Long01Sequence) - test(ec, bits = ec.F.C.getCurveOrderBitwidth(), randZ = true, gen = Long01Sequence) + test(ec, randZ = false, gen = Uniform) + test(ec, randZ = true, gen = Uniform) + test(ec, randZ = false, gen = HighHammingWeight) + test(ec, randZ = true, gen = HighHammingWeight) + test(ec, randZ = false, gen = Long01Sequence) + test(ec, randZ = true, gen = Long01Sequence) test "Subgroup checks and cofactor clearing consistency": var inSubgroup = 0 var offSubgroup = 0 - proc test(EC: typedesc, bits: static int, randZ: bool, gen: RandomGen) = + proc test(EC: typedesc, randZ: bool, gen: RandomGen) = stdout.write " " for _ in 0 ..< ItersMul: let P = rng.random_point(EC, randZ, gen) @@ -526,12 +615,12 @@ proc run_EC_subgroups_cofactors_impl*( stdout.write '\n' - test(ec, bits = ec.F.C.getCurveOrderBitwidth(), randZ = false, gen = Uniform) - test(ec, bits = ec.F.C.getCurveOrderBitwidth(), randZ = true, gen = Uniform) - test(ec, bits = ec.F.C.getCurveOrderBitwidth(), randZ = false, gen = HighHammingWeight) - test(ec, bits = ec.F.C.getCurveOrderBitwidth(), randZ = true, gen = HighHammingWeight) - test(ec, bits = ec.F.C.getCurveOrderBitwidth(), randZ = false, gen = Long01Sequence) - test(ec, bits = ec.F.C.getCurveOrderBitwidth(), randZ = true, gen = Long01Sequence) + test(ec, randZ = false, gen = Uniform) + test(ec, randZ = true, gen = Uniform) + test(ec, randZ = false, gen = HighHammingWeight) + test(ec, randZ = true, gen = HighHammingWeight) + test(ec, randZ = false, gen = Long01Sequence) + test(ec, randZ = true, gen = Long01Sequence) echo " [SUCCESS] Test finished with ", inSubgroup, " points in ", G1_or_G2, " subgroup and ", offSubgroup, " points on curve but not in subgroup (before cofactor clearing)" @@ -699,4 +788,82 @@ proc run_EC_conversion_failures*( doAssert bool(Qs[i] == Rs[i]) test_bn254_snarks_g1(ECP_ShortW_Prj[Fp[BN254_Snarks], G1]) - test_bn254_snarks_g1(ECP_ShortW_Jac[Fp[BN254_Snarks], G1]) \ No newline at end of file + test_bn254_snarks_g1(ECP_ShortW_Jac[Fp[BN254_Snarks], G1]) + +proc run_EC_batch_add_impl*[N: static int]( + ec: typedesc, + numPoints: array[N, int], + moduleName: string + ) = + + # Random seed for reproducibility + var rng: RngState + let seed = uint32(getTime().toUnix() and (1'i64 shl 32 - 1)) # unixTime mod 2^32 + rng.seed(seed) + echo "\n------------------------------------------------------\n" + echo moduleName, " xoshiro512** seed: ", seed + + when ec.G == G1: + const G1_or_G2 = "G1" + else: + const G1_or_G2 = "G2" + + const testSuiteDesc = "Elliptic curve batch addition for Short Weierstrass form" + + suite testSuiteDesc & " - " & $ec & " - [" & $WordBitwidth & "-bit mode]": + for n in numPoints: + test $ec & " batch addition (N=" & $n & ")": + proc test(EC: typedesc, gen: RandomGen) = + var points = newSeq[ECP_ShortW_Aff[EC.F, EC.G]](n) + + for i in 0 ..< n: + points[i] = rng.random_point(ECP_ShortW_Aff[EC.F, EC.G], randZ = false, gen) + + var r_batch{.noinit.}, r_ref{.noInit.}: EC + + r_ref.setInf() + for i in 0 ..< n: + r_ref += points[i] + + r_batch.sum_batch_vartime(points) + + check: bool(r_batch == r_ref) + + + test(ec, gen = Uniform) + test(ec, gen = HighHammingWeight) + test(ec, gen = Long01Sequence) + + test "EC " & G1_or_G2 & " batch addition (N=" & $n & ") - special cases": + proc test(EC: typedesc, gen: RandomGen) = + var points = newSeq[ECP_ShortW_Aff[EC.F, EC.G]](n) + + let halfN = n div 2 + + for i in 0 ..< halfN: + points[i] = rng.random_point(ECP_ShortW_Aff[EC.F, EC.G], randZ = false, gen) + + for i in halfN ..< n: + # The special cases test relies on internal knowledge that we sum(points[i], points[i+n/2] + # It should be changed if scheduling change, for example if we sum(points[2*i], points[2*i+1]) + let c = rng.random_unsafe(3) + if c == 0: + points[i] = rng.random_point(ECP_ShortW_Aff[EC.F, EC.G], randZ = false, gen) + elif c == 1: + points[i] = points[i-halfN] + else: + points[i].neg(points[i-halfN]) + + var r_batch{.noinit.}, r_ref{.noInit.}: EC + + r_ref.setInf() + for i in 0 ..< n: + r_ref += points[i] + + r_batch.sum_batch_vartime(points) + + check: bool(r_batch == r_ref) + + test(ec, gen = Uniform) + test(ec, gen = HighHammingWeight) + test(ec, gen = Long01Sequence)