diff --git a/.travis.yml b/.travis.yml index 5612fe8..3b9ea56 100644 --- a/.travis.yml +++ b/.travis.yml @@ -13,6 +13,7 @@ matrix: # Build and test on both x86-64 and ARM64 # Ubuntu Bionic (18.04) is needed, it includes # GCC 7 codegen fixes to addcarry_u64. + # Clang 9 (and GCC-6) are needed for inline assembly "flag output constraints" - dist: bionic arch: amd64 env: @@ -33,9 +34,19 @@ matrix: - ARCH=amd64 - CHANNEL=devel compiler: clang + # addons: + # apt: + # sources: + # - ubuntu-toolchain-r-test + # - llvm-toolchain-bionic-9.0 # LLVM 9 repo is disallowed + # packages: + # - clang-9.0 + # env: + # - MATRIX_EVAL="CC=clang-9.0 && CXX=clang++-9.0" # On OSX we only test against clang (gcc is mapped to clang by default) - os: osx + osx_image: xcode11.5 # Need xcode 11.4.2 min for Clang 9 arch: amd64 env: - ARCH=amd64 @@ -98,9 +109,17 @@ before_script: script: - nimble refresh - nimble install gmp stew - - nimble test_parallel - - if [[ "$ARCH" != "arm64" ]]; then - nimble test_parallel_no_assembler; + # Installing Clang9.0 or later is a pain in Travis + # for inline assembly "flag output constraint" + # Also MacOS build is timing out with 2 series of tests. + - | + if [[ "$TRAVIS_COMPILER" == "clang" ]]; then + nimble test_parallel_no_assembler + else + nimble test_parallel + if [[ "$ARCH" != "arm64" ]]; then + nimble test_parallel_no_assembler + fi fi branches: except: diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 8817707..201f251 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -36,7 +36,7 @@ strategy: # TEST_LANG: c MacOS_devel_64bit: - VM: 'macOS-10.14' + VM: 'macOS-10.15' UCPU: amd64 CHANNEL: devel TEST_LANG: c diff --git a/benchmarks/bench_elliptic_template.nim b/benchmarks/bench_elliptic_template.nim index fb746ef..f66c281 100644 --- a/benchmarks/bench_elliptic_template.nim +++ b/benchmarks/bench_elliptic_template.nim @@ -61,7 +61,7 @@ echo "Optimization level => " echo " no optimization: ", not defined(release) echo " release: ", defined(release) echo " danger: ", defined(danger) -echo " inline assembly: ", UseX86ASM +echo " inline assembly: ", UseASM_X86_64 when (sizeof(int) == 4) or defined(Constantine32): echo "⚠️ Warning: using Constantine with 32-bit limbs" diff --git a/benchmarks/bench_fields_template.nim b/benchmarks/bench_fields_template.nim index e17bc4a..afbd5be 100644 --- a/benchmarks/bench_fields_template.nim +++ b/benchmarks/bench_fields_template.nim @@ -58,7 +58,7 @@ echo "Optimization level => " echo " no optimization: ", not defined(release) echo " release: ", defined(release) echo " danger: ", defined(danger) -echo " inline assembly: ", UseX86ASM +echo " inline assembly: ", UseASM_X86_64 when (sizeof(int) == 4) or defined(Constantine32): echo "⚠️ Warning: using Constantine with 32-bit limbs" diff --git a/benchmarks/bench_fp_double_width.nim b/benchmarks/bench_fp_double_width.nim new file mode 100644 index 0000000..bbb958a --- /dev/null +++ b/benchmarks/bench_fp_double_width.nim @@ -0,0 +1,196 @@ +# Constantine +# Copyright (c) 2018-2019 Status Research & Development GmbH +# Copyright (c) 2020-Present Mamy André-Ratsimbazafy +# Licensed and distributed under either of +# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT). +# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0). +# at your option. This file may not be copied, modified, or distributed except according to those terms. 
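
For context on the "flag output constraints" mentioned in the Travis comment above: GCC 6+ and Clang 9+ can bind an x86 status flag directly to an output operand of an extended-asm block, which is what the generated inline assembly in this patch relies on. A minimal sketch of the compiler feature follows - it is not part of the patch, assumes x86-64 with a GCC-compatible backend, and Constantine's real assembly is produced by its own assembler DSL, not hand-written like this:

when defined(amd64) and (defined(gcc) or defined(clang)):
  proc addCarryDemo(a, b: uint64): tuple[sum: uint64, carry: bool] =
    ## Adds `b` to `a` and exposes the x86 carry flag directly
    ## through the "=@ccc" flag output constraint (GCC 6+, Clang 9+).
    var s = a
    var c: bool
    {.emit: """
    __asm__("addq %[b], %[s]"
            : [s] "+r" (`s`), [c] "=@ccc" (`c`)
            : [b] "r" (`b`));
    """.}
    (s, c)

Because installing Clang 9 on Travis is impractical (see the script change above), the clang jobs there fall back to the no-assembler test suite instead of exercising these kernels.
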
+ +# ############################################################ +# +# Benchmark of finite fields +# +# ############################################################ + +import + # Internals + ../constantine/config/[curves, common], + ../constantine/arithmetic, + ../constantine/towers, + # Helpers + ../helpers/[prng_unsafe, static_for], + ./platforms, + # Standard library + std/[monotimes, times, strformat, strutils, macros] + +var rng: RngState +let seed = uint32(getTime().toUnix() and (1'i64 shl 32 - 1)) # unixTime mod 2^32 +rng.seed(seed) +echo "bench xoshiro512** seed: ", seed + +# warmup +proc warmup*() = + # Warmup - make sure cpu is on max perf + let start = cpuTime() + var foo = 123 + for i in 0 ..< 300_000_000: + foo += i*i mod 456 + foo = foo mod 789 + + # Compiler shouldn't optimize away the results as cpuTime rely on sideeffects + let stop = cpuTime() + echo &"Warmup: {stop - start:>4.4f} s, result {foo} (displayed to avoid compiler optimizing warmup away)\n" + +warmup() + +when defined(gcc): + echo "\nCompiled with GCC" +elif defined(clang): + echo "\nCompiled with Clang" +elif defined(vcc): + echo "\nCompiled with MSVC" +elif defined(icc): + echo "\nCompiled with ICC" +else: + echo "\nCompiled with an unknown compiler" + +echo "Optimization level => " +echo " no optimization: ", not defined(release) +echo " release: ", defined(release) +echo " danger: ", defined(danger) +echo " inline assembly: ", UseASM_X86_64 + +when (sizeof(int) == 4) or defined(Constantine32): + echo "⚠️ Warning: using Constantine with 32-bit limbs" +else: + echo "Using Constantine with 64-bit limbs" + +when SupportsCPUName: + echo "Running on ", cpuName(), "" + +when SupportsGetTicks: + echo "\n⚠️ Cycles measurements are approximate and use the CPU nominal clock: Turbo-Boost and overclocking will skew them." + echo "i.e. a 20% overclock will be about 20% off (assuming no dynamic frequency scaling)" + +echo "\n=================================================================================================================\n" + +proc separator*() = + echo "-".repeat(145) + +proc report(op, field: string, start, stop: MonoTime, startClk, stopClk: int64, iters: int) = + let ns = inNanoseconds((stop-start) div iters) + let throughput = 1e9 / float64(ns) + when SupportsGetTicks: + echo &"{op:<28} {field:<40} {throughput:>15.3f} ops/s {ns:>9} ns/op {(stopClk - startClk) div iters:>9} CPU cycles (approx)" + else: + echo &"{op:<28} {field:<40} {throughput:>15.3f} ops/s {ns:>9} ns/op" + +proc notes*() = + echo "Notes:" + echo " - Compilers:" + echo " Compilers are severely limited on multiprecision arithmetic." + echo " Inline Assembly is used by default (nimble bench_fp)." + echo " Bench without assembly can use \"nimble bench_fp_gcc\" or \"nimble bench_fp_clang\"." + echo " GCC is significantly slower than Clang on multiprecision arithmetic due to catastrophic handling of carries." + echo " - The simplest operations might be optimized away by the compiler." + echo " - Fast Squaring and Fast Multiplication are possible if there are spare bits in the prime representation (i.e. 
the prime uses 254 bits out of 256 bits)" + +template bench(op: string, desc: string, iters: int, body: untyped): untyped = + let start = getMonotime() + when SupportsGetTicks: + let startClk = getTicks() + for _ in 0 ..< iters: + body + when SupportsGetTicks: + let stopClk = getTicks() + let stop = getMonotime() + + when not SupportsGetTicks: + let startClk = -1'i64 + let stopClk = -1'i64 + + report(op, desc, start, stop, startClk, stopClk, iters) + +func random_unsafe(rng: var RngState, a: var FpDbl, Base: typedesc) = + ## Initialize a standalone Double-Width field element + ## we don't reduce it modulo p², this is only used for benchmark + let aHi = rng.random_unsafe(Base) + let aLo = rng.random_unsafe(Base) + for i in 0 ..< aLo.mres.limbs.len: + a.limbs2x[i] = aLo.mres.limbs[i] + for i in 0 ..< aHi.mres.limbs.len: + a.limbs2x[aLo.mres.limbs.len+i] = aHi.mres.limbs[i] + +proc sumNoReduce(T: typedesc, iters: int) = + var r: T + let a = rng.random_unsafe(T) + let b = rng.random_unsafe(T) + bench("Addition no reduce", $T, iters): + r.sumNoReduce(a, b) + +proc sum(T: typedesc, iters: int) = + var r: T + let a = rng.random_unsafe(T) + let b = rng.random_unsafe(T) + bench("Addition", $T, iters): + r.sum(a, b) + +proc diffNoReduce(T: typedesc, iters: int) = + var r: T + let a = rng.random_unsafe(T) + let b = rng.random_unsafe(T) + bench("Substraction no reduce", $T, iters): + r.diffNoReduce(a, b) + +proc diff(T: typedesc, iters: int) = + var r: T + let a = rng.random_unsafe(T) + let b = rng.random_unsafe(T) + bench("Substraction", $T, iters): + r.diff(a, b) + +proc diff2xNoReduce(T: typedesc, iters: int) = + var r, a, b: doubleWidth(T) + rng.random_unsafe(r, T) + rng.random_unsafe(a, T) + rng.random_unsafe(b, T) + bench("Substraction 2x no reduce", $doubleWidth(T), iters): + r.diffNoReduce(a, b) + +proc diff2x(T: typedesc, iters: int) = + var r, a, b: doubleWidth(T) + rng.random_unsafe(r, T) + rng.random_unsafe(a, T) + rng.random_unsafe(b, T) + bench("Substraction 2x", $doubleWidth(T), iters): + r.diff(a, b) + +proc mul2xBench*(rLen, aLen, bLen: static int, iters: int) = + var r: BigInt[rLen] + let a = rng.random_unsafe(BigInt[aLen]) + let b = rng.random_unsafe(BigInt[bLen]) + bench("Multiplication", $rLen & " <- " & $aLen & " x " & $bLen, iters): + r.prod(a, b) + +proc reduce2x*(T: typedesc, iters: int) = + var r: T + var t: doubleWidth(T) + rng.random_unsafe(t, T) + + bench("Reduce 2x-width", $T & " <- " & $doubleWidth(T), iters): + r.reduce(t) + +proc main() = + separator() + sumNoReduce(Fp[BLS12_381], iters = 10_000_000) + diffNoReduce(Fp[BLS12_381], iters = 10_000_000) + sum(Fp[BLS12_381], iters = 10_000_000) + diff(Fp[BLS12_381], iters = 10_000_000) + diff2x(Fp[BLS12_381], iters = 10_000_000) + diff2xNoReduce(Fp[BLS12_381], iters = 10_000_000) + mul2xBench(768, 384, 384, iters = 10_000_000) + reduce2x(Fp[BLS12_381], iters = 10_000_000) + separator() + +main() +notes() diff --git a/constantine.nimble b/constantine.nimble index ae409e9..471fbbb 100644 --- a/constantine.nimble +++ b/constantine.nimble @@ -35,6 +35,8 @@ const testDesc: seq[tuple[path: string, useGMP: bool]] = @[ ("tests/t_finite_fields_vs_gmp.nim", true), # Precompute ("tests/t_precomputed", false), + # Double-width finite fields + ("tests/t_finite_fields_double_width.nim", false), # Towers of extension fields ("tests/t_fp2.nim", false), ("tests/t_fp2_sqrt.nim", false), @@ -100,13 +102,15 @@ proc test(flags, path: string, commandFile = false) = # commandFile.writeLine command exec "echo \'" & command & "\' >> " & 
buildParallel -proc runBench(benchName: string, compiler = "") = +proc runBench(benchName: string, compiler = "", useAsm = true) = if not dirExists "build": mkDir "build" var cc = "" if compiler != "": - cc = "--cc:" & compiler & " -d:ConstantineASM=false" + cc = "--cc:" & compiler + if not useAsm: + cc &= " -d:ConstantineASM=false" exec "nim c " & cc & " -d:danger --verbosity:0 -o:build/" & benchName & "_" & compiler & " -r --hints:off --warnings:off benchmarks/" & benchName & ".nim" @@ -298,6 +302,27 @@ task bench_fp_gcc, "Run benchmark 𝔽p with gcc": task bench_fp_clang, "Run benchmark 𝔽p with clang": runBench("bench_fp", "clang") +task bench_fp_gcc_noasm, "Run benchmark 𝔽p with gcc - no Assembly": + runBench("bench_fp", "gcc", useAsm = false) + +task bench_fp_clang_noasm, "Run benchmark 𝔽p with clang - no Assembly": + runBench("bench_fp", "clang", useAsm = false) + +task bench_fpdbl, "Run benchmark 𝔽pDbl with your default compiler": + runBench("bench_fp_double_width") + +task bench_fpdbl_gcc, "Run benchmark 𝔽p with gcc": + runBench("bench_fp_double_width", "gcc") + +task bench_fpdbl_clang, "Run benchmark 𝔽p with clang": + runBench("bench_fp_double_width", "clang") + +task bench_fpdbl_gcc_noasm, "Run benchmark 𝔽p with gcc - no Assembly": + runBench("bench_fp_double_width", "gcc", useAsm = false) + +task bench_fpdbl_clang_noasm, "Run benchmark 𝔽p with clang - no Assembly": + runBench("bench_fp_double_width", "clang", useAsm = false) + task bench_fp2, "Run benchmark with 𝔽p2 your default compiler": runBench("bench_fp2") @@ -307,6 +332,12 @@ task bench_fp2_gcc, "Run benchmark 𝔽p2 with gcc": task bench_fp2_clang, "Run benchmark 𝔽p2 with clang": runBench("bench_fp2", "clang") +task bench_fp2_gcc_noasm, "Run benchmark 𝔽p2 with gcc - no Assembly": + runBench("bench_fp2", "gcc", useAsm = false) + +task bench_fp2_clang_noasm, "Run benchmark 𝔽p2 with clang - no Assembly": + runBench("bench_fp2", "clang", useAsm = false) + task bench_fp6, "Run benchmark with 𝔽p6 your default compiler": runBench("bench_fp6") @@ -316,6 +347,12 @@ task bench_fp6_gcc, "Run benchmark 𝔽p6 with gcc": task bench_fp6_clang, "Run benchmark 𝔽p6 with clang": runBench("bench_fp6", "clang") +task bench_fp6_gcc_noasm, "Run benchmark 𝔽p6 with gcc - no Assembly": + runBench("bench_fp6", "gcc", useAsm = false) + +task bench_fp6_clang_noasm, "Run benchmark 𝔽p6 with clang - no Assembly": + runBench("bench_fp6", "clang", useAsm = false) + task bench_fp12, "Run benchmark with 𝔽p12 your default compiler": runBench("bench_fp12") @@ -325,6 +362,12 @@ task bench_fp12_gcc, "Run benchmark 𝔽p12 with gcc": task bench_fp12_clang, "Run benchmark 𝔽p12 with clang": runBench("bench_fp12", "clang") +task bench_fp12_gcc_noasm, "Run benchmark 𝔽p12 with gcc - no Assembly": + runBench("bench_fp12", "gcc", useAsm = false) + +task bench_fp12_clang_noasm, "Run benchmark 𝔽p12 with clang - no Assembly": + runBench("bench_fp12", "clang", useAsm = false) + task bench_ec_g1, "Run benchmark on Elliptic Curve group 𝔾1 - Short Weierstrass with Projective Coordinates - GCC": runBench("bench_ec_g1") @@ -334,6 +377,12 @@ task bench_ec_g1_gcc, "Run benchmark on Elliptic Curve group 𝔾1 - Short Weier task bench_ec_g1_clang, "Run benchmark on Elliptic Curve group 𝔾1 - Short Weierstrass with Projective Coordinates - Clang": runBench("bench_ec_g1", "clang") +task bench_ec_g1_gcc_noasm, "Run benchmark on Elliptic Curve group 𝔾1 - Short Weierstrass with Projective Coordinates - GCC no Assembly": + runBench("bench_ec_g1", "gcc", useAsm = false) + +task bench_ec_g1_clang_noasm, 
"Run benchmark on Elliptic Curve group 𝔾1 - Short Weierstrass with Projective Coordinates - Clang no Assembly": + runBench("bench_ec_g1", "clang", useAsm = false) + task bench_ec_g2, "Run benchmark on Elliptic Curve group 𝔾2 - Short Weierstrass with Projective Coordinates - GCC": runBench("bench_ec_g2") @@ -342,3 +391,9 @@ task bench_ec_g2_gcc, "Run benchmark on Elliptic Curve group 𝔾2 - Short Weier task bench_ec_g2_clang, "Run benchmark on Elliptic Curve group 𝔾2 - Short Weierstrass with Projective Coordinates - Clang": runBench("bench_ec_g2", "clang") + +task bench_ec_g2_gcc_noasm, "Run benchmark on Elliptic Curve group 𝔾2 - Short Weierstrass with Projective Coordinates - GCC no Assembly": + runBench("bench_ec_g2", "gcc", useAsm = false) + +task bench_ec_g2_clang_noasm, "Run benchmark on Elliptic Curve group 𝔾2 - Short Weierstrass with Projective Coordinates - Clang no Assembly": + runBench("bench_ec_g2", "clang", useAsm = false) diff --git a/constantine/arithmetic.nim b/constantine/arithmetic.nim index 5a85716..0a2f9fe 100644 --- a/constantine/arithmetic.nim +++ b/constantine/arithmetic.nim @@ -8,8 +8,8 @@ import arithmetic/bigints, - arithmetic/[finite_fields, finite_fields_inversion] + arithmetic/[finite_fields, finite_fields_inversion, finite_fields_double_width] export bigints, - finite_fields, finite_fields_inversion + finite_fields, finite_fields_inversion, finite_fields_double_width diff --git a/constantine/arithmetic/bigints.nim b/constantine/arithmetic/bigints.nim index 302e3a1..d4bc8da 100644 --- a/constantine/arithmetic/bigints.nim +++ b/constantine/arithmetic/bigints.nim @@ -9,13 +9,10 @@ import ../config/[common, type_bigint], ../primitives, - ./limbs_generic, + ./limbs, ./limbs_generic_modular, ./limbs_montgomery -when UseX86ASM: - import ./limbs_asm_x86 - export BigInt # ############################################################ @@ -85,10 +82,7 @@ func ccopy*(a: var BigInt, b: BigInt, ctl: SecretBool) = ## If ctl is true: b is copied into a ## if ctl is false: b is not copied and a is untouched ## Time and memory accesses are the same whether a copy occurs or not - when UseX86ASM: - ccopy_asm(a.limbs, b.limbs, ctl) - else: - ccopy(a.limbs, b.limbs, ctl) + ccopy(a.limbs, b.limbs, ctl) func cswap*(a, b: var BigInt, ctl: CTBool) = ## Swap ``a`` and ``b`` if ``ctl`` is true @@ -245,6 +239,14 @@ func prod*[rBits, aBits, bBits](r: var BigInt[rBits], a: BigInt[aBits], b: BigIn ## It will be truncated if it cannot fit in r limbs. 
r.limbs.prod(a.limbs, b.limbs) +func mul*[aBits, bBits](a: var BigInt[aBits], b: BigInt[bBits]) = + ## Multi-precision multiplication + ## a <- a*b + ## `a`, `b`, can have different sizes + var t{.noInit.}: typeof(a) + t.limbs.prod(a.limbs, b.limbs) + a = t + func prod_high_words*[rBits, aBits, bBits](r: var BigInt[rBits], a: BigInt[aBits], b: BigInt[bBits], lowestWordIndex: static int) = ## Multi-precision multiplication keeping only high words ## r <- a*b >> (2^WordBitWidth)^lowestWordIndex diff --git a/constantine/arithmetic/finite_fields.nim b/constantine/arithmetic/finite_fields.nim index ce157d8..e68a936 100644 --- a/constantine/arithmetic/finite_fields.nim +++ b/constantine/arithmetic/finite_fields.nim @@ -29,7 +29,7 @@ import ../config/[common, type_fp, curves], ./bigints, ./limbs_montgomery -when UseX86ASM: +when UseASM_X86_64: import ./limbs_asm_modular_x86 export Fp @@ -120,7 +120,7 @@ func setOne*(a: var Fp) {.inline.} = func `+=`*(a: var Fp, b: Fp) {.inline.} = ## In-place addition modulo p - when UseX86ASM and a.mres.limbs.len <= 6: # TODO: handle spilling + when UseASM_X86_64 and a.mres.limbs.len <= 6: # TODO: handle spilling addmod_asm(a.mres.limbs, b.mres.limbs, Fp.C.Mod.limbs) else: var overflowed = add(a.mres, b.mres) @@ -129,7 +129,7 @@ func `+=`*(a: var Fp, b: Fp) {.inline.} = func `-=`*(a: var Fp, b: Fp) {.inline.} = ## In-place substraction modulo p - when UseX86ASM and a.mres.limbs.len <= 6: # TODO: handle spilling + when UseASM_X86_64 and a.mres.limbs.len <= 6: # TODO: handle spilling submod_asm(a.mres.limbs, b.mres.limbs, Fp.C.Mod.limbs) else: let underflowed = sub(a.mres, b.mres) @@ -137,7 +137,7 @@ func `-=`*(a: var Fp, b: Fp) {.inline.} = func double*(a: var Fp) {.inline.} = ## Double ``a`` modulo p - when UseX86ASM and a.mres.limbs.len <= 6: # TODO: handle spilling + when UseASM_X86_64 and a.mres.limbs.len <= 6: # TODO: handle spilling addmod_asm(a.mres.limbs, a.mres.limbs, Fp.C.Mod.limbs) else: var overflowed = double(a.mres) @@ -145,9 +145,9 @@ func double*(a: var Fp) {.inline.} = discard csub(a.mres, Fp.C.Mod, overflowed) func sum*(r: var Fp, a, b: Fp) {.inline.} = - ## Sum ``a`` and ``b`` into ``r`` module p + ## Sum ``a`` and ``b`` into ``r`` modulo p ## r is initialized/overwritten - when UseX86ASM and a.mres.limbs.len <= 6: # TODO: handle spilling + when UseASM_X86_64 and a.mres.limbs.len <= 6: # TODO: handle spilling r = a addmod_asm(r.mres.limbs, b.mres.limbs, Fp.C.Mod.limbs) else: @@ -155,21 +155,42 @@ func sum*(r: var Fp, a, b: Fp) {.inline.} = overflowed = overflowed or not(r.mres < Fp.C.Mod) discard csub(r.mres, Fp.C.Mod, overflowed) +func sumNoReduce*(r: var Fp, a, b: Fp) {.inline.} = + ## Sum ``a`` and ``b`` into ``r`` without reduction + discard r.mres.sum(a.mres, b.mres) + func diff*(r: var Fp, a, b: Fp) {.inline.} = ## Substract `b` from `a` and store the result into `r`. ## `r` is initialized/overwritten - when UseX86ASM and a.mres.limbs.len <= 6: # TODO: handle spilling - var t = a # Handle aliasing r == b + ## Requires r != b + when UseASM_X86_64 and a.mres.limbs.len <= 6: # TODO: handle spilling + r = a + submod_asm(r.mres.limbs, b.mres.limbs, Fp.C.Mod.limbs) + else: + var underflowed = r.mres.diff(a.mres, b.mres) + discard cadd(r.mres, Fp.C.Mod, underflowed) + +func diffAlias*(r: var Fp, a, b: Fp) {.inline.} = + ## Substract `b` from `a` and store the result into `r`. 
+ ## `r` is initialized/overwritten + ## Handles r == b + when UseASM_X86_64 and a.mres.limbs.len <= 6: # TODO: handle spilling + var t = a submod_asm(t.mres.limbs, b.mres.limbs, Fp.C.Mod.limbs) r = t else: var underflowed = r.mres.diff(a.mres, b.mres) discard cadd(r.mres, Fp.C.Mod, underflowed) +func diffNoReduce*(r: var Fp, a, b: Fp) {.inline.} = + ## Substract `b` from `a` and store the result into `r` + ## without reduction + discard r.mres.diff(a.mres, b.mres) + func double*(r: var Fp, a: Fp) {.inline.} = ## Double ``a`` into ``r`` ## `r` is initialized/overwritten - when UseX86ASM and a.mres.limbs.len <= 6: # TODO: handle spilling + when UseASM_X86_64 and a.mres.limbs.len <= 6: # TODO: handle spilling r = a addmod_asm(r.mres.limbs, a.mres.limbs, Fp.C.Mod.limbs) else: @@ -188,7 +209,7 @@ func square*(r: var Fp, a: Fp) {.inline.} = func neg*(r: var Fp, a: Fp) {.inline.} = ## Negate modulo p - when UseX86ASM and defined(gcc): + when UseASM_X86_64 and defined(gcc): # Clang and every compiler besides GCC # can cleanly optimized this # especially on Fp2 diff --git a/constantine/arithmetic/finite_fields_double_width.nim b/constantine/arithmetic/finite_fields_double_width.nim new file mode 100644 index 0000000..523c744 --- /dev/null +++ b/constantine/arithmetic/finite_fields_double_width.nim @@ -0,0 +1,72 @@ +# Constantine +# Copyright (c) 2018-2019 Status Research & Development GmbH +# Copyright (c) 2020-Present Mamy André-Ratsimbazafy +# Licensed and distributed under either of +# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT). +# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0). +# at your option. This file may not be copied, modified, or distributed except according to those terms. 
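
A small usage sketch of the aliasing rule introduced above (not part of the patch; the repo-relative imports mirror the test suite): `diff` now assumes its destination does not alias the subtrahend, while `diffAlias` is the variant that tolerates r == b.

import ../constantine/config/curves, ../constantine/arithmetic

var a, b, r: Fp[BLS12_381]
a.setOne()
b.setOne()

r.diff(a, b)       # fine: r does not alias b
b.diffAlias(a, b)  # computing b <- a - b needs the alias-tolerant variant

The new sumNoReduce/diffNoReduce variants skip the final conditional reduction entirely and are meant for lazy-reduction code paths such as the double-width arithmetic exercised by the new benchmark.
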
+ +import + ../config/[common, curves, type_bigint, type_fp], + ../primitives, + ./bigints, + ./finite_fields, + ./limbs, + ./limbs_double_width + +when UseASM_X86_64: + import limbs_asm_modular_dbl_width_x86 + +type FpDbl*[C: static Curve] = object + ## Double-width Fp element + ## This allows saving on reductions + # We directly work with double the number of limbs + limbs2x*: matchingLimbs2x(C) + +template doubleWidth*(T: typedesc[Fp]): typedesc = + ## Return the double-width type matching with Fp + FpDbl[T.C] + +func mulNoReduce*(r: var FpDbl, a, b: Fp) {.inline.} = + ## Store the product of ``a`` by ``b`` into ``r`` + r.limbs2x.prod(a.mres.limbs, b.mres.limbs) + +func reduce*(r: var Fp, a: FpDbl) {.inline.} = + ## Reduce a double-width field element into r + const N = r.mres.limbs.len + montyRed( + r.mres.limbs, + a.limbs2x, + Fp.C.Mod.limbs, + Fp.C.getNegInvModWord(), + Fp.C.canUseNoCarryMontyMul() + ) + +func diffNoInline(r: var FpDbl, a, b: FpDbl): Borrow = + r.limbs2x.diff(a.limbs2x, b.limbs2x) + +func diffNoReduce*(r: var FpDbl, a, b: FpDbl) = + ## Double-width substraction without reduction + discard diffNoInline(r, a, b) + +func diff*(r: var FpDbl, a, b: FpDbl) = + ## Double-width modular substraction + when false: # TODO slower + r = a + sub2x_asm(r.limbs2x, b.limbs2x, FpDbl.C.Mod.limbs) + else: + var underflowed = SecretBool diffNoInline(r, a, b) + + const N = r.limbs2x.len div 2 + const M = FpDbl.C.Mod + var carry = Carry(0) + var sum: SecretWord + for i in 0 ..< N: + addC(carry, sum, r.limbs2x[i+N], M.limbs[i], carry) + underflowed.ccopy(r.limbs2x[i+N], sum) + +func `-=`*(a: var FpDbl, b: FpDbl) = + when false: # TODO slower + sub2x_asm(a.limbs2x, b.limbs2x, FpDbl.C.Mod.limbs) + else: + a.diff(a, b) diff --git a/constantine/arithmetic/limbs_generic.nim b/constantine/arithmetic/limbs.nim similarity index 84% rename from constantine/arithmetic/limbs_generic.nim rename to constantine/arithmetic/limbs.nim index ded7c5a..8f7be16 100644 --- a/constantine/arithmetic/limbs_generic.nim +++ b/constantine/arithmetic/limbs.nim @@ -8,8 +8,13 @@ import ../config/common, - ../primitives, - ../../helpers/static_for + ../primitives + +when UseASM_X86_32: + import ./limbs_asm_x86 +when UseASM_X86_64: + import ./limbs_asm_mul_x86 + import ./limbs_asm_mul_x86_adx_bmi2 # ############################################################ # @@ -37,39 +42,9 @@ import # The limb-endianess is little-endian, less significant limb is at index 0. # The word-endianness is native-endian. -type Limbs*[N: static int] = array[N, SecretWord] - ## Limbs-type - ## Should be distinct type to avoid builtins to use non-constant time - ## implementation, for example for comparison. 
- ## - ## but for unknown reason, it prevents semchecking `bits` - # No exceptions allowed {.push raises: [].} -# ############################################################ -# -# Accessors -# -# ############################################################ -# -# Commented out since we don't use a distinct type - -# template `[]`[N](v: Limbs[N], idx: int): SecretWord = -# (array[N, SecretWord])(v)[idx] -# -# template `[]`[N](v: var Limbs[N], idx: int): var SecretWord = -# (array[N, SecretWord])(v)[idx] -# -# template `[]=`[N](v: Limbs[N], idx: int, val: SecretWord) = -# (array[N, SecretWord])(v)[idx] = val - -# ############################################################ -# -# Checks and debug/test only primitives -# -# ############################################################ - # ############################################################ # # Limbs Primitives @@ -104,8 +79,11 @@ func ccopy*(a: var Limbs, b: Limbs, ctl: SecretBool) = ## If ctl is true: b is copied into a ## if ctl is false: b is not copied and a is untouched ## Time and memory accesses are the same whether a copy occurs or not - for i in 0 ..< a.len: - ctl.ccopy(a[i], b[i]) + when UseASM_X86_32: + ccopy_asm(a, b, ctl) + else: + for i in 0 ..< a.len: + ctl.ccopy(a[i], b[i]) func cswap*(a, b: var Limbs, ctl: CTBool) = ## Swap ``a`` and ``b`` if ``ctl`` is true @@ -190,9 +168,12 @@ func shiftRight*(a: var Limbs, k: int) {.inline.}= func add*(a: var Limbs, b: Limbs): Carry = ## Limbs addition ## Returns the carry - result = Carry(0) - for i in 0 ..< a.len: - addC(result, a[i], a[i], b[i], result) + when UseASM_X86_32: + result = add_asm(a, a, b) + else: + result = Carry(0) + for i in 0 ..< a.len: + addC(result, a[i], a[i], b[i], result) func add*(a: var Limbs, w: SecretWord): Carry = ## Limbs addition, add a number that fits in a word @@ -222,16 +203,22 @@ func sum*(r: var Limbs, a, b: Limbs): Carry = ## `r` is initialized/overwritten ## ## Returns the carry - result = Carry(0) - for i in 0 ..< a.len: - addC(result, r[i], a[i], b[i], result) + when UseASM_X86_32: + result = add_asm(r, a, b) + else: + result = Carry(0) + for i in 0 ..< a.len: + addC(result, r[i], a[i], b[i], result) func sub*(a: var Limbs, b: Limbs): Borrow = ## Limbs substraction ## Returns the borrow - result = Borrow(0) - for i in 0 ..< a.len: - subB(result, a[i], a[i], b[i], result) + when UseASM_X86_32: + result = sub_asm(a, a, b) + else: + result = Borrow(0) + for i in 0 ..< a.len: + subB(result, a[i], a[i], b[i], result) func sub*(a: var Limbs, w: SecretWord): Borrow = ## Limbs substraction, sub a number that fits in a word @@ -272,9 +259,12 @@ func diff*(r: var Limbs, a, b: Limbs): Borrow = ## `r` is initialized/overwritten ## ## Returns the borrow - result = Borrow(0) - for i in 0 ..< a.len: - subB(result, r[i], a[i], b[i], result) + when UseASM_X86_32: + result = sub_asm(r, a, b) + else: + result = Borrow(0) + for i in 0 ..< a.len: + subB(result, r[i], a[i], b[i], result) func cneg*(a: var Limbs, ctl: CTBool) = ## Conditional negation. 
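
Stepping back from the limb-level changes for a moment: the double-width field API added in finite_fields_double_width.nim above is meant to be used in a multiply-then-reduce-once pattern. A rough usage sketch, not part of the patch, with repo-relative imports as in the benchmarks:

import ../constantine/config/curves, ../constantine/arithmetic

var a, b, r: Fp[BLS12_381]
a.setOne()
b.setOne()

var wide: doubleWidth(Fp[BLS12_381])   # FpDbl[BLS12_381]: twice the limbs of Fp
wide.mulNoReduce(a, b)                 # full double-width (768-bit) product, no reduction
r.reduce(wide)                         # single Montgomery reduction at the end

Keeping intermediates in FpDbl lets sums of products share one reduction instead of reducing after every multiplication, which is the "saving on reductions" the FpDbl doc comment refers to.
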
@@ -301,7 +291,7 @@ func cneg*(a: var Limbs, ctl: CTBool) = # Multiplication # ------------------------------------------------------------ -func prod*[rLen, aLen, bLen](r: var Limbs[rLen], a: Limbs[aLen], b: Limbs[bLen]) = +func prod*[rLen, aLen, bLen: static int](r: var Limbs[rLen], a: Limbs[aLen], b: Limbs[bLen]) = ## Multi-precision multiplication ## r <- a*b ## @@ -309,23 +299,34 @@ func prod*[rLen, aLen, bLen](r: var Limbs[rLen], a: Limbs[aLen], b: Limbs[bLen]) ## if `r`.limbs.len < a.limbs.len + b.limbs.len ## The result will be truncated, i.e. it will be ## a * b (mod (2^WordBitwidth)^r.limbs.len) + ## + ## `r` must not alias ``a`` or ``b`` - # We use Product Scanning / Comba multiplication - var t, u, v = SecretWord(0) - var z: Limbs[rLen] # zero-init, ensure on stack and removes in-place problems + when UseASM_X86_64 and aLen <= 6: + if ({.noSideEffect.}: hasBmi2()) and ({.noSideEffect.}: hasAdx()): + mul_asm_adx_bmi2(r, a, b) + else: + mul_asm(r, a, b) + elif UseASM_X86_64: + mul_asm(r, a, b) + else: + # We use Product Scanning / Comba multiplication + var t, u, v = SecretWord(0) - staticFor i, 0, min(a.len+b.len, r.len): - const ib = min(b.len-1, i) - const ia = i - ib - staticFor j, 0, min(a.len - ia, ib+1): - mulAcc(t, u, v, a[ia+j], b[ib-j]) + staticFor i, 0, min(a.len+b.len, r.len): + const ib = min(b.len-1, i) + const ia = i - ib + staticFor j, 0, min(a.len - ia, ib+1): + mulAcc(t, u, v, a[ia+j], b[ib-j]) - z[i] = v - v = u - u = t - t = SecretWord(0) + r[i] = v + v = u + u = t + t = SecretWord(0) - r = z + if aLen+bLen < rLen: + for i in aLen+bLen ..< rLen: + r[i] = SecretWord 0 func prod_high_words*[rLen, aLen, bLen]( r: var Limbs[rLen], diff --git a/constantine/arithmetic/limbs_asm_modular_dbl_width_x86.nim b/constantine/arithmetic/limbs_asm_modular_dbl_width_x86.nim new file mode 100644 index 0000000..96ee717 --- /dev/null +++ b/constantine/arithmetic/limbs_asm_modular_dbl_width_x86.nim @@ -0,0 +1,84 @@ +# Constantine +# Copyright (c) 2018-2019 Status Research & Development GmbH +# Copyright (c) 2020-Present Mamy André-Ratsimbazafy +# Licensed and distributed under either of +# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT). +# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0). +# at your option. This file may not be copied, modified, or distributed except according to those terms. 
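
The double-width subtraction kernel that follows, like the existing addmod/submod kernels, uses the same branchless correction: turn the borrow into an all-ones/all-zeros mask, AND the modulus with it, and add the result back. In portable (non-secret-typed) Nim the idea looks roughly like this - an illustrative sketch only, not code from the patch:

func addModulusIfBorrowed(a: var array[6, uint64], M: array[6, uint64], borrowed: bool) =
  ## If `borrowed`, add M back to `a`; otherwise add 0. No branch on `borrowed`.
  let mask = 0'u64 - uint64(borrowed)   # all ones if borrowed, else all zeros
  var carry = 0'u64
  for i in 0 ..< 6:
    let m = M[i] and mask               # masked modulus limb
    let s = a[i] + m
    let c1 = uint64(s < a[i])           # carry out of a[i] + m
    let r = s + carry
    let c2 = uint64(r < s)              # carry out of adding the previous carry
    a[i] = r
    carry = c1 + c2                     # c1 and c2 cannot both be 1

The assembly version interleaves the mask construction (sbb reg, reg) and the masked adds with the main carry chain to hide SBB latency, but the arithmetic is the same.
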
+ +import + # Standard library + std/macros, + # Internal + ../config/common, + ../primitives + +# ############################################################ +# +# Assembly implementation of FpDbl +# +# ############################################################ + +static: doAssert UseASM_X86_64 +{.localPassC:"-fomit-frame-pointer".} # Needed so that the compiler finds enough registers + +# TODO slower than intrinsics + +# Substraction +# ------------------------------------------------------------ + +macro sub2x_gen[N: static int](a: var Limbs[N], b: Limbs[N], M: Limbs[N div 2]): untyped = + ## Generate an optimized out-of-place double-width substraction kernel + + result = newStmtList() + + var ctx = init(Assembler_x86, BaseType) + let + N2 = N div 2 + + arrA = init(OperandArray, nimSymbol = a, N, PointerInReg, InputOutput) + # We reuse the reg used for B for overflow detection + arrB = init(OperandArray, nimSymbol = b, N, PointerInReg, InputOutput) + # We could force M as immediate by specializing per moduli + arrM = init(OperandArray, nimSymbol = M, N, PointerInReg, Input) + # If N is too big, we need to spill registers. TODO. + arrT = init(OperandArray, nimSymbol = ident"t", N2, ElemsInReg, Output_EarlyClobber) + arrTadd = init(OperandArray, nimSymbol = ident"tadd", N2, ElemsInReg, Output_EarlyClobber) + + # Substraction + for i in 0 ..< N: + ctx.mov arrT[i mod N2], arrA[i] + if i == 0: + ctx.sub arrT[0], arrB[0] + else: + ctx.sbb arrT[i mod N2], arrB[i] + ctx.mov arrA[i], arrT[i mod N2] + # Interleaved copy the modulus to hide SBB latencies + if i < N2: + ctx.mov arrTadd[i], arrM[i] + + # Mask: underflowed contains 0xFFFF or 0x0000 + let underflowed = arrB.reuseRegister() + ctx.sbb underflowed, underflowed + + # Now mask the adder, with 0 or the modulus limbs + for i in 0 ..< N2: + ctx.`and` arrTadd[i], underflowed + + # Add the masked modulus + for i in 0 ..< N2: + if i == 0: + ctx.add arrT[0], arrTadd[0] + else: + ctx.adc arrT[i], arrTadd[i] + ctx.mov arrA[i+N2], arrT[i] + + let t = arrT.nimSymbol + let tadd = arrTadd.nimSymbol + result.add quote do: + var `t`{.noinit.}, `tadd` {.noInit.}: typeof(`a`) + result.add ctx.generate + +func sub2x_asm*[N: static int](a: var Limbs[N], b: Limbs[N], M: Limbs[N div 2]) = + ## Constant-time double-width substraction + sub2x_gen(a, b, M) diff --git a/constantine/arithmetic/limbs_asm_modular_x86.nim b/constantine/arithmetic/limbs_asm_modular_x86.nim index 40108c0..05c0aff 100644 --- a/constantine/arithmetic/limbs_asm_modular_x86.nim +++ b/constantine/arithmetic/limbs_asm_modular_x86.nim @@ -12,7 +12,7 @@ import # Internal ../config/common, ../primitives, - ./limbs_generic + ./limbs # ############################################################ # @@ -25,7 +25,7 @@ import # They are nice to let the compiler deals with mov # but too constraining so we move things ourselves. -static: doAssert UseX86ASM +static: doAssert UseASM_X86_64 {.localPassC:"-fomit-frame-pointer".} # Needed so that the compiler finds enough registers @@ -64,6 +64,7 @@ macro addmod_gen[N: static int](a: var Limbs[N], b, M: Limbs[N]): untyped = ctx.mov arrTsub[i], arrT[i] # Mask: overflowed contains 0xFFFF or 0x0000 + # TODO: unnecessary if MSB never set, i.e. 
"canUseNoCarryMontyMul" let overflowed = arrB.reuseRegister() ctx.sbb overflowed, overflowed @@ -118,7 +119,7 @@ macro submod_gen[N: static int](a: var Limbs[N], b, M: Limbs[N]): untyped = arrT = init(OperandArray, nimSymbol = ident"t", N, ElemsInReg, Output_EarlyClobber) arrTadd = init(OperandArray, nimSymbol = ident"tadd", N, ElemsInReg, Output_EarlyClobber) - # Addition + # Substraction for i in 0 ..< N: ctx.mov arrT[i], arrA[i] if i == 0: diff --git a/constantine/arithmetic/limbs_asm_montmul_x86.nim b/constantine/arithmetic/limbs_asm_montmul_x86.nim index 7be67c8..8a50584 100644 --- a/constantine/arithmetic/limbs_asm_montmul_x86.nim +++ b/constantine/arithmetic/limbs_asm_montmul_x86.nim @@ -12,7 +12,8 @@ import # Internal ../config/common, ../primitives, - ./limbs_generic + ./limbs, + ./limbs_asm_montred_x86 # ############################################################ # @@ -25,7 +26,7 @@ import # They are nice to let the compiler deals with mov # but too constraining so we move things ourselves. -static: doAssert UseX86ASM +static: doAssert UseASM_X86_64 # Necessary for the compiler to find enough registers (enabled at -O1) {.localPassC:"-fomit-frame-pointer".} @@ -33,28 +34,6 @@ static: doAssert UseX86ASM # Montgomery multiplication # ------------------------------------------------------------ # Fallback when no ADX and BMI2 support (MULX, ADCX, ADOX) - -proc finalSub*( - ctx: var Assembler_x86, - r: Operand or OperandArray, - t, M, scratch: OperandArray - ) = - ## Reduce `t` into `r` modulo `M` - let N = M.len - ctx.comment "Final substraction" - for i in 0 ..< N: - ctx.mov scratch[i], t[i] - if i == 0: - ctx.sub scratch[i], M[i] - else: - ctx.sbb scratch[i], M[i] - - # If we borrowed it means that we were smaller than - # the modulus and we don't need "scratch" - for i in 0 ..< N: - ctx.cmovnc t[i], scratch[i] - ctx.mov r[i], t[i] - macro montMul_CIOS_nocarry_gen[N: static int](r_MM: var Limbs[N], a_MM, b_MM, M_MM: Limbs[N], m0ninv_MM: BaseType): untyped = ## Generate an optimized Montgomery Multiplication kernel ## using the CIOS method @@ -211,7 +190,7 @@ macro montMul_CIOS_nocarry_gen[N: static int](r_MM: var Limbs[N], a_MM, b_MM, M_ ctx.mov rRDX, r let r2 = rRDX.asArrayAddr(len = N) - ctx.finalSub( + ctx.finalSubNoCarry( r2, t, M, scratch ) diff --git a/constantine/arithmetic/limbs_asm_montmul_x86_adx_bmi2.nim b/constantine/arithmetic/limbs_asm_montmul_x86_adx_bmi2.nim index f9d7d57..db6b85a 100644 --- a/constantine/arithmetic/limbs_asm_montmul_x86_adx_bmi2.nim +++ b/constantine/arithmetic/limbs_asm_montmul_x86_adx_bmi2.nim @@ -12,8 +12,8 @@ import # Internal ../config/common, ../primitives, - ./limbs_generic, - ./limbs_asm_montmul_x86 + ./limbs, + ./limbs_asm_montred_x86 # ############################################################ # @@ -26,7 +26,7 @@ import # They are nice to let the compiler deals with mov # but too constraining so we move things ourselves. -static: doAssert UseX86ASM +static: doAssert UseASM_X86_64 # MULX/ADCX/ADOX {.localPassC:"-madx -mbmi2".} @@ -37,53 +37,58 @@ static: doAssert UseX86ASM # ------------------------------------------------------------ proc mulx_by_word( ctx: var Assembler_x86, - C: Operand, + hi: Operand, t: OperandArray, a: Operand, # Pointer in scratchspace - word: Operand, - S, rRDX: Operand + word0: Operand, + lo, rRDX: Operand ) = ## Multiply the `a[0..= 2, "The Assembly-optimized montgomery multiplication requires at least 2 limbs." ctx.comment " Outer loop i = 0" - ctx.`xor` rRDX, rRDX # Clear flags - TODO: necessary? 
- ctx.mov rRDX, word # for j=0 to N-1 # (C,t[j]) := t[j] + a[j]*b[i] + C # First limb - ctx.mulx t[1], t[0], a[0], rdx + ctx.mov rRDX, word0 + if N > 1: + ctx.mulx t[1], t[0], a[0], rdx + ctx.`xor` hi, hi # Clear flags - TODO: necessary? + else: + ctx.mulx hi, t[0], a[0], rdx + return # Steady state for j in 1 ..< N-1: - ctx.mulx t[j+1], S, a[j], rdx - ctx.adox t[j], S # TODO, we probably can use ADC here + ctx.mulx t[j+1], lo, a[j], rdx + if j == 1: + ctx.add t[j], lo + else: + ctx.adc t[j], lo # Last limb - ctx.mulx C, S, a[N-1], rdx - ctx.adox t[N-1], S + ctx.comment " Outer loop i = 0, last limb" + ctx.mulx hi, lo, a[N-1], rdx + ctx.adc t[N-1], lo # Final carries - ctx.comment " Mul carries i = 0" - ctx.mov rRDX, 0 # Set to 0 without clearing flags - ctx.adcx C, rRDX - ctx.adox C, rRDX + ctx.comment " Accumulate last carries in hi word" + ctx.adc hi, 0 proc mulaccx_by_word( ctx: var Assembler_x86, - C: Operand, + hi: Operand, t: OperandArray, a: Operand, # Pointer in scratchspace i: int, word: Operand, - S, rRDX: Operand + lo, rRDX: Operand ) = ## Multiply the `a[0..= 2, "The Assembly-optimized montgomery multiplication requires at least 2 limbs." doAssert i != 0 - ctx.comment " Outer loop i = " & $i - ctx.`xor` rRDX, rRDX # Clear flags - TODO: necessary? + ctx.comment " Outer loop i = " & $i & ", j in [0, " & $N & ")" ctx.mov rRDX, word + ctx.`xor` hi, hi # Clear flags - TODO: necessary? # for j=0 to N-1 # (C,t[j]) := t[j] + a[j]*b[i] + C # Steady state for j in 0 ..< N-1: - ctx.mulx C, S, a[j], rdx - ctx.adox t[j], S - ctx.adcx t[j+1], C + ctx.mulx hi, lo, a[j], rdx + ctx.adox t[j], lo + ctx.adcx t[j+1], hi # Last limb - ctx.mulx C, S, a[N-1], rdx - ctx.adox t[N-1], S + ctx.comment " Outer loop i = " & $i & ", last limb" + ctx.mulx hi, lo, a[N-1], rdx + ctx.adox t[N-1], lo # Final carries - ctx.comment " Mul carries i = " & $i + ctx.comment " Accumulate last carries in hi word" ctx.mov rRDX, 0 # Set to 0 without clearing flags - ctx.adcx C, rRDX - ctx.adox C, rRDX + ctx.adcx hi, rRDX + ctx.adox hi, rRDX proc partialRedx( ctx: var Assembler_x86, @@ -163,7 +168,7 @@ proc partialRedx( ctx.adox t[j-1], S # Last carries - # t[N-1} = S + C + # t[N-1] = S + C ctx.comment " Reduction carry " ctx.mov S, 0 ctx.adcx t[N-1], S @@ -182,13 +187,13 @@ macro montMul_CIOS_nocarry_adx_bmi2_gen[N: static int](r_MM: var Limbs[N], a_MM, let scratchSlots = max(N, 6) - r = init(OperandArray, nimSymbol = r_MM, N, PointerInReg, InputOutput) + r = init(OperandArray, nimSymbol = r_MM, N, PointerInReg, InputOutput_EnsureClobber) # We could force M as immediate by specializing per moduli M = init(OperandArray, nimSymbol = M_MM, N, PointerInReg, Input) # If N is too big, we need to spill registers. TODO. 
t = init(OperandArray, nimSymbol = ident"t", N, ElemsInReg, Output_EarlyClobber) # MultiPurpose Register slots - scratch = init(OperandArray, nimSymbol = ident"scratch", scratchSlots, ElemsInReg, InputOutput) + scratch = init(OperandArray, nimSymbol = ident"scratch", scratchSlots, ElemsInReg, InputOutput_EnsureClobber) # MULX requires RDX rRDX = Operand( @@ -270,7 +275,7 @@ macro montMul_CIOS_nocarry_adx_bmi2_gen[N: static int](r_MM: var Limbs[N], a_MM, lo, C, rRDX ) - ctx.finalSub( + ctx.finalSubNoCarry( r, t, M, scratch ) diff --git a/constantine/arithmetic/limbs_asm_montred_x86.nim b/constantine/arithmetic/limbs_asm_montred_x86.nim new file mode 100644 index 0000000..a696812 --- /dev/null +++ b/constantine/arithmetic/limbs_asm_montred_x86.nim @@ -0,0 +1,255 @@ +# Constantine +# Copyright (c) 2018-2019 Status Research & Development GmbH +# Copyright (c) 2020-Present Mamy André-Ratsimbazafy +# Licensed and distributed under either of +# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT). +# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0). +# at your option. This file may not be copied, modified, or distributed except according to those terms. + +import + # Standard library + std/macros, + # Internal + ../config/common, + ../primitives, + ./limbs + +# ############################################################ +# +# Assembly implementation of finite fields +# +# ############################################################ + + +static: doAssert UseASM_X86_32 + +# Necessary for the compiler to find enough registers (enabled at -O1) +{.localPassC:"-fomit-frame-pointer".} + +proc finalSubNoCarry*( + ctx: var Assembler_x86, + r: Operand or OperandArray, + t, M, scratch: OperandArray + ) = + ## Reduce `t` into `r` modulo `M` + let N = M.len + ctx.comment "Final substraction (no carry)" + for i in 0 ..< N: + ctx.mov scratch[i], t[i] + if i == 0: + ctx.sub scratch[i], M[i] + else: + ctx.sbb scratch[i], M[i] + + # If we borrowed it means that we were smaller than + # the modulus and we don't need "scratch" + for i in 0 ..< N: + ctx.cmovnc t[i], scratch[i] + ctx.mov r[i], t[i] + +proc finalSubCanOverflow*( + ctx: var Assembler_x86, + r: Operand or OperandArray, + t, M, scratch: OperandArray, + overflowReg: Operand + ) = + ## Reduce `t` into `r` modulo `M` + ## To be used when the final substraction can + ## also depend on the carry flag + ## This is in particular possible when the MSB + ## is set for the prime modulus + ## `overflowReg` should be a register that will be used + ## to store the carry flag + + ctx.sbb overflowReg, overflowReg + + let N = M.len + ctx.comment "Final substraction (may carry)" + for i in 0 ..< N: + ctx.mov scratch[i], t[i] + if i == 0: + ctx.sub scratch[i], M[i] + else: + ctx.sbb scratch[i], M[i] + + ctx.sbb overflowReg, 0 + + # If we borrowed it means that we were smaller than + # the modulus and we don't need "scratch" + for i in 0 ..< N: + ctx.cmovnc t[i], scratch[i] + ctx.mov r[i], t[i] + + +# Montgomery reduction +# ------------------------------------------------------------ + +macro montyRed_gen[N: static int]( + r_MR: var array[N, SecretWord], + t_MR: array[N*2, SecretWord], + M_MR: array[N, SecretWord], + m0ninv_MR: BaseType, + canUseNoCarryMontyMul: static bool + ) = + # TODO, slower than Clang, in particular due to the shadowing + + result = newStmtList() + + var ctx = init(Assembler_x86, BaseType) + let + # We could force M as immediate by specializing per 
moduli + M = init(OperandArray, nimSymbol = M_MR, N, PointerInReg, Input) + + # MUL requires RAX and RDX + rRAX = Operand( + desc: OperandDesc( + asmId: "[rax]", + nimSymbol: ident"rax", + rm: RAX, + constraint: InputOutput_EnsureClobber, + cEmit: "rax" + ) + ) + + rRDX = Operand( + desc: OperandDesc( + asmId: "[rdx]", + nimSymbol: ident"rdx", + rm: RDX, + constraint: Output_EarlyClobber, + cEmit: "rdx" + ) + ) + + m0ninv = Operand( + desc: OperandDesc( + asmId: "[m0ninv]", + nimSymbol: m0ninv_MR, + rm: Reg, + constraint: Input, + cEmit: "m0ninv" + ) + ) + + + let scratchSlots = N+2 + var scratch = init(OperandArray, nimSymbol = ident"scratch", scratchSlots, ElemsInReg, InputOutput_EnsureClobber) + + # Prologue + let eax = rRAX.desc.nimSymbol + let edx = rRDX.desc.nimSymbol + let scratchSym = scratch.nimSymbol + result.add quote do: + static: doAssert: sizeof(SecretWord) == sizeof(ByteAddress) + + var `eax`{.noInit.}, `edx`{.noInit.}: BaseType + var `scratchSym` {.noInit.}: Limbs[`scratchSlots`] + + # Algorithm + # --------------------------------------------------------- + # for i in 0 .. n-1: + # hi <- 0 + # m <- t[i] * m0ninv mod 2^w (i.e. simple multiplication) + # for j in 0 .. n-1: + # (hi, lo) <- t[i+j] + m * M[j] + hi + # t[i+j] <- lo + # t[i+n] += hi + # for i in 0 .. n-1: + # r[i] = t[i+n] + # if r >= M: + # r -= M + + # No register spilling handling + doAssert N <= 6, "The Assembly-optimized montgomery multiplication requires at most 6 limbs." + + result.add quote do: + `eax` = BaseType `t_MR`[0] + `scratchSym`[1 .. `N`-1] = `t_MR`.toOpenArray(1, `N`-1) + + ctx.mov scratch[N], rRAX + ctx.imul rRAX, m0ninv # m <- t[i] * m0ninv mod 2^w + ctx.mov scratch[0], rRAX + + # scratch: [t[0] * m0, t[1], t[2], t[3], t[0]] for 4 limbs + + for i in 0 ..< N: + ctx.comment "" + let hi = scratch[N] + let next = scratch[N+1] + + ctx.mul rdx, rax, M[0], rax + ctx.add hi, rRAX # Guaranteed to be zero + ctx.mov rRAX, scratch[0] + ctx.adc hi, rRDX + + for j in 1 ..< N-1: + ctx.comment "" + ctx.mul rdx, rax, M[j], rax + ctx.add scratch[j], rRAX + ctx.mov rRAX, scratch[0] + ctx.adc rRDX, 0 + ctx.add scratch[j], hi + ctx.adc rRDX, 0 + ctx.mov hi, rRDX + + # Next load + if i < N-1: + ctx.comment "" + ctx.mov next, scratch[1] + ctx.imul scratch[1], m0ninv + ctx.comment "" + + # Last limb + ctx.comment "" + ctx.mul rdx, rax, M[N-1], rax + ctx.add scratch[N-1], rRAX + ctx.mov rRAX, scratch[1] # Contains next * m0 + ctx.adc rRDX, 0 + ctx.add scratch[N-1], hi + ctx.adc rRDX, 0 + ctx.mov hi, rRDX + + scratch.rotateLeft() + + # Code generation + result.add ctx.generate() + + # New codegen + ctx = init(Assembler_x86, BaseType) + + let r = init(OperandArray, nimSymbol = r_MR, N, PointerInReg, InputOutput_EnsureClobber) + let t = init(OperandArray, nimSymbol = t_MR, N*2, PointerInReg, Input) + let extraRegNeeded = N-2 + let tsub = init(OperandArray, nimSymbol = ident"tsub", extraRegNeeded, ElemsInReg, InputOutput_EnsureClobber) + let tsubsym = tsub.nimSymbol + result.add quote do: + var `tsubsym` {.noInit.}: Limbs[`extraRegNeeded`] + + # This does t[i+n] += hi + # but in a separate carry chain, fused with the + # copy "r[i] = t[i+n]" + for i in 0 ..< N: + if i == 0: + ctx.add scratch[i], t[i+N] + else: + ctx.adc scratch[i], t[i+N] + + let reuse = repackRegisters(tsub, scratch[N], scratch[N+1]) + + if canUseNoCarryMontyMul: + ctx.finalSubNoCarry(r, scratch, M, reuse) + else: + ctx.finalSubCanOverflow(r, scratch, M, reuse, rRAX) + + # Code generation + result.add ctx.generate() + +func montRed_asm*[N: static int]( + 
r: var array[N, SecretWord], + t: array[N*2, SecretWord], + M: array[N, SecretWord], + m0ninv: BaseType, + canUseNoCarryMontyMul: static bool + ) = + ## Constant-time Montgomery reduction + montyRed_gen(r, t, M, m0ninv, canUseNoCarryMontyMul) diff --git a/constantine/arithmetic/limbs_asm_montred_x86_adx_bmi2.nim b/constantine/arithmetic/limbs_asm_montred_x86_adx_bmi2.nim new file mode 100644 index 0000000..442cd34 --- /dev/null +++ b/constantine/arithmetic/limbs_asm_montred_x86_adx_bmi2.nim @@ -0,0 +1,191 @@ +# Constantine +# Copyright (c) 2018-2019 Status Research & Development GmbH +# Copyright (c) 2020-Present Mamy André-Ratsimbazafy +# Licensed and distributed under either of +# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT). +# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0). +# at your option. This file may not be copied, modified, or distributed except according to those terms. + +import + # Standard library + std/macros, + # Internal + ../config/common, + ../primitives, + ./limbs, + ./limbs_asm_montred_x86 + +# ############################################################ +# +# Assembly implementation of finite fields +# +# ############################################################ + +# TODO, MCL has an implementation about 14% faster + +static: doAssert UseASM_X86_64 + +# MULX/ADCX/ADOX +{.localPassC:"-madx -mbmi2".} +# Necessary for the compiler to find enough registers (enabled at -O1) +{.localPassC:"-fomit-frame-pointer".} + +# Montgomery reduction +# ------------------------------------------------------------ + +macro montyRedx_gen[N: static int]( + r_MR: var array[N, SecretWord], + t_MR: array[N*2, SecretWord], + M_MR: array[N, SecretWord], + m0ninv_MR: BaseType, + canUseNoCarryMontyMul: static bool + ) = + # TODO, slower than Clang, in particular due to the shadowing + + result = newStmtList() + + var ctx = init(Assembler_x86, BaseType) + let + # We could force M as immediate by specializing per moduli + M = init(OperandArray, nimSymbol = M_MR, N, PointerInReg, Input) + + hi = Operand( + desc: OperandDesc( + asmId: "[hi]", + nimSymbol: ident"hi", + rm: Reg, + constraint: Output_EarlyClobber, + cEmit: "hi" + ) + ) + + lo = Operand( + desc: OperandDesc( + asmId: "[lo]", + nimSymbol: ident"lo", + rm: Reg, + constraint: Output_EarlyClobber, + cEmit: "lo" + ) + ) + + rRDX = Operand( + desc: OperandDesc( + asmId: "[rdx]", + nimSymbol: ident"rdx", + rm: RDX, + constraint: InputOutput_EnsureClobber, + cEmit: "rdx" + ) + ) + + m0ninv = Operand( + desc: OperandDesc( + asmId: "[m0ninv]", + nimSymbol: m0ninv_MR, + rm: Reg, + constraint: Input, + cEmit: "m0ninv" + ) + ) + + let scratchSlots = N+1 + var scratch = init(OperandArray, nimSymbol = ident"scratch", scratchSlots, ElemsInReg, InputOutput_EnsureClobber) + + # Prologue + let edx = rRDX.desc.nimSymbol + let hisym = hi.desc.nimSymbol + let losym = lo.desc.nimSymbol + let scratchSym = scratch.nimSymbol + result.add quote do: + static: doAssert: sizeof(SecretWord) == sizeof(ByteAddress) + + var `hisym`{.noInit.}, `losym`{.noInit.}, `edx`{.noInit.}: BaseType + var `scratchSym` {.noInit.}: Limbs[`scratchSlots`] + + # Algorithm + # --------------------------------------------------------- + # for i in 0 .. n-1: + # hi <- 0 + # m <- t[i] * m0ninv mod 2^w (i.e. simple multiplication) + # for j in 0 .. n-1: + # (hi, lo) <- t[i+j] + m * M[j] + hi + # t[i+j] <- lo + # t[i+n] += hi + # for i in 0 .. 
n-1: + # r[i] = t[i+n] + # if r >= M: + # r -= M + + # No register spilling handling + doAssert N <= 6, "The Assembly-optimized montgomery multiplication requires at most 6 limbs." + + result.add quote do: + `edx` = BaseType(`m0ninv_MR`) + `scratchSym`[0 .. `N`-1] = `t_MR`.toOpenArray(0, `N`-1) + + for i in 0 ..< N: + # RDX contains m0ninv at the start of each loop + ctx.comment "" + ctx.imul rRDX, scratch[0] # m <- t[i] * m0ninv mod 2^w + ctx.comment "---- Reduction " & $i + ctx.`xor` scratch[N], scratch[N] + + for j in 0 ..< N-1: + ctx.comment "" + ctx.mulx hi, lo, M[j], rdx + ctx.adcx scratch[j], lo + ctx.adox scratch[j+1], hi + + # Last limb + ctx.comment "" + ctx.mulx hi, lo, M[N-1], rdx + ctx.mov rRDX, m0ninv # Reload m0ninv for next iter + ctx.adcx scratch[N-1], lo + ctx.adox hi, scratch[N] + ctx.adcx scratch[N], hi + + scratch.rotateLeft() + + # Code generation + result.add ctx.generate() + + # New codegen + ctx = init(Assembler_x86, BaseType) + + let r = init(OperandArray, nimSymbol = r_MR, N, PointerInReg, InputOutput_EnsureClobber) + let t = init(OperandArray, nimSymbol = t_MR, N*2, PointerInReg, Input) + let extraRegNeeded = N-1 + let tsub = init(OperandArray, nimSymbol = ident"tsub", extraRegNeeded, ElemsInReg, InputOutput_EnsureClobber) + let tsubsym = tsub.nimSymbol + result.add quote do: + var `tsubsym` {.noInit.}: Limbs[`extraRegNeeded`] + + # This does t[i+n] += hi + # but in a separate carry chain, fused with the + # copy "r[i] = t[i+n]" + for i in 0 ..< N: + if i == 0: + ctx.add scratch[i], t[i+N] + else: + ctx.adc scratch[i], t[i+N] + + let reuse = repackRegisters(tsub, scratch[N]) + + if canUseNoCarryMontyMul: + ctx.finalSubNoCarry(r, scratch, M, reuse) + else: + ctx.finalSubCanOverflow(r, scratch, M, reuse, hi) + + # Code generation + result.add ctx.generate() + +func montRed_asm_adx_bmi2*[N: static int]( + r: var array[N, SecretWord], + t: array[N*2, SecretWord], + M: array[N, SecretWord], + m0ninv: BaseType, + canUseNoCarryMontyMul: static bool + ) = + ## Constant-time Montgomery reduction + montyRedx_gen(r, t, M, m0ninv, canUseNoCarryMontyMul) diff --git a/constantine/arithmetic/limbs_asm_mul_x86.nim b/constantine/arithmetic/limbs_asm_mul_x86.nim new file mode 100644 index 0000000..64a12e3 --- /dev/null +++ b/constantine/arithmetic/limbs_asm_mul_x86.nim @@ -0,0 +1,146 @@ +# Constantine +# Copyright (c) 2018-2019 Status Research & Development GmbH +# Copyright (c) 2020-Present Mamy André-Ratsimbazafy +# Licensed and distributed under either of +# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT). +# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0). +# at your option. This file may not be copied, modified, or distributed except according to those terms. + +import + # Standard library + std/macros, + # Internal + ../config/common, + ../primitives + +# ############################################################ +# +# Assembly implementation of bigint multiplication +# +# ############################################################ + +# Note: We can refer to at most 30 registers in inline assembly +# and "InputOutput" registers count double +# They are nice to let the compiler deals with mov +# but too constraining so we move things ourselves. + +# TODO: verify that assembly generated works for small arrays +# that are passed by values + +static: doAssert UseASM_X86_64 # Need 8 registers just for mul + # and 32-bit only has 8 max. 
+ +macro mul_gen[rLen, aLen, bLen: static int](r: var Limbs[rLen], a: Limbs[aLen], b: Limbs[bLen]) = + ## Comba multiplication generator + ## `a`, `b`, `r` can have a different number of limbs + ## if `r`.limbs.len < a.limbs.len + b.limbs.len + ## The result will be truncated, i.e. it will be + ## a * b (mod (2^WordBitwidth)^r.limbs.len) + ## + ## Assumes r doesn't aliases a or b + + result = newStmtList() + + var ctx = init(Assembler_x86, BaseType) + let + arrR = init(OperandArray, nimSymbol = r, rLen, PointerInReg, InputOutput_EnsureClobber) + arrA = init(OperandArray, nimSymbol = a, aLen, PointerInReg, Input) + arrB = init(OperandArray, nimSymbol = b, bLen, PointerInReg, Input) + + t = Operand( + desc: OperandDesc( + asmId: "[t]", + nimSymbol: ident"t", + rm: Reg, + constraint: Output_EarlyClobber, + cEmit: "t" + ) + ) + + u = Operand( + desc: OperandDesc( + asmId: "[u]", + nimSymbol: ident"u", + rm: Reg, + constraint: Output_EarlyClobber, + cEmit: "u" + ) + ) + + v = Operand( + desc: OperandDesc( + asmId: "[v]", + nimSymbol: ident"v", + rm: Reg, + constraint: Output_EarlyClobber, + cEmit: "v" + ) + ) + + # MUL requires RAX and RDX + rRAX = Operand( + desc: OperandDesc( + asmId: "[rax]", + nimSymbol: ident"rax", + rm: RAX, + constraint: Output_EarlyClobber, + cEmit: "rax" + ) + ) + + rRDX = Operand( + desc: OperandDesc( + asmId: "[rdx]", + nimSymbol: ident"rdx", + rm: RDX, + constraint: Output_EarlyClobber, + cEmit: "rdx" + ) + ) + + + # Prologue + let tsym = t.desc.nimSymbol + let usym = u.desc.nimSymbol + let vsym = v.desc.nimSymbol + let eax = rRAX.desc.nimSymbol + let edx = rRDX.desc.nimSymbol + result.add quote do: + var `tsym`{.noInit.}, `usym`{.noInit.}, `vsym`{.noInit.}: BaseType # zero-init + var `eax`{.noInit.}, `edx`{.noInit.}: BaseType + + # Algorithm + ctx.`xor` u, u + ctx.`xor` v, v + ctx.`xor` t, t + + for i in 0 ..< min(aLen+bLen, rLen): + let ib = min(bLen-1, i) + let ia = i - ib + for j in 0 ..< min(aLen - ia, ib+1): + # (t, u, v) <- (t, u, v) + a[ia+j] * b[ib-j] + ctx.mov rRAX, arrB[ib-j] + ctx.mul rdx, rax, arrA[ia+j], rax + ctx.add v, rRAX + ctx.adc u, rRDX + ctx.adc t, 0 + + ctx.mov arrR[i], v + + if i != min(aLen+bLen, rLen) - 1: + ctx.mov v, u + ctx.mov u, t + ctx.`xor` t, t + + if aLen+bLen < rLen: + ctx.`xor` rRAX, rRAX + for i in aLen+bLen ..< rLen: + ctx.mov arrR[i], rRAX + + # Codegen + result.add ctx.generate + +func mul_asm*[rLen, aLen, bLen: static int](r: var Limbs[rLen], a: Limbs[aLen], b: Limbs[bLen]) = + ## Multi-precision Multiplication + ## Assumes r doesn't alias a or b + mul_gen(r, a, b) diff --git a/constantine/arithmetic/limbs_asm_mul_x86_adx_bmi2.nim b/constantine/arithmetic/limbs_asm_mul_x86_adx_bmi2.nim new file mode 100644 index 0000000..dcc9042 --- /dev/null +++ b/constantine/arithmetic/limbs_asm_mul_x86_adx_bmi2.nim @@ -0,0 +1,197 @@ +# Constantine +# Copyright (c) 2018-2019 Status Research & Development GmbH +# Copyright (c) 2020-Present Mamy André-Ratsimbazafy +# Licensed and distributed under either of +# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT). +# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0). +# at your option. This file may not be copied, modified, or distributed except according to those terms. 
+ +import + # Standard library + std/macros, + # Internal + ../config/common, + ../primitives + +# ############################################################ +# +# Assembly implementation of finite fields +# +# ############################################################ + +# Note: We can refer to at most 30 registers in inline assembly +# and "InputOutput" registers count double +# They are nice to let the compiler deals with mov +# but too constraining so we move things ourselves. + +static: doAssert UseASM_X86_64 + +# MULX/ADCX/ADOX +{.localPassC:"-madx -mbmi2".} +# Necessary for the compiler to find enough registers (enabled at -O1) +# {.localPassC:"-fomit-frame-pointer".} + +# Multiplication +# ------------------------------------------------------------ +proc mulx_by_word( + ctx: var Assembler_x86, + r0: Operand, + a, t: OperandArray, + word0: Operand, + rRAX, rRDX: Operand + ) = + ## Multiply the `a[0..= M: + # r -= M + # + # Important note: `t[i+n] += C` should propagate the carry + # to the higher limb if any, thank you "implementation detail" + # missing from paper. + when UseASM_X86_64 and r.len <= 6: + if ({.noSideEffect.}: hasBmi2()) and ({.noSideEffect.}: hasAdx()): + montRed_asm_adx_bmi2(r, t, M, m0ninv, canUseNoCarryMontyMul) + else: + montRed_asm(r, t, M, m0ninv, canUseNoCarryMontyMul) + elif UseASM_X86_32 and r.len <= 6: + # TODO: Assembly faster than GCC but slower than Clang + montRed_asm(r, t, M, m0ninv, canUseNoCarryMontyMul) + else: + var t = t # Copy "t" for mutation and ensure on stack + var res: typeof(r) # Accumulator + staticFor i, 0, N: + var C = Zero + let m = t[i] * SecretWord(m0ninv) + staticFor j, 0, N: + muladd2(C, t[i+j], m, M[j], t[i+j], C) + res[i] = C + + # This does t[i+n] += C + # but in a separate carry chain, fused with the + # copy "r[i] = t[i+n]" + var carry = Carry(0) + staticFor i, 0, N: + addC(carry, res[i], t[i+N], res[i], carry) + + # Final substraction + discard res.csub(M, SecretWord(carry).isNonZero() or not(res < M)) + r = res diff --git a/constantine/arithmetic/limbs_generic_modular.nim b/constantine/arithmetic/limbs_generic_modular.nim index ac73d6c..c0b0586 100644 --- a/constantine/arithmetic/limbs_generic_modular.nim +++ b/constantine/arithmetic/limbs_generic_modular.nim @@ -9,7 +9,7 @@ import ../config/common, ../primitives, - ./limbs_generic + ./limbs # No exceptions allowed {.push raises: [].} diff --git a/constantine/arithmetic/limbs_montgomery.nim b/constantine/arithmetic/limbs_montgomery.nim index 93d8396..c6e53f4 100644 --- a/constantine/arithmetic/limbs_montgomery.nim +++ b/constantine/arithmetic/limbs_montgomery.nim @@ -12,9 +12,9 @@ import # Internal ../config/common, ../primitives, - ./limbs_generic + ./limbs -when UseX86ASM: +when UseASM_X86_64: import ./limbs_asm_montmul_x86, ./limbs_asm_montmul_x86_adx_bmi2 @@ -37,55 +37,7 @@ when UseX86ASM: # - pairing final exponentiation # are bottlenecked by Montgomery multiplications or squarings # -# Unfortunately, the fastest implementation of Montgomery Multiplication -# on x86 is impossible without resorting to assembly (probably 15~30% faster) -# -# It requires implementing 2 parallel pipelines of carry-chains (via instruction-level parallelism) -# of MULX, ADCX and ADOX instructions, according to Intel paper: -# https://www.intel.cn/content/dam/www/public/us/en/documents/white-papers/ia-large-integer-arithmetic-paper.pdf -# and the code generation of MCL -# https://github.com/herumi/mcl -# -# A generic implementation would require implementing a mini-compiler as macro -# 
significantly sacrificing code readability, portability, auditability and maintainability. -# -# This would however save significant hardware or cloud resources. -# An example inline assembly compiler for add-with-carry is available in -# primitives/research/addcarry_subborrow_compiler.nim -# -# Instead we follow the optimized high-level implementation of Goff -# which skips a significant amount of additions for moduli -# that have their the most significant bit unset. - -# Loop unroller -# ------------------------------------------------------------ - -proc replaceNodes(ast: NimNode, what: NimNode, by: NimNode): NimNode = - # Replace "what" ident node by "by" - proc inspect(node: NimNode): NimNode = - case node.kind: - of {nnkIdent, nnkSym}: - if node.eqIdent(what): - return by - return node - of nnkEmpty: - return node - of nnkLiterals: - return node - else: - var rTree = node.kind.newTree() - for child in node: - rTree.add inspect(child) - return rTree - result = inspect(ast) - -macro staticFor(idx: untyped{nkIdent}, start, stopEx: static int, body: untyped): untyped = - result = newStmtList() - for i in start ..< stopEx: - result.add nnkBlockStmt.newTree( - ident("unrolledIter_" & $idx & $i), - body.replaceNodes(idx, newLit i) - ) +# Hence we use inline assembly where possible # No exceptions allowed {.push raises: [].} @@ -348,7 +300,7 @@ func montyMul*( # - specialize/duplicate code for m0ninv == 1 (especially if only 1 curve is needed) # - keep it generic and optimize code size when canUseNoCarryMontyMul: - when UseX86ASM and a.len in {2 .. 6}: # TODO: handle spilling + when UseASM_X86_64 and a.len in {2 .. 6}: # TODO: handle spilling if ({.noSideEffect.}: hasBmi2()) and ({.noSideEffect.}: hasAdx()): montMul_CIOS_nocarry_asm_adx_bmi2(r, a, b, M, m0ninv) else: diff --git a/constantine/config/common.nim b/constantine/config/common.nim index 4707845..050d7a1 100644 --- a/constantine/config/common.nim +++ b/constantine/config/common.nim @@ -30,6 +30,14 @@ type SecretBool* = CTBool[SecretWord] + + Limbs*[N: static int] = array[N, SecretWord] + ## Limbs-type + ## Should be distinct type to avoid builtins to use non-constant time + ## implementation, for example for comparison. 
+ ## + ## but for unknown reason, it prevents semchecking `bits` + const WordBitWidth* = sizeof(SecretWord) * 8 ## Logical word size @@ -44,7 +52,8 @@ const # TODO, we restrict assembly to 64-bit words # We need to support register spills for large limbs const ConstantineASM {.booldefine.} = true -const UseX86ASM* = WordBitWidth == 64 and ConstantineASM and X86 and GCC_Compatible +const UseASM_X86_32* = ConstantineASM and X86 and GCC_Compatible +const UseASM_X86_64* = WordBitWidth == 64 and UseASM_X86_32 # ############################################################ # diff --git a/constantine/config/curves.nim b/constantine/config/curves.nim index cc6ecdb..a2debf6 100644 --- a/constantine/config/curves.nim +++ b/constantine/config/curves.nim @@ -27,15 +27,19 @@ macro Mod*(C: static Curve): untyped = ## Get the Modulus associated to a curve result = bindSym($C & "_Modulus") -func getCurveBitwidth*(C: static Curve): static int = +template getCurveBitwidth*(C: Curve): int = ## Returns the number of bits taken by the curve modulus - result = static(CurveBitWidth[C]) + CurveBitWidth[C] template matchingBigInt*(C: static Curve): untyped = BigInt[CurveBitWidth[C]] -func family*(C: static Curve): CurveFamily = - result = static(CurveFamilies[C]) +template family*(C: Curve): CurveFamily = + CurveFamilies[C] + +template matchingLimbs2x*(C: Curve): untyped = + const N2 = wordsRequired(getCurveBitwidth(C)) * 2 # TODO upstream, not precomputing N2 breaks semcheck + array[N2, SecretWord] # TODO upstream, using Limbs[N2] breaks semcheck # ############################################################ # diff --git a/constantine/config/type_bigint.nim b/constantine/config/type_bigint.nim index 340cfff..4579545 100644 --- a/constantine/config/type_bigint.nim +++ b/constantine/config/type_bigint.nim @@ -31,8 +31,6 @@ type debug: import std/strutils - type Limbs[N: static int] = array[N, SecretWord] - func toString*(a: Limbs): string = result = "[" result.add " 0x" & toHex(BaseType(a[0])) diff --git a/constantine/elliptic/ec_endomorphism_accel.nim b/constantine/elliptic/ec_endomorphism_accel.nim index eece904..bcaae31 100644 --- a/constantine/elliptic/ec_endomorphism_accel.nim +++ b/constantine/elliptic/ec_endomorphism_accel.nim @@ -10,7 +10,6 @@ import # Standard Library std/typetraits, # Internal - ../../helpers/static_for, ../primitives, ../config/[common, curves, type_bigint], ../arithmetic, diff --git a/constantine/elliptic/ec_endomorphism_params.nim b/constantine/elliptic/ec_endomorphism_params.nim index 1f00a75..8f430b3 100644 --- a/constantine/elliptic/ec_endomorphism_params.nim +++ b/constantine/elliptic/ec_endomorphism_params.nim @@ -8,7 +8,6 @@ import # Internal - ../../helpers/static_for, ../primitives, ../config/[common, curves, type_bigint], ../arithmetic, diff --git a/constantine/elliptic/ec_weierstrass_projective.nim b/constantine/elliptic/ec_weierstrass_projective.nim index 7c27fcf..3bdcd51 100644 --- a/constantine/elliptic/ec_weierstrass_projective.nim +++ b/constantine/elliptic/ec_weierstrass_projective.nim @@ -190,7 +190,7 @@ func sum*[F]( r.y.sum(Q.x, Q.z) # 15. Y3 <- X2 + Z2 r.x *= r.y # 16. X3 <- X3 Y3 X3 = (X1 Z1)(X2 Z2) r.y.sum(t0, t2) # 17. Y3 <- t0 + t2 Y3 = X1 X2 + Z1 Z2 - r.y.diff(r.x, r.y) # 18. Y3 <- X3 - Y3 Y3 = (X1 + Z1)(X2 + Z2) - (X1 X2 + Z1 Z2) = X1 Z2 + X2 Z1 + r.y.diffAlias(r.x, r.y) # 18. 
Y3 <- X3 - Y3 Y3 = (X1 + Z1)(X2 + Z2) - (X1 X2 + Z1 Z2) = X1 Z2 + X2 Z1 when F is Fp2 and F.C.getSexticTwist() == D_Twist: t0 *= SexticNonResidue t1 *= SexticNonResidue @@ -206,7 +206,7 @@ func sum*[F]( r.y *= SexticNonResidue r.x.prod(t4, r.y) # 25. X3 <- t4 Y3 X3 = 3b(Y1 Z2 + Y2 Z1)(X1 Z2 + X2 Z1) t2.prod(t3, t1) # 26. t2 <- t3 t1 t2 = (X1 Y2 + X2 Y1) (Y1 Y2 - 3b Z1 Z2) - r.x.diff(t2, r.x) # 27. X3 <- t2 - X3 X3 = (X1 Y2 + X2 Y1) (Y1 Y2 - 3b Z1 Z2) - 3b(Y1 Z2 + Y2 Z1)(X1 Z2 + X2 Z1) + r.x.diffAlias(t2, r.x) # 27. X3 <- t2 - X3 X3 = (X1 Y2 + X2 Y1) (Y1 Y2 - 3b Z1 Z2) - 3b(Y1 Z2 + Y2 Z1)(X1 Z2 + X2 Z1) r.y *= t0 # 28. Y3 <- Y3 t0 Y3 = 9b X1 X2 (X1 Z2 + X2 Z1) t1 *= r.z # 29. t1 <- t1 Z3 t1 = (Y1 Y2 - 3b Z1 Z2)(Y1 Y2 + 3b Z1 Z2) r.y += t1 # 30. Y3 <- t1 + Y3 Y3 = (Y1 Y2 + 3b Z1 Z2)(Y1 Y2 - 3b Z1 Z2) + 9b X1 X2 (X1 Z2 + X2 Z1) diff --git a/constantine/primitives.nim b/constantine/primitives.nim index 0db7d07..7bea2a9 100644 --- a/constantine/primitives.nim +++ b/constantine/primitives.nim @@ -12,7 +12,8 @@ import primitives/multiplexers, primitives/addcarry_subborrow, primitives/extended_precision, - primitives/bithacks + primitives/bithacks, + primitives/static_for export constant_time_types, @@ -20,7 +21,8 @@ export multiplexers, addcarry_subborrow, extended_precision, - bithacks + bithacks, + staticFor when X86 and GCC_Compatible: import primitives/[cpuinfo_x86, macro_assembler_x86] diff --git a/constantine/primitives/macro_assembler_x86.nim b/constantine/primitives/macro_assembler_x86.nim index f9630f7..96faf1c 100644 --- a/constantine/primitives/macro_assembler_x86.nim +++ b/constantine/primitives/macro_assembler_x86.nim @@ -6,7 +6,7 @@ # * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0). # at your option. This file may not be copied, modified, or distributed except according to those terms. 
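# ------------------------------------------------------------------------------
# For illustration only: the r.y.diffAlias(r.x, r.y) calls introduced above in
# ec_weierstrass_projective.nim exist because the regular `diff` does not
# promise to handle the destination aliasing the subtrahend `b`. A minimal,
# hedged sketch of an aliasing-tolerant subtraction simply stages the result in
# a temporary; `diffAlias_sketch` is a hypothetical name, and the extension-field
# diffAlias added further down in tower_common.nim instead delegates to a
# per-coordinate diffAlias.
func diffAlias_sketch[F](r: var F, a, b: F) =
  var t {.noInit.}: F
  t.diff(a, b)   # `t` cannot alias `a` or `b`, so the plain diff is safe here
  r = t
# ------------------------------------------------------------------------------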
-import std/[macros, strutils, sets, hashes] +import std/[macros, strutils, sets, hashes, algorithm] # A compile-time inline assembler @@ -33,6 +33,9 @@ type RAX = "a" + # Flags + CarryFlag = "@ccc" + Register* = enum rbx, rdx, r8, rax @@ -98,6 +101,12 @@ func hash(od: OperandDesc): Hash = func len*(opArray: OperandArray): int = opArray.buf.len +func len*(opArray: Operand): int = + opArray.buf.len + +func rotateLeft*(opArray: var OperandArray) = + opArray.buf.rotateLeft(1) + proc `[]`*(opArray: OperandArray, index: int): Operand = opArray.buf[index] @@ -181,6 +190,25 @@ func asArrayAddr*(op: Operand, len: int): Operand = # Code generation # ------------------------------------------------------------------------------------------------------------ +func setToCarryFlag*(a: var Assembler_x86, carry: NimNode) = + + # We need to dereference the hidden pointer of var param + let isHiddenDeref = carry.kind == nnkHiddenDeref + let nimSymbol = if isHiddenDeref: carry[0] + else: carry + {.noSideEffect.}: + let symStr = $nimSymbol + + let desc = OperandDesc( + asmId: "", + nimSymbol: ident(symStr), + rm: CarryFlag, + constraint: Output_Overwrite, + cEmit: symStr + ) + + a.operands.incl(desc) + func generate*(a: Assembler_x86): NimNode = ## Generate the inline assembly code from ## the desired instruction @@ -527,7 +555,7 @@ func cmovc*(a: var Assembler_x86, dst, src: Operand) = func cmovnc*(a: var Assembler_x86, dst, src: Operand) = ## Does: dst <- src if the carry flag is not set doAssert dst.desc.rm in {Reg, ElemsInReg}, "The destination operand must be a register: " & $dst.repr - doAssert dst.desc.constraint in {Output_EarlyClobber, InputOutput, Output_Overwrite}, $dst.repr + doAssert dst.desc.constraint in OutputReg, $dst.repr a.codeFragment("cmovnc", src, dst) # No clobber @@ -566,7 +594,7 @@ func mul*(a: var Assembler_x86, dHi, dLo: Register, src0: Operand, src1: Registe func imul*(a: var Assembler_x86, dst, src: Operand) = ## Does dst <- dst * src, keeping only the low half - doAssert dst.desc.rm in {Reg, ElemsInReg}, "The destination operand must be a register: " & $dst.repr + doAssert dst.desc.rm in {Reg, ElemsInReg}+SpecificRegisters, "The destination operand must be a register: " & $dst.repr doAssert dst.desc.constraint in OutputReg, $dst.repr a.codeFragment("imul", src, dst) @@ -574,9 +602,9 @@ func imul*(a: var Assembler_x86, dst, src: Operand) = func mulx*(a: var Assembler_x86, dHi, dLo, src0: Operand, src1: Register) = ## Does (dHi, dLo) <- src0 * src1 doAssert src1 == rdx, "MULX requires the RDX register" - doAssert dHi.desc.rm in {Reg, ElemsInReg} or dHi.desc.rm in SpecificRegisters, + doAssert dHi.desc.rm in {Reg, ElemsInReg}+SpecificRegisters, "The destination operand must be a register " & $dHi.repr - doAssert dLo.desc.rm in {Reg, ElemsInReg} or dLo.desc.rm in SpecificRegisters, + doAssert dLo.desc.rm in {Reg, ElemsInReg}+SpecificRegisters, "The destination operand must be a register " & $dLo.repr doAssert dHi.desc.constraint in OutputReg doAssert dLo.desc.constraint in OutputReg @@ -595,7 +623,7 @@ func adcx*(a: var Assembler_x86, dst, src: Operand) = ## Does: dst <- dst + src + carry ## and only sets the carry flag doAssert dst.desc.constraint in OutputReg, $dst.repr - doAssert dst.desc.rm in {Reg, ElemsInReg}, "The destination operand must be a register: " & $dst.repr + doAssert dst.desc.rm in {Reg, ElemsInReg}+SpecificRegisters, "The destination operand must be a register: " & $dst.repr a.codeFragment("adcx", src, dst) a.areFlagsClobbered = true @@ -603,7 +631,7 @@ func 
adox*(a: var Assembler_x86, dst, src: Operand) = ## Does: dst <- dst + src + overflow ## and only sets the overflow flag doAssert dst.desc.constraint in OutputReg, $dst.repr - doAssert dst.desc.rm in {Reg, ElemsInReg}, "The destination operand must be a register: " & $dst.repr + doAssert dst.desc.rm in {Reg, ElemsInReg}+SpecificRegisters, "The destination operand must be a register: " & $dst.repr a.codeFragment("adox", src, dst) a.areFlagsClobbered = true diff --git a/constantine/primitives/static_for.nim b/constantine/primitives/static_for.nim new file mode 100644 index 0000000..af531ee --- /dev/null +++ b/constantine/primitives/static_for.nim @@ -0,0 +1,36 @@ +# Constantine +# Copyright (c) 2018-2019 Status Research & Development GmbH +# Copyright (c) 2020-Present Mamy André-Ratsimbazafy +# Licensed and distributed under either of +# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT). +# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0). +# at your option. This file may not be copied, modified, or distributed except according to those terms. + +import std/macros + +proc replaceNodes(ast: NimNode, what: NimNode, by: NimNode): NimNode = + # Replace "what" ident node by "by" + proc inspect(node: NimNode): NimNode = + case node.kind: + of {nnkIdent, nnkSym}: + if node.eqIdent(what): + return by + return node + of nnkEmpty: + return node + of nnkLiterals: + return node + else: + var rTree = node.kind.newTree() + for child in node: + rTree.add inspect(child) + return rTree + result = inspect(ast) + +macro staticFor*(idx: untyped{nkIdent}, start, stopEx: static int, body: untyped): untyped = + result = newStmtList() + for i in start ..< stopEx: + result.add nnkBlockStmt.newTree( + ident("unrolledIter_" & $idx & $i), + body.replaceNodes(idx, newLit i) + ) diff --git a/constantine/tower_field_extensions/quadratic_extensions.nim b/constantine/tower_field_extensions/quadratic_extensions.nim index 48b5892..48767c7 100644 --- a/constantine/tower_field_extensions/quadratic_extensions.nim +++ b/constantine/tower_field_extensions/quadratic_extensions.nim @@ -84,17 +84,59 @@ func prod_complex(r: var QuadraticExt, a, b: QuadraticExt) = mixin fromComplexExtension static: doAssert r.fromComplexExtension() - var a0b0 {.noInit.}, a1b1 {.noInit.}: typeof(r.c0) - a0b0.prod(a.c0, b.c0) # [1 Mul] - a1b1.prod(a.c1, b.c1) # [2 Mul] + # TODO: GCC is adding an unexplainable 30 cycles tax to this function (~10% slow down) + # for seemingly no reason - r.c0.sum(a.c0, a.c1) # r0 = (a0 + a1) # [2 Mul, 1 Add] - r.c1.sum(b.c0, b.c1) # r1 = (b0 + b1) # [2 Mul, 2 Add] - r.c1 *= r.c0 # r1 = (b0 + b1)(a0 + a1) # [3 Mul, 2 Add] - 𝔽p temporary + when true: # Single-width implementation + # Clang 330 cycles on i9-9980XE @4.1 GHz + var a0b0 {.noInit.}, a1b1 {.noInit.}: typeof(r.c0) + a0b0.prod(a.c0, b.c0) # [1 Mul] + a1b1.prod(a.c1, b.c1) # [2 Mul] - r.c0.diff(a0b0, a1b1) # r0 = a0 b0 - a1 b1 # [3 Mul, 2 Add, 1 Sub] - r.c1 -= a0b0 # r1 = (b0 + b1)(a0 + a1) - a0b0 # [3 Mul, 2 Add, 2 Sub] - r.c1 -= a1b1 # r1 = (b0 + b1)(a0 + a1) - a0b0 - a1b1 # [3 Mul, 2 Add, 3 Sub] + r.c0.sum(a.c0, a.c1) # r0 = (a0 + a1) # [2 Mul, 1 Add] + r.c1.sum(b.c0, b.c1) # r1 = (b0 + b1) # [2 Mul, 2 Add] + r.c1 *= r.c0 # r1 = (b0 + b1)(a0 + a1) # [3 Mul, 2 Add] - 𝔽p temporary + + r.c0.diff(a0b0, a1b1) # r0 = a0 b0 - a1 b1 # [3 Mul, 2 Add, 1 Sub] + r.c1 -= a0b0 # r1 = (b0 + b1)(a0 + a1) - a0b0 # [3 Mul, 2 Add, 2 Sub] + r.c1 -= a1b1 # r1 = (b0 + b1)(a0 + a1) - 
a0b0 - a1b1 # [3 Mul, 2 Add, 3 Sub] + + else: # Double-width implementation with lazy reduction + # Deactivated for now Clang 360 cycles on i9-9980XE @4.1 GHz + var a0b0 {.noInit.}, a1b1 {.noInit.}: doubleWidth(typeof(r.c0)) + var d {.noInit.}: doubleWidth(typeof(r.c0)) + const msbSet = r.c0.typeof.C.canUseNoCarryMontyMul() + + a0b0.mulNoReduce(a.c0, b.c0) # 44 cycles - cumul 44 + a1b1.mulNoReduce(a.c1, b.c1) # 44 cycles - cumul 88 + when msbSet: + r.c0.sum(a.c0, a.c1) + r.c1.sum(b.c0, b.c1) + else: + r.c0.sumNoReduce(a.c0, a.c1) # 5 cycles - cumul 93 + r.c1.sumNoReduce(b.c0, b.c1) # 5 cycles - cumul 98 + d.mulNoReduce(r.c0, r.c1) # 44 cycles - cumul 142 + when msbSet: + d -= a0b0 + d -= a1b1 + else: + d.diffNoReduce(d, a0b0) # 10 cycles - cumul 152 + d.diffNoReduce(d, a1b1) # 10 cycles - cumul 162 + a0b0.diff(a0b0, a1b1) # 18 cycles - cumul 170 + r.c0.reduce(a0b0) # 68 cycles - cumul 248 + r.c1.reduce(d) # 68 cycles - cumul 316 + + # Single-width [3 Mul, 2 Add, 3 Sub] + # 3*81 + 2*14 + 3*12 = 307 theoretical cycles + # 330 measured + # Double-Width + # 316 theoretical cycles + # 365 measured + # Reductions can be 2x10 faster using MCL algorithm + # but there are still unexplained 50 cycles diff between theo and measured + # and unexplained 30 cycles between Clang and GCC + # - Function calls? + # - push/pop stack? # Commutative ring implementation for generic quadratic extension fields # ------------------------------------------------------------------- diff --git a/constantine/tower_field_extensions/tower_common.nim b/constantine/tower_field_extensions/tower_common.nim index 88716cd..c05f01d 100644 --- a/constantine/tower_field_extensions/tower_common.nim +++ b/constantine/tower_field_extensions/tower_common.nim @@ -154,6 +154,19 @@ func diff*(r: var CubicExt, a, b: CubicExt) = r.c1.diff(a.c1, b.c1) r.c2.diff(a.c2, b.c2) +func diffAlias*(r: var QuadraticExt, a, b: QuadraticExt) = + ## Diff ``a`` and ``b`` into ``r`` + ## Handles r and b aliasing + r.c0.diffAlias(a.c0, b.c0) + r.c1.diffAlias(a.c1, b.c1) + +func diffAlias*(r: var CubicExt, a, b: CubicExt) = + ## Diff ``a`` and ``b`` into ``r`` + ## Handles r and b aliasing + r.c0.diffAlias(a.c0, b.c0) + r.c1.diffAlias(a.c1, b.c1) + r.c2.diffAlias(a.c2, b.c2) + # Multiplication by a small integer known at compile-time # ------------------------------------------------------------------- diff --git a/tests/t_bigints.nim b/tests/t_bigints.nim index 7136650..be43128 100644 --- a/tests/t_bigints.nim +++ b/tests/t_bigints.nim @@ -145,6 +145,7 @@ proc mainArith() = discard a.add(SecretWord 1) check: bool(a == expected) +proc mainMul() = suite "Multi-precision multiplication" & " [" & $WordBitwidth & "-bit mode]": test "Same size operand into double size result": block: @@ -185,6 +186,7 @@ proc mainArith() = r.prod(b, a) check: bool(r == expected) +proc mainMulHigh() = suite "Multi-precision multiplication keeping only high words" & " [" & $WordBitwidth & "-bit mode]": test "Same size operand into double size result - discard first word": block: @@ -270,6 +272,7 @@ proc mainArith() = r.prod_high_words(b, a, 2) check: bool(r == expected) +proc mainModular() = suite "Modular operations - small modulus" & " [" & $WordBitwidth & "-bit mode]": # Vectors taken from Stint - https://github.com/status-im/nim-stint test "100 mod 13": @@ -619,6 +622,9 @@ proc mainModularInverse() = check: bool(r == expected) mainArith() +mainMul() +mainMulHigh() +mainModular() mainNeg() mainCopySwap() mainModularInverse() diff --git a/tests/t_finite_fields_double_width.nim 
b/tests/t_finite_fields_double_width.nim new file mode 100644 index 0000000..da9951f --- /dev/null +++ b/tests/t_finite_fields_double_width.nim @@ -0,0 +1,97 @@ +# Constantine +# Copyright (c) 2018-2019 Status Research & Development GmbH +# Copyright (c) 2020-Present Mamy André-Ratsimbazafy +# Licensed and distributed under either of +# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT). +# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0). +# at your option. This file may not be copied, modified, or distributed except according to those terms. + +import + # Standard library + std/[unittest, times], + # Internal + ../constantine/arithmetic, + ../constantine/io/[io_bigints, io_fields], + ../constantine/config/[curves, common, type_bigint], + # Test utilities + ../helpers/prng_unsafe + +const Iters = 128 + +var rng: RngState +let seed = uint32(getTime().toUnix() and (1'i64 shl 32 - 1)) # unixTime mod 2^32 +rng.seed(seed) +echo "\n------------------------------------------------------\n" +echo "test_finite_fields_double_width xoshiro512** seed: ", seed + +proc randomCurve(C: static Curve) = + let a = rng.random_unsafe(Fp[C]) + let b = rng.random_unsafe(Fp[C]) + + var r_fp, r_fpDbl: Fp[C] + var tmpDbl: FpDbl[C] + + r_fp.prod(a, b) + tmpDbl.mulNoReduce(a, b) + r_fpDbl.reduce(tmpDbl) + + doAssert bool(r_fp == r_fpDbl) + +proc randomHighHammingWeight(C: static Curve) = + let a = rng.random_highHammingWeight(Fp[C]) + let b = rng.random_highHammingWeight(Fp[C]) + + var r_fp, r_fpDbl: Fp[C] + var tmpDbl: FpDbl[C] + + r_fp.prod(a, b) + tmpDbl.mulNoReduce(a, b) + r_fpDbl.reduce(tmpDbl) + + doAssert bool(r_fp == r_fpDbl) + +proc random_long01Seq(C: static Curve) = + let a = rng.random_long01Seq(Fp[C]) + let b = rng.random_long01Seq(Fp[C]) + + var r_fp, r_fpDbl: Fp[C] + var tmpDbl: FpDbl[C] + + r_fp.prod(a, b) + tmpDbl.mulNoReduce(a, b) + r_fpDbl.reduce(tmpDbl) + + doAssert bool(r_fp == r_fpDbl) + +suite "Field Multiplication via double-width field elements is consistent with single-width." & " [" & $WordBitwidth & "-bit mode]": + test "With P-224 field modulus": + for _ in 0 ..< Iters: + randomCurve(P224) + for _ in 0 ..< Iters: + randomHighHammingWeight(P224) + for _ in 0 ..< Iters: + random_long01Seq(P224) + + test "With P-256 field modulus": + for _ in 0 ..< Iters: + randomCurve(P256) + for _ in 0 ..< Iters: + randomHighHammingWeight(P256) + for _ in 0 ..< Iters: + random_long01Seq(P256) + + test "With BN254_Snarks field modulus": + for _ in 0 ..< Iters: + randomCurve(BN254_Snarks) + for _ in 0 ..< Iters: + randomHighHammingWeight(BN254_Snarks) + for _ in 0 ..< Iters: + random_long01Seq(BN254_Snarks) + + test "With BLS12_381 field modulus": + for _ in 0 ..< Iters: + randomCurve(BLS12_381) + for _ in 0 ..< Iters: + randomHighHammingWeight(BLS12_381) + for _ in 0 ..< Iters: + random_long01Seq(BLS12_381) diff --git a/tests/t_precomputed.nim b/tests/t_precomputed.nim index 01c4591..0518937 100644 --- a/tests/t_precomputed.nim +++ b/tests/t_precomputed.nim @@ -25,7 +25,7 @@ proc checkCubeRootOfUnity(curve: static Curve) = test $curve & " cube root of unity (mod r)": var cru: BigInt[3 * curve.getCurveOrderBitwidth()] cru.prod(curve.getCubicRootOfUnity_mod_r(), curve.getCubicRootOfUnity_mod_r()) - cru.prod(cru, curve.getCubicRootOfUnity_mod_r()) + cru.mul(curve.getCubicRootOfUnity_mod_r()) var r: BigInt[curve.getCurveOrderBitwidth()] r.reduce(cru, curve.getCurveOrder)
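# ------------------------------------------------------------------------------
# For illustration only: the property the new t_finite_fields_double_width.nim
# suite above exercises, factored into a single helper. `FpDbl`, `mulNoReduce`
# and `reduce` are the double-width APIs used by the test;
# `checkDblMulConsistency` is a hypothetical name for this sketch only.
proc checkDblMulConsistency[C: static Curve](a, b: Fp[C]): bool =
  var single  {.noInit.}: Fp[C]
  var dbl     {.noInit.}: FpDbl[C]
  var reduced {.noInit.}: Fp[C]
  single.prod(a, b)        # single-width multiplication, result already reduced
  dbl.mulNoReduce(a, b)    # double-width product, reduction deferred
  reduced.reduce(dbl)      # deferred reduction back to a single-width element
  result = bool(single == reduced)

# Usage sketch (with an RNG seeded as in the test file):
#   doAssert checkDblMulConsistency(rng.random_unsafe(Fp[BLS12_381]),
#                                   rng.random_unsafe(Fp[BLS12_381]))
# ------------------------------------------------------------------------------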