diff --git a/.travis.yml b/.travis.yml index aa55f63..5612fe8 100644 --- a/.travis.yml +++ b/.travis.yml @@ -99,6 +99,9 @@ script: - nimble refresh - nimble install gmp stew - nimble test_parallel + - if [[ "$ARCH" != "arm64" ]]; then + nimble test_parallel_no_assembler; + fi branches: except: - gh-pages diff --git a/README.md b/README.md index a6de283..cf5909a 100644 --- a/README.md +++ b/README.md @@ -19,12 +19,18 @@ You can install the developement version of the library through nimble with the nimble install https://github.com/mratsim/constantine@#master ``` -For speed it is recommended to prefer Clang, MSVC or ICC over GCC. -GCC does not properly optimize add-with-carry and sub-with-borrow loops (see [Compiler-caveats](#Compiler-caveats)). +For speed it is recommended to prefer Clang, MSVC or ICC over GCC (see [Compiler-caveats](#Compiler-caveats)). Further if using GCC, GCC 7 at minimum is required, previous versions generated incorrect add-with-carry code. +On x86-64, inline assembly is used to work around compilers having issues optimizing large integer arithmetic, +and also to ensure constant-time code. +This can be deactivated with `"-d:ConstantineASM=false"`: +- at a significant performance cost with GCC (~50% slower than Clang). +- at a missed opportunity on recent CPUs that support MULX/ADCX/ADOX instructions (~60% faster than Clang). +- There is a 2.4x perf ratio between using plain GCC vs GCC with inline assembly. + ## Target audience The library aims to be a portable, compact and hardened library for elliptic curve cryptography needs, in particular for blockchain protocols and zero-knowledge proofs system. @@ -39,10 +45,13 @@ in this order ## Curves supported At the moment the following curves are supported, adding a new curve only requires adding the prime modulus -and its bitsize in [constantine/config/curves.nim](constantine/config/curves.nim). +and its bitsize in [constantine/config/curves_declaration.nim](constantine/config/curves_declaration.nim). The following curves are configured: +> Note: At the moment, finite field arithmetic is fully supported +> but elliptic curve arithmetic is work-in-progress. + ### ECDH / ECDSA curves - NIST P-224 @@ -58,7 +67,8 @@ Families: - FKM: Fotiadis-Konstantinou-Martindale Curves: -- BN254 (Zero-Knowledge Proofs, Snarks, Starks, Zcash, Ethereum 1) +- BN254_Nogami +- BN254_Snarks (Zero-Knowledge Proofs, Snarks, Starks, Zcash, Ethereum 1) - BLS12-377 (Zexe) - BLS12-381 (Algorand, Chia Networks, Dfinity, Ethereum 2, Filecoin, Zcash Sapling) - BN446 @@ -137,8 +147,13 @@ To measure the performance of Constantine ```bash git clone https://github.com/mratsim/constantine -nimble bench_fp_clang -nimble bench_fp2_clang +nimble bench_fp # Using Assembly (+ GCC) +nimble bench_fp_clang # Using Clang only +nimble bench_fp_gcc # Using GCC only (very slow) +nimble bench_fp2 +# ... +nimble bench_ec_g1 +nimble bench_ec_g2 ``` As mentioned in the [Compiler caveats](#compiler-caveats) section, GCC is up to 2x slower than Clang due to mishandling of carries and register usage. @@ -146,33 +161,51 @@ As mentioned in the [Compiler caveats](#compiler-caveats) section, GCC is up to On my machine, for selected benchmarks on the prime field for popular pairing-friendly curves. ``` -⚠️ Measurements are approximate and use the CPU nominal clock: Turbo-Boost and overclocking will skew them.
-========================================================================================================== +Compiled with GCC +Optimization level => + no optimization: false + release: true + danger: true + inline assembly: true +Using Constantine with 64-bit limbs +Running on Intel(R) Core(TM) i9-9980XE CPU @ 3.00GHz -All benchmarks are using constant-time implementations to protect against side-channel attacks. +⚠️ Cycles measurements are approximate and use the CPU nominal clock: Turbo-Boost and overclocking will skew them. +i.e. a 20% overclock will be about 20% off (assuming no dynamic frequency scaling) -Compiled with Clang -Running on Intel(R) Core(TM) i9-9980XE CPU @ 3.00GHz (overclocked all-core Turbo @4.1GHz) - --------------------------------------------------------------------------------- -Addition Fp[BN254] 0 ns 0 cycles -Substraction Fp[BN254] 0 ns 0 cycles -Negation Fp[BN254] 0 ns 0 cycles -Multiplication Fp[BN254] 21 ns 65 cycles -Squaring Fp[BN254] 18 ns 55 cycles -Inversion Fp[BN254] 6266 ns 18799 cycles --------------------------------------------------------------------------------- -Addition Fp[BLS12_381] 0 ns 0 cycles -Substraction Fp[BLS12_381] 0 ns 0 cycles -Negation Fp[BLS12_381] 0 ns 0 cycles -Multiplication Fp[BLS12_381] 45 ns 136 cycles -Squaring Fp[BLS12_381] 39 ns 118 cycles -Inversion Fp[BLS12_381] 15683 ns 47050 cycles --------------------------------------------------------------------------------- +================================================================================================================= +------------------------------------------------------------------------------------------------------------------------------------------------- +Addition Fp[BN254_Snarks] 333333333.333 ops/s 3 ns/op 9 CPU cycles (approx) +Substraction Fp[BN254_Snarks] 500000000.000 ops/s 2 ns/op 8 CPU cycles (approx) +Negation Fp[BN254_Snarks] 1000000000.000 ops/s 1 ns/op 3 CPU cycles (approx) +Multiplication Fp[BN254_Snarks] 71428571.429 ops/s 14 ns/op 44 CPU cycles (approx) +Squaring Fp[BN254_Snarks] 71428571.429 ops/s 14 ns/op 44 CPU cycles (approx) +Inversion (constant-time Euclid) Fp[BN254_Snarks] 122579.063 ops/s 8158 ns/op 24474 CPU cycles (approx) +Inversion via exponentiation p-2 (Little Fermat) Fp[BN254_Snarks] 153822.489 ops/s 6501 ns/op 19504 CPU cycles (approx) +Square Root + square check (constant-time) Fp[BN254_Snarks] 153491.942 ops/s 6515 ns/op 19545 CPU cycles (approx) +Exp curve order (constant-time) - 254-bit Fp[BN254_Snarks] 104580.632 ops/s 9562 ns/op 28687 CPU cycles (approx) +Exp curve order (Leak exponent bits) - 254-bit Fp[BN254_Snarks] 153798.831 ops/s 6502 ns/op 19506 CPU cycles (approx) +------------------------------------------------------------------------------------------------------------------------------------------------- +Addition Fp[BLS12_381] 250000000.000 ops/s 4 ns/op 14 CPU cycles (approx) +Substraction Fp[BLS12_381] 250000000.000 ops/s 4 ns/op 13 CPU cycles (approx) +Negation Fp[BLS12_381] 1000000000.000 ops/s 1 ns/op 4 CPU cycles (approx) +Multiplication Fp[BLS12_381] 35714285.714 ops/s 28 ns/op 84 CPU cycles (approx) +Squaring Fp[BLS12_381] 35714285.714 ops/s 28 ns/op 85 CPU cycles (approx) +Inversion (constant-time Euclid) Fp[BLS12_381] 43763.676 ops/s 22850 ns/op 68552 CPU cycles (approx) +Inversion via exponentiation p-2 (Little Fermat) Fp[BLS12_381] 63983.620 ops/s 15629 ns/op 46889 CPU cycles (approx) +Square Root + square check (constant-time) Fp[BLS12_381] 63856.960 ops/s 15660 ns/op 46982 CPU cycles (approx) 
+Exp curve order (constant-time) - 255-bit Fp[BLS12_381] 68535.399 ops/s 14591 ns/op 43775 CPU cycles (approx) +Exp curve order (Leak exponent bits) - 255-bit Fp[BLS12_381] 93222.709 ops/s 10727 ns/op 32181 CPU cycles (approx) +------------------------------------------------------------------------------------------------------------------------------------------------- Notes: - GCC is significantly slower than Clang on multiprecision arithmetic. - The simplest operations might be optimized away by the compiler. + - Compilers: + Compilers are severely limited on multiprecision arithmetic. + Inline Assembly is used by default (nimble bench_fp). + Bench without assembly can use "nimble bench_fp_gcc" or "nimble bench_fp_clang". + GCC is significantly slower than Clang on multiprecision arithmetic due to catastrophic handling of carries. + - The simplest operations might be optimized away by the compiler. + - Fast Squaring and Fast Multiplication are possible if there are spare bits in the prime representation (i.e. the prime uses 254 bits out of 256 bits) ``` ### Compiler caveats @@ -234,25 +267,15 @@ add256: retq ``` +As a workaround, key procedures use inline assembly. + ### Inline assembly -Constantine uses inline assembly for a very restricted use-case: "conditional mov", -and a temporary use-case "hardware 128-bit division" that will be replaced ASAP (as hardware division is not constant-time). +While using intrinsics significantly improves code readability, portability, auditability and maintainability, +Constantine uses inline assembly on x86-64 to ensure performance portability despite poor optimization (notably with GCC) +and also to use dedicated large integer instructions MULX, ADCX, ADOX that compilers cannot generate. -Using intrinsics otherwise significantly improve code readability, portability, auditability and maintainability. - -#### Future optimizations - -In the future more inline assembly primitives might be added provided the performance benefit outvalues the significant complexity. -In particular, multiprecision multiplication and squaring on x86 can use the instructions MULX, ADCX and ADOX -to multiply-accumulate on 2 carry chains in parallel (with instruction-level parallelism) -and improve performance by 15~20% over an uint128-based implementation. -As no compiler is able to generate such code even when using the `_mulx_u64` and `_addcarryx_u64` intrinsics, -either the assembly for each supported bigint size must be hardcoded -or a "compiler" must be implemented in macros that will generate the required inline assembly at compile-time. - -Such a compiler can also be used to overcome GCC codegen deficiencies, here is an example for add-with-carry: -https://github.com/mratsim/finite-fields/blob/d7f6d8bb/macro_add_carry.nim +The speed improvement on finite field arithmetic is up to 60% with MULX, ADCX, ADOX on BLS12-381 (6 limbs). ## Sizes: code size, stack usage @@ -286,3 +309,7 @@ or * Apache License, Version 2.0, ([LICENSE-APACHEv2](LICENSE-APACHEv2) or http://www.apache.org/licenses/LICENSE-2.0) at your option. This file may not be copied, modified, or distributed except according to those terms. + +This library has **no external dependencies**. +In particular GMP is used only for testing and differential fuzzing +and is not linked in the library.
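The README changes above state that finite field arithmetic is fully supported and that the x86-64 assembly backend can be toggled with `-d:ConstantineASM=false`. As a point of reference, here is a minimal, hypothetical usage sketch of the `Fp` API touched by this PR; it relies only on declarations visible in the diff (`Fp`, `setOne`, `sum`, `*=`, `square`, `==`), and the import paths are assumed from the benchmark files shown later in this diff:

```nim
# Hypothetical sketch (not part of this PR), exercising the Fp[BLS12_381] API.
# Import paths are assumed from the benchmark imports below.
import
  constantine/config/curves,
  constantine/arithmetic

var a, b: Fp[BLS12_381]
a.setOne()      # a = 1, stored internally in Montgomery form
b.sum(a, a)     # b = a + a  (mod p)
b *= b          # b = b * b  (mod p)
b.square()      # b = b^2    (mod p)
discard a == b  # constant-time comparison, returns a SecretBool
```

Building with `-d:ConstantineASM=false` runs the same code through the pure-Nim fallbacks instead of the inline-assembly kernels added by this PR.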
diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 38ed9d4..8817707 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -186,12 +186,19 @@ steps: echo "PATH=${PATH}" export ucpu=${UCPU} nimble test_parallel - displayName: 'Testing the package (including GMP)' + displayName: 'Testing Constantine with Assembler and with GMP' + condition: ne(variables['Agent.OS'], 'Windows_NT') + + - bash: | + echo "PATH=${PATH}" + export ucpu=${UCPU} + nimble test_parallel_no_assembler + displayName: 'Testing Constantine without Assembler and with GMP' condition: ne(variables['Agent.OS'], 'Windows_NT') - bash: | echo "PATH=${PATH}" export ucpu=${UCPU} nimble test_no_gmp - displayName: 'Testing the package (without GMP)' + displayName: 'Testing the package (without Assembler or GMP)' condition: eq(variables['Agent.OS'], 'Windows_NT') diff --git a/benchmarks/bench_ec_g1.nim b/benchmarks/bench_ec_g1.nim index 6389353..1bf2ddf 100644 --- a/benchmarks/bench_ec_g1.nim +++ b/benchmarks/bench_ec_g1.nim @@ -64,8 +64,4 @@ proc main() = separator() main() - -echo "\nNotes:" -echo " - GCC is significantly slower than Clang on multiprecision arithmetic." -echo " - The simplest operations might be optimized away by the compiler." -echo " - Fast Squaring and Fast Multiplication are possible if there are spare bits in the prime representation (i.e. the prime uses 254 bits out of 256 bits)" +notes() diff --git a/benchmarks/bench_ec_g2.nim b/benchmarks/bench_ec_g2.nim index 6fb16f2..7207fa9 100644 --- a/benchmarks/bench_ec_g2.nim +++ b/benchmarks/bench_ec_g2.nim @@ -65,8 +65,4 @@ proc main() = separator() main() - -echo "\nNotes:" -echo " - GCC is significantly slower than Clang on multiprecision arithmetic." -echo " - The simplest operations might be optimized away by the compiler." -echo " - Fast Squaring and Fast Multiplication are possible if there are spare bits in the prime representation (i.e. the prime uses 254 bits out of 256 bits)" +notes() diff --git a/benchmarks/bench_elliptic_template.nim b/benchmarks/bench_elliptic_template.nim index 3728acf..fb746ef 100644 --- a/benchmarks/bench_elliptic_template.nim +++ b/benchmarks/bench_elliptic_template.nim @@ -14,7 +14,7 @@ import # Internals - ../constantine/config/curves, + ../constantine/config/[curves, common], ../constantine/arithmetic, ../constantine/io/io_bigints, ../constantine/elliptic/[ec_weierstrass_projective, ec_scalar_mul, ec_endomorphism_accel], @@ -57,7 +57,11 @@ elif defined(icc): else: echo "\nCompiled with an unknown compiler" -echo "Optimization level => no optimization: ", not defined(release), " | release: ", defined(release), " | danger: ", defined(danger) +echo "Optimization level => " +echo " no optimization: ", not defined(release) +echo " release: ", defined(release) +echo " danger: ", defined(danger) +echo " inline assembly: ", UseX86ASM when (sizeof(int) == 4) or defined(Constantine32): echo "⚠️ Warning: using Constantine with 32-bit limbs" @@ -84,6 +88,16 @@ proc report(op, elliptic: string, start, stop: MonoTime, startClk, stopClk: int6 else: echo &"{op:<60} {elliptic:<40} {throughput:>15.3f} ops/s {ns:>9} ns/op" +proc notes*() = + echo "Notes:" + echo " - Compilers:" + echo " Compilers are severely limited on multiprecision arithmetic." + echo " Inline Assembly is used by default (nimble bench_fp)." + echo " Bench without assembly can use \"nimble bench_fp_gcc\" or \"nimble bench_fp_clang\"." + echo " GCC is significantly slower than Clang on multiprecision arithmetic due to catastrophic handling of carries." 
+ echo " - The simplest operations might be optimized away by the compiler." + echo " - Fast Squaring and Fast Multiplication are possible if there are spare bits in the prime representation (i.e. the prime uses 254 bits out of 256 bits)" + macro fixEllipticDisplay(T: typedesc): untyped = # At compile-time, enums are integers and their display is buggy # we get the Curve ID instead of the curve name. diff --git a/benchmarks/bench_fields_template.nim b/benchmarks/bench_fields_template.nim index 9135c23..e17bc4a 100644 --- a/benchmarks/bench_fields_template.nim +++ b/benchmarks/bench_fields_template.nim @@ -14,7 +14,7 @@ import # Internals - ../constantine/config/curves, + ../constantine/config/[curves, common], ../constantine/arithmetic, ../constantine/towers, # Helpers @@ -54,7 +54,11 @@ elif defined(icc): else: echo "\nCompiled with an unknown compiler" -echo "Optimization level => no optimization: ", not defined(release), " | release: ", defined(release), " | danger: ", defined(danger) +echo "Optimization level => " +echo " no optimization: ", not defined(release) +echo " release: ", defined(release) +echo " danger: ", defined(danger) +echo " inline assembly: ", UseX86ASM when (sizeof(int) == 4) or defined(Constantine32): echo "⚠️ Warning: using Constantine with 32-bit limbs" @@ -81,6 +85,16 @@ proc report(op, field: string, start, stop: MonoTime, startClk, stopClk: int64, else: echo &"{op:<50} {field:<18} {throughput:>15.3f} ops/s {ns:>9} ns/op" +proc notes*() = + echo "Notes:" + echo " - Compilers:" + echo " Compilers are severely limited on multiprecision arithmetic." + echo " Inline Assembly is used by default (nimble bench_fp)." + echo " Bench without assembly can use \"nimble bench_fp_gcc\" or \"nimble bench_fp_clang\"." + echo " GCC is significantly slower than Clang on multiprecision arithmetic due to catastrophic handling of carries." + echo " - The simplest operations might be optimized away by the compiler." + echo " - Fast Squaring and Fast Multiplication are possible if there are spare bits in the prime representation (i.e. the prime uses 254 bits out of 256 bits)" + macro fixFieldDisplay(T: typedesc): untyped = # At compile-time, enums are integers and their display is buggy # we get the Curve ID instead of the curve name. diff --git a/benchmarks/bench_fp.nim b/benchmarks/bench_fp.nim index 5ec224d..ea510ee 100644 --- a/benchmarks/bench_fp.nim +++ b/benchmarks/bench_fp.nim @@ -59,8 +59,4 @@ proc main() = separator() main() - -echo "Notes:" -echo " - GCC is significantly slower than Clang on multiprecision arithmetic." -echo " - The simplest operations might be optimized away by the compiler." -echo " - Fast Squaring and Fast Multiplication are possible if there are spare bits in the prime representation (i.e. the prime uses 254 bits out of 256 bits)" +notes() diff --git a/benchmarks/bench_fp12.nim b/benchmarks/bench_fp12.nim index f97bb56..e09efe3 100644 --- a/benchmarks/bench_fp12.nim +++ b/benchmarks/bench_fp12.nim @@ -50,8 +50,4 @@ proc main() = separator() main() - -echo "Notes:" -echo " - GCC is significantly slower than Clang on multiprecision arithmetic." -echo " - Fast Squaring and Fast Multiplication are possible if there are spare bits in the prime representation (i.e. the prime uses 254 bits out of 256 bits)" -echo " - The tower of extension fields chosen can lead to a large difference of performance between primes of similar bitwidth." 
+notes() diff --git a/benchmarks/bench_fp2.nim b/benchmarks/bench_fp2.nim index db5579e..52d4b7c 100644 --- a/benchmarks/bench_fp2.nim +++ b/benchmarks/bench_fp2.nim @@ -51,8 +51,4 @@ proc main() = separator() main() - -echo "Notes:" -echo " - GCC is significantly slower than Clang on multiprecision arithmetic." -echo " - Fast Squaring and Fast Multiplication are possible if there are spare bits in the prime representation (i.e. the prime uses 254 bits out of 256 bits)" -echo " - The tower of extension fields chosen can lead to a large difference of performance between primes of similar bitwidth." +notes() diff --git a/benchmarks/bench_fp6.nim b/benchmarks/bench_fp6.nim index 06577de..717e1f6 100644 --- a/benchmarks/bench_fp6.nim +++ b/benchmarks/bench_fp6.nim @@ -50,8 +50,4 @@ proc main() = separator() main() - -echo "Notes:" -echo " - GCC is significantly slower than Clang on multiprecision arithmetic." -echo " - Fast Squaring and Fast Multiplication are possible if there are spare bits in the prime representation (i.e. the prime uses 254 bits out of 256 bits)" -echo " - The tower of extension fields chosen can lead to a large difference of performance between primes of similar bitwidth." +notes() diff --git a/constantine.nimble b/constantine.nimble index 5579c87..ae409e9 100644 --- a/constantine.nimble +++ b/constantine.nimble @@ -106,7 +106,7 @@ proc runBench(benchName: string, compiler = "") = var cc = "" if compiler != "": - cc = "--cc:" & compiler + cc = "--cc:" & compiler & " -d:ConstantineASM=false" exec "nim c " & cc & " -d:danger --verbosity:0 -o:build/" & benchName & "_" & compiler & " -r --hints:off --warnings:off benchmarks/" & benchName & ".nim" @@ -209,6 +209,45 @@ task test_parallel, "Run all tests in parallel (via GNU parallel)": runBench("bench_ec_g1") runBench("bench_ec_g2") +task test_parallel_no_assembler, "Run all tests (without macro assembler) in parallel (via GNU parallel)": + # -d:testingCurves is configured in a *.nim.cfg for convenience + let cmdFile = true # open(buildParallel, mode = fmWrite) # Nimscript doesn't support IO :/ + exec "> " & buildParallel + + for td in testDesc: + if td.path in useDebug: + test "-d:debugConstantine -d:ConstantineASM=false", td.path, cmdFile + else: + test " -d:ConstantineASM=false", td.path, cmdFile + + # cmdFile.close() + # Execute everything in parallel with GNU parallel + exec "parallel --keep-order --group < " & buildParallel + + exec "> " & buildParallel + if sizeof(int) == 8: # 32-bit tests on 64-bit arch + for td in testDesc: + if td.path in useDebug: + test "-d:Constantine32 -d:debugConstantine -d:ConstantineASM=false", td.path, cmdFile + else: + test "-d:Constantine32 -d:ConstantineASM=false", td.path, cmdFile + # cmdFile.close() + # Execute everything in parallel with GNU parallel + exec "parallel --keep-order --group < " & buildParallel + + # Now run the benchmarks + # + # Benchmarks compile and run + # ignore Windows 32-bit for the moment + # Ensure benchmarks stay relevant. 
Ignore Windows 32-bit at the moment + if not defined(windows) or not (existsEnv"UCPU" or getEnv"UCPU" == "i686"): + runBench("bench_fp") + runBench("bench_fp2") + runBench("bench_fp6") + runBench("bench_fp12") + runBench("bench_ec_g1") + runBench("bench_ec_g2") + task test_parallel_no_gmp, "Run all tests in parallel (via GNU parallel)": # -d:testingCurves is configured in a *.nim.cfg for convenience let cmdFile = true # open(buildParallel, mode = fmWrite) # Nimscript doesn't support IO :/ diff --git a/constantine/arithmetic/finite_fields.nim b/constantine/arithmetic/finite_fields.nim index 00190d5..d85d822 100644 --- a/constantine/arithmetic/finite_fields.nim +++ b/constantine/arithmetic/finite_fields.nim @@ -29,11 +29,13 @@ import ../config/[common, type_fp, curves], ./bigints, ./limbs_montgomery +when UseX86ASM: + import ./finite_fields_asm_x86 + export Fp # No exceptions allowed {.push raises: [].} -{.push inline.} # ############################################################ # @@ -41,15 +43,15 @@ export Fp # # ############################################################ -func fromBig*[C: static Curve](T: type Fp[C], src: BigInt): Fp[C] {.noInit.} = +func fromBig*[C: static Curve](T: type Fp[C], src: BigInt): Fp[C] {.noInit, inline.} = ## Convert a BigInt to its Montgomery form result.mres.montyResidue(src, C.Mod, C.getR2modP(), C.getNegInvModWord(), C.canUseNoCarryMontyMul()) -func fromBig*[C: static Curve](dst: var Fp[C], src: BigInt) = +func fromBig*[C: static Curve](dst: var Fp[C], src: BigInt) {.inline.}= ## Convert a BigInt to its Montgomery form dst.mres.montyResidue(src, C.Mod, C.getR2modP(), C.getNegInvModWord(), C.canUseNoCarryMontyMul()) -func toBig*(src: Fp): auto {.noInit.} = +func toBig*(src: Fp): auto {.noInit, inline.} = ## Convert a finite-field element to a BigInt in natural representation var r {.noInit.}: typeof(src.mres) r.redc(src.mres, Fp.C.Mod, Fp.C.getNegInvModWord(), Fp.C.canUseNoCarryMontyMul()) @@ -58,14 +60,17 @@ func toBig*(src: Fp): auto {.noInit.} = # Copy # ------------------------------------------------------------ -func ccopy*(a: var Fp, b: Fp, ctl: SecretBool) = +func ccopy*(a: var Fp, b: Fp, ctl: SecretBool) {.inline.} = ## Constant-time conditional copy ## If ctl is true: b is copied into a ## if ctl is false: b is not copied and a is unmodified ## Time and memory accesses are the same whether a copy occurs or not - ccopy(a.mres, b.mres, ctl) + when UseX86ASM: + ccopy_asm(a.mres.limbs, b.mres.limbs, ctl) + else: + ccopy(a.mres, b.mres, ctl) -func cswap*(a, b: var Fp, ctl: CTBool) = +func cswap*(a, b: var Fp, ctl: CTBool) {.inline.} = ## Swap ``a`` and ``b`` if ``ctl`` is true ## ## Constant-time: @@ -93,80 +98,108 @@ func cswap*(a, b: var Fp, ctl: CTBool) = # In practice I'm not aware of such prime being using in elliptic curves. 
# 2^127 - 1 and 2^521 - 1 are used but 127 and 521 are not multiple of 32/64 -func `==`*(a, b: Fp): SecretBool = +func `==`*(a, b: Fp): SecretBool {.inline.} = ## Constant-time equality check a.mres == b.mres -func isZero*(a: Fp): SecretBool = +func isZero*(a: Fp): SecretBool {.inline.} = ## Constant-time check if zero a.mres.isZero() -func isOne*(a: Fp): SecretBool = +func isOne*(a: Fp): SecretBool {.inline.} = ## Constant-time check if one a.mres == Fp.C.getMontyOne() -func setZero*(a: var Fp) = +func setZero*(a: var Fp) {.inline.} = ## Set ``a`` to zero a.mres.setZero() -func setOne*(a: var Fp) = +func setOne*(a: var Fp) {.inline.} = ## Set ``a`` to one # Note: we need 1 in Montgomery residue form # TODO: Nim codegen is not optimal it uses a temporary # Check if the compiler optimizes it away a.mres = Fp.C.getMontyOne() -func `+=`*(a: var Fp, b: Fp) = +func `+=`*(a: var Fp, b: Fp) {.inline.} = ## In-place addition modulo p - var overflowed = add(a.mres, b.mres) - overflowed = overflowed or not(a.mres < Fp.C.Mod) - discard csub(a.mres, Fp.C.Mod, overflowed) + when UseX86ASM and a.mres.limbs.len <= 6: # TODO: handle spilling + addmod_asm(a.mres.limbs, b.mres.limbs, Fp.C.Mod.limbs) + else: + var overflowed = add(a.mres, b.mres) + overflowed = overflowed or not(a.mres < Fp.C.Mod) + discard csub(a.mres, Fp.C.Mod, overflowed) -func `-=`*(a: var Fp, b: Fp) = +func `-=`*(a: var Fp, b: Fp) {.inline.} = ## In-place substraction modulo p - let underflowed = sub(a.mres, b.mres) - discard cadd(a.mres, Fp.C.Mod, underflowed) + when UseX86ASM and a.mres.limbs.len <= 6: # TODO: handle spilling + submod_asm(a.mres.limbs, b.mres.limbs, Fp.C.Mod.limbs) + else: + let underflowed = sub(a.mres, b.mres) + discard cadd(a.mres, Fp.C.Mod, underflowed) -func double*(a: var Fp) = +func double*(a: var Fp) {.inline.} = ## Double ``a`` modulo p - var overflowed = double(a.mres) - overflowed = overflowed or not(a.mres < Fp.C.Mod) - discard csub(a.mres, Fp.C.Mod, overflowed) + when UseX86ASM and a.mres.limbs.len <= 6: # TODO: handle spilling + addmod_asm(a.mres.limbs, a.mres.limbs, Fp.C.Mod.limbs) + else: + var overflowed = double(a.mres) + overflowed = overflowed or not(a.mres < Fp.C.Mod) + discard csub(a.mres, Fp.C.Mod, overflowed) -func sum*(r: var Fp, a, b: Fp) = +func sum*(r: var Fp, a, b: Fp) {.inline.} = ## Sum ``a`` and ``b`` into ``r`` module p ## r is initialized/overwritten - var overflowed = r.mres.sum(a.mres, b.mres) - overflowed = overflowed or not(r.mres < Fp.C.Mod) - discard csub(r.mres, Fp.C.Mod, overflowed) + when UseX86ASM and a.mres.limbs.len <= 6: # TODO: handle spilling + r = a + addmod_asm(r.mres.limbs, b.mres.limbs, Fp.C.Mod.limbs) + else: + var overflowed = r.mres.sum(a.mres, b.mres) + overflowed = overflowed or not(r.mres < Fp.C.Mod) + discard csub(r.mres, Fp.C.Mod, overflowed) -func diff*(r: var Fp, a, b: Fp) = +func diff*(r: var Fp, a, b: Fp) {.inline.} = ## Substract `b` from `a` and store the result into `r`. 
## `r` is initialized/overwritten - var underflowed = r.mres.diff(a.mres, b.mres) - discard cadd(r.mres, Fp.C.Mod, underflowed) + when UseX86ASM and a.mres.limbs.len <= 6: # TODO: handle spilling + var t = a # Handle aliasing r == b + submod_asm(t.mres.limbs, b.mres.limbs, Fp.C.Mod.limbs) + r = t + else: + var underflowed = r.mres.diff(a.mres, b.mres) + discard cadd(r.mres, Fp.C.Mod, underflowed) -func double*(r: var Fp, a: Fp) = +func double*(r: var Fp, a: Fp) {.inline.} = ## Double ``a`` into ``r`` ## `r` is initialized/overwritten - var overflowed = r.mres.double(a.mres) - overflowed = overflowed or not(r.mres < Fp.C.Mod) - discard csub(r.mres, Fp.C.Mod, overflowed) + when UseX86ASM and a.mres.limbs.len <= 6: # TODO: handle spilling + r = a + addmod_asm(r.mres.limbs, a.mres.limbs, Fp.C.Mod.limbs) + else: + var overflowed = r.mres.double(a.mres) + overflowed = overflowed or not(r.mres < Fp.C.Mod) + discard csub(r.mres, Fp.C.Mod, overflowed) -func prod*(r: var Fp, a, b: Fp) = +func prod*(r: var Fp, a, b: Fp) {.inline.} = ## Store the product of ``a`` by ``b`` modulo p into ``r`` ## ``r`` is initialized / overwritten r.mres.montyMul(a.mres, b.mres, Fp.C.Mod, Fp.C.getNegInvModWord(), Fp.C.canUseNoCarryMontyMul()) -func square*(r: var Fp, a: Fp) = +func square*(r: var Fp, a: Fp) {.inline.} = ## Squaring modulo p r.mres.montySquare(a.mres, Fp.C.Mod, Fp.C.getNegInvModWord(), Fp.C.canUseNoCarryMontySquare()) -func neg*(r: var Fp, a: Fp) = +func neg*(r: var Fp, a: Fp) {.inline.} = ## Negate modulo p - discard r.mres.diff(Fp.C.Mod, a.mres) + when UseX86ASM and defined(gcc): + # Clang and every compiler besides GCC + # can cleanly optimized this + # especially on Fp2 + negmod_asm(r.mres.limbs, a.mres.limbs, Fp.C.Mod.limbs) + else: + discard r.mres.diff(Fp.C.Mod, a.mres) -func div2*(a: var Fp) = +func div2*(a: var Fp) {.inline.} = ## Modular division by 2 a.mres.div2_modular(Fp.C.getPrimePlus1div2()) @@ -178,7 +211,7 @@ func div2*(a: var Fp) = # # Internally those procedures will allocate extra scratchspace on the stack -func pow*(a: var Fp, exponent: BigInt) = +func pow*(a: var Fp, exponent: BigInt) {.inline.} = ## Exponentiation modulo p ## ``a``: a field element to be exponentiated ## ``exponent``: a big integer @@ -191,7 +224,7 @@ func pow*(a: var Fp, exponent: BigInt) = Fp.C.canUseNoCarryMontySquare() ) -func pow*(a: var Fp, exponent: openarray[byte]) = +func pow*(a: var Fp, exponent: openarray[byte]) {.inline.} = ## Exponentiation modulo p ## ``a``: a field element to be exponentiated ## ``exponent``: a big integer in canonical big endian representation @@ -204,7 +237,7 @@ func pow*(a: var Fp, exponent: openarray[byte]) = Fp.C.canUseNoCarryMontySquare() ) -func powUnsafeExponent*(a: var Fp, exponent: BigInt) = +func powUnsafeExponent*(a: var Fp, exponent: BigInt) {.inline.} = ## Exponentiation modulo p ## ``a``: a field element to be exponentiated ## ``exponent``: a big integer @@ -224,7 +257,7 @@ func powUnsafeExponent*(a: var Fp, exponent: BigInt) = Fp.C.canUseNoCarryMontySquare() ) -func powUnsafeExponent*(a: var Fp, exponent: openarray[byte]) = +func powUnsafeExponent*(a: var Fp, exponent: openarray[byte]) {.inline.} = ## Exponentiation modulo p ## ``a``: a field element to be exponentiated ## ``exponent``: a big integer a big integer in canonical big endian representation @@ -250,7 +283,7 @@ func powUnsafeExponent*(a: var Fp, exponent: openarray[byte]) = # # ############################################################ -func isSquare*[C](a: Fp[C]): SecretBool = +func isSquare*[C](a: 
Fp[C]): SecretBool {.inline.} = ## Returns true if ``a`` is a square (quadratic residue) in 𝔽p ## ## Assumes that the prime modulus ``p`` is public. @@ -272,7 +305,7 @@ func isSquare*[C](a: Fp[C]): SecretBool = xi.mres == C.getMontyPrimeMinus1() ) -func sqrt_p3mod4[C](a: var Fp[C]) = +func sqrt_p3mod4[C](a: var Fp[C]) {.inline.} = ## Compute the square root of ``a`` ## ## This requires ``a`` to be a square @@ -286,7 +319,7 @@ func sqrt_p3mod4[C](a: var Fp[C]) = static: doAssert BaseType(C.Mod.limbs[0]) mod 4 == 3 a.powUnsafeExponent(C.getPrimePlus1div4_BE()) -func sqrt_invsqrt_p3mod4[C](sqrt, invsqrt: var Fp[C], a: Fp[C]) = +func sqrt_invsqrt_p3mod4[C](sqrt, invsqrt: var Fp[C], a: Fp[C]) {.inline.} = ## If ``a`` is a square, compute the square root of ``a`` in sqrt ## and the inverse square root of a in invsqrt ## @@ -307,7 +340,7 @@ func sqrt_invsqrt_p3mod4[C](sqrt, invsqrt: var Fp[C], a: Fp[C]) = # √a ≡ a * 1/√a ≡ a^((p+1)/4) (mod p) sqrt.prod(invsqrt, a) -func sqrt_invsqrt_if_square_p3mod4[C](sqrt, invsqrt: var Fp[C], a: Fp[C]): SecretBool = +func sqrt_invsqrt_if_square_p3mod4[C](sqrt, invsqrt: var Fp[C], a: Fp[C]): SecretBool {.inline.} = ## If ``a`` is a square, compute the square root of ``a`` in sqrt ## and the inverse square root of a in invsqrt ## @@ -319,7 +352,7 @@ func sqrt_invsqrt_if_square_p3mod4[C](sqrt, invsqrt: var Fp[C], a: Fp[C]): Secre euler.prod(sqrt, invsqrt) result = not(euler.mres == C.getMontyPrimeMinus1()) -func sqrt_if_square_p3mod4[C](a: var Fp[C]): SecretBool = +func sqrt_if_square_p3mod4[C](a: var Fp[C]): SecretBool {.inline.} = ## If ``a`` is a square, compute the square root of ``a`` ## if not, ``a`` is unmodified. ## @@ -334,7 +367,7 @@ func sqrt_if_square_p3mod4[C](a: var Fp[C]): SecretBool = result = sqrt_invsqrt_if_square_p3mod4(sqrt, invsqrt, a) a.ccopy(sqrt, result) -func sqrt*[C](a: var Fp[C]) = +func sqrt*[C](a: var Fp[C]) {.inline.} = ## Compute the square root of ``a`` ## ## This requires ``a`` to be a square @@ -349,7 +382,7 @@ func sqrt*[C](a: var Fp[C]) = else: {.error: "Square root is only implemented for p ≡ 3 (mod 4)".} -func sqrt_if_square*[C](a: var Fp[C]): SecretBool = +func sqrt_if_square*[C](a: var Fp[C]): SecretBool {.inline.} = ## If ``a`` is a square, compute the square root of ``a`` ## if not, ``a`` is unmodified. 
## @@ -361,7 +394,7 @@ func sqrt_if_square*[C](a: var Fp[C]): SecretBool = else: {.error: "Square root is only implemented for p ≡ 3 (mod 4)".} -func sqrt_invsqrt*[C](sqrt, invsqrt: var Fp[C], a: Fp[C]) = +func sqrt_invsqrt*[C](sqrt, invsqrt: var Fp[C], a: Fp[C]) {.inline.} = ## Compute the square root and inverse square root of ``a`` ## ## This requires ``a`` to be a square @@ -376,7 +409,7 @@ func sqrt_invsqrt*[C](sqrt, invsqrt: var Fp[C], a: Fp[C]) = else: {.error: "Square root is only implemented for p ≡ 3 (mod 4)".} -func sqrt_invsqrt_if_square*[C](sqrt, invsqrt: var Fp[C], a: Fp[C]): SecretBool = +func sqrt_invsqrt_if_square*[C](sqrt, invsqrt: var Fp[C], a: Fp[C]): SecretBool {.inline.} = ## Compute the square root and ivnerse square root of ``a`` ## ## This returns true if ``a`` is square and sqrt/invsqrt contains the square root/inverse square root @@ -403,15 +436,15 @@ func sqrt_invsqrt_if_square*[C](sqrt, invsqrt: var Fp[C], a: Fp[C]): SecretBool # - Those that return a field element # - Those that internally allocate a temporary field element -func `+`*(a, b: Fp): Fp {.noInit.} = +func `+`*(a, b: Fp): Fp {.noInit, inline.} = ## Addition modulo p result.sum(a, b) -func `-`*(a, b: Fp): Fp {.noInit.} = +func `-`*(a, b: Fp): Fp {.noInit, inline.} = ## Substraction modulo p result.diff(a, b) -func `*`*(a, b: Fp): Fp {.noInit.} = +func `*`*(a, b: Fp): Fp {.noInit, inline.} = ## Multiplication modulo p ## ## It is recommended to assign with {.noInit.} @@ -419,11 +452,11 @@ func `*`*(a, b: Fp): Fp {.noInit.} = ## routine will zero init internally the result. result.prod(a, b) -func `*=`*(a: var Fp, b: Fp) = +func `*=`*(a: var Fp, b: Fp) {.inline.} = ## Multiplication modulo p a.prod(a, b) -func square*(a: var Fp) = +func square*(a: var Fp) {.inline.}= ## Squaring modulo p a.mres.montySquare(a.mres, Fp.C.Mod, Fp.C.getNegInvModWord(), Fp.C.canUseNoCarryMontySquare()) diff --git a/constantine/arithmetic/finite_fields_asm_mul_x86.nim b/constantine/arithmetic/finite_fields_asm_mul_x86.nim new file mode 100644 index 0000000..f943875 --- /dev/null +++ b/constantine/arithmetic/finite_fields_asm_mul_x86.nim @@ -0,0 +1,223 @@ +# Constantine +# Copyright (c) 2018-2019 Status Research & Development GmbH +# Copyright (c) 2020-Present Mamy André-Ratsimbazafy +# Licensed and distributed under either of +# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT). +# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0). +# at your option. This file may not be copied, modified, or distributed except according to those terms. + +import + # Standard library + std/macros, + # Internal + ../config/common, + ../primitives, + ./limbs + +# ############################################################ +# +# Assembly implementation of finite fields +# +# ############################################################ + +# Note: We can refer to at most 30 registers in inline assembly +# and "InputOutput" registers count double +# They are nice to let the compiler deals with mov +# but too constraining so we move things ourselves. 
+ +static: doAssert UseX86ASM + +# Necessary for the compiler to find enough registers (enabled at -O1) +{.localPassC:"-fomit-frame-pointer".} + +# Montgomery multiplication +# ------------------------------------------------------------ +# Fallback when no ADX and BMI2 support (MULX, ADCX, ADOX) + +proc finalSub*( + ctx: var Assembler_x86, + r: Operand or OperandArray, + t, M, scratch: OperandArray + ) = + ## Reduce `t` into `r` modulo `M` + let N = M.len + ctx.comment "Final substraction" + for i in 0 ..< N: + ctx.mov scratch[i], t[i] + if i == 0: + ctx.sub scratch[i], M[i] + else: + ctx.sbb scratch[i], M[i] + + # If we borrowed it means that we were smaller than + # the modulus and we don't need "scratch" + for i in 0 ..< N: + ctx.cmovnc t[i], scratch[i] + ctx.mov r[i], t[i] + +macro montMul_CIOS_nocarry_gen[N: static int](r_MM: var Limbs[N], a_MM, b_MM, M_MM: Limbs[N], m0ninv_MM: BaseType): untyped = + ## Generate an optimized Montgomery Multiplication kernel + ## using the CIOS method + ## + ## The multiplication and reduction are further merged in the same loop + ## + ## This requires the most significant word of the Modulus + ## M[^1] < high(SecretWord) shr 2 (i.e. less than 0b00111...1111) + ## https://hackmd.io/@zkteam/modular_multiplication + + result = newStmtList() + + var ctx = init(Assembler_x86, BaseType) + let + scratchSlots = max(N, 6) + + # We could force M as immediate by specializing per moduli + M = init(OperandArray, nimSymbol = M_MM, N, PointerInReg, Input) + # If N is too big, we need to spill registers. TODO. + t = init(OperandArray, nimSymbol = ident"t", N, ElemsInReg, Output_EarlyClobber) + # MultiPurpose Register slots + scratch = init(OperandArray, nimSymbol = ident"scratch", scratchSlots, ElemsInReg, InputOutput_EnsureClobber) + + # MUL requires RAX and RDX + rRAX = Operand( + desc: OperandDesc( + asmId: "[rax]", + nimSymbol: ident"rax", + rm: RAX, + constraint: Output_EarlyClobber, + cEmit: "rax" + ) + ) + + rRDX = Operand( + desc: OperandDesc( + asmId: "[rdx]", + nimSymbol: ident"rdx", + rm: RDX, + constraint: Output_EarlyClobber, + cEmit: "rdx" + ) + ) + + m0ninv = Operand( + desc: OperandDesc( + asmId: "[m0ninv]", + nimSymbol: m0ninv_MM, + rm: MemOffsettable, + constraint: Input, + cEmit: "&" & $m0ninv_MM + ) + ) + + # We're really constrained by register and somehow setting as memory doesn't help + # So we store the result `r` in the scratch space and then reload it in RDX + # before the scratchspace is used in final substraction + a = scratch[0].asArrayAddr(len = N) # Store the `a` operand + b = scratch[1].asArrayAddr(len = N) # Store the `b` operand + A = scratch[2] # High part of extended precision multiplication + C = scratch[3] + m = scratch[4] # Stores (t[0] * m0ninv) mod 2^w + r = scratch[5] # Stores the `r` operand + + # Registers used: + # - 1 for `M` + # - 6 for `t` (at most) + # - 6 for `scratch` + # - 2 for RAX and RDX + # Total 15 out of 16 + # We can save 1 by hardcoding M as immediate (and m0ninv) + # but this prevent reusing the same code for multiple curves like BLS12-377 and BLS12-381 + # We might be able to save registers by having `r` and `M` be memory operand as well + + let tsym = t.nimSymbol + let scratchSym = scratch.nimSymbol + let eax = rRAX.desc.nimSymbol + let edx = rRDX.desc.nimSymbol + result.add quote do: + static: doAssert: sizeof(SecretWord) == sizeof(ByteAddress) + + var `tsym`: typeof(`r_MM`) # zero init + # Assumes 64-bit limbs on 64-bit arch (or you can't store an address) + var `scratchSym` {.noInit.}: 
Limbs[`scratchSlots`] + var `eax`{.noInit.}, `edx`{.noInit.}: BaseType + + `scratchSym`[0] = cast[SecretWord](`a_MM`[0].unsafeAddr) + `scratchSym`[1] = cast[SecretWord](`b_MM`[0].unsafeAddr) + `scratchSym`[5] = cast[SecretWord](`r_MM`[0].unsafeAddr) + + # Algorithm + # ----------------------------------------- + # for i=0 to N-1 + # (A, t[0]) <- a[0] * b[i] + t[0] + # m <- (t[0] * m0ninv) mod 2^w + # (C, _) <- m * M[0] + t[0] + # for j=1 to N-1 + # (A, t[j]) <- a[j] * b[i] + A + t[j] + # (C, t[j-1]) <- m * M[j] + C + t[j] + # + # t[N-1] = C + A + + # No register spilling handling + doAssert N <= 6, "The Assembly-optimized montgomery multiplication requires at most 6 limbs." + + for i in 0 ..< N: + # (A, t[0]) <- a[0] * b[i] + t[0] + ctx.mov rRAX, a[0] + ctx.mul rdx, rax, b[i], rax + if i == 0: # overwrite t[0] + ctx.mov t[0], rRAX + else: # Accumulate in t[0] + ctx.add t[0], rRAX + ctx.adc rRDX, 0 + ctx.mov A, rRDX + + # m <- (t[0] * m0ninv) mod 2^w + ctx.mov m, m0ninv + ctx.imul m, t[0] + + # (C, _) <- m * M[0] + t[0] + ctx.`xor` C, C + ctx.mov rRAX, M[0] + ctx.mul rdx, rax, m, rax + ctx.add rRAX, t[0] + ctx.adc C, rRDX + + for j in 1 ..< N: + # (A, t[j]) <- a[j] * b[i] + A + t[j] + ctx.mov rRAX, a[j] + ctx.mul rdx, rax, b[i], rax + if i == 0: + ctx.mov t[j], A + else: + ctx.add t[j], A + ctx.adc rRDX, 0 + ctx.`xor` A, A + ctx.add t[j], rRAX + ctx.adc A, rRDX + + # (C, t[j-1]) <- m * M[j] + C + t[j] + ctx.mov rRAX, M[j] + ctx.mul rdx, rax, m, rax + ctx.add C, t[j] + ctx.adc rRDX, 0 + ctx.add C, rRAX + ctx.adc rRDX, 0 + ctx.mov t[j-1], C + ctx.mov C, rRDX + + ctx.add A, C + ctx.mov t[N-1], A + + ctx.mov rRDX, r + let r2 = rRDX.asArrayAddr(len = N) + + ctx.finalSub( + r2, t, M, + scratch + ) + + result.add ctx.generate + +func montMul_CIOS_nocarry_asm*(r: var Limbs, a, b, M: Limbs, m0ninv: BaseType) = + ## Constant-time modular multiplication + montMul_CIOS_nocarry_gen(r, a, b, M, m0ninv) diff --git a/constantine/arithmetic/finite_fields_asm_mul_x86_adx_bmi2.nim b/constantine/arithmetic/finite_fields_asm_mul_x86_adx_bmi2.nim new file mode 100644 index 0000000..8c41e23 --- /dev/null +++ b/constantine/arithmetic/finite_fields_asm_mul_x86_adx_bmi2.nim @@ -0,0 +1,282 @@ +# Constantine +# Copyright (c) 2018-2019 Status Research & Development GmbH +# Copyright (c) 2020-Present Mamy André-Ratsimbazafy +# Licensed and distributed under either of +# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT). +# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0). +# at your option. This file may not be copied, modified, or distributed except according to those terms. + +import + # Standard library + std/macros, + # Internal + ../config/common, + ../primitives, + ./limbs, + ./finite_fields_asm_mul_x86 + +# ############################################################ +# +# Assembly implementation of finite fields +# +# ############################################################ + +# Note: We can refer to at most 30 registers in inline assembly +# and "InputOutput" registers count double +# They are nice to let the compiler deals with mov +# but too constraining so we move things ourselves. 
+ +static: doAssert UseX86ASM + +# MULX/ADCX/ADOX +{.localPassC:"-madx -mbmi2".} +# Necessary for the compiler to find enough registers (enabled at -O1) +{.localPassC:"-fomit-frame-pointer".} + +# Montgomery Multiplication +# ------------------------------------------------------------ +proc mulx_by_word( + ctx: var Assembler_x86, + C: Operand, + t: OperandArray, + a: Operand, # Pointer in scratchspace + word: Operand, + S, rRDX: Operand + ) = + ## Multiply the `a[0..= 2, "The Assembly-optimized montgomery multiplication requires at least 2 limbs." + ctx.comment " Outer loop i = 0" + ctx.`xor` rRDX, rRDX # Clear flags - TODO: necessary? + ctx.mov rRDX, word + + # for j=0 to N-1 + # (C,t[j]) := t[j] + a[j]*b[i] + C + + # First limb + ctx.mulx t[1], t[0], a[0], rdx + + # Steady state + for j in 1 ..< N-1: + ctx.mulx t[j+1], S, a[j], rdx + ctx.adox t[j], S # TODO, we probably can use ADC here + + # Last limb + ctx.mulx C, S, a[N-1], rdx + ctx.adox t[N-1], S + + # Final carries + ctx.comment " Mul carries i = 0" + ctx.mov rRDX, 0 # Set to 0 without clearing flags + ctx.adcx C, rRDX + ctx.adox C, rRDX + +proc mulaccx_by_word( + ctx: var Assembler_x86, + C: Operand, + t: OperandArray, + a: Operand, # Pointer in scratchspace + i: int, + word: Operand, + S, rRDX: Operand + ) = + ## Multiply the `a[0..= 2, "The Assembly-optimized montgomery multiplication requires at least 2 limbs." + doAssert i != 0 + + ctx.comment " Outer loop i = " & $i + ctx.`xor` rRDX, rRDX # Clear flags - TODO: necessary? + ctx.mov rRDX, word + + # for j=0 to N-1 + # (C,t[j]) := t[j] + a[j]*b[i] + C + + # Steady state + for j in 0 ..< N-1: + ctx.mulx C, S, a[j], rdx + ctx.adox t[j], S + ctx.adcx t[j+1], C + + # Last limb + ctx.mulx C, S, a[N-1], rdx + ctx.adox t[N-1], S + + # Final carries + ctx.comment " Mul carries i = " & $i + ctx.mov rRDX, 0 # Set to 0 without clearing flags + ctx.adcx C, rRDX + ctx.adox C, rRDX + +proc partialRedx( + ctx: var Assembler_x86, + C: Operand, + t: OperandArray, + M: OperandArray, + m0ninv: Operand, + lo, S, rRDX: Operand + ) = + ## Partial Montgomery reduction + ## For CIOS method + ## `C` the update carry flag (represents t[N]) + ## `t[0.. **every code compiled in 32-bit with MSVC on 64-bit architectures will call llmul every time a 64-bit multiplication is executed.** +- [When Constant-Time Source Yields Variable-Time Binary: Exploiting Curve25519-donna Built with MSVC 2015.](https://infoscience.epfl.ch/record/223794/files/32_1.pdf) + +#### Verification of Assembly + +The assembly code generated needs special tooling for formal verification that is different from the C code in https://github.com/mratsim/constantine/issues/6. +Recently Microsoft Research introduced Vale: +- Vale: Verifying High-Performance Cryptographic Assembly Code\ + Barry Bond and Chris Hawblitzel, Microsoft Research; Manos Kapritsos, University of Michigan; K. Rustan M. Leino and Jacob R. Lorch, Microsoft Research; Bryan Parno, Carnegie Mellon University; Ashay Rane, The University of Texas at Austin;Srinath Setty, Microsoft Research; Laure Thompson, Cornell University\ + https://www.usenix.org/system/files/conference/usenixsecurity17/sec17-bond.pdf + https://github.com/project-everest/vale +Vale can be used to verify assembly crypto code against the architecture and also detect timing attacks. + +### Assembly Performance + +Beyond security, compilers do not expose several primitives that are necessary for necessary for multiprecision arithmetic. 
+ +#### Add with carry, sub with borrow + +The most egregious example is add with carry which led to the GMP team to implement everything in Assembly even though this is a most basic need and almost all processor have an ADC instruction, some like the 6502 from 30 years ago only have ADC and no ADD. +See: +- https://gmplib.org/manual/Assembly-Carry-Propagation.html +- +![image](https://user-images.githubusercontent.com/22738317/83965806-8f4e2980-a8b6-11ea-9fbb-719e42d119dc.png) + +Some specific platforms might expose add with carry, for example x86 but even then the code generation might be extremely poor: https://gcc.godbolt.org/z/2h768y ```C #include #include @@ -47,7 +84,6 @@ void add256(uint64_t a[4], uint64_t b[4]){ carry = _addcarry_u64(carry, a[i], b[i], &a[i]); } ``` - GCC ```asm add256: @@ -70,7 +106,6 @@ add256: adcq %rax, 24(%rdi) ret ``` - Clang ```asm add256: @@ -84,8 +119,9 @@ add256: adcq %rax, 24(%rdi) retq ``` +(Reported fixed but it is not? https://gcc.gnu.org/bugzilla/show_bug.cgi?id=67317) -### Inline assembly +And no way to use ADC for ARM architectures with GCC. +Clang does offer `__builtin_addcll` which might work now or [not](https://stackoverflow.com/questions/33690791/producing-good-add-with-carry-code-from-clang) as fixing the add with carry for x86 took years. Alternatively Clang does offer new arbitrary width integer since a month ago, called ExtInt http://blog.llvm.org/2020/04/the-new-clang-extint-feature-provides.html it is unknown however if code is guaranted to be constant-time. -Using inline assembly will sacrifice code readability, portability, auditability and maintainability. -That said the performance might be worth it. +See also: https://stackoverflow.com/questions/29029572/multi-word-addition-using-the-carry-flag/29212615 diff --git a/constantine/primitives/cpuinfo_x86.nim b/constantine/primitives/cpuinfo_x86.nim new file mode 100644 index 0000000..35a1a6d --- /dev/null +++ b/constantine/primitives/cpuinfo_x86.nim @@ -0,0 +1,779 @@ +# From awr1: https://github.com/nim-lang/Nim/pull/11816/files + +proc cpuidX86(eaxi, ecxi: int32): tuple[eax, ebx, ecx, edx: int32] {.used.}= + when defined(vcc): + # limited inline asm support in vcc, so intrinsics, here we go: + proc cpuidVcc(cpuInfo: ptr int32; functionID, subFunctionID: int32) + {.cdecl, importc: "__cpuidex", header: "intrin.h".} + cpuidVcc(addr result.eax, eaxi, ecxi) + else: + var (eaxr, ebxr, ecxr, edxr) = (0'i32, 0'i32, 0'i32, 0'i32) + asm """ + cpuid + :"=a"(`eaxr`), "=b"(`ebxr`), "=c"(`ecxr`), "=d"(`edxr`) + :"a"(`eaxi`), "c"(`ecxi`)""" + (eaxr, ebxr, ecxr, edxr) + +proc cpuNameX86(): string {.used.}= + var leaves {.global.} = cast[array[48, char]]([ + cpuidX86(eaxi = 0x80000002'i32, ecxi = 0), + cpuidX86(eaxi = 0x80000003'i32, ecxi = 0), + cpuidX86(eaxi = 0x80000004'i32, ecxi = 0)]) + result = $cast[cstring](addr leaves[0]) + +type + X86Feature {.pure.} = enum + HypervisorPresence, Hyperthreading, NoSMT, IntelVtx, Amdv, X87fpu, Mmx, + MmxExt, F3DNow, F3DNowEnhanced, Prefetch, Sse, Sse2, Sse3, Ssse3, Sse4a, + Sse41, Sse42, Avx, Avx2, Avx512f, Avx512dq, Avx512ifma, Avx512pf, + Avx512er, Avx512cd, Avx512bw, Avx512vl, Avx512vbmi, Avx512vbmi2, + Avx512vpopcntdq, Avx512vnni, Avx512vnniw4, Avx512fmaps4, Avx512bitalg, + Avx512bfloat16, Avx512vp2intersect, Rdrand, Rdseed, MovBigEndian, Popcnt, + Fma3, Fma4, Xop, Cas8B, Cas16B, Abm, Bmi1, Bmi2, TsxHle, TsxRtm, Adx, Sgx, + Gfni, Aes, Vaes, Vpclmulqdq, Pclmulqdq, NxBit, Float16c, Sha, Clflush, + ClflushOpt, Clwb, PrefetchWT1, Mpx + +let + leaf1 = 
cpuidX86(eaxi = 1, ecxi = 0) + leaf7 = cpuidX86(eaxi = 7, ecxi = 0) + leaf8 = cpuidX86(eaxi = 0x80000001'i32, ecxi = 0) + +# The reason why we don't just evaluate these directly in the `let` variable +# list is so that we can internally organize features by their input (leaf) +# and output registers. +proc testX86Feature(feature: X86Feature): bool = + proc test(input, bit: int): bool = + ((1 shl bit) and input) != 0 + + # see: https://en.wikipedia.org/wiki/CPUID#Calling_CPUID + # see: Intel® Architecture Instruction Set Extensions and Future Features + # Programming Reference + result = case feature + # leaf 1, edx + of X87fpu: + leaf1.edx.test(0) + of Clflush: + leaf1.edx.test(19) + of Mmx: + leaf1.edx.test(23) + of Sse: + leaf1.edx.test(25) + of Sse2: + leaf1.edx.test(26) + of Hyperthreading: + leaf1.edx.test(28) + + # leaf 1, ecx + of Sse3: + leaf1.ecx.test(0) + of Pclmulqdq: + leaf1.ecx.test(1) + of IntelVtx: + leaf1.ecx.test(5) + of Ssse3: + leaf1.ecx.test(9) + of Fma3: + leaf1.ecx.test(12) + of Cas16B: + leaf1.ecx.test(13) + of Sse41: + leaf1.ecx.test(19) + of Sse42: + leaf1.ecx.test(20) + of MovBigEndian: + leaf1.ecx.test(22) + of Popcnt: + leaf1.ecx.test(23) + of Aes: + leaf1.ecx.test(25) + of Avx: + leaf1.ecx.test(28) + of Float16c: + leaf1.ecx.test(29) + of Rdrand: + leaf1.ecx.test(30) + of HypervisorPresence: + leaf1.ecx.test(31) + + # leaf 7, ecx + of PrefetchWT1: + leaf7.ecx.test(0) + of Avx512vbmi: + leaf7.ecx.test(1) + of Avx512vbmi2: + leaf7.ecx.test(6) + of Gfni: + leaf7.ecx.test(8) + of Vaes: + leaf7.ecx.test(9) + of Vpclmulqdq: + leaf7.ecx.test(10) + of Avx512vnni: + leaf7.ecx.test(11) + of Avx512bitalg: + leaf7.ecx.test(12) + of Avx512vpopcntdq: + leaf7.ecx.test(14) + + # lead 7, eax + of Avx512bfloat16: + leaf7.eax.test(5) + + # leaf 7, ebx + of Sgx: + leaf7.ebx.test(2) + of Bmi1: + leaf7.ebx.test(3) + of TsxHle: + leaf7.ebx.test(4) + of Avx2: + leaf7.ebx.test(5) + of Bmi2: + leaf7.ebx.test(8) + of TsxRtm: + leaf7.ebx.test(11) + of Mpx: + leaf7.ebx.test(14) + of Avx512f: + leaf7.ebx.test(16) + of Avx512dq: + leaf7.ebx.test(17) + of Rdseed: + leaf7.ebx.test(18) + of Adx: + leaf7.ebx.test(19) + of Avx512ifma: + leaf7.ebx.test(21) + of ClflushOpt: + leaf7.ebx.test(23) + of Clwb: + leaf7.ebx.test(24) + of Avx512pf: + leaf7.ebx.test(26) + of Avx512er: + leaf7.ebx.test(27) + of Avx512cd: + leaf7.ebx.test(28) + of Sha: + leaf7.ebx.test(29) + of Avx512bw: + leaf7.ebx.test(30) + of Avx512vl: + leaf7.ebx.test(31) + + # leaf 7, edx + of Avx512vnniw4: + leaf7.edx.test(2) + of Avx512fmaps4: + leaf7.edx.test(3) + of Avx512vp2intersect: + leaf7.edx.test(8) + + # leaf 8, edx + of NoSMT: + leaf8.edx.test(1) + of Cas8B: + leaf8.edx.test(8) + of NxBit: + leaf8.edx.test(20) + of MmxExt: + leaf8.edx.test(22) + of F3DNowEnhanced: + leaf8.edx.test(30) + of F3DNow: + leaf8.edx.test(31) + + # leaf 8, ecx + of Amdv: + leaf8.ecx.test(2) + of Abm: + leaf8.ecx.test(5) + of Sse4a: + leaf8.ecx.test(6) + of Prefetch: + leaf8.ecx.test(8) + of Xop: + leaf8.ecx.test(11) + of Fma4: + leaf8.ecx.test(16) + +let + isHypervisorPresentImpl = testX86Feature(HypervisorPresence) + hasSimultaneousMultithreadingImpl = + testX86Feature(Hyperthreading) or not testX86Feature(NoSMT) + hasIntelVtxImpl = testX86Feature(IntelVtx) + hasAmdvImpl = testX86Feature(Amdv) + hasX87fpuImpl = testX86Feature(X87fpu) + hasMmxImpl = testX86Feature(Mmx) + hasMmxExtImpl = testX86Feature(MmxExt) + has3DNowImpl = testX86Feature(F3DNow) + has3DNowEnhancedImpl = testX86Feature(F3DNowEnhanced) + hasPrefetchImpl = testX86Feature(Prefetch) 
or testX86Feature(F3DNow) + hasSseImpl = testX86Feature(Sse) + hasSse2Impl = testX86Feature(Sse2) + hasSse3Impl = testX86Feature(Sse3) + hasSsse3Impl = testX86Feature(Ssse3) + hasSse4aImpl = testX86Feature(Sse4a) + hasSse41Impl = testX86Feature(Sse41) + hasSse42Impl = testX86Feature(Sse42) + hasAvxImpl = testX86Feature(Avx) + hasAvx2Impl = testX86Feature(Avx2) + hasAvx512fImpl = testX86Feature(Avx512f) + hasAvx512dqImpl = testX86Feature(Avx512dq) + hasAvx512ifmaImpl = testX86Feature(Avx512ifma) + hasAvx512pfImpl = testX86Feature(Avx512pf) + hasAvx512erImpl = testX86Feature(Avx512er) + hasAvx512cdImpl = testX86Feature(Avx512cd) + hasAvx512bwImpl = testX86Feature(Avx512bw) + hasAvx512vlImpl = testX86Feature(Avx512vl) + hasAvx512vbmiImpl = testX86Feature(Avx512vbmi) + hasAvx512vbmi2Impl = testX86Feature(Avx512vbmi2) + hasAvx512vpopcntdqImpl = testX86Feature(Avx512vpopcntdq) + hasAvx512vnniImpl = testX86Feature(Avx512vnni) + hasAvx512vnniw4Impl = testX86Feature(Avx512vnniw4) + hasAvx512fmaps4Impl = testX86Feature(Avx512fmaps4) + hasAvx512bitalgImpl = testX86Feature(Avx512bitalg) + hasAvx512bfloat16Impl = testX86Feature(Avx512bfloat16) + hasAvx512vp2intersectImpl = testX86Feature(Avx512vp2intersect) + hasRdrandImpl = testX86Feature(Rdrand) + hasRdseedImpl = testX86Feature(Rdseed) + hasMovBigEndianImpl = testX86Feature(MovBigEndian) + hasPopcntImpl = testX86Feature(Popcnt) + hasFma3Impl = testX86Feature(Fma3) + hasFma4Impl = testX86Feature(Fma4) + hasXopImpl = testX86Feature(Xop) + hasCas8BImpl = testX86Feature(Cas8B) + hasCas16BImpl = testX86Feature(Cas16B) + hasAbmImpl = testX86Feature(Abm) + hasBmi1Impl = testX86Feature(Bmi1) + hasBmi2Impl = testX86Feature(Bmi2) + hasTsxHleImpl = testX86Feature(TsxHle) + hasTsxRtmImpl = testX86Feature(TsxRtm) + hasAdxImpl = testX86Feature(Adx) + hasSgxImpl = testX86Feature(Sgx) + hasGfniImpl = testX86Feature(Gfni) + hasAesImpl = testX86Feature(Aes) + hasVaesImpl = testX86Feature(Vaes) + hasVpclmulqdqImpl = testX86Feature(Vpclmulqdq) + hasPclmulqdqImpl = testX86Feature(Pclmulqdq) + hasNxBitImpl = testX86Feature(NxBit) + hasFloat16cImpl = testX86Feature(Float16c) + hasShaImpl = testX86Feature(Sha) + hasClflushImpl = testX86Feature(Clflush) + hasClflushOptImpl = testX86Feature(ClflushOpt) + hasClwbImpl = testX86Feature(Clwb) + hasPrefetchWT1Impl = testX86Feature(PrefetchWT1) + hasMpxImpl = testX86Feature(Mpx) + +# NOTE: We use procedures here (layered over the variables) to keep the API +# consistent and usable against possible future heterogeneous systems with ISA +# differences between cores (a possibility that has historical precedents, for +# instance, the PPU/SPU relationship found on the IBM Cell). If future systems +# do end up having disparate ISA features across multiple cores, expect there to +# be a "cpuCore" argument added to the feature procs. + +proc isHypervisorPresent*(): bool {.inline.} = + return isHypervisorPresentImpl + ## **(x86 Only)** + ## + ## Reports `true` if this application is running inside of a virtual machine + ## (this is by no means foolproof). + +proc hasSimultaneousMultithreading*(): bool {.inline.} = + return hasSimultaneousMultithreadingImpl + ## **(x86 Only)** + ## + ## Reports `true` if the hardware is utilizing simultaneous multithreading + ## (branded as *"hyperthreads"* on Intel processors). + +proc hasIntelVtx*(): bool {.inline.} = + return hasIntelVtxImpl + ## **(x86 Only)** + ## + ## Reports `true` if the Intel virtualization extensions (VT-x) are available.
+ +proc hasAmdv*(): bool {.inline.} = + return hasAmdvImpl + ## **(x86 Only)** + ## + ## Reports `true` if the AMD virtualization extensions (AMD-V) are available. + +proc hasX87fpu*(): bool {.inline.} = + return hasX87fpuImpl + ## **(x86 Only)** + ## + ## Reports `true` if the hardware can use x87 floating-point instructions + ## (includes support for single, double, and 80-bit percision floats as per + ## IEEE 754-1985). + ## + ## By virtue of SSE2 enforced compliance on AMD64 CPUs, this should always be + ## `true` on 64-bit x86 processors. It should be noted that support of these + ## instructions is deprecated on 64-bit versions of Windows - see MSDN_. + ## + ## .. _MSDN: https://docs.microsoft.com/en-us/windows/win32/dxtecharts/sixty-four-bit-programming-for-game-developers#porting-applications-to-64-bit-platforms + +proc hasMmx*(): bool {.inline.} = + return hasMmxImpl + ## **(x86 Only)** + ## + ## Reports `true` if the hardware can use MMX SIMD instructions. + ## + ## By virtue of SSE2 enforced compliance on AMD64 CPUs, this should always be + ## `true` on 64-bit x86 processors. It should be noted that support of these + ## instructions is deprecated on 64-bit versions of Windows (see MSDN_ for + ## more info). + ## + ## .. _MSDN: https://docs.microsoft.com/en-us/windows/win32/dxtecharts/sixty-four-bit-programming-for-game-developers#porting-applications-to-64-bit-platforms + +proc hasMmxExt*(): bool {.inline.} = + return hasMmxExtImpl + ## **(x86 Only)** + ## + ## Reports `true` if the hardware can use "Extended MMX" SIMD instructions. + ## + ## It should be noted that support of these instructions is deprecated on + ## 64-bit versions of Windows (see MSDN_ for more info). + ## + ## .. _MSDN: https://docs.microsoft.com/en-us/windows/win32/dxtecharts/sixty-four-bit-programming-for-game-developers#porting-applications-to-64-bit-platforms + +proc has3DNow*(): bool {.inline.} = + return has3DNowImpl + ## **(x86 Only)** + ## + ## Reports `true` if the hardware can use 3DNow! SIMD instructions. + ## + ## It should be noted that support of these instructions is deprecated on + ## 64-bit versions of Windows (see MSDN_ for more info), and that the 3DNow! + ## instructions (with an exception made for the prefetch instructions, see the + ## `hasPrefetch` procedure) have been phased out of AMD processors since 2010 + ## (see `AMD Developer Central`_ for more info). + ## + ## .. _MSDN: https://docs.microsoft.com/en-us/windows/win32/dxtecharts/sixty-four-bit-programming-for-game-developers#porting-applications-to-64-bit-platforms + ## .. _`AMD Developer Central`: https://web.archive.org/web/20131109151245/http://developer.amd.com/community/blog/2010/08/18/3dnow-deprecated/ + +proc has3DNowEnhanced*(): bool {.inline.} = + return has3DNowEnhancedImpl + ## **(x86 Only)** + ## + ## Reports `true` if the hardware can use "Enhanced 3DNow!" SIMD instructions. + ## + ## It should be noted that support of these instructions is deprecated on + ## 64-bit versions of Windows (see MSDN_ for more info), and that the 3DNow! + ## instructions (with an exception made for the prefetch instructions, see the + ## `hasPrefetch` procedure) have been phased out of AMD processors since 2010 + ## (see `AMD Developer Central`_ for more info). + ## + ## .. _MSDN: https://docs.microsoft.com/en-us/windows/win32/dxtecharts/sixty-four-bit-programming-for-game-developers#porting-applications-to-64-bit-platforms + ## .. 
_`AMD Developer Central`: https://web.archive.org/web/20131109151245/http://developer.amd.com/community/blog/2010/08/18/3dnow-deprecated/ + +proc hasPrefetch*(): bool {.inline.} = + return hasPrefetchImpl + ## **(x86 Only)** + ## + ## Reports `true` if the hardware can use the `PREFETCH` and `PREFETCHW` + ## instructions. These instructions originally included as part of 3DNow!, but + ## potentially indepdendent from the rest of it due to changes in contemporary + ## AMD processors (see above). + +proc hasSse*(): bool {.inline.} = + return hasSseImpl + ## **(x86 Only)** + ## + ## Reports `true` if the hardware can use the SSE (Streaming SIMD Extensions) + ## 1.0 instructions, which introduced 128-bit SIMD on x86 machines. + ## + ## By virtue of SSE2 enforced compliance on AMD64 CPUs, this should always be + ## `true` on 64-bit x86 processors. + +proc hasSse2*(): bool {.inline.} = + return hasSse2Impl + ## **(x86 Only)** + ## + ## Reports `true` if the hardware can use the SSE (Streaming SIMD Extensions) + ## 2.0 instructions. + ## + ## By virtue of SSE2 enforced compliance on AMD64 CPUs, this should always be + ## `true` on 64-bit x86 processors. + +proc hasSse3*(): bool {.inline.} = + return hasSse3Impl + ## **(x86 Only)** + ## + ## Reports `true` if the hardware can use SSE (Streaming SIMD Extensions) 3.0 + ## instructions. + +proc hasSsse3*(): bool {.inline.} = + return hasSsse3Impl + ## **(x86 Only)** + ## + ## Reports `true` if the hardware can use Supplemental SSE (Streaming SIMD + ## Extensions) 3.0 instructions. + +proc hasSse4a*(): bool {.inline.} = + return hasSse4aImpl + ## **(x86 Only)** + ## + ## Reports `true` if the hardware can use Supplemental SSE (Streaming SIMD + ## Extensions) 4a instructions. + +proc hasSse41*(): bool {.inline.} = + return hasSse41Impl + ## **(x86 Only)** + ## + ## Reports `true` if the hardware can use Supplemental SSE (Streaming SIMD + ## Extensions) 4.1 instructions. + +proc hasSse42*(): bool {.inline.} = + return hasSse42Impl + ## **(x86 Only)** + ## + ## Reports `true` if the hardware can use Supplemental SSE (Streaming SIMD + ## Extensions) 4.2 instructions. + +proc hasAvx*(): bool {.inline.} = + return hasAvxImpl + ## **(x86 Only)** + ## + ## Reports `true` if the hardware can use AVX (Advanced Vector Extensions) + ## 1.0 instructions, which introduced 256-bit SIMD on x86 machines along with + ## addded reencoded versions of prior 128-bit SSE instructions into the more + ## code-dense and non-backward compatible VEX (Vector Extensions) format. + +proc hasAvx2*(): bool {.inline.} = + return hasAvx2Impl + ## **(x86 Only)** + ## + ## Reports `true` if the hardware can use AVX (Advanced Vector Extensions) 2.0 + ## instructions. + +proc hasAvx512f*(): bool {.inline.} = + return hasAvx512fImpl + ## **(x86 Only)** + ## + ## Reports `true` if the hardware can use AVX (Advanced Vector Extensions) + ## 512-bit F (Foundation) instructions. + +proc hasAvx512dq*(): bool {.inline.} = + return hasAvx512dqImpl + ## **(x86 Only)** + ## + ## Reports `true` if the hardware can use AVX (Advanced Vector Extensions) + ## 512-bit DQ (Doubleword + Quadword) instructions. + +proc hasAvx512ifma*(): bool {.inline.} = + return hasAvx512ifmaImpl + ## **(x86 Only)** + ## + ## Reports `true` if the hardware can use AVX (Advanced Vector Extensions) + ## 512-bit IFMA (Integer Fused Multiply Accumulation) instructions. 
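Among the AVX-512 subsets just detected, IFMA is the one most relevant to big-integer code, since its 52x52-bit fused multiply-add lanes can back a vectorized multi-precision multiplication. A hedged sketch of the corresponding capability check (the import path and proc name are illustrative, not part of this diff):

```nim
import cpuinfo_x86  # hypothetical import path for this detection module

proc canUseIfmaLimbs*(): bool =
  ## AVX-512 IFMA exposes 52x52 -> 104-bit fused multiply-add lanes,
  ## a possible backend for vectorized multi-precision multiplication.
  ## (A production check would also look at the VL subset to get 256-bit encodings.)
  result = hasAvx512f() and hasAvx512ifma()
```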
+ +proc hasAvx512pf*(): bool {.inline.} = + return hasAvx512pfImpl + ## **(x86 Only)** + ## + ## Reports `true` if the hardware can use AVX (Advanced Vector Extensions) + ## 512-bit PF (Prefetch) instructions. + +proc hasAvx512er*(): bool {.inline.} = + return hasAvx512erImpl + ## **(x86 Only)** + ## + ## Reports `true` if the hardware can use AVX (Advanced Vector Extensions) + ## 512-bit ER (Exponential and Reciprocal) instructions. + +proc hasAvx512cd*(): bool {.inline.} = + return hasAvx512cdImpl + ## **(x86 Only)** + ## + ## Reports `true` if the hardware can use AVX (Advanced Vector Extensions) + ## 512-bit CD (Conflict Detection) instructions. + +proc hasAvx512bw*(): bool {.inline.} = + return hasAvx512bwImpl + ## **(x86 Only)** + ## + ## Reports `true` if the hardware can use AVX (Advanced Vector Extensions) + ## 512-bit BW (Byte and Word) instructions. + +proc hasAvx512vl*(): bool {.inline.} = + return hasAvx512vlImpl + ## **(x86 Only)** + ## + ## Reports `true` if the hardware can use AVX (Advanced Vector Extensions) + ## 512-bit VL (Vector Length) instructions. + +proc hasAvx512vbmi*(): bool {.inline.} = + return hasAvx512vbmiImpl + ## **(x86 Only)** + ## + ## Reports `true` if the hardware can use AVX (Advanced Vector Extensions) + ## 512-bit VBMI (Vector Byte Manipulation) 1.0 instructions. + +proc hasAvx512vbmi2*(): bool {.inline.} = + return hasAvx512vbmi2Impl + ## **(x86 Only)** + ## + ## Reports `true` if the hardware can use AVX (Advanced Vector Extensions) + ## 512-bit VBMI (Vector Byte Manipulation) 2.0 instructions. + +proc hasAvx512vpopcntdq*(): bool {.inline.} = + return hasAvx512vpopcntdqImpl + ## **(x86 Only)** + ## + ## Reports `true` if the hardware can use the AVX (Advanced Vector Extensions) + ## 512-bit `VPOPCNTDQ` (population count, i.e. determine number of flipped + ## bits) instruction. + +proc hasAvx512vnni*(): bool {.inline.} = + return hasAvx512vnniImpl + ## **(x86 Only)** + ## + ## Reports `true` if the hardware can use AVX (Advanced Vector Extensions) + ## 512-bit VNNI (Vector Neural Network) instructions. + +proc hasAvx512vnniw4*(): bool {.inline.} = + return hasAvx512vnniw4Impl + ## **(x86 Only)** + ## + ## Reports `true` if the hardware can use AVX (Advanced Vector Extensions) + ## 512-bit 4VNNIW (Vector Neural Network Word Variable Percision) + ## instructions. + +proc hasAvx512fmaps4*(): bool {.inline.} = + return hasAvx512fmaps4Impl + ## **(x86 Only)** + ## + ## Reports `true` if the hardware can use AVX (Advanced Vector Extensions) + ## 512-bit 4FMAPS (Fused-Multiply-Accumulation Single-percision) instructions. + +proc hasAvx512bitalg*(): bool {.inline.} = + return hasAvx512bitalgImpl + ## **(x86 Only)** + ## + ## Reports `true` if the hardware can use AVX (Advanced Vector Extensions) + ## 512-bit BITALG (Bit Algorithms) instructions. + +proc hasAvx512bfloat16*(): bool {.inline.} = + return hasAvx512bfloat16Impl + ## **(x86 Only)** + ## Reports `true` if the hardware can use AVX (Advanced Vector Extensions) + ## 512-bit BFLOAT16 (8-bit exponent, 7-bit mantissa) instructions used by + ## Intel DL (Deep Learning) Boost. + +proc hasAvx512vp2intersect*(): bool {.inline.} = + return hasAvx512vp2intersectImpl + ## **(x86 Only)** + ## + ## Reports `true` if the hardware can use AVX (Advanced Vector Extensions) + ## 512-bit VP2INTERSECT (Compute Intersections between Dualwords + Quadwords) + ## instructions. 
+ +proc hasRdrand*(): bool {.inline.} = + return hasRdrandImpl + ## **(x86 Only)** + ## + ## Reports `true` if the hardware has support for the `RDRAND` instruction, + ## i.e. Intel on-CPU hardware random number generation. + +proc hasRdseed*(): bool {.inline.} = + return hasRdseedImpl + ## **(x86 Only)** + ## + ## Reports `true` if the hardware has support for the `RDSEED` instruction, + ## i.e. Intel on-CPU hardware random number generation (used for seeding other + ## PRNGs). + +proc hasMovBigEndian*(): bool {.inline.} = + return hasMovBigEndianImpl + ## **(x86 Only)** + ## + ## Reports `true` if the hardware has support for the `MOVBE` instruction for + ## endianness/byte-order switching. + +proc hasPopcnt*(): bool {.inline.} = + return hasPopcntImpl + ## **(x86 Only)** + ## + ## Reports `true` if the hardware has support for the `POPCNT` (population + ## count, i.e. determine number of flipped bits) instruction. + +proc hasFma3*(): bool {.inline.} = + return hasFma3Impl + ## **(x86 Only)** + ## + ## Reports `true` if the hardware has support for the FMA3 (Fused Multiply + ## Accumulation 3-operand) SIMD instructions. + +proc hasFma4*(): bool {.inline.} = + return hasFma4Impl + ## **(x86 Only)** + ## + ## Reports `true` if the hardware has support for the FMA4 (Fused Multiply + ## Accumulation 4-operand) SIMD instructions. + +proc hasXop*(): bool {.inline.} = + return hasXopImpl + ## **(x86 Only)** + ## + ## Reports `true` if the hardware has support for the XOP (eXtended + ## Operations) SIMD instructions. These instructions are exclusive to the + ## Bulldozer AMD microarchitecture family (i.e. Bulldozer, Piledriver, + ## Steamroller, and Excavator) and were phased out with the release of the Zen + ## design. + +proc hasCas8B*(): bool {.inline.} = + return hasCas8BImpl + ## **(x86 Only)** + ## + ## Reports `true` if the hardware has support for the (`LOCK`-able) + ## `CMPXCHG8B` 64-bit compare-and-swap instruction. + +proc hasCas16B*(): bool {.inline.} = + return hasCas16BImpl + ## **(x86 Only)** + ## + ## Reports `true` if the hardware has support for the (`LOCK`-able) + ## `CMPXCHG16B` 128-bit compare-and-swap instruction. + +proc hasAbm*(): bool {.inline.} = + return hasAbmImpl + ## **(x86 Only)** + ## + ## Reports `true` if the hardware has support for ABM (Advanced Bit + ## Manipulation) instructions (i.e. `POPCNT` and `LZCNT` for counting leading + ## zeroes). + +proc hasBmi1*(): bool {.inline.} = + return hasBmi1Impl + ## **(x86 Only)** + ## + ## Reports `true` if the hardware has support for BMI (Bit Manipulation) 1.0 + ## instructions. + +proc hasBmi2*(): bool {.inline.} = + return hasBmi2Impl + ## **(x86 Only)** + ## + ## Reports `true` if the hardware has support for BMI (Bit Manipulation) 2.0 + ## instructions. + +proc hasTsxHle*(): bool {.inline.} = + return hasTsxHleImpl + ## **(x86 Only)** + ## + ## Reports `true` if the hardware has support for HLE (Hardware Lock Elision) + ## as part of Intel's TSX (Transactional Synchronization Extensions). + +proc hasTsxRtm*(): bool {.inline.} = + return hasTsxRtmImpl + ## **(x86 Only)** + ## + ## Reports `true` if the hardware has support for RTM (Restricted + ## Transactional Memory) as part of Intel's TSX (Transactional Synchronization + ## Extensions). + +proc hasAdx*(): bool {.inline.} = + return hasAdxImpl + ## **(x86 Only)** + ## + ## Reports `true` if the hardware has support for ADX (Multi-precision + ## Add-Carry Extensions) instructions.
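`hasAdx` and `hasBmi2` just above are the two checks that matter for MULX/ADCX/ADOX-based multi-precision kernels: MULX is part of BMI2, while ADCX/ADOX are part of ADX. A sketch of a runtime backend selection follows; the import path, enum, and proc names are hypothetical and not introduced by this diff:

```nim
import cpuinfo_x86  # hypothetical import path for this detection module

type MulBackend = enum
  mbPortable  # plain add-with-carry chains
  mbAdxBmi2   # MULX + ADCX/ADOX dual carry chains

proc selectMulBackend(): MulBackend =
  # MULX requires BMI2 and ADCX/ADOX require ADX,
  # so both feature bits must be present before taking the fast path.
  if hasAdx() and hasBmi2():
    result = mbAdxBmi2
  else:
    result = mbPortable

let mulBackend = selectMulBackend()
```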
+ +proc hasSgx*(): bool {.inline.} = + return hasSgxImpl + ## **(x86 Only)** + ## + ## Reports `true` if the hardware has support for SGX (Software Guard + ## eXtensions) memory encryption technology. + +proc hasGfni*(): bool {.inline.} = + return hasGfniImpl + ## **(x86 Only)** + ## + ## Reports `true` if the hardware has support for GFNI (Galois Field Affine + ## Transformation) instructions. + +proc hasAes*(): bool {.inline.} = + return hasAesImpl + ## **(x86 Only)** + ## + ## Reports `true` if the hardware has support for AESNI (Advanced Encryption + ## Standard) instructions. + +proc hasVaes*(): bool {.inline.} = + return hasVaesImpl + ## **(x86 Only)** + ## + ## Reports `true` if the hardware has support for VAES (Vectorized Advanced + ## Encryption Standard) instructions. + +proc hasVpclmulqdq*(): bool {.inline.} = + return hasVpclmulqdqImpl + ## **(x86 Only)** + ## + ## Reports `true` if the hardware has support for `VCLMULQDQ` (512 and 256-bit + ## Carryless Multiplication) instructions. + +proc hasPclmulqdq*(): bool {.inline.} = + return hasPclmulqdqImpl + ## **(x86 Only)** + ## + ## Reports `true` if the hardware has support for `PCLMULQDQ` (128-bit + ## Carryless Multiplication) instructions. + +proc hasNxBit*(): bool {.inline.} = + return hasNxBitImpl + ## **(x86 Only)** + ## + ## Reports `true` if the hardware has support for NX-bit (No-eXecute) + ## technology for marking pages of memory as non-executable. + +proc hasFloat16c*(): bool {.inline.} = + return hasFloat16cImpl + ## **(x86 Only)** + ## + ## Reports `true` if the hardware has support for F16C instructions, used for + ## converting 16-bit "half-percision" floating-point values to and from + ## single-percision floating-point values. + +proc hasSha*(): bool {.inline.} = + return hasShaImpl + ## **(x86 Only)** + ## + ## Reports `true` if the hardware has support for SHA (Secure Hash Algorithm) + ## instructions. + +proc hasClflush*(): bool {.inline.} = + return hasClflushImpl + ## **(x86 Only)** + ## + ## Reports `true` if the hardware has support for the `CLFLUSH` (Cache-line + ## Flush) instruction. + +proc hasClflushOpt*(): bool {.inline.} = + return hasClflushOptImpl + ## **(x86 Only)** + ## + ## Reports `true` if the hardware has support for the `CLFLUSHOPT` (Cache-line + ## Flush Optimized) instruction. + +proc hasClwb*(): bool {.inline.} = + return hasClwbImpl + ## **(x86 Only)** + ## + ## Reports `true` if the hardware has support for the `CLWB` (Cache-line Write + ## Back) instruction. + +proc hasPrefetchWT1*(): bool {.inline.} = + return hasPrefetchWT1Impl + ## **(x86 Only)** + ## + ## Reports `true` if the hardware has support for the `PREFECTHWT1` + ## instruction. + +proc hasMpx*(): bool {.inline.} = + return hasMpxImpl + ## **(x86 Only)** + ## + ## Reports `true` if the hardware has support for MPX (Memory Protection + ## eXtensions). diff --git a/constantine/primitives/macro_assembler_x86.nim b/constantine/primitives/macro_assembler_x86.nim new file mode 100644 index 0000000..f9630f7 --- /dev/null +++ b/constantine/primitives/macro_assembler_x86.nim @@ -0,0 +1,620 @@ +# Constantine +# Copyright (c) 2018-2019 Status Research & Development GmbH +# Copyright (c) 2020-Present Mamy André-Ratsimbazafy +# Licensed and distributed under either of +# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT). +# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0). +# at your option. 
This file may not be copied, modified, or distributed except according to those terms. + +import std/[macros, strutils, sets, hashes] + +# A compile-time inline assembler + +type + RM* = enum + ## Register or Memory operand + # https://gcc.gnu.org/onlinedocs/gcc/Simple-Constraints.html + Reg = "r" + Mem = "m" + AnyRegOrMem = "rm" # use "r, m" instead? + Imm = "i" + MemOffsettable = "o" + AnyRegMemImm = "g" + AnyMemOffImm = "oi" + AnyRegImm = "ri" + + PointerInReg = "r" # Store an array pointer + ElemsInReg = "r" # Store each individual array element in reg + + # Specific registers + RCX = "c" + RDX = "d" + R8 = "r8" + + RAX = "a" + + Register* = enum + rbx, rdx, r8, rax + + Constraint* = enum + ## GCC extended assembly modifier + Input = "" + Input_Commutative = "%" + Input_EarlyClobber = "&" + Output_Overwrite = "=" + Output_EarlyClobber = "=&" + InputOutput = "+" + InputOutput_EnsureClobber = "+&" # For register InputOutput, clang needs "+&" bug? + + OpKind = enum + kRegister + kFromArray + kArrayAddr + + Operand* = object + desc*: OperandDesc + case kind: OpKind + of kRegister: + discard + of kFromArray: + offset: int + of kArrayAddr: + buf: seq[Operand] + + OperandDesc* = ref object + asmId*: string # [a] - ASM id + nimSymbol*: NimNode # a - Nim nimSymbol + rm*: RM + constraint*: Constraint + cEmit*: string # C emit for example a->limbs + + OperandArray* = object + nimSymbol*: NimNode + buf: seq[Operand] + + OperandReuse* = object + # Allow reusing a register + asmId*: string + + Assembler_x86* = object + code: string + operands: HashSet[OperandDesc] + wordBitWidth*: int + wordSize: int + areFlagsClobbered: bool + isStackClobbered: bool + + Stack* = object + +const SpecificRegisters = {RCX, RDX, R8, RAX} +const OutputReg = {Output_EarlyClobber, InputOutput, InputOutput_EnsureClobber, Output_Overwrite} + +func hash(od: OperandDesc): Hash = + {.noSideEffect.}: + hash($od.nimSymbol) + +# TODO: remove the need of OperandArray + +func len*(opArray: OperandArray): int = + opArray.buf.len + +proc `[]`*(opArray: OperandArray, index: int): Operand = + opArray.buf[index] + +func `[]`*(opArray: var OperandArray, index: int): var Operand = + opArray.buf[index] + +func `[]`*(arrayAddr: Operand, index: int): Operand = + arrayAddr.buf[index] + +func `[]`*(arrayAddr: var Operand, index: int): var Operand = + arrayAddr.buf[index] + +func init*(T: type Assembler_x86, Word: typedesc[SomeUnsignedInt]): Assembler_x86 = + result.wordSize = sizeof(Word) + result.wordBitWidth = result.wordSize * 8 + +func init*(T: type OperandArray, nimSymbol: NimNode, len: int, rm: RM, constraint: Constraint): OperandArray = + doAssert rm in { + MemOffsettable, + AnyMemOffImm, + PointerInReg, + ElemsInReg + } or rm in SpecificRegisters + + result.buf.setLen(len) + + # We need to dereference the hidden pointer of var param + let isHiddenDeref = nimSymbol.kind == nnkHiddenDeref + let nimSymbol = if isHiddenDeref: nimSymbol[0] + else: nimSymbol + {.noSideEffect.}: + let symStr = $nimSymbol + + result.nimSymbol = nimSymbol + + if rm in {PointerInReg, MemOffsettable, AnyMemOffImm} or + rm in SpecificRegisters: + let desc = OperandDesc( + asmId: "[" & symStr & "]", + nimSymbol: nimSymbol, + rm: rm, + constraint: constraint, + cEmit: symStr + ) + for i in 0 ..< len: + result.buf[i] = Operand( + desc: desc, + kind: kFromArray, + offset: i + ) + else: + # We can't store an array in register so we create assign individual register + # per array elements instead + for i in 0 ..< len: + result.buf[i] = Operand( + desc: OperandDesc( + 
asmId: "[" & symStr & $i & "]", + nimSymbol: ident(symStr & $i), + rm: rm, + constraint: constraint, + cEmit: symStr & "[" & $i & "]" + ), + kind: kRegister + ) + +func asArrayAddr*(op: Operand, len: int): Operand = + ## Use the value stored in an operand as an array address + doAssert op.desc.rm in {Reg, PointerInReg, ElemsInReg}+SpecificRegisters + result = Operand( + kind: kArrayAddr, + desc: nil, + buf: newSeq[Operand](len) + ) + for i in 0 ..< len: + result.buf[i] = Operand( + desc: op.desc, + kind: kFromArray, + offset: i + ) + +# Code generation +# ------------------------------------------------------------------------------------------------------------ + +func generate*(a: Assembler_x86): NimNode = + ## Generate the inline assembly code from + ## the desired instruction + + var + outOperands: seq[string] + inOperands: seq[string] + memClobbered = false + + for odesc in a.operands.items(): + var decl: string + if odesc.rm in SpecificRegisters: + # [a] "rbx" (`a`) + decl = odesc.asmId & "\"" & $odesc.constraint & $odesc.rm & "\"" & + " (`" & odesc.cEmit & "`)" + elif odesc.rm in {Mem, AnyRegOrMem, MemOffsettable, AnyRegMemImm, AnyMemOffImm}: + # [a] "+r" (`*a`) + # We need to deref the pointer to memory + decl = odesc.asmId & " \"" & $odesc.constraint & $odesc.rm & "\"" & + " (`*" & odesc.cEmit & "`)" + else: + # [a] "+r" (`a[0]`) + decl = odesc.asmId & " \"" & $odesc.constraint & $odesc.rm & "\"" & + " (`" & odesc.cEmit & "`)" + + if odesc.constraint in {Input, Input_Commutative}: + inOperands.add decl + else: + outOperands.add decl + + if odesc.rm == PointerInReg and odesc.constraint in {Output_Overwrite, Output_EarlyClobber, InputOutput, InputOutput_EnsureClobber}: + memClobbered = true + + var params: string + params.add ": " & outOperands.join(", ") & '\n' + params.add ": " & inOperands.join(", ") & '\n' + + let clobbers = [(a.isStackClobbered, "sp"), + (a.areFlagsClobbered, "cc"), + (memClobbered, "memory")] + var clobberList = ": " + for (clobbered, str) in clobbers: + if clobbered: + if clobberList.len == 2: + clobberList.add "\"" & str & '\"' + else: + clobberList.add ", \"" & str & '\"' + + params.add clobberList + + # GCC will optimize ASM away if there are no + # memory operand or volatile + memory clobber + # https://stackoverflow.com/questions/34244185/looping-over-arrays-with-inline-assembly + + # result = nnkAsmStmt.newTree( + # newEmptyNode(), + # newLit(asmStmt & params) + # ) + + var asmStmt = "\"" & a.code.replace("\n", "\\n\"\n\"") + asmStmt.setLen(asmStmt.len - 1) # drop the last quote + + result = nnkPragma.newTree( + nnkExprColonExpr.newTree( + ident"emit", + newLit( + "asm volatile(\n" & asmStmt & params & ");" + ) + ) + ) + +func getStrOffset(a: Assembler_x86, op: Operand): string = + if op.kind != kFromArray: + return "%" & op.desc.asmId + + # Beware GCC / Clang differences with array offsets + # https://lists.llvm.org/pipermail/llvm-dev/2017-August/116202.html + + if op.desc.rm in {Mem, AnyRegOrMem, MemOffsettable, AnyMemOffImm, AnyRegMemImm}: + # Directly accessing memory + if op.offset == 0: + return "%" & op.desc.asmId + if defined(gcc): + return $(op.offset * a.wordSize) & "+%" & op.desc.asmId + elif defined(clang): + return $(op.offset * a.wordSize) & "%" & op.desc.asmId + else: + error "Unconfigured compiler" + elif op.desc.rm == PointerInReg or + op.desc.rm in SpecificRegisters or + (op.desc.rm == ElemsInReg and op.kind == kFromArray): + if op.offset == 0: + return "(%" & $op.desc.asmId & ')' + if defined(gcc): + return $(op.offset * a.wordSize) & 
"+(%" & $op.desc.asmId & ')' + elif defined(clang): + return $(op.offset * a.wordSize) & "(%" & $op.desc.asmId & ')' + else: + error "Unconfigured compiler" + else: + error "Unsupported: " & $op.desc.rm.ord + +func codeFragment(a: var Assembler_x86, instr: string, op: Operand) = + # Generate a code fragment + let off = a.getStrOffset(op) + + if a.wordBitWidth == 64: + a.code &= instr & "q " & off & '\n' + elif a.wordBitWidth == 32: + a.code &= instr & "l " & off & '\n' + else: + error "Unsupported bitwidth: " & $a.wordBitWidth + + a.operands.incl op.desc + +func codeFragment(a: var Assembler_x86, instr: string, op0, op1: Operand) = + # Generate a code fragment + # ⚠️ Warning: + # The caller should deal with destination/source operand + # so that it fits GNU Assembly + let off0 = a.getStrOffset(op0) + let off1 = a.getStrOffset(op1) + + if a.wordBitWidth == 64: + a.code &= instr & "q " & off0 & ", " & off1 & '\n' + elif a.wordBitWidth == 32: + a.code &= instr & "l " & off0 & ", " & off1 & '\n' + else: + error "Unsupported bitwidth: " & $a.wordBitWidth + + a.operands.incl op0.desc + a.operands.incl op1.desc + +func codeFragment(a: var Assembler_x86, instr: string, imm: int, op: Operand) = + # Generate a code fragment + # ⚠️ Warning: + # The caller should deal with destination/source operand + # so that it fits GNU Assembly + let off = a.getStrOffset(op) + + if a.wordBitWidth == 64: + a.code &= instr & "q $" & $imm & ", " & off & '\n' + else: + a.code &= instr & "l $" & $imm & ", " & off & '\n' + + a.operands.incl op.desc + +func codeFragment(a: var Assembler_x86, instr: string, imm: int, reg: Register) = + # Generate a code fragment + # ⚠️ Warning: + # The caller should deal with destination/source operand + # so that it fits GNU Assembly + if a.wordBitWidth == 64: + a.code &= instr & "q $" & $imm & ", %%" & $reg & '\n' + else: + a.code &= instr & "l $" & $imm & ", %%" & $reg & '\n' + +func codeFragment(a: var Assembler_x86, instr: string, reg0, reg1: Register) = + # Generate a code fragment + # ⚠️ Warning: + # The caller should deal with destination/source operand + # so that it fits GNU Assembly + if a.wordBitWidth == 64: + a.code &= instr & "q %%" & $reg0 & ", %%" & $reg1 & '\n' + else: + a.code &= instr & "l %%" & $reg0 & ", %%" & $reg1 & '\n' + +func codeFragment(a: var Assembler_x86, instr: string, imm: int, reg: OperandReuse) = + # Generate a code fragment + # ⚠️ Warning: + # The caller should deal with destination/source operand + # so that it fits GNU Assembly + if a.wordBitWidth == 64: + a.code &= instr & "q $" & $imm & ", %" & $reg.asmId & '\n' + else: + a.code &= instr & "l $" & $imm & ", %" & $reg.asmId & '\n' + +func codeFragment(a: var Assembler_x86, instr: string, reg0, reg1: OperandReuse) = + # Generate a code fragment + # ⚠️ Warning: + # The caller should deal with destination/source operand + # so that it fits GNU Assembly + if a.wordBitWidth == 64: + a.code &= instr & "q %" & $reg0.asmId & ", %" & $reg1.asmId & '\n' + else: + a.code &= instr & "l %" & $reg0.asmId & ", %" & $reg1.asmId & '\n' + +func codeFragment(a: var Assembler_x86, instr: string, reg0: OperandReuse, reg1: Operand) = + # Generate a code fragment + # ⚠️ Warning: + # The caller should deal with destination/source operand + # so that it fits GNU Assembly + if a.wordBitWidth == 64: + a.code &= instr & "q %" & $reg0.asmId & ", %" & $reg1.desc.asmId & '\n' + else: + a.code &= instr & "l %" & $reg0.asmId & ", %" & $reg1.desc.asmId & '\n' + + a.operands.incl reg1.desc + +func reuseRegister*(reg: OperandArray): 
OperandReuse = + # TODO: disable the reg input + doAssert reg.buf[0].desc.constraint == InputOutput + result.asmId = reg.buf[0].desc.asmId + +func comment*(a: var Assembler_x86, comment: string) = + # Add a comment + a.code &= "# " & comment & '\n' + +func repackRegisters*(regArr: OperandArray, regs: varargs[Operand]): OperandArray = + ## Extend an array of registers with extra registers + result.buf = regArr.buf + result.buf.add regs + result.nimSymbol = nil + +# Instructions +# ------------------------------------------------------------------------------------------------------------ + +func add*(a: var Assembler_x86, dst, src: Operand) = + ## Does: dst <- dst + src + doAssert dst.desc.constraint in OutputReg + a.codeFragment("add", src, dst) + a.areFlagsClobbered = true + +func adc*(a: var Assembler_x86, dst, src: Operand) = + ## Does: dst <- dst + src + carry + doAssert dst.desc.constraint in OutputReg + a.codeFragment("adc", src, dst) + a.areFlagsClobbered = true + + if dst.desc.rm != Reg: + {.warning: "Using addcarry with a memory destination, this incurs significant performance penalties.".} + +func adc*(a: var Assembler_x86, dst: Operand, imm: int) = + ## Does: dst <- dst + imm + carry + doAssert dst.desc.constraint in OutputReg + a.codeFragment("adc", imm, dst) + a.areFlagsClobbered = true + + if dst.desc.rm != Reg: + {.warning: "Using addcarry with a memory destination, this incurs significant performance penalties.".} + +func sub*(a: var Assembler_x86, dst, src: Operand) = + ## Does: dst <- dst - src + doAssert dst.desc.constraint in OutputReg + a.codeFragment("sub", src, dst) + a.areFlagsClobbered = true + +func sbb*(a: var Assembler_x86, dst, src: Operand) = + ## Does: dst <- dst - src - borrow + doAssert dst.desc.constraint in OutputReg + a.codeFragment("sbb", src, dst) + a.areFlagsClobbered = true + + if dst.desc.rm != Reg: + {.warning: "Using subborrow with a memory destination, this incurs significant performance penalties.".} + +func sbb*(a: var Assembler_x86, dst: Operand, imm: int) = + ## Does: dst <- dst - imm - borrow + doAssert dst.desc.constraint in OutputReg + a.codeFragment("sbb", imm, dst) + a.areFlagsClobbered = true + + if dst.desc.rm != Reg: + {.warning: "Using subborrow with a memory destination, this incurs significant performance penalties.".} + +func sbb*(a: var Assembler_x86, dst: Register, imm: int) = + ## Does: dst <- dst - imm - borrow + a.codeFragment("sbb", imm, dst) + a.areFlagsClobbered = true + +func sbb*(a: var Assembler_x86, dst, src: Register) = + ## Does: dst <- dst - src - borrow + a.codeFragment("sbb", src, dst) + a.areFlagsClobbered = true + +func sbb*(a: var Assembler_x86, dst: OperandReuse, imm: int) = + ## Does: dst <- dst - imm - borrow + a.codeFragment("sbb", imm, dst) + a.areFlagsClobbered = true + +func sbb*(a: var Assembler_x86, dst, src: OperandReuse) = + ## Does: dst <- dst - src - borrow + a.codeFragment("sbb", src, dst) + a.areFlagsClobbered = true + +func sar*(a: var Assembler_x86, dst: Operand, imm: int) = + ## Does Arithmetic Right Shift (i.e.
with sign extension) + doAssert dst.desc.constraint in OutputReg + a.codeFragment("sar", imm, dst) + a.areFlagsClobbered = true + +func `and`*(a: var Assembler_x86, dst: OperandReuse, imm: int) = + ## Compute the bitwise AND of x and y and + ## set the Sign, Zero and Parity flags + a.codeFragment("and", imm, dst) + a.areFlagsClobbered = true + +func `and`*(a: var Assembler_x86, dst, src: Operand) = + ## Compute the bitwise AND of x and y and + ## set the Sign, Zero and Parity flags + a.codeFragment("and", src, dst) + a.areFlagsClobbered = true + +func `and`*(a: var Assembler_x86, dst: Operand, src: OperandReuse) = + ## Compute the bitwise AND of x and y and + ## set the Sign, Zero and Parity flags + a.codeFragment("and", src, dst) + a.areFlagsClobbered = true + +func test*(a: var Assembler_x86, x, y: Operand) = + ## Compute the bitwise AND of x and y and + ## set the Sign, Zero and Parity flags + a.codeFragment("test", x, y) + a.areFlagsClobbered = true + +func `xor`*(a: var Assembler_x86, x, y: Operand) = + ## Compute the bitwise xor of x and y and + ## reset all flags + a.codeFragment("xor", x, y) + a.areFlagsClobbered = true + +func mov*(a: var Assembler_x86, dst, src: Operand) = + ## Does: dst <- src + doAssert dst.desc.constraint in OutputReg, $dst.repr + + a.codeFragment("mov", src, dst) + # No clobber + +func mov*(a: var Assembler_x86, dst: Operand, imm: int) = + ## Does: dst <- imm + doAssert dst.desc.constraint in OutputReg, $dst.repr + + a.codeFragment("mov", imm, dst) + # No clobber + +func cmovc*(a: var Assembler_x86, dst, src: Operand) = + ## Does: dst <- src if the carry flag is set + doAssert dst.desc.rm in {Reg, ElemsInReg}, "The destination operand must be a register: " & $dst.repr + doAssert dst.desc.constraint in OutputReg, $dst.repr + + a.codeFragment("cmovc", src, dst) + # No clobber + +func cmovnc*(a: var Assembler_x86, dst, src: Operand) = + ## Does: dst <- src if the carry flag is not set + doAssert dst.desc.rm in {Reg, ElemsInReg}, "The destination operand must be a register: " & $dst.repr + doAssert dst.desc.constraint in {Output_EarlyClobber, InputOutput, Output_Overwrite}, $dst.repr + + a.codeFragment("cmovnc", src, dst) + # No clobber + +func cmovz*(a: var Assembler_x86, dst, src: Operand) = + ## Does: dst <- src if the zero flag is set + doAssert dst.desc.rm in {Reg, ElemsInReg}, "The destination operand must be a register: " & $dst.repr + doAssert dst.desc.constraint in OutputReg, $dst.repr + + a.codeFragment("cmovz", src, dst) + # No clobber + +func cmovnz*(a: var Assembler_x86, dst, src: Operand) = + ## Does: dst <- src if the zero flag is not set + doAssert dst.desc.rm in {Reg, ElemsInReg}, "The destination operand must be a register: " & $dst.repr + doAssert dst.desc.constraint in OutputReg, $dst.repr + + a.codeFragment("cmovnz", src, dst) + # No clobber + +func cmovs*(a: var Assembler_x86, dst, src: Operand) = + ## Does: dst <- src if the sign flag is set + doAssert dst.desc.rm in {Reg, ElemsInReg}, "The destination operand must be a register: " & $dst.repr + doAssert dst.desc.constraint in OutputReg, $dst.repr + + a.codeFragment("cmovs", src, dst) + # No clobber + +func mul*(a: var Assembler_x86, dHi, dLo: Register, src0: Operand, src1: Register) = + ## Does (dHi, dLo) <- src0 * src1 + doAssert src1 == rax, "MUL requires the RAX register" + doAssert dHi == rdx, "MUL requires the RDX register" + doAssert dLo == rax, "MUL requires the RAX register" + + a.codeFragment("mul", src0) + +func imul*(a: var Assembler_x86, dst, src: Operand) = + ## Does dst <- dst *
src, keeping only the low half + doAssert dst.desc.rm in {Reg, ElemsInReg}, "The destination operand must be a register: " & $dst.repr + doAssert dst.desc.constraint in OutputReg, $dst.repr + + a.codeFragment("imul", src, dst) + +func mulx*(a: var Assembler_x86, dHi, dLo, src0: Operand, src1: Register) = + ## Does (dHi, dLo) <- src0 * src1 + doAssert src1 == rdx, "MULX requires the RDX register" + doAssert dHi.desc.rm in {Reg, ElemsInReg} or dHi.desc.rm in SpecificRegisters, + "The destination operand must be a register " & $dHi.repr + doAssert dLo.desc.rm in {Reg, ElemsInReg} or dLo.desc.rm in SpecificRegisters, + "The destination operand must be a register " & $dLo.repr + doAssert dHi.desc.constraint in OutputReg + doAssert dLo.desc.constraint in OutputReg + + let off0 = a.getStrOffset(src0) + + # Annoying AT&T syntax + if a.wordBitWidth == 64: + a.code &= "mulxq " & off0 & ", %" & $dLo.desc.asmId & ", %" & $dHi.desc.asmId & '\n' + else: + a.code &= "mulxl " & off0 & ", %" & $dLo.desc.asmId & ", %" & $dHi.desc.asmId & '\n' + + a.operands.incl src0.desc + +func adcx*(a: var Assembler_x86, dst, src: Operand) = + ## Does: dst <- dst + src + carry + ## and only sets the carry flag + doAssert dst.desc.constraint in OutputReg, $dst.repr + doAssert dst.desc.rm in {Reg, ElemsInReg}, "The destination operand must be a register: " & $dst.repr + a.codeFragment("adcx", src, dst) + a.areFlagsClobbered = true + +func adox*(a: var Assembler_x86, dst, src: Operand) = + ## Does: dst <- dst + src + overflow + ## and only sets the overflow flag + doAssert dst.desc.constraint in OutputReg, $dst.repr + doAssert dst.desc.rm in {Reg, ElemsInReg}, "The destination operand must be a register: " & $dst.repr + a.codeFragment("adox", src, dst) + a.areFlagsClobbered = true + +func push*(a: var Assembler_x86, _: type Stack, reg: Operand) = + ## Push the content of register on the stack + doAssert reg.desc.rm in {Reg, PointerInReg, ElemsInReg}+SpecificRegisters, "The destination operand must be a register: " & $reg.repr + a.codeFragment("push", reg) + a.isStackClobbered = true + +func pop*(a: var Assembler_x86, _: type Stack, reg: Operand) = + ## Pop the content of register on the stack + doAssert reg.desc.rm in {Reg, PointerInReg, ElemsInReg}+SpecificRegisters, "The destination operand must be a register: " & $reg.repr + a.codeFragment("pop", reg) + a.isStackClobbered = true diff --git a/constantine/primitives/research/README.md b/constantine/primitives/research/README.md deleted file mode 100644 index 32cc175..0000000 --- a/constantine/primitives/research/README.md +++ /dev/null @@ -1,45 +0,0 @@ -# Compiler for generic inline assembly code-generation - -This folder holds alternative implementations of primitives -that uses inline assembly. - -This avoids the pitfalls of traditional compiler bad code generation -for multiprecision arithmetic (see GCC https://gcc.godbolt.org/z/2h768y) -or unsupported features like handling 2 carry chains for -multiplication using MULX/ADOX/ADCX. - -To be generic over multiple curves, -for example BN254 requires 4 words and BLS12-381 requires 6 words of size 64 bits, -the compilers is implemented as a set of macros that generate inline assembly. - -⚠⚠⚠ Warning! Warning! Warning! - -This is a significant sacrifice of code readability, portability, auditability and maintainability in favor of performance. - -This combines 2 of the most notorious ways to obfuscate your code: -* metaprogramming and macros -* inline assembly - -Adventurers beware: not for the faint of heart. 
- -This is unfinished, untested, unused, unfuzzed and just a proof-of-concept at the moment.* - -_* I take no responsibility if this smashes your stack, eats your cat, hides a skeleton in your closet, warps a pink elephant in the room, summons untold eldritch horrors or causes the heat death of the universe. You have been warned._ - -_The road to debugging hell is paved with metaprogrammed assembly optimizations._ - -_For my defence, OpenSSL assembly is generated by a Perl script and neither Perl nor the generated Assembly are type-checked by a dependently-typed compiler._ - -## References - -Multiprecision (Montgomery) Multiplication & Squaring in Assembly - -- Intel MULX/ADCX/ADOX Table 2 p13: https://www.intel.cn/content/dam/www/public/us/en/documents/white-papers/ia-large-integer-arithmetic-paper.pdf -- Squaring: https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/large-integer-squaring-ia-paper.pdf -- https://eprint.iacr.org/eprint-bin/getfile.pl?entry=2017/558&version=20170608:200345&file=558.pdf -- https://github.com/intel/ipp-crypto -- https://github.com/herumi/mcl - -Experimentations in Nim - -- https://github.com/mratsim/finite-fields diff --git a/constantine/primitives/research/addcarry_subborrow_compiler.nim b/constantine/primitives/research/addcarry_subborrow_compiler.nim deleted file mode 100644 index 3391577..0000000 --- a/constantine/primitives/research/addcarry_subborrow_compiler.nim +++ /dev/null @@ -1,133 +0,0 @@ -# Constantine -# Copyright (c) 2018-2019 Status Research & Development GmbH -# Copyright (c) 2020-Present Mamy André-Ratsimbazafy -# Licensed and distributed under either of -# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT). -# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0). -# at your option. This file may not be copied, modified, or distributed except according to those terms. - -# ############################################################ -# -# Add-with-carry and Sub-with-borrow -# -# ############################################################ -# -# This is a proof-of-concept optimal add-with-carry -# compiler implemented as Nim macros. -# -# This overcome the bad GCC codegen aven with addcary_u64 intrinsic. 
- -import std/macros - -func wordsRequired(bits: int): int {.compileTime.} = - ## Compute the number of limbs required - ## from the announced bit length - (bits + 64 - 1) div 64 - -type - BigInt[bits: static int] {.byref.} = object - ## BigInt - ## Enforce-passing by reference otherwise uint128 are passed by stack - ## which causes issue with the inline assembly - limbs: array[bits.wordsRequired, uint64] - -macro addCarryGen_u64(a, b: untyped, bits: static int): untyped = - var asmStmt = (block: - " movq %[b], %[tmp]\n" & - " addq %[tmp], %[a]\n" - ) - - let maxByteOffset = bits div 8 - const wsize = sizeof(uint64) - - when defined(gcc): - for byteOffset in countup(wsize, maxByteOffset-1, wsize): - asmStmt.add (block: - "\n" & - # movq 8+%[b], %[tmp] - " movq " & $byteOffset & "+%[b], %[tmp]\n" & - # adcq %[tmp], 8+%[a] - " adcq %[tmp], " & $byteOffset & "+%[a]\n" - ) - elif defined(clang): - # https://lists.llvm.org/pipermail/llvm-dev/2017-August/116202.html - for byteOffset in countup(wsize, maxByteOffset-1, wsize): - asmStmt.add (block: - "\n" & - # movq 8+%[b], %[tmp] - " movq " & $byteOffset & "%[b], %[tmp]\n" & - # adcq %[tmp], 8+%[a] - " adcq %[tmp], " & $byteOffset & "%[a]\n" - ) - - let tmp = ident("tmp") - asmStmt.add (block: - ": [tmp] \"+r\" (`" & $tmp & "`), [a] \"+m\" (`" & $a & "->limbs[0]`)\n" & - ": [b] \"m\"(`" & $b & "->limbs[0]`)\n" & - ": \"cc\"" - ) - - result = newStmtList() - result.add quote do: - var `tmp`{.noinit.}: uint64 - - result.add nnkAsmStmt.newTree( - newEmptyNode(), - newLit asmStmt - ) - - echo result.toStrLit - -func `+=`(a: var BigInt, b: BigInt) {.noinline.}= - # Depending on inline or noinline - # the generated ASM addressing must be tweaked for Clang - # https://lists.llvm.org/pipermail/llvm-dev/2017-August/116202.html - addCarryGen_u64(a, b, BigInt.bits) - -# ############################################# -when isMainModule: - import std/random - proc rand(T: typedesc[BigInt]): T = - for i in 0 ..< result.limbs.len: - result.limbs[i] = uint64(rand(high(int))) - - proc main() = - block: - let a = BigInt[128](limbs: [high(uint64), 0]) - let b = BigInt[128](limbs: [1'u64, 0]) - - echo "a: ", a - echo "b: ", b - echo "------------------------------------------------------" - - var a1 = a - a1 += b - echo a1 - echo "======================================================" - - block: - let a = rand(BigInt[256]) - let b = rand(BigInt[256]) - - echo "a: ", a - echo "b: ", b - echo "------------------------------------------------------" - - var a1 = a - a1 += b - echo a1 - echo "======================================================" - - block: - let a = rand(BigInt[384]) - let b = rand(BigInt[384]) - - echo "a: ", a - echo "b: ", b - echo "------------------------------------------------------" - - var a1 = a - a1 += b - echo a1 - - main() diff --git a/tests/t_ec_sage_bls12_381 b/tests/t_ec_sage_bls12_381 deleted file mode 100755 index a40778a..0000000 Binary files a/tests/t_ec_sage_bls12_381 and /dev/null differ diff --git a/tests/t_finite_fields_vs_gmp.nim b/tests/t_finite_fields_vs_gmp.nim index 20d0749..af96673 100644 --- a/tests/t_finite_fields_vs_gmp.nim +++ b/tests/t_finite_fields_vs_gmp.nim @@ -20,22 +20,13 @@ import echo "\n------------------------------------------------------\n" var RNG {.compileTime.} = initRand(1234) -const CurveParams = [ - P224, - BN254_Nogami, - BN254_Snarks, - Curve25519, - P256, - Secp256k1, - BLS12_377, - BLS12_381, - BN446, - FKM12_447, - BLS12_461, - BN462 -] -const AvailableCurves = [P224, BN254_Nogami, BN254_Snarks, 
P256, Secp256k1, BLS12_381] +const AvailableCurves = [ + P224, + BN254_Nogami, BN254_Snarks, + P256, Secp256k1, + BLS12_381 +] const # https://gmplib.org/manual/Integer-Import-and-Export.html GMP_WordLittleEndian = -1'i32 diff --git a/tests/t_io_fields.nim b/tests/t_io_fields.nim index bfdaa68..520eb2f 100644 --- a/tests/t_io_fields.nim +++ b/tests/t_io_fields.nim @@ -140,6 +140,14 @@ proc main() = check: p == hex + test "Round trip on prime field of BN254 Snarks curve": + block: # 2^126 + const p = "0x0000000000000000000000000000000040000000000000000000000000000000" + let x = Fp[BN254_Snarks].fromBig BigInt[254].fromHex(p) + let hex = x.toHex(bigEndian) + + check: p == hex + test "Round trip on prime field of BLS12_381 curve": block: # 2^126 const p = "0x000000000000000000000000000000000000000000000000000000000000000040000000000000000000000000000000"
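To make the new `constantine/primitives/macro_assembler_x86.nim` DSL added above more concrete, here is a hedged sketch of how a macro might drive it to emit a full add-with-carry over `N` limbs. The wrapper below (its name, its untyped parameters, and the assumption that memory operands are acceptable for a first sketch) is illustrative only and is not code from this diff:

```nim
import std/macros
import macro_assembler_x86  # hypothetical import path

macro addCarryGen(N: static int, r, a: untyped): untyped =
  ## Emit `r += a` over N limbs of 64 bits as a single inline-assembly block.
  var ctx = Assembler_x86.init(uint64)
  let
    dst = OperandArray.init(r, N, MemOffsettable, InputOutput)
    src = OperandArray.init(a, N, MemOffsettable, Input)

  ctx.add(dst[0], src[0])        # add the low limbs
  for i in 1 ..< N:
    ctx.adc(dst[i], src[i])      # then propagate the carry upward

  result = ctx.generate()        # expands to an `asm volatile` emit block
```

A call like `addCarryGen(4, r, b)` would then expand into one `asm volatile` statement chaining `addq`/`adcq` over the four limbs; a performance-oriented variant would hold the destination limbs in registers (`ElemsInReg`) rather than memory, as the warnings in the `adc`/`sbb` helpers suggest.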