mirror of
https://github.com/logos-storage/constantine.git
synced 2026-01-02 13:13:07 +00:00
Rework assembly to be compatible with LTO (#231)
* rework assembler register/mem and constraint declarations * Introduce constraint UnmutatedPointerToWriteMem * Create invidual memory cell operands * [Assembly] fully support indirect memory addressing * fix calling convention for exported procs * Prepare for switch to intel syntax to avoid clang constant propagation asm symbol name interfering OR pointer+offset addressing * use modifiers to prevent bad string mixin fo assembler to linker of propagated consts * Assembly: switch to intel syntax * with working memory operand - now works with LTO on both GCC and clang and constant folding * use memory operand in more places * remove some inline now that we have lto * cleanup compiler config and benches * tracer shouldn't force dependencies when unused * fix cc on linux * nimble fixes * update README [skip CI] * update MacOS CI with Homebrew Clang * oops nimble bindings disappeared * more nimble fixes * fix sha256 exported symbol * improve constraints on modular addition * Add extra constraint to force reloading of pointer in reg inputs * Fix LLVM gold linker running out of registers * workaround MinGW64 GCC 12.2 bad codegen in t_pairing_cyclotomic_subgroup with LTO
This commit is contained in:
parent
9a7137466e
commit
c6d9a213f2
32
.github/workflows/ci.yml
vendored
32
.github/workflows/ci.yml
vendored
@ -25,6 +25,10 @@ jobs:
|
||||
cpu: amd64
|
||||
TEST_LANG: c
|
||||
BACKEND: NO_ASM
|
||||
- os: windows
|
||||
cpu: amd64
|
||||
TEST_LANG: c
|
||||
BACKEND: ASM
|
||||
- os: macos
|
||||
cpu: amd64
|
||||
TEST_LANG: c
|
||||
@ -172,7 +176,19 @@ jobs:
|
||||
|
||||
- name: Install test dependencies (macOS)
|
||||
if: runner.os == 'macOS'
|
||||
run: brew install gmp
|
||||
run: |
|
||||
brew install gmp
|
||||
mkdir -p external/bin
|
||||
cat << EOF > external/bin/clang
|
||||
#!/bin/bash
|
||||
exec $(brew --prefix llvm@15)/bin/clang "\$@"
|
||||
EOF
|
||||
cat << EOF > external/bin/clang++
|
||||
#!/bin/bash
|
||||
exec $(brew --prefix llvm@15)/bin/clang++ "\$@"
|
||||
EOF
|
||||
chmod 755 external/bin/{clang,clang++}
|
||||
echo '${{ github.workspace }}/external/bin' >> $GITHUB_PATH
|
||||
|
||||
- name: Setup MSYS2 (Windows)
|
||||
if: runner.os == 'Windows'
|
||||
@ -210,9 +226,19 @@ jobs:
|
||||
shell: bash
|
||||
run: |
|
||||
cd constantine
|
||||
nimble bindings --verbose
|
||||
nimble bindings_no_asm --verbose
|
||||
nimble test_bindings --verbose
|
||||
nimble test_parallel_no_asm --verbose
|
||||
- name: Run Constantine tests (Windows with Assembly)
|
||||
# So "test_bindings" uses C and can find GMP
|
||||
# but nim-gmp cannot find GMP on Windows CI
|
||||
if: runner.os == 'Windows' && matrix.target.BACKEND == 'ASM'
|
||||
shell: msys2 {0}
|
||||
run: |
|
||||
cd constantine
|
||||
nimble bindings --verbose
|
||||
nimble test_bindings --verbose
|
||||
nimble test_parallel_no_gmp --verbose
|
||||
- name: Run Constantine tests (Windows no Assembly)
|
||||
# So "test_bindings" uses C and can find GMP
|
||||
# but nim-gmp cannot find GMP on Windows CI
|
||||
@ -220,6 +246,6 @@ jobs:
|
||||
shell: msys2 {0}
|
||||
run: |
|
||||
cd constantine
|
||||
nimble bindings --verbose
|
||||
nimble bindings_no_asm --verbose
|
||||
nimble test_bindings --verbose
|
||||
nimble test_parallel_no_gmp_no_asm --verbose
|
||||
|
||||
247
README.md
247
README.md
@ -25,9 +25,11 @@ The implementations are accompanied with SAGE code used as reference implementat
|
||||
- [Table of Contents](#table-of-contents)
|
||||
- [Target audience](#target-audience)
|
||||
- [Protocols](#protocols)
|
||||
- [Curves supported in the backend](#curves-supported-in-the-backend)
|
||||
- [Installation](#installation)
|
||||
- [Dependencies](#dependencies)
|
||||
- [From C](#from-c)
|
||||
- [From Nim](#from-nim)
|
||||
- [Dependencies & Requirements](#dependencies--requirements)
|
||||
- [Curves supported in the backend](#curves-supported-in-the-backend)
|
||||
- [Security](#security)
|
||||
- [Disclaimer](#disclaimer)
|
||||
- [Security disclosure](#security-disclosure)
|
||||
@ -36,6 +38,7 @@ The implementations are accompanied with SAGE code used as reference implementat
|
||||
- [In zero-knowledge proofs](#in-zero-knowledge-proofs)
|
||||
- [Measuring performance](#measuring-performance)
|
||||
- [BLS12_381 Clang + inline Assembly](#bls12_381-clang--inline-assembly)
|
||||
- [Parallelism](#parallelism)
|
||||
- [Why Nim](#why-nim)
|
||||
- [Compiler caveats](#compiler-caveats)
|
||||
- [Inline assembly](#inline-assembly)
|
||||
@ -67,26 +70,110 @@ Protocols to address these goals, (authenticated) encryption, signature, traitor
|
||||
are designed.\
|
||||
Note: some goals might be mutually exclusive, for example "plausible deniability" and "non-repudiation".
|
||||
|
||||
After [installation](#installation), the available high-level protocols are:
|
||||
## Installation
|
||||
|
||||
- [x] Ethereum EVM precompiles on BN254_Snarks (also called alt_bn128 or bn256 in Ethereum)
|
||||
### From C
|
||||
|
||||
`import constantine/ethereum_evm_precompiles`
|
||||
- [x] BLS signature on BLS12-381 G2 as used in Ethereum 2.
|
||||
1. Install a C compiler, for example:
|
||||
- Debian/Ubuntu `sudo apt update && sudo apt install build-essential`
|
||||
- Archlinux `pacman -S base-devel`
|
||||
|
||||
2. Install nim, it is available in most distros package manager for Linux and Homebrew for MacOS
|
||||
Windows binaries are on the official website: https://nim-lang.org/install_unix.html
|
||||
- Debian/Ubuntu `sudo apt install nim`
|
||||
- Archlinux `pacman -S nim`
|
||||
|
||||
3. Compile the bindings.
|
||||
- Recommended: \
|
||||
`CC:clang nimble bindings`
|
||||
- or `nimble bindings_no_asm`\
|
||||
to compile without assembly (otherwise it autodetects support)
|
||||
- or with default compiler\
|
||||
`nimble bindings`
|
||||
|
||||
4. Ensure bindings work
|
||||
- `nimble test_bindings`
|
||||
|
||||
5. Bindings location
|
||||
- The bindings are put in `constantine/lib`
|
||||
- The headers are in [constantine/include](./include) for example [Ethereum BLS signatures](./include/constantine_ethereum_bls_signatures.h)
|
||||
|
||||
6. Read the examples in [examples_c](./examples_c):
|
||||
- Using the [Ethereum BLS signatures bindings from C](./examples_c/ethereum_bls_signatures.c)
|
||||
- Testing Constantine BLS12-381 vs GMP [./examples_c/t_libctt_bls12_381.c](./examples_c/t_libctt_bls12_381.c)
|
||||
|
||||
The bindings currently provided are:
|
||||
|
||||
- Ethereum BLS signatures on BLS12-381 G2
|
||||
Cryptographic suite: `BLS_SIG_BLS12381G2_XMD:SHA-256_SSWU_RO_POP_`
|
||||
|
||||
This scheme is also used in the following blockchains:
|
||||
Algorand, Chia, Dfinity, Filecoin, Tezos, Zcash.
|
||||
They may have their pubkeys on G1 and signatures on G2 like Ethereum or the other way around.
|
||||
|
||||
> Parameter discussion:
|
||||
>
|
||||
> As Ethereum validators' pubkeys are duplicated, stored and transmitter over and over in the protocol,
|
||||
having them be as small as possible was important.
|
||||
On another hand, BLS signatures were first popularized due to their succinctness.
|
||||
And having signatures on G1 is useful when short signatures are desired, in embedded for example.
|
||||
- [x] SHA256 hash
|
||||
- ...
|
||||
- BLS12-381 arithmetic:
|
||||
- field arithmetic
|
||||
- on Fr (i.e. modulo the 255-bit curve order)
|
||||
- on Fp (i.e. modulo the 381-bit prime modulus)
|
||||
- on Fp2
|
||||
- elliptic curve arithmetic:
|
||||
- on elliptic curve over Fp (EC G1) with affine, jacobian and homogenous projective coordinates
|
||||
- on elliptic curve over Fp2 (EC G2) with affine, jacobian and homogenous projective coordinates
|
||||
- currently not exposed: \
|
||||
scalar multiplication, multi-scalar multiplications \
|
||||
pairings and multi-pairings \
|
||||
are implemented but not exposed
|
||||
- _All operations are constant-time unless explicitly mentioned_ vartime
|
||||
|
||||
- The Pasta curves: Pallas and Vesta
|
||||
- field arithmetic
|
||||
- on Fr (i.e. modulo the 255-bit curve order)
|
||||
- on Fp (i.e. modulo the 255-bit prime modulus)
|
||||
- elliptic curve arithmetic:
|
||||
- on elliptic curve over Fp (EC G1) with affine, jacobian and homogenous projective coordinates
|
||||
- currently not exposed: \
|
||||
scalar multiplication, multi-scalar multiplications \
|
||||
are implemented but not exposed
|
||||
- _All operations are constant-time unless explicitly mentioned_ vartime
|
||||
|
||||
### From Nim
|
||||
|
||||
You can install the developement version of the library through nimble with the following command
|
||||
```
|
||||
nimble install https://github.com/mratsim/constantine@#master
|
||||
```
|
||||
|
||||
## Dependencies & Requirements
|
||||
|
||||
For speed it is recommended to use Clang (see [Compiler-caveats](#Compiler-caveats)).
|
||||
In particular GCC generates inefficient add-with-carry code.
|
||||
|
||||
Constantine requires at least:
|
||||
- GCC 7 \
|
||||
Previous versions generated incorrect add-with-carry code.
|
||||
- Clang 14 \
|
||||
On x86-64, inline assembly is used to workaround compilers having issues optimizing large integer arithmetic,
|
||||
and also ensure constant-time code. \
|
||||
Constantine uses the intel assembly syntax to address issues with the default AT&T syntax and constants propagated in Clang. \
|
||||
Clang 14 added support for `-masm=intel`. \
|
||||
\
|
||||
On MacOS, Apple Clang does not support Intel assembly syntax, use Homebrew Clang instead or compile without assembly.\
|
||||
_Note that Apple is discontinuing Intel CPU throughough their product line so this will impact only older model and Mac Pro_
|
||||
|
||||
On Windows, Constantine is tested with MinGW. The Microsoft Visual C++ Compiler is not configured.
|
||||
|
||||
Constantine has no dependencies, even on Nim standard library except:
|
||||
- for testing
|
||||
- jsony for parsing json test vectors
|
||||
- the Nim standard library for unittesting, formatting and datetime.
|
||||
- GMP for testing against GMP
|
||||
- for benchmarking
|
||||
- The Nim standard libreary for timing and formatting
|
||||
- for Nvidia GPU backend:
|
||||
- the LLVM runtime ("dev" version with headers is not needed)
|
||||
- the CUDA runtime ("dev" version with headers is not needed)
|
||||
- at compile-time
|
||||
- we need the std/macros library to generate Nim code.
|
||||
|
||||
## Curves supported in the backend
|
||||
|
||||
@ -108,42 +195,10 @@ The following curves are configured:
|
||||
- Jubjub, a curve embedded in BLS12-381 scalar field to be used in zk-SNARKS circuits.
|
||||
- Bandersnatch, a more efficient curve embedded in BLS12-381 scalar field to be used in zk-SNARKS circuits.
|
||||
- Other curves
|
||||
- Edwards25519, used in ed25519 and X25519 from TLS 1.3 protocol and the Signal protocol.
|
||||
|
||||
- Edwards25519, used in ed25519 and X25519 from TLS 1.3 protocol and the Signal protocol. \
|
||||
With Ristretto, it can be used in bulletproofs.
|
||||
- The Pasta curves (Pallas and Vesta) for the Halo 2 proof system (Zcash).
|
||||
|
||||
|
||||
## Installation
|
||||
|
||||
You can install the developement version of the library through nimble with the following command
|
||||
```
|
||||
nimble install https://github.com/mratsim/constantine@#master
|
||||
```
|
||||
|
||||
For speed it is recommended to prefer Clang, MSVC or ICC over GCC (see [Compiler-caveats](#Compiler-caveats)).
|
||||
|
||||
Further if using GCC, GCC 7 at minimum is required, previous versions
|
||||
generated incorrect add-with-carry code.
|
||||
|
||||
On x86-64, inline assembly is used to workaround compilers having issues optimizing large integer arithmetic,
|
||||
and also ensure constant-time code.
|
||||
|
||||
## Dependencies
|
||||
|
||||
Constantine has no dependencies, even on Nim standard library except:
|
||||
- for testing
|
||||
- jsony for parsing json test vectors
|
||||
- the Nim standard library for unittesting, formatting and datetime.
|
||||
- GMP for testing against GMP
|
||||
- for benchmarking
|
||||
- The Nim standard libreary for timing and formatting
|
||||
- for Nvidia GPU backend:
|
||||
- the LLVM runtime ("dev" version with headers is not needed)
|
||||
- the CUDA runtime ("dev" version with headers is not needed)
|
||||
- at compile-time
|
||||
- we need the std/macros library to generate Nim code.
|
||||
|
||||
## Security
|
||||
|
||||
Hardening an implementation against all existing and upcoming attack vectors is an extremely complex task.
|
||||
@ -217,47 +272,79 @@ To measure the performance of Constantine
|
||||
|
||||
```bash
|
||||
git clone https://github.com/mratsim/constantine
|
||||
nimble bench_fp # Using default compiler + Assembly
|
||||
nimble bench_fp_clang # Using Clang + Assembly (recommended)
|
||||
nimble bench_fp_gcc # Using GCC + Assembly (decent)
|
||||
nimble bench_fp_clang_noasm # Using Clang only (acceptable)
|
||||
nimble bench_fp_gcc # Using GCC only (slowest)
|
||||
nimble bench_fp2
|
||||
# ...
|
||||
nimble bench_ec_g1_clang
|
||||
nimble bench_ec_g2_clang
|
||||
nimble bench_pairing_bn254_nogami_clang
|
||||
nimble bench_pairing_bn254_snarks_clang
|
||||
nimble bench_pairing_bls12_377_clang
|
||||
nimble bench_pairing_bls12_381_clang
|
||||
|
||||
# Default compiler
|
||||
nimble bench_fp
|
||||
|
||||
# Arithmetic
|
||||
CC=clang nimble bench_fp # Using Clang + Assembly (recommended)
|
||||
CC=clang nimble bench_fp2
|
||||
CC=clang nimble bench_fp12
|
||||
|
||||
# Scalar multiplication and pairings
|
||||
CC=clang nimble bench_ec_g1_scalar_mul
|
||||
CC=clang nimble bench_ec_g2_scalar_mul
|
||||
CC=clang nimble bench_pairing_bls12_381
|
||||
|
||||
# And per-curve summaries
|
||||
nimble bench_summary_bn254_nogami_clang
|
||||
nimble bench_summary_bn254_snarks_clang
|
||||
nimble bench_summary_bls12_377_clang
|
||||
nimble bench_summary_bls12_381_clang
|
||||
CC=clang nimble bench_summary_bn254_nogami
|
||||
CC=clang nimble bench_summary_bn254_snarks
|
||||
CC=clang nimble bench_summary_bls12_377
|
||||
CC=clang nimble bench_summary_bls12_381
|
||||
|
||||
# The Ethereum BLS signature protocol
|
||||
CC=clang nimble bench_ethereum_bls_signatures
|
||||
|
||||
# Multi-scalar multiplication
|
||||
CC=clang nimble bench_ec_g1_msm_bls12_381
|
||||
CC=clang nimble bench_ec_g1_msm_bn256_snarks
|
||||
```
|
||||
|
||||
The full list of benchmarks is available in the [`benchmarks`](./benchmarks) folder.
|
||||
|
||||
As mentioned in the [Compiler caveats](#compiler-caveats) section, GCC is up to 2x slower than Clang due to mishandling of carries and register usage.
|
||||
|
||||
On my machine i9-11980HK (8 cores 2.6GHz, turbo 5GHz), for Clang + Assembly, **all being constant-time** (including scalar multiplication, square root and inversion).
|
||||
|
||||
#### BLS12_381 (Clang + inline Assembly)
|
||||
|
||||
```
|
||||
--------------------------------------------------------------------------------------------------------------------------------------------------------
|
||||
EC ScalarMul 255-bit G1 ECP_ShortW_Prj[Fp[BLS12_381]] 16086.740 ops/s 62163 ns/op 205288 CPU cycles (approx)
|
||||
EC ScalarMul 255-bit G1 ECP_ShortW_Jac[Fp[BLS12_381]] 16670.834 ops/s 59985 ns/op 198097 CPU cycles (approx)
|
||||
EC ScalarMul 255-bit G2 ECP_ShortW_Prj[Fp2[BLS12_381]] 8333.403 ops/s 119999 ns/op 396284 CPU cycles (approx)
|
||||
EC ScalarMul 255-bit G2 ECP_ShortW_Jac[Fp2[BLS12_381]] 9300.682 ops/s 107519 ns/op 355071 CPU cycles (approx)
|
||||
--------------------------------------------------------------------------------------------------------------------------------------------------------
|
||||
Miller Loop BLS12 BLS12_381 5102.223 ops/s 195993 ns/op 647251 CPU cycles (approx)
|
||||
Final Exponentiation BLS12 BLS12_381 4209.109 ops/s 237580 ns/op 784588 CPU cycles (approx)
|
||||
Pairing BLS12 BLS12_381 2343.045 ops/s 426795 ns/op 1409453 CPU cycles (approx)
|
||||
--------------------------------------------------------------------------------------------------------------------------------------------------------
|
||||
Hash to G2 (Draft #11) BLS12_381 6558.495 ops/s 152474 ns/op 503531 CPU cycles (approx)
|
||||
--------------------------------------------------------------------------------------------------------------------------------------------------------
|
||||
```
|
||||
On my machine i9-11980HK (8 cores 2.6GHz, turbo 5GHz), for Clang + Assembly, **all being constant-time** (including scalar multiplication, square root and inversion).
|
||||
|
||||

|
||||
|
||||

|
||||

|
||||

|
||||
|
||||
On a i9-9980XE (18 cores,watercooled, overclocked, 4.1GHz all core turbo)
|
||||
|
||||

|
||||
|
||||
#### Parallelism
|
||||
|
||||
Constantine multithreaded primitives are powered by a highly tuned threadpool and stress-tested for:
|
||||
- scheduler overhead
|
||||
- load balancing with extreme imbalance
|
||||
- nested data parallelism
|
||||
- contention
|
||||
- speculative/conditional parallelism
|
||||
|
||||
and provides the following paradigms:
|
||||
- Future-based task-parallelism
|
||||
- Data parallelism (nestable and awaitable for loops)
|
||||
- including arbitrary parallel reductions
|
||||
- Dataflow parallelism / Stream parallelism / Graph Parallelism / Pipeline parallelism
|
||||
- Structured Parallelism
|
||||
|
||||
The threadpool parallel-for loops use lazy loop splitting and are fully adaptative to the workload being scheduled, the threads in-flight load and the hardware speed unlike most (all?) runtime, see:
|
||||
- OpenMP woes depending on hardware and workload: https://github.com/zy97140/omp-benchmark-for-pytorch
|
||||
- Raytracing ideal runtime, adapt to pixel compute load: \
|
||||
Most (all?) production runtime use scheduling A (split on number of threads like GCC OpenMP) or B (eager splitting, unable to adapt to actual work like LLVM/Intel OpenMP or Intel TBB) while Constantine uses C.
|
||||
|
||||
The threadpool provides efficient backoff strategy to conserve power based on:
|
||||
- eventcounts / futexes, for low overhead backoff
|
||||
- log-log iterated backoff, a provably optimal backoff strategy used for wireless communication to minimize communication in parallel for-loops
|
||||
|
||||
The research papers on high performance multithreading available in Weave repo: https://github.com/mratsim/weave/tree/7682784/research.\
|
||||
_Note: The threadpool is not backed by Weave but by an inspired runtime that has been significantly simplified for ease of auditing. In particular it uses shared-memory based work-stealing instead of channel-based work-requesting for load balancing as distributed computing is not a target, ..., yet._
|
||||
|
||||
## Why Nim
|
||||
|
||||
|
||||
@ -60,7 +60,7 @@ echo " release: ", defined(release)
|
||||
echo " danger: ", defined(danger)
|
||||
echo " inline assembly: ", UseASM_X86_64
|
||||
|
||||
when (sizeof(int) == 4) or defined(Constantine32):
|
||||
when (sizeof(int) == 4) or defined(Ctt32):
|
||||
echo "⚠️ Warning: using Constantine with 32-bit limbs"
|
||||
else:
|
||||
echo "Using Constantine with 64-bit limbs"
|
||||
|
||||
@ -61,7 +61,7 @@ echo " release: ", defined(release)
|
||||
echo " danger: ", defined(danger)
|
||||
echo " inline assembly: ", UseASM_X86_64
|
||||
|
||||
when (sizeof(int) == 4) or defined(Constantine32):
|
||||
when (sizeof(int) == 4) or defined(Ctt32):
|
||||
echo "⚠️ Warning: using Constantine with 32-bit limbs"
|
||||
else:
|
||||
echo "Using Constantine with 64-bit limbs"
|
||||
|
||||
@ -33,7 +33,7 @@ else:
|
||||
proc SHA256[T: byte|char](
|
||||
msg: openarray[T],
|
||||
digest: ptr array[32, byte] = nil
|
||||
): ptr array[32, byte] {.cdecl, dynlib: DLLSSLName, importc.}
|
||||
): ptr array[32, byte] {.noconv, dynlib: DLLSSLName, importc.}
|
||||
|
||||
proc SHA256_OpenSSL[T: byte|char](
|
||||
digest: var array[32, byte],
|
||||
|
||||
@ -19,9 +19,9 @@ export curves, curves_primitives
|
||||
|
||||
template genBindingsField*(Field: untyped) =
|
||||
when appType == "lib":
|
||||
{.push cdecl, dynlib, exportc, raises: [].} # No exceptions allowed
|
||||
{.push noconv, dynlib, exportc, raises: [].} # No exceptions allowed
|
||||
else:
|
||||
{.push cdecl, exportc, raises: [].} # No exceptions allowed
|
||||
{.push noconv, exportc, raises: [].} # No exceptions allowed
|
||||
|
||||
func `ctt _ Field _ unmarshalBE`(dst: var Field, src: openarray[byte]) =
|
||||
## Deserialize
|
||||
@ -122,9 +122,9 @@ template genBindingsField*(Field: untyped) =
|
||||
|
||||
template genBindingsFieldSqrt*(Field: untyped) =
|
||||
when appType == "lib":
|
||||
{.push cdecl, dynlib, exportc, raises: [].} # No exceptions allowed
|
||||
{.push noconv, dynlib, exportc, raises: [].} # No exceptions allowed
|
||||
else:
|
||||
{.push cdecl, exportc, raises: [].} # No exceptions allowed
|
||||
{.push noconv, exportc, raises: [].} # No exceptions allowed
|
||||
|
||||
func `ctt _ Field _ is_square`(a: Field): SecretBool =
|
||||
a.isSquare()
|
||||
@ -155,9 +155,9 @@ template genBindingsFieldSqrt*(Field: untyped) =
|
||||
|
||||
template genBindingsExtField*(Field: untyped) =
|
||||
when appType == "lib":
|
||||
{.push cdecl, dynlib, exportc, raises: [].} # No exceptions allowed
|
||||
{.push noconv, dynlib, exportc, raises: [].} # No exceptions allowed
|
||||
else:
|
||||
{.push cdecl, exportc, raises: [].} # No exceptions allowed
|
||||
{.push noconv, exportc, raises: [].} # No exceptions allowed
|
||||
|
||||
# --------------------------------------------------------------------------------------
|
||||
func `ctt _ Field _ is_eq`(a, b: Field): SecretBool =
|
||||
@ -258,9 +258,9 @@ template genBindingsExtField*(Field: untyped) =
|
||||
|
||||
template genBindingsExtFieldSqrt*(Field: untyped) =
|
||||
when appType == "lib":
|
||||
{.push cdecl, dynlib, exportc, raises: [].} # No exceptions allowed
|
||||
{.push noconv, dynlib, exportc, raises: [].} # No exceptions allowed
|
||||
else:
|
||||
{.push cdecl, exportc, raises: [].} # No exceptions allowed
|
||||
{.push noconv, exportc, raises: [].} # No exceptions allowed
|
||||
|
||||
func `ctt _ Field _ is_square`(a: Field): SecretBool =
|
||||
a.isSquare()
|
||||
@ -275,9 +275,9 @@ template genBindingsExtFieldSqrt*(Field: untyped) =
|
||||
|
||||
template genBindings_EC_ShortW_Affine*(ECP, Field: untyped) =
|
||||
when appType == "lib":
|
||||
{.push cdecl, dynlib, exportc, raises: [].} # No exceptions allowed
|
||||
{.push noconv, dynlib, exportc, raises: [].} # No exceptions allowed
|
||||
else:
|
||||
{.push cdecl, exportc, raises: [].} # No exceptions allowed
|
||||
{.push noconv, exportc, raises: [].} # No exceptions allowed
|
||||
|
||||
# --------------------------------------------------------------------------------------
|
||||
func `ctt _ ECP _ is_eq`(P, Q: ECP): SecretBool =
|
||||
@ -305,9 +305,9 @@ template genBindings_EC_ShortW_Affine*(ECP, Field: untyped) =
|
||||
|
||||
template genBindings_EC_ShortW_NonAffine*(ECP, ECP_Aff, Field: untyped) =
|
||||
when appType == "lib":
|
||||
{.push cdecl, dynlib, exportc, raises: [].} # No exceptions allowed
|
||||
{.push noconv, dynlib, exportc, raises: [].} # No exceptions allowed
|
||||
else:
|
||||
{.push cdecl, exportc, raises: [].} # No exceptions allowed
|
||||
{.push noconv, exportc, raises: [].} # No exceptions allowed
|
||||
|
||||
# --------------------------------------------------------------------------------------
|
||||
func `ctt _ ECP _ is_eq`(P, Q: ECP): SecretBool =
|
||||
|
||||
@ -17,17 +17,17 @@ import std/strformat
|
||||
# Library compilation
|
||||
# ----------------------------------------------------------------
|
||||
|
||||
proc releaseBuildOptions: string =
|
||||
proc releaseBuildOptions(useASM, useLTO = true): string =
|
||||
# -d:danger --opt:size
|
||||
# to avoid boundsCheck and overflowChecks that would trigger exceptions or allocations in a crypto library.
|
||||
# Those are internally guaranteed at compile-time by fixed-sized array
|
||||
# and checked at runtime with an appropriate error code if any for user-input.
|
||||
#
|
||||
# Furthermore we optimize for size, the performance critical procedures
|
||||
# Furthermore we may optimize for size, the performance critical procedures
|
||||
# either use assembly or are unrolled manually with staticFor,
|
||||
# Optimizations at -O3 deal with loops and branching
|
||||
# which we mostly don't have. It's better to optimize
|
||||
# for instructions cache.
|
||||
# which we mostly don't have.
|
||||
# Hence optimizing for instructions cache may pay off.
|
||||
#
|
||||
# --panics:on -d:noSignalHandler
|
||||
# Even with `raises: []`, Nim still has an exception path
|
||||
@ -50,11 +50,23 @@ proc releaseBuildOptions: string =
|
||||
# Reduce instructions cache misses.
|
||||
# https://lkml.org/lkml/2015/5/21/443
|
||||
# Our non-inlined functions are large so size cost is minimal.
|
||||
" -d:danger --opt:size " &
|
||||
let compiler = if existsEnv"CC": " --cc:" & getEnv"CC"
|
||||
else: ""
|
||||
|
||||
let noASM = if not useASM: " -d:CttASM=false "
|
||||
else: ""
|
||||
|
||||
let lto = if useLTO: " --passC:-flto=auto --passL:-flto=auto "
|
||||
else: ""
|
||||
|
||||
compiler &
|
||||
noASM &
|
||||
lto &
|
||||
" -d:danger " &
|
||||
# " --opt:size " &
|
||||
" --panics:on -d:noSignalHandler " &
|
||||
" --mm:arc -d:useMalloc " &
|
||||
" --verbosity:0 --hints:off --warnings:off " &
|
||||
# " --passC:-flto --passL:-flto " &
|
||||
" --passC:-fno-semantic-interposition " &
|
||||
" --passC:-falign-functions=64 "
|
||||
|
||||
@ -62,13 +74,14 @@ type BindingsKind = enum
|
||||
kCurve
|
||||
kProtocol
|
||||
|
||||
proc genDynamicBindings(bindingsKind: BindingsKind, bindingsName, prefixNimMain: string) =
|
||||
proc genDynamicBindings(bindingsKind: BindingsKind, bindingsName, prefixNimMain: string, useASM = true) =
|
||||
proc compile(libName: string, flags = "") =
|
||||
echo "Compiling dynamic library: lib/" & libName
|
||||
|
||||
exec "nim c " &
|
||||
" --noMain --app:lib " &
|
||||
flags &
|
||||
releaseBuildOptions() &
|
||||
releaseBuildOptions(useASM, useLTO = true) &
|
||||
" --noMain --app:lib " &
|
||||
&" --nimMainPrefix:{prefixNimMain} " &
|
||||
&" --out:{libName} --outdir:lib " &
|
||||
(block:
|
||||
@ -98,24 +111,24 @@ proc genDynamicBindings(bindingsKind: BindingsKind, bindingsName, prefixNimMain:
|
||||
else:
|
||||
compile "lib" & bindingsName & ".so"
|
||||
|
||||
proc genStaticBindings(bindingsKind: BindingsKind, bindingsName, prefixNimMain: string) =
|
||||
proc genStaticBindings(bindingsKind: BindingsKind, bindingsName, prefixNimMain: string, useASM = true) =
|
||||
proc compile(libName: string, flags = "") =
|
||||
echo "Compiling static library: lib/" & libName
|
||||
|
||||
exec "nim c " &
|
||||
" --noMain --app:staticLib " &
|
||||
flags &
|
||||
releaseBuildOptions() &
|
||||
" --nimMainPrefix:" & prefixNimMain &
|
||||
" --out:" & libName & " --outdir:lib " &
|
||||
releaseBuildOptions(useASM, useLTO = false) &
|
||||
" --noMain --app:staticLib " &
|
||||
&" --nimMainPrefix:{prefixNimMain} " &
|
||||
&" --out:{libName} --outdir:lib " &
|
||||
(block:
|
||||
case bindingsKind
|
||||
of kCurve:
|
||||
" --nimcache:nimcache/bindings_curves/" & bindingsName &
|
||||
" bindings_generators/" & bindingsName & ".nim"
|
||||
&" --nimcache:nimcache/bindings_curves/{bindingsName}" &
|
||||
&" bindings_generators/{bindingsName}.nim"
|
||||
of kProtocol:
|
||||
" --nimcache:nimcache/bindings_protocols/" & bindingsName &
|
||||
" constantine/" & bindingsName & ".nim"
|
||||
)
|
||||
&" --nimcache:nimcache/bindings_protocols/{bindingsName}" &
|
||||
&" constantine/{bindingsName}.nim")
|
||||
|
||||
let bindingsName = block:
|
||||
case bindingsKind
|
||||
@ -138,13 +151,13 @@ proc genStaticBindings(bindingsKind: BindingsKind, bindingsName, prefixNimMain:
|
||||
proc genHeaders(bindingsName: string) =
|
||||
echo "Generating header: include/" & bindingsName & ".h"
|
||||
exec "nim c -d:CttGenerateHeaders " &
|
||||
releaseBuildOptions() &
|
||||
" -d:release " &
|
||||
" --out:" & bindingsName & "_gen_header.exe --outdir:build " &
|
||||
" --nimcache:nimcache/bindings_curves_headers/" & bindingsName & "_header" &
|
||||
" bindings_generators/" & bindingsName & ".nim"
|
||||
exec "build/" & bindingsName & "_gen_header.exe include"
|
||||
|
||||
task bindings, "Generate Constantine bindings":
|
||||
task bindings, "Generate Constantine bindings (no assembly)":
|
||||
# Curve arithmetic
|
||||
genStaticBindings(kCurve, "constantine_bls12_381", "ctt_bls12381_init_")
|
||||
genDynamicBindings(kCurve, "constantine_bls12_381", "ctt_bls12381_init_")
|
||||
@ -158,6 +171,23 @@ task bindings, "Generate Constantine bindings":
|
||||
# Protocols
|
||||
genStaticBindings(kProtocol, "ethereum_bls_signatures", "ctt_eth_bls_init_")
|
||||
genDynamicBindings(kProtocol, "ethereum_bls_signatures", "ctt_eth_bls_init_")
|
||||
echo ""
|
||||
|
||||
task bindings_no_asm, "Generate Constantine bindings (no assembly)":
|
||||
# Curve arithmetic
|
||||
genStaticBindings(kCurve, "constantine_bls12_381", "ctt_bls12381_init_", useASM = false)
|
||||
genDynamicBindings(kCurve, "constantine_bls12_381", "ctt_bls12381_init_", useASM = false)
|
||||
genHeaders("constantine_bls12_381")
|
||||
echo ""
|
||||
genStaticBindings(kCurve, "constantine_pasta", "ctt_pasta_init_", useASM = false)
|
||||
genDynamicBindings(kCurve, "constantine_pasta", "ctt_pasta_init_", useASM = false)
|
||||
genHeaders("constantine_pasta")
|
||||
echo ""
|
||||
|
||||
# Protocols
|
||||
genStaticBindings(kProtocol, "ethereum_bls_signatures", "ctt_eth_bls_init_", useASM = false)
|
||||
genDynamicBindings(kProtocol, "ethereum_bls_signatures", "ctt_eth_bls_init_", useASM = false)
|
||||
echo ""
|
||||
|
||||
proc testLib(path, testName, libName: string, useGMP: bool) =
|
||||
let dynlibName = if defined(windows): libName & ".dll"
|
||||
@ -166,21 +196,25 @@ proc testLib(path, testName, libName: string, useGMP: bool) =
|
||||
let staticlibName = if defined(windows): libName & ".lib"
|
||||
else: "lib" & libName & ".a"
|
||||
|
||||
let cc = if existsEnv"CC": getEnv"CC"
|
||||
else: "gcc"
|
||||
|
||||
echo &"\n[Bindings: {path}/{testName}.c] Testing dynamically linked library {dynlibName}"
|
||||
exec &"gcc -Iinclude -Llib -o build/testbindings/{testName}_dynlink.exe {path}/{testName}.c -l{libName} " & (if useGMP: "-lgmp" else: "")
|
||||
exec &"{cc} -Iinclude -Llib -o build/testbindings/{testName}_dynlink.exe {path}/{testName}.c -l{libName} " & (if useGMP: "-lgmp" else: "")
|
||||
when defined(windows):
|
||||
# Put DLL near the exe as LD_LIBRARY_PATH doesn't work even in a POSIX compatible shell
|
||||
exec &"./build/testbindings/{testName}_dynlink.exe"
|
||||
else:
|
||||
exec &"LD_LIBRARY_PATH=lib ./build/testbindings/{testName}_dynlink.exe"
|
||||
|
||||
echo ""
|
||||
|
||||
echo &"\n[Bindings: {path}/{testName}.c] Testing statically linked library: {staticlibName}"
|
||||
# Beware MacOS annoying linker with regards to static libraries
|
||||
# The following standard way cannot be used on MacOS
|
||||
# exec "gcc -Iinclude -Llib -o build/t_libctt_bls12_381_sl.exe examples_c/t_libctt_bls12_381.c -lgmp -Wl,-Bstatic -lconstantine_bls12_381 -Wl,-Bdynamic"
|
||||
exec &"gcc -Iinclude -o build/testbindings/{testName}_staticlink.exe {path}/{testName}.c lib/{staticlibName} " & (if useGMP: "-lgmp" else: "")
|
||||
exec &"{cc} -Iinclude -o build/testbindings/{testName}_staticlink.exe {path}/{testName}.c lib/{staticlibName} " & (if useGMP: "-lgmp" else: "")
|
||||
exec &"./build/testbindings/{testName}_staticlink.exe"
|
||||
echo ""
|
||||
|
||||
task test_bindings, "Test C bindings":
|
||||
exec "mkdir -p build/testbindings"
|
||||
@ -485,9 +519,22 @@ const skipSanitizers = [
|
||||
|
||||
when defined(windows):
|
||||
# UBSAN is not available on mingw
|
||||
# https://github.com/libressl-portable/portable/issues/54
|
||||
const sanitizers = ""
|
||||
else:
|
||||
const sanitizers =
|
||||
|
||||
" --passC:-fstack-protector-strong " &
|
||||
|
||||
# Fortify source wouldn't help us detect errors in cosntantine
|
||||
# because everything is stack allocated
|
||||
# except with the threadpool:
|
||||
# - https://developers.redhat.com/blog/2021/04/16/broadening-compiler-checks-for-buffer-overflows-in-_fortify_source#what_s_next_for__fortify_source
|
||||
# - https://developers.redhat.com/articles/2023/02/06/how-improve-application-security-using-fortifysource3#how_to_improve_application_fortification
|
||||
# We also don't use memcpy as it is not constant-time and our copy is compile-time sized.
|
||||
|
||||
" --passC:-D_FORTIFY_SOURCE=3 " &
|
||||
|
||||
# Sanitizers are incompatible with nim default GC
|
||||
# The conservative stack scanning of Nim default GC triggers, alignment UB and stack-buffer-overflow check.
|
||||
# Address sanitizer requires free registers and needs to be disabled for some inline assembly files.
|
||||
@ -497,8 +544,8 @@ else:
|
||||
|
||||
# " --passC:-fsanitize=undefined --passL:-fsanitize=undefined" &
|
||||
# " --passC:-fsanitize=address --passL:-fsanitize=address" &
|
||||
" --passC:-fno-sanitize-recover" # Enforce crash on undefined behaviour
|
||||
|
||||
# " --passC:-fno-sanitize-recover" # Enforce crash on undefined behaviour
|
||||
""
|
||||
|
||||
# Tests & Benchmarks helper functions
|
||||
# ----------------------------------------------------------------
|
||||
@ -508,25 +555,17 @@ proc clearParallelBuild() =
|
||||
if fileExists(buildParallel):
|
||||
rmFile(buildParallel)
|
||||
|
||||
template setupTestCommand(): untyped {.dirty.} =
|
||||
proc setupTestCommand(flags, path: string, useASM: bool): string =
|
||||
var lang = "c"
|
||||
if existsEnv"TEST_LANG":
|
||||
lang = getEnv"TEST_LANG"
|
||||
|
||||
var cc = ""
|
||||
if existsEnv"CC":
|
||||
cc = " --cc:" & getEnv"CC"
|
||||
|
||||
var flags = flags
|
||||
when not defined(windows):
|
||||
# Not available in MinGW https://github.com/libressl-portable/portable/issues/54
|
||||
flags &= " --passC:-fstack-protector-strong --passC:-D_FORTIFY_SOURCE=2 "
|
||||
let command = "nim " & lang & cc &
|
||||
return "nim " & lang &
|
||||
" -r " &
|
||||
flags &
|
||||
releaseBuildOptions() &
|
||||
releaseBuildOptions(useASM) &
|
||||
" --outdir:build/testsuite " &
|
||||
" --nimcache:nimcache/" & path & " " &
|
||||
&" --nimcache:nimcache/{path} " &
|
||||
path
|
||||
|
||||
proc test(cmd: string) =
|
||||
@ -535,73 +574,72 @@ proc test(cmd: string) =
|
||||
echo "=============================================================================================="
|
||||
exec cmd
|
||||
|
||||
proc testBatch(commands: var string, flags, path: string) =
|
||||
setupTestCommand()
|
||||
commands &= command & '\n'
|
||||
proc testBatch(commands: var string, flags, path: string, useASM = true) =
|
||||
# With LTO, the linker produces lots of spurious warnings when copying into openArrays/strings
|
||||
|
||||
template setupBench(): untyped {.dirty.} =
|
||||
let runFlag = if run: " -r "
|
||||
else: " "
|
||||
let flags = if defined(gcc): flags & " --passC:-Wno-stringop-overflow --passL:-Wno-stringop-overflow "
|
||||
else: flags
|
||||
|
||||
var lang = " c "
|
||||
if existsEnv"TEST_LANG":
|
||||
lang = getEnv"TEST_LANG"
|
||||
commands = commands & setupTestCommand(flags, path, useASM) & '\n'
|
||||
|
||||
var cc = ""
|
||||
if compiler != "":
|
||||
cc = "--cc:" & compiler
|
||||
elif existsEnv"CC":
|
||||
cc = " --cc:" & getEnv"CC"
|
||||
proc setupBench(benchName: string, run: bool, useAsm: bool): string =
|
||||
var runFlags = " "
|
||||
if run: # Beware of https://github.com/nim-lang/Nim/issues/21704
|
||||
runFlags = runFlags & " -r "
|
||||
|
||||
if not useAsm:
|
||||
cc &= " -d:CttASM=false"
|
||||
let command = "nim " & lang & cc &
|
||||
releaseBuildOptions() &
|
||||
" -o:build/bench/" & benchName & "_" & compiler & "_" & (if useAsm: "useASM" else: "noASM") &
|
||||
" --nimcache:nimcache/benches/" & benchName & "_" & compiler & "_" & (if useAsm: "useASM" else: "noASM") &
|
||||
runFlag & " benchmarks/" & benchName & ".nim"
|
||||
let asmStatus = if useASM: "useASM"
|
||||
else: "noASM"
|
||||
|
||||
proc runBench(benchName: string, compiler = "", useAsm = true) =
|
||||
if defined(gcc):
|
||||
# With LTO, the linker produces lots of spurious warnings when copying into openArrays/strings
|
||||
runFlags = runFlags & " --passC:-Wno-stringop-overflow --passL:-Wno-stringop-overflow "
|
||||
|
||||
let cc = if existsEnv"CC": getEnv"CC"
|
||||
else: "defaultcompiler"
|
||||
|
||||
return "nim c " &
|
||||
runFlags &
|
||||
releaseBuildOptions(useASM) &
|
||||
&" -o:build/bench/{benchName}_{cc}_{asmStatus}" &
|
||||
&" --nimcache:nimcache/benches/{benchName}_{cc}_{asmStatus}" &
|
||||
&" benchmarks/{benchName}.nim"
|
||||
|
||||
proc runBench(benchName: string, useAsm = true) =
|
||||
if not dirExists "build":
|
||||
mkDir "build"
|
||||
let run = true
|
||||
setupBench()
|
||||
let command = setupBench(benchName, run = true, useAsm)
|
||||
exec command
|
||||
|
||||
proc buildBenchBatch(commands: var string, benchName: string, compiler = "", useAsm = true) =
|
||||
let run = false
|
||||
let compiler = ""
|
||||
setupBench()
|
||||
commands &= command & '\n'
|
||||
proc buildBenchBatch(commands: var string, benchName: string, useAsm = true) =
|
||||
let command = setupBench(benchName, run = false, useAsm)
|
||||
commands = commands & command & '\n'
|
||||
|
||||
proc addTestSet(cmdFile: var string, requireGMP: bool, test32bit = false, testASM = true) =
|
||||
proc addTestSet(cmdFile: var string, requireGMP: bool, test32bit = false, useASM = true) =
|
||||
if not dirExists "build":
|
||||
mkDir "build"
|
||||
echo "Found " & $testDesc.len & " tests to run."
|
||||
|
||||
for td in testDesc:
|
||||
if not(td.useGMP and not requireGMP):
|
||||
var flags = ""
|
||||
if not testASM:
|
||||
flags &= " -d:CttASM=false "
|
||||
var flags = "" # Beware of https://github.com/nim-lang/Nim/issues/21704
|
||||
if test32bit:
|
||||
flags &= " -d:Constantine32 "
|
||||
flags = flags & " -d:Ctt32 "
|
||||
if td.path in useDebug:
|
||||
flags &= " -d:debugConstantine "
|
||||
flags = flags & " -d:CttDebug "
|
||||
if td.path notin skipSanitizers:
|
||||
flags &= sanitizers
|
||||
flags = flags & sanitizers
|
||||
|
||||
cmdFile.testBatch(flags, td.path)
|
||||
cmdFile.testBatch(flags, td.path, useASM)
|
||||
|
||||
proc addTestSetNvidia(cmdFile: var string) =
|
||||
if not dirExists "build":
|
||||
mkDir "build"
|
||||
echo "Found " & $testDescNvidia.len & " tests to run."
|
||||
|
||||
for path in testDescThreadpool:
|
||||
var flags = ""
|
||||
for path in testDescNvidia:
|
||||
var flags = "" # Beware of https://github.com/nim-lang/Nim/issues/21704
|
||||
if path notin skipSanitizers:
|
||||
flags &= sanitizers
|
||||
flags = flags & sanitizers
|
||||
cmdFile.testBatch(flags, path)
|
||||
|
||||
proc addTestSetThreadpool(cmdFile: var string) =
|
||||
@ -612,26 +650,24 @@ proc addTestSetThreadpool(cmdFile: var string) =
|
||||
for path in testDescThreadpool:
|
||||
var flags = " --threads:on --debugger:native "
|
||||
if path notin skipSanitizers:
|
||||
flags &= sanitizers
|
||||
flags = flags & sanitizers
|
||||
cmdFile.testBatch(flags, path)
|
||||
|
||||
proc addTestSetMultithreadedCrypto(cmdFile: var string, test32bit = false, testASM = true) =
|
||||
proc addTestSetMultithreadedCrypto(cmdFile: var string, test32bit = false, useASM = true) =
|
||||
if not dirExists "build":
|
||||
mkDir "build"
|
||||
echo "Found " & $testDescMultithreadedCrypto.len & " tests to run."
|
||||
|
||||
for td in testDescMultithreadedCrypto:
|
||||
var flags = " --threads:on --debugger:native"
|
||||
if not testASM:
|
||||
flags &= " -d:CttASM=false"
|
||||
if test32bit:
|
||||
flags &= " -d:Constantine32"
|
||||
flags = flags & " -d:Ctt32 "
|
||||
if td in useDebug:
|
||||
flags &= " -d:debugConstantine"
|
||||
flags = flags & " -d:CttDebug "
|
||||
if td notin skipSanitizers:
|
||||
flags &= sanitizers
|
||||
flags = flags & sanitizers
|
||||
|
||||
cmdFile.testBatch(flags, td)
|
||||
cmdFile.testBatch(flags, td, useASM)
|
||||
|
||||
proc addBenchSet(cmdFile: var string, useAsm = true) =
|
||||
if not dirExists "build":
|
||||
@ -649,7 +685,7 @@ proc genParallelCmdRunner() =
|
||||
task test, "Run all tests":
|
||||
# -d:testingCurves is configured in a *.nim.cfg for convenience
|
||||
var cmdFile: string
|
||||
cmdFile.addTestSet(requireGMP = true, testASM = true)
|
||||
cmdFile.addTestSet(requireGMP = true, useASM = true)
|
||||
cmdFile.addBenchSet(useASM = true) # Build (but don't run) benches to ensure they stay relevant
|
||||
cmdFile.addTestSetThreadpool()
|
||||
cmdFile.addTestSetMultithreadedCrypto()
|
||||
@ -660,10 +696,10 @@ task test, "Run all tests":
|
||||
task test_no_asm, "Run all tests (no assembly)":
|
||||
# -d:testingCurves is configured in a *.nim.cfg for convenience
|
||||
var cmdFile: string
|
||||
cmdFile.addTestSet(requireGMP = true, testASM = false)
|
||||
cmdFile.addTestSet(requireGMP = true, useASM = false)
|
||||
cmdFile.addBenchSet(useASM = false) # Build (but don't run) benches to ensure they stay relevant
|
||||
cmdFile.addTestSetThreadpool()
|
||||
cmdFile.addTestSetMultithreadedCrypto(testASM = false)
|
||||
cmdFile.addTestSetMultithreadedCrypto(useASM = false)
|
||||
for cmd in cmdFile.splitLines():
|
||||
if cmd != "": # Windows doesn't like empty commands
|
||||
exec cmd
|
||||
@ -671,7 +707,7 @@ task test_no_asm, "Run all tests (no assembly)":
|
||||
task test_no_gmp, "Run tests that don't require GMP":
|
||||
# -d:testingCurves is configured in a *.nim.cfg for convenience
|
||||
var cmdFile: string
|
||||
cmdFile.addTestSet(requireGMP = false, testASM = true)
|
||||
cmdFile.addTestSet(requireGMP = false, useASM = true)
|
||||
cmdFile.addBenchSet(useASM = true) # Build (but don't run) benches to ensure they stay relevant
|
||||
cmdFile.addTestSetThreadpool()
|
||||
cmdFile.addTestSetMultithreadedCrypto()
|
||||
@ -682,10 +718,10 @@ task test_no_gmp, "Run tests that don't require GMP":
|
||||
task test_no_gmp_no_asm, "Run tests that don't require GMP using a pure Nim backend":
|
||||
# -d:testingCurves is configured in a *.nim.cfg for convenience
|
||||
var cmdFile: string
|
||||
cmdFile.addTestSet(requireGMP = false, testASM = false)
|
||||
cmdFile.addTestSet(requireGMP = false, useASM = false)
|
||||
cmdFile.addBenchSet(useASM = false) # Build (but don't run) benches to ensure they stay relevant
|
||||
cmdFile.addTestSetThreadpool()
|
||||
cmdFile.addTestSetMultithreadedCrypto(testASM = false)
|
||||
cmdFile.addTestSetMultithreadedCrypto(useASM = false)
|
||||
for cmd in cmdFile.splitLines():
|
||||
if cmd != "": # Windows doesn't like empty commands
|
||||
exec cmd
|
||||
@ -696,7 +732,7 @@ task test_parallel, "Run all tests in parallel":
|
||||
genParallelCmdRunner()
|
||||
|
||||
var cmdFile: string
|
||||
cmdFile.addTestSet(requireGMP = true, testASM = true)
|
||||
cmdFile.addTestSet(requireGMP = true, useASM = true)
|
||||
cmdFile.addBenchSet(useASM = true) # Build (but don't run) benches to ensure they stay relevant
|
||||
writeFile(buildParallel, cmdFile)
|
||||
exec "build/pararun " & buildParallel
|
||||
@ -715,7 +751,7 @@ task test_parallel_no_asm, "Run all tests (without macro assembler) in parallel"
|
||||
genParallelCmdRunner()
|
||||
|
||||
var cmdFile: string
|
||||
cmdFile.addTestSet(requireGMP = true, testASM = false)
|
||||
cmdFile.addTestSet(requireGMP = true, useASM = false)
|
||||
cmdFile.addBenchSet(useASM = false)
|
||||
writeFile(buildParallel, cmdFile)
|
||||
exec "build/pararun " & buildParallel
|
||||
@ -723,7 +759,7 @@ task test_parallel_no_asm, "Run all tests (without macro assembler) in parallel"
|
||||
# Threadpool tests done serially
|
||||
cmdFile = ""
|
||||
cmdFile.addTestSetThreadpool()
|
||||
cmdFile.addTestSetMultithreadedCrypto(testASM = false)
|
||||
cmdFile.addTestSetMultithreadedCrypto(useASM = false)
|
||||
for cmd in cmdFile.splitLines():
|
||||
if cmd != "": # Windows doesn't like empty commands
|
||||
exec cmd
|
||||
@ -734,7 +770,7 @@ task test_parallel_no_gmp, "Run all tests in parallel":
|
||||
genParallelCmdRunner()
|
||||
|
||||
var cmdFile: string
|
||||
cmdFile.addTestSet(requireGMP = false, testASM = true)
|
||||
cmdFile.addTestSet(requireGMP = false, useASM = true)
|
||||
cmdFile.addBenchSet(useASM = true) # Build (but don't run) benches to ensure they stay relevant
|
||||
writeFile(buildParallel, cmdFile)
|
||||
exec "build/pararun " & buildParallel
|
||||
@ -753,7 +789,7 @@ task test_parallel_no_gmp_no_asm, "Run all tests in parallel":
|
||||
genParallelCmdRunner()
|
||||
|
||||
var cmdFile: string
|
||||
cmdFile.addTestSet(requireGMP = false, testASM = false)
|
||||
cmdFile.addTestSet(requireGMP = false, useASM = false)
|
||||
cmdFile.addBenchSet(useASM = false) # Build (but don't run) benches to ensure they stay relevant
|
||||
writeFile(buildParallel, cmdFile)
|
||||
exec "build/pararun " & buildParallel
|
||||
@ -761,7 +797,7 @@ task test_parallel_no_gmp_no_asm, "Run all tests in parallel":
|
||||
# Threadpool tests done serially
|
||||
cmdFile = ""
|
||||
cmdFile.addTestSetThreadpool()
|
||||
cmdFile.addTestSetMultithreadedCrypto(testASM = false)
|
||||
cmdFile.addTestSetMultithreadedCrypto(useASM = false)
|
||||
for cmd in cmdFile.splitLines():
|
||||
if cmd != "": # Windows doesn't like empty commands
|
||||
exec cmd
|
||||
@ -790,389 +826,199 @@ task test_nvidia, "Run all tests for Nvidia GPUs":
|
||||
# Finite field 𝔽p
|
||||
# ------------------------------------------
|
||||
|
||||
task bench_fp, "Run benchmark 𝔽p with your default compiler":
|
||||
task bench_fp, "Run benchmark 𝔽p with your CC compiler":
|
||||
runBench("bench_fp")
|
||||
|
||||
task bench_fp_gcc, "Run benchmark 𝔽p with gcc":
|
||||
runBench("bench_fp", "gcc")
|
||||
|
||||
task bench_fp_clang, "Run benchmark 𝔽p with clang":
|
||||
runBench("bench_fp", "clang")
|
||||
|
||||
task bench_fp_gcc_noasm, "Run benchmark 𝔽p with gcc - no Assembly":
|
||||
runBench("bench_fp", "gcc", useAsm = false)
|
||||
|
||||
task bench_fp_clang_noasm, "Run benchmark 𝔽p with clang - no Assembly":
|
||||
runBench("bench_fp", "clang", useAsm = false)
|
||||
task bench_fp_noasm, "Run benchmark 𝔽p with your CC compiler - no Assembly":
|
||||
runBench("bench_fp", useAsm = false)
|
||||
|
||||
# Double-precision field 𝔽pDbl
|
||||
# ------------------------------------------
|
||||
|
||||
task bench_fpdbl, "Run benchmark 𝔽pDbl with your default compiler":
|
||||
task bench_fpdbl, "Run benchmark 𝔽pDbl with your CC compiler":
|
||||
runBench("bench_fp_double_precision")
|
||||
|
||||
task bench_fpdbl_gcc, "Run benchmark 𝔽p with gcc":
|
||||
runBench("bench_fp_double_precision", "gcc")
|
||||
task bench_fpdbl_noasm, "Run benchmark 𝔽p with CC compiler - no Assembly":
|
||||
runBench("bench_fp_double_precision", useAsm = false)
|
||||
|
||||
task bench_fpdbl_clang, "Run benchmark 𝔽p with clang":
|
||||
runBench("bench_fp_double_precision", "clang")
|
||||
|
||||
task bench_fpdbl_gcc_noasm, "Run benchmark 𝔽p with gcc - no Assembly":
|
||||
runBench("bench_fp_double_precision", "gcc", useAsm = false)
|
||||
|
||||
task bench_fpdbl_clang_noasm, "Run benchmark 𝔽p with clang - no Assembly":
|
||||
runBench("bench_fp_double_precision", "clang", useAsm = false)
|
||||
|
||||
# Extension field 𝔽p2
|
||||
# ------------------------------------------
|
||||
|
||||
task bench_fp2, "Run benchmark with 𝔽p2 your default compiler":
|
||||
task bench_fp2, "Run benchmark 𝔽p2 with your CC compiler":
|
||||
runBench("bench_fp2")
|
||||
|
||||
task bench_fp2_gcc, "Run benchmark 𝔽p2 with gcc":
|
||||
runBench("bench_fp2", "gcc")
|
||||
|
||||
task bench_fp2_clang, "Run benchmark 𝔽p2 with clang":
|
||||
runBench("bench_fp2", "clang")
|
||||
|
||||
task bench_fp2_gcc_noasm, "Run benchmark 𝔽p2 with gcc - no Assembly":
|
||||
runBench("bench_fp2", "gcc", useAsm = false)
|
||||
|
||||
task bench_fp2_clang_noasm, "Run benchmark 𝔽p2 with clang - no Assembly":
|
||||
runBench("bench_fp2", "clang", useAsm = false)
|
||||
task bench_fp2_noasm, "Run benchmark 𝔽p2 with CC compiler - no Assembly":
|
||||
runBench("bench_fp2", useAsm = false)
|
||||
|
||||
# Extension field 𝔽p4
|
||||
# ------------------------------------------
|
||||
|
||||
task bench_fp4, "Run benchmark with 𝔽p4 your default compiler":
|
||||
task bench_fp4, "Run benchmark 𝔽p4 with your CC compiler":
|
||||
runBench("bench_fp4")
|
||||
|
||||
task bench_fp4_gcc, "Run benchmark 𝔽p4 with gcc":
|
||||
runBench("bench_fp4", "gcc")
|
||||
task bench_fp4_noasm, "Run benchmark 𝔽p4 with CC compiler - no Assembly":
|
||||
runBench("bench_fp4", useAsm = false)
|
||||
|
||||
task bench_fp4_clang, "Run benchmark 𝔽p4 with clang":
|
||||
runBench("bench_fp4", "clang")
|
||||
|
||||
task bench_fp4_gcc_noasm, "Run benchmark 𝔽p4 with gcc - no Assembly":
|
||||
runBench("bench_fp4", "gcc", useAsm = false)
|
||||
|
||||
task bench_fp4_clang_noasm, "Run benchmark 𝔽p4 with clang - no Assembly":
|
||||
runBench("bench_fp4", "clang", useAsm = false)
|
||||
|
||||
# Extension field 𝔽p6
|
||||
# ------------------------------------------
|
||||
|
||||
task bench_fp6, "Run benchmark with 𝔽p6 your default compiler":
|
||||
task bench_fp6, "Run benchmark 𝔽p6 with your CC compiler":
|
||||
runBench("bench_fp6")
|
||||
|
||||
task bench_fp6_gcc, "Run benchmark 𝔽p6 with gcc":
|
||||
runBench("bench_fp6", "gcc")
|
||||
|
||||
task bench_fp6_clang, "Run benchmark 𝔽p6 with clang":
|
||||
runBench("bench_fp6", "clang")
|
||||
|
||||
task bench_fp6_gcc_noasm, "Run benchmark 𝔽p6 with gcc - no Assembly":
|
||||
runBench("bench_fp6", "gcc", useAsm = false)
|
||||
|
||||
task bench_fp6_clang_noasm, "Run benchmark 𝔽p6 with clang - no Assembly":
|
||||
runBench("bench_fp6", "clang", useAsm = false)
|
||||
task bench_fp6_noasm, "Run benchmark 𝔽p6 with CC compiler - no Assembly":
|
||||
runBench("bench_fp6", useAsm = false)
|
||||
|
||||
# Extension field 𝔽p12
|
||||
# ------------------------------------------
|
||||
|
||||
task bench_fp12, "Run benchmark with 𝔽p12 your default compiler":
|
||||
task bench_fp12, "Run benchmark 𝔽p12 with your CC compiler":
|
||||
runBench("bench_fp12")
|
||||
|
||||
task bench_fp12_gcc, "Run benchmark 𝔽p12 with gcc":
|
||||
runBench("bench_fp12", "gcc")
|
||||
|
||||
task bench_fp12_clang, "Run benchmark 𝔽p12 with clang":
|
||||
runBench("bench_fp12", "clang")
|
||||
|
||||
task bench_fp12_gcc_noasm, "Run benchmark 𝔽p12 with gcc - no Assembly":
|
||||
runBench("bench_fp12", "gcc", useAsm = false)
|
||||
|
||||
task bench_fp12_clang_noasm, "Run benchmark 𝔽p12 with clang - no Assembly":
|
||||
runBench("bench_fp12", "clang", useAsm = false)
|
||||
task bench_fp12_noasm, "Run benchmark 𝔽p12 with CC compiler - no Assembly":
|
||||
runBench("bench_fp12", useAsm = false)
|
||||
|
||||
# Elliptic curve G1
|
||||
# ------------------------------------------
|
||||
|
||||
task bench_ec_g1, "Run benchmark on Elliptic Curve group 𝔾1 - Default compiler":
|
||||
task bench_ec_g1, "Run benchmark on Elliptic Curve group 𝔾1 - CC compiler":
|
||||
runBench("bench_ec_g1")
|
||||
|
||||
task bench_ec_g1_gcc, "Run benchmark on Elliptic Curve group 𝔾1 - GCC":
|
||||
runBench("bench_ec_g1", "gcc")
|
||||
task bench_ec_g1_noasm, "Run benchmark on Elliptic Curve group 𝔾1 - CC compiler no Assembly":
|
||||
runBench("bench_ec_g1", useAsm = false)
|
||||
|
||||
task bench_ec_g1_clang, "Run benchmark on Elliptic Curve group 𝔾1 - Clang":
|
||||
runBench("bench_ec_g1", "clang")
|
||||
|
||||
task bench_ec_g1_gcc_noasm, "Run benchmark on Elliptic Curve group 𝔾1 - GCC no Assembly":
|
||||
runBench("bench_ec_g1", "gcc", useAsm = false)
|
||||
|
||||
task bench_ec_g1_clang_noasm, "Run benchmark on Elliptic Curve group 𝔾1 - Clang no Assembly":
|
||||
runBench("bench_ec_g1", "clang", useAsm = false)
|
||||
|
||||
# Elliptic curve G1 - batch operations
|
||||
# ------------------------------------------
|
||||
|
||||
task bench_ec_g1_batch, "Run benchmark on Elliptic Curve group 𝔾1 (batch ops) - Default compiler":
|
||||
task bench_ec_g1_batch, "Run benchmark on Elliptic Curve group 𝔾1 (batch ops) - CC compiler":
|
||||
runBench("bench_ec_g1_batch")
|
||||
|
||||
task bench_ec_g1_batch_gcc, "Run benchmark on Elliptic Curve group 𝔾1 (batch ops) - GCC":
|
||||
runBench("bench_ec_g1_batch", "gcc")
|
||||
task bench_ec_g1_batch_noasm, "Run benchmark on Elliptic Curve group 𝔾1 (batch ops) - CC compiler no Assembly":
|
||||
runBench("bench_ec_g1_batch", useAsm = false)
|
||||
|
||||
task bench_ec_g1_batch_clang, "Run benchmark on Elliptic Curve group 𝔾1 (batch ops) - Clang":
|
||||
runBench("bench_ec_g1_batch", "clang")
|
||||
|
||||
task bench_ec_g1_batch_gcc_noasm, "Run benchmark on Elliptic Curve group 𝔾1 (batch ops) - GCC no Assembly":
|
||||
runBench("bench_ec_g1_batch", "gcc", useAsm = false)
|
||||
|
||||
task bench_ec_g1_batch_clang_noasm, "Run benchmark on Elliptic Curve group 𝔾1 (batch ops) - Clang no Assembly":
|
||||
runBench("bench_ec_g1_batch", "clang", useAsm = false)
|
||||
|
||||
# Elliptic curve G1 - scalar multiplication
|
||||
# ------------------------------------------
|
||||
|
||||
task bench_ec_g1_scalar_mul, "Run benchmark on Elliptic Curve group 𝔾1 (Scalar Multiplication) - Default compiler":
|
||||
task bench_ec_g1_scalar_mul, "Run benchmark on Elliptic Curve group 𝔾1 (Scalar Multiplication) - CC compiler":
|
||||
runBench("bench_ec_g1_scalar_mul")
|
||||
|
||||
task bench_ec_g1_scalar_mul_gcc, "Run benchmark on Elliptic Curve group 𝔾1 (Scalar Multiplication) - GCC":
|
||||
runBench("bench_ec_g1_scalar_mul", "gcc")
|
||||
|
||||
task bench_ec_g1_scalar_mul_clang, "Run benchmark on Elliptic Curve group 𝔾1 (Scalar Multiplication) - Clang":
|
||||
runBench("bench_ec_g1_scalar_mul", "clang")
|
||||
|
||||
task bench_ec_g1_scalar_mul_gcc_noasm, "Run benchmark on Elliptic Curve group 𝔾1 (Scalar Multiplication) - GCC no Assembly":
|
||||
runBench("bench_ec_g1_scalar_mul", "gcc", useAsm = false)
|
||||
|
||||
task bench_ec_g1_scalar_mul_clang_noasm, "Run benchmark on Elliptic Curve group 𝔾1 (Scalar Multiplication) - Clang no Assembly":
|
||||
runBench("bench_ec_g1_scalar_mul", "clang", useAsm = false)
|
||||
task bench_ec_g1_scalar_mul_noasm, "Run benchmark on Elliptic Curve group 𝔾1 (Scalar Multiplication) - CC compiler no Assembly":
|
||||
runBench("bench_ec_g1_scalar_mul", useAsm = false)
|
||||
|
||||
# Elliptic curve G1 - Multi-scalar-mul
# ------------------------------------------

task bench_ec_g1_msm_bn254_snarks, "Run benchmark on Elliptic Curve group 𝔾1 (Multi-Scalar-Mul) for BN254-Snarks - Default compiler":
task bench_ec_g1_msm_bn254_snarks, "Run benchmark on Elliptic Curve group 𝔾1 (Multi-Scalar-Mul) for BN254-Snarks - CC compiler":
runBench("bench_ec_g1_msm_bn254_snarks")

task bench_ec_g1_msm_bn254_snarks_gcc, "Run benchmark on Elliptic Curve group 𝔾1 (Multi-Scalar-Mul) for BN254-Snarks - GCC":
runBench("bench_ec_g1_msm_bn254_snarks", "gcc")
task bench_ec_g1_msm_bn254_snarks_noasm, "Run benchmark on Elliptic Curve group 𝔾1 (Multi-Scalar-Mul) for BN254-Snarks - CC compiler no Assembly":
runBench("bench_ec_g1_msm_bn254_snarks", useAsm = false)

task bench_ec_g1_msm_bn254_snarks_clang, "Run benchmark on Elliptic Curve group 𝔾1 (Multi-Scalar-Mul) for BN254-Snarks - Clang":
runBench("bench_ec_g1_msm_bn254_snarks", "clang")

task bench_ec_g1_msm_bn254_snarks_gcc_noasm, "Run benchmark on Elliptic Curve group 𝔾1 (Multi-Scalar-Mul) for BN254-Snarks - GCC no Assembly":
runBench("bench_ec_g1_msm_bn254_snarks", "gcc", useAsm = false)

task bench_ec_g1_msm_bn254_snarks_clang_noasm, "Run benchmark on Elliptic Curve group 𝔾1 (Multi-Scalar-Mul) for BN254-Snarks - Clang no Assembly":
runBench("bench_ec_g1_msm_bn254_snarks", "clang", useAsm = false)

task bench_ec_g1_msm_bls12_381, "Run benchmark on Elliptic Curve group 𝔾1 (Multi-Scalar-Mul) for BLS12-381 - Default compiler":
task bench_ec_g1_msm_bls12_381, "Run benchmark on Elliptic Curve group 𝔾1 (Multi-Scalar-Mul) for BLS12-381 - CC compiler":
runBench("bench_ec_g1_msm_bls12_381")

task bench_ec_g1_msm_bls12_381_gcc, "Run benchmark on Elliptic Curve group 𝔾1 (Multi-Scalar-Mul) for BLS12-381 - GCC":
runBench("bench_ec_g1_msm_bls12_381", "gcc")

task bench_ec_g1_msm_bls12_381_clang, "Run benchmark on Elliptic Curve group 𝔾1 (Multi-Scalar-Mul) for BLS12-381 - Clang":
runBench("bench_ec_g1_msm_bls12_381", "clang")

task bench_ec_g1_msm_bls12_381_gcc_noasm, "Run benchmark on Elliptic Curve group 𝔾1 (Multi-Scalar-Mul) for BLS12-381 - GCC no Assembly":
runBench("bench_ec_g1_msm_bls12_381", "gcc", useAsm = false)

task bench_ec_g1_msm_bls12_381_clang_noasm, "Run benchmark on Elliptic Curve group 𝔾1 (Multi-Scalar-Mul) for BLS12-381 - Clang no Assembly":
runBench("bench_ec_g1_msm_bls12_381", "clang", useAsm = false)
task bench_ec_g1_msm_bls12_381_noasm, "Run benchmark on Elliptic Curve group 𝔾1 (Multi-Scalar-Mul) for BLS12-381 - CC compiler no Assembly":
runBench("bench_ec_g1_msm_bls12_381", useAsm = false)

# Elliptic curve G2
# ------------------------------------------

task bench_ec_g2, "Run benchmark on Elliptic Curve group 𝔾2 - Default compiler":
task bench_ec_g2, "Run benchmark on Elliptic Curve group 𝔾2 - CC compiler":
runBench("bench_ec_g2")

task bench_ec_g2_gcc, "Run benchmark on Elliptic Curve group 𝔾2 - GCC":
runBench("bench_ec_g2", "gcc")

task bench_ec_g2_clang, "Run benchmark on Elliptic Curve group 𝔾2 - Clang":
runBench("bench_ec_g2", "clang")

task bench_ec_g2_gcc_noasm, "Run benchmark on Elliptic Curve group 𝔾2 - GCC no Assembly":
runBench("bench_ec_g2", "gcc", useAsm = false)

task bench_ec_g2_clang_noasm, "Run benchmark on Elliptic Curve group 𝔾2 - Clang no Assembly":
runBench("bench_ec_g2", "clang", useAsm = false)
task bench_ec_g2_noasm, "Run benchmark on Elliptic Curve group 𝔾2 - CC compiler no Assembly":
runBench("bench_ec_g2", useAsm = false)

# Elliptic curve G2 - scalar multiplication
# ------------------------------------------

task bench_ec_g2_scalar_mul, "Run benchmark on Elliptic Curve group 𝔾2 (Scalar Multiplication) - Default compiler":
task bench_ec_g2_scalar_mul, "Run benchmark on Elliptic Curve group 𝔾2 (Scalar Multiplication) - CC compiler":
runBench("bench_ec_g2_scalar_mul")

task bench_ec_g2_scalar_mul_gcc, "Run benchmark on Elliptic Curve group 𝔾2 (Scalar Multiplication) - GCC":
runBench("bench_ec_g2_scalar_mul", "gcc")

task bench_ec_g2_scalar_mul_clang, "Run benchmark on Elliptic Curve group 𝔾2 (Scalar Multiplication) - Clang":
runBench("bench_ec_g2_scalar_mul", "clang")

task bench_ec_g2_scalar_mul_gcc_noasm, "Run benchmark on Elliptic Curve group 𝔾2 (Scalar Multiplication) - GCC no Assembly":
runBench("bench_ec_g2_scalar_mul", "gcc", useAsm = false)

task bench_ec_g2_scalar_mul_clang_noasm, "Run benchmark on Elliptic Curve group 𝔾2 (Scalar Multiplication) - Clang no Assembly":
runBench("bench_ec_g2_scalar_mul", "clang", useAsm = false)
task bench_ec_g2_scalar_mul_noasm, "Run benchmark on Elliptic Curve group 𝔾2 (Scalar Multiplication) - CC compiler no Assembly":
runBench("bench_ec_g2_scalar_mul", useAsm = false)

# Pairings
# ------------------------------------------

task bench_pairing_bls12_377, "Run pairings benchmarks for BLS12-377 - Default compiler":
task bench_pairing_bls12_377, "Run pairings benchmarks for BLS12-377 - CC compiler":
runBench("bench_pairing_bls12_377")

task bench_pairing_bls12_377_gcc, "Run pairings benchmarks for BLS12-377 - GCC":
runBench("bench_pairing_bls12_377", "gcc")

task bench_pairing_bls12_377_clang, "Run pairings benchmarks for BLS12-377 - Clang":
runBench("bench_pairing_bls12_377", "clang")

task bench_pairing_bls12_377_gcc_noasm, "Run pairings benchmarks for BLS12-377 - GCC no Assembly":
runBench("bench_pairing_bls12_377", "gcc", useAsm = false)

task bench_pairing_bls12_377_clang_noasm, "Run pairings benchmarks for BLS12-377 - Clang no Assembly":
runBench("bench_pairing_bls12_377", "clang", useAsm = false)
task bench_pairing_bls12_377_noasm, "Run pairings benchmarks for BLS12-377 - CC compiler no Assembly":
runBench("bench_pairing_bls12_377", useAsm = false)

# --

task bench_pairing_bls12_381, "Run pairings benchmarks for BLS12-381 - Default compiler":
task bench_pairing_bls12_381, "Run pairings benchmarks for BLS12-381 - CC compiler":
runBench("bench_pairing_bls12_381")

task bench_pairing_bls12_381_gcc, "Run pairings benchmarks for BLS12-381 - GCC":
runBench("bench_pairing_bls12_381", "gcc")

task bench_pairing_bls12_381_clang, "Run pairings benchmarks for BLS12-381 - Clang":
runBench("bench_pairing_bls12_381", "clang")

task bench_pairing_bls12_381_gcc_noasm, "Run pairings benchmarks for BLS12-381 - GCC no Assembly":
runBench("bench_pairing_bls12_381", "gcc", useAsm = false)

task bench_pairing_bls12_381_clang_noasm, "Run pairings benchmarks for BLS12-381 - Clang no Assembly":
runBench("bench_pairing_bls12_381", "clang", useAsm = false)
task bench_pairing_bls12_381_noasm, "Run pairings benchmarks for BLS12-381 - CC compiler no Assembly":
runBench("bench_pairing_bls12_381", useAsm = false)

# --

task bench_pairing_bn254_nogami, "Run pairings benchmarks for BN254-Nogami - Default compiler":
task bench_pairing_bn254_nogami, "Run pairings benchmarks for BN254-Nogami - CC compiler":
runBench("bench_pairing_bn254_nogami")

task bench_pairing_bn254_nogami_gcc, "Run pairings benchmarks for BN254-Nogami - GCC":
runBench("bench_pairing_bn254_nogami", "gcc")

task bench_pairing_bn254_nogami_clang, "Run pairings benchmarks for BN254-Nogami - Clang":
runBench("bench_pairing_bn254_nogami", "clang")

task bench_pairing_bn254_nogami_gcc_noasm, "Run pairings benchmarks for BN254-Nogami - GCC no Assembly":
runBench("bench_pairing_bn254_nogami", "gcc", useAsm = false)

task bench_pairing_bn254_nogami_clang_noasm, "Run pairings benchmarks for BN254-Nogami - Clang no Assembly":
runBench("bench_pairing_bn254_nogami", "clang", useAsm = false)
task bench_pairing_bn254_nogami_noasm, "Run pairings benchmarks for BN254-Nogami - CC compiler no Assembly":
runBench("bench_pairing_bn254_nogami", useAsm = false)

# --

task bench_pairing_bn254_snarks, "Run pairings benchmarks for BN254-Snarks - Default compiler":
task bench_pairing_bn254_snarks, "Run pairings benchmarks for BN254-Snarks - CC compiler":
runBench("bench_pairing_bn254_snarks")

task bench_pairing_bn254_snarks_gcc, "Run pairings benchmarks for BN254-Snarks - GCC":
runBench("bench_pairing_bn254_snarks", "gcc")

task bench_pairing_bn254_snarks_clang, "Run pairings benchmarks for BN254-Snarks - Clang":
runBench("bench_pairing_bn254_snarks", "clang")

task bench_pairing_bn254_snarks_gcc_noasm, "Run pairings benchmarks for BN254-Snarks - GCC no Assembly":
runBench("bench_pairing_bn254_snarks", "gcc", useAsm = false)

task bench_pairing_bn254_snarks_clang_noasm, "Run pairings benchmarks for BN254-Snarks - Clang no Assembly":
runBench("bench_pairing_bn254_snarks", "clang", useAsm = false)
task bench_pairing_bn254_snarks_noasm, "Run pairings benchmarks for BN254-Snarks - CC compiler no Assembly":
runBench("bench_pairing_bn254_snarks", useAsm = false)

# Curve summaries
# ------------------------------------------

task bench_summary_bls12_377, "Run summary benchmarks for BLS12-377 - Default compiler":
task bench_summary_bls12_377, "Run summary benchmarks for BLS12-377 - CC compiler":
runBench("bench_summary_bls12_377")

task bench_summary_bls12_377_gcc, "Run summary benchmarks for BLS12-377 - GCC":
runBench("bench_summary_bls12_377", "gcc")

task bench_summary_bls12_377_clang, "Run summary benchmarks for BLS12-377 - Clang":
runBench("bench_summary_bls12_377", "clang")

task bench_summary_bls12_377_gcc_noasm, "Run summary benchmarks for BLS12-377 - GCC no Assembly":
runBench("bench_summary_bls12_377", "gcc", useAsm = false)

task bench_summary_bls12_377_clang_noasm, "Run summary benchmarks for BLS12-377 - Clang no Assembly":
runBench("bench_summary_bls12_377", "clang", useAsm = false)
task bench_summary_bls12_377_noasm, "Run summary benchmarks for BLS12-377 - CC compiler no Assembly":
runBench("bench_summary_bls12_377", useAsm = false)

# --

task bench_summary_bls12_381, "Run summary benchmarks for BLS12-381 - Default compiler":
task bench_summary_bls12_381, "Run summary benchmarks for BLS12-381 - CC compiler":
runBench("bench_summary_bls12_381")

task bench_summary_bls12_381_gcc, "Run summary benchmarks for BLS12-381 - GCC":
runBench("bench_summary_bls12_381", "gcc")

task bench_summary_bls12_381_clang, "Run summary benchmarks for BLS12-381 - Clang":
runBench("bench_summary_bls12_381", "clang")

task bench_summary_bls12_381_gcc_noasm, "Run summary benchmarks for BLS12-381 - GCC no Assembly":
runBench("bench_summary_bls12_381", "gcc", useAsm = false)

task bench_summary_bls12_381_clang_noasm, "Run summary benchmarks for BLS12-381 - Clang no Assembly":
runBench("bench_summary_bls12_381", "clang", useAsm = false)
task bench_summary_bls12_381_noasm, "Run summary benchmarks for BLS12-381 - CC compiler no Assembly":
runBench("bench_summary_bls12_381", useAsm = false)

# --

task bench_summary_bn254_nogami, "Run summary benchmarks for BN254-Nogami - Default compiler":
task bench_summary_bn254_nogami, "Run summary benchmarks for BN254-Nogami - CC compiler":
runBench("bench_summary_bn254_nogami")

task bench_summary_bn254_nogami_gcc, "Run summary benchmarks for BN254-Nogami - GCC":
runBench("bench_summary_bn254_nogami", "gcc")

task bench_summary_bn254_nogami_clang, "Run summary benchmarks for BN254-Nogami - Clang":
runBench("bench_summary_bn254_nogami", "clang")

task bench_summary_bn254_nogami_gcc_noasm, "Run summary benchmarks for BN254-Nogami - GCC no Assembly":
runBench("bench_summary_bn254_nogami", "gcc", useAsm = false)

task bench_summary_bn254_nogami_clang_noasm, "Run summary benchmarks for BN254-Nogami - Clang no Assembly":
runBench("bench_summary_bn254_nogami", "clang", useAsm = false)
task bench_summary_bn254_nogami_noasm, "Run summary benchmarks for BN254-Nogami - CC compiler no Assembly":
runBench("bench_summary_bn254_nogami", useAsm = false)

# --

task bench_summary_bn254_snarks, "Run summary benchmarks for BN254-Snarks - Default compiler":
task bench_summary_bn254_snarks, "Run summary benchmarks for BN254-Snarks - CC compiler":
runBench("bench_summary_bn254_snarks")

task bench_summary_bn254_snarks_gcc, "Run summary benchmarks for BN254-Snarks - GCC":
runBench("bench_summary_bn254_snarks", "gcc")

task bench_summary_bn254_snarks_clang, "Run summary benchmarks for BN254-Snarks - Clang":
runBench("bench_summary_bn254_snarks", "clang")

task bench_summary_bn254_snarks_gcc_noasm, "Run summary benchmarks for BN254-Snarks - GCC no Assembly":
runBench("bench_summary_bn254_snarks", "gcc", useAsm = false)

task bench_summary_bn254_snarks_clang_noasm, "Run summary benchmarks for BN254-Snarks - Clang no Assembly":
runBench("bench_summary_bn254_snarks", "clang", useAsm = false)
task bench_summary_bn254_snarks_noasm, "Run summary benchmarks for BN254-Snarks - CC compiler no Assembly":
runBench("bench_summary_bn254_snarks", useAsm = false)

# --

task bench_summary_pasta, "Run summary benchmarks for the Pasta curves - Default compiler":
task bench_summary_pasta, "Run summary benchmarks for the Pasta curves - CC compiler":
runBench("bench_summary_pasta")

task bench_summary_pasta_gcc, "Run summary benchmarks for the Pasta curves - GCC":
runBench("bench_summary_pasta", "gcc")

task bench_summary_pasta_clang, "Run summary benchmarks for the Pasta curves - Clang":
runBench("bench_summary_pasta", "clang")

task bench_summary_pasta_gcc_noasm, "Run summary benchmarks for the Pasta curves - GCC no Assembly":
runBench("bench_summary_pasta", "gcc", useAsm = false)

task bench_summary_pasta_clang_noasm, "Run summary benchmarks for the Pasta curves - Clang no Assembly":
runBench("bench_summary_pasta", "clang", useAsm = false)
task bench_summary_pasta_noasm, "Run summary benchmarks for the Pasta curves - CC compiler no Assembly":
runBench("bench_summary_pasta", useAsm = false)

# Hashes
# ------------------------------------------
@ -1185,31 +1031,13 @@ task bench_sha256, "Run SHA256 benchmarks":
task bench_hash_to_curve, "Run Hash-to-Curve benchmarks":
runBench("bench_hash_to_curve")

task bench_hash_to_curve_gcc, "Run Hash-to-Curve benchmarks":
runBench("bench_hash_to_curve", "gcc")

task bench_hash_to_curve_clang, "Run Hash-to-Curve benchmarks":
runBench("bench_hash_to_curve", "clang")

task bench_hash_to_curve_gcc_noasm, "Run Hash-to-Curve benchmarks":
runBench("bench_hash_to_curve", "gcc", useAsm = false)

task bench_hash_to_curve_clang_noasm, "Run Hash-to-Curve benchmarks":
runBench("bench_hash_to_curve", "clang", useAsm = false)
task bench_hash_to_curve_noasm, "Run Hash-to-Curve benchmarks - No Assembly":
runBench("bench_hash_to_curve", useAsm = false)

# BLS signatures
# ------------------------------------------
task bench_ethereum_bls_signatures, "Run Ethereum BLS signatures benchmarks":
task bench_ethereum_bls_signatures, "Run Ethereum BLS signatures benchmarks - CC compiler":
runBench("bench_ethereum_bls_signatures")

task bench_ethereum_bls_signatures_gcc, "Run Ethereum BLS signatures benchmarks":
runBench("bench_ethereum_bls_signatures", "gcc")

task bench_ethereum_bls_signatures_clang, "Run Ethereum BLS signatures benchmarks":
runBench("bench_ethereum_bls_signatures", "clang")

task bench_ethereum_bls_signatures_gcc_noasm, "Run Ethereum BLS signatures benchmarks":
runBench("bench_ethereum_bls_signatures", "gcc", useAsm = false)

task bench_ethereum_bls_signatures_clang_noasm, "Run Ethereum BLS signatures benchmarks":
runBench("bench_ethereum_bls_signatures", "clang", useAsm = false)
task bench_ethereum_bls_signatures_noasm, "Run Ethereum BLS signatures benchmarks - CC compiler no assembly":
runBench("bench_ethereum_bls_signatures", useAsm = false)

@ -50,7 +50,7 @@ import ./zoo_exports
static:
# Export SHA256 routines with a protocol specific prefix
# This exports sha256.init(), sha256.update(), sha256.finish() and sha256.clear()
prefix_sha256 = prefix_ffi & "_sha256_"
prefix_sha256 = prefix_ffi & "sha256_"

import hashes
export hashes # generic sandwich on sha256
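The one-character change above removes a doubled separator from the exported SHA256 symbol names. A hedged illustration, with the concrete prefix value assumed purely for demonstration:

const prefix_ffi = "ctt_eth_bls_"   # assumed example value, not taken from the patch
doAssert prefix_ffi & "_sha256_" & "init" == "ctt_eth_bls__sha256_init"   # old: double underscore
doAssert prefix_ffi & "sha256_" & "init" == "ctt_eth_bls_sha256_init"     # new: clean symbol name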
@ -10,6 +10,7 @@ import
# Standard library
std/macros,
# Internal
./limbs_asm_modular_x86,
../../../platforms/abstractions

# ############################################################
@ -32,7 +33,7 @@ static: doAssert UseASM_X86_64
# Double-precision field addition
# ------------------------------------------------------------

macro addmod2x_gen[N: static int](R: var Limbs[N], A, B: Limbs[N], m: Limbs[N div 2]): untyped =
macro addmod2x_gen[N: static int](r_PIR: var Limbs[N], a_MEM, b_MEM: Limbs[N], M_MEM: Limbs[N div 2], spareBits: static int): untyped =
## Generate an optimized out-of-place double-precision addition kernel

result = newStmtList()
@ -41,23 +42,28 @@ macro addmod2x_gen[N: static int](R: var Limbs[N], A, B: Limbs[N], m: Limbs[N di
let
H = N div 2

r = init(OperandArray, nimSymbol = R, N, PointerInReg, InputOutput)
# We reuse the reg used for b for overflow detection
b = init(OperandArray, nimSymbol = B, N, PointerInReg, InputOutput)
r = asmArray(r_PIR, N, PointerInReg, asmInputOutputEarlyClobber, memIndirect = memWrite) # MemOffsettable is the better constraint but compilers say it is impossible. Use early clobber to ensure it is not affected by constant propagation at slight pessimization (reloading it).
b = asmArray(b_MEM, N, MemOffsettable, asmInput)
# We could force m as immediate by specializing per moduli
M = init(OperandArray, nimSymbol = m, N, PointerInReg, Input)
M = asmArray(M_MEM, H, MemOffsettable, asmInput)
# If N is too big, we need to spill registers. TODO.
u = init(OperandArray, nimSymbol = ident"U", H, ElemsInReg, InputOutput)
v = init(OperandArray, nimSymbol = ident"V", H, ElemsInReg, InputOutput)
uSym = ident"u"
vSym = ident"v"
u = asmArray(uSym, H, ElemsInReg, asmInputOutput)
v = asmArray(vSym, H, ElemsInReg, asmInputOutput)

overflowRegSym = ident"overflowReg"
overflowReg = asmValue(overflowRegSym, Reg, asmOutputOverwrite)

let usym = u.nimSymbol
let vsym = v.nimSymbol
result.add quote do:
var `usym`{.noinit.}, `vsym` {.noInit.}: typeof(`A`)
var `uSym`{.noinit.}, `vSym` {.noInit.}: typeof(`a_MEM`)
staticFor i, 0, `H`:
`usym`[i] = `A`[i]
`uSym`[i] = `a_MEM`[i]
staticFor i, `H`, `N`:
`vsym`[i-`H`] = `A`[i]
`vSym`[i-`H`] = `a_MEM`[i]

when `sparebits` == 0:
var `overflowRegSym`{.noInit.}: BaseType

# Addition
# u = a[0..<H] + b[0..<H], v = a[H..<N]
@ -72,38 +78,26 @@ macro addmod2x_gen[N: static int](R: var Limbs[N], A, B: Limbs[N], m: Limbs[N di
ctx.adc v[i-H], b[i]
ctx.mov u[i-H], v[i-H]

# Mask: overflowed contains 0xFFFF or 0x0000
# TODO: unnecessary if MSB never set, i.e. "Field.getSpareBits >= 1"
let overflowed = b.reuseRegister()
ctx.sbb overflowed, overflowed
let rUpperHalf = r.subset(H, N)

# Now substract the modulus to test a < 2ⁿp
ctx.sub v[0], M[0]
for i in 1 ..< H:
ctx.sbb v[i], M[i]
if spareBits >= 1:
# Now substract the modulus to test a < 2ⁿp
ctx.finalSubNoOverflowImpl(rUpperHalf, v, M, u)
else:
ctx.finalSubMayOverflowImpl(rUpperHalf, v, M, u, scratchReg = overflowReg)

# If it overflows here, it means that it was
# smaller than the modulus and we don't need v
ctx.sbb overflowed, 0
result.add ctx.generate()

# Conditional Mov and
# and store result
for i in 0 ..< H:
ctx.cmovnc u[i], v[i]
ctx.mov r[i+H], u[i]

result.add ctx.generate

func addmod2x_asm*[N: static int](r: var Limbs[N], a, b: Limbs[N], M: Limbs[N div 2]) {.noInline.} =
func addmod2x_asm*[N: static int](r: var Limbs[N], a, b: Limbs[N], M: Limbs[N div 2], spareBits: static int) =
## Constant-time double-precision addition
## Output is conditionally reduced by 2ⁿp
## to stay in the [0, 2ⁿp) range
addmod2x_gen(r, a, b, M)
addmod2x_gen(r, a, b, M, spareBits)

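For readers who prefer not to trace the generated assembly, here is a hedged, portable Nim sketch of what the double-precision addition kernel computes in the common spareBits >= 1 case: a full 2H-limb addition, followed by a constant-time conditional subtraction of the modulus from the upper H limbs only (reduction modulo 2ⁿp). The function name is invented for the sketch and it models the algorithm, not the register allocation.

func addmod2xSketch[N: static int](a, b: array[N, uint64], M: array[N div 2, uint64]): array[N, uint64] =
  const H = N div 2
  var carry = 0'u64
  for i in 0 ..< N:                           # r = a + b over all 2H limbs
    let s = a[i] + b[i]
    result[i] = s + carry
    carry = uint64(s < a[i]) + uint64(result[i] < s)
  # with a spare bit in the modulus, carry is guaranteed to be 0 here
  var t: array[H, uint64]                     # tentative upper half minus M
  var borrow = 0'u64
  for i in 0 ..< H:
    let d = result[H+i] - M[i]
    t[i] = d - borrow
    borrow = uint64(result[H+i] < M[i]) + uint64(d < borrow)
  let mask = borrow - 1                       # all-ones if upper half >= M, zero otherwise
  for i in 0 ..< H:                           # constant-time select
    result[H+i] = (t[i] and mask) or (result[H+i] and not mask)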
# Double-precision field substraction
# ------------------------------------------------------------

macro submod2x_gen[N: static int](R: var Limbs[N], A, B: Limbs[N], m: Limbs[N div 2]): untyped =
macro submod2x_gen[N: static int](r_PIR: var Limbs[N], a_MEM, b_PIR: Limbs[N], M_MEM: Limbs[N div 2]): untyped =
## Generate an optimized out-of-place double-precision substraction kernel

result = newStmtList()
@ -112,23 +106,22 @@ macro submod2x_gen[N: static int](R: var Limbs[N], A, B: Limbs[N], m: Limbs[N di
let
H = N div 2

r = init(OperandArray, nimSymbol = R, N, PointerInReg, InputOutput)
# We reuse the reg used for b for overflow detection
b = init(OperandArray, nimSymbol = B, N, PointerInReg, InputOutput)
r = asmArray(r_PIR, N, PointerInReg, asmInputOutputEarlyClobber, memIndirect = memWrite) # MemOffsettable is the better constraint but compilers say it is impossible. Use early clobber to ensure it is not affected by constant propagation at slight pessimization (reloading it).
b = asmArray(b_PIR, N, PointerInReg, asmInputOutputEarlyClobber, memIndirect = memRead) # We reuse the reg used for b for overflow detection
# We could force m as immediate by specializing per moduli
M = init(OperandArray, nimSymbol = m, N, PointerInReg, Input)
M = asmArray(M_MEM, H, MemOffsettable, asmInput)
# If N is too big, we need to spill registers. TODO.
u = init(OperandArray, nimSymbol = ident"U", H, ElemsInReg, InputOutput)
v = init(OperandArray, nimSymbol = ident"V", H, ElemsInReg, InputOutput)
uSym = ident"u"
vSym = ident"v"
u = asmArray(uSym, H, ElemsInReg, asmInputOutput)
v = asmArray(vSym, H, ElemsInReg, asmInputOutput)

let usym = u.nimSymbol
let vsym = v.nimSymbol
result.add quote do:
var `usym`{.noinit.}, `vsym` {.noInit.}: typeof(`A`)
var `uSym`{.noinit.}, `vSym` {.noInit.}: typeof(`a_MEM`)
staticFor i, 0, `H`:
`usym`[i] = `A`[i]
`uSym`[i] = `a_MEM`[i]
staticFor i, `H`, `N`:
`vsym`[i-`H`] = `A`[i]
`vSym`[i-`H`] = `a_MEM`[i]

# Substraction
# u = a[0..<H] - b[0..<H], v = a[H..<N]
@ -158,9 +151,9 @@ macro submod2x_gen[N: static int](R: var Limbs[N], A, B: Limbs[N], m: Limbs[N di
ctx.adc u[i], v[i]
ctx.mov r[i+H], u[i]

result.add ctx.generate
result.add ctx.generate()

func submod2x_asm*[N: static int](r: var Limbs[N], a, b: Limbs[N], M: Limbs[N div 2]) {.noInline.} =
func submod2x_asm*[N: static int](r: var Limbs[N], a, b: Limbs[N], M: Limbs[N div 2]) =
## Constant-time double-precision substraction
## Output is conditionally reduced by 2ⁿp
## to stay in the [0, 2ⁿp) range
@ -169,7 +162,7 @@ func submod2x_asm*[N: static int](r: var Limbs[N], a, b: Limbs[N], M: Limbs[N di
# Double-precision field negation
# ------------------------------------------------------------

macro negmod2x_gen[N: static int](R: var Limbs[N], A: Limbs[N], m: Limbs[N div 2]): untyped =
macro negmod2x_gen[N: static int](r_PIR: var Limbs[N], a_MEM: Limbs[N], M_MEM: Limbs[N div 2]): untyped =
## Generate an optimized modular negation kernel

result = newStmtList()
@ -178,22 +171,20 @@ macro negmod2x_gen[N: static int](R: var Limbs[N], A: Limbs[N], m: Limbs[N div 2
let
H = N div 2

a = init(OperandArray, nimSymbol = A, N, PointerInReg, Input)
r = init(OperandArray, nimSymbol = R, N, PointerInReg, InputOutput)
u = init(OperandArray, nimSymbol = ident"U", N, ElemsInReg, Output_EarlyClobber)
a = asmArray(a_MEM, N, MemOffsettable, asmInput)
r = asmArray(r_PIR, N, PointerInReg, asmInputOutputEarlyClobber, memIndirect = memWrite) # MemOffsettable is the better constraint but compilers say it is impossible. Use early clobber to ensure it is not affected by constant propagation at slight pessimization (reloading it).
uSym = ident"u"
u = asmArray(uSym, N, ElemsInReg, asmOutputEarlyClobber)
# We could force m as immediate by specializing per moduli
# We reuse the reg used for m for overflow detection
M = init(OperandArray, nimSymbol = m, N, PointerInReg, InputOutput)
M = asmArray(M_MEM, N, MemOffsettable, asmInput)

isZero = Operand(
desc: OperandDesc(
asmId: "[isZero]",
nimSymbol: ident"isZero",
rm: Reg,
constraint: Output_EarlyClobber,
cEmit: "isZero"
)
)
isZeroSym = ident"isZero"
isZero = asmValue(isZeroSym, Reg, asmOutputEarlyClobber)

result.add quote do:
var `isZerosym`{.noInit.}: BaseType
var `usym`{.noinit, used.}: typeof(`a_MEM`)

# Substraction 2ⁿp - a
# The lower half of 2ⁿp is filled with zero
@ -227,13 +218,8 @@ macro negmod2x_gen[N: static int](R: var Limbs[N], A: Limbs[N], m: Limbs[N div 2
ctx.cmovz u[i-H], isZero
ctx.mov r[i], u[i-H]

let isZerosym = isZero.desc.nimSymbol
let usym = u.nimSymbol
result.add quote do:
var `isZerosym`{.noInit.}: BaseType
var `usym`{.noinit, used.}: typeof(`A`)
result.add ctx.generate
result.add ctx.generate()

func negmod2x_asm*[N: static int](r: var Limbs[N], a: Limbs[N], M: Limbs[N div 2]) {.noInline.} =
func negmod2x_asm*[N: static int](r: var Limbs[N], a: Limbs[N], M: Limbs[N div 2]) =
## Constant-time double-precision negation
negmod2x_gen(r, a, M)

@ -18,11 +18,6 @@ import
#
# ############################################################

# Note: We can refer to at most 30 registers in inline assembly
# and "InputOutput" registers count double
# They are nice to let the compiler deals with mov
# but too constraining so we move things ourselves.

static: doAssert UseASM_X86_32

# Necessary for the compiler to find enough registers
@ -31,7 +26,8 @@ static: doAssert UseASM_X86_32
proc finalSubNoOverflowImpl*(
ctx: var Assembler_x86,
r: Operand or OperandArray,
a, M, scratch: OperandArray) =
a, M, scratch: OperandArray,
a_in_scratch = false) =
## Reduce `a` into `r` modulo `M`
## To be used when the modulus does not use the full bitwidth of the storing words
## for example a 255-bit modulus in n words of total max size 2^256
@ -42,10 +38,12 @@ proc finalSubNoOverflowImpl*(
ctx.comment "Final substraction (cannot overflow its limbs)"

# Substract the modulus, and test a < p with the last borrow
ctx.mov scratch[0], a[0]
if not a_in_scratch:
ctx.mov scratch[0], a[0]
ctx.sub scratch[0], M[0]
for i in 1 ..< N:
ctx.mov scratch[i], a[i]
if not a_in_scratch:
ctx.mov scratch[i], a[i]
ctx.sbb scratch[i], M[i]

# If we borrowed it means that we were smaller than
@ -58,13 +56,15 @@ proc finalSubMayOverflowImpl*(
ctx: var Assembler_x86,
r: Operand or OperandArray,
a, M, scratch: OperandArray,
scratchReg: Operand or Register or OperandReuse) =
a_in_scratch = false,
scratchReg: Operand or Register or OperandReuse = rax) =
## Reduce `a` into `r` modulo `M`
## To be used when the final substraction can
## also overflow the limbs (a 2^256 order of magnitude modulus stored in n words of total max size 2^256)
##
## r, a, scratch, scratchReg are mutated
## r, a, scratch are mutated
## M is read-only
## This clobbers RAX
let N = M.len
ctx.comment "Final substraction (may carry)"

@ -72,10 +72,12 @@ proc finalSubMayOverflowImpl*(
ctx.sbb scratchReg, scratchReg

# Now substract the modulus, and test a < p with the last borrow
ctx.mov scratch[0], a[0]
if not a_in_scratch:
ctx.mov scratch[0], a[0]
ctx.sub scratch[0], M[0]
for i in 1 ..< N:
ctx.mov scratch[i], a[i]
if not a_in_scratch:
ctx.mov scratch[i], a[i]
ctx.sbb scratch[i], M[i]

# If it overflows here, it means that it was
@ -89,9 +91,10 @@ proc finalSubMayOverflowImpl*(
ctx.mov r[i], a[i]

macro finalSub_gen*[N: static int](
r_PIR: var array[N, SecretWord],
a_EIR, M_PIR: array[N, SecretWord],
scratch_EIR: var array[N, SecretWord],
r_PIR: var Limbs[N],
a_EIR: Limbs[N],
M_MEM: Limbs[N],
scratch_EIR: var Limbs[N],
mayOverflow: static bool): untyped =
## Returns:
## a-M if a > M
@ -99,35 +102,32 @@ macro finalSub_gen*[N: static int](
##
## - r_PIR is a pointer to the result array, mutated,
## - a_EIR is an array of registers, mutated,
## - M_PIR is a pointer to an array, read-only,
## - M_MEM is a pointer to an array, read-only,
## - scratch_EIR is an array of registers, mutated
## - mayOverflow is set to true when the carry flag also needs to be read
result = newStmtList()

var ctx = init(Assembler_x86, BaseType)
let
r = init(OperandArray, nimSymbol = r_PIR, N, PointerInReg, InputOutput)
r = asmArray(r_PIR, N, PointerInReg, asmInputOutputEarlyClobber, memIndirect = memWrite) # MemOffsettable is the better constraint but compilers say it is impossible. Use early clobber to ensure it is not affected by constant propagation at slight pessimization (reloading it).
# We reuse the reg used for b for overflow detection
a = init(OperandArray, nimSymbol = a_EIR, N, ElemsInReg, InputOutput)
a = asmArray(a_EIR, N, ElemsInReg, asmInputOutput)
# We could force m as immediate by specializing per moduli
M = init(OperandArray, nimSymbol = M_PIR, N, PointerInReg, Input)
t = init(OperandArray, nimSymbol = scratch_EIR, N, ElemsInReg, Output_EarlyClobber)
M = asmArray(M_MEM, N, MemOffsettable, asmInput)
t = asmArray(scratch_EIR, N, ElemsInReg, asmOutputEarlyClobber)

if mayOverflow:
ctx.finalSubMayOverflowImpl(
r, a, M, t, rax
)
ctx.finalSubMayOverflowImpl(r, a, M, t)
else:
ctx.finalSubNoOverflowImpl(
r, a, M, t
)
ctx.finalSubNoOverflowImpl(r, a, M, t)

result.add ctx.generate()

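As a hedged, portable summary of what both final-subtraction flavours compute (invented helper name, algorithm only): subtract M from a with borrow propagation, then select a or a−M branchlessly; the mayOverflow flavour additionally folds in the carry-out of the preceding operation, since the true value may not fit in N limbs.

func finalSubSketch[N: static int](a, M: array[N, uint64], carryIn = 0'u64): array[N, uint64] =
  ## carryIn is the carry-out of the preceding operation: always 0 for the
  ## "no overflow" variant (modulus with a spare bit), 0 or 1 otherwise.
  var t: array[N, uint64]
  var borrow = 0'u64
  for i in 0 ..< N:
    let d = a[i] - M[i]
    t[i] = d - borrow
    borrow = uint64(a[i] < M[i]) + uint64(d < borrow)
  # a >= M exactly when the subtraction did not borrow, or when the value overflowed its limbs
  let mask = 0'u64 - (carryIn or (1'u64 - borrow))    # all-ones -> take a-M, zero -> keep a
  for i in 0 ..< N:
    result[i] = (t[i] and mask) or (a[i] and not mask)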
# Field addition
# ------------------------------------------------------------

macro addmod_gen[N: static int](R: var Limbs[N], A, B, m: Limbs[N], spareBits: static int): untyped =
macro addmod_gen[N: static int](r_PIR: var Limbs[N], a_PIR, b_PIR, M_MEM: Limbs[N], spareBits: static int): untyped =
## Generate an optimized modular addition kernel
# Register pressure note:
# We could generate a kernel per modulus m by hardcoding it as immediate
@ -139,21 +139,20 @@ macro addmod_gen[N: static int](R: var Limbs[N], A, B, m: Limbs[N], spareBits: s

var ctx = init(Assembler_x86, BaseType)
let
r = init(OperandArray, nimSymbol = R, N, PointerInReg, InputOutput)
# We reuse the reg used for b for overflow detection
b = init(OperandArray, nimSymbol = B, N, PointerInReg, InputOutput)
r = asmArray(r_PIR, N, PointerInReg, asmInputOutputEarlyClobber, memIndirect = memWrite) # MemOffsettable is the better constraint but compilers say it is impossible. Use early clobber to ensure it is not affected by constant propagation at slight pessimization (reloading it).
b = asmArray(b_PIR, N, PointerInReg, asmInputOutputEarlyClobber, memIndirect = memRead) # LLVM Gold linker runs out of registers in t_ec_shortw_prj_g1_sum_reduce if we use b as Memoffsettable and a separate overflow register
# We could force m as immediate by specializing per moduli
M = init(OperandArray, nimSymbol = m, N, PointerInReg, Input)
M = asmArray(M_MEM, N, MemOffsettable, asmInput)
# If N is too big, we need to spill registers. TODO.
u = init(OperandArray, nimSymbol = ident"u", N, ElemsInReg, InputOutput)
v = init(OperandArray, nimSymbol = ident"v", N, ElemsInReg, Output_EarlyClobber)
uSym = ident"u"
vSym = ident"v"
u = asmArray(uSym, N, ElemsInReg, asmInputOutput)
v = asmArray(vSym, N, ElemsInReg, asmOutputEarlyClobber)

let usym = u.nimSymbol
let vsym = v.nimSymbol
result.add quote do:
var `usym`{.noinit.}, `vsym` {.noInit, used.}: typeof(`A`)
var `usym`{.noinit.}, `vsym` {.noInit, used.}: typeof(`a_PIR`)
staticFor i, 0, `N`:
`usym`[i] = `A`[i]
`usym`[i] = `a_PIR`[i]

# Addition
ctx.add u[0], b[0]
@ -164,23 +163,20 @@ macro addmod_gen[N: static int](R: var Limbs[N], A, B, m: Limbs[N], spareBits: s
ctx.mov v[i], u[i]

if spareBits >= 1:
ctx.finalSubNoOverflowImpl(r, u, M, v)
ctx.finalSubNoOverflowImpl(r, u, M, v, a_in_scratch = true)
else:
ctx.finalSubMayOverflowImpl(
r, u, M, v, b.reuseRegister()
)
ctx.finalSubMayOverflowImpl(r, u, M, v, a_in_scratch = true, scratchReg = b.reuseRegister())

result.add ctx.generate()

func addmod_asm*(r: var Limbs, a, b, m: Limbs, spareBits: static int) {.noInline.} =
func addmod_asm*(r: var Limbs, a, b, M: Limbs, spareBits: static int) =
## Constant-time modular addition
# This MUST be noInline or Clang will run out of registers with LTO
addmod_gen(r, a, b, m, spareBits)
addmod_gen(r, a, b, M, spareBits)
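The spareBits split above is worth spelling out with a worked bound (standard reasoning, not text from the patch): with at least one spare bit the modulus satisfies M < 2^(w·N−1), so for reduced inputs a, b < M

a + b < 2·M < 2^(w·N)

and the addition can never carry out of the top limb, which is why finalSubNoOverflowImpl suffices. With zero spare bits the carry flag of the addition must itself be captured and turned into a selection mask, hence finalSubMayOverflowImpl and the extra scratch register recycled from `b`.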
# Field substraction
# ------------------------------------------------------------

macro submod_gen[N: static int](R: var Limbs[N], A, B, m: Limbs[N]): untyped =
macro submod_gen[N: static int](r_PIR: var Limbs[N], a_PIR, b_PIR, M_MEM: Limbs[N]): untyped =
## Generate an optimized modular substraction kernel
# Register pressure note:
# We could generate a kernel per modulus m by hardcoding it as immediate
@ -192,21 +188,20 @@ macro submod_gen[N: static int](R: var Limbs[N], A, B, m: Limbs[N]): untyped =

var ctx = init(Assembler_x86, BaseType)
let
r = init(OperandArray, nimSymbol = R, N, PointerInReg, InputOutput)
# We reuse the reg used for b for overflow detection
b = init(OperandArray, nimSymbol = B, N, PointerInReg, InputOutput)
r = asmArray(r_PIR, N, PointerInReg, asmInputOutputEarlyClobber, memIndirect = memWrite) # MemOffsettable is the better constraint but compilers say it is impossible. Use early clobber to ensure it is not affected by constant propagation at slight pessimization (reloading it).
b = asmArray(b_PIR, N, PointerInReg, asmInputOutputEarlyClobber, memIndirect = memRead) # register reused for underflow detection
# We could force m as immediate by specializing per moduli
M = init(OperandArray, nimSymbol = m, N, PointerInReg, Input)
M = asmArray(M_MEM, N, MemOffsettable, asmInput)
# If N is too big, we need to spill registers. TODO.
u = init(OperandArray, nimSymbol = ident"U", N, ElemsInReg, InputOutput)
v = init(OperandArray, nimSymbol = ident"V", N, ElemsInReg, Output_EarlyClobber)
uSym = ident"u"
vSym = ident"v"
u = asmArray(uSym, N, ElemsInReg, asmInputOutput)
v = asmArray(vSym, N, ElemsInReg, asmOutputEarlyClobber)

let usym = u.nimSymbol
let vsym = v.nimSymbol
result.add quote do:
var `usym`{.noinit.}, `vsym` {.noInit, used.}: typeof(`A`)
var `usym`{.noinit.}, `vsym` {.noInit, used.}: typeof(`a_PIR`)
staticFor i, 0, `N`:
`usym`[i] = `A`[i]
`usym`[i] = `a_PIR`[i]

# Substraction
ctx.sub u[0], b[0]
@ -231,30 +226,37 @@ macro submod_gen[N: static int](R: var Limbs[N], A, B, m: Limbs[N]): untyped =
ctx.adc u[i], v[i]
ctx.mov r[i], u[i]

result.add ctx.generate
result.add ctx.generate()

func submod_asm*(r: var Limbs, a, b, M: Limbs) {.noInline.} =
func submod_asm*(r: var Limbs, a, b, M: Limbs) =
## Constant-time modular substraction
## Warning, does not handle aliasing of a and b
# This MUST be noInline or Clang will run out of registers with LTO
submod_gen(r, a, b, M)

# Field negation
# ------------------------------------------------------------

macro negmod_gen[N: static int](R: var Limbs[N], A, m: Limbs[N]): untyped =
macro negmod_gen[N: static int](r_PIR: var Limbs[N], a_MEM, M_MEM: Limbs[N]): untyped =
## Generate an optimized modular negation kernel

result = newStmtList()

var ctx = init(Assembler_x86, BaseType)
let
a = init(OperandArray, nimSymbol = A, N, PointerInReg, Input)
r = init(OperandArray, nimSymbol = R, N, PointerInReg, InputOutput)
u = init(OperandArray, nimSymbol = ident"U", N, ElemsInReg, Output_EarlyClobber)
a = asmArray(a_MEM, N, MemOffsettable, asmInput)
r = asmArray(r_PIR, N, PointerInReg, asmInputOutputEarlyClobber, memIndirect = memWrite) # MemOffsettable is the better constraint but compilers say it is impossible. Use early clobber to ensure it is not affected by constant propagation at slight pessimization (reloading it).
uSym = ident"u"
u = asmArray(uSym, N, ElemsInReg, asmOutputEarlyClobber)
# We could force m as immediate by specializing per moduli
# We reuse the reg used for m for overflow detection
M = init(OperandArray, nimSymbol = m, N, PointerInReg, InputOutput)
M = asmArray(M_MEM, N, MemOffsettable, asmInput)

isZeroSym = ident"isZero"
isZero = asmValue(isZeroSym, Reg, asmOutputEarlyClobber)

result.add quote do:
var `usym`{.noinit, used.}: typeof(`a_MEM`)
var `isZeroSym`{.noinit.}: BaseType

# Substraction m - a
ctx.mov u[0], M[0]
@ -264,7 +266,6 @@ macro negmod_gen[N: static int](R: var Limbs[N], A, m: Limbs[N]): untyped =
ctx.sbb u[i], a[i]

# Deal with a == 0
let isZero = M.reuseRegister()
ctx.mov isZero, a[0]
for i in 1 ..< N:
ctx.`or` isZero, a[i]
@ -274,11 +275,8 @@ macro negmod_gen[N: static int](R: var Limbs[N], A, m: Limbs[N]): untyped =
ctx.cmovz u[i], isZero
ctx.mov r[i], u[i]

let usym = u.nimSymbol
result.add quote do:
var `usym`{.noinit, used.}: typeof(`A`)
result.add ctx.generate
result.add ctx.generate()

func negmod_asm*(r: var Limbs, a, m: Limbs) =
func negmod_asm*(r: var Limbs, a, M: Limbs) =
## Constant-time modular negation
negmod_gen(r, a, m)
negmod_gen(r, a, M)
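A hedged portable sketch of the negation kernels above (invented name, algorithm only): compute M − a limb by limb with borrows, then force the result back to zero when a itself is zero, using a mask derived from OR-ing all limbs so the code stays branch-free.

func negmodSketch[N: static int](a, M: array[N, uint64]): array[N, uint64] =
  var borrow = 0'u64
  var anyLimb = 0'u64
  for i in 0 ..< N:
    let d = M[i] - a[i]
    result[i] = d - borrow
    borrow = uint64(M[i] < a[i]) + uint64(d < borrow)
    anyLimb = anyLimb or a[i]
  # if a == 0 then M - a == M, which must be canonicalized back to 0
  let mask = uint64(anyLimb == 0'u64) - 1'u64   # zero if a == 0, all-ones otherwise
  for i in 0 ..< N:
    result[i] = result[i] and mask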
@ -21,11 +21,6 @@ import
#
# ############################################################

# Note: We can refer to at most 30 registers in inline assembly
# and "InputOutput" registers count double
# They are nice to let the compiler deals with mov
# but too constraining so we move things ourselves.

static: doAssert UseASM_X86_64

# Necessary for the compiler to find enough registers
@ -37,7 +32,7 @@ static: doAssert UseASM_X86_64
# Fallback when no ADX and BMI2 support (MULX, ADCX, ADOX)
macro mulMont_CIOS_sparebit_gen[N: static int](
r_PIR: var Limbs[N], a_PIR, b_PIR,
M_PIR: Limbs[N], m0ninv_REG: BaseType,
M_MEM: Limbs[N], m0ninv_REG: BaseType,
skipFinalSub: static bool): untyped =
## Generate an optimized Montgomery Multiplication kernel
## using the CIOS method
@ -58,29 +53,23 @@ macro mulMont_CIOS_sparebit_gen[N: static int](
scratchSlots = 6

# We could force M as immediate by specializing per moduli
M = init(OperandArray, nimSymbol = M_PIR, N, PointerInReg, Input)
M = asmArray(M_MEM, N, MemOffsettable, asmInput)
# If N is too big, we need to spill registers. TODO.
t = init(OperandArray, nimSymbol = ident"t", N, ElemsInReg, Output_EarlyClobber)
tSym = ident"t"
t = asmArray(tSym, N, ElemsInReg, asmOutputEarlyClobber)
# MultiPurpose Register slots
scratch = init(OperandArray, nimSymbol = ident"scratch", scratchSlots, ElemsInReg, InputOutput_EnsureClobber)
scratchSym = ident"scratch"
scratch = asmArray(scratchSym, scratchSlots, ElemsInReg, asmInputOutputEarlyClobber)

# MUL requires RAX and RDX

m0ninv = Operand(
desc: OperandDesc(
asmId: "[m0ninv]",
nimSymbol: m0ninv_REG,
rm: MemOffsettable,
constraint: Input,
cEmit: "&" & $m0ninv_REG
)
)
m0ninv = asmValue(m0ninv_REG, Mem, asmInput)

# We're really constrained by register and somehow setting as memory doesn't help
# So we store the result `r` in the scratch space and then reload it in RDX
# before the scratchspace is used in final substraction
a = scratch[0].asArrayAddr(len = N) # Store the `a` operand
b = scratch[1].asArrayAddr(len = N) # Store the `b` operand
a = scratch[0].asArrayAddr(a_PIR, len = N, memIndirect = memRead) # Store the `a` operand
b = scratch[1].asArrayAddr(b_PIR, len = N, memIndirect = memRead) # Store the `b` operand
A = scratch[2] # High part of extended precision multiplication
C = scratch[3]
m = scratch[4] # Stores (t[0] * m0ninv) mod 2ʷ
@ -96,12 +85,10 @@ macro mulMont_CIOS_sparebit_gen[N: static int](
# but this prevent reusing the same code for multiple curves like BLS12-377 and BLS12-381
# We might be able to save registers by having `r` and `M` be memory operand as well

let tsym = t.nimSymbol
let scratchSym = scratch.nimSymbol
result.add quote do:
static: doAssert: sizeof(SecretWord) == sizeof(ByteAddress)

var `tsym`{.noInit, used.}: typeof(`r_PIR`)
var `tSym`{.noInit, used.}: typeof(`r_PIR`)
# Assumes 64-bit limbs on 64-bit arch (or you can't store an address)
var `scratchSym` {.noInit.}: Limbs[`scratchSlots`]
`scratchSym`[0] = cast[SecretWord](`a_PIR`[0].unsafeAddr)
@ -172,26 +159,22 @@ macro mulMont_CIOS_sparebit_gen[N: static int](
ctx.mov t[N-1], A

ctx.mov rax, r # move r away from scratchspace that will be used for final substraction
let r2 = rax.asArrayAddr(len = N)
let r2 = rax.asArrayAddr(r_PIR, len = N, memIndirect = memWrite)

if skipFinalSub:
for i in 0 ..< N:
ctx.mov r2[i], t[i]
else:
ctx.finalSubNoOverflowImpl(
r2, t, M,
scratch
)
ctx.finalSubNoOverflowImpl(r2, t, M, scratch)
result.add ctx.generate()

func mulMont_CIOS_sparebit_asm*(r: var Limbs, a, b, M: Limbs, m0ninv: BaseType, skipFinalSub: static bool = false) {.noInline.} =
func mulMont_CIOS_sparebit_asm*(r: var Limbs, a, b, M: Limbs, m0ninv: BaseType, skipFinalSub: static bool = false) =
## Constant-time Montgomery multiplication
## If "skipFinalSub" is set
## the result is in the range [0, 2M)
## otherwise the result is in the range [0, M)
##
## This procedure can only be called if the modulus doesn't use the full bitwidth of its underlying representation
# This MUST be noInline or Clang will run out of registers with LTO
r.mulMont_CIOS_sparebit_gen(a, b, M, m0ninv, skipFinalSub)

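For context on what this kernel computes, here is the standard CIOS formulation, summarized rather than quoted from the patch: with N limbs of w bits, R = 2^(w·N) and m0ninv = −M⁻¹ mod 2ʷ, each of the N outer iterations interleaves one multiplication step and one reduction step,

t ← t + a·bᵢ
m ← t₀·m0ninv mod 2ʷ
t ← (t + m·M) / 2ʷ

so that after N iterations t ≡ a·b·R⁻¹ (mod M) and, thanks to the spare bit in the modulus, t < 2M. That is why the tail can use the no-overflow final subtraction, and why it may be skipped entirely (skipFinalSub) when a later operation reduces the value anyway.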
# Montgomery Squaring
@ -212,7 +195,7 @@ func squareMont_CIOS_asm*[N](

macro sumprodMont_CIOS_spare2bits_gen[N, K: static int](
r_PIR: var Limbs[N], a_PIR, b_PIR: array[K, Limbs[N]],
M_PIR: Limbs[N], m0ninv_REG: BaseType,
M_MEM: Limbs[N], m0ninv_REG: BaseType,
skipFinalSub: static bool): untyped =
## Generate an optimized Montgomery merged sum of products ⅀aᵢ.bᵢ kernel
## using the CIOS method
@ -242,29 +225,23 @@ macro sumprodMont_CIOS_spare2bits_gen[N, K: static int](
scratchSlots = 6

# We could force M as immediate by specializing per moduli
M = init(OperandArray, nimSymbol = M_PIR, N, PointerInReg, Input)
M = asmArray(M_MEM, N, MemOffsettable, asmInput)
# If N is too big, we need to spill registers. TODO.
t = init(OperandArray, nimSymbol = ident"t", N, ElemsInReg, Output_EarlyClobber)
tSym = ident"t"
t = asmArray(tSym, N, ElemsInReg, asmOutputEarlyClobber)
# MultiPurpose Register slots
scratch = init(OperandArray, nimSymbol = ident"scratch", scratchSlots, ElemsInReg, InputOutput_EnsureClobber)
scratchSym = ident"scratch"
scratch = asmArray(scratchSym, scratchSlots, ElemsInReg, asmInputOutputEarlyClobber)

# MUL requires RAX and RDX

m0ninv = Operand(
desc: OperandDesc(
asmId: "[m0ninv]",
nimSymbol: m0ninv_REG,
rm: MemOffsettable,
constraint: Input,
cEmit: "&" & $m0ninv_REG
)
)
m0ninv = asmValue(m0ninv_REG, Mem, asmInput)

# We're really constrained by register and somehow setting as memory doesn't help
# So we store the result `r` in the scratch space and then reload it in RDX
# before the scratchspace is used in final substraction
a = scratch[0].as2dArrayAddr(rows = K, cols = N) # Store the `a` operand
b = scratch[1].as2dArrayAddr(rows = K, cols = N) # Store the `b` operand
a = scratch[0].as2dArrayAddr(a_PIR, rows = K, cols = N, memIndirect = memRead) # Store the `a` operand
b = scratch[1].as2dArrayAddr(b_PIR, rows = K, cols = N, memIndirect = memRead) # Store the `b` operand
tN = scratch[2] # High part of extended precision multiplication
C = scratch[3] # Carry during reduction step
r = scratch[4] # Stores the `r` operand
@ -280,9 +257,6 @@ macro sumprodMont_CIOS_spare2bits_gen[N, K: static int](
# We can save 1 by hardcoding M as immediate (and m0ninv)
# but this prevent reusing the same code for multiple curves like BLS12-377 and BLS12-381
# We might be able to save registers by having `r` and `M` be memory operand as well

let tsym = t.nimSymbol
let scratchSym = scratch.nimSymbol
result.add quote do:
static: doAssert: sizeof(SecretWord) == sizeof(ByteAddress)

@ -377,7 +351,7 @@ macro sumprodMont_CIOS_spare2bits_gen[N, K: static int](

ctx.mov rax, r # move r away from scratchspace that will be used for final substraction
let r2 = rax.asArrayAddr(len = N)
let r2 = rax.asArrayAddr(r_PIR, len = N, memIndirect = memWrite)

if skipFinalSub:
ctx.comment " Copy result"
@ -387,8 +361,7 @@ macro sumprodMont_CIOS_spare2bits_gen[N, K: static int](
ctx.comment " Final substraction"
ctx.finalSubNoOverflowImpl(
r2, t, M,
scratch
)
scratch)
result.add ctx.generate()

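A hedged note on the merged kernel above: instead of K independent Montgomery multiplications followed by modular additions, it computes (⅀ᵢ aᵢ·bᵢ)·R⁻¹ mod M with a single interleaved reduction pass; the two spare bits required of the modulus are what keep the running accumulation of the K partial products from overflowing its N limbs.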
func sumprodMont_CIOS_spare2bits_asm*[N, K: static int](

@ -21,11 +21,6 @@ import
#
# ############################################################

# Note: We can refer to at most 30 registers in inline assembly
# and "InputOutput" registers count double
# They are nice to let the compiler deals with mov
# but too constraining so we move things ourselves.

static: doAssert UseASM_X86_64

# MULX/ADCX/ADOX
@ -176,7 +171,7 @@ proc partialRedx(

macro mulMont_CIOS_sparebit_adx_gen[N: static int](
r_PIR: var Limbs[N], a_PIR, b_PIR,
M_PIR: Limbs[N], m0ninv_REG: BaseType,
M_MEM: Limbs[N], m0ninv_REG: BaseType,
skipFinalSub: static bool): untyped =
## Generate an optimized Montgomery Multiplication kernel
## using the CIOS method
@ -193,18 +188,20 @@ macro mulMont_CIOS_sparebit_adx_gen[N: static int](
let
scratchSlots = 6

r = init(OperandArray, nimSymbol = r_PIR, N, PointerInReg, InputOutput_EnsureClobber)
r = asmArray(r_PIR, N, PointerInReg, asmInputOutputEarlyClobber, memIndirect = memWrite) # MemOffsettable is the better constraint but compilers say it is impossible. Use early clobber to ensure it is not affected by constant propagation at slight pessimization (reloading it). # Changing that to MemOffsetable triggers an error in negmod in test_bindings. Missing clobber?
# We could force M as immediate by specializing per moduli
M = init(OperandArray, nimSymbol = M_PIR, N, PointerInReg, Input)
M = asmArray(M_MEM, N, MemOffsettable, asmInput)
# If N is too big, we need to spill registers. TODO.
t = init(OperandArray, nimSymbol = ident"t", N, ElemsInReg, Output_EarlyClobber)
tSym = ident"t"
t = asmArray(tSym, N, ElemsInReg, asmOutputEarlyClobber)
# MultiPurpose Register slots
scratch = init(OperandArray, nimSymbol = ident"scratch", scratchSlots, ElemsInReg, InputOutput_EnsureClobber)
scratchSym = ident"scratch"
scratch = asmArray(scratchSym, scratchSlots, ElemsInReg, asmInputOutputEarlyClobber)

# MULX requires RDX as well

a = scratch[0].asArrayAddr(len = N) # Store the `a` operand
b = scratch[1].asArrayAddr(len = N) # Store the `b` operand
a = scratch[0].asArrayAddr(a_PIR, len = N, memIndirect = memRead) # Store the `a` operand
b = scratch[1].asArrayAddr(b_PIR, len = N, memIndirect = memRead) # Store the `b` operand
A = scratch[2] # High part of extended precision multiplication
C = scratch[3]
m0ninv = scratch[4] # Modular inverse of M[0]
@ -221,8 +218,6 @@ macro mulMont_CIOS_sparebit_adx_gen[N: static int](
# but this prevent reusing the same code for multiple curves like BLS12-377 and BLS12-381
# We might be able to save registers by having `r` and `M` be memory operand as well

let tsym = t.nimSymbol
let scratchSym = scratch.nimSymbol
result.add quote do:
static: doAssert: sizeof(SecretWord) == sizeof(ByteAddress)

@ -250,21 +245,18 @@ macro mulMont_CIOS_sparebit_adx_gen[N: static int](
A, t,
a,
b[0],
C
)
C)
else:
ctx.mulaccx_by_word(
A, t,
a, i,
b[i],
C
)
C)

ctx.partialRedx(
A, t,
M, m0ninv,
lo, C
)
lo, C)

if skipFinalSub:
for i in 0 ..< N:
@ -272,19 +264,9 @@ macro mulMont_CIOS_sparebit_adx_gen[N: static int](
else:
ctx.finalSubNoOverflowImpl(
r, t, M,
scratch
)
scratch)

result.add ctx.generate

func mulMont_CIOS_sparebit_asm_adx_inline*(r: var Limbs, a, b, M: Limbs, m0ninv: BaseType, skipFinalSub: static bool = false) {.inline.} =
## Constant-time Montgomery multiplication
## If "skipFinalSub" is set
## the result is in the range [0, 2M)
## otherwise the result is in the range [0, M)
##
## This procedure can only be called if the modulus doesn't use the full bitwidth of its underlying representation
r.mulMont_CIOS_sparebit_adx_gen(a, b, M, m0ninv, skipFinalSub)
result.add ctx.generate()

func mulMont_CIOS_sparebit_asm_adx*(r: var Limbs, a, b, M: Limbs, m0ninv: BaseType, skipFinalSub: static bool = false) =
## Constant-time Montgomery multiplication
@ -293,7 +275,7 @@ func mulMont_CIOS_sparebit_asm_adx*(r: var Limbs, a, b, M: Limbs, m0ninv: BaseTy
## otherwise the result is in the range [0, M)
##
## This procedure can only be called if the modulus doesn't use the full bitwidth of its underlying representation
r.mulMont_CIOS_sparebit_asm_adx_inline(a, b, M, m0ninv, skipFinalSub)
r.mulMont_CIOS_sparebit_adx_gen(a, b, M, m0ninv, skipFinalSub)

# Montgomery Squaring
# ------------------------------------------------------------
@ -313,7 +295,7 @@ func squareMont_CIOS_asm_adx*[N](

macro sumprodMont_CIOS_spare2bits_adx_gen[N, K: static int](
r_PIR: var Limbs[N], a_PIR, b_PIR: array[K, Limbs[N]],
M_PIR: Limbs[N], m0ninv_REG: BaseType,
M_MEM: Limbs[N], m0ninv_REG: BaseType,
skipFinalSub: static bool): untyped =
## Generate an optimized Montgomery merged sum of products ⅀aᵢ.bᵢ kernel
## using the CIOS method
@ -343,29 +325,23 @@ macro sumprodMont_CIOS_spare2bits_adx_gen[N, K: static int](
scratchSlots = 6

# We could force M as immediate by specializing per moduli
M = init(OperandArray, nimSymbol = M_PIR, N, PointerInReg, Input)
M = asmArray(M_MEM, N, MemOffsettable, asmInput)
# If N is too big, we need to spill registers. TODO.
t = init(OperandArray, nimSymbol = ident"t", N, ElemsInReg, Output_EarlyClobber)
tSym = ident"t"
t = asmArray(tSym, N, ElemsInReg, asmOutputEarlyClobber)
# MultiPurpose Register slots
scratch = init(OperandArray, nimSymbol = ident"scratch", scratchSlots, ElemsInReg, InputOutput_EnsureClobber)
scratchSym = ident"scratch"
scratch = asmArray(scratchSym, scratchSlots, ElemsInReg, asmInputOutputEarlyClobber)

# MULX requires RDX as well

m0ninv = Operand(
desc: OperandDesc(
asmId: "[m0ninv]",
nimSymbol: m0ninv_REG,
rm: MemOffsettable,
constraint: Input,
cEmit: "&" & $m0ninv_REG
)
)
m0ninv = asmValue(m0ninv_REG, Mem, asmInput)

# We're really constrained by register and somehow setting as memory doesn't help
# So we store the result `r` in the scratch space and then reload it in RDX
# before the scratchspace is used in final substraction
a = scratch[0].as2dArrayAddr(rows = K, cols = N) # Store the `a` operand
b = scratch[1].as2dArrayAddr(rows = K, cols = N) # Store the `b` operand
a = scratch[0].as2dArrayAddr(a_PIR, rows = K, cols = N, memIndirect = memRead) # Store the `a` operand
b = scratch[1].as2dArrayAddr(b_PIR, rows = K, cols = N, memIndirect = memRead) # Store the `b` operand
tN = scratch[2] # High part of extended precision multiplication
C = scratch[3] # Carry during reduction step
r = scratch[4] # Stores the `r` operand
@ -382,8 +358,6 @@ macro sumprodMont_CIOS_spare2bits_adx_gen[N, K: static int](
# but this prevent reusing the same code for multiple curves like BLS12-377 and BLS12-381
# We might be able to save registers by having `r` and `M` be memory operand as well

let tsym = t.nimSymbol
let scratchSym = scratch.nimSymbol
result.add quote do:
static: doAssert: sizeof(SecretWord) == sizeof(ByteAddress)

@ -461,11 +435,10 @@ macro sumprodMont_CIOS_spare2bits_adx_gen[N, K: static int](
ctx.partialRedx(
tN, t,
M, m0ninv,
rax, C
)
rax, C)

ctx.mov rax, r # move r away from scratchspace that will be used for final substraction
let r2 = rax.asArrayAddr(len = N)
let r2 = rax.asArrayAddr(r_PIR, len = N, memIndirect = memWrite)

if skipFinalSub:
ctx.comment " Copy result"
@ -473,10 +446,7 @@ macro sumprodMont_CIOS_spare2bits_adx_gen[N, K: static int](
ctx.mov r2[i], t[i]
else:
ctx.comment " Final substraction"
ctx.finalSubNoOverflowImpl(
r2, t, M,
scratch
)
ctx.finalSubNoOverflowImpl(r2, t, M, scratch)
result.add ctx.generate()

func sumprodMont_CIOS_spare2bits_asm_adx*[N, K: static int](
@ -18,18 +18,13 @@ import
|
||||
#
|
||||
# ############################################################
|
||||
|
||||
# Note: We can refer to at most 30 registers in inline assembly
|
||||
# and "InputOutput" registers count double
|
||||
# They are nice to let the compiler deals with mov
|
||||
# but too constraining so we move things ourselves.
|
||||
|
||||
static: doAssert UseASM_X86_64 # Need 8 registers just for mul
|
||||
# and 32-bit only has 8 max.
|
||||
|
||||
# Multiplication
|
||||
# -----------------------------------------------------------------------------------------------
|
||||
|
||||
macro mul_gen[rLen, aLen, bLen: static int](r: var Limbs[rLen], a: Limbs[aLen], b: Limbs[bLen]) =
|
||||
macro mul_gen[rLen, aLen, bLen: static int](r_PIR: var Limbs[rLen], a_MEM: Limbs[aLen], b_MEM: Limbs[bLen]) =
|
||||
## Comba multiplication generator
|
||||
## `a`, `b`, `r` can have a different number of limbs
|
||||
## if `r`.limbs.len < a.limbs.len + b.limbs.len
|
||||
@ -42,54 +37,29 @@ macro mul_gen[rLen, aLen, bLen: static int](r: var Limbs[rLen], a: Limbs[aLen],
|
||||
|
||||
var ctx = init(Assembler_x86, BaseType)
|
||||
let
|
||||
arrR = init(OperandArray, nimSymbol = r, rLen, PointerInReg, InputOutput_EnsureClobber)
|
||||
arrA = init(OperandArray, nimSymbol = a, aLen, PointerInReg, Input)
|
||||
arrB = init(OperandArray, nimSymbol = b, bLen, PointerInReg, Input)
|
||||
r = asmArray(r_PIR, rLen, PointerInReg, asmInputOutputEarlyClobber, memIndirect = memWrite) # MemOffsettable is the better constraint but compilers say it is impossible. Use early clobber to ensure it is not affected by constant propagation at slight pessimization (reloading it).
|
||||
a = asmArray(a_MEM, aLen, MemOffsettable, asmInput)
|
||||
b = asmArray(b_MEM, aLen, MemOffsettable, asmInput)
|
||||
|
||||
t = Operand(
|
||||
desc: OperandDesc(
|
||||
asmId: "[t]",
|
||||
nimSymbol: ident"t",
|
||||
rm: Reg,
|
||||
constraint: Output_EarlyClobber,
|
||||
cEmit: "t"
|
||||
)
|
||||
)
|
||||
|
||||
u = Operand(
|
||||
desc: OperandDesc(
|
||||
asmId: "[u]",
|
||||
nimSymbol: ident"u",
|
||||
rm: Reg,
|
||||
constraint: Output_EarlyClobber,
|
||||
cEmit: "u"
|
||||
)
|
||||
)
|
||||
|
||||
v = Operand(
|
||||
desc: OperandDesc(
|
||||
asmId: "[v]",
|
||||
nimSymbol: ident"v",
|
||||
rm: Reg,
|
||||
constraint: Output_EarlyClobber,
|
||||
cEmit: "v"
|
||||
)
|
||||
)
|
||||
tSym = ident"t"
|
||||
t = asmValue(tSym, Reg, asmOutputEarlyClobber)
|
||||
uSym = ident"u"
|
||||
u = asmValue(uSym, Reg, asmOutputEarlyClobber)
|
||||
vSym = ident"v"
|
||||
v = asmValue(vSym, Reg, asmOutputEarlyClobber)
|
||||
|
||||
# MUL requires RAX and RDX
|
||||
|
||||
# Prologue
|
||||
let tsym = t.desc.nimSymbol
|
||||
let usym = u.desc.nimSymbol
|
||||
let vsym = v.desc.nimSymbol
|
||||
result.add quote do:
|
||||
var `tsym`{.noInit.}, `usym`{.noInit.}, `vsym`{.noInit.}: BaseType # zero-init
|
||||
var `tSym`{.noInit.}, `uSym`{.noInit.}, `vSym`{.noInit.}: BaseType
|
||||
|
||||
# Algorithm
|
||||
# Zero-init
|
||||
ctx.`xor` u, u
|
||||
ctx.`xor` v, v
|
||||
ctx.`xor` t, t
|
||||
|
||||
# Algorithm
|
||||
let stopEx = min(aLen+bLen, rLen)
|
||||
|
||||
for i in 0 ..< stopEx:
|
||||
@ -100,13 +70,13 @@ macro mul_gen[rLen, aLen, bLen: static int](r: var Limbs[rLen], a: Limbs[aLen],
|
||||
let ia = i - ib
|
||||
for j in 0 ..< min(aLen - ia, ib+1):
|
||||
# (t, u, v) <- (t, u, v) + a[ia+j] * b[ib-j]
|
||||
ctx.mov rax, arrB[ib-j]
|
||||
ctx.mul rdx, rax, arrA[ia+j], rax
|
||||
ctx.mov rax, b[ib-j]
|
||||
ctx.mul rdx, rax, a[ia+j], rax
|
||||
ctx.add v, rax
|
||||
ctx.adc u, rdx
|
||||
ctx.adc t, 0
|
||||
|
||||
ctx.mov arrR[i], v
|
||||
ctx.mov r[i], v
|
||||
|
||||
if i != stopEx - 1:
|
||||
ctx.mov v, u
|
||||
@ -116,10 +86,10 @@ macro mul_gen[rLen, aLen, bLen: static int](r: var Limbs[rLen], a: Limbs[aLen],
|
||||
if aLen+bLen < rLen:
|
||||
ctx.`xor` rax, rax
|
||||
for i in aLen+bLen ..< rLen:
|
||||
ctx.mov arrR[i], rax
|
||||
ctx.mov r[i], rax
|
||||
|
||||
# Codegen
|
||||
result.add ctx.generate
|
||||
result.add ctx.generate()
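For readers tracing the generator above, the emitted assembly implements product-scanning (Comba) multiplication. A plain-Nim sketch of that algorithm follows; it is illustrative only (32-bit limbs, no constant-time guarantees, `combaMul` is a made-up name), not code from this commit.

# Illustrative sketch: column-wise (Comba) multiplication with a 3-word
# accumulator (t, u, v), mirroring "(t, u, v) <- (t, u, v) + a[ia+j] * b[ib-j]".
proc combaMul(r: var openArray[uint32], a, b: openArray[uint32]) =
  var t, u, v = 0'u32
  for i in 0 ..< min(a.len + b.len, r.len):
    let ib = min(b.len - 1, i)
    let ia = i - ib
    for j in 0 ..< min(a.len - ia, ib + 1):
      let p = uint64(a[ia + j]) * uint64(b[ib - j]) # full 32x32 -> 64-bit product
      let lo = uint32(p)
      let hi = uint32(p shr 32)
      let v0 = v
      v += lo                       # like `add v, rax`
      let u0 = u
      u += hi + uint32(v < v0)      # like `adc u, rdx`
      t += uint32(u < u0)           # like `adc t, 0`
    r[i] = v                        # like `mov r[i], v`
    v = u; u = t; t = 0             # rotate the accumulator window
  for i in a.len + b.len ..< r.len: # zero the extra limbs if r is larger
    r[i] = 0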
|
||||
|
||||
func mul_asm*[rLen, aLen, bLen: static int](r: var Limbs[rLen], a: Limbs[aLen], b: Limbs[bLen]) =
|
||||
## Multi-precision Multiplication
|
||||
@ -129,7 +99,7 @@ func mul_asm*[rLen, aLen, bLen: static int](r: var Limbs[rLen], a: Limbs[aLen],
|
||||
# Squaring
|
||||
# -----------------------------------------------------------------------------------------------
|
||||
|
||||
macro sqr_gen*[rLen, aLen: static int](r: var Limbs[rLen], a: Limbs[aLen]) =
|
||||
macro sqr_gen*[rLen, aLen: static int](r_PIR: var Limbs[rLen], a_MEM: Limbs[aLen]) =
|
||||
## Comba squaring generator
|
||||
## `a` and `r` can have a different number of limbs
|
||||
## if `r`.limbs.len < a.limbs.len * 2
|
||||
@ -142,51 +112,26 @@ macro sqr_gen*[rLen, aLen: static int](r: var Limbs[rLen], a: Limbs[aLen]) =
|
||||
|
||||
var ctx = init(Assembler_x86, BaseType)
|
||||
let
|
||||
arrR = init(OperandArray, nimSymbol = r, rLen, PointerInReg, InputOutput_EnsureClobber)
|
||||
arrA = init(OperandArray, nimSymbol = a, aLen, PointerInReg, Input)
|
||||
r = asmArray(r_PIR, rLen, PointerInReg, asmInputOutputEarlyClobber, memIndirect = memWrite) # MemOffsettable is the better constraint but compilers say it is impossible. Use early clobber to ensure it is not affected by constant propagation at slight pessimization (reloading it).
|
||||
a = asmArray(a_MEM, aLen, MemOffsettable, asmInput)
|
||||
|
||||
t = Operand(
|
||||
desc: OperandDesc(
|
||||
asmId: "[t]",
|
||||
nimSymbol: ident"t",
|
||||
rm: Reg,
|
||||
constraint: Output_EarlyClobber,
|
||||
cEmit: "t"
|
||||
)
|
||||
)
|
||||
|
||||
u = Operand(
|
||||
desc: OperandDesc(
|
||||
asmId: "[u]",
|
||||
nimSymbol: ident"u",
|
||||
rm: Reg,
|
||||
constraint: Output_EarlyClobber,
|
||||
cEmit: "u"
|
||||
)
|
||||
)
|
||||
|
||||
v = Operand(
|
||||
desc: OperandDesc(
|
||||
asmId: "[v]",
|
||||
nimSymbol: ident"v",
|
||||
rm: Reg,
|
||||
constraint: Output_EarlyClobber,
|
||||
cEmit: "v"
|
||||
)
|
||||
)
|
||||
tSym = ident"t"
|
||||
t = asmValue(tSym, Reg, asmOutputEarlyClobber)
|
||||
uSym = ident"u"
|
||||
u = asmValue(uSym, Reg, asmOutputEarlyClobber)
|
||||
vSym = ident"v"
|
||||
v = asmValue(vSym, Reg, asmOutputEarlyClobber)
|
||||
|
||||
# Prologue
|
||||
let tsym = t.desc.nimSymbol
|
||||
let usym = u.desc.nimSymbol
|
||||
let vsym = v.desc.nimSymbol
|
||||
result.add quote do:
|
||||
var `tsym`{.noInit.}, `usym`{.noInit.}, `vsym`{.noInit.}: BaseType # zero-init
|
||||
var `tSym`{.noInit.}, `uSym`{.noInit.}, `vSym`{.noInit.}: BaseType
|
||||
|
||||
# Algorithm
|
||||
# Zero-init
|
||||
ctx.`xor` u, u
|
||||
ctx.`xor` v, v
|
||||
ctx.`xor` t, t
|
||||
|
||||
# Algorithm
|
||||
let stopEx = min(aLen*2, rLen)
|
||||
|
||||
for i in 0 ..< stopEx:
|
||||
@ -200,8 +145,8 @@ macro sqr_gen*[rLen, aLen: static int](r: var Limbs[rLen], a: Limbs[aLen]) =
|
||||
let k2 = ib-j
|
||||
if k1 < k2:
|
||||
# (t, u, v) <- (t, u, v) + 2 * a[k1] * a[k2]
|
||||
ctx.mov rax, arrA[k2]
|
||||
ctx.mul rdx, rax, arrA[k1], rax
|
||||
ctx.mov rax, a[k2]
|
||||
ctx.mul rdx, rax, a[k1], rax
|
||||
ctx.add rax, rax
|
||||
ctx.adc rdx, rdx
|
||||
ctx.adc t, 0
|
||||
@ -210,15 +155,15 @@ macro sqr_gen*[rLen, aLen: static int](r: var Limbs[rLen], a: Limbs[aLen]) =
|
||||
ctx.adc t, 0
|
||||
elif k1 == k2:
|
||||
# (t, u, v) <- (t, u, v) + a[k1] * a[k2]
|
||||
ctx.mov rax, arrA[k2]
|
||||
ctx.mul rdx, rax, arrA[k1], rax
|
||||
ctx.mov rax, a[k2]
|
||||
ctx.mul rdx, rax, a[k1], rax
|
||||
ctx.add v, rax
|
||||
ctx.adc u, rdx
|
||||
ctx.adc t, 0
|
||||
else:
|
||||
discard
|
||||
|
||||
ctx.mov arrR[i], v
|
||||
ctx.mov r[i], v
|
||||
|
||||
if i != stopEx - 1:
|
||||
ctx.mov v, u
|
||||
@ -228,10 +173,10 @@ macro sqr_gen*[rLen, aLen: static int](r: var Limbs[rLen], a: Limbs[aLen]) =
|
||||
if aLen*2 < rLen:
|
||||
ctx.`xor` rax, rax
|
||||
for i in aLen*2 ..< rLen:
|
||||
ctx.mov arrR[i], rax
|
||||
ctx.mov r[i], rax
|
||||
|
||||
# Codegen
|
||||
result.add ctx.generate
|
||||
result.add ctx.generate()
|
||||
|
||||
func square_asm*[rLen, aLen: static int](r: var Limbs[rLen], a: Limbs[aLen]) =
|
||||
## Multi-precision Squaring
|
||||
|
||||
@ -18,11 +18,6 @@ import
|
||||
#
|
||||
# ############################################################
|
||||
|
||||
# Note: We can refer to at most 30 registers in inline assembly
# and "InputOutput" registers count double
# They are nice to let the compiler deal with mov
# but too constraining so we move things ourselves.
|
||||
|
||||
static: doAssert UseASM_X86_64
|
||||
|
||||
# MULX/ADCX/ADOX
|
||||
@ -108,7 +103,7 @@ proc mulaccx_by_word(
|
||||
ctx.adcx hi, rdx
|
||||
ctx.adox hi, rdx
|
||||
|
||||
macro mulx_gen[rLen, aLen, bLen: static int](r_PIR: var Limbs[rLen], a_PIR: Limbs[aLen], b_PIR: Limbs[bLen]) =
|
||||
macro mulx_gen[rLen, aLen, bLen: static int](r_PIR: var Limbs[rLen], a_MEM: Limbs[aLen], b_MEM: Limbs[bLen]) =
|
||||
## `a`, `b`, `r` can have a different number of limbs
|
||||
## if `r`.limbs.len < a.limbs.len + b.limbs.len
|
||||
## The result will be truncated, i.e. it will be
|
||||
@ -120,35 +115,33 @@ macro mulx_gen[rLen, aLen, bLen: static int](r_PIR: var Limbs[rLen], a_PIR: Limb
|
||||
|
||||
var ctx = init(Assembler_x86, BaseType)
|
||||
let
|
||||
r = init(OperandArray, nimSymbol = r_PIR, rLen, PointerInReg, InputOutput_EnsureClobber)
|
||||
a = init(OperandArray, nimSymbol = a_PIR, aLen, PointerInReg, Input)
|
||||
b = init(OperandArray, nimSymbol = b_PIR, bLen, PointerInReg, Input)
|
||||
r = asmArray(r_PIR, rLen, PointerInReg, asmInputOutputEarlyClobber, memIndirect = memWrite) # MemOffsettable is the better constraint but compilers say it is impossible. Use early clobber to ensure it is not affected by constant propagation at slight pessimization (reloading it).
|
||||
a = asmArray(a_MEM, aLen, MemOffsettable, asmInput)
|
||||
b = asmArray(b_MEM, bLen, MemOffsettable, asmInput)
|
||||
|
||||
# MULX requires RDX
|
||||
|
||||
tSym = ident"t"
|
||||
tSlots = aLen+1 # Extra for high word
|
||||
|
||||
var # If aLen is too big, we need to spill registers. TODO.
|
||||
t = init(OperandArray, nimSymbol = ident"t", tSlots, ElemsInReg, Output_EarlyClobber)
|
||||
t = asmArray(tSym, tSlots, ElemsInReg, asmOutputEarlyClobber)
|
||||
|
||||
# Prologue
|
||||
let tsym = t.nimSymbol
|
||||
result.add quote do:
|
||||
var `tsym`{.noInit, used.}: array[`tSlots`, BaseType]
|
||||
var `tSym`{.noInit, used.}: array[`tSlots`, BaseType]
|
||||
|
||||
for i in 0 ..< min(rLen, bLen):
|
||||
if i == 0:
|
||||
ctx.mulx_by_word(
|
||||
r[0],
|
||||
a, t,
|
||||
b[0]
|
||||
)
|
||||
b[0])
|
||||
else:
|
||||
ctx.mulaccx_by_word(
|
||||
r, i,
|
||||
a, t,
|
||||
b[i]
|
||||
)
|
||||
b[i])
|
||||
|
||||
t.rotateLeft()
|
||||
|
||||
@ -163,20 +156,13 @@ macro mulx_gen[rLen, aLen, bLen: static int](r_PIR: var Limbs[rLen], a_PIR: Limb
|
||||
ctx.mov r[i], rax
|
||||
|
||||
# Codegen
|
||||
result.add ctx.generate
|
||||
|
||||
func mul_asm_adx_inline*[rLen, aLen, bLen: static int](
|
||||
r: var Limbs[rLen], a: Limbs[aLen], b: Limbs[bLen]) {.inline.} =
|
||||
## Multi-precision Multiplication
|
||||
## Assumes r doesn't alias a or b
|
||||
## Inline version
|
||||
mulx_gen(r, a, b)
|
||||
result.add ctx.generate()
|
||||
|
||||
func mul_asm_adx*[rLen, aLen, bLen: static int](
|
||||
r: var Limbs[rLen], a: Limbs[aLen], b: Limbs[bLen]) =
|
||||
## Multi-precision Multiplication
|
||||
## Assumes r doesn't alias a or b
|
||||
mul_asm_adx_inline(r, a, b)
|
||||
mulx_gen(r, a, b)
|
||||
|
||||
# Squaring
|
||||
# -----------------------------------------------------------------------------------------------
|
||||
@ -558,7 +544,7 @@ func sqrx_gen6L(ctx: var Assembler_x86, r, a: OperandArray, t: var OperandArray)
|
||||
merge_diag_and_partsum(r, a, hi1, lo1, zero, 4)
|
||||
merge_diag_and_partsum(r, a, hi2, lo2, zero, 5)
|
||||
|
||||
macro sqrx_gen*[rLen, aLen: static int](r_PIR: var Limbs[rLen], a_PIR: Limbs[aLen]) =
|
||||
macro sqrx_gen*[rLen, aLen: static int](r_PIR: var Limbs[rLen], a_MEM: Limbs[aLen]) =
|
||||
## Squaring
|
||||
## `a` and `r` can have a different number of limbs
|
||||
## if `r`.limbs.len < a.limbs.len * 2
|
||||
@ -575,21 +561,20 @@ macro sqrx_gen*[rLen, aLen: static int](r_PIR: var Limbs[rLen], a_PIR: Limbs[aLe
|
||||
# t = 2 * a.len = 12
|
||||
# We use the full x86 register set.
|
||||
|
||||
r = init(OperandArray, nimSymbol = r_PIR, rLen, PointerInReg, InputOutput)
|
||||
a = init(OperandArray, nimSymbol = a_PIR, aLen, PointerInReg, Input)
|
||||
r = asmArray(r_PIR, rLen, PointerInReg, asmInputOutputEarlyClobber, memIndirect = memWrite) # MemOffsettable is the better constraint but compilers say it is impossible. Use early clobber to ensure it is not affected by constant propagation at slight pessimization (reloading it).
|
||||
a = asmArray(a_MEM, aLen, MemOffsettable, asmInput)
|
||||
|
||||
# MULX requires RDX
|
||||
|
||||
tSym = ident"t"
|
||||
tSlots = aLen+1 # Extra for high word
|
||||
|
||||
var # If aLen is too big, we need to spill registers. TODO.
|
||||
t = init(OperandArray, nimSymbol = ident"t", tSlots, ElemsInReg, Output_EarlyClobber)
|
||||
t = asmArray(tSym, tSlots, ElemsInReg, asmOutputEarlyClobber)
|
||||
|
||||
# Prologue
|
||||
# -------------------------------
|
||||
let tsym = t.nimSymbol
|
||||
result.add quote do:
|
||||
var `tsym`{.noInit, used.}: array[`tSlots`, BaseType]
|
||||
var `tSym`{.noInit, used.}: array[`tSlots`, BaseType]
|
||||
|
||||
if aLen == 4:
|
||||
ctx.sqrx_gen4L(r, a, t)
|
||||
@ -599,7 +584,7 @@ macro sqrx_gen*[rLen, aLen: static int](r_PIR: var Limbs[rLen], a_PIR: Limbs[aLe
|
||||
error: "Not implemented"
|
||||
|
||||
# Codegen
|
||||
result.add ctx.generate
|
||||
result.add ctx.generate()
|
||||
|
||||
func square_asm_adx*[rLen, aLen: static int](r: var Limbs[rLen], a: Limbs[aLen]) =
|
||||
## Multi-precision Squaring
|
||||
|
||||
@ -31,7 +31,7 @@ static: doAssert UseASM_X86_32
|
||||
macro redc2xMont_gen*[N: static int](
|
||||
r_PIR: var array[N, SecretWord],
|
||||
a_PIR: array[N*2, SecretWord],
|
||||
M_PIR: array[N, SecretWord],
|
||||
M_MEM: array[N, SecretWord],
|
||||
m0ninv_REG: BaseType,
|
||||
spareBits: static int, skipFinalSub: static bool) =
|
||||
# No register spilling handling
|
||||
@ -46,28 +46,27 @@ macro redc2xMont_gen*[N: static int](
|
||||
# so we store everything in scratchspaces restoring as needed
|
||||
let
|
||||
# We could force M as immediate by specializing per moduli
|
||||
M = init(OperandArray, nimSymbol = M_PIR, N, PointerInReg, Input)
|
||||
M = asmArray(M_MEM, N, MemOffsettable, asmInput)
|
||||
# MUL requires RAX and RDX
|
||||
|
||||
let uSlots = N+2
|
||||
let vSlots = max(N-2, 3)
|
||||
|
||||
let uSym = ident"u"
|
||||
let vSym = ident"v"
|
||||
var # Scratchspaces
|
||||
u = init(OperandArray, nimSymbol = ident"U", uSlots, ElemsInReg, InputOutput_EnsureClobber)
|
||||
v = init(OperandArray, nimSymbol = ident"V", vSlots, ElemsInReg, InputOutput_EnsureClobber)
|
||||
u = asmArray(uSym, uSlots, ElemsInReg, asmInputOutputEarlyClobber)
|
||||
v = asmArray(vSym, vSlots, ElemsInReg, asmInputOutputEarlyClobber)
|
||||
|
||||
# Prologue
|
||||
let usym = u.nimSymbol
|
||||
let vsym = v.nimSymbol
|
||||
result.add quote do:
|
||||
var `usym`{.noinit, used.}: Limbs[`uSlots`]
|
||||
var `vsym` {.noInit.}: Limbs[`vSlots`]
|
||||
`vsym`[0] = cast[SecretWord](`r_PIR`[0].unsafeAddr)
|
||||
`vsym`[1] = cast[SecretWord](`a_PIR`[0].unsafeAddr)
|
||||
`vsym`[2] = SecretWord(`m0ninv_REG`)
|
||||
var `uSym`{.noinit, used.}: Limbs[`uSlots`]
|
||||
var `vSym` {.noInit.}: Limbs[`vSlots`]
|
||||
`vSym`[0] = cast[SecretWord](`r_PIR`[0].unsafeAddr)
|
||||
`vSym`[1] = cast[SecretWord](`a_PIR`[0].unsafeAddr)
|
||||
`vSym`[2] = SecretWord(`m0ninv_REG`)
|
||||
|
||||
let r_temp = v[0].asArrayAddr(len = N)
|
||||
let a = v[1].asArrayAddr(len = 2*N)
|
||||
let r_temp = v[0].asArrayAddr(r_PIR, len = N, memIndirect = memWrite)
|
||||
let a = v[1].asArrayAddr(a_PIR, len = 2*N, memIndirect = memRead)
|
||||
let m0ninv = v[2]
|
||||
|
||||
# Algorithm
|
||||
@ -137,7 +136,7 @@ macro redc2xMont_gen*[N: static int](
|
||||
|
||||
if not(spareBits >= 2 and skipFinalSub):
|
||||
ctx.mov rdx, r_temp
|
||||
let r = rdx.asArrayAddr(len = N)
|
||||
let r = rdx.asArrayAddr(r_PIR, len = N, memIndirect = memWrite)
|
||||
|
||||
# This does a[i+n] += hi
|
||||
# but in a separate carry chain, fused with the
|
||||
@ -157,7 +156,7 @@ macro redc2xMont_gen*[N: static int](
|
||||
elif spareBits >= 1:
|
||||
ctx.finalSubNoOverflowImpl(r, u, M, t)
|
||||
else:
|
||||
ctx.finalSubMayOverflowImpl(r, u, M, t, rax)
|
||||
ctx.finalSubMayOverflowImpl(r, u, M, t)
|
||||
|
||||
# Code generation
|
||||
result.add ctx.generate()
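As a reference for what the kernel above computes, here is a plain-Nim sketch of word-by-word Montgomery reduction of a 2N-limb value. It is illustrative only: 32-bit limbs, no constant-time guarantees, and the final conditional subtraction (the `finalSub*` calls) is left out; `redc2xSketch` is a made-up name for the example.

# Illustrative sketch: reduce a (2N limbs) modulo M (N limbs), m0ninv = -M[0]^-1 mod 2^32.
proc redc2xSketch(a, M: openArray[uint32], m0ninv: uint32): seq[uint32] =
  let N = M.len
  var t = newSeq[uint32](2*N + 1)
  for i in 0 ..< 2*N: t[i] = a[i]
  for i in 0 ..< N:
    let m = t[i] * m0ninv                    # wraps mod 2^32, makes the low limb vanish below
    var carry = 0'u64
    for j in 0 ..< N:
      let s = uint64(t[i+j]) + uint64(m) * uint64(M[j]) + carry
      t[i+j] = uint32(s)
      carry = s shr 32
    var k = i + N                            # propagate the carry into the upper half
    while carry != 0 and k < t.len:
      let s = uint64(t[k]) + carry
      t[k] = uint32(s)
      carry = s shr 32
      inc k
  result = t[N ..< 2*N]                      # in [0, 2M), may still need one final subtraction of M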
|
||||
@ -168,9 +167,8 @@ func redcMont_asm*[N: static int](
|
||||
M: array[N, SecretWord],
|
||||
m0ninv: BaseType,
|
||||
spareBits: static int,
|
||||
skipFinalSub: static bool) {.noInline.} =
|
||||
skipFinalSub: static bool) =
|
||||
## Constant-time Montgomery reduction
|
||||
# This MUST be noInline or Clang will run out of registers with LTO
|
||||
static: doAssert UseASM_X86_64, "This requires x86-64."
|
||||
redc2xMont_gen(r, a, M, m0ninv, spareBits, skipFinalSub)
|
||||
|
||||
@ -179,7 +177,7 @@ func redcMont_asm*[N: static int](
|
||||
|
||||
macro mulMont_by_1_gen[N: static int](
|
||||
t_EIR: var array[N, SecretWord],
|
||||
M_PIR: array[N, SecretWord],
|
||||
M_MEM: array[N, SecretWord],
|
||||
m0ninv_REG: BaseType) =
|
||||
|
||||
# No register spilling handling
|
||||
@ -192,34 +190,22 @@ macro mulMont_by_1_gen[N: static int](
|
||||
# RAX and RDX are de facto used due to the MUL instructions
|
||||
# so we store everything in scratchspaces restoring as needed
|
||||
let
|
||||
scratchSlots = 2
|
||||
|
||||
t = init(OperandArray, nimSymbol = t_EIR, N, ElemsInReg, InputOutput_EnsureClobber)
|
||||
t = asmArray(t_EIR, N, ElemsInReg, asmInputOutputEarlyClobber)
|
||||
# We could force M as immediate by specializing per moduli
|
||||
M = init(OperandArray, nimSymbol = M_PIR, N, PointerInReg, Input)
|
||||
# MultiPurpose Register slots
|
||||
scratch = init(OperandArray, nimSymbol = ident"scratch", scratchSlots, ElemsInReg, InputOutput_EnsureClobber)
|
||||
M = asmArray(M_MEM, N, MemOffsettable, asmInput)
|
||||
|
||||
# MUL requires RAX and RDX
|
||||
|
||||
m0ninv = Operand(
|
||||
desc: OperandDesc(
|
||||
asmId: "[m0ninv]",
|
||||
nimSymbol: m0ninv_REG,
|
||||
rm: MemOffsettable,
|
||||
constraint: Input,
|
||||
cEmit: "&" & $m0ninv_REG
|
||||
)
|
||||
)
|
||||
|
||||
C = scratch[0] # Stores the high-part of multiplication
|
||||
m = scratch[1] # Stores (t[0] * m0ninv) mod 2ʷ
|
||||
|
||||
let scratchSym = scratch.nimSymbol
|
||||
m0ninv = asmValue(m0ninv_REG, Mem, asmInput)
|
||||
Csym = ident"C"
|
||||
C = asmValue(Csym, Reg, asmOutputEarlyClobber) # Stores the high-part of multiplication
|
||||
mSym = ident"m"
|
||||
m = asmValue(msym, Reg, asmOutputEarlyClobber) # Stores (t[0] * m0ninv) mod 2ʷ
|
||||
|
||||
# Copy a in t
|
||||
result.add quote do:
|
||||
var `scratchSym` {.noInit, used.}: Limbs[`scratchSlots`]
|
||||
var `Csym` {.noInit, used.}: BaseType
|
||||
var `mSym` {.noInit, used.}: BaseType
|
||||
|
||||
# Algorithm
|
||||
# ---------------------------------------------------------
|
||||
|
||||
@ -35,7 +35,7 @@ static: doAssert UseASM_X86_64
|
||||
macro redc2xMont_adx_gen[N: static int](
|
||||
r_PIR: var array[N, SecretWord],
|
||||
a_PIR: array[N*2, SecretWord],
|
||||
M_PIR: array[N, SecretWord],
|
||||
M_MEM: array[N, SecretWord],
|
||||
m0ninv_REG: BaseType,
|
||||
spareBits: static int, skipFinalSub: static bool) =
|
||||
|
||||
@ -45,30 +45,28 @@ macro redc2xMont_adx_gen[N: static int](
|
||||
result = newStmtList()
|
||||
|
||||
var ctx = init(Assembler_x86, BaseType)
|
||||
let
|
||||
# We could force M as immediate by specializing per moduli
|
||||
M = init(OperandArray, nimSymbol = M_PIR, N, PointerInReg, Input)
|
||||
let M = asmArray(M_MEM, N, MemOffsettable, asmInput)
|
||||
|
||||
let uSlots = N+1
|
||||
let vSlots = max(N-1, 5)
|
||||
let uSym = ident"u"
|
||||
let vSym = ident"v"
|
||||
|
||||
var # Scratchspaces
|
||||
u = init(OperandArray, nimSymbol = ident"U", uSlots, ElemsInReg, InputOutput_EnsureClobber)
|
||||
v = init(OperandArray, nimSymbol = ident"V", vSlots, ElemsInReg, InputOutput_EnsureClobber)
|
||||
u = asmArray(uSym, uSlots, ElemsInReg, asmInputOutputEarlyClobber)
|
||||
v = asmArray(vSym, vSlots, ElemsInReg, asmInputOutputEarlyClobber)
|
||||
|
||||
# Prologue
|
||||
let usym = u.nimSymbol
|
||||
let vsym = v.nimSymbol
|
||||
result.add quote do:
|
||||
static: doAssert: sizeof(SecretWord) == sizeof(ByteAddress)
|
||||
var `usym`{.noinit, used.}: Limbs[`uSlots`]
|
||||
var `vsym` {.noInit.}: Limbs[`vSlots`]
|
||||
`vsym`[0] = cast[SecretWord](`r_PIR`[0].unsafeAddr)
|
||||
`vsym`[1] = cast[SecretWord](`a_PIR`[0].unsafeAddr)
|
||||
`vsym`[2] = SecretWord(`m0ninv_REG`)
|
||||
var `uSym`{.noinit, used.}: Limbs[`uSlots`]
|
||||
var `vSym` {.noInit.}: Limbs[`vSlots`]
|
||||
`vSym`[0] = cast[SecretWord](`r_PIR`[0].unsafeAddr)
|
||||
`vSym`[1] = cast[SecretWord](`a_PIR`[0].unsafeAddr)
|
||||
`vSym`[2] = SecretWord(`m0ninv_REG`)
|
||||
|
||||
let r_temp = v[0].asArrayAddr(len = N)
|
||||
let a = v[1].asArrayAddr(len = 2*N)
|
||||
let r_temp = v[0]
|
||||
let a = v[1].asArrayAddr(a_PIR, len = 2*N, memIndirect = memRead)
|
||||
let m0ninv = v[2]
|
||||
let lo = v[3]
|
||||
let hi = v[4]
|
||||
@ -116,7 +114,7 @@ macro redc2xMont_adx_gen[N: static int](
|
||||
u.rotateLeft()
|
||||
|
||||
ctx.mov rdx, r_temp
|
||||
let r = rdx.asArrayAddr(len = N)
|
||||
let r = rdx.asArrayAddr(r_PIR, len = N, memIndirect = memWrite)
|
||||
|
||||
# This does a[i+n] += hi
|
||||
# but in a separate carry chain, fused with the
|
||||
@ -135,7 +133,7 @@ macro redc2xMont_adx_gen[N: static int](
|
||||
elif spareBits >= 1:
|
||||
ctx.finalSubNoOverflowImpl(r, u, M, t)
|
||||
else:
|
||||
ctx.finalSubMayOverflowImpl(r, u, M, t, hi)
|
||||
ctx.finalSubMayOverflowImpl(r, u, M, t)
|
||||
|
||||
# Code generation
|
||||
result.add ctx.generate()
|
||||
@ -146,7 +144,7 @@ func redcMont_asm_adx*[N: static int](
|
||||
M: array[N, SecretWord],
|
||||
m0ninv: BaseType,
|
||||
spareBits: static int,
|
||||
skipFinalSub: static bool = false) {.noInline.} =
|
||||
skipFinalSub: static bool = false) =
|
||||
## Constant-time Montgomery reduction
|
||||
# Inlining redcMont_asm_adx twice in mul_fp2_complex_asm_adx
|
||||
# causes GCC to miscompile with -Os (--opt:size)
|
||||
@ -158,7 +156,7 @@ func redcMont_asm_adx*[N: static int](
|
||||
|
||||
macro mulMont_by_1_adx_gen[N: static int](
|
||||
t_EIR: var array[N, SecretWord],
|
||||
M_PIR: array[N, SecretWord],
|
||||
M_MEM: array[N, SecretWord],
|
||||
m0ninv_REG: BaseType) =
|
||||
|
||||
# No register spilling handling
|
||||
@ -171,33 +169,20 @@ macro mulMont_by_1_adx_gen[N: static int](
|
||||
# RAX and RDX are de facto used due to the MUL instructions
|
||||
# so we store everything in scratchspaces restoring as needed
|
||||
let
|
||||
scratchSlots = 1
|
||||
|
||||
t = init(OperandArray, nimSymbol = t_EIR, N, ElemsInReg, InputOutput_EnsureClobber)
|
||||
t = asmArray(t_EIR, N, ElemsInReg, asmInputOutputEarlyClobber)
|
||||
# We could force M as immediate by specializing per moduli
|
||||
M = init(OperandArray, nimSymbol = M_PIR, N, PointerInReg, Input)
|
||||
# MultiPurpose Register slots
|
||||
scratch = init(OperandArray, nimSymbol = ident"scratch", scratchSlots, ElemsInReg, InputOutput_EnsureClobber)
|
||||
M = asmArray(M_MEM, N, MemOffsettable, asmInput)
|
||||
|
||||
# MUL requires RAX and RDX
|
||||
|
||||
m0ninv = Operand(
|
||||
desc: OperandDesc(
|
||||
asmId: "[m0ninv]",
|
||||
nimSymbol: m0ninv_REG,
|
||||
rm: MemOffsettable,
|
||||
constraint: Input,
|
||||
cEmit: "&" & $m0ninv_REG
|
||||
)
|
||||
)
|
||||
m0ninv = asmValue(m0ninv_REG, Mem, asmInput)
|
||||
|
||||
C = scratch[0] # Stores the high-part of multiplication
|
||||
|
||||
let scratchSym = scratch.nimSymbol
|
||||
Csym = ident"C"
|
||||
C = asmValue(Csym, Reg, asmOutputEarlyClobber) # Stores the high-part of multiplication
|
||||
|
||||
# Copy a in t
|
||||
result.add quote do:
|
||||
var `scratchSym` {.noInit, used.}: Limbs[`scratchSlots`]
|
||||
var `Csym` {.noInit, used.}: BaseType
|
||||
|
||||
# Algorithm
|
||||
# ---------------------------------------------------------
|
||||
|
||||
@ -18,74 +18,45 @@ import
|
||||
#
|
||||
# ############################################################
|
||||
|
||||
# Note: We can refer to at most 30 registers in inline assembly
# and "InputOutput" registers count double
# They are nice to let the compiler deal with mov
# but too constraining so we move things ourselves.
|
||||
|
||||
static: doAssert UseASM_X86_32
|
||||
|
||||
# Copy
|
||||
# ------------------------------------------------------------
|
||||
macro ccopy_gen[N: static int](a: var Limbs[N], b: Limbs[N], ctl: SecretBool): untyped =
|
||||
macro ccopy_gen[N: static int](a_PIR: var Limbs[N], b_MEM: Limbs[N], ctl: SecretBool): untyped =
|
||||
## Generate an optimized conditional copy kernel
|
||||
result = newStmtList()
|
||||
|
||||
var ctx = init(Assembler_x86, BaseType)
|
||||
|
||||
let
|
||||
arrA = init(OperandArray, nimSymbol = a, N, PointerInReg, InputOutput)
|
||||
arrB = init(OperandArray, nimSymbol = b, N, PointerInReg, Input)
|
||||
a = asmArray(a_PIR, N, PointerInReg, asmInputOutputEarlyClobber, memIndirect = memReadWrite) # MemOffsettable is the better constraint but compilers say it is impossible. Use early clobber to ensure it is not affected by constant propagation at slight pessimization (reloading it).
|
||||
b = asmArray(b_MEM, N, MemOffsettable, asmInput)
|
||||
|
||||
control = Operand(
|
||||
desc: OperandDesc(
|
||||
asmId: "[ctl]",
|
||||
nimSymbol: ctl,
|
||||
rm: Reg,
|
||||
constraint: Input,
|
||||
cEmit: "ctl"
|
||||
)
|
||||
)
|
||||
control = asmValue(ctl, Reg, asmInput)
|
||||
|
||||
t0Sym = ident"t0"
|
||||
t1Sym = ident"t1"
|
||||
|
||||
var # Swappable registers to break dependency chains
|
||||
t0 = Operand(
|
||||
desc: OperandDesc(
|
||||
asmId: "[t0]",
|
||||
nimSymbol: ident"t0",
|
||||
rm: Reg,
|
||||
constraint: Output_EarlyClobber,
|
||||
cEmit: "t0"
|
||||
)
|
||||
)
|
||||
|
||||
t1 = Operand(
|
||||
desc: OperandDesc(
|
||||
asmId: "[t1]",
|
||||
nimSymbol: ident"t1",
|
||||
rm: Reg,
|
||||
constraint: Output_EarlyClobber,
|
||||
cEmit: "t1"
|
||||
)
|
||||
)
|
||||
t0 = asmValue(t0Sym, Reg, asmOutputEarlyClobber)
|
||||
t1 = asmValue(t1Sym, Reg, asmOutputEarlyClobber)
|
||||
|
||||
# Prologue
|
||||
let t0sym = t0.desc.nimSymbol
|
||||
let t1sym = t1.desc.nimSymbol
|
||||
result.add quote do:
|
||||
var `t0sym`{.noinit.}, `t1sym`{.noinit.}: BaseType
|
||||
|
||||
# Algorithm
|
||||
ctx.test control, control
|
||||
for i in 0 ..< N:
|
||||
ctx.mov t0, arrA[i]
|
||||
ctx.cmovnz t0, arrB[i]
|
||||
ctx.mov arrA[i], t0
|
||||
ctx.mov t0, a[i]
|
||||
ctx.cmovnz t0, b[i]
|
||||
ctx.mov a[i], t0
|
||||
swap(t0, t1)
|
||||
|
||||
# Codegen
|
||||
result.add ctx.generate()
|
||||
|
||||
func ccopy_asm*(a: var Limbs, b: Limbs, ctl: SecretBool) {.inline.}=
|
||||
func ccopy_asm*(a: var Limbs, b: Limbs, ctl: SecretBool) =
|
||||
## Constant-time conditional copy
|
||||
## If ctl is true: b is copied into a
|
||||
## if ctl is false: b is not copied and a is untouched
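A portable way to picture what the generated test/cmovnz sequence does is a mask-based select. The sketch below is illustrative only, not the code path Constantine uses, and the compiler is not obliged to keep it branchless.

# Branchless conditional copy: overwrite a with b when ctl is set.
proc ccopySketch(a: var openArray[uint64], b: openArray[uint64], ctl: bool) =
  let mask = 0'u64 - uint64(ctl)      # all-ones if ctl, all-zeros otherwise
  for i in 0 ..< a.len:
    a[i] = (a[i] and not mask) or (b[i] and mask)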
|
||||
@ -95,121 +66,89 @@ func ccopy_asm*(a: var Limbs, b: Limbs, ctl: SecretBool) {.inline.}=
|
||||
# Addition
|
||||
# ------------------------------------------------------------
|
||||
|
||||
macro add_gen[N: static int](carry: var Carry, r: var Limbs[N], a, b: Limbs[N]): untyped =
|
||||
macro add_gen[N: static int](carry: var Carry, r_PIR: var Limbs[N], a_MEM, b_MEM: Limbs[N]): untyped =
|
||||
## Generate an optimized out-of-place addition kernel
|
||||
|
||||
result = newStmtList()
|
||||
|
||||
var ctx = init(Assembler_x86, BaseType)
|
||||
let
|
||||
arrR = init(OperandArray, nimSymbol = r, N, PointerInReg, InputOutput)
|
||||
arrA = init(OperandArray, nimSymbol = a, N, PointerInReg, Input)
|
||||
arrB = init(OperandArray, nimSymbol = b, N, PointerInReg, Input)
|
||||
r = asmArray(r_PIR, N, PointerInReg, asmInputOutputEarlyClobber, memIndirect = memWrite) # MemOffsettable is the better constraint but compilers say it is impossible. Use early clobber to ensure it is not affected by constant propagation at slight pessimization (reloading it).
|
||||
a = asmArray(a_MEM, N, MemOffsettable, asmInput)
|
||||
b = asmArray(b_MEM, N, MemOffsettable, asmInput)
|
||||
|
||||
t0Sym = ident"t0"
|
||||
t1Sym = ident"t1"
|
||||
|
||||
var # Swappable registers to break dependency chains
|
||||
t0 = Operand(
|
||||
desc: OperandDesc(
|
||||
asmId: "[t0]",
|
||||
nimSymbol: ident"t0",
|
||||
rm: Reg,
|
||||
constraint: Output_EarlyClobber,
|
||||
cEmit: "t0"
|
||||
)
|
||||
)
|
||||
|
||||
t1 = Operand(
|
||||
desc: OperandDesc(
|
||||
asmId: "[t1]",
|
||||
nimSymbol: ident"t1",
|
||||
rm: Reg,
|
||||
constraint: Output_EarlyClobber,
|
||||
cEmit: "t1"
|
||||
)
|
||||
)
|
||||
t0 = asmValue(t0Sym, Reg, asmOutputEarlyClobber)
|
||||
t1 = asmValue(t1Sym, Reg, asmOutputEarlyClobber)
|
||||
|
||||
# Prologue
|
||||
let t0sym = t0.desc.nimSymbol
|
||||
let t1sym = t1.desc.nimSymbol
|
||||
result.add quote do:
|
||||
var `t0sym`{.noinit.}, `t1sym`{.noinit.}: BaseType
|
||||
|
||||
# Algorithm
|
||||
ctx.mov t0, arrA[0] # Prologue
|
||||
ctx.add t0, arrB[0]
|
||||
ctx.mov t0, a[0] # Prologue
|
||||
ctx.add t0, b[0]
|
||||
|
||||
for i in 1 ..< N:
|
||||
ctx.mov t1, arrA[i] # Prepare the next iteration
|
||||
ctx.mov arrR[i-1], t0 # Save the previous result in an interleaved manner
|
||||
ctx.adc t1, arrB[i] # Compute
|
||||
swap(t0, t1) # Break dependency chain
|
||||
ctx.mov t1, a[i] # Prepare the next iteration
|
||||
ctx.mov r[i-1], t0 # Save the previous result in an interleaved manner
|
||||
ctx.adc t1, b[i] # Compute
|
||||
swap(t0, t1) # Break dependency chain
|
||||
|
||||
ctx.mov arrR[N-1], t0 # Epilogue
|
||||
ctx.mov r[N-1], t0 # Epilogue
|
||||
ctx.setToCarryFlag(carry)
|
||||
|
||||
# Codegen
|
||||
result.add ctx.generate
|
||||
result.add ctx.generate()
|
||||
|
||||
func add_asm*(r: var Limbs, a, b: Limbs): Carry {.inline.}=
|
||||
func add_asm*(r: var Limbs, a, b: Limbs): Carry =
|
||||
## Constant-time addition
|
||||
add_gen(result, r, a, b)
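For reference, the operation generated above in plain Nim; illustrative only, since the real kernel interleaves loads/stores and swaps t0/t1 to break the dependency chain, which this sketch does not model.

# Multi-precision addition, returning the final carry (0 or 1).
proc addCarrySketch(r: var openArray[uint64], a, b: openArray[uint64]): uint64 =
  var carry = 0'u64
  for i in 0 ..< a.len:
    let s = a[i] + carry
    let c1 = uint64(s < carry)        # carry out of a[i] + carry
    r[i] = s + b[i]
    let c2 = uint64(r[i] < s)         # carry out of s + b[i]
    carry = c1 + c2                   # at most one of c1 and c2 can be 1
  result = carry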
|
||||
|
||||
# Subtraction
|
||||
# ------------------------------------------------------------
|
||||
|
||||
macro sub_gen[N: static int](borrow: var Borrow, r: var Limbs[N], a, b: Limbs[N]): untyped =
|
||||
macro sub_gen[N: static int](borrow: var Borrow, r_PIR: var Limbs[N], a_MEM, b_MEM: Limbs[N]): untyped =
|
||||
## Generate an optimized out-of-place subtraction kernel
|
||||
|
||||
result = newStmtList()
|
||||
|
||||
var ctx = init(Assembler_x86, BaseType)
|
||||
let
|
||||
arrR = init(OperandArray, nimSymbol = r, N, PointerInReg, InputOutput)
|
||||
arrA = init(OperandArray, nimSymbol = a, N, PointerInReg, Input)
|
||||
arrB = init(OperandArray, nimSymbol = b, N, PointerInReg, Input)
|
||||
r = asmArray(r_PIR, N, PointerInReg, asmInputOutputEarlyClobber, memIndirect = memWrite) # MemOffsettable is the better constraint but compilers say it is impossible. Use early clobber to ensure it is not affected by constant propagation at slight pessimization (reloading it).
|
||||
a = asmArray(a_MEM, N, MemOffsettable, asmInput)
|
||||
b = asmArray(b_MEM, N, MemOffsettable, asmInput)
|
||||
|
||||
t0Sym = ident"t0"
|
||||
t1Sym = ident"t1"
|
||||
|
||||
var # Swappable registers to break dependency chains
|
||||
t0 = Operand(
|
||||
desc: OperandDesc(
|
||||
asmId: "[t0]",
|
||||
nimSymbol: ident"t0",
|
||||
rm: Reg,
|
||||
constraint: Output_EarlyClobber,
|
||||
cEmit: "t0"
|
||||
)
|
||||
)
|
||||
|
||||
t1 = Operand(
|
||||
desc: OperandDesc(
|
||||
asmId: "[t1]",
|
||||
nimSymbol: ident"t1",
|
||||
rm: Reg,
|
||||
constraint: Output_EarlyClobber,
|
||||
cEmit: "t1"
|
||||
)
|
||||
)
|
||||
t0 = asmValue(t0Sym, Reg, asmOutputEarlyClobber)
|
||||
t1 = asmValue(t1Sym, Reg, asmOutputEarlyClobber)
|
||||
|
||||
# Prologue
|
||||
let t0sym = t0.desc.nimSymbol
|
||||
let t1sym = t1.desc.nimSymbol
|
||||
result.add quote do:
|
||||
var `t0sym`{.noinit.}, `t1sym`{.noinit.}: BaseType
|
||||
|
||||
# Algorithm
|
||||
ctx.mov t0, arrA[0] # Prologue
|
||||
ctx.sub t0, arrB[0]
|
||||
ctx.mov t0, a[0] # Prologue
|
||||
ctx.sub t0, b[0]
|
||||
|
||||
for i in 1 ..< N:
|
||||
ctx.mov t1, arrA[i] # Prepare the next iteration
|
||||
ctx.mov arrR[i-1], t0 # Save the previous result in an interleaved manner
|
||||
ctx.sbb t1, arrB[i] # Compute
|
||||
swap(t0, t1) # Break dependency chain
|
||||
ctx.mov t1, a[i] # Prepare the next iteration
|
||||
ctx.mov r[i-1], t0 # Save the previous result in an interleaved manner
|
||||
ctx.sbb t1, b[i] # Compute
|
||||
swap(t0, t1) # Break dependency chain
|
||||
|
||||
ctx.mov arrR[N-1], t0 # Epilogue
|
||||
ctx.mov r[N-1], t0 # Epilogue
|
||||
ctx.setToCarryFlag(borrow)
|
||||
|
||||
# Codegen
|
||||
result.add ctx.generate
|
||||
result.add ctx.generate()
|
||||
|
||||
func sub_asm*(r: var Limbs, a, b: Limbs): Borrow {.inline.}=
|
||||
func sub_asm*(r: var Limbs, a, b: Limbs): Borrow =
|
||||
## Constant-time subtraction
|
||||
sub_gen(result, r, a, b)
|
||||
|
||||
@ -152,7 +152,7 @@ func setMinusOne*(a: var FF) =
|
||||
|
||||
func neg*(r: var FF, a: FF) {.meter.} =
|
||||
## Negate modulo p
|
||||
when UseASM_X86_64:
|
||||
when UseASM_X86_64 and a.mres.limbs.len <= 6: # TODO: handle spilling
|
||||
negmod_asm(r.mres.limbs, a.mres.limbs, FF.fieldMod().limbs)
|
||||
else:
|
||||
# If a = 0 we need r = 0 and not r = M
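A minimal sketch of that special case (single-limb, not constant-time; the real code works on multi-limb field elements and avoids the branch):

# Modular negation where -0 must map to 0, not to M.
proc negModSketch(a, M: uint64): uint64 =
  result = if a == 0: 0'u64 else: M - a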
|
||||
|
||||
@ -118,7 +118,7 @@ func sum2xMod*(r: var FpDbl, a, b: FpDbl) =
|
||||
## Output is conditionally reduced by 2ⁿp
|
||||
## to stay in the [0, 2ⁿp) range
|
||||
when UseASM_X86_64:
|
||||
addmod2x_asm(r.limbs2x, a.limbs2x, b.limbs2x, FpDbl.C.Mod.limbs)
|
||||
addmod2x_asm(r.limbs2x, a.limbs2x, b.limbs2x, FpDbl.C.Mod.limbs, Fp[FpDbl.C].getSpareBits())
|
||||
else:
|
||||
# Addition step
|
||||
var overflowed = SecretBool r.limbs2x.sum(a.limbs2x, b.limbs2x)
|
||||
|
||||
@ -543,10 +543,8 @@ func sumprodMont*[N: static int](
|
||||
r: var Limbs, a, b: array[N, Limbs],
|
||||
M: Limbs, m0ninv: BaseType,
|
||||
spareBits: static int,
|
||||
skipFinalSub: static bool = false) {.noInline.} =
|
||||
skipFinalSub: static bool = false) =
|
||||
## Compute r <- ⅀aᵢ.bᵢ (mod M) (sum of products)
|
||||
# This function must be noInline or GCC miscompiles
|
||||
# with LTO, see https://github.com/mratsim/constantine/issues/230
|
||||
when spareBits >= 2:
|
||||
when UseASM_X86_64 and r.len in {2 .. 6}:
|
||||
if ({.noSideEffect.}: hasAdx()):
|
||||
|
||||
@ -139,5 +139,5 @@ macro debugConsts(): untyped {.used.} =
|
||||
result.add quote do:
|
||||
echo "----------------------------------------------------------------------------"
|
||||
|
||||
# debug: # displayed with -d:debugConstantine
|
||||
# debug: # displayed with -d:CttDebug
|
||||
# debugConsts()
|
||||
|
||||
@ -62,9 +62,9 @@ func sqrx2x_complex_asm_adx*(
|
||||
t0.double(a.c1)
|
||||
t1.sum(a.c0, a.c1)
|
||||
|
||||
r.c1.limbs2x.mul_asm_adx_inline(t0.mres.limbs, a.c0.mres.limbs)
|
||||
r.c1.limbs2x.mul_asm_adx(t0.mres.limbs, a.c0.mres.limbs)
|
||||
t0.diff(a.c0, a.c1)
|
||||
r.c0.limbs2x.mul_asm_adx_inline(t0.mres.limbs, t1.mres.limbs)
|
||||
r.c0.limbs2x.mul_asm_adx(t0.mres.limbs, t1.mres.limbs)
|
||||
|
||||
func sqrx_complex_sparebit_asm_adx*(
|
||||
r: var array[2, Fp],
|
||||
@ -94,15 +94,15 @@ func mul2x_fp2_complex_asm_adx*(
|
||||
var D {.noInit.}: typeof(r.c0)
|
||||
var t0 {.noInit.}, t1 {.noInit.}: typeof(a.c0)
|
||||
|
||||
r.c0.limbs2x.mul_asm_adx_inline(a.c0.mres.limbs, b.c0.mres.limbs)
|
||||
D.limbs2x.mul_asm_adx_inline(a.c1.mres.limbs, b.c1.mres.limbs)
|
||||
when Fp.has1extraBit():
|
||||
t0.sumUnr(a.c0, a.c1)
|
||||
t1.sumUnr(b.c0, b.c1)
|
||||
else:
|
||||
t0.sum(a.c0, a.c1)
|
||||
t1.sum(b.c0, b.c1)
|
||||
r.c1.limbs2x.mul_asm_adx_inline(t0.mres.limbs, t1.mres.limbs)
|
||||
r.c0.limbs2x.mul_asm_adx(a.c0.mres.limbs, b.c0.mres.limbs)
|
||||
D.limbs2x.mul_asm_adx(a.c1.mres.limbs, b.c1.mres.limbs)
|
||||
r.c1.limbs2x.mul_asm_adx(t0.mres.limbs, t1.mres.limbs)
|
||||
when Fp.has1extraBit():
|
||||
r.c1.diff2xUnr(r.c1, r.c0)
|
||||
r.c1.diff2xUnr(r.c1, D)
|
||||
|
||||
@ -856,14 +856,16 @@ func prod2x_complex(r: var QuadraticExt2x, a, b: Fp2) =
|
||||
var D {.noInit.}: typeof(r.c0)
|
||||
var t0 {.noInit.}, t1 {.noInit.}: typeof(a.c0)
|
||||
|
||||
r.c0.prod2x(a.c0, b.c0) # r0 = a0 b0
|
||||
D.prod2x(a.c1, b.c1) # d = a1 b1
|
||||
when Fp2.has1extraBit():
|
||||
t0.sumUnr(a.c0, a.c1)
|
||||
t1.sumUnr(b.c0, b.c1)
|
||||
else:
|
||||
t0.sum(a.c0, a.c1)
|
||||
t1.sum(b.c0, b.c1)
|
||||
|
||||
r.c0.prod2x(a.c0, b.c0) # r0 = a0 b0
|
||||
D.prod2x(a.c1, b.c1) # d = a1 b1
|
||||
|
||||
r.c1.prod2x(t0, t1) # r1 = (b0 + b1)(a0 + a1)
|
||||
when Fp2.has1extraBit():
|
||||
r.c1.diff2xUnr(r.c1, r.c0) # r1 = (b0 + b1)(a0 + a1) - a0 b0
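The reordering in the two hunks above does not change the underlying identity. For reference, a plain-integer sketch of the 3-multiplication complex product (illustrative only, ignoring the modular and double-precision details):

# (a0 + i*a1) * (b0 + i*b1) with i^2 = -1, using 3 multiplications instead of 4.
proc complexProdSketch(a0, a1, b0, b1: int): (int, int) =
  let v0 = a0 * b0
  let v1 = a1 * b1
  let cross = (a0 + a1) * (b0 + b1)
  result = (v0 - v1, cross - v0 - v1)   # (real part, imaginary part)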
|
||||
@ -1052,9 +1054,6 @@ func prod2x_disjoint*[Fdbl, F](
|
||||
var V0 {.noInit.}, V1 {.noInit.}: typeof(r.c0) # Double-precision
|
||||
var t0 {.noInit.}, t1 {.noInit.}: typeof(a0) # Single-width
|
||||
|
||||
# Require 2 extra bits
|
||||
V0.prod2x(a0, b0) # v0 = a0b0
|
||||
V1.prod2x(a1, b1) # v1 = a1b1
|
||||
when F.has1extraBit():
|
||||
t0.sumUnr(a0, a1)
|
||||
t1.sumUnr(b0, b1)
|
||||
@ -1062,6 +1061,9 @@ func prod2x_disjoint*[Fdbl, F](
|
||||
t0.sum(a0, a1)
|
||||
t1.sum(b0, b1)
|
||||
|
||||
V0.prod2x(a0, b0) # v0 = a0b0
|
||||
V1.prod2x(a1, b1) # v1 = a1b1
|
||||
|
||||
r.c1.prod2x(t0, t1) # r1 = (a0 + a1)(b0 + b1)
|
||||
r.c1.diff2xMod(r.c1, V0) # r1 = (a0 + a1)(b0 + b1) - a0b0
|
||||
r.c1.diff2xMod(r.c1, V1) # r1 = (a0 + a1)(b0 + b1) - a0b0 - a1b1
|
||||
|
||||
@ -41,8 +41,7 @@ export BigInt, wordsRequired
|
||||
func unmarshalLE[T](
|
||||
dst: var openArray[T],
|
||||
src: openarray[byte],
|
||||
wordBitWidth: static int
|
||||
) =
|
||||
wordBitWidth: static int) =
|
||||
## Parse an unsigned integer from its canonical
|
||||
## little-endian unsigned representation
|
||||
## and store it into a BigInt
|
||||
@ -85,8 +84,7 @@ func unmarshalLE[T](
|
||||
func unmarshalBE[T](
|
||||
dst: var openArray[T],
|
||||
src: openarray[byte],
|
||||
wordBitWidth: static int
|
||||
) =
|
||||
wordBitWidth: static int) =
|
||||
## Parse an unsigned integer from its canonical
|
||||
## big-endian unsigned representation (octet string)
|
||||
## and store it into a BigInt.
|
||||
|
||||
@ -17,7 +17,7 @@ import ../../metering/tracer
|
||||
|
||||
export primitives, tracer
|
||||
|
||||
when sizeof(int) == 8 and not defined(Constantine32):
|
||||
when sizeof(int) == 8 and not defined(Ctt32):
|
||||
type
|
||||
BaseType* = uint64
|
||||
## Physical BigInt for conversion in "normal integers"
|
||||
@ -67,7 +67,7 @@ type VarTime* = object
|
||||
|
||||
type SignedSecretWord* = distinct SecretWord
|
||||
|
||||
when sizeof(int) == 8 and not defined(Constantine32):
|
||||
when sizeof(int) == 8 and not defined(Ctt32):
|
||||
type
|
||||
SignedBaseType* = int64
|
||||
else:
|
||||
|
||||
@ -49,28 +49,16 @@ template mux_x86_impl() {.dirty.} =
|
||||
static: doAssert(X86)
|
||||
static: doAssert(GCC_Compatible)
|
||||
|
||||
when sizeof(T) == 8:
|
||||
var muxed = x
|
||||
asm """
|
||||
testq %[ctl], %[ctl]
|
||||
cmovzq %[y], %[muxed]
|
||||
: [muxed] "+r" (`muxed`)
|
||||
: [ctl] "r" (`ctl`), [y] "r" (`y`)
|
||||
: "cc"
|
||||
"""
|
||||
muxed
|
||||
elif sizeof(T) == 4:
|
||||
var muxed = x
|
||||
asm """
|
||||
testl %[ctl], %[ctl]
|
||||
cmovzl %[y], %[muxed]
|
||||
: [muxed] "+r" (`muxed`)
|
||||
: [ctl] "r" (`ctl`), [y] "r" (`y`)
|
||||
: "cc"
|
||||
"""
|
||||
muxed
|
||||
else:
|
||||
{.error: "Unsupported word size".}
|
||||
var muxed = x
|
||||
asm """
|
||||
test %[ctl], %[ctl]
|
||||
cmovz %[muxed], %[y]
|
||||
: [muxed] "+r" (`muxed`)
|
||||
: [ctl] "r" (`ctl`), [y] "r" (`y`)
|
||||
: "cc"
|
||||
"""
|
||||
muxed
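The portable equivalent of this inline-assembly mux, as a sketch (illustrative only; relying on the compiler to keep a mask-based select branchless is exactly what the assembly above avoids):

# Return x when ctl is set, y otherwise, without branching.
func muxSketch(ctl: bool, x, y: uint64): uint64 =
  let mask = 0'u64 - uint64(ctl)      # all-ones if ctl, all-zeros otherwise
  result = (x and mask) or (y and not mask)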
|
||||
|
||||
|
||||
func mux_x86[T](ctl: CTBool[T], x, y: T): T {.inline.}=
|
||||
## Multiplexer / selector
|
||||
@ -92,42 +80,23 @@ func ccopy_x86[T](ctl: CTBool[T], x: var T, y: T) {.inline.}=
|
||||
static: doAssert(X86)
|
||||
static: doAssert(GCC_Compatible)
|
||||
|
||||
when sizeof(T) == 8:
|
||||
when defined(cpp):
|
||||
asm """
|
||||
testq %[ctl], %[ctl]
|
||||
cmovnzq %[y], %[x]
|
||||
when defined(cpp):
|
||||
asm """
|
||||
test %[ctl], %[ctl]
|
||||
cmovnz %[x], %[y]
|
||||
: [x] "+r" (`x`)
|
||||
: [ctl] "r" (`ctl`), [y] "r" (`y`)
|
||||
: "cc"
|
||||
"""
|
||||
else:
|
||||
asm """
|
||||
testq %[ctl], %[ctl]
|
||||
cmovnzq %[y], %[x]
|
||||
: [x] "+r" (`*x`)
|
||||
: [ctl] "r" (`ctl`), [y] "r" (`y`)
|
||||
: "cc"
|
||||
"""
|
||||
elif sizeof(T) == 4:
|
||||
when defined(cpp):
|
||||
asm """
|
||||
testl %[ctl], %[ctl]
|
||||
cmovnzl %[y], %[x]
|
||||
: [x] "+r" (`x`)
|
||||
: [ctl] "r" (`ctl`), [y] "r" (`y`)
|
||||
: "cc"
|
||||
"""
|
||||
else:
|
||||
asm """
|
||||
testl %[ctl], %[ctl]
|
||||
cmovnzl %[y], %[x]
|
||||
: [x] "+r" (`*x`)
|
||||
: [ctl] "r" (`ctl`), [y] "r" (`y`)
|
||||
: "cc"
|
||||
"""
|
||||
|
||||
else:
|
||||
{.error: "Unsupported word size".}
|
||||
asm """
|
||||
test %[ctl], %[ctl]
|
||||
cmovnz %[x], %[y]
|
||||
: [x] "+r" (`*x`)
|
||||
: [ctl] "r" (`ctl`), [y] "r" (`y`)
|
||||
: "cc"
|
||||
"""
|
||||
|
||||
# Public functions
|
||||
# ------------------------------------------------------------
|
||||
|
||||
@ -44,7 +44,7 @@ macro replacePragmasByInline(procAst: typed): untyped =
|
||||
|
||||
result = newStmtList()
|
||||
|
||||
# The push cdecl is applied multiple times :/, so fight push with push
|
||||
# The push noconv is applied multiple times :/, so fight push with push
|
||||
result.add nnkPragma.newTree(ident"push", ident"nimcall", ident"inline")
|
||||
|
||||
result.add newProc(
|
||||
@ -61,7 +61,7 @@ macro wrapOpenArrayLenType*(ty: typedesc, procAst: untyped): untyped =
|
||||
## Wraps pointer+len library calls in properly typed and converted openArray calls
|
||||
##
|
||||
## ```
|
||||
## {.push cdecl.}
|
||||
## {.push noconv.}
|
||||
## proc foo*(r: int, a: openArray[CustomType], b: int) {.wrapOpenArrayLenType: uint32, importc: "foo", dynlib: "libfoo.so".}
|
||||
## {.pop.}
|
||||
## ```
|
||||
@ -69,7 +69,7 @@ macro wrapOpenArrayLenType*(ty: typedesc, procAst: untyped): untyped =
|
||||
## is transformed into
|
||||
##
|
||||
## ```
|
||||
## proc foo(r: int, a: ptr CustomType, aLen: uint32, b: int) {.cdecl, importc: "foo", dynlib: "libfoo.so".}
|
||||
## proc foo(r: int, a: ptr CustomType, aLen: uint32, b: int) {.noconv, importc: "foo", dynlib: "libfoo.so".}
|
||||
##
|
||||
## proc foo*(r: int, a: openArray[CustomType], b: int) {.inline.} =
|
||||
## foo(r, a[0].unsafeAddr, a.len.uint32, b)
|
||||
@ -140,7 +140,7 @@ macro wrapOpenArrayLenType*(ty: typedesc, procAst: untyped): untyped =
|
||||
|
||||
when isMainModule:
|
||||
expandMacros:
|
||||
{.push cdecl.}
|
||||
{.push noconv.}
|
||||
|
||||
proc foo(x: int, a: openArray[uint32], name: cstring) {.wrapOpenArrayLenType: cuint.} =
|
||||
discard
|
||||
|
||||
@ -26,7 +26,7 @@ static: echo "[Constantine] Using library " & libLLVM
|
||||
# also link to libLLVM, for example if they implement a virtual machine (for the EVM, for Snarks/zero-knowledge, ...).
|
||||
# Hence Constantine should always use LLVM context to "namespace" its own codegen and avoid collisions in the global context.
|
||||
|
||||
{.push cdecl, dynlib: libLLVM.}
|
||||
{.push noconv, dynlib: libLLVM.}
|
||||
|
||||
# ############################################################
|
||||
#
|
||||
@ -571,4 +571,4 @@ proc memset*(builder: BuilderRef, `ptr`, val, len: ValueRef, align: uint32) {.im
|
||||
proc memcpy*(builder: BuilderRef, dst: ValueRef, dstAlign: uint32, src: ValueRef, srcAlign: uint32, size: ValueRef) {.importc: "LLVMBuildMemcpy".}
|
||||
proc memmove*(builder: BuilderRef, dst: ValueRef, dstAlign: uint32, src: ValueRef, srcAlign: uint32, size: ValueRef) {.importc: "LLVMBuildMemmove".}
|
||||
|
||||
{.pop.} # {.used, hint[Name]: off, cdecl, dynlib: libLLVM.}
|
||||
{.pop.} # {.used, hint[Name]: off, noconv, dynlib: libLLVM.}
|
||||
|
||||
@ -482,7 +482,7 @@ type
|
||||
CUstream* = distinct pointer
|
||||
CUdeviceptr* = distinct pointer
|
||||
|
||||
{.push cdecl, importc, dynlib: "libcuda.so".}
|
||||
{.push noconv, importc, dynlib: "libcuda.so".}
|
||||
|
||||
proc cuInit*(flags: uint32): CUresult
|
||||
|
||||
@ -515,4 +515,4 @@ proc cuMemFree*(devptr: CUdeviceptr): CUresult
|
||||
proc cuMemcpyHtoD*(dst: CUdeviceptr, src: pointer, size: csize_t): CUresult
|
||||
proc cuMemcpyDtoH*(dst: pointer, src: CUdeviceptr, size: csize_t): CUresult
|
||||
|
||||
{.pop.} # {.push cdecl, importc, dynlib: "libcuda.so".}
|
||||
{.pop.} # {.push noconv, importc, dynlib: "libcuda.so".}
|
||||
|
||||
@ -4,7 +4,7 @@ proc cpuidX86(eaxi, ecxi: int32): tuple[eax, ebx, ecx, edx: int32] {.used.}=
|
||||
when defined(vcc):
|
||||
# limited inline asm support in vcc, so intrinsics, here we go:
|
||||
proc cpuidVcc(cpuInfo: ptr int32; functionID, subFunctionID: int32)
|
||||
{.cdecl, importc: "__cpuidex", header: "intrin.h".}
|
||||
{.noconv, importc: "__cpuidex", header: "intrin.h".}
|
||||
cpuidVcc(addr result.eax, eaxi, ecxi)
|
||||
else:
|
||||
var (eaxr, ebxr, ecxr, edxr) = (0'i32, 0'i32, 0'i32, 0'i32)
|
||||
|
||||
File diff suppressed because it is too large
@ -53,7 +53,7 @@ when X86 and GCC_Compatible:
|
||||
# ############################################################
|
||||
|
||||
template debug*(body: untyped): untyped =
|
||||
when defined(debugConstantine):
|
||||
when defined(CttDebug):
|
||||
body
|
||||
|
||||
proc builtin_unreachable(){.nodecl, importc: "__builtin_unreachable".}
|
||||
|
||||
@ -34,7 +34,7 @@ import std/macros
|
||||
# --------------------------------------------------------
|
||||
|
||||
# Everything should be a template that doesn't produce any code
|
||||
# when debugConstantine is not defined.
|
||||
# when CttDebug is not defined.
|
||||
# Those checks are controlled by a custom flag instead of
|
||||
# "--boundsChecks" or "--nilChecks" to decouple them from user code checks.
|
||||
# Furthermore, we want them to be very lightweight on performance
|
||||
|
||||
@ -76,9 +76,9 @@ const ULF_WAKE_MASK = ULF_NO_ERRNO or
|
||||
ULF_WAKE_THREAD or
|
||||
ULF_WAKE_ALLOW_NON_OWNER
|
||||
|
||||
proc ulock_wait(operation: uint32, address: pointer, expected: uint64, timeout: uint32): cint {.importc:"__ulock_wait", cdecl.}
|
||||
proc ulock_wait2(operation: uint32, address: pointer, expected: uint64, timeout, value2: uint64): cint {.importc:"__ulock_wait2", cdecl.}
|
||||
proc ulock_wake(operation: uint32, address: pointer, wake_value: uint64): cint {.importc:"__ulock_wake", cdecl.}
|
||||
proc ulock_wait(operation: uint32, address: pointer, expected: uint64, timeout: uint32): cint {.importc:"__ulock_wait", noconv.}
|
||||
proc ulock_wait2(operation: uint32, address: pointer, expected: uint64, timeout, value2: uint64): cint {.importc:"__ulock_wait2", noconv.}
|
||||
proc ulock_wake(operation: uint32, address: pointer, wake_value: uint64): cint {.importc:"__ulock_wake", noconv.}
|
||||
|
||||
# Futex API
|
||||
# ------------------------------------------------------------------------
|
||||
|
||||
@ -150,7 +150,7 @@ macro genCharAPI*(procAst: untyped): untyped =
|
||||
wrapperBody.add ident($procAst.params[i][j])
|
||||
|
||||
var pragmas = nnkPragma.newTree(ident"inline")
|
||||
let skipPragmas = ["inline", "noinline", "noInline", "exportc", "exportcpp", "extern", "cdecl", "stdcall", "dynlib", "libPrefix"]
|
||||
let skipPragmas = ["inline", "noinline", "noInline", "exportc", "exportcpp", "extern", "noconv", "cdecl", "stdcall", "dynlib", "libPrefix"]
|
||||
for i in 0 ..< procAst.pragma.len:
|
||||
if procAst.pragma[i].kind == nnkIdent:
|
||||
if $procAst.pragma[i] notin skipPragmas:
|
||||
|
||||
@ -15,7 +15,7 @@
|
||||
# that internally uses `sha256.hash`,
|
||||
# the ideal outcome is for `sha256.hash to be exported as `ctt_sha256_hash` and
|
||||
# have `hash_to_curve` directly use that.
|
||||
# 3. Furthermore while compiling Nim only, no export marker (cdecl, dynlib, exportc) are used.
|
||||
# 3. Furthermore while compiling Nim only, no export marker (noconv, dynlib, exportc) are used.
|
||||
#
|
||||
# Each prefix must be modified before importing the module to export
|
||||
|
||||
@ -37,7 +37,7 @@ macro libPrefix*(prefix: static string, procAst: untyped): untyped =
|
||||
if pragmas.kind == nnkEmpty:
|
||||
pragmas = nnkPragma.newTree()
|
||||
|
||||
pragmas.add ident"cdecl"
|
||||
pragmas.add ident"noconv"
|
||||
pragmas.add nnkExprColonExpr.newTree(
|
||||
ident"exportc",
|
||||
newLit(prefix & "$1"))
|
||||
|
||||
BIN media/bls12_381_msm_i9-11980HK-8cores_1.png (new file, 459 KiB, binary not shown)
BIN media/bls12_381_msm_i9-11980HK-8cores_2.png (new file, 464 KiB, binary not shown)
BIN media/bls12_381_msm_i9-11980HK-8cores_3.png (new file, 334 KiB, binary not shown)
BIN media/bls12_381_perf_summary_i9-11980HK.png (new file, 150 KiB, binary not shown)
BIN media/bn254_snarks_msm-i9-9980XE-18cores.png (new file, 91 KiB, binary not shown)
BIN media/parallel_load_distribution.png (new file, 598 KiB, binary not shown)
@ -6,60 +6,42 @@
|
||||
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
|
||||
# at your option. This file may not be copied, modified, or distributed except according to those terms.
|
||||
|
||||
import
|
||||
std/[macros, times, monotimes],
|
||||
../benchmarks/platforms
|
||||
|
||||
# ############################################################
|
||||
#
|
||||
# Trace operations
|
||||
#
|
||||
# ############################################################
|
||||
|
||||
# Utils
|
||||
# --------------------------------------------------
|
||||
const someGcc = defined(gcc) or defined(llvm_gcc) or defined(clang) or defined(icc)
|
||||
const hasThreadSupport = defined(threads)
|
||||
|
||||
proc atomicInc*(memLoc: var int64, x = 1'i64): int64 =
|
||||
when someGcc and hasThreadSupport:
|
||||
result = atomicAddFetch(memLoc.addr, x, ATOMIC_RELAXED)
|
||||
elif defined(vcc) and hasThreadSupport:
|
||||
result = addAndFetch(memLoc.addr, x)
|
||||
result += x
|
||||
else:
|
||||
memloc += x
|
||||
result = memLoc
|
||||
|
||||
# Types
|
||||
# --------------------------------------------------
|
||||
|
||||
type
|
||||
Metadata* = object
|
||||
procName*: string
|
||||
module: string
|
||||
package: string
|
||||
tag: string # Can be changed to multi-tags later
|
||||
numCalls*: int64
|
||||
cumulatedTimeNs*: int64 # in microseconds
|
||||
when SupportsGetTicks:
|
||||
cumulatedCycles*: int64
|
||||
|
||||
template mtag(tagname: string){.pragma, used.}
|
||||
## This will allow tagging proc in the future with
|
||||
## "Fp", "ec", "polynomial"
|
||||
|
||||
const CttMeter {.booldefine.} = off
|
||||
const CttTrace {.booldefine.} = off # For manual "debug-echo"-style timing.
|
||||
|
||||
var ctMetrics{.compileTime.}: seq[Metadata]
|
||||
## Metrics are collected here, this is just a temporary holder of compileTime values
|
||||
## Unfortunately the "seq" is emptied when passing the compileTime/runtime boundaries
|
||||
## due to Nim bugs
|
||||
|
||||
when CttMeter or CttTrace:
|
||||
|
||||
import ../benchmarks/platforms
|
||||
|
||||
type
|
||||
Metadata* = object
|
||||
procName*: string
|
||||
module: string
|
||||
package: string
|
||||
tag: string # Can be changed to multi-tags later
|
||||
numCalls*: int64
|
||||
cumulatedTimeNs*: int64 # in microseconds
|
||||
when SupportsGetTicks:
|
||||
cumulatedCycles*: int64
|
||||
|
||||
var ctMetrics{.compileTime.}: seq[Metadata]
|
||||
## Metrics are collected here, this is just a temporary holder of compileTime values
|
||||
## Unfortunately the "seq" is emptied when passing the compileTime/runtime boundaries
|
||||
## due to Nim bugs
|
||||
|
||||
|
||||
# strformat doesn't work in templates.
|
||||
from strutils import alignLeft, formatFloat
|
||||
import std/[macros, times, monotimes]
|
||||
|
||||
var Metrics*: seq[Metadata]
|
||||
## We can't directly use it at compileTime because it doesn't exist.
|
||||
@ -69,80 +51,96 @@ when CttMeter or CttTrace:
|
||||
proc resetMetering*() =
|
||||
Metrics = static(ctMetrics)
|
||||
|
||||
# Symbols
|
||||
# --------------------------------------------------
|
||||
|
||||
template fnEntry(name: string, id: int, startTime, startCycle: untyped): untyped =
|
||||
## Bench tracing to insert on function entry
|
||||
{.noSideEffect, gcsafe.}:
|
||||
discard Metrics[id].numCalls.atomicInc()
|
||||
let startTime = getMonoTime()
|
||||
when SupportsGetTicks:
|
||||
let startCycle = getTicks()
|
||||
# Utils
|
||||
# --------------------------------------------------
|
||||
const someGcc = defined(gcc) or defined(llvm_gcc) or defined(clang) or defined(icc)
|
||||
const hasThreadSupport = defined(threads)
|
||||
|
||||
proc atomicInc*(memLoc: var int64, x = 1'i64): int64 =
|
||||
when someGcc and hasThreadSupport:
|
||||
result = atomicAddFetch(memLoc.addr, x, ATOMIC_RELAXED)
|
||||
elif defined(vcc) and hasThreadSupport:
|
||||
result = addAndFetch(memLoc.addr, x)
|
||||
result += x
|
||||
else:
|
||||
let startCycle = 0
|
||||
memloc += x
|
||||
result = memLoc
|
||||
|
||||
template fnExit(name: string, id: int, startTime, startCycle: untyped): untyped =
  ## Bench tracing to insert before each function exit
  {.noSideEffect, gcsafe.}:
    when SupportsGetTicks:
      let stopCycle = getTicks()
    let stopTime = getMonoTime()
    when SupportsGetTicks:
      let elapsedCycles = stopCycle - startCycle
    let elapsedTime = inMicroseconds(stopTime - startTime)

    discard Metrics[id].cumulatedTimeNs.atomicInc(elapsedTime)
    when SupportsGetTicks:
      discard Metrics[id].cumulatedCycles.atomicInc(elapsedCycles)

    when CttTrace:
      # Advice: Use "when name == relevantProc" to isolate specific procedures.
      # strformat doesn't work in templates.
      when SupportsGetTicks:
        echo static(alignLeft(name, 50)),
          "Time (µs): ", alignLeft(formatFloat(elapsedTime.float64 * 1e-3, precision=3), 10),
          "Cycles (billions): ", formatFloat(elapsedCycles.float64 * 1e-9, precision=3)
      else:
        echo static(alignLeft(name, 50)),
          "Time (µs): ", alignLeft(formatFloat(elapsedTime.float64 * 1e-3, precision=3), 10)

template fnEntry(name: string, id: int, startTime, startCycle: untyped): untyped =
  ## Bench tracing to insert on function entry
  {.noSideEffect, gcsafe.}:
    discard Metrics[id].numCalls.atomicInc()
    let startTime = getMonoTime()
    when SupportsGetTicks:
      let startCycle = getTicks()
    else:
      let startCycle = 0

# Symbols
# --------------------------------------------------

macro meterAnnotate(procAst: untyped): untyped =
  procAst.expectKind({nnkProcDef, nnkFuncDef})

  let id = ctMetrics.len
  let name = procAst[0].repr & procAst[3].repr
  # TODO, get the module and the package the proc is coming from
  # and the tag "Fp", "ec", "polynomial" ...

  ctMetrics.add Metadata(procName: name)
  var newBody = newStmtList()
  let startTime = genSym(nskLet, "metering_" & name & "_startTime_")
  let startCycle = genSym(nskLet, "metering_" & name & "_startCycles_")
  newBody.add getAst(fnEntry(name, id, startTime, startCycle))
  newbody.add nnkDefer.newTree(getAst(fnExit(name, id, startTime, startCycle)))
  newBody.add procAst.body

  if procAst[4].kind != nnkEmpty:
    # Timing procedures adds the TimeEffect tag, which interferes with {.tags:[VarTime].}
    # as TimeEffect is not listed. We drop the `tags` for metering
    var pragmas: NimNode
    if procAst[4].len == 1:
      if procAst[4][0].kind == nnkExprColonExpr and procAst[4][0][0].eqIdent"tags":
        pragmas = newEmptyNode()
      else:
        pragmas = procAst[4]
    else:
      pragmas = nnkPragma.newTree()
      for i in 0 ..< procAst[4].len:
        if procAst[4][0].kind == nnkExprColonExpr and procAst[4][0][0].eqIdent"tags":
          continue
        else:
          pragmas.add procAst[4][0]
    procAst[4] = pragmas

  procAst.body = newBody
  result = procAst

template meter*(procBody: untyped): untyped =
  when CttMeter or CttTrace:

@ -157,14 +155,15 @@ when isMainModule:

  static: doAssert CttMeter or CttTrace, "CttMeter or CttTrace must be on for tracing"

  expandMacros:
    proc foo(x: int): int{.meter.} =
      echo "Hey hey hey"
      result = x
  when CttMeter or CttTrace: # Avoid warnings from nim check or nimsuggest
    expandMacros:
      proc foo(x: int): int{.meter.} =
        echo "Hey hey hey"
        result = x

  resetMetering()

  echo Metrics
  discard foo(10)
  echo Metrics
  doAssert Metrics[0].numCalls == 1
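
As an aside on the metering hunks above: meterAnnotate rewrites an annotated proc so that fnEntry runs first and fnExit runs on every exit path through a defer. The following is a self-contained, simplified illustration of that defer-based wrapping, not the macro's actual expansion (it skips call counting and the shared Metrics table):

# Illustrative sketch only -- hand-written version of what {.meter.} automates.
import std/[monotimes, times]

proc foo(x: int): int =
  let start = getMonoTime()                 # what fnEntry records per call
  defer:                                    # defer runs on every exit path, like fnExit
    let elapsedUs = inMicroseconds(getMonoTime() - start)
    echo "foo took ", elapsedUs, " µs"
  echo "Hey hey hey"
  result = x

discard foo(10)
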
@ -52,7 +52,7 @@ type

  NvvmProgram = distinct pointer

{.push cdecl, importc, dynlib: "libnvvm.so".}
{.push noconv, importc, dynlib: "libnvvm.so".}

proc nvvmGetErrorString*(r: NvvmResult): cstring
proc nvvmVersion*(major, minor: var int32): NvvmResult
@ -69,7 +69,7 @@ proc nvvmGetCompiledResult*(prog: NvvmProgram; buffer: ptr char): NvvmResult
proc nvvmGetProgramLogSize*(prog: NvvmProgram; bufferSizeRet: var csize_t): NvvmResult
proc nvvmGetProgramLog*(prog: NvvmProgram; buffer: ptr char): NvvmResult

{.pop.} # {.push cdecl, importc, header: "<nvvm.h>".}
{.pop.} # {.push noconv, importc, header: "<nvvm.h>".}

# ############################################################
#
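
The cdecl-to-noconv switch in this binding (and in the OpenSSL shim further below) only changes the calling convention Nim attaches to the imported symbols: cdecl emits an explicit __cdecl attribute, while noconv emits none, so the backend C compiler's platform default applies, matching a plain header prototype. A minimal, hypothetical binding showing the same pragma (sqrt from math.h stands in for the NVVM functions):

# Hypothetical stand-in binding, illustrating the noconv pragma used above.
proc c_sqrt(x: cdouble): cdouble {.importc: "sqrt", header: "<math.h>", noconv.}

echo c_sqrt(2.0)  # ~1.4142
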
@ -1 +1 @@
-d:debugConstantine
-d:CttDebug

@ -1,2 +1,2 @@
-d:testingCurves
-d:debugConstantine
-d:CttDebug
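
Both nim.cfg hunks rename the project's debug switch from debugConstantine to CttDebug. Such -d: switches are queried with defined() at compile time; a minimal sketch of how a CttDebug-gated path typically looks (hypothetical body, not Constantine's actual debug code):

# Hypothetical sketch: compile with `nim c -d:CttDebug app.nim` to keep the first branch.
when defined(CttDebug):
  echo "CttDebug: extra checks and logging enabled"
else:
  discard  # the debug code is not even compiled in other configurations
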
@ -142,11 +142,15 @@ proc runTowerTests*[N](
      block:
        var r{.noinit.}: Field
        r.square(One)
        check: bool(r == One)
        doAssert bool(r == One),
          "\n(" & $Field & "): Expected one: " & One.toHex() & "\n" &
          "got: " & r.toHex()
      block:
        var r{.noinit.}: Field
        r.prod(One, One)
        check: bool(r == One)
        doAssert bool(r == One),
          "\n(" & $Field & "): Expected one: " & One.toHex() & "\n" &
          "got: " & r.toHex()

    staticFor(curve, TestCurves):
      test(ExtField(ExtDegree, curve))
@ -168,12 +172,16 @@ proc runTowerTests*[N](
        var r: Field
        r.square(Two)

        check: bool(r == Four)
        doAssert bool(r == Four),
          "\n(" & $Field & "): Expected 4: " & Four.toHex() & "\n" &
          "got: " & r.toHex()
      block:
        var r: Field
        r.prod(Two, Two)

        check: bool(r == Four)
        doAssert bool(r == Four),
          "\n(" & $Field & "): Expected 4: " & Four.toHex() & "\n" &
          "got: " & r.toHex()

    staticFor(curve, TestCurves):
      test(ExtField(ExtDegree, curve))
@ -197,12 +205,16 @@ proc runTowerTests*[N](
        var u: Field
        u.square(Three)

        check: bool(u == Nine)
        doAssert bool(u == Nine),
          "\n(" & $Field & "): Expected 9: " & Nine.toHex() & "\n" &
          "got: " & u.toHex()
      block:
        var u: Field
        u.prod(Three, Three)

        check: bool(u == Nine)
        doAssert bool(u == Nine),
          "\n(" & $Field & "): Expected 9: " & Nine.toHex() & "\n" &
          "got: " & u.toHex()

    staticFor(curve, TestCurves):
      test(ExtField(ExtDegree, curve))
@ -226,12 +238,16 @@ proc runTowerTests*[N](
        var u: Field
        u.square(MinusThree)

        check: bool(u == Nine)
        doAssert bool(u == Nine),
          "\n(" & $Field & "): Expected 9: " & Nine.toHex() & "\n" &
          "got: " & u.toHex()
      block:
        var u: Field
        u.prod(MinusThree, MinusThree)

        check: bool(u == Nine)
        doAssert bool(u == Nine),
          "\n(" & $Field & "): Expected 9: " & Nine.toHex() & "\n" &
          "got: " & u.toHex()

    staticFor(curve, TestCurves):
      test(ExtField(ExtDegree, curve))
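
The test hunks above replace unittest's check with doAssert plus an explicit message that dumps both operands in hex. The message argument of doAssert is only evaluated in the failure branch, so the string concatenation and toHex calls cost nothing while tests pass; a tiny self-contained sketch of the same pattern with plain integers:

# Minimal sketch of the doAssert-with-message pattern used in the tower tests above.
let expected = 4
let got = 2 + 2
doAssert got == expected,
  "\nExpected: " & $expected & "\n" &
  "got:      " & $got
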
@ -22,10 +22,10 @@ import

const
  Iters = 4
  TestCurves = [
    BN254_Nogami,
  TestCurves = [ # Note activating some combination of curves causes miscompile / bad constant propagation with LTO in Windows MinGW GCC 12.2 (but not 8.1 or not 12.2 on Linux)
    # BN254_Nogami,
    BN254_Snarks,
    BLS12_377,
    # BLS12_377,
    BLS12_381
  ]

@ -37,7 +37,7 @@ when not defined(windows):
  proc SHA256[T: byte|char](
        msg: openarray[T],
        digest: ptr array[32, byte] = nil
      ): ptr array[32, byte] {.cdecl, dynlib: DLLSSLName, importc.}
      ): ptr array[32, byte] {.noconv, dynlib: DLLSSLName, importc.}

  # proc EVP_Q_digest[T: byte|char](
  #   ossl_libctx: pointer,
@ -45,7 +45,7 @@ when not defined(windows):
  #   propq: cstring,
  #   data: openArray[T],
  #   digest: var array[32, byte],
  #   size: ptr uint): int32 {.cdecl, dynlib: DLLSSLName, importc.}
  #   size: ptr uint): int32 {.noconv, dynlib: DLLSSLName, importc.}

  proc SHA256_OpenSSL[T: byte|char](
        digest: var array[32, byte],