Rework assembly to be compatible with LTO (#231)
* Rework assembler register/mem and constraint declarations
* Introduce constraint UnmutatedPointerToWriteMem
* Create individual memory cell operands
* [Assembly] Fully support indirect memory addressing
* Fix calling convention for exported procs
* Prepare for switch to Intel syntax, to avoid Clang constant propagation interfering with asm symbol names or pointer+offset addressing
* Use modifiers to prevent bad string mixing of propagated consts from assembler to linker
* Assembly: switch to Intel syntax
* With working memory operands: now works with LTO on both GCC and Clang, and with constant folding
* Use memory operands in more places
* Remove some inline hints now that we have LTO
* Clean up compiler config and benches
* Tracer shouldn't force dependencies when unused
* Fix CC handling on Linux
* Nimble fixes
* Update README [skip CI]
* Update macOS CI with Homebrew Clang
* Oops, nimble bindings disappeared
* More nimble fixes
* Fix sha256 exported symbol
* Improve constraints on modular addition
* Add extra constraint to force reloading of pointer in reg inputs
* Fix LLVM gold linker running out of registers
* Workaround MinGW64 GCC 12.2 bad codegen in t_pairing_cyclotomic_subgroup with LTO
Parent: 9a7137466e
This commit: c6d9a213f2
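The core of the rework: under LTO the optimizer may inline, constant-fold, or rename the symbols that the old AT&T-syntax templates referenced by name, so the assembler now passes every memory cell as an explicit operand and emits Intel syntax. Below is a minimal illustrative sketch of that technique in GCC/Clang extended asm; it is not Constantine's actual generated code (which comes from its Nim assembler DSL), just the general pattern:

```c
/* Illustrative sketch only -- not Constantine's generated code.
 * Compile with: cc -O3 -flto -masm=intel -c add2.c
 *
 * Each limb is an individual "+m" / "m" memory-cell operand, so the
 * compiler knows exactly which memory the template reads and writes.
 * Under LTO this keeps the asm valid even when the surrounding code
 * is inlined or constant-folded, which is what broke templates that
 * referenced symbol names or raw pointers directly.
 */
#include <stdint.h>

void add2(uint64_t a[2], const uint64_t b[2]) {
    __asm__(
        "mov rax, %[b0]\n\t"
        "add %[a0], rax\n\t"    /* a[0] += b[0], sets carry flag */
        "mov rax, %[b1]\n\t"
        "adc %[a1], rax"        /* a[1] += b[1] + carry          */
        : [a0] "+m"(a[0]), [a1] "+m"(a[1])
        : [b0] "m"(b[0]), [b1] "m"(b[1])
        : "rax", "cc");
}
```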
.github/workflows/ci.yml (vendored): 32 lines changed
@@ -25,6 +25,10 @@ jobs:
           cpu: amd64
           TEST_LANG: c
           BACKEND: NO_ASM
+        - os: windows
+          cpu: amd64
+          TEST_LANG: c
+          BACKEND: ASM
         - os: macos
           cpu: amd64
           TEST_LANG: c
@@ -172,7 +176,19 @@ jobs:

       - name: Install test dependencies (macOS)
         if: runner.os == 'macOS'
-        run: brew install gmp
+        run: |
+          brew install gmp
+          mkdir -p external/bin
+          cat << EOF > external/bin/clang
+          #!/bin/bash
+          exec $(brew --prefix llvm@15)/bin/clang "\$@"
+          EOF
+          cat << EOF > external/bin/clang++
+          #!/bin/bash
+          exec $(brew --prefix llvm@15)/bin/clang++ "\$@"
+          EOF
+          chmod 755 external/bin/{clang,clang++}
+          echo '${{ github.workspace }}/external/bin' >> $GITHUB_PATH

       - name: Setup MSYS2 (Windows)
         if: runner.os == 'Windows'
@@ -210,9 +226,19 @@ jobs:
         shell: bash
         run: |
           cd constantine
-          nimble bindings --verbose
+          nimble bindings_no_asm --verbose
           nimble test_bindings --verbose
           nimble test_parallel_no_asm --verbose
+      - name: Run Constantine tests (Windows with Assembly)
+        # So "test_bindings" uses C and can find GMP
+        # but nim-gmp cannot find GMP on Windows CI
+        if: runner.os == 'Windows' && matrix.target.BACKEND == 'ASM'
+        shell: msys2 {0}
+        run: |
+          cd constantine
+          nimble bindings --verbose
+          nimble test_bindings --verbose
+          nimble test_parallel_no_gmp --verbose
       - name: Run Constantine tests (Windows no Assembly)
         # So "test_bindings" uses C and can find GMP
         # but nim-gmp cannot find GMP on Windows CI
@@ -220,6 +246,6 @@ jobs:
         shell: msys2 {0}
         run: |
           cd constantine
-          nimble bindings --verbose
+          nimble bindings_no_asm --verbose
           nimble test_bindings --verbose
           nimble test_parallel_no_gmp_no_asm --verbose
README.md: 247 lines changed
@@ -25,9 +25,11 @@ The implementations are accompanied with SAGE code used as reference implementat
 - [Table of Contents](#table-of-contents)
 - [Target audience](#target-audience)
 - [Protocols](#protocols)
-- [Curves supported in the backend](#curves-supported-in-the-backend)
 - [Installation](#installation)
-  - [Dependencies](#dependencies)
+  - [From C](#from-c)
+  - [From Nim](#from-nim)
+- [Dependencies & Requirements](#dependencies--requirements)
+- [Curves supported in the backend](#curves-supported-in-the-backend)
 - [Security](#security)
   - [Disclaimer](#disclaimer)
   - [Security disclosure](#security-disclosure)
@@ -36,6 +38,7 @@ The implementations are accompanied with SAGE code used as reference implementat
   - [In zero-knowledge proofs](#in-zero-knowledge-proofs)
 - [Measuring performance](#measuring-performance)
   - [BLS12_381 Clang + inline Assembly](#bls12_381-clang--inline-assembly)
+  - [Parallelism](#parallelism)
 - [Why Nim](#why-nim)
 - [Compiler caveats](#compiler-caveats)
 - [Inline assembly](#inline-assembly)
@@ -67,26 +70,110 @@ Protocols to address these goals, (authenticated) encryption, signature, traitor
 are designed.\
 Note: some goals might be mutually exclusive, for example "plausible deniability" and "non-repudiation".

-After [installation](#installation), the available high-level protocols are:
-
-- [x] Ethereum EVM precompiles on BN254_Snarks (also called alt_bn128 or bn256 in Ethereum)
-
-  `import constantine/ethereum_evm_precompiles`
-- [x] BLS signature on BLS12-381 G2 as used in Ethereum 2.
+## Installation
+
+### From C
+
+1. Install a C compiler, for example:
+   - Debian/Ubuntu: `sudo apt update && sudo apt install build-essential`
+   - Archlinux: `pacman -S base-devel`
+
+2. Install Nim. It is available in most distros' package managers for Linux and in Homebrew for macOS.
+   Windows binaries are on the official website: https://nim-lang.org/install_unix.html
+   - Debian/Ubuntu: `sudo apt install nim`
+   - Archlinux: `pacman -S nim`
+
+3. Compile the bindings.
+   - Recommended: \
+     `CC=clang nimble bindings`
+   - or `nimble bindings_no_asm` \
+     to compile without assembly (otherwise assembly support is autodetected)
+   - or with the default compiler: \
+     `nimble bindings`
+
+4. Ensure the bindings work:
+   - `nimble test_bindings`
+
+5. Bindings location:
+   - The bindings are put in `constantine/lib`
+   - The headers are in [constantine/include](./include), for example [Ethereum BLS signatures](./include/constantine_ethereum_bls_signatures.h)
+
+6. Read the examples in [examples_c](./examples_c):
+   - Using the [Ethereum BLS signatures bindings from C](./examples_c/ethereum_bls_signatures.c)
+   - Testing Constantine BLS12-381 vs GMP: [./examples_c/t_libctt_bls12_381.c](./examples_c/t_libctt_bls12_381.c)
+
+The bindings currently provided are:
+
+- Ethereum BLS signatures on BLS12-381 G2
   Cryptographic suite: `BLS_SIG_BLS12381G2_XMD:SHA-256_SSWU_RO_POP_`

   This scheme is also used in the following blockchains:
   Algorand, Chia, Dfinity, Filecoin, Tezos, Zcash.
   They may have their pubkeys on G1 and signatures on G2 like Ethereum or the other way around.

-> Parameter discussion:
->
-> As Ethereum validators' pubkeys are duplicated, stored and transmitted over and over in the protocol,
-> having them be as small as possible was important.
-> On another hand, BLS signatures were first popularized due to their succinctness.
-> And having signatures on G1 is useful when short signatures are desired, in embedded for example.
-- [x] SHA256 hash
-- ...
+- BLS12-381 arithmetic:
+  - field arithmetic
+    - on Fr (i.e. modulo the 255-bit curve order)
+    - on Fp (i.e. modulo the 381-bit prime modulus)
+    - on Fp2
+  - elliptic curve arithmetic:
+    - on elliptic curves over Fp (EC G1) with affine, jacobian and homogeneous projective coordinates
+    - on elliptic curves over Fp2 (EC G2) with affine, jacobian and homogeneous projective coordinates
+  - currently not exposed: \
+    scalar multiplication, multi-scalar multiplications, \
+    pairings and multi-pairings \
+    are implemented but not exposed
+  - _All operations are constant-time unless explicitly mentioned vartime_
+
+- The Pasta curves: Pallas and Vesta
+  - field arithmetic
+    - on Fr (i.e. modulo the 255-bit curve order)
+    - on Fp (i.e. modulo the 255-bit prime modulus)
+  - elliptic curve arithmetic:
+    - on elliptic curves over Fp (EC G1) with affine, jacobian and homogeneous projective coordinates
+  - currently not exposed: \
+    scalar multiplication, multi-scalar multiplications \
+    are implemented but not exposed
+  - _All operations are constant-time unless explicitly mentioned vartime_
+
+### From Nim
+
+You can install the development version of the library through nimble with the following command:
+
+```
+nimble install https://github.com/mratsim/constantine@#master
+```
+
+## Dependencies & Requirements
+
+For speed it is recommended to use Clang (see [Compiler caveats](#compiler-caveats)).
+In particular, GCC generates inefficient add-with-carry code.
+
+Constantine requires at least:
+- GCC 7 \
+  Previous versions generated incorrect add-with-carry code.
+- Clang 14 \
+  On x86-64, inline assembly is used to work around compilers having issues optimizing large integer arithmetic,
+  and also to ensure constant-time code. \
+  Constantine uses the Intel assembly syntax to address issues with the default AT&T syntax and constants propagated in Clang. \
+  Clang 14 added support for `-masm=intel`. \
+  \
+  On macOS, Apple Clang does not support Intel assembly syntax; use Homebrew Clang instead or compile without assembly.\
+  _Note that Apple is discontinuing Intel CPUs throughout their product line, so this will impact only older models and the Mac Pro._
+
+On Windows, Constantine is tested with MinGW. The Microsoft Visual C++ Compiler is not configured.
+
+Constantine has no dependencies, even on the Nim standard library, except:
+- for testing
+  - jsony for parsing JSON test vectors
+  - the Nim standard library for unittesting, formatting and datetime
+  - GMP for testing against GMP
+- for benchmarking
+  - the Nim standard library for timing and formatting
+- for the Nvidia GPU backend:
+  - the LLVM runtime ("dev" version with headers is not needed)
+  - the CUDA runtime ("dev" version with headers is not needed)
+- at compile-time
+  - we need the std/macros library to generate Nim code

 ## Curves supported in the backend

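To make the new "From C" walkthrough concrete, a consumer program might look like the sketch below. The header path is the one shipped in this commit; the call shapes are placeholders (kept as comments, since the exact exported names are not shown in this diff), so consult the generated header and [examples_c/ethereum_bls_signatures.c](./examples_c/ethereum_bls_signatures.c) for the real API:

```c
/* Hypothetical sketch: the function names below are placeholders.
 * Real declarations live in include/constantine_ethereum_bls_signatures.h
 * and are demonstrated in examples_c/ethereum_bls_signatures.c.
 *
 * Build against the static library produced by `nimble bindings`:
 *   cc -Iinclude -o demo demo.c lib/libethereum_bls_signatures.a
 */
#include <stdio.h>
#include "constantine_ethereum_bls_signatures.h"

int main(void) {
    /* Nim libraries need their runtime initialized once per process.
     * This commit names those entry points via --nimMainPrefix, e.g.
     * the ctt_eth_bls_init_* family; the exact symbol is in the header. */

    const char msg[] = "hello constantine";

    /* Placeholder call shapes -- sign a message, then verify it:
     *   ctt_eth_bls_sign(&sig, &seckey, msg, sizeof msg - 1);
     *   ctt_eth_bls_verify(&pubkey, msg, sizeof msg - 1, &sig);
     */

    printf("would sign and verify %zu bytes\n", sizeof msg - 1);
    return 0;
}
```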
@@ -108,42 +195,10 @@ The following curves are configured:
 - Jubjub, a curve embedded in BLS12-381 scalar field to be used in zk-SNARKS circuits.
 - Bandersnatch, a more efficient curve embedded in BLS12-381 scalar field to be used in zk-SNARKS circuits.
 - Other curves
-  - Edwards25519, used in ed25519 and X25519 from TLS 1.3 protocol and the Signal protocol.
-
+  - Edwards25519, used in ed25519 and X25519 from TLS 1.3 protocol and the Signal protocol. \
     With Ristretto, it can be used in bulletproofs.
   - The Pasta curves (Pallas and Vesta) for the Halo 2 proof system (Zcash).

-
-## Installation
-
-You can install the development version of the library through nimble with the following command
-```
-nimble install https://github.com/mratsim/constantine@#master
-```
-
-For speed it is recommended to prefer Clang, MSVC or ICC over GCC (see [Compiler-caveats](#Compiler-caveats)).
-
-Further if using GCC, GCC 7 at minimum is required, previous versions
-generated incorrect add-with-carry code.
-
-On x86-64, inline assembly is used to workaround compilers having issues optimizing large integer arithmetic,
-and also ensure constant-time code.
-
-## Dependencies
-
-Constantine has no dependencies, even on Nim standard library except:
-- for testing
-  - jsony for parsing json test vectors
-  - the Nim standard library for unittesting, formatting and datetime.
-  - GMP for testing against GMP
-- for benchmarking
-  - the Nim standard library for timing and formatting
-- for Nvidia GPU backend:
-  - the LLVM runtime ("dev" version with headers is not needed)
-  - the CUDA runtime ("dev" version with headers is not needed)
-- at compile-time
-  - we need the std/macros library to generate Nim code.
-
 ## Security

 Hardening an implementation against all existing and upcoming attack vectors is an extremely complex task.
@@ -217,47 +272,79 @@ To measure the performance of Constantine

 ```bash
 git clone https://github.com/mratsim/constantine
-nimble bench_fp             # Using default compiler + Assembly
-nimble bench_fp_clang       # Using Clang + Assembly (recommended)
-nimble bench_fp_gcc         # Using GCC + Assembly (decent)
-nimble bench_fp_clang_noasm # Using Clang only (acceptable)
-nimble bench_fp_gcc         # Using GCC only (slowest)
-nimble bench_fp2
-# ...
-nimble bench_ec_g1_clang
-nimble bench_ec_g2_clang
-nimble bench_pairing_bn254_nogami_clang
-nimble bench_pairing_bn254_snarks_clang
-nimble bench_pairing_bls12_377_clang
-nimble bench_pairing_bls12_381_clang
+
+# Default compiler
+nimble bench_fp
+
+# Arithmetic
+CC=clang nimble bench_fp  # Using Clang + Assembly (recommended)
+CC=clang nimble bench_fp2
+CC=clang nimble bench_fp12
+
+# Scalar multiplication and pairings
+CC=clang nimble bench_ec_g1_scalar_mul
+CC=clang nimble bench_ec_g2_scalar_mul
+CC=clang nimble bench_pairing_bls12_381

 # And per-curve summaries
-nimble bench_summary_bn254_nogami_clang
-nimble bench_summary_bn254_snarks_clang
-nimble bench_summary_bls12_377_clang
-nimble bench_summary_bls12_381_clang
+CC=clang nimble bench_summary_bn254_nogami
+CC=clang nimble bench_summary_bn254_snarks
+CC=clang nimble bench_summary_bls12_377
+CC=clang nimble bench_summary_bls12_381
+
+# The Ethereum BLS signature protocol
+CC=clang nimble bench_ethereum_bls_signatures
+
+# Multi-scalar multiplication
+CC=clang nimble bench_ec_g1_msm_bls12_381
+CC=clang nimble bench_ec_g1_msm_bn256_snarks
 ```

+The full list of benchmarks is available in the [`benchmarks`](./benchmarks) folder.
+
 As mentioned in the [Compiler caveats](#compiler-caveats) section, GCC is up to 2x slower than Clang due to mishandling of carries and register usage.

-On my machine i9-11980HK (8 cores 2.6GHz, turbo 5GHz), for Clang + Assembly, **all being constant-time** (including scalar multiplication, square root and inversion).
-
 #### BLS12_381 (Clang + inline Assembly)

-```
---------------------------------------------------------------------------------------------------------------------------------------------------------
-EC ScalarMul 255-bit G1    ECP_ShortW_Prj[Fp[BLS12_381]]    16086.740 ops/s    62163 ns/op    205288 CPU cycles (approx)
-EC ScalarMul 255-bit G1    ECP_ShortW_Jac[Fp[BLS12_381]]    16670.834 ops/s    59985 ns/op    198097 CPU cycles (approx)
-EC ScalarMul 255-bit G2    ECP_ShortW_Prj[Fp2[BLS12_381]]    8333.403 ops/s   119999 ns/op    396284 CPU cycles (approx)
-EC ScalarMul 255-bit G2    ECP_ShortW_Jac[Fp2[BLS12_381]]    9300.682 ops/s   107519 ns/op    355071 CPU cycles (approx)
---------------------------------------------------------------------------------------------------------------------------------------------------------
-Miller Loop BLS12          BLS12_381                         5102.223 ops/s   195993 ns/op    647251 CPU cycles (approx)
-Final Exponentiation BLS12 BLS12_381                         4209.109 ops/s   237580 ns/op    784588 CPU cycles (approx)
-Pairing BLS12              BLS12_381                         2343.045 ops/s   426795 ns/op   1409453 CPU cycles (approx)
---------------------------------------------------------------------------------------------------------------------------------------------------------
-Hash to G2 (Draft #11)     BLS12_381                         6558.495 ops/s   152474 ns/op    503531 CPU cycles (approx)
---------------------------------------------------------------------------------------------------------------------------------------------------------
-```
+On my machine i9-11980HK (8 cores 2.6GHz, turbo 5GHz), for Clang + Assembly, **all being constant-time** (including scalar multiplication, square root and inversion).
+
+[benchmark result images]
+
+On a i9-9980XE (18 cores, watercooled, overclocked, 4.1GHz all-core turbo)
+
+[benchmark result images]
+
+#### Parallelism
+
+Constantine's multithreaded primitives are powered by a highly tuned threadpool and stress-tested for:
+- scheduler overhead
+- load balancing with extreme imbalance
+- nested data parallelism
+- contention
+- speculative/conditional parallelism
+
+and provides the following paradigms:
+- Future-based task parallelism
+- Data parallelism (nestable and awaitable for-loops)
+  - including arbitrary parallel reductions
+- Dataflow parallelism / stream parallelism / graph parallelism / pipeline parallelism
+- Structured parallelism
+
+The threadpool parallel-for loops use lazy loop splitting and are fully adaptive to the workload being scheduled, the in-flight load of the threads, and the hardware speed, unlike most (all?) runtimes; see:
+- OpenMP woes depending on hardware and workload: https://github.com/zy97140/omp-benchmark-for-pytorch
+- Raytracing ideal runtime, adapting to per-pixel compute load: \
+  Most (all?) production runtimes use scheduling A (split on number of threads, like GCC OpenMP) or B (eager splitting, unable to adapt to actual work, like LLVM/Intel OpenMP or Intel TBB) while Constantine uses C.
+
+The threadpool provides an efficient backoff strategy to conserve power, based on:
+- eventcounts / futexes, for low-overhead backoff
+- log-log iterated backoff, a provably optimal backoff strategy used in wireless communication to minimize communication in parallel for-loops
+
+The research papers on high-performance multithreading are available in the Weave repo: https://github.com/mratsim/weave/tree/7682784/research.\
+_Note: The threadpool is not backed by Weave but by an inspired runtime that has been significantly simplified for ease of auditing. In particular it uses shared-memory-based work-stealing instead of channel-based work-requesting for load balancing, as distributed computing is not a target, ..., yet._

 ## Why Nim

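The scheduling A/B/C distinction in the new Parallelism section can be reproduced with any OpenMP compiler. A small C illustration (plain OpenMP, not Constantine's threadpool) with wildly imbalanced per-iteration work, the case where static or eagerly chunked splitting loses to adaptive splitting:

```c
/* Illustration of the scheduling trade-off discussed above.
 * Compile: cc -O2 -fopenmp sched.c
 * Iteration cost grows quadratically, so splitting the range evenly
 * across threads ("scheduling A") leaves most threads idle while one
 * finishes the heavy tail; dynamic scheduling ("B") adapts, at some
 * synchronization cost per chunk.
 */
#include <stdio.h>

static double work(long i) {
    double acc = 0;
    for (long j = 0; j < i * i; j++) acc += 1.0 / (double)(j + 1);
    return acc;
}

int main(void) {
    double total = 0;
    /* schedule(static): one even split, cheap but imbalance-blind.
     * schedule(dynamic, 16): eager fixed-size chunks, adaptive but
     * pays synchronization per chunk. Constantine's lazy splitting
     * ("C") only splits a range when another thread is actually idle. */
    #pragma omp parallel for schedule(dynamic, 16) reduction(+ : total)
    for (long i = 0; i < 2000; i++)
        total += work(i);
    printf("%f\n", total);
    return 0;
}
```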
@@ -60,7 +60,7 @@ echo " release: ", defined(release)
 echo "  danger: ", defined(danger)
 echo "  inline assembly: ", UseASM_X86_64

-when (sizeof(int) == 4) or defined(Constantine32):
+when (sizeof(int) == 4) or defined(Ctt32):
   echo "⚠️ Warning: using Constantine with 32-bit limbs"
 else:
   echo "Using Constantine with 64-bit limbs"
@@ -61,7 +61,7 @@ echo " release: ", defined(release)
 echo "  danger: ", defined(danger)
 echo "  inline assembly: ", UseASM_X86_64

-when (sizeof(int) == 4) or defined(Constantine32):
+when (sizeof(int) == 4) or defined(Ctt32):
   echo "⚠️ Warning: using Constantine with 32-bit limbs"
 else:
   echo "Using Constantine with 64-bit limbs"
@@ -33,7 +33,7 @@ else:
 proc SHA256[T: byte|char](
        msg: openarray[T],
        digest: ptr array[32, byte] = nil
-     ): ptr array[32, byte] {.cdecl, dynlib: DLLSSLName, importc.}
+     ): ptr array[32, byte] {.noconv, dynlib: DLLSSLName, importc.}

 proc SHA256_OpenSSL[T: byte|char](
     digest: var array[32, byte],
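For reference, the symbol this Nim import binds to is OpenSSL's (deprecated but stable) one-shot hash, shown below. It uses the platform ABI's default C calling convention, which is exactly what Nim's `noconv` pragma declares; `cdecl` forces a specific convention, which is unnecessary here and can differ from the default on some targets:

```c
/* The OpenSSL function the benchmark imports (from <openssl/sha.h>):
 *   unsigned char *SHA256(const unsigned char *d, size_t n,
 *                         unsigned char *md);
 * Compile: cc sha_demo.c -lcrypto
 */
#include <openssl/sha.h>
#include <stdio.h>

int main(void) {
    unsigned char digest[SHA256_DIGEST_LENGTH];  /* 32 bytes */
    const unsigned char msg[] = "abc";
    SHA256(msg, 3, digest);                      /* one-shot hash of "abc" */
    for (int i = 0; i < SHA256_DIGEST_LENGTH; i++)
        printf("%02x", digest[i]);
    printf("\n");
    return 0;
}
```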
@@ -19,9 +19,9 @@ export curves, curves_primitives

 template genBindingsField*(Field: untyped) =
   when appType == "lib":
-    {.push cdecl, dynlib, exportc, raises: [].} # No exceptions allowed
+    {.push noconv, dynlib, exportc, raises: [].} # No exceptions allowed
   else:
-    {.push cdecl, exportc, raises: [].} # No exceptions allowed
+    {.push noconv, exportc, raises: [].} # No exceptions allowed

   func `ctt _ Field _ unmarshalBE`(dst: var Field, src: openarray[byte]) =
     ## Deserialize
@@ -122,9 +122,9 @@ template genBindingsField*(Field: untyped) =

 template genBindingsFieldSqrt*(Field: untyped) =
   when appType == "lib":
-    {.push cdecl, dynlib, exportc, raises: [].} # No exceptions allowed
+    {.push noconv, dynlib, exportc, raises: [].} # No exceptions allowed
   else:
-    {.push cdecl, exportc, raises: [].} # No exceptions allowed
+    {.push noconv, exportc, raises: [].} # No exceptions allowed

   func `ctt _ Field _ is_square`(a: Field): SecretBool =
     a.isSquare()
@@ -155,9 +155,9 @@ template genBindingsFieldSqrt*(Field: untyped) =

 template genBindingsExtField*(Field: untyped) =
   when appType == "lib":
-    {.push cdecl, dynlib, exportc, raises: [].} # No exceptions allowed
+    {.push noconv, dynlib, exportc, raises: [].} # No exceptions allowed
   else:
-    {.push cdecl, exportc, raises: [].} # No exceptions allowed
+    {.push noconv, exportc, raises: [].} # No exceptions allowed

   # --------------------------------------------------------------------------------------
   func `ctt _ Field _ is_eq`(a, b: Field): SecretBool =
@@ -258,9 +258,9 @@ template genBindingsExtField*(Field: untyped) =

 template genBindingsExtFieldSqrt*(Field: untyped) =
   when appType == "lib":
-    {.push cdecl, dynlib, exportc, raises: [].} # No exceptions allowed
+    {.push noconv, dynlib, exportc, raises: [].} # No exceptions allowed
   else:
-    {.push cdecl, exportc, raises: [].} # No exceptions allowed
+    {.push noconv, exportc, raises: [].} # No exceptions allowed

   func `ctt _ Field _ is_square`(a: Field): SecretBool =
     a.isSquare()
@@ -275,9 +275,9 @@ template genBindingsExtFieldSqrt*(Field: untyped) =

 template genBindings_EC_ShortW_Affine*(ECP, Field: untyped) =
   when appType == "lib":
-    {.push cdecl, dynlib, exportc, raises: [].} # No exceptions allowed
+    {.push noconv, dynlib, exportc, raises: [].} # No exceptions allowed
   else:
-    {.push cdecl, exportc, raises: [].} # No exceptions allowed
+    {.push noconv, exportc, raises: [].} # No exceptions allowed

   # --------------------------------------------------------------------------------------
   func `ctt _ ECP _ is_eq`(P, Q: ECP): SecretBool =
@@ -305,9 +305,9 @@ template genBindings_EC_ShortW_Affine*(ECP, Field: untyped) =

 template genBindings_EC_ShortW_NonAffine*(ECP, ECP_Aff, Field: untyped) =
   when appType == "lib":
-    {.push cdecl, dynlib, exportc, raises: [].} # No exceptions allowed
+    {.push noconv, dynlib, exportc, raises: [].} # No exceptions allowed
   else:
-    {.push cdecl, exportc, raises: [].} # No exceptions allowed
+    {.push noconv, exportc, raises: [].} # No exceptions allowed

   # --------------------------------------------------------------------------------------
   func `ctt _ ECP _ is_eq`(P, Q: ECP): SecretBool =
@@ -17,17 +17,17 @@ import std/strformat
 # Library compilation
 # ----------------------------------------------------------------

-proc releaseBuildOptions: string =
+proc releaseBuildOptions(useASM, useLTO = true): string =
   # -d:danger --opt:size
   #   to avoid boundsCheck and overflowChecks that would trigger exceptions or allocations in a crypto library.
   #   Those are internally guaranteed at compile-time by fixed-sized array
   #   and checked at runtime with an appropriate error code if any for user-input.
   #
-  #   Furthermore we optimize for size, the performance critical procedures
+  #   Furthermore we may optimize for size, the performance critical procedures
   #   either use assembly or are unrolled manually with staticFor,
   #   Optimizations at -O3 deal with loops and branching
-  #   which we mostly don't have. It's better to optimize
-  #   for instructions cache.
+  #   which we mostly don't have.
+  #   Hence optimizing for instructions cache may pay off.
   #
   # --panics:on -d:noSignalHandler
   #   Even with `raises: []`, Nim still has an exception path
@@ -50,11 +50,23 @@ proc releaseBuildOptions: string =
   #   Reduce instructions cache misses.
   #   https://lkml.org/lkml/2015/5/21/443
   #   Our non-inlined functions are large so size cost is minimal.
-  " -d:danger --opt:size " &
+  let compiler = if existsEnv"CC": " --cc:" & getEnv"CC"
+                 else: ""
+
+  let noASM = if not useASM: " -d:CttASM=false "
+              else: ""
+
+  let lto = if useLTO: " --passC:-flto=auto --passL:-flto=auto "
+            else: ""
+
+  compiler &
+  noASM &
+  lto &
+  " -d:danger " &
+  # " --opt:size " &
   " --panics:on -d:noSignalHandler " &
   " --mm:arc -d:useMalloc " &
   " --verbosity:0 --hints:off --warnings:off " &
-  # " --passC:-flto --passL:-flto " &
   " --passC:-fno-semantic-interposition " &
   " --passC:-falign-functions=64 "

@@ -62,13 +74,14 @@ type BindingsKind = enum
   kCurve
   kProtocol

-proc genDynamicBindings(bindingsKind: BindingsKind, bindingsName, prefixNimMain: string) =
+proc genDynamicBindings(bindingsKind: BindingsKind, bindingsName, prefixNimMain: string, useASM = true) =
   proc compile(libName: string, flags = "") =
     echo "Compiling dynamic library: lib/" & libName

     exec "nim c " &
-         " --noMain --app:lib " &
          flags &
-         releaseBuildOptions() &
+         releaseBuildOptions(useASM, useLTO = true) &
+         " --noMain --app:lib " &
          &" --nimMainPrefix:{prefixNimMain} " &
          &" --out:{libName} --outdir:lib " &
          (block:
@@ -98,24 +111,24 @@ proc genDynamicBindings(bindingsKind: BindingsKind, bindingsName, prefixNimMain:
   else:
     compile "lib" & bindingsName & ".so"

-proc genStaticBindings(bindingsKind: BindingsKind, bindingsName, prefixNimMain: string) =
+proc genStaticBindings(bindingsKind: BindingsKind, bindingsName, prefixNimMain: string, useASM = true) =
   proc compile(libName: string, flags = "") =
     echo "Compiling static library: lib/" & libName

     exec "nim c " &
-         " --noMain --app:staticLib " &
          flags &
-         releaseBuildOptions() &
-         " --nimMainPrefix:" & prefixNimMain &
-         " --out:" & libName & " --outdir:lib " &
+         releaseBuildOptions(useASM, useLTO = false) &
+         " --noMain --app:staticLib " &
+         &" --nimMainPrefix:{prefixNimMain} " &
+         &" --out:{libName} --outdir:lib " &
          (block:
            case bindingsKind
            of kCurve:
-             " --nimcache:nimcache/bindings_curves/" & bindingsName &
-             " bindings_generators/" & bindingsName & ".nim"
+             &" --nimcache:nimcache/bindings_curves/{bindingsName}" &
+             &" bindings_generators/{bindingsName}.nim"
            of kProtocol:
-             " --nimcache:nimcache/bindings_protocols/" & bindingsName &
-             " constantine/" & bindingsName & ".nim"
-         )
+             &" --nimcache:nimcache/bindings_protocols/{bindingsName}" &
+             &" constantine/{bindingsName}.nim")

   let bindingsName = block:
     case bindingsKind
@@ -138,13 +151,13 @@ proc genStaticBindings(bindingsKind: BindingsKind, bindingsName, prefixNimMain:
 proc genHeaders(bindingsName: string) =
   echo "Generating header: include/" & bindingsName & ".h"
   exec "nim c -d:CttGenerateHeaders " &
-       releaseBuildOptions() &
+       " -d:release " &
        " --out:" & bindingsName & "_gen_header.exe --outdir:build " &
        " --nimcache:nimcache/bindings_curves_headers/" & bindingsName & "_header" &
        " bindings_generators/" & bindingsName & ".nim"
   exec "build/" & bindingsName & "_gen_header.exe include"

-task bindings, "Generate Constantine bindings":
+task bindings, "Generate Constantine bindings (no assembly)":
   # Curve arithmetic
   genStaticBindings(kCurve, "constantine_bls12_381", "ctt_bls12381_init_")
   genDynamicBindings(kCurve, "constantine_bls12_381", "ctt_bls12381_init_")
@@ -158,6 +171,23 @@ task bindings, "Generate Constantine bindings":
   # Protocols
   genStaticBindings(kProtocol, "ethereum_bls_signatures", "ctt_eth_bls_init_")
   genDynamicBindings(kProtocol, "ethereum_bls_signatures", "ctt_eth_bls_init_")
+  echo ""
+
+task bindings_no_asm, "Generate Constantine bindings (no assembly)":
+  # Curve arithmetic
+  genStaticBindings(kCurve, "constantine_bls12_381", "ctt_bls12381_init_", useASM = false)
+  genDynamicBindings(kCurve, "constantine_bls12_381", "ctt_bls12381_init_", useASM = false)
+  genHeaders("constantine_bls12_381")
+  echo ""
+  genStaticBindings(kCurve, "constantine_pasta", "ctt_pasta_init_", useASM = false)
+  genDynamicBindings(kCurve, "constantine_pasta", "ctt_pasta_init_", useASM = false)
+  genHeaders("constantine_pasta")
+  echo ""
+
+  # Protocols
+  genStaticBindings(kProtocol, "ethereum_bls_signatures", "ctt_eth_bls_init_", useASM = false)
+  genDynamicBindings(kProtocol, "ethereum_bls_signatures", "ctt_eth_bls_init_", useASM = false)
+  echo ""

 proc testLib(path, testName, libName: string, useGMP: bool) =
   let dynlibName = if defined(windows): libName & ".dll"
@@ -166,21 +196,25 @@ proc testLib(path, testName, libName: string, useGMP: bool) =
   let staticlibName = if defined(windows): libName & ".lib"
                       else: "lib" & libName & ".a"

+  let cc = if existsEnv"CC": getEnv"CC"
+           else: "gcc"
+
   echo &"\n[Bindings: {path}/{testName}.c] Testing dynamically linked library {dynlibName}"
-  exec &"gcc -Iinclude -Llib -o build/testbindings/{testName}_dynlink.exe {path}/{testName}.c -l{libName} " & (if useGMP: "-lgmp" else: "")
+  exec &"{cc} -Iinclude -Llib -o build/testbindings/{testName}_dynlink.exe {path}/{testName}.c -l{libName} " & (if useGMP: "-lgmp" else: "")
   when defined(windows):
     # Put DLL near the exe as LD_LIBRARY_PATH doesn't work even in a POSIX compatible shell
     exec &"./build/testbindings/{testName}_dynlink.exe"
   else:
     exec &"LD_LIBRARY_PATH=lib ./build/testbindings/{testName}_dynlink.exe"
+  echo ""

   echo &"\n[Bindings: {path}/{testName}.c] Testing statically linked library: {staticlibName}"
   # Beware MacOS annoying linker with regards to static libraries
   # The following standard way cannot be used on MacOS
   # exec "gcc -Iinclude -Llib -o build/t_libctt_bls12_381_sl.exe examples_c/t_libctt_bls12_381.c -lgmp -Wl,-Bstatic -lconstantine_bls12_381 -Wl,-Bdynamic"
-  exec &"gcc -Iinclude -o build/testbindings/{testName}_staticlink.exe {path}/{testName}.c lib/{staticlibName} " & (if useGMP: "-lgmp" else: "")
+  exec &"{cc} -Iinclude -o build/testbindings/{testName}_staticlink.exe {path}/{testName}.c lib/{staticlibName} " & (if useGMP: "-lgmp" else: "")
   exec &"./build/testbindings/{testName}_staticlink.exe"
+  echo ""

 task test_bindings, "Test C bindings":
   exec "mkdir -p build/testbindings"
@@ -485,9 +519,22 @@ const skipSanitizers = [

 when defined(windows):
   # UBSAN is not available on mingw
+  # https://github.com/libressl-portable/portable/issues/54
   const sanitizers = ""
 else:
   const sanitizers =
+
+    " --passC:-fstack-protector-strong " &
+
+    # Fortify source wouldn't help us detect errors in constantine
+    # because everything is stack allocated
+    # except with the threadpool:
+    # - https://developers.redhat.com/blog/2021/04/16/broadening-compiler-checks-for-buffer-overflows-in-_fortify_source#what_s_next_for__fortify_source
+    # - https://developers.redhat.com/articles/2023/02/06/how-improve-application-security-using-fortifysource3#how_to_improve_application_fortification
+    # We also don't use memcpy as it is not constant-time and our copy is compile-time sized.
+    " --passC:-D_FORTIFY_SOURCE=3 " &
+
     # Sanitizers are incompatible with nim default GC
     # The conservative stack scanning of Nim default GC triggers, alignment UB and stack-buffer-overflow check.
     # Address sanitizer requires free registers and needs to be disabled for some inline assembly files.
@@ -497,8 +544,8 @@ else:

     # " --passC:-fsanitize=undefined --passL:-fsanitize=undefined" &
     # " --passC:-fsanitize=address --passL:-fsanitize=address" &
-    " --passC:-fno-sanitize-recover" # Enforce crash on undefined behaviour
+    # " --passC:-fno-sanitize-recover" # Enforce crash on undefined behaviour
+    ""

 # Tests & Benchmarks helper functions
 # ----------------------------------------------------------------
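As a quick illustration of what the `_FORTIFY_SOURCE=3` and stack-protector flags enabled above buy, here is a generic glibc example (unrelated to Constantine's own code, which avoids memcpy anyway):

```c
/* Generic illustration of the hardening flags enabled above.
 * Compile: cc -O2 -D_FORTIFY_SOURCE=3 -fstack-protector-strong demo.c
 * With fortification, glibc routes memcpy through __memcpy_chk, which
 * aborts at runtime because dst is known to be only 8 bytes.
 */
#include <string.h>

int main(void) {
    char dst[8];
    memcpy(dst, "0123456789ABCDEF", 16); /* aborts: buffer overflow detected */
    return dst[0];
}
```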
@@ -508,25 +555,17 @@ proc clearParallelBuild() =
   if fileExists(buildParallel):
     rmFile(buildParallel)

-template setupTestCommand(): untyped {.dirty.} =
+proc setupTestCommand(flags, path: string, useASM: bool): string =
   var lang = "c"
   if existsEnv"TEST_LANG":
     lang = getEnv"TEST_LANG"

-  var cc = ""
-  if existsEnv"CC":
-    cc = " --cc:" & getEnv"CC"
-
-  var flags = flags
-  when not defined(windows):
-    # Not available in MinGW https://github.com/libressl-portable/portable/issues/54
-    flags &= " --passC:-fstack-protector-strong --passC:-D_FORTIFY_SOURCE=2 "
-
-  let command = "nim " & lang & cc &
+  return "nim " & lang &
          " -r " &
          flags &
-         releaseBuildOptions() &
+         releaseBuildOptions(useASM) &
          " --outdir:build/testsuite " &
-         " --nimcache:nimcache/" & path & " " &
+         &" --nimcache:nimcache/{path} " &
          path

 proc test(cmd: string) =
@@ -535,73 +574,72 @@ proc test(cmd: string) =
   echo "=============================================================================================="
   exec cmd

-proc testBatch(commands: var string, flags, path: string) =
-  setupTestCommand()
-  commands &= command & '\n'
-
-template setupBench(): untyped {.dirty.} =
-  let runFlag = if run: " -r "
-                else: " "
-
-  var lang = " c "
-  if existsEnv"TEST_LANG":
-    lang = getEnv"TEST_LANG"
-
-  var cc = ""
-  if compiler != "":
-    cc = "--cc:" & compiler
-  elif existsEnv"CC":
-    cc = " --cc:" & getEnv"CC"
-
-  if not useAsm:
-    cc &= " -d:CttASM=false"
-  let command = "nim " & lang & cc &
-         releaseBuildOptions() &
-         " -o:build/bench/" & benchName & "_" & compiler & "_" & (if useAsm: "useASM" else: "noASM") &
-         " --nimcache:nimcache/benches/" & benchName & "_" & compiler & "_" & (if useAsm: "useASM" else: "noASM") &
-         runFlag & " benchmarks/" & benchName & ".nim"
-
-proc runBench(benchName: string, compiler = "", useAsm = true) =
+proc testBatch(commands: var string, flags, path: string, useASM = true) =
+  # With LTO, the linker produces lots of spurious warnings when copying into openArrays/strings
+  let flags = if defined(gcc): flags & " --passC:-Wno-stringop-overflow --passL:-Wno-stringop-overflow "
+              else: flags
+
+  commands = commands & setupTestCommand(flags, path, useASM) & '\n'
+
+proc setupBench(benchName: string, run: bool, useAsm: bool): string =
+  var runFlags = " "
+  if run: # Beware of https://github.com/nim-lang/Nim/issues/21704
+    runFlags = runFlags & " -r "
+
+  let asmStatus = if useASM: "useASM"
+                  else: "noASM"
+
+  if defined(gcc):
+    # With LTO, the linker produces lots of spurious warnings when copying into openArrays/strings
+    runFlags = runFlags & " --passC:-Wno-stringop-overflow --passL:-Wno-stringop-overflow "
+
+  let cc = if existsEnv"CC": getEnv"CC"
+           else: "defaultcompiler"
+
+  return "nim c " &
+         runFlags &
+         releaseBuildOptions(useASM) &
+         &" -o:build/bench/{benchName}_{cc}_{asmStatus}" &
+         &" --nimcache:nimcache/benches/{benchName}_{cc}_{asmStatus}" &
+         &" benchmarks/{benchName}.nim"
+
+proc runBench(benchName: string, useAsm = true) =
   if not dirExists "build":
     mkDir "build"
-  let run = true
-  setupBench()
+  let command = setupBench(benchName, run = true, useAsm)
   exec command

-proc buildBenchBatch(commands: var string, benchName: string, compiler = "", useAsm = true) =
-  let run = false
-  let compiler = ""
-  setupBench()
-  commands &= command & '\n'
+proc buildBenchBatch(commands: var string, benchName: string, useAsm = true) =
+  let command = setupBench(benchName, run = false, useAsm)
+  commands = commands & command & '\n'

-proc addTestSet(cmdFile: var string, requireGMP: bool, test32bit = false, testASM = true) =
+proc addTestSet(cmdFile: var string, requireGMP: bool, test32bit = false, useASM = true) =
   if not dirExists "build":
     mkDir "build"
   echo "Found " & $testDesc.len & " tests to run."

   for td in testDesc:
     if not(td.useGMP and not requireGMP):
-      var flags = ""
-      if not testASM:
-        flags &= " -d:CttASM=false "
+      var flags = "" # Beware of https://github.com/nim-lang/Nim/issues/21704
       if test32bit:
-        flags &= " -d:Constantine32 "
+        flags = flags & " -d:Ctt32 "
       if td.path in useDebug:
-        flags &= " -d:debugConstantine "
+        flags = flags & " -d:CttDebug "
       if td.path notin skipSanitizers:
-        flags &= sanitizers
+        flags = flags & sanitizers

-      cmdFile.testBatch(flags, td.path)
+      cmdFile.testBatch(flags, td.path, useASM)

 proc addTestSetNvidia(cmdFile: var string) =
   if not dirExists "build":
     mkDir "build"
   echo "Found " & $testDescNvidia.len & " tests to run."

-  for path in testDescThreadpool:
-    var flags = ""
+  for path in testDescNvidia:
+    var flags = "" # Beware of https://github.com/nim-lang/Nim/issues/21704
     if path notin skipSanitizers:
-      flags &= sanitizers
+      flags = flags & sanitizers
     cmdFile.testBatch(flags, path)

 proc addTestSetThreadpool(cmdFile: var string) =
@@ -612,26 +650,24 @@ proc addTestSetThreadpool(cmdFile: var string) =
   for path in testDescThreadpool:
     var flags = " --threads:on --debugger:native "
     if path notin skipSanitizers:
-      flags &= sanitizers
+      flags = flags & sanitizers
     cmdFile.testBatch(flags, path)

-proc addTestSetMultithreadedCrypto(cmdFile: var string, test32bit = false, testASM = true) =
+proc addTestSetMultithreadedCrypto(cmdFile: var string, test32bit = false, useASM = true) =
   if not dirExists "build":
     mkDir "build"
   echo "Found " & $testDescMultithreadedCrypto.len & " tests to run."

   for td in testDescMultithreadedCrypto:
     var flags = " --threads:on --debugger:native"
-    if not testASM:
-      flags &= " -d:CttASM=false"
     if test32bit:
-      flags &= " -d:Constantine32"
+      flags = flags & " -d:Ctt32 "
     if td in useDebug:
-      flags &= " -d:debugConstantine"
+      flags = flags & " -d:CttDebug "
     if td notin skipSanitizers:
-      flags &= sanitizers
+      flags = flags & sanitizers

-    cmdFile.testBatch(flags, td)
+    cmdFile.testBatch(flags, td, useASM)

 proc addBenchSet(cmdFile: var string, useAsm = true) =
   if not dirExists "build":
@@ -649,7 +685,7 @@ proc genParallelCmdRunner() =
 task test, "Run all tests":
   # -d:testingCurves is configured in a *.nim.cfg for convenience
   var cmdFile: string
-  cmdFile.addTestSet(requireGMP = true, testASM = true)
+  cmdFile.addTestSet(requireGMP = true, useASM = true)
   cmdFile.addBenchSet(useASM = true) # Build (but don't run) benches to ensure they stay relevant
   cmdFile.addTestSetThreadpool()
   cmdFile.addTestSetMultithreadedCrypto()
@@ -660,10 +696,10 @@ task test, "Run all tests":
 task test_no_asm, "Run all tests (no assembly)":
   # -d:testingCurves is configured in a *.nim.cfg for convenience
   var cmdFile: string
-  cmdFile.addTestSet(requireGMP = true, testASM = false)
+  cmdFile.addTestSet(requireGMP = true, useASM = false)
   cmdFile.addBenchSet(useASM = false) # Build (but don't run) benches to ensure they stay relevant
   cmdFile.addTestSetThreadpool()
-  cmdFile.addTestSetMultithreadedCrypto(testASM = false)
+  cmdFile.addTestSetMultithreadedCrypto(useASM = false)
   for cmd in cmdFile.splitLines():
     if cmd != "": # Windows doesn't like empty commands
       exec cmd
@@ -671,7 +707,7 @@ task test_no_asm, "Run all tests (no assembly)":
 task test_no_gmp, "Run tests that don't require GMP":
   # -d:testingCurves is configured in a *.nim.cfg for convenience
   var cmdFile: string
-  cmdFile.addTestSet(requireGMP = false, testASM = true)
+  cmdFile.addTestSet(requireGMP = false, useASM = true)
   cmdFile.addBenchSet(useASM = true) # Build (but don't run) benches to ensure they stay relevant
   cmdFile.addTestSetThreadpool()
   cmdFile.addTestSetMultithreadedCrypto()
@@ -682,10 +718,10 @@ task test_no_gmp, "Run tests that don't require GMP":
 task test_no_gmp_no_asm, "Run tests that don't require GMP using a pure Nim backend":
   # -d:testingCurves is configured in a *.nim.cfg for convenience
   var cmdFile: string
-  cmdFile.addTestSet(requireGMP = false, testASM = false)
+  cmdFile.addTestSet(requireGMP = false, useASM = false)
   cmdFile.addBenchSet(useASM = false) # Build (but don't run) benches to ensure they stay relevant
   cmdFile.addTestSetThreadpool()
-  cmdFile.addTestSetMultithreadedCrypto(testASM = false)
+  cmdFile.addTestSetMultithreadedCrypto(useASM = false)
   for cmd in cmdFile.splitLines():
     if cmd != "": # Windows doesn't like empty commands
       exec cmd
@@ -696,7 +732,7 @@ task test_parallel, "Run all tests in parallel":
   genParallelCmdRunner()

   var cmdFile: string
-  cmdFile.addTestSet(requireGMP = true, testASM = true)
+  cmdFile.addTestSet(requireGMP = true, useASM = true)
   cmdFile.addBenchSet(useASM = true) # Build (but don't run) benches to ensure they stay relevant
   writeFile(buildParallel, cmdFile)
   exec "build/pararun " & buildParallel
@@ -715,7 +751,7 @@ task test_parallel_no_asm, "Run all tests (without macro assembler) in parallel"
   genParallelCmdRunner()

   var cmdFile: string
-  cmdFile.addTestSet(requireGMP = true, testASM = false)
+  cmdFile.addTestSet(requireGMP = true, useASM = false)
   cmdFile.addBenchSet(useASM = false)
   writeFile(buildParallel, cmdFile)
   exec "build/pararun " & buildParallel
@@ -723,7 +759,7 @@ task test_parallel_no_asm, "Run all tests (without macro assembler) in parallel"
   # Threadpool tests done serially
   cmdFile = ""
   cmdFile.addTestSetThreadpool()
-  cmdFile.addTestSetMultithreadedCrypto(testASM = false)
+  cmdFile.addTestSetMultithreadedCrypto(useASM = false)
   for cmd in cmdFile.splitLines():
     if cmd != "": # Windows doesn't like empty commands
       exec cmd
@ -734,7 +770,7 @@ task test_parallel_no_gmp, "Run all tests in parallel":
|
|||||||
genParallelCmdRunner()
|
genParallelCmdRunner()
|
||||||
|
|
||||||
var cmdFile: string
|
var cmdFile: string
|
||||||
cmdFile.addTestSet(requireGMP = false, testASM = true)
|
cmdFile.addTestSet(requireGMP = false, useASM = true)
|
||||||
cmdFile.addBenchSet(useASM = true) # Build (but don't run) benches to ensure they stay relevant
|
cmdFile.addBenchSet(useASM = true) # Build (but don't run) benches to ensure they stay relevant
|
||||||
writeFile(buildParallel, cmdFile)
|
writeFile(buildParallel, cmdFile)
|
||||||
exec "build/pararun " & buildParallel
|
exec "build/pararun " & buildParallel
|
||||||
@ -753,7 +789,7 @@ task test_parallel_no_gmp_no_asm, "Run all tests in parallel":
|
|||||||
genParallelCmdRunner()
|
genParallelCmdRunner()
|
||||||
|
|
||||||
var cmdFile: string
|
var cmdFile: string
|
||||||
cmdFile.addTestSet(requireGMP = false, testASM = false)
|
cmdFile.addTestSet(requireGMP = false, useASM = false)
|
||||||
cmdFile.addBenchSet(useASM = false) # Build (but don't run) benches to ensure they stay relevant
|
cmdFile.addBenchSet(useASM = false) # Build (but don't run) benches to ensure they stay relevant
|
||||||
writeFile(buildParallel, cmdFile)
|
writeFile(buildParallel, cmdFile)
|
||||||
exec "build/pararun " & buildParallel
|
exec "build/pararun " & buildParallel
|
||||||
@ -761,7 +797,7 @@ task test_parallel_no_gmp_no_asm, "Run all tests in parallel":
|
|||||||
# Threadpool tests done serially
|
# Threadpool tests done serially
|
||||||
cmdFile = ""
|
cmdFile = ""
|
||||||
cmdFile.addTestSetThreadpool()
|
cmdFile.addTestSetThreadpool()
|
||||||
cmdFile.addTestSetMultithreadedCrypto(testASM = false)
|
cmdFile.addTestSetMultithreadedCrypto(useASM = false)
|
||||||
for cmd in cmdFile.splitLines():
|
for cmd in cmdFile.splitLines():
|
||||||
if cmd != "": # Windows doesn't like empty commands
|
if cmd != "": # Windows doesn't like empty commands
|
||||||
exec cmd
|
exec cmd
|
||||||
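All of these tasks share one pattern: append compile-and-run commands to a string, then execute them one by one or hand the file to the build/pararun helper. A minimal NimScript sketch of that pattern, and of how the renamed useASM switch could be threaded into a compile-time define; the test path, define name and GMP flag below are illustrative assumptions, not Constantine's actual values:

    # Sketch of the command-file pattern (all names hypothetical).
    proc addTestSet(cmdFile: var string, requireGMP: bool, useASM = true) =
      var flags = ""
      if not useASM:
        flags.add " -d:CttASM=false"   # hypothetical define disabling the macro assembler
      if requireGMP:
        flags.add " --passL:-lgmp"     # hypothetical: tests cross-check against GMP
      cmdFile.add "nim c -r" & flags & " tests/t_example.nim\n"  # hypothetical test file

The serial tasks then exec each resulting line, while the parallel ones write the file to disk and let build/pararun fan the commands out.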
@@ -790,389 +826,199 @@ task test_nvidia, "Run all tests for Nvidia GPUs":

 # Finite field 𝔽p
 # ------------------------------------------

-task bench_fp, "Run benchmark 𝔽p with your default compiler":
+task bench_fp, "Run benchmark 𝔽p with your CC compiler":
   runBench("bench_fp")

-task bench_fp_gcc, "Run benchmark 𝔽p with gcc":
-  runBench("bench_fp", "gcc")
-
-task bench_fp_clang, "Run benchmark 𝔽p with clang":
-  runBench("bench_fp", "clang")
-
-task bench_fp_gcc_noasm, "Run benchmark 𝔽p with gcc - no Assembly":
-  runBench("bench_fp", "gcc", useAsm = false)
-
-task bench_fp_clang_noasm, "Run benchmark 𝔽p with clang - no Assembly":
-  runBench("bench_fp", "clang", useAsm = false)
+task bench_fp_noasm, "Run benchmark 𝔽p with your CC compiler - no Assembly":
+  runBench("bench_fp", useAsm = false)
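The per-compiler variants above disappear because runBench no longer takes a compiler argument: benchmarks are now built with whatever C compiler Nim's cc setting points at, and only the assembler backend remains toggleable. A plausible shape for the consolidated helper, as a sketch; the flags and define name are assumptions:

    # Hypothetical consolidated bench runner (NimScript).
    proc runBench(benchName: string, useAsm = true) =
      var flags = " -d:danger"          # assumed release flags
      if not useAsm:
        flags.add " -d:CttASM=false"    # hypothetical define, as in the test sketch
      exec "nim c -r" & flags & " benchmarks/" & benchName & ".nim"

The same consolidation repeats mechanically for every benchmark family below.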
 # Double-precision field 𝔽pDbl
 # ------------------------------------------

-task bench_fpdbl, "Run benchmark 𝔽pDbl with your default compiler":
+task bench_fpdbl, "Run benchmark 𝔽pDbl with your CC compiler":
   runBench("bench_fp_double_precision")

-task bench_fpdbl_gcc, "Run benchmark 𝔽p with gcc":
-  runBench("bench_fp_double_precision", "gcc")
-
-task bench_fpdbl_clang, "Run benchmark 𝔽p with clang":
-  runBench("bench_fp_double_precision", "clang")
-
-task bench_fpdbl_gcc_noasm, "Run benchmark 𝔽p with gcc - no Assembly":
-  runBench("bench_fp_double_precision", "gcc", useAsm = false)
-
-task bench_fpdbl_clang_noasm, "Run benchmark 𝔽p with clang - no Assembly":
-  runBench("bench_fp_double_precision", "clang", useAsm = false)
+task bench_fpdbl_noasm, "Run benchmark 𝔽p with CC compiler - no Assembly":
+  runBench("bench_fp_double_precision", useAsm = false)

 # Extension field 𝔽p2
 # ------------------------------------------

-task bench_fp2, "Run benchmark with 𝔽p2 your default compiler":
+task bench_fp2, "Run benchmark 𝔽p2 with your CC compiler":
   runBench("bench_fp2")

-task bench_fp2_gcc, "Run benchmark 𝔽p2 with gcc":
-  runBench("bench_fp2", "gcc")
-
-task bench_fp2_clang, "Run benchmark 𝔽p2 with clang":
-  runBench("bench_fp2", "clang")
-
-task bench_fp2_gcc_noasm, "Run benchmark 𝔽p2 with gcc - no Assembly":
-  runBench("bench_fp2", "gcc", useAsm = false)
-
-task bench_fp2_clang_noasm, "Run benchmark 𝔽p2 with clang - no Assembly":
-  runBench("bench_fp2", "clang", useAsm = false)
+task bench_fp2_noasm, "Run benchmark 𝔽p2 with CC compiler - no Assembly":
+  runBench("bench_fp2", useAsm = false)

 # Extension field 𝔽p4
 # ------------------------------------------

-task bench_fp4, "Run benchmark with 𝔽p4 your default compiler":
+task bench_fp4, "Run benchmark 𝔽p4 with your CC compiler":
   runBench("bench_fp4")

-task bench_fp4_gcc, "Run benchmark 𝔽p4 with gcc":
-  runBench("bench_fp4", "gcc")
-
-task bench_fp4_clang, "Run benchmark 𝔽p4 with clang":
-  runBench("bench_fp4", "clang")
-
-task bench_fp4_gcc_noasm, "Run benchmark 𝔽p4 with gcc - no Assembly":
-  runBench("bench_fp4", "gcc", useAsm = false)
-
-task bench_fp4_clang_noasm, "Run benchmark 𝔽p4 with clang - no Assembly":
-  runBench("bench_fp4", "clang", useAsm = false)
+task bench_fp4_noasm, "Run benchmark 𝔽p4 with CC compiler - no Assembly":
+  runBench("bench_fp4", useAsm = false)

 # Extension field 𝔽p6
 # ------------------------------------------

-task bench_fp6, "Run benchmark with 𝔽p6 your default compiler":
+task bench_fp6, "Run benchmark 𝔽p6 with your CC compiler":
   runBench("bench_fp6")

-task bench_fp6_gcc, "Run benchmark 𝔽p6 with gcc":
-  runBench("bench_fp6", "gcc")
-
-task bench_fp6_clang, "Run benchmark 𝔽p6 with clang":
-  runBench("bench_fp6", "clang")
-
-task bench_fp6_gcc_noasm, "Run benchmark 𝔽p6 with gcc - no Assembly":
-  runBench("bench_fp6", "gcc", useAsm = false)
-
-task bench_fp6_clang_noasm, "Run benchmark 𝔽p6 with clang - no Assembly":
-  runBench("bench_fp6", "clang", useAsm = false)
+task bench_fp6_noasm, "Run benchmark 𝔽p6 with CC compiler - no Assembly":
+  runBench("bench_fp6", useAsm = false)

 # Extension field 𝔽p12
 # ------------------------------------------

-task bench_fp12, "Run benchmark with 𝔽p12 your default compiler":
+task bench_fp12, "Run benchmark 𝔽p12 with your CC compiler":
   runBench("bench_fp12")

-task bench_fp12_gcc, "Run benchmark 𝔽p12 with gcc":
-  runBench("bench_fp12", "gcc")
-
-task bench_fp12_clang, "Run benchmark 𝔽p12 with clang":
-  runBench("bench_fp12", "clang")
-
-task bench_fp12_gcc_noasm, "Run benchmark 𝔽p12 with gcc - no Assembly":
-  runBench("bench_fp12", "gcc", useAsm = false)
-
-task bench_fp12_clang_noasm, "Run benchmark 𝔽p12 with clang - no Assembly":
-  runBench("bench_fp12", "clang", useAsm = false)
+task bench_fp12_noasm, "Run benchmark 𝔽p12 with CC compiler - no Assembly":
+  runBench("bench_fp12", useAsm = false)
 # Elliptic curve G1
 # ------------------------------------------

-task bench_ec_g1, "Run benchmark on Elliptic Curve group 𝔾1 - Default compiler":
+task bench_ec_g1, "Run benchmark on Elliptic Curve group 𝔾1 - CC compiler":
   runBench("bench_ec_g1")

-task bench_ec_g1_gcc, "Run benchmark on Elliptic Curve group 𝔾1 - GCC":
-  runBench("bench_ec_g1", "gcc")
-
-task bench_ec_g1_clang, "Run benchmark on Elliptic Curve group 𝔾1 - Clang":
-  runBench("bench_ec_g1", "clang")
-
-task bench_ec_g1_gcc_noasm, "Run benchmark on Elliptic Curve group 𝔾1 - GCC no Assembly":
-  runBench("bench_ec_g1", "gcc", useAsm = false)
-
-task bench_ec_g1_clang_noasm, "Run benchmark on Elliptic Curve group 𝔾1 - Clang no Assembly":
-  runBench("bench_ec_g1", "clang", useAsm = false)
+task bench_ec_g1_noasm, "Run benchmark on Elliptic Curve group 𝔾1 - CC compiler no Assembly":
+  runBench("bench_ec_g1", useAsm = false)

 # Elliptic curve G1 - batch operations
 # ------------------------------------------

-task bench_ec_g1_batch, "Run benchmark on Elliptic Curve group 𝔾1 (batch ops) - Default compiler":
+task bench_ec_g1_batch, "Run benchmark on Elliptic Curve group 𝔾1 (batch ops) - CC compiler":
   runBench("bench_ec_g1_batch")

-task bench_ec_g1_batch_gcc, "Run benchmark on Elliptic Curve group 𝔾1 (batch ops) - GCC":
-  runBench("bench_ec_g1_batch", "gcc")
-
-task bench_ec_g1_batch_clang, "Run benchmark on Elliptic Curve group 𝔾1 (batch ops) - Clang":
-  runBench("bench_ec_g1_batch", "clang")
-
-task bench_ec_g1_batch_gcc_noasm, "Run benchmark on Elliptic Curve group 𝔾1 (batch ops) - GCC no Assembly":
-  runBench("bench_ec_g1_batch", "gcc", useAsm = false)
-
-task bench_ec_g1_batch_clang_noasm, "Run benchmark on Elliptic Curve group 𝔾1 (batch ops) - Clang no Assembly":
-  runBench("bench_ec_g1_batch", "clang", useAsm = false)
+task bench_ec_g1_batch_noasm, "Run benchmark on Elliptic Curve group 𝔾1 (batch ops) - CC compiler no Assembly":
+  runBench("bench_ec_g1_batch", useAsm = false)

 # Elliptic curve G1 - scalar multiplication
 # ------------------------------------------

-task bench_ec_g1_scalar_mul, "Run benchmark on Elliptic Curve group 𝔾1 (Scalar Multiplication) - Default compiler":
+task bench_ec_g1_scalar_mul, "Run benchmark on Elliptic Curve group 𝔾1 (Scalar Multiplication) - CC compiler":
   runBench("bench_ec_g1_scalar_mul")

-task bench_ec_g1_scalar_mul_gcc, "Run benchmark on Elliptic Curve group 𝔾1 (Scalar Multiplication) - GCC":
-  runBench("bench_ec_g1_scalar_mul", "gcc")
-
-task bench_ec_g1_scalar_mul_clang, "Run benchmark on Elliptic Curve group 𝔾1 (Scalar Multiplication) - Clang":
-  runBench("bench_ec_g1_scalar_mul", "clang")
-
-task bench_ec_g1_scalar_mul_gcc_noasm, "Run benchmark on Elliptic Curve group 𝔾1 (Scalar Multiplication) - GCC no Assembly":
-  runBench("bench_ec_g1_scalar_mul", "gcc", useAsm = false)
-
-task bench_ec_g1_scalar_mul_clang_noasm, "Run benchmark on Elliptic Curve group 𝔾1 (Scalar Multiplication) - Clang no Assembly":
-  runBench("bench_ec_g1_scalar_mul", "clang", useAsm = false)
+task bench_ec_g1_scalar_mul_noasm, "Run benchmark on Elliptic Curve group 𝔾1 (Scalar Multiplication) - CC compiler no Assembly":
+  runBench("bench_ec_g1_scalar_mul", useAsm = false)

 # Elliptic curve G1 - Multi-scalar-mul
 # ------------------------------------------

-task bench_ec_g1_msm_bn254_snarks, "Run benchmark on Elliptic Curve group 𝔾1 (Multi-Scalar-Mul) for BN254-Snarks - Default compiler":
+task bench_ec_g1_msm_bn254_snarks, "Run benchmark on Elliptic Curve group 𝔾1 (Multi-Scalar-Mul) for BN254-Snarks - CC compiler":
   runBench("bench_ec_g1_msm_bn254_snarks")

-task bench_ec_g1_msm_bn254_snarks_gcc, "Run benchmark on Elliptic Curve group 𝔾1 (Multi-Scalar-Mul) for BN254-Snarks - GCC":
-  runBench("bench_ec_g1_msm_bn254_snarks", "gcc")
-
-task bench_ec_g1_msm_bn254_snarks_clang, "Run benchmark on Elliptic Curve group 𝔾1 (Multi-Scalar-Mul) for BN254-Snarks - Clang":
-  runBench("bench_ec_g1_msm_bn254_snarks", "clang")
-
-task bench_ec_g1_msm_bn254_snarks_gcc_noasm, "Run benchmark on Elliptic Curve group 𝔾1 (Multi-Scalar-Mul) for BN254-Snarks - GCC no Assembly":
-  runBench("bench_ec_g1_msm_bn254_snarks", "gcc", useAsm = false)
-
-task bench_ec_g1_msm_bn254_snarks_clang_noasm, "Run benchmark on Elliptic Curve group 𝔾1 (Multi-Scalar-Mul) for BN254-Snarks - Clang no Assembly":
-  runBench("bench_ec_g1_msm_bn254_snarks", "clang", useAsm = false)
-
-task bench_ec_g1_msm_bls12_381, "Run benchmark on Elliptic Curve group 𝔾1 (Multi-Scalar-Mul) for BLS12-381 - Default compiler":
+task bench_ec_g1_msm_bn254_snarks_noasm, "Run benchmark on Elliptic Curve group 𝔾1 (Multi-Scalar-Mul) for BN254-Snarks - CC compiler no Assembly":
+  runBench("bench_ec_g1_msm_bn254_snarks", useAsm = false)
+
+task bench_ec_g1_msm_bls12_381, "Run benchmark on Elliptic Curve group 𝔾1 (Multi-Scalar-Mul) for BLS12-381 - CC compiler":
   runBench("bench_ec_g1_msm_bls12_381")

-task bench_ec_g1_msm_bls12_381_gcc, "Run benchmark on Elliptic Curve group 𝔾1 (Multi-Scalar-Mul) for BLS12-381 - GCC":
-  runBench("bench_ec_g1_msm_bls12_381", "gcc")
-
-task bench_ec_g1_msm_bls12_381_clang, "Run benchmark on Elliptic Curve group 𝔾1 (Multi-Scalar-Mul) for BLS12-381 - Clang":
-  runBench("bench_ec_g1_msm_bls12_381", "clang")
-
-task bench_ec_g1_msm_bls12_381_gcc_noasm, "Run benchmark on Elliptic Curve group 𝔾1 (Multi-Scalar-Mul) for BLS12-381 - GCC no Assembly":
-  runBench("bench_ec_g1_msm_bls12_381", "gcc", useAsm = false)
-
-task bench_ec_g1_msm_bls12_381_clang_noasm, "Run benchmark on Elliptic Curve group 𝔾1 (Multi-Scalar-Mul) for BLS12-381 - Clang no Assembly":
-  runBench("bench_ec_g1_msm_bls12_381", "clang", useAsm = false)
+task bench_ec_g1_msm_bls12_381_noasm, "Run benchmark on Elliptic Curve group 𝔾1 (Multi-Scalar-Mul) for BLS12-381 - CC compiler no Assembly":
+  runBench("bench_ec_g1_msm_bls12_381", useAsm = false)
 # Elliptic curve G2
 # ------------------------------------------

-task bench_ec_g2, "Run benchmark on Elliptic Curve group 𝔾2 - Default compiler":
+task bench_ec_g2, "Run benchmark on Elliptic Curve group 𝔾2 - CC compiler":
   runBench("bench_ec_g2")

-task bench_ec_g2_gcc, "Run benchmark on Elliptic Curve group 𝔾2 - GCC":
-  runBench("bench_ec_g2", "gcc")
-
-task bench_ec_g2_clang, "Run benchmark on Elliptic Curve group 𝔾2 - Clang":
-  runBench("bench_ec_g2", "clang")
-
-task bench_ec_g2_gcc_noasm, "Run benchmark on Elliptic Curve group 𝔾2 - GCC no Assembly":
-  runBench("bench_ec_g2", "gcc", useAsm = false)
-
-task bench_ec_g2_clang_noasm, "Run benchmark on Elliptic Curve group 𝔾2 - Clang no Assembly":
-  runBench("bench_ec_g2", "clang", useAsm = false)
+task bench_ec_g2_noasm, "Run benchmark on Elliptic Curve group 𝔾2 - CC compiler no Assembly":
+  runBench("bench_ec_g2", useAsm = false)

 # Elliptic curve G2 - scalar multiplication
 # ------------------------------------------

-task bench_ec_g2_scalar_mul, "Run benchmark on Elliptic Curve group 𝔾2 (Multi-Scalar-Mul) - Default compiler":
+task bench_ec_g2_scalar_mul, "Run benchmark on Elliptic Curve group 𝔾2 (Multi-Scalar-Mul) - CC compiler":
   runBench("bench_ec_g2_scalar_mul")

-task bench_ec_g2_scalar_mul_gcc, "Run benchmark on Elliptic Curve group 𝔾2 (Multi-Scalar-Mul) - GCC":
-  runBench("bench_ec_g2_scalar_mul", "gcc")
-
-task bench_ec_g2_scalar_mul_clang, "Run benchmark on Elliptic Curve group 𝔾2 (Multi-Scalar-Mul) - Clang":
-  runBench("bench_ec_g2_scalar_mul", "clang")
-
-task bench_ec_g2_scalar_mul_gcc_noasm, "Run benchmark on Elliptic Curve group 𝔾2 (Multi-Scalar-Mul) - GCC no Assembly":
-  runBench("bench_ec_g2_scalar_mul", "gcc", useAsm = false)
-
-task bench_ec_g2_scalar_mul_clang_noasm, "Run benchmark on Elliptic Curve group 𝔾2 (Multi-Scalar-Mul) - Clang no Assembly":
-  runBench("bench_ec_g2_scalar_mul", "clang", useAsm = false)
+task bench_ec_g2_scalar_mul_noasm, "Run benchmark on Elliptic Curve group 𝔾2 (Multi-Scalar-Mul) - CC compiler no Assembly":
+  runBench("bench_ec_g2_scalar_mul", useAsm = false)

 # Pairings
 # ------------------------------------------

-task bench_pairing_bls12_377, "Run pairings benchmarks for BLS12-377 - Default compiler":
+task bench_pairing_bls12_377, "Run pairings benchmarks for BLS12-377 - CC compiler":
   runBench("bench_pairing_bls12_377")

-task bench_pairing_bls12_377_gcc, "Run pairings benchmarks for BLS12-377 - GCC":
-  runBench("bench_pairing_bls12_377", "gcc")
-
-task bench_pairing_bls12_377_clang, "Run pairings benchmarks for BLS12-377 - Clang":
-  runBench("bench_pairing_bls12_377", "clang")
-
-task bench_pairing_bls12_377_gcc_noasm, "Run pairings benchmarks for BLS12-377 - GCC no Assembly":
-  runBench("bench_pairing_bls12_377", "gcc", useAsm = false)
-
-task bench_pairing_bls12_377_clang_noasm, "Run pairings benchmarks for BLS12-377 - Clang no Assembly":
-  runBench("bench_pairing_bls12_377", "clang", useAsm = false)
+task bench_pairing_bls12_377_noasm, "Run pairings benchmarks for BLS12-377 - CC compiler no Assembly":
+  runBench("bench_pairing_bls12_377", useAsm = false)

 # --

-task bench_pairing_bls12_381, "Run pairings benchmarks for BLS12-381 - Default compiler":
+task bench_pairing_bls12_381, "Run pairings benchmarks for BLS12-381 - CC compiler":
   runBench("bench_pairing_bls12_381")

-task bench_pairing_bls12_381_gcc, "Run pairings benchmarks for BLS12-381 - GCC":
-  runBench("bench_pairing_bls12_381", "gcc")
-
-task bench_pairing_bls12_381_clang, "Run pairings benchmarks for BLS12-381 - Clang":
-  runBench("bench_pairing_bls12_381", "clang")
-
-task bench_pairing_bls12_381_gcc_noasm, "Run pairings benchmarks for BLS12-381 - GCC no Assembly":
-  runBench("bench_pairing_bls12_381", "gcc", useAsm = false)
-
-task bench_pairing_bls12_381_clang_noasm, "Run pairings benchmarks for BLS12-381 - Clang no Assembly":
-  runBench("bench_pairing_bls12_381", "clang", useAsm = false)
+task bench_pairing_bls12_381_noasm, "Run pairings benchmarks for BLS12-381 - CC compiler no Assembly":
+  runBench("bench_pairing_bls12_381", useAsm = false)

 # --

-task bench_pairing_bn254_nogami, "Run pairings benchmarks for BN254-Nogami - Default compiler":
+task bench_pairing_bn254_nogami, "Run pairings benchmarks for BN254-Nogami - CC compiler":
   runBench("bench_pairing_bn254_nogami")

-task bench_pairing_bn254_nogami_gcc, "Run pairings benchmarks for BN254-Nogami - GCC":
-  runBench("bench_pairing_bn254_nogami", "gcc")
-
-task bench_pairing_bn254_nogami_clang, "Run pairings benchmarks for BN254-Nogami - Clang":
-  runBench("bench_pairing_bn254_nogami", "clang")
-
-task bench_pairing_bn254_nogami_gcc_noasm, "Run pairings benchmarks for BN254-Nogami - GCC no Assembly":
-  runBench("bench_pairing_bn254_nogami", "gcc", useAsm = false)
-
-task bench_pairing_bn254_nogami_clang_noasm, "Run pairings benchmarks for BN254-Nogami - Clang no Assembly":
-  runBench("bench_pairing_bn254_nogami", "clang", useAsm = false)
+task bench_pairing_bn254_nogami_noasm, "Run pairings benchmarks for BN254-Nogami - CC compiler no Assembly":
+  runBench("bench_pairing_bn254_nogami", useAsm = false)

 # --

-task bench_pairing_bn254_snarks, "Run pairings benchmarks for BN254-Snarks - Default compiler":
+task bench_pairing_bn254_snarks, "Run pairings benchmarks for BN254-Snarks - CC compiler":
   runBench("bench_pairing_bn254_snarks")

-task bench_pairing_bn254_snarks_gcc, "Run pairings benchmarks for BN254-Snarks - GCC":
-  runBench("bench_pairing_bn254_snarks", "gcc")
-
-task bench_pairing_bn254_snarks_clang, "Run pairings benchmarks for BN254-Snarks - Clang":
-  runBench("bench_pairing_bn254_snarks", "clang")
-
-task bench_pairing_bn254_snarks_gcc_noasm, "Run pairings benchmarks for BN254-Snarks - GCC no Assembly":
-  runBench("bench_pairing_bn254_snarks", "gcc", useAsm = false)
-
-task bench_pairing_bn254_snarks_clang_noasm, "Run pairings benchmarks for BN254-Snarks - Clang no Assembly":
-  runBench("bench_pairing_bn254_snarks", "clang", useAsm = false)
+task bench_pairing_bn254_snarks_noasm, "Run pairings benchmarks for BN254-Snarks - CC compiler no Assembly":
+  runBench("bench_pairing_bn254_snarks", useAsm = false)

 # Curve summaries
 # ------------------------------------------

-task bench_summary_bls12_377, "Run summary benchmarks for BLS12-377 - Default compiler":
+task bench_summary_bls12_377, "Run summary benchmarks for BLS12-377 - CC compiler":
   runBench("bench_summary_bls12_377")

-task bench_summary_bls12_377_gcc, "Run summary benchmarks for BLS12-377 - GCC":
-  runBench("bench_summary_bls12_377", "gcc")
-
-task bench_summary_bls12_377_clang, "Run summary benchmarks for BLS12-377 - Clang":
-  runBench("bench_summary_bls12_377", "clang")
-
-task bench_summary_bls12_377_gcc_noasm, "Run summary benchmarks for BLS12-377 - GCC no Assembly":
-  runBench("bench_summary_bls12_377", "gcc", useAsm = false)
-
-task bench_summary_bls12_377_clang_noasm, "Run summary benchmarks for BLS12-377 - Clang no Assembly":
-  runBench("bench_summary_bls12_377", "clang", useAsm = false)
+task bench_summary_bls12_377_noasm, "Run summary benchmarks for BLS12-377 - CC compiler no Assembly":
+  runBench("bench_summary_bls12_377", useAsm = false)

 # --

-task bench_summary_bls12_381, "Run summary benchmarks for BLS12-381 - Default compiler":
+task bench_summary_bls12_381, "Run summary benchmarks for BLS12-381 - CC compiler":
   runBench("bench_summary_bls12_381")

-task bench_summary_bls12_381_gcc, "Run summary benchmarks for BLS12-381 - GCC":
-  runBench("bench_summary_bls12_381", "gcc")
-
-task bench_summary_bls12_381_clang, "Run summary benchmarks for BLS12-381 - Clang":
-  runBench("bench_summary_bls12_381", "clang")
-
-task bench_summary_bls12_381_gcc_noasm, "Run summary benchmarks for BLS12-381 - GCC no Assembly":
-  runBench("bench_summary_bls12_381", "gcc", useAsm = false)
-
-task bench_summary_bls12_381_clang_noasm, "Run summary benchmarks for BLS12-381 - Clang no Assembly":
-  runBench("bench_summary_bls12_381", "clang", useAsm = false)
+task bench_summary_bls12_381_noasm, "Run summary benchmarks for BLS12-381 - CC compiler no Assembly":
+  runBench("bench_summary_bls12_381", useAsm = false)

 # --

-task bench_summary_bn254_nogami, "Run summary benchmarks for BN254-Nogami - Default compiler":
+task bench_summary_bn254_nogami, "Run summary benchmarks for BN254-Nogami - CC compiler":
   runBench("bench_summary_bn254_nogami")

-task bench_summary_bn254_nogami_gcc, "Run summary benchmarks for BN254-Nogami - GCC":
-  runBench("bench_summary_bn254_nogami", "gcc")
-
-task bench_summary_bn254_nogami_clang, "Run summary benchmarks for BN254-Nogami - Clang":
-  runBench("bench_summary_bn254_nogami", "clang")
-
-task bench_summary_bn254_nogami_gcc_noasm, "Run summary benchmarks for BN254-Nogami - GCC no Assembly":
-  runBench("bench_summary_bn254_nogami", "gcc", useAsm = false)
-
-task bench_summary_bn254_nogami_clang_noasm, "Run summary benchmarks for BN254-Nogami - Clang no Assembly":
-  runBench("bench_summary_bn254_nogami", "clang", useAsm = false)
+task bench_summary_bn254_nogami_noasm, "Run summary benchmarks for BN254-Nogami - CC compiler no Assembly":
+  runBench("bench_summary_bn254_nogami", useAsm = false)

 # --

-task bench_summary_bn254_snarks, "Run summary benchmarks for BN254-Snarks - Default compiler":
+task bench_summary_bn254_snarks, "Run summary benchmarks for BN254-Snarks - CC compiler":
   runBench("bench_summary_bn254_snarks")

-task bench_summary_bn254_snarks_gcc, "Run summary benchmarks for BN254-Snarks - GCC":
-  runBench("bench_summary_bn254_snarks", "gcc")
-
-task bench_summary_bn254_snarks_clang, "Run summary benchmarks for BN254-Snarks - Clang":
-  runBench("bench_summary_bn254_snarks", "clang")
-
-task bench_summary_bn254_snarks_gcc_noasm, "Run summary benchmarks for BN254-Snarks - GCC no Assembly":
-  runBench("bench_summary_bn254_snarks", "gcc", useAsm = false)
-
-task bench_summary_bn254_snarks_clang_noasm, "Run summary benchmarks for BN254-Snarks - Clang no Assembly":
-  runBench("bench_summary_bn254_snarks", "clang", useAsm = false)
+task bench_summary_bn254_snarks_noasm, "Run summary benchmarks for BN254-Snarks - CC compiler no Assembly":
+  runBench("bench_summary_bn254_snarks", useAsm = false)

 # --

-task bench_summary_pasta, "Run summary benchmarks for the Pasta curves - Default compiler":
+task bench_summary_pasta, "Run summary benchmarks for the Pasta curves - CC compiler":
   runBench("bench_summary_pasta")

-task bench_summary_pasta_gcc, "Run summary benchmarks for the Pasta curves - GCC":
-  runBench("bench_summary_pasta", "gcc")
-
-task bench_summary_pasta_clang, "Run summary benchmarks for the Pasta curves - Clang":
-  runBench("bench_summary_pasta", "clang")
-
-task bench_summary_pasta_gcc_noasm, "Run summary benchmarks for the Pasta curves - GCC no Assembly":
-  runBench("bench_summary_pasta", "gcc", useAsm = false)
-
-task bench_summary_pasta_clang_noasm, "Run summary benchmarks for the Pasta curves - Clang no Assembly":
-  runBench("bench_summary_pasta", "clang", useAsm = false)
+task bench_summary_pasta_noasm, "Run summary benchmarks for the Pasta curves - CC compiler no Assembly":
+  runBench("bench_summary_pasta", useAsm = false)
 # Hashes
 # ------------------------------------------

@@ -1185,31 +1031,13 @@ task bench_sha256, "Run SHA256 benchmarks":
 task bench_hash_to_curve, "Run Hash-to-Curve benchmarks":
   runBench("bench_hash_to_curve")

-task bench_hash_to_curve_gcc, "Run Hash-to-Curve benchmarks":
-  runBench("bench_hash_to_curve", "gcc")
-
-task bench_hash_to_curve_clang, "Run Hash-to-Curve benchmarks":
-  runBench("bench_hash_to_curve", "clang")
-
-task bench_hash_to_curve_gcc_noasm, "Run Hash-to-Curve benchmarks":
-  runBench("bench_hash_to_curve", "gcc", useAsm = false)
-
-task bench_hash_to_curve_clang_noasm, "Run Hash-to-Curve benchmarks":
-  runBench("bench_hash_to_curve", "clang", useAsm = false)
+task bench_hash_to_curve_noasm, "Run Hash-to-Curve benchmarks - No Assembly":
+  runBench("bench_hash_to_curve", useAsm = false)

 # BLS signatures
 # ------------------------------------------
-task bench_ethereum_bls_signatures, "Run Ethereum BLS signatures benchmarks":
+task bench_ethereum_bls_signatures, "Run Ethereum BLS signatures benchmarks - CC compiler":
   runBench("bench_ethereum_bls_signatures")

-task bench_ethereum_bls_signatures_gcc, "Run Ethereum BLS signatures benchmarks":
-  runBench("bench_ethereum_bls_signatures", "gcc")
-
-task bench_ethereum_bls_signatures_clang, "Run Ethereum BLS signatures benchmarks":
-  runBench("bench_ethereum_bls_signatures", "clang")
-
-task bench_ethereum_bls_signatures_gcc_noasm, "Run Ethereum BLS signatures benchmarks":
-  runBench("bench_ethereum_bls_signatures", "gcc", useAsm = false)
-
-task bench_ethereum_bls_signatures_clang_noasm, "Run Ethereum BLS signatures benchmarks":
-  runBench("bench_ethereum_bls_signatures", "clang", useAsm = false)
+task bench_ethereum_bls_signatures_noasm, "Run Ethereum BLS signatures benchmarks - CC compiler no assembly":
+  runBench("bench_ethereum_bls_signatures", useAsm = false)
@@ -50,7 +50,7 @@ import ./zoo_exports
 static:
   # Xxport SHA256 routines with a protocol specific prefix
   # This exports sha256.init(), sha256.update(), sha256.finish() and sha256.clear()
-  prefix_sha256 = prefix_ffi & "_sha256_"
+  prefix_sha256 = prefix_ffi & "sha256_"

 import hashes
 export hashes # generic sandwich on sha256
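This one-character change is the "fix sha256 exported symbol" item from the commit message: the protocol prefix already ends with a separator, so prepending another underscore doubled it in every exported SHA256 symbol. Illustration with a hypothetical prefix value:

    let prefix_ffi = "ctt_eth_bls_"  # hypothetical protocol prefix
    doAssert prefix_ffi & "_sha256_" == "ctt_eth_bls__sha256_"  # old: doubled underscore
    doAssert prefix_ffi & "sha256_"  == "ctt_eth_bls_sha256_"   # new: clean symbol prefix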
@@ -10,6 +10,7 @@ import
   # Standard library
   std/macros,
   # Internal
+  ./limbs_asm_modular_x86,
   ../../../platforms/abstractions

 # ############################################################
@@ -32,7 +33,7 @@ static: doAssert UseASM_X86_64
 # Double-precision field addition
 # ------------------------------------------------------------

-macro addmod2x_gen[N: static int](R: var Limbs[N], A, B: Limbs[N], m: Limbs[N div 2]): untyped =
+macro addmod2x_gen[N: static int](r_PIR: var Limbs[N], a_MEM, b_MEM: Limbs[N], M_MEM: Limbs[N div 2], spareBits: static int): untyped =
   ## Generate an optimized out-of-place double-precision addition kernel
   result = newStmtList()
@@ -41,23 +42,28 @@ macro addmod2x_gen[N: static int](R: var Limbs[N], A, B: Limbs[N], m: Limbs[N div 2]): untyped =
   let
     H = N div 2

-    r = init(OperandArray, nimSymbol = R, N, PointerInReg, InputOutput)
-    # We reuse the reg used for b for overflow detection
-    b = init(OperandArray, nimSymbol = B, N, PointerInReg, InputOutput)
+    r = asmArray(r_PIR, N, PointerInReg, asmInputOutputEarlyClobber, memIndirect = memWrite) # MemOffsettable is the better constraint but compilers say it is impossible. Use early clobber to ensure it is not affected by constant propagation at slight pessimization (reloading it).
+    b = asmArray(b_MEM, N, MemOffsettable, asmInput)
     # We could force m as immediate by specializing per moduli
-    M = init(OperandArray, nimSymbol = m, N, PointerInReg, Input)
+    M = asmArray(M_MEM, H, MemOffsettable, asmInput)
     # If N is too big, we need to spill registers. TODO.
-    u = init(OperandArray, nimSymbol = ident"U", H, ElemsInReg, InputOutput)
-    v = init(OperandArray, nimSymbol = ident"V", H, ElemsInReg, InputOutput)
+    uSym = ident"u"
+    vSym = ident"v"
+    u = asmArray(uSym, H, ElemsInReg, asmInputOutput)
+    v = asmArray(vSym, H, ElemsInReg, asmInputOutput)
+
+    overflowRegSym = ident"overflowReg"
+    overflowReg = asmValue(overflowRegSym, Reg, asmOutputOverwrite)

-  let usym = u.nimSymbol
-  let vsym = v.nimSymbol
   result.add quote do:
-    var `usym`{.noinit.}, `vsym` {.noInit.}: typeof(`A`)
+    var `uSym`{.noinit.}, `vSym` {.noInit.}: typeof(`a_MEM`)
     staticFor i, 0, `H`:
-      `usym`[i] = `A`[i]
+      `uSym`[i] = `a_MEM`[i]
     staticFor i, `H`, `N`:
-      `vsym`[i-`H`] = `A`[i]
+      `vSym`[i-`H`] = `a_MEM`[i]
+
+    when `sparebits` == 0:
+      var `overflowRegSym`{.noInit.}: BaseType

   # Addition
   # u = a[0..<H] + b[0..<H], v = a[H..<N]
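These declarations are the core of the LTO fix. In GCC/Clang extended-asm terms, MemOffsettable corresponds to an "m" memory operand that the compiler addresses (and offsets) itself, while PointerInReg keeps the address in a register, where the early-clobber marker stops the optimizer from folding a constant-propagated address into the template. A standalone sketch of the two styles on a single word, in plain GCC-syntax inline assembly rather than Constantine's assembler DSL:

    proc addWords(a, b: uint64): uint64 =
      ## MemOffsettable-style: `acc` is an "m" (memory) operand. The
      ## compiler addresses the limb itself, no pointer register is
      ## consumed, and no symbol name leaks into the asm template,
      ## which is why this form survives LTO constant propagation.
      var acc = a
      asm """
        addq %[b], %[acc]
        : [acc] "+m" (`acc`)
        : [b] "r" (`b`)
        : "cc"
      """
      result = acc

    proc addThroughPointer(p: ptr uint64, b: uint64) =
      ## PointerInReg-style: the pointer itself is the operand. "+&r"
      ## pins it in a register, and the early clobber (&) forces the
      ## compiler to keep a genuine, reloaded register instead of an
      ## alias folded in by constant propagation - the same trade-off
      ## the comment on r above describes. "memory" declares the
      ## indirect store through the pointer.
      asm """
        addq %[b], (%[p])
        : [p] "+&r" (`p`)
        : [b] "r" (`b`)
        : "cc", "memory"
      """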
@@ -72,38 +78,26 @@ macro addmod2x_gen[N: static int](R: var Limbs[N], A, B: Limbs[N], m: Limbs[N div 2]): untyped =
       ctx.adc v[i-H], b[i]
       ctx.mov u[i-H], v[i-H]

-  # Mask: overflowed contains 0xFFFF or 0x0000
-  # TODO: unnecessary if MSB never set, i.e. "Field.getSpareBits >= 1"
-  let overflowed = b.reuseRegister()
-  ctx.sbb overflowed, overflowed
-
-  # Now substract the modulus to test a < 2ⁿp
-  ctx.sub v[0], M[0]
-  for i in 1 ..< H:
-    ctx.sbb v[i], M[i]
-
-  # If it overflows here, it means that it was
-  # smaller than the modulus and we don't need v
-  ctx.sbb overflowed, 0
-
-  # Conditional Mov and
-  # and store result
-  for i in 0 ..< H:
-    ctx.cmovnc u[i], v[i]
-    ctx.mov r[i+H], u[i]
-
-  result.add ctx.generate
-
-func addmod2x_asm*[N: static int](r: var Limbs[N], a, b: Limbs[N], M: Limbs[N div 2]) {.noInline.} =
+  let rUpperHalf = r.subset(H, N)
+
+  if spareBits >= 1:
+    # Now substract the modulus to test a < 2ⁿp
+    ctx.finalSubNoOverflowImpl(rUpperHalf, v, M, u)
+  else:
+    ctx.finalSubMayOverflowImpl(rUpperHalf, v, M, u, scratchReg = overflowReg)
+
+  result.add ctx.generate()
+
+func addmod2x_asm*[N: static int](r: var Limbs[N], a, b: Limbs[N], M: Limbs[N div 2], spareBits: static int) =
   ## Constant-time double-precision addition
   ## Output is conditionally reduced by 2ⁿp
   ## to stay in the [0, 2ⁿp) range
-  addmod2x_gen(r, a, b, M)
+  addmod2x_gen(r, a, b, M, spareBits)

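The removed sub/sbb/cmovnc sequence is exactly what the shared finalSubNoOverflowImpl/finalSubMayOverflowImpl helpers now emit. The underlying idea, reduced to a single word of plain Nim for clarity (a sketch ignoring the multi-limb borrow chain; the real code derives the mask from the CPU borrow flag rather than a comparison):

    func finalSubSketch(a, M: uint64): uint64 =
      ## For a < 2*M, return a mod M by selection rather than branching:
      ## subtract M, turn the borrow into a full-width mask, and pick
      ## a or a-M (the role of cmovnc in the removed sequence).
      let diff = a - M
      let borrow = uint64(a < M)       # 1 if the subtraction borrowed
      let mask = 0'u64 - borrow        # all-ones if borrowed, else zero
      result = (a and mask) or (diff and not mask)

    doAssert finalSubSketch(7, 5) == 2   # reduced
    doAssert finalSubSketch(3, 5) == 3   # already in range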
 # Double-precision field substraction
 # ------------------------------------------------------------

-macro submod2x_gen[N: static int](R: var Limbs[N], A, B: Limbs[N], m: Limbs[N div 2]): untyped =
+macro submod2x_gen[N: static int](r_PIR: var Limbs[N], a_MEM, b_PIR: Limbs[N], M_MEM: Limbs[N div 2]): untyped =
   ## Generate an optimized out-of-place double-precision substraction kernel
   result = newStmtList()
@@ -112,23 +106,22 @@ macro submod2x_gen[N: static int](R: var Limbs[N], A, B: Limbs[N], m: Limbs[N div 2]): untyped =
   let
     H = N div 2

-    r = init(OperandArray, nimSymbol = R, N, PointerInReg, InputOutput)
-    # We reuse the reg used for b for overflow detection
-    b = init(OperandArray, nimSymbol = B, N, PointerInReg, InputOutput)
+    r = asmArray(r_PIR, N, PointerInReg, asmInputOutputEarlyClobber, memIndirect = memWrite) # MemOffsettable is the better constraint but compilers say it is impossible. Use early clobber to ensure it is not affected by constant propagation at slight pessimization (reloading it).
+    b = asmArray(b_PIR, N, PointerInReg, asmInputOutputEarlyClobber, memIndirect = memRead) # We reuse the reg used for b for overflow detection
     # We could force m as immediate by specializing per moduli
-    M = init(OperandArray, nimSymbol = m, N, PointerInReg, Input)
+    M = asmArray(M_MEM, H, MemOffsettable, asmInput)
     # If N is too big, we need to spill registers. TODO.
-    u = init(OperandArray, nimSymbol = ident"U", H, ElemsInReg, InputOutput)
-    v = init(OperandArray, nimSymbol = ident"V", H, ElemsInReg, InputOutput)
+    uSym = ident"u"
+    vSym = ident"v"
+    u = asmArray(uSym, H, ElemsInReg, asmInputOutput)
+    v = asmArray(vSym, H, ElemsInReg, asmInputOutput)

-  let usym = u.nimSymbol
-  let vsym = v.nimSymbol
   result.add quote do:
-    var `usym`{.noinit.}, `vsym` {.noInit.}: typeof(`A`)
+    var `uSym`{.noinit.}, `vSym` {.noInit.}: typeof(`a_MEM`)
     staticFor i, 0, `H`:
-      `usym`[i] = `A`[i]
+      `uSym`[i] = `a_MEM`[i]
     staticFor i, `H`, `N`:
-      `vsym`[i-`H`] = `A`[i]
+      `vSym`[i-`H`] = `a_MEM`[i]

   # Substraction
   # u = a[0..<H] - b[0..<H], v = a[H..<N]
@@ -158,9 +151,9 @@ macro submod2x_gen[N: static int](R: var Limbs[N], A, B: Limbs[N], m: Limbs[N div 2]): untyped =
     ctx.adc u[i], v[i]
     ctx.mov r[i+H], u[i]

-  result.add ctx.generate
+  result.add ctx.generate()

-func submod2x_asm*[N: static int](r: var Limbs[N], a, b: Limbs[N], M: Limbs[N div 2]) {.noInline.} =
+func submod2x_asm*[N: static int](r: var Limbs[N], a, b: Limbs[N], M: Limbs[N div 2]) =
   ## Constant-time double-precision substraction
   ## Output is conditionally reduced by 2ⁿp
   ## to stay in the [0, 2ⁿp) range
@@ -169,7 +162,7 @@ func submod2x_asm*[N: static int](r: var Limbs[N], a, b: Limbs[N], M: Limbs[N div 2]) {.noInline.} =
 # Double-precision field negation
 # ------------------------------------------------------------

-macro negmod2x_gen[N: static int](R: var Limbs[N], A: Limbs[N], m: Limbs[N div 2]): untyped =
+macro negmod2x_gen[N: static int](r_PIR: var Limbs[N], a_MEM: Limbs[N], M_MEM: Limbs[N div 2]): untyped =
   ## Generate an optimized modular negation kernel
   result = newStmtList()
@@ -178,22 +171,20 @@ macro negmod2x_gen[N: static int](R: var Limbs[N], A: Limbs[N], m: Limbs[N div 2]): untyped =
   let
     H = N div 2

-    a = init(OperandArray, nimSymbol = A, N, PointerInReg, Input)
-    r = init(OperandArray, nimSymbol = R, N, PointerInReg, InputOutput)
-    u = init(OperandArray, nimSymbol = ident"U", N, ElemsInReg, Output_EarlyClobber)
+    a = asmArray(a_MEM, N, MemOffsettable, asmInput)
+    r = asmArray(r_PIR, N, PointerInReg, asmInputOutputEarlyClobber, memIndirect = memWrite) # MemOffsettable is the better constraint but compilers say it is impossible. Use early clobber to ensure it is not affected by constant propagation at slight pessimization (reloading it).
+    uSym = ident"u"
+    u = asmArray(uSym, N, ElemsInReg, asmOutputEarlyClobber)
     # We could force m as immediate by specializing per moduli
     # We reuse the reg used for m for overflow detection
-    M = init(OperandArray, nimSymbol = m, N, PointerInReg, InputOutput)
+    M = asmArray(M_MEM, N, MemOffsettable, asmInput)

-    isZero = Operand(
-      desc: OperandDesc(
-        asmId: "[isZero]",
-        nimSymbol: ident"isZero",
-        rm: Reg,
-        constraint: Output_EarlyClobber,
-        cEmit: "isZero"
-      )
-    )
+    isZeroSym = ident"isZero"
+    isZero = asmValue(isZeroSym, Reg, asmOutputEarlyClobber)
+
+  result.add quote do:
+    var `isZerosym`{.noInit.}: BaseType
+    var `usym`{.noinit, used.}: typeof(`a_MEM`)

   # Substraction 2ⁿp - a
   # The lower half of 2ⁿp is filled with zero
@@ -227,13 +218,8 @@ macro negmod2x_gen[N: static int](R: var Limbs[N], A: Limbs[N], m: Limbs[N div 2]): untyped =
     ctx.cmovz u[i-H], isZero
     ctx.mov r[i], u[i-H]

-  let isZerosym = isZero.desc.nimSymbol
-  let usym = u.nimSymbol
-  result.add quote do:
-    var `isZerosym`{.noInit.}: BaseType
-    var `usym`{.noinit, used.}: typeof(`A`)
-  result.add ctx.generate
+  result.add ctx.generate()

-func negmod2x_asm*[N: static int](r: var Limbs[N], a: Limbs[N], M: Limbs[N div 2]) {.noInline.} =
+func negmod2x_asm*[N: static int](r: var Limbs[N], a: Limbs[N], M: Limbs[N div 2]) =
   ## Constant-time double-precision negation
   negmod2x_gen(r, a, M)
@@ -18,11 +18,6 @@ import
 #
 # ############################################################

-# Note: We can refer to at most 30 registers in inline assembly
-# and "InputOutput" registers count double
-# They are nice to let the compiler deals with mov
-# but too constraining so we move things ourselves.
-
 static: doAssert UseASM_X86_32

 # Necessary for the compiler to find enough registers
@@ -31,7 +26,8 @@ static: doAssert UseASM_X86_32
 proc finalSubNoOverflowImpl*(
        ctx: var Assembler_x86,
        r: Operand or OperandArray,
-       a, M, scratch: OperandArray) =
+       a, M, scratch: OperandArray,
+       a_in_scratch = false) =
   ## Reduce `a` into `r` modulo `M`
   ## To be used when the modulus does not use the full bitwidth of the storing words
   ## for example a 255-bit modulus in n words of total max size 2^256
@@ -42,9 +38,11 @@ proc finalSubNoOverflowImpl*(
   ctx.comment "Final substraction (cannot overflow its limbs)"

   # Substract the modulus, and test a < p with the last borrow
-  ctx.mov scratch[0], a[0]
+  if not a_in_scratch:
+    ctx.mov scratch[0], a[0]
   ctx.sub scratch[0], M[0]
   for i in 1 ..< N:
-    ctx.mov scratch[i], a[i]
+    if not a_in_scratch:
+      ctx.mov scratch[i], a[i]
     ctx.sbb scratch[i], M[i]

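The new a_in_scratch flag lets callers such as addmod_gen below, whose addition already accumulates `a` in the scratch registers, skip the per-limb staging mov. A toy sketch of that conditional emission, using string building as a stand-in for the Assembler_x86 context (all names illustrative):

    proc emitFinalSubLimb(code: var string, i: int, aInScratch: bool) =
      if not aInScratch:
        code.add "mov scratch" & $i & ", a" & $i & "\n"  # staging move, skipped when redundant
      code.add "sbb scratch" & $i & ", M" & $i & "\n"    # (the first limb uses sub instead)

    var code: string
    emitFinalSubLimb(code, 1, aInScratch = true)   # emits only the sbb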
@@ -58,13 +56,15 @@ proc finalSubMayOverflowImpl*(
        ctx: var Assembler_x86,
        r: Operand or OperandArray,
        a, M, scratch: OperandArray,
-       scratchReg: Operand or Register or OperandReuse) =
+       a_in_scratch = false,
+       scratchReg: Operand or Register or OperandReuse = rax) =
   ## Reduce `a` into `r` modulo `M`
   ## To be used when the final substraction can
   ## also overflow the limbs (a 2^256 order of magnitude modulus stored in n words of total max size 2^256)
   ##
-  ## r, a, scratch, scratchReg are mutated
+  ## r, a, scratch are mutated
   ## M is read-only
+  ## This clobbers RAX
   let N = M.len
   ctx.comment "Final substraction (may carry)"

@@ -72,9 +72,11 @@ proc finalSubMayOverflowImpl*(
   ctx.sbb scratchReg, scratchReg

   # Now substract the modulus, and test a < p with the last borrow
-  ctx.mov scratch[0], a[0]
+  if not a_in_scratch:
+    ctx.mov scratch[0], a[0]
   ctx.sub scratch[0], M[0]
   for i in 1 ..< N:
-    ctx.mov scratch[i], a[i]
+    if not a_in_scratch:
+      ctx.mov scratch[i], a[i]
     ctx.sbb scratch[i], M[i]

@@ -89,9 +91,10 @@ proc finalSubMayOverflowImpl*(
     ctx.mov r[i], a[i]

 macro finalSub_gen*[N: static int](
-       r_PIR: var array[N, SecretWord],
-       a_EIR, M_PIR: array[N, SecretWord],
-       scratch_EIR: var array[N, SecretWord],
+       r_PIR: var Limbs[N],
+       a_EIR: Limbs[N],
+       M_MEM: Limbs[N],
+       scratch_EIR: var Limbs[N],
        mayOverflow: static bool): untyped =
   ## Returns:
   ## a-M if a > M
@@ -99,35 +102,32 @@ macro finalSub_gen*[N: static int](
   ##
   ## - r_PIR is a pointer to the result array, mutated,
   ## - a_EIR is an array of registers, mutated,
-  ## - M_PIR is a pointer to an array, read-only,
+  ## - M_MEM is a pointer to an array, read-only,
   ## - scratch_EIR is an array of registers, mutated
   ## - mayOverflow is set to true when the carry flag also needs to be read
   result = newStmtList()

   var ctx = init(Assembler_x86, BaseType)
   let
-    r = init(OperandArray, nimSymbol = r_PIR, N, PointerInReg, InputOutput)
+    r = asmArray(r_PIR, N, PointerInReg, asmInputOutputEarlyClobber, memIndirect = memWrite) # MemOffsettable is the better constraint but compilers say it is impossible. Use early clobber to ensure it is not affected by constant propagation at slight pessimization (reloading it).
     # We reuse the reg used for b for overflow detection
-    a = init(OperandArray, nimSymbol = a_EIR, N, ElemsInReg, InputOutput)
+    a = asmArray(a_EIR, N, ElemsInReg, asmInputOutput)
     # We could force m as immediate by specializing per moduli
-    M = init(OperandArray, nimSymbol = M_PIR, N, PointerInReg, Input)
-    t = init(OperandArray, nimSymbol = scratch_EIR, N, ElemsInReg, Output_EarlyClobber)
+    M = asmArray(M_MEM, N, MemOffsettable, asmInput)
+    t = asmArray(scratch_EIR, N, ElemsInReg, asmOutputEarlyClobber)

   if mayOverflow:
-    ctx.finalSubMayOverflowImpl(
-      r, a, M, t, rax
-    )
+    ctx.finalSubMayOverflowImpl(r, a, M, t)
   else:
-    ctx.finalSubNoOverflowImpl(
-      r, a, M, t
-    )
+    ctx.finalSubNoOverflowImpl(r, a, M, t)

   result.add ctx.generate()

 # Field addition
 # ------------------------------------------------------------

-macro addmod_gen[N: static int](R: var Limbs[N], A, B, m: Limbs[N], spareBits: static int): untyped =
+macro addmod_gen[N: static int](r_PIR: var Limbs[N], a_PIR, b_PIR, M_MEM: Limbs[N], spareBits: static int): untyped =
   ## Generate an optimized modular addition kernel
   # Register pressure note:
   #   We could generate a kernel per modulus m by hardcoding it as immediate
@@ -139,21 +139,20 @@ macro addmod_gen[N: static int](R: var Limbs[N], A, B, m: Limbs[N], spareBits: static int): untyped =

   var ctx = init(Assembler_x86, BaseType)
   let
-    r = init(OperandArray, nimSymbol = R, N, PointerInReg, InputOutput)
-    # We reuse the reg used for b for overflow detection
-    b = init(OperandArray, nimSymbol = B, N, PointerInReg, InputOutput)
+    r = asmArray(r_PIR, N, PointerInReg, asmInputOutputEarlyClobber, memIndirect = memWrite) # MemOffsettable is the better constraint but compilers say it is impossible. Use early clobber to ensure it is not affected by constant propagation at slight pessimization (reloading it).
+    b = asmArray(b_PIR, N, PointerInReg, asmInputOutputEarlyClobber, memIndirect = memRead) # LLVM Gold linker runs out of registers in t_ec_shortw_prj_g1_sum_reduce if we use b as Memoffsettable and a separate overflow register
     # We could force m as immediate by specializing per moduli
-    M = init(OperandArray, nimSymbol = m, N, PointerInReg, Input)
+    M = asmArray(M_MEM, N, MemOffsettable, asmInput)
     # If N is too big, we need to spill registers. TODO.
-    u = init(OperandArray, nimSymbol = ident"u", N, ElemsInReg, InputOutput)
-    v = init(OperandArray, nimSymbol = ident"v", N, ElemsInReg, Output_EarlyClobber)
+    uSym = ident"u"
+    vSym = ident"v"
+    u = asmArray(uSym, N, ElemsInReg, asmInputOutput)
+    v = asmArray(vSym, N, ElemsInReg, asmOutputEarlyClobber)

-  let usym = u.nimSymbol
-  let vsym = v.nimSymbol
   result.add quote do:
-    var `usym`{.noinit.}, `vsym` {.noInit, used.}: typeof(`A`)
+    var `usym`{.noinit.}, `vsym` {.noInit, used.}: typeof(`a_PIR`)
     staticFor i, 0, `N`:
-      `usym`[i] = `A`[i]
+      `usym`[i] = `a_PIR`[i]

   # Addition
   ctx.add u[0], b[0]
@@ -164,23 +163,20 @@ macro addmod_gen[N: static int](R: var Limbs[N], A, B, m: Limbs[N], spareBits: static int): untyped =
     ctx.mov v[i], u[i]

   if spareBits >= 1:
-    ctx.finalSubNoOverflowImpl(r, u, M, v)
+    ctx.finalSubNoOverflowImpl(r, u, M, v, a_in_scratch = true)
   else:
-    ctx.finalSubMayOverflowImpl(
-      r, u, M, v, b.reuseRegister()
-    )
+    ctx.finalSubMayOverflowImpl(r, u, M, v, a_in_scratch = true, scratchReg = b.reuseRegister())

   result.add ctx.generate()

-func addmod_asm*(r: var Limbs, a, b, m: Limbs, spareBits: static int) {.noInline.} =
+func addmod_asm*(r: var Limbs, a, b, M: Limbs, spareBits: static int) =
   ## Constant-time modular addition
-  # This MUST be noInline or Clang will run out of registers with LTO
-  addmod_gen(r, a, b, m, spareBits)
+  addmod_gen(r, a, b, M, spareBits)

 # Field substraction
 # ------------------------------------------------------------

-macro submod_gen[N: static int](R: var Limbs[N], A, B, m: Limbs[N]): untyped =
+macro submod_gen[N: static int](r_PIR: var Limbs[N], a_PIR, b_PIR, M_MEM: Limbs[N]): untyped =
   ## Generate an optimized modular addition kernel
   # Register pressure note:
   #   We could generate a kernel per modulus m by hardocing it as immediate
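Dropping {.noInline.} here (and on submod_asm and negmod2x_asm) follows the commit's "remove some inline now that we have lto": the pragma was a workaround for Clang exhausting registers when LTO inlined these kernels under the old pointer-heavy constraints. In Nim the pragma simply maps to the backend's noinline attribute, e.g. __attribute__((noinline)) on GCC/Clang, keeping a function a call boundary even under LTO:

    # Minimal illustration of the pragma being removed above.
    func addWordsNoInline(a, b: uint64): uint64 {.noInline.} =
      a + b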
@@ -192,21 +188,20 @@ macro submod_gen[N: static int](R: var Limbs[N], A, B, m: Limbs[N]): untyped =

   var ctx = init(Assembler_x86, BaseType)
   let
-    r = init(OperandArray, nimSymbol = R, N, PointerInReg, InputOutput)
-    # We reuse the reg used for b for overflow detection
-    b = init(OperandArray, nimSymbol = B, N, PointerInReg, InputOutput)
+    r = asmArray(r_PIR, N, PointerInReg, asmInputOutputEarlyClobber, memIndirect = memWrite) # MemOffsettable is the better constraint but compilers say it is impossible. Use early clobber to ensure it is not affected by constant propagation at slight pessimization (reloading it).
+    b = asmArray(b_PIR, N, PointerInReg, asmInputOutputEarlyClobber, memIndirect = memRead) # register reused for underflow detection
     # We could force m as immediate by specializing per moduli
-    M = init(OperandArray, nimSymbol = m, N, PointerInReg, Input)
+    M = asmArray(M_MEM, N, MemOffsettable, asmInput)
     # If N is too big, we need to spill registers. TODO.
-    u = init(OperandArray, nimSymbol = ident"U", N, ElemsInReg, InputOutput)
-    v = init(OperandArray, nimSymbol = ident"V", N, ElemsInReg, Output_EarlyClobber)
+    uSym = ident"u"
+    vSym = ident"v"
+    u = asmArray(uSym, N, ElemsInReg, asmInputOutput)
+    v = asmArray(vSym, N, ElemsInReg, asmOutputEarlyClobber)

-  let usym = u.nimSymbol
-  let vsym = v.nimSymbol
   result.add quote do:
-    var `usym`{.noinit.}, `vsym` {.noInit, used.}: typeof(`A`)
+    var `usym`{.noinit.}, `vsym` {.noInit, used.}: typeof(`a_PIR`)
     staticFor i, 0, `N`:
-      `usym`[i] = `A`[i]
+      `usym`[i] = `a_PIR`[i]

   # Substraction
   ctx.sub u[0], b[0]
@ -231,30 +226,37 @@ macro submod_gen[N: static int](R: var Limbs[N], A, B, m: Limbs[N]): untyped =
|
|||||||
ctx.adc u[i], v[i]
|
ctx.adc u[i], v[i]
|
||||||
ctx.mov r[i], u[i]
|
ctx.mov r[i], u[i]
|
||||||
|
|
||||||
result.add ctx.generate
|
result.add ctx.generate()
|
||||||
|
|
||||||
func submod_asm*(r: var Limbs, a, b, M: Limbs) {.noInline.} =
|
func submod_asm*(r: var Limbs, a, b, M: Limbs) =
|
||||||
## Constant-time modular substraction
|
## Constant-time modular substraction
|
||||||
## Warning, does not handle aliasing of a and b
|
## Warning, does not handle aliasing of a and b
|
||||||
# This MUST be noInline or Clang will run out of registers with LTO
|
|
||||||
submod_gen(r, a, b, M)
|
submod_gen(r, a, b, M)
|
||||||
|
|
||||||
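The kernel above computes u = a - b with a borrow chain (`sub`/`sbb`), builds v = M masked by the borrow, then adds v back (`adc u[i], v[i]`), keeping the sequence branchless and constant-time. A plain-C rendition of the same idea, a sketch only (4 limbs hardcoded, not the generated code):

#include <stdint.h>

typedef unsigned __int128 u128;

/* Constant-time r = a - b mod M. */
static void submod4(uint64_t r[4], const uint64_t a[4],
                    const uint64_t b[4], const uint64_t M[4]) {
  uint64_t u[4];
  uint64_t borrow = 0;
  for (int i = 0; i < 4; i++) {         /* u = a - b, propagating the borrow */
    u128 d = (u128)a[i] - b[i] - borrow;
    u[i] = (uint64_t)d;
    borrow = (uint64_t)(d >> 64) & 1;   /* 1 iff the subtraction wrapped */
  }
  uint64_t mask = 0 - borrow;           /* all-ones iff a < b */
  uint64_t carry = 0;
  for (int i = 0; i < 4; i++) {         /* add M back only on underflow */
    u128 s = (u128)u[i] + (M[i] & mask) + carry;
    r[i] = (uint64_t)s;
    carry = (uint64_t)(s >> 64);
  }
}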
# Field negation
# ------------------------------------------------------------

macro negmod_gen[N: static int](r_PIR: var Limbs[N], a_MEM, M_MEM: Limbs[N]): untyped =
  ## Generate an optimized modular negation kernel

  result = newStmtList()

  var ctx = init(Assembler_x86, BaseType)
  let
    a = asmArray(a_MEM, N, MemOffsettable, asmInput)
    r = asmArray(r_PIR, N, PointerInReg, asmInputOutputEarlyClobber, memIndirect = memWrite) # MemOffsettable is the better constraint but compilers say it is impossible. Use early clobber to ensure it is not affected by constant propagation at slight pessimization (reloading it).
    uSym = ident"u"
    u = asmArray(uSym, N, ElemsInReg, asmOutputEarlyClobber)
    # We could force m as immediate by specializing per moduli
    # We reuse the reg used for m for overflow detection
    M = asmArray(M_MEM, N, MemOffsettable, asmInput)

    isZeroSym = ident"isZero"
    isZero = asmValue(isZeroSym, Reg, asmOutputEarlyClobber)

  result.add quote do:
    var `usym`{.noinit, used.}: typeof(`a_MEM`)
    var `isZeroSym`{.noinit.}: BaseType

  # Substraction m - a
  ctx.mov u[0], M[0]
@ -264,7 +266,6 @@ macro negmod_gen[N: static int](R: var Limbs[N], A, m: Limbs[N]): untyped =
    ctx.sbb u[i], a[i]

  # Deal with a == 0
  ctx.mov isZero, a[0]
  for i in 1 ..< N:
    ctx.`or` isZero, a[i]
@ -274,11 +275,8 @@ macro negmod_gen[N: static int](R: var Limbs[N], A, m: Limbs[N]): untyped =
    ctx.cmovz u[i], isZero
    ctx.mov r[i], u[i]

  result.add ctx.generate()

func negmod_asm*(r: var Limbs, a, M: Limbs) =
  ## Constant-time modular negation
  negmod_gen(r, a, M)
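negmod computes M - a limb by limb, then handles a == 0 (whose negation must be 0, not M) by OR-ing all limbs of a into `isZero` and conditionally overwriting the result with `cmovz`. The same logic in branchless C, a sketch with illustrative names:

#include <stdint.h>

typedef unsigned __int128 u128;

/* Constant-time r = (M - a) mod M, i.e. 0 when a == 0. */
static void negmod4(uint64_t r[4], const uint64_t a[4], const uint64_t M[4]) {
  uint64_t isZero = a[0] | a[1] | a[2] | a[3];
  /* all-ones when a != 0, zero when a == 0; branchless like the cmovz above */
  uint64_t mask = 0 - (uint64_t)((isZero | (0 - isZero)) >> 63);
  uint64_t borrow = 0;
  for (int i = 0; i < 4; i++) {
    u128 d = (u128)M[i] - a[i] - borrow;
    r[i] = (uint64_t)d & mask;
    borrow = (uint64_t)(d >> 64) & 1;
  }
}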

@ -21,11 +21,6 @@ import
#
# ############################################################

static: doAssert UseASM_X86_64

# Necessary for the compiler to find enough registers
@ -37,7 +32,7 @@ static: doAssert UseASM_X86_64
# Fallback when no ADX and BMI2 support (MULX, ADCX, ADOX)
macro mulMont_CIOS_sparebit_gen[N: static int](
      r_PIR: var Limbs[N], a_PIR, b_PIR,
      M_MEM: Limbs[N], m0ninv_REG: BaseType,
      skipFinalSub: static bool): untyped =
  ## Generate an optimized Montgomery Multiplication kernel
  ## using the CIOS method
@ -58,29 +53,23 @@ macro mulMont_CIOS_sparebit_gen[N: static int](
    scratchSlots = 6

    # We could force M as immediate by specializing per moduli
    M = asmArray(M_MEM, N, MemOffsettable, asmInput)
    # If N is too big, we need to spill registers. TODO.
    tSym = ident"t"
    t = asmArray(tSym, N, ElemsInReg, asmOutputEarlyClobber)
    # MultiPurpose Register slots
    scratchSym = ident"scratch"
    scratch = asmArray(scratchSym, scratchSlots, ElemsInReg, asmInputOutputEarlyClobber)

    # MUL requires RAX and RDX

    m0ninv = asmValue(m0ninv_REG, Mem, asmInput)

    # We're really constrained by register and somehow setting as memory doesn't help
    # So we store the result `r` in the scratch space and then reload it in RDX
    # before the scratchspace is used in final substraction
    a = scratch[0].asArrayAddr(a_PIR, len = N, memIndirect = memRead) # Store the `a` operand
    b = scratch[1].asArrayAddr(b_PIR, len = N, memIndirect = memRead) # Store the `b` operand
    A = scratch[2] # High part of extended precision multiplication
    C = scratch[3]
    m = scratch[4] # Stores (t[0] * m0ninv) mod 2ʷ
@ -96,12 +85,10 @@ macro mulMont_CIOS_sparebit_gen[N: static int](
  # but this prevent reusing the same code for multiple curves like BLS12-377 and BLS12-381
  # We might be able to save registers by having `r` and `M` be memory operand as well

  result.add quote do:
    static: doAssert: sizeof(SecretWord) == sizeof(ByteAddress)

    var `tSym`{.noInit, used.}: typeof(`r_PIR`)
    # Assumes 64-bit limbs on 64-bit arch (or you can't store an address)
    var `scratchSym` {.noInit.}: Limbs[`scratchSlots`]
    `scratchSym`[0] = cast[SecretWord](`a_PIR`[0].unsafeAddr)
@ -172,26 +159,22 @@ macro mulMont_CIOS_sparebit_gen[N: static int](
  ctx.mov t[N-1], A

  ctx.mov rax, r # move r away from scratchspace that will be used for final substraction
  let r2 = rax.asArrayAddr(r_PIR, len = N, memIndirect = memWrite)

  if skipFinalSub:
    for i in 0 ..< N:
      ctx.mov r2[i], t[i]
  else:
    ctx.finalSubNoOverflowImpl(r2, t, M, scratch)
  result.add ctx.generate()

func mulMont_CIOS_sparebit_asm*(r: var Limbs, a, b, M: Limbs, m0ninv: BaseType, skipFinalSub: static bool = false) =
  ## Constant-time Montgomery multiplication
  ## If "skipFinalSub" is set
  ## the result is in the range [0, 2M)
  ## otherwise the result is in the range [0, M)
  ##
  ## This procedure can only be called if the modulus doesn't use the full bitwidth of its underlying representation
  r.mulMont_CIOS_sparebit_gen(a, b, M, m0ninv, skipFinalSub)

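For reference, the CIOS recurrence the generator emits: each outer round folds one word of `a` into the accumulator and immediately performs one Montgomery reduction step, so everything fits in N+1 limbs thanks to the spare bit. A word-oriented C sketch (4 limbs, final subtraction included; illustrative, not the generated kernel):

#include <stdint.h>

typedef unsigned __int128 u128;

/* Assumes M < 2^255 (one spare bit) and m0ninv * M[0] == -1 mod 2^64. */
static void montmul4(uint64_t r[4],
                     const uint64_t a[4], const uint64_t b[4],
                     const uint64_t M[4], uint64_t m0ninv) {
  uint64_t t[5] = {0, 0, 0, 0, 0};
  for (int i = 0; i < 4; i++) {
    /* t += a[i] * b */
    uint64_t c = 0;
    for (int j = 0; j < 4; j++) {
      u128 s = (u128)a[i] * b[j] + t[j] + c;
      t[j] = (uint64_t)s;
      c    = (uint64_t)(s >> 64);
    }
    t[4] += c;
    /* one reduction step: t = (t + m*M) / 2^64 */
    uint64_t m = t[0] * m0ninv;   /* makes t[0] + m*M[0] == 0 mod 2^64 */
    u128 s = (u128)m * M[0] + t[0];
    c = (uint64_t)(s >> 64);
    for (int j = 1; j < 4; j++) {
      s = (u128)m * M[j] + t[j] + c;
      t[j-1] = (uint64_t)s;
      c      = (uint64_t)(s >> 64);
    }
    u128 top = (u128)t[4] + c;
    t[3] = (uint64_t)top;
    t[4] = (uint64_t)(top >> 64);  /* stays 0: the spare bit keeps t < 2M */
  }
  /* final subtraction: r = t - M if t >= M, else t */
  uint64_t u[4], borrow = 0;
  for (int j = 0; j < 4; j++) {
    u128 d = (u128)t[j] - M[j] - borrow;
    u[j]   = (uint64_t)d;
    borrow = (uint64_t)(d >> 64) & 1;
  }
  uint64_t keep = 0 - borrow;      /* all-ones when t < M */
  for (int j = 0; j < 4; j++)
    r[j] = (t[j] & keep) | (u[j] & ~keep);
}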
# Montgomery Squaring
@ -212,7 +195,7 @@ func squareMont_CIOS_asm*[N](

macro sumprodMont_CIOS_spare2bits_gen[N, K: static int](
      r_PIR: var Limbs[N], a_PIR, b_PIR: array[K, Limbs[N]],
      M_MEM: Limbs[N], m0ninv_REG: BaseType,
      skipFinalSub: static bool): untyped =
  ## Generate an optimized Montgomery merged sum of products ⅀aᵢ.bᵢ kernel
  ## using the CIOS method
@ -242,29 +225,23 @@ macro sumprodMont_CIOS_spare2bits_gen[N, K: static int](
    scratchSlots = 6

    # We could force M as immediate by specializing per moduli
    M = asmArray(M_MEM, N, MemOffsettable, asmInput)
    # If N is too big, we need to spill registers. TODO.
    tSym = ident"t"
    t = asmArray(tSym, N, ElemsInReg, asmOutputEarlyClobber)
    # MultiPurpose Register slots
    scratchSym = ident"scratch"
    scratch = asmArray(scratchSym, scratchSlots, ElemsInReg, asmInputOutputEarlyClobber)

    # MUL requires RAX and RDX

    m0ninv = asmValue(m0ninv_REG, Mem, asmInput)

    # We're really constrained by register and somehow setting as memory doesn't help
    # So we store the result `r` in the scratch space and then reload it in RDX
    # before the scratchspace is used in final substraction
    a = scratch[0].as2dArrayAddr(a_PIR, rows = K, cols = N, memIndirect = memRead) # Store the `a` operand
    b = scratch[1].as2dArrayAddr(b_PIR, rows = K, cols = N, memIndirect = memRead) # Store the `b` operand
    tN = scratch[2] # High part of extended precision multiplication
    C = scratch[3] # Carry during reduction step
    r = scratch[4] # Stores the `r` operand
@ -280,9 +257,6 @@ macro sumprodMont_CIOS_spare2bits_gen[N, K: static int](
  # We can save 1 by hardcoding M as immediate (and m0ninv)
  # but this prevent reusing the same code for multiple curves like BLS12-377 and BLS12-381
  # We might be able to save registers by having `r` and `M` be memory operand as well

  result.add quote do:
    static: doAssert: sizeof(SecretWord) == sizeof(ByteAddress)

@ -377,7 +351,7 @@ macro sumprodMont_CIOS_spare2bits_gen[N, K: static int](

  ctx.mov rax, r # move r away from scratchspace that will be used for final substraction
  let r2 = rax.asArrayAddr(r_PIR, len = N, memIndirect = memWrite)

  if skipFinalSub:
    ctx.comment " Copy result"
@ -387,8 +361,7 @@ macro sumprodMont_CIOS_spare2bits_gen[N, K: static int](
    ctx.comment " Final substraction"
    ctx.finalSubNoOverflowImpl(
      r2, t, M,
      scratch)
  result.add ctx.generate()

func sumprodMont_CIOS_spare2bits_asm*[N, K: static int](
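Both multiplication paths end in `finalSubNoOverflowImpl`: with a spare bit the accumulator t stays below 2M, so one trial subtraction of M plus a conditional select suffices; the may-overflow variant used by `addmod` additionally folds a carry-out into the decision, which is why it takes a scratch register. The no-overflow case in C, sketch only:

#include <stdint.h>

typedef unsigned __int128 u128;

/* Assumes t < 2M, so t fits in N limbs and one trial subtraction suffices. */
static void final_sub_no_overflow(uint64_t r[4], const uint64_t t[4],
                                  const uint64_t M[4]) {
  uint64_t u[4], borrow = 0;
  for (int i = 0; i < 4; i++) {
    u128 d = (u128)t[i] - M[i] - borrow;
    u[i] = (uint64_t)d;
    borrow = (uint64_t)(d >> 64) & 1;
  }
  uint64_t keep = 0 - borrow;   /* borrow set means t < M: keep t */
  for (int i = 0; i < 4; i++)
    r[i] = (t[i] & keep) | (u[i] & ~keep);
}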

@ -21,11 +21,6 @@ import
#
# ############################################################

static: doAssert UseASM_X86_64

# MULX/ADCX/ADOX
@ -176,7 +171,7 @@ proc partialRedx(

macro mulMont_CIOS_sparebit_adx_gen[N: static int](
      r_PIR: var Limbs[N], a_PIR, b_PIR,
      M_MEM: Limbs[N], m0ninv_REG: BaseType,
      skipFinalSub: static bool): untyped =
  ## Generate an optimized Montgomery Multiplication kernel
  ## using the CIOS method
@ -193,18 +188,20 @@ macro mulMont_CIOS_sparebit_adx_gen[N: static int](
  let
    scratchSlots = 6

    r = asmArray(r_PIR, N, PointerInReg, asmInputOutputEarlyClobber, memIndirect = memWrite) # MemOffsettable is the better constraint but compilers say it is impossible. Use early clobber to ensure it is not affected by constant propagation at slight pessimization (reloading it). # Changing that to MemOffsetable triggers an error in negmod in test_bindings. Missing clobber?
    # We could force M as immediate by specializing per moduli
    M = asmArray(M_MEM, N, MemOffsettable, asmInput)
    # If N is too big, we need to spill registers. TODO.
    tSym = ident"t"
    t = asmArray(tSym, N, ElemsInReg, asmOutputEarlyClobber)
    # MultiPurpose Register slots
    scratchSym = ident"scratch"
    scratch = asmArray(scratchSym, scratchSlots, ElemsInReg, asmInputOutputEarlyClobber)

    # MULX requires RDX as well

    a = scratch[0].asArrayAddr(a_PIR, len = N, memIndirect = memRead) # Store the `a` operand
    b = scratch[1].asArrayAddr(b_PIR, len = N, memIndirect = memRead) # Store the `b` operand
    A = scratch[2] # High part of extended precision multiplication
    C = scratch[3]
    m0ninv = scratch[4] # Modular inverse of M[0]
@ -221,8 +218,6 @@ macro mulMont_CIOS_sparebit_adx_gen[N: static int](
  # but this prevent reusing the same code for multiple curves like BLS12-377 and BLS12-381
  # We might be able to save registers by having `r` and `M` be memory operand as well

  result.add quote do:
    static: doAssert: sizeof(SecretWord) == sizeof(ByteAddress)

@ -250,21 +245,18 @@ macro mulMont_CIOS_sparebit_adx_gen[N: static int](
        A, t,
        a,
        b[0],
        C)
    else:
      ctx.mulaccx_by_word(
        A, t,
        a, i,
        b[i],
        C)

    ctx.partialRedx(
      A, t,
      M, m0ninv,
      lo, C)

  if skipFinalSub:
    for i in 0 ..< N:
@ -272,19 +264,9 @@ macro mulMont_CIOS_sparebit_adx_gen[N: static int](
  else:
    ctx.finalSubNoOverflowImpl(
      r, t, M,
      scratch)

  result.add ctx.generate()

func mulMont_CIOS_sparebit_asm_adx*(r: var Limbs, a, b, M: Limbs, m0ninv: BaseType, skipFinalSub: static bool = false) =
  ## Constant-time Montgomery multiplication
@ -293,7 +275,7 @@ func mulMont_CIOS_sparebit_asm_adx*(r: var Limbs, a, b, M: Limbs, m0ninv: BaseTy
  ## otherwise the result is in the range [0, M)
  ##
  ## This procedure can only be called if the modulus doesn't use the full bitwidth of its underlying representation
  r.mulMont_CIOS_sparebit_adx_gen(a, b, M, m0ninv, skipFinalSub)

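The recurring `PointerInReg` + `asmInputOutputEarlyClobber` + `memIndirect` pattern is this commit's LTO workaround: declaring the pointer as an early-clobbered input-output (a "+&r"-style constraint) stops link-time constant propagation from folding a known address into the asm template and forces the compiler to keep, and reload, a live register. A C illustration of the constraint's shape, reduced to one word (my reading of the comments above, not the repo's code):

#include <stdint.h>

/* "+&r" keeps the address in a live, early-clobbered register (nothing gets
   constant-folded into the template); the "=m" dummy output declares exactly
   which memory the asm writes, so no blanket "memory" clobber is needed. */
static void store_zero(uint64_t *p) {
  __asm__("movq $0, (%[p])"
          : [p] "+&r"(p), "=m"(*(uint64_t (*)[1])p));
}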
# Montgomery Squaring
# ------------------------------------------------------------
@ -313,7 +295,7 @@ func squareMont_CIOS_asm_adx*[N](

macro sumprodMont_CIOS_spare2bits_adx_gen[N, K: static int](
      r_PIR: var Limbs[N], a_PIR, b_PIR: array[K, Limbs[N]],
      M_MEM: Limbs[N], m0ninv_REG: BaseType,
      skipFinalSub: static bool): untyped =
  ## Generate an optimized Montgomery merged sum of products ⅀aᵢ.bᵢ kernel
  ## using the CIOS method
@ -343,29 +325,23 @@ macro sumprodMont_CIOS_spare2bits_adx_gen[N, K: static int](
    scratchSlots = 6

    # We could force M as immediate by specializing per moduli
    M = asmArray(M_MEM, N, MemOffsettable, asmInput)
    # If N is too big, we need to spill registers. TODO.
    tSym = ident"t"
    t = asmArray(tSym, N, ElemsInReg, asmOutputEarlyClobber)
    # MultiPurpose Register slots
    scratchSym = ident"scratch"
    scratch = asmArray(scratchSym, scratchSlots, ElemsInReg, asmInputOutputEarlyClobber)

    # MULX requires RDX as well

    m0ninv = asmValue(m0ninv_REG, Mem, asmInput)

    # We're really constrained by register and somehow setting as memory doesn't help
    # So we store the result `r` in the scratch space and then reload it in RDX
    # before the scratchspace is used in final substraction
    a = scratch[0].as2dArrayAddr(a_PIR, rows = K, cols = N, memIndirect = memRead) # Store the `a` operand
    b = scratch[1].as2dArrayAddr(b_PIR, rows = K, cols = N, memIndirect = memRead) # Store the `b` operand
    tN = scratch[2] # High part of extended precision multiplication
    C = scratch[3] # Carry during reduction step
    r = scratch[4] # Stores the `r` operand
@ -382,8 +358,6 @@ macro sumprodMont_CIOS_spare2bits_adx_gen[N, K: static int](
  # but this prevent reusing the same code for multiple curves like BLS12-377 and BLS12-381
  # We might be able to save registers by having `r` and `M` be memory operand as well

  result.add quote do:
    static: doAssert: sizeof(SecretWord) == sizeof(ByteAddress)

@ -461,11 +435,10 @@ macro sumprodMont_CIOS_spare2bits_adx_gen[N, K: static int](
    ctx.partialRedx(
      tN, t,
      M, m0ninv,
      rax, C)

  ctx.mov rax, r # move r away from scratchspace that will be used for final substraction
  let r2 = rax.asArrayAddr(r_PIR, len = N, memIndirect = memWrite)

  if skipFinalSub:
    ctx.comment " Copy result"
@ -473,10 +446,7 @@ macro sumprodMont_CIOS_spare2bits_adx_gen[N, K: static int](
      ctx.mov r2[i], t[i]
  else:
    ctx.comment " Final substraction"
    ctx.finalSubNoOverflowImpl(r2, t, M, scratch)
  result.add ctx.generate()

func sumprodMont_CIOS_spare2bits_asm_adx*[N, K: static int](

@ -18,18 +18,13 @@ import
#
# ############################################################

static: doAssert UseASM_X86_64 # Need 8 registers just for mul
                               # and 32-bit only has 8 max.

# Multiplication
# -----------------------------------------------------------------------------------------------

macro mul_gen[rLen, aLen, bLen: static int](r_PIR: var Limbs[rLen], a_MEM: Limbs[aLen], b_MEM: Limbs[bLen]) =
  ## Comba multiplication generator
  ## `a`, `b`, `r` can have a different number of limbs
  ## if `r`.limbs.len < a.limbs.len + b.limbs.len
@ -42,54 +37,29 @@ macro mul_gen[rLen, aLen, bLen: static int](r: var Limbs[rLen], a: Limbs[aLen],

  var ctx = init(Assembler_x86, BaseType)
  let
    r = asmArray(r_PIR, rLen, PointerInReg, asmInputOutputEarlyClobber, memIndirect = memWrite) # MemOffsettable is the better constraint but compilers say it is impossible. Use early clobber to ensure it is not affected by constant propagation at slight pessimization (reloading it).
    a = asmArray(a_MEM, aLen, MemOffsettable, asmInput)
    b = asmArray(b_MEM, aLen, MemOffsettable, asmInput)

    tSym = ident"t"
    t = asmValue(tSym, Reg, asmOutputEarlyClobber)
    uSym = ident"u"
    u = asmValue(uSym, Reg, asmOutputEarlyClobber)
    vSym = ident"v"
    v = asmValue(vSym, Reg, asmOutputEarlyClobber)

  # MUL requires RAX and RDX

  # Prologue
  result.add quote do:
    var `tSym`{.noInit.}, `uSym`{.noInit.}, `vSym`{.noInit.}: BaseType

  # Zero-init
  ctx.`xor` u, u
  ctx.`xor` v, v
  ctx.`xor` t, t

  # Algorithm
  let stopEx = min(aLen+bLen, rLen)

  for i in 0 ..< stopEx:
@ -100,13 +70,13 @@ macro mul_gen[rLen, aLen, bLen: static int](r: var Limbs[rLen], a: Limbs[aLen],
      let ia = i - ib
      for j in 0 ..< min(aLen - ia, ib+1):
        # (t, u, v) <- (t, u, v) + a[ia+j] * b[ib-j]
        ctx.mov rax, b[ib-j]
        ctx.mul rdx, rax, a[ia+j], rax
        ctx.add v, rax
        ctx.adc u, rdx
        ctx.adc t, 0

    ctx.mov r[i], v

    if i != stopEx - 1:
      ctx.mov v, u
@ -116,10 +86,10 @@ macro mul_gen[rLen, aLen, bLen: static int](r: var Limbs[rLen], a: Limbs[aLen],
  if aLen+bLen < rLen:
    ctx.`xor` rax, rax
    for i in aLen+bLen ..< rLen:
      ctx.mov r[i], rax

  # Codegen
  result.add ctx.generate()

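The generator above is a product scan (Comba multiplication): for each output column i it accumulates every a[ia]*b[ib] with ia+ib == i into a three-word accumulator (t, u, v), stores v, then shifts the window down one word. Equivalent C for 4x4 limbs, as a sketch:

#include <stdint.h>

typedef unsigned __int128 u128;

static void comba_mul4(uint64_t r[8], const uint64_t a[4], const uint64_t b[4]) {
  uint64_t t = 0, u = 0, v = 0;
  for (int i = 0; i < 8; i++) {
    int lo = i < 4 ? 0 : i - 3;
    int hi = i < 4 ? i : 3;
    for (int ia = lo; ia <= hi; ia++) {
      u128 p = (u128)a[ia] * b[i - ia];
      u128 s = (u128)v + (uint64_t)p;                           /* add v, rax */
      v = (uint64_t)s;
      s = (u128)u + (uint64_t)(p >> 64) + (uint64_t)(s >> 64);  /* adc u, rdx */
      u = (uint64_t)s;
      t += (uint64_t)(s >> 64);                                 /* adc t, 0 */
    }
    r[i] = v;                      /* mov r[i], v */
    v = u; u = t; t = 0;           /* shift the accumulator window */
  }
}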
func mul_asm*[rLen, aLen, bLen: static int](r: var Limbs[rLen], a: Limbs[aLen], b: Limbs[bLen]) =
  ## Multi-precision Multiplication
@ -129,7 +99,7 @@ func mul_asm*[rLen, aLen, bLen: static int](r: var Limbs[rLen], a: Limbs[aLen],
# Squaring
# -----------------------------------------------------------------------------------------------

macro sqr_gen*[rLen, aLen: static int](r_PIR: var Limbs[rLen], a_MEM: Limbs[aLen]) =
  ## Comba squaring generator
  ## `a` and `r` can have a different number of limbs
  ## if `r`.limbs.len < a.limbs.len * 2
@ -142,51 +112,26 @@ macro sqr_gen*[rLen, aLen: static int](r: var Limbs[rLen], a: Limbs[aLen]) =

  var ctx = init(Assembler_x86, BaseType)
  let
    r = asmArray(r_PIR, rLen, PointerInReg, asmInputOutputEarlyClobber, memIndirect = memWrite) # MemOffsettable is the better constraint but compilers say it is impossible. Use early clobber to ensure it is not affected by constant propagation at slight pessimization (reloading it).
    a = asmArray(a_MEM, aLen, MemOffsettable, asmInput)

    tSym = ident"t"
    t = asmValue(tSym, Reg, asmOutputEarlyClobber)
    uSym = ident"u"
    u = asmValue(uSym, Reg, asmOutputEarlyClobber)
    vSym = ident"v"
    v = asmValue(vSym, Reg, asmOutputEarlyClobber)

  # Prologue
  result.add quote do:
    var `tSym`{.noInit.}, `uSym`{.noInit.}, `vSym`{.noInit.}: BaseType

  # Zero-init
  ctx.`xor` u, u
  ctx.`xor` v, v
  ctx.`xor` t, t

  # Algorithm
  let stopEx = min(aLen*2, rLen)

  for i in 0 ..< stopEx:
@ -200,8 +145,8 @@ macro sqr_gen*[rLen, aLen: static int](r: var Limbs[rLen], a: Limbs[aLen]) =
        let k2 = ib-j
        if k1 < k2:
          # (t, u, v) <- (t, u, v) + 2 * a[k1] * a[k2]
          ctx.mov rax, a[k2]
          ctx.mul rdx, rax, a[k1], rax
          ctx.add rax, rax
          ctx.adc rdx, rdx
          ctx.adc t, 0
@ -210,15 +155,15 @@ macro sqr_gen*[rLen, aLen: static int](r: var Limbs[rLen], a: Limbs[aLen]) =
          ctx.adc t, 0
        elif k1 == k2:
          # (t, u, v) <- (t, u, v) + a[k1] * a[k2]
          ctx.mov rax, a[k2]
          ctx.mul rdx, rax, a[k1], rax
          ctx.add v, rax
          ctx.adc u, rdx
          ctx.adc t, 0
        else:
          discard

    ctx.mov r[i], v

    if i != stopEx - 1:
      ctx.mov v, u
@ -228,10 +173,10 @@ macro sqr_gen*[rLen, aLen: static int](r: var Limbs[rLen], a: Limbs[aLen]) =
  if aLen*2 < rLen:
    ctx.`xor` rax, rax
    for i in aLen*2 ..< rLen:
      ctx.mov r[i], rax

  # Codegen
  result.add ctx.generate()

func square_asm*[rLen, aLen: static int](r: var Limbs[rLen], a: Limbs[aLen]) =
  ## Multi-precision Squaring
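Squaring halves the multiplication count: a cross term a[k1]*a[k2] with k1 < k2 appears twice in the square, so the kernel computes it once and doubles it (`add rax, rax` / `adc rdx, rdx`, with the `adc t, 0` pairs catching both carry-outs) before accumulating. One such column step in C, sketch only:

#include <stdint.h>

typedef unsigned __int128 u128;

/* Accumulate 2 * a[k1] * a[k2] (k1 < k2) into the (t, u, v) window. */
static void acc_double(uint64_t *t, uint64_t *u, uint64_t *v,
                       uint64_t ak1, uint64_t ak2) {
  u128 p = (u128)ak1 * ak2;
  uint64_t plo = (uint64_t)p, phi = (uint64_t)(p >> 64);
  uint64_t top = phi >> 63;            /* bit shifted out by the doubling */
  phi = (phi << 1) | (plo >> 63);
  plo <<= 1;
  u128 s = (u128)*v + plo;
  *v = (uint64_t)s;
  s = (u128)*u + phi + (uint64_t)(s >> 64);
  *u = (uint64_t)s;
  *t += top + (uint64_t)(s >> 64);
}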

@ -18,11 +18,6 @@ import
#
# ############################################################

static: doAssert UseASM_X86_64

# MULX/ADCX/ADOX
@ -108,7 +103,7 @@ proc mulaccx_by_word(
    ctx.adcx hi, rdx
    ctx.adox hi, rdx

macro mulx_gen[rLen, aLen, bLen: static int](r_PIR: var Limbs[rLen], a_MEM: Limbs[aLen], b_MEM: Limbs[bLen]) =
  ## `a`, `b`, `r` can have a different number of limbs
  ## if `r`.limbs.len < a.limbs.len + b.limbs.len
  ## The result will be truncated, i.e. it will be
@ -120,35 +115,33 @@ macro mulx_gen[rLen, aLen, bLen: static int](r_PIR: var Limbs[rLen], a_PIR: Limb

  var ctx = init(Assembler_x86, BaseType)
  let
    r = asmArray(r_PIR, rLen, PointerInReg, asmInputOutputEarlyClobber, memIndirect = memWrite) # MemOffsettable is the better constraint but compilers say it is impossible. Use early clobber to ensure it is not affected by constant propagation at slight pessimization (reloading it).
    a = asmArray(a_MEM, aLen, MemOffsettable, asmInput)
    b = asmArray(b_MEM, bLen, MemOffsettable, asmInput)

    # MULX requires RDX

    tSym = ident"t"
    tSlots = aLen+1 # Extra for high word

  var # If aLen is too big, we need to spill registers. TODO.
    t = asmArray(tSym, tSlots, ElemsInReg, asmOutputEarlyClobber)

  # Prologue
  result.add quote do:
    var `tSym`{.noInit, used.}: array[`tSlots`, BaseType]

  for i in 0 ..< min(rLen, bLen):
    if i == 0:
      ctx.mulx_by_word(
        r[0],
        a, t,
        b[0])
    else:
      ctx.mulaccx_by_word(
        r, i,
        a, t,
        b[i])

    t.rotateLeft()

@ -163,20 +156,13 @@ macro mulx_gen[rLen, aLen, bLen: static int](r_PIR: var Limbs[rLen], a_PIR: Limb
    ctx.mov r[i], rax

  # Codegen
  result.add ctx.generate()

func mul_asm_adx*[rLen, aLen, bLen: static int](
       r: var Limbs[rLen], a: Limbs[aLen], b: Limbs[bLen]) =
  ## Multi-precision Multiplication
  ## Assumes r doesn't alias a or b
  mulx_gen(r, a, b)

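On BMI2/ADX hardware the same product scan uses MULX, which writes hi:lo without touching flags, so ADCX (carry flag) and ADOX (overflow flag) can run two independent carry chains interleaved. A C sketch of one a*b0 row with the BMI2 intrinsics; the real kernel threads the sums through both flag chains, while this sketch uses a single _addcarry_u64 chain just to show the dataflow (compile with -mbmi2; illustrative only):

#include <stdint.h>
#include <immintrin.h>

static void mul4_by_word(uint64_t t[5], const uint64_t a[4], uint64_t b0) {
  unsigned long long hi0, hi1, hi2, hi3, s1, s2, s3;
  unsigned long long lo0 = _mulx_u64(a[0], b0, &hi0);  /* flag-free multiply */
  unsigned long long lo1 = _mulx_u64(a[1], b0, &hi1);
  unsigned long long lo2 = _mulx_u64(a[2], b0, &hi2);
  unsigned long long lo3 = _mulx_u64(a[3], b0, &hi3);
  unsigned char c;
  c = _addcarry_u64(0, lo1, hi0, &s1);
  c = _addcarry_u64(c, lo2, hi1, &s2);
  c = _addcarry_u64(c, lo3, hi2, &s3);
  t[0] = lo0; t[1] = s1; t[2] = s2; t[3] = s3;
  t[4] = hi3 + c;
}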
# Squaring
# -----------------------------------------------------------------------------------------------
@ -558,7 +544,7 @@ func sqrx_gen6L(ctx: var Assembler_x86, r, a: OperandArray, t: var OperandArray)
  merge_diag_and_partsum(r, a, hi1, lo1, zero, 4)
  merge_diag_and_partsum(r, a, hi2, lo2, zero, 5)

macro sqrx_gen*[rLen, aLen: static int](r_PIR: var Limbs[rLen], a_MEM: Limbs[aLen]) =
  ## Squaring
  ## `a` and `r` can have a different number of limbs
  ## if `r`.limbs.len < a.limbs.len * 2
@ -575,21 +561,20 @@ macro sqrx_gen*[rLen, aLen: static int](r_PIR: var Limbs[rLen], a_PIR: Limbs[aLe
    # t = 2 * a.len = 12
    # We use the full x86 register set.

    r = asmArray(r_PIR, rLen, PointerInReg, asmInputOutputEarlyClobber, memIndirect = memWrite) # MemOffsettable is the better constraint but compilers say it is impossible. Use early clobber to ensure it is not affected by constant propagation at slight pessimization (reloading it).
    a = asmArray(a_MEM, aLen, MemOffsettable, asmInput)

    # MULX requires RDX
    tSym = ident"t"
    tSlots = aLen+1 # Extra for high word

  var # If aLen is too big, we need to spill registers. TODO.
    t = asmArray(tSym, tSlots, ElemsInReg, asmOutputEarlyClobber)

  # Prologue
  # -------------------------------
  result.add quote do:
    var `tSym`{.noInit, used.}: array[`tSlots`, BaseType]

  if aLen == 4:
    ctx.sqrx_gen4L(r, a, t)
@ -599,7 +584,7 @@ macro sqrx_gen*[rLen, aLen: static int](r_PIR: var Limbs[rLen], a_PIR: Limbs[aLe
    error: "Not implemented"

  # Codegen
  result.add ctx.generate()

func square_asm_adx*[rLen, aLen: static int](r: var Limbs[rLen], a: Limbs[aLen]) =
  ## Multi-precision Squaring

@ -31,7 +31,7 @@ static: doAssert UseASM_X86_32
macro redc2xMont_gen*[N: static int](
      r_PIR: var array[N, SecretWord],
      a_PIR: array[N*2, SecretWord],
      M_MEM: array[N, SecretWord],
      m0ninv_REG: BaseType,
      spareBits: static int, skipFinalSub: static bool) =
  # No register spilling handling
@ -46,28 +46,27 @@ macro redc2xMont_gen*[N: static int](
  # so we store everything in scratchspaces restoring as needed
  let
    # We could force M as immediate by specializing per moduli
    M = asmArray(M_MEM, N, MemOffsettable, asmInput)
    # MUL requires RAX and RDX

  let uSlots = N+2
  let vSlots = max(N-2, 3)
  let uSym = ident"u"
  let vSym = ident"v"
  var # Scratchspaces
    u = asmArray(uSym, uSlots, ElemsInReg, asmInputOutputEarlyClobber)
    v = asmArray(vSym, vSlots, ElemsInReg, asmInputOutputEarlyClobber)

  # Prologue
  result.add quote do:
    var `uSym`{.noinit, used.}: Limbs[`uSlots`]
    var `vSym` {.noInit.}: Limbs[`vSlots`]
    `vSym`[0] = cast[SecretWord](`r_PIR`[0].unsafeAddr)
    `vSym`[1] = cast[SecretWord](`a_PIR`[0].unsafeAddr)
    `vSym`[2] = SecretWord(`m0ninv_REG`)

  let r_temp = v[0].asArrayAddr(r_PIR, len = N, memIndirect = memWrite)
  let a = v[1].asArrayAddr(a_PIR, len = 2*N, memIndirect = memRead)
  let m0ninv = v[2]

  # Algorithm
@ -137,7 +136,7 @@ macro redc2xMont_gen*[N: static int](

  if not(spareBits >= 2 and skipFinalSub):
    ctx.mov rdx, r_temp
    let r = rdx.asArrayAddr(r_PIR, len = N, memIndirect = memWrite)

    # This does a[i+n] += hi
    # but in a separate carry chain, fused with the
@ -157,7 +156,7 @@ macro redc2xMont_gen*[N: static int](
  elif spareBits >= 1:
    ctx.finalSubNoOverflowImpl(r, u, M, t)
  else:
    ctx.finalSubMayOverflowImpl(r, u, M, t)

  # Code generation
  result.add ctx.generate()
@ -168,9 +167,8 @@ func redcMont_asm*[N: static int](
    M: array[N, SecretWord],
    m0ninv: BaseType,
    spareBits: static int,
    skipFinalSub: static bool) =
  ## Constant-time Montgomery reduction
  static: doAssert UseASM_X86_64, "This requires x86-64."
  redc2xMont_gen(r, a, M, m0ninv, spareBits, skipFinalSub)

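redc2xMont reduces a double-width product: each of the N rounds picks m = t[i] * m0ninv mod 2^64 so that adding m*M cancels limb i, and the carries into the high half run in the separate `a[i+n] += hi` chain noted above. In C, a 4-limb sketch with the final subtraction omitted (illustrative, not the generated kernel):

#include <stdint.h>

typedef unsigned __int128 u128;

/* r = a / 2^(64*4) mod M, up to one final subtraction. m0ninv = -M[0]^-1 mod 2^64. */
static void redc2x_4(uint64_t r[4], const uint64_t a[8],
                     const uint64_t M[4], uint64_t m0ninv) {
  uint64_t t[8];
  for (int i = 0; i < 8; i++) t[i] = a[i];
  uint64_t carry = 0;                   /* the separate a[i+n] += hi chain */
  for (int i = 0; i < 4; i++) {
    uint64_t m = t[i] * m0ninv;         /* chosen so limb i cancels */
    uint64_t c = 0;
    for (int j = 0; j < 4; j++) {
      u128 s = (u128)m * M[j] + t[i + j] + c;
      t[i + j] = (uint64_t)s;
      c = (uint64_t)(s >> 64);
    }
    u128 s = (u128)t[i + 4] + c + carry;
    t[i + 4] = (uint64_t)s;
    carry = (uint64_t)(s >> 64);
  }
  for (int i = 0; i < 4; i++) r[i] = t[i + 4];  /* result, possibly >= M */
}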
@ -179,7 +177,7 @@ func redcMont_asm*[N: static int](

macro mulMont_by_1_gen[N: static int](
      t_EIR: var array[N, SecretWord],
      M_MEM: array[N, SecretWord],
      m0ninv_REG: BaseType) =

  # No register spilling handling
@ -192,34 +190,22 @@ macro mulMont_by_1_gen[N: static int](
  # RAX and RDX are defacto used due to the MUL instructions
  # so we store everything in scratchspaces restoring as needed
  let
    t = asmArray(t_EIR, N, ElemsInReg, asmInputOutputEarlyClobber)
    # We could force M as immediate by specializing per moduli
    M = asmArray(M_MEM, N, MemOffsettable, asmInput)

    # MUL requires RAX and RDX

    m0ninv = asmValue(m0ninv_REG, Mem, asmInput)
    Csym = ident"C"
    C = asmValue(Csym, Reg, asmOutputEarlyClobber) # Stores the high-part of muliplication
    mSym = ident"m"
    m = asmValue(msym, Reg, asmOutputEarlyClobber) # Stores (t[0] * m0ninv) mod 2ʷ

  # Copy a in t
  result.add quote do:
    var `Csym` {.noInit, used.}: BaseType
    var `mSym` {.noInit, used.}: BaseType

  # Algorithm
  # ---------------------------------------------------------
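mulMont_by_1 is the same reduction with an implicit multiplicand of 1: it divides by 2^(64*N) mod M, i.e. converts a value out of the Montgomery domain, which is why the two scalar temporaries C and m above replace the old scratch array. A C sketch under the same assumptions as the reduction sketch earlier (illustrative only):

#include <stdint.h>

typedef unsigned __int128 u128;

/* In-place t = t / 2^(64*4) mod M: each round cancels the low limb and shifts
   down. At most one conditional subtraction is still needed to land in [0, M). */
static void from_mont4(uint64_t t[4], const uint64_t M[4], uint64_t m0ninv) {
  for (int i = 0; i < 4; i++) {
    uint64_t m = t[0] * m0ninv;         /* zero the low limb */
    u128 s = (u128)m * M[0] + t[0];
    uint64_t c = (uint64_t)(s >> 64);
    for (int j = 1; j < 4; j++) {
      s = (u128)m * M[j] + t[j] + c;
      t[j - 1] = (uint64_t)s;
      c = (uint64_t)(s >> 64);
    }
    t[3] = c;
  }
}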
|
|||||||
@ -35,7 +35,7 @@ static: doAssert UseASM_X86_64
|
|||||||
macro redc2xMont_adx_gen[N: static int](
|
macro redc2xMont_adx_gen[N: static int](
|
||||||
r_PIR: var array[N, SecretWord],
|
r_PIR: var array[N, SecretWord],
|
||||||
a_PIR: array[N*2, SecretWord],
|
a_PIR: array[N*2, SecretWord],
|
||||||
M_PIR: array[N, SecretWord],
|
M_MEM: array[N, SecretWord],
|
||||||
m0ninv_REG: BaseType,
|
m0ninv_REG: BaseType,
|
||||||
spareBits: static int, skipFinalSub: static bool) =
|
spareBits: static int, skipFinalSub: static bool) =
|
||||||
|
|
||||||
@ -45,30 +45,28 @@ macro redc2xMont_adx_gen[N: static int](
|
|||||||
result = newStmtList()
|
result = newStmtList()
|
||||||
|
|
||||||
var ctx = init(Assembler_x86, BaseType)
|
var ctx = init(Assembler_x86, BaseType)
|
||||||
let
|
let M = asmArray(M_MEM, N, MemOffsettable, asmInput)
|
||||||
# We could force M as immediate by specializing per moduli
|
|
||||||
M = init(OperandArray, nimSymbol = M_PIR, N, PointerInReg, Input)
|
|
||||||
|
|
||||||
let uSlots = N+1
|
let uSlots = N+1
|
||||||
let vSlots = max(N-1, 5)
|
let vSlots = max(N-1, 5)
|
||||||
|
let uSym = ident"u"
|
||||||
|
let vSym = ident"v"
|
||||||
|
|
||||||
var # Scratchspaces
|
var # Scratchspaces
|
||||||
u = init(OperandArray, nimSymbol = ident"U", uSlots, ElemsInReg, InputOutput_EnsureClobber)
|
u = asmArray(uSym, uSlots, ElemsInReg, asmInputOutputEarlyClobber)
|
||||||
v = init(OperandArray, nimSymbol = ident"V", vSlots, ElemsInReg, InputOutput_EnsureClobber)
|
v = asmArray(vSym, vSlots, ElemsInReg, asmInputOutputEarlyClobber)
|
||||||
|
|
||||||
# Prologue
|
# Prologue
|
||||||
let usym = u.nimSymbol
|
|
||||||
let vsym = v.nimSymbol
|
|
||||||
result.add quote do:
|
result.add quote do:
|
||||||
static: doAssert: sizeof(SecretWord) == sizeof(ByteAddress)
|
static: doAssert: sizeof(SecretWord) == sizeof(ByteAddress)
|
||||||
var `usym`{.noinit, used.}: Limbs[`uSlots`]
|
var `uSym`{.noinit, used.}: Limbs[`uSlots`]
|
||||||
var `vsym` {.noInit.}: Limbs[`vSlots`]
|
var `vSym` {.noInit.}: Limbs[`vSlots`]
|
||||||
`vsym`[0] = cast[SecretWord](`r_PIR`[0].unsafeAddr)
|
`vSym`[0] = cast[SecretWord](`r_PIR`[0].unsafeAddr)
|
||||||
`vsym`[1] = cast[SecretWord](`a_PIR`[0].unsafeAddr)
|
`vSym`[1] = cast[SecretWord](`a_PIR`[0].unsafeAddr)
|
||||||
`vsym`[2] = SecretWord(`m0ninv_REG`)
|
`vSym`[2] = SecretWord(`m0ninv_REG`)
|
||||||
|
|
||||||
let r_temp = v[0].asArrayAddr(len = N)
|
let r_temp = v[0]
|
||||||
let a = v[1].asArrayAddr(len = 2*N)
|
let a = v[1].asArrayAddr(a_PIR, len = 2*N, memIndirect = memRead)
|
||||||
let m0ninv = v[2]
|
let m0ninv = v[2]
|
||||||
let lo = v[3]
|
let lo = v[3]
|
||||||
let hi = v[4]
|
let hi = v[4]
|
||||||
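A note on the prologue above: rather than pinning `r`, `a` and `m0ninv` each to a dedicated register, their pointers and value are stashed in the `v` scratch limbs and reloaded on demand, freeing registers for the hot loop. A minimal self-contained sketch of that round-trip, using plain `uint64` in place of `SecretWord` (illustrative names, not the library's API):

var x = [1'u64, 2'u64, 3'u64]
let slot = cast[uint](x[0].unsafeAddr)  # stash the address in an ordinary word slot
let p = cast[ptr uint64](slot)          # later: recover the pointer from the slot
doAssert p[] == 1'u64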
@ -116,7 +114,7 @@ macro redc2xMont_adx_gen[N: static int](
|
|||||||
u.rotateLeft()
|
u.rotateLeft()
|
||||||
|
|
||||||
ctx.mov rdx, r_temp
|
ctx.mov rdx, r_temp
|
||||||
let r = rdx.asArrayAddr(len = N)
|
let r = rdx.asArrayAddr(r_PIR, len = N, memIndirect = memWrite)
|
||||||
|
|
||||||
# This does a[i+n] += hi
|
# This does a[i+n] += hi
|
||||||
# but in a separate carry chain, fused with the
|
# but in a separate carry chain, fused with the
|
||||||
@ -135,7 +133,7 @@ macro redc2xMont_adx_gen[N: static int](
|
|||||||
elif spareBits >= 1:
|
elif spareBits >= 1:
|
||||||
ctx.finalSubNoOverflowImpl(r, u, M, t)
|
ctx.finalSubNoOverflowImpl(r, u, M, t)
|
||||||
else:
|
else:
|
||||||
ctx.finalSubMayOverflowImpl(r, u, M, t, hi)
|
ctx.finalSubMayOverflowImpl(r, u, M, t)
|
||||||
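For readers unfamiliar with the `finalSub*` family selected above: Montgomery reduction leaves its result below 2M (or within the spare-bit bound), and one conditional subtraction normalizes it into [0, M); `skipFinalSub` elides that step when the caller tolerates the looser bound. A single-limb illustration of the semantics (branching version for clarity only; the asm kernels do this in constant time):

func finalSubRef(t, M: uint64): uint64 =
  # Assumes t in [0, 2M): one conditional subtraction lands in [0, M).
  if t >= M: t - M else: t

doAssert finalSubRef(13'u64, 10'u64) == 3'u64
doAssert finalSubRef(7'u64, 10'u64) == 7'u64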
|
|
||||||
# Code generation
|
# Code generation
|
||||||
result.add ctx.generate()
|
result.add ctx.generate()
|
||||||
@ -146,7 +144,7 @@ func redcMont_asm_adx*[N: static int](
|
|||||||
M: array[N, SecretWord],
|
M: array[N, SecretWord],
|
||||||
m0ninv: BaseType,
|
m0ninv: BaseType,
|
||||||
spareBits: static int,
|
spareBits: static int,
|
||||||
skipFinalSub: static bool = false) {.noInline.} =
|
skipFinalSub: static bool = false) =
|
||||||
## Constant-time Montgomery reduction
|
## Constant-time Montgomery reduction
|
||||||
# Inlining redcMont_asm_adx twice in mul_fp2_complex_asm_adx
|
# Inlining redcMont_asm_adx twice in mul_fp2_complex_asm_adx
|
||||||
# causes GCC to miscompile with -Os (--opt:size)
|
# causes GCC to miscompile with -Os (--opt:size)
|
||||||
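As a reminder of what this reduction computes (standard Montgomery reduction, stated for orientation rather than taken from the diff): given a double-width input $a < M \cdot R$ with $R = 2^{wN}$, `redcMont_asm_adx` returns

$r = a \cdot R^{-1} \bmod M$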
@ -158,7 +156,7 @@ func redcMont_asm_adx*[N: static int](
|
|||||||
|
|
||||||
macro mulMont_by_1_adx_gen[N: static int](
|
macro mulMont_by_1_adx_gen[N: static int](
|
||||||
t_EIR: var array[N, SecretWord],
|
t_EIR: var array[N, SecretWord],
|
||||||
M_PIR: array[N, SecretWord],
|
M_MEM: array[N, SecretWord],
|
||||||
m0ninv_REG: BaseType) =
|
m0ninv_REG: BaseType) =
|
||||||
|
|
||||||
# No register spilling handling
|
# No register spilling handling
|
||||||
@ -171,33 +169,20 @@ macro mulMont_by_1_adx_gen[N: static int](
|
|||||||
# RAX and RDX are de facto used due to the MUL instructions
|
# RAX and RDX are de facto used due to the MUL instructions
|
||||||
# so we store everything in scratchspaces restoring as needed
|
# so we store everything in scratchspaces restoring as needed
|
||||||
let
|
let
|
||||||
scratchSlots = 1
|
t = asmArray(t_EIR, N, ElemsInReg, asmInputOutputEarlyClobber)
|
||||||
|
|
||||||
t = init(OperandArray, nimSymbol = t_EIR, N, ElemsInReg, InputOutput_EnsureClobber)
|
|
||||||
# We could force M as immediate by specializing per moduli
|
# We could force M as immediate by specializing per moduli
|
||||||
M = init(OperandArray, nimSymbol = M_PIR, N, PointerInReg, Input)
|
M = asmArray(M_MEM, N, MemOffsettable, asmInput)
|
||||||
# MultiPurpose Register slots
|
|
||||||
scratch = init(OperandArray, nimSymbol = ident"scratch", scratchSlots, ElemsInReg, InputOutput_EnsureClobber)
|
|
||||||
|
|
||||||
# MUL requires RAX and RDX
|
# MUL requires RAX and RDX
|
||||||
|
|
||||||
m0ninv = Operand(
|
m0ninv = asmValue(m0ninv_REG, Mem, asmInput)
|
||||||
desc: OperandDesc(
|
|
||||||
asmId: "[m0ninv]",
|
|
||||||
nimSymbol: m0ninv_REG,
|
|
||||||
rm: MemOffsettable,
|
|
||||||
constraint: Input,
|
|
||||||
cEmit: "&" & $m0ninv_REG
|
|
||||||
)
|
|
||||||
)
|
|
||||||
|
|
||||||
C = scratch[0] # Stores the high-part of multiplication
|
Csym = ident"C"
|
||||||
|
C = asmValue(Csym, Reg, asmOutputEarlyClobber) # Stores the high-part of multiplication
|
||||||
let scratchSym = scratch.nimSymbol
|
|
||||||
|
|
||||||
# Copy a in t
|
# Copy a in t
|
||||||
result.add quote do:
|
result.add quote do:
|
||||||
var `scratchSym` {.noInit, used.}: Limbs[`scratchSlots`]
|
var `Csym` {.noInit, used.}: BaseType
|
||||||
|
|
||||||
# Algorithm
|
# Algorithm
|
||||||
# ---------------------------------------------------------
|
# ---------------------------------------------------------
|
||||||
|
|||||||
@ -18,74 +18,45 @@ import
|
|||||||
#
|
#
|
||||||
# ############################################################
|
# ############################################################
|
||||||
|
|
||||||
# Note: We can refer to at most 30 registers in inline assembly
|
|
||||||
# and "InputOutput" registers count double
|
|
||||||
# They are nice to let the compiler deal with mov
|
|
||||||
# but too constraining so we move things ourselves.
|
|
||||||
|
|
||||||
static: doAssert UseASM_X86_32
|
static: doAssert UseASM_X86_32
|
||||||
|
|
||||||
# Copy
|
# Copy
|
||||||
# ------------------------------------------------------------
|
# ------------------------------------------------------------
|
||||||
macro ccopy_gen[N: static int](a: var Limbs[N], b: Limbs[N], ctl: SecretBool): untyped =
|
macro ccopy_gen[N: static int](a_PIR: var Limbs[N], b_MEM: Limbs[N], ctl: SecretBool): untyped =
|
||||||
## Generate an optimized conditional copy kernel
|
## Generate an optimized conditional copy kernel
|
||||||
result = newStmtList()
|
result = newStmtList()
|
||||||
|
|
||||||
var ctx = init(Assembler_x86, BaseType)
|
var ctx = init(Assembler_x86, BaseType)
|
||||||
|
|
||||||
let
|
let
|
||||||
arrA = init(OperandArray, nimSymbol = a, N, PointerInReg, InputOutput)
|
a = asmArray(a_PIR, N, PointerInReg, asmInputOutputEarlyClobber, memIndirect = memReadWrite) # MemOffsettable would be the better constraint but compilers reject it as impossible. Use early clobber so the operand is not affected by constant propagation, at the cost of a slight pessimization (reloading the pointer).
|
||||||
arrB = init(OperandArray, nimSymbol = b, N, PointerInReg, Input)
|
b = asmArray(b_MEM, N, MemOffsettable, asmInput)
|
||||||
|
|
||||||
control = Operand(
|
control = asmValue(ctl, Reg, asmInput)
|
||||||
desc: OperandDesc(
|
|
||||||
asmId: "[ctl]",
|
t0Sym = ident"t0"
|
||||||
nimSymbol: ctl,
|
t1Sym = ident"t1"
|
||||||
rm: Reg,
|
|
||||||
constraint: Input,
|
|
||||||
cEmit: "ctl"
|
|
||||||
)
|
|
||||||
)
|
|
||||||
|
|
||||||
var # Swappable registers to break dependency chains
|
var # Swappable registers to break dependency chains
|
||||||
t0 = Operand(
|
t0 = asmValue(t0Sym, Reg, asmOutputEarlyClobber)
|
||||||
desc: OperandDesc(
|
t1 = asmValue(t1Sym, Reg, asmOutputEarlyClobber)
|
||||||
asmId: "[t0]",
|
|
||||||
nimSymbol: ident"t0",
|
|
||||||
rm: Reg,
|
|
||||||
constraint: Output_EarlyClobber,
|
|
||||||
cEmit: "t0"
|
|
||||||
)
|
|
||||||
)
|
|
||||||
|
|
||||||
t1 = Operand(
|
|
||||||
desc: OperandDesc(
|
|
||||||
asmId: "[t1]",
|
|
||||||
nimSymbol: ident"t1",
|
|
||||||
rm: Reg,
|
|
||||||
constraint: Output_EarlyClobber,
|
|
||||||
cEmit: "t1"
|
|
||||||
)
|
|
||||||
)
|
|
||||||
|
|
||||||
# Prologue
|
# Prologue
|
||||||
let t0sym = t0.desc.nimSymbol
|
|
||||||
let t1sym = t1.desc.nimSymbol
|
|
||||||
result.add quote do:
|
result.add quote do:
|
||||||
var `t0sym`{.noinit.}, `t1sym`{.noinit.}: BaseType
|
var `t0sym`{.noinit.}, `t1sym`{.noinit.}: BaseType
|
||||||
|
|
||||||
# Algorithm
|
# Algorithm
|
||||||
ctx.test control, control
|
ctx.test control, control
|
||||||
for i in 0 ..< N:
|
for i in 0 ..< N:
|
||||||
ctx.mov t0, arrA[i]
|
ctx.mov t0, a[i]
|
||||||
ctx.cmovnz t0, arrB[i]
|
ctx.cmovnz t0, b[i]
|
||||||
ctx.mov arrA[i], t0
|
ctx.mov a[i], t0
|
||||||
swap(t0, t1)
|
swap(t0, t1)
|
||||||
|
|
||||||
# Codegen
|
# Codegen
|
||||||
result.add ctx.generate()
|
result.add ctx.generate()
|
||||||
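The generated conditional-copy kernel has simple semantics; a branching reference version for clarity (illustrative only, since the whole point of the `test`/`cmovnz` chain is to avoid this data-dependent branch):

func ccopyRef(a: var openArray[uint64], b: openArray[uint64], ctl: bool) =
  # Reference semantics: if ctl, copy b into a; otherwise leave a untouched.
  if ctl:
    for i in 0 ..< a.len:
      a[i] = b[i]

var a = [1'u64, 2'u64]
ccopyRef(a, [9'u64, 9'u64], true)
doAssert a == [9'u64, 9'u64]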
|
|
||||||
func ccopy_asm*(a: var Limbs, b: Limbs, ctl: SecretBool) {.inline.}=
|
func ccopy_asm*(a: var Limbs, b: Limbs, ctl: SecretBool) =
|
||||||
## Constant-time conditional copy
|
## Constant-time conditional copy
|
||||||
## If ctl is true: b is copied into a
|
## If ctl is true: b is copied into a
|
||||||
## if ctl is false: b is not copied and a is untouched
|
## if ctl is false: b is not copied and a is untouched
|
||||||
@ -95,121 +66,89 @@ func ccopy_asm*(a: var Limbs, b: Limbs, ctl: SecretBool) {.inline.}=
|
|||||||
# Addition
|
# Addition
|
||||||
# ------------------------------------------------------------
|
# ------------------------------------------------------------
|
||||||
|
|
||||||
macro add_gen[N: static int](carry: var Carry, r: var Limbs[N], a, b: Limbs[N]): untyped =
|
macro add_gen[N: static int](carry: var Carry, r_PIR: var Limbs[N], a_MEM, b_MEM: Limbs[N]): untyped =
|
||||||
## Generate an optimized out-of-place addition kernel
|
## Generate an optimized out-of-place addition kernel
|
||||||
|
|
||||||
result = newStmtList()
|
result = newStmtList()
|
||||||
|
|
||||||
var ctx = init(Assembler_x86, BaseType)
|
var ctx = init(Assembler_x86, BaseType)
|
||||||
let
|
let
|
||||||
arrR = init(OperandArray, nimSymbol = r, N, PointerInReg, InputOutput)
|
r = asmArray(r_PIR, N, PointerInReg, asmInputOutputEarlyClobber, memIndirect = memWrite) # MemOffsettable would be the better constraint but compilers reject it as impossible. Use early clobber so the operand is not affected by constant propagation, at the cost of a slight pessimization (reloading the pointer).
|
||||||
arrA = init(OperandArray, nimSymbol = a, N, PointerInReg, Input)
|
a = asmArray(a_MEM, N, MemOffsettable, asmInput)
|
||||||
arrB = init(OperandArray, nimSymbol = b, N, PointerInReg, Input)
|
b = asmArray(b_MEM, N, MemOffsettable, asmInput)
|
||||||
|
|
||||||
|
t0Sym = ident"t0"
|
||||||
|
t1Sym = ident"t1"
|
||||||
|
|
||||||
var # Swappable registers to break dependency chains
|
var # Swappable registers to break dependency chains
|
||||||
t0 = Operand(
|
t0 = asmValue(t0Sym, Reg, asmOutputEarlyClobber)
|
||||||
desc: OperandDesc(
|
t1 = asmValue(t1Sym, Reg, asmOutputEarlyClobber)
|
||||||
asmId: "[t0]",
|
|
||||||
nimSymbol: ident"t0",
|
|
||||||
rm: Reg,
|
|
||||||
constraint: Output_EarlyClobber,
|
|
||||||
cEmit: "t0"
|
|
||||||
)
|
|
||||||
)
|
|
||||||
|
|
||||||
t1 = Operand(
|
|
||||||
desc: OperandDesc(
|
|
||||||
asmId: "[t1]",
|
|
||||||
nimSymbol: ident"t1",
|
|
||||||
rm: Reg,
|
|
||||||
constraint: Output_EarlyClobber,
|
|
||||||
cEmit: "t1"
|
|
||||||
)
|
|
||||||
)
|
|
||||||
|
|
||||||
# Prologue
|
# Prologue
|
||||||
let t0sym = t0.desc.nimSymbol
|
|
||||||
let t1sym = t1.desc.nimSymbol
|
|
||||||
result.add quote do:
|
result.add quote do:
|
||||||
var `t0sym`{.noinit.}, `t1sym`{.noinit.}: BaseType
|
var `t0sym`{.noinit.}, `t1sym`{.noinit.}: BaseType
|
||||||
|
|
||||||
# Algorithm
|
# Algorithm
|
||||||
ctx.mov t0, arrA[0] # Prologue
|
ctx.mov t0, a[0] # Prologue
|
||||||
ctx.add t0, arrB[0]
|
ctx.add t0, b[0]
|
||||||
|
|
||||||
for i in 1 ..< N:
|
for i in 1 ..< N:
|
||||||
ctx.mov t1, arrA[i] # Prepare the next iteration
|
ctx.mov t1, a[i] # Prepare the next iteration
|
||||||
ctx.mov arrR[i-1], t0 # Save the previous result in an interleaved manner
|
ctx.mov r[i-1], t0 # Save the previous result in an interleaved manner
|
||||||
ctx.adc t1, arrB[i] # Compute
|
ctx.adc t1, b[i] # Compute
|
||||||
swap(t0, t1) # Break dependency chain
|
swap(t0, t1) # Break dependency chain
|
||||||
|
|
||||||
ctx.mov arrR[N-1], t0 # Epilogue
|
ctx.mov r[N-1], t0 # Epilogue
|
||||||
ctx.setToCarryFlag(carry)
|
ctx.setToCarryFlag(carry)
|
||||||
|
|
||||||
# Codegen
|
# Codegen
|
||||||
result.add ctx.generate
|
result.add ctx.generate()
|
||||||
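The interleaved mov/add/adc schedule above is plain multi-precision addition with carry propagation; reference semantics in portable Nim (hypothetical helper, not constant-time, for illustration only):

func addRef(r: var openArray[uint64], a, b: openArray[uint64]): uint64 =
  # Schoolbook limb addition; returns the final carry (0 or 1).
  var carry = 0'u64
  for i in 0 ..< a.len:
    let s = a[i] + b[i] + carry
    # Carry out: with carry-in 0, overflow iff s < a[i];
    # with carry-in 1, overflow iff s <= a[i].
    carry = if carry == 0'u64: uint64(s < a[i]) else: uint64(s <= a[i])
    r[i] = s
  carry

var r = [0'u64, 0'u64]
doAssert addRef(r, [high(uint64), 1'u64], [1'u64, 0'u64]) == 0'u64
doAssert r == [0'u64, 2'u64]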
|
|
||||||
func add_asm*(r: var Limbs, a, b: Limbs): Carry {.inline.}=
|
func add_asm*(r: var Limbs, a, b: Limbs): Carry =
|
||||||
## Constant-time addition
|
## Constant-time addition
|
||||||
add_gen(result, r, a, b)
|
add_gen(result, r, a, b)
|
||||||
|
|
||||||
# Subtraction
|
# Subtraction
|
||||||
# ------------------------------------------------------------
|
# ------------------------------------------------------------
|
||||||
|
|
||||||
macro sub_gen[N: static int](borrow: var Borrow, r: var Limbs[N], a, b: Limbs[N]): untyped =
|
macro sub_gen[N: static int](borrow: var Borrow, r_PIR: var Limbs[N], a_MEM, b_MEM: Limbs[N]): untyped =
|
||||||
## Generate an optimized out-of-place subtraction kernel
|
## Generate an optimized out-of-place subtraction kernel
|
||||||
|
|
||||||
result = newStmtList()
|
result = newStmtList()
|
||||||
|
|
||||||
var ctx = init(Assembler_x86, BaseType)
|
var ctx = init(Assembler_x86, BaseType)
|
||||||
let
|
let
|
||||||
arrR = init(OperandArray, nimSymbol = r, N, PointerInReg, InputOutput)
|
r = asmArray(r_PIR, N, PointerInReg, asmInputOutputEarlyClobber, memIndirect = memWrite) # MemOffsettable would be the better constraint but compilers reject it as impossible. Use early clobber so the operand is not affected by constant propagation, at the cost of a slight pessimization (reloading the pointer).
|
||||||
arrA = init(OperandArray, nimSymbol = a, N, PointerInReg, Input)
|
a = asmArray(a_MEM, N, MemOffsettable, asmInput)
|
||||||
arrB = init(OperandArray, nimSymbol = b, N, PointerInReg, Input)
|
b = asmArray(b_MEM, N, MemOffsettable, asmInput)
|
||||||
|
|
||||||
|
t0Sym = ident"t0"
|
||||||
|
t1Sym = ident"t1"
|
||||||
|
|
||||||
var # Swappable registers to break dependency chains
|
var # Swappable registers to break dependency chains
|
||||||
t0 = Operand(
|
t0 = asmValue(t0Sym, Reg, asmOutputEarlyClobber)
|
||||||
desc: OperandDesc(
|
t1 = asmValue(t1Sym, Reg, asmOutputEarlyClobber)
|
||||||
asmId: "[t0]",
|
|
||||||
nimSymbol: ident"t0",
|
|
||||||
rm: Reg,
|
|
||||||
constraint: Output_EarlyClobber,
|
|
||||||
cEmit: "t0"
|
|
||||||
)
|
|
||||||
)
|
|
||||||
|
|
||||||
t1 = Operand(
|
|
||||||
desc: OperandDesc(
|
|
||||||
asmId: "[t1]",
|
|
||||||
nimSymbol: ident"t1",
|
|
||||||
rm: Reg,
|
|
||||||
constraint: Output_EarlyClobber,
|
|
||||||
cEmit: "t1"
|
|
||||||
)
|
|
||||||
)
|
|
||||||
|
|
||||||
# Prologue
|
# Prologue
|
||||||
let t0sym = t0.desc.nimSymbol
|
|
||||||
let t1sym = t1.desc.nimSymbol
|
|
||||||
result.add quote do:
|
result.add quote do:
|
||||||
var `t0sym`{.noinit.}, `t1sym`{.noinit.}: BaseType
|
var `t0sym`{.noinit.}, `t1sym`{.noinit.}: BaseType
|
||||||
|
|
||||||
# Algorithm
|
# Algorithm
|
||||||
ctx.mov t0, arrA[0] # Prologue
|
ctx.mov t0, a[0] # Prologue
|
||||||
ctx.sub t0, arrB[0]
|
ctx.sub t0, b[0]
|
||||||
|
|
||||||
for i in 1 ..< N:
|
for i in 1 ..< N:
|
||||||
ctx.mov t1, arrA[i] # Prepare the next iteration
|
ctx.mov t1, a[i] # Prepare the next iteration
|
||||||
ctx.mov arrR[i-1], t0 # Save the previous result in an interleaved manner
|
ctx.mov r[i-1], t0 # Save the previous result in an interleaved manner
|
||||||
ctx.sbb t1, arrB[i] # Compute
|
ctx.sbb t1, b[i] # Compute
|
||||||
swap(t0, t1) # Break dependency chain
|
swap(t0, t1) # Break dependency chain
|
||||||
|
|
||||||
ctx.mov arrR[N-1], t0 # Epilogue
|
ctx.mov r[N-1], t0 # Epilogue
|
||||||
ctx.setToCarryFlag(borrow)
|
ctx.setToCarryFlag(borrow)
|
||||||
|
|
||||||
# Codegen
|
# Codegen
|
||||||
result.add ctx.generate
|
result.add ctx.generate()
|
||||||
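Mirroring the addition kernel, the sub/sbb chain computes multi-precision subtraction with borrow propagation; the reference semantics, with the same caveats as the addition sketch:

func subRef(r: var openArray[uint64], a, b: openArray[uint64]): uint64 =
  # Schoolbook limb subtraction; returns the final borrow (0 or 1).
  var borrow = 0'u64
  for i in 0 ..< a.len:
    let d = a[i] - b[i] - borrow
    # Borrow out: with borrow-in 0, underflow iff a[i] < b[i];
    # with borrow-in 1, underflow iff a[i] <= b[i].
    borrow = if borrow == 0'u64: uint64(a[i] < b[i]) else: uint64(a[i] <= b[i])
    r[i] = d
  borrow

var r = [0'u64, 0'u64]
doAssert subRef(r, [0'u64, 1'u64], [1'u64, 0'u64]) == 0'u64
doAssert r == [high(uint64), 0'u64]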
|
|
||||||
func sub_asm*(r: var Limbs, a, b: Limbs): Borrow {.inline.}=
|
func sub_asm*(r: var Limbs, a, b: Limbs): Borrow =
|
||||||
## Constant-time subtraction
|
## Constant-time subtraction
|
||||||
sub_gen(result, r, a, b)
|
sub_gen(result, r, a, b)
|
||||||
|
|||||||
@ -152,7 +152,7 @@ func setMinusOne*(a: var FF) =
|
|||||||
|
|
||||||
func neg*(r: var FF, a: FF) {.meter.} =
|
func neg*(r: var FF, a: FF) {.meter.} =
|
||||||
## Negate modulo p
|
## Negate modulo p
|
||||||
when UseASM_X86_64:
|
when UseASM_X86_64 and a.mres.limbs.len <= 6: # TODO: handle spilling
|
||||||
negmod_asm(r.mres.limbs, a.mres.limbs, FF.fieldMod().limbs)
|
negmod_asm(r.mres.limbs, a.mres.limbs, FF.fieldMod().limbs)
|
||||||
else:
|
else:
|
||||||
# If a = 0 we need r = 0 and not r = M
|
# If a = 0 we need r = 0 and not r = M
|
||||||
|
|||||||
@ -118,7 +118,7 @@ func sum2xMod*(r: var FpDbl, a, b: FpDbl) =
|
|||||||
## Output is conditionally reduced by 2ⁿp
|
## Output is conditionally reduced by 2ⁿp
|
||||||
## to stay in the [0, 2ⁿp) range
|
## to stay in the [0, 2ⁿp) range
|
||||||
when UseASM_X86_64:
|
when UseASM_X86_64:
|
||||||
addmod2x_asm(r.limbs2x, a.limbs2x, b.limbs2x, FpDbl.C.Mod.limbs)
|
addmod2x_asm(r.limbs2x, a.limbs2x, b.limbs2x, FpDbl.C.Mod.limbs, Fp[FpDbl.C].getSpareBits())
|
||||||
else:
|
else:
|
||||||
# Addition step
|
# Addition step
|
||||||
var overflowed = SecretBool r.limbs2x.sum(a.limbs2x, b.limbs2x)
|
var overflowed = SecretBool r.limbs2x.sum(a.limbs2x, b.limbs2x)
|
||||||
|
|||||||
@ -543,10 +543,8 @@ func sumprodMont*[N: static int](
|
|||||||
r: var Limbs, a, b: array[N, Limbs],
|
r: var Limbs, a, b: array[N, Limbs],
|
||||||
M: Limbs, m0ninv: BaseType,
|
M: Limbs, m0ninv: BaseType,
|
||||||
spareBits: static int,
|
spareBits: static int,
|
||||||
skipFinalSub: static bool = false) {.noInline.} =
|
skipFinalSub: static bool = false) =
|
||||||
## Compute r <- ⅀aᵢ.bᵢ (mod M) (sum of products)
|
## Compute r <- ⅀aᵢ.bᵢ (mod M) (sum of products)
|
||||||
# This function must be noInline or GCC miscompiles
|
|
||||||
# with LTO, see https://github.com/mratsim/constantine/issues/230
|
|
||||||
when spareBits >= 2:
|
when spareBits >= 2:
|
||||||
when UseASM_X86_64 and r.len in {2 .. 6}:
|
when UseASM_X86_64 and r.len in {2 .. 6}:
|
||||||
if ({.noSideEffect.}: hasAdx()):
|
if ({.noSideEffect.}: hasAdx()):
|
||||||
|
|||||||
@ -139,5 +139,5 @@ macro debugConsts(): untyped {.used.} =
|
|||||||
result.add quote do:
|
result.add quote do:
|
||||||
echo "----------------------------------------------------------------------------"
|
echo "----------------------------------------------------------------------------"
|
||||||
|
|
||||||
# debug: # displayed with -d:debugConstantine
|
# debug: # displayed with -d:CttDebug
|
||||||
# debugConsts()
|
# debugConsts()
|
||||||
|
|||||||
@ -62,9 +62,9 @@ func sqrx2x_complex_asm_adx*(
|
|||||||
t0.double(a.c1)
|
t0.double(a.c1)
|
||||||
t1.sum(a.c0, a.c1)
|
t1.sum(a.c0, a.c1)
|
||||||
|
|
||||||
r.c1.limbs2x.mul_asm_adx_inline(t0.mres.limbs, a.c0.mres.limbs)
|
r.c1.limbs2x.mul_asm_adx(t0.mres.limbs, a.c0.mres.limbs)
|
||||||
t0.diff(a.c0, a.c1)
|
t0.diff(a.c0, a.c1)
|
||||||
r.c0.limbs2x.mul_asm_adx_inline(t0.mres.limbs, t1.mres.limbs)
|
r.c0.limbs2x.mul_asm_adx(t0.mres.limbs, t1.mres.limbs)
|
||||||
|
|
||||||
func sqrx_complex_sparebit_asm_adx*(
|
func sqrx_complex_sparebit_asm_adx*(
|
||||||
r: var array[2, Fp],
|
r: var array[2, Fp],
|
||||||
@ -94,15 +94,15 @@ func mul2x_fp2_complex_asm_adx*(
|
|||||||
var D {.noInit.}: typeof(r.c0)
|
var D {.noInit.}: typeof(r.c0)
|
||||||
var t0 {.noInit.}, t1 {.noInit.}: typeof(a.c0)
|
var t0 {.noInit.}, t1 {.noInit.}: typeof(a.c0)
|
||||||
|
|
||||||
r.c0.limbs2x.mul_asm_adx_inline(a.c0.mres.limbs, b.c0.mres.limbs)
|
|
||||||
D.limbs2x.mul_asm_adx_inline(a.c1.mres.limbs, b.c1.mres.limbs)
|
|
||||||
when Fp.has1extraBit():
|
when Fp.has1extraBit():
|
||||||
t0.sumUnr(a.c0, a.c1)
|
t0.sumUnr(a.c0, a.c1)
|
||||||
t1.sumUnr(b.c0, b.c1)
|
t1.sumUnr(b.c0, b.c1)
|
||||||
else:
|
else:
|
||||||
t0.sum(a.c0, a.c1)
|
t0.sum(a.c0, a.c1)
|
||||||
t1.sum(b.c0, b.c1)
|
t1.sum(b.c0, b.c1)
|
||||||
r.c1.limbs2x.mul_asm_adx_inline(t0.mres.limbs, t1.mres.limbs)
|
r.c0.limbs2x.mul_asm_adx(a.c0.mres.limbs, b.c0.mres.limbs)
|
||||||
|
D.limbs2x.mul_asm_adx(a.c1.mres.limbs, b.c1.mres.limbs)
|
||||||
|
r.c1.limbs2x.mul_asm_adx(t0.mres.limbs, t1.mres.limbs)
|
||||||
when Fp.has1extraBit():
|
when Fp.has1extraBit():
|
||||||
r.c1.diff2xUnr(r.c1, r.c0)
|
r.c1.diff2xUnr(r.c1, r.c0)
|
||||||
r.c1.diff2xUnr(r.c1, D)
|
r.c1.diff2xUnr(r.c1, D)
|
||||||
|
|||||||
@ -856,14 +856,16 @@ func prod2x_complex(r: var QuadraticExt2x, a, b: Fp2) =
|
|||||||
var D {.noInit.}: typeof(r.c0)
|
var D {.noInit.}: typeof(r.c0)
|
||||||
var t0 {.noInit.}, t1 {.noInit.}: typeof(a.c0)
|
var t0 {.noInit.}, t1 {.noInit.}: typeof(a.c0)
|
||||||
|
|
||||||
r.c0.prod2x(a.c0, b.c0) # r0 = a0 b0
|
|
||||||
D.prod2x(a.c1, b.c1) # d = a1 b1
|
|
||||||
when Fp2.has1extraBit():
|
when Fp2.has1extraBit():
|
||||||
t0.sumUnr(a.c0, a.c1)
|
t0.sumUnr(a.c0, a.c1)
|
||||||
t1.sumUnr(b.c0, b.c1)
|
t1.sumUnr(b.c0, b.c1)
|
||||||
else:
|
else:
|
||||||
t0.sum(a.c0, a.c1)
|
t0.sum(a.c0, a.c1)
|
||||||
t1.sum(b.c0, b.c1)
|
t1.sum(b.c0, b.c1)
|
||||||
|
|
||||||
|
r.c0.prod2x(a.c0, b.c0) # r0 = a0 b0
|
||||||
|
D.prod2x(a.c1, b.c1) # d = a1 b1
|
||||||
|
|
||||||
r.c1.prod2x(t0, t1) # r1 = (b0 + b1)(a0 + a1)
|
r.c1.prod2x(t0, t1) # r1 = (b0 + b1)(a0 + a1)
|
||||||
when Fp2.has1extraBit():
|
when Fp2.has1extraBit():
|
||||||
r.c1.diff2xUnr(r.c1, r.c0) # r1 = (b0 + b1)(a0 + a1) - a0 b0
|
r.c1.diff2xUnr(r.c1, r.c0) # r1 = (b0 + b1)(a0 + a1) - a0 b0
|
||||||
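Both `prod2x_complex` above and `prod2x_disjoint` below rely on the standard 3-multiplication complex product; these hunks only move the sums ahead of the products, and the algebra (with $i^2 = -1$) is unchanged:

$(a_0 + i a_1)(b_0 + i b_1) = (a_0 b_0 - a_1 b_1) + i\big((a_0 + a_1)(b_0 + b_1) - a_0 b_0 - a_1 b_1\big)$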
@ -1052,9 +1054,6 @@ func prod2x_disjoint*[Fdbl, F](
|
|||||||
var V0 {.noInit.}, V1 {.noInit.}: typeof(r.c0) # Double-precision
|
var V0 {.noInit.}, V1 {.noInit.}: typeof(r.c0) # Double-precision
|
||||||
var t0 {.noInit.}, t1 {.noInit.}: typeof(a0) # Single-width
|
var t0 {.noInit.}, t1 {.noInit.}: typeof(a0) # Single-width
|
||||||
|
|
||||||
# Require 2 extra bits
|
|
||||||
V0.prod2x(a0, b0) # v0 = a0b0
|
|
||||||
V1.prod2x(a1, b1) # v1 = a1b1
|
|
||||||
when F.has1extraBit():
|
when F.has1extraBit():
|
||||||
t0.sumUnr(a0, a1)
|
t0.sumUnr(a0, a1)
|
||||||
t1.sumUnr(b0, b1)
|
t1.sumUnr(b0, b1)
|
||||||
@ -1062,6 +1061,9 @@ func prod2x_disjoint*[Fdbl, F](
|
|||||||
t0.sum(a0, a1)
|
t0.sum(a0, a1)
|
||||||
t1.sum(b0, b1)
|
t1.sum(b0, b1)
|
||||||
|
|
||||||
|
V0.prod2x(a0, b0) # v0 = a0b0
|
||||||
|
V1.prod2x(a1, b1) # v1 = a1b1
|
||||||
|
|
||||||
r.c1.prod2x(t0, t1) # r1 = (a0 + a1)(b0 + b1)
|
r.c1.prod2x(t0, t1) # r1 = (a0 + a1)(b0 + b1)
|
||||||
r.c1.diff2xMod(r.c1, V0) # r1 = (a0 + a1)(b0 + b1) - a0b0
|
r.c1.diff2xMod(r.c1, V0) # r1 = (a0 + a1)(b0 + b1) - a0b0
|
||||||
r.c1.diff2xMod(r.c1, V1) # r1 = (a0 + a1)(b0 + b1) - a0b0 - a1b1
|
r.c1.diff2xMod(r.c1, V1) # r1 = (a0 + a1)(b0 + b1) - a0b0 - a1b1
|
||||||
|
|||||||
@ -41,8 +41,7 @@ export BigInt, wordsRequired
|
|||||||
func unmarshalLE[T](
|
func unmarshalLE[T](
|
||||||
dst: var openArray[T],
|
dst: var openArray[T],
|
||||||
src: openarray[byte],
|
src: openarray[byte],
|
||||||
wordBitWidth: static int
|
wordBitWidth: static int) =
|
||||||
) =
|
|
||||||
## Parse an unsigned integer from its canonical
|
## Parse an unsigned integer from its canonical
|
||||||
## little-endian unsigned representation
|
## little-endian unsigned representation
|
||||||
## and store it into a BigInt
|
## and store it into a BigInt
|
||||||
@ -85,8 +84,7 @@ func unmarshalLE[T](
|
|||||||
func unmarshalBE[T](
|
func unmarshalBE[T](
|
||||||
dst: var openArray[T],
|
dst: var openArray[T],
|
||||||
src: openarray[byte],
|
src: openarray[byte],
|
||||||
wordBitWidth: static int
|
wordBitWidth: static int) =
|
||||||
) =
|
|
||||||
## Parse an unsigned integer from its canonical
|
## Parse an unsigned integer from its canonical
|
||||||
## big-endian unsigned representation (octet string)
|
## big-endian unsigned representation (octet string)
|
||||||
## and store it into a BigInt.
|
## and store it into a BigInt.
|
||||||
|
|||||||
@ -17,7 +17,7 @@ import ../../metering/tracer
|
|||||||
|
|
||||||
export primitives, tracer
|
export primitives, tracer
|
||||||
|
|
||||||
when sizeof(int) == 8 and not defined(Constantine32):
|
when sizeof(int) == 8 and not defined(Ctt32):
|
||||||
type
|
type
|
||||||
BaseType* = uint64
|
BaseType* = uint64
|
||||||
## Physical BigInt for conversion in "normal integers"
|
## Physical BigInt for conversion in "normal integers"
|
||||||
@ -67,7 +67,7 @@ type VarTime* = object
|
|||||||
|
|
||||||
type SignedSecretWord* = distinct SecretWord
|
type SignedSecretWord* = distinct SecretWord
|
||||||
|
|
||||||
when sizeof(int) == 8 and not defined(Constantine32):
|
when sizeof(int) == 8 and not defined(Ctt32):
|
||||||
type
|
type
|
||||||
SignedBaseType* = int64
|
SignedBaseType* = int64
|
||||||
else:
|
else:
|
||||||
|
|||||||
@ -49,28 +49,16 @@ template mux_x86_impl() {.dirty.} =
|
|||||||
static: doAssert(X86)
|
static: doAssert(X86)
|
||||||
static: doAssert(GCC_Compatible)
|
static: doAssert(GCC_Compatible)
|
||||||
|
|
||||||
when sizeof(T) == 8:
|
|
||||||
var muxed = x
|
var muxed = x
|
||||||
asm """
|
asm """
|
||||||
testq %[ctl], %[ctl]
|
test %[ctl], %[ctl]
|
||||||
cmovzq %[y], %[muxed]
|
cmovz %[muxed], %[y]
|
||||||
: [muxed] "+r" (`muxed`)
|
: [muxed] "+r" (`muxed`)
|
||||||
: [ctl] "r" (`ctl`), [y] "r" (`y`)
|
: [ctl] "r" (`ctl`), [y] "r" (`y`)
|
||||||
: "cc"
|
: "cc"
|
||||||
"""
|
"""
|
||||||
muxed
|
muxed
|
||||||
elif sizeof(T) == 4:
|
|
||||||
var muxed = x
|
|
||||||
asm """
|
|
||||||
testl %[ctl], %[ctl]
|
|
||||||
cmovzl %[y], %[muxed]
|
|
||||||
: [muxed] "+r" (`muxed`)
|
|
||||||
: [ctl] "r" (`ctl`), [y] "r" (`y`)
|
|
||||||
: "cc"
|
|
||||||
"""
|
|
||||||
muxed
|
|
||||||
else:
|
|
||||||
{.error: "Unsupported word size".}
|
|
||||||
|
|
||||||
func mux_x86[T](ctl: CTBool[T], x, y: T): T {.inline.}=
|
func mux_x86[T](ctl: CTBool[T], x, y: T): T {.inline.}=
|
||||||
## Multiplexer / selector
|
## Multiplexer / selector
|
||||||
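The test/cmovz pair above selects between `x` and `y` without branching. The same selection can be written portably with masks; a minimal sketch assuming an all-ones/all-zeroes mask convention (illustrative only, not Constantine's API):

func muxMask(ctl, x, y: uint64): uint64 =
  # ctl must be all-ones (select x) or all-zeroes (select y).
  y xor (ctl and (x xor y))

doAssert muxMask(high(uint64), 3'u64, 7'u64) == 3'u64
doAssert muxMask(0'u64, 3'u64, 7'u64) == 7'u64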
@ -92,42 +80,23 @@ func ccopy_x86[T](ctl: CTBool[T], x: var T, y: T) {.inline.}=
|
|||||||
static: doAssert(X86)
|
static: doAssert(X86)
|
||||||
static: doAssert(GCC_Compatible)
|
static: doAssert(GCC_Compatible)
|
||||||
|
|
||||||
when sizeof(T) == 8:
|
|
||||||
when defined(cpp):
|
when defined(cpp):
|
||||||
asm """
|
asm """
|
||||||
testq %[ctl], %[ctl]
|
test %[ctl], %[ctl]
|
||||||
cmovnzq %[y], %[x]
|
cmovnz %[x], %[y]
|
||||||
: [x] "+r" (`x`)
|
: [x] "+r" (`x`)
|
||||||
: [ctl] "r" (`ctl`), [y] "r" (`y`)
|
: [ctl] "r" (`ctl`), [y] "r" (`y`)
|
||||||
: "cc"
|
: "cc"
|
||||||
"""
|
"""
|
||||||
|
|
||||||
else:
|
else:
|
||||||
asm """
|
asm """
|
||||||
testq %[ctl], %[ctl]
|
test %[ctl], %[ctl]
|
||||||
cmovnzq %[y], %[x]
|
cmovnz %[x], %[y]
|
||||||
: [x] "+r" (`*x`)
|
: [x] "+r" (`*x`)
|
||||||
: [ctl] "r" (`ctl`), [y] "r" (`y`)
|
: [ctl] "r" (`ctl`), [y] "r" (`y`)
|
||||||
: "cc"
|
: "cc"
|
||||||
"""
|
"""
|
||||||
elif sizeof(T) == 4:
|
|
||||||
when defined(cpp):
|
|
||||||
asm """
|
|
||||||
testl %[ctl], %[ctl]
|
|
||||||
cmovnzl %[y], %[x]
|
|
||||||
: [x] "+r" (`x`)
|
|
||||||
: [ctl] "r" (`ctl`), [y] "r" (`y`)
|
|
||||||
: "cc"
|
|
||||||
"""
|
|
||||||
else:
|
|
||||||
asm """
|
|
||||||
testl %[ctl], %[ctl]
|
|
||||||
cmovnzl %[y], %[x]
|
|
||||||
: [x] "+r" (`*x`)
|
|
||||||
: [ctl] "r" (`ctl`), [y] "r" (`y`)
|
|
||||||
: "cc"
|
|
||||||
"""
|
|
||||||
else:
|
|
||||||
{.error: "Unsupported word size".}
|
|
||||||
|
|
||||||
# Public functions
|
# Public functions
|
||||||
# ------------------------------------------------------------
|
# ------------------------------------------------------------
|
||||||
|
|||||||
@ -44,7 +44,7 @@ macro replacePragmasByInline(procAst: typed): untyped =
|
|||||||
|
|
||||||
result = newStmtList()
|
result = newStmtList()
|
||||||
|
|
||||||
# The push cdecl is applied multiple times :/, so fight push with push
|
# The push noconv is applied multiple times :/, so fight push with push
|
||||||
result.add nnkPragma.newTree(ident"push", ident"nimcall", ident"inline")
|
result.add nnkPragma.newTree(ident"push", ident"nimcall", ident"inline")
|
||||||
|
|
||||||
result.add newProc(
|
result.add newProc(
|
||||||
@ -61,7 +61,7 @@ macro wrapOpenArrayLenType*(ty: typedesc, procAst: untyped): untyped =
|
|||||||
## Wraps pointer+len library calls in properly typed and converted openArray calls
|
## Wraps pointer+len library calls in properly typed and converted openArray calls
|
||||||
##
|
##
|
||||||
## ```
|
## ```
|
||||||
## {.push cdecl.}
|
## {.push noconv.}
|
||||||
## proc foo*(r: int, a: openArray[CustomType], b: int) {.wrapOpenArrayLenType: uint32, importc: "foo", dynlib: "libfoo.so".}
|
## proc foo*(r: int, a: openArray[CustomType], b: int) {.wrapOpenArrayLenType: uint32, importc: "foo", dynlib: "libfoo.so".}
|
||||||
## {.pop.}
|
## {.pop.}
|
||||||
## ```
|
## ```
|
||||||
@ -69,7 +69,7 @@ macro wrapOpenArrayLenType*(ty: typedesc, procAst: untyped): untyped =
|
|||||||
## is transformed into
|
## is transformed into
|
||||||
##
|
##
|
||||||
## ```
|
## ```
|
||||||
## proc foo(r: int, a: ptr CustomType, aLen: uint32, b: int) {.cdecl, importc: "foo", dynlib: "libfoo.so".}
|
## proc foo(r: int, a: ptr CustomType, aLen: uint32, b: int) {.noconv, importc: "foo", dynlib: "libfoo.so".}
|
||||||
##
|
##
|
||||||
## proc foo*(r: int, a: openArray[CustomType], b: int) {.inline.} =
|
## proc foo*(r: int, a: openArray[CustomType], b: int) {.inline.} =
|
||||||
## foo(r, a[0].unsafeAddr, a.len.uint32, b)
|
## foo(r, a[0].unsafeAddr, a.len.uint32, b)
|
||||||
@ -140,7 +140,7 @@ macro wrapOpenArrayLenType*(ty: typedesc, procAst: untyped): untyped =
|
|||||||
|
|
||||||
when isMainModule:
|
when isMainModule:
|
||||||
expandMacros:
|
expandMacros:
|
||||||
{.push cdecl.}
|
{.push noconv.}
|
||||||
|
|
||||||
proc foo(x: int, a: openArray[uint32], name: cstring) {.wrapOpenArrayLenType: cuint.} =
|
proc foo(x: int, a: openArray[uint32], name: cstring) {.wrapOpenArrayLenType: cuint.} =
|
||||||
discard
|
discard
|
||||||
|
|||||||
@ -26,7 +26,7 @@ static: echo "[Constantine] Using library " & libLLVM
|
|||||||
# also link to libLLVM, for example if they implement a virtual machine (for the EVM, for Snarks/zero-knowledge, ...).
|
# also link to libLLVM, for example if they implement a virtual machine (for the EVM, for Snarks/zero-knowledge, ...).
|
||||||
# Hence Constantine should always use LLVM context to "namespace" its own codegen and avoid collisions in the global context.
|
# Hence Constantine should always use LLVM context to "namespace" its own codegen and avoid collisions in the global context.
|
||||||
|
|
||||||
{.push cdecl, dynlib: libLLVM.}
|
{.push noconv, dynlib: libLLVM.}
|
||||||
|
|
||||||
# ############################################################
|
# ############################################################
|
||||||
#
|
#
|
||||||
@ -571,4 +571,4 @@ proc memset*(builder: BuilderRef, `ptr`, val, len: ValueRef, align: uint32) {.im
|
|||||||
proc memcpy*(builder: BuilderRef, dst: ValueRef, dstAlign: uint32, src: ValueRef, srcAlign: uint32, size: ValueRef) {.importc: "LLVMBuildMemcpy".}
|
proc memcpy*(builder: BuilderRef, dst: ValueRef, dstAlign: uint32, src: ValueRef, srcAlign: uint32, size: ValueRef) {.importc: "LLVMBuildMemcpy".}
|
||||||
proc memmove*(builder: BuilderRef, dst: ValueRef, dstAlign: uint32, src: ValueRef, srcAlign: uint32, size: ValueRef) {.importc: "LLVMBuildMemmove".}
|
proc memmove*(builder: BuilderRef, dst: ValueRef, dstAlign: uint32, src: ValueRef, srcAlign: uint32, size: ValueRef) {.importc: "LLVMBuildMemmove".}
|
||||||
|
|
||||||
{.pop.} # {.used, hint[Name]: off, cdecl, dynlib: libLLVM.}
|
{.pop.} # {.used, hint[Name]: off, noconv, dynlib: libLLVM.}
|
||||||
|
|||||||
@ -482,7 +482,7 @@ type
|
|||||||
CUstream* = distinct pointer
|
CUstream* = distinct pointer
|
||||||
CUdeviceptr* = distinct pointer
|
CUdeviceptr* = distinct pointer
|
||||||
|
|
||||||
{.push cdecl, importc, dynlib: "libcuda.so".}
|
{.push noconv, importc, dynlib: "libcuda.so".}
|
||||||
|
|
||||||
proc cuInit*(flags: uint32): CUresult
|
proc cuInit*(flags: uint32): CUresult
|
||||||
|
|
||||||
@ -515,4 +515,4 @@ proc cuMemFree*(devptr: CUdeviceptr): CUresult
|
|||||||
proc cuMemcpyHtoD*(dst: CUdeviceptr, src: pointer, size: csize_t): CUresult
|
proc cuMemcpyHtoD*(dst: CUdeviceptr, src: pointer, size: csize_t): CUresult
|
||||||
proc cuMemcpyDtoH*(dst: pointer, src: CUdeviceptr, size: csize_t): CUresult
|
proc cuMemcpyDtoH*(dst: pointer, src: CUdeviceptr, size: csize_t): CUresult
|
||||||
|
|
||||||
{.pop.} # {.push cdecl, importc, dynlib: "libcuda.so".}
|
{.pop.} # {.push noconv, importc, dynlib: "libcuda.so".}
|
||||||
|
|||||||
@ -4,7 +4,7 @@ proc cpuidX86(eaxi, ecxi: int32): tuple[eax, ebx, ecx, edx: int32] {.used.}=
|
|||||||
when defined(vcc):
|
when defined(vcc):
|
||||||
# limited inline asm support in vcc, so intrinsics, here we go:
|
# limited inline asm support in vcc, so intrinsics, here we go:
|
||||||
proc cpuidVcc(cpuInfo: ptr int32; functionID, subFunctionID: int32)
|
proc cpuidVcc(cpuInfo: ptr int32; functionID, subFunctionID: int32)
|
||||||
{.cdecl, importc: "__cpuidex", header: "intrin.h".}
|
{.noconv, importc: "__cpuidex", header: "intrin.h".}
|
||||||
cpuidVcc(addr result.eax, eaxi, ecxi)
|
cpuidVcc(addr result.eax, eaxi, ecxi)
|
||||||
else:
|
else:
|
||||||
var (eaxr, ebxr, ecxr, edxr) = (0'i32, 0'i32, 0'i32, 0'i32)
|
var (eaxr, ebxr, ecxr, edxr) = (0'i32, 0'i32, 0'i32, 0'i32)
|
||||||
|
|||||||
File diff suppressed because it is too large
@ -53,7 +53,7 @@ when X86 and GCC_Compatible:
|
|||||||
# ############################################################
|
# ############################################################
|
||||||
|
|
||||||
template debug*(body: untyped): untyped =
|
template debug*(body: untyped): untyped =
|
||||||
when defined(debugConstantine):
|
when defined(CttDebug):
|
||||||
body
|
body
|
||||||
|
|
||||||
proc builtin_unreachable(){.nodecl, importc: "__builtin_unreachable".}
|
proc builtin_unreachable(){.nodecl, importc: "__builtin_unreachable".}
|
||||||
|
|||||||
@ -34,7 +34,7 @@ import std/macros
|
|||||||
# --------------------------------------------------------
|
# --------------------------------------------------------
|
||||||
|
|
||||||
# Everything should be a template that doesn't produce any code
|
# Everything should be a template that doesn't produce any code
|
||||||
# when debugConstantine is not defined.
|
# when CttDebug is not defined.
|
||||||
# Those checks are controlled by a custom flag instead of
|
# Those checks are controlled by a custom flag instead of
|
||||||
# "--boundsChecks" or "--nilChecks" to decouple them from user code checks.
|
# "--boundsChecks" or "--nilChecks" to decouple them from user code checks.
|
||||||
# Furthermore, we want them to be very lightweight on performance
|
# Furthermore, we want them to be very lightweight on performance
|
||||||
|
|||||||
@ -76,9 +76,9 @@ const ULF_WAKE_MASK = ULF_NO_ERRNO or
|
|||||||
ULF_WAKE_THREAD or
|
ULF_WAKE_THREAD or
|
||||||
ULF_WAKE_ALLOW_NON_OWNER
|
ULF_WAKE_ALLOW_NON_OWNER
|
||||||
|
|
||||||
proc ulock_wait(operation: uint32, address: pointer, expected: uint64, timeout: uint32): cint {.importc:"__ulock_wait", cdecl.}
|
proc ulock_wait(operation: uint32, address: pointer, expected: uint64, timeout: uint32): cint {.importc:"__ulock_wait", noconv.}
|
||||||
proc ulock_wait2(operation: uint32, address: pointer, expected: uint64, timeout, value2: uint64): cint {.importc:"__ulock_wait2", cdecl.}
|
proc ulock_wait2(operation: uint32, address: pointer, expected: uint64, timeout, value2: uint64): cint {.importc:"__ulock_wait2", noconv.}
|
||||||
proc ulock_wake(operation: uint32, address: pointer, wake_value: uint64): cint {.importc:"__ulock_wake", cdecl.}
|
proc ulock_wake(operation: uint32, address: pointer, wake_value: uint64): cint {.importc:"__ulock_wake", noconv.}
|
||||||
|
|
||||||
# Futex API
|
# Futex API
|
||||||
# ------------------------------------------------------------------------
|
# ------------------------------------------------------------------------
|
||||||
|
|||||||
@ -150,7 +150,7 @@ macro genCharAPI*(procAst: untyped): untyped =
|
|||||||
wrapperBody.add ident($procAst.params[i][j])
|
wrapperBody.add ident($procAst.params[i][j])
|
||||||
|
|
||||||
var pragmas = nnkPragma.newTree(ident"inline")
|
var pragmas = nnkPragma.newTree(ident"inline")
|
||||||
let skipPragmas = ["inline", "noinline", "noInline", "exportc", "exportcpp", "extern", "cdecl", "stdcall", "dynlib", "libPrefix"]
|
let skipPragmas = ["inline", "noinline", "noInline", "exportc", "exportcpp", "extern", "noconv", "cdecl", "stdcall", "dynlib", "libPrefix"]
|
||||||
for i in 0 ..< procAst.pragma.len:
|
for i in 0 ..< procAst.pragma.len:
|
||||||
if procAst.pragma[i].kind == nnkIdent:
|
if procAst.pragma[i].kind == nnkIdent:
|
||||||
if $procAst.pragma[i] notin skipPragmas:
|
if $procAst.pragma[i] notin skipPragmas:
|
||||||
|
|||||||
@ -15,7 +15,7 @@
|
|||||||
# that internally uses `sha256.hash`,
|
# that internally uses `sha256.hash`,
|
||||||
# the ideal outcome is for `sha256.hash` to be exported as `ctt_sha256_hash` and
|
# the ideal outcome is for `sha256.hash` to be exported as `ctt_sha256_hash` and
|
||||||
# have `hash_to_curve` directly use that.
|
# have `hash_to_curve` directly use that.
|
||||||
# 3. Furthermore, when compiling pure Nim, no export markers (cdecl, dynlib, exportc) are used.
|
# 3. Furthermore, when compiling pure Nim, no export markers (noconv, dynlib, exportc) are used.
|
||||||
#
|
#
|
||||||
# Each prefix must be modified before importing the module to export
|
# Each prefix must be modified before importing the module to export
|
||||||
|
|
||||||
@ -37,7 +37,7 @@ macro libPrefix*(prefix: static string, procAst: untyped): untyped =
|
|||||||
if pragmas.kind == nnkEmpty:
|
if pragmas.kind == nnkEmpty:
|
||||||
pragmas = nnkPragma.newTree()
|
pragmas = nnkPragma.newTree()
|
||||||
|
|
||||||
pragmas.add ident"cdecl"
|
pragmas.add ident"noconv"
|
||||||
pragmas.add nnkExprColonExpr.newTree(
|
pragmas.add nnkExprColonExpr.newTree(
|
||||||
ident"exportc",
|
ident"exportc",
|
||||||
newLit(prefix & "$1"))
|
newLit(prefix & "$1"))
|
||||||
|
|||||||
BIN media/bls12_381_msm_i9-11980HK-8cores_1.png (new binary file, not shown; 459 KiB)
BIN media/bls12_381_msm_i9-11980HK-8cores_2.png (new binary file, not shown; 464 KiB)
BIN media/bls12_381_msm_i9-11980HK-8cores_3.png (new binary file, not shown; 334 KiB)
BIN media/bls12_381_perf_summary_i9-11980HK.png (new binary file, not shown; 150 KiB)
BIN media/bn254_snarks_msm-i9-9980XE-18cores.png (new binary file, not shown; 91 KiB)
BIN media/parallel_load_distribution.png (new binary file, not shown; 598 KiB)
@ -6,35 +6,23 @@
|
|||||||
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
|
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
|
||||||
# at your option. This file may not be copied, modified, or distributed except according to those terms.
|
# at your option. This file may not be copied, modified, or distributed except according to those terms.
|
||||||
|
|
||||||
import
|
|
||||||
std/[macros, times, monotimes],
|
|
||||||
../benchmarks/platforms
|
|
||||||
|
|
||||||
# ############################################################
|
# ############################################################
|
||||||
#
|
#
|
||||||
# Trace operations
|
# Trace operations
|
||||||
#
|
#
|
||||||
# ############################################################
|
# ############################################################
|
||||||
|
|
||||||
# Utils
|
|
||||||
# --------------------------------------------------
|
|
||||||
const someGcc = defined(gcc) or defined(llvm_gcc) or defined(clang) or defined(icc)
|
|
||||||
const hasThreadSupport = defined(threads)
|
|
||||||
|
|
||||||
proc atomicInc*(memLoc: var int64, x = 1'i64): int64 =
|
|
||||||
when someGcc and hasThreadSupport:
|
|
||||||
result = atomicAddFetch(memLoc.addr, x, ATOMIC_RELAXED)
|
|
||||||
elif defined(vcc) and hasThreadSupport:
|
|
||||||
result = addAndFetch(memLoc.addr, x)
|
|
||||||
result += x
|
|
||||||
else:
|
|
||||||
memLoc += x
|
|
||||||
result = memLoc
|
|
||||||
|
|
||||||
# Types
|
# Types
|
||||||
# --------------------------------------------------
|
# --------------------------------------------------
|
||||||
|
|
||||||
type
|
const CttMeter {.booldefine.} = off
|
||||||
|
const CttTrace {.booldefine.} = off # For manual "debug-echo"-style timing.
|
||||||
|
|
||||||
|
when CttMeter or CttTrace:
|
||||||
|
|
||||||
|
import ../benchmarks/platforms
|
||||||
|
|
||||||
|
type
|
||||||
Metadata* = object
|
Metadata* = object
|
||||||
procName*: string
|
procName*: string
|
||||||
module: string
|
module: string
|
||||||
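The `CttMeter` and `CttTrace` switches hoisted to the top of the module are `{.booldefine.}` consts, so they are toggled from the compiler command line instead of being edited in source. A minimal sketch of the mechanism (the file name and invocation in the comment are hypothetical):

const CttMeter {.booldefine.} = off
# built as: nim c -d:CttMeter=on example.nim
when CttMeter:
  static: echo "metering compiled in"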
@ -45,21 +33,15 @@ type
|
|||||||
when SupportsGetTicks:
|
when SupportsGetTicks:
|
||||||
cumulatedCycles*: int64
|
cumulatedCycles*: int64
|
||||||
|
|
||||||
template mtag(tagname: string){.pragma, used.}
|
var ctMetrics{.compileTime.}: seq[Metadata]
|
||||||
## This will allow tagging proc in the future with
|
|
||||||
## "Fp", "ec", "polynomial"
|
|
||||||
|
|
||||||
const CttMeter {.booldefine.} = off
|
|
||||||
const CttTrace {.booldefine.} = off # For manual "debug-echo"-style timing.
|
|
||||||
|
|
||||||
var ctMetrics{.compileTime.}: seq[Metadata]
|
|
||||||
## Metrics are collected here, this is just a temporary holder of compileTime values
|
## Metrics are collected here, this is just a temporary holder of compileTime values
|
||||||
## Unfortunately the "seq" is emptied when crossing the compileTime/runtime boundary
|
## Unfortunately the "seq" is emptied when crossing the compileTime/runtime boundary
|
||||||
## due to Nim bugs
|
## due to Nim bugs
|
||||||
|
|
||||||
when CttMeter or CttTrace:
|
|
||||||
# strformat doesn't work in templates.
|
# strformat doesn't work in templates.
|
||||||
from strutils import alignLeft, formatFloat
|
from strutils import alignLeft, formatFloat
|
||||||
|
import std/[macros, times, monotimes]
|
||||||
|
|
||||||
var Metrics*: seq[Metadata]
|
var Metrics*: seq[Metadata]
|
||||||
## We can't directly use it at compileTime because it doesn't exist.
|
## We can't directly use it at compileTime because it doesn't exist.
|
||||||
@ -69,10 +51,26 @@ when CttMeter or CttTrace:
|
|||||||
proc resetMetering*() =
|
proc resetMetering*() =
|
||||||
Metrics = static(ctMetrics)
|
Metrics = static(ctMetrics)
|
||||||
|
|
||||||
# Symbols
|
|
||||||
# --------------------------------------------------
|
|
||||||
|
|
||||||
template fnEntry(name: string, id: int, startTime, startCycle: untyped): untyped =
|
# Utils
|
||||||
|
# --------------------------------------------------
|
||||||
|
const someGcc = defined(gcc) or defined(llvm_gcc) or defined(clang) or defined(icc)
|
||||||
|
const hasThreadSupport = defined(threads)
|
||||||
|
|
||||||
|
proc atomicInc*(memLoc: var int64, x = 1'i64): int64 =
|
||||||
|
when someGcc and hasThreadSupport:
|
||||||
|
result = atomicAddFetch(memLoc.addr, x, ATOMIC_RELAXED)
|
||||||
|
elif defined(vcc) and hasThreadSupport:
|
||||||
|
result = addAndFetch(memLoc.addr, x)
|
||||||
|
result += x
|
||||||
|
else:
|
||||||
|
memLoc += x
|
||||||
|
result = memLoc
|
||||||
|
|
||||||
|
# Symbols
|
||||||
|
# --------------------------------------------------
|
||||||
|
|
||||||
|
template fnEntry(name: string, id: int, startTime, startCycle: untyped): untyped =
|
||||||
## Bench tracing to insert on function entry
|
## Bench tracing to insert on function entry
|
||||||
{.noSideEffect, gcsafe.}:
|
{.noSideEffect, gcsafe.}:
|
||||||
discard Metrics[id].numCalls.atomicInc()
|
discard Metrics[id].numCalls.atomicInc()
|
||||||
@ -82,7 +80,7 @@ template fnEntry(name: string, id: int, startTime, startCycle: untyped): untyped
|
|||||||
else:
|
else:
|
||||||
let startCycle = 0
|
let startCycle = 0
|
||||||
|
|
||||||
template fnExit(name: string, id: int, startTime, startCycle: untyped): untyped =
|
template fnExit(name: string, id: int, startTime, startCycle: untyped): untyped =
|
||||||
## Bench tracing to insert before each function exit
|
## Bench tracing to insert before each function exit
|
||||||
{.noSideEffect, gcsafe.}:
|
{.noSideEffect, gcsafe.}:
|
||||||
when SupportsGetTicks:
|
when SupportsGetTicks:
|
||||||
@ -107,7 +105,7 @@ template fnExit(name: string, id: int, startTime, startCycle: untyped): untyped
|
|||||||
echo static(alignLeft(name, 50)),
|
echo static(alignLeft(name, 50)),
|
||||||
"Time (µs): ", alignLeft(formatFloat(elapsedTime.float64 * 1e-3, precision=3), 10)
|
"Time (µs): ", alignLeft(formatFloat(elapsedTime.float64 * 1e-3, precision=3), 10)
|
||||||
|
|
||||||
macro meterAnnotate(procAst: untyped): untyped =
|
macro meterAnnotate(procAst: untyped): untyped =
|
||||||
procAst.expectKind({nnkProcDef, nnkFuncDef})
|
procAst.expectKind({nnkProcDef, nnkFuncDef})
|
||||||
|
|
||||||
let id = ctMetrics.len
|
let id = ctMetrics.len
|
||||||
@ -157,6 +155,7 @@ when isMainModule:
|
|||||||
|
|
||||||
static: doAssert CttMeter or CttTrace, "CttMeter or CttTrace must be on for tracing"
|
static: doAssert CttMeter or CttTrace, "CttMeter or CttTrace must be on for tracing"
|
||||||
|
|
||||||
|
when CttMeter or CttTrace: # Avoid warnings from nim check or nimsuggest
|
||||||
expandMacros:
|
expandMacros:
|
||||||
proc foo(x: int): int{.meter.} =
|
proc foo(x: int): int{.meter.} =
|
||||||
echo "Hey hey hey"
|
echo "Hey hey hey"
|
||||||
|
|||||||
@ -52,7 +52,7 @@ type
|
|||||||
|
|
||||||
NvvmProgram = distinct pointer
|
NvvmProgram = distinct pointer
|
||||||
|
|
||||||
{.push cdecl, importc, dynlib: "libnvvm.so".}
|
{.push noconv, importc, dynlib: "libnvvm.so".}
|
||||||
|
|
||||||
proc nvvmGetErrorString*(r: NvvmResult): cstring
|
proc nvvmGetErrorString*(r: NvvmResult): cstring
|
||||||
proc nvvmVersion*(major, minor: var int32): NvvmResult
|
proc nvvmVersion*(major, minor: var int32): NvvmResult
|
||||||
@ -69,7 +69,7 @@ proc nvvmGetCompiledResult*(prog: NvvmProgram; buffer: ptr char): NvvmResult
|
|||||||
proc nvvmGetProgramLogSize*(prog: NvvmProgram; bufferSizeRet: var csize_t): NvvmResult
|
proc nvvmGetProgramLogSize*(prog: NvvmProgram; bufferSizeRet: var csize_t): NvvmResult
|
||||||
proc nvvmGetProgramLog*(prog: NvvmProgram; buffer: ptr char): NvvmResult
|
proc nvvmGetProgramLog*(prog: NvvmProgram; buffer: ptr char): NvvmResult
|
||||||
|
|
||||||
{.pop.} # {.push cdecl, importc, header: "<nvvm.h>".}
|
{.pop.} # {.push noconv, importc, header: "<nvvm.h>".}
|
||||||
|
|
||||||
# ############################################################
|
# ############################################################
|
||||||
#
|
#
|
||||||
|
|||||||
@ -1 +1 @@
|
|||||||
-d:debugConstantine
|
-d:CttDebug
|
||||||
|
|||||||
@ -1,2 +1,2 @@
|
|||||||
-d:testingCurves
|
-d:testingCurves
|
||||||
-d:debugConstantine
|
-d:CttDebug
|
||||||
|
|||||||
@ -142,11 +142,15 @@ proc runTowerTests*[N](
|
|||||||
block:
|
block:
|
||||||
var r{.noinit.}: Field
|
var r{.noinit.}: Field
|
||||||
r.square(One)
|
r.square(One)
|
||||||
check: bool(r == One)
|
doAssert bool(r == One),
|
||||||
|
"\n(" & $Field & "): Expected one: " & One.toHex() & "\n" &
|
||||||
|
"got: " & r.toHex()
|
||||||
block:
|
block:
|
||||||
var r{.noinit.}: Field
|
var r{.noinit.}: Field
|
||||||
r.prod(One, One)
|
r.prod(One, One)
|
||||||
check: bool(r == One)
|
doAssert bool(r == One),
|
||||||
|
"\n(" & $Field & "): Expected one: " & One.toHex() & "\n" &
|
||||||
|
"got: " & r.toHex()
|
||||||
|
|
||||||
staticFor(curve, TestCurves):
|
staticFor(curve, TestCurves):
|
||||||
test(ExtField(ExtDegree, curve))
|
test(ExtField(ExtDegree, curve))
|
||||||
@ -168,12 +172,16 @@ proc runTowerTests*[N](
|
|||||||
var r: Field
|
var r: Field
|
||||||
r.square(Two)
|
r.square(Two)
|
||||||
|
|
||||||
check: bool(r == Four)
|
doAssert bool(r == Four),
|
||||||
|
"\n(" & $Field & "): Expected 4: " & Four.toHex() & "\n" &
|
||||||
|
"got: " & r.toHex()
|
||||||
block:
|
block:
|
||||||
var r: Field
|
var r: Field
|
||||||
r.prod(Two, Two)
|
r.prod(Two, Two)
|
||||||
|
|
||||||
check: bool(r == Four)
|
doAssert bool(r == Four),
|
||||||
|
"\n(" & $Field & "): Expected 4: " & Four.toHex() & "\n" &
|
||||||
|
"got: " & r.toHex()
|
||||||
|
|
||||||
staticFor(curve, TestCurves):
|
staticFor(curve, TestCurves):
|
||||||
test(ExtField(ExtDegree, curve))
|
test(ExtField(ExtDegree, curve))
|
||||||
@ -197,12 +205,16 @@ proc runTowerTests*[N](
|
|||||||
var u: Field
|
var u: Field
|
||||||
u.square(Three)
|
u.square(Three)
|
||||||
|
|
||||||
check: bool(u == Nine)
|
doAssert bool(u == Nine),
|
||||||
|
"\n(" & $Field & "): Expected 9: " & Nine.toHex() & "\n" &
|
||||||
|
"got: " & u.toHex()
|
||||||
block:
|
block:
|
||||||
var u: Field
|
var u: Field
|
||||||
u.prod(Three, Three)
|
u.prod(Three, Three)
|
||||||
|
|
||||||
check: bool(u == Nine)
|
doAssert bool(u == Nine),
|
||||||
|
"\n(" & $Field & "): Expected 9: " & Nine.toHex() & "\n" &
|
||||||
|
"got: " & u.toHex()
|
||||||
|
|
||||||
staticFor(curve, TestCurves):
|
staticFor(curve, TestCurves):
|
||||||
test(ExtField(ExtDegree, curve))
|
test(ExtField(ExtDegree, curve))
|
||||||
@ -226,12 +238,16 @@ proc runTowerTests*[N](
|
|||||||
var u: Field
|
var u: Field
|
||||||
u.square(MinusThree)
|
u.square(MinusThree)
|
||||||
|
|
||||||
check: bool(u == Nine)
|
doAssert bool(u == Nine),
|
||||||
|
"\n(" & $Field & "): Expected 9: " & Nine.toHex() & "\n" &
|
||||||
|
"got: " & u.toHex()
|
||||||
block:
|
block:
|
||||||
var u: Field
|
var u: Field
|
||||||
u.prod(MinusThree, MinusThree)
|
u.prod(MinusThree, MinusThree)
|
||||||
|
|
||||||
check: bool(u == Nine)
|
doAssert bool(u == Nine),
|
||||||
|
"\n(" & $Field & "): Expected 9: " & Nine.toHex() & "\n" &
|
||||||
|
"got: " & u.toHex()
|
||||||
|
|
||||||
staticFor(curve, TestCurves):
|
staticFor(curve, TestCurves):
|
||||||
test(ExtField(ExtDegree, curve))
|
test(ExtField(ExtDegree, curve))
|
||||||
|
|||||||
@ -22,10 +22,10 @@ import
|
|||||||
|
|
||||||
const
|
const
|
||||||
Iters = 4
|
Iters = 4
|
||||||
TestCurves = [
|
TestCurves = [ # Note activating some combination of curves causes miscompile / bad constant propagation with LTO in Windows MinGW GCC 12.2 (but not 8.1 or not 12.2 on Linux)
|
||||||
BN254_Nogami,
|
# BN254_Nogami,
|
||||||
BN254_Snarks,
|
BN254_Snarks,
|
||||||
BLS12_377,
|
# BLS12_377,
|
||||||
BLS12_381
|
BLS12_381
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|||||||
@ -37,7 +37,7 @@ when not defined(windows):
|
|||||||
proc SHA256[T: byte|char](
|
proc SHA256[T: byte|char](
|
||||||
msg: openarray[T],
|
msg: openarray[T],
|
||||||
digest: ptr array[32, byte] = nil
|
digest: ptr array[32, byte] = nil
|
||||||
): ptr array[32, byte] {.cdecl, dynlib: DLLSSLName, importc.}
|
): ptr array[32, byte] {.noconv, dynlib: DLLSSLName, importc.}
|
||||||
|
|
||||||
# proc EVP_Q_digest[T: byte|char](
|
# proc EVP_Q_digest[T: byte|char](
|
||||||
# ossl_libctx: pointer,
|
# ossl_libctx: pointer,
|
||||||
@ -45,7 +45,7 @@ when not defined(windows):
|
|||||||
# propq: cstring,
|
# propq: cstring,
|
||||||
# data: openArray[T],
|
# data: openArray[T],
|
||||||
# digest: var array[32, byte],
|
# digest: var array[32, byte],
|
||||||
# size: ptr uint): int32 {.cdecl, dynlib: DLLSSLName, importc.}
|
# size: ptr uint): int32 {.noconv, dynlib: DLLSSLName, importc.}
|
||||||
|
|
||||||
proc SHA256_OpenSSL[T: byte|char](
|
proc SHA256_OpenSSL[T: byte|char](
|
||||||
digest: var array[32, byte],
|
digest: var array[32, byte],
|
||||||
|
|||||||