Rework assembly to be compatible with LTO (#231)

* rework assembler register/mem and constraint declarations

* Introduce constraint UnmutatedPointerToWriteMem

* Create individual memory cell operands

* [Assembly] fully support indirect memory addressing

* fix calling convention for exported procs

* Prepare for switch to Intel syntax, to avoid clang constant propagation interfering with asm symbol names or pointer+offset addressing

* use modifiers to prevent bad string mixin of propagated consts from assembler to linker

* Assembly: switch to intel syntax

* with working memory operands - now works with LTO and constant folding on both GCC and Clang

* use memory operand in more places

* remove some inlining now that we have LTO

* cleanup compiler config and benches

* tracer shouldn't force dependencies when unused

* fix cc on linux

* nimble fixes

* update README [skip CI]

* update MacOS CI with Homebrew Clang

* oops nimble bindings disappeared

* more nimble fixes

* fix sha256 exported symbol

* improve constraints on modular addition

* Add extra constraint to force reloading of pointer in reg inputs

* Fix LLVM gold linker running out of registers

* workaround MinGW64 GCC 12.2 bad codegen in t_pairing_cyclotomic_subgroup with LTO
Mamy Ratsimbazafy 2023-04-26 06:58:31 +02:00 committed by GitHub
parent 9a7137466e
commit c6d9a213f2
49 changed files with 1366 additions and 1577 deletions


@ -25,6 +25,10 @@ jobs:
cpu: amd64
TEST_LANG: c
BACKEND: NO_ASM
- os: windows
cpu: amd64
TEST_LANG: c
BACKEND: ASM
- os: macos
cpu: amd64
TEST_LANG: c
@ -172,7 +176,19 @@ jobs:
- name: Install test dependencies (macOS)
if: runner.os == 'macOS'
run: brew install gmp
run: |
brew install gmp
mkdir -p external/bin
cat << EOF > external/bin/clang
#!/bin/bash
exec $(brew --prefix llvm@15)/bin/clang "\$@"
EOF
cat << EOF > external/bin/clang++
#!/bin/bash
exec $(brew --prefix llvm@15)/bin/clang++ "\$@"
EOF
chmod 755 external/bin/{clang,clang++}
echo '${{ github.workspace }}/external/bin' >> $GITHUB_PATH
- name: Setup MSYS2 (Windows)
if: runner.os == 'Windows'
@ -210,9 +226,19 @@ jobs:
shell: bash
run: |
cd constantine
nimble bindings --verbose
nimble bindings_no_asm --verbose
nimble test_bindings --verbose
nimble test_parallel_no_asm --verbose
- name: Run Constantine tests (Windows with Assembly)
# So "test_bindings" uses C and can find GMP
# but nim-gmp cannot find GMP on Windows CI
if: runner.os == 'Windows' && matrix.target.BACKEND == 'ASM'
shell: msys2 {0}
run: |
cd constantine
nimble bindings --verbose
nimble test_bindings --verbose
nimble test_parallel_no_gmp --verbose
- name: Run Constantine tests (Windows no Assembly)
# So "test_bindings" uses C and can find GMP
# but nim-gmp cannot find GMP on Windows CI
@ -220,6 +246,6 @@ jobs:
shell: msys2 {0}
run: |
cd constantine
nimble bindings --verbose
nimble bindings_no_asm --verbose
nimble test_bindings --verbose
nimble test_parallel_no_gmp_no_asm --verbose

README.md

@ -25,9 +25,11 @@ The implementations are accompanied with SAGE code used as reference implementat
- [Table of Contents](#table-of-contents)
- [Target audience](#target-audience)
- [Protocols](#protocols)
- [Curves supported in the backend](#curves-supported-in-the-backend)
- [Installation](#installation)
- [Dependencies](#dependencies)
- [From C](#from-c)
- [From Nim](#from-nim)
- [Dependencies & Requirements](#dependencies--requirements)
- [Curves supported in the backend](#curves-supported-in-the-backend)
- [Security](#security)
- [Disclaimer](#disclaimer)
- [Security disclosure](#security-disclosure)
@ -36,6 +38,7 @@ The implementations are accompanied with SAGE code used as reference implementat
- [In zero-knowledge proofs](#in-zero-knowledge-proofs)
- [Measuring performance](#measuring-performance)
- [BLS12_381 Clang + inline Assembly](#bls12_381-clang--inline-assembly)
- [Parallelism](#parallelism)
- [Why Nim](#why-nim)
- [Compiler caveats](#compiler-caveats)
- [Inline assembly](#inline-assembly)
@ -67,26 +70,110 @@ Protocols to address these goals, (authenticated) encryption, signature, traitor
are designed.\
Note: some goals might be mutually exclusive, for example "plausible deniability" and "non-repudiation".
After [installation](#installation), the available high-level protocols are:
## Installation
- [x] Ethereum EVM precompiles on BN254_Snarks (also called alt_bn128 or bn256 in Ethereum)
### From C
`import constantine/ethereum_evm_precompiles`
- [x] BLS signature on BLS12-381 G2 as used in Ethereum 2.
1. Install a C compiler, for example:
- Debian/Ubuntu `sudo apt update && sudo apt install build-essential`
- Archlinux `pacman -S base-devel`
2. Install Nim; it is available in most Linux distros' package managers and via Homebrew on macOS.
Windows binaries are on the official website: https://nim-lang.org/install_unix.html
- Debian/Ubuntu `sudo apt install nim`
- Archlinux `pacman -S nim`
3. Compile the bindings.
- Recommended: \
`CC=clang nimble bindings`
- or `nimble bindings_no_asm`\
to compile without assembly (otherwise assembly support is autodetected)
- or with the default compiler\
`nimble bindings`
4. Ensure bindings work
- `nimble test_bindings`
5. Bindings location
- The bindings are put in `constantine/lib`
- The headers are in [constantine/include](./include) for example [Ethereum BLS signatures](./include/constantine_ethereum_bls_signatures.h)
6. Read the examples in [examples_c](./examples_c):
- Using the [Ethereum BLS signatures bindings from C](./examples_c/ethereum_bls_signatures.c)
- Testing Constantine BLS12-381 vs GMP [./examples_c/t_libctt_bls12_381.c](./examples_c/t_libctt_bls12_381.c)
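To make step 6 concrete, here is a minimal compile-and-link sketch against the BLS12-381 dynamic library, mirroring the commands that `nimble test_bindings` runs; `my_app.c` is a hypothetical file of yours.

```bash
# Hypothetical my_app.c, including a header from include/, e.g. constantine_bls12_381.h
clang -Iinclude -Llib -o my_app my_app.c -lconstantine_bls12_381
LD_LIBRARY_PATH=lib ./my_app
```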
The bindings currently provided are:
- Ethereum BLS signatures on BLS12-381 G2
Cryptographic suite: `BLS_SIG_BLS12381G2_XMD:SHA-256_SSWU_RO_POP_`
This scheme is also used in the following blockchains:
Algorand, Chia, Dfinity, Filecoin, Tezos, Zcash.
They may have their pubkeys on G1 and signatures on G2 like Ethereum or the other way around.
> Parameter discussion:
>
> As Ethereum validators' pubkeys are duplicated, stored and transmitted over and over in the protocol,
having them be as small as possible was important.
On the other hand, BLS signatures were first popularized due to their succinctness,
and having signatures on G1 is useful when short signatures are desired, for example in embedded systems.
- [x] SHA256 hash
- ...
- BLS12-381 arithmetic:
- field arithmetic
- on Fr (i.e. modulo the 255-bit curve order)
- on Fp (i.e. modulo the 381-bit prime modulus)
- on Fp2
- elliptic curve arithmetic:
- on elliptic curve over Fp (EC G1) with affine, jacobian and homogenous projective coordinates
- on elliptic curve over Fp2 (EC G2) with affine, jacobian and homogenous projective coordinates
- currently not exposed: \
scalar multiplication, multi-scalar multiplication, \
pairings and multi-pairings \
are implemented but not yet exposed
- _All operations are constant-time unless explicitly marked_ `vartime`
- The Pasta curves: Pallas and Vesta
- field arithmetic
- on Fr (i.e. modulo the 255-bit curve order)
- on Fp (i.e. modulo the 255-bit prime modulus)
- elliptic curve arithmetic:
- on elliptic curve over Fp (EC G1) with affine, jacobian and homogenous projective coordinates
- currently not exposed: \
scalar multiplication and multi-scalar multiplication \
are implemented but not yet exposed
- _All operations are constant-time unless explicitly marked_ `vartime`
### From Nim
You can install the development version of the library through nimble with the following command
```
nimble install https://github.com/mratsim/constantine@#master
```
## Dependencies & Requirements
For speed it is recommended to use Clang (see [Compiler-caveats](#Compiler-caveats)).
In particular GCC generates inefficient add-with-carry code.
Constantine requires at least:
- GCC 7 \
Previous versions generated incorrect add-with-carry code.
- Clang 14 \
On x86-64, inline assembly is used to work around compilers having issues optimizing large integer arithmetic,
and also to ensure constant-time code. \
Constantine uses the Intel assembly syntax to address issues with the default AT&T syntax and constant propagation in Clang. \
Clang 14 added support for `-masm=intel`. \
\
On macOS, Apple Clang does not support the Intel assembly syntax; use Homebrew Clang instead or compile without assembly.\
_Note that Apple is phasing out Intel CPUs throughout their product line, so this only impacts older models and the Mac Pro._
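As a minimal sketch, on macOS you can build with Homebrew Clang by mirroring what the CI in this PR does (it installs `llvm@15` and puts it first on `PATH`; adjust the LLVM version to what you have installed):

```bash
brew install llvm@15
export PATH="$(brew --prefix llvm@15)/bin:$PATH"
CC=clang nimble bindings
```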
On Windows, Constantine is tested with MinGW. The Microsoft Visual C++ Compiler is not configured.
Constantine has no dependencies, even on the Nim standard library, except:
- for testing
- jsony for parsing json test vectors
- the Nim standard library for unittesting, formatting and datetime.
- GMP for testing against GMP (install sketch after this list)
- for benchmarking
- The Nim standard library for timing and formatting
- for Nvidia GPU backend:
- the LLVM runtime ("dev" version with headers is not needed)
- the CUDA runtime ("dev" version with headers is not needed)
- at compile-time
- we need the std/macros library to generate Nim code.
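A minimal sketch for installing the test-only GMP dependency (the macOS command mirrors the CI workflow above; the Debian/Ubuntu package name is an assumption):

```bash
brew install gmp              # macOS, as in the CI workflow
sudo apt install libgmp-dev   # Debian/Ubuntu (assumed package name)
```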
## Curves supported in the backend
@ -108,42 +195,10 @@ The following curves are configured:
- Jubjub, a curve embedded in BLS12-381 scalar field to be used in zk-SNARKS circuits.
- Bandersnatch, a more efficient curve embedded in BLS12-381 scalar field to be used in zk-SNARKS circuits.
- Other curves
- Edwards25519, used in ed25519 and X25519 from TLS 1.3 protocol and the Signal protocol.
- Edwards25519, used in ed25519 and X25519 from TLS 1.3 protocol and the Signal protocol. \
With Ristretto, it can be used in bulletproofs.
- The Pasta curves (Pallas and Vesta) for the Halo 2 proof system (Zcash).
## Installation
You can install the development version of the library through nimble with the following command
```
nimble install https://github.com/mratsim/constantine@#master
```
For speed it is recommended to prefer Clang, MSVC or ICC over GCC (see [Compiler-caveats](#Compiler-caveats)).
Further if using GCC, GCC 7 at minimum is required, previous versions
generated incorrect add-with-carry code.
On x86-64, inline assembly is used to workaround compilers having issues optimizing large integer arithmetic,
and also ensure constant-time code.
## Dependencies
Constantine has no dependencies, even on Nim standard library except:
- for testing
- jsony for parsing json test vectors
- the Nim standard library for unittesting, formatting and datetime.
- GMP for testing against GMP
- for benchmarking
- The Nim standard library for timing and formatting
- for Nvidia GPU backend:
- the LLVM runtime ("dev" version with headers is not needed)
- the CUDA runtime ("dev" version with headers is not needed)
- at compile-time
- we need the std/macros library to generate Nim code.
## Security
Hardening an implementation against all existing and upcoming attack vectors is an extremely complex task.
@ -217,47 +272,79 @@ To measure the performance of Constantine
```bash
git clone https://github.com/mratsim/constantine
nimble bench_fp # Using default compiler + Assembly
nimble bench_fp_clang # Using Clang + Assembly (recommended)
nimble bench_fp_gcc # Using GCC + Assembly (decent)
nimble bench_fp_clang_noasm # Using Clang only (acceptable)
nimble bench_fp_gcc # Using GCC only (slowest)
nimble bench_fp2
# ...
nimble bench_ec_g1_clang
nimble bench_ec_g2_clang
nimble bench_pairing_bn254_nogami_clang
nimble bench_pairing_bn254_snarks_clang
nimble bench_pairing_bls12_377_clang
nimble bench_pairing_bls12_381_clang
# Default compiler
nimble bench_fp
# Arithmetic
CC=clang nimble bench_fp # Using Clang + Assembly (recommended)
CC=clang nimble bench_fp2
CC=clang nimble bench_fp12
# Scalar multiplication and pairings
CC=clang nimble bench_ec_g1_scalar_mul
CC=clang nimble bench_ec_g2_scalar_mul
CC=clang nimble bench_pairing_bls12_381
# And per-curve summaries
nimble bench_summary_bn254_nogami_clang
nimble bench_summary_bn254_snarks_clang
nimble bench_summary_bls12_377_clang
nimble bench_summary_bls12_381_clang
CC=clang nimble bench_summary_bn254_nogami
CC=clang nimble bench_summary_bn254_snarks
CC=clang nimble bench_summary_bls12_377
CC=clang nimble bench_summary_bls12_381
# The Ethereum BLS signature protocol
CC=clang nimble bench_ethereum_bls_signatures
# Multi-scalar multiplication
CC=clang nimble bench_ec_g1_msm_bls12_381
CC=clang nimble bench_ec_g1_msm_bn256_snarks
```
The full list of benchmarks is available in the [`benchmarks`](./benchmarks) folder.
As mentioned in the [Compiler caveats](#compiler-caveats) section, GCC is up to 2x slower than Clang due to mishandling of carries and register usage.
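For example, to compare the assembly and pure-Nim backends with the same compiler (task names as defined in the build script of this PR):

```bash
CC=gcc nimble bench_fp        # GCC + assembly
CC=gcc nimble bench_fp_noasm  # GCC, pure Nim backend (no assembly)
```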
On my machine, an i9-11980HK (8 cores, 2.6GHz, 5GHz turbo), with Clang + Assembly, **all operations being constant-time** (including scalar multiplication, square root and inversion):
#### BLS12_381 (Clang + inline Assembly)
```
--------------------------------------------------------------------------------------------------------------------------------------------------------
EC ScalarMul 255-bit G1 ECP_ShortW_Prj[Fp[BLS12_381]] 16086.740 ops/s 62163 ns/op 205288 CPU cycles (approx)
EC ScalarMul 255-bit G1 ECP_ShortW_Jac[Fp[BLS12_381]] 16670.834 ops/s 59985 ns/op 198097 CPU cycles (approx)
EC ScalarMul 255-bit G2 ECP_ShortW_Prj[Fp2[BLS12_381]] 8333.403 ops/s 119999 ns/op 396284 CPU cycles (approx)
EC ScalarMul 255-bit G2 ECP_ShortW_Jac[Fp2[BLS12_381]] 9300.682 ops/s 107519 ns/op 355071 CPU cycles (approx)
--------------------------------------------------------------------------------------------------------------------------------------------------------
Miller Loop BLS12 BLS12_381 5102.223 ops/s 195993 ns/op 647251 CPU cycles (approx)
Final Exponentiation BLS12 BLS12_381 4209.109 ops/s 237580 ns/op 784588 CPU cycles (approx)
Pairing BLS12 BLS12_381 2343.045 ops/s 426795 ns/op 1409453 CPU cycles (approx)
--------------------------------------------------------------------------------------------------------------------------------------------------------
Hash to G2 (Draft #11) BLS12_381 6558.495 ops/s 152474 ns/op 503531 CPU cycles (approx)
--------------------------------------------------------------------------------------------------------------------------------------------------------
```
On my machine, an i9-11980HK (8 cores, 2.6GHz, 5GHz turbo), with Clang + Assembly, **all operations being constant-time** (including scalar multiplication, square root and inversion):
![BLS12-381 perf summary](./media/bls12_381_perf_summary_i9-11980HK.png)
![BLS12-381 Multi-Scalar multiplication 1](./media/bls12_381_msm_i9-11980HK-8cores_1.png)
![BLS12-381 Multi-Scalar multiplication 2](./media/bls12_381_msm_i9-11980HK-8cores_2.png)
![BLS12-381 Multi-Scalar multiplication 3](./media/bls12_381_msm_i9-11980HK-8cores_3.png)
On an i9-9980XE (18 cores, watercooled, overclocked, 4.1GHz all-core turbo)
![BN254-Snarks multi-scalar multiplication](./media/bn254_snarks_msm-i9-9980XE-18cores.png)
#### Parallelism
Constantine's multithreaded primitives are powered by a highly tuned threadpool and stress-tested for:
- scheduler overhead
- load balancing with extreme imbalance
- nested data parallelism
- contention
- speculative/conditional parallelism
and provides the following paradigms:
- Future-based task-parallelism
- Data parallelism (nestable and awaitable for loops)
- including arbitrary parallel reductions
- Dataflow parallelism / Stream parallelism / Graph Parallelism / Pipeline parallelism
- Structured Parallelism
The threadpool's parallel-for loops use lazy loop splitting and are fully adaptive to the workload being scheduled, the threads' in-flight load and the hardware speed, unlike most (all?) runtimes, see:
- OpenMP woes depending on hardware and workload: https://github.com/zy97140/omp-benchmark-for-pytorch
- Raytracing ideal runtime, adapting to per-pixel compute load: ![load distribution](./media/parallel_load_distribution.png)\
Most (all?) production runtimes use scheduling A (splitting on the number of threads, like GCC OpenMP) or B (eager splitting, unable to adapt to the actual work, like LLVM/Intel OpenMP or Intel TBB) while Constantine uses C.
The threadpool provides an efficient backoff strategy to conserve power, based on:
- eventcounts / futexes, for low-overhead backoff
- log-log iterated backoff, a provably optimal backoff strategy used in wireless communication, to minimize communication in parallel for-loops
The research papers on high-performance multithreading are available in the Weave repo: https://github.com/mratsim/weave/tree/7682784/research.\
_Note: The threadpool is not backed by Weave but by an inspired runtime that has been significantly simplified for ease of auditing. In particular it uses shared-memory based work-stealing instead of channel-based work-requesting for load balancing as distributed computing is not a target, ..., yet._
## Why Nim


@ -60,7 +60,7 @@ echo " release: ", defined(release)
echo " danger: ", defined(danger)
echo " inline assembly: ", UseASM_X86_64
when (sizeof(int) == 4) or defined(Constantine32):
when (sizeof(int) == 4) or defined(Ctt32):
echo "⚠️ Warning: using Constantine with 32-bit limbs"
else:
echo "Using Constantine with 64-bit limbs"


@ -61,7 +61,7 @@ echo " release: ", defined(release)
echo " danger: ", defined(danger)
echo " inline assembly: ", UseASM_X86_64
when (sizeof(int) == 4) or defined(Constantine32):
when (sizeof(int) == 4) or defined(Ctt32):
echo "⚠️ Warning: using Constantine with 32-bit limbs"
else:
echo "Using Constantine with 64-bit limbs"


@ -33,7 +33,7 @@ else:
proc SHA256[T: byte|char](
msg: openarray[T],
digest: ptr array[32, byte] = nil
): ptr array[32, byte] {.cdecl, dynlib: DLLSSLName, importc.}
): ptr array[32, byte] {.noconv, dynlib: DLLSSLName, importc.}
proc SHA256_OpenSSL[T: byte|char](
digest: var array[32, byte],


@ -19,9 +19,9 @@ export curves, curves_primitives
template genBindingsField*(Field: untyped) =
when appType == "lib":
{.push cdecl, dynlib, exportc, raises: [].} # No exceptions allowed
{.push noconv, dynlib, exportc, raises: [].} # No exceptions allowed
else:
{.push cdecl, exportc, raises: [].} # No exceptions allowed
{.push noconv, exportc, raises: [].} # No exceptions allowed
func `ctt _ Field _ unmarshalBE`(dst: var Field, src: openarray[byte]) =
## Deserialize
@ -122,9 +122,9 @@ template genBindingsField*(Field: untyped) =
template genBindingsFieldSqrt*(Field: untyped) =
when appType == "lib":
{.push cdecl, dynlib, exportc, raises: [].} # No exceptions allowed
{.push noconv, dynlib, exportc, raises: [].} # No exceptions allowed
else:
{.push cdecl, exportc, raises: [].} # No exceptions allowed
{.push noconv, exportc, raises: [].} # No exceptions allowed
func `ctt _ Field _ is_square`(a: Field): SecretBool =
a.isSquare()
@ -155,9 +155,9 @@ template genBindingsFieldSqrt*(Field: untyped) =
template genBindingsExtField*(Field: untyped) =
when appType == "lib":
{.push cdecl, dynlib, exportc, raises: [].} # No exceptions allowed
{.push noconv, dynlib, exportc, raises: [].} # No exceptions allowed
else:
{.push cdecl, exportc, raises: [].} # No exceptions allowed
{.push noconv, exportc, raises: [].} # No exceptions allowed
# --------------------------------------------------------------------------------------
func `ctt _ Field _ is_eq`(a, b: Field): SecretBool =
@ -258,9 +258,9 @@ template genBindingsExtField*(Field: untyped) =
template genBindingsExtFieldSqrt*(Field: untyped) =
when appType == "lib":
{.push cdecl, dynlib, exportc, raises: [].} # No exceptions allowed
{.push noconv, dynlib, exportc, raises: [].} # No exceptions allowed
else:
{.push cdecl, exportc, raises: [].} # No exceptions allowed
{.push noconv, exportc, raises: [].} # No exceptions allowed
func `ctt _ Field _ is_square`(a: Field): SecretBool =
a.isSquare()
@ -275,9 +275,9 @@ template genBindingsExtFieldSqrt*(Field: untyped) =
template genBindings_EC_ShortW_Affine*(ECP, Field: untyped) =
when appType == "lib":
{.push cdecl, dynlib, exportc, raises: [].} # No exceptions allowed
{.push noconv, dynlib, exportc, raises: [].} # No exceptions allowed
else:
{.push cdecl, exportc, raises: [].} # No exceptions allowed
{.push noconv, exportc, raises: [].} # No exceptions allowed
# --------------------------------------------------------------------------------------
func `ctt _ ECP _ is_eq`(P, Q: ECP): SecretBool =
@ -305,9 +305,9 @@ template genBindings_EC_ShortW_Affine*(ECP, Field: untyped) =
template genBindings_EC_ShortW_NonAffine*(ECP, ECP_Aff, Field: untyped) =
when appType == "lib":
{.push cdecl, dynlib, exportc, raises: [].} # No exceptions allowed
{.push noconv, dynlib, exportc, raises: [].} # No exceptions allowed
else:
{.push cdecl, exportc, raises: [].} # No exceptions allowed
{.push noconv, exportc, raises: [].} # No exceptions allowed
# --------------------------------------------------------------------------------------
func `ctt _ ECP _ is_eq`(P, Q: ECP): SecretBool =


@ -17,17 +17,17 @@ import std/strformat
# Library compilation
# ----------------------------------------------------------------
proc releaseBuildOptions: string =
proc releaseBuildOptions(useASM, useLTO = true): string =
# -d:danger --opt:size
# to avoid boundsCheck and overflowChecks that would trigger exceptions or allocations in a crypto library.
# Those are internally guaranteed at compile-time by fixed-sized array
# and checked at runtime with an appropriate error code if any for user-input.
#
# Furthermore we optimize for size, the performance critical procedures
# Furthermore we may optimize for size, the performance critical procedures
# either use assembly or are unrolled manually with staticFor,
# Optimizations at -O3 deal with loops and branching
# which we mostly don't have. It's better to optimize
# for instructions cache.
# which we mostly don't have.
# Hence optimizing for instructions cache may pay off.
#
# --panics:on -d:noSignalHandler
# Even with `raises: []`, Nim still has an exception path
@ -50,11 +50,23 @@ proc releaseBuildOptions: string =
# Reduce instructions cache misses.
# https://lkml.org/lkml/2015/5/21/443
# Our non-inlined functions are large so size cost is minimal.
" -d:danger --opt:size " &
let compiler = if existsEnv"CC": " --cc:" & getEnv"CC"
else: ""
let noASM = if not useASM: " -d:CttASM=false "
else: ""
let lto = if useLTO: " --passC:-flto=auto --passL:-flto=auto "
else: ""
compiler &
noASM &
lto &
" -d:danger " &
# " --opt:size " &
" --panics:on -d:noSignalHandler " &
" --mm:arc -d:useMalloc " &
" --verbosity:0 --hints:off --warnings:off " &
# " --passC:-flto --passL:-flto " &
" --passC:-fno-semantic-interposition " &
" --passC:-falign-functions=64 "
@ -62,13 +74,14 @@ type BindingsKind = enum
kCurve
kProtocol
proc genDynamicBindings(bindingsKind: BindingsKind, bindingsName, prefixNimMain: string) =
proc genDynamicBindings(bindingsKind: BindingsKind, bindingsName, prefixNimMain: string, useASM = true) =
proc compile(libName: string, flags = "") =
echo "Compiling dynamic library: lib/" & libName
exec "nim c " &
" --noMain --app:lib " &
flags &
releaseBuildOptions() &
releaseBuildOptions(useASM, useLTO = true) &
" --noMain --app:lib " &
&" --nimMainPrefix:{prefixNimMain} " &
&" --out:{libName} --outdir:lib " &
(block:
@ -98,24 +111,24 @@ proc genDynamicBindings(bindingsKind: BindingsKind, bindingsName, prefixNimMain:
else:
compile "lib" & bindingsName & ".so"
proc genStaticBindings(bindingsKind: BindingsKind, bindingsName, prefixNimMain: string) =
proc genStaticBindings(bindingsKind: BindingsKind, bindingsName, prefixNimMain: string, useASM = true) =
proc compile(libName: string, flags = "") =
echo "Compiling static library: lib/" & libName
exec "nim c " &
" --noMain --app:staticLib " &
flags &
releaseBuildOptions() &
" --nimMainPrefix:" & prefixNimMain &
" --out:" & libName & " --outdir:lib " &
releaseBuildOptions(useASM, useLTO = false) &
" --noMain --app:staticLib " &
&" --nimMainPrefix:{prefixNimMain} " &
&" --out:{libName} --outdir:lib " &
(block:
case bindingsKind
of kCurve:
" --nimcache:nimcache/bindings_curves/" & bindingsName &
" bindings_generators/" & bindingsName & ".nim"
&" --nimcache:nimcache/bindings_curves/{bindingsName}" &
&" bindings_generators/{bindingsName}.nim"
of kProtocol:
" --nimcache:nimcache/bindings_protocols/" & bindingsName &
" constantine/" & bindingsName & ".nim"
)
&" --nimcache:nimcache/bindings_protocols/{bindingsName}" &
&" constantine/{bindingsName}.nim")
let bindingsName = block:
case bindingsKind
@ -138,13 +151,13 @@ proc genStaticBindings(bindingsKind: BindingsKind, bindingsName, prefixNimMain:
proc genHeaders(bindingsName: string) =
echo "Generating header: include/" & bindingsName & ".h"
exec "nim c -d:CttGenerateHeaders " &
releaseBuildOptions() &
" -d:release " &
" --out:" & bindingsName & "_gen_header.exe --outdir:build " &
" --nimcache:nimcache/bindings_curves_headers/" & bindingsName & "_header" &
" bindings_generators/" & bindingsName & ".nim"
exec "build/" & bindingsName & "_gen_header.exe include"
task bindings, "Generate Constantine bindings":
task bindings, "Generate Constantine bindings (no assembly)":
# Curve arithmetic
genStaticBindings(kCurve, "constantine_bls12_381", "ctt_bls12381_init_")
genDynamicBindings(kCurve, "constantine_bls12_381", "ctt_bls12381_init_")
@ -158,6 +171,23 @@ task bindings, "Generate Constantine bindings":
# Protocols
genStaticBindings(kProtocol, "ethereum_bls_signatures", "ctt_eth_bls_init_")
genDynamicBindings(kProtocol, "ethereum_bls_signatures", "ctt_eth_bls_init_")
echo ""
task bindings_no_asm, "Generate Constantine bindings (no assembly)":
# Curve arithmetic
genStaticBindings(kCurve, "constantine_bls12_381", "ctt_bls12381_init_", useASM = false)
genDynamicBindings(kCurve, "constantine_bls12_381", "ctt_bls12381_init_", useASM = false)
genHeaders("constantine_bls12_381")
echo ""
genStaticBindings(kCurve, "constantine_pasta", "ctt_pasta_init_", useASM = false)
genDynamicBindings(kCurve, "constantine_pasta", "ctt_pasta_init_", useASM = false)
genHeaders("constantine_pasta")
echo ""
# Protocols
genStaticBindings(kProtocol, "ethereum_bls_signatures", "ctt_eth_bls_init_", useASM = false)
genDynamicBindings(kProtocol, "ethereum_bls_signatures", "ctt_eth_bls_init_", useASM = false)
echo ""
proc testLib(path, testName, libName: string, useGMP: bool) =
let dynlibName = if defined(windows): libName & ".dll"
@ -166,21 +196,25 @@ proc testLib(path, testName, libName: string, useGMP: bool) =
let staticlibName = if defined(windows): libName & ".lib"
else: "lib" & libName & ".a"
let cc = if existsEnv"CC": getEnv"CC"
else: "gcc"
echo &"\n[Bindings: {path}/{testName}.c] Testing dynamically linked library {dynlibName}"
exec &"gcc -Iinclude -Llib -o build/testbindings/{testName}_dynlink.exe {path}/{testName}.c -l{libName} " & (if useGMP: "-lgmp" else: "")
exec &"{cc} -Iinclude -Llib -o build/testbindings/{testName}_dynlink.exe {path}/{testName}.c -l{libName} " & (if useGMP: "-lgmp" else: "")
when defined(windows):
# Put DLL near the exe as LD_LIBRARY_PATH doesn't work even in a POSIX compatible shell
exec &"./build/testbindings/{testName}_dynlink.exe"
else:
exec &"LD_LIBRARY_PATH=lib ./build/testbindings/{testName}_dynlink.exe"
echo ""
echo &"\n[Bindings: {path}/{testName}.c] Testing statically linked library: {staticlibName}"
# Beware MacOS annoying linker with regards to static libraries
# The following standard way cannot be used on MacOS
# exec "gcc -Iinclude -Llib -o build/t_libctt_bls12_381_sl.exe examples_c/t_libctt_bls12_381.c -lgmp -Wl,-Bstatic -lconstantine_bls12_381 -Wl,-Bdynamic"
exec &"gcc -Iinclude -o build/testbindings/{testName}_staticlink.exe {path}/{testName}.c lib/{staticlibName} " & (if useGMP: "-lgmp" else: "")
exec &"{cc} -Iinclude -o build/testbindings/{testName}_staticlink.exe {path}/{testName}.c lib/{staticlibName} " & (if useGMP: "-lgmp" else: "")
exec &"./build/testbindings/{testName}_staticlink.exe"
echo ""
task test_bindings, "Test C bindings":
exec "mkdir -p build/testbindings"
@ -485,9 +519,22 @@ const skipSanitizers = [
when defined(windows):
# UBSAN is not available on mingw
# https://github.com/libressl-portable/portable/issues/54
const sanitizers = ""
else:
const sanitizers =
" --passC:-fstack-protector-strong " &
# Fortify source wouldn't help us detect errors in constantine
# because everything is stack allocated
# except with the threadpool:
# - https://developers.redhat.com/blog/2021/04/16/broadening-compiler-checks-for-buffer-overflows-in-_fortify_source#what_s_next_for__fortify_source
# - https://developers.redhat.com/articles/2023/02/06/how-improve-application-security-using-fortifysource3#how_to_improve_application_fortification
# We also don't use memcpy as it is not constant-time and our copy is compile-time sized.
" --passC:-D_FORTIFY_SOURCE=3 " &
# Sanitizers are incompatible with nim default GC
# The conservative stack scanning of Nim default GC triggers, alignment UB and stack-buffer-overflow check.
# Address sanitizer requires free registers and needs to be disabled for some inline assembly files.
@ -497,8 +544,8 @@ else:
# " --passC:-fsanitize=undefined --passL:-fsanitize=undefined" &
# " --passC:-fsanitize=address --passL:-fsanitize=address" &
" --passC:-fno-sanitize-recover" # Enforce crash on undefined behaviour
# " --passC:-fno-sanitize-recover" # Enforce crash on undefined behaviour
""
# Tests & Benchmarks helper functions
# ----------------------------------------------------------------
@ -508,25 +555,17 @@ proc clearParallelBuild() =
if fileExists(buildParallel):
rmFile(buildParallel)
template setupTestCommand(): untyped {.dirty.} =
proc setupTestCommand(flags, path: string, useASM: bool): string =
var lang = "c"
if existsEnv"TEST_LANG":
lang = getEnv"TEST_LANG"
var cc = ""
if existsEnv"CC":
cc = " --cc:" & getEnv"CC"
var flags = flags
when not defined(windows):
# Not available in MinGW https://github.com/libressl-portable/portable/issues/54
flags &= " --passC:-fstack-protector-strong --passC:-D_FORTIFY_SOURCE=2 "
let command = "nim " & lang & cc &
return "nim " & lang &
" -r " &
flags &
releaseBuildOptions() &
releaseBuildOptions(useASM) &
" --outdir:build/testsuite " &
" --nimcache:nimcache/" & path & " " &
&" --nimcache:nimcache/{path} " &
path
proc test(cmd: string) =
@ -535,73 +574,72 @@ proc test(cmd: string) =
echo "=============================================================================================="
exec cmd
proc testBatch(commands: var string, flags, path: string) =
setupTestCommand()
commands &= command & '\n'
proc testBatch(commands: var string, flags, path: string, useASM = true) =
# With LTO, the linker produces lots of spurious warnings when copying into openArrays/strings
template setupBench(): untyped {.dirty.} =
let runFlag = if run: " -r "
else: " "
let flags = if defined(gcc): flags & " --passC:-Wno-stringop-overflow --passL:-Wno-stringop-overflow "
else: flags
var lang = " c "
if existsEnv"TEST_LANG":
lang = getEnv"TEST_LANG"
commands = commands & setupTestCommand(flags, path, useASM) & '\n'
var cc = ""
if compiler != "":
cc = "--cc:" & compiler
elif existsEnv"CC":
cc = " --cc:" & getEnv"CC"
proc setupBench(benchName: string, run: bool, useAsm: bool): string =
var runFlags = " "
if run: # Beware of https://github.com/nim-lang/Nim/issues/21704
runFlags = runFlags & " -r "
if not useAsm:
cc &= " -d:CttASM=false"
let command = "nim " & lang & cc &
releaseBuildOptions() &
" -o:build/bench/" & benchName & "_" & compiler & "_" & (if useAsm: "useASM" else: "noASM") &
" --nimcache:nimcache/benches/" & benchName & "_" & compiler & "_" & (if useAsm: "useASM" else: "noASM") &
runFlag & " benchmarks/" & benchName & ".nim"
let asmStatus = if useASM: "useASM"
else: "noASM"
proc runBench(benchName: string, compiler = "", useAsm = true) =
if defined(gcc):
# With LTO, the linker produces lots of spurious warnings when copying into openArrays/strings
runFlags = runFlags & " --passC:-Wno-stringop-overflow --passL:-Wno-stringop-overflow "
let cc = if existsEnv"CC": getEnv"CC"
else: "defaultcompiler"
return "nim c " &
runFlags &
releaseBuildOptions(useASM) &
&" -o:build/bench/{benchName}_{cc}_{asmStatus}" &
&" --nimcache:nimcache/benches/{benchName}_{cc}_{asmStatus}" &
&" benchmarks/{benchName}.nim"
proc runBench(benchName: string, useAsm = true) =
if not dirExists "build":
mkDir "build"
let run = true
setupBench()
let command = setupBench(benchName, run = true, useAsm)
exec command
proc buildBenchBatch(commands: var string, benchName: string, compiler = "", useAsm = true) =
let run = false
let compiler = ""
setupBench()
commands &= command & '\n'
proc buildBenchBatch(commands: var string, benchName: string, useAsm = true) =
let command = setupBench(benchName, run = false, useAsm)
commands = commands & command & '\n'
proc addTestSet(cmdFile: var string, requireGMP: bool, test32bit = false, testASM = true) =
proc addTestSet(cmdFile: var string, requireGMP: bool, test32bit = false, useASM = true) =
if not dirExists "build":
mkDir "build"
echo "Found " & $testDesc.len & " tests to run."
for td in testDesc:
if not(td.useGMP and not requireGMP):
var flags = ""
if not testASM:
flags &= " -d:CttASM=false "
var flags = "" # Beware of https://github.com/nim-lang/Nim/issues/21704
if test32bit:
flags &= " -d:Constantine32 "
flags = flags & " -d:Ctt32 "
if td.path in useDebug:
flags &= " -d:debugConstantine "
flags = flags & " -d:CttDebug "
if td.path notin skipSanitizers:
flags &= sanitizers
flags = flags & sanitizers
cmdFile.testBatch(flags, td.path)
cmdFile.testBatch(flags, td.path, useASM)
proc addTestSetNvidia(cmdFile: var string) =
if not dirExists "build":
mkDir "build"
echo "Found " & $testDescNvidia.len & " tests to run."
for path in testDescThreadpool:
var flags = ""
for path in testDescNvidia:
var flags = "" # Beware of https://github.com/nim-lang/Nim/issues/21704
if path notin skipSanitizers:
flags &= sanitizers
flags = flags & sanitizers
cmdFile.testBatch(flags, path)
proc addTestSetThreadpool(cmdFile: var string) =
@ -612,26 +650,24 @@ proc addTestSetThreadpool(cmdFile: var string) =
for path in testDescThreadpool:
var flags = " --threads:on --debugger:native "
if path notin skipSanitizers:
flags &= sanitizers
flags = flags & sanitizers
cmdFile.testBatch(flags, path)
proc addTestSetMultithreadedCrypto(cmdFile: var string, test32bit = false, testASM = true) =
proc addTestSetMultithreadedCrypto(cmdFile: var string, test32bit = false, useASM = true) =
if not dirExists "build":
mkDir "build"
echo "Found " & $testDescMultithreadedCrypto.len & " tests to run."
for td in testDescMultithreadedCrypto:
var flags = " --threads:on --debugger:native"
if not testASM:
flags &= " -d:CttASM=false"
if test32bit:
flags &= " -d:Constantine32"
flags = flags & " -d:Ctt32 "
if td in useDebug:
flags &= " -d:debugConstantine"
flags = flags & " -d:CttDebug "
if td notin skipSanitizers:
flags &= sanitizers
flags = flags & sanitizers
cmdFile.testBatch(flags, td)
cmdFile.testBatch(flags, td, useASM)
proc addBenchSet(cmdFile: var string, useAsm = true) =
if not dirExists "build":
@ -649,7 +685,7 @@ proc genParallelCmdRunner() =
task test, "Run all tests":
# -d:testingCurves is configured in a *.nim.cfg for convenience
var cmdFile: string
cmdFile.addTestSet(requireGMP = true, testASM = true)
cmdFile.addTestSet(requireGMP = true, useASM = true)
cmdFile.addBenchSet(useASM = true) # Build (but don't run) benches to ensure they stay relevant
cmdFile.addTestSetThreadpool()
cmdFile.addTestSetMultithreadedCrypto()
@ -660,10 +696,10 @@ task test, "Run all tests":
task test_no_asm, "Run all tests (no assembly)":
# -d:testingCurves is configured in a *.nim.cfg for convenience
var cmdFile: string
cmdFile.addTestSet(requireGMP = true, testASM = false)
cmdFile.addTestSet(requireGMP = true, useASM = false)
cmdFile.addBenchSet(useASM = false) # Build (but don't run) benches to ensure they stay relevant
cmdFile.addTestSetThreadpool()
cmdFile.addTestSetMultithreadedCrypto(testASM = false)
cmdFile.addTestSetMultithreadedCrypto(useASM = false)
for cmd in cmdFile.splitLines():
if cmd != "": # Windows doesn't like empty commands
exec cmd
@ -671,7 +707,7 @@ task test_no_asm, "Run all tests (no assembly)":
task test_no_gmp, "Run tests that don't require GMP":
# -d:testingCurves is configured in a *.nim.cfg for convenience
var cmdFile: string
cmdFile.addTestSet(requireGMP = false, testASM = true)
cmdFile.addTestSet(requireGMP = false, useASM = true)
cmdFile.addBenchSet(useASM = true) # Build (but don't run) benches to ensure they stay relevant
cmdFile.addTestSetThreadpool()
cmdFile.addTestSetMultithreadedCrypto()
@ -682,10 +718,10 @@ task test_no_gmp, "Run tests that don't require GMP":
task test_no_gmp_no_asm, "Run tests that don't require GMP using a pure Nim backend":
# -d:testingCurves is configured in a *.nim.cfg for convenience
var cmdFile: string
cmdFile.addTestSet(requireGMP = false, testASM = false)
cmdFile.addTestSet(requireGMP = false, useASM = false)
cmdFile.addBenchSet(useASM = false) # Build (but don't run) benches to ensure they stay relevant
cmdFile.addTestSetThreadpool()
cmdFile.addTestSetMultithreadedCrypto(testASM = false)
cmdFile.addTestSetMultithreadedCrypto(useASM = false)
for cmd in cmdFile.splitLines():
if cmd != "": # Windows doesn't like empty commands
exec cmd
@ -696,7 +732,7 @@ task test_parallel, "Run all tests in parallel":
genParallelCmdRunner()
var cmdFile: string
cmdFile.addTestSet(requireGMP = true, testASM = true)
cmdFile.addTestSet(requireGMP = true, useASM = true)
cmdFile.addBenchSet(useASM = true) # Build (but don't run) benches to ensure they stay relevant
writeFile(buildParallel, cmdFile)
exec "build/pararun " & buildParallel
@ -715,7 +751,7 @@ task test_parallel_no_asm, "Run all tests (without macro assembler) in parallel"
genParallelCmdRunner()
var cmdFile: string
cmdFile.addTestSet(requireGMP = true, testASM = false)
cmdFile.addTestSet(requireGMP = true, useASM = false)
cmdFile.addBenchSet(useASM = false)
writeFile(buildParallel, cmdFile)
exec "build/pararun " & buildParallel
@ -723,7 +759,7 @@ task test_parallel_no_asm, "Run all tests (without macro assembler) in parallel"
# Threadpool tests done serially
cmdFile = ""
cmdFile.addTestSetThreadpool()
cmdFile.addTestSetMultithreadedCrypto(testASM = false)
cmdFile.addTestSetMultithreadedCrypto(useASM = false)
for cmd in cmdFile.splitLines():
if cmd != "": # Windows doesn't like empty commands
exec cmd
@ -734,7 +770,7 @@ task test_parallel_no_gmp, "Run all tests in parallel":
genParallelCmdRunner()
var cmdFile: string
cmdFile.addTestSet(requireGMP = false, testASM = true)
cmdFile.addTestSet(requireGMP = false, useASM = true)
cmdFile.addBenchSet(useASM = true) # Build (but don't run) benches to ensure they stay relevant
writeFile(buildParallel, cmdFile)
exec "build/pararun " & buildParallel
@ -753,7 +789,7 @@ task test_parallel_no_gmp_no_asm, "Run all tests in parallel":
genParallelCmdRunner()
var cmdFile: string
cmdFile.addTestSet(requireGMP = false, testASM = false)
cmdFile.addTestSet(requireGMP = false, useASM = false)
cmdFile.addBenchSet(useASM = false) # Build (but don't run) benches to ensure they stay relevant
writeFile(buildParallel, cmdFile)
exec "build/pararun " & buildParallel
@ -761,7 +797,7 @@ task test_parallel_no_gmp_no_asm, "Run all tests in parallel":
# Threadpool tests done serially
cmdFile = ""
cmdFile.addTestSetThreadpool()
cmdFile.addTestSetMultithreadedCrypto(testASM = false)
cmdFile.addTestSetMultithreadedCrypto(useASM = false)
for cmd in cmdFile.splitLines():
if cmd != "": # Windows doesn't like empty commands
exec cmd
@ -790,389 +826,199 @@ task test_nvidia, "Run all tests for Nvidia GPUs":
# Finite field 𝔽p
# ------------------------------------------
task bench_fp, "Run benchmark 𝔽p with your default compiler":
task bench_fp, "Run benchmark 𝔽p with your CC compiler":
runBench("bench_fp")
task bench_fp_gcc, "Run benchmark 𝔽p with gcc":
runBench("bench_fp", "gcc")
task bench_fp_clang, "Run benchmark 𝔽p with clang":
runBench("bench_fp", "clang")
task bench_fp_gcc_noasm, "Run benchmark 𝔽p with gcc - no Assembly":
runBench("bench_fp", "gcc", useAsm = false)
task bench_fp_clang_noasm, "Run benchmark 𝔽p with clang - no Assembly":
runBench("bench_fp", "clang", useAsm = false)
task bench_fp_noasm, "Run benchmark 𝔽p with your CC compiler - no Assembly":
runBench("bench_fp", useAsm = false)
# Double-precision field 𝔽pDbl
# ------------------------------------------
task bench_fpdbl, "Run benchmark 𝔽pDbl with your default compiler":
task bench_fpdbl, "Run benchmark 𝔽pDbl with your CC compiler":
runBench("bench_fp_double_precision")
task bench_fpdbl_gcc, "Run benchmark 𝔽p with gcc":
runBench("bench_fp_double_precision", "gcc")
task bench_fpdbl_noasm, "Run benchmark 𝔽p with CC compiler - no Assembly":
runBench("bench_fp_double_precision", useAsm = false)
task bench_fpdbl_clang, "Run benchmark 𝔽p with clang":
runBench("bench_fp_double_precision", "clang")
task bench_fpdbl_gcc_noasm, "Run benchmark 𝔽p with gcc - no Assembly":
runBench("bench_fp_double_precision", "gcc", useAsm = false)
task bench_fpdbl_clang_noasm, "Run benchmark 𝔽p with clang - no Assembly":
runBench("bench_fp_double_precision", "clang", useAsm = false)
# Extension field 𝔽p2
# ------------------------------------------
task bench_fp2, "Run benchmark with 𝔽p2 your default compiler":
task bench_fp2, "Run benchmark 𝔽p2 with your CC compiler":
runBench("bench_fp2")
task bench_fp2_gcc, "Run benchmark 𝔽p2 with gcc":
runBench("bench_fp2", "gcc")
task bench_fp2_clang, "Run benchmark 𝔽p2 with clang":
runBench("bench_fp2", "clang")
task bench_fp2_gcc_noasm, "Run benchmark 𝔽p2 with gcc - no Assembly":
runBench("bench_fp2", "gcc", useAsm = false)
task bench_fp2_clang_noasm, "Run benchmark 𝔽p2 with clang - no Assembly":
runBench("bench_fp2", "clang", useAsm = false)
task bench_fp2_noasm, "Run benchmark 𝔽p2 with CC compiler - no Assembly":
runBench("bench_fp2", useAsm = false)
# Extension field 𝔽p4
# ------------------------------------------
task bench_fp4, "Run benchmark with 𝔽p4 your default compiler":
task bench_fp4, "Run benchmark 𝔽p4 with your CC compiler":
runBench("bench_fp4")
task bench_fp4_gcc, "Run benchmark 𝔽p4 with gcc":
runBench("bench_fp4", "gcc")
task bench_fp4_noasm, "Run benchmark 𝔽p4 with CC compiler - no Assembly":
runBench("bench_fp4", useAsm = false)
task bench_fp4_clang, "Run benchmark 𝔽p4 with clang":
runBench("bench_fp4", "clang")
task bench_fp4_gcc_noasm, "Run benchmark 𝔽p4 with gcc - no Assembly":
runBench("bench_fp4", "gcc", useAsm = false)
task bench_fp4_clang_noasm, "Run benchmark 𝔽p4 with clang - no Assembly":
runBench("bench_fp4", "clang", useAsm = false)
# Extension field 𝔽p6
# ------------------------------------------
task bench_fp6, "Run benchmark with 𝔽p6 your default compiler":
task bench_fp6, "Run benchmark 𝔽p6 with your CC compiler":
runBench("bench_fp6")
task bench_fp6_gcc, "Run benchmark 𝔽p6 with gcc":
runBench("bench_fp6", "gcc")
task bench_fp6_clang, "Run benchmark 𝔽p6 with clang":
runBench("bench_fp6", "clang")
task bench_fp6_gcc_noasm, "Run benchmark 𝔽p6 with gcc - no Assembly":
runBench("bench_fp6", "gcc", useAsm = false)
task bench_fp6_clang_noasm, "Run benchmark 𝔽p6 with clang - no Assembly":
runBench("bench_fp6", "clang", useAsm = false)
task bench_fp6_noasm, "Run benchmark 𝔽p6 with CC compiler - no Assembly":
runBench("bench_fp6", useAsm = false)
# Extension field 𝔽p12
# ------------------------------------------
task bench_fp12, "Run benchmark with 𝔽p12 your default compiler":
task bench_fp12, "Run benchmark 𝔽p12 with your CC compiler":
runBench("bench_fp12")
task bench_fp12_gcc, "Run benchmark 𝔽p12 with gcc":
runBench("bench_fp12", "gcc")
task bench_fp12_clang, "Run benchmark 𝔽p12 with clang":
runBench("bench_fp12", "clang")
task bench_fp12_gcc_noasm, "Run benchmark 𝔽p12 with gcc - no Assembly":
runBench("bench_fp12", "gcc", useAsm = false)
task bench_fp12_clang_noasm, "Run benchmark 𝔽p12 with clang - no Assembly":
runBench("bench_fp12", "clang", useAsm = false)
task bench_fp12_noasm, "Run benchmark 𝔽p12 with CC compiler - no Assembly":
runBench("bench_fp12", useAsm = false)
# Elliptic curve G1
# ------------------------------------------
task bench_ec_g1, "Run benchmark on Elliptic Curve group 𝔾1 - Default compiler":
task bench_ec_g1, "Run benchmark on Elliptic Curve group 𝔾1 - CC compiler":
runBench("bench_ec_g1")
task bench_ec_g1_gcc, "Run benchmark on Elliptic Curve group 𝔾1 - GCC":
runBench("bench_ec_g1", "gcc")
task bench_ec_g1_noasm, "Run benchmark on Elliptic Curve group 𝔾1 - CC compiler no Assembly":
runBench("bench_ec_g1", useAsm = false)
task bench_ec_g1_clang, "Run benchmark on Elliptic Curve group 𝔾1 - Clang":
runBench("bench_ec_g1", "clang")
task bench_ec_g1_gcc_noasm, "Run benchmark on Elliptic Curve group 𝔾1 - GCC no Assembly":
runBench("bench_ec_g1", "gcc", useAsm = false)
task bench_ec_g1_clang_noasm, "Run benchmark on Elliptic Curve group 𝔾1 - Clang no Assembly":
runBench("bench_ec_g1", "clang", useAsm = false)
# Elliptic curve G1 - batch operations
# ------------------------------------------
task bench_ec_g1_batch, "Run benchmark on Elliptic Curve group 𝔾1 (batch ops) - Default compiler":
task bench_ec_g1_batch, "Run benchmark on Elliptic Curve group 𝔾1 (batch ops) - CC compiler":
runBench("bench_ec_g1_batch")
task bench_ec_g1_batch_gcc, "Run benchmark on Elliptic Curve group 𝔾1 (batch ops) - GCC":
runBench("bench_ec_g1_batch", "gcc")
task bench_ec_g1_batch_noasm, "Run benchmark on Elliptic Curve group 𝔾1 (batch ops) - CC compiler no Assembly":
runBench("bench_ec_g1_batch", useAsm = false)
task bench_ec_g1_batch_clang, "Run benchmark on Elliptic Curve group 𝔾1 (batch ops) - Clang":
runBench("bench_ec_g1_batch", "clang")
task bench_ec_g1_batch_gcc_noasm, "Run benchmark on Elliptic Curve group 𝔾1 (batch ops) - GCC no Assembly":
runBench("bench_ec_g1_batch", "gcc", useAsm = false)
task bench_ec_g1_batch_clang_noasm, "Run benchmark on Elliptic Curve group 𝔾1 (batch ops) - Clang no Assembly":
runBench("bench_ec_g1_batch", "clang", useAsm = false)
# Elliptic curve G1 - scalar multiplication
# ------------------------------------------
task bench_ec_g1_scalar_mul, "Run benchmark on Elliptic Curve group 𝔾1 (Scalar Multiplication) - Default compiler":
task bench_ec_g1_scalar_mul, "Run benchmark on Elliptic Curve group 𝔾1 (Scalar Multiplication) - CC compiler":
runBench("bench_ec_g1_scalar_mul")
task bench_ec_g1_scalar_mul_gcc, "Run benchmark on Elliptic Curve group 𝔾1 (Scalar Multiplication) - GCC":
runBench("bench_ec_g1_scalar_mul", "gcc")
task bench_ec_g1_scalar_mul_clang, "Run benchmark on Elliptic Curve group 𝔾1 (Scalar Multiplication) - Clang":
runBench("bench_ec_g1_scalar_mul", "clang")
task bench_ec_g1_scalar_mul_gcc_noasm, "Run benchmark on Elliptic Curve group 𝔾1 (Scalar Multiplication) - GCC no Assembly":
runBench("bench_ec_g1_scalar_mul", "gcc", useAsm = false)
task bench_ec_g1_scalar_mul_clang_noasm, "Run benchmark on Elliptic Curve group 𝔾1 (Scalar Multiplication) - Clang no Assembly":
runBench("bench_ec_g1_scalar_mul", "clang", useAsm = false)
task bench_ec_g1_scalar_mul_noasm, "Run benchmark on Elliptic Curve group 𝔾1 (Scalar Multiplication) - CC compiler no Assembly":
runBench("bench_ec_g1_scalar_mul", useAsm = false)
# Elliptic curve G1 - Multi-scalar-mul
# ------------------------------------------
task bench_ec_g1_msm_bn254_snarks, "Run benchmark on Elliptic Curve group 𝔾1 (Multi-Scalar-Mul) for BN254-Snarks - Default compiler":
task bench_ec_g1_msm_bn254_snarks, "Run benchmark on Elliptic Curve group 𝔾1 (Multi-Scalar-Mul) for BN254-Snarks - CC compiler":
runBench("bench_ec_g1_msm_bn254_snarks")
task bench_ec_g1_msm_bn254_snarks_gcc, "Run benchmark on Elliptic Curve group 𝔾1 (Multi-Scalar-Mul) for BN254-Snarks - GCC":
runBench("bench_ec_g1_msm_bn254_snarks", "gcc")
task bench_ec_g1_msm_bn254_snarks_noasm, "Run benchmark on Elliptic Curve group 𝔾1 (Multi-Scalar-Mul) for BN254-Snarks - CC compiler no Assembly":
runBench("bench_ec_g1_msm_bn254_snarks", useAsm = false)
task bench_ec_g1_msm_bn254_snarks_clang, "Run benchmark on Elliptic Curve group 𝔾1 (Multi-Scalar-Mul) for BN254-Snarks - Clang":
runBench("bench_ec_g1_msm_bn254_snarks", "clang")
task bench_ec_g1_msm_bn254_snarks_gcc_noasm, "Run benchmark on Elliptic Curve group 𝔾1 (Multi-Scalar-Mul) for BN254-Snarks - GCC no Assembly":
runBench("bench_ec_g1_msm_bn254_snarks", "gcc", useAsm = false)
task bench_ec_g1_msm_bn254_snarks_clang_noasm, "Run benchmark on Elliptic Curve group 𝔾1 (Multi-Scalar-Mul) for BN254-Snarks - Clang no Assembly":
runBench("bench_ec_g1_msm_bn254_snarks", "clang", useAsm = false)
task bench_ec_g1_msm_bls12_381, "Run benchmark on Elliptic Curve group 𝔾1 (Multi-Scalar-Mul) for BLS12-381 - Default compiler":
task bench_ec_g1_msm_bls12_381, "Run benchmark on Elliptic Curve group 𝔾1 (Multi-Scalar-Mul) for BLS12-381 - CC compiler":
runBench("bench_ec_g1_msm_bls12_381")
task bench_ec_g1_msm_bls12_381_gcc, "Run benchmark on Elliptic Curve group 𝔾1 (Multi-Scalar-Mul) for BLS12-381 - GCC":
runBench("bench_ec_g1_msm_bls12_381", "gcc")
task bench_ec_g1_msm_bls12_381_clang, "Run benchmark on Elliptic Curve group 𝔾1 (Multi-Scalar-Mul) for BLS12-381 - Clang":
runBench("bench_ec_g1_msm_bls12_381", "clang")
task bench_ec_g1_msm_bls12_381_gcc_noasm, "Run benchmark on Elliptic Curve group 𝔾1 (Multi-Scalar-Mul) for BLS12-381 - GCC no Assembly":
runBench("bench_ec_g1_msm_bls12_381", "gcc", useAsm = false)
task bench_ec_g1_msm_bls12_381_clang_noasm, "Run benchmark on Elliptic Curve group 𝔾1 (Multi-Scalar-Mul) for BLS12-381 - Clang no Assembly":
runBench("bench_ec_g1_msm_bls12_381", "clang", useAsm = false)
task bench_ec_g1_msm_bls12_381_noasm, "Run benchmark on Elliptic Curve group 𝔾1 (Multi-Scalar-Mul) for BLS12-381 - CC compiler no Assembly":
runBench("bench_ec_g1_msm_bls12_381", useAsm = false)
# Elliptic curve G2
# ------------------------------------------
task bench_ec_g2, "Run benchmark on Elliptic Curve group 𝔾2 - Default compiler":
task bench_ec_g2, "Run benchmark on Elliptic Curve group 𝔾2 - CC compiler":
runBench("bench_ec_g2")
task bench_ec_g2_gcc, "Run benchmark on Elliptic Curve group 𝔾2 - GCC":
runBench("bench_ec_g2", "gcc")
task bench_ec_g2_clang, "Run benchmark on Elliptic Curve group 𝔾2 - Clang":
runBench("bench_ec_g2", "clang")
task bench_ec_g2_gcc_noasm, "Run benchmark on Elliptic Curve group 𝔾2 - GCC no Assembly":
runBench("bench_ec_g2", "gcc", useAsm = false)
task bench_ec_g2_clang_noasm, "Run benchmark on Elliptic Curve group 𝔾2 - Clang no Assembly":
runBench("bench_ec_g2", "clang", useAsm = false)
task bench_ec_g2_noasm, "Run benchmark on Elliptic Curve group 𝔾2 - CC compiler no Assembly":
runBench("bench_ec_g2", useAsm = false)
# Elliptic curve G2 - scalar multiplication
# ------------------------------------------
task bench_ec_g2_scalar_mul, "Run benchmark on Elliptic Curve group 𝔾2 (Multi-Scalar-Mul) - Default compiler":
task bench_ec_g2_scalar_mul, "Run benchmark on Elliptic Curve group 𝔾2 (Multi-Scalar-Mul) - CC compiler":
runBench("bench_ec_g2_scalar_mul")
task bench_ec_g2_scalar_mul_gcc, "Run benchmark on Elliptic Curve group 𝔾2 (Multi-Scalar-Mul) - GCC":
runBench("bench_ec_g2_scalar_mul", "gcc")
task bench_ec_g2_scalar_mul_clang, "Run benchmark on Elliptic Curve group 𝔾2 (Multi-Scalar-Mul) - Clang":
runBench("bench_ec_g2_scalar_mul", "clang")
task bench_ec_g2_scalar_mul_gcc_noasm, "Run benchmark on Elliptic Curve group 𝔾2 (Multi-Scalar-Mul) - GCC no Assembly":
runBench("bench_ec_g2_scalar_mul", "gcc", useAsm = false)
task bench_ec_g2_scalar_mul_clang_noasm, "Run benchmark on Elliptic Curve group 𝔾2 (Multi-Scalar-Mul) - Clang no Assembly":
runBench("bench_ec_g2_scalar_mul", "clang", useAsm = false)
task bench_ec_g2_scalar_mul_noasm, "Run benchmark on Elliptic Curve group 𝔾2 (Multi-Scalar-Mul) - CC compiler no Assembly":
runBench("bench_ec_g2_scalar_mul", useAsm = false)
# Pairings
# ------------------------------------------
task bench_pairing_bls12_377, "Run pairings benchmarks for BLS12-377 - Default compiler":
task bench_pairing_bls12_377, "Run pairings benchmarks for BLS12-377 - CC compiler":
runBench("bench_pairing_bls12_377")
task bench_pairing_bls12_377_gcc, "Run pairings benchmarks for BLS12-377 - GCC":
runBench("bench_pairing_bls12_377", "gcc")
task bench_pairing_bls12_377_clang, "Run pairings benchmarks for BLS12-377 - Clang":
runBench("bench_pairing_bls12_377", "clang")
task bench_pairing_bls12_377_gcc_noasm, "Run pairings benchmarks for BLS12-377 - GCC no Assembly":
runBench("bench_pairing_bls12_377", "gcc", useAsm = false)
task bench_pairing_bls12_377_clang_noasm, "Run pairings benchmarks for BLS12-377 - Clang no Assembly":
runBench("bench_pairing_bls12_377", "clang", useAsm = false)
task bench_pairing_bls12_377_noasm, "Run pairings benchmarks for BLS12-377 - CC compiler no Assembly":
runBench("bench_pairing_bls12_377", useAsm = false)
# --
task bench_pairing_bls12_381, "Run pairings benchmarks for BLS12-381 - Default compiler":
task bench_pairing_bls12_381, "Run pairings benchmarks for BLS12-381 - CC compiler":
runBench("bench_pairing_bls12_381")
task bench_pairing_bls12_381_gcc, "Run pairings benchmarks for BLS12-381 - GCC":
runBench("bench_pairing_bls12_381", "gcc")
task bench_pairing_bls12_381_clang, "Run pairings benchmarks for BLS12-381 - Clang":
runBench("bench_pairing_bls12_381", "clang")
task bench_pairing_bls12_381_gcc_noasm, "Run pairings benchmarks for BLS12-381 - GCC no Assembly":
runBench("bench_pairing_bls12_381", "gcc", useAsm = false)
task bench_pairing_bls12_381_clang_noasm, "Run pairings benchmarks for BLS12-381 - Clang no Assembly":
runBench("bench_pairing_bls12_381", "clang", useAsm = false)
task bench_pairing_bls12_381_noasm, "Run pairings benchmarks for BLS12-381 - CC compiler no Assembly":
runBench("bench_pairing_bls12_381", useAsm = false)
# --
task bench_pairing_bn254_nogami, "Run pairings benchmarks for BN254-Nogami - Default compiler":
task bench_pairing_bn254_nogami, "Run pairings benchmarks for BN254-Nogami - CC compiler":
runBench("bench_pairing_bn254_nogami")
task bench_pairing_bn254_nogami_gcc, "Run pairings benchmarks for BN254-Nogami - GCC":
runBench("bench_pairing_bn254_nogami", "gcc")
task bench_pairing_bn254_nogami_clang, "Run pairings benchmarks for BN254-Nogami - Clang":
runBench("bench_pairing_bn254_nogami", "clang")
task bench_pairing_bn254_nogami_gcc_noasm, "Run pairings benchmarks for BN254-Nogami - GCC no Assembly":
runBench("bench_pairing_bn254_nogami", "gcc", useAsm = false)
task bench_pairing_bn254_nogami_clang_noasm, "Run pairings benchmarks for BN254-Nogami - Clang no Assembly":
runBench("bench_pairing_bn254_nogami", "clang", useAsm = false)
task bench_pairing_bn254_nogami_noasm, "Run pairings benchmarks for BN254-Nogami - CC compiler no Assembly":
runBench("bench_pairing_bn254_nogami", useAsm = false)
# --
task bench_pairing_bn254_snarks, "Run pairings benchmarks for BN254-Snarks - Default compiler":
task bench_pairing_bn254_snarks, "Run pairings benchmarks for BN254-Snarks - CC compiler":
runBench("bench_pairing_bn254_snarks")
task bench_pairing_bn254_snarks_gcc, "Run pairings benchmarks for BN254-Snarks - GCC":
runBench("bench_pairing_bn254_snarks", "gcc")
task bench_pairing_bn254_snarks_clang, "Run pairings benchmarks for BN254-Snarks - Clang":
runBench("bench_pairing_bn254_snarks", "clang")
task bench_pairing_bn254_snarks_gcc_noasm, "Run pairings benchmarks for BN254-Snarks - GCC no Assembly":
runBench("bench_pairing_bn254_snarks", "gcc", useAsm = false)
task bench_pairing_bn254_snarks_clang_noasm, "Run pairings benchmarks for BN254-Snarks - Clang no Assembly":
runBench("bench_pairing_bn254_snarks", "clang", useAsm = false)
task bench_pairing_bn254_snarks_noasm, "Run pairings benchmarks for BN254-Snarks - CC compiler no Assembly":
runBench("bench_pairing_bn254_snarks", useAsm = false)
# Curve summaries
# ------------------------------------------
task bench_summary_bls12_377, "Run summary benchmarks for BLS12-377 - Default compiler":
task bench_summary_bls12_377, "Run summary benchmarks for BLS12-377 - CC compiler":
runBench("bench_summary_bls12_377")
task bench_summary_bls12_377_gcc, "Run summary benchmarks for BLS12-377 - GCC":
runBench("bench_summary_bls12_377", "gcc")
task bench_summary_bls12_377_clang, "Run summary benchmarks for BLS12-377 - Clang":
runBench("bench_summary_bls12_377", "clang")
task bench_summary_bls12_377_gcc_noasm, "Run summary benchmarks for BLS12-377 - GCC no Assembly":
runBench("bench_summary_bls12_377", "gcc", useAsm = false)
task bench_summary_bls12_377_clang_noasm, "Run summary benchmarks for BLS12-377 - Clang no Assembly":
runBench("bench_summary_bls12_377", "clang", useAsm = false)
task bench_summary_bls12_377_noasm, "Run summary benchmarks for BLS12-377 - CC compiler no Assembly":
runBench("bench_summary_bls12_377", useAsm = false)
# --
task bench_summary_bls12_381, "Run summary benchmarks for BLS12-381 - Default compiler":
task bench_summary_bls12_381, "Run summary benchmarks for BLS12-381 - CC compiler":
runBench("bench_summary_bls12_381")
task bench_summary_bls12_381_gcc, "Run summary benchmarks for BLS12-381 - GCC":
runBench("bench_summary_bls12_381", "gcc")
task bench_summary_bls12_381_clang, "Run summary benchmarks for BLS12-381 - Clang":
runBench("bench_summary_bls12_381", "clang")
task bench_summary_bls12_381_gcc_noasm, "Run summary benchmarks for BLS12-381 - GCC no Assembly":
runBench("bench_summary_bls12_381", "gcc", useAsm = false)
task bench_summary_bls12_381_clang_noasm, "Run summary benchmarks for BLS12-381 - Clang no Assembly":
runBench("bench_summary_bls12_381", "clang", useAsm = false)
task bench_summary_bls12_381_noasm, "Run summary benchmarks for BLS12-381 - CC compiler no Assembly":
runBench("bench_summary_bls12_381", useAsm = false)
# --
task bench_summary_bn254_nogami, "Run summary benchmarks for BN254-Nogami - Default compiler":
task bench_summary_bn254_nogami, "Run summary benchmarks for BN254-Nogami - CC compiler":
runBench("bench_summary_bn254_nogami")
task bench_summary_bn254_nogami_gcc, "Run summary benchmarks for BN254-Nogami - GCC":
runBench("bench_summary_bn254_nogami", "gcc")
task bench_summary_bn254_nogami_clang, "Run summary benchmarks for BN254-Nogami - Clang":
runBench("bench_summary_bn254_nogami", "clang")
task bench_summary_bn254_nogami_gcc_noasm, "Run summary benchmarks for BN254-Nogami - GCC no Assembly":
runBench("bench_summary_bn254_nogami", "gcc", useAsm = false)
task bench_summary_bn254_nogami_clang_noasm, "Run summary benchmarks for BN254-Nogami - Clang no Assembly":
runBench("bench_summary_bn254_nogami", "clang", useAsm = false)
task bench_summary_bn254_nogami_noasm, "Run summary benchmarks for BN254-Nogami - CC compiler no Assembly":
runBench("bench_summary_bn254_nogami", useAsm = false)
# --
task bench_summary_bn254_snarks, "Run summary benchmarks for BN254-Snarks - Default compiler":
task bench_summary_bn254_snarks, "Run summary benchmarks for BN254-Snarks - CC compiler":
runBench("bench_summary_bn254_snarks")
task bench_summary_bn254_snarks_gcc, "Run summary benchmarks for BN254-Snarks - GCC":
runBench("bench_summary_bn254_snarks", "gcc")
task bench_summary_bn254_snarks_clang, "Run summary benchmarks for BN254-Snarks - Clang":
runBench("bench_summary_bn254_snarks", "clang")
task bench_summary_bn254_snarks_gcc_noasm, "Run summary benchmarks for BN254-Snarks - GCC no Assembly":
runBench("bench_summary_bn254_snarks", "gcc", useAsm = false)
task bench_summary_bn254_snarks_clang_noasm, "Run summary benchmarks for BN254-Snarks - Clang no Assembly":
runBench("bench_summary_bn254_snarks", "clang", useAsm = false)
task bench_summary_bn254_snarks_noasm, "Run summary benchmarks for BN254-Snarks - CC compiler no Assembly":
runBench("bench_summary_bn254_snarks", useAsm = false)
# --
task bench_summary_pasta, "Run summary benchmarks for the Pasta curves - Default compiler":
task bench_summary_pasta, "Run summary benchmarks for the Pasta curves - CC compiler":
runBench("bench_summary_pasta")
task bench_summary_pasta_gcc, "Run summary benchmarks for the Pasta curves - GCC":
runBench("bench_summary_pasta", "gcc")
task bench_summary_pasta_clang, "Run summary benchmarks for the Pasta curves - Clang":
runBench("bench_summary_pasta", "clang")
task bench_summary_pasta_gcc_noasm, "Run summary benchmarks for the Pasta curves - GCC no Assembly":
runBench("bench_summary_pasta", "gcc", useAsm = false)
task bench_summary_pasta_clang_noasm, "Run summary benchmarks for the Pasta curves - Clang no Assembly":
runBench("bench_summary_pasta", "clang", useAsm = false)
task bench_summary_pasta_noasm, "Run summary benchmarks for the Pasta curves - CC compiler no Assembly":
runBench("bench_summary_pasta", useAsm = false)
# Hashes
# ------------------------------------------
@ -1185,31 +1031,13 @@ task bench_sha256, "Run SHA256 benchmarks":
task bench_hash_to_curve, "Run Hash-to-Curve benchmarks":
runBench("bench_hash_to_curve")
task bench_hash_to_curve_gcc, "Run Hash-to-Curve benchmarks":
runBench("bench_hash_to_curve", "gcc")
task bench_hash_to_curve_clang, "Run Hash-to-Curve benchmarks":
runBench("bench_hash_to_curve", "clang")
task bench_hash_to_curve_gcc_noasm, "Run Hash-to-Curve benchmarks":
runBench("bench_hash_to_curve", "gcc", useAsm = false)
task bench_hash_to_curve_clang_noasm, "Run Hash-to-Curve benchmarks":
runBench("bench_hash_to_curve", "clang", useAsm = false)
task bench_hash_to_curve_noasm, "Run Hash-to-Curve benchmarks - No Assembly":
runBench("bench_hash_to_curve", useAsm = false)
# BLS signatures
# ------------------------------------------
task bench_ethereum_bls_signatures, "Run Ethereum BLS signatures benchmarks":
task bench_ethereum_bls_signatures, "Run Ethereum BLS signatures benchmarks - CC compiler":
runBench("bench_ethereum_bls_signatures")
task bench_ethereum_bls_signatures_gcc, "Run Ethereum BLS signatures benchmarks":
runBench("bench_ethereum_bls_signatures", "gcc")
task bench_ethereum_bls_signatures_clang, "Run Ethereum BLS signatures benchmarks":
runBench("bench_ethereum_bls_signatures", "clang")
task bench_ethereum_bls_signatures_gcc_noasm, "Run Ethereum BLS signatures benchmarks":
runBench("bench_ethereum_bls_signatures", "gcc", useAsm = false)
task bench_ethereum_bls_signatures_clang_noasm, "Run Ethereum BLS signatures benchmarks":
runBench("bench_ethereum_bls_signatures", "clang", useAsm = false)
task bench_ethereum_bls_signatures_noasm, "Run Ethereum BLS signatures benchmarks - CC compiler no assembly":
runBench("bench_ethereum_bls_signatures", useAsm = false)

View File

@ -50,7 +50,7 @@ import ./zoo_exports
static:
# Export SHA256 routines with a protocol-specific prefix
# This exports sha256.init(), sha256.update(), sha256.finish() and sha256.clear()
prefix_sha256 = prefix_ffi & "_sha256_"
prefix_sha256 = prefix_ffi & "sha256_"
import hashes
export hashes # generic sandwich on sha256
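To illustrate the prefix fix above — assuming `prefix_ffi` already ends with an underscore (the "ctt_eth_bls_" value below is an assumed example, not taken from this commit) — the old concatenation doubled the underscore in every exported SHA256 symbol:
const prefix_ffi_example = "ctt_eth_bls_"   # assumed example value
doAssert prefix_ffi_example & "_sha256_" & "init" == "ctt_eth_bls__sha256_init"  # before: doubled underscore
doAssert prefix_ffi_example & "sha256_"  & "init" == "ctt_eth_bls_sha256_init"   # after: clean exported name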

View File

@ -10,6 +10,7 @@ import
# Standard library
std/macros,
# Internal
./limbs_asm_modular_x86,
../../../platforms/abstractions
# ############################################################
@ -32,7 +33,7 @@ static: doAssert UseASM_X86_64
# Double-precision field addition
# ------------------------------------------------------------
macro addmod2x_gen[N: static int](R: var Limbs[N], A, B: Limbs[N], m: Limbs[N div 2]): untyped =
macro addmod2x_gen[N: static int](r_PIR: var Limbs[N], a_MEM, b_MEM: Limbs[N], M_MEM: Limbs[N div 2], spareBits: static int): untyped =
## Generate an optimized out-of-place double-precision addition kernel
result = newStmtList()
@ -41,23 +42,28 @@ macro addmod2x_gen[N: static int](R: var Limbs[N], A, B: Limbs[N], m: Limbs[N di
let
H = N div 2
r = init(OperandArray, nimSymbol = R, N, PointerInReg, InputOutput)
# We reuse the reg used for b for overflow detection
b = init(OperandArray, nimSymbol = B, N, PointerInReg, InputOutput)
r = asmArray(r_PIR, N, PointerInReg, asmInputOutputEarlyClobber, memIndirect = memWrite) # MemOffsettable is the better constraint but compilers say it is impossible. Use early clobber to ensure it is not affected by constant propagation at slight pessimization (reloading it).
b = asmArray(b_MEM, N, MemOffsettable, asmInput)
# We could force m as immediate by specializing per moduli
M = init(OperandArray, nimSymbol = m, N, PointerInReg, Input)
M = asmArray(M_MEM, H, MemOffsettable, asmInput)
# If N is too big, we need to spill registers. TODO.
u = init(OperandArray, nimSymbol = ident"U", H, ElemsInReg, InputOutput)
v = init(OperandArray, nimSymbol = ident"V", H, ElemsInReg, InputOutput)
uSym = ident"u"
vSym = ident"v"
u = asmArray(uSym, H, ElemsInReg, asmInputOutput)
v = asmArray(vSym, H, ElemsInReg, asmInputOutput)
overflowRegSym = ident"overflowReg"
overflowReg = asmValue(overflowRegSym, Reg, asmOutputOverwrite)
let usym = u.nimSymbol
let vsym = v.nimSymbol
result.add quote do:
var `usym`{.noinit.}, `vsym` {.noInit.}: typeof(`A`)
var `uSym`{.noinit.}, `vSym` {.noInit.}: typeof(`a_MEM`)
staticFor i, 0, `H`:
`usym`[i] = `A`[i]
`uSym`[i] = `a_MEM`[i]
staticFor i, `H`, `N`:
`vsym`[i-`H`] = `A`[i]
`vSym`[i-`H`] = `a_MEM`[i]
when `sparebits` == 0:
var `overflowRegSym`{.noInit.}: BaseType
# Addition
# u = a[0..<H] + b[0..<H], v = a[H..<N]
@ -72,38 +78,26 @@ macro addmod2x_gen[N: static int](R: var Limbs[N], A, B: Limbs[N], m: Limbs[N di
ctx.adc v[i-H], b[i]
ctx.mov u[i-H], v[i-H]
# Mask: overflowed contains 0xFFFF or 0x0000
# TODO: unnecessary if MSB never set, i.e. "Field.getSpareBits >= 1"
let overflowed = b.reuseRegister()
ctx.sbb overflowed, overflowed
let rUpperHalf = r.subset(H, N)
# Now substract the modulus to test a < 2ⁿp
ctx.sub v[0], M[0]
for i in 1 ..< H:
ctx.sbb v[i], M[i]
if spareBits >= 1:
# Now substract the modulus to test a < 2ⁿp
ctx.finalSubNoOverflowImpl(rUpperHalf, v, M, u)
else:
ctx.finalSubMayOverflowImpl(rUpperHalf, v, M, u, scratchReg = overflowReg)
# If it overflows here, it means that it was
# smaller than the modulus and we don't need v
ctx.sbb overflowed, 0
result.add ctx.generate()
# Conditional Mov and
# and store result
for i in 0 ..< H:
ctx.cmovnc u[i], v[i]
ctx.mov r[i+H], u[i]
result.add ctx.generate
func addmod2x_asm*[N: static int](r: var Limbs[N], a, b: Limbs[N], M: Limbs[N div 2]) {.noInline.} =
func addmod2x_asm*[N: static int](r: var Limbs[N], a, b: Limbs[N], M: Limbs[N div 2], spareBits: static int) =
## Constant-time double-precision addition
## Output is conditionally reduced by 2ⁿp
## to stay in the [0, 2ⁿp) range
addmod2x_gen(r, a, b, M)
addmod2x_gen(r, a, b, M, spareBits)
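A usage sketch for scale only (names and initialization are illustrative, module import omitted): with a 4-limb modulus, the double-precision operands span 8 limbs.
var r {.noInit.}: Limbs[8]
var a, b: Limbs[8]   # double-precision operands in [0, 2ⁿp), assumed initialized elsewhere
var M: Limbs[4]      # the modulus p
addmod2x_asm(r, a, b, M, spareBits = 1)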
# Double-precision field substraction
# ------------------------------------------------------------
macro submod2x_gen[N: static int](R: var Limbs[N], A, B: Limbs[N], m: Limbs[N div 2]): untyped =
macro submod2x_gen[N: static int](r_PIR: var Limbs[N], a_MEM, b_PIR: Limbs[N], M_MEM: Limbs[N div 2]): untyped =
## Generate an optimized out-of-place double-precision substraction kernel
result = newStmtList()
@ -112,23 +106,22 @@ macro submod2x_gen[N: static int](R: var Limbs[N], A, B: Limbs[N], m: Limbs[N di
let
H = N div 2
r = init(OperandArray, nimSymbol = R, N, PointerInReg, InputOutput)
# We reuse the reg used for b for overflow detection
b = init(OperandArray, nimSymbol = B, N, PointerInReg, InputOutput)
r = asmArray(r_PIR, N, PointerInReg, asmInputOutputEarlyClobber, memIndirect = memWrite) # MemOffsettable is the better constraint but compilers say it is impossible. Use early clobber to ensure it is not affected by constant propagation at slight pessimization (reloading it).
b = asmArray(b_PIR, N, PointerInReg, asmInputOutputEarlyClobber, memIndirect = memRead) # We reuse the reg used for b for overflow detection
# We could force m as immediate by specializing per moduli
M = init(OperandArray, nimSymbol = m, N, PointerInReg, Input)
M = asmArray(M_MEM, H, MemOffsettable, asmInput)
# If N is too big, we need to spill registers. TODO.
u = init(OperandArray, nimSymbol = ident"U", H, ElemsInReg, InputOutput)
v = init(OperandArray, nimSymbol = ident"V", H, ElemsInReg, InputOutput)
uSym = ident"u"
vSym = ident"v"
u = asmArray(uSym, H, ElemsInReg, asmInputOutput)
v = asmArray(vSym, H, ElemsInReg, asmInputOutput)
let usym = u.nimSymbol
let vsym = v.nimSymbol
result.add quote do:
var `usym`{.noinit.}, `vsym` {.noInit.}: typeof(`A`)
var `uSym`{.noinit.}, `vSym` {.noInit.}: typeof(`a_MEM`)
staticFor i, 0, `H`:
`usym`[i] = `A`[i]
`uSym`[i] = `a_MEM`[i]
staticFor i, `H`, `N`:
`vsym`[i-`H`] = `A`[i]
`vSym`[i-`H`] = `a_MEM`[i]
# Substraction
# u = a[0..<H] - b[0..<H], v = a[H..<N]
@ -158,9 +151,9 @@ macro submod2x_gen[N: static int](R: var Limbs[N], A, B: Limbs[N], m: Limbs[N di
ctx.adc u[i], v[i]
ctx.mov r[i+H], u[i]
result.add ctx.generate
result.add ctx.generate()
func submod2x_asm*[N: static int](r: var Limbs[N], a, b: Limbs[N], M: Limbs[N div 2]) {.noInline.} =
func submod2x_asm*[N: static int](r: var Limbs[N], a, b: Limbs[N], M: Limbs[N div 2]) =
## Constant-time double-precision substraction
## Output is conditionally reduced by 2ⁿp
## to stay in the [0, 2ⁿp) range
@ -169,7 +162,7 @@ func submod2x_asm*[N: static int](r: var Limbs[N], a, b: Limbs[N], M: Limbs[N di
# Double-precision field negation
# ------------------------------------------------------------
macro negmod2x_gen[N: static int](R: var Limbs[N], A: Limbs[N], m: Limbs[N div 2]): untyped =
macro negmod2x_gen[N: static int](r_PIR: var Limbs[N], a_MEM: Limbs[N], M_MEM: Limbs[N div 2]): untyped =
## Generate an optimized modular negation kernel
result = newStmtList()
@ -178,22 +171,20 @@ macro negmod2x_gen[N: static int](R: var Limbs[N], A: Limbs[N], m: Limbs[N div 2
let
H = N div 2
a = init(OperandArray, nimSymbol = A, N, PointerInReg, Input)
r = init(OperandArray, nimSymbol = R, N, PointerInReg, InputOutput)
u = init(OperandArray, nimSymbol = ident"U", N, ElemsInReg, Output_EarlyClobber)
a = asmArray(a_MEM, N, MemOffsettable, asmInput)
r = asmArray(r_PIR, N, PointerInReg, asmInputOutputEarlyClobber, memIndirect = memWrite) # MemOffsettable is the better constraint but compilers say it is impossible. Use early clobber to ensure it is not affected by constant propagation at slight pessimization (reloading it).
uSym = ident"u"
u = asmArray(uSym, N, ElemsInReg, asmOutputEarlyClobber)
# We could force m as immediate by specializing per moduli
# We reuse the reg used for m for overflow detection
M = init(OperandArray, nimSymbol = m, N, PointerInReg, InputOutput)
M = asmArray(M_MEM, N, MemOffsettable, asmInput)
isZero = Operand(
desc: OperandDesc(
asmId: "[isZero]",
nimSymbol: ident"isZero",
rm: Reg,
constraint: Output_EarlyClobber,
cEmit: "isZero"
)
)
isZeroSym = ident"isZero"
isZero = asmValue(isZeroSym, Reg, asmOutputEarlyClobber)
result.add quote do:
var `isZerosym`{.noInit.}: BaseType
var `usym`{.noinit, used.}: typeof(`a_MEM`)
# Substraction 2ⁿp - a
# The lower half of 2ⁿp is filled with zero
@ -227,13 +218,8 @@ macro negmod2x_gen[N: static int](R: var Limbs[N], A: Limbs[N], m: Limbs[N div 2
ctx.cmovz u[i-H], isZero
ctx.mov r[i], u[i-H]
let isZerosym = isZero.desc.nimSymbol
let usym = u.nimSymbol
result.add quote do:
var `isZerosym`{.noInit.}: BaseType
var `usym`{.noinit, used.}: typeof(`A`)
result.add ctx.generate
result.add ctx.generate()
func negmod2x_asm*[N: static int](r: var Limbs[N], a: Limbs[N], M: Limbs[N div 2]) {.noInline.} =
func negmod2x_asm*[N: static int](r: var Limbs[N], a: Limbs[N], M: Limbs[N div 2]) =
## Constant-time double-precision negation
negmod2x_gen(r, a, M)

View File

@ -18,11 +18,6 @@ import
#
# ############################################################
# Note: We can refer to at most 30 registers in inline assembly
# and "InputOutput" registers count double
# They are nice to let the compiler deals with mov
# but too constraining so we move things ourselves.
static: doAssert UseASM_X86_32
# Necessary for the compiler to find enough registers
@ -31,7 +26,8 @@ static: doAssert UseASM_X86_32
proc finalSubNoOverflowImpl*(
ctx: var Assembler_x86,
r: Operand or OperandArray,
a, M, scratch: OperandArray) =
a, M, scratch: OperandArray,
a_in_scratch = false) =
## Reduce `a` into `r` modulo `M`
## To be used when the modulus does not use the full bitwidth of the storing words
## for example a 255-bit modulus in n words of total max size 2^256
@ -42,10 +38,12 @@ proc finalSubNoOverflowImpl*(
ctx.comment "Final substraction (cannot overflow its limbs)"
# Substract the modulus, and test a < p with the last borrow
ctx.mov scratch[0], a[0]
if not a_in_scratch:
ctx.mov scratch[0], a[0]
ctx.sub scratch[0], M[0]
for i in 1 ..< N:
ctx.mov scratch[i], a[i]
if not a_in_scratch:
ctx.mov scratch[i], a[i]
ctx.sbb scratch[i], M[i]
# If we borrowed it means that we were smaller than
@ -58,13 +56,15 @@ proc finalSubMayOverflowImpl*(
ctx: var Assembler_x86,
r: Operand or OperandArray,
a, M, scratch: OperandArray,
scratchReg: Operand or Register or OperandReuse) =
a_in_scratch = false,
scratchReg: Operand or Register or OperandReuse = rax) =
## Reduce `a` into `r` modulo `M`
## To be used when the final substraction can
## also overflow the limbs (a 2^256 order of magnitude modulus stored in n words of total max size 2^256)
##
## r, a, scratch, scratchReg are mutated
## r, a, scratch are mutated
## M is read-only
## This clobbers RAX
let N = M.len
ctx.comment "Final substraction (may carry)"
@ -72,10 +72,12 @@ proc finalSubMayOverflowImpl*(
ctx.sbb scratchReg, scratchReg
# Now substract the modulus, and test a < p with the last borrow
ctx.mov scratch[0], a[0]
if not a_in_scratch:
ctx.mov scratch[0], a[0]
ctx.sub scratch[0], M[0]
for i in 1 ..< N:
ctx.mov scratch[i], a[i]
if not a_in_scratch:
ctx.mov scratch[i], a[i]
ctx.sbb scratch[i], M[i]
# If it overflows here, it means that it was
@ -89,9 +91,10 @@ proc finalSubMayOverflowImpl*(
ctx.mov r[i], a[i]
macro finalSub_gen*[N: static int](
r_PIR: var array[N, SecretWord],
a_EIR, M_PIR: array[N, SecretWord],
scratch_EIR: var array[N, SecretWord],
r_PIR: var Limbs[N],
a_EIR: Limbs[N],
M_MEM: Limbs[N],
scratch_EIR: var Limbs[N],
mayOverflow: static bool): untyped =
## Returns:
## a-M if a > M
@ -99,35 +102,32 @@ macro finalSub_gen*[N: static int](
##
## - r_PIR is a pointer to the result array, mutated,
## - a_EIR is an array of registers, mutated,
## - M_PIR is a pointer to an array, read-only,
## - M_MEM is a pointer to an array, read-only,
## - scratch_EIR is an array of registers, mutated
## - mayOverflow is set to true when the carry flag also needs to be read
result = newStmtList()
var ctx = init(Assembler_x86, BaseType)
let
r = init(OperandArray, nimSymbol = r_PIR, N, PointerInReg, InputOutput)
r = asmArray(r_PIR, N, PointerInReg, asmInputOutputEarlyClobber, memIndirect = memWrite) # MemOffsettable is the better constraint but compilers say it is impossible. Use early clobber to ensure it is not affected by constant propagation at slight pessimization (reloading it).
# We reuse the reg used for b for overflow detection
a = init(OperandArray, nimSymbol = a_EIR, N, ElemsInReg, InputOutput)
a = asmArray(a_EIR, N, ElemsInReg, asmInputOutput)
# We could force m as immediate by specializing per moduli
M = init(OperandArray, nimSymbol = M_PIR, N, PointerInReg, Input)
t = init(OperandArray, nimSymbol = scratch_EIR, N, ElemsInReg, Output_EarlyClobber)
M = asmArray(M_MEM, N, MemOffsettable, asmInput)
t = asmArray(scratch_EIR, N, ElemsInReg, asmOutputEarlyClobber)
if mayOverflow:
ctx.finalSubMayOverflowImpl(
r, a, M, t, rax
)
ctx.finalSubMayOverflowImpl(r, a, M, t)
else:
ctx.finalSubNoOverflowImpl(
r, a, M, t
)
ctx.finalSubNoOverflowImpl(r, a, M, t)
result.add ctx.generate()
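For reference, a plain-Nim model of what the generated final subtraction computes, on public uint64 limbs and without the constant-time register plumbing; this is an illustration, not code from this commit:
func finalSubRef[N: static int](r: var array[N, uint64], a, M: array[N, uint64]) =
  ## r <- a-M if a >= M, else r <- a (branchless selection)
  var t: array[N, uint64]
  var borrow = 0'u64
  for i in 0 ..< N:
    t[i] = a[i] - M[i] - borrow    # wraps modulo 2^64, like the sub/sbb chain
    borrow = (if a[i] < M[i]: 1'u64 elif a[i] == M[i]: borrow else: 0'u64)
  let keepA = 0'u64 - borrow       # all-ones mask when the subtraction borrowed, i.e. a < M
  for i in 0 ..< N:
    r[i] = (a[i] and keepA) or (t[i] and not keepA)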
# Field addition
# ------------------------------------------------------------
macro addmod_gen[N: static int](R: var Limbs[N], A, B, m: Limbs[N], spareBits: static int): untyped =
macro addmod_gen[N: static int](r_PIR: var Limbs[N], a_PIR, b_PIR, M_MEM: Limbs[N], spareBits: static int): untyped =
## Generate an optimized modular addition kernel
# Register pressure note:
# We could generate a kernel per modulus m by hardcoding it as immediate
@ -139,21 +139,20 @@ macro addmod_gen[N: static int](R: var Limbs[N], A, B, m: Limbs[N], spareBits: s
var ctx = init(Assembler_x86, BaseType)
let
r = init(OperandArray, nimSymbol = R, N, PointerInReg, InputOutput)
# We reuse the reg used for b for overflow detection
b = init(OperandArray, nimSymbol = B, N, PointerInReg, InputOutput)
r = asmArray(r_PIR, N, PointerInReg, asmInputOutputEarlyClobber, memIndirect = memWrite) # MemOffsettable is the better constraint but compilers say it is impossible. Use early clobber to ensure it is not affected by constant propagation at slight pessimization (reloading it).
b = asmArray(b_PIR, N, PointerInReg, asmInputOutputEarlyClobber, memIndirect = memRead) # LLVM Gold linker runs out of registers in t_ec_shortw_prj_g1_sum_reduce if we use b as Memoffsettable and a separate overflow register
# We could force m as immediate by specializing per moduli
M = init(OperandArray, nimSymbol = m, N, PointerInReg, Input)
M = asmArray(M_MEM, N, MemOffsettable, asmInput)
# If N is too big, we need to spill registers. TODO.
u = init(OperandArray, nimSymbol = ident"u", N, ElemsInReg, InputOutput)
v = init(OperandArray, nimSymbol = ident"v", N, ElemsInReg, Output_EarlyClobber)
uSym = ident"u"
vSym = ident"v"
u = asmArray(uSym, N, ElemsInReg, asmInputOutput)
v = asmArray(vSym, N, ElemsInReg, asmOutputEarlyClobber)
let usym = u.nimSymbol
let vsym = v.nimSymbol
result.add quote do:
var `usym`{.noinit.}, `vsym` {.noInit, used.}: typeof(`A`)
var `usym`{.noinit.}, `vsym` {.noInit, used.}: typeof(`a_PIR`)
staticFor i, 0, `N`:
`usym`[i] = `A`[i]
`usym`[i] = `a_PIR`[i]
# Addition
ctx.add u[0], b[0]
@ -164,23 +163,20 @@ macro addmod_gen[N: static int](R: var Limbs[N], A, B, m: Limbs[N], spareBits: s
ctx.mov v[i], u[i]
if spareBits >= 1:
ctx.finalSubNoOverflowImpl(r, u, M, v)
ctx.finalSubNoOverflowImpl(r, u, M, v, a_in_scratch = true)
else:
ctx.finalSubMayOverflowImpl(
r, u, M, v, b.reuseRegister()
)
ctx.finalSubMayOverflowImpl(r, u, M, v, a_in_scratch = true, scratchReg = b.reuseRegister())
result.add ctx.generate()
func addmod_asm*(r: var Limbs, a, b, m: Limbs, spareBits: static int) {.noInline.} =
func addmod_asm*(r: var Limbs, a, b, M: Limbs, spareBits: static int) =
## Constant-time modular addition
# This MUST be noInline or Clang will run out of registers with LTO
addmod_gen(r, a, b, m, spareBits)
addmod_gen(r, a, b, M, spareBits)
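Illustrative call only (a 4-limb, 255-bit prime leaves at least one spare bit; operands assumed already reduced below M, import omitted):
var r {.noInit.}: Limbs[4]
var a, b, M: Limbs[4]   # assumed initialized elsewhere, with a, b < M
addmod_asm(r, a, b, M, spareBits = 1)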
# Field substraction
# ------------------------------------------------------------
macro submod_gen[N: static int](R: var Limbs[N], A, B, m: Limbs[N]): untyped =
macro submod_gen[N: static int](r_PIR: var Limbs[N], a_PIR, b_PIR, M_MEM: Limbs[N]): untyped =
## Generate an optimized modular substraction kernel
# Register pressure note:
# We could generate a kernel per modulus m by hardcoding it as immediate
@ -192,21 +188,20 @@ macro submod_gen[N: static int](R: var Limbs[N], A, B, m: Limbs[N]): untyped =
var ctx = init(Assembler_x86, BaseType)
let
r = init(OperandArray, nimSymbol = R, N, PointerInReg, InputOutput)
# We reuse the reg used for b for overflow detection
b = init(OperandArray, nimSymbol = B, N, PointerInReg, InputOutput)
r = asmArray(r_PIR, N, PointerInReg, asmInputOutputEarlyClobber, memIndirect = memWrite) # MemOffsettable is the better constraint but compilers say it is impossible. Use early clobber to ensure it is not affected by constant propagation at slight pessimization (reloading it).
b = asmArray(b_PIR, N, PointerInReg, asmInputOutputEarlyClobber, memIndirect = memRead) # register reused for underflow detection
# We could force m as immediate by specializing per moduli
M = init(OperandArray, nimSymbol = m, N, PointerInReg, Input)
M = asmArray(M_MEM, N, MemOffsettable, asmInput)
# If N is too big, we need to spill registers. TODO.
u = init(OperandArray, nimSymbol = ident"U", N, ElemsInReg, InputOutput)
v = init(OperandArray, nimSymbol = ident"V", N, ElemsInReg, Output_EarlyClobber)
uSym = ident"u"
vSym = ident"v"
u = asmArray(uSym, N, ElemsInReg, asmInputOutput)
v = asmArray(vSym, N, ElemsInReg, asmOutputEarlyClobber)
let usym = u.nimSymbol
let vsym = v.nimSymbol
result.add quote do:
var `usym`{.noinit.}, `vsym` {.noInit, used.}: typeof(`A`)
var `usym`{.noinit.}, `vsym` {.noInit, used.}: typeof(`a_PIR`)
staticFor i, 0, `N`:
`usym`[i] = `A`[i]
`usym`[i] = `a_PIR`[i]
# Substraction
ctx.sub u[0], b[0]
@ -231,30 +226,37 @@ macro submod_gen[N: static int](R: var Limbs[N], A, B, m: Limbs[N]): untyped =
ctx.adc u[i], v[i]
ctx.mov r[i], u[i]
result.add ctx.generate
result.add ctx.generate()
func submod_asm*(r: var Limbs, a, b, M: Limbs) {.noInline.} =
func submod_asm*(r: var Limbs, a, b, M: Limbs) =
## Constant-time modular substraction
## Warning, does not handle aliasing of a and b
# This MUST be noInline or Clang will run out of registers with LTO
submod_gen(r, a, b, M)
# Field negation
# ------------------------------------------------------------
macro negmod_gen[N: static int](R: var Limbs[N], A, m: Limbs[N]): untyped =
macro negmod_gen[N: static int](r_PIR: var Limbs[N], a_MEM, M_MEM: Limbs[N]): untyped =
## Generate an optimized modular negation kernel
result = newStmtList()
var ctx = init(Assembler_x86, BaseType)
let
a = init(OperandArray, nimSymbol = A, N, PointerInReg, Input)
r = init(OperandArray, nimSymbol = R, N, PointerInReg, InputOutput)
u = init(OperandArray, nimSymbol = ident"U", N, ElemsInReg, Output_EarlyClobber)
a = asmArray(a_MEM, N, MemOffsettable, asmInput)
r = asmArray(r_PIR, N, PointerInReg, asmInputOutputEarlyClobber, memIndirect = memWrite) # MemOffsettable is the better constraint but compilers say it is impossible. Use early clobber to ensure it is not affected by constant propagation at slight pessimization (reloading it).
uSym = ident"u"
u = asmArray(uSym, N, ElemsInReg, asmOutputEarlyClobber)
# We could force m as immediate by specializing per moduli
# We reuse the reg used for m for overflow detection
M = init(OperandArray, nimSymbol = m, N, PointerInReg, InputOutput)
M = asmArray(M_MEM, N, MemOffsettable, asmInput)
isZeroSym = ident"isZero"
isZero = asmValue(isZeroSym, Reg, asmOutputEarlyClobber)
result.add quote do:
var `usym`{.noinit, used.}: typeof(`a_MEM`)
var `isZeroSym`{.noinit.}: BaseType
# Substraction m - a
ctx.mov u[0], M[0]
@ -264,7 +266,6 @@ macro negmod_gen[N: static int](R: var Limbs[N], A, m: Limbs[N]): untyped =
ctx.sbb u[i], a[i]
# Deal with a == 0
let isZero = M.reuseRegister()
ctx.mov isZero, a[0]
for i in 1 ..< N:
ctx.`or` isZero, a[i]
@ -274,11 +275,8 @@ macro negmod_gen[N: static int](R: var Limbs[N], A, m: Limbs[N]): untyped =
ctx.cmovz u[i], isZero
ctx.mov r[i], u[i]
let usym = u.nimSymbol
result.add quote do:
var `usym`{.noinit, used.}: typeof(`A`)
result.add ctx.generate
result.add ctx.generate()
func negmod_asm*(r: var Limbs, a, m: Limbs) =
func negmod_asm*(r: var Limbs, a, M: Limbs) =
## Constant-time modular negation
negmod_gen(r, a, m)
negmod_gen(r, a, M)
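As a plain-Nim reference of what this negation kernel computes (public uint64 limbs, illustration only, not constant-time-hardened): r = M - a, with the special case that a == 0 maps to 0 rather than M.
func negmodRef[N: static int](r: var array[N, uint64], a, M: array[N, uint64]) =
  var borrow = 0'u64
  var nonZero = 0'u64
  for i in 0 ..< N:
    r[i] = M[i] - a[i] - borrow
    borrow = (if M[i] < a[i]: 1'u64 elif M[i] == a[i]: borrow else: 0'u64)
    nonZero = nonZero or a[i]
  # Map -0 back to 0: if a was zero, M - a = M and must be cleared.
  let keep = if nonZero == 0: 0'u64 else: high(uint64)
  for i in 0 ..< N:
    r[i] = r[i] and keep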

View File

@ -21,11 +21,6 @@ import
#
# ############################################################
# Note: We can refer to at most 30 registers in inline assembly
# and "InputOutput" registers count double
# They are nice to let the compiler deals with mov
# but too constraining so we move things ourselves.
static: doAssert UseASM_X86_64
# Necessary for the compiler to find enough registers
@ -37,7 +32,7 @@ static: doAssert UseASM_X86_64
# Fallback when no ADX and BMI2 support (MULX, ADCX, ADOX)
macro mulMont_CIOS_sparebit_gen[N: static int](
r_PIR: var Limbs[N], a_PIR, b_PIR,
M_PIR: Limbs[N], m0ninv_REG: BaseType,
M_MEM: Limbs[N], m0ninv_REG: BaseType,
skipFinalSub: static bool): untyped =
## Generate an optimized Montgomery Multiplication kernel
## using the CIOS method
@ -58,29 +53,23 @@ macro mulMont_CIOS_sparebit_gen[N: static int](
scratchSlots = 6
# We could force M as immediate by specializing per moduli
M = init(OperandArray, nimSymbol = M_PIR, N, PointerInReg, Input)
M = asmArray(M_MEM, N, MemOffsettable, asmInput)
# If N is too big, we need to spill registers. TODO.
t = init(OperandArray, nimSymbol = ident"t", N, ElemsInReg, Output_EarlyClobber)
tSym = ident"t"
t = asmArray(tSym, N, ElemsInReg, asmOutputEarlyClobber)
# MultiPurpose Register slots
scratch = init(OperandArray, nimSymbol = ident"scratch", scratchSlots, ElemsInReg, InputOutput_EnsureClobber)
scratchSym = ident"scratch"
scratch = asmArray(scratchSym, scratchSlots, ElemsInReg, asmInputOutputEarlyClobber)
# MUL requires RAX and RDX
m0ninv = Operand(
desc: OperandDesc(
asmId: "[m0ninv]",
nimSymbol: m0ninv_REG,
rm: MemOffsettable,
constraint: Input,
cEmit: "&" & $m0ninv_REG
)
)
m0ninv = asmValue(m0ninv_REG, Mem, asmInput)
# We're really constrained by register and somehow setting as memory doesn't help
# So we store the result `r` in the scratch space and then reload it in RDX
# before the scratchspace is used in final substraction
a = scratch[0].asArrayAddr(len = N) # Store the `a` operand
b = scratch[1].asArrayAddr(len = N) # Store the `b` operand
a = scratch[0].asArrayAddr(a_PIR, len = N, memIndirect = memRead) # Store the `a` operand
b = scratch[1].asArrayAddr(b_PIR, len = N, memIndirect = memRead) # Store the `b` operand
A = scratch[2] # High part of extended precision multiplication
C = scratch[3]
m = scratch[4] # Stores (t[0] * m0ninv) mod 2ʷ
@ -96,12 +85,10 @@ macro mulMont_CIOS_sparebit_gen[N: static int](
# but this prevent reusing the same code for multiple curves like BLS12-377 and BLS12-381
# We might be able to save registers by having `r` and `M` be memory operand as well
let tsym = t.nimSymbol
let scratchSym = scratch.nimSymbol
result.add quote do:
static: doAssert: sizeof(SecretWord) == sizeof(ByteAddress)
var `tsym`{.noInit, used.}: typeof(`r_PIR`)
var `tSym`{.noInit, used.}: typeof(`r_PIR`)
# Assumes 64-bit limbs on 64-bit arch (or you can't store an address)
var `scratchSym` {.noInit.}: Limbs[`scratchSlots`]
`scratchSym`[0] = cast[SecretWord](`a_PIR`[0].unsafeAddr)
@ -172,26 +159,22 @@ macro mulMont_CIOS_sparebit_gen[N: static int](
ctx.mov t[N-1], A
ctx.mov rax, r # move r away from scratchspace that will be used for final substraction
let r2 = rax.asArrayAddr(len = N)
let r2 = rax.asArrayAddr(r_PIR, len = N, memIndirect = memWrite)
if skipFinalSub:
for i in 0 ..< N:
ctx.mov r2[i], t[i]
else:
ctx.finalSubNoOverflowImpl(
r2, t, M,
scratch
)
ctx.finalSubNoOverflowImpl(r2, t, M, scratch)
result.add ctx.generate()
func mulMont_CIOS_sparebit_asm*(r: var Limbs, a, b, M: Limbs, m0ninv: BaseType, skipFinalSub: static bool = false) {.noInline.} =
func mulMont_CIOS_sparebit_asm*(r: var Limbs, a, b, M: Limbs, m0ninv: BaseType, skipFinalSub: static bool = false) =
## Constant-time Montgomery multiplication
## If "skipFinalSub" is set
## the result is in the range [0, 2M)
## otherwise the result is in the range [0, M)
##
## This procedure can only be called if the modulus doesn't use the full bitwidth of its underlying representation
# This MUST be noInline or Clang will run out of registers with LTO
r.mulMont_CIOS_sparebit_gen(a, b, M, m0ninv, skipFinalSub)
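Illustrative call only; operands are assumed to already be in Montgomery form and m0ninv is the per-modulus Montgomery constant (negated inverse of M[0] modulo 2ʷ), shown here as a placeholder:
var r {.noInit.}: Limbs[4]
var aMont, bMont, M: Limbs[4]    # assumed initialized elsewhere
let m0ninv = BaseType(0)         # placeholder; precomputed per modulus in the real code
mulMont_CIOS_sparebit_asm(r, aMont, bMont, M, m0ninv, skipFinalSub = false)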
# Montgomery Squaring
@ -212,7 +195,7 @@ func squareMont_CIOS_asm*[N](
macro sumprodMont_CIOS_spare2bits_gen[N, K: static int](
r_PIR: var Limbs[N], a_PIR, b_PIR: array[K, Limbs[N]],
M_PIR: Limbs[N], m0ninv_REG: BaseType,
M_MEM: Limbs[N], m0ninv_REG: BaseType,
skipFinalSub: static bool): untyped =
## Generate an optimized Montgomery merged sum of products ⅀aᵢ.bᵢ kernel
## using the CIOS method
@ -242,29 +225,23 @@ macro sumprodMont_CIOS_spare2bits_gen[N, K: static int](
scratchSlots = 6
# We could force M as immediate by specializing per moduli
M = init(OperandArray, nimSymbol = M_PIR, N, PointerInReg, Input)
M = asmArray(M_MEM, N, MemOffsettable, asmInput)
# If N is too big, we need to spill registers. TODO.
t = init(OperandArray, nimSymbol = ident"t", N, ElemsInReg, Output_EarlyClobber)
tSym = ident"t"
t = asmArray(tSym, N, ElemsInReg, asmOutputEarlyClobber)
# MultiPurpose Register slots
scratch = init(OperandArray, nimSymbol = ident"scratch", scratchSlots, ElemsInReg, InputOutput_EnsureClobber)
scratchSym = ident"scratch"
scratch = asmArray(scratchSym, scratchSlots, ElemsInReg, asmInputOutputEarlyClobber)
# MUL requires RAX and RDX
m0ninv = Operand(
desc: OperandDesc(
asmId: "[m0ninv]",
nimSymbol: m0ninv_REG,
rm: MemOffsettable,
constraint: Input,
cEmit: "&" & $m0ninv_REG
)
)
m0ninv = asmValue(m0ninv_REG, Mem, asmInput)
# We're really constrained by register and somehow setting as memory doesn't help
# So we store the result `r` in the scratch space and then reload it in RDX
# before the scratchspace is used in final substraction
a = scratch[0].as2dArrayAddr(rows = K, cols = N) # Store the `a` operand
b = scratch[1].as2dArrayAddr(rows = K, cols = N) # Store the `b` operand
a = scratch[0].as2dArrayAddr(a_PIR, rows = K, cols = N, memIndirect = memRead) # Store the `a` operand
b = scratch[1].as2dArrayAddr(b_PIR, rows = K, cols = N, memIndirect = memRead) # Store the `b` operand
tN = scratch[2] # High part of extended precision multiplication
C = scratch[3] # Carry during reduction step
r = scratch[4] # Stores the `r` operand
@ -280,9 +257,6 @@ macro sumprodMont_CIOS_spare2bits_gen[N, K: static int](
# We can save 1 by hardcoding M as immediate (and m0ninv)
# but this prevent reusing the same code for multiple curves like BLS12-377 and BLS12-381
# We might be able to save registers by having `r` and `M` be memory operand as well
let tsym = t.nimSymbol
let scratchSym = scratch.nimSymbol
result.add quote do:
static: doAssert: sizeof(SecretWord) == sizeof(ByteAddress)
@ -377,7 +351,7 @@ macro sumprodMont_CIOS_spare2bits_gen[N, K: static int](
ctx.mov rax, r # move r away from scratchspace that will be used for final substraction
let r2 = rax.asArrayAddr(len = N)
let r2 = rax.asArrayAddr(r_PIR, len = N, memIndirect = memWrite)
if skipFinalSub:
ctx.comment " Copy result"
@ -387,8 +361,7 @@ macro sumprodMont_CIOS_spare2bits_gen[N, K: static int](
ctx.comment " Final substraction"
ctx.finalSubNoOverflowImpl(
r2, t, M,
scratch
)
scratch)
result.add ctx.generate()
func sumprodMont_CIOS_spare2bits_asm*[N, K: static int](

View File

@ -21,11 +21,6 @@ import
#
# ############################################################
# Note: We can refer to at most 30 registers in inline assembly
# and "InputOutput" registers count double
# They are nice to let the compiler deals with mov
# but too constraining so we move things ourselves.
static: doAssert UseASM_X86_64
# MULX/ADCX/ADOX
@ -176,7 +171,7 @@ proc partialRedx(
macro mulMont_CIOS_sparebit_adx_gen[N: static int](
r_PIR: var Limbs[N], a_PIR, b_PIR,
M_PIR: Limbs[N], m0ninv_REG: BaseType,
M_MEM: Limbs[N], m0ninv_REG: BaseType,
skipFinalSub: static bool): untyped =
## Generate an optimized Montgomery Multiplication kernel
## using the CIOS method
@ -193,18 +188,20 @@ macro mulMont_CIOS_sparebit_adx_gen[N: static int](
let
scratchSlots = 6
r = init(OperandArray, nimSymbol = r_PIR, N, PointerInReg, InputOutput_EnsureClobber)
r = asmArray(r_PIR, N, PointerInReg, asmInputOutputEarlyClobber, memIndirect = memWrite) # MemOffsettable is the better constraint but compilers say it is impossible. Use early clobber to ensure it is not affected by constant propagation at slight pessimization (reloading it). # Changing that to MemOffsettable triggers an error in negmod in test_bindings. Missing clobber?
# We could force M as immediate by specializing per moduli
M = init(OperandArray, nimSymbol = M_PIR, N, PointerInReg, Input)
M = asmArray(M_MEM, N, MemOffsettable, asmInput)
# If N is too big, we need to spill registers. TODO.
t = init(OperandArray, nimSymbol = ident"t", N, ElemsInReg, Output_EarlyClobber)
tSym = ident"t"
t = asmArray(tSym, N, ElemsInReg, asmOutputEarlyClobber)
# MultiPurpose Register slots
scratch = init(OperandArray, nimSymbol = ident"scratch", scratchSlots, ElemsInReg, InputOutput_EnsureClobber)
scratchSym = ident"scratch"
scratch = asmArray(scratchSym, scratchSlots, ElemsInReg, asmInputOutputEarlyClobber)
# MULX requires RDX as well
a = scratch[0].asArrayAddr(len = N) # Store the `a` operand
b = scratch[1].asArrayAddr(len = N) # Store the `b` operand
a = scratch[0].asArrayAddr(a_PIR, len = N, memIndirect = memRead) # Store the `a` operand
b = scratch[1].asArrayAddr(b_PIR, len = N, memIndirect = memRead) # Store the `b` operand
A = scratch[2] # High part of extended precision multiplication
C = scratch[3]
m0ninv = scratch[4] # Modular inverse of M[0]
@ -221,8 +218,6 @@ macro mulMont_CIOS_sparebit_adx_gen[N: static int](
# but this prevent reusing the same code for multiple curves like BLS12-377 and BLS12-381
# We might be able to save registers by having `r` and `M` be memory operand as well
let tsym = t.nimSymbol
let scratchSym = scratch.nimSymbol
result.add quote do:
static: doAssert: sizeof(SecretWord) == sizeof(ByteAddress)
@ -250,21 +245,18 @@ macro mulMont_CIOS_sparebit_adx_gen[N: static int](
A, t,
a,
b[0],
C
)
C)
else:
ctx.mulaccx_by_word(
A, t,
a, i,
b[i],
C
)
C)
ctx.partialRedx(
A, t,
M, m0ninv,
lo, C
)
lo, C)
if skipFinalSub:
for i in 0 ..< N:
@ -272,19 +264,9 @@ macro mulMont_CIOS_sparebit_adx_gen[N: static int](
else:
ctx.finalSubNoOverflowImpl(
r, t, M,
scratch
)
scratch)
result.add ctx.generate
func mulMont_CIOS_sparebit_asm_adx_inline*(r: var Limbs, a, b, M: Limbs, m0ninv: BaseType, skipFinalSub: static bool = false) {.inline.} =
## Constant-time Montgomery multiplication
## If "skipFinalSub" is set
## the result is in the range [0, 2M)
## otherwise the result is in the range [0, M)
##
## This procedure can only be called if the modulus doesn't use the full bitwidth of its underlying representation
r.mulMont_CIOS_sparebit_adx_gen(a, b, M, m0ninv, skipFinalSub)
result.add ctx.generate()
func mulMont_CIOS_sparebit_asm_adx*(r: var Limbs, a, b, M: Limbs, m0ninv: BaseType, skipFinalSub: static bool = false) =
## Constant-time Montgomery multiplication
@ -293,7 +275,7 @@ func mulMont_CIOS_sparebit_asm_adx*(r: var Limbs, a, b, M: Limbs, m0ninv: BaseTy
## otherwise the result is in the range [0, M)
##
## This procedure can only be called if the modulus doesn't use the full bitwidth of its underlying representation
r.mulMont_CIOS_sparebit_asm_adx_inline(a, b, M, m0ninv, skipFinalSub)
r.mulMont_CIOS_sparebit_adx_gen(a, b, M, m0ninv, skipFinalSub)
# Montgomery Squaring
# ------------------------------------------------------------
@ -313,7 +295,7 @@ func squareMont_CIOS_asm_adx*[N](
macro sumprodMont_CIOS_spare2bits_adx_gen[N, K: static int](
r_PIR: var Limbs[N], a_PIR, b_PIR: array[K, Limbs[N]],
M_PIR: Limbs[N], m0ninv_REG: BaseType,
M_MEM: Limbs[N], m0ninv_REG: BaseType,
skipFinalSub: static bool): untyped =
## Generate an optimized Montgomery merged sum of products ⅀aᵢ.bᵢ kernel
## using the CIOS method
@ -343,29 +325,23 @@ macro sumprodMont_CIOS_spare2bits_adx_gen[N, K: static int](
scratchSlots = 6
# We could force M as immediate by specializing per moduli
M = init(OperandArray, nimSymbol = M_PIR, N, PointerInReg, Input)
M = asmArray(M_MEM, N, MemOffsettable, asmInput)
# If N is too big, we need to spill registers. TODO.
t = init(OperandArray, nimSymbol = ident"t", N, ElemsInReg, Output_EarlyClobber)
tSym = ident"t"
t = asmArray(tSym, N, ElemsInReg, asmOutputEarlyClobber)
# MultiPurpose Register slots
scratch = init(OperandArray, nimSymbol = ident"scratch", scratchSlots, ElemsInReg, InputOutput_EnsureClobber)
scratchSym = ident"scratch"
scratch = asmArray(scratchSym, scratchSlots, ElemsInReg, asmInputOutputEarlyClobber)
# MULX requires RDX as well
m0ninv = Operand(
desc: OperandDesc(
asmId: "[m0ninv]",
nimSymbol: m0ninv_REG,
rm: MemOffsettable,
constraint: Input,
cEmit: "&" & $m0ninv_REG
)
)
m0ninv = asmValue(m0ninv_REG, Mem, asmInput)
# We're really constrained by register and somehow setting as memory doesn't help
# So we store the result `r` in the scratch space and then reload it in RDX
# before the scratchspace is used in final substraction
a = scratch[0].as2dArrayAddr(rows = K, cols = N) # Store the `a` operand
b = scratch[1].as2dArrayAddr(rows = K, cols = N) # Store the `b` operand
a = scratch[0].as2dArrayAddr(a_PIR, rows = K, cols = N, memIndirect = memRead) # Store the `a` operand
b = scratch[1].as2dArrayAddr(b_PIR, rows = K, cols = N, memIndirect = memRead) # Store the `b` operand
tN = scratch[2] # High part of extended precision multiplication
C = scratch[3] # Carry during reduction step
r = scratch[4] # Stores the `r` operand
@ -382,8 +358,6 @@ macro sumprodMont_CIOS_spare2bits_adx_gen[N, K: static int](
# but this prevent reusing the same code for multiple curves like BLS12-377 and BLS12-381
# We might be able to save registers by having `r` and `M` be memory operand as well
let tsym = t.nimSymbol
let scratchSym = scratch.nimSymbol
result.add quote do:
static: doAssert: sizeof(SecretWord) == sizeof(ByteAddress)
@ -461,11 +435,10 @@ macro sumprodMont_CIOS_spare2bits_adx_gen[N, K: static int](
ctx.partialRedx(
tN, t,
M, m0ninv,
rax, C
)
rax, C)
ctx.mov rax, r # move r away from scratchspace that will be used for final substraction
let r2 = rax.asArrayAddr(len = N)
let r2 = rax.asArrayAddr(r_PIR, len = N, memIndirect = memWrite)
if skipFinalSub:
ctx.comment " Copy result"
@ -473,10 +446,7 @@ macro sumprodMont_CIOS_spare2bits_adx_gen[N, K: static int](
ctx.mov r2[i], t[i]
else:
ctx.comment " Final substraction"
ctx.finalSubNoOverflowImpl(
r2, t, M,
scratch
)
ctx.finalSubNoOverflowImpl(r2, t, M, scratch)
result.add ctx.generate()
func sumprodMont_CIOS_spare2bits_asm_adx*[N, K: static int](

View File

@ -18,18 +18,13 @@ import
#
# ############################################################
# Note: We can refer to at most 30 registers in inline assembly
# and "InputOutput" registers count double
# They are nice to let the compiler deals with mov
# but too constraining so we move things ourselves.
static: doAssert UseASM_X86_64 # Need 8 registers just for mul
# and 32-bit only has 8 max.
# Multiplication
# -----------------------------------------------------------------------------------------------
macro mul_gen[rLen, aLen, bLen: static int](r: var Limbs[rLen], a: Limbs[aLen], b: Limbs[bLen]) =
macro mul_gen[rLen, aLen, bLen: static int](r_PIR: var Limbs[rLen], a_MEM: Limbs[aLen], b_MEM: Limbs[bLen]) =
## Comba multiplication generator
## `a`, `b`, `r` can have a different number of limbs
## if `r`.limbs.len < a.limbs.len + b.limbs.len
@ -42,54 +37,29 @@ macro mul_gen[rLen, aLen, bLen: static int](r: var Limbs[rLen], a: Limbs[aLen],
var ctx = init(Assembler_x86, BaseType)
let
arrR = init(OperandArray, nimSymbol = r, rLen, PointerInReg, InputOutput_EnsureClobber)
arrA = init(OperandArray, nimSymbol = a, aLen, PointerInReg, Input)
arrB = init(OperandArray, nimSymbol = b, bLen, PointerInReg, Input)
r = asmArray(r_PIR, rLen, PointerInReg, asmInputOutputEarlyClobber, memIndirect = memWrite) # MemOffsettable is the better constraint but compilers say it is impossible. Use early clobber to ensure it is not affected by constant propagation at slight pessimization (reloading it).
a = asmArray(a_MEM, aLen, MemOffsettable, asmInput)
b = asmArray(b_MEM, bLen, MemOffsettable, asmInput)
t = Operand(
desc: OperandDesc(
asmId: "[t]",
nimSymbol: ident"t",
rm: Reg,
constraint: Output_EarlyClobber,
cEmit: "t"
)
)
u = Operand(
desc: OperandDesc(
asmId: "[u]",
nimSymbol: ident"u",
rm: Reg,
constraint: Output_EarlyClobber,
cEmit: "u"
)
)
v = Operand(
desc: OperandDesc(
asmId: "[v]",
nimSymbol: ident"v",
rm: Reg,
constraint: Output_EarlyClobber,
cEmit: "v"
)
)
tSym = ident"t"
t = asmValue(tSym, Reg, asmOutputEarlyClobber)
uSym = ident"u"
u = asmValue(uSym, Reg, asmOutputEarlyClobber)
vSym = ident"v"
v = asmValue(vSym, Reg, asmOutputEarlyClobber)
# MUL requires RAX and RDX
# Prologue
let tsym = t.desc.nimSymbol
let usym = u.desc.nimSymbol
let vsym = v.desc.nimSymbol
result.add quote do:
var `tsym`{.noInit.}, `usym`{.noInit.}, `vsym`{.noInit.}: BaseType # zero-init
var `tSym`{.noInit.}, `uSym`{.noInit.}, `vSym`{.noInit.}: BaseType
# Algorithm
# Zero-init
ctx.`xor` u, u
ctx.`xor` v, v
ctx.`xor` t, t
# Algorithm
let stopEx = min(aLen+bLen, rLen)
for i in 0 ..< stopEx:
@ -100,13 +70,13 @@ macro mul_gen[rLen, aLen, bLen: static int](r: var Limbs[rLen], a: Limbs[aLen],
let ia = i - ib
for j in 0 ..< min(aLen - ia, ib+1):
# (t, u, v) <- (t, u, v) + a[ia+j] * b[ib-j]
ctx.mov rax, arrB[ib-j]
ctx.mul rdx, rax, arrA[ia+j], rax
ctx.mov rax, b[ib-j]
ctx.mul rdx, rax, a[ia+j], rax
ctx.add v, rax
ctx.adc u, rdx
ctx.adc t, 0
ctx.mov arrR[i], v
ctx.mov r[i], v
if i != stopEx - 1:
ctx.mov v, u
@ -116,10 +86,10 @@ macro mul_gen[rLen, aLen, bLen: static int](r: var Limbs[rLen], a: Limbs[aLen],
if aLen+bLen < rLen:
ctx.`xor` rax, rax
for i in aLen+bLen ..< rLen:
ctx.mov arrR[i], rax
ctx.mov r[i], rax
# Codegen
result.add ctx.generate
result.add ctx.generate()
func mul_asm*[rLen, aLen, bLen: static int](r: var Limbs[rLen], a: Limbs[aLen], b: Limbs[bLen]) =
## Multi-precision Multiplication
@ -129,7 +99,7 @@ func mul_asm*[rLen, aLen, bLen: static int](r: var Limbs[rLen], a: Limbs[aLen],
# Squaring
# -----------------------------------------------------------------------------------------------
macro sqr_gen*[rLen, aLen: static int](r: var Limbs[rLen], a: Limbs[aLen]) =
macro sqr_gen*[rLen, aLen: static int](r_PIR: var Limbs[rLen], a_MEM: Limbs[aLen]) =
## Comba squaring generator
## `a` and `r` can have a different number of limbs
## if `r`.limbs.len < a.limbs.len * 2
@ -142,51 +112,26 @@ macro sqr_gen*[rLen, aLen: static int](r: var Limbs[rLen], a: Limbs[aLen]) =
var ctx = init(Assembler_x86, BaseType)
let
arrR = init(OperandArray, nimSymbol = r, rLen, PointerInReg, InputOutput_EnsureClobber)
arrA = init(OperandArray, nimSymbol = a, aLen, PointerInReg, Input)
r = asmArray(r_PIR, rLen, PointerInReg, asmInputOutputEarlyClobber, memIndirect = memWrite) # MemOffsettable is the better constraint but compilers say it is impossible. Use early clobber to ensure it is not affected by constant propagation at slight pessimization (reloading it).
a = asmArray(a_MEM, aLen, MemOffsettable, asmInput)
t = Operand(
desc: OperandDesc(
asmId: "[t]",
nimSymbol: ident"t",
rm: Reg,
constraint: Output_EarlyClobber,
cEmit: "t"
)
)
u = Operand(
desc: OperandDesc(
asmId: "[u]",
nimSymbol: ident"u",
rm: Reg,
constraint: Output_EarlyClobber,
cEmit: "u"
)
)
v = Operand(
desc: OperandDesc(
asmId: "[v]",
nimSymbol: ident"v",
rm: Reg,
constraint: Output_EarlyClobber,
cEmit: "v"
)
)
tSym = ident"t"
t = asmValue(tSym, Reg, asmOutputEarlyClobber)
uSym = ident"u"
u = asmValue(uSym, Reg, asmOutputEarlyClobber)
vSym = ident"v"
v = asmValue(vSym, Reg, asmOutputEarlyClobber)
# Prologue
let tsym = t.desc.nimSymbol
let usym = u.desc.nimSymbol
let vsym = v.desc.nimSymbol
result.add quote do:
var `tsym`{.noInit.}, `usym`{.noInit.}, `vsym`{.noInit.}: BaseType # zero-init
var `tSym`{.noInit.}, `uSym`{.noInit.}, `vSym`{.noInit.}: BaseType
# Algorithm
# Zero-init
ctx.`xor` u, u
ctx.`xor` v, v
ctx.`xor` t, t
# Algorithm
let stopEx = min(aLen*2, rLen)
for i in 0 ..< stopEx:
@ -200,8 +145,8 @@ macro sqr_gen*[rLen, aLen: static int](r: var Limbs[rLen], a: Limbs[aLen]) =
let k2 = ib-j
if k1 < k2:
# (t, u, v) <- (t, u, v) + 2 * a[k1] * a[k2]
ctx.mov rax, arrA[k2]
ctx.mul rdx, rax, arrA[k1], rax
ctx.mov rax, a[k2]
ctx.mul rdx, rax, a[k1], rax
ctx.add rax, rax
ctx.adc rdx, rdx
ctx.adc t, 0
@ -210,15 +155,15 @@ macro sqr_gen*[rLen, aLen: static int](r: var Limbs[rLen], a: Limbs[aLen]) =
ctx.adc t, 0
elif k1 == k2:
# (t, u, v) <- (t, u, v) + a[k1] * a[k2]
ctx.mov rax, arrA[k2]
ctx.mul rdx, rax, arrA[k1], rax
ctx.mov rax, a[k2]
ctx.mul rdx, rax, a[k1], rax
ctx.add v, rax
ctx.adc u, rdx
ctx.adc t, 0
else:
discard
ctx.mov arrR[i], v
ctx.mov r[i], v
if i != stopEx - 1:
ctx.mov v, u
@ -228,10 +173,10 @@ macro sqr_gen*[rLen, aLen: static int](r: var Limbs[rLen], a: Limbs[aLen]) =
if aLen*2 < rLen:
ctx.`xor` rax, rax
for i in aLen*2 ..< rLen:
ctx.mov arrR[i], rax
ctx.mov r[i], rax
# Codegen
result.add ctx.generate
result.add ctx.generate()
func square_asm*[rLen, aLen: static int](r: var Limbs[rLen], a: Limbs[aLen]) =
## Multi-precision Squaring

View File

@ -18,11 +18,6 @@ import
#
# ############################################################
# Note: We can refer to at most 30 registers in inline assembly
# and "InputOutput" registers count double
# They are nice to let the compiler deals with mov
# but too constraining so we move things ourselves.
static: doAssert UseASM_X86_64
# MULX/ADCX/ADOX
@ -108,7 +103,7 @@ proc mulaccx_by_word(
ctx.adcx hi, rdx
ctx.adox hi, rdx
macro mulx_gen[rLen, aLen, bLen: static int](r_PIR: var Limbs[rLen], a_PIR: Limbs[aLen], b_PIR: Limbs[bLen]) =
macro mulx_gen[rLen, aLen, bLen: static int](r_PIR: var Limbs[rLen], a_MEM: Limbs[aLen], b_MEM: Limbs[bLen]) =
## `a`, `b`, `r` can have a different number of limbs
## if `r`.limbs.len < a.limbs.len + b.limbs.len
## The result will be truncated, i.e. it will be
@ -120,35 +115,33 @@ macro mulx_gen[rLen, aLen, bLen: static int](r_PIR: var Limbs[rLen], a_PIR: Limb
var ctx = init(Assembler_x86, BaseType)
let
r = init(OperandArray, nimSymbol = r_PIR, rLen, PointerInReg, InputOutput_EnsureClobber)
a = init(OperandArray, nimSymbol = a_PIR, aLen, PointerInReg, Input)
b = init(OperandArray, nimSymbol = b_PIR, bLen, PointerInReg, Input)
r = asmArray(r_PIR, rLen, PointerInReg, asmInputOutputEarlyClobber, memIndirect = memWrite) # MemOffsettable is the better constraint but compilers say it is impossible. Use early clobber to ensure it is not affected by constant propagation at slight pessimization (reloading it).
a = asmArray(a_MEM, aLen, MemOffsettable, asmInput)
b = asmArray(b_MEM, bLen, MemOffsettable, asmInput)
# MULX requires RDX
tSym = ident"t"
tSlots = aLen+1 # Extra for high word
var # If aLen is too big, we need to spill registers. TODO.
t = init(OperandArray, nimSymbol = ident"t", tSlots, ElemsInReg, Output_EarlyClobber)
t = asmArray(tSym, tSlots, ElemsInReg, asmOutputEarlyClobber)
# Prologue
let tsym = t.nimSymbol
result.add quote do:
var `tsym`{.noInit, used.}: array[`tSlots`, BaseType]
var `tSym`{.noInit, used.}: array[`tSlots`, BaseType]
for i in 0 ..< min(rLen, bLen):
if i == 0:
ctx.mulx_by_word(
r[0],
a, t,
b[0]
)
b[0])
else:
ctx.mulaccx_by_word(
r, i,
a, t,
b[i]
)
b[i])
t.rotateLeft()
@ -163,20 +156,13 @@ macro mulx_gen[rLen, aLen, bLen: static int](r_PIR: var Limbs[rLen], a_PIR: Limb
ctx.mov r[i], rax
# Codegen
result.add ctx.generate
func mul_asm_adx_inline*[rLen, aLen, bLen: static int](
r: var Limbs[rLen], a: Limbs[aLen], b: Limbs[bLen]) {.inline.} =
## Multi-precision Multiplication
## Assumes r doesn't alias a or b
## Inline version
mulx_gen(r, a, b)
result.add ctx.generate()
func mul_asm_adx*[rLen, aLen, bLen: static int](
r: var Limbs[rLen], a: Limbs[aLen], b: Limbs[bLen]) =
## Multi-precision Multiplication
## Assumes r doesn't alias a or b
mul_asm_adx_inline(r, a, b)
mulx_gen(r, a, b)
# Squaring
# -----------------------------------------------------------------------------------------------
@ -558,7 +544,7 @@ func sqrx_gen6L(ctx: var Assembler_x86, r, a: OperandArray, t: var OperandArray)
merge_diag_and_partsum(r, a, hi1, lo1, zero, 4)
merge_diag_and_partsum(r, a, hi2, lo2, zero, 5)
macro sqrx_gen*[rLen, aLen: static int](r_PIR: var Limbs[rLen], a_PIR: Limbs[aLen]) =
macro sqrx_gen*[rLen, aLen: static int](r_PIR: var Limbs[rLen], a_MEM: Limbs[aLen]) =
## Squaring
## `a` and `r` can have a different number of limbs
## if `r`.limbs.len < a.limbs.len * 2
@ -575,21 +561,20 @@ macro sqrx_gen*[rLen, aLen: static int](r_PIR: var Limbs[rLen], a_PIR: Limbs[aLe
# t = 2 * a.len = 12
# We use the full x86 register set.
r = init(OperandArray, nimSymbol = r_PIR, rLen, PointerInReg, InputOutput)
a = init(OperandArray, nimSymbol = a_PIR, aLen, PointerInReg, Input)
r = asmArray(r_PIR, rLen, PointerInReg, asmInputOutputEarlyClobber, memIndirect = memWrite) # MemOffsettable is the better constraint but compilers say it is impossible. Use early clobber to ensure it is not affected by constant propagation at slight pessimization (reloading it).
a = asmArray(a_MEM, aLen, MemOffsettable, asmInput)
# MULX requires RDX
tSym = ident"t"
tSlots = aLen+1 # Extra for high word
var # If aLen is too big, we need to spill registers. TODO.
t = init(OperandArray, nimSymbol = ident"t", tSlots, ElemsInReg, Output_EarlyClobber)
t = asmArray(tSym, tSlots, ElemsInReg, asmOutputEarlyClobber)
# Prologue
# -------------------------------
let tsym = t.nimSymbol
result.add quote do:
var `tsym`{.noInit, used.}: array[`tSlots`, BaseType]
var `tSym`{.noInit, used.}: array[`tSlots`, BaseType]
if aLen == 4:
ctx.sqrx_gen4L(r, a, t)
@ -599,7 +584,7 @@ macro sqrx_gen*[rLen, aLen: static int](r_PIR: var Limbs[rLen], a_PIR: Limbs[aLe
error: "Not implemented"
# Codegen
result.add ctx.generate
result.add ctx.generate()
func square_asm_adx*[rLen, aLen: static int](r: var Limbs[rLen], a: Limbs[aLen]) =
## Multi-precision Squaring

View File

@ -31,7 +31,7 @@ static: doAssert UseASM_X86_32
macro redc2xMont_gen*[N: static int](
r_PIR: var array[N, SecretWord],
a_PIR: array[N*2, SecretWord],
M_PIR: array[N, SecretWord],
M_MEM: array[N, SecretWord],
m0ninv_REG: BaseType,
spareBits: static int, skipFinalSub: static bool) =
# No register spilling handling
@ -46,28 +46,27 @@ macro redc2xMont_gen*[N: static int](
# so we store everything in scratchspaces restoring as needed
let
# We could force M as immediate by specializing per moduli
M = init(OperandArray, nimSymbol = M_PIR, N, PointerInReg, Input)
M = asmArray(M_MEM, N, MemOffsettable, asmInput)
# MUL requires RAX and RDX
let uSlots = N+2
let vSlots = max(N-2, 3)
let uSym = ident"u"
let vSym = ident"v"
var # Scratchspaces
u = init(OperandArray, nimSymbol = ident"U", uSlots, ElemsInReg, InputOutput_EnsureClobber)
v = init(OperandArray, nimSymbol = ident"V", vSlots, ElemsInReg, InputOutput_EnsureClobber)
u = asmArray(uSym, uSlots, ElemsInReg, asmInputOutputEarlyClobber)
v = asmArray(vSym, vSlots, ElemsInReg, asmInputOutputEarlyClobber)
# Prologue
let usym = u.nimSymbol
let vsym = v.nimSymbol
result.add quote do:
var `usym`{.noinit, used.}: Limbs[`uSlots`]
var `vsym` {.noInit.}: Limbs[`vSlots`]
`vsym`[0] = cast[SecretWord](`r_PIR`[0].unsafeAddr)
`vsym`[1] = cast[SecretWord](`a_PIR`[0].unsafeAddr)
`vsym`[2] = SecretWord(`m0ninv_REG`)
var `uSym`{.noinit, used.}: Limbs[`uSlots`]
var `vSym` {.noInit.}: Limbs[`vSlots`]
`vSym`[0] = cast[SecretWord](`r_PIR`[0].unsafeAddr)
`vSym`[1] = cast[SecretWord](`a_PIR`[0].unsafeAddr)
`vSym`[2] = SecretWord(`m0ninv_REG`)
let r_temp = v[0].asArrayAddr(len = N)
let a = v[1].asArrayAddr(len = 2*N)
let r_temp = v[0].asArrayAddr(r_PIR, len = N, memIndirect = memWrite)
let a = v[1].asArrayAddr(a_PIR, len = 2*N, memIndirect = memRead)
let m0ninv = v[2]
# Algorithm
@ -137,7 +136,7 @@ macro redc2xMont_gen*[N: static int](
if not(spareBits >= 2 and skipFinalSub):
ctx.mov rdx, r_temp
let r = rdx.asArrayAddr(len = N)
let r = rdx.asArrayAddr(r_PIR, len = N, memIndirect = memWrite)
# This does a[i+n] += hi
# but in a separate carry chain, fused with the
@ -157,7 +156,7 @@ macro redc2xMont_gen*[N: static int](
elif spareBits >= 1:
ctx.finalSubNoOverflowImpl(r, u, M, t)
else:
ctx.finalSubMayOverflowImpl(r, u, M, t, rax)
ctx.finalSubMayOverflowImpl(r, u, M, t)
# Code generation
result.add ctx.generate()
@ -168,9 +167,8 @@ func redcMont_asm*[N: static int](
M: array[N, SecretWord],
m0ninv: BaseType,
spareBits: static int,
skipFinalSub: static bool) {.noInline.} =
skipFinalSub: static bool) =
## Constant-time Montgomery reduction
# This MUST be noInline or Clang will run out of registers with LTO
static: doAssert UseASM_X86_64, "This requires x86-64."
redc2xMont_gen(r, a, M, m0ninv, spareBits, skipFinalSub)
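Illustrative call only: reducing a double-width value (2N limbs) modulo an N-limb M; sizes and names are illustrative and the m0ninv value is a placeholder:
var r {.noInit.}: Limbs[4]
var t2x: Limbs[8]                # e.g. the raw 8-limb product of two 4-limb operands
var M: Limbs[4]
let m0ninv = BaseType(0)         # placeholder; precomputed per modulus in the real code
redcMont_asm(r, t2x, M, m0ninv, spareBits = 1, skipFinalSub = false)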
@ -179,7 +177,7 @@ func redcMont_asm*[N: static int](
macro mulMont_by_1_gen[N: static int](
t_EIR: var array[N, SecretWord],
M_PIR: array[N, SecretWord],
M_MEM: array[N, SecretWord],
m0ninv_REG: BaseType) =
# No register spilling handling
@ -192,34 +190,22 @@ macro mulMont_by_1_gen[N: static int](
# RAX and RDX are defacto used due to the MUL instructions
# so we store everything in scratchspaces restoring as needed
let
scratchSlots = 2
t = init(OperandArray, nimSymbol = t_EIR, N, ElemsInReg, InputOutput_EnsureClobber)
t = asmArray(t_EIR, N, ElemsInReg, asmInputOutputEarlyClobber)
# We could force M as immediate by specializing per moduli
M = init(OperandArray, nimSymbol = M_PIR, N, PointerInReg, Input)
# MultiPurpose Register slots
scratch = init(OperandArray, nimSymbol = ident"scratch", scratchSlots, ElemsInReg, InputOutput_EnsureClobber)
M = asmArray(M_MEM, N, MemOffsettable, asmInput)
# MUL requires RAX and RDX
m0ninv = Operand(
desc: OperandDesc(
asmId: "[m0ninv]",
nimSymbol: m0ninv_REG,
rm: MemOffsettable,
constraint: Input,
cEmit: "&" & $m0ninv_REG
)
)
C = scratch[0] # Stores the high-part of muliplication
m = scratch[1] # Stores (t[0] * m0ninv) mod 2ʷ
let scratchSym = scratch.nimSymbol
m0ninv = asmValue(m0ninv_REG, Mem, asmInput)
Csym = ident"C"
C = asmValue(Csym, Reg, asmOutputEarlyClobber) # Stores the high-part of multiplication
mSym = ident"m"
m = asmValue(msym, Reg, asmOutputEarlyClobber) # Stores (t[0] * m0ninv) mod 2ʷ
# Copy a in t
result.add quote do:
var `scratchSym` {.noInit, used.}: Limbs[`scratchSlots`]
var `Csym` {.noInit, used.}: BaseType
var `mSym` {.noInit, used.}: BaseType
# Algorithm
# ---------------------------------------------------------

View File

@ -35,7 +35,7 @@ static: doAssert UseASM_X86_64
macro redc2xMont_adx_gen[N: static int](
r_PIR: var array[N, SecretWord],
a_PIR: array[N*2, SecretWord],
M_PIR: array[N, SecretWord],
M_MEM: array[N, SecretWord],
m0ninv_REG: BaseType,
spareBits: static int, skipFinalSub: static bool) =
@ -45,30 +45,28 @@ macro redc2xMont_adx_gen[N: static int](
result = newStmtList()
var ctx = init(Assembler_x86, BaseType)
let
# We could force M as immediate by specializing per moduli
M = init(OperandArray, nimSymbol = M_PIR, N, PointerInReg, Input)
let M = asmArray(M_MEM, N, MemOffsettable, asmInput)
let uSlots = N+1
let vSlots = max(N-1, 5)
let uSym = ident"u"
let vSym = ident"v"
var # Scratchspaces
u = init(OperandArray, nimSymbol = ident"U", uSlots, ElemsInReg, InputOutput_EnsureClobber)
v = init(OperandArray, nimSymbol = ident"V", vSlots, ElemsInReg, InputOutput_EnsureClobber)
u = asmArray(uSym, uSlots, ElemsInReg, asmInputOutputEarlyClobber)
v = asmArray(vSym, vSlots, ElemsInReg, asmInputOutputEarlyClobber)
# Prologue
let usym = u.nimSymbol
let vsym = v.nimSymbol
result.add quote do:
static: doAssert: sizeof(SecretWord) == sizeof(ByteAddress)
var `usym`{.noinit, used.}: Limbs[`uSlots`]
var `vsym` {.noInit.}: Limbs[`vSlots`]
`vsym`[0] = cast[SecretWord](`r_PIR`[0].unsafeAddr)
`vsym`[1] = cast[SecretWord](`a_PIR`[0].unsafeAddr)
`vsym`[2] = SecretWord(`m0ninv_REG`)
var `uSym`{.noinit, used.}: Limbs[`uSlots`]
var `vSym` {.noInit.}: Limbs[`vSlots`]
`vSym`[0] = cast[SecretWord](`r_PIR`[0].unsafeAddr)
`vSym`[1] = cast[SecretWord](`a_PIR`[0].unsafeAddr)
`vSym`[2] = SecretWord(`m0ninv_REG`)
let r_temp = v[0].asArrayAddr(len = N)
let a = v[1].asArrayAddr(len = 2*N)
let r_temp = v[0]
let a = v[1].asArrayAddr(a_PIR, len = 2*N, memIndirect = memRead)
let m0ninv = v[2]
let lo = v[3]
let hi = v[4]
@ -116,7 +114,7 @@ macro redc2xMont_adx_gen[N: static int](
u.rotateLeft()
ctx.mov rdx, r_temp
let r = rdx.asArrayAddr(len = N)
let r = rdx.asArrayAddr(r_PIR, len = N, memIndirect = memWrite)
# This does a[i+n] += hi
# but in a separate carry chain, fused with the
@ -135,7 +133,7 @@ macro redc2xMont_adx_gen[N: static int](
elif spareBits >= 1:
ctx.finalSubNoOverflowImpl(r, u, M, t)
else:
ctx.finalSubMayOverflowImpl(r, u, M, t, hi)
ctx.finalSubMayOverflowImpl(r, u, M, t)
# Code generation
result.add ctx.generate()
@ -146,7 +144,7 @@ func redcMont_asm_adx*[N: static int](
M: array[N, SecretWord],
m0ninv: BaseType,
spareBits: static int,
skipFinalSub: static bool = false) {.noInline.} =
skipFinalSub: static bool = false) =
## Constant-time Montgomery reduction
# Inlining redcMont_asm_adx twice in mul_fp2_complex_asm_adx
# causes GCC to miscompile with -Os (--opt:size)
@ -158,7 +156,7 @@ func redcMont_asm_adx*[N: static int](
macro mulMont_by_1_adx_gen[N: static int](
t_EIR: var array[N, SecretWord],
M_PIR: array[N, SecretWord],
M_MEM: array[N, SecretWord],
m0ninv_REG: BaseType) =
# No register spilling handling
@ -171,33 +169,20 @@ macro mulMont_by_1_adx_gen[N: static int](
# RAX and RDX are de facto used due to the MUL instructions
# so we store everything in scratchspaces restoring as needed
let
scratchSlots = 1
t = init(OperandArray, nimSymbol = t_EIR, N, ElemsInReg, InputOutput_EnsureClobber)
t = asmArray(t_EIR, N, ElemsInReg, asmInputOutputEarlyClobber)
# We could force M as an immediate by specializing per modulus
M = init(OperandArray, nimSymbol = M_PIR, N, PointerInReg, Input)
# MultiPurpose Register slots
scratch = init(OperandArray, nimSymbol = ident"scratch", scratchSlots, ElemsInReg, InputOutput_EnsureClobber)
M = asmArray(M_MEM, N, MemOffsettable, asmInput)
# MUL requires RAX and RDX
m0ninv = Operand(
desc: OperandDesc(
asmId: "[m0ninv]",
nimSymbol: m0ninv_REG,
rm: MemOffsettable,
constraint: Input,
cEmit: "&" & $m0ninv_REG
)
)
m0ninv = asmValue(m0ninv_REG, Mem, asmInput)
C = scratch[0] # Stores the high-part of multiplication
let scratchSym = scratch.nimSymbol
Csym = ident"C"
C = asmValue(Csym, Reg, asmOutputEarlyClobber) # Stores the high-part of multiplication
# Copy a in t
result.add quote do:
var `scratchSym` {.noInit, used.}: Limbs[`scratchSlots`]
var `Csym` {.noInit, used.}: BaseType
# Algorithm
# ---------------------------------------------------------

View File

@ -18,74 +18,45 @@ import
#
# ############################################################
# Note: We can refer to at most 30 registers in inline assembly
# and "InputOutput" registers count double
# They are nice for letting the compiler deal with the movs,
# but too constraining, so we move things ourselves.
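# (A "+r"/InputOutput operand is lowered by GCC and Clang to a matched
#  output/input pair, essentially "=r"(x) plus "0"(x), so it consumes two of
#  those ~30 operand slots, while a plain input "r"(x) and a separate
#  early-clobber output "=&r"(y) cost one slot each.)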
static: doAssert UseASM_X86_32
# Copy
# ------------------------------------------------------------
macro ccopy_gen[N: static int](a: var Limbs[N], b: Limbs[N], ctl: SecretBool): untyped =
macro ccopy_gen[N: static int](a_PIR: var Limbs[N], b_MEM: Limbs[N], ctl: SecretBool): untyped =
## Generate an optimized conditional copy kernel
result = newStmtList()
var ctx = init(Assembler_x86, BaseType)
let
arrA = init(OperandArray, nimSymbol = a, N, PointerInReg, InputOutput)
arrB = init(OperandArray, nimSymbol = b, N, PointerInReg, Input)
a = asmArray(a_PIR, N, PointerInReg, asmInputOutputEarlyClobber, memIndirect = memReadWrite) # MemOffsettable is the better constraint but compilers say it is impossible. Use early clobber to ensure it is not affected by constant propagation, at the cost of a slight pessimization (reloading it).
b = asmArray(b_MEM, N, MemOffsettable, asmInput)
control = Operand(
desc: OperandDesc(
asmId: "[ctl]",
nimSymbol: ctl,
rm: Reg,
constraint: Input,
cEmit: "ctl"
)
)
control = asmValue(ctl, Reg, asmInput)
t0Sym = ident"t0"
t1Sym = ident"t1"
var # Swappable registers to break dependency chains
t0 = Operand(
desc: OperandDesc(
asmId: "[t0]",
nimSymbol: ident"t0",
rm: Reg,
constraint: Output_EarlyClobber,
cEmit: "t0"
)
)
t1 = Operand(
desc: OperandDesc(
asmId: "[t1]",
nimSymbol: ident"t1",
rm: Reg,
constraint: Output_EarlyClobber,
cEmit: "t1"
)
)
t0 = asmValue(t0Sym, Reg, asmOutputEarlyClobber)
t1 = asmValue(t1Sym, Reg, asmOutputEarlyClobber)
# Prologue
let t0sym = t0.desc.nimSymbol
let t1sym = t1.desc.nimSymbol
result.add quote do:
var `t0sym`{.noinit.}, `t1sym`{.noinit.}: BaseType
# Algorithm
ctx.test control, control
for i in 0 ..< N:
ctx.mov t0, arrA[i]
ctx.cmovnz t0, arrB[i]
ctx.mov arrA[i], t0
ctx.mov t0, a[i]
ctx.cmovnz t0, b[i]
ctx.mov a[i], t0
swap(t0, t1)
# Codegen
result.add ctx.generate()
func ccopy_asm*(a: var Limbs, b: Limbs, ctl: SecretBool) {.inline.}=
func ccopy_asm*(a: var Limbs, b: Limbs, ctl: SecretBool) =
## Constant-time conditional copy
## If ctl is true: b is copied into a
## if ctl is false: b is not copied and a is untouched
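The test/cmovnz loop generated above is the assembly form of a branch-free masked select. A plain-Nim sketch of the same semantics (hypothetical helper, shown only to document the behaviour):

func ccopyReference(a: var openArray[uint64], b: openArray[uint64], ctl: bool) =
  ## a[i] ← b[i] when ctl is true, a left untouched otherwise,
  ## without any data-dependent branch
  let mask = 0'u64 - uint64(ord(ctl))   # all ones if ctl, all zeroes otherwise
  for i in 0 ..< a.len:
    a[i] = (a[i] and not mask) or (b[i] and mask)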
@ -95,121 +66,89 @@ func ccopy_asm*(a: var Limbs, b: Limbs, ctl: SecretBool) {.inline.}=
# Addition
# ------------------------------------------------------------
macro add_gen[N: static int](carry: var Carry, r: var Limbs[N], a, b: Limbs[N]): untyped =
macro add_gen[N: static int](carry: var Carry, r_PIR: var Limbs[N], a_MEM, b_MEM: Limbs[N]): untyped =
## Generate an optimized out-of-place addition kernel
result = newStmtList()
var ctx = init(Assembler_x86, BaseType)
let
arrR = init(OperandArray, nimSymbol = r, N, PointerInReg, InputOutput)
arrA = init(OperandArray, nimSymbol = a, N, PointerInReg, Input)
arrB = init(OperandArray, nimSymbol = b, N, PointerInReg, Input)
r = asmArray(r_PIR, N, PointerInReg, asmInputOutputEarlyClobber, memIndirect = memWrite) # MemOffsettable is the better constraint but compilers say it is impossible. Use early clobber to ensure it is not affected by constant propagation, at the cost of a slight pessimization (reloading it).
a = asmArray(a_MEM, N, MemOffsettable, asmInput)
b = asmArray(b_MEM, N, MemOffsettable, asmInput)
t0Sym = ident"t0"
t1Sym = ident"t1"
var # Swappable registers to break dependency chains
t0 = Operand(
desc: OperandDesc(
asmId: "[t0]",
nimSymbol: ident"t0",
rm: Reg,
constraint: Output_EarlyClobber,
cEmit: "t0"
)
)
t1 = Operand(
desc: OperandDesc(
asmId: "[t1]",
nimSymbol: ident"t1",
rm: Reg,
constraint: Output_EarlyClobber,
cEmit: "t1"
)
)
t0 = asmValue(t0Sym, Reg, asmOutputEarlyClobber)
t1 = asmValue(t1Sym, Reg, asmOutputEarlyClobber)
# Prologue
let t0sym = t0.desc.nimSymbol
let t1sym = t1.desc.nimSymbol
result.add quote do:
var `t0sym`{.noinit.}, `t1sym`{.noinit.}: BaseType
# Algorithm
ctx.mov t0, arrA[0] # Prologue
ctx.add t0, arrB[0]
ctx.mov t0, a[0] # Prologue
ctx.add t0, b[0]
for i in 1 ..< N:
ctx.mov t1, arrA[i] # Prepare the next iteration
ctx.mov arrR[i-1], t0 # Save the previous result in an interleaved manner
ctx.adc t1, arrB[i] # Compute
swap(t0, t1) # Break dependency chain
ctx.mov t1, a[i] # Prepare the next iteration
ctx.mov r[i-1], t0 # Save the previous result in an interleaved manner
ctx.adc t1, b[i] # Compute
swap(t0, t1) # Break dependency chain
ctx.mov arrR[N-1], t0 # Epilogue
ctx.mov r[N-1], t0 # Epilogue
ctx.setToCarryFlag(carry)
# Codegen
result.add ctx.generate
result.add ctx.generate()
func add_asm*(r: var Limbs, a, b: Limbs): Carry {.inline.}=
func add_asm*(r: var Limbs, a, b: Limbs): Carry =
## Constant-time addition
add_gen(result, r, a, b)
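For reference, the interleaved add/adc chain above computes plain limb-wise addition with carry propagation. A pure-Nim sketch of the semantics (hypothetical helper, shown for clarity, not as constant-time code):

func addReference(r: var openArray[uint64], a, b: openArray[uint64]): uint64 =
  ## Limb-wise addition; returns the final carry (0 or 1)
  var carry = 0'u64
  for i in 0 ..< a.len:
    let t = a[i] + carry
    let c1 = uint64(ord(t < carry))     # carry out of a[i] + carry
    r[i] = t + b[i]
    let c2 = uint64(ord(r[i] < t))      # carry out of t + b[i]
    carry = c1 or c2                    # both can never be set at once
  return carry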
# Subtraction
# ------------------------------------------------------------
macro sub_gen[N: static int](borrow: var Borrow, r: var Limbs[N], a, b: Limbs[N]): untyped =
macro sub_gen[N: static int](borrow: var Borrow, r_PIR: var Limbs[N], a_MEM, b_MEM: Limbs[N]): untyped =
## Generate an optimized out-of-place subtraction kernel
result = newStmtList()
var ctx = init(Assembler_x86, BaseType)
let
arrR = init(OperandArray, nimSymbol = r, N, PointerInReg, InputOutput)
arrA = init(OperandArray, nimSymbol = a, N, PointerInReg, Input)
arrB = init(OperandArray, nimSymbol = b, N, PointerInReg, Input)
r = asmArray(r_PIR, N, PointerInReg, asmInputOutputEarlyClobber, memIndirect = memWrite) # MemOffsettable is the better constraint but compilers say it is impossible. Use early clobber to ensure it is not affected by constant propagation, at the cost of a slight pessimization (reloading it).
a = asmArray(a_MEM, N, MemOffsettable, asmInput)
b = asmArray(b_MEM, N, MemOffsettable, asmInput)
t0Sym = ident"t0"
t1Sym = ident"t1"
var # Swappable registers to break dependency chains
t0 = Operand(
desc: OperandDesc(
asmId: "[t0]",
nimSymbol: ident"t0",
rm: Reg,
constraint: Output_EarlyClobber,
cEmit: "t0"
)
)
t1 = Operand(
desc: OperandDesc(
asmId: "[t1]",
nimSymbol: ident"t1",
rm: Reg,
constraint: Output_EarlyClobber,
cEmit: "t1"
)
)
t0 = asmValue(t0Sym, Reg, asmOutputEarlyClobber)
t1 = asmValue(t1Sym, Reg, asmOutputEarlyClobber)
# Prologue
let t0sym = t0.desc.nimSymbol
let t1sym = t1.desc.nimSymbol
result.add quote do:
var `t0sym`{.noinit.}, `t1sym`{.noinit.}: BaseType
# Algorithm
ctx.mov t0, arrA[0] # Prologue
ctx.sub t0, arrB[0]
ctx.mov t0, a[0] # Prologue
ctx.sub t0, b[0]
for i in 1 ..< N:
ctx.mov t1, arrA[i] # Prepare the next iteration
ctx.mov arrR[i-1], t0 # Save the previous result in an interleaved manner
ctx.sbb t1, arrB[i] # Compute
swap(t0, t1) # Break dependency chain
ctx.mov t1, a[i] # Prepare the next iteration
ctx.mov r[i-1], t0 # Save the previous result in an interleaved manner
ctx.sbb t1, b[i] # Compute
swap(t0, t1) # Break dependency chain
ctx.mov arrR[N-1], t0 # Epilogue
ctx.mov r[N-1], t0 # Epilogue
ctx.setToCarryFlag(borrow)
# Codegen
result.add ctx.generate
result.add ctx.generate()
func sub_asm*(r: var Limbs, a, b: Limbs): Borrow {.inline.}=
func sub_asm*(r: var Limbs, a, b: Limbs): Borrow =
## Constant-time subtraction
sub_gen(result, r, a, b)

View File

@ -152,7 +152,7 @@ func setMinusOne*(a: var FF) =
func neg*(r: var FF, a: FF) {.meter.} =
## Negate modulo p
when UseASM_X86_64:
when UseASM_X86_64 and a.mres.limbs.len <= 6: # TODO: handle spilling
negmod_asm(r.mres.limbs, a.mres.limbs, FF.fieldMod().limbs)
else:
# If a = 0 we need r = 0 and not r = M

View File

@ -118,7 +118,7 @@ func sum2xMod*(r: var FpDbl, a, b: FpDbl) =
## Output is conditionally reduced by 2ⁿp
## to stay in the [0, 2ⁿp) range
when UseASM_X86_64:
addmod2x_asm(r.limbs2x, a.limbs2x, b.limbs2x, FpDbl.C.Mod.limbs)
addmod2x_asm(r.limbs2x, a.limbs2x, b.limbs2x, FpDbl.C.Mod.limbs, Fp[FpDbl.C].getSpareBits())
else:
# Addition step
var overflowed = SecretBool r.limbs2x.sum(a.limbs2x, b.limbs2x)

View File

@ -543,10 +543,8 @@ func sumprodMont*[N: static int](
r: var Limbs, a, b: array[N, Limbs],
M: Limbs, m0ninv: BaseType,
spareBits: static int,
skipFinalSub: static bool = false) {.noInline.} =
skipFinalSub: static bool = false) =
## Compute r <- ⅀aᵢ.bᵢ (mod M) (sum of products)
# This function must be noInline or GCC miscompiles
# with LTO, see https://github.com/mratsim/constantine/issues/230
when spareBits >= 2:
when UseASM_X86_64 and r.len in {2 .. 6}:
if ({.noSideEffect.}: hasAdx()):

View File

@ -139,5 +139,5 @@ macro debugConsts(): untyped {.used.} =
result.add quote do:
echo "----------------------------------------------------------------------------"
# debug: # displayed with -d:debugConstantine
# debug: # displayed with -d:CttDebug
# debugConsts()

View File

@ -62,9 +62,9 @@ func sqrx2x_complex_asm_adx*(
t0.double(a.c1)
t1.sum(a.c0, a.c1)
r.c1.limbs2x.mul_asm_adx_inline(t0.mres.limbs, a.c0.mres.limbs)
r.c1.limbs2x.mul_asm_adx(t0.mres.limbs, a.c0.mres.limbs)
t0.diff(a.c0, a.c1)
r.c0.limbs2x.mul_asm_adx_inline(t0.mres.limbs, t1.mres.limbs)
r.c0.limbs2x.mul_asm_adx(t0.mres.limbs, t1.mres.limbs)
func sqrx_complex_sparebit_asm_adx*(
r: var array[2, Fp],
@ -94,15 +94,15 @@ func mul2x_fp2_complex_asm_adx*(
var D {.noInit.}: typeof(r.c0)
var t0 {.noInit.}, t1 {.noInit.}: typeof(a.c0)
r.c0.limbs2x.mul_asm_adx_inline(a.c0.mres.limbs, b.c0.mres.limbs)
D.limbs2x.mul_asm_adx_inline(a.c1.mres.limbs, b.c1.mres.limbs)
when Fp.has1extraBit():
t0.sumUnr(a.c0, a.c1)
t1.sumUnr(b.c0, b.c1)
else:
t0.sum(a.c0, a.c1)
t1.sum(b.c0, b.c1)
r.c1.limbs2x.mul_asm_adx_inline(t0.mres.limbs, t1.mres.limbs)
r.c0.limbs2x.mul_asm_adx(a.c0.mres.limbs, b.c0.mres.limbs)
D.limbs2x.mul_asm_adx(a.c1.mres.limbs, b.c1.mres.limbs)
r.c1.limbs2x.mul_asm_adx(t0.mres.limbs, t1.mres.limbs)
when Fp.has1extraBit():
r.c1.diff2xUnr(r.c1, r.c0)
r.c1.diff2xUnr(r.c1, D)

View File

@ -856,14 +856,16 @@ func prod2x_complex(r: var QuadraticExt2x, a, b: Fp2) =
var D {.noInit.}: typeof(r.c0)
var t0 {.noInit.}, t1 {.noInit.}: typeof(a.c0)
r.c0.prod2x(a.c0, b.c0) # r0 = a0 b0
D.prod2x(a.c1, b.c1) # d = a1 b1
when Fp2.has1extraBit():
t0.sumUnr(a.c0, a.c1)
t1.sumUnr(b.c0, b.c1)
else:
t0.sum(a.c0, a.c1)
t1.sum(b.c0, b.c1)
r.c0.prod2x(a.c0, b.c0) # r0 = a0 b0
D.prod2x(a.c1, b.c1) # d = a1 b1
r.c1.prod2x(t0, t1) # r1 = (b0 + b1)(a0 + a1)
when Fp2.has1extraBit():
r.c1.diff2xUnr(r.c1, r.c0) # r1 = (b0 + b1)(a0 + a1) - a0 b0
@ -1052,9 +1054,6 @@ func prod2x_disjoint*[Fdbl, F](
var V0 {.noInit.}, V1 {.noInit.}: typeof(r.c0) # Double-precision
var t0 {.noInit.}, t1 {.noInit.}: typeof(a0) # Single-width
# Require 2 extra bits
V0.prod2x(a0, b0) # v0 = a0b0
V1.prod2x(a1, b1) # v1 = a1b1
when F.has1extraBit():
t0.sumUnr(a0, a1)
t1.sumUnr(b0, b1)
@ -1062,6 +1061,9 @@ func prod2x_disjoint*[Fdbl, F](
t0.sum(a0, a1)
t1.sum(b0, b1)
V0.prod2x(a0, b0) # v0 = a0b0
V1.prod2x(a1, b1) # v1 = a1b1
r.c1.prod2x(t0, t1) # r1 = (a0 + a1)(b0 + b1)
r.c1.diff2xMod(r.c1, V0) # r1 = (a0 + a1)(b0 + b1) - a0b0
r.c1.diff2xMod(r.c1, V1) # r1 = (a0 + a1)(b0 + b1) - a0b0 - a1b1
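# For context, prod2x_complex and prod2x_disjoint both use the classic
# 3-multiplication trick:
#   r1 = (a0 + a1)(b0 + b1) - a0·b0 - a1·b1
# with r0 assembled from the two products a0·b0 and a1·b1. Hoisting the
# single-width sums above the double-width products only reorders independent
# operations, so the computed result is unchanged.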

View File

@ -41,8 +41,7 @@ export BigInt, wordsRequired
func unmarshalLE[T](
dst: var openArray[T],
src: openarray[byte],
wordBitWidth: static int
) =
wordBitWidth: static int) =
## Parse an unsigned integer from its canonical
## little-endian unsigned representation
## and store it into a BigInt
@ -85,8 +84,7 @@ func unmarshalLE[T](
func unmarshalBE[T](
dst: var openArray[T],
src: openarray[byte],
wordBitWidth: static int
) =
wordBitWidth: static int) =
## Parse an unsigned integer from its canonical
## big-endian unsigned representation (octet string)
## and store it into a BigInt.

View File

@ -17,7 +17,7 @@ import ../../metering/tracer
export primitives, tracer
when sizeof(int) == 8 and not defined(Constantine32):
when sizeof(int) == 8 and not defined(Ctt32):
type
BaseType* = uint64
## Physical BigInt for conversion in "normal integers"
@ -67,7 +67,7 @@ type VarTime* = object
type SignedSecretWord* = distinct SecretWord
when sizeof(int) == 8 and not defined(Constantine32):
when sizeof(int) == 8 and not defined(Ctt32):
type
SignedBaseType* = int64
else:

View File

@ -49,28 +49,16 @@ template mux_x86_impl() {.dirty.} =
static: doAssert(X86)
static: doAssert(GCC_Compatible)
when sizeof(T) == 8:
var muxed = x
asm """
testq %[ctl], %[ctl]
cmovzq %[y], %[muxed]
: [muxed] "+r" (`muxed`)
: [ctl] "r" (`ctl`), [y] "r" (`y`)
: "cc"
"""
muxed
elif sizeof(T) == 4:
var muxed = x
asm """
testl %[ctl], %[ctl]
cmovzl %[y], %[muxed]
: [muxed] "+r" (`muxed`)
: [ctl] "r" (`ctl`), [y] "r" (`y`)
: "cc"
"""
muxed
else:
{.error: "Unsupported word size".}
var muxed = x
asm """
test %[ctl], %[ctl]
cmovz %[muxed], %[y]
: [muxed] "+r" (`muxed`)
: [ctl] "r" (`ctl`), [y] "r" (`y`)
: "cc"
"""
muxed
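The test/cmovz pair above is a branch-free select. An equivalent plain-Nim sketch (hypothetical helper, assuming ctl holds exactly 0 or 1):

func muxReference[T: SomeUnsignedInt](ctl, x, y: T): T =
  ## Returns x when ctl == 1, y when ctl == 0, with no data-dependent branch
  let mask = T(0) - ctl        # all ones when ctl == 1, zero when ctl == 0
  result = (x and mask) or (y and not mask)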
func mux_x86[T](ctl: CTBool[T], x, y: T): T {.inline.}=
## Multiplexer / selector
@ -92,42 +80,23 @@ func ccopy_x86[T](ctl: CTBool[T], x: var T, y: T) {.inline.}=
static: doAssert(X86)
static: doAssert(GCC_Compatible)
when sizeof(T) == 8:
when defined(cpp):
asm """
testq %[ctl], %[ctl]
cmovnzq %[y], %[x]
when defined(cpp):
asm """
test %[ctl], %[ctl]
cmovnz %[x], %[y]
: [x] "+r" (`x`)
: [ctl] "r" (`ctl`), [y] "r" (`y`)
: "cc"
"""
else:
asm """
testq %[ctl], %[ctl]
cmovnzq %[y], %[x]
: [x] "+r" (`*x`)
: [ctl] "r" (`ctl`), [y] "r" (`y`)
: "cc"
"""
elif sizeof(T) == 4:
when defined(cpp):
asm """
testl %[ctl], %[ctl]
cmovnzl %[y], %[x]
: [x] "+r" (`x`)
: [ctl] "r" (`ctl`), [y] "r" (`y`)
: "cc"
"""
else:
asm """
testl %[ctl], %[ctl]
cmovnzl %[y], %[x]
: [x] "+r" (`*x`)
: [ctl] "r" (`ctl`), [y] "r" (`y`)
: "cc"
"""
else:
{.error: "Unsupported word size".}
asm """
test %[ctl], %[ctl]
cmovnz %[x], %[y]
: [x] "+r" (`*x`)
: [ctl] "r" (`ctl`), [y] "r" (`y`)
: "cc"
"""
# Public functions
# ------------------------------------------------------------

View File

@ -44,7 +44,7 @@ macro replacePragmasByInline(procAst: typed): untyped =
result = newStmtList()
# The push cdecl is applied multiple times :/, so fight push with push
# The push noconv is applied multiple times :/, so fight push with push
result.add nnkPragma.newTree(ident"push", ident"nimcall", ident"inline")
result.add newProc(
@ -61,7 +61,7 @@ macro wrapOpenArrayLenType*(ty: typedesc, procAst: untyped): untyped =
## Wraps pointer+len library calls in properly typed and converted openArray calls
##
## ```
## {.push cdecl.}
## {.push noconv.}
## proc foo*(r: int, a: openArray[CustomType], b: int) {.wrapOpenArrayLenType: uint32, importc: "foo", dynlib: "libfoo.so".}
## {.pop.}
## ```
@ -69,7 +69,7 @@ macro wrapOpenArrayLenType*(ty: typedesc, procAst: untyped): untyped =
## is transformed into
##
## ```
## proc foo(r: int, a: ptr CustomType, aLen: uint32, b: int) {.cdecl, importc: "foo", dynlib: "libfoo.so".}
## proc foo(r: int, a: ptr CustomType, aLen: uint32, b: int) {.noconv, importc: "foo", dynlib: "libfoo.so".}
##
## proc foo*(r: int, a: openArray[CustomType], b: int) {.inline.} =
## foo(r, a[0].unsafeAddr, a.len.uint32, b)
@ -140,7 +140,7 @@ macro wrapOpenArrayLenType*(ty: typedesc, procAst: untyped): untyped =
when isMainModule:
expandMacros:
{.push cdecl.}
{.push noconv.}
proc foo(x: int, a: openArray[uint32], name: cstring) {.wrapOpenArrayLenType: cuint.} =
discard

View File

@ -26,7 +26,7 @@ static: echo "[Constantine] Using library " & libLLVM
# also link to libLLVM, for example if they implement a virtual machine (for the EVM, for Snarks/zero-knowledge, ...).
# Hence Constantine should always use LLVM context to "namespace" its own codegen and avoid collisions in the global context.
{.push cdecl, dynlib: libLLVM.}
{.push noconv, dynlib: libLLVM.}
# ############################################################
#
@ -571,4 +571,4 @@ proc memset*(builder: BuilderRef, `ptr`, val, len: ValueRef, align: uint32) {.im
proc memcpy*(builder: BuilderRef, dst: ValueRef, dstAlign: uint32, src: ValueRef, srcAlign: uint32, size: ValueRef) {.importc: "LLVMBuildMemcpy".}
proc memmove*(builder: BuilderRef, dst: ValueRef, dstAlign: uint32, src: ValueRef, srcAlign: uint32, size: ValueRef) {.importc: "LLVMBuildMemmove".}
{.pop.} # {.used, hint[Name]: off, cdecl, dynlib: libLLVM.}
{.pop.} # {.used, hint[Name]: off, noconv, dynlib: libLLVM.}

View File

@ -482,7 +482,7 @@ type
CUstream* = distinct pointer
CUdeviceptr* = distinct pointer
{.push cdecl, importc, dynlib: "libcuda.so".}
{.push noconv, importc, dynlib: "libcuda.so".}
proc cuInit*(flags: uint32): CUresult
@ -515,4 +515,4 @@ proc cuMemFree*(devptr: CUdeviceptr): CUresult
proc cuMemcpyHtoD*(dst: CUdeviceptr, src: pointer, size: csize_t): CUresult
proc cuMemcpyDtoH*(dst: pointer, src: CUdeviceptr, size: csize_t): CUresult
{.pop.} # {.push cdecl, importc, dynlib: "libcuda.so".}
{.pop.} # {.push noconv, importc, dynlib: "libcuda.so".}

View File

@ -4,7 +4,7 @@ proc cpuidX86(eaxi, ecxi: int32): tuple[eax, ebx, ecx, edx: int32] {.used.}=
when defined(vcc):
# limited inline asm support in vcc, so intrinsics, here we go:
proc cpuidVcc(cpuInfo: ptr int32; functionID, subFunctionID: int32)
{.cdecl, importc: "__cpuidex", header: "intrin.h".}
{.noconv, importc: "__cpuidex", header: "intrin.h".}
cpuidVcc(addr result.eax, eaxi, ecxi)
else:
var (eaxr, ebxr, ecxr, edxr) = (0'i32, 0'i32, 0'i32, 0'i32)

File diff suppressed because it is too large.

View File

@ -53,7 +53,7 @@ when X86 and GCC_Compatible:
# ############################################################
template debug*(body: untyped): untyped =
when defined(debugConstantine):
when defined(CttDebug):
body
proc builtin_unreachable(){.nodecl, importc: "__builtin_unreachable".}

View File

@ -34,7 +34,7 @@ import std/macros
# --------------------------------------------------------
# Everything should be a template that doesn't produce any code
# when debugConstantine is not defined.
# when CttDebug is not defined.
# Those checks are controlled by a custom flag instead of
# "--boundsChecks" or "--nilChecks" to decouple them from user code checks.
# Furthermore, we want them to be very lightweight on performance

View File

@ -76,9 +76,9 @@ const ULF_WAKE_MASK = ULF_NO_ERRNO or
ULF_WAKE_THREAD or
ULF_WAKE_ALLOW_NON_OWNER
proc ulock_wait(operation: uint32, address: pointer, expected: uint64, timeout: uint32): cint {.importc:"__ulock_wait", cdecl.}
proc ulock_wait2(operation: uint32, address: pointer, expected: uint64, timeout, value2: uint64): cint {.importc:"__ulock_wait2", cdecl.}
proc ulock_wake(operation: uint32, address: pointer, wake_value: uint64): cint {.importc:"__ulock_wake", cdecl.}
proc ulock_wait(operation: uint32, address: pointer, expected: uint64, timeout: uint32): cint {.importc:"__ulock_wait", noconv.}
proc ulock_wait2(operation: uint32, address: pointer, expected: uint64, timeout, value2: uint64): cint {.importc:"__ulock_wait2", noconv.}
proc ulock_wake(operation: uint32, address: pointer, wake_value: uint64): cint {.importc:"__ulock_wake", noconv.}
# Futex API
# ------------------------------------------------------------------------

View File

@ -150,7 +150,7 @@ macro genCharAPI*(procAst: untyped): untyped =
wrapperBody.add ident($procAst.params[i][j])
var pragmas = nnkPragma.newTree(ident"inline")
let skipPragmas = ["inline", "noinline", "noInline", "exportc", "exportcpp", "extern", "cdecl", "stdcall", "dynlib", "libPrefix"]
let skipPragmas = ["inline", "noinline", "noInline", "exportc", "exportcpp", "extern", "noconv", "cdecl", "stdcall", "dynlib", "libPrefix"]
for i in 0 ..< procAst.pragma.len:
if procAst.pragma[i].kind == nnkIdent:
if $procAst.pragma[i] notin skipPragmas:

View File

@ -15,7 +15,7 @@
# that internally uses `sha256.hash`,
# the ideal outcome is for `sha256.hash to be exported as `ctt_sha256_hash` and
# have `hash_to_curve` directly use that.
# 3. Furthermore, when compiling Nim only, no export markers (cdecl, dynlib, exportc) are used.
# 3. Furthermore, when compiling Nim only, no export markers (noconv, dynlib, exportc) are used.
#
# Each prefix must be modified before importing the module to export
@ -37,7 +37,7 @@ macro libPrefix*(prefix: static string, procAst: untyped): untyped =
if pragmas.kind == nnkEmpty:
pragmas = nnkPragma.newTree()
pragmas.add ident"cdecl"
pragmas.add ident"noconv"
pragmas.add nnkExprColonExpr.newTree(
ident"exportc",
newLit(prefix & "$1"))

Binary file not shown (new image, 459 KiB).

Binary file not shown (new image, 464 KiB).

Binary file not shown (new image, 334 KiB).

Binary file not shown (new image, 150 KiB).

Binary file not shown (new image, 91 KiB).

Binary file not shown (new image, 598 KiB).

View File

@ -6,60 +6,42 @@
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
# at your option. This file may not be copied, modified, or distributed except according to those terms.
import
std/[macros, times, monotimes],
../benchmarks/platforms
# ############################################################
#
# Trace operations
#
# ############################################################
# Utils
# --------------------------------------------------
const someGcc = defined(gcc) or defined(llvm_gcc) or defined(clang) or defined(icc)
const hasThreadSupport = defined(threads)
proc atomicInc*(memLoc: var int64, x = 1'i64): int64 =
when someGcc and hasThreadSupport:
result = atomicAddFetch(memLoc.addr, x, ATOMIC_RELAXED)
elif defined(vcc) and hasThreadSupport:
result = addAndFetch(memLoc.addr, x)
result += x
else:
memloc += x
result = memLoc
# Types
# --------------------------------------------------
type
Metadata* = object
procName*: string
module: string
package: string
tag: string # Can be changed to multi-tags later
numCalls*: int64
cumulatedTimeNs*: int64 # in microseconds
when SupportsGetTicks:
cumulatedCycles*: int64
template mtag(tagname: string){.pragma, used.}
## This will allow tagging proc in the future with
## "Fp", "ec", "polynomial"
const CttMeter {.booldefine.} = off
const CttTrace {.booldefine.} = off # For manual "debug-echo"-style timing.
var ctMetrics{.compileTime.}: seq[Metadata]
## Metrics are collected here, this is just a temporary holder of compileTime values
## Unfortunately the "seq" is emptied when passing the compileTime/runtime boundaries
## due to Nim bugs
when CttMeter or CttTrace:
import ../benchmarks/platforms
type
Metadata* = object
procName*: string
module: string
package: string
tag: string # Can be changed to multi-tags later
numCalls*: int64
cumulatedTimeNs*: int64 # in microseconds
when SupportsGetTicks:
cumulatedCycles*: int64
var ctMetrics{.compileTime.}: seq[Metadata]
## Metrics are collected here, this is just a temporary holder of compileTime values
## Unfortunately the "seq" is emptied when passing the compileTime/runtime boundaries
## due to Nim bugs
# strformat doesn't work in templates.
from strutils import alignLeft, formatFloat
import std/[macros, times, monotimes]
var Metrics*: seq[Metadata]
## We can't directly use it at compileTime because it doesn't exist.
@ -69,80 +51,96 @@ when CttMeter or CttTrace:
proc resetMetering*() =
Metrics = static(ctMetrics)
# Symbols
# --------------------------------------------------
template fnEntry(name: string, id: int, startTime, startCycle: untyped): untyped =
## Bench tracing to insert on function entry
{.noSideEffect, gcsafe.}:
discard Metrics[id].numCalls.atomicInc()
let startTime = getMonoTime()
when SupportsGetTicks:
let startCycle = getTicks()
# Utils
# --------------------------------------------------
const someGcc = defined(gcc) or defined(llvm_gcc) or defined(clang) or defined(icc)
const hasThreadSupport = defined(threads)
proc atomicInc*(memLoc: var int64, x = 1'i64): int64 =
when someGcc and hasThreadSupport:
result = atomicAddFetch(memLoc.addr, x, ATOMIC_RELAXED)
elif defined(vcc) and hasThreadSupport:
result = addAndFetch(memLoc.addr, x)
result += x
else:
let startCycle = 0
memloc += x
result = memLoc
template fnExit(name: string, id: int, startTime, startCycle: untyped): untyped =
## Bench tracing to insert before each function exit
{.noSideEffect, gcsafe.}:
when SupportsGetTicks:
let stopCycle = getTicks()
let stopTime = getMonoTime()
when SupportsGetTicks:
let elapsedCycles = stopCycle - startCycle
let elapsedTime = inMicroseconds(stopTime - startTime)
# Symbols
# --------------------------------------------------
discard Metrics[id].cumulatedTimeNs.atomicInc(elapsedTime)
when SupportsGetTicks:
discard Metrics[id].cumulatedCycles.atomicInc(elapsedCycles)
when CttTrace:
# Advice: Use "when name == relevantProc" to isolate specific procedures.
# strformat doesn't work in templates.
template fnEntry(name: string, id: int, startTime, startCycle: untyped): untyped =
## Bench tracing to insert on function entry
{.noSideEffect, gcsafe.}:
discard Metrics[id].numCalls.atomicInc()
let startTime = getMonoTime()
when SupportsGetTicks:
echo static(alignLeft(name, 50)),
"Time (µs): ", alignLeft(formatFloat(elapsedTime.float64 * 1e-3, precision=3), 10),
"Cycles (billions): ", formatFloat(elapsedCycles.float64 * 1e-9, precision=3)
let startCycle = getTicks()
else:
echo static(alignLeft(name, 50)),
"Time (µs): ", alignLeft(formatFloat(elapsedTime.float64 * 1e-3, precision=3), 10)
let startCycle = 0
macro meterAnnotate(procAst: untyped): untyped =
procAst.expectKind({nnkProcDef, nnkFuncDef})
template fnExit(name: string, id: int, startTime, startCycle: untyped): untyped =
## Bench tracing to insert before each function exit
{.noSideEffect, gcsafe.}:
when SupportsGetTicks:
let stopCycle = getTicks()
let stopTime = getMonoTime()
when SupportsGetTicks:
let elapsedCycles = stopCycle - startCycle
let elapsedTime = inMicroseconds(stopTime - startTime)
let id = ctMetrics.len
let name = procAst[0].repr & procAst[3].repr
# TODO, get the module and the package the proc is coming from
# and the tag "Fp", "ec", "polynomial" ...
discard Metrics[id].cumulatedTimeNs.atomicInc(elapsedTime)
when SupportsGetTicks:
discard Metrics[id].cumulatedCycles.atomicInc(elapsedCycles)
ctMetrics.add Metadata(procName: name)
var newBody = newStmtList()
let startTime = genSym(nskLet, "metering_" & name & "_startTime_")
let startCycle = genSym(nskLet, "metering_" & name & "_startCycles_")
newBody.add getAst(fnEntry(name, id, startTime, startCycle))
newbody.add nnkDefer.newTree(getAst(fnExit(name, id, startTime, startCycle)))
newBody.add procAst.body
if procAst[4].kind != nnkEmpty:
# Timing procedures add the TimeEffect tag, which interferes with {.tags:[VarTime].}
# as TimeEffect is not listed. We drop the `tags` for metering
var pragmas: NimNode
if procAst[4].len == 1:
if procAst[4][0].kind == nnkExprColonExpr and procAst[4][0][0].eqIdent"tags":
pragmas = newEmptyNode()
else:
pragmas = procAst[4]
else:
pragmas = nnkPragma.newTree()
for i in 0 ..< procAst[4].len:
if procAst[4][0].kind == nnkExprColonExpr and procAst[4][0][0].eqIdent"tags":
continue
when CttTrace:
# Advice: Use "when name == relevantProc" to isolate specific procedures.
# strformat doesn't work in templates.
when SupportsGetTicks:
echo static(alignLeft(name, 50)),
"Time (µs): ", alignLeft(formatFloat(elapsedTime.float64 * 1e-3, precision=3), 10),
"Cycles (billions): ", formatFloat(elapsedCycles.float64 * 1e-9, precision=3)
else:
pragmas.add procAst[4][0]
procAst[4] = pragmas
echo static(alignLeft(name, 50)),
"Time (µs): ", alignLeft(formatFloat(elapsedTime.float64 * 1e-3, precision=3), 10)
procAst.body = newBody
result = procAst
macro meterAnnotate(procAst: untyped): untyped =
procAst.expectKind({nnkProcDef, nnkFuncDef})
let id = ctMetrics.len
let name = procAst[0].repr & procAst[3].repr
# TODO, get the module and the package the proc is coming from
# and the tag "Fp", "ec", "polynomial" ...
ctMetrics.add Metadata(procName: name)
var newBody = newStmtList()
let startTime = genSym(nskLet, "metering_" & name & "_startTime_")
let startCycle = genSym(nskLet, "metering_" & name & "_startCycles_")
newBody.add getAst(fnEntry(name, id, startTime, startCycle))
newbody.add nnkDefer.newTree(getAst(fnExit(name, id, startTime, startCycle)))
newBody.add procAst.body
if procAst[4].kind != nnkEmpty:
# Timing procedures add the TimeEffect tag, which interferes with {.tags:[VarTime].}
# as TimeEffect is not listed. We drop the `tags` for metering
var pragmas: NimNode
if procAst[4].len == 1:
if procAst[4][0].kind == nnkExprColonExpr and procAst[4][0][0].eqIdent"tags":
pragmas = newEmptyNode()
else:
pragmas = procAst[4]
else:
pragmas = nnkPragma.newTree()
for i in 0 ..< procAst[4].len:
if procAst[4][0].kind == nnkExprColonExpr and procAst[4][0][0].eqIdent"tags":
continue
else:
pragmas.add procAst[4][0]
procAst[4] = pragmas
procAst.body = newBody
result = procAst
template meter*(procBody: untyped): untyped =
when CttMeter or CttTrace:
@ -157,14 +155,15 @@ when isMainModule:
static: doAssert CttMeter or CttTrace, "CttMeter or CttTrace must be on for tracing"
expandMacros:
proc foo(x: int): int{.meter.} =
echo "Hey hey hey"
result = x
when CttMeter or CttTrace: # Avoid warnings from nim check or nimsuggest
expandMacros:
proc foo(x: int): int{.meter.} =
echo "Hey hey hey"
result = x
resetMetering()
resetMetering()
echo Metrics
discard foo(10)
echo Metrics
doAssert Metrics[0].numCalls == 1
echo Metrics
discard foo(10)
echo Metrics
doAssert Metrics[0].numCalls == 1

View File

@ -52,7 +52,7 @@ type
NvvmProgram = distinct pointer
{.push cdecl, importc, dynlib: "libnvvm.so".}
{.push noconv, importc, dynlib: "libnvvm.so".}
proc nvvmGetErrorString*(r: NvvmResult): cstring
proc nvvmVersion*(major, minor: var int32): NvvmResult
@ -69,7 +69,7 @@ proc nvvmGetCompiledResult*(prog: NvvmProgram; buffer: ptr char): NvvmResult
proc nvvmGetProgramLogSize*(prog: NvvmProgram; bufferSizeRet: var csize_t): NvvmResult
proc nvvmGetProgramLog*(prog: NvvmProgram; buffer: ptr char): NvvmResult
{.pop.} # {.push cdecl, importc, header: "<nvvm.h>".}
{.pop.} # {.push noconv, importc, header: "<nvvm.h>".}
# ############################################################
#

View File

@ -1 +1 @@
-d:debugConstantine
-d:CttDebug

View File

@ -1,2 +1,2 @@
-d:testingCurves
-d:debugConstantine
-d:CttDebug

View File

@ -142,11 +142,15 @@ proc runTowerTests*[N](
block:
var r{.noinit.}: Field
r.square(One)
check: bool(r == One)
doAssert bool(r == One),
"\n(" & $Field & "): Expected one: " & One.toHex() & "\n" &
"got: " & r.toHex()
block:
var r{.noinit.}: Field
r.prod(One, One)
check: bool(r == One)
doAssert bool(r == One),
"\n(" & $Field & "): Expected one: " & One.toHex() & "\n" &
"got: " & r.toHex()
staticFor(curve, TestCurves):
test(ExtField(ExtDegree, curve))
@ -168,12 +172,16 @@ proc runTowerTests*[N](
var r: Field
r.square(Two)
check: bool(r == Four)
doAssert bool(r == Four),
"\n(" & $Field & "): Expected 4: " & Four.toHex() & "\n" &
"got: " & r.toHex()
block:
var r: Field
r.prod(Two, Two)
check: bool(r == Four)
doAssert bool(r == Four),
"\n(" & $Field & "): Expected 4: " & Four.toHex() & "\n" &
"got: " & r.toHex()
staticFor(curve, TestCurves):
test(ExtField(ExtDegree, curve))
@ -197,12 +205,16 @@ proc runTowerTests*[N](
var u: Field
u.square(Three)
check: bool(u == Nine)
doAssert bool(u == Nine),
"\n(" & $Field & "): Expected 9: " & Nine.toHex() & "\n" &
"got: " & u.toHex()
block:
var u: Field
u.prod(Three, Three)
check: bool(u == Nine)
doAssert bool(u == Nine),
"\n(" & $Field & "): Expected 9: " & Nine.toHex() & "\n" &
"got: " & u.toHex()
staticFor(curve, TestCurves):
test(ExtField(ExtDegree, curve))
@ -226,12 +238,16 @@ proc runTowerTests*[N](
var u: Field
u.square(MinusThree)
check: bool(u == Nine)
doAssert bool(u == Nine),
"\n(" & $Field & "): Expected 9: " & Nine.toHex() & "\n" &
"got: " & u.toHex()
block:
var u: Field
u.prod(MinusThree, MinusThree)
check: bool(u == Nine)
doAssert bool(u == Nine),
"\n(" & $Field & "): Expected 9: " & Nine.toHex() & "\n" &
"got: " & u.toHex()
staticFor(curve, TestCurves):
test(ExtField(ExtDegree, curve))

View File

@ -22,10 +22,10 @@ import
const
Iters = 4
TestCurves = [
BN254_Nogami,
TestCurves = [ # Note: activating some combinations of curves causes miscompiles / bad constant propagation with LTO under MinGW GCC 12.2 on Windows (but not with 8.1, and not with 12.2 on Linux)
# BN254_Nogami,
BN254_Snarks,
BLS12_377,
# BLS12_377,
BLS12_381
]

View File

@ -37,7 +37,7 @@ when not defined(windows):
proc SHA256[T: byte|char](
msg: openarray[T],
digest: ptr array[32, byte] = nil
): ptr array[32, byte] {.cdecl, dynlib: DLLSSLName, importc.}
): ptr array[32, byte] {.noconv, dynlib: DLLSSLName, importc.}
# proc EVP_Q_digest[T: byte|char](
# ossl_libctx: pointer,
@ -45,7 +45,7 @@ when not defined(windows):
# propq: cstring,
# data: openArray[T],
# digest: var array[32, byte],
# size: ptr uint): int32 {.cdecl, dynlib: DLLSSLName, importc.}
# size: ptr uint): int32 {.noconv, dynlib: DLLSSLName, importc.}
proc SHA256_OpenSSL[T: byte|char](
digest: var array[32, byte],