mirror of
https://github.com/logos-storage/constantine.git
synced 2026-01-03 13:43:10 +00:00
Double-width tower extension part 1 (#72)
* Implement double-width field multiplication for double-width towering * Fp2 mul acceleration via double-width lazy reduction (pure Nim) * Inline assembly for basic add and sub * Use 2 registers instead of 12+ for ASM conditional copy * Prepare assembly for extended multiprecision multiplication support * Add assembly for mul * initial implementation of assembly reduction * stash current progress of assembly reduction * Fix clobbering issue, only P256 comparison remains buggy * Fix asm Montgomery reduction for NIST P256 as well * MULX/ADCX/ADOX multi-precision multiplication * MULX/ADCX/ADOX reduction v1 * Add (deactivated) assembly for double-width subtraction + rework benches * Add bench to nimble and deactivate double-width for now. slower than classic * Fix x86-32 running out of registers for mul * Clang needs to be at v9 to support flag output constraints (Xcode 11.4.2 / OSX Catalina) * 32-bit doesn't have enough registers for ASM mul * Fix again Travis Clang 9 issues * LLVM 9 is not whitelisted in travis * deactivated assembler with travis clang * syntax error * another * ... * missing space, yeah ...
This commit is contained in:
parent
5e8b1870a6
commit
d41c653c8a
25
.travis.yml
25
.travis.yml
@ -13,6 +13,7 @@ matrix:
|
|||||||
# Build and test on both x86-64 and ARM64
|
# Build and test on both x86-64 and ARM64
|
||||||
# Ubuntu Bionic (18.04) is needed, it includes
|
# Ubuntu Bionic (18.04) is needed, it includes
|
||||||
# GCC 7 codegen fixes to addcarry_u64.
|
# GCC 7 codegen fixes to addcarry_u64.
|
||||||
|
# Clang 9 (and GCC-6) are needed for inline assembly "flag output constraints"
|
||||||
- dist: bionic
|
- dist: bionic
|
||||||
arch: amd64
|
arch: amd64
|
||||||
env:
|
env:
|
||||||
@ -33,9 +34,19 @@ matrix:
|
|||||||
- ARCH=amd64
|
- ARCH=amd64
|
||||||
- CHANNEL=devel
|
- CHANNEL=devel
|
||||||
compiler: clang
|
compiler: clang
|
||||||
|
# addons:
|
||||||
|
# apt:
|
||||||
|
# sources:
|
||||||
|
# - ubuntu-toolchain-r-test
|
||||||
|
# - llvm-toolchain-bionic-9.0 # LLVM 9 repo is disallowed
|
||||||
|
# packages:
|
||||||
|
# - clang-9.0
|
||||||
|
# env:
|
||||||
|
# - MATRIX_EVAL="CC=clang-9.0 && CXX=clang++-9.0"
|
||||||
|
|
||||||
# On OSX we only test against clang (gcc is mapped to clang by default)
|
# On OSX we only test against clang (gcc is mapped to clang by default)
|
||||||
- os: osx
|
- os: osx
|
||||||
|
osx_image: xcode11.5 # Need xcode 11.4.2 min for Clang 9
|
||||||
arch: amd64
|
arch: amd64
|
||||||
env:
|
env:
|
||||||
- ARCH=amd64
|
- ARCH=amd64
|
||||||
@ -98,9 +109,17 @@ before_script:
|
|||||||
script:
|
script:
|
||||||
- nimble refresh
|
- nimble refresh
|
||||||
- nimble install gmp stew
|
- nimble install gmp stew
|
||||||
- nimble test_parallel
|
# Installing Clang9.0 or later is a pain in Travis
|
||||||
- if [[ "$ARCH" != "arm64" ]]; then
|
# for inline assembly "flag output constraint"
|
||||||
nimble test_parallel_no_assembler;
|
# Also MacOS build is timing out with 2 series of tests.
|
||||||
|
- |
|
||||||
|
if [[ "$TRAVIS_COMPILER" == "clang" ]]; then
|
||||||
|
nimble test_parallel_no_assembler
|
||||||
|
else
|
||||||
|
nimble test_parallel
|
||||||
|
if [[ "$ARCH" != "arm64" ]]; then
|
||||||
|
nimble test_parallel_no_assembler
|
||||||
|
fi
|
||||||
fi
|
fi
|
||||||
branches:
|
branches:
|
||||||
except:
|
except:
|
||||||
|
|||||||
@ -36,7 +36,7 @@ strategy:
|
|||||||
# TEST_LANG: c
|
# TEST_LANG: c
|
||||||
|
|
||||||
MacOS_devel_64bit:
|
MacOS_devel_64bit:
|
||||||
VM: 'macOS-10.14'
|
VM: 'macOS-10.15'
|
||||||
UCPU: amd64
|
UCPU: amd64
|
||||||
CHANNEL: devel
|
CHANNEL: devel
|
||||||
TEST_LANG: c
|
TEST_LANG: c
|
||||||
|
|||||||
@ -61,7 +61,7 @@ echo "Optimization level => "
|
|||||||
echo " no optimization: ", not defined(release)
|
echo " no optimization: ", not defined(release)
|
||||||
echo " release: ", defined(release)
|
echo " release: ", defined(release)
|
||||||
echo " danger: ", defined(danger)
|
echo " danger: ", defined(danger)
|
||||||
echo " inline assembly: ", UseX86ASM
|
echo " inline assembly: ", UseASM_X86_64
|
||||||
|
|
||||||
when (sizeof(int) == 4) or defined(Constantine32):
|
when (sizeof(int) == 4) or defined(Constantine32):
|
||||||
echo "⚠️ Warning: using Constantine with 32-bit limbs"
|
echo "⚠️ Warning: using Constantine with 32-bit limbs"
|
||||||
|
|||||||
@ -58,7 +58,7 @@ echo "Optimization level => "
|
|||||||
echo " no optimization: ", not defined(release)
|
echo " no optimization: ", not defined(release)
|
||||||
echo " release: ", defined(release)
|
echo " release: ", defined(release)
|
||||||
echo " danger: ", defined(danger)
|
echo " danger: ", defined(danger)
|
||||||
echo " inline assembly: ", UseX86ASM
|
echo " inline assembly: ", UseASM_X86_64
|
||||||
|
|
||||||
when (sizeof(int) == 4) or defined(Constantine32):
|
when (sizeof(int) == 4) or defined(Constantine32):
|
||||||
echo "⚠️ Warning: using Constantine with 32-bit limbs"
|
echo "⚠️ Warning: using Constantine with 32-bit limbs"
|
||||||
|
|||||||
196
benchmarks/bench_fp_double_width.nim
Normal file
196
benchmarks/bench_fp_double_width.nim
Normal file
@ -0,0 +1,196 @@
|
|||||||
|
# Constantine
|
||||||
|
# Copyright (c) 2018-2019 Status Research & Development GmbH
|
||||||
|
# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
|
||||||
|
# Licensed and distributed under either of
|
||||||
|
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
|
||||||
|
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
|
||||||
|
# at your option. This file may not be copied, modified, or distributed except according to those terms.
|
||||||
|
|
||||||
|
# ############################################################
|
||||||
|
#
|
||||||
|
# Benchmark of finite fields
|
||||||
|
#
|
||||||
|
# ############################################################
|
||||||
|
|
||||||
|
import
|
||||||
|
# Internals
|
||||||
|
../constantine/config/[curves, common],
|
||||||
|
../constantine/arithmetic,
|
||||||
|
../constantine/towers,
|
||||||
|
# Helpers
|
||||||
|
../helpers/[prng_unsafe, static_for],
|
||||||
|
./platforms,
|
||||||
|
# Standard library
|
||||||
|
std/[monotimes, times, strformat, strutils, macros]
|
||||||
|
|
||||||
|
# Module-wide PRNG state shared by every benchmark below.
var rng: RngState
# Keep only the low 32 bits of the current Unix time (unixTime mod 2^32).
let seed = uint32(getTime().toUnix() and 0xFFFF_FFFF'i64)
rng.seed(seed)
# Print the seed that was used for this run.
echo "bench xoshiro512** seed: ", seed
|
||||||
|
|
||||||
|
# warmup
|
||||||
|
proc warmup*() =
  ## Warmup - make sure cpu is on max perf.
  ## Runs 300 million iterations of a small integer recurrence, then prints
  ## the CPU time spent and the final value.
  let t0 = cpuTime()
  var acc = 123
  for k in 0 ..< 300_000_000:
    acc = (acc + k*k mod 456) mod 789

  # Compiler shouldn't optimize away the results as cpuTime rely on sideeffects
  let t1 = cpuTime()
  echo &"Warmup: {t1 - t0:>4.4f} s, result {acc} (displayed to avoid compiler optimizing warmup away)\n"
|
||||||
|
|
||||||
|
warmup()
|
||||||
|
|
||||||
|
# Report the C backend used to build this benchmark.
const benchCompilerBanner =
  when defined(gcc): "\nCompiled with GCC"
  elif defined(clang): "\nCompiled with Clang"
  elif defined(vcc): "\nCompiled with MSVC"
  elif defined(icc): "\nCompiled with ICC"
  else: "\nCompiled with an unknown compiler"
echo benchCompilerBanner

# Report the build flags that influence the measurements.
echo "Optimization level => "
echo " no optimization: ", not defined(release)
echo " release: ", defined(release)
echo " danger: ", defined(danger)
echo " inline assembly: ", UseASM_X86_64

# Report which limb size this build uses.
when defined(Constantine32) or (sizeof(int) == 4):
  echo "⚠️ Warning: using Constantine with 32-bit limbs"
else:
  echo "Using Constantine with 64-bit limbs"

when SupportsCPUName:
  echo "Running on ", cpuName(), ""

when SupportsGetTicks:
  echo "\n⚠️ Cycles measurements are approximate and use the CPU nominal clock: Turbo-Boost and overclocking will skew them."
  echo "i.e. a 20% overclock will be about 20% off (assuming no dynamic frequency scaling)"

echo "\n=================================================================================================================\n"
|
||||||
|
|
||||||
|
proc separator*() =
  ## Print a horizontal rule made of 145 dashes.
  echo repeat('-', 145)
|
||||||
|
|
||||||
|
proc report(op, field: string, start, stop: MonoTime, startClk, stopClk: int64, iters: int) =
  ## Print one result line for the benchmark `op` run on `field`.
  ##
  ## - `start`, `stop`: monotonic timestamps taken around the whole loop
  ## - `startClk`, `stopClk`: tick counter values taken around the whole loop,
  ##   only printed when `SupportsGetTicks` is true
  ## - `iters`: number of iterations executed between the two measurements
  let elapsed = stop - start
  # Per-operation time, truncated to whole nanoseconds for display.
  let ns = inNanoseconds(elapsed div iters)
  # Throughput is derived from the total elapsed time rather than from the
  # truncated `ns`: for operations lasting only a few nanoseconds the
  # truncation would otherwise dominate the result (and give `inf` when it
  # rounds down to 0).
  let throughput = float64(iters) * 1e9 / float64(inNanoseconds(elapsed))
  when SupportsGetTicks:
    echo &"{op:<28} {field:<40} {throughput:>15.3f} ops/s {ns:>9} ns/op {(stopClk - startClk) div iters:>9} CPU cycles (approx)"
  else:
    echo &"{op:<28} {field:<40} {throughput:>15.3f} ops/s {ns:>9} ns/op"
|
||||||
|
|
||||||
|
proc notes*() =
  ## Print the closing remarks of the benchmark: how to run it with or without
  ## inline assembly and general caveats about the measurements.
  echo "Notes:"
  echo " - Compilers:"
  echo " Compilers are severely limited on multiprecision arithmetic."
  # This benchmark is run by the `bench_fpdbl*` nimble tasks; assembly is only
  # disabled by the `*_noasm` variants.
  echo " Inline Assembly is used by default (nimble bench_fpdbl)."
  echo " Bench without assembly can use \"nimble bench_fpdbl_gcc_noasm\" or \"nimble bench_fpdbl_clang_noasm\"."
  echo " GCC is significantly slower than Clang on multiprecision arithmetic due to catastrophic handling of carries."
  echo " - The simplest operations might be optimized away by the compiler."
  echo " - Fast Squaring and Fast Multiplication are possible if there are spare bits in the prime representation (i.e. the prime uses 254 bits out of 256 bits)"
|
||||||
|
|
||||||
|
template bench(op: string, desc: string, iters: int, body: untyped): untyped =
  ## Run `body` `iters` times, measuring the whole loop with the monotonic
  ## clock (and with the tick counter when `SupportsGetTicks` is true), then
  ## hand the measurements to `report` under the labels `op` / `desc`.
  let start = getMonotime()
  when SupportsGetTicks:
    let startClk = getTicks()
  else:
    # No tick counter available: pass a placeholder to `report`.
    let startClk = -1'i64

  for _ in 0 ..< iters:
    body

  when SupportsGetTicks:
    let stopClk = getTicks()
  else:
    let stopClk = -1'i64
  let stop = getMonotime()

  report(op, desc, start, stop, startClk, stopClk, iters)
|
||||||
|
|
||||||
|
func random_unsafe(rng: var RngState, a: var FpDbl, Base: typedesc) =
  ## Fill the double-width element `a` with two random `Base` elements:
  ## the limbs of the second draw go in the low half, the limbs of the first
  ## draw in the high half.
  ## The value is not reduced modulo p², this is only meant for benchmarks.
  let upperPart = rng.random_unsafe(Base)
  let lowerPart = rng.random_unsafe(Base)
  let lowerLen = lowerPart.mres.limbs.len

  for idx in 0 ..< lowerLen:
    a.limbs2x[idx] = lowerPart.mres.limbs[idx]
  for idx in 0 ..< upperPart.mres.limbs.len:
    a.limbs2x[lowerLen + idx] = upperPart.mres.limbs[idx]
|
||||||
|
|
||||||
|
proc sumNoReduce(T: typedesc, iters: int) =
  ## Benchmark `sumNoReduce` on two random elements of `T`.
  var dst: T
  let lhs = rng.random_unsafe(T)
  let rhs = rng.random_unsafe(T)
  bench("Addition no reduce", $T, iters):
    dst.sumNoReduce(lhs, rhs)
|
||||||
|
|
||||||
|
proc sum(T: typedesc, iters: int) =
  ## Benchmark `sum` on two random elements of `T`.
  var dst: T
  let lhs = rng.random_unsafe(T)
  let rhs = rng.random_unsafe(T)
  bench("Addition", $T, iters):
    dst.sum(lhs, rhs)
|
||||||
|
|
||||||
|
proc diffNoReduce(T: typedesc, iters: int) =
  ## Benchmark `diffNoReduce` on two random elements of `T`.
  var dst: T
  let lhs = rng.random_unsafe(T)
  let rhs = rng.random_unsafe(T)
  bench("Substraction no reduce", $T, iters):
    dst.diffNoReduce(lhs, rhs)
|
||||||
|
|
||||||
|
proc diff(T: typedesc, iters: int) =
  ## Benchmark `diff` on two random elements of `T`.
  var dst: T
  let lhs = rng.random_unsafe(T)
  let rhs = rng.random_unsafe(T)
  bench("Substraction", $T, iters):
    dst.diff(lhs, rhs)
|
||||||
|
|
||||||
|
proc diff2xNoReduce(T: typedesc, iters: int) =
  ## Benchmark `diffNoReduce` on random double-width elements matching `T`.
  var dst, lhs, rhs: doubleWidth(T)
  # All three operands are randomized, in the order destination, lhs, rhs.
  rng.random_unsafe(dst, T)
  rng.random_unsafe(lhs, T)
  rng.random_unsafe(rhs, T)
  bench("Substraction 2x no reduce", $doubleWidth(T), iters):
    dst.diffNoReduce(lhs, rhs)
|
||||||
|
|
||||||
|
proc diff2x(T: typedesc, iters: int) =
  ## Benchmark `diff` on random double-width elements matching `T`.
  var dst, lhs, rhs: doubleWidth(T)
  # All three operands are randomized, in the order destination, lhs, rhs.
  rng.random_unsafe(dst, T)
  rng.random_unsafe(lhs, T)
  rng.random_unsafe(rhs, T)
  bench("Substraction 2x", $doubleWidth(T), iters):
    dst.diff(lhs, rhs)
|
||||||
|
|
||||||
|
proc mul2xBench*(rLen, aLen, bLen: static int, iters: int) =
  ## Benchmark `prod` of a random `BigInt[aLen]` by a random `BigInt[bLen]`
  ## stored into a `BigInt[rLen]`.
  var product: BigInt[rLen]
  let lhs = rng.random_unsafe(BigInt[aLen])
  let rhs = rng.random_unsafe(BigInt[bLen])
  bench("Multiplication", $rLen & " <- " & $aLen & " x " & $bLen, iters):
    product.prod(lhs, rhs)
|
||||||
|
|
||||||
|
proc reduce2x*(T: typedesc, iters: int) =
  ## Benchmark `reduce` of a random double-width element into a `T`.
  var reduced: T
  var wide: doubleWidth(T)
  rng.random_unsafe(wide, T)

  bench("Reduce 2x-width", $T & " <- " & $doubleWidth(T), iters):
    reduced.reduce(wide)
|
||||||
|
|
||||||
|
proc main() =
  ## Run every benchmark of this file on BLS12-381, framed by two separators.
  const Iters = 10_000_000

  separator()
  sumNoReduce(Fp[BLS12_381], Iters)
  diffNoReduce(Fp[BLS12_381], Iters)
  sum(Fp[BLS12_381], Iters)
  diff(Fp[BLS12_381], Iters)
  diff2x(Fp[BLS12_381], Iters)
  diff2xNoReduce(Fp[BLS12_381], Iters)
  mul2xBench(768, 384, 384, Iters)
  reduce2x(Fp[BLS12_381], Iters)
  separator()
|
||||||
|
|
||||||
|
main()
|
||||||
|
notes()
|
||||||
@ -35,6 +35,8 @@ const testDesc: seq[tuple[path: string, useGMP: bool]] = @[
|
|||||||
("tests/t_finite_fields_vs_gmp.nim", true),
|
("tests/t_finite_fields_vs_gmp.nim", true),
|
||||||
# Precompute
|
# Precompute
|
||||||
("tests/t_precomputed", false),
|
("tests/t_precomputed", false),
|
||||||
|
# Double-width finite fields
|
||||||
|
("tests/t_finite_fields_double_width.nim", false),
|
||||||
# Towers of extension fields
|
# Towers of extension fields
|
||||||
("tests/t_fp2.nim", false),
|
("tests/t_fp2.nim", false),
|
||||||
("tests/t_fp2_sqrt.nim", false),
|
("tests/t_fp2_sqrt.nim", false),
|
||||||
@ -100,13 +102,15 @@ proc test(flags, path: string, commandFile = false) =
|
|||||||
# commandFile.writeLine command
|
# commandFile.writeLine command
|
||||||
exec "echo \'" & command & "\' >> " & buildParallel
|
exec "echo \'" & command & "\' >> " & buildParallel
|
||||||
|
|
||||||
proc runBench(benchName: string, compiler = "") =
|
proc runBench(benchName: string, compiler = "", useAsm = true) =
|
||||||
if not dirExists "build":
|
if not dirExists "build":
|
||||||
mkDir "build"
|
mkDir "build"
|
||||||
|
|
||||||
var cc = ""
|
var cc = ""
|
||||||
if compiler != "":
|
if compiler != "":
|
||||||
cc = "--cc:" & compiler & " -d:ConstantineASM=false"
|
cc = "--cc:" & compiler
|
||||||
|
if not useAsm:
|
||||||
|
cc &= " -d:ConstantineASM=false"
|
||||||
exec "nim c " & cc &
|
exec "nim c " & cc &
|
||||||
" -d:danger --verbosity:0 -o:build/" & benchName & "_" & compiler &
|
" -d:danger --verbosity:0 -o:build/" & benchName & "_" & compiler &
|
||||||
" -r --hints:off --warnings:off benchmarks/" & benchName & ".nim"
|
" -r --hints:off --warnings:off benchmarks/" & benchName & ".nim"
|
||||||
@ -298,6 +302,27 @@ task bench_fp_gcc, "Run benchmark 𝔽p with gcc":
|
|||||||
task bench_fp_clang, "Run benchmark 𝔽p with clang":
|
task bench_fp_clang, "Run benchmark 𝔽p with clang":
|
||||||
runBench("bench_fp", "clang")
|
runBench("bench_fp", "clang")
|
||||||
|
|
||||||
|
task bench_fp_gcc_noasm, "Run benchmark 𝔽p with gcc - no Assembly":
|
||||||
|
runBench("bench_fp", "gcc", useAsm = false)
|
||||||
|
|
||||||
|
task bench_fp_clang_noasm, "Run benchmark 𝔽p with clang - no Assembly":
|
||||||
|
runBench("bench_fp", "clang", useAsm = false)
|
||||||
|
|
||||||
|
task bench_fpdbl, "Run benchmark 𝔽pDbl with your default compiler":
|
||||||
|
runBench("bench_fp_double_width")
|
||||||
|
|
||||||
|
# Compiler-specific variants of the double-width 𝔽pDbl benchmark.
task bench_fpdbl_gcc, "Run benchmark 𝔽pDbl with gcc":
  runBench("bench_fp_double_width", "gcc")

task bench_fpdbl_clang, "Run benchmark 𝔽pDbl with clang":
  runBench("bench_fp_double_width", "clang")

task bench_fpdbl_gcc_noasm, "Run benchmark 𝔽pDbl with gcc - no Assembly":
  runBench("bench_fp_double_width", "gcc", useAsm = false)

task bench_fpdbl_clang_noasm, "Run benchmark 𝔽pDbl with clang - no Assembly":
  runBench("bench_fp_double_width", "clang", useAsm = false)
|
||||||
|
|
||||||
task bench_fp2, "Run benchmark with 𝔽p2 your default compiler":
|
task bench_fp2, "Run benchmark with 𝔽p2 your default compiler":
|
||||||
runBench("bench_fp2")
|
runBench("bench_fp2")
|
||||||
|
|
||||||
@ -307,6 +332,12 @@ task bench_fp2_gcc, "Run benchmark 𝔽p2 with gcc":
|
|||||||
task bench_fp2_clang, "Run benchmark 𝔽p2 with clang":
|
task bench_fp2_clang, "Run benchmark 𝔽p2 with clang":
|
||||||
runBench("bench_fp2", "clang")
|
runBench("bench_fp2", "clang")
|
||||||
|
|
||||||
|
task bench_fp2_gcc_noasm, "Run benchmark 𝔽p2 with gcc - no Assembly":
|
||||||
|
runBench("bench_fp2", "gcc", useAsm = false)
|
||||||
|
|
||||||
|
task bench_fp2_clang_noasm, "Run benchmark 𝔽p2 with clang - no Assembly":
|
||||||
|
runBench("bench_fp2", "clang", useAsm = false)
|
||||||
|
|
||||||
task bench_fp6, "Run benchmark with 𝔽p6 your default compiler":
|
task bench_fp6, "Run benchmark with 𝔽p6 your default compiler":
|
||||||
runBench("bench_fp6")
|
runBench("bench_fp6")
|
||||||
|
|
||||||
@ -316,6 +347,12 @@ task bench_fp6_gcc, "Run benchmark 𝔽p6 with gcc":
|
|||||||
task bench_fp6_clang, "Run benchmark 𝔽p6 with clang":
|
task bench_fp6_clang, "Run benchmark 𝔽p6 with clang":
|
||||||
runBench("bench_fp6", "clang")
|
runBench("bench_fp6", "clang")
|
||||||
|
|
||||||
|
task bench_fp6_gcc_noasm, "Run benchmark 𝔽p6 with gcc - no Assembly":
|
||||||
|
runBench("bench_fp6", "gcc", useAsm = false)
|
||||||
|
|
||||||
|
task bench_fp6_clang_noasm, "Run benchmark 𝔽p6 with clang - no Assembly":
|
||||||
|
runBench("bench_fp6", "clang", useAsm = false)
|
||||||
|
|
||||||
task bench_fp12, "Run benchmark with 𝔽p12 your default compiler":
|
task bench_fp12, "Run benchmark with 𝔽p12 your default compiler":
|
||||||
runBench("bench_fp12")
|
runBench("bench_fp12")
|
||||||
|
|
||||||
@ -325,6 +362,12 @@ task bench_fp12_gcc, "Run benchmark 𝔽p12 with gcc":
|
|||||||
task bench_fp12_clang, "Run benchmark 𝔽p12 with clang":
|
task bench_fp12_clang, "Run benchmark 𝔽p12 with clang":
|
||||||
runBench("bench_fp12", "clang")
|
runBench("bench_fp12", "clang")
|
||||||
|
|
||||||
|
task bench_fp12_gcc_noasm, "Run benchmark 𝔽p12 with gcc - no Assembly":
|
||||||
|
runBench("bench_fp12", "gcc", useAsm = false)
|
||||||
|
|
||||||
|
task bench_fp12_clang_noasm, "Run benchmark 𝔽p12 with clang - no Assembly":
|
||||||
|
runBench("bench_fp12", "clang", useAsm = false)
|
||||||
|
|
||||||
task bench_ec_g1, "Run benchmark on Elliptic Curve group 𝔾1 - Short Weierstrass with Projective Coordinates - GCC":
|
task bench_ec_g1, "Run benchmark on Elliptic Curve group 𝔾1 - Short Weierstrass with Projective Coordinates - GCC":
|
||||||
runBench("bench_ec_g1")
|
runBench("bench_ec_g1")
|
||||||
|
|
||||||
@ -334,6 +377,12 @@ task bench_ec_g1_gcc, "Run benchmark on Elliptic Curve group 𝔾1 - Short Weier
|
|||||||
task bench_ec_g1_clang, "Run benchmark on Elliptic Curve group 𝔾1 - Short Weierstrass with Projective Coordinates - Clang":
|
task bench_ec_g1_clang, "Run benchmark on Elliptic Curve group 𝔾1 - Short Weierstrass with Projective Coordinates - Clang":
|
||||||
runBench("bench_ec_g1", "clang")
|
runBench("bench_ec_g1", "clang")
|
||||||
|
|
||||||
|
task bench_ec_g1_gcc_noasm, "Run benchmark on Elliptic Curve group 𝔾1 - Short Weierstrass with Projective Coordinates - GCC no Assembly":
|
||||||
|
runBench("bench_ec_g1", "gcc", useAsm = false)
|
||||||
|
|
||||||
|
task bench_ec_g1_clang_noasm, "Run benchmark on Elliptic Curve group 𝔾1 - Short Weierstrass with Projective Coordinates - Clang no Assembly":
|
||||||
|
runBench("bench_ec_g1", "clang", useAsm = false)
|
||||||
|
|
||||||
task bench_ec_g2, "Run benchmark on Elliptic Curve group 𝔾2 - Short Weierstrass with Projective Coordinates - GCC":
|
task bench_ec_g2, "Run benchmark on Elliptic Curve group 𝔾2 - Short Weierstrass with Projective Coordinates - GCC":
|
||||||
runBench("bench_ec_g2")
|
runBench("bench_ec_g2")
|
||||||
|
|
||||||
@ -342,3 +391,9 @@ task bench_ec_g2_gcc, "Run benchmark on Elliptic Curve group 𝔾2 - Short Weier
|
|||||||
|
|
||||||
task bench_ec_g2_clang, "Run benchmark on Elliptic Curve group 𝔾2 - Short Weierstrass with Projective Coordinates - Clang":
|
task bench_ec_g2_clang, "Run benchmark on Elliptic Curve group 𝔾2 - Short Weierstrass with Projective Coordinates - Clang":
|
||||||
runBench("bench_ec_g2", "clang")
|
runBench("bench_ec_g2", "clang")
|
||||||
|
|
||||||
|
task bench_ec_g2_gcc_noasm, "Run benchmark on Elliptic Curve group 𝔾2 - Short Weierstrass with Projective Coordinates - GCC no Assembly":
|
||||||
|
runBench("bench_ec_g2", "gcc", useAsm = false)
|
||||||
|
|
||||||
|
task bench_ec_g2_clang_noasm, "Run benchmark on Elliptic Curve group 𝔾2 - Short Weierstrass with Projective Coordinates - Clang no Assembly":
|
||||||
|
runBench("bench_ec_g2", "clang", useAsm = false)
|
||||||
|
|||||||
@ -8,8 +8,8 @@
|
|||||||
|
|
||||||
import
|
import
|
||||||
arithmetic/bigints,
|
arithmetic/bigints,
|
||||||
arithmetic/[finite_fields, finite_fields_inversion]
|
arithmetic/[finite_fields, finite_fields_inversion, finite_fields_double_width]
|
||||||
|
|
||||||
export
|
export
|
||||||
bigints,
|
bigints,
|
||||||
finite_fields, finite_fields_inversion
|
finite_fields, finite_fields_inversion, finite_fields_double_width
|
||||||
|
|||||||
@ -9,13 +9,10 @@
|
|||||||
import
|
import
|
||||||
../config/[common, type_bigint],
|
../config/[common, type_bigint],
|
||||||
../primitives,
|
../primitives,
|
||||||
./limbs_generic,
|
./limbs,
|
||||||
./limbs_generic_modular,
|
./limbs_generic_modular,
|
||||||
./limbs_montgomery
|
./limbs_montgomery
|
||||||
|
|
||||||
when UseX86ASM:
|
|
||||||
import ./limbs_asm_x86
|
|
||||||
|
|
||||||
export BigInt
|
export BigInt
|
||||||
|
|
||||||
# ############################################################
|
# ############################################################
|
||||||
@ -85,10 +82,7 @@ func ccopy*(a: var BigInt, b: BigInt, ctl: SecretBool) =
|
|||||||
## If ctl is true: b is copied into a
|
## If ctl is true: b is copied into a
|
||||||
## if ctl is false: b is not copied and a is untouched
|
## if ctl is false: b is not copied and a is untouched
|
||||||
## Time and memory accesses are the same whether a copy occurs or not
|
## Time and memory accesses are the same whether a copy occurs or not
|
||||||
when UseX86ASM:
|
ccopy(a.limbs, b.limbs, ctl)
|
||||||
ccopy_asm(a.limbs, b.limbs, ctl)
|
|
||||||
else:
|
|
||||||
ccopy(a.limbs, b.limbs, ctl)
|
|
||||||
|
|
||||||
func cswap*(a, b: var BigInt, ctl: CTBool) =
|
func cswap*(a, b: var BigInt, ctl: CTBool) =
|
||||||
## Swap ``a`` and ``b`` if ``ctl`` is true
|
## Swap ``a`` and ``b`` if ``ctl`` is true
|
||||||
@ -245,6 +239,14 @@ func prod*[rBits, aBits, bBits](r: var BigInt[rBits], a: BigInt[aBits], b: BigIn
|
|||||||
## It will be truncated if it cannot fit in r limbs.
|
## It will be truncated if it cannot fit in r limbs.
|
||||||
r.limbs.prod(a.limbs, b.limbs)
|
r.limbs.prod(a.limbs, b.limbs)
|
||||||
|
|
||||||
|
func mul*[aBits, bBits](a: var BigInt[aBits], b: BigInt[bBits]) =
|
||||||
|
## Multi-precision multiplication
|
||||||
|
## a <- a*b
|
||||||
|
## `a`, `b`, can have different sizes
|
||||||
|
var t{.noInit.}: typeof(a)
|
||||||
|
t.limbs.prod(a.limbs, b.limbs)
|
||||||
|
a = t
|
||||||
|
|
||||||
func prod_high_words*[rBits, aBits, bBits](r: var BigInt[rBits], a: BigInt[aBits], b: BigInt[bBits], lowestWordIndex: static int) =
|
func prod_high_words*[rBits, aBits, bBits](r: var BigInt[rBits], a: BigInt[aBits], b: BigInt[bBits], lowestWordIndex: static int) =
|
||||||
## Multi-precision multiplication keeping only high words
|
## Multi-precision multiplication keeping only high words
|
||||||
## r <- a*b >> (2^WordBitWidth)^lowestWordIndex
|
## r <- a*b >> (2^WordBitWidth)^lowestWordIndex
|
||||||
|
|||||||
@ -29,7 +29,7 @@ import
|
|||||||
../config/[common, type_fp, curves],
|
../config/[common, type_fp, curves],
|
||||||
./bigints, ./limbs_montgomery
|
./bigints, ./limbs_montgomery
|
||||||
|
|
||||||
when UseX86ASM:
|
when UseASM_X86_64:
|
||||||
import ./limbs_asm_modular_x86
|
import ./limbs_asm_modular_x86
|
||||||
|
|
||||||
export Fp
|
export Fp
|
||||||
@ -120,7 +120,7 @@ func setOne*(a: var Fp) {.inline.} =
|
|||||||
|
|
||||||
func `+=`*(a: var Fp, b: Fp) {.inline.} =
|
func `+=`*(a: var Fp, b: Fp) {.inline.} =
|
||||||
## In-place addition modulo p
|
## In-place addition modulo p
|
||||||
when UseX86ASM and a.mres.limbs.len <= 6: # TODO: handle spilling
|
when UseASM_X86_64 and a.mres.limbs.len <= 6: # TODO: handle spilling
|
||||||
addmod_asm(a.mres.limbs, b.mres.limbs, Fp.C.Mod.limbs)
|
addmod_asm(a.mres.limbs, b.mres.limbs, Fp.C.Mod.limbs)
|
||||||
else:
|
else:
|
||||||
var overflowed = add(a.mres, b.mres)
|
var overflowed = add(a.mres, b.mres)
|
||||||
@ -129,7 +129,7 @@ func `+=`*(a: var Fp, b: Fp) {.inline.} =
|
|||||||
|
|
||||||
func `-=`*(a: var Fp, b: Fp) {.inline.} =
|
func `-=`*(a: var Fp, b: Fp) {.inline.} =
|
||||||
## In-place substraction modulo p
|
## In-place substraction modulo p
|
||||||
when UseX86ASM and a.mres.limbs.len <= 6: # TODO: handle spilling
|
when UseASM_X86_64 and a.mres.limbs.len <= 6: # TODO: handle spilling
|
||||||
submod_asm(a.mres.limbs, b.mres.limbs, Fp.C.Mod.limbs)
|
submod_asm(a.mres.limbs, b.mres.limbs, Fp.C.Mod.limbs)
|
||||||
else:
|
else:
|
||||||
let underflowed = sub(a.mres, b.mres)
|
let underflowed = sub(a.mres, b.mres)
|
||||||
@ -137,7 +137,7 @@ func `-=`*(a: var Fp, b: Fp) {.inline.} =
|
|||||||
|
|
||||||
func double*(a: var Fp) {.inline.} =
|
func double*(a: var Fp) {.inline.} =
|
||||||
## Double ``a`` modulo p
|
## Double ``a`` modulo p
|
||||||
when UseX86ASM and a.mres.limbs.len <= 6: # TODO: handle spilling
|
when UseASM_X86_64 and a.mres.limbs.len <= 6: # TODO: handle spilling
|
||||||
addmod_asm(a.mres.limbs, a.mres.limbs, Fp.C.Mod.limbs)
|
addmod_asm(a.mres.limbs, a.mres.limbs, Fp.C.Mod.limbs)
|
||||||
else:
|
else:
|
||||||
var overflowed = double(a.mres)
|
var overflowed = double(a.mres)
|
||||||
@ -145,9 +145,9 @@ func double*(a: var Fp) {.inline.} =
|
|||||||
discard csub(a.mres, Fp.C.Mod, overflowed)
|
discard csub(a.mres, Fp.C.Mod, overflowed)
|
||||||
|
|
||||||
func sum*(r: var Fp, a, b: Fp) {.inline.} =
|
func sum*(r: var Fp, a, b: Fp) {.inline.} =
|
||||||
## Sum ``a`` and ``b`` into ``r`` module p
|
## Sum ``a`` and ``b`` into ``r`` modulo p
|
||||||
## r is initialized/overwritten
|
## r is initialized/overwritten
|
||||||
when UseX86ASM and a.mres.limbs.len <= 6: # TODO: handle spilling
|
when UseASM_X86_64 and a.mres.limbs.len <= 6: # TODO: handle spilling
|
||||||
r = a
|
r = a
|
||||||
addmod_asm(r.mres.limbs, b.mres.limbs, Fp.C.Mod.limbs)
|
addmod_asm(r.mres.limbs, b.mres.limbs, Fp.C.Mod.limbs)
|
||||||
else:
|
else:
|
||||||
@ -155,21 +155,42 @@ func sum*(r: var Fp, a, b: Fp) {.inline.} =
|
|||||||
overflowed = overflowed or not(r.mres < Fp.C.Mod)
|
overflowed = overflowed or not(r.mres < Fp.C.Mod)
|
||||||
discard csub(r.mres, Fp.C.Mod, overflowed)
|
discard csub(r.mres, Fp.C.Mod, overflowed)
|
||||||
|
|
||||||
|
func sumNoReduce*(r: var Fp, a, b: Fp) {.inline.} =
|
||||||
|
## Sum ``a`` and ``b`` into ``r`` without reduction
|
||||||
|
discard r.mres.sum(a.mres, b.mres)
|
||||||
|
|
||||||
func diff*(r: var Fp, a, b: Fp) {.inline.} =
|
func diff*(r: var Fp, a, b: Fp) {.inline.} =
|
||||||
## Substract `b` from `a` and store the result into `r`.
|
## Substract `b` from `a` and store the result into `r`.
|
||||||
## `r` is initialized/overwritten
|
## `r` is initialized/overwritten
|
||||||
when UseX86ASM and a.mres.limbs.len <= 6: # TODO: handle spilling
|
## Requires r != b
|
||||||
var t = a # Handle aliasing r == b
|
when UseASM_X86_64 and a.mres.limbs.len <= 6: # TODO: handle spilling
|
||||||
|
r = a
|
||||||
|
submod_asm(r.mres.limbs, b.mres.limbs, Fp.C.Mod.limbs)
|
||||||
|
else:
|
||||||
|
var underflowed = r.mres.diff(a.mres, b.mres)
|
||||||
|
discard cadd(r.mres, Fp.C.Mod, underflowed)
|
||||||
|
|
||||||
|
func diffAlias*(r: var Fp, a, b: Fp) {.inline.} =
|
||||||
|
## Substract `b` from `a` and store the result into `r`.
|
||||||
|
## `r` is initialized/overwritten
|
||||||
|
## Handles r == b
|
||||||
|
when UseASM_X86_64 and a.mres.limbs.len <= 6: # TODO: handle spilling
|
||||||
|
var t = a
|
||||||
submod_asm(t.mres.limbs, b.mres.limbs, Fp.C.Mod.limbs)
|
submod_asm(t.mres.limbs, b.mres.limbs, Fp.C.Mod.limbs)
|
||||||
r = t
|
r = t
|
||||||
else:
|
else:
|
||||||
var underflowed = r.mres.diff(a.mres, b.mres)
|
var underflowed = r.mres.diff(a.mres, b.mres)
|
||||||
discard cadd(r.mres, Fp.C.Mod, underflowed)
|
discard cadd(r.mres, Fp.C.Mod, underflowed)
|
||||||
|
|
||||||
|
func diffNoReduce*(r: var Fp, a, b: Fp) {.inline.} =
|
||||||
|
## Substract `b` from `a` and store the result into `r`
|
||||||
|
## without reduction
|
||||||
|
discard r.mres.diff(a.mres, b.mres)
|
||||||
|
|
||||||
func double*(r: var Fp, a: Fp) {.inline.} =
|
func double*(r: var Fp, a: Fp) {.inline.} =
|
||||||
## Double ``a`` into ``r``
|
## Double ``a`` into ``r``
|
||||||
## `r` is initialized/overwritten
|
## `r` is initialized/overwritten
|
||||||
when UseX86ASM and a.mres.limbs.len <= 6: # TODO: handle spilling
|
when UseASM_X86_64 and a.mres.limbs.len <= 6: # TODO: handle spilling
|
||||||
r = a
|
r = a
|
||||||
addmod_asm(r.mres.limbs, a.mres.limbs, Fp.C.Mod.limbs)
|
addmod_asm(r.mres.limbs, a.mres.limbs, Fp.C.Mod.limbs)
|
||||||
else:
|
else:
|
||||||
@ -188,7 +209,7 @@ func square*(r: var Fp, a: Fp) {.inline.} =
|
|||||||
|
|
||||||
func neg*(r: var Fp, a: Fp) {.inline.} =
|
func neg*(r: var Fp, a: Fp) {.inline.} =
|
||||||
## Negate modulo p
|
## Negate modulo p
|
||||||
when UseX86ASM and defined(gcc):
|
when UseASM_X86_64 and defined(gcc):
|
||||||
# Clang and every compiler besides GCC
|
# Clang and every compiler besides GCC
|
||||||
# can cleanly optimized this
|
# can cleanly optimized this
|
||||||
# especially on Fp2
|
# especially on Fp2
|
||||||
|
|||||||
72
constantine/arithmetic/finite_fields_double_width.nim
Normal file
72
constantine/arithmetic/finite_fields_double_width.nim
Normal file
@ -0,0 +1,72 @@
|
|||||||
|
# Constantine
|
||||||
|
# Copyright (c) 2018-2019 Status Research & Development GmbH
|
||||||
|
# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
|
||||||
|
# Licensed and distributed under either of
|
||||||
|
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
|
||||||
|
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
|
||||||
|
# at your option. This file may not be copied, modified, or distributed except according to those terms.
|
||||||
|
|
||||||
|
import
|
||||||
|
../config/[common, curves, type_bigint, type_fp],
|
||||||
|
../primitives,
|
||||||
|
./bigints,
|
||||||
|
./finite_fields,
|
||||||
|
./limbs,
|
||||||
|
./limbs_double_width
|
||||||
|
|
||||||
|
when UseASM_X86_64:
|
||||||
|
import limbs_asm_modular_dbl_width_x86
|
||||||
|
|
||||||
|
type
  FpDbl*[C: static Curve] = object
    ## Double-width counterpart of an Fp element for curve `C`.
    ## Working at double width allows saving on reductions.
    # Storage is directly double the number of limbs.
    limbs2x*: matchingLimbs2x(C)
|
||||||
|
|
||||||
|
template doubleWidth*(T: typedesc[Fp]): typedesc =
  ## Maps an ``Fp`` type to the ``FpDbl`` type
  ## instantiated with the same curve parameter.
  FpDbl[T.C]
|
||||||
|
|
||||||
|
func mulNoReduce*(r: var FpDbl, a, b: Fp) {.inline.} =
  ## r <- a * b
  ## The limbs of ``a`` and ``b`` are multiplied with ``prod`` and the
  ## product is written to the double-width limbs of ``r``;
  ## no reduction step is performed here.
  prod(r.limbs2x, a.mres.limbs, b.mres.limbs)
|
||||||
|
|
||||||
|
func reduce*(r: var Fp, a: FpDbl) {.inline.} =
  ## Reduce the double-width field element ``a`` into ``r``.
  ##
  ## This forwards to ``montyRed`` with:
  ## - the limbs of ``r`` as destination
  ## - the double-width limbs of ``a`` as input
  ## - the limbs of the curve modulus
  ## - the curve's ``getNegInvModWord()`` constant
  ## - the curve's ``canUseNoCarryMontyMul()`` flag
  montyRed(
    r.mres.limbs,
    a.limbs2x,
    Fp.C.Mod.limbs,
    Fp.C.getNegInvModWord(),
    Fp.C.canUseNoCarryMontyMul()
  )
|
||||||
|
|
||||||
|
func diffNoInline(r: var FpDbl, a, b: FpDbl): Borrow =
  ## r <- a - b over all the double-width limbs.
  ## Returns the borrow of the limbs subtraction.
  result = diff(r.limbs2x, a.limbs2x, b.limbs2x)
|
||||||
|
|
||||||
|
func diffNoReduce*(r: var FpDbl, a, b: FpDbl) =
  ## Double-width subtraction without reduction: r <- a - b
  ## The final borrow is discarded.
  discard r.diffNoInline(a, b)
|
||||||
|
|
||||||
|
func diff*(r: var FpDbl, a, b: FpDbl) =
  ## Double-width modular subtraction: r <- a - b
  ## If the raw subtraction borrows, the modulus is added
  ## to the upper half of the limbs of ``r``.
  when false: # TODO slower
    r = a
    sub2x_asm(r.limbs2x, b.limbs2x, FpDbl.C.Mod.limbs)
  else:
    var underflowed = SecretBool diffNoInline(r, a, b)

    const
      N = r.limbs2x.len div 2
      M = FpDbl.C.Mod
    var
      carry = Carry(0)
      corrected: SecretWord
    # The sum "upper limb + modulus limb" is always computed,
    # it is only copied back when the subtraction underflowed.
    for k in N ..< 2*N:
      addC(carry, corrected, r.limbs2x[k], M.limbs[k-N], carry)
      underflowed.ccopy(r.limbs2x[k], corrected)
|
||||||
|
|
||||||
|
func `-=`*(a: var FpDbl, b: FpDbl) =
  ## In-place double-width modular subtraction: a <- a - b
  when false: # TODO slower
    sub2x_asm(a.limbs2x, b.limbs2x, FpDbl.C.Mod.limbs)
  else:
    diff(a, a, b)
|
||||||
@ -8,8 +8,13 @@
|
|||||||
|
|
||||||
import
|
import
|
||||||
../config/common,
|
../config/common,
|
||||||
../primitives,
|
../primitives
|
||||||
../../helpers/static_for
|
|
||||||
|
when UseASM_X86_32:
|
||||||
|
import ./limbs_asm_x86
|
||||||
|
when UseASM_X86_64:
|
||||||
|
import ./limbs_asm_mul_x86
|
||||||
|
import ./limbs_asm_mul_x86_adx_bmi2
|
||||||
|
|
||||||
# ############################################################
|
# ############################################################
|
||||||
#
|
#
|
||||||
@ -37,39 +42,9 @@ import
|
|||||||
# The limb-endianness is little-endian, the least significant limb is at index 0.
|
# The limb-endianness is little-endian, the least significant limb is at index 0.
|
||||||
# The word-endianness is native-endian.
|
# The word-endianness is native-endian.
|
||||||
|
|
||||||
type Limbs*[N: static int] = array[N, SecretWord]
|
|
||||||
## Limbs-type
|
|
||||||
## Should be distinct type to avoid builtins to use non-constant time
|
|
||||||
## implementation, for example for comparison.
|
|
||||||
##
|
|
||||||
## but for unknown reason, it prevents semchecking `bits`
|
|
||||||
|
|
||||||
# No exceptions allowed
|
# No exceptions allowed
|
||||||
{.push raises: [].}
|
{.push raises: [].}
|
||||||
|
|
||||||
# ############################################################
|
|
||||||
#
|
|
||||||
# Accessors
|
|
||||||
#
|
|
||||||
# ############################################################
|
|
||||||
#
|
|
||||||
# Commented out since we don't use a distinct type
|
|
||||||
|
|
||||||
# template `[]`[N](v: Limbs[N], idx: int): SecretWord =
|
|
||||||
# (array[N, SecretWord])(v)[idx]
|
|
||||||
#
|
|
||||||
# template `[]`[N](v: var Limbs[N], idx: int): var SecretWord =
|
|
||||||
# (array[N, SecretWord])(v)[idx]
|
|
||||||
#
|
|
||||||
# template `[]=`[N](v: Limbs[N], idx: int, val: SecretWord) =
|
|
||||||
# (array[N, SecretWord])(v)[idx] = val
|
|
||||||
|
|
||||||
# ############################################################
|
|
||||||
#
|
|
||||||
# Checks and debug/test only primitives
|
|
||||||
#
|
|
||||||
# ############################################################
|
|
||||||
|
|
||||||
# ############################################################
|
# ############################################################
|
||||||
#
|
#
|
||||||
# Limbs Primitives
|
# Limbs Primitives
|
||||||
@ -104,8 +79,11 @@ func ccopy*(a: var Limbs, b: Limbs, ctl: SecretBool) =
|
|||||||
## If ctl is true: b is copied into a
|
## If ctl is true: b is copied into a
|
||||||
## if ctl is false: b is not copied and a is untouched
|
## if ctl is false: b is not copied and a is untouched
|
||||||
## Time and memory accesses are the same whether a copy occurs or not
|
## Time and memory accesses are the same whether a copy occurs or not
|
||||||
for i in 0 ..< a.len:
|
when UseASM_X86_32:
|
||||||
ctl.ccopy(a[i], b[i])
|
ccopy_asm(a, b, ctl)
|
||||||
|
else:
|
||||||
|
for i in 0 ..< a.len:
|
||||||
|
ctl.ccopy(a[i], b[i])
|
||||||
|
|
||||||
func cswap*(a, b: var Limbs, ctl: CTBool) =
|
func cswap*(a, b: var Limbs, ctl: CTBool) =
|
||||||
## Swap ``a`` and ``b`` if ``ctl`` is true
|
## Swap ``a`` and ``b`` if ``ctl`` is true
|
||||||
@ -190,9 +168,12 @@ func shiftRight*(a: var Limbs, k: int) {.inline.}=
|
|||||||
func add*(a: var Limbs, b: Limbs): Carry =
|
func add*(a: var Limbs, b: Limbs): Carry =
|
||||||
## Limbs addition
|
## Limbs addition
|
||||||
## Returns the carry
|
## Returns the carry
|
||||||
result = Carry(0)
|
when UseASM_X86_32:
|
||||||
for i in 0 ..< a.len:
|
result = add_asm(a, a, b)
|
||||||
addC(result, a[i], a[i], b[i], result)
|
else:
|
||||||
|
result = Carry(0)
|
||||||
|
for i in 0 ..< a.len:
|
||||||
|
addC(result, a[i], a[i], b[i], result)
|
||||||
|
|
||||||
func add*(a: var Limbs, w: SecretWord): Carry =
|
func add*(a: var Limbs, w: SecretWord): Carry =
|
||||||
## Limbs addition, add a number that fits in a word
|
## Limbs addition, add a number that fits in a word
|
||||||
@ -222,16 +203,22 @@ func sum*(r: var Limbs, a, b: Limbs): Carry =
|
|||||||
## `r` is initialized/overwritten
|
## `r` is initialized/overwritten
|
||||||
##
|
##
|
||||||
## Returns the carry
|
## Returns the carry
|
||||||
result = Carry(0)
|
when UseASM_X86_32:
|
||||||
for i in 0 ..< a.len:
|
result = add_asm(r, a, b)
|
||||||
addC(result, r[i], a[i], b[i], result)
|
else:
|
||||||
|
result = Carry(0)
|
||||||
|
for i in 0 ..< a.len:
|
||||||
|
addC(result, r[i], a[i], b[i], result)
|
||||||
|
|
||||||
func sub*(a: var Limbs, b: Limbs): Borrow =
|
func sub*(a: var Limbs, b: Limbs): Borrow =
|
||||||
## Limbs substraction
|
## Limbs substraction
|
||||||
## Returns the borrow
|
## Returns the borrow
|
||||||
result = Borrow(0)
|
when UseASM_X86_32:
|
||||||
for i in 0 ..< a.len:
|
result = sub_asm(a, a, b)
|
||||||
subB(result, a[i], a[i], b[i], result)
|
else:
|
||||||
|
result = Borrow(0)
|
||||||
|
for i in 0 ..< a.len:
|
||||||
|
subB(result, a[i], a[i], b[i], result)
|
||||||
|
|
||||||
func sub*(a: var Limbs, w: SecretWord): Borrow =
|
func sub*(a: var Limbs, w: SecretWord): Borrow =
|
||||||
## Limbs substraction, sub a number that fits in a word
|
## Limbs substraction, sub a number that fits in a word
|
||||||
@ -272,9 +259,12 @@ func diff*(r: var Limbs, a, b: Limbs): Borrow =
|
|||||||
## `r` is initialized/overwritten
|
## `r` is initialized/overwritten
|
||||||
##
|
##
|
||||||
## Returns the borrow
|
## Returns the borrow
|
||||||
result = Borrow(0)
|
when UseASM_X86_32:
|
||||||
for i in 0 ..< a.len:
|
result = sub_asm(r, a, b)
|
||||||
subB(result, r[i], a[i], b[i], result)
|
else:
|
||||||
|
result = Borrow(0)
|
||||||
|
for i in 0 ..< a.len:
|
||||||
|
subB(result, r[i], a[i], b[i], result)
|
||||||
|
|
||||||
func cneg*(a: var Limbs, ctl: CTBool) =
|
func cneg*(a: var Limbs, ctl: CTBool) =
|
||||||
## Conditional negation.
|
## Conditional negation.
|
||||||
@ -301,7 +291,7 @@ func cneg*(a: var Limbs, ctl: CTBool) =
|
|||||||
# Multiplication
|
# Multiplication
|
||||||
# ------------------------------------------------------------
|
# ------------------------------------------------------------
|
||||||
|
|
||||||
func prod*[rLen, aLen, bLen](r: var Limbs[rLen], a: Limbs[aLen], b: Limbs[bLen]) =
|
func prod*[rLen, aLen, bLen: static int](r: var Limbs[rLen], a: Limbs[aLen], b: Limbs[bLen]) =
|
||||||
## Multi-precision multiplication
|
## Multi-precision multiplication
|
||||||
## r <- a*b
|
## r <- a*b
|
||||||
##
|
##
|
||||||
@ -309,23 +299,34 @@ func prod*[rLen, aLen, bLen](r: var Limbs[rLen], a: Limbs[aLen], b: Limbs[bLen])
|
|||||||
## if `r`.limbs.len < a.limbs.len + b.limbs.len
|
## if `r`.limbs.len < a.limbs.len + b.limbs.len
|
||||||
## The result will be truncated, i.e. it will be
|
## The result will be truncated, i.e. it will be
|
||||||
## a * b (mod (2^WordBitwidth)^r.limbs.len)
|
## a * b (mod (2^WordBitwidth)^r.limbs.len)
|
||||||
|
##
|
||||||
|
## `r` must not alias ``a`` or ``b``
|
||||||
|
|
||||||
# We use Product Scanning / Comba multiplication
|
when UseASM_X86_64 and aLen <= 6:
|
||||||
var t, u, v = SecretWord(0)
|
if ({.noSideEffect.}: hasBmi2()) and ({.noSideEffect.}: hasAdx()):
|
||||||
var z: Limbs[rLen] # zero-init, ensure on stack and removes in-place problems
|
mul_asm_adx_bmi2(r, a, b)
|
||||||
|
else:
|
||||||
|
mul_asm(r, a, b)
|
||||||
|
elif UseASM_X86_64:
|
||||||
|
mul_asm(r, a, b)
|
||||||
|
else:
|
||||||
|
# We use Product Scanning / Comba multiplication
|
||||||
|
var t, u, v = SecretWord(0)
|
||||||
|
|
||||||
staticFor i, 0, min(a.len+b.len, r.len):
|
staticFor i, 0, min(a.len+b.len, r.len):
|
||||||
const ib = min(b.len-1, i)
|
const ib = min(b.len-1, i)
|
||||||
const ia = i - ib
|
const ia = i - ib
|
||||||
staticFor j, 0, min(a.len - ia, ib+1):
|
staticFor j, 0, min(a.len - ia, ib+1):
|
||||||
mulAcc(t, u, v, a[ia+j], b[ib-j])
|
mulAcc(t, u, v, a[ia+j], b[ib-j])
|
||||||
|
|
||||||
z[i] = v
|
r[i] = v
|
||||||
v = u
|
v = u
|
||||||
u = t
|
u = t
|
||||||
t = SecretWord(0)
|
t = SecretWord(0)
|
||||||
|
|
||||||
r = z
|
if aLen+bLen < rLen:
|
||||||
|
for i in aLen+bLen ..< rLen:
|
||||||
|
r[i] = SecretWord 0
|
||||||
|
|
||||||
func prod_high_words*[rLen, aLen, bLen](
|
func prod_high_words*[rLen, aLen, bLen](
|
||||||
r: var Limbs[rLen],
|
r: var Limbs[rLen],
|
||||||
84
constantine/arithmetic/limbs_asm_modular_dbl_width_x86.nim
Normal file
84
constantine/arithmetic/limbs_asm_modular_dbl_width_x86.nim
Normal file
@ -0,0 +1,84 @@
|
|||||||
|
# Constantine
|
||||||
|
# Copyright (c) 2018-2019 Status Research & Development GmbH
|
||||||
|
# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
|
||||||
|
# Licensed and distributed under either of
|
||||||
|
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
|
||||||
|
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
|
||||||
|
# at your option. This file may not be copied, modified, or distributed except according to those terms.
|
||||||
|
|
||||||
|
import
|
||||||
|
# Standard library
|
||||||
|
std/macros,
|
||||||
|
# Internal
|
||||||
|
../config/common,
|
||||||
|
../primitives
|
||||||
|
|
||||||
|
# ############################################################
|
||||||
|
#
|
||||||
|
# Assembly implementation of FpDbl
|
||||||
|
#
|
||||||
|
# ############################################################
|
||||||
|
|
||||||
|
# Compile-time guard: compilation fails unless UseASM_X86_64 is true.
static: doAssert UseASM_X86_64
|
||||||
|
{.localPassC:"-fomit-frame-pointer".} # Needed so that the compiler finds enough registers
|
||||||
|
|
||||||
|
# TODO slower than intrinsics
|
||||||
|
|
||||||
|
# Subtraction
|
||||||
|
# ------------------------------------------------------------
|
||||||
|
|
||||||
|
macro sub2x_gen[N: static int](a: var Limbs[N], b: Limbs[N], M: Limbs[N div 2]): untyped =
  ## Generate an optimized in-place double-width subtraction kernel.
  ##
  ## The emitted assembly subtracts the ``N`` limbs of ``b`` from ``a``
  ## (results are stored back into ``a``), then adds ``M``, masked by the
  ## final borrow, to the upper ``N div 2`` limbs of ``a``.

  result = newStmtList()

  var ctx = init(Assembler_x86, BaseType)
  let
    half = N div 2

    dst = init(OperandArray, nimSymbol = a, N, PointerInReg, InputOutput)
    # The register used for `b` is reused afterwards for the borrow mask
    src = init(OperandArray, nimSymbol = b, N, PointerInReg, InputOutput)
    # We could force M as immediate by specializing per moduli
    modulus = init(OperandArray, nimSymbol = M, N, PointerInReg, Input)
    # If N is too big, we need to spill registers. TODO.
    work = init(OperandArray, nimSymbol = ident"t", half, ElemsInReg, Output_EarlyClobber)
    addend = init(OperandArray, nimSymbol = ident"tadd", half, ElemsInReg, Output_EarlyClobber)

  # Subtraction, limb by limb.
  # Only `half` working registers exist, so they are cycled through twice;
  # after the loop they hold the upper half of the difference.
  for i in 0 ..< N:
    let slot = work[i mod half]
    ctx.mov slot, dst[i]
    if i > 0:
      ctx.sbb slot, src[i]
    else:
      ctx.sub slot, src[i]
    ctx.mov dst[i], slot
    # Interleaved copy of the modulus to hide SBB latencies
    if i < half:
      ctx.mov addend[i], modulus[i]

  # Mask: all bits set if the subtraction borrowed, zero otherwise
  let borrowMask = src.reuseRegister()
  ctx.sbb borrowMask, borrowMask

  # Turn the copied modulus limbs into either 0 or the modulus
  for i in 0 ..< half:
    ctx.`and` addend[i], borrowMask

  # Add the masked modulus to the upper half and store it
  for i in 0 ..< half:
    if i > 0:
      ctx.adc work[i], addend[i]
    else:
      ctx.add work[i], addend[i]
    ctx.mov dst[i+half], work[i]

  let
    tSym = work.nimSymbol
    taddSym = addend.nimSymbol
  result.add quote do:
    var `tSym`{.noinit.}, `taddSym` {.noInit.}: typeof(`a`)
  result.add ctx.generate
|
||||||
|
|
||||||
|
func sub2x_asm*[N: static int](a: var Limbs[N], b: Limbs[N], M: Limbs[N div 2]) =
  ## Constant-time double-width subtraction.
  ## The body is the kernel expanded by the ``sub2x_gen`` macro.
  sub2x_gen(a, b, M)
|
||||||
@ -12,7 +12,7 @@ import
|
|||||||
# Internal
|
# Internal
|
||||||
../config/common,
|
../config/common,
|
||||||
../primitives,
|
../primitives,
|
||||||
./limbs_generic
|
./limbs
|
||||||
|
|
||||||
# ############################################################
|
# ############################################################
|
||||||
#
|
#
|
||||||
@ -25,7 +25,7 @@ import
|
|||||||
# They are nice to let the compiler deal with mov
|
# They are nice to let the compiler deal with mov
|
||||||
# but too constraining so we move things ourselves.
|
# but too constraining so we move things ourselves.
|
||||||
|
|
||||||
static: doAssert UseX86ASM
|
static: doAssert UseASM_X86_64
|
||||||
|
|
||||||
{.localPassC:"-fomit-frame-pointer".} # Needed so that the compiler finds enough registers
|
{.localPassC:"-fomit-frame-pointer".} # Needed so that the compiler finds enough registers
|
||||||
|
|
||||||
@ -64,6 +64,7 @@ macro addmod_gen[N: static int](a: var Limbs[N], b, M: Limbs[N]): untyped =
|
|||||||
ctx.mov arrTsub[i], arrT[i]
|
ctx.mov arrTsub[i], arrT[i]
|
||||||
|
|
||||||
# Mask: overflowed contains 0xFFFF or 0x0000
|
# Mask: overflowed contains 0xFFFF or 0x0000
|
||||||
|
# TODO: unnecessary if MSB never set, i.e. "canUseNoCarryMontyMul"
|
||||||
let overflowed = arrB.reuseRegister()
|
let overflowed = arrB.reuseRegister()
|
||||||
ctx.sbb overflowed, overflowed
|
ctx.sbb overflowed, overflowed
|
||||||
|
|
||||||
@ -118,7 +119,7 @@ macro submod_gen[N: static int](a: var Limbs[N], b, M: Limbs[N]): untyped =
|
|||||||
arrT = init(OperandArray, nimSymbol = ident"t", N, ElemsInReg, Output_EarlyClobber)
|
arrT = init(OperandArray, nimSymbol = ident"t", N, ElemsInReg, Output_EarlyClobber)
|
||||||
arrTadd = init(OperandArray, nimSymbol = ident"tadd", N, ElemsInReg, Output_EarlyClobber)
|
arrTadd = init(OperandArray, nimSymbol = ident"tadd", N, ElemsInReg, Output_EarlyClobber)
|
||||||
|
|
||||||
# Addition
|
# Substraction
|
||||||
for i in 0 ..< N:
|
for i in 0 ..< N:
|
||||||
ctx.mov arrT[i], arrA[i]
|
ctx.mov arrT[i], arrA[i]
|
||||||
if i == 0:
|
if i == 0:
|
||||||
|
|||||||
@ -12,7 +12,8 @@ import
|
|||||||
# Internal
|
# Internal
|
||||||
../config/common,
|
../config/common,
|
||||||
../primitives,
|
../primitives,
|
||||||
./limbs_generic
|
./limbs,
|
||||||
|
./limbs_asm_montred_x86
|
||||||
|
|
||||||
# ############################################################
|
# ############################################################
|
||||||
#
|
#
|
||||||
@ -25,7 +26,7 @@ import
|
|||||||
# They are nice to let the compiler deal with mov
|
# They are nice to let the compiler deal with mov
|
||||||
# but too constraining so we move things ourselves.
|
# but too constraining so we move things ourselves.
|
||||||
|
|
||||||
static: doAssert UseX86ASM
|
static: doAssert UseASM_X86_64
|
||||||
|
|
||||||
# Necessary for the compiler to find enough registers (enabled at -O1)
|
# Necessary for the compiler to find enough registers (enabled at -O1)
|
||||||
{.localPassC:"-fomit-frame-pointer".}
|
{.localPassC:"-fomit-frame-pointer".}
|
||||||
@ -33,28 +34,6 @@ static: doAssert UseX86ASM
|
|||||||
# Montgomery multiplication
|
# Montgomery multiplication
|
||||||
# ------------------------------------------------------------
|
# ------------------------------------------------------------
|
||||||
# Fallback when no ADX and BMI2 support (MULX, ADCX, ADOX)
|
# Fallback when no ADX and BMI2 support (MULX, ADCX, ADOX)
|
||||||
|
|
||||||
proc finalSub*(
|
|
||||||
ctx: var Assembler_x86,
|
|
||||||
r: Operand or OperandArray,
|
|
||||||
t, M, scratch: OperandArray
|
|
||||||
) =
|
|
||||||
## Reduce `t` into `r` modulo `M`
|
|
||||||
let N = M.len
|
|
||||||
ctx.comment "Final substraction"
|
|
||||||
for i in 0 ..< N:
|
|
||||||
ctx.mov scratch[i], t[i]
|
|
||||||
if i == 0:
|
|
||||||
ctx.sub scratch[i], M[i]
|
|
||||||
else:
|
|
||||||
ctx.sbb scratch[i], M[i]
|
|
||||||
|
|
||||||
# If we borrowed it means that we were smaller than
|
|
||||||
# the modulus and we don't need "scratch"
|
|
||||||
for i in 0 ..< N:
|
|
||||||
ctx.cmovnc t[i], scratch[i]
|
|
||||||
ctx.mov r[i], t[i]
|
|
||||||
|
|
||||||
macro montMul_CIOS_nocarry_gen[N: static int](r_MM: var Limbs[N], a_MM, b_MM, M_MM: Limbs[N], m0ninv_MM: BaseType): untyped =
|
macro montMul_CIOS_nocarry_gen[N: static int](r_MM: var Limbs[N], a_MM, b_MM, M_MM: Limbs[N], m0ninv_MM: BaseType): untyped =
|
||||||
## Generate an optimized Montgomery Multiplication kernel
|
## Generate an optimized Montgomery Multiplication kernel
|
||||||
## using the CIOS method
|
## using the CIOS method
|
||||||
@ -211,7 +190,7 @@ macro montMul_CIOS_nocarry_gen[N: static int](r_MM: var Limbs[N], a_MM, b_MM, M_
|
|||||||
ctx.mov rRDX, r
|
ctx.mov rRDX, r
|
||||||
let r2 = rRDX.asArrayAddr(len = N)
|
let r2 = rRDX.asArrayAddr(len = N)
|
||||||
|
|
||||||
ctx.finalSub(
|
ctx.finalSubNoCarry(
|
||||||
r2, t, M,
|
r2, t, M,
|
||||||
scratch
|
scratch
|
||||||
)
|
)
|
||||||
|
|||||||
@ -12,8 +12,8 @@ import
|
|||||||
# Internal
|
# Internal
|
||||||
../config/common,
|
../config/common,
|
||||||
../primitives,
|
../primitives,
|
||||||
./limbs_generic,
|
./limbs,
|
||||||
./limbs_asm_montmul_x86
|
./limbs_asm_montred_x86
|
||||||
|
|
||||||
# ############################################################
|
# ############################################################
|
||||||
#
|
#
|
||||||
@ -26,7 +26,7 @@ import
|
|||||||
# They are nice to let the compiler deal with mov
|
# They are nice to let the compiler deal with mov
|
||||||
# but too constraining so we move things ourselves.
|
# but too constraining so we move things ourselves.
|
||||||
|
|
||||||
static: doAssert UseX86ASM
|
static: doAssert UseASM_X86_64
|
||||||
|
|
||||||
# MULX/ADCX/ADOX
|
# MULX/ADCX/ADOX
|
||||||
{.localPassC:"-madx -mbmi2".}
|
{.localPassC:"-madx -mbmi2".}
|
||||||
@ -37,53 +37,58 @@ static: doAssert UseX86ASM
|
|||||||
# ------------------------------------------------------------
|
# ------------------------------------------------------------
|
||||||
proc mulx_by_word(
|
proc mulx_by_word(
|
||||||
ctx: var Assembler_x86,
|
ctx: var Assembler_x86,
|
||||||
C: Operand,
|
hi: Operand,
|
||||||
t: OperandArray,
|
t: OperandArray,
|
||||||
a: Operand, # Pointer in scratchspace
|
a: Operand, # Pointer in scratchspace
|
||||||
word: Operand,
|
word0: Operand,
|
||||||
S, rRDX: Operand
|
lo, rRDX: Operand
|
||||||
) =
|
) =
|
||||||
## Multiply the `a[0..<N]` by `word` and store in `t[0..<N]`
|
## Multiply the `a[0..<N]` by `word` and store in `t[0..<N]`
|
||||||
## and carry register `C` (t[N])
|
## and carry register `C` (t[N])
|
||||||
## `t` and `C` overwritten
|
## `t` and `C` overwritten
|
||||||
## `S` is a scratchspace carry register
|
## `S` is a scratchspace carry register
|
||||||
## `rRDX` is the RDX register descriptor
|
## `rRDX` is the RDX register descriptor
|
||||||
let N = t.len
|
let N = min(a.len, t.len)
|
||||||
|
|
||||||
doAssert N >= 2, "The Assembly-optimized montgomery multiplication requires at least 2 limbs."
|
|
||||||
ctx.comment " Outer loop i = 0"
|
ctx.comment " Outer loop i = 0"
|
||||||
ctx.`xor` rRDX, rRDX # Clear flags - TODO: necessary?
|
|
||||||
ctx.mov rRDX, word
|
|
||||||
|
|
||||||
# for j=0 to N-1
|
# for j=0 to N-1
|
||||||
# (C,t[j]) := t[j] + a[j]*b[i] + C
|
# (C,t[j]) := t[j] + a[j]*b[i] + C
|
||||||
|
|
||||||
# First limb
|
# First limb
|
||||||
ctx.mulx t[1], t[0], a[0], rdx
|
ctx.mov rRDX, word0
|
||||||
|
if N > 1:
|
||||||
|
ctx.mulx t[1], t[0], a[0], rdx
|
||||||
|
ctx.`xor` hi, hi # Clear flags - TODO: necessary?
|
||||||
|
else:
|
||||||
|
ctx.mulx hi, t[0], a[0], rdx
|
||||||
|
return
|
||||||
|
|
||||||
# Steady state
|
# Steady state
|
||||||
for j in 1 ..< N-1:
|
for j in 1 ..< N-1:
|
||||||
ctx.mulx t[j+1], S, a[j], rdx
|
ctx.mulx t[j+1], lo, a[j], rdx
|
||||||
ctx.adox t[j], S # TODO, we probably can use ADC here
|
if j == 1:
|
||||||
|
ctx.add t[j], lo
|
||||||
|
else:
|
||||||
|
ctx.adc t[j], lo
|
||||||
|
|
||||||
# Last limb
|
# Last limb
|
||||||
ctx.mulx C, S, a[N-1], rdx
|
ctx.comment " Outer loop i = 0, last limb"
|
||||||
ctx.adox t[N-1], S
|
ctx.mulx hi, lo, a[N-1], rdx
|
||||||
|
ctx.adc t[N-1], lo
|
||||||
|
|
||||||
# Final carries
|
# Final carries
|
||||||
ctx.comment " Mul carries i = 0"
|
ctx.comment " Accumulate last carries in hi word"
|
||||||
ctx.mov rRDX, 0 # Set to 0 without clearing flags
|
ctx.adc hi, 0
|
||||||
ctx.adcx C, rRDX
|
|
||||||
ctx.adox C, rRDX
|
|
||||||
|
|
||||||
proc mulaccx_by_word(
|
proc mulaccx_by_word(
|
||||||
ctx: var Assembler_x86,
|
ctx: var Assembler_x86,
|
||||||
C: Operand,
|
hi: Operand,
|
||||||
t: OperandArray,
|
t: OperandArray,
|
||||||
a: Operand, # Pointer in scratchspace
|
a: Operand, # Pointer in scratchspace
|
||||||
i: int,
|
i: int,
|
||||||
word: Operand,
|
word: Operand,
|
||||||
S, rRDX: Operand
|
lo, rRDX: Operand
|
||||||
) =
|
) =
|
||||||
## Multiply the `a[0..<N]` by `word`
|
## Multiply the `a[0..<N]` by `word`
|
||||||
## and accumulate in `t[0..<N]`
|
## and accumulate in `t[0..<N]`
|
||||||
@ -91,33 +96,33 @@ proc mulaccx_by_word(
|
|||||||
## `t` and `C` are multiply-accumulated
|
## `t` and `C` are multiply-accumulated
|
||||||
## `S` is a scratchspace register
|
## `S` is a scratchspace register
|
||||||
## `rRDX` is the RDX register descriptor
|
## `rRDX` is the RDX register descriptor
|
||||||
let N = t.len
|
let N = min(a.len, t.len)
|
||||||
|
|
||||||
doAssert N >= 2, "The Assembly-optimized montgomery multiplication requires at least 2 limbs."
|
|
||||||
doAssert i != 0
|
doAssert i != 0
|
||||||
|
|
||||||
ctx.comment " Outer loop i = " & $i
|
ctx.comment " Outer loop i = " & $i & ", j in [0, " & $N & ")"
|
||||||
ctx.`xor` rRDX, rRDX # Clear flags - TODO: necessary?
|
|
||||||
ctx.mov rRDX, word
|
ctx.mov rRDX, word
|
||||||
|
ctx.`xor` hi, hi # Clear flags - TODO: necessary?
|
||||||
|
|
||||||
# for j=0 to N-1
|
# for j=0 to N-1
|
||||||
# (C,t[j]) := t[j] + a[j]*b[i] + C
|
# (C,t[j]) := t[j] + a[j]*b[i] + C
|
||||||
|
|
||||||
# Steady state
|
# Steady state
|
||||||
for j in 0 ..< N-1:
|
for j in 0 ..< N-1:
|
||||||
ctx.mulx C, S, a[j], rdx
|
ctx.mulx hi, lo, a[j], rdx
|
||||||
ctx.adox t[j], S
|
ctx.adox t[j], lo
|
||||||
ctx.adcx t[j+1], C
|
ctx.adcx t[j+1], hi
|
||||||
|
|
||||||
# Last limb
|
# Last limb
|
||||||
ctx.mulx C, S, a[N-1], rdx
|
ctx.comment " Outer loop i = " & $i & ", last limb"
|
||||||
ctx.adox t[N-1], S
|
ctx.mulx hi, lo, a[N-1], rdx
|
||||||
|
ctx.adox t[N-1], lo
|
||||||
|
|
||||||
# Final carries
|
# Final carries
|
||||||
ctx.comment " Mul carries i = " & $i
|
ctx.comment " Accumulate last carries in hi word"
|
||||||
ctx.mov rRDX, 0 # Set to 0 without clearing flags
|
ctx.mov rRDX, 0 # Set to 0 without clearing flags
|
||||||
ctx.adcx C, rRDX
|
ctx.adcx hi, rRDX
|
||||||
ctx.adox C, rRDX
|
ctx.adox hi, rRDX
|
||||||
|
|
||||||
proc partialRedx(
|
proc partialRedx(
|
||||||
ctx: var Assembler_x86,
|
ctx: var Assembler_x86,
|
||||||
@ -163,7 +168,7 @@ proc partialRedx(
|
|||||||
ctx.adox t[j-1], S
|
ctx.adox t[j-1], S
|
||||||
|
|
||||||
# Last carries
|
# Last carries
|
||||||
# t[N-1} = S + C
|
# t[N-1] = S + C
|
||||||
ctx.comment " Reduction carry "
|
ctx.comment " Reduction carry "
|
||||||
ctx.mov S, 0
|
ctx.mov S, 0
|
||||||
ctx.adcx t[N-1], S
|
ctx.adcx t[N-1], S
|
||||||
@ -182,13 +187,13 @@ macro montMul_CIOS_nocarry_adx_bmi2_gen[N: static int](r_MM: var Limbs[N], a_MM,
|
|||||||
let
|
let
|
||||||
scratchSlots = max(N, 6)
|
scratchSlots = max(N, 6)
|
||||||
|
|
||||||
r = init(OperandArray, nimSymbol = r_MM, N, PointerInReg, InputOutput)
|
r = init(OperandArray, nimSymbol = r_MM, N, PointerInReg, InputOutput_EnsureClobber)
|
||||||
# We could force M as immediate by specializing per moduli
|
# We could force M as immediate by specializing per moduli
|
||||||
M = init(OperandArray, nimSymbol = M_MM, N, PointerInReg, Input)
|
M = init(OperandArray, nimSymbol = M_MM, N, PointerInReg, Input)
|
||||||
# If N is too big, we need to spill registers. TODO.
|
# If N is too big, we need to spill registers. TODO.
|
||||||
t = init(OperandArray, nimSymbol = ident"t", N, ElemsInReg, Output_EarlyClobber)
|
t = init(OperandArray, nimSymbol = ident"t", N, ElemsInReg, Output_EarlyClobber)
|
||||||
# MultiPurpose Register slots
|
# MultiPurpose Register slots
|
||||||
scratch = init(OperandArray, nimSymbol = ident"scratch", scratchSlots, ElemsInReg, InputOutput)
|
scratch = init(OperandArray, nimSymbol = ident"scratch", scratchSlots, ElemsInReg, InputOutput_EnsureClobber)
|
||||||
|
|
||||||
# MULX requires RDX
|
# MULX requires RDX
|
||||||
rRDX = Operand(
|
rRDX = Operand(
|
||||||
@ -270,7 +275,7 @@ macro montMul_CIOS_nocarry_adx_bmi2_gen[N: static int](r_MM: var Limbs[N], a_MM,
|
|||||||
lo, C, rRDX
|
lo, C, rRDX
|
||||||
)
|
)
|
||||||
|
|
||||||
ctx.finalSub(
|
ctx.finalSubNoCarry(
|
||||||
r, t, M,
|
r, t, M,
|
||||||
scratch
|
scratch
|
||||||
)
|
)
|
||||||
|
|||||||
255
constantine/arithmetic/limbs_asm_montred_x86.nim
Normal file
255
constantine/arithmetic/limbs_asm_montred_x86.nim
Normal file
@ -0,0 +1,255 @@
|
|||||||
|
# Constantine
|
||||||
|
# Copyright (c) 2018-2019 Status Research & Development GmbH
|
||||||
|
# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
|
||||||
|
# Licensed and distributed under either of
|
||||||
|
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
|
||||||
|
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
|
||||||
|
# at your option. This file may not be copied, modified, or distributed except according to those terms.
|
||||||
|
|
||||||
|
import
|
||||||
|
# Standard library
|
||||||
|
std/macros,
|
||||||
|
# Internal
|
||||||
|
../config/common,
|
||||||
|
../primitives,
|
||||||
|
./limbs
|
||||||
|
|
||||||
|
# ############################################################
|
||||||
|
#
|
||||||
|
# Assembly implementation of finite fields
|
||||||
|
#
|
||||||
|
# ############################################################
|
||||||
|
|
||||||
|
|
||||||
|
# Compile-time guard: compilation fails unless UseASM_X86_32 is true.
static: doAssert UseASM_X86_32
|
||||||
|
|
||||||
|
# Necessary for the compiler to find enough registers (enabled at -O1)
|
||||||
|
{.localPassC:"-fomit-frame-pointer".}
|
||||||
|
|
||||||
|
proc finalSubNoCarry*(
       ctx: var Assembler_x86,
       r: Operand or OperandArray,
       t, M, scratch: OperandArray
     ) =
  ## Reduce `t` into `r` modulo `M`
  ##
  ## Emits `scratch <- t - M`, then for each limb a conditional move
  ## of `scratch` into `t` when the carry flag is clear,
  ## followed by a store of `t` into `r`.
  let numLimbs = M.len
  ctx.comment "Final substraction (no carry)"
  for k in 0 ..< numLimbs:
    ctx.mov scratch[k], t[k]
    if k > 0:
      ctx.sbb scratch[k], M[k]
    else:
      ctx.sub scratch[k], M[k]

  # A borrow means that `t` was smaller than the modulus,
  # in that case "scratch" is not needed and `t` is kept as is.
  for k in 0 ..< numLimbs:
    ctx.cmovnc t[k], scratch[k]
    ctx.mov r[k], t[k]
|
||||||
|
|
||||||
|
proc finalSubCanOverflow*(
       ctx: var Assembler_x86,
       r: Operand or OperandArray,
       t, M, scratch: OperandArray,
       overflowReg: Operand
     ) =
  ## Reduce `t` into `r` modulo `M`
  ##
  ## Variant for when the final subtraction can also depend
  ## on the carry flag, which is in particular possible
  ## when the MSB of the prime modulus is set.
  ##
  ## `overflowReg` should be a register, it is used
  ## to store the carry flag.

  # Save the incoming carry flag before it is overwritten by the subtraction
  ctx.sbb overflowReg, overflowReg

  let numLimbs = M.len
  ctx.comment "Final substraction (may carry)"
  for k in 0 ..< numLimbs:
    ctx.mov scratch[k], t[k]
    if k > 0:
      ctx.sbb scratch[k], M[k]
    else:
      ctx.sub scratch[k], M[k]

  # Fold the borrow of `t - M` into the saved carry
  ctx.sbb overflowReg, 0

  # A borrow at this point means that we were smaller than
  # the modulus and "scratch" is not needed.
  for k in 0 ..< numLimbs:
    ctx.cmovnc t[k], scratch[k]
    ctx.mov r[k], t[k]
|
||||||
|
|
||||||
|
|
||||||
|
# Montgomery reduction
|
||||||
|
# ------------------------------------------------------------
|
||||||
|
|
||||||
|
macro montyRed_gen[N: static int](
       r_MR: var array[N, SecretWord],
       t_MR: array[N*2, SecretWord],
       M_MR: array[N, SecretWord],
       m0ninv_MR: BaseType,
       canUseNoCarryMontyMul: static bool
      ) =
  ## Generate inline x86 assembly (MUL/IMUL/ADD/ADC) for a Montgomery
  ## reduction of the double-width value `t_MR` modulo `M_MR`.
  ##
  ## - `r_MR`:      output, N limbs
  ## - `t_MR`:      double-width input, 2N limbs (only read)
  ## - `M_MR`:      modulus, N limbs
  ## - `m0ninv_MR`: word multiplied with t[i] to obtain the per-round factor `m`
  ## - `canUseNoCarryMontyMul`: selects `finalSubNoCarry` when true,
  ##   `finalSubCanOverflow` otherwise, for the final conditional subtraction
  ##
  ## Two assembly blocks are emitted: the reduction rounds, then the
  ## addition of the high half of `t_MR` fused with the final subtraction.
  ##
  ## Only 3 <= N <= 6 is supported (no register spilling, and the software
  ## pipelining of `m` requires at least one inner-loop iteration).
  # TODO, slower than Clang, in particular due to the shadowing

  result = newStmtList()

  var ctx = init(Assembler_x86, BaseType)
  let
    # We could force M as immediate by specializing per moduli
    M = init(OperandArray, nimSymbol = M_MR, N, PointerInReg, Input)

    # MUL requires RAX and RDX
    rRAX = Operand(
      desc: OperandDesc(
        asmId: "[rax]",
        nimSymbol: ident"rax",
        rm: RAX,
        constraint: InputOutput_EnsureClobber,
        cEmit: "rax"
      )
    )

    rRDX = Operand(
      desc: OperandDesc(
        asmId: "[rdx]",
        nimSymbol: ident"rdx",
        rm: RDX,
        constraint: Output_EarlyClobber,
        cEmit: "rdx"
      )
    )

    m0ninv = Operand(
      desc: OperandDesc(
        asmId: "[m0ninv]",
        nimSymbol: m0ninv_MR,
        rm: Reg,
        constraint: Input,
        cEmit: "m0ninv"
      )
    )

  # N limbs of working state + 2 extra slots (`hi` and `next` below)
  let scratchSlots = N+2
  var scratch = init(OperandArray, nimSymbol = ident"scratch", scratchSlots, ElemsInReg, InputOutput_EnsureClobber)

  # Prologue
  let eax = rRAX.desc.nimSymbol
  let edx = rRDX.desc.nimSymbol
  let scratchSym = scratch.nimSymbol
  result.add quote do:
    static: doAssert: sizeof(SecretWord) == sizeof(ByteAddress)

    var `eax`{.noInit.}, `edx`{.noInit.}: BaseType
    var `scratchSym` {.noInit.}: Limbs[`scratchSlots`]

  # Algorithm
  # ---------------------------------------------------------
  # for i in 0 .. n-1:
  #   hi <- 0
  #   m <- t[i] * m0ninv mod 2^w (i.e. simple multiplication)
  #   for j in 0 .. n-1:
  #     (hi, lo) <- t[i+j] + m * M[j] + hi
  #     t[i+j] <- lo
  #   t[i+n] += hi
  # for i in 0 .. n-1:
  #   r[i] = t[i+n]
  # if r >= M:
  #   r -= M

  # The next round's `m` is computed from scratch[1] ("Next load" below),
  # which is only up to date once the inner loop `for j in 1 ..< N-1`
  # has run for j = 1. With N == 2 that loop is empty and the generated
  # code would be silently wrong, hence the lower bound.
  doAssert N > 2, "The Assembly-optimized montgomery reduction requires at least 3 limbs."
  # No register spilling handling
  doAssert N <= 6, "The Assembly-optimized montgomery reduction requires at most 6 limbs."

  # rax <- t[0], scratch[1 ..< N] <- t[1 ..< N]
  result.add quote do:
    `eax` = BaseType `t_MR`[0]
    `scratchSym`[1 .. `N`-1] = `t_MR`.toOpenArray(1, `N`-1)

  ctx.mov scratch[N], rRAX # keep t[0] in the `hi` slot
  ctx.imul rRAX, m0ninv    # m <- t[i] * m0ninv mod 2^w
  ctx.mov scratch[0], rRAX # scratch[0] holds m for the whole round

  # scratch: [t[0] * m0, t[1], t[2], t[3], t[0]] for 4 limbs

  for i in 0 ..< N:
    ctx.comment ""
    let hi = scratch[N]
    let next = scratch[N+1]

    ctx.mul rdx, rax, M[0], rax
    ctx.add hi, rRAX # Low word is guaranteed to be zero, only the carry matters
    ctx.mov rRAX, scratch[0] # reload m, MUL overwrote rax
    ctx.adc hi, rRDX

    for j in 1 ..< N-1:
      ctx.comment ""
      ctx.mul rdx, rax, M[j], rax
      ctx.add scratch[j], rRAX
      ctx.mov rRAX, scratch[0] # reload m
      ctx.adc rRDX, 0
      ctx.add scratch[j], hi
      ctx.adc rRDX, 0
      ctx.mov hi, rRDX

    # Next load
    # Save the updated t[i+1] and precompute the next round's m from it
    if i < N-1:
      ctx.comment ""
      ctx.mov next, scratch[1]
      ctx.imul scratch[1], m0ninv
      ctx.comment ""

    # Last limb
    ctx.comment ""
    ctx.mul rdx, rax, M[N-1], rax
    ctx.add scratch[N-1], rRAX
    ctx.mov rRAX, scratch[1] # Contains next * m0
    ctx.adc rRDX, 0
    ctx.add scratch[N-1], hi
    ctx.adc rRDX, 0
    ctx.mov hi, rRDX

    # Shift the window by one limb: old scratch[1] becomes scratch[0], ...
    scratch.rotateLeft()

  # Code generation
  result.add ctx.generate()

  # New codegen
  ctx = init(Assembler_x86, BaseType)

  let r = init(OperandArray, nimSymbol = r_MR, N, PointerInReg, InputOutput_EnsureClobber)
  let t = init(OperandArray, nimSymbol = t_MR, N*2, PointerInReg, Input)
  # scratch[N] and scratch[N+1] are recycled, so only N-2 fresh registers are needed
  let extraRegNeeded = N-2
  let tsub = init(OperandArray, nimSymbol = ident"tsub", extraRegNeeded, ElemsInReg, InputOutput_EnsureClobber)
  let tsubsym = tsub.nimSymbol
  result.add quote do:
    var `tsubsym` {.noInit.}: Limbs[`extraRegNeeded`]

  # This does t[i+n] += hi
  # but in a separate carry chain, fused with the
  # copy "r[i] = t[i+n]"
  for i in 0 ..< N:
    if i == 0:
      ctx.add scratch[i], t[i+N]
    else:
      ctx.adc scratch[i], t[i+N]

  let reuse = repackRegisters(tsub, scratch[N], scratch[N+1])

  # NOTE(review): both helpers presumably implement "if r >= M: r -= M";
  # their definitions are outside this section.
  if canUseNoCarryMontyMul:
    ctx.finalSubNoCarry(r, scratch, M, reuse)
  else:
    ctx.finalSubCanOverflow(r, scratch, M, reuse, rRAX)

  # Code generation
  result.add ctx.generate()
|
||||||
|
|
||||||
|
func montRed_asm*[N: static int](
       r: var array[N, SecretWord],
       t: array[N*2, SecretWord],
       M: array[N, SecretWord],
       m0ninv: BaseType,
       canUseNoCarryMontyMul: static bool
     ) =
  ## Constant-time Montgomery reduction.
  ##
  ## Reduces the double-width `t` (2N limbs) modulo `M` (N limbs)
  ## and writes the N-limb result into `r`.
  ## The body is entirely generated by the `montyRed_gen` macro
  ## (x86 assembly using MUL / ADD / ADC).
  montyRed_gen(r, t, M, m0ninv, canUseNoCarryMontyMul)
|
||||||
191
constantine/arithmetic/limbs_asm_montred_x86_adx_bmi2.nim
Normal file
191
constantine/arithmetic/limbs_asm_montred_x86_adx_bmi2.nim
Normal file
@ -0,0 +1,191 @@
|
|||||||
|
# Constantine
|
||||||
|
# Copyright (c) 2018-2019 Status Research & Development GmbH
|
||||||
|
# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
|
||||||
|
# Licensed and distributed under either of
|
||||||
|
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
|
||||||
|
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
|
||||||
|
# at your option. This file may not be copied, modified, or distributed except according to those terms.
|
||||||
|
|
||||||
|
import
|
||||||
|
# Standard library
|
||||||
|
std/macros,
|
||||||
|
# Internal
|
||||||
|
../config/common,
|
||||||
|
../primitives,
|
||||||
|
./limbs,
|
||||||
|
./limbs_asm_montred_x86
|
||||||
|
|
||||||
|
# ############################################################
|
||||||
|
#
|
||||||
|
# Assembly implementation of finite fields
|
||||||
|
#
|
||||||
|
# ############################################################
|
||||||
|
|
||||||
|
# TODO, MCL has an implementation about 14% faster
|
||||||
|
|
||||||
|
static: doAssert UseASM_X86_64
|
||||||
|
|
||||||
|
# MULX/ADCX/ADOX
|
||||||
|
{.localPassC:"-madx -mbmi2".}
|
||||||
|
# Necessary for the compiler to find enough registers (enabled at -O1)
|
||||||
|
{.localPassC:"-fomit-frame-pointer".}
|
||||||
|
|
||||||
|
# Montgomery reduction
|
||||||
|
# ------------------------------------------------------------
|
||||||
|
|
||||||
|
macro montyRedx_gen[N: static int](
       r_MR: var array[N, SecretWord],
       t_MR: array[N*2, SecretWord],
       M_MR: array[N, SecretWord],
       m0ninv_MR: BaseType,
       canUseNoCarryMontyMul: static bool
      ) =
  ## Generate inline x86 assembly using MULX/ADCX/ADOX for a Montgomery
  ## reduction of the double-width value `t_MR` modulo `M_MR`.
  ##
  ## - `r_MR`:      output, N limbs
  ## - `t_MR`:      double-width input, 2N limbs (only read)
  ## - `M_MR`:      modulus, N limbs
  ## - `m0ninv_MR`: word multiplied with t[i] to obtain the per-round factor `m`
  ## - `canUseNoCarryMontyMul`: selects `finalSubNoCarry` when true,
  ##   `finalSubCanOverflow` otherwise, for the final conditional subtraction
  ##
  ## Two assembly blocks are emitted: the reduction rounds, then the
  ## addition of the high half of `t_MR` fused with the final subtraction.
  # TODO, slower than Clang, in particular due to the shadowing

  result = newStmtList()

  var ctx = init(Assembler_x86, BaseType)
  let
    # We could force M as immediate by specializing per moduli
    M = init(OperandArray, nimSymbol = M_MR, N, PointerInReg, Input)

    # High and low words produced by MULX
    hi = Operand(
      desc: OperandDesc(
        asmId: "[hi]",
        nimSymbol: ident"hi",
        rm: Reg,
        constraint: Output_EarlyClobber,
        cEmit: "hi"
      )
    )

    lo = Operand(
      desc: OperandDesc(
        asmId: "[lo]",
        nimSymbol: ident"lo",
        rm: Reg,
        constraint: Output_EarlyClobber,
        cEmit: "lo"
      )
    )

    # RDX is the implicit multiplicand of the MULX calls below
    rRDX = Operand(
      desc: OperandDesc(
        asmId: "[rdx]",
        nimSymbol: ident"rdx",
        rm: RDX,
        constraint: InputOutput_EnsureClobber,
        cEmit: "rdx"
      )
    )

    m0ninv = Operand(
      desc: OperandDesc(
        asmId: "[m0ninv]",
        nimSymbol: m0ninv_MR,
        rm: Reg,
        constraint: Input,
        cEmit: "m0ninv"
      )
    )

  # N limbs of working state + 1 extra slot for the round's carry word
  let scratchSlots = N+1
  var scratch = init(OperandArray, nimSymbol = ident"scratch", scratchSlots, ElemsInReg, InputOutput_EnsureClobber)

  # Prologue
  let edx = rRDX.desc.nimSymbol
  let hisym = hi.desc.nimSymbol
  let losym = lo.desc.nimSymbol
  let scratchSym = scratch.nimSymbol
  result.add quote do:
    static: doAssert: sizeof(SecretWord) == sizeof(ByteAddress)

    var `hisym`{.noInit.}, `losym`{.noInit.}, `edx`{.noInit.}: BaseType
    var `scratchSym` {.noInit.}: Limbs[`scratchSlots`]

  # Algorithm
  # ---------------------------------------------------------
  # for i in 0 .. n-1:
  #   hi <- 0
  #   m <- t[i] * m0ninv mod 2^w (i.e. simple multiplication)
  #   for j in 0 .. n-1:
  #     (hi, lo) <- t[i+j] + m * M[j] + hi
  #     t[i+j] <- lo
  #   t[i+n] += hi
  # for i in 0 .. n-1:
  #   r[i] = t[i+n]
  # if r >= M:
  #   r -= M

  # No register spilling handling
  doAssert N <= 6, "The Assembly-optimized montgomery reduction requires at most 6 limbs."

  # rdx <- m0ninv, scratch[0 ..< N] <- t[0 ..< N]
  result.add quote do:
    `edx` = BaseType(`m0ninv_MR`)
    `scratchSym`[0 .. `N`-1] = `t_MR`.toOpenArray(0, `N`-1)

  for i in 0 ..< N:
    # RDX contains m0ninv at the start of each loop
    ctx.comment ""
    ctx.imul rRDX, scratch[0] # m <- t[i] * m0ninv mod 2^w
    ctx.comment "---- Reduction " & $i
    # Zero the extra slot; XOR also clears the carry and overflow flags
    # that the ADCX / ADOX chains below start from.
    ctx.`xor` scratch[N], scratch[N]

    for j in 0 ..< N-1:
      ctx.comment ""
      ctx.mulx hi, lo, M[j], rdx
      ctx.adcx scratch[j], lo   # low words: carry-flag chain
      ctx.adox scratch[j+1], hi # high words: overflow-flag chain

    # Last limb
    ctx.comment ""
    ctx.mulx hi, lo, M[N-1], rdx
    ctx.mov rRDX, m0ninv # Reload m0ninv for next iter
    ctx.adcx scratch[N-1], lo
    ctx.adox hi, scratch[N] # scratch[N] is zero: only folds the overflow flag into hi
    ctx.adcx scratch[N], hi

    # Shift the window by one limb: old scratch[1] becomes scratch[0], ...
    scratch.rotateLeft()

  # Code generation
  result.add ctx.generate()

  # New codegen
  ctx = init(Assembler_x86, BaseType)

  let r = init(OperandArray, nimSymbol = r_MR, N, PointerInReg, InputOutput_EnsureClobber)
  let t = init(OperandArray, nimSymbol = t_MR, N*2, PointerInReg, Input)
  # scratch[N] is recycled, so only N-1 fresh registers are needed
  let extraRegNeeded = N-1
  let tsub = init(OperandArray, nimSymbol = ident"tsub", extraRegNeeded, ElemsInReg, InputOutput_EnsureClobber)
  let tsubsym = tsub.nimSymbol
  result.add quote do:
    var `tsubsym` {.noInit.}: Limbs[`extraRegNeeded`]

  # This does t[i+n] += hi
  # but in a separate carry chain, fused with the
  # copy "r[i] = t[i+n]"
  for i in 0 ..< N:
    if i == 0:
      ctx.add scratch[i], t[i+N]
    else:
      ctx.adc scratch[i], t[i+N]

  let reuse = repackRegisters(tsub, scratch[N])

  # NOTE(review): both helpers presumably implement "if r >= M: r -= M";
  # they come from the imported limbs_asm_montred_x86 module.
  if canUseNoCarryMontyMul:
    ctx.finalSubNoCarry(r, scratch, M, reuse)
  else:
    ctx.finalSubCanOverflow(r, scratch, M, reuse, hi)

  # Code generation
  result.add ctx.generate()
|
||||||
|
|
||||||
|
func montRed_asm_adx_bmi2*[N: static int](
       r: var array[N, SecretWord],
       t: array[N*2, SecretWord],
       M: array[N, SecretWord],
       m0ninv: BaseType,
       canUseNoCarryMontyMul: static bool
     ) =
  ## Constant-time Montgomery reduction (MULX / ADCX / ADOX variant).
  ##
  ## Reduces the double-width `t` (2N limbs) modulo `M` (N limbs)
  ## and writes the N-limb result into `r`.
  ## The body is entirely generated by the `montyRedx_gen` macro.
  montyRedx_gen(r, t, M, m0ninv, canUseNoCarryMontyMul)
|
||||||
146
constantine/arithmetic/limbs_asm_mul_x86.nim
Normal file
146
constantine/arithmetic/limbs_asm_mul_x86.nim
Normal file
@ -0,0 +1,146 @@
|
|||||||
|
# Constantine
|
||||||
|
# Copyright (c) 2018-2019 Status Research & Development GmbH
|
||||||
|
# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
|
||||||
|
# Licensed and distributed under either of
|
||||||
|
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
|
||||||
|
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
|
||||||
|
# at your option. This file may not be copied, modified, or distributed except according to those terms.
|
||||||
|
|
||||||
|
import
|
||||||
|
# Standard library
|
||||||
|
std/macros,
|
||||||
|
# Internal
|
||||||
|
../config/common,
|
||||||
|
../primitives
|
||||||
|
|
||||||
|
# ############################################################
|
||||||
|
#
|
||||||
|
# Assembly implementation of bigint multiplication
|
||||||
|
#
|
||||||
|
# ############################################################
|
||||||
|
|
||||||
|
# Note: We can refer to at most 30 registers in inline assembly
|
||||||
|
# and "InputOutput" registers count double
|
||||||
|
# They are nice to let the compiler deal with mov
|
||||||
|
# but too constraining so we move things ourselves.
|
||||||
|
|
||||||
|
# TODO: verify that assembly generated works for small arrays
|
||||||
|
# that are passed by value
|
||||||
|
|
||||||
|
static: doAssert UseASM_X86_64 # Need 8 registers just for mul
|
||||||
|
# and 32-bit only has 8 max.
|
||||||
|
|
||||||
|
macro mul_gen[rLen, aLen, bLen: static int](r: var Limbs[rLen], a: Limbs[aLen], b: Limbs[bLen]) =
  ## Comba (column-wise) multiplication generator.
  ##
  ## `a`, `b` and `r` may have different numbers of limbs.
  ## If `r` has fewer limbs than `a` and `b` combined, the product is
  ## truncated, i.e. the result is
  ##   a * b (mod (2^WordBitwidth)^r.limbs.len)
  ## If `r` has more limbs, the extra high limbs are set to zero.
  ##
  ## Assumes `r` doesn't alias `a` or `b`.

  result = newStmtList()

  var ctx = init(Assembler_x86, BaseType)

  # The three words of the column accumulator (t, u, v) all share
  # the same operand shape, only the name differs.
  template accReg(name: string): Operand =
    Operand(
      desc: OperandDesc(
        asmId: "[" & name & "]",
        nimSymbol: ident(name),
        rm: Reg,
        constraint: Output_EarlyClobber,
        cEmit: name
      )
    )

  let
    dst = init(OperandArray, nimSymbol = r, rLen, PointerInReg, InputOutput_EnsureClobber)
    lhs = init(OperandArray, nimSymbol = a, aLen, PointerInReg, Input)
    rhs = init(OperandArray, nimSymbol = b, bLen, PointerInReg, Input)

    t = accReg("t")
    u = accReg("u")
    v = accReg("v")

    # MUL requires RAX and RDX
    rRAX = Operand(
      desc: OperandDesc(
        asmId: "[rax]",
        nimSymbol: ident"rax",
        rm: RAX,
        constraint: Output_EarlyClobber,
        cEmit: "rax"
      )
    )

    rRDX = Operand(
      desc: OperandDesc(
        asmId: "[rdx]",
        nimSymbol: ident"rdx",
        rm: RDX,
        constraint: Output_EarlyClobber,
        cEmit: "rdx"
      )
    )

  # Prologue: declare the Nim variables backing the registers.
  let
    tsym = t.desc.nimSymbol
    usym = u.desc.nimSymbol
    vsym = v.desc.nimSymbol
    eax = rRAX.desc.nimSymbol
    edx = rRDX.desc.nimSymbol
  result.add quote do:
    var `tsym`{.noInit.}, `usym`{.noInit.}, `vsym`{.noInit.}: BaseType # cleared by the `xor`s opening the assembly
    var `eax`{.noInit.}, `edx`{.noInit.}: BaseType

  # Algorithm
  ctx.`xor` u, u
  ctx.`xor` v, v
  ctx.`xor` t, t

  # Number of result limbs that receive a product column
  let columns = min(aLen+bLen, rLen)

  for col in 0 ..< columns:
    let bTop = min(bLen-1, col) # highest limb of b contributing to this column
    let aLow = col - bTop       # matching lowest limb of a
    for k in 0 ..< min(aLen - aLow, bTop+1):
      # (t, u, v) <- (t, u, v) + a[aLow+k] * b[bTop-k]
      ctx.mov rRAX, rhs[bTop-k]
      ctx.mul rdx, rax, lhs[aLow+k], rax
      ctx.add v, rRAX
      ctx.adc u, rRDX
      ctx.adc t, 0

    ctx.mov dst[col], v

    # Shift the accumulator down one word, except after the last column
    if col != columns - 1:
      ctx.mov v, u
      ctx.mov u, t
      ctx.`xor` t, t

  # Zero the limbs of r beyond the full product
  if aLen+bLen < rLen:
    ctx.`xor` rRAX, rRAX
    for i in aLen+bLen ..< rLen:
      ctx.mov dst[i], rRAX

  # Codegen
  result.add ctx.generate
|
||||||
|
|
||||||
|
func mul_asm*[rLen, aLen, bLen: static int](r: var Limbs[rLen], a: Limbs[aLen], b: Limbs[bLen]) =
  ## Multi-precision multiplication: r <- a * b,
  ## truncated to `rLen` limbs when `r` is too small for the full product.
  ## The body is generated by the `mul_gen` macro.
  ##
  ## Assumes `r` doesn't alias `a` or `b`.
  mul_gen(r, a, b)
|
||||||
197
constantine/arithmetic/limbs_asm_mul_x86_adx_bmi2.nim
Normal file
197
constantine/arithmetic/limbs_asm_mul_x86_adx_bmi2.nim
Normal file
@ -0,0 +1,197 @@
|
|||||||
|
# Constantine
|
||||||
|
# Copyright (c) 2018-2019 Status Research & Development GmbH
|
||||||
|
# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
|
||||||
|
# Licensed and distributed under either of
|
||||||
|
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
|
||||||
|
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
|
||||||
|
# at your option. This file may not be copied, modified, or distributed except according to those terms.
|
||||||
|
|
||||||
|
import
|
||||||
|
# Standard library
|
||||||
|
std/macros,
|
||||||
|
# Internal
|
||||||
|
../config/common,
|
||||||
|
../primitives
|
||||||
|
|
||||||
|
# ############################################################
|
||||||
|
#
|
||||||
|
# Assembly implementation of bigint multiplication (MULX/ADCX/ADOX)
|
||||||
|
#
|
||||||
|
# ############################################################
|
||||||
|
|
||||||
|
# Note: We can refer to at most 30 registers in inline assembly
|
||||||
|
# and "InputOutput" registers count double
|
||||||
|
# They are nice to let the compiler deal with mov
|
||||||
|
# but too constraining so we move things ourselves.
|
||||||
|
|
||||||
|
static: doAssert UseASM_X86_64
|
||||||
|
|
||||||
|
# MULX/ADCX/ADOX
|
||||||
|
{.localPassC:"-madx -mbmi2".}
|
||||||
|
# Necessary for the compiler to find enough registers (enabled at -O1)
|
||||||
|
# {.localPassC:"-fomit-frame-pointer".}
|
||||||
|
|
||||||
|
# Multiplication
|
||||||
|
# ------------------------------------------------------------
|
||||||
|
proc mulx_by_word(
       ctx: var Assembler_x86,
       r0: Operand,
       a, t: OperandArray,
       word0: Operand,
       rRAX, rRDX: Operand
     ) =
  ## First row of the multiplication:
  ## multiply `a[0..<N]` by `word0` and store the result in `[t:r0]`
  ## with [t:r0] = tn-1, ... t1, t0, r0
  ## i.e. the lowest word goes to `r0` and the N upper words to `t[0..<N]`.
  ##
  ## `t` must have exactly one more slot than `a`; that last slot is not written here.
  doAssert a.len + 1 == t.len
  let numLimbs = a.len

  ctx.comment " Outer loop i = 0, j=0 to " & $numLimbs

  # for j=0 to N-1
  #  (C,t[j])  := t[j] + a[j]*b[i] + C

  # First limb
  ctx.mov rRDX, word0
  ctx.`xor` rRAX, rRAX # Clear flags (important if steady state is skipped)
  ctx.mulx t[0], rRAX, a[0], rdx
  ctx.mov r0, rRAX

  # Steady state: the low word of each product is added
  # to the high word of the previous one.
  for j in 1 ..< numLimbs:
    ctx.mulx t[j], rRAX, a[j], rdx
    if j > 1:
      ctx.adc t[j-1], rRAX
    else:
      # First addition of the chain, there is no carry to consume yet
      ctx.add t[j-1], rRAX

  # Final carries
  ctx.comment " Accumulate last carries in hi word"
  ctx.adc t[numLimbs-1], 0
|
||||||
|
|
||||||
|
proc mulaccx_by_word(
       ctx: var Assembler_x86,
       r: OperandArray,
       i: int,
       a, t: OperandArray,
       word: Operand,
       rRAX, rRDX: Operand
     ) =
  ## Row `i` (i != 0) of the multiplication:
  ## multiply `a[0..<N]` by `word` and accumulate the products into `t`,
  ## with N = min(a.len, r.len).
  ##
  ## The lowest accumulated word, `t[0]`, is final and is stored to `r[i]`.
  ## The last slot of `t` (`t[a.len]`) is used as the MULX high-word
  ## register and ends up holding the top word of the row.
  ##
  ## `t` must have exactly one more slot than `a`.
  doAssert a.len + 1 == t.len
  let rowLen = min(a.len, r.len)
  let top = t[a.len]

  doAssert i != 0

  ctx.comment " Outer loop i = " & $i & ", j in [0, " & $rowLen & ")"
  ctx.mov rRDX, word
  ctx.`xor` rRAX, rRAX # Clear flags

  # for j=0 to N-1
  #  (C,t[j])  := t[j] + a[j]*b[i] + C

  # Steady state: low words use the ADOX chain, high words the ADCX chain
  for j in 0 ..< rowLen:
    ctx.mulx top, rRAX, a[j], rdx
    ctx.adox t[j], rRAX
    if j == 0:
      ctx.mov r[i], t[j]
    if j < rowLen-1:
      # The high word of the last product stays in `top`
      ctx.adcx t[j+1], top

  # Final carries
  ctx.comment " Accumulate last carries in hi word"
  ctx.mov rRDX, 0 # Set to 0 without clearing flags
  ctx.adcx top, rRDX
  ctx.adox top, rRDX
|
||||||
|
|
||||||
|
macro mulx_gen[rLen, aLen, bLen: static int](rx: var Limbs[rLen], ax: Limbs[aLen], bx: Limbs[bLen]) =
  ## Multiplication generator using MULX / ADCX / ADOX.
  ##
  ## `a`, `b` and `r` may have different numbers of limbs.
  ## If `r` has fewer limbs than `a` and `b` combined, the product is
  ## truncated, i.e. the result is
  ##   a * b (mod (2^WordBitwidth)^r.limbs.len)
  ## If `r` has more limbs, the extra high limbs are set to zero.
  ##
  ## Assumes `r` doesn't alias `a` or `b`.

  result = newStmtList()

  var ctx = init(Assembler_x86, BaseType)
  let
    r = init(OperandArray, nimSymbol = rx, rLen, PointerInReg, InputOutput_EnsureClobber)
    a = init(OperandArray, nimSymbol = ax, aLen, PointerInReg, Input)
    b = init(OperandArray, nimSymbol = bx, bLen, PointerInReg, Input)

    # MULX requires RDX
    rRDX = Operand(
      desc: OperandDesc(
        asmId: "[rdx]",
        nimSymbol: ident"rdx",
        rm: RDX,
        constraint: Output_EarlyClobber,
        cEmit: "rdx"
      )
    )

    rRAX = Operand(
      desc: OperandDesc(
        asmId: "[rax]",
        nimSymbol: ident"rax",
        rm: RAX,
        constraint: Output_EarlyClobber,
        cEmit: "rax"
      )
    )

    tSlots = aLen+1 # Extra for high word

  # If aLen is too big, we need to spill registers. TODO.
  var t = init(OperandArray, nimSymbol = ident"t", tSlots, ElemsInReg, Output_EarlyClobber)

  # Prologue: declare the Nim variables backing the registers.
  let
    tsym = t.nimSymbol
    eax = rRAX.desc.nimSymbol
    edx = rRDX.desc.nimSymbol
  result.add quote do:
    var `tsym`{.noInit.}: array[`tSlots`, BaseType]
    var `edx`{.noInit.}, `eax`{.noInit.}: BaseType

  let rows = min(rLen, bLen)

  # Row 0 initializes the accumulator `t` and writes r[0] directly
  if rows > 0:
    ctx.mulx_by_word(
      r[0],
      a, t,
      b[0],
      rRAX, rRDX
    )

  # Following rows accumulate into `t`. Each one finalizes t[0] (stored
  # to r[i]), so the accumulator window is then shifted by one slot.
  for i in 1 ..< rows:
    ctx.mulaccx_by_word(
      r, i,
      a, t,
      b[i],
      rRAX, rRDX
    )

    t.rotateLeft()

  # Copy upper-limbs to result
  for i in b.len ..< min(a.len+b.len, rLen):
    ctx.mov r[i], t[i-b.len]

  # Zero the extra
  if aLen+bLen < rLen:
    ctx.`xor` rRAX, rRAX
    for i in aLen+bLen ..< rLen:
      ctx.mov r[i], rRAX

  # Codegen
  result.add ctx.generate
|
||||||
|
|
||||||
|
func mul_asm_adx_bmi2*[rLen, aLen, bLen: static int](r: var Limbs[rLen], a: Limbs[aLen], b: Limbs[bLen]) =
  ## Multi-precision multiplication (MULX / ADCX / ADOX variant): r <- a * b,
  ## truncated to `rLen` limbs when `r` is too small for the full product.
  ## The body is generated by the `mulx_gen` macro.
  ##
  ## Assumes `r` doesn't alias `a` or `b`.
  mulx_gen(r, a, b)
|
||||||
@ -11,8 +11,7 @@ import
|
|||||||
std/macros,
|
std/macros,
|
||||||
# Internal
|
# Internal
|
||||||
../config/common,
|
../config/common,
|
||||||
../primitives,
|
../primitives
|
||||||
./limbs_generic
|
|
||||||
|
|
||||||
# ############################################################
|
# ############################################################
|
||||||
#
|
#
|
||||||
@ -25,9 +24,10 @@ import
|
|||||||
# They are nice to let the compiler deals with mov
|
# They are nice to let the compiler deals with mov
|
||||||
# but too constraining so we move things ourselves.
|
# but too constraining so we move things ourselves.
|
||||||
|
|
||||||
static: doAssert UseX86ASM
|
# TODO: verify that assembly generated works for small arrays
|
||||||
|
# that are passed by values
|
||||||
|
|
||||||
{.localPassC:"-fomit-frame-pointer".} # Needed so that the compiler finds enough registers
|
static: doAssert UseASM_X86_32
|
||||||
|
|
||||||
# Copy
|
# Copy
|
||||||
# ------------------------------------------------------------
|
# ------------------------------------------------------------
|
||||||
@ -40,8 +40,6 @@ macro ccopy_gen[N: static int](a: var Limbs[N], b: Limbs[N], ctl: SecretBool): u
|
|||||||
let
|
let
|
||||||
arrA = init(OperandArray, nimSymbol = a, N, PointerInReg, InputOutput)
|
arrA = init(OperandArray, nimSymbol = a, N, PointerInReg, InputOutput)
|
||||||
arrB = init(OperandArray, nimSymbol = b, N, PointerInReg, Input)
|
arrB = init(OperandArray, nimSymbol = b, N, PointerInReg, Input)
|
||||||
# If N is too big, we need to spill registers. TODO.
|
|
||||||
arrT = init(OperandArray, nimSymbol = ident"t", N, ElemsInReg, Output_EarlyClobber)
|
|
||||||
|
|
||||||
control = Operand(
|
control = Operand(
|
||||||
desc: OperandDesc(
|
desc: OperandDesc(
|
||||||
@ -53,16 +51,42 @@ macro ccopy_gen[N: static int](a: var Limbs[N], b: Limbs[N], ctl: SecretBool): u
|
|||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
|
var # Swappable registers to break dependency chains
|
||||||
|
t0 = Operand(
|
||||||
|
desc: OperandDesc(
|
||||||
|
asmId: "[t0]",
|
||||||
|
nimSymbol: ident"t0",
|
||||||
|
rm: Reg,
|
||||||
|
constraint: Output_EarlyClobber,
|
||||||
|
cEmit: "t0"
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
|
t1 = Operand(
|
||||||
|
desc: OperandDesc(
|
||||||
|
asmId: "[t1]",
|
||||||
|
nimSymbol: ident"t1",
|
||||||
|
rm: Reg,
|
||||||
|
constraint: Output_EarlyClobber,
|
||||||
|
cEmit: "t1"
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
|
# Prologue
|
||||||
|
let t0sym = t0.desc.nimSymbol
|
||||||
|
let t1sym = t1.desc.nimSymbol
|
||||||
|
result.add quote do:
|
||||||
|
var `t0sym`{.noinit.}, `t1sym`{.noinit.}: BaseType
|
||||||
|
|
||||||
|
# Algorithm
|
||||||
ctx.test control, control
|
ctx.test control, control
|
||||||
for i in 0 ..< N:
|
for i in 0 ..< N:
|
||||||
ctx.mov arrT[i], arrA[i]
|
ctx.mov t0, arrA[i]
|
||||||
ctx.cmovnz arrT[i], arrB[i]
|
ctx.cmovnz t0, arrB[i]
|
||||||
ctx.mov arrA[i], arrT[i]
|
ctx.mov arrA[i], t0
|
||||||
|
swap(t0, t1)
|
||||||
|
|
||||||
let t = arrT.nimSymbol
|
# Codegen
|
||||||
let c = control.desc.nimSymbol
|
|
||||||
result.add quote do:
|
|
||||||
var `t` {.noInit.}: typeof(`a`)
|
|
||||||
result.add ctx.generate()
|
result.add ctx.generate()
|
||||||
|
|
||||||
func ccopy_asm*(a: var Limbs, b: Limbs, ctl: SecretBool) {.inline.}=
|
func ccopy_asm*(a: var Limbs, b: Limbs, ctl: SecretBool) {.inline.}=
|
||||||
@ -71,3 +95,121 @@ func ccopy_asm*(a: var Limbs, b: Limbs, ctl: SecretBool) {.inline.}=
|
|||||||
## if ctl is false: b is not copied and a is untouched
|
## if ctl is false: b is not copied and a is untouched
|
||||||
## Time and memory accesses are the same whether a copy occurs or not
|
## Time and memory accesses are the same whether a copy occurs or not
|
||||||
ccopy_gen(a, b, ctl)
|
ccopy_gen(a, b, ctl)
|
||||||
|
|
||||||
|
# Addition
|
||||||
|
# ------------------------------------------------------------
|
||||||
|
|
||||||
|
macro add_gen[N: static int](carry: var Carry, r: var Limbs[N], a, b: Limbs[N]): untyped =
  ## Generate an optimized out-of-place addition kernel:
  ## r <- a + b limb by limb (ADD for limb 0, ADC for the others),
  ## then `carry` is set from the carry flag via `setToCarryFlag`.

  result = newStmtList()

  var ctx = init(Assembler_x86, BaseType)

  # Both temporaries share the same operand shape, only the name differs.
  template tmpReg(name: string): Operand =
    Operand(
      desc: OperandDesc(
        asmId: "[" & name & "]",
        nimSymbol: ident(name),
        rm: Reg,
        constraint: Output_EarlyClobber,
        cEmit: name
      )
    )

  let
    dst = init(OperandArray, nimSymbol = r, N, PointerInReg, InputOutput)
    lhs = init(OperandArray, nimSymbol = a, N, PointerInReg, Input)
    rhs = init(OperandArray, nimSymbol = b, N, PointerInReg, Input)

  # Swappable registers to break dependency chains
  var cur = tmpReg("t0")
  var spare = tmpReg("t1")

  # Prologue: declare the Nim variables backing the temporaries.
  let t0sym = cur.desc.nimSymbol
  let t1sym = spare.desc.nimSymbol
  result.add quote do:
    var `t0sym`{.noinit.}, `t1sym`{.noinit.}: BaseType

  # Algorithm
  for i in 0 ..< N:
    ctx.mov cur, lhs[i]
    if i > 0:
      ctx.adc cur, rhs[i]
    else:
      # Lowest limb: no incoming carry
      ctx.add cur, rhs[i]
    ctx.mov dst[i], cur
    # Alternate between the two temporaries on each limb
    swap(cur, spare)
  ctx.setToCarryFlag(carry)

  # Codegen
  result.add ctx.generate
|
||||||
|
|
||||||
|
func add_asm*(r: var Limbs, a, b: Limbs): Carry {.inline.}=
  ## Constant-time addition: r <- a + b.
  ## The returned carry is the one captured by the `add_gen` kernel
  ## after the last limb.
  add_gen(result, r, a, b)
|
||||||
|
|
||||||
|
# Subtraction
|
||||||
|
# ------------------------------------------------------------
|
||||||
|
|
||||||
|
macro sub_gen[N: static int](borrow: var Borrow, r: var Limbs[N], a, b: Limbs[N]): untyped =
  ## Generate an optimized out-of-place subtraction kernel:
  ## r <- a - b limb by limb (SUB for limb 0, SBB for the others),
  ## then `borrow` is set from the carry flag via `setToCarryFlag`.

  result = newStmtList()

  var ctx = init(Assembler_x86, BaseType)

  # Both temporaries share the same operand shape, only the name differs.
  template tmpReg(name: string): Operand =
    Operand(
      desc: OperandDesc(
        asmId: "[" & name & "]",
        nimSymbol: ident(name),
        rm: Reg,
        constraint: Output_EarlyClobber,
        cEmit: name
      )
    )

  let
    dst = init(OperandArray, nimSymbol = r, N, PointerInReg, InputOutput)
    lhs = init(OperandArray, nimSymbol = a, N, PointerInReg, Input)
    rhs = init(OperandArray, nimSymbol = b, N, PointerInReg, Input)

  # Swappable registers to break dependency chains
  var cur = tmpReg("t0")
  var spare = tmpReg("t1")

  # Prologue: declare the Nim variables backing the temporaries.
  let t0sym = cur.desc.nimSymbol
  let t1sym = spare.desc.nimSymbol
  result.add quote do:
    var `t0sym`{.noinit.}, `t1sym`{.noinit.}: BaseType

  # Algorithm
  for i in 0 ..< N:
    ctx.mov cur, lhs[i]
    if i > 0:
      ctx.sbb cur, rhs[i]
    else:
      # Lowest limb: no incoming borrow
      ctx.sub cur, rhs[i]
    ctx.mov dst[i], cur
    # Alternate between the two temporaries on each limb
    swap(cur, spare)
  ctx.setToCarryFlag(borrow)

  # Codegen
  result.add ctx.generate
|
||||||
|
|
||||||
|
func sub_asm*(r: var Limbs, a, b: Limbs): Borrow {.inline.}=
  ## Constant-time subtraction: r <- a - b.
  ## The returned borrow is the one captured by the `sub_gen` kernel
  ## after the last limb.
  sub_gen(result, r, a, b)
|
||||||
|
|||||||
101
constantine/arithmetic/limbs_double_width.nim
Normal file
101
constantine/arithmetic/limbs_double_width.nim
Normal file
@ -0,0 +1,101 @@
|
|||||||
|
# Constantine
|
||||||
|
# Copyright (c) 2018-2019 Status Research & Development GmbH
|
||||||
|
# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
|
||||||
|
# Licensed and distributed under either of
|
||||||
|
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
|
||||||
|
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
|
||||||
|
# at your option. This file may not be copied, modified, or distributed except according to those terms.
|
||||||
|
|
||||||
|
import
|
||||||
|
../config/common,
|
||||||
|
../primitives,
|
||||||
|
./limbs
|
||||||
|
|
||||||
|
when UseASM_X86_32:
|
||||||
|
import ./limbs_asm_montred_x86
|
||||||
|
when UseASM_X86_64:
|
||||||
|
import ./limbs_asm_montred_x86_adx_bmi2
|
||||||
|
|
||||||
|
# ############################################################
|
||||||
|
#
|
||||||
|
# Limbs raw representation and operations
|
||||||
|
#
|
||||||
|
# ############################################################
|
||||||
|
|
||||||
|
# No exceptions allowed
|
||||||
|
{.push raises: [].}
|
||||||
|
|
||||||
|
# Montgomery Reduction
|
||||||
|
# ------------------------------------------------------------
|
||||||
|
# This is the reduction part of SOS (Separated Operand Scanning) modular multiplication technique
|
||||||
|
|
||||||
|
# TODO upstream, using Limbs[N] breaks semcheck
|
||||||
|
func montyRed*[N: static int](
       r: var array[N, SecretWord],
       t: array[N*2, SecretWord],
       M: array[N, SecretWord],
       m0ninv: BaseType, canUseNoCarryMontyMul: static bool) =
  ## Montgomery-reduce the double-width bigint `t` modulo `M` and store the result in `r`.
  ##
  ## This is the reduction step of the SOS (Separated Operand Scanning)
  ## modular multiplication technique.
  #
  # References
  # - Analyzing and Comparing Montgomery Multiplication Algorithms
  #   Cetin Kaya Koc and Tolga Acar and Burton S. Kaliski Jr.
  #   http://pdfs.semanticscholar.org/5e39/41ff482ec3ee41dc53c3298f0be085c69483.pdf
  #
  # - Arithmetic of Finite Fields
  #   Chapter 5 of Guide to Pairing-Based Cryptography
  #   Jean Luc Beuchat, Luis J. Dominguez Perez, Sylvain Duquesne, Nadia El Mrabet, Laura Fuentes-Castañeda, Francisco Rodríguez-Henríquez, 2017
  #   https://www.researchgate.net/publication/319538235_Arithmetic_of_Finite_Fields
  #
  # Algorithm
  # Inputs:
  # - N number of limbs
  # - t[0 ..< 2N] (double-width input to reduce)
  # - M[0 ..< N] The field modulus (must be odd for Montgomery reduction)
  # - m0ninv: Montgomery Reduction magic number = -1/M[0]
  # Output:
  # - r[0 ..< N], in the Montgomery domain
  # Parameters:
  # - w, the word width usually 64 on 64-bit platforms or 32 on 32-bit
  #
  # for i in 0 .. n-1:
  #   C <- 0
  #   m <- t[i] * m0ninv mod 2^w (i.e. simple multiplication)
  #   for j in 0 .. n-1:
  #     (C, S) <- t[i+j] + m * M[j] + C
  #     t[i+j] <- S
  #   t[i+n] += C
  # for i in 0 .. n-1:
  #   r[i] = t[i+n]
  # if r >= M:
  #   r -= M
  #
  # Important: `t[i+n] += C` must propagate its carry into the higher
  # limbs, if any. That implementation detail is missing from the paper.
  when UseASM_X86_64 and r.len <= 6:
    # 64-bit assembly; the ADX/BMI2 variant is selected at runtime
    # when the CPU reports both extensions.
    if ({.noSideEffect.}: hasBmi2()) and ({.noSideEffect.}: hasAdx()):
      montRed_asm_adx_bmi2(r, t, M, m0ninv, canUseNoCarryMontyMul)
    else:
      montRed_asm(r, t, M, m0ninv, canUseNoCarryMontyMul)
  elif UseASM_X86_32 and r.len <= 6:
    # TODO: Assembly faster than GCC but slower than Clang
    montRed_asm(r, t, M, m0ninv, canUseNoCarryMontyMul)
  else:
    # Portable fallback.
    var scratch = t         # Mutable working copy of the input, kept on the stack
    var acc: typeof(r)      # Collects the high word produced by each outer round

    staticFor i, 0, N:
      var hi = Zero
      let q = scratch[i] * SecretWord(m0ninv)
      staticFor j, 0, N:
        # (hi, scratch[i+j]) <- q * M[j] + scratch[i+j] + hi
        muladd2(hi, scratch[i+j], q, M[j], scratch[i+j], hi)
      acc[i] = hi

    # Performs `t[i+n] += C` in its own carry chain,
    # fused with the copy `r[i] = t[i+n]`.
    var cf = Carry(0)
    staticFor i, 0, N:
      addC(cf, acc[i], scratch[i+N], acc[i], cf)

    # Final conditional subtraction: needed when the addition chain
    # carried out or when the accumulator is not below the modulus.
    let mustSubtract = SecretWord(cf).isNonZero() or not(acc < M)
    discard acc.csub(M, mustSubtract)
    r = acc
|
||||||
@ -9,7 +9,7 @@
|
|||||||
import
|
import
|
||||||
../config/common,
|
../config/common,
|
||||||
../primitives,
|
../primitives,
|
||||||
./limbs_generic
|
./limbs
|
||||||
|
|
||||||
# No exceptions allowed
|
# No exceptions allowed
|
||||||
{.push raises: [].}
|
{.push raises: [].}
|
||||||
|
|||||||
@ -12,9 +12,9 @@ import
|
|||||||
# Internal
|
# Internal
|
||||||
../config/common,
|
../config/common,
|
||||||
../primitives,
|
../primitives,
|
||||||
./limbs_generic
|
./limbs
|
||||||
|
|
||||||
when UseX86ASM:
|
when UseASM_X86_64:
|
||||||
import
|
import
|
||||||
./limbs_asm_montmul_x86,
|
./limbs_asm_montmul_x86,
|
||||||
./limbs_asm_montmul_x86_adx_bmi2
|
./limbs_asm_montmul_x86_adx_bmi2
|
||||||
@ -37,55 +37,7 @@ when UseX86ASM:
|
|||||||
# - pairing final exponentiation
|
# - pairing final exponentiation
|
||||||
# are bottlenecked by Montgomery multiplications or squarings
|
# are bottlenecked by Montgomery multiplications or squarings
|
||||||
#
|
#
|
||||||
# Unfortunately, the fastest implementation of Montgomery Multiplication
|
# Hence we use inline assembly where possible
|
||||||
# on x86 is impossible without resorting to assembly (probably 15~30% faster)
|
|
||||||
#
|
|
||||||
# It requires implementing 2 parallel pipelines of carry-chains (via instruction-level parallelism)
|
|
||||||
# of MULX, ADCX and ADOX instructions, according to Intel paper:
|
|
||||||
# https://www.intel.cn/content/dam/www/public/us/en/documents/white-papers/ia-large-integer-arithmetic-paper.pdf
|
|
||||||
# and the code generation of MCL
|
|
||||||
# https://github.com/herumi/mcl
|
|
||||||
#
|
|
||||||
# A generic implementation would require implementing a mini-compiler as macro
|
|
||||||
# significantly sacrificing code readability, portability, auditability and maintainability.
|
|
||||||
#
|
|
||||||
# This would however save significant hardware or cloud resources.
|
|
||||||
# An example inline assembly compiler for add-with-carry is available in
|
|
||||||
# primitives/research/addcarry_subborrow_compiler.nim
|
|
||||||
#
|
|
||||||
# Instead we follow the optimized high-level implementation of Goff
|
|
||||||
# which skips a significant amount of additions for moduli
|
|
||||||
# that have their the most significant bit unset.
|
|
||||||
|
|
||||||
# Loop unroller
|
|
||||||
# ------------------------------------------------------------
|
|
||||||
|
|
||||||
proc replaceNodes(ast: NimNode, what: NimNode, by: NimNode): NimNode =
|
|
||||||
# Replace "what" ident node by "by"
|
|
||||||
proc inspect(node: NimNode): NimNode =
|
|
||||||
case node.kind:
|
|
||||||
of {nnkIdent, nnkSym}:
|
|
||||||
if node.eqIdent(what):
|
|
||||||
return by
|
|
||||||
return node
|
|
||||||
of nnkEmpty:
|
|
||||||
return node
|
|
||||||
of nnkLiterals:
|
|
||||||
return node
|
|
||||||
else:
|
|
||||||
var rTree = node.kind.newTree()
|
|
||||||
for child in node:
|
|
||||||
rTree.add inspect(child)
|
|
||||||
return rTree
|
|
||||||
result = inspect(ast)
|
|
||||||
|
|
||||||
macro staticFor(idx: untyped{nkIdent}, start, stopEx: static int, body: untyped): untyped =
|
|
||||||
result = newStmtList()
|
|
||||||
for i in start ..< stopEx:
|
|
||||||
result.add nnkBlockStmt.newTree(
|
|
||||||
ident("unrolledIter_" & $idx & $i),
|
|
||||||
body.replaceNodes(idx, newLit i)
|
|
||||||
)
|
|
||||||
|
|
||||||
# No exceptions allowed
|
# No exceptions allowed
|
||||||
{.push raises: [].}
|
{.push raises: [].}
|
||||||
@ -348,7 +300,7 @@ func montyMul*(
|
|||||||
# - specialize/duplicate code for m0ninv == 1 (especially if only 1 curve is needed)
|
# - specialize/duplicate code for m0ninv == 1 (especially if only 1 curve is needed)
|
||||||
# - keep it generic and optimize code size
|
# - keep it generic and optimize code size
|
||||||
when canUseNoCarryMontyMul:
|
when canUseNoCarryMontyMul:
|
||||||
when UseX86ASM and a.len in {2 .. 6}: # TODO: handle spilling
|
when UseASM_X86_64 and a.len in {2 .. 6}: # TODO: handle spilling
|
||||||
if ({.noSideEffect.}: hasBmi2()) and ({.noSideEffect.}: hasAdx()):
|
if ({.noSideEffect.}: hasBmi2()) and ({.noSideEffect.}: hasAdx()):
|
||||||
montMul_CIOS_nocarry_asm_adx_bmi2(r, a, b, M, m0ninv)
|
montMul_CIOS_nocarry_asm_adx_bmi2(r, a, b, M, m0ninv)
|
||||||
else:
|
else:
|
||||||
|
|||||||
@ -30,6 +30,14 @@ type
|
|||||||
|
|
||||||
SecretBool* = CTBool[SecretWord]
|
SecretBool* = CTBool[SecretWord]
|
||||||
|
|
||||||
|
|
||||||
|
Limbs*[N: static int] = array[N, SecretWord]
|
||||||
|
## Limbs-type
|
||||||
|
## Should be a distinct type to prevent builtins from using a non-constant-time
|
||||||
|
## implementation, for example for comparison.
|
||||||
|
##
|
||||||
|
## but for unknown reason, it prevents semchecking `bits`
|
||||||
|
|
||||||
const
|
const
|
||||||
WordBitWidth* = sizeof(SecretWord) * 8
|
WordBitWidth* = sizeof(SecretWord) * 8
|
||||||
## Logical word size
|
## Logical word size
|
||||||
@ -44,7 +52,8 @@ const
|
|||||||
# TODO, we restrict assembly to 64-bit words
|
# TODO, we restrict assembly to 64-bit words
|
||||||
# We need to support register spills for large limbs
|
# We need to support register spills for large limbs
|
||||||
const ConstantineASM {.booldefine.} = true
|
const ConstantineASM {.booldefine.} = true
|
||||||
const UseX86ASM* = WordBitWidth == 64 and ConstantineASM and X86 and GCC_Compatible
|
const UseASM_X86_32* = ConstantineASM and X86 and GCC_Compatible
|
||||||
|
const UseASM_X86_64* = WordBitWidth == 64 and UseASM_X86_32
|
||||||
|
|
||||||
# ############################################################
|
# ############################################################
|
||||||
#
|
#
|
||||||
|
|||||||
@ -27,15 +27,19 @@ macro Mod*(C: static Curve): untyped =
|
|||||||
## Get the Modulus associated to a curve
|
## Get the Modulus associated to a curve
|
||||||
result = bindSym($C & "_Modulus")
|
result = bindSym($C & "_Modulus")
|
||||||
|
|
||||||
func getCurveBitwidth*(C: static Curve): static int =
|
template getCurveBitwidth*(C: Curve): int =
|
||||||
## Returns the number of bits taken by the curve modulus
|
## Returns the number of bits taken by the curve modulus
|
||||||
result = static(CurveBitWidth[C])
|
CurveBitWidth[C]
|
||||||
|
|
||||||
template matchingBigInt*(C: static Curve): untyped =
|
template matchingBigInt*(C: static Curve): untyped =
|
||||||
BigInt[CurveBitWidth[C]]
|
BigInt[CurveBitWidth[C]]
|
||||||
|
|
||||||
func family*(C: static Curve): CurveFamily =
|
template family*(C: Curve): CurveFamily =
|
||||||
result = static(CurveFamilies[C])
|
CurveFamilies[C]
|
||||||
|
|
||||||
|
template matchingLimbs2x*(C: Curve): untyped =
|
||||||
|
const N2 = wordsRequired(getCurveBitwidth(C)) * 2 # TODO upstream, not precomputing N2 breaks semcheck
|
||||||
|
array[N2, SecretWord] # TODO upstream, using Limbs[N2] breaks semcheck
|
||||||
|
|
||||||
# ############################################################
|
# ############################################################
|
||||||
#
|
#
|
||||||
|
|||||||
@ -31,8 +31,6 @@ type
|
|||||||
debug:
|
debug:
|
||||||
import std/strutils
|
import std/strutils
|
||||||
|
|
||||||
type Limbs[N: static int] = array[N, SecretWord]
|
|
||||||
|
|
||||||
func toString*(a: Limbs): string =
|
func toString*(a: Limbs): string =
|
||||||
result = "["
|
result = "["
|
||||||
result.add " 0x" & toHex(BaseType(a[0]))
|
result.add " 0x" & toHex(BaseType(a[0]))
|
||||||
|
|||||||
@ -10,7 +10,6 @@ import
|
|||||||
# Standard Library
|
# Standard Library
|
||||||
std/typetraits,
|
std/typetraits,
|
||||||
# Internal
|
# Internal
|
||||||
../../helpers/static_for,
|
|
||||||
../primitives,
|
../primitives,
|
||||||
../config/[common, curves, type_bigint],
|
../config/[common, curves, type_bigint],
|
||||||
../arithmetic,
|
../arithmetic,
|
||||||
|
|||||||
@ -8,7 +8,6 @@
|
|||||||
|
|
||||||
import
|
import
|
||||||
# Internal
|
# Internal
|
||||||
../../helpers/static_for,
|
|
||||||
../primitives,
|
../primitives,
|
||||||
../config/[common, curves, type_bigint],
|
../config/[common, curves, type_bigint],
|
||||||
../arithmetic,
|
../arithmetic,
|
||||||
|
|||||||
@ -190,7 +190,7 @@ func sum*[F](
|
|||||||
r.y.sum(Q.x, Q.z) # 15. Y3 <- X2 + Z2
|
r.y.sum(Q.x, Q.z) # 15. Y3 <- X2 + Z2
|
||||||
r.x *= r.y # 16. X3 <- X3 Y3 X3 = (X1 Z1)(X2 Z2)
|
r.x *= r.y # 16. X3 <- X3 Y3 X3 = (X1 Z1)(X2 Z2)
|
||||||
r.y.sum(t0, t2) # 17. Y3 <- t0 + t2 Y3 = X1 X2 + Z1 Z2
|
r.y.sum(t0, t2) # 17. Y3 <- t0 + t2 Y3 = X1 X2 + Z1 Z2
|
||||||
r.y.diff(r.x, r.y) # 18. Y3 <- X3 - Y3 Y3 = (X1 + Z1)(X2 + Z2) - (X1 X2 + Z1 Z2) = X1 Z2 + X2 Z1
|
r.y.diffAlias(r.x, r.y) # 18. Y3 <- X3 - Y3 Y3 = (X1 + Z1)(X2 + Z2) - (X1 X2 + Z1 Z2) = X1 Z2 + X2 Z1
|
||||||
when F is Fp2 and F.C.getSexticTwist() == D_Twist:
|
when F is Fp2 and F.C.getSexticTwist() == D_Twist:
|
||||||
t0 *= SexticNonResidue
|
t0 *= SexticNonResidue
|
||||||
t1 *= SexticNonResidue
|
t1 *= SexticNonResidue
|
||||||
@ -206,7 +206,7 @@ func sum*[F](
|
|||||||
r.y *= SexticNonResidue
|
r.y *= SexticNonResidue
|
||||||
r.x.prod(t4, r.y) # 25. X3 <- t4 Y3 X3 = 3b(Y1 Z2 + Y2 Z1)(X1 Z2 + X2 Z1)
|
r.x.prod(t4, r.y) # 25. X3 <- t4 Y3 X3 = 3b(Y1 Z2 + Y2 Z1)(X1 Z2 + X2 Z1)
|
||||||
t2.prod(t3, t1) # 26. t2 <- t3 t1 t2 = (X1 Y2 + X2 Y1) (Y1 Y2 - 3b Z1 Z2)
|
t2.prod(t3, t1) # 26. t2 <- t3 t1 t2 = (X1 Y2 + X2 Y1) (Y1 Y2 - 3b Z1 Z2)
|
||||||
r.x.diff(t2, r.x) # 27. X3 <- t2 - X3 X3 = (X1 Y2 + X2 Y1) (Y1 Y2 - 3b Z1 Z2) - 3b(Y1 Z2 + Y2 Z1)(X1 Z2 + X2 Z1)
|
r.x.diffAlias(t2, r.x) # 27. X3 <- t2 - X3 X3 = (X1 Y2 + X2 Y1) (Y1 Y2 - 3b Z1 Z2) - 3b(Y1 Z2 + Y2 Z1)(X1 Z2 + X2 Z1)
|
||||||
r.y *= t0 # 28. Y3 <- Y3 t0 Y3 = 9b X1 X2 (X1 Z2 + X2 Z1)
|
r.y *= t0 # 28. Y3 <- Y3 t0 Y3 = 9b X1 X2 (X1 Z2 + X2 Z1)
|
||||||
t1 *= r.z # 29. t1 <- t1 Z3 t1 = (Y1 Y2 - 3b Z1 Z2)(Y1 Y2 + 3b Z1 Z2)
|
t1 *= r.z # 29. t1 <- t1 Z3 t1 = (Y1 Y2 - 3b Z1 Z2)(Y1 Y2 + 3b Z1 Z2)
|
||||||
r.y += t1 # 30. Y3 <- t1 + Y3 Y3 = (Y1 Y2 + 3b Z1 Z2)(Y1 Y2 - 3b Z1 Z2) + 9b X1 X2 (X1 Z2 + X2 Z1)
|
r.y += t1 # 30. Y3 <- t1 + Y3 Y3 = (Y1 Y2 + 3b Z1 Z2)(Y1 Y2 - 3b Z1 Z2) + 9b X1 X2 (X1 Z2 + X2 Z1)
|
||||||
|
|||||||
@ -12,7 +12,8 @@ import
|
|||||||
primitives/multiplexers,
|
primitives/multiplexers,
|
||||||
primitives/addcarry_subborrow,
|
primitives/addcarry_subborrow,
|
||||||
primitives/extended_precision,
|
primitives/extended_precision,
|
||||||
primitives/bithacks
|
primitives/bithacks,
|
||||||
|
primitives/static_for
|
||||||
|
|
||||||
export
|
export
|
||||||
constant_time_types,
|
constant_time_types,
|
||||||
@ -20,7 +21,8 @@ export
|
|||||||
multiplexers,
|
multiplexers,
|
||||||
addcarry_subborrow,
|
addcarry_subborrow,
|
||||||
extended_precision,
|
extended_precision,
|
||||||
bithacks
|
bithacks,
|
||||||
|
staticFor
|
||||||
|
|
||||||
when X86 and GCC_Compatible:
|
when X86 and GCC_Compatible:
|
||||||
import primitives/[cpuinfo_x86, macro_assembler_x86]
|
import primitives/[cpuinfo_x86, macro_assembler_x86]
|
||||||
|
|||||||
@ -6,7 +6,7 @@
|
|||||||
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
|
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
|
||||||
# at your option. This file may not be copied, modified, or distributed except according to those terms.
|
# at your option. This file may not be copied, modified, or distributed except according to those terms.
|
||||||
|
|
||||||
import std/[macros, strutils, sets, hashes]
|
import std/[macros, strutils, sets, hashes, algorithm]
|
||||||
|
|
||||||
# A compile-time inline assembler
|
# A compile-time inline assembler
|
||||||
|
|
||||||
@ -33,6 +33,9 @@ type
|
|||||||
|
|
||||||
RAX = "a"
|
RAX = "a"
|
||||||
|
|
||||||
|
# Flags
|
||||||
|
CarryFlag = "@ccc"
|
||||||
|
|
||||||
Register* = enum
|
Register* = enum
|
||||||
rbx, rdx, r8, rax
|
rbx, rdx, r8, rax
|
||||||
|
|
||||||
@ -98,6 +101,12 @@ func hash(od: OperandDesc): Hash =
|
|||||||
func len*(opArray: OperandArray): int =
|
func len*(opArray: OperandArray): int =
|
||||||
opArray.buf.len
|
opArray.buf.len
|
||||||
|
|
||||||
|
func len*(opArray: Operand): int =
|
||||||
|
opArray.buf.len
|
||||||
|
|
||||||
|
func rotateLeft*(opArray: var OperandArray) =
|
||||||
|
opArray.buf.rotateLeft(1)
|
||||||
|
|
||||||
proc `[]`*(opArray: OperandArray, index: int): Operand =
|
proc `[]`*(opArray: OperandArray, index: int): Operand =
|
||||||
opArray.buf[index]
|
opArray.buf[index]
|
||||||
|
|
||||||
@ -181,6 +190,25 @@ func asArrayAddr*(op: Operand, len: int): Operand =
|
|||||||
# Code generation
|
# Code generation
|
||||||
# ------------------------------------------------------------------------------------------------------------
|
# ------------------------------------------------------------------------------------------------------------
|
||||||
|
|
||||||
|
func setToCarryFlag*(a: var Assembler_x86, carry: NimNode) =
|
||||||
|
|
||||||
|
# We need to dereference the hidden pointer of var param
|
||||||
|
let isHiddenDeref = carry.kind == nnkHiddenDeref
|
||||||
|
let nimSymbol = if isHiddenDeref: carry[0]
|
||||||
|
else: carry
|
||||||
|
{.noSideEffect.}:
|
||||||
|
let symStr = $nimSymbol
|
||||||
|
|
||||||
|
let desc = OperandDesc(
|
||||||
|
asmId: "",
|
||||||
|
nimSymbol: ident(symStr),
|
||||||
|
rm: CarryFlag,
|
||||||
|
constraint: Output_Overwrite,
|
||||||
|
cEmit: symStr
|
||||||
|
)
|
||||||
|
|
||||||
|
a.operands.incl(desc)
|
||||||
|
|
||||||
func generate*(a: Assembler_x86): NimNode =
|
func generate*(a: Assembler_x86): NimNode =
|
||||||
## Generate the inline assembly code from
|
## Generate the inline assembly code from
|
||||||
## the desired instruction
|
## the desired instruction
|
||||||
@ -527,7 +555,7 @@ func cmovc*(a: var Assembler_x86, dst, src: Operand) =
|
|||||||
func cmovnc*(a: var Assembler_x86, dst, src: Operand) =
|
func cmovnc*(a: var Assembler_x86, dst, src: Operand) =
|
||||||
## Does: dst <- src if the carry flag is not set
|
## Does: dst <- src if the carry flag is not set
|
||||||
doAssert dst.desc.rm in {Reg, ElemsInReg}, "The destination operand must be a register: " & $dst.repr
|
doAssert dst.desc.rm in {Reg, ElemsInReg}, "The destination operand must be a register: " & $dst.repr
|
||||||
doAssert dst.desc.constraint in {Output_EarlyClobber, InputOutput, Output_Overwrite}, $dst.repr
|
doAssert dst.desc.constraint in OutputReg, $dst.repr
|
||||||
|
|
||||||
a.codeFragment("cmovnc", src, dst)
|
a.codeFragment("cmovnc", src, dst)
|
||||||
# No clobber
|
# No clobber
|
||||||
@ -566,7 +594,7 @@ func mul*(a: var Assembler_x86, dHi, dLo: Register, src0: Operand, src1: Registe
|
|||||||
|
|
||||||
func imul*(a: var Assembler_x86, dst, src: Operand) =
|
func imul*(a: var Assembler_x86, dst, src: Operand) =
|
||||||
## Does dst <- dst * src, keeping only the low half
|
## Does dst <- dst * src, keeping only the low half
|
||||||
doAssert dst.desc.rm in {Reg, ElemsInReg}, "The destination operand must be a register: " & $dst.repr
|
doAssert dst.desc.rm in {Reg, ElemsInReg}+SpecificRegisters, "The destination operand must be a register: " & $dst.repr
|
||||||
doAssert dst.desc.constraint in OutputReg, $dst.repr
|
doAssert dst.desc.constraint in OutputReg, $dst.repr
|
||||||
|
|
||||||
a.codeFragment("imul", src, dst)
|
a.codeFragment("imul", src, dst)
|
||||||
@ -574,9 +602,9 @@ func imul*(a: var Assembler_x86, dst, src: Operand) =
|
|||||||
func mulx*(a: var Assembler_x86, dHi, dLo, src0: Operand, src1: Register) =
|
func mulx*(a: var Assembler_x86, dHi, dLo, src0: Operand, src1: Register) =
|
||||||
## Does (dHi, dLo) <- src0 * src1
|
## Does (dHi, dLo) <- src0 * src1
|
||||||
doAssert src1 == rdx, "MULX requires the RDX register"
|
doAssert src1 == rdx, "MULX requires the RDX register"
|
||||||
doAssert dHi.desc.rm in {Reg, ElemsInReg} or dHi.desc.rm in SpecificRegisters,
|
doAssert dHi.desc.rm in {Reg, ElemsInReg}+SpecificRegisters,
|
||||||
"The destination operand must be a register " & $dHi.repr
|
"The destination operand must be a register " & $dHi.repr
|
||||||
doAssert dLo.desc.rm in {Reg, ElemsInReg} or dLo.desc.rm in SpecificRegisters,
|
doAssert dLo.desc.rm in {Reg, ElemsInReg}+SpecificRegisters,
|
||||||
"The destination operand must be a register " & $dLo.repr
|
"The destination operand must be a register " & $dLo.repr
|
||||||
doAssert dHi.desc.constraint in OutputReg
|
doAssert dHi.desc.constraint in OutputReg
|
||||||
doAssert dLo.desc.constraint in OutputReg
|
doAssert dLo.desc.constraint in OutputReg
|
||||||
@ -595,7 +623,7 @@ func adcx*(a: var Assembler_x86, dst, src: Operand) =
|
|||||||
## Does: dst <- dst + src + carry
|
## Does: dst <- dst + src + carry
|
||||||
## and only sets the carry flag
|
## and only sets the carry flag
|
||||||
doAssert dst.desc.constraint in OutputReg, $dst.repr
|
doAssert dst.desc.constraint in OutputReg, $dst.repr
|
||||||
doAssert dst.desc.rm in {Reg, ElemsInReg}, "The destination operand must be a register: " & $dst.repr
|
doAssert dst.desc.rm in {Reg, ElemsInReg}+SpecificRegisters, "The destination operand must be a register: " & $dst.repr
|
||||||
a.codeFragment("adcx", src, dst)
|
a.codeFragment("adcx", src, dst)
|
||||||
a.areFlagsClobbered = true
|
a.areFlagsClobbered = true
|
||||||
|
|
||||||
@ -603,7 +631,7 @@ func adox*(a: var Assembler_x86, dst, src: Operand) =
|
|||||||
## Does: dst <- dst + src + overflow
|
## Does: dst <- dst + src + overflow
|
||||||
## and only sets the overflow flag
|
## and only sets the overflow flag
|
||||||
doAssert dst.desc.constraint in OutputReg, $dst.repr
|
doAssert dst.desc.constraint in OutputReg, $dst.repr
|
||||||
doAssert dst.desc.rm in {Reg, ElemsInReg}, "The destination operand must be a register: " & $dst.repr
|
doAssert dst.desc.rm in {Reg, ElemsInReg}+SpecificRegisters, "The destination operand must be a register: " & $dst.repr
|
||||||
a.codeFragment("adox", src, dst)
|
a.codeFragment("adox", src, dst)
|
||||||
a.areFlagsClobbered = true
|
a.areFlagsClobbered = true
|
||||||
|
|
||||||
|
|||||||
36
constantine/primitives/static_for.nim
Normal file
36
constantine/primitives/static_for.nim
Normal file
@ -0,0 +1,36 @@
|
|||||||
|
# Constantine
|
||||||
|
# Copyright (c) 2018-2019 Status Research & Development GmbH
|
||||||
|
# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
|
||||||
|
# Licensed and distributed under either of
|
||||||
|
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
|
||||||
|
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
|
||||||
|
# at your option. This file may not be copied, modified, or distributed except according to those terms.
|
||||||
|
|
||||||
|
import std/macros
|
||||||
|
|
||||||
|
proc replaceNodes(ast: NimNode, what: NimNode, by: NimNode): NimNode =
  ## Build a copy of `ast` in which every identifier or symbol node
  ## matching `what` has been replaced by `by`.
  proc substitute(n: NimNode): NimNode =
    if n.kind in {nnkIdent, nnkSym}:
      # Leaf that may be the identifier being replaced
      result = if n.eqIdent(what): by else: n
    elif n.kind == nnkEmpty or n.kind in nnkLiterals:
      # Leaves that can never match: returned unchanged
      result = n
    else:
      # Interior node: rebuild it from its rewritten children
      result = newTree(n.kind)
      for sub in n:
        result.add substitute(sub)
  result = substitute(ast)

macro staticFor*(idx: untyped{nkIdent}, start, stopEx: static int, body: untyped): untyped =
  ## Compile-time loop unroller: emits one labelled block per value in
  ## `start ..< stopEx`, each holding `body` with `idx` replaced by
  ## the corresponding integer literal.
  let unrolled = newStmtList()
  var iter = start
  while iter < stopEx:
    let label = ident("unrolledIter_" & $idx & $iter)
    unrolled.add newTree(nnkBlockStmt, label, replaceNodes(body, idx, newLit(iter)))
    inc iter
  result = unrolled
|
||||||
@ -84,17 +84,59 @@ func prod_complex(r: var QuadraticExt, a, b: QuadraticExt) =
|
|||||||
mixin fromComplexExtension
|
mixin fromComplexExtension
|
||||||
static: doAssert r.fromComplexExtension()
|
static: doAssert r.fromComplexExtension()
|
||||||
|
|
||||||
var a0b0 {.noInit.}, a1b1 {.noInit.}: typeof(r.c0)
|
# TODO: GCC is adding an unexplainable 30 cycles tax to this function (~10% slow down)
|
||||||
a0b0.prod(a.c0, b.c0) # [1 Mul]
|
# for seemingly no reason
|
||||||
a1b1.prod(a.c1, b.c1) # [2 Mul]
|
|
||||||
|
|
||||||
r.c0.sum(a.c0, a.c1) # r0 = (a0 + a1) # [2 Mul, 1 Add]
|
when true: # Single-width implementation
|
||||||
r.c1.sum(b.c0, b.c1) # r1 = (b0 + b1) # [2 Mul, 2 Add]
|
# Clang 330 cycles on i9-9980XE @4.1 GHz
|
||||||
r.c1 *= r.c0 # r1 = (b0 + b1)(a0 + a1) # [3 Mul, 2 Add] - 𝔽p temporary
|
var a0b0 {.noInit.}, a1b1 {.noInit.}: typeof(r.c0)
|
||||||
|
a0b0.prod(a.c0, b.c0) # [1 Mul]
|
||||||
|
a1b1.prod(a.c1, b.c1) # [2 Mul]
|
||||||
|
|
||||||
r.c0.diff(a0b0, a1b1) # r0 = a0 b0 - a1 b1 # [3 Mul, 2 Add, 1 Sub]
|
r.c0.sum(a.c0, a.c1) # r0 = (a0 + a1) # [2 Mul, 1 Add]
|
||||||
r.c1 -= a0b0 # r1 = (b0 + b1)(a0 + a1) - a0b0 # [3 Mul, 2 Add, 2 Sub]
|
r.c1.sum(b.c0, b.c1) # r1 = (b0 + b1) # [2 Mul, 2 Add]
|
||||||
r.c1 -= a1b1 # r1 = (b0 + b1)(a0 + a1) - a0b0 - a1b1 # [3 Mul, 2 Add, 3 Sub]
|
r.c1 *= r.c0 # r1 = (b0 + b1)(a0 + a1) # [3 Mul, 2 Add] - 𝔽p temporary
|
||||||
|
|
||||||
|
r.c0.diff(a0b0, a1b1) # r0 = a0 b0 - a1 b1 # [3 Mul, 2 Add, 1 Sub]
|
||||||
|
r.c1 -= a0b0 # r1 = (b0 + b1)(a0 + a1) - a0b0 # [3 Mul, 2 Add, 2 Sub]
|
||||||
|
r.c1 -= a1b1 # r1 = (b0 + b1)(a0 + a1) - a0b0 - a1b1 # [3 Mul, 2 Add, 3 Sub]
|
||||||
|
|
||||||
|
else: # Double-width implementation with lazy reduction
|
||||||
|
# Deactivated for now Clang 360 cycles on i9-9980XE @4.1 GHz
|
||||||
|
var a0b0 {.noInit.}, a1b1 {.noInit.}: doubleWidth(typeof(r.c0))
|
||||||
|
var d {.noInit.}: doubleWidth(typeof(r.c0))
|
||||||
|
const msbSet = r.c0.typeof.C.canUseNoCarryMontyMul()
|
||||||
|
|
||||||
|
a0b0.mulNoReduce(a.c0, b.c0) # 44 cycles - cumul 44
|
||||||
|
a1b1.mulNoReduce(a.c1, b.c1) # 44 cycles - cumul 88
|
||||||
|
when msbSet:
|
||||||
|
r.c0.sum(a.c0, a.c1)
|
||||||
|
r.c1.sum(b.c0, b.c1)
|
||||||
|
else:
|
||||||
|
r.c0.sumNoReduce(a.c0, a.c1) # 5 cycles - cumul 93
|
||||||
|
r.c1.sumNoReduce(b.c0, b.c1) # 5 cycles - cumul 98
|
||||||
|
d.mulNoReduce(r.c0, r.c1) # 44 cycles - cumul 142
|
||||||
|
when msbSet:
|
||||||
|
d -= a0b0
|
||||||
|
d -= a1b1
|
||||||
|
else:
|
||||||
|
d.diffNoReduce(d, a0b0) # 10 cycles - cumul 152
|
||||||
|
d.diffNoReduce(d, a1b1) # 10 cycles - cumul 162
|
||||||
|
a0b0.diff(a0b0, a1b1) # 18 cycles - cumul 180
|
||||||
|
r.c0.reduce(a0b0) # 68 cycles - cumul 248
|
||||||
|
r.c1.reduce(d) # 68 cycles - cumul 316
|
||||||
|
|
||||||
|
# Single-width [3 Mul, 2 Add, 3 Sub]
|
||||||
|
# 3*81 + 2*14 + 3*12 = 307 theoretical cycles
|
||||||
|
# 330 measured
|
||||||
|
# Double-Width
|
||||||
|
# 316 theoretical cycles
|
||||||
|
# 365 measured
|
||||||
|
# Reductions can be 2x10 faster using MCL algorithm
|
||||||
|
# but there are still unexplained 50 cycles diff between theo and measured
|
||||||
|
# and unexplained 30 cycles between Clang and GCC
|
||||||
|
# - Function calls?
|
||||||
|
# - push/pop stack?
|
||||||
|
|
||||||
# Commutative ring implementation for generic quadratic extension fields
|
# Commutative ring implementation for generic quadratic extension fields
|
||||||
# -------------------------------------------------------------------
|
# -------------------------------------------------------------------
|
||||||
|
|||||||
@ -154,6 +154,19 @@ func diff*(r: var CubicExt, a, b: CubicExt) =
|
|||||||
r.c1.diff(a.c1, b.c1)
|
r.c1.diff(a.c1, b.c1)
|
||||||
r.c2.diff(a.c2, b.c2)
|
r.c2.diff(a.c2, b.c2)
|
||||||
|
|
||||||
|
func diffAlias*(r: var QuadraticExt, a, b: QuadraticExt) =
|
||||||
|
## Diff ``a`` and ``b`` into ``r``
|
||||||
|
## Handles r and b aliasing
|
||||||
|
r.c0.diffAlias(a.c0, b.c0)
|
||||||
|
r.c1.diffAlias(a.c1, b.c1)
|
||||||
|
|
||||||
|
func diffAlias*(r: var CubicExt, a, b: CubicExt) =
|
||||||
|
## Diff ``a`` and ``b`` into ``r``
|
||||||
|
## Handles r and b aliasing
|
||||||
|
r.c0.diffAlias(a.c0, b.c0)
|
||||||
|
r.c1.diffAlias(a.c1, b.c1)
|
||||||
|
r.c2.diffAlias(a.c2, b.c2)
|
||||||
|
|
||||||
# Multiplication by a small integer known at compile-time
|
# Multiplication by a small integer known at compile-time
|
||||||
# -------------------------------------------------------------------
|
# -------------------------------------------------------------------
|
||||||
|
|
||||||
|
|||||||
@ -145,6 +145,7 @@ proc mainArith() =
|
|||||||
discard a.add(SecretWord 1)
|
discard a.add(SecretWord 1)
|
||||||
check: bool(a == expected)
|
check: bool(a == expected)
|
||||||
|
|
||||||
|
proc mainMul() =
|
||||||
suite "Multi-precision multiplication" & " [" & $WordBitwidth & "-bit mode]":
|
suite "Multi-precision multiplication" & " [" & $WordBitwidth & "-bit mode]":
|
||||||
test "Same size operand into double size result":
|
test "Same size operand into double size result":
|
||||||
block:
|
block:
|
||||||
@ -185,6 +186,7 @@ proc mainArith() =
|
|||||||
r.prod(b, a)
|
r.prod(b, a)
|
||||||
check: bool(r == expected)
|
check: bool(r == expected)
|
||||||
|
|
||||||
|
proc mainMulHigh() =
|
||||||
suite "Multi-precision multiplication keeping only high words" & " [" & $WordBitwidth & "-bit mode]":
|
suite "Multi-precision multiplication keeping only high words" & " [" & $WordBitwidth & "-bit mode]":
|
||||||
test "Same size operand into double size result - discard first word":
|
test "Same size operand into double size result - discard first word":
|
||||||
block:
|
block:
|
||||||
@ -270,6 +272,7 @@ proc mainArith() =
|
|||||||
r.prod_high_words(b, a, 2)
|
r.prod_high_words(b, a, 2)
|
||||||
check: bool(r == expected)
|
check: bool(r == expected)
|
||||||
|
|
||||||
|
proc mainModular() =
|
||||||
suite "Modular operations - small modulus" & " [" & $WordBitwidth & "-bit mode]":
|
suite "Modular operations - small modulus" & " [" & $WordBitwidth & "-bit mode]":
|
||||||
# Vectors taken from Stint - https://github.com/status-im/nim-stint
|
# Vectors taken from Stint - https://github.com/status-im/nim-stint
|
||||||
test "100 mod 13":
|
test "100 mod 13":
|
||||||
@ -619,6 +622,9 @@ proc mainModularInverse() =
|
|||||||
check: bool(r == expected)
|
check: bool(r == expected)
|
||||||
|
|
||||||
mainArith()
|
mainArith()
|
||||||
|
mainMul()
|
||||||
|
mainMulHigh()
|
||||||
|
mainModular()
|
||||||
mainNeg()
|
mainNeg()
|
||||||
mainCopySwap()
|
mainCopySwap()
|
||||||
mainModularInverse()
|
mainModularInverse()
|
||||||
|
|||||||
97
tests/t_finite_fields_double_width.nim
Normal file
97
tests/t_finite_fields_double_width.nim
Normal file
@ -0,0 +1,97 @@
|
|||||||
|
# Constantine
|
||||||
|
# Copyright (c) 2018-2019 Status Research & Development GmbH
|
||||||
|
# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
|
||||||
|
# Licensed and distributed under either of
|
||||||
|
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
|
||||||
|
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
|
||||||
|
# at your option. This file may not be copied, modified, or distributed except according to those terms.
|
||||||
|
|
||||||
|
import
|
||||||
|
# Standard library
|
||||||
|
std/[unittest, times],
|
||||||
|
# Internal
|
||||||
|
../constantine/arithmetic,
|
||||||
|
../constantine/io/[io_bigints, io_fields],
|
||||||
|
../constantine/config/[curves, common, type_bigint],
|
||||||
|
# Test utilities
|
||||||
|
../helpers/prng_unsafe
|
||||||
|
|
||||||
|
const Iters = 128
|
||||||
|
|
||||||
|
var rng: RngState
|
||||||
|
let seed = uint32(getTime().toUnix() and (1'i64 shl 32 - 1)) # unixTime mod 2^32
|
||||||
|
rng.seed(seed)
|
||||||
|
echo "\n------------------------------------------------------\n"
|
||||||
|
echo "test_finite_fields_double_width xoshiro512** seed: ", seed
|
||||||
|
|
||||||
|
proc randomCurve(C: static Curve) =
  ## Draw two field elements with `random_unsafe` and assert that the
  ## double-width path (`mulNoReduce` followed by `reduce`) gives the
  ## same result as the single-width `prod`.
  let lhs = rng.random_unsafe(Fp[C])
  let rhs = rng.random_unsafe(Fp[C])

  var viaSingle, viaDouble: Fp[C]
  var wide: FpDbl[C]

  prod(viaSingle, lhs, rhs)       # single-width product
  mulNoReduce(wide, lhs, rhs)     # double-width product, not yet reduced
  reduce(viaDouble, wide)         # reduce the double-width value into a field element

  doAssert bool(viaSingle == viaDouble)
|
||||||
|
|
||||||
|
proc randomHighHammingWeight(C: static Curve) =
  ## Same consistency check as the other generators in this file, with
  ## operands drawn by `random_highHammingWeight`: the double-width path
  ## (`mulNoReduce` followed by `reduce`) must match the single-width `prod`.
  let lhs = rng.random_highHammingWeight(Fp[C])
  let rhs = rng.random_highHammingWeight(Fp[C])

  var viaSingle, viaDouble: Fp[C]
  var wide: FpDbl[C]

  prod(viaSingle, lhs, rhs)       # single-width product
  mulNoReduce(wide, lhs, rhs)     # double-width product, not yet reduced
  reduce(viaDouble, wide)         # reduce the double-width value into a field element

  doAssert bool(viaSingle == viaDouble)
|
||||||
|
|
||||||
|
proc random_long01Seq(C: static Curve) =
  ## Draws two `Fp[C]` elements with the generator's `random_long01Seq` and
  ## asserts that the single-width `prod(a, b)` equals `mulNoReduce(a, b)`
  ## into an `FpDbl[C]` followed by `reduce`.
  let
    a = rng.random_long01Seq(Fp[C])
    b = rng.random_long01Seq(Fp[C])

  # Double-width path: unreduced product first, reduction second.
  var wide: FpDbl[C]
  wide.mulNoReduce(a, b)
  var r_fpDbl: Fp[C]
  r_fpDbl.reduce(wide)

  # Single-width reference path.
  var r_fp: Fp[C]
  r_fp.prod(a, b)

  # Operand names are kept as-is: doAssert reports the expression text on failure.
  doAssert bool(r_fp == r_fpDbl)
|
||||||
|
|
||||||
|
template checkAllGenerators(C: untyped) =
  ## Runs `Iters` rounds of each check for curve `C`, in the fixed order
  ## randomCurve, randomHighHammingWeight, random_long01Seq.
  ## The three loops are kept separate so the order of RNG draws is unchanged.
  for _ in 0 ..< Iters:
    randomCurve(C)
  for _ in 0 ..< Iters:
    randomHighHammingWeight(C)
  for _ in 0 ..< Iters:
    random_long01Seq(C)

suite "Field Multiplication via double-width field elements is consistent with single-width." & " [" & $WordBitwidth & "-bit mode]":
  test "With P-224 field modulus":
    checkAllGenerators(P224)

  test "With P-256 field modulus":
    checkAllGenerators(P256)

  test "With BN254_Snarks field modulus":
    checkAllGenerators(BN254_Snarks)

  test "With BLS12_381 field modulus":
    checkAllGenerators(BLS12_381)
|
||||||
@ -25,7 +25,7 @@ proc checkCubeRootOfUnity(curve: static Curve) =
|
|||||||
test $curve & " cube root of unity (mod r)":
|
test $curve & " cube root of unity (mod r)":
|
||||||
var cru: BigInt[3 * curve.getCurveOrderBitwidth()]
|
var cru: BigInt[3 * curve.getCurveOrderBitwidth()]
|
||||||
cru.prod(curve.getCubicRootOfUnity_mod_r(), curve.getCubicRootOfUnity_mod_r())
|
cru.prod(curve.getCubicRootOfUnity_mod_r(), curve.getCubicRootOfUnity_mod_r())
|
||||||
cru.prod(cru, curve.getCubicRootOfUnity_mod_r())
|
cru.mul(curve.getCubicRootOfUnity_mod_r())
|
||||||
|
|
||||||
var r: BigInt[curve.getCurveOrderBitwidth()]
|
var r: BigInt[curve.getCurveOrderBitwidth()]
|
||||||
r.reduce(cru, curve.getCurveOrder)
|
r.reduce(cru, curve.getCurveOrder)
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user