constantine/benchmarks/bench_fp_double_precision.nim
Mamy Ratsimbazafy c6d9a213f2
Rework assembly to be compatible with LTO (#231)
* rework assembler register/mem and constraint declarations

* Introduce constraint UnmutatedPointerToWriteMem

* Create invidual memory cell operands

* [Assembly] fully support indirect memory addressing

* fix calling convention for exported procs

* Prepare for switch to intel syntax to avoid clang constant propagation asm symbol name interfering OR pointer+offset addressing

* use modifiers to prevent bad string mixin fo assembler to linker of propagated consts

* Assembly: switch to intel syntax

* with working memory operand - now works with LTO on both GCC and clang and constant folding

* use memory operand in more places

* remove some inline now that we have lto

* cleanup compiler config and benches

* tracer shouldn't force dependencies when unused

* fix cc on linux

* nimble fixes

* update README [skip CI]

* update MacOS CI with Homebrew Clang

* oops nimble bindings disappeared

* more nimble fixes

* fix sha256 exported symbol

* improve constraints on modular addition

* Add extra constraint to force reloading of pointer in reg inputs

* Fix LLVM gold linker running out of registers

* workaround MinGW64 GCC 12.2 bad codegen in t_pairing_cyclotomic_subgroup with LTO
2023-04-26 06:58:31 +02:00

255 lines
8.2 KiB
Nim

# Constantine
# Copyright (c) 2018-2019 Status Research & Development GmbH
# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
# Licensed and distributed under either of
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
# at your option. This file may not be copied, modified, or distributed except according to those terms.
# ############################################################
#
# Benchmark of finite fields
#
# ############################################################
import
# Internals
../constantine/platforms/abstractions,
../constantine/math/config/curves,
../constantine/math/arithmetic,
../constantine/math/extension_fields,
# Helpers
../helpers/prng_unsafe,
./platforms,
# Standard library
std/[monotimes, times, strformat, strutils]
var rng: RngState
let seed = uint32(getTime().toUnix() and (1'i64 shl 32 - 1)) # unixTime mod 2^32
rng.seed(seed)
echo "bench xoshiro512** seed: ", seed
# warmup
proc warmup*() =
# Warmup - make sure cpu is on max perf
let start = cpuTime()
var foo = 123
for i in 0 ..< 300_000_000:
foo += i*i mod 456
foo = foo mod 789
# Compiler shouldn't optimize away the results as cpuTime rely on sideeffects
let stop = cpuTime()
echo &"Warmup: {stop - start:>4.4f} s, result {foo} (displayed to avoid compiler optimizing warmup away)\n"
warmup()
when defined(gcc):
echo "\nCompiled with GCC"
elif defined(clang):
echo "\nCompiled with Clang"
elif defined(vcc):
echo "\nCompiled with MSVC"
elif defined(icc):
echo "\nCompiled with ICC"
else:
echo "\nCompiled with an unknown compiler"
echo "Optimization level => "
echo " no optimization: ", not defined(release)
echo " release: ", defined(release)
echo " danger: ", defined(danger)
echo " inline assembly: ", UseASM_X86_64
when (sizeof(int) == 4) or defined(Ctt32):
echo "⚠️ Warning: using Constantine with 32-bit limbs"
else:
echo "Using Constantine with 64-bit limbs"
when SupportsCPUName:
echo "Running on ", cpuName(), ""
when SupportsGetTicks:
echo "\n⚠️ Cycles measurements are approximate and use the CPU nominal clock: Turbo-Boost and overclocking will skew them."
echo "i.e. a 20% overclock will be about 20% off (assuming no dynamic frequency scaling)"
echo "\n=================================================================================================================\n"
proc separator*() =
echo "-".repeat(145)
proc report(op, field: string, start, stop: MonoTime, startClk, stopClk: int64, iters: int) =
let ns = inNanoseconds((stop-start) div iters)
let throughput = 1e9 / float64(ns)
when SupportsGetTicks:
echo &"{op:<28} {field:<40} {throughput:>15.3f} ops/s {ns:>9} ns/op {(stopClk - startClk) div iters:>9} CPU cycles (approx)"
else:
echo &"{op:<28} {field:<40} {throughput:>15.3f} ops/s {ns:>9} ns/op"
proc notes*() =
echo "Notes:"
echo " - Compilers:"
echo " Compilers are severely limited on multiprecision arithmetic."
echo " Constantine compile-time assembler is used by default (nimble bench_fp)."
echo " GCC is significantly slower than Clang on multiprecision arithmetic due to catastrophic handling of carries."
echo " GCC also seems to have issues with large temporaries and register spilling."
echo " This is somewhat alleviated by Constantine compile-time assembler."
echo " Bench on specific compiler with assembler: \"nimble bench_ec_g1_gcc\" or \"nimble bench_ec_g1_clang\"."
echo " Bench on specific compiler with assembler: \"nimble bench_ec_g1_gcc_noasm\" or \"nimble bench_ec_g1_clang_noasm\"."
echo " - The simplest operations might be optimized away by the compiler."
template bench(op: string, desc: string, iters: int, body: untyped): untyped =
let start = getMonotime()
when SupportsGetTicks:
let startClk = getTicks()
for _ in 0 ..< iters:
body
when SupportsGetTicks:
let stopClk = getTicks()
let stop = getMonotime()
when not SupportsGetTicks:
let startClk = -1'i64
let stopClk = -1'i64
report(op, desc, start, stop, startClk, stopClk, iters)
func random_unsafe(rng: var RngState, a: var FpDbl, Base: typedesc) =
## Initialize a standalone Double-Width field element
## we don't reduce it modulo p², this is only used for benchmark
let aHi = rng.random_unsafe(Base)
let aLo = rng.random_unsafe(Base)
for i in 0 ..< aLo.mres.limbs.len:
a.limbs2x[i] = aLo.mres.limbs[i]
for i in 0 ..< aHi.mres.limbs.len:
a.limbs2x[aLo.mres.limbs.len+i] = aHi.mres.limbs[i]
proc sumUnr(T: typedesc, iters: int) =
var r: T
let a = rng.random_unsafe(T)
let b = rng.random_unsafe(T)
bench("Addition unreduced", $T, iters):
r.sumUnr(a, b)
proc sum(T: typedesc, iters: int) =
var r: T
let a = rng.random_unsafe(T)
let b = rng.random_unsafe(T)
bench("Addition", $T, iters):
r.sum(a, b)
proc diffUnr(T: typedesc, iters: int) =
var r: T
let a = rng.random_unsafe(T)
let b = rng.random_unsafe(T)
bench("Substraction unreduced", $T, iters):
r.diffUnr(a, b)
proc diff(T: typedesc, iters: int) =
var r: T
let a = rng.random_unsafe(T)
let b = rng.random_unsafe(T)
bench("Substraction", $T, iters):
r.diff(a, b)
proc neg(T: typedesc, iters: int) =
var r: T
let a = rng.random_unsafe(T)
bench("Negation", $T, iters):
r.neg(a)
proc sum2xUnreduce(T: typedesc, iters: int) =
var r, a, b: doublePrec(T)
rng.random_unsafe(r, T)
rng.random_unsafe(a, T)
rng.random_unsafe(b, T)
bench("Addition 2x unreduced", $doublePrec(T), iters):
r.sum2xUnr(a, b)
proc sum2x(T: typedesc, iters: int) =
var r, a, b: doublePrec(T)
rng.random_unsafe(r, T)
rng.random_unsafe(a, T)
rng.random_unsafe(b, T)
bench("Addition 2x reduced", $doublePrec(T), iters):
r.sum2xMod(a, b)
proc diff2xUnreduce(T: typedesc, iters: int) =
var r, a, b: doublePrec(T)
rng.random_unsafe(r, T)
rng.random_unsafe(a, T)
rng.random_unsafe(b, T)
bench("Substraction 2x unreduced", $doublePrec(T), iters):
r.diff2xUnr(a, b)
proc diff2x(T: typedesc, iters: int) =
var r, a, b: doublePrec(T)
rng.random_unsafe(r, T)
rng.random_unsafe(a, T)
rng.random_unsafe(b, T)
bench("Substraction 2x reduced", $doublePrec(T), iters):
r.diff2xMod(a, b)
proc neg2x(T: typedesc, iters: int) =
var r, a: doublePrec(T)
rng.random_unsafe(a, T)
bench("Negation 2x reduced", $doublePrec(T), iters):
r.neg2xMod(a)
proc prod2xBench*(rLen, aLen, bLen: static int, iters: int) =
var r: BigInt[rLen]
let a = rng.random_unsafe(BigInt[aLen])
let b = rng.random_unsafe(BigInt[bLen])
bench("Multiplication 2x", $rLen & " <- " & $aLen & " x " & $bLen, iters):
r.prod(a, b)
proc square2xBench*(rLen, aLen: static int, iters: int) =
var r: BigInt[rLen]
let a = rng.random_unsafe(BigInt[aLen])
bench("Squaring 2x", $rLen & " <- " & $aLen & "²", iters):
r.square(a)
proc reduce2x*(T: typedesc, iters: int) =
var r: T
var t: doublePrec(T)
rng.random_unsafe(t, T)
bench("Redc 2x", $T & " <- " & $doublePrec(T), iters):
r.redc2x(t)
proc reduce2xViaDivision*(T: typedesc, iters: int) =
const bits2x = 2 * T.C.getCurveBitWidth()
var r: matchingBigInt(T.C)
let t = rng.random_unsafe(BigInt[bits2x])
bench("Reduction via division", $T & " <- " & $doublePrec(T), iters):
r.reduce(t, T.fieldMod())
proc main() =
separator()
sum(Fp[BLS12_381], iters = 10_000_000)
sumUnr(Fp[BLS12_381], iters = 10_000_000)
diff(Fp[BLS12_381], iters = 10_000_000)
diffUnr(Fp[BLS12_381], iters = 10_000_000)
neg(Fp[BLS12_381], iters = 10_000_000)
separator()
sum2x(Fp[BLS12_381], iters = 10_000_000)
sum2xUnreduce(Fp[BLS12_381], iters = 10_000_000)
diff2x(Fp[BLS12_381], iters = 10_000_000)
diff2xUnreduce(Fp[BLS12_381], iters = 10_000_000)
neg2x(Fp[BLS12_381], iters = 10_000_000)
separator()
prod2xBench(512, 256, 256, iters = 10_000_000)
prod2xBench(768, 384, 384, iters = 10_000_000)
square2xBench(512, 256, iters = 10_000_000)
square2xBench(768, 384, iters = 10_000_000)
reduce2x(Fp[BN254_Snarks], iters = 10_000_000)
reduce2x(Fp[BLS12_381], iters = 10_000_000)
reduce2xViaDivision(Fp[BN254_Snarks], iters = 10_000)
reduce2xViaDivision(Fp[BLS12_381], iters = 10_000)
separator()
main()
notes()