constantine/benchmarks/bench_fp_double_width.nim

# Constantine
# Copyright (c) 2018-2019    Status Research & Development GmbH
# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
# Licensed and distributed under either of
#   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
#   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
# at your option. This file may not be copied, modified, or distributed except according to those terms.

# ############################################################
#
#             Benchmark of finite fields
#
# ############################################################

import
  # Internals
  ../constantine/config/[curves, common],
  ../constantine/arithmetic,
  ../constantine/towers,
  # Helpers
  ../helpers/[prng_unsafe, static_for],
  ./platforms,
  # Standard library
  std/[monotimes, times, strformat, strutils, macros]

# Benchmark-wide PRNG, seeded from the wall clock.
# The seed is printed so that it appears in the benchmark output.
var rng: RngState
let seed = block:
  let unixSeconds = getTime().toUnix()
  uint32(unixSeconds and 0xFFFF_FFFF'i64) # unixTime mod 2^32
rng.seed(seed)
echo "bench xoshiro512** seed: ", seed

# warmup
proc warmup*() =
  ## Busy-loop intended to bring the CPU up to its maximum performance
  ## state before any measurement is taken.
  ##
  ## Prints the CPU time spent in the loop together with the value it
  ## computed.
  ##
  ## The loop arithmetic is done on `int64`: `i*i` reaches ~9e16, which
  ## does not fit in a 32-bit `int` and would overflow on 32-bit builds.
  let start = cpuTime()
  var foo = 123'i64
  for i in 0'i64 ..< 300_000_000'i64:
    foo += i*i mod 456
    foo = foo mod 789

  # `foo` is echoed below so that the loop has an observable result and
  # cannot be discarded as dead code by the compiler.
  let stop = cpuTime()
  echo &"Warmup: {stop - start:>4.4f} s, result {foo} (displayed to avoid compiler optimizing warmup away)\n"

# Bring the CPU up to speed before printing the build/platform banner
warmup()

# Name of the C compiler backend, selected at compile time
const backendCompiler =
  when defined(gcc): "GCC"
  elif defined(clang): "Clang"
  elif defined(vcc): "MSVC"
  elif defined(icc): "ICC"
  else: "an unknown compiler"
echo "\nCompiled with ", backendCompiler

# Build flags this binary was compiled with
echo "Optimization level => "
echo "  no optimization: ", not defined(release)
echo "  release: ", defined(release)
echo "  danger: ", defined(danger)
echo "  inline assembly: ", UseASM_X86_64

# Limb size: 32-bit when `int` is 4 bytes or when forced via -d:Constantine32
const has32BitLimbs = (sizeof(int) == 4) or defined(Constantine32)
when has32BitLimbs:
  echo "⚠️ Warning: using Constantine with 32-bit limbs"
else:
  echo "Using Constantine with 64-bit limbs"

when SupportsCPUName:
  echo "Running on ", cpuName()

when SupportsGetTicks:
  echo "\n⚠️ Cycles measurements are approximate and use the CPU nominal clock: Turbo-Boost and overclocking will skew them."
  echo "i.e. a 20% overclock will be about 20% off (assuming no dynamic frequency scaling)"

echo "\n=================================================================================================================\n"

proc separator*() =
  ## Print a horizontal rule made of 145 dashes.
  const ruleWidth = 145
  echo repeat('-', ruleWidth)

proc report(op, field: string, start, stop: MonoTime, startClk, stopClk: int64, iters: int) =
  ## Print one result line for a benchmark.
  ##
  ## - `op`, `field`: labels printed in the first two columns.
  ## - `start`, `stop`: monotonic timestamps taken around the whole run.
  ## - `startClk`, `stopClk`: tick counter samples taken around the whole
  ##   run; only printed when `SupportsGetTicks` is true.
  ## - `iters`: number of iterations the run executed.
  ##
  ## Columns: throughput (ops/s), time per operation (whole ns) and,
  ## when available, approximate CPU cycles per operation.
  let elapsed = stop - start
  let ns = inNanoseconds(elapsed div iters)
  # Throughput is derived from the total elapsed time rather than from `ns`:
  # `ns` is truncated to a whole nanosecond, which for operations lasting
  # only a few nanoseconds would distort ops/s by tens of percent.
  let throughput = float64(iters) * 1e9 / float64(inNanoseconds(elapsed))
  when SupportsGetTicks:
    echo &"{op:<28} {field:<40} {throughput:>15.3f} ops/s     {ns:>9} ns/op     {(stopClk - startClk) div iters:>9} CPU cycles (approx)"
  else:
    echo &"{op:<28} {field:<40} {throughput:>15.3f} ops/s     {ns:>9} ns/op"

proc notes*() =
  ## Print the explanatory notes that accompany the benchmark results.
  const noteLines = [
    "Notes:",
    "  - Compilers:",
    "    Compilers are severely limited on multiprecision arithmetic.",
    "    Inline Assembly is used by default (nimble bench_fp).",
    "    Bench without assembly can use \"nimble bench_fp_gcc\" or \"nimble bench_fp_clang\".",
    "    GCC is significantly slower than Clang on multiprecision arithmetic due to catastrophic handling of carries.",
    "  - The simplest operations might be optimized away by the compiler.",
    "  - Fast Squaring and Fast Multiplication are possible if there are spare bits in the prime representation (i.e. the prime uses 254 bits out of 256 bits)"
  ]
  for note in noteLines:
    echo note

template bench(op: string, desc: string, iters: int, body: untyped): untyped =
  ## Run `body` `iters` times and pass the measurements to `report`.
  ##
  ## - `op`, `desc`: labels forwarded unchanged to `report`.
  ## - `iters`: number of times `body` is executed.
  ## - `body`: the statement(s) being measured.
  ##
  ## The monotonic clock is always sampled; `getTicks()` is sampled as
  ## well when `SupportsGetTicks` is true, immediately inside the
  ## monotonic-clock samples.
  let start = getMonotime()
  when SupportsGetTicks:
    let startClk = getTicks()
  for _ in 0 ..< iters:
    body
  when SupportsGetTicks:
    let stopClk = getTicks()
  let stop = getMonotime()

  when not SupportsGetTicks:
    # No tick counter: pass placeholder values so the `report` call below
    # still compiles; `report` does not print them in this configuration.
    let startClk = -1'i64
    let stopClk = -1'i64

  report(op, desc, start, stop, startClk, stopClk, iters)

func random_unsafe(rng: var RngState, a: var FpDbl, Base: typedesc) =
  ## Fill the double-width element `a` from two random `Base` elements.
  ## The value is not reduced modulo p²: this is for benchmarking only.
  ##
  ## The high half is drawn from `rng` first, then the low half.
  let
    hiHalf = random_unsafe(rng, Base)
    loHalf = random_unsafe(rng, Base)
    loCount = loHalf.mres.limbs.len

  # Low limbs go first in `limbs2x`, high limbs are appended right after
  for i in 0 ..< loCount:
    a.limbs2x[i] = loHalf.mres.limbs[i]
  for j in 0 ..< hiHalf.mres.limbs.len:
    a.limbs2x[loCount + j] = hiHalf.mres.limbs[j]

proc sumNoReduce(T: typedesc, iters: int) =
  ## Benchmark `sumNoReduce` on two random elements of `T`,
  ## repeated `iters` times.
  let
    lhs = random_unsafe(rng, T)
    rhs = random_unsafe(rng, T)
  var dst: T
  bench("Addition no reduce", $T, iters):
    dst.sumNoReduce(lhs, rhs)

proc sum(T: typedesc, iters: int) =
  ## Benchmark `sum` on two random elements of `T`,
  ## repeated `iters` times.
  let
    lhs = random_unsafe(rng, T)
    rhs = random_unsafe(rng, T)
  var dst: T
  bench("Addition", $T, iters):
    dst.sum(lhs, rhs)

proc diffNoReduce(T: typedesc, iters: int) =
  ## Benchmark `diffNoReduce` on two random elements of `T`,
  ## repeated `iters` times.
  let
    lhs = random_unsafe(rng, T)
    rhs = random_unsafe(rng, T)
  var dst: T
  bench("Substraction no reduce", $T, iters):
    dst.diffNoReduce(lhs, rhs)

proc diff(T: typedesc, iters: int) =
  ## Benchmark `diff` on two random elements of `T`,
  ## repeated `iters` times.
  let
    lhs = random_unsafe(rng, T)
    rhs = random_unsafe(rng, T)
  var dst: T
  bench("Substraction", $T, iters):
    dst.diff(lhs, rhs)

proc diff2xNoReduce(T: typedesc, iters: int) =
  ## Benchmark `diffNoReduce` on random `doubleWidth(T)` elements,
  ## repeated `iters` times. The destination is randomized as well.
  var dst: doubleWidth(T)
  var lhs: doubleWidth(T)
  var rhs: doubleWidth(T)
  # Draw order (destination, then operands) is kept fixed
  random_unsafe(rng, dst, T)
  random_unsafe(rng, lhs, T)
  random_unsafe(rng, rhs, T)
  bench("Substraction 2x no reduce", $doubleWidth(T), iters):
    dst.diffNoReduce(lhs, rhs)

proc diff2x(T: typedesc, iters: int) =
  ## Benchmark `diff` on random `doubleWidth(T)` elements,
  ## repeated `iters` times. The destination is randomized as well.
  var dst: doubleWidth(T)
  var lhs: doubleWidth(T)
  var rhs: doubleWidth(T)
  # Draw order (destination, then operands) is kept fixed
  random_unsafe(rng, dst, T)
  random_unsafe(rng, lhs, T)
  random_unsafe(rng, rhs, T)
  bench("Substraction 2x", $doubleWidth(T), iters):
    dst.diff(lhs, rhs)

proc mul2xBench*(rLen, aLen, bLen: static int, iters: int) =
  ## Benchmark `prod` of a random `BigInt[aLen]` by a random `BigInt[bLen]`
  ## into a `BigInt[rLen]`, repeated `iters` times.
  let
    lhs = random_unsafe(rng, BigInt[aLen])
    rhs = random_unsafe(rng, BigInt[bLen])
  var dst: BigInt[rLen]
  # Second column of the report line, e.g. "768 <- 384 x 384"
  let sizes = $rLen & " <- " & $aLen & " x " & $bLen
  bench("Multiplication", sizes, iters):
    dst.prod(lhs, rhs)

proc square2xBench*(rLen, aLen: static int, iters: int) =
  ## Benchmark `square` of a random `BigInt[aLen]` into a `BigInt[rLen]`,
  ## repeated `iters` times.
  let operand = random_unsafe(rng, BigInt[aLen])
  var dst: BigInt[rLen]
  # Second column of the report line, e.g. "768 <- 384²"
  let sizes = $rLen & " <- " & $aLen & "²"
  bench("Squaring", sizes, iters):
    dst.square(operand)

proc reduce2x*(T: typedesc, iters: int) =
  ## Benchmark `reduce` from a random `doubleWidth(T)` element into a `T`,
  ## repeated `iters` times.
  var wide: doubleWidth(T)
  random_unsafe(rng, wide, T)
  var dst: T

  let sizes = $T & " <- " & $doubleWidth(T)
  bench("Reduce 2x-width", sizes, iters):
    dst.reduce(wide)

proc main() =
  ## Run every benchmark of this file on BLS12-381 (`Fp[BLS12_381]` and
  ## 384/768-bit `BigInt`s), framed by two separator lines.
  const NumIters = 10_000_000

  separator()
  # Single-width additions and subtractions
  sumNoReduce(Fp[BLS12_381], NumIters)
  diffNoReduce(Fp[BLS12_381], NumIters)
  sum(Fp[BLS12_381], NumIters)
  diff(Fp[BLS12_381], NumIters)
  # Double-width subtractions
  diff2x(Fp[BLS12_381], NumIters)
  diff2xNoReduce(Fp[BLS12_381], NumIters)
  # Multi-precision multiplication, squaring and reduction
  mul2xBench(768, 384, 384, NumIters)
  square2xBench(768, 384, NumIters)
  reduce2x(Fp[BLS12_381], NumIters)
  separator()
  separator()

# Entry point: run the benchmarks, then print the explanatory notes
main()
notes()
Double-width tower extension part 1 (#72) * Implement double-width field multiplication for double-width towering * Fp2 mul acceleration via double-width lazy reduction (pure Nim) * Inline assembly for basic add and sub * Use 2 registers instead of 12+ for ASM conditional copy * Prepare assembly for extended multiprecision multiplication support * Add assembly for mul * initial implementation of assembly reduction * stash current progress of assembly reduction * Fix clobbering issue, only P256 comparison remain buggy * Fix asm montgomery reduction for NIST P256 as well * MULX/ADCX/ADOX multi-precision multiplication * MULX/ADCX/ADOX reduction v1 * Add (deactivated) assembly for double-width substraction + rework benches * Add bench to nimble and deactivate double-width for now. slower than classic * Fix x86-32 running out of registers for mul * Clang needs to be at v9 to support flag output constraints (Xcode 11.4.2 / OSX Catalina) * 32-bit doesn't have enough registers for ASM mul * Fix again Travis Clang 9 issues * LLVM 9 is not whitelisted in travis * deactivated assembler with travis clang * syntax error * another * ... * missing space, yeah ... 2020-08-20 10:21:39 +02:00			`# Constantine`
			`# Copyright (c) 2018-2019 Status Research & Development GmbH`
			`# Copyright (c) 2020-Present Mamy André-Ratsimbazafy`
			`# Licensed and distributed under either of`
			`# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).`
			`# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).`
			`# at your option. This file may not be copied, modified, or distributed except according to those terms.`

			`# ############################################################`
			`#`
			`# Benchmark of finite fields`
			`#`
			`# ############################################################`

			`import`
			`# Internals`
			`../constantine/config/[curves, common],`
			`../constantine/arithmetic,`
			`../constantine/towers,`
			`# Helpers`
			`../helpers/[prng_unsafe, static_for],`
			`./platforms,`
			`# Standard library`
			`std/[monotimes, times, strformat, strutils, macros]`

			`var rng: RngState`
			`let seed = uint32(getTime().toUnix() and (1'i64 shl 32 - 1)) # unixTime mod 2^32`
			`rng.seed(seed)`
			`echo "bench xoshiro512** seed: ", seed`

			`# warmup`
			`proc warmup*() =`
			`# Warmup - make sure cpu is on max perf`
			`let start = cpuTime()`
			`var foo = 123`
			`for i in 0 ..< 300_000_000:`
			`foo += i*i mod 456`
			`foo = foo mod 789`

			`# Compiler shouldn't optimize away the results as cpuTime rely on sideeffects`
			`let stop = cpuTime()`
			`echo &"Warmup: {stop - start:>4.4f} s, result {foo} (displayed to avoid compiler optimizing warmup away)\n"`

			`warmup()`

			`when defined(gcc):`
			`echo "\nCompiled with GCC"`
			`elif defined(clang):`
			`echo "\nCompiled with Clang"`
			`elif defined(vcc):`
			`echo "\nCompiled with MSVC"`
			`elif defined(icc):`
			`echo "\nCompiled with ICC"`
			`else:`
			`echo "\nCompiled with an unknown compiler"`

			`echo "Optimization level => "`
			`echo " no optimization: ", not defined(release)`
			`echo " release: ", defined(release)`
			`echo " danger: ", defined(danger)`
			`echo " inline assembly: ", UseASM_X86_64`

			`when (sizeof(int) == 4) or defined(Constantine32):`
			`echo "⚠️ Warning: using Constantine with 32-bit limbs"`
			`else:`
			`echo "Using Constantine with 64-bit limbs"`

			`when SupportsCPUName:`
			`echo "Running on ", cpuName(), ""`

			`when SupportsGetTicks:`
			`echo "\n⚠️ Cycles measurements are approximate and use the CPU nominal clock: Turbo-Boost and overclocking will skew them."`
			`echo "i.e. a 20% overclock will be about 20% off (assuming no dynamic frequency scaling)"`

			`echo "\n=================================================================================================================\n"`

			`proc separator*() =`
			`echo "-".repeat(145)`

			`proc report(op, field: string, start, stop: MonoTime, startClk, stopClk: int64, iters: int) =`
			`let ns = inNanoseconds((stop-start) div iters)`
			`let throughput = 1e9 / float64(ns)`
			`when SupportsGetTicks:`
			`echo &"{op:<28} {field:<40} {throughput:>15.3f} ops/s {ns:>9} ns/op {(stopClk - startClk) div iters:>9} CPU cycles (approx)"`
			`else:`
			`echo &"{op:<28} {field:<40} {throughput:>15.3f} ops/s {ns:>9} ns/op"`

			`proc notes*() =`
			`echo "Notes:"`
			`echo " - Compilers:"`
			`echo " Compilers are severely limited on multiprecision arithmetic."`
			`echo " Inline Assembly is used by default (nimble bench_fp)."`
			`echo " Bench without assembly can use \"nimble bench_fp_gcc\" or \"nimble bench_fp_clang\"."`
			`echo " GCC is significantly slower than Clang on multiprecision arithmetic due to catastrophic handling of carries."`
			`echo " - The simplest operations might be optimized away by the compiler."`
			`echo " - Fast Squaring and Fast Multiplication are possible if there are spare bits in the prime representation (i.e. the prime uses 254 bits out of 256 bits)"`

			`template bench(op: string, desc: string, iters: int, body: untyped): untyped =`
			`let start = getMonotime()`
			`when SupportsGetTicks:`
			`let startClk = getTicks()`
			`for _ in 0 ..< iters:`
			`body`
			`when SupportsGetTicks:`
			`let stopClk = getTicks()`
			`let stop = getMonotime()`

			`when not SupportsGetTicks:`
			`let startClk = -1'i64`
			`let stopClk = -1'i64`

			`report(op, desc, start, stop, startClk, stopClk, iters)`

			`func random_unsafe(rng: var RngState, a: var FpDbl, Base: typedesc) =`
			`## Initialize a standalone Double-Width field element`
			`## we don't reduce it modulo p², this is only used for benchmark`
			`let aHi = rng.random_unsafe(Base)`
			`let aLo = rng.random_unsafe(Base)`
			`for i in 0 ..< aLo.mres.limbs.len:`
			`a.limbs2x[i] = aLo.mres.limbs[i]`
			`for i in 0 ..< aHi.mres.limbs.len:`
			`a.limbs2x[aLo.mres.limbs.len+i] = aHi.mres.limbs[i]`

			`proc sumNoReduce(T: typedesc, iters: int) =`
			`var r: T`
			`let a = rng.random_unsafe(T)`
			`let b = rng.random_unsafe(T)`
			`bench("Addition no reduce", $T, iters):`
			`r.sumNoReduce(a, b)`

			`proc sum(T: typedesc, iters: int) =`
			`var r: T`
			`let a = rng.random_unsafe(T)`
			`let b = rng.random_unsafe(T)`
			`bench("Addition", $T, iters):`
			`r.sum(a, b)`

			`proc diffNoReduce(T: typedesc, iters: int) =`
			`var r: T`
			`let a = rng.random_unsafe(T)`
			`let b = rng.random_unsafe(T)`
			`bench("Substraction no reduce", $T, iters):`
			`r.diffNoReduce(a, b)`

			`proc diff(T: typedesc, iters: int) =`
			`var r: T`
			`let a = rng.random_unsafe(T)`
			`let b = rng.random_unsafe(T)`
			`bench("Substraction", $T, iters):`
			`r.diff(a, b)`

			`proc diff2xNoReduce(T: typedesc, iters: int) =`
			`var r, a, b: doubleWidth(T)`
			`rng.random_unsafe(r, T)`
			`rng.random_unsafe(a, T)`
			`rng.random_unsafe(b, T)`
			`bench("Substraction 2x no reduce", $doubleWidth(T), iters):`
			`r.diffNoReduce(a, b)`

			`proc diff2x(T: typedesc, iters: int) =`
			`var r, a, b: doubleWidth(T)`
			`rng.random_unsafe(r, T)`
			`rng.random_unsafe(a, T)`
			`rng.random_unsafe(b, T)`
			`bench("Substraction 2x", $doubleWidth(T), iters):`
			`r.diff(a, b)`

			`proc mul2xBench*(rLen, aLen, bLen: static int, iters: int) =`
			`var r: BigInt[rLen]`
			`let a = rng.random_unsafe(BigInt[aLen])`
			`let b = rng.random_unsafe(BigInt[bLen])`
			`bench("Multiplication", $rLen & " <- " & $aLen & " x " & $bLen, iters):`
			`r.prod(a, b)`

FpDbl revisited (#144) - 7% perf improvement everywhere, up to 30% in double-width primitives * reorg mul -> limbs_double_width, ConstantineASM CttASM * Implement squaring specialized scalar path (22% faster than mul) * Implement "portable" assembly for squaring * stash part of the changes * Reorg montgomery reduction - prepare to introduce Comba optimization * Implement comba Montgomery reduce (but it's slower!) * rename t -> a * 30% performance improvement by avoiding toOpenArray! * variable renaming * Fix 32-bit imports * slightly better assembly for sub2x * There is an annoying bottleneck * use out-of-place Fp assembly instead of in-place * diffAlias is unneeded now * cosmetic * speedup fpDbl sub by 20% * Fix Fp2 -> Fp6 -> Fp12 towering. It seems 5% faster * Stash ADCX/ADOX squaring 2021-02-01 03:52:27 +01:00			`proc square2xBench*(rLen, aLen: static int, iters: int) =`
			`var r: BigInt[rLen]`
			`let a = rng.random_unsafe(BigInt[aLen])`
			`bench("Squaring", $rLen & " <- " & $aLen & "²", iters):`
			`r.square(a)`

Double-width tower extension part 1 (#72) * Implement double-width field multiplication for double-width towering * Fp2 mul acceleration via double-width lazy reduction (pure Nim) * Inline assembly for basic add and sub * Use 2 registers instead of 12+ for ASM conditional copy * Prepare assembly for extended multiprecision multiplication support * Add assembly for mul * initial implementation of assembly reduction * stash current progress of assembly reduction * Fix clobbering issue, only P256 comparison remain buggy * Fix asm montgomery reduction for NIST P256 as well * MULX/ADCX/ADOX multi-precision multiplication * MULX/ADCX/ADOX reduction v1 * Add (deactivated) assembly for double-width substraction + rework benches * Add bench to nimble and deactivate double-width for now. slower than classic * Fix x86-32 running out of registers for mul * Clang needs to be at v9 to support flag output constraints (Xcode 11.4.2 / OSX Catalina) * 32-bit doesn't have enough registers for ASM mul * Fix again Travis Clang 9 issues * LLVM 9 is not whitelisted in travis * deactivated assembler with travis clang * syntax error * another * ... * missing space, yeah ... 2020-08-20 10:21:39 +02:00			`proc reduce2x*(T: typedesc, iters: int) =`
			`var r: T`
			`var t: doubleWidth(T)`
			`rng.random_unsafe(t, T)`

			`bench("Reduce 2x-width", $T & " <- " & $doubleWidth(T), iters):`
			`r.reduce(t)`

			`proc main() =`
			`separator()`
			`sumNoReduce(Fp[BLS12_381], iters = 10_000_000)`
			`diffNoReduce(Fp[BLS12_381], iters = 10_000_000)`
			`sum(Fp[BLS12_381], iters = 10_000_000)`
			`diff(Fp[BLS12_381], iters = 10_000_000)`
			`diff2x(Fp[BLS12_381], iters = 10_000_000)`
			`diff2xNoReduce(Fp[BLS12_381], iters = 10_000_000)`
			`mul2xBench(768, 384, 384, iters = 10_000_000)`
FpDbl revisited (#144) - 7% perf improvement everywhere, up to 30% in double-width primitives * reorg mul -> limbs_double_width, ConstantineASM CttASM * Implement squaring specialized scalar path (22% faster than mul) * Implement "portable" assembly for squaring * stash part of the changes * Reorg montgomery reduction - prepare to introduce Comba optimization * Implement comba Montgomery reduce (but it's slower!) * rename t -> a * 30% performance improvement by avoiding toOpenArray! * variable renaming * Fix 32-bit imports * slightly better assembly for sub2x * There is an annoying bottleneck * use out-of-place Fp assembly instead of in-place * diffAlias is unneeded now * cosmetic * speedup fpDbl sub by 20% * Fix Fp2 -> Fp6 -> Fp12 towering. It seems 5% faster * Stash ADCX/ADOX squaring 2021-02-01 03:52:27 +01:00			`square2xBench(768, 384, iters = 10_000_000)`
Double-width tower extension part 1 (#72) * Implement double-width field multiplication for double-width towering * Fp2 mul acceleration via double-width lazy reduction (pure Nim) * Inline assembly for basic add and sub * Use 2 registers instead of 12+ for ASM conditional copy * Prepare assembly for extended multiprecision multiplication support * Add assembly for mul * initial implementation of assembly reduction * stash current progress of assembly reduction * Fix clobbering issue, only P256 comparison remain buggy * Fix asm montgomery reduction for NIST P256 as well * MULX/ADCX/ADOX multi-precision multiplication * MULX/ADCX/ADOX reduction v1 * Add (deactivated) assembly for double-width substraction + rework benches * Add bench to nimble and deactivate double-width for now. slower than classic * Fix x86-32 running out of registers for mul * Clang needs to be at v9 to support flag output constraints (Xcode 11.4.2 / OSX Catalina) * 32-bit doesn't have enough registers for ASM mul * Fix again Travis Clang 9 issues * LLVM 9 is not whitelisted in travis * deactivated assembler with travis clang * syntax error * another * ... * missing space, yeah ... 2020-08-20 10:21:39 +02:00			`reduce2x(Fp[BLS12_381], iters = 10_000_000)`
			`separator()`

			`main()`
			`notes()`