From 1fdb1df80ac2288cbdb3ad9e89c0d57ab5b489da Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mamy=20Andr=C3=A9-Ratsimbazafy?=
Date: Sat, 29 Feb 2020 19:36:35 +0100
Subject: [PATCH] Add benchmark clock timers

---
 benchmarks/bls12_381_fp.nim | 12 +++++++-
 benchmarks/timers.nim       | 55 +++++++++++++++++++++++++++++++++++++
 2 files changed, 66 insertions(+), 1 deletion(-)
 create mode 100644 benchmarks/timers.nim

diff --git a/benchmarks/bls12_381_fp.nim b/benchmarks/bls12_381_fp.nim
index cf6bd40..b96cec2 100644
--- a/benchmarks/bls12_381_fp.nim
+++ b/benchmarks/bls12_381_fp.nim
@@ -25,12 +25,16 @@ import
   ../constantine/config/[common, curves],
   ../constantine/arithmetic/[bigints_checked, finite_fields],
   ../constantine/io/[io_bigints, io_fields],
-  random, std/monotimes, times, strformat
+  random, std/monotimes, times, strformat,
+  ./timers
 
 const Iters = 1_000_000
 
 randomize(1234)
 
+echo "\n⚠️ Measurements are approximate and use the CPU nominal clock: Turbo-Boost and overclocking will skew them."
+echo "==========================================================================================================\n"
+
 proc addBench() =
   var r, x, y: Fp[BLS12_381]
   # BN254 field modulus
@@ -39,12 +43,15 @@ proc addBench() =
   y.fromHex("0x1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffffffffaaa9")
 
   let start = getMonotime()
+  let startClk = getTicks()
   for _ in 0 ..< Iters:
     x += y
+  let stopClk = getTicks()
   let stop = getMonotime()
 
   echo &"Time for {Iters} additions in 𝔽p (constant-time 381-bit): {inMilliseconds(stop-start)} ms"
   echo &"Time for 1 addition in 𝔽p ==> {inNanoseconds((stop-start) div Iters)} ns"
+  echo &"Cycles per addition 𝔽p ==> {(stopClk - startClk) div Iters} cycles"
 
 addBench()
 
@@ -56,11 +63,14 @@ proc mulBench() =
   y.fromHex("0x1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffffffffaaa9")
 
   let start = getMonotime()
+  let startClk = getTicks()
   for _ in 0 ..< Iters:
     r.prod(x, y)
+  let stopClk = getTicks()
   let stop = getMonotime()
 
   echo &"Time for {Iters} multiplications 𝔽p (constant-time 381-bit): {inMilliseconds(stop-start)} ms"
   echo &"Time for 1 multiplication 𝔽p ==> {inNanoseconds((stop-start) div Iters)} ns"
+  echo &"Cycles per multiplication 𝔽p ==> {(stopClk - startClk) div Iters} cycles"
 
 mulBench()
diff --git a/benchmarks/timers.nim b/benchmarks/timers.nim
new file mode 100644
index 0000000..45d415f
--- /dev/null
+++ b/benchmarks/timers.nim
@@ -0,0 +1,55 @@
+when defined(i386) or defined(amd64):
+  # From Linux
+  #
+  # The RDTSC instruction is not ordered relative to memory
+  # access. The Intel SDM and the AMD APM are both vague on this
+  # point, but empirically an RDTSC instruction can be
+  # speculatively executed before prior loads. An RDTSC
+  # immediately after an appropriate barrier appears to be
+  # ordered as a normal load, that is, it provides the same
+  # ordering guarantees as reading from a global memory location
+  # that some other imaginary CPU is updating continuously with a
+  # time stamp.
+  #
+  # See also Intel's white paper on benchmarking code execution times
+  # https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/ia-32-ia-64-benchmark-code-execution-paper.pdf
+  when not defined(vcc):
+    when defined(amd64):
+      proc getTicks*(): int64 {.inline.} =
+        ## Reads the CPU time-stamp counter with LFENCE + RDTSC.
+        var lo, hi: int64
+        # TODO: Provide a compile-time flag for RDTSCP support
+        #       and use it instead of lfence + RDTSC
+        {.emit: """asm volatile(
+          "lfence\n"
+          "rdtsc\n"
+          : "=a"(`lo`), "=d"(`hi`)
+          :
+          : "memory"
+        );""".}
+        # RDTSC leaves the low half in EAX and the high half in EDX.
+        return (hi shl 32) or lo
+    else:
+      proc getTicks*(): int64 {.inline.} =
+        ## Reads the CPU time-stamp counter with LFENCE + RDTSC.
+        # TODO: Provide a compile-time flag for RDTSCP support
+        #       and use it instead of lfence + RDTSC
+        # The "A" constraint binds the EDX:EAX pair written by RDTSC to
+        # the 64-bit result; "=a" alone would drop EDX.
+        {.emit: """asm volatile(
+          "lfence\n"
+          "rdtsc\n"
+          : "=A"(`result`)
+          :
+          : "memory"
+        );""".}
+  else:
+    proc rdtsc(): int64 {.sideeffect, importc: "__rdtsc", header: "<intrin.h>".}
+    proc lfence() {.importc: "_mm_lfence", header: "<intrin.h>".}
+
+    proc getTicks*(): int64 {.inline.} =
+      ## Reads the CPU time-stamp counter with LFENCE + RDTSC intrinsics.
+      lfence()
+      return rdtsc()
+else:
+  {.error: "getticks is not supported on this CPU architecture".}