From 1fdb1df80ac2288cbdb3ad9e89c0d57ab5b489da Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mamy=20Andr=C3=A9-Ratsimbazafy?= <mamy_github@numforge.co>
Date: Sat, 29 Feb 2020 19:36:35 +0100
Subject: [PATCH] Add benchmark clock timers

---
 benchmarks/bls12_381_fp.nim | 12 ++++++++-
 benchmarks/timers.nim       | 49 +++++++++++++++++++++++++++++++++++++
 2 files changed, 60 insertions(+), 1 deletion(-)
 create mode 100644 benchmarks/timers.nim

diff --git a/benchmarks/bls12_381_fp.nim b/benchmarks/bls12_381_fp.nim
index cf6bd40..b96cec2 100644
--- a/benchmarks/bls12_381_fp.nim
+++ b/benchmarks/bls12_381_fp.nim
@@ -25,12 +25,16 @@ import
   ../constantine/config/[common, curves],
   ../constantine/arithmetic/[bigints_checked, finite_fields],
   ../constantine/io/[io_bigints, io_fields],
-  random, std/monotimes, times, strformat
+  random, std/monotimes, times, strformat,
+  ./timers
 
 const Iters = 1_000_000
 
 randomize(1234)
 
+echo "\n⚠️ Measurements are approximate and use the CPU nominal clock: Turbo-Boost and overclocking will skew them."
+echo "==========================================================================================================\n"
+
 proc addBench() =
   var r, x, y: Fp[BLS12_381]
   # BN254 field modulus
@@ -39,12 +43,15 @@ proc addBench() =
   y.fromHex("0x1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffffffffaaa9")
 
   let start = getMonotime()
+  let startClk = getTicks()
   for _ in 0 ..< Iters:
     x += y
+  let stopClk = getTicks()
   let stop = getMonotime()
 
   echo &"Time for {Iters} additions in 𝔽p (constant-time 381-bit): {inMilliseconds(stop-start)} ms"
   echo &"Time for 1 addition in 𝔽p ==> {inNanoseconds((stop-start) div Iters)} ns"
+  echo &"Cycles per addition 𝔽p ==> {(stopClk - startClk) div Iters} cycles"
 
 addBench()
 
@@ -56,11 +63,14 @@ proc mulBench() =
   y.fromHex("0x1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffffffffaaa9")
 
   let start = getMonotime()
+  let startClk = getTicks()
   for _ in 0 ..< Iters:
     r.prod(x, y)
+  let stopClk = getTicks()
   let stop = getMonotime()
 
   echo &"Time for {Iters} multiplications 𝔽p (constant-time 381-bit): {inMilliseconds(stop-start)} ms"
   echo &"Time for 1 multiplication 𝔽p ==> {inNanoseconds((stop-start) div Iters)} ns"
+  echo &"Cycles per multiplication 𝔽p ==> {(stopClk - startClk) div Iters} cycles"
 
 mulBench()
diff --git a/benchmarks/timers.nim b/benchmarks/timers.nim
new file mode 100644
index 0000000..45d415f
--- /dev/null
+++ b/benchmarks/timers.nim
@@ -0,0 +1,49 @@
+when defined(i386) or defined(amd64):
+  # Quoted from a comment in the Linux kernel source:
+  #
+  # The RDTSC instruction is not ordered relative to memory
+  # access.  The Intel SDM and the AMD APM are both vague on this
+  # point, but empirically an RDTSC instruction can be
+  # speculatively executed before prior loads.  An RDTSC
+  # immediately after an appropriate barrier appears to be
+  # ordered as a normal load, that is, it provides the same
+  # ordering guarantees as reading from a global memory location
+  # that some other imaginary CPU is updating continuously with a
+  # time stamp.
+  #
+  # See also Intel's white paper on benchmarking code execution times:
+  # https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/ia-32-ia-64-benchmark-code-execution-paper.pdf
+  when not defined(vcc):
+    when defined(amd64):
+      proc getTicks*(): int64 {.inline.} = # x86-64: lfence, then RDTSC, as one 64-bit count
+        var lo, hi: int64 # asm outputs below: lo <- a register, hi <- d register
+        # TODO: Provide a compile-time flag for RDTSCP support
+        #       and use it instead of lfence + RDTSC
+        {.emit: """asm volatile(
+          "lfence\n"
+          "rdtsc\n"
+          : "=a"(`lo`), "=d"(`hi`)
+          :
+          : "memory"
+        );""".}
+        return (hi shl 32) or lo # recombine the two halves into a single tick value
+    else:
+      proc getTicks*(): int64 {.inline.} = # 32-bit x86: lfence, then RDTSC
+        # The A constraint binds the 64-bit result to the EDX:EAX pair RDTSC writes.
+        # TODO: add a compile-time flag to use RDTSCP instead of lfence + RDTSC
+        {.emit: """asm volatile(
+          "lfence\n"
+          "rdtsc\n"
+          : "=A"(`result`)
+          :
+          : "memory"
+        );""".}
+  else:
+    proc rdtsc(): int64 {.sideeffect, importc: "__rdtsc", header: "<intrin.h>".}
+    proc lfence() {.importc: "_mm_lfence", header: "<intrin.h>".}
+    # MSVC path: issue the load fence, then read the counter, both via intrinsics.
+    proc getTicks*(): int64 {.inline.} =
+      lfence()
+      return rdtsc()
+else:
+  {.error: "getticks is not supported on this CPU architecture".}