nimbus-eth2/nbench/platforms/x86.nim

# beacon_chain
# Copyright (c) 2018 Status Research & Development GmbH
# Licensed and distributed under either of
#   * MIT license (license terms in the root directory or at https://opensource.org/licenses/MIT).
#   * Apache v2 license (license terms in the root directory or at https://www.apache.org/licenses/LICENSE-2.0).
# at your option. This file may not be copied, modified, or distributed except according to those terms.

# Cpu Name
# -------------------------------------------------------

{.passC:"-std=gnu99".} # TODO may conflict with milagro "-std=c99"

proc cpuID(eaxi, ecxi: int32): tuple[eax, ebx, ecx, edx: int32] =
  when defined(vcc):
    proc cpuidVcc(cpuInfo: ptr int32; functionID: int32)
      {.importc: "__cpuidex", header: "intrin.h".}
    cpuidVcc(addr result.eax, eaxi, ecxi)
  else:
    var (eaxr, ebxr, ecxr, edxr) = (0'i32, 0'i32, 0'i32, 0'i32)
    asm """
      cpuid
      :"=a"(`eaxr`), "=b"(`ebxr`), "=c"(`ecxr`), "=d"(`edxr`)
      :"a"(`eaxi`), "c"(`ecxi`)"""
    (eaxr, ebxr, ecxr, edxr)

proc cpuName*(): string =
  var leaves {.global.} = cast[array[48, char]]([
    cpuID(eaxi = 0x80000002'i32, ecxi = 0),
    cpuID(eaxi = 0x80000003'i32, ecxi = 0),
    cpuID(eaxi = 0x80000004'i32, ecxi = 0)])
  result = $cast[cstring](addr leaves[0])

# Counting cycles
# -------------------------------------------------------

# From Linux
#
# The RDTSC instruction is not ordered relative to memory
# access.  The Intel SDM and the AMD APM are both vague on this
# point, but empirically an RDTSC instruction can be
# speculatively executed before prior loads.  An RDTSC
# immediately after an appropriate barrier appears to be
# ordered as a normal load, that is, it provides the same
# ordering guarantees as reading from a global memory location
# that some other imaginary CPU is updating continuously with a
# time stamp.
#
# From Intel SDM
# https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/ia-32-ia-64-benchmark-code-execution-paper.pdf

proc getTicks*(): int64 {.inline.} =
  when defined(vcc):
    proc rdtsc(): int64 {.sideeffect, importc: "__rdtsc", header: "<intrin.h>".}
    proc lfence() {.importc: "__mm_lfence", header: "<intrin.h>".}

    lfence()
    return rdtsc()

  else:
    when defined(amd64):
      var lo, hi: int64
      # TODO: Provide a compile-time flag for RDTSCP support
      #       and use it instead of lfence + RDTSC
      {.emit: """asm volatile(
        "lfence\n"
        "rdtsc\n"
        : "=a"(`lo`), "=d"(`hi`)
        :
        : "memory"
      );""".}
      return (hi shl 32) or lo
    else: # 32-bit x86
      # TODO: Provide a compile-time flag for RDTSCP support
      #       and use it instead of lfence + RDTSC
      {.emit: """asm volatile(
        "lfence\n"
        "rdtsc\n"
        : "=a"(`result`)
        :
        : "memory"
      );""".}

# Sanity check
# -------------------------------------------------------

when isMainModule:

  import std/[times, monotimes, math, volatile, os]

  block: # CpuName
    echo "Your CPU is:    "
    echo "   ", cpuName()

  block: # Cycle Count
    echo "The cost of an int64 modulo operation on your platform is:"

    # Dealing with compiler optimization on microbenchmarks is hard
    {.pragma: volatile, codegenDecl: "volatile $# $#".}

    proc modNtimes(a, b: int64, N: int) {.noinline.} =
      var c{.volatile.}: int64
      for i in 0 ..< N:
        c.addr.volatileStore(a.unsafeAddr.volatileLoad() mod b.unsafeAddr.volatileLoad())

    let a {.volatile.} = 1000003'i64 # a prime number
    let b {.volatile.} = 10007'i64   # another prime number
    let N {.volatile.} = 3_000_000

    let startMono = getMonoTime()
    let startCycles = getTicks()
    modNtimes(a, b, N)
    let stopCycles = getTicks()
    let stopMono = getMonoTime()


    let elapsedMono = inNanoseconds(stopMono - startMono)
    let elapsedCycles = stopCycles - startCycles
    let timerResolutionGHz = round(elapsedCycles.float32 / elapsedMono.float32, 3)

    echo "   ", (elapsedCycles) div N, " cycles"
    echo "   ", (elapsedMono) div N, " ns/iter"
    echo "   ", timerResolutionGHz, " GHz (timer resolution)"

  block: # CPU Frequency
    discard # TODO, surprisingly this is very complex
Nbench - Flexible benchmarking of Nimbus internals (#641) * nbench PoC * Remove the yaml files from the example scenarios * update README with current status * Add an alternative implementation that uses defer * Forgot to add the old proc body * slots-processing * allow benching state_transition failures * Add Attestations processing (workaround confutils bug: - https://github.com/status-im/nim-confutils/issues/10 - https://github.com/status-im/nim-confutils/issues/11 - https://github.com/status-im/nim-confutils/issues/12 * Add CLI command in the readme * Filter report and add notes about CPU cycles * Report averages * Add debugecho style time/cycle print * Report when we skip BLS and state root verification * Update to 0.9.3 * Generalize scenario parsing * Support all block processing scenarios * parallel bench runner PoC * gitBetter load issues reporting (the load issues were invalid signature and expected to fail) 2019-12-20 16:14:43 +00:00			`# beacon_chain`
			`# Copyright (c) 2018 Status Research & Development GmbH`
			`# Licensed and distributed under either of`
			`# * MIT license (license terms in the root directory or at https://opensource.org/licenses/MIT).`
			`# * Apache v2 license (license terms in the root directory or at https://www.apache.org/licenses/LICENSE-2.0).`
			`# at your option. This file may not be copied, modified, or distributed except according to those terms.`

			`# Cpu Name`
			`# -------------------------------------------------------`

			`{.passC:"-std=gnu99".} # TODO may conflict with milagro "-std=c99"`

			`proc cpuID(eaxi, ecxi: int32): tuple[eax, ebx, ecx, edx: int32] =`
			`when defined(vcc):`
			`proc cpuidVcc(cpuInfo: ptr int32; functionID: int32)`
			`{.importc: "__cpuidex", header: "intrin.h".}`
			`cpuidVcc(addr result.eax, eaxi, ecxi)`
			`else:`
			`var (eaxr, ebxr, ecxr, edxr) = (0'i32, 0'i32, 0'i32, 0'i32)`
			`asm """`
			`cpuid`
			:"=a"(`eaxr`), "=b"(`ebxr`), "=c"(`ecxr`), "=d"(`edxr`)
			:"a"(`eaxi`), "c"(`ecxi`)"""
			`(eaxr, ebxr, ecxr, edxr)`

			`proc cpuName*(): string =`
			`var leaves {.global.} = cast[array[48, char]]([`
			`cpuID(eaxi = 0x80000002'i32, ecxi = 0),`
			`cpuID(eaxi = 0x80000003'i32, ecxi = 0),`
			`cpuID(eaxi = 0x80000004'i32, ecxi = 0)])`
			`result = $cast[cstring](addr leaves[0])`

			`# Counting cycles`
			`# -------------------------------------------------------`

			`# From Linux`
			`#`
			`# The RDTSC instruction is not ordered relative to memory`
			`# access. The Intel SDM and the AMD APM are both vague on this`
			`# point, but empirically an RDTSC instruction can be`
			`# speculatively executed before prior loads. An RDTSC`
			`# immediately after an appropriate barrier appears to be`
			`# ordered as a normal load, that is, it provides the same`
			`# ordering guarantees as reading from a global memory location`
			`# that some other imaginary CPU is updating continuously with a`
			`# time stamp.`
			`#`
			`# From Intel SDM`
			`# https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/ia-32-ia-64-benchmark-code-execution-paper.pdf`

			`proc getTicks*(): int64 {.inline.} =`
			`when defined(vcc):`
			`proc rdtsc(): int64 {.sideeffect, importc: "__rdtsc", header: "<intrin.h>".}`
			`proc lfence() {.importc: "__mm_lfence", header: "<intrin.h>".}`

			`lfence()`
			`return rdtsc()`

			`else:`
			`when defined(amd64):`
			`var lo, hi: int64`
			`# TODO: Provide a compile-time flag for RDTSCP support`
			`# and use it instead of lfence + RDTSC`
			`{.emit: """asm volatile(`
			`"lfence\n"`
			`"rdtsc\n"`
			: "=a"(`lo`), "=d"(`hi`)
			`:`
			`: "memory"`
			`);""".}`
			`return (hi shl 32) or lo`
			`else: # 32-bit x86`
			`# TODO: Provide a compile-time flag for RDTSCP support`
			`# and use it instead of lfence + RDTSC`
			`{.emit: """asm volatile(`
			`"lfence\n"`
			`"rdtsc\n"`
			: "=a"(`result`)
			`:`
			`: "memory"`
			`);""".}`

			`# Sanity check`
			`# -------------------------------------------------------`

			`when isMainModule:`

			`import std/[times, monotimes, math, volatile, os]`

			`block: # CpuName`
			`echo "Your CPU is: "`
			`echo " ", cpuName()`

			`block: # Cycle Count`
			`echo "The cost of an int64 modulo operation on your platform is:"`

			`# Dealing with compiler optimization on microbenchmarks is hard`
			`{.pragma: volatile, codegenDecl: "volatile $# $#".}`

			`proc modNtimes(a, b: int64, N: int) {.noinline.} =`
			`var c{.volatile.}: int64`
			`for i in 0 ..< N:`
			`c.addr.volatileStore(a.unsafeAddr.volatileLoad() mod b.unsafeAddr.volatileLoad())`

			`let a {.volatile.} = 1000003'i64 # a prime number`
			`let b {.volatile.} = 10007'i64 # another prime number`
			`let N {.volatile.} = 3_000_000`

			`let startMono = getMonoTime()`
			`let startCycles = getTicks()`
			`modNtimes(a, b, N)`
			`let stopCycles = getTicks()`
			`let stopMono = getMonoTime()`


			`let elapsedMono = inNanoseconds(stopMono - startMono)`
			`let elapsedCycles = stopCycles - startCycles`
			`let timerResolutionGHz = round(elapsedCycles.float32 / elapsedMono.float32, 3)`

			`echo " ", (elapsedCycles) div N, " cycles"`
			`echo " ", (elapsedMono) div N, " ns/iter"`
			`echo " ", timerResolutionGHz, " GHz (timer resolution)"`

			`block: # CPU Frequency`
			`discard # TODO, surprisingly this is very complex`