Metering (#140)

* Add metering facilities * Metering reporting * Add example report on metering BLS12-381 pairings
2021-01-29 22:21:19 +01:00 · 2021-01-29 22:21:19 +01:00 · b91ec1cb15
parent 95e23339b2
commit b91ec1cb15
8 changed files with 395 additions and 39 deletions
--- a/constantine/arithmetic/finite_fields.nim
+++ b/constantine/arithmetic/finite_fields.nim
@ -70,14 +70,14 @@ func toBig*(src: FF): auto {.noInit, inline.} =
 # Copy
 # ------------------------------------------------------------

-func ccopy*(a: var FF, b: FF, ctl: SecretBool) {.inline.} =
+func ccopy*(a: var FF, b: FF, ctl: SecretBool) {.inline, meter.} =
  ## Constant-time conditional copy
  ## If ctl is true: b is copied into a
  ## if ctl is false: b is not copied and a is unmodified
  ## Time and memory accesses are the same whether a copy occurs or not
  ccopy(a.mres, b.mres, ctl)

-func cswap*(a, b: var FF, ctl: CTBool) {.inline.} =
+func cswap*(a, b: var FF, ctl: CTBool) {.inline, meter.} =
  ## Swap ``a`` and ``b`` if ``ctl`` is true
  ##
  ## Constant-time:
@ -132,7 +132,7 @@ func setOne*(a: var FF) {.inline.} =
  #       Check if the compiler optimizes it away
  a.mres = FF.getMontyOne()

-func `+=`*(a: var FF, b: FF) {.inline.} =
+func `+=`*(a: var FF, b: FF) {.inline, meter.} =
  ## In-place addition modulo p
  when UseASM_X86_64 and a.mres.limbs.len <= 6: # TODO: handle spilling
    addmod_asm(a.mres.limbs, b.mres.limbs, FF.fieldMod().limbs)
@ -141,7 +141,7 @@ func `+=`*(a: var FF, b: FF) {.inline.} =
    overflowed = overflowed or not(a.mres < FF.fieldMod())
    discard csub(a.mres, FF.fieldMod(), overflowed)

-func `-=`*(a: var FF, b: FF) {.inline.} =
+func `-=`*(a: var FF, b: FF) {.inline, meter.} =
  ## In-place substraction modulo p
  when UseASM_X86_64 and a.mres.limbs.len <= 6: # TODO: handle spilling
    submod_asm(a.mres.limbs, b.mres.limbs, FF.fieldMod().limbs)
@ -149,7 +149,7 @@ func `-=`*(a: var FF, b: FF) {.inline.} =
    let underflowed = sub(a.mres, b.mres)
    discard cadd(a.mres, FF.fieldMod(), underflowed)

-func double*(a: var FF) {.inline.} =
+func double*(a: var FF) {.inline, meter.} =
  ## Double ``a`` modulo p
  when UseASM_X86_64 and a.mres.limbs.len <= 6: # TODO: handle spilling
    addmod_asm(a.mres.limbs, a.mres.limbs, FF.fieldMod().limbs)
@ -158,7 +158,7 @@ func double*(a: var FF) {.inline.} =
    overflowed = overflowed or not(a.mres < FF.fieldMod())
    discard csub(a.mres, FF.fieldMod(), overflowed)

-func sum*(r: var FF, a, b: FF) {.inline.} =
+func sum*(r: var FF, a, b: FF) {.inline, meter.} =
  ## Sum ``a`` and ``b`` into ``r`` modulo p
  ## r is initialized/overwritten
  when UseASM_X86_64 and a.mres.limbs.len <= 6: # TODO: handle spilling
@ -169,11 +169,11 @@ func sum*(r: var FF, a, b: FF) {.inline.} =
    overflowed = overflowed or not(r.mres < FF.fieldMod())
    discard csub(r.mres, FF.fieldMod(), overflowed)

-func sumNoReduce*(r: var FF, a, b: FF) {.inline.} =
+func sumNoReduce*(r: var FF, a, b: FF) {.inline, meter.} =
  ## Sum ``a`` and ``b`` into ``r`` without reduction
  discard r.mres.sum(a.mres, b.mres)

-func diff*(r: var FF, a, b: FF) {.inline.} =
+func diff*(r: var FF, a, b: FF) {.inline, meter.} =
  ## Substract `b` from `a` and store the result into `r`.
  ## `r` is initialized/overwritten
  ## Requires r != b
@ -184,7 +184,7 @@ func diff*(r: var FF, a, b: FF) {.inline.} =
    var underflowed = r.mres.diff(a.mres, b.mres)
    discard cadd(r.mres, FF.fieldMod(), underflowed)

-func diffAlias*(r: var FF, a, b: FF) {.inline.} =
+func diffAlias*(r: var FF, a, b: FF) {.inline, meter.} =
  ## Substract `b` from `a` and store the result into `r`.
  ## `r` is initialized/overwritten
  ## Handles r == b
@ -196,12 +196,12 @@ func diffAlias*(r: var FF, a, b: FF) {.inline.} =
    var underflowed = r.mres.diff(a.mres, b.mres)
    discard cadd(r.mres, FF.fieldMod(), underflowed)

-func diffNoReduce*(r: var FF, a, b: FF) {.inline.} =
+func diffNoReduce*(r: var FF, a, b: FF) {.inline, meter.} =
  ## Substract `b` from `a` and store the result into `r`
  ## without reduction
  discard r.mres.diff(a.mres, b.mres)

-func double*(r: var FF, a: FF) {.inline.} =
+func double*(r: var FF, a: FF) {.inline, meter.} =
  ## Double ``a`` into ``r``
  ## `r` is initialized/overwritten
  when UseASM_X86_64 and a.mres.limbs.len <= 6: # TODO: handle spilling
@ -212,16 +212,16 @@ func double*(r: var FF, a: FF) {.inline.} =
    overflowed = overflowed or not(r.mres < FF.fieldMod())
    discard csub(r.mres, FF.fieldMod(), overflowed)

-func prod*(r: var FF, a, b: FF) {.inline.} =
+func prod*(r: var FF, a, b: FF) {.inline, meter.} =
  ## Store the product of ``a`` by ``b`` modulo p into ``r``
  ## ``r`` is initialized / overwritten
  r.mres.montyMul(a.mres, b.mres, FF.fieldMod(), FF.getNegInvModWord(), FF.canUseNoCarryMontyMul())

-func square*(r: var FF, a: FF) {.inline.} =
+func square*(r: var FF, a: FF) {.inline, meter.} =
  ## Squaring modulo p
  r.mres.montySquare(a.mres, FF.fieldMod(), FF.getNegInvModWord(), FF.canUseNoCarryMontySquare())

-func neg*(r: var FF, a: FF) {.inline.} =
+func neg*(r: var FF, a: FF) {.inline, meter.} =
  ## Negate modulo p
  when UseASM_X86_64 and defined(gcc):
    # Clang and every compiler besides GCC
@ -239,11 +239,11 @@ func neg*(r: var FF, a: FF) {.inline.} =
    t.mres.czero(isZero)
    r = t

-func neg*(a: var FF) {.inline.} =
+func neg*(a: var FF) {.inline, meter.} =
  ## Negate modulo p
  a.neg(a)

-func div2*(a: var FF) {.inline.} =
+func div2*(a: var FF) {.inline, meter.} =
  ## Modular division by 2
  a.mres.div2_modular(FF.getPrimePlus1div2())

@ -253,26 +253,26 @@ func div2*(a: var FF) {.inline.} =
 #
 # ############################################################

-func cneg*(r: var FF, a: FF, ctl: SecretBool) =
+func cneg*(r: var FF, a: FF, ctl: SecretBool) {.meter.} =
  ## Constant-time in-place conditional negation
  ## The negation is only performed if ctl is "true"
  r.neg(a)
  r.ccopy(a, not ctl)

-func cneg*(a: var FF, ctl: SecretBool) =
+func cneg*(a: var FF, ctl: SecretBool) {.meter.} =
  ## Constant-time in-place conditional negation
  ## The negation is only performed if ctl is "true"
  var t = a
  a.cneg(t, ctl)

-func cadd*(a: var FF, b: FF, ctl: SecretBool) =
+func cadd*(a: var FF, b: FF, ctl: SecretBool) {.meter.} =
  ## Constant-time in-place conditional addition
  ## The addition is only performed if ctl is "true"
  var t = a
  t += b
  a.ccopy(t, ctl)

-func csub*(a: var FF, b: FF, ctl: SecretBool) =
+func csub*(a: var FF, b: FF, ctl: SecretBool) {.meter.} =
  ## Constant-time in-place conditional substraction
  ## The substraction is only performed if ctl is "true"
  var t = a
@ -365,15 +365,15 @@ func powUnsafeExponent*(a: var FF, exponent: openarray[byte]) {.inline.} =
 # - Those that return a field element
 # - Those that internally allocate a temporary field element

-func `+`*(a, b: FF): FF {.noInit, inline.} =
+func `+`*(a, b: FF): FF {.noInit, inline, meter.} =
  ## Addition modulo p
  result.sum(a, b)

-func `-`*(a, b: FF): FF {.noInit, inline.} =
+func `-`*(a, b: FF): FF {.noInit, inline, meter.} =
  ## Substraction modulo p
  result.diff(a, b)

-func `*`*(a, b: FF): FF {.noInit, inline.} =
+func `*`*(a, b: FF): FF {.noInit, inline, meter.} =
  ## Multiplication modulo p
  ##
  ## It is recommended to assign with {.noInit.}
@ -381,20 +381,20 @@ func `*`*(a, b: FF): FF {.noInit, inline.} =
  ## routine will zero init internally the result.
  result.prod(a, b)

-func `*=`*(a: var FF, b: FF) {.inline.} =
+func `*=`*(a: var FF, b: FF) {.inline, meter.} =
  ## Multiplication modulo p
  a.prod(a, b)

-func square*(a: var FF) {.inline.} =
+func square*(a: var FF) {.inline, meter.} =
  ## Squaring modulo p
  a.mres.montySquare(a.mres, FF.fieldMod(), FF.getNegInvModWord(), FF.canUseNoCarryMontySquare())

-func square_repeated*(r: var FF, num: int) {.inline.} =
+func square_repeated*(r: var FF, num: int) {.inline, meter.} =
  ## Repeated squarings
  for _ in 0 ..< num:
    r.square()

-func square_repeated*(r: var FF, a: FF, num: int) {.inline.} =
+func square_repeated*(r: var FF, a: FF, num: int) {.inline, meter.} =
  ## Repeated squarings
  r.square(a)
  for _ in 1 ..< num:
--- a/constantine/config/common.nim
+++ b/constantine/config/common.nim
@ -13,6 +13,9 @@
 # ############################################################

 import ../primitives
+import ../../metering/tracer
+
+export tracer

 when sizeof(int) == 8 and not defined(Constantine32):
  type
--- a/constantine/pairing/cyclotomic_fp12.nim
+++ b/constantine/pairing/cyclotomic_fp12.nim
@ -8,7 +8,7 @@

 import
  ../primitives,
-  ../config/curves,
+  ../config/[common, curves],
  ../arithmetic,
  ../towers,
  ../isogeny/frobenius
@ -31,7 +31,7 @@ import

 # 𝔽p12 -> Gϕ₁₂ - Mapping to Cyclotomic group
 # ----------------------------------------------------------------
-func finalExpEasy*[C: static Curve](f: var Fp12[C]) =
+func finalExpEasy*[C: static Curve](f: var Fp12[C]) {.meter.} =
  ## Easy part of the final exponentiation
  ##
  ## This maps the result of the Miller loop into the cyclotomic subgroup Gϕ₁₂
@ -119,19 +119,19 @@ func finalExpEasy*[C: static Curve](f: var Fp12[C]) =
 #
 # The result of any pairing is in a cyclotomic subgroup

-func cyclotomic_inv*(a: var Fp12) =
+func cyclotomic_inv*(a: var Fp12) {.meter.} =
  ## Fast inverse for a
  ## `a` MUST be in the cyclotomic subgroup
  ## consequently `a` MUST be unitary
  a.conj()

-func cyclotomic_inv*(r: var Fp12, a: Fp12) =
+func cyclotomic_inv*(r: var Fp12, a: Fp12) {.meter.} =
  ## Fast inverse for a
  ## `a` MUST be in the cyclotomic subgroup
  ## consequently `a` MUST be unitary
  r.conj(a)

-func cyclotomic_square*[C](r: var Fp12[C], a: Fp12[C]) =
+func cyclotomic_square*[C](r: var Fp12[C], a: Fp12[C]) {.meter.} =
  ## Square `a` into `r`
  ## `a` MUST be in the cyclotomic subgroup
  ## consequently `a` MUST be unitary
@ -177,7 +177,7 @@ func cyclotomic_square*[C](r: var Fp12[C], a: Fp12[C]) =
  else:
    {.error: "Not implemented".}

-func cyclotomic_square*[C](a: var Fp12[C]) =
+func cyclotomic_square*[C](a: var Fp12[C]) {.meter.} =
  ## Square `a` into `r`
  ## `a` MUST be in the cyclotomic subgroup
  ## consequently `a` MUST be unitary
@ -225,7 +225,7 @@ func cyclotomic_square*[C](a: var Fp12[C]) =
  else:
    {.error: "Not implemented".}

-func cycl_sqr_repeated*(f: var Fp12, num: int) {.inline.} =
+func cycl_sqr_repeated*(f: var Fp12, num: int) {.inline, meter.} =
  ## Repeated cyclotomic squarings
  for _ in 0 ..< num:
    f.cyclotomic_square()
@ -240,7 +240,7 @@ iterator unpack(scalarByte: byte): bool =
  yield bool((scalarByte and 0b00000010) shr 1)
  yield bool( scalarByte and 0b00000001)

-func cyclotomic_exp*[C](r: var Fp12[C], a: Fp12[C], exponent: BigInt, invert: bool) =
+func cyclotomic_exp*[C](r: var Fp12[C], a: Fp12[C], exponent: BigInt, invert: bool) {.meter.} =
    var eBytes: array[(exponent.bits+7) div 8, byte]
    eBytes.exportRawUint(exponent, bigEndian)

--- a/constantine/pairing/pairing_bls12.nim
+++ b/constantine/pairing/pairing_bls12.nim
@ -7,7 +7,7 @@
 # at your option. This file may not be copied, modified, or distributed except according to those terms.

 import
-  ../config/[curves, type_ff],
+  ../config/[common, curves, type_ff],
  ../towers,
  ../elliptic/[
    ec_shortweierstrass_affine,
@ -49,7 +49,7 @@ func millerLoopGenericBLS12*[C](
       f: var Fp12[C],
       P: ECP_ShortW_Aff[Fp[C], NotOnTwist],
       Q: ECP_ShortW_Aff[Fp2[C], OnTwist]
-     ) =
+     ) {.meter.} =
  ## Generic Miller Loop for BLS12 curve
  ## Computes f{u,Q}(P) with u the BLS curve parameter

@ -133,7 +133,7 @@ func pairing_bls12_reference*[C](
 # Optimized pairing implementation
 # ----------------------------------------------------------------

-func finalExpHard_BLS12*[C](f: var Fp12[C]) =
+func finalExpHard_BLS12*[C](f: var Fp12[C]) {.meter.} =
  ## Hard part of the final exponentiation
  ## Specialized for BLS12 curves
  ##
@ -191,7 +191,7 @@ func finalExpHard_BLS12*[C](f: var Fp12[C]) =
 func pairing_bls12*[C](
       gt: var Fp12[C],
       P: ECP_ShortW_Proj[Fp[C], NotOnTwist],
-       Q: ECP_ShortW_Proj[Fp2[C], OnTwist]) =
+       Q: ECP_ShortW_Proj[Fp2[C], OnTwist]) {.meter.} =
  ## Compute the optimal Ate Pairing for BLS12 curves
  ## Input: P ∈ G1, Q ∈ G2
  ## Output: e(P, Q) ∈ Gt
--- a/metering/README.md
+++ b/metering/README.md
@ -0,0 +1,97 @@
+# Metering
+
+## Overview
+
+This folder provides tools to accurately measure the cost of high-level primitives in terms of basic operations (field mul, add, inv, ...).
+
+### For optimization
+
+Metering allows choosing the best algorithm or representation when several are available, for example choosing between affine, projective or Jacobian coordinates for elliptic curves. Some choices might also be faster for certain fields (Fp or Fp2) or certain curves.
+
+It also helps focus tuning efforts on the operations that underlie the high-level building blocks. This is not a replacement for profiling but a complement.
+Metering allows reasoning at the complexity and algorithmic level while profiling allows reasoning at the hardware and timing level.
+
+### For blockchains
+
+Metering is important for blockchains to correctly price VM opcodes. Pricing too low would allow denial-of-service attacks; pricing too high would disincentivize their use.
+
+Note: this only takes into account the number of operations
+but does not take into account stack usage for temporaries.
+
+## Measuring cost
+
+The file m_pairings has a minimal example for the current state.
+
+```Nim
+var rng*: RngState
+let seed = uint32(getTime().toUnix() and (1'i64 shl 32 - 1)) # unixTime mod 2^32
+rng.seed(seed)
+echo "bench xoshiro512** seed: ", seed
+
+func random_point*(rng: var RngState, EC: typedesc): EC {.noInit.} =
+  result = rng.random_unsafe(EC)
+  result.clearCofactorReference()
+
+proc pairingBLS12Meter*(C: static Curve) =
+  let
+    P = rng.random_point(ECP_ShortW_Proj[Fp[C], NotOnTwist])
+    Q = rng.random_point(ECP_ShortW_Proj[Fp2[C], OnTwist])
+
+  var f: Fp12[C]
+
+  resetMetering()
+  f.pairing_bls12(P, Q)
+
+resetMetering()
+pairingBLS12Meter(BLS12_381)
+const flags = if UseASM_X86_64 or UseASM_X86_32: "UseAssembly" else: "NoAssembly"
+reportCli(Metrics, flags)
+```
+
+After compiling with
+```
+nim c -r --hints:off --warnings:off --verbosity:0 -d:danger -d:CttMeter --outdir:build metering/m_pairings.nim
+```
+
+We get
+
+```
+bench xoshiro512** seed: 1611954740
+
+CPU: Intel(R) Core(TM) i9-9980XE CPU @ 3.00GHz
+The CPU Cycle Count is indicative only. It cannot be used to compare across systems, works at your CPU nominal frequency and is sensitive to overclocking, throttling and frequency scaling (powersaving and Turbo Boost).
+
+
+|--------------------------------------------------|--------------|--------------------|---------------|-----------------|--------------------------|--------------------------|
+|                    Procedures                    |  # of Calls  | Throughput (ops/s) |   Time (µs)   |  Avg Time (µs)  | CPU cycles (in billions) | Avg cycles (in billions) |
+|                   UseAssembly                    |              |                    |               |                 |     indicative only      |     indicative only      |
+|--------------------------------------------------|--------------|--------------------|---------------|-----------------|--------------------------|--------------------------|
+|`+=`*                                             |         11473|                 inf|          0.000|            0.000|
+|`-=`*                                             |         18603|   2067000000000.000|          0.009|            0.000|
+|double*                                           |          7212|   2404000000000.000|          0.003|            0.000|
+|sum*                                              |         21058|   7019333333333.333|          0.003|            0.000|
+|diff*                                             |          8884|   2961333333333.333|          0.003|            0.000|
+|diffAlias*                                        |            10|                 inf|          0.000|            0.000|
+|double*                                           |          4186|                 inf|          0.000|            0.000|
+|prod*                                             |         14486|   1609555555555.555|          0.009|            0.000|
+|square*                                           |            16|                 inf|          0.000|            0.000|
+|neg*                                              |          2093|                 inf|          0.000|            0.000|
+|neg*                                              |          2050|                 inf|          0.000|            0.000|
+|div2*                                             |           512|                 inf|          0.000|            0.000|
+|`*=`*                                             |          5584|    620444444444.444|          0.009|            0.000|
+|square*                                           |          1116|                 inf|          0.000|            0.000|
+|square_repeated*                                  |           126|      1235294117.647|          0.102|            0.001|
+|finalExpEasy*                                     |             1|         5555555.556|          0.180|            0.180|
+|cyclotomic_inv*                                   |             5|      1000000000.000|          0.005|            0.001|
+|cyclotomic_inv*                                   |             1|                 inf|          0.000|            0.000|
+|cyclotomic_square*                                |             6|        70588235.294|          0.085|            0.014|
+|cyclotomic_square*                                |           309|        70499657.769|          4.383|            0.014|
+|cycl_sqr_repeated*                                |            25|         5556790.398|          4.499|            0.180|
+|millerLoopGenericBLS12*                           |             1|          279251.606|          3.581|            3.581|
+|finalExpHard_BLS12*                               |             1|          178475.817|          5.603|            5.603|
+|pairing_bls12*                                    |             1|          105196.718|          9.506|            9.506|
+|--------------------------------------------------|--------------|--------------------|---------------|-----------------|--------------------------|--------------------------|
+```
+
+The reporting and tracing will be improved to collect the fields and curves involved.
+It's already useful to know how many base field operations are necessary.
--- a/metering/m_pairings.nim
+++ b/metering/m_pairings.nim
@ -0,0 +1,42 @@
+# Constantine
+# Copyright (c) 2018-2019    Status Research & Development GmbH
+# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
+# Licensed and distributed under either of
+#   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
+#   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
+# at your option. This file may not be copied, modified, or distributed except according to those terms.
+
+import
+  std/times,
+  ./reports, ./tracer,
+  ../constantine/config/[common, curves],
+  ../constantine/[arithmetic, towers],
+  ../constantine/elliptic/ec_shortweierstrass_projective,
+  ../constantine/hash_to_curve/cofactors,
+  ../constantine/pairing/pairing_bls12,
+  # Helpers
+  ../helpers/prng_unsafe
+
+# Module-level PRNG used to draw the random pairing inputs below.
+# It is seeded from the current Unix time and the seed is printed.
+var rng*: RngState
+let seed = uint32(getTime().toUnix() and (1'i64 shl 32 - 1)) # unixTime mod 2^32
+rng.seed(seed)
+echo "bench xoshiro512** seed: ", seed
+
+func random_point*(rng: var RngState, EC: typedesc): EC {.noInit.} =
+  ## Draw a point of type `EC` from `rng` with `random_unsafe`,
+  ## then run `clearCofactorReference` on it before returning it.
+  result = random_unsafe(rng, EC)
+  clearCofactorReference(result)
+
+proc pairingBLS12Meter*(C: static Curve) =
+  ## Run one `pairing_bls12` on curve `C` with freshly drawn random inputs.
+  ##
+  ## The metering counters are reset right before the pairing call, after
+  ## the random points have been generated.
+  let g1Point = random_point(rng, ECP_ShortW_Proj[Fp[C], NotOnTwist])
+  let g2Point = random_point(rng, ECP_ShortW_Proj[Fp2[C], OnTwist])
+
+  var gt: Fp12[C]
+
+  resetMetering()
+  pairing_bls12(gt, g1Point, g2Point)
+
+# Driver: meter a single BLS12-381 pairing and print the report.
+resetMetering()
+pairingBLS12Meter(BLS12_381)
+const flags = if UseASM_X86_64 or UseASM_X86_32: "UseAssembly" else: "NoAssembly" # label passed to the report
+reportCli(Metrics, flags)
--- a/metering/reports.nim
+++ b/metering/reports.nim
@ -0,0 +1,62 @@
+# Constantine
+# Copyright (c) 2018-2019    Status Research & Development GmbH
+# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
+# Licensed and distributed under either of
+#   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
+#   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
+# at your option. This file may not be copied, modified, or distributed except according to those terms.
+
+import
+  std/[strformat, strutils],
+  ../benchmarks/platforms,
+  tracer
+
+# Reporting benchmark result
+# -------------------------------------------------------
+
+proc reportCli*(metrics: seq[Metadata], flags: string) =
+  ## Print the collected metering data as a table on stdout.
+  ##
+  ## - `metrics`: per-procedure counters; entries with `numCalls == 0` are skipped.
+  ## - `flags`: free-form label printed centered under the "Procedures" header.
+  ##
+  ## When `SupportsGetTicks` is true the table carries two extra columns
+  ## (cumulated and average CPU cycles), otherwise only the time columns.
+
+  let name = when SupportsCPUName: cpuName() else: "(name auto-detection not implemented for this CPU family)"
+  echo "\nCPU: ", name
+
+  when SupportsGetTicks:
+    # https://blog.trailofbits.com/2019/10/03/tsc-frequency-for-all-better-profiling-and-benchmarking/
+    # https://www.agner.org/optimize/blog/read.php?i=838
+    echo "The CPU Cycle Count is indicative only. It cannot be used to compare across systems, works at your CPU nominal frequency and is sensitive to overclocking, throttling and frequency scaling (powersaving and Turbo Boost)."
+
+    const lineSep = &"""|{'-'.repeat(50)}|{'-'.repeat(14)}|{'-'.repeat(20)}|{'-'.repeat(15)}|{'-'.repeat(17)}|{'-'.repeat(26)}|{'-'.repeat(26)}|"""
+    echo "\n"
+    echo lineSep
+    echo &"""|{"Procedures":^50}|{"# of Calls":^14}|{"Throughput (ops/s)":^20}|{"Time (µs)":^15}|{"Avg Time (µs)":^17}|{"CPU cycles (in billions)":^26}|{"Avg cycles (in billions)":^26}|"""
+    echo &"""|{flags:^50}|{' '.repeat(14)}|{' '.repeat(20)}|{' '.repeat(15)}|{' '.repeat(17)}|{"indicative only":^26}|{"indicative only":^26}|"""
+    echo lineSep
+    for m in metrics:
+      if m.numCalls == 0:
+        continue
+      # TODO: running variance / standard deviation but the Welford method is quite costly.
+      #       https://nim-lang.org/docs/stats.html / https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Welford's_online_algorithm
+      let cumulTimeUs = m.cumulatedTimeNs.float64 * 1e-3   # ns -> µs
+      let avgTimeUs = cumulTimeUs / m.numCalls.float64
+      let throughput = 1e6 / avgTimeUs                     # ops/s; `inf` when the average time is 0
+      let cumulCyclesBillions = m.cumulatedCycles.float64 * 1e-9
+      let avgCyclesBillions = cumulCyclesBillions / m.numCalls.float64
+      # Every row fills all 7 columns declared in the header, including the two cycle columns.
+      echo &"""|{m.procName:<50}|{m.numCalls:>14}|{throughput:>20.3f}|{cumulTimeUs:>15.3f}|{avgTimeUs:>17.3f}|{cumulCyclesBillions:>26.3f}|{avgCyclesBillions:>26.3f}|"""
+    echo lineSep
+
+  else:
+    const lineSep = &"""|{'-'.repeat(50)}|{'-'.repeat(14)}|{'-'.repeat(20)}|{'-'.repeat(15)}|{'-'.repeat(17)}|"""
+    echo "\n"
+    echo lineSep
+    echo &"""|{"Procedures":^50}|{"# of Calls":^14}|{"Throughput (ops/s)":^20}|{"Time (µs)":^15}|{"Avg Time (µs)":^17}|"""
+    echo &"""|{flags:^50}|{' '.repeat(14)}|{' '.repeat(20)}|{' '.repeat(15)}|{' '.repeat(17)}|"""
+    echo lineSep
+    for m in metrics:
+      if m.numCalls == 0:
+        continue
+      # TODO: running variance / standard deviation but the Welford method is quite costly.
+      #       https://nim-lang.org/docs/stats.html / https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Welford's_online_algorithm
+      let cumulTimeUs = m.cumulatedTimeNs.float64 * 1e-3   # ns -> µs
+      let avgTimeUs = cumulTimeUs / m.numCalls.float64
+      let throughput = 1e6 / avgTimeUs                     # ops/s; `inf` when the average time is 0
+      echo &"""|{m.procName:<50}|{m.numCalls:>14}|{throughput:>20.3f}|{cumulTimeUs:>15.3f}|{avgTimeUs:>17.3f}|"""
+    echo lineSep
--- a/metering/tracer.nim
+++ b/metering/tracer.nim
@ -0,0 +1,152 @@
+# Constantine
+# Copyright (c) 2018-2019    Status Research & Development GmbH
+# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
+# Licensed and distributed under either of
+#   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
+#   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
+# at your option. This file may not be copied, modified, or distributed except according to those terms.
+
+import
+  std/[macros, times, monotimes],
+  ../benchmarks/platforms
+
+# ############################################################
+#
+#                     Trace operations
+#
+# ############################################################
+
+# Utils
+# --------------------------------------------------
+const hasThreadSupport = defined(threads)
+const someGcc = defined(gcc) or defined(llvm_gcc) or defined(clang) or defined(icc)
+
+proc atomicInc*(memLoc: var int64, x = 1'i64): int64 =
+  ## Add `x` to `memLoc` and return the updated value.
+  ##
+  ## With thread support, a compiler intrinsic is used (relaxed ordering on
+  ## the GCC-like compilers); otherwise this is a plain in-place addition.
+  when hasThreadSupport and someGcc:
+    atomicAddFetch(addr memLoc, x, ATOMIC_RELAXED)
+  elif hasThreadSupport and defined(vcc):
+    # `x` is added on top of the intrinsic's return value, as in the original code.
+    addAndFetch(addr memLoc, x) + x
+  else:
+    memLoc += x
+    memLoc
+
+# Types
+# --------------------------------------------------
+
+type
+  Metadata* = object
+    ## Metering counters kept for one annotated procedure.
+    procName*: string
+    module: string
+    package: string
+    tag: string # Can be changed to multi-tags later
+    numCalls*: int64
+    cumulatedTimeNs*: int64 # nanoseconds per the field name; reports.nim scales it by 1e-3 to print µs
+    when SupportsGetTicks:
+      cumulatedCycles*: int64
+
+var ctMetrics{.compileTime.}: seq[Metadata]
+  ## Metrics are collected here, this is just a temporary holder of compileTime values
+  ## Unfortunately the "seq" is emptied when passing the compileTime/runtime boundaries
+  ## due to Nim bugs
+
+var Metrics*: seq[Metadata]
+  ## We can't directly use it at compileTime because it doesn't exist.
+  ## We need `Metrics = static(ctMetrics)`
+  ## To transfer the compileTime content to runtime at an opportune time.
+
+template mtag(tagname: string){.pragma.}
+  ## This will allow tagging proc in the future with
+  ## "Fp", "ec", "polynomial"
+
+proc resetMetering*() =
+  ## Overwrite the runtime `Metrics` with the compile-time registry `ctMetrics`,
+  ## whose entries are created with only `procName` set (all counters at zero).
+  # NOTE(review): which entries `static(ctMetrics)` captures depends on when the
+  # compiler evaluates it relative to the `meter`-annotated procs -- confirm.
+  Metrics = static(ctMetrics)
+
+const CttMeter {.booldefine.} = off
+
+const CttTrace {.booldefine.} = off # For manual "debug-echo"-style timing.
+when CttTrace:
+  # strformat doesn't work in templates.
+  from strutils import alignLeft, formatFloat
+
+# Symbols
+# --------------------------------------------------
+
+template fnEntry(name: string, id: int, startTime, startCycle: untyped): untyped =
+  ## Bench tracing to insert on function entry
+  ##
+  ## Bumps the call counter of `Metrics[id]` and records the entry timestamp
+  ## (and cycle counter when available) under the caller-provided symbols
+  ## `startTime` / `startCycle`, which `fnExit` reads back.
+  ## `name` is not used in this template.
+  # The pragma block below is what lets this code, which touches the global
+  # `Metrics`, be inserted into procs declared as `func`.
+  {.noSideEffect, gcsafe.}:
+    discard Metrics[id].numCalls.atomicInc()
+    let startTime = getMonoTime()
+    when SupportsGetTicks:
+      let startCycle = getTicks()
+    else:
+      let startCycle = 0 # placeholder so the symbol always exists
+
+template fnExit(name: string, id: int, startTime, startCycle: untyped): untyped =
+  ## Bench tracing to insert before each function exit
+  ##
+  ## Reads the `startTime` / `startCycle` symbols defined by `fnEntry`,
+  ## computes the elapsed time (and cycles when `SupportsGetTicks`) and adds
+  ## them to the counters of `Metrics[id]`.
+  ## With `CttTrace` on, it additionally echoes one line per call, using `name`.
+  {.noSideEffect, gcsafe.}:
+    # Cycles are sampled before the monotonic clock on exit.
+    when SupportsGetTicks:
+      let stopCycle = getTicks()
+    let stopTime = getMonoTime()
+    when SupportsGetTicks:
+      let elapsedCycles = stopCycle - startCycle
+    # Nanoseconds: this is the unit of the `cumulatedTimeNs` accumulator and
+    # what the `* 1e-3` scaling to µs in the trace output below expects.
+    # Measuring in microseconds made every reported time 1000x too small and
+    # truncated sub-microsecond calls to 0.
+    let elapsedTime = inNanoseconds(stopTime - startTime)
+
+    discard Metrics[id].cumulatedTimeNs.atomicInc(elapsedTime)
+    when SupportsGetTicks:
+      discard Metrics[id].cumulatedCycles.atomicInc(elapsedCycles)
+
+    when CttTrace:
+      # Advice: Use "when name == relevantProc" to isolate specific procedures.
+      # strformat doesn't work in templates.
+      when SupportsGetTicks:
+        echo static(alignLeft(name, 50)),
+            "Time (µs): ", alignLeft(formatFloat(elapsedTime.float64 * 1e-3, precision=3), 10),
+            "Cycles (billions): ", formatFloat(elapsedCycles.float64 * 1e-9, precision=3)
+      else:
+        echo static(alignLeft(name, 50)),
+            "Time (µs): ", alignLeft(formatFloat(elapsedTime.float64 * 1e-3, precision=3), 10)
+
+macro meterAnnotate(procAst: untyped): untyped =
+  ## Rewrite a proc/func definition so that its body is preceded by the
+  ## `fnEntry` tracing code and a `defer` running the `fnExit` tracing code.
+  ##
+  ## A new `Metadata` entry is appended to `ctMetrics` for the procedure;
+  ## its index in `ctMetrics` is the id handed to the tracing templates.
+  procAst.expectKind({nnkProcDef, nnkFuncDef})
+
+  # TODO, get the module and the package the proc is coming from
+  #       and the tag "Fp", "ec", "polynomial" ...
+  let
+    procLabel = repr(procAst[0])
+    slot = ctMetrics.len
+  ctMetrics.add Metadata(procName: procLabel)
+
+  # Symbols shared between the entry and exit snippets
+  let
+    startTimeSym = genSym(nskLet, "metering_" & procLabel & "_startTime_")
+    startCycleSym = genSym(nskLet, "metering_" & procLabel & "_startCycles_")
+
+  let instrumented = newStmtList(
+    getAst(fnEntry(procLabel, slot, startTimeSym, startCycleSym)),
+    nnkDefer.newTree(getAst(fnExit(procLabel, slot, startTimeSym, startCycleSym))),
+    procAst.body
+  )
+
+  procAst.body = instrumented
+  result = procAst
+
+template meter*(procBody: untyped): untyped =
+  ## Pragma entry point: instruments the annotated routine through
+  ## `meterAnnotate` when `CttMeter` or `CttTrace` is enabled,
+  ## and leaves it untouched otherwise.
+  when not (CttMeter or CttTrace):
+    procBody
+  else:
+    meterAnnotate(procBody)
+
+# Sanity checks
+# ---------------------------------------------------
+
+when isMainModule:
+  # Self-test, compiled only when this file is the main module:
+  # annotate a dummy proc, call it once and check that one call was counted.
+
+  static: doAssert CttMeter or CttTrace, "CttMeter or CttTrace must be on for tracing"
+
+  # expandMacros prints the instrumented proc at compile time
+  expandMacros:
+    proc foo(x: int): int{.meter.} =
+      echo "Hey hey hey"
+      result = x
+
+  resetMetering()
+
+  echo Metrics
+  discard foo(10)
+  echo Metrics
+  doAssert Metrics[0].numCalls == 1