modexp: 2.5x accel on small exponent (#268)

* add metering to modexp
* modexp: accel exponent = 1
* modexp: improve runtime Montgomery constants compute. 2.49x faster on DOS vectors
2026-01-07 15:43:08 +00:00 · 2023-09-09 09:21:05 +02:00 · 2023-09-09 09:21:05 +02:00 · 15757557b4
commit 15757557b4
parent f3a5f352b8
16 changed files with 315 additions and 51 deletions
--- a/benchmarks/bench_evm_modexp_dos.nim
+++ b/benchmarks/bench_evm_modexp_dos.nim
@ -3,16 +3,16 @@ import
  ../constantine/math/arithmetic,
  ../constantine/math/io/io_bigints,
  ../constantine/platforms/abstractions,
-  ./platforms, ./bench_blueprint
+  ./bench_blueprint
 proc report(op: string, elapsedNs: int64, elapsedCycles: int64, iters: int) =
  let ns = elapsedNs div iters
  let cycles = elapsedCycles div iters
  let throughput = 1e9 / float64(ns)
  when SupportsGetTicks:
-    echo &"{op:<45} {throughput:>15.3f} ops/s {ns:>16} ns/op {cycles:>12} CPU cycles (approx)"
+    echo &"{op:<70} {throughput:>15.3f} ops/s {ns:>16} ns/op {cycles:>12} CPU cycles (approx)"
  else:
-    echo &"{op:<45} {throughput:>15.3f} ops/s {ns:>16} ns/op"
+    echo &"{op:<70} {throughput:>15.3f} ops/s {ns:>16} ns/op"
 template bench(fnCall: untyped, ticks, ns: var int64): untyped =
  block:
@ -148,11 +148,119 @@ proc dos1() =
        (let _ = r.eth_evm_modexp(input)),
        ticks, nanoseconds)
-  report("EVM Modexp - 32,32,32", nanoseconds, ticks, execsEIP2565)
+  report("EVM Modexp - 32,32,32 - even base & power-of-2 modulus", nanoseconds, ticks, execsEIP2565)
  echo "Total time: ", nanoseconds.float64 / 1e6, " ms for ", execsEIP2565, " iterations"
 proc dos2() =
  ## Benchmark `eth_evm_modexp` on an input with a 1-byte base, a 1-byte
  ## exponent equal to 1 and a 121-byte odd modulus (last byte 0x77).
  ##
  ## The call is repeated `blockSize div gasFeeEIP2565` times, then the
  ## accumulated timings are handed to `report` and the total is printed in ms.
  let input = [
    # Length of base (1)
    uint8 0x00,
          0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01,
    # Length of exponent (1)
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01,
    # Length of modulus (121)
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x79,
    # Base
    0x33,
    # Exponent
    0x01,
    # Modulus
    0x04, 0xea, 0xbb, 0x12, 0x55, 0x88, 0xd7, 0x3c, 0xad, 0x22, 0xea, 0x2b, 0x4a, 0x77, 0x6e, 0x9d,
    0x4d, 0xfc, 0x13, 0xa8, 0x1b, 0xf9, 0x0c, 0x0d, 0x37, 0xe8, 0x4e, 0x8b, 0xeb, 0xb2, 0xa5, 0x48,
    0x8b, 0x2c, 0x87, 0x6d, 0x13, 0x51, 0x75, 0xeb, 0x97, 0xc6, 0x13, 0xd9, 0x06, 0xce, 0x8b, 0x53,
    0xd0, 0x02, 0x68, 0xb8, 0xd6, 0x12, 0xab, 0x8b, 0x15, 0x0c, 0xef, 0x0a, 0xd0, 0x3b, 0x73, 0xd2,
    0xdb, 0x9d, 0x2a, 0xa5, 0x23, 0x70, 0xdc, 0x26, 0x55, 0x80, 0xca, 0xf2, 0xc0, 0x18, 0xe3, 0xe3,
    0x1b, 0xad, 0xd5, 0x22, 0xdd, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    0x00, 0x1c, 0x05, 0x71, 0x52, 0x7c, 0x3a, 0xb0, 0x77,
  ]
  # Output buffer sized to the modulus length declared in the input (121 bytes).
  var r = newSeq[byte](121)
  # Accumulators passed to `bench` on every iteration.
  var ticks, nanoseconds: int64
  let (gasFeeEIP198, gasFeeEIP2565) = computeGasFee(input)
  # Gas budget that the per-call fee is divided into to get an iteration count.
  const blockSize = 30000000
  let execsEIP198 = blockSize div gasFeeEIP198
  let execsEIP2565 = blockSize div gasFeeEIP2565
  echo "Gas cost: ", gasFeeEIP198, " gas (EIP-198) - ", execsEIP198, " executions per block"
  echo "Gas cost: ", gasFeeEIP2565, " gas (EIP-2565) - ", execsEIP2565, " executions per block"
  # Only the EIP-2565 count drives the loop; the EIP-198 count is informational.
  # NOTE(review): `bench` presumably adds the elapsed cycles/time of each call
  # to `ticks`/`nanoseconds` - its body is not visible here, confirm.
  for i in 0 ..< execsEIP2565:
      bench(
        (let _ = r.eth_evm_modexp(input)),
        ticks, nanoseconds)
  report("EVM Modexp - 1,1,121 - exponent=1 and odd modulus", nanoseconds, ticks, execsEIP2565)
  # 1e6 ns per ms.
  echo "Total time: ", nanoseconds.float64 / 1e6, " ms for ", execsEIP2565, " iterations"
 proc dos2a() =
  ## Benchmark `eth_evm_modexp` on an input with a 1-byte base, a 1-byte
  ## exponent equal to 1 and a 121-byte even modulus (last byte 0x76).
  ##
  ## Same input as the odd-modulus exponent=1 case except for the final
  ## modulus byte. The call is repeated `blockSize div gasFeeEIP2565` times
  ## and the accumulated timings are handed to `report`.
  # shortcuttable variation with even modulus
  let input = [
    # Length of base (1)
    uint8 0x00,
          0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01,
    # Length of exponent (1)
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01,
    # Length of modulus (121)
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x79,
    # Base
    0x33,
    # Exponent
    0x01,
    # Modulus
    0x04, 0xea, 0xbb, 0x12, 0x55, 0x88, 0xd7, 0x3c, 0xad, 0x22, 0xea, 0x2b, 0x4a, 0x77, 0x6e, 0x9d,
    0x4d, 0xfc, 0x13, 0xa8, 0x1b, 0xf9, 0x0c, 0x0d, 0x37, 0xe8, 0x4e, 0x8b, 0xeb, 0xb2, 0xa5, 0x48,
    0x8b, 0x2c, 0x87, 0x6d, 0x13, 0x51, 0x75, 0xeb, 0x97, 0xc6, 0x13, 0xd9, 0x06, 0xce, 0x8b, 0x53,
    0xd0, 0x02, 0x68, 0xb8, 0xd6, 0x12, 0xab, 0x8b, 0x15, 0x0c, 0xef, 0x0a, 0xd0, 0x3b, 0x73, 0xd2,
    0xdb, 0x9d, 0x2a, 0xa5, 0x23, 0x70, 0xdc, 0x26, 0x55, 0x80, 0xca, 0xf2, 0xc0, 0x18, 0xe3, 0xe3,
    0x1b, 0xad, 0xd5, 0x22, 0xdd, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    0x00, 0x1c, 0x05, 0x71, 0x52, 0x7c, 0x3a, 0xb0, 0x76,
  ]
  # Output buffer sized to the modulus length declared in the input (121 bytes).
  var r = newSeq[byte](121)
  # Accumulators passed to `bench` on every iteration.
  var ticks, nanoseconds: int64
  let (gasFeeEIP198, gasFeeEIP2565) = computeGasFee(input)
  # Gas budget that the per-call fee is divided into to get an iteration count.
  const blockSize = 30000000
  let execsEIP198 = blockSize div gasFeeEIP198
  let execsEIP2565 = blockSize div gasFeeEIP2565
  echo "Gas cost: ", gasFeeEIP198, " gas (EIP-198) - ", execsEIP198, " executions per block"
  echo "Gas cost: ", gasFeeEIP2565, " gas (EIP-2565) - ", execsEIP2565, " executions per block"
  # Only the EIP-2565 count drives the loop; the EIP-198 count is informational.
  for i in 0 ..< execsEIP2565:
      bench(
        (let _ = r.eth_evm_modexp(input)),
        ticks, nanoseconds)
  report("EVM Modexp - 1,1,121 - exponent=1 and even modulus", nanoseconds, ticks, execsEIP2565)
  # 1e6 ns per ms.
  echo "Total time: ", nanoseconds.float64 / 1e6, " ms for ", execsEIP2565, " iterations"
 proc dos2b() =
  # even variation with no shortcut
  let input = [
    # Length of base (1)
    uint8 0x00,
@ -201,10 +309,11 @@ proc dos2() =
        (let _ = r.eth_evm_modexp(input)),
        ticks, nanoseconds)
-  report("EVM Modexp - 1,1,121", nanoseconds, ticks, execsEIP2565)
+  report("EVM Modexp - 1,1,121 - exponent=16 and odd modulus", nanoseconds, ticks, execsEIP2565)
  echo "Total time: ", nanoseconds.float64 / 1e6, " ms for ", execsEIP2565, " iterations"
-proc dos3() =
+proc dos2c() =
  # odd variation with no shortcut
  let input = [
    # Length of base (1)
@ -254,11 +363,71 @@ proc dos3() =
      (let _ = r.eth_evm_modexp(input)),
      ticks, nanoseconds)
-  report("EVM Modexp - 1,1,121", nanoseconds, ticks, execsEIP2565)
+  report("EVM Modexp - 1,1,121 - exponent=7 and odd modulus", nanoseconds, ticks, execsEIP2565)
  echo "Total time: ", nanoseconds.float64 / 1e6, " ms for ", execsEIP2565, " iterations"
 proc dos2d() =
  ## Benchmark `eth_evm_modexp` on an input with a 1-byte base, a 1-byte
  ## exponent equal to 7 and a 121-byte modulus whose only non-zero byte is
  ## the leading 0x04, i.e. a power of two.
  ##
  ## The call is repeated `blockSize div gasFeeEIP2565` times and the
  ## accumulated timings are handed to `report`.
  # odd variation with no shortcut and power of 2 modulus
  let input = [
    # Length of base (1)
    uint8 0x00,
          0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01,
    # Length of exponent (1)
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01,
    # Length of modulus (121)
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x79,
    # Base
    0x33,
    # Exponent
    0x07,
    # Modulus
    0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
  ]
  # Output buffer sized to the modulus length declared in the input (121 bytes).
  var r = newSeq[byte](121)
  # Accumulators passed to `bench` on every iteration.
  var ticks, nanoseconds: int64
  let (gasFeeEIP198, gasFeeEIP2565) = computeGasFee(input)
  # Gas budget that the per-call fee is divided into to get an iteration count.
  const blockSize = 30000000
  let execsEIP198 = blockSize div gasFeeEIP198
  let execsEIP2565 = blockSize div gasFeeEIP2565
  echo "Gas cost: ", gasFeeEIP198, " gas (EIP-198) - ", execsEIP198, " executions per block"
  echo "Gas cost: ", gasFeeEIP2565, " gas (EIP-2565) - ", execsEIP2565, " executions per block"
  # Only the EIP-2565 count drives the loop; the EIP-198 count is informational.
  for i in 0 ..< execsEIP2565:
    bench(
      (let _ = r.eth_evm_modexp(input)),
      ticks, nanoseconds)
  report("EVM Modexp - 1,1,121 - exponent=7 and power-of-2 modulus", nanoseconds, ticks, execsEIP2565)
  # 1e6 ns per ms.
  echo "Total time: ", nanoseconds.float64 / 1e6, " ms for ", execsEIP2565, " iterations"
 dos1()
 echo "\n"
 dos2()
 echo "\n"
-dos3()
+dos2a()
 echo "\n"
 dos2b()
 echo "\n"
 dos2c()
 echo "\n"
 dos2d()
--- a/constantine/ethereum_evm_precompiles.nim
+++ b/constantine/ethereum_evm_precompiles.nim
@ -370,7 +370,7 @@ func eth_evm_ecpairing*(
    r[r.len-1] = byte 1
  return cttEVM_Success
-func eth_evm_modexp*(r: var openArray[byte], inputs: openArray[byte]): CttEVMStatus {.noInline, tags:[Alloca, Vartime].} =
+func eth_evm_modexp*(r: var openArray[byte], inputs: openArray[byte]): CttEVMStatus {.noInline, tags:[Alloca, Vartime], meter.} =
  ## Modular exponentiation
  ##
  ## Name: MODEXP
--- a/constantine/math/elliptic/ec_scalar_mul_vartime.nim
+++ b/constantine/math/elliptic/ec_scalar_mul_vartime.nim
@ -38,7 +38,7 @@ template `+=`[F; G: static Subgroup](P: var (ECP_ShortW_Jac[F, G] or ECP_ShortW_
 template `-=`[F; G: static Subgroup](P: var (ECP_ShortW_Jac[F, G] or ECP_ShortW_Prj[F, G]), Q: ECP_ShortW_Aff[F, G]) =
  P.msub_vartime(P, Q)
-func scalarMul_doubleAdd_vartime*[EC](P: var EC, scalar: BigInt) {.tags:[VarTime].} =
+func scalarMul_doubleAdd_vartime*[EC](P: var EC, scalar: BigInt) {.tags:[VarTime], meter.} =
  ## **Variable-time** Elliptic Curve Scalar Multiplication
  ##
  ##   P <- [k] P
@ -67,7 +67,7 @@ func scalarMul_doubleAdd_vartime*[EC](P: var EC, scalar: BigInt) {.tags:[VarTime
        else:
          P += Paff
-func scalarMul_addchain_4bit_vartime[EC](P: var EC, scalar: BigInt) {.tags:[VarTime].} =
+func scalarMul_addchain_4bit_vartime[EC](P: var EC, scalar: BigInt) {.tags:[VarTime], meter.} =
  ## **Variable-time** Elliptic Curve Scalar Multiplication
  ## This can only handle small scalars, up to 2⁴ = 16 excluded
  let s = uint scalar.limbs[0]
@ -206,7 +206,7 @@ func accumNAF[precompSize, NafMax: static int, EC, ECaff](
    elif digit < 0:
      P -= tab[-digit shr 1]
-func scalarMul_minHammingWeight_windowed_vartime*[EC](P: var EC, scalar: BigInt, window: static int) {.tags:[VarTime, Alloca].} =
+func scalarMul_minHammingWeight_windowed_vartime*[EC](P: var EC, scalar: BigInt, window: static int) {.tags:[VarTime, Alloca], meter.} =
  ## **Variable-time** Elliptic Curve Scalar Multiplication
  ##
  ##   P <- [k] P
@ -246,7 +246,7 @@ func scalarMul_minHammingWeight_windowed_vartime*[EC](P: var EC, scalar: BigInt,
 func scalarMulEndo_minHammingWeight_windowed_vartime*[scalBits: static int; EC](
       P: var EC,
       scalar: BigInt[scalBits],
-       window: static int) {.tags:[VarTime, Alloca].} =
+       window: static int) {.tags:[VarTime, Alloca], meter.} =
  ## Endomorphism-accelerated windowed vartime scalar multiplication
  ##
  ##   P <- [k] P
--- a/constantine/math/elliptic/ec_shortweierstrass_batch_ops.nim
+++ b/constantine/math/elliptic/ec_shortweierstrass_batch_ops.nim
@ -88,7 +88,7 @@ func batchAffine*[N: static int, F, G](
 func batchAffine*[F, G](
       affs: ptr UncheckedArray[ECP_ShortW_Aff[F, G]],
       jacs: ptr UncheckedArray[ECP_ShortW_Jac[F, G]],
-       N: int) {.noInline, tags:[Alloca].} =
+       N: int) {.noInline, tags:[Alloca], meter.} =
  # Algorithm: Montgomery's batch inversion
  # - Speeding the Pollard and Elliptic Curve Methods of Factorization
  #   Section 10.3.1
--- a/constantine/math_arbitrary_precision/arithmetic/bigints_views.nim
+++ b/constantine/math_arbitrary_precision/arithmetic/bigints_views.nim
@ -46,7 +46,7 @@ func powOddMod_vartime*(
       a: openArray[SecretWord],
       exponent: openArray[byte],
       M: openArray[SecretWord],
-       window: int) {.noInline, tags:[Alloca, VarTime].} =
+       window: int) {.noInline, tags:[Alloca, VarTime], meter.} =
  ## r <- a^exponent (mod M) with M odd
  ## assumes a < M
  ##
@ -57,6 +57,12 @@ func powOddMod_vartime*(
  let aBits  = a.getBits_LE_vartime()
  let mBits  = M.getBits_LE_vartime()
  let eBits  = exponent.getBits_BE_vartime()
  if eBits == 1:
    r.view().reduce(a.view(), aBits, M.view(), mBits)
    return
  let L      = wordsRequired(mBits)
  let m0ninv = M[0].negInvModWord()
  var rMont  = allocStackArray(SecretWord, L)
@ -97,7 +103,7 @@ func powMod_vartime*(
       a: openArray[SecretWord],
       exponent: openArray[byte],
       M: openArray[SecretWord],
-       window: int) {.noInline, tags:[Alloca, VarTime].} =
+       window: int) {.noInline, tags:[Alloca, VarTime], meter.} =
  ## r <- a^exponent (mod M) with M odd
  ## assumes a < exponent
  ##
--- a/constantine/math_arbitrary_precision/arithmetic/limbs_division.nim
+++ b/constantine/math_arbitrary_precision/arithmetic/limbs_division.nim
@ -142,7 +142,7 @@ func shlAddMod(a: LimbsViewMut, aLen: int,
 func reduce*(r: LimbsViewMut,
            a: LimbsViewAny, aBits: int,
-            M: LimbsViewConst, mBits: int) =
+            M: LimbsViewConst, mBits: int) {.meter.} =
  ## Reduce `a` modulo `M` and store the result in `r`
  ##
  ## The modulus `M` most-significant bit at `mBits` MUST be set.
--- a/constantine/math_arbitrary_precision/arithmetic/limbs_extmul.nim
+++ b/constantine/math_arbitrary_precision/arithmetic/limbs_extmul.nim
@ -39,6 +39,6 @@ func prod_comba(r: var openArray[SecretWord], a, b: openArray[SecretWord]) {.noI
  for i in stopEx ..< r.len:
    r[i] = Zero
-func prod*(r: var openArray[SecretWord], a, b: openArray[SecretWord]) {.inline.}=
+func prod*(r: var openArray[SecretWord], a, b: openArray[SecretWord]) {.inline, meter.}=
  ## Extended precision multiplication
  r.prod_comba(a, b)
--- a/constantine/math_arbitrary_precision/arithmetic/limbs_fixedprec.nim
+++ b/constantine/math_arbitrary_precision/arithmetic/limbs_fixedprec.nim
@ -30,7 +30,7 @@ import
 # Comparison
 # ------------------------------------------------------------
-func lt*(a, b: distinct LimbsViewAny, len: int): SecretBool =
+func lt*(a, b: distinct LimbsViewAny, len: int): SecretBool {.meter.} =
  ## Returns true if a < b
  ## Comparison is constant-time
  var diff: SecretWord
@ -43,7 +43,7 @@ func lt*(a, b: distinct LimbsViewAny, len: int): SecretBool =
 # Type-erased add-sub
 # ------------------------------------------------------------
-func cadd*(a: LimbsViewMut, b: LimbsViewAny, ctl: SecretBool, len: int): Carry =
+func cadd*(a: LimbsViewMut, b: LimbsViewAny, ctl: SecretBool, len: int): Carry {.meter.} =
  ## Type-erased conditional addition
  ## Returns the carry
  ##
@ -58,7 +58,7 @@ func cadd*(a: LimbsViewMut, b: LimbsViewAny, ctl: SecretBool, len: int): Carry =
    addC(result, sum, a[i], b[i], result)
    ctl.ccopy(a[i], sum)
-func csub*(a: LimbsViewMut, b: LimbsViewAny, ctl: SecretBool, len: int): Borrow =
+func csub*(a: LimbsViewMut, b: LimbsViewAny, ctl: SecretBool, len: int): Borrow {.meter.} =
  ## Type-erased conditional subtraction
  ## Returns the borrow
  ##
--- a/constantine/math_arbitrary_precision/arithmetic/limbs_mod.nim
+++ b/constantine/math_arbitrary_precision/arithmetic/limbs_mod.nim
@ -17,7 +17,7 @@ import
 #
 # ############################################################
-func addmod_vartime*(r: var openArray[SecretWord], a, b, M: openArray[SecretWord]) =
+func addmod_vartime*(r: var openArray[SecretWord], a, b, M: openArray[SecretWord]) {.meter.} =
  ## r <- a+b (mod M)
  ## assumes a and b are in the range [0, M)
@ -43,6 +43,6 @@ func addmod_vartime*(r: var openArray[SecretWord], a, b, M: openArray[SecretWord
    for i in 0 ..< r.len:
      r[i] = t[i]
-func doublemod_vartime*(r: var openArray[SecretWord], a, M: openArray[SecretWord]) {.inline.} =
+func doublemod_vartime*(r: var openArray[SecretWord], a, M: openArray[SecretWord]) {.inline, meter.} =
  ## r <- 2a (mod M)
  r.addmod_vartime(a, a, M)
--- a/constantine/math_arbitrary_precision/arithmetic/limbs_mod2k.nim
+++ b/constantine/math_arbitrary_precision/arithmetic/limbs_mod2k.nim
@ -17,7 +17,7 @@ import
 # No exceptions allowed
 {.push raises: [], checks: off.}
-func mod2k_vartime*(a: var openArray[SecretWord], k: uint) =
+func mod2k_vartime*(a: var openArray[SecretWord], k: uint) {.meter.} =
  ## a <- a (mod 2ᵏ)
  const SlotShift = log2_vartime(WordBitWidth.uint32)
  const SelectMask = WordBitWidth - 1
@ -38,7 +38,7 @@ func mod2k_vartime*(a: var openArray[SecretWord], k: uint) =
  for i in hiIndex+1 ..< a.len:
    a[i] = Zero
-func submod2k_vartime*(r{.noAlias.}: var openArray[SecretWord], a, b: openArray[SecretWord], k: uint) =
+func submod2k_vartime*(r{.noAlias.}: var openArray[SecretWord], a, b: openArray[SecretWord], k: uint) {.meter.} =
  ## r <- a - b (mod 2ᵏ)
  debug:
    const SlotShift = log2_vartime(WordBitWidth.uint32)
@ -63,7 +63,7 @@ func submod2k_vartime*(r{.noAlias.}: var openArray[SecretWord], a, b: openArray[
  r.mod2k_vartime(k)
-func mulmod2k_vartime*(r: var openArray[SecretWord], a, b: openArray[SecretWord], k: uint) {.inline.} =
+func mulmod2k_vartime*(r: var openArray[SecretWord], a, b: openArray[SecretWord], k: uint) {.inline, meter.} =
  ## r <- a*b (mod 2ᵏ)
  r.prod(a, b)
  r.mod2k_vartime(k)
@ -75,7 +75,7 @@ iterator unpackLE(scalarByte: byte): bool =
 func powMod2k_vartime*(
       r{.noAlias.}: var openArray[SecretWord],
       a{.noAlias.}: openArray[SecretWord],
-       exponent: openArray[byte], k: uint) {.noInline, tags: [Alloca].} =
+       exponent: openArray[byte], k: uint) {.noInline, tags: [Alloca], meter.} =
  ## r <- a^exponent (mod 2ᵏ)
  ##
  ## Requires:
@ -115,6 +115,13 @@ func powMod2k_vartime*(
    r[0] = One  # x⁰ = 1, even for 0⁰
    return
  if msb == 0: # exponent is 1
    for i in 0 ..< min(r.len, a.len):
      # range [r.len, a.len) will be truncated (mod 2ᵏ)
      r[i] = a[i]
    r.mod2k_vartime(k)
    return
  if a.isEven().bool:
    let aTrailingZeroes = block:
      var i = 0
@ -155,7 +162,7 @@ func powMod2k_vartime*(
 func invModBitwidth(a: SecretWord): SecretWord {.borrow.}
  ## Inversion a⁻¹ (mod 2³²) or a⁻¹ (mod 2⁶⁴)
-func invMod2k_vartime*(r: var openArray[SecretWord], a: openArray[SecretWord], k: uint) {.noInline, tags: [Alloca].} =
+func invMod2k_vartime*(r: var openArray[SecretWord], a: openArray[SecretWord], k: uint) {.noInline, tags: [Alloca], meter.} =
  ## Inversion a⁻¹ (mod 2ᵏ)
  ## with 2ᵏ a multi-precision integer.
  #
--- a/constantine/math_arbitrary_precision/arithmetic/limbs_montgomery.nim
+++ b/constantine/math_arbitrary_precision/arithmetic/limbs_montgomery.nim
@ -11,7 +11,8 @@ import
  ../../platforms/[abstractions, allocs, bithacks],
  ./limbs_views,
  ./limbs_mod,
-  ./limbs_fixedprec
+  ./limbs_fixedprec,
  ./limbs_division
 # No exceptions allowed
 {.push raises: [], checks: off.}
@ -66,18 +67,37 @@ func r_powmod_vartime(r: var openArray[SecretWord], M: openArray[SecretWord], n:
  for i in start ..< stop:
    r.doublemod_vartime(r, M)
-func oneMont_vartime*(r: var openArray[SecretWord], M: openArray[SecretWord]) =
+func oneMont_vartime*(r: var openArray[SecretWord], M: openArray[SecretWord]) {.meter.} =
  ## Returns 1 in Montgomery domain:
  r.r_powmod_vartime(M, 1)
-func r2_vartime*(r: var openArray[SecretWord], M: openArray[SecretWord]) =
+  # r.r_powmod_vartime(M, 1)
  let mBits = getBits_LE_vartime(M)
  let t = allocStackArray(SecretWord, M.len + 1)
  zeroMem(t, M.len*sizeof(SecretWord))
  t[M.len] = One
  r.view().reduce(LimbsViewMut t, M.len*WordBitWidth+1, M.view(), mBits)
 func r2_vartime*(r: var openArray[SecretWord], M: openArray[SecretWord]) {.meter.} =
  ## Returns the Montgomery domain magic constant for the input modulus:
  ##
  ##   R² ≡ R² (mod M) with R = (2^WordBitWidth)^numWords
  ##
  ## Assuming a field modulus of size 256-bit with 63-bit words, we require 5 words
  ##   R² ≡ ((2^63)^5)^2 (mod M) = 2^630 (mod M)
-  r.r_powmod_vartime(M, 2)
+
  # r.r_powmod_vartime(M, 2)
  let mBits = getBits_LE_vartime(M)
  let t = allocStackArray(SecretWord, 2*M.len + 1)
  zeroMem(t, 2*M.len*sizeof(SecretWord))
  t[2*M.len] = One
  r.view().reduce(LimbsViewMut t, 2*M.len*WordBitWidth+1, M.view(), mBits)
 # Montgomery multiplication
 # ------------------------------------------
@ -88,7 +108,7 @@ func mulMont_FIPS*(
       M: LimbsViewConst,
       m0ninv: SecretWord,
       mBits: int,
-       skipFinalSub: static bool = false) {.noInline, tags:[Alloca].} =
+       skipFinalSub: static bool = false) {.noInline, tags:[Alloca], meter.} =
  ## Montgomery Multiplication using Finely Integrated Product Scanning (FIPS)
  ##
  ## This maps
@ -138,7 +158,7 @@ func mulMont_FIPS*(
 # ------------------------------------------
 func fromMont*(r: LimbsViewMut, a: LimbsViewAny, M: LimbsViewConst,
-               m0ninv: SecretWord, mBits: int) {.noInline, tags:[Alloca].} =
+               m0ninv: SecretWord, mBits: int) {.noInline, tags:[Alloca], meter.} =
  ## Transform a bigint ``a`` from its Montgomery N-residue representation (mod N)
  ## to the regular natural representation (mod N)
  ##
@ -166,7 +186,7 @@ func fromMont*(r: LimbsViewMut, a: LimbsViewAny, M: LimbsViewConst,
  r.copyWords(0, t, 0, N)
 func getMont*(r: LimbsViewMut, a: LimbsViewAny, M, r2modM: LimbsViewConst,
-                   m0ninv: SecretWord, mBits: int) {.inline.} =
+                   m0ninv: SecretWord, mBits: int) {.inline, meter.} =
  ## Transform a bigint ``a`` from its natural representation (mod N)
  ## to the Montgomery n-residue representation
  ##
@ -233,7 +253,7 @@ func powMontPrologue(
       m0ninv: SecretWord,
       scratchspace: LimbsViewMut,
       scratchLen: int,
-       mBits: int): uint {.tags:[Alloca].} =
+       mBits: int): uint {.tags:[Alloca], meter.} =
  ## Setup the scratchspace
  ## Returns the fixed-window size for exponentiation with window optimization.
  # Precompute window content, special case for window = 1
@ -263,7 +283,7 @@ func powMontSquarings(
        tmp: LimbsViewMut,
        window: uint,
        acc, acc_len: var uint,
-        e: var int): tuple[k, bits: uint] {.inline.}=
+        e: var int): tuple[k, bits: uint] {.inline, meter.}=
  ## Squaring step of exponentiation by squaring
  ## Get the next k bits in range [1, window)
  ## Square k times
@ -309,7 +329,7 @@ func powMont*(
       m0ninv: SecretWord,
       scratchspace: LimbsViewMut,
       scratchLen: int,
-       mBits: int) =
+       mBits: int) {.meter.} =
  ## Modular exponentiation r = a^exponent mod M
  ## in the Montgomery domain
  ##
@ -379,7 +399,7 @@ func powMont_vartime*(
       m0ninv: SecretWord,
       scratchspace: LimbsViewMut,
       scratchLen: int,
-       mBits: int) {.tags:[VarTime, Alloca].} =
+       mBits: int) {.tags:[VarTime, Alloca], meter.} =
  ## Modular exponentiation a <- a^exponent (mod M)
  ## in the Montgomery domain
  ##
--- a/constantine/math_arbitrary_precision/arithmetic/limbs_multiprec.nim
+++ b/constantine/math_arbitrary_precision/arithmetic/limbs_multiprec.nim
@ -61,7 +61,7 @@ func shrWords(r {.noalias.}: var openArray[SecretWord], a: openArray[SecretWord]
  for i in a.len-w ..< r.len:
    r[i] = Zero
-func shiftRight_vartime*(r {.noalias.}: var openArray[SecretWord], a: openArray[SecretWord], k: SomeInteger) =
+func shiftRight_vartime*(r {.noalias.}: var openArray[SecretWord], a: openArray[SecretWord], k: SomeInteger) {.meter.} =
  ## Shift `a` right by k bits and store in `r`
  if k == 0:
    let min = min(a.len, r.len)
@ -87,7 +87,7 @@ func shiftRight_vartime*(r {.noalias.}: var openArray[SecretWord], a: openArray[
 # Arithmetic
 # --------------------------------------------------------
-func neg*(a: var openArray[SecretWord]) =
+func neg*(a: var openArray[SecretWord]) {.meter.} =
  ## Computes the additive inverse -a
  ## in 2-complement representation
@ -97,7 +97,7 @@ func neg*(a: var openArray[SecretWord]) =
  for i in 1 ..< a.len:
    addC(carry, a[i], not(a[i]), Zero, carry)
-func addMP*(r {.noAlias.}: var openArray[SecretWord], a, b: openArray[SecretWord]): bool =
+func addMP*(r {.noAlias.}: var openArray[SecretWord], a, b: openArray[SecretWord]): bool {.meter.} =
  ## r <- a + b
  ##   and
  ## returns the carry
@ -130,7 +130,7 @@ func addMP*(r {.noAlias.}: var openArray[SecretWord], a, b: openArray[SecretWord
  else:
    return bool carry
-func subMP*(r {.noAlias.}: var openArray[SecretWord], a, b: openArray[SecretWord]): bool =
+func subMP*(r {.noAlias.}: var openArray[SecretWord], a, b: openArray[SecretWord]): bool {.meter.} =
  ## r <- a - b
  ##   and
  ## returns false if a >= b
--- a/constantine/platforms/primitives.nim
+++ b/constantine/platforms/primitives.nim
@ -90,7 +90,7 @@ func setOne*(a: var openArray[SomeNumber]){.inline.} =
  a[0] = 1
  for i in 1 ..< a.len:
    a[i] = 0
-    
+
 func asBytes*(s: static string): auto =
  ## Reinterpret a compile-time string as an array of bytes
  const N = s.len
@ -104,8 +104,7 @@ func rawCopy*(
       dStart: SomeInteger,
       src: openArray[byte],
       sStart: SomeInteger,
-       len: SomeInteger
+       len: SomeInteger) {.inline.} =
     ) {.inline.} =
  ## Copy dst[dStart ..< dStart+len] = src[sStart ..< sStart+len]
  ## Unlike the standard library, this cannot throw
  ## even a defect.
--- a/metering/m_modexp.nim
+++ b/metering/m_modexp.nim
@ -0,0 +1,53 @@
 # Constantine
 # Copyright (c) 2018-2019    Status Research & Development GmbH
 # Copyright (c) 2020-Present Mamy André-Ratsimbazafy
 # Licensed and distributed under either of
 #   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
 #   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
 # at your option. This file may not be copied, modified, or distributed except according to those terms.
 import
  ./reports, ./tracer,
  ../constantine/ethereum_evm_precompiles,
  ../constantine/platforms/abstractions
 # Metering driver: runs `eth_evm_modexp` once on a fixed input (1-byte base,
 # 1-byte exponent equal to 7, 121-byte odd modulus) and prints the collected
 # per-procedure metrics.
 let input = [
    # Length of base (1)
    uint8 0x00,
          0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01,
    # Length of exponent (1)
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01,
    # Length of modulus (121)
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x79,
    # Base
    0x33,
    # Exponent
    0x07,
    # Modulus
    0x04, 0xea, 0xbb, 0x12, 0x55, 0x88, 0xd7, 0x3c, 0xad, 0x22, 0xea, 0x2b, 0x4a, 0x77, 0x6e, 0x9d,
    0x4d, 0xfc, 0x13, 0xa8, 0x1b, 0xf9, 0x0c, 0x0d, 0x37, 0xe8, 0x4e, 0x8b, 0xeb, 0xb2, 0xa5, 0x48,
    0x8b, 0x2c, 0x87, 0x6d, 0x13, 0x51, 0x75, 0xeb, 0x97, 0xc6, 0x13, 0xd9, 0x06, 0xce, 0x8b, 0x53,
    0xd0, 0x02, 0x68, 0xb8, 0xd6, 0x12, 0xab, 0x8b, 0x15, 0x0c, 0xef, 0x0a, 0xd0, 0x3b, 0x73, 0xd2,
    0xdb, 0x9d, 0x2a, 0xa5, 0x23, 0x70, 0xdc, 0x26, 0x55, 0x80, 0xca, 0xf2, 0xc0, 0x18, 0xe3, 0xe3,
    0x1b, 0xad, 0xd5, 0x22, 0xdd, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    0x00, 0x1c, 0x05, 0x71, 0x52, 0x7c, 0x3a, 0xb0, 0x77,
  ]
 # Output buffer sized to the modulus length declared in the input (121 bytes).
 var r = newSeq[byte](121)
 # Reset the metering state before the single measured call.
 resetMetering()
 let status = eth_evm_modexp(r, input)
 # `doAssert` is kept in release builds, so a failed call aborts before reporting.
 doAssert status == cttEVM_Success
 # Report label: "UseAssembly" when either x86 assembly flag is set, "NoAssembly" otherwise.
 const flags = if UseASM_X86_64 or UseASM_X86_32: "UseAssembly" else: "NoAssembly"
 reportCli(Metrics, flags)
--- a/metering/reports.nim
+++ b/metering/reports.nim
@ -33,6 +33,11 @@ proc reportCli*(metrics: seq[Metadata], flags: string) =
    for m in metrics:
      if m.numCalls == 0:
        continue
      let shortname = block:
        if m.procName.len <= 150: m.procName.replace('\n', ' ')
        else: m.procName[0..145].replace('\n', ' ') & " ..."
      # TODO: running variance / standard deviation but the Welford method is quite costly.
      #       https://nim-lang.org/docs/stats.html / https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Welford's_online_algorithm
      let cumulTimeUs = m.cumulatedTimeNs.float64 * 1e-3
@ -40,11 +45,11 @@ proc reportCli*(metrics: seq[Metadata], flags: string) =
      let throughput = 1e6 / avgTimeUs
      let cumulCyclesBillions = m.cumulatedCycles.float64 * 1e-9
      let avgCyclesBillions = cumulCyclesBillions / m.numCalls.float64
-      echo &"""|{m.procName:<150}|{m.numCalls:>14}|{throughput:>20.3f}|{cumulTimeUs:>15.3f}|{avgTimeUs:>17.3f}|"""
+      echo &"""|{shortname:<150}|{m.numCalls:>14}|{throughput:>20.3f}|{cumulTimeUs:>15.3f}|{avgTimeUs:>17.3f}|"""
    echo lineSep
  else:
-    const lineSep = &"""|{'-'.repeat(50)}|{'-'.repeat(14)}|{'-'.repeat(20)}|{'-'.repeat(15)}|{'-'.repeat(17)}|"""
+    const lineSep = &"""|{'-'.repeat(150)}|{'-'.repeat(14)}|{'-'.repeat(20)}|{'-'.repeat(15)}|{'-'.repeat(17)}|"""
    echo "\n"
    echo lineSep
    echo &"""|{"Procedures":^150}|{"# of Calls":^14}|{"Throughput (ops/s)":^20}|{"Time (µs)":^15}|{"Avg Time (µs)":^17}|"""
@ -53,10 +58,15 @@ proc reportCli*(metrics: seq[Metadata], flags: string) =
    for m in metrics:
      if m.numCalls == 0:
        continue
      let shortname = block:
        if m.procName.len <= 150: m.procName.replace('\n', ' ')
        else: m.procName[0..145].replace('\n', ' ') & " ..."
      # TODO: running variance / standard deviation but the Welford method is quite costly.
      #       https://nim-lang.org/docs/stats.html / https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Welford's_online_algorithm
      let cumulTimeUs = m.cumulatedTimeNs.float64 * 1e-3
      let avgTimeUs = cumulTimeUs / m.numCalls.float64
      let throughput = 1e6 / avgTimeUs
-      echo &"""|{m.procName:<150}|{m.numCalls:>14}|{throughput:>20.3f}|{cumulTimeUs:>15.3f}|{avgTimeUs:>17.3f}|"""
+      echo &"""|{shortname:<150}|{m.numCalls:>14}|{throughput:>20.3f}|{cumulTimeUs:>15.3f}|{avgTimeUs:>17.3f}|"""
    echo lineSep
--- a/metering/tracer.nim
+++ b/metering/tracer.nim
@ -88,7 +88,7 @@ when CTT_METER or CTT_TRACE:
      let stopTime = getMonoTime()
      when SupportsGetTicks:
        let elapsedCycles = stopCycle - startCycle
-      let elapsedTime = inMicroseconds(stopTime - startTime)
+      let elapsedTime = inNanoseconds(stopTime - startTime)
      discard Metrics[id].cumulatedTimeNs.atomicInc(elapsedTime)
      when SupportsGetTicks: