modexp: 2.5x accel on small exponent (#268)

* add metering to modexp * modexp: accel exponent = 1 * modexp: improve runtime Montgomery constants compute. 2.49x faster on DOS vectors
2023-09-09 09:21:05 +02:00 · 2023-09-09 09:21:05 +02:00 · 15757557b4
parent f3a5f352b8
commit 15757557b4
16 changed files with 315 additions and 51 deletions
--- a/benchmarks/bench_evm_modexp_dos.nim
+++ b/benchmarks/bench_evm_modexp_dos.nim
@ -3,16 +3,16 @@ import
  ../constantine/math/arithmetic,
  ../constantine/math/io/io_bigints,
  ../constantine/platforms/abstractions,
-  ./platforms, ./bench_blueprint
+  ./bench_blueprint

 proc report(op: string, elapsedNs: int64, elapsedCycles: int64, iters: int) =
  let ns = elapsedNs div iters
  let cycles = elapsedCycles div iters
  let throughput = 1e9 / float64(ns)
  when SupportsGetTicks:
-    echo &"{op:<45} {throughput:>15.3f} ops/s {ns:>16} ns/op {cycles:>12} CPU cycles (approx)"
+    echo &"{op:<70} {throughput:>15.3f} ops/s {ns:>16} ns/op {cycles:>12} CPU cycles (approx)"
  else:
-    echo &"{op:<45} {throughput:>15.3f} ops/s {ns:>16} ns/op"
+    echo &"{op:<70} {throughput:>15.3f} ops/s {ns:>16} ns/op"

 template bench(fnCall: untyped, ticks, ns: var int64): untyped =
  block:
@ -148,11 +148,119 @@ proc dos1() =
        (let _ = r.eth_evm_modexp(input)),
        ticks, nanoseconds)

-  report("EVM Modexp - 32,32,32", nanoseconds, ticks, execsEIP2565)
+  report("EVM Modexp - 32,32,32 - even base & power-of-2 modulus", nanoseconds, ticks, execsEIP2565)
  echo "Total time: ", nanoseconds.float64 / 1e6, " ms for ", execsEIP2565, " iterations"

 proc dos2() =

+  let input = [
+    # Length of base (1)
+    uint8 0x00,
+          0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01,
+
+    # Length of exponent (1)
+    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01,
+
+    # Length of modulus (121)
+    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x79,
+
+    # Base
+    0x33,
+
+    # Exponent
+    0x01,
+
+    # Modulus
+    0x04, 0xea, 0xbb, 0x12, 0x55, 0x88, 0xd7, 0x3c, 0xad, 0x22, 0xea, 0x2b, 0x4a, 0x77, 0x6e, 0x9d,
+    0x4d, 0xfc, 0x13, 0xa8, 0x1b, 0xf9, 0x0c, 0x0d, 0x37, 0xe8, 0x4e, 0x8b, 0xeb, 0xb2, 0xa5, 0x48,
+    0x8b, 0x2c, 0x87, 0x6d, 0x13, 0x51, 0x75, 0xeb, 0x97, 0xc6, 0x13, 0xd9, 0x06, 0xce, 0x8b, 0x53,
+    0xd0, 0x02, 0x68, 0xb8, 0xd6, 0x12, 0xab, 0x8b, 0x15, 0x0c, 0xef, 0x0a, 0xd0, 0x3b, 0x73, 0xd2,
+    0xdb, 0x9d, 0x2a, 0xa5, 0x23, 0x70, 0xdc, 0x26, 0x55, 0x80, 0xca, 0xf2, 0xc0, 0x18, 0xe3, 0xe3,
+    0x1b, 0xad, 0xd5, 0x22, 0xdd, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+    0x00, 0x1c, 0x05, 0x71, 0x52, 0x7c, 0x3a, 0xb0, 0x77,
+  ]
+
+  var r = newSeq[byte](121)
+  var ticks, nanoseconds: int64
+
+  let (gasFeeEIP198, gasFeeEIP2565) = computeGasFee(input)
+  const blockSize = 30000000
+
+  let execsEIP198 = blockSize div gasFeeEIP198
+  let execsEIP2565 = blockSize div gasFeeEIP2565
+
+  echo "Gas cost: ", gasFeeEIP198, " gas (EIP-198) - ", execsEIP198, " executions per block"
+  echo "Gas cost: ", gasFeeEIP2565, " gas (EIP-2565) - ", execsEIP2565, " executions per block"
+
+  for i in 0 ..< execsEIP2565:
+      bench(
+        (let _ = r.eth_evm_modexp(input)),
+        ticks, nanoseconds)
+
+  report("EVM Modexp - 1,1,121 - exponent=1 and odd modulus", nanoseconds, ticks, execsEIP2565)
+  echo "Total time: ", nanoseconds.float64 / 1e6, " ms for ", execsEIP2565, " iterations"
+
+proc dos2a() =
+  # shortcuttable variation with even modulus
+
+  let input = [
+    # Length of base (1)
+    uint8 0x00,
+          0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01,
+
+    # Length of exponent (1)
+    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01,
+
+    # Length of modulus (121)
+    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x79,
+
+    # Base
+    0x33,
+
+    # Exponent
+    0x01,
+
+    # Modulus
+    0x04, 0xea, 0xbb, 0x12, 0x55, 0x88, 0xd7, 0x3c, 0xad, 0x22, 0xea, 0x2b, 0x4a, 0x77, 0x6e, 0x9d,
+    0x4d, 0xfc, 0x13, 0xa8, 0x1b, 0xf9, 0x0c, 0x0d, 0x37, 0xe8, 0x4e, 0x8b, 0xeb, 0xb2, 0xa5, 0x48,
+    0x8b, 0x2c, 0x87, 0x6d, 0x13, 0x51, 0x75, 0xeb, 0x97, 0xc6, 0x13, 0xd9, 0x06, 0xce, 0x8b, 0x53,
+    0xd0, 0x02, 0x68, 0xb8, 0xd6, 0x12, 0xab, 0x8b, 0x15, 0x0c, 0xef, 0x0a, 0xd0, 0x3b, 0x73, 0xd2,
+    0xdb, 0x9d, 0x2a, 0xa5, 0x23, 0x70, 0xdc, 0x26, 0x55, 0x80, 0xca, 0xf2, 0xc0, 0x18, 0xe3, 0xe3,
+    0x1b, 0xad, 0xd5, 0x22, 0xdd, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+    0x00, 0x1c, 0x05, 0x71, 0x52, 0x7c, 0x3a, 0xb0, 0x76,
+  ]
+
+  var r = newSeq[byte](121)
+  var ticks, nanoseconds: int64
+
+  let (gasFeeEIP198, gasFeeEIP2565) = computeGasFee(input)
+  const blockSize = 30000000
+
+  let execsEIP198 = blockSize div gasFeeEIP198
+  let execsEIP2565 = blockSize div gasFeeEIP2565
+
+  echo "Gas cost: ", gasFeeEIP198, " gas (EIP-198) - ", execsEIP198, " executions per block"
+  echo "Gas cost: ", gasFeeEIP2565, " gas (EIP-2565) - ", execsEIP2565, " executions per block"
+
+  for i in 0 ..< execsEIP2565:
+      bench(
+        (let _ = r.eth_evm_modexp(input)),
+        ticks, nanoseconds)
+
+  report("EVM Modexp - 1,1,121 - exponent=1 and even modulus", nanoseconds, ticks, execsEIP2565)
+  echo "Total time: ", nanoseconds.float64 / 1e6, " ms for ", execsEIP2565, " iterations"
+
+proc dos2b() =
+  # even-exponent variation with no shortcut
+
  let input = [
    # Length of base (1)
    uint8 0x00,
@ -201,10 +309,11 @@ proc dos2() =
        (let _ = r.eth_evm_modexp(input)),
        ticks, nanoseconds)

-  report("EVM Modexp - 1,1,121", nanoseconds, ticks, execsEIP2565)
+  report("EVM Modexp - 1,1,121 - exponent=16 and odd modulus", nanoseconds, ticks, execsEIP2565)
  echo "Total time: ", nanoseconds.float64 / 1e6, " ms for ", execsEIP2565, " iterations"

-proc dos3() =
+proc dos2c() =
+  # odd-exponent variation with no shortcut

  let input = [
    # Length of base (1)
@ -254,11 +363,71 @@ proc dos3() =
      (let _ = r.eth_evm_modexp(input)),
      ticks, nanoseconds)

-  report("EVM Modexp - 1,1,121", nanoseconds, ticks, execsEIP2565)
+  report("EVM Modexp - 1,1,121 - exponent=7 and odd modulus", nanoseconds, ticks, execsEIP2565)
+  echo "Total time: ", nanoseconds.float64 / 1e6, " ms for ", execsEIP2565, " iterations"
+
+proc dos2d() =
+  # odd-exponent variation with no shortcut and power-of-2 modulus
+
+  let input = [
+    # Length of base (1)
+    uint8 0x00,
+          0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01,
+
+    # Length of exponent (1)
+    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01,
+
+    # Length of modulus (121)
+    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x79,
+
+    # Base
+    0x33,
+
+    # Exponent
+    0x07,
+
+    # Modulus
+    0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+  ]
+
+  var r = newSeq[byte](121)
+  var ticks, nanoseconds: int64
+
+  let (gasFeeEIP198, gasFeeEIP2565) = computeGasFee(input)
+  const blockSize = 30000000
+
+  let execsEIP198 = blockSize div gasFeeEIP198
+  let execsEIP2565 = blockSize div gasFeeEIP2565
+
+  echo "Gas cost: ", gasFeeEIP198, " gas (EIP-198) - ", execsEIP198, " executions per block"
+  echo "Gas cost: ", gasFeeEIP2565, " gas (EIP-2565) - ", execsEIP2565, " executions per block"
+
+  for i in 0 ..< execsEIP2565:
+    bench(
+      (let _ = r.eth_evm_modexp(input)),
+      ticks, nanoseconds)
+
+  report("EVM Modexp - 1,1,121 - exponent=7 and power-of-2 modulus", nanoseconds, ticks, execsEIP2565)
  echo "Total time: ", nanoseconds.float64 / 1e6, " ms for ", execsEIP2565, " iterations"

 dos1()
 echo "\n"
 dos2()
 echo "\n"
-dos3()
+dos2a()
+echo "\n"
+dos2b()
+echo "\n"
+dos2c()
+echo "\n"
+dos2d()
--- a/constantine/ethereum_evm_precompiles.nim
+++ b/constantine/ethereum_evm_precompiles.nim
@ -370,7 +370,7 @@ func eth_evm_ecpairing*(
    r[r.len-1] = byte 1
  return cttEVM_Success

-func eth_evm_modexp*(r: var openArray[byte], inputs: openArray[byte]): CttEVMStatus {.noInline, tags:[Alloca, Vartime].} =
+func eth_evm_modexp*(r: var openArray[byte], inputs: openArray[byte]): CttEVMStatus {.noInline, tags:[Alloca, Vartime], meter.} =
  ## Modular exponentiation
  ##
  ## Name: MODEXP
--- a/constantine/math/elliptic/ec_scalar_mul_vartime.nim
+++ b/constantine/math/elliptic/ec_scalar_mul_vartime.nim
@ -38,7 +38,7 @@ template `+=`[F; G: static Subgroup](P: var (ECP_ShortW_Jac[F, G] or ECP_ShortW_
 template `-=`[F; G: static Subgroup](P: var (ECP_ShortW_Jac[F, G] or ECP_ShortW_Prj[F, G]), Q: ECP_ShortW_Aff[F, G]) =
  P.msub_vartime(P, Q)

-func scalarMul_doubleAdd_vartime*[EC](P: var EC, scalar: BigInt) {.tags:[VarTime].} =
+func scalarMul_doubleAdd_vartime*[EC](P: var EC, scalar: BigInt) {.tags:[VarTime], meter.} =
  ## **Variable-time** Elliptic Curve Scalar Multiplication
  ##
  ##   P <- [k] P
@ -67,7 +67,7 @@ func scalarMul_doubleAdd_vartime*[EC](P: var EC, scalar: BigInt) {.tags:[VarTime
        else:
          P += Paff

-func scalarMul_addchain_4bit_vartime[EC](P: var EC, scalar: BigInt) {.tags:[VarTime].} =
+func scalarMul_addchain_4bit_vartime[EC](P: var EC, scalar: BigInt) {.tags:[VarTime], meter.} =
  ## **Variable-time** Elliptic Curve Scalar Multiplication
  ## This can only handle small scalars, up to 2⁴ = 16 excluded
  let s = uint scalar.limbs[0]
@ -206,7 +206,7 @@ func accumNAF[precompSize, NafMax: static int, EC, ECaff](
    elif digit < 0:
      P -= tab[-digit shr 1]

-func scalarMul_minHammingWeight_windowed_vartime*[EC](P: var EC, scalar: BigInt, window: static int) {.tags:[VarTime, Alloca].} =
+func scalarMul_minHammingWeight_windowed_vartime*[EC](P: var EC, scalar: BigInt, window: static int) {.tags:[VarTime, Alloca], meter.} =
  ## **Variable-time** Elliptic Curve Scalar Multiplication
  ##
  ##   P <- [k] P
@ -246,7 +246,7 @@ func scalarMul_minHammingWeight_windowed_vartime*[EC](P: var EC, scalar: BigInt,
 func scalarMulEndo_minHammingWeight_windowed_vartime*[scalBits: static int; EC](
       P: var EC,
       scalar: BigInt[scalBits],
-       window: static int) {.tags:[VarTime, Alloca].} =
+       window: static int) {.tags:[VarTime, Alloca], meter.} =
  ## Endomorphism-accelerated windowed vartime scalar multiplication
  ##
  ##   P <- [k] P
--- a/constantine/math/elliptic/ec_shortweierstrass_batch_ops.nim
+++ b/constantine/math/elliptic/ec_shortweierstrass_batch_ops.nim
@ -88,7 +88,7 @@ func batchAffine*[N: static int, F, G](
 func batchAffine*[F, G](
       affs: ptr UncheckedArray[ECP_ShortW_Aff[F, G]],
       jacs: ptr UncheckedArray[ECP_ShortW_Jac[F, G]],
-       N: int) {.noInline, tags:[Alloca].} =
+       N: int) {.noInline, tags:[Alloca], meter.} =
  # Algorithm: Montgomery's batch inversion
  # - Speeding the Pollard and Elliptic Curve Methods of Factorization
  #   Section 10.3.1
--- a/constantine/math_arbitrary_precision/arithmetic/bigints_views.nim
+++ b/constantine/math_arbitrary_precision/arithmetic/bigints_views.nim
@ -46,7 +46,7 @@ func powOddMod_vartime*(
       a: openArray[SecretWord],
       exponent: openArray[byte],
       M: openArray[SecretWord],
-       window: int) {.noInline, tags:[Alloca, VarTime].} =
+       window: int) {.noInline, tags:[Alloca, VarTime], meter.} =
  ## r <- a^exponent (mod M) with M odd
  ## assumes a < M
  ##
@ -57,6 +57,12 @@ func powOddMod_vartime*(

  let aBits  = a.getBits_LE_vartime()
  let mBits  = M.getBits_LE_vartime()
+  let eBits  = exponent.getBits_BE_vartime()
+
+  if eBits == 1:
+    r.view().reduce(a.view(), aBits, M.view(), mBits)
+    return
+
  let L      = wordsRequired(mBits)
  let m0ninv = M[0].negInvModWord()
  var rMont  = allocStackArray(SecretWord, L)
@ -97,7 +103,7 @@ func powMod_vartime*(
       a: openArray[SecretWord],
       exponent: openArray[byte],
       M: openArray[SecretWord],
-       window: int) {.noInline, tags:[Alloca, VarTime].} =
+       window: int) {.noInline, tags:[Alloca, VarTime], meter.} =
  ## r <- a^exponent (mod M) with M odd
  ## assumes a < exponent
  ##
--- a/constantine/math_arbitrary_precision/arithmetic/limbs_division.nim
+++ b/constantine/math_arbitrary_precision/arithmetic/limbs_division.nim
@ -142,7 +142,7 @@ func shlAddMod(a: LimbsViewMut, aLen: int,

 func reduce*(r: LimbsViewMut,
            a: LimbsViewAny, aBits: int,
-            M: LimbsViewConst, mBits: int) =
+            M: LimbsViewConst, mBits: int) {.meter.} =
  ## Reduce `a` modulo `M` and store the result in `r`
  ##
  ## The modulus `M` most-significant bit at `mBits` MUST be set.
--- a/constantine/math_arbitrary_precision/arithmetic/limbs_extmul.nim
+++ b/constantine/math_arbitrary_precision/arithmetic/limbs_extmul.nim
@ -39,6 +39,6 @@ func prod_comba(r: var openArray[SecretWord], a, b: openArray[SecretWord]) {.noI
  for i in stopEx ..< r.len:
    r[i] = Zero

-func prod*(r: var openArray[SecretWord], a, b: openArray[SecretWord]) {.inline.}=
+func prod*(r: var openArray[SecretWord], a, b: openArray[SecretWord]) {.inline, meter.}=
  ## Extended precision multiplication
  r.prod_comba(a, b)
--- a/constantine/math_arbitrary_precision/arithmetic/limbs_fixedprec.nim
+++ b/constantine/math_arbitrary_precision/arithmetic/limbs_fixedprec.nim
@ -30,7 +30,7 @@ import
 # Comparison
 # ------------------------------------------------------------

-func lt*(a, b: distinct LimbsViewAny, len: int): SecretBool =
+func lt*(a, b: distinct LimbsViewAny, len: int): SecretBool {.meter.} =
  ## Returns true if a < b
  ## Comparison is constant-time
  var diff: SecretWord
@ -43,7 +43,7 @@ func lt*(a, b: distinct LimbsViewAny, len: int): SecretBool =
 # Type-erased add-sub
 # ------------------------------------------------------------

-func cadd*(a: LimbsViewMut, b: LimbsViewAny, ctl: SecretBool, len: int): Carry =
+func cadd*(a: LimbsViewMut, b: LimbsViewAny, ctl: SecretBool, len: int): Carry {.meter.} =
  ## Type-erased conditional addition
  ## Returns the carry
  ##
@ -58,7 +58,7 @@ func cadd*(a: LimbsViewMut, b: LimbsViewAny, ctl: SecretBool, len: int): Carry =
    addC(result, sum, a[i], b[i], result)
    ctl.ccopy(a[i], sum)

-func csub*(a: LimbsViewMut, b: LimbsViewAny, ctl: SecretBool, len: int): Borrow =
+func csub*(a: LimbsViewMut, b: LimbsViewAny, ctl: SecretBool, len: int): Borrow {.meter.} =
  ## Type-erased conditional subtraction
  ## Returns the borrow
  ##
--- a/constantine/math_arbitrary_precision/arithmetic/limbs_mod.nim
+++ b/constantine/math_arbitrary_precision/arithmetic/limbs_mod.nim
@ -17,7 +17,7 @@ import
 #
 # ############################################################

-func addmod_vartime*(r: var openArray[SecretWord], a, b, M: openArray[SecretWord]) =
+func addmod_vartime*(r: var openArray[SecretWord], a, b, M: openArray[SecretWord]) {.meter.} =
  ## r <- a+b (mod M)
  ## assumes a and b are in the range [0, M)

@ -43,6 +43,6 @@ func addmod_vartime*(r: var openArray[SecretWord], a, b, M: openArray[SecretWord
    for i in 0 ..< r.len:
      r[i] = t[i]

-func doublemod_vartime*(r: var openArray[SecretWord], a, M: openArray[SecretWord]) {.inline.} =
+func doublemod_vartime*(r: var openArray[SecretWord], a, M: openArray[SecretWord]) {.inline, meter.} =
  ## r <- 2a (mod M)
  r.addmod_vartime(a, a, M)
--- a/constantine/math_arbitrary_precision/arithmetic/limbs_mod2k.nim
+++ b/constantine/math_arbitrary_precision/arithmetic/limbs_mod2k.nim
@ -17,7 +17,7 @@ import
 # No exceptions allowed
 {.push raises: [], checks: off.}

-func mod2k_vartime*(a: var openArray[SecretWord], k: uint) =
+func mod2k_vartime*(a: var openArray[SecretWord], k: uint) {.meter.} =
  ## a <- a (mod 2ᵏ)
  const SlotShift = log2_vartime(WordBitWidth.uint32)
  const SelectMask = WordBitWidth - 1
@ -38,7 +38,7 @@ func mod2k_vartime*(a: var openArray[SecretWord], k: uint) =
  for i in hiIndex+1 ..< a.len:
    a[i] = Zero

-func submod2k_vartime*(r{.noAlias.}: var openArray[SecretWord], a, b: openArray[SecretWord], k: uint) =
+func submod2k_vartime*(r{.noAlias.}: var openArray[SecretWord], a, b: openArray[SecretWord], k: uint) {.meter.} =
  ## r <- a - b (mod 2ᵏ)
  debug:
    const SlotShift = log2_vartime(WordBitWidth.uint32)
@ -63,7 +63,7 @@ func submod2k_vartime*(r{.noAlias.}: var openArray[SecretWord], a, b: openArray[

  r.mod2k_vartime(k)

-func mulmod2k_vartime*(r: var openArray[SecretWord], a, b: openArray[SecretWord], k: uint) {.inline.} =
+func mulmod2k_vartime*(r: var openArray[SecretWord], a, b: openArray[SecretWord], k: uint) {.inline, meter.} =
  ## r <- a*b (mod 2ᵏ)
  r.prod(a, b)
  r.mod2k_vartime(k)
@ -75,7 +75,7 @@ iterator unpackLE(scalarByte: byte): bool =
 func powMod2k_vartime*(
       r{.noAlias.}: var openArray[SecretWord],
       a{.noAlias.}: openArray[SecretWord],
-       exponent: openArray[byte], k: uint) {.noInline, tags: [Alloca].} =
+       exponent: openArray[byte], k: uint) {.noInline, tags: [Alloca], meter.} =
  ## r <- a^exponent (mod 2ᵏ)
  ##
  ## Requires:
@ -115,6 +115,13 @@ func powMod2k_vartime*(
    r[0] = One  # x⁰ = 1, even for 0⁰
    return

+  if msb == 0: # exponent is 1
+    for i in 0 ..< min(r.len, a.len):
+      # range [r.len, a.len) will be truncated (mod 2ᵏ)
+      r[i] = a[i]
+    r.mod2k_vartime(k)
+    return
+
  if a.isEven().bool:
    let aTrailingZeroes = block:
      var i = 0
@ -155,7 +162,7 @@ func powMod2k_vartime*(
 func invModBitwidth(a: SecretWord): SecretWord {.borrow.}
  ## Inversion a⁻¹ (mod 2³²) or a⁻¹ (mod 2⁶⁴)

-func invMod2k_vartime*(r: var openArray[SecretWord], a: openArray[SecretWord], k: uint) {.noInline, tags: [Alloca].} =
+func invMod2k_vartime*(r: var openArray[SecretWord], a: openArray[SecretWord], k: uint) {.noInline, tags: [Alloca], meter.} =
  ## Inversion a⁻¹ (mod 2ᵏ)
  ## with 2ᵏ a multi-precision integer.
  #
--- a/constantine/math_arbitrary_precision/arithmetic/limbs_montgomery.nim
+++ b/constantine/math_arbitrary_precision/arithmetic/limbs_montgomery.nim
@ -11,7 +11,8 @@ import
  ../../platforms/[abstractions, allocs, bithacks],
  ./limbs_views,
  ./limbs_mod,
-  ./limbs_fixedprec
+  ./limbs_fixedprec,
+  ./limbs_division

 # No exceptions allowed
 {.push raises: [], checks: off.}
@ -66,18 +67,37 @@ func r_powmod_vartime(r: var openArray[SecretWord], M: openArray[SecretWord], n:
  for i in start ..< stop:
    r.doublemod_vartime(r, M)

-func oneMont_vartime*(r: var openArray[SecretWord], M: openArray[SecretWord]) =
+func oneMont_vartime*(r: var openArray[SecretWord], M: openArray[SecretWord]) {.meter.} =
  ## Returns 1 in Montgomery domain:
-  r.r_powmod_vartime(M, 1)

-func r2_vartime*(r: var openArray[SecretWord], M: openArray[SecretWord]) =
+  # r.r_powmod_vartime(M, 1)
+
+  let mBits = getBits_LE_vartime(M)
+
+  let t = allocStackArray(SecretWord, M.len + 1)
+  zeroMem(t, M.len*sizeof(SecretWord))
+  t[M.len] = One
+
+  r.view().reduce(LimbsViewMut t, M.len*WordBitWidth+1, M.view(), mBits)
+
+func r2_vartime*(r: var openArray[SecretWord], M: openArray[SecretWord]) {.meter.} =
  ## Returns the Montgomery domain magic constant for the input modulus:
  ##
  ##   R² ≡ R² (mod M) with R = (2^WordBitWidth)^numWords
  ##
  ## Assuming a field modulus of size 256-bit with 63-bit words, we require 5 words
  ##   R² ≡ ((2^63)^5)^2 (mod M) = 2^630 (mod M)
-  r.r_powmod_vartime(M, 2)
+
+  # r.r_powmod_vartime(M, 2)
+
+  let mBits = getBits_LE_vartime(M)
+
+  let t = allocStackArray(SecretWord, 2*M.len + 1)
+  zeroMem(t, 2*M.len*sizeof(SecretWord))
+  t[2*M.len] = One
+
+  r.view().reduce(LimbsViewMut t, 2*M.len*WordBitWidth+1, M.view(), mBits)
+

 # Montgomery multiplication
 # ------------------------------------------
@ -88,7 +108,7 @@ func mulMont_FIPS*(
       M: LimbsViewConst,
       m0ninv: SecretWord,
       mBits: int,
-       skipFinalSub: static bool = false) {.noInline, tags:[Alloca].} =
+       skipFinalSub: static bool = false) {.noInline, tags:[Alloca], meter.} =
  ## Montgomery Multiplication using Finely Integrated Product Scanning (FIPS)
  ##
  ## This maps
@ -138,7 +158,7 @@ func mulMont_FIPS*(
 # ------------------------------------------

 func fromMont*(r: LimbsViewMut, a: LimbsViewAny, M: LimbsViewConst,
-               m0ninv: SecretWord, mBits: int) {.noInline, tags:[Alloca].} =
+               m0ninv: SecretWord, mBits: int) {.noInline, tags:[Alloca], meter.} =
  ## Transform a bigint ``a`` from its Montgomery N-residue representation (mod N)
  ## to the regular natural representation (mod N)
  ##
@ -166,7 +186,7 @@ func fromMont*(r: LimbsViewMut, a: LimbsViewAny, M: LimbsViewConst,
  r.copyWords(0, t, 0, N)

 func getMont*(r: LimbsViewMut, a: LimbsViewAny, M, r2modM: LimbsViewConst,
-                   m0ninv: SecretWord, mBits: int) {.inline.} =
+                   m0ninv: SecretWord, mBits: int) {.inline, meter.} =
  ## Transform a bigint ``a`` from its natural representation (mod N)
  ## to the Montgomery n-residue representation
  ##
@ -233,7 +253,7 @@ func powMontPrologue(
       m0ninv: SecretWord,
       scratchspace: LimbsViewMut,
       scratchLen: int,
-       mBits: int): uint {.tags:[Alloca].} =
+       mBits: int): uint {.tags:[Alloca], meter.} =
  ## Setup the scratchspace
  ## Returns the fixed-window size for exponentiation with window optimization.
  # Precompute window content, special case for window = 1
@ -263,7 +283,7 @@ func powMontSquarings(
        tmp: LimbsViewMut,
        window: uint,
        acc, acc_len: var uint,
-        e: var int): tuple[k, bits: uint] {.inline.}=
+        e: var int): tuple[k, bits: uint] {.inline, meter.}=
  ## Squaring step of exponentiation by squaring
  ## Get the next k bits in range [1, window)
  ## Square k times
@ -309,7 +329,7 @@ func powMont*(
       m0ninv: SecretWord,
       scratchspace: LimbsViewMut,
       scratchLen: int,
-       mBits: int) =
+       mBits: int) {.meter.} =
  ## Modular exponentiation r = a^exponent mod M
  ## in the Montgomery domain
  ##
@ -379,7 +399,7 @@ func powMont_vartime*(
       m0ninv: SecretWord,
       scratchspace: LimbsViewMut,
       scratchLen: int,
-       mBits: int) {.tags:[VarTime, Alloca].} =
+       mBits: int) {.tags:[VarTime, Alloca], meter.} =
  ## Modular exponentiation a <- a^exponent (mod M)
  ## in the Montgomery domain
  ##
--- a/constantine/math_arbitrary_precision/arithmetic/limbs_multiprec.nim
+++ b/constantine/math_arbitrary_precision/arithmetic/limbs_multiprec.nim
@ -61,7 +61,7 @@ func shrWords(r {.noalias.}: var openArray[SecretWord], a: openArray[SecretWord]
  for i in a.len-w ..< r.len:
    r[i] = Zero

-func shiftRight_vartime*(r {.noalias.}: var openArray[SecretWord], a: openArray[SecretWord], k: SomeInteger) =
+func shiftRight_vartime*(r {.noalias.}: var openArray[SecretWord], a: openArray[SecretWord], k: SomeInteger) {.meter.} =
  ## Shift `a` right by k bits and store in `r`
  if k == 0:
    let min = min(a.len, r.len)
@ -87,7 +87,7 @@ func shiftRight_vartime*(r {.noalias.}: var openArray[SecretWord], a: openArray[
 # Arithmetic
 # --------------------------------------------------------

-func neg*(a: var openArray[SecretWord]) =
+func neg*(a: var openArray[SecretWord]) {.meter.} =
  ## Computes the additive inverse -a
  ## in 2-complement representation

@ -97,7 +97,7 @@ func neg*(a: var openArray[SecretWord]) =
  for i in 1 ..< a.len:
    addC(carry, a[i], not(a[i]), Zero, carry)

-func addMP*(r {.noAlias.}: var openArray[SecretWord], a, b: openArray[SecretWord]): bool =
+func addMP*(r {.noAlias.}: var openArray[SecretWord], a, b: openArray[SecretWord]): bool {.meter.} =
  ## r <- a + b
  ##   and
  ## returns the carry
@ -130,7 +130,7 @@ func addMP*(r {.noAlias.}: var openArray[SecretWord], a, b: openArray[SecretWord
  else:
    return bool carry

-func subMP*(r {.noAlias.}: var openArray[SecretWord], a, b: openArray[SecretWord]): bool =
+func subMP*(r {.noAlias.}: var openArray[SecretWord], a, b: openArray[SecretWord]): bool {.meter.} =
  ## r <- a - b
  ##   and
  ## returns false if a >= b
--- a/constantine/platforms/primitives.nim
+++ b/constantine/platforms/primitives.nim
@ -90,7 +90,7 @@ func setOne*(a: var openArray[SomeNumber]){.inline.} =
  a[0] = 1
  for i in 1 ..< a.len:
    a[i] = 0
-    
+
 func asBytes*(s: static string): auto =
  ## Reinterpret a compile-time string as an array of bytes
  const N = s.len
@ -104,8 +104,7 @@ func rawCopy*(
       dStart: SomeInteger,
       src: openArray[byte],
       sStart: SomeInteger,
-       len: SomeInteger
-     ) {.inline.} =
+       len: SomeInteger) {.inline.} =
  ## Copy dst[dStart ..< dStart+len] = src[sStart ..< sStart+len]
  ## Unlike the standard library, this cannot throw
  ## even a defect.
--- a/metering/m_modexp.nim
+++ b/metering/m_modexp.nim
@ -0,0 +1,53 @@
+# Constantine
+# Copyright (c) 2018-2019    Status Research & Development GmbH
+# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
+# Licensed and distributed under either of
+#   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
+#   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
+# at your option. This file may not be copied, modified, or distributed except according to those terms.
+
+import
+  ./reports, ./tracer,
+  ../constantine/ethereum_evm_precompiles,
+  ../constantine/platforms/abstractions
+
+let input = [
+    # Length of base (1)
+    uint8 0x00,
+          0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01,
+
+    # Length of exponent (1)
+    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01,
+
+    # Length of modulus (121)
+    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x79,
+
+    # Base
+    0x33,
+
+    # Exponent
+    0x07,
+
+    # Modulus
+    0x04, 0xea, 0xbb, 0x12, 0x55, 0x88, 0xd7, 0x3c, 0xad, 0x22, 0xea, 0x2b, 0x4a, 0x77, 0x6e, 0x9d,
+    0x4d, 0xfc, 0x13, 0xa8, 0x1b, 0xf9, 0x0c, 0x0d, 0x37, 0xe8, 0x4e, 0x8b, 0xeb, 0xb2, 0xa5, 0x48,
+    0x8b, 0x2c, 0x87, 0x6d, 0x13, 0x51, 0x75, 0xeb, 0x97, 0xc6, 0x13, 0xd9, 0x06, 0xce, 0x8b, 0x53,
+    0xd0, 0x02, 0x68, 0xb8, 0xd6, 0x12, 0xab, 0x8b, 0x15, 0x0c, 0xef, 0x0a, 0xd0, 0x3b, 0x73, 0xd2,
+    0xdb, 0x9d, 0x2a, 0xa5, 0x23, 0x70, 0xdc, 0x26, 0x55, 0x80, 0xca, 0xf2, 0xc0, 0x18, 0xe3, 0xe3,
+    0x1b, 0xad, 0xd5, 0x22, 0xdd, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+    0x00, 0x1c, 0x05, 0x71, 0x52, 0x7c, 0x3a, 0xb0, 0x77,
+  ]
+
+var r = newSeq[byte](121)
+
+resetMetering()
+
+let status = eth_evm_modexp(r, input)
+doAssert status == cttEVM_Success
+
+const flags = if UseASM_X86_64 or UseASM_X86_32: "UseAssembly" else: "NoAssembly"
+reportCli(Metrics, flags)
--- a/metering/reports.nim
+++ b/metering/reports.nim
@ -33,6 +33,11 @@ proc reportCli*(metrics: seq[Metadata], flags: string) =
    for m in metrics:
      if m.numCalls == 0:
        continue
+
+      let shortname = block:
+        if m.procName.len <= 150: m.procName.replace('\n', ' ')
+        else: m.procName[0..145].replace('\n', ' ') & " ..."
+
      # TODO: running variance / standard deviation but the Welford method is quite costly.
      #       https://nim-lang.org/docs/stats.html / https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Welford's_online_algorithm
      let cumulTimeUs = m.cumulatedTimeNs.float64 * 1e-3
@ -40,11 +45,11 @@ proc reportCli*(metrics: seq[Metadata], flags: string) =
      let throughput = 1e6 / avgTimeUs
      let cumulCyclesBillions = m.cumulatedCycles.float64 * 1e-9
      let avgCyclesBillions = cumulCyclesBillions / m.numCalls.float64
-      echo &"""|{m.procName:<150}|{m.numCalls:>14}|{throughput:>20.3f}|{cumulTimeUs:>15.3f}|{avgTimeUs:>17.3f}|"""
+      echo &"""|{shortname:<150}|{m.numCalls:>14}|{throughput:>20.3f}|{cumulTimeUs:>15.3f}|{avgTimeUs:>17.3f}|"""
    echo lineSep

  else:
-    const lineSep = &"""|{'-'.repeat(50)}|{'-'.repeat(14)}|{'-'.repeat(20)}|{'-'.repeat(15)}|{'-'.repeat(17)}|"""
+    const lineSep = &"""|{'-'.repeat(150)}|{'-'.repeat(14)}|{'-'.repeat(20)}|{'-'.repeat(15)}|{'-'.repeat(17)}|"""
    echo "\n"
    echo lineSep
    echo &"""|{"Procedures":^150}|{"# of Calls":^14}|{"Throughput (ops/s)":^20}|{"Time (µs)":^15}|{"Avg Time (µs)":^17}|"""
@ -53,10 +58,15 @@ proc reportCli*(metrics: seq[Metadata], flags: string) =
    for m in metrics:
      if m.numCalls == 0:
        continue
+
+      let shortname = block:
+        if m.procName.len <= 150: m.procName.replace('\n', ' ')
+        else: m.procName[0..145].replace('\n', ' ') & " ..."
+
      # TODO: running variance / standard deviation but the Welford method is quite costly.
      #       https://nim-lang.org/docs/stats.html / https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Welford's_online_algorithm
      let cumulTimeUs = m.cumulatedTimeNs.float64 * 1e-3
      let avgTimeUs = cumulTimeUs / m.numCalls.float64
      let throughput = 1e6 / avgTimeUs
-      echo &"""|{m.procName:<150}|{m.numCalls:>14}|{throughput:>20.3f}|{cumulTimeUs:>15.3f}|{avgTimeUs:>17.3f}|"""
+      echo &"""|{shortname:<150}|{m.numCalls:>14}|{throughput:>20.3f}|{cumulTimeUs:>15.3f}|{avgTimeUs:>17.3f}|"""
    echo lineSep
--- a/metering/tracer.nim
+++ b/metering/tracer.nim
@ -88,7 +88,7 @@ when CTT_METER or CTT_TRACE:
      let stopTime = getMonoTime()
      when SupportsGetTicks:
        let elapsedCycles = stopCycle - startCycle
-      let elapsedTime = inMicroseconds(stopTime - startTime)
+      let elapsedTime = inNanoseconds(stopTime - startTime)

      discard Metrics[id].cumulatedTimeNs.atomicInc(elapsedTime)
      when SupportsGetTicks: