Mirror of https://github.com/logos-storage/constantine.git (synced 2026-01-02 13:13:07 +00:00)
Multi-Scalar-Multiplication / Linear combination (#220)
* unoptimized msm
* MSM: reorder loops
* add a signed windowed recoding technique
* improve wNAF table access
* use batchAffine
* revamp EC tests
* MSM signed digit support
* refactor MSM: recode signed ahead of time
* missing test vector
* refactor allocs and Alloca sideeffect
* add an endomorphism threshold
* Add Jacobian extended coordinates
* refactor recodings, prepare for parallelizable on-the-fly signed recoding
* recoding changes, introduce proper NAF for pairings
* more pairings refactoring, introduce miller accumulator for EVM
* some optim to the addchain miller loop
* start optimizing multi-pairing
* finish multi-miller loop refactoring
* minor tuning
* MSM: signed encoding suitable for parallelism (no precompute)
* cleanup signed window encoding
* add prefetching
* add metering
* properly init result to infinity
* comment on prefetching
* introduce vartime inversion for batch additions
* fix JacExt infinity conversion
* add batchAffine for MSM, though slower than JacExtended at the moment
* add a batch affine scheduler for MSM
* Add Multi-Scalar-Multiplication endomorphism acceleration
* some tuning
* signed integer fixes + 32-bit + tuning
* Some more tuning
* common msm bench + don't use affine for c < 9
* nit
This commit is contained in:
parent 082cd1deb9
commit e5612f5705
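For context, every item above serves the same primitive: the multi-scalar multiplication (linear combination) r = [k0]P0 + [k1]P1 + ... + [kn-1]Pn-1. The sketch below is illustrative only and is not Constantine code: it runs a Pippenger-style windowed bucket accumulation over plain integers modulo m, with modular addition standing in for point addition; the name bucketMsm, the default window size and the 16-bit scalar bound are assumptions made for the example.

proc bucketMsm(scalars, points: seq[int]; m: int; c: int = 4): int =
  ## Pippenger bucket method over toy modular integers: the same loop
  ## structure as an optimized curve MSM, with "point addition"
  ## replaced by addition mod m. Scalars are assumed < 2^16 here.
  let numWindows = (16 + c - 1) div c
  for w in countdown(numWindows - 1, 0):
    for k in 0 ..< c:                          # "double" c times per window
      result = result * 2 mod m
    var buckets = newSeq[int](1 shl c)         # one bucket per non-zero digit
    for i in 0 ..< scalars.len:
      let digit = (scalars[i] shr (w*c)) and ((1 shl c) - 1)
      if digit != 0:
        buckets[digit] = (buckets[digit] + points[i]) mod m
    var running = 0                            # bucket reduction:
    var windowSum = 0                          # computes sum_d d*buckets[d]
    for d in countdown((1 shl c) - 1, 1):
      running = (running + buckets[d]) mod m
      windowSum = (windowSum + running) mod m
    result = (result + windowSum) mod m

echo bucketMsm(@[3, 5, 7], @[2, 4, 6], 97)     # 3*2 + 5*4 + 7*6 = 68

The doublings are shared by all n terms and each term costs one bucket addition per window, which is the structure that the signed-window, batch-affine and endomorphism work in this PR then refines.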
@ -12,7 +12,8 @@ import
|
||||
../constantine/math/arithmetic,
|
||||
../constantine/math/elliptic/[
|
||||
ec_shortweierstrass_projective,
|
||||
ec_shortweierstrass_jacobian],
|
||||
ec_shortweierstrass_jacobian,
|
||||
ec_shortweierstrass_jacobian_extended],
|
||||
# Helpers
|
||||
./bench_elliptic_template
|
||||
|
||||
@ -46,33 +47,32 @@ proc main() =
|
||||
const curve = AvailableCurves[i]
|
||||
addBench(ECP_ShortW_Prj[Fp[curve], G1], Iters)
|
||||
addBench(ECP_ShortW_Jac[Fp[curve], G1], Iters)
|
||||
addBench(ECP_ShortW_JacExt[Fp[curve], G1], Iters)
|
||||
mixedAddBench(ECP_ShortW_Prj[Fp[curve], G1], Iters)
|
||||
mixedAddBench(ECP_ShortW_Jac[Fp[curve], G1], Iters)
|
||||
mixedAddBench(ECP_ShortW_JacExt[Fp[curve], G1], Iters)
|
||||
doublingBench(ECP_ShortW_Prj[Fp[curve], G1], Iters)
|
||||
doublingBench(ECP_ShortW_Jac[Fp[curve], G1], Iters)
|
||||
doublingBench(ECP_ShortW_JacExt[Fp[curve], G1], Iters)
|
||||
separator()
|
||||
affFromProjBench(ECP_ShortW_Prj[Fp[curve], G1], MulIters)
|
||||
affFromJacBench(ECP_ShortW_Jac[Fp[curve], G1], MulIters)
|
||||
separator()
|
||||
scalarMulUnsafeDoubleAddBench(ECP_ShortW_Prj[Fp[curve], G1], MulIters)
|
||||
scalarMulUnsafeDoubleAddBench(ECP_ShortW_Jac[Fp[curve], G1], MulIters)
|
||||
for numPoints in [10, 100, 1000, 10000]:
|
||||
let batchIters = max(1, Iters div numPoints)
|
||||
affFromProjBatchBench(ECP_ShortW_Prj[Fp[curve], G1], numPoints, useBatching = false, batchIters)
|
||||
separator()
|
||||
scalarMulUnsafeMinHammingWeightRecodingBench(ECP_ShortW_Prj[Fp[curve], G1], MulIters)
|
||||
scalarMulUnsafeMinHammingWeightRecodingBench(ECP_ShortW_Jac[Fp[curve], G1], MulIters)
|
||||
for numPoints in [10, 100, 1000, 10000]:
|
||||
let batchIters = max(1, Iters div numPoints)
|
||||
affFromProjBatchBench(ECP_ShortW_Prj[Fp[curve], G1], numPoints, useBatching = true, batchIters)
|
||||
separator()
|
||||
scalarMulGenericBench(ECP_ShortW_Prj[Fp[curve], G1], window = 2, MulIters)
|
||||
scalarMulGenericBench(ECP_ShortW_Prj[Fp[curve], G1], window = 3, MulIters)
|
||||
scalarMulGenericBench(ECP_ShortW_Prj[Fp[curve], G1], window = 4, MulIters)
|
||||
scalarMulGenericBench(ECP_ShortW_Prj[Fp[curve], G1], window = 5, MulIters)
|
||||
scalarMulGenericBench(ECP_ShortW_Jac[Fp[curve], G1], window = 2, MulIters)
|
||||
scalarMulGenericBench(ECP_ShortW_Jac[Fp[curve], G1], window = 3, MulIters)
|
||||
scalarMulGenericBench(ECP_ShortW_Jac[Fp[curve], G1], window = 4, MulIters)
|
||||
scalarMulGenericBench(ECP_ShortW_Jac[Fp[curve], G1], window = 5, MulIters)
|
||||
for numPoints in [10, 100, 1000, 10000]:
|
||||
let batchIters = max(1, Iters div numPoints)
|
||||
affFromJacBatchBench(ECP_ShortW_Jac[Fp[curve], G1], numPoints, useBatching = false, batchIters)
|
||||
separator()
|
||||
scalarMulEndo(ECP_ShortW_Prj[Fp[curve], G1], MulIters)
|
||||
scalarMulEndoWindow(ECP_ShortW_Prj[Fp[curve], G1], MulIters)
|
||||
scalarMulEndo(ECP_ShortW_Jac[Fp[curve], G1], MulIters)
|
||||
scalarMulEndoWindow(ECP_ShortW_Jac[Fp[curve], G1], MulIters)
|
||||
for numPoints in [10, 100, 1000, 10000]:
|
||||
let batchIters = max(1, Iters div numPoints)
|
||||
affFromJacBatchBench(ECP_ShortW_Jac[Fp[curve], G1], numPoints, useBatching = true, batchIters)
|
||||
separator()
|
||||
separator()
|
||||
|
||||
|
||||
@ -14,6 +14,7 @@ import
|
||||
ec_shortweierstrass_affine,
|
||||
ec_shortweierstrass_projective,
|
||||
ec_shortweierstrass_jacobian,
|
||||
ec_shortweierstrass_jacobian_extended,
|
||||
ec_shortweierstrass_batch_ops_parallel],
|
||||
../constantine/platforms/threadpool/threadpool,
|
||||
# Helpers
|
||||
@ -38,7 +39,7 @@ proc multiAddParallelBench*(EC: typedesc, numPoints: int, iters: int) =
|
||||
var tp = Threadpool.new()
|
||||
|
||||
bench("EC parallel batch add (" & align($tp.numThreads, 2) & " threads) " & $EC.G & " (" & $numPoints & " points)", EC, iters):
|
||||
tp.sum_batch_vartime_parallel(r, points)
|
||||
tp.sum_reduce_vartime_parallel(r, points)
|
||||
|
||||
tp.shutdown()
|
||||
|
||||
@ -57,41 +58,55 @@ const AvailableCurves = [
|
||||
BLS12_381,
|
||||
]
|
||||
|
||||
# const testNumPoints = [10, 100, 1000, 10000, 100000]
|
||||
const testNumPoints = [4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 65536, 131072]
|
||||
|
||||
proc main() =
|
||||
separator()
|
||||
staticFor i, 0, AvailableCurves.len:
|
||||
const curve = AvailableCurves[i]
|
||||
addBench(ECP_ShortW_Prj[Fp[curve], G1], Iters)
|
||||
addBench(ECP_ShortW_Jac[Fp[curve], G1], Iters)
|
||||
mixedAddBench(ECP_ShortW_Prj[Fp[curve], G1], Iters)
|
||||
mixedAddBench(ECP_ShortW_Jac[Fp[curve], G1], Iters)
|
||||
doublingBench(ECP_ShortW_Prj[Fp[curve], G1], Iters)
|
||||
mixedAddBench(ECP_ShortW_Prj[Fp[curve], G1], Iters)
|
||||
addBench(ECP_ShortW_Jac[Fp[curve], G1], Iters)
|
||||
doublingBench(ECP_ShortW_Jac[Fp[curve], G1], Iters)
|
||||
mixedAddBench(ECP_ShortW_Jac[Fp[curve], G1], Iters)
|
||||
addBench(ECP_ShortW_JacExt[Fp[curve], G1], Iters)
|
||||
doublingBench(ECP_ShortW_JacExt[Fp[curve], G1], Iters)
|
||||
mixedAddBench(ECP_ShortW_JacExt[Fp[curve], G1], Iters)
|
||||
separator()
|
||||
for numPoints in [10, 100, 1000, 10000, 100000, 1000000]:
|
||||
for numPoints in testNumPoints:
|
||||
let batchIters = max(1, Iters div numPoints)
|
||||
multiAddBench(ECP_ShortW_Prj[Fp[curve], G1], numPoints, useBatching = false, batchIters)
|
||||
separator()
|
||||
for numPoints in [10, 100, 1000, 10000, 100000, 1000000]:
|
||||
for numPoints in testNumPoints:
|
||||
let batchIters = max(1, Iters div numPoints)
|
||||
multiAddBench(ECP_ShortW_Prj[Fp[curve], G1], numPoints, useBatching = true, batchIters)
|
||||
separator()
|
||||
for numPoints in [10, 100, 1000, 10000, 100000, 1000000]:
|
||||
for numPoints in testNumPoints:
|
||||
let batchIters = max(1, Iters div numPoints)
|
||||
multiAddParallelBench(ECP_ShortW_Prj[Fp[curve], G1], numPoints, batchIters)
|
||||
separator()
|
||||
for numPoints in [10, 100, 1000, 10000, 100000, 1000000]:
|
||||
for numPoints in testNumPoints:
|
||||
let batchIters = max(1, Iters div numPoints)
|
||||
multiAddBench(ECP_ShortW_Jac[Fp[curve], G1], numPoints, useBatching = false, batchIters)
|
||||
separator()
|
||||
for numPoints in [10, 100, 1000, 10000, 100000, 1000000]:
|
||||
for numPoints in testNumPoints:
|
||||
let batchIters = max(1, Iters div numPoints)
|
||||
multiAddBench(ECP_ShortW_Jac[Fp[curve], G1], numPoints, useBatching = true, batchIters)
|
||||
separator()
|
||||
for numPoints in [10, 100, 1000, 10000, 100000, 1000000]:
|
||||
for numPoints in testNumPoints:
|
||||
let batchIters = max(1, Iters div numPoints)
|
||||
multiAddParallelBench(ECP_ShortW_Jac[Fp[curve], G1], numPoints, batchIters)
|
||||
separator()
|
||||
for numPoints in testNumPoints:
|
||||
let batchIters = max(1, Iters div numPoints)
|
||||
multiAddBench(ECP_ShortW_JacExt[Fp[curve], G1], numPoints, useBatching = false, batchIters)
|
||||
separator()
|
||||
for numPoints in testNumPoints:
|
||||
let batchIters = max(1, Iters div numPoints)
|
||||
multiAddBench(ECP_ShortW_JacExt[Fp[curve], G1], numPoints, useBatching = true, batchIters)
|
||||
separator()
|
||||
separator()
|
||||
|
||||
main()
|
||||
|
||||
benchmarks/bench_ec_g1_msm_bls12_381.nim (new file, 60 lines)
@ -0,0 +1,60 @@
|
||||
# Constantine
|
||||
# Copyright (c) 2018-2019 Status Research & Development GmbH
|
||||
# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
|
||||
# Licensed and distributed under either of
|
||||
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
|
||||
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
|
||||
# at your option. This file may not be copied, modified, or distributed except according to those terms.
|
||||
|
||||
import
|
||||
# Internals
|
||||
../constantine/math/config/curves,
|
||||
../constantine/math/arithmetic,
|
||||
../constantine/math/elliptic/[
|
||||
ec_shortweierstrass_affine,
|
||||
ec_shortweierstrass_projective,
|
||||
ec_shortweierstrass_jacobian,
|
||||
ec_scalar_mul,
|
||||
ec_multi_scalar_mul],
|
||||
../constantine/math/constants/zoo_subgroups,
|
||||
# Helpers
|
||||
../helpers/prng_unsafe,
|
||||
./bench_elliptic_template,
|
||||
./bench_blueprint
|
||||
|
||||
# ############################################################
|
||||
#
|
||||
# Benchmark of the G1 group of
|
||||
# Short Weierstrass elliptic curves
|
||||
# in (homogeneous) projective coordinates
|
||||
#
|
||||
# ############################################################
|
||||
|
||||
|
||||
const Iters = 10_000
|
||||
const AvailableCurves = [
|
||||
BLS12_381,
|
||||
]
|
||||
|
||||
# const testNumPoints = [10, 100, 1000, 10000, 100000]
|
||||
const testNumPoints = [8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192,
|
||||
16384, 32768, 65536, 131072, 262144]
|
||||
|
||||
proc main() =
|
||||
separator()
|
||||
staticFor i, 0, AvailableCurves.len:
|
||||
const curve = AvailableCurves[i]
|
||||
separator()
|
||||
# for numPoints in testNumPoints:
|
||||
# let batchIters = max(1, Iters div numPoints)
|
||||
# msmBench(ECP_ShortW_Prj[Fp[curve], G1], numPoints, batchIters)
|
||||
# separator()
|
||||
# separator()
|
||||
for numPoints in testNumPoints:
|
||||
let batchIters = max(1, Iters div numPoints)
|
||||
msmBench(ECP_ShortW_Jac[Fp[curve], G1], numPoints, batchIters)
|
||||
separator()
|
||||
separator()
|
||||
|
||||
main()
|
||||
notes()
|
||||
benchmarks/bench_ec_g1_msm_bn254_snarks.nim (new file, 59 lines)
@ -0,0 +1,59 @@
|
||||
# Constantine
|
||||
# Copyright (c) 2018-2019 Status Research & Development GmbH
|
||||
# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
|
||||
# Licensed and distributed under either of
|
||||
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
|
||||
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
|
||||
# at your option. This file may not be copied, modified, or distributed except according to those terms.
|
||||
|
||||
import
|
||||
# Internals
|
||||
../constantine/math/config/curves,
|
||||
../constantine/math/arithmetic,
|
||||
../constantine/math/elliptic/[
|
||||
ec_shortweierstrass_affine,
|
||||
ec_shortweierstrass_projective,
|
||||
ec_shortweierstrass_jacobian,
|
||||
ec_scalar_mul,
|
||||
ec_multi_scalar_mul],
|
||||
../constantine/math/constants/zoo_subgroups,
|
||||
# Helpers
|
||||
../helpers/prng_unsafe,
|
||||
./bench_elliptic_template,
|
||||
./bench_blueprint
|
||||
|
||||
# ############################################################
|
||||
#
|
||||
# Benchmark of the G1 group of
|
||||
# Short Weierstrass elliptic curves
|
||||
# in (homogeneous) projective coordinates
|
||||
#
|
||||
# ############################################################
|
||||
|
||||
|
||||
const Iters = 10_000
|
||||
const AvailableCurves = [
|
||||
BN254_Snarks,
|
||||
]
|
||||
|
||||
const testNumPoints = [10, 100, 1000, 10000, 100000]
|
||||
# const testNumPoints = [64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072]
|
||||
|
||||
proc main() =
|
||||
separator()
|
||||
staticFor i, 0, AvailableCurves.len:
|
||||
const curve = AvailableCurves[i]
|
||||
separator()
|
||||
# for numPoints in testNumPoints:
|
||||
# let batchIters = max(1, Iters div numPoints)
|
||||
# msmBench(ECP_ShortW_Prj[Fp[curve], G1], numPoints, batchIters)
|
||||
# separator()
|
||||
# separator()
|
||||
for numPoints in testNumPoints:
|
||||
let batchIters = max(1, Iters div numPoints)
|
||||
msmBench(ECP_ShortW_Jac[Fp[curve], G1], numPoints, batchIters)
|
||||
separator()
|
||||
separator()
|
||||
|
||||
main()
|
||||
notes()
|
||||
benchmarks/bench_ec_g1_scalar_mul.nim (new file, 81 lines)
@ -0,0 +1,81 @@
|
||||
# Constantine
|
||||
# Copyright (c) 2018-2019 Status Research & Development GmbH
|
||||
# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
|
||||
# Licensed and distributed under either of
|
||||
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
|
||||
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
|
||||
# at your option. This file may not be copied, modified, or distributed except according to those terms.
|
||||
|
||||
import
|
||||
# Internals
|
||||
../constantine/math/config/curves,
|
||||
../constantine/math/arithmetic,
|
||||
../constantine/math/elliptic/[
|
||||
ec_shortweierstrass_projective,
|
||||
ec_shortweierstrass_jacobian],
|
||||
# Helpers
|
||||
./bench_elliptic_template
|
||||
|
||||
# ############################################################
|
||||
#
|
||||
# Benchmark of the G1 group of
|
||||
# Short Weierstrass elliptic curves
|
||||
# in (homogeneous) projective coordinates
|
||||
#
|
||||
# ############################################################
|
||||
|
||||
|
||||
const Iters = 10_000
|
||||
const MulIters = 100
|
||||
const AvailableCurves = [
|
||||
# P224,
|
||||
BN254_Nogami,
|
||||
BN254_Snarks,
|
||||
# Edwards25519,
|
||||
# P256,
|
||||
# Secp256k1,
|
||||
Pallas,
|
||||
Vesta,
|
||||
BLS12_377,
|
||||
BLS12_381,
|
||||
]
|
||||
|
||||
proc main() =
|
||||
separator()
|
||||
staticFor i, 0, AvailableCurves.len:
|
||||
const curve = AvailableCurves[i]
|
||||
const bits = 64 # curve.getCurveOrderBitwidth()
|
||||
scalarMulUnsafeDoubleAddBench(ECP_ShortW_Prj[Fp[curve], G1], bits, MulIters)
|
||||
scalarMulUnsafeDoubleAddBench(ECP_ShortW_Jac[Fp[curve], G1], bits, MulIters)
|
||||
separator()
|
||||
scalarMulUnsafeMinHammingWeightRecodingBench(ECP_ShortW_Prj[Fp[curve], G1], bits, MulIters)
|
||||
scalarMulUnsafeMinHammingWeightRecodingBench(ECP_ShortW_Jac[Fp[curve], G1], bits, MulIters)
|
||||
separator()
|
||||
scalarMulGenericBench(ECP_ShortW_Prj[Fp[curve], G1], bits, window = 2, MulIters)
|
||||
scalarMulGenericBench(ECP_ShortW_Prj[Fp[curve], G1], bits, window = 3, MulIters)
|
||||
scalarMulGenericBench(ECP_ShortW_Prj[Fp[curve], G1], bits, window = 4, MulIters)
|
||||
scalarMulGenericBench(ECP_ShortW_Prj[Fp[curve], G1], bits, window = 5, MulIters)
|
||||
scalarMulGenericBench(ECP_ShortW_Jac[Fp[curve], G1], bits, window = 2, MulIters)
|
||||
scalarMulGenericBench(ECP_ShortW_Jac[Fp[curve], G1], bits, window = 3, MulIters)
|
||||
scalarMulGenericBench(ECP_ShortW_Jac[Fp[curve], G1], bits, window = 4, MulIters)
|
||||
scalarMulGenericBench(ECP_ShortW_Jac[Fp[curve], G1], bits, window = 5, MulIters)
|
||||
separator()
|
||||
scalarMulUnsafeWNAFBench(ECP_ShortW_Prj[Fp[curve], G1], bits, window = 2, MulIters)
|
||||
scalarMulUnsafeWNAFBench(ECP_ShortW_Prj[Fp[curve], G1], bits, window = 3, MulIters)
|
||||
scalarMulUnsafeWNAFBench(ECP_ShortW_Prj[Fp[curve], G1], bits, window = 4, MulIters)
|
||||
scalarMulUnsafeWNAFBench(ECP_ShortW_Prj[Fp[curve], G1], bits, window = 5, MulIters)
|
||||
scalarMulUnsafeWNAFBench(ECP_ShortW_Jac[Fp[curve], G1], bits, window = 2, MulIters)
|
||||
scalarMulUnsafeWNAFBench(ECP_ShortW_Jac[Fp[curve], G1], bits, window = 3, MulIters)
|
||||
scalarMulUnsafeWNAFBench(ECP_ShortW_Jac[Fp[curve], G1], bits, window = 4, MulIters)
|
||||
scalarMulUnsafeWNAFBench(ECP_ShortW_Jac[Fp[curve], G1], bits, window = 5, MulIters)
|
||||
separator()
|
||||
when bits >= 196: # All endomorphisms constants are below this threshold
|
||||
scalarMulEndo( ECP_ShortW_Prj[Fp[curve], G1], bits, MulIters)
|
||||
scalarMulEndoWindow(ECP_ShortW_Prj[Fp[curve], G1], bits, MulIters)
|
||||
scalarMulEndo( ECP_ShortW_Jac[Fp[curve], G1], bits, MulIters)
|
||||
scalarMulEndoWindow(ECP_ShortW_Jac[Fp[curve], G1], bits, MulIters)
|
||||
separator()
|
||||
separator()
|
||||
|
||||
main()
|
||||
notes()
|
||||
@ -13,7 +13,8 @@ import
|
||||
../constantine/math/extension_fields,
|
||||
../constantine/math/elliptic/[
|
||||
ec_shortweierstrass_projective,
|
||||
ec_shortweierstrass_jacobian],
|
||||
ec_shortweierstrass_jacobian,
|
||||
ec_shortweierstrass_jacobian_extended],
|
||||
# Helpers
|
||||
./bench_elliptic_template,
|
||||
# Standard library
|
||||
@ -47,31 +48,32 @@ proc main() =
|
||||
const curve = AvailableCurves[i]
|
||||
addBench(ECP_ShortW_Prj[Fp2[curve], G2], Iters)
|
||||
addBench(ECP_ShortW_Jac[Fp2[curve], G2], Iters)
|
||||
addBench(ECP_ShortW_JacExt[Fp2[curve], G2], Iters)
|
||||
mixedAddBench(ECP_ShortW_Prj[Fp2[curve], G2], Iters)
|
||||
mixedAddBench(ECP_ShortW_Jac[Fp2[curve], G2], Iters)
|
||||
mixedAddBench(ECP_ShortW_JacExt[Fp2[curve], G2], Iters)
|
||||
doublingBench(ECP_ShortW_Prj[Fp2[curve], G2], Iters)
|
||||
doublingBench(ECP_ShortW_Jac[Fp2[curve], G2], Iters)
|
||||
doublingBench(ECP_ShortW_JacExt[Fp2[curve], G2], Iters)
|
||||
separator()
|
||||
affFromProjBench(ECP_ShortW_Prj[Fp2[curve], G2], MulIters)
|
||||
affFromJacBench(ECP_ShortW_Jac[Fp2[curve], G2], MulIters)
|
||||
separator()
|
||||
scalarMulUnsafeDoubleAddBench(ECP_ShortW_Prj[Fp2[curve], G2], MulIters)
|
||||
scalarMulUnsafeDoubleAddBench(ECP_ShortW_Jac[Fp2[curve], G2], MulIters)
|
||||
for numPoints in [10, 100, 1000, 10000]:
|
||||
let batchIters = max(1, Iters div numPoints)
|
||||
affFromProjBatchBench(ECP_ShortW_Prj[Fp[curve], G1], numPoints, useBatching = false, batchIters)
|
||||
separator()
|
||||
scalarMulUnsafeMinHammingWeightRecodingBench(ECP_ShortW_Prj[Fp2[curve], G2], MulIters)
|
||||
scalarMulUnsafeMinHammingWeightRecodingBench(ECP_ShortW_Jac[Fp2[curve], G2], MulIters)
|
||||
for numPoints in [10, 100, 1000, 10000]:
|
||||
let batchIters = max(1, Iters div numPoints)
|
||||
affFromProjBatchBench(ECP_ShortW_Prj[Fp[curve], G1], numPoints, useBatching = true, batchIters)
|
||||
separator()
|
||||
scalarMulGenericBench(ECP_ShortW_Prj[Fp2[curve], G2], window = 2, MulIters)
|
||||
scalarMulGenericBench(ECP_ShortW_Prj[Fp2[curve], G2], window = 3, MulIters)
|
||||
scalarMulGenericBench(ECP_ShortW_Prj[Fp2[curve], G2], window = 4, MulIters)
|
||||
scalarMulGenericBench(ECP_ShortW_Prj[Fp2[curve], G2], window = 5, MulIters)
|
||||
scalarMulGenericBench(ECP_ShortW_Jac[Fp2[curve], G2], window = 2, MulIters)
|
||||
scalarMulGenericBench(ECP_ShortW_Jac[Fp2[curve], G2], window = 3, MulIters)
|
||||
scalarMulGenericBench(ECP_ShortW_Jac[Fp2[curve], G2], window = 4, MulIters)
|
||||
scalarMulGenericBench(ECP_ShortW_Jac[Fp2[curve], G2], window = 5, MulIters)
|
||||
for numPoints in [10, 100, 1000, 10000]:
|
||||
let batchIters = max(1, Iters div numPoints)
|
||||
affFromJacBatchBench(ECP_ShortW_Jac[Fp[curve], G1], numPoints, useBatching = false, batchIters)
|
||||
separator()
|
||||
scalarMulEndo(ECP_ShortW_Prj[Fp2[curve], G2], MulIters)
|
||||
scalarMulEndo(ECP_ShortW_Jac[Fp2[curve], G2], MulIters)
|
||||
for numPoints in [10, 100, 1000, 10000]:
|
||||
let batchIters = max(1, Iters div numPoints)
|
||||
affFromJacBatchBench(ECP_ShortW_Jac[Fp[curve], G1], numPoints, useBatching = true, batchIters)
|
||||
separator()
|
||||
separator()
|
||||
|
||||
|
||||
benchmarks/bench_ec_g2_scalar_mul.nim (new file, 80 lines)
@ -0,0 +1,80 @@
|
||||
# Constantine
|
||||
# Copyright (c) 2018-2019 Status Research & Development GmbH
|
||||
# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
|
||||
# Licensed and distributed under either of
|
||||
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
|
||||
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
|
||||
# at your option. This file may not be copied, modified, or distributed except according to those terms.
|
||||
|
||||
import
|
||||
# Internals
|
||||
../constantine/math/config/curves,
|
||||
../constantine/math/arithmetic,
|
||||
../constantine/math/extension_fields,
|
||||
../constantine/math/elliptic/[
|
||||
ec_shortweierstrass_projective,
|
||||
ec_shortweierstrass_jacobian],
|
||||
# Helpers
|
||||
./bench_elliptic_template,
|
||||
# Standard library
|
||||
std/strutils
|
||||
|
||||
# ############################################################
|
||||
#
|
||||
# Benchmark of the G1 group of
|
||||
# Short Weierstrass elliptic curves
|
||||
# in (homogeneous) projective coordinates
|
||||
#
|
||||
# ############################################################
|
||||
|
||||
|
||||
const Iters = 10_000
|
||||
const MulIters = 500
|
||||
const AvailableCurves = [
|
||||
# P224,
|
||||
BN254_Nogami,
|
||||
BN254_Snarks,
|
||||
# Edwards25519,
|
||||
# P256,
|
||||
# Secp256k1,
|
||||
BLS12_377,
|
||||
BLS12_381,
|
||||
]
|
||||
|
||||
proc main() =
|
||||
separator()
|
||||
staticFor i, 0, AvailableCurves.len:
|
||||
const curve = AvailableCurves[i]
|
||||
const bits = 64 # curve.getCurveOrderBitwidth()
|
||||
scalarMulUnsafeDoubleAddBench(ECP_ShortW_Prj[Fp2[curve], G2], bits, MulIters)
|
||||
scalarMulUnsafeDoubleAddBench(ECP_ShortW_Jac[Fp2[curve], G2], bits, MulIters)
|
||||
separator()
|
||||
scalarMulUnsafeMinHammingWeightRecodingBench(ECP_ShortW_Prj[Fp2[curve], G2], bits, MulIters)
|
||||
scalarMulUnsafeMinHammingWeightRecodingBench(ECP_ShortW_Jac[Fp2[curve], G2], bits, MulIters)
|
||||
separator()
|
||||
scalarMulGenericBench(ECP_ShortW_Prj[Fp2[curve], G2], bits, window = 2, MulIters)
|
||||
scalarMulGenericBench(ECP_ShortW_Prj[Fp2[curve], G2], bits, window = 3, MulIters)
|
||||
scalarMulGenericBench(ECP_ShortW_Prj[Fp2[curve], G2], bits, window = 4, MulIters)
|
||||
scalarMulGenericBench(ECP_ShortW_Prj[Fp2[curve], G2], bits, window = 5, MulIters)
|
||||
scalarMulGenericBench(ECP_ShortW_Jac[Fp2[curve], G2], bits, window = 2, MulIters)
|
||||
scalarMulGenericBench(ECP_ShortW_Jac[Fp2[curve], G2], bits, window = 3, MulIters)
|
||||
scalarMulGenericBench(ECP_ShortW_Jac[Fp2[curve], G2], bits, window = 4, MulIters)
|
||||
scalarMulGenericBench(ECP_ShortW_Jac[Fp2[curve], G2], bits, window = 5, MulIters)
|
||||
separator()
|
||||
scalarMulUnsafeWNAFBench(ECP_ShortW_Prj[Fp2[curve], G2], bits, window = 2, MulIters)
|
||||
scalarMulUnsafeWNAFBench(ECP_ShortW_Prj[Fp2[curve], G2], bits, window = 3, MulIters)
|
||||
scalarMulUnsafeWNAFBench(ECP_ShortW_Prj[Fp2[curve], G2], bits, window = 4, MulIters)
|
||||
scalarMulUnsafeWNAFBench(ECP_ShortW_Prj[Fp2[curve], G2], bits, window = 5, MulIters)
|
||||
scalarMulUnsafeWNAFBench(ECP_ShortW_Jac[Fp2[curve], G2], bits, window = 2, MulIters)
|
||||
scalarMulUnsafeWNAFBench(ECP_ShortW_Jac[Fp2[curve], G2], bits, window = 3, MulIters)
|
||||
scalarMulUnsafeWNAFBench(ECP_ShortW_Jac[Fp2[curve], G2], bits, window = 4, MulIters)
|
||||
scalarMulUnsafeWNAFBench(ECP_ShortW_Jac[Fp2[curve], G2], bits, window = 5, MulIters)
|
||||
separator()
|
||||
when bits >= 196: # All endomorphisms constants are below this threshold
|
||||
scalarMulEndo(ECP_ShortW_Prj[Fp2[curve], G2], bits, MulIters)
|
||||
scalarMulEndo(ECP_ShortW_Jac[Fp2[curve], G2], bits, MulIters)
|
||||
separator()
|
||||
separator()
|
||||
|
||||
main()
|
||||
notes()
|
||||
@ -22,19 +22,21 @@ import
|
||||
ec_shortweierstrass_affine,
|
||||
ec_shortweierstrass_projective,
|
||||
ec_shortweierstrass_jacobian,
|
||||
ec_shortweierstrass_jacobian_extended,
|
||||
ec_shortweierstrass_batch_ops,
|
||||
ec_scalar_mul, ec_endomorphism_accel],
|
||||
../constantine/math/constants/zoo_subgroups,
|
||||
# Helpers
|
||||
../helpers/prng_unsafe,
|
||||
./platforms,
|
||||
./bench_blueprint,
|
||||
# Reference unsafe scalar multiplication
|
||||
../tests/math/support/ec_reference_scalar_mult
|
||||
../constantine/math/elliptic/ec_scalar_mul_vartime
|
||||
|
||||
export notes
|
||||
export abstractions # generic sandwich on SecretBool and SecretBool in Jacobian sum
|
||||
|
||||
proc separator*() = separator(177)
|
||||
proc separator*() = separator(206)
|
||||
|
||||
macro fixEllipticDisplay(EC: typedesc): untyped =
|
||||
# At compile-time, enums are integers and their display is buggy
|
||||
@ -50,18 +52,28 @@ proc report(op, elliptic: string, start, stop: MonoTime, startClk, stopClk: int6
|
||||
let ns = inNanoseconds((stop-start) div iters)
|
||||
let throughput = 1e9 / float64(ns)
|
||||
when SupportsGetTicks:
|
||||
echo &"{op:<60} {elliptic:<40} {throughput:>15.3f} ops/s {ns:>9} ns/op {(stopClk - startClk) div iters:>9} CPU cycles (approx)"
|
||||
echo &"{op:<80} {elliptic:<40} {throughput:>15.3f} ops/s {ns:>12} ns/op {(stopClk - startClk) div iters:>12} CPU cycles (approx)"
|
||||
else:
|
||||
echo &"{op:<60} {elliptic:<40} {throughput:>15.3f} ops/s {ns:>9} ns/op"
|
||||
echo &"{op:<80} {elliptic:<40} {throughput:>15.3f} ops/s {ns:>12} ns/op"
|
||||
|
||||
template bench*(op: string, EC: typedesc, iters: int, body: untyped): untyped =
|
||||
measure(iters, startTime, stopTime, startClk, stopClk, body)
|
||||
report(op, fixEllipticDisplay(EC), startTime, stopTime, startClk, stopClk, iters)
|
||||
|
||||
func `+=`[F; G: static Subgroup](P: var ECP_ShortW_JacExt[F, G], Q: ECP_ShortW_JacExt[F, G]) {.inline.}=
|
||||
P.sum_vartime(P, Q)
|
||||
func `+=`[F; G: static Subgroup](P: var ECP_ShortW_JacExt[F, G], Q: ECP_ShortW_Aff[F, G]) {.inline.}=
|
||||
P.madd_vartime(P, Q)
|
||||
|
||||
proc addBench*(EC: typedesc, iters: int) =
|
||||
var r {.noInit.}: EC
|
||||
let P = rng.random_unsafe(EC)
|
||||
let Q = rng.random_unsafe(EC)
|
||||
|
||||
when EC is ECP_ShortW_JacExt:
|
||||
bench("EC Add vartime " & $EC.G, EC, iters):
|
||||
r.sum_vartime(P, Q)
|
||||
else:
|
||||
bench("EC Add " & $EC.G, EC, iters):
|
||||
r.sum(P, Q)
|
||||
|
||||
@ -71,6 +83,11 @@ proc mixedAddBench*(EC: typedesc, iters: int) =
|
||||
let Q = rng.random_unsafe(EC)
|
||||
var Qaff: ECP_ShortW_Aff[EC.F, EC.G]
|
||||
Qaff.affine(Q)
|
||||
|
||||
when EC is ECP_ShortW_JacExt:
|
||||
bench("EC Mixed Addition vartime " & $EC.G, EC, iters):
|
||||
r.madd_vartime(P, Qaff)
|
||||
else:
|
||||
bench("EC Mixed Addition " & $EC.G, EC, iters):
|
||||
r.madd(P, Qaff)
|
||||
|
||||
@ -92,11 +109,40 @@ proc affFromJacBench*(EC: typedesc, iters: int) =
|
||||
bench("EC Jacobian to Affine " & $EC.G, EC, iters):
|
||||
r.affine(P)
|
||||
|
||||
proc scalarMulGenericBench*(EC: typedesc, window: static int, iters: int) =
|
||||
const bits = EC.F.C.getCurveOrderBitwidth()
|
||||
proc affFromProjBatchBench*(EC: typedesc, numPoints: int, useBatching: bool, iters: int) =
|
||||
var r = newSeq[affine(EC)](numPoints)
|
||||
var points = newSeq[EC](numPoints)
|
||||
|
||||
for i in 0 ..< numPoints:
|
||||
points[i] = rng.random_unsafe(EC)
|
||||
|
||||
if useBatching:
|
||||
bench("EC Projective to Affine - batched " & $EC.G & " (" & $numPoints & " points)", EC, iters):
|
||||
r.asUnchecked().batchAffine(points.asUnchecked(), numPoints)
|
||||
else:
|
||||
bench("EC Projective to Affine - unbatched " & $EC.G & " (" & $numPoints & " points)", EC, iters):
|
||||
for i in 0 ..< numPoints:
|
||||
r[i].affine(points[i])
|
||||
|
||||
proc affFromJacBatchBench*(EC: typedesc, numPoints: int, useBatching: bool, iters: int) =
|
||||
var r = newSeq[affine(EC)](numPoints)
|
||||
var points = newSeq[EC](numPoints)
|
||||
|
||||
for i in 0 ..< numPoints:
|
||||
points[i] = rng.random_unsafe(EC)
|
||||
|
||||
if useBatching:
|
||||
bench("EC Jacobian to Affine - batched " & $EC.G & " (" & $numPoints & " points)", EC, iters):
|
||||
r.asUnchecked().batchAffine(points.asUnchecked(), numPoints)
|
||||
else:
|
||||
bench("EC Jacobian to Affine - unbatched " & $EC.G & " (" & $numPoints & " points)", EC, iters):
|
||||
for i in 0 ..< numPoints:
|
||||
r[i].affine(points[i])
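The batched conversions above win because the per-point field inversions can be shared. Below is a minimal sketch of Montgomery's simultaneous-inversion trick over plain integers modulo a prime p; the helper names invMod and batchInv are illustrative, not Constantine APIs, and a naive Fermat-exponentiation inverse stands in for the field inversion.

proc invMod(a, p: int): int =
  ## Naive Fermat inversion a^(p-2) mod p, p assumed prime (illustration only).
  result = 1
  var base = a mod p
  var e = p - 2
  while e > 0:
    if (e and 1) == 1:
      result = result * base mod p
    base = base * base mod p
    e = e shr 1

proc batchInv(xs: seq[int], p: int): seq[int] =
  ## One inversion plus O(n) multiplications instead of n inversions.
  var prefix = newSeq[int](xs.len)
  var acc = 1
  for i in 0 ..< xs.len:
    prefix[i] = acc                # product of xs[0 .. i-1]
    acc = acc * xs[i] mod p
  var accInv = invMod(acc, p)      # the single expensive inversion
  result = newSeq[int](xs.len)
  for i in countdown(xs.len - 1, 0):
    result[i] = prefix[i] * accInv mod p
    accInv = accInv * xs[i] mod p

echo batchInv(@[2, 3, 4], 97)      # @[49, 65, 73]; check: 2*49 mod 97 == 1

One inversion plus roughly three multiplications per extra element replaces one inversion per element, which is why batchAffine and the vartime batch additions pull ahead as numPoints grows.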
|
||||
|
||||
proc scalarMulGenericBench*(EC: typedesc, bits, window: static int, iters: int) =
|
||||
var r {.noInit.}: EC
|
||||
let P = rng.random_unsafe(EC) # TODO: clear cofactor
|
||||
var P = rng.random_unsafe(EC)
|
||||
P.clearCofactor()
|
||||
|
||||
let exponent = rng.random_unsafe(BigInt[bits])
|
||||
|
||||
@ -104,11 +150,10 @@ proc scalarMulGenericBench*(EC: typedesc, window: static int, iters: int) =
|
||||
r = P
|
||||
r.scalarMulGeneric(exponent, window)
|
||||
|
||||
proc scalarMulEndo*(EC: typedesc, iters: int) =
|
||||
const bits = EC.F.C.getCurveOrderBitwidth()
|
||||
|
||||
proc scalarMulEndo*(EC: typedesc, bits: static int, iters: int) =
|
||||
var r {.noInit.}: EC
|
||||
let P = rng.random_unsafe(EC) # TODO: clear cofactor
|
||||
var P = rng.random_unsafe(EC)
|
||||
P.clearCofactor()
|
||||
|
||||
let exponent = rng.random_unsafe(BigInt[bits])
|
||||
|
||||
@ -116,11 +161,10 @@ proc scalarMulEndo*(EC: typedesc, iters: int) =
|
||||
r = P
|
||||
r.scalarMulEndo(exponent)
|
||||
|
||||
proc scalarMulEndoWindow*(EC: typedesc, iters: int) =
|
||||
const bits = EC.F.C.getCurveOrderBitwidth()
|
||||
|
||||
proc scalarMulEndoWindow*(EC: typedesc, bits: static int, iters: int) =
|
||||
var r {.noInit.}: EC
|
||||
let P = rng.random_unsafe(EC) # TODO: clear cofactor
|
||||
var P = rng.random_unsafe(EC)
|
||||
P.clearCofactor()
|
||||
|
||||
let exponent = rng.random_unsafe(BigInt[bits])
|
||||
|
||||
@ -131,29 +175,38 @@ proc scalarMulEndoWindow*(EC: typedesc, iters: int) =
|
||||
else:
|
||||
{.error: "Not implemented".}
|
||||
|
||||
proc scalarMulUnsafeDoubleAddBench*(EC: typedesc, iters: int) =
|
||||
const bits = EC.F.C.getCurveOrderBitwidth()
|
||||
|
||||
proc scalarMulUnsafeDoubleAddBench*(EC: typedesc, bits: static int, iters: int) =
|
||||
var r {.noInit.}: EC
|
||||
let P = rng.random_unsafe(EC) # TODO: clear cofactor
|
||||
var P = rng.random_unsafe(EC)
|
||||
P.clearCofactor()
|
||||
|
||||
let exponent = rng.random_unsafe(BigInt[bits])
|
||||
|
||||
bench("EC ScalarMul " & $bits & "-bit " & $EC.G & " (unsafe reference DoubleAdd)", EC, iters):
|
||||
r = P
|
||||
r.unsafe_ECmul_double_add(exponent)
|
||||
|
||||
proc scalarMulUnsafeMinHammingWeightRecodingBench*(EC: typedesc, iters: int) =
|
||||
const bits = EC.F.C.getCurveOrderBitwidth()
|
||||
r.scalarMul_doubleAdd_vartime(exponent)
|
||||
|
||||
proc scalarMulUnsafeMinHammingWeightRecodingBench*(EC: typedesc, bits: static int, iters: int) =
|
||||
var r {.noInit.}: EC
|
||||
var P = rng.random_unsafe(EC) # TODO: clear cofactor
|
||||
var P = rng.random_unsafe(EC)
|
||||
P.clearCofactor()
|
||||
|
||||
let exponent = rng.random_unsafe(BigInt[bits])
|
||||
|
||||
bench("EC ScalarMul " & $bits & "-bit " & $EC.G & " (unsafe min Hamming Weight recoding)", EC, iters):
|
||||
r = P
|
||||
r.unsafe_ECmul_minHammingWeight(exponent)
|
||||
r.scalarMul_minHammingWeight_vartime(exponent)
|
||||
|
||||
proc scalarMulUnsafeWNAFBench*(EC: typedesc, bits, window: static int, iters: int) =
|
||||
var r {.noInit.}: EC
|
||||
var P = rng.random_unsafe(EC)
|
||||
P.clearCofactor()
|
||||
|
||||
let exponent = rng.random_unsafe(BigInt[bits])
|
||||
|
||||
bench("EC ScalarMul " & $bits & "-bit " & $EC.G & " (unsafe wNAF-" & $window & ")", EC, iters):
|
||||
r = P
|
||||
r.scalarMul_minHammingWeight_windowed_vartime(exponent, window)
|
||||
|
||||
proc multiAddBench*(EC: typedesc, numPoints: int, useBatching: bool, iters: int) =
|
||||
var points = newSeq[ECP_ShortW_Aff[EC.F, EC.G]](numPoints)
|
||||
@ -165,9 +218,61 @@ proc multiAddBench*(EC: typedesc, numPoints: int, useBatching: bool, iters: int)
|
||||
|
||||
if useBatching:
|
||||
bench("EC Multi Add batched " & $EC.G & " (" & $numPoints & " points)", EC, iters):
|
||||
r.sum_batch_vartime(points)
|
||||
r.sum_reduce_vartime(points)
|
||||
else:
|
||||
bench("EC Multi Mixed-Add unbatched " & $EC.G & " (" & $numPoints & " points)", EC, iters):
|
||||
r.setInf()
|
||||
for i in 0 ..< numPoints:
|
||||
r += points[i]
|
||||
|
||||
|
||||
proc msmBench*(EC: typedesc, numPoints: int, iters: int) =
|
||||
const bits = EC.F.C.getCurveOrderBitwidth()
|
||||
var points = newSeq[ECP_ShortW_Aff[EC.F, EC.G]](numPoints)
|
||||
var scalars = newSeq[BigInt[bits]](numPoints)
|
||||
|
||||
for i in 0 ..< numPoints:
|
||||
var tmp = rng.random_unsafe(EC)
|
||||
tmp.clearCofactor()
|
||||
points[i].affine(tmp)
|
||||
scalars[i] = rng.random_unsafe(BigInt[bits])
|
||||
|
||||
var r{.noInit.}: EC
|
||||
var startNaive, stopNaive, startMSMbaseline, stopMSMbaseline, startMSMopt, stopMSMopt: MonoTime
|
||||
|
||||
if numPoints <= 100000:
|
||||
bench("EC scalar muls " & align($numPoints, 7) & " (scalars " & $bits & "-bit, points) pairs ", EC, iters):
|
||||
startNaive = getMonotime()
|
||||
var tmp: EC
|
||||
r.setInf()
|
||||
for i in 0 ..< points.len:
|
||||
tmp.fromAffine(points[i])
|
||||
tmp.scalarMul(scalars[i])
|
||||
r += tmp
|
||||
stopNaive = getMonotime()
|
||||
|
||||
block:
|
||||
bench("EC multi-scalar-mul baseline " & align($numPoints, 7) & " (scalars " & $bits & "-bit, points) pairs ", EC, iters):
|
||||
startMSMbaseline = getMonotime()
|
||||
r.multiScalarMul_reference_vartime(scalars, points)
|
||||
stopMSMbaseline = getMonotime()
|
||||
|
||||
block:
|
||||
bench("EC multi-scalar-mul optimized " & align($numPoints, 7) & " (scalars " & $bits & "-bit, points) pairs ", EC, iters):
|
||||
startMSMopt = getMonotime()
|
||||
r.multiScalarMul_vartime(scalars, points)
|
||||
stopMSMopt = getMonotime()
|
||||
|
||||
let perfNaive = inNanoseconds((stopNaive-startNaive) div iters)
|
||||
let perfMSMbaseline = inNanoseconds((stopMSMbaseline-startMSMbaseline) div iters)
|
||||
let perfMSMopt = inNanoseconds((stopMSMopt-startMSMopt) div iters)
|
||||
|
||||
if numPoints <= 100000:
|
||||
let speedupBaseline = float(perfNaive) / float(perfMSMbaseline)
|
||||
echo &"Speedup ratio baseline over naive linear combination: {speedupBaseline:>6.3f}x"
|
||||
|
||||
let speedupOpt = float(perfNaive) / float(perfMSMopt)
|
||||
echo &"Speedup ratio optimized over naive linear combination: {speedupOpt:>6.3f}x"
|
||||
|
||||
let speedupOptBaseline = float(perfMSMbaseline) / float(perfMSMopt)
|
||||
echo &"Speedup ratio optimized over baseline linear combination: {speedupOptBaseline:>6.3f}x"
|
||||
|
||||
@ -160,6 +160,13 @@ proc invBench*(T: typedesc, iters: int) =
|
||||
bench("Inversion (constant-time)", T, iters):
|
||||
r.inv(x)
|
||||
|
||||
proc invVartimeBench*(T: typedesc, iters: int) =
|
||||
var r: T
|
||||
let x = rng.random_unsafe(T)
|
||||
preventOptimAway(r)
|
||||
bench("Inversion (variable-time)", T, iters):
|
||||
r.inv_vartime(x)
|
||||
|
||||
proc isSquareBench*(T: typedesc, iters: int) =
|
||||
let x = rng.random_unsafe(T)
|
||||
bench("isSquare (constant-time)", T, iters):
|
||||
|
||||
@ -61,6 +61,7 @@ proc main() =
|
||||
toFieldBench(Fp[curve], Iters)
|
||||
smallSeparator()
|
||||
invBench(Fp[curve], ExponentIters)
|
||||
invVartimeBench(Fp[curve], ExponentIters)
|
||||
isSquareBench(Fp[curve], ExponentIters)
|
||||
sqrtBench(Fp[curve], ExponentIters)
|
||||
sqrtRatioBench(Fp[curve], ExponentIters)
|
||||
|
||||
@ -40,6 +40,7 @@ proc main() =
|
||||
mulBench(Fp12[curve], Iters)
|
||||
sqrBench(Fp12[curve], Iters)
|
||||
invBench(Fp12[curve], InvIters)
|
||||
invVartimeBench(Fp12[curve], InvIters)
|
||||
separator()
|
||||
|
||||
main()
|
||||
|
||||
@ -48,6 +48,7 @@ proc main() =
|
||||
rdc2xBench(Fp2[curve], Iters)
|
||||
smallSeparator()
|
||||
invBench(Fp2[curve], InvIters)
|
||||
invVartimeBench(Fp2[curve], InvIters)
|
||||
isSquareBench(Fp2[curve], InvIters)
|
||||
sqrtBench(Fp2[curve], InvIters)
|
||||
separator()
|
||||
|
||||
@ -48,6 +48,7 @@ proc main() =
|
||||
rdc2xBench(Fp4[curve], Iters)
|
||||
smallSeparator()
|
||||
invBench(Fp4[curve], InvIters)
|
||||
invVartimeBench(Fp4[curve], InvIters)
|
||||
separator()
|
||||
|
||||
main()
|
||||
|
||||
@ -46,6 +46,7 @@ proc main() =
|
||||
rdc2xBench(Fp6[curve], Iters)
|
||||
smallSeparator()
|
||||
invBench(Fp6[curve], InvIters)
|
||||
invVartimeBench(Fp6[curve], InvIters)
|
||||
separator()
|
||||
|
||||
main()
|
||||
|
||||
@ -184,7 +184,7 @@ proc millerLoopBLS12Bench*(C: static Curve, iters: int) =
|
||||
|
||||
var f: Fp12[C]
|
||||
bench("Miller Loop BLS12", C, iters):
|
||||
f.millerLoopGenericBLS12(P, Q)
|
||||
f.millerLoopGenericBLS12(Q, P)
|
||||
|
||||
proc millerLoopBNBench*(C: static Curve, iters: int) =
|
||||
let
|
||||
@ -193,7 +193,7 @@ proc millerLoopBNBench*(C: static Curve, iters: int) =
|
||||
|
||||
var f: Fp12[C]
|
||||
bench("Miller Loop BN", C, iters):
|
||||
f.millerLoopGenericBN(P, Q)
|
||||
f.millerLoopGenericBN(Q, P)
|
||||
|
||||
proc finalExpEasyBench*(C: static Curve, iters: int) =
|
||||
var r = rng.random_unsafe(Fp12[C])
|
||||
|
||||
@ -172,7 +172,7 @@ proc millerLoopBLS12Bench*(C: static Curve, iters: int) =
|
||||
|
||||
var f: Fp12[C]
|
||||
bench("Miller Loop BLS12", C, iters):
|
||||
f.millerLoopGenericBLS12(P, Q)
|
||||
f.millerLoopGenericBLS12(Q, P)
|
||||
|
||||
proc millerLoopBNBench*(C: static Curve, iters: int) =
|
||||
let
|
||||
@ -181,7 +181,7 @@ proc millerLoopBNBench*(C: static Curve, iters: int) =
|
||||
|
||||
var f: Fp12[C]
|
||||
bench("Miller Loop BN", C, iters):
|
||||
f.millerLoopGenericBN(P, Q)
|
||||
f.millerLoopGenericBN(Q, P)
|
||||
|
||||
proc finalExpBLS12Bench*(C: static Curve, iters: int) =
|
||||
var r = rng.random_unsafe(Fp12[C])
|
||||
|
||||
@ -99,15 +99,18 @@ const testDesc: seq[tuple[path: string, useGMP: bool]] = @[
|
||||
# ("tests/math/t_ec_shortw_prj_g1_add_double.nim", false),
|
||||
# ("tests/math/t_ec_shortw_prj_g1_mul_sanity.nim", false),
|
||||
# ("tests/math/t_ec_shortw_prj_g1_mul_distri.nim", false),
|
||||
# ("tests/math/t_ec_shortw_prj_g1_mul_vs_ref.nim", false),
|
||||
("tests/math/t_ec_shortw_prj_g1_mul_vs_ref.nim", false),
|
||||
# ("tests/math/t_ec_shortw_prj_g1_mixed_add.nim", false),
|
||||
|
||||
# ("tests/math/t_ec_shortw_jac_g1_add_double.nim", false),
|
||||
# ("tests/math/t_ec_shortw_jac_g1_mul_sanity.nim", false),
|
||||
# ("tests/math/t_ec_shortw_jac_g1_mul_distri.nim", false),
|
||||
# ("tests/math/t_ec_shortw_jac_g1_mul_vs_ref.nim", false),
|
||||
("tests/math/t_ec_shortw_jac_g1_mul_vs_ref.nim", false),
|
||||
# ("tests/math/t_ec_shortw_jac_g1_mixed_add.nim", false),
|
||||
|
||||
("tests/math/t_ec_shortw_jacext_g1_add_double.nim", false),
|
||||
("tests/math/t_ec_shortw_jacext_g1_mixed_add.nim", false),
|
||||
|
||||
# ("tests/math/t_ec_twedwards_prj_add_double", false),
|
||||
# ("tests/math/t_ec_twedwards_prj_mul_sanity", false),
|
||||
# ("tests/math/t_ec_twedwards_prj_mul_distri", false),
|
||||
@ -118,49 +121,49 @@ const testDesc: seq[tuple[path: string, useGMP: bool]] = @[
|
||||
# ("tests/math/t_ec_shortw_prj_g2_add_double_bn254_snarks.nim", false),
|
||||
# ("tests/math/t_ec_shortw_prj_g2_mul_sanity_bn254_snarks.nim", false),
|
||||
# ("tests/math/t_ec_shortw_prj_g2_mul_distri_bn254_snarks.nim", false),
|
||||
# ("tests/math/t_ec_shortw_prj_g2_mul_vs_ref_bn254_snarks.nim", false),
|
||||
("tests/math/t_ec_shortw_prj_g2_mul_vs_ref_bn254_snarks.nim", false),
|
||||
# ("tests/math/t_ec_shortw_prj_g2_mixed_add_bn254_snarks.nim", false),
|
||||
|
||||
# ("tests/math/t_ec_shortw_prj_g2_add_double_bls12_381.nim", false),
|
||||
# ("tests/math/t_ec_shortw_prj_g2_mul_sanity_bls12_381.nim", false),
|
||||
# ("tests/math/t_ec_shortw_prj_g2_mul_distri_bls12_381.nim", false),
|
||||
# ("tests/math/t_ec_shortw_prj_g2_mul_vs_ref_bls12_381.nim", false),
|
||||
("tests/math/t_ec_shortw_prj_g2_mul_vs_ref_bls12_381.nim", false),
|
||||
# ("tests/math/t_ec_shortw_prj_g2_mixed_add_bls12_381.nim", false),
|
||||
|
||||
# ("tests/math/t_ec_shortw_prj_g2_add_double_bls12_377.nim", false),
|
||||
# ("tests/math/t_ec_shortw_prj_g2_mul_sanity_bls12_377.nim", false),
|
||||
# ("tests/math/t_ec_shortw_prj_g2_mul_distri_bls12_377.nim", false),
|
||||
# ("tests/math/t_ec_shortw_prj_g2_mul_vs_ref_bls12_377.nim", false),
|
||||
("tests/math/t_ec_shortw_prj_g2_mul_vs_ref_bls12_377.nim", false),
|
||||
# ("tests/math/t_ec_shortw_prj_g2_mixed_add_bls12_377.nim", false),
|
||||
|
||||
# ("tests/math/t_ec_shortw_prj_g2_add_double_bw6_761.nim", false),
|
||||
# ("tests/math/t_ec_shortw_prj_g2_mul_sanity_bw6_761.nim", false),
|
||||
# ("tests/math/t_ec_shortw_prj_g2_mul_distri_bw6_761.nim", false),
|
||||
# ("tests/math/t_ec_shortw_prj_g2_mul_vs_ref_bw6_761.nim", false),
|
||||
("tests/math/t_ec_shortw_prj_g2_mul_vs_ref_bw6_761.nim", false),
|
||||
# ("tests/math/t_ec_shortw_prj_g2_mixed_add_bw6_761.nim", false),
|
||||
|
||||
# ("tests/math/t_ec_shortw_jac_g2_add_double_bn254_snarks.nim", false),
|
||||
# ("tests/math/t_ec_shortw_jac_g2_mul_sanity_bn254_snarks.nim", false),
|
||||
# ("tests/math/t_ec_shortw_jac_g2_mul_distri_bn254_snarks.nim", false),
|
||||
# ("tests/math/t_ec_shortw_jac_g2_mul_vs_ref_bn254_snarks.nim", false),
|
||||
("tests/math/t_ec_shortw_jac_g2_mul_vs_ref_bn254_snarks.nim", false),
|
||||
# ("tests/math/t_ec_shortw_jac_g2_mixed_add_bn254_snarks.nim", false),
|
||||
|
||||
# ("tests/math/t_ec_shortw_jac_g2_add_double_bls12_381.nim", false),
|
||||
# ("tests/math/t_ec_shortw_jac_g2_mul_sanity_bls12_381.nim", false),
|
||||
# ("tests/math/t_ec_shortw_jac_g2_mul_distri_bls12_381.nim", false),
|
||||
# ("tests/math/t_ec_shortw_jac_g2_mul_vs_ref_bls12_381.nim", false),
|
||||
("tests/math/t_ec_shortw_jac_g2_mul_vs_ref_bls12_381.nim", false),
|
||||
# ("tests/math/t_ec_shortw_jac_g2_mixed_add_bls12_381.nim", false),
|
||||
|
||||
# ("tests/math/t_ec_shortw_jac_g2_add_double_bls12_377.nim", false),
|
||||
# ("tests/math/t_ec_shortw_jac_g2_mul_sanity_bls12_377.nim", false),
|
||||
# ("tests/math/t_ec_shortw_jac_g2_mul_distri_bls12_377.nim", false),
|
||||
# ("tests/math/t_ec_shortw_jac_g2_mul_vs_ref_bls12_377.nim", false),
|
||||
("tests/math/t_ec_shortw_jac_g2_mul_vs_ref_bls12_377.nim", false),
|
||||
# ("tests/math/t_ec_shortw_jac_g2_mixed_add_bls12_377.nim", false),
|
||||
|
||||
# ("tests/math/t_ec_shortw_jac_g2_add_double_bw6_761.nim", false),
|
||||
# ("tests/math/t_ec_shortw_jac_g2_mul_sanity_bw6_761.nim", false),
|
||||
# ("tests/math/t_ec_shortw_jac_g2_mul_distri_bw6_761.nim", false),
|
||||
# ("tests/math/t_ec_shortw_jac_g2_mul_vs_ref_bw6_761.nim", false),
|
||||
("tests/math/t_ec_shortw_jac_g2_mul_vs_ref_bw6_761.nim", false),
|
||||
# ("tests/math/t_ec_shortw_jac_g2_mixed_add_bw6_761.nim", false),
|
||||
|
||||
# Elliptic curve arithmetic vs Sagemath
|
||||
@ -179,8 +182,11 @@ const testDesc: seq[tuple[path: string, useGMP: bool]] = @[
|
||||
|
||||
# Elliptic curve arithmetic - batch computation
|
||||
# ----------------------------------------------------------
|
||||
("tests/math/t_ec_shortw_prj_g1_batch_add.nim", false),
|
||||
("tests/math/t_ec_shortw_jac_g1_batch_add.nim", false),
|
||||
("tests/math/t_ec_shortw_prj_g1_sum_reduce.nim", false),
|
||||
("tests/math/t_ec_shortw_jac_g1_sum_reduce.nim", false),
|
||||
("tests/math/t_ec_shortw_jacext_g1_sum_reduce.nim", false),
|
||||
("tests/math/t_ec_shortw_prj_g1_msm.nim", false),
|
||||
("tests/math/t_ec_shortw_jac_g1_msm.nim", false),
|
||||
|
||||
# Subgroups and cofactors
|
||||
# ----------------------------------------------------------
|
||||
@ -210,6 +216,7 @@ const testDesc: seq[tuple[path: string, useGMP: bool]] = @[
|
||||
# ----------------------------------------------------------
|
||||
("tests/math/t_pairing_bn254_nogami_multi.nim", false),
|
||||
("tests/math/t_pairing_bn254_snarks_multi.nim", false),
|
||||
("tests/math/t_pairing_bls12_377_multi.nim", false),
|
||||
("tests/math/t_pairing_bls12_381_multi.nim", false),
|
||||
|
||||
# Prime order fields
|
||||
@ -258,8 +265,12 @@ const benchDesc = [
|
||||
"bench_fp6",
|
||||
"bench_fp12",
|
||||
"bench_ec_g1",
|
||||
"bench_ec_g1_scalar_mul",
|
||||
"bench_ec_g1_batch",
|
||||
"bench_ec_g1_msm_bn254_snarks",
|
||||
"bench_ec_g1_msm_bls12_381",
|
||||
"bench_ec_g2",
|
||||
"bench_ec_g2_scalar_mul",
|
||||
"bench_pairing_bls12_377",
|
||||
"bench_pairing_bls12_381",
|
||||
"bench_pairing_bn254_nogami",
|
||||
@ -826,7 +837,58 @@ task bench_ec_g1_batch_gcc_noasm, "Run benchmark on Elliptic Curve group 𝔾1 (
|
||||
runBench("bench_ec_g1_batch", "gcc", useAsm = false)
|
||||
|
||||
task bench_ec_g1_batch_clang_noasm, "Run benchmark on Elliptic Curve group 𝔾1 (batch ops) - Clang no Assembly":
|
||||
runBench("bench_ec_g1", "clang", useAsm = false)
|
||||
runBench("bench_ec_g1_batch", "clang", useAsm = false)
|
||||
|
||||
# Elliptic curve G1 - scalar multiplication
|
||||
# ------------------------------------------
|
||||
|
||||
task bench_ec_g1_scalar_mul, "Run benchmark on Elliptic Curve group 𝔾1 (Scalar Multiplication) - Default compiler":
|
||||
runBench("bench_ec_g1_scalar_mul")
|
||||
|
||||
task bench_ec_g1_scalar_mul_gcc, "Run benchmark on Elliptic Curve group 𝔾1 (Scalar Multiplication) - GCC":
|
||||
runBench("bench_ec_g1_scalar_mul", "gcc")
|
||||
|
||||
task bench_ec_g1_scalar_mul_clang, "Run benchmark on Elliptic Curve group 𝔾1 (Scalar Multiplication) - Clang":
|
||||
runBench("bench_ec_g1_scalar_mul", "clang")
|
||||
|
||||
task bench_ec_g1_scalar_mul_gcc_noasm, "Run benchmark on Elliptic Curve group 𝔾1 (Scalar Multiplication) - GCC no Assembly":
|
||||
runBench("bench_ec_g1_scalar_mul", "gcc", useAsm = false)
|
||||
|
||||
task bench_ec_g1_scalar_mul_clang_noasm, "Run benchmark on Elliptic Curve group 𝔾1 (Scalar Multiplication) - Clang no Assembly":
|
||||
runBench("bench_ec_g1_scalar_mul", "clang", useAsm = false)
|
||||
|
||||
# Elliptic curve G1 - Multi-scalar-mul
|
||||
# ------------------------------------------
|
||||
|
||||
task bench_ec_g1_msm_bn254_snarks, "Run benchmark on Elliptic Curve group 𝔾1 (Multi-Scalar-Mul) for BN254-Snarks - Default compiler":
|
||||
runBench("bench_ec_g1_msm_bn254_snarks")
|
||||
|
||||
task bench_ec_g1_msm_bn254_snarks_gcc, "Run benchmark on Elliptic Curve group 𝔾1 (Multi-Scalar-Mul) for BN254-Snarks - GCC":
|
||||
runBench("bench_ec_g1_msm_bn254_snarks", "gcc")
|
||||
|
||||
task bench_ec_g1_msm_bn254_snarks_clang, "Run benchmark on Elliptic Curve group 𝔾1 (Multi-Scalar-Mul) for BN254-Snarks - Clang":
|
||||
runBench("bench_ec_g1_msm_bn254_snarks", "clang")
|
||||
|
||||
task bench_ec_g1_msm_bn254_snarks_gcc_noasm, "Run benchmark on Elliptic Curve group 𝔾1 (Multi-Scalar-Mul) for BN254-Snarks - GCC no Assembly":
|
||||
runBench("bench_ec_g1_msm_bn254_snarks", "gcc", useAsm = false)
|
||||
|
||||
task bench_ec_g1_msm_bn254_snarks_clang_noasm, "Run benchmark on Elliptic Curve group 𝔾1 (Multi-Scalar-Mul) for BN254-Snarks - Clang no Assembly":
|
||||
runBench("bench_ec_g1_msm_bn254_snarks", "clang", useAsm = false)
|
||||
|
||||
task bench_ec_g1_msm_bls12_381, "Run benchmark on Elliptic Curve group 𝔾1 (Multi-Scalar-Mul) for BLS12-381 - Default compiler":
|
||||
runBench("bench_ec_g1_msm_bls12_381")
|
||||
|
||||
task bench_ec_g1_msm_bls12_381_gcc, "Run benchmark on Elliptic Curve group 𝔾1 (Multi-Scalar-Mul) for BLS12-381 - GCC":
|
||||
runBench("bench_ec_g1_msm_bls12_381", "gcc")
|
||||
|
||||
task bench_ec_g1_msm_bls12_381_clang, "Run benchmark on Elliptic Curve group 𝔾1 (Multi-Scalar-Mul) for BLS12-381 - Clang":
|
||||
runBench("bench_ec_g1_msm_bls12_381", "clang")
|
||||
|
||||
task bench_ec_g1_msm_bls12_381_gcc_noasm, "Run benchmark on Elliptic Curve group 𝔾1 (Multi-Scalar-Mul) for BLS12-381 - GCC no Assembly":
|
||||
runBench("bench_ec_g1_msm_bls12_381", "gcc", useAsm = false)
|
||||
|
||||
task bench_ec_g1_msm_bls12_381_clang_noasm, "Run benchmark on Elliptic Curve group 𝔾1 (Multi-Scalar-Mul) for BLS12-381 - Clang no Assembly":
|
||||
runBench("bench_ec_g1_msm_bls12_381", "clang", useAsm = false)
|
||||
|
||||
# Elliptic curve G2
|
||||
# ------------------------------------------
|
||||
@ -846,6 +908,24 @@ task bench_ec_g2_gcc_noasm, "Run benchmark on Elliptic Curve group 𝔾2 - GCC n
|
||||
task bench_ec_g2_clang_noasm, "Run benchmark on Elliptic Curve group 𝔾2 - Clang no Assembly":
|
||||
runBench("bench_ec_g2", "clang", useAsm = false)
|
||||
|
||||
# Elliptic curve G2 - scalar multiplication
|
||||
# ------------------------------------------
|
||||
|
||||
task bench_ec_g2_scalar_mul, "Run benchmark on Elliptic Curve group 𝔾2 (Multi-Scalar-Mul) - Default compiler":
|
||||
runBench("bench_ec_g2_scalar_mul")
|
||||
|
||||
task bench_ec_g2_scalar_mul_gcc, "Run benchmark on Elliptic Curve group 𝔾2 (Multi-Scalar-Mul) - GCC":
|
||||
runBench("bench_ec_g2_scalar_mul", "gcc")
|
||||
|
||||
task bench_ec_g2_scalar_mul_clang, "Run benchmark on Elliptic Curve group 𝔾2 (Multi-Scalar-Mul) - Clang":
|
||||
runBench("bench_ec_g2_scalar_mul", "clang")
|
||||
|
||||
task bench_ec_g2_scalar_mul_gcc_noasm, "Run benchmark on Elliptic Curve group 𝔾2 (Multi-Scalar-Mul) - GCC no Assembly":
|
||||
runBench("bench_ec_g2_scalar_mul", "gcc", useAsm = false)
|
||||
|
||||
task bench_ec_g2_scalar_mul_clang_noasm, "Run benchmark on Elliptic Curve group 𝔾2 (Multi-Scalar-Mul) - Clang no Assembly":
|
||||
runBench("bench_ec_g2_scalar_mul", "clang", useAsm = false)
|
||||
|
||||
# Pairings
|
||||
# ------------------------------------------
|
||||
|
||||
|
||||
@ -12,7 +12,7 @@ import
|
||||
./math/[arithmetic, extension_fields],
|
||||
./math/arithmetic/limbs_montgomery,
|
||||
./math/ec_shortweierstrass,
|
||||
./math/pairings/[pairings_bn, miller_loops, cyclotomic_subgroups],
|
||||
./math/pairings/[pairings_generic, miller_accumulators],
|
||||
./math/constants/zoo_subgroups,
|
||||
./math/io/[io_bigints, io_fields]
|
||||
|
||||
@ -86,8 +86,7 @@ func fromRawCoords(
|
||||
|
||||
return cttEVM_Success
|
||||
|
||||
func eth_evm_ecadd*(
|
||||
r: var array[64, byte], inputs: openarray[byte]): CttEVMStatus =
|
||||
func eth_evm_ecadd*(r: var array[64, byte], inputs: openarray[byte]): CttEVMStatus =
|
||||
## Elliptic Curve addition on BN254_Snarks
|
||||
## (also called alt_bn128 in Ethereum specs
|
||||
## and bn256 in Ethereum tests)
|
||||
@ -142,8 +141,7 @@ func eth_evm_ecadd*(
|
||||
aff.y, bigEndian
|
||||
)
|
||||
|
||||
func eth_evm_ecmul*(
|
||||
r: var array[64, byte], inputs: openarray[byte]): CttEVMStatus =
|
||||
func eth_evm_ecmul*(r: var array[64, byte], inputs: openarray[byte]): CttEVMStatus =
|
||||
## Elliptic Curve multiplication on BN254_Snarks
|
||||
## (also called alt_bn128 in Ethereum specs
|
||||
## and bn256 in Ethereum tests)
|
||||
@ -192,7 +190,6 @@ func eth_evm_ecmul*(
|
||||
# which is 31.5% faster than plain windowed scalar multiplication
|
||||
# at the low cost of a modular reduction.
|
||||
|
||||
var sprime{.noInit.}: typeof(smod.mres)
|
||||
# Due to mismatch between the BigInt[256] input and the rest being BigInt[254]
|
||||
# we use the low-level getMont instead of 'fromBig'
|
||||
getMont(smod.mres.limbs, s.limbs,
|
||||
@ -200,8 +197,7 @@ func eth_evm_ecmul*(
|
||||
Fr[BN254_Snarks].getR2modP().limbs,
|
||||
Fr[BN254_Snarks].getNegInvModWord(),
|
||||
Fr[BN254_Snarks].getSpareBits())
|
||||
sprime = smod.toBig()
|
||||
P.scalarMul(sprime)
|
||||
P.scalarMul(smod.toBig())
|
||||
else:
|
||||
P.scalarMul(s)
|
||||
|
||||
@ -323,10 +319,13 @@ func eth_evm_ecpairing*(
|
||||
r[r.len-1] = byte 1
|
||||
return
|
||||
|
||||
var gt0{.noInit.}, gt1{.noInit.}: Fp12[BN254_Snarks]
|
||||
var P{.noInit.}: ECP_ShortW_Aff[Fp[BN254_Snarks], G1]
|
||||
var Q{.noInit.}: ECP_ShortW_Aff[Fp2[BN254_Snarks], G2]
|
||||
|
||||
var acc {.noInit.}: MillerAccumulator[Fp[BN254_Snarks], Fp2[BN254_Snarks], Fp12[BN254_Snarks]]
|
||||
acc.init()
|
||||
var foundInfinity = false
|
||||
|
||||
for i in 0 ..< N:
|
||||
let pos = i*192
|
||||
|
||||
@ -348,15 +347,18 @@ func eth_evm_ecpairing*(
|
||||
if statusQ != cttEVM_Success:
|
||||
return statusQ
|
||||
|
||||
gt1.millerLoopGenericBN(P, Q)
|
||||
if i == 0:
|
||||
gt0 = gt1
|
||||
else:
|
||||
gt0 *= gt1
|
||||
let regular = acc.update(P, Q)
|
||||
if not regular:
|
||||
foundInfinity = true
|
||||
|
||||
gt0.finalExpEasy()
|
||||
gt0.finalExpHard_BN()
|
||||
if foundInfinity: # pairing with infinity returns 1, hence no need to compute the following
|
||||
r[r.len-1] = byte 1
|
||||
return
|
||||
|
||||
var gt {.noinit.}: Fp12[BN254_Snarks]
|
||||
acc.finish(gt)
|
||||
gt.finalExp()
|
||||
|
||||
zeroMem(r.addr, r.sizeof())
|
||||
if gt0.isOne().bool:
|
||||
if gt.isOne().bool:
|
||||
r[r.len-1] = byte 1
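Both the previous inline loop and the new MillerAccumulator rely on the multi-pairing identity prod_i e(Pi, Qi) = finalExp(prod_i millerLoop(Pi, Qi)): the Miller loop outputs are multiplied in Fp12 and a single final exponentiation is paid regardless of the number of pairs. The accumulator packages this reusably and adds the early exit above for pairs containing the point at infinity, whose pairing contribution is 1.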
|
||||
|
||||
@ -61,7 +61,7 @@ export BigInt
|
||||
# https://github.com/mratsim/constantine/issues/15
|
||||
|
||||
# No exceptions allowed
|
||||
{.push raises: [].}
|
||||
{.push raises: [], checks: off.}
|
||||
{.push inline.}
|
||||
|
||||
# Initialization
|
||||
@ -354,7 +354,26 @@ func setBit*[bits: static int](a: var BigInt[bits], index: int) =
|
||||
let shifted = One shl (index and SelectMask)
|
||||
slot[] = slot[] or shifted
|
||||
|
||||
# Multiplication by small cosntants
|
||||
func getWindowAt*(a: BigInt, bitIndex: int, windowSize: static int): SecretWord {.inline.} =
|
||||
## Access a window of `a` of size bitsize
|
||||
static: doAssert windowSize <= WordBitWidth
|
||||
|
||||
const SlotShift = log2_vartime(WordBitWidth.uint32)
|
||||
const WordMask = WordBitWidth - 1
|
||||
const WindowMask = SecretWord((1 shl windowSize) - 1)
|
||||
|
||||
let slot = bitIndex shr SlotShift
|
||||
let word = a.limbs[slot] # word in limbs
|
||||
let pos = bitIndex and WordMask # position in the word
|
||||
|
||||
# This is constant-time, the branch does not depend on secret data.
|
||||
if pos + windowSize > WordBitWidth and slot+1 < a.limbs.len:
|
||||
# Read next word as well
|
||||
return SecretWord((word shr pos) or (a.limbs[slot+1] shl (WordBitWidth-pos))) and WindowMask
|
||||
else:
|
||||
return SecretWord(word shr pos) and WindowMask
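As a concrete illustration of the cross-word branch (a standalone sketch with plain uint64 limbs instead of SecretWord; the values are made up for the example): with WordBitWidth = 64, bitIndex = 62 and windowSize = 5, the window straddles two limbs and is assembled from both words exactly as above.

let exLimbs = [0xC000_0000_0000_0000'u64, 5'u64]  # value = 5 * 2^64 + exLimbs[0]
let exPos = 62                                    # bitIndex mod WordBitWidth
echo ((exLimbs[0] shr exPos) or (exLimbs[1] shl (64 - exPos))) and 0b11111'u64
# prints 23 = 0b10111, i.e. bits 62..66 of the 128-bit value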
|
||||
|
||||
# Multiplication by small constants
|
||||
# ------------------------------------------------------------
|
||||
|
||||
func `*=`*(a: var BigInt, b: static int) =
|
||||
@ -491,22 +510,93 @@ func invmod*[bits](r: var BigInt[bits], a, M: BigInt[bits]) =
|
||||
one.setOne()
|
||||
r.invmod(a, one, M)
|
||||
|
||||
{.pop.} # inline
|
||||
|
||||
# ############################################################
|
||||
#
|
||||
# **Variable-Time**
|
||||
#
|
||||
# ############################################################
|
||||
|
||||
{.push inline.}
|
||||
|
||||
func invmod_vartime*[bits](
|
||||
r: var BigInt[bits],
|
||||
a, F, M: BigInt[bits]) {.tags: [VarTime].} =
|
||||
## Compute the modular inverse of ``a`` modulo M
|
||||
## r ≡ F.a⁻¹ (mod M)
|
||||
##
|
||||
## M MUST be odd, M does not need to be prime.
|
||||
## ``a`` MUST be less than M.
|
||||
r.limbs.invmod_vartime(a.limbs, F.limbs, M.limbs, bits)
|
||||
|
||||
func invmod_vartime*[bits](
|
||||
r: var BigInt[bits],
|
||||
a: BigInt[bits],
|
||||
F, M: static BigInt[bits]) {.tags: [VarTime].} =
|
||||
## Compute the modular inverse of ``a`` modulo M
|
||||
## r ≡ F.a⁻¹ (mod M)
|
||||
##
|
||||
## with F and M known at compile-time
|
||||
##
|
||||
## M MUST be odd, M does not need to be prime.
|
||||
## ``a`` MUST be less than M.
|
||||
r.limbs.invmod_vartime(a.limbs, F.limbs, M.limbs, bits)
|
||||
|
||||
func invmod_vartime*[bits](r: var BigInt[bits], a, M: BigInt[bits]) {.tags: [VarTime].} =
|
||||
## Compute the modular inverse of ``a`` modulo M
|
||||
##
|
||||
## The modulus ``M`` MUST be odd
|
||||
var one {.noInit.}: BigInt[bits]
|
||||
one.setOne()
|
||||
r.invmod_vartime(a, one, M)
|
||||
|
||||
{.pop.}
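A quick numeric illustration of the r = F * a^-1 (mod M) contract documented above, using toy integers rather than BigInt (values chosen only for the example):

doAssert (3 * 5) mod 7 == 1    # a = 3, M = 7: the inverse of 3 is 5
doAssert (2 * 5) mod 7 == 3    # with scaling factor F = 2, F * a^-1 mod M = 3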
|
||||
|
||||
# ############################################################
|
||||
#
|
||||
# Recoding
|
||||
#
|
||||
# ############################################################
|
||||
#
|
||||
# Litterature
|
||||
#
|
||||
# - Elliptic Curves in Cryptography
|
||||
# Blake, Seroussi, Smart, 1999
|
||||
#
|
||||
# - Efficient Arithmetic on Koblitz Curves
|
||||
# Jerome A. Solinas, 2000
|
||||
# https://decred.org/research/solinas2000.pdf
|
||||
#
|
||||
# - Optimal Left-to-Right Binary Signed-Digit Recoding
|
||||
# Joye, Yen, 2000
|
||||
# https://marcjoye.github.io/papers/JY00sd2r.pdf
|
||||
#
|
||||
# - Guide to Elliptic Curve Cryptography
|
||||
# Hankerson, Menezes, Vanstone, 2004
|
||||
#
|
||||
# - Signed Binary Representations Revisited
|
||||
# Katsuyuki Okeya, Katja Schmidt-Samoa, Christian Spahn, and Tsuyoshi Takagi, 2004
|
||||
# https://eprint.iacr.org/2004/195.pdf
|
||||
#
|
||||
# - Some Explicit Formulae of NAF and its Left-to-Right Analogue
|
||||
# Dong-Guk Han, Tetsuya Izu, and Tsuyoshi Takagi
|
||||
# https://eprint.iacr.org/2005/384.pdf
|
||||
#
|
||||
# See also on Booth encoding and Modified Booth Encoding (bit-pair recoding)
|
||||
# - https://www.ece.ucdavis.edu/~bbaas/281/notes/Handout.booth.pdf
|
||||
# - https://vulms.vu.edu.pk/Courses/CS501/Downloads/Booth%20and%20bit%20pair%20encoding.pdf
|
||||
# - https://vulms.vu.edu.pk/Courses/CS501/Downloads/Bit-Pair%20Recoding.pdf
|
||||
# - http://www.ecs.umass.edu/ece/koren/arith/simulator/ModBooth/
|
||||
|
||||
iterator recoding_l2r_vartime*(a: BigInt): int8 =
|
||||
iterator recoding_l2r_signed_vartime*[bits: static int](a: BigInt[bits]): int8 =
|
||||
## This is a minimum-Hamming-Weight left-to-right recoding.
|
||||
## It outputs signed {-1, 0, 1} bits from MSB to LSB
|
||||
## with minimal Hamming Weight to minimize operations
|
||||
## in Miller Loop and vartime scalar multiplications
|
||||
## in Miller Loops and vartime scalar multiplications
|
||||
##
|
||||
## Tagged vartime as it returns an int8
|
||||
## - Optimal Left-to-Right Binary Signed-Digit Recoding
|
||||
## Joye, Yen, 2000
|
||||
## https://marcjoye.github.io/papers/JY00sd2r.pdf
|
||||
## ⚠️ While the recoding is constant-time,
|
||||
## usage of this recoding is intended vartime
|
||||
|
||||
# As the caller's for-loop body is copy-pasted at each yield
|
||||
# we rework the algorithm so that we have a single yield point
|
||||
@ -514,12 +604,12 @@ iterator recoding_l2r_vartime*(a: BigInt): int8 =
|
||||
|
||||
var bi, bi1, ri, ri1, ri2: int8
|
||||
|
||||
var i = a.bits
|
||||
var i = bits
|
||||
while true:
|
||||
if i == a.bits: # We rely on compiler to hoist this branch out of the loop.
|
||||
if i == bits: # We rely on compiler to hoist this branch out of the loop.
|
||||
ri = 0
|
||||
ri1 = int8 a.bit(a.bits-1)
|
||||
ri2 = int8 a.bit(a.bits-2)
|
||||
ri1 = int8 a.bit(bits-1)
|
||||
ri2 = int8 a.bit(bits-2)
|
||||
bi = 0
|
||||
else:
|
||||
bi = bi1
|
||||
@ -531,12 +621,225 @@ iterator recoding_l2r_vartime*(a: BigInt): int8 =
|
||||
ri2 = int8 a.bit(i-2)
|
||||
|
||||
bi1 = (bi + ri1 + ri2) shr 1
|
||||
yield -2*bi + ri + bi1
|
||||
let r = -2*bi + ri + bi1
|
||||
yield r
|
||||
|
||||
if i > 0:
|
||||
if i != 0:
|
||||
i -= 1
|
||||
else:
|
||||
break
|
||||
|
||||
{.pop.} # inline
|
||||
func recode_l2r_signed_vartime*[bits: static int](
|
||||
recoded: var array[bits+1, SomeSignedInt], a: BigInt[bits]): int {.tags:[VarTime].} =
|
||||
## Recode left-to-right (MSB to LSB)
|
||||
## Output from most significant to least significant
|
||||
## Returns the number of bits used
|
||||
type I = SomeSignedInt
|
||||
var i = 0
|
||||
for bit in a.recoding_l2r_signed_vartime():
|
||||
recoded[i] = I(bit)
|
||||
inc i
|
||||
return i
|
||||
|
||||
iterator recoding_r2l_signed_vartime*[bits: static int](a: BigInt[bits]): int8 =
|
||||
## This is a minimum-Hamming-Weight left-to-right recoding.
|
||||
## It outputs signed {-1, 0, 1} bits from LSB to MSB
|
||||
## with minimal Hamming Weight to minimize operations
|
||||
## in Miller Loops and vartime scalar multiplications
|
||||
##
|
||||
## ⚠️ While the recoding is constant-time,
|
||||
## usage of this recoding is intended vartime
|
||||
##
|
||||
## Implementation uses 2-NAF
|
||||
# This is equivalent to `var r = (3a - a); if (r and 1) == 0: r shr 1`
|
||||
var ci, ci1, ri, ri1: int8
|
||||
|
||||
var i = 0
|
||||
while i <= bits:
|
||||
if i == 0: # We rely on compiler to hoist this branch out of the loop.
|
||||
ri = int8 a.bit(0)
|
||||
ri1 = int8 a.bit(1)
|
||||
ci = 0
|
||||
else:
|
||||
ci = ci1
|
||||
ri = ri1
|
||||
if i >= bits - 1:
|
||||
ri1 = 0
|
||||
else:
|
||||
ri1 = int8 a.bit(i+1)
|
||||
|
||||
ci1 = (ci + ri + ri1) shr 1
|
||||
let r = ci + ri - 2*ci1
|
||||
yield r
|
||||
|
||||
i += 1
|
||||
|
||||
func recode_r2l_signed_vartime*[bits: static int](
|
||||
recoded: var array[bits+1, SomeSignedInt], a: BigInt[bits]): int {.tags:[VarTime].} =
|
||||
## Recode right-to-left (LSB to MSB)
|
||||
## Output from least significant to most significant
|
||||
## Returns the number of bits used
|
||||
type I = SomeSignedInt
|
||||
var i = 0
|
||||
for bit in a.recoding_r2l_signed_vartime():
|
||||
recoded[i] = I(bit)
|
||||
inc i
|
||||
return i
|
||||
|
||||
iterator recoding_r2l_signed_window_vartime*(a: BigInt, windowLogSize: int): int {.tags:[VarTime].} =
|
||||
## This is a minimum-Hamming-Weight right-to-left windowed recoding with the following properties
|
||||
## 1. The most significant non-zero bit is positive.
|
||||
## 2. Among any w consecutive digits, at most one is non-zero.
|
||||
## 3. Each non-zero digit is odd and less than 2ʷ⁻¹ in absolute value.
|
||||
## 4. The length of the recoding is at most BigInt.bits + 1
|
||||
##
|
||||
## This returns the output one digit at a time and not one window at a time.
|
||||
##
|
||||
## ⚠️ not constant-time
|
||||
|
||||
let sMax = 1 shl (windowLogSize - 1)
|
||||
let uMax = sMax + sMax
|
||||
let mask = uMax - 1
|
||||
|
||||
var a {.noInit.} = a
|
||||
var zeroes = 0
|
||||
|
||||
while true:
|
||||
# 1. Count zeroes in LSB
|
||||
var ctz = 0
|
||||
for i in 0 ..< a.limbs.len:
|
||||
let ai = a.limbs[i]
|
||||
if ai.isZero().bool:
|
||||
ctz += WordBitWidth
|
||||
else:
|
||||
ctz += BaseType(ai).countTrailingZeroBits_vartime().int
|
||||
break
|
||||
|
||||
# 2. Remove them
|
||||
if ctz >= WordBitWidth:
|
||||
let wordOffset = int(ctz shr log2_vartime(uint32 WordBitWidth))
|
||||
for i in 0 ..< a.limbs.len-wordOffset:
|
||||
a.limbs[i] = a.limbs[i+wordOffset]
|
||||
for i in a.limbs.len-wordOffset ..< a.limbs.len:
|
||||
a.limbs[i] = Zero
|
||||
ctz = ctz and (WordBitWidth-1)
|
||||
zeroes += wordOffset * WordBitWidth
|
||||
if ctz > 0:
|
||||
a.shiftRight(ctz)
|
||||
zeroes += ctz
|
||||
|
||||
# 3. Yield - We merge yield points with a goto-based state machine
|
||||
# Nim copy-pastes the iterator for-loop body at yield points, we don't want to duplicate code
|
||||
# hence we need a single yield point
|
||||
|
||||
type State = enum
|
||||
StatePrepareYield
|
||||
StateYield
|
||||
StateExit
|
||||
|
||||
var yieldVal = 0
|
||||
var nextState = StatePrepareYield
|
||||
|
||||
var state {.goto.} = StatePrepareYield
|
||||
case state
|
||||
of StatePrepareYield:
|
||||
# 3.a Yield zeroes
|
||||
zeroes -= 1
|
||||
if zeroes >= 0:
|
||||
state = StateYield # goto StateYield
|
||||
|
||||
# 3.b Yield the least significant window
|
||||
var lsw = a.limbs[0].int and mask # signed is important
|
||||
a.shiftRight(windowLogSize)
|
||||
if (lsw and sMax) != 0: # MSB of window set
|
||||
a += One # Lend 2ʷ to next digit
|
||||
lsw -= uMax # push from [0, 2ʷ) to [-2ʷ⁻¹, 2ʷ⁻¹)
|
||||
|
||||
zeroes = windowLogSize-1
|
||||
yieldVal = lsw
|
||||
nextState = StateExit
|
||||
# Fall through StateYield
|
||||
|
||||
of StateYield:
|
||||
yield yieldVal
|
||||
case nextState
|
||||
of StatePrepareYield: state = StatePrepareYield
|
||||
of StateExit: state = StateExit
|
||||
else: unreachable()
|
||||
|
||||
of StateExit:
|
||||
if a.isZero().bool:
|
||||
break
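
# Illustrative standalone sketch, not part of the library: `toyWNAF` is a
# hypothetical name and this is the textbook right-to-left w-NAF, not the
# count-trailing-zeros state machine above, but its output satisfies the same
# four properties listed in the iterator's documentation.

proc toyWNAF(n, w: int): seq[int] =
  ## Right-to-left w-NAF of a non-negative integer, one digit per bit position
  var x = n
  while x != 0:
    var d = 0
    if (x and 1) == 1:
      d = x mod (1 shl w)       # low w bits
      if d >= (1 shl (w-1)):
        d -= 1 shl w            # map to the odd range (-2ʷ⁻¹, 2ʷ⁻¹)
      x -= d                    # the next w-1 digits are now guaranteed zero
    result.add d
    x = x shr 1

when isMainModule:
  let naf = toyWNAF(0b110110101, 4)
  var acc = 0
  for i in countdown(naf.high, 0):
    acc = 2*acc + naf[i]
  doAssert acc == 0b110110101                         # recombines to the input
  for d in naf:
    doAssert d == 0 or (d mod 2 != 0 and abs(d) < 8)  # non-zero digits are odd and < 2ʷ⁻¹
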
|
||||
|
||||
func recode_r2l_signed_window_vartime*[bits: static int](
|
||||
naf: var array[bits+1, SomeSignedInt], a: BigInt[bits], window: int): int {.tags:[VarTime].} =
|
||||
## Minimum Hamming-Weight windowed NAF recoding
|
||||
## Output from least significant to most significant
|
||||
## Returns the number of bits used
|
||||
##
|
||||
## The `naf` output is returned one digit at a time and not one window at a time
|
||||
type I = SomeSignedInt
|
||||
var i = 0
|
||||
for digit in a.recoding_r2l_signed_window_vartime(window):
|
||||
naf[i] = I(digit)
|
||||
i += 1
|
||||
return i
|
||||
|
||||
func signedWindowEncoding(digit: SecretWord, bitsize: static int): tuple[val: SecretWord, neg: SecretBool] {.inline.} =
|
||||
## Get the signed window encoding for `digit`
|
||||
##
|
||||
## This uses the fact that 999 = 1000 - 1
|
||||
## It replaces a run of binary 1s with 1 0 ... 0 -1
|
||||
## i.e. 0111 becomes 1 0 0 -1
|
||||
##
|
||||
## This looks at [bitᵢ₊ₙ..bitᵢ | bitᵢ₋₁]
|
||||
## and encodes [bitᵢ₊ₙ..bitᵢ]
|
||||
##
|
||||
## Notes:
|
||||
## - This is not a minimum weight encoding unlike NAF
|
||||
## - Due to constant-time requirement in scalar multiplication
|
||||
## or bucketing large window in multi-scalar-multiplication
|
||||
## minimum weight encoding might not lead to saving operations
|
||||
## - Unlike NAF and wNAF encoding, there is no carry to propagate
|
||||
## hence this is suitable for parallelization without encoding precomputation
|
||||
## and for GPUs
|
||||
## - Implementation uses Booth encoding
|
||||
result.neg = SecretBool(digit shr bitsize)
|
||||
|
||||
let negMask = -SecretWord(result.neg)
|
||||
const valMask = SecretWord((1 shl bitsize) - 1)
|
||||
|
||||
let encode = (digit + One) shr 1 # Lookup bitᵢ₋₁, flip series of 1's
|
||||
result.val = (encode + negMask) xor negMask # absolute value
|
||||
result.val = result.val and valMask
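
# Illustrative standalone sketch, not part of the library: plain uint64 and a
# hypothetical `toyBoothDigit` name. Each c-bit window plus the single bit
# below it yields a signed digit in [-2ᶜ⁻¹, 2ᶜ⁻¹]; since only local bits are
# read there is no carry chain, which is what makes this encoding suitable
# for parallel, precomputation-free recoding as noted above.

proc toyBoothDigit(scalar: uint64, bitIndex, c: int): int =
  ## Signed digit for the window [bitIndex, bitIndex+c)
  let w = int((scalar shr bitIndex) and ((1'u64 shl c) - 1))  # window bits
  let b =
    if bitIndex == 0: 0                                       # implicit 0 below the LSB
    else: int((scalar shr (bitIndex - 1)) and 1'u64)          # bit lent by the window below
  let neg = w shr (c - 1)                                     # MSB of the window
  result = w + b - (neg shl c)

when isMainModule:
  const c = 4
  let scalar = 0xDEAD'u64
  var acc = 0
  var bitIndex = 0
  while bitIndex <= 16:  # scalar fits in 16 bits; one extra window absorbs the last borrow
    acc += toyBoothDigit(scalar, bitIndex, c) shl bitIndex
    bitIndex += c
  doAssert acc == int(scalar)
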
|
||||
|
||||
func getSignedFullWindowAt*(a: BigInt, bitIndex: int, windowSize: static int): tuple[val: SecretWord, neg: SecretBool] {.inline.} =
|
||||
## Access a signed window of `a` of `windowSize` bits
|
||||
## Returns a signed encoding.
|
||||
##
|
||||
## The result is `windowSize` bits at a time.
|
||||
##
|
||||
## bitIndex != 0 and bitIndex mod windowSize == 0
|
||||
debug: doAssert (bitIndex != 0) and (bitIndex mod windowSize) == 0
|
||||
let digit = a.getWindowAt(bitIndex-1, windowSize+1) # get the bit on the right of the window for Booth encoding
|
||||
return digit.signedWindowEncoding(windowSize)
|
||||
|
||||
func getSignedBottomWindow*(a: BigInt, windowSize: static int): tuple[val: SecretWord, neg: SecretBool] {.inline.} =
|
||||
## Access the least significant signed window of `a` of `windowSize` bits
|
||||
## Returns a signed encoding.
|
||||
##
|
||||
## The result is `windowSize` bits at a time.
|
||||
let digit = a.getWindowAt(0, windowSize) shl 1 # Add implicit 0 on the right of LSB for Booth encoding
|
||||
return digit.signedWindowEncoding(windowSize)
|
||||
|
||||
func getSignedTopWindow*(a: BigInt, topIndex: int, excess: static int): tuple[val: SecretWord, neg: SecretBool] {.inline.} =
|
||||
## Access the most significant (top) signed window of `a` of `excess` bits
|
||||
## Returns a signed encoding.
|
||||
##
|
||||
## The result is `excess` bits at a time.
|
||||
##
|
||||
## bitIndex != 0 and bitIndex mod windowSize == 0
|
||||
let digit = a.getWindowAt(topIndex-1, excess+1) # Add implicit 0 on the left of MSB and get the bit on the right of the window
|
||||
return digit.signedWindowEncoding(excess+1)
|
||||
|
||||
{.pop.} # raises no exceptions
|
||||
|
||||
@ -550,9 +550,6 @@ template mulCheckSparse*(a: var Fp, b: Fp) =
|
||||
else:
|
||||
a *= b
|
||||
|
||||
{.pop.} # inline
|
||||
{.pop.} # raises no exceptions
|
||||
|
||||
# ############################################################
|
||||
#
|
||||
# Field arithmetic ergonomic macros
|
||||
@ -595,3 +592,27 @@ macro addchain*(fn: untyped): untyped =
|
||||
|
||||
result[^1] = body
|
||||
# echo result.toStrLit()
|
||||
|
||||
# ############################################################
|
||||
#
|
||||
# **Variable-Time**
|
||||
#
|
||||
# ############################################################
|
||||
|
||||
func inv_vartime*(r: var FF, a: FF) {.tags: [VarTime].} =
|
||||
## Variable-time Inversion modulo p
|
||||
##
|
||||
## The inverse of 0 is 0.
|
||||
## Incidentally this avoids extra check
|
||||
## to convert Jacobian and Projective coordinates
|
||||
## to affine for elliptic curve
|
||||
r.mres.invmod_vartime(a.mres, FF.getR2modP(), FF.fieldMod())
|
||||
|
||||
func inv_vartime*(a: var FF) {.tags: [VarTime].} =
|
||||
## Variable-time Inversion modulo p
|
||||
##
|
||||
## The inverse of 0 is 0.
|
||||
## Incidentally this avoids extra check
|
||||
## to convert Jacobian and Projective coordinates
|
||||
## to affine for elliptic curve
|
||||
a.inv_vartime(a)
|
||||
@ -7,11 +7,12 @@
|
||||
# at your option. This file may not be copied, modified, or distributed except according to those terms.
|
||||
|
||||
import
|
||||
../../platforms/[abstractions, signed_secret_words],
|
||||
../../platforms/abstractions,
|
||||
./limbs, ./limbs_unsaturated
|
||||
|
||||
# No exceptions allowed
|
||||
{.push raises: [].}
|
||||
{.push checks: off.}
|
||||
|
||||
# ############################################################
|
||||
#
|
||||
@ -342,18 +343,24 @@ func matVecMul_shr_k_mod_M[N, E: static int](
|
||||
d[N-1] = cd.lo
|
||||
e[N-1] = ce.lo
|
||||
|
||||
func matVecMul_shr_k[N, E: static int](
|
||||
template matVecMul_shr_k_impl(
|
||||
t: TransitionMatrix,
|
||||
f, g: var LimbsUnsaturated[N, E],
|
||||
f, g: var LimbsUnsaturated,
|
||||
Excess: static int,
|
||||
numLimbsLeft: int or static int,
|
||||
k: static int
|
||||
) =
|
||||
## Compute
|
||||
##
|
||||
## [u v] [f]
|
||||
## [q r].[g] / 2ᵏ
|
||||
##
|
||||
## Template so that it can be specialized
|
||||
## when iteration number is fixed and compiler can unroll, in constant-time case
|
||||
## or variable and the full buffer might not be used (vartime)
|
||||
|
||||
static: doAssert k == WordBitWidth - E
|
||||
const Max = SignedSecretWord(MaxWord shr E)
|
||||
static: doAssert k == WordBitWidth - Excess
|
||||
const Max = SignedSecretWord(MaxWord shr Excess)
|
||||
|
||||
let
|
||||
u = t.u
|
||||
@ -376,7 +383,7 @@ func matVecMul_shr_k[N, E: static int](
|
||||
cf.ashr(k)
|
||||
cg.ashr(k)
|
||||
|
||||
for i in 1 ..< N:
|
||||
for i in 1 ..< numLimbsLeft:
|
||||
cf.ssumprodAccNoCarry(u, f[i], v, g[i])
|
||||
cg.ssumprodAccNoCarry(q, f[i], r, g[i])
|
||||
f[i-1] = cf.lo and Max
|
||||
@ -384,8 +391,11 @@ func matVecMul_shr_k[N, E: static int](
|
||||
cf.ashr(k)
|
||||
cg.ashr(k)
|
||||
|
||||
f[N-1] = cf.lo
|
||||
g[N-1] = cg.lo
|
||||
f[numLimbsLeft-1] = cf.lo
|
||||
g[numLimbsLeft-1] = cg.lo
|
||||
|
||||
func matVecMul_shr_k[N, E: static int](t: TransitionMatrix, f, g: var LimbsUnsaturated[N, E], k: static int) =
|
||||
matVecMul_shr_k_impl(t, f, g, E, N, k)
|
||||
|
||||
func invmodImpl[N, E](
|
||||
a: var LimbsUnsaturated[N, E],
|
||||
@ -666,3 +676,217 @@ func legendre*(a: Limbs, M: static Limbs, bits: static int): SecretWord =
|
||||
a2.fromPackedRepr(a)
|
||||
|
||||
legendreImpl(a2, m2, k, bits)
|
||||
|
||||
|
||||
# ############################################################
|
||||
#
|
||||
# Variable-time optimizations
|
||||
#
|
||||
# ############################################################
|
||||
|
||||
const NegInvMod256 = [
|
||||
# Stores tab[i div 2] = -i⁻¹ (mod 256), with i odd
|
||||
# See "invModBitwidth" on "Dumas iterations"
|
||||
# ax ≡ 1 (mod 2ᵏ) <=> ax(2 - ax) ≡ 1 (mod 2^(2k))
|
||||
# a⁻¹ (mod 256) = a(2-a²)
|
||||
-1, -235, -141, -183, -57, -227, -133, -239,
|
||||
-241, -91, -253, -167, -41, -83, -245, -223,
|
||||
-225, -203, -109, -151, -25, -195, -101, -207,
|
||||
-209, -59, -221, -135, -9, -51, -213, -191,
|
||||
-193, -171, -77, -119, -249, -163, -69, -175,
|
||||
-177, -27, -189, -103, -233, -19, -181, -159,
|
||||
-161, -139, -45, -87, -217, -131, -37, -143,
|
||||
-145, -251, -157, -71, -201, -243, -149, -127,
|
||||
-129, -107, -13, -55, -185, -99, -5, -111,
|
||||
-113, -219, -125, -39, -169, -211, -117, -95,
|
||||
-97, -75, -237, -23, -153, -67, -229, -79,
|
||||
-81, -187, -93, -7, -137, -179, -85, -63,
|
||||
-65, -43, -205, -247, -121, -35, -197, -47,
|
||||
-49, -155, -61, -231, -105, -147, -53, -31,
|
||||
-33, -11, -173, -215, -89, -3, -165, -15,
|
||||
-17, -123, -29, -199, -73, -115, -21, -255]
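
# Standalone sanity check, not part of the library: `dumasInv256` is a
# hypothetical name illustrating the "Dumas iterations" referenced in the
# comment above. For odd a, a·a ≡ 1 (mod 8), and each Newton step
# x ← x·(2 - a·x) doubles the number of correct low bits, so two steps
# already give a⁻¹ (mod 256). Unsigned wraparound is harmless here since we
# only care about the result modulo 256.

proc dumasInv256(a: uint): uint =
  var x = a          # a⁻¹ (mod 8) since a·a ≡ 1 (mod 8) for odd a
  x = x * (2 - a*x)  # a⁻¹ (mod 64)
  x = x * (2 - a*x)  # a⁻¹ (mod 4096), in particular (mod 256)
  return x and 0xFF

when isMainModule:
  for i in countup(1, 255, 2):
    let a = uint(i)
    doAssert ((a * dumasInv256(a)) and 0xFF) == 1
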
|
||||
|
||||
func batchedDivsteps_vartime(
|
||||
t: var TransitionMatrix,
|
||||
eta: SignedSecretWord,
|
||||
f0, g0: SecretWord,
|
||||
k: static int
|
||||
): SignedSecretWord {.tags:[Vartime].} =
|
||||
## Bernstein-Yang eta (-delta) batch of divsteps
|
||||
## **Variable-Time**
|
||||
##
|
||||
## Output:
|
||||
## - return eta for the next batch of divsteps
|
||||
## - mutate t, the transition matrix to apply `numIters` divsteps at once
|
||||
## t is scaled by 2ᵏ
|
||||
##
|
||||
## Input:
|
||||
## - f0, bottom limb of f
|
||||
## - g0, bottom limb of g
|
||||
## - k, the maximum batch size, transition matrix is scaled by 2ᵏ
|
||||
|
||||
template swapNeg(a, b) =
|
||||
var tmp = -a
|
||||
a = b
|
||||
b = tmp
|
||||
|
||||
var
|
||||
u = One
|
||||
v = Zero
|
||||
q = Zero
|
||||
r = One
|
||||
f = f0
|
||||
g = g0
|
||||
|
||||
eta = cast[SignedBaseType](eta)
|
||||
bitsLeft = cast[SignedBaseType](k)
|
||||
|
||||
while true:
|
||||
# Count zeros up to bitsLeft and process a batch of divsteps up to that number
|
||||
let zeros = (g.BaseType or (1.BaseType shl bitsLeft)).countTrailingZeroBits_vartime()
|
||||
g = g shr zeros
|
||||
u = u shl zeros
|
||||
v = v shl zeros
|
||||
eta -= cast[SignedBaseType](zeros)
|
||||
bitsLeft -= cast[SignedBaseType](zeros)
|
||||
|
||||
if bitsLeft == 0:
|
||||
break
|
||||
|
||||
# Now process the 1's.
|
||||
if eta < 0:
|
||||
eta = -eta
|
||||
swapNeg(f, g)
|
||||
swapNeg(u, q)
|
||||
swapNeg(v, r)
|
||||
|
||||
# We process up to 6 1's at once
|
||||
const mask6 = SecretWord((1 shl 6) - 1)
|
||||
let limit = min(eta+1, bitsLeft)
|
||||
let maskLimit = (MaxWord shr (WordBitWidth - limit)) and mask6
|
||||
# Find the multiple of f to add to cancel the bottom min(limit, 6) bits of g
|
||||
let w = (g * SecretWord NegInvMod256[int((f and mask6) shr 1)]) and maskLimit
|
||||
|
||||
# Next iteration will have at least 6 0's to process at once
|
||||
g += f*w
|
||||
q += u*w
|
||||
r += v*w
|
||||
|
||||
t.u = SignedSecretWord u
|
||||
t.v = SignedSecretWord v
|
||||
t.q = SignedSecretWord q
|
||||
t.r = SignedSecretWord r
|
||||
return SignedSecretWord(eta)
|
||||
|
||||
func matVecMul_shr_k_partial(t: TransitionMatrix, f, g: var LimbsUnsaturated, len: int, k: static int) =
|
||||
## Matrix-Vector multiplication with top part of f and g being zeros
|
||||
matVecMul_shr_k_impl(t, f, g, LimbsUnsaturated.Excess, len, k)
|
||||
|
||||
func isZero_vartime(a: LimbsUnsaturated, limbsLeft: int): bool {.tags:[VarTime].} =
|
||||
for i in 0 ..< limbsLeft:
|
||||
if a[i].int != 0:
|
||||
return false
|
||||
return true
|
||||
|
||||
func discardUnusedLimb_vartime[N, E: static int](limbsLeft: var int, f, g: var LimbsUnsaturated[N, E]) {.tags:[VarTime].} =
|
||||
## If f and g both don't use their last limb, it will propagate the sign down to the previous one
|
||||
if limbsLeft == 1:
|
||||
return
|
||||
|
||||
let fn = f[limbsLeft-1]
|
||||
let gn = g[limbsLeft-1]
|
||||
var mask = SignedSecretWord(0)
|
||||
mask = mask or (fn xor fn.isNegMask()) # 0 if last limb has nothing left but its sign
|
||||
mask = mask or (gn xor gn.isNegMask()) # 0 if last limb has nothing left but its sign
|
||||
if cast[SignedBaseType](mask) == 0:
|
||||
f[limbsLeft-2] = f[limbsLeft-2] or fn.lshl(WordBitWidth-E) # if only sign is left, the last limb is 11..11 if negative
|
||||
g[limbsLeft-2] = g[limbsLeft-2] or gn.lshl(WordBitWidth-E) # or 00..00 if positive
|
||||
limbsLeft -= 1
|
||||
|
||||
func invmodImpl_vartime[N, E: static int](
|
||||
a: var LimbsUnsaturated[N, E],
|
||||
F, M: LimbsUnsaturated[N, E],
|
||||
invMod2powK: SecretWord,
|
||||
k, bits: static int) {.tags:[VarTime].} =
|
||||
## **Variable-time** Modular inversion using Bernstein-Yang algorithm
|
||||
## r ≡ F.a⁻¹ (mod M)
|
||||
|
||||
# eta = -delta
|
||||
var eta = cast[SignedSecretWord](-1)
|
||||
var d{.noInit.}, e{.noInit.}: LimbsUnsaturated[N, E]
|
||||
var f{.noInit.}, g{.noInit.}: LimbsUnsaturated[N, E]
|
||||
|
||||
d.setZero()
|
||||
e = F
|
||||
|
||||
f = M
|
||||
g = a
|
||||
|
||||
var limbsLeft = N
|
||||
|
||||
while true:
|
||||
var t{.noInit.}: TransitionMatrix
|
||||
# Compute transition matrix and next eta
|
||||
eta = t.batchedDivsteps_vartime(eta, SecretWord f[0], SecretWord g[0], k)
|
||||
# Apply the transition matrix
|
||||
# [u v] [d]
|
||||
# [q r]/2ᵏ.[e] mod M
|
||||
t.matVecMul_shr_k_mod_M(d, e, k, M, invMod2powK)
|
||||
# [u v] [f]
|
||||
# [q r]/2ᵏ.[g]
|
||||
t.matVecMul_shr_k_partial(f, g, limbsLeft, k)
|
||||
if g.isZero_vartime(limbsLeft):
|
||||
break
|
||||
limbsLeft.discardUnusedLimb_vartime(f, g)
|
||||
|
||||
d.canonicalize(signMask = f[limbsLeft-1].isNegMask(), M)
|
||||
a = d
|
||||
|
||||
func invmod_vartime*(
|
||||
r: var Limbs, a: Limbs,
|
||||
F, M: Limbs, bits: static int) {.tags:[VarTime].} =
|
||||
## Compute the scaled modular inverse of ``a`` modulo M
|
||||
## r ≡ F.a⁻¹ (mod M)
|
||||
##
|
||||
## M MUST be odd, M does not need to be prime.
|
||||
## ``a`` MUST be less than M.
|
||||
const Excess = 2
|
||||
const k = WordBitWidth - Excess
|
||||
const NumUnsatWords = (bits + k - 1) div k
|
||||
|
||||
# Convert values to unsaturated repr
|
||||
var m2 {.noInit.}: LimbsUnsaturated[NumUnsatWords, Excess]
|
||||
var factor {.noInit.}: LimbsUnsaturated[NumUnsatWords, Excess]
|
||||
m2.fromPackedRepr(M)
|
||||
factor.fromPackedRepr(F)
|
||||
let m0invK = SecretWord invMod2powK(BaseType M[0], k)
|
||||
|
||||
var a2 {.noInit.}: LimbsUnsaturated[NumUnsatWords, Excess]
|
||||
a2.fromPackedRepr(a)
|
||||
a2.invmodImpl_vartime(factor, m2, m0invK, k, bits)
|
||||
r.fromUnsatRepr(a2)
|
||||
|
||||
func invmod_vartime*(
|
||||
r: var Limbs, a: Limbs,
|
||||
F, M: static Limbs, bits: static int) {.tags:[VarTime].} =
|
||||
## Compute the scaled modular inverse of ``a`` modulo M
|
||||
## r ≡ F.a⁻¹ (mod M) (compile-time factor and modulus overload)
|
||||
##
|
||||
## with F and M known at compile-time
|
||||
##
|
||||
## M MUST be odd, M does not need to be prime.
|
||||
## ``a`` MUST be less than M.
|
||||
|
||||
const Excess = 2
|
||||
const k = WordBitWidth - Excess
|
||||
const NumUnsatWords = (bits + k - 1) div k
|
||||
|
||||
# Convert values to unsaturated repr
|
||||
const m2 = LimbsUnsaturated[NumUnsatWords, Excess].fromPackedRepr(M)
|
||||
const factor = LimbsUnsaturated[NumUnsatWords, Excess].fromPackedRepr(F)
|
||||
const m0invK = SecretWord invMod2powK(BaseType M[0], k)
|
||||
|
||||
var a2 {.noInit.}: LimbsUnsaturated[NumUnsatWords, Excess]
|
||||
a2.fromPackedRepr(a)
|
||||
a2.invmodImpl_vartime(factor, m2, m0invK, k, bits)
|
||||
r.fromUnsatRepr(a2)
|
||||
@ -6,7 +6,7 @@
|
||||
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
|
||||
# at your option. This file may not be copied, modified, or distributed except according to those terms.
|
||||
|
||||
import ../../platforms/[abstractions, signed_secret_words]
|
||||
import ../../platforms/abstractions
|
||||
|
||||
type
|
||||
LimbsUnsaturated*[N, Excess: static int] = object
|
||||
|
||||
@ -198,6 +198,9 @@ declareCurves:
|
||||
modulus: "0xfffffffffffffffffffffffffffffffffffffffffffffffffffffffefffffc2f"
|
||||
order: "0xfffffffffffffffffffffffffffffffebaaedce6af48a03bbfd25e8cd0364141"
|
||||
orderBitwidth: 256
|
||||
eq_form: ShortWeierstrass
|
||||
coef_a: 0
|
||||
coef_b: 7
|
||||
curve BLS12_377:
|
||||
# Zexe curve
|
||||
# (p41) https://eprint.iacr.org/2018/962.pdf
|
||||
|
||||
@ -13,8 +13,7 @@ import
|
||||
../extension_fields,
|
||||
../elliptic/[ec_shortweierstrass_affine, ec_shortweierstrass_projective],
|
||||
../pairings/[cyclotomic_subgroups, miller_loops],
|
||||
../isogenies/frobenius,
|
||||
../../platforms/allocs
|
||||
../isogenies/frobenius
|
||||
|
||||
# Slow generic implementation
|
||||
# ------------------------------------------------------------
|
||||
@ -22,8 +21,7 @@ import
|
||||
# The bit count must be exact for the Miller loop
|
||||
const BLS12_377_pairing_ate_param* = block:
|
||||
# BLS12 Miller loop is parametrized by u
|
||||
# +1 to bitlength so that we can mul by 3 for NAF encoding
|
||||
BigInt[64+1].fromHex"0x8508c00000000001"
|
||||
BigInt[64].fromHex"0x8508c00000000001"
|
||||
|
||||
const BLS12_377_pairing_ate_param_isNeg* = false
|
||||
|
||||
@ -65,7 +63,7 @@ func millerLoopAddchain*(
|
||||
Qs: ptr UncheckedArray[ECP_ShortW_Aff[Fp2[BLS12_377], G2]],
|
||||
Ps: ptr UncheckedArray[ECP_ShortW_Aff[Fp[BLS12_377], G1]],
|
||||
N: int
|
||||
) =
|
||||
) {.noInline.} =
|
||||
## Miller Loop for BLS12-377 curve
|
||||
## Computes f{u,Q}(P) with u the BLS curve parameter
|
||||
|
||||
|
||||
@ -105,7 +105,7 @@ func clearCofactorReference*(P: var ECP_ShortW_Prj[Fp2[BLS12_377], G2]) {.inline
|
||||
# BLS12 G1
|
||||
# ------------------------------------------------------------
|
||||
|
||||
func clearCofactorFast*(P: var ECP_ShortW_Prj[Fp[BLS12_377], G1]) =
|
||||
func clearCofactorFast*(P: var ECP_ShortW[Fp[BLS12_377], G1]) =
|
||||
## Clear the cofactor of BLS12_377 G1
|
||||
##
|
||||
## Wahby et al "Fast and simple constant-time hashing to the BLS12-381 elliptic curve", https://eprint.iacr.org/2019/403
|
||||
@ -144,7 +144,7 @@ func clearCofactorFast*(P: var ECP_ShortW_Prj[Fp[BLS12_377], G1]) =
|
||||
# with Psi (ψ) - untwist-Frobenius-Twist function
|
||||
# and x the curve BLS parameter
|
||||
|
||||
func clearCofactorFast*(P: var ECP_ShortW_Prj[Fp2[BLS12_377], G2]) =
|
||||
func clearCofactorFast*(P: var ECP_ShortW[Fp2[BLS12_377], G2]) =
|
||||
## Clear the cofactor of BLS12_377 G2
|
||||
## Optimized using endomorphisms
|
||||
## P -> [x²-x-1]P + [x-1] ψ(P) + ψ²([2]P)
|
||||
@ -172,7 +172,7 @@ func clearCofactorFast*(P: var ECP_ShortW_Prj[Fp2[BLS12_377], G2]) =
|
||||
#
|
||||
# ############################################################
|
||||
|
||||
func isInSubgroup*(P: ECP_ShortW_Prj[Fp[BLS12_377], G1]): SecretBool =
|
||||
func isInSubgroup*(P: ECP_ShortW[Fp[BLS12_377], G1]): SecretBool =
|
||||
## Returns true if P is in G1 subgroup, i.e. P is a point of order r.
|
||||
## A point may be on a curve but not on the prime order r subgroup.
|
||||
## Not checking subgroup exposes a protocol to small subgroup attacks.
|
||||
@ -182,7 +182,7 @@ func isInSubgroup*(P: ECP_ShortW_Prj[Fp[BLS12_377], G1]): SecretBool =
|
||||
# A note on group membership tests for G1, G2 and GT
|
||||
# on BLS pairing-friendly curves
|
||||
# P is in the G1 subgroup iff ϕ(P) == [-u²](P)
|
||||
var t0{.noInit.}, t1{.noInit.}: ECP_ShortW_Prj[Fp[BLS12_377], G1]
|
||||
var t0{.noInit.}, t1{.noInit.}: typeof(P)
|
||||
|
||||
# [-u²]P
|
||||
t0.pow_bls12_377_x(P)
|
||||
@ -195,7 +195,7 @@ func isInSubgroup*(P: ECP_ShortW_Prj[Fp[BLS12_377], G1]): SecretBool =
|
||||
|
||||
return t0 == t1
|
||||
|
||||
func isInSubgroup*(P: ECP_ShortW_Prj[Fp2[BLS12_377], G2]): SecretBool =
|
||||
func isInSubgroup*(P: ECP_ShortW[Fp2[BLS12_377], G2]): SecretBool =
|
||||
## Returns true if P is in G2 subgroup, i.e. P is a point of order r.
|
||||
## A point may be on a curve but not on the prime order r subgroup.
|
||||
## Not checking subgroup exposes a protocol to small subgroup attacks.
|
||||
@ -205,8 +205,29 @@ func isInSubgroup*(P: ECP_ShortW_Prj[Fp2[BLS12_377], G2]): SecretBool =
|
||||
# A note on group membership tests for G1, G2 and GT
|
||||
# on BLS pairing-friendly curves
|
||||
# P is in the G2 subgroup iff ψ(P) == [u](P)
|
||||
var t0{.noInit.}, t1{.noInit.}: ECP_ShortW_Prj[Fp2[BLS12_377], G2]
|
||||
var t0{.noInit.}, t1{.noInit.}: typeof(P)
|
||||
t0.pow_bls12_377_x(P) # [u]P
|
||||
t1.frobenius_psi(P) # ψ(P)
|
||||
|
||||
return t0 == t1
|
||||
|
||||
func isInSubgroup*(P: ECP_ShortW_Aff[Fp[BLS12_377], G1]): SecretBool =
|
||||
## Returns true if P is in 𝔾1 subgroup, i.e. P is a point of order r.
|
||||
## A point may be on a curve but not on the prime order r subgroup.
|
||||
## Not checking subgroup exposes a protocol to small subgroup attacks.
|
||||
##
|
||||
## Warning ⚠: Assumes that P is on curve
|
||||
var t{.noInit.}: ECP_ShortW_Prj[Fp[BLS12_377], G1]
|
||||
t.fromAffine(P)
|
||||
return t.isInSubgroup()
|
||||
|
||||
|
||||
func isInSubgroup*(P: ECP_ShortW_Aff[Fp2[BLS12_377], G2]): SecretBool =
|
||||
## Returns true if P is in 𝔾2 subgroup, i.e. P is a point of order r.
|
||||
## A point may be on a curve but not on the prime order r subgroup.
|
||||
## Not checking subgroup exposes a protocol to small subgroup attacks.
|
||||
##
|
||||
## Warning ⚠: Assumes that P is on curve
|
||||
var t{.noInit.}: ECP_ShortW_Jac[Fp2[BLS12_377], G2]
|
||||
t.fromAffine(P)
|
||||
return t.isInSubgroup()
|
||||
@ -13,8 +13,7 @@ import
|
||||
../extension_fields,
|
||||
../elliptic/[ec_shortweierstrass_affine, ec_shortweierstrass_projective],
|
||||
../pairings/[cyclotomic_subgroups, miller_loops],
|
||||
../isogenies/frobenius,
|
||||
../../platforms/allocs
|
||||
../isogenies/frobenius
|
||||
|
||||
# Slow generic implementation
|
||||
# ------------------------------------------------------------
|
||||
@ -22,8 +21,7 @@ import
|
||||
# The bit count must be exact for the Miller loop
|
||||
const BLS12_381_pairing_ate_param* = block:
|
||||
# BLS12 Miller loop is parametrized by u
|
||||
# +2 to bitlength so that we can mul by 3 for NAF encoding
|
||||
BigInt[64+2].fromHex"0xd201000000010000"
|
||||
BigInt[64].fromHex"0xd201000000010000"
|
||||
|
||||
const BLS12_381_pairing_ate_param_isNeg* = true
|
||||
|
||||
@ -66,7 +64,7 @@ func millerLoopAddchain*(
|
||||
Qs: ptr UncheckedArray[ECP_ShortW_Aff[Fp2[BLS12_381], G2]],
|
||||
Ps: ptr UncheckedArray[ECP_ShortW_Aff[Fp[BLS12_381], G1]],
|
||||
N: int
|
||||
) =
|
||||
) {.noInline.} =
|
||||
## Generic Miller Loop for BLS12 curve
|
||||
## Computes f{u,Q}(P) with u the BLS curve parameter
|
||||
|
||||
|
||||
@ -166,7 +166,7 @@ func clearCofactorFast*(P: var ECP_ShortW[Fp2[BLS12_381], G2]) =
|
||||
#
|
||||
# ############################################################
|
||||
|
||||
func isInSubgroup*(P: ECP_ShortW_Jac[Fp[BLS12_381], G1] or ECP_ShortW_Prj[Fp[BLS12_381], G1]): SecretBool =
|
||||
func isInSubgroup*(P: ECP_ShortW[Fp[BLS12_381], G1]): SecretBool =
|
||||
## Returns true if P is in 𝔾1 subgroup, i.e. P is a point of order r.
|
||||
## A point may be on a curve but not on the prime order r subgroup.
|
||||
## Not checking subgroup exposes a protocol to small subgroup attacks.
|
||||
@ -189,7 +189,7 @@ func isInSubgroup*(P: ECP_ShortW_Jac[Fp[BLS12_381], G1] or ECP_ShortW_Prj[Fp[BLS
|
||||
|
||||
return t0 == t1
|
||||
|
||||
func isInSubgroup*(P: ECP_ShortW_Jac[Fp2[BLS12_381], G2] or ECP_ShortW_Prj[Fp2[BLS12_381], G2]): SecretBool =
|
||||
func isInSubgroup*(P: ECP_ShortW[Fp2[BLS12_381], G2]): SecretBool =
|
||||
## Returns true if P is in 𝔾2 subgroup, i.e. P is a point of order r.
|
||||
## A point may be on a curve but not on the prime order r subgroup.
|
||||
## Not checking subgroup exposes a protocol to small subgroup attacks.
|
||||
|
||||
@ -13,8 +13,7 @@ import
|
||||
../extension_fields,
|
||||
../elliptic/[ec_shortweierstrass_affine, ec_shortweierstrass_projective],
|
||||
../pairings/[cyclotomic_subgroups, miller_loops],
|
||||
../isogenies/frobenius,
|
||||
../../platforms/allocs
|
||||
../isogenies/frobenius
|
||||
|
||||
# Slow generic implementation
|
||||
# ------------------------------------------------------------
|
||||
@ -22,8 +21,7 @@ import
|
||||
# The bit count must be exact for the Miller loop
|
||||
const BN254_Nogami_pairing_ate_param* = block:
|
||||
# BN Miller loop is parametrized by 6u+2
|
||||
# +2 to bitlength so that we can mul by 3 for NAF encoding
|
||||
BigInt[65+2].fromHex"0x18300000000000004"
|
||||
BigInt[65].fromHex"0x18300000000000004"
|
||||
|
||||
const BN254_Nogami_pairing_ate_param_isNeg* = true
|
||||
|
||||
@ -56,16 +54,17 @@ func millerLoopAddchain*(
|
||||
|
||||
# Negative AteParam
|
||||
f.conj()
|
||||
T.neg()
|
||||
|
||||
# Ate pairing for BN curves needs adjustment after basic Miller loop
|
||||
f.millerCorrectionBN(T, Q, P, BN254_Nogami_pairing_ate_param_isNeg)
|
||||
f.millerCorrectionBN(T, Q, P)
|
||||
|
||||
func millerLoopAddchain*(
|
||||
f: var Fp12[BN254_Nogami],
|
||||
Qs: ptr UncheckedArray[ECP_ShortW_Aff[Fp2[BN254_Nogami], G2]],
|
||||
Ps: ptr UncheckedArray[ECP_ShortW_Aff[Fp[BN254_Nogami], G1]],
|
||||
N: int
|
||||
) =
|
||||
) {.noInline.} =
|
||||
## Miller Loop for BN254-Nogami curve
|
||||
## Computes f{6u+2,Q}(P) with u the BN curve parameter
|
||||
var Ts = allocStackArray(ECP_ShortW_Prj[Fp2[BN254_Nogami], G2], N)
|
||||
@ -78,9 +77,11 @@ func millerLoopAddchain*(
|
||||
|
||||
# Negative AteParam
|
||||
f.conj()
|
||||
for i in 0 ..< N:
|
||||
Ts[i].neg()
|
||||
|
||||
for i in 0 ..< N:
|
||||
f.millerCorrectionBN(Ts[i], Qs[i], Ps[i], BN254_Nogami_pairing_ate_param_isNeg)
|
||||
f.millerCorrectionBN(Ts[i], Qs[i], Ps[i])
|
||||
|
||||
func cycl_exp_by_curve_param*(
|
||||
r: var Fp12[BN254_Nogami], a: Fp12[BN254_Nogami],
|
||||
|
||||
@ -20,8 +20,7 @@ import
|
||||
# The bit count must be exact for the Miller loop
|
||||
const BN254_Snarks_pairing_ate_param* = block:
|
||||
# BN Miller loop is parametrized by 6u+2
|
||||
# +2 to bitlength so that we can mul by 3 for NAF encoding
|
||||
BigInt[65+2].fromHex"0x19d797039be763ba8"
|
||||
BigInt[65].fromHex"0x19d797039be763ba8"
|
||||
|
||||
const BN254_Snarks_pairing_ate_param_isNeg* = false
|
||||
|
||||
|
||||
@ -20,8 +20,7 @@ import
|
||||
# 1st part: f_{u+1,Q}(P)
|
||||
const BW6_761_pairing_ate_param_1_unopt* = block:
|
||||
# BW6-761 unoptimized Miller loop first part is parametrized by u+1
|
||||
# +1 to bitlength so that we can mul by 3 for NAF encoding
|
||||
BigInt[64+1].fromHex"0x8508c00000000002"
|
||||
BigInt[64].fromHex"0x8508c00000000002"
|
||||
|
||||
const BW6_761_pairing_ate_param_1_unopt_isNeg* = false
|
||||
|
||||
|
||||
@ -19,7 +19,7 @@ import
|
||||
#
|
||||
# ############################################################
|
||||
|
||||
func clearCofactorReference*(P: var ECP_ShortW_Prj[Fp[Pallas], G1]) {.inline.} =
|
||||
func clearCofactorReference*(P: var ECP_ShortW[Fp[Pallas], G1]) {.inline.} =
|
||||
## Clear the cofactor of Pallas G1
|
||||
## The Pasta curves have a prime-order group so this is a no-op
|
||||
discard
|
||||
|
||||
constantine/math/constants/secp256k1_subgroups.nim (new file, 37 lines)
@ -0,0 +1,37 @@
|
||||
# Constantine
|
||||
# Copyright (c) 2018-2019 Status Research & Development GmbH
|
||||
# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
|
||||
# Licensed and distributed under either of
|
||||
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
|
||||
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
|
||||
# at your option. This file may not be copied, modified, or distributed except according to those terms.
|
||||
|
||||
import
|
||||
# Internals
|
||||
../../platforms/abstractions,
|
||||
../config/curves,
|
||||
../arithmetic,
|
||||
../ec_shortweierstrass
|
||||
|
||||
# ############################################################
|
||||
#
|
||||
# Clear Cofactor - Naive
|
||||
#
|
||||
# ############################################################
|
||||
|
||||
func clearCofactorReference*(P: var ECP_ShortW[Fp[Secp256k1], G1]) {.inline.} =
|
||||
## Clear the cofactor of Secp256k1
|
||||
## The secp256k1 curve has a prime-order group so this is a no-op
|
||||
discard
|
||||
|
||||
# ############################################################
|
||||
#
|
||||
# Subgroup checks
|
||||
#
|
||||
# ############################################################
|
||||
|
||||
func isInSubgroup*(P: ECP_ShortW[Fp[Secp256k1], G1]): SecretBool {.inline.} =
|
||||
## This is a no-op, all points on curve are in the correct subgroup.
|
||||
##
|
||||
## Warning ⚠: Assumes that P is on curve
|
||||
return CtTrue
|
||||
@ -19,7 +19,7 @@ import
|
||||
#
|
||||
# ############################################################
|
||||
|
||||
func clearCofactorReference*(P: var ECP_ShortW_Prj[Fp[Vesta], G1]) {.inline.} =
|
||||
func clearCofactorReference*(P: var ECP_ShortW[Fp[Vesta], G1]) {.inline.} =
|
||||
## Clear the cofactor of Vesta G1
|
||||
## The Pasta curves have a prime-order group so this is a no-op
|
||||
discard
|
||||
|
||||
@ -49,3 +49,10 @@ func hasEndomorphismAcceleration*(C: static Curve): bool =
|
||||
Pallas,
|
||||
Vesta
|
||||
}
|
||||
|
||||
const EndomorphismThreshold* = 196
|
||||
## We use subtraction by the maximum infinity norm coefficient
|
||||
## to split scalars for endomorphisms
|
||||
## For small scalars the subtraction will overflow
|
||||
##
|
||||
## TODO: implement an alternative way to split scalars.
|
||||
@ -15,14 +15,16 @@ import
|
||||
./bn254_snarks_subgroups,
|
||||
./bw6_761_subgroups,
|
||||
./pallas_subgroups,
|
||||
./vesta_subgroups
|
||||
./vesta_subgroups,
|
||||
./secp256k1_subgroups
|
||||
|
||||
export
|
||||
bls12_377_subgroups,
|
||||
bls12_381_subgroups,
|
||||
bn254_nogami_subgroups,
|
||||
bn254_snarks_subgroups,
|
||||
bw6_761_subgroups
|
||||
bw6_761_subgroups,
|
||||
secp256k1_subgroups
|
||||
|
||||
func clearCofactor*[ECP](P: var ECP) {.inline.} =
|
||||
## Clear the cofactor of a point on the curve
|
||||
|
||||
@ -58,6 +58,8 @@ func decomposeEndo*[M, scalBits, L: static int](
|
||||
## We need to test the mini scalar, which is 65 bits so 2 Fp so about 2 cycles
|
||||
## and negate it as well.
|
||||
|
||||
static: doAssert scalBits >= L, "Cannot decompose a scalar smaller than a mini-scalar or the decomposition coefficient"
|
||||
|
||||
# Equal when no window or no negative handling, greater otherwise
|
||||
static: doAssert L >= (scalBits + M - 1) div M + 1
|
||||
const w = F.C.getCurveOrderBitwidth().wordsRequired()
|
||||
@ -493,7 +495,7 @@ func scalarMulGLV_m2w2*[scalBits; EC](
|
||||
mixin affine
|
||||
type ECaff = affine(EC)
|
||||
const C = P0.F.C # curve
|
||||
static: doAssert: scalBits == C.getCurveOrderBitwidth()
|
||||
static: doAssert: scalBits <= C.getCurveOrderBitwidth()
|
||||
|
||||
# 1. Compute endomorphisms
|
||||
when P0.G == G1:
|
||||
|
||||
411
constantine/math/elliptic/ec_multi_scalar_mul.nim
Normal file
411
constantine/math/elliptic/ec_multi_scalar_mul.nim
Normal file
@ -0,0 +1,411 @@
|
||||
# Constantine
|
||||
# Copyright (c) 2018-2019 Status Research & Development GmbH
|
||||
# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
|
||||
# Licensed and distributed under either of
|
||||
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
|
||||
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
|
||||
# at your option. This file may not be copied, modified, or distributed except according to those terms.
|
||||
|
||||
import ./ec_multi_scalar_mul_scheduler,
|
||||
./ec_endomorphism_accel,
|
||||
../constants/zoo_endomorphisms
|
||||
export bestBucketBitSize
|
||||
|
||||
# No exceptions allowed in core cryptographic operations
|
||||
{.push raises: [].}
|
||||
{.push checks: off.}
|
||||
|
||||
# ########################################################### #
|
||||
# #
|
||||
# Multi Scalar Multiplication #
|
||||
# #
|
||||
# ########################################################### #
|
||||
|
||||
# Multi-scalar-multiplication is the primary bottleneck in all zero-knowledge proofs and polynomial commitment schemes.
|
||||
# In particular, those are at the heart of zk-rollups to bundle a large amount of blockchain transactions.
|
||||
# They may have to add tens of millions of elliptic curve points to generate proofs,
|
||||
# requiring powerful machines, GPUs or even FPGA implementations.
|
||||
#
|
||||
# Multi-scalar multiplication does a linear combination of
|
||||
# R = [a₀]P₀ + [a₁]P₁ + ... + [aₙ]Pₙ
|
||||
#
|
||||
# The current iteration is a reference baseline before evaluating and adding various optimizations
|
||||
# (scalar recoding, change of coordinate systems, bucket sizing, sorting ...)
|
||||
#
|
||||
# See the literature references at the top of `ec_multi_scalar_mul_scheduler.nim`
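
# Illustrative toy model, not part of the library: `toyMsm` is a hypothetical
# name and integer addition stands in for elliptic curve point addition. It
# mirrors steps 1-3 of the reference implementation below: c-bit windows,
# bucket accumulation, bucket reduction via a running sum so that bucket b is
# effectively added b times, and c doublings to stitch windows together.

proc toyMsm(coefs, points: seq[int], c, bits: int): int =
  let numBuckets = (1 shl c) - 1
  let numWindows = (bits + c - 1) div c
  for w in countdown(numWindows - 1, 0):
    for _ in 0 ..< c:
      result = 2*result                       # "doublings" between windows (no-op on the first window)
    var buckets = newSeq[int](numBuckets)
    for j in 0 ..< coefs.len:
      let b = (coefs[j] shr (w*c)) and ((1 shl c) - 1)
      if b != 0:                              # bucket 0 is unused
        buckets[b-1] += points[j]             # 1. bucket accumulation
    var accum = 0
    var windowSum = 0
    for k in countdown(numBuckets - 1, 0):    # 2. bucket reduction:
      accum += buckets[k]                     #    windowSum ends up as [1]S₁ + [2]S₂ + ... + [2ᶜ-1]S₂ᶜ₋₁
      windowSum += accum
    result += windowSum                       # 3. final reduction (Horner over the windows)

when isMainModule:
  doAssert toyMsm(@[3, 255, 2], @[10, 20, 5], c = 4, bits = 8) == 3*10 + 255*20 + 2*5
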
|
||||
|
||||
func multiScalarMulImpl_reference_vartime[F, G; bits: static int](
|
||||
r: var ECP_ShortW[F, G],
|
||||
coefs: ptr UncheckedArray[BigInt[bits]], points: ptr UncheckedArray[ECP_ShortW_Aff[F, G]],
|
||||
N: int, c: static int) {.tags:[VarTime, HeapAlloc].} =
|
||||
## Inner implementation of MSM, for static dispatch over c, the bucket bit length
|
||||
## This is a straightforward translation of BDLO12, section 4
|
||||
|
||||
# Prologue
|
||||
# --------
|
||||
const numBuckets = 1 shl c - 1 # bucket 0 is unused
|
||||
const numWindows = (bits + c - 1) div c
|
||||
type EC = typeof(r)
|
||||
|
||||
let miniMSMs = allocHeapArray(EC, numWindows)
|
||||
let buckets = allocHeapArray(EC, numBuckets)
|
||||
|
||||
# Algorithm
|
||||
# ---------
|
||||
for w in 0 ..< numWindows:
|
||||
# Place our points in a bucket corresponding to
|
||||
# the value of their bit pattern in the current window of size c
|
||||
for i in 0 ..< numBuckets:
|
||||
buckets[i].setInf()
|
||||
|
||||
# 1. Bucket accumulation. Cost: n - (2ᶜ-1) => n points in 2ᶜ-1 buckets, first point per bucket is just copied
|
||||
for j in 0 ..< N:
|
||||
let b = cast[int](coefs[j].getWindowAt(w*c, c))
|
||||
if b == 0: # bucket 0 is unused, no need to add [0]Pⱼ
|
||||
continue
|
||||
else:
|
||||
buckets[b-1] += points[j]
|
||||
|
||||
# 2. Bucket reduction. Cost: 2x(2ᶜ-2) => 2 additions per 2ᶜ-1 bucket, last bucket is just copied
|
||||
# We have ordered subset sums in each bucket, we now need to compute the mini-MSM
|
||||
# [1]S₁ + [2]S₂ + [3]S₃ + ... + [2ᶜ-1]S₂ᶜ₋₁
|
||||
var accumBuckets{.noInit.}, miniMSM{.noInit.}: EC
|
||||
accumBuckets = buckets[numBuckets-1]
|
||||
miniMSM = buckets[numBuckets-1]
|
||||
|
||||
# Example with c = 3, 2³ = 8
|
||||
for k in countdown(numBuckets-2, 0):
|
||||
accumBuckets += buckets[k] # Stores S₈ then S₈+S₇ then S₈+S₇+S₆ then ...
|
||||
miniMSM += accumBuckets # Stores S₈ then [2]S₈+S₇ then [3]S₈+[2]S₇+S₆ then ...
|
||||
|
||||
miniMSMs[w] = miniMSM
|
||||
|
||||
# 3. Final reduction. Cost: (b/c - 1)x(c+1) => b/c windows, first is copied, c doublings + 1 addition per window
|
||||
r = miniMSMs[numWindows-1]
|
||||
for w in countdown(numWindows-2, 0):
|
||||
for _ in 0 ..< c:
|
||||
r.double()
|
||||
r += miniMSMs[w]
|
||||
|
||||
# Cleanup
|
||||
# -------
|
||||
buckets.freeHeap()
|
||||
miniMSMs.freeHeap()
|
||||
|
||||
func multiScalarMul_reference_vartime*[EC](r: var EC, coefs: openArray[BigInt], points: openArray[ECP_ShortW_Aff]) {.tags:[VarTime, HeapAlloc].} =
|
||||
## Multiscalar multiplication:
|
||||
## r <- [a₀]P₀ + [a₁]P₁ + ... + [aₙ]Pₙ
|
||||
debug: doAssert coefs.len == points.len
|
||||
|
||||
let N = points.len
|
||||
let coefs = coefs.asUnchecked()
|
||||
let points = points.asUnchecked()
|
||||
let c = bestBucketBitSize(N, BigInt.bits, useSignedBuckets = false, useManualTuning = false)
|
||||
|
||||
case c
|
||||
of 2: multiScalarMulImpl_reference_vartime(r, coefs, points, N, c = 2)
|
||||
of 3: multiScalarMulImpl_reference_vartime(r, coefs, points, N, c = 3)
|
||||
of 4: multiScalarMulImpl_reference_vartime(r, coefs, points, N, c = 4)
|
||||
of 5: multiScalarMulImpl_reference_vartime(r, coefs, points, N, c = 5)
|
||||
of 6: multiScalarMulImpl_reference_vartime(r, coefs, points, N, c = 6)
|
||||
of 7: multiScalarMulImpl_reference_vartime(r, coefs, points, N, c = 7)
|
||||
of 8: multiScalarMulImpl_reference_vartime(r, coefs, points, N, c = 8)
|
||||
of 9: multiScalarMulImpl_reference_vartime(r, coefs, points, N, c = 9)
|
||||
of 10: multiScalarMulImpl_reference_vartime(r, coefs, points, N, c = 10)
|
||||
of 11: multiScalarMulImpl_reference_vartime(r, coefs, points, N, c = 11)
|
||||
of 12: multiScalarMulImpl_reference_vartime(r, coefs, points, N, c = 12)
|
||||
of 13: multiScalarMulImpl_reference_vartime(r, coefs, points, N, c = 13)
|
||||
of 14: multiScalarMulImpl_reference_vartime(r, coefs, points, N, c = 14)
|
||||
of 15: multiScalarMulImpl_reference_vartime(r, coefs, points, N, c = 15)
|
||||
of 16: multiScalarMulImpl_reference_vartime(r, coefs, points, N, c = 16)
|
||||
of 17: multiScalarMulImpl_reference_vartime(r, coefs, points, N, c = 17)
|
||||
of 18: multiScalarMulImpl_reference_vartime(r, coefs, points, N, c = 18)
|
||||
of 19: multiScalarMulImpl_reference_vartime(r, coefs, points, N, c = 19)
|
||||
of 20: multiScalarMulImpl_reference_vartime(r, coefs, points, N, c = 20)
|
||||
of 21: multiScalarMulImpl_reference_vartime(r, coefs, points, N, c = 21)
|
||||
else:
|
||||
unreachable()
|
||||
|
||||
# ########################################################### #
|
||||
# #
|
||||
# Multi Scalar Multiplication #
|
||||
# Optimized versions #
|
||||
# #
|
||||
# ########################################################### #
|
||||
#
|
||||
# Multi-Scalar-Mul is the largest bottleneck in Zero-Knowledge-Proofs protocols
|
||||
# There are ways to avoid FFTs, none to avoid Multi-Scalar-Multiplication
|
||||
# Hence optimizing it is worth millions, see https://zprize.io
|
||||
|
||||
func accumulate[F, G](buckets: ptr UncheckedArray[ECP_ShortW_JacExt[F, G]], val: SecretWord, negate: SecretBool, point: ECP_ShortW_Aff[F, G]) {.inline, meter.} =
|
||||
let val = BaseType(val)
|
||||
if val == 0: # Skip [0]P
|
||||
return
|
||||
elif negate.bool:
|
||||
buckets[val-1] -= point
|
||||
else:
|
||||
buckets[val-1] += point
|
||||
|
||||
func bucketReduce[EC](r: var EC, buckets: ptr UncheckedArray[EC], numBuckets: static int) {.meter.} =
|
||||
# We interleave reduction with zero-ing the bucket to use instruction-level parallelism
|
||||
|
||||
var accumBuckets{.noInit.}: typeof(r)
|
||||
accumBuckets = buckets[numBuckets-1]
|
||||
r = buckets[numBuckets-1]
|
||||
buckets[numBuckets-1].setInf()
|
||||
|
||||
for k in countdown(numBuckets-2, 0):
|
||||
accumBuckets += buckets[k]
|
||||
r += accumBuckets
|
||||
buckets[k].setInf()
|
||||
|
||||
type MiniMsmKind = enum
|
||||
kTopWindow
|
||||
kFullWindow
|
||||
kBottomWindow
|
||||
|
||||
func miniMSM_jacext[F, G; bits: static int](
|
||||
r: var ECP_ShortW[F, G],
|
||||
buckets: ptr UncheckedArray[ECP_ShortW_JacExt[F, G]],
|
||||
bitIndex: int, miniMsmKind: static MiniMsmKind, c: static int,
|
||||
coefs: ptr UncheckedArray[BigInt[bits]], points: ptr UncheckedArray[ECP_ShortW_Aff[F, G]], N: int) {.meter.} =
|
||||
## Apply a mini-Multi-Scalar-Multiplication on [bitIndex, bitIndex+window)
|
||||
## slice of all (coef, point) pairs
|
||||
|
||||
const excess = bits mod c
|
||||
const top = bits - excess
|
||||
|
||||
# 1. Bucket Accumulation
|
||||
var curVal, nextVal: SecretWord
|
||||
var curNeg, nextNeg: SecretBool
|
||||
|
||||
template getSignedWindow(j : int): tuple[val: SecretWord, neg: SecretBool] =
|
||||
when miniMsmKind == kBottomWindow: coefs[j].getSignedBottomWindow(c)
|
||||
elif miniMsmKind == kTopWindow: coefs[j].getSignedTopWindow(top, excess)
|
||||
else: coefs[j].getSignedFullWindowAt(bitIndex, c)
|
||||
|
||||
(curVal, curNeg) = getSignedWindow(0)
|
||||
for j in 0 ..< N-1:
|
||||
(nextVal, nextNeg) = getSignedWindow(j+1)
|
||||
if nextVal.BaseType != 0:
|
||||
# In cryptography, points are indistinguishable from random
|
||||
# hence, without prefetching, accessing the next bucket is a guaranteed cache miss
|
||||
prefetchLarge(buckets[nextVal.BaseType-1].addr, Write, HighTemporalLocality, maxCacheLines = 2)
|
||||
buckets.accumulate(curVal, curNeg, points[j])
|
||||
curVal = nextVal
|
||||
curNeg = nextNeg
|
||||
buckets.accumulate(curVal, curNeg, points[N-1])
|
||||
|
||||
# 2. Bucket Reduction
|
||||
var sliceSum{.noinit.}: ECP_ShortW_JacExt[F, G]
|
||||
sliceSum.bucketReduce(buckets, numBuckets = 1 shl (c-1))
|
||||
|
||||
# 3. Mini-MSM on the slice [bitIndex, bitIndex+window)
|
||||
var windowSum{.noInit.}: typeof(r)
|
||||
windowSum.fromJacobianExtended_vartime(sliceSum)
|
||||
r += windowSum
|
||||
|
||||
when miniMsmKind != kBottomWindow:
|
||||
for _ in 0 ..< c:
|
||||
r.double()
|
||||
|
||||
func multiScalarMulJacExt_vartime[F, G; bits: static int](
|
||||
r: var ECP_ShortW[F, G],
|
||||
coefs: ptr UncheckedArray[BigInt[bits]], points: ptr UncheckedArray[ECP_ShortW_Aff[F, G]],
|
||||
N: int, c: static int) {.tags:[VarTime, HeapAlloc], meter.} =
|
||||
## Multiscalar multiplication:
|
||||
## r <- [a₀]P₀ + [a₁]P₁ + ... + [aₙ]Pₙ
|
||||
|
||||
# Setup
|
||||
# -----
|
||||
const numBuckets = 1 shl (c-1)
|
||||
type EcBucket = ECP_ShortW_JacExt[F, G]
|
||||
|
||||
let buckets = allocHeapArray(EcBucket, numBuckets)
|
||||
zeroMem(buckets[0].addr, sizeof(EcBucket) * numBuckets)
|
||||
|
||||
# Algorithm
|
||||
# ---------
|
||||
const excess = bits mod c
|
||||
const top = bits - excess
|
||||
var w = top
|
||||
r.setInf()
|
||||
|
||||
if excess != 0 and w != 0: # Prologue
|
||||
r.miniMSM_jacext(buckets, w, kTopWindow, c, coefs, points, N)
|
||||
w -= c
|
||||
|
||||
while w != 0: # Steady state
|
||||
r.miniMSM_jacext(buckets, w, kFullWindow, c, coefs, points, N)
|
||||
w -= c
|
||||
|
||||
block: # Epilogue
|
||||
r.miniMSM_jacext(buckets, w, kBottomWindow, c, coefs, points, N)
|
||||
|
||||
# Cleanup
|
||||
# -------
|
||||
buckets.freeHeap()
|
||||
|
||||
func miniMSM_affine[NumBuckets, QueueLen, F, G; bits: static int](
|
||||
r: var ECP_ShortW[F, G],
|
||||
sched: var Scheduler[NumBuckets, QueueLen, F, G],
|
||||
bitIndex: int, miniMsmKind: static MiniMsmKind, c: static int,
|
||||
coefs: ptr UncheckedArray[BigInt[bits]], N: int) {.meter.} =
|
||||
## Apply a mini-Multi-Scalar-Multiplication on [bitIndex, bitIndex+window)
|
||||
## slice of all (coef, point) pairs
|
||||
|
||||
const excess = bits mod c
|
||||
const top = bits - excess
|
||||
static: doAssert miniMsmKind != kTopWindow, "The top window is smaller in bits which increases collisions in scheduler."
|
||||
|
||||
sched.buckets[].init()
|
||||
|
||||
# 1. Bucket Accumulation
|
||||
var curSP, nextSP: ScheduledPoint
|
||||
|
||||
template getSignedWindow(j : int): tuple[val: SecretWord, neg: SecretBool] =
|
||||
when miniMsmKind == kBottomWindow: coefs[j].getSignedBottomWindow(c)
|
||||
elif miniMsmKind == kTopWindow: coefs[j].getSignedTopWindow(top, excess)
|
||||
else: coefs[j].getSignedFullWindowAt(bitIndex, c)
|
||||
|
||||
curSP = scheduledPointDescriptor(0, getSignedWindow(0))
|
||||
for j in 0 ..< N-1:
|
||||
nextSP = scheduledPointDescriptor(j+1, getSignedWindow(j+1))
|
||||
sched.prefetch(nextSP)
|
||||
sched.schedule(curSP)
|
||||
curSP = nextSP
|
||||
sched.schedule(curSP)
|
||||
sched.flushPendingAndReset()
|
||||
|
||||
# 2. Bucket Reduction
|
||||
var sliceSum{.noInit.}: ECP_ShortW_JacExt[F, G]
|
||||
sliceSum.bucketReduce(sched.buckets[])
|
||||
|
||||
# 3. Mini-MSM on the slice [bitIndex, bitIndex+window)
|
||||
var windowSum{.noInit.}: typeof(r)
|
||||
windowSum.fromJacobianExtended_vartime(sliceSum)
|
||||
r += windowSum
|
||||
|
||||
when miniMsmKind != kBottomWindow:
|
||||
for _ in 0 ..< c:
|
||||
r.double()
|
||||
|
||||
func multiScalarMulAffine_vartime[F, G; bits: static int](
|
||||
r: var ECP_ShortW[F, G],
|
||||
coefs: ptr UncheckedArray[BigInt[bits]], points: ptr UncheckedArray[ECP_ShortW_Aff[F, G]],
|
||||
N: int, c: static int) {.tags:[VarTime, Alloca, HeapAlloc], meter.} =
|
||||
## Multiscalar multiplication:
|
||||
## r <- [a₀]P₀ + [a₁]P₁ + ... + [aₙ]Pₙ
|
||||
|
||||
# Setup
|
||||
# -----
|
||||
const (numBuckets, queueLen) = c.deriveSchedulerConstants()
|
||||
let buckets = allocHeap(Buckets[numBuckets, F, G])
|
||||
buckets[].init()
|
||||
let sched = allocHeap(Scheduler[numBuckets, queueLen, F, G])
|
||||
sched[].init(points, buckets, 0, numBuckets.int32)
|
||||
|
||||
# Algorithm
|
||||
# ---------
|
||||
const excess = bits mod c
|
||||
const top = bits - excess
|
||||
var w = top
|
||||
r.setInf()
|
||||
|
||||
if excess != 0 and w != 0: # Prologue
|
||||
# The top window might use only a few bits, so the affine scheduler would likely have significant collisions
|
||||
zeroMem(sched.buckets.ptJacExt.addr, buckets.ptJacExt.sizeof())
|
||||
r.miniMSM_jacext(sched.buckets.ptJacExt.asUnchecked(), w, kTopWindow, c, coefs, points, N)
|
||||
w -= c
|
||||
|
||||
while w != 0: # Steady state
|
||||
r.miniMSM_affine(sched[], w, kFullWindow, c, coefs, N)
|
||||
w -= c
|
||||
|
||||
block: # Epilogue
|
||||
r.miniMSM_affine(sched[], w, kBottomWindow, c, coefs, N)
|
||||
|
||||
# Cleanup
|
||||
# -------
|
||||
sched.freeHeap()
|
||||
buckets.freeHeap()
|
||||
|
||||
func multiScalarMul_dispatch_vartime[bits: static int, F, G](
|
||||
r: var ECP_ShortW[F, G], coefs: ptr UncheckedArray[BigInt[bits]],
|
||||
points: ptr UncheckedArray[ECP_ShortW_Aff[F, G]], N: int) =
|
||||
## Multiscalar multiplication:
|
||||
## r <- [a₀]P₀ + [a₁]P₁ + ... + [aₙ]Pₙ
|
||||
let c = bestBucketBitSize(N, bits, useSignedBuckets = true, useManualTuning = true)
|
||||
|
||||
case c
|
||||
of 2: multiScalarMulJacExt_vartime(r, coefs, points, N, c = 2)
|
||||
of 3: multiScalarMulJacExt_vartime(r, coefs, points, N, c = 3)
|
||||
of 4: multiScalarMulJacExt_vartime(r, coefs, points, N, c = 4)
|
||||
of 5: multiScalarMulJacExt_vartime(r, coefs, points, N, c = 5)
|
||||
of 6: multiScalarMulJacExt_vartime(r, coefs, points, N, c = 6)
|
||||
of 7: multiScalarMulJacExt_vartime(r, coefs, points, N, c = 7)
|
||||
of 8: multiScalarMulJacExt_vartime(r, coefs, points, N, c = 8)
|
||||
of 9: multiScalarMulAffine_vartime(r, coefs, points, N, c = 9)
|
||||
of 10: multiScalarMulAffine_vartime(r, coefs, points, N, c = 10)
|
||||
of 11: multiScalarMulAffine_vartime(r, coefs, points, N, c = 11)
|
||||
of 12: multiScalarMulAffine_vartime(r, coefs, points, N, c = 12)
|
||||
of 13: multiScalarMulAffine_vartime(r, coefs, points, N, c = 13)
|
||||
of 14: multiScalarMulAffine_vartime(r, coefs, points, N, c = 14)
|
||||
of 15: multiScalarMulAffine_vartime(r, coefs, points, N, c = 15)
|
||||
of 16: multiScalarMulAffine_vartime(r, coefs, points, N, c = 16)
|
||||
of 17: multiScalarMulAffine_vartime(r, coefs, points, N, c = 17)
|
||||
of 18: multiScalarMulAffine_vartime(r, coefs, points, N, c = 18)
|
||||
else:
|
||||
unreachable()
|
||||
|
||||
func multiScalarMul_vartime*[bits: static int, F, G](
|
||||
r: var ECP_ShortW[F, G],
|
||||
coefs: openArray[BigInt[bits]],
|
||||
points: openArray[ECP_ShortW_Aff[F, G]]) {.tags:[VarTime, Alloca, HeapAlloc], meter.} =
|
||||
## Multiscalar multiplication:
|
||||
## r <- [a₀]P₀ + [a₁]P₁ + ... + [aₙ]Pₙ
|
||||
|
||||
debug: doAssert coefs.len == points.len
|
||||
let N = points.len
|
||||
|
||||
when bits <= F.C.getCurveOrderBitwidth() and
|
||||
F.C.hasEndomorphismAcceleration():
|
||||
# TODO, min amount of bits for endomorphisms?
|
||||
|
||||
const M = when F is Fp: 2
|
||||
elif F is Fp2: 4
|
||||
else: {.error: "Unconfigured".}
|
||||
|
||||
const L = (bits + M - 1) div M + 1
|
||||
let splitCoefs = allocHeapArray(array[M, BigInt[L]], N)
|
||||
let endoBasis = allocHeapArray(array[M, ECP_ShortW_Aff[F, G]], N)
|
||||
|
||||
for i in 0 ..< N:
|
||||
var negatePoints {.noinit.}: array[M, SecretBool]
|
||||
splitCoefs[i].decomposeEndo(negatePoints, coefs[i], F)
|
||||
if negatePoints[0].bool:
|
||||
endoBasis[i][0].neg(points[i])
|
||||
else:
|
||||
endoBasis[i][0] = points[i]
|
||||
|
||||
when F is Fp:
|
||||
endoBasis[i][1].x.prod(points[i].x, F.C.getCubicRootOfUnity_mod_p())
|
||||
if negatePoints[1].bool:
|
||||
endoBasis[i][1].y.neg(points[i].y)
|
||||
else:
|
||||
endoBasis[i][1].y = points[i].y
|
||||
else:
|
||||
staticFor m, 1, M:
|
||||
endoBasis[i][m].frobenius_psi(points[i], m)
|
||||
if negatePoints[m].bool:
|
||||
endoBasis[i][m].neg()
|
||||
|
||||
let endoCoefs = cast[ptr UncheckedArray[BigInt[L]]](splitCoefs)
|
||||
let endoPoints = cast[ptr UncheckedArray[ECP_ShortW_Aff[F, G]]](endoBasis)
|
||||
multiScalarMul_dispatch_vartime(r, endoCoefs, endoPoints, M*N)
|
||||
|
||||
endoBasis.freeHeap()
|
||||
splitCoefs.freeHeap()
|
||||
|
||||
else:
|
||||
multiScalarMul_dispatch_vartime(r, coefs.asUnchecked(), points.asUnchecked(), N)
|
||||
constantine/math/elliptic/ec_multi_scalar_mul_scheduler.nim (new file, 611 lines)
@ -0,0 +1,611 @@
|
||||
# Constantine
|
||||
# Copyright (c) 2018-2019 Status Research & Development GmbH
|
||||
# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
|
||||
# Licensed and distributed under either of
|
||||
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
|
||||
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
|
||||
# at your option. This file may not be copied, modified, or distributed except according to those terms.
|
||||
|
||||
import
|
||||
../../platforms/abstractions,
|
||||
../arithmetic,
|
||||
../ec_shortweierstrass,
|
||||
./ec_shortweierstrass_jacobian_extended,
|
||||
./ec_shortweierstrass_batch_ops
|
||||
|
||||
export abstractions, arithmetic,
|
||||
ec_shortweierstrass, ec_shortweierstrass_jacobian_extended
|
||||
|
||||
# No exceptions allowed in core cryptographic operations
|
||||
{.push raises: [].}
|
||||
{.push checks: off.}
|
||||
|
||||
# ########################################################### #
|
||||
# #
|
||||
# Multi Scalar Multiplication - Scheduling #
|
||||
# #
|
||||
# ########################################################### #
|
||||
|
||||
# This file implements a bucketing acceleration structure.
|
||||
#
|
||||
# See the following for the baseline algorithm:
|
||||
# - Faster batch forgery identification
|
||||
# Daniel J. Bernstein, Jeroen Doumen, Tanja Lange, and Jan-Jaap Oosterwijk, 2012
|
||||
# https://eprint.iacr.org/2012/549.pdf
|
||||
# - Simple guide to fast linear combinations (aka multiexponentiations)
|
||||
# Vitalik Buterin, 2020
|
||||
# https://ethresear.ch/t/simple-guide-to-fast-linear-combinations-aka-multiexponentiations/7238
|
||||
# https://github.com/ethereum/research/blob/5c6fec6/fast_linear_combinations/multicombs.py
|
||||
# - zkStudyClub: Multi-scalar multiplication: state of the art & new ideas
|
||||
# Gus Gutoski, 2020
|
||||
# https://www.youtube.com/watch?v=Bl5mQA7UL2I
|
||||
#
|
||||
# And for the scheduling technique and collision probability analysis
|
||||
# - FPGA Acceleration of Multi-Scalar Multiplication: CycloneMSM
|
||||
# Kaveh Aasaraai, Don Beaver, Emanuele Cesena, Rahul Maganti, Nicolas Stalder and Javier Varela, 2022
|
||||
# https://eprint.iacr.org/2022/1396.pdf
|
||||
#
|
||||
# Challenges:
|
||||
# - For the popular BLS12-377 and BLS12-381, an affine elliptic point takes 96 bytes
|
||||
# an extended jacobian point takes 192 bytes.
|
||||
# - We want to deal with a large number of points, for example the Zprize competition used 2²⁶ ~= 67M points
|
||||
# in particular, memory usage is a concern as those inputs already require ~6.7GB for a BLS12 prime,
|
||||
# so we can't use much scratchspace, especially on GPUs.
|
||||
# - Any bit-twiddling algorithm must scale at most linearly with the number of points
|
||||
# Algorithms that, for example, find the most common pair of points for an optimized addition chain
|
||||
# are O(n²) and will need to select from a subsample.
|
||||
# - The scalars are random, so the bucket accessed is random, which needs sorting or prefetching
|
||||
# to avoid bottlenecking on memory bandwidth. But sorting requires copies ...
|
||||
# - While copies improve locality, our types are huge, 96~192 bytes
|
||||
# and we have millions of them.
|
||||
# - We want our algorithm to be scalable to a large number of threads at minimum, or even better on GPUs.
|
||||
# Hence it should naturally offer data parallelism, which is tricky due to collisions when accumulating
|
||||
# 1M points into 32~64K buckets.
|
||||
# - The asymptotically fastest addition formulae are affine addition with individual cost 3M + 1I
|
||||
# and an asymptotic cost for N points of N*3M + N*3M + 1I using batch inversion.
|
||||
# Vartime inversion costs 70-100M depending on the number of bits in the prime
|
||||
# (multiplication cost scales quadratically with the bit size while Euclid-style inversion scales linearly)
|
||||
# - The second fastest general coordinate system is Extended Jacobian with cost 10M,
|
||||
# so the threshold for N is:
|
||||
# N*3M+N*3M+100M < N*10M <=> 100M < N * 4M <=> 25 < N
|
||||
# Hence we want to maximize the chance of doing 25 additions (so we need 50 points).
|
||||
# Given that there is a low probability for consecutive random points to be assigned to the same bucket,
|
||||
# we can't keep a queue per bucket for batch accumulation.
|
||||
# However we can do a vector addition as there is a high probability that consecutive random points
|
||||
# are assigned to different buckets.
|
||||
#
|
||||
# Strategy:
|
||||
# - Each bucket is associated with (EC Affine, EC ExtJac, set[Empty, AffineSet, ExtJacSet]), in SoA storage
|
||||
# - Each thread is assigned a range of buckets and keeps a scheduler
|
||||
# start, stop: int32
|
||||
# curQueue, curRescheduled: int32
|
||||
# bucketMap: BigInt[NumNZBuckets]
|
||||
# queue: array[MaxCapacity, (Target Bucket, PointID)]
|
||||
# rescheduled: array[32, (Target Bucket, PointID)]
|
||||
# - when the queue reaches max capacity, we compute a vector affine addition with the target buckets
|
||||
# we interleave with prefetching to reduce cache misses.
|
||||
# - when the rescheduled array reaches max capacity, we check if there are at least 32 items in the queue
|
||||
# and if so schedule a vector addition, otherwise we flush the queue into the EC ExtJac.
|
||||
# i.e. in the worst case, when all points are the same, we fall back to the JacExt MSM.
|
||||
# - As a stretch optimization, if many points in the rescheduled queue target the same bucket
|
||||
# we can use sum_reduce_vartime, but are there workloads like that?
|
||||
#
|
||||
# The queue size is given by the formula `4*c² - 16*c - 128`, balancing several concerns: amortization of batch affine addition, memory usage and collision probability.
|
||||
# `c` is chosen to minimize the number of EC operations but does not take into account memory bandwidth and cache miss costs.
|
||||
#
|
||||
# Collision probability for `QueueSize` consecutive *uniformly random* points
|
||||
# is derived from a Poisson distribution.
|
||||
# NumCollisions = N*QueueSize/NumNZBuckets is the expected number of collisions
|
||||
# NumCollisions / N is the probability of collision
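# Worked example (see the c = 15 row below): for N = 2¹⁷ inputs, QueueSize = 532 and
# NumNZBuckets = 2¹⁴ = 16384, so NumCollisions = 131072·532/16384 = 4256,
# i.e. a per-point collision probability of about 3.2%.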
|
||||
|
||||
# -------inputs------- c ----buckets---- queue length collision map bytes num collisions collision %
|
||||
# 2^0 1 2 2^1 2 -144 8 -72 -7200.0%
|
||||
# 2^1 2 2 2^1 2 -144 8 -144 -7200.0%
|
||||
# 2^2 4 3 2^2 4 -140 8 -140 -3500.0%
|
||||
# 2^3 8 3 2^2 4 -140 8 -280 -3500.0%
|
||||
# 2^4 16 4 2^3 8 -128 8 -256 -1600.0%
|
||||
# 2^5 32 5 2^4 16 -108 8 -216 -675.0%
|
||||
# 2^6 64 5 2^4 16 -108 8 -432 -675.0%
|
||||
# 2^7 128 6 2^5 32 -80 8 -320 -250.0%
|
||||
# 2^8 256 7 2^6 64 -44 8 -176 -68.8%
|
||||
# 2^9 512 8 2^7 128 0 16 0 0.0%
|
||||
# 2^10 1024 9 2^8 256 52 32 208 20.3% <- At half the queue length, we can still amortize batch inversion
|
||||
# 2^11 2048 9 2^8 256 52 32 416 20.3%
|
||||
# 2^12 4096 10 2^9 512 112 64 896 21.9%
|
||||
# 2^13 8192 11 2^10 1024 180 128 1440 17.6%
|
||||
# 2^14 16384 12 2^11 2048 256 256 2048 12.5%
|
||||
# 2^15 32768 13 2^12 4096 340 512 2720 8.3%
|
||||
# 2^16 65536 14 2^13 8192 432 1024 3456 5.3%
|
||||
# 2^17 131072 15 2^14 16384 532 2048 4256 3.2% <- 100/32 = 3.125, a collision queue of size 32 is highly unlikely to reach full capacity
|
||||
# 2^18 262144 16 2^15 32768 640 4096 5120 2.0% <- ~10MB of buckets
|
||||
# 2^19 524288 17 2^16 65536 756 8192 6048 1.2% <- for BLS12-381, the queue size reaches 64K aliasing conflict threshold
|
||||
# 2^20 1048576 17 2^16 65536 756 8192 12096 1.2%
|
||||
# 2^21 2097152 18 2^17 131072 880 16384 14080 0.7%
|
||||
# 2^22 4194304 19 2^18 262144 1012 32768 16192 0.4%
|
||||
# 2^23 8388608 20 2^19 524288 1152 65536 18432 0.2%
|
||||
# 2^24 16777216 21 2^20 1048576 1300 131072 20800 0.1%
|
||||
# 2^25 33554432 22 2^21 2097152 1456 262144 23296 0.1%
|
||||
# 2^26 67108864 23 2^22 4194304 1620 524288 25920 0.0%
|
||||
# 2^27 134217728 24 2^23 8388608 1792 1048576 28672 0.0%
|
||||
# 2^28 268435456 25 2^24 16777216 1972 2097152 31552 0.0%
|
||||
# 2^29 536870912 26 2^25 33554432 2160 4194304 34560 0.0%
|
||||
# 2^30 1073741824 27 2^26 67108864 2356 8388608 37696 0.0%
|
||||
# 2^31 2147483648 28 2^27 134217728 2560 16777216 40960 0.0%
|
||||
# 2^32 4294967296 29 2^28 268435456 2772 33554432 44352 0.0%
|
||||
# 2^33 8589934592 30 2^29 536870912 2992 67108864 47872 0.0%
|
||||
# 2^34 17179869184 31 2^30 1073741824 3220 134217728 51520 0.0%
|
||||
# 2^35 34359738368 32 2^31 2147483648 3456 268435456 55296 0.0%
|
||||
#
|
||||
# The code to reproduce this table is at the bottom
|
||||
|
||||
# Sizes for BLS12-381 with c = 16
|
||||
#
|
||||
# Buckets: 32768
|
||||
# - Status: 1 32768
|
||||
# - Affine: 96 3145728
|
||||
# - ExtJac: 192 6291456
|
||||
# ----------------------------------
|
||||
# Total 289 9 469 952 ~= 10MB
|
||||
#
|
||||
# Scheduler: 1 per thread
|
||||
# - start, stop: 8
|
||||
# - queue cursors: 8
|
||||
# - bucketMap: 4096
|
||||
# - rescheduled: 256
|
||||
# -----------------------------------
|
||||
# Total 4368 ~= 4KB per thread
|
||||
|
||||
# ########################################################### #
|
||||
# #
|
||||
# General utilities #
|
||||
# #
|
||||
# ########################################################### #
|
||||
|
||||
func bestBucketBitSize*(inputSize: int, scalarBitwidth: static int, useSignedBuckets, useManualTuning: static bool): int {.inline.} =
|
||||
## Evaluate the best bucket bit-size for the input size.
|
||||
## That bucket size minimizes group operations.
|
||||
## This ignores cache effects. Computation can become memory-bound, especially with large buckets
|
||||
## that don't fit in L1 cache, trigger the 64K aliasing conflict or worse (overflowing L2 cache or TLB).
|
||||
## In particular, scalars are expected to be indistinguishable from random, so the buckets accessed during accumulation
|
||||
## will be in a random pattern, triggering cache misses.
|
||||
|
||||
# Raw operation cost is approximately
|
||||
# 1. Bucket accumulation
|
||||
# n - (2ᶜ-1) additions for b/c windows or n - (2ᶜ⁻¹-1) if using signed buckets
|
||||
# 2. Bucket reduction
|
||||
# 2x(2ᶜ-2) additions for b/c windows or 2x(2ᶜ⁻¹-2)
|
||||
# 3. Final reduction
|
||||
# (b/c - 1) x (c doublings + 1 addition)
|
||||
# Total
|
||||
# b/c (n + 2ᶜ - 2) A + (b/c - 1) x (c*D + A)
|
||||
# https://www.youtube.com/watch?v=Bl5mQA7UL2I
|
||||
|
||||
# A doubling costs 50% of an addition with jacobian coordinates
|
||||
# and between 60% (BLS12-381 G1) and 66% (BN254-Snarks G1)
|
||||
|
||||
const A = 10'f32 # Addition cost
|
||||
const D = 6'f32 # Doubling cost
|
||||
|
||||
const s = int useSignedBuckets
|
||||
let n = inputSize
|
||||
let b = float32(scalarBitwidth)
|
||||
var minCost = float32(Inf)
|
||||
for c in 2 .. 21:
|
||||
let b_over_c = b/c.float32
|
||||
|
||||
let bucket_accumulate_reduce = b_over_c * float32(n + (1 shl (c-s)) - 2) * A
|
||||
let final_reduction = (b_over_c - 1'f32) * (c.float32*D + A)
|
||||
let cost = bucket_accumulate_reduce + final_reduction
|
||||
if cost < minCost:
|
||||
minCost = cost
|
||||
result = c
|
||||
|
||||
# Manual tuning, memory bandwidth / cache boundaries of
|
||||
# L1 and L2 caches, TLB and the 64K aliasing conflict
|
||||
# are not taken into account in previous formula.
|
||||
# Each increase in c doubles memory used.
|
||||
when useManualTuning:
|
||||
if 14 <= result:
|
||||
result -= 1
|
||||
if 15 <= result:
|
||||
result -= 1
|
||||
if 16 <= result:
|
||||
result -= 1
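# Standalone sketch of the same cost model (stdlib only), handy to see how the
# chosen window size grows with the input size. `bestC` and `logN` are illustrative
# names; the real entry point is `bestBucketBitSize` above.
import std/math

proc bestC(n: int, b = 255.0, A = 10.0, D = 6.0, signed = true): int =
  let s = if signed: 1 else: 0
  var minCost = Inf
  for c in 2 .. 21:
    let bOverC = b / c.float
    let cost = bOverC * float(n + (1 shl (c - s)) - 2) * A +
               (bOverC - 1.0) * (c.float * D + A)
    if cost < minCost:
      minCost = cost
      result = c

when isMainModule:
  for logN in [10, 14, 18, 22, 26]:
    echo "2^", logN, " points -> c = ", bestC(1 shl logN)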
|
||||
|
||||
# Extended Jacobian generic bindings
|
||||
# ----------------------------------
|
||||
# All vartime procedures MUST be tagged vartime
|
||||
# Hence we do not expose `sum` or `+=` for extended jacobian operation to prevent `vartime` mistakes
|
||||
# we create a local `sum` or `+=` for this module only
|
||||
func `+=`*[F; G: static Subgroup](P: var ECP_ShortW_JacExt[F, G], Q: ECP_ShortW_JacExt[F, G]) {.inline.}=
|
||||
P.sum_vartime(P, Q)
|
||||
func `+=`*[F; G: static Subgroup](P: var ECP_ShortW_JacExt[F, G], Q: ECP_ShortW_Aff[F, G]) {.inline.}=
|
||||
P.madd_vartime(P, Q)
|
||||
func `-=`*[F; G: static Subgroup](P: var ECP_ShortW_JacExt[F, G], Q: ECP_ShortW_Aff[F, G]) {.inline.}=
|
||||
P.msub_vartime(P, Q)
|
||||
|
||||
# ########################################################### #
|
||||
# #
|
||||
# Scheduler #
|
||||
# #
|
||||
# ########################################################### #
|
||||
#
|
||||
# "磨刀不误砍柴功"
|
||||
# "Sharpening the axe will not delay cutting the wood" - Chinese proverb
|
||||
|
||||
type
|
||||
BucketStatus = enum
|
||||
kAffine, kJacExt
|
||||
|
||||
Buckets*[N: static int, F; G: static Subgroup] = object
|
||||
status: array[N, set[BucketStatus]]
|
||||
ptAff: array[N, ECP_ShortW_Aff[F, G]]
|
||||
ptJacExt*: array[N, ECP_ShortW_JacExt[F, G]] # Public for the top window
|
||||
|
||||
ScheduledPoint* = object
|
||||
bucket {.bitsize:26.}: int64 # Supports up to 2²⁵ = 33 554 432 buckets and -1 for the skipped bucket 0
|
||||
sign {.bitsize: 1.}: int64
|
||||
pointID {.bitsize:37.}: int64 # Supports up to 2³⁷ = 137 438 953 472 points
|
||||
|
||||
Scheduler*[NumNZBuckets, QueueLen: static int, F; G: static Subgroup] = object
|
||||
points: ptr UncheckedArray[ECP_ShortW_Aff[F, G]]
|
||||
buckets*: ptr Buckets[NumNZBuckets, F, G]
|
||||
start, stopEx: int32 # Bucket range
|
||||
numScheduled, numCollisions: int32
|
||||
collisionsMap: BigInt[NumNZBuckets] # We use a BigInt as a bitmap, when all you have is an axe ...
|
||||
queue: array[QueueLen, ScheduledPoint]
|
||||
collisions: array[32, ScheduledPoint]
|
||||
|
||||
const MinVectorAddThreshold = 32
|
||||
|
||||
func init*(buckets: var Buckets) {.inline.} =
|
||||
zeroMem(buckets.status.addr, buckets.status.sizeof())
|
||||
|
||||
func reset(buckets: var Buckets, index: int) {.inline.} =
|
||||
buckets.status[index] = {}
|
||||
|
||||
func deriveSchedulerConstants*(c: int): tuple[numNZBuckets, queueLen: int] {.compileTime.} =
|
||||
# Returns the number of non-zero buckets and the scheduler queue length
|
||||
result.numNZBuckets = 1 shl (c-1)
|
||||
result.queueLen = max(MinVectorAddThreshold, 4*c*c - 16*c - 128)
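# For example, c = 16 gives numNZBuckets = 2¹⁵ = 32768 and
# queueLen = max(32, 4·16² - 16·16 - 128) = 640, matching the c = 16 row of the table above.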
|
||||
|
||||
func init*[NumNZBuckets, QueueLen: static int, F; G: static Subgroup](
|
||||
sched: var Scheduler[NumNZBuckets, QueueLen, F, G], points: ptr UncheckedArray[ECP_ShortW_Aff[F, G]],
|
||||
buckets: ptr Buckets[NumNZBuckets, F, G], start, stopEx: int32) {.inline.} =
|
||||
## init a scheduler overseeing buckets [start, stopEx)
|
||||
## within the indices [0, NumNZBuckets). Bucket for value 0 is considered at index -1.
|
||||
sched.points = points
|
||||
sched.buckets = buckets
|
||||
sched.start = start
|
||||
sched.stopEx = stopEx
|
||||
sched.numScheduled = 0
|
||||
sched.numCollisions = 0
|
||||
|
||||
func scheduledPointDescriptor*(pointIndex: int, pointDesc: tuple[val: SecretWord, neg: SecretBool]): ScheduledPoint {.inline.} =
|
||||
ScheduledPoint(
|
||||
bucket: cast[int64](pointDesc.val)-1, # shift bucket by 1 as bucket 0 is skipped
|
||||
sign: cast[int64](pointDesc.neg),
|
||||
pointID: cast[int64](pointIndex))
|
||||
|
||||
func enqueuePoint(sched: var Scheduler, sp: ScheduledPoint) {.inline.} =
|
||||
sched.queue[sched.numScheduled] = sp
|
||||
sched.collisionsMap.setBit(sp.bucket.int)
|
||||
sched.numScheduled += 1
|
||||
|
||||
func handleCollision(sched: var Scheduler, sp: ScheduledPoint)
|
||||
func rescheduleCollisions(sched: var Scheduler)
|
||||
func sparseVectorAddition[F, G](
|
||||
buckets: ptr UncheckedArray[ECP_ShortW_Aff[F, G]],
|
||||
bucketStatuses: ptr UncheckedArray[set[BucketStatus]],
|
||||
points: ptr UncheckedArray[ECP_ShortW_Aff[F, G]],
|
||||
scheduledPoints: ptr UncheckedArray[ScheduledPoint],
|
||||
numScheduled: int32) {.noInline, tags:[VarTime, Alloca].}
|
||||
|
||||
func prefetch*(sched: Scheduler, sp: ScheduledPoint) =
|
||||
let bucket = sp.bucket
|
||||
if bucket == -1:
|
||||
return
|
||||
|
||||
prefetch(sched.buckets.status[bucket].addr, Write, HighTemporalLocality)
|
||||
prefetchLarge(sched.buckets.ptAff[bucket].addr, Write, HighTemporalLocality, maxCacheLines = 1)
|
||||
prefetchLarge(sched.buckets.ptJacExt[bucket].addr, Write, HighTemporalLocality, maxCacheLines = 1)
|
||||
|
||||
func schedule*(sched: var Scheduler, sp: ScheduledPoint) =
|
||||
## Schedule a point for accumulating in buckets
|
||||
|
||||
let bucket = int sp.bucket
|
||||
if not(sched.start <= bucket and bucket < sched.stopEx):
|
||||
return
|
||||
|
||||
if kAffine notin sched.buckets.status[bucket]: # Random access, prefetch to avoid cache-misses
|
||||
if sp.sign == 0:
|
||||
sched.buckets.ptAff[bucket] = sched.points[sp.pointID]
|
||||
else:
|
||||
sched.buckets.ptAff[bucket].neg(sched.points[sp.pointID])
|
||||
sched.buckets.status[bucket].incl(kAffine)
|
||||
return
|
||||
|
||||
if sched.collisionsMap.bit(bucket).bool:
|
||||
sched.handleCollision(sp)
|
||||
return
|
||||
|
||||
sched.enqueuePoint(sp)
|
||||
|
||||
if sched.numScheduled == sched.queue.len:
|
||||
sparseVectorAddition(
|
||||
sched.buckets.ptAff.asUnchecked(), sched.buckets.status.asUnchecked(),
|
||||
sched.points, sched.queue.asUnchecked(), sched.numScheduled)
|
||||
sched.numScheduled = 0
|
||||
sched.collisionsMap.setZero()
|
||||
sched.rescheduleCollisions()
|
||||
|
||||
func handleCollision(sched: var Scheduler, sp: ScheduledPoint) =
|
||||
if sched.numCollisions < sched.collisions.len:
|
||||
sched.collisions[sched.numCollisions] = sp
|
||||
sched.numCollisions += 1
|
||||
return
|
||||
|
||||
# If we want to optimize for a workload where many multipliers are the same, this is the place
|
||||
if kJacExt notin sched.buckets.status[sp.bucket]:
|
||||
sched.buckets.ptJacExt[sp.bucket].fromAffine(sched.points[sp.pointID])
|
||||
if sp.sign != 0:
|
||||
sched.buckets.ptJacExt[sp.bucket].neg()
|
||||
sched.buckets.status[sp.bucket].incl(kJacExt)
|
||||
return
|
||||
|
||||
if sp.sign == 0:
|
||||
sched.buckets.ptJacExt[sp.bucket] += sched.points[sp.pointID]
|
||||
else:
|
||||
sched.buckets.ptJacExt[sp.bucket] -= sched.points[sp.pointID]
|
||||
|
||||
func rescheduleCollisions(sched: var Scheduler) =
|
||||
template last: untyped = sched.numCollisions-1
|
||||
var i = last()
|
||||
while i >= 0:
|
||||
let sp = sched.collisions[i]
|
||||
if not sched.collisionsMap.bit(sp.bucket.int).bool:
|
||||
sched.enqueuePoint(sp)
|
||||
if i != last():
|
||||
sched.collisions[i] = sched.collisions[last()]
|
||||
sched.numCollisions -= 1
|
||||
i -= 1
|
||||
|
||||
func flushBuffer(sched: var Scheduler, buf: ptr UncheckedArray[ScheduledPoint], count: var int32) =
|
||||
for i in 0 ..< count:
|
||||
let sp = buf[i]
|
||||
if kJacExt in sched.buckets.status[sp.bucket]:
|
||||
if sp.sign == 0:
|
||||
sched.buckets.ptJacExt[sp.bucket] += sched.points[sp.pointID]
|
||||
else:
|
||||
sched.buckets.ptJacExt[sp.bucket] -= sched.points[sp.pointID]
|
||||
else:
|
||||
sched.buckets.ptJacExt[sp.bucket].fromAffine(sched.points[sp.pointID])
|
||||
if sp.sign != 0:
|
||||
sched.buckets.ptJacExt[sp.bucket].neg()
|
||||
sched.buckets.status[sp.bucket].incl(kJacExt)
|
||||
count = 0
|
||||
|
||||
func flushPendingAndReset*(sched: var Scheduler) =
|
||||
if sched.numScheduled >= MinVectorAddThreshold:
|
||||
sparseVectorAddition(
|
||||
sched.buckets.ptAff.asUnchecked(), sched.buckets.status.asUnchecked(),
|
||||
sched.points, sched.queue.asUnchecked(), sched.numScheduled)
|
||||
sched.numScheduled = 0
|
||||
|
||||
if sched.numScheduled > 0:
|
||||
sched.flushBuffer(sched.queue.asUnchecked(), sched.numScheduled)
|
||||
|
||||
if sched.numCollisions > 0:
|
||||
sched.flushBuffer(sched.collisions.asUnchecked(), sched.numCollisions)
|
||||
|
||||
sched.collisionsMap.setZero()
|
||||
|
||||
# ########################################################### #
|
||||
# #
|
||||
# Computation #
|
||||
# #
|
||||
# ########################################################### #
|
||||
|
||||
func sparseVectorAddition[F, G](
|
||||
buckets: ptr UncheckedArray[ECP_ShortW_Aff[F, G]],
|
||||
bucketStatuses: ptr UncheckedArray[set[BucketStatus]],
|
||||
points: ptr UncheckedArray[ECP_ShortW_Aff[F, G]],
|
||||
scheduledPoints: ptr UncheckedArray[ScheduledPoint],
|
||||
numScheduled: int32
|
||||
) {.noInline, tags:[VarTime, Alloca].} =
|
||||
## Does a sparse vector addition: buckets += scheduledPoints
|
||||
## This implementation is optimized using batch affine inversion
|
||||
## with an asymptotic cost for N points of N*6M + I
|
||||
## where M is field multiplication and I the field inversion.
|
||||
##
|
||||
## Inversion usually costs between 66M to 120M depending on implementation:
|
||||
## - scaling linearly with bits (Euclid, Lehmer, Stein, Bernstein-Yang, Pornin algorithm)
|
||||
## - scaling quadratically with bits if using Fermat's Little Theorem a⁻¹ ≡ aᵖ⁻² (mod p) with addition chains
|
||||
## - constant-time or variable time
|
||||
##
|
||||
## `scheduledPoints` must all target a different bucket.
|
||||
template sps: untyped = scheduledPoints
|
||||
|
||||
type SpecialCase = enum
|
||||
kRegular, kInfLhs, kInfRhs, kOpposite
|
||||
|
||||
let lambdas = allocStackArray(tuple[num, den: F], numScheduled)
|
||||
let accumDen = allocStackArray(F, numScheduled)
|
||||
let specialCases = allocStackArray(SpecialCase, numScheduled)
|
||||
|
||||
# Step 1: Compute numerators and denominators of λᵢ = λᵢ_num / λᵢ_den
|
||||
for i in 0 ..< numScheduled:
|
||||
|
||||
template skipSpecialCase {.dirty.} =
|
||||
if i == 0: accumDen[i].setOne()
|
||||
else: accumDen[i] = accumDen[i-1]
|
||||
continue
|
||||
|
||||
if i != numScheduled - 1:
|
||||
prefetchLarge(points[sps[i+1].pointID].addr, Read, HighTemporalLocality, maxCacheLines = 4)
|
||||
prefetch(bucketStatuses[sps[i+1].bucket].addr, Read, HighTemporalLocality)
|
||||
prefetchLarge(buckets[sps[i+1].bucket].addr, Read, HighTemporalLocality, maxCacheLines = 4)
|
||||
|
||||
# Special cases 1: infinity points have affine coordinates (0, 0) by convention
|
||||
# it doesn't match the y²=x³+ax+b equation so the slope formula needs special handling
|
||||
if (kAffine notin bucketStatuses[sps[i].bucket]) or buckets[sps[i].bucket].isInf().bool:
|
||||
specialCases[i] = kInfLhs
|
||||
skipSpecialCase()
|
||||
elif points[sps[i].pointID].isInf().bool:
|
||||
specialCases[i] = kInfRhs
|
||||
skipSpecialCase()
|
||||
|
||||
# Special case 2: λ = (Qy-Py)/(Qx-Px) which is undefined when Px == Qx
|
||||
# This happens when P == Q or P == -Q
|
||||
if bool(buckets[sps[i].bucket].x == points[sps[i].pointID].x):
|
||||
if sps[i].sign == 0:
|
||||
if bool(buckets[sps[i].bucket].y == points[sps[i].pointID].y):
|
||||
lambdaDouble(lambdas[i].num, lambdas[i].den, buckets[sps[i].bucket])
|
||||
else:
|
||||
specialCases[i] = kOpposite
|
||||
skipSpecialCase()
|
||||
else:
|
||||
if bool(buckets[sps[i].bucket].y == points[sps[i].pointID].y):
|
||||
specialCases[i] = kOpposite
|
||||
skipSpecialCase()
|
||||
else:
|
||||
lambdaDouble(lambdas[i].num, lambdas[i].den, buckets[sps[i].bucket])
|
||||
else:
|
||||
if sps[i].sign == 0:
|
||||
lambdaAdd(lambdas[i].num, lambdas[i].den, buckets[sps[i].bucket], points[sps[i].pointID])
|
||||
else:
|
||||
lambdaSub(lambdas[i].num, lambdas[i].den, buckets[sps[i].bucket], points[sps[i].pointID])
|
||||
|
||||
# Step 2: Accumulate denominators.
|
||||
specialCases[i] = kRegular
|
||||
if i == 0:
|
||||
accumDen[i] = lambdas[i].den
|
||||
elif i == numScheduled-1:
|
||||
accumDen[i].prod(accumDen[i-1], lambdas[i].den)
|
||||
else:
|
||||
accumDen[i].prod(accumDen[i-1], lambdas[i].den, skipFinalSub = true)
|
||||
|
||||
# Step 3: Batch invert
|
||||
var accInv {.noInit.}: F
|
||||
accInv.inv_vartime(accumDen[numScheduled-1])
|
||||
|
||||
# Step 4: Output the sums
|
||||
for i in countdown(numScheduled-1, 1):
|
||||
prefetchLarge(points[sps[i-1].pointID].addr, Read, HighTemporalLocality, maxCacheLines = 4)
|
||||
prefetchLarge(buckets[sps[i-1].bucket].addr, Write, HighTemporalLocality, maxCacheLines = 4)
|
||||
|
||||
if specialCases[i] == kInfLhs:
|
||||
if sps[i].sign == 0:
|
||||
buckets[sps[i].bucket] = points[sps[i].pointID]
|
||||
else:
|
||||
buckets[sps[i].bucket].neg(points[sps[i].pointID])
|
||||
bucketStatuses[sps[i].bucket].incl(kAffine)
|
||||
continue
|
||||
elif specialCases[i] == kInfRhs:
|
||||
continue
|
||||
elif specialCases[i] == kOpposite:
|
||||
buckets[sps[i].bucket].setInf()
|
||||
bucketStatuses[sps[i].bucket].excl(kAffine)
|
||||
continue
|
||||
|
||||
# Compute lambda - destroys accumDen[i]
|
||||
accumDen[i].prod(accInv, accumDen[i-1], skipFinalSub = true)
|
||||
accumDen[i].prod(accumDen[i], lambdas[i].num, skipFinalSub = true)
|
||||
|
||||
# Compute EC addition
|
||||
var r{.noInit.}: ECP_ShortW_Aff[F, G]
|
||||
r.affineAdd(lambda = accumDen[i], buckets[sps[i].bucket], points[sps[i].pointID]) # points[sps[i].pointID].y unused even if sign is negative
|
||||
|
||||
# Store result
|
||||
buckets[sps[i].bucket] = r
|
||||
|
||||
# Next iteration
|
||||
accInv.prod(accInv, lambdas[i].den, skipFinalSub = true)
|
||||
|
||||
block: # tail
|
||||
if specialCases[0] == kInfLhs:
|
||||
if sps[0].sign == 0:
|
||||
buckets[sps[0].bucket] = points[sps[0].pointID]
|
||||
else:
|
||||
buckets[sps[0].bucket].neg(points[sps[0].pointID])
|
||||
bucketStatuses[sps[0].bucket].incl(kAffine)
|
||||
elif specialCases[0] == kInfRhs:
|
||||
discard
|
||||
elif specialCases[0] == kOpposite:
|
||||
buckets[sps[0].bucket].setInf()
|
||||
bucketStatuses[sps[0].bucket].excl(kAffine)
|
||||
else:
|
||||
# Compute lambda
|
||||
accumDen[0].prod(lambdas[0].num, accInv, skipFinalSub = true)
|
||||
|
||||
# Compute EC addition
|
||||
var r{.noInit.}: ECP_ShortW_Aff[F, G]
|
||||
r.affineAdd(lambda = accumDen[0], buckets[sps[0].bucket], points[sps[0].pointID])
|
||||
|
||||
# Store result
|
||||
buckets[sps[0].bucket] = r
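# Minimal standalone sketch of the Montgomery batch-inversion trick used above
# (prefix products, a single inversion, then unwinding), over a toy prime field.
# `invMod`, `batchInv` and the modulus `toyP` are illustrative only.
const toyP = 101

proc invMod(a, m: int): int =
  ## a⁻¹ mod m via Fermat's little theorem (m prime): a⁻¹ ≡ aᵐ⁻² (mod m)
  var base = a mod m
  var exp = m - 2
  result = 1
  while exp > 0:
    if (exp and 1) == 1:
      result = result * base mod m
    base = base * base mod m
    exp = exp shr 1

proc batchInv(xs: openArray[int]): seq[int] =
  ## Invert every element with a single modular inversion
  result = newSeq[int](xs.len)
  var acc = 1
  for i in 0 ..< xs.len:
    result[i] = acc                          # product of xs[0 ..< i]
    acc = acc * xs[i] mod toyP
  var accInv = invMod(acc, toyP)             # the only inversion
  for i in countdown(xs.len - 1, 0):
    result[i] = result[i] * accInv mod toyP  # = xs[i]⁻¹
    accInv = accInv * xs[i] mod toyP

let toyXs = @[3, 7, 12, 45]
let toyInvs = batchInv(toyXs)
for i in 0 ..< toyXs.len:
  assert toyXs[i] * toyInvs[i] mod toyP == 1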
|
||||
|
||||
func bucketReduce*[N, F, G](
|
||||
r: var ECP_ShortW_JacExt[F, G],
|
||||
buckets: var Buckets[N, F, G]) =
|
||||
|
||||
var accumBuckets{.noinit.}: ECP_ShortW_JacExt[F, G]
|
||||
|
||||
if kAffine in buckets.status[N-1]:
|
||||
if kJacExt in buckets.status[N-1]:
|
||||
accumBuckets.madd_vartime(buckets.ptJacExt[N-1], buckets.ptAff[N-1])
|
||||
else:
|
||||
accumBuckets.fromAffine(buckets.ptAff[N-1])
|
||||
elif kJacExt in buckets.status[N-1]:
|
||||
accumBuckets = buckets.ptJacExt[N-1]
|
||||
else:
|
||||
accumBuckets.setInf()
|
||||
r = accumBuckets
|
||||
buckets.reset(N-1)
|
||||
|
||||
for k in countdown(N-2, 0):
|
||||
if kAffine in buckets.status[k]:
|
||||
if kJacExt in buckets.status[k]:
|
||||
var t{.noInit.}: ECP_ShortW_JacExt[F, G]
|
||||
t.madd_vartime(buckets.ptJacExt[k], buckets.ptAff[k])
|
||||
accumBuckets += t
|
||||
else:
|
||||
accumBuckets += buckets.ptAff[k]
|
||||
elif kJacExt in buckets.status[k]:
|
||||
accumBuckets += buckets.ptJacExt[k]
|
||||
|
||||
buckets.reset(k)
|
||||
r += accumBuckets
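# Toy check (plain integers, illustrative only) of the running suffix-sum reduction above:
# bucket k holds the partial sum for digit k+1, and Σ (k+1)·bucket[k] equals the
# repeated "accum += bucket[k]; r += accum" scan from the top bucket down.
block:
  let toyBuckets = @[3, 0, 7, 2, 5]
  var accum = 0
  var acc = 0
  for k in countdown(toyBuckets.len - 1, 0):
    accum += toyBuckets[k]
    acc += accum
  var expected = 0
  for k in 0 ..< toyBuckets.len:
    expected += (k + 1) * toyBuckets[k]
  assert acc == expected   # both are 57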
|
||||
|
||||
# ########################################################### #
|
||||
# #
|
||||
# Statistics generation #
|
||||
# #
|
||||
# ########################################################### #
|
||||
|
||||
when isMainModule:
|
||||
import strformat
|
||||
|
||||
proc echoSchedulingParameter(logInputSize: int, echoHeader = false) {.raises:[ValueError].} =
|
||||
|
||||
const titles = ["-------inputs-------", "c", "----buckets----", "queue length", "collision map bytes", "num collisions", "collision %"]
|
||||
const header = &"{titles[0]:>16} {titles[1]:>3} {titles[2]:>19} {titles[3]:>13} {titles[4]:>16} {titles[5]:>14} {titles[6]:>12}"
|
||||
|
||||
if echoHeader:
|
||||
echo header
|
||||
return
|
||||
|
||||
let inputSize = 1 shl logInputSize
|
||||
let c = inputSize.bestBucketBitSize(255, useSignedBuckets = true, useManualTuning = false)
|
||||
let twoPow = "2^"
|
||||
let numNZBuckets = 1 shl (c-1)
|
||||
let collisionMapSize = ((1 shl (c-1))+63) div 64 * 8 # Stored in BigInt[1 shl (c-1)]
|
||||
let queueSize = 4*c*c - 16*c - 128
|
||||
let numCollisions = float(inputSize*queueSize) / float(numNZBuckets)
|
||||
let collisionPercentage = numCollisions / float(inputSize) * 100
|
||||
|
||||
echo &"{twoPow & $logInputSize:>4} {inputSize:>14} {c:>3} {twoPow & $(c-1):>4} {numNZBuckets:>11} {queueSize:>13} {collisionMapSize:>19} {numCollisions:>14} {collisionPercentage:>11.1f}%"
|
||||
|
||||
echoSchedulingParameter(0, echoHeader = true)
|
||||
for n in 0 ..< 36:
|
||||
echoSchedulingParameter(n)
|
||||
@ -242,6 +242,7 @@ func scalarMul*[EC](
|
||||
## Those will be assumed to maintain constant-time property
|
||||
when BigInt.bits <= EC.F.C.getCurveOrderBitwidth() and
|
||||
EC.F.C.hasEndomorphismAcceleration():
|
||||
# TODO, min amount of bits for endomorphisms?
|
||||
when EC.F is Fp:
|
||||
P.scalarMulGLV_m2w2(scalar)
|
||||
elif EC.F is Fp2:
|
||||
|
||||
constantine/math/elliptic/ec_scalar_mul_vartime.nim (new file, 128 lines)
@@ -0,0 +1,128 @@
|
||||
# Constantine
|
||||
# Copyright (c) 2018-2019 Status Research & Development GmbH
|
||||
# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
|
||||
# Licensed and distributed under either of
|
||||
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
|
||||
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
|
||||
# at your option. This file may not be copied, modified, or distributed except according to those terms.
|
||||
|
||||
import
|
||||
# Internals
|
||||
../arithmetic,
|
||||
../ec_shortweierstrass,
|
||||
../io/io_bigints,
|
||||
../../platforms/abstractions
|
||||
|
||||
{.push raises: [].} # No exceptions allowed in core cryptographic operations
|
||||
{.push checks: off.} # No defects due to array bound checking or signed integer overflow allowed
|
||||
|
||||
# Support files for testing Elliptic Curve arithmetic
|
||||
# ------------------------------------------------------------------------------
|
||||
|
||||
iterator unpack(scalarByte: byte): bool =
|
||||
yield bool((scalarByte and 0b10000000) shr 7)
|
||||
yield bool((scalarByte and 0b01000000) shr 6)
|
||||
yield bool((scalarByte and 0b00100000) shr 5)
|
||||
yield bool((scalarByte and 0b00010000) shr 4)
|
||||
yield bool((scalarByte and 0b00001000) shr 3)
|
||||
yield bool((scalarByte and 0b00000100) shr 2)
|
||||
yield bool((scalarByte and 0b00000010) shr 1)
|
||||
yield bool( scalarByte and 0b00000001)
|
||||
|
||||
func scalarMul_doubleAdd_vartime*[EC](P: var EC, scalar: BigInt) {.tags:[VarTime].} =
|
||||
## **Variable-time** Elliptic Curve Scalar Multiplication
|
||||
##
|
||||
## P <- [k] P
|
||||
##
|
||||
## This uses the double-and-add algorithm
|
||||
## This MUST NOT be used with secret data.
|
||||
##
|
||||
## This is highly VULNERABLE to timing attacks and power analysis attacks.
|
||||
var scalarCanonical: array[(scalar.bits+7) div 8, byte]
|
||||
scalarCanonical.marshal(scalar, bigEndian)
|
||||
|
||||
var Paff {.noinit.}: affine(EC)
|
||||
Paff.affine(P)
|
||||
|
||||
P.setInf()
|
||||
for scalarByte in scalarCanonical:
|
||||
for bit in unpack(scalarByte):
|
||||
P.double()
|
||||
if bit:
|
||||
P += Paff
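# Toy version of the same MSB-first scan over plain integers, where "double" is ·2
# and "add" is +P; `scalarMulToy` is an illustrative name, not part of the library.
proc scalarMulToy(k, P: int): int =
  for bit in countdown(63, 0):
    result = result * 2           # P.double()
    if ((k shr bit) and 1) == 1:
      result += P                 # P += Paff

assert scalarMulToy(0b1011, 5) == 11 * 5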
|
||||
|
||||
func scalarMul_minHammingWeight_vartime*[EC](P: var EC, scalar: BigInt) {.tags:[VarTime].} =
|
||||
## **Variable-time** Elliptic Curve Scalar Multiplication
|
||||
##
|
||||
## P <- [k] P
|
||||
##
|
||||
## This uses an online recoding with minimum Hamming Weight
|
||||
## (which is not NAF, NAF is least-significant bit to most)
|
||||
## This MUST NOT be used with secret data.
|
||||
##
|
||||
## This is highly VULNERABLE to timing attacks and power analysis attacks
|
||||
var Paff {.noinit.}: affine(EC)
|
||||
Paff.affine(P)
|
||||
|
||||
P.setInf()
|
||||
for bit in recoding_l2r_signed_vartime(scalar):
|
||||
P.double()
|
||||
if bit == 1:
|
||||
P += Paff
|
||||
elif bit == -1:
|
||||
P -= Paff
|
||||
|
||||
func scalarMul_minHammingWeight_windowed_vartime*[EC](P: var EC, scalar: BigInt, window: static int) {.tags:[VarTime, Alloca].} =
|
||||
## **Variable-time** Elliptic Curve Scalar Multiplication
|
||||
##
|
||||
## P <- [k] P
|
||||
##
|
||||
## This uses windowed-NAF (wNAF)
|
||||
## This MUST NOT be used with secret data.
|
||||
##
|
||||
## This is highly VULNERABLE to timing attacks and power analysis attacks
|
||||
|
||||
# Using signed digits divides the precomputation table size by 2
|
||||
# Odd-only divides precomputation table size by another 2
|
||||
const precompSize = 1 shl (window - 2)
|
||||
|
||||
when window <= 8:
|
||||
type I = int8
|
||||
elif window <= 16:
|
||||
type I = int16
|
||||
elif window <= 32:
|
||||
type I = int32
|
||||
else:
|
||||
type I = int64
|
||||
|
||||
var naf {.noInit.}: array[BigInt.bits+1, I]
|
||||
let nafLen = naf.recode_r2l_signed_window_vartime(scalar, window)
|
||||
|
||||
var P2{.noInit.}: EC
|
||||
P2.double(P)
|
||||
|
||||
var tabEC {.noinit.}: array[precompSize, EC]
|
||||
tabEC[0] = P
|
||||
for i in 1 ..< tabEC.len:
|
||||
tabEC[i].sum(tabEC[i-1], P2)
|
||||
|
||||
var tab {.noinit.}: array[precompSize, affine(EC)]
|
||||
tab.batchAffine(tabEC)
|
||||
|
||||
# init
|
||||
if naf[nafLen-1] > 0:
|
||||
P.fromAffine(tab[naf[nafLen-1] shr 1])
|
||||
elif naf[nafLen-1] < 0:
|
||||
P.fromAffine(tab[-naf[nafLen-1] shr 1])
|
||||
P.neg()
|
||||
else:
|
||||
P.setInf()
|
||||
|
||||
# steady state
|
||||
for i in 1 ..< nafLen:
|
||||
P.double()
|
||||
let digit = naf[nafLen-1-i]
|
||||
if digit > 0:
|
||||
P += tab[digit shr 1]
|
||||
elif digit < 0:
|
||||
P -= tab[-digit shr 1]
|
||||
@ -7,12 +7,13 @@
|
||||
# at your option. This file may not be copied, modified, or distributed except according to those terms.
|
||||
|
||||
import
|
||||
../../platforms/[abstractions, allocs],
|
||||
../../platforms/abstractions,
|
||||
../arithmetic,
|
||||
../extension_fields,
|
||||
./ec_shortweierstrass_affine,
|
||||
./ec_shortweierstrass_jacobian,
|
||||
./ec_shortweierstrass_projective
|
||||
./ec_shortweierstrass_projective,
|
||||
./ec_shortweierstrass_jacobian_extended
|
||||
|
||||
# No exceptions allowed, or array bound checks or integer overflow
|
||||
{.push raises: [], checks:off.}
|
||||
@ -27,7 +28,7 @@ import
|
||||
func batchAffine*[F, G](
|
||||
affs: ptr UncheckedArray[ECP_ShortW_Aff[F, G]],
|
||||
projs: ptr UncheckedArray[ECP_ShortW_Prj[F, G]],
|
||||
N: int) =
|
||||
N: int) {.noInline, tags:[Alloca].} =
|
||||
# Algorithm: Montgomery's batch inversion
|
||||
# - Speeding the Pollard and Elliptic Curve Methods of Factorization
|
||||
# Section 10.3.1
|
||||
@ -87,7 +88,7 @@ func batchAffine*[N: static int, F, G](
|
||||
func batchAffine*[F, G](
|
||||
affs: ptr UncheckedArray[ECP_ShortW_Aff[F, G]],
|
||||
jacs: ptr UncheckedArray[ECP_ShortW_Jac[F, G]],
|
||||
N: int) =
|
||||
N: int) {.noInline, tags:[Alloca].} =
|
||||
# Algorithm: Montgomery's batch inversion
|
||||
# - Speeding the Pollard and Elliptic Curve Methods of Factorization
|
||||
# Section 10.3.1
|
||||
@ -153,7 +154,7 @@ func batchAffine*[N: static int, F, G](
|
||||
# ############################################################
|
||||
#
|
||||
# Elliptic Curve in Short Weierstrass form
|
||||
# Batch addition
|
||||
# Sum Reduction
|
||||
#
|
||||
# ############################################################
|
||||
|
||||
@ -192,7 +193,7 @@ func batchAffine*[N: static int, F, G](
|
||||
# However, n inversions can use Montgomery's batch inversion
|
||||
# at the cost of 3(n-1)M + 1I
|
||||
#
|
||||
# Hence batch addition can have an asymptotic cost of
|
||||
# Hence sum reduction can have an asymptotic cost of
|
||||
# 5M + 1S
|
||||
# Compared to
|
||||
# Jacobian addition: 12M + 4S
|
||||
@ -200,12 +201,18 @@ func batchAffine*[N: static int, F, G](
|
||||
# Projective addition: 12M (for curves in the form y² = x³ + b)
|
||||
# Projective mixed addition: 11M (for curves in the form y² = x³ + b)
|
||||
|
||||
func lambdaAdd[F; G: static Subgroup](lambda_num, lambda_den: var F, P, Q: ECP_ShortW_Aff[F, G]) =
|
||||
func lambdaAdd*[F; G: static Subgroup](lambda_num, lambda_den: var F, P, Q: ECP_ShortW_Aff[F, G]) {.inline.} =
|
||||
## Compute the slope of the line (PQ)
|
||||
lambda_num.diff(Q.y, P.y)
|
||||
lambda_den.diff(Q.x, P.x)
|
||||
|
||||
func lambdaDouble[F; G: static Subgroup](lambda_num, lambda_den: var F, P: ECP_ShortW_Aff[F, G]) =
|
||||
func lambdaSub*[F; G: static Subgroup](lambda_num, lambda_den: var F, P, Q: ECP_ShortW_Aff[F, G]) {.inline.} =
|
||||
## Compute the slope of the line (PQ)
|
||||
lambda_num.neg(Q.y)
|
||||
lambda_num -= P.y
|
||||
lambda_den.diff(Q.x, P.x)
|
||||
|
||||
func lambdaDouble*[F; G: static Subgroup](lambda_num, lambda_den: var F, P: ECP_ShortW_Aff[F, G]) {.inline.} =
|
||||
## Compute the tangent at P
|
||||
lambda_num.square(P.x)
|
||||
lambda_num *= 3
|
||||
@ -214,11 +221,11 @@ func lambdaDouble[F; G: static Subgroup](lambda_num, lambda_den: var F, P: ECP_S
|
||||
|
||||
lambda_den.double(P.y)
|
||||
|
||||
func affineAdd[F; G: static Subgroup](
|
||||
r: var ECP_ShortW_Aff[F, G],
|
||||
lambda: var F,
|
||||
func affineAdd*[F; G: static Subgroup](
|
||||
r{.noAlias.}: var ECP_ShortW_Aff[F, G],
|
||||
lambda: F,
|
||||
P, Q: ECP_ShortW_Aff[F, G]) =
|
||||
|
||||
## `r` MUST NOT alias P or Q
|
||||
r.x.square(lambda)
|
||||
r.x -= P.x
|
||||
r.x -= Q.x
|
||||
@ -229,8 +236,7 @@ func affineAdd[F; G: static Subgroup](
|
||||
|
||||
func accum_half_vartime[F; G: static Subgroup](
|
||||
points: ptr UncheckedArray[ECP_ShortW_Aff[F, G]],
|
||||
lambdas: ptr UncheckedArray[tuple[num, den: F]],
|
||||
len: uint) {.noinline.} =
|
||||
len: uint) {.noInline, tags:[VarTime, Alloca].} =
|
||||
## Affine accumulation of half the points into the other half
|
||||
## Warning ⚠️ : variable-time
|
||||
##
|
||||
@ -241,17 +247,13 @@ func accum_half_vartime[F; G: static Subgroup](
|
||||
## Partial sums are stored in [0, len/2)
|
||||
## [len/2, len) data has been destroyed
|
||||
##
|
||||
## Scratchspace:
|
||||
## - Lambdas
|
||||
##
|
||||
## Output:
|
||||
## - r
|
||||
##
|
||||
## Warning ⚠️ : cannot be inlined if used in loop due to the use of alloca
|
||||
|
||||
debug: doAssert len and 1 == 0, "There must be an even number of points"
|
||||
|
||||
let N = len div 2
|
||||
let N = int(len div 2)
|
||||
let lambdas = allocStackArray(tuple[num, den: F], N)
|
||||
|
||||
# Step 1: Compute numerators and denominators of λᵢ = λᵢ_num / λᵢ_den
|
||||
for i in 0 ..< N:
|
||||
@ -293,14 +295,14 @@ func accum_half_vartime[F; G: static Subgroup](
|
||||
# Step 2: Accumulate denominators in Qy, which is not used anymore.
|
||||
if i == 0:
|
||||
points[q].y = lambdas[i].den
|
||||
elif i == N-1:
|
||||
points[q].y.prod(points[q_prev].y, lambdas[i].den)
|
||||
else:
|
||||
points[q].y.prod(points[q_prev].y, lambdas[i].den, skipFinalSub = true)
|
||||
|
||||
# Step 3: batch invert
|
||||
var accInv {.noInit.}: F
|
||||
accInv.setZero()
|
||||
points[len-1].y += accInv # Undo skipFinalSub, ensure that the last accum is in canonical form, before inversion
|
||||
accInv.inv(points[len-1].y)
|
||||
accInv.inv_vartime(points[len-1].y)
|
||||
|
||||
# Step 4: Compute the partial sums
|
||||
|
||||
@ -311,7 +313,7 @@ func accum_half_vartime[F; G: static Subgroup](
|
||||
if points[p].isInf().bool():
|
||||
points[i] = points[q]
|
||||
elif points[q].x.isZero().bool() and lambdas[i].num.isZero().bool():
|
||||
discard "points[i] = points[p]" # i == p
|
||||
discard "points[q] is infinity => point[p] unchanged"
|
||||
else:
|
||||
points[i].setInf()
|
||||
|
||||
@ -356,17 +358,23 @@ func accum_half_vartime[F; G: static Subgroup](
|
||||
# Store result
|
||||
points[0] = r
|
||||
|
||||
# Batch addition: jacobian
|
||||
# Batch addition - High-level
|
||||
# ------------------------------------------------------------
|
||||
|
||||
template `+=`[F; G: static Subgroup](P: var ECP_ShortW_JacExt[F, G], Q: ECP_ShortW_Aff[F, G]) =
|
||||
# All vartime procedures MUST be tagged vartime
|
||||
# Hence we do not expose `+=` for extended jacobian operation to prevent `vartime` mistakes
|
||||
# The following algorithms are all tagged vartime, hence for genericity
|
||||
# we create a local `+=` for this module only
|
||||
madd_vartime(P, P, Q)
|
||||
|
||||
func accumSum_chunk_vartime[F; G: static Subgroup](
|
||||
r: var (ECP_ShortW_Jac[F, G] or ECP_ShortW_Prj[F, G]),
|
||||
points: ptr UncheckedArray[ECP_ShortW_Aff[F, G]],
|
||||
lambdas: ptr UncheckedArray[tuple[num, den: F]],
|
||||
len: uint) =
|
||||
r: var (ECP_ShortW_Jac[F, G] or ECP_ShortW_Prj[F, G] or ECP_ShortW_JacExt[F, G]),
|
||||
points: ptr UncheckedArray[ECP_ShortW_Aff[F, G]], len: uint) =
|
||||
## Accumulate `points` into r.
|
||||
## `r` is NOT overwritten
|
||||
## r += ∑ points
|
||||
## `points` are destroyed
|
||||
|
||||
const minNumPointsSerial = 16
|
||||
var n = len
|
||||
@ -378,7 +386,7 @@ func accumSum_chunk_vartime[F; G: static Subgroup](
|
||||
n -= 1
|
||||
|
||||
# Compute [0, n/2) += [n/2, n)
|
||||
accum_half_vartime(points, lambdas, n)
|
||||
accum_half_vartime(points, n)
|
||||
|
||||
# Next chunk
|
||||
n = n div 2
|
||||
@ -387,11 +395,11 @@ func accumSum_chunk_vartime[F; G: static Subgroup](
|
||||
for i in 0'u ..< n:
|
||||
r += points[i]
|
||||
|
||||
func sum_batch_vartime*[F; G: static Subgroup](
|
||||
r: var (ECP_ShortW_Jac[F, G] or ECP_ShortW_Prj[F, G]),
|
||||
points: ptr UncheckedArray[ECP_ShortW_Aff[F, G]], pointsLen: int) =
|
||||
## Batch addition of `points` into `r`
|
||||
## `r` is overwritten
|
||||
func accum_batch_vartime[F; G: static Subgroup](
|
||||
r: var (ECP_ShortW_Jac[F, G] or ECP_ShortW_Prj[F, G] or ECP_ShortW_JacExt[F, G]),
|
||||
points: ptr UncheckedArray[ECP_ShortW_Aff[F, G]], pointsLen: int) {.noInline, tags:[VarTime, Alloca].} =
|
||||
## Batch accumulation of `points` into `r`
|
||||
## `r` is accumulated into, not overwritten
|
||||
|
||||
# We chunk the addition to limit memory usage
|
||||
# especially as we allocate on the stack.
|
||||
@ -412,27 +420,81 @@ func sum_batch_vartime*[F; G: static Subgroup](
|
||||
# After one chunk is processed we are well within all 64-bit CPU L2 cache bounds
|
||||
# as we halve after each chunk.
|
||||
|
||||
r.setInf()
|
||||
|
||||
const maxTempMem = 262144 # 2¹⁸ = 262144
|
||||
const maxStride = maxTempMem div sizeof(ECP_ShortW_Aff[F, G])
|
||||
|
||||
let n = min(maxStride, pointsLen)
|
||||
let accumulators = allocStackArray(ECP_ShortW_Aff[F, G], n)
|
||||
let lambdas = allocStackArray(tuple[num, den: F], n)
|
||||
|
||||
for i in countup(0, pointsLen-1, maxStride):
|
||||
let n = min(maxStride, pointsLen - i)
|
||||
let size = n * sizeof(ECP_ShortW_Aff[F, G])
|
||||
copyMem(accumulators[0].addr, points[i].unsafeAddr, size)
|
||||
r.accumSum_chunk_vartime(accumulators, lambdas, uint n)
|
||||
r.accumSum_chunk_vartime(accumulators, uint n)
|
||||
|
||||
func sum_batch_vartime*[F; G: static Subgroup](
|
||||
r: var (ECP_ShortW_Jac[F, G] or ECP_ShortW_Prj[F, G]),
|
||||
points: openArray[ECP_ShortW_Aff[F, G]]) {.inline.} =
|
||||
func sum_reduce_vartime*[F; G: static Subgroup](
|
||||
r: var (ECP_ShortW_Jac[F, G] or ECP_ShortW_Prj[F, G] or ECP_ShortW_JacExt[F, G]),
|
||||
points: ptr UncheckedArray[ECP_ShortW_Aff[F, G]], pointsLen: int) {.inline, tags:[VarTime, Alloca].} =
|
||||
## Batch addition of `points` into `r`
|
||||
## `r` is overwritten
|
||||
if points.len == 0:
|
||||
r.setInf()
|
||||
if pointsLen == 0:
|
||||
return
|
||||
r.sum_batch_vartime(points.asUnchecked(), points.len)
|
||||
r.accum_batch_vartime(points, pointsLen)
|
||||
|
||||
func sum_reduce_vartime*[F; G: static Subgroup](
|
||||
r: var (ECP_ShortW_Jac[F, G] or ECP_ShortW_Prj[F, G] or ECP_ShortW_JacExt[F, G]),
|
||||
points: openArray[ECP_ShortW_Aff[F, G]]) {.inline, tags:[VarTime, Alloca].} =
|
||||
## Batch addition of `points` into `r`
|
||||
## `r` is overwritten
|
||||
r.sum_reduce_vartime(points.asUnchecked(), points.len)
|
||||
|
||||
# ############################################################
|
||||
#
|
||||
# EC Addition Accumulator
|
||||
#
|
||||
# ############################################################
|
||||
|
||||
# Accumulators stores partial additions
|
||||
# They allow supporting EC additions in a streaming fashion
|
||||
|
||||
type EcAddAccumulator_vartime*[EC, F; G: static Subgroup; AccumMax: static int] = object
|
||||
## Elliptic curve addition accumulator
|
||||
## **Variable-Time**
|
||||
# The `cur` is dereferenced first so better locality if at the beginning
|
||||
# Do we want alignment guarantees?
|
||||
cur: uint32
|
||||
accum: EC
|
||||
buffer: array[AccumMax, ECP_ShortW_Aff[F, G]]
|
||||
|
||||
func init*(ctx: var EcAddAccumulator_vartime) =
|
||||
static: doAssert EcAddAccumulator_vartime.AccumMax >= 16, "There is no point in a EcAddBatchAccumulator if the batch size is too small"
|
||||
ctx.accum.setInf()
|
||||
ctx.cur = 0
|
||||
|
||||
func consumeBuffer[EC, F; G: static Subgroup; AccumMax: static int](
|
||||
ctx: var EcAddAccumulator_vartime[EC, F, G, AccumMax]) {.noInline, tags: [VarTime, Alloca].}=
|
||||
if ctx.cur == 0:
|
||||
return
|
||||
|
||||
let lambdas = allocStackArray(tuple[num, den: F], ctx.cur.int)
|
||||
ctx.accum.accumSum_chunk_vartime(ctx.buffer.asUnchecked(), lambdas, ctx.cur.uint)
|
||||
ctx.cur = 0
|
||||
|
||||
func update*[EC, F, G; AccumMax: static int](
|
||||
ctx: var EcAddAccumulator_vartime[EC, F, G, AccumMax],
|
||||
P: ECP_ShortW_Aff[F, G]) =
|
||||
|
||||
if ctx.cur == AccumMax:
|
||||
ctx.consumeBuffer()
|
||||
|
||||
ctx.buffer[ctx.cur] = P
|
||||
ctx.cur += 1
|
||||
|
||||
# TODO: `merge` for parallel recursive divide-and-conquer processing
|
||||
|
||||
func finish*[EC, F, G; AccumMax: static int](
|
||||
ctx: var EcAddAccumulator_vartime[EC, F, G, AccumMax],
|
||||
accumulatedResult: var EC) =
|
||||
ctx.consumeBuffer()
|
||||
accumulatedResult = ctx.accum
|
||||
@ -7,7 +7,7 @@
|
||||
# at your option. This file may not be copied, modified, or distributed except according to those terms.
|
||||
|
||||
import
|
||||
../../platforms/[abstractions, allocs],
|
||||
../../platforms/abstractions,
|
||||
../../platforms/threadpool/threadpool,
|
||||
./ec_shortweierstrass_affine,
|
||||
./ec_shortweierstrass_jacobian,
|
||||
@ -65,10 +65,10 @@ iterator items(c: ChunkDescriptor): tuple[chunkID, start, stopEx: int] =
|
||||
let chunkSize = c.baseChunkSize
|
||||
yield (chunkID, offset, min(offset+chunkSize, c.totalIters))
|
||||
|
||||
proc sum_batch_vartime_parallel*[F; G: static Subgroup](
|
||||
proc sum_reduce_vartime_parallel*[F; G: static Subgroup](
|
||||
tp: Threadpool,
|
||||
r: var (ECP_ShortW_Jac[F, G] or ECP_ShortW_Prj[F, G]),
|
||||
points: openArray[ECP_ShortW_Aff[F, G]]) =
|
||||
points: openArray[ECP_ShortW_Aff[F, G]]) {.noInline.} =
|
||||
## Batch addition of `points` into `r`
|
||||
## `r` is overwritten
|
||||
## Compute is parallelized, if beneficial.
|
||||
@ -92,7 +92,7 @@ proc sum_batch_vartime_parallel*[F; G: static Subgroup](
|
||||
static: doAssert minNumPointsParallel <= maxNumPoints, "The curve " & $r.typeof & " requires large size and needs to be tuned."
|
||||
|
||||
if points.len < minNumPointsParallel:
|
||||
r.sum_batch_vartime(points)
|
||||
r.sum_reduce_vartime(points)
|
||||
return
|
||||
|
||||
let chunkDesc = computeBalancedChunks(
|
||||
@ -103,12 +103,12 @@ proc sum_batch_vartime_parallel*[F; G: static Subgroup](
|
||||
let partialResults = allocStackArray(r.typeof(), chunkDesc.numChunks)
|
||||
|
||||
for iter in items(chunkDesc):
|
||||
proc sum_batch_vartime_wrapper(res: ptr, p: ptr, pLen: int) {.nimcall.} =
|
||||
proc sum_reduce_vartime_wrapper(res: ptr, p: ptr, pLen: int) {.nimcall.} =
|
||||
# The borrow checker prevents capturing `var` and `openArray`
|
||||
# so we capture pointers instead.
|
||||
res[].sum_batch_vartime(p, pLen)
|
||||
res[].sum_reduce_vartime(p, pLen)
|
||||
|
||||
tp.spawn partialResults[iter.chunkID].addr.sum_batch_vartime_wrapper(
|
||||
tp.spawn partialResults[iter.chunkID].addr.sum_reduce_vartime_wrapper(
|
||||
points.asUnchecked() +% iter.start,
|
||||
iter.stopEx - iter.start)
|
||||
|
||||
@ -122,7 +122,7 @@ proc sum_batch_vartime_parallel*[F; G: static Subgroup](
|
||||
else:
|
||||
let partialResultsAffine = allocStackArray(ECP_ShortW_Aff[F, G], chunkDesc.numChunks)
|
||||
partialResultsAffine.batchAffine(partialResults, chunkDesc.numChunks)
|
||||
r.sum_batch_vartime(partialResultsAffine, chunkDesc.numChunks)
|
||||
r.sum_reduce_vartime(partialResultsAffine, chunkDesc.numChunks)
|
||||
|
||||
# Sanity checks
|
||||
# ---------------------------------------
|
||||
|
||||
@ -37,11 +37,23 @@ type ECP_ShortW_Jac*[F; G: static Subgroup] = object
|
||||
## Note that jacobian coordinates are not unique
|
||||
x*, y*, z*: F
|
||||
|
||||
template affine*[F, G](_: type ECP_ShortW_Jac[F, G]): typedesc =
|
||||
## Returns the affine type that corresponds to the Jacobian type input
|
||||
ECP_ShortW_Aff[F, G]
|
||||
func isInf*(P: ECP_ShortW_Jac): SecretBool {.inline.} =
|
||||
## Returns true if P is an infinity point
|
||||
## and false otherwise
|
||||
##
|
||||
## Note: the jacobian coordinates equation is
|
||||
## Y² = X³ + aXZ⁴ + bZ⁶
|
||||
## A "zero" point is any point with coordinates X and Z = 0
|
||||
## Y can be anything
|
||||
result = P.z.isZero()
|
||||
|
||||
func `==`*(P, Q: ECP_ShortW_Jac): SecretBool =
|
||||
func setInf*(P: var ECP_ShortW_Jac) {.inline.} =
|
||||
## Set ``P`` to infinity
|
||||
P.x.setOne()
|
||||
P.y.setOne()
|
||||
P.z.setZero()
|
||||
|
||||
func `==`*(P, Q: ECP_ShortW_Jac): SecretBool {.meter.} =
|
||||
## Constant-time equality check
|
||||
## This is a costly operation
|
||||
# Reminder: the representation is not unique
|
||||
@ -63,21 +75,8 @@ func `==`*(P, Q: ECP_ShortW_Jac): SecretBool =
|
||||
b *= z1z1
|
||||
result = result and a == b
|
||||
|
||||
func isInf*(P: ECP_ShortW_Jac): SecretBool {.inline.} =
|
||||
## Returns true if P is an infinity point
|
||||
## and false otherwise
|
||||
##
|
||||
## Note: the jacobian coordinates equation is
|
||||
## Y² = X³ + aXZ⁴ + bZ⁶
|
||||
## A "zero" point is any point with coordinates X and Z = 0
|
||||
## Y can be anything
|
||||
result = P.z.isZero()
|
||||
|
||||
func setInf*(P: var ECP_ShortW_Jac) {.inline.} =
|
||||
## Set ``P`` to infinity
|
||||
P.x.setOne()
|
||||
P.y.setOne()
|
||||
P.z.setZero()
|
||||
# Ensure a zero-init point doesn't propagate 0s and match any
|
||||
result = result and not(P.isInf() xor Q.isInf())
|
||||
|
||||
func ccopy*(P: var ECP_ShortW_Jac, Q: ECP_ShortW_Jac, ctl: SecretBool) {.inline.} =
|
||||
## Constant-time conditional copy
|
||||
@ -337,7 +336,7 @@ func sum*[F; G: static Subgroup](
|
||||
r: var ECP_ShortW_Jac[F, G],
|
||||
P, Q: ECP_ShortW_Jac[F, G],
|
||||
CoefA: static F
|
||||
) =
|
||||
) {.meter.} =
|
||||
## Elliptic curve point addition for Short Weierstrass curves in Jacobian coordinates
|
||||
## with the curve ``a`` being a parameter for summing on isogenous curves.
|
||||
##
|
||||
@ -361,7 +360,7 @@ func sum*[F; G: static Subgroup](
|
||||
func sum*[F; G: static Subgroup](
|
||||
r: var ECP_ShortW_Jac[F, G],
|
||||
P, Q: ECP_ShortW_Jac[F, G]
|
||||
) =
|
||||
) {.meter.} =
|
||||
## Elliptic curve point addition for Short Weierstrass curves in Jacobian coordinates
|
||||
##
|
||||
## R = P + Q
|
||||
@ -383,7 +382,7 @@ func madd*[F; G: static Subgroup](
|
||||
r: var ECP_ShortW_Jac[F, G],
|
||||
P: ECP_ShortW_Jac[F, G],
|
||||
Q: ECP_ShortW_Aff[F, G]
|
||||
) =
|
||||
) {.meter.} =
|
||||
## Elliptic curve mixed addition for Short Weierstrass curves in Jacobian coordinates
|
||||
## with the curve ``a`` being a parameter for summing on isogenous curves.
|
||||
##
|
||||
@ -555,10 +554,7 @@ func madd*[F; G: static Subgroup](
|
||||
|
||||
r = o
|
||||
|
||||
func double*[F; G: static Subgroup](
|
||||
r: var ECP_ShortW_Jac[F, G],
|
||||
P: ECP_ShortW_Jac[F, G]
|
||||
) =
|
||||
func double*[F; G: static Subgroup](r: var ECP_ShortW_Jac[F, G], P: ECP_ShortW_Jac[F, G]) {.meter.} =
|
||||
## Elliptic curve point doubling for Short Weierstrass curves in Jacobian coordinates
|
||||
##
|
||||
## R = [2] P
|
||||
@ -642,9 +638,19 @@ func `-=`*(P: var ECP_ShortW_Jac, Q: ECP_ShortW_Jac) {.inline.} =
|
||||
nQ.neg(Q)
|
||||
P.sum(P, nQ)
|
||||
|
||||
func `-=`*(P: var ECP_ShortW_Jac, Q: ECP_ShortW_Aff) {.inline.} =
|
||||
## In-place point subtraction
|
||||
var nQ {.noInit.}: typeof(Q)
|
||||
nQ.neg(Q)
|
||||
P.madd(P, nQ)
|
||||
|
||||
template affine*[F, G](_: type ECP_ShortW_Jac[F, G]): typedesc =
|
||||
## Returns the affine type that corresponds to the Jacobian type input
|
||||
ECP_ShortW_Aff[F, G]
|
||||
|
||||
func affine*[F; G](
|
||||
aff: var ECP_ShortW_Aff[F, G],
|
||||
jac: ECP_ShortW_Jac[F, G]) =
|
||||
jac: ECP_ShortW_Jac[F, G]) {.meter.} =
|
||||
var invZ {.noInit.}, invZ2{.noInit.}: F
|
||||
invZ.inv(jac.z)
|
||||
invZ2.square(invZ, skipFinalSub = true)
|
||||
@ -659,3 +665,4 @@ func fromAffine*[F; G](
|
||||
jac.x = aff.x
|
||||
jac.y = aff.y
|
||||
jac.z.setOne()
|
||||
jac.z.csetZero(aff.isInf())
|
||||
|
||||
@ -0,0 +1,371 @@
|
||||
# Constantine
|
||||
# Copyright (c) 2018-2019 Status Research & Development GmbH
|
||||
# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
|
||||
# Licensed and distributed under either of
|
||||
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
|
||||
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
|
||||
# at your option. This file may not be copied, modified, or distributed except according to those terms.
|
||||
|
||||
import
|
||||
../../platforms/abstractions,
|
||||
../config/curves,
|
||||
../arithmetic,
|
||||
../extension_fields,
|
||||
./ec_shortweierstrass_affine,
|
||||
./ec_shortweierstrass_projective,
|
||||
./ec_shortweierstrass_jacobian
|
||||
|
||||
export Subgroup
|
||||
|
||||
# No exceptions allowed
|
||||
{.push raises: [].}
|
||||
|
||||
# ############################################################
|
||||
#
|
||||
# Elliptic Curve in Short Weierstrass form
|
||||
# with Extended Jacobian Coordinates
|
||||
#
|
||||
# ############################################################
|
||||
|
||||
type ECP_ShortW_JacExt*[F; G: static Subgroup] = object
|
||||
## Elliptic curve point for a curve in Short Weierstrass form
|
||||
## y² = x³ + a x + b
|
||||
##
|
||||
## over a field F
|
||||
##
|
||||
## in Extended Jacobian coordinates (X, Y, ZZ, ZZZ)
|
||||
## corresponding to (x, y) with X = xZ² and Y = yZ³
|
||||
##
|
||||
## Note that extended jacobian coordinates are not unique
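## The coordinates must satisfy ZZ³ = ZZZ², i.e. ZZ = Z² and ZZZ = Z³ for some Z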
|
||||
x*, y*, zz*, zzz*: F
|
||||
|
||||
func fromAffine*[F; G](jacext: var ECP_ShortW_JacExt[F, G], aff: ECP_ShortW_Aff[F, G]) {.inline.}
|
||||
|
||||
func isInf*(P: ECP_ShortW_JacExt): SecretBool {.inline, meter.} =
|
||||
## Returns true if P is an infinity point
|
||||
## and false otherwise
|
||||
result = P.zz.isZero()
|
||||
|
||||
func setInf*(P: var ECP_ShortW_JacExt) {.inline.} =
|
||||
## Set ``P`` to infinity
|
||||
P.x.setOne()
|
||||
P.y.setOne()
|
||||
P.zz.setZero()
|
||||
P.zzz.setZero()
|
||||
|
||||
func `==`*(P, Q: ECP_ShortW_JacExt): SecretBool {.meter.} =
|
||||
## Constant-time equality check
|
||||
## This is a costly operation
|
||||
# Reminder: the representation is not unique
|
||||
type F = ECP_ShortW_JacExt.F
|
||||
|
||||
var a{.noInit.}, b{.noInit.}: F
|
||||
|
||||
a.prod(P.x, Q.zz)
|
||||
b.prod(Q.x, P.zz)
|
||||
result = a == b
|
||||
|
||||
a.prod(P.y, Q.zzz)
|
||||
b.prod(Q.y, P.zzz)
|
||||
result = result and a == b
|
||||
|
||||
# Ensure a zero-init point doesn't propagate 0s and match any
|
||||
result = result and not(P.isInf() xor Q.isInf())
|
||||
|
||||
func trySetFromCoordsXandZ*[F; G](
|
||||
P: var ECP_ShortW_JacExt[F, G],
|
||||
x, z: F): SecretBool =
|
||||
## Try to create a point on the elliptic curve
|
||||
## Y² = X³ + aXZ⁴ + bZ⁶ (Jacobian coordinates)
|
||||
## y² = x³ + a x + b (affine coordinate)
|
||||
## return true and update `P` if `x` leads to a valid point
|
||||
## return false otherwise, in that case `P` is undefined.
|
||||
##
|
||||
## Note: Dedicated robust procedures for hashing-to-curve
|
||||
## will be provided, this is intended for testing purposes.
|
||||
##
|
||||
## For **test case generation only**,
|
||||
## this is preferred to generating random point
|
||||
## via random scalar multiplication of the curve generator
|
||||
## as the latter assumes:
|
||||
## - point addition, doubling work
|
||||
## - scalar multiplication works
|
||||
## - a generator point is defined
|
||||
## i.e. you can't test unless everything is already working
|
||||
P.y.curve_eq_rhs(x, G)
|
||||
result = sqrt_if_square(P.y)
|
||||
|
||||
P.zz.square(z)
|
||||
P.x.prod(x, P.zz)
|
||||
|
||||
P.zzz.prod(P.zz, z)
|
||||
P.y.prod(P.y, P.zzz)
|
||||
|
||||
func trySetFromCoordX*[F; G](
|
||||
P: var ECP_ShortW_JacExt[F, G],
|
||||
x: F): SecretBool =
|
||||
## Try to create a point on the elliptic curve
|
||||
## y² = x³ + a x + b (affine coordinate)
|
||||
##
|
||||
## The `ZZ` and `ZZZ` coordinates are set to 1
|
||||
##
|
||||
## return true and update `P` if `x` leads to a valid point
|
||||
## return false otherwise, in that case `P` is undefined.
|
||||
##
|
||||
## Note: Dedicated robust procedures for hashing-to-curve
|
||||
## will be provided, this is intended for testing purposes.
|
||||
##
|
||||
## For **test case generation only**,
|
||||
## this is preferred to generating a random point
|
||||
## via random scalar multiplication of the curve generator
|
||||
## as the latter assumes:
|
||||
## - point addition, doubling work
|
||||
## - scalar multiplication works
|
||||
## - a generator point is defined
|
||||
## i.e. you can't test unless everything is already working
|
||||
P.y.curve_eq_rhs(x, G)
|
||||
result = sqrt_if_square(P.y)
|
||||
P.x = x
|
||||
P.zz.setOne()
|
||||
P.zzz.setOne()
|
||||
|
||||
func neg*(P: var ECP_ShortW_JacExt, Q: ECP_ShortW_JacExt) {.inline.} =
|
||||
## Negate ``P``
|
||||
P.x = Q.x
|
||||
P.y.neg(Q.y)
|
||||
P.zz = Q.zz
|
||||
P.zzz = Q.zzz
|
||||
|
||||
func neg*(P: var ECP_ShortW_JacExt) {.inline.} =
|
||||
## Negate ``P``
|
||||
P.y.neg()
|
||||
|
||||
func double*[F; G: static Subgroup](r: var ECP_ShortW_JacExt[F, G], P: ECP_ShortW_JacExt[F, G]) {.meter.} =
|
||||
# http://www.hyperelliptic.org/EFD/g1p/auto-shortw-xyzz.html#doubling-dbl-2008-s-1
|
||||
var U{.noInit.}, V{.noInit.}, W{.noinit.}, S{.noInit.}, M{.noInit.}: F
|
||||
|
||||
U.double(P.y)
|
||||
V.square(U)
|
||||
W.prod(U, V)
|
||||
S.prod(P.x, V)
|
||||
M.square(P.x)
|
||||
M *= 3
|
||||
when F.C.getCoefA() != 0:
|
||||
{.error: "Not implemented.".}
|
||||
|
||||
# aliasing, we don't use P.x and U anymore
|
||||
r.x.square(M)
|
||||
U.double(S)
|
||||
r.x -= U
|
||||
S -= r.x
|
||||
r.y.prod(W, P.y)
|
||||
M *= S
|
||||
r.y.diff(M, r.y)
|
||||
r.zz.prod(P.zz, V)
|
||||
r.zzz.prod(P.zzz, W)
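# Toy check (illustration only, not part of the library): the dbl-2008-s-1
# formulas above replayed with plain integers on the toy curve y² = x³ + 7
# over F₁₃, starting from the affine point (7, 5), i.e. Z = 1 so ZZ = ZZZ = 1.
# Mapping the result back to affine must give the affine double (8, 5).
when isMainModule:
  const p = 13
  proc md(a: int): int = ((a mod p) + p) mod p
  proc inv(a: int): int =
    # brute-force modular inverse, fine for a 13-element toy field
    for i in 1 ..< p:
      if md(a*i) == 1: return i
  let (x, y) = (7, 5)            # affine point on the toy curve
  let U = md(2*y)
  let V = md(U*U)
  let W = md(U*V)
  let S = md(x*V)
  let M = md(3*x*x)              # the toy curve has coefficient a = 0
  let X3 = md(M*M - 2*S)
  let Y3 = md(M*(S - X3) - W*y)
  let (ZZ3, ZZZ3) = (V, W)
  doAssert md(X3 * inv(ZZ3)) == 8 and md(Y3 * inv(ZZZ3)) == 5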
|
||||
|
||||
func sum_vartime*[F; G: static Subgroup](
|
||||
r: var ECP_ShortW_JacExt[F, G],
|
||||
p, q: ECP_ShortW_JacExt[F, G])
|
||||
{.tags:[VarTime], meter.} =
|
||||
## **Variable-time** Extended Jacobian addition
|
||||
##
|
||||
## This MUST NOT be used with secret data.
|
||||
##
|
||||
## This is highly VULNERABLE to timing attacks and power analysis attacks.
|
||||
# https://www.hyperelliptic.org/EFD/g1p/auto-shortw-xyzz.html#addition-add-2008-s
|
||||
|
||||
if p.isInf().bool:
|
||||
r = q
|
||||
return
|
||||
if q.isInf().bool:
|
||||
r = p
|
||||
return
|
||||
|
||||
var U{.noInit.}, S{.noInit.}, P{.noInit.}, R{.noInit.}: F
|
||||
|
||||
U.prod(p.x, q.zz)
|
||||
P.prod(q.x, p.zz)
|
||||
S.prod(p.y, q.zzz)
|
||||
R.prod(q.y, p.zzz)
|
||||
|
||||
P -= U
|
||||
R -= S
|
||||
|
||||
if P.isZero().bool: # Same x coordinate
|
||||
if R.isZero().bool: # case P == Q
|
||||
r.double(q)
|
||||
return
|
||||
else: # case P = -Q
|
||||
r.setInf()
|
||||
return
|
||||
|
||||
var PP{.noInit.}, PPP{.noInit.}, Q{.noInit.}: F
|
||||
|
||||
PP.square(P)
|
||||
PPP.prod(PP, P)
|
||||
Q.prod(U, PP)
|
||||
|
||||
r.x.square(R)
|
||||
P.double(Q)
|
||||
r.x -= PPP
|
||||
r.x -= P
|
||||
|
||||
Q -= r.x
|
||||
r.y.prod(S, PPP)
|
||||
R *= Q
|
||||
r.y.diff(R, r.y)
|
||||
|
||||
r.zz.prod(p.zz, q.zz)
|
||||
r.zz *= PP
|
||||
r.zzz.prod(p.zzz, q.zzz)
|
||||
r.zzz *= PPP
|
||||
|
||||
func mdouble*[F; G: static Subgroup](r: var ECP_ShortW_JacExt[F, G], P: ECP_ShortW_Aff[F, G]) {.meter.} =
|
||||
## Mixed EC point double
|
||||
# http://www.hyperelliptic.org/EFD/g1p/auto-shortw-xyzz.html#doubling-mdbl-2008-s-1
|
||||
|
||||
var U{.noInit.}, V{.noInit.}, W{.noinit.}, S{.noInit.}, M{.noInit.}: F
|
||||
|
||||
U.double(P.y)
|
||||
V.square(U)
|
||||
W.prod(U, V)
|
||||
S.prod(P.x, V)
|
||||
M.square(P.x)
|
||||
M *= 3
|
||||
when F.C.getCoefA() != 0:
|
||||
{.error: "Not implemented.".}
|
||||
|
||||
# aliasing, we don't use P.x and U anymore
|
||||
r.x.square(M)
|
||||
U.double(S)
|
||||
r.x -= U
|
||||
S -= r.x
|
||||
r.y.prod(W, P.y)
|
||||
M *= S
|
||||
r.y.diff(M, r.y)
|
||||
r.zz = V
|
||||
r.zzz = W
|
||||
|
||||
func madd_vartime*[F; G: static Subgroup](
|
||||
r: var ECP_ShortW_JacExt[F, G],
|
||||
p: ECP_ShortW_JacExt[F, G],
|
||||
q: ECP_ShortW_Aff[F, G])
|
||||
{.tags:[VarTime], meter.} =
|
||||
## **Variable-time** Extended Jacobian mixed addition
|
||||
##
|
||||
## This MUST NOT be used with secret data.
|
||||
##
|
||||
## This is highly VULNERABLE to timing attacks and power analysis attacks.
|
||||
# https://www.hyperelliptic.org/EFD/g1p/auto-shortw-xyzz.html#addition-add-2008-s
|
||||
|
||||
if p.isInf().bool:
|
||||
r.fromAffine(q)
|
||||
return
|
||||
if q.isInf().bool:
|
||||
r = p
|
||||
return
|
||||
|
||||
var P{.noInit.}, R{.noInit.}: F
|
||||
|
||||
P.prod(q.x, p.zz)
|
||||
R.prod(q.y, p.zzz)
|
||||
|
||||
P -= p.x
|
||||
R -= p.y
|
||||
|
||||
if P.isZero().bool: # Same x coordinate
|
||||
if R.isZero().bool: # case P == Q
|
||||
r.mdouble(q)
|
||||
return
|
||||
else: # case P = -Q
|
||||
r.setInf()
|
||||
return
|
||||
|
||||
var PP{.noInit.}, PPP{.noInit.}, Q{.noInit.}: F
|
||||
|
||||
PP.square(P)
|
||||
PPP.prod(PP, P)
|
||||
Q.prod(p.x, PP)
|
||||
|
||||
r.x.square(R)
|
||||
P.double(Q)
|
||||
r.x -= PPP
|
||||
r.x -= P
|
||||
|
||||
Q -= r.x
|
||||
r.y.prod(p.y, PPP)
|
||||
R *= Q
|
||||
r.y.diff(R, r.y)
|
||||
|
||||
r.zz.prod(p.zz, PP)
|
||||
r.zzz.prod(p.zzz, PPP)
|
||||
|
||||
func msub_vartime*[F; G: static Subgroup](
|
||||
r: var ECP_ShortW_JacExt[F, G],
|
||||
p: ECP_ShortW_JacExt[F, G],
|
||||
q: ECP_ShortW_Aff[F, G]) {.tags:[VarTime], inline.} =
|
||||
var nQ {.noInit.}: ECP_ShortW_Aff[F, G]
|
||||
nQ.neg(q)
|
||||
r.madd_vartime(p, nQ)
|
||||
|
||||
# Conversions
|
||||
# -----------
|
||||
|
||||
template affine*[F, G](_: type ECP_ShortW_JacExt[F, G]): typedesc =
|
||||
## Returns the affine type that corresponds to the Extended Jacobian type input
|
||||
ECP_ShortW_Aff[F, G]
|
||||
|
||||
template jacobianExtended*[EC](_: typedesc[EC]): typedesc =
|
||||
## Returns the Extended Jacobian type that corresponds to the input EC point type
|
||||
ECP_ShortW_JacExt[EC.F, EC.G]
|
||||
|
||||
func affine*[F; G](
|
||||
aff: var ECP_ShortW_Aff[F, G],
|
||||
jacext: ECP_ShortW_JacExt[F, G]) {.meter.} =
|
||||
var invZZ {.noInit.}, invZZZ{.noInit.}: F
|
||||
invZZZ.inv(jacext.zzz)
|
||||
invZZ.prod(jacext.zz, invZZZ, skipFinalSub = true)
|
||||
invZZ.square(skipFinalSub = true)
|
||||
aff.x.prod(jacext.x, invZZ)
|
||||
aff.y.prod(jacext.y, invZZZ)
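# Derivation (illustration): invZZZ = 1/Z³, so ZZ·invZZZ = Z²/Z³ = 1/Z and its
# square is 1/Z² = 1/ZZ; hence X·(1/ZZ) = x and Y·(1/ZZZ) = y with a single inversion.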
|
||||
|
||||
func fromAffine*[F; G](
|
||||
jacext: var ECP_ShortW_JacExt[F, G],
|
||||
aff: ECP_ShortW_Aff[F, G]) {.inline, meter.} =
|
||||
jacext.x = aff.x
|
||||
jacext.y = aff.y
|
||||
jacext.zz.setOne()
|
||||
jacext.zzz.setOne()
|
||||
|
||||
let inf = aff.isInf()
|
||||
jacext.zz.csetZero(inf)
|
||||
jacext.zzz.csetZero(inf)
|
||||
|
||||
func fromJacobianExtended_vartime*[F; G](
|
||||
prj: var ECP_ShortW_Prj[F, G],
|
||||
jacext: ECP_ShortW_JacExt[F, G]) {.inline, meter, tags:[VarTime].} =
|
||||
# Affine (x, y)
|
||||
# Jacobian extended (xZ², yZ³, Z², Z³)
|
||||
# Projective (xZ', yZ', Z')
|
||||
# We can choose Z' = Z⁵
|
||||
if jacext.isInf().bool:
|
||||
prj.setInf()
|
||||
return
|
||||
prj.z.prod(jacext.zz, jacext.zzz)
|
||||
prj.x.prod(jacext.x, jacext.zzz)
|
||||
prj.y.prod(jacext.y, jacext.zz)
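# Derivation (illustration): with (X, Y, ZZ, ZZZ) = (xZ², yZ³, Z², Z³),
#   X·ZZZ = xZ⁵,  Y·ZZ = yZ⁵,  ZZ·ZZZ = Z⁵
# so (X·ZZZ, Y·ZZ, ZZ·ZZZ) is the projective point (x·Z', y·Z', Z') with Z' = Z⁵.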
|
||||
|
||||
func fromJacobianExtended_vartime*[F; G](
|
||||
jac: var ECP_ShortW_Jac[F, G],
|
||||
jacext: ECP_ShortW_JacExt[F, G]) {.inline, meter, tags:[VarTime].} =
|
||||
# Affine (x, y)
|
||||
# Jacobian extended (xZ², yZ³, Z², Z³)
|
||||
# Jacobian (xZ'², yZ'³, Z')
|
||||
# We can choose Z' = Z²
|
||||
if jacext.isInf().bool:
|
||||
jac.setInf()
|
||||
return
|
||||
jac.x.prod(jacext.x, jacext.zz)
|
||||
jac.y.prod(jacext.y, jacext.zzz)
|
||||
jac.z = jacext.zz
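# Derivation (illustration): X·ZZ = xZ⁴ = x·(Z²)² and Y·ZZZ = yZ⁶ = y·(Z²)³,
# so (X·ZZ, Y·ZZZ, ZZ) is the Jacobian point with Z' = Z².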
|
||||
@ -37,9 +37,21 @@ type ECP_ShortW_Prj*[F; G: static Subgroup] = object
|
||||
## Note that projective coordinates are not unique
|
||||
x*, y*, z*: F
|
||||
|
||||
template affine*[F, G](_: type ECP_ShortW_Prj[F, G]): typedesc =
|
||||
## Returns the affine type that corresponds to the Projective type input
|
||||
ECP_ShortW_Aff[F, G]
|
||||
func isInf*(P: ECP_ShortW_Prj): SecretBool {.inline.} =
|
||||
## Returns true if P is an infinity point
|
||||
## and false otherwise
|
||||
##
|
||||
## Note: the projective coordinates equation is
|
||||
## Y²Z = X³ + aXZ² + bZ³
|
||||
## A "zero" point is any point with coordinates X and Z = 0
|
||||
## Y can be anything
|
||||
result = P.x.isZero() and P.z.isZero()
|
||||
|
||||
func setInf*(P: var ECP_ShortW_Prj) {.inline.} =
|
||||
## Set ``P`` to infinity
|
||||
P.x.setZero()
|
||||
P.y.setOne()
|
||||
P.z.setZero()
|
||||
|
||||
func `==`*(P, Q: ECP_ShortW_Prj): SecretBool =
|
||||
## Constant-time equality check
|
||||
@ -57,21 +69,8 @@ func `==`*(P, Q: ECP_ShortW_Prj): SecretBool =
|
||||
b.prod(Q.y, P.z)
|
||||
result = result and a == b
|
||||
|
||||
func isInf*(P: ECP_ShortW_Prj): SecretBool {.inline.} =
|
||||
## Returns true if P is an infinity point
|
||||
## and false otherwise
|
||||
##
|
||||
## Note: the projective coordinates equation is
|
||||
## Y²Z = X³ + aXZ² + bZ³
|
||||
## A "zero" point is any point with coordinates X and Z = 0
|
||||
## Y can be anything
|
||||
result = P.x.isZero() and P.z.isZero()
|
||||
|
||||
func setInf*(P: var ECP_ShortW_Prj) {.inline.} =
|
||||
## Set ``P`` to infinity
|
||||
P.x.setZero()
|
||||
P.y.setOne()
|
||||
P.z.setZero()
|
||||
# Ensure a zero-init point doesn't propagate 0s and match any
|
||||
result = result and not(P.isInf() xor Q.isInf())
|
||||
|
||||
func ccopy*(P: var ECP_ShortW_Prj, Q: ECP_ShortW_Prj, ctl: SecretBool) {.inline.} =
|
||||
## Constant-time conditional copy
|
||||
@ -153,7 +152,7 @@ func cneg*(P: var ECP_ShortW_Prj, ctl: CTBool) {.inline.} =
|
||||
func sum*[F; G: static Subgroup](
|
||||
r: var ECP_ShortW_Prj[F, G],
|
||||
P, Q: ECP_ShortW_Prj[F, G]
|
||||
) =
|
||||
) {.meter.} =
|
||||
## Elliptic curve point addition for Short Weierstrass curves in projective coordinates
|
||||
##
|
||||
## R = P + Q
|
||||
@ -253,7 +252,7 @@ func madd*[F; G: static Subgroup](
|
||||
r: var ECP_ShortW_Prj[F, G],
|
||||
P: ECP_ShortW_Prj[F, G],
|
||||
Q: ECP_ShortW_Aff[F, G]
|
||||
) =
|
||||
) {.meter.} =
|
||||
## Elliptic curve mixed addition for Short Weierstrass curves
|
||||
## with p in Projective coordinates and Q in affine coordinates
|
||||
##
|
||||
@ -331,7 +330,7 @@ func madd*[F; G: static Subgroup](
|
||||
func double*[F; G: static Subgroup](
|
||||
r: var ECP_ShortW_Prj[F, G],
|
||||
P: ECP_ShortW_Prj[F, G]
|
||||
) =
|
||||
) {.meter.} =
|
||||
## Elliptic curve point doubling for Short Weierstrass curves in projective coordinate
|
||||
##
|
||||
## R = [2] P
|
||||
@ -430,9 +429,19 @@ func `-=`*(P: var ECP_ShortW_Prj, Q: ECP_ShortW_Prj) {.inline.} =
|
||||
nQ.neg(Q)
|
||||
P.sum(P, nQ)
|
||||
|
||||
func `-=`*(P: var ECP_ShortW_Prj, Q: ECP_ShortW_Aff) {.inline.} =
|
||||
## In-place point subtraction
|
||||
var nQ {.noInit.}: typeof(Q)
|
||||
nQ.neg(Q)
|
||||
P.madd(P, nQ)
|
||||
|
||||
template affine*[F, G](_: type ECP_ShortW_Prj[F, G]): typedesc =
|
||||
## Returns the affine type that corresponds to the Projective type input
|
||||
ECP_ShortW_Aff[F, G]
|
||||
|
||||
func affine*[F, G](
|
||||
aff: var ECP_ShortW_Aff[F, G],
|
||||
proj: ECP_ShortW_Prj[F, G]) =
|
||||
proj: ECP_ShortW_Prj[F, G]) {.meter.} =
|
||||
var invZ {.noInit.}: F
|
||||
invZ.inv(proj.z)
|
||||
|
||||
@ -445,3 +454,7 @@ func fromAffine*[F, G](
|
||||
proj.x = aff.x
|
||||
proj.y = aff.y
|
||||
proj.z.setOne()
|
||||
|
||||
let inf = aff.isInf()
|
||||
proj.x.csetZero(inf)
|
||||
proj.z.csetZero(inf)
|
||||
@ -707,9 +707,12 @@ func prefer_3sqr_over_2mul(F: type ExtensionField): bool {.compileTime.} =
|
||||
|
||||
let a = default(F)
|
||||
# No shortcut in the VM
|
||||
when a.c0 is ExtensionField:
|
||||
when a.c0.c0 is ExtensionField:
|
||||
when a.c0 is Fp12:
|
||||
# Benchmarked on BLS12-381
|
||||
when a.c0.c0 is Fp6:
|
||||
return true
|
||||
elif a.c0.c0 is Fp4:
|
||||
return false
|
||||
else: return false
|
||||
else: return false
|
||||
|
||||
@ -1233,7 +1236,7 @@ func mul2x_sparse_by_0y*[Fdbl, F](
|
||||
# Inversion
|
||||
# -------------------------------------------------------------------
|
||||
|
||||
func invImpl(r: var QuadraticExt, a: QuadraticExt) =
|
||||
func invImpl(r: var QuadraticExt, a: QuadraticExt, useVartime: static bool = false) =
|
||||
## Compute the multiplicative inverse of ``a``
|
||||
##
|
||||
## The inverse of 0 is 0.
|
||||
@ -1257,6 +1260,9 @@ func invImpl(r: var QuadraticExt, a: QuadraticExt) =
|
||||
v0 -= v1 # v0 = a0² - β a1² (the norm / squared magnitude of a)
|
||||
|
||||
# [1 Inv, 2 Sqr, 1 Add]
|
||||
when useVartime:
|
||||
v1.inv_vartime(v0)
|
||||
else:
|
||||
v1.inv(v0) # v1 = 1 / (a0² - β a1²)
|
||||
|
||||
# [1 Inv, 2 Mul, 2 Sqr, 1 Add, 1 Neg]
|
||||
@ -1264,7 +1270,7 @@ func invImpl(r: var QuadraticExt, a: QuadraticExt) =
|
||||
v0.neg(v1) # v0 = -1 / (a0² - β a1²)
|
||||
r.c1.prod(a.c1, v0) # r1 = -a1 / (a0² - β a1²)
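# Identity used above (illustration): for a = a0 + a1·u with u² = β a non-residue,
#   a⁻¹ = (a0 - a1·u) / (a0² - β·a1²)
# i.e. multiply by the conjugate and divide by the norm, which lies in the base
# field, so only one base-field inversion is needed.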
|
||||
|
||||
func inv2xImpl(r: var QuadraticExt, a: QuadraticExt) =
|
||||
func inv2xImpl(r: var QuadraticExt, a: QuadraticExt, useVartime: static bool = false) =
|
||||
## Compute the multiplicative inverse of ``a``
|
||||
##
|
||||
## The inverse of 0 is 0.
|
||||
@ -1284,6 +1290,9 @@ func inv2xImpl(r: var QuadraticExt, a: QuadraticExt) =
|
||||
|
||||
# [1 Inv, 2 Sqr, 1 Add]
|
||||
t.redc2x(V0)
|
||||
when useVartime:
|
||||
t.inv_vartime()
|
||||
else:
|
||||
t.inv() # v1 = 1 / (a0² - β a1²)
|
||||
|
||||
# [1 Inv, 2 Mul, 2 Sqr, 1 Add, 1 Neg]
|
||||
@ -1983,7 +1992,7 @@ func mul2x_sparse_by_0yz*[Fpkdiv3](r: var CubicExt2x, a: CubicExt, y, z: Fpkdiv3
|
||||
# Inversion
|
||||
# ----------------------------------------------------------------------
|
||||
|
||||
func invImpl(r: var CubicExt, a: CubicExt) =
|
||||
func invImpl(r: var CubicExt, a: CubicExt, useVartime: static bool = false) =
|
||||
## Compute the multiplicative inverse of ``a``
|
||||
##
|
||||
## The inverse of 0 is 0.
|
||||
@ -2031,6 +2040,9 @@ func invImpl(r: var CubicExt, a: CubicExt) =
|
||||
r.c0.prod(a.c0, A) # aliasing: last use of a₀, destroy r₀
|
||||
t += r.c0
|
||||
|
||||
when useVartime:
|
||||
t.inv_vartime()
|
||||
else:
|
||||
t.inv()
|
||||
|
||||
# (a0 + a1 v + a2 v²)^-1 = (A + B v + C v²) / F
|
||||
@ -2038,7 +2050,7 @@ func invImpl(r: var CubicExt, a: CubicExt) =
|
||||
r.c1.prod(B, t)
|
||||
r.c2.prod(C, t)
|
||||
|
||||
func inv2xImpl(r: var CubicExt, a: CubicExt) =
|
||||
func inv2xImpl(r: var CubicExt, a: CubicExt, useVartime: static bool = false) =
|
||||
## Compute the multiplicative inverse of ``a``
|
||||
## via lazy reduction
|
||||
##
|
||||
@ -2083,6 +2095,9 @@ func inv2xImpl(r: var CubicExt, a: CubicExt) =
|
||||
t.sum2xUnr(t, t2)
|
||||
f.redc2x(t)
|
||||
|
||||
when useVartime:
|
||||
f.inv_vartime()
|
||||
else:
|
||||
f.inv()
|
||||
|
||||
# (a0 + a1 v + a2 v²)^-1 = (A + B v + C v²) / F
|
||||
@ -2142,7 +2157,7 @@ func inv*(r: var CubicExt, a: CubicExt) =
|
||||
## Incidentally this avoids an extra infinity check
## when converting Jacobian or Projective coordinates
## to affine for elliptic curves
|
||||
when true:
|
||||
when CubicExt.C.has_large_field_elem() or r is Fp12:
|
||||
r.invImpl(a)
|
||||
else:
|
||||
r.inv2xImpl(a)
|
||||
@ -2180,5 +2195,44 @@ template prod*(r: var ExtensionField, a, b: ExtensionField, skipFinalSub: static
|
||||
# the final subtraction on Fp
|
||||
r.prod(a, b)
|
||||
|
||||
# ############################################################
|
||||
# #
|
||||
# Variable-time #
|
||||
# #
|
||||
# ############################################################
|
||||
|
||||
func inv_vartime*(r: var QuadraticExt, a: QuadraticExt) {.tags:[VarTime].} =
|
||||
## Compute the multiplicative inverse of ``a``
|
||||
##
|
||||
## The inverse of 0 is 0.
|
||||
## Incidentally this avoids an extra infinity check
## when converting Jacobian or Projective coordinates
## to affine for elliptic curves
|
||||
when true:
|
||||
r.invImpl(a, useVartime = true)
|
||||
else: # Lazy reduction, doesn't seem to gain speed.
|
||||
r.inv2xImpl(a, useVartime = true)
|
||||
|
||||
func inv_vartime*(r: var CubicExt, a: CubicExt) {.tags:[VarTime].} =
|
||||
## Compute the multiplicative inverse of ``a``
|
||||
##
|
||||
## The inverse of 0 is 0.
|
||||
## Incidentally this avoids an extra infinity check
## when converting Jacobian or Projective coordinates
## to affine for elliptic curves
|
||||
when CubicExt.C.has_large_field_elem() or r is Fp12:
|
||||
r.invImpl(a, useVartime = true)
|
||||
else:
|
||||
r.inv2xImpl(a, useVartime = true)
|
||||
|
||||
func inv_vartime*(a: var ExtensionField) {.tags:[VarTime].} =
|
||||
## Compute the multiplicative inverse of ``a``
|
||||
##
|
||||
## The inverse of 0 is 0.
|
||||
## Incidentally this avoids an extra infinity check
## when converting Jacobian or Projective coordinates
## to affine for elliptic curves
|
||||
a.invImpl(a, useVartime = true)
|
||||
|
||||
{.pop.} # inline
|
||||
{.pop.} # raises no exceptions
|
||||
|
||||
@ -14,7 +14,8 @@ import
|
||||
../elliptic/[
|
||||
ec_shortweierstrass_affine,
|
||||
ec_shortweierstrass_projective,
|
||||
ec_shortweierstrass_jacobian
|
||||
ec_shortweierstrass_jacobian,
|
||||
ec_shortweierstrass_jacobian_extended
|
||||
]
|
||||
|
||||
# No exceptions allowed
|
||||
@ -27,7 +28,7 @@ import
|
||||
#
|
||||
# ############################################################
|
||||
|
||||
func toHex*[EC: ECP_ShortW_Prj or ECP_ShortW_Jac or ECP_ShortW_Aff](P: EC, indent: static int = 0): string =
|
||||
func toHex*[EC: ECP_ShortW_Prj or ECP_ShortW_Jac or ECP_ShortW_Aff or ECP_ShortW_JacExt](P: EC, indent: static int = 0): string =
|
||||
## Stringify an elliptic curve point to Hex
|
||||
## Note. Leading zeros are not removed.
|
||||
## Result is prefixed with 0x
|
||||
|
||||
@ -392,7 +392,7 @@ func cyclotomic_exp*[FT](r: var FT, a: FT, exponent: static BigInt, invert: bool
|
||||
|
||||
r.setOne()
|
||||
var init = false
|
||||
for bit in recoding_l2r_vartime(exponent):
|
||||
for bit in recoding_l2r_signed_vartime(exponent):
|
||||
if init:
|
||||
r.cyclotomic_square()
|
||||
if bit == 1:
|
||||
|
||||
@ -7,15 +7,17 @@
|
||||
# at your option. This file may not be copied, modified, or distributed except according to those terms.
|
||||
|
||||
import
|
||||
../config/curves,
|
||||
../elliptic/[
|
||||
ec_shortweierstrass_affine,
|
||||
ec_shortweierstrass_projective
|
||||
],
|
||||
../arithmetic,
|
||||
../isogenies/frobenius,
|
||||
./lines_eval
|
||||
|
||||
# No exceptions allowed
|
||||
{.push raises: [].}
|
||||
{.push raises: [], checks: off.}
|
||||
|
||||
# ############################################################
|
||||
# #
|
||||
@ -23,60 +25,82 @@ import
|
||||
# #
|
||||
# ############################################################
|
||||
|
||||
func recodeNafForPairing(ate: BigInt): seq[int8] {.compileTime.} =
|
||||
## We need a NAF recoding and we need to skip the MSB for pairings
|
||||
var recoded: array[ate.bits+1, int8]
|
||||
let recodedLen = recoded.recode_r2l_signed_vartime(ate)
|
||||
var msbPos = recodedLen-1
|
||||
while true:
|
||||
if recoded[msbPos] != 0:
|
||||
break
|
||||
else:
|
||||
msbPos -= 1
|
||||
doAssert msbPos >= 0
|
||||
result = recoded[0 ..< msbPos]
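# Example (assuming recode_r2l_signed_vartime yields the canonical NAF, LSB first):
# for ate = 7 = 0b111 the recoding is [-1, 0, 0, 1], i.e. 8 - 1; the top digit is
# dropped here, so the Miller loop starts from the next most significant digit.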
|
||||
|
||||
func basicMillerLoop*[FT, F1, F2](
|
||||
f: var FT,
|
||||
line: var Line[F2],
|
||||
T: var ECP_ShortW_Prj[F2, G2],
|
||||
P: ECP_ShortW_Aff[F1, G1],
|
||||
Q: ECP_ShortW_Aff[F2, G2],
|
||||
ate_param: auto,
|
||||
ate_param_isNeg: static bool
|
||||
) =
|
||||
ate_param: static BigInt) =
|
||||
## Basic Miller loop iterations
|
||||
##
|
||||
## Multiplications by constants in the Miller loop are eliminated by the final exponentiation
|
||||
## aka cofactor clearing in the pairing group.
|
||||
##
|
||||
## This means that there is no need to inverse/conjugate when `ate_param_isNeg` is false
|
||||
## in the general case.
|
||||
## If further processing is required, `ate_param_isNeg` must be taken into account by the caller.
|
||||
static:
|
||||
doAssert FT.C == F1.C
|
||||
doAssert FT.C == F2.C
|
||||
|
||||
f.setOne()
|
||||
const naf = ate_param.recodeNafForPairing()
|
||||
var line0 {.noInit.}, line1 {.noInit.}: Line[F2]
|
||||
var nQ {.noInit.}: ECP_ShortW_Aff[F2, G2]
|
||||
f.setOne()
|
||||
nQ.neg(Q)
|
||||
|
||||
template u: untyped = ate_param
|
||||
var u3 = ate_param
|
||||
u3 *= 3
|
||||
for i in countdown(u3.bits - 2, 1):
|
||||
if i != u3.bits - 2:
|
||||
block: # naf.len - 1
|
||||
line0.line_double(T, P)
|
||||
let bit = naf[naf.len-1]
|
||||
if bit == 1:
|
||||
line1.line_add(T, Q, P)
|
||||
f.prod_from_2_lines(line0, line1)
|
||||
elif bit == -1:
|
||||
line1.line_add(T, nQ, P)
|
||||
f.prod_from_2_lines(line0, line1)
|
||||
else:
|
||||
f.mul_by_line(line0)
|
||||
|
||||
for i in countdown(naf.len-2, 0):
|
||||
let bit = naf[i]
|
||||
f.square()
|
||||
line.line_double(T, P)
|
||||
f.mul_by_line(line)
|
||||
line0.line_double(T, P)
|
||||
|
||||
let naf = u3.bit(i).int8 - u.bit(i).int8 # This can throw exception
|
||||
if naf == 1:
|
||||
line.line_add(T, Q, P)
|
||||
f.mul_by_line(line)
|
||||
elif naf == -1:
|
||||
line.line_add(T, nQ, P)
|
||||
f.mul_by_line(line)
|
||||
|
||||
when ate_param_isNeg:
|
||||
# In GT, x^-1 == conjugate(x)
|
||||
# Remark 7.1, chapter 7.1.1 of Guide to Pairing-Based Cryptography, El Mrabet, 2017
|
||||
f.conj()
|
||||
if bit == 1:
|
||||
line1.line_add(T, Q, P)
|
||||
f.mul_by_2_lines(line0, line1)
|
||||
elif bit == -1:
|
||||
line1.line_add(T, nQ, P)
|
||||
f.mul_by_2_lines(line0, line1)
|
||||
else:
|
||||
f.mul_by_line(line0)
|
||||
|
||||
func millerCorrectionBN*[FT, F1, F2](
|
||||
f: var FT,
|
||||
T: var ECP_ShortW_Prj[F2, G2],
|
||||
Q: ECP_ShortW_Aff[F2, G2],
|
||||
P: ECP_ShortW_Aff[F1, G1],
|
||||
ate_param_isNeg: static bool
|
||||
) =
|
||||
P: ECP_ShortW_Aff[F1, G1]) =
|
||||
## Ate pairing for BN curves need adjustment after basic Miller loop
|
||||
## If `ate_param_isNeg` f must be cyclotomic inverted/conjugated
|
||||
## and T must be negated by the caller.
|
||||
static:
|
||||
doAssert FT.C == F1.C
|
||||
doAssert FT.C == F2.C
|
||||
doAssert FT.C.family() == BarretoNaehrig
|
||||
|
||||
when ate_param_isNeg:
|
||||
T.neg()
|
||||
var V {.noInit.}: typeof(Q)
|
||||
var line1 {.noInit.}, line2 {.noInit.}: Line[F2]
|
||||
|
||||
@ -122,92 +146,67 @@ func miller_init_double_then_add*[FT, F1, F2](
|
||||
T: var ECP_ShortW_Prj[F2, G2],
|
||||
Q: ECP_ShortW_Aff[F2, G2],
|
||||
P: ECP_ShortW_Aff[F1, G1],
|
||||
numDoublings: static int
|
||||
) =
|
||||
numDoublings: static int) =
|
||||
## Start a Miller Loop with
|
||||
## - `numDoublings` doublings
|
||||
## - 1 add
|
||||
##
|
||||
## f is overwritten
|
||||
## T is overwritten by Q
|
||||
static:
|
||||
doAssert FT.C == F1.C
|
||||
doAssert FT.C == F2.C
|
||||
doAssert numDoublings >= 1
|
||||
|
||||
{.push checks: off.} # No OverflowError or IndexError allowed
|
||||
var line {.noInit.}: Line[F2]
|
||||
|
||||
# First step: 0b10, T <- Q, f = 1 (mod p¹²), f *= line
|
||||
# ----------------------------------------------------
|
||||
var line0 {.noInit.}, line1 {.noInit.}: Line[F2]
|
||||
T.fromAffine(Q)
|
||||
|
||||
# f.square() -> square(1)
|
||||
line.line_double(T, P)
|
||||
# First step: 0b1..., T <- Q, f = 1 (mod p¹²), f *= line
|
||||
line0.line_double(T, P)
|
||||
|
||||
# Doubling steps: 0b10...00
|
||||
# ----------------------------------------------------
|
||||
|
||||
# Process all doublings, the second is special cased
|
||||
# as:
|
||||
# - The first line is squared (sparse * sparse)
|
||||
# - The second is (somewhat-sparse * sparse)
|
||||
# Second step: 0b10 or 0b11
|
||||
# If we have more than 1 doubling, we square the line instead of squaring f
|
||||
when numDoublings >= 2:
|
||||
f.prod_from_2_lines(line, line)
|
||||
line.line_double(T, P)
|
||||
f.mul_by_line(line)
|
||||
f.prod_from_2_lines(line0, line0)
|
||||
line0.line_double(T, P)
|
||||
|
||||
# Doublings step: 0b10...0
|
||||
for _ in 2 ..< numDoublings:
|
||||
# Apply previous line0
|
||||
f.mul_by_line(line0)
|
||||
f.square()
|
||||
line.line_double(T, P)
|
||||
f.mul_by_line(line)
|
||||
line0.line_double(T, P)
|
||||
|
||||
# Addition step: 0b10...01
|
||||
# ------------------------------------------------
|
||||
|
||||
# If there was only a single doubling needed,
|
||||
# we special case the addition as
|
||||
# - The first line and second are sparse (sparse * sparse)
|
||||
line1.line_add(T, Q, P)
|
||||
when numDoublings == 1:
|
||||
# f *= line <=> f = line for the first iteration
|
||||
var line2 {.noInit.}: Line[F2]
|
||||
line2.line_add(T, Q, P)
|
||||
f.prod_from_2_lines(line, line2)
|
||||
f.prod_from_2_lines(line0, line1)
|
||||
else:
|
||||
line.line_add(T, Q, P)
|
||||
f.mul_by_line(line)
|
||||
|
||||
{.pop.} # No OverflowError or IndexError allowed
|
||||
f.mul_by_2_lines(line0, line1)
|
||||
|
||||
func miller_accum_double_then_add*[FT, F1, F2](
|
||||
f: var FT,
|
||||
T: var ECP_ShortW_Prj[F2, G2],
|
||||
Q: ECP_ShortW_Aff[F2, G2],
|
||||
P: ECP_ShortW_Aff[F1, G1],
|
||||
numDoublings: int,
|
||||
add = true
|
||||
) =
|
||||
numDoublings: int, add = true) =
|
||||
## Continue a Miller Loop with
|
||||
## - `numDoubling` doublings
|
||||
## - 1 add
|
||||
##
|
||||
## f and T are updated
|
||||
#
|
||||
# `numDoublings` and `add` can be hardcoded at compile-time
|
||||
# to prevent fault attacks.
|
||||
# But fault attacks only happen on embedded targets,
# and embedded targets are likely to want to minimize code size.
# What to do?
|
||||
{.push checks: off.} # No OverflowError or IndexError allowed
|
||||
|
||||
var line {.noInit.}: Line[F2]
|
||||
for _ in 0 ..< numDoublings:
|
||||
var line0 {.noInit.}, line1 {.noInit.}: Line[F2]
|
||||
|
||||
f.square()
|
||||
line.line_double(T, P)
|
||||
f.mul_by_line(line)
|
||||
line0.line_double(T, P)
|
||||
|
||||
for _ in 1 ..< numDoublings:
|
||||
f.mul_by_line(line0)
|
||||
f.square()
|
||||
line0.line_double(T, P)
|
||||
|
||||
if add:
|
||||
line.line_add(T, Q, P)
|
||||
f.mul_by_line(line)
|
||||
line1.line_add(T, Q, P)
|
||||
f.mul_by_2_lines(line0, line1)
|
||||
else:
|
||||
f.mul_by_line(line0)
|
||||
|
||||
# Miller Loop - multi-pairing
|
||||
# ----------------------------------------------------------------------------
|
||||
@ -217,61 +216,52 @@ func miller_accum_double_then_add*[FT, F1, F2](
|
||||
# See `multi_pairing.md``
|
||||
# We implement Aranha approach
|
||||
|
||||
func isOdd(n: int): bool {.inline.} = bool(n and 1)
|
||||
|
||||
func double_jToN[FT, F1, F2](
|
||||
f: var FT,
|
||||
j: static int,
|
||||
line0, line1: var Line[F2],
|
||||
lineOddRemainder: var Line[F2],
|
||||
Ts: ptr UncheckedArray[ECP_ShortW_Prj[F2, G2]],
|
||||
Ps: ptr UncheckedArray[ECP_ShortW_Aff[F1, G1]],
|
||||
N: int) =
|
||||
## Doubling steps for pairings j to N
|
||||
## if N is odd, lineOddRemainder must be applied to `f`
|
||||
|
||||
{.push checks: off.} # No OverflowError or IndexError allowed
|
||||
var line0{.noInit.}, line1{.noInit.}: Line[F2]
|
||||
# Sparse merge 2 by 2, starting from j
|
||||
for i in countup(j, N-1, 2):
|
||||
if i+1 >= N:
|
||||
break
|
||||
|
||||
for i in countup(j, N-2, 2):
|
||||
line0.line_double(Ts[i], Ps[i])
|
||||
line1.line_double(Ts[i+1], Ps[i+1])
|
||||
f.mul_by_2_lines(line0, line1)
|
||||
|
||||
if (N and 1) == 1: # N >= 2 and N is odd, there is a leftover
|
||||
line0.line_double(Ts[N-1], Ps[N-1])
|
||||
f.mul_by_line(line0)
|
||||
|
||||
{.pop.}
|
||||
if N.isOdd(): # N >= 2 and N is odd, there is a leftover
|
||||
lineOddRemainder.line_double(Ts[N-1], Ps[N-1])
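# Example (illustration): with N = 5 pairings, lines are computed pairwise for
# indices (0,1) and (2,3) and merged into `f` as somewhat-sparse products;
# the line for index 4 is left in `lineOddRemainder` for the caller to fold in.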
|
||||
|
||||
func add_jToN[FT, F1, F2](
|
||||
f: var FT,
|
||||
j: static int,
|
||||
line0, line1: var Line[F2],
|
||||
lineOddRemainder: var Line[F2],
|
||||
Ts: ptr UncheckedArray[ECP_ShortW_Prj[F2, G2]],
|
||||
Qs: ptr UncheckedArray[ECP_ShortW_Aff[F2, G2]],
|
||||
Ps: ptr UncheckedArray[ECP_ShortW_Aff[F1, G1]],
|
||||
N: int)=
|
||||
## Addition steps for pairings 0 to N
|
||||
|
||||
{.push checks: off.} # No OverflowError or IndexError allowed
|
||||
var line0{.noInit.}, line1{.noInit.}: Line[F2]
|
||||
# Sparse merge 2 by 2, starting from 0
|
||||
for i in countup(j, N-1, 2):
|
||||
if i+1 >= N:
|
||||
break
|
||||
|
||||
for i in countup(j, N-2, 2):
|
||||
line0.line_add(Ts[i], Qs[i], Ps[i])
|
||||
line1.line_add(Ts[i+1], Qs[i+1], Ps[i+1])
|
||||
f.mul_by_2_lines(line0, line1)
|
||||
|
||||
if (N and 1) == 1: # N >= 2 and N is odd, there is a leftover
|
||||
line0.line_add(Ts[N-1], Qs[N-1], Ps[N-1])
|
||||
f.mul_by_line(line0)
|
||||
|
||||
{.pop.}
|
||||
if N.isOdd(): # N >= 2 and N is odd, there is a leftover
|
||||
lineOddRemainder.line_add(Ts[N-1], Qs[N-1], Ps[N-1])
|
||||
|
||||
func add_jToN_negateQ[FT, F1, F2](
|
||||
f: var FT,
|
||||
j: static int,
|
||||
line0, line1: var Line[F2],
|
||||
lineOddRemainder: var Line[F2],
|
||||
Ts: ptr UncheckedArray[ECP_ShortW_Prj[F2, G2]],
|
||||
Qs: ptr UncheckedArray[ECP_ShortW_Aff[F2, G2]],
|
||||
Ps: ptr UncheckedArray[ECP_ShortW_Aff[F1, G1]],
|
||||
@ -279,62 +269,55 @@ func add_jToN_negateQ[FT, F1, F2](
|
||||
## Addition steps for pairings 0 to N
|
||||
|
||||
var nQ{.noInit.}: ECP_ShortW_Aff[F2, G2]
|
||||
|
||||
{.push checks: off.} # No OverflowError or IndexError allowed
|
||||
var line0{.noInit.}, line1{.noInit.}: Line[F2]
|
||||
# Sparse merge 2 by 2, starting from 0
|
||||
for i in countup(j, N-1, 2):
|
||||
if i+1 >= N:
|
||||
break
|
||||
|
||||
for i in countup(j, N-2, 2):
|
||||
nQ.neg(Qs[i])
|
||||
line0.line_add(Ts[i], nQ, Ps[i])
|
||||
nQ.neg(Qs[i+1])
|
||||
line1.line_add(Ts[i+1], nQ, Ps[i+1])
|
||||
f.mul_by_2_lines(line0, line1)
|
||||
|
||||
if (N and 1) == 1: # N >= 2 and N is odd, there is a leftover
|
||||
if N.isOdd(): # N >= 2 and N is odd, there is a leftover
|
||||
nQ.neg(Qs[N-1])
|
||||
line0.line_add(Ts[N-1], nQ, Ps[N-1])
|
||||
f.mul_by_line(line0)
|
||||
|
||||
{.pop.}
|
||||
lineOddRemainder.line_add(Ts[N-1], nQ, Ps[N-1])
|
||||
|
||||
func basicMillerLoop*[FT, F1, F2](
|
||||
f: var FT,
|
||||
line0, line1: var Line[F2],
|
||||
Ts: ptr UncheckedArray[ECP_ShortW_Prj[F2, G2]],
|
||||
Ps: ptr UncheckedArray[ECP_ShortW_Aff[F1, G1]],
|
||||
Qs: ptr UncheckedArray[ECP_ShortW_Aff[F2, G2]],
|
||||
N: int,
|
||||
ate_param: auto,
|
||||
ate_param_isNeg: static bool
|
||||
) =
|
||||
ate_param: static BigInt) =
|
||||
## Basic Miller loop iterations
|
||||
##
|
||||
## Multiplications by constants in the Miller loop are eliminated by the final exponentiation
|
||||
## aka cofactor clearing in the pairing group.
|
||||
##
|
||||
## This means that there is no need to inverse/conjugate when `ate_param_isNeg` is false
|
||||
## in the general case.
|
||||
## If further processing is required, `ate_param_isNeg` must be taken into account by the caller.
|
||||
|
||||
static:
|
||||
doAssert FT.C == F1.C
|
||||
doAssert FT.C == F2.C
|
||||
|
||||
const naf = ate_param.recodeNafForPairing()
|
||||
var lineOddRemainder0{.noInit.}, lineOddRemainder1{.noinit.}: Line[F2]
|
||||
f.setOne()
|
||||
|
||||
template u: untyped = ate_param
|
||||
var u3 = ate_param
|
||||
u3 *= 3
|
||||
for i in countdown(u3.bits - 2, 1):
|
||||
if i != u3.bits - 2:
|
||||
for i in countdown(naf.len-1, 0):
|
||||
let bit = naf[i]
|
||||
if i != naf.len-1:
|
||||
f.square()
|
||||
f.double_jToN(j=0, line0, line1, Ts, Ps, N)
|
||||
f.double_jToN(j=0, lineOddRemainder0, Ts, Ps, N)
|
||||
|
||||
let naf = u3.bit(i).int8 - u.bit(i).int8 # This can throw exception
|
||||
if naf == 1:
|
||||
f.add_jToN(j=0, line0, line1, Ts, Qs, Ps, N)
|
||||
elif naf == -1:
|
||||
f.add_jToN_negateQ(j=0, line0, line1, Ts, Qs, Ps, N)
|
||||
if bit == 1:
|
||||
f.add_jToN(j=0, lineOddRemainder1, Ts, Qs, Ps, N)
|
||||
elif bit == -1:
|
||||
f.add_jToN_negateQ(j=0, lineOddRemainder1, Ts, Qs, Ps, N)
|
||||
|
||||
when ate_param_isNeg:
|
||||
# In GT, x^-1 == conjugate(x)
|
||||
# Remark 7.1, chapter 7.1.1 of Guide to Pairing-Based Cryptography, El Mrabet, 2017
|
||||
f.conj()
|
||||
if N.isOdd():
|
||||
if bit == 0:
|
||||
f.mul_by_line(lineOddRemainder0)
|
||||
else:
|
||||
f.mul_by_2_lines(lineOddRemainder0, lineOddRemainder1)
|
||||
|
||||
func miller_init_double_then_add*[FT, F1, F2](
|
||||
f: var FT,
|
||||
@ -342,8 +325,7 @@ func miller_init_double_then_add*[FT, F1, F2](
|
||||
Qs: ptr UncheckedArray[ECP_ShortW_Aff[F2, G2]],
|
||||
Ps: ptr UncheckedArray[ECP_ShortW_Aff[F1, G1]],
|
||||
N: int,
|
||||
numDoublings: static int
|
||||
) =
|
||||
numDoublings: static int) =
|
||||
## Start a Miller Loop
|
||||
## This means
|
||||
## - 1 doubling
|
||||
@ -351,52 +333,32 @@ func miller_init_double_then_add*[FT, F1, F2](
|
||||
##
|
||||
## f is overwritten
|
||||
## Ts are overwritten by Qs
|
||||
static:
|
||||
doAssert FT.C == F1.C
|
||||
doAssert FT.C == F2.C
|
||||
|
||||
{.push checks: off.} # No OverflowError or IndexError allowed
|
||||
var line0 {.noInit.}, line1 {.noInit.}: Line[F2]
|
||||
if N == 1:
|
||||
f.miller_init_double_then_add(Ts[0], Qs[0], Ps[0], numDoublings)
|
||||
return
|
||||
|
||||
# First step: T <- Q, f = 1 (mod p¹²), f *= line
|
||||
# ----------------------------------------------
|
||||
var lineOddRemainder0 {.noInit.}, lineOddRemainder1 {.noInit.}: Line[F2]
|
||||
for i in 0 ..< N:
|
||||
Ts[i].fromAffine(Qs[i])
|
||||
|
||||
line0.line_double(Ts[0], Ps[0])
|
||||
if N >= 2:
|
||||
line1.line_double(Ts[1], Ps[1])
|
||||
f.prod_from_2_lines(line0, line1)
|
||||
f.double_jToN(j=2, line0, line1, Ts, Ps, N)
|
||||
# First step: T <- Q, f = 1 (mod p¹²), f *= line
|
||||
lineOddRemainder0.line_double(Ts[0], Ps[0])
|
||||
lineOddRemainder1.line_double(Ts[1], Ps[1])
|
||||
f.prod_from_2_lines(lineOddRemainder0, lineOddRemainder1)
|
||||
f.double_jToN(j=2, lineOddRemainder0, Ts, Ps, N)
|
||||
|
||||
# Doubling steps: 0b10...00
|
||||
# ------------------------------------------------
|
||||
when numDoublings > 1: # Already did the MSB doubling
|
||||
if N == 1: # f = line0
|
||||
f.prod_from_2_lines(line0, line0) # f.square()
|
||||
line0.line_double(Ts[0], Ps[0])
|
||||
f.mul_by_line(line0)
|
||||
for _ in 2 ..< numDoublings:
|
||||
# Doublings step: 0b10...0
|
||||
for _ in 1 ..< numDoublings:
|
||||
if N.isOdd():
|
||||
f.mul_by_line(lineOddRemainder0)
|
||||
f.square()
|
||||
f.double_jtoN(j=0, line0, line1, Ts, Ps, N)
|
||||
else:
|
||||
for _ in 0 ..< numDoublings:
|
||||
f.square()
|
||||
f.double_jtoN(j=0, line0, line1, Ts, Ps, N)
|
||||
f.double_jToN(j=0, lineOddRemainder0, Ts, Ps, N)
|
||||
|
||||
# Addition step: 0b10...01
|
||||
# ------------------------------------------------
|
||||
|
||||
when numDoublings == 1:
|
||||
if N == 1: # f = line0
|
||||
line1.line_add(Ts[0], Qs[0], Ps[0])
|
||||
f.prod_from_2_lines(line0, line1)
|
||||
else:
|
||||
f.add_jToN(j=0,line0, line1, Ts, Qs, Ps, N)
|
||||
else:
|
||||
f.add_jToN(j=0,line0, line1, Ts, Qs, Ps, N)
|
||||
|
||||
{.pop.} # No OverflowError or IndexError allowed
|
||||
f.add_jToN(j=0, lineOddRemainder1, Ts, Qs, Ps, N)
|
||||
if N.isOdd():
|
||||
f.mul_by_2_lines(lineOddRemainder0, lineOddRemainder1)
|
||||
|
||||
func miller_accum_double_then_add*[FT, F1, F2](
|
||||
f: var FT,
|
||||
@ -404,18 +366,31 @@ func miller_accum_double_then_add*[FT, F1, F2](
|
||||
Qs: ptr UncheckedArray[ECP_ShortW_Aff[F2, G2]],
|
||||
Ps: ptr UncheckedArray[ECP_ShortW_Aff[F1, G1]],
|
||||
N: int,
|
||||
numDoublings: int,
|
||||
add = true
|
||||
) =
|
||||
numDoublings: int, add = true) =
|
||||
## Continue a Miller Loop with
|
||||
## - `numDoublings` doublings
|
||||
## - 1 add
|
||||
##
|
||||
## f and T are updated
|
||||
var line0{.noInit.}, line1{.noinit.}: Line[F2]
|
||||
for _ in 0 ..< numDoublings:
|
||||
|
||||
if N == 1:
|
||||
f.miller_accum_double_then_add(Ts[0], Qs[0], Ps[0], numDoublings, add)
|
||||
return
|
||||
|
||||
var lineOddRemainder0 {.noInit.}, lineOddRemainder1 {.noInit.}: Line[F2]
|
||||
|
||||
f.square()
|
||||
f.double_jtoN(j=0, line0, line1, Ts, Ps, N)
|
||||
f.double_jtoN(j=0, lineOddRemainder0, Ts, Ps, N)
|
||||
for _ in 1 ..< numDoublings:
|
||||
if N.isOdd():
|
||||
f.mul_by_line(lineOddRemainder0)
|
||||
f.square()
|
||||
f.double_jtoN(j=0, lineOddRemainder0, Ts, Ps, N)
|
||||
|
||||
if add:
|
||||
f.add_jToN(j=0, line0, line1, Ts, Qs, Ps, N)
|
||||
f.add_jToN(j=0, lineOddRemainder1, Ts, Qs, Ps, N)
|
||||
if N.isOdd():
|
||||
f.mul_by_2_lines(lineOddRemainder0, lineOddRemainder1)
|
||||
else:
|
||||
if N.isOdd():
|
||||
f.mul_by_line(lineOddRemainder0)
|
||||
@ -18,7 +18,6 @@ import
|
||||
../constants/zoo_pairings,
|
||||
../arithmetic,
|
||||
./cyclotomic_subgroups,
|
||||
./lines_eval,
|
||||
./miller_loops
|
||||
|
||||
export zoo_pairings # generic sandwich https://github.com/nim-lang/Nim/issues/11225
|
||||
@ -54,23 +53,29 @@ export zoo_pairings # generic sandwich https://github.com/nim-lang/Nim/issues/11
|
||||
|
||||
func millerLoopGenericBLS12*[C](
|
||||
f: var Fp12[C],
|
||||
P: ECP_ShortW_Aff[Fp[C], G1],
|
||||
Q: ECP_ShortW_Aff[Fp2[C], G2]
|
||||
Q: ECP_ShortW_Aff[Fp2[C], G2],
|
||||
P: ECP_ShortW_Aff[Fp[C], G1]
|
||||
) {.meter.} =
|
||||
## Generic Miller Loop for BLS12 curve
|
||||
## Computes f{u,Q}(P) with u the BLS curve parameter
|
||||
|
||||
var
|
||||
T {.noInit.}: ECP_ShortW_Prj[Fp2[C], G2]
|
||||
line {.noInit.}: Line[Fp2[C]]
|
||||
|
||||
var T {.noInit.}: ECP_ShortW_Prj[Fp2[C], G2]
|
||||
T.fromAffine(Q)
|
||||
|
||||
basicMillerLoop(
|
||||
f, line, T,
|
||||
P, Q,
|
||||
pairing(C, ate_param), pairing(C, ate_param_isNeg)
|
||||
)
|
||||
basicMillerLoop(f, T, P, Q, pairing(C, ate_param))
|
||||
|
||||
func millerLoopGenericBLS12*[C](
|
||||
f: var Fp12[C],
|
||||
Qs: ptr UncheckedArray[ECP_ShortW_Aff[Fp2[C], G2]],
|
||||
Ps: ptr UncheckedArray[ECP_ShortW_Aff[Fp[C], G1]],
|
||||
N: int
|
||||
) {.noinline, tags:[Alloca], meter.} =
|
||||
## Generic Miller Loop for BLS12 curve
|
||||
## Computes f{u,Q}(P) with u the BLS curve parameter
|
||||
var Ts = allocStackArray(ECP_ShortW_Prj[Fp2[C], G2], N)
|
||||
for i in 0 ..< N:
|
||||
Ts[i].fromAffine(Qs[i])
|
||||
|
||||
basicMillerLoop(f, Ts, Ps, Qs, N, pairing(C, ate_param))
|
||||
|
||||
func finalExpGeneric[C: static Curve](f: var Fp12[C]) =
|
||||
## A generic and slow implementation of final exponentiation
|
||||
@ -86,7 +91,7 @@ func pairing_bls12_reference*[C](
|
||||
## Output: e(P, Q) ∈ Gt
|
||||
##
|
||||
## Reference implementation
|
||||
gt.millerLoopGenericBLS12(P, Q)
|
||||
gt.millerLoopGenericBLS12(Q, P)
|
||||
gt.finalExpGeneric()
|
||||
|
||||
# Optimized pairing implementation
|
||||
|
||||
@ -7,7 +7,7 @@
|
||||
# at your option. This file may not be copied, modified, or distributed except according to those terms.
|
||||
|
||||
import
|
||||
../../platforms/[abstractions, allocs],
|
||||
../../platforms/abstractions,
|
||||
../config/curves,
|
||||
../extension_fields,
|
||||
../elliptic/[
|
||||
@ -16,7 +16,6 @@ import
|
||||
],
|
||||
../isogenies/frobenius,
|
||||
../constants/zoo_pairings,
|
||||
./lines_eval,
|
||||
./cyclotomic_subgroups,
|
||||
./miller_loops
|
||||
|
||||
@ -50,53 +49,45 @@ export zoo_pairings # generic sandwich https://github.com/nim-lang/Nim/issues/11
|
||||
|
||||
func millerLoopGenericBN*[C](
|
||||
f: var Fp12[C],
|
||||
Q: ECP_ShortW_Aff[Fp2[C], G2],
|
||||
P: ECP_ShortW_Aff[Fp[C], G1],
|
||||
Q: ECP_ShortW_Aff[Fp2[C], G2]
|
||||
) {.meter.} =
|
||||
## Generic Miller Loop for BN curves
|
||||
## Computes f{6u+2,Q}(P) with u the BN curve parameter
|
||||
var
|
||||
T {.noInit.}: ECP_ShortW_Prj[Fp2[C], G2]
|
||||
line {.noInit.}: Line[Fp2[C]]
|
||||
|
||||
var T {.noInit.}: ECP_ShortW_Prj[Fp2[C], G2]
|
||||
T.fromAffine(Q)
|
||||
|
||||
basicMillerLoop(
|
||||
f, line, T,
|
||||
P, Q,
|
||||
pairing(C, ate_param), pairing(C, ate_param_isNeg)
|
||||
)
|
||||
basicMillerLoop(f, T, P, Q, pairing(C, ate_param))
|
||||
|
||||
when pairing(C, ate_param_is_neg):
|
||||
f.conj()
|
||||
T.neg()
|
||||
|
||||
# Ate pairing for BN curves needs adjustment after basic Miller loop
|
||||
f.millerCorrectionBN(
|
||||
T, Q, P,
|
||||
pairing(C, ate_param_isNeg)
|
||||
)
|
||||
f.millerCorrectionBN(T, Q, P)
|
||||
|
||||
func millerLoopGenericBN*[C](
|
||||
f: var Fp12[C],
|
||||
Ps: ptr UncheckedArray[ECP_ShortW_Aff[Fp[C], G1]],
|
||||
Qs: ptr UncheckedArray[ECP_ShortW_Aff[Fp2[C], G2]],
|
||||
Ps: ptr UncheckedArray[ECP_ShortW_Aff[Fp[C], G1]],
|
||||
N: int
|
||||
) {.meter.} =
|
||||
) {.noinline, tags:[Alloca], meter.} =
|
||||
## Generic Miller Loop for BN curves
|
||||
## Computes f{6u+2,Q}(P) with u the BN curve parameter
|
||||
var
|
||||
Ts = allocStackArray(ECP_ShortW_Prj[Fp2[C], G2], N)
|
||||
line0 {.noInit.}, line1 {.noInit.}: Line[Fp2[C]]
|
||||
|
||||
var Ts = allocStackArray(ECP_ShortW_Prj[Fp2[C], G2], N)
|
||||
for i in 0 ..< N:
|
||||
Ts[i].fromAffine(Qs[i])
|
||||
|
||||
basicMillerLoop(
|
||||
f, line0, line1, Ts,
|
||||
Ps, Qs, N,
|
||||
pairing(C, ate_param), pairing(C, ate_param_isNeg)
|
||||
)
|
||||
basicMillerLoop(f, Ts, Ps, Qs, N, pairing(C, ate_param))
|
||||
|
||||
when pairing(C, ate_param_is_neg):
|
||||
f.conj()
|
||||
for i in 0 ..< N:
|
||||
Ts[i].neg()
|
||||
|
||||
# Ate pairing for BN curves needs adjustment after basic Miller loop
|
||||
for i in 0 ..< N:
|
||||
f.millerCorrectionBN(Ts[i], Qs[i], Ps[i], pairing(C, ate_param_isNeg))
|
||||
f.millerCorrectionBN(Ts[i], Qs[i], Ps[i])
|
||||
|
||||
func finalExpGeneric[C: static Curve](f: var Fp12[C]) =
|
||||
## A generic and slow implementation of final exponentiation
|
||||
@ -180,7 +171,7 @@ func pairing_bn*[C](
|
||||
when C == BN254_Nogami:
|
||||
gt.millerLoopAddChain(Q, P)
|
||||
else:
|
||||
gt.millerLoopGenericBN(P, Q)
|
||||
gt.millerLoopGenericBN(Q, P)
|
||||
gt.finalExpEasy()
|
||||
gt.finalExpHard_BN()
|
||||
|
||||
@ -196,6 +187,6 @@ func pairing_bn*[N: static int, C](
|
||||
when C == BN254_Nogami:
|
||||
gt.millerLoopAddChain(Qs.asUnchecked(), Ps.asUnchecked(), N)
|
||||
else:
|
||||
gt.millerLoopGenericBN(Ps.asUnchecked(), Qs.asUnchecked(), N)
|
||||
gt.millerLoopGenericBN(Qs.asUnchecked(), Ps.asUnchecked(), N)
|
||||
gt.finalExpEasy()
|
||||
gt.finalExpHard_BN()
|
||||
|
||||
@ -33,20 +33,16 @@ export zoo_pairings # generic sandwich https://github.com/nim-lang/Nim/issues/11
|
||||
|
||||
func millerLoopBW6_761_naive[C](
|
||||
f: var Fp6[C],
|
||||
P: ECP_ShortW_Aff[Fp[C], G1],
|
||||
Q: ECP_ShortW_Aff[Fp[C], G2]
|
||||
Q: ECP_ShortW_Aff[Fp[C], G2],
|
||||
P: ECP_ShortW_Aff[Fp[C], G1]
|
||||
) =
|
||||
## Miller Loop for BW6_761 curve
|
||||
## Computes f_{u+1,Q}(P)*Frobenius(f_{u*(u^2-u-1),Q}(P))
|
||||
|
||||
var
|
||||
T {.noInit.}: ECP_ShortW_Prj[Fp[C], G2]
|
||||
line {.noInit.}: Line[Fp[C]]
|
||||
|
||||
var T {.noInit.}: ECP_ShortW_Prj[Fp[C], G2]
|
||||
T.fromAffine(Q)
|
||||
|
||||
basicMillerLoop(
|
||||
f, line, T,
|
||||
f, T,
|
||||
P, Q,
|
||||
pairing(C, ate_param_1_unopt), pairing(C, ate_param_1_unopt_isNeg)
|
||||
)
|
||||
@ -55,7 +51,7 @@ func millerLoopBW6_761_naive[C](
|
||||
T.fromAffine(Q)
|
||||
|
||||
basicMillerLoop(
|
||||
f2, line, T,
|
||||
f2, T,
|
||||
P, Q,
|
||||
pairing(C, ate_param_2_unopt), pairing(C, ate_param_2_unopt_isNeg)
|
||||
)
|
||||
@ -79,16 +75,15 @@ func finalExpHard_BW6_761*[C: static Curve](f: var Fp6[C]) =
|
||||
|
||||
func millerLoopBW6_761_opt_to_debug[C](
|
||||
f: var Fp6[C],
|
||||
P: ECP_ShortW_Aff[Fp[C], G1],
|
||||
Q: ECP_ShortW_Aff[Fp[C], G2]
|
||||
Q: ECP_ShortW_Aff[Fp[C], G2],
|
||||
P: ECP_ShortW_Aff[Fp[C], G1]
|
||||
) {.used.} =
|
||||
## Miller Loop optimized for the BW6_761 curve
|
||||
|
||||
# 1st part: f_{u,Q}(P)
|
||||
# ------------------------------
|
||||
var
|
||||
T {.noInit.}: ECP_ShortW_Prj[Fp[C], G2]
|
||||
line {.noInit.}: Line[Fp[C]]
|
||||
var T {.noInit.}: ECP_ShortW_Prj[Fp[C], G2]
|
||||
var line {.noInit.}: Line[Fp[C]]
|
||||
|
||||
T.fromAffine(Q)
|
||||
f.setOne()
|
||||
@ -161,6 +156,6 @@ func pairing_bw6_761_reference*[C](
|
||||
##
|
||||
## Reference implementation
|
||||
{.error: "BW6_761 Miller loop is not working yet".}
|
||||
gt.millerLoopBW6_761_naive(P, Q)
|
||||
gt.millerLoopBW6_761_naive(Q, P)
|
||||
gt.finalExpEasy()
|
||||
gt.finalExpHard_BW6_761()
|
||||
@ -21,11 +21,11 @@ func pairing*[C](gt: var Fp12[C], P, Q: auto) {.inline.} =
|
||||
else:
|
||||
{.error: "Pairing not implemented for " & $C.}
|
||||
|
||||
func millerLoop*[C](gt: var Fp12[C], P, Q: auto, n: int) {.inline.} =
|
||||
func millerLoop*[C](gt: var Fp12[C], Q, P: auto, n: int) {.inline.} =
|
||||
when C == BN254_Snarks:
|
||||
gt.millerLoopGenericBN(P, Q, n)
|
||||
gt.millerLoopGenericBN(Q, P, n)
|
||||
else:
|
||||
gt.millerLoopAddchain(P, Q, n)
|
||||
gt.millerLoopAddchain(Q, P, n)
|
||||
|
||||
func finalExp*[C](gt: var Fp12[C]){.inline.} =
|
||||
gt.finalExpEasy()
|
||||
|
||||
@ -55,3 +55,180 @@ const
|
||||
const CttASM {.booldefine.} = true
|
||||
const UseASM_X86_32* = CttASM and X86 and GCC_Compatible
|
||||
const UseASM_X86_64* = WordBitWidth == 64 and UseASM_X86_32
|
||||
|
||||
# We use the Nim effect system to track vartime subroutines
|
||||
type VarTime* = object
|
||||
|
||||
# ############################################################
|
||||
#
|
||||
# Signed Secret Words
|
||||
#
|
||||
# ############################################################
|
||||
|
||||
type SignedSecretWord* = distinct SecretWord
|
||||
|
||||
when sizeof(int) == 8 and not defined(Constantine32):
|
||||
type
|
||||
SignedBaseType* = int64
|
||||
else:
|
||||
type
|
||||
SignedBaseType* = int32
|
||||
|
||||
template fmap(x: SignedSecretWord, op: untyped, y: SignedSecretWord): SignedSecretWord =
|
||||
## Unwrap x and y from their distinct type
|
||||
## Apply op, and rewrap them
|
||||
SignedSecretWord(op(SecretWord(x), SecretWord(y)))
|
||||
|
||||
template fmapAsgn(x: var SignedSecretWord, op: untyped, y: SignedSecretWord) =
|
||||
## Unwrap x and y from their distinct type
|
||||
## Apply assignment op, and rewrap them
|
||||
op(cast[var SecretWord](x.addr), SecretWord(y))
|
||||
|
||||
template `and`*(x, y: SignedSecretWord): SignedSecretWord = fmap(x, `and`, y)
|
||||
template `or`*(x, y: SignedSecretWord): SignedSecretWord = fmap(x, `or`, y)
|
||||
template `xor`*(x, y: SignedSecretWord): SignedSecretWord = SignedSecretWord(BaseType(x) xor BaseType(y))
|
||||
template `not`*(x: SignedSecretWord): SignedSecretWord = SignedSecretWord(not SecretWord(x))
|
||||
template `+`*(x, y: SignedSecretWord): SignedSecretWord = fmap(x, `+`, y)
|
||||
template `+=`*(x: var SignedSecretWord, y: SignedSecretWord) = fmapAsgn(x, `+=`, y)
|
||||
template `-`*(x, y: SignedSecretWord): SignedSecretWord = fmap(x, `-`, y)
|
||||
template `-=`*(x: var SignedSecretWord, y: SignedSecretWord) = fmapAsgn(x, `-=`, y)
|
||||
|
||||
template `-`*(x: SignedSecretWord): SignedSecretWord =
|
||||
# We don't use Nim signed integers to avoid range checks
|
||||
SignedSecretWord(-SecretWord(x))
|
||||
|
||||
template `*`*(x, y: SignedSecretWord): SignedSecretWord =
|
||||
# Warning ⚠️ : We assume that hardware multiplication is constant time
|
||||
# but this is not always true. See https://www.bearssl.org/ctmul.html
|
||||
fmap(x, `*`, y)
|
||||
|
||||
# shifts
|
||||
template ashr*(x: SignedSecretWord, y: SomeNumber): SignedSecretWord =
|
||||
## Arithmetic right shift
|
||||
# We need to cast to Nim ints without Nim checks
|
||||
cast[SignedSecretWord](cast[SignedBaseType](x).ashr(y))
|
||||
|
||||
template lshr*(x: SignedSecretWord, y: SomeNumber): SignedSecretWord =
|
||||
## Logical right shift
|
||||
SignedSecretWord(SecretWord(x) shr y)
|
||||
|
||||
template lshl*(x: SignedSecretWord, y: SomeNumber): SignedSecretWord =
|
||||
## Logical left shift
|
||||
SignedSecretWord(SecretWord(x) shl y)
|
||||
|
||||
# Hardened Boolean primitives
|
||||
# ---------------------------
|
||||
|
||||
template `==`*(x, y: SignedSecretWord): SecretBool =
|
||||
SecretWord(x) == SecretWord(y)
|
||||
|
||||
# Conditional arithmetic
|
||||
# ----------------------
|
||||
|
||||
func isNeg*(a: SignedSecretWord): SignedSecretWord {.inline.} =
|
||||
## Returns 1 if a is negative
|
||||
## and 0 otherwise
|
||||
a.lshr(WordBitWidth-1)
|
||||
|
||||
func isOdd*(a: SignedSecretWord): SignedSecretWord {.inline.} =
|
||||
## Returns 1 if a is odd
|
||||
## and 0 otherwise
|
||||
a and SignedSecretWord(1)
|
||||
|
||||
func isZeroMask*(a: SignedSecretWord): SignedSecretWord {.inline.} =
|
||||
## Produce the -1 mask if a is 0
|
||||
## and 0 otherwise
|
||||
# In x86 assembly, we can use "neg" + "sbb"
|
||||
-SignedSecretWord(a.SecretWord().isZero())
|
||||
|
||||
func isNegMask*(a: SignedSecretWord): SignedSecretWord {.inline.} =
|
||||
## Produce the -1 mask if a is negative
|
||||
## and 0 otherwise
|
||||
a.ashr(WordBitWidth-1)
|
||||
|
||||
func isOddMask*(a: SignedSecretWord): SignedSecretWord {.inline.} =
|
||||
## Produce the -1 mask if a is odd
|
||||
## and 0 otherwise
|
||||
-(a and SignedSecretWord(1))
|
||||
|
||||
func isInRangeMask*(val, lo, hi: SignedSecretWord): SignedSecretWord {.inline.} =
|
||||
## Produce 0b11111111 mask if lo <= val <= hi (inclusive range)
|
||||
## and 0b00000000 otherwise
|
||||
let loInvMask = isNegMask(val-lo) # if val-lo < 0 => val < lo
|
||||
let hiInvMask = isNegMask(hi-val) # if hi-val < 0 => val > hi
|
||||
return not(loInvMask or hiInvMask)
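# Example (illustration): with lo = 2, hi = 5:
# val = 3: val-lo = 1 and hi-val = 2 are non-negative, both masks are 0, result is -1 (in range).
# val = 7: hi-val = -2 is negative, hiInvMask is -1, result is 0 (out of range).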
|
||||
|
||||
func csetZero*(a: var SignedSecretWord, mask: SignedSecretWord) {.inline.} =
|
||||
## Conditionally set `a` to 0
|
||||
## mask must be 0 (0x00000...0000) (kept as is)
|
||||
## or -1 (0xFFFF...FFFF) (zeroed)
|
||||
a = a and mask
|
||||
|
||||
func cneg*(
|
||||
a: SignedSecretWord,
|
||||
mask: SignedSecretWord): SignedSecretWord {.inline.} =
|
||||
## Conditionally negate `a`
|
||||
## mask must be 0 (0x00000...0000) (no negation)
|
||||
## or -1 (0xFFFF...FFFF) (negation)
|
||||
(a xor mask) - mask
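# Illustration: with mask == -1, (a xor -1) - (-1) == not(a) + 1 == -a (two's complement);
# with mask == 0 the expression is just a.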
|
||||
|
||||
func cadd*(
|
||||
a: var SignedSecretWord,
|
||||
b: SignedSecretWord,
|
||||
mask: SignedSecretWord) {.inline.} =
|
||||
## Conditionally add `b` to `a`
|
||||
## mask must be 0 (0x00000...0000) (no addition)
|
||||
## or -1 (0xFFFF...FFFF) (addition)
|
||||
a = a + (b and mask)
|
||||
|
||||
func csub*(
|
||||
a: var SignedSecretWord,
|
||||
b: SignedSecretWord,
|
||||
mask: SignedSecretWord) {.inline.} =
|
||||
## Conditionally subtract `b` from `a`
## mask must be 0 (0x00000...0000) (no subtraction)
## or -1 (0xFFFF...FFFF) (subtraction)
|
||||
a = a - (b and mask)
|
||||
|
||||
# Double-Width signed arithmetic
|
||||
# ------------------------------
|
||||
|
||||
type DSWord* = object
|
||||
lo*, hi*: SignedSecretWord
|
||||
|
||||
func smulAccNoCarry*(r: var DSWord, a, b: SignedSecretWord) {.inline.}=
|
||||
## Signed accumulated multiplication
|
||||
## (_, hi, lo) += a*b
|
||||
## This assumes no overflowing
|
||||
var UV: array[2, SecretWord]
|
||||
var carry: Carry
|
||||
smul(UV[1], UV[0], SecretWord a, SecretWord b)
|
||||
addC(carry, UV[0], UV[0], SecretWord r.lo, Carry(0))
|
||||
addC(carry, UV[1], UV[1], SecretWord r.hi, carry)
|
||||
|
||||
r.lo = SignedSecretWord UV[0]
|
||||
r.hi = SignedSecretWord UV[1]
|
||||
|
||||
func ssumprodAccNoCarry*(r: var DSWord, a, u, b, v: SignedSecretWord) {.inline.}=
|
||||
## Accumulated sum of products
|
||||
## (_, hi, lo) += a*u + b*v
|
||||
## This assumes no overflowing
|
||||
var carry: Carry
|
||||
var x1, x0, y1, y0: SecretWord
|
||||
smul(x1, x0, SecretWord a, SecretWord u)
|
||||
addC(carry, x0, x0, SecretWord r.lo, Carry(0))
|
||||
addC(carry, x1, x1, SecretWord r.hi, carry)
|
||||
smul(y1, y0, SecretWord b, SecretWord v)
|
||||
addC(carry, x0, x0, y0, Carry(0))
|
||||
addC(carry, x1, x1, y1, carry)
|
||||
|
||||
r.lo = SignedSecretWord x0
|
||||
r.hi = SignedSecretWord x1
|
||||
|
||||
func ashr*(
|
||||
r: var DSWord,
|
||||
k: SomeInteger) {.inline.} =
|
||||
## Arithmetic right-shift of a double-word
|
||||
## This does not normalize the excess bits
|
||||
r.lo = r.lo.lshr(k) or r.hi.lshl(WordBitWidth - k)
|
||||
r.hi = r.hi.ashr(k)
|
||||
@ -23,32 +23,37 @@
|
||||
#
|
||||
# stack allocation is strongly preferred where necessary.
|
||||
|
||||
# We use the Nim effect system to track allocating subroutines
|
||||
type
|
||||
Alloca* = object
|
||||
HeapAlloc* = object
|
||||
|
||||
# Bindings
|
||||
# ----------------------------------------------------------------------------------
|
||||
# We wrap them with int instead of size_t / csize_t
|
||||
|
||||
when defined(windows):
|
||||
proc alloca(size: int): pointer {.header: "<malloc.h>".}
|
||||
proc alloca(size: int): pointer {.tags:[Alloca], header: "<malloc.h>".}
|
||||
else:
|
||||
proc alloca(size: int): pointer {.header: "<alloca.h>".}
|
||||
proc alloca(size: int): pointer {.tags:[Alloca], header: "<alloca.h>".}
|
||||
|
||||
proc malloc(size: int): pointer {.sideeffect, header: "<stdlib.h>".}
|
||||
proc free(p: pointer) {.sideeffect, header: "<stdlib.h>".}
|
||||
proc malloc(size: int): pointer {.tags:[HeapAlloc], header: "<stdlib.h>".}
|
||||
proc free(p: pointer) {.tags:[HeapAlloc], header: "<stdlib.h>".}
|
||||
|
||||
when defined(windows):
|
||||
proc aligned_alloc_windows(size, alignment: int): pointer {.sideeffect,importc:"_aligned_malloc", header:"<malloc.h>".}
|
||||
proc aligned_alloc_windows(size, alignment: int): pointer {.tags:[HeapAlloc],importc:"_aligned_malloc", header:"<malloc.h>".}
|
||||
# Beware of the arg order!
|
||||
proc aligned_alloc(alignment, size: int): pointer {.inline.} =
|
||||
aligned_alloc_windows(size, alignment)
|
||||
proc aligned_free(p: pointer){.sideeffect,importc:"_aligned_free", header:"<malloc.h>".}
|
||||
proc aligned_free(p: pointer){.tags:[HeapAlloc],importc:"_aligned_free", header:"<malloc.h>".}
|
||||
elif defined(osx):
|
||||
proc posix_memalign(mem: var pointer, alignment, size: int){.sideeffect,importc, header:"<stdlib.h>".}
|
||||
proc posix_memalign(mem: var pointer, alignment, size: int){.tags:[HeapAlloc],importc, header:"<stdlib.h>".}
|
||||
proc aligned_alloc(alignment, size: int): pointer {.inline.} =
|
||||
posix_memalign(result, alignment, size)
|
||||
proc aligned_free(p: pointer) {.sideeffect, importc: "free", header: "<stdlib.h>".}
|
||||
proc aligned_free(p: pointer) {.tags:[HeapAlloc], importc: "free", header: "<stdlib.h>".}
|
||||
else:
|
||||
proc aligned_alloc(alignment, size: int): pointer {.sideeffect,importc, header:"<stdlib.h>".}
|
||||
proc aligned_free(p: pointer) {.sideeffect, importc: "free", header: "<stdlib.h>".}
|
||||
proc aligned_alloc(alignment, size: int): pointer {.tags:[HeapAlloc],importc, header:"<stdlib.h>".}
|
||||
proc aligned_free(p: pointer) {.tags:[HeapAlloc], importc: "free", header: "<stdlib.h>".}
|
||||
|
||||
# Helpers
|
||||
# ----------------------------------------------------------------------------------
|
||||
|
||||
@ -134,7 +134,7 @@ func ctz_impl_vartime(n: uint64): uint64 =
|
||||
let isolateLSB = n xor (n-1)
|
||||
uint64 lookup[(isolateLSB * 0x03f79d71b4cb0a89'u64) shr 58]
|
||||
|
||||
func countTrailingZeroBits*[T: SomeUnsignedInt](n: T): T {.inline.} =
|
||||
func countTrailingZeroBits_vartime*[T: SomeUnsignedInt](n: T): T {.inline.} =
|
||||
## Count the number of trailing zero bits of an integer
|
||||
when nimvm:
|
||||
if n == 0:
|
||||
@ -151,7 +151,7 @@ func isPowerOf2_vartime*(n: SomeUnsignedInt): bool {.inline.} =
|
||||
## Returns true if n is a power of 2
|
||||
## ⚠️ Result is bool instead of Secretbool,
|
||||
## for compile-time or explicit vartime proc only.
|
||||
(n and (n - 1)) == 0
|
||||
(n and (n - 1)) == 0 and n > 0
|
||||
|
||||
func nextPowerOfTwo_vartime*(n: uint32): uint32 {.inline.} =
|
||||
## Returns x if x is a power of 2
|
||||
|
||||
@ -6,7 +6,7 @@
|
||||
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
|
||||
# at your option. This file may not be copied, modified, or distributed except according to those terms.
|
||||
|
||||
import ./abstractions, ./signed_secret_words
|
||||
import ./abstractions
|
||||
|
||||
# ############################################################
|
||||
#
|
||||
|
||||
constantine/platforms/compilers/compiler_optim_hints.nim (new file, 151 lines)
@ -0,0 +1,151 @@
|
||||
# Laser & Arraymancer
|
||||
# Copyright (c) 2017-2018 Mamy André-Ratsimbazafy
|
||||
# Distributed under the Apache v2 License (license terms are at http://www.apache.org/licenses/LICENSE-2.0).
|
||||
# This file may not be copied, modified, or distributed except according to those terms.
|
||||
|
||||
const LASER_MEM_ALIGN*{.intdefine.} = 64
|
||||
static:
|
||||
assert LASER_MEM_ALIGN != 0, "Alignment " & $LASER_MEM_ALIGN & " must not be 0"
assert (LASER_MEM_ALIGN and (LASER_MEM_ALIGN - 1)) == 0, "Alignment " & $LASER_MEM_ALIGN & " must be a power of 2"
|
||||
|
||||
template withCompilerOptimHints*() =
|
||||
# See https://gcc.gnu.org/onlinedocs/gcc/Common-Function-Attributes.html
|
||||
# and https://gcc.gnu.org/onlinedocs/gcc/Common-Variable-Attributes.html#Common-Variable-Attributes
|
||||
|
||||
# Variable is created aligned by LASER_MEM_ALIGN.
|
||||
# This is useful to ensure an object can be loaded
# with a minimal number of cache-line loads.
# For example, the stack part of tensors is 128 bytes and can be loaded in 2 cache lines
# but would require 3 loads if misaligned.
|
||||
when defined(vcc):
|
||||
{.pragma: align_variable, codegenDecl: "__declspec(align(" & $LASER_MEM_ALIGN & ")) $# $#".}
|
||||
else:
|
||||
{.pragma: align_variable, codegenDecl: "$# $# __attribute__((aligned(" & $LASER_MEM_ALIGN & ")))".}
|
||||
|
||||
# Variable. Pointer does not alias any existing valid pointers.
|
||||
when not defined(vcc):
|
||||
{.pragma: restrict, codegenDecl: "$# __restrict__ $#".}
|
||||
else:
|
||||
{.pragma: restrict, codegenDecl: "$# __restrict $#".}
|
||||
|
||||
const withBuiltins = defined(gcc) or defined(clang) or defined(icc)
|
||||
|
||||
type
|
||||
PrefetchRW* {.size: cint.sizeof.} = enum
|
||||
Read = 0
|
||||
Write = 1
|
||||
PrefetchLocality* {.size: cint.sizeof.} = enum
|
||||
NoTemporalLocality = 0 # Data can be discarded from CPU cache after access
|
||||
LowTemporalLocality = 1
|
||||
ModerateTemporalLocality = 2
|
||||
HighTemporalLocality = 3 # Data should be left in all levels of cache possible
|
||||
# Translation
|
||||
# 0 - use no cache eviction level
|
||||
# 1 - L1 cache eviction level
|
||||
# 2 - L2 cache eviction level
|
||||
# 3 - L1 and L2 cache eviction level
|
||||
|
||||
when withBuiltins:
|
||||
proc builtin_assume_aligned(data: pointer, alignment: csize_t): pointer {.importc: "__builtin_assume_aligned", noDecl.}
|
||||
proc builtin_prefetch(data: pointer, rw: PrefetchRW, locality: PrefetchLocality) {.importc: "__builtin_prefetch", noDecl.}
|
||||
|
||||
when defined(cpp):
|
||||
proc static_cast[T: ptr](input: pointer): T
|
||||
{.importcpp: "static_cast<'0>(@)".}
|
||||
|
||||
template assume_aligned*[T](data: ptr T, alignment: static int = LASER_MEM_ALIGN): ptr T =
|
||||
when defined(cpp) and withBuiltins: # builtin_assume_aligned returns void pointers, this does not compile in C++, they must all be typed
|
||||
static_cast[ptr T](builtin_assume_aligned(data, alignment))
|
||||
elif withBuiltins:
|
||||
cast[ptr T](builtin_assume_aligned(data, alignment))
|
||||
else:
|
||||
data
|
||||
|
||||
template prefetch*(
|
||||
data: ptr or pointer,
|
||||
rw: static PrefetchRW = Read,
|
||||
locality: static PrefetchLocality = HighTemporalLocality) =
|
||||
## Prefetch examples:
|
||||
## - https://scripts.mit.edu/~birge/blog/accelerating-code-using-gccs-prefetch-extension/
|
||||
## - https://stackoverflow.com/questions/7327994/prefetching-examples
|
||||
## - https://lemire.me/blog/2018/04/30/is-software-prefetching-__builtin_prefetch-useful-for-performance/
|
||||
## - https://www.naftaliharris.com/blog/2x-speedup-with-one-line-of-code/
|
||||
when withBuiltins:
|
||||
builtin_prefetch(data, rw, locality)
|
||||
else:
|
||||
discard
|
||||
|
||||
template pragma_ivdep() {.used.}=
|
||||
## Tell the compiler to ignore unproven loop dependencies
|
||||
## such as "a[i] = a[i + k] * c;" if k is unknown, as it introduces a loop
|
||||
## dependency if it's negative
|
||||
## https://software.intel.com/en-us/node/524501
|
||||
##
|
||||
## Placeholder
|
||||
# We don't expose that as it only works on C for loop. Nim only generates while loop
|
||||
# except when using OpenMP. But the OpenMP "simd" already achieves the same as ivdep.
|
||||
when defined(gcc):
|
||||
{.emit: "#pragma GCC ivdep".}
|
||||
else: # Supported on ICC and Cray
|
||||
{.emit: "pragma ivdep".}
|
||||
|
||||
template withCompilerFunctionHints() {.used.}=
|
||||
## Not exposed, Nim codegen will declare them as normal C function.
|
||||
## This messes up with N_NIMCALL, N_LIB_PRIVATE, N_INLINE and also
|
||||
## creates duplicate symbols when one function called by a hot or pure function
|
||||
## is public and inline (because hot and pure cascade to all cunfctions called)
|
||||
## and they cannot be stacked easily: (hot, pure) will only apply the last
|
||||
|
||||
# Function. Returned pointer is aligned to LASER_MEM_ALIGN
|
||||
{.pragma: aligned_ptr_result, codegenDecl: "__attribute__((assume_aligned(" & $LASER_MEM_ALIGN & ")) $# $#$#".}
|
||||
|
||||
# Function. Returned pointer cannot alias any other valid pointer and no pointers to valid object occur in any
|
||||
# storage pointed to.
|
||||
{.pragma: malloc, codegenDecl: "__attribute__((malloc)) $# $#$#".}
|
||||
|
||||
# Function. Creates one or more function versions that can process multiple arguments using SIMD.
|
||||
# Ignored when -fopenmp is used and within an OpenMP simd loop
|
||||
{.pragma: simd, codegenDecl: "__attribute__((simd)) $# $#$#".}
|
||||
|
||||
# Function. Indicates hot and cold path. Ignored when using profile guided optimization.
|
||||
{.pragma: hot, codegenDecl: "__attribute__((hot)) $# $#$#".}
|
||||
{.pragma: cold, codegenDecl: "__attribute__((cold)) $# $#$#".}
|
||||
|
||||
# ## pure and const
|
||||
# ## Affect Common Sub-expression Elimination, Dead Code Elimination and loop optimization.
|
||||
# See
|
||||
# - https://lwn.net/Articles/285332/
|
||||
# - http://benyossef.com/helping-the-compiler-help-you/
|
||||
#
|
||||
# Function. The function only accesses its input params and global variables state.
|
||||
# It does not modify any global, calling it multiple times with the same params
|
||||
# and global variables will produce the same result.
|
||||
{.pragma: gcc_pure, codegenDecl: "__attribute__((pure)) $# $#$#".}
|
||||
#
|
||||
# Function. The function only accesses its input params and calling it multiple times
|
||||
# with the same params will produce the same result.
|
||||
# Warning ⚠:
|
||||
# Pointer inputs must not be dereferenced to read the memory pointed to.
|
||||
# In Nim stack arrays are passed by pointers and big stack data structures
|
||||
# are passed by reference as well. I.e. Result unknown.
|
||||
{.pragma: gcc_const, codegenDecl: "__attribute__((const)) $# $#$#".}
|
||||
|
||||
# We don't define per-function fast-math, GCC attribute optimize is broken:
|
||||
# --> https://gcc.gnu.org/ml/gcc/2009-10/msg00402.html
|
||||
#
|
||||
# Workaround floating point latency for algorithms like sum
|
||||
# should be done manually.
|
||||
#
|
||||
# See : https://stackoverflow.com/questions/39095993/does-each-floating-point-operation-take-the-same-time
|
||||
# and https://www.agner.org/optimize/vectorclass.pdf "Using multiple accumulators"
|
||||
#
|
||||
# FP addition has a latency of 3~5 clock cycles, i.e. the result cannot be reused for that much time.
|
||||
# But the throughput is 1 FP add per clock cycle (and even 2 per clock cycle for Skylake)
|
||||
# So we need to use extra accumulators to fully utilize the FP throughput despite FP latency.
|
||||
# On Skylake, all FP latencies are 4: https://www.agner.org/optimize/blog/read.php?i=415
|
||||
#
|
||||
# Note that this is per CPU cores, each core needs its own "global CPU accumulator" to combat
|
||||
# false sharing when multithreading.
|
||||
#
|
||||
# This wouldn't be needed with fast-math because compiler would consider FP addition associative
|
||||
# and create intermediate variables as needed to exploit this through put.
|
||||
@ -112,13 +112,7 @@ template `*=`*[T: Ct](x, y: T) =
|
||||
template `-`*[T: Ct](x: T): T =
|
||||
## Unary minus returns the two-complement representation
|
||||
## of an unsigned integer
|
||||
# We could use "not(x) + 1" but the codegen is not optimal
|
||||
when nimvm:
|
||||
not(x) + T(1)
|
||||
else: # Use C so that compiler uses the "neg" instructions
|
||||
var neg: T
|
||||
{.emit:[neg, " = -", x, ";"].}
|
||||
neg
|
||||
T(0) - x
|
||||
|
||||
# ############################################################
|
||||
#
|
||||
@ -175,19 +169,6 @@ template cneg*[T: Ct](x: T, ctl: CTBool[T]): T =
|
||||
# Conditional negate if ctl is true
|
||||
(x xor -T(ctl)) + T(ctl)
|
||||
|
||||
# ############################################################
|
||||
#
|
||||
# Workaround system.nim `!=` template
|
||||
#
|
||||
# ############################################################
|
||||
|
||||
# system.nim defines `!=` as a catchall template
|
||||
# in terms of `==` while we define `==` in terms of `!=`
|
||||
# So we would have not(not(noteq(x,y)))
|
||||
|
||||
template trmFixSystemNotEq*{x != y}[T: Ct](x, y: T): CTBool[T] =
|
||||
noteq(x, y)
|
||||
|
||||
# ############################################################
|
||||
#
|
||||
# Table lookups
|
||||
@ -217,15 +198,3 @@ template isNonZero*[T: Ct](x: T): CTBool[T] =
|
||||
template isZero*[T: Ct](x: T): CTBool[T] =
|
||||
# In x86 assembly, we can use "neg" + "adc"
|
||||
not isNonZero(x)
|
||||
|
||||
# ############################################################
|
||||
#
|
||||
# Transform x == 0 and x != 0
|
||||
# into their optimized version
|
||||
#
|
||||
# ############################################################
|
||||
|
||||
template trmIsZero*{x == 0}[T: Ct](x: T): CTBool[T] = x.isZero
|
||||
template trmIsZero*{0 == x}[T: Ct](x: T): CTBool[T] = x.isZero
|
||||
template trmIsNonZero*{x != 0}[T: Ct](x: T): CTBool[T] = x.isNonZero
|
||||
template trmIsNonZero*{0 != x}[T: Ct](x: T): CTBool[T] = x.isNonZero
|
||||
|
||||
@ -15,10 +15,12 @@ import
|
||||
],
|
||||
compilers/[
|
||||
addcarry_subborrow,
|
||||
extended_precision
|
||||
extended_precision,
|
||||
compiler_optim_hints
|
||||
],
|
||||
./bithacks,
|
||||
./static_for
|
||||
./static_for,
|
||||
./allocs
|
||||
|
||||
export
|
||||
ct_types,
|
||||
@ -28,7 +30,9 @@ export
|
||||
extended_precision,
|
||||
ct_division,
|
||||
bithacks,
|
||||
staticFor
|
||||
staticFor,
|
||||
allocs,
|
||||
compiler_optim_hints
|
||||
|
||||
when X86 and GCC_Compatible:
|
||||
import isa/[cpuinfo_x86, macro_assembler_x86]
|
||||
@ -48,6 +52,9 @@ template debug*(body: untyped): untyped =
|
||||
when defined(debugConstantine):
|
||||
body
|
||||
|
||||
func unreachable*() {.noReturn.} =
|
||||
doAssert false, "Unreachable"
|
||||
|
||||
# ############################################################
|
||||
#
|
||||
# Buffers
|
||||
@ -111,10 +118,22 @@ template asUnchecked*[T](a: openArray[T]): ptr UncheckedArray[T] =
|
||||
# to a function as `var` are passed by hidden pointers in Nim and the wrong
|
||||
# pointer will be modified. Templates are fine.
|
||||
|
||||
func `+%`*(p: ptr, offset: SomeInteger): type(p) {.inline, noInit.}=
|
||||
func `+%`*(p: ptr or pointer, offset: SomeInteger): type(p) {.inline, noInit.}=
|
||||
## Pointer increment
|
||||
{.emit: [result, " = ", p, " + ", offset, ";"].}
|
||||
|
||||
func `+%=`*(p: var ptr, offset: SomeInteger){.inline.}=
|
||||
func `+%=`*(p: var (ptr or pointer), offset: SomeInteger){.inline.}=
|
||||
## Pointer increment
|
||||
p = p +% offset
|
||||
|
||||
func prefetchLarge*[T](
|
||||
data: ptr T,
|
||||
rw: static PrefetchRW = Read,
|
||||
locality: static PrefetchLocality = HighTemporalLocality,
|
||||
maxCacheLines: static int = 0) {.inline.} =
|
||||
## Prefetch a large value
|
||||
let pdata = pointer(data)
|
||||
const span = sizeof(T) div 64 # 64 byte cache line
|
||||
const N = if maxCacheLines == 0: span else: min(span, maxCacheLines)
|
||||
for i in 0 ..< N:
|
||||
prefetch(pdata +% (i*64), rw, locality)
|
||||
@ -1,195 +0,0 @@
|
||||
# Constantine
|
||||
# Copyright (c) 2018-2019 Status Research & Development GmbH
|
||||
# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
|
||||
# Licensed and distributed under either of
|
||||
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
|
||||
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
|
||||
# at your option. This file may not be copied, modified, or distributed except according to those terms.
|
||||
|
||||
import ./abstractions
|
||||
|
||||
type SignedSecretWord* = distinct SecretWord
|
||||
|
||||
when sizeof(int) == 8 and not defined(Constantine32):
|
||||
type
|
||||
SignedBaseType* = int64
|
||||
else:
|
||||
type
|
||||
SignedBaseType* = int32
|
||||
|
||||
# ############################################################
|
||||
#
|
||||
# Arithmetic
|
||||
#
|
||||
# ############################################################
|
||||
|
||||
template fmap(x: SignedSecretWord, op: untyped, y: SignedSecretWord): SignedSecretWord =
|
||||
## Unwrap x and y from their distinct type
|
||||
## Apply op, and rewrap them
|
||||
SignedSecretWord(op(SecretWord(x), SecretWord(y)))
|
||||
|
||||
template fmapAsgn(x: SignedSecretWord, op: untyped, y: SignedSecretWord) =
|
||||
## Unwrap x and y from their distinct type
|
||||
## Apply assignment op, and rewrap them
|
||||
op(SecretWord(x), SecretWord(y))
|
||||
|
||||
template `and`*(x, y: SignedSecretWord): SignedSecretWord = fmap(x, `and`, y)
|
||||
template `or`*(x, y: SignedSecretWord): SignedSecretWord = fmap(x, `or`, y)
|
||||
template `xor`*(x, y: SignedSecretWord): SignedSecretWord = SignedSecretWord(BaseType(x) xor BaseType(y))
|
||||
template `not`*(x: SignedSecretWord): SignedSecretWord = SignedSecretWord(not SecretWord(x))
|
||||
template `+`*(x, y: SignedSecretWord): SignedSecretWord = fmap(x, `+`, y)
|
||||
template `+=`*(x: var SignedSecretWord, y: SignedSecretWord) = fmapAsgn(x, `+=`, y)
|
||||
template `-`*(x, y: SignedSecretWord): SignedSecretWord = fmap(x, `-`, y)
|
||||
template `-=`*(x: var SignedSecretWord, y: SignedSecretWord) = fmapAsgn(x, `-=`, y)
|
||||
|
||||
template `-`*(x: SignedSecretWord): SignedSecretWord =
|
||||
# We don't use Nim signed integers to avoid range checks
|
||||
SignedSecretWord(-SecretWord(x))
|
||||
|
||||
template `*`*(x, y: SignedSecretWord): SignedSecretWord =
|
||||
# Warning ⚠️ : We assume that hardware multiplication is constant time
|
||||
# but this is not always true. See https://www.bearssl.org/ctmul.html
|
||||
fmap(x, `*`, y)
|
||||
|
||||
# shifts
|
||||
template ashr*(x: SignedSecretWord, y: SomeNumber): SignedSecretWord =
|
||||
## Arithmetic right shift
|
||||
# We need to cast to Nim ints without Nim checks
|
||||
cast[SignedSecretWord](cast[SignedBaseType](x).ashr(y))
|
||||
|
||||
template lshr*(x: SignedSecretWord, y: SomeNumber): SignedSecretWord =
|
||||
## Logical right shift
|
||||
SignedSecretWord(SecretWord(x) shr y)
|
||||
|
||||
template lshl*(x: SignedSecretWord, y: SomeNumber): SignedSecretWord =
|
||||
## Logical left shift
|
||||
SignedSecretWord(SecretWord(x) shl y)
|
||||
|
||||
# ############################################################
|
||||
#
|
||||
# Hardened Boolean primitives
|
||||
#
|
||||
# ############################################################
|
||||
|
||||
template `==`*(x, y: SignedSecretWord): SecretBool =
|
||||
SecretWord(x) == SecretWord(y)
|
||||
|
||||
# ############################################################
|
||||
#
|
||||
# Conditional arithmetic
|
||||
#
|
||||
# ############################################################
|
||||
|
||||
# SignedSecretWord
|
||||
# ----------------
|
||||
|
||||
func isNeg*(a: SignedSecretWord): SignedSecretWord {.inline.} =
|
||||
## Returns 1 if a is negative
|
||||
## and 0 otherwise
|
||||
a.lshr(WordBitWidth-1)
|
||||
|
||||
func isOdd*(a: SignedSecretWord): SignedSecretWord {.inline.} =
|
||||
## Returns 1 if a is odd
|
||||
## and 0 otherwise
|
||||
a and SignedSecretWord(1)
|
||||
|
||||
func isZeroMask*(a: SignedSecretWord): SignedSecretWord {.inline.} =
|
||||
## Produce the -1 mask if a is 0
|
||||
## and 0 otherwise
|
||||
# In x86 assembly, we can use "neg" + "sbb"
|
||||
-SignedSecretWord(a.SecretWord().isZero())
|
||||
|
||||
func isNegMask*(a: SignedSecretWord): SignedSecretWord {.inline.} =
|
||||
## Produce the -1 mask if a is negative
|
||||
## and 0 otherwise
|
||||
a.ashr(WordBitWidth-1)
|
||||
|
||||
func isOddMask*(a: SignedSecretWord): SignedSecretWord {.inline.} =
|
||||
## Produce the -1 mask if a is odd
|
||||
## and 0 otherwise
|
||||
-(a and SignedSecretWord(1))
|
||||
|
||||
func isInRangeMask*(val, lo, hi: SignedSecretWord): SignedSecretWord {.inline.} =
|
||||
## Produce 0b11111111 mask if lo <= val <= hi (inclusive range)
|
||||
## and 0b00000000 otherwise
|
||||
let loInvMask = isNegMask(val-lo) # if val-lo < 0 => val < lo
|
||||
let hiInvMask = isNegMask(hi-val) # if hi-val < 0 => val > hi
|
||||
return not(loInvMask or hiInvMask)
|
||||
|
||||
func csetZero*(a: var SignedSecretWord, mask: SignedSecretWord) {.inline.} =
|
||||
## Conditionally set `a` to 0
|
||||
## mask must be 0 (0x00000...0000) (kept as is)
|
||||
## or -1 (0xFFFF...FFFF) (zeroed)
|
||||
a = a and mask
|
||||
|
||||
func cneg*(
|
||||
a: SignedSecretWord,
|
||||
mask: SignedSecretWord): SignedSecretWord {.inline.} =
|
||||
## Conditionally negate `a`
|
||||
## mask must be 0 (0x00000...0000) (no negation)
|
||||
## or -1 (0xFFFF...FFFF) (negation)
|
||||
(a xor mask) - mask
|
||||
|
||||
func cadd*(
|
||||
a: var SignedSecretWord,
|
||||
b: SignedSecretWord,
|
||||
mask: SignedSecretWord) {.inline.} =
|
||||
## Conditionally add `b` to `a`
|
||||
## mask must be 0 (0x00000...0000) (no addition)
|
||||
## or -1 (0xFFFF...FFFF) (addition)
|
||||
a = a + (b and mask)
|
||||
|
||||
func csub*(
|
||||
a: var SignedSecretWord,
|
||||
b: SignedSecretWord,
|
||||
mask: SignedSecretWord) {.inline.} =
|
||||
## Conditionally substract `b` from `a`
|
||||
## mask must be 0 (0x00000...0000) (no substraction)
|
||||
## or -1 (0xFFFF...FFFF) (substraction)
|
||||
a = a - (b and mask)
|
||||
|
||||
# ############################################################
|
||||
#
|
||||
# Double-Width signed arithmetic
|
||||
#
|
||||
# ############################################################
|
||||
|
||||
type DSWord* = object
|
||||
lo*, hi*: SignedSecretWord
|
||||
|
||||
func smulAccNoCarry*(r: var DSWord, a, b: SignedSecretWord) {.inline.}=
|
||||
## Signed accumulated multiplication
|
||||
## (_, hi, lo) += a*b
|
||||
## This assumes no overflowing
|
||||
var UV: array[2, SecretWord]
|
||||
var carry: Carry
|
||||
smul(UV[1], UV[0], SecretWord a, SecretWord b)
|
||||
addC(carry, UV[0], UV[0], SecretWord r.lo, Carry(0))
|
||||
addC(carry, UV[1], UV[1], SecretWord r.hi, carry)
|
||||
|
||||
r.lo = SignedSecretWord UV[0]
|
||||
r.hi = SignedSecretWord UV[1]
|
||||
|
||||
func ssumprodAccNoCarry*(r: var DSWord, a, u, b, v: SignedSecretWord) {.inline.}=
|
||||
## Accumulated sum of products
|
||||
## (_, hi, lo) += a*u + b*v
|
||||
## This assumes no overflowing
|
||||
var carry: Carry
|
||||
var x1, x0, y1, y0: SecretWord
|
||||
smul(x1, x0, SecretWord a, SecretWord u)
|
||||
addC(carry, x0, x0, SecretWord r.lo, Carry(0))
|
||||
addC(carry, x1, x1, SecretWord r.hi, carry)
|
||||
smul(y1, y0, SecretWord b, SecretWord v)
|
||||
addC(carry, x0, x0, y0, Carry(0))
|
||||
addC(carry, x1, x1, y1, carry)
|
||||
|
||||
r.lo = SignedSecretWord x0
|
||||
r.hi = SignedSecretWord x1
|
||||
|
||||
func ashr*(
|
||||
r: var DSWord,
|
||||
k: SomeInteger) {.inline.} =
|
||||
## Arithmetic right-shift of a double-word
|
||||
## This does not normalize the excess bits
|
||||
r.lo = r.lo.lshr(k) or r.hi.lshl(WordBitWidth - k)
|
||||
r.hi = r.hi.ashr(k)
|
||||
@ -9,7 +9,7 @@
|
||||
import
|
||||
../math/[ec_shortweierstrass, extension_fields],
|
||||
../math/io/io_bigints,
|
||||
../math/elliptic/ec_shortweierstrass_batch_ops,
|
||||
../math/elliptic/ec_scalar_mul_vartime,
|
||||
../math/pairings/[pairings_generic, miller_accumulators],
|
||||
../math/constants/zoo_generators,
|
||||
../math/config/curves,
|
||||
@ -366,32 +366,6 @@ func init*[T0, T1: char|byte](
|
||||
|
||||
H.hash(ctx.secureBlinding, secureRandomBytes, accumSepTag)
|
||||
|
||||
func scalarMul_minHammingWeight_vartime[EC](
|
||||
P: var EC,
|
||||
scalar: BigInt,
|
||||
) =
|
||||
## **Variable-time** Elliptic Curve Scalar Multiplication
|
||||
##
|
||||
## P <- [k] P
|
||||
##
|
||||
## This uses an online recoding with minimum Hamming Weight
|
||||
## (which is not NAF, NAF is least-significant bit to most)
|
||||
## Due to those scalars being 64-bit, window-method or endomorphism acceleration are slower
|
||||
## than double-and-add.
|
||||
##
|
||||
## This is highly VULNERABLE to timing attacks and power analysis attacks.
|
||||
## For our usecase, scaling with a random number not in attacker control,
|
||||
## leaking the scalar bits is not an issue.
|
||||
var t0{.noInit.}: typeof(P)
|
||||
t0.setInf()
|
||||
for bit in recoding_l2r_vartime(scalar):
|
||||
t0.double()
|
||||
if bit == 1:
|
||||
t0 += P
|
||||
elif bit == -1:
|
||||
t0 -= P
|
||||
P = t0
|
||||
|
||||
func update*[T: char|byte, Pubkey, Sig: ECP_ShortW_Aff](
|
||||
ctx: var BLSBatchSigAccumulator,
|
||||
pubkey: Pubkey,
|
||||
@ -456,8 +430,8 @@ func update*[T: char|byte, Pubkey, Sig: ECP_ShortW_Aff](
|
||||
|
||||
var randFactor{.noInit.}: BigInt[64]
|
||||
randFactor.unmarshal(ctx.secureBlinding.toOpenArray(0, 7), bigEndian)
|
||||
pkG1_jac.scalarMul_minHammingWeight_vartime(randFactor)
|
||||
sigG2_jac.scalarMul_minHammingWeight_vartime(randFactor)
|
||||
pkG1_jac.scalarMul_minHammingWeight_windowed_vartime(randFactor, window = 3)
|
||||
sigG2_jac.scalarMul_minHammingWeight_windowed_vartime(randFactor, window = 3)
|
||||
|
||||
if ctx.aggSigOnce == false:
|
||||
ctx.aggSig = sigG2_jac
|
||||
@ -492,8 +466,8 @@ func update*[T: char|byte, Pubkey, Sig: ECP_ShortW_Aff](
|
||||
|
||||
var randFactor{.noInit.}: BigInt[64]
|
||||
randFactor.unmarshal(ctx.secureBlinding.toOpenArray(0, 7), bigEndian)
|
||||
hmsgG1_jac.scalarMul_minHammingWeight_vartime(randFactor)
|
||||
sigG1_jac.scalarMul_minHammingWeight_vartime(randFactor)
|
||||
hmsgG1_jac.scalarMul_minHammingWeight_windowed_vartime(randFactor, window = 3)
|
||||
sigG1_jac.scalarMul_minHammingWeight_windowed_vartime(randFactor, window = 3)
|
||||
|
||||
if ctx.aggSigOnce == false:
|
||||
ctx.aggSig = sigG1_jac
|
||||
@ -571,7 +545,7 @@ func finalVerify*(ctx: var BLSBatchSigAccumulator): bool =
|
||||
func aggregate*[T: ECP_ShortW_Aff](r: var T, points: openarray[T]) =
|
||||
## Aggregate pubkeys or signatures
|
||||
var accum {.noinit.}: ECP_ShortW_Jac[T.F, T.G]
|
||||
accum.sum_batch_vartime(points)
|
||||
accum.sum_reduce_vartime(points)
|
||||
r.affine(accum)
|
||||
|
||||
func fastAggregateVerify*[B1, B2: byte|char, Pubkey, Sig](
|
||||
|
||||
@ -15,6 +15,7 @@ import
|
||||
ec_shortweierstrass_affine,
|
||||
ec_shortweierstrass_projective,
|
||||
ec_shortweierstrass_jacobian,
|
||||
ec_shortweierstrass_jacobian_extended,
|
||||
ec_twistededwards_affine,
|
||||
ec_twistededwards_projective],
|
||||
../constantine/math/io/io_bigints,
|
||||
@ -282,19 +283,19 @@ func random_long01Seq(rng: var RngState, a: var ExtensionField) =
|
||||
# Elliptic curves
|
||||
# ------------------------------------------------------------
|
||||
|
||||
type ECP = ECP_ShortW_Aff or ECP_ShortW_Prj or ECP_ShortW_Jac or
|
||||
type ECP = ECP_ShortW_Aff or ECP_ShortW_Prj or ECP_ShortW_Jac or ECP_ShortW_JacExt or
|
||||
ECP_TwEdwards_Aff or ECP_TwEdwards_Prj
|
||||
type ECP_ext = ECP_ShortW_Prj or ECP_ShortW_Jac or
|
||||
type ECP_ext = ECP_ShortW_Prj or ECP_ShortW_Jac or ECP_ShortW_JacExt or
|
||||
ECP_TwEdwards_Prj
|
||||
|
||||
template trySetFromCoord[F](a: ECP, fieldElem: F): SecretBool =
|
||||
when a is (ECP_ShortW_Aff or ECP_ShortW_Prj or ECP_ShortW_Jac):
|
||||
when a is (ECP_ShortW_Aff or ECP_ShortW_Prj or ECP_ShortW_Jac or ECP_ShortW_JacExt):
|
||||
trySetFromCoordX(a, fieldElem)
|
||||
else:
|
||||
trySetFromCoordY(a, fieldElem)
|
||||
|
||||
template trySetFromCoords[F](a: ECP, fieldElem, scale: F): SecretBool =
|
||||
when a is (ECP_ShortW_Prj or ECP_ShortW_Jac):
|
||||
when a is (ECP_ShortW_Prj or ECP_ShortW_Jac or ECP_ShortW_JacExt):
|
||||
trySetFromCoordsXandZ(a, fieldElem, scale)
|
||||
else:
|
||||
trySetFromCoordsYandZ(a, fieldElem, scale)
|
||||
|
||||
44
metering/m_msm.nim
Normal file
44
metering/m_msm.nim
Normal file
@ -0,0 +1,44 @@
|
||||
# Constantine
|
||||
# Copyright (c) 2018-2019 Status Research & Development GmbH
|
||||
# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
|
||||
# Licensed and distributed under either of
|
||||
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
|
||||
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
|
||||
# at your option. This file may not be copied, modified, or distributed except according to those terms.
|
||||
|
||||
import
|
||||
std/times,
|
||||
./reports, ./tracer,
|
||||
../constantine/math/config/curves,
|
||||
../constantine/math/[arithmetic, extension_fields, ec_shortweierstrass],
|
||||
../constantine/math/constants/zoo_subgroups,
|
||||
../constantine/math/elliptic/ec_multi_scalar_mul,
|
||||
../constantine/platforms/abstractions,
|
||||
# Helpers
|
||||
../helpers/prng_unsafe
|
||||
|
||||
var rng*: RngState
|
||||
let seed = uint32(getTime().toUnix() and (1'i64 shl 32 - 1)) # unixTime mod 2^32
|
||||
rng.seed(seed)
|
||||
echo "bench xoshiro512** seed: ", seed
|
||||
|
||||
proc msmMeter*(EC: typedesc, numPoints: int) =
|
||||
const bits = EC.F.C.getCurveOrderBitwidth()
|
||||
var points = newSeq[ECP_ShortW_Aff[EC.F, EC.G]](numPoints)
|
||||
var scalars = newSeq[BigInt[bits]](numPoints)
|
||||
|
||||
for i in 0 ..< numPoints:
|
||||
var tmp = rng.random_unsafe(EC)
|
||||
tmp.clearCofactor()
|
||||
points[i].affine(tmp)
|
||||
scalars[i] = rng.random_unsafe(BigInt[bits])
|
||||
|
||||
var r{.noInit.}: EC
|
||||
r.setinf()
|
||||
resetMetering()
|
||||
r.multiScalarMul_vartime(scalars, points)
|
||||
|
||||
resetMetering()
|
||||
msmMeter(ECP_ShortW_Jac[Fp[BLS12_381], G1], 10000)
|
||||
const flags = if UseASM_X86_64 or UseASM_X86_32: "UseAssembly" else: "NoAssembly"
|
||||
reportCli(Metrics, flags)
|
||||
@ -9,11 +9,11 @@
|
||||
import
|
||||
std/times,
|
||||
./reports, ./tracer,
|
||||
../constantine/math/config/[common, curves],
|
||||
../constantine/math/[arithmetic, extension_fields],
|
||||
../constantine/math/elliptic/ec_shortweierstrass_projective,
|
||||
../constantine/math/config/curves,
|
||||
../constantine/math/[arithmetic, extension_fields, ec_shortweierstrass],
|
||||
../constantine/math/constants/zoo_subgroups,
|
||||
../constantine/math/pairings/pairings_bls12,
|
||||
../constantine/math/pairings/pairings_generic,
|
||||
../constantine/platforms/abstractions,
|
||||
# Helpers
|
||||
../helpers/prng_unsafe
|
||||
|
||||
@ -22,19 +22,20 @@ let seed = uint32(getTime().toUnix() and (1'i64 shl 32 - 1)) # unixTime mod 2^32
|
||||
rng.seed(seed)
|
||||
echo "bench xoshiro512** seed: ", seed
|
||||
|
||||
func random_point*(rng: var RngState, EC: typedesc): EC {.noInit.} =
|
||||
result = rng.random_unsafe(EC)
|
||||
result.clearCofactor()
|
||||
func random_point*(rng: var RngState, EC: typedesc[ECP_ShortW_Aff]): EC {.noInit.} =
|
||||
var jac = rng.random_unsafe(ECP_ShortW_Jac[EC.F, EC.G])
|
||||
jac.clearCofactor()
|
||||
result.affine(jac)
|
||||
|
||||
proc pairingBLS12Meter*(C: static Curve) =
|
||||
let
|
||||
P = rng.random_point(ECP_ShortW_Prj[Fp[C], G1])
|
||||
Q = rng.random_point(ECP_ShortW_Prj[Fp2[C], G2])
|
||||
P = rng.random_point(ECP_ShortW_Aff[Fp[C], G1])
|
||||
Q = rng.random_point(ECP_ShortW_Aff[Fp2[C], G2])
|
||||
|
||||
var f: Fp12[C]
|
||||
|
||||
resetMetering()
|
||||
f.pairing_bls12(P, Q)
|
||||
f.pairing(P, Q)
|
||||
|
||||
resetMetering()
|
||||
pairingBLS12Meter(BLS12_381)
|
||||
|
||||
@ -24,11 +24,11 @@ proc reportCli*(metrics: seq[Metadata], flags: string) =
|
||||
# https://www.agner.org/optimize/blog/read.php?i=838
|
||||
echo "The CPU Cycle Count is indicative only. It cannot be used to compare across systems, works at your CPU nominal frequency and is sensitive to overclocking, throttling and frequency scaling (powersaving and Turbo Boost)."
|
||||
|
||||
const lineSep = &"""|{'-'.repeat(50)}|{'-'.repeat(14)}|{'-'.repeat(20)}|{'-'.repeat(15)}|{'-'.repeat(17)}|{'-'.repeat(26)}|{'-'.repeat(26)}|"""
|
||||
const lineSep = &"""|{'-'.repeat(150)}|{'-'.repeat(14)}|{'-'.repeat(20)}|{'-'.repeat(15)}|{'-'.repeat(17)}|{'-'.repeat(26)}|{'-'.repeat(26)}|"""
|
||||
echo "\n"
|
||||
echo lineSep
|
||||
echo &"""|{"Procedures":^50}|{"# of Calls":^14}|{"Throughput (ops/s)":^20}|{"Time (µs)":^15}|{"Avg Time (µs)":^17}|{"CPU cycles (in billions)":^26}|{"Avg cycles (in billions)":^26}|"""
|
||||
echo &"""|{flags:^50}|{' '.repeat(14)}|{' '.repeat(20)}|{' '.repeat(15)}|{' '.repeat(17)}|{"indicative only":^26}|{"indicative only":^26}|"""
|
||||
echo &"""|{"Procedures":^150}|{"# of Calls":^14}|{"Throughput (ops/s)":^20}|{"Time (µs)":^15}|{"Avg Time (µs)":^17}|{"CPU cycles (in billions)":^26}|{"Avg cycles (in billions)":^26}|"""
|
||||
echo &"""|{flags:^150}|{' '.repeat(14)}|{' '.repeat(20)}|{' '.repeat(15)}|{' '.repeat(17)}|{"indicative only":^26}|{"indicative only":^26}|"""
|
||||
echo lineSep
|
||||
for m in metrics:
|
||||
if m.numCalls == 0:
|
||||
@ -40,15 +40,15 @@ proc reportCli*(metrics: seq[Metadata], flags: string) =
|
||||
let throughput = 1e6 / avgTimeUs
|
||||
let cumulCyclesBillions = m.cumulatedCycles.float64 * 1e-9
|
||||
let avgCyclesBillions = cumulCyclesBillions / m.numCalls.float64
|
||||
echo &"""|{m.procName:<50}|{m.numCalls:>14}|{throughput:>20.3f}|{cumulTimeUs:>15.3f}|{avgTimeUs:>17.3f}|"""
|
||||
echo &"""|{m.procName:<150}|{m.numCalls:>14}|{throughput:>20.3f}|{cumulTimeUs:>15.3f}|{avgTimeUs:>17.3f}|"""
|
||||
echo lineSep
|
||||
|
||||
else:
|
||||
const lineSep = &"""|{'-'.repeat(50)}|{'-'.repeat(14)}|{'-'.repeat(20)}|{'-'.repeat(15)}|{'-'.repeat(17)}|"""
|
||||
echo "\n"
|
||||
echo lineSep
|
||||
echo &"""|{"Procedures":^50}|{"# of Calls":^14}|{"Throughput (ops/s)":^20}|{"Time (µs)":^15}|{"Avg Time (µs)":^17}|"""
|
||||
echo &"""|{flags:^50}|{' '.repeat(14)}|{' '.repeat(20)}|{' '.repeat(15)}|{' '.repeat(17)}|"""
|
||||
echo &"""|{"Procedures":^150}|{"# of Calls":^14}|{"Throughput (ops/s)":^20}|{"Time (µs)":^15}|{"Avg Time (µs)":^17}|"""
|
||||
echo &"""|{flags:^150}|{' '.repeat(14)}|{' '.repeat(20)}|{' '.repeat(15)}|{' '.repeat(17)}|"""
|
||||
echo lineSep
|
||||
for m in metrics:
|
||||
if m.numCalls == 0:
|
||||
@ -58,5 +58,5 @@ proc reportCli*(metrics: seq[Metadata], flags: string) =
|
||||
let cumulTimeUs = m.cumulatedTimeNs.float64 * 1e-3
|
||||
let avgTimeUs = cumulTimeUs / m.numCalls.float64
|
||||
let throughput = 1e6 / avgTimeUs
|
||||
echo &"""|{m.procName:<50}|{m.numCalls:>14}|{throughput:>20.3f}|{cumulTimeUs:>15.3f}|{avgTimeUs:>17.3f}|"""
|
||||
echo &"""|{m.procName:<150}|{m.numCalls:>14}|{throughput:>20.3f}|{cumulTimeUs:>15.3f}|{avgTimeUs:>17.3f}|"""
|
||||
echo lineSep
|
||||
|
||||
@ -57,7 +57,7 @@ var ctMetrics{.compileTime.}: seq[Metadata]
|
||||
## Unfortunately the "seq" is emptied when passing the compileTime/runtime boundaries
|
||||
## due to Nim bugs
|
||||
|
||||
when CttTrace:
|
||||
when CttMeter or CttTrace:
|
||||
# strformat doesn't work in templates.
|
||||
from strutils import alignLeft, formatFloat
|
||||
|
||||
@ -111,7 +111,7 @@ macro meterAnnotate(procAst: untyped): untyped =
|
||||
procAst.expectKind({nnkProcDef, nnkFuncDef})
|
||||
|
||||
let id = ctMetrics.len
|
||||
let name = procAst[0].repr
|
||||
let name = procAst[0].repr & procAst[3].repr
|
||||
# TODO, get the module and the package the proc is coming from
|
||||
# and the tag "Fp", "ec", "polynomial" ...
|
||||
|
||||
@ -123,6 +123,24 @@ macro meterAnnotate(procAst: untyped): untyped =
|
||||
newbody.add nnkDefer.newTree(getAst(fnExit(name, id, startTime, startCycle)))
|
||||
newBody.add procAst.body
|
||||
|
||||
if procAst[4].kind != nnkEmpty:
|
||||
# Timing procedures adds the TimeEffect tag, which interferes with {.tags:[VarTime].}
|
||||
# as TimeEffect is not listed. We drop the `tags` for metering
|
||||
var pragmas: NimNode
|
||||
if procAst[4].len == 1:
|
||||
if procAst[4][0].kind == nnkExprColonExpr and procAst[4][0][0].eqIdent"tags":
|
||||
pragmas = newEmptyNode()
|
||||
else:
|
||||
pragmas = procAst[4]
|
||||
else:
|
||||
pragmas = nnkPragma.newTree()
|
||||
for i in 0 ..< procAst[4].len:
|
||||
if procAst[4][0].kind == nnkExprColonExpr and procAst[4][0][0].eqIdent"tags":
|
||||
continue
|
||||
else:
|
||||
pragmas.add procAst[4][0]
|
||||
procAst[4] = pragmas
|
||||
|
||||
procAst.body = newBody
|
||||
result = procAst
|
||||
|
||||
|
||||
@ -63,10 +63,7 @@ def genAteParam(curve_name, curve_config):
|
||||
buf += ate_comment
|
||||
|
||||
ate_bits = int(ate_param).bit_length()
|
||||
naf_bits = int(3*ate_param).bit_length() - ate_bits
|
||||
|
||||
buf += f' # +{naf_bits} to bitlength so that we can mul by 3 for NAF encoding\n'
|
||||
buf += f' BigInt[{ate_bits}+{naf_bits}].fromHex"0x{Integer(abs(ate_param)).hex()}"\n\n'
|
||||
buf += f' BigInt[{ate_bits}].fromHex"0x{Integer(abs(ate_param)).hex()}"\n\n'
|
||||
|
||||
buf += f'const {curve_name}_pairing_ate_param_isNeg* = {"true" if ate_param < 0 else "false"}'
|
||||
|
||||
@ -198,7 +195,7 @@ def genFinalExp(curve_name, curve_config):
|
||||
scale = 3*(u^3-u^2+1)
|
||||
scaleDesc = ' * 3*(u^3-u^2+1)'
|
||||
|
||||
fexp = (pᵏ - 1)//r
|
||||
fexp = (p^k - 1)//r
|
||||
fexp *= scale
|
||||
|
||||
buf = f'const {curve_name}_pairing_finalexponent* = block:\n'
|
||||
|
||||
@ -81,7 +81,7 @@ def serialize_EC_Fp2(P):
|
||||
# Generator
|
||||
# ---------------------------------------------------------
|
||||
|
||||
def genScalarMulG1(curve_name, curve_config, count, seed):
|
||||
def genScalarMulG1(curve_name, curve_config, count, seed, scalarBits = None):
|
||||
p = curve_config[curve_name]['field']['modulus']
|
||||
r = curve_config[curve_name]['field']['order']
|
||||
form = curve_config[curve_name]['curve']['form']
|
||||
@ -109,13 +109,14 @@ def genScalarMulG1(curve_name, curve_config, count, seed):
|
||||
for i in progressbar(range(count)):
|
||||
v = {}
|
||||
P = G1.random_point()
|
||||
scalar = randrange(r)
|
||||
scalar = randrange(1 << scalarBits) if scalarBits else randrange(r)
|
||||
|
||||
P *= cofactor # clear cofactor
|
||||
Q = scalar * P
|
||||
|
||||
v['id'] = i
|
||||
v['P'] = serialize_EC_Fp(P)
|
||||
v['scalarBits'] = scalarBits if scalarBits else r.bit_length()
|
||||
v['scalar'] = serialize_bigint(scalar)
|
||||
v['Q'] = serialize_EC_Fp(Q)
|
||||
vectors.append(v)
|
||||
@ -123,7 +124,7 @@ def genScalarMulG1(curve_name, curve_config, count, seed):
|
||||
out['vectors'] = vectors
|
||||
return out
|
||||
|
||||
def genScalarMulG2(curve_name, curve_config, count, seed):
|
||||
def genScalarMulG2(curve_name, curve_config, count, seed, scalarBits = None):
|
||||
p = curve_config[curve_name]['field']['modulus']
|
||||
r = curve_config[curve_name]['field']['order']
|
||||
form = curve_config[curve_name]['curve']['form']
|
||||
@ -197,7 +198,7 @@ def genScalarMulG2(curve_name, curve_config, count, seed):
|
||||
for i in progressbar(range(count)):
|
||||
v = {}
|
||||
P = G2.random_point()
|
||||
scalar = randrange(r)
|
||||
scalar = randrange(1 << scalarBits) if scalarBits else randrange(r)
|
||||
|
||||
P *= cofactor # clear cofactor
|
||||
Q = scalar * P
|
||||
@ -205,10 +206,12 @@ def genScalarMulG2(curve_name, curve_config, count, seed):
|
||||
v['id'] = i
|
||||
if G2_field == 'Fp2':
|
||||
v['P'] = serialize_EC_Fp2(P)
|
||||
v['scalarBits'] = scalarBits if scalarBits else r.bit_length()
|
||||
v['scalar'] = serialize_bigint(scalar)
|
||||
v['Q'] = serialize_EC_Fp2(Q)
|
||||
elif G2_field == 'Fp':
|
||||
v['P'] = serialize_EC_Fp(P)
|
||||
v['scalarBits'] = scalarBits if scalarBits else r.bit_length()
|
||||
v['scalar'] = serialize_bigint(scalar)
|
||||
v['Q'] = serialize_EC_Fp(Q)
|
||||
vectors.append(v)
|
||||
@ -222,7 +225,7 @@ def genScalarMulG2(curve_name, curve_config, count, seed):
|
||||
if __name__ == "__main__":
|
||||
# Usage
|
||||
# BLS12-381
|
||||
# sage sage/derive_pairing.sage BLS12_381 G1
|
||||
# sage sage/testgen_scalar_mul.sage BLS12_381 G1 {scalarBits: optional int}
|
||||
|
||||
from argparse import ArgumentParser
|
||||
|
||||
@ -232,6 +235,9 @@ if __name__ == "__main__":
|
||||
|
||||
curve = args.curve[0]
|
||||
group = args.curve[1]
|
||||
scalarBits = None
|
||||
if len(args.curve) > 2:
|
||||
scalarBits = int(args.curve[2])
|
||||
|
||||
if curve not in Curves:
|
||||
raise ValueError(
|
||||
@ -245,16 +251,17 @@ if __name__ == "__main__":
|
||||
' is not a valid group, expected G1 or G2 instead'
|
||||
)
|
||||
else:
|
||||
print(f'\nGenerating test vectors tv_{curve}_scalar_mul_{group}.json')
|
||||
bits = scalarBits if scalarBits else Curves[curve]['field']['order'].bit_length()
|
||||
print(f'\nGenerating test vectors tv_{curve}_scalar_mul_{group}_{bits}bit.json')
|
||||
print('----------------------------------------------------\n')
|
||||
|
||||
count = 40
|
||||
seed = 1337
|
||||
|
||||
if group == 'G1':
|
||||
out = genScalarMulG1(curve, Curves, count, seed)
|
||||
out = genScalarMulG1(curve, Curves, count, seed, scalarBits)
|
||||
elif group == 'G2':
|
||||
out = genScalarMulG2(curve, Curves, count, seed)
|
||||
out = genScalarMulG2(curve, Curves, count, seed, scalarBits)
|
||||
|
||||
with open(f'tv_{curve}_scalar_mul_{group}.json', 'w') as f:
|
||||
with open(f'tv_{curve}_scalar_mul_{group}_{bits}bits.json', 'w') as f:
|
||||
json.dump(out, f, indent=2)
|
||||
|
||||
@ -1,71 +0,0 @@
|
||||
# Constantine
|
||||
# Copyright (c) 2018-2019 Status Research & Development GmbH
|
||||
# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
|
||||
# Licensed and distributed under either of
|
||||
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
|
||||
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
|
||||
# at your option. This file may not be copied, modified, or distributed except according to those terms.
|
||||
|
||||
import
|
||||
# Internals
|
||||
../../../constantine/math/arithmetic,
|
||||
../../../constantine/math/io/io_bigints
|
||||
|
||||
# Support files for testing Elliptic Curve arithmetic
|
||||
# ------------------------------------------------------------------------------
|
||||
|
||||
iterator unpack(scalarByte: byte): bool =
|
||||
yield bool((scalarByte and 0b10000000) shr 7)
|
||||
yield bool((scalarByte and 0b01000000) shr 6)
|
||||
yield bool((scalarByte and 0b00100000) shr 5)
|
||||
yield bool((scalarByte and 0b00010000) shr 4)
|
||||
yield bool((scalarByte and 0b00001000) shr 3)
|
||||
yield bool((scalarByte and 0b00000100) shr 2)
|
||||
yield bool((scalarByte and 0b00000010) shr 1)
|
||||
yield bool( scalarByte and 0b00000001)
|
||||
|
||||
func unsafe_ECmul_double_add*[EC](
|
||||
P: var EC,
|
||||
scalar: BigInt,
|
||||
) =
|
||||
## **Unsafe** Elliptic Curve Scalar Multiplication
|
||||
##
|
||||
## P <- [k] P
|
||||
##
|
||||
## This uses the double-and-add algorithm to verify the constant-time production implementation
|
||||
## This is UNSAFE to use in production and only intended for testing purposes.
|
||||
##
|
||||
## This is highly VULNERABLE to timing attacks and power analysis attacks
|
||||
var scalarCanonical: array[(scalar.bits+7) div 8, byte]
|
||||
scalarCanonical.marshal(scalar, bigEndian)
|
||||
|
||||
var t0: typeof(P)
|
||||
t0.setInf()
|
||||
for scalarByte in scalarCanonical:
|
||||
for bit in unpack(scalarByte):
|
||||
t0.double()
|
||||
if bit:
|
||||
t0 += P
|
||||
P = t0
|
||||
|
||||
func unsafe_ECmul_minHammingWeight*[EC](
|
||||
P: var EC,
|
||||
scalar: BigInt) =
|
||||
## **Unsafe** Elliptic Curve Scalar Multiplication
|
||||
##
|
||||
## P <- [k] P
|
||||
##
|
||||
## This uses an online recoding with minimum Hamming Weight
|
||||
## (which is not NAF, NAF is least-significant bit to most)
|
||||
## This is UNSAFE to use in production and only intended for testing purposes.
|
||||
##
|
||||
## This is highly VULNERABLE to timing attacks and power analysis attacks
|
||||
var t0{.noInit.}: typeof(P)
|
||||
t0.setInf()
|
||||
for bit in recoding_l2r_vartime(scalar):
|
||||
t0.double()
|
||||
if bit == 1:
|
||||
t0 += P
|
||||
elif bit == -1:
|
||||
t0 -= P
|
||||
P = t0
|
||||
@ -19,7 +19,7 @@ import
|
||||
echo "\n------------------------------------------------------\n"
|
||||
|
||||
proc mainArith() =
|
||||
suite "isZero" & " [" & $WordBitWidth & "-bit mode]":
|
||||
suite "isZero" & " [" & $WordBitWidth & "-bit words]":
|
||||
test "isZero for zero":
|
||||
var x: BigInt[128]
|
||||
check: x.isZero().bool
|
||||
@ -49,7 +49,7 @@ proc mainArith() =
|
||||
check: static(not x.isZero().bool)
|
||||
|
||||
|
||||
suite "Arithmetic operations - Addition" & " [" & $WordBitWidth & "-bit mode]":
|
||||
suite "Arithmetic operations - Addition" & " [" & $WordBitWidth & "-bit words]":
|
||||
test "Adding 2 zeros":
|
||||
var a = fromHex(BigInt[128], "0x00000000000000000000000000000000")
|
||||
let b = fromHex(BigInt[128], "0x00000000000000000000000000000000")
|
||||
@ -149,7 +149,7 @@ proc mainArith() =
|
||||
bool(a == c)
|
||||
not bool(carry)
|
||||
|
||||
suite "BigInt + SecretWord" & " [" & $WordBitWidth & "-bit mode]":
|
||||
suite "BigInt + SecretWord" & " [" & $WordBitWidth & "-bit words]":
|
||||
test "Addition limbs carry":
|
||||
block: # P256 / 2
|
||||
var a = BigInt[256].fromhex"0x7fffffff800000008000000000000000000000007fffffffffffffffffffffff"
|
||||
@ -160,7 +160,7 @@ proc mainArith() =
|
||||
check: bool(a == expected)
|
||||
|
||||
proc mainMul() =
|
||||
suite "Multi-precision multiplication" & " [" & $WordBitWidth & "-bit mode]":
|
||||
suite "Multi-precision multiplication" & " [" & $WordBitWidth & "-bit words]":
|
||||
test "Same size operand into double size result":
|
||||
block:
|
||||
var r = canary(BigInt[256])
|
||||
@ -201,7 +201,7 @@ proc mainMul() =
|
||||
check: bool(r == expected)
|
||||
|
||||
proc mainMulHigh() =
|
||||
suite "Multi-precision multiplication keeping only high words" & " [" & $WordBitWidth & "-bit mode]":
|
||||
suite "Multi-precision multiplication keeping only high words" & " [" & $WordBitWidth & "-bit words]":
|
||||
test "Same size operand into double size result - discard first word":
|
||||
block:
|
||||
var r = canary(BigInt[256])
|
||||
@ -287,7 +287,7 @@ proc mainMulHigh() =
|
||||
check: bool(r == expected)
|
||||
|
||||
proc mainSquare() =
|
||||
suite "Multi-precision multiplication" & " [" & $WordBitWidth & "-bit mode]":
|
||||
suite "Multi-precision multiplication" & " [" & $WordBitWidth & "-bit words]":
|
||||
test "Squaring is consistent with multiplication (rBits = 2*aBits)":
|
||||
block:
|
||||
let a = BigInt[200].fromHex"0xDEADBEEFDEADBEEFDEADBEEFDEADBEEFDEADBEEFDEADBEEFDE"
|
||||
@ -309,7 +309,7 @@ proc mainSquare() =
|
||||
check: bool(rmul == rsqr)
|
||||
|
||||
proc mainModular() =
|
||||
suite "Modular operations - small modulus" & " [" & $WordBitWidth & "-bit mode]":
|
||||
suite "Modular operations - small modulus" & " [" & $WordBitWidth & "-bit words]":
|
||||
# Vectors taken from Stint - https://github.com/status-im/nim-stint
|
||||
test "100 mod 13":
|
||||
# Test 1 word and more than 1 word
|
||||
@ -368,7 +368,7 @@ proc mainModular() =
|
||||
"\n r (low-level repr): " & $r &
|
||||
"\n expected (ll repr): " & $expected
|
||||
|
||||
suite "Modular operations - small modulus - Stint specific failures highlighted by property-based testing" & " [" & $WordBitWidth & "-bit mode]":
|
||||
suite "Modular operations - small modulus - Stint specific failures highlighted by property-based testing" & " [" & $WordBitWidth & "-bit words]":
|
||||
# Vectors taken from Stint - https://github.com/status-im/nim-stint
|
||||
test "Modulo: 65696211516342324 mod 174261910798982":
|
||||
let u = 65696211516342324'u64
|
||||
@ -401,7 +401,7 @@ proc mainModular() =
|
||||
"\n expected (ll repr): " & $expected
|
||||
|
||||
proc mainNeg() =
|
||||
suite "Conditional negation" & " [" & $WordBitWidth & "-bit mode]":
|
||||
suite "Conditional negation" & " [" & $WordBitWidth & "-bit words]":
|
||||
test "Conditional negation":
|
||||
block:
|
||||
var a = fromHex(BigInt[128], "0x12345678FF11FFAA00321321CAFECAFE")
|
||||
@ -499,7 +499,7 @@ proc mainNeg() =
|
||||
bool(b == b2)
|
||||
|
||||
proc mainCopySwap() =
|
||||
suite "Copy and Swap" & " [" & $WordBitWidth & "-bit mode]":
|
||||
suite "Copy and Swap" & " [" & $WordBitWidth & "-bit words]":
|
||||
test "Conditional copy":
|
||||
block:
|
||||
var a = fromHex(BigInt[128], "0x12345678FF11FFAA00321321CAFECAFE")
|
||||
@ -545,7 +545,7 @@ proc mainCopySwap() =
|
||||
bool(eB == b)
|
||||
|
||||
proc mainModularInverse() =
|
||||
suite "Modular Inverse (with odd modulus)" & " [" & $WordBitWidth & "-bit mode]":
|
||||
suite "Modular Inverse (with odd modulus)" & " [" & $WordBitWidth & "-bit words]":
|
||||
# Note: We don't define multi-precision multiplication
|
||||
# because who needs it when you have Montgomery?
|
||||
# ¯\(ツ)/¯
|
||||
@ -556,10 +556,14 @@ proc mainModularInverse() =
|
||||
|
||||
let expected = BigInt[16].fromUint(1969'u16)
|
||||
var r = canary(BigInt[16])
|
||||
var r2 = canary(BigInt[16])
|
||||
|
||||
r.invmod(a, M)
|
||||
r2.invmod_vartime(a, M)
|
||||
|
||||
check: bool(r == expected)
|
||||
check:
|
||||
bool(r == expected)
|
||||
bool(r2 == expected)
|
||||
|
||||
block: # huge int
|
||||
let a = BigInt[381].fromUint(42'u16)
|
||||
@ -567,10 +571,14 @@ proc mainModularInverse() =
|
||||
|
||||
let expected = BigInt[381].fromUint(1969'u16)
|
||||
var r = canary(BigInt[381])
|
||||
var r2 = canary(BigInt[381])
|
||||
|
||||
r.invmod(a, M)
|
||||
r2.invmod_vartime(a, M)
|
||||
|
||||
check: bool(r == expected)
|
||||
check:
|
||||
bool(r == expected)
|
||||
bool(r2 == expected)
|
||||
|
||||
test "271^-1 (mod 383) = 106":
|
||||
block: # small int
|
||||
@ -579,10 +587,14 @@ proc mainModularInverse() =
|
||||
|
||||
let expected = BigInt[16].fromUint(106'u16)
|
||||
var r = canary(BigInt[16])
|
||||
var r2 = canary(BigInt[16])
|
||||
|
||||
r.invmod(a, M)
|
||||
r2.invmod_vartime(a, M)
|
||||
|
||||
check: bool(r == expected)
|
||||
check:
|
||||
bool(r == expected)
|
||||
bool(r2 == expected)
|
||||
|
||||
block: # huge int
|
||||
let a = BigInt[381].fromUint(271'u16)
|
||||
@ -590,10 +602,14 @@ proc mainModularInverse() =
|
||||
|
||||
let expected = BigInt[381].fromUint(106'u16)
|
||||
var r = canary(BigInt[381])
|
||||
var r2 = canary(BigInt[381])
|
||||
|
||||
r.invmod(a, M)
|
||||
r2.invmod_vartime(a, M)
|
||||
|
||||
check: bool(r == expected)
|
||||
check:
|
||||
bool(r == expected)
|
||||
bool(r2 == expected)
|
||||
|
||||
test "BN254_Modulus^-1 (mod BLS12_381)":
|
||||
let a = BigInt[381].fromHex("0x30644e72e131a029b85045b68181585d97816a916871ca8d3c208c16d87cfd47")
|
||||
@ -602,9 +618,14 @@ proc mainModularInverse() =
|
||||
let expected = BigInt[381].fromHex("0x0636759a0f3034fa47174b2c0334902f11e9915b7bd89c6a2b3082b109abbc9837da17201f6d8286fe6203caa1b9d4c8")
|
||||
|
||||
var r = canary(BigInt[381])
|
||||
r.invmod(a, M)
|
||||
var r2 = canary(BigInt[381])
|
||||
|
||||
check: bool(r == expected)
|
||||
r.invmod(a, M)
|
||||
r2.invmod_vartime(a, M)
|
||||
|
||||
check:
|
||||
bool(r == expected)
|
||||
bool(r2 == expected)
|
||||
|
||||
test "0^-1 (mod any) = 0 (need for tower of extension fields)":
|
||||
block:
|
||||
@ -613,10 +634,14 @@ proc mainModularInverse() =
|
||||
|
||||
let expected = BigInt[16].fromUint(0'u16)
|
||||
var r = canary(BigInt[16])
|
||||
var r2 = canary(BigInt[16])
|
||||
|
||||
r.invmod(a, M)
|
||||
r2.invmod_vartime(a, M)
|
||||
|
||||
check: bool(r == expected)
|
||||
check:
|
||||
bool(r == expected)
|
||||
bool(r2 == expected)
|
||||
|
||||
block:
|
||||
let a = BigInt[381].fromUint(0'u16)
|
||||
@ -624,10 +649,14 @@ proc mainModularInverse() =
|
||||
|
||||
let expected = BigInt[381].fromUint(0'u16)
|
||||
var r = canary(BigInt[381])
|
||||
var r2 = canary(BigInt[381])
|
||||
|
||||
r.invmod(a, M)
|
||||
r2.invmod_vartime(a, M)
|
||||
|
||||
check: bool(r == expected)
|
||||
check:
|
||||
bool(r == expected)
|
||||
bool(r2 == expected)
|
||||
|
||||
mainArith()
|
||||
mainMul()
|
||||
|
||||
@ -17,7 +17,7 @@ import
|
||||
echo "\n------------------------------------------------------\n"
|
||||
|
||||
proc main() =
|
||||
suite "Bigints - Multiprecision modulo" & " [" & $WordBitWidth & "-bit mode]":
|
||||
suite "Bigints - Multiprecision modulo" & " [" & $WordBitWidth & "-bit words]":
|
||||
test "bitsize 237 mod bitsize 192":
|
||||
let a = BigInt[237].fromHex("0x123456789012345678901234567890123456789012345678901234567890")
|
||||
let m = BigInt[192].fromHex("0xAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAB")
|
||||
|
||||
@ -49,7 +49,7 @@ proc test(
|
||||
R.frobenius_psi(P)
|
||||
doAssert: bool(R == Q)
|
||||
|
||||
suite "ψ (Psi) - Untwist-Frobenius-Twist Endomorphism on G2 vs SageMath" & " [" & $WordBitWidth & "-bit mode]":
|
||||
suite "ψ (Psi) - Untwist-Frobenius-Twist Endomorphism on G2 vs SageMath" & " [" & $WordBitWidth & "-bit words]":
|
||||
# Generated via
|
||||
# - sage sage/frobenius_bn254_snarks.sage
|
||||
# - sage sage/frobenius_bls12_377.sage
|
||||
@ -214,7 +214,7 @@ suite "ψ (Psi) - Untwist-Frobenius-Twist Endomorphism on G2 vs SageMath" & " ["
|
||||
Qy1 = "77ef6850d4a8f181a10196398cd344011a44c50dce00e18578f3526301263492086d44c7c3d1db5b12499b4033116e1"
|
||||
)
|
||||
|
||||
suite "ψ - psi(psi(P)) == psi2(P) - (Untwist-Frobenius-Twist Endomorphism)" & " [" & $WordBitWidth & "-bit mode]":
|
||||
suite "ψ - psi(psi(P)) == psi2(P) - (Untwist-Frobenius-Twist Endomorphism)" & " [" & $WordBitWidth & "-bit words]":
|
||||
const Iters = 8
|
||||
proc test(EC: typedesc, randZ: static bool, gen: static RandomGen) =
|
||||
for i in 0 ..< Iters:
|
||||
@ -247,7 +247,7 @@ suite "ψ - psi(psi(P)) == psi2(P) - (Untwist-Frobenius-Twist Endomorphism)" & "
|
||||
testAll(ECP_ShortW_Prj[Fp2[BLS12_381], G2])
|
||||
testAll(ECP_ShortW_Prj[Fp[BW6_761], G2])
|
||||
|
||||
suite "ψ²(P) - [t]ψ(P) + [p]P = Inf" & " [" & $WordBitWidth & "-bit mode]":
|
||||
suite "ψ²(P) - [t]ψ(P) + [p]P = Inf" & " [" & $WordBitWidth & "-bit words]":
|
||||
const Iters = 10
|
||||
proc trace(C: static Curve): auto =
|
||||
# Returns (abs(trace), isNegativeSign)
|
||||
@ -314,7 +314,7 @@ suite "ψ²(P) - [t]ψ(P) + [p]P = Inf" & " [" & $WordBitWidth & "-bit mode]":
|
||||
testAll(ECP_ShortW_Prj[Fp2[BLS12_381], G2])
|
||||
testAll(ECP_ShortW_Prj[Fp[BW6_761], G2])
|
||||
|
||||
suite "ψ⁴(P) - ψ²(P) + P = Inf (k-th cyclotomic polynomial with embedding degree k=12)" & " [" & $WordBitWidth & "-bit mode]":
|
||||
suite "ψ⁴(P) - ψ²(P) + P = Inf (k-th cyclotomic polynomial with embedding degree k=12)" & " [" & $WordBitWidth & "-bit words]":
|
||||
const Iters = 10
|
||||
|
||||
proc test(EC: typedesc, randZ: static bool, gen: static RandomGen) =
|
||||
@ -344,7 +344,7 @@ suite "ψ⁴(P) - ψ²(P) + P = Inf (k-th cyclotomic polynomial with embedding d
|
||||
testAll(ECP_ShortW_Prj[Fp2[BLS12_377], G2])
|
||||
testAll(ECP_ShortW_Prj[Fp2[BLS12_381], G2])
|
||||
|
||||
suite "ψ²(P) - ψ(P) + P = Inf (k-th cyclotomic polynomial with embedding degree k=6)" & " [" & $WordBitWidth & "-bit mode]":
|
||||
suite "ψ²(P) - ψ(P) + P = Inf (k-th cyclotomic polynomial with embedding degree k=6)" & " [" & $WordBitWidth & "-bit words]":
|
||||
const Iters = 10
|
||||
|
||||
proc test(EC: typedesc, randZ: static bool, gen: static RandomGen) =
|
||||
|
||||
@ -15,22 +15,23 @@ import
|
||||
# Test utilities
|
||||
./t_ec_sage_template
|
||||
|
||||
staticFor(bits, [32, 64, 128, BLS12_377.getCurveOrderBitwidth()]):
|
||||
run_scalar_mul_test_vs_sage(
|
||||
ECP_ShortW_Prj[Fp[BLS12_377], G1],
|
||||
ECP_ShortW_Prj[Fp[BLS12_377], G1], bits,
|
||||
"t_ec_sage_bls12_377_g1_projective"
|
||||
)
|
||||
|
||||
run_scalar_mul_test_vs_sage(
|
||||
ECP_ShortW_Jac[Fp[BLS12_377], G1],
|
||||
ECP_ShortW_Jac[Fp[BLS12_377], G1], bits,
|
||||
"t_ec_sage_bls12_377_g1_jacobian"
|
||||
)
|
||||
|
||||
run_scalar_mul_test_vs_sage(
|
||||
ECP_ShortW_Prj[Fp2[BLS12_377], G2],
|
||||
ECP_ShortW_Prj[Fp2[BLS12_377], G2], bits,
|
||||
"t_ec_sage_bls12_377_g2_projective"
|
||||
)
|
||||
|
||||
run_scalar_mul_test_vs_sage(
|
||||
ECP_ShortW_Jac[Fp2[BLS12_377], G2],
|
||||
ECP_ShortW_Jac[Fp2[BLS12_377], G2], bits,
|
||||
"t_ec_sage_bls12_377_g2_jacobian"
|
||||
)
|
||||
|
||||
@ -15,22 +15,23 @@ import
|
||||
# Test utilities
|
||||
./t_ec_sage_template
|
||||
|
||||
staticFor(bits, [32, 64, 128, BLS12_381.getCurveOrderBitwidth()]):
|
||||
run_scalar_mul_test_vs_sage(
|
||||
ECP_ShortW_Prj[Fp[BLS12_381], G1],
|
||||
ECP_ShortW_Prj[Fp[BLS12_381], G1], bits,
|
||||
"t_ec_sage_bls12_381_g1_projective"
|
||||
)
|
||||
|
||||
run_scalar_mul_test_vs_sage(
|
||||
ECP_ShortW_Jac[Fp[BLS12_381], G1],
|
||||
ECP_ShortW_Jac[Fp[BLS12_381], G1], bits,
|
||||
"t_ec_sage_bls12_381_g1_jacobian"
|
||||
)
|
||||
|
||||
run_scalar_mul_test_vs_sage(
|
||||
ECP_ShortW_Prj[Fp2[BLS12_381], G2],
|
||||
ECP_ShortW_Prj[Fp2[BLS12_381], G2], bits,
|
||||
"t_ec_sage_bls12_381_g2_projective"
|
||||
)
|
||||
|
||||
run_scalar_mul_test_vs_sage(
|
||||
ECP_ShortW_Jac[Fp2[BLS12_381], G2],
|
||||
ECP_ShortW_Jac[Fp2[BLS12_381], G2], bits,
|
||||
"t_ec_sage_bls12_381_g2_jacobian"
|
||||
)
|
||||
|
||||
@ -15,22 +15,23 @@ import
|
||||
# Test utilities
|
||||
./t_ec_sage_template
|
||||
|
||||
staticFor(bits, [BN254_Nogami.getCurveOrderBitwidth()]):
|
||||
run_scalar_mul_test_vs_sage(
|
||||
ECP_ShortW_Prj[Fp[BN254_Nogami], G1],
|
||||
ECP_ShortW_Prj[Fp[BN254_Nogami], G1], bits,
|
||||
"t_ec_sage_bn254_nogami_g1_projective"
|
||||
)
|
||||
|
||||
run_scalar_mul_test_vs_sage(
|
||||
ECP_ShortW_Jac[Fp[BN254_Nogami], G1],
|
||||
ECP_ShortW_Jac[Fp[BN254_Nogami], G1], bits,
|
||||
"t_ec_sage_bn254_nogami_g1_jacobian"
|
||||
)
|
||||
|
||||
run_scalar_mul_test_vs_sage(
|
||||
ECP_ShortW_Prj[Fp2[BN254_Nogami], G2],
|
||||
ECP_ShortW_Prj[Fp2[BN254_Nogami], G2], bits,
|
||||
"t_ec_sage_bn254_nogami_g2_projective"
|
||||
)
|
||||
|
||||
run_scalar_mul_test_vs_sage(
|
||||
ECP_ShortW_Jac[Fp2[BN254_Nogami], G2],
|
||||
ECP_ShortW_Jac[Fp2[BN254_Nogami], G2], bits,
|
||||
"t_ec_sage_bn254_nogami_g2_jacobian"
|
||||
)
|
||||
|
||||
@ -15,22 +15,23 @@ import
|
||||
# Test utilities
|
||||
./t_ec_sage_template
|
||||
|
||||
staticFor(bits, [32, 64, 128, BN254_Snarks.getCurveOrderBitwidth()]):
|
||||
run_scalar_mul_test_vs_sage(
|
||||
ECP_ShortW_Prj[Fp[BN254_Snarks], G1],
|
||||
ECP_ShortW_Prj[Fp[BN254_Snarks], G1], bits,
|
||||
"t_ec_sage_bn254_snarks_g1_projective"
|
||||
)
|
||||
|
||||
run_scalar_mul_test_vs_sage(
|
||||
ECP_ShortW_Jac[Fp[BN254_Snarks], G1],
|
||||
ECP_ShortW_Jac[Fp[BN254_Snarks], G1], bits,
|
||||
"t_ec_sage_bn254_snarks_g1_jacobian"
|
||||
)
|
||||
|
||||
run_scalar_mul_test_vs_sage(
|
||||
ECP_ShortW_Prj[Fp2[BN254_Snarks], G2],
|
||||
ECP_ShortW_Prj[Fp2[BN254_Snarks], G2], bits,
|
||||
"t_ec_sage_bn254_snarks_g2_projective"
|
||||
)
|
||||
|
||||
run_scalar_mul_test_vs_sage(
|
||||
ECP_ShortW_Jac[Fp2[BN254_Snarks], G2],
|
||||
ECP_ShortW_Jac[Fp2[BN254_Snarks], G2], bits,
|
||||
"t_ec_sage_bn254_snarks_g2_jacobian"
|
||||
)
|
||||
|
||||
@ -20,22 +20,23 @@ import
|
||||
# this creates bad codegen, in the C code, the `value`parameter gets the wrong type
|
||||
# TODO: upstream
|
||||
|
||||
staticFor(bits, [BW6_761.getCurveOrderBitwidth()]):
|
||||
run_scalar_mul_test_vs_sage(
|
||||
ECP_ShortW_Prj[Fp[BW6_761], G1],
|
||||
ECP_ShortW_Prj[Fp[BW6_761], G1], bits,
|
||||
"t_ec_sage_bw6_761_g1_projective"
|
||||
)
|
||||
|
||||
run_scalar_mul_test_vs_sage(
|
||||
ECP_ShortW_Jac[Fp[BW6_761], G1],
|
||||
ECP_ShortW_Jac[Fp[BW6_761], G1], bits,
|
||||
"t_ec_sage_bw6_761_g1_jacobian"
|
||||
)
|
||||
|
||||
# run_scalar_mul_test_vs_sage(
|
||||
# ECP_ShortW_Prj[Fp[BW6_761], G2],
|
||||
# ECP_ShortW_Prj[Fp[BW6_761], G2], bits,
|
||||
# "t_ec_sage_bw6_761_g2_projective"
|
||||
# )
|
||||
|
||||
# run_scalar_mul_test_vs_sage(
|
||||
# ECP_ShortW_Jac[Fp[BW6_761], G2],
|
||||
# ECP_ShortW_Jac[Fp[BW6_761], G2], bits,
|
||||
# "t_ec_sage_bw6_761_g2_jacobian"
|
||||
# )
|
||||
|
||||
@ -20,22 +20,23 @@ import
|
||||
# this creates bad codegen, in the C code, the `value`parameter gets the wrong type
|
||||
# TODO: upstream
|
||||
|
||||
staticFor(bits, [BW6_761.getCurveOrderBitwidth()]):
|
||||
# run_scalar_mul_test_vs_sage(
|
||||
# ECP_ShortW_Prj[Fp[BW6_761], G1],
|
||||
# ECP_ShortW_Prj[Fp[BW6_761], G1], bits,
|
||||
# "t_ec_sage_bw6_761_g1_projective"
|
||||
# )
|
||||
|
||||
# run_scalar_mul_test_vs_sage(
|
||||
# ECP_ShortW_Jac[Fp[BW6_761], G1],
|
||||
# ECP_ShortW_Jac[Fp[BW6_761], G1], bits,
|
||||
# "t_ec_sage_bw6_761_g1_jacobian"
|
||||
# )
|
||||
|
||||
run_scalar_mul_test_vs_sage(
|
||||
ECP_ShortW_Prj[Fp[BW6_761], G2],
|
||||
ECP_ShortW_Prj[Fp[BW6_761], G2], bits,
|
||||
"t_ec_sage_bw6_761_g2_projective"
|
||||
)
|
||||
|
||||
run_scalar_mul_test_vs_sage(
|
||||
ECP_ShortW_Jac[Fp[BW6_761], G2],
|
||||
ECP_ShortW_Jac[Fp[BW6_761], G2], bits,
|
||||
"t_ec_sage_bw6_761_g2_jacobian"
|
||||
)
|
||||
|
||||
@ -15,12 +15,13 @@ import
  # Test utilities
  ./t_ec_sage_template

staticFor(bits, [Pallas.getCurveOrderBitwidth()]):
  run_scalar_mul_test_vs_sage(
    ECP_ShortW_Prj[Fp[Pallas], G1],
    ECP_ShortW_Prj[Fp[Pallas], G1], bits,
    "t_ec_sage_pallas_g1_projective"
  )

  run_scalar_mul_test_vs_sage(
    ECP_ShortW_Jac[Fp[Pallas], G1],
    ECP_ShortW_Jac[Fp[Pallas], G1], bits,
    "t_ec_sage_pallas_g1_jacobian"
  )
@ -13,7 +13,6 @@ import
  pkg/jsony,
  # Internals
  ../../constantine/platforms/abstractions,
  ../../constantine/math/config/curves,
  ../../constantine/math/[arithmetic, extension_fields],
  ../../constantine/math/io/[io_bigints, io_ec],
  ../../constantine/math/elliptic/[
@ -22,72 +21,21 @@ import
    ec_shortweierstrass_jacobian,
    ec_scalar_mul,
    ec_endomorphism_accel],
  ../../constantine/math/constants/zoo_endomorphisms,
  # Test utilities
  ./support/ec_reference_scalar_mult
  ../../constantine/math/elliptic/ec_scalar_mul_vartime

export unittest, abstractions, arithmetic # Generic sandwich

# Serialization
# --------------------------------------------------------------------------

macro matchingScalar*(EC: type ECP_ShortW_Aff): untyped =
  ## Workaround the annoying type system
  ## 1. Higher-kinded type
  ## 2. Computation in type section needs template or macro indirection
  ## 3. Converting NimNode to typedesc
  ## https://github.com/nim-lang/Nim/issues/6785
  # BigInt[EC.F.C.getCurveOrderBitwidth()]

  let ec = EC.getTypeImpl()
  # echo ec.treerepr
  # BracketExpr
  #   Sym "typeDesc"
  #   BracketExpr
  #     Sym "ECP_ShortW_Aff"
  #     BracketExpr
  #       Sym "Fp"
  #       IntLit 12
  #     IntLit 0

  doAssert ec[0].eqIdent"typedesc"
  doAssert ec[1][0].eqIdent"ECP_ShortW_Aff"
  ec[1][1].expectkind(nnkBracketExpr)
  doAssert ($ec[1][1][0]).startsWith"Fp"

  let curve = Curve(ec[1][1][1].intVal)
  let bitwidth = getAST(getCurveOrderBitwidth(curve))
  result = nnkBracketExpr.newTree(
    bindSym"BigInt",
    bitwidth
  )

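(Editor's illustration, not part of the diff.) The macro above resolves the scalar width from the curve order bitwidth, so for instance a BN254_Snarks G1 affine point gets a BigInt[254] scalar. A minimal check of that assumption, reusing this module's imports:

  static: doAssert BN254_Snarks.getCurveOrderBitwidth() == 254
  # hence matchingScalar(ECP_ShortW_Aff[Fp[BN254_Snarks], G1]) is BigInt[254]
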
macro matchingNonResidueType*(EC: type ECP_ShortW_Aff): untyped =
  ## Workaround the annoying type system
  ## 1. Higher-kinded type
  ## 2. Computation in type section needs template or macro indirection
  ## 3. Converting NimNode to typedesc
  ## https://github.com/nim-lang/Nim/issues/6785
  let ec = EC.getTypeImpl()
  doAssert ec[0].eqIdent"typedesc"
  doAssert ec[1][0].eqIdent"ECP_ShortW_Aff"
  ec[1][1].expectkind(nnkBracketExpr)
  doAssert ($ec[1][1][0]).startsWith"Fp"

  # int or array[2, int]
  if ec[1][1][0].eqIdent"Fp":
    result = bindSym"int"
  elif ec[1][1][0].eqIdent"Fp2":
    result = nnkBracketExpr.newTree(
      bindSym"array",
      newLit 2,
      bindSym"int"
    )

type
  TestVector*[EC: ECP_ShortW_Aff] = object
  TestVector*[EC: ECP_ShortW_Aff, bits: static int] = object
    id: int
    P: EC
    scalar: matchingScalar(EC)
    scalarBits: int
    scalar: BigInt[bits]
    Q: EC

  EC_G1_hex = object
@ -102,7 +50,7 @@ type
    x: Fp2_hex
    y: Fp2_hex

  ScalarMulTestG1[EC: ECP_ShortW_Aff] = object
  ScalarMulTestG1[EC: ECP_ShortW_Aff, bits: static int] = object
    curve: string
    group: string
    modulus: string
@ -112,9 +60,9 @@ type
    a: string
    b: string
    # vectors ------------------
    vectors: seq[TestVector[EC]]
    vectors: seq[TestVector[EC, bits]]

  ScalarMulTestG2[EC: ECP_ShortW_Aff] = object
  ScalarMulTestG2[EC: ECP_ShortW_Aff, bits: static int] = object
    curve: string
    group: string
    modulus: string
@ -128,9 +76,12 @@ type
    twist: string
    non_residue_fp: int
    G2_field: string
    non_residue_twist: matchingNonResidueType(EC) # int or array[2, int]
    when EC.F is Fp:
      non_residue_twist: int
    else:
      non_residue_twist: array[2, int]
    # vectors ------------------
    vectors: seq[TestVector[EC]]
    vectors: seq[TestVector[EC, bits]]

const
  TestVectorsDir* =
@ -170,7 +121,8 @@ proc parseHook*(src: string, pos: var int, value: var ECP_ShortW_Aff) =
proc loadVectors(TestType: typedesc): TestType =
  const group = when TestType.EC.G == G1: "G1"
                else: "G2"
  const filename = "tv_" & $TestType.EC.F.C & "_scalar_mul_" & group & ".json"
  const filename = "tv_" & $TestType.EC.F.C & "_scalar_mul_" & group & "_" & $TestType.bits & "bit.json"
  echo "Loading: ", filename
  let content = readFile(TestVectorsDir/filename)
  result = content.fromJson(TestType)

@ -178,7 +130,7 @@ proc loadVectors(TestType: typedesc): TestType =
# ------------------------------------------------------------------------

proc run_scalar_mul_test_vs_sage*(
    EC: typedesc,
    EC: typedesc, bits: static int,
    moduleName: string
  ) =
  echo "\n------------------------------------------------------\n"
@ -186,38 +138,48 @@ proc run_scalar_mul_test_vs_sage*(

  when EC.G == G1:
    const G1_or_G2 = "G1"
    let vec = loadVectors(ScalarMulTestG1[ECP_ShortW_Aff[EC.F, EC.G]])
    let vec = loadVectors(ScalarMulTestG1[ECP_ShortW_Aff[EC.F, EC.G], bits])
  else:
    const G1_or_G2 = "G2"
    let vec = loadVectors(ScalarMulTestG2[ECP_ShortW_Aff[EC.F, EC.G]])
    let vec = loadVectors(ScalarMulTestG2[ECP_ShortW_Aff[EC.F, EC.G], bits])

  const coord = when EC is ECP_ShortW_Prj: " Projective coordinates "
                elif EC is ECP_ShortW_Jac: " Jacobian coordinates "

  const testSuiteDesc = "Scalar Multiplication " & $EC.F.C & " " & G1_or_G2 & " vs SageMath"
  const testSuiteDesc = "Scalar Multiplication " & $EC.F.C & " " & G1_or_G2 & " vs SageMath - " & $bits & "-bit scalar"

  suite testSuiteDesc & " [" & $WordBitWidth & "-bit mode]":
  suite testSuiteDesc & " [" & $WordBitWidth & "-bit words]":
    for i in 0 ..< vec.vectors.len:
      test "test " & $vec.vectors[i].id & " - " & $EC:
      test "test " & $vec.vectors[i].id & " - " & $EC & " - " & $bits & "-bit scalar":
        var
          P {.noInit.}: EC
          Q {.noInit.}: EC
          impl {.noInit.}: EC
          reference {.noInit.}: EC
          endo {.noInit.}: EC
          refMinWeight {.noInit.}: EC

        P.fromAffine(vec.vectors[i].P)
        Q.fromAffine(vec.vectors[i].Q)
        impl = P
        reference = P
        endo = P
        refMinWeight = P

        impl.scalarMulGeneric(vec.vectors[i].scalar)
        reference.unsafe_ECmul_double_add(vec.vectors[i].scalar)
        endo.scalarMulEndo(vec.vectors[i].scalar)
        reference.scalarMul_doubleAdd_vartime(vec.vectors[i].scalar)
        refMinWeight.scalarMul_minHammingWeight_vartime(vec.vectors[i].scalar)

        doAssert: bool(Q == reference)
        doAssert: bool(Q == impl)
        doAssert: bool(Q == refMinWeight)

        staticFor w, 2, 14:
          var refWNAF = P
          refWNAF.scalarMul_minHammingWeight_windowed_vartime(vec.vectors[i].scalar, window = w)
          check: bool(impl == refWNAF)

        when bits >= EndomorphismThreshold: # All endomorphism constants are below this threshold
          var endo = P
          endo.scalarMulEndo(vec.vectors[i].scalar)
          doAssert: bool(Q == endo)

        when EC.F is Fp: # Test windowed endomorphism acceleration

@ -15,12 +15,13 @@ import
  # Test utilities
  ./t_ec_sage_template

staticFor(bits, [Vesta.getCurveOrderBitwidth()]):
  run_scalar_mul_test_vs_sage(
    ECP_ShortW_Prj[Fp[Vesta], G1],
    ECP_ShortW_Prj[Fp[Vesta], G1], bits,
    "t_ec_sage_vesta_g1_projective"
  )

  run_scalar_mul_test_vs_sage(
    ECP_ShortW_Jac[Fp[Vesta], G1],
    ECP_ShortW_Jac[Fp[Vesta], G1], bits,
    "t_ec_sage_vesta_g1_jacobian"
  )

tests/math/t_ec_shortw_jac_g1_msm.nim (new file, 29 lines)
@ -0,0 +1,29 @@
# Constantine
# Copyright (c) 2018-2019 Status Research & Development GmbH
# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
# Licensed and distributed under either of
#   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
#   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
# at your option. This file may not be copied, modified, or distributed except according to those terms.

import
  # Internals
  ../../constantine/math/config/curves,
  ../../constantine/math/elliptic/ec_shortweierstrass_jacobian,
  ../../constantine/math/arithmetic,
  # Test utilities
  ./t_ec_template

const numPoints = [1, 2, 8, 16, 32, 64, 128, 1024, 2048, 16384] # 32768, 262144, 1048576]

run_EC_multi_scalar_mul_impl(
  ec = ECP_ShortW_Jac[Fp[BN254_Snarks], G1],
  numPoints = numPoints,
  moduleName = "test_ec_shortweierstrass_jacobian_multi_scalar_mul_" & $BN254_Snarks
)

run_EC_multi_scalar_mul_impl(
  ec = ECP_ShortW_Jac[Fp[BLS12_381], G1],
  numPoints = numPoints,
  moduleName = "test_ec_shortweierstrass_jacobian_multi_scalar_mul_" & $BLS12_381
)
@ -16,7 +16,7 @@ import
  ../../constantine/math/elliptic/[ec_shortweierstrass_affine, ec_shortweierstrass_jacobian, ec_scalar_mul],
  # Test utilities
  ../../helpers/prng_unsafe,
  ./support/ec_reference_scalar_mult,
  ../../constantine/math/elliptic/ec_scalar_mul_vartime,
  ./t_ec_template

const
@ -50,7 +50,7 @@ suite "Order checks on BN254_Snarks":
      reference = a

      impl.scalarMulGeneric(exponent)
      reference.unsafe_ECmul_double_add(exponent)
      reference.scalarMul_doubleAdd_vartime(exponent)

      check:
        bool(impl.isInf())

tests/math/t_ec_shortw_jacext_g1_add_double.nim (new file, 53 lines)
@ -0,0 +1,53 @@
# Constantine
# Copyright (c) 2018-2019 Status Research & Development GmbH
# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
# Licensed and distributed under either of
#   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
#   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
# at your option. This file may not be copied, modified, or distributed except according to those terms.

import
  # Internals
  ../../constantine/math/config/[type_ff, curves],
  ../../constantine/math/elliptic/ec_shortweierstrass_jacobian_extended,
  # Test utilities
  ./t_ec_template

const
  Iters = 6

run_EC_addition_tests(
  ec = ECP_ShortW_JacExt[Fp[BN254_Snarks], G1],
  Iters = Iters,
  moduleName = "test_ec_shortweierstrass_jacobian_extended_g1_add_double_" & $BN254_Snarks
)

run_EC_addition_tests(
  ec = ECP_ShortW_JacExt[Fp[BLS12_381], G1],
  Iters = Iters,
  moduleName = "test_ec_shortweierstrass_jacobian_extended_g1_add_double_" & $BLS12_381
)

run_EC_addition_tests(
  ec = ECP_ShortW_JacExt[Fp[BLS12_377], G1],
  Iters = Iters,
  moduleName = "test_ec_shortweierstrass_jacobian_extended_g1_add_double_" & $BLS12_377
)

run_EC_addition_tests(
  ec = ECP_ShortW_JacExt[Fp[BW6_761], G1],
  Iters = Iters,
  moduleName = "test_ec_shortweierstrass_jacobian_extended_g1_add_double_" & $BW6_761
)

run_EC_addition_tests(
  ec = ECP_ShortW_JacExt[Fp[Pallas], G1],
  Iters = Iters,
  moduleName = "test_ec_shortweierstrass_jacobian_extended_g1_add_double_" & $Pallas
)

run_EC_addition_tests(
  ec = ECP_ShortW_JacExt[Fp[Vesta], G1],
  Iters = Iters,
  moduleName = "test_ec_shortweierstrass_jacobian_extended_g1_add_double_" & $Vesta
)
tests/math/t_ec_shortw_jacext_g1_mixed_add.nim (new file, 54 lines)
@ -0,0 +1,54 @@
# Constantine
# Copyright (c) 2018-2019 Status Research & Development GmbH
# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
# Licensed and distributed under either of
#   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
#   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
# at your option. This file may not be copied, modified, or distributed except according to those terms.

import
  # Internals
  ../../constantine/math/config/curves,
  ../../constantine/math/elliptic/ec_shortweierstrass_jacobian_extended,
  ../../constantine/math/arithmetic,
  # Test utilities
  ./t_ec_template

const
  Iters = 6

run_EC_mixed_add_impl(
  ec = ECP_ShortW_JacExt[Fp[BN254_Snarks], G1],
  Iters = Iters,
  moduleName = "test_ec_shortweierstrass_jacobian_extended_mixed_add_" & $BN254_Snarks
)

run_EC_mixed_add_impl(
  ec = ECP_ShortW_JacExt[Fp[BLS12_381], G1],
  Iters = Iters,
  moduleName = "test_ec_shortweierstrass_jacobian_extended_mixed_add_" & $BLS12_381
)

run_EC_mixed_add_impl(
  ec = ECP_ShortW_JacExt[Fp[BLS12_377], G1],
  Iters = Iters,
  moduleName = "test_ec_shortweierstrass_jacobian_extended_mixed_add_" & $BLS12_377
)

run_EC_mixed_add_impl(
  ec = ECP_ShortW_JacExt[Fp[BW6_761], G1],
  Iters = Iters,
  moduleName = "test_ec_shortweierstrass_jacobian_extended_mixed_add_" & $BW6_761
)

run_EC_mixed_add_impl(
  ec = ECP_ShortW_JacExt[Fp[Pallas], G1],
  Iters = Iters,
  moduleName = "test_ec_shortweierstrass_jacobian_extended_mixed_add_" & $Pallas
)

run_EC_mixed_add_impl(
  ec = ECP_ShortW_JacExt[Fp[Vesta], G1],
  Iters = Iters,
  moduleName = "test_ec_shortweierstrass_jacobian_extended_mixed_add_" & $Vesta
)
tests/math/t_ec_shortw_jacext_g1_sum_reduce.nim (new file, 29 lines)
@ -0,0 +1,29 @@
# Constantine
# Copyright (c) 2018-2019 Status Research & Development GmbH
# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
# Licensed and distributed under either of
#   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
#   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
# at your option. This file may not be copied, modified, or distributed except according to those terms.

import
  # Internals
  ../../constantine/math/config/curves,
  ../../constantine/math/elliptic/ec_shortweierstrass_jacobian_extended,
  ../../constantine/math/arithmetic,
  # Test utilities
  ./t_ec_template

const numPoints = [1, 2, 8, 16, 128, 1024, 2048, 16384, 32768] # 262144, 1048576]

run_EC_batch_add_impl(
  ec = ECP_ShortW_JacExt[Fp[BN254_Snarks], G1],
  numPoints = numPoints,
  moduleName = "test_ec_shortweierstrass_jacobian_extended_batch_add_" & $BN254_Snarks
)

run_EC_batch_add_impl(
  ec = ECP_ShortW_JacExt[Fp[BLS12_381], G1],
  numPoints = numPoints,
  moduleName = "test_ec_shortweierstrass_jacobian_extended_batch_add_" & $BLS12_381
)
tests/math/t_ec_shortw_prj_g1_msm.nim (new file, 29 lines)
@ -0,0 +1,29 @@
# Constantine
# Copyright (c) 2018-2019 Status Research & Development GmbH
# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
# Licensed and distributed under either of
#   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
#   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
# at your option. This file may not be copied, modified, or distributed except according to those terms.

import
  # Internals
  ../../constantine/math/config/curves,
  ../../constantine/math/elliptic/ec_shortweierstrass_projective,
  ../../constantine/math/arithmetic,
  # Test utilities
  ./t_ec_template

const numPoints = [1, 2, 8, 16, 128, 1024, 2048, 16384] # 32768, 262144, 1048576]

run_EC_multi_scalar_mul_impl(
  ec = ECP_ShortW_Prj[Fp[BN254_Snarks], G1],
  numPoints = numPoints,
  moduleName = "test_ec_shortweierstrass_projective_multi_scalar_mul_" & $BN254_Snarks
)

run_EC_multi_scalar_mul_impl(
  ec = ECP_ShortW_Prj[Fp[BLS12_381], G1],
  numPoints = numPoints,
  moduleName = "test_ec_shortweierstrass_projective_multi_scalar_mul_" & $BLS12_381
)
@ -15,7 +15,7 @@ import
  ../../constantine/math/elliptic/[ec_shortweierstrass_affine, ec_shortweierstrass_projective, ec_scalar_mul],
  # Test utilities
  ../../helpers/prng_unsafe,
  ./support/ec_reference_scalar_mult,
  ../../constantine/math/elliptic/ec_scalar_mul_vartime,
  ./t_ec_template

const
@ -49,7 +49,7 @@ suite "Order checks on BN254_Snarks":
      reference = a

      impl.scalarMulGeneric(exponent)
      reference.unsafe_ECmul_double_add(exponent)
      reference.scalarMul_doubleAdd_vartime(exponent)

      check:
        bool(impl.isInf())

@ -22,19 +22,36 @@ import
  ../../constantine/math/elliptic/[
    ec_shortweierstrass_affine,
    ec_shortweierstrass_jacobian,
    ec_shortweierstrass_jacobian_extended,
    ec_shortweierstrass_projective,
    ec_shortweierstrass_batch_ops,
    ec_twistededwards_affine,
    ec_twistededwards_projective,
    ec_scalar_mul],
    ec_scalar_mul,
    ec_multi_scalar_mul],
  ../../constantine/math/io/[io_bigints, io_fields, io_ec],
  ../../constantine/math/constants/zoo_subgroups,
  # Test utilities
  ../../helpers/prng_unsafe,
  ./support/ec_reference_scalar_mult
  ../../constantine/math/elliptic/ec_scalar_mul_vartime

export unittest, abstractions, arithmetic # Generic sandwich

# Extended Jacobian generic bindings
# ----------------------------------
# All vartime procedures MUST be tagged vartime.
# Hence we do not expose `sum` or `+=` for extended Jacobian operations, to prevent `vartime` mistakes;
# we create a local `sum` or `+=` for this module only.

func sum[F; G: static Subgroup](r: var ECP_ShortW_JacExt[F, G], P, Q: ECP_ShortW_JacExt[F, G]) =
  r.sum_vartime(P, Q)
func `+=`[F; G: static Subgroup](P: var ECP_ShortW_JacExt[F, G], Q: ECP_ShortW_JacExt[F, G]) =
  P.sum_vartime(P, Q)
func madd[F; G: static Subgroup](r: var ECP_ShortW_JacExt[F, G], P: ECP_ShortW_JacExt[F, G], Q: ECP_ShortW_Aff[F, G]) =
  r.madd_vartime(P, Q)
func `+=`[F; G: static Subgroup](P: var ECP_ShortW_JacExt[F, G], Q: ECP_ShortW_Aff[F, G]) =
  P.madd_vartime(P, Q)

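(Editor's sketch, not part of the diff; it assumes this module's imports and a curve such as BN254_Snarks.) With these shims the generic test bodies below keep calling `sum`/`+=` on extended Jacobian points while the work is still routed through the explicitly tagged vartime procedures:

  var P, Q, R: ECP_ShortW_JacExt[Fp[BN254_Snarks], G1]
  P.setInf()
  Q.setInf()
  R.sum(P, Q)              # resolves to the local shim, i.e. sum_vartime(P, Q)
  doAssert R.isInf().bool  # Inf + Inf = Inf
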
type
  RandomGen* = enum
    Uniform
@ -65,15 +82,6 @@ func random_point*(rng: var RngState, EC: typedesc, randZ: bool, gen: RandomGen)
  else:
    result = rng.random_long01Seq_with_randZ(EC)

template pairingGroup(EC: typedesc): string =
  when EC is (ECP_ShortW_Aff or ECP_ShortW_Prj or ECP_ShortW_Jac):
    when EC.G == G1:
      "G1"
    else:
      "G2"
  else:
    ""

proc run_EC_addition_tests*(
    ec: typedesc,
    Iters: static int,
@ -87,12 +95,10 @@ proc run_EC_addition_tests*(
  echo "\n------------------------------------------------------\n"
  echo moduleName, " xoshiro512** seed: ", seed

  const G1_or_G2 = pairingGroup(ec)

  const testSuiteDesc = "Elliptic curve in " & $ec.F.C.getEquationForm() & " form with projective coordinates"

  suite testSuiteDesc & " - " & $ec & " - [" & $WordBitWidth & "-bit mode]":
    test "The infinity point is the neutral element w.r.t. EC " & G1_or_G2 & " addition":
    test "The infinity point is the neutral element w.r.t. EC " & $ec.G & " addition":
      proc test(EC: typedesc, randZ: bool, gen: RandomGen) =
        var inf {.noInit.}: EC
        inf.setInf()
@ -124,6 +130,40 @@ proc run_EC_addition_tests*(
      test(ec, randZ = false, gen = Long01Sequence)
      test(ec, randZ = true, gen = Long01Sequence)

    test "Infinity point from affine conversion gives proper result":
      proc test(EC: typedesc, randZ: bool, gen: RandomGen) =
        var affInf {.noInit.}: affine(EC)
        var inf {.noInit.}: EC
        affInf.setInf()
        inf.fromAffine(affInf)
        check: bool inf.isInf()

        for _ in 0 ..< Iters:
          var r {.noInit.}: EC
          let P = rng.random_point(EC, randZ, gen)

          r.sum(P, inf)
          check: bool(r == P)

          r.sum(inf, P)
          check: bool(r == P)

          # Aliasing tests
          r = P
          r += inf
          check: bool(r == P)

          r = inf
          r += P
          check: bool(r == P)

      test(ec, randZ = false, gen = Uniform)
      test(ec, randZ = true, gen = Uniform)
      test(ec, randZ = false, gen = HighHammingWeight)
      test(ec, randZ = true, gen = HighHammingWeight)
      test(ec, randZ = false, gen = Long01Sequence)
      test(ec, randZ = true, gen = Long01Sequence)

    test "Adding opposites gives an infinity point":
      proc test(EC: typedesc, randZ: bool, gen: RandomGen) =
        for _ in 0 ..< Iters:
@ -145,7 +185,7 @@ proc run_EC_addition_tests*(
      test(ec, randZ = false, gen = Long01Sequence)
      test(ec, randZ = true, gen = Long01Sequence)

    test "EC " & G1_or_G2 & " add is commutative":
    test "EC " & $ec.G & " add is commutative":
      proc test(EC: typedesc, randZ: bool, gen: RandomGen) =
        for _ in 0 ..< Iters:
          var r0 {.noInit.}, r1 {.noInit.}: EC
@ -163,7 +203,7 @@ proc run_EC_addition_tests*(
      test(ec, randZ = false, gen = Long01Sequence)
      test(ec, randZ = true, gen = Long01Sequence)

    test "EC " & G1_or_G2 & " add is associative":
    test "EC " & $ec.G & " add is associative":
      proc test(EC: typedesc, randZ: bool, gen: RandomGen) =
        for _ in 0 ..< Iters:
          let a = rng.random_point(EC, randZ, gen)
@ -212,7 +252,7 @@ proc run_EC_addition_tests*(
      test(ec, randZ = false, gen = Long01Sequence)
      test(ec, randZ = true, gen = Long01Sequence)

    test "EC " & G1_or_G2 & " double and EC " & G1_or_G2 & " add are consistent":
    test "EC " & $ec.G & " double and EC " & $ec.G & " add are consistent":
      proc test(EC: typedesc, randZ: bool, gen: RandomGen) =
        for _ in 0 ..< Iters:
          let a = rng.random_point(EC, randZ, gen)
@ -244,12 +284,10 @@ proc run_EC_mul_sanity_tests*(
  echo "\n------------------------------------------------------\n"
  echo moduleName, " xoshiro512** seed: ", seed

  const G1_or_G2 = pairingGroup(ec)

  const testSuiteDesc = "Elliptic curve in " & $ec.F.C.getEquationForm() & " form"

  suite testSuiteDesc & " - " & $ec & " - [" & $WordBitWidth & "-bit mode]":
    test "EC " & G1_or_G2 & " mul [0]P == Inf":
    test "EC " & $ec.G & " mul [0]P == Inf":
      proc test(EC: typedesc, bits: static int, randZ: bool, gen: RandomGen) =
        for _ in 0 ..< ItersMul:
          let a = rng.random_point(EC, randZ, gen)
@ -259,7 +297,7 @@ proc run_EC_mul_sanity_tests*(
          reference = a

          impl.scalarMulGeneric(BigInt[bits]())
          reference.unsafe_ECmul_double_add(BigInt[bits]())
          reference.scalarMul_doubleAdd_vartime(BigInt[bits]())

          check:
            bool(impl.isInf())
@ -272,7 +310,7 @@ proc run_EC_mul_sanity_tests*(
      test(ec, bits = ec.F.C.getCurveOrderBitwidth(), randZ = false, gen = Long01Sequence)
      test(ec, bits = ec.F.C.getCurveOrderBitwidth(), randZ = true, gen = Long01Sequence)

    test "EC " & G1_or_G2 & " mul [1]P == P":
    test "EC " & $ec.G & " mul [1]P == P":
      proc test(EC: typedesc, bits: static int, randZ: bool, gen: RandomGen) =
        for _ in 0 ..< ItersMul:
          let a = rng.random_point(EC, randZ, gen)
@ -285,7 +323,7 @@ proc run_EC_mul_sanity_tests*(
          reference = a

          impl.scalarMulGeneric(exponent)
          reference.unsafe_ECmul_double_add(exponent)
          reference.scalarMul_doubleAdd_vartime(exponent)

          check:
            bool(impl == a)
@ -298,7 +336,7 @@ proc run_EC_mul_sanity_tests*(
      test(ec, bits = ec.F.C.getCurveOrderBitwidth(), randZ = false, gen = Long01Sequence)
      test(ec, bits = ec.F.C.getCurveOrderBitwidth(), randZ = true, gen = Long01Sequence)

    test "EC " & G1_or_G2 & " mul [2]P == P.double()":
    test "EC " & $ec.G & " mul [2]P == P.double()":
      proc test(EC: typedesc, bits: static int, randZ: bool, gen: RandomGen) =
        for _ in 0 ..< ItersMul:
          let a = rng.random_point(EC, randZ, gen)
@ -313,7 +351,7 @@ proc run_EC_mul_sanity_tests*(
          reference = a

          impl.scalarMulGeneric(exponent)
          reference.unsafe_ECmul_double_add(exponent)
          reference.scalarMul_doubleAdd_vartime(exponent)

          check:
            bool(impl == doubleA)
@ -339,13 +377,11 @@ proc run_EC_mul_distributive_tests*(
  echo "\n------------------------------------------------------\n"
  echo moduleName, " xoshiro512** seed: ", seed

  const G1_or_G2 = pairingGroup(ec)

  const testSuiteDesc = "Elliptic curve in " & $ec.F.C.getEquationForm() & " form"

  suite testSuiteDesc & " - " & $ec & " - [" & $WordBitWidth & "-bit mode]":

    test "EC " & G1_or_G2 & " mul is distributive over EC add":
    test "EC " & $ec.G & " mul is distributive over EC add":
      proc test(EC: typedesc, bits: static int, randZ: bool, gen: RandomGen) =
        for _ in 0 ..< ItersMul:
          let a = rng.random_point(EC, randZ, gen)
@ -362,20 +398,20 @@ proc run_EC_mul_distributive_tests*(
          fReference.sum(a, b)

          fImpl.scalarMulGeneric(exponent)
          fReference.unsafe_ECmul_double_add(exponent)
          fReference.scalarMul_doubleAdd_vartime(exponent)

          # [k]a + [k]b - Distributed
          var kaImpl = a
          var kaRef = a

          kaImpl.scalarMulGeneric(exponent)
          kaRef.unsafe_ECmul_double_add(exponent)
          kaRef.scalarMul_doubleAdd_vartime(exponent)

          var kbImpl = b
          var kbRef = b

          kbImpl.scalarMulGeneric(exponent)
          kbRef.unsafe_ECmul_double_add(exponent)
          kbRef.scalarMul_doubleAdd_vartime(exponent)

          var kakbImpl {.noInit.}, kakbRef {.noInit.}: EC
          kakbImpl.sum(kaImpl, kbImpl)
@ -406,17 +442,16 @@ proc run_EC_mul_vs_ref_impl*(
  echo "\n------------------------------------------------------\n"
  echo moduleName, " xoshiro512** seed: ", seed

  const G1_or_G2 = pairingGroup(ec)

  const testSuiteDesc = "Elliptic curve in " & $ec.F.C.getEquationForm() & " form"

  suite testSuiteDesc & " - " & $ec & " - [" & $WordBitWidth & "-bit mode]":
    test "EC " & G1_or_G2 & " mul constant-time is equivalent to a simple double-and-add algorithm":
    test "EC " & $ec.G & " mul constant-time is equivalent to a simple double-and-add and recoded algorithms":
      proc test(EC: typedesc, bits: static int, randZ: bool, gen: RandomGen) =
        for _ in 0 ..< ItersMul:
          let a = rng.random_point(EC, randZ, gen)

          let exponent = rng.random_unsafe(BigInt[bits])
          # We want to test how window methods handle unbalanced 0/1
          let exponent = rng.random_long01Seq(BigInt[bits])

          var
            impl = a
@ -424,13 +459,24 @@ proc run_EC_mul_vs_ref_impl*(
            refMinWeight = a

          impl.scalarMulGeneric(exponent)
          reference.unsafe_ECmul_double_add(exponent)
          refMinWeight.unsafe_ECmul_minHammingWeight(exponent)
          reference.scalarMul_doubleAdd_vartime(exponent)
          refMinWeight.scalarMul_minHammingWeight_vartime(exponent)

          check:
            bool(impl == reference)
            bool(impl == refMinWeight)

          proc refWNaf(w: static int) = # workaround staticFor symbol visibility
            var refWNAF = a
            refWNAF.scalarMul_minHammingWeight_windowed_vartime(exponent, window = w)
            check: bool(impl == refWNAF)

          refWNaf(2)
          refWNaf(3)
          refWNaf(5)
          refWNaf(8)
          refWNaf(13)

      test(ec, bits = ec.F.C.getCurveOrderBitwidth(), randZ = false, gen = Uniform)
      test(ec, bits = ec.F.C.getCurveOrderBitwidth(), randZ = true, gen = Uniform)
      test(ec, bits = ec.F.C.getCurveOrderBitwidth(), randZ = false, gen = HighHammingWeight)
@ -451,15 +497,10 @@ proc run_EC_mixed_add_impl*(
  echo "\n------------------------------------------------------\n"
  echo moduleName, " xoshiro512** seed: ", seed

  when ec.G == G1:
    const G1_or_G2 = "G1"
  else:
    const G1_or_G2 = "G2"

  const testSuiteDesc = "Elliptic curve mixed addition for Short Weierstrass form"

  suite testSuiteDesc & " - " & $ec & " - [" & $WordBitWidth & "-bit mode]":
    test "EC " & G1_or_G2 & " mixed addition is consistent with general addition":
    test "EC " & $ec.G & " mixed addition is consistent with general addition":
      proc test(EC: typedesc, randZ: bool, gen: RandomGen) =
        for _ in 0 ..< Iters:
          let a = rng.random_point(EC, randZ, gen)
@ -481,7 +522,7 @@ proc run_EC_mixed_add_impl*(
      test(ec, randZ = false, gen = Long01Sequence)
      test(ec, randZ = true, gen = Long01Sequence)

    test "EC " & G1_or_G2 & " mixed addition - doubling":
    test "EC " & $ec.G & " mixed addition - doubling":
      proc test(EC: typedesc, randZ: bool, gen: RandomGen) =
        for _ in 0 ..< Iters:
          let a = rng.random_point(EC, randZ, gen)
@ -506,7 +547,7 @@ proc run_EC_mixed_add_impl*(
      test(ec, randZ = false, gen = Long01Sequence)
      test(ec, randZ = true, gen = Long01Sequence)

    test "EC " & G1_or_G2 & " mixed addition - adding infinity LHS":
    test "EC " & $ec.G & " mixed addition - adding infinity LHS":
      proc test(EC: typedesc, randZ: bool, gen: RandomGen) =
        for _ in 0 ..< Iters:
          var a {.noInit.}: EC
@ -529,7 +570,31 @@ proc run_EC_mixed_add_impl*(
      test(ec, randZ = false, gen = HighHammingWeight)
      test(ec, randZ = false, gen = Long01Sequence)

    test "EC " & G1_or_G2 & " mixed addition - adding infinity RHS":
    test "EC " & $ec.G & " mixed addition - adding infinity RHS":
      proc test(EC: typedesc, randZ: bool, gen: RandomGen) =
        for _ in 0 ..< Iters:
          let a = rng.random_point(EC, randZ, gen)
          var naAff {.noInit.}: ECP_ShortW_Aff[EC.F, EC.G]
          naAff.affine(a)
          naAff.neg()

          var r {.noInit.}: EC
          r.madd(a, naAff)

          check: r.isInf().bool

          r = a
          r += naAff
          check: r.isInf().bool

      test(ec, randZ = false, gen = Uniform)
      test(ec, randZ = true, gen = Uniform)
      test(ec, randZ = false, gen = HighHammingWeight)
      test(ec, randZ = true, gen = HighHammingWeight)
      test(ec, randZ = false, gen = Long01Sequence)
      test(ec, randZ = true, gen = Long01Sequence)

    test "EC " & $ec.G & " mixed addition - adding opposites":
      proc test(EC: typedesc, randZ: bool, gen: RandomGen) =
        for _ in 0 ..< Iters:
          let a = rng.random_point(EC, randZ, gen)
@ -564,11 +629,6 @@ proc run_EC_subgroups_cofactors_impl*(
  echo "\n------------------------------------------------------\n"
  echo moduleName, " xoshiro512** seed: ", seed

  when ec.G == G1:
    const G1_or_G2 = "G1"
  else:
    const G1_or_G2 = "G2"

  const testSuiteDesc = "Elliptic curve subgroup check and cofactor clearing"

  suite testSuiteDesc & " - " & $ec & " - [" & $WordBitWidth & "-bit mode]":
@ -626,7 +686,7 @@ proc run_EC_subgroups_cofactors_impl*(
      test(ec, randZ = false, gen = Long01Sequence)
      test(ec, randZ = true, gen = Long01Sequence)

  echo " [SUCCESS] Test finished with ", inSubgroup, " points in ", G1_or_G2, " subgroup and ",
  echo " [SUCCESS] Test finished with ", inSubgroup, " points in ", $ec.G, " subgroup and ",
    offSubgroup, " points on curve but not in subgroup (before cofactor clearing)"

proc run_EC_affine_conversion*(
@ -642,12 +702,10 @@ proc run_EC_affine_conversion*(
  echo "\n------------------------------------------------------\n"
  echo moduleName, " xoshiro512** seed: ", seed

  const G1_or_G2 = pairingGroup(ec)

  const testSuiteDesc = "Elliptic curve in " & $ec.F.C.getEquationForm() & " form"

  suite testSuiteDesc & " - " & $ec & " - [" & $WordBitWidth & "-bit mode]":
    test "EC " & G1_or_G2 & " batchAffine is consistent with single affine conversion":
    test "EC " & $ec.G & " batchAffine is consistent with single affine conversion":
      proc test(EC: typedesc, gen: RandomGen) =
        const batchSize = 10
        for _ in 0 ..< Iters:
@ -807,16 +865,11 @@ proc run_EC_batch_add_impl*[N: static int](
  echo "\n------------------------------------------------------\n"
  echo moduleName, " xoshiro512** seed: ", seed

  when ec.G == G1:
    const G1_or_G2 = "G1"
  else:
    const G1_or_G2 = "G2"

  const testSuiteDesc = "Elliptic curve batch addition for Short Weierstrass form"
  const testSuiteDesc = "Elliptic curve sum reduction for Short Weierstrass form"

  suite testSuiteDesc & " - " & $ec & " - [" & $WordBitWidth & "-bit mode]":
    for n in numPoints:
      test $ec & " batch addition (N=" & $n & ")":
      test $ec & " sum reduction (N=" & $n & ")":
        proc test(EC: typedesc, gen: RandomGen) =
          var points = newSeq[ECP_ShortW_Aff[EC.F, EC.G]](n)

@ -829,7 +882,7 @@ proc run_EC_batch_add_impl*[N: static int](
          for i in 0 ..< n:
            r_ref += points[i]

          r_batch.sum_batch_vartime(points)
          r_batch.sum_reduce_vartime(points)

          check: bool(r_batch == r_ref)

@ -838,7 +891,7 @@ proc run_EC_batch_add_impl*[N: static int](
        test(ec, gen = HighHammingWeight)
        test(ec, gen = Long01Sequence)

      test "EC " & G1_or_G2 & " batch addition (N=" & $n & ") - special cases":
      test "EC " & $ec.G & " sum reduction (N=" & $n & ") - special cases":
        proc test(EC: typedesc, gen: RandomGen) =
          var points = newSeq[ECP_ShortW_Aff[EC.F, EC.G]](n)

@ -864,10 +917,57 @@ proc run_EC_batch_add_impl*[N: static int](
          for i in 0 ..< n:
            r_ref += points[i]

          r_batch.sum_batch_vartime(points)
          r_batch.sum_reduce_vartime(points)

          check: bool(r_batch == r_ref)

        test(ec, gen = Uniform)
        test(ec, gen = HighHammingWeight)
        test(ec, gen = Long01Sequence)

proc run_EC_multi_scalar_mul_impl*[N: static int](
    ec: typedesc,
    numPoints: array[N, int],
    moduleName: string
  ) =

  # Random seed for reproducibility
  var rng: RngState
  let seed = uint32(getTime().toUnix() and (1'i64 shl 32 - 1)) # unixTime mod 2^32
  rng.seed(seed)
  echo "\n------------------------------------------------------\n"
  echo moduleName, " xoshiro512** seed: ", seed

  const testSuiteDesc = "Elliptic curve multi-scalar-multiplication for Short Weierstrass form"

  suite testSuiteDesc & " - " & $ec & " - [" & $WordBitWidth & "-bit mode]":
    for n in numPoints:
      let bucketBits = bestBucketBitSize(n, ec.F.C.getCurveOrderBitwidth(), useSignedBuckets = false, useManualTuning = false)
      test $ec & " Multi-scalar-mul (N=" & $n & ", bucket bits: " & $bucketBits & ")":
        proc test(EC: typedesc, gen: RandomGen) =
          var points = newSeq[ECP_ShortW_Aff[EC.F, EC.G]](n)
          var coefs = newSeq[BigInt[EC.F.C.getCurveOrderBitwidth()]](n)

          for i in 0 ..< n:
            var tmp = rng.random_unsafe(EC)
            tmp.clearCofactor()
            points[i].affine(tmp)
            coefs[i] = rng.random_unsafe(BigInt[EC.F.C.getCurveOrderBitwidth()])

          var naive, naive_tmp: EC
          naive.setInf()
          for i in 0 ..< n:
            naive_tmp.fromAffine(points[i])
            naive_tmp.scalarMulGeneric(coefs[i])
            naive += naive_tmp

          var msm_ref, msm: EC
          msm_ref.multiScalarMul_reference_vartime(coefs, points)
          msm.multiScalarMul_vartime(coefs, points)

          doAssert bool(naive == msm_ref)
          doAssert bool(naive == msm)

        test(ec, gen = Uniform)
        test(ec, gen = HighHammingWeight)
        test(ec, gen = Long01Sequence)
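
(Editor's note, a restatement rather than new code; all names come from the template above.) The assertions pin down the MSM contract that both code paths must satisfy:

  # naive    accumulates [coefs[0]]points[0] + [coefs[1]]points[1] + ... + [coefs[n-1]]points[n-1]
  # msm_ref  comes from multiScalarMul_reference_vartime(coefs, points)
  # msm      comes from multiScalarMul_vartime(coefs, points)
  # and the test requires  naive == msm_ref  and  naive == msm
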
@ -125,7 +125,7 @@ sqrTest(random_unsafe)
sqrTest(randomHighHammingWeight)
sqrTest(random_long01Seq)

suite "Field Addition/Subtraction/Negation via double-precision field elements" & " [" & $WordBitWidth & "-bit mode]":
suite "Field Addition/Subtraction/Negation via double-precision field elements" & " [" & $WordBitWidth & "-bit words]":
  test "With P-224 field modulus":
    for _ in 0 ..< Iters:
      addsubneg_random_unsafe(P224)
@ -197,7 +197,7 @@ suite "Field Addition/Subtraction/Negation via double-precision field elements"

    check: bool r.isZero()

suite "Field Multiplication via double-precision field elements is consistent with single-width." & " [" & $WordBitWidth & "-bit mode]":
suite "Field Multiplication via double-precision field elements is consistent with single-width." & " [" & $WordBitWidth & "-bit words]":
  test "With P-224 field modulus":
    for _ in 0 ..< Iters:
      mul_random_unsafe(P224)
@ -262,7 +262,7 @@ suite "Field Multiplication via double-precision field elements is consistent wi
    for _ in 0 ..< Iters:
      mul_random_long01Seq(Vesta)

suite "Field Squaring via double-precision field elements is consistent with single-width." & " [" & $WordBitWidth & "-bit mode]":
suite "Field Squaring via double-precision field elements is consistent with single-width." & " [" & $WordBitWidth & "-bit words]":
  test "With P-224 field modulus":
    for _ in 0 ..< Iters:
      sqr_random_unsafe(P224)

@ -78,7 +78,7 @@ proc sanity(C: static Curve) =
      bool(n == expected)

proc mainSanity() =
  suite "Modular squaring is consistent with multiplication on special elements" & " [" & $WordBitWidth & "-bit mode]":
  suite "Modular squaring is consistent with multiplication on special elements" & " [" & $WordBitWidth & "-bit words]":
    sanity Fake101
    sanity Mersenne61
    sanity Mersenne127
@ -94,7 +94,7 @@ proc mainSanity() =
mainSanity()

proc mainSelectCases() =
  suite "Modular Squaring: selected tricky cases" & " [" & $WordBitWidth & "-bit mode]":
  suite "Modular Squaring: selected tricky cases" & " [" & $WordBitWidth & "-bit words]":
    test "P-256 [FastSquaring = " & $(Fp[P256].getSpareBits() >= 2) & "]":
      block:
        # Triggered an issue in the (t[N+1], t[N]) = t[N] + (A1, A0)
@ -141,7 +141,7 @@ proc random_long01Seq(C: static Curve) =

  doAssert bool(r_mul == r_sqr)

suite "Random Modular Squaring is consistent with Modular Multiplication" & " [" & $WordBitWidth & "-bit mode]":
suite "Random Modular Squaring is consistent with Modular Multiplication" & " [" & $WordBitWidth & "-bit words]":
  test "Random squaring mod P-224 [FastSquaring = " & $(Fp[P224].getSpareBits() >= 2) & "]":
    for _ in 0 ..< Iters:
      randomCurve(P224)
@ -358,7 +358,7 @@ proc random_sumprod(C: static Curve, N: static int) =
  sumprod_test(random_long01Seq)
  sumProdMax()

suite "Random sum products are consistent with naive " & " [" & $WordBitWidth & "-bit mode]":
suite "Random sum products are consistent with naive " & " [" & $WordBitWidth & "-bit words]":

  const MaxLength = 8
  test "Random sum products mod P-224":

@ -29,7 +29,7 @@ echo "\n------------------------------------------------------\n"
echo "test_finite_fields_powinv xoshiro512** seed: ", seed

proc main() =
  suite "Modular exponentiation over finite fields" & " [" & $WordBitWidth & "-bit mode]":
  suite "Modular exponentiation over finite fields" & " [" & $WordBitWidth & "-bit words]":
    test "n² mod 101":
      let exponent = BigInt[64].fromUint(2'u64)

@ -202,7 +202,7 @@ proc main() =
    testRandomDiv2 Pallas
    testRandomDiv2 Vesta

  suite "Modular inversion over prime fields" & " [" & $WordBitWidth & "-bit mode]":
  suite "Modular inversion over prime fields" & " [" & $WordBitWidth & "-bit words]":
    test "Specific tests on Fp[BLS12_381]":
      block: # No inverse exists for 0 --> should return 0 for projective/jacobian to affine coordinate conversion
        var r, x: Fp[BLS12_381]
@ -210,12 +210,20 @@ proc main() =
        r.inv(x)
        check: bool r.isZero()

        var r2: Fp[BLS12_381]
        r2.inv_vartime(x)
        check: bool r2.isZero()

      block:
        var r, x: Fp[BLS12_381]
        x.setOne()
        r.inv(x)
        check: bool r.isOne()

        var r2: Fp[BLS12_381]
        r2.inv_vartime(x)
        check: bool r2.isOne()

      block:
        var r, x: Fp[BLS12_381]

@ -229,6 +237,10 @@ proc main() =
        check:
          computed == expected

        var r2: Fp[BLS12_381]
        r2.inv_vartime(x)
        let computed2 = r2.toHex()

    test "Specific tests on Fp[BN254_Snarks]":
      block:
        var r, x: Fp[BN254_Snarks]
@ -244,6 +256,10 @@ proc main() =
        r.inv(x)
        check: bool(r == expected)

        var r2: Fp[BN254_Snarks]
        r2.inv_vartime(x)
        check: bool(r2 == expected)

      block:
        var r, x, expected: Fp[BN254_Snarks]
        x.fromHex"0x0d2007d8aaface1b8501bfbe792974166e8f9ad6106e5b563604f0aea9ab06f6"
@ -252,6 +268,10 @@ proc main() =
        r.inv(x)
        check: bool(r == expected)

        var r2: Fp[BN254_Snarks]
        r2.inv_vartime(x)
        check: bool(r2 == expected)

  proc testRandomInv(curve: static Curve) =
    test "Random inversion testing on " & $Curve(curve):
      var aInv, r: Fp[curve]
@ -264,6 +284,12 @@ proc main() =
        r.prod(aInv, a)
        check: bool r.isOne() or (a.isZero() and r.isZero())

        aInv.inv_vartime(a)
        r.prod(a, aInv)
        check: bool r.isOne() or (a.isZero() and r.isZero())
        r.prod(aInv, a)
        check: bool r.isOne() or (a.isZero() and r.isZero())

      for _ in 0 ..< Iters:
        let a = rng.randomHighHammingWeight(Fp[curve])
        aInv.inv(a)
@ -272,6 +298,11 @@ proc main() =
        r.prod(aInv, a)
        check: bool r.isOne() or (a.isZero() and r.isZero())

        aInv.inv_vartime(a)
        r.prod(a, aInv)
        check: bool r.isOne() or (a.isZero() and r.isZero())
        r.prod(aInv, a)
        check: bool r.isOne() or (a.isZero() and r.isZero())
      for _ in 0 ..< Iters:
        let a = rng.random_long01Seq(Fp[curve])
        aInv.inv(a)
@ -280,6 +311,12 @@ proc main() =
        r.prod(aInv, a)
        check: bool r.isOne() or (a.isZero() and r.isZero())

        aInv.inv_vartime(a)
        r.prod(a, aInv)
        check: bool r.isOne() or (a.isZero() and r.isZero())
        r.prod(aInv, a)
        check: bool r.isOne() or (a.isZero() and r.isZero())

    testRandomInv P224
    testRandomInv BN254_Nogami
    testRandomInv BN254_Snarks
@ -295,7 +332,7 @@ proc main() =
main()

proc main_anti_regression =
  suite "Bug highlighted by property-based testing" & " [" & $WordBitWidth & "-bit mode]":
  suite "Bug highlighted by property-based testing" & " [" & $WordBitWidth & "-bit words]":
    # test "#30 - Euler's Criterion should be 1 for square on FKM12_447":
    #   var a: Fp[FKM12_447]
    #   # square of "0x406e5e74ee09c84fa0c59f2db3ac814a4937e2f57ecd3c0af4265e04598d643c5b772a6549a2d9b825445c34b8ba100fe8d912e61cfda43d"

@ -146,7 +146,7 @@ proc randomSqrtRatioCheck(C: static Curve) =
    testSqrtRatioImpl(u, v)

proc main() =
  suite "Modular square root" & " [" & $WordBitWidth & "-bit mode]":
  suite "Modular square root" & " [" & $WordBitWidth & "-bit words]":
    exhaustiveCheck Fake103, 103
    # exhaustiveCheck Fake10007, 10007
    # exhaustiveCheck Fake65519, 65519
@ -161,14 +161,14 @@ proc main() =
    randomSqrtCheck Pallas
    randomSqrtCheck Vesta

  suite "Modular sqrt(u/v)" & " [" & $WordBitWidth & "-bit mode]":
  suite "Modular sqrt(u/v)" & " [" & $WordBitWidth & "-bit words]":
    randomSqrtRatioCheck Edwards25519
    randomSqrtRatioCheck Jubjub
    randomSqrtRatioCheck Bandersnatch
    randomSqrtRatioCheck Pallas
    randomSqrtRatioCheck Vesta

  suite "Modular square root - 32-bit bugs highlighted by property-based testing " & " [" & $WordBitWidth & "-bit mode]":
  suite "Modular square root - 32-bit bugs highlighted by property-based testing " & " [" & $WordBitWidth & "-bit words]":
    # test "FKM12_447 - #30": - Deactivated, we don't support the curve as no one uses it.
    #   var a: Fp[FKM12_447]
    #   a.fromHex"0x406e5e74ee09c84fa0c59f2db3ac814a4937e2f57ecd3c0af4265e04598d643c5b772a6549a2d9b825445c34b8ba100fe8d912e61cfda43d"

@ -175,7 +175,7 @@ proc test_invpow(C: static Curve, gen: RandomGen) =

  doAssert: bool(xa == xqya)

suite "Exponentiation in 𝔽p12" & " [" & $WordBitWidth & "-bit mode]":
suite "Exponentiation in 𝔽p12" & " [" & $WordBitWidth & "-bit words]":
  staticFor(curve, TestCurves):
    test "xᴬ xᴮ = xᴬ⁺ᴮ on " & $curve:
      test_sameBaseProduct(curve, gen = Uniform)

@ -70,14 +70,14 @@ proc randomSqrtCheck(C: static Curve, gen: RandomGen) =
      bool(s == a or s == na)

proc main() =
  suite "Modular square root" & " [" & $WordBitWidth & "-bit mode]":
  suite "Modular square root" & " [" & $WordBitWidth & "-bit words]":
    staticFor(curve, TestCurves):
      test "[𝔽p2] Random square root check for " & $curve:
        randomSqrtCheck(curve, gen = Uniform)
        randomSqrtCheck(curve, gen = HighHammingWeight)
        randomSqrtCheck(curve, gen = Long01Sequence)

  suite "Modular square root - 32-bit bugs highlighted by property-based testing " & " [" & $WordBitWidth & "-bit mode]":
  suite "Modular square root - 32-bit bugs highlighted by property-based testing " & " [" & $WordBitWidth & "-bit words]":
    test "sqrt_if_square invalid square BLS12_381 - #64":
      var a: Fp2[BLS12_381]
      a.fromHex(
@ -98,7 +98,7 @@ proc main() =
        bool not a.isSquare()
        bool not a.sqrt_if_square()

  suite "Modular square root - Assembly bugs highlighted by property-based testing " & " [" & $WordBitWidth & "-bit mode]":
  suite "Modular square root - Assembly bugs highlighted by property-based testing " & " [" & $WordBitWidth & "-bit words]":
    test "Don't set Neg(Zero) fields to modulus (non-unique Montgomery repr) - #136":
      # https://github.com/mratsim/constantine/issues/136
      # and https://github.com/mratsim/constantine/issues/114

@ -23,7 +23,7 @@ proc checkCubeRootOfUnity(curve: static Curve) =
  check: bool cru.isOne()

proc main() =
  suite "Sanity checks on precomputed values" & " [" & $WordBitWidth & "-bit mode]":
  suite "Sanity checks on precomputed values" & " [" & $WordBitWidth & "-bit words]":
    checkCubeRootOfUnity(BN254_Snarks)
    checkCubeRootOfUnity(BLS12_377)
    checkCubeRootOfUnity(BLS12_381)

@ -68,7 +68,7 @@ proc runFrobeniusTowerTests*[N](
  rng.seed(seed)
  echo moduleName, " xoshiro512** seed: ", seed

  suite testSuiteDesc & " [" & $WordBitWidth & "-bit mode]":
  suite testSuiteDesc & " [" & $WordBitWidth & "-bit words]":
    test "Frobenius(a) = a^p (mod p^" & $ExtDegree & ")":
      proc test(Field: typedesc, Iters: static int, gen: RandomGen) =
        for _ in 0 ..< Iters:

Some files were not shown because too many files have changed in this diff.