Accelerate FFT - endomorphism + wNAF vartime scalar mul (#258)

* accel FFT by 30+% with vartime endomorphism support * silly error fix * endomorphism + wNAF, closes #253, FFT 20% speedup * vartime EC addition for all repr * implement vartime EC add * finishing touches, rename to fft_vartime
2023-09-04 10:19:14 +02:00 · 2023-09-04 10:19:14 +02:00 · b9c911ba37
parent 4981c383bb
commit b9c911ba37
20 changed files with 1301 additions and 170 deletions
--- a/benchmarks/bench_ec_g1_scalar_mul.nim
+++ b/benchmarks/bench_ec_g1_scalar_mul.nim
@ -45,11 +45,11 @@ proc main() =
  staticFor i, 0, AvailableCurves.len:
    const curve = AvailableCurves[i]
    const bits = curve.getCurveOrderBitwidth()
-    scalarMulUnsafeDoubleAddBench(ECP_ShortW_Prj[Fp[curve], G1], bits, MulIters)
-    scalarMulUnsafeDoubleAddBench(ECP_ShortW_Jac[Fp[curve], G1], bits, MulIters)
+    scalarMulVartimeDoubleAddBench(ECP_ShortW_Prj[Fp[curve], G1], bits, MulIters)
+    scalarMulVartimeDoubleAddBench(ECP_ShortW_Jac[Fp[curve], G1], bits, MulIters)
    separator()
-    scalarMulUnsafeMinHammingWeightRecodingBench(ECP_ShortW_Prj[Fp[curve], G1], bits, MulIters)
-    scalarMulUnsafeMinHammingWeightRecodingBench(ECP_ShortW_Jac[Fp[curve], G1], bits, MulIters)
+    scalarMulVartimeMinHammingWeightRecodingBench(ECP_ShortW_Prj[Fp[curve], G1], bits, MulIters)
+    scalarMulVartimeMinHammingWeightRecodingBench(ECP_ShortW_Jac[Fp[curve], G1], bits, MulIters)
    separator()
    scalarMulGenericBench(ECP_ShortW_Prj[Fp[curve], G1], bits, window = 2, MulIters)
    scalarMulGenericBench(ECP_ShortW_Prj[Fp[curve], G1], bits, window = 3, MulIters)
@ -60,16 +60,25 @@ proc main() =
    scalarMulGenericBench(ECP_ShortW_Jac[Fp[curve], G1], bits, window = 4, MulIters)
    scalarMulGenericBench(ECP_ShortW_Jac[Fp[curve], G1], bits, window = 5, MulIters)
    separator()
-    scalarMulUnsafeWNAFBench(ECP_ShortW_Prj[Fp[curve], G1], bits, window = 2, MulIters)
-    scalarMulUnsafeWNAFBench(ECP_ShortW_Prj[Fp[curve], G1], bits, window = 3, MulIters)
-    scalarMulUnsafeWNAFBench(ECP_ShortW_Prj[Fp[curve], G1], bits, window = 4, MulIters)
-    scalarMulUnsafeWNAFBench(ECP_ShortW_Prj[Fp[curve], G1], bits, window = 5, MulIters)
-    scalarMulUnsafeWNAFBench(ECP_ShortW_Jac[Fp[curve], G1], bits, window = 2, MulIters)
-    scalarMulUnsafeWNAFBench(ECP_ShortW_Jac[Fp[curve], G1], bits, window = 3, MulIters)
-    scalarMulUnsafeWNAFBench(ECP_ShortW_Jac[Fp[curve], G1], bits, window = 4, MulIters)
-    scalarMulUnsafeWNAFBench(ECP_ShortW_Jac[Fp[curve], G1], bits, window = 5, MulIters)
+    scalarMulVartimeWNAFBench(ECP_ShortW_Prj[Fp[curve], G1], bits, window = 2, MulIters)
+    scalarMulVartimeWNAFBench(ECP_ShortW_Prj[Fp[curve], G1], bits, window = 3, MulIters)
+    scalarMulVartimeWNAFBench(ECP_ShortW_Prj[Fp[curve], G1], bits, window = 4, MulIters)
+    scalarMulVartimeWNAFBench(ECP_ShortW_Prj[Fp[curve], G1], bits, window = 5, MulIters)
+    scalarMulVartimeWNAFBench(ECP_ShortW_Jac[Fp[curve], G1], bits, window = 2, MulIters)
+    scalarMulVartimeWNAFBench(ECP_ShortW_Jac[Fp[curve], G1], bits, window = 3, MulIters)
+    scalarMulVartimeWNAFBench(ECP_ShortW_Jac[Fp[curve], G1], bits, window = 4, MulIters)
+    scalarMulVartimeWNAFBench(ECP_ShortW_Jac[Fp[curve], G1], bits, window = 5, MulIters)
    separator()
    when bits >= 196: # All endomorphisms constants are below this threshold
+      scalarMulVartimeEndoWNAFBench(ECP_ShortW_Prj[Fp[curve], G1], bits, window = 2, MulIters)
+      scalarMulVartimeEndoWNAFBench(ECP_ShortW_Prj[Fp[curve], G1], bits, window = 3, MulIters)
+      scalarMulVartimeEndoWNAFBench(ECP_ShortW_Prj[Fp[curve], G1], bits, window = 4, MulIters)
+      scalarMulVartimeEndoWNAFBench(ECP_ShortW_Prj[Fp[curve], G1], bits, window = 5, MulIters)
+      scalarMulVartimeEndoWNAFBench(ECP_ShortW_Jac[Fp[curve], G1], bits, window = 2, MulIters)
+      scalarMulVartimeEndoWNAFBench(ECP_ShortW_Jac[Fp[curve], G1], bits, window = 3, MulIters)
+      scalarMulVartimeEndoWNAFBench(ECP_ShortW_Jac[Fp[curve], G1], bits, window = 4, MulIters)
+      scalarMulVartimeEndoWNAFBench(ECP_ShortW_Jac[Fp[curve], G1], bits, window = 5, MulIters)
+      separator()
      scalarMulEndo(      ECP_ShortW_Prj[Fp[curve], G1], bits, MulIters)
      scalarMulEndoWindow(ECP_ShortW_Prj[Fp[curve], G1], bits, MulIters)
      scalarMulEndo(      ECP_ShortW_Jac[Fp[curve], G1], bits, MulIters)
--- a/benchmarks/bench_ec_g2_scalar_mul.nim
+++ b/benchmarks/bench_ec_g2_scalar_mul.nim
@ -46,11 +46,11 @@ proc main() =
  staticFor i, 0, AvailableCurves.len:
    const curve = AvailableCurves[i]
    const bits = curve.getCurveOrderBitwidth()
-    scalarMulUnsafeDoubleAddBench(ECP_ShortW_Prj[Fp2[curve], G2], bits, MulIters)
-    scalarMulUnsafeDoubleAddBench(ECP_ShortW_Jac[Fp2[curve], G2], bits, MulIters)
+    scalarMulVartimeDoubleAddBench(ECP_ShortW_Prj[Fp2[curve], G2], bits, MulIters)
+    scalarMulVartimeDoubleAddBench(ECP_ShortW_Jac[Fp2[curve], G2], bits, MulIters)
    separator()
-    scalarMulUnsafeMinHammingWeightRecodingBench(ECP_ShortW_Prj[Fp2[curve], G2], bits, MulIters)
-    scalarMulUnsafeMinHammingWeightRecodingBench(ECP_ShortW_Jac[Fp2[curve], G2], bits, MulIters)
+    scalarMulVartimeMinHammingWeightRecodingBench(ECP_ShortW_Prj[Fp2[curve], G2], bits, MulIters)
+    scalarMulVartimeMinHammingWeightRecodingBench(ECP_ShortW_Jac[Fp2[curve], G2], bits, MulIters)
    separator()
    scalarMulGenericBench(ECP_ShortW_Prj[Fp2[curve], G2], bits, window = 2, MulIters)
    scalarMulGenericBench(ECP_ShortW_Prj[Fp2[curve], G2], bits, window = 3, MulIters)
@ -61,16 +61,25 @@ proc main() =
    scalarMulGenericBench(ECP_ShortW_Jac[Fp2[curve], G2], bits, window = 4, MulIters)
    scalarMulGenericBench(ECP_ShortW_Jac[Fp2[curve], G2], bits, window = 5, MulIters)
    separator()
-    scalarMulUnsafeWNAFBench(ECP_ShortW_Prj[Fp2[curve], G2], bits, window = 2, MulIters)
-    scalarMulUnsafeWNAFBench(ECP_ShortW_Prj[Fp2[curve], G2], bits, window = 3, MulIters)
-    scalarMulUnsafeWNAFBench(ECP_ShortW_Prj[Fp2[curve], G2], bits, window = 4, MulIters)
-    scalarMulUnsafeWNAFBench(ECP_ShortW_Prj[Fp2[curve], G2], bits, window = 5, MulIters)
-    scalarMulUnsafeWNAFBench(ECP_ShortW_Jac[Fp2[curve], G2], bits, window = 2, MulIters)
-    scalarMulUnsafeWNAFBench(ECP_ShortW_Jac[Fp2[curve], G2], bits, window = 3, MulIters)
-    scalarMulUnsafeWNAFBench(ECP_ShortW_Jac[Fp2[curve], G2], bits, window = 4, MulIters)
-    scalarMulUnsafeWNAFBench(ECP_ShortW_Jac[Fp2[curve], G2], bits, window = 5, MulIters)
+    scalarMulVartimeWNAFBench(ECP_ShortW_Prj[Fp2[curve], G2], bits, window = 2, MulIters)
+    scalarMulVartimeWNAFBench(ECP_ShortW_Prj[Fp2[curve], G2], bits, window = 3, MulIters)
+    scalarMulVartimeWNAFBench(ECP_ShortW_Prj[Fp2[curve], G2], bits, window = 4, MulIters)
+    scalarMulVartimeWNAFBench(ECP_ShortW_Prj[Fp2[curve], G2], bits, window = 5, MulIters)
+    scalarMulVartimeWNAFBench(ECP_ShortW_Jac[Fp2[curve], G2], bits, window = 2, MulIters)
+    scalarMulVartimeWNAFBench(ECP_ShortW_Jac[Fp2[curve], G2], bits, window = 3, MulIters)
+    scalarMulVartimeWNAFBench(ECP_ShortW_Jac[Fp2[curve], G2], bits, window = 4, MulIters)
+    scalarMulVartimeWNAFBench(ECP_ShortW_Jac[Fp2[curve], G2], bits, window = 5, MulIters)
    separator()
    when bits >= 196: # All endomorphisms constants are below this threshold
+      scalarMulVartimeEndoWNAFBench(ECP_ShortW_Prj[Fp2[curve], G2], bits, window = 2, MulIters)
+      scalarMulVartimeEndoWNAFBench(ECP_ShortW_Prj[Fp2[curve], G2], bits, window = 3, MulIters)
+      scalarMulVartimeEndoWNAFBench(ECP_ShortW_Prj[Fp2[curve], G2], bits, window = 4, MulIters)
+      scalarMulVartimeEndoWNAFBench(ECP_ShortW_Prj[Fp2[curve], G2], bits, window = 5, MulIters)
+      scalarMulVartimeEndoWNAFBench(ECP_ShortW_Jac[Fp2[curve], G2], bits, window = 2, MulIters)
+      scalarMulVartimeEndoWNAFBench(ECP_ShortW_Jac[Fp2[curve], G2], bits, window = 3, MulIters)
+      scalarMulVartimeEndoWNAFBench(ECP_ShortW_Jac[Fp2[curve], G2], bits, window = 4, MulIters)
+      scalarMulVartimeEndoWNAFBench(ECP_ShortW_Jac[Fp2[curve], G2], bits, window = 5, MulIters)
+      separator()
      scalarMulEndo(ECP_ShortW_Prj[Fp2[curve], G2], bits, MulIters)
      scalarMulEndo(ECP_ShortW_Jac[Fp2[curve], G2], bits, MulIters)
      separator()
--- a/benchmarks/bench_elliptic_template.nim
+++ b/benchmarks/bench_elliptic_template.nim
@ -52,9 +52,9 @@ proc report(op, elliptic: string, start, stop: MonoTime, startClk, stopClk: int6
  let ns = inNanoseconds((stop-start) div iters)
  let throughput = 1e9 / float64(ns)
  when SupportsGetTicks:
-    echo &"{op:<68} {elliptic:<32} {throughput:>15.3f} ops/s {ns:>16} ns/op {(stopClk - startClk) div iters:>12} CPU cycles (approx)"
+    echo &"{op:<68} {elliptic:<36} {throughput:>15.3f} ops/s {ns:>16} ns/op {(stopClk - startClk) div iters:>12} CPU cycles (approx)"
  else:
-    echo &"{op:<68} {elliptic:<32} {throughput:>15.3f} ops/s {ns:>16} ns/op"
+    echo &"{op:<68} {elliptic:<36} {throughput:>15.3f} ops/s {ns:>16} ns/op"

 template bench*(op: string, EC: typedesc, iters: int, body: untyped): untyped =
  measure(iters, startTime, stopTime, startClk, stopClk, body)
@ -74,8 +74,12 @@ proc addBench*(EC: typedesc, iters: int) =
    bench("EC Add vartime " & $EC.G, EC, iters):
      r.sum_vartime(P, Q)
  else:
-    bench("EC Add " & $EC.G, EC, iters):
-      r.sum(P, Q)
+    block:
+      bench("EC Add " & $EC.G, EC, iters):
+        r.sum(P, Q)
+    block:
+      bench("EC Add vartime " & $EC.G, EC, iters):
+        r.sum_vartime(P, Q)

 proc mixedAddBench*(EC: typedesc, iters: int) =
  var r {.noInit.}: EC
@ -88,8 +92,12 @@ proc mixedAddBench*(EC: typedesc, iters: int) =
    bench("EC Mixed Addition vartime " & $EC.G, EC, iters):
      r.madd_vartime(P, Qaff)
  else:
-    bench("EC Mixed Addition " & $EC.G, EC, iters):
-      r.madd(P, Qaff)
+    block:
+      bench("EC Mixed Addition " & $EC.G, EC, iters):
+        r.madd(P, Qaff)
+    block:
+      bench("EC Mixed Addition vartime " & $EC.G, EC, iters):
+        r.madd_vartime(P, Qaff)

 proc doublingBench*(EC: typedesc, iters: int) =
  var r {.noInit.}: EC
@ -175,39 +183,50 @@ proc scalarMulEndoWindow*(EC: typedesc, bits: static int, iters: int) =
    else:
      {.error: "Not implemented".}

-proc scalarMulUnsafeDoubleAddBench*(EC: typedesc, bits: static int, iters: int) =
+proc scalarMulVartimeDoubleAddBench*(EC: typedesc, bits: static int, iters: int) =
  var r {.noInit.}: EC
  var P = rng.random_unsafe(EC)
  P.clearCofactor()

  let exponent = rng.random_unsafe(BigInt[bits])

-  bench("EC ScalarMul " & $bits & "-bit " & $EC.G & " (unsafe reference DoubleAdd)", EC, iters):
+  bench("EC ScalarMul " & $bits & "-bit " & $EC.G & " (vartime reference DoubleAdd)", EC, iters):
    r = P
    r.scalarMul_doubleAdd_vartime(exponent)

-proc scalarMulUnsafeMinHammingWeightRecodingBench*(EC: typedesc, bits: static int, iters: int) =
+proc scalarMulVartimeMinHammingWeightRecodingBench*(EC: typedesc, bits: static int, iters: int) =
  var r {.noInit.}: EC
  var P = rng.random_unsafe(EC)
  P.clearCofactor()

  let exponent = rng.random_unsafe(BigInt[bits])

-  bench("EC ScalarMul " & $bits & "-bit " & $EC.G & " (unsafe min Hamming Weight recoding)", EC, iters):
+  bench("EC ScalarMul " & $bits & "-bit " & $EC.G & " (vartime min Hamming Weight recoding)", EC, iters):
    r = P
    r.scalarMul_minHammingWeight_vartime(exponent)

-proc scalarMulUnsafeWNAFBench*(EC: typedesc, bits, window: static int, iters: int) =
+proc scalarMulVartimeWNAFBench*(EC: typedesc, bits, window: static int, iters: int) =
  var r {.noInit.}: EC
  var P = rng.random_unsafe(EC)
  P.clearCofactor()

  let exponent = rng.random_unsafe(BigInt[bits])

-  bench("EC ScalarMul " & $bits & "-bit " & $EC.G & " (unsafe wNAF-" & $window & ")", EC, iters):
+  bench("EC ScalarMul " & $bits & "-bit " & $EC.G & " (vartime wNAF-" & $window & ")", EC, iters):
    r = P
    r.scalarMul_minHammingWeight_windowed_vartime(exponent, window)

+proc scalarMulVartimeEndoWNAFBench*(EC: typedesc, bits, window: static int, iters: int) =
+  var r {.noInit.}: EC
+  var P = rng.random_unsafe(EC)
+  P.clearCofactor()
+
+  let exponent = rng.random_unsafe(BigInt[bits])  # generated once, outside the benchmarked body
+
+  bench("EC ScalarMul " & $bits & "-bit " & $EC.G & " (vartime endomorphism + wNAF-" & $window & ")", EC, iters):
+    r = P  # reset r to the input point before multiplying it in place
+    r.scalarMulEndo_minHammingWeight_windowed_vartime(exponent, window)
+
 proc multiAddBench*(EC: typedesc, numPoints: int, useBatching: bool, iters: int) =
  var points = newSeq[ECP_ShortW_Aff[EC.F, EC.G]](numPoints)

--- a/constantine/math/elliptic/ec_endomorphism_accel.nim
+++ b/constantine/math/elliptic/ec_endomorphism_accel.nim
@ -280,8 +280,7 @@ func secretLookup[T](dst: var T, table: openArray[T], index: SecretWord) =

 func scalarMulEndo*[scalBits; EC](
       P: var EC,
-       scalar: BigInt[scalBits]
-     ) =
+       scalar: BigInt[scalBits]) =
  ## Elliptic Curve Scalar Multiplication
  ##
  ##   P <- [k] P
--- a/constantine/math/elliptic/ec_multi_scalar_mul.nim
+++ b/constantine/math/elliptic/ec_multi_scalar_mul.nim
@ -76,8 +76,8 @@ func multiScalarMulImpl_reference_vartime[F, G; bits: static int](

    # Example with c = 3, 2³ = 8
    for k in countdown(numBuckets-2, 0):
-      accumBuckets += buckets[k] # Stores S₈ then    S₈+S₇ then       S₈+S₇+S₆ then ...
-      miniMSM += accumBuckets    # Stores S₈ then [2]S₈+S₇ then [3]S₈+[2]S₇+S₆ then ...
+      accumBuckets.sum_vartime(accumBuckets, buckets[k]) # Stores S₈ then    S₈+S₇ then       S₈+S₇+S₆ then ...
+      miniMSM.sum_vartime(miniMSM, accumBuckets)         # Stores S₈ then [2]S₈+S₇ then [3]S₈+[2]S₇+S₆ then ...

    miniMSMs[w] = miniMSM

@ -86,7 +86,7 @@ func multiScalarMulImpl_reference_vartime[F, G; bits: static int](
  for w in countdown(numWindows-2, 0):
    for _ in 0 ..< c:
      r.double()
-    r += miniMSMs[w]
+    r.sum_vartime(r, miniMSMs[w])

  # Cleanup
  # -------
@ -152,8 +152,8 @@ func bucketReduce[EC](r: var EC, buckets: ptr UncheckedArray[EC], numBuckets: st
  buckets[numBuckets-1].setInf()

  for k in countdown(numBuckets-2, 0):
-    accumBuckets += buckets[k]
-    r += accumBuckets
+    accumBuckets.sum_vartime(accumBuckets, buckets[k])
+    r.sum_vartime(r, accumBuckets)
    buckets[k].setInf()

 type MiniMsmKind* = enum
@ -211,7 +211,7 @@ func miniMSM_jacext[F, G; bits: static int](
    coefs, points, N)

  # 3. Mini-MSM on the slice [bitIndex, bitIndex+window)
-  r += windowSum
+  r.sum_vartime(r, windowSum)
  when miniMsmKind != kBottomWindow:
    for _ in 0 ..< c:
      r.double()
@ -303,7 +303,7 @@ func miniMSM_affine[NumBuckets, QueueLen, F, G; bits: static int](
  # 3. Mini-MSM on the slice [bitIndex, bitIndex+window)
  var windowSum{.noInit.}: typeof(r)
  windowSum.fromJacobianExtended_vartime(windowSum_jacext)
-  r += windowSum
+  r.sum_vartime(r, windowSum)

  when miniMsmKind != kBottomWindow:
    for _ in 0 ..< c:
--- a/constantine/math/elliptic/ec_multi_scalar_mul_parallel.nim
+++ b/constantine/math/elliptic/ec_multi_scalar_mul_parallel.nim
@ -200,7 +200,7 @@ proc msmJacExt_vartime_parallel*[bits: static int, EC, F, G](
      for _ in 0 ..< c:
        r[].double()
      discard sync miniMSMsReady[w]
-      r[] += miniMSMsResults[w]
+      r[].sum_vartime(r[], miniMSMsResults[w])
  elif numWindows >= 2:
    discard sync miniMSMsReady[numWindows-2]
    r[] = miniMSMsResults[numWindows-2]
@ -208,7 +208,7 @@ proc msmJacExt_vartime_parallel*[bits: static int, EC, F, G](
      for _ in 0 ..< c:
        r[].double()
      discard sync miniMSMsReady[w]
-      r[] += miniMSMsResults[w]
+      r[].sum_vartime(r[], miniMSMsResults[w])

  # Cleanup
  # -------
@ -389,7 +389,7 @@ proc msmAffine_vartime_parallel*[bits: static int, EC, F, G](
      for _ in 0 ..< c:
        r[].double()
      discard sync miniMSMsReady[w]
-      r[] += miniMSMsResults[w]
+      r[].sum_vartime(r[], miniMSMsResults[w])
  elif numWindows >= 2:
    discard sync miniMSMsReady[numWindows-2]
    r[] = miniMSMsResults[numWindows-2]
@ -397,7 +397,7 @@ proc msmAffine_vartime_parallel*[bits: static int, EC, F, G](
      for _ in 0 ..< c:
        r[].double()
      discard sync miniMSMsReady[w]
-      r[] += miniMSMsResults[w]
+      r[].sum_vartime(r[], miniMSMsResults[w])

  # Cleanup
  # -------
@ -446,7 +446,7 @@ proc msmAffine_vartime_parallel_split[bits: static int, EC, F, G](

  for i in countdown(msmParallelism-2, 0):
    discard sync splitMSMsReady[i]
-    r[] += splitMSMsResults[i]
+    r[].sum_vartime(r[], splitMSMsResults[i])

  freeHeap(splitMSMsResults)

--- a/constantine/math/elliptic/ec_scalar_mul_vartime.nim
+++ b/constantine/math/elliptic/ec_scalar_mul_vartime.nim
@ -8,21 +8,33 @@

 import
  # Internals
+  ./ec_endomorphism_accel,
  ../arithmetic,
+  ../extension_fields,
  ../ec_shortweierstrass,
  ../io/io_bigints,
-  ../../platforms/abstractions
+  ../constants/zoo_endomorphisms,
+  ../isogenies/frobenius,
+  ../../platforms/abstractions,
+  ../../math_arbitrary_precision/arithmetic/limbs_views

 {.push raises: [].} # No exceptions allowed in core cryptographic operations
 {.push checks: off.} # No defects due to array bound checking or signed integer overflow allowed

-# Support files for testing Elliptic Curve arithmetic
+# Bit operations
 # ------------------------------------------------------------------------------

 iterator unpackBE(scalarByte: byte): bool =
  for i in countdown(7, 0):  # bit 7 (most significant) first, down to bit 0
    yield bool((scalarByte shr i) and 1)

+# Variable-time scalar multiplication
+# ------------------------------------------------------------------------------
+template `+=`[F; G: static Subgroup](P: var ECP_ShortW[F, G], Q: ECP_ShortW_Aff[F, G]) =
+  P.madd_vartime(P, Q)  # in-place form: P is both the output and the first operand, Q is an affine point
+template `-=`[F; G: static Subgroup](P: var ECP_ShortW[F, G], Q: ECP_ShortW_Aff[F, G]) =
+  P.msub_vartime(P, Q)  # NOTE(review): presumably the subtraction counterpart (P <- P - Q) — confirm against msub_vartime
+
 func scalarMul_doubleAdd_vartime*[EC](P: var EC, scalar: BigInt) {.tags:[VarTime].} =
  ## **Variable-time** Elliptic Curve Scalar Multiplication
  ##
@ -39,11 +51,106 @@ func scalarMul_doubleAdd_vartime*[EC](P: var EC, scalar: BigInt) {.tags:[VarTime
  Paff.affine(P)

  P.setInf()
+  var isInf = true
+
  for scalarByte in scalarCanonical:
    for bit in unpackBE(scalarByte):
-      P.double()
+      if not isInf:
+        P.double()
      if bit:
-        P += Paff
+        if isInf:
+          P.fromAffine(Paff)
+          isInf = false
+        else:
+          P += Paff
+
+func scalarMul_addchain_4bit_vartime[EC](P: var EC, scalar: BigInt) {.tags:[VarTime].} =
+  ## **Variable-time** Elliptic Curve Scalar Multiplication, P <- [k] P, using fixed addition chains.
+  ## Only the lowest limb of `scalar` is read and it must be in [0, 2⁴ = 16); any other value reaches `unreachable()`.
+  let s = uint scalar.limbs[0]
+
+  case s
+  of 0:
+    P.setInf()
+  of 1:
+    return
+  of 2:
+    P.double()
+  of 3:
+    var t {.noInit.}: EC
+    t.double(P)          # [2]P
+    P.sum_vartime(P, t)  # [3]P
+  of 4:
+    P.double()
+    P.double()
+  of 5:
+    var t {.noInit.}: EC
+    t.double(P)          # [2]P
+    t.double()           # [4]P (in-place doubling; doubling P a second time would leave t at [2]P)
+    P.sum_vartime(P, t)  # [5]P
+  of 6:
+    var t {.noInit.}: EC
+    t.double(P)          # [2]P
+    P.sum_vartime(P, t)  # [3]P
+    P.double()           # [6]P
+  of 7:
+    var t {.noInit.}: EC
+    t.double(P)
+    t.double()
+    t.double()            # [8]P
+    P.diff_vartime(t, P)  # [7]P = [8]P - P
+  of 8:
+    P.double()
+    P.double()
+    P.double()
+  of 9:
+    var t {.noInit.}: EC
+    t.double(P)
+    t.double()
+    t.double()           # [8]P
+    P.sum_vartime(P, t)  # [9]P
+  of 10:
+    var t {.noInit.}: EC
+    t.double(P)
+    t.double()           # [4]P
+    P.sum_vartime(P, t)  # [5]P
+    P.double()           # [10]P
+  of 11:
+    var t1 {.noInit.}, t2 {.noInit.}: EC
+    t1.double(P)  # [2]P
+    t2.double(t1)
+    t2.double()   # [8]P
+    t1.sum_vartime(t1, t2)  # [10]P
+    P.sum_vartime(P, t1)    # [11]P
+  of 12:
+    var t1 {.noInit.}, t2 {.noInit.}: EC
+    t1.double(P)
+    t1.double()   # [4]P
+    t2.double(t1) # [8]P
+    P.sum_vartime(t1, t2)  # [12]P
+  of 13:
+    var t1 {.noInit.}, t2 {.noInit.}: EC
+    t1.double(P)
+    t1.double()   # [4]P
+    t2.double(t1) # [8]P
+    t1.sum_vartime(t1, t2)  # [12]P
+    P.sum_vartime(P, t1)    # [13]P
+  of 14:
+    var t {.noInit.}: EC
+    t.double(P)
+    t.double()
+    t.double()
+    t.diff_vartime(t, P) # [7]P
+    P.double(t)          # [14]P
+  of 15:
+    var t {.noInit.}: EC
+    t.double(P)
+    t.double()
+    t.double()
+    t.double()            # [16]P
+    P.diff_vartime(t, P)  # [15]P = [16]P - P
+  else:
+    unreachable()

 func scalarMul_minHammingWeight_vartime*[EC](P: var EC, scalar: BigInt) {.tags:[VarTime].}  =
  ## **Variable-time** Elliptic Curve Scalar Multiplication
@ -66,6 +173,36 @@ func scalarMul_minHammingWeight_vartime*[EC](P: var EC, scalar: BigInt) {.tags:[
    elif bit == -1:
      P -= Paff

+func initNAF[precompSize, NafMax: static int, EC, ECaff](
+       P: var EC,
+       tab: array[precompSize, ECaff],
+       naf: array[NafMax, int8], nafLen: int,
+       nafIteratorIdx: int): bool {.inline.} =
+  ## Sets P from the NAF digit at `naf[nafLen-1-nafIteratorIdx]`; returns true iff that digit is non-zero (P is set to infinity otherwise).
+  let digit = naf[nafLen-1-nafIteratorIdx]
+  if digit > 0:
+    P.fromAffine(tab[digit shr 1])
+    return true
+  elif digit < 0:
+    P.fromAffine(tab[-digit shr 1])  # index with the digit's magnitude (a negative index is out of bounds), then negate
+    P.neg()
+    return true
+  else:
+    P.setInf()
+    return false
+
+func accumNAF[precompSize, NafMax: static int, EC, ECaff](
+       P: var EC,
+       tab: array[precompSize, ECaff],
+       naf: array[NafMax, int8], nafLen: int,
+       nafIteratorIdx: int) {.inline.} =
+    ## Reads the NAF digit at `naf[nafLen-1-nafIteratorIdx]`: adds a table entry to P when it is positive, subtracts one when negative, leaves P untouched when zero.
+    let digit = naf[nafLen-1-nafIteratorIdx]
+    if digit > 0:
+      P += tab[digit shr 1]
+    elif digit < 0:
+      P -= tab[-digit shr 1]  # table is indexed by the digit's magnitude
+
 func scalarMul_minHammingWeight_windowed_vartime*[EC](P: var EC, scalar: BigInt, window: static int) {.tags:[VarTime, Alloca].} =
  ## **Variable-time** Elliptic Curve Scalar Multiplication
  ##
@ -78,45 +215,169 @@ func scalarMul_minHammingWeight_windowed_vartime*[EC](P: var EC, scalar: BigInt,

  # Signed digits divides precomputation table size by 2
  # Odd-only divides precomputation table size by another 2
+
  const precompSize = 1 shl (window - 2)
-
-  when window <= 8:
-    type I = int8
-  elif window <= 16:
-    type I = int16
-  elif window <= 32:
-    type I = int32
-  else:
-    type I = int64
-
-  var naf {.noInit.}: array[BigInt.bits+1, I]
-  let nafLen = naf.recode_r2l_signed_window_vartime(scalar, window)
-
-  var P2{.noInit.}: EC
-  P2.double(P)
+  static: doAssert window < 8, "Window is too large and precomputation would use " & $(precompSize * sizeof(EC)) & " stack space."

  var tabEC {.noinit.}: array[precompSize, EC]
+  var P2{.noInit.}: EC
  tabEC[0] = P
+  P2.double(P)
  for i in 1 ..< tabEC.len:
-    tabEC[i].sum(tabEC[i-1], P2)
+    tabEC[i].sum_vartime(tabEC[i-1], P2)

  var tab {.noinit.}: array[precompSize, affine(EC)]
  tab.batchAffine(tabEC)

-  # init
-  if naf[nafLen-1] > 0:
-    P.fromAffine(tab[naf[nafLen-1] shr 1])
-  elif naf[nafLen-1] < 0:
-    P.fromAffine(tab[-naf[nafLen-1] shr 1])
-    P.neg()
-  else:
-    P.setInf()
+  var naf {.noInit.}: array[BigInt.bits+1, int8]
+  let nafLen = naf.recode_r2l_signed_window_vartime(scalar, window)

-  # steady state
-  for i in 1 ..< nafLen:
-    P.double()
-    let digit = naf[nafLen-1-i]
-    if digit > 0:
-      P += tab[digit shr 1]
-    elif digit < 0:
-      P -= tab[-digit shr 1]
+  var isInit = false
+  for i in 0 ..< nafLen:
+    if isInit:
+      P.double()
+      P.accumNAF(tab, naf, nafLen, i)
+    else:
+      isInit = P.initNAF(tab, naf, nafLen, i)
+
+func scalarMulEndo_minHammingWeight_windowed_vartime*[scalBits: static int; EC](
+       P: var EC,
+       scalar: BigInt[scalBits],
+       window: static int) {.tags:[VarTime, Alloca].} =
+  ## Endomorphism-accelerated windowed vartime scalar multiplication
+  ##
+  ##   P <- [k] P
+  ##
+  ## This uses windowed-NAF (wNAF)
+  ## This MUST NOT be used with secret data.
+  ##
+  ## This is highly VULNERABLE to timing attacks and power analysis attacks
+
+  # Signed digits divide the precomputation table size by 2
+  # Odd-only divides precomputation table size by another 2
+  const precompSize = 1 shl (window - 2)
+  static: doAssert window < 8, "Window is too large and precomputation would use " & $(precompSize * sizeof(EC)) & " stack space."
+
+  when P.F is Fp:
+    const M = 2
+    # 1. Compute endomorphisms
+    var endomorphisms {.noInit.}: array[M-1, EC]
+    when P.G == G1:
+      endomorphisms[0] = P
+      endomorphisms[0].x *= EC.F.C.getCubicRootOfUnity_mod_p()
+    else:
+      endomorphisms[0].frobenius_psi(P, 2)
+
+  elif P.F is Fp2:
+    const M = 4
+    # 1. Compute endomorphisms
+    var endomorphisms {.noInit.}: array[M-1, EC]
+    endomorphisms[0].frobenius_psi(P)
+    endomorphisms[1].frobenius_psi(P, 2)
+    endomorphisms[2].frobenius_psi(P, 3)
+  else:
+    {.error: "Unconfigured".}
+
+  # 2. Decompose scalar into mini-scalars
+  const L = scalBits.ceilDiv_vartime(M) + 1  # bit-width of each of the M mini-scalars
+  var miniScalars {.noInit.}: array[M, BigInt[L]]
+  var negatePoints {.noInit.}: array[M, SecretBool]
+  miniScalars.decomposeEndo(negatePoints, scalar, EC.F)
+
+  # 3. Handle negative mini-scalars
+  if negatePoints[0].bool:
+    P.neg()
+  for m in 1 ..< M:
+    if negatePoints[m].bool:
+      endomorphisms[m-1].neg()
+
+  # 4. EC precomputed table
+  var tabEC {.noinit.}: array[M, array[precompSize, EC]]
+  for m in 0 ..< M:
+    var P2{.noInit.}: EC
+    if m == 0:
+      tabEC[0][0] = P
+      P2.double(P)
+    else:
+      tabEC[m][0] = endomorphisms[m-1]
+      P2.double(endomorphisms[m-1])
+    for i in 1 ..< tabEC[m].len:
+      tabEC[m][i].sum_vartime(tabEC[m][i-1], P2)  # each entry is the previous one plus twice the base point (odd multiples)
+
+  var tab {.noinit.}: array[M, array[precompSize, affine(EC)]]
+  tab.batchAffine(tabEC)  # all M tables are converted in one batch call
+
+  # 5. wNAF precomputed tables
+  const NafLen = L+1  # one digit more than the mini-scalar bit-width
+  var tabNaf {.noinit.}: array[M, array[NafLen, int8]]
+
+  for m in 0 ..< M:
+    # tabNaf returns NAF from least-significant to most significant bits
+    let miniScalarLen = tabNaf[m].recode_r2l_signed_window_vartime(miniScalars[m], window)
+    # We compute from most significant to least significant
+    # so we pad with 0
+    for i in miniScalarLen ..< NafLen:
+      tabNaf[m][i] = 0
+
+  # 6. Compute
+  var isInit = false  # becomes true once initNAF reports a non-zero digit
+
+  for i in 0 ..< NafLen:
+    if isInit:  # no doubling until P has been initialized from a non-zero digit
+      P.double()
+    for m in 0 ..< M:
+      if isInit:
+        P.accumNAF(tab[m], tabNaf[m], NafLen, i)
+      else:
+        isInit = P.initNAF(tab[m], tabNaf[m], NafLen, i)
+
+func scalarMul_vartime*[scalBits; EC](
+       P: var EC,
+       scalar: BigInt[scalBits]
+     ) {.inline.} =
+  ## Elliptic Curve Scalar Multiplication
+  ##
+  ##   P <- [k] P
+  ##
+  ## This selects the best algorithm depending on heuristics
+  ## and the scalar being multiplied.
+  ## The scalar MUST NOT be a secret as this does not use side-channel countermeasures
+  ##
+  ## This may use endomorphism acceleration.
+  ## As endomorphism acceleration requires:
+  ## - Cofactor to be cleared
+  ## - 0 <= scalar < curve order
+  ## Those conditions will be assumed.
+
+  when P.F is Fp:
+    const M = 2
+  elif P.F is Fp2:
+    const M = 4
+  else:
+    {.error: "Unconfigured".}
+
+  const L = scalBits.ceilDiv_vartime(M) + 1  # same formula as the mini-scalar bit-width in the endomorphism routine
+
+  let usedBits = scalar.limbs.getBits_vartime()  # NOTE(review): presumably the number of significant bits of the scalar — confirm
+
+  when scalBits == EC.F.C.getCurveOrderBitwidth() and
+       EC.F.C.hasEndomorphismAcceleration():
+    if usedBits >= L:
+      when EC.F is Fp:
+        P.scalarMulEndo_minHammingWeight_windowed_vartime(scalar, window = 4)
+      elif EC.F is Fp2:
+        P.scalarMulEndo_minHammingWeight_windowed_vartime(scalar, window = 3)
+      else: # Curves defined on Fp^m with m > 2
+        {.error: "Unreachable".}
+      return  # endomorphism path taken; the generic fallbacks below are skipped
+
+  if 64 < usedBits:
+    # With a window of 5, we precompute 2^3 = 8 points
+    P.scalarMul_minHammingWeight_windowed_vartime(scalar, window = 5)
+  elif 16 < usedBits:
+    # With a window of 3, we precompute 2^1 = 2 points
+    P.scalarMul_minHammingWeight_windowed_vartime(scalar, window = 3)
+  elif 4 < usedBits:
+    P.scalarMul_doubleAdd_vartime(scalar)
+  else:
+    P.scalarMul_addchain_4bit_vartime(scalar)  # at most 4 used bits, i.e. scalar < 16
--- a/constantine/math/elliptic/ec_shortweierstrass_batch_ops.nim
+++ b/constantine/math/elliptic/ec_shortweierstrass_batch_ops.nim
@ -151,6 +151,16 @@ func batchAffine*[N: static int, F, G](
       jacs: array[N, ECP_ShortW_Jac[F, G]]) {.inline.} =
  batchAffine(affs.asUnchecked(), jacs.asUnchecked(), N)

+func batchAffine*[M, N: static int, F, G](
+       affs: var array[M, array[N, ECP_ShortW_Aff[F, G]]],
+       projs: array[M, array[N, ECP_ShortW_Prj[F, G]]]) {.inline.} =
+  batchAffine(affs[0].asUnchecked(), projs[0].asUnchecked(), M*N)  # one flat call over all M*N points starting at row 0; assumes the M rows are contiguous in memory
+
+func batchAffine*[M, N: static int, F, G](
+       affs: var array[M, array[N, ECP_ShortW_Aff[F, G]]],
+       projs: array[M, array[N, ECP_ShortW_Jac[F, G]]]) {.inline.} =
+  batchAffine(affs[0].asUnchecked(), projs[0].asUnchecked(), M*N)  # one flat call over all M*N points starting at row 0; assumes the M rows are contiguous in memory
+
 # ############################################################
 #
 #             Elliptic Curve in Short Weierstrass form
--- a/constantine/math/elliptic/ec_shortweierstrass_batch_ops_parallel.nim
+++ b/constantine/math/elliptic/ec_shortweierstrass_batch_ops_parallel.nim
@ -66,7 +66,7 @@ proc sum_reduce_vartime_parallelChunks[F; G: static Subgroup](
  if chunkDesc.numChunks < minChunkSizeSerial:
    r.setInf()
    for i in 0 ..< chunkDesc.numChunks:
-      r += partialResults[i]
+      r.sum_vartime(r, partialResults[i])
  else:
    let partialResultsAffine = allocStackArray(ECP_ShortW_Aff[F, G], chunkDesc.numChunks)
    partialResultsAffine.batchAffine(partialResults, chunkDesc.numChunks)
@ -99,7 +99,7 @@ proc sum_reduce_vartime_parallelFor[F; G: static Subgroup](
        let n = min(maxStride, pointsLen-i)
        localSum.accumSum_chunk_vartime(p +% i, n)
      merge(remoteSum: Flowvar[typeof(r)]):
-        localSum += sync(remoteSum)
+        localSum.sum_vartime(localSum, sync(remoteSum))
      epilogue:
        return localSum

--- a/constantine/math/elliptic/ec_shortweierstrass_jacobian.md
+++ b/constantine/math/elliptic/ec_shortweierstrass_jacobian.md
@ -71,7 +71,7 @@ Let's look first at Cohen et al, 1998 formulae
 | S₁ = Y₁*Z₂*Z₂Z₂              |                                              |                 |                |
 | S₂ = Y₂*Z₁*Z₁Z₁              |                                              |                 |                |
 | H = U₂-U₁ # P=-Q, P=Inf, P=Q |                                              |                 |                |
-| F = S₂-S₁ # Q=Inf            |                                              |                 |                |
+| R = S₂-S₁ # Q=Inf            |                                              |                 |                |
 |                              |                                              |                 |                |
 | HH = H²                      | YY = Y₁²                                     |                 |                |
 | HHH = H*HH                   | M = 3*X₁²+a*ZZ²                              | 3(X₁-Z₁)(X₁+Z₁) | 3*X₁²          |
--- a/constantine/math/elliptic/ec_shortweierstrass_jacobian.nim
+++ b/constantine/math/elliptic/ec_shortweierstrass_jacobian.nim
@ -184,7 +184,7 @@ template sumImpl[F; G: static Subgroup](
  ## to simple side-channel attacks (SCA)
  ## This is done by using a "complete" or "exception-free" addition law.
  #
-  # Implementation, see write-up at the bottom.
+  # Implementation, see write-up in the accompanying Markdown file.
  # We fuse addition and doubling with condition copy by swapping
  # terms with the following table
  #
@ -403,7 +403,7 @@ func madd*[F; G: static Subgroup](
  ## to simple side-channel attacks (SCA)
  ## This is done by using a "complete" or "exception-free" addition law.
  #
-  # Implementation, see write-up at the bottom.
+  # Implementation, see write-up in the accompanying Markdown file.
  # We fuse addition and doubling with condition copy by swapping
  # terms with the following table
  #
@ -674,3 +674,247 @@ func fromAffine*[F; G](
  jac.y = aff.y
  jac.z.setOne()
  jac.z.csetZero(aff.isInf())
+
+# Variable-time
+# -------------
+
+# In some primitives like FFTs, the extra work done for constant-time
+# is amplified by O(n log n) which may result in extra tens of minutes
+# to hours of computations. Those primitives do not need constant-timeness.
+
+func sum_vartime*[F; G: static Subgroup](
+       r: var ECP_ShortW_Jac[F, G],
+       p, q: ECP_ShortW_Jac[F, G])
+       {.tags:[VarTime], meter.} =
+  ## **Variable-time** Jacobian addition: r = p + q
+  ##
+  ## This MUST NOT be used with secret data.
+  ##
+  ## This is highly VULNERABLE to timing attacks and power analysis attacks.
+
+  if p.isInf().bool:                       # ∞ + q = q
+    r = q
+    return
+  if q.isInf().bool:                       # p + ∞ = p
+    r = p
+    return
+
+  # Accelerate mixed additions: an operand with Z = 1 skips the products by Z, Z² and Z³
+  let isPz1 = p.z.isOne().bool
+  let isQz1 = q.z.isOne().bool
+
+  # Addition, Cohen et al, 1998
+  # General case:            12M + 4S + 6add + 1*2
+  #
+  # Mixed-addition:          8M + 3S + 6add + 1*2
+  # Affine+Affine->Jacobian: 4M + 2S + 6add + 1*2
+
+  # |  Addition, Cohen et al, 1998  |
+  # |  12M + 4S + 6add + 1*2        |
+  # | ----------------------------- |
+  # | Z₁Z₁ = Z₁²                    |
+  # | Z₂Z₂ = Z₂²                    |
+  # |                               |
+  # | U₁ = X₁*Z₂Z₂                  |
+  # | U₂ = X₂*Z₁Z₁                  |
+  # | S₁ = Y₁*Z₂*Z₂Z₂               |
+  # | S₂ = Y₂*Z₁*Z₁Z₁               |
+  # | H  = U₂-U₁ # P=-Q, P=Inf, P=Q |
+  # | R  = S₂-S₁ # Q=Inf            |
+  # |                               |
+  # | HH  = H²                      |
+  # | V   = U₁*HH                   |
+  # | HHH = H*HH                    |
+  # |                               |
+  # | X₃ = R²-HHH-2*V               |
+  # | Y₃ = R*(V-X₃)-S₁*HHH          |
+  # | Z₃ = Z₁*Z₂*H                  |
+
+  var U {.noInit.}, S{.noInit.}, H{.noInit.}, R{.noInit.}: F  # R holds Z₁Z₁, then S₂, then S₂-S₁
+
+  if not isPz1:                            # case Z₁ != 1
+    R.square(p.z, skipFinalSub = true)     #   Z₁Z₁ = Z₁²
+  if isQz1:                                # case Z₂ = 1
+    U = p.x                                #   U₁ = X₁*Z₂Z₂
+    if isPz1:                              #   case Z₁ = Z₂ = 1
+      H = q.x                              #     U₂ = X₂ as Z₁Z₁ = 1
+    else:
+      H.prod(q.x, R)                       #     U₂ = X₂*Z₁Z₁
+    H -= U                                 #   H  = U₂-U₁
+    S = p.y                                #   S₁ = Y₁*Z₂*Z₂Z₂
+  else:                                    # case Z₂ != 1
+    S.square(q.z, skipFinalSub = true)     #   Z₂Z₂ = Z₂², held in S for now
+    U.prod(p.x, S)                         #   U₁ = X₁*Z₂Z₂
+    if isPz1:
+      H = q.x                              #     U₂ = X₂ as Z₁Z₁ = 1
+    else:
+      H.prod(q.x, R)                       #     U₂ = X₂*Z₁Z₁
+    H -= U                                 #   H  = U₂-U₁
+    S.prod(S, q.z, skipFinalSub = true)    #   Z₂*Z₂Z₂
+    S *= p.y                               #   S₁ = Y₁*Z₂*Z₂Z₂
+  if isPz1:
+    R = q.y                                #   S₂ = Y₂ as Z₁ = 1
+  else:
+    R.prod(R, p.z, skipFinalSub = true)    #   Z₁*Z₁Z₁
+    R *= q.y                               #   S₂ = Y₂*Z₁*Z₁Z₁
+  R -= S                                   # R  = S₂-S₁
+
+  if H.isZero().bool:                      # Same x coordinate
+    if R.isZero().bool:                    # case P = Q
+      r.double(p)                          #   H = 0 would give Z₃ = 0, use doubling instead
+      return
+    else:                                  # case P = -Q
+      r.setInf()
+      return
+
+  var HHH{.noInit.}: F
+  template V: untyped = U                  # V reuses the storage of U, U₁ is only needed to build V
+
+  HHH.square(H, skipFinalSub = true)      # HH  = H², held in HHH for now
+  V *= HHH                                # V   = U₁*HH
+  HHH *= H                                # HHH = H*HH
+
+  # X₃ = R²-HHH-2*V, we use the y coordinate as temporary (should we? cache misses?)
+  r.y.square(R)                           # R²
+  r.y -= V                                # R²-V
+  r.y -= V                                # R²-2*V
+  r.x.diff(r.y, HHH)                      # X₃
+
+  # Y₃ = R*(V-X₃)-S₁*HHH
+  V -= r.x                                # V-X₃
+  V *= R                                  # R*(V-X₃)
+  HHH *= S                                # S₁*HHH
+  r.y.diff(V, HHH)                        # Y₃
+
+  # Z₃ = Z₁*Z₂*H, written last: p.z and q.z are still read here and r may alias p or q
+  if isPz1:
+    if isQz1:
+      r.z = H
+    else:
+      r.z.prod(H, q.z)
+  else:
+    if isQz1:
+      r.z.prod(H, p.z)
+    else:
+      r.z.prod(p.z, q.z, skipFinalSub = true)
+      r.z *= H
+
+func madd_vartime*[F; G: static Subgroup](
+       r: var ECP_ShortW_Jac[F, G],
+       p: ECP_ShortW_Jac[F, G],
+       q: ECP_ShortW_Aff[F, G])
+       {.tags:[VarTime], meter.} =
+  ## **Variable-time** Jacobian mixed addition: r = p + q, with q in affine coordinates
+  ##
+  ## This MUST NOT be used with secret data.
+  ##
+  ## This is highly VULNERABLE to timing attacks and power analysis attacks.
+
+  if p.isInf().bool:                       # ∞ + q = q, converted to Jacobian coordinates
+    r.fromAffine(q)
+    return
+  if q.isInf().bool:                       # p + ∞ = p
+    r = p
+    return
+
+  # Accelerate affine + affine additions (Z₁ = 1, on top of the implicit Z₂ = 1 of q)
+  let isPz1 = p.z.isOne().bool
+
+  # Addition, Cohen et al, 1998
+  #
+  # Mixed-addition:          8M + 3S + 6add + 1*2
+  # Affine+Affine->Jacobian: 4M + 2S + 6add + 1*2
+
+  # |  Addition, Cohen et al, 1998  |
+  # |  12M + 4S + 6add + 1*2        |
+  # | ----------------------------- |
+  # | Z₁Z₁ = Z₁²                    |
+  # | Z₂Z₂ = Z₂²                    |
+  # |                               |
+  # | U₁ = X₁*Z₂Z₂                  |
+  # | U₂ = X₂*Z₁Z₁                  |
+  # | S₁ = Y₁*Z₂*Z₂Z₂               |
+  # | S₂ = Y₂*Z₁*Z₁Z₁               |
+  # | H  = U₂-U₁ # P=-Q, P=Inf, P=Q |
+  # | R  = S₂-S₁ # Q=Inf            |
+  # |                               |
+  # | HH  = H²                      |
+  # | V   = U₁*HH                   |
+  # | HHH = H*HH                    |
+  # |                               |
+  # | X₃ = R²-HHH-2*V               |
+  # | Y₃ = R*(V-X₃)-S₁*HHH          |
+  # | Z₃ = Z₁*Z₂*H                  |
+
+  var U {.noInit.}, S{.noInit.}, H{.noInit.}, R{.noInit.}: F  # R holds Z₁Z₁, then S₂, then S₂-S₁
+
+  if not isPz1:                            # case Z₁ != 1
+    R.square(p.z, skipFinalSub = true)     #   Z₁Z₁ = Z₁²
+
+  U = p.x                                  #   U₁ = X₁*Z₂Z₂ = X₁ as Z₂ = 1
+  if isPz1:                                #   case Z₁ = Z₂ = 1
+    H = q.x                                #     U₂ = X₂ as Z₁Z₁ = 1
+  else:
+    H.prod(q.x, R)                         #     U₂ = X₂*Z₁Z₁
+  H -= U                                   #   H  = U₂-U₁
+  S = p.y                                  #   S₁ = Y₁*Z₂*Z₂Z₂ = Y₁ as Z₂ = 1
+
+  if isPz1:
+    R = q.y                                #   S₂ = Y₂ as Z₁ = 1
+  else:
+    R.prod(R, p.z, skipFinalSub = true)    #   Z₁*Z₁Z₁
+    R *= q.y                               #   S₂ = Y₂*Z₁*Z₁Z₁
+  R -= S                                   # R  = S₂-S₁
+
+  if H.isZero().bool:                      # Same x coordinate
+    if R.isZero().bool:                    # case P = Q
+      r.double(p)                          #   H = 0 would give Z₃ = 0, use doubling instead
+      return
+    else:                                  # case P = -Q
+      r.setInf()
+      return
+
+  var HHH{.noInit.}: F
+  template V: untyped = U                  # V reuses the storage of U, U₁ is only needed to build V
+
+  HHH.square(H, skipFinalSub = true)      # HH  = H², held in HHH for now
+  V *= HHH                                # V   = U₁*HH
+  HHH *= H                                # HHH = H*HH
+
+  # X₃ = R²-HHH-2*V, we use the y coordinate as temporary (should we? cache misses?)
+  r.y.square(R)                           # R²
+  r.y -= V                                # R²-V
+  r.y -= V                                # R²-2*V
+  r.x.diff(r.y, HHH)                      # X₃
+
+  # Y₃ = R*(V-X₃)-S₁*HHH
+  V -= r.x                                # V-X₃
+  V *= R                                  # R*(V-X₃)
+  HHH *= S                                # S₁*HHH
+  r.y.diff(V, HHH)                        # Y₃
+
+  # Z₃ = Z₁*Z₂*H with Z₂ = 1, written last: p.z is still read here and r may alias p
+  if isPz1:
+    r.z = H
+  else:
+    r.z.prod(H, p.z)
+
+func diff_vartime*(r: var ECP_ShortW_Jac, P, Q: ECP_ShortW_Jac) {.inline.} =
+  ## **Variable-time** Jacobian subtraction: r = P - Q
+  ##
+  ## This MUST NOT be used with secret data.
+  ##
+  ## This is highly VULNERABLE to timing attacks and power analysis attacks.
+  var nQ {.noInit.}: typeof(Q)             # local copy, Q itself is left untouched
+  nQ.neg(Q)                                # nQ = -Q
+  r.sum_vartime(P, nQ)                     # r = P + (-Q)
+
+func msub_vartime*(r: var ECP_ShortW_Jac, P: ECP_ShortW_Jac, Q: ECP_ShortW_Aff) {.inline.} =
+  ## **Variable-time** Jacobian mixed subtraction: r = P - Q, with Q in affine coordinates
+  ##
+  ## This MUST NOT be used with secret data.
+  ##
+  ## This is highly VULNERABLE to timing attacks and power analysis attacks.
+  var nQ {.noInit.}: typeof(Q)             # local copy, Q itself is left untouched
+  nQ.neg(Q)                                # nQ = -Q
+  r.madd_vartime(P, nQ)                    # r = P + (-Q)
--- a/constantine/math/elliptic/ec_shortweierstrass_jacobian_extended.nim
+++ b/constantine/math/elliptic/ec_shortweierstrass_jacobian_extended.nim
@ -200,11 +200,15 @@ func sum_vartime*[F; G: static Subgroup](
      r.setInf()
      return

-  var PP{.noInit.}, PPP{.noInit.}, Q{.noInit.}: F
+  var PPP{.noInit.}, Q{.noInit.}: F

-  PP.square(P)
-  PPP.prod(PP, P)
-  Q.prod(U, PP)
+  PPP.square(P)
+
+  Q.prod(U, PPP)
+  r.zz.prod(p.zz, q.zz)
+  r.zz *= PPP
+
+  PPP *= P

  r.x.square(R)
  P.double(Q)
@ -216,8 +220,6 @@ func sum_vartime*[F; G: static Subgroup](
  R *= Q
  r.y.diff(R, r.y)

-  r.zz.prod(p.zz, q.zz)
-  r.zz *= PP
  r.zzz.prod(p.zzz, q.zzz)
  r.zzz *= PPP

--- a/constantine/math/elliptic/ec_shortweierstrass_projective.nim
+++ b/constantine/math/elliptic/ec_shortweierstrass_projective.nim
@ -464,3 +464,220 @@ func fromAffine*[F, G](
  proj.x.csetZero(inf)
  proj.y.csetOne(inf)
  proj.z.csetZero(inf)
+
+# Variable-time
+# -------------
+
+# In some primitives like FFTs, the extra work done for constant-time
+# is amplified by O(n log n) which may result in extra tens of minutes
+# to hours of computations. Those primitives do not need constant-timeness.
+
+func sum_vartime*[F; G: static Subgroup](
+       r: var ECP_ShortW_Prj[F, G],
+       p, q: ECP_ShortW_Prj[F, G])
+       {.tags:[VarTime], meter.} =
+  ## **Variable-time** homogeneous projective addition: r = p + q
+  ##
+  ## This MUST NOT be used with secret data.
+  ##
+  ## This is highly VULNERABLE to timing attacks and power analysis attacks.
+
+  if p.isInf().bool:     # ∞ + q = q
+    r = q
+    return
+  if q.isInf().bool:     # p + ∞ = p
+    r = p
+    return
+
+  # Accelerate mixed additions: an operand with Z = 1 skips the products by its Z
+  let isPz1 = p.z.isOne().bool
+  let isQz1 = q.z.isOne().bool
+
+  # Addition, Cohen et al, 1998
+  # General case:            12M + 2S + 6add + 1*2
+  # https://hyperelliptic.org/EFD/g1p/auto-shortw-projective.html#addition-add-1998-cmo-2
+  #
+  # Y₁Z₂ = Y₁*Z₂
+  # X₁Z₂ = X₁*Z₂
+  # Z₁Z₂ = Z₁*Z₂
+  # u = Y₂*Z₁-Y₁Z₂
+  # uu = u²
+  # v = X₂*Z₁-X₁Z₂
+  # vv = v²
+  # vvv = v*vv
+  # R = vv*X₁Z₂
+  # A = uu*Z₁Z₂-vvv-2*R
+  # X₃ = v*A
+  # Y₃ = u*(R-A)-vvv*Y₁Z₂
+  # Z₃ = vvv*Z₁Z₂
+
+  var Y1Z2 {.noInit.}, R {.noInit.}: F  # R first holds X₁Z₂, later vv*X₁Z₂
+  var U {.noInit.}, V {.noInit.}: F     # U holds Y₂*Z₁, then uu, then A
+
+  if isQz1:
+    R = p.x              # X₁Z₂ = X₁ as Z₂ = 1
+    Y1Z2 = p.y           # Y₁Z₂ = Y₁ as Z₂ = 1
+  else:
+    R.prod(p.x, q.z)     # X₁Z₂
+    Y1Z2.prod(p.y, q.z)  # Y₁Z₂
+  if isPz1:
+    U = q.y              # Y₂*Z₁ = Y₂ as Z₁ = 1
+    V = q.x              # X₂*Z₁ = X₂ as Z₁ = 1
+  else:
+    U.prod(q.y, p.z)     # Y₂*Z₁
+    V.prod(q.x, p.z)     # X₂*Z₁
+  V -= R                 # v = X₂*Z₁-X₁Z₂
+
+  if V.isZero().bool:    # Same x coordinate
+    if bool(U == Y1Z2):  # case P = Q
+      r.double(p)
+      return
+    else:
+      r.setInf()         # case P = -Q
+      return
+
+  var VVV{.noInit.}: F
+
+  VVV.square(V, skipFinalSub = true)  # vv = v², held in VVV for now
+  R *= VVV               # R = vv*X₁Z₂
+  VVV *= V               # vvv = v*vv
+
+  r.y.diff(U, Y1Z2)      # u = Y₂*Z₁-Y₁Z₂
+  U.square(r.y)          # uu = u²
+
+  # A and Z₃ depend on Z₁Z₂
+  template A:untyped = U  # A reuses the storage of U, which holds uu at this point
+  if isQz1:
+    if isPz1:
+      r.z = VVV           # Z₁Z₂ = 1: A = uu already, Z₃ = vvv
+    else:
+      A.prod(U, p.z)      # uu*Z₁
+      r.z.prod(VVV, p.z)  # Z₃ = vvv*Z₁
+  else:
+    if isPz1:
+      A.prod(U, q.z)      # uu*Z₂
+      r.z.prod(VVV, q.z)  # Z₃ = vvv*Z₂
+    else:
+      r.z.prod(p.z, q.z, skipFinalSub = true)  # Z₁Z₂
+      A.prod(U, r.z)      # uu*Z₁Z₂
+      r.z *= VVV          # Z₃ = vvv*Z₁Z₂
+
+  A -= VVV
+  A -= R
+  A -= R                  # A = uu*Z₁Z₂-vvv-2*R
+
+  r.x.prod(V, A)          # X₃ = v*A
+
+  R -= A                  # R-A
+  Y1Z2 *= VVV             # vvv*Y₁Z₂
+  r.y *= R                # u*(R-A), r.y holds u since the diff above
+  r.y -= Y1Z2             # Y₃ = u*(R-A)-vvv*Y₁Z₂
+
+func madd_vartime*[F; G: static Subgroup](
+       r: var ECP_ShortW_Prj[F, G],
+       p: ECP_ShortW_Prj[F, G],
+       q: ECP_ShortW_Aff[F, G])
+       {.tags:[VarTime], meter.} =
+  ## **Variable-time** homogeneous projective mixed addition: r = p + q, with q in affine coordinates
+  ##
+  ## This MUST NOT be used with secret data.
+  ##
+  ## This is highly VULNERABLE to timing attacks and power analysis attacks.
+
+  if p.isInf().bool:     # ∞ + q = q, converted to projective coordinates
+    r.fromAffine(q)
+    return
+  if q.isInf().bool:     # p + ∞ = p
+    r = p
+    return
+
+  # Accelerate affine + affine additions (Z₁ = 1, on top of the implicit Z₂ = 1 of q)
+  let isPz1 = p.z.isOne().bool
+
+  # Addition, Cohen et al, 1998
+  # General case:            12M + 2S + 6add + 1*2 (Z₂ = 1 here, the products by Z₂ are skipped)
+  # https://hyperelliptic.org/EFD/g1p/auto-shortw-projective.html#addition-add-1998-cmo-2
+  #
+  # Y₁Z₂ = Y₁*Z₂
+  # X₁Z₂ = X₁*Z₂
+  # Z₁Z₂ = Z₁*Z₂
+  # u = Y₂*Z₁-Y₁Z₂
+  # uu = u²
+  # v = X₂*Z₁-X₁Z₂
+  # vv = v²
+  # vvv = v*vv
+  # R = vv*X₁Z₂
+  # A = uu*Z₁Z₂-vvv-2*R
+  # X₃ = v*A
+  # Y₃ = u*(R-A)-vvv*Y₁Z₂
+  # Z₃ = vvv*Z₁Z₂
+
+  var Y1Z2 {.noInit.}, R {.noInit.}: F  # R first holds X₁Z₂, later vv*X₁Z₂
+  var U {.noInit.}, V {.noInit.}: F     # U holds Y₂*Z₁, then uu, then A
+
+  R = p.x                # X₁Z₂ = X₁ as Z₂ = 1
+  Y1Z2 = p.y             # Y₁Z₂ = Y₁ as Z₂ = 1
+
+  if isPz1:
+    U = q.y              # Y₂*Z₁ = Y₂ as Z₁ = 1
+    V = q.x              # X₂*Z₁ = X₂ as Z₁ = 1
+  else:
+    U.prod(q.y, p.z)     # Y₂*Z₁
+    V.prod(q.x, p.z)     # X₂*Z₁
+  V -= R                 # v = X₂*Z₁-X₁Z₂
+
+  if V.isZero().bool:    # Same x coordinate
+    if bool(U == Y1Z2):  # case P = Q
+      r.double(p)
+      return
+    else:
+      r.setInf()         # case P = -Q
+      return
+
+  var VVV{.noInit.}: F
+
+  VVV.square(V, skipFinalSub = true)  # vv = v², held in VVV for now
+  R *= VVV               # R = vv*X₁Z₂
+  VVV *= V               # vvv = v*vv
+
+  r.y.diff(U, Y1Z2)      # u = Y₂*Z₁-Y₁Z₂
+  U.square(r.y)          # uu = u²
+
+  # A and Z₃ depend on Z₁Z₂
+  template A:untyped = U  # A reuses the storage of U, which holds uu at this point
+  if isPz1:
+    r.z = VVV             # Z₁Z₂ = 1: A = uu already, Z₃ = vvv
+  else:
+    A.prod(U, p.z)        # uu*Z₁
+    r.z.prod(VVV, p.z)    # Z₃ = vvv*Z₁
+
+  A -= VVV
+  A -= R
+  A -= R                  # A = uu*Z₁Z₂-vvv-2*R
+
+  r.x.prod(V, A)          # X₃ = v*A
+
+  R -= A                  # R-A
+  Y1Z2 *= VVV             # vvv*Y₁Z₂
+  r.y *= R                # u*(R-A), r.y holds u since the diff above
+  r.y -= Y1Z2             # Y₃ = u*(R-A)-vvv*Y₁Z₂
+
+func diff_vartime*(r: var ECP_ShortW_Prj, P, Q: ECP_ShortW_Prj) {.inline.} =
+  ## **Variable-time** homogeneous projective subtraction: r = P - Q
+  ##
+  ## This MUST NOT be used with secret data.
+  ##
+  ## This is highly VULNERABLE to timing attacks and power analysis attacks.
+  var nQ {.noInit.}: typeof(Q)             # local copy, Q itself is left untouched
+  nQ.neg(Q)                                # nQ = -Q
+  r.sum_vartime(P, nQ)                     # r = P + (-Q)
+
+func msub_vartime*(r: var ECP_ShortW_Prj, P: ECP_ShortW_Prj, Q: ECP_ShortW_Aff) {.inline.} =
+  ## **Variable-time** homogeneous projective mixed subtraction: r = P - Q, with Q in affine coordinates
+  ##
+  ## This MUST NOT be used with secret data.
+  ##
+  ## This is highly VULNERABLE to timing attacks and power analysis attacks.
+  var nQ {.noInit.}: typeof(Q)             # local copy, Q itself is left untouched
+  nQ.neg(Q)                                # nQ = -Q
+  r.madd_vartime(P, nQ)                    # r = P + (-Q)
--- a/constantine/math/polynomials/fft.nim
+++ b/constantine/math/polynomials/fft.nim
@ -64,18 +64,18 @@ func simpleFT[EC; bits: static int](
  # FFT is a recursive algorithm
  # This is the base-case using a O(n²) algorithm

-  # TODO: endomorphism acceleration for windowed-NAF
-
  let L = output.len
  var last {.noInit.}, v {.noInit.}: EC

+  var v0w0 {.noInit.} = vals[0]
+  v0w0.scalarMul_vartime(rootsOfUnity[0])
+
  for i in 0 ..< L:
-    last = vals[0]
-    last.scalarMul_minHammingWeight_windowed_vartime(rootsOfUnity[0], window = 5)
+    last = v0w0
    for j in 1 ..< L:
      v = vals[j]
-      v.scalarMul_minHammingWeight_windowed_vartime(rootsOfUnity[(i*j) mod L], window = 5)
-      last += v
+      v.scalarMul_vartime(rootsOfUnity[(i*j) mod L])
+      last.sum_vartime(last, v)
    output[i] = last

 func fft_internal[EC; bits: static int](
@ -100,11 +100,11 @@ func fft_internal[EC; bits: static int](
  for i in 0 ..< half:
    # FFT Butterfly
    y_times_root = output[i+half]
-    y_times_root   .scalarMul_minHammingWeight_windowed_vartime(rootsOfUnity[i], window = 5)
-    output[i+half] .diff(output[i], y_times_root)
-    output[i]      += y_times_root
+    y_times_root   .scalarMul_vartime(rootsOfUnity[i])
+    output[i+half] .diff_vartime(output[i], y_times_root)
+    output[i]      .sum_vartime(output[i], y_times_root)

-func fft*[EC](
+func fft_vartime*[EC](
       desc: ECFFT_Descriptor[EC],
       output: var openarray[EC],
       vals: openarray[EC]): FFT_Status =
@ -121,7 +121,7 @@ func fft*[EC](
  fft_internal(voutput, vals.toStridedView(), rootz)
  return FFTS_Success

-func ifft*[EC](
+func ifft_vartime*[EC](
       desc: ECFFT_Descriptor[EC],
       output: var openarray[EC],
       vals: openarray[EC]): FFT_Status =
@ -144,7 +144,7 @@ func ifft*[EC](
  invLen.invmod_vartime(invLen, EC.F.C.getCurveOrder())

  for i in 0 ..< output.len:
-    output[i].scalarMul_minHammingWeight_windowed_vartime(invLen, window = 5)
+    output[i].scalarMul_vartime(invLen)

  return FFTS_Success

@ -360,12 +360,12 @@ when isMainModule:
      data[i].madd(data[i-1], BLS12_381.getGenerator("G1"))

    var coefs = newSeq[EC_G1](data.len)
-    let fftOk = fft(fftDesc, coefs, data)
+    let fftOk = fft_vartime(fftDesc, coefs, data)
    doAssert fftOk == FFTS_Success
    # display("coefs", 0, coefs)

    var res = newSeq[EC_G1](data.len)
-    let ifftOk = ifft(fftDesc, res, coefs)
+    let ifftOk = ifft_vartime(fftDesc, res, coefs)
    doAssert ifftOk == FFTS_Success
    # display("res", 0, res)

@ -415,7 +415,7 @@ when isMainModule:
      # Bench
      let start = getMonotime()
      for i in 0 ..< NumIters:
-        let status = fftDesc.fft(coefsOut, data)
+        let status = fftDesc.fft_vartime(coefsOut, data)
        doAssert status == FFTS_Success
      let stop = getMonotime()

--- a/constantine/signatures/bls_signatures.nim
+++ b/constantine/signatures/bls_signatures.nim
@ -419,8 +419,8 @@ func update*[Pubkey, Sig: ECP_ShortW_Aff](

    var randFactor{.noInit.}: BigInt[64]
    randFactor.unmarshal(ctx.secureBlinding.toOpenArray(0, 7), bigEndian)
-    pkG1_jac.scalarMul_minHammingWeight_windowed_vartime(randFactor, window = 3)
-    sigG2_jac.scalarMul_minHammingWeight_windowed_vartime(randFactor, window = 3)
+    pkG1_jac.scalarMul_vartime(randFactor)
+    sigG2_jac.scalarMul_vartime(randFactor)

    if ctx.aggSigOnce == false:
      ctx.aggSig = sigG2_jac
@ -455,8 +455,8 @@ func update*[Pubkey, Sig: ECP_ShortW_Aff](

    var randFactor{.noInit.}: BigInt[64]
    randFactor.unmarshal(ctx.secureBlinding.toOpenArray(0, 7), bigEndian)
-    hmsgG1_jac.scalarMul_minHammingWeight_windowed_vartime(randFactor, window = 3)
-    sigG1_jac.scalarMul_minHammingWeight_windowed_vartime(randFactor, window = 3)
+    hmsgG1_jac.scalarMul_vartime(randFactor)
+    sigG1_jac.scalarMul_vartime(randFactor)

    if ctx.aggSigOnce == false:
      ctx.aggSig = sigG1_jac
--- a/research/kzg/fft_g1.nim
+++ b/research/kzg/fft_g1.nim
@ -11,6 +11,7 @@ import
  ../../constantine/math/config/curves,
  ../../constantine/math/arithmetic,
  ../../constantine/math/ec_shortweierstrass,
+  ../../constantine/math/elliptic/ec_scalar_mul_vartime,
  ../../constantine/math/io/[io_fields, io_ec, io_bigints],
  # Research
  ./strided_views,
@ -95,28 +96,29 @@ func expandRootOfUnity[F](rootOfUnity: F): auto {.noInit.} =
 func simpleFT[EC; bits: static int](
       output: var View[EC],
       vals: View[EC],
-       rootsOfUnity: View[BigInt[bits]]
-     ) =
+       rootsOfUnity: View[BigInt[bits]]) =
  # FFT is a recursive algorithm
  # This is the base-case using a O(n²) algorithm

  let L = output.len
  var last {.noInit.}, v {.noInit.}: EC

+  var v0w0 {.noInit.} = vals[0]
+  v0w0.scalarMul_vartime(rootsOfUnity[0])
+
  for i in 0 ..< L:
-    last = vals[0]
-    last.scalarMul(rootsOfUnity[0])
+    last = v0w0
    for j in 1 ..< L:
      v = vals[j]
-      v.scalarMul(rootsOfUnity[(i*j) mod L])
-      last += v
+
+      v.scalarMul_vartime(rootsOfUnity[(i*j) mod L])
+      last.sum_vartime(last, v)
    output[i] = last

 func fft_internal[EC; bits: static int](
       output: var View[EC],
       vals: View[EC],
-       rootsOfUnity: View[BigInt[bits]]
-     ) =
+       rootsOfUnity: View[BigInt[bits]]) =
  if output.len <= 4:
    simpleFT(output, vals, rootsOfUnity)
    return
@ -135,11 +137,11 @@ func fft_internal[EC; bits: static int](
  for i in 0 ..< half:
    # FFT Butterfly
    y_times_root = output[i+half]
-    y_times_root   .scalarMul(rootsOfUnity[i])
-    output[i+half] .diff(output[i], y_times_root)
-    output[i]      += y_times_root
+    y_times_root   .scalarMul_vartime(rootsOfUnity[i])
+    output[i+half] .diff_vartime(output[i], y_times_root)
+    output[i]      .sum_vartime(output[i], y_times_root)

-func fft*[EC](
+func fft_vartime*[EC](
       desc: FFTDescriptor[EC],
       output: var openarray[EC],
       vals: openarray[EC]): FFT_Status =
@ -156,7 +158,7 @@ func fft*[EC](
  fft_internal(voutput, vals.toView(), rootz)
  return FFTS_Success

-func ifft*[EC](
+func ifft_vartime*[EC](
       desc: FFTDescriptor[EC],
       output: var openarray[EC],
       vals: openarray[EC]): FFT_Status =
@ -179,8 +181,8 @@ func ifft*[EC](
  invLen.inv_vartime()
  let inv = invLen.toBig()

-  for i in 0..< output.len:
-    output[i].scalarMul(inv)
+  for i in 0 ..< output.len:
+    output[i].scalarMul_vartime(inv)

  return FFTS_Success

@ -221,12 +223,12 @@ when isMainModule:
      data[i].madd(data[i-1], Generator1)

    var coefs = newSeq[EC_G1](data.len)
-    let fftOk = fft(fftDesc, coefs, data)
+    let fftOk = fft_vartime(fftDesc, coefs, data)
    doAssert fftOk == FFTS_Success
    # display("coefs", 0, coefs)

    var res = newSeq[EC_G1](data.len)
-    let ifftOk = ifft(fftDesc, res, coefs)
+    let ifftOk = ifft_vartime(fftDesc, res, coefs)
    doAssert ifftOk == FFTS_Success
    # display("res", 0, res)

@ -262,7 +264,7 @@ when isMainModule:

    warmup()

-    for scale in 4 ..< 10:
+    for scale in 4 ..< 16:
      # Setup

      let desc = FFTDescriptor[EC_G1].init(uint8 scale)
@ -276,7 +278,7 @@ when isMainModule:
      # Bench
      let start = getMonotime()
      for i in 0 ..< NumIters:
-        let status = desc.fft(coefsOut, data)
+        let status = desc.fft_vartime(coefsOut, data)
        doAssert status == FFTS_Success
      let stop = getMonotime()

--- a/tests/math_elliptic_curves/t_ec_sage_template.nim
+++ b/tests/math_elliptic_curves/t_ec_sage_template.nim
@ -172,7 +172,7 @@ proc run_scalar_mul_test_vs_sage*(
        doAssert: bool(Q == impl)
        doAssert: bool(Q == refMinWeight)

-        staticFor w, 2, 14:
+        staticFor w, 2, 5:
          var refWNAF = P
          refWNAF.scalarMul_minHammingWeight_windowed_vartime(vec.vectors[i].scalar, window = w)
          check: bool(impl == refWNAF)
@ -186,3 +186,8 @@ proc run_scalar_mul_test_vs_sage*(
            var endoW = P
            endoW.scalarMulGLV_m2w2(vec.vectors[i].scalar)
            doAssert: bool(Q == endoW)
+
+          staticFor w, 2, 5:
+            var endoWNAF = P
+            endoWNAF.scalarMulEndo_minHammingWeight_windowed_vartime(vec.vectors[i].scalar, window = w)
+            check: bool(impl == endoWNAF)
--- a/tests/math_elliptic_curves/t_ec_shortw_jac_g1_add_double.nim
+++ b/tests/math_elliptic_curves/t_ec_shortw_jac_g1_add_double.nim
@ -51,3 +51,39 @@ run_EC_addition_tests(
    Iters = Iters,
    moduleName = "test_ec_shortweierstrass_jacobian_g1_add_double_" & $Vesta
  )
+
+run_EC_addition_vartime_tests(
+    ec = ECP_ShortW_Jac[Fp[BN254_Snarks], G1],
+    Iters = Iters,
+    moduleName = "test_ec_shortweierstrass_jacobian_g1_add_double_vartime_" & $BN254_Snarks
+  )
+
+run_EC_addition_vartime_tests(
+    ec = ECP_ShortW_Jac[Fp[BLS12_381], G1],
+    Iters = Iters,
+    moduleName = "test_ec_shortweierstrass_jacobian_g1_add_double_vartime_" & $BLS12_381
+  )
+
+run_EC_addition_vartime_tests(
+    ec = ECP_ShortW_Jac[Fp[BLS12_377], G1],
+    Iters = Iters,
+    moduleName = "test_ec_shortweierstrass_jacobian_g1_add_double_vartime_" & $BLS12_377
+  )
+
+run_EC_addition_vartime_tests(
+    ec = ECP_ShortW_Jac[Fp[BW6_761], G1],
+    Iters = Iters,
+    moduleName = "test_ec_shortweierstrass_jacobian_g1_add_double_vartime_" & $BW6_761
+  )
+
+run_EC_addition_vartime_tests(
+    ec = ECP_ShortW_Jac[Fp[Pallas], G1],
+    Iters = Iters,
+    moduleName = "test_ec_shortweierstrass_jacobian_g1_add_double_vartime_" & $Pallas
+  )
+
+run_EC_addition_vartime_tests(
+    ec = ECP_ShortW_Jac[Fp[Vesta], G1],
+    Iters = Iters,
+    moduleName = "test_ec_shortweierstrass_jacobian_g1_add_double_vartime_" & $Vesta
+  )
--- a/tests/math_elliptic_curves/t_ec_shortw_prj_g1_add_double.nim
+++ b/tests/math_elliptic_curves/t_ec_shortw_prj_g1_add_double.nim
@ -14,7 +14,7 @@ import
  ./t_ec_template

 const
-  Iters = 8
+  Iters = 6

 run_EC_addition_tests(
    ec = ECP_ShortW_Prj[Fp[BN254_Snarks], G1],
@ -51,3 +51,39 @@ run_EC_addition_tests(
    Iters = Iters,
    moduleName = "test_ec_shortweierstrass_projective_g1_add_double_" & $Vesta
  )
+
+run_EC_addition_vartime_tests(
+    ec = ECP_ShortW_Prj[Fp[BN254_Snarks], G1],
+    Iters = Iters,
+    moduleName = "test_ec_shortweierstrass_projective_g1_add_double_vartime_" & $BN254_Snarks
+  )
+
+run_EC_addition_vartime_tests(
+    ec = ECP_ShortW_Prj[Fp[BLS12_381], G1],
+    Iters = Iters,
+    moduleName = "test_ec_shortweierstrass_projective_g1_add_double_vartime_" & $BLS12_381
+  )
+
+run_EC_addition_vartime_tests(
+    ec = ECP_ShortW_Prj[Fp[BLS12_377], G1],
+    Iters = Iters,
+    moduleName = "test_ec_shortweierstrass_projective_g1_add_double_vartime_" & $BLS12_377
+  )
+
+run_EC_addition_vartime_tests(
+    ec = ECP_ShortW_Prj[Fp[BW6_761], G1],
+    Iters = Iters,
+    moduleName = "test_ec_shortweierstrass_projective_g1_add_double_vartime_" & $BW6_761
+  )
+
+run_EC_addition_vartime_tests(
+    ec = ECP_ShortW_Prj[Fp[Pallas], G1],
+    Iters = Iters,
+    moduleName = "test_ec_shortweierstrass_projective_g1_add_double_vartime_" & $Pallas
+  )
+
+run_EC_addition_vartime_tests(
+    ec = ECP_ShortW_Prj[Fp[Vesta], G1],
+    Iters = Iters,
+    moduleName = "test_ec_shortweierstrass_projective_g1_add_double_vartime_" & $Vesta
+  )
--- a/tests/math_elliptic_curves/t_ec_template.nim
+++ b/tests/math_elliptic_curves/t_ec_template.nim
@ -92,7 +92,7 @@ proc run_EC_addition_tests*(
  echo "\n------------------------------------------------------\n"
  echo moduleName, " xoshiro512** seed: ", seed

-  const testSuiteDesc = "Elliptic curve in " & $ec.F.C.getEquationForm() & " form with projective coordinates"
+  const testSuiteDesc = "Elliptic curve in " & $ec.F.C.getEquationForm() & " form"

  suite testSuiteDesc & " - " & $ec & " - [" & $WordBitWidth & "-bit mode]":
    test "The infinity point is the neutral element w.r.t. to EC " & $ec.G & " addition":
@ -268,6 +268,193 @@ proc run_EC_addition_tests*(
      test(ec, randZ = false, gen = Long01Sequence)
      test(ec, randZ = true, gen = Long01Sequence)

+
+proc run_EC_addition_vartime_tests*(
+       ec: typedesc,
+       Iters: static int,
+       moduleName: string) =  # Property tests of sum_vartime: neutral element, opposites, commutativity, associativity, consistency with double
+  var rng: RngState
+  let seed = uint32(getTime().toUnix() and (1'i64 shl 32 - 1)) # unixTime mod 2^32, printed below with the module name
+  rng.seed(seed)
+  echo "\n------------------------------------------------------\n"
+  echo moduleName, " xoshiro512** seed: ", seed
+
+  const testSuiteDesc = "Elliptic curve in " & $ec.F.C.getEquationForm() & " form"
+
+  suite testSuiteDesc & " - " & $ec & " (vartime) - [" & $WordBitWidth & "-bit mode]":
+    test "The infinity point is the neutral element w.r.t. to EC " & $ec.G & " addition (vartime)":
+      proc test(EC: typedesc, randZ: bool, gen: RandomGen) =
+        var inf {.noInit.}: EC
+        inf.setInf()
+        check: bool inf.isInf()
+
+        for _ in 0 ..< Iters:
+          var r{.noInit.}: EC
+          let P = rng.random_point(EC, randZ, gen)
+
+          r.sum_vartime(P, inf)  # infinity as second operand
+          check: bool(r == P)
+
+          r.sum_vartime(inf, P)  # infinity as first operand
+          check: bool(r == P)
+
+          # Aliasing tests: the output is also the first input
+          r = P
+          r.sum_vartime(r, inf)
+          check: bool(r == P)
+
+          r.setInf()
+          r.sum_vartime(r, P)
+          check: bool(r == P)
+
+      test(ec, randZ = false, gen = Uniform)
+      test(ec, randZ = true, gen = Uniform)
+      test(ec, randZ = false, gen = HighHammingWeight)
+      test(ec, randZ = true, gen = HighHammingWeight)
+      test(ec, randZ = false, gen = Long01Sequence)
+      test(ec, randZ = true, gen = Long01Sequence)
+
+    test "Infinity point from affine conversion gives proper result (vartime)":
+      proc test(EC: typedesc, randZ: bool, gen: RandomGen) =
+        var affInf {.noInit.}: affine(EC)
+        var inf {.noInit.}: EC
+        affInf.setInf()
+        inf.fromAffine(affInf)  # infinity built through the affine -> EC conversion
+        check: bool inf.isInf()
+
+        for _ in 0 ..< Iters:
+          var r{.noInit.}: EC
+          let P = rng.random_point(EC, randZ, gen)
+
+          r.sum_vartime(P, inf)  # infinity as second operand
+          check: bool(r == P)
+
+          r.sum_vartime(inf, P)  # infinity as first operand
+          check: bool(r == P)
+
+          # Aliasing tests: the output is also the first input
+          r = P
+          r.sum_vartime(r, inf)
+          check: bool(r == P)
+
+          r.setInf()
+          r.sum_vartime(r, P)
+          check: bool(r == P)
+
+      test(ec, randZ = false, gen = Uniform)
+      test(ec, randZ = true, gen = Uniform)
+      test(ec, randZ = false, gen = HighHammingWeight)
+      test(ec, randZ = true, gen = HighHammingWeight)
+      test(ec, randZ = false, gen = Long01Sequence)
+      test(ec, randZ = true, gen = Long01Sequence)
+
+    test "Adding opposites gives an infinity point (vartime)":
+      proc test(EC: typedesc, randZ: bool, gen: RandomGen) =
+        for _ in 0 ..< Iters:
+          var r{.noInit.}: EC
+          let P = rng.random_point(EC, randZ, gen)
+          var Q = P
+          Q.neg()  # Q = -P
+
+          r.sum_vartime(P, Q)
+          check: bool r.isInf()
+
+          r.sum_vartime(Q, P)
+          check: bool r.isInf()
+
+      test(ec, randZ = false, gen = Uniform)
+      test(ec, randZ = true, gen = Uniform)
+      test(ec, randZ = false, gen = HighHammingWeight)
+      test(ec, randZ = true, gen = HighHammingWeight)
+      test(ec, randZ = false, gen = Long01Sequence)
+      test(ec, randZ = true, gen = Long01Sequence)
+
+    test "EC " & $ec.G & " add is commutative (vartime)":
+      proc test(EC: typedesc, randZ: bool, gen: RandomGen) =
+        for _ in 0 ..< Iters:
+          var r0{.noInit.}, r1{.noInit.}: EC
+          let P = rng.random_point(EC, randZ, gen)
+          let Q = rng.random_point(EC, randZ, gen)
+
+          r0.sum_vartime(P, Q)  # P + Q
+          r1.sum_vartime(Q, P)  # Q + P
+          check: bool(r0 == r1)
+
+      test(ec, randZ = false, gen = Uniform)
+      test(ec, randZ = true, gen = Uniform)
+      test(ec, randZ = false, gen = HighHammingWeight)
+      test(ec, randZ = true, gen = HighHammingWeight)
+      test(ec, randZ = false, gen = Long01Sequence)
+      test(ec, randZ = true, gen = Long01Sequence)
+
+    test "EC " & $ec.G & " add is associative (vartime)":
+      proc test(EC: typedesc, randZ: bool, gen: RandomGen) =
+        for _ in 0 ..< Iters:
+          let a = rng.random_point(EC, randZ, gen)
+          let b = rng.random_point(EC, randZ, gen)
+          let c = rng.random_point(EC, randZ, gen)
+
+          var tmp1{.noInit.}, tmp2{.noInit.}: EC
+
+          # r0 = (a + b) + c
+          tmp1.sum_vartime(a, b)
+          tmp2.sum_vartime(tmp1, c)
+          let r0 = tmp2
+
+          # r1 = a + (b + c)
+          tmp1.sum_vartime(b, c)
+          tmp2.sum_vartime(a, tmp1)
+          let r1 = tmp2
+
+          # r2 = (a + c) + b
+          tmp1.sum_vartime(a, c)
+          tmp2.sum_vartime(tmp1, b)
+          let r2 = tmp2
+
+          # r3 = a + (c + b)
+          tmp1.sum_vartime(c, b)
+          tmp2.sum_vartime(a, tmp1)
+          let r3 = tmp2
+
+          # r4 = (c + a) + b
+          tmp1.sum_vartime(c, a)
+          tmp2.sum_vartime(tmp1, b)
+          let r4 = tmp2
+
+          # ... the remaining groupings are not exercised
+
+          check:
+            bool(r0 == r1)
+            bool(r0 == r2)
+            bool(r0 == r3)
+            bool(r0 == r4)
+
+      test(ec, randZ = false, gen = Uniform)
+      test(ec, randZ = true, gen = Uniform)
+      test(ec, randZ = false, gen = HighHammingWeight)
+      test(ec, randZ = true, gen = HighHammingWeight)
+      test(ec, randZ = false, gen = Long01Sequence)
+      test(ec, randZ = true, gen = Long01Sequence)
+
+    test "EC " & $ec.G & " double and EC " & $ec.G & " add are consistent (vartime)":
+      proc test(EC: typedesc, randZ: bool, gen: RandomGen) =
+        for _ in 0 ..< Iters:
+          let a = rng.random_point(EC, randZ, gen)
+
+          var r0{.noInit.}, r1{.noInit.}: EC
+
+          r0.double(a)
+          r1.sum_vartime(a, a)  # both operands equal: exercises the P = Q case of sum_vartime
+
+          check: bool(r0 == r1)
+
+      test(ec, randZ = false, gen = Uniform)
+      test(ec, randZ = true, gen = Uniform)
+      test(ec, randZ = false, gen = HighHammingWeight)
+      test(ec, randZ = true, gen = HighHammingWeight)
+      test(ec, randZ = false, gen = Long01Sequence)
+      test(ec, randZ = true, gen = Long01Sequence)
+
 proc run_EC_mul_sanity_tests*(
       ec: typedesc,
       ItersMul: static int,
@ -479,8 +666,6 @@ proc run_EC_mul_vs_ref_impl*(
          refWNaf(2)
          refWNaf(3)
          refWNaf(5)
-          refWNaf(8)
-          refWNaf(13)

      test(ec, bits = ec.F.C.getCurveOrderBitwidth(), randZ = false, gen = Uniform)
      test(ec, bits = ec.F.C.getCurveOrderBitwidth(), randZ = true, gen = Uniform)
@ -509,14 +694,23 @@ proc run_EC_mixed_add_impl*(
          let a = rng.random_point(EC, randZ, gen)
          let b = rng.random_point(EC, randZ, gen)
          var bAff: ECP_ShortW_Aff[EC.F, EC.G]
+          var bz1: EC
          bAff.affine(b)
+          bz1.fromAffine(bAff) # internals special-case Z=1

-          var r_generic, r_mixed: EC
+          var r_generic, r_mixed, r_vartime, r_vartime2, r_vartime3: EC

          r_generic.sum(a, b)
          r_mixed.madd(a, bAff)
+          r_vartime.sum_vartime(a, bz1)
+          r_vartime2.sum_vartime(a, b)
+          r_vartime3.madd_vartime(a, bAff)

-          check: bool(r_generic == r_mixed)
+          check:
+            bool(r_generic == r_mixed)
+            bool(r_generic == r_vartime)
+            bool(r_generic == r_vartime2)
+            bool(r_generic == r_vartime3)

      test(ec, randZ = false, gen = Uniform)
      test(ec, randZ = true, gen = Uniform)
@ -530,18 +724,37 @@ proc run_EC_mixed_add_impl*(
        for _ in 0 ..< Iters:
          let a = rng.random_point(EC, randZ, gen)
          var aAff: ECP_ShortW_Aff[EC.F, EC.G]
+          var az1: EC
          aAff.affine(a)
+          az1.fromAffine(aAff)

-          var r_generic, r_mixed: EC
+          var r_generic, r_mixed, r_vartime, r_vartime2, r_vartime3: EC

          r_generic.double(a)
          r_mixed.madd(a, aAff)
-          check: bool(r_generic == r_mixed)
+          r_vartime.sum_vartime(a, a)
+          r_vartime2.sum_vartime(a, az1)
+          r_vartime3.madd_vartime(a, aAff)
+          check:
+            bool(r_generic == r_mixed)
+            bool(r_generic == r_vartime)
+            bool(r_generic == r_vartime2)
+            bool(r_generic == r_vartime3)

          # Aliasing test
          r_mixed = a
          r_mixed += aAff
-          check: bool(r_generic == r_mixed)
+          r_vartime = a
+          r_vartime.sum_vartime(r_vartime, a)
+          r_vartime2 = az1
+          r_vartime2.sum_vartime(r_vartime2, az1)
+          r_vartime3 = a
+          r_vartime3.madd_vartime(r_vartime3, aAff)
+          check:
+            bool(r_generic == r_mixed)
+            bool(r_generic == r_vartime)
+            bool(r_generic == r_vartime2)
+            bool(r_generic == r_vartime3)

      test(ec, randZ = false, gen = Uniform)
      test(ec, randZ = true, gen = Uniform)
@ -563,41 +776,41 @@ proc run_EC_mixed_add_impl*(
          var r{.noInit.}: ECP_ShortW_Aff[EC.F, EC.G]
          r.affine(r_mixed)

+          # Aliasing test
          a += bAff

          check:
            bool(r == bAff)
            bool(a == r_mixed)

+          # vartime - internals special-case Z=1
+          # `b` is bAff converted back to EC with fromAffine, so the same
+          # addition can be checked through sum_vartime and madd_vartime.
+          var r_vartime, r_vartime2: EC
+          var b: EC
+          b.fromAffine(bAff)
+
+          a.setInf()
+          r_vartime.sum_vartime(a, b)
+          r_vartime2.madd_vartime(a, bAff)
+
+          check:
+            bool(r_vartime == r_mixed)
+            bool(r_vartime2 == r_mixed)
+
+          # Aliasing: the output variable is also the (infinity) left operand.
+          r_vartime.setInf()
+          r_vartime.sum_vartime(r_vartime, b)
+          r_vartime2.setInf()
+          # Use madd_vartime here: repeating sum_vartime would only duplicate
+          # the r_vartime check and leave aliased mixed addition untested.
+          r_vartime2.madd_vartime(r_vartime2, bAff)
+
+          check:
+            bool(r_vartime == r_mixed)
+            bool(r_vartime2 == r_mixed)
+
      test(ec, randZ = false, gen = Uniform)
      test(ec, randZ = false, gen = HighHammingWeight)
      test(ec, randZ = false, gen = Long01Sequence)

    test "EC " & $ec.G & " mixed addition - adding infinity RHS":
-      proc test(EC: typedesc, randZ: bool, gen: RandomGen) =
-        for _ in 0 ..< Iters:
-          let a = rng.random_point(EC, randZ, gen)
-          var naAff{.noInit.}: ECP_ShortW_Aff[EC.F, EC.G]
-          naAff.affine(a)
-          naAff.neg()
-
-          var r{.noInit.}: EC
-          r.madd(a, naAff)
-
-          check: r.isInf().bool
-
-          r = a
-          r += naAff
-          check: r.isInf().bool
-
-      test(ec, randZ = false, gen = Uniform)
-      test(ec, randZ = true, gen = Uniform)
-      test(ec, randZ = false, gen = HighHammingWeight)
-      test(ec, randZ = true, gen = HighHammingWeight)
-      test(ec, randZ = false, gen = Long01Sequence)
-      test(ec, randZ = true, gen = Long01Sequence)
-
-    test "EC " & $ec.G & " mixed addition - adding opposites":
      proc test(EC: typedesc, randZ: bool, gen: RandomGen) =
        for _ in 0 ..< Iters:
          let a = rng.random_point(EC, randZ, gen)
@ -613,6 +826,75 @@ proc run_EC_mixed_add_impl*(
          r += bAff
          check: bool(r == a)

+          # vartime
+          # `b` is bAff converted to EC with fromAffine; both vartime entry
+          # points are compared against `r` computed just above.
+          var r_vartime, r_vartime2: EC
+          var b: EC
+          b.fromAffine(bAff)
+
+          r_vartime.sum_vartime(a, b)
+          r_vartime2.madd_vartime(a, bAff)
+
+          check:
+            bool(r_vartime == r)
+            bool(r_vartime2 == r)
+
+          # Aliasing: the output variable is also the left operand.
+          r_vartime = a
+          r_vartime.sum_vartime(r_vartime, b)
+          r_vartime2 = a
+          # Use madd_vartime here: repeating sum_vartime would only duplicate
+          # the r_vartime check and leave aliased mixed addition untested.
+          r_vartime2.madd_vartime(r_vartime2, bAff)
+
+          check:
+            bool(r_vartime == r)
+            bool(r_vartime2 == r)
+
+      test(ec, randZ = false, gen = Uniform)
+      test(ec, randZ = true, gen = Uniform)
+      test(ec, randZ = false, gen = HighHammingWeight)
+      test(ec, randZ = true, gen = HighHammingWeight)
+      test(ec, randZ = false, gen = Long01Sequence)
+      test(ec, randZ = true, gen = Long01Sequence)
+
+    test "EC " & $ec.G & " mixed addition - adding opposites":
+      proc test(EC: typedesc, randZ: bool, gen: RandomGen) =
+        ## Adds a random point to its negation through madd, sum_vartime and
+        ## madd_vartime (plain and aliased) and checks the results agree.
+        for _ in 0 ..< Iters:
+          let a = rng.random_point(EC, randZ, gen)
+
+          # Affine form of -a.
+          var negAff{.noInit.}: ECP_ShortW_Aff[EC.F, EC.G]
+          negAff.affine(a)
+          negAff.neg()
+
+          var viaMadd{.noInit.}: EC
+          viaMadd.madd(a, negAff)
+          check: bool(viaMadd.isInf())
+
+          # Aliasing: in-place mixed addition on a copy of `a`.
+          viaMadd = a
+          viaMadd += negAff
+          check: bool(viaMadd.isInf())
+
+          # vartime: negAff converted back to EC with fromAffine.
+          var negEc: EC
+          negEc.fromAffine(negAff)
+
+          var sumVt, maddVt: EC
+          sumVt.sum_vartime(a, negEc)
+          maddVt.madd_vartime(a, negAff)
+
+          check:
+            bool(sumVt == viaMadd)
+            bool(maddVt == viaMadd)
+
+          # Aliasing: the output variable is also the left operand.
+          sumVt = a
+          sumVt.sum_vartime(sumVt, negEc)
+          maddVt = a
+          maddVt.madd_vartime(maddVt, negAff)
+
+          check:
+            bool(sumVt == viaMadd)
+            bool(maddVt == viaMadd)
+
      test(ec, randZ = false, gen = Uniform)
      test(ec, randZ = true, gen = Uniform)
      test(ec, randZ = false, gen = HighHammingWeight)