Batch additions (#207)

* Batch elliptic curve addition * accelerate chained muls * jac mixed add handle doubling. jac additions handle aliasing when adding infinity * properly skip sanitizer on BLS signature test * properly skip sanitizer² on BLS signature test
2026-01-04 06:03:08 +00:00 · 2022-10-29 22:43:40 +02:00 · 2022-10-29 22:43:40 +02:00 · 928f515582
commit 928f515582
parent 93654d580e
15 changed files with 1027 additions and 219 deletions
--- a/benchmarks/bench_ec_g1_batch.nim
+++ b/benchmarks/bench_ec_g1_batch.nim
@ -0,0 +1,68 @@
+# Constantine
+# Copyright (c) 2018-2019    Status Research & Development GmbH
+# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
+# Licensed and distributed under either of
+#   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
+#   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
+# at your option. This file may not be copied, modified, or distributed except according to those terms.
+
+import
+  # Internals
+  ../constantine/math/config/curves,
+  ../constantine/math/arithmetic,
+  ../constantine/math/elliptic/[
+    ec_shortweierstrass_affine,
+    ec_shortweierstrass_projective,
+    ec_shortweierstrass_jacobian],
+  # Helpers
+  ../helpers/static_for,
+  ./bench_elliptic_template,
+  # Standard library
+  std/strutils
+
+# ############################################################
+#
+#               Benchmark of the G1 group of
+#            Short Weierstrass elliptic curves
+#          in (homogeneous) projective coordinates
+#
+# ############################################################
+
+
+const Iters = 10_000
+const AvailableCurves = [
+  # BN254_Snarks,
+  BLS12_381,
+]
+
+proc main() =
+  separator()
+  staticFor i, 0, AvailableCurves.len:
+    const curve = AvailableCurves[i]
+    addBench(ECP_ShortW_Prj[Fp[curve], G1], Iters)
+    addBench(ECP_ShortW_Jac[Fp[curve], G1], Iters)
+    mixedAddBench(ECP_ShortW_Prj[Fp[curve], G1], Iters)
+    mixedAddBench(ECP_ShortW_Jac[Fp[curve], G1], Iters)
+    doublingBench(ECP_ShortW_Prj[Fp[curve], G1], Iters)
+    doublingBench(ECP_ShortW_Jac[Fp[curve], G1], Iters)
+    separator()
+    for numPoints in [10, 100, 1000, 10000, 100000, 1000000]:
+      let batchIters = max(1, Iters div numPoints)
+      multiAddBench(ECP_ShortW_Prj[Fp[curve], G1], numPoints, useBatching = false, batchIters)
+    separator()
+    for numPoints in [10, 100, 1000, 10000, 100000, 1000000]:
+      let batchIters = max(1, Iters div numPoints)
+      multiAddBench(ECP_ShortW_Prj[Fp[curve], G1], numPoints, useBatching = true, batchIters)
+    separator()
+    for numPoints in [10, 100, 1000, 10000, 100000, 1000000]:
+      let batchIters = max(1, Iters div numPoints)
+      multiAddBench(ECP_ShortW_Jac[Fp[curve], G1], numPoints, useBatching = false, batchIters)
+    separator()
+    for numPoints in [10, 100, 1000, 10000, 100000, 1000000]:
+      let batchIters = max(1, Iters div numPoints)
+      multiAddBench(ECP_ShortW_Jac[Fp[curve], G1], numPoints, useBatching = true, batchIters)
+    separator()
+    separator()
+
+main()
+notes()
--- a/benchmarks/bench_elliptic_template.nim
+++ b/benchmarks/bench_elliptic_template.nim
@ -22,6 +22,7 @@ import
    ec_shortweierstrass_affine,
    ec_shortweierstrass_projective,
    ec_shortweierstrass_jacobian,
+    ec_shortweierstrass_batch_ops,
    ec_scalar_mul, ec_endomorphism_accel],
  # Helpers
  ../helpers/[prng_unsafe, static_for],
@ -35,10 +36,10 @@ export abstractions # generic sandwich on SecretBool and SecretBool in Jacobian

 proc separator*() = separator(177)

-macro fixEllipticDisplay(T: typedesc): untyped =
+macro fixEllipticDisplay(EC: typedesc): untyped =
  # At compile-time, enums are integers and their display is buggy
  # we get the Curve ID instead of the curve name.
-  let instantiated = T.getTypeInst()
+  let instantiated = EC.getTypeInst()
  var name = $instantiated[1][0] # EllipticEquationFormCoordinates
  let fieldName = $instantiated[1][1][0]
  let curveName = $Curve(instantiated[1][1][1].intVal)
@ -53,100 +54,108 @@ proc report(op, elliptic: string, start, stop: MonoTime, startClk, stopClk: int6
  else:
    echo &"{op:<60} {elliptic:<40} {throughput:>15.3f} ops/s     {ns:>9} ns/op"

-template bench(op: string, T: typedesc, iters: int, body: untyped): untyped =
+template bench(op: string, EC: typedesc, iters: int, body: untyped): untyped =
  measure(iters, startTime, stopTime, startClk, stopClk, body)
-  report(op, fixEllipticDisplay(T), startTime, stopTime, startClk, stopClk, iters)
+  report(op, fixEllipticDisplay(EC), startTime, stopTime, startClk, stopClk, iters)

-proc addBench*(T: typedesc, iters: int) =
-  const G1_or_G2 = when T.F is Fp: "G1" else: "G2"
-  var r {.noInit.}: T
-  let P = rng.random_unsafe(T)
-  let Q = rng.random_unsafe(T)
-  bench("EC Add " & G1_or_G2, T, iters):
+proc addBench*(EC: typedesc, iters: int) =
+  var r {.noInit.}: EC
+  let P = rng.random_unsafe(EC)
+  let Q = rng.random_unsafe(EC)
+  bench("EC Add " & $EC.G, EC, iters):
    r.sum(P, Q)

-proc mixedAddBench*(T: typedesc, iters: int) =
-  const G1_or_G2 = when T.F is Fp: "G1" else: "G2"
-  var r {.noInit.}: T
-  let P = rng.random_unsafe(T)
-  let Q = rng.random_unsafe(T)
-  var Qaff: ECP_ShortW_Aff[T.F, T.G]
+proc mixedAddBench*(EC: typedesc, iters: int) =
+  var r {.noInit.}: EC
+  let P = rng.random_unsafe(EC)
+  let Q = rng.random_unsafe(EC)
+  var Qaff: ECP_ShortW_Aff[EC.F, EC.G]
  Qaff.affine(Q)
-  bench("EC Mixed Addition " & G1_or_G2, T, iters):
+  bench("EC Mixed Addition " & $EC.G, EC, iters):
    r.madd(P, Qaff)

-proc doublingBench*(T: typedesc, iters: int) =
-  const G1_or_G2 = when T.F is Fp: "G1" else: "G2"
-  var r {.noInit.}: T
-  let P = rng.random_unsafe(T)
-  bench("EC Double " & G1_or_G2, T, iters):
+proc doublingBench*(EC: typedesc, iters: int) =
+  var r {.noInit.}: EC
+  let P = rng.random_unsafe(EC)
+  bench("EC Double " & $EC.G, EC, iters):
    r.double(P)

-proc affFromProjBench*(T: typedesc, iters: int) =
-  const G1_or_G2 = when T.F is Fp: "G1" else: "G2"
-  var r {.noInit.}: ECP_ShortW_Aff[T.F, T.G]
-  let P = rng.random_unsafe(T)
-  bench("EC Projective to Affine " & G1_or_G2, T, iters):
+proc affFromProjBench*(EC: typedesc, iters: int) =
+  var r {.noInit.}: ECP_ShortW_Aff[EC.F, EC.G]
+  let P = rng.random_unsafe(EC)
+  bench("EC Projective to Affine " & $EC.G, EC, iters):
    r.affine(P)

-proc affFromJacBench*(T: typedesc, iters: int) =
-  const G1_or_G2 = when T.F is Fp: "G1" else: "G2"
-  var r {.noInit.}: ECP_ShortW_Aff[T.F, T.G]
-  let P = rng.random_unsafe(T)
-  bench("EC Jacobian to Affine " & G1_or_G2, T, iters):
+proc affFromJacBench*(EC: typedesc, iters: int) =
+  var r {.noInit.}: ECP_ShortW_Aff[EC.F, EC.G]
+  let P = rng.random_unsafe(EC)
+  bench("EC Jacobian to Affine " & $EC.G, EC, iters):
    r.affine(P)

-proc scalarMulGenericBench*(T: typedesc, window: static int, iters: int) =
-  const bits = T.F.C.getCurveOrderBitwidth()
-  const G1_or_G2 = when T.F is Fp: "G1" else: "G2"
+proc scalarMulGenericBench*(EC: typedesc, window: static int, iters: int) =
+  const bits = EC.F.C.getCurveOrderBitwidth()

-  var r {.noInit.}: T
-  let P = rng.random_unsafe(T) # TODO: clear cofactor
+  var r {.noInit.}: EC
+  let P = rng.random_unsafe(EC) # TODO: clear cofactor

  let exponent = rng.random_unsafe(BigInt[bits])

-  bench("EC ScalarMul " & $bits & "-bit " & G1_or_G2 & " (window-" & $window & ", generic)", T, iters):
+  bench("EC ScalarMul " & $bits & "-bit " & $EC.G & " (window-" & $window & ", generic)", EC, iters):
    r = P
    r.scalarMulGeneric(exponent, window)

-proc scalarMulEndo*(T: typedesc, iters: int) =
-  const bits = T.F.C.getCurveOrderBitwidth()
-  const G1_or_G2 = when T.F is Fp: "G1" else: "G2"
+proc scalarMulEndo*(EC: typedesc, iters: int) =
+  const bits = EC.F.C.getCurveOrderBitwidth()

-  var r {.noInit.}: T
-  let P = rng.random_unsafe(T) # TODO: clear cofactor
+  var r {.noInit.}: EC
+  let P = rng.random_unsafe(EC) # TODO: clear cofactor

  let exponent = rng.random_unsafe(BigInt[bits])

-  bench("EC ScalarMul " & $bits & "-bit " & G1_or_G2 & " (endomorphism accelerated)", T, iters):
+  bench("EC ScalarMul " & $bits & "-bit " & $EC.G & " (endomorphism accelerated)", EC, iters):
    r = P
    r.scalarMulEndo(exponent)

-proc scalarMulEndoWindow*(T: typedesc, iters: int) =
-  const bits = T.F.C.getCurveOrderBitwidth()
-  const G1_or_G2 = when T.F is Fp: "G1" else: "G2"
+proc scalarMulEndoWindow*(EC: typedesc, iters: int) =
+  const bits = EC.F.C.getCurveOrderBitwidth()

-  var r {.noInit.}: T
-  let P = rng.random_unsafe(T) # TODO: clear cofactor
+  var r {.noInit.}: EC
+  let P = rng.random_unsafe(EC) # TODO: clear cofactor

  let exponent = rng.random_unsafe(BigInt[bits])

-  bench("EC ScalarMul " & $bits & "-bit " & G1_or_G2 & " (window-2, endomorphism accelerated)", T, iters):
+  bench("EC ScalarMul " & $bits & "-bit " & $EC.G & " (window-2, endomorphism accelerated)", EC, iters):
    r = P
-    when T.F is Fp:
+    when EC.F is Fp:
      r.scalarMulGLV_m2w2(exponent)
    else:
      {.error: "Not implemented".}

-proc scalarMulUnsafeDoubleAddBench*(T: typedesc, iters: int) =
-  const bits = T.F.C.getCurveOrderBitwidth()
-  const G1_or_G2 = when T.F is Fp: "G1" else: "G2"
+proc scalarMulUnsafeDoubleAddBench*(EC: typedesc, iters: int) =
+  const bits = EC.F.C.getCurveOrderBitwidth()

-  var r {.noInit.}: T
-  let P = rng.random_unsafe(T) # TODO: clear cofactor
+  var r {.noInit.}: EC
+  let P = rng.random_unsafe(EC) # TODO: clear cofactor

  let exponent = rng.random_unsafe(BigInt[bits])

-  bench("EC ScalarMul " & $bits & "-bit " & G1_or_G2 & " (unsafe reference DoubleAdd)", T, iters):
+  bench("EC ScalarMul " & $bits & "-bit " & $EC.G & " (unsafe reference DoubleAdd)", EC, iters):
    r = P
    r.unsafe_ECmul_double_add(exponent)
+
+proc multiAddBench*(EC: typedesc, numPoints: int, useBatching: bool, iters: int) =  
+  var points = newSeq[ECP_ShortW_Aff[EC.F, EC.G]](numPoints)
+
+  for i in 0 ..< numPoints:
+    points[i] = rng.random_unsafe(ECP_ShortW_Aff[EC.F, EC.G])
+
+  var r{.noInit.}: EC
+
+  if useBatching:
+    bench("EC Multi-Addition batched             " & $EC.G & " (" & $numPoints & " points)", EC, iters):
+      r.sum_batch_vartime(points)
+  else:
+    bench("EC Multi-Addition unbatched mixed add " & $EC.G & " (" & $numPoints & " points)", EC, iters):
+      r.setInf()
+      for i in 0 ..< numPoints:
+        r += points[i]
--- a/constantine.nimble
+++ b/constantine.nimble
@ -96,13 +96,13 @@ const testDesc: seq[tuple[path: string, useGMP: bool]] = @[

  # Elliptic curve arithmetic G1
  # ----------------------------------------------------------
-  # ("tests/math/t_ec_shortw_prj_g1_add_double.nim", false),
+  ("tests/math/t_ec_shortw_prj_g1_add_double.nim", false),
  # ("tests/math/t_ec_shortw_prj_g1_mul_sanity.nim", false),
  # ("tests/math/t_ec_shortw_prj_g1_mul_distri.nim", false),
  ("tests/math/t_ec_shortw_prj_g1_mul_vs_ref.nim", false),
  ("tests/math/t_ec_shortw_prj_g1_mixed_add.nim", false),

-  # ("tests/math/t_ec_shortw_jac_g1_add_double.nim", false),
+  ("tests/math/t_ec_shortw_jac_g1_add_double.nim", false),
  # ("tests/math/t_ec_shortw_jac_g1_mul_sanity.nim", false),
  # ("tests/math/t_ec_shortw_jac_g1_mul_distri.nim", false),
  ("tests/math/t_ec_shortw_jac_g1_mul_vs_ref.nim", false),
@ -115,49 +115,49 @@ const testDesc: seq[tuple[path: string, useGMP: bool]] = @[

  # Elliptic curve arithmetic G2
  # ----------------------------------------------------------
-  # ("tests/math/t_ec_shortw_prj_g2_add_double_bn254_snarks.nim", false),
+  ("tests/math/t_ec_shortw_prj_g2_add_double_bn254_snarks.nim", false),
  # ("tests/math/t_ec_shortw_prj_g2_mul_sanity_bn254_snarks.nim", false),
  # ("tests/math/t_ec_shortw_prj_g2_mul_distri_bn254_snarks.nim", false),
  ("tests/math/t_ec_shortw_prj_g2_mul_vs_ref_bn254_snarks.nim", false),
  ("tests/math/t_ec_shortw_prj_g2_mixed_add_bn254_snarks.nim", false),

-  # ("tests/math/t_ec_shortw_prj_g2_add_double_bls12_381.nim", false),
+  ("tests/math/t_ec_shortw_prj_g2_add_double_bls12_381.nim", false),
  # ("tests/math/t_ec_shortw_prj_g2_mul_sanity_bls12_381.nim", false),
  # ("tests/math/t_ec_shortw_prj_g2_mul_distri_bls12_381.nim", false),
  ("tests/math/t_ec_shortw_prj_g2_mul_vs_ref_bls12_381.nim", false),
  ("tests/math/t_ec_shortw_prj_g2_mixed_add_bls12_381.nim", false),

-  # ("tests/math/t_ec_shortw_prj_g2_add_double_bls12_377.nim", false),
+  ("tests/math/t_ec_shortw_prj_g2_add_double_bls12_377.nim", false),
  # ("tests/math/t_ec_shortw_prj_g2_mul_sanity_bls12_377.nim", false),
  # ("tests/math/t_ec_shortw_prj_g2_mul_distri_bls12_377.nim", false),
  ("tests/math/t_ec_shortw_prj_g2_mul_vs_ref_bls12_377.nim", false),
  ("tests/math/t_ec_shortw_prj_g2_mixed_add_bls12_377.nim", false),

-  # ("tests/math/t_ec_shortw_prj_g2_add_double_bw6_761.nim", false),
+  ("tests/math/t_ec_shortw_prj_g2_add_double_bw6_761.nim", false),
  # ("tests/math/t_ec_shortw_prj_g2_mul_sanity_bw6_761.nim", false),
  # ("tests/math/t_ec_shortw_prj_g2_mul_distri_bw6_761.nim", false),
  ("tests/math/t_ec_shortw_prj_g2_mul_vs_ref_bw6_761.nim", false),
  ("tests/math/t_ec_shortw_prj_g2_mixed_add_bw6_761.nim", false),

-  # ("tests/math/t_ec_shortw_jac_g2_add_double_bn254_snarks.nim", false),
+  ("tests/math/t_ec_shortw_jac_g2_add_double_bn254_snarks.nim", false),
  # ("tests/math/t_ec_shortw_jac_g2_mul_sanity_bn254_snarks.nim", false),
  # ("tests/math/t_ec_shortw_jac_g2_mul_distri_bn254_snarks.nim", false),
  ("tests/math/t_ec_shortw_jac_g2_mul_vs_ref_bn254_snarks.nim", false),
  ("tests/math/t_ec_shortw_jac_g2_mixed_add_bn254_snarks.nim", false),

-  # ("tests/math/t_ec_shortw_jac_g2_add_double_bls12_381.nim", false),
+  ("tests/math/t_ec_shortw_jac_g2_add_double_bls12_381.nim", false),
  # ("tests/math/t_ec_shortw_jac_g2_mul_sanity_bls12_381.nim", false),
  # ("tests/math/t_ec_shortw_jac_g2_mul_distri_bls12_381.nim", false),
  ("tests/math/t_ec_shortw_jac_g2_mul_vs_ref_bls12_381.nim", false),
  ("tests/math/t_ec_shortw_jac_g2_mixed_add_bls12_381.nim", false),

-  # ("tests/math/t_ec_shortw_jac_g2_add_double_bls12_377.nim", false),
+  ("tests/math/t_ec_shortw_jac_g2_add_double_bls12_377.nim", false),
  # ("tests/math/t_ec_shortw_jac_g2_mul_sanity_bls12_377.nim", false),
  # ("tests/math/t_ec_shortw_jac_g2_mul_distri_bls12_377.nim", false),
  ("tests/math/t_ec_shortw_jac_g2_mul_vs_ref_bls12_377.nim", false),
  ("tests/math/t_ec_shortw_jac_g2_mixed_add_bls12_377.nim", false),

-  # ("tests/math/t_ec_shortw_jac_g2_add_double_bw6_761.nim", false),
+  ("tests/math/t_ec_shortw_jac_g2_add_double_bw6_761.nim", false),
  # ("tests/math/t_ec_shortw_jac_g2_mul_sanity_bw6_761.nim", false),
  # ("tests/math/t_ec_shortw_jac_g2_mul_distri_bw6_761.nim", false),
  ("tests/math/t_ec_shortw_jac_g2_mul_vs_ref_bw6_761.nim", false),
@ -177,6 +177,11 @@ const testDesc: seq[tuple[path: string, useGMP: bool]] = @[
  # ----------------------------------------------------------
  ("tests/math/t_ec_shortw_prj_edge_cases.nim", false),

+  # Elliptic curve arithmetic - batch computation
+  # ----------------------------------------------------------
+  ("tests/math/t_ec_shortw_prj_g1_batch_add.nim", false),
+  ("tests/math/t_ec_shortw_jac_g1_batch_add.nim", false),
+
  # Subgroups and cofactors
  # ----------------------------------------------------------
  ("tests/math/t_ec_subgroups_bn254_nogami.nim", false),
@ -232,6 +237,7 @@ const benchDesc = [
  "bench_fp6",
  "bench_fp12",
  "bench_ec_g1",
+  "bench_ec_g1_batch",
  "bench_ec_g2",
  "bench_pairing_bls12_377",
  "bench_pairing_bls12_381",
@ -262,13 +268,14 @@ const skipSanitizers = [
  "tests/math/t_ec_sage_bn254_snarks.nim",
  "tests/math/t_ec_sage_bls12_377.nim",
  "tests/math/t_ec_sage_bls12_381.nim",
+  "tests/t_blssig_pop_on_bls12381_g2.nim",
  "tests/t_hash_to_field.nim",
  "tests/t_hash_to_curve.nim",
  "tests/t_hash_to_curve_random.nim",
  "tests/t_mac_poly1305.nim",
  "tests/t_mac_hmac.nim",
  "tests/t_kdf_hkdf.nim",
-  "tests/t_ethereum_eip2333_bls12381_key_derivation"
+  "tests/t_ethereum_eip2333_bls12381_key_derivation.nim"
 ]

 when defined(windows):
@ -302,8 +309,8 @@ template setupCommand(): untyped {.dirty.} =
  var flags = flags
  when not defined(windows):
    # Not available in MinGW https://github.com/libressl-portable/portable/issues/54
-    flags &= " --passC:-fstack-protector-all"
-  let command = "nim " & lang & cc & " " & flags &
+    flags &= " --passC:-fstack-protector-strong"
+  let command = "nim " & lang & cc & " -d:release " & flags &
    " --verbosity:0 --outdir:build/testsuite -r --hints:off --warnings:off " &
    " --nimcache:nimcache/" & path & " " &
    path
@ -672,37 +679,55 @@ task bench_fp12_clang_noasm, "Run benchmark 𝔽p12 with clang - no Assembly":
 # Elliptic curve G1
 # ------------------------------------------

-task bench_ec_g1, "Run benchmark on Elliptic Curve group 𝔾1 - Short Weierstrass with Projective Coordinates - Default compiler":
+task bench_ec_g1, "Run benchmark on Elliptic Curve group 𝔾1 - Default compiler":
  runBench("bench_ec_g1")

-task bench_ec_g1_gcc, "Run benchmark on Elliptic Curve group 𝔾1 - Short Weierstrass with Projective Coordinates - GCC":
+task bench_ec_g1_gcc, "Run benchmark on Elliptic Curve group 𝔾1 - GCC":
  runBench("bench_ec_g1", "gcc")

-task bench_ec_g1_clang, "Run benchmark on Elliptic Curve group 𝔾1 - Short Weierstrass with Projective Coordinates - Clang":
+task bench_ec_g1_clang, "Run benchmark on Elliptic Curve group 𝔾1 - Clang":
  runBench("bench_ec_g1", "clang")

-task bench_ec_g1_gcc_noasm, "Run benchmark on Elliptic Curve group 𝔾1 - Short Weierstrass with Projective Coordinates - GCC no Assembly":
+task bench_ec_g1_gcc_noasm, "Run benchmark on Elliptic Curve group 𝔾1 - GCC no Assembly":
  runBench("bench_ec_g1", "gcc", useAsm = false)

-task bench_ec_g1_clang_noasm, "Run benchmark on Elliptic Curve group 𝔾1 - Short Weierstrass with Projective Coordinates - Clang no Assembly":
+task bench_ec_g1_clang_noasm, "Run benchmark on Elliptic Curve group 𝔾1 - Clang no Assembly":
+  runBench("bench_ec_g1", "clang", useAsm = false)
+
+# Elliptic curve G1 - batch operations
+# ------------------------------------------
+
+task bench_ec_g1_batch, "Run benchmark on Elliptic Curve group 𝔾1 (batch ops) - Default compiler":
+  runBench("bench_ec_g1_batch")
+
+task bench_ec_g1_batch_gcc, "Run benchmark on Elliptic Curve group 𝔾1 (batch ops) - GCC":
+  runBench("bench_ec_g1_batch", "gcc")
+
+task bench_ec_g1_batch_clang, "Run benchmark on Elliptic Curve group 𝔾1 (batch ops) - Clang":
+  runBench("bench_ec_g1_batch", "clang")
+
+task bench_ec_g1_batch_gcc_noasm, "Run benchmark on Elliptic Curve group 𝔾1 (batch ops) - GCC no Assembly":
+  runBench("bench_ec_g1_batch", "gcc", useAsm = false)
+
+task bench_ec_g1_batch_clang_noasm, "Run benchmark on Elliptic Curve group 𝔾1 (batch ops) - Clang no Assembly":
  runBench("bench_ec_g1", "clang", useAsm = false)

 # Elliptic curve G2
 # ------------------------------------------

-task bench_ec_g2, "Run benchmark on Elliptic Curve group 𝔾2 - Short Weierstrass with Projective Coordinates - Default compiler":
+task bench_ec_g2, "Run benchmark on Elliptic Curve group 𝔾2 - Default compiler":
  runBench("bench_ec_g2")

-task bench_ec_g2_gcc, "Run benchmark on Elliptic Curve group 𝔾2 - Short Weierstrass with Projective Coordinates - GCC":
+task bench_ec_g2_gcc, "Run benchmark on Elliptic Curve group 𝔾2 - GCC":
  runBench("bench_ec_g2", "gcc")

-task bench_ec_g2_clang, "Run benchmark on Elliptic Curve group 𝔾2 - Short Weierstrass with Projective Coordinates - Clang":
+task bench_ec_g2_clang, "Run benchmark on Elliptic Curve group 𝔾2 - Clang":
  runBench("bench_ec_g2", "clang")

-task bench_ec_g2_gcc_noasm, "Run benchmark on Elliptic Curve group 𝔾2 - Short Weierstrass with Projective Coordinates - GCC no Assembly":
+task bench_ec_g2_gcc_noasm, "Run benchmark on Elliptic Curve group 𝔾2 - GCC no Assembly":
  runBench("bench_ec_g2", "gcc", useAsm = false)

-task bench_ec_g2_clang_noasm, "Run benchmark on Elliptic Curve group 𝔾2 - Short Weierstrass with Projective Coordinates - Clang no Assembly":
+task bench_ec_g2_clang_noasm, "Run benchmark on Elliptic Curve group 𝔾2 - Clang no Assembly":
  runBench("bench_ec_g2", "clang", useAsm = false)

 # Pairings
--- a/constantine/math/elliptic/README.md
+++ b/constantine/math/elliptic/README.md
@ -62,7 +62,7 @@ Ry = λ(Px - Rx) - Py
 ```
 but in the case of addition
 ```
-λ = (Qy - Py) / (Px - Qx)
+λ = (Qy - Py) / (Qx - Px)
 ```
 which is undefined for P == Q or P == -Q (as `-(x, y) = (x, -y)`)

--- a/constantine/math/elliptic/ec_shortweierstrass_affine.nim
+++ b/constantine/math/elliptic/ec_shortweierstrass_affine.nim
@ -14,6 +14,9 @@ import
  ../io/[io_fields, io_extfields],
  ../constants/zoo_constants

+# No exceptions allowed
+{.push raises: [].}
+
 # ############################################################
 #
 #             Elliptic Curve in Short Weierstrass form
--- a/constantine/math/elliptic/ec_shortweierstrass_batch_ops.nim
+++ b/constantine/math/elliptic/ec_shortweierstrass_batch_ops.nim
@ -0,0 +1,303 @@
+# Constantine
+# Copyright (c) 2018-2019    Status Research & Development GmbH
+# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
+# Licensed and distributed under either of
+#   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
+#   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
+# at your option. This file may not be copied, modified, or distributed except according to those terms.
+
+import
+  ../../platforms/[abstractions, allocs],
+  ../arithmetic,
+  ../extension_fields,
+  ./ec_shortweierstrass_affine,
+  ./ec_shortweierstrass_jacobian,
+  ./ec_shortweierstrass_projective
+
+# No exceptions allowed
+{.push raises: [].}
+
+# ############################################################
+#
+#             Elliptic Curve in Short Weierstrass form
+#                     Batch addition
+#
+# ############################################################
+
+# Affine primitives
+# ------------------------------------------------------------
+#
+# The equation for elliptic curve addition is in affine (x, y) coordinates:
+# 
+# P + Q = R
+# (Px, Py) + (Qx, Qy) = (Rx, Ry)
+# 
+# with
+#   Rx = λ² - Px - Qx
+#   Ry = λ(Px - Rx) - Py
+# 
+# in the case of addition
+#   λ = (Qy - Py) / (Qx - Px)
+# 
+# which is undefined for P == Q or P == -Q as -(x, y) = (x, -y)
+# 
+# if P = Q, the doubling formula uses the slope of the tangent at the limit
+#   λ = (3 Px² + a) / (2 Px)
+#
+# if P = -Q, the sum is the point at infinity
+#
+# ~~~~
+#
+# Those formulas require
+#   addition: 2M + 1S + 1I
+#   doubling: 2M + 2S + 1I
+#
+# Inversion is very expensive:
+#   119.5x multiplications (with ADX) for BN254
+#   98.4x  multiplications (with ADX) for BLS12-381
+#
+# However, n inversions can use Montgomery's batch inversion
+# at the cost of 3(n-1)M + 1I
+#
+# Hence batch addition can have an asymptotic cost of
+#   5M + 1S
+# Compared to
+#   Jacobian addition:         12M + 4S
+#   Jacobian mixed addition:    7M + 4S
+#   Projective addition:       12M      (for curves in the form y² = x³ + b)
+#   Projective mixed addition: 11M      (for curves in the form y² = x³ + b)
+
+func lambdaAdd[F; G: static Subgroup](lambda_num, lambda_den: var F, P, Q: ECP_ShortW_Aff[F, G]) =
+  ## Compute the slope of the line (PQ)
+  lambda_num.diff(Q.y, P.y)
+  lambda_den.diff(Q.x, P.x)
+
+func lambdaDouble[F; G: static Subgroup](lambda_num, lambda_den: var F, P: ECP_ShortW_Aff[F, G]) =
+  ## Compute the tangent at P
+  lambda_num.square(P.x)
+  lambda_num *= 3
+  when F.C.getCoefA() != 0:
+    t += F.C.getCoefA()
+
+  lambda_den.double(P.y)
+
+func affineAdd[F; G: static Subgroup](
+       r: var ECP_ShortW_Aff[F, G],
+       lambda: var F,
+       P, Q: ECP_ShortW_Aff[F, G]) =
+  
+  r.x.square(lambda)
+  r.x -= P.x
+  r.x -= Q.x
+
+  r.y.diff(P.x, r.x)
+  r.y *= lambda
+  r.y -= P.y
+
+{.push checks:off.}
+func accum_half_vartime[F; G: static Subgroup](
+       points: ptr UncheckedArray[ECP_ShortW_Aff[F, G]],
+       lambdas: ptr UncheckedArray[tuple[num, den: F]],
+       len: uint) {.noinline.} =
+  ## Affine accumulation of half the points into the other half
+  ## Warning ⚠️ : variable-time
+  ## 
+  ## Accumulate `len` points pairwise into `len/2` 
+  ## 
+  ## Input/output:
+  ## - points: `len/2` affine points to add (must be even)
+  ##           Partial sums are stored in [0, len/2)
+  ##           [len/2, len) data has been destroyed
+  ## 
+  ## Scratchspace:
+  ## - Lambdas
+  ## 
+  ## Output:
+  ## - r
+  ## 
+  ## Warning ⚠️ : cannot be inlined if used in loop due to the use of alloca
+
+  debug: doAssert len and 1 == 0, "There must be an even number of points"
+  
+  let N = len div 2
+
+  # Step 1: Compute numerators and denominators of λᵢ = λᵢ_num / λᵢ_den
+  for i in 0 ..< N:
+    let p = i
+    let q = i+N
+    let q_prev = i-1+N
+
+    # As we can't divide by 0 in normal cases, λᵢ_den != 0,
+    # so we use it to indicate special handling.
+    template markSpecialCase(): untyped {.dirty.} =
+      #  we use Qy as an accumulator, so we save Qy in λᵢ_num
+      lambdas[i].num = points[q].y
+      # Mark for special handling
+      lambdas[i].den.setZero()
+
+      # Step 2: Accumulate denominators in Qy, which is not used anymore.
+      if i == 0:
+        points[q].y.setOne()
+      else:
+        points[q].y = points[q_prev].y
+
+    # Special case 1: infinity points have affine coordinates (0, 0) by convention
+    #                 it doesn't match the y²=x³+ax+b equation so slope formula need special handling
+    if points[p].isInf().bool() or points[q].isInf().bool():
+      markSpecialCase()
+      continue
+
+    # Special case 2: λ = (Qy-Py)/(Qx-Px) which is undefined when Px == Qx
+    #                 This happens when P == Q or P == -Q
+    if bool(points[p].x == points[q].x):
+      if bool(points[p].y == points[q].y):
+        lambdaDouble(lambdas[i].num, lambdas[i].den, points[p])
+      else: # P = -Q, so P+Q = inf
+        markSpecialCase()
+        continue
+    else:
+      lambdaAdd(lambdas[i].num, lambdas[i].den, points[p], points[q])
+  
+    # Step 2: Accumulate denominators in Qy, which is not used anymore.
+    if i == 0:
+      points[q].y = lambdas[i].den
+    else:
+      points[q].y.prod(points[q_prev].y, lambdas[i].den, skipFinalSub = true)  
+
+  # Step 3: batch invert
+  var accInv {.noInit.}: F
+  accInv.setZero()
+  points[len-1].y += accInv   # Undo skipFinalSub, ensure that the last accum is in canonical form, before inversion
+  accInv.inv(points[len-1].y)
+
+  # Step 4: Compute the partial sums
+
+  template recallSpecialCase(i, p, q): untyped {.dirty.} =
+    # As Qy is used as an accumulator, we saved Qy in λᵢ_num
+    # For special caseshandling, restore it.
+    points[q].y = lambdas[i].num
+    if points[p].isInf().bool():
+      points[i] = points[q]
+    elif points[q].x.isZero().bool() and lambdas[i].num.isZero().bool():
+      discard "points[i] = points[p]" # i == p
+    else:
+      points[i].setInf()
+
+  for i in countdown(N-1, 1):
+    let p = i
+    let q = i+N
+    let q_prev = i-1+N
+
+    if lambdas[i].den.isZero().bool():
+      recallSpecialCase(i, p, q)
+      continue
+
+    # Compute lambda
+    points[q].y.prod(accInv, points[q_prev].y, skipFinalSub = true)
+    points[q].y.prod(points[q].y, lambdas[i].num, skipFinalSub = true)
+    
+    # Compute EC addition 
+    var r{.noInit.}: ECP_ShortW_Aff[F, G]
+    r.affineAdd(lambda = points[q].y, points[p], points[q])    
+
+    # Store result
+    points[i] = r
+
+    # Next iteration
+    accInv.prod(accInv, lambdas[i].den, skipFinalSub = true)
+
+  block: # Tail
+    let i = 0
+    let p = 0
+    let q = N
+
+    if lambdas[0].den.isZero().bool():
+      recallSpecialCase(i, p, q)
+    else:
+      # Compute lambda
+      points[q].y.prod(lambdas[0].num, accInv, skipFinalSub = true)
+  
+      # Compute EC addition 
+      var r{.noInit.}: ECP_ShortW_Aff[F, G]
+      r.affineAdd(lambda = points[q].y, points[p], points[q])    
+
+      # Store result
+      points[0] = r
+
+{.pop.}
+
+# Batch addition: jacobian
+# ------------------------------------------------------------
+
+{.push checks:off.}
+func accumSum_chunk_vartime[F; G: static Subgroup](
+       r: var (ECP_ShortW_Jac[F, G] or ECP_ShortW_Prj[F, G]),
+       points: ptr UncheckedArray[ECP_ShortW_Aff[F, G]],
+       lambdas: ptr UncheckedArray[tuple[num, den: F]],
+       len: uint) =
+  ## Accumulate `points` into r.
+  ## `r` is NOT overwritten
+  ## r += ∑ points
+  
+  const ChunkThreshold = 16
+  var n = len
+
+  while n >= ChunkThreshold:
+    if (n and 1) == 1: # odd number of points
+      ## Accumulate the last
+      r += points[n-1]
+      n -= 1
+    
+    # Compute [0, n/2) += [n/2, n)
+    accum_half_vartime(points, lambdas, n)
+
+    # Next chunk
+    n = n div 2
+
+  # Tail
+  for i in 0'u ..< n:
+    r += points[i]
+{.pop.}
+
+{.push checks:off.}
+func sum_batch_vartime*[F; G: static Subgroup](
+       r: var (ECP_ShortW_Jac[F, G] or ECP_ShortW_Prj[F, G]),
+       points: openArray[ECP_ShortW_Aff[F, G]]) =
+  ## Batch addition of `points` into `r`
+  ## `r` is overwritten
+  
+  # We chunk the addition to limit memory usage
+  # especially as we allocate on the stack.
+
+  # From experience in high-performance computing,
+  # here are the constraints we want to optimize for
+  #   1. MSVC limits stack to 1MB by default, we want to use a fraction of that.
+  #   2. We want to use a large fraction of L2 cache, but not more.
+  #   3. We want to use a large fraction of the memory addressable by the TLB.
+  #   4. We optimize for hyperthreading with 2 sibling threads (Xeon Phi hyperthreads have 4 siblings).
+  #      Meaning we want to use less than half the L2 cache so that if run on siblings threads (same physical core),
+  #      the chunks don't evict each other.
+  #
+  # Hardware:
+  # - a Raspberry Pi 4 (2019, Cortex A72) has 1MB L2 cache size
+  # - Intel Ice Lake (2019, Core 11XXX) and AMD Zen 2 (2019, Ryzen 3XXX) have 512kB L2 cache size
+  #
+  # After one chunk is processed we are well within all 64-bit CPU L2 cache bounds
+  # as we halve after each chunk.
+
+  r.setInf()
+
+  const maxChunkSize = 262144 # 2¹⁸ = 262144
+  const maxStride = maxChunkSize div sizeof(ECP_ShortW_Aff[F, G])
+  
+  let n = min(maxStride, points.len)
+  let accumulators = alloca(ECP_ShortW_Aff[F, G], n)
+  let lambdas = alloca(tuple[num, den: F], n)
+
+  for i in countup(0, points.len-1, maxStride):
+    let n = min(maxStride, points.len - i)
+    let size = n * sizeof(ECP_ShortW_Aff[F, G])
+    copyMem(accumulators[0].addr, points[i].unsafeAddr, size)
+    r.accumSum_chunk_vartime(accumulators, lambdas, uint n)
+
+{.pop.} 
--- a/constantine/math/elliptic/ec_shortweierstrass_jacobian.nim
+++ b/constantine/math/elliptic/ec_shortweierstrass_jacobian.nim
@ -15,6 +15,9 @@ import

 export Subgroup

+# No exceptions allowed
+{.push raises: [].}
+
 # ############################################################
 #
 #             Elliptic Curve in Short Weierstrass form
@ -47,15 +50,15 @@ func `==`*(P, Q: ECP_ShortW_Jac): SecretBool =
  var z1z1 {.noInit.}, z2z2 {.noInit.}: F
  var a{.noInit.}, b{.noInit.}: F

-  z1z1.square(P.z)
-  z2z2.square(Q.z)
+  z1z1.square(P.z, skipFinalSub = true)
+  z2z2.square(Q.z, skipFinalSub = true)

  a.prod(P.x, z2z2)
  b.prod(Q.x, z1z1)
  result = a == b

-  a.prod(P.y, Q.z)
-  b.prod(Q.y, P.z)
+  a.prod(P.y, Q.z, skipFinalSub = true)
+  b.prod(Q.y, P.z, skipFinalSub = true)
  a *= z2z2
  b *= z1z1
  result = result and a == b
@ -108,9 +111,9 @@ func trySetFromCoordsXandZ*[F; G](
  result = sqrt_if_square(P.y)

  var z2 {.noInit.}: F
-  z2.square(z)
+  z2.square(z, skipFinalSub = true)
  P.x.prod(x, z2)
-  P.y *= z2
+  P.y.prod(P.y, z2, skipFinalSub = true)
  P.y *= z
  P.z = z

@ -209,13 +212,13 @@ template sumImpl[F; G: static Subgroup](

  block: # Addition-only, check for exceptional cases
    var Z2Z2 {.noInit.}, U2 {.noInit.}, S2 {.noInit.}: F
-    Z2Z2.square(Q.z)
-    S1.prod(Q.z, Z2Z2)
+    Z2Z2.square(Q.z, skipFinalSub = true)
+    S1.prod(Q.z, Z2Z2, skipFinalSub = true)
    S1 *= P.y           # S₁ = Y₁*Z₂³
    U1.prod(P.x, Z2Z2)  # U₁ = X₁*Z₂²

-    Z1Z1.square(P.z)
-    S2.prod(P.z, Z1Z1)
+    Z1Z1.square(P.z, skipFinalSub = true)
+    S2.prod(P.z, Z1Z1, skipFinalSub = true)
    S2 *= Q.y           # S₂ = Y₂*Z₁³
    U2.prod(Q.x, Z1Z1)  # U₂ = X₂*Z₁²

@ -285,10 +288,10 @@ template sumImpl[F; G: static Subgroup](
      var b{.noInit.} = HH_or_YY
      a.ccopy(P.x, isDbl)
      b.ccopy(P.x, isDbl)
-      HHH_or_Mpre.prod(a, b)        # HHH or X₁²
+      HHH_or_Mpre.prod(a, b, true)  # HHH or X₁²

      # Assuming doubling path
-      a.square(HHH_or_Mpre)
+      a.square(HHH_or_Mpre, skipFinalSub = true)
      a *= HHH_or_Mpre              # a = 3X₁²
      b.square(Z1Z1)
      # b.mulCheckSparse(CoefA)     # TODO: broken static compile-time type inference
@ -302,31 +305,33 @@ template sumImpl[F; G: static Subgroup](
  # - R_or_M is set with R (add) or M (dbl)
  # - HHH_or_Mpre contains HHH (add) or garbage precomputation (dbl)
  # - V_or_S is set with V = U₁*HH (add) or S = X₁*YY (dbl)
+  var o {.noInit.}: typeof(r)
  block: # Finishing line
-    # we can start using r, while carefully handling r and P or Q aliasing
    var t {.noInit.}: F
    t.double(V_or_S)
-    r.x.square(R_or_M)
-    r.x -= t                           # X₃ = R²-2*V (add) or M²-2*S (dbl)
-    r.x.csub(HHH_or_Mpre, not isDbl)   # X₃ = R²-HHH-2*V (add) or M²-2*S (dbl)
+    o.x.square(R_or_M)
+    o.x -= t                           # X₃ = R²-2*V (add) or M²-2*S (dbl)
+    o.x.csub(HHH_or_Mpre, not isDbl)   # X₃ = R²-HHH-2*V (add) or M²-2*S (dbl)

-    V_or_S -= r.x                      # V-X₃ (add) or S-X₃ (dbl)
-    r.y.prod(R_or_M, V_or_S)           # Y₃ = R(V-X₃) (add) or M(S-X₃) (dbl)
+    V_or_S -= o.x                      # V-X₃ (add) or S-X₃ (dbl)
+    o.y.prod(R_or_M, V_or_S)           # Y₃ = R(V-X₃) (add) or M(S-X₃) (dbl)
    HHH_or_Mpre.ccopy(HH_or_YY, isDbl) # HHH (add) or YY (dbl)
    S1.ccopy(HH_or_YY, isDbl)          # S1 (add) or YY (dbl)
    HHH_or_Mpre *= S1                  # HHH*S1 (add) or YY² (dbl)
-    r.y -= HHH_or_Mpre                 # Y₃ = R(V-X₃)-S₁*HHH (add) or M(S-X₃)-YY² (dbl)
+    o.y -= HHH_or_Mpre                 # Y₃ = R(V-X₃)-S₁*HHH (add) or M(S-X₃)-YY² (dbl)

    t = Q.z
    t.ccopy(H_or_Y, isDbl)             # Z₂ (add) or Y₁ (dbl)
-    t *= P.z                           # Z₁Z₂ (add) or Y₁Z₁ (dbl)
-    r.z.prod(t, H_or_Y)                # Z₁Z₂H (add) or garbage (dbl)
-    r.z.ccopy(t, isDbl)                # Z₁Z₂H (add) or Y₁Z₁ (dbl)
+    t.prod(t, P.z, true)               # Z₁Z₂ (add) or Y₁Z₁ (dbl)
+    o.z.prod(t, H_or_Y)                # Z₁Z₂H (add) or garbage (dbl)
+    o.z.ccopy(t, isDbl)                # Z₁Z₂H (add) or Y₁Z₁ (dbl)

  # if P or R were infinity points they would have spread 0 with Z₁Z₂
  block: # Infinity points
-    r.ccopy(Q, P.isInf())
-    r.ccopy(P, Q.isInf())
+    o.ccopy(Q, P.isInf())
+    o.ccopy(P, Q.isInf())
+
+  r = o

 func sum*[F; G: static Subgroup](
       r: var ECP_ShortW_Jac[F, G],
@ -379,81 +384,176 @@ func madd*[F; G: static Subgroup](
       P: ECP_ShortW_Jac[F, G],
       Q: ECP_ShortW_Aff[F, G]
     ) =
-  ## Elliptic curve mixed addition for Short Weierstrass curves
-  ## with p in Jacobian coordinates and Q in affine coordinates
+  ## Elliptic curve mixed addition for Short Weierstrass curves in Jacobian coordinates
+  ## with the curve ``a`` being a parameter for summing on isogenous curves.
  ##
  ##   R = P + Q
-  ## 
-  ## TODO: ⚠️ cannot handle P == Q
-  # "madd-2007-bl" mixed addition formula - https://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#addition-madd-2007-bl
-  # with conditional copies to handle infinity points
-  #  Assumptions: Z2=1.
-  #  Cost: 7M + 4S + 9add + 3*2 + 1*4.
-  #  Source: 2007 Bernstein–Lange.
-  #  Explicit formulas:
+  ##
+  ## Short Weierstrass curves have the following equation in Jacobian coordinates
+  ##   Y² = X³ + aXZ⁴ + bZ⁶
+  ## from the affine equation
+  ##   y² = x³ + a x + b
+  ##
+  ## ``r`` is initialized/overwritten with the sum
+  ## ``CoefA`` allows fast path for curve with a == 0 or a == -3
+  ##            and also allows summing on curve isogenies.
+  ##
+  ## Implementation is constant-time, in particular it will not expose
+  ## that P == Q or P == -Q or P or Q are the infinity points
+  ## to simple side-channel attacks (SCA)
+  ## This is done by using a "complete" or "exception-free" addition law.
  #
-  #        Z1Z1 = Z1²
-  #        U2 = X2*Z1Z1
-  #        S2 = Y2*Z1*Z1Z1
-  #        H = U2-X1
-  #        HH = H2
-  #        I = 4*HH
-  #        J = H*I
-  #        r = 2*(S2-Y1)
-  #        V = X1*I
-  #        X3 = r²-J-2*V
-  #        Y3 = r*(V-X3)-2*Y1*J
-  #        Z3 = (Z1+H)²-Z1Z1-HH
-  var Z1Z1 {.noInit.}, H {.noInit.}, HH {.noInit.}, I{.noInit.}, J {.noInit.}: F
+  # Implementation, see write-up at the bottom.
+  # We fuse addition and doubling with condition copy by swapping
+  # terms with the following table
+  #
+  # |  Addition, Cohen et al, 1998  |      Doubling, Cohen et al, 1998         |   Doubling = -3   | Doubling a = 0 |
+  # |  12M + 4S + 6add + 1*2        | 3M + 6S + 1*a + 4add + 1*2 + 1*3 + 1half |                   |                |
+  # | ----------------------------- | -----------------------------------------| ----------------- | -------------- |
+  # | Z₁Z₁ = Z₁²                    | Z₁Z₁ = Z₁²                               |                   |                |
+  # | Z₂Z₂ = Z₂²                    |                                          |                   |                |
+  # |                               |                                          |                   |                |
+  # | U₁ = X₁*Z₂Z₂                  |                                          |                   |                |
+  # | U₂ = X₂*Z₁Z₁                  |                                          |                   |                |
+  # | S₁ = Y₁*Z₂*Z₂Z₂               |                                          |                   |                |
+  # | S₂ = Y₂*Z₁*Z₁Z₁               |                                          |                   |                |
+  # | H  = U₂-U₁ # P=-Q, P=Inf, P=Q |                                          |                   |                |
+  # | R  = S₂-S₁ # Q=Inf            |                                          |                   |                |
+  # |                               |                                          |                   |                |
+  # | HH  = H²                      | YY = Y₁²                                 |                   |                |
+  # | V   = U₁*HH                   | S  = X₁*YY                               |                   |                |
+  # | HHH = H*HH                    | M  = (3*X₁²+a*ZZ²)/2                     | 3(X₁-ZZ)(X₁+ZZ)/2 | 3X₁²/2         |
+  # |                               |                                          |                   |                |
+  # | X₃ = R²-HHH-2*V               | X₃ = M²-2*S                              |                   |                |
+  # | Y₃ = R*(V-X₃)-S₁*HHH          | Y₃ = M*(S-X₃)-YY*YY                      |                   |                |
+  # | Z₃ = Z₁*Z₂*H                  | Z₃ = Y₁*Z₁                               |                   |                |
+  #
+  # For mixed adddition we just set Z₂ = 1
+  var Z1Z1 {.noInit.}, U1 {.noInit.}, S1 {.noInit.}, H {.noInit.}, R {.noinit.}: F

-  # Preload P and Q in cache
-  let pIsInf = P.isInf()
-  let qIsInf = Q.isInf()
+  block: # Addition-only, check for exceptional cases
+    var U2 {.noInit.}, S2 {.noInit.}: F
+    U1 = P.x
+    S1 = P.y

-  Z1Z1.square(P.z)    # Z₁Z₁ = Z₁²
-  r.z.prod(P.z, Z1Z1) #               P.Z is hot in cache, keep it in same register.
-  r.z *= Q.y          # S₂ = Y₂Z₁Z₁Z₁         -- r.z used as S₂
+    Z1Z1.square(P.z, skipFinalSub = true)
+    S2.prod(P.z, Z1Z1, skipFinalSub = true)
+    S2 *= Q.y           # S₂ = Y₂*Z₁³
+    U2.prod(Q.x, Z1Z1)  # U₂ = X₂*Z₁²

-  H.prod(Q.x, Z1Z1)   # U₂ = X₂Z₁Z₁
-  H -= P.x            # H = U₂ - X₁
+    H.diff(U2, U1)      # H = U₂-U₁
+    R.diff(S2, S1)      # R = S₂-S₁

-  HH.square(H)        # HH = H²
+  # Exceptional cases
+  # Expressing H as affine, if H == 0, P == Q or -Q
+  # H = U₂-U₁ = X₂*Z₁² - X₁*Z₂² = x₂*Z₂²*Z₁² - x₁*Z₁²*Z₂²
+  # if H == 0 && R == 0, P = Q -> doubling
+  # if only H == 0, P = -Q     -> infinity, implied in Z₃ = Z₁*Z₂*H = 0
+  # if only R == 0, P and Q are related by the cubic root endomorphism
+  let isDbl = H.isZero() and R.isZero()

-  I.double(HH)
-  I.double()          # I = 4HH
+  # Rename buffers under the form (add_or_dbl)
+  template R_or_M: untyped = R
+  template H_or_Y: untyped = H
+  template V_or_S: untyped = U1
+  var HH_or_YY {.noInit.}: F
+  var HHH_or_Mpre {.noInit.}: F

-  J.prod(H, I)        # J = H*I
-  r.y.prod(P.x, I)    # V = X₁*I              -- r.y used as V
+  H_or_Y.ccopy(P.y, isDbl) # H         (add) or Y₁        (dbl)
+  HH_or_YY.square(H_or_Y)  # H²        (add) or Y₁²       (dbl)

-  r.z -= P.y          #
-  r.z.double()        # r = 2*(S₂-Y₁)         -- r.z used as r
+  V_or_S.ccopy(P.x, isDbl) # U₁        (add) or X₁        (dbl)
+  V_or_S *= HH_or_YY       # V = U₁*HH (add) or S = X₁*YY (dbl)

-  r.x.square(r.z)     # r²
-  r.x -= J
-  r.x -= r.y
-  r.x -= r.y          # X₃ = r²-J-2*V         -- r.x computed
+  block: # Compute M for doubling
+    # "when" static evaluation doesn't shortcut booleans :/
+    # which causes issues when CoefA isn't an int but Fp or Fp2
+    const CoefA = F.C.getCoefA()
+    when CoefA is int:
+      const CoefA_eq_zero = CoefA == 0
+      const CoefA_eq_minus3 {.used.} = CoefA == -3
+    else:
+      const CoefA_eq_zero = false
+      const CoefA_eq_minus3 = false

-  r.y -= r.x          # V-X₃
-  r.y *= r.z          # r*(V-X₃)
+    when CoefA_eq_zero:
+      var a {.noInit.} = H
+      var b {.noInit.} = HH_or_YY
+      a.ccopy(P.x, isDbl)           # H or X₁
+      b.ccopy(P.x, isDbl)           # HH or X₁
+      HHH_or_Mpre.prod(a, b)        # HHH or X₁²

-  J *= P.y            # Y₁J                   -- J reused as Y₁J
-  r.y -= J
-  r.y -= J            # Y₃ = r*(V-X₃) - 2*Y₁J -- r.y computed
+      var M{.noInit.} = HHH_or_Mpre # Assuming on doubling path
+      M.div2()                      #  X₁²/2
+      M += HHH_or_Mpre              # 3X₁²/2
+      R_or_M.ccopy(M, isDbl)

-  r.z.sum(P.z, H)     # Z₁ + H
-  r.z.square()
-  r.z -= Z1Z1
-  r.z -= HH           # Z₃ = (Z1+H)²-Z1Z1-HH
+    elif CoefA_eq_minus3:
+      var a{.noInit.}, b{.noInit.}: F
+      a.sum(P.x, Z1Z1)
+      b.diff(P.z, Z1Z1)
+      a.ccopy(H_or_Y, not isDbl)    # H   or X₁+ZZ
+      b.ccopy(HH_or_YY, not isDbl)  # HH  or X₁-ZZ
+      HHH_or_Mpre.prod(a, b)        # HHH or X₁²-ZZ²

-  # Now handle points at infinity
-  proc one(): F =
-    result.setOne()
+      var M{.noInit.} = HHH_or_Mpre # Assuming on doubling path
+      M.div2()                      # (X₁²-ZZ²)/2
+      M += HHH_or_Mpre              # 3(X₁²-ZZ²)/2
+      R_or_M.ccopy(M, isDbl)

-  r.x.ccopy(Q.x, pIsInf)
-  r.y.ccopy(Q.y, pIsInf)
-  r.z.ccopy(static(one()), pIsInf)
+    else:
+      # TODO: Costly `a` coefficients can be computed
+      # by merging their computation with Z₃ = Z₁*Z₂*H (add) or Z₃ = Y₁*Z₁ (dbl)
+      var a{.noInit.} = H
+      var b{.noInit.} = HH_or_YY
+      a.ccopy(P.x, isDbl)
+      b.ccopy(P.x, isDbl)
+      HHH_or_Mpre.prod(a, b, true)  # HHH or X₁²

-  r.ccopy(P, qIsInf)
+      # Assuming doubling path
+      a.square(HHH_or_Mpre, skipFinalSub = true)
+      a *= HHH_or_Mpre              # a = 3X₁²
+      b.square(Z1Z1)
+      # b.mulCheckSparse(CoefA)     # TODO: broken static compile-time type inference
+      b *= CoefA                    # b = αZZ, with α the "a" coefficient of the curve
+      
+      a += b
+      a.div2()
+      R_or_M.ccopy(a, isDbl)        # (3X₁² - αZZ)/2
+
+  # Let's count our horses, at this point:
+  # - R_or_M is set with R (add) or M (dbl)
+  # - HHH_or_Mpre contains HHH (add) or garbage precomputation (dbl)
+  # - V_or_S is set with V = U₁*HH (add) or S = X₁*YY (dbl)
+  var o {.noInit.}: typeof(r)
+  block: # Finishing line
+    var t {.noInit.}: F
+    t.double(V_or_S)
+    o.x.square(R_or_M)
+    o.x -= t                           # X₃ = R²-2*V (add) or M²-2*S (dbl)
+    o.x.csub(HHH_or_Mpre, not isDbl)   # X₃ = R²-HHH-2*V (add) or M²-2*S (dbl)
+
+    V_or_S -= o.x                      # V-X₃ (add) or S-X₃ (dbl)
+    o.y.prod(R_or_M, V_or_S)           # Y₃ = R(V-X₃) (add) or M(S-X₃) (dbl)
+    HHH_or_Mpre.ccopy(HH_or_YY, isDbl) # HHH (add) or YY (dbl)
+    S1.ccopy(HH_or_YY, isDbl)          # S1 (add) or YY (dbl)
+    HHH_or_Mpre *= S1                  # HHH*S1 (add) or YY² (dbl)
+    o.y -= HHH_or_Mpre                 # Y₃ = R(V-X₃)-S₁*HHH (add) or M(S-X₃)-YY² (dbl)
+
+    t.setOne()
+    t.ccopy(H_or_Y, isDbl)             # Z₂ (add) or Y₁ (dbl)
+    t.prod(t, P.z, true)               # Z₁Z₂ (add) or Y₁Z₁ (dbl)
+    o.z.prod(t, H_or_Y)                # Z₁Z₂H (add) or garbage (dbl)
+    o.z.ccopy(t, isDbl)                # Z₁Z₂H (add) or Y₁Z₁ (dbl)
+
+  block: # Infinity points
+    o.x.ccopy(Q.x, P.isInf())
+    o.y.ccopy(Q.y, P.isInf())
+    o.z.csetOne(P.isInf())
+    
+    o.ccopy(P, Q.isInf())
+
+  r = o

 func double*[F; G: static Subgroup](
       r: var ECP_ShortW_Jac[F, G],
@ -524,11 +624,7 @@ func `+=`*(P: var ECP_ShortW_Jac, Q: ECP_ShortW_Jac) {.inline.} =

 func `+=`*(P: var ECP_ShortW_Jac, Q: ECP_ShortW_Aff) {.inline.} =
  ## In-place mixed point addition
-  ## 
-  ## TODO: ⚠️ cannot handle P == Q
-  var t{.noInit.}: typeof(P)
-  t.madd(P, Q)
-  P = t
+  P.madd(P, Q)

 func double*(P: var ECP_ShortW_Jac) {.inline.} =
  ## In-place point doubling
@ -547,10 +643,10 @@ func affine*[F; G](
       jac: ECP_ShortW_Jac[F, G]) =
  var invZ {.noInit.}, invZ2{.noInit.}: F
  invZ.inv(jac.z)
-  invZ2.square(invZ)
+  invZ2.square(invZ, skipFinalSub = true)

  aff.x.prod(jac.x, invZ2)
-  invZ *= invZ2
+  invZ.prod(invZ, invZ2, skipFinalSub = true)
  aff.y.prod(jac.y, invZ)

 func fromAffine*[F; G](
@ -586,38 +682,38 @@ func batchAffine*[N: static int, F, G](
    zeroes[i] = z.isZero()
    z.csetOne(zeroes[i])

-    affs[i].x.prod(affs[i-1].x, z)
+    if i != N-1:
+      affs[i].x.prod(affs[i-1].x, z, skipFinalSub = true)
+    else:
+      affs[i].x.prod(affs[i-1].x, z, skipFinalSub = false)
  
  var accInv {.noInit.}: F
  accInv.inv(affs[N-1].x)

  for i in countdown(N-1, 1):
-    # Skip zero z-coordinates (infinity points)
-    var z = affs[i].x
-
    # Extract 1/Pᵢ
    var invi {.noInit.}: F
-    invi.prod(accInv, affs[i-1].x)
+    invi.prod(accInv, affs[i-1].x, skipFinalSub = true)
    invi.csetZero(zeroes[i])

    # Now convert Pᵢ to affine
    var invi2 {.noinit.}: F
-    invi2.square(invi)
+    invi2.square(invi, skipFinalSub = true)
    affs[i].x.prod(jacs[i].x, invi2)
-    invi *= invi2
+    invi.prod(invi, invi2, skipFinalSub = true)
    affs[i].y.prod(jacs[i].y, invi)

    # next iteration
    invi = jacs[i].z
    invi.csetOne(zeroes[i])
-    accInv *= invi
+    accInv.prod(accInv, invi, skipFinalSub = true)

  block: # tail
    var invi2 {.noinit.}: F
    accInv.csetZero(zeroes[0])
-    invi2.square(accInv)
+    invi2.square(accInv, skipFinalSub = true)
    affs[0].x.prod(jacs[0].x, invi2)
-    accInv *= invi2
+    accInv.prod(accInv, invi2, skipFinalSub = true)
    affs[0].y.prod(jacs[0].y, accInv)

 # ############################################################
--- a/constantine/math/elliptic/ec_shortweierstrass_projective.nim
+++ b/constantine/math/elliptic/ec_shortweierstrass_projective.nim
@ -15,6 +15,9 @@ import

 export Subgroup

+# No exceptions allowed
+{.push raises: [].}
+
 # ############################################################
 #
 #             Elliptic Curve in Short Weierstrass form
@ -261,7 +264,6 @@ func madd*[F; G: static Subgroup](
  ## ``r`` may alias P

  # TODO: static doAssert odd order
-
  when F.C.getCoefA() == 0:
    var t0 {.noInit.}, t1 {.noInit.}, t2 {.noInit.}, t3 {.noInit.}, t4 {.noInit.}: F
    var x3 {.noInit.}, y3 {.noInit.}, z3 {.noInit.}: F
@ -273,6 +275,10 @@ func madd*[F; G: static Subgroup](
    # Y₃ = (Y₁Y₂ + 3bZ₁)(Y₁Y₂ − 3bZ₁)
    #     + 9bX₁X₂ (X₁ + X₂Z₁)
    # Z₃= (Y₁ + Y₂Z₁)(Y₁Y₂ + 3bZ₁) + 3 X₁X₂ (X₁Y₂ + X₂Y₁)
+    #
+    # Note¹⁰ mentions that due to Qz = 1, cannot be
+    # the point at infinity.
+    # We solve that by conditional copies. 
    t0.prod(P.x, Q.x)         # 1.  t₀ <- X₁ X₂
    t1.prod(P.y, Q.y)         # 2.  t₁ <- Y₁ Y₂
    t3.sum(P.x, P.y)          # 3.  t₃ <- X₁ + Y₁ ! error in paper
@ -304,13 +310,24 @@ func madd*[F; G: static Subgroup](
      y3 *= SexticNonResidue
    x3.prod(t4, y3)           # 18. X₃ <- t₄ Y₃,   X₃ = (Y₁ + Y₂Z₁) 3b(X₁ + X₂Z₁)
    t2.prod(t3, t1)           # 19. t₂ <- t₃ t₁,   t₂ = (X₁Y₂ + X₂Y₁)(Y₁Y₂ - 3bZ₁)
-    r.x.diff(t2, x3)          # 20. X₃ <- t₂ - X₃, X₃ = (X₁Y₂ + X₂Y₁)(Y₁Y₂ - 3bZ₁) - 3b(Y₁ + Y₂Z₁)(X₁ + X₂Z₁)
+    x3.diff(t2, x3)           # 20. X₃ <- t₂ - X₃, X₃ = (X₁Y₂ + X₂Y₁)(Y₁Y₂ - 3bZ₁) - 3b(Y₁ + Y₂Z₁)(X₁ + X₂Z₁)
    y3 *= t0                  # 21. Y₃ <- Y₃ t₀,   Y₃ = 9bX₁X₂ (X₁ + X₂Z₁)
    t1 *= z3                  # 22. t₁ <- t₁ Z₃,   t₁ = (Y₁Y₂ - 3bZ₁)(Y₁Y₂ + 3bZ₁)
-    r.y.sum(y3, t1)           # 23. Y₃ <- t₁ + Y₃, Y₃ = (Y₁Y₂ + 3bZ₁)(Y₁Y₂ - 3bZ₁) + 9bX₁X₂ (X₁ + X₂Z₁)
+    y3 += t1                  # 23. Y₃ <- t₁ + Y₃, Y₃ = (Y₁Y₂ + 3bZ₁)(Y₁Y₂ - 3bZ₁) + 9bX₁X₂ (X₁ + X₂Z₁)
    t0 *= t3                  # 31. t₀ <- t₀ t₃,   t₀ = 3X₁X₂ (X₁Y₂ + X₂Y₁)
    z3 *= t4                  # 32. Z₃ <- Z₃ t₄,   Z₃ = (Y₁Y₂ + 3bZ₁)(Y₁ + Y₂Z₁)
-    r.z.sum(z3, t0)           # 33. Z₃ <- Z₃ + t₀, Z₃ = (Y₁ + Y₂Z₁)(Y₁Y₂ + 3bZ₁) + 3X₁X₂ (X₁Y₂ + X₂Y₁)
+    z3 += t0                  # 33. Z₃ <- Z₃ + t₀, Z₃ = (Y₁ + Y₂Z₁)(Y₁Y₂ + 3bZ₁) + 3X₁X₂ (X₁Y₂ + X₂Y₁)
+  
+    # Deal with infinity point. r and P might alias.
+    let inf = Q.isInf()
+    x3.ccopy(P.x, inf)
+    y3.ccopy(P.y, inf)
+    z3.ccopy(P.z, inf)
+
+    r.x = x3
+    r.y = y3
+    r.z = z3
+
  else:
    {.error: "Not implemented.".}

@ -454,18 +471,18 @@ func batchAffine*[N: static int, F, G](
    zeroes[i] = z.isZero()
    z.csetOne(zeroes[i])

-    affs[i].x.prod(affs[i-1].x, z)
+    if i != N-1:
+      affs[i].x.prod(affs[i-1].x, z, skipFinalSub = true)
+    else:
+      affs[i].x.prod(affs[i-1].x, z, skipFinalSub = false)
  
  var accInv {.noInit.}: F
  accInv.inv(affs[N-1].x)

  for i in countdown(N-1, 1):
-    # Skip zero z-coordinates (infinity points)
-    var z = affs[i].x
-
    # Extract 1/Pᵢ
    var invi {.noInit.}: F
-    invi.prod(accInv, affs[i-1].x)
+    invi.prod(accInv, affs[i-1].x, skipFinalSub = true)
    invi.csetZero(zeroes[i])

    # Now convert Pᵢ to affine
@ -475,7 +492,7 @@ func batchAffine*[N: static int, F, G](
    # next iteration
    invi = projs[i].z
    invi.csetOne(zeroes[i])
-    accInv *= invi
+    accInv.prod(accInv, invi, skipFinalSub = true)
  
  block: # tail
    accInv.csetZero(zeroes[0])
--- a/constantine/math/elliptic/ec_twistededwards_projective.nim
+++ b/constantine/math/elliptic/ec_twistededwards_projective.nim
@ -276,6 +276,10 @@ func double*[Field](
  E -= D           # C stores E-D
  r.y *= E

+func `+=`*(P: var ECP_TwEdwards_Prj, Q: ECP_TwEdwards_Prj) {.inline.} =
+  ## In-place point addition
+  P.sum(P, Q)
+
 func double*(P: var ECP_TwEdwards_Prj) {.inline.} =
  ## In-place EC doubling
  P.double(P)
--- a/constantine/math/extension_fields/towers.nim
+++ b/constantine/math/extension_fields/towers.nim
@ -2156,5 +2156,29 @@ func inv*(a: var CubicExt) =
  ## to affine for elliptic curve
  a.invImpl(a)

+# Convenience functions
+# ----------------------------------------------------------------------
+
+template square*(a: var ExtensionField, skipFinalSub: static bool) =
+  # Square alias,
+  # this allows using the same code for
+  # the base field and its extensions while benefitting from skipping
+  # the final substraction on Fp
+  a.square()
+
+template square*(r: var ExtensionField, a: ExtensionField, skipFinalSub: static bool) =
+  # Square alias,
+  # this allows using the same code for
+  # the base field and its extensions while benefitting from skipping
+  # the final substraction on Fp
+  r.square(a)
+
+template prod*(r: var ExtensionField, a, b: ExtensionField, skipFinalSub: static bool) =
+  # Prod alias,
+  # this allows using the same code for
+  # the base field and its extensions while benefitting from skipping
+  # the final substraction on Fp
+  r.prod(a, b)
+
 {.pop.} # inline
 {.pop.} # raises no exceptions
--- a/constantine/platforms/allocs.nim
+++ b/constantine/platforms/allocs.nim
@ -0,0 +1,32 @@
+# Constantine
+# Copyright (c) 2018-2019    Status Research & Development GmbH
+# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
+# Licensed and distributed under either of
+#   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
+#   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
+# at your option. This file may not be copied, modified, or distributed except according to those terms.
+
+# ############################################################
+#
+#                        Allocators
+#
+# ############################################################
+
+# Due to the following constraints:
+# - No dynamic allocation in single-threaded codepaths (for compatibility with embedded devices like TPM or secure hardware)
+# - Avoiding cryptographic material in third-party libraries (like a memory allocator)
+# - Giving full control of the library user on allocation strategy
+# - Performance, especially for long-running processes (fragmentation, multithreaded allocation...)
+#
+# stack allocation is strongly preferred where necessary.
+
+when defined(windows):
+  proc alloca(size: int): pointer {.header: "<malloc.h>".}
+else:
+  proc alloca(size: int): pointer {.header: "<alloca.h>".}
+
+template alloca*(T: typedesc): ptr T =
+  cast[ptr T](alloca(sizeof(T)))
+
+template alloca*(T: typedesc, len: Natural): ptr UncheckedArray[T] =
+  cast[ptr UncheckedArray[T]](alloca(sizeof(T) * len))
--- a/helpers/pararun.nim
+++ b/helpers/pararun.nim
@ -13,7 +13,7 @@ import

 # Pararun is a parallel shell command runner
 # ------------------------------------------
-# Usage: pararun <file-with-1-command-per-line> <numWorkers
+# Usage: pararun <file-with-1-command-per-line> <numWorkers>

 # AsyncSemaphore
 # ----------------------------------------------------------------
--- a/tests/math/t_ec_shortw_jac_g1_batch_add.nim
+++ b/tests/math/t_ec_shortw_jac_g1_batch_add.nim
@ -0,0 +1,30 @@
+# Constantine
+# Copyright (c) 2018-2019    Status Research & Development GmbH
+# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
+# Licensed and distributed under either of
+#   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
+#   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
+# at your option. This file may not be copied, modified, or distributed except according to those terms.
+
+import
+  # Internals
+  ../../constantine/math/config/curves,
+  ../../constantine/math/elliptic/ec_shortweierstrass_jacobian,
+  ../../constantine/math/arithmetic,
+  # Test utilities
+  ./t_ec_template
+
+const
+  numPoints = [1, 2, 8, 16, 128, 1024, 2048, 16384, 32768] # 262144, 1048576]
+
+run_EC_batch_add_impl(
+    ec = ECP_ShortW_Jac[Fp[BN254_Snarks], G1],
+    numPoints = numPoints,
+    moduleName = "test_ec_shortweierstrass_jacobian_batch_add_" & $BN254_Snarks
+  )
+
+run_EC_batch_add_impl(
+    ec = ECP_ShortW_Jac[Fp[BLS12_381], G1],
+    numPoints = numPoints,
+    moduleName = "test_ec_shortweierstrass_jacobian_batch_add_" & $BLS12_381
+  )
--- a/tests/math/t_ec_shortw_prj_g1_batch_add.nim
+++ b/tests/math/t_ec_shortw_prj_g1_batch_add.nim
@ -0,0 +1,30 @@
+# Constantine
+# Copyright (c) 2018-2019    Status Research & Development GmbH
+# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
+# Licensed and distributed under either of
+#   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
+#   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
+# at your option. This file may not be copied, modified, or distributed except according to those terms.
+
+import
+  # Internals
+  ../../constantine/math/config/curves,
+  ../../constantine/math/elliptic/ec_shortweierstrass_projective,
+  ../../constantine/math/arithmetic,
+  # Test utilities
+  ./t_ec_template
+
+const
+  numPoints = [1, 2, 8, 16, 128, 1024, 2048, 16384, 32768] # 262144, 1048576]
+
+run_EC_batch_add_impl(
+    ec = ECP_ShortW_Prj[Fp[BN254_Snarks], G1],
+    numPoints = numPoints,
+    moduleName = "test_ec_shortweierstrass_projective_batch_add_" & $BN254_Snarks
+  )
+
+run_EC_batch_add_impl(
+    ec = ECP_ShortW_Prj[Fp[BLS12_381], G1],
+    numPoints = numPoints,
+    moduleName = "test_ec_shortweierstrass_projective_batch_add_" & $BLS12_381
+  )
--- a/tests/math/t_ec_template.nim
+++ b/tests/math/t_ec_template.nim
@ -23,6 +23,7 @@ import
    ec_shortweierstrass_affine,
    ec_shortweierstrass_jacobian,
    ec_shortweierstrass_projective,
+    ec_shortweierstrass_batch_ops,
    ec_twistededwards_affine,
    ec_twistededwards_projective,
    ec_scalar_mul],
@ -41,7 +42,7 @@ type
    Long01Sequence

 func random_point*(rng: var RngState, EC: typedesc, randZ: bool, gen: RandomGen): EC {.noInit.} =
-  if not randZ:
+  when EC is ECP_ShortW_Aff:
    if gen == Uniform:
      result = rng.random_unsafe(EC)
    elif gen == HighHammingWeight:
@ -49,12 +50,20 @@ func random_point*(rng: var RngState, EC: typedesc, randZ: bool, gen: RandomGen)
    else:
      result = rng.random_long01Seq(EC)
  else:
-    if gen == Uniform:
-      result = rng.random_unsafe_with_randZ(EC)
-    elif gen == HighHammingWeight:
-      result = rng.random_highHammingWeight_with_randZ(EC)
+    if not randZ:
+      if gen == Uniform:
+        result = rng.random_unsafe(EC)
+      elif gen == HighHammingWeight:
+        result = rng.random_highHammingWeight(EC)
+      else:
+        result = rng.random_long01Seq(EC)
    else:
-      result = rng.random_long01Seq_with_randZ(EC)
+      if gen == Uniform:
+        result = rng.random_unsafe_with_randZ(EC)
+      elif gen == HighHammingWeight:
+        result = rng.random_highHammingWeight_with_randZ(EC)
+      else:
+        result = rng.random_long01Seq_with_randZ(EC)

 template pairingGroup(EC: typedesc): string =
  when EC is (ECP_ShortW_Aff or ECP_ShortW_Prj or ECP_ShortW_Jac):
@ -99,6 +108,15 @@ proc run_EC_addition_tests*(
          r.sum(inf, P)
          check: bool(r == P)

+          # Aliasing tests
+          r = P
+          r += inf
+          check: bool(r == P)
+
+          r.setInf()
+          r += P
+          check: bool(r == P)
+
      test(ec, randZ = false, gen = Uniform)
      test(ec, randZ = true, gen = Uniform)
      test(ec, randZ = false, gen = HighHammingWeight)
@ -438,7 +456,7 @@ proc run_EC_mixed_add_impl*(

  suite testSuiteDesc & " - " & $ec & " - [" & $WordBitwidth & "-bit mode]":
    test "EC " & G1_or_G2 & " mixed addition is consistent with general addition":
-      proc test(EC: typedesc, bits: static int, randZ: bool, gen: RandomGen) =
+      proc test(EC: typedesc, randZ: bool, gen: RandomGen) =
        for _ in 0 ..< Iters:
          let a = rng.random_point(EC, randZ, gen)
          let b = rng.random_point(EC, randZ, gen)
@ -452,12 +470,83 @@ proc run_EC_mixed_add_impl*(

          check: bool(r_generic == r_mixed)

-      test(ec, bits = ec.F.C.getCurveOrderBitwidth(), randZ = false, gen = Uniform)
-      test(ec, bits = ec.F.C.getCurveOrderBitwidth(), randZ = true, gen = Uniform)
-      test(ec, bits = ec.F.C.getCurveOrderBitwidth(), randZ = false, gen = HighHammingWeight)
-      test(ec, bits = ec.F.C.getCurveOrderBitwidth(), randZ = true, gen = HighHammingWeight)
-      test(ec, bits = ec.F.C.getCurveOrderBitwidth(), randZ = false, gen = Long01Sequence)
-      test(ec, bits = ec.F.C.getCurveOrderBitwidth(), randZ = true, gen = Long01Sequence)
+      test(ec, randZ = false, gen = Uniform)
+      test(ec, randZ = true, gen = Uniform)
+      test(ec, randZ = false, gen = HighHammingWeight)
+      test(ec, randZ = true, gen = HighHammingWeight)
+      test(ec, randZ = false, gen = Long01Sequence)
+      test(ec, randZ = true, gen = Long01Sequence)
+
+    test "EC " & G1_or_G2 & " mixed addition - doubling":
+      proc test(EC: typedesc, randZ: bool, gen: RandomGen) =
+        for _ in 0 ..< Iters:
+          let a = rng.random_point(EC, randZ, gen)
+          var aAff: ECP_ShortW_Aff[EC.F, EC.G]
+          aAff.affine(a)
+
+          var r_generic, r_mixed: EC
+
+          r_generic.double(a)
+          r_mixed.madd(a, aAff)
+          check: bool(r_generic == r_mixed)
+          
+          # Aliasing test
+          r_mixed = a
+          r_mixed += aAff
+          check: bool(r_generic == r_mixed)
+
+      test(ec, randZ = false, gen = Uniform)
+      test(ec, randZ = true, gen = Uniform)
+      test(ec, randZ = false, gen = HighHammingWeight)
+      test(ec, randZ = true, gen = HighHammingWeight)
+      test(ec, randZ = false, gen = Long01Sequence)
+      test(ec, randZ = true, gen = Long01Sequence)
+
+    test "EC " & G1_or_G2 & " mixed addition - adding infinity LHS":
+      proc test(EC: typedesc, randZ: bool, gen: RandomGen) =
+        for _ in 0 ..< Iters:
+          var a{.noInit.}: EC
+          a.setInf()
+          let bAff = rng.random_point(ECP_ShortW_Aff[EC.F, EC.G], randZ = false, gen)
+
+          var r_mixed{.noInit.}: EC
+          r_mixed.madd(a, bAff)
+
+          var r{.noInit.}: ECP_ShortW_Aff[EC.F, EC.G]
+          r.affine(r_mixed)
+
+          a += bAff
+
+          check:
+            bool(r == bAff)
+            bool(a == r_mixed)
+
+      test(ec, randZ = false, gen = Uniform)
+      test(ec, randZ = false, gen = HighHammingWeight)
+      test(ec, randZ = false, gen = Long01Sequence)
+
+    test "EC " & G1_or_G2 & " mixed addition - adding infinity RHS":
+      proc test(EC: typedesc, randZ: bool, gen: RandomGen) =
+        for _ in 0 ..< Iters:
+          let a = rng.random_point(EC, randZ, gen)
+          var bAff{.noInit.}: ECP_ShortW_Aff[EC.F, EC.G]
+          bAff.setInf()
+
+          var r{.noInit.}: EC
+          r.madd(a, bAff)
+
+          check: bool(r == a)
+
+          r = a
+          r += bAff
+          check: bool(r == a)
+
+      test(ec, randZ = false, gen = Uniform)
+      test(ec, randZ = true, gen = Uniform)
+      test(ec, randZ = false, gen = HighHammingWeight)
+      test(ec, randZ = true, gen = HighHammingWeight)
+      test(ec, randZ = false, gen = Long01Sequence)
+      test(ec, randZ = true, gen = Long01Sequence)

 proc run_EC_subgroups_cofactors_impl*(
       ec: typedesc,
@ -480,7 +569,7 @@ proc run_EC_subgroups_cofactors_impl*(

  suite testSuiteDesc & " - " & $ec & " - [" & $WordBitwidth & "-bit mode]":
    test "Effective cofactor matches accelerated cofactor clearing" & " - " & $ec & " - [" & $WordBitwidth & "-bit mode]":
-      proc test(EC: typedesc, bits: static int, randZ: bool, gen: RandomGen) =
+      proc test(EC: typedesc, randZ: bool, gen: RandomGen) =
        for _ in 0 ..< ItersMul:
          let P = rng.random_point(EC, randZ, gen)
          var cPeff = P
@ -491,17 +580,17 @@ proc run_EC_subgroups_cofactors_impl*(

          check: bool(cPeff == cPfast)

-      test(ec, bits = ec.F.C.getCurveOrderBitwidth(), randZ = false, gen = Uniform)
-      test(ec, bits = ec.F.C.getCurveOrderBitwidth(), randZ = true, gen = Uniform)
-      test(ec, bits = ec.F.C.getCurveOrderBitwidth(), randZ = false, gen = HighHammingWeight)
-      test(ec, bits = ec.F.C.getCurveOrderBitwidth(), randZ = true, gen = HighHammingWeight)
-      test(ec, bits = ec.F.C.getCurveOrderBitwidth(), randZ = false, gen = Long01Sequence)
-      test(ec, bits = ec.F.C.getCurveOrderBitwidth(), randZ = true, gen = Long01Sequence)
+      test(ec, randZ = false, gen = Uniform)
+      test(ec, randZ = true, gen = Uniform)
+      test(ec, randZ = false, gen = HighHammingWeight)
+      test(ec, randZ = true, gen = HighHammingWeight)
+      test(ec, randZ = false, gen = Long01Sequence)
+      test(ec, randZ = true, gen = Long01Sequence)

    test "Subgroup checks and cofactor clearing consistency":
      var inSubgroup = 0
      var offSubgroup = 0
-      proc test(EC: typedesc, bits: static int, randZ: bool, gen: RandomGen) =
+      proc test(EC: typedesc, randZ: bool, gen: RandomGen) =
        stdout.write "    "
        for _ in 0 ..< ItersMul:
          let P = rng.random_point(EC, randZ, gen)
@ -526,12 +615,12 @@ proc run_EC_subgroups_cofactors_impl*(
        
        stdout.write '\n'

-      test(ec, bits = ec.F.C.getCurveOrderBitwidth(), randZ = false, gen = Uniform)
-      test(ec, bits = ec.F.C.getCurveOrderBitwidth(), randZ = true, gen = Uniform)
-      test(ec, bits = ec.F.C.getCurveOrderBitwidth(), randZ = false, gen = HighHammingWeight)
-      test(ec, bits = ec.F.C.getCurveOrderBitwidth(), randZ = true, gen = HighHammingWeight)
-      test(ec, bits = ec.F.C.getCurveOrderBitwidth(), randZ = false, gen = Long01Sequence)
-      test(ec, bits = ec.F.C.getCurveOrderBitwidth(), randZ = true, gen = Long01Sequence)
+      test(ec, randZ = false, gen = Uniform)
+      test(ec, randZ = true, gen = Uniform)
+      test(ec, randZ = false, gen = HighHammingWeight)
+      test(ec, randZ = true, gen = HighHammingWeight)
+      test(ec, randZ = false, gen = Long01Sequence)
+      test(ec, randZ = true, gen = Long01Sequence)
    
      echo "    [SUCCESS] Test finished with ", inSubgroup, " points in ", G1_or_G2, " subgroup and ",
              offSubgroup, " points on curve but not in subgroup (before cofactor clearing)"
@ -699,4 +788,82 @@ proc run_EC_conversion_failures*(
          doAssert bool(Qs[i] == Rs[i])

      test_bn254_snarks_g1(ECP_ShortW_Prj[Fp[BN254_Snarks], G1])
-      test_bn254_snarks_g1(ECP_ShortW_Jac[Fp[BN254_Snarks], G1])
+      test_bn254_snarks_g1(ECP_ShortW_Jac[Fp[BN254_Snarks], G1])
+
+proc run_EC_batch_add_impl*[N: static int](
+       ec: typedesc,
+       numPoints: array[N, int],
+       moduleName: string
+     ) =
+
+  # Random seed for reproducibility
+  var rng: RngState
+  let seed = uint32(getTime().toUnix() and (1'i64 shl 32 - 1)) # unixTime mod 2^32
+  rng.seed(seed)
+  echo "\n------------------------------------------------------\n"
+  echo moduleName, " xoshiro512** seed: ", seed
+
+  when ec.G == G1:
+    const G1_or_G2 = "G1"
+  else:
+    const G1_or_G2 = "G2"
+
+  const testSuiteDesc = "Elliptic curve batch addition for Short Weierstrass form"
+
+  suite testSuiteDesc & " - " & $ec & " - [" & $WordBitwidth & "-bit mode]":
+    for n in numPoints:
+      test $ec & " batch addition (N=" & $n & ")":
+        proc test(EC: typedesc, gen: RandomGen) =
+          var points = newSeq[ECP_ShortW_Aff[EC.F, EC.G]](n)
+          
+          for i in 0 ..< n:
+            points[i] = rng.random_point(ECP_ShortW_Aff[EC.F, EC.G], randZ = false, gen)
+
+          var r_batch{.noinit.}, r_ref{.noInit.}: EC
+
+          r_ref.setInf()
+          for i in 0 ..< n:
+            r_ref += points[i]
+
+          r_batch.sum_batch_vartime(points)
+
+          check: bool(r_batch == r_ref)
+
+
+        test(ec, gen = Uniform)
+        test(ec, gen = HighHammingWeight)
+        test(ec, gen = Long01Sequence)
+
+      test "EC " & G1_or_G2 & " batch addition (N=" & $n & ") - special cases":
+        proc test(EC: typedesc, gen: RandomGen) =
+          var points = newSeq[ECP_ShortW_Aff[EC.F, EC.G]](n)
+
+          let halfN = n div 2
+          
+          for i in 0 ..< halfN:
+            points[i] = rng.random_point(ECP_ShortW_Aff[EC.F, EC.G], randZ = false, gen)
+          
+          for i in halfN ..< n:
+            # The special cases test relies on internal knowledge that we sum(points[i], points[i+n/2]
+            # It should be changed if scheduling change, for example if we sum(points[2*i], points[2*i+1])
+            let c = rng.random_unsafe(3)
+            if c == 0:
+              points[i] = rng.random_point(ECP_ShortW_Aff[EC.F, EC.G], randZ = false, gen)
+            elif c == 1:
+              points[i] = points[i-halfN]
+            else:
+              points[i].neg(points[i-halfN])
+
+          var r_batch{.noinit.}, r_ref{.noInit.}: EC
+
+          r_ref.setInf()
+          for i in 0 ..< n:
+            r_ref += points[i]
+
+          r_batch.sum_batch_vartime(points)
+
+          check: bool(r_batch == r_ref)
+
+        test(ec, gen = Uniform)
+        test(ec, gen = HighHammingWeight)
+        test(ec, gen = Long01Sequence)