From 082cd1deb973618c38930e54f7f7b58f3e8835c7 Mon Sep 17 00:00:00 2001
From: Mamy Ratsimbazafy <mamy_github@numforge.co>
Date: Tue, 7 Feb 2023 16:27:53 +0100
Subject: [PATCH] MSB-to-LSB minimum Hamming Weight Recoding (#219)

* signed recoding

* use recoding
---
 benchmarks/bench_ec_g1.nim                    |  3 +
 benchmarks/bench_ec_g2.nim                    |  3 +
 benchmarks/bench_elliptic_template.nim        | 12 +++
 constantine.nimble                            |  2 +-
 constantine/math/arithmetic/bigints.nim       | 49 ++++++++++-
 .../elliptic/ec_shortweierstrass_jacobian.nim | 10 ++-
 .../ec_shortweierstrass_projective.nim        | 10 ++-
 .../math/pairings/cyclotomic_subgroups.nim    | 81 ++++++++++---------
 constantine/math/pairings/lines_eval.nim      | 62 +++++++-------
 constantine/math/pairings/miller_loops.nim    |  6 +-
 constantine/signatures/bls_signatures.nim     | 55 +++++++------
 .../math/support/ec_reference_scalar_mult.nim | 31 +++++--
 tests/math/t_ec_template.nim                  |  6 +-
 13 files changed, 216 insertions(+), 114 deletions(-)

diff --git a/benchmarks/bench_ec_g1.nim b/benchmarks/bench_ec_g1.nim
index aad7948..38e8825 100644
--- a/benchmarks/bench_ec_g1.nim
+++ b/benchmarks/bench_ec_g1.nim
@@ -57,6 +57,9 @@ proc main() =
     scalarMulUnsafeDoubleAddBench(ECP_ShortW_Prj[Fp[curve], G1], MulIters)
     scalarMulUnsafeDoubleAddBench(ECP_ShortW_Jac[Fp[curve], G1], MulIters)
     separator()
+    scalarMulUnsafeMinHammingWeightRecodingBench(ECP_ShortW_Prj[Fp[curve], G1], MulIters)
+    scalarMulUnsafeMinHammingWeightRecodingBench(ECP_ShortW_Jac[Fp[curve], G1], MulIters)
+    separator()
     scalarMulGenericBench(ECP_ShortW_Prj[Fp[curve], G1], window = 2, MulIters)
     scalarMulGenericBench(ECP_ShortW_Prj[Fp[curve], G1], window = 3, MulIters)
     scalarMulGenericBench(ECP_ShortW_Prj[Fp[curve], G1], window = 4, MulIters)
diff --git a/benchmarks/bench_ec_g2.nim b/benchmarks/bench_ec_g2.nim
index e43099a..dab49ec 100644
--- a/benchmarks/bench_ec_g2.nim
+++ b/benchmarks/bench_ec_g2.nim
@@ -58,6 +58,9 @@ proc main() =
     scalarMulUnsafeDoubleAddBench(ECP_ShortW_Prj[Fp2[curve], G2], MulIters)
     scalarMulUnsafeDoubleAddBench(ECP_ShortW_Jac[Fp2[curve], G2], MulIters)
     separator()
+    scalarMulUnsafeMinHammingWeightRecodingBench(ECP_ShortW_Prj[Fp2[curve], G2], MulIters)
+    scalarMulUnsafeMinHammingWeightRecodingBench(ECP_ShortW_Jac[Fp2[curve], G2], MulIters)
+    separator()
     scalarMulGenericBench(ECP_ShortW_Prj[Fp2[curve], G2], window = 2, MulIters)
     scalarMulGenericBench(ECP_ShortW_Prj[Fp2[curve], G2], window = 3, MulIters)
     scalarMulGenericBench(ECP_ShortW_Prj[Fp2[curve], G2], window = 4, MulIters)
diff --git a/benchmarks/bench_elliptic_template.nim b/benchmarks/bench_elliptic_template.nim
index a4b157d..5051065 100644
--- a/benchmarks/bench_elliptic_template.nim
+++ b/benchmarks/bench_elliptic_template.nim
@@ -143,6 +143,18 @@ proc scalarMulUnsafeDoubleAddBench*(EC: typedesc, iters: int) =
     r = P
     r.unsafe_ECmul_double_add(exponent)
 
+proc scalarMulUnsafeMinHammingWeightRecodingBench*(EC: typedesc, iters: int) =
+  const bits = EC.F.C.getCurveOrderBitwidth()
+
+  var r {.noInit.}: EC
+  var P = rng.random_unsafe(EC) # TODO: clear cofactor
+
+  let exponent = rng.random_unsafe(BigInt[bits])
+
+  bench("EC ScalarMul " & $bits & "-bit " & $EC.G & " (unsafe min Hamming Weight recoding)", EC, iters):
+    r = P
+    r.unsafe_ECmul_minHammingWeight(exponent)
+
 proc multiAddBench*(EC: typedesc, numPoints: int, useBatching: bool, iters: int) =
   var points = newSeq[ECP_ShortW_Aff[EC.F, EC.G]](numPoints)
 
diff --git a/constantine.nimble b/constantine.nimble
index d47ed2d..65d5d5c 100644
--- a/constantine.nimble
+++ b/constantine.nimble
@@ -200,7 +200,7 @@ const testDesc: seq[tuple[path: string, useGMP: bool]] = @[
   # ("tests/math/t_pairing_bls12_377_line_functions.nim", false),
   # ("tests/math/t_pairing_bls12_381_line_functions.nim", false),
   # ("tests/math/t_pairing_mul_fp12_by_lines.nim", false),
-  # ("tests/math/t_pairing_cyclotomic_subgroup.nim", false),
+  ("tests/math/t_pairing_cyclotomic_subgroup.nim", false),
   ("tests/math/t_pairing_bn254_nogami_optate.nim", false),
   ("tests/math/t_pairing_bn254_snarks_optate.nim", false),
   ("tests/math/t_pairing_bls12_377_optate.nim", false),
diff --git a/constantine/math/arithmetic/bigints.nim b/constantine/math/arithmetic/bigints.nim
index 25cb4f3..7483f7a 100644
--- a/constantine/math/arithmetic/bigints.nim
+++ b/constantine/math/arithmetic/bigints.nim
@@ -476,7 +476,7 @@ func invmod*[bits](
        F, M: static BigInt[bits]) =
   ## Compute the modular inverse of ``a`` modulo M
   ## r ≡ F.a⁻¹ (mod M)
-  ## 
+  ##
   ## with F and M known at compile-time
   ##
   ## M MUST be odd, M does not need to be prime.
@@ -491,5 +491,52 @@ func invmod*[bits](r: var BigInt[bits], a, M: BigInt[bits]) =
   one.setOne()
   r.invmod(a, one, M)
 
+# ############################################################
+#
+#                   Recoding
+#
+# ############################################################
+
+iterator recoding_l2r_vartime*(a: BigInt): int8 =
+  ## This is a minimum-Hamming-Weight left-to-right recoding.
+  ## It outputs signed {-1, 0, 1} bits from MSB to LSB
+  ## with minimal Hamming Weight to minimize operations
+  ## in Miller Loop and vartime scalar multiplications
+  ##
+  ## Tagged vartime as it returns an int8
+  ## - Optimal Left-to-Right Binary Signed-Digit Recoding
+  ##   Joye, Yen, 2000
+  ##   https://marcjoye.github.io/papers/JY00sd2r.pdf
+
+  # As the caller is copy-pasted at each yield
+  # we rework the algorithm so that we have a single yield point
+  # We rely on the compiler for loop hoisting and/or loop peeling
+
+  var bi, bi1, ri, ri1, ri2: int8
+
+  var i = a.bits
+  while true:
+    if i == a.bits: # We rely on compiler to hoist this branch out of the loop.
+      ri = 0
+      ri1 = int8 a.bit(a.bits-1)
+      ri2 = int8 a.bit(a.bits-2)
+      bi = 0
+    else:
+      bi = bi1
+      ri = ri1
+      ri1 = ri2
+      if i < 2:
+        ri2 = 0
+      else:
+        ri2 = int8 a.bit(i-2)
+
+    bi1 = (bi + ri1 + ri2) shr 1
+    yield -2*bi + ri + bi1
+
+    if i > 0:
+      i -= 1
+    else:
+      break
+
 {.pop.} # inline
 {.pop.} # raises no exceptions
diff --git a/constantine/math/elliptic/ec_shortweierstrass_jacobian.nim b/constantine/math/elliptic/ec_shortweierstrass_jacobian.nim
index 3145618..9515d36 100644
--- a/constantine/math/elliptic/ec_shortweierstrass_jacobian.nim
+++ b/constantine/math/elliptic/ec_shortweierstrass_jacobian.nim
@@ -630,14 +630,18 @@ func double*(P: var ECP_ShortW_Jac) {.inline.} =
   ## In-place point doubling
   P.double(P)
 
-func diff*(r: var ECP_ShortW_Jac,
-           P, Q: ECP_ShortW_Jac
-     ) {.inline.} =
+func diff*(r: var ECP_ShortW_Jac, P, Q: ECP_ShortW_Jac) {.inline.} =
   ## r = P - Q
   var nQ {.noInit.}: typeof(Q)
   nQ.neg(Q)
   r.sum(P, nQ)
 
+func `-=`*(P: var ECP_ShortW_Jac, Q: ECP_ShortW_Jac) {.inline.} =
+  ## In-place point substraction
+  var nQ {.noInit.}: typeof(Q)
+  nQ.neg(Q)
+  P.sum(P, nQ)
+
 func affine*[F; G](
        aff: var ECP_ShortW_Aff[F, G],
        jac: ECP_ShortW_Jac[F, G]) =
diff --git a/constantine/math/elliptic/ec_shortweierstrass_projective.nim b/constantine/math/elliptic/ec_shortweierstrass_projective.nim
index 2badd19..8e7bb4e 100644
--- a/constantine/math/elliptic/ec_shortweierstrass_projective.nim
+++ b/constantine/math/elliptic/ec_shortweierstrass_projective.nim
@@ -417,15 +417,19 @@ func double*(P: var ECP_ShortW_Prj) {.inline.} =
   ## In-place EC doubling
   P.double(P)
 
-func diff*(r: var ECP_ShortW_Prj,
-              P, Q: ECP_ShortW_Prj
-     ) {.inline.} =
+func diff*(r: var ECP_ShortW_Prj, P, Q: ECP_ShortW_Prj) {.inline.} =
   ## r = P - Q
   ## Can handle r and Q aliasing
   var nQ {.noInit.}: typeof(Q)
   nQ.neg(Q)
   r.sum(P, nQ)
 
+func `-=`*(P: var ECP_ShortW_Prj, Q: ECP_ShortW_Prj) {.inline.} =
+  ## In-place point substraction
+  var nQ {.noInit.}: typeof(Q)
+  nQ.neg(Q)
+  P.sum(P, nQ)
+
 func affine*[F, G](
        aff: var ECP_ShortW_Aff[F, G],
        proj: ECP_ShortW_Prj[F, G]) =
diff --git a/constantine/math/pairings/cyclotomic_subgroups.nim b/constantine/math/pairings/cyclotomic_subgroups.nim
index 179d641..dc614a4 100644
--- a/constantine/math/pairings/cyclotomic_subgroups.nim
+++ b/constantine/math/pairings/cyclotomic_subgroups.nim
@@ -243,7 +243,7 @@ func cyclotomic_square_cube_over_quad(r: var CubicExt, a: CubicExt) =
 
   # From here on, r aliasing with a is only for the first operation
   # and only read/write the exact same coordinates
-  
+
   # 3v₀₀ - 2a₀
   r.c0.c0.diff(v0.c0, a0)
   r.c0.c0.double()
@@ -305,21 +305,21 @@ func cyclotomic_square_quad_over_cube[F](r: var QuadraticExt[F], a: QuadraticExt
   #   v₁ = (b₃, b₂) = (a.c1.c0, a.c0.c2)
   #   v₂ = (b₁, b₅) = (a.c0.c1, a.c1.c2)
   var v0{.noInit.}, v1{.noInit.}, v2{.noInit.}: QuadraticExt[typeof(r.c0.c0)]
-  
+
   template b0: untyped = a.c0.c0
   template b1: untyped = a.c0.c1
   template b2: untyped = a.c0.c2
   template b3: untyped = a.c1.c0
   template b4: untyped = a.c1.c1
   template b5: untyped = a.c1.c2
-  
+
   v0.square_disjoint(b0, b4)
   v1.square_disjoint(b3, b2)
   v2.square_disjoint(b1, b5)
-  
+
   # From here on, r aliasing with a is only for the first operation
   # and only read/write the exact same coordinates
-  
+
   # 3v₀₀ - 2b₀
   r.c0.c0.diff(v0.c0, b0)
   r.c0.c0.double()
@@ -385,28 +385,31 @@ func cycl_sqr_repeated*[FT](r: var FT, a: FT, num: int) {.inline, meter.} =
   for _ in 1 ..< num:
     r.cyclotomic_square()
 
-iterator unpack(scalarByte: byte): bool =
-  yield bool((scalarByte and 0b10000000) shr 7)
-  yield bool((scalarByte and 0b01000000) shr 6)
-  yield bool((scalarByte and 0b00100000) shr 5)
-  yield bool((scalarByte and 0b00010000) shr 4)
-  yield bool((scalarByte and 0b00001000) shr 3)
-  yield bool((scalarByte and 0b00000100) shr 2)
-  yield bool((scalarByte and 0b00000010) shr 1)
-  yield bool( scalarByte and 0b00000001)
+func cyclotomic_exp*[FT](r: var FT, a: FT, exponent: static BigInt, invert: bool) {.meter.} =
+  ## Assumes public exponent
+  var na {.noInit.}: FT
+  na.cyclotomic_inv(a)
 
-func cyclotomic_exp*[FT](r: var FT, a: FT, exponent: BigInt, invert: bool) {.meter.} =
-    var eBytes: array[(exponent.bits+7) div 8, byte]
-    eBytes.marshal(exponent, bigEndian)
+  r.setOne()
+  var init = false
+  for bit in recoding_l2r_vartime(exponent):
+    if init:
+      r.cyclotomic_square()
+    if bit == 1:
+      if not init:
+        r = a
+        init = true
+      else:
+        r *= a
+    elif bit == -1:
+      if not init:
+        r = na
+        init = true
+      else:
+        r *= na
 
-    r.setOne()
-    for b in eBytes:
-      for bit in unpack(b):
-        r.cyclotomic_square()
-        if bit:
-          r *= a
-    if invert:
-      r.cyclotomic_inv()
+  if invert:
+    r.cyclotomic_inv()
 
 func isInCyclotomicSubgroup*[C](a: Fp6[C]): SecretBool =
   ## Check if a ∈ Fpⁿ: a^Φₙ(p) = 1
@@ -491,7 +494,7 @@ func cyclotomic_square_compressed*[F](g: var G2345[F]) =
   #       h₃ = 3(g₄² + g₅²ξ) - 2g₃
   #       h₄ = 3(g₂² + g₃²ξ) - 2g₄
   #       h₅ = 2g₅ + 3 ((g₂+g₃)²-g₂²-g₃²)
-  # (6 sqr)    
+  # (6 sqr)
   #
   # or with quadratic arithmetic
   #   (h₂+h₃u) = 3u(g₄+g₅u)² + 2(g₂-g₃u)
@@ -523,14 +526,14 @@ func recover_g1*[F](g1_num, g1_den: var F, g: G2345[F]) =
   #   g₁ = (g₅²ξ + 3g₄² - 2g₃)/4g₂
   # if g₂ == 0
   #   g₁ = 2g₄g₅/g₃
-  # 
+  #
   # Theorem 3.1, this is well-defined for all
   # g in Gϕₙ \ {1}
   # if g₂=g₃=0 then g₄=g₅=0 as well
   # and g₀ = 1
   let g2NonZero = not g.g2.isZero()
   var t{.noInit.}: F
-  
+
   g1_num = g.g4
   t = g.g5
   t.ccopy(g.g4, g2NonZero)
@@ -543,7 +546,7 @@ func recover_g1*[F](g1_num, g1_den: var F, g: G2345[F]) =
   t.square(g.g5)
   t *= NonResidue
   g1_num.cadd(t, g2NonZero)       # g₅²ξ + 3g₄² - 2g₃ or 2g₄g₅
-  
+
   t.prod(g.g2, 4)
   g1_den = g.g3
   g1_den.ccopy(t, g2NonZero)      # 4g₂ or g₃
@@ -556,7 +559,7 @@ func batch_ratio_g1s*[N: static int, F](
   ## This requires that all g1_den != 0 or all g1_den == 0
   ## which is the case if this is used to implement
   ## exponentiation in cyclotomic subgroup.
-  
+
   # Algorithm: Montgomery's batch inversion
   # - Speeding the Pollard and Elliptic Curve Methods of Factorization
   #   Section 10.3.1
@@ -581,7 +584,7 @@ func batch_ratio_g1s*[N: static int, F](
     dst[i] *= src[i].g1_num
     # Next iteration
     accInv *= src[i].g1_den
-  
+
   dst[0].prod(accInv, src[0].g1_num)
 
 func recover_g0*[F](
@@ -607,12 +610,12 @@ func fromFpk*[Fpkdiv6, Fpk](
        a: Fpk) =
   ## Convert from a sextic extension to the Karabina g₂₃₄₅
   ## representation.
-  
+
   # GT representations isomorphisms
   # ===============================
   #
   # Given a sextic twist, we can express all elements in terms of z = SNR¹ᐟ⁶
-  # 
+  #
   # The canonical direct sextic representation uses coefficients
   #
   #    c₀ + c₁ z + c₂ z² + c₃ z³ + c₄ z⁴ + c₅ z⁵
@@ -699,14 +702,14 @@ func asFpk*[Fpkdiv6, Fpk](
     {.error: "𝔽pᵏᐟ⁶ -> 𝔽pᵏ towering (direct sextic) is not implemented.".}
 
 func cyclotomic_exp_compressed*[N: static int, Fpk](
-       r: var Fpk, a: Fpk, 
+       r: var Fpk, a: Fpk,
        squarings: static array[N, int]) =
   ## Exponentiation on the cyclotomic subgroup
   ## via compressed repeated squarings
   ## Exponentiation is done least-signigicant bits first
   ## `squarings` represents the number of squarings
   ## to do before the next multiplication.
-  
+
   type Fpkdiv6 = typeof(a.c0.c0)
 
   var gs {.noInit.}: array[N, G2345[Fpkdiv6]]
@@ -724,7 +727,7 @@ func cyclotomic_exp_compressed*[N: static int, Fpk](
   var g1s_ratio {.noInit.}: array[N, tuple[g1_num, g1_den: Fpkdiv6]]
   for i in 0 ..< N:
     recover_g1(g1s_ratio[i].g1_num, g1s_ratio[i].g1_den, gs[i])
-  
+
   var g1s {.noInit.}: array[N, Fpkdiv6]
   g1s.batch_ratio_g1s(g1s_ratio)
 
@@ -746,10 +749,10 @@ func cyclotomic_exp_compressed*[N: static int, Fpk](
   ## Exponentiation is done least-signigicant bits first
   ## `squarings` represents the number of squarings
   ## to do before the next multiplication.
-  ## 
+  ##
   ## `accumSquarings` stores the accumulated squarings so far
   ## iff N != 1
-  
+
   type Fpkdiv6 = typeof(a.c0.c0)
 
   var gs {.noInit.}: array[N, G2345[Fpkdiv6]]
@@ -767,7 +770,7 @@ func cyclotomic_exp_compressed*[N: static int, Fpk](
   var g1s_ratio {.noInit.}: array[N, tuple[g1_num, g1_den: Fpkdiv6]]
   for i in 0 ..< N:
     recover_g1(g1s_ratio[i].g1_num, g1s_ratio[i].g1_den, gs[i])
-  
+
   var g1s {.noInit.}: array[N, Fpkdiv6]
   g1s.batch_ratio_g1s(g1s_ratio)
 
diff --git a/constantine/math/pairings/lines_eval.nim b/constantine/math/pairings/lines_eval.nim
index 367582f..6d5d7e8 100644
--- a/constantine/math/pairings/lines_eval.nim
+++ b/constantine/math/pairings/lines_eval.nim
@@ -24,7 +24,7 @@ import
 
 # ############################################################
 #
-#                     Lines functions    
+#                     Lines functions
 #
 # ############################################################
 
@@ -32,7 +32,7 @@ import
 # ===============================
 #
 # Given a sextic twist, we can express all elements in terms of z = SNR¹ᐟ⁶
-# 
+#
 # The canonical direct sextic representation uses coefficients
 #
 #    c₀ + c₁ z + c₂ z² + c₃ z³ + c₄ z⁴ + c₅ z⁵
@@ -84,16 +84,16 @@ type
     ## For a D-twist
     ##   in canonical coordinates over sextic polynomial [1, w, w², w³, w⁴, w⁵]
     ##   when evaluating the line at P(xₚ, yₚ)
-    ##     a.yₚ + b.xₚ w + c w³ 
-    ##     
+    ##     a.yₚ + b.xₚ w + c w³
+    ##
     ##   This translates in 𝔽pᵏ to
     ##     - acb000 (cubic over quadratic towering)
     ##     - a00bc0 (quadratic over cubic towering)
     ## For a M-Twist
     ##   in canonical coordinates over sextic polynomial [1, w, w², w³, w⁴, w⁵]
     ##   when evaluating the line at the twist ψ(P)(xₚw², yₚw³)
-    ##     a.yₚ w³ + b.xₚ w² + c 
-    ## 
+    ##     a.yₚ w³ + b.xₚ w² + c
+    ##
     ##   This translates in 𝔽pᵏ to
     ##     - ca00b0 (cubic over quadratic towering)
     ##     - cb00a0 (quadratic over cubic towering)
@@ -136,7 +136,7 @@ func line_update[F1, F2](line: var Line[F2], P: ECP_ShortW_Aff[F1, G1]) =
   #   a.yₚ + b.xₚ w + c w³
   #
   # M-Twist: line at ψ(P)(xₚw², yₚw³)
-  #   a.yₚ w³ + b.xₚ w² + c 
+  #   a.yₚ w³ + b.xₚ w² + c
   line.a *= P.y
   line.b *= P.x
 
@@ -236,11 +236,11 @@ func line_eval_fused_double[Field](
 
   var A {.noInit.}, B {.noInit.}, C {.noInit.}: Field
   var E {.noInit.}, F {.noInit.}, G {.noInit.}: Field
-  
+
   template H: untyped = line.a
   template I: untyped = line.b
   template J: untyped = line.c
-  
+
   A.prod(T.x, T.y)
   A.div2()          # A = XY/2
   B.square(T.y)     # B = Y²
@@ -383,7 +383,7 @@ func line_add*[F1, F2](
 
 # ############################################################
 #
-#            𝔽pᵏ by line - 𝔽pᵏ quadratic over cubic             
+#            𝔽pᵏ by line - 𝔽pᵏ quadratic over cubic
 #
 # ############################################################
 
@@ -430,7 +430,7 @@ func mul_sparse_by_line_a00bc0*[Fpk, Fpkdiv6](f: var Fpk, l: Line[Fpkdiv6]) =
 
     v1 *= NonResidue
     f.c0.sum(v0, v1)
-  
+
   else: # Lazy reduction
     var
       V0 {.noInit.}: doubleprec(Fpkdiv2)
@@ -438,13 +438,13 @@ func mul_sparse_by_line_a00bc0*[Fpk, Fpkdiv6](f: var Fpk, l: Line[Fpkdiv6]) =
       V  {.noInit.}: doubleprec(Fpkdiv2)
       t  {.noInit.}: Fpkdiv6
       u  {.noInit.}: Fpkdiv2
-  
+
     u.sum(f.c0, f.c1)
     t.sum(l.a, l.b)
     V.mul2x_sparse_by_xy0(u, t, l.c)
     V1.mul2x_sparse_by_xy0(f.c1, l.b, l.c)
     V0.mul2x_sparse_by_x00(f.c0, l.a)
-    
+
     V.diff2xMod(V, V1)
     V.diff2xMod(V, V0)
     f.c1.redc2x(V)
@@ -460,7 +460,7 @@ func prod_x00yz0_x00yz0_into_abcdefghij00*[Fpk, Fpkdiv6](f: var Fpk, l0, l1: Lin
   #
   # A sparse xy0 by xy0 multiplication in 𝔽pᵏᐟ² (cubic) would be
   #   (a0', a1', 0).(b0', b1', 0)
-  #  
+  #
   #   r0' = a0'b0'
   #   r1' = a0'b1 + a1'b0'
   #     or (a0'+b1')(a1'+b0')-a0'a1'-b0'b1'
@@ -476,7 +476,7 @@ func prod_x00yz0_x00yz0_into_abcdefghij00*[Fpk, Fpkdiv6](f: var Fpk, l0, l1: Lin
   #
   # a0b0 = ( x, 0, 0).( x', 0 , 0) = (xx',       0,  0 )
   # a1b1 = ( y, z, 0).( y', z', 0) = (yy', yz'+zy', zz')
-  
+
   # r0 = a0 b0 + ξ a1 b1
   # r1 = a0 b1 + a1 b0                       (Schoolbook)
   #    = (a0 + a1) (b0 + b1) - a0 b0 - a1 b1 (Karatsuba)
@@ -502,11 +502,11 @@ func prod_x00yz0_x00yz0_into_abcdefghij00*[Fpk, Fpkdiv6](f: var Fpk, l0, l1: Lin
   YY.prod2x(l0.b, l1.b)
   ZZ.prod2x(l0.c, l1.c)
 
-  var V{.noInit.}: doublePrec(Fpkdiv6)  
+  var V{.noInit.}: doublePrec(Fpkdiv6)
   var t0{.noInit.}, t1{.noInit.}: Fpkdiv6
 
   # r0 = (xx'+ξzz', yy', yz'+zy')
-  # r00 = xx'+ξzz'' 
+  # r00 = xx'+ξzz''
   # r01 = yy'
   # r02 = yz'+zy' = (y+z)(y'+z') - yy' - zz'
   V.prod2x(ZZ, NonResidue)
@@ -592,13 +592,13 @@ func mul_sparse_by_line_cb00a0*[Fpk, Fpkdiv6](f: var Fpk, l: Line[Fpkdiv6]) =
       V  {.noInit.}: doubleprec(Fpkdiv2)
       t  {.noInit.}: Fpkdiv6
       u  {.noInit.}: Fpkdiv2
-  
+
     u.sum(f.c0, f.c1)
     t.sum(l.b, l.a)
     V.mul2x_sparse_by_xy0(u, l.c, t)
     V1.mul2x_sparse_by_0y0(f.c1, l.a)
     V0.mul2x_sparse_by_xy0(f.c0, l.c, l.b)
-    
+
     V.diff2xMod(V, V1)
     V.diff2xMod(V, V0)
     f.c1.redc2x(V)
@@ -614,7 +614,7 @@ func prod_zy00x0_zy00x0_into_abcdef00ghij*[Fpk, Fpkdiv6](f: var Fpk, l0, l1: Lin
   #
   # A sparse xy0 by xy0 multiplication in 𝔽pᵏᐟ² (cubic) would be
   #   (a0', a1', 0).(b0', b1', 0)
-  #  
+  #
   #   r0' = a0'b0'
   #   r1' = a0'b1 + a1'b0'
   #     or (a0'+b1')(a1'+b0')-a0'a1'-b0'b1'
@@ -630,7 +630,7 @@ func prod_zy00x0_zy00x0_into_abcdef00ghij*[Fpk, Fpkdiv6](f: var Fpk, l0, l1: Lin
   #
   # a0b0 = ( z, y, 0).(z', y', 0) = (zz', zy'+yz', yy')
   # a1b1 = ( 0, x, 0).( 0, x', 0) = (  0,       0, xx')
-  
+
   # r0 = a0 b0 + ξ a1 b1
   # r1 = a0 b1 + a1 b0                       (Schoolbook)
   #    = (a0 + a1) (b0 + b1) - a0 b0 - a1 b1 (Karatsuba)
@@ -656,7 +656,7 @@ func prod_zy00x0_zy00x0_into_abcdef00ghij*[Fpk, Fpkdiv6](f: var Fpk, l0, l1: Lin
   YY.prod2x(l0.b, l1.b)
   ZZ.prod2x(l0.c, l1.c)
 
-  var V{.noInit.}: doublePrec(Fpkdiv6)  
+  var V{.noInit.}: doublePrec(Fpkdiv6)
   var t0{.noInit.}, t1{.noInit.}: Fpkdiv6
 
   # r0 = (zz'+ξxx', zy'+z'y, yy')
@@ -698,7 +698,7 @@ func prod_zy00x0_zy00x0_into_abcdef00ghij*[Fpk, Fpkdiv6](f: var Fpk, l0, l1: Lin
 
 # ############################################################
 #
-#          𝔽pᵏ by line - 𝔽pᵏ cubic over quadratic         
+#          𝔽pᵏ by line - 𝔽pᵏ cubic over quadratic
 #
 # ############################################################
 
@@ -815,7 +815,7 @@ func prod_xzy000_xzy000_into_abcdefghij00*[Fpk, Fpkdiv6](f: var Fpk, l0, l1: Lin
   #    = v1
   #
   # r1 is computed in 3 extra muls in the base field 𝔽pᵏᐟ⁶ (Karatsuba product in quadratic field)
-  # 
+  #
   # If we dive deeper:
   # r1 = a0b1 + a1b0 = (xy'+yx', zy'+yz')      (Schoolbook - 4 muls)
   # r10 = zy'+yz' = (x+y)(x'+y') - xx' - yy'
@@ -835,7 +835,7 @@ func prod_xzy000_xzy000_into_abcdefghij00*[Fpk, Fpkdiv6](f: var Fpk, l0, l1: Lin
   YY.prod2x(l0.b, l1.b)
   ZZ.prod2x(l0.c, l1.c)
 
-  var V{.noInit.}: doublePrec(Fpkdiv6)  
+  var V{.noInit.}: doublePrec(Fpkdiv6)
   var t0{.noInit.}, t1{.noInit.}: Fpkdiv6
 
   # r0 = a0 b0 = (x, z).(x', z')
@@ -844,7 +844,7 @@ func prod_xzy000_xzy000_into_abcdefghij00*[Fpk, Fpkdiv6](f: var Fpk, l0, l1: Lin
   V.prod2x(ZZ, NonResidue)
   V.sum2xMod(V, XX)
   f.c0.c0.redc2x(V)
-  
+
   t0.sum(l0.a, l0.c)
   t1.sum(l1.a, l1.c)
   V.prod2x(t0, t1)
@@ -987,7 +987,7 @@ func prod_zx00y0_zx00y0_into_abcd00efghij*[Fpk, Fpkdiv6](f: var Fpk, l0, l1: Lin
   #    = (a0 + a2) * (b0 + b2) - v0 - v2
   #
   # r2 is computed in 3 extra muls in the base field 𝔽pᵏᐟ⁶ (Karatsuba product in quadratic field)
-  # 
+  #
   # If we dive deeper:
   # r2 = a0b2 + a2b0 = (zy'+yz', xy'+x'y)      (Schoolbook - 4 muls)
   # r20 = zy'+z'y = (z+y)(z'+y') - zz' - yy'
@@ -1007,7 +1007,7 @@ func prod_zx00y0_zx00y0_into_abcd00efghij*[Fpk, Fpkdiv6](f: var Fpk, l0, l1: Lin
   YY.prod2x(l0.b, l1.b)
   ZZ.prod2x(l0.c, l1.c)
 
-  var V{.noInit.}: doublePrec(Fpkdiv6)  
+  var V{.noInit.}: doublePrec(Fpkdiv6)
   var t0{.noInit.}, t1{.noInit.}: Fpkdiv6
 
   # r0 = a0 b0 = (z, x).(z', x')
@@ -1016,7 +1016,7 @@ func prod_zx00y0_zx00y0_into_abcd00efghij*[Fpk, Fpkdiv6](f: var Fpk, l0, l1: Lin
   V.prod2x(XX, NonResidue)
   V.sum2xMod(V, ZZ)
   f.c0.c0.redc2x(V)
-  
+
   t0.sum(l0.c, l0.a)
   t1.sum(l1.c, l1.a)
   V.prod2x(t0, t1)
@@ -1046,7 +1046,7 @@ func prod_zx00y0_zx00y0_into_abcd00efghij*[Fpk, Fpkdiv6](f: var Fpk, l0, l1: Lin
 
 # ############################################################
 #
-#                    Sparse multiplication         
+#                    Sparse multiplication
 #
 # ############################################################
 
@@ -1397,3 +1397,5 @@ func mul_by_2_lines*[Fpk, Fpkdiv6](f: var Fpk, line0, line1: Line[Fpkdiv6]) {.in
   var t{.noInit.}: Fpk
   t.prod_from_2_lines(line0, line1)
   f.mul_by_prod_of_2_lines(t)
+
+# func asFpk
\ No newline at end of file
diff --git a/constantine/math/pairings/miller_loops.nim b/constantine/math/pairings/miller_loops.nim
index fae3968..3616b07 100644
--- a/constantine/math/pairings/miller_loops.nim
+++ b/constantine/math/pairings/miller_loops.nim
@@ -45,7 +45,8 @@ func basicMillerLoop*[FT, F1, F2](
   var u3 = ate_param
   u3 *= 3
   for i in countdown(u3.bits - 2, 1):
-    f.square()
+    if i != u3.bits - 2:
+      f.square()
     line.line_double(T, P)
     f.mul_by_line(line)
 
@@ -320,7 +321,8 @@ func basicMillerLoop*[FT, F1, F2](
   var u3 = ate_param
   u3 *= 3
   for i in countdown(u3.bits - 2, 1):
-    f.square()
+    if i != u3.bits - 2:
+      f.square()
     f.double_jToN(j=0, line0, line1, Ts, Ps, N)
 
     let naf = u3.bit(i).int8 - u.bit(i).int8 # This can throw exception
diff --git a/constantine/signatures/bls_signatures.nim b/constantine/signatures/bls_signatures.nim
index 460bc91..a44aa51 100644
--- a/constantine/signatures/bls_signatures.nim
+++ b/constantine/signatures/bls_signatures.nim
@@ -8,6 +8,7 @@
 
 import
     ../math/[ec_shortweierstrass, extension_fields],
+    ../math/io/io_bigints,
     ../math/elliptic/ec_shortweierstrass_batch_ops,
     ../math/pairings/[pairings_generic, miller_accumulators],
     ../math/constants/zoo_generators,
@@ -365,41 +366,30 @@ func init*[T0, T1: char|byte](
 
   H.hash(ctx.secureBlinding, secureRandomBytes, accumSepTag)
 
-iterator unpack(scalarByte: byte): bool =
-  yield bool((scalarByte and 0b10000000) shr 7)
-  yield bool((scalarByte and 0b01000000) shr 6)
-  yield bool((scalarByte and 0b00100000) shr 5)
-  yield bool((scalarByte and 0b00010000) shr 4)
-  yield bool((scalarByte and 0b00001000) shr 3)
-  yield bool((scalarByte and 0b00000100) shr 2)
-  yield bool((scalarByte and 0b00000010) shr 1)
-  yield bool( scalarByte and 0b00000001)
-
-func scalarMul_doubleAdd_vartime[EC](
+func scalarMul_minHammingWeight_vartime[EC](
        P: var EC,
-       scalarCanonical: openArray[byte],
+       scalar: BigInt,
      ) =
   ## **Variable-time** Elliptic Curve Scalar Multiplication
   ##
   ##   P <- [k] P
   ##
-  ## This uses the double-and-add algorithm
-  ## This is UNSAFE to use with secret data and is only intended for signature verification
-  ## to multiply by random blinding scalars.
+  ## This uses an online recoding with minimum Hamming Weight
+  ## (which is not NAF, NAF is least-significant bit to most)
   ## Due to those scalars being 64-bit, window-method or endomorphism acceleration are slower
   ## than double-and-add.
   ##
   ## This is highly VULNERABLE to timing attacks and power analysis attacks.
-  var t0{.noInit.}, t1{.noInit.}: typeof(P)
+  ## For our usecase, scaling with a random number not in attacker control,
+  ## leaking the scalar bits is not an issue.
+  var t0{.noInit.}: typeof(P)
   t0.setInf()
-  t1.setInf()
-  for scalarByte in scalarCanonical:
-    for bit in unpack(scalarByte):
-      t1.double(t0)
-      if bit:
-        t0.sum(t1, P)
-      else:
-        t0 = t1
+  for bit in recoding_l2r_vartime(scalar):
+    t0.double()
+    if bit == 1:
+      t0 += P
+    elif bit == -1:
+      t0 -= P
   P = t0
 
 func update*[T: char|byte, Pubkey, Sig: ECP_ShortW_Aff](
@@ -434,7 +424,12 @@ func update*[T: char|byte, Pubkey, Sig: ECP_ShortW_Aff](
   # we only use a 1..<2^64 random blinding factor.
   # We assume that the attacker cannot resubmit 2^64 times
   # forged public keys and signatures.
+  #
   # Discussion https://ethresear.ch/t/fast-verification-of-multiple-bls-signatures/5407
+  # See also
+  # - Faster batch forgery identification
+  #   Daniel J. Bernstein, Jeroen Doumen, Tanja Lange, and Jan-Jaap Oosterwijk, 2012
+  #   https://eprint.iacr.org/2012/549
 
   # We only use the first 8 bytes for blinding
   # but use the full 32 bytes to derive new random scalar
@@ -459,8 +454,10 @@ func update*[T: char|byte, Pubkey, Sig: ECP_ShortW_Aff](
     pkG1_jac.fromAffine(pubkey)
     sigG2_jac.fromAffine(signature)
 
-    pkG1_jac.scalarMul_doubleAdd_vartime(ctx.secureBlinding.toOpenArray(0, 7))
-    sigG2_jac.scalarMul_doubleAdd_vartime(ctx.secureBlinding.toOpenArray(0, 7))
+    var randFactor{.noInit.}: BigInt[64]
+    randFactor.unmarshal(ctx.secureBlinding.toOpenArray(0, 7), bigEndian)
+    pkG1_jac.scalarMul_minHammingWeight_vartime(randFactor)
+    sigG2_jac.scalarMul_minHammingWeight_vartime(randFactor)
 
     if ctx.aggSigOnce == false:
       ctx.aggSig = sigG2_jac
@@ -493,8 +490,10 @@ func update*[T: char|byte, Pubkey, Sig: ECP_ShortW_Aff](
 
     sigG1_jac.fromAffine(signature)
 
-    hmsgG1_jac.scalarMul_doubleAdd_vartime(ctx.secureBlinding.toOpenArray(0, 7))
-    sigG1_jac.scalarMul_doubleAdd_vartime(ctx.secureBlinding.toOpenArray(0, 7))
+    var randFactor{.noInit.}: BigInt[64]
+    randFactor.unmarshal(ctx.secureBlinding.toOpenArray(0, 7), bigEndian)
+    hmsgG1_jac.scalarMul_minHammingWeight_vartime(randFactor)
+    sigG1_jac.scalarMul_minHammingWeight_vartime(randFactor)
 
     if ctx.aggSigOnce == false:
       ctx.aggSig = sigG1_jac
diff --git a/tests/math/support/ec_reference_scalar_mult.nim b/tests/math/support/ec_reference_scalar_mult.nim
index bae7185..e73816f 100644
--- a/tests/math/support/ec_reference_scalar_mult.nim
+++ b/tests/math/support/ec_reference_scalar_mult.nim
@@ -39,14 +39,33 @@ func unsafe_ECmul_double_add*[EC](
   var scalarCanonical: array[(scalar.bits+7) div 8, byte]
   scalarCanonical.marshal(scalar, bigEndian)
 
-  var t0{.noInit.}, t1{.noInit.}: typeof(P)
+  var t0: typeof(P)
   t0.setInf()
-  t1.setInf()
   for scalarByte in scalarCanonical:
     for bit in unpack(scalarByte):
-      t1.double(t0)
+      t0.double()
       if bit:
-        t0.sum(t1, P)
-      else:
-        t0 = t1
+        t0 += P
   P = t0
+
+func unsafe_ECmul_minHammingWeight*[EC](
+       P: var EC,
+       scalar: BigInt) =
+  ## **Unsafe** Elliptic Curve Scalar Multiplication
+  ##
+  ##   P <- [k] P
+  ##
+  ## This uses an online recoding with minimum Hamming Weight
+  ## (which is not NAF, NAF is least-significant bit to most)
+  ## This is UNSAFE to use in production and only intended for testing purposes.
+  ##
+  ## This is highly VULNERABLE to timing attacks and power analysis attacks
+  var t0{.noInit.}: typeof(P)
+  t0.setInf()
+  for bit in recoding_l2r_vartime(scalar):
+    t0.double()
+    if bit == 1:
+      t0 += P
+    elif bit == -1:
+      t0 -= P
+  P = t0
\ No newline at end of file
diff --git a/tests/math/t_ec_template.nim b/tests/math/t_ec_template.nim
index 8faf79a..b75d6b4 100644
--- a/tests/math/t_ec_template.nim
+++ b/tests/math/t_ec_template.nim
@@ -421,11 +421,15 @@ proc run_EC_mul_vs_ref_impl*(
           var
             impl = a
             reference = a
+            refMinWeight = a
 
           impl.scalarMulGeneric(exponent)
           reference.unsafe_ECmul_double_add(exponent)
+          refMinWeight.unsafe_ECmul_minHammingWeight(exponent)
 
-          check: bool(impl == reference)
+          check:
+            bool(impl == reference)
+            bool(impl == refMinWeight)
 
       test(ec, bits = ec.F.C.getCurveOrderBitwidth(), randZ = false, gen = Uniform)
       test(ec, bits = ec.F.C.getCurveOrderBitwidth(), randZ = true, gen = Uniform)