MSB-to-LSB minimum Hamming Weight Recoding (#219)

* signed recoding

* use recoding
Mamy Ratsimbazafy 2023-02-07 16:27:53 +01:00 committed by GitHub
parent 7c5421ffdc
commit 082cd1deb9
13 changed files with 216 additions and 114 deletions

View File

@ -57,6 +57,9 @@ proc main() =
scalarMulUnsafeDoubleAddBench(ECP_ShortW_Prj[Fp[curve], G1], MulIters)
scalarMulUnsafeDoubleAddBench(ECP_ShortW_Jac[Fp[curve], G1], MulIters)
separator()
scalarMulUnsafeMinHammingWeightRecodingBench(ECP_ShortW_Prj[Fp[curve], G1], MulIters)
scalarMulUnsafeMinHammingWeightRecodingBench(ECP_ShortW_Jac[Fp[curve], G1], MulIters)
separator()
scalarMulGenericBench(ECP_ShortW_Prj[Fp[curve], G1], window = 2, MulIters)
scalarMulGenericBench(ECP_ShortW_Prj[Fp[curve], G1], window = 3, MulIters)
scalarMulGenericBench(ECP_ShortW_Prj[Fp[curve], G1], window = 4, MulIters)

View File

@ -58,6 +58,9 @@ proc main() =
scalarMulUnsafeDoubleAddBench(ECP_ShortW_Prj[Fp2[curve], G2], MulIters)
scalarMulUnsafeDoubleAddBench(ECP_ShortW_Jac[Fp2[curve], G2], MulIters)
separator()
scalarMulUnsafeMinHammingWeightRecodingBench(ECP_ShortW_Prj[Fp2[curve], G2], MulIters)
scalarMulUnsafeMinHammingWeightRecodingBench(ECP_ShortW_Jac[Fp2[curve], G2], MulIters)
separator()
scalarMulGenericBench(ECP_ShortW_Prj[Fp2[curve], G2], window = 2, MulIters)
scalarMulGenericBench(ECP_ShortW_Prj[Fp2[curve], G2], window = 3, MulIters)
scalarMulGenericBench(ECP_ShortW_Prj[Fp2[curve], G2], window = 4, MulIters)

View File

@ -143,6 +143,18 @@ proc scalarMulUnsafeDoubleAddBench*(EC: typedesc, iters: int) =
r = P
r.unsafe_ECmul_double_add(exponent)
proc scalarMulUnsafeMinHammingWeightRecodingBench*(EC: typedesc, iters: int) =
const bits = EC.F.C.getCurveOrderBitwidth()
var r {.noInit.}: EC
var P = rng.random_unsafe(EC) # TODO: clear cofactor
let exponent = rng.random_unsafe(BigInt[bits])
bench("EC ScalarMul " & $bits & "-bit " & $EC.G & " (unsafe min Hamming Weight recoding)", EC, iters):
r = P
r.unsafe_ECmul_minHammingWeight(exponent)
proc multiAddBench*(EC: typedesc, numPoints: int, useBatching: bool, iters: int) =
var points = newSeq[ECP_ShortW_Aff[EC.F, EC.G]](numPoints)

View File

@ -200,7 +200,7 @@ const testDesc: seq[tuple[path: string, useGMP: bool]] = @[
# ("tests/math/t_pairing_bls12_377_line_functions.nim", false),
# ("tests/math/t_pairing_bls12_381_line_functions.nim", false),
# ("tests/math/t_pairing_mul_fp12_by_lines.nim", false),
# ("tests/math/t_pairing_cyclotomic_subgroup.nim", false),
("tests/math/t_pairing_cyclotomic_subgroup.nim", false),
("tests/math/t_pairing_bn254_nogami_optate.nim", false),
("tests/math/t_pairing_bn254_snarks_optate.nim", false),
("tests/math/t_pairing_bls12_377_optate.nim", false),

View File

@ -476,7 +476,7 @@ func invmod*[bits](
F, M: static BigInt[bits]) =
## Compute the modular inverse of ``a`` modulo M
## r ≡ F.a⁻¹ (mod M)
##
## with F and M known at compile-time
##
## M MUST be odd, M does not need to be prime.
@ -491,5 +491,52 @@ func invmod*[bits](r: var BigInt[bits], a, M: BigInt[bits]) =
one.setOne()
r.invmod(a, one, M)
# ############################################################
#
# Recoding
#
# ############################################################
iterator recoding_l2r_vartime*(a: BigInt): int8 =
## This is a minimum-Hamming-Weight left-to-right recoding.
## It outputs signed {-1, 0, 1} bits from MSB to LSB
## with minimal Hamming Weight to minimize operations
## in Miller Loop and vartime scalar multiplications
##
## Tagged vartime as it returns an int8
## - Optimal Left-to-Right Binary Signed-Digit Recoding
## Joye, Yen, 2000
## https://marcjoye.github.io/papers/JY00sd2r.pdf
# As the caller is copy-pasted at each yield
# we rework the algorithm so that we have a single yield point
# We rely on the compiler for loop hoisting and/or loop peeling
var bi, bi1, ri, ri1, ri2: int8
var i = a.bits
while true:
if i == a.bits: # We rely on the compiler to hoist this branch out of the loop.
ri = 0
ri1 = int8 a.bit(a.bits-1)
ri2 = int8 a.bit(a.bits-2)
bi = 0
else:
bi = bi1
ri = ri1
ri1 = ri2
if i < 2:
ri2 = 0
else:
ri2 = int8 a.bit(i-2)
bi1 = (bi + ri1 + ri2) shr 1
yield -2*bi + ri + bi1
if i > 0:
i -= 1
else:
break
{.pop.} # inline
{.pop.} # raises no exceptions
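For illustration, a standalone Nim sketch of the same Joye-Yen left-to-right signed-digit recoding on a plain uint64 (recodeSignedL2R and bitAt are hypothetical names, not Constantine APIs): for a = 7 = 0b111 (Hamming weight 3) and bits = 3 it yields [1, 0, 0, -1], i.e. 7 = 8 - 1 (weight 2).

proc recodeSignedL2R(a: uint64, bits: int): seq[int8] =
  ## Joye-Yen optimal left-to-right signed-digit recoding (illustration only).
  ## Emits bits+1 digits in {-1, 0, 1}, most-significant digit first.
  proc bitAt(x: uint64, i: int): int8 =
    # Out-of-range digits are 0: r₋₂ = r₋₁ = 0 and the virtual top digit r_bits = 0
    if i < 0 or i >= bits: result = 0'i8
    else: result = int8((x shr i) and 1)
  var b = 0'i8                                             # b_bits = 0
  for i in countdown(bits, 0):
    let bNext = (b + bitAt(a, i-1) + bitAt(a, i-2)) shr 1  # b_{i-1} = ⌊(b_i + r_{i-1} + r_{i-2})/2⌋
    result.add(-2'i8*b + bitAt(a, i) + bNext)              # digit at position i: -2b_i + r_i + b_{i-1}
    b = bNext

when isMainModule:
  doAssert recodeSignedL2R(7'u64, 3) == @[1'i8, 0, 0, -1]  # 7 = 8 - 1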

View File

@ -630,14 +630,18 @@ func double*(P: var ECP_ShortW_Jac) {.inline.} =
## In-place point doubling
P.double(P)
func diff*(r: var ECP_ShortW_Jac,
P, Q: ECP_ShortW_Jac
) {.inline.} =
func diff*(r: var ECP_ShortW_Jac, P, Q: ECP_ShortW_Jac) {.inline.} =
## r = P - Q
var nQ {.noInit.}: typeof(Q)
nQ.neg(Q)
r.sum(P, nQ)
func `-=`*(P: var ECP_ShortW_Jac, Q: ECP_ShortW_Jac) {.inline.} =
## In-place point subtraction
var nQ {.noInit.}: typeof(Q)
nQ.neg(Q)
P.sum(P, nQ)
func affine*[F; G](
aff: var ECP_ShortW_Aff[F, G],
jac: ECP_ShortW_Jac[F, G]) =

View File

@ -417,15 +417,19 @@ func double*(P: var ECP_ShortW_Prj) {.inline.} =
## In-place EC doubling
P.double(P)
func diff*(r: var ECP_ShortW_Prj,
P, Q: ECP_ShortW_Prj
) {.inline.} =
func diff*(r: var ECP_ShortW_Prj, P, Q: ECP_ShortW_Prj) {.inline.} =
## r = P - Q
## Can handle r and Q aliasing
var nQ {.noInit.}: typeof(Q)
nQ.neg(Q)
r.sum(P, nQ)
func `-=`*(P: var ECP_ShortW_Prj, Q: ECP_ShortW_Prj) {.inline.} =
## In-place point subtraction
var nQ {.noInit.}: typeof(Q)
nQ.neg(Q)
P.sum(P, nQ)
func affine*[F, G](
aff: var ECP_ShortW_Aff[F, G],
proj: ECP_ShortW_Prj[F, G]) =

View File

@ -243,7 +243,7 @@ func cyclotomic_square_cube_over_quad(r: var CubicExt, a: CubicExt) =
# From here on, r aliasing with a is only for the first operation
# and only read/write the exact same coordinates
# 3v₀₀ - 2a₀
r.c0.c0.diff(v0.c0, a0)
r.c0.c0.double()
@ -305,21 +305,21 @@ func cyclotomic_square_quad_over_cube[F](r: var QuadraticExt[F], a: QuadraticExt
# v₁ = (b₃, b₂) = (a.c1.c0, a.c0.c2)
# v₂ = (b₁, b₅) = (a.c0.c1, a.c1.c2)
var v0{.noInit.}, v1{.noInit.}, v2{.noInit.}: QuadraticExt[typeof(r.c0.c0)]
template b0: untyped = a.c0.c0
template b1: untyped = a.c0.c1
template b2: untyped = a.c0.c2
template b3: untyped = a.c1.c0
template b4: untyped = a.c1.c1
template b5: untyped = a.c1.c2
v0.square_disjoint(b0, b4)
v1.square_disjoint(b3, b2)
v2.square_disjoint(b1, b5)
# From here on, r aliasing with a is only for the first operation
# and only read/write the exact same coordinates
# 3v₀₀ - 2b₀
r.c0.c0.diff(v0.c0, b0)
r.c0.c0.double()
@ -385,28 +385,31 @@ func cycl_sqr_repeated*[FT](r: var FT, a: FT, num: int) {.inline, meter.} =
for _ in 1 ..< num:
r.cyclotomic_square()
iterator unpack(scalarByte: byte): bool =
yield bool((scalarByte and 0b10000000) shr 7)
yield bool((scalarByte and 0b01000000) shr 6)
yield bool((scalarByte and 0b00100000) shr 5)
yield bool((scalarByte and 0b00010000) shr 4)
yield bool((scalarByte and 0b00001000) shr 3)
yield bool((scalarByte and 0b00000100) shr 2)
yield bool((scalarByte and 0b00000010) shr 1)
yield bool( scalarByte and 0b00000001)
func cyclotomic_exp*[FT](r: var FT, a: FT, exponent: static BigInt, invert: bool) {.meter.} =
## Assumes public exponent
var na {.noInit.}: FT
na.cyclotomic_inv(a)
func cyclotomic_exp*[FT](r: var FT, a: FT, exponent: BigInt, invert: bool) {.meter.} =
var eBytes: array[(exponent.bits+7) div 8, byte]
eBytes.marshal(exponent, bigEndian)
r.setOne()
var init = false
for bit in recoding_l2r_vartime(exponent):
if init:
r.cyclotomic_square()
if bit == 1:
if not init:
r = a
init = true
else:
r *= a
elif bit == -1:
if not init:
r = na
init = true
else:
r *= na
r.setOne()
for b in eBytes:
for bit in unpack(b):
r.cyclotomic_square()
if bit:
r *= a
if invert:
r.cyclotomic_inv()
func isInCyclotomicSubgroup*[C](a: Fp6[C]): SecretBool =
## Check if a ∈ Fpⁿ: a^Φₙ(p) = 1
@ -491,7 +494,7 @@ func cyclotomic_square_compressed*[F](g: var G2345[F]) =
# h₃ = 3(g₄² + g₅²ξ) - 2g₃
# h₄ = 3(g₂² + g₃²ξ) - 2g₄
# h₅ = 2g₅ + 3 ((g₂+g₃)²-g₂²-g₃²)
# (6 sqr)
#
# or with quadratic arithmetic
# (h₂+h₃u) = 3u(g₄+g₅u)² + 2(g₂-g₃u)
@ -523,14 +526,14 @@ func recover_g1*[F](g1_num, g1_den: var F, g: G2345[F]) =
# g₁ = (g₅²ξ + 3g₄² - 2g₃)/4g₂
# if g₂ == 0
# g₁ = 2g₄g₅/g₃
#
# Theorem 3.1, this is well-defined for all
# g in Gϕₙ \ {1}
# if g₂=g₃=0 then g₄=g₅=0 as well
# and g₀ = 1
let g2NonZero = not g.g2.isZero()
var t{.noInit.}: F
g1_num = g.g4
t = g.g5
t.ccopy(g.g4, g2NonZero)
@ -543,7 +546,7 @@ func recover_g1*[F](g1_num, g1_den: var F, g: G2345[F]) =
t.square(g.g5)
t *= NonResidue
g1_num.cadd(t, g2NonZero) # g₅²ξ + 3g₄² - 2g₃ or 2g₄g₅
t.prod(g.g2, 4)
g1_den = g.g3
g1_den.ccopy(t, g2NonZero) # 4g₂ or g₃
@ -556,7 +559,7 @@ func batch_ratio_g1s*[N: static int, F](
## This requires that all g1_den != 0 or all g1_den == 0
## which is the case if this is used to implement
## exponentiation in cyclotomic subgroup.
# Algorithm: Montgomery's batch inversion
# - Speeding the Pollard and Elliptic Curve Methods of Factorization
# Section 10.3.1
@ -581,7 +584,7 @@ func batch_ratio_g1s*[N: static int, F](
dst[i] *= src[i].g1_num
# Next iteration
accInv *= src[i].g1_den
dst[0].prod(accInv, src[0].g1_num)
func recover_g0*[F](
@ -607,12 +610,12 @@ func fromFpk*[Fpkdiv6, Fpk](
a: Fpk) =
## Convert from a sextic extension to the Karabina g₂₃₄₅
## representation.
# GT representations isomorphisms
# ===============================
#
# Given a sextic twist, we can express all elements in terms of z = SNR¹ᐟ⁶
#
# The canonical direct sextic representation uses coefficients
#
# c₀ + c₁ z + c₂ z² + c₃ z³ + c₄ z⁴ + c₅ z⁵
@ -699,14 +702,14 @@ func asFpk*[Fpkdiv6, Fpk](
{.error: "𝔽pᵏᐟ⁶ -> 𝔽pᵏ towering (direct sextic) is not implemented.".}
func cyclotomic_exp_compressed*[N: static int, Fpk](
r: var Fpk, a: Fpk,
squarings: static array[N, int]) =
## Exponentiation on the cyclotomic subgroup
## via compressed repeated squarings
## Exponentiation is done least-significant bits first
## `squarings` represents the number of squarings
## to do before the next multiplication.
type Fpkdiv6 = typeof(a.c0.c0)
var gs {.noInit.}: array[N, G2345[Fpkdiv6]]
@ -724,7 +727,7 @@ func cyclotomic_exp_compressed*[N: static int, Fpk](
var g1s_ratio {.noInit.}: array[N, tuple[g1_num, g1_den: Fpkdiv6]]
for i in 0 ..< N:
recover_g1(g1s_ratio[i].g1_num, g1s_ratio[i].g1_den, gs[i])
var g1s {.noInit.}: array[N, Fpkdiv6]
g1s.batch_ratio_g1s(g1s_ratio)
@ -746,10 +749,10 @@ func cyclotomic_exp_compressed*[N: static int, Fpk](
## Exponentiation is done least-significant bits first
## `squarings` represents the number of squarings
## to do before the next multiplication.
##
## `accumSquarings` stores the accumulated squarings so far
## iff N != 1
type Fpkdiv6 = typeof(a.c0.c0)
var gs {.noInit.}: array[N, G2345[Fpkdiv6]]
@ -767,7 +770,7 @@ func cyclotomic_exp_compressed*[N: static int, Fpk](
var g1s_ratio {.noInit.}: array[N, tuple[g1_num, g1_den: Fpkdiv6]]
for i in 0 ..< N:
recover_g1(g1s_ratio[i].g1_num, g1s_ratio[i].g1_den, gs[i])
var g1s {.noInit.}: array[N, Fpkdiv6]
g1s.batch_ratio_g1s(g1s_ratio)

View File

@ -24,7 +24,7 @@ import
# ############################################################
#
# Lines functions
#
# ############################################################
@ -32,7 +32,7 @@ import
# ===============================
#
# Given a sextic twist, we can express all elements in terms of z = SNR¹ᐟ⁶
#
# The canonical direct sextic representation uses coefficients
#
# c₀ + c₁ z + c₂ z² + c₃ z³ + c₄ z⁴ + c₅ z⁵
@ -84,16 +84,16 @@ type
## For a D-twist
## in canonical coordinates over sextic polynomial [1, w, w², w³, w⁴, w⁵]
## when evaluating the line at P(xₚ, yₚ)
## a.yₚ + b.xₚ w + c w³
##
## This translates in 𝔽pᵏ to
## - acb000 (cubic over quadratic towering)
## - a00bc0 (quadratic over cubic towering)
## For a M-Twist
## in canonical coordinates over sextic polynomial [1, w, w², w³, w⁴, w⁵]
## when evaluating the line at the twist ψ(P)(xₚw², yₚw³)
## a.yₚ w³ + b.xₚ w² + c
##
## This translates in 𝔽pᵏ to
## - ca00b0 (cubic over quadratic towering)
## - cb00a0 (quadratic over cubic towering)
@ -136,7 +136,7 @@ func line_update[F1, F2](line: var Line[F2], P: ECP_ShortW_Aff[F1, G1]) =
# a.yₚ + b.xₚ w + c w³
#
# M-Twist: line at ψ(P)(xₚw², yₚw³)
# a.yₚ w³ + b.xₚ w² + c
line.a *= P.y
line.b *= P.x
@ -236,11 +236,11 @@ func line_eval_fused_double[Field](
var A {.noInit.}, B {.noInit.}, C {.noInit.}: Field
var E {.noInit.}, F {.noInit.}, G {.noInit.}: Field
template H: untyped = line.a
template I: untyped = line.b
template J: untyped = line.c
A.prod(T.x, T.y)
A.div2() # A = XY/2
B.square(T.y) # B = Y²
@ -383,7 +383,7 @@ func line_add*[F1, F2](
# ############################################################
#
# 𝔽pᵏ by line - 𝔽pᵏ quadratic over cubic
#
# ############################################################
@ -430,7 +430,7 @@ func mul_sparse_by_line_a00bc0*[Fpk, Fpkdiv6](f: var Fpk, l: Line[Fpkdiv6]) =
v1 *= NonResidue
f.c0.sum(v0, v1)
else: # Lazy reduction
var
V0 {.noInit.}: doubleprec(Fpkdiv2)
@ -438,13 +438,13 @@ func mul_sparse_by_line_a00bc0*[Fpk, Fpkdiv6](f: var Fpk, l: Line[Fpkdiv6]) =
V {.noInit.}: doubleprec(Fpkdiv2)
t {.noInit.}: Fpkdiv6
u {.noInit.}: Fpkdiv2
u.sum(f.c0, f.c1)
t.sum(l.a, l.b)
V.mul2x_sparse_by_xy0(u, t, l.c)
V1.mul2x_sparse_by_xy0(f.c1, l.b, l.c)
V0.mul2x_sparse_by_x00(f.c0, l.a)
V.diff2xMod(V, V1)
V.diff2xMod(V, V0)
f.c1.redc2x(V)
@ -460,7 +460,7 @@ func prod_x00yz0_x00yz0_into_abcdefghij00*[Fpk, Fpkdiv6](f: var Fpk, l0, l1: Lin
#
# A sparse xy0 by xy0 multiplication in 𝔽pᵏᐟ² (cubic) would be
# (a0', a1', 0).(b0', b1', 0)
#
# r0' = a0'b0'
# r1' = a0'b1 + a1'b0'
# or (a0'+b1')(a1'+b0')-a0'a1'-b0'b1'
@ -476,7 +476,7 @@ func prod_x00yz0_x00yz0_into_abcdefghij00*[Fpk, Fpkdiv6](f: var Fpk, l0, l1: Lin
#
# a0b0 = ( x, 0, 0).( x', 0 , 0) = (xx', 0, 0 )
# a1b1 = ( y, z, 0).( y', z', 0) = (yy', yz'+zy', zz')
# r0 = a0 b0 + ξ a1 b1
# r1 = a0 b1 + a1 b0 (Schoolbook)
# = (a0 + a1) (b0 + b1) - a0 b0 - a1 b1 (Karatsuba)
@ -502,11 +502,11 @@ func prod_x00yz0_x00yz0_into_abcdefghij00*[Fpk, Fpkdiv6](f: var Fpk, l0, l1: Lin
YY.prod2x(l0.b, l1.b)
ZZ.prod2x(l0.c, l1.c)
var V{.noInit.}: doublePrec(Fpkdiv6)
var t0{.noInit.}, t1{.noInit.}: Fpkdiv6
# r0 = (xx'+ξzz', yy', yz'+zy')
# r00 = xx'+ξzz'
# r01 = yy'
# r02 = yz'+zy' = (y+z)(y'+z') - yy' - zz'
V.prod2x(ZZ, NonResidue)
@ -592,13 +592,13 @@ func mul_sparse_by_line_cb00a0*[Fpk, Fpkdiv6](f: var Fpk, l: Line[Fpkdiv6]) =
V {.noInit.}: doubleprec(Fpkdiv2)
t {.noInit.}: Fpkdiv6
u {.noInit.}: Fpkdiv2
u.sum(f.c0, f.c1)
t.sum(l.b, l.a)
V.mul2x_sparse_by_xy0(u, l.c, t)
V1.mul2x_sparse_by_0y0(f.c1, l.a)
V0.mul2x_sparse_by_xy0(f.c0, l.c, l.b)
V.diff2xMod(V, V1)
V.diff2xMod(V, V0)
f.c1.redc2x(V)
@ -614,7 +614,7 @@ func prod_zy00x0_zy00x0_into_abcdef00ghij*[Fpk, Fpkdiv6](f: var Fpk, l0, l1: Lin
#
# A sparse xy0 by xy0 multiplication in 𝔽pᵏᐟ² (cubic) would be
# (a0', a1', 0).(b0', b1', 0)
#
# r0' = a0'b0'
# r1' = a0'b1 + a1'b0'
# or (a0'+b1')(a1'+b0')-a0'a1'-b0'b1'
@ -630,7 +630,7 @@ func prod_zy00x0_zy00x0_into_abcdef00ghij*[Fpk, Fpkdiv6](f: var Fpk, l0, l1: Lin
#
# a0b0 = ( z, y, 0).(z', y', 0) = (zz', zy'+yz', yy')
# a1b1 = ( 0, x, 0).( 0, x', 0) = ( 0, 0, xx')
# r0 = a0 b0 + ξ a1 b1
# r1 = a0 b1 + a1 b0 (Schoolbook)
# = (a0 + a1) (b0 + b1) - a0 b0 - a1 b1 (Karatsuba)
@ -656,7 +656,7 @@ func prod_zy00x0_zy00x0_into_abcdef00ghij*[Fpk, Fpkdiv6](f: var Fpk, l0, l1: Lin
YY.prod2x(l0.b, l1.b)
ZZ.prod2x(l0.c, l1.c)
var V{.noInit.}: doublePrec(Fpkdiv6)
var t0{.noInit.}, t1{.noInit.}: Fpkdiv6
# r0 = (zz'+ξxx', zy'+z'y, yy')
@ -698,7 +698,7 @@ func prod_zy00x0_zy00x0_into_abcdef00ghij*[Fpk, Fpkdiv6](f: var Fpk, l0, l1: Lin
# ############################################################
#
# 𝔽pᵏ by line - 𝔽pᵏ cubic over quadratic
#
# ############################################################
@ -815,7 +815,7 @@ func prod_xzy000_xzy000_into_abcdefghij00*[Fpk, Fpkdiv6](f: var Fpk, l0, l1: Lin
# = v1
#
# r1 is computed in 3 extra muls in the base field 𝔽pᵏᐟ⁶ (Karatsuba product in quadratic field)
#
# If we dive deeper:
# r1 = a0b1 + a1b0 = (xy'+yx', zy'+yz') (Schoolbook - 4 muls)
# r10 = zy'+yz' = (x+y)(x'+y') - xx' - yy'
@ -835,7 +835,7 @@ func prod_xzy000_xzy000_into_abcdefghij00*[Fpk, Fpkdiv6](f: var Fpk, l0, l1: Lin
YY.prod2x(l0.b, l1.b)
ZZ.prod2x(l0.c, l1.c)
var V{.noInit.}: doublePrec(Fpkdiv6)
var t0{.noInit.}, t1{.noInit.}: Fpkdiv6
# r0 = a0 b0 = (x, z).(x', z')
@ -844,7 +844,7 @@ func prod_xzy000_xzy000_into_abcdefghij00*[Fpk, Fpkdiv6](f: var Fpk, l0, l1: Lin
V.prod2x(ZZ, NonResidue)
V.sum2xMod(V, XX)
f.c0.c0.redc2x(V)
t0.sum(l0.a, l0.c)
t1.sum(l1.a, l1.c)
V.prod2x(t0, t1)
@ -987,7 +987,7 @@ func prod_zx00y0_zx00y0_into_abcd00efghij*[Fpk, Fpkdiv6](f: var Fpk, l0, l1: Lin
# = (a0 + a2) * (b0 + b2) - v0 - v2
#
# r2 is computed in 3 extra muls in the base field 𝔽pᵏᐟ⁶ (Karatsuba product in quadratic field)
#
# If we dive deeper:
# r2 = a0b2 + a2b0 = (zy'+yz', xy'+x'y) (Schoolbook - 4 muls)
# r20 = zy'+z'y = (z+y)(z'+y') - zz' - yy'
@ -1007,7 +1007,7 @@ func prod_zx00y0_zx00y0_into_abcd00efghij*[Fpk, Fpkdiv6](f: var Fpk, l0, l1: Lin
YY.prod2x(l0.b, l1.b)
ZZ.prod2x(l0.c, l1.c)
var V{.noInit.}: doublePrec(Fpkdiv6)
var t0{.noInit.}, t1{.noInit.}: Fpkdiv6
# r0 = a0 b0 = (z, x).(z', x')
@ -1016,7 +1016,7 @@ func prod_zx00y0_zx00y0_into_abcd00efghij*[Fpk, Fpkdiv6](f: var Fpk, l0, l1: Lin
V.prod2x(XX, NonResidue)
V.sum2xMod(V, ZZ)
f.c0.c0.redc2x(V)
t0.sum(l0.c, l0.a)
t1.sum(l1.c, l1.a)
V.prod2x(t0, t1)
@ -1046,7 +1046,7 @@ func prod_zx00y0_zx00y0_into_abcd00efghij*[Fpk, Fpkdiv6](f: var Fpk, l0, l1: Lin
# ############################################################
#
# Sparse multiplication
#
# ############################################################
@ -1397,3 +1397,5 @@ func mul_by_2_lines*[Fpk, Fpkdiv6](f: var Fpk, line0, line1: Line[Fpkdiv6]) {.in
var t{.noInit.}: Fpk
t.prod_from_2_lines(line0, line1)
f.mul_by_prod_of_2_lines(t)
# func asFpk

View File

@ -45,7 +45,8 @@ func basicMillerLoop*[FT, F1, F2](
var u3 = ate_param
u3 *= 3
for i in countdown(u3.bits - 2, 1):
f.square()
if i != u3.bits - 2:
f.square()
line.line_double(T, P)
f.mul_by_line(line)
@ -320,7 +321,8 @@ func basicMillerLoop*[FT, F1, F2](
var u3 = ate_param
u3 *= 3
for i in countdown(u3.bits - 2, 1):
f.square()
if i != u3.bits - 2:
f.square()
f.double_jToN(j=0, line0, line1, Ts, Ps, N)
let naf = u3.bit(i).int8 - u.bit(i).int8 # This can raise an exception
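Both hunks above skip the squaring on the loop's first iteration: assuming the Miller accumulator enters the loop as the identity, that first squaring is a no-op. A minimal integer analogy of the same guard (powMSB is a hypothetical helper, not library code):

func powMSB(a, e: uint64, bits: int): uint64 =
  ## MSB-first square-and-multiply; the accumulator starts at 1,
  ## so the squaring on the first iteration is wasted work and is skipped.
  result = 1
  for i in countdown(bits - 1, 0):
    if i != bits - 1:              # mirrors `if i != u3.bits - 2` above
      result *= result
    if ((e shr i) and 1) == 1:
      result *= a

doAssert powMSB(3, 13, 4) == 1_594_323'u64  # 3¹³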

View File

@ -8,6 +8,7 @@
import
../math/[ec_shortweierstrass, extension_fields],
../math/io/io_bigints,
../math/elliptic/ec_shortweierstrass_batch_ops,
../math/pairings/[pairings_generic, miller_accumulators],
../math/constants/zoo_generators,
@ -365,41 +366,30 @@ func init*[T0, T1: char|byte](
H.hash(ctx.secureBlinding, secureRandomBytes, accumSepTag)
iterator unpack(scalarByte: byte): bool =
yield bool((scalarByte and 0b10000000) shr 7)
yield bool((scalarByte and 0b01000000) shr 6)
yield bool((scalarByte and 0b00100000) shr 5)
yield bool((scalarByte and 0b00010000) shr 4)
yield bool((scalarByte and 0b00001000) shr 3)
yield bool((scalarByte and 0b00000100) shr 2)
yield bool((scalarByte and 0b00000010) shr 1)
yield bool( scalarByte and 0b00000001)
func scalarMul_doubleAdd_vartime[EC](
func scalarMul_minHammingWeight_vartime[EC](
P: var EC,
scalarCanonical: openArray[byte],
scalar: BigInt,
) =
## **Variable-time** Elliptic Curve Scalar Multiplication
##
## P <- [k] P
##
## This uses the double-and-add algorithm
## This is UNSAFE to use with secret data and is only intended for signature verification
## to multiply by random blinding scalars.
## This uses an online recoding with minimum Hamming Weight
## (which is not NAF; NAF recodes from the least-significant bit to the most-significant)
## Due to those scalars being 64-bit, window-method or endomorphism acceleration are slower
## than double-and-add.
##
## This is highly VULNERABLE to timing attacks and power analysis attacks.
var t0{.noInit.}, t1{.noInit.}: typeof(P)
## For our use case, scaling by a random number not under attacker control,
## leaking the scalar bits is not an issue.
var t0{.noInit.}: typeof(P)
t0.setInf()
t1.setInf()
for scalarByte in scalarCanonical:
for bit in unpack(scalarByte):
t1.double(t0)
if bit:
t0.sum(t1, P)
else:
t0 = t1
for bit in recoding_l2r_vartime(scalar):
t0.double()
if bit == 1:
t0 += P
elif bit == -1:
t0 -= P
P = t0
func update*[T: char|byte, Pubkey, Sig: ECP_ShortW_Aff](
@ -434,7 +424,12 @@ func update*[T: char|byte, Pubkey, Sig: ECP_ShortW_Aff](
# we only use a 1..<2^64 random blinding factor.
# We assume that the attacker cannot resubmit forged
# public keys and signatures 2^64 times.
#
# Discussion https://ethresear.ch/t/fast-verification-of-multiple-bls-signatures/5407
# See also
# - Faster batch forgery identification
# Daniel J. Bernstein, Jeroen Doumen, Tanja Lange, and Jan-Jaap Oosterwijk, 2012
# https://eprint.iacr.org/2012/549
# We only use the first 8 bytes for blinding
# but use the full 32 bytes to derive new random scalar
@ -459,8 +454,10 @@ func update*[T: char|byte, Pubkey, Sig: ECP_ShortW_Aff](
pkG1_jac.fromAffine(pubkey)
sigG2_jac.fromAffine(signature)
pkG1_jac.scalarMul_doubleAdd_vartime(ctx.secureBlinding.toOpenArray(0, 7))
sigG2_jac.scalarMul_doubleAdd_vartime(ctx.secureBlinding.toOpenArray(0, 7))
var randFactor{.noInit.}: BigInt[64]
randFactor.unmarshal(ctx.secureBlinding.toOpenArray(0, 7), bigEndian)
pkG1_jac.scalarMul_minHammingWeight_vartime(randFactor)
sigG2_jac.scalarMul_minHammingWeight_vartime(randFactor)
if ctx.aggSigOnce == false:
ctx.aggSig = sigG2_jac
@ -493,8 +490,10 @@ func update*[T: char|byte, Pubkey, Sig: ECP_ShortW_Aff](
sigG1_jac.fromAffine(signature)
hmsgG1_jac.scalarMul_doubleAdd_vartime(ctx.secureBlinding.toOpenArray(0, 7))
sigG1_jac.scalarMul_doubleAdd_vartime(ctx.secureBlinding.toOpenArray(0, 7))
var randFactor{.noInit.}: BigInt[64]
randFactor.unmarshal(ctx.secureBlinding.toOpenArray(0, 7), bigEndian)
hmsgG1_jac.scalarMul_minHammingWeight_vartime(randFactor)
sigG1_jac.scalarMul_minHammingWeight_vartime(randFactor)
if ctx.aggSigOnce == false:
ctx.aggSig = sigG1_jac

View File

@ -39,14 +39,33 @@ func unsafe_ECmul_double_add*[EC](
var scalarCanonical: array[(scalar.bits+7) div 8, byte]
scalarCanonical.marshal(scalar, bigEndian)
var t0{.noInit.}, t1{.noInit.}: typeof(P)
var t0: typeof(P)
t0.setInf()
t1.setInf()
for scalarByte in scalarCanonical:
for bit in unpack(scalarByte):
t1.double(t0)
t0.double()
if bit:
t0.sum(t1, P)
else:
t0 = t1
t0 += P
P = t0
func unsafe_ECmul_minHammingWeight*[EC](
P: var EC,
scalar: BigInt) =
## **Unsafe** Elliptic Curve Scalar Multiplication
##
## P <- [k] P
##
## This uses an online recoding with minimum Hamming Weight
## (which is not NAF; NAF recodes from the least-significant bit to the most-significant)
## This is UNSAFE to use in production and only intended for testing purposes.
##
## This is highly VULNERABLE to timing attacks and power analysis attacks
var t0{.noInit.}: typeof(P)
t0.setInf()
for bit in recoding_l2r_vartime(scalar):
t0.double()
if bit == 1:
t0 += P
elif bit == -1:
t0 -= P
P = t0
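As a sanity check on the loop above (illustration only, plain integers rather than curve points): processing the recoded digits MSB-to-LSB with double-then-add/subtract is a Horner evaluation, so it reconstructs the scalar. For example, 7 recodes to [1, 0, 0, -1]:

let digits = [1'i8, 0, 0, -1]  # signed recoding of 7, i.e. 8 - 1
var acc = 0
for d in digits:
  acc = 2*acc                  # mirrors t0.double()
  if d == 1:    acc += 1       # mirrors t0 += P
  elif d == -1: acc -= 1       # mirrors t0 -= P
doAssert acc == 7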

View File

@ -421,11 +421,15 @@ proc run_EC_mul_vs_ref_impl*(
var
impl = a
reference = a
refMinWeight = a
impl.scalarMulGeneric(exponent)
reference.unsafe_ECmul_double_add(exponent)
refMinWeight.unsafe_ECmul_minHammingWeight(exponent)
check: bool(impl == reference)
check:
bool(impl == reference)
bool(impl == refMinWeight)
test(ec, bits = ec.F.C.getCurveOrderBitwidth(), randZ = false, gen = Uniform)
test(ec, bits = ec.F.C.getCurveOrderBitwidth(), randZ = true, gen = Uniform)