MSB-to-LSB minimum Hamming Weight Recoding (#219)

* signed recoding

* use recoding
Mamy Ratsimbazafy 2023-02-07 16:27:53 +01:00 committed by GitHub
parent 7c5421ffdc
commit 082cd1deb9
13 changed files with 216 additions and 114 deletions


@@ -57,6 +57,9 @@ proc main() =
scalarMulUnsafeDoubleAddBench(ECP_ShortW_Prj[Fp[curve], G1], MulIters)
scalarMulUnsafeDoubleAddBench(ECP_ShortW_Jac[Fp[curve], G1], MulIters)
separator()
scalarMulUnsafeMinHammingWeightRecodingBench(ECP_ShortW_Prj[Fp[curve], G1], MulIters)
scalarMulUnsafeMinHammingWeightRecodingBench(ECP_ShortW_Jac[Fp[curve], G1], MulIters)
separator()
scalarMulGenericBench(ECP_ShortW_Prj[Fp[curve], G1], window = 2, MulIters)
scalarMulGenericBench(ECP_ShortW_Prj[Fp[curve], G1], window = 3, MulIters)
scalarMulGenericBench(ECP_ShortW_Prj[Fp[curve], G1], window = 4, MulIters)


@@ -58,6 +58,9 @@ proc main() =
scalarMulUnsafeDoubleAddBench(ECP_ShortW_Prj[Fp2[curve], G2], MulIters)
scalarMulUnsafeDoubleAddBench(ECP_ShortW_Jac[Fp2[curve], G2], MulIters)
separator()
scalarMulUnsafeMinHammingWeightRecodingBench(ECP_ShortW_Prj[Fp2[curve], G2], MulIters)
scalarMulUnsafeMinHammingWeightRecodingBench(ECP_ShortW_Jac[Fp2[curve], G2], MulIters)
separator()
scalarMulGenericBench(ECP_ShortW_Prj[Fp2[curve], G2], window = 2, MulIters)
scalarMulGenericBench(ECP_ShortW_Prj[Fp2[curve], G2], window = 3, MulIters)
scalarMulGenericBench(ECP_ShortW_Prj[Fp2[curve], G2], window = 4, MulIters)


@@ -143,6 +143,18 @@ proc scalarMulUnsafeDoubleAddBench*(EC: typedesc, iters: int) =
r = P
r.unsafe_ECmul_double_add(exponent)
proc scalarMulUnsafeMinHammingWeightRecodingBench*(EC: typedesc, iters: int) =
const bits = EC.F.C.getCurveOrderBitwidth()
var r {.noInit.}: EC
var P = rng.random_unsafe(EC) # TODO: clear cofactor
let exponent = rng.random_unsafe(BigInt[bits])
bench("EC ScalarMul " & $bits & "-bit " & $EC.G & " (unsafe min Hamming Weight recoding)", EC, iters):
r = P
r.unsafe_ECmul_minHammingWeight(exponent)
proc multiAddBench*(EC: typedesc, numPoints: int, useBatching: bool, iters: int) =
var points = newSeq[ECP_ShortW_Aff[EC.F, EC.G]](numPoints)


@@ -200,7 +200,7 @@ const testDesc: seq[tuple[path: string, useGMP: bool]] = @[
# ("tests/math/t_pairing_bls12_377_line_functions.nim", false),
# ("tests/math/t_pairing_bls12_381_line_functions.nim", false),
# ("tests/math/t_pairing_mul_fp12_by_lines.nim", false),
# ("tests/math/t_pairing_cyclotomic_subgroup.nim", false),
("tests/math/t_pairing_cyclotomic_subgroup.nim", false),
("tests/math/t_pairing_bn254_nogami_optate.nim", false),
("tests/math/t_pairing_bn254_snarks_optate.nim", false),
("tests/math/t_pairing_bls12_377_optate.nim", false),


@@ -491,5 +491,52 @@ func invmod*[bits](r: var BigInt[bits], a, M: BigInt[bits]) =
one.setOne()
r.invmod(a, one, M)
# ############################################################
#
# Recoding
#
# ############################################################
iterator recoding_l2r_vartime*(a: BigInt): int8 =
## This is a minimum-Hamming-weight left-to-right recoding.
## It outputs signed digits in {-1, 0, 1} from MSB to LSB
## with minimal Hamming weight, to minimize operations
## in the Miller loop and in vartime scalar multiplications.
##
## Tagged vartime as it returns an int8
## - Optimal Left-to-Right Binary Signed-Digit Recoding
## Joye, Yen, 2000
## https://marcjoye.github.io/papers/JY00sd2r.pdf
# As inline iterators copy-paste the caller's loop body at each yield,
# we rework the algorithm so that it has a single yield point.
# We rely on the compiler for loop hoisting and/or loop peeling.
var bi, bi1, ri, ri1, ri2: int8
var i = a.bits
while true:
if i == a.bits: # We rely on the compiler to hoist this branch out of the loop.
ri = 0
ri1 = int8 a.bit(a.bits-1)
ri2 = int8 a.bit(a.bits-2)
bi = 0
else:
bi = bi1
ri = ri1
ri1 = ri2
if i < 2:
ri2 = 0
else:
ri2 = int8 a.bit(i-2)
bi1 = (bi + ri1 + ri2) shr 1
yield -2*bi + ri + bi1
if i > 0:
i -= 1
else:
break
{.pop.} # inline
{.pop.} # raises no exceptions
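
The iterator above is a left-to-right signed-digit recoder with a single yield point. Below is a standalone sketch of the same recoding on a plain uint64 (bitAt and recode_l2r are illustrative helpers, not Constantine's BigInt API). It makes it easy to check that the signed digits rebuild the scalar: 7 = 0b111, of Hamming weight 3, recodes MSB-to-LSB as (1, 0, 0, -1), i.e. 8 - 1, of weight 2.

# Standalone sketch on a plain uint64, mirroring recoding_l2r_vartime above.
# `bitAt` and `recode_l2r` are illustrative helpers, not Constantine APIs.

func bitAt(a: uint64, k: int): int8 =
  ## Bit k of `a`; 0 for out-of-range k.
  if k < 0 or k >= 64: 0'i8
  else: int8((a shr k) and 1)

iterator recode_l2r(a: uint64, bits: int): int8 =
  ## Joye-Yen left-to-right signed-digit recoding: digits in {-1, 0, 1},
  ## emitted from the most-significant position down to bit 0.
  var bi, bi1, ri, ri1, ri2: int8
  var i = bits
  while true:
    if i == bits:
      ri = 0
      ri1 = a.bitAt(bits-1)
      ri2 = a.bitAt(bits-2)
      bi = 0
    else:
      bi = bi1
      ri = ri1
      ri1 = ri2
      ri2 = if i < 2: 0'i8 else: a.bitAt(i-2)
    bi1 = (bi + ri1 + ri2) shr 1
    yield -2*bi + ri + bi1
    if i > 0: dec i
    else: break

when isMainModule:
  # 7 = 0b111 (Hamming weight 3) recodes as (1, 0, 0, -1) = 8 - 1 (weight 2).
  var acc = 0'i64
  for d in recode_l2r(7'u64, 3):
    acc = 2*acc + int64(d)
  doAssert acc == 7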


@@ -630,14 +630,18 @@ func double*(P: var ECP_ShortW_Jac) {.inline.} =
## In-place point doubling
P.double(P)
func diff*(r: var ECP_ShortW_Jac,
P, Q: ECP_ShortW_Jac
) {.inline.} =
func diff*(r: var ECP_ShortW_Jac, P, Q: ECP_ShortW_Jac) {.inline.} =
## r = P - Q
var nQ {.noInit.}: typeof(Q)
nQ.neg(Q)
r.sum(P, nQ)
func `-=`*(P: var ECP_ShortW_Jac, Q: ECP_ShortW_Jac) {.inline.} =
## In-place point subtraction
var nQ {.noInit.}: typeof(Q)
nQ.neg(Q)
P.sum(P, nQ)
func affine*[F; G](
aff: var ECP_ShortW_Aff[F, G],
jac: ECP_ShortW_Jac[F, G]) =


@@ -417,15 +417,19 @@ func double*(P: var ECP_ShortW_Prj) {.inline.} =
## In-place EC doubling
P.double(P)
func diff*(r: var ECP_ShortW_Prj,
P, Q: ECP_ShortW_Prj
) {.inline.} =
func diff*(r: var ECP_ShortW_Prj, P, Q: ECP_ShortW_Prj) {.inline.} =
## r = P - Q
## Can handle r and Q aliasing
var nQ {.noInit.}: typeof(Q)
nQ.neg(Q)
r.sum(P, nQ)
func `-=`*(P: var ECP_ShortW_Prj, Q: ECP_ShortW_Prj) {.inline.} =
## In-place point subtraction
var nQ {.noInit.}: typeof(Q)
nQ.neg(Q)
P.sum(P, nQ)
func affine*[F, G](
aff: var ECP_ShortW_Aff[F, G],
proj: ECP_ShortW_Prj[F, G]) =


@@ -385,26 +385,29 @@ func cycl_sqr_repeated*[FT](r: var FT, a: FT, num: int) {.inline, meter.} =
for _ in 1 ..< num:
r.cyclotomic_square()
iterator unpack(scalarByte: byte): bool =
yield bool((scalarByte and 0b10000000) shr 7)
yield bool((scalarByte and 0b01000000) shr 6)
yield bool((scalarByte and 0b00100000) shr 5)
yield bool((scalarByte and 0b00010000) shr 4)
yield bool((scalarByte and 0b00001000) shr 3)
yield bool((scalarByte and 0b00000100) shr 2)
yield bool((scalarByte and 0b00000010) shr 1)
yield bool( scalarByte and 0b00000001)
func cyclotomic_exp*[FT](r: var FT, a: FT, exponent: BigInt, invert: bool) {.meter.} =
var eBytes: array[(exponent.bits+7) div 8, byte]
eBytes.marshal(exponent, bigEndian)
func cyclotomic_exp*[FT](r: var FT, a: FT, exponent: static BigInt, invert: bool) {.meter.} =
## Assumes public exponent
var na {.noInit.}: FT
na.cyclotomic_inv(a)
r.setOne()
for b in eBytes:
for bit in unpack(b):
var init = false
for bit in recoding_l2r_vartime(exponent):
if init:
r.cyclotomic_square()
if bit:
if bit == 1:
if not init:
r = a
init = true
else:
r *= a
elif bit == -1:
if not init:
r = na
init = true
else:
r *= na
if invert:
r.cyclotomic_inv()
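
The rewritten exponentiation squares once per recoded digit, multiplies by a on a +1 digit and by the precomputed inverse na on a -1 digit, and uses the init flag so the leading zero digits cost nothing. Below is a minimal sketch of the same control flow on integers modulo a small prime, with the digits hard-coded to the recoding of 7 (p, mulModP and invModP are illustrative stand-ins, not Constantine's field types).

# Same left-to-right signed-digit exponentiation pattern, on uint64 mod a
# small prime. `p`, `mulModP`, `invModP` are illustrative stand-ins.

const p = 1_000_003'u64            # a small prime, for illustration only

func mulModP(a, b: uint64): uint64 = (a * b) mod p

func invModP(a: uint64): uint64 =
  ## Fermat inversion a^(p-2) mod p, the stand-in for cyclotomic_inv here.
  var base = a
  var e = p - 2
  result = 1'u64
  while e > 0:
    if (e and 1) == 1:
      result = mulModP(result, base)
    base = mulModP(base, base)
    e = e shr 1

when isMainModule:
  let a = 123_456'u64
  let na = invModP(a)                # precomputed a^-1
  let digits = [1'i8, 0, 0, -1]      # MSB-to-LSB signed recoding of 7 = 8 - 1
  var r = 1'u64
  var init = false
  for bit in digits:
    if init:
      r = mulModP(r, r)              # one squaring per digit after init
    if bit == 1:
      if not init:
        r = a                        # first nonzero digit: assign, don't multiply
        init = true
      else:
        r = mulModP(r, a)
    elif bit == -1:
      if not init:
        r = na
        init = true
      else:
        r = mulModP(r, na)
  var check = 1'u64                  # naive a^7 mod p for comparison
  for _ in 0 ..< 7:
    check = mulModP(check, a)
  doAssert r == check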


@@ -1397,3 +1397,5 @@ func mul_by_2_lines*[Fpk, Fpkdiv6](f: var Fpk, line0, line1: Line[Fpkdiv6]) {.in
var t{.noInit.}: Fpk
t.prod_from_2_lines(line0, line1)
f.mul_by_prod_of_2_lines(t)
# func asFpk


@@ -45,6 +45,7 @@ func basicMillerLoop*[FT, F1, F2](
var u3 = ate_param
u3 *= 3
for i in countdown(u3.bits - 2, 1):
if i != u3.bits - 2:
f.square()
line.line_double(T, P)
f.mul_by_line(line)
@@ -320,6 +321,7 @@ func basicMillerLoop*[FT, F1, F2](
var u3 = ate_param
u3 *= 3
for i in countdown(u3.bits - 2, 1):
if i != u3.bits - 2:
f.square()
f.double_jToN(j=0, line0, line1, Ts, Ps, N)


@@ -8,6 +8,7 @@
import
../math/[ec_shortweierstrass, extension_fields],
../math/io/io_bigints,
../math/elliptic/ec_shortweierstrass_batch_ops,
../math/pairings/[pairings_generic, miller_accumulators],
../math/constants/zoo_generators,
@@ -365,41 +366,30 @@ func init*[T0, T1: char|byte](
H.hash(ctx.secureBlinding, secureRandomBytes, accumSepTag)
iterator unpack(scalarByte: byte): bool =
yield bool((scalarByte and 0b10000000) shr 7)
yield bool((scalarByte and 0b01000000) shr 6)
yield bool((scalarByte and 0b00100000) shr 5)
yield bool((scalarByte and 0b00010000) shr 4)
yield bool((scalarByte and 0b00001000) shr 3)
yield bool((scalarByte and 0b00000100) shr 2)
yield bool((scalarByte and 0b00000010) shr 1)
yield bool( scalarByte and 0b00000001)
func scalarMul_doubleAdd_vartime[EC](
func scalarMul_minHammingWeight_vartime[EC](
P: var EC,
scalarCanonical: openArray[byte],
scalar: BigInt,
) =
## **Variable-time** Elliptic Curve Scalar Multiplication
##
## P <- [k] P
##
## This uses the double-and-add algorithm
## This is UNSAFE to use with secret data and is only intended for signature verification
## to multiply by random blinding scalars.
## This uses an online recoding with minimum Hamming weight
## (which is not NAF: NAF is computed from the least-significant bit,
## this recoding runs from the most-significant bit down)
## As those scalars are only 64-bit, window methods or endomorphism
## acceleration are slower than double-and-add.
##
## This is highly VULNERABLE to timing attacks and power analysis attacks.
var t0{.noInit.}, t1{.noInit.}: typeof(P)
## For our use case, scaling by a random number that is not attacker-controlled,
## leaking the scalar bits is not an issue.
var t0{.noInit.}: typeof(P)
t0.setInf()
t1.setInf()
for scalarByte in scalarCanonical:
for bit in unpack(scalarByte):
t1.double(t0)
if bit:
t0.sum(t1, P)
else:
t0 = t1
for bit in recoding_l2r_vartime(scalar):
t0.double()
if bit == 1:
t0 += P
elif bit == -1:
t0 -= P
P = t0
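
The trade-off discussed above comes down to operation counts: every left-to-right method pays one doubling per scalar bit, but plain double-and-add averages roughly one addition per two bits of a random scalar, while a minimal-weight signed recoding averages roughly one addition or subtraction per three bits. A worst-case byte makes the saving concrete (illustrative arithmetic only, not Constantine code):

# 255 = 0b1111_1111 needs 8 additions with plain double-and-add,
# but recodes as 256 - 1, i.e. one addition and one subtraction.

when isMainModule:
  let signedDigits = [1, 0, 0, 0, 0, 0, 0, 0, -1]   # MSB-to-LSB, value 256 - 1
  var v = 0
  var addsOrSubs = 0
  for d in signedDigits:
    v = 2*v + d
    if d != 0: inc addsOrSubs
  doAssert v == 255           # the digits encode 255
  doAssert addsOrSubs == 2    # versus 8 additions for the 8 set bits of 255
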
func update*[T: char|byte, Pubkey, Sig: ECP_ShortW_Aff](
@@ -434,7 +424,12 @@ func update*[T: char|byte, Pubkey, Sig: ECP_ShortW_Aff](
# we only use a 1..<2^64 random blinding factor.
# We assume that the attacker cannot resubmit 2^64 times
# forged public keys and signatures.
#
# Discussion https://ethresear.ch/t/fast-verification-of-multiple-bls-signatures/5407
# See also
# - Faster batch forgery identification
# Daniel J. Bernstein, Jeroen Doumen, Tanja Lange, and Jan-Jaap Oosterwijk, 2012
# https://eprint.iacr.org/2012/549
# We only use the first 8 bytes for blinding
# but use the full 32 bytes to derive a new random scalar.
@@ -459,8 +454,10 @@ func update*[T: char|byte, Pubkey, Sig: ECP_ShortW_Aff](
pkG1_jac.fromAffine(pubkey)
sigG2_jac.fromAffine(signature)
pkG1_jac.scalarMul_doubleAdd_vartime(ctx.secureBlinding.toOpenArray(0, 7))
sigG2_jac.scalarMul_doubleAdd_vartime(ctx.secureBlinding.toOpenArray(0, 7))
var randFactor{.noInit.}: BigInt[64]
randFactor.unmarshal(ctx.secureBlinding.toOpenArray(0, 7), bigEndian)
pkG1_jac.scalarMul_minHammingWeight_vartime(randFactor)
sigG2_jac.scalarMul_minHammingWeight_vartime(randFactor)
if ctx.aggSigOnce == false:
ctx.aggSig = sigG2_jac
@@ -493,8 +490,10 @@ func update*[T: char|byte, Pubkey, Sig: ECP_ShortW_Aff](
sigG1_jac.fromAffine(signature)
hmsgG1_jac.scalarMul_doubleAdd_vartime(ctx.secureBlinding.toOpenArray(0, 7))
sigG1_jac.scalarMul_doubleAdd_vartime(ctx.secureBlinding.toOpenArray(0, 7))
var randFactor{.noInit.}: BigInt[64]
randFactor.unmarshal(ctx.secureBlinding.toOpenArray(0, 7), bigEndian)
hmsgG1_jac.scalarMul_minHammingWeight_vartime(randFactor)
sigG1_jac.scalarMul_minHammingWeight_vartime(randFactor)
if ctx.aggSigOnce == false:
ctx.aggSig = sigG1_jac


@@ -39,14 +39,33 @@ func unsafe_ECmul_double_add*[EC](
var scalarCanonical: array[(scalar.bits+7) div 8, byte]
scalarCanonical.marshal(scalar, bigEndian)
var t0{.noInit.}, t1{.noInit.}: typeof(P)
var t0: typeof(P)
t0.setInf()
t1.setInf()
for scalarByte in scalarCanonical:
for bit in unpack(scalarByte):
t1.double(t0)
t0.double()
if bit:
t0.sum(t1, P)
else:
t0 = t1
t0 += P
P = t0
func unsafe_ECmul_minHammingWeight*[EC](
P: var EC,
scalar: BigInt) =
## **Unsafe** Elliptic Curve Scalar Multiplication
##
## P <- [k] P
##
## This uses an online recoding with minimum Hamming weight
## (which is not NAF: NAF is computed from the least-significant bit,
## this recoding runs from the most-significant bit down)
## This is UNSAFE to use in production and only intended for testing purposes.
##
## This is highly VULNERABLE to timing attacks and power analysis attacks
var t0{.noInit.}: typeof(P)
t0.setInf()
for bit in recoding_l2r_vartime(scalar):
t0.double()
if bit == 1:
t0 += P
elif bit == -1:
t0 -= P
P = t0
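
As a sanity check of the loop shape, the same digit-driven walk on plain integers (doubling is *2, adding or subtracting P is integer add/sub) reproduces [k]P. The digits below are the recoding of k = 7; this is an illustrative sketch, not a Constantine API.

# The double / add / subtract loop above, replayed on plain integers.

when isMainModule:
  let P = 11                   # stand-in for the curve point
  let digits = [1, 0, 0, -1]   # MSB-to-LSB signed digits of k = 7
  var t0 = 0                   # stand-in for the point at infinity
  for bit in digits:
    t0 = 2*t0                  # t0.double()
    if bit == 1:
      t0 += P                  # t0 += P
    elif bit == -1:
      t0 -= P                  # t0 -= P
  doAssert t0 == 7*P           # [7]P computed correctly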


@@ -421,11 +421,15 @@ proc run_EC_mul_vs_ref_impl*(
var
impl = a
reference = a
refMinWeight = a
impl.scalarMulGeneric(exponent)
reference.unsafe_ECmul_double_add(exponent)
refMinWeight.unsafe_ECmul_minHammingWeight(exponent)
check: bool(impl == reference)
check:
bool(impl == reference)
bool(impl == refMinWeight)
test(ec, bits = ec.F.C.getCurveOrderBitwidth(), randZ = false, gen = Uniform)
test(ec, bits = ec.F.C.getCurveOrderBitwidth(), randZ = true, gen = Uniform)