From 8b7374f40501a8435c87f524c252a52f7a1abdc3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mamy=20Andr=C3=A9-Ratsimbazafy?= <mamy_github@numforge.co>
Date: Sun, 22 Mar 2020 13:24:37 +0100
Subject: [PATCH] Cleanup in Montgomery Mul, Square, Pow

---
 benchmarks/bench_fp.nim                     |  1 +
 constantine/arithmetic/bigints.nim          | 12 ++--
 constantine/arithmetic/finite_fields.nim    | 25 +++++++-
 constantine/arithmetic/limbs_montgomery.nim | 68 ++++++++++-----------
 4 files changed, 61 insertions(+), 45 deletions(-)

diff --git a/benchmarks/bench_fp.nim b/benchmarks/bench_fp.nim
index 6cb7440..5e2f06b 100644
--- a/benchmarks/bench_fp.nim
+++ b/benchmarks/bench_fp.nim
@@ -9,6 +9,7 @@
 import
   # Internals
   ../constantine/config/curves,
+  ../constantine/arithmetic,
   # Helpers
   ../helpers/static_for,
   ./bench_fields_template,
diff --git a/constantine/arithmetic/bigints.nim b/constantine/arithmetic/bigints.nim
index a3f85e0..a7c3719 100644
--- a/constantine/arithmetic/bigints.nim
+++ b/constantine/arithmetic/bigints.nim
@@ -315,7 +315,7 @@ func montySquare*(r: var BigInt, a, M: BigInt, negInvModWord: static BaseType, c
 func montyPow*[mBits, eBits: static int](
        a: var BigInt[mBits], exponent: BigInt[eBits],
        M, one: BigInt[mBits], negInvModWord: static BaseType, windowSize: static int,
-       canUseNoCarryMontyMul: static bool
+       canUseNoCarryMontyMul, canUseNoCarryMontySquare: static bool
       ) =
   ## Compute a <- a^exponent (mod M)
   ## ``a`` in the Montgomery domain
@@ -334,12 +334,12 @@ func montyPow*[mBits, eBits: static int](
   const scratchLen = if windowSize == 1: 2
                      else: (1 shl windowSize) + 1
   var scratchSpace {.noInit.}: array[scratchLen, Limbs[mBits.wordsRequired]]
-  montyPow(a.limbs, expBE, M.limbs, one.limbs, negInvModWord, scratchSpace, canUseNoCarryMontyMul)
+  montyPow(a.limbs, expBE, M.limbs, one.limbs, negInvModWord, scratchSpace, canUseNoCarryMontyMul, canUseNoCarryMontySquare)
 
 func montyPowUnsafeExponent*[mBits, eBits: static int](
        a: var BigInt[mBits], exponent: BigInt[eBits],
        M, one: BigInt[mBits], negInvModWord: static BaseType, windowSize: static int,
-       canUseNoCarryMontyMul: static bool
+       canUseNoCarryMontyMul, canUseNoCarryMontySquare: static bool
       ) =
   ## Compute a <- a^exponent (mod M)
   ## ``a`` in the Montgomery domain
@@ -362,12 +362,12 @@ func montyPowUnsafeExponent*[mBits, eBits: static int](
   const scratchLen = if windowSize == 1: 2
                      else: (1 shl windowSize) + 1
   var scratchSpace {.noInit.}: array[scratchLen, Limbs[mBits.wordsRequired]]
-  montyPowUnsafeExponent(a.limbs, expBE, M.limbs, one.limbs, negInvModWord, scratchSpace, canUseNoCarryMontyMul)
+  montyPowUnsafeExponent(a.limbs, expBE, M.limbs, one.limbs, negInvModWord, scratchSpace, canUseNoCarryMontyMul, canUseNoCarryMontySquare)
 
 func montyPowUnsafeExponent*[mBits: static int](
        a: var BigInt[mBits], exponent: openarray[byte],
        M, one: BigInt[mBits], negInvModWord: static BaseType, windowSize: static int,
-       canUseNoCarryMontyMul: static bool
+       canUseNoCarryMontyMul, canUseNoCarryMontySquare: static bool
       ) =
   ## Compute a <- a^exponent (mod M)
   ## ``a`` in the Montgomery domain
@@ -386,7 +386,7 @@ func montyPowUnsafeExponent*[mBits: static int](
   const scratchLen = if windowSize == 1: 2
                      else: (1 shl windowSize) + 1
   var scratchSpace {.noInit.}: array[scratchLen, Limbs[mBits.wordsRequired]]
-  montyPowUnsafeExponent(a.limbs, exponent, M.limbs, one.limbs, negInvModWord, scratchSpace, canUseNoCarryMontyMul)
+  montyPowUnsafeExponent(a.limbs, exponent, M.limbs, one.limbs, negInvModWord, scratchSpace, canUseNoCarryMontyMul, canUseNoCarryMontySquare)
 
 {.pop.} # inline
 {.pop.} # raises no exceptions
diff --git a/constantine/arithmetic/finite_fields.nim b/constantine/arithmetic/finite_fields.nim
index f11d5da..4083639 100644
--- a/constantine/arithmetic/finite_fields.nim
+++ b/constantine/arithmetic/finite_fields.nim
@@ -170,7 +170,8 @@ func pow*(a: var Fp, exponent: BigInt) =
     exponent,
     Fp.C.Mod, Fp.C.getMontyOne(),
     Fp.C.getNegInvModWord(), windowSize,
-    Fp.C.canUseNoCarryMontyMul()
+    Fp.C.canUseNoCarryMontyMul(),
+    Fp.C.canUseNoCarryMontySquare()
   )
 
 func powUnsafeExponent*(a: var Fp, exponent: BigInt) =
@@ -189,9 +190,29 @@ func powUnsafeExponent*(a: var Fp, exponent: BigInt) =
     exponent,
     Fp.C.Mod, Fp.C.getMontyOne(),
     Fp.C.getNegInvModWord(), windowSize,
-    Fp.C.canUseNoCarryMontyMul()
+    Fp.C.canUseNoCarryMontyMul(),
+    Fp.C.canUseNoCarryMontySquare()
   )
 
+func powUnsafeExponent*(a: var Fp, exponent: openarray[byte]) =
+  ## Exponentiation modulo p
+  ## ``a``: a field element to be exponentiated
+  ## ``exponent``: a big integer
+  ##
+  ## Warning ⚠️ :
+  ## This is an optimization for public exponent
+  ## Otherwise bits of the exponent can be retrieved with:
+  ## - memory access analysis
+  ## - power analysis
+  ## - timing analysis
+  const windowSize = 5 # TODO: find best window size for each curves
+  a.mres.montyPowUnsafeExponent(
+    exponent,
+    Fp.C.Mod, Fp.C.getMontyOne(),
+    Fp.C.getNegInvModWord(), windowSize,
+    Fp.C.canUseNoCarryMontyMul(),
+    Fp.C.canUseNoCarryMontySquare()
+  )
 
 # ############################################################
 #
diff --git a/constantine/arithmetic/limbs_montgomery.nim b/constantine/arithmetic/limbs_montgomery.nim
index ae7a7fc..ce30b37 100644
--- a/constantine/arithmetic/limbs_montgomery.nim
+++ b/constantine/arithmetic/limbs_montgomery.nim
@@ -85,9 +85,6 @@ macro staticFor(idx: untyped{nkIdent}, start, stopEx: static int, body: untyped)
 
 # Implementation
 # ------------------------------------------------------------
-# Note: the low-level implementations should not use static parameter
-#       the code generated is already big enough for curve with different
-#       limb sizes, we want to use the same codepath when limbs lenght are compatible.
 
 func montyMul_CIOS_nocarry(r: var Limbs, a, b, M: Limbs, m0ninv: BaseType) =
   ## Montgomery Multiplication using Coarse Grained Operand Scanning (CIOS)
@@ -266,7 +263,7 @@ func montySquare_CIOS(r: var Limbs, a, M: Limbs, m0ninv: BaseType) =
 
 func montyMul*(
         r: var Limbs, a, b, M: Limbs,
-        m0ninv: static BaseType, canUseNoCarryMontyMul: static bool) {.inline.} =
+        m0ninv: static BaseType, canUseNoCarryMontyMul: static bool) =
   ## Compute r <- a*b (mod M) in the Montgomery domain
   ## `m0ninv` = -1/M (mod Word). Our words are 2^32 or 2^64
   ##
@@ -278,7 +275,7 @@ func montyMul*(
   ## Assuming 64-bit words, the magic constant should be:
   ##
   ## - µ ≡ -1/M[0] (mod 2^64) for a general multiplication
-  ##   This can be precomputed with `negInvModWord`
+  ##   This can be precomputed with `m0ninv`
   ## - 1 for conversion from Montgomery to canonical representation
   ##   The library implements a faster `redc` primitive for that use-case
   ## - R^2 (mod M) for conversion from canonical to Montgomery representation
@@ -286,27 +283,22 @@ func montyMul*(
   # i.e. c'R <- a'R b'R * R^-1 (mod M) in the natural domain
   # as in the Montgomery domain all numbers are scaled by R
 
-  # Nim doesn't like static Word, so we pass static BaseType up to here
-  # Then we passe them as Word again for the final processing.
-
-  # Many curve moduli are "Montgomery-friendly" which means that m0inv is 1
+  # Many curve moduli are "Montgomery-friendly" which means that m0ninv is 1
   # This saves N basic type multiplication and potentially many register mov
   # as well as unless using "mulx" instruction, x86 "mul" requires very specific registers.
-  # Compilers should be able to constant-propagate, but this prevents reusing code
-  # for example between secp256k1 (friendly) and BN254 (non-friendly).
-  # Here, as "montyMul" is inlined at the call site, the compiler shouldn't constant fold, saving size.
-  # Inlining the implementation instead (and no-inline this "montyMul" proc) would allow constant propagation
-  # of Montgomery-friendly m0ninv if the compiler deems it interesting,
-  # or we use `when m0ninv == 1` and enforce the inlining.
+  #
+  # The implementation is visible from here, the compiler can make decision whether to:
+  # - specialize/duplicate code for m0ninv == 1 (especially if only 1 curve is needed)
+  # - keep it generic and optimize code size
   when canUseNoCarryMontyMul:
     montyMul_CIOS_nocarry(r, a, b, M, m0ninv)
   else:
     montyMul_CIOS(r, a, b, M, m0ninv)
 
 func montySquare*(r: var Limbs, a, M: Limbs,
-                  m0ninv: static BaseType, canUseNoCarryMontySquare: static bool) {.inline.} =
+                  m0ninv: static BaseType, canUseNoCarryMontySquare: static bool) =
   ## Compute r <- a^2 (mod M) in the Montgomery domain
-  ## `negInvModWord` = -1/M (mod Word). Our words are 2^31 or 2^63
+  ## `m0ninv` = -1/M (mod Word). Our words are 2^31 or 2^63
 
   when canUseNoCarryMontySquare:
     montySquare_CIOS_nocarry(r, a, M, m0ninv)
@@ -314,7 +306,7 @@ func montySquare*(r: var Limbs, a, M: Limbs,
     montySquare_CIOS(r, a, M, m0ninv)
 
 func redc*(r: var Limbs, a, one, M: Limbs,
-           m0ninv: static BaseType, canUseNoCarryMontyMul: static bool) {.inline.} =
+           m0ninv: static BaseType, canUseNoCarryMontyMul: static bool) =
   ## Transform a bigint ``a`` from it's Montgomery N-residue representation (mod N)
   ## to the regular natural representation (mod N)
   ##
@@ -325,7 +317,7 @@ func redc*(r: var Limbs, a, one, M: Limbs,
   ##
   ## This is called a Montgomery Reduction
   ## The Montgomery Magic Constant is µ = -1/N mod M
-  ## is used internally and can be precomputed with negInvModWord(Curve)
+  ## is used internally and can be precomputed with m0ninv(Curve)
   # References:
   #   - https://eprint.iacr.org/2017/1057.pdf (Montgomery)
   #     page: Radix-r interleaved multiplication algorithm
@@ -336,7 +328,7 @@ func redc*(r: var Limbs, a, one, M: Limbs,
   montyMul(r, a, one, M, m0ninv, canUseNoCarryMontyMul)
 
 func montyResidue*(r: var Limbs, a, M, r2modM: Limbs,
-                   m0ninv: static BaseType, canUseNoCarryMontyMul: static bool) {.inline.} =
+                   m0ninv: static BaseType, canUseNoCarryMontyMul: static bool) =
   ## Transform a bigint ``a`` from it's natural representation (mod N)
   ## to a the Montgomery n-residue representation
   ##
@@ -424,12 +416,12 @@ func montyPowSquarings(
         a: var Limbs,
         exponent: openarray[byte],
         M: Limbs,
-        negInvModWord: static BaseType,
+        m0ninv: static BaseType,
         tmp: var Limbs,
         window: uint,
         acc, acc_len: var uint,
         e: var int,
-        canUseNoCarryMontyMul: static bool
+        canUseNoCarryMontySquare: static bool
       ): tuple[k, bits: uint] {.inline.}=
   ## Squaring step of exponentiation by squaring
   ## Get the next k bits in range [1, window)
@@ -455,7 +447,7 @@ func montyPowSquarings(
 
   # We have k bits and can do k squaring
   for i in 0 ..< k:
-    tmp.montySquare(a, M, negInvModWord, canUseNoCarryMontyMul)
+    tmp.montySquare(a, M, m0ninv, canUseNoCarryMontySquare)
     a = tmp
 
   return (k, bits)
@@ -464,9 +456,10 @@ func montyPow*(
        a: var Limbs,
        exponent: openarray[byte],
        M, one: Limbs,
-       negInvModWord: static BaseType,
+       m0ninv: static BaseType,
        scratchspace: var openarray[Limbs],
-       canUseNoCarryMontyMul: static bool
+       canUseNoCarryMontyMul: static bool,
+       canUseNoCarryMontySquare: static bool
       ) =
   ## Modular exponentiation r = a^exponent mod M
   ## in the Montgomery domain
@@ -479,7 +472,7 @@ func montyPow*(
   ##   Use ``exportRawUint`` for conversion
   ## - ``M`` is the modulus
   ## - ``one`` is 1 (mod M) in montgomery representation
-  ## - ``negInvModWord`` is the montgomery magic constant "-1/M[0] mod 2^WordBitSize"
+  ## - ``m0ninv`` is the montgomery magic constant "-1/M[0] mod 2^WordBitSize"
   ## - ``scratchspace`` with k the window bitsize of size up to 5
   ##   This is a buffer that can hold between 2^k + 1 big-ints
   ##   A window of of 1-bit (no window optimization) requires only 2 big-ints
@@ -494,7 +487,7 @@ func montyPow*(
   ## A window of size 5 requires (2^5 + 1)*(381 + 7)/8 = 33 * 48 bytes = 1584 bytes
   ## of scratchspace (on the stack).
 
-  let window = montyPowPrologue(a, M, one, negInvModWord, scratchspace, canUseNoCarryMontyMul)
+  let window = montyPowPrologue(a, M, one, m0ninv, scratchspace, canUseNoCarryMontyMul)
 
   # We process bits with from most to least significant.
   # At each loop iteration with have acc_len bits in acc.
@@ -506,10 +499,10 @@ func montyPow*(
     e = 0
   while acc_len > 0 or e < exponent.len:
     let (k, bits) = montyPowSquarings(
-      a, exponent, M, negInvModWord,
+      a, exponent, M, m0ninv,
       scratchspace[0], window,
       acc, acc_len, e,
-      canUseNoCarryMontyMul
+      canUseNoCarryMontySquare
     )
 
     # Window lookup: we set scratchspace[1] to the lookup value.
@@ -524,16 +517,17 @@ func montyPow*(
 
     # Multiply with the looked-up value
     # we keep the product only if the exponent bits are not all zero
-    scratchspace[0].montyMul(a, scratchspace[1], M, negInvModWord, canUseNoCarryMontyMul)
+    scratchspace[0].montyMul(a, scratchspace[1], M, m0ninv, canUseNoCarryMontyMul)
     a.ccopy(scratchspace[0], Word(bits).isNonZero())
 
 func montyPowUnsafeExponent*(
        a: var Limbs,
        exponent: openarray[byte],
        M, one: Limbs,
-       negInvModWord: static BaseType,
+       m0ninv: static BaseType,
        scratchspace: var openarray[Limbs],
-       canUseNoCarryMontyMul: static bool
+       canUseNoCarryMontyMul: static bool,
+       canUseNoCarryMontySquare: static bool
       ) =
   ## Modular exponentiation r = a^exponent mod M
   ## in the Montgomery domain
@@ -547,26 +541,26 @@ func montyPowUnsafeExponent*(
 
   # TODO: scratchspace[1] is unused when window > 1
 
-  let window = montyPowPrologue(a, M, one, negInvModWord, scratchspace, canUseNoCarryMontyMul)
+  let window = montyPowPrologue(a, M, one, m0ninv, scratchspace, canUseNoCarryMontyMul)
 
   var
     acc, acc_len: uint
     e = 0
   while acc_len > 0 or e < exponent.len:
     let (k, bits) = montyPowSquarings(
-      a, exponent, M, negInvModWord,
+      a, exponent, M, m0ninv,
       scratchspace[0], window,
       acc, acc_len, e,
-      canUseNoCarryMontyMul
+      canUseNoCarryMontySquare
     )
 
     ## Warning ⚠️: Exposes the exponent bits
     if bits != 0:
       if window > 1:
-        scratchspace[0].montyMul(a, scratchspace[1+bits], M, negInvModWord, canUseNoCarryMontyMul)
+        scratchspace[0].montyMul(a, scratchspace[1+bits], M, m0ninv, canUseNoCarryMontyMul)
       else:
         # scratchspace[1] holds the original `a`
-        scratchspace[0].montyMul(a, scratchspace[1], M, negInvModWord, canUseNoCarryMontyMul)
+        scratchspace[0].montyMul(a, scratchspace[1], M, m0ninv, canUseNoCarryMontyMul)
       a = scratchspace[0]
 
 {.pop.} # raises no exceptions