From 05bce529b49a21f28c2b2af1ab0a2cc93ccf5e32 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mamy=20Andr=C3=A9-Ratsimbazafy?= <mamy_github@numforge.co>
Date: Fri, 28 Feb 2020 22:46:20 +0100
Subject: [PATCH] 1st experiment at accelerating montgomery multiplication (665
 lines of specialized duplicated ASM code for some reason, monomorphization is
 probably better than that)

---
 benchmarks/bls12_381_fp.nim            | 66 ++++++++++++++++++++++++++
 constantine/arithmetic/README.md       |  4 ++
 constantine/arithmetic/bigints_raw.nim | 26 +++++-----
 3 files changed, 84 insertions(+), 12 deletions(-)
 create mode 100644 benchmarks/bls12_381_fp.nim

diff --git a/benchmarks/bls12_381_fp.nim b/benchmarks/bls12_381_fp.nim
new file mode 100644
index 0000000..17bc11f
--- /dev/null
+++ b/benchmarks/bls12_381_fp.nim
@@ -0,0 +1,66 @@
+# Constantine
+# Copyright (c) 2018-2019    Status Research & Development GmbH
+# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
+# Licensed and distributed under either of
+#   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
+#   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
+# at your option. This file may not be copied, modified, or distributed except according to those terms.
+
+# ############################################################
+#
+#           Benchmark of modular exponentiation
+#
+# ############################################################
+
+# 2 implementations are available
+# - 1 is constant time
+# - 1 exposes the exponent bits to:
+#   timing attack,
+#   memory access analysis,
+#   power analysis (i.e. oscilloscopes on embedded)
+#   It is suitable for public exponents for example
+#   to compute modular inversion via the Fermat method
+
+import
+  ../constantine/config/[common, curves],
+  ../constantine/arithmetic/[bigints_checked, finite_fields],
+  ../constantine/io/[io_bigints, io_fields],
+  random, std/monotimes, times, strformat
+
+const Iters = 1_000_000
+
+randomize(1234)
+
+proc addBench() =
+  var r, x, y: Fp[BLS12_381]
+  # BN254 field modulus
+  x.fromHex("0x30644e72e131a029b85045b68181585d97816a916871ca8d3c208c16d87cfd47")
+  # BLS12-381 prime - 2
+  y.fromHex("0x1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffffffffaaa9")
+
+  let start = getMonotime()
+  for _ in 0 ..< Iters:
+    x += y
+  let stop = getMonotime()
+
+  echo &"Time for {Iters} conditional additions (constant-time 381-bit): {inMilliseconds(stop-start)} ms"
+  echo &"Time for 1 conditional addition ==> {inNanoseconds((stop-start) div Iters)} ns"
+
+addBench()
+
+proc mulBench() =
+  var r, x, y: Fp[BLS12_381]
+  # BN254 field modulus
+  x.fromHex("0x30644e72e131a029b85045b68181585d97816a916871ca8d3c208c16d87cfd47")
+  # BLS12-381 prime - 2
+  y.fromHex("0x1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffffffffaaa9")
+
+  let start = getMonotime()
+  for _ in 0 ..< Iters:
+    r.prod(x, y)
+  let stop = getMonotime()
+
+  echo &"Time for {Iters} multiplications (constant-time 381-bit): {inMilliseconds(stop-start)} ms"
+  echo &"Time for 1 multiplication ==> {inNanoseconds((stop-start) div Iters)} ns"
+
+mulBench()
diff --git a/constantine/arithmetic/README.md b/constantine/arithmetic/README.md
index dc01271..9ce12e1 100644
--- a/constantine/arithmetic/README.md
+++ b/constantine/arithmetic/README.md
@@ -6,6 +6,10 @@ This folder contains the implementation of
 
 ## References
 
+- Analyzing and Comparing Montgomery Multiplication Algorithms
+  Cetin Kaya Koc and Tolga Acar and Burton S. Kaliski Jr.
+  http://pdfs.semanticscholar.org/5e39/41ff482ec3ee41dc53c3298f0be085c69483.pdf
+
 - Montgomery Arithmetic from a Software Perspective\
   Joppe W. Bos and Peter L. Montgomery, 2017\
   https://eprint.iacr.org/2017/1057
diff --git a/constantine/arithmetic/bigints_raw.nim b/constantine/arithmetic/bigints_raw.nim
index 9712e68..4916e6a 100644
--- a/constantine/arithmetic/bigints_raw.nim
+++ b/constantine/arithmetic/bigints_raw.nim
@@ -551,26 +551,28 @@ func montyMul*(
   r.setBitLength(bitSizeof(M))
   setZero(r)
 
-  var r_hi = Zero   # represents the high word that is used in intermediate computation before reduction mod M
+  var partials: array[14, DoubleWord]
+
   for i in 0 ..< nLen:
 
-    let zi = (r[0] + wordMul(a[i], b[0])).wordMul(negInvModWord)
-    var carry = Zero
+    let zi = (Word(partials[0]) + wordMul(a[i], b[0])).wordMul(negInvModWord)
 
     for j in 0 ..< nLen:
-      let z = DoubleWord(r[j]) + unsafeExtPrecMul(a[i], b[j]) +
-              unsafeExtPrecMul(zi, M[j]) + DoubleWord(carry)
-      carry = Word(z shr WordBitSize)
-      if j != 0: # "division" by a physical word 2^32 or 2^64
-        r[j-1] = Word(z).mask()
+      partials[j] += unsafeExtPrecMul(a[i], b[j]) + unsafeExtPrecMul(zi, M[j])
 
-    r_hi += carry
-    r[^1] = r_hi.mask()
-    r_hi = r_hi shr WordBitSize
+    var carry = partials[0] shr WordBitSize
+    for j in 1 .. nlen: # we need 1 extra temporary at nlen
+      partials[j] += carry
+      carry = partials[j] shr WordBitSize
+      partials[j-1] = partials[j] and DoubleWord(MaxWord)
+    partials[nlen] = partials[nlen] shr WordBitSize
+
+  for i in 0 ..< nLen:
+    r[i] = Word(partials[i])
 
   # If the extra word is not zero or if r-M does not borrow (i.e. r > M)
   # Then substract M
-  discard r.csub(M, r_hi.isNonZero() or not r.csub(M, CtFalse))
+  discard r.csub(M, CTBool[Word](partials[nLen].isNonZero()) or not r.csub(M, CtFalse))
 
 func redc*(r: BigIntViewMut, a: BigIntViewAny, one, N: BigIntViewConst, negInvModWord: Word) {.inline.} =
   ## Transform a bigint ``a`` from it's Montgomery N-residue representation (mod N)