From 88d4a58a10673e61bfb8b4b13d3003117aa047ce Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mamy=20Andr=C3=A9-Ratsimbazafy?= <mamy_github@numforge.co>
Date: Sat, 29 Feb 2020 02:10:55 +0100
Subject: [PATCH] First steps in using uint64 words

---
 constantine/arithmetic/bigints_raw.nim        |  31 +-
 constantine/arithmetic/precomputed.nim        |   2 +-
 constantine/config/common.nim                 |   5 +-
 constantine/primitives/constant_time.nim      |  21 +-
 constantine/primitives/extended_precision.nim | 323 +++++-------------
 5 files changed, 115 insertions(+), 267 deletions(-)

diff --git a/constantine/arithmetic/bigints_raw.nim b/constantine/arithmetic/bigints_raw.nim
index 4916e6a..90e9060 100644
--- a/constantine/arithmetic/bigints_raw.nim
+++ b/constantine/arithmetic/bigints_raw.nim
@@ -138,7 +138,7 @@ template bitSizeof(v: BigIntViewAny): uint32 =
   bind BigIntView
   distinctBase(type v)(v).bitLength
 
-const divShiftor = log2(WordPhysBitSize)
+const divShiftor = log2(uint32(WordPhysBitSize))
 template numLimbs*(v: BigIntViewAny): int =
   ## Compute the number of limbs from
   ## the **internal** bitlength
@@ -447,9 +447,7 @@ func shlAddMod(a: BigIntViewMut, c: Word, M: BigIntViewConst) =
 
       block: # q*p
         # q * p + carry (doubleword) carry from previous limb
-        let qp = unsafeExtPrecMul(q, M[i]) + carry.DoubleWord
-        carry = Word(qp shr WordBitSize)           # New carry: high digit besides LSB
-        qp_lo = qp.Word.mask()                     # Normalize to u63
+        unsafeFMA(carry, qp_lo, q, M[i], carry)
 
       block: # a*2^63 - q*p
         a[i] -= qp_lo
@@ -551,28 +549,23 @@ func montyMul*(
   r.setBitLength(bitSizeof(M))
   setZero(r)
 
-  var partials: array[14, DoubleWord]
-
+  var r_hi = Zero   # represents the high word that is used in intermediate computation before reduction mod M
   for i in 0 ..< nLen:
 
-    let zi = (Word(partials[0]) + wordMul(a[i], b[0])).wordMul(negInvModWord)
+    let zi = (r[0] + wordMul(a[i], b[0])).wordMul(negInvModWord)
+    var carry, z = Zero
+    unsafeFMA2(carry, z, a[i], b[0], zi, M[0], r[0], carry)
 
-    for j in 0 ..< nLen:
-      partials[j] += unsafeExtPrecMul(a[i], b[j]) + unsafeExtPrecMul(zi, M[j])
+    for j in 1 ..< nLen:
+      unsafeFMA2(carry, r[j-1], a[i], b[j], zi, M[j], r[j], carry)
 
-    var carry = partials[0] shr WordBitSize
-    for j in 1 .. nlen: # we need 1 extra temporary at nlen
-      partials[j] += carry
-      carry = partials[j] shr WordBitSize
-      partials[j-1] = partials[j] and DoubleWord(MaxWord)
-    partials[nlen] = partials[nlen] shr WordBitSize
-
-  for i in 0 ..< nLen:
-    r[i] = Word(partials[i])
+    r_hi += carry
+    r[^1] = r_hi.mask()
+    r_hi = r_hi shr WordBitSize
 
   # If the extra word is not zero or if r-M does not borrow (i.e. r > M)
   # Then substract M
-  discard r.csub(M, CTBool[Word](partials[nLen].isNonZero()) or not r.csub(M, CtFalse))
+  discard r.csub(M, r_hi.isNonZero() or not r.csub(M, CtFalse))
 
 func redc*(r: BigIntViewMut, a: BigIntViewAny, one, N: BigIntViewConst, negInvModWord: Word) {.inline.} =
   ## Transform a bigint ``a`` from it's Montgomery N-residue representation (mod N)
diff --git a/constantine/arithmetic/precomputed.nim b/constantine/arithmetic/precomputed.nim
index 47f2537..4af0a75 100644
--- a/constantine/arithmetic/precomputed.nim
+++ b/constantine/arithmetic/precomputed.nim
@@ -126,7 +126,7 @@ func negInvModWord*(M: BigInt): BaseType =
 
   let
     M0 = BaseType(M.limbs[0])
-    k = log2(WordPhysBitSize)
+    k = log2(uint32(WordPhysBitSize))
 
   result = M0                 # Start from an inverse of M0 modulo 2, M0 is odd and it's own inverse
   for _ in 0 ..< k:           # at each iteration we get the inverse mod(2^2k)
diff --git a/constantine/config/common.nim b/constantine/config/common.nim
index e3a1ace..857beb6 100644
--- a/constantine/config/common.nim
+++ b/constantine/config/common.nim
@@ -14,12 +14,11 @@
 
 import ../primitives/constant_time
 
-type Word* = Ct[uint32]
+type Word* = Ct[uint64]
   ## Logical BigInt word
   ## A logical BigInt word is of size physical MachineWord-1
-type DoubleWord* = Ct[uint64]
 
-type BaseType* = uint32
+type BaseType* = uint64
   ## Physical BigInt for conversion in "normal integers"
 
 const
diff --git a/constantine/primitives/constant_time.nim b/constantine/primitives/constant_time.nim
index dee207e..2cc2e22 100644
--- a/constantine/primitives/constant_time.nim
+++ b/constantine/primitives/constant_time.nim
@@ -11,7 +11,6 @@
 #                Constant-time primitives
 #
 # ############################################################
-
 type
   BaseUint* = SomeUnsignedInt or byte
 
@@ -168,6 +167,26 @@ func log2*(x: uint32): uint32 =
   v = v or v shr 16
   lookup[(v * 0x07C4ACDD'u32) shr 27]
 
+func log2*(x: uint64): uint64 {.inline, noSideEffect.} =
+  ## Find the log base 2 of a 32-bit or less integer.
+  ## using De Bruijn multiplication
+  ## Works at compile-time, guaranteed constant-time.
+  ## Note: at runtime BitScanReverse or CountLeadingZero are more efficient
+  ##       but log2 is never needed at runtime.
+  # https://graphics.stanford.edu/%7Eseander/bithacks.html#IntegerLogDeBruijn
+  const lookup: array[64, uint8] = [0'u8, 58, 1, 59, 47, 53, 2, 60, 39, 48, 27, 54,
+    33, 42, 3, 61, 51, 37, 40, 49, 18, 28, 20, 55, 30, 34, 11, 43, 14, 22, 4, 62,
+    57, 46, 52, 38, 26, 32, 41, 50, 36, 17, 19, 29, 10, 13, 21, 56, 45, 25, 31,
+    35, 16, 9, 12, 44, 24, 15, 8, 23, 7, 6, 5, 63]
+  var v = x
+  v = v or v shr 1 # first round down to one less than a power of 2
+  v = v or v shr 2
+  v = v or v shr 4
+  v = v or v shr 8
+  v = v or v shr 16
+  v = v or v shr 32
+  lookup[(v * 0x03F6EAF2CD271461'u64) shr 58]
+
 # ############################################################
 #
 #             Hardened Boolean primitives
diff --git a/constantine/primitives/extended_precision.nim b/constantine/primitives/extended_precision.nim
index d5fe742..e7984df 100644
--- a/constantine/primitives/extended_precision.nim
+++ b/constantine/primitives/extended_precision.nim
@@ -20,10 +20,6 @@ import ./constant_time
 #
 # ############################################################
 
-template unsafeExtPrecMul*(a, b: Ct[uint32]): Ct[uint64] =
-  ## Extended precision multiplication uint32 * uint32 --> uint64
-  Ct[uint64](uint64(a) * uint64(b))
-
 func unsafeDiv2n1n*(q, r: var Ct[uint32], n_hi, n_lo, d: Ct[uint32]) {.inline.}=
   ## Division uint64 by uint32
   ## Warning ⚠️ :
@@ -35,261 +31,102 @@ func unsafeDiv2n1n*(q, r: var Ct[uint32], n_hi, n_lo, d: Ct[uint32]) {.inline.}=
   ## so that the most significant bit in d is set.
   # TODO !!! - Replace by constant-time, portable, non-assembly version
   #          -> use uint128? Compiler might add unwanted branches
+  {.warning: "unsafeDiv2n1n is not constant-time at the moment on most hardware".}
   let dividend = (uint64(n_hi) shl 32) or uint64(n_lo)
   let divisor = uint64(d)
   q = (Ct[uint32])(dividend div divisor)
   r = (Ct[uint32])(dividend mod divisor)
 
+template unsafeFMA*(hi, lo: var Ct[uint32], a, b, c: Ct[uint32]) =
+  ## Extended precision multiplication + addition
+  ## This is constant-time on most hardware except some specific one like Cortex M0
+  ## (hi, lo) <- a*b + c
+  block:
+    # Note: since a and b use 31-bit,
+    # the result is 62-bit and carrying cannot overflow
+    let dblPrec = uint64(a) * uint64(b) + uint64(c)
+    hi = Ct[uint32](dblPrec shr 31)
+    lo = Ct[uint32](dblPrec) and Ct[uint32](1 shl 31 - 1)
+
+template unsafeFMA2*(hi, lo: var Ct[uint32], a1, b1, a2, b2, c1, c2: Ct[uint32]) =
+  ## (hi, lo) <- a1 * b1 + a2 * b2 + c1 + c2
+  block:
+    # TODO: Can this overflow?
+    let dblPrec = uint64(a1) * uint64(b1) +
+                  uint64(a2) * uint64(b2) +
+                  uint64(c1) +
+                  uint64(c2)
+    hi = Ct[uint32](dblPrec shr 31)
+    lo = Ct[uint32](dblPrec) and Ct[uint32](1 shl 31 - 1)
+
 # ############################################################
 #
 #                     64-bit words
 #
 # ############################################################
 
-# func asm_x86_64_extMul(hi, lo: var uint64, a, b: uint64) {.inline.}=
-#   ## Extended precision multiplication uint64 * uint64 --> uint128
+when defined(gcc) or defined(clang) or defined(llvm_gcc):
+  type
+    uint128*{.importc: "unsigned __int128".} = object
 
-#   # TODO !!! - Replace by constant-time, portable, non-assembly version
-#   #          -> use uint128? Compiler might add unwanted branches
+  func unsafeDiv2n1n*(q, r: var Ct[uint64], n_hi, n_lo, d: Ct[uint64]) {.inline.}=
+    ## Division uint128 by uint64
+    ## Warning ⚠️ :
+    ##   - if n_hi == d, quotient does not fit in an uint64 and will throw SIGFPE
+    ##   - if n_hi > d result is undefined
+    {.warning: "unsafeDiv2n1n is not constant-time at the moment on most hardware".}
 
-#   # MUL r/m64
-#   # Multiply RAX by r/m64
-#   #
-#   # Inputs:
-#   #   - RAX
-#   #   - r/m
-#   # Outputs:
-#   #   - High word in RDX
-#   #   - Low word in RAX
+    # TODO !!! - Replace by constant-time, portable, non-assembly version
+    #          -> use uint128? Compiler might add unwanted branches
 
-#   # Don't forget to dereference the var hidden pointer in hi/lo
-#   asm """
-#     mulq %[operand]
-#     : "=d" (`*hi`), "=a" (`*lo`)
-#     : "a" (`a`), [operand] "rm" (`b`)
-#     :
-#   """
+    # DIV r/m64
+    # Divide RDX:RAX (n_hi:n_lo) by r/m64
+    #
+    # Inputs
+    #   - numerator high word in RDX,
+    #   - numerator low word in RAX,
+    #   - divisor as r/m parameter (register or memory at the compiler discretion)
+    # Result
+    #   - Quotient in RAX
+    #   - Remainder in RDX
 
-# func unsafeExtPrecMul*(hi, lo: var Ct[uint64], a, b: Ct[uint64]) {.inline.}=
-#   ## Extended precision multiplication uint64 * uint64 --> uint128
-#   ##
-#   ## TODO, at the moment only x86_64 architecture are supported
-#   ##       as we use assembly.
-#   ##       Also we assume that the native integer division
-#   ##       provided by the PU is constant-time
+    # 1. name the register/memory "divisor"
+    # 2. don't forget to dereference the var hidden pointer
+    # 3. -
+    # 4. no clobbered registers beside explectly used RAX and RDX
+    asm """
+      divq %[divisor]
+      : "=a" (`*q`), "=d" (`*r`)
+      : "d" (`n_hi`), "a" (`n_lo`), [divisor] "rm" (`d`)
+      :
+    """
 
-#   # Note, using C/Nim default `*` is inefficient
-#   # and complicated to make constant-time
-#   # See at the bottom.
+  template unsafeFMA*(hi, lo: var Ct[uint64], a, b, c: Ct[uint64]) =
+    ## Extended precision multiplication + addition
+    ## This is constant-time on most hardware except some specific one like Cortex M0
+    ## (hi, lo) <- a*b + c
+    block:
+      # Note: since a and b use 63-bit,
+      # the result is 126-bit and carrying cannot overflow
+      var dblPrec {.noInit.}: uint128
+      {.emit:[dblPrec, " = (unsigned __int128)", a," * (unsigned __int128)", b, " + (unsigned __int128)",c,";"].}
 
-#   type T = uint64
+      {.emit:[hi, " = (NU64)(", dblPrec," >> ", 63'u64, ");"].}
+      {.emit:[lo, " = (NU64)", dblPrec," & ", 1'u64 shl 63 - 1, ";"].}
 
-#   when not defined(amd64):
-#     {.error: "At the moment only x86_64 architecture is supported".}
-#   else:
-#     asm_x86_64_extMul(T(hi), T(lo), T(a), T(b))
+  template unsafeFMA2*(hi, lo: var Ct[uint64], a1, b1, a2, b2, c1, c2: Ct[uint64]) =
+    ## (hi, lo) <- a1 * b1 + a2 * b2 + c1 + c2
+    block:
+      # TODO: Can this overflow?
+      var dblPrec: uint128
+      {.emit:[dblPrec, " = (unsigned __int128)", a1," * (unsigned __int128)", b1,
+                       " + (unsigned __int128)", a2," * (unsigned __int128)", b2,
+                       " + (unsigned __int128)", c1,
+                       " + (unsigned __int128)", c2, ";"].}
+      {.emit:[hi, " = (NU64)", dblPrec," >> ", 63'u64, ";"].}
+      {.emit:[lo, " = (NU64)", dblPrec," & ", 1'u64 shl 63 - 1, ";"].}
 
-# func asm_x86_64_div2n1n(q, r: var uint64, n_hi, n_lo, d: uint64) {.inline.}=
-#   ## Division uint128 by uint64
-#   ## Warning ⚠️ :
-#   ##   - if n_hi == d, quotient does not fit in an uint64
-#   ##   - if n_hi > d result is undefined
-
-#   # TODO !!! - Replace by constant-time, portable, non-assembly version
-#   #          -> use uint128? Compiler might add unwanted branches
-
-#   # DIV r/m64
-#   # Divide RDX:RAX (n_hi:n_lo) by r/m64
-#   #
-#   # Inputs
-#   #   - numerator high word in RDX,
-#   #   - numerator low word in RAX,
-#   #   - divisor as r/m parameter (register or memory at the compiler discretion)
-#   # Result
-#   #   - Quotient in RAX
-#   #   - Remainder in RDX
-
-#   # 1. name the register/memory "divisor"
-#   # 2. don't forget to dereference the var hidden pointer
-#   # 3. -
-#   # 4. no clobbered registers beside explectly used RAX and RDX
-#   asm """
-#     divq %[divisor]
-#     : "=a" (`*q`), "=d" (`*r`)
-#     : "d" (`n_hi`), "a" (`n_lo`), [divisor] "rm" (`d`)
-#     :
-#   """
-
-# func unsafeDiv2n1n*(q, r: var Ct[uint64], n_hi, n_lo, d: Ct[uint64]) {.inline.}=
-#   ## Division uint128 by uint64
-#   ## Warning ⚠️ :
-#   ##   - if n_hi == d, quotient does not fit in an uint64
-#   ##   - if n_hi > d result is undefined
-#   ##
-#   ## To avoid issues, n_hi, n_lo, d should be normalized.
-#   ## i.e. shifted (== multiplied by the same power of 2)
-#   ## so that the most significant bit in d is set.
-#   ##
-#   ## TODO, at the moment only x86_64 architecture are supported
-#   ##       as we use assembly.
-#   ##       Also we assume that the native integer division
-#   ##       provided by the PU is constant-time
-
-#   # Note, using C/Nim default `div` is inefficient
-#   # and complicated to make constant-time
-#   # See at the bottom.
-#   #
-#   # Furthermore compilers may try to substitute division
-#   # with a fast path that may have branches. It might also
-#   # be the same at the hardware level.
-
-#   # TODO !!! - Replace by constant-time, portable, non-assembly version
-#   #          -> use uint128? Compiler might add unwanted branches
-
-#   type T = uint64
-
-#   when not defined(amd64):
-#     {.error: "At the moment only x86_64 architecture is supported".}
-#   else:
-#     asm_x86_64_div2n1n(T(q), T(r), T(n_hi), T(n_lo), T(d))
-
-when isMainModule:
-  block: # Multiplication
-    var hi, lo: uint64
-
-    asm_x86_64_extMul(hi, lo, 1 shl 32, 1 shl 33) # 2^65
-    doAssert hi == 2
-    doAssert lo == 0
-
-  block: # Division
-    var q, r: uint64
-
-    # (1 shl 64) div 3
-    let n_hi = 1'u64
-    let n_lo = 0'u64
-    let d = 3'u64
-
-    asm_x86_64_div2n1n(q, r, n_hi, n_lo, d)
-
-    doAssert q == 6148914691236517205'u64
-    doAssert r == 1
-
-  # block: # TODO - support Quotient that doesn't fit in the result
-  #        # The usual way with normalization by the bitSize difference
-  #        # is fundamentally non constant-time
-  #        # it is probable that division is not constant-time at the hardware level as well
-  #        # as it throws sigfpe when the quotient doesn't fit in the result size
-
-  #   var q, r: uint64
-
-  #   let n_hi = 1'u64
-  #   let n_lo = 0'u64
-  #   let d = 1'u64
-
-  #   asm_x86_64_div2n1n(q, r, n_hi, n_lo, d)
-
-  #   echo "quotient: ", q
-  #   echo "remainder: ", r
-
-  # block:
-  #   var q, r: uint64
-
-  #   let n_hi = 4186590388502004879'u64
-  #   let n_lo = 17852795547484522084'u64
-  #   let d = 327340459940166448'u64
-
-  #   asm_x86_64_div2n1n(q, r, n_hi, n_lo, d)
-
-  #   echo "quotient: ", q
-  #   echo "remainder: ", r
-
-# ##############################################################
-#
-#  Non-constant-time portable extended precision multiplication
-#
-# ##############################################################
-
-# implementation from Stint: https://github.com/status-im/nim-stint/blob/edb1ade37309390cc641cee07ab62e5459d9ca44/stint/private/uint_mul.nim#L91-L135
-
-# template extPrecMulImpl(result: var UintImpl[uint64], op: untyped, u, v: uint64) =
-#   const
-#     p = 64 div 2
-#     base: uint64 = 1 shl p
-#
-#   var
-#     x0, x1, x2, x3: uint64
-#
-#   let
-#     ul = lo(u)
-#     uh = hi(u)
-#     vl = lo(v)
-#     vh = hi(v)
-#
-#   x0 = ul * vl
-#   x1 = ul * vh
-#   x2 = uh * vl
-#   x3 = uh * vh
-#
-#   x1 += hi(x0)          # This can't carry
-#   x1 += x2              # but this can
-#   if x1 < x2:           # if carry, add it to x3
-#     x3 += base
-#
-#   op(result.hi, x3 + hi(x1))
-#   op(result.lo, (x1 shl p) or lo(x0))
-#
-# func extPrecMul*(result: var UintImpl[uint64], u, v: uint64) =
-#   ## Extended precision multiplication
-#   extPrecMulImpl(result, `=`, u, v)
-#
-# func extPrecAddMul(result: var UintImpl[uint64], u, v: uint64) =
-#   ## Extended precision fused in-place addition & multiplication
-#   extPrecMulImpl(result, `+=`, u, v)
-
-# ############################################################
-#
-#             Non-constant-time portable div2n1n
-#
-# ############################################################
-
-# implementation from Stint: https://github.com/status-im/nim-stint/blob/edb1ade37309390cc641cee07ab62e5459d9ca44/stint/private/uint_div.nim#L131
-
-# func div2n1n[T: SomeunsignedInt](q, r: var T, n_hi, n_lo, d: T) =
-#
-#   # assert countLeadingZeroBits(d) == 0, "Divisor was not normalized"
-#
-#   const
-#     size = bitsof(q)
-#     halfSize = size div 2
-#     halfMask = (1.T shl halfSize) - 1.T
-#
-#   template halfQR(n_hi, n_lo, d, d_hi, d_lo: T): tuple[q,r: T] =
-#
-#     var (q, r) = divmod(n_hi, d_hi)
-#     let m = q * d_lo
-#     var r = (r shl halfSize) or n_lo
-#
-#     # Fix the reminder, we're at most 2 iterations off
-#     if r < m:
-#       dec q
-#       r += d
-#       if r >= d and r < m:
-#         dec q
-#         r += d
-#     r -= m
-#     (q, r)
-#
-#   let
-#     d_hi = d shr halfSize
-#     d_lo = d and halfMask
-#     n_lohi = nlo shr halfSize
-#     n_lolo = nlo and halfMask
-#
-#   # First half of the quotient
-#   let (q1, r1) = halfQR(n_hi, n_lohi, d, d_hi, d_lo)
-#
-#   # Second half
-#   let (q2, r2) = halfQR(r1, n_lolo, d, d_hi, d_lo)
-#
-#   q = (q1 shl halfSize) or q2
-#   r = r2
+else:
+  {.error: "Compiler not implemented".}
+  # For VCC and ICC use add_carry_u64, _add_carryx_u64 intrinsics
+  # and _umul128