Improve initialization for montymul. (64-bit speed is 2.3x 32-bit)

2025-01-30 04:27:54 +00:00 · 2020-02-29 14:59:20 +01:00 · 2020-02-29 14:59:20 +01:00 · 0fab0c8a42
commit 0fab0c8a42
parent feacf2b2ea
3 changed files with 32 additions and 9 deletions
--- a/constantine/arithmetic/bigints_raw.nim
+++ b/constantine/arithmetic/bigints_raw.nim
@ -553,10 +553,12 @@ func montyMul*(
  for i in 0 ..< nLen:

    let zi = (r[0] + wordMul(a[i], b[0])).wordMul(negInvModWord)
-    var carry, z = Zero
-    unsafeFMA2(carry, z, a[i], b[0], zi, M[0], r[0], carry)
+    var carry: Word
+    # (carry, _) <- a[i] * b[0] + zi * M[0] + r[0]
+    unsafeFMA2_hi(carry, a[i], b[0], zi, M[0], r[0])

    for j in 1 ..< nLen:
+      # (carry, r[j-1]) <- a[i] * b[j] + zi * M[j] + r[j] + carry
      unsafeFMA2(carry, r[j-1], a[i], b[j], zi, M[j], r[j], carry)

    r_hi += carry
--- a/constantine/config/common.nim
+++ b/constantine/config/common.nim
@ -14,16 +14,17 @@

 import ../primitives/constant_time

-type Word* = Ct[uint64]
-  ## Logical BigInt word
-  ## A logical BigInt word is of size physical MachineWord-1
-
-type BaseType* = uint64
-  ## Physical BigInt for conversion in "normal integers"
+type
+  BaseType* = uint64
+    ## Physical BigInt for conversion in "normal integers"
+  Word* = Ct[BaseType]
+    ## Logical BigInt word
+    ## A logical BigInt word is of size physical MachineWord-1

 const
+  ExcessBits = 1
  WordPhysBitSize* = sizeof(Word) * 8
-  WordBitSize* = WordPhysBitSize - 1
+  WordBitSize* = WordPhysBitSize - ExcessBits

  CtTrue* = ctrue(Word)
  CtFalse* = cfalse(Word)
--- a/constantine/primitives/extended_precision.nim
+++ b/constantine/primitives/extended_precision.nim
@ -59,6 +59,16 @@ template unsafeFMA2*(hi, lo: var Ct[uint32], a1, b1, a2, b2, c1, c2: Ct[uint32])
    hi = Ct[uint32](dblPrec shr 31)
    lo = Ct[uint32](dblPrec) and Ct[uint32](1 shl 31 - 1)

+template unsafeFMA2_hi*(hi: var Ct[uint32], a1, b1, a2, b2, c1: Ct[uint32]) =
+  ## Writes to `hi` the high word of the sum of extended precision multiply-adds
+  ## (hi, _) <- a1 * b1 + a2 * b2 + c1   (sum shifted right by 31 bits, low bits dropped)
+  block:
+    # TODO: Can this overflow? (presumably not when every input fits in 31 bits - confirm)
+    let dblPrec = uint64(a1) * uint64(b1) +
+                  uint64(a2) * uint64(b2) +
+                  uint64(c1)
+    hi = Ct[uint32](dblPrec shr 31)
+
 # ############################################################
 #
 #                     64-bit words
@ -126,6 +136,16 @@ when defined(gcc) or defined(clang) or defined(llvm_gcc):
      {.emit:[hi, " = (NU64)(", dblPrec," >> ", 63'u64, ");"].}
      {.emit:[lo, " = (NU64)", dblPrec," & ", (1'u64 shl 63 - 1), ";"].}

+  template unsafeFMA2_hi*(hi: var Ct[uint64], a1, b1, a2, b2, c: Ct[uint64]) =
+    ## Writes to `hi` the high word of the sum of extended precision multiply-adds
+    ## (hi, _) <- a1 * b1 + a2 * b2 + c   (128-bit sum shifted right by 63 bits, low bits dropped)
+    block:
+      var dblPrec: uint128
+      {.emit:[dblPrec, " = (unsigned __int128)", a1," * (unsigned __int128)", b1,
+                       " + (unsigned __int128)", a2," * (unsigned __int128)", b2,
+                       " + (unsigned __int128)", c, ";"].}
+      {.emit:[hi, " = (NU64)(", dblPrec," >> ", 63'u64, ");"].}
+
 else:
  {.error: "Compiler not implemented".}
  # For VCC and ICC use add_carry_u64, _add_carryx_u64 intrinsics