From c8e482f6d2e1e7d8492bd32dd658cb397fe51f38 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mamy=20Andr=C3=A9-Ratsimbazafy?=
Date: Sun, 1 Mar 2020 16:18:45 +0100
Subject: [PATCH] Properly handle 32 bits

---
 constantine/primitives/extended_precision.nim | 281 +++++++++---------
 1 file changed, 141 insertions(+), 140 deletions(-)

diff --git a/constantine/primitives/extended_precision.nim b/constantine/primitives/extended_precision.nim
index fefdd91..e34562d 100644
--- a/constantine/primitives/extended_precision.nim
+++ b/constantine/primitives/extended_precision.nim
@@ -75,99 +75,14 @@ func unsafeFMA2_hi*(hi: var Ct[uint32], a1, b1, a2, b2, c1: Ct[uint32]) {.inline
 #
 # ############################################################
 
-const GccCompatible = defined(gcc) or defined(clang) or defined(llvm_gcc)
+when sizeof(int) == 8:
+  const GccCompatible = defined(gcc) or defined(clang) or defined(llvm_gcc)
 
-when sizeof(int) == 8 and GccCompatible:
-  type
-    uint128*{.importc: "unsigned __int128".} = object
+  when GccCompatible:
+    type
+      uint128*{.importc: "unsigned __int128".} = object
 
-  func unsafeDiv2n1n*(q, r: var Ct[uint64], n_hi, n_lo, d: Ct[uint64]) {.inline.}=
-    ## Division uint128 by uint64
-    ## Warning ⚠️ :
-    ##   - if n_hi == d, the quotient does not fit in a uint64 and will throw SIGFPE
-    ##   - if n_hi > d the result is undefined
-    {.warning: "unsafeDiv2n1n is not constant-time at the moment on most hardware".}
-
-    # TODO !!! - Replace by constant-time, portable, non-assembly version
-    #           -> use uint128? Compiler might add unwanted branches
-
-    # DIV r/m64
-    # Divide RDX:RAX (n_hi:n_lo) by r/m64
-    #
-    # Inputs
-    #   - numerator high word in RDX,
-    #   - numerator low word in RAX,
-    #   - divisor as r/m parameter (register or memory at the compiler's discretion)
-    # Result
-    #   - Quotient in RAX
-    #   - Remainder in RDX
-
-    # 1. name the register/memory "divisor"
-    # 2. don't forget to dereference the var hidden pointer
-    # 3. -
-    # 4. no clobbered registers besides the explicitly used RAX and RDX
-    when defined(amd64):
-      asm """
-        divq %[divisor]
-        : "=a" (`*q`), "=d" (`*r`)
-        : "d" (`n_hi`), "a" (`n_lo`), [divisor] "rm" (`d`)
-        :
-      """
-    else:
-      var dblPrec {.noInit.}: uint128
-      {.emit:[dblPrec, " = (unsigned __int128)", n_hi," << 64 | (unsigned __int128)",n_lo,";"].}
-
-      # Don't forget to dereference the var param
-      {.emit:["*",q, " = (NU64)(", dblPrec," / ", d, ");"].}
-      {.emit:["*",r, " = (NU64)(", dblPrec," % ", d, ");"].}
-
-  func unsafeFMA*(hi, lo: var Ct[uint64], a, b, c: Ct[uint64]) {.inline.}=
-    ## Extended precision multiplication + addition
-    ## This is constant-time on most hardware except some specific ones like the Cortex-M0
-    ## (hi, lo) <- a*b + c
-    block:
-      # Note: since a and b use 63 bits,
-      # the result is 126-bit and carrying cannot overflow
-      var dblPrec {.noInit.}: uint128
-      {.emit:[dblPrec, " = (unsigned __int128)", a," * (unsigned __int128)", b, " + (unsigned __int128)",c,";"].}
-
-      # Don't forget to dereference the var param
-      {.emit:["*",hi, " = (NU64)(", dblPrec," >> ", 63'u64, ");"].}
-      {.emit:["*",lo, " = (NU64)", dblPrec," & ", 1'u64 shl 63 - 1, ";"].}
-
-  func unsafeFMA2*(hi, lo: var Ct[uint64], a1, b1, a2, b2, c1, c2: Ct[uint64]) {.inline.}=
-    ## (hi, lo) <- a1 * b1 + a2 * b2 + c1 + c2
-    block:
-      # TODO: Can this overflow?
-      var dblPrec: uint128
-      {.emit:[dblPrec, " = (unsigned __int128)", a1," * (unsigned __int128)", b1,
-               " + (unsigned __int128)", a2," * (unsigned __int128)", b2,
-               " + (unsigned __int128)", c1,
-               " + (unsigned __int128)", c2, ";"].}
-      # Don't forget to dereference the var param
-      {.emit:["*",hi, " = (NU64)(", dblPrec," >> ", 63'u64, ");"].}
-      {.emit:["*",lo, " = (NU64)", dblPrec," & ", (1'u64 shl 63 - 1), ";"].}
-
-  func unsafeFMA2_hi*(hi: var Ct[uint64], a1, b1, a2, b2, c: Ct[uint64]) {.inline.}=
-    ## Returns the high word of the sum of extended precision multiply-adds
-    ## (hi, _) <- a1 * b1 + a2 * b2 + c
-    block:
-      var dblPrec: uint128
-      {.emit:[dblPrec, " = (unsigned __int128)", a1," * (unsigned __int128)", b1,
-               " + (unsigned __int128)", a2," * (unsigned __int128)", b2,
-               " + (unsigned __int128)", c, ";"].}
-      # Don't forget to dereference the var param
-      {.emit:["*",hi, " = (NU64)(", dblPrec," >> ", 63'u64, ");"].}
-
-elif sizeof(int) == 8 and defined(vcc):
-  func udiv128(highDividend, lowDividend, divisor: uint64, remainder: var uint64): uint64 {.importc:"_udiv128", header:"<intrin.h>", nodecl.}
-    ## 128-bit by 64-bit division, Microsoft only, 64-bit only,
-    ## returns the quotient as the return value and the remainder as a var parameter
-    ## Warning ⚠️ :
-    ##   - if n_hi == d, the quotient does not fit in a uint64 and will throw SIGFPE
-    ##   - if n_hi > d the result is undefined
-
-  func unsafeDiv2n1n*(q, r: var Ct[uint64], n_hi, n_lo, d: Ct[uint64]) {.inline.}=
+    func unsafeDiv2n1n*(q, r: var Ct[uint64], n_hi, n_lo, d: Ct[uint64]) {.inline.}=
       ## Division uint128 by uint64
       ## Warning ⚠️ :
       ##   - if n_hi == d, the quotient does not fit in a uint64 and will throw SIGFPE
      ##   - if n_hi > d the result is undefined
@@ -176,67 +91,153 @@ elif sizeof(int) == 8 and defined(vcc):
       {.warning: "unsafeDiv2n1n is not constant-time at the moment on most hardware".}
 
       # TODO !!! - Replace by constant-time, portable, non-assembly version
       #           -> use uint128? Compiler might add unwanted branches
-    q = udiv128(n_hi, n_lo, d, r)
 
-  func addcarry_u64(carryIn: cuchar, a, b: uint64, sum: var uint64): cuchar {.importc:"_addcarry_u64", header:"<intrin.h>", nodecl.}
-    ## (CarryOut, Sum) <-- a + b
-    ## Available on MSVC and ICC (Clang and GCC have very bad codegen, use uint128 instead)
-    ## Return value is the carry-out
+      # DIV r/m64
+      # Divide RDX:RAX (n_hi:n_lo) by r/m64
+      #
+      # Inputs
+      #   - numerator high word in RDX,
+      #   - numerator low word in RAX,
+      #   - divisor as r/m parameter (register or memory at the compiler's discretion)
+      # Result
+      #   - Quotient in RAX
+      #   - Remainder in RDX
 
-  func umul128(a, b: uint64, hi: var uint64): uint64 {.importc:"_umul128", header:"<intrin.h>", nodecl.}
-    ## (hi, lo) <-- a * b
-    ## Return value is the low word
+      # 1. name the register/memory "divisor"
+      # 2. don't forget to dereference the var hidden pointer
+      # 3. -
+      # 4. no clobbered registers besides the explicitly used RAX and RDX
+      when defined(amd64):
+        asm """
+          divq %[divisor]
+          : "=a" (`*q`), "=d" (`*r`)
+          : "d" (`n_hi`), "a" (`n_lo`), [divisor] "rm" (`d`)
+          :
+        """
+      else:
+        var dblPrec {.noInit.}: uint128
+        {.emit:[dblPrec, " = (unsigned __int128)", n_hi," << 64 | (unsigned __int128)",n_lo,";"].}
 
-  func unsafeFMA*(hi, lo: var Ct[uint64], a, b, c: Ct[uint64]) {.inline.}=
-    ## Extended precision multiplication + addition
-    ## This is constant-time on most hardware except some specific ones like the Cortex-M0
-    ## (hi, lo) <- a*b + c
-    var carry: cuchar
-    # Locals are named hi64/lo64 on purpose: naming them hi/lo would shadow
-    # the var parameters and silently discard the result.
-    var hi64, lo64: uint64
-    lo64 = umul128(uint64(a), uint64(b), hi64)
-    carry = addcarry_u64(cuchar(0), lo64, uint64(c), lo64)
-    discard addcarry_u64(carry, hi64, 0, hi64)
-    hi = Ct[uint64](hi64)
-    lo = Ct[uint64](lo64)
 
+        # Don't forget to dereference the var param
+        {.emit:["*",q, " = (NU64)(", dblPrec," / ", d, ");"].}
+        {.emit:["*",r, " = (NU64)(", dblPrec," % ", d, ");"].}
 
-  func unsafeFMA2*(hi, lo: var Ct[uint64], a1, b1, a2, b2, c1, c2: Ct[uint64]) {.inline.}=
-    ## (hi, lo) <- a1 * b1 + a2 * b2 + c1 + c2
-    var f1_lo, f1_hi, f2_lo, f2_hi: uint64
-    var carry: cuchar
 
+    func unsafeFMA*(hi, lo: var Ct[uint64], a, b, c: Ct[uint64]) {.inline.}=
+      ## Extended precision multiplication + addition
+      ## This is constant-time on most hardware except some specific ones like the Cortex-M0
+      ## (hi, lo) <- a*b + c
+      block:
+        # Note: since a and b use 63 bits,
+        # the result is 126-bit and carrying cannot overflow
+        var dblPrec {.noInit.}: uint128
+        {.emit:[dblPrec, " = (unsigned __int128)", a," * (unsigned __int128)", b, " + (unsigned __int128)",c,";"].}
 
-    f1_lo = umul128(uint64(a1), uint64(b1), f1_hi)
-    f2_lo = umul128(uint64(a2), uint64(b2), f2_hi)
 
+        # Don't forget to dereference the var param
+        {.emit:["*",hi, " = (NU64)(", dblPrec," >> ", 63'u64, ");"].}
+        {.emit:["*",lo, " = (NU64)", dblPrec," & ", 1'u64 shl 63 - 1, ";"].}
 
-    # On CPU with ADX: we can use addcarryx_u64 (adcx/adox) to have
-    # separate carry chains that can be processed in parallel by CPU
 
+    func unsafeFMA2*(hi, lo: var Ct[uint64], a1, b1, a2, b2, c1, c2: Ct[uint64]) {.inline.}=
+      ## (hi, lo) <- a1 * b1 + a2 * b2 + c1 + c2
+      block:
+        # TODO: Can this overflow?
+        var dblPrec: uint128
+        {.emit:[dblPrec, " = (unsigned __int128)", a1," * (unsigned __int128)", b1,
+                 " + (unsigned __int128)", a2," * (unsigned __int128)", b2,
+                 " + (unsigned __int128)", c1,
+                 " + (unsigned __int128)", c2, ";"].}
+        # Don't forget to dereference the var param
+        {.emit:["*",hi, " = (NU64)(", dblPrec," >> ", 63'u64, ");"].}
+        {.emit:["*",lo, " = (NU64)", dblPrec," & ", (1'u64 shl 63 - 1), ";"].}
 
-    # Carry chain 1
-    carry = addcarry_u64(cuchar(0), f1_lo, uint64(c1), f1_lo)
-    discard addcarry_u64(carry, f1_hi, 0, f1_hi)
 
+    func unsafeFMA2_hi*(hi: var Ct[uint64], a1, b1, a2, b2, c: Ct[uint64]) {.inline.}=
+      ## Returns the high word of the sum of extended precision multiply-adds
+      ## (hi, _) <- a1 * b1 + a2 * b2 + c
+      block:
+        var dblPrec: uint128
+        {.emit:[dblPrec, " = (unsigned __int128)", a1," * (unsigned __int128)", b1,
+                 " + (unsigned __int128)", a2," * (unsigned __int128)", b2,
+                 " + (unsigned __int128)", c, ";"].}
+        # Don't forget to dereference the var param
+        {.emit:["*",hi, " = (NU64)(", dblPrec," >> ", 63'u64, ");"].}
 
-    # Carry chain 2
-    carry = addcarry_u64(cuchar(0), f2_lo, uint64(c2), f2_lo)
-    discard addcarry_u64(carry, f2_hi, 0, f2_hi)
 
+  elif defined(vcc):
+    func udiv128(highDividend, lowDividend, divisor: uint64, remainder: var uint64): uint64 {.importc:"_udiv128", header:"<intrin.h>", nodecl.}
+      ## 128-bit by 64-bit division, Microsoft only, 64-bit only,
+      ## returns the quotient as the return value and the remainder as a var parameter
+      ## Warning ⚠️ :
+      ##   - if n_hi == d, the quotient does not fit in a uint64 and will throw SIGFPE
+      ##   - if n_hi > d the result is undefined
 
-    # Merge (var type conversions: the parameters are Ct[uint64], the intrinsic expects uint64)
-    carry = addcarry_u64(cuchar(0), f1_lo, f2_lo, uint64(lo))
-    discard addcarry_u64(carry, f1_hi, f2_hi, uint64(hi))
 
+    func unsafeDiv2n1n*(q, r: var Ct[uint64], n_hi, n_lo, d: Ct[uint64]) {.inline.}=
+      ## Division uint128 by uint64
+      ## Warning ⚠️ :
+      ##   - if n_hi == d, the quotient does not fit in a uint64 and will throw SIGFPE
+      ##   - if n_hi > d the result is undefined
+      {.warning: "unsafeDiv2n1n is not constant-time at the moment on most hardware".}
 
-  func unsafeFMA2_hi*(hi: var Ct[uint64], a1, b1, a2, b2, c: Ct[uint64]) {.inline.}=
-    ## Returns the high word of the sum of extended precision multiply-adds
-    ## (hi, _) <- a1 * b1 + a2 * b2 + c
 
+      # TODO !!! - Replace by constant-time, portable, non-assembly version
+      #           -> use uint128? Compiler might add unwanted branches
+      q = udiv128(n_hi, n_lo, d, r)
 
-    var f1_lo, f1_hi, f2_lo, f2_hi: uint64
-    var carry: cuchar
 
+    func addcarry_u64(carryIn: cuchar, a, b: uint64, sum: var uint64): cuchar {.importc:"_addcarry_u64", header:"<intrin.h>", nodecl.}
+      ## (CarryOut, Sum) <-- a + b
+      ## Available on MSVC and ICC (Clang and GCC have very bad codegen, use uint128 instead)
+      ## Return value is the carry-out
 
-    f1_lo = umul128(uint64(a1), uint64(b1), f1_hi)
-    f2_lo = umul128(uint64(a2), uint64(b2), f2_hi)
 
+    func umul128(a, b: uint64, hi: var uint64): uint64 {.importc:"_umul128", header:"<intrin.h>", nodecl.}
+      ## (hi, lo) <-- a * b
+      ## Return value is the low word
 
-    carry = addcarry_u64(cuchar(0), f1_lo, uint64(c), f1_lo)
-    discard addcarry_u64(carry, f1_hi, 0, f1_hi)
 
+    func unsafeFMA*(hi, lo: var Ct[uint64], a, b, c: Ct[uint64]) {.inline.}=
+      ## Extended precision multiplication + addition
+      ## This is constant-time on most hardware except some specific ones like the Cortex-M0
+      ## (hi, lo) <- a*b + c
+      var carry: cuchar
+      # Locals are named hi64/lo64 on purpose: naming them hi/lo would shadow
+      # the var parameters and silently discard the result.
+      var hi64, lo64: uint64
+      lo64 = umul128(uint64(a), uint64(b), hi64)
+      carry = addcarry_u64(cuchar(0), lo64, uint64(c), lo64)
+      discard addcarry_u64(carry, hi64, 0, hi64)
+      hi = Ct[uint64](hi64)
+      lo = Ct[uint64](lo64)
 
-    # Merge
-    var lo: uint64
-    carry = addcarry_u64(cuchar(0), f1_lo, f2_lo, lo)
-    discard addcarry_u64(carry, f1_hi, f2_hi, uint64(hi))
 
+    func unsafeFMA2*(hi, lo: var Ct[uint64], a1, b1, a2, b2, c1, c2: Ct[uint64]) {.inline.}=
+      ## (hi, lo) <- a1 * b1 + a2 * b2 + c1 + c2
+      var f1_lo, f1_hi, f2_lo, f2_hi: uint64
+      var carry: cuchar
+
+      f1_lo = umul128(uint64(a1), uint64(b1), f1_hi)
+      f2_lo = umul128(uint64(a2), uint64(b2), f2_hi)
+
+      # On CPU with ADX: we can use addcarryx_u64 (adcx/adox) to have
+      # separate carry chains that can be processed in parallel by CPU
+
+      # Carry chain 1
+      carry = addcarry_u64(cuchar(0), f1_lo, uint64(c1), f1_lo)
+      discard addcarry_u64(carry, f1_hi, 0, f1_hi)
+
+      # Carry chain 2
+      carry = addcarry_u64(cuchar(0), f2_lo, uint64(c2), f2_lo)
+      discard addcarry_u64(carry, f2_hi, 0, f2_hi)
+
+      # Merge (var type conversions: the parameters are Ct[uint64], the intrinsic expects uint64)
+      carry = addcarry_u64(cuchar(0), f1_lo, f2_lo, uint64(lo))
+      discard addcarry_u64(carry, f1_hi, f2_hi, uint64(hi))
+
+    func unsafeFMA2_hi*(hi: var Ct[uint64], a1, b1, a2, b2, c: Ct[uint64]) {.inline.}=
+      ## Returns the high word of the sum of extended precision multiply-adds
+      ## (hi, _) <- a1 * b1 + a2 * b2 + c
+
+      var f1_lo, f1_hi, f2_lo, f2_hi: uint64
+      var carry: cuchar
+
+      f1_lo = umul128(uint64(a1), uint64(b1), f1_hi)
+      f2_lo = umul128(uint64(a2), uint64(b2), f2_hi)
+
+      carry = addcarry_u64(cuchar(0), f1_lo, uint64(c), f1_lo)
+      discard addcarry_u64(carry, f1_hi, 0, f1_hi)
 
+      # Merge
+      var lo: uint64
+      carry = addcarry_u64(cuchar(0), f1_lo, f2_lo, lo)
+      discard addcarry_u64(carry, f1_hi, f2_hi, uint64(hi))
 
+  else:
+    {.error: "Compiler not implemented".}
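
Not part of the patch itself, but a usage note for reviewers: the GCC/Clang uint128 path and the MSVC intrinsics path expose the same caller contract, and this patch only nests that contract under `when sizeof(int) == 8:`. Below is a minimal Nim sketch of how a caller exercises it on a 64-bit GCC-compatible target. The import path and the `Ct[uint64]` constructor are assumptions based on this repository's layout (`Ct` comes from the constant-time primitives module, not from this file), so treat it as illustrative only.

# Illustrative sketch only - the import path and the Ct constructor are
# assumptions, not verified against the repository.
import constantine/primitives/extended_precision

var hi, lo: Ct[uint64]
let
  a = Ct[uint64](0x7FFF_FFFF_FFFF_FFFF'u64)  # 2^63 - 1, a full 63-bit limb
  b = Ct[uint64](3'u64)
  c = Ct[uint64](5'u64)

# (hi, lo) <- a*b + c. Note the 63-bit limb convention documented above:
# lo receives the low 63 bits, hi everything from bit 63 upward, so a
# 126-bit product plus carry cannot overflow.
unsafeFMA(hi, lo, a, b, c)

# 128-bit by 64-bit division of (n_hi:n_lo) by d; requires n_hi < d or the
# CPU raises SIGFPE. Here 2^64 divided by 16 gives quotient 2^60, remainder 0.
var q, r: Ct[uint64]
unsafeDiv2n1n(q, r, Ct[uint64](1'u64), Ct[uint64](0'u64), Ct[uint64](16'u64))
doAssert uint64(q) == 1'u64 shl 60
doAssert uint64(r) == 0'u64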