Properly handle 32 bits

2020-03-01 16:18:45 +01:00 · 2020-03-01 16:18:45 +01:00 · c8e482f6d2
parent 3fdd457b52
commit c8e482f6d2
1 changed files with 141 additions and 140 deletions
--- a/constantine/primitives/extended_precision.nim
+++ b/constantine/primitives/extended_precision.nim
@ -75,99 +75,14 @@ func unsafeFMA2_hi*(hi: var Ct[uint32], a1, b1, a2, b2, c1: Ct[uint32]) {.inline
 #
 # ############################################################
-const GccCompatible = defined(gcc) or defined(clang) or defined(llvm_gcc)
+when sizeof(int) == 8:
  const GccCompatible = defined(gcc) or defined(clang) or defined(llvm_gcc)
-when sizeof(int) == 8 and GccCompatible:
+  when GccCompatible:
-  type
+    type
-    uint128*{.importc: "unsigned __int128".} = object
+      uint128*{.importc: "unsigned __int128".} = object
-  func unsafeDiv2n1n*(q, r: var Ct[uint64], n_hi, n_lo, d: Ct[uint64]) {.inline.}=
+    func unsafeDiv2n1n*(q, r: var Ct[uint64], n_hi, n_lo, d: Ct[uint64]) {.inline.}=
    ## Division uint128 by uint64
    ## Warning ⚠️ :
    ##   - if n_hi == d, quotient does not fit in an uint64 and will throw SIGFPE
    ##   - if n_hi > d result is undefined
    {.warning: "unsafeDiv2n1n is not constant-time at the moment on most hardware".}
    # TODO !!! - Replace by constant-time, portable, non-assembly version
    #          -> use uint128? Compiler might add unwanted branches
    # DIV r/m64
    # Divide RDX:RAX (n_hi:n_lo) by r/m64
    #
    # Inputs
    #   - numerator high word in RDX,
    #   - numerator low word in RAX,
    #   - divisor as r/m parameter (register or memory at the compiler discretion)
    # Result
    #   - Quotient in RAX
    #   - Remainder in RDX
    # 1. name the register/memory "divisor"
    # 2. don't forget to dereference the var hidden pointer
    # 3. -
    # 4. no clobbered registers beside explectly used RAX and RDX
    when defined(amd64):
      asm """
        divq %[divisor]
        : "=a" (`*q`), "=d" (`*r`)
        : "d" (`n_hi`), "a" (`n_lo`), [divisor] "rm" (`d`)
        :
      """
    else:
      var dblPrec {.noInit.}: uint128
      {.emit:[dblPrec, " = (unsigned __int128)", n_hi," << 64 | (unsigned __int128)",n_lo,";"].}
      # Don't forget to dereference the var param
      {.emit:["*",q, " = (NU64)(", dblPrec," / ", d, ");"].}
      {.emit:["*",r, " = (NU64)(", dblPrec," % ", d, ");"].}
  func unsafeFMA*(hi, lo: var Ct[uint64], a, b, c: Ct[uint64]) {.inline.}=
    ## Extended precision multiplication + addition
    ## This is constant-time on most hardware except some specific one like Cortex M0
    ## (hi, lo) <- a*b + c
    block:
      # Note: since a and b use 63-bit,
      # the result is 126-bit and carrying cannot overflow
      var dblPrec {.noInit.}: uint128
      {.emit:[dblPrec, " = (unsigned __int128)", a," * (unsigned __int128)", b, " + (unsigned __int128)",c,";"].}
      # Don't forget to dereference the var param
      {.emit:["*",hi, " = (NU64)(", dblPrec," >> ", 63'u64, ");"].}
      {.emit:["*",lo, " = (NU64)", dblPrec," & ", 1'u64 shl 63 - 1, ";"].}
  func unsafeFMA2*(hi, lo: var Ct[uint64], a1, b1, a2, b2, c1, c2: Ct[uint64]) {.inline.}=
    ## (hi, lo) <- a1 * b1 + a2 * b2 + c1 + c2
    block:
      # TODO: Can this overflow?
      var dblPrec: uint128
      {.emit:[dblPrec, " = (unsigned __int128)", a1," * (unsigned __int128)", b1,
                       " + (unsigned __int128)", a2," * (unsigned __int128)", b2,
                       " + (unsigned __int128)", c1,
                       " + (unsigned __int128)", c2, ";"].}
      # Don't forget to dereference the var param
      {.emit:["*",hi, " = (NU64)(", dblPrec," >> ", 63'u64, ");"].}
      {.emit:["*",lo, " = (NU64)", dblPrec," & ", (1'u64 shl 63 - 1), ";"].}
  func unsafeFMA2_hi*(hi: var Ct[uint64], a1, b1, a2, b2, c: Ct[uint64]) {.inline.}=
    ## Returns the high word of the sum of extended precision multiply-adds
    ## (hi, _) <- a1 * b1 + a2 * b2 + c
    block:
      var dblPrec: uint128
      {.emit:[dblPrec, " = (unsigned __int128)", a1," * (unsigned __int128)", b1,
                       " + (unsigned __int128)", a2," * (unsigned __int128)", b2,
                       " + (unsigned __int128)", c, ";"].}
      # Don't forget to dereference the var param
      {.emit:["*",hi, " = (NU64)(", dblPrec," >> ", 63'u64, ");"].}
 elif sizeof(int) == 8 and defined(vcc):
  func udiv128(highDividend, lowDividend, divisor: uint64, remainder: var uint64): uint64 {.importc:"_udiv128", header: "<immintrin.h>", nodecl.}
    ## Division 128 by 64, Microsoft only, 64-bit only,
    ## returns quotient as return value remainder as var parameter
    ## Warning ⚠️ :
    ##   - if n_hi == d, quotient does not fit in an uint64 and will throw SIGFPE
    ##   - if n_hi > d result is undefined
  func unsafeDiv2n1n*(q, r: var Ct[uint64], n_hi, n_lo, d: Ct[uint64]) {.inline.}=
      ## Division uint128 by uint64
      ## Warning ⚠️ :
      ##   - if n_hi == d, quotient does not fit in an uint64 and will throw SIGFPE
@ -176,67 +91,153 @@ elif sizeof(int) == 8 and defined(vcc):
      # TODO !!! - Replace by constant-time, portable, non-assembly version
      #          -> use uint128? Compiler might add unwanted branches
      q = udiv128(n_hi, n_lo, d, r)
-  func addcarry_u64(carryIn: cuchar, a, b: uint64, sum: var uint64): cuchar {.importc:"_addcarry_u64", header:"<intrin.h>", nodecl.}
+      # DIV r/m64
-    ## (CarryOut, Sum) <-- a + b
+      # Divide RDX:RAX (n_hi:n_lo) by r/m64
-    ## Available on MSVC and ICC (Clang and GCC have very bad codegen, use uint128 instead)
+      #
-    ## Return value is the carry-out
+      # Inputs
      #   - numerator high word in RDX,
      #   - numerator low word in RAX,
      #   - divisor as r/m parameter (register or memory at the compiler discretion)
      # Result
      #   - Quotient in RAX
      #   - Remainder in RDX
-  func umul128(a, b: uint64, hi: var uint64): uint64 {.importc:"_umul128", header:"<intrin.h>", nodecl.}
+      # 1. name the register/memory "divisor"
-    ## (hi, lo) <-- a * b
+      # 2. don't forget to dereference the var hidden pointer
-    ## Return value is the low word
+      # 3. -
      # 4. no clobbered registers beside explectly used RAX and RDX
      when defined(amd64):
        asm """
          divq %[divisor]
          : "=a" (`*q`), "=d" (`*r`)
          : "d" (`n_hi`), "a" (`n_lo`), [divisor] "rm" (`d`)
          :
        """
      else:
        var dblPrec {.noInit.}: uint128
        {.emit:[dblPrec, " = (unsigned __int128)", n_hi," << 64 | (unsigned __int128)",n_lo,";"].}
-  func unsafeFMA*(hi, lo: var Ct[uint64], a, b, c: Ct[uint64]) {.inline.}=
+        # Don't forget to dereference the var param
-    ## Extended precision multiplication + addition
+        {.emit:["*",q, " = (NU64)(", dblPrec," / ", d, ");"].}
-    ## This is constant-time on most hardware except some specific one like Cortex M0
+        {.emit:["*",r, " = (NU64)(", dblPrec," % ", d, ");"].}
    ## (hi, lo) <- a*b + c
    var carry: cuchar
    var hi, lo: uint64
    lo = umul128(uint64(a), uint64(b), hi)
    carry = addcarry_u64(cuchar(0), lo, uint64(c), lo)
    discard addcarry_u64(carry, hi, 0, hi)
-  func unsafeFMA2*(hi, lo: var Ct[uint64], a1, b1, a2, b2, c1, c2: Ct[uint64]) {.inline.}=
+    func unsafeFMA*(hi, lo: var Ct[uint64], a, b, c: Ct[uint64]) {.inline.}=
-    ## (hi, lo) <- a1 * b1 + a2 * b2 + c1 + c2
+      ## Extended precision multiplication + addition
-    var f1_lo, f1_hi, f2_lo, f2_hi: uint64
+      ## This is constant-time on most hardware except some specific one like Cortex M0
-    var carry: cuchar
+      ## (hi, lo) <- a*b + c
      block:
        # Note: since a and b use 63-bit,
        # the result is 126-bit and carrying cannot overflow
        var dblPrec {.noInit.}: uint128
        {.emit:[dblPrec, " = (unsigned __int128)", a," * (unsigned __int128)", b, " + (unsigned __int128)",c,";"].}
-    f1_lo = umul128(uint64(a1), uint64(b1), f1_hi)
+        # Don't forget to dereference the var param
-    f2_lo = umul128(uint64(a2), uint64(b2), f2_hi)
+        {.emit:["*",hi, " = (NU64)(", dblPrec," >> ", 63'u64, ");"].}
        {.emit:["*",lo, " = (NU64)", dblPrec," & ", 1'u64 shl 63 - 1, ";"].}
-    # On CPU with ADX: we can use addcarryx_u64 (adcx/adox) to have
+    func unsafeFMA2*(hi, lo: var Ct[uint64], a1, b1, a2, b2, c1, c2: Ct[uint64]) {.inline.}=
-    # separate carry chains that can be processed in parallel by CPU
+      ## (hi, lo) <- a1 * b1 + a2 * b2 + c1 + c2
      block:
        # TODO: Can this overflow?
        var dblPrec: uint128
        {.emit:[dblPrec, " = (unsigned __int128)", a1," * (unsigned __int128)", b1,
                        " + (unsigned __int128)", a2," * (unsigned __int128)", b2,
                        " + (unsigned __int128)", c1,
                        " + (unsigned __int128)", c2, ";"].}
        # Don't forget to dereference the var param
        {.emit:["*",hi, " = (NU64)(", dblPrec," >> ", 63'u64, ");"].}
        {.emit:["*",lo, " = (NU64)", dblPrec," & ", (1'u64 shl 63 - 1), ";"].}
-    # Carry chain 1
+    func unsafeFMA2_hi*(hi: var Ct[uint64], a1, b1, a2, b2, c: Ct[uint64]) {.inline.}=
-    carry = addcarry_u64(cuchar(0), f1_lo, uint64(c1), f1_lo)
+      ## Returns the high word of the sum of extended precision multiply-adds
-    discard addcarry_u64(carry, f1_hi, 0, f1_hi)
+      ## (hi, _) <- a1 * b1 + a2 * b2 + c
      block:
        var dblPrec: uint128
        {.emit:[dblPrec, " = (unsigned __int128)", a1," * (unsigned __int128)", b1,
                        " + (unsigned __int128)", a2," * (unsigned __int128)", b2,
                        " + (unsigned __int128)", c, ";"].}
        # Don't forget to dereference the var param
        {.emit:["*",hi, " = (NU64)(", dblPrec," >> ", 63'u64, ");"].}
-    # Carry chain 2
+  elif defined(vcc):
-    carry = addcarry_u64(cuchar(0), f2_lo, uint64(c2), f2_lo)
+    func udiv128(highDividend, lowDividend, divisor: uint64, remainder: var uint64): uint64 {.importc:"_udiv128", header: "<immintrin.h>", nodecl.}
-    discard addcarry_u64(carry, f2_hi, 0, f2_hi)
+      ## Division 128 by 64, Microsoft only, 64-bit only,
      ## returns quotient as return value remainder as var parameter
      ## Warning ⚠️ :
      ##   - if n_hi == d, quotient does not fit in an uint64 and will throw SIGFPE
      ##   - if n_hi > d result is undefined
-    # Merge
+    func unsafeDiv2n1n*(q, r: var Ct[uint64], n_hi, n_lo, d: Ct[uint64]) {.inline.}=
-    carry = addcarry_u64(cuchar(0), f1_lo, f2_lo, lo)
+        ## Division uint128 by uint64
-    discard addcarry_u64(carry, f1_hi, f2_hi, hi)
+        ## Warning ⚠️ :
        ##   - if n_hi == d, quotient does not fit in an uint64 and will throw SIGFPE
        ##   - if n_hi > d result is undefined
        {.warning: "unsafeDiv2n1n is not constant-time at the moment on most hardware".}
-  func unsafeFMA2_hi*(hi: var Ct[uint64], a1, b1, a2, b2, c: Ct[uint64]) {.inline.}=
+        # TODO !!! - Replace by constant-time, portable, non-assembly version
-    ## Returns the high word of the sum of extended precision multiply-adds
+        #          -> use uint128? Compiler might add unwanted branches
-    ## (hi, _) <- a1 * b1 + a2 * b2 + c
+        q = udiv128(n_hi, n_lo, d, r)
-    var f1_lo, f1_hi, f2_lo, f2_hi: uint64
+    func addcarry_u64(carryIn: cuchar, a, b: uint64, sum: var uint64): cuchar {.importc:"_addcarry_u64", header:"<intrin.h>", nodecl.}
-    var carry: cuchar
+      ## (CarryOut, Sum) <-- a + b
      ## Available on MSVC and ICC (Clang and GCC have very bad codegen, use uint128 instead)
      ## Return value is the carry-out
-    f1_lo = umul128(uint64(a1), uint64(b1), f1_hi)
+    func umul128(a, b: uint64, hi: var uint64): uint64 {.importc:"_umul128", header:"<intrin.h>", nodecl.}
-    f2_lo = umul128(uint64(a2), uint64(b2), f2_hi)
+      ## (hi, lo) <-- a * b
      ## Return value is the low word
-    carry = addcarry_u64(cuchar(0), f1_lo, uint64(c), f1_lo)
+    func unsafeFMA*(hi, lo: var Ct[uint64], a, b, c: Ct[uint64]) {.inline.}=
-    discard addcarry_u64(carry, f1_hi, 0, f1_hi)
+      ## Extended precision multiplication + addition
      ## This is constant-time on most hardware except some specific one like Cortex M0
      ## (hi, lo) <- a*b + c
      var carry: cuchar
      var hi, lo: uint64
      lo = umul128(uint64(a), uint64(b), hi)
      carry = addcarry_u64(cuchar(0), lo, uint64(c), lo)
      discard addcarry_u64(carry, hi, 0, hi)
-    # Merge
+    func unsafeFMA2*(hi, lo: var Ct[uint64], a1, b1, a2, b2, c1, c2: Ct[uint64]) {.inline.}=
-    var lo: uint64
+      ## (hi, lo) <- a1 * b1 + a2 * b2 + c1 + c2
-    carry = addcarry_u64(cuchar(0), f1_lo, f2_lo, lo)
+      var f1_lo, f1_hi, f2_lo, f2_hi: uint64
-    discard addcarry_u64(carry, f1_hi, f2_hi, hi)
+      var carry: cuchar
-else:
+      f1_lo = umul128(uint64(a1), uint64(b1), f1_hi)
-  {.error: "Compiler not implemented".}
+      f2_lo = umul128(uint64(a2), uint64(b2), f2_hi)
      # On CPU with ADX: we can use addcarryx_u64 (adcx/adox) to have
      # separate carry chains that can be processed in parallel by CPU
      # Carry chain 1
      carry = addcarry_u64(cuchar(0), f1_lo, uint64(c1), f1_lo)
      discard addcarry_u64(carry, f1_hi, 0, f1_hi)
      # Carry chain 2
      carry = addcarry_u64(cuchar(0), f2_lo, uint64(c2), f2_lo)
      discard addcarry_u64(carry, f2_hi, 0, f2_hi)
      # Merge
      carry = addcarry_u64(cuchar(0), f1_lo, f2_lo, lo)
      discard addcarry_u64(carry, f1_hi, f2_hi, hi)
    func unsafeFMA2_hi*(hi: var Ct[uint64], a1, b1, a2, b2, c: Ct[uint64]) {.inline.}=
      ## Returns the high word of the sum of extended precision multiply-adds
      ## (hi, _) <- a1 * b1 + a2 * b2 + c
      var f1_lo, f1_hi, f2_lo, f2_hi: uint64
      var carry: cuchar
      f1_lo = umul128(uint64(a1), uint64(b1), f1_hi)
      f2_lo = umul128(uint64(a2), uint64(b2), f2_hi)
      carry = addcarry_u64(cuchar(0), f1_lo, uint64(c), f1_lo)
      discard addcarry_u64(carry, f1_hi, 0, f1_hi)
      # Merge
      var lo: uint64
      carry = addcarry_u64(cuchar(0), f1_lo, f2_lo, lo)
      discard addcarry_u64(carry, f1_hi, f2_hi, hi)
  else:
    {.error: "Compiler not implemented".}