diff --git a/constantine/primitives/extended_precision.nim b/constantine/primitives/extended_precision.nim
index bf2b461..fefdd91 100644
--- a/constantine/primitives/extended_precision.nim
+++ b/constantine/primitives/extended_precision.nim
@@ -159,7 +159,84 @@ when sizeof(int) == 8 and GccCompatible:
       # Don't forget to dereference the var param
       {.emit:["*",hi, " = (NU64)(", dblPrec," >> ", 63'u64, ");"].}
 
+elif sizeof(int) == 8 and defined(vcc):
+  func udiv128(highDividend, lowDividend, divisor: uint64, remainder: var uint64): uint64 {.importc:"_udiv128", header:"<intrin.h>", nodecl.}
+    ## Division 128 by 64, Microsoft only, 64-bit only,
+    ## returns the quotient as return value and the remainder as a var parameter
+    ## Warning ⚠️ :
+    ##   - if n_hi == d, the quotient does not fit in an uint64 and will throw SIGFPE
+    ##   - if n_hi > d the result is undefined
+
+  func unsafeDiv2n1n*(q, r: var Ct[uint64], n_hi, n_lo, d: Ct[uint64]) {.inline.}=
+    ## Division uint128 by uint64
+    ## Warning ⚠️ :
+    ##   - if n_hi == d, the quotient does not fit in an uint64 and will throw SIGFPE
+    ##   - if n_hi > d the result is undefined
+    {.warning: "unsafeDiv2n1n is not constant-time at the moment on most hardware".}
+
+    # TODO !!! - Replace by a constant-time, portable, non-assembly version
+    #          -> use uint128? The compiler might add unwanted branches
+    var q64, r64: uint64
+    q64 = udiv128(uint64(n_hi), uint64(n_lo), uint64(d), r64)
+    q = Ct[uint64](q64)
+    r = Ct[uint64](r64)
+
+  func addcarry_u64(carryIn: cuchar, a, b: uint64, sum: var uint64): cuchar {.importc:"_addcarry_u64", header:"<intrin.h>", nodecl.}
+    ## (CarryOut, Sum) <-- a + b + carryIn
+    ## Available on MSVC and ICC (Clang and GCC have very bad codegen, use uint128 instead)
+    ## The return value is the carry-out
+
+  func umul128(a, b: uint64, hi: var uint64): uint64 {.importc:"_umul128", header:"<intrin.h>", nodecl.}
+    ## (hi, lo) <-- a * b
+    ## The return value is the low word
+
+  func unsafeFMA*(hi, lo: var Ct[uint64], a, b, c: Ct[uint64]) {.inline.}=
+    ## Extended precision multiplication + addition
+    ## This is constant-time on most hardware except some specific ones like the Cortex-M0
+    ## (hi, lo) <- a*b + c
+    var carry: cuchar
+    var hi64, lo64: uint64
+    lo64 = umul128(uint64(a), uint64(b), hi64)
+    carry = addcarry_u64(cuchar(0), lo64, uint64(c), lo64)
+    discard addcarry_u64(carry, hi64, 0, hi64)
+    hi = Ct[uint64](hi64)
+    lo = Ct[uint64](lo64)
+
+  func unsafeFMA2*(hi, lo: var Ct[uint64], a1, b1, a2, b2, c1, c2: Ct[uint64]) {.inline.}=
+    ## (hi, lo) <- a1 * b1 + a2 * b2 + c1 + c2
+    var f1_lo, f1_hi, f2_lo, f2_hi: uint64
+    var hi64, lo64: uint64
+    var carry: cuchar
+
+    f1_lo = umul128(uint64(a1), uint64(b1), f1_hi)
+    f2_lo = umul128(uint64(a2), uint64(b2), f2_hi)
+
+    # On CPUs with ADX we could use addcarryx_u64 (adcx/adox) to get
+    # separate carry chains that the CPU can process in parallel
+
+    # Carry chain 1
+    carry = addcarry_u64(cuchar(0), f1_lo, uint64(c1), f1_lo)
+    discard addcarry_u64(carry, f1_hi, 0, f1_hi)
+
+    # Carry chain 2
+    carry = addcarry_u64(cuchar(0), f2_lo, uint64(c2), f2_lo)
+    discard addcarry_u64(carry, f2_hi, 0, f2_hi)
+
+    # Merge
+    carry = addcarry_u64(cuchar(0), f1_lo, f2_lo, lo64)
+    discard addcarry_u64(carry, f1_hi, f2_hi, hi64)
+    hi = Ct[uint64](hi64)
+    lo = Ct[uint64](lo64)
+
+  func unsafeFMA2_hi*(hi: var Ct[uint64], a1, b1, a2, b2, c: Ct[uint64]) {.inline.}=
+    ## Returns the high word of the sum of extended precision multiply-adds
+    ## (hi, _) <- a1 * b1 + a2 * b2 + c
+
+    var f1_lo, f1_hi, f2_lo, f2_hi: uint64
+    var carry: cuchar
+
+    f1_lo = umul128(uint64(a1), uint64(b1), f1_hi)
+    f2_lo = umul128(uint64(a2), uint64(b2), f2_hi)
+
+    carry = addcarry_u64(cuchar(0), f1_lo, uint64(c), f1_lo)
+    discard addcarry_u64(carry, f1_hi, 0, f1_hi)
+
+    # Merge, only the high word is kept
+    var hi64, lo64: uint64
+    carry = addcarry_u64(cuchar(0), f1_lo, f2_lo, lo64)
+    discard addcarry_u64(carry, f1_hi, f2_hi, hi64)
+    hi = Ct[uint64](hi64)
+
 else:
   {.error: "Compiler not implemented".}
-  # For VCC and ICC use add_carry_u64, _add_carryx_u64 intrinsics
-  # and _umul128
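
Not part of the patch: a minimal usage sketch of the MSVC path added above, under assumptions not stated in the diff, namely a 64-bit MSVC build, that Ct[uint64] is importable from constantine/primitives/constant_time, and illustrative test values chosen here. It computes (hi, lo) = a*b + c with unsafeFMA, then divides that 128-bit value by b with unsafeDiv2n1n; since c < b, the quotient should round-trip to a and the remainder to c.

# Usage sketch only (not part of the diff). Assumes MSVC on x86-64 and that
# Ct[uint64] and the functions above live at the module paths below.
import constantine/primitives/constant_time       # assumed location of Ct[T]
import constantine/primitives/extended_precision

let
  a = Ct[uint64](0xDEADBEEF_DEADBEEF'u64)
  b = Ct[uint64](0x1234_5678_9ABC_DEF0'u64)
  c = Ct[uint64](42'u64)

var hi, lo: Ct[uint64]
unsafeFMA(hi, lo, a, b, c)        # (hi, lo) <- a*b + c

# a < 2^64 and c < b, so the high word is strictly less than b and the
# udiv128 precondition n_hi < d holds (no SIGFPE, no undefined result).
var q, r: Ct[uint64]
unsafeDiv2n1n(q, r, hi, lo, b)    # q <- (a*b + c) div b, r <- (a*b + c) mod b

doAssert uint64(q) == uint64(a)   # quotient round-trips to a
doAssert uint64(r) == uint64(c)   # remainder is c because c < b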