mirror of
https://github.com/codex-storage/constantine.git
synced 2025-01-30 04:27:54 +00:00
Add MSVC compiler support for 128-bit div/mul (untested)
This commit is contained in:
parent
dfb007e6bc
commit
3fdd457b52
@ -159,7 +159,84 @@ when sizeof(int) == 8 and GccCompatible:
|
||||
# Don't forget to dereference the var param
|
||||
{.emit:["*",hi, " = (NU64)(", dblPrec," >> ", 63'u64, ");"].}
|
||||
|
||||
elif sizeof(int) == 8 and defined(vcc):
|
||||
func udiv128(highDividend, lowDividend, divisor: uint64, remainder: var uint64): uint64 {.importc:"_udiv128", header: "<immintrin.h>", nodecl.}
|
||||
## Division 128 by 64, Microsoft only, 64-bit only,
|
||||
## returns quotient as return value remainder as var parameter
|
||||
## Warning ⚠️ :
|
||||
## - if n_hi == d, quotient does not fit in an uint64 and will throw SIGFPE
|
||||
## - if n_hi > d result is undefined
|
||||
|
||||
func unsafeDiv2n1n*(q, r: var Ct[uint64], n_hi, n_lo, d: Ct[uint64]) {.inline.}=
|
||||
## Division uint128 by uint64
|
||||
## Warning ⚠️ :
|
||||
## - if n_hi == d, quotient does not fit in an uint64 and will throw SIGFPE
|
||||
## - if n_hi > d result is undefined
|
||||
{.warning: "unsafeDiv2n1n is not constant-time at the moment on most hardware".}
|
||||
|
||||
# TODO !!! - Replace by constant-time, portable, non-assembly version
|
||||
# -> use uint128? Compiler might add unwanted branches
|
||||
q = udiv128(n_hi, n_lo, d, r)
|
||||
|
||||
func addcarry_u64(carryIn: cuchar, a, b: uint64, sum: var uint64): cuchar {.importc:"_addcarry_u64", header:"<intrin.h>", nodecl.}
|
||||
## (CarryOut, Sum) <-- a + b
|
||||
## Available on MSVC and ICC (Clang and GCC have very bad codegen, use uint128 instead)
|
||||
## Return value is the carry-out
|
||||
|
||||
func umul128(a, b: uint64, hi: var uint64): uint64 {.importc:"_umul128", header:"<intrin.h>", nodecl.}
|
||||
## (hi, lo) <-- a * b
|
||||
## Return value is the low word
|
||||
|
||||
func unsafeFMA*(hi, lo: var Ct[uint64], a, b, c: Ct[uint64]) {.inline.}=
|
||||
## Extended precision multiplication + addition
|
||||
## This is constant-time on most hardware except some specific one like Cortex M0
|
||||
## (hi, lo) <- a*b + c
|
||||
var carry: cuchar
|
||||
var hi, lo: uint64
|
||||
lo = umul128(uint64(a), uint64(b), hi)
|
||||
carry = addcarry_u64(cuchar(0), lo, uint64(c), lo)
|
||||
discard addcarry_u64(carry, hi, 0, hi)
|
||||
|
||||
func unsafeFMA2*(hi, lo: var Ct[uint64], a1, b1, a2, b2, c1, c2: Ct[uint64]) {.inline.}=
|
||||
## (hi, lo) <- a1 * b1 + a2 * b2 + c1 + c2
|
||||
var f1_lo, f1_hi, f2_lo, f2_hi: uint64
|
||||
var carry: cuchar
|
||||
|
||||
f1_lo = umul128(uint64(a1), uint64(b1), f1_hi)
|
||||
f2_lo = umul128(uint64(a2), uint64(b2), f2_hi)
|
||||
|
||||
# On CPU with ADX: we can use addcarryx_u64 (adcx/adox) to have
|
||||
# separate carry chains that can be processed in parallel by CPU
|
||||
|
||||
# Carry chain 1
|
||||
carry = addcarry_u64(cuchar(0), f1_lo, uint64(c1), f1_lo)
|
||||
discard addcarry_u64(carry, f1_hi, 0, f1_hi)
|
||||
|
||||
# Carry chain 2
|
||||
carry = addcarry_u64(cuchar(0), f2_lo, uint64(c2), f2_lo)
|
||||
discard addcarry_u64(carry, f2_hi, 0, f2_hi)
|
||||
|
||||
# Merge
|
||||
carry = addcarry_u64(cuchar(0), f1_lo, f2_lo, lo)
|
||||
discard addcarry_u64(carry, f1_hi, f2_hi, hi)
|
||||
|
||||
func unsafeFMA2_hi*(hi: var Ct[uint64], a1, b1, a2, b2, c: Ct[uint64]) {.inline.}=
|
||||
## Returns the high word of the sum of extended precision multiply-adds
|
||||
## (hi, _) <- a1 * b1 + a2 * b2 + c
|
||||
|
||||
var f1_lo, f1_hi, f2_lo, f2_hi: uint64
|
||||
var carry: cuchar
|
||||
|
||||
f1_lo = umul128(uint64(a1), uint64(b1), f1_hi)
|
||||
f2_lo = umul128(uint64(a2), uint64(b2), f2_hi)
|
||||
|
||||
carry = addcarry_u64(cuchar(0), f1_lo, uint64(c), f1_lo)
|
||||
discard addcarry_u64(carry, f1_hi, 0, f1_hi)
|
||||
|
||||
# Merge
|
||||
var lo: uint64
|
||||
carry = addcarry_u64(cuchar(0), f1_lo, f2_lo, lo)
|
||||
discard addcarry_u64(carry, f1_hi, f2_hi, hi)
|
||||
|
||||
else:
|
||||
{.error: "Compiler not implemented".}
|
||||
# For VCC and ICC use add_carry_u64, _add_carryx_u64 intrinsics
|
||||
# and _umul128
|
||||
|
Loading…
x
Reference in New Issue
Block a user