Properly handle 32 bits
This commit is contained in:
parent
3fdd457b52
commit
c8e482f6d2
|
@ -75,99 +75,14 @@ func unsafeFMA2_hi*(hi: var Ct[uint32], a1, b1, a2, b2, c1: Ct[uint32]) {.inline
|
||||||
#
|
#
|
||||||
# ############################################################
|
# ############################################################
|
||||||
|
|
||||||
const GccCompatible = defined(gcc) or defined(clang) or defined(llvm_gcc)
|
when sizeof(int) == 8:
|
||||||
|
const GccCompatible = defined(gcc) or defined(clang) or defined(llvm_gcc)
|
||||||
|
|
||||||
when sizeof(int) == 8 and GccCompatible:
|
when GccCompatible:
|
||||||
type
|
type
|
||||||
uint128*{.importc: "unsigned __int128".} = object
|
uint128*{.importc: "unsigned __int128".} = object
|
||||||
|
|
||||||
func unsafeDiv2n1n*(q, r: var Ct[uint64], n_hi, n_lo, d: Ct[uint64]) {.inline.}=
|
func unsafeDiv2n1n*(q, r: var Ct[uint64], n_hi, n_lo, d: Ct[uint64]) {.inline.}=
|
||||||
## Division uint128 by uint64
|
|
||||||
## Warning ⚠️ :
|
|
||||||
## - if n_hi == d, quotient does not fit in an uint64 and will throw SIGFPE
|
|
||||||
## - if n_hi > d result is undefined
|
|
||||||
{.warning: "unsafeDiv2n1n is not constant-time at the moment on most hardware".}
|
|
||||||
|
|
||||||
# TODO !!! - Replace by constant-time, portable, non-assembly version
|
|
||||||
# -> use uint128? Compiler might add unwanted branches
|
|
||||||
|
|
||||||
# DIV r/m64
|
|
||||||
# Divide RDX:RAX (n_hi:n_lo) by r/m64
|
|
||||||
#
|
|
||||||
# Inputs
|
|
||||||
# - numerator high word in RDX,
|
|
||||||
# - numerator low word in RAX,
|
|
||||||
# - divisor as r/m parameter (register or memory at the compiler discretion)
|
|
||||||
# Result
|
|
||||||
# - Quotient in RAX
|
|
||||||
# - Remainder in RDX
|
|
||||||
|
|
||||||
# 1. name the register/memory "divisor"
|
|
||||||
# 2. don't forget to dereference the var hidden pointer
|
|
||||||
# 3. -
|
|
||||||
# 4. no clobbered registers beside explectly used RAX and RDX
|
|
||||||
when defined(amd64):
|
|
||||||
asm """
|
|
||||||
divq %[divisor]
|
|
||||||
: "=a" (`*q`), "=d" (`*r`)
|
|
||||||
: "d" (`n_hi`), "a" (`n_lo`), [divisor] "rm" (`d`)
|
|
||||||
:
|
|
||||||
"""
|
|
||||||
else:
|
|
||||||
var dblPrec {.noInit.}: uint128
|
|
||||||
{.emit:[dblPrec, " = (unsigned __int128)", n_hi," << 64 | (unsigned __int128)",n_lo,";"].}
|
|
||||||
|
|
||||||
# Don't forget to dereference the var param
|
|
||||||
{.emit:["*",q, " = (NU64)(", dblPrec," / ", d, ");"].}
|
|
||||||
{.emit:["*",r, " = (NU64)(", dblPrec," % ", d, ");"].}
|
|
||||||
|
|
||||||
func unsafeFMA*(hi, lo: var Ct[uint64], a, b, c: Ct[uint64]) {.inline.}=
|
|
||||||
## Extended precision multiplication + addition
|
|
||||||
## This is constant-time on most hardware except some specific one like Cortex M0
|
|
||||||
## (hi, lo) <- a*b + c
|
|
||||||
block:
|
|
||||||
# Note: since a and b use 63-bit,
|
|
||||||
# the result is 126-bit and carrying cannot overflow
|
|
||||||
var dblPrec {.noInit.}: uint128
|
|
||||||
{.emit:[dblPrec, " = (unsigned __int128)", a," * (unsigned __int128)", b, " + (unsigned __int128)",c,";"].}
|
|
||||||
|
|
||||||
# Don't forget to dereference the var param
|
|
||||||
{.emit:["*",hi, " = (NU64)(", dblPrec," >> ", 63'u64, ");"].}
|
|
||||||
{.emit:["*",lo, " = (NU64)", dblPrec," & ", 1'u64 shl 63 - 1, ";"].}
|
|
||||||
|
|
||||||
func unsafeFMA2*(hi, lo: var Ct[uint64], a1, b1, a2, b2, c1, c2: Ct[uint64]) {.inline.}=
|
|
||||||
## (hi, lo) <- a1 * b1 + a2 * b2 + c1 + c2
|
|
||||||
block:
|
|
||||||
# TODO: Can this overflow?
|
|
||||||
var dblPrec: uint128
|
|
||||||
{.emit:[dblPrec, " = (unsigned __int128)", a1," * (unsigned __int128)", b1,
|
|
||||||
" + (unsigned __int128)", a2," * (unsigned __int128)", b2,
|
|
||||||
" + (unsigned __int128)", c1,
|
|
||||||
" + (unsigned __int128)", c2, ";"].}
|
|
||||||
# Don't forget to dereference the var param
|
|
||||||
{.emit:["*",hi, " = (NU64)(", dblPrec," >> ", 63'u64, ");"].}
|
|
||||||
{.emit:["*",lo, " = (NU64)", dblPrec," & ", (1'u64 shl 63 - 1), ";"].}
|
|
||||||
|
|
||||||
func unsafeFMA2_hi*(hi: var Ct[uint64], a1, b1, a2, b2, c: Ct[uint64]) {.inline.}=
|
|
||||||
## Returns the high word of the sum of extended precision multiply-adds
|
|
||||||
## (hi, _) <- a1 * b1 + a2 * b2 + c
|
|
||||||
block:
|
|
||||||
var dblPrec: uint128
|
|
||||||
{.emit:[dblPrec, " = (unsigned __int128)", a1," * (unsigned __int128)", b1,
|
|
||||||
" + (unsigned __int128)", a2," * (unsigned __int128)", b2,
|
|
||||||
" + (unsigned __int128)", c, ";"].}
|
|
||||||
# Don't forget to dereference the var param
|
|
||||||
{.emit:["*",hi, " = (NU64)(", dblPrec," >> ", 63'u64, ");"].}
|
|
||||||
|
|
||||||
elif sizeof(int) == 8 and defined(vcc):
|
|
||||||
func udiv128(highDividend, lowDividend, divisor: uint64, remainder: var uint64): uint64 {.importc:"_udiv128", header: "<immintrin.h>", nodecl.}
|
|
||||||
## Division 128 by 64, Microsoft only, 64-bit only,
|
|
||||||
## returns quotient as return value remainder as var parameter
|
|
||||||
## Warning ⚠️ :
|
|
||||||
## - if n_hi == d, quotient does not fit in an uint64 and will throw SIGFPE
|
|
||||||
## - if n_hi > d result is undefined
|
|
||||||
|
|
||||||
func unsafeDiv2n1n*(q, r: var Ct[uint64], n_hi, n_lo, d: Ct[uint64]) {.inline.}=
|
|
||||||
## Division uint128 by uint64
|
## Division uint128 by uint64
|
||||||
## Warning ⚠️ :
|
## Warning ⚠️ :
|
||||||
## - if n_hi == d, quotient does not fit in an uint64 and will throw SIGFPE
|
## - if n_hi == d, quotient does not fit in an uint64 and will throw SIGFPE
|
||||||
|
@ -176,67 +91,153 @@ elif sizeof(int) == 8 and defined(vcc):
|
||||||
|
|
||||||
# TODO !!! - Replace by constant-time, portable, non-assembly version
|
# TODO !!! - Replace by constant-time, portable, non-assembly version
|
||||||
# -> use uint128? Compiler might add unwanted branches
|
# -> use uint128? Compiler might add unwanted branches
|
||||||
q = udiv128(n_hi, n_lo, d, r)
|
|
||||||
|
|
||||||
func addcarry_u64(carryIn: cuchar, a, b: uint64, sum: var uint64): cuchar {.importc:"_addcarry_u64", header:"<intrin.h>", nodecl.}
|
# DIV r/m64
|
||||||
## (CarryOut, Sum) <-- a + b
|
# Divide RDX:RAX (n_hi:n_lo) by r/m64
|
||||||
## Available on MSVC and ICC (Clang and GCC have very bad codegen, use uint128 instead)
|
#
|
||||||
## Return value is the carry-out
|
# Inputs
|
||||||
|
# - numerator high word in RDX,
|
||||||
|
# - numerator low word in RAX,
|
||||||
|
# - divisor as r/m parameter (register or memory at the compiler discretion)
|
||||||
|
# Result
|
||||||
|
# - Quotient in RAX
|
||||||
|
# - Remainder in RDX
|
||||||
|
|
||||||
func umul128(a, b: uint64, hi: var uint64): uint64 {.importc:"_umul128", header:"<intrin.h>", nodecl.}
|
# 1. name the register/memory "divisor"
|
||||||
## (hi, lo) <-- a * b
|
# 2. don't forget to dereference the var hidden pointer
|
||||||
## Return value is the low word
|
# 3. -
|
||||||
|
# 4. no clobbered registers beside explectly used RAX and RDX
|
||||||
|
when defined(amd64):
|
||||||
|
asm """
|
||||||
|
divq %[divisor]
|
||||||
|
: "=a" (`*q`), "=d" (`*r`)
|
||||||
|
: "d" (`n_hi`), "a" (`n_lo`), [divisor] "rm" (`d`)
|
||||||
|
:
|
||||||
|
"""
|
||||||
|
else:
|
||||||
|
var dblPrec {.noInit.}: uint128
|
||||||
|
{.emit:[dblPrec, " = (unsigned __int128)", n_hi," << 64 | (unsigned __int128)",n_lo,";"].}
|
||||||
|
|
||||||
func unsafeFMA*(hi, lo: var Ct[uint64], a, b, c: Ct[uint64]) {.inline.}=
|
# Don't forget to dereference the var param
|
||||||
## Extended precision multiplication + addition
|
{.emit:["*",q, " = (NU64)(", dblPrec," / ", d, ");"].}
|
||||||
## This is constant-time on most hardware except some specific one like Cortex M0
|
{.emit:["*",r, " = (NU64)(", dblPrec," % ", d, ");"].}
|
||||||
## (hi, lo) <- a*b + c
|
|
||||||
var carry: cuchar
|
|
||||||
var hi, lo: uint64
|
|
||||||
lo = umul128(uint64(a), uint64(b), hi)
|
|
||||||
carry = addcarry_u64(cuchar(0), lo, uint64(c), lo)
|
|
||||||
discard addcarry_u64(carry, hi, 0, hi)
|
|
||||||
|
|
||||||
func unsafeFMA2*(hi, lo: var Ct[uint64], a1, b1, a2, b2, c1, c2: Ct[uint64]) {.inline.}=
|
func unsafeFMA*(hi, lo: var Ct[uint64], a, b, c: Ct[uint64]) {.inline.}=
|
||||||
## (hi, lo) <- a1 * b1 + a2 * b2 + c1 + c2
|
## Extended precision multiplication + addition
|
||||||
var f1_lo, f1_hi, f2_lo, f2_hi: uint64
|
## This is constant-time on most hardware except some specific one like Cortex M0
|
||||||
var carry: cuchar
|
## (hi, lo) <- a*b + c
|
||||||
|
block:
|
||||||
|
# Note: since a and b use 63-bit,
|
||||||
|
# the result is 126-bit and carrying cannot overflow
|
||||||
|
var dblPrec {.noInit.}: uint128
|
||||||
|
{.emit:[dblPrec, " = (unsigned __int128)", a," * (unsigned __int128)", b, " + (unsigned __int128)",c,";"].}
|
||||||
|
|
||||||
f1_lo = umul128(uint64(a1), uint64(b1), f1_hi)
|
# Don't forget to dereference the var param
|
||||||
f2_lo = umul128(uint64(a2), uint64(b2), f2_hi)
|
{.emit:["*",hi, " = (NU64)(", dblPrec," >> ", 63'u64, ");"].}
|
||||||
|
{.emit:["*",lo, " = (NU64)", dblPrec," & ", 1'u64 shl 63 - 1, ";"].}
|
||||||
|
|
||||||
# On CPU with ADX: we can use addcarryx_u64 (adcx/adox) to have
|
func unsafeFMA2*(hi, lo: var Ct[uint64], a1, b1, a2, b2, c1, c2: Ct[uint64]) {.inline.}=
|
||||||
# separate carry chains that can be processed in parallel by CPU
|
## (hi, lo) <- a1 * b1 + a2 * b2 + c1 + c2
|
||||||
|
block:
|
||||||
|
# TODO: Can this overflow?
|
||||||
|
var dblPrec: uint128
|
||||||
|
{.emit:[dblPrec, " = (unsigned __int128)", a1," * (unsigned __int128)", b1,
|
||||||
|
" + (unsigned __int128)", a2," * (unsigned __int128)", b2,
|
||||||
|
" + (unsigned __int128)", c1,
|
||||||
|
" + (unsigned __int128)", c2, ";"].}
|
||||||
|
# Don't forget to dereference the var param
|
||||||
|
{.emit:["*",hi, " = (NU64)(", dblPrec," >> ", 63'u64, ");"].}
|
||||||
|
{.emit:["*",lo, " = (NU64)", dblPrec," & ", (1'u64 shl 63 - 1), ";"].}
|
||||||
|
|
||||||
# Carry chain 1
|
func unsafeFMA2_hi*(hi: var Ct[uint64], a1, b1, a2, b2, c: Ct[uint64]) {.inline.}=
|
||||||
carry = addcarry_u64(cuchar(0), f1_lo, uint64(c1), f1_lo)
|
## Returns the high word of the sum of extended precision multiply-adds
|
||||||
discard addcarry_u64(carry, f1_hi, 0, f1_hi)
|
## (hi, _) <- a1 * b1 + a2 * b2 + c
|
||||||
|
block:
|
||||||
|
var dblPrec: uint128
|
||||||
|
{.emit:[dblPrec, " = (unsigned __int128)", a1," * (unsigned __int128)", b1,
|
||||||
|
" + (unsigned __int128)", a2," * (unsigned __int128)", b2,
|
||||||
|
" + (unsigned __int128)", c, ";"].}
|
||||||
|
# Don't forget to dereference the var param
|
||||||
|
{.emit:["*",hi, " = (NU64)(", dblPrec," >> ", 63'u64, ");"].}
|
||||||
|
|
||||||
# Carry chain 2
|
elif defined(vcc):
|
||||||
carry = addcarry_u64(cuchar(0), f2_lo, uint64(c2), f2_lo)
|
func udiv128(highDividend, lowDividend, divisor: uint64, remainder: var uint64): uint64 {.importc:"_udiv128", header: "<immintrin.h>", nodecl.}
|
||||||
discard addcarry_u64(carry, f2_hi, 0, f2_hi)
|
## Division 128 by 64, Microsoft only, 64-bit only,
|
||||||
|
## returns quotient as return value remainder as var parameter
|
||||||
|
## Warning ⚠️ :
|
||||||
|
## - if n_hi == d, quotient does not fit in an uint64 and will throw SIGFPE
|
||||||
|
## - if n_hi > d result is undefined
|
||||||
|
|
||||||
# Merge
|
func unsafeDiv2n1n*(q, r: var Ct[uint64], n_hi, n_lo, d: Ct[uint64]) {.inline.}=
|
||||||
carry = addcarry_u64(cuchar(0), f1_lo, f2_lo, lo)
|
## Division uint128 by uint64
|
||||||
discard addcarry_u64(carry, f1_hi, f2_hi, hi)
|
## Warning ⚠️ :
|
||||||
|
## - if n_hi == d, quotient does not fit in an uint64 and will throw SIGFPE
|
||||||
|
## - if n_hi > d result is undefined
|
||||||
|
{.warning: "unsafeDiv2n1n is not constant-time at the moment on most hardware".}
|
||||||
|
|
||||||
func unsafeFMA2_hi*(hi: var Ct[uint64], a1, b1, a2, b2, c: Ct[uint64]) {.inline.}=
|
# TODO !!! - Replace by constant-time, portable, non-assembly version
|
||||||
## Returns the high word of the sum of extended precision multiply-adds
|
# -> use uint128? Compiler might add unwanted branches
|
||||||
## (hi, _) <- a1 * b1 + a2 * b2 + c
|
q = udiv128(n_hi, n_lo, d, r)
|
||||||
|
|
||||||
var f1_lo, f1_hi, f2_lo, f2_hi: uint64
|
func addcarry_u64(carryIn: cuchar, a, b: uint64, sum: var uint64): cuchar {.importc:"_addcarry_u64", header:"<intrin.h>", nodecl.}
|
||||||
var carry: cuchar
|
## (CarryOut, Sum) <-- a + b
|
||||||
|
## Available on MSVC and ICC (Clang and GCC have very bad codegen, use uint128 instead)
|
||||||
|
## Return value is the carry-out
|
||||||
|
|
||||||
f1_lo = umul128(uint64(a1), uint64(b1), f1_hi)
|
func umul128(a, b: uint64, hi: var uint64): uint64 {.importc:"_umul128", header:"<intrin.h>", nodecl.}
|
||||||
f2_lo = umul128(uint64(a2), uint64(b2), f2_hi)
|
## (hi, lo) <-- a * b
|
||||||
|
## Return value is the low word
|
||||||
|
|
||||||
carry = addcarry_u64(cuchar(0), f1_lo, uint64(c), f1_lo)
|
func unsafeFMA*(hi, lo: var Ct[uint64], a, b, c: Ct[uint64]) {.inline.}=
|
||||||
discard addcarry_u64(carry, f1_hi, 0, f1_hi)
|
## Extended precision multiplication + addition
|
||||||
|
## This is constant-time on most hardware except some specific one like Cortex M0
|
||||||
|
## (hi, lo) <- a*b + c
|
||||||
|
var carry: cuchar
|
||||||
|
var hi, lo: uint64
|
||||||
|
lo = umul128(uint64(a), uint64(b), hi)
|
||||||
|
carry = addcarry_u64(cuchar(0), lo, uint64(c), lo)
|
||||||
|
discard addcarry_u64(carry, hi, 0, hi)
|
||||||
|
|
||||||
# Merge
|
func unsafeFMA2*(hi, lo: var Ct[uint64], a1, b1, a2, b2, c1, c2: Ct[uint64]) {.inline.}=
|
||||||
var lo: uint64
|
## (hi, lo) <- a1 * b1 + a2 * b2 + c1 + c2
|
||||||
carry = addcarry_u64(cuchar(0), f1_lo, f2_lo, lo)
|
var f1_lo, f1_hi, f2_lo, f2_hi: uint64
|
||||||
discard addcarry_u64(carry, f1_hi, f2_hi, hi)
|
var carry: cuchar
|
||||||
|
|
||||||
else:
|
f1_lo = umul128(uint64(a1), uint64(b1), f1_hi)
|
||||||
{.error: "Compiler not implemented".}
|
f2_lo = umul128(uint64(a2), uint64(b2), f2_hi)
|
||||||
|
|
||||||
|
# On CPU with ADX: we can use addcarryx_u64 (adcx/adox) to have
|
||||||
|
# separate carry chains that can be processed in parallel by CPU
|
||||||
|
|
||||||
|
# Carry chain 1
|
||||||
|
carry = addcarry_u64(cuchar(0), f1_lo, uint64(c1), f1_lo)
|
||||||
|
discard addcarry_u64(carry, f1_hi, 0, f1_hi)
|
||||||
|
|
||||||
|
# Carry chain 2
|
||||||
|
carry = addcarry_u64(cuchar(0), f2_lo, uint64(c2), f2_lo)
|
||||||
|
discard addcarry_u64(carry, f2_hi, 0, f2_hi)
|
||||||
|
|
||||||
|
# Merge
|
||||||
|
carry = addcarry_u64(cuchar(0), f1_lo, f2_lo, lo)
|
||||||
|
discard addcarry_u64(carry, f1_hi, f2_hi, hi)
|
||||||
|
|
||||||
|
func unsafeFMA2_hi*(hi: var Ct[uint64], a1, b1, a2, b2, c: Ct[uint64]) {.inline.}=
|
||||||
|
## Returns the high word of the sum of extended precision multiply-adds
|
||||||
|
## (hi, _) <- a1 * b1 + a2 * b2 + c
|
||||||
|
|
||||||
|
var f1_lo, f1_hi, f2_lo, f2_hi: uint64
|
||||||
|
var carry: cuchar
|
||||||
|
|
||||||
|
f1_lo = umul128(uint64(a1), uint64(b1), f1_hi)
|
||||||
|
f2_lo = umul128(uint64(a2), uint64(b2), f2_hi)
|
||||||
|
|
||||||
|
carry = addcarry_u64(cuchar(0), f1_lo, uint64(c), f1_lo)
|
||||||
|
discard addcarry_u64(carry, f1_hi, 0, f1_hi)
|
||||||
|
|
||||||
|
# Merge
|
||||||
|
var lo: uint64
|
||||||
|
carry = addcarry_u64(cuchar(0), f1_lo, f2_lo, lo)
|
||||||
|
discard addcarry_u64(carry, f1_hi, f2_hi, hi)
|
||||||
|
|
||||||
|
else:
|
||||||
|
{.error: "Compiler not implemented".}
|
||||||
|
|
Loading…
Reference in New Issue