diff --git a/constantine/primitives/extended_precision.nim b/constantine/primitives/extended_precision.nim
index fefdd91..e34562d 100644
--- a/constantine/primitives/extended_precision.nim
+++ b/constantine/primitives/extended_precision.nim
@@ -75,99 +75,14 @@ func unsafeFMA2_hi*(hi: var Ct[uint32], a1, b1, a2, b2, c1: Ct[uint32]) {.inline
 #
 # ############################################################
 
-const GccCompatible = defined(gcc) or defined(clang) or defined(llvm_gcc)
+when sizeof(int) == 8:
+  const GccCompatible = defined(gcc) or defined(clang) or defined(llvm_gcc)
 
-when sizeof(int) == 8 and GccCompatible:
-  type
-    uint128*{.importc: "unsigned __int128".} = object
+  when GccCompatible:
+    type
+      uint128*{.importc: "unsigned __int128".} = object
 
-  func unsafeDiv2n1n*(q, r: var Ct[uint64], n_hi, n_lo, d: Ct[uint64]) {.inline.}=
-    ## Division uint128 by uint64
-    ## Warning ⚠️ :
-    ##   - if n_hi == d, the quotient does not fit in a uint64 and will throw SIGFPE
-    ##   - if n_hi > d the result is undefined
-    {.warning: "unsafeDiv2n1n is not constant-time at the moment on most hardware".}
-
-    # TODO !!! - Replace by a constant-time, portable, non-assembly version
-    #          -> use uint128? The compiler might add unwanted branches
-
-    # DIV r/m64
-    # Divide RDX:RAX (n_hi:n_lo) by r/m64
-    #
-    # Inputs
-    #   - numerator high word in RDX,
-    #   - numerator low word in RAX,
-    #   - divisor as r/m parameter (register or memory at the compiler's discretion)
-    # Result
-    #   - Quotient in RAX
-    #   - Remainder in RDX
-
-    # 1. name the register/memory "divisor"
-    # 2. don't forget to dereference the var hidden pointer
-    # 3. -
-    # 4. no clobbered registers besides the explicitly used RAX and RDX
-    when defined(amd64):
-      asm """
-        divq %[divisor]
-        : "=a" (`*q`), "=d" (`*r`)
-        : "d" (`n_hi`), "a" (`n_lo`), [divisor] "rm" (`d`)
-        :
-      """
-    else:
-      var dblPrec {.noInit.}: uint128
-      {.emit:[dblPrec, " = (unsigned __int128)", n_hi," << 64 | (unsigned __int128)",n_lo,";"].}
-
-      # Don't forget to dereference the var param
-      {.emit:["*",q, " = (NU64)(", dblPrec," / ", d, ");"].}
-      {.emit:["*",r, " = (NU64)(", dblPrec," % ", d, ");"].}
-
-  func unsafeFMA*(hi, lo: var Ct[uint64], a, b, c: Ct[uint64]) {.inline.}=
-    ## Extended precision multiplication + addition
-    ## This is constant-time on most hardware except some specific ones like the Cortex-M0
-    ## (hi, lo) <- a*b + c
-    block:
-      # Note: since a and b use 63 bits,
-      # the result is 126-bit and carrying cannot overflow
-      var dblPrec {.noInit.}: uint128
-      {.emit:[dblPrec, " = (unsigned __int128)", a," * (unsigned __int128)", b, " + (unsigned __int128)",c,";"].}
-
-      # Don't forget to dereference the var param
-      {.emit:["*",hi, " = (NU64)(", dblPrec," >> ", 63'u64, ");"].}
-      {.emit:["*",lo, " = (NU64)", dblPrec," & ", 1'u64 shl 63 - 1, ";"].}
-
-  func unsafeFMA2*(hi, lo: var Ct[uint64], a1, b1, a2, b2, c1, c2: Ct[uint64]) {.inline.}=
-    ## (hi, lo) <- a1 * b1 + a2 * b2 + c1 + c2
-    block:
-      # TODO: Can this overflow?
-      var dblPrec: uint128
-      {.emit:[dblPrec, " = (unsigned __int128)", a1," * (unsigned __int128)", b1,
-              " + (unsigned __int128)", a2," * (unsigned __int128)", b2,
-              " + (unsigned __int128)", c1,
-              " + (unsigned __int128)", c2, ";"].}
-      # Don't forget to dereference the var param
-      {.emit:["*",hi, " = (NU64)(", dblPrec," >> ", 63'u64, ");"].}
-      {.emit:["*",lo, " = (NU64)", dblPrec," & ", (1'u64 shl 63 - 1), ";"].}
-
-  func unsafeFMA2_hi*(hi: var Ct[uint64], a1, b1, a2, b2, c: Ct[uint64]) {.inline.}=
-    ## Returns the high word of the sum of extended precision multiply-adds
-    ## (hi, _) <- a1 * b1 + a2 * b2 + c
-    block:
-      var dblPrec: uint128
-      {.emit:[dblPrec, " = (unsigned __int128)", a1," * (unsigned __int128)", b1,
-              " + (unsigned __int128)", a2," * (unsigned __int128)", b2,
-              " + (unsigned __int128)", c, ";"].}
-      # Don't forget to dereference the var param
-      {.emit:["*",hi, " = (NU64)(", dblPrec," >> ", 63'u64, ");"].}
-
-elif sizeof(int) == 8 and defined(vcc):
-  func udiv128(highDividend, lowDividend, divisor: uint64, remainder: var uint64): uint64 {.importc:"_udiv128", header: "<intrin.h>", nodecl.}
-    ## Division 128 by 64, Microsoft only, 64-bit only,
-    ## returns the quotient as the return value and the remainder as a var parameter
-    ## Warning ⚠️ :
-    ##   - if n_hi == d, the quotient does not fit in a uint64 and will throw SIGFPE
-    ##   - if n_hi > d the result is undefined
-
-  func unsafeDiv2n1n*(q, r: var Ct[uint64], n_hi, n_lo, d: Ct[uint64]) {.inline.}=
+    func unsafeDiv2n1n*(q, r: var Ct[uint64], n_hi, n_lo, d: Ct[uint64]) {.inline.}=
       ## Division uint128 by uint64
       ## Warning ⚠️ :
       ##   - if n_hi == d, the quotient does not fit in a uint64 and will throw SIGFPE
@@ -176,67 +91,153 @@ elif sizeof(int) == 8 and defined(vcc):
       ##   - if n_hi > d the result is undefined
       {.warning: "unsafeDiv2n1n is not constant-time at the moment on most hardware".}
 
       # TODO !!! - Replace by a constant-time, portable, non-assembly version
       #          -> use uint128? The compiler might add unwanted branches
-    q = udiv128(n_hi, n_lo, d, r)
-
-  func addcarry_u64(carryIn: cuchar, a, b: uint64, sum: var uint64): cuchar {.importc:"_addcarry_u64", header:"<intrin.h>", nodecl.}
-    ## (CarryOut, Sum) <-- a + b
-    ## Available on MSVC and ICC (Clang and GCC have very bad codegen, use uint128 instead)
-    ## Return value is the carry-out
+
+      # DIV r/m64
+      # Divide RDX:RAX (n_hi:n_lo) by r/m64
+      #
+      # Inputs
+      #   - numerator high word in RDX,
+      #   - numerator low word in RAX,
+      #   - divisor as r/m parameter (register or memory at the compiler's discretion)
+      # Result
+      #   - Quotient in RAX
+      #   - Remainder in RDX
 
-  func umul128(a, b: uint64, hi: var uint64): uint64 {.importc:"_umul128", header:"<intrin.h>", nodecl.}
-    ## (hi, lo) <-- a * b
-    ## Return value is the low word
+      # 1. name the register/memory "divisor"
+      # 2. don't forget to dereference the var hidden pointer
+      # 3. -
+      # 4. no clobbered registers besides the explicitly used RAX and RDX
+      when defined(amd64):
+        asm """
+          divq %[divisor]
+          : "=a" (`*q`), "=d" (`*r`)
+          : "d" (`n_hi`), "a" (`n_lo`), [divisor] "rm" (`d`)
+          :
+        """
+      else:
+        var dblPrec {.noInit.}: uint128
+        {.emit:[dblPrec, " = (unsigned __int128)", n_hi," << 64 | (unsigned __int128)",n_lo,";"].}
 
-  func unsafeFMA*(hi, lo: var Ct[uint64], a, b, c: Ct[uint64]) {.inline.}=
-    ## Extended precision multiplication + addition
-    ## This is constant-time on most hardware except some specific ones like the Cortex-M0
-    ## (hi, lo) <- a*b + c
-    var carry: cuchar
-    var hi, lo: uint64
-    lo = umul128(uint64(a), uint64(b), hi)
-    carry = addcarry_u64(cuchar(0), lo, uint64(c), lo)
-    discard addcarry_u64(carry, hi, 0, hi)
+        # Don't forget to dereference the var param
+        {.emit:["*",q, " = (NU64)(", dblPrec," / ", d, ");"].}
+        {.emit:["*",r, " = (NU64)(", dblPrec," % ", d, ");"].}
 
-  func unsafeFMA2*(hi, lo: var Ct[uint64], a1, b1, a2, b2, c1, c2: Ct[uint64]) {.inline.}=
-    ## (hi, lo) <- a1 * b1 + a2 * b2 + c1 + c2
-    var f1_lo, f1_hi, f2_lo, f2_hi: uint64
-    var carry: cuchar
+    func unsafeFMA*(hi, lo: var Ct[uint64], a, b, c: Ct[uint64]) {.inline.}=
+      ## Extended precision multiplication + addition
+      ## This is constant-time on most hardware except some specific ones like the Cortex-M0
+      ## (hi, lo) <- a*b + c
+      block:
+        # Note: since a and b use 63 bits,
+        # the result is 126-bit and carrying cannot overflow
+        var dblPrec {.noInit.}: uint128
+        {.emit:[dblPrec, " = (unsigned __int128)", a," * (unsigned __int128)", b, " + (unsigned __int128)",c,";"].}
 
-    f1_lo = umul128(uint64(a1), uint64(b1), f1_hi)
-    f2_lo = umul128(uint64(a2), uint64(b2), f2_hi)
+        # Don't forget to dereference the var param
+        {.emit:["*",hi, " = (NU64)(", dblPrec," >> ", 63'u64, ");"].}
+        {.emit:["*",lo, " = (NU64)", dblPrec," & ", 1'u64 shl 63 - 1, ";"].}
 
-    # On CPU with ADX: we can use addcarryx_u64 (adcx/adox) to have
-    # separate carry chains that can be processed in parallel by CPU
+    func unsafeFMA2*(hi, lo: var Ct[uint64], a1, b1, a2, b2, c1, c2: Ct[uint64]) {.inline.}=
+      ## (hi, lo) <- a1 * b1 + a2 * b2 + c1 + c2
+      block:
+        # TODO: Can this overflow?
+        var dblPrec: uint128
+        {.emit:[dblPrec, " = (unsigned __int128)", a1," * (unsigned __int128)", b1,
+                " + (unsigned __int128)", a2," * (unsigned __int128)", b2,
+                " + (unsigned __int128)", c1,
+                " + (unsigned __int128)", c2, ";"].}
+        # Don't forget to dereference the var param
+        {.emit:["*",hi, " = (NU64)(", dblPrec," >> ", 63'u64, ");"].}
+        {.emit:["*",lo, " = (NU64)", dblPrec," & ", (1'u64 shl 63 - 1), ";"].}
 
-    # Carry chain 1
-    carry = addcarry_u64(cuchar(0), f1_lo, uint64(c1), f1_lo)
-    discard addcarry_u64(carry, f1_hi, 0, f1_hi)
+    func unsafeFMA2_hi*(hi: var Ct[uint64], a1, b1, a2, b2, c: Ct[uint64]) {.inline.}=
+      ## Returns the high word of the sum of extended precision multiply-adds
+      ## (hi, _) <- a1 * b1 + a2 * b2 + c
+      block:
+        var dblPrec: uint128
+        {.emit:[dblPrec, " = (unsigned __int128)", a1," * (unsigned __int128)", b1,
+                " + (unsigned __int128)", a2," * (unsigned __int128)", b2,
+                " + (unsigned __int128)", c, ";"].}
+        # Don't forget to dereference the var param
+        {.emit:["*",hi, " = (NU64)(", dblPrec," >> ", 63'u64, ");"].}
 
-    # Carry chain 2
-    carry = addcarry_u64(cuchar(0), f2_lo, uint64(c2), f2_lo)
-    discard addcarry_u64(carry, f2_hi, 0, f2_hi)
+  elif defined(vcc):
+    func udiv128(highDividend, lowDividend, divisor: uint64, remainder: var uint64): uint64 {.importc:"_udiv128", header: "<intrin.h>", nodecl.}
+      ## Division 128 by 64, Microsoft only, 64-bit only,
+      ## returns the quotient as the return value and the remainder as a var parameter
+      ## Warning ⚠️ :
+      ##   - if n_hi == d, the quotient does not fit in a uint64 and will throw SIGFPE
+      ##   - if n_hi > d the result is undefined
 
-    # Merge
-    carry = addcarry_u64(cuchar(0), f1_lo, f2_lo, lo)
-    discard addcarry_u64(carry, f1_hi, f2_hi, hi)
+    func unsafeDiv2n1n*(q, r: var Ct[uint64], n_hi, n_lo, d: Ct[uint64]) {.inline.}=
+      ## Division uint128 by uint64
+      ## Warning ⚠️ :
+      ##   - if n_hi == d, the quotient does not fit in a uint64 and will throw SIGFPE
+      ##   - if n_hi > d the result is undefined
+      {.warning: "unsafeDiv2n1n is not constant-time at the moment on most hardware".}
 
-  func unsafeFMA2_hi*(hi: var Ct[uint64], a1, b1, a2, b2, c: Ct[uint64]) {.inline.}=
-    ## Returns the high word of the sum of extended precision multiply-adds
-    ## (hi, _) <- a1 * b1 + a2 * b2 + c
+      # TODO !!! - Replace by a constant-time, portable, non-assembly version
+      #          -> use uint128? The compiler might add unwanted branches
+      q = udiv128(n_hi, n_lo, d, r)
 
-    var f1_lo, f1_hi, f2_lo, f2_hi: uint64
-    var carry: cuchar
+    func addcarry_u64(carryIn: cuchar, a, b: uint64, sum: var uint64): cuchar {.importc:"_addcarry_u64", header:"<intrin.h>", nodecl.}
+      ## (CarryOut, Sum) <-- a + b
+      ## Available on MSVC and ICC (Clang and GCC have very bad codegen, use uint128 instead)
+      ## Return value is the carry-out
 
-    f1_lo = umul128(uint64(a1), uint64(b1), f1_hi)
-    f2_lo = umul128(uint64(a2), uint64(b2), f2_hi)
+    func umul128(a, b: uint64, hi: var uint64): uint64 {.importc:"_umul128", header:"<intrin.h>", nodecl.}
+      ## (hi, lo) <-- a * b
+      ## Return value is the low word
 
-    carry = addcarry_u64(cuchar(0), f1_lo, uint64(c), f1_lo)
-    discard addcarry_u64(carry, f1_hi, 0, f1_hi)
+    func unsafeFMA*(hi, lo: var Ct[uint64], a, b, c: Ct[uint64]) {.inline.}=
+      ## Extended precision multiplication + addition
+      ## This is constant-time on most hardware except some specific ones like the Cortex-M0
+      ## (hi, lo) <- a*b + c
+      var carry: cuchar
+      # Accumulate in temporaries: a local `var hi, lo` would shadow the
+      # out-parameters and silently discard the result
+      var hi64, lo64: uint64
+      lo64 = umul128(uint64(a), uint64(b), hi64)
+      carry = addcarry_u64(cuchar(0), lo64, uint64(c), lo64)
+      discard addcarry_u64(carry, hi64, 0, hi64)
+      hi = Ct[uint64](hi64)
+      lo = Ct[uint64](lo64)
 
-    # Merge
-    var lo: uint64
-    carry = addcarry_u64(cuchar(0), f1_lo, f2_lo, lo)
-    discard addcarry_u64(carry, f1_hi, f2_hi, hi)
+    func unsafeFMA2*(hi, lo: var Ct[uint64], a1, b1, a2, b2, c1, c2: Ct[uint64]) {.inline.}=
+      ## (hi, lo) <- a1 * b1 + a2 * b2 + c1 + c2
+      var f1_lo, f1_hi, f2_lo, f2_hi: uint64
+      var carry: cuchar
+
+      f1_lo = umul128(uint64(a1), uint64(b1), f1_hi)
+      f2_lo = umul128(uint64(a2), uint64(b2), f2_hi)
+
+      # On CPUs with ADX we could use addcarryx_u64 (adcx/adox) to get
+      # separate carry chains that the CPU can process in parallel
+
+      # Carry chain 1
+      carry = addcarry_u64(cuchar(0), f1_lo, uint64(c1), f1_lo)
+      discard addcarry_u64(carry, f1_hi, 0, f1_hi)
+
+      # Carry chain 2
+      carry = addcarry_u64(cuchar(0), f2_lo, uint64(c2), f2_lo)
+      discard addcarry_u64(carry, f2_hi, 0, f2_hi)
+
+      # Merge into temporaries, then write back to the out-parameters
+      var hi64, lo64: uint64
+      carry = addcarry_u64(cuchar(0), f1_lo, f2_lo, lo64)
+      discard addcarry_u64(carry, f1_hi, f2_hi, hi64)
+      hi = Ct[uint64](hi64)
+      lo = Ct[uint64](lo64)
+
+    func unsafeFMA2_hi*(hi: var Ct[uint64], a1, b1, a2, b2, c: Ct[uint64]) {.inline.}=
+      ## Returns the high word of the sum of extended precision multiply-adds
+      ## (hi, _) <- a1 * b1 + a2 * b2 + c
+
+      var f1_lo, f1_hi, f2_lo, f2_hi: uint64
+      var carry: cuchar
+
+      f1_lo = umul128(uint64(a1), uint64(b1), f1_hi)
+      f2_lo = umul128(uint64(a2), uint64(b2), f2_hi)
+
+      carry = addcarry_u64(cuchar(0), f1_lo, uint64(c), f1_lo)
+      discard addcarry_u64(carry, f1_hi, 0, f1_hi)
+
+      # Merge, only the high word is needed
+      var hi64, lo64: uint64
+      carry = addcarry_u64(cuchar(0), f1_lo, f2_lo, lo64)
+      discard addcarry_u64(carry, f1_hi, f2_hi, hi64)
+      hi = Ct[uint64](hi64)
 
-else:
-  {.error: "Compiler not implemented".}
+  else:
+    {.error: "Compiler not implemented".}
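
A note on the `# TODO: Can this overflow?` left in `unsafeFMA2`: assuming the multiplicands obey the same 63-bit bound stated in `unsafeFMA` (a1, b1, a2, b2 < 2^63) and the addends are at most full 64-bit words, the accumulator is bounded by 2*(2^63 - 1)^2 + 2*(2^64 - 1) = 2^127, which fits in the 128-bit `dblPrec` with a bit to spare, so the accumulation cannot overflow.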
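
For readers unfamiliar with the API, a minimal usage sketch of the 64-bit primitives follows (not part of the patch). It assumes `Ct[uint64]` is the module's constant-time distinct integer type constructed by plain type conversion; the import path is illustrative, not authoritative.

    import constantine/primitives/extended_precision  # illustrative path

    # (hi, lo) <- a*b + c, with the result split at bit 63
    var hi, lo: Ct[uint64]
    let
      a = Ct[uint64](1'u64 shl 62)  # multiplicands must fit in 63 bits
      b = Ct[uint64](4'u64)
      c = Ct[uint64](1'u64)
    unsafeFMA(hi, lo, a, b, c)      # a*b + c = 2^64 + 1
    # hi = (2^64 + 1) shr 63 = 2, lo = (2^64 + 1) and (2^63 - 1) = 1

    # (q, r) <- (n_hi:n_lo) divmod d, where n_hi:n_lo is a radix-2^64
    # double-width value; n_hi must be < d, otherwise SIGFPE / undefined
    var q, r: Ct[uint64]
    unsafeDiv2n1n(q, r,
                  Ct[uint64](1'u64),  # n_hi: numerator is 2^64 + 5
                  Ct[uint64](5'u64),  # n_lo
                  Ct[uint64](3'u64))  # d = 3 > n_hi, so the quotient fits
    # q = 0x5555555555555557'u64, r = 0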