diff --git a/stint/private/conversion.nim b/stint/private/conversion.nim deleted file mode 100644 index 469e1b1..0000000 --- a/stint/private/conversion.nim +++ /dev/null @@ -1,59 +0,0 @@ -# Stint -# Copyright 2018 Status Research & Development GmbH -# Licensed under either of -# -# * Apache License, version 2.0, ([LICENSE-APACHE](LICENSE-APACHE) or http://www.apache.org/licenses/LICENSE-2.0) -# * MIT license ([LICENSE-MIT](LICENSE-MIT) or http://opensource.org/licenses/MIT) -# -# at your option. This file may not be copied, modified, or distributed except according to those terms. - -import ./datatypes - -func toSubtype*[T: SomeInteger](b: bool, _: typedesc[T]): T {.inline.}= - b.T - -func toSubtype*[T: UintImpl | IntImpl](b: bool, _: typedesc[T]): T {.inline.}= - type SubTy = type result.lo - result.lo = toSubtype(b, SubTy) - -func toUint*(n: UintImpl or IntImpl or SomeSignedInt): auto {.inline.}= - ## Casts an unsigned integer to an uint of the same size - # TODO: uint128 support - when n.sizeof > 8: - {.fatal: "Unreachable. You are trying to cast a StUint with more than 64-bit of precision" .} - elif n.sizeof == 8: - cast[uint64](n) - elif n.sizeof == 4: - cast[uint32](n) - elif n.sizeof == 2: - cast[uint16](n) - else: - cast[uint8](n) - -func toUint*(n: SomeUnsignedInt): SomeUnsignedInt {.inline.}= - ## No-op overload of multi-precision int casting - n - -func asDoubleUint*(n: UintImpl | SomeUnsignedInt): auto {.inline.} = - ## Convert an integer or StUint to an uint with double the size - type Double = ( - when n.sizeof == 4: uint64 - elif n.sizeof == 2: uint32 - else: uint16 - ) - - n.toUint.Double - -func toInt*(n: UintImpl or IntImpl or SomeInteger): auto {.inline.}= - ## Casts an unsigned integer to an uint of the same size - # TODO: uint128 support - when n.sizeof > 8: - {.fatal: "Unreachable. You are trying to cast a StUint with more than 64-bit of precision" .} - elif n.sizeof == 8: - cast[int64](n) - elif n.sizeof == 4: - cast[int32](n) - elif n.sizeof == 2: - cast[int16](n) - else: - cast[int8](n) diff --git a/stint/private/datatypes.nim b/stint/private/datatypes.nim index 867683b..46da4be 100644 --- a/stint/private/datatypes.nim +++ b/stint/private/datatypes.nim @@ -7,191 +7,56 @@ # # at your option. This file may not be copied, modified, or distributed except according to those terms. -# TODO: test if GCC/Clang support uint128 natively - -# #### Overview -# -# Stint extends the default uint8, uint16, uint32, uint64 with power of 2 integers. -# Only limitation is your stack size so you can have uint128, uint256, uint512 ... -# Signed int are also possible. -# -# As a high-level API, Stint adheres to Nim and C conventions and uses the same operators like: -# `+`, `xor`, `not` ... -# -# #### Implementation -# -# Stint types are stored on the stack and have a structure -# similar to a binary tree of power of two unsigned integers -# with "high" and "low" words: -# -# Stuint[256] -# hi: Stuint[128] lo: Stuint[128] -# hihi: uint64 hilo: uint64 lohi: uint64 lolo: uint64 -# -# This follows paper https://hal.archives-ouvertes.fr/hal-00582593v2 -# "Recursive double-size fixed precision arithmetic" from Jul. 2016 -# to implement an efficient fixed precision bigint for embedded devices, especially FPGAs. -# -# For testing purpose, the flag `-d:stint_test` can be passed at compile-time -# to switch the backend to uint32. -# In the future the default backend will become uint128 on supporting compilers. -# -# This has following benefits: -# - BigEndian/LittleEndian support is trivial. 
-# - Not having for loops help the compiler producing the most efficient instructions -# like ADC (Add with Carry) -# - Proving that the recursive structure works at depth 64 for uint32 backend means that -# it would work at depth 128 for uint64 backend. -# We can easily choose a uint16 or uint8 backend as well. -# - Due to the recursive structure, testing operations when there is: -# - no leaves(uint64) -# - a root and leaves with no nodes (uint128) -# - a root + intermediate nodes + leaves (uint256) -# should be enough to ensure they work at all sizes, edge cases included. -# - Adding a new backend like uint128 (GCC/Clang) or uint256 (LLVM instrinsics only) is just adding -# a new case in the `uintImpl` template. -# - All math implementations of the operations have a straightforward translation -# to a high-low structure, including the fastest Karatsuba multiplication -# and co-recursive division algorithm by Burnikel and Ziegler. -# This makes translating those algorithms into Nim easier compared to an array backend. -# It would also probably require less code and would be much easier to audit versus -# the math reference papers. -# - For implementation of algorithms, there is no issue to take subslices of the memory representation -# with a recursive tree structure. -# On the other side, returning a `var array[N div 2, uint64]` is problematic at the moment. -# - Compile-time computation is possible while due to the previous issue -# an array backend would be required to use var openArray[uint64] -# i.e. pointers. -# - Note that while shift-right and left can easily be done an array of bytes -# this would have reduced performance compared to moving 64-bit words. -# An efficient implementation on array of words would require checking the shift -# versus a half-word to deal with carry-in/out from and to the adjacent words -# similar to a recursive implementation. -# -# Iterations over the whole integers, for example for `==` is always unrolled. -# Due to being on the stack, any optimizing compiler should compile that to efficient memcmp -# -# When not to use Stint: -# -# 1. Constant-time arithmetics -# - Do not use Stint if you need side-channels resistance, -# This requires to avoid all branches (i.e. no booleans) -# 2. Arbitrary-precision with lots of small-values -# - If you need arbitrary precision but most of the time you store mall values -# you will waste a lot of memory unless you use an object variant of various Stint sizes. -# type MyUint = object -# case kind: int -# of 0..64: uint64 -# of 66..128: ref Stuint[128] -# of 129..256: ref Stuint[256] -# ... -# -# Note: if you work with huge size, you can allocate stints on the heap with -# for example `type HeapInt8192 = ref Stint[8192]. -# If you have a lot of computations and intermediate variables it's probably worthwhile -# to explore creating an object pool to reuse the memory buffers. - -template checkDiv2(bits: static[int]): untyped = - # TODO: There is no need to check if power of 2 at each uintImpl instantiation, it slows compilation. - # However we easily get into nested templates instantiation if we use another - # template that first checks power-of-two and then calls the recursive uintImpl - static: - doAssert (bits and (bits-1)) == 0, $bits & " is not a power of 2" - doAssert bits >= 8, "The number of bits in a should be greater or equal to 8" - bits div 2 - -when defined(mpint_test): # TODO stint_test - template uintImpl*(bits: static[int]): untyped = - # Test version, StUint[64] = 2 uint32. 
Test the logic of the library - - when bits >= 128: UintImpl[uintImpl(checkDiv2(bits))] - elif bits == 64: UintImpl[uint32] - elif bits == 32: UintImpl[uint16] - elif bits == 16: UintImpl[uint8] - else: {.fatal: "Only power-of-2 >=16 supported when testing" .} - - template intImpl*(bits: static[int]): untyped = - # Test version, StInt[64] = 2 uint32. Test the logic of the library - # int is implemented using a signed hi part and an unsigned lo part, given - # that the sign resides in hi - - when bits >= 128: IntImpl[intImpl(checkDiv2(bits)), uintImpl(checkDiv2(bits))] - elif bits == 64: IntImpl[int32, uint32] - elif bits == 32: IntImpl[int16, uint16] - elif bits == 16: IntImpl[int8, uint8] - else: {.fatal: "Only power-of-2 >=16 supported when testing" .} +import + # Status lib + stew/bitops2 +when sizeof(int) == 8 and not defined(Stint32): + type Word* = uint64 else: - template uintImpl*(bits: static[int]): untyped = - mixin UintImpl - when bits >= 128: UintImpl[uintImpl(checkDiv2(bits))] - elif bits == 64: uint64 - elif bits == 32: uint32 - elif bits == 16: uint16 - elif bits == 8: uint8 - else: {.fatal: "Only power-of-2 >=8 supported" .} + type Word* = uint32 - template intImpl*(bits: static[int]): untyped = - # int is implemented using a signed hi part and an unsigned lo part, given - # that the sign resides in hi +type Word* = uint32 - when bits >= 128: IntImpl[intImpl(checkDiv2(bits)), uintImpl(checkDiv2(bits))] - elif bits == 64: int64 - elif bits == 32: int32 - elif bits == 16: int16 - elif bits == 8: int8 - else: {.fatal: "Only power-of-2 >=8 supported" .} +func wordsRequired*(bits: int): int {.compileTime.} = + ## Compute the number of limbs required + ## from the **announced** bit length + (bits + WordBitWidth - 1) div WordBitWidth type - # ### Private ### # - UintImpl*[BaseUint] = object - when system.cpuEndian == littleEndian: - lo*, hi*: BaseUint - else: - hi*, lo*: BaseUint - - IntImpl*[BaseInt, BaseUint] = object - # Ints are implemented in terms of uints - when system.cpuEndian == littleEndian: - lo*: BaseUint - hi*: BaseInt - else: - hi*: BaseInt - lo*: BaseUint - - # ### Private ### # + Limbs*[N: static int] = array[N, BaseUint] StUint*[bits: static[int]] = object - data*: uintImpl(bits) + ## Stack-based integer + ## Unsigned + limbs*: Limbs[bits.wordsRequired] StInt*[bits: static[int]] = object - data*: intImpl(bits) + ## Stack-based integer + ## Signed + limbs*: Limbs[bits.wordsRequired] -template applyHiLo*(a: UintImpl | IntImpl, c: untyped): untyped = - ## Apply `c` to each of `hi` and `lo` - var res: type a - res.hi = c(a.hi) - res.lo = c(a.lo) - res + Carry* = uint8 # distinct range[0'u8 .. 1] + Borrow* = uint8 # distinct range[0'u8 .. 
1] -template applyHiLo*(a, b: UintImpl | IntImpl, c: untyped): untyped = - ## Apply `c` to each of `hi` and `lo` - var res: type a - res.hi = c(a.hi, b.hi) - res.lo = c(a.lo, b.lo) - res +const GCC_Compatible* = defined(gcc) or defined(clang) or defined(llvm_gcc) +const X86* = defined(amd64) or defined(i386) + +when sizeof(int) == 8 and GCC_Compatible: + type + uint128*{.importc: "unsigned __int128".} = object template leastSignificantWord*(num: SomeInteger): auto = num -func leastSignificantWord*(num: UintImpl | IntImpl): auto {.inline.} = - when num.lo is UintImpl: - num.lo.leastSignificantWord +func leastSignificantWord*(limbs: Limbs): auto {.inline.} = + when cpuEndian == littleEndian: + limbs[0] else: - num.lo + limbs[^1] -func mostSignificantWord*(num: UintImpl | IntImpl): auto {.inline.} = - when num.hi is (UintImpl | IntImpl): - num.hi.mostSignificantWord +func mostSignificantWord*(limbs: Limbs): auto {.inline.} = + when cpuEndian == littleEndian: + limbs[^1] else: - num.hi + limbs[0] diff --git a/stint/private/primitives/addcarry_subborrow.nim b/stint/private/primitives/addcarry_subborrow.nim new file mode 100644 index 0000000..c4e27df --- /dev/null +++ b/stint/private/primitives/addcarry_subborrow.nim @@ -0,0 +1,169 @@ +# Stint +# Copyright 2018 Status Research & Development GmbH +# Licensed under either of +# +# * Apache License, version 2.0, ([LICENSE-APACHE](LICENSE-APACHE) or http://www.apache.org/licenses/LICENSE-2.0) +# * MIT license ([LICENSE-MIT](LICENSE-MIT) or http://opensource.org/licenses/MIT) +# +# at your option. This file may not be copied, modified, or distributed except according to those terms. + +import ../datatypes + +# ############################################################ +# +# Add-with-carry and Sub-with-borrow +# +# ############################################################ +# +# This file implements add-with-carry and sub-with-borrow +# +# It is currently (Mar 2020) impossible to have the compiler +# generate optimal code in a generic way. +# +# On x86, addcarry_u64 intrinsic will generate optimal code +# except for GCC. +# +# On other CPU architectures inline assembly might be desirable. +# A compiler proof-of-concept is available in the "research" folder. 
+#
+# See https://gcc.godbolt.org/z/2h768y
+# ```C
+# #include <stdint.h>
+# #include <x86intrin.h>
+#
+# void add256(uint64_t a[4], uint64_t b[4]){
+#   uint8_t carry = 0;
+#   for (int i = 0; i < 4; ++i)
+#     carry = _addcarry_u64(carry, a[i], b[i], &a[i]);
+# }
+# ```
+#
+# GCC
+# ```asm
+# add256:
+#         movq    (%rsi), %rax
+#         addq    (%rdi), %rax
+#         setc    %dl
+#         movq    %rax, (%rdi)
+#         movq    8(%rdi), %rax
+#         addb    $-1, %dl
+#         adcq    8(%rsi), %rax
+#         setc    %dl
+#         movq    %rax, 8(%rdi)
+#         movq    16(%rdi), %rax
+#         addb    $-1, %dl
+#         adcq    16(%rsi), %rax
+#         setc    %dl
+#         movq    %rax, 16(%rdi)
+#         movq    24(%rsi), %rax
+#         addb    $-1, %dl
+#         adcq    %rax, 24(%rdi)
+#         ret
+# ```
+#
+# Clang
+# ```asm
+# add256:
+#         movq    (%rsi), %rax
+#         addq    %rax, (%rdi)
+#         movq    8(%rsi), %rax
+#         adcq    %rax, 8(%rdi)
+#         movq    16(%rsi), %rax
+#         adcq    %rax, 16(%rdi)
+#         movq    24(%rsi), %rax
+#         adcq    %rax, 24(%rdi)
+#         retq
+# ```
+
+# ############################################################
+#
+#                        Intrinsics
+#
+# ############################################################
+
+# Note: GCC before 2017 had incorrect codegen in some cases:
+# - https://gcc.gnu.org/bugzilla/show_bug.cgi?id=81300
+
+when X86:
+  when defined(windows):
+    {.pragma: intrinsics, header:"<intrin.h>", nodecl.}
+  else:
+    {.pragma: intrinsics, header:"<x86intrin.h>", nodecl.}
+
+  func addcarry_u32(carryIn: Carry, a, b: uint32, sum: var uint32): Carry {.importc: "_addcarry_u32", intrinsics.}
+  func subborrow_u32(borrowIn: Borrow, a, b: uint32, diff: var uint32): Borrow {.importc: "_subborrow_u32", intrinsics.}
+
+  func addcarry_u64(carryIn: Carry, a, b: uint64, sum: var uint64): Carry {.importc: "_addcarry_u64", intrinsics.}
+  func subborrow_u64(borrowIn: Borrow, a, b: uint64, diff: var uint64): Borrow {.importc: "_subborrow_u64", intrinsics.}
+
+# ############################################################
+#
+#                        Public
+#
+# ############################################################
+
+func addC*(cOut: var Carry, sum: var uint32, a, b: uint32, cIn: Carry) {.inline.} =
+  ## Addition with carry
+  ## (CarryOut, Sum) <- a + b + CarryIn
+  when X86:
+    cOut = addcarry_u32(cIn, a, b, sum)
+  else:
+    let dblPrec = uint64(cIn) + uint64(a) + uint64(b)
+    sum = (uint32)(dblPrec)
+    cOut = Carry(dblPrec shr 32)
+
+func subB*(bOut: var Borrow, diff: var uint32, a, b: uint32, bIn: Borrow) {.inline.} =
+  ## Subtraction with borrow
+  ## (BorrowOut, Diff) <- a - b - borrowIn
+  when X86:
+    bOut = subborrow_u32(bIn, a, b, diff)
+  else:
+    let dblPrec = uint64(a) - uint64(b) - uint64(bIn)
+    diff = (uint32)(dblPrec)
+    # On borrow the high word will be 0b1111...1111 and needs to be masked
+    bOut = Borrow((dblPrec shr 32) and 1)
+
+func addC*(cOut: var Carry, sum: var uint64, a, b: uint64, cIn: Carry) {.inline.} =
+  ## Addition with carry
+  ## (CarryOut, Sum) <- a + b + CarryIn
+  when X86:
+    cOut = addcarry_u64(cIn, a, b, sum)
+  else:
+    block:
+      static:
+        doAssert GCC_Compatible
+        doAssert sizeof(int) == 8
+
+      var dblPrec {.noInit.}: uint128
+      {.emit:[dblPrec, " = (unsigned __int128)", a," + (unsigned __int128)", b, " + (unsigned __int128)",cIn,";"].}
+
+      # Don't forget to dereference the var param in C mode
+      when defined(cpp):
+        {.emit:[cOut, " = (NU64)(", dblPrec," >> ", 64'u64, ");"].}
+        {.emit:[sum, " = (NU64)", dblPrec,";"].}
+      else:
+        {.emit:["*",cOut, " = (NU64)(", dblPrec," >> ", 64'u64, ");"].}
+        {.emit:["*",sum, " = (NU64)", dblPrec,";"].}
+
+func subB*(bOut: var Borrow, diff: var uint64, a, b: uint64, bIn: Borrow) {.inline.} =
+  ## Subtraction with borrow
+  ## (BorrowOut, Diff) <- a - b - borrowIn
+  when X86:
+    bOut = subborrow_u64(bIn, a, b, diff)
+ 
else: + block: + static: + doAssert GCC_Compatible + doAssert sizeof(int) == 8 + + var dblPrec {.noInit.}: uint128 + {.emit:[dblPrec, " = (unsigned __int128)", a," - (unsigned __int128)", b, " - (unsigned __int128)",bIn,";"].} + + # Don't forget to dereference the var param in C mode + # On borrow the high word will be 0b1111...1111 and needs to be masked + when defined(cpp): + {.emit:[bOut, " = (NU64)(", dblPrec," >> ", 64'u64, ") & 1;"].} + {.emit:[diff, " = (NU64)", dblPrec,";"].} + else: + {.emit:["*",bOut, " = (NU64)(", dblPrec," >> ", 64'u64, ") & 1;"].} + {.emit:["*",diff, " = (NU64)", dblPrec,";"].} diff --git a/stint/private/primitives/extended_precision.nim b/stint/private/primitives/extended_precision.nim new file mode 100644 index 0000000..ef75040 --- /dev/null +++ b/stint/private/primitives/extended_precision.nim @@ -0,0 +1,130 @@ +# Stint +# Copyright 2018 Status Research & Development GmbH +# Licensed under either of +# +# * Apache License, version 2.0, ([LICENSE-APACHE](LICENSE-APACHE) or http://www.apache.org/licenses/LICENSE-2.0) +# * MIT license ([LICENSE-MIT](LICENSE-MIT) or http://opensource.org/licenses/MIT) +# +# at your option. This file may not be copied, modified, or distributed except according to those terms. + +# ############################################################ +# +# Extended precision primitives +# +# ############################################################ + +import + ../datatypes, + ./addcarry_subborrow + +# ############################################################ +# +# 32-bit words +# +# ############################################################ + +func div2n1n*(q, r: var uint32, n_hi, n_lo, d: uint32) {.inline.}= + ## Division uint64 by uint32 + ## Warning ⚠️ : + ## - if n_hi == d, quotient does not fit in an uint32 + ## - if n_hi > d result is undefined + ## + ## To avoid issues, n_hi, n_lo, d should be normalized. + ## i.e. shifted (== multiplied by the same power of 2) + ## so that the most significant bit in d is set. 
+  let dividend = (uint64(n_hi) shl 32) or uint64(n_lo)
+  let divisor = uint64(d)
+  q = uint32(dividend div divisor)
+  r = uint32(dividend mod divisor)
+
+func mul*(hi, lo: var uint32, a, b: uint32) {.inline.} =
+  ## Extended precision multiplication
+  ## (hi, lo) <- a*b
+  let dblPrec = uint64(a) * uint64(b)
+  lo = uint32(dblPrec)
+  hi = uint32(dblPrec shr 32)
+
+func muladd1*(hi, lo: var uint32, a, b, c: uint32) {.inline.} =
+  ## Extended precision multiplication + addition
+  ## (hi, lo) <- a*b + c
+  ##
+  ## Note: 0xFFFFFFFF² -> (hi: 0xFFFFFFFE, lo: 0x00000001)
+  ##       so adding any c cannot overflow
+  let dblPrec = uint64(a) * uint64(b) + uint64(c)
+  lo = uint32(dblPrec)
+  hi = uint32(dblPrec shr 32)
+
+func muladd2*(hi, lo: var uint32, a, b, c1, c2: uint32) {.inline.}=
+  ## Extended precision multiplication + addition + addition
+  ## This is constant-time on most hardware, except on some specific chips like the Cortex M0
+  ## (hi, lo) <- a*b + c1 + c2
+  ##
+  ## Note: 0xFFFFFFFF² -> (hi: 0xFFFFFFFE, lo: 0x00000001)
+  ##       so adding 0xFFFFFFFF leads to (hi: 0xFFFFFFFF, lo: 0x00000000)
+  ##       and we have enough space to add again 0xFFFFFFFF without overflowing
+  let dblPrec = uint64(a) * uint64(b) + uint64(c1) + uint64(c2)
+  lo = uint32(dblPrec)
+  hi = uint32(dblPrec shr 32)
+
+# ############################################################
+#
+#                        64-bit words
+#
+# ############################################################
+
+when sizeof(int) == 8:
+  when defined(vcc):
+    from ./extended_precision_x86_64_msvc import div2n1n, mul, muladd1, muladd2
+  elif GCC_Compatible:
+    when X86:
+      from ./extended_precision_x86_64_gcc import div2n1n
+      from ./extended_precision_64bit_uint128 import mul, muladd1, muladd2
+    else:
+      from ./extended_precision_64bit_uint128 import div2n1n, mul, muladd1, muladd2
+  export div2n1n, mul, muladd1, muladd2
+
+# ############################################################
+#
+#                        Composite primitives
+#
+# ############################################################
+
+func mulDoubleAdd2*[T: uint32|uint64](r2: var Carry, r1, r0: var T, a, b, c: T, dHi: Carry, dLo: T) {.inline.} =
+  ## (r2, r1, r0) <- 2*a*b + c + (dHi, dLo)
+  ## with r = (r2, r1, r0) a triple-word number
+  ## and d = (dHi, dLo) a double-word number
+  ## r2 and dHi are carries, either 0 or 1
+
+  var carry: Carry
+
+  # (r1, r0) <- a*b
+  # Note: 0xFFFFFFFF_FFFFFFFF² -> (hi: 0xFFFFFFFF_FFFFFFFE, lo: 0x00000000_00000001)
+  mul(r1, r0, a, b)
+
+  # (r2, r1, r0) <- 2*a*b
+  # Then (hi: 0xFFFFFFFF_FFFFFFFE, lo: 0x00000000_00000001) * 2
+  # (carry: 1, hi: 0xFFFFFFFF_FFFFFFFC, lo: 0x00000000_00000002)
+  addC(carry, r0, r0, r0, Carry(0))
+  addC(r2, r1, r1, r1, carry)
+
+  # (r1, r0) <- (r1, r0) + c
+  # Adding any uint64 cannot overflow into r2; for example adding 2^64-1 gives
+  # (carry: 1, hi: 0xFFFFFFFF_FFFFFFFD, lo: 0x00000000_00000001)
+  addC(carry, r0, r0, c, Carry(0))
+  addC(carry, r1, r1, T(0), carry)
+
+  # (r1, r0) <- (r1, r0) + (dHi, dLo) with dHi a carry (previous limb r2)
+  # (dHi, dLo) is at most (dHi: 1, dLo: 0xFFFFFFFF_FFFFFFFF)
+  # summing into (carry: 1, hi: 0xFFFFFFFF_FFFFFFFD, lo: 0x00000000_00000001)
+  # result at most in (carry: 1, hi: 0xFFFFFFFF_FFFFFFFF, lo: 0x00000000_00000000)
+  addC(carry, r0, r0, dLo, Carry(0))
+  addC(carry, r1, r1, T(dHi), carry)
+
+func mulAcc*[T: uint32|uint64](t, u, v: var T, a, b: T) {.inline.} =
+  ## (t, u, v) <- (t, u, v) + a * b
+  var UV: array[2, T]
+  var carry: Carry
+  mul(UV[1], UV[0], a, b)
+  addC(carry, v, v, UV[0], Carry(0))
+  addC(carry, u, u, UV[1], carry)
+  t += T(carry)
diff --git 
a/stint/private/primitives/extended_precision_64bit_uint128.nim b/stint/private/primitives/extended_precision_64bit_uint128.nim new file mode 100644 index 0000000..7861427 --- /dev/null +++ b/stint/private/primitives/extended_precision_64bit_uint128.nim @@ -0,0 +1,95 @@ +# Stint +# Copyright 2018 Status Research & Development GmbH +# Licensed under either of +# +# * Apache License, version 2.0, ([LICENSE-APACHE](LICENSE-APACHE) or http://www.apache.org/licenses/LICENSE-2.0) +# * MIT license ([LICENSE-MIT](LICENSE-MIT) or http://opensource.org/licenses/MIT) +# +# at your option. This file may not be copied, modified, or distributed except according to those terms. + +import ../datatypes + +# ############################################################ +# +# Extended precision primitives on GCC & Clang (all CPU archs) +# +# ############################################################ + +static: + doAssert GCC_Compatible + doAssert sizeof(int) == 8 + +func div2n1n*(q, r: var uint64, n_hi, n_lo, d: uint64) {.inline.}= + ## Division uint128 by uint64 + ## Warning ⚠️ : + ## - if n_hi == d, quotient does not fit in an uint64 and will throw SIGFPE on some platforms + ## - if n_hi > d result is undefined + var dblPrec {.noInit.}: uint128 + {.emit:[dblPrec, " = (unsigned __int128)", n_hi," << 64 | (unsigned __int128)",n_lo,";"].} + + # Don't forget to dereference the var param in C mode + when defined(cpp): + {.emit:[q, " = (NU64)(", dblPrec," / ", d, ");"].} + {.emit:[r, " = (NU64)(", dblPrec," % ", d, ");"].} + else: + {.emit:["*",q, " = (NU64)(", dblPrec," / ", d, ");"].} + {.emit:["*",r, " = (NU64)(", dblPrec," % ", d, ");"].} + +func mul*(hi, lo: var uint64, a, b: uint64) {.inline.} = + ## Extended precision multiplication + ## (hi, lo) <- a*b + block: + var dblPrec {.noInit.}: uint128 + {.emit:[dblPrec, " = (unsigned __int128)", a," * (unsigned __int128)", b,";"].} + + # Don't forget to dereference the var param in C mode + when defined(cpp): + {.emit:[hi, " = (NU64)(", dblPrec," >> ", 64'u64, ");"].} + {.emit:[lo, " = (NU64)", dblPrec,";"].} + else: + {.emit:["*",hi, " = (NU64)(", dblPrec," >> ", 64'u64, ");"].} + {.emit:["*",lo, " = (NU64)", dblPrec,";"].} + +func muladd1*(hi, lo: var uint64, a, b, c: uint64) {.inline.} = + ## Extended precision multiplication + addition + ## (hi, lo) <- a*b + c + ## + ## Note: 0xFFFFFFFF_FFFFFFFF² -> (hi: 0xFFFFFFFFFFFFFFFE, lo: 0x0000000000000001) + ## so adding any c cannot overflow + ## + ## This is constant-time on most hardware + ## See: https://www.bearssl.org/ctmul.html + block: + var dblPrec {.noInit.}: uint128 + {.emit:[dblPrec, " = (unsigned __int128)", a," * (unsigned __int128)", b, " + (unsigned __int128)",c,";"].} + + # Don't forget to dereference the var param in C mode + when defined(cpp): + {.emit:[hi, " = (NU64)(", dblPrec," >> ", 64'u64, ");"].} + {.emit:[lo, " = (NU64)", dblPrec,";"].} + else: + {.emit:["*",hi, " = (NU64)(", dblPrec," >> ", 64'u64, ");"].} + {.emit:["*",lo, " = (NU64)", dblPrec,";"].} + +func muladd2*(hi, lo: var uint64, a, b, c1, c2: uint64) {.inline.}= + ## Extended precision multiplication + addition + addition + ## This is constant-time on most hardware except some specific one like Cortex M0 + ## (hi, lo) <- a*b + c1 + c2 + ## + ## Note: 0xFFFFFFFF_FFFFFFFF² -> (hi: 0xFFFFFFFFFFFFFFFE, lo: 0x0000000000000001) + ## so adding 0xFFFFFFFFFFFFFFFF leads to (hi: 0xFFFFFFFFFFFFFFFF, lo: 0x0000000000000000) + ## and we have enough space to add again 0xFFFFFFFFFFFFFFFF without overflowing + block: + var dblPrec {.noInit.}: 
uint128 + {.emit:[ + dblPrec, " = (unsigned __int128)", a," * (unsigned __int128)", b, + " + (unsigned __int128)",c1," + (unsigned __int128)",c2,";" + ].} + + # Don't forget to dereference the var param in C mode + when defined(cpp): + {.emit:[hi, " = (NU64)(", dblPrec," >> ", 64'u64, ");"].} + {.emit:[lo, " = (NU64)", dblPrec,";"].} + else: + {.emit:["*",hi, " = (NU64)(", dblPrec," >> ", 64'u64, ");"].} + {.emit:["*",lo, " = (NU64)", dblPrec,";"].} diff --git a/stint/private/primitives/extended_precision_x86_64_gcc.nim b/stint/private/primitives/extended_precision_x86_64_gcc.nim new file mode 100644 index 0000000..0e18c7f --- /dev/null +++ b/stint/private/primitives/extended_precision_x86_64_gcc.nim @@ -0,0 +1,57 @@ +# Stint +# Copyright 2018 Status Research & Development GmbH +# Licensed under either of +# +# * Apache License, version 2.0, ([LICENSE-APACHE](LICENSE-APACHE) or http://www.apache.org/licenses/LICENSE-2.0) +# * MIT license ([LICENSE-MIT](LICENSE-MIT) or http://opensource.org/licenses/MIT) +# +# at your option. This file may not be copied, modified, or distributed except according to those terms. + +import ../datatypes + +# ############################################################ +# +# Extended precision primitives for X86-64 on GCC & Clang +# +# ############################################################ + +static: + doAssert(defined(gcc) or defined(clang) or defined(llvm_gcc)) + doAssert sizeof(int) == 8 + doAssert X86 + +func div2n1n*(q, r: var uint64, n_hi, n_lo, d: uint64) {.inline.}= + ## Division uint128 by uint64 + ## Warning ⚠️ : + ## - if n_hi == d, quotient does not fit in an uint64 and will throw SIGFPE + ## - if n_hi > d result is undefined + + # DIV r/m64 + # Divide RDX:RAX (n_hi:n_lo) by r/m64 + # + # Inputs + # - numerator high word in RDX, + # - numerator low word in RAX, + # - divisor as r/m parameter (register or memory at the compiler discretion) + # Result + # - Quotient in RAX + # - Remainder in RDX + + # 1. name the register/memory "divisor" + # 2. don't forget to dereference the var hidden pointer + # 3. - + # 4. no clobbered registers beside explicitly used RAX and RDX + when defined(cpp): + asm """ + divq %[divisor] + : "=a" (`q`), "=d" (`r`) + : "d" (`n_hi`), "a" (`n_lo`), [divisor] "rm" (`d`) + : + """ + else: + asm """ + divq %[divisor] + : "=a" (`*q`), "=d" (`*r`) + : "d" (`n_hi`), "a" (`n_lo`), [divisor] "rm" (`d`) + : + """ diff --git a/stint/private/primitives/extended_precision_x86_64_msvc.nim b/stint/private/primitives/extended_precision_x86_64_msvc.nim new file mode 100644 index 0000000..9adcd32 --- /dev/null +++ b/stint/private/primitives/extended_precision_x86_64_msvc.nim @@ -0,0 +1,87 @@ +# Stint +# Copyright 2018 Status Research & Development GmbH +# Licensed under either of +# +# * Apache License, version 2.0, ([LICENSE-APACHE](LICENSE-APACHE) or http://www.apache.org/licenses/LICENSE-2.0) +# * MIT license ([LICENSE-MIT](LICENSE-MIT) or http://opensource.org/licenses/MIT) +# +# at your option. This file may not be copied, modified, or distributed except according to those terms. 
+
+import
+  ../datatypes,
+  ./addcarry_subborrow
+
+# ############################################################
+#
+#        Extended precision primitives for X86-64 on MSVC
+#
+# ############################################################
+
+static:
+  doAssert defined(vcc)
+  doAssert sizeof(int) == 8
+  doAssert X86
+
+func udiv128(highDividend, lowDividend, divisor: uint64, remainder: var uint64): uint64 {.importc:"_udiv128", header: "<intrin.h>", nodecl.}
+  ## Division 128 by 64, Microsoft only, 64-bit only,
+  ## returns the quotient as the return value and the remainder as a var parameter
+  ## Warning ⚠️ :
+  ##   - if n_hi == d, quotient does not fit in an uint64 and will throw SIGFPE
+  ##   - if n_hi > d result is undefined
+
+func umul128(a, b: uint64, hi: var uint64): uint64 {.importc:"_umul128", header:"<intrin.h>", nodecl.}
+  ## (hi, lo) <-- a * b
+  ## Return value is the low word
+
+func div2n1n*(q, r: var uint64, n_hi, n_lo, d: uint64) {.inline.}=
+  ## Division uint128 by uint64
+  ## Warning ⚠️ :
+  ##   - if n_hi == d, quotient does not fit in an uint64 and will throw SIGFPE
+  ##   - if n_hi > d result is undefined
+  {.warning: "div2n1n is not constant-time at the moment on most hardware".}
+
+  # TODO !!! - Replace by constant-time, portable, non-assembly version
+  #          -> use uint128? Compiler might add unwanted branches
+  q = udiv128(n_hi, n_lo, d, r)
+
+func mul*(hi, lo: var uint64, a, b: uint64) {.inline.} =
+  ## Extended precision multiplication
+  ## (hi, lo) <- a*b
+  ##
+  ## This is constant-time on most hardware
+  ## See: https://www.bearssl.org/ctmul.html
+  lo = umul128(a, b, hi)
+
+func muladd1*(hi, lo: var uint64, a, b, c: uint64) {.inline.} =
+  ## Extended precision multiplication + addition
+  ## (hi, lo) <- a*b + c
+  ##
+  ## Note: 0xFFFFFFFF_FFFFFFFF² -> (hi: 0xFFFFFFFFFFFFFFFE, lo: 0x0000000000000001)
+  ##       so adding any c cannot overflow
+  ##
+  ## This is constant-time on most hardware
+  ## See: https://www.bearssl.org/ctmul.html
+  var carry: Carry
+  lo = umul128(a, b, hi)
+  addC(carry, lo, lo, c, Carry(0))
+  addC(carry, hi, hi, 0, carry)
+
+func muladd2*(hi, lo: var uint64, a, b, c1, c2: uint64) {.inline.}=
+  ## Extended precision multiplication + addition + addition
+  ## This is constant-time on most hardware, except on some specific chips like the Cortex M0
+  ## (hi, lo) <- a*b + c1 + c2
+  ##
+  ## Note: 0xFFFFFFFF_FFFFFFFF² -> (hi: 0xFFFFFFFFFFFFFFFE, lo: 0x0000000000000001)
+  ##       so adding 0xFFFFFFFFFFFFFFFF leads to (hi: 0xFFFFFFFFFFFFFFFF, lo: 0x0000000000000000)
+  ##       and we have enough space to add again 0xFFFFFFFFFFFFFFFF without overflowing
+  # For speed this could be implemented with parallel pipelined carry chains
+  # via MULX + ADCX + ADOX
+  var carry1, carry2: Carry
+
+  lo = umul128(a, b, hi)
+  # Carry chain 1
+  addC(carry1, lo, lo, c1, Carry(0))
+  addC(carry1, hi, hi, 0, carry1)
+  # Carry chain 2
+  addC(carry2, lo, lo, c2, Carry(0))
+  addC(carry2, hi, hi, 0, carry2)
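A minimal usage sketch of the new limb-based layout introduced in datatypes.nim, assuming a 64-bit build (`Word == uint64`, so one limb covers 64 announced bits) and that `Limbs` is an array of `Word`. Everything outside the patched modules is illustrative:

```nim
import stint/private/datatypes

# Assumption: 64-bit limbs, so the announced bit width is rounded up
# to a whole number of uint64 words.
static:
  doAssert wordsRequired(256) == 4   # StUint[256] -> 4 x uint64 limbs
  doAssert wordsRequired(65)  == 2   # partial words round up

var x: StUint[256]
doAssert x.limbs.len == 4            # Limbs[4] == array[4, Word]
```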
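The `addC`/`subB` primitives are meant to be chained limb by limb, least-significant word first. A sketch of a 4-limb addition on top of the 64-bit `addC` overload; `add4` and the fixed width are illustrative, not part of this patch:

```nim
import
  stint/private/datatypes,
  stint/private/primitives/addcarry_subborrow

# Illustrative 4-limb addition: thread the carry through each addC call
# and return the final carry-out.
func add4(r: var array[4, uint64], a, b: array[4, uint64]): Carry =
  result = Carry(0)
  for i in 0 ..< 4:
    addC(result, r[i], a[i], b[i], result)
```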
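`div2n1n` expects normalized operands: the divisor must have its most significant bit set and `n_hi < d`. A sketch of the normalize/denormalize wrapper one would put around the 64-bit `div2n1n`, using `countLeadingZeroBits` from `std/bitops`; the `divRem2by1` helper is illustrative:

```nim
import
  std/bitops,
  stint/private/primitives/extended_precision

# Illustrative wrapper: shift (u_hi, u_lo) and d left until d's top bit
# is set, divide, then shift the remainder back down.
# Requires d != 0 and u_hi < d so that the quotient fits in one word.
func divRem2by1(q, r: var uint64, u_hi, u_lo, d: uint64) =
  let shift = countLeadingZeroBits(d)
  let dn = d shl shift
  let un_hi =
    if shift == 0: u_hi
    else: (u_hi shl shift) or (u_lo shr (64 - shift))
  let un_lo = u_lo shl shift
  div2n1n(q, r, un_hi, un_lo, dn)
  r = r shr shift
```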
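`mulAcc` is the accumulation step of a product-scanning (Comba) multiplication. A sketch of a 4×4-limb product built on it; `mul4x4` and the fixed sizes are illustrative, not the multiplication this patch ships:

```nim
import stint/private/primitives/extended_precision

# Illustrative product scanning: for each result position k, accumulate
# every partial product a[i]*b[j] with i + j == k into the (t, u, v)
# triple-word accumulator, emit the low word, then shift the accumulator
# down by one word.
func mul4x4(r: var array[8, uint64], a, b: array[4, uint64]) =
  var t, u, v: uint64
  for k in 0 ..< 7:
    for i in 0 ..< 4:
      let j = k - i
      if j in 0 .. 3:
        mulAcc(t, u, v, a[i], b[j])
    r[k] = v
    v = u
    u = t
    t = 0
  r[7] = v
```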