Implement types and uint128 primitives

Mamy André-Ratsimbazafy 2020-06-12 18:37:02 +02:00 committed by jangko
parent c0ae9e10a9
commit de87739635
7 changed files with 572 additions and 228 deletions


@@ -1,59 +0,0 @@
# Stint
# Copyright 2018 Status Research & Development GmbH
# Licensed under either of
#
# * Apache License, version 2.0, ([LICENSE-APACHE](LICENSE-APACHE) or http://www.apache.org/licenses/LICENSE-2.0)
# * MIT license ([LICENSE-MIT](LICENSE-MIT) or http://opensource.org/licenses/MIT)
#
# at your option. This file may not be copied, modified, or distributed except according to those terms.
import ./datatypes
func toSubtype*[T: SomeInteger](b: bool, _: typedesc[T]): T {.inline.}=
b.T
func toSubtype*[T: UintImpl | IntImpl](b: bool, _: typedesc[T]): T {.inline.}=
type SubTy = type result.lo
result.lo = toSubtype(b, SubTy)
func toUint*(n: UintImpl or IntImpl or SomeSignedInt): auto {.inline.}=
## Casts an integer to an uint of the same size
# TODO: uint128 support
when n.sizeof > 8:
{.fatal: "Unreachable. You are trying to cast a StUint with more than 64-bit of precision" .}
elif n.sizeof == 8:
cast[uint64](n)
elif n.sizeof == 4:
cast[uint32](n)
elif n.sizeof == 2:
cast[uint16](n)
else:
cast[uint8](n)
func toUint*(n: SomeUnsignedInt): SomeUnsignedInt {.inline.}=
## No-op overload of multi-precision int casting
n
func asDoubleUint*(n: UintImpl | SomeUnsignedInt): auto {.inline.} =
## Convert an integer or StUint to an uint with double the size
type Double = (
when n.sizeof == 4: uint64
elif n.sizeof == 2: uint32
else: uint16
)
n.toUint.Double
func toInt*(n: UintImpl or IntImpl or SomeInteger): auto {.inline.}=
## Casts an integer to an int of the same size
# TODO: uint128 support
when n.sizeof > 8:
{.fatal: "Unreachable. You are trying to cast a StUint with more than 64-bit of precision" .}
elif n.sizeof == 8:
cast[int64](n)
elif n.sizeof == 4:
cast[int32](n)
elif n.sizeof == 2:
cast[int16](n)
else:
cast[int8](n)


@@ -7,191 +7,56 @@
#
# at your option. This file may not be copied, modified, or distributed except according to those terms.
# TODO: test if GCC/Clang support uint128 natively
# #### Overview
#
# Stint extends the default uint8, uint16, uint32, uint64 with power-of-2 integers.
# The only limitation is your stack size, so you can have uint128, uint256, uint512 ...
# Signed ints are also possible.
#
# As a high-level API, Stint adheres to Nim and C conventions and uses the same operators,
# like `+`, `xor`, `not` ...
#
# #### Implementation
#
# Stint types are stored on the stack and have a structure
# similar to a binary tree of power of two unsigned integers
# with "high" and "low" words:
#
# Stuint[256]
# hi: Stuint[128] lo: Stuint[128]
# hihi: uint64 hilo: uint64 lohi: uint64 lolo: uint64
#
# This follows the paper https://hal.archives-ouvertes.fr/hal-00582593v2
# "Recursive double-size fixed precision arithmetic" from Jul. 2016
# to implement an efficient fixed-precision bigint for embedded devices, especially FPGAs.
#
# For testing purposes, the flag `-d:stint_test` can be passed at compile time
# to switch the backend to uint32.
# In the future the default backend will become uint128 on supporting compilers.
#
# This has the following benefits:
#   - BigEndian/LittleEndian support is trivial.
#   - Not having for loops helps the compiler produce the most efficient instructions,
#     like ADC (Add with Carry).
#   - Proving that the recursive structure works at depth 64 for the uint32 backend means that
#     it would work at depth 128 for the uint64 backend.
#     We can easily choose a uint16 or uint8 backend as well.
#   - Due to the recursive structure, testing operations when there is:
#     - no leaves (uint64)
#     - a root and leaves with no nodes (uint128)
#     - a root + intermediate nodes + leaves (uint256)
#     should be enough to ensure they work at all sizes, edge cases included.
#   - Adding a new backend like uint128 (GCC/Clang) or uint256 (LLVM intrinsics only) is just adding
#     a new case in the `uintImpl` template.
#   - All math implementations of the operations have a straightforward translation
#     to a high-low structure, including the fastest Karatsuba multiplication
#     and the co-recursive division algorithm by Burnikel and Ziegler.
#     This makes translating those algorithms into Nim easier compared to an array backend.
#     It would also probably require less code and would be much easier to audit against
#     the math reference papers.
#   - When implementing algorithms, there is no issue taking subslices of the memory representation
#     with a recursive tree structure.
#     On the other hand, returning a `var array[N div 2, uint64]` is problematic at the moment.
#   - Compile-time computation is possible, while due to the previous issue
#     an array backend would have to use var openArray[uint64],
#     i.e. pointers.
#   - Note that while shift-right and shift-left can easily be done on an array of bytes,
#     this would have reduced performance compared to moving 64-bit words.
#     An efficient implementation on an array of words would require checking the shift
#     against a half-word to deal with carry-in/out from and to the adjacent words,
#     similar to a recursive implementation.
#
# Iteration over the whole integer, for example for `==`, is always unrolled.
# Since the data lives on the stack, any optimizing compiler should compile that to an efficient memcmp.
#
# When not to use Stint:
#
# 1. Constant-time arithmetic
#    - Do not use Stint if you need side-channel resistance;
#      this requires avoiding all branches (i.e. no booleans).
# 2. Arbitrary precision with lots of small values
#    - If you need arbitrary precision but most of the time you store small values,
#      you will waste a lot of memory unless you use an object variant of various Stint sizes.
# type MyUint = object
# case kind: int
# of 0..64: uint64
# of 65..128: ref Stuint[128]
# of 129..256: ref Stuint[256]
# ...
#
# Note: if you work with huge sizes, you can allocate stints on the heap,
# for example `type HeapInt8192 = ref Stint[8192]`.
# If you have a lot of computations and intermediate variables, it's probably worthwhile
# to explore creating an object pool to reuse the memory buffers.
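As a rough illustration of the recursive hi/lo layout described above (a sketch only, not part of this commit; it assumes the uint64 backend and ignores the endianness-dependent field order):

```nim
# Illustrative sketch of how uintImpl(256) nests (uint64 backend assumed).
type
  U128 = object
    hi, lo: uint64       # the leaves
  U256 = object
    hi, lo: U128         # hi.hi, hi.lo, lo.hi, lo.lo are the four uint64 leaves

# A 256-bit value `a` is then addressed as a.hi.hi, a.hi.lo, a.lo.hi, a.lo.lo.
```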
template checkDiv2(bits: static[int]): untyped =
# TODO: There is no need to check for a power of 2 at each uintImpl instantiation; it slows compilation.
#       However, we easily get into nested template instantiation if we use another
#       template that first checks power-of-two and then calls the recursive uintImpl
static:
doAssert (bits and (bits-1)) == 0, $bits & " is not a power of 2"
doAssert bits >= 8, "The number of bits should be greater than or equal to 8"
bits div 2
when defined(mpint_test): # TODO stint_test
template uintImpl*(bits: static[int]): untyped =
# Test version, StUint[64] = 2 uint32. Test the logic of the library
when bits >= 128: UintImpl[uintImpl(checkDiv2(bits))]
elif bits == 64: UintImpl[uint32]
elif bits == 32: UintImpl[uint16]
elif bits == 16: UintImpl[uint8]
else: {.fatal: "Only power-of-2 >=16 supported when testing" .}
template intImpl*(bits: static[int]): untyped =
# Test version, StInt[64] = 2 uint32. Test the logic of the library
# int is implemented using a signed hi part and an unsigned lo part, given
# that the sign resides in hi
when bits >= 128: IntImpl[intImpl(checkDiv2(bits)), uintImpl(checkDiv2(bits))]
elif bits == 64: IntImpl[int32, uint32]
elif bits == 32: IntImpl[int16, uint16]
elif bits == 16: IntImpl[int8, uint8]
else: {.fatal: "Only power-of-2 >=16 supported when testing" .}
import
# Status lib
stew/bitops2
when sizeof(int) == 8 and not defined(Stint32):
type Word* = uint64
else:
template uintImpl*(bits: static[int]): untyped =
mixin UintImpl
when bits >= 128: UintImpl[uintImpl(checkDiv2(bits))]
elif bits == 64: uint64
elif bits == 32: uint32
elif bits == 16: uint16
elif bits == 8: uint8
else: {.fatal: "Only power-of-2 >=8 supported" .}
type Word* = uint32
template intImpl*(bits: static[int]): untyped =
# int is implemented using a signed hi part and an unsigned lo part, given
# that the sign resides in hi
type Word* = uint32
when bits >= 128: IntImpl[intImpl(checkDiv2(bits)), uintImpl(checkDiv2(bits))]
elif bits == 64: int64
elif bits == 32: int32
elif bits == 16: int16
elif bits == 8: int8
else: {.fatal: "Only power-of-2 >=8 supported" .}
func wordsRequired*(bits: int): int {.compileTime.} =
## Compute the number of limbs required
## from the **announced** bit length
(bits + WordBitWidth - 1) div WordBitWidth
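A quick worked example of the formula (assuming a 64-bit `Word`, i.e. `WordBitWidth == 64`, which is defined elsewhere in the module):

```nim
static:
  # (bits + WordBitWidth - 1) div WordBitWidth rounds up to whole limbs
  doAssert wordsRequired(256) == 4   # (256 + 63) div 64
  doAssert wordsRequired(72) == 2    # (72 + 63) div 64, partial limbs round up
```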
type
# ### Private ### #
UintImpl*[BaseUint] = object
when system.cpuEndian == littleEndian:
lo*, hi*: BaseUint
else:
hi*, lo*: BaseUint
IntImpl*[BaseInt, BaseUint] = object
# Ints are implemented in terms of uints
when system.cpuEndian == littleEndian:
lo*: BaseUint
hi*: BaseInt
else:
hi*: BaseInt
lo*: BaseUint
# ### Private ### #
Limbs*[N: static int] = array[N, Word]
StUint*[bits: static[int]] = object
data*: uintImpl(bits)
## Stack-based integer
## Unsigned
limbs*: Limbs[bits.wordsRequired]
StInt*[bits: static[int]] = object
data*: intImpl(bits)
## Stack-based integer
## Signed
limbs*: Limbs[bits.wordsRequired]
Carry* = uint8 # distinct range[0'u8 .. 1]
Borrow* = uint8 # distinct range[0'u8 .. 1]
template applyHiLo*(a: UintImpl | IntImpl, c: untyped): untyped =
  ## Apply `c` to each of `hi` and `lo`
  var res: type a
  res.hi = c(a.hi)
  res.lo = c(a.lo)
  res
template applyHiLo*(a, b: UintImpl | IntImpl, c: untyped): untyped =
  ## Apply `c` to each of `hi` and `lo`
  var res: type a
  res.hi = c(a.hi, b.hi)
  res.lo = c(a.lo, b.lo)
  res
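A hypothetical usage sketch (not from this commit) showing how these templates lift operations onto the recursive hi/lo representation:

```nim
# Sketch: lifting bitwise operations over the hi/lo halves with applyHiLo.
func `not`*[T](x: UintImpl[T]): UintImpl[T] {.inline.} =
  applyHiLo(x, `not`)          # recurses until the uint64/uint32 leaves

func `xor`*[T](x, y: UintImpl[T]): UintImpl[T] {.inline.} =
  applyHiLo(x, y, `xor`)
```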
const GCC_Compatible* = defined(gcc) or defined(clang) or defined(llvm_gcc)
const X86* = defined(amd64) or defined(i386)
when sizeof(int) == 8 and GCC_Compatible:
type
uint128*{.importc: "unsigned __int128".} = object
template leastSignificantWord*(num: SomeInteger): auto =
num
func leastSignificantWord*(num: UintImpl | IntImpl): auto {.inline.} =
  when num.lo is UintImpl:
    num.lo.leastSignificantWord
  else:
    num.lo
func leastSignificantWord*(limbs: Limbs): auto {.inline.} =
  when cpuEndian == littleEndian:
    limbs[0]
  else:
    limbs[^1]
func mostSignificantWord*(num: UintImpl | IntImpl): auto {.inline.} =
  when num.hi is (UintImpl | IntImpl):
    num.hi.mostSignificantWord
  else:
    num.hi
func mostSignificantWord*(limbs: Limbs): auto {.inline.} =
  when cpuEndian == littleEndian:
    limbs[^1]
  else:
    limbs[0]


@@ -0,0 +1,169 @@
# Stint
# Copyright 2018 Status Research & Development GmbH
# Licensed under either of
#
# * Apache License, version 2.0, ([LICENSE-APACHE](LICENSE-APACHE) or http://www.apache.org/licenses/LICENSE-2.0)
# * MIT license ([LICENSE-MIT](LICENSE-MIT) or http://opensource.org/licenses/MIT)
#
# at your option. This file may not be copied, modified, or distributed except according to those terms.
import ../datatypes
# ############################################################
#
# Add-with-carry and Sub-with-borrow
#
# ############################################################
#
# This file implements add-with-carry and sub-with-borrow
#
# It is currently (Mar 2020) impossible to have the compiler
# generate optimal code in a generic way.
#
# On x86, addcarry_u64 intrinsic will generate optimal code
# except for GCC.
#
# On other CPU architectures inline assembly might be desirable.
# A compiler proof-of-concept is available in the "research" folder.
#
# See https://gcc.godbolt.org/z/2h768y
# ```C
# #include <stdint.h>
# #include <x86intrin.h>
#
# void add256(uint64_t a[4], uint64_t b[4]){
# uint8_t carry = 0;
# for (int i = 0; i < 4; ++i)
# carry = _addcarry_u64(carry, a[i], b[i], &a[i]);
# }
# ```
#
# GCC
# ```asm
# add256:
# movq (%rsi), %rax
# addq (%rdi), %rax
# setc %dl
# movq %rax, (%rdi)
# movq 8(%rdi), %rax
# addb $-1, %dl
# adcq 8(%rsi), %rax
# setc %dl
# movq %rax, 8(%rdi)
# movq 16(%rdi), %rax
# addb $-1, %dl
# adcq 16(%rsi), %rax
# setc %dl
# movq %rax, 16(%rdi)
# movq 24(%rsi), %rax
# addb $-1, %dl
# adcq %rax, 24(%rdi)
# ret
# ```
#
# Clang
# ```asm
# add256:
# movq (%rsi), %rax
# addq %rax, (%rdi)
# movq 8(%rsi), %rax
# adcq %rax, 8(%rdi)
# movq 16(%rsi), %rax
# adcq %rax, 16(%rdi)
# movq 24(%rsi), %rax
# adcq %rax, 24(%rdi)
# retq
# ```
# ############################################################
#
# Intrinsics
#
# ############################################################
# Note: GCC before 2017 had incorrect codegen in some cases:
# - https://gcc.gnu.org/bugzilla/show_bug.cgi?id=81300
when X86:
when defined(windows):
{.pragma: intrinsics, header:"<intrin.h>", nodecl.}
else:
{.pragma: intrinsics, header:"<x86intrin.h>", nodecl.}
func addcarry_u32(carryIn: Carry, a, b: uint32, sum: var uint32): Carry {.importc: "_addcarry_u32", intrinsics.}
func subborrow_u32(borrowIn: Borrow, a, b: uint32, diff: var uint32): Borrow {.importc: "_subborrow_u32", intrinsics.}
func addcarry_u64(carryIn: Carry, a, b: uint64, sum: var uint64): Carry {.importc: "_addcarry_u64", intrinsics.}
func subborrow_u64(borrowIn: Borrow, a, b:uint64, diff: var uint64): Borrow {.importc: "_subborrow_u64", intrinsics.}
# ############################################################
#
# Public
#
# ############################################################
func addC*(cOut: var Carry, sum: var uint32, a, b: uint32, cIn: Carry) {.inline.} =
## Addition with carry
## (CarryOut, Sum) <- a + b + CarryIn
when X86:
cOut = addcarry_u32(cIn, a, b, sum)
else:
let dblPrec = uint64(cIn) + uint64(a) + uint64(b)
sum = (uint32)(dblPrec)
cOut = Carry(dblPrec shr 32)
func subB*(bOut: var Borrow, diff: var uint32, a, b: uint32, bIn: Borrow) {.inline.} =
## Subtraction with borrow
## (BorrowOut, Diff) <- a - b - borrowIn
when X86:
bOut = subborrow_u32(bIn, a, b, diff)
else:
let dblPrec = uint64(a) - uint64(b) - uint64(bIn)
diff = (uint32)(dblPrec)
# On borrow the high word will be 0b1111...1111 and needs to be masked
bOut = Borrow((dblPrec shr 32) and 1)
func addC*(cOut: var Carry, sum: var uint64, a, b: uint64, cIn: Carry) {.inline.} =
## Addition with carry
## (CarryOut, Sum) <- a + b + CarryIn
when X86:
cOut = addcarry_u64(cIn, a, b, sum)
else:
block:
static:
doAssert GCC_Compatible
doAssert sizeof(int) == 8
var dblPrec {.noInit.}: uint128
{.emit:[dblPrec, " = (unsigned __int128)", a," + (unsigned __int128)", b, " + (unsigned __int128)",cIn,";"].}
# Don't forget to dereference the var param in C mode
when defined(cpp):
{.emit:[cOut, " = (NU64)(", dblPrec," >> ", 64'u64, ");"].}
{.emit:[sum, " = (NU64)", dblPrec,";"].}
else:
{.emit:["*",cOut, " = (NU64)(", dblPrec," >> ", 64'u64, ");"].}
{.emit:["*",sum, " = (NU64)", dblPrec,";"].}
func subB*(bOut: var Borrow, diff: var uint64, a, b: uint64, bIn: Borrow) {.inline.} =
## Subtraction with borrow
## (BorrowOut, Diff) <- a - b - borrowIn
when X86:
bOut = subborrow_u64(bIn, a, b, diff)
else:
block:
static:
doAssert GCC_Compatible
doAssert sizeof(int) == 8
var dblPrec {.noInit.}: uint128
{.emit:[dblPrec, " = (unsigned __int128)", a," - (unsigned __int128)", b, " - (unsigned __int128)",bIn,";"].}
# Don't forget to dereference the var param in C mode
# On borrow the high word will be 0b1111...1111 and needs to be masked
when defined(cpp):
{.emit:[bOut, " = (NU64)(", dblPrec," >> ", 64'u64, ") & 1;"].}
{.emit:[diff, " = (NU64)", dblPrec,";"].}
else:
{.emit:["*",bOut, " = (NU64)(", dblPrec," >> ", 64'u64, ") & 1;"].}
{.emit:["*",diff, " = (NU64)", dblPrec,";"].}


@@ -0,0 +1,130 @@
# Stint
# Copyright 2018 Status Research & Development GmbH
# Licensed under either of
#
# * Apache License, version 2.0, ([LICENSE-APACHE](LICENSE-APACHE) or http://www.apache.org/licenses/LICENSE-2.0)
# * MIT license ([LICENSE-MIT](LICENSE-MIT) or http://opensource.org/licenses/MIT)
#
# at your option. This file may not be copied, modified, or distributed except according to those terms.
# ############################################################
#
# Extended precision primitives
#
# ############################################################
import
../datatypes,
./addcarry_subborrow
# ############################################################
#
# 32-bit words
#
# ############################################################
func div2n1n*(q, r: var uint32, n_hi, n_lo, d: uint32) {.inline.}=
## Division uint64 by uint32
## Warning ⚠️ :
## - if n_hi == d, quotient does not fit in an uint32
## - if n_hi > d result is undefined
##
## To avoid issues, n_hi, n_lo and d should be normalized,
## i.e. shifted (== multiplied by the same power of 2)
## so that the most significant bit of d is set.
let dividend = (uint64(n_hi) shl 32) or uint64(n_lo)
let divisor = uint64(d)
q = uint32(dividend div divisor)
r = uint32(dividend mod divisor)
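A sketch of the normalization mentioned in the warning above (the helper name is hypothetical; it assumes d != 0 and n_hi < d so the quotient fits):

```nim
import std/bitops  # for countLeadingZeroBits

# Sketch: shift divisor and dividend left so the divisor's MSB is set,
# call div2n1n, then shift the remainder back.
func divmod64by32(q, r: var uint32, nHi, nLo, d: uint32) =
  let shift = countLeadingZeroBits(d)
  if shift == 0:
    div2n1n(q, r, nHi, nLo, d)
  else:
    let dNorm = d shl shift
    let nHiNorm = (nHi shl shift) or (nLo shr (32 - shift))
    let nLoNorm = nLo shl shift
    div2n1n(q, r, nHiNorm, nLoNorm, dNorm)
    r = r shr shift   # undo the normalization on the remainder
```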
func mul*(hi, lo: var uint32, a, b: uint32) {.inline.} =
## Extended precision multiplication
## (hi, lo) <- a*b
let dblPrec = uint64(a) * uint64(b)
lo = uint32(dblPrec)
hi = uint32(dblPrec shr 32)
func muladd1*(hi, lo: var uint32, a, b, c: uint32) {.inline.} =
## Extended precision multiplication + addition
## (hi, lo) <- a*b + c
##
## Note: 0xFFFFFFFF² -> (hi: 0xFFFFFFFE, lo: 0x00000001)
## so adding any c cannot overflow
let dblPrec = uint64(a) * uint64(b) + uint64(c)
lo = uint32(dblPrec)
hi = uint32(dblPrec shr 32)
func muladd2*(hi, lo: var uint32, a, b, c1, c2: uint32) {.inline.}=
## Extended precision multiplication + addition + addition
## This is constant-time on most hardware, except some specific ones like the Cortex-M0
## (hi, lo) <- a*b + c1 + c2
##
## Note: 0xFFFFFFFF² -> (hi: 0xFFFFFFFE, lo: 0x00000001)
## so adding 0xFFFFFFFF leads to (hi: 0xFFFFFFFF, lo: 0x00000000)
## and we have enough space to add again 0xFFFFFFFF without overflowing
let dblPrec = uint64(a) * uint64(b) + uint64(c1) + uint64(c2)
lo = uint32(dblPrec)
hi = uint32(dblPrec shr 32)
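For reference, the headroom claim above spelled out: (2^32 - 1)^2 = 2^64 - 2^33 + 1, and the distance to the 64-bit maximum is (2^64 - 1) - (2^64 - 2^33 + 1) = 2^33 - 2 = 2*(2^32 - 1), i.e. exactly enough room to absorb two more additions of at most 2^32 - 1 each, so muladd2 cannot overflow its (hi, lo) result.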
# ############################################################
#
# 64-bit words
#
# ############################################################
when sizeof(int) == 8:
when defined(vcc):
from ./extended_precision_x86_64_msvc import div2n1n, mul, muladd1, muladd2
elif GCC_Compatible:
when X86:
from ./extended_precision_x86_64_gcc import div2n1n
from ./extended_precision_64bit_uint128 import mul, muladd1, muladd2
else:
from ./extended_precision_64bit_uint128 import div2n1n, mul, muladd1, muladd2
export div2n1n, mul, muladd1, muladd2
# ############################################################
#
# Composite primitives
#
# ############################################################
func mulDoubleAdd2*[T: uint32|uint64](r2: var Carry, r1, r0: var T, a, b, c: T, dHi: Carry, dLo: T) {.inline.} =
## (r2, r1, r0) <- 2*a*b + c + (dHi, dLo)
## with r = (r2, r1, r0) a triple-word number
## and d = (dHi, dLo) a double-word number
## r2 and dHi are carries, either 0 or 1
var carry: Carry
# (r1, r0) <- a*b
# Note: 0xFFFFFFFF_FFFFFFFF² -> (hi: 0xFFFFFFFF_FFFFFFFE, lo: 0x00000000_00000001)
mul(r1, r0, a, b)
# (r2, r1, r0) <- 2*a*b
# Then (hi: 0xFFFFFFFF_FFFFFFFE, lo: 0x00000000_00000001) * 2
# (carry: 1, hi: 0xFFFFFFFF_FFFFFFFC, lo: 0x00000000_00000002)
addC(carry, r0, r0, r0, Carry(0))
addC(r2, r1, r1, r1, carry)
# (r1, r0) <- (r1, r0) + c
# Adding any uint64 cannot overflow into r2. For example, adding 2^64-1 gives
# (carry: 1, hi: 0xFFFFFFFF_FFFFFFFD, lo: 0x00000000_00000001)
addC(carry, r0, r0, c, Carry(0))
addC(carry, r1, r1, T(0), carry)
# (r1, r0) <- (r1, r0) + (dHi, dLo) with dHi a carry (previous limb r2)
# (dHi, dLo) is at most (dhi: 1, dlo: 0xFFFFFFFF_FFFFFFFF)
# summing into (carry: 1, hi: 0xFFFFFFFF_FFFFFFFD, lo: 0x00000000_00000001)
# result at most in (carry: 1, hi: 0xFFFFFFFF_FFFFFFFF, lo: 0x00000000_00000000)
addC(carry, r0, r0, dLo, Carry(0))
addC(carry, r1, r1, T(dHi), carry)
func mulAcc*[T: uint32|uint64](t, u, v: var T, a, b: T) {.inline.} =
## (t, u, v) <- (t, u, v) + a * b
var UV: array[2, T]
var carry: Carry
mul(UV[1], UV[0], a, b)
addC(carry, v, v, UV[0], Carry(0))
addC(carry, u, u, UV[1], carry)
t += T(carry)
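As a usage sketch (hypothetical, not part of this commit), mulAcc is the building block of product-scanning (Comba) multiplication, where (t, u, v) acts as a sliding triple-word accumulator:

```nim
# Sketch: product-scanning multiplication of two 4-limb numbers with mulAcc.
# r receives the full 8-limb product.
func mul4x4(r: var array[8, uint64], a, b: array[4, uint64]) =
  var t, u, v = 0'u64                  # triple-word accumulator (t:u:v)
  for k in 0 ..< 7:                    # columns 0..6 of the schoolbook product
    for i in max(0, k - 3) .. min(k, 3):
      mulAcc(t, u, v, a[i], b[k - i])  # (t,u,v) += a[i] * b[k-i]
    r[k] = v                           # emit the finished column
    v = u; u = t; t = 0                # slide the accumulator one word right
  r[7] = v                             # remaining top word
```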


@@ -0,0 +1,95 @@
# Stint
# Copyright 2018 Status Research & Development GmbH
# Licensed under either of
#
# * Apache License, version 2.0, ([LICENSE-APACHE](LICENSE-APACHE) or http://www.apache.org/licenses/LICENSE-2.0)
# * MIT license ([LICENSE-MIT](LICENSE-MIT) or http://opensource.org/licenses/MIT)
#
# at your option. This file may not be copied, modified, or distributed except according to those terms.
import ../datatypes
# ############################################################
#
# Extended precision primitives on GCC & Clang (all CPU archs)
#
# ############################################################
static:
doAssert GCC_Compatible
doAssert sizeof(int) == 8
func div2n1n*(q, r: var uint64, n_hi, n_lo, d: uint64) {.inline.}=
## Division uint128 by uint64
## Warning ⚠️ :
## - if n_hi == d, quotient does not fit in an uint64 and will throw SIGFPE on some platforms
## - if n_hi > d result is undefined
var dblPrec {.noInit.}: uint128
{.emit:[dblPrec, " = (unsigned __int128)", n_hi," << 64 | (unsigned __int128)",n_lo,";"].}
# Don't forget to dereference the var param in C mode
when defined(cpp):
{.emit:[q, " = (NU64)(", dblPrec," / ", d, ");"].}
{.emit:[r, " = (NU64)(", dblPrec," % ", d, ");"].}
else:
{.emit:["*",q, " = (NU64)(", dblPrec," / ", d, ");"].}
{.emit:["*",r, " = (NU64)(", dblPrec," % ", d, ");"].}
func mul*(hi, lo: var uint64, a, b: uint64) {.inline.} =
## Extended precision multiplication
## (hi, lo) <- a*b
block:
var dblPrec {.noInit.}: uint128
{.emit:[dblPrec, " = (unsigned __int128)", a," * (unsigned __int128)", b,";"].}
# Don't forget to dereference the var param in C mode
when defined(cpp):
{.emit:[hi, " = (NU64)(", dblPrec," >> ", 64'u64, ");"].}
{.emit:[lo, " = (NU64)", dblPrec,";"].}
else:
{.emit:["*",hi, " = (NU64)(", dblPrec," >> ", 64'u64, ");"].}
{.emit:["*",lo, " = (NU64)", dblPrec,";"].}
func muladd1*(hi, lo: var uint64, a, b, c: uint64) {.inline.} =
## Extended precision multiplication + addition
## (hi, lo) <- a*b + c
##
## Note: 0xFFFFFFFF_FFFFFFFF² -> (hi: 0xFFFFFFFFFFFFFFFE, lo: 0x0000000000000001)
## so adding any c cannot overflow
##
## This is constant-time on most hardware
## See: https://www.bearssl.org/ctmul.html
block:
var dblPrec {.noInit.}: uint128
{.emit:[dblPrec, " = (unsigned __int128)", a," * (unsigned __int128)", b, " + (unsigned __int128)",c,";"].}
# Don't forget to dereference the var param in C mode
when defined(cpp):
{.emit:[hi, " = (NU64)(", dblPrec," >> ", 64'u64, ");"].}
{.emit:[lo, " = (NU64)", dblPrec,";"].}
else:
{.emit:["*",hi, " = (NU64)(", dblPrec," >> ", 64'u64, ");"].}
{.emit:["*",lo, " = (NU64)", dblPrec,";"].}
func muladd2*(hi, lo: var uint64, a, b, c1, c2: uint64) {.inline.}=
## Extended precision multiplication + addition + addition
## This is constant-time on most hardware, except some specific ones like the Cortex-M0
## (hi, lo) <- a*b + c1 + c2
##
## Note: 0xFFFFFFFF_FFFFFFFF² -> (hi: 0xFFFFFFFFFFFFFFFE, lo: 0x0000000000000001)
## so adding 0xFFFFFFFFFFFFFFFF leads to (hi: 0xFFFFFFFFFFFFFFFF, lo: 0x0000000000000000)
## and we have enough space to add again 0xFFFFFFFFFFFFFFFF without overflowing
block:
var dblPrec {.noInit.}: uint128
{.emit:[
dblPrec, " = (unsigned __int128)", a," * (unsigned __int128)", b,
" + (unsigned __int128)",c1," + (unsigned __int128)",c2,";"
].}
# Don't forget to dereference the var param in C mode
when defined(cpp):
{.emit:[hi, " = (NU64)(", dblPrec," >> ", 64'u64, ");"].}
{.emit:[lo, " = (NU64)", dblPrec,";"].}
else:
{.emit:["*",hi, " = (NU64)(", dblPrec," >> ", 64'u64, ");"].}
{.emit:["*",lo, " = (NU64)", dblPrec,";"].}


@@ -0,0 +1,57 @@
# Stint
# Copyright 2018 Status Research & Development GmbH
# Licensed under either of
#
# * Apache License, version 2.0, ([LICENSE-APACHE](LICENSE-APACHE) or http://www.apache.org/licenses/LICENSE-2.0)
# * MIT license ([LICENSE-MIT](LICENSE-MIT) or http://opensource.org/licenses/MIT)
#
# at your option. This file may not be copied, modified, or distributed except according to those terms.
import ../datatypes
# ############################################################
#
# Extended precision primitives for X86-64 on GCC & Clang
#
# ############################################################
static:
doAssert(defined(gcc) or defined(clang) or defined(llvm_gcc))
doAssert sizeof(int) == 8
doAssert X86
func div2n1n*(q, r: var uint64, n_hi, n_lo, d: uint64) {.inline.}=
## Division uint128 by uint64
## Warning ⚠️ :
## - if n_hi == d, quotient does not fit in an uint64 and will throw SIGFPE
## - if n_hi > d result is undefined
# DIV r/m64
# Divide RDX:RAX (n_hi:n_lo) by r/m64
#
# Inputs
# - numerator high word in RDX,
# - numerator low word in RAX,
# - divisor as r/m parameter (register or memory at the compiler's discretion)
# Result
# - Quotient in RAX
# - Remainder in RDX
# 1. name the register/memory "divisor"
# 2. don't forget to dereference the var hidden pointer
# 3. -
# 4. no clobbered registers besides the explicitly used RAX and RDX
when defined(cpp):
asm """
divq %[divisor]
: "=a" (`q`), "=d" (`r`)
: "d" (`n_hi`), "a" (`n_lo`), [divisor] "rm" (`d`)
:
"""
else:
asm """
divq %[divisor]
: "=a" (`*q`), "=d" (`*r`)
: "d" (`n_hi`), "a" (`n_lo`), [divisor] "rm" (`d`)
:
"""


@@ -0,0 +1,87 @@
# Stint
# Copyright 2018 Status Research & Development GmbH
# Licensed under either of
#
# * Apache License, version 2.0, ([LICENSE-APACHE](LICENSE-APACHE) or http://www.apache.org/licenses/LICENSE-2.0)
# * MIT license ([LICENSE-MIT](LICENSE-MIT) or http://opensource.org/licenses/MIT)
#
# at your option. This file may not be copied, modified, or distributed except according to those terms.
import
../datatypes,
./addcarry_subborrow
# ############################################################
#
# Extended precision primitives for X86-64 on MSVC
#
# ############################################################
static:
doAssert defined(vcc)
doAssert sizeof(int) == 8
doAssert X86
func udiv128(highDividend, lowDividend, divisor: Ct[uint64], remainder: var Ct[uint64]): Ct[uint64] {.importc:"_udiv128", header: "<intrin.h>", nodecl.}
## Division of a 128-bit number by a 64-bit divisor; Microsoft only, 64-bit only.
## Returns the quotient as the return value and the remainder via the var parameter.
## Warning ⚠️ :
## - if n_hi == d, quotient does not fit in an uint64 and will throw SIGFPE
## - if n_hi > d result is undefined
func umul128(a, b: Ct[uint64], hi: var Ct[uint64]): Ct[uint64] {.importc:"_umul128", header:"<intrin.h>", nodecl.}
## (hi, lo) <-- a * b
## Return value is the low word
func div2n1n*(q, r: var Ct[uint64], n_hi, n_lo, d: Ct[uint64]) {.inline.}=
## Division uint128 by uint64
## Warning ⚠️ :
## - if n_hi == d, quotient does not fit in an uint64 and will throw SIGFPE
## - if n_hi > d result is undefined
{.warning: "unsafeDiv2n1n is not constant-time at the moment on most hardware".}
# TODO !!! - Replace by constant-time, portable, non-assembly version
# -> use uint128? Compiler might add unwanted branches
q = udiv128(n_hi, n_lo, d, r)
func mul*(hi, lo: var Ct[uint64], a, b: Ct[uint64]) {.inline.} =
## Extended precision multiplication
## (hi, lo) <- a*b
##
## This is constant-time on most hardware
## See: https://www.bearssl.org/ctmul.html
lo = umul128(a, b, hi)
func muladd1*(hi, lo: var Ct[uint64], a, b, c: Ct[uint64]) {.inline.} =
## Extended precision multiplication + addition
## (hi, lo) <- a*b + c
##
## Note: 0xFFFFFFFF_FFFFFFFF² -> (hi: 0xFFFFFFFFFFFFFFFE, lo: 0x0000000000000001)
## so adding any c cannot overflow
##
## This is constant-time on most hardware
## See: https://www.bearssl.org/ctmul.html
var carry: Carry
lo = umul128(a, b, hi)
addC(carry, lo, lo, c, Carry(0))
addC(carry, hi, hi, 0, carry)
func muladd2*(hi, lo: var Ct[uint64], a, b, c1, c2: Ct[uint64]) {.inline.}=
## Extended precision multiplication + addition + addition
## This is constant-time on most hardware, except some specific ones like the Cortex-M0
## (hi, lo) <- a*b + c1 + c2
##
## Note: 0xFFFFFFFF_FFFFFFFF² -> (hi: 0xFFFFFFFFFFFFFFFE, lo: 0x0000000000000001)
## so adding 0xFFFFFFFFFFFFFFFF leads to (hi: 0xFFFFFFFFFFFFFFFF, lo: 0x0000000000000000)
## and we have enough space to add again 0xFFFFFFFFFFFFFFFF without overflowing
# For speed this could be implemented with parallel pipelined carry chains
# via MULX + ADCX + ADOX
var carry1, carry2: Carry
lo = umul128(a, b, hi)
# Carry chain 1
addC(carry1, lo, lo, c1, Carry(0))
addC(carry1, hi, hi, 0, carry1)
# Carry chain 2
addC(carry2, lo, lo, c2, Carry(0))
addC(carry2, hi, hi, 0, carry2)