uint division - compile and pass the single limb tests

parent c2ed8a4bc2
commit 53d2fd14f3
@@ -180,9 +180,9 @@ macro staticFor*(idx: untyped{nkIdent}, start, stopEx: static int, body: untyped
 # Copy
 # --------------------------------------------------------

-func copyFrom*[dLen, sLen](
-       dst: var SomeBigInteger[dLen],
-       src: SomeBigInteger[sLen]
+func copyFrom*(
+       dst: var SomeBigInteger,
+       src: SomeBigInteger
      ){.inline.} =
   ## Copy a BigInteger, truncated to 2^slen if the source
   ## is larger than the destination
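The truncating-copy semantics documented above (the destination keeps the source modulo its own width) can be pictured as a word-by-word loop. A minimal standalone sketch with illustrative types, not stint's actual SomeBigInteger:

    # Sketch: copy the limbs that fit, zero-extend the rest
    # (little-endian limb order, least significant word first).
    type Limbs[N: static int] = array[N, uint64]

    func copyFromSketch[dLen, sLen: static int](dst: var Limbs[dLen], src: Limbs[sLen]) =
      for i in 0 ..< dLen:
        if i < sLen:
          dst[i] = src[i]   # copy what fits
        else:
          dst[i] = 0        # zero-extend a smaller source

    var d: Limbs[2]
    let s: Limbs[4] = [1'u64, 2, 3, 4]
    d.copyFromSketch(s)     # truncation modulo 2^128
    doAssert d == [1'u64, 2]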
@@ -80,7 +80,7 @@ func mul_nim*(hi, lo: var uint64, u, v: uint64) =
   hi = x3 + hi(x1)
   lo = merge(x1, lo(x0))

-func muladd1*(hi, lo: var uint64, a, b, c: uint64) {.inline.} =
+func muladd1_nim*(hi, lo: var uint64, a, b, c: uint64) {.inline.} =
   ## Extended precision multiplication + addition
   ## (hi, lo) <- a*b + c
   ##
@@ -91,7 +91,7 @@ func muladd1*(hi, lo: var uint64, a, b, c: uint64) {.inline.} =
   addC_nim(carry, lo, lo, c, 0)
   addC_nim(carry, hi, hi, 0, carry)

-func muladd2*(hi, lo: var uint64, a, b, c1, c2: uint64) {.inline.}=
+func muladd2_nim*(hi, lo: var uint64, a, b, c1, c2: uint64) {.inline.}=
   ## Extended precision multiplication + addition + addition
   ## (hi, lo) <- a*b + c1 + c2
   ##
@@ -107,3 +107,48 @@ func muladd2*(hi, lo: var uint64, a, b, c1, c2: uint64) {.inline.}=
   # Carry chain 2
   addC_nim(carry2, lo, lo, c2, 0)
   addC_nim(carry2, hi, hi, 0, carry2)
+
+func div2n1n_nim*[T: SomeUnsignedInt](q, r: var T, n_hi, n_lo, d: T) =
+  ## Division uint128 by uint64
+  ## Warning ⚠️ :
+  ##   - if n_hi == d, the quotient does not fit in a uint64 and will throw SIGFPE
+  ##   - if n_hi > d, the result is undefined
+
+  # doAssert leadingZeros(d) == 0, "Divisor was not normalized"
+
+  const
+    size = sizeof(q) * 8
+    halfSize = size div 2
+    halfMask = (1.T shl halfSize) - 1.T
+
+  template halfQR(n_hi, n_lo, d, d_hi, d_lo: T): tuple[q, r: T] =
+
+    var (q, r) = (n_hi div d_hi, n_hi mod d_hi)
+    let m = q * d_lo
+    r = (r shl halfSize) or n_lo
+
+    # Fix the remainder, we're at most 2 iterations off
+    if r < m:
+      dec q
+      r += d
+      if r >= d and r < m:
+        dec q
+        r += d
+    r -= m
+    (q, r)
+
+  let
+    d_hi = d shr halfSize
+    d_lo = d and halfMask
+    n_lohi = n_lo shr halfSize
+    n_lolo = n_lo and halfMask
+
+  # First half of the quotient
+  let (q1, r1) = halfQR(n_hi, n_lohi, d, d_hi, d_lo)
+
+  # Second half
+  let (q2, r2) = halfQR(r1, n_lolo, d, d_hi, d_lo)
+
+  q = (q1 shl halfSize) or q2
+  r = r2
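A quick sanity check of div2n1n_nim against a case small enough to verify by hand: with d = 2^63 (normalized, MSB set) and n = 2^64 + 5, the quotient is 2 and the remainder 5. Hypothetical usage, assuming the function is imported from this module:

    # n = n_hi * 2^64 + n_lo = 2^64 + 5, d = 2^63 -> q = 2, r = 5
    var q, r: uint64
    div2n1n_nim(q, r, 1'u64, 5'u64, 0x8000_0000_0000_0000'u64)
    doAssert q == 2 and r == 5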
@@ -73,19 +73,57 @@ func muladd2*(hi, lo: var uint32, a, b, c1, c2: uint32) {.inline.}=
 # ############################################################

 when sizeof(int) == 8 and not defined(Stint32):
-  when nimvm:
-    from ./compiletime_fallback import mul_nim, muladd1, muladd2
-  else:
-    when defined(vcc):
-      from ./extended_precision_x86_64_msvc import div2n1n, mul, muladd1, muladd2
-    elif GCCCompatible:
-      when X86:
-        from ./extended_precision_x86_64_gcc import div2n1n
-        from ./extended_precision_64bit_uint128 import mul, muladd1, muladd2
-      else:
-        from ./extended_precision_64bit_uint128 import div2n1n, mul, muladd1, muladd2
-  export div2n1n, mul
-  export muladd1, muladd2
+  from ./compiletime_fallback import div2n1n_nim, mul_nim, muladd1_nim, muladd2_nim
+
+  when defined(vcc):
+    from ./extended_precision_x86_64_msvc import div2n1n_128, mul_128, muladd1_128, muladd2_128
+  elif GCCCompatible:
+    when X86:
+      from ./extended_precision_x86_64_gcc import div2n1n_128
+      from ./extended_precision_64bit_uint128 import mul_128, muladd1_128, muladd2_128
+    else:
+      from ./extended_precision_64bit_uint128 import div2n1n_128, mul_128, muladd1_128, muladd2_128
+
+  func mul*(hi, lo: var uint64, u, v: uint64) {.inline.}=
+    ## Extended precision multiplication
+    ## (hi, lo) <- u * v
+    when nimvm:
+      mul_nim(hi, lo, u, v)
+    else:
+      mul_128(hi, lo, u, v)
+
+  func muladd1*(hi, lo: var uint64, a, b, c: uint64) {.inline.}=
+    ## Extended precision multiplication + addition
+    ## (hi, lo) <- a*b + c
+    ##
+    ## Note: 0xFFFFFFFF_FFFFFFFF² -> (hi: 0xFFFFFFFFFFFFFFFE, lo: 0x0000000000000001)
+    ##       so adding any c cannot overflow
+    when nimvm:
+      muladd1_nim(hi, lo, a, b, c)
+    else:
+      muladd1_128(hi, lo, a, b, c)
+
+  func muladd2*(hi, lo: var uint64, a, b, c1, c2: uint64) {.inline.}=
+    ## Extended precision multiplication + addition + addition
+    ## (hi, lo) <- a*b + c1 + c2
+    ##
+    ## Note: 0xFFFFFFFF_FFFFFFFF² -> (hi: 0xFFFFFFFFFFFFFFFE, lo: 0x0000000000000001)
+    ##       so adding 0xFFFFFFFFFFFFFFFF leads to (hi: 0xFFFFFFFFFFFFFFFF, lo: 0x0000000000000000)
+    ##       and we have enough space to add again 0xFFFFFFFFFFFFFFFF without overflowing
+    when nimvm:
+      muladd2_nim(hi, lo, a, b, c1, c2)
+    else:
+      muladd2_128(hi, lo, a, b, c1, c2)
+
+  func div2n1n*(q, r: var uint64, n_hi, n_lo, d: uint64) {.inline.}=
+    ## Division uint128 by uint64
+    ## Warning ⚠️ :
+    ##   - if n_hi == d, the quotient does not fit in a uint64 and will throw SIGFPE
+    ##   - if n_hi > d, the result is undefined
+    when nimvm:
+      div2n1n_nim(q, r, n_hi, n_lo, d)
+    else:
+      div2n1n_128(q, r, n_hi, n_lo, d)

 # ############################################################
 #
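The `when nimvm` wrappers above are the point of the `_nim`/`_128` split: the Nim VM cannot run {.emit.} C code or compiler intrinsics, so each public entry point picks the portable fallback during compile-time evaluation and the uint128/intrinsic version at runtime. A minimal standalone illustration of the dispatch pattern (hypothetical function, not from the library):

    # `when nimvm` selects the first branch during compile-time evaluation
    # (Nim VM) and the second branch in compiled code.
    func double(x: uint64): uint64 {.inline.} =
      when nimvm:
        x + x            # portable path, usable in `static:` contexts
      else:
        x shl 1          # stands in for an intrinsic-backed runtime path

    const ct = double(21)        # forced through the VM branch
    doAssert ct == 42
    doAssert double(21) == 42    # runtime branch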
@@ -128,10 +166,7 @@ func mulAcc*[T: uint32|uint64](t, u, v: var T, a, b: T) {.inline.} =
   ## (t, u, v) <- (t, u, v) + a * b
   var UV: array[2, T]
   var carry: Carry
-  when nimvm:
-    mul_nim(UV[1], UV[0], a, b)
-  else:
-    mul(UV[1], UV[0], a, b)
+  mul(UV[1], UV[0], a, b)
   addC(carry, v, v, UV[0], Carry(0))
   addC(carry, u, u, UV[1], carry)
   t += T(carry)
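mulAcc maintains a 3-word accumulator (t, u, v), the building block of product-scanning (Comba) multiplication: since a*b <= (2^64 - 1)^2, the double-word product plus incoming carries always fits in three words. A portable standalone sketch of the same accumulation, with the 64x64 -> 128 product computed via 32-bit halves:

    # Sketch: (t, u, v) <- (t, u, v) + a*b on 64-bit words, no intrinsics.
    func mulAccSketch(t, u, v: var uint64, a, b: uint64) =
      let
        aLo = a and 0xFFFF_FFFF'u64
        aHi = a shr 32
        bLo = b and 0xFFFF_FFFF'u64
        bHi = b shr 32
        ll = aLo * bLo
        lh = aLo * bHi
        hl = aHi * bLo
      # mid gathers both cross products plus the carry out of ll's high half
      let mid = (ll shr 32) + (lh and 0xFFFF_FFFF'u64) + (hl and 0xFFFF_FFFF'u64)
      let lo = (mid shl 32) or (ll and 0xFFFF_FFFF'u64)
      let hi = aHi * bHi + (lh shr 32) + (hl shr 32) + (mid shr 32)
      let v0 = v
      v += lo
      let s = hi + uint64(v < v0)   # hi <= 2^64 - 2 for a product: cannot wrap
      let u0 = u
      u += s
      t += uint64(u < u0)           # final carry lands in the top word

    var t, u, v: uint64
    mulAccSketch(t, u, v, 0xFFFF_FFFF_FFFF_FFFF'u64, 0xFFFF_FFFF_FFFF_FFFF'u64)
    doAssert v == 1'u64 and u == 0xFFFF_FFFF_FFFF_FFFE'u64 and t == 0'u64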
@@ -19,7 +19,7 @@ static:
   doAssert GCC_Compatible
   doAssert sizeof(int) == 8

-func div2n1n*(q, r: var uint64, n_hi, n_lo, d: uint64) {.inline.}=
+func div2n1n_128*(q, r: var uint64, n_hi, n_lo, d: uint64) {.inline.}=
   ## Division uint128 by uint64
   ## Warning ⚠️ :
   ##   - if n_hi == d, the quotient does not fit in a uint64 and will throw SIGFPE on some platforms
@@ -35,7 +35,7 @@ func div2n1n*(q, r: var uint64, n_hi, n_lo, d: uint64) {.inline.}=
   {.emit:["*",q, " = (NU64)(", dblPrec," / ", d, ");"].}
   {.emit:["*",r, " = (NU64)(", dblPrec," % ", d, ");"].}

-func mul*(hi, lo: var uint64, a, b: uint64) {.inline.} =
+func mul_128*(hi, lo: var uint64, a, b: uint64) {.inline.} =
   ## Extended precision multiplication
   ## (hi, lo) <- a*b
   block:
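These `_128` helpers lean on the GCC/Clang `unsigned __int128` type injected through {.emit.}; the C compiler then lowers the 128-bit multiply and divide to the right instruction sequences. A reduced self-contained sketch of the same pattern (GCC/Clang C backend only; NU64 is Nim's C alias for uint64):

    type uint128 {.importc: "unsigned __int128".} = object

    func mulWide(hi, lo: var uint64, a, b: uint64) {.inline.} =
      ## (hi, lo) <- a * b via the C compiler's 128-bit integers
      var dblPrec {.noInit.}: uint128
      {.emit: [dblPrec, " = (unsigned __int128)", a, " * (unsigned __int128)", b, ";"].}
      {.emit: ["*", hi, " = (NU64)(", dblPrec, " >> 64);"].}
      {.emit: ["*", lo, " = (NU64)", dblPrec, ";"].}

    var hi, lo: uint64
    mulWide(hi, lo, 0xFFFF_FFFF_FFFF_FFFF'u64, 2'u64)   # (2^64 - 1) * 2
    doAssert hi == 1'u64 and lo == 0xFFFF_FFFF_FFFF_FFFE'u64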
@@ -50,7 +50,7 @@ func mul*(hi, lo: var uint64, a, b: uint64) {.inline.} =
   {.emit:["*",hi, " = (NU64)(", dblPrec," >> ", 64'u64, ");"].}
   {.emit:["*",lo, " = (NU64)", dblPrec,";"].}

-func muladd1*(hi, lo: var uint64, a, b, c: uint64) {.inline.} =
+func muladd1_128*(hi, lo: var uint64, a, b, c: uint64) {.inline.} =
   ## Extended precision multiplication + addition
   ## (hi, lo) <- a*b + c
   ##
@@ -71,7 +71,7 @@ func muladd1*(hi, lo: var uint64, a, b, c: uint64) {.inline.} =
   {.emit:["*",hi, " = (NU64)(", dblPrec," >> ", 64'u64, ");"].}
   {.emit:["*",lo, " = (NU64)", dblPrec,";"].}

-func muladd2*(hi, lo: var uint64, a, b, c1, c2: uint64) {.inline.}=
+func muladd2_128*(hi, lo: var uint64, a, b, c1, c2: uint64) {.inline.}=
   ## Extended precision multiplication + addition + addition
   ## This is constant-time on most hardware except some specific cores like the Cortex-M0
   ## (hi, lo) <- a*b + c1 + c2
@@ -20,7 +20,7 @@ static:
   doAssert sizeof(int) == 8
   doAssert X86

-func div2n1n*(q, r: var uint64, n_hi, n_lo, d: uint64) {.inline.}=
+func div2n1n_128*(q, r: var uint64, n_hi, n_lo, d: uint64) {.inline.}=
   ## Division uint128 by uint64
   ## Warning ⚠️ :
   ##   - if n_hi == d, the quotient does not fit in a uint64 and will throw SIGFPE
@@ -38,35 +38,25 @@ func div2n1n*(q, r: var Ct[uint64], n_hi, n_lo, d: Ct[uint64]) {.inline.}=
   ## Warning ⚠️ :
   ##   - if n_hi == d, the quotient does not fit in a uint64 and will throw SIGFPE
   ##   - if n_hi > d, the result is undefined
   {.warning: "unsafeDiv2n1n is not constant-time at the moment on most hardware".}

   # TODO !!! - Replace by constant-time, portable, non-assembly version
   #            -> use uint128? Compiler might add unwanted branches
   q = udiv128(n_hi, n_lo, d, r)

-func mul*(hi, lo: var Ct[uint64], a, b: Ct[uint64]) {.inline.} =
+func mul_128*(hi, lo: var Ct[uint64], a, b: Ct[uint64]) {.inline.} =
   ## Extended precision multiplication
   ## (hi, lo) <- a*b
   ##
   ## This is constant-time on most hardware
   ## See: https://www.bearssl.org/ctmul.html
   lo = umul128(a, b, hi)

-func muladd1*(hi, lo: var Ct[uint64], a, b, c: Ct[uint64]) {.inline.} =
+func muladd1_128*(hi, lo: var Ct[uint64], a, b, c: Ct[uint64]) {.inline.} =
   ## Extended precision multiplication + addition
   ## (hi, lo) <- a*b + c
   ##
   ## Note: 0xFFFFFFFF_FFFFFFFF² -> (hi: 0xFFFFFFFFFFFFFFFE, lo: 0x0000000000000001)
   ##       so adding any c cannot overflow
   ##
   ## This is constant-time on most hardware
   ## See: https://www.bearssl.org/ctmul.html
   var carry: Carry
   lo = umul128(a, b, hi)
   addC(carry, lo, lo, c, Carry(0))
   addC(carry, hi, hi, 0, carry)

-func muladd2*(hi, lo: var Ct[uint64], a, b, c1, c2: Ct[uint64]) {.inline.}=
+func muladd2_128*(hi, lo: var Ct[uint64], a, b, c1, c2: Ct[uint64]) {.inline.}=
   ## Extended precision multiplication + addition + addition
   ## This is constant-time on most hardware except some specific cores like the Cortex-M0
   ## (hi, lo) <- a*b + c1 + c2
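The overflow headroom claimed in the muladd doc comments is easy to verify with wrapping uint64 arithmetic: (2^64 - 1)^2 = 2^128 - 2^65 + 1, i.e. hi = 2^64 - 2 and lo = 1, leaving room to add two more full-width words. A compile-time sketch of that bound:

    static:
      let maxU = 0xFFFF_FFFF_FFFF_FFFF'u64
      # low word of maxU * maxU (multiplication wraps modulo 2^64)
      doAssert maxU * maxU == 1'u64
      # adding one maxU to lo wraps to 0 and carries, bumping hi to 2^64 - 1;
      # hi then still has room for the carry of a second maxU
      doAssert 1'u64 + maxU == 0'u64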
@@ -63,11 +63,11 @@ func shortDiv*(a: var Limbs, k: Word): Word =
   #     d = d shr 1
   #     dec(shift)

-func knuthDivLE[qLen, rLen, uLen, vLen: static int](
-       q: var Limbs[qLen],
-       r: var Limbs[rLen],
-       u: Limbs[uLen],
-       v: Limbs[vLen],
+func knuthDivLE(
+       q: var StUint,
+       r: var StUint,
+       u: StUint,
+       v: StUint,
        needRemainder: bool) =
   ## Compute the quotient and remainder (if needed)
   ## of the division of u by v

@@ -80,6 +80,15 @@ func knuthDivLE[qLen, rLen, uLen, vLen: static int](
   #
   # Resources at the bottom of the file

+  const
+    qLen = q.limbs.len
+    rLen = r.limbs.len
+    uLen = u.limbs.len
+    vLen = v.limbs.len
+
+  template `[]`(a: Stuint, i: int): Word = a.limbs[i]
+  template `[]=`(a: Stuint, i: int, val: Word) = a.limbs[i] = val
+
   # Find the most significant word with actual set bits
   # and get the leading zero count there
   var divisorLen = vLen
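The two templates above let a StUint be indexed directly into its limbs, so the Knuth loops below read like array pseudocode. A stand-in illustration of the pattern with a hypothetical wrapper type:

    type
      Word = uint64
      Big[N: static int] = object
        limbs: array[N, Word]

    template `[]`(a: Big, i: int): Word = a.limbs[i]
    template `[]=`(a: var Big, i: int, val: Word) = a.limbs[i] = val

    var x: Big[4]
    x[0] = 42            # expands to x.limbs[0] = 42
    doAssert x[0] == 42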
@@ -96,7 +105,7 @@ func knuthDivLE[qLen, rLen, uLen, vLen: static int](
   # Divisor is a single word.
   if divisorLen == 1:
     q.copyFrom(u)
-    r.leastSignificantWord() = q.shortDiv(v.leastSignificantWord())
+    r.leastSignificantWord() = q.limbs.shortDiv(v.leastSignificantWord())
     # zero all but the least significant word
     var lsw = true
     for w in leastToMostSig(r):
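This is the single-limb fast path the commit title refers to: u is copied into q, shortDiv divides the limb array in place by the lone divisor word, and the returned word is the remainder. A standalone sketch of in-place short division, using uint32 limbs so each per-limb step fits in a native uint64:

    # a <- a div k (in place), returns a mod k; limbs are little-endian.
    func shortDivSketch(a: var openArray[uint32], k: uint32): uint32 =
      var remainder = 0'u64
      for i in countdown(a.len - 1, 0):        # most significant limb first
        let cur = (remainder shl 32) or a[i].uint64
        a[i] = uint32(cur div k.uint64)
        remainder = cur mod k.uint64
      uint32(remainder)

    var n = [0x89AB_CDEF'u32, 0x0123_4567'u32]   # n = 0x01234567_89ABCDEF
    let r = n.shortDivSketch(10)
    doAssert r == 5
    doAssert ((n[1].uint64 shl 32) or n[0].uint64) == 8198552921648689'u64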
@@ -111,8 +120,8 @@ func knuthDivLE[qLen, rLen, uLen, vLen: static int](

   # Normalize so that the divisor MSB is set,
   # vn cannot overflow, un can overflow by 1 word at most, hence uLen+1
-  un.shlSmallOverflowing(u, clz)
-  vn.shlSmall(v, clz)
+  un.shlSmallOverflowing(u.limbs, clz)
+  vn.shlSmall(v.limbs, clz)

   static: doAssert cpuEndian == littleEndian, "Currently the division algorithm requires little endian ordering of the limbs"
   # TODO: is it worth it to have the uint be the exact same extended precision representation
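Normalization is Knuth's step D1: shift both operands left until the divisor's most significant bit is set, the precondition that keeps each estimated quotient digit within 2 of the true one. A single-word illustration of why the shift is harmless (quotient unchanged, remainder scaled by 2^clz):

    import std/bitops   # countLeadingZeroBits

    let d = 0x1234_5678'u64
    let clz = countLeadingZeroBits(d)
    let dn = d shl clz                # normalized divisor, MSB set
    doAssert (dn shr 63) == 1

    # Valid while n shl clz does not overflow (the multi-limb code gives
    # the shifted numerator an extra limb for exactly this reason).
    let n = 500_000_000'u64
    doAssert n div d == (n shl clz) div dn
    doAssert n mod d == ((n shl clz) mod dn) shr clz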
@@ -161,24 +170,42 @@ func knuthDivLE[qLen, rLen, uLen, vLen: static int](
       q[j] -= 1
       var carry = Carry(0)
       for i in 0 ..< divisorLen:
-        addC(carry, u[j+i], u[j+i], v[i], carry)
+        addC(carry, un[j+i], un[j+i], v[i], carry)

   # Quotient is found, if remainder is needed we need to un-normalize un
   if needRemainder:
-    r.shrSmall(un, clz)
+    # r.limbs.shrSmall(un, clz) - TODO
+    when cpuEndian == littleEndian:
+      # rLen+1 == un.len
+      for i in 0 ..< rLen:
+        r[i] = (un[i] shr clz) or (un[i+1] shl (WordBitWidth - clz))
+    else:
+      {.error: "Not Implemented for bigEndian".}


 const BinaryShiftThreshold = 8  # If the difference in bit-length is below 8
                                 # binary shift is probably faster

 func divmod(q, r: var Stuint,
-            x: Limbs[xLen], y: Limbs[yLen], needRemainder: bool) =
+            x, y: Stuint, needRemainder: bool) =

   let x_clz = x.leadingZeros()
   let y_clz = y.leadingZeros()

   # We short-circuit division depending on special-cases.
-  if unlikely(y.isZero):
-    raise newException(DivByZeroDefect, "You attempted to divide by zero")
-  elif y_clz == (bitsof(y) - 1):
+  if unlikely(y.isZero()):
+    raise newException(DivByZeroError, "You attempted to divide by zero")
+  elif y_clz == (y.bits - 1):
     # y is one
     q = x
   # elif (x.hi or y.hi).isZero:
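The un-normalization loop above is the inverse of step D1: each remainder limb recombines two adjacent limbs of un, shifted right by clz. A standalone sketch of the two-limb recombination (valid for 0 < clz < word size; clz == 0 would shift by the full width and needs a plain copy instead):

    const WordBitWidth = 64
    # r gets un shifted right by clz; un has one more limb than r.
    func shrLimbs(r: var openArray[uint64], un: openArray[uint64], clz: int) =
      for i in 0 ..< r.len:
        r[i] = (un[i] shr clz) or (un[i+1] shl (WordBitWidth - clz))

    # 2^64 + 5 was normalized by shl 8 -> limbs [0x500, 0x100, 0]
    var un = [0x500'u64, 0x100'u64, 0'u64]
    var r = [0'u64, 0'u64]
    r.shrLimbs(un, 8)
    doAssert r == [5'u64, 1'u64]   # recovers 2^64 + 5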
@@ -209,7 +236,7 @@ func `div`*(x, y: Stuint): Stuint {.inline.} =
 func `mod`*(x, y: Stuint): Stuint {.inline.} =
   ## Remainder operation for multi-precision unsigned uint
   var tmp{.noInit.}: Stuint
-  divmod(tmp, result, x,y, needRemainder = true)
+  divmod(tmp, result, x, y, needRemainder = true)

 func divmod*(x, y: Stuint): tuple[quot, rem: Stuint] =
   ## Division and remainder operations for multi-precision unsigned uint
@@ -190,19 +190,21 @@ suite "Testing unsigned int division and modulo implementation":
     check: cast[uint64](qr.quot) == 7'u64
     check: cast[uint64](qr.rem) == 9'u64

-  test "Divmod(2^64, 3) returns the correct result":
-    let a = 1.stuint(128) shl 64
-    let b = 3.stuint(128)
-
-    let qr = divmod(a, b)
-
-    let q = cast[UintImpl[uint64]](qr.quot)
-    let r = cast[UintImpl[uint64]](qr.rem)
-
-    check: q.lo == 6148914691236517205'u64
-    check: q.hi == 0'u64
-    check: r.lo == 1'u64
-    check: r.hi == 0'u64
+  # TODO - no more .lo / .hi
+  #
+  # test "Divmod(2^64, 3) returns the correct result":
+  #   let a = 1.stuint(128) shl 64
+  #   let b = 3.stuint(128)
+  #
+  #   let qr = divmod(a, b)
+  #
+  #   let q = cast[UintImpl[uint64]](qr.quot)
+  #   let r = cast[UintImpl[uint64]](qr.rem)
+  #
+  #   check: q.lo == 6148914691236517205'u64
+  #   check: q.hi == 0'u64
+  #   check: r.lo == 1'u64
+  #   check: r.hi == 0'u64

   test "Divmod(1234567891234567890, 10) returns the correct result":
     let a = cast[StUint[64]](1234567891234567890'u64)
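The disabled test relied on the old recursive UintImpl[uint64] layout with .lo/.hi halves; under the new flat limbs representation the same checks could be expressed against the limb array. A hypothetical rewrite (little-endian limb order and a public `limbs` accessor are assumed, not a confirmed API):

    # Hypothetical port of the disabled test to the flat-limbs layout.
    test "Divmod(2^64, 3) returns the correct result":
      let a = 1.stuint(128) shl 64
      let b = 3.stuint(128)
      let qr = divmod(a, b)
      check: qr.quot.limbs[0] == 6148914691236517205'u64  # was q.lo
      check: qr.quot.limbs[1] == 0'u64                    # was q.hi
      check: qr.rem.limbs[0] == 1'u64                     # was r.lo
      check: qr.rem.limbs[1] == 0'u64                     # was r.hi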