From 7efa2483e4083f6357e3ce3f878e10da49695042 Mon Sep 17 00:00:00 2001
From: Mamy Ratsimbazafy <mamy_github@numforge.co>
Date: Sun, 23 Jan 2022 21:39:26 +0100
Subject: [PATCH] Division/modulo implemented - pass property-based testing vs
 ttmath

---
 benchmarks/bench_mod.nim      |  18 +--
 stint/private/uint_addsub.nim |   2 -
 stint/private/uint_div.nim    | 228 ++++++++++++++++++++++++++--------
 stint/private/uint_shift.nim  |  41 +++---
 stint/uintops.nim             |   2 +-
 5 files changed, 204 insertions(+), 87 deletions(-)

diff --git a/benchmarks/bench_mod.nim b/benchmarks/bench_mod.nim
index 0f79b68..57fabb7 100644
--- a/benchmarks/bench_mod.nim
+++ b/benchmarks/bench_mod.nim
@@ -15,29 +15,33 @@ echo "Warmup: " & $(stop - start) & "s"
 
 ####################################
 
+let a = [123'u64, 123'u64, 123'u64, 123'u64]
+let m = [456'u64, 456'u64, 456'u64, 45'u64]
+
+let aU256 = cast[Stuint[256]](a)
+let mU256 = cast[Stuint[256]](m)
 
 start = cpuTime()
 block:
-  var foo = 123.u256
+  var foo = aU256
   for i in 0 ..< 10_000_000:
-    foo += i.u256 * i.u256 mod 456.u256
-    foo = foo mod 789.u256
+    foo += (foo * foo) mod mU256
 
 stop = cpuTime()
 echo "Library: " & $(stop - start) & "s"
 
 when defined(bench_ttmath):
   # need C++
-  import ttmath
+  import ttmath, ../tests/ttmath_compat
 
   template tt_u256(a: int): UInt[256] = ttmath.u256(a.uint)
 
   start = cpuTime()
   block:
-    var foo = 123.tt_u256
+    var foo = a.astt()
+    let mU256 = m.astt()
     for i in 0 ..< 10_000_000:
-      foo += i.tt_u256 * i.tt_u256 mod 456.tt_u256
-      foo = foo mod 789.tt_u256
+      foo += (foo * foo) mod mU256
 
   stop = cpuTime()
   echo "TTMath: " & $(stop - start) & "s"
diff --git a/stint/private/uint_addsub.nim b/stint/private/uint_addsub.nim
index f037c1b..6821d75 100644
--- a/stint/private/uint_addsub.nim
+++ b/stint/private/uint_addsub.nim
@@ -8,8 +8,6 @@
 # at your option. This file may not be copied, modified, or distributed except according to those terms.
 
 import
-  # Status lib
-  stew/bitops2,
   # Internal
   ./datatypes,
   ./primitives/addcarry_subborrow
diff --git a/stint/private/uint_div.nim b/stint/private/uint_div.nim
index aaa8606..333234c 100644
--- a/stint/private/uint_div.nim
+++ b/stint/private/uint_div.nim
@@ -13,9 +13,33 @@ import
   # Internal
   ./datatypes,
   ./uint_bitwise,
-  ./uint_shift,
   ./primitives/[addcarry_subborrow, extended_precision]
 
+# Helpers
+# --------------------------------------------------------
+
+func usedBitsAndWords(a: openArray[Word]): tuple[bits, words: int] {.inline.} =
+  ## Returns the number of used words and bits in a bigInt
+  var clz = 0
+  # Count Leading Zeros
+  for i in countdown(a.len-1, 0):
+    let count = log2trunc(a[i])
+    # debugEcho "count: ", count, ", a[", i, "]: ", a[i].toBin(64)
+    if count == -1:
+      clz += WordBitWidth
+    else:
+      clz += WordBitWidth - count - 1
+      return (a.len*WordBitWidth - clz, i+1)
+
+func copyWords(
+       a: var openArray[Word], startA: int,
+       b: openArray[Word], startB: int,
+       numWords: int) =
+  ## Copy a slice of B into A. This properly deals
+  ## with overlaps when A and B are slices of the same buffer
+  for i in countdown(numWords-1, 0):
+    a[startA+i] = b[startB+i]
+
 # Division
 # --------------------------------------------------------
 
@@ -37,45 +61,18 @@ func shortDiv*(a: var Limbs, k: Word): Word =
     # Undo normalization
     result = result shr clz
 
-# func binaryShiftDiv[qLen, rLen, uLen, vLen: static int](
-#        q: var Limbs[qLen],
-#        r: var Limbs[rLen],
-#        u: Limbs[uLen],
-#        v: Limbs[vLen]) =
-#   ## Division for multi-precision unsigned uint
-#   ## Implementation through binary shift division
-#   doAssert y.isZero.not() # This should be checked on release mode in the divmod caller proc
-
-#   type SubTy = type x.lo
-
-#   var
-#     shift = y.leadingZeros - x.leadingZeros
-#     d = y shl shift
-
-#   r = x
-
-#   while shift >= 0:
-#     q += q
-#     if r >= d:
-#       r -= d
-#       q.lo = q.lo or one(SubTy)
-
-#     d = d shr 1
-#     dec(shift)
-
-func knuthDivLE(
-       q: var StUint,
-       r: var StUint,
-       u: StUint,
-       v: StUint,
-       needRemainder: bool) =
-  ## Compute the quotient and remainder (if needed)
-  ## of the division of u by v
+func shlAddMod_multi(a: var openArray[Word], c: Word,
+                     M: openArray[Word], mBits: int): Word =
+  ## Fused modular left-shift + add
+  ## Shift input `a` by a word and add `c` modulo `M`
+  ## 
+  ## Specialized for M being a multi-precision integer.
   ##
-  ## - q must be of size uLen - vLen + 1 (assuming u and v uses all words)
-  ## - r must be of size vLen (assuming v uses all words)
-  ## - uLen >= vLen
+  ## With a word W = 2^WordBitWidth and a modulus M
+  ## Does a <- a * W + c (mod M)
+  ## and returns q = (a * W + c ) / M
   ##
+<<<<<<< HEAD
   ## For now only LittleEndian is implemented
   #
   # Resources at the bottom of the file
@@ -187,25 +184,15 @@ const BinaryShiftThreshold = 8  # If the difference in bit-length is below 8
                                 # binary shift is probably faster
 
 func divmod(q, r: var Stuint,
-<<<<<<< HEAD
-    x: Limbs[xLen], y: Limbs[yLen], needRemainder: bool) =
-=======
             x, y: Stuint, needRemainder: bool) =
 
->>>>>>> 88858a7 (uint division - compile and pass the single limb tests)
   let x_clz = x.leadingZeros()
   let y_clz = y.leadingZeros()
 
   # We short-circuit division depending on special-cases.
-<<<<<<< HEAD
-  if unlikely(y.isZero):
-    raise newException(DivByZeroDefect, "You attempted to divide by zero")
-  elif y_clz == (bitsof(y) - 1):
-=======
   if unlikely(y.isZero()):
     raise newException(DivByZeroError, "You attempted to divide by zero")
   elif y_clz == (y.bits - 1):
->>>>>>> 88858a7 (uint division - compile and pass the single limb tests)
     # y is one
     q = x
   # elif (x.hi or y.hi).isZero:
@@ -225,28 +212,163 @@ func divmod(q, r: var Stuint,
     r = x
   # elif (y_clz - x_clz) < BinaryShiftThreshold:
   #   binaryShiftDiv(x, y, result.quot, result.rem)
+  ## The modulus `M` most-significant bit at `mBits` MUST be set.
+  
+                                        # Assuming 64-bit words
+  let hi = a[^1]                        # Save the high word to detect carries
+  let R = mBits and (WordBitWidth - 1)  # R = mBits mod 64
+
+  var a0, a1, m0: Word
+  if R == 0:                            # If the number of mBits is a multiple of 64
+    a0 = a[^1]                          #
+    copyWords(a, 1, a, 0, a.len-1)      # we can just shift words
+    a[0] = c                            # and replace the first one by c
+    a1 = a[^1]
+    m0 = M[^1]
+  else:                                 # Else: need to deal with partial word shifts at the edge.
+    let clz = WordBitWidth-R
+    a0 = (a[^1] shl clz) or (a[^2] shr R)
+    copyWords(a, 1, a, 0, a.len-1)
+    a[0] = c
+    a1 = (a[^1] shl clz) or (a[^2] shr R)
+    m0 = (M[^1] shl clz) or (M[^2] shr R)
+
+  # m0 has its high bit set. (a0, a1)/m0 fits in a limb.
+  # Get a quotient q, at most we will be 2 iterations off
+  # from the true quotient
+  var q: Word                           # Estimate quotient
+  if a0 == m0:                          # if a_hi == divisor
+    q = high(Word)                      # quotient = MaxWord (0b1111...1111)
+  elif a0 == 0 and a1 < m0:             # elif q == 0, true quotient = 0
+    q = 0
   else:
-    knuthDivLE(q, r, x, y, needRemainder)
+    var r: Word
+    div2n1n(q, r, a0, a1, m0)           # else instead of being of by 0, 1 or 2
+    q -= 1                              # we return q-1 to be off by -1, 0 or 1
+
+  # Now substract a*2^64 - q*m
+  var carry = Word(0)
+  var overM = true                      # Track if quotient greater than the modulus
+
+  for i in 0 ..< M.len:
+    var qm_lo: Word
+    block:                              # q*m
+      # q * p + carry (doubleword) carry from previous limb
+      muladd1(carry, qm_lo, q, M[i], carry)
+
+    block:                              # a*2^64 - q*m
+      var borrow: Borrow
+      subB(borrow, a[i], a[i], qm_lo, Borrow(0))
+      carry += Word(borrow) # Adjust if borrow
+
+    if a[i] != M[i]:
+      overM = a[i] > M[i]
+
+  # Fix quotient, the true quotient is either q-1, q or q+1
+  #
+  # if carry < q or carry == q and overM we must do "a -= M"
+  # if carry > hi (negative result) we must do "a += M"
+  if carry > hi:
+    var c = Carry(0)
+    for i in 0 ..< a.len:
+      addC(c, a[i], a[i], M[i], c)
+    q -= 1
+  elif overM or (carry < hi):
+    var b = Borrow(0)
+    for i in 0 ..< a.len:
+      subB(b, a[i], a[i], M[i], b)
+    q += 1
+
+  return q
+
+func shlAddMod(a: var openArray[Word], c: Word,
+               M: openArray[Word], mBits: int): Word {.inline.}=
+  ## Fused modular left-shift + add
+  ## Shift input `a` by a word and add `c` modulo `M`
+  ## 
+  ## With a word W = 2^WordBitWidth and a modulus M
+  ## Does a <- a * W + c (mod M)
+  ## and returns q = (a * W + c ) / M
+  ##
+  ## The modulus `M` most-significant bit at `mBits` MUST be set.
+  if mBits <= WordBitWidth:
+    # If M fits in a single limb
+
+    # We normalize M with clz so that the MSB is set
+    # And normalize (a * 2^64 + c) by R as well to maintain the result
+    # This ensures that (a0, a1)/p0 fits in a limb.
+    let R = mBits and (WordBitWidth - 1)
+    let clz = WordBitWidth-R
+
+    # (hi, lo) = a * 2^64 + c
+    let hi = (a[0] shl clz) or (c shr R)
+    let lo = c shl clz
+    let m0 = M[0] shl clz
+
+    var q, r: Word
+    div2n1n(q, r, hi, lo, m0)
+    a[0] = r shr clz
+    return q
+  else:
+    return shlAddMod_multi(a, c, M, mBits)
+
+func divRemImpl(
+       q, r: var openArray[Word],
+       a, b: openArray[Word]
+     ) =
+  let (aBits, aLen) = usedBitsAndWords(a)
+  let (bBits, bLen) = usedBitsAndWords(b)
+  let rLen = bLen
+
+  if aBits < bBits:
+    # if a uses less bits than b,
+    # a < b, so q = 0 and r = a
+    copyWords(r, 0, a, 0, aLen)
+    for i in aLen ..< r.len: # r.len >= rLen
+      r[i] = 0
+    for i in 0 ..< q.len:
+      q[i] = 0
+  else:
+    # The length of a is at least the divisor
+    # We can copy bLen-1 words
+    # and modular shift-lef-add the rest
+    let aOffset = aLen - bLen
+    copyWords(r, 0, a, aOffset+1, bLen-1)
+    r[rLen-1] = 0
+    # Now shift-left the copied words while adding the new word mod b
+    for i in countdown(aOffset, 0):
+      q[i] = shlAddMod(
+        r.toOpenArray(0, rLen-1),
+        a[i],
+        b.toOpenArray(0, bLen-1),
+        bBits
+      )
+
+    # Clean up extra words
+    for i in aOffset+1 ..< q.len:
+      q[i] = 0
+    for i in rLen ..< r.len:
+      r[i] = 0
 
 func `div`*(x, y: Stuint): Stuint {.inline.} =
   ## Division operation for multi-precision unsigned uint
   var tmp{.noInit.}: Stuint
-  divmod(result, tmp, x, y, needRemainder = false)
+  divRemImpl(result.limbs, tmp.limbs, x.limbs, y.limbs)
 
 func `mod`*(x, y: Stuint): Stuint {.inline.} =
   ## Remainder operation for multi-precision unsigned uint
   var tmp{.noInit.}: Stuint
-  divmod(tmp, result, x, y, needRemainder = true)
+  divRemImpl(tmp.limbs, result.limbs, x.limbs, y.limbs)
 
 func divmod*(x, y: Stuint): tuple[quot, rem: Stuint] =
   ## Division and remainder operations for multi-precision unsigned uint
-  divmod(result.quot, result.rem, x, y, needRemainder = true)
+  divRemImpl(result.quot.limbs, result.rem.limbs, x.limbs, y.limbs)
 
 # ######################################################################
 # Division implementations
 #
 # Multi-precision division is a costly
-#and also difficult to implement operation
+# and also difficult to implement operation
 
 # ##### Research #####
 
diff --git a/stint/private/uint_shift.nim b/stint/private/uint_shift.nim
index a1441b0..3d50abf 100644
--- a/stint/private/uint_shift.nim
+++ b/stint/private/uint_shift.nim
@@ -54,33 +54,14 @@ func shrWords*(r: var Limbs, a: Limbs, w: SomeInteger) =
   when cpuEndian == littleEndian:
     for i in 0 ..< Limbs.len-w:
       r[i] = a[i+w]
+    for i in Limbs.len-w ..< Limbs.len:
+      r[i] = 0
   else:
+    for i in countdown(Limbs.len-1, Limbs.len-w):
+      r[i] = 0
     for i in countdown(Limbs.len-w, 0):
       r[i] = a[i+w]
 
-func shlSmallOverflowing*[rLen, aLen: static int](
-       r: var Limbs[rLen], a: Limbs[aLen], k: SomeInteger) =
-  ## Compute the `shift left` operation of x and k
-  ##
-  ## k MUST be less than the base word size (2^32 or 2^64)
-  when cpuEndian == littleEndian:
-    r[0] = a[0] shl k
-    for i in 1 ..< a.len:
-      r[i] = (a[i] shl k) or (a[i-1] shr (WordBitWidth - k))
-    if rLen > aLen:
-      r[aLen] = a[aLen - 1] shr (WordBitWidth - k)
-      for i in aLen+1 ..< rLen:
-        r[i] = 0
-  else:
-    const offset = rLen - aLen
-    r[^1] = a[^1] shl k
-    for i in countdown(a.len-2, 0):
-      r[i+offset] = (a[i] shl k) or (a[i+1] shr (WordBitWidth - k))
-    if rLen > aLen:
-      r[offset-1] = a[0] shr (WordBitWidth - k)
-      for i in 0 ..< offset-1:
-        r[i] = 0
-
 func shlSmall*(r: var Limbs, a: Limbs, k: SomeInteger) =
   ## Compute the `shift left` operation of x and k
   ##
@@ -112,10 +93,14 @@ func shlLarge*(r: var Limbs, a: Limbs, w, shift: SomeInteger) =
 func shlWords*(r: var Limbs, a: Limbs, w: SomeInteger) =
   ## Shift left by w word
   when cpuEndian == littleEndian:
+    for i in 0 ..< w:
+      r[i] = 0
     for i in 0 ..< Limbs.len-w:
       r[i+w] = a[i]
   else:
-    for i in countdown(Limbs.len-1, 0):
+    for i in countdown(Limbs.len-1, Limbs.len-w):
+      r[i] = 0
+    for i in countdown(Limbs.len-w-1, 0):
       r[i] = a[i-w]
 
 # Wrappers
@@ -123,6 +108,10 @@ func shlWords*(r: var Limbs, a: Limbs, w: SomeInteger) =
 
 func shiftRight*(r: var Stuint, a: Stuint, k: SomeInteger) =
   ## Shift `a` right by k bits and store in `r`
+  if k == 0:
+    r = a
+    return
+  
   if k < WordBitWidth:
     r.limbs.shrSmall(a.limbs, k)
     return
@@ -138,6 +127,10 @@ func shiftRight*(r: var Stuint, a: Stuint, k: SomeInteger) =
 
 func shiftLeft*(r: var Stuint, a: Stuint, k: SomeInteger) =
   ## Shift `a` left by k bits and store in `r`
+  if k == 0:
+    r = a
+    return
+  
   if k < WordBitWidth:
     r.limbs.shlSmall(a.limbs, k)
     r.clearExtraBits()
diff --git a/stint/uintops.nim b/stint/uintops.nim
index 738a1b9..f196bd2 100644
--- a/stint/uintops.nim
+++ b/stint/uintops.nim
@@ -26,7 +26,7 @@ export StUint
 func setZero*(a: var StUint) =
   ## Set ``a`` to 0
   for i in 0 ..< a.limbs.len:
-    a[i] = 0
+    a.limbs[i] = 0
 
 func setSmallInt(a: var StUint, k: Word) =
   ## Set ``a`` to k