From 6eeba3d41a431040c5603a9f2c76b0075cc4c98f Mon Sep 17 00:00:00 2001
From: Mamy Ratsimbazafy
Date: Fri, 20 Apr 2018 11:13:47 +0200
Subject: [PATCH] Division: Fix recursive divide-and-conquer (#20)

* Simplify div2n1n
* Revert borrow detection, needed a cleverer scheme.
* Getting inspired by uint128 didn't work for the recursive case. Use the recursive algo from the get-go
* Fix shl bug ... (needs fuzz testing)
* divmod fixed for single nesting (?)
* Almost there
* Fix one part of div3n2n
* Division is wooorrrrkkinnnggg :fire:
* Fix compilation for the nested version
* Forgot to not multiply the size by 8
* Add another failing shift test
* Fix countLeadingZero for nested uint
* Cleanup: remove debugEcho
* Move debug utils in a specific folder
* Fix forward declaration
* Move division to its own file
---
 src/debug/debugutils.nim         |  44 +++++
 src/private/bithacks.nim         |  11 +-
 src/private/stdlib_bitops.nim    |   2 +-
 src/private/uint_binary_ops.nim  | 190 +------------------
 src/private/uint_bitwise_ops.nim |  45 +++--
 src/private/uint_division.nim    | 310 +++++++++++++++++++++++++++++++
 src/uint_public.nim              |   5 +-
 tests/test_bitwise.nim           |  13 ++
 8 files changed, 415 insertions(+), 205 deletions(-)
 create mode 100644 src/debug/debugutils.nim
 create mode 100644 src/private/uint_division.nim

diff --git a/src/debug/debugutils.nim b/src/debug/debugutils.nim
new file mode 100644
index 0000000..2b3f1ef
--- /dev/null
+++ b/src/debug/debugutils.nim
@@ -0,0 +1,44 @@
+# Mpint
+# Copyright 2018 Status Research & Development GmbH
+# Licensed under either of
+#
+#  * Apache License, version 2.0, ([LICENSE-APACHE](LICENSE-APACHE) or http://www.apache.org/licenses/LICENSE-2.0)
+#  * MIT license ([LICENSE-MIT](LICENSE-MIT) or http://opensource.org/licenses/MIT)
+#
+# at your option. This file may not be copied, modified, or distributed except according to those terms.
+
+# Utilities to debug MpInt
+
+import
+  strutils,
+  ../private/[uint_type, size_mpuintimpl]
+
+func tohexBE*[T: uint8 or uint16 or uint32 or uint64](x: T): string =
+  ## Stringify a uint to hex, most significant byte on the left,
+  ## e.g. 1.uint64 is 0000000000000001
+
+  let bytes = cast[ptr array[T.sizeof, byte]](x.unsafeaddr)
+
+  result = ""
+  when system.cpuEndian == littleEndian:
+    for i in countdown(T.sizeof - 1, 0):
+      result.add toHex(bytes[i])
+  else:
+    for i in 0 ..< T.sizeof:
+      result.add toHex(bytes[i])
+
+func tohexBE*(x: MpUintImpl): string =
+  ## Stringify a uint to hex, most significant byte on the left,
+  ## e.g. (2.uint128)^64 + 1 is 00000000000000010000000000000001
+
+  const size = size_mpuintimpl(x) div 8
+
+  let bytes = cast[ptr array[size, byte]](x.unsafeaddr)
+
+  result = ""
+  when system.cpuEndian == littleEndian:
+    for i in countdown(size - 1, 0):
+      result.add toHex(bytes[i])
+  else:
+    for i in 0 ..< size:
+      result.add toHex(bytes[i])
diff --git a/src/private/bithacks.nim b/src/private/bithacks.nim
index bf50436..7c80bb9 100644
--- a/src/private/bithacks.nim
+++ b/src/private/bithacks.nim
@@ -7,7 +7,7 @@
 #
 # at your option. This file may not be copied, modified, or distributed except according to those terms.
 
-import ./uint_type, stdlib_bitops
+import ./uint_type, stdlib_bitops, size_mpuintimpl
 
 # We reuse bitops from the Nim standard lib and optimize it further on x86.
 # On x86, clz is implemented as bitscanreverse then a xor, and we need to xor/sub again.
@@ -57,6 +57,11 @@ proc bit_length*(n: MpUintImpl): int {.noSideEffect.}=
 
 proc countLeadingZeroBits*(x: MpUintImpl): int {.inline, nosideeffect.} =
   ## Returns the number of leading zero bits in integer.
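+  ## When the high half is all zeroes, its clz saturates at half the total
+  ## width (clz(0) now returns the full bit width, per the stdlib_bitops
+  ## change below), so the low half's count is added on top.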
+
+  const maxHalfRepr = size_mpuintimpl(x) div 2
+  let hi_clz = x.hi.countLeadingZeroBits
-  result = if hi_clz == 0: x.lo.countLeadingZeroBits
-           else: hi_clz
+
+  result = if hi_clz == maxHalfRepr:
+             x.lo.countLeadingZeroBits + maxHalfRepr
+           else: hi_clz
diff --git a/src/private/stdlib_bitops.nim b/src/private/stdlib_bitops.nim
index c56df67..6d3d165 100644
--- a/src/private/stdlib_bitops.nim
+++ b/src/private/stdlib_bitops.nim
@@ -97,7 +97,7 @@ proc countLeadingZeroBits*(x: SomeInteger): int {.inline, nosideeffect.} =
   # when noUndefined:
   if x == 0:
-    return 0
+    return sizeof(x) * 8
   when nimvm:
     when sizeof(x) <= 4: result = sizeof(x)*8 - 1 - fastlog2_nim(x.uint32)
diff --git a/src/private/uint_binary_ops.nim b/src/private/uint_binary_ops.nim
index 8768206..bf06540 100644
--- a/src/private/uint_binary_ops.nim
+++ b/src/private/uint_binary_ops.nim
@@ -7,7 +7,7 @@
 #
 # at your option. This file may not be copied, modified, or distributed except according to those terms.
 
-import ./bithacks, ./conversion, ./stdlib_bitops,
+import ./bithacks, ./conversion,
        ./uint_type,
        ./uint_comparison,
        ./uint_bitwise_ops,
@@ -35,8 +35,9 @@ proc `-=`*(x: var MpUintImpl, y: MpUintImpl) {.noSideEffect, inline.}=
   # Optimized assembly should contain the sbb instruction (subtract with borrow)
   # Clang on macOS does so with the -d:release switch and MpUint[uint32] (uint64)
   type SubTy = type x.lo
+  let original = x.lo
   x.lo -= y.lo
-  x.hi -= (x.lo <= not y.lo).toSubtype(SubTy) + y.hi
+  x.hi -= (original < x.lo).toSubtype(SubTy) + y.hi
 
 proc `-`*(x, y: MpUintImpl): MpUintImpl {.noSideEffect, noInit, inline.}=
   # Subtraction for multi-precision unsigned int
@@ -49,7 +50,7 @@
 proc naiveMulImpl[T: MpUintImpl](x, y: T): MpUintImpl[T] {.noSideEffect, noInit, inline.}
   # Forward declaration
 
-proc naiveMul[T: BaseUint](x, y: T): MpUintImpl[T] {.noSideEffect, noInit, inline.}=
+proc naiveMul*[T: BaseUint](x, y: T): MpUintImpl[T] {.noSideEffect, noInit, inline.}=
   ## Naive multiplication algorithm with extended precision
 
   const size = size_mpuintimpl(x)
@@ -116,186 +117,3 @@ proc `*`*(x, y: MpUintImpl): MpUintImpl {.noSideEffect, noInit.}=
   # For T * T --> 2T (uint64 * uint64 --> uint128) we use extra precision multiplication
   result = naiveMul(x.lo, y.lo)
   result.hi += (naiveMul(x.hi, y.lo) + naiveMul(x.lo, y.hi)).lo
-
-
-# ################### Division ################### #
-from ./primitive_divmod import divmod
-
-proc divmod*(x, y: MpUintImpl): tuple[quot, rem: MpUintImpl] {.noSideEffect.}
-
-proc div2n1n[T: BaseUint](x_hi, x_lo, y: T): tuple[quot, rem: T] {.noSideEffect, noInit.} =
-  const
-    size = size_mpuintimpl(x_hi)
-    halfSize = size div 2
-    halfMask = (one(T) shl halfSize) - one(T)
-    base = one(T) shl halfSize
-
-  if unlikely(x_hi >= y):
-    raise newException(ValueError, "Division overflow")
-
-  let clz = countLeadingZeroBits(y) # We assume that for 0 clz returns 0
-
-  # normalization, shift so that the MSB is at 2^n
-  let xn = MpUintImpl[T](hi: x_hi, lo: x_lo) shl clz
-  let yn = y shl clz
-
-  # Break divisor in 2 and dividend in 4
-  let yn_hi = yn shr halfSize
-  let yn_lo = yn and halfMask
-
-  let xnlohi = xn.lo shr halfSize
-  let xnlolo = xn.lo and halfMask
-
-  # First half of the quotient
-  var (q1, r) = divmod(xn.hi, yn_hi)
-
-  while (q1 >= base) or ((q1 * yn_lo) > (base * r + xnlohi)):
-    q1 -= one(T)
-    r += yn_hi
-    if r >= base:
-      break
-
-  # Remove it
-  let xn_rest = xn.hi shl halfSize + xnlohi - (q1 * yn)
-
-  # Second half
-  var (q2, s) = divmod(xn_rest, yn_hi)
-
-  var
-    q2_ynlo = q2 * yn_lo
-    sbase_xnlolo = (s shl halfSize) or xnlolo
-
-  while (q2 >= base) or (q2_ynlo > sbase_xnlolo):
-    q2 -= one(T)
-    s += yn_hi
-    if s >= base:
-      break
-    else:
-      q2_ynlo -= yn_lo
-      sbase_xnlolo = (s shl halfSize) or xnlolo
-
-
-  result.quot = (q1 shl halfSize) or q2
-  result.rem = ((xn_rest shl halfSize) + xnlolo - q2 * yn) shr clz
-
-proc divmod*(x, y: MpUintImpl): tuple[quot, rem: MpUintImpl] {.noInit, noSideEffect.}=
-
-  # Using divide and conquer algorithm.
-  if y.hi.isZero:
-    if x.hi < y.lo: # Bit length of quotient x/y < bit_length(MpUintImpl) / 2
-      (result.quot.lo, result.rem.lo) = div2n1n(x.hi, x.lo, y.lo)
-    else: # Quotient can overflow the subtype so we split work
-      (result.quot.hi, result.rem.hi) = divmod(x.hi, y.hi)
-      (result.quot.lo, result.rem.lo) = div2n1n(result.rem.hi, x.lo, y.lo)
-      result.rem.hi = zero(type result.rem.hi)
-    return
-
-  const
-    size = size_mpuintimpl(x)
-    halfSize = size div 2
-
-  block:
-    # Normalization of divisor
-    let clz = countLeadingZeroBits(x.hi)
-    let yn = (y shl clz)
-
-    # Prevent overflows
-    let xn = x shr 1
-
-    # Get the quotient
-    block:
-      let (qlo, _) = div2n1n(xn.hi, xn.lo, yn.hi)
-      result.quot.lo = qlo
-
-    # Undo normalization
-    result.quot = result.quot shr (halfSize - 1 - clz) # -1 to correct for xn shift
-
-    if not result.quot.isZero:
-      result.quot -= one(type result.quot)
-    # Quotient is correct or too small by one
-    # We will fix that once we know the remainder
-
-    # Remainder
-    result.rem = x - (y * result.quot)
-
-    # Fix quotient and remainder if we're off by one
-    if result.rem >= y:
-      # one more division round
-      result.quot += one(type result.quot)
-      result.rem -= y
-
-proc `div`*(x, y: MpUintImpl): MpUintImpl {.inline, noSideEffect.} =
-  ## Division operation for multi-precision unsigned int
-  divmod(x,y).quot
-
-proc `mod`*(x, y: MpUintImpl): MpUintImpl {.inline, noSideEffect.} =
-  ## Modulo operation for multi-precision unsigned int
-  divmod(x,y).rem
-
-
-# ######################################################################
-# Division implementations
-#
-# Division is the most costly operation
-# And also of critical importance for cryptography applications
-
-# ##### Research #####
-
-# Overview of division algorithms:
-# - https://gmplib.org/manual/Division-Algorithms.html#Division-Algorithms
-# - https://gmplib.org/~tege/division-paper.pdf
-# - Comparison of fast division algorithms for large integers: http://bioinfo.ict.ac.cn/~dbu/AlgorithmCourses/Lectures/Hasselstrom2003.pdf
-
-# Libdivide has implementations faster than hardware division if repeatedly dividing by the same number
-# - http://libdivide.com/documentation.html
-# - https://github.com/ridiculousfish/libdivide/blob/master/libdivide.h
-# Furthermore libdivide also has branchless implementations
-
-# Current implementation
-# Currently we use the divide and conquer algorithm. Implementations can be found in:
-# - Hacker's delight: http://www.hackersdelight.org/hdcodetxt/divDouble.c.txt
-# - Libdivide
-# - Code project: https://www.codeproject.com/Tips/785014/UInt-Division-Modulus
-# - Cuda-uint128 (unfinished): https://github.com/curtisseizert/CUDA-uint128/blob/master/cuda_uint128.h
-# - Mpdecimal: https://github.com/status-im/nim-decimal/blob/9b65e95299cb582b14e0ae9a656984a2ce0bab03/decimal/mpdecimal_wrapper/generated/basearith.c#L305-L412
-
-# Probably the most efficient algorithm that can benefit from the MpUInt recursive data structure is
-# the recursive fast division by Burnikel and Ziegler (http://www.mpi-sb.mpg.de/~ziegler/TechRep.ps.gz):
-# - Python implementation: https://bugs.python.org/file11060/fast_div.py and discussion https://bugs.python.org/issue3451
-# - C++ implementation: https://github.com/linbox-team/givaro/blob/master/src/kernel/recint/rudiv.h
-# - The Handbook of Elliptic and Hyperelliptic Cryptography Algorithm 10.35 on page 188 has a more explicit version of the div2NxN algorithm. This algorithm is directly recursive and avoids the mutual recursion of the original paper's calls between div2NxN and div3Nx2N.
-
-# Other libraries that can be used as reference for alternative (?) implementations:
-# - TTMath: https://github.com/status-im/nim-ttmath/blob/8f6ff2e57b65a350479c4012a53699e262b19975/src/headers/ttmathuint.h#L1530-L2383
-# - LibTomMath: https://github.com/libtom/libtommath
-# - Google Abseil: https://github.com/abseil/abseil-cpp/tree/master/absl/numeric
-# - Crypto libraries like libsecp256k1, OpenSSL, ... though they are not generic (uint256 only, for example)
-# Note: GMP/MPFR are GPL. The papers can be used but not their code.
-
-# ######################################################################
-# School division
-
-# proc divmod*(x, y: MpUintImpl): tuple[quot, rem: MpUintImpl] {.noSideEffect.}=
-#   ## Division for multi-precision unsigned int
-#   ## Returns quotient + remainder in a (quot, rem) tuple
-#   #
-#   # Implementation through binary shift division
-#   if unlikely(y.isZero):
-#     raise newException(DivByZeroError, "You attempted to divide by zero")
-
-#   type SubTy = type x.lo
-
-#   var
-#     shift = x.bit_length - y.bit_length
-#     d = y shl shift
-
-#   result.rem = x
-
-#   while shift >= 0:
-#     result.quot += result.quot
-#     if result.rem >= d:
-#       result.rem -= d
-#       result.quot.lo = result.quot.lo or one(SubTy)
-
-#     d = d shr 1
-#     dec(shift)
-
diff --git a/src/private/uint_bitwise_ops.nim b/src/private/uint_bitwise_ops.nim
index 9918cc1..aadc0bf 100644
--- a/src/private/uint_bitwise_ops.nim
+++ b/src/private/uint_bitwise_ops.nim
@@ -10,43 +10,60 @@
 import ./uint_type, ./size_mpuintimpl, ./conversion
 
-proc `not`*(x: MpUintImpl): MpUintImpl {.noInit, noSideEffect, inline.}=
+func `not`*(x: MpUintImpl): MpUintImpl {.noInit, inline.}=
   ## Bitwise complement of unsigned integer x
   result.lo = not x.lo
   result.hi = not x.hi
 
-proc `or`*(x, y: MpUintImpl): MpUintImpl {.noInit, noSideEffect, inline.}=
+func `or`*(x, y: MpUintImpl): MpUintImpl {.noInit, inline.}=
   ## `Bitwise or` of numbers x and y
   result.lo = x.lo or y.lo
   result.hi = x.hi or y.hi
 
-proc `and`*(x, y: MpUintImpl): MpUintImpl {.noInit, noSideEffect, inline.}=
+func `and`*(x, y: MpUintImpl): MpUintImpl {.noInit, inline.}=
   ## `Bitwise and` of numbers x and y
   result.lo = x.lo and y.lo
   result.hi = x.hi and y.hi
 
-proc `xor`*(x, y: MpUintImpl): MpUintImpl {.noInit, noSideEffect, inline.}=
+func `xor`*(x, y: MpUintImpl): MpUintImpl {.noInit, inline.}=
   ## `Bitwise xor` of numbers x and y
   result.lo = x.lo xor y.lo
   result.hi = x.hi xor y.hi
 
-proc `shl`*(x: MpUintImpl, y: SomeInteger): MpUintImpl {.inline, noSideEffect.}=
+func `shr`*(x: MpUintImpl, y: SomeInteger): MpUintImpl {.inline.}
+  # Forward declaration
+
+func `shl`*(x: MpUintImpl, y: SomeInteger): MpUintImpl {.inline.}=
   ## Compute the `shift left` operation of x and y
   # Note: inlining this poses a codegen/aliasing issue when doing `x = x shl 1`
+
+  # TODO: would it be better to reimplement this using an array of bytes/uint64?
+  #       That opens up endianness issues.
   const halfSize = size_mpuintimpl(x) div 2
+  let defect = halfSize - int(y)
-  result.hi = (x.hi shl y) or (x.lo shl (y - halfSize))
-  if y < halfSize:
+  if y == 0:
+    return x
+  elif y == halfSize:
+    result.hi = x.lo
+  elif y < halfSize:
+    result.hi = (x.hi shl y) or (x.lo shr (halfSize - y))
     result.lo = x.lo shl y
+  else:
+    result.hi = x.lo shl (y - halfSize)
 
-proc `shr`*(x: MpUintImpl, y: SomeInteger): MpUintImpl {.inline, noSideEffect.}=
+func `shr`*(x: MpUintImpl, y: SomeInteger): MpUintImpl {.inline.}=
   ## Compute the `shift right` operation of x and y
   const halfSize = size_mpuintimpl(x) div 2
-  let overflow = y < halfSize
-
-  result.lo = (x.lo shr y) or (
-    if overflow: x.hi shl (halfSize - y) else: x.hi shr (y - halfSize)
-  )
-  if overflow:
+  if y == 0:
+    return x
+  elif y == halfSize:
+    result.lo = x.hi
+  elif y < halfSize:
+    result.lo = (x.lo shr y) or (x.hi shl (halfSize - y))
     result.hi = x.hi shr y
+  else:
+    result.lo = x.hi shr (y - halfSize)
+
diff --git a/src/private/uint_division.nim b/src/private/uint_division.nim
new file mode 100644
index 0000000..e195383
--- /dev/null
+++ b/src/private/uint_division.nim
@@ -0,0 +1,310 @@
+# Mpint
+# Copyright 2018 Status Research & Development GmbH
+# Licensed under either of
+#
+#  * Apache License, version 2.0, ([LICENSE-APACHE](LICENSE-APACHE) or http://www.apache.org/licenses/LICENSE-2.0)
+#  * MIT license ([LICENSE-MIT](LICENSE-MIT) or http://opensource.org/licenses/MIT)
+#
+# at your option. This file may not be copied, modified, or distributed except according to those terms.
+
+import ./bithacks, ./conversion,
+       ./uint_type,
+       ./uint_comparison,
+       ./uint_bitwise_ops,
+       ./uint_binary_ops,
+       ./size_mpuintimpl,
+       ./primitive_divmod
+
+# ################### Division ################### #
+# We use the following algorithm:
+#  - Fast recursive division by Burnikel and Ziegler
+
+########################################################################################################################
+##                                                                                                                    ##
+## Grade school division, but with (very) large digits, dividing [a1,a2,a3,a4] by [b1,b2]:                            ##
+##                                                                                                                    ##
+##    +----+----+----+----+       +----+----+       +----+                                                            ##
+##    | a1 | a2 | a3 | a4 |   /   | b1 | b2 |   =   | q1 |   DivideThreeHalvesByTwo(a1a2, a3, b1b2, n, q1, r1r2)      ##
+##    +----+----+----+----+       +----+----+       +----+                                                            ##
+##         +--------------+            |               |                                                              ##
+##         |  b1b2 * q1   |            |               |                                                              ##
+##         +--------------+            |               |                                                              ##
+##  -      ================            v               |                                                              ##
+##    +----+----+----+            +----+----+          | +----+                                                       ##
+##    | r1 | r2 | a4 |     /      | b1 | b2 |   =      | | q2 |   DivideThreeHalvesByTwo(r1r2, a4, b1b2, n, q2, r1r2) ##
+##    +----+----+----+            +----+----+          | +----+                                                       ##
+##         +--------------+                            |    |                                                         ##
+##         |  b1b2 * q2   |                            |    |                                                         ##
+##         +--------------+                            |    |                                                         ##
+##  -      ================                            v    v                                                         ##
+##    +----+----+                                   +----+----+                                                       ##
+##    | r1 | r2 |                                   | q1 | q2 |   r1r2 = a1a2a3a4 mod b1b2, q1q2 = a1a2a3a4 div b1b2  ##
+##    +----+----+                                   +----+----+                                                       ##
+##                                                                                                                    ##
+## Note: in the diagram above, a1, b1, q1, r1 etc. are the most significant "digits" of their numbers.                ##
+##                                                                                                                    ##
+########################################################################################################################
+
+func div2n1n[T: SomeUnsignedInt](q, r: var T, n_hi, n_lo, d: T) {.inline.}
+func div2n1n(q, r: var MpUintImpl, ah, al, b: MpUintImpl) {.inline.}
+  # Forward declaration
+
+func div3n2n[T]( q, r1, r0: var MpUintImpl[T],
+                 a2, a1, a0: MpUintImpl[T],
+                 b1, b0: MpUintImpl[T]) {.inline.}=
+  mixin div2n1n
+
+  type T = type q
+
+  var
+    c: T
+    carry: bool
+
+  if a2 < b1:
+    div2n1n(q, c, a2, a1, b1)
+  else:
+    q = zero(type q) - one(type q) # We want 0xFFFFF ....
+    c = a1 + b1
+    if c < a1:
+      carry = true
+
+  let
+    d = naiveMul(q, b0)
+    b = MpUintImpl[type c](hi: b1, lo: b0)
+
+  var r = MpUintImpl[type c](hi: c, lo: a0) - d
+
+  if (not carry) and (d > r):
+    q -= one(type q)
+    r += b
+
+  if r > b:
+    q -= one(type q)
+    r += b
+
+  r1 = r.hi
+  r0 = r.lo
+
+template sub_ddmmss[T](sh, sl, ah, al, bh, bl: T) =
+  sl = al - bl
+  sh = ah - bh - (al < bl).T
+
+func lo[T: SomeUnsignedInt](x: T): T {.inline.} =
+  const
+    p = T.sizeof * 8 div 2
+    base = 1.T shl p
+    mask = base - 1
+  result = x and mask
+
+func hi[T: SomeUnsignedInt](x: T): T {.inline.} =
+  const
+    p = T.sizeof * 8 div 2
+  result = x shr p
+
+func umul_ppmm[T](w1, w0: var T, u, v: T) =
+
+  const
+    p = (T.sizeof * 8 div 2)
+    base = 1.T shl p
+
+  var
+    x0, x1, x2, x3: T
+
+  let
+    ul = u.lo
+    uh = u.hi
+    vl = v.lo
+    vh = v.hi
+
+  x0 = ul * vl
+  x1 = ul * vh
+  x2 = uh * vl
+  x3 = uh * vh
+
+  x1 += x0.hi    # This can't carry
+  x1 += x2       # but this can
+  if x1 < x2:    # if carry, add it to x3
+    x3 += base
+
+  w1 = x3 + x1.hi
+  w0 = (x1 shl p) + x0.lo
+
+proc div3n2n( q, r1, r0: var SomeUnsignedInt,
+              a2, a1, a0: SomeUnsignedInt,
+              b1, b0: SomeUnsignedInt) {.inline.}=
+  mixin div2n1n
+
+  type T = type q
+
+  var
+    c, d1, d0: T
+    carry: bool
+
+  if a2 < b1:
+    div2n1n(q, c, a2, a1, b1)
+
+  else:
+    q = 0.T - 1.T # We want 0xFFFFF ....
+    c = a1 + b1
+    if c < a1:
+      carry = true
+
+  umul_ppmm(d1, d0, q, b0)
+  sub_ddmmss(r1, r0, c, a0, d1, d0)
+
+  if (not carry) and ((d1 > c) or ((d1 == c) and (d0 > a0))):
+    q -= 1.T
+    r0 += b0
+    r1 += b1
+    if r0 < b0:
+      inc r1
+
+  if (r1 > b1) or ((r1 == b1) and (r0 >= b0)):
+    q -= 1.T
+    r0 += b0
+    r1 += b1
+    if r0 < b0:
+      inc r1
+
+func div2n1n(q, r: var MpUintImpl, ah, al, b: MpUintImpl) {.inline.} =
+
+  # assert countLeadingZeroBits(b) == 0, "Divisor was not normalized"
+
+  var s: MpUintImpl
+  div3n2n(q.hi, s.hi, s.lo, ah.hi, ah.lo, al.hi, b.hi, b.lo)
+  div3n2n(q.lo, r.hi, r.lo, s.hi, s.lo, al.lo, b.hi, b.lo)
+
+func div2n1n[T: SomeUnsignedInt](q, r: var T, n_hi, n_lo, d: T) {.inline.} =
+
+  # assert countLeadingZeroBits(d) == 0, "Divisor was not normalized"
+
+  const
+    size = size_mpuintimpl(q)
+    halfSize = size div 2
+    halfMask = (1.T shl halfSize) - 1.T
+
+  template halfQR(n_hi, n_lo, d_hi, d_lo: T): tuple[q, r: T] =
+
+    var (q, r) = divmod(n_hi, d_hi)
+    let m = q * d_lo
+    r = (r shl halfSize) or n_lo
+
+    # Fix the remainder; we're at most 2 iterations off
+    if r < m:
+      q -= 1.T
+      r += d_hi
+      if r >= d_hi and r < m:
+        q -= 1.T
+        r += d_hi
+    r -= m
+    (q, r)
+
+  let
+    d_hi = d shr halfSize
+    d_lo = d and halfMask
+    n_lohi = n_lo shr halfSize
+    n_lolo = n_lo and halfMask
+
+  # First half of the quotient
+  let (q1, r1) = halfQR(n_hi, n_lohi, d_hi, d_lo)
+
+  # Second half
+  let (q2, r2) = halfQR(r1, n_lolo, d_hi, d_lo)
+
+  q = (q1 shl halfSize) or q2
+  r = r2
+
+func divmod*[T](x, y: MpUintImpl[T]): tuple[quot, rem: MpUintImpl[T]] =
+
+  # Normalization
+  assert y.isZero.not()
+
+  const halfSize = size_mpuintimpl(x) div 2
+  let clz = countLeadingZeroBits(y)
+
+  let
+    xx = MpUintImpl[type x](lo: x) shl clz
+    yy = y shl clz
+
+  # Compute
+  div2n1n(result.quot, result.rem, xx.hi, xx.lo, yy)
+
+  # Undo normalization
+  result.rem = result.rem shr clz
+
+func `div`*(x, y: MpUintImpl): MpUintImpl {.inline.} =
+  ## Division operation for multi-precision unsigned int
+  divmod(x,y).quot
+
+func `mod`*(x, y: MpUintImpl): MpUintImpl {.inline.} =
+  ## Modulo operation for multi-precision unsigned int
+  divmod(x,y).rem
+
+
+# ######################################################################
+# Division implementations
+#
+# Division is the most costly operation
+# And also of critical importance for cryptography applications
+
+# ##### Research #####
+
+# Overview of division algorithms:
+# - https://gmplib.org/manual/Division-Algorithms.html#Division-Algorithms
+# - https://gmplib.org/~tege/division-paper.pdf
+# - Comparison of fast division algorithms for large integers: http://bioinfo.ict.ac.cn/~dbu/AlgorithmCourses/Lectures/Hasselstrom2003.pdf
+
+# Libdivide has implementations faster than hardware division if repeatedly dividing by the same number
+# - http://libdivide.com/documentation.html
+# - https://github.com/ridiculousfish/libdivide/blob/master/libdivide.h
+# Furthermore libdivide also has branchless implementations
+
+# Implementation: we use recursive fast division by Burnikel and Ziegler.
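+#
+# A commented-out sanity sketch of divmod above (assumes MpUintImpl[uint64]
+# as used in the rest of this file; kept as a comment, like the school
+# division reference at the end of this file):
+#
+# when isMainModule:
+#   let
+#     x = MpUintImpl[uint64](hi: 1'u64, lo: 0'u64)   # 2^64
+#     y = MpUintImpl[uint64](hi: 0'u64, lo: 3'u64)   # 3
+#   let (q, r) = divmod(x, y)
+#   # 2^64 == 3 * 6148914691236517205 + 1
+#   assert q.hi == 0'u64 and q.lo == 6148914691236517205'u64
+#   assert r.hi == 0'u64 and r.lo == 1'u64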
+#
+# It is built upon the divide-and-conquer algorithms that can be found in:
+# - Hacker's delight: http://www.hackersdelight.org/hdcodetxt/divDouble.c.txt
+# - Libdivide
+# - Code project: https://www.codeproject.com/Tips/785014/UInt-Division-Modulus
+# - Cuda-uint128 (unfinished): https://github.com/curtisseizert/CUDA-uint128/blob/master/cuda_uint128.h
+# - Mpdecimal: https://github.com/status-im/nim-decimal/blob/9b65e95299cb582b14e0ae9a656984a2ce0bab03/decimal/mpdecimal_wrapper/generated/basearith.c#L305-L412
+
+# Description of recursive fast division by Burnikel and Ziegler (http://www.mpi-sb.mpg.de/~ziegler/TechRep.ps.gz):
+# - Python implementation: https://bugs.python.org/file11060/fast_div.py and discussion https://bugs.python.org/issue3451
+# - C++ implementation: https://github.com/linbox-team/givaro/blob/master/src/kernel/recint/rudiv.h
+# - The Handbook of Elliptic and Hyperelliptic Cryptography Algorithm 10.35 on page 188 has a more explicit version of the div2NxN algorithm. This algorithm is directly recursive and avoids the mutual recursion of the original paper's calls between div2NxN and div3Nx2N.
+
+# Other libraries that can be used as reference for alternative (?) implementations:
+# - TTMath: https://github.com/status-im/nim-ttmath/blob/8f6ff2e57b65a350479c4012a53699e262b19975/src/headers/ttmathuint.h#L1530-L2383
+# - LibTomMath: https://github.com/libtom/libtommath
+# - Google Abseil: https://github.com/abseil/abseil-cpp/tree/master/absl/numeric
+# - Crypto libraries like libsecp256k1, OpenSSL, ... though they are not generic (uint256 only, for example)
+# Note: GMP/MPFR are GPL. The papers can be used but not their code.
+
+# ######################################################################
+# School division
+
+# proc divmod*(x, y: MpUintImpl): tuple[quot, rem: MpUintImpl] {.noSideEffect.}=
+#   ## Division for multi-precision unsigned int
+#   ## Returns quotient + remainder in a (quot, rem) tuple
+#   #
+#   # Implementation through binary shift division
+#   if unlikely(y.isZero):
+#     raise newException(DivByZeroError, "You attempted to divide by zero")
+
+#   type SubTy = type x.lo
+
+#   var
+#     shift = x.bit_length - y.bit_length
+#     d = y shl shift
+
+#   result.rem = x
+
+#   while shift >= 0:
+#     result.quot += result.quot
+#     if result.rem >= d:
+#       result.rem -= d
+#       result.quot.lo = result.quot.lo or one(SubTy)
+
+#     d = d shr 1
+#     dec(shift)
diff --git a/src/uint_public.nim b/src/uint_public.nim
index 83adac5..ba75329 100644
--- a/src/uint_public.nim
+++ b/src/uint_public.nim
@@ -43,13 +43,16 @@ template make_binary_inplace(op): untyped =
   export op
 
 import ./private/uint_binary_ops
-import ./private/primitive_divmod
 
 make_binary(`+`, MpUint)
 make_binary_inplace(`+=`)
 make_binary(`-`, MpUint)
 make_binary_inplace(`-=`)
 make_binary(`*`, MpUint)
+
+import ./private/primitive_divmod,
+       ./private/uint_division
+
 make_binary(`div`, MpUint)
 make_binary(`mod`, MpUint)
 proc divmod*(x, y: MpUint): tuple[quot, rem: MpUint] {.noInit, inline, noSideEffect.} =
diff --git a/tests/test_bitwise.nim b/tests/test_bitwise.nim
index 0929492..10f2a72 100644
--- a/tests/test_bitwise.nim
+++ b/tests/test_bitwise.nim
@@ -16,6 +16,11 @@ suite "Testing bitwise operations":
   let z = 10000'u16
   assert cast[uint16](b) == z, "Test cannot proceed, something is wrong with the multiplication implementation"
 
+  let u = 10000.initMpUint(64)
+  let v = 10000'u64
+  let clz = 30
+
   test "Shift left - by less than half the size of the integer":
     check: cast[uint16](b) == z # Sanity check
     check: cast[uint16](b shl 2) == z shl 2
@@ -24,10 +29,18 @@ suite "Testing bitwise operations":
     check: cast[uint16](b) == z # Sanity check
     check: cast[uint16](b shl 10) == z shl 10
 
+    check: cast[uint64](u shl clz) == v shl clz
+
   test "Shift left - by half the size of the integer":
     check: cast[uint16](b) == z # Sanity check
     check: cast[uint16](b shl 8) == z shl 8
 
+    block: # Testing shl for nested MpUintImpl
+      let p2_64 = MpUintImpl[uint64](hi: 1, lo: 0)
+      let p = 1.initMpUint(128) shl 64
+
+      check: p == cast[MpUint[128]](p2_64)
+
   test "Shift right - by less than half the size of the integer":
     check: cast[uint16](b) == z # Sanity check
     check: cast[uint16](b shr 2) == z shr 2
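
For reference, a minimal standalone sketch (not part of the patch) of the branch
structure the rewritten `shl` relies on, with `Pair` standing in for
MpUintImpl[uint32] and halfSize fixed at 32. The dedicated `y == halfSize` branch
matters because shifting a 32-bit value by 32 is undefined in C, which Nim
compiles to:

type Pair = object
  hi, lo: uint32

func `shl`(x: Pair, y: int): Pair =
  const halfSize = 32
  if y == 0:
    result = x                  # shifting by 0 must be the identity
  elif y == halfSize:
    result.hi = x.lo            # the whole low half moves up; low half clears
  elif y < halfSize:
    result.hi = (x.hi shl y) or (x.lo shr (halfSize - y))
    result.lo = x.lo shl y
  else:
    result.hi = x.lo shl (y - halfSize)

when isMainModule:
  # Crossing the half boundary: bit 31 of `lo` must land in bit 0 of `hi`.
  let s = Pair(hi: 0'u32, lo: 0x8000_0001'u32) shl 1
  assert s.hi == 1'u32 and s.lo == 2'u32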