From 6eeba3d41a431040c5603a9f2c76b0075cc4c98f Mon Sep 17 00:00:00 2001
From: Mamy Ratsimbazafy
Date: Fri, 20 Apr 2018 11:13:47 +0200
Subject: [PATCH] Division: Fix recursive divide-and-conquer (#20)

* Simplify div2n1n
* Revert borrow detection, needed a cleverer scheme.
* Getting inspired by uint128 didn't work for the recursive case. Use the recursive algo from the get-go
* Fix shl bug ... (needs fuzz testing)
* divmod fixed for single nesting (?)
* Almost there
* Fix one part of div3n2n
* Division is wooorrrrkkinnnggg :fire:
* Fix compilation for the nested version
* Forgot to not multiply the size by 8
* Add another failing shift test
* Fix countLeadingZero for nested uint
* Cleanup: remove debugEcho
* Move debug utils in a specific folder
* Fix forward declaration
* Move division to its own file
---
 src/debug/debugutils.nim         |  44 +++++
 src/private/bithacks.nim         |  11 +-
 src/private/stdlib_bitops.nim    |   2 +-
 src/private/uint_binary_ops.nim  | 190 +------------------
 src/private/uint_bitwise_ops.nim |  45 +++--
 src/private/uint_division.nim    | 310 +++++++++++++++++++++++++++++++
 src/uint_public.nim              |   5 +-
 tests/test_bitwise.nim           |  13 ++
 8 files changed, 415 insertions(+), 205 deletions(-)
 create mode 100644 src/debug/debugutils.nim
 create mode 100644 src/private/uint_division.nim

diff --git a/src/debug/debugutils.nim b/src/debug/debugutils.nim
new file mode 100644
index 0000000..2b3f1ef
--- /dev/null
+++ b/src/debug/debugutils.nim
@@ -0,0 +1,44 @@
+# Mpint
+# Copyright 2018 Status Research & Development GmbH
+# Licensed under either of
+#
+#  * Apache License, version 2.0, ([LICENSE-APACHE](LICENSE-APACHE) or http://www.apache.org/licenses/LICENSE-2.0)
+#  * MIT license ([LICENSE-MIT](LICENSE-MIT) or http://opensource.org/licenses/MIT)
+#
+# at your option. This file may not be copied, modified, or distributed except according to those terms.
+
+# Utilities to debug MpInt
+
+import
+  strutils,
+  ../private/[uint_type, size_mpuintimpl]
+
+func tohexBE*[T: uint8 or uint16 or uint32 or uint64](x: T): string =
+  ## Stringify a uint to hex, most significant byte on the left,
+  ## e.g. 1.uint64 is 0000000000000001
+
+  let bytes = cast[ptr array[T.sizeof, byte]](x.unsafeaddr)
+
+  result = ""
+  when system.cpuEndian == littleEndian:
+    for i in countdown(T.sizeof - 1, 0):
+      result.add toHex(bytes[i])
+  else:
+    for i in 0 ..< T.sizeof:
+      result.add toHex(bytes[i])
+
+func tohexBE*(x: MpUintImpl): string =
+  ## Stringify a uint to hex, most significant byte on the left,
+  ## e.g. (2.uint128)^64 + 1 is 00000000000000010000000000000001
+
+  const size = size_mpuintimpl(x) div 8
+
+  let bytes = cast[ptr array[size, byte]](x.unsafeaddr)
+
+  result = ""
+  when system.cpuEndian == littleEndian:
+    for i in countdown(size - 1, 0):
+      result.add toHex(bytes[i])
+  else:
+    for i in 0 ..< size:
+      result.add toHex(bytes[i])
diff --git a/src/private/bithacks.nim b/src/private/bithacks.nim
index bf50436..7c80bb9 100644
--- a/src/private/bithacks.nim
+++ b/src/private/bithacks.nim
@@ -7,7 +7,7 @@
 #
 # at your option. This file may not be copied, modified, or distributed except according to those terms.
 
-import ./uint_type, stdlib_bitops
+import ./uint_type, stdlib_bitops, size_mpuintimpl
 
 # We reuse bitops from the Nim standard lib and optimize it further on x86.
 # On x86, clz is implemented as bitscanreverse then a xor, and we need to xor/sub again.
@@ -57,6 +57,11 @@ proc bit_length*(n: MpUintImpl): int {.noSideEffect.}=
 
 proc countLeadingZeroBits*(x: MpUintImpl): int {.inline, nosideeffect.} =
   ## Returns the number of leading zero bits in integer.
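+  ## When the high half is all zeroes, its clz saturates at half the total
+  ## width (clz(0) now returns the full bit width, per the stdlib_bitops
+  ## change below), so the low half's count is added on top.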
+
+  const maxHalfRepr = size_mpuintimpl(x) div 2
+  let hi_clz = x.hi.countLeadingZeroBits
-  result = if hi_clz == 0: x.lo.countLeadingZeroBits
-           else: hi_clz
+
+  result = if hi_clz == maxHalfRepr:
+             x.lo.countLeadingZeroBits + maxHalfRepr
+           else: hi_clz
diff --git a/src/private/stdlib_bitops.nim b/src/private/stdlib_bitops.nim
index c56df67..6d3d165 100644
--- a/src/private/stdlib_bitops.nim
+++ b/src/private/stdlib_bitops.nim
@@ -97,7 +97,7 @@ proc countLeadingZeroBits*(x: SomeInteger): int {.inline, nosideeffect.} =
   # when noUndefined:
   if x == 0:
-    return 0
+    return sizeof(x) * 8
   when nimvm:
     when sizeof(x) <= 4: result = sizeof(x)*8 - 1 - fastlog2_nim(x.uint32)
diff --git a/src/private/uint_binary_ops.nim b/src/private/uint_binary_ops.nim
index 8768206..bf06540 100644
--- a/src/private/uint_binary_ops.nim
+++ b/src/private/uint_binary_ops.nim
@@ -7,7 +7,7 @@
 #
 # at your option. This file may not be copied, modified, or distributed except according to those terms.
 
-import ./bithacks, ./conversion, ./stdlib_bitops,
+import ./bithacks, ./conversion,
        ./uint_type,
        ./uint_comparison,
        ./uint_bitwise_ops,
@@ -35,8 +35,9 @@ proc `-=`*(x: var MpUintImpl, y: MpUintImpl) {.noSideEffect, inline.}=
   # Optimized assembly should contain the sbb instruction (subtract with borrow)
   # Clang on macOS does so with the -d:release switch and MpUint[uint32] (uint64)
   type SubTy = type x.lo
+  let original = x.lo
   x.lo -= y.lo
-  x.hi -= (x.lo <= not y.lo).toSubtype(SubTy) + y.hi
+  x.hi -= (original < x.lo).toSubtype(SubTy) + y.hi
 
 proc `-`*(x, y: MpUintImpl): MpUintImpl {.noSideEffect, noInit, inline.}=
   # Subtraction for multi-precision unsigned int
@@ -49,7 +50,7 @@
 proc naiveMulImpl[T: MpUintImpl](x, y: T): MpUintImpl[T] {.noSideEffect, noInit, inline.}
   # Forward declaration
 
-proc naiveMul[T: BaseUint](x, y: T): MpUintImpl[T] {.noSideEffect, noInit, inline.}=
+proc naiveMul*[T: BaseUint](x, y: T): MpUintImpl[T] {.noSideEffect, noInit, inline.}=
   ## Naive multiplication algorithm with extended precision
 
   const size = size_mpuintimpl(x)
@@ -116,186 +117,3 @@ proc `*`*(x, y: MpUintImpl): MpUintImpl {.noSideEffect, noInit.}=
   # For T * T --> 2T (uint64 * uint64 --> uint128) we use extra precision multiplication
   result = naiveMul(x.lo, y.lo)
   result.hi += (naiveMul(x.hi, y.lo) + naiveMul(x.lo, y.hi)).lo
-
-
-# ################### Division ################### #
-from ./primitive_divmod import divmod
-
-proc divmod*(x, y: MpUintImpl): tuple[quot, rem: MpUintImpl] {.noSideEffect.}
-
-proc div2n1n[T: BaseUint](x_hi, x_lo, y: T): tuple[quot, rem: T] {.noSideEffect, noInit.} =
-  const
-    size = size_mpuintimpl(x_hi)
-    halfSize = size div 2
-    halfMask = (one(T) shl halfSize) - one(T)
-    base = one(T) shl halfSize
-
-  if unlikely(x_hi >= y):
-    raise newException(ValueError, "Division overflow")
-
-  let clz = countLeadingZeroBits(y) # We assume that for 0 clz returns 0
-
-  # normalization, shift so that the MSB is at 2^n
-  let xn = MpUintImpl[T](hi: x_hi, lo: x_lo) shl clz
-  let yn = y shl clz
-
-  # Break divisor in 2 and dividend in 4
-  let yn_hi = yn shr halfSize
-  let yn_lo = yn and halfMask
-
-  let xnlohi = xn.lo shr halfSize
-  let xnlolo = xn.lo and halfMask
-
-  # First half of the quotient
-  var (q1, r) = divmod(xn.hi, yn_hi)
-
-  while (q1 >= base) or ((q1 * yn_lo) > (base * r + xnlohi)):
-    q1 -= one(T)
-    r += yn_hi
-    if r >= base:
-      break
-
-  # Remove it
-  let xn_rest = xn.hi shl halfSize + xnlohi - (q1 * yn)
-
-  # Second half
-  var (q2, s) = divmod(xn_rest, yn_hi)
-
-  var
-    q2_ynlo = q2 * yn_lo
-    sbase_xnlolo = (s shl halfSize) or xnlolo
-
-  while (q2 >= base) or (q2_ynlo > sbase_xnlolo):
-    q2 -= one(T)
-    s += yn_hi
-    if s >= base:
-      break
-    else:
-      q2_ynlo -= yn_lo
-      sbase_xnlolo = (s shl halfSize) or xnlolo
-
-
-  result.quot = (q1 shl halfSize) or q2
-  result.rem = ((xn_rest shl halfSize) + xnlolo - q2 * yn) shr clz
-
-proc divmod*(x, y: MpUintImpl): tuple[quot, rem: MpUintImpl] {.noInit, noSideEffect.}=
-
-  # Using divide and conquer algorithm.
-  if y.hi.isZero:
-    if x.hi < y.lo: # Bit length of quotient x/y < bit_length(MpUintImpl) / 2
-      (result.quot.lo, result.rem.lo) = div2n1n(x.hi, x.lo, y.lo)
-    else: # Quotient can overflow the subtype so we split work
-      (result.quot.hi, result.rem.hi) = divmod(x.hi, y.hi)
-      (result.quot.lo, result.rem.lo) = div2n1n(result.rem.hi, x.lo, y.lo)
-      result.rem.hi = zero(type result.rem.hi)
-    return
-
-  const
-    size = size_mpuintimpl(x)
-    halfSize = size div 2
-
-  block:
-    # Normalization of divisor
-    let clz = countLeadingZeroBits(x.hi)
-    let yn = (y shl clz)
-
-    # Prevent overflows
-    let xn = x shr 1
-
-    # Get the quotient
-    block:
-      let (qlo, _) = div2n1n(xn.hi, xn.lo, yn.hi)
-      result.quot.lo = qlo
-
-    # Undo normalization
-    result.quot = result.quot shr (halfSize - 1 - clz) # -1 to correct for xn shift
-
-    if not result.quot.isZero:
-      result.quot -= one(type result.quot)
-    # Quotient is correct or too small by one
-    # We will fix that once we know the remainder
-
-    # Remainder
-    result.rem = x - (y * result.quot)
-
-    # Fix quotient and remainder if we're off by one
-    if result.rem >= y:
-      # one more division round
-      result.quot += one(type result.quot)
-      result.rem -= y
-
-proc `div`*(x, y: MpUintImpl): MpUintImpl {.inline, noSideEffect.} =
-  ## Division operation for multi-precision unsigned int
-  divmod(x,y).quot
-
-proc `mod`*(x, y: MpUintImpl): MpUintImpl {.inline, noSideEffect.} =
-  ## Modulo operation for multi-precision unsigned int
-  divmod(x,y).rem
-
-
-# ######################################################################
-# Division implementations
-#
-# Division is the most costly operation
-# And also of critical importance for cryptography applications
-
-# ##### Research #####
-
-# Overview of division algorithms:
-# - https://gmplib.org/manual/Division-Algorithms.html#Division-Algorithms
-# - https://gmplib.org/~tege/division-paper.pdf
-# - Comparison of fast division algorithms for large integers: http://bioinfo.ict.ac.cn/~dbu/AlgorithmCourses/Lectures/Hasselstrom2003.pdf
-
-# Libdivide has implementations faster than hardware division if repeatedly dividing by the same number
-# - http://libdivide.com/documentation.html
-# - https://github.com/ridiculousfish/libdivide/blob/master/libdivide.h
-# Furthermore libdivide also has branchless implementations
-
-# Current implementation
-# Currently we use the divide and conquer algorithm. Implementations can be found in:
-# - Hacker's delight: http://www.hackersdelight.org/hdcodetxt/divDouble.c.txt
-# - Libdivide
-# - Code project: https://www.codeproject.com/Tips/785014/UInt-Division-Modulus
-# - Cuda-uint128 (unfinished): https://github.com/curtisseizert/CUDA-uint128/blob/master/cuda_uint128.h
-# - Mpdecimal: https://github.com/status-im/nim-decimal/blob/9b65e95299cb582b14e0ae9a656984a2ce0bab03/decimal/mpdecimal_wrapper/generated/basearith.c#L305-L412
-
-# Probably the most efficient algorithm that can benefit from the MpUInt recursive data structure is
-# the recursive fast division by Burnikel and Ziegler (http://www.mpi-sb.mpg.de/~ziegler/TechRep.ps.gz):
-# - Python implementation: https://bugs.python.org/file11060/fast_div.py and discussion https://bugs.python.org/issue3451
-# - C++ implementation: https://github.com/linbox-team/givaro/blob/master/src/kernel/recint/rudiv.h
-# - The Handbook of Elliptic and Hyperelliptic Cryptography Algorithm 10.35 on page 188 has a more explicit version of the div2NxN algorithm. This algorithm is directly recursive and avoids the mutual recursion of the original paper's calls between div2NxN and div3Nx2N.
-
-# Other libraries that can be used as reference for alternative (?) implementations:
-# - TTMath: https://github.com/status-im/nim-ttmath/blob/8f6ff2e57b65a350479c4012a53699e262b19975/src/headers/ttmathuint.h#L1530-L2383
-# - LibTomMath: https://github.com/libtom/libtommath
-# - Google Abseil: https://github.com/abseil/abseil-cpp/tree/master/absl/numeric
-# - Crypto libraries like libsecp256k1, OpenSSL, ... though they are not generic (uint256 only, for example)
-# Note: GMP/MPFR are GPL. The papers can be used but not their code.
-
-# ######################################################################
-# School division
-
-# proc divmod*(x, y: MpUintImpl): tuple[quot, rem: MpUintImpl] {.noSideEffect.}=
-#   ## Division for multi-precision unsigned int
-#   ## Returns quotient + remainder in a (quot, rem) tuple
-#   #
-#   # Implementation through binary shift division
-#   if unlikely(y.isZero):
-#     raise newException(DivByZeroError, "You attempted to divide by zero")
-
-#   type SubTy = type x.lo
-
-#   var
-#     shift = x.bit_length - y.bit_length
-#     d = y shl shift
-
-#   result.rem = x
-
-#   while shift >= 0:
-#     result.quot += result.quot
-#     if result.rem >= d:
-#       result.rem -= d
-#       result.quot.lo = result.quot.lo or one(SubTy)
-
-#     d = d shr 1
-#     dec(shift)
-
diff --git a/src/private/uint_bitwise_ops.nim b/src/private/uint_bitwise_ops.nim
index 9918cc1..aadc0bf 100644
--- a/src/private/uint_bitwise_ops.nim
+++ b/src/private/uint_bitwise_ops.nim
@@ -10,43 +10,60 @@
 import ./uint_type, ./size_mpuintimpl, ./conversion
 
-proc `not`*(x: MpUintImpl): MpUintImpl {.noInit, noSideEffect, inline.}=
+func `not`*(x: MpUintImpl): MpUintImpl {.noInit, inline.}=
   ## Bitwise complement of unsigned integer x
   result.lo = not x.lo
   result.hi = not x.hi
 
-proc `or`*(x, y: MpUintImpl): MpUintImpl {.noInit, noSideEffect, inline.}=
+func `or`*(x, y: MpUintImpl): MpUintImpl {.noInit, inline.}=
   ## `Bitwise or` of numbers x and y
   result.lo = x.lo or y.lo
   result.hi = x.hi or y.hi
 
-proc `and`*(x, y: MpUintImpl): MpUintImpl {.noInit, noSideEffect, inline.}=
+func `and`*(x, y: MpUintImpl): MpUintImpl {.noInit, inline.}=
   ## `Bitwise and` of numbers x and y
   result.lo = x.lo and y.lo
   result.hi = x.hi and y.hi
 
-proc `xor`*(x, y: MpUintImpl): MpUintImpl {.noInit, noSideEffect, inline.}=
+func `xor`*(x, y: MpUintImpl): MpUintImpl {.noInit, inline.}=
   ## `Bitwise xor` of numbers x and y
   result.lo = x.lo xor y.lo
   result.hi = x.hi xor y.hi
 
-proc `shl`*(x: MpUintImpl, y: SomeInteger): MpUintImpl {.inline, noSideEffect.}=
+func `shr`*(x: MpUintImpl, y: SomeInteger): MpUintImpl {.inline.}
+  # Forward declaration
+
+func `shl`*(x: MpUintImpl, y: SomeInteger): MpUintImpl {.inline.}=
   ## Compute the `shift left` operation of x and y
   # Note: inlining this poses a codegen/aliasing issue when doing `x = x shl 1`
+
+  # TODO: would it be better to reimplement this using an array of bytes/uint64?
+  #       That opens up endianness issues.
   const halfSize = size_mpuintimpl(x) div 2
+  let defect = halfSize - int(y)
-  result.hi = (x.hi shl y) or (x.lo shl (y - halfSize))
-  if y < halfSize:
+  if y == 0:
+    return x
+  elif y == halfSize:
+    result.hi = x.lo
+  elif y < halfSize:
+    result.hi = (x.hi shl y) or (x.lo shr (halfSize - y))
     result.lo = x.lo shl y
+  else:
+    result.hi = x.lo shl (y - halfSize)
 
-proc `shr`*(x: MpUintImpl, y: SomeInteger): MpUintImpl {.inline, noSideEffect.}=
+func `shr`*(x: MpUintImpl, y: SomeInteger): MpUintImpl {.inline.}=
   ## Compute the `shift right` operation of x and y
   const halfSize = size_mpuintimpl(x) div 2
-  let overflow = y < halfSize
-
-  result.lo = (x.lo shr y) or (
-    if overflow: x.hi shl (halfSize - y) else: x.hi shr (y - halfSize)
-  )
-  if overflow:
+  if y == 0:
+    return x
+  elif y == halfSize:
+    result.lo = x.hi
+  elif y < halfSize:
+    result.lo = (x.lo shr y) or (x.hi shl (halfSize - y))
     result.hi = x.hi shr y
+  else:
+    result.lo = x.hi shr (y - halfSize)
+
diff --git a/src/private/uint_division.nim b/src/private/uint_division.nim
new file mode 100644
index 0000000..e195383
--- /dev/null
+++ b/src/private/uint_division.nim
@@ -0,0 +1,310 @@
+# Mpint
+# Copyright 2018 Status Research & Development GmbH
+# Licensed under either of
+#
+#  * Apache License, version 2.0, ([LICENSE-APACHE](LICENSE-APACHE) or http://www.apache.org/licenses/LICENSE-2.0)
+#  * MIT license ([LICENSE-MIT](LICENSE-MIT) or http://opensource.org/licenses/MIT)
+#
+# at your option. This file may not be copied, modified, or distributed except according to those terms.
+
+import ./bithacks, ./conversion,
+       ./uint_type,
+       ./uint_comparison,
+       ./uint_bitwise_ops,
+       ./uint_binary_ops,
+       ./size_mpuintimpl,
+       ./primitive_divmod
+
+# ################### Division ################### #
+# We use the following algorithm:
+#  - Fast recursive division by Burnikel and Ziegler
+
+########################################################################################################################
+##                                                                                                                    ##
+## Grade school division, but with (very) large digits, dividing [a1,a2,a3,a4] by [b1,b2]:                            ##
+##                                                                                                                    ##
+##    +----+----+----+----+       +----+----+       +----+                                                            ##
+##    | a1 | a2 | a3 | a4 |   /   | b1 | b2 |   =   | q1 |   DivideThreeHalvesByTwo(a1a2, a3, b1b2, n, q1, r1r2)      ##
+##    +----+----+----+----+       +----+----+       +----+                                                            ##
+##         +--------------+            |               |                                                              ##
+##         |  b1b2 * q1   |            |               |                                                              ##
+##         +--------------+            |               |                                                              ##
+##  -      ================            v               |                                                              ##
+##    +----+----+----+            +----+----+          | +----+                                                       ##
+##    | r1 | r2 | a4 |     /      | b1 | b2 |   =      | | q2 |   DivideThreeHalvesByTwo(r1r2, a4, b1b2, n, q2, r1r2) ##
+##    +----+----+----+            +----+----+          | +----+                                                       ##
+##         +--------------+                            |    |                                                         ##
+##         |  b1b2 * q2   |                            |    |                                                         ##
+##         +--------------+                            |    |                                                         ##
+##  -      ================                            v    v                                                         ##
+##    +----+----+                                   +----+----+                                                       ##
+##    | r1 | r2 |                                   | q1 | q2 |   r1r2 = a1a2a3a4 mod b1b2, q1q2 = a1a2a3a4 div b1b2  ##
+##    +----+----+                                   +----+----+                                                       ##
+##                                                                                                                    ##
+## Note: in the diagram above, a1, b1, q1, r1 etc. are the most significant "digits" of their numbers.                ##
+##                                                                                                                    ##
+########################################################################################################################
+
+func div2n1n[T: SomeUnsignedInt](q, r: var T, n_hi, n_lo, d: T) {.inline.}
+func div2n1n(q, r: var MpUintImpl, ah, al, b: MpUintImpl) {.inline.}
+  # Forward declaration
+
+func div3n2n[T]( q, r1, r0: var MpUintImpl[T],
+                 a2, a1, a0: MpUintImpl[T],
+                 b1, b0: MpUintImpl[T]) {.inline.}=
+  mixin div2n1n
+
+  type T = type q
+
+  var
+    c: T
+    carry: bool
+
+  if a2 < b1:
+    div2n1n(q, c, a2, a1, b1)
+  else:
+    q = zero(type q) - one(type q) # We want 0xFFFFF ....
+    c = a1 + b1
+    if c < a1:
+      carry = true
+
+  let
+    d = naiveMul(q, b0)
+    b = MpUintImpl[type c](hi: b1, lo: b0)
+
+  var r = MpUintImpl[type c](hi: c, lo: a0) - d
+
+  if (not carry) and (d > r):
+    q -= one(type q)
+    r += b
+
+  if r > b:
+    q -= one(type q)
+    r += b
+
+  r1 = r.hi
+  r0 = r.lo
+
+template sub_ddmmss[T](sh, sl, ah, al, bh, bl: T) =
+  sl = al - bl
+  sh = ah - bh - (al < bl).T
+
+func lo[T: SomeUnsignedInt](x: T): T {.inline.} =
+  const
+    p = T.sizeof * 8 div 2
+    base = 1.T shl p
+    mask = base - 1
+  result = x and mask
+
+func hi[T: SomeUnsignedInt](x: T): T {.inline.} =
+  const
+    p = T.sizeof * 8 div 2
+  result = x shr p
+
+func umul_ppmm[T](w1, w0: var T, u, v: T) =
+
+  const
+    p = (T.sizeof * 8 div 2)
+    base = 1.T shl p
+
+  var
+    x0, x1, x2, x3: T
+
+  let
+    ul = u.lo
+    uh = u.hi
+    vl = v.lo
+    vh = v.hi
+
+  x0 = ul * vl
+  x1 = ul * vh
+  x2 = uh * vl
+  x3 = uh * vh
+
+  x1 += x0.hi    # This can't carry
+  x1 += x2       # but this can
+  if x1 < x2:    # if carry, add it to x3
+    x3 += base
+
+  w1 = x3 + x1.hi
+  w0 = (x1 shl p) + x0.lo
+
+proc div3n2n( q, r1, r0: var SomeUnsignedInt,
+              a2, a1, a0: SomeUnsignedInt,
+              b1, b0: SomeUnsignedInt) {.inline.}=
+  mixin div2n1n
+
+  type T = type q
+
+  var
+    c, d1, d0: T
+    carry: bool
+
+  if a2 < b1:
+    div2n1n(q, c, a2, a1, b1)
+
+  else:
+    q = 0.T - 1.T # We want 0xFFFFF ....
+    c = a1 + b1
+    if c < a1:
+      carry = true
+
+  umul_ppmm(d1, d0, q, b0)
+  sub_ddmmss(r1, r0, c, a0, d1, d0)
+
+  if (not carry) and ((d1 > c) or ((d1 == c) and (d0 > a0))):
+    q -= 1.T
+    r0 += b0
+    r1 += b1
+    if r0 < b0:
+      inc r1
+
+  if (r1 > b1) or ((r1 == b1) and (r0 >= b0)):
+    q -= 1.T
+    r0 += b0
+    r1 += b1
+    if r0 < b0:
+      inc r1
+
+func div2n1n(q, r: var MpUintImpl, ah, al, b: MpUintImpl) {.inline.} =
+
+  # assert countLeadingZeroBits(b) == 0, "Divisor was not normalized"
+
+  var s: MpUintImpl
+  div3n2n(q.hi, s.hi, s.lo, ah.hi, ah.lo, al.hi, b.hi, b.lo)
+  div3n2n(q.lo, r.hi, r.lo, s.hi, s.lo, al.lo, b.hi, b.lo)
+
+func div2n1n[T: SomeUnsignedInt](q, r: var T, n_hi, n_lo, d: T) {.inline.} =
+
+  # assert countLeadingZeroBits(d) == 0, "Divisor was not normalized"
+
+  const
+    size = size_mpuintimpl(q)
+    halfSize = size div 2
+    halfMask = (1.T shl halfSize) - 1.T
+
+  template halfQR(n_hi, n_lo, d_hi, d_lo: T): tuple[q, r: T] =
+
+    var (q, r) = divmod(n_hi, d_hi)
+    let m = q * d_lo
+    r = (r shl halfSize) or n_lo
+
+    # Fix the remainder; we're at most 2 iterations off
+    if r < m:
+      q -= 1.T
+      r += d_hi
+      if r >= d_hi and r < m:
+        q -= 1.T
+        r += d_hi
+    r -= m
+    (q, r)
+
+  let
+    d_hi = d shr halfSize
+    d_lo = d and halfMask
+    n_lohi = n_lo shr halfSize
+    n_lolo = n_lo and halfMask
+
+  # First half of the quotient
+  let (q1, r1) = halfQR(n_hi, n_lohi, d_hi, d_lo)
+
+  # Second half
+  let (q2, r2) = halfQR(r1, n_lolo, d_hi, d_lo)
+
+  q = (q1 shl halfSize) or q2
+  r = r2
+
+func divmod*[T](x, y: MpUintImpl[T]): tuple[quot, rem: MpUintImpl[T]] =
+
+  # Normalization
+  assert y.isZero.not()
+
+  const halfSize = size_mpuintimpl(x) div 2
+  let clz = countLeadingZeroBits(y)
+
+  let
+    xx = MpUintImpl[type x](lo: x) shl clz
+    yy = y shl clz
+
+  # Compute
+  div2n1n(result.quot, result.rem, xx.hi, xx.lo, yy)
+
+  # Undo normalization
+  result.rem = result.rem shr clz
+
+func `div`*(x, y: MpUintImpl): MpUintImpl {.inline.} =
+  ## Division operation for multi-precision unsigned int
+  divmod(x,y).quot
+
+func `mod`*(x, y: MpUintImpl): MpUintImpl {.inline.} =
+  ## Modulo operation for multi-precision unsigned int
+  divmod(x,y).rem
+
+
+# ######################################################################
+# Division implementations
+#
+# Division is the most costly operation
+# And also of critical importance for cryptography applications
+
+# ##### Research #####
+
+# Overview of division algorithms:
+# - https://gmplib.org/manual/Division-Algorithms.html#Division-Algorithms
+# - https://gmplib.org/~tege/division-paper.pdf
+# - Comparison of fast division algorithms for large integers: http://bioinfo.ict.ac.cn/~dbu/AlgorithmCourses/Lectures/Hasselstrom2003.pdf
+
+# Libdivide has implementations faster than hardware division if repeatedly dividing by the same number
+# - http://libdivide.com/documentation.html
+# - https://github.com/ridiculousfish/libdivide/blob/master/libdivide.h
+# Furthermore libdivide also has branchless implementations
+
+# Implementation: we use recursive fast division by Burnikel and Ziegler.
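+#
+# A commented-out sanity sketch of divmod above (assumes MpUintImpl[uint64]
+# as used in the rest of this file; kept as a comment, like the school
+# division reference at the end of this file):
+#
+# when isMainModule:
+#   let
+#     x = MpUintImpl[uint64](hi: 1'u64, lo: 0'u64)   # 2^64
+#     y = MpUintImpl[uint64](hi: 0'u64, lo: 3'u64)   # 3
+#   let (q, r) = divmod(x, y)
+#   # 2^64 == 3 * 6148914691236517205 + 1
+#   assert q.hi == 0'u64 and q.lo == 6148914691236517205'u64
+#   assert r.hi == 0'u64 and r.lo == 1'u64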
+#
+# It is built upon the divide-and-conquer algorithms that can be found in:
+# - Hacker's delight: http://www.hackersdelight.org/hdcodetxt/divDouble.c.txt
+# - Libdivide
+# - Code project: https://www.codeproject.com/Tips/785014/UInt-Division-Modulus
+# - Cuda-uint128 (unfinished): https://github.com/curtisseizert/CUDA-uint128/blob/master/cuda_uint128.h
+# - Mpdecimal: https://github.com/status-im/nim-decimal/blob/9b65e95299cb582b14e0ae9a656984a2ce0bab03/decimal/mpdecimal_wrapper/generated/basearith.c#L305-L412
+
+# Description of recursive fast division by Burnikel and Ziegler (http://www.mpi-sb.mpg.de/~ziegler/TechRep.ps.gz):
+# - Python implementation: https://bugs.python.org/file11060/fast_div.py and discussion https://bugs.python.org/issue3451
+# - C++ implementation: https://github.com/linbox-team/givaro/blob/master/src/kernel/recint/rudiv.h
+# - The Handbook of Elliptic and Hyperelliptic Cryptography Algorithm 10.35 on page 188 has a more explicit version of the div2NxN algorithm. This algorithm is directly recursive and avoids the mutual recursion of the original paper's calls between div2NxN and div3Nx2N.
+
+# Other libraries that can be used as reference for alternative (?) implementations:
+# - TTMath: https://github.com/status-im/nim-ttmath/blob/8f6ff2e57b65a350479c4012a53699e262b19975/src/headers/ttmathuint.h#L1530-L2383
+# - LibTomMath: https://github.com/libtom/libtommath
+# - Google Abseil: https://github.com/abseil/abseil-cpp/tree/master/absl/numeric
+# - Crypto libraries like libsecp256k1, OpenSSL, ... though they are not generic (uint256 only, for example)
+# Note: GMP/MPFR are GPL. The papers can be used but not their code.
+
+# ######################################################################
+# School division
+
+# proc divmod*(x, y: MpUintImpl): tuple[quot, rem: MpUintImpl] {.noSideEffect.}=
+#   ## Division for multi-precision unsigned int
+#   ## Returns quotient + remainder in a (quot, rem) tuple
+#   #
+#   # Implementation through binary shift division
+#   if unlikely(y.isZero):
+#     raise newException(DivByZeroError, "You attempted to divide by zero")
+
+#   type SubTy = type x.lo
+
+#   var
+#     shift = x.bit_length - y.bit_length
+#     d = y shl shift
+
+#   result.rem = x
+
+#   while shift >= 0:
+#     result.quot += result.quot
+#     if result.rem >= d:
+#       result.rem -= d
+#       result.quot.lo = result.quot.lo or one(SubTy)
+
+#     d = d shr 1
+#     dec(shift)
diff --git a/src/uint_public.nim b/src/uint_public.nim
index 83adac5..ba75329 100644
--- a/src/uint_public.nim
+++ b/src/uint_public.nim
@@ -43,13 +43,16 @@ template make_binary_inplace(op): untyped =
   export op
 
 import ./private/uint_binary_ops
-import ./private/primitive_divmod
 
 make_binary(`+`, MpUint)
 make_binary_inplace(`+=`)
 make_binary(`-`, MpUint)
 make_binary_inplace(`-=`)
 make_binary(`*`, MpUint)
+
+import ./private/primitive_divmod,
+       ./private/uint_division
+
 make_binary(`div`, MpUint)
 make_binary(`mod`, MpUint)
 proc divmod*(x, y: MpUint): tuple[quot, rem: MpUint] {.noInit, inline, noSideEffect.} =
diff --git a/tests/test_bitwise.nim b/tests/test_bitwise.nim
index 0929492..10f2a72 100644
--- a/tests/test_bitwise.nim
+++ b/tests/test_bitwise.nim
@@ -16,6 +16,11 @@ suite "Testing bitwise operations":
   let z = 10000'u16
   assert cast[uint16](b) == z, "Test cannot proceed, something is wrong with the multiplication implementation"
 
+  let u = 10000.initMpUint(64)
+  let v = 10000'u64
+  let clz = 30
+
   test "Shift left - by less than half the size of the integer":
     check: cast[uint16](b) == z # Sanity check
     check: cast[uint16](b shl 2) == z shl 2
@@ -24,10 +29,18 @@ suite "Testing bitwise operations":
     check: cast[uint16](b) == z # Sanity check
     check: cast[uint16](b shl 10) == z shl 10
 
+    check: cast[uint64](u shl clz) == v shl clz
+
   test "Shift left - by half the size of the integer":
     check: cast[uint16](b) == z # Sanity check
     check: cast[uint16](b shl 8) == z shl 8
 
+    block: # Testing shl for nested MpUintImpl
+      let p2_64 = MpUintImpl[uint64](hi: 1, lo: 0)
+      let p = 1.initMpUint(128) shl 64
+
+      check: p == cast[MpUint[128]](p2_64)
+
   test "Shift right - by less than half the size of the integer":
     check: cast[uint16](b) == z # Sanity check
     check: cast[uint16](b shr 2) == z shr 2
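
For reference, a minimal standalone sketch (not part of the patch) of the branch
structure the rewritten `shl` relies on, with `Pair` standing in for
MpUintImpl[uint32] and halfSize fixed at 32. The dedicated `y == halfSize` branch
matters because shifting a 32-bit value by 32 is undefined in C, which Nim
compiles to:

type Pair = object
  hi, lo: uint32

func `shl`(x: Pair, y: int): Pair =
  const halfSize = 32
  if y == 0:
    result = x                  # shifting by 0 must be the identity
  elif y == halfSize:
    result.hi = x.lo            # the whole low half moves up; low half clears
  elif y < halfSize:
    result.hi = (x.hi shl y) or (x.lo shr (halfSize - y))
    result.lo = x.lo shl y
  else:
    result.hi = x.lo shl (y - halfSize)

when isMainModule:
  # Crossing the half boundary: bit 31 of `lo` must land in bit 0 of `hi`.
  let s = Pair(hi: 0'u32, lo: 0x8000_0001'u32) shl 1
  assert s.hi == 1'u32 and s.lo == 2'u32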