diff --git a/benchmarks/bench_mod.nim b/benchmarks/bench_mod.nim index 0f79b68..57fabb7 100644 --- a/benchmarks/bench_mod.nim +++ b/benchmarks/bench_mod.nim @@ -15,29 +15,33 @@ echo "Warmup: " & $(stop - start) & "s" #################################### +let a = [123'u64, 123'u64, 123'u64, 123'u64] +let m = [456'u64, 456'u64, 456'u64, 45'u64] + +let aU256 = cast[Stuint[256]](a) +let mU256 = cast[Stuint[256]](m) start = cpuTime() block: - var foo = 123.u256 + var foo = aU256 for i in 0 ..< 10_000_000: - foo += i.u256 * i.u256 mod 456.u256 - foo = foo mod 789.u256 + foo += (foo * foo) mod mU256 stop = cpuTime() echo "Library: " & $(stop - start) & "s" when defined(bench_ttmath): # need C++ - import ttmath + import ttmath, ../tests/ttmath_compat template tt_u256(a: int): UInt[256] = ttmath.u256(a.uint) start = cpuTime() block: - var foo = 123.tt_u256 + var foo = a.astt() + let mU256 = m.astt() for i in 0 ..< 10_000_000: - foo += i.tt_u256 * i.tt_u256 mod 456.tt_u256 - foo = foo mod 789.tt_u256 + foo += (foo * foo) mod mU256 stop = cpuTime() echo "TTMath: " & $(stop - start) & "s" diff --git a/stint/private/uint_addsub.nim b/stint/private/uint_addsub.nim index f037c1b..6821d75 100644 --- a/stint/private/uint_addsub.nim +++ b/stint/private/uint_addsub.nim @@ -8,8 +8,6 @@ # at your option. This file may not be copied, modified, or distributed except according to those terms. import - # Status lib - stew/bitops2, # Internal ./datatypes, ./primitives/addcarry_subborrow diff --git a/stint/private/uint_div.nim b/stint/private/uint_div.nim index aaa8606..333234c 100644 --- a/stint/private/uint_div.nim +++ b/stint/private/uint_div.nim @@ -13,9 +13,33 @@ import # Internal ./datatypes, ./uint_bitwise, - ./uint_shift, ./primitives/[addcarry_subborrow, extended_precision] +# Helpers +# -------------------------------------------------------- + +func usedBitsAndWords(a: openArray[Word]): tuple[bits, words: int] {.inline.} = + ## Returns the number of used words and bits in a bigInt + var clz = 0 + # Count Leading Zeros + for i in countdown(a.len-1, 0): + let count = log2trunc(a[i]) + # debugEcho "count: ", count, ", a[", i, "]: ", a[i].toBin(64) + if count == -1: + clz += WordBitWidth + else: + clz += WordBitWidth - count - 1 + return (a.len*WordBitWidth - clz, i+1) + +func copyWords( + a: var openArray[Word], startA: int, + b: openArray[Word], startB: int, + numWords: int) = + ## Copy a slice of B into A. This properly deals + ## with overlaps when A and B are slices of the same buffer + for i in countdown(numWords-1, 0): + a[startA+i] = b[startB+i] + # Division # -------------------------------------------------------- @@ -37,45 +61,18 @@ func shortDiv*(a: var Limbs, k: Word): Word = # Undo normalization result = result shr clz -# func binaryShiftDiv[qLen, rLen, uLen, vLen: static int]( -# q: var Limbs[qLen], -# r: var Limbs[rLen], -# u: Limbs[uLen], -# v: Limbs[vLen]) = -# ## Division for multi-precision unsigned uint -# ## Implementation through binary shift division -# doAssert y.isZero.not() # This should be checked on release mode in the divmod caller proc - -# type SubTy = type x.lo - -# var -# shift = y.leadingZeros - x.leadingZeros -# d = y shl shift - -# r = x - -# while shift >= 0: -# q += q -# if r >= d: -# r -= d -# q.lo = q.lo or one(SubTy) - -# d = d shr 1 -# dec(shift) - -func knuthDivLE( - q: var StUint, - r: var StUint, - u: StUint, - v: StUint, - needRemainder: bool) = - ## Compute the quotient and remainder (if needed) - ## of the division of u by v +func shlAddMod_multi(a: var openArray[Word], c: Word, + M: openArray[Word], mBits: int): Word = + ## Fused modular left-shift + add + ## Shift input `a` by a word and add `c` modulo `M` + ## + ## Specialized for M being a multi-precision integer. ## - ## - q must be of size uLen - vLen + 1 (assuming u and v uses all words) - ## - r must be of size vLen (assuming v uses all words) - ## - uLen >= vLen + ## With a word W = 2^WordBitWidth and a modulus M + ## Does a <- a * W + c (mod M) + ## and returns q = (a * W + c ) / M ## +<<<<<<< HEAD ## For now only LittleEndian is implemented # # Resources at the bottom of the file @@ -187,25 +184,15 @@ const BinaryShiftThreshold = 8 # If the difference in bit-length is below 8 # binary shift is probably faster func divmod(q, r: var Stuint, -<<<<<<< HEAD - x: Limbs[xLen], y: Limbs[yLen], needRemainder: bool) = -======= x, y: Stuint, needRemainder: bool) = ->>>>>>> 88858a7 (uint division - compile and pass the single limb tests) let x_clz = x.leadingZeros() let y_clz = y.leadingZeros() # We short-circuit division depending on special-cases. -<<<<<<< HEAD - if unlikely(y.isZero): - raise newException(DivByZeroDefect, "You attempted to divide by zero") - elif y_clz == (bitsof(y) - 1): -======= if unlikely(y.isZero()): raise newException(DivByZeroError, "You attempted to divide by zero") elif y_clz == (y.bits - 1): ->>>>>>> 88858a7 (uint division - compile and pass the single limb tests) # y is one q = x # elif (x.hi or y.hi).isZero: @@ -225,28 +212,163 @@ func divmod(q, r: var Stuint, r = x # elif (y_clz - x_clz) < BinaryShiftThreshold: # binaryShiftDiv(x, y, result.quot, result.rem) + ## The modulus `M` most-significant bit at `mBits` MUST be set. + + # Assuming 64-bit words + let hi = a[^1] # Save the high word to detect carries + let R = mBits and (WordBitWidth - 1) # R = mBits mod 64 + + var a0, a1, m0: Word + if R == 0: # If the number of mBits is a multiple of 64 + a0 = a[^1] # + copyWords(a, 1, a, 0, a.len-1) # we can just shift words + a[0] = c # and replace the first one by c + a1 = a[^1] + m0 = M[^1] + else: # Else: need to deal with partial word shifts at the edge. + let clz = WordBitWidth-R + a0 = (a[^1] shl clz) or (a[^2] shr R) + copyWords(a, 1, a, 0, a.len-1) + a[0] = c + a1 = (a[^1] shl clz) or (a[^2] shr R) + m0 = (M[^1] shl clz) or (M[^2] shr R) + + # m0 has its high bit set. (a0, a1)/m0 fits in a limb. + # Get a quotient q, at most we will be 2 iterations off + # from the true quotient + var q: Word # Estimate quotient + if a0 == m0: # if a_hi == divisor + q = high(Word) # quotient = MaxWord (0b1111...1111) + elif a0 == 0 and a1 < m0: # elif q == 0, true quotient = 0 + q = 0 else: - knuthDivLE(q, r, x, y, needRemainder) + var r: Word + div2n1n(q, r, a0, a1, m0) # else instead of being of by 0, 1 or 2 + q -= 1 # we return q-1 to be off by -1, 0 or 1 + + # Now substract a*2^64 - q*m + var carry = Word(0) + var overM = true # Track if quotient greater than the modulus + + for i in 0 ..< M.len: + var qm_lo: Word + block: # q*m + # q * p + carry (doubleword) carry from previous limb + muladd1(carry, qm_lo, q, M[i], carry) + + block: # a*2^64 - q*m + var borrow: Borrow + subB(borrow, a[i], a[i], qm_lo, Borrow(0)) + carry += Word(borrow) # Adjust if borrow + + if a[i] != M[i]: + overM = a[i] > M[i] + + # Fix quotient, the true quotient is either q-1, q or q+1 + # + # if carry < q or carry == q and overM we must do "a -= M" + # if carry > hi (negative result) we must do "a += M" + if carry > hi: + var c = Carry(0) + for i in 0 ..< a.len: + addC(c, a[i], a[i], M[i], c) + q -= 1 + elif overM or (carry < hi): + var b = Borrow(0) + for i in 0 ..< a.len: + subB(b, a[i], a[i], M[i], b) + q += 1 + + return q + +func shlAddMod(a: var openArray[Word], c: Word, + M: openArray[Word], mBits: int): Word {.inline.}= + ## Fused modular left-shift + add + ## Shift input `a` by a word and add `c` modulo `M` + ## + ## With a word W = 2^WordBitWidth and a modulus M + ## Does a <- a * W + c (mod M) + ## and returns q = (a * W + c ) / M + ## + ## The modulus `M` most-significant bit at `mBits` MUST be set. + if mBits <= WordBitWidth: + # If M fits in a single limb + + # We normalize M with clz so that the MSB is set + # And normalize (a * 2^64 + c) by R as well to maintain the result + # This ensures that (a0, a1)/p0 fits in a limb. + let R = mBits and (WordBitWidth - 1) + let clz = WordBitWidth-R + + # (hi, lo) = a * 2^64 + c + let hi = (a[0] shl clz) or (c shr R) + let lo = c shl clz + let m0 = M[0] shl clz + + var q, r: Word + div2n1n(q, r, hi, lo, m0) + a[0] = r shr clz + return q + else: + return shlAddMod_multi(a, c, M, mBits) + +func divRemImpl( + q, r: var openArray[Word], + a, b: openArray[Word] + ) = + let (aBits, aLen) = usedBitsAndWords(a) + let (bBits, bLen) = usedBitsAndWords(b) + let rLen = bLen + + if aBits < bBits: + # if a uses less bits than b, + # a < b, so q = 0 and r = a + copyWords(r, 0, a, 0, aLen) + for i in aLen ..< r.len: # r.len >= rLen + r[i] = 0 + for i in 0 ..< q.len: + q[i] = 0 + else: + # The length of a is at least the divisor + # We can copy bLen-1 words + # and modular shift-lef-add the rest + let aOffset = aLen - bLen + copyWords(r, 0, a, aOffset+1, bLen-1) + r[rLen-1] = 0 + # Now shift-left the copied words while adding the new word mod b + for i in countdown(aOffset, 0): + q[i] = shlAddMod( + r.toOpenArray(0, rLen-1), + a[i], + b.toOpenArray(0, bLen-1), + bBits + ) + + # Clean up extra words + for i in aOffset+1 ..< q.len: + q[i] = 0 + for i in rLen ..< r.len: + r[i] = 0 func `div`*(x, y: Stuint): Stuint {.inline.} = ## Division operation for multi-precision unsigned uint var tmp{.noInit.}: Stuint - divmod(result, tmp, x, y, needRemainder = false) + divRemImpl(result.limbs, tmp.limbs, x.limbs, y.limbs) func `mod`*(x, y: Stuint): Stuint {.inline.} = ## Remainder operation for multi-precision unsigned uint var tmp{.noInit.}: Stuint - divmod(tmp, result, x, y, needRemainder = true) + divRemImpl(tmp.limbs, result.limbs, x.limbs, y.limbs) func divmod*(x, y: Stuint): tuple[quot, rem: Stuint] = ## Division and remainder operations for multi-precision unsigned uint - divmod(result.quot, result.rem, x, y, needRemainder = true) + divRemImpl(result.quot.limbs, result.rem.limbs, x.limbs, y.limbs) # ###################################################################### # Division implementations # # Multi-precision division is a costly -#and also difficult to implement operation +# and also difficult to implement operation # ##### Research ##### diff --git a/stint/private/uint_shift.nim b/stint/private/uint_shift.nim index a1441b0..3d50abf 100644 --- a/stint/private/uint_shift.nim +++ b/stint/private/uint_shift.nim @@ -54,33 +54,14 @@ func shrWords*(r: var Limbs, a: Limbs, w: SomeInteger) = when cpuEndian == littleEndian: for i in 0 ..< Limbs.len-w: r[i] = a[i+w] + for i in Limbs.len-w ..< Limbs.len: + r[i] = 0 else: + for i in countdown(Limbs.len-1, Limbs.len-w): + r[i] = 0 for i in countdown(Limbs.len-w, 0): r[i] = a[i+w] -func shlSmallOverflowing*[rLen, aLen: static int]( - r: var Limbs[rLen], a: Limbs[aLen], k: SomeInteger) = - ## Compute the `shift left` operation of x and k - ## - ## k MUST be less than the base word size (2^32 or 2^64) - when cpuEndian == littleEndian: - r[0] = a[0] shl k - for i in 1 ..< a.len: - r[i] = (a[i] shl k) or (a[i-1] shr (WordBitWidth - k)) - if rLen > aLen: - r[aLen] = a[aLen - 1] shr (WordBitWidth - k) - for i in aLen+1 ..< rLen: - r[i] = 0 - else: - const offset = rLen - aLen - r[^1] = a[^1] shl k - for i in countdown(a.len-2, 0): - r[i+offset] = (a[i] shl k) or (a[i+1] shr (WordBitWidth - k)) - if rLen > aLen: - r[offset-1] = a[0] shr (WordBitWidth - k) - for i in 0 ..< offset-1: - r[i] = 0 - func shlSmall*(r: var Limbs, a: Limbs, k: SomeInteger) = ## Compute the `shift left` operation of x and k ## @@ -112,10 +93,14 @@ func shlLarge*(r: var Limbs, a: Limbs, w, shift: SomeInteger) = func shlWords*(r: var Limbs, a: Limbs, w: SomeInteger) = ## Shift left by w word when cpuEndian == littleEndian: + for i in 0 ..< w: + r[i] = 0 for i in 0 ..< Limbs.len-w: r[i+w] = a[i] else: - for i in countdown(Limbs.len-1, 0): + for i in countdown(Limbs.len-1, Limbs.len-w): + r[i] = 0 + for i in countdown(Limbs.len-w-1, 0): r[i] = a[i-w] # Wrappers @@ -123,6 +108,10 @@ func shlWords*(r: var Limbs, a: Limbs, w: SomeInteger) = func shiftRight*(r: var Stuint, a: Stuint, k: SomeInteger) = ## Shift `a` right by k bits and store in `r` + if k == 0: + r = a + return + if k < WordBitWidth: r.limbs.shrSmall(a.limbs, k) return @@ -138,6 +127,10 @@ func shiftRight*(r: var Stuint, a: Stuint, k: SomeInteger) = func shiftLeft*(r: var Stuint, a: Stuint, k: SomeInteger) = ## Shift `a` left by k bits and store in `r` + if k == 0: + r = a + return + if k < WordBitWidth: r.limbs.shlSmall(a.limbs, k) r.clearExtraBits() diff --git a/stint/uintops.nim b/stint/uintops.nim index 738a1b9..f196bd2 100644 --- a/stint/uintops.nim +++ b/stint/uintops.nim @@ -26,7 +26,7 @@ export StUint func setZero*(a: var StUint) = ## Set ``a`` to 0 for i in 0 ..< a.limbs.len: - a[i] = 0 + a.limbs[i] = 0 func setSmallInt(a: var StUint, k: Word) = ## Set ``a`` to k