uint division - compile and pass the single limb tests

parent c2ed8a4bc2
commit 53d2fd14f3
@@ -180,9 +180,9 @@ macro staticFor*(idx: untyped{nkIdent}, start, stopEx: static int, body: untyped
 # Copy
 # --------------------------------------------------------

-func copyFrom*[dLen, sLen](
-       dst: var SomeBigInteger[dLen],
-       src: SomeBigInteger[sLen]
+func copyFrom*(
+       dst: var SomeBigInteger,
+       src: SomeBigInteger
      ){.inline.} =
   ## Copy a BigInteger, truncated to 2^slen if the source
   ## is larger than the destination
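The truncating-copy semantics documented above (the destination keeps the source modulo its own width) can be pictured as a word-by-word loop. A minimal standalone sketch with illustrative types, not stint's actual SomeBigInteger:

    # Sketch: copy the limbs that fit, zero-extend the rest
    # (little-endian limb order, least significant word first).
    type Limbs[N: static int] = array[N, uint64]

    func copyFromSketch[dLen, sLen: static int](dst: var Limbs[dLen], src: Limbs[sLen]) =
      for i in 0 ..< dLen:
        if i < sLen:
          dst[i] = src[i]   # copy what fits
        else:
          dst[i] = 0        # zero-extend a smaller source

    var d: Limbs[2]
    let s: Limbs[4] = [1'u64, 2, 3, 4]
    d.copyFromSketch(s)     # truncation modulo 2^128
    doAssert d == [1'u64, 2]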
@@ -80,7 +80,7 @@ func mul_nim*(hi, lo: var uint64, u, v: uint64) =
   hi = x3 + hi(x1)
   lo = merge(x1, lo(x0))

-func muladd1*(hi, lo: var uint64, a, b, c: uint64) {.inline.} =
+func muladd1_nim*(hi, lo: var uint64, a, b, c: uint64) {.inline.} =
   ## Extended precision multiplication + addition
   ## (hi, lo) <- a*b + c
   ##
@@ -91,7 +91,7 @@ func muladd1*(hi, lo: var uint64, a, b, c: uint64) {.inline.} =
   addC_nim(carry, lo, lo, c, 0)
   addC_nim(carry, hi, hi, 0, carry)

-func muladd2*(hi, lo: var uint64, a, b, c1, c2: uint64) {.inline.}=
+func muladd2_nim*(hi, lo: var uint64, a, b, c1, c2: uint64) {.inline.}=
   ## Extended precision multiplication + addition + addition
   ## (hi, lo) <- a*b + c1 + c2
   ##
@@ -107,3 +107,48 @@ func muladd2*(hi, lo: var uint64, a, b, c1, c2: uint64) {.inline.}=
   # Carry chain 2
   addC_nim(carry2, lo, lo, c2, 0)
   addC_nim(carry2, hi, hi, 0, carry2)
+
+func div2n1n_nim*[T: SomeUnsignedInt](q, r: var T, n_hi, n_lo, d: T) =
+  ## Division uint128 by uint64
+  ## Warning ⚠️ :
+  ##   - if n_hi == d, the quotient does not fit in a uint64 and will throw SIGFPE
+  ##   - if n_hi > d, the result is undefined
+
+  # doAssert leadingZeros(d) == 0, "Divisor was not normalized"
+
+  const
+    size = sizeof(q) * 8
+    halfSize = size div 2
+    halfMask = (1.T shl halfSize) - 1.T
+
+  template halfQR(n_hi, n_lo, d, d_hi, d_lo: T): tuple[q, r: T] =
+
+    var (q, r) = (n_hi div d_hi, n_hi mod d_hi)
+    let m = q * d_lo
+    r = (r shl halfSize) or n_lo
+
+    # Fix the remainder, we're at most 2 iterations off
+    if r < m:
+      dec q
+      r += d
+      if r >= d and r < m:
+        dec q
+        r += d
+    r -= m
+    (q, r)
+
+  let
+    d_hi = d shr halfSize
+    d_lo = d and halfMask
+    n_lohi = n_lo shr halfSize
+    n_lolo = n_lo and halfMask
+
+  # First half of the quotient
+  let (q1, r1) = halfQR(n_hi, n_lohi, d, d_hi, d_lo)
+
+  # Second half
+  let (q2, r2) = halfQR(r1, n_lolo, d, d_hi, d_lo)
+
+  q = (q1 shl halfSize) or q2
+  r = r2
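A quick sanity check of div2n1n_nim against a case small enough to verify by hand: with d = 2^63 (normalized, MSB set) and n = 2^64 + 5, the quotient is 2 and the remainder 5. Hypothetical usage, assuming the function is imported from this module:

    # n = n_hi * 2^64 + n_lo = 2^64 + 5, d = 2^63 -> q = 2, r = 5
    var q, r: uint64
    div2n1n_nim(q, r, 1'u64, 5'u64, 0x8000_0000_0000_0000'u64)
    doAssert q == 2 and r == 5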
@@ -73,19 +73,57 @@ func muladd2*(hi, lo: var uint32, a, b, c1, c2: uint32) {.inline.}=
 # ############################################################

 when sizeof(int) == 8 and not defined(Stint32):
-  when nimvm:
-    from ./compiletime_fallback import mul_nim, muladd1, muladd2
-  else:
-    when defined(vcc):
-      from ./extended_precision_x86_64_msvc import div2n1n, mul, muladd1, muladd2
-    elif GCCCompatible:
-      when X86:
-        from ./extended_precision_x86_64_gcc import div2n1n
-        from ./extended_precision_64bit_uint128 import mul, muladd1, muladd2
-      else:
-        from ./extended_precision_64bit_uint128 import div2n1n, mul, muladd1, muladd2
-  export div2n1n, mul
-  export muladd1, muladd2
+  from ./compiletime_fallback import div2n1n_nim, mul_nim, muladd1_nim, muladd2_nim
+
+  when defined(vcc):
+    from ./extended_precision_x86_64_msvc import div2n1n_128, mul_128, muladd1_128, muladd2_128
+  elif GCCCompatible:
+    when X86:
+      from ./extended_precision_x86_64_gcc import div2n1n_128
+      from ./extended_precision_64bit_uint128 import mul_128, muladd1_128, muladd2_128
+    else:
+      from ./extended_precision_64bit_uint128 import div2n1n_128, mul_128, muladd1_128, muladd2_128
+
+  func mul*(hi, lo: var uint64, u, v: uint64) {.inline.}=
+    ## Extended precision multiplication
+    ## (hi, lo) <- u * v
+    when nimvm:
+      mul_nim(hi, lo, u, v)
+    else:
+      mul_128(hi, lo, u, v)
+
+  func muladd1*(hi, lo: var uint64, a, b, c: uint64) {.inline.}=
+    ## Extended precision multiplication + addition
+    ## (hi, lo) <- a*b + c
+    ##
+    ## Note: 0xFFFFFFFF_FFFFFFFF² -> (hi: 0xFFFFFFFFFFFFFFFE, lo: 0x0000000000000001)
+    ##       so adding any c cannot overflow
+    when nimvm:
+      muladd1_nim(hi, lo, a, b, c)
+    else:
+      muladd1_128(hi, lo, a, b, c)
+
+  func muladd2*(hi, lo: var uint64, a, b, c1, c2: uint64) {.inline.}=
+    ## Extended precision multiplication + addition + addition
+    ## (hi, lo) <- a*b + c1 + c2
+    ##
+    ## Note: 0xFFFFFFFF_FFFFFFFF² -> (hi: 0xFFFFFFFFFFFFFFFE, lo: 0x0000000000000001)
+    ##       so adding 0xFFFFFFFFFFFFFFFF leads to (hi: 0xFFFFFFFFFFFFFFFF, lo: 0x0000000000000000)
+    ##       and we have enough space to add again 0xFFFFFFFFFFFFFFFF without overflowing
+    when nimvm:
+      muladd2_nim(hi, lo, a, b, c1, c2)
+    else:
+      muladd2_128(hi, lo, a, b, c1, c2)
+
+  func div2n1n*(q, r: var uint64, n_hi, n_lo, d: uint64) {.inline.}=
+    ## Division uint128 by uint64
+    ## Warning ⚠️ :
+    ##   - if n_hi == d, the quotient does not fit in a uint64 and will throw SIGFPE
+    ##   - if n_hi > d, the result is undefined
+    when nimvm:
+      div2n1n_nim(q, r, n_hi, n_lo, d)
+    else:
+      div2n1n_128(q, r, n_hi, n_lo, d)

 # ############################################################
 #
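The `when nimvm` wrappers above are the point of the `_nim`/`_128` split: the Nim VM cannot run {.emit.} C code or compiler intrinsics, so each public entry point picks the portable fallback during compile-time evaluation and the uint128/intrinsic version at runtime. A minimal standalone illustration of the dispatch pattern (hypothetical function, not from the library):

    # `when nimvm` selects the first branch during compile-time evaluation
    # (Nim VM) and the second branch in compiled code.
    func double(x: uint64): uint64 {.inline.} =
      when nimvm:
        x + x            # portable path, usable in `static:` contexts
      else:
        x shl 1          # stands in for an intrinsic-backed runtime path

    const ct = double(21)        # forced through the VM branch
    doAssert ct == 42
    doAssert double(21) == 42    # runtime branch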
@@ -128,10 +166,7 @@ func mulAcc*[T: uint32|uint64](t, u, v: var T, a, b: T) {.inline.} =
   ## (t, u, v) <- (t, u, v) + a * b
   var UV: array[2, T]
   var carry: Carry
-  when nimvm:
-    mul_nim(UV[1], UV[0], a, b)
-  else:
-    mul(UV[1], UV[0], a, b)
+  mul(UV[1], UV[0], a, b)
   addC(carry, v, v, UV[0], Carry(0))
   addC(carry, u, u, UV[1], carry)
   t += T(carry)
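mulAcc maintains a 3-word accumulator (t, u, v), the building block of product-scanning (Comba) multiplication: since a*b <= (2^64 - 1)^2, the double-word product plus incoming carries always fits in three words. A portable standalone sketch of the same accumulation, with the 64x64 -> 128 product computed via 32-bit halves:

    # Sketch: (t, u, v) <- (t, u, v) + a*b on 64-bit words, no intrinsics.
    func mulAccSketch(t, u, v: var uint64, a, b: uint64) =
      let
        aLo = a and 0xFFFF_FFFF'u64
        aHi = a shr 32
        bLo = b and 0xFFFF_FFFF'u64
        bHi = b shr 32
        ll = aLo * bLo
        lh = aLo * bHi
        hl = aHi * bLo
      # mid gathers both cross products plus the carry out of ll's high half
      let mid = (ll shr 32) + (lh and 0xFFFF_FFFF'u64) + (hl and 0xFFFF_FFFF'u64)
      let lo = (mid shl 32) or (ll and 0xFFFF_FFFF'u64)
      let hi = aHi * bHi + (lh shr 32) + (hl shr 32) + (mid shr 32)
      let v0 = v
      v += lo
      let s = hi + uint64(v < v0)   # hi <= 2^64 - 2 for a product: cannot wrap
      let u0 = u
      u += s
      t += uint64(u < u0)           # final carry lands in the top word

    var t, u, v: uint64
    mulAccSketch(t, u, v, 0xFFFF_FFFF_FFFF_FFFF'u64, 0xFFFF_FFFF_FFFF_FFFF'u64)
    doAssert v == 1'u64 and u == 0xFFFF_FFFF_FFFF_FFFE'u64 and t == 0'u64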
@@ -19,7 +19,7 @@ static:
   doAssert GCC_Compatible
   doAssert sizeof(int) == 8

-func div2n1n*(q, r: var uint64, n_hi, n_lo, d: uint64) {.inline.}=
+func div2n1n_128*(q, r: var uint64, n_hi, n_lo, d: uint64) {.inline.}=
   ## Division uint128 by uint64
   ## Warning ⚠️ :
   ##   - if n_hi == d, the quotient does not fit in a uint64 and will throw SIGFPE on some platforms
@@ -35,7 +35,7 @@ func div2n1n*(q, r: var uint64, n_hi, n_lo, d: uint64) {.inline.}=
   {.emit:["*",q, " = (NU64)(", dblPrec," / ", d, ");"].}
   {.emit:["*",r, " = (NU64)(", dblPrec," % ", d, ");"].}

-func mul*(hi, lo: var uint64, a, b: uint64) {.inline.} =
+func mul_128*(hi, lo: var uint64, a, b: uint64) {.inline.} =
   ## Extended precision multiplication
   ## (hi, lo) <- a*b
   block:
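These `_128` helpers lean on the GCC/Clang `unsigned __int128` type injected through {.emit.}; the C compiler then lowers the 128-bit multiply and divide to the right instruction sequences. A reduced self-contained sketch of the same pattern (GCC/Clang C backend only; NU64 is Nim's C alias for uint64):

    type uint128 {.importc: "unsigned __int128".} = object

    func mulWide(hi, lo: var uint64, a, b: uint64) {.inline.} =
      ## (hi, lo) <- a * b via the C compiler's 128-bit integers
      var dblPrec {.noInit.}: uint128
      {.emit: [dblPrec, " = (unsigned __int128)", a, " * (unsigned __int128)", b, ";"].}
      {.emit: ["*", hi, " = (NU64)(", dblPrec, " >> 64);"].}
      {.emit: ["*", lo, " = (NU64)", dblPrec, ";"].}

    var hi, lo: uint64
    mulWide(hi, lo, 0xFFFF_FFFF_FFFF_FFFF'u64, 2'u64)   # (2^64 - 1) * 2
    doAssert hi == 1'u64 and lo == 0xFFFF_FFFF_FFFF_FFFE'u64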
@@ -50,7 +50,7 @@ func mul*(hi, lo: var uint64, a, b: uint64) {.inline.} =
   {.emit:["*",hi, " = (NU64)(", dblPrec," >> ", 64'u64, ");"].}
   {.emit:["*",lo, " = (NU64)", dblPrec,";"].}

-func muladd1*(hi, lo: var uint64, a, b, c: uint64) {.inline.} =
+func muladd1_128*(hi, lo: var uint64, a, b, c: uint64) {.inline.} =
   ## Extended precision multiplication + addition
   ## (hi, lo) <- a*b + c
   ##
@@ -71,7 +71,7 @@ func muladd1*(hi, lo: var uint64, a, b, c: uint64) {.inline.} =
   {.emit:["*",hi, " = (NU64)(", dblPrec," >> ", 64'u64, ");"].}
   {.emit:["*",lo, " = (NU64)", dblPrec,";"].}

-func muladd2*(hi, lo: var uint64, a, b, c1, c2: uint64) {.inline.}=
+func muladd2_128*(hi, lo: var uint64, a, b, c1, c2: uint64) {.inline.}=
   ## Extended precision multiplication + addition + addition
   ## This is constant-time on most hardware except some specific cores like the Cortex-M0
   ## (hi, lo) <- a*b + c1 + c2
@@ -20,7 +20,7 @@ static:
   doAssert sizeof(int) == 8
   doAssert X86

-func div2n1n*(q, r: var uint64, n_hi, n_lo, d: uint64) {.inline.}=
+func div2n1n_128*(q, r: var uint64, n_hi, n_lo, d: uint64) {.inline.}=
   ## Division uint128 by uint64
   ## Warning ⚠️ :
   ##   - if n_hi == d, the quotient does not fit in a uint64 and will throw SIGFPE
@@ -38,35 +38,25 @@ func div2n1n*(q, r: var Ct[uint64], n_hi, n_lo, d: Ct[uint64]) {.inline.}=
   ## Warning ⚠️ :
   ##   - if n_hi == d, the quotient does not fit in a uint64 and will throw SIGFPE
   ##   - if n_hi > d, the result is undefined
   {.warning: "unsafeDiv2n1n is not constant-time at the moment on most hardware".}

   # TODO !!! - Replace by constant-time, portable, non-assembly version
   #            -> use uint128? Compiler might add unwanted branches
   q = udiv128(n_hi, n_lo, d, r)

-func mul*(hi, lo: var Ct[uint64], a, b: Ct[uint64]) {.inline.} =
+func mul_128*(hi, lo: var Ct[uint64], a, b: Ct[uint64]) {.inline.} =
   ## Extended precision multiplication
   ## (hi, lo) <- a*b
   ##
   ## This is constant-time on most hardware
   ## See: https://www.bearssl.org/ctmul.html
   lo = umul128(a, b, hi)

-func muladd1*(hi, lo: var Ct[uint64], a, b, c: Ct[uint64]) {.inline.} =
+func muladd1_128*(hi, lo: var Ct[uint64], a, b, c: Ct[uint64]) {.inline.} =
   ## Extended precision multiplication + addition
   ## (hi, lo) <- a*b + c
   ##
   ## Note: 0xFFFFFFFF_FFFFFFFF² -> (hi: 0xFFFFFFFFFFFFFFFE, lo: 0x0000000000000001)
   ##       so adding any c cannot overflow
   ##
   ## This is constant-time on most hardware
   ## See: https://www.bearssl.org/ctmul.html
   var carry: Carry
   lo = umul128(a, b, hi)
   addC(carry, lo, lo, c, Carry(0))
   addC(carry, hi, hi, 0, carry)

-func muladd2*(hi, lo: var Ct[uint64], a, b, c1, c2: Ct[uint64]) {.inline.}=
+func muladd2_128*(hi, lo: var Ct[uint64], a, b, c1, c2: Ct[uint64]) {.inline.}=
   ## Extended precision multiplication + addition + addition
   ## This is constant-time on most hardware except some specific cores like the Cortex-M0
   ## (hi, lo) <- a*b + c1 + c2
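The overflow headroom claimed in the muladd doc comments is easy to verify with wrapping uint64 arithmetic: (2^64 - 1)^2 = 2^128 - 2^65 + 1, i.e. hi = 2^64 - 2 and lo = 1, leaving room to add two more full-width words. A compile-time sketch of that bound:

    static:
      let maxU = 0xFFFF_FFFF_FFFF_FFFF'u64
      # low word of maxU * maxU (multiplication wraps modulo 2^64)
      doAssert maxU * maxU == 1'u64
      # adding one maxU to lo wraps to 0 and carries, bumping hi to 2^64 - 1;
      # hi then still has room for the carry of a second maxU
      doAssert 1'u64 + maxU == 0'u64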
@@ -63,11 +63,11 @@ func shortDiv*(a: var Limbs, k: Word): Word =
   #     d = d shr 1
   #     dec(shift)

-func knuthDivLE[qLen, rLen, uLen, vLen: static int](
-       q: var Limbs[qLen],
-       r: var Limbs[rLen],
-       u: Limbs[uLen],
-       v: Limbs[vLen],
+func knuthDivLE(
+       q: var StUint,
+       r: var StUint,
+       u: StUint,
+       v: StUint,
        needRemainder: bool) =
   ## Compute the quotient and remainder (if needed)
   ## of the division of u by v

@@ -80,6 +80,15 @@ func knuthDivLE[qLen, rLen, uLen, vLen: static int](
   #
   # Resources at the bottom of the file

+  const
+    qLen = q.limbs.len
+    rLen = r.limbs.len
+    uLen = u.limbs.len
+    vLen = v.limbs.len
+
+  template `[]`(a: Stuint, i: int): Word = a.limbs[i]
+  template `[]=`(a: Stuint, i: int, val: Word) = a.limbs[i] = val
+
   # Find the most significant word with actual set bits
   # and get the leading zero count there
   var divisorLen = vLen
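The two templates above let a StUint be indexed directly into its limbs, so the Knuth loops below read like array pseudocode. A stand-in illustration of the pattern with a hypothetical wrapper type:

    type
      Word = uint64
      Big[N: static int] = object
        limbs: array[N, Word]

    template `[]`(a: Big, i: int): Word = a.limbs[i]
    template `[]=`(a: var Big, i: int, val: Word) = a.limbs[i] = val

    var x: Big[4]
    x[0] = 42            # expands to x.limbs[0] = 42
    doAssert x[0] == 42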
@@ -96,7 +105,7 @@ func knuthDivLE[qLen, rLen, uLen, vLen: static int](
   # Divisor is a single word.
   if divisorLen == 1:
     q.copyFrom(u)
-    r.leastSignificantWord() = q.shortDiv(v.leastSignificantWord())
+    r.leastSignificantWord() = q.limbs.shortDiv(v.leastSignificantWord())
     # zero all but the least significant word
     var lsw = true
     for w in leastToMostSig(r):
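This is the single-limb fast path the commit title refers to: u is copied into q, shortDiv divides the limb array in place by the lone divisor word, and the returned word is the remainder. A standalone sketch of in-place short division, using uint32 limbs so each per-limb step fits in a native uint64:

    # a <- a div k (in place), returns a mod k; limbs are little-endian.
    func shortDivSketch(a: var openArray[uint32], k: uint32): uint32 =
      var remainder = 0'u64
      for i in countdown(a.len - 1, 0):        # most significant limb first
        let cur = (remainder shl 32) or a[i].uint64
        a[i] = uint32(cur div k.uint64)
        remainder = cur mod k.uint64
      uint32(remainder)

    var n = [0x89AB_CDEF'u32, 0x0123_4567'u32]   # n = 0x01234567_89ABCDEF
    let r = n.shortDivSketch(10)
    doAssert r == 5
    doAssert ((n[1].uint64 shl 32) or n[0].uint64) == 8198552921648689'u64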
@@ -111,8 +120,8 @@ func knuthDivLE[qLen, rLen, uLen, vLen: static int](

   # Normalize so that the divisor MSB is set,
   # vn cannot overflow, un can overflow by 1 word at most, hence uLen+1
-  un.shlSmallOverflowing(u, clz)
-  vn.shlSmall(v, clz)
+  un.shlSmallOverflowing(u.limbs, clz)
+  vn.shlSmall(v.limbs, clz)

   static: doAssert cpuEndian == littleEndian, "Currently the division algorithm requires little endian ordering of the limbs"
   # TODO: is it worth it to have the uint be the exact same extended precision representation
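Normalization is Knuth's step D1: shift both operands left until the divisor's most significant bit is set, the precondition that keeps each estimated quotient digit within 2 of the true one. A single-word illustration of why the shift is harmless (quotient unchanged, remainder scaled by 2^clz):

    import std/bitops   # countLeadingZeroBits

    let d = 0x1234_5678'u64
    let clz = countLeadingZeroBits(d)
    let dn = d shl clz                # normalized divisor, MSB set
    doAssert (dn shr 63) == 1

    # Valid while n shl clz does not overflow (the multi-limb code gives
    # the shifted numerator an extra limb for exactly this reason).
    let n = 500_000_000'u64
    doAssert n div d == (n shl clz) div dn
    doAssert n mod d == ((n shl clz) mod dn) shr clz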
@@ -161,24 +170,42 @@ func knuthDivLE[qLen, rLen, uLen, vLen: static int](
       q[j] -= 1
       var carry = Carry(0)
       for i in 0 ..< divisorLen:
-        addC(carry, u[j+i], u[j+i], v[i], carry)
+        addC(carry, un[j+i], un[j+i], v[i], carry)

   # Quotient is found, if remainder is needed we need to un-normalize un
   if needRemainder:
-    r.shrSmall(un, clz)
+    # r.limbs.shrSmall(un, clz) - TODO
+    when cpuEndian == littleEndian:
+      # rLen+1 == un.len
+      for i in 0 ..< rLen:
+        r[i] = (un[i] shr clz) or (un[i+1] shl (WordBitWidth - clz))
+    else:
+      {.error: "Not Implemented for bigEndian".}


 const BinaryShiftThreshold = 8  # If the difference in bit-length is below 8
                                 # binary shift is probably faster

 func divmod(q, r: var Stuint,
-            x: Limbs[xLen], y: Limbs[yLen], needRemainder: bool) =
+            x, y: Stuint, needRemainder: bool) =

   let x_clz = x.leadingZeros()
   let y_clz = y.leadingZeros()

   # We short-circuit division depending on special-cases.
-  if unlikely(y.isZero):
-    raise newException(DivByZeroDefect, "You attempted to divide by zero")
-  elif y_clz == (bitsof(y) - 1):
+  if unlikely(y.isZero()):
+    raise newException(DivByZeroError, "You attempted to divide by zero")
+  elif y_clz == (y.bits - 1):
     # y is one
     q = x
   # elif (x.hi or y.hi).isZero:
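The un-normalization loop above is the inverse of step D1: each remainder limb recombines two adjacent limbs of un, shifted right by clz. A standalone sketch of the two-limb recombination (valid for 0 < clz < word size; clz == 0 would shift by the full width and needs a plain copy instead):

    const WordBitWidth = 64
    # r gets un shifted right by clz; un has one more limb than r.
    func shrLimbs(r: var openArray[uint64], un: openArray[uint64], clz: int) =
      for i in 0 ..< r.len:
        r[i] = (un[i] shr clz) or (un[i+1] shl (WordBitWidth - clz))

    # 2^64 + 5 was normalized by shl 8 -> limbs [0x500, 0x100, 0]
    var un = [0x500'u64, 0x100'u64, 0'u64]
    var r = [0'u64, 0'u64]
    r.shrLimbs(un, 8)
    doAssert r == [5'u64, 1'u64]   # recovers 2^64 + 5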
@@ -209,7 +236,7 @@ func `div`*(x, y: Stuint): Stuint {.inline.} =
 func `mod`*(x, y: Stuint): Stuint {.inline.} =
   ## Remainder operation for multi-precision unsigned uint
   var tmp{.noInit.}: Stuint
-  divmod(tmp, result, x,y, needRemainder = true)
+  divmod(tmp, result, x, y, needRemainder = true)

 func divmod*(x, y: Stuint): tuple[quot, rem: Stuint] =
   ## Division and remainder operations for multi-precision unsigned uint
@@ -190,19 +190,21 @@ suite "Testing unsigned int division and modulo implementation":
     check: cast[uint64](qr.quot) == 7'u64
     check: cast[uint64](qr.rem) == 9'u64

-  test "Divmod(2^64, 3) returns the correct result":
-    let a = 1.stuint(128) shl 64
-    let b = 3.stuint(128)
-
-    let qr = divmod(a, b)
-
-    let q = cast[UintImpl[uint64]](qr.quot)
-    let r = cast[UintImpl[uint64]](qr.rem)
-
-    check: q.lo == 6148914691236517205'u64
-    check: q.hi == 0'u64
-    check: r.lo == 1'u64
-    check: r.hi == 0'u64
+  # TODO - no more .lo / .hi
+  #
+  # test "Divmod(2^64, 3) returns the correct result":
+  #   let a = 1.stuint(128) shl 64
+  #   let b = 3.stuint(128)
+  #
+  #   let qr = divmod(a, b)
+  #
+  #   let q = cast[UintImpl[uint64]](qr.quot)
+  #   let r = cast[UintImpl[uint64]](qr.rem)
+  #
+  #   check: q.lo == 6148914691236517205'u64
+  #   check: q.hi == 0'u64
+  #   check: r.lo == 1'u64
+  #   check: r.hi == 0'u64

   test "Divmod(1234567891234567890, 10) returns the correct result":
     let a = cast[StUint[64]](1234567891234567890'u64)
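The disabled test relied on the old recursive UintImpl[uint64] layout with .lo/.hi halves; under the new flat limbs representation the same checks could be expressed against the limb array. A hypothetical rewrite (little-endian limb order and a public `limbs` accessor are assumed, not a confirmed API):

    # Hypothetical port of the disabled test to the flat-limbs layout.
    test "Divmod(2^64, 3) returns the correct result":
      let a = 1.stuint(128) shl 64
      let b = 3.stuint(128)
      let qr = divmod(a, b)
      check: qr.quot.limbs[0] == 6148914691236517205'u64  # was q.lo
      check: qr.quot.limbs[1] == 0'u64                    # was q.hi
      check: qr.rem.limbs[0] == 1'u64                     # was r.lo
      check: qr.rem.limbs[1] == 0'u64                     # was r.hi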