uint division - compile and pass the single limb tests

This commit is contained in:
Mamy Ratsimbazafy 2022-01-22 01:42:54 +01:00 committed by jangko
parent c2ed8a4bc2
commit 53d2fd14f3
GPG Key ID: 31702AE10541E6B9
8 changed files with 163 additions and 64 deletions

View File

@@ -180,9 +180,9 @@ macro staticFor*(idx: untyped{nkIdent}, start, stopEx: static int, body: untyped
# Copy
# --------------------------------------------------------
func copyFrom*[dLen, sLen](
dst: var SomeBigInteger[dLen],
src: SomeBigInteger[sLen]
func copyFrom*(
dst: var SomeBigInteger,
src: SomeBigInteger
){.inline.} =
## Copy a BigInteger, truncating to the destination size
## if the source is larger than the destination
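For illustration, a hedged usage sketch of the truncating copy (the variable names are hypothetical; it assumes `StUint` instantiates `SomeBigInteger` and that the `stuint` initializer and `==` behave as elsewhere in the library):

  var dst: StUint[128]
  let src = (1.stuint(256) shl 200) + 42.stuint(256)
  dst.copyFrom(src)               # only the low 128 bits survive
  doAssert dst == 42.stuint(128)  # the 2^200 part was truncated away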

View File

@@ -80,7 +80,7 @@ func mul_nim*(hi, lo: var uint64, u, v: uint64) =
hi = x3 + hi(x1)
lo = merge(x1, lo(x0))
func muladd1*(hi, lo: var uint64, a, b, c: uint64) {.inline.} =
func muladd1_nim*(hi, lo: var uint64, a, b, c: uint64) {.inline.} =
## Extended precision multiplication + addition
## (hi, lo) <- a*b + c
##
@@ -91,7 +91,7 @@ func muladd1*(hi, lo: var uint64, a, b, c: uint64) {.inline.} =
addC_nim(carry, lo, lo, c, 0)
addC_nim(carry, hi, hi, 0, carry)
func muladd2*(hi, lo: var uint64, a, b, c1, c2: uint64) {.inline.}=
func muladd2_nim*(hi, lo: var uint64, a, b, c1, c2: uint64) {.inline.}=
## Extended precision multiplication + addition + addition
## (hi, lo) <- a*b + c1 + c2
##
@@ -107,3 +107,48 @@ func muladd2*(hi, lo: var uint64, a, b, c1, c2: uint64) {.inline.}=
# Carry chain 2
addC_nim(carry2, lo, lo, c2, 0)
addC_nim(carry2, hi, hi, 0, carry2)
func div2n1n_nim*[T: SomeUnsignedInt](q, r: var T, n_hi, n_lo, d: T) =
## Division of a uint128 by a uint64
## Warning ⚠️ :
## - if n_hi == d, the quotient does not fit in a uint64 and will throw SIGFPE
## - if n_hi > d, the result is undefined
# doAssert leadingZeros(d) == 0, "Divisor was not normalized"
const
size = sizeof(q) * 8
halfSize = size div 2
halfMask = (1.T shl halfSize) - 1.T
template halfQR(n_hi, n_lo, d, d_hi, d_lo: T): tuple[q,r: T] =
var (q, r) = (n_hi div d_hi, n_hi mod d_hi)
let m = q * d_lo
r = (r shl halfSize) or n_lo
# Fix the remainder, we're at most 2 iterations off
if r < m:
dec q
r += d
if r >= d and r < m:
dec q
r += d
r -= m
(q, r)
let
d_hi = d shr halfSize
d_lo = d and halfMask
n_lohi = n_lo shr halfSize
n_lolo = n_lo and halfMask
# First half of the quotient
let (q1, r1) = halfQR(n_hi, n_lohi, d, d_hi, d_lo)
# Second half
let (q2, r2) = halfQR(r1, n_lolo, d, d_hi, d_lo)
q = (q1 shl halfSize) or q2
r = r2
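A minimal, hand-computed sanity check (not from the original; note the divisor is normalized, i.e. its top bit is set, and n_hi < d as the warnings require):

  var q, r: uint64
  div2n1n_nim(q, r, 1'u64, 5'u64, 0x8000000000000003'u64)  # (2^64 + 5) divided by (2^63 + 3)
  doAssert q == 1'u64                   # 2*(2^63 + 3) already exceeds 2^64 + 5
  doAssert r == 0x8000000000000002'u64  # (2^64 + 5) - (2^63 + 3) = 2^63 + 2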

View File

@@ -73,19 +73,57 @@ func muladd2*(hi, lo: var uint32, a, b, c1, c2: uint32) {.inline.}=
# ############################################################
when sizeof(int) == 8 and not defined(Stint32):
when nimvm:
from ./compiletime_fallback import mul_nim, muladd1, muladd2
else:
from ./compiletime_fallback import div2n1n_nim, mul_nim, muladd1_nim, muladd2_nim
when defined(vcc):
from ./extended_precision_x86_64_msvc import div2n1n, mul, muladd1, muladd2
from ./extended_precision_x86_64_msvc import div2n1n_128, mul_128, muladd1_128, muladd2_128
elif GCCCompatible:
when X86:
from ./extended_precision_x86_64_gcc import div2n1n
from ./extended_precision_64bit_uint128 import mul, muladd1, muladd2
from ./extended_precision_x86_64_gcc import div2n1n_128
from ./extended_precision_64bit_uint128 import mul_128, muladd1_128, muladd2_128
else:
from ./extended_precision_64bit_uint128 import div2n1n, mul, muladd1, muladd2
export div2n1n, mul
export muladd1, muladd2
from ./extended_precision_64bit_uint128 import div2n1n_128, mul_128, muladd1_128, muladd2_128
func mul*(hi, lo: var uint64, u, v: uint64) {.inline.}=
## Extended precision multiplication
## (hi, lo) <- u * v
when nimvm:
mul_nim(hi, lo, u, v)
else:
mul_128(hi, lo, u, v)
func muladd1*(hi, lo: var uint64, a, b, c: uint64) {.inline.}=
## Extended precision multiplication + addition
## (hi, lo) <- a*b + c
##
## Note: 0xFFFFFFFF_FFFFFFFF² -> (hi: 0xFFFFFFFFFFFFFFFE, lo: 0x0000000000000001)
## so adding any c cannot overflow
when nimvm:
muladd1_nim(hi, lo, a, b, c)
else:
muladd1_128(hi, lo, a, b, c)
func muladd2*(hi, lo: var uint64, a, b, c1, c2: uint64) {.inline.}=
## Extended precision multiplication + addition + addition
## (hi, lo) <- a*b + c1 + c2
##
## Note: 0xFFFFFFFF_FFFFFFFF² -> (hi: 0xFFFFFFFFFFFFFFFE, lo: 0x0000000000000001)
## so adding 0xFFFFFFFFFFFFFFFF leads to (hi: 0xFFFFFFFFFFFFFFFF, lo: 0x0000000000000000)
## and we have enough space to add again 0xFFFFFFFFFFFFFFFF without overflowing
when nimvm:
muladd2_nim(hi, lo, a, b, c1, c2)
else:
muladd2_128(hi, lo, a, b, c1, c2)
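The no-overflow claim in muladd2's doc comment can be checked algebraically (a worked identity, not from the original): with x = 2^64,

  (x - 1)² + 2·(x - 1) = (x - 1)·(x + 1) = x² - 1 = 2^128 - 1

so a*b + c1 + c2 is at most 2^128 - 1 and always fits in the (hi, lo) pair.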
func div2n1n*(q, r: var uint64, n_hi, n_lo, d: uint64) {.inline.}=
## Division of a uint128 by a uint64
## Warning ⚠️ :
## - if n_hi == d, the quotient does not fit in a uint64 and will throw SIGFPE
## - if n_hi > d, the result is undefined
when nimvm:
div2n1n_nim(q, r, n_hi, n_lo, d)
else:
div2n1n_128(q, r, n_hi, n_lo, d)
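A hedged illustration of this dual-path dispatch (the helper `mulHi` is hypothetical; the `static` call exercises the VM fallback, the plain call the 128-bit native path):

  func mulHi(u, v: uint64): uint64 =
    var lo: uint64
    mul(result, lo, u, v)

  static:  # compile time: `when nimvm` routes to mul_nim
    doAssert mulHi(high(uint64), high(uint64)) == 0xFFFFFFFF_FFFFFFFE'u64
  # run time: routed to mul_128
  doAssert mulHi(high(uint64), high(uint64)) == 0xFFFFFFFF_FFFFFFFE'u64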
# ############################################################
#
@@ -128,9 +166,6 @@ func mulAcc*[T: uint32|uint64](t, u, v: var T, a, b: T) {.inline.} =
## (t, u, v) <- (t, u, v) + a * b
var UV: array[2, T]
var carry: Carry
when nimvm:
mul_nim(UV[1], UV[0], a, b)
else:
mul(UV[1], UV[0], a, b)
addC(carry, v, v, UV[0], Carry(0))
addC(carry, u, u, UV[1], carry)
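A hedged sanity check of the accumulator (the final carry into t is handled past the end of this hunk; the expected values follow the 0xFF…FF² note above):

  var t, u, v: uint64                          # 3-word accumulator, zero-initialized
  mulAcc(t, u, v, high(uint64), high(uint64))  # accumulate (2^64 - 1)^2
  doAssert (t, u, v) == (0'u64, 0xFFFFFFFF_FFFFFFFE'u64, 1'u64)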

View File

@@ -19,7 +19,7 @@ static:
doAssert GCC_Compatible
doAssert sizeof(int) == 8
func div2n1n*(q, r: var uint64, n_hi, n_lo, d: uint64) {.inline.}=
func div2n1n_128*(q, r: var uint64, n_hi, n_lo, d: uint64) {.inline.}=
## Division of a uint128 by a uint64
## Warning ⚠️ :
## - if n_hi == d, the quotient does not fit in a uint64 and will throw SIGFPE on some platforms
@@ -35,7 +35,7 @@ func div2n1n*(q, r: var uint64, n_hi, n_lo, d: uint64) {.inline.}=
{.emit:["*",q, " = (NU64)(", dblPrec," / ", d, ");"].}
{.emit:["*",r, " = (NU64)(", dblPrec," % ", d, ");"].}
func mul*(hi, lo: var uint64, a, b: uint64) {.inline.} =
func mul_128*(hi, lo: var uint64, a, b: uint64) {.inline.} =
## Extended precision multiplication
## (hi, lo) <- a*b
block:
@@ -50,7 +50,7 @@ func mul*(hi, lo: var uint64, a, b: uint64) {.inline.} =
{.emit:["*",hi, " = (NU64)(", dblPrec," >> ", 64'u64, ");"].}
{.emit:["*",lo, " = (NU64)", dblPrec,";"].}
func muladd1*(hi, lo: var uint64, a, b, c: uint64) {.inline.} =
func muladd1_128*(hi, lo: var uint64, a, b, c: uint64) {.inline.} =
## Extended precision multiplication + addition
## (hi, lo) <- a*b + c
##
@@ -71,7 +71,7 @@ func muladd1*(hi, lo: var uint64, a, b, c: uint64) {.inline.} =
{.emit:["*",hi, " = (NU64)(", dblPrec," >> ", 64'u64, ");"].}
{.emit:["*",lo, " = (NU64)", dblPrec,";"].}
func muladd2*(hi, lo: var uint64, a, b, c1, c2: uint64) {.inline.}=
func muladd2_128*(hi, lo: var uint64, a, b, c1, c2: uint64) {.inline.}=
## Extended precision multiplication + addition + addition
## This is constant-time on most hardware, except on some specific CPUs such as the Cortex-M0
## (hi, lo) <- a*b + c1 + c2

View File

@@ -20,7 +20,7 @@ static:
doAssert sizeof(int) == 8
doAssert X86
func div2n1n*(q, r: var uint64, n_hi, n_lo, d: uint64) {.inline.}=
func div2n1n_128*(q, r: var uint64, n_hi, n_lo, d: uint64) {.inline.}=
## Division of a uint128 by a uint64
## Warning ⚠️ :
## - if n_hi == d, the quotient does not fit in a uint64 and will throw SIGFPE

View File

@@ -38,35 +38,25 @@ func div2n1n*(q, r: var Ct[uint64], n_hi, n_lo, d: Ct[uint64]) {.inline.}=
## Warning ⚠️ :
## - if n_hi == d, the quotient does not fit in a uint64 and will throw SIGFPE
## - if n_hi > d, the result is undefined
{.warning: "unsafeDiv2n1n is not constant-time at the moment on most hardware".}
# TODO !!! - Replace by constant-time, portable, non-assembly version
# -> use uint128? Compiler might add unwanted branches
q = udiv128(n_hi, n_lo, d, r)
func mul*(hi, lo: var Ct[uint64], a, b: Ct[uint64]) {.inline.} =
func mul_128*(hi, lo: var Ct[uint64], a, b: Ct[uint64]) {.inline.} =
## Extended precision multiplication
## (hi, lo) <- a*b
##
## This is constant-time on most hardware
## See: https://www.bearssl.org/ctmul.html
lo = umul128(a, b, hi)
func muladd1*(hi, lo: var Ct[uint64], a, b, c: Ct[uint64]) {.inline.} =
func muladd1_128*(hi, lo: var Ct[uint64], a, b, c: Ct[uint64]) {.inline.} =
## Extended precision multiplication + addition
## (hi, lo) <- a*b + c
##
## Note: 0xFFFFFFFF_FFFFFFFF² -> (hi: 0xFFFFFFFFFFFFFFFE, lo: 0x0000000000000001)
## so adding any c cannot overflow
##
## This is constant-time on most hardware
## See: https://www.bearssl.org/ctmul.html
var carry: Carry
lo = umul128(a, b, hi)
addC(carry, lo, lo, c, Carry(0))
addC(carry, hi, hi, 0, carry)
func muladd2*(hi, lo: var Ct[uint64], a, b, c1, c2: Ct[uint64]) {.inline.}=
func muladd2_128*(hi, lo: var Ct[uint64], a, b, c1, c2: Ct[uint64]) {.inline.}=
## Extended precision multiplication + addition + addition
## This is constant-time on most hardware, except on some specific CPUs such as the Cortex-M0
## (hi, lo) <- a*b + c1 + c2

View File

@@ -63,11 +63,11 @@ func shortDiv*(a: var Limbs, k: Word): Word =
# d = d shr 1
# dec(shift)
func knuthDivLE[qLen, rLen, uLen, vLen: static int](
q: var Limbs[qLen],
r: var Limbs[rLen],
u: Limbs[uLen],
v: Limbs[vLen],
func knuthDivLE(
q: var StUint,
r: var StUint,
u: StUint,
v: StUint,
needRemainder: bool) =
## Compute the quotient and remainder (if needed)
## of the division of u by v
@@ -80,6 +80,15 @@ func knuthDivLE[qLen, rLen, uLen, vLen: static int](
#
# Resources at the bottom of the file
const
qLen = q.limbs.len
rLen = r.limbs.len
uLen = u.limbs.len
vLen = v.limbs.len
template `[]`(a: Stuint, i: int): Word = a.limbs[i]
template `[]=`(a: Stuint, i: int, val: Word) = a.limbs[i] = val
# Find the most significant word with actual set bits
# and get the leading zero count there
var divisorLen = vLen
@@ -96,7 +105,7 @@ func knuthDivLE[qLen, rLen, uLen, vLen: static int](
# Divisor is a single word.
if divisorLen == 1:
q.copyFrom(u)
r.leastSignificantWord() = q.shortDiv(v.leastSignificantWord())
r.leastSignificantWord() = q.limbs.shortDiv(v.leastSignificantWord())
# zero all but the least significant word
var lsw = true
for w in leastToMostSig(r):
@@ -111,8 +120,8 @@ func knuthDivLE[qLen, rLen, uLen, vLen: static int](
# Normalize so that the divisor MSB is set,
# vn cannot overflow, un can overflow by 1 word at most, hence uLen+1
un.shlSmallOverflowing(u, clz)
vn.shlSmall(v, clz)
un.shlSmallOverflowing(u.limbs, clz)
vn.shlSmall(v.limbs, clz)
static: doAssert cpuEndian == littleEndian, "Currently the division algorithm requires little endian ordering of the limbs"
# TODO: is it worth it to have the uint be the exact same extended precision representation
@@ -161,24 +170,42 @@ func knuthDivLE[qLen, rLen, uLen, vLen: static int](
q[j] -= 1
var carry = Carry(0)
for i in 0 ..< divisorLen:
addC(carry, u[j+i], u[j+i], v[i], carry)
addC(carry, un[j+i], un[j+i], v[i], carry)
# Quotient is found; if the remainder is needed, un must be un-normalized
if needRemainder:
r.shrSmall(un, clz)
# r.limbs.shrSmall(un, clz) - TODO
when cpuEndian == littleEndian:
# rLen+1 == un.len
for i in 0 ..< rLen:
r[i] = (un[i] shr clz) or (un[i+1] shl (WordBitWidth - clz))
else:
{.error: "Not Implemented for bigEndian".}
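For context, the correction step above is the add-back from Knuth's Algorithm D (TAOCP Vol. 2, 4.3.1), which this routine follows. With b = 2^WordBitWidth, each quotient word is first estimated from the top two words of the running remainder and the top word of the normalized divisor:

  q̂ = (un[j+divisorLen]·b + un[j+divisorLen-1]) div vn[divisorLen-1]

Because normalization sets the divisor's most significant bit, q̂ overshoots the true quotient word by at most 2; the standard two-word refinement reduces that to at most 1, and the `q[j] -= 1` plus divisor add-back shown above corrects the last off-by-one.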
const BinaryShiftThreshold = 8 # If the difference in bit-length is below 8
# binary shift is probably faster
func divmod(q, r: var Stuint,
x, y: Stuint, needRemainder: bool) =
let x_clz = x.leadingZeros()
let y_clz = y.leadingZeros()
# We short-circuit division depending on special cases.
if unlikely(y.isZero()):
raise newException(DivByZeroError, "You attempted to divide by zero")
elif y_clz == (y.bits - 1):
# y is one
q = x
# elif (x.hi or y.hi).isZero:
@@ -209,7 +236,7 @@ func `div`*(x, y: Stuint): Stuint {.inline.} =
func `mod`*(x, y: Stuint): Stuint {.inline.} =
## Remainder operation for multi-precision unsigned uint
var tmp{.noInit.}: Stuint
divmod(tmp, result, x,y, needRemainder = true)
divmod(tmp, result, x, y, needRemainder = true)
func divmod*(x, y: Stuint): tuple[quot, rem: Stuint] =
## Division and remainder operations for multi-precision unsigned uint
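For the public API, a hedged usage sketch (values are illustrative; it assumes `==` and the `stuint` initializer behave as in the test suite below):

  let a = 123.stuint(256)
  let b = 10.stuint(256)
  doAssert a div b == 12.stuint(256)
  doAssert a mod b == 3.stuint(256)
  let (q, r) = divmod(a, b)  # named tuple: (quot, rem)
  doAssert q == 12.stuint(256) and r == 3.stuint(256)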

View File

@@ -190,19 +190,21 @@ suite "Testing unsigned int division and modulo implementation":
check: cast[uint64](qr.quot) == 7'u64
check: cast[uint64](qr.rem) == 9'u64
test "Divmod(2^64, 3) returns the correct result":
let a = 1.stuint(128) shl 64
let b = 3.stuint(128)
let qr = divmod(a, b)
let q = cast[UintImpl[uint64]](qr.quot)
let r = cast[UintImpl[uint64]](qr.rem)
check: q.lo == 6148914691236517205'u64
check: q.hi == 0'u64
check: r.lo == 1'u64
check: r.hi == 0'u64
# TODO - no more .lo / .hi
#
# test "Divmod(2^64, 3) returns the correct result":
# let a = 1.stuint(128) shl 64
# let b = 3.stuint(128)
#
# let qr = divmod(a, b)
#
# let q = cast[UintImpl[uint64]](qr.quot)
# let r = cast[UintImpl[uint64]](qr.rem)
#
# check: q.lo == 6148914691236517205'u64
# check: q.hi == 0'u64
# check: r.lo == 1'u64
# check: r.hi == 0'u64
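A possible re-enabled form of the commented-out test, sketched against the limb API used by knuthDivLE above (hypothetical; it assumes `limbs` indexing stays publicly accessible):

  test "Divmod(2^64, 3) returns the correct result":
    let a = 1.stuint(128) shl 64
    let b = 3.stuint(128)
    let qr = divmod(a, b)
    check: qr.quot.limbs[0] == 6148914691236517205'u64  # 2^64 div 3
    check: qr.quot.limbs[1] == 0'u64
    check: qr.rem.limbs[0] == 1'u64                     # 2^64 mod 3
    check: qr.rem.limbs[1] == 0'u64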
test "Divmod(1234567891234567890, 10) returns the correct result":
let a = cast[StUint[64]](1234567891234567890'u64)