uint division - compile and pass the single limb tests

This commit is contained in:
Mamy Ratsimbazafy 2022-01-22 01:42:54 +01:00 committed by jangko
parent c2ed8a4bc2
commit 53d2fd14f3
No known key found for this signature in database
GPG Key ID: 31702AE10541E6B9
8 changed files with 163 additions and 64 deletions

View File

@ -180,9 +180,9 @@ macro staticFor*(idx: untyped{nkIdent}, start, stopEx: static int, body: untyped
# Copy # Copy
# -------------------------------------------------------- # --------------------------------------------------------
func copyFrom*[dLen, sLen]( func copyFrom*(
dst: var SomeBigInteger[dLen], dst: var SomeBigInteger,
src: SomeBigInteger[sLen] src: SomeBigInteger
){.inline.} = ){.inline.} =
## Copy a BigInteger, truncated to 2^slen if the source ## Copy a BigInteger, truncated to 2^slen if the source
## is larger than the destination ## is larger than the destination

View File

@ -80,7 +80,7 @@ func mul_nim*(hi, lo: var uint64, u, v: uint64) =
hi = x3 + hi(x1) hi = x3 + hi(x1)
lo = merge(x1, lo(x0)) lo = merge(x1, lo(x0))
func muladd1*(hi, lo: var uint64, a, b, c: uint64) {.inline.} = func muladd1_nim*(hi, lo: var uint64, a, b, c: uint64) {.inline.} =
## Extended precision multiplication + addition ## Extended precision multiplication + addition
## (hi, lo) <- a*b + c ## (hi, lo) <- a*b + c
## ##
@ -91,7 +91,7 @@ func muladd1*(hi, lo: var uint64, a, b, c: uint64) {.inline.} =
addC_nim(carry, lo, lo, c, 0) addC_nim(carry, lo, lo, c, 0)
addC_nim(carry, hi, hi, 0, carry) addC_nim(carry, hi, hi, 0, carry)
func muladd2*(hi, lo: var uint64, a, b, c1, c2: uint64) {.inline.}= func muladd2_nim*(hi, lo: var uint64, a, b, c1, c2: uint64) {.inline.}=
## Extended precision multiplication + addition + addition ## Extended precision multiplication + addition + addition
## (hi, lo) <- a*b + c1 + c2 ## (hi, lo) <- a*b + c1 + c2
## ##
@ -107,3 +107,48 @@ func muladd2*(hi, lo: var uint64, a, b, c1, c2: uint64) {.inline.}=
# Carry chain 2 # Carry chain 2
addC_nim(carry2, lo, lo, c2, 0) addC_nim(carry2, lo, lo, c2, 0)
addC_nim(carry2, hi, hi, 0, carry2) addC_nim(carry2, hi, hi, 0, carry2)
func div2n1n_nim*[T: SomeunsignedInt](q, r: var T, n_hi, n_lo, d: T) =
## Division uint128 by uint64
## Warning ⚠️ :
## - if n_hi == d, quotient does not fit in an uint64 and will throw SIGFPE
## - if n_hi > d result is undefined
# doAssert leadingZeros(d) == 0, "Divisor was not normalized"
const
size = sizeof(q) * 8
halfSize = size div 2
halfMask = (1.T shl halfSize) - 1.T
template halfQR(n_hi, n_lo, d, d_hi, d_lo: T): tuple[q,r: T] =
var (q, r) = (n_hi div d_hi, n_hi mod d_hi)
let m = q * d_lo
r = (r shl halfSize) or n_lo
# Fix the reminder, we're at most 2 iterations off
if r < m:
dec q
r += d
if r >= d and r < m:
dec q
r += d
r -= m
(q, r)
let
d_hi = d shr halfSize
d_lo = d and halfMask
n_lohi = nlo shr halfSize
n_lolo = nlo and halfMask
# First half of the quotient
let (q1, r1) = halfQR(n_hi, n_lohi, d, d_hi, d_lo)
# Second half
let (q2, r2) = halfQR(r1, n_lolo, d, d_hi, d_lo)
q = (q1 shl halfSize) or q2
r = r2

View File

@ -73,19 +73,57 @@ func muladd2*(hi, lo: var uint32, a, b, c1, c2: uint32) {.inline.}=
# ############################################################ # ############################################################
when sizeof(int) == 8 and not defined(Stint32): when sizeof(int) == 8 and not defined(Stint32):
when nimvm: from ./compiletime_fallback import div2n1n_nim, mul_nim, muladd1_nim, muladd2_nim
from ./compiletime_fallback import mul_nim, muladd1, muladd2
else: when defined(vcc):
when defined(vcc): from ./extended_precision_x86_64_msvc import div2n1n_128, mul_128, muladd1_128, muladd2_128
from ./extended_precision_x86_64_msvc import div2n1n, mul, muladd1, muladd2 elif GCCCompatible:
elif GCCCompatible: when X86:
when X86: from ./extended_precision_x86_64_gcc import div2n1n_128
from ./extended_precision_x86_64_gcc import div2n1n from ./extended_precision_64bit_uint128 import mul_128, muladd1_128, muladd2_128
from ./extended_precision_64bit_uint128 import mul, muladd1, muladd2 else:
else: from ./extended_precision_64bit_uint128 import div2n1n_128, mul_128, muladd1_128, muladd2_128
from ./extended_precision_64bit_uint128 import div2n1n, mul, muladd1, muladd2
export div2n1n, mul func mul*(hi, lo: var uint64, u, v: uint64) {.inline.}=
export muladd1, muladd2 ## Extended precision multiplication
## (hi, lo) <- u * v
when nimvm:
mul_nim(hi, lo, u, v)
else:
mul_128(hi, lo, u, v)
func muladd1*(hi, lo: var uint64, a, b, c: uint64) {.inline.}=
## Extended precision multiplication + addition
## (hi, lo) <- a*b + c
##
## Note: 0xFFFFFFFF_FFFFFFFF² -> (hi: 0xFFFFFFFFFFFFFFFE, lo: 0x0000000000000001)
## so adding any c cannot overflow
when nimvm:
muladd1_nim(hi, lo, a, b, c)
else:
muladd1_128(hi, lo, a, b, c)
func muladd2*(hi, lo: var uint64, a, b, c1, c2: uint64) {.inline.}=
## Extended precision multiplication + addition + addition
## (hi, lo) <- a*b + c1 + c2
##
## Note: 0xFFFFFFFF_FFFFFFFF² -> (hi: 0xFFFFFFFFFFFFFFFE, lo: 0x0000000000000001)
## so adding 0xFFFFFFFFFFFFFFFF leads to (hi: 0xFFFFFFFFFFFFFFFF, lo: 0x0000000000000000)
## and we have enough space to add again 0xFFFFFFFFFFFFFFFF without overflowing
when nimvm:
muladd2_nim(hi, lo, a, b, c1, c2)
else:
muladd2_128(hi, lo, a, b, c1, c2)
func div2n1n*(q, r: var uint64, n_hi, n_lo, d: uint64) {.inline.}=
## Division uint128 by uint64
## Warning ⚠️ :
## - if n_hi == d, quotient does not fit in an uint64 and will throw SIGFPE
## - if n_hi > d result is undefined
when nimvm:
div2n1n_nim(q, r, n_hi, n_lo, d)
else:
div2n1n_128(q, r, n_hi, n_lo, d)
# ############################################################ # ############################################################
# #
@ -128,10 +166,7 @@ func mulAcc*[T: uint32|uint64](t, u, v: var T, a, b: T) {.inline.} =
## (t, u, v) <- (t, u, v) + a * b ## (t, u, v) <- (t, u, v) + a * b
var UV: array[2, T] var UV: array[2, T]
var carry: Carry var carry: Carry
when nimvm: mul(UV[1], UV[0], a, b)
mul_nim(UV[1], UV[0], a, b)
else:
mul(UV[1], UV[0], a, b)
addC(carry, v, v, UV[0], Carry(0)) addC(carry, v, v, UV[0], Carry(0))
addC(carry, u, u, UV[1], carry) addC(carry, u, u, UV[1], carry)
t += T(carry) t += T(carry)

View File

@ -19,7 +19,7 @@ static:
doAssert GCC_Compatible doAssert GCC_Compatible
doAssert sizeof(int) == 8 doAssert sizeof(int) == 8
func div2n1n*(q, r: var uint64, n_hi, n_lo, d: uint64) {.inline.}= func div2n1n_128*(q, r: var uint64, n_hi, n_lo, d: uint64) {.inline.}=
## Division uint128 by uint64 ## Division uint128 by uint64
## Warning ⚠️ : ## Warning ⚠️ :
## - if n_hi == d, quotient does not fit in an uint64 and will throw SIGFPE on some platforms ## - if n_hi == d, quotient does not fit in an uint64 and will throw SIGFPE on some platforms
@ -35,7 +35,7 @@ func div2n1n*(q, r: var uint64, n_hi, n_lo, d: uint64) {.inline.}=
{.emit:["*",q, " = (NU64)(", dblPrec," / ", d, ");"].} {.emit:["*",q, " = (NU64)(", dblPrec," / ", d, ");"].}
{.emit:["*",r, " = (NU64)(", dblPrec," % ", d, ");"].} {.emit:["*",r, " = (NU64)(", dblPrec," % ", d, ");"].}
func mul*(hi, lo: var uint64, a, b: uint64) {.inline.} = func mul_128*(hi, lo: var uint64, a, b: uint64) {.inline.} =
## Extended precision multiplication ## Extended precision multiplication
## (hi, lo) <- a*b ## (hi, lo) <- a*b
block: block:
@ -50,7 +50,7 @@ func mul*(hi, lo: var uint64, a, b: uint64) {.inline.} =
{.emit:["*",hi, " = (NU64)(", dblPrec," >> ", 64'u64, ");"].} {.emit:["*",hi, " = (NU64)(", dblPrec," >> ", 64'u64, ");"].}
{.emit:["*",lo, " = (NU64)", dblPrec,";"].} {.emit:["*",lo, " = (NU64)", dblPrec,";"].}
func muladd1*(hi, lo: var uint64, a, b, c: uint64) {.inline.} = func muladd1_128*(hi, lo: var uint64, a, b, c: uint64) {.inline.} =
## Extended precision multiplication + addition ## Extended precision multiplication + addition
## (hi, lo) <- a*b + c ## (hi, lo) <- a*b + c
## ##
@ -71,7 +71,7 @@ func muladd1*(hi, lo: var uint64, a, b, c: uint64) {.inline.} =
{.emit:["*",hi, " = (NU64)(", dblPrec," >> ", 64'u64, ");"].} {.emit:["*",hi, " = (NU64)(", dblPrec," >> ", 64'u64, ");"].}
{.emit:["*",lo, " = (NU64)", dblPrec,";"].} {.emit:["*",lo, " = (NU64)", dblPrec,";"].}
func muladd2*(hi, lo: var uint64, a, b, c1, c2: uint64) {.inline.}= func muladd2_128*(hi, lo: var uint64, a, b, c1, c2: uint64) {.inline.}=
## Extended precision multiplication + addition + addition ## Extended precision multiplication + addition + addition
## This is constant-time on most hardware except some specific one like Cortex M0 ## This is constant-time on most hardware except some specific one like Cortex M0
## (hi, lo) <- a*b + c1 + c2 ## (hi, lo) <- a*b + c1 + c2

View File

@ -20,7 +20,7 @@ static:
doAssert sizeof(int) == 8 doAssert sizeof(int) == 8
doAssert X86 doAssert X86
func div2n1n*(q, r: var uint64, n_hi, n_lo, d: uint64) {.inline.}= func div2n1n_128*(q, r: var uint64, n_hi, n_lo, d: uint64) {.inline.}=
## Division uint128 by uint64 ## Division uint128 by uint64
## Warning ⚠️ : ## Warning ⚠️ :
## - if n_hi == d, quotient does not fit in an uint64 and will throw SIGFPE ## - if n_hi == d, quotient does not fit in an uint64 and will throw SIGFPE

View File

@ -38,35 +38,25 @@ func div2n1n*(q, r: var Ct[uint64], n_hi, n_lo, d: Ct[uint64]) {.inline.}=
## Warning ⚠️ : ## Warning ⚠️ :
## - if n_hi == d, quotient does not fit in an uint64 and will throw SIGFPE ## - if n_hi == d, quotient does not fit in an uint64 and will throw SIGFPE
## - if n_hi > d result is undefined ## - if n_hi > d result is undefined
{.warning: "unsafeDiv2n1n is not constant-time at the moment on most hardware".}
# TODO !!! - Replace by constant-time, portable, non-assembly version
# -> use uint128? Compiler might add unwanted branches
q = udiv128(n_hi, n_lo, d, r) q = udiv128(n_hi, n_lo, d, r)
func mul*(hi, lo: var Ct[uint64], a, b: Ct[uint64]) {.inline.} = func mul_128*(hi, lo: var Ct[uint64], a, b: Ct[uint64]) {.inline.} =
## Extended precision multiplication ## Extended precision multiplication
## (hi, lo) <- a*b ## (hi, lo) <- a*b
##
## This is constant-time on most hardware
## See: https://www.bearssl.org/ctmul.html
lo = umul128(a, b, hi) lo = umul128(a, b, hi)
func muladd1*(hi, lo: var Ct[uint64], a, b, c: Ct[uint64]) {.inline.} = func muladd1_128*(hi, lo: var Ct[uint64], a, b, c: Ct[uint64]) {.inline.} =
## Extended precision multiplication + addition ## Extended precision multiplication + addition
## (hi, lo) <- a*b + c ## (hi, lo) <- a*b + c
## ##
## Note: 0xFFFFFFFF_FFFFFFFF² -> (hi: 0xFFFFFFFFFFFFFFFE, lo: 0x0000000000000001) ## Note: 0xFFFFFFFF_FFFFFFFF² -> (hi: 0xFFFFFFFFFFFFFFFE, lo: 0x0000000000000001)
## so adding any c cannot overflow ## so adding any c cannot overflow
##
## This is constant-time on most hardware
## See: https://www.bearssl.org/ctmul.html
var carry: Carry var carry: Carry
lo = umul128(a, b, hi) lo = umul128(a, b, hi)
addC(carry, lo, lo, c, Carry(0)) addC(carry, lo, lo, c, Carry(0))
addC(carry, hi, hi, 0, carry) addC(carry, hi, hi, 0, carry)
func muladd2*(hi, lo: var Ct[uint64], a, b, c1, c2: Ct[uint64]) {.inline.}= func muladd2_128*(hi, lo: var Ct[uint64], a, b, c1, c2: Ct[uint64]) {.inline.}=
## Extended precision multiplication + addition + addition ## Extended precision multiplication + addition + addition
## This is constant-time on most hardware except some specific one like Cortex M0 ## This is constant-time on most hardware except some specific one like Cortex M0
## (hi, lo) <- a*b + c1 + c2 ## (hi, lo) <- a*b + c1 + c2

View File

@ -63,11 +63,11 @@ func shortDiv*(a: var Limbs, k: Word): Word =
# d = d shr 1 # d = d shr 1
# dec(shift) # dec(shift)
func knuthDivLE[qLen, rLen, uLen, vLen: static int]( func knuthDivLE(
q: var Limbs[qLen], q: var StUint,
r: var Limbs[rLen], r: var StUint,
u: Limbs[uLen], u: StUint,
v: Limbs[vLen], v: StUint,
needRemainder: bool) = needRemainder: bool) =
## Compute the quotient and remainder (if needed) ## Compute the quotient and remainder (if needed)
## of the division of u by v ## of the division of u by v
@ -80,6 +80,15 @@ func knuthDivLE[qLen, rLen, uLen, vLen: static int](
# #
# Resources at the bottom of the file # Resources at the bottom of the file
const
qLen = q.limbs.len
rLen = r.limbs.len
uLen = u.limbs.len
vLen = v.limbs.len
template `[]`(a: Stuint, i: int): Word = a.limbs[i]
template `[]=`(a: Stuint, i: int, val: Word) = a.limbs[i] = val
# Find the most significant word with actual set bits # Find the most significant word with actual set bits
# and get the leading zero count there # and get the leading zero count there
var divisorLen = vLen var divisorLen = vLen
@ -96,7 +105,7 @@ func knuthDivLE[qLen, rLen, uLen, vLen: static int](
# Divisor is a single word. # Divisor is a single word.
if divisorLen == 1: if divisorLen == 1:
q.copyFrom(u) q.copyFrom(u)
r.leastSignificantWord() = q.shortDiv(v.leastSignificantWord()) r.leastSignificantWord() = q.limbs.shortDiv(v.leastSignificantWord())
# zero all but the least significant word # zero all but the least significant word
var lsw = true var lsw = true
for w in leastToMostSig(r): for w in leastToMostSig(r):
@ -111,8 +120,8 @@ func knuthDivLE[qLen, rLen, uLen, vLen: static int](
# Normalize so that the divisor MSB is set, # Normalize so that the divisor MSB is set,
# vn cannot overflow, un can overflowed by 1 word at most, hence uLen+1 # vn cannot overflow, un can overflowed by 1 word at most, hence uLen+1
un.shlSmallOverflowing(u, clz) un.shlSmallOverflowing(u.limbs, clz)
vn.shlSmall(v, clz) vn.shlSmall(v.limbs, clz)
static: doAssert cpuEndian == littleEndian, "Currently the division algorithm requires little endian ordering of the limbs" static: doAssert cpuEndian == littleEndian, "Currently the division algorithm requires little endian ordering of the limbs"
# TODO: is it worth it to have the uint be the exact same extended precision representation # TODO: is it worth it to have the uint be the exact same extended precision representation
@ -161,24 +170,42 @@ func knuthDivLE[qLen, rLen, uLen, vLen: static int](
q[j] -= 1 q[j] -= 1
var carry = Carry(0) var carry = Carry(0)
for i in 0 ..< divisorLen: for i in 0 ..< divisorLen:
addC(carry, u[j+i], u[j+i], v[i], carry) addC(carry, un[j+i], un[j+i], v[i], carry)
# Quotient is found, if remainder is needed we need to un-normalize un # Quotient is found, if remainder is needed we need to un-normalize un
if needRemainder: if needRemainder:
r.shrSmall(un, clz) # r.limbs.shrSmall(un, clz) - TODO
when cpuEndian == littleEndian:
# rLen+1 == un.len
for i in 0 ..< rLen:
r[i] = (un[i] shr clz) or (un[i+1] shl (WordBitWidth - clz))
else:
{.error: "Not Implemented for bigEndian".}
const BinaryShiftThreshold = 8 # If the difference in bit-length is below 8 const BinaryShiftThreshold = 8 # If the difference in bit-length is below 8
# binary shift is probably faster # binary shift is probably faster
func divmod(q, r: var Stuint, func divmod(q, r: var Stuint,
<<<<<<< HEAD
x: Limbs[xLen], y: Limbs[yLen], needRemainder: bool) = x: Limbs[xLen], y: Limbs[yLen], needRemainder: bool) =
=======
x, y: Stuint, needRemainder: bool) =
>>>>>>> 88858a7 (uint division - compile and pass the single limb tests)
let x_clz = x.leadingZeros() let x_clz = x.leadingZeros()
let y_clz = y.leadingZeros() let y_clz = y.leadingZeros()
# We short-circuit division depending on special-cases. # We short-circuit division depending on special-cases.
<<<<<<< HEAD
if unlikely(y.isZero): if unlikely(y.isZero):
raise newException(DivByZeroDefect, "You attempted to divide by zero") raise newException(DivByZeroDefect, "You attempted to divide by zero")
elif y_clz == (bitsof(y) - 1): elif y_clz == (bitsof(y) - 1):
=======
if unlikely(y.isZero()):
raise newException(DivByZeroError, "You attempted to divide by zero")
elif y_clz == (y.bits - 1):
>>>>>>> 88858a7 (uint division - compile and pass the single limb tests)
# y is one # y is one
q = x q = x
# elif (x.hi or y.hi).isZero: # elif (x.hi or y.hi).isZero:
@ -209,7 +236,7 @@ func `div`*(x, y: Stuint): Stuint {.inline.} =
func `mod`*(x, y: Stuint): Stuint {.inline.} = func `mod`*(x, y: Stuint): Stuint {.inline.} =
## Remainder operation for multi-precision unsigned uint ## Remainder operation for multi-precision unsigned uint
var tmp{.noInit.}: Stuint var tmp{.noInit.}: Stuint
divmod(tmp, result, x,y, needRemainder = true) divmod(tmp, result, x, y, needRemainder = true)
func divmod*(x, y: Stuint): tuple[quot, rem: Stuint] = func divmod*(x, y: Stuint): tuple[quot, rem: Stuint] =
## Division and remainder operations for multi-precision unsigned uint ## Division and remainder operations for multi-precision unsigned uint

View File

@ -190,19 +190,21 @@ suite "Testing unsigned int division and modulo implementation":
check: cast[uint64](qr.quot) == 7'u64 check: cast[uint64](qr.quot) == 7'u64
check: cast[uint64](qr.rem) == 9'u64 check: cast[uint64](qr.rem) == 9'u64
test "Divmod(2^64, 3) returns the correct result": # TODO - no more .lo / .hi
let a = 1.stuint(128) shl 64 #
let b = 3.stuint(128) # test "Divmod(2^64, 3) returns the correct result":
# let a = 1.stuint(128) shl 64
let qr = divmod(a, b) # let b = 3.stuint(128)
#
let q = cast[UintImpl[uint64]](qr.quot) # let qr = divmod(a, b)
let r = cast[UintImpl[uint64]](qr.rem) #
# let q = cast[UintImpl[uint64]](qr.quot)
check: q.lo == 6148914691236517205'u64 # let r = cast[UintImpl[uint64]](qr.rem)
check: q.hi == 0'u64 #
check: r.lo == 1'u64 # check: q.lo == 6148914691236517205'u64
check: r.hi == 0'u64 # check: q.hi == 0'u64
# check: r.lo == 1'u64
# check: r.hi == 0'u64
test "Divmod(1234567891234567890, 10) returns the correct result": test "Divmod(1234567891234567890, 10) returns the correct result":
let a = cast[StUint[64]](1234567891234567890'u64) let a = cast[StUint[64]](1234567891234567890'u64)