First steps in using uint64 words
parent 05bce529b4
commit 88d4a58a10
@@ -138,7 +138,7 @@ template bitSizeof(v: BigIntViewAny): uint32 =
   bind BigIntView
   distinctBase(type v)(v).bitLength
 
-const divShiftor = log2(WordPhysBitSize)
+const divShiftor = log2(uint32(WordPhysBitSize))
 
 template numLimbs*(v: BigIntViewAny): int =
   ## Compute the number of limbs from
   ## the **internal** bitlength
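The cast keeps the call resolving to an unsigned `log2` overload now that a 64-bit one exists too. Precomputing `divShiftor` lets the limb count be derived with a shift instead of a division, since the word size is a power of two. A minimal sketch of the idea; the names and the ceiling formula are illustrative, not the library's:

const WordPhysBitSize = 64'u32   # physical word size after this commit
const divShiftor = 6'u32         # log2(WordPhysBitSize)

func limbsNeeded(bits: uint32): uint32 =
  ## ceil(bits / WordPhysBitSize) without a division instruction
  (bits + WordPhysBitSize - 1) shr divShiftor

assert limbsNeeded(64) == 1
assert limbsNeeded(65) == 2
assert limbsNeeded(254) == 4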
@@ -447,9 +447,7 @@ func shlAddMod(a: BigIntViewMut, c: Word, M: BigIntViewConst) =
 
     block: # q*p
-      # q * p + carry (doubleword) carry from previous limb
-      let qp = unsafeExtPrecMul(q, M[i]) + carry.DoubleWord
-      carry = Word(qp shr WordBitSize) # New carry: high digit besides LSB
-      qp_lo = qp.Word.mask()           # Normalize to u63
+      unsafeFMA(carry, qp_lo, q, M[i], carry)
 
     block: # a*2^63 - q*p
       a[i] -= qp_lo
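The removed block and the new `unsafeFMA` call compute the same double-word quantity, (hi, lo) <- q*M[i] + carry. A reference model at half scale, using 32-bit physical / 31-bit logical words so the intermediate fits a plain uint64; names here are illustrative:

const WordBitSize = 31                       # logical bits per 32-bit physical word
const MaxWord = (1'u32 shl WordBitSize) - 1

proc fmaModel(hi, lo: var uint32; a, b, c: uint32) =
  # (hi, lo) <- a*b + c; a, b < 2^31 so the sum stays below 2^63: no overflow
  let dblPrec = uint64(a) * uint64(b) + uint64(c)
  hi = uint32(dblPrec shr WordBitSize)
  lo = uint32(dblPrec and uint64(MaxWord))

var hi, lo: uint32
fmaModel(hi, lo, MaxWord, MaxWord, MaxWord)
assert ((uint64(hi) shl WordBitSize) or uint64(lo)) ==
       uint64(MaxWord) * uint64(MaxWord) + uint64(MaxWord)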
@@ -551,28 +549,23 @@ func montyMul*(
   r.setBitLength(bitSizeof(M))
   setZero(r)
 
-  var partials: array[14, DoubleWord]
-
+  var r_hi = Zero # represents the high word that is used in intermediate computation before reduction mod M
   for i in 0 ..< nLen:
-    let zi = (Word(partials[0]) + wordMul(a[i], b[0])).wordMul(negInvModWord)
+    let zi = (r[0] + wordMul(a[i], b[0])).wordMul(negInvModWord)
+    var carry, z = Zero
+    unsafeFMA2(carry, z, a[i], b[0], zi, M[0], r[0], carry)
 
-    for j in 0 ..< nLen:
-      partials[j] += unsafeExtPrecMul(a[i], b[j]) + unsafeExtPrecMul(zi, M[j])
-
-  var carry = partials[0] shr WordBitSize
-  for j in 1 .. nLen: # we need 1 extra temporary at nLen
-    partials[j] += carry
-    carry = partials[j] shr WordBitSize
-    partials[j-1] = partials[j] and DoubleWord(MaxWord)
-  partials[nLen] = partials[nLen] shr WordBitSize
-
-  for i in 0 ..< nLen:
-    r[i] = Word(partials[i])
+    for j in 1 ..< nLen:
+      unsafeFMA2(carry, r[j-1], a[i], b[j], zi, M[j], r[j], carry)
+
+    r_hi += carry
+    r[^1] = r_hi.mask()
+    r_hi = r_hi shr WordBitSize
 
   # If the extra word is not zero or if r-M does not borrow (i.e. r > M)
   # Then subtract M
-  discard r.csub(M, CTBool[Word](partials[nLen].isNonZero()) or not r.csub(M, CtFalse))
+  discard r.csub(M, r_hi.isNonZero() or not r.csub(M, CtFalse))
 
 func redc*(r: BigIntViewMut, a: BigIntViewAny, one, N: BigIntViewConst, negInvModWord: Word) {.inline.} =
   ## Transform a bigint ``a`` from its Montgomery N-residue representation (mod N)
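Each outer iteration folds one word of a*b into r and cancels r's low word, which is the word-by-word form of Montgomery reduction. A one-word model of that reduction step, with toy sizes so everything fits in uint64; this is illustrative, not the library API:

proc redcModel(t, M, negInvModWord: uint64, k: int): uint64 =
  ## t * 2^(-k) mod M, assuming M odd, t < M * 2^k,
  ## and sizes small enough that t + m*M cannot overflow uint64
  let mask = (1'u64 shl k) - 1
  let m = (t * negInvModWord) and mask  # m = t * (-M^-1) mod 2^k
  result = (t + m * M) shr k            # the low k bits cancel by construction
  if result >= M: result -= M           # result was < 2M, one subtraction suffices

# M = 97, k = 8: 97 * 161 = 61*256 + 1, so -97^(-1) mod 256 = 256 - 161 = 95
let reduced = redcModel(35, 97, 95, 8)
assert (reduced * 256) mod 97 == 35   # multiplying by 2^8 undoes the reduction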
@@ -126,7 +126,7 @@ func negInvModWord*(M: BigInt): BaseType =
   let
     M0 = BaseType(M.limbs[0])
-    k = log2(WordPhysBitSize)
+    k = log2(uint32(WordPhysBitSize))
 
   result = M0 # Start from an inverse of M0 modulo 2, M0 is odd and its own inverse
   for _ in 0 ..< k: # at each iteration we get the inverse mod(2^2k)
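The loop is a Newton/Hensel lifting: if x*M0 ≡ 1 (mod 2^j), then x*(2 - M0*x) is an inverse modulo 2^(2j), so log2(word size) iterations reach a full word. A self-contained sketch in plain uint64 arithmetic; the modulus constant is just an odd example:

func invModWord(M0: uint64): uint64 =
  ## Inverse of an odd M0 modulo 2^64 by Hensel lifting
  result = M0                  # already correct mod 8: an odd x is its own inverse mod 8
  for _ in 0 ..< 6:            # 6 = log2(64) doublings of precision
    result = result * (2 - M0 * result)  # uint64 arithmetic wraps, i.e. works mod 2^64

let inv = invModWord(0xFFFFFFFF00000001'u64)   # any odd constant works
assert inv * 0xFFFFFFFF00000001'u64 == 1'u64
# negInvModWord is then the two's complement: 0'u64 - inv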
@@ -14,12 +14,11 @@
 import ../primitives/constant_time
 
-type Word* = Ct[uint32]
+type Word* = Ct[uint64]
   ## Logical BigInt word
   ## A logical BigInt word is of size physical MachineWord-1
-type DoubleWord* = Ct[uint64]
 
-type BaseType* = uint32
+type BaseType* = uint64
   ## Physical BigInt for conversion in "normal integers"
 
 const
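The logical word deliberately uses one bit fewer than the machine word (the "MachineWord-1" above): adding two words can then never wrap, so carries stay recoverable by a shift and a mask. A sketch in plain uint64, assuming the 63-bit logical width:

const WordBitSize = 63                       # logical bits per word
const MaxWord = (1'u64 shl WordBitSize) - 1

let sum = MaxWord + MaxWord       # at most 2^64 - 2: cannot wrap
let carry = sum shr WordBitSize   # 0 or 1
let lo = sum and MaxWord          # normalized back to 63 bits
assert carry == 1
assert lo == MaxWord - 1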
@@ -11,7 +11,6 @@
 # Constant-time primitives
 #
 # ############################################################
 
 type
   BaseUint* = SomeUnsignedInt or byte
@@ -168,6 +167,26 @@ func log2*(x: uint32): uint32 =
   v = v or v shr 16
   lookup[(v * 0x07C4ACDD'u32) shr 27]
 
+func log2*(x: uint64): uint64 {.inline, noSideEffect.} =
+  ## Find the log base 2 of a 64-bit or less integer
+  ## using De Bruijn multiplication.
+  ## Works at compile-time, guaranteed constant-time.
+  ## Note: at runtime BitScanReverse or CountLeadingZero are more efficient
+  ## but log2 is never needed at runtime.
+  # https://graphics.stanford.edu/%7Eseander/bithacks.html#IntegerLogDeBruijn
+  const lookup: array[64, uint8] = [0'u8, 58, 1, 59, 47, 53, 2, 60, 39, 48, 27, 54,
+    33, 42, 3, 61, 51, 37, 40, 49, 18, 28, 20, 55, 30, 34, 11, 43, 14, 22, 4, 62,
+    57, 46, 52, 38, 26, 32, 41, 50, 36, 17, 19, 29, 10, 13, 21, 56, 45, 25, 31,
+    35, 16, 9, 12, 44, 24, 15, 8, 23, 7, 6, 5, 63]
+  var v = x
+  v = v or v shr 1 # first round down to one less than a power of 2
+  v = v or v shr 2
+  v = v or v shr 4
+  v = v or v shr 8
+  v = v or v shr 16
+  v = v or v shr 32
+  lookup[(v * 0x03F6EAF2CD271461'u64) shr 58]
 
 # ############################################################
 #
 # Hardened Boolean primitives
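The De Bruijn log2 added above computes floor(log2(x)) and can be checked exhaustively over all 64 possible results, assuming the function is in scope:

for i in 0'u64 ..< 64'u64:
  let x = 1'u64 shl i
  assert log2(x) == i                # exact powers of two
  assert log2(x or (x - 1)) == i     # all lower bits set: same floor(log2)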
@@ -20,10 +20,6 @@ import ./constant_time
 #
 # ############################################################
 
-template unsafeExtPrecMul*(a, b: Ct[uint32]): Ct[uint64] =
-  ## Extended precision multiplication uint32 * uint32 --> uint64
-  Ct[uint64](uint64(a) * uint64(b))
-
 func unsafeDiv2n1n*(q, r: var Ct[uint32], n_hi, n_lo, d: Ct[uint32]) {.inline.}=
   ## Division uint64 by uint32
   ## Warning ⚠️ :
@@ -35,261 +31,102 @@ func unsafeDiv2n1n*(q, r: var Ct[uint32], n_hi, n_lo, d: Ct[uint32]) {.inline.}=
   ## so that the most significant bit in d is set.
   # TODO !!! - Replace by constant-time, portable, non-assembly version
   #          -> use uint128? Compiler might add unwanted branches
   {.warning: "unsafeDiv2n1n is not constant-time at the moment on most hardware".}
   let dividend = (uint64(n_hi) shl 32) or uint64(n_lo)
   let divisor = uint64(d)
   q = (Ct[uint32])(dividend div divisor)
   r = (Ct[uint32])(dividend mod divisor)
 
 template unsafeFMA*(hi, lo: var Ct[uint32], a, b, c: Ct[uint32]) =
   ## Extended precision multiplication + addition
   ## This is constant-time on most hardware except some specific one like Cortex M0
   ## (hi, lo) <- a*b + c
   block:
     # Note: since a and b use 31-bit,
     # the result is 62-bit and carrying cannot overflow
     let dblPrec = uint64(a) * uint64(b) + uint64(c)
     hi = Ct[uint32](dblPrec shr 31)
     lo = Ct[uint32](dblPrec) and Ct[uint32](1 shl 31 - 1)
 
 template unsafeFMA2*(hi, lo: var Ct[uint32], a1, b1, a2, b2, c1, c2: Ct[uint32]) =
   ## (hi, lo) <- a1 * b1 + a2 * b2 + c1 + c2
   block:
     # TODO: Can this overflow?
     let dblPrec = uint64(a1) * uint64(b1) +
                   uint64(a2) * uint64(b2) +
                   uint64(c1) +
                   uint64(c2)
     hi = Ct[uint32](dblPrec shr 31)
     lo = Ct[uint32](dblPrec) and Ct[uint32](1 shl 31 - 1)
 # ############################################################
 #
 #                        64-bit words
 #
 # ############################################################
 
-# func asm_x86_64_extMul(hi, lo: var uint64, a, b: uint64) {.inline.}=
-#   ## Extended precision multiplication uint64 * uint64 --> uint128
-
-#   # TODO !!! - Replace by constant-time, portable, non-assembly version
-#   #          -> use uint128? Compiler might add unwanted branches
-
-#   # MUL r/m64
-#   # Multiply RAX by r/m64
-#   #
-#   # Inputs:
-#   #   - RAX
-#   #   - r/m
-#   # Outputs:
-#   #   - High word in RDX
-#   #   - Low word in RAX
-
-#   # Don't forget to dereference the var hidden pointer in hi/lo
-#   asm """
-#     mulq %[operand]
-#     : "=d" (`*hi`), "=a" (`*lo`)
-#     : "a" (`a`), [operand] "rm" (`b`)
-#     :
-#   """
-
-# func unsafeExtPrecMul*(hi, lo: var Ct[uint64], a, b: Ct[uint64]) {.inline.}=
-#   ## Extended precision multiplication uint64 * uint64 --> uint128
-#   ##
-#   ## TODO, at the moment only x86_64 architecture are supported
-#   ## as we use assembly.
-#   ## Also we assume that the native integer division
-#   ## provided by the PU is constant-time
-
+when defined(gcc) or defined(clang) or defined(llvm_gcc):
+  type
+    uint128*{.importc: "unsigned __int128".} = object
+
+  func unsafeDiv2n1n*(q, r: var Ct[uint64], n_hi, n_lo, d: Ct[uint64]) {.inline.}=
+    ## Division uint128 by uint64
+    ## Warning ⚠️ :
+    ##   - if n_hi == d, quotient does not fit in an uint64 and will throw SIGFPE
+    ##   - if n_hi > d result is undefined
+    {.warning: "unsafeDiv2n1n is not constant-time at the moment on most hardware".}
+
+    # TODO !!! - Replace by constant-time, portable, non-assembly version
+    #          -> use uint128? Compiler might add unwanted branches
+
+    # DIV r/m64
+    # Divide RDX:RAX (n_hi:n_lo) by r/m64
+    #
+    # Inputs
+    #   - numerator high word in RDX,
+    #   - numerator low word in RAX,
+    #   - divisor as r/m parameter (register or memory at the compiler discretion)
+    # Result
+    #   - Quotient in RAX
+    #   - Remainder in RDX
+
+    # 1. name the register/memory "divisor"
+    # 2. don't forget to dereference the var hidden pointer
+    # 3. -
+    # 4. no clobbered registers beside explicitly used RAX and RDX
+    asm """
+      divq %[divisor]
+      : "=a" (`*q`), "=d" (`*r`)
+      : "d" (`n_hi`), "a" (`n_lo`), [divisor] "rm" (`d`)
+      :
+    """
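The expected values in the self-test removed further down still pin the semantics: q*d + r must reconstruct n_hi*2^64 + n_lo. Checked here modulo 2^64 with wrapping arithmetic, using the values from that test:

let n_hi = 1'u64                  # the removed test divided 2^64 by 3
let n_lo = 0'u64
let d = 3'u64
let q = 6148914691236517205'u64   # expected quotient
let r = 1'u64
assert q * d + r == n_lo   # low 64 bits: q*d + r wraps exactly to n_lo
assert r < d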
-#   # Note, using C/Nim default `*` is inefficient
-#   # and complicated to make constant-time
-#   # See at the bottom.
-
-#   type T = uint64
-
-#   when not defined(amd64):
-#     {.error: "At the moment only x86_64 architecture is supported".}
-#   else:
-#     asm_x86_64_extMul(T(hi), T(lo), T(a), T(b))
-
+  template unsafeFMA*(hi, lo: var Ct[uint64], a, b, c: Ct[uint64]) =
+    ## Extended precision multiplication + addition
+    ## This is constant-time on most hardware except some specific one like Cortex M0
+    ## (hi, lo) <- a*b + c
+    block:
+      # Note: since a and b use 63-bit,
+      # the result is 126-bit and carrying cannot overflow
+      var dblPrec {.noInit.}: uint128
+      {.emit:[dblPrec, " = (unsigned __int128)", a," * (unsigned __int128)", b, " + (unsigned __int128)",c,";"].}
+      {.emit:[hi, " = (NU64)(", dblPrec," >> ", 63'u64, ");"].}
+      {.emit:[lo, " = (NU64)", dblPrec," & ", 1'u64 shl 63 - 1, ";"].}
+
+  template unsafeFMA2*(hi, lo: var Ct[uint64], a1, b1, a2, b2, c1, c2: Ct[uint64]) =
+    ## (hi, lo) <- a1 * b1 + a2 * b2 + c1 + c2
+    block:
+      # TODO: Can this overflow?
+      var dblPrec: uint128
+      {.emit:[dblPrec, " = (unsigned __int128)", a1," * (unsigned __int128)", b1,
+                       " + (unsigned __int128)", a2," * (unsigned __int128)", b2,
+                       " + (unsigned __int128)", c1,
+                       " + (unsigned __int128)", c2, ";"].}
+      {.emit:[hi, " = (NU64)", dblPrec," >> ", 63'u64, ";"].}
+      {.emit:[lo, " = (NU64)", dblPrec," & ", 1'u64 shl 63 - 1, ";"].}
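The "Can this overflow?" TODO can be answered with a bound: with 63-bit operands the sum is at most 2*(2^63 - 1)^2 + 2*(2^63 - 1) = 2*(2^63 - 1)*2^63 = 2^127 - 2^64 < 2^128, so the uint128 accumulator cannot wrap. The same worst case verified at half scale, where it fits native uint64:

const M31 = (1'u64 shl 31) - 1         # max 31-bit logical word
let worst = 2 * M31 * M31 + 2 * M31    # a1*b1 + a2*b2 + c1 + c2, all maximal
assert worst == (1'u64 shl 63) - (1'u64 shl 32)   # strictly below 2^64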
-# func asm_x86_64_div2n1n(q, r: var uint64, n_hi, n_lo, d: uint64) {.inline.}=
-#   ## Division uint128 by uint64
-#   ## Warning ⚠️ :
-#   ##   - if n_hi == d, quotient does not fit in an uint64
-#   ##   - if n_hi > d result is undefined
-
-#   # TODO !!! - Replace by constant-time, portable, non-assembly version
-#   #          -> use uint128? Compiler might add unwanted branches
-
-#   # DIV r/m64
-#   # Divide RDX:RAX (n_hi:n_lo) by r/m64
-#   #
-#   # Inputs
-#   #   - numerator high word in RDX,
-#   #   - numerator low word in RAX,
-#   #   - divisor as r/m parameter (register or memory at the compiler discretion)
-#   # Result
-#   #   - Quotient in RAX
-#   #   - Remainder in RDX
-
-#   # 1. name the register/memory "divisor"
-#   # 2. don't forget to dereference the var hidden pointer
-#   # 3. -
-#   # 4. no clobbered registers beside explicitly used RAX and RDX
-#   asm """
-#     divq %[divisor]
-#     : "=a" (`*q`), "=d" (`*r`)
-#     : "d" (`n_hi`), "a" (`n_lo`), [divisor] "rm" (`d`)
-#     :
-#   """
-
-# func unsafeDiv2n1n*(q, r: var Ct[uint64], n_hi, n_lo, d: Ct[uint64]) {.inline.}=
-#   ## Division uint128 by uint64
-#   ## Warning ⚠️ :
-#   ##   - if n_hi == d, quotient does not fit in an uint64
-#   ##   - if n_hi > d result is undefined
-#   ##
-#   ## To avoid issues, n_hi, n_lo, d should be normalized.
-#   ## i.e. shifted (== multiplied by the same power of 2)
-#   ## so that the most significant bit in d is set.
-#   ##
-#   ## TODO, at the moment only x86_64 architecture are supported
-#   ## as we use assembly.
-#   ## Also we assume that the native integer division
-#   ## provided by the PU is constant-time
-
-#   # Note, using C/Nim default `div` is inefficient
-#   # and complicated to make constant-time
-#   # See at the bottom.
-#   #
-#   # Furthermore compilers may try to substitute division
-#   # with a fast path that may have branches. It might also
-#   # be the same at the hardware level.
-
-#   # TODO !!! - Replace by constant-time, portable, non-assembly version
-#   #          -> use uint128? Compiler might add unwanted branches
-
-#   type T = uint64
-
-#   when not defined(amd64):
-#     {.error: "At the moment only x86_64 architecture is supported".}
-#   else:
-#     asm_x86_64_div2n1n(T(q), T(r), T(n_hi), T(n_lo), T(d))
-
-when isMainModule:
-  block: # Multiplication
-    var hi, lo: uint64
-    asm_x86_64_extMul(hi, lo, 1 shl 32, 1 shl 33) # 2^65
-    doAssert hi == 2
-    doAssert lo == 0
-
-  block: # Division
-    var q, r: uint64
-    # (1 shl 64) div 3
-    let n_hi = 1'u64
-    let n_lo = 0'u64
-    let d = 3'u64
-    asm_x86_64_div2n1n(q, r, n_hi, n_lo, d)
-    doAssert q == 6148914691236517205'u64
-    doAssert r == 1
-
-  # block: # TODO - support Quotient that doesn't fit in the result
-  #   # The usual way with normalization by the bitSize difference
-  #   # is fundamentally non constant-time
-  #   # it is probable that division is not constant-time at the hardware level as well
-  #   # as it throws sigfpe when the quotient doesn't fit in the result size
-
-  #   var q, r: uint64
-  #   let n_hi = 1'u64
-  #   let n_lo = 0'u64
-  #   let d = 1'u64
-  #   asm_x86_64_div2n1n(q, r, n_hi, n_lo, d)
-  #   echo "quotient: ", q
-  #   echo "remainder: ", r
-
-  # block:
-  #   var q, r: uint64
-  #   let n_hi = 4186590388502004879'u64
-  #   let n_lo = 17852795547484522084'u64
-  #   let d = 327340459940166448'u64
-  #   asm_x86_64_div2n1n(q, r, n_hi, n_lo, d)
-  #   echo "quotient: ", q
-  #   echo "remainder: ", r
-# ##############################################################
-#
-#    Non-constant-time portable extended precision multiplication
-#
-# ##############################################################
-
-# implementation from Stint: https://github.com/status-im/nim-stint/blob/edb1ade37309390cc641cee07ab62e5459d9ca44/stint/private/uint_mul.nim#L91-L135
-
-# template extPrecMulImpl(result: var UintImpl[uint64], op: untyped, u, v: uint64) =
-#   const
-#     p = 64 div 2
-#     base: uint64 = 1 shl p
-#
-#   var
-#     x0, x1, x2, x3: uint64
-#
-#   let
-#     ul = lo(u)
-#     uh = hi(u)
-#     vl = lo(v)
-#     vh = hi(v)
-#
-#   x0 = ul * vl
-#   x1 = ul * vh
-#   x2 = uh * vl
-#   x3 = uh * vh
-#
-#   x1 += hi(x0)    # This can't carry
-#   x1 += x2        # but this can
-#   if x1 < x2:     # if carry, add it to x3
-#     x3 += base
-#
-#   op(result.hi, x3 + hi(x1))
-#   op(result.lo, (x1 shl p) or lo(x0))
-
-# func extPrecMul*(result: var UintImpl[uint64], u, v: uint64) =
-#   ## Extended precision multiplication
-#   extPrecMulImpl(result, `=`, u, v)
-
-# func extPrecAddMul(result: var UintImpl[uint64], u, v: uint64) =
-#   ## Extended precision fused in-place addition & multiplication
-#   extPrecMulImpl(result, `+=`, u, v)
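The Stint routine above, made runnable for reference. The hi32/lo32 helpers and the tuple return are illustrative; the algorithm is the standard four-partial-product schoolbook multiply, 64x64 -> 128 via 32-bit halves:

func hi32(x: uint64): uint64 = x shr 32
func lo32(x: uint64): uint64 = x and 0xFFFF_FFFF'u64

func extPrecMul(u, v: uint64): tuple[hi, lo: uint64] =
  const base = 1'u64 shl 32
  let
    x0 = lo32(u) * lo32(v)
    x1 = lo32(u) * hi32(v)
    x2 = hi32(u) * lo32(v)
    x3 = hi32(u) * hi32(v)
  var mid = x1 + hi32(x0)   # cannot carry: (2^32-1)^2 + (2^32-1) < 2^64
  mid += x2                 # but this can
  result.hi = x3
  if mid < x2:              # a lost 2^64 in the middle limb is worth base in the high word
    result.hi += base
  result.hi += hi32(mid)
  result.lo = (mid shl 32) or lo32(x0)

let (hi, lo) = extPrecMul(1'u64 shl 32, 1'u64 shl 33)   # 2^32 * 2^33 = 2^65
assert hi == 2 and lo == 0   # same values the removed self-test checked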
-# ############################################################
-#
-#         Non-constant-time portable div2n1n
-#
-# ############################################################
-
-# implementation from Stint: https://github.com/status-im/nim-stint/blob/edb1ade37309390cc641cee07ab62e5459d9ca44/stint/private/uint_div.nim#L131
-
-# func div2n1n[T: SomeUnsignedInt](q, r: var T, n_hi, n_lo, d: T) =
-#
-#   # assert countLeadingZeroBits(d) == 0, "Divisor was not normalized"
-#
-#   const
-#     size = bitsof(q)
-#     halfSize = size div 2
-#     halfMask = (1.T shl halfSize) - 1.T
-#
-#   template halfQR(n_hi, n_lo, d, d_hi, d_lo: T): tuple[q,r: T] =
-#
-#     var (q, r) = divmod(n_hi, d_hi)
-#     let m = q * d_lo
-#     r = (r shl halfSize) or n_lo
-#
-#     # Fix the remainder, we're at most 2 iterations off
-#     if r < m:
-#       dec q
-#       r += d
-#       if r >= d and r < m:
-#         dec q
-#         r += d
-#     r -= m
-#     (q, r)
-#
-#   let
-#     d_hi = d shr halfSize
-#     d_lo = d and halfMask
-#     n_lohi = n_lo shr halfSize
-#     n_lolo = n_lo and halfMask
-#
-#   # First half of the quotient
-#   let (q1, r1) = halfQR(n_hi, n_lohi, d, d_hi, d_lo)
-#
-#   # Second half
-#   let (q2, r2) = halfQR(r1, n_lolo, d, d_hi, d_lo)
-#
-#   q = (q1 shl halfSize) or q2
-#   r = r2
+else:
+  {.error: "Compiler not implemented".}
+  # For VCC and ICC use add_carry_u64, _add_carryx_u64 intrinsics
+  # and _umul128
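Any portable replacement like the commented-out div2n1n above can be validated at half scale against native 64-bit division, under the same preconditions (normalized divisor, n_hi < d). An illustrative test harness, not part of the library:

import std/random

proc div2n1nRef(n_hi, n_lo, d: uint32): tuple[q, r: uint32] =
  ## Oracle: the same 2-word by 1-word division done in native 64-bit arithmetic
  let n = (uint64(n_hi) shl 32) or uint64(n_lo)
  (uint32(n div uint64(d)), uint32(n mod uint64(d)))

var rng = initRand(2023)
for _ in 0 ..< 1000:
  let d = uint32(rng.next() and 0xFFFF_FFFF'u64) or 0x8000_0000'u32  # normalized: top bit set
  let n_lo = uint32(rng.next() and 0xFFFF_FFFF'u64)
  let n_hi = uint32(rng.next() and 0xFFFF_FFFF'u64) mod d            # precondition: n_hi < d
  let (q, r) = div2n1nRef(n_hi, n_lo, d)
  let n = (uint64(n_hi) shl 32) or uint64(n_lo)
  assert uint64(q) * uint64(d) + uint64(r) == n   # the defining identity
  assert r < d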