First steps in using uint64 words

Mamy André-Ratsimbazafy 2020-02-29 02:10:55 +01:00
parent 05bce529b4
commit 88d4a58a10
No known key found for this signature in database
GPG Key ID: 7B88AD1FE79492E1
5 changed files with 115 additions and 267 deletions

View File

@@ -138,7 +138,7 @@ template bitSizeof(v: BigIntViewAny): uint32 =
bind BigIntView
distinctBase(type v)(v).bitLength
const divShiftor = log2(WordPhysBitSize)
const divShiftor = log2(uint32(WordPhysBitSize))
template numLimbs*(v: BigIntViewAny): int =
## Compute the number of limbs from
## the **internal** bitlength
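(With a 64-bit physical word, divShiftor = log2(uint32(64)) = 6, so dividing the internal bit length by WordPhysBitSize becomes a right shift: for example, 256 shr 6 = 4 limbs.)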
@@ -447,9 +447,7 @@ func shlAddMod(a: BigIntViewMut, c: Word, M: BigIntViewConst) =
block: # q*p
# q * p + carry (doubleword) carry from previous limb
let qp = unsafeExtPrecMul(q, M[i]) + carry.DoubleWord
carry = Word(qp shr WordBitSize) # New carry: high digit besides LSB
qp_lo = qp.Word.mask() # Normalize to u63
unsafeFMA(carry, qp_lo, q, M[i], carry)
block: # a*2^63 - q*p
a[i] -= qp_lo
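As a worked check of the unsafeFMA replacement (demo values): with q = 3, M[i] = 2^62 and an incoming carry of 5, q*M[i] + carry = 2^63 + 2^62 + 5, so the call leaves carry = 1 (the bits at position 63 and above) and qp_lo = 2^62 + 5 (the low 63 bits), exactly the split the removed DoubleWord arithmetic computed.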
@@ -551,28 +549,23 @@ func montyMul*(
r.setBitLength(bitSizeof(M))
setZero(r)
var partials: array[14, DoubleWord]
var r_hi = Zero # represents the high word that is used in intermediate computation before reduction mod M
for i in 0 ..< nLen:
let zi = (Word(partials[0]) + wordMul(a[i], b[0])).wordMul(negInvModWord)
let zi = (r[0] + wordMul(a[i], b[0])).wordMul(negInvModWord)
var carry, z = Zero
unsafeFMA2(carry, z, a[i], b[0], zi, M[0], r[0], carry)
for j in 0 ..< nLen:
partials[j] += unsafeExtPrecMul(a[i], b[j]) + unsafeExtPrecMul(zi, M[j])
for j in 1 ..< nLen:
unsafeFMA2(carry, r[j-1], a[i], b[j], zi, M[j], r[j], carry)
var carry = partials[0] shr WordBitSize
for j in 1 .. nlen: # we need 1 extra temporary at nlen
partials[j] += carry
carry = partials[j] shr WordBitSize
partials[j-1] = partials[j] and DoubleWord(MaxWord)
partials[nlen] = partials[nlen] shr WordBitSize
for i in 0 ..< nLen:
r[i] = Word(partials[i])
r_hi += carry
r[^1] = r_hi.mask()
r_hi = r_hi shr WordBitSize
# If the extra word is not zero, or if r-M does not borrow (i.e. r >= M),
# then subtract M
discard r.csub(M, CTBool[Word](partials[nLen].isNonZero()) or not r.csub(M, CtFalse))
discard r.csub(M, r_hi.isNonZero() or not r.csub(M, CtFalse))
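A minimal sketch of the new word-by-word loop, with 8-bit logical words and plain ints instead of the library's Ct types (word size, modulus and operands are demo assumptions, not the library's API):

const
  WB = 8                        # demo logical word size (63 in this commit)
  Mask = (1 shl WB) - 1
  nLen = 2

proc montyMulDemo(a, b, M: array[nLen, int], negInvModWord: int): array[nLen, int] =
  ## r <- a * b * 2^(-WB*nLen) mod M, same zi / carry / r_hi structure as above
  var r = [0, 0]
  var r_hi = 0
  for i in 0 ..< nLen:
    let zi = ((r[0] + a[i]*b[0]) * negInvModWord) and Mask
    # first column: the low word is zero by construction of zi, keep the carry
    var carry = (a[i]*b[0] + zi*M[0] + r[0]) shr WB
    for j in 1 ..< nLen:
      let t = a[i]*b[j] + zi*M[j] + r[j] + carry
      r[j-1] = t and Mask
      carry = t shr WB
    r_hi += carry
    r[nLen-1] = r_hi and Mask
    r_hi = r_hi shr WB
  if r_hi != 0 or (r[1], r[0]) >= (M[1], M[0]):  # subtract M once if r >= M
    var borrow = 0
    for j in 0 ..< nLen:
      let d = r[j] - M[j] - borrow
      r[j] = d and Mask
      borrow = if d < 0: 1 else: 0
  return r

when isMainModule:
  # M = 571 = [59, 2] little-endian, -M[0]^-1 mod 2^8 = 13,
  # and 123 * 456 * 2^-16 mod 571 = 269 = [13, 1]
  doAssert montyMulDemo([123, 0], [200, 1], [59, 2], 13) == [13, 1]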
func redc*(r: BigIntViewMut, a: BigIntViewAny, one, N: BigIntViewConst, negInvModWord: Word) {.inline.} =
## Transform a bigint ``a`` from its Montgomery N-residue representation (mod N)

View File

@@ -126,7 +126,7 @@ func negInvModWord*(M: BigInt): BaseType =
let
M0 = BaseType(M.limbs[0])
k = log2(WordPhysBitSize)
k = log2(uint32(WordPhysBitSize))
result = M0 # Start from an inverse of M0 modulo 2: M0 is odd and its own inverse
for _ in 0 ..< k: # each iteration doubles the precision of the inverse
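A plain-uint64 sketch of this Hensel lifting (function name and sample modulus word are illustrative assumptions):

func negInvModWordSketch(m0: uint64): uint64 =
  ## Computes -M0^-1 mod 2^64 for an odd M0 by Newton/Hensel iteration:
  ## each step doubles the number of correct low bits of the inverse.
  doAssert (m0 and 1) == 1      # M must be odd
  var inv = m0                  # m0 is its own inverse mod 8
  for _ in 0 ..< 6:             # k = log2(64) = 6 doublings
    inv = inv * (2'u64 - m0 * inv)
  result = not(inv) + 1         # negate mod 2^64

when isMainModule:
  let m0 = 0xFFFFFFFF00000001'u64   # demo value, any odd word works
  doAssert m0 * negInvModWordSketch(m0) == high(uint64)  # i.e. -1 mod 2^64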

View File

@@ -14,12 +14,11 @@
import ../primitives/constant_time
type Word* = Ct[uint32]
type Word* = Ct[uint64]
## Logical BigInt word
## A logical BigInt word is one bit smaller than the physical machine word
type DoubleWord* = Ct[uint64]
type BaseType* = uint32
type BaseType* = uint64
## Physical BigInt word, used for conversion to "normal" integers
const

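The constant definitions are truncated by the diff; with a 64-bit word they presumably become something like the sketch below (names inferred from their usage elsewhere in this commit — Zero, MaxWord, WordPhysBitSize, WordBitSize — the exact definitions may differ):

const
  WordPhysBitSize* = sizeof(Word) * 8        # 64: physical word size
  WordBitSize* = WordPhysBitSize - 1         # 63: logical word size
  Zero* = Ct[uint64](0)
  MaxWord* = Ct[uint64]((1'u64 shl 63) - 1)  # mask of a logical word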
View File

@@ -11,7 +11,6 @@
# Constant-time primitives
#
# ############################################################
type
BaseUint* = SomeUnsignedInt or byte
@@ -168,6 +167,26 @@ func log2*(x: uint32): uint32 =
v = v or v shr 16
lookup[(v * 0x07C4ACDD'u32) shr 27]
func log2*(x: uint64): uint64 {.inline, noSideEffect.} =
## Find the log base 2 of a 64-bit or less integer.
## using De Bruijn multiplication
## Works at compile-time, guaranteed constant-time.
## Note: at runtime BitScanReverse or CountLeadingZero are more efficient
## but log2 is never needed at runtime.
# https://graphics.stanford.edu/%7Eseander/bithacks.html#IntegerLogDeBruijn
const lookup: array[64, uint8] = [0'u8, 58, 1, 59, 47, 53, 2, 60, 39, 48, 27, 54,
33, 42, 3, 61, 51, 37, 40, 49, 18, 28, 20, 55, 30, 34, 11, 43, 14, 22, 4, 62,
57, 46, 52, 38, 26, 32, 41, 50, 36, 17, 19, 29, 10, 13, 21, 56, 45, 25, 31,
35, 16, 9, 12, 44, 24, 15, 8, 23, 7, 6, 5, 63]
var v = x
v = v or v shr 1 # first round down to one less than a power of 2
v = v or v shr 2
v = v or v shr 4
v = v or v shr 8
v = v or v shr 16
v = v or v shr 32
lookup[(v * 0x03F6EAF2CD271461'u64) shr 58]
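A few sanity checks of the intended floor-log2 behaviour (demo values only):

when isMainModule:
  doAssert log2(1'u64) == 0
  doAssert log2(64'u64) == 6        # the divShiftor value for WordPhysBitSize = 64
  doAssert log2(high(uint64)) == 63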
# ############################################################
#
# Hardened Boolean primitives

View File

@@ -20,10 +20,6 @@ import ./constant_time
#
# ############################################################
template unsafeExtPrecMul*(a, b: Ct[uint32]): Ct[uint64] =
## Extended precision multiplication uint32 * uint32 --> uint64
Ct[uint64](uint64(a) * uint64(b))
func unsafeDiv2n1n*(q, r: var Ct[uint32], n_hi, n_lo, d: Ct[uint32]) {.inline.}=
## Division uint64 by uint32
## Warning ⚠️ :
@@ -35,261 +31,102 @@ func unsafeDiv2n1n*(q, r: var Ct[uint32], n_hi, n_lo, d: Ct[uint32]) {.inline.}=
## so that the most significant bit in d is set.
# TODO !!! - Replace by constant-time, portable, non-assembly version
# -> use uint128? Compiler might add unwanted branches
{.warning: "unsafeDiv2n1n is not constant-time at the moment on most hardware".}
let dividend = (uint64(n_hi) shl 32) or uint64(n_lo)
let divisor = uint64(d)
q = (Ct[uint32])(dividend div divisor)
r = (Ct[uint32])(dividend mod divisor)
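For instance (demo values; d = 3 is not normalized as the docstring asks, but the quotient fits in 32 bits, which is all this raw division needs): (n_hi, n_lo) = (1, 0) encodes 2^32, and 2^32 = 3 * 1431655765 + 1:

when isMainModule:
  var q, r: Ct[uint32]
  unsafeDiv2n1n(q, r, Ct[uint32](1), Ct[uint32](0), Ct[uint32](3))
  doAssert uint32(q) == 1431655765'u32 and uint32(r) == 1'u32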
template unsafeFMA*(hi, lo: var Ct[uint32], a, b, c: Ct[uint32]) =
## Extended precision multiplication + addition
## This is constant-time on most hardware, except on some specific cores like the Cortex-M0
## (hi, lo) <- a*b + c
block:
# Note: since a and b use 31-bit,
# the result is 62-bit and carrying cannot overflow
let dblPrec = uint64(a) * uint64(b) + uint64(c)
hi = Ct[uint32](dblPrec shr 31)
lo = Ct[uint32](dblPrec) and Ct[uint32](1 shl 31 - 1)
template unsafeFMA2*(hi, lo: var Ct[uint32], a1, b1, a2, b2, c1, c2: Ct[uint32]) =
## (hi, lo) <- a1 * b1 + a2 * b2 + c1 + c2
block:
# TODO: Can this overflow?
let dblPrec = uint64(a1) * uint64(b1) +
uint64(a2) * uint64(b2) +
uint64(c1) +
uint64(c2)
hi = Ct[uint32](dblPrec shr 31)
lo = Ct[uint32](dblPrec) and Ct[uint32](1 shl 31 - 1)
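On the overflow question: with 31-bit logical words the two products contribute at most 2*(2^31 - 1)^2 = 2^63 - 2^33 + 2, and even if c1 and c2 are full 32-bit values the total is at most 2^63 - 2^33 + 2 + 2*(2^32 - 1) = 2^63, which fits the uint64 accumulator. The same computation with 63-bit words bounds the uint128 variant below by 2^127.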
# ############################################################
#
# 64-bit words
#
# ############################################################
# func asm_x86_64_extMul(hi, lo: var uint64, a, b: uint64) {.inline.}=
# ## Extended precision multiplication uint64 * uint64 --> uint128
when defined(gcc) or defined(clang) or defined(llvm_gcc):
type
uint128*{.importc: "unsigned __int128".} = object
# # TODO !!! - Replace by constant-time, portable, non-assembly version
# # -> use uint128? Compiler might add unwanted branches
func unsafeDiv2n1n*(q, r: var Ct[uint64], n_hi, n_lo, d: Ct[uint64]) {.inline.}=
## Division uint128 by uint64
## Warning ⚠️ :
## - if n_hi == d, quotient does not fit in a uint64 and will throw SIGFPE
## - if n_hi > d result is undefined
{.warning: "unsafeDiv2n1n is not constant-time at the moment on most hardware".}
# # MUL r/m64
# # Multiply RAX by r/m64
# #
# # Inputs:
# # - RAX
# # - r/m
# # Outputs:
# # - High word in RDX
# # - Low word in RAX
# TODO !!! - Replace by constant-time, portable, non-assembly version
# -> use uint128? Compiler might add unwanted branches
# # Don't forget to dereference the var hidden pointer in hi/lo
# asm """
# mulq %[operand]
# : "=d" (`*hi`), "=a" (`*lo`)
# : "a" (`a`), [operand] "rm" (`b`)
# :
# """
# DIV r/m64
# Divide RDX:RAX (n_hi:n_lo) by r/m64
#
# Inputs
# - numerator high word in RDX,
# - numerator low word in RAX,
# - divisor as r/m parameter (register or memory at the compiler discretion)
# Result
# - Quotient in RAX
# - Remainder in RDX
# func unsafeExtPrecMul*(hi, lo: var Ct[uint64], a, b: Ct[uint64]) {.inline.}=
# ## Extended precision multiplication uint64 * uint64 --> uint128
# ##
# ## TODO, at the moment only the x86_64 architecture is supported
# ## as we use assembly.
# ## Also we assume that the native integer division
# ## provided by the CPU is constant-time
# 1. name the register/memory "divisor"
# 2. don't forget to dereference the var hidden pointer
# 3. -
# 4. no clobbered registers besides explicitly used RAX and RDX
asm """
divq %[divisor]
: "=a" (`*q`), "=d" (`*r`)
: "d" (`n_hi`), "a" (`n_lo`), [divisor] "rm" (`d`)
:
"""
# # Note, using C/Nim default `*` is inefficient
# # and complicated to make constant-time
# # See at the bottom.
template unsafeFMA*(hi, lo: var Ct[uint64], a, b, c: Ct[uint64]) =
## Extended precision multiplication + addition
## This is constant-time on most hardware, except on some specific cores like the Cortex-M0
## (hi, lo) <- a*b + c
block:
# Note: since a and b use 63-bit,
# the result is 126-bit and carrying cannot overflow
var dblPrec {.noInit.}: uint128
{.emit:[dblPrec, " = (unsigned __int128)", a," * (unsigned __int128)", b, " + (unsigned __int128)",c,";"].}
# type T = uint64
{.emit:[hi, " = (NU64)(", dblPrec," >> ", 63'u64, ");"].}
{.emit:[lo, " = (NU64)", dblPrec," & ", 1'u64 shl 63 - 1, ";"].}
# when not defined(amd64):
# {.error: "At the moment only x86_64 architecture is supported".}
# else:
# asm_x86_64_extMul(T(hi), T(lo), T(a), T(b))
template unsafeFMA2*(hi, lo: var Ct[uint64], a1, b1, a2, b2, c1, c2: Ct[uint64]) =
## (hi, lo) <- a1 * b1 + a2 * b2 + c1 + c2
block:
# TODO: Can this overflow?
var dblPrec: uint128
{.emit:[dblPrec, " = (unsigned __int128)", a1," * (unsigned __int128)", b1,
" + (unsigned __int128)", a2," * (unsigned __int128)", b2,
" + (unsigned __int128)", c1,
" + (unsigned __int128)", c2, ";"].}
{.emit:[hi, " = (NU64)", dblPrec," >> ", 63'u64, ";"].}
{.emit:[lo, " = (NU64)", dblPrec," & ", 1'u64 shl 63 - 1, ";"].}
# func asm_x86_64_div2n1n(q, r: var uint64, n_hi, n_lo, d: uint64) {.inline.}=
# ## Division uint128 by uint64
# ## Warning ⚠️ :
# ## - if n_hi == d, quotient does not fit in an uint64
# ## - if n_hi > d result is undefined
# # TODO !!! - Replace by constant-time, portable, non-assembly version
# # -> use uint128? Compiler might add unwanted branches
# # DIV r/m64
# # Divide RDX:RAX (n_hi:n_lo) by r/m64
# #
# # Inputs
# # - numerator high word in RDX,
# # - numerator low word in RAX,
# # - divisor as r/m parameter (register or memory at the compiler discretion)
# # Result
# # - Quotient in RAX
# # - Remainder in RDX
# # 1. name the register/memory "divisor"
# # 2. don't forget to dereference the var hidden pointer
# # 3. -
# # 4. no clobbered registers besides explicitly used RAX and RDX
# asm """
# divq %[divisor]
# : "=a" (`*q`), "=d" (`*r`)
# : "d" (`n_hi`), "a" (`n_lo`), [divisor] "rm" (`d`)
# :
# """
# func unsafeDiv2n1n*(q, r: var Ct[uint64], n_hi, n_lo, d: Ct[uint64]) {.inline.}=
# ## Division uint128 by uint64
# ## Warning ⚠️ :
# ## - if n_hi == d, quotient does not fit in an uint64
# ## - if n_hi > d result is undefined
# ##
# ## To avoid issues, n_hi, n_lo, d should be normalized.
# ## i.e. shifted (== multiplied by the same power of 2)
# ## so that the most significant bit in d is set.
# ##
# ## TODO, at the moment only the x86_64 architecture is supported
# ## as we use assembly.
# ## Also we assume that the native integer division
# ## provided by the CPU is constant-time
# # Note, using C/Nim default `div` is inefficient
# # and complicated to make constant-time
# # See at the bottom.
# #
# # Furthermore compilers may try to substitute division
# # with a fast path that may have branches. It might also
# # be the same at the hardware level.
# # TODO !!! - Replace by constant-time, portable, non-assembly version
# # -> use uint128? Compiler might add unwanted branches
# type T = uint64
# when not defined(amd64):
# {.error: "At the moment only x86_64 architecture is supported".}
# else:
# asm_x86_64_div2n1n(T(q), T(r), T(n_hi), T(n_lo), T(d))
when isMainModule:
block: # Multiplication
var hi, lo: uint64
asm_x86_64_extMul(hi, lo, 1 shl 32, 1 shl 33) # 2^65
doAssert hi == 2
doAssert lo == 0
block: # Division
var q, r: uint64
# (1 shl 64) div 3
let n_hi = 1'u64
let n_lo = 0'u64
let d = 3'u64
asm_x86_64_div2n1n(q, r, n_hi, n_lo, d)
doAssert q == 6148914691236517205'u64
doAssert r == 1
# block: # TODO - support Quotient that doesn't fit in the result
# # The usual way with normalization by the bitSize difference
# # is fundamentally non constant-time
# # it is probable that division is not constant-time at the hardware level as well
# # as it throws sigfpe when the quotient doesn't fit in the result size
# var q, r: uint64
# let n_hi = 1'u64
# let n_lo = 0'u64
# let d = 1'u64
# asm_x86_64_div2n1n(q, r, n_hi, n_lo, d)
# echo "quotient: ", q
# echo "remainder: ", r
# block:
# var q, r: uint64
# let n_hi = 4186590388502004879'u64
# let n_lo = 17852795547484522084'u64
# let d = 327340459940166448'u64
# asm_x86_64_div2n1n(q, r, n_hi, n_lo, d)
# echo "quotient: ", q
# echo "remainder: ", r
# ##############################################################
#
# Non-constant-time portable extended precision multiplication
#
# ##############################################################
# implementation from Stint: https://github.com/status-im/nim-stint/blob/edb1ade37309390cc641cee07ab62e5459d9ca44/stint/private/uint_mul.nim#L91-L135
# template extPrecMulImpl(result: var UintImpl[uint64], op: untyped, u, v: uint64) =
# const
# p = 64 div 2
# base: uint64 = 1 shl p
#
# var
# x0, x1, x2, x3: uint64
#
# let
# ul = lo(u)
# uh = hi(u)
# vl = lo(v)
# vh = hi(v)
#
# x0 = ul * vl
# x1 = ul * vh
# x2 = uh * vl
# x3 = uh * vh
#
# x1 += hi(x0) # This can't carry
# x1 += x2 # but this can
# if x1 < x2: # if carry, add it to x3
# x3 += base
#
# op(result.hi, x3 + hi(x1))
# op(result.lo, (x1 shl p) or lo(x0))
#
# func extPrecMul*(result: var UintImpl[uint64], u, v: uint64) =
# ## Extended precision multiplication
# extPrecMulImpl(result, `=`, u, v)
#
# func extPrecAddMul(result: var UintImpl[uint64], u, v: uint64) =
# ## Extended precision fused in-place addition & multiplication
# extPrecMulImpl(result, `+=`, u, v)
# ############################################################
#
# Non-constant-time portable div2n1n
#
# ############################################################
# implementation from Stint: https://github.com/status-im/nim-stint/blob/edb1ade37309390cc641cee07ab62e5459d9ca44/stint/private/uint_div.nim#L131
# func div2n1n[T: SomeUnsignedInt](q, r: var T, n_hi, n_lo, d: T) =
#
# # assert countLeadingZeroBits(d) == 0, "Divisor was not normalized"
#
# const
# size = bitsof(q)
# halfSize = size div 2
# halfMask = (1.T shl halfSize) - 1.T
#
# template halfQR(n_hi, n_lo, d, d_hi, d_lo: T): tuple[q,r: T] =
#
# var (q, r) = divmod(n_hi, d_hi)
# let m = q * d_lo
# var r = (r shl halfSize) or n_lo
#
# # Fix the remainder, we're at most 2 iterations off
# if r < m:
# dec q
# r += d
# if r >= d and r < m:
# dec q
# r += d
# r -= m
# (q, r)
#
# let
# d_hi = d shr halfSize
# d_lo = d and halfMask
# n_lohi = n_lo shr halfSize
# n_lolo = n_lo and halfMask
#
# # First half of the quotient
# let (q1, r1) = halfQR(n_hi, n_lohi, d, d_hi, d_lo)
#
# # Second half
# let (q2, r2) = halfQR(r1, n_lolo, d, d_hi, d_lo)
#
# q = (q1 shl halfSize) or q2
# r = r2
else:
{.error: "Compiler not implemented".}
# For VCC and ICC use add_carry_u64, _add_carryx_u64 intrinsics
# and _umul128