Implement types and uint128 primitives

Mamy André-Ratsimbazafy 2020-06-12 18:37:02 +02:00 committed by jangko
parent c0ae9e10a9
commit de87739635
7 changed files with 572 additions and 228 deletions


@@ -1,59 +0,0 @@
# Stint
# Copyright 2018 Status Research & Development GmbH
# Licensed under either of
#
# * Apache License, version 2.0, ([LICENSE-APACHE](LICENSE-APACHE) or http://www.apache.org/licenses/LICENSE-2.0)
# * MIT license ([LICENSE-MIT](LICENSE-MIT) or http://opensource.org/licenses/MIT)
#
# at your option. This file may not be copied, modified, or distributed except according to those terms.
import ./datatypes
func toSubtype*[T: SomeInteger](b: bool, _: typedesc[T]): T {.inline.}=
b.T
func toSubtype*[T: UintImpl | IntImpl](b: bool, _: typedesc[T]): T {.inline.}=
type SubTy = type result.lo
result.lo = toSubtype(b, SubTy)
func toUint*(n: UintImpl or IntImpl or SomeSignedInt): auto {.inline.}=
## Casts an integer to an uint of the same size
# TODO: uint128 support
when n.sizeof > 8:
{.fatal: "Unreachable. You are trying to cast a StUint with more than 64-bit of precision" .}
elif n.sizeof == 8:
cast[uint64](n)
elif n.sizeof == 4:
cast[uint32](n)
elif n.sizeof == 2:
cast[uint16](n)
else:
cast[uint8](n)
func toUint*(n: SomeUnsignedInt): SomeUnsignedInt {.inline.}=
## No-op overload of multi-precision int casting
n
func asDoubleUint*(n: UintImpl | SomeUnsignedInt): auto {.inline.} =
## Convert an integer or StUint to an uint with double the size
type Double = (
when n.sizeof == 4: uint64
elif n.sizeof == 2: uint32
else: uint16
)
n.toUint.Double
func toInt*(n: UintImpl or IntImpl or SomeInteger): auto {.inline.}=
## Casts an integer to an int of the same size
# TODO: uint128 support
when n.sizeof > 8:
{.fatal: "Unreachable. You are trying to cast a StUint with more than 64-bit of precision" .}
elif n.sizeof == 8:
cast[int64](n)
elif n.sizeof == 4:
cast[int32](n)
elif n.sizeof == 2:
cast[int16](n)
else:
cast[int8](n)


@@ -7,191 +7,56 @@
#
# at your option. This file may not be copied, modified, or distributed except according to those terms.
# TODO: test if GCC/Clang support uint128 natively
# #### Overview
#
# Stint extends the default uint8, uint16, uint32, uint64 with power-of-2 integers.
# The only limitation is your stack size, so you can have uint128, uint256, uint512 ...
# Signed ints are also possible.
#
# As a high-level API, Stint adheres to Nim and C conventions and uses the same operators,
# like `+`, `xor`, `not` ...
#
# #### Implementation
#
# Stint types are stored on the stack and have a structure
# similar to a binary tree of power of two unsigned integers
# with "high" and "low" words:
#
# Stuint[256]
# hi: Stuint[128] lo: Stuint[128]
# hihi: uint64 hilo: uint64 lohi: uint64 lolo: uint64
#
# This follows the paper https://hal.archives-ouvertes.fr/hal-00582593v2
# "Recursive double-size fixed precision arithmetic" from Jul. 2016
# to implement an efficient fixed-precision bigint for embedded devices, especially FPGAs.
#
# For testing purposes, the flag `-d:stint_test` can be passed at compile time
# to switch the backend to uint32.
# In the future the default backend will become uint128 on supporting compilers.
#
# This has the following benefits:
#   - BigEndian/LittleEndian support is trivial.
#   - Not having for loops helps the compiler produce the most efficient instructions,
#     like ADC (Add with Carry).
#   - Proving that the recursive structure works at depth 64 for the uint32 backend means that
#     it would work at depth 128 for the uint64 backend.
#     We can easily choose a uint16 or uint8 backend as well.
#   - Due to the recursive structure, testing operations when there is:
#     - no leaves (uint64)
#     - a root and leaves with no nodes (uint128)
#     - a root + intermediate nodes + leaves (uint256)
#     should be enough to ensure they work at all sizes, edge cases included.
#   - Adding a new backend like uint128 (GCC/Clang) or uint256 (LLVM intrinsics only) is just adding
#     a new case in the `uintImpl` template.
#   - All math implementations of the operations have a straightforward translation
#     to a high-low structure, including the fastest Karatsuba multiplication
#     and the co-recursive division algorithm by Burnikel and Ziegler.
#     This makes translating those algorithms into Nim easier compared to an array backend.
#     It would also probably require less code and would be much easier to audit against
#     the math reference papers.
#   - When implementing algorithms, there is no issue taking subslices of the memory representation
#     with a recursive tree structure.
#     On the other hand, returning a `var array[N div 2, uint64]` is problematic at the moment.
#   - Compile-time computation is possible, while due to the previous issue
#     an array backend would have to use var openArray[uint64],
#     i.e. pointers.
#   - Note that while shift-right and shift-left can easily be done on an array of bytes,
#     this would have reduced performance compared to moving 64-bit words.
#     An efficient implementation on an array of words would require checking the shift
#     against a half-word to deal with carry-in/out from and to the adjacent words,
#     similar to a recursive implementation.
#
# Iteration over the whole integer, for example for `==`, is always unrolled.
# Since the data lives on the stack, any optimizing compiler should compile that to an efficient memcmp.
#
# When not to use Stint:
#
# 1. Constant-time arithmetic
#    - Do not use Stint if you need side-channel resistance;
#      this requires avoiding all branches (i.e. no booleans).
# 2. Arbitrary precision with lots of small values
#    - If you need arbitrary precision but most of the time you store small values,
#      you will waste a lot of memory unless you use an object variant of various Stint sizes.
# type MyUint = object
# case kind: int
# of 0..64: uint64
# of 65..128: ref Stuint[128]
# of 129..256: ref Stuint[256]
# ...
#
# Note: if you work with huge sizes, you can allocate stints on the heap,
# for example `type HeapInt8192 = ref Stint[8192]`.
# If you have a lot of computations and intermediate variables, it's probably worthwhile
# to explore creating an object pool to reuse the memory buffers.
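As a rough illustration of the recursive hi/lo layout described above (a sketch only, not part of this commit; it assumes the uint64 backend and ignores the endianness-dependent field order):

```nim
# Illustrative sketch of how uintImpl(256) nests (uint64 backend assumed).
type
  U128 = object
    hi, lo: uint64       # the leaves
  U256 = object
    hi, lo: U128         # hi.hi, hi.lo, lo.hi, lo.lo are the four uint64 leaves

# A 256-bit value `a` is then addressed as a.hi.hi, a.hi.lo, a.lo.hi, a.lo.lo.
```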
template checkDiv2(bits: static[int]): untyped =
# TODO: There is no need to check for a power of 2 at each uintImpl instantiation; it slows compilation.
#       However, we easily get into nested template instantiation if we use another
#       template that first checks power-of-two and then calls the recursive uintImpl
static:
doAssert (bits and (bits-1)) == 0, $bits & " is not a power of 2"
doAssert bits >= 8, "The number of bits should be greater than or equal to 8"
bits div 2
when defined(mpint_test): # TODO stint_test
template uintImpl*(bits: static[int]): untyped =
# Test version, StUint[64] = 2 uint32. Test the logic of the library
when bits >= 128: UintImpl[uintImpl(checkDiv2(bits))]
elif bits == 64: UintImpl[uint32]
elif bits == 32: UintImpl[uint16]
elif bits == 16: UintImpl[uint8]
else: {.fatal: "Only power-of-2 >=16 supported when testing" .}
template intImpl*(bits: static[int]): untyped =
# Test version, StInt[64] = 2 uint32. Test the logic of the library
# int is implemented using a signed hi part and an unsigned lo part, given
# that the sign resides in hi
when bits >= 128: IntImpl[intImpl(checkDiv2(bits)), uintImpl(checkDiv2(bits))]
elif bits == 64: IntImpl[int32, uint32]
elif bits == 32: IntImpl[int16, uint16]
elif bits == 16: IntImpl[int8, uint8]
else: {.fatal: "Only power-of-2 >=16 supported when testing" .}
import
# Status lib
stew/bitops2
when sizeof(int) == 8 and not defined(Stint32):
type Word* = uint64
else:
template uintImpl*(bits: static[int]): untyped =
mixin UintImpl
when bits >= 128: UintImpl[uintImpl(checkDiv2(bits))]
elif bits == 64: uint64
elif bits == 32: uint32
elif bits == 16: uint16
elif bits == 8: uint8
else: {.fatal: "Only power-of-2 >=8 supported" .}
type Word* = uint32
template intImpl*(bits: static[int]): untyped =
# int is implemented using a signed hi part and an unsigned lo part, given
# that the sign resides in hi
type Word* = uint32
when bits >= 128: IntImpl[intImpl(checkDiv2(bits)), uintImpl(checkDiv2(bits))]
elif bits == 64: int64
elif bits == 32: int32
elif bits == 16: int16
elif bits == 8: int8
else: {.fatal: "Only power-of-2 >=8 supported" .}
func wordsRequired*(bits: int): int {.compileTime.} =
## Compute the number of limbs required
## from the **announced** bit length
(bits + WordBitWidth - 1) div WordBitWidth
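A quick worked example of the formula (assuming a 64-bit `Word`, i.e. `WordBitWidth == 64`, which is defined elsewhere in the module):

```nim
static:
  # (bits + WordBitWidth - 1) div WordBitWidth rounds up to whole limbs
  doAssert wordsRequired(256) == 4   # (256 + 63) div 64
  doAssert wordsRequired(72) == 2    # (72 + 63) div 64, partial limbs round up
```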
type
# ### Private ### #
UintImpl*[BaseUint] = object
when system.cpuEndian == littleEndian:
lo*, hi*: BaseUint
else:
hi*, lo*: BaseUint
IntImpl*[BaseInt, BaseUint] = object
# Ints are implemented in terms of uints
when system.cpuEndian == littleEndian:
lo*: BaseUint
hi*: BaseInt
else:
hi*: BaseInt
lo*: BaseUint
# ### Private ### #
Limbs*[N: static int] = array[N, Word]
StUint*[bits: static[int]] = object
data*: uintImpl(bits)
## Stack-based integer
## Unsigned
limbs*: Limbs[bits.wordsRequired]
StInt*[bits: static[int]] = object
data*: intImpl(bits)
## Stack-based integer
## Signed
limbs*: Limbs[bits.wordsRequired]
Carry* = uint8 # distinct range[0'u8 .. 1]
Borrow* = uint8 # distinct range[0'u8 .. 1]
template applyHiLo*(a: UintImpl | IntImpl, c: untyped): untyped =
  ## Apply `c` to each of `hi` and `lo`
  var res: type a
  res.hi = c(a.hi)
  res.lo = c(a.lo)
  res
template applyHiLo*(a, b: UintImpl | IntImpl, c: untyped): untyped =
  ## Apply `c` to each of `hi` and `lo`
  var res: type a
  res.hi = c(a.hi, b.hi)
  res.lo = c(a.lo, b.lo)
  res
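A hypothetical usage sketch (not from this commit) showing how these templates lift operations onto the recursive hi/lo representation:

```nim
# Sketch: lifting bitwise operations over the hi/lo halves with applyHiLo.
func `not`*[T](x: UintImpl[T]): UintImpl[T] {.inline.} =
  applyHiLo(x, `not`)          # recurses until the uint64/uint32 leaves

func `xor`*[T](x, y: UintImpl[T]): UintImpl[T] {.inline.} =
  applyHiLo(x, y, `xor`)
```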
const GCC_Compatible* = defined(gcc) or defined(clang) or defined(llvm_gcc)
const X86* = defined(amd64) or defined(i386)
when sizeof(int) == 8 and GCC_Compatible:
type
uint128*{.importc: "unsigned __int128".} = object
template leastSignificantWord*(num: SomeInteger): auto =
num
func leastSignificantWord*(num: UintImpl | IntImpl): auto {.inline.} =
  when num.lo is UintImpl:
    num.lo.leastSignificantWord
  else:
    num.lo
func leastSignificantWord*(limbs: Limbs): auto {.inline.} =
  when cpuEndian == littleEndian:
    limbs[0]
  else:
    limbs[^1]
func mostSignificantWord*(num: UintImpl | IntImpl): auto {.inline.} =
  when num.hi is (UintImpl | IntImpl):
    num.hi.mostSignificantWord
  else:
    num.hi
func mostSignificantWord*(limbs: Limbs): auto {.inline.} =
  when cpuEndian == littleEndian:
    limbs[^1]
  else:
    limbs[0]


@@ -0,0 +1,169 @@
# Stint
# Copyright 2018 Status Research & Development GmbH
# Licensed under either of
#
# * Apache License, version 2.0, ([LICENSE-APACHE](LICENSE-APACHE) or http://www.apache.org/licenses/LICENSE-2.0)
# * MIT license ([LICENSE-MIT](LICENSE-MIT) or http://opensource.org/licenses/MIT)
#
# at your option. This file may not be copied, modified, or distributed except according to those terms.
import ../datatypes
# ############################################################
#
# Add-with-carry and Sub-with-borrow
#
# ############################################################
#
# This file implements add-with-carry and sub-with-borrow
#
# It is currently (Mar 2020) impossible to have the compiler
# generate optimal code in a generic way.
#
# On x86, addcarry_u64 intrinsic will generate optimal code
# except for GCC.
#
# On other CPU architectures inline assembly might be desirable.
# A compiler proof-of-concept is available in the "research" folder.
#
# See https://gcc.godbolt.org/z/2h768y
# ```C
# #include <stdint.h>
# #include <x86intrin.h>
#
# void add256(uint64_t a[4], uint64_t b[4]){
# uint8_t carry = 0;
# for (int i = 0; i < 4; ++i)
# carry = _addcarry_u64(carry, a[i], b[i], &a[i]);
# }
# ```
#
# GCC
# ```asm
# add256:
# movq (%rsi), %rax
# addq (%rdi), %rax
# setc %dl
# movq %rax, (%rdi)
# movq 8(%rdi), %rax
# addb $-1, %dl
# adcq 8(%rsi), %rax
# setc %dl
# movq %rax, 8(%rdi)
# movq 16(%rdi), %rax
# addb $-1, %dl
# adcq 16(%rsi), %rax
# setc %dl
# movq %rax, 16(%rdi)
# movq 24(%rsi), %rax
# addb $-1, %dl
# adcq %rax, 24(%rdi)
# ret
# ```
#
# Clang
# ```asm
# add256:
# movq (%rsi), %rax
# addq %rax, (%rdi)
# movq 8(%rsi), %rax
# adcq %rax, 8(%rdi)
# movq 16(%rsi), %rax
# adcq %rax, 16(%rdi)
# movq 24(%rsi), %rax
# adcq %rax, 24(%rdi)
# retq
# ```
# ############################################################
#
# Intrinsics
#
# ############################################################
# Note: GCC before 2017 had incorrect codegen in some cases:
# - https://gcc.gnu.org/bugzilla/show_bug.cgi?id=81300
when X86:
when defined(windows):
{.pragma: intrinsics, header:"<intrin.h>", nodecl.}
else:
{.pragma: intrinsics, header:"<x86intrin.h>", nodecl.}
func addcarry_u32(carryIn: Carry, a, b: uint32, sum: var uint32): Carry {.importc: "_addcarry_u32", intrinsics.}
func subborrow_u32(borrowIn: Borrow, a, b: uint32, diff: var uint32): Borrow {.importc: "_subborrow_u32", intrinsics.}
func addcarry_u64(carryIn: Carry, a, b: uint64, sum: var uint64): Carry {.importc: "_addcarry_u64", intrinsics.}
func subborrow_u64(borrowIn: Borrow, a, b:uint64, diff: var uint64): Borrow {.importc: "_subborrow_u64", intrinsics.}
# ############################################################
#
# Public
#
# ############################################################
func addC*(cOut: var Carry, sum: var uint32, a, b: uint32, cIn: Carry) {.inline.} =
## Addition with carry
## (CarryOut, Sum) <- a + b + CarryIn
when X86:
cOut = addcarry_u32(cIn, a, b, sum)
else:
let dblPrec = uint64(cIn) + uint64(a) + uint64(b)
sum = (uint32)(dblPrec)
cOut = Carry(dblPrec shr 32)
func subB*(bOut: var Borrow, diff: var uint32, a, b: uint32, bIn: Borrow) {.inline.} =
## Subtraction with borrow
## (BorrowOut, Diff) <- a - b - borrowIn
when X86:
bOut = subborrow_u32(bIn, a, b, diff)
else:
let dblPrec = uint64(a) - uint64(b) - uint64(bIn)
diff = (uint32)(dblPrec)
# On borrow the high word will be 0b1111...1111 and needs to be masked
bOut = Borrow((dblPrec shr 32) and 1)
func addC*(cOut: var Carry, sum: var uint64, a, b: uint64, cIn: Carry) {.inline.} =
## Addition with carry
## (CarryOut, Sum) <- a + b + CarryIn
when X86:
cOut = addcarry_u64(cIn, a, b, sum)
else:
block:
static:
doAssert GCC_Compatible
doAssert sizeof(int) == 8
var dblPrec {.noInit.}: uint128
{.emit:[dblPrec, " = (unsigned __int128)", a," + (unsigned __int128)", b, " + (unsigned __int128)",cIn,";"].}
# Don't forget to dereference the var param in C mode
when defined(cpp):
{.emit:[cOut, " = (NU64)(", dblPrec," >> ", 64'u64, ");"].}
{.emit:[sum, " = (NU64)", dblPrec,";"].}
else:
{.emit:["*",cOut, " = (NU64)(", dblPrec," >> ", 64'u64, ");"].}
{.emit:["*",sum, " = (NU64)", dblPrec,";"].}
func subB*(bOut: var Borrow, diff: var uint64, a, b: uint64, bIn: Borrow) {.inline.} =
## Subtraction with borrow
## (BorrowOut, Diff) <- a - b - borrowIn
when X86:
bOut = subborrow_u64(bIn, a, b, diff)
else:
block:
static:
doAssert GCC_Compatible
doAssert sizeof(int) == 8
var dblPrec {.noInit.}: uint128
{.emit:[dblPrec, " = (unsigned __int128)", a," - (unsigned __int128)", b, " - (unsigned __int128)",bIn,";"].}
# Don't forget to dereference the var param in C mode
# On borrow the high word will be 0b1111...1111 and needs to be masked
when defined(cpp):
{.emit:[bOut, " = (NU64)(", dblPrec," >> ", 64'u64, ") & 1;"].}
{.emit:[diff, " = (NU64)", dblPrec,";"].}
else:
{.emit:["*",bOut, " = (NU64)(", dblPrec," >> ", 64'u64, ") & 1;"].}
{.emit:["*",diff, " = (NU64)", dblPrec,";"].}


@@ -0,0 +1,130 @@
# Stint
# Copyright 2018 Status Research & Development GmbH
# Licensed under either of
#
# * Apache License, version 2.0, ([LICENSE-APACHE](LICENSE-APACHE) or http://www.apache.org/licenses/LICENSE-2.0)
# * MIT license ([LICENSE-MIT](LICENSE-MIT) or http://opensource.org/licenses/MIT)
#
# at your option. This file may not be copied, modified, or distributed except according to those terms.
# ############################################################
#
# Extended precision primitives
#
# ############################################################
import
../datatypes,
./addcarry_subborrow
# ############################################################
#
# 32-bit words
#
# ############################################################
func div2n1n*(q, r: var uint32, n_hi, n_lo, d: uint32) {.inline.}=
## Division uint64 by uint32
## Warning ⚠️ :
## - if n_hi == d, quotient does not fit in an uint32
## - if n_hi > d result is undefined
##
## To avoid issues, n_hi, n_lo and d should be normalized,
## i.e. shifted (== multiplied by the same power of 2)
## so that the most significant bit of d is set.
let dividend = (uint64(n_hi) shl 32) or uint64(n_lo)
let divisor = uint64(d)
q = uint32(dividend div divisor)
r = uint32(dividend mod divisor)
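A sketch of the normalization mentioned in the warning above (the helper name is hypothetical; it assumes d != 0 and n_hi < d so the quotient fits):

```nim
import std/bitops  # for countLeadingZeroBits

# Sketch: shift divisor and dividend left so the divisor's MSB is set,
# call div2n1n, then shift the remainder back.
func divmod64by32(q, r: var uint32, nHi, nLo, d: uint32) =
  let shift = countLeadingZeroBits(d)
  if shift == 0:
    div2n1n(q, r, nHi, nLo, d)
  else:
    let dNorm = d shl shift
    let nHiNorm = (nHi shl shift) or (nLo shr (32 - shift))
    let nLoNorm = nLo shl shift
    div2n1n(q, r, nHiNorm, nLoNorm, dNorm)
    r = r shr shift   # undo the normalization on the remainder
```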
func mul*(hi, lo: var uint32, a, b: uint32) {.inline.} =
## Extended precision multiplication
## (hi, lo) <- a*b
let dblPrec = uint64(a) * uint64(b)
lo = uint32(dblPrec)
hi = uint32(dblPrec shr 32)
func muladd1*(hi, lo: var uint32, a, b, c: uint32) {.inline.} =
## Extended precision multiplication + addition
## (hi, lo) <- a*b + c
##
## Note: 0xFFFFFFFF² -> (hi: 0xFFFFFFFE, lo: 0x00000001)
## so adding any c cannot overflow
let dblPrec = uint64(a) * uint64(b) + uint64(c)
lo = uint32(dblPrec)
hi = uint32(dblPrec shr 32)
func muladd2*(hi, lo: var uint32, a, b, c1, c2: uint32) {.inline.}=
## Extended precision multiplication + addition + addition
## This is constant-time on most hardware, except some specific ones like the Cortex-M0
## (hi, lo) <- a*b + c1 + c2
##
## Note: 0xFFFFFFFF² -> (hi: 0xFFFFFFFE, lo: 0x00000001)
## so adding 0xFFFFFFFF leads to (hi: 0xFFFFFFFF, lo: 0x00000000)
## and we have enough space to add again 0xFFFFFFFF without overflowing
let dblPrec = uint64(a) * uint64(b) + uint64(c1) + uint64(c2)
lo = uint32(dblPrec)
hi = uint32(dblPrec shr 32)
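For reference, the headroom claim above spelled out: (2^32 - 1)^2 = 2^64 - 2^33 + 1, and the distance to the 64-bit maximum is (2^64 - 1) - (2^64 - 2^33 + 1) = 2^33 - 2 = 2*(2^32 - 1), i.e. exactly enough room to absorb two more additions of at most 2^32 - 1 each, so muladd2 cannot overflow its (hi, lo) result.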
# ############################################################
#
# 64-bit words
#
# ############################################################
when sizeof(int) == 8:
when defined(vcc):
from ./extended_precision_x86_64_msvc import div2n1n, mul, muladd1, muladd2
elif GCC_Compatible:
when X86:
from ./extended_precision_x86_64_gcc import div2n1n
from ./extended_precision_64bit_uint128 import mul, muladd1, muladd2
else:
from ./extended_precision_64bit_uint128 import div2n1n, mul, muladd1, muladd2
export div2n1n, mul, muladd1, muladd2
# ############################################################
#
# Composite primitives
#
# ############################################################
func mulDoubleAdd2*[T: uint32|uint64](r2: var Carry, r1, r0: var T, a, b, c: T, dHi: Carry, dLo: T) {.inline.} =
## (r2, r1, r0) <- 2*a*b + c + (dHi, dLo)
## with r = (r2, r1, r0) a triple-word number
## and d = (dHi, dLo) a double-word number
## r2 and dHi are carries, either 0 or 1
var carry: Carry
# (r1, r0) <- a*b
# Note: 0xFFFFFFFF_FFFFFFFF² -> (hi: 0xFFFFFFFF_FFFFFFFE, lo: 0x00000000_00000001)
mul(r1, r0, a, b)
# (r2, r1, r0) <- 2*a*b
# Then (hi: 0xFFFFFFFF_FFFFFFFE, lo: 0x00000000_00000001) * 2
# (carry: 1, hi: 0xFFFFFFFF_FFFFFFFC, lo: 0x00000000_00000002)
addC(carry, r0, r0, r0, Carry(0))
addC(r2, r1, r1, r1, carry)
# (r1, r0) <- (r1, r0) + c
# Adding any uint64 cannot overflow into r2. For example, adding 2^64-1 gives
# (carry: 1, hi: 0xFFFFFFFF_FFFFFFFD, lo: 0x00000000_00000001)
addC(carry, r0, r0, c, Carry(0))
addC(carry, r1, r1, T(0), carry)
# (r1, r0) <- (r1, r0) + (dHi, dLo) with dHi a carry (previous limb r2)
# (dHi, dLo) is at most (dhi: 1, dlo: 0xFFFFFFFF_FFFFFFFF)
# summing into (carry: 1, hi: 0xFFFFFFFF_FFFFFFFD, lo: 0x00000000_00000001)
# result at most in (carry: 1, hi: 0xFFFFFFFF_FFFFFFFF, lo: 0x00000000_00000000)
addC(carry, r0, r0, dLo, Carry(0))
addC(carry, r1, r1, T(dHi), carry)
func mulAcc*[T: uint32|uint64](t, u, v: var T, a, b: T) {.inline.} =
## (t, u, v) <- (t, u, v) + a * b
var UV: array[2, T]
var carry: Carry
mul(UV[1], UV[0], a, b)
addC(carry, v, v, UV[0], Carry(0))
addC(carry, u, u, UV[1], carry)
t += T(carry)
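As a usage sketch (hypothetical, not part of this commit), mulAcc is the building block of product-scanning (Comba) multiplication, where (t, u, v) acts as a sliding triple-word accumulator:

```nim
# Sketch: product-scanning multiplication of two 4-limb numbers with mulAcc.
# r receives the full 8-limb product.
func mul4x4(r: var array[8, uint64], a, b: array[4, uint64]) =
  var t, u, v = 0'u64                  # triple-word accumulator (t:u:v)
  for k in 0 ..< 7:                    # columns 0..6 of the schoolbook product
    for i in max(0, k - 3) .. min(k, 3):
      mulAcc(t, u, v, a[i], b[k - i])  # (t,u,v) += a[i] * b[k-i]
    r[k] = v                           # emit the finished column
    v = u; u = t; t = 0                # slide the accumulator one word right
  r[7] = v                             # remaining top word
```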


@@ -0,0 +1,95 @@
# Stint
# Copyright 2018 Status Research & Development GmbH
# Licensed under either of
#
# * Apache License, version 2.0, ([LICENSE-APACHE](LICENSE-APACHE) or http://www.apache.org/licenses/LICENSE-2.0)
# * MIT license ([LICENSE-MIT](LICENSE-MIT) or http://opensource.org/licenses/MIT)
#
# at your option. This file may not be copied, modified, or distributed except according to those terms.
import ../datatypes
# ############################################################
#
# Extended precision primitives on GCC & Clang (all CPU archs)
#
# ############################################################
static:
doAssert GCC_Compatible
doAssert sizeof(int) == 8
func div2n1n*(q, r: var uint64, n_hi, n_lo, d: uint64) {.inline.}=
## Division uint128 by uint64
## Warning ⚠️ :
## - if n_hi == d, quotient does not fit in an uint64 and will throw SIGFPE on some platforms
## - if n_hi > d result is undefined
var dblPrec {.noInit.}: uint128
{.emit:[dblPrec, " = (unsigned __int128)", n_hi," << 64 | (unsigned __int128)",n_lo,";"].}
# Don't forget to dereference the var param in C mode
when defined(cpp):
{.emit:[q, " = (NU64)(", dblPrec," / ", d, ");"].}
{.emit:[r, " = (NU64)(", dblPrec," % ", d, ");"].}
else:
{.emit:["*",q, " = (NU64)(", dblPrec," / ", d, ");"].}
{.emit:["*",r, " = (NU64)(", dblPrec," % ", d, ");"].}
func mul*(hi, lo: var uint64, a, b: uint64) {.inline.} =
## Extended precision multiplication
## (hi, lo) <- a*b
block:
var dblPrec {.noInit.}: uint128
{.emit:[dblPrec, " = (unsigned __int128)", a," * (unsigned __int128)", b,";"].}
# Don't forget to dereference the var param in C mode
when defined(cpp):
{.emit:[hi, " = (NU64)(", dblPrec," >> ", 64'u64, ");"].}
{.emit:[lo, " = (NU64)", dblPrec,";"].}
else:
{.emit:["*",hi, " = (NU64)(", dblPrec," >> ", 64'u64, ");"].}
{.emit:["*",lo, " = (NU64)", dblPrec,";"].}
func muladd1*(hi, lo: var uint64, a, b, c: uint64) {.inline.} =
## Extended precision multiplication + addition
## (hi, lo) <- a*b + c
##
## Note: 0xFFFFFFFF_FFFFFFFF² -> (hi: 0xFFFFFFFFFFFFFFFE, lo: 0x0000000000000001)
## so adding any c cannot overflow
##
## This is constant-time on most hardware
## See: https://www.bearssl.org/ctmul.html
block:
var dblPrec {.noInit.}: uint128
{.emit:[dblPrec, " = (unsigned __int128)", a," * (unsigned __int128)", b, " + (unsigned __int128)",c,";"].}
# Don't forget to dereference the var param in C mode
when defined(cpp):
{.emit:[hi, " = (NU64)(", dblPrec," >> ", 64'u64, ");"].}
{.emit:[lo, " = (NU64)", dblPrec,";"].}
else:
{.emit:["*",hi, " = (NU64)(", dblPrec," >> ", 64'u64, ");"].}
{.emit:["*",lo, " = (NU64)", dblPrec,";"].}
func muladd2*(hi, lo: var uint64, a, b, c1, c2: uint64) {.inline.}=
## Extended precision multiplication + addition + addition
## This is constant-time on most hardware, except some specific ones like the Cortex-M0
## (hi, lo) <- a*b + c1 + c2
##
## Note: 0xFFFFFFFF_FFFFFFFF² -> (hi: 0xFFFFFFFFFFFFFFFE, lo: 0x0000000000000001)
## so adding 0xFFFFFFFFFFFFFFFF leads to (hi: 0xFFFFFFFFFFFFFFFF, lo: 0x0000000000000000)
## and we have enough space to add again 0xFFFFFFFFFFFFFFFF without overflowing
block:
var dblPrec {.noInit.}: uint128
{.emit:[
dblPrec, " = (unsigned __int128)", a," * (unsigned __int128)", b,
" + (unsigned __int128)",c1," + (unsigned __int128)",c2,";"
].}
# Don't forget to dereference the var param in C mode
when defined(cpp):
{.emit:[hi, " = (NU64)(", dblPrec," >> ", 64'u64, ");"].}
{.emit:[lo, " = (NU64)", dblPrec,";"].}
else:
{.emit:["*",hi, " = (NU64)(", dblPrec," >> ", 64'u64, ");"].}
{.emit:["*",lo, " = (NU64)", dblPrec,";"].}


@@ -0,0 +1,57 @@
# Stint
# Copyright 2018 Status Research & Development GmbH
# Licensed under either of
#
# * Apache License, version 2.0, ([LICENSE-APACHE](LICENSE-APACHE) or http://www.apache.org/licenses/LICENSE-2.0)
# * MIT license ([LICENSE-MIT](LICENSE-MIT) or http://opensource.org/licenses/MIT)
#
# at your option. This file may not be copied, modified, or distributed except according to those terms.
import ../datatypes
# ############################################################
#
# Extended precision primitives for X86-64 on GCC & Clang
#
# ############################################################
static:
doAssert(defined(gcc) or defined(clang) or defined(llvm_gcc))
doAssert sizeof(int) == 8
doAssert X86
func div2n1n*(q, r: var uint64, n_hi, n_lo, d: uint64) {.inline.}=
## Division uint128 by uint64
## Warning ⚠️ :
## - if n_hi == d, quotient does not fit in an uint64 and will throw SIGFPE
## - if n_hi > d result is undefined
# DIV r/m64
# Divide RDX:RAX (n_hi:n_lo) by r/m64
#
# Inputs
# - numerator high word in RDX,
# - numerator low word in RAX,
# - divisor as r/m parameter (register or memory at the compiler's discretion)
# Result
# - Quotient in RAX
# - Remainder in RDX
# 1. name the register/memory "divisor"
# 2. don't forget to dereference the var hidden pointer
# 3. -
# 4. no clobbered registers besides the explicitly used RAX and RDX
when defined(cpp):
asm """
divq %[divisor]
: "=a" (`q`), "=d" (`r`)
: "d" (`n_hi`), "a" (`n_lo`), [divisor] "rm" (`d`)
:
"""
else:
asm """
divq %[divisor]
: "=a" (`*q`), "=d" (`*r`)
: "d" (`n_hi`), "a" (`n_lo`), [divisor] "rm" (`d`)
:
"""


@@ -0,0 +1,87 @@
# Stint
# Copyright 2018 Status Research & Development GmbH
# Licensed under either of
#
# * Apache License, version 2.0, ([LICENSE-APACHE](LICENSE-APACHE) or http://www.apache.org/licenses/LICENSE-2.0)
# * MIT license ([LICENSE-MIT](LICENSE-MIT) or http://opensource.org/licenses/MIT)
#
# at your option. This file may not be copied, modified, or distributed except according to those terms.
import
../datatypes,
./addcarry_subborrow
# ############################################################
#
# Extended precision primitives for X86-64 on MSVC
#
# ############################################################
static:
doAssert defined(vcc)
doAssert sizeof(int) == 8
doAssert X86
func udiv128(highDividend, lowDividend, divisor: Ct[uint64], remainder: var Ct[uint64]): Ct[uint64] {.importc:"_udiv128", header: "<intrin.h>", nodecl.}
## Division of a 128-bit number by a 64-bit divisor; Microsoft only, 64-bit only.
## Returns the quotient as the return value and the remainder via the var parameter.
## Warning ⚠️ :
## - if n_hi == d, quotient does not fit in an uint64 and will throw SIGFPE
## - if n_hi > d result is undefined
func umul128(a, b: Ct[uint64], hi: var Ct[uint64]): Ct[uint64] {.importc:"_umul128", header:"<intrin.h>", nodecl.}
## (hi, lo) <-- a * b
## Return value is the low word
func div2n1n*(q, r: var Ct[uint64], n_hi, n_lo, d: Ct[uint64]) {.inline.}=
## Division uint128 by uint64
## Warning ⚠️ :
## - if n_hi == d, quotient does not fit in an uint64 and will throw SIGFPE
## - if n_hi > d result is undefined
{.warning: "unsafeDiv2n1n is not constant-time at the moment on most hardware".}
# TODO !!! - Replace by constant-time, portable, non-assembly version
# -> use uint128? Compiler might add unwanted branches
q = udiv128(n_hi, n_lo, d, r)
func mul*(hi, lo: var Ct[uint64], a, b: Ct[uint64]) {.inline.} =
## Extended precision multiplication
## (hi, lo) <- a*b
##
## This is constant-time on most hardware
## See: https://www.bearssl.org/ctmul.html
lo = umul128(a, b, hi)
func muladd1*(hi, lo: var Ct[uint64], a, b, c: Ct[uint64]) {.inline.} =
## Extended precision multiplication + addition
## (hi, lo) <- a*b + c
##
## Note: 0xFFFFFFFF_FFFFFFFF² -> (hi: 0xFFFFFFFFFFFFFFFE, lo: 0x0000000000000001)
## so adding any c cannot overflow
##
## This is constant-time on most hardware
## See: https://www.bearssl.org/ctmul.html
var carry: Carry
lo = umul128(a, b, hi)
addC(carry, lo, lo, c, Carry(0))
addC(carry, hi, hi, 0, carry)
func muladd2*(hi, lo: var Ct[uint64], a, b, c1, c2: Ct[uint64]) {.inline.}=
## Extended precision multiplication + addition + addition
## This is constant-time on most hardware, except some specific ones like the Cortex-M0
## (hi, lo) <- a*b + c1 + c2
##
## Note: 0xFFFFFFFF_FFFFFFFF² -> (hi: 0xFFFFFFFFFFFFFFFE, lo: 0x0000000000000001)
## so adding 0xFFFFFFFFFFFFFFFF leads to (hi: 0xFFFFFFFFFFFFFFFF, lo: 0x0000000000000000)
## and we have enough space to add again 0xFFFFFFFFFFFFFFFF without overflowing
# For speed this could be implemented with parallel pipelined carry chains
# via MULX + ADCX + ADOX
var carry1, carry2: Carry
lo = umul128(a, b, hi)
# Carry chain 1
addC(carry1, lo, lo, c1, Carry(0))
addC(carry1, hi, hi, 0, carry1)
# Carry chain 2
addC(carry2, lo, lo, c2, Carry(0))
addC(carry2, hi, hi, 0, carry2)