Implement types and uint128 primitives
This commit is contained in:
parent c0ae9e10a9
commit de87739635

@@ -1,59 +0,0 @@
# Stint
# Copyright 2018 Status Research & Development GmbH
# Licensed under either of
#
# * Apache License, version 2.0, ([LICENSE-APACHE](LICENSE-APACHE) or http://www.apache.org/licenses/LICENSE-2.0)
# * MIT license ([LICENSE-MIT](LICENSE-MIT) or http://opensource.org/licenses/MIT)
#
# at your option. This file may not be copied, modified, or distributed except according to those terms.

import ./datatypes

func toSubtype*[T: SomeInteger](b: bool, _: typedesc[T]): T {.inline.} =
  b.T

func toSubtype*[T: UintImpl | IntImpl](b: bool, _: typedesc[T]): T {.inline.} =
  type SubTy = type result.lo
  result.lo = toSubtype(b, SubTy)

func toUint*(n: UintImpl or IntImpl or SomeSignedInt): auto {.inline.} =
  ## Casts an integer to an unsigned integer of the same size
  # TODO: uint128 support
  when n.sizeof > 8:
    {.fatal: "Unreachable. You are trying to cast a StUint with more than 64-bit of precision".}
  elif n.sizeof == 8:
    cast[uint64](n)
  elif n.sizeof == 4:
    cast[uint32](n)
  elif n.sizeof == 2:
    cast[uint16](n)
  else:
    cast[uint8](n)

func toUint*(n: SomeUnsignedInt): SomeUnsignedInt {.inline.} =
  ## No-op overload of multi-precision int casting
  n

func asDoubleUint*(n: UintImpl | SomeUnsignedInt): auto {.inline.} =
  ## Convert an integer or StUint to an uint with double the size
  type Double = (
    when n.sizeof == 4: uint64
    elif n.sizeof == 2: uint32
    else: uint16
  )

  n.toUint.Double

func toInt*(n: UintImpl or IntImpl or SomeInteger): auto {.inline.} =
  ## Casts an integer to a signed integer of the same size
  # TODO: int128 support
  when n.sizeof > 8:
    {.fatal: "Unreachable. You are trying to cast a StInt with more than 64-bit of precision".}
  elif n.sizeof == 8:
    cast[int64](n)
  elif n.sizeof == 4:
    cast[int32](n)
  elif n.sizeof == 2:
    cast[int16](n)
  else:
    cast[int8](n)

@@ -7,191 +7,56 @@
#
# at your option. This file may not be copied, modified, or distributed except according to those terms.

# TODO: test if GCC/Clang support uint128 natively

# #### Overview
#
# Stint extends the default uint8, uint16, uint32, uint64 with power-of-2 integers.
# The only limitation is your stack size, so you can have uint128, uint256, uint512 ...
# Signed ints are also possible.
#
# As a high-level API, Stint adheres to Nim and C conventions and uses the same operators, like:
# `+`, `xor`, `not` ...
#
# #### Implementation
#
# Stint types are stored on the stack and have a structure
# similar to a binary tree of power-of-two unsigned integers
# with "high" and "low" words:
#
#                            Stuint[256]
#              hi: Stuint[128]           lo: Stuint[128]
#       hihi: uint64   hilo: uint64   lohi: uint64   lolo: uint64
#
# This follows the paper https://hal.archives-ouvertes.fr/hal-00582593v2
# "Recursive double-size fixed precision arithmetic" from Jul. 2016,
# which implements an efficient fixed-precision bigint for embedded devices, especially FPGAs.
#
# For testing purposes, the flag `-d:stint_test` can be passed at compile-time
# to switch the backend to uint32.
# In the future the default backend will become uint128 on supporting compilers.
#
# This has the following benefits:
# - BigEndian/LittleEndian support is trivial.
# - Not having for loops helps the compiler produce the most efficient instructions,
#   like ADC (Add with Carry).
# - Proving that the recursive structure works at depth 64 for the uint32 backend means that
#   it would work at depth 128 for the uint64 backend.
#   We can easily choose a uint16 or uint8 backend as well.
# - Due to the recursive structure, testing operations when there is:
#   - no leaves (uint64)
#   - a root and leaves with no nodes (uint128)
#   - a root + intermediate nodes + leaves (uint256)
#   should be enough to ensure they work at all sizes, edge cases included.
# - Adding a new backend like uint128 (GCC/Clang) or uint256 (LLVM intrinsics only) is just adding
#   a new case in the `uintImpl` template.
# - All math implementations of the operations have a straightforward translation
#   to a high-low structure, including the fastest Karatsuba multiplication
#   and the co-recursive division algorithm by Burnikel and Ziegler.
#   This makes translating those algorithms into Nim easier compared to an array backend.
#   It would also probably require less code and would be much easier to audit versus
#   the math reference papers.
# - For implementation of algorithms, there is no issue taking subslices of the memory representation
#   with a recursive tree structure.
#   On the other hand, returning a `var array[N div 2, uint64]` is problematic at the moment.
# - Compile-time computation is possible, while due to the previous issue
#   an array backend would be required to use `var openArray[uint64]`,
#   i.e. pointers.
# - Note that while shift-right and shift-left can easily be done on an array of bytes,
#   this would have reduced performance compared to moving 64-bit words.
#   An efficient implementation on an array of words would require checking the shift
#   against a half-word to deal with carry-in/out from and to the adjacent words,
#   similar to a recursive implementation.
#
# Iteration over the whole integer, for example for `==`, is always unrolled.
# Due to being on the stack, any optimizing compiler should compile that to an efficient memcmp.
#
# When not to use Stint:
#
# 1. Constant-time arithmetic
#    - Do not use Stint if you need side-channel resistance;
#      this requires avoiding all branches (i.e. no booleans).
# 2. Arbitrary precision with lots of small values
#    - If you need arbitrary precision but most of the time you store small values,
#      you will waste a lot of memory unless you use an object variant of various Stint sizes.
#        type MyUint = object
#          case kind: int
#          of 0..64: uint64
#          of 65..128: ref Stuint[128]
#          of 129..256: ref Stuint[256]
#          ...
#
# Note: if you work with huge sizes, you can allocate stints on the heap with,
# for example, `type HeapInt8192 = ref StInt[8192]`.
# If you have a lot of computations and intermediate variables it's probably worthwhile
# to explore creating an object pool to reuse the memory buffers.

template checkDiv2(bits: static[int]): untyped =
  # TODO: There is no need to check if power of 2 at each uintImpl instantiation, it slows compilation.
  #       However we easily get into nested template instantiation if we use another
  #       template that first checks power-of-two and then calls the recursive uintImpl.
  static:
    doAssert (bits and (bits-1)) == 0, $bits & " is not a power of 2"
    doAssert bits >= 8, "The number of bits should be greater or equal to 8"
  bits div 2

when defined(mpint_test): # TODO stint_test
  template uintImpl*(bits: static[int]): untyped =
    # Test version, StUint[64] = 2 x uint32. Tests the logic of the library.

    when bits >= 128: UintImpl[uintImpl(checkDiv2(bits))]
    elif bits == 64: UintImpl[uint32]
    elif bits == 32: UintImpl[uint16]
    elif bits == 16: UintImpl[uint8]
    else: {.fatal: "Only power-of-2 >=16 supported when testing".}

  template intImpl*(bits: static[int]): untyped =
    # Test version, StInt[64] = 2 x uint32. Tests the logic of the library.
    # int is implemented using a signed hi part and an unsigned lo part, given
    # that the sign resides in hi.

    when bits >= 128: IntImpl[intImpl(checkDiv2(bits)), uintImpl(checkDiv2(bits))]
    elif bits == 64: IntImpl[int32, uint32]
    elif bits == 32: IntImpl[int16, uint16]
    elif bits == 16: IntImpl[int8, uint8]
    else: {.fatal: "Only power-of-2 >=16 supported when testing".}

else:
  template uintImpl*(bits: static[int]): untyped =
    mixin UintImpl
    when bits >= 128: UintImpl[uintImpl(checkDiv2(bits))]
    elif bits == 64: uint64
    elif bits == 32: uint32
    elif bits == 16: uint16
    elif bits == 8: uint8
    else: {.fatal: "Only power-of-2 >=8 supported".}

  template intImpl*(bits: static[int]): untyped =
    # int is implemented using a signed hi part and an unsigned lo part, given
    # that the sign resides in hi.
    when bits >= 128: IntImpl[intImpl(checkDiv2(bits)), uintImpl(checkDiv2(bits))]
    elif bits == 64: int64
    elif bits == 32: int32
    elif bits == 16: int16
    elif bits == 8: int8
    else: {.fatal: "Only power-of-2 >=8 supported".}

import
  # Status lib
  stew/bitops2

when sizeof(int) == 8 and not defined(Stint32):
  type Word* = uint64
else:
  type Word* = uint32
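
# A minimal illustration (hypothetical snippet, not part of the diff) of the
# recursive layout the templates above produce: each level doubles the size,
# so two levels above uint64 cover 256 bits.
when isMainModule:
  type DemoImpl[BaseUint] = object
    lo, hi: BaseUint
  static:
    doAssert sizeof(DemoImpl[uint64]) == 16            # same layout as Stuint[128]
    doAssert sizeof(DemoImpl[DemoImpl[uint64]]) == 32  # same layout as Stuint[256]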

func wordsRequired*(bits: int): int {.compileTime.} =
  ## Compute the number of limbs required
  ## from the **announced** bit length
  (bits + WordBitWidth - 1) div WordBitWidth
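
# Quick sanity check (a sketch; assumes WordBitWidth == 64, which is not shown
# in this hunk): 256 announced bits need 4 limbs, and 65 bits already need 2.
when isMainModule:
  static:
    doAssert wordsRequired(256) == 4
    doAssert wordsRequired(65) == 2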

type
  # ### Private ### #
  UintImpl*[BaseUint] = object
    when system.cpuEndian == littleEndian:
      lo*, hi*: BaseUint
    else:
      hi*, lo*: BaseUint

  IntImpl*[BaseInt, BaseUint] = object
    # Ints are implemented in terms of uints
    when system.cpuEndian == littleEndian:
      lo*: BaseUint
      hi*: BaseInt
    else:
      hi*: BaseInt
      lo*: BaseUint
  # ### Private ### #

  Limbs*[N: static int] = array[N, Word]

  StUint*[bits: static[int]] = object
    ## Stack-based integer
    ## Unsigned
    data*: uintImpl(bits)
    limbs*: Limbs[bits.wordsRequired]

  StInt*[bits: static[int]] = object
    ## Stack-based integer
    ## Signed
    data*: intImpl(bits)
    limbs*: Limbs[bits.wordsRequired]

  Carry* = uint8  # distinct range[0'u8 .. 1]
  Borrow* = uint8 # distinct range[0'u8 .. 1]

template applyHiLo*(a: UintImpl | IntImpl, c: untyped): untyped =
  ## Apply `c` to each of `hi` and `lo`
  var res: type a
  res.hi = c(a.hi)
  res.lo = c(a.lo)
  res

template applyHiLo*(a, b: UintImpl | IntImpl, c: untyped): untyped =
  ## Apply `c` to each of `hi` and `lo`
  var res: type a
  res.hi = c(a.hi, b.hi)
  res.lo = c(a.lo, b.lo)
  res
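
# Usage sketch (illustrative, not in the original file): `applyHiLo` lifts a
# word-level operation onto both halves, e.g. a bitwise complement:
when isMainModule:
  func demoNot(x: UintImpl[uint64]): UintImpl[uint64] {.inline.} =
    applyHiLo(x, `not`)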

const GCC_Compatible* = defined(gcc) or defined(clang) or defined(llvm_gcc)
const X86* = defined(amd64) or defined(i386)

when sizeof(int) == 8 and GCC_Compatible:
  type
    uint128*{.importc: "unsigned __int128".} = object

template leastSignificantWord*(num: SomeInteger): auto =
  num

func leastSignificantWord*(num: UintImpl | IntImpl): auto {.inline.} =
  when num.lo is UintImpl:
    num.lo.leastSignificantWord
  else:
    num.lo

func leastSignificantWord*(limbs: Limbs): auto {.inline.} =
  when cpuEndian == littleEndian:
    limbs[0]
  else:
    limbs[^1]

func mostSignificantWord*(num: UintImpl | IntImpl): auto {.inline.} =
  when num.hi is (UintImpl | IntImpl):
    num.hi.mostSignificantWord
  else:
    num.hi

func mostSignificantWord*(limbs: Limbs): auto {.inline.} =
  when cpuEndian == littleEndian:
    limbs[^1]
  else:
    limbs[0]

@@ -0,0 +1,169 @@
# Stint
# Copyright 2018 Status Research & Development GmbH
# Licensed under either of
#
# * Apache License, version 2.0, ([LICENSE-APACHE](LICENSE-APACHE) or http://www.apache.org/licenses/LICENSE-2.0)
# * MIT license ([LICENSE-MIT](LICENSE-MIT) or http://opensource.org/licenses/MIT)
#
# at your option. This file may not be copied, modified, or distributed except according to those terms.

import ../datatypes

# ############################################################
#
#          Add-with-carry and Sub-with-borrow
#
# ############################################################
#
# This file implements add-with-carry and sub-with-borrow.
#
# It is currently (Mar 2020) impossible to have the compiler
# generate optimal code in a generic way.
#
# On x86, the addcarry_u64 intrinsic will generate optimal code
# except on GCC.
#
# On other CPU architectures inline assembly might be desirable.
# A compiler proof-of-concept is available in the "research" folder.
#
# See https://gcc.godbolt.org/z/2h768y
# ```C
# #include <stdint.h>
# #include <x86intrin.h>
#
# void add256(uint64_t a[4], uint64_t b[4]){
#   uint8_t carry = 0;
#   for (int i = 0; i < 4; ++i)
#     carry = _addcarry_u64(carry, a[i], b[i], &a[i]);
# }
# ```
#
# GCC
# ```asm
# add256:
#         movq    (%rsi), %rax
#         addq    (%rdi), %rax
#         setc    %dl
#         movq    %rax, (%rdi)
#         movq    8(%rdi), %rax
#         addb    $-1, %dl
#         adcq    8(%rsi), %rax
#         setc    %dl
#         movq    %rax, 8(%rdi)
#         movq    16(%rdi), %rax
#         addb    $-1, %dl
#         adcq    16(%rsi), %rax
#         setc    %dl
#         movq    %rax, 16(%rdi)
#         movq    24(%rsi), %rax
#         addb    $-1, %dl
#         adcq    %rax, 24(%rdi)
#         ret
# ```
#
# Clang
# ```asm
# add256:
#         movq    (%rsi), %rax
#         addq    %rax, (%rdi)
#         movq    8(%rsi), %rax
#         adcq    %rax, 8(%rdi)
#         movq    16(%rsi), %rax
#         adcq    %rax, 16(%rdi)
#         movq    24(%rsi), %rax
#         adcq    %rax, 24(%rdi)
#         retq
# ```

# ############################################################
#
#                     Intrinsics
#
# ############################################################

# Note: GCC before 2017 had incorrect codegen in some cases:
# - https://gcc.gnu.org/bugzilla/show_bug.cgi?id=81300

when X86:
  when defined(windows):
    {.pragma: intrinsics, header:"<intrin.h>", nodecl.}
  else:
    {.pragma: intrinsics, header:"<x86intrin.h>", nodecl.}

  func addcarry_u32(carryIn: Carry, a, b: uint32, sum: var uint32): Carry {.importc: "_addcarry_u32", intrinsics.}
  func subborrow_u32(borrowIn: Borrow, a, b: uint32, diff: var uint32): Borrow {.importc: "_subborrow_u32", intrinsics.}

  func addcarry_u64(carryIn: Carry, a, b: uint64, sum: var uint64): Carry {.importc: "_addcarry_u64", intrinsics.}
  func subborrow_u64(borrowIn: Borrow, a, b: uint64, diff: var uint64): Borrow {.importc: "_subborrow_u64", intrinsics.}

# ############################################################
#
#                       Public
#
# ############################################################

func addC*(cOut: var Carry, sum: var uint32, a, b: uint32, cIn: Carry) {.inline.} =
  ## Addition with carry
  ## (CarryOut, Sum) <- a + b + CarryIn
  when X86:
    cOut = addcarry_u32(cIn, a, b, sum)
  else:
    let dblPrec = uint64(cIn) + uint64(a) + uint64(b)
    sum = uint32(dblPrec)
    cOut = Carry(dblPrec shr 32)

func subB*(bOut: var Borrow, diff: var uint32, a, b: uint32, bIn: Borrow) {.inline.} =
  ## Subtraction with borrow
  ## (BorrowOut, Diff) <- a - b - borrowIn
  when X86:
    bOut = subborrow_u32(bIn, a, b, diff)
  else:
    let dblPrec = uint64(a) - uint64(b) - uint64(bIn)
    diff = uint32(dblPrec)
    # On borrow the high word will be 0b1111...1111 and needs to be masked
    bOut = Borrow((dblPrec shr 32) and 1)

func addC*(cOut: var Carry, sum: var uint64, a, b: uint64, cIn: Carry) {.inline.} =
  ## Addition with carry
  ## (CarryOut, Sum) <- a + b + CarryIn
  when X86:
    cOut = addcarry_u64(cIn, a, b, sum)
  else:
    block:
      static:
        doAssert GCC_Compatible
        doAssert sizeof(int) == 8

      var dblPrec {.noInit.}: uint128
      {.emit:[dblPrec, " = (unsigned __int128)", a," + (unsigned __int128)", b, " + (unsigned __int128)",cIn,";"].}

      # Don't forget to dereference the var param in C mode
      when defined(cpp):
        {.emit:[cOut, " = (NU64)(", dblPrec," >> ", 64'u64, ");"].}
        {.emit:[sum, " = (NU64)", dblPrec,";"].}
      else:
        {.emit:["*",cOut, " = (NU64)(", dblPrec," >> ", 64'u64, ");"].}
        {.emit:["*",sum, " = (NU64)", dblPrec,";"].}

func subB*(bOut: var Borrow, diff: var uint64, a, b: uint64, bIn: Borrow) {.inline.} =
  ## Subtraction with borrow
  ## (BorrowOut, Diff) <- a - b - borrowIn
  when X86:
    bOut = subborrow_u64(bIn, a, b, diff)
  else:
    block:
      static:
        doAssert GCC_Compatible
        doAssert sizeof(int) == 8

      var dblPrec {.noInit.}: uint128
      {.emit:[dblPrec, " = (unsigned __int128)", a," - (unsigned __int128)", b, " - (unsigned __int128)",bIn,";"].}

      # Don't forget to dereference the var param in C mode
      # On borrow the high word will be 0b1111...1111 and needs to be masked
      when defined(cpp):
        {.emit:[bOut, " = (NU64)(", dblPrec," >> ", 64'u64, ") & 1;"].}
        {.emit:[diff, " = (NU64)", dblPrec,";"].}
      else:
        {.emit:["*",bOut, " = (NU64)(", dblPrec," >> ", 64'u64, ") & 1;"].}
        {.emit:["*",diff, " = (NU64)", dblPrec,";"].}

@@ -0,0 +1,130 @@
# Stint
# Copyright 2018 Status Research & Development GmbH
# Licensed under either of
#
# * Apache License, version 2.0, ([LICENSE-APACHE](LICENSE-APACHE) or http://www.apache.org/licenses/LICENSE-2.0)
# * MIT license ([LICENSE-MIT](LICENSE-MIT) or http://opensource.org/licenses/MIT)
#
# at your option. This file may not be copied, modified, or distributed except according to those terms.

# ############################################################
#
#               Extended precision primitives
#
# ############################################################

import
  ../datatypes,
  ./addcarry_subborrow

# ############################################################
#
#                     32-bit words
#
# ############################################################

func div2n1n*(q, r: var uint32, n_hi, n_lo, d: uint32) {.inline.} =
  ## Division uint64 by uint32
  ## Warning ⚠️ :
  ##   - if n_hi == d, the quotient does not fit in an uint32
  ##   - if n_hi > d, the result is undefined
  ##
  ## To avoid issues, n_hi, n_lo, d should be normalized,
  ## i.e. shifted (== multiplied by the same power of 2)
  ## so that the most significant bit in d is set.
  let dividend = (uint64(n_hi) shl 32) or uint64(n_lo)
  let divisor = uint64(d)
  q = uint32(dividend div divisor)
  r = uint32(dividend mod divisor)
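
# Usage sketch (illustrative, not in the original file): dividing
# 0x00000003_00000001 by the already-normalized divisor 0x80000001
# (its most significant bit is set):
when isMainModule:
  var q, r: uint32
  div2n1n(q, r, 0x00000003'u32, 0x00000001'u32, 0x80000001'u32)
  doAssert q == 5 and r == 0x7FFFFFFC'u32  # 5 * 0x80000001 + r == 0x3_00000001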

func mul*(hi, lo: var uint32, a, b: uint32) {.inline.} =
  ## Extended precision multiplication
  ## (hi, lo) <- a*b
  let dblPrec = uint64(a) * uint64(b)
  lo = uint32(dblPrec)
  hi = uint32(dblPrec shr 32)

func muladd1*(hi, lo: var uint32, a, b, c: uint32) {.inline.} =
  ## Extended precision multiplication + addition
  ## (hi, lo) <- a*b + c
  ##
  ## Note: 0xFFFFFFFF² -> (hi: 0xFFFFFFFE, lo: 0x00000001)
  ##       so adding any c cannot overflow
  let dblPrec = uint64(a) * uint64(b) + uint64(c)
  lo = uint32(dblPrec)
  hi = uint32(dblPrec shr 32)

func muladd2*(hi, lo: var uint32, a, b, c1, c2: uint32) {.inline.} =
  ## Extended precision multiplication + addition + addition
  ## This is constant-time on most hardware except some specific ones like the Cortex M0
  ## (hi, lo) <- a*b + c1 + c2
  ##
  ## Note: 0xFFFFFFFF² -> (hi: 0xFFFFFFFE, lo: 0x00000001)
  ##       so adding 0xFFFFFFFF leads to (hi: 0xFFFFFFFF, lo: 0x00000000)
  ##       and we have enough space to add again 0xFFFFFFFF without overflowing
  let dblPrec = uint64(a) * uint64(b) + uint64(c1) + uint64(c2)
  lo = uint32(dblPrec)
  hi = uint32(dblPrec shr 32)
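
# Boundary check for the doc comment above (illustrative, not in the original
# file): the worst case fills the double word exactly and does not overflow.
when isMainModule:
  var hi, lo: uint32
  muladd2(hi, lo, 0xFFFFFFFF'u32, 0xFFFFFFFF'u32, 0xFFFFFFFF'u32, 0xFFFFFFFF'u32)
  doAssert hi == 0xFFFFFFFF'u32 and lo == 0xFFFFFFFF'u32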

# ############################################################
#
#                     64-bit words
#
# ############################################################

when sizeof(int) == 8:
  when defined(vcc):
    from ./extended_precision_x86_64_msvc import div2n1n, mul, muladd1, muladd2
  elif GCC_Compatible:
    when X86:
      from ./extended_precision_x86_64_gcc import div2n1n
      from ./extended_precision_64bit_uint128 import mul, muladd1, muladd2
    else:
      from ./extended_precision_64bit_uint128 import div2n1n, mul, muladd1, muladd2
  export div2n1n, mul, muladd1, muladd2

# ############################################################
#
#                  Composite primitives
#
# ############################################################

func mulDoubleAdd2*[T: uint32|uint64](r2: var Carry, r1, r0: var T, a, b, c: T, dHi: Carry, dLo: T) {.inline.} =
  ## (r2, r1, r0) <- 2*a*b + c + (dHi, dLo)
  ## with r = (r2, r1, r0) a triple-word number
  ## and d = (dHi, dLo) a double-word number
  ## r2 and dHi are carries, either 0 or 1

  var carry: Carry

  # (r1, r0) <- a*b
  # Note: 0xFFFFFFFF_FFFFFFFF² -> (hi: 0xFFFFFFFF_FFFFFFFE, lo: 0x00000000_00000001)
  mul(r1, r0, a, b)

  # (r2, r1, r0) <- 2*a*b
  # Doubling (hi: 0xFFFFFFFF_FFFFFFFE, lo: 0x00000000_00000001) gives
  # (carry: 1, hi: 0xFFFFFFFF_FFFFFFFC, lo: 0x00000000_00000002)
  addC(carry, r0, r0, r0, Carry(0))
  addC(r2, r1, r1, r1, carry)

  # (r1, r0) <- (r1, r0) + c
  # Adding any uint64 cannot overflow into r2; for example adding 2^64-1 gives
  # (carry: 1, hi: 0xFFFFFFFF_FFFFFFFD, lo: 0x00000000_00000001)
  addC(carry, r0, r0, c, Carry(0))
  addC(carry, r1, r1, T(0), carry)

  # (r1, r0) <- (r1, r0) + (dHi, dLo) with dHi a carry (previous limb r2)
  # (dHi, dLo) is at most (dHi: 1, dLo: 0xFFFFFFFF_FFFFFFFF);
  # summing into (carry: 1, hi: 0xFFFFFFFF_FFFFFFFD, lo: 0x00000000_00000001)
  # results in at most (carry: 1, hi: 0xFFFFFFFF_FFFFFFFF, lo: 0x00000000_00000000)
  addC(carry, r0, r0, dLo, Carry(0))
  addC(carry, r1, r1, T(dHi), carry)

func mulAcc*[T: uint32|uint64](t, u, v: var T, a, b: T) {.inline.} =
  ## (t, u, v) <- (t, u, v) + a * b
  var UV: array[2, T]
  var carry: Carry
  mul(UV[1], UV[0], a, b)
  addC(carry, v, v, UV[0], Carry(0))
  addC(carry, u, u, UV[1], carry)
  t += T(carry)
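
# Usage sketch (illustrative, assuming a 64-bit build so the uint64 mul is
# available): mulAcc is the inner step of schoolbook multiplication, sliding a
# triple-word window (t, u, v) along the result limbs. The worst case fits:
when isMainModule:
  var t, u, v: uint64
  mulAcc(t, u, v, high(uint64), high(uint64))
  doAssert (t, u, v) == (0'u64, 0xFFFFFFFF_FFFFFFFE'u64, 1'u64)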

@@ -0,0 +1,95 @@
# Stint
# Copyright 2018 Status Research & Development GmbH
# Licensed under either of
#
# * Apache License, version 2.0, ([LICENSE-APACHE](LICENSE-APACHE) or http://www.apache.org/licenses/LICENSE-2.0)
# * MIT license ([LICENSE-MIT](LICENSE-MIT) or http://opensource.org/licenses/MIT)
#
# at your option. This file may not be copied, modified, or distributed except according to those terms.

import ../datatypes

# ############################################################
#
#   Extended precision primitives on GCC & Clang (all CPU archs)
#
# ############################################################

static:
  doAssert GCC_Compatible
  doAssert sizeof(int) == 8

func div2n1n*(q, r: var uint64, n_hi, n_lo, d: uint64) {.inline.} =
  ## Division uint128 by uint64
  ## Warning ⚠️ :
  ##   - if n_hi == d, the quotient does not fit in an uint64 and will throw SIGFPE on some platforms
  ##   - if n_hi > d, the result is undefined
  var dblPrec {.noInit.}: uint128
  {.emit:[dblPrec, " = (unsigned __int128)", n_hi," << 64 | (unsigned __int128)",n_lo,";"].}

  # Don't forget to dereference the var param in C mode
  when defined(cpp):
    {.emit:[q, " = (NU64)(", dblPrec," / ", d, ");"].}
    {.emit:[r, " = (NU64)(", dblPrec," % ", d, ");"].}
  else:
    {.emit:["*",q, " = (NU64)(", dblPrec," / ", d, ");"].}
    {.emit:["*",r, " = (NU64)(", dblPrec," % ", d, ");"].}

func mul*(hi, lo: var uint64, a, b: uint64) {.inline.} =
  ## Extended precision multiplication
  ## (hi, lo) <- a*b
  block:
    var dblPrec {.noInit.}: uint128
    {.emit:[dblPrec, " = (unsigned __int128)", a," * (unsigned __int128)", b,";"].}

    # Don't forget to dereference the var param in C mode
    when defined(cpp):
      {.emit:[hi, " = (NU64)(", dblPrec," >> ", 64'u64, ");"].}
      {.emit:[lo, " = (NU64)", dblPrec,";"].}
    else:
      {.emit:["*",hi, " = (NU64)(", dblPrec," >> ", 64'u64, ");"].}
      {.emit:["*",lo, " = (NU64)", dblPrec,";"].}

func muladd1*(hi, lo: var uint64, a, b, c: uint64) {.inline.} =
  ## Extended precision multiplication + addition
  ## (hi, lo) <- a*b + c
  ##
  ## Note: 0xFFFFFFFF_FFFFFFFF² -> (hi: 0xFFFFFFFFFFFFFFFE, lo: 0x0000000000000001)
  ##       so adding any c cannot overflow
  ##
  ## This is constant-time on most hardware
  ## See: https://www.bearssl.org/ctmul.html
  block:
    var dblPrec {.noInit.}: uint128
    {.emit:[dblPrec, " = (unsigned __int128)", a," * (unsigned __int128)", b, " + (unsigned __int128)",c,";"].}

    # Don't forget to dereference the var param in C mode
    when defined(cpp):
      {.emit:[hi, " = (NU64)(", dblPrec," >> ", 64'u64, ");"].}
      {.emit:[lo, " = (NU64)", dblPrec,";"].}
    else:
      {.emit:["*",hi, " = (NU64)(", dblPrec," >> ", 64'u64, ");"].}
      {.emit:["*",lo, " = (NU64)", dblPrec,";"].}

func muladd2*(hi, lo: var uint64, a, b, c1, c2: uint64) {.inline.} =
  ## Extended precision multiplication + addition + addition
  ## This is constant-time on most hardware except some specific ones like the Cortex M0
  ## (hi, lo) <- a*b + c1 + c2
  ##
  ## Note: 0xFFFFFFFF_FFFFFFFF² -> (hi: 0xFFFFFFFFFFFFFFFE, lo: 0x0000000000000001)
  ##       so adding 0xFFFFFFFFFFFFFFFF leads to (hi: 0xFFFFFFFFFFFFFFFF, lo: 0x0000000000000000)
  ##       and we have enough space to add again 0xFFFFFFFFFFFFFFFF without overflowing
  block:
    var dblPrec {.noInit.}: uint128
    {.emit:[
      dblPrec, " = (unsigned __int128)", a," * (unsigned __int128)", b,
      " + (unsigned __int128)",c1," + (unsigned __int128)",c2,";"
    ].}

    # Don't forget to dereference the var param in C mode
    when defined(cpp):
      {.emit:[hi, " = (NU64)(", dblPrec," >> ", 64'u64, ");"].}
      {.emit:[lo, " = (NU64)", dblPrec,";"].}
    else:
      {.emit:["*",hi, " = (NU64)(", dblPrec," >> ", 64'u64, ");"].}
      {.emit:["*",lo, " = (NU64)", dblPrec,";"].}

@@ -0,0 +1,57 @@
# Stint
# Copyright 2018 Status Research & Development GmbH
# Licensed under either of
#
# * Apache License, version 2.0, ([LICENSE-APACHE](LICENSE-APACHE) or http://www.apache.org/licenses/LICENSE-2.0)
# * MIT license ([LICENSE-MIT](LICENSE-MIT) or http://opensource.org/licenses/MIT)
#
# at your option. This file may not be copied, modified, or distributed except according to those terms.

import ../datatypes

# ############################################################
#
#     Extended precision primitives for X86-64 on GCC & Clang
#
# ############################################################

static:
  doAssert(defined(gcc) or defined(clang) or defined(llvm_gcc))
  doAssert sizeof(int) == 8
  doAssert X86

func div2n1n*(q, r: var uint64, n_hi, n_lo, d: uint64) {.inline.} =
  ## Division uint128 by uint64
  ## Warning ⚠️ :
  ##   - if n_hi == d, the quotient does not fit in an uint64 and will throw SIGFPE
  ##   - if n_hi > d, the result is undefined

  # DIV r/m64
  # Divide RDX:RAX (n_hi:n_lo) by r/m64
  #
  # Inputs
  #   - numerator high word in RDX,
  #   - numerator low word in RAX,
  #   - divisor as r/m parameter (register or memory at the compiler's discretion)
  # Result
  #   - Quotient in RAX
  #   - Remainder in RDX

  # 1. name the register/memory "divisor"
  # 2. don't forget to dereference the var hidden pointer
  # 3. -
  # 4. no clobbered registers beside explicitly used RAX and RDX
  when defined(cpp):
    asm """
      divq %[divisor]
      : "=a" (`q`), "=d" (`r`)
      : "d" (`n_hi`), "a" (`n_lo`), [divisor] "rm" (`d`)
      :
    """
  else:
    asm """
      divq %[divisor]
      : "=a" (`*q`), "=d" (`*r`)
      : "d" (`n_hi`), "a" (`n_lo`), [divisor] "rm" (`d`)
      :
    """

@@ -0,0 +1,87 @@
# Stint
# Copyright 2018 Status Research & Development GmbH
# Licensed under either of
#
# * Apache License, version 2.0, ([LICENSE-APACHE](LICENSE-APACHE) or http://www.apache.org/licenses/LICENSE-2.0)
# * MIT license ([LICENSE-MIT](LICENSE-MIT) or http://opensource.org/licenses/MIT)
#
# at your option. This file may not be copied, modified, or distributed except according to those terms.

import
  ../datatypes,
  ./addcarry_subborrow

# ############################################################
#
#      Extended precision primitives for X86-64 on MSVC
#
# ############################################################

static:
  doAssert defined(vcc)
  doAssert sizeof(int) == 8
  doAssert X86

func udiv128(highDividend, lowDividend, divisor: Ct[uint64], remainder: var Ct[uint64]): Ct[uint64] {.importc:"_udiv128", header: "<intrin.h>", nodecl.}
  ## Division 128 by 64, Microsoft only, 64-bit only.
  ## Returns the quotient as the return value and the remainder through a var parameter.
  ## Warning ⚠️ :
  ##   - if n_hi == d, the quotient does not fit in an uint64 and will throw SIGFPE
  ##   - if n_hi > d, the result is undefined

func umul128(a, b: Ct[uint64], hi: var Ct[uint64]): Ct[uint64] {.importc:"_umul128", header:"<intrin.h>", nodecl.}
  ## (hi, lo) <-- a * b
  ## Return value is the low word

func div2n1n*(q, r: var Ct[uint64], n_hi, n_lo, d: Ct[uint64]) {.inline.} =
  ## Division uint128 by uint64
  ## Warning ⚠️ :
  ##   - if n_hi == d, the quotient does not fit in an uint64 and will throw SIGFPE
  ##   - if n_hi > d, the result is undefined
  {.warning: "unsafeDiv2n1n is not constant-time at the moment on most hardware".}

  # TODO !!! - Replace by a constant-time, portable, non-assembly version
  #          -> use uint128? The compiler might add unwanted branches
  q = udiv128(n_hi, n_lo, d, r)

func mul*(hi, lo: var Ct[uint64], a, b: Ct[uint64]) {.inline.} =
  ## Extended precision multiplication
  ## (hi, lo) <- a*b
  ##
  ## This is constant-time on most hardware
  ## See: https://www.bearssl.org/ctmul.html
  lo = umul128(a, b, hi)

func muladd1*(hi, lo: var Ct[uint64], a, b, c: Ct[uint64]) {.inline.} =
  ## Extended precision multiplication + addition
  ## (hi, lo) <- a*b + c
  ##
  ## Note: 0xFFFFFFFF_FFFFFFFF² -> (hi: 0xFFFFFFFFFFFFFFFE, lo: 0x0000000000000001)
  ##       so adding any c cannot overflow
  ##
  ## This is constant-time on most hardware
  ## See: https://www.bearssl.org/ctmul.html
  var carry: Carry
  lo = umul128(a, b, hi)
  addC(carry, lo, lo, c, Carry(0))
  addC(carry, hi, hi, 0, carry)

func muladd2*(hi, lo: var Ct[uint64], a, b, c1, c2: Ct[uint64]) {.inline.} =
  ## Extended precision multiplication + addition + addition
  ## This is constant-time on most hardware except some specific ones like the Cortex M0
  ## (hi, lo) <- a*b + c1 + c2
  ##
  ## Note: 0xFFFFFFFF_FFFFFFFF² -> (hi: 0xFFFFFFFFFFFFFFFE, lo: 0x0000000000000001)
  ##       so adding 0xFFFFFFFFFFFFFFFF leads to (hi: 0xFFFFFFFFFFFFFFFF, lo: 0x0000000000000000)
  ##       and we have enough space to add again 0xFFFFFFFFFFFFFFFF without overflowing
  # For speed this could be implemented with parallel pipelined carry chains
  # via MULX + ADCX + ADOX
  var carry1, carry2: Carry

  lo = umul128(a, b, hi)
  # Carry chain 1
  addC(carry1, lo, lo, c1, Carry(0))
  addC(carry1, hi, hi, 0, carry1)
  # Carry chain 2
  addC(carry2, lo, lo, c2, Carry(0))
  addC(carry2, hi, hi, 0, carry2)