Ported addition; bug in non-release mode for mul

This commit is contained in:
mratsim 2018-03-20 16:04:19 +01:00
parent 8b8f2a55c4
commit cd1adc84c9
9 changed files with 303 additions and 250 deletions

View File

@@ -0,0 +1,45 @@
# Mpint
# Copyright 2018 Status Research & Development GmbH
# Licensed under either of
#
# * Apache License, version 2.0, ([LICENSE-APACHE](LICENSE-APACHE) or http://www.apache.org/licenses/LICENSE-2.0)
# * MIT license ([LICENSE-MIT](LICENSE-MIT) or http://opensource.org/licenses/MIT)
#
# at your option. This file may not be copied, modified, or distributed except according to those terms.
import ../uint_type

proc `+=`*(x: var MpUint, y: MpUint) {.noSideEffect.}=
  ## In-place addition for multi-precision unsigned int
  #
  # Optimized assembly should contain adc instruction (add with carry)
  # Clang on MacOS does with the -d:release switch and MpUint[uint32] (uint64)
  type SubT = type x.lo
  let tmp = x.lo

  x.lo += y.lo
  x.hi += SubT(x.lo < tmp) + y.hi

proc `+`*(x, y: MpUint): MpUint {.noSideEffect, noInit, inline.}=
  # Addition for multi-precision unsigned int
  result = x
  result += y

  debugEcho "+: " & $result

proc `-=`*(x: var MpUint, y: MpUint) {.noSideEffect.}=
  ## In-place subtraction for multi-precision unsigned int
  #
  # Optimized assembly should contain sbb instruction (subtract with borrow)
  # Clang on MacOS does with the -d:release switch and MpUint[uint32] (uint64)
  type SubT = type x.lo
  let tmp = x.lo

  x.lo -= y.lo
  x.hi -= SubT(x.lo > tmp) + y.hi

proc `-`*(x, y: MpUint): MpUint {.noSideEffect, noInit, inline.}=
  # Subtraction for multi-precision unsigned int
  result = x
  result -= y
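The carry detection above relies on unsigned wrap-around: after `x.lo += y.lo`, the low limb is smaller than its previous value exactly when the addition overflowed, and that comparison is what the compiler can lower to an adc. A minimal standalone sketch of the same trick on plain uint8 limbs (values and names here are illustrative, not part of this commit):

  # Emulate a 16-bit addition with two uint8 limbs: 0x01FF + 0x0001 == 0x0200
  var lo = 0xFF'u8
  var hi = 0x01'u8

  let tmp = lo
  lo += 0x01'u8            # wraps: 0xFF + 1 == 0x00
  hi += uint8(lo < tmp)    # (lo < tmp) is exactly the carry bit

  doAssert hi == 0x02'u8 and lo == 0x00'u8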

View File

@@ -0,0 +1,56 @@
# Mpint
# Copyright 2018 Status Research & Development GmbH
# Licensed under either of
#
# * Apache License, version 2.0, ([LICENSE-APACHE](LICENSE-APACHE) or http://www.apache.org/licenses/LICENSE-2.0)
# * MIT license ([LICENSE-MIT](LICENSE-MIT) or http://opensource.org/licenses/MIT)
#
# at your option. This file may not be copied, modified, or distributed except according to those terms.
# proc divmod*(x, y: MpUint): tuple[quot, rem: MpUint] {.noSideEffect.}=
#   ## Division for multi-precision unsigned uint
#   ## Returns quotient + remainder in a (quot, rem) tuple
#   #
#   # Implementation through binary shift division
#   const zero = T()
#
#   when x.lo is MpUInt:
#     const one = T(lo: getSubType(T)(1))
#   else:
#     const one: getSubType(T) = 1
#
#   if unlikely(y.isZero):
#     raise newException(DivByZeroError, "You attempted to divide by zero")
#
#   var
#     shift = x.bit_length - y.bit_length
#     d = y shl shift
#
#   result.rem = x
#
#   while shift >= 0:
#     result.quot += result.quot
#     if result.rem >= d:
#       result.rem -= d
#       result.quot.lo = result.quot.lo or one
#
#     d = d shr 1
#     dec(shift)
#
#   # Performance note:
#   # The performance of this implementation is extremely dependent on shl and shr.
#   #
#   # Probably the most efficient algorithm that can benefit from the MpUInt data structure is
#   # the recursive fast division by Burnikel and Ziegler (http://www.mpi-sb.mpg.de/~ziegler/TechRep.ps.gz):
#   # - Python implementation: https://bugs.python.org/file11060/fast_div.py and discussion https://bugs.python.org/issue3451
#   # - C++ implementation: https://github.com/linbox-team/givaro/blob/master/src/kernel/recint/rudiv.h
#   # - The Handbook of Elliptic and Hyperelliptic Cryptography Algorithm 10.35 on page 188 has a more explicit version of the div2NxN algorithm. This algorithm is directly recursive and avoids the mutual recursion of the original paper's calls between div2NxN and div3Nx2N.
#   # - Comparison of fast division algorithms for large integers: http://bioinfo.ict.ac.cn/~dbu/AlgorithmCourses/Lectures/Hasselstrom2003.pdf

# proc `div`*(x, y: MpUint): MpUint {.inline, noSideEffect.} =
#   ## Division operation for multi-precision unsigned uint
#   divmod(x,y).quot

# proc `mod`*(x, y: MpUint): MpUint {.inline, noSideEffect.} =
#   ## Modulo operation for multi-precision unsigned uint
#   divmod(x,y).rem
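The commented-out routine is shift-subtract (binary long) division: align the divisor under the dividend's top bit, then walk the shift back down, subtracting wherever possible and setting the corresponding quotient bit. A self-contained sketch of the same loop on built-in uint64 (divmodDemo and bitLength are illustrative names, not part of this commit):

  proc bitLength(n: uint64): int =
    # Number of significant bits (0 for n == 0)
    var n = n
    while n != 0:
      inc result
      n = n shr 1

  proc divmodDemo(x, y: uint64): tuple[quot, rem: uint64] =
    if y == 0:
      raise newException(DivByZeroError, "You attempted to divide by zero")
    if x < y:
      return (0'u64, x)

    var shift = bitLength(x) - bitLength(y)
    var d = y shl shift
    result.rem = x

    while shift >= 0:
      result.quot += result.quot        # quot = quot shl 1
      if result.rem >= d:
        result.rem -= d
        result.quot = result.quot or 1  # set the low quotient bit
      d = d shr 1
      dec(shift)

  when isMainModule:
    let qr = divmodDemo(1000'u64, 7)
    doAssert qr.quot == 142 and qr.rem == 6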

View File

@@ -0,0 +1,99 @@
# Mpint
# Copyright 2018 Status Research & Development GmbH
# Licensed under either of
#
# * Apache License, version 2.0, ([LICENSE-APACHE](LICENSE-APACHE) or http://www.apache.org/licenses/LICENSE-2.0)
# * MIT license ([LICENSE-MIT](LICENSE-MIT) or http://opensource.org/licenses/MIT)
#
# at your option. This file may not be copied, modified, or distributed except according to those terms.
import ../uint_type, ../uint_init
import ./addsub_impl

## Implementation of multiplication

# For our representation, it is similar to school grade multiplication
# Consider hi and lo as if they were digits
#
#     12
# X   15
# ------
#     10   lo*lo -> z0
#    5     hi*lo -> z1
#    2     lo*hi -> z1
#   10     hi*hi -> z2
# ------
#    180
#
# If T is a type
# For T * T --> T we don't need to compute z2 as it always overflows
# For T * T --> 2T (uint64 * uint64 --> uint128) we use extra precision multiplication
#
# See details at
# https://en.wikipedia.org/wiki/Karatsuba_algorithm
# https://locklessinc.com/articles/256bit_arithmetic/
# https://www.miracl.com/press/missing-a-trick-karatsuba-variations-michael-scott
#
# We use the naive school grade multiplication instead of Karatsuba, i.e.
# z1 = x.hi * y.lo + x.lo * y.hi (Naive) = (x.lo - x.hi)(y.hi - y.lo) + z0 + z2 (Karatsuba)
#
# On modern architectures:
# - addition and multiplication have the same cost
# - Karatsuba would require to deal with potentially negative intermediate results
#   and introduce branching
# - More total operations means more register moves

import typetraits

proc `*`*(x, y: MpUint): MpUint {.noSideEffect.}=
  ## Multiplication for multi-precision unsigned uint
  mixin naiveMul
  result = naiveMul(x.lo, y.lo)
  result.hi += (naiveMul(x.hi, y.lo) + naiveMul(x.lo, y.hi)).lo

  debugEcho "*: " & $result
  debugEcho "* type: " & $result.type.name

# proc naiveMulImpl[bits: static[int]](x, y: MpUint[bits]): MpUint[bits * 2] {.noSideEffect.}=
#   ## Naive multiplication algorithm with extended precision
#   ## i.e. we use types twice bigger to do the multiplication
#   ## and only keep the bottom part
#   mixin naiveMul
#
#   const
#     halfSize = bits div 2
#   let
#     z0 = naiveMul(x.lo, y.lo)
#     tmp = naiveMul(x.hi, y.lo)
#
#   var z1 = tmp
#   z1 += naiveMul(x.lo, y.hi)
#   let z2 = (z1 < tmp).T + naiveMul(x.hi, y.hi)
#
#   let tmp2 = z1.lo shl halfSize
#   result.lo = tmp2
#   result.lo += z0
#   result.hi = (result.lo < tmp2).T + z2 + z1.hi

# proc naiveMul*[bits: static[int]](x, y: MpUint[bits]): MpUint[bits * 2] {.noSideEffect.}=
#   naiveMulImpl(x, y)

proc naiveMul*(x, y: uint8): MpUint[16] {.noSideEffect.}=
  result = toMpUint(x.uint16 * y.uint16)

proc naiveMul*(x, y: uint16): MpUint[32] {.noSideEffect.}=
  result = toMpUint(x.uint32 * y.uint32)
  debugEcho "naiveMul cast:" & $result

proc naiveMul*(x, y: uint32): MpUint[64] {.noSideEffect.}=
  toMpUint(x.uint64 * y.uint64)

# proc naiveMul*(x, y: uint64): MpUint[128] {.noSideEffect, noInit, inline.}=
#   let x = x.toMpUint
#   let y = y.toMpUint
#   naiveMulImpl[64](x, y)
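For reference, the school-grade scheme in the comment above can be checked on built-in integers: keep z0 in full, fold only the low half of z1 into the high half, and drop z2 entirely since it only contributes above the result width. A hypothetical mulDemo for the truncating T * T --> T case, here mod 2^16 (illustrative, not part of this commit):

  proc mulDemo(x, y: uint16): uint16 =
    # Multiply via 8-bit halves, exactly like MpUint does with its limbs
    let
      xlo = x and 0xFF
      xhi = x shr 8
      ylo = y and 0xFF
      yhi = y shr 8
    let z0 = xlo * ylo               # kept in full
    let z1 = xhi * ylo + xlo * yhi   # only its low byte lands in the result
    result = z0 + (z1 shl 8)         # z2 = xhi * yhi would overflow: dropped

  when isMainModule:
    doAssert mulDemo(12, 15) == 180
    doAssert mulDemo(1000, 60) == 1000'u16 * 60'u16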

View File

@@ -8,13 +8,13 @@
 # at your option. This file may not be copied, modified, or distributed except according to those terms.

 import ./uint_type,
-       ./uint_init,
-       ./uint_bitwise_ops,
-       ./uint_binary_ops,
-       ./uint_comparison
+       ./uint_init
+       #./uint_bitwise_ops,
+       #./uint_binary_ops,
+       #./uint_comparison

 export uint_type,
-       uint_init,
-       uint_bitwise_ops,
-       uint_binary_ops,
-       uint_comparison
+       uint_init
+       # uint_bitwise_ops,
+       # uint_binary_ops,
+       # uint_comparison

View File

@@ -1,59 +0,0 @@
# Mpint
# Copyright 2018 Status Research & Development GmbH
# Licensed under either of
#
# * Apache License, version 2.0, ([LICENSE-APACHE](LICENSE-APACHE) or http://www.apache.org/licenses/LICENSE-2.0)
# * MIT license ([LICENSE-MIT](LICENSE-MIT) or http://opensource.org/licenses/MIT)
#
# at your option. This file may not be copied, modified, or distributed except according to those terms.
import ../uint_type,
       macros

macro getSubType*(T: typedesc): untyped =
  ## Returns the subtype of a generic type
  ## MpUint[uint32] --> uint32
  getTypeInst(T)[1][1]

proc asUint*[T: MpUInt](n: T): auto {.noSideEffect, inline.}=
  ## Casts a multiprecision integer to an uint of the same size
  when T.sizeof > 8:
    raise newException("Unreachable. You are trying to cast a MpUint with more than 64-bit of precision")
  elif T.sizeof == 8:
    cast[uint64](n)
  elif T.sizeof == 4:
    cast[uint32](n)
  elif T.sizeof == 2:
    cast[uint16](n)
  else:
    raise newException("Unreachable. MpUInt must be 16-bit minimum and a power of 2")

proc asUint*[T: SomeUnsignedInt](n: T): T {.noSideEffect, inline.}=
  ## No-op overload of multi-precision int casting
  n

proc asDoubleUint*[T: BaseUint](n: T): auto {.noSideEffect, inline.} =
  ## Convert an integer or MpUint to an uint with double the size
  type Double = (
    when T.sizeof == 4: uint64
    elif T.sizeof == 2: uint32
    else: uint16
  )
  n.asUint.Double

proc toMpUint*[T: SomeInteger](n: T): auto {.noSideEffect, inline.} =
  ## Cast an integer to the corresponding size MpUint
  # Sometimes direct casting doesn't work and we must cast through a pointer
  when T is uint64:
    return (cast[ptr [MpUint[uint32]]](unsafeAddr n))[]
  elif T is uint32:
    return (cast[ptr [MpUint[uint16]]](unsafeAddr n))[]
  elif T is uint16:
    return (cast[ptr [MpUint[uint8]]](unsafeAddr n))[]
  else:
    raise newException(ValueError, "You can only cast uint16, uint32 or uint64 to multiprecision integers")
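The deleted toMpUint illustrates a generally useful trick: when a direct cast between value types is rejected by the compiler, reinterpreting through a typed pointer still works. A standalone sketch, assuming a little-endian host (the Pair type is illustrative, not part of this commit):

  type Pair = object
    lo, hi: uint32   # little-endian layout, matching uint_type

  let n = 0x0000_0001_0000_0002'u64
  let p = cast[ptr Pair](unsafeAddr n)[]
  doAssert p.lo == 2'u32 and p.hi == 1'u32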

View File

@@ -0,0 +1,8 @@
# Mpint
# Copyright 2018 Status Research & Development GmbH
# Licensed under either of
#
# * Apache License, version 2.0, ([LICENSE-APACHE](LICENSE-APACHE) or http://www.apache.org/licenses/LICENSE-2.0)
# * MIT license ([LICENSE-MIT](LICENSE-MIT) or http://opensource.org/licenses/MIT)
#
# at your option. This file may not be copied, modified, or distributed except according to those terms.

View File

@@ -7,156 +7,22 @@
 #
 # at your option. This file may not be copied, modified, or distributed except according to those terms.

-import ./private/[bithacks, casting],
-       uint_type,
-       uint_comparison
+import ./binary_ops/addsub_impl,
+       ./binary_ops/mul_impl

-proc `+=`*[T: MpUint](x: var T, y: T) {.noSideEffect.}=
-  ## In-place addition for multi-precision unsigned int
-  #
-  # Optimized assembly should contain adc instruction (add with carry)
-  # Clang on MacOS does with the -d:release switch and MpUint[uint32] (uint64)
-  type SubT = type x.lo
-  let tmp = x.lo
-
-  x.lo += y.lo
-  x.hi += SubT(x.lo < tmp) + y.hi
+export addsub_impl, mul_impl

-proc `+`*[T: MpUint](x, y: T): T {.noSideEffect, noInit, inline.}=
-  # Addition for multi-precision unsigned int
-  result = x
-  result += y
+when isMainModule:
+  import typetraits
+  import ./uint_init

-proc `-=`*[T: MpUint](x: var T, y: T) {.noSideEffect.}=
-  ## In-place subtraction for multi-precision unsigned int
-  #
-  # Optimized assembly should contain sbb instruction (subtract with borrow)
-  # Clang on MacOS does with the -d:release switch and MpUint[uint32] (uint64)
-  type SubT = type x.lo
-  let tmp = x.lo
-
-  x.lo -= y.lo
-  x.hi -= SubT(x.lo > tmp) + y.hi
+  let a = toMpUint(10'u32)

-proc `-`*[T: MpUint](x, y: T): T {.noSideEffect, noInit, inline.}=
-  # Subtraction for multi-precision unsigned int
-  result = x
-  result -= y
+  echo "a: " & $a
+  echo "a+a: " & $(a+a)
+
+  let z = a * a
+  echo z
+  echo $z.type.name

-proc naiveMul[T: BaseUint](x, y: T): MpUint[T] {.noSideEffect, noInit, inline.}
-  # Forward declaration
-
-proc `*`*[T: MpUint](x, y: T): T {.noSideEffect, noInit.}=
-  ## Multiplication for multi-precision unsigned uint
-  #
-  # For our representation, it is similar to school grade multiplication
-  # Consider hi and lo as if they were digits
-  #
-  #     12
-  # X   15
-  # ------
-  #     10   lo*lo -> z0
-  #    5     hi*lo -> z1
-  #    2     lo*hi -> z1
-  #   10     hi*hi -> z2
-  # ------
-  #    180
-  #
-  # If T is a type
-  # For T * T --> T we don't need to compute z2 as it always overflows
-  # For T * T --> 2T (uint64 * uint64 --> uint128) we use extra precision multiplication
-  result = naiveMul(x.lo, y.lo)
-  result.hi += (naiveMul(x.hi, y.lo) + naiveMul(x.lo, y.hi)).lo
-
-template naiveMulImpl[T: MpUint](x, y: T): MpUint[T] =
-  # See details at
-  # https://en.wikipedia.org/wiki/Karatsuba_algorithm
-  # https://locklessinc.com/articles/256bit_arithmetic/
-  # https://www.miracl.com/press/missing-a-trick-karatsuba-variations-michael-scott
-  #
-  # We use the naive school grade multiplication instead of Karatsuba, i.e.
-  # z1 = x.hi * y.lo + x.lo * y.hi (Naive) = (x.lo - x.hi)(y.hi - y.lo) + z0 + z2 (Karatsuba)
-  #
-  # On modern architectures:
-  # - addition and multiplication have the same cost
-  # - Karatsuba would require to deal with potentially negative intermediate results
-  #   and introduce branching
-  # - More total operations means more register moves
-
-  let
-    halfSize = T.sizeof * 4
-    z0 = naiveMul(x.lo, y.lo)
-    tmp = naiveMul(x.hi, y.lo)
-
-  var z1 = tmp
-  z1 += naiveMul(x.lo, y.hi)
-  let z2 = (z1 < tmp).T + naiveMul(x.hi, y.hi)
-
-  let tmp2 = z1.lo shl halfSize
-  result.lo = tmp2
-  result.lo += z0
-  result.hi = (result.lo < tmp2).T + z2 + z1.hi
-
-proc naiveMul[T: BaseUint](x, y: T): MpUint[T] {.noSideEffect, noInit, inline.}=
-  ## Naive multiplication algorithm with extended precision
-  when T.sizeof in {1, 2, 4}:
-    # Use types twice bigger to do the multiplication
-    cast[type result](x.asDoubleUint * y.asDoubleUint)
-  elif T.sizeof == 8: # uint64 or MpUint[uint32]
-    # We cannot double uint64 to uint128
-    naiveMulImpl(x.toMpUint, y.toMpUint)
-  else:
-    # Case: at least uint128 * uint128 --> uint256
-    naiveMulImpl(x, y)
-
-proc divmod*[T: BaseUint](x, y: T): tuple[quot, rem: T] {.noSideEffect.}=
-  ## Division for multi-precision unsigned uint
-  ## Returns quotient + remainder in a (quot, rem) tuple
-  #
-  # Implementation through binary shift division
-  const zero = T()
-
-  when x.lo is MpUInt:
-    const one = T(lo: getSubType(T)(1))
-  else:
-    const one: getSubType(T) = 1
-
-  if unlikely(y.isZero):
-    raise newException(DivByZeroError, "You attempted to divide by zero")
-
-  var
-    shift = x.bit_length - y.bit_length
-    d = y shl shift
-
-  result.rem = x
-
-  while shift >= 0:
-    result.quot += result.quot
-    if result.rem >= d:
-      result.rem -= d
-      result.quot.lo = result.quot.lo or one
-
-    d = d shr 1
-    dec(shift)
-
-  # Performance note:
-  # The performance of this implementation is extremely dependent on shl and shr.
-  #
-  # Probably the most efficient algorithm that can benefit from the MpUInt data structure is
-  # the recursive fast division by Burnikel and Ziegler (http://www.mpi-sb.mpg.de/~ziegler/TechRep.ps.gz):
-  # - Python implementation: https://bugs.python.org/file11060/fast_div.py and discussion https://bugs.python.org/issue3451
-  # - C++ implementation: https://github.com/linbox-team/givaro/blob/master/src/kernel/recint/rudiv.h
-  # - The Handbook of Elliptic and Hyperelliptic Cryptography Algorithm 10.35 on page 188 has a more explicit version of the div2NxN algorithm. This algorithm is directly recursive and avoids the mutual recursion of the original paper's calls between div2NxN and div3Nx2N.
-  # - Comparison of fast division algorithms for large integers: http://bioinfo.ict.ac.cn/~dbu/AlgorithmCourses/Lectures/Hasselstrom2003.pdf
-
-proc `div`*[T: BaseUint](x, y: T): T {.inline, noSideEffect.} =
-  ## Division operation for multi-precision unsigned uint
-  divmod(x,y).quot
-
-proc `mod`*[T: BaseUint](x, y: T): T {.inline, noSideEffect.} =
-  ## Modulo operation for multi-precision unsigned uint
-  divmod(x,y).rem

View File

@@ -9,27 +9,53 @@
 import typetraits

-import ./private/[bithacks, casting],
-       ./uint_type
+import ./uint_type

-proc initMpUint*[T: BaseUint; U: BaseUInt](n: T, base_type: typedesc[U]): MpUint[U] {.noSideEffect.} =
-  when not (T is type result):
-    let len = n.bit_length
-    const size = sizeof(U) * 8
-    if len >= 2 * size:
-      # Todo print n
-      raise newException(ValueError, "Input cannot be stored in a multi-precision integer of base " & $T.name &
-        "\nIt requires at least " & $len & " bits of precision")
-    elif len < size:
-      result.lo = n.U # TODO: converter for MpInts
-    else: # Both have the same size and memory representation
-      assert len == size
-      n.toMpUint
-  else:
-    n
+proc toMpUint*(n: uint16): MpUint[16] {.noInit, inline, noSideEffect.}=
+  ## Cast an integer to the corresponding size MpUint
+  cast[MpUint[16]](n)
+
+proc toMpUint*(n: uint32): MpUint[32] {.noInit, inline, noSideEffect.}=
+  ## Cast an integer to the corresponding size MpUint
+  cast[MpUint[32]](n)
+
+proc toMpUint*(n: uint64): MpUint[64] {.noInit, inline, noSideEffect.}=
+  ## Cast an integer to the corresponding size MpUint
+  cast[MpUint[64]](n)
+
+proc initMpUint*(n: SomeUnsignedInt, bits: static[int]): MpUint[bits] {.noSideEffect.} =
+  const nb_bits = n.sizeof * 8
+  static:
+    assert (bits and (bits-1)) == 0, $bits & " is not a power of 2"
+    assert bits >= 16, "The number of bits in a MpUint should be greater or equal to 16"
+    assert nb_bits <= bits, "Input cannot be stored in a " & $bits & "-bit multi-precision unsigned integer\n" &
+      "It requires at least " & $nb_bits & " bits of precision"
+
+  when nb_bits <= bits div 2:
+    result.lo = (type result.lo)(n)
+  else:
+    result = n.toMpUint

-proc u128*[T: BaseUInt](n: T): UInt128 {.noSideEffect, inline.}=
-  initMpUint(n, uint64)
+## TODO: The current initMpUint prevents the following:
+##   let a = 10
+##   let b = initMpUint[16](a)
+##
+## It should use bit_length to get the number of bits needed instead

-proc u256*[T: BaseUInt](n: T): UInt256 {.noSideEffect, inline.}=
-  initMpUint(n, UInt256)
+# proc u128*[T: BaseUInt](n: T): UInt128 {.noSideEffect, inline.}=
+#   initMpUint(n, uint64)
+
+# proc u256*[T: BaseUInt](n: T): UInt256 {.noSideEffect, inline.}=
+#   initMpUint(n, UInt256)
+
+when isMainModule:
+  let a = 10'u16
+  let b = a.toMpUint
+
+  echo b.repr
+  echo b
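As a usage sketch for the new initializer (hypothetical call sites, assuming this module is imported): an input narrower than half the target width is stored directly in the lo limb, while a full-width input goes through toMpUint:

  let small = initMpUint(42'u8, 32)           # 8 <= 32 div 2: lands in result.lo
  let full  = initMpUint(0xDEADBEEF'u32, 32)  # full width: reinterpreted via toMpUint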

View File

@@ -8,24 +8,36 @@
 # at your option. This file may not be copied, modified, or distributed except according to those terms.

 type
-  MpUint*{.packed.}[BaseUint] = object
+  # TODO: The following is a hacky workaround
+  # due to:
+  # - https://github.com/nim-lang/Nim/issues/7230
+  # - https://github.com/nim-lang/Nim/issues/7378
+  # - https://github.com/nim-lang/Nim/issues/7379
+  BitsHolder[bits: static[int]] = object
+
+type
+  MpUintImpl[bh] = object
     # TODO: when gcc/clang defines it use the builtin uint128
     when system.cpuEndian == littleEndian:
-      lo*, hi*: BaseUint
+      when bh is BitsHolder[128]: lo*, hi*: uint64
+      elif bh is BitsHolder[64]: lo*, hi*: uint32
+      elif bh is BitsHolder[32]: lo*, hi*: uint16
+      elif bh is BitsHolder[16]: lo*, hi*: uint8
+
+      # The following cannot be implemented recursively yet
+      elif bh is BitsHolder[256]: lo*, hi*: MpUintImpl[BitsHolder[128]]
+
+      # else:
+      #   Not implemented
     else:
-      hi*, lo*: BaseUint
-
-  BaseUint* = SomeUnsignedInt or MpUint
-
-  UInt128* = MpUint[uint64]
-  UInt256* = MpUint[UInt128]
-
-template convBool(typ: typedesc): untyped =
-  converter boolMpUint*(b: bool): MpUint[typ] {.noSideEffect, inline.}=
-    result.lo = b.typ
-
-convBool(uint8)
-convBool(uint16)
-convBool(uint32)
-convBool(uint64)
+      when bh is BitsHolder[128]: hi*, lo*: uint64
+      elif bh is BitsHolder[64]: hi*, lo*: uint32
+      elif bh is BitsHolder[32]: hi*, lo*: uint16
+      elif bh is BitsHolder[16]: hi*, lo*: uint8
+
+      # The following cannot be implemented recursively yet
+      elif bh is BitsHolder[256]: hi*, lo*: MpUintImpl[BitsHolder[128]]
+
+      # else:
+      #   Not implemented
+
+  MpUint*[bits: static[int]] = MpUintImpl[BitsHolder[bits]]
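Net effect of the workaround: MpUint is now indexed by a static bit count instead of a base type, and BitsHolder selects the concrete limb type. A quick illustrative check, valid inside this module since MpUintImpl and BitsHolder are not exported (assuming the declarations above compile as intended):

  var x: MpUint[64]          # resolves to MpUintImpl[BitsHolder[64]]
  x.lo = 0xFFFF_FFFF'u32     # the limbs of a 64-bit MpUint are uint32
  x.hi = 1'u32
  doAssert sizeof(x) == 8    # two uint32 limbs, no padding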