2018-02-15 19:26:10 +00:00
# Copyright (c) 2018 Status Research & Development GmbH
# Distributed under the MIT License (license terms are at http://opensource.org/licenses/MIT).
2018-02-16 08:22:23 +00:00
import . / private / utils ,
2018-02-16 16:48:54 +00:00
uint_type ,
uint_comparison
2018-02-15 19:26:10 +00:00
2018-02-16 10:38:48 +00:00
proc `+=` * [ T : MpUint ] ( x : var T , y : T ) {. noSideEffect . } =
2018-02-16 08:22:23 +00:00
## In-place addition for multi-precision unsigned int
2018-02-15 22:28:31 +00:00
#
# Optimized assembly should contain adc instruction (add with carry)
# Clang on MacOS does with the -d:release switch and MpUint[uint32] (uint64)
2018-02-16 20:07:51 +00:00
type MpBase = type x . lo
2018-02-16 10:38:48 +00:00
let tmp = x . lo
2018-02-15 19:26:10 +00:00
2018-02-16 10:38:48 +00:00
x . lo + = y . lo
2018-02-16 20:07:51 +00:00
x . hi + = MpBase ( x . lo < tmp ) + y . hi
2018-02-15 19:26:10 +00:00
2018-02-16 10:38:48 +00:00
proc `+` * [ T : MpUint ] ( x , y : T ) : T {. noSideEffect , noInit , inline . } =
2018-02-15 22:28:31 +00:00
# Addition for multi-precision unsigned int
2018-02-16 10:38:48 +00:00
result = x
result + = y
2018-02-15 22:28:31 +00:00
2018-02-16 10:38:48 +00:00
proc `-=` * [ T : MpUint ] ( x : var T , y : T ) {. noSideEffect . } =
2018-02-16 08:22:23 +00:00
## In-place substraction for multi-precision unsigned int
2018-02-15 22:28:31 +00:00
#
2018-02-16 08:22:23 +00:00
# Optimized assembly should contain sbb instruction (substract with borrow)
2018-02-15 22:28:31 +00:00
# Clang on MacOS does with the -d:release switch and MpUint[uint32] (uint64)
2018-02-16 20:07:51 +00:00
type MpBase = type x . lo
2018-02-16 10:38:48 +00:00
let tmp = x . lo
2018-02-15 22:28:31 +00:00
2018-02-16 10:38:48 +00:00
x . lo - = y . lo
2018-02-16 20:07:51 +00:00
x . hi - = MpBase ( x . lo > tmp ) + y . hi
2018-02-15 19:26:10 +00:00
2018-02-16 10:38:48 +00:00
proc `-` * [ T : MpUint ] ( x , y : T ) : T {. noSideEffect , noInit , inline . } =
2018-02-15 22:28:31 +00:00
# Substraction for multi-precision unsigned int
2018-02-16 10:38:48 +00:00
result = x
result - = y
2018-02-16 08:22:23 +00:00
2018-02-16 10:38:48 +00:00
proc naiveMul [ T : BaseUint ] ( x , y : T ) : MpUint [ T ] {. noSideEffect , noInit , inline . }
2018-02-16 08:22:23 +00:00
# Forward declaration
2018-02-16 10:38:48 +00:00
proc `*` * [ T : MpUint ] ( x , y : T ) : T {. noSideEffect , noInit . } =
2018-02-16 08:22:23 +00:00
## Multiplication for multi-precision unsigned uint
#
# For our representation, it is similar to school grade multiplication
# Consider hi and lo as if they were digits
#
# 12
# X 15
# ------
# 10 lo*lo -> z0
# 5 hi*lo -> z1
# 2 lo*hi -> z1
# 10 hi*hi -- z2
# ------
# 180
#
# If T is a type
# For T * T --> T we don't need to compute z2 as it always overflow
2018-02-16 10:01:03 +00:00
# For T * T --> 2T (uint64 * uint64 --> uint128) we use extra precision multiplication
2018-02-16 08:22:23 +00:00
2018-02-16 10:38:48 +00:00
result = naiveMul ( x . lo , y . lo )
result . hi + = ( naiveMul ( x . hi , y . lo ) + naiveMul ( x . lo , y . hi ) ) . lo
2018-02-16 08:22:23 +00:00
2018-02-16 10:01:03 +00:00
template naiveMulImpl [ T : MpUint ] ( x , y : T ) : MpUint [ T ] =
# See details at
2018-02-16 08:22:23 +00:00
# https://en.wikipedia.org/wiki/Karatsuba_algorithm
2018-02-16 10:01:03 +00:00
# https://locklessinc.com/articles/256bit_arithmetic/
# https://www.miracl.com/press/missing-a-trick-karatsuba-variations-michael-scott
#
# We use the naive school grade multiplication instead of Karatsuba I.e.
# z1 = x.hi * y.lo + x.lo * y.hi (Naive) = (x.lo - x.hi)(y.hi - y.lo) + z0 + z2 (Karatsuba)
#
# On modern architecture:
# - addition and multiplication have the same cost
# - Karatsuba would require to deal with potentially negative intermediate result
# and introduce branching
# - More total operations means more register moves
2018-02-16 08:40:21 +00:00
2018-02-16 15:47:52 +00:00
let # cannot be const, compile-time sizeof only works for simple types
2018-02-16 20:07:51 +00:00
size = T . sizeof * 8
2018-02-16 12:54:03 +00:00
halfSize = size div 2
2018-02-16 08:22:23 +00:00
let
2018-02-16 10:01:03 +00:00
z0 = naiveMul ( x . lo , y . lo )
tmp = naiveMul ( x . hi , y . lo )
2018-02-16 08:22:23 +00:00
var z1 = tmp
2018-02-16 10:01:03 +00:00
z1 + = naiveMul ( x . hi , y . lo )
let z2 = ( z1 < tmp ) . T + naiveMul ( x . hi , y . hi )
2018-02-16 08:22:23 +00:00
2018-02-16 12:54:03 +00:00
result . lo = z1 . lo shl halfSize + z0
2018-02-16 08:22:23 +00:00
result . hi = z2 + z1 . hi
2018-02-16 10:38:48 +00:00
proc naiveMul [ T : BaseUint ] ( x , y : T ) : MpUint [ T ] {. noSideEffect , noInit , inline . } =
2018-02-16 10:01:03 +00:00
## Naive multiplication algorithm with extended precision
2018-02-16 08:22:23 +00:00
when T . sizeof in { 1 , 2 , 4 } :
# Use types twice bigger to do the multiplication
2018-02-16 10:38:48 +00:00
cast [ type result ] ( x . asDoubleUint * y . asDoubleUint )
2018-02-16 08:22:23 +00:00
elif T . sizeof = = 8 : # uint64 or MpUint[uint32]
# We cannot double uint64 to uint128
2018-02-16 15:47:52 +00:00
naiveMulImpl ( x . toMpUint , y . toMpUint )
2018-02-16 08:22:23 +00:00
else :
# Case: at least uint128 * uint128 --> uint256
2018-02-16 20:07:51 +00:00
naiveMulImpl ( x , y )
2018-02-16 21:17:13 +00:00
proc divmod * [ T : BaseUint ] ( x , y : T ) : tuple [ quot , rem : T ] {. noSideEffect . } =
## Division for multi-precision unsigned uint
## Returns quotient + reminder in a (quot, rem) tuple
#
# Implementation through binary shift division
const zero = T ( )
when x . lo is MpUInt :
const one = T ( lo : getSubType ( T ) ( 1 ) )
const mpOne = one
else :
const one : getSubType ( T ) = 1
const mpOne = T ( lo : getSubType ( T ) ( 1 ) )
if y = = zero :
raise newException ( DivByZeroError , " You attempted to divide by zero " )
elif y = = mpOne :
result . quot = x
return
var
shift = x . bit_length - y . bit_length
d = y shl shift
result . rem = x
while shift > = 0 :
result . quot = result . quot shl 1
if result . rem > = d :
result . rem - = d
result . quot . lo = result . quot . lo or one
d = d shr 1
dec ( shift )
# Performance note:
# The performance of this implementation is extremely dependant on shl and shr.
#
# Probably the most efficient algorithm that can benefit from MpUInt data structure is
# the recursive fast division by Burnikel and Ziegler (http://www.mpi-sb.mpg.de/~ziegler/TechRep.ps.gz):
# - Python implementation: https://bugs.python.org/file11060/fast_div.py and discussion https://bugs.python.org/issue3451
# - C++ implementation: https://github.com/linbox-team/givaro/blob/master/src/kernel/recint/rudiv.h
# - The Handbook of Elliptic and Hyperelliptic Cryptography Algorithm 10.35 on page 188 has a more explicit version of the div2NxN algorithm. This algorithm is directly recursive and avoids the mutual recursion of the original paper's calls between div2NxN and div3Nx2N.
# - Comparison of fast division algorithms fro large integers: http://bioinfo.ict.ac.cn/~dbu/AlgorithmCourses/Lectures/Hasselstrom2003.pdf