167 lines
5.1 KiB
Nim
167 lines
5.1 KiB
Nim
# Stint
|
|
# Copyright 2018 Status Research & Development GmbH
|
|
# Licensed under either of
|
|
#
|
|
# * Apache License, version 2.0, ([LICENSE-APACHE](LICENSE-APACHE) or http://www.apache.org/licenses/LICENSE-2.0)
|
|
# * MIT license ([LICENSE-MIT](LICENSE-MIT) or http://opensource.org/licenses/MIT)
|
|
#
|
|
# at your option. This file may not be copied, modified, or distributed except according to those terms.
|
|
|
|
import macros,
|
|
./conversion,
|
|
./initialization,
|
|
./datatypes,
|
|
./uint_comparison,
|
|
./uint_addsub
|
|
|
|
# ################### Multiplication ################### #
|
|
|
|
func lo(x: uint64): uint64 {.inline.} =
|
|
const
|
|
p: uint64 = 32
|
|
base: uint64 = 1 shl p
|
|
mask: uint64 = base - 1
|
|
result = x and mask
|
|
|
|
func hi(x: uint64): uint64 {.inline.} =
|
|
const
|
|
p = 32
|
|
result = x shr p
|
|
|
|
# No generic, somehow Nim is given ambiguous call with the T: UintImpl overload
|
|
func extPrecMul*(result: var UintImpl[uint8], x, y: uint8) {.inline.}=
|
|
## Extended precision multiplication
|
|
result = cast[type result](x.asDoubleUint * y.asDoubleUint)
|
|
|
|
func extPrecMul*(result: var UintImpl[uint16], x, y: uint16) {.inline.}=
|
|
## Extended precision multiplication
|
|
result = cast[type result](x.asDoubleUint * y.asDoubleUint)
|
|
|
|
func extPrecMul*(result: var UintImpl[uint32], x, y: uint32) {.inline.}=
|
|
## Extended precision multiplication
|
|
result = cast[type result](x.asDoubleUint * y.asDoubleUint)
|
|
|
|
func extPrecAddMul[T: uint8 or uint16 or uint32](result: var UintImpl[T], x, y: T) {.inline.}=
|
|
## Extended precision fused in-place addition & multiplication
|
|
result += cast[type result](x.asDoubleUint * y.asDoubleUint)
|
|
|
|
template extPrecMulImpl(result: var UintImpl[uint64], op: untyped, u, v: uint64) =
|
|
const
|
|
p = 64 div 2
|
|
base: uint64 = 1 shl p
|
|
|
|
var
|
|
x0, x1, x2, x3: uint64
|
|
|
|
let
|
|
ul = lo(u)
|
|
uh = hi(u)
|
|
vl = lo(v)
|
|
vh = hi(v)
|
|
|
|
x0 = ul * vl
|
|
x1 = ul * vh
|
|
x2 = uh * vl
|
|
x3 = uh * vh
|
|
|
|
x1 += hi(x0) # This can't carry
|
|
x1 += x2 # but this can
|
|
if x1 < x2: # if carry, add it to x3
|
|
x3 += base
|
|
|
|
op(result.hi, x3 + hi(x1))
|
|
op(result.lo, (x1 shl p) or lo(x0))
|
|
|
|
func extPrecMul*(result: var UintImpl[uint64], u, v: uint64) =
|
|
## Extended precision multiplication
|
|
extPrecMulImpl(result, `=`, u, v)
|
|
|
|
func extPrecAddMul(result: var UintImpl[uint64], u, v: uint64) =
|
|
## Extended precision fused in-place addition & multiplication
|
|
extPrecMulImpl(result, `+=`, u, v)
|
|
|
|
macro eqSym(x, y: untyped): untyped =
|
|
let eq = $x == $y # Unfortunately eqIdent compares to string.
|
|
result = newLit eq
|
|
|
|
func extPrecAddMul[T](result: var UintImpl[UintImpl[T]], u, v: UintImpl[T])
|
|
func extPrecMul*[T](result: var UintImpl[UintImpl[T]], u, v: UintImpl[T])
|
|
# Forward declaration
|
|
|
|
template extPrecMulImpl*[T](result: var UintImpl[UintImpl[T]], op: untyped, x, y: UintImpl[T]) =
|
|
# See details at
|
|
# https://en.wikipedia.org/wiki/Karatsuba_algorithm
|
|
# https://locklessinc.com/articles/256bit_arithmetic/
|
|
# https://www.miracl.com/press/missing-a-trick-karatsuba-variations-michael-scott
|
|
#
|
|
# We use the naive school grade multiplication instead of Karatsuba I.e.
|
|
# z1 = x.hi * y.lo + x.lo * y.hi (Naive) = (x.lo - x.hi)(y.hi - y.lo) + z0 + z2 (Karatsuba)
|
|
#
|
|
# On modern architecture:
|
|
# - addition and multiplication have the same cost
|
|
# - Karatsuba would require to deal with potentially negative intermediate result
|
|
# and introduce branching
|
|
# - More total operations means more register moves
|
|
|
|
var z1: type x
|
|
|
|
# Low part and hi part - z0 & z2
|
|
when eqSym(op, `+=`):
|
|
extPrecAddMul(result.lo, x.lo, y.lo)
|
|
extPrecAddMul(result.hi, x.hi, y.hi)
|
|
else:
|
|
extPrecMul(result.lo, x.lo, y.lo)
|
|
extPrecMul(result.hi, x.hi, y.hi)
|
|
|
|
## TODO - fuse those parts and reduce the number of carry checks
|
|
# Middle part - z1 - 1st mul
|
|
extPrecMul(z1, x.hi, y.lo)
|
|
result.lo.hi += z1.lo
|
|
if result.lo.hi < z1.lo:
|
|
inc result.hi
|
|
|
|
result.hi.lo += z1.hi
|
|
if result.hi.lo < z1.hi:
|
|
inc result.hi.hi
|
|
|
|
# Middle part - z1 - 2nd mul
|
|
extPrecMul(z1, x.lo, y.hi)
|
|
result.lo.hi += z1.lo
|
|
if result.lo.hi < z1.lo:
|
|
inc result.hi
|
|
|
|
result.hi.lo += z1.hi
|
|
if result.hi.lo < z1.hi:
|
|
inc result.hi.hi
|
|
|
|
func extPrecAddMul[T](result: var UintImpl[UintImpl[T]], u, v: UintImpl[T]) =
|
|
## Extended precision fused in-place addition & multiplication
|
|
extPrecMulImpl(result, `+=`, u, v)
|
|
|
|
func extPrecMul*[T](result: var UintImpl[UintImpl[T]], u, v: UintImpl[T]) =
|
|
## Extended precision multiplication
|
|
extPrecMulImpl(result, `=`, u, v)
|
|
|
|
func `*`*[T](x, y: UintImpl[T]): UintImpl[T] {.inline.}=
|
|
## Multiplication for multi-precision unsigned uint
|
|
#
|
|
# For our representation, it is similar to school grade multiplication
|
|
# Consider hi and lo as if they were digits
|
|
#
|
|
# 12
|
|
# X 15
|
|
# ------
|
|
# 10 lo*lo -> z0
|
|
# 5 hi*lo -> z1
|
|
# 2 lo*hi -> z1
|
|
# 10 hi*hi -- z2
|
|
# ------
|
|
# 180
|
|
#
|
|
# If T is a type
|
|
# For T * T --> T we don't need to compute z2 as it always overflow
|
|
# For T * T --> 2T (uint64 * uint64 --> uint128) we use extra precision multiplication
|
|
|
|
extPrecMul(result, x.lo, y.lo)
|
|
result.hi += x.lo * y.hi + x.hi * y.lo
|