Add alternative branchless shift implementation (TODO benchmark on ARM)

This commit is contained in:
mratsim 2018-02-16 16:47:52 +01:00
parent 6e27069298
commit 5886d76ebc
3 changed files with 74 additions and 7 deletions

View File

@ -57,4 +57,18 @@ proc asDoubleUint*[T: BaseUint](n: T): auto {.noSideEffect, inline.} =
macro getSubType*(T: typedesc): untyped =
## Returns the subtype of a generic type, i.e. the type it is parameterized with
## MpUint[uint32] --> uint32
# The type instance is indexed twice with [1]; presumably the first index
# unwraps the typedesc and the second selects the first generic argument
# - TODO confirm against the macros module documentation.
# NOTE(review): the expression below appears twice in this view; this looks
# like the before/after pair of a diff hunk rather than two statements.
getTypeInst(T)[1][1]
getTypeInst(T)[1][1]
proc toMpUint*[T: SomeInteger](n: T): auto {.noSideEffect, inline.} =
  ## Cast an integer to the corresponding size MpUint:
  ##   uint64 --> MpUint[uint32]
  ##   uint32 --> MpUint[uint16]
  ##   uint16 --> MpUint[uint8]
  ## The branch is selected at compile time from the type of `n`.
  ## Any other integer type raises ValueError when the proc is called.
  # Sometimes direct casting doesn't work and we must cast through a pointer:
  # take the address of the immutable parameter, reinterpret it as a pointer
  # to the target MpUint type and dereference it.
  when T is uint64:
    return (cast[ptr [MpUint[uint32]]](unsafeAddr n))[]
  elif T is uint32:
    return (cast[ptr [MpUint[uint16]]](unsafeAddr n))[]
  elif T is uint16:
    # Was misspelled `unsfddr`, which broke the uint16 instantiation.
    return (cast[ptr [MpUint[uint8]]](unsafeAddr n))[]
  else:
    raise newException(ValueError, "You can only cast uint16, uint32 or uint64 to multiprecision integers")

View File

@ -77,7 +77,7 @@ template naiveMulImpl[T: MpUint](x, y: T): MpUint[T] =
# and introduce branching
# - More total operations means more register moves
let # TODO: should be a const - https://github.com/nim-lang/Nim/pull/5664
let # cannot be const, compile-time sizeof only works for simple types
size = (T.sizeof * 8)
halfSize = size div 2
let
@ -100,7 +100,7 @@ proc naiveMul[T: BaseUint](x, y: T): MpUint[T] {.noSideEffect, noInit, inline.}=
elif T.sizeof == 8: # uint64 or MpUint[uint32]
# We cannot double uint64 to uint128
naiveMulImpl(cast[MpUint[uint32]](a), cast[MpUint[uint32]](b))
naiveMulImpl(x.toMpUint, y.toMpUint)
else:
# Case: at least uint128 * uint128 --> uint256
naiveMulImpl(a, b)
naiveMulImpl(x, y)

View File

@ -34,7 +34,7 @@ proc `shl`*[T: MpUint](x: T, y: SomeInteger): T {.noInit, noSideEffect.}=
if y == 0:
return x
let # TODO: should be a const - https://github.com/nim-lang/Nim/pull/5664
let # cannot be const, compile-time sizeof only works for simple types
size = (T.sizeof * 8)
halfSize = size div 2
@ -53,7 +53,7 @@ proc `shr`*[T: MpUint](x: T, y: SomeInteger): T {.noInit, noSideEffect.}=
if y == 0:
return x
let # TODO: should be a const - https://github.com/nim-lang/Nim/pull/5664
let # cannot be const, compile-time sizeof only works for simple types
size = (T.sizeof * 8)
halfSize = size div 2
@ -64,4 +64,57 @@ proc `shr`*[T: MpUint](x: T, y: SomeInteger): T {.noInit, noSideEffect.}=
result.hi = x.hi shr y
else:
result.hi = x.hi shr (y - halfSize)
result.lo = 0.Sub
result.lo = 0.Sub
# ########################################################################
# TODO Benchmarks (especially on ARM)
# Alternative shift implementations without branching
#
# Quick testing on MpUint[uint32] on x86_64 with Clang shows that it is somewhat slower
# Fast shifting is key to fast division and modulo operations
# proc `shl`*[T: MpUint](x: T, y: SomeInteger): T {.noInit, noSideEffect.}=
# ## Compute the `shift left` operation of x and y
# type Sub = getSubType T
#
# let # cannot be const, compile-time sizeof only works for simple types
# size = Sub(T.sizeof * 8)
# halfSize = size div 2
#
# var S = y.Sub and (size-1) # y mod size
#
# let
# M1 = Sub( ((((S + size-1) or S) and halfSize) div halfSize) - 1)
# M2 = Sub( (S div halfSize) - 1)
#
# S = S and (halfSize-1) # y mod halfsize
#
# result.hi = (x.lo shl S) and not M2
# result.lo = (x.lo shl S) and M2
# result.hi = result.hi or ((
# x.hi shl S or (x.lo shr (size - S) and M1)
# ) and M2)
# proc `shr`*[T: MpUint](x: T, y: SomeInteger): T {.noInit, noSideEffect.}=
# ## Compute the `shift right` operation of x and y
# type Sub = getSubType T
#
# let # cannot be const, compile-time sizeof only works for simple types
# size = Sub(T.sizeof * 8)
# halfSize = size div 2
#
# var S = y.Sub and (size-1) # y mod size
#
# let
# M1 = Sub( ((((S + size-1) or S) and halfSize) div halfSize) - 1)
# M2 = Sub( (S div halfSize) - 1)
#
# S = S and (halfSize-1) # y mod halfsize
#
# result.lo = (x.hi shr S) and not M2
# result.hi = (x.hi shr S) and M2
# result.lo = result.lo or ((
# x.lo shr S or (x.lo shl (size - S) and M1)
# ) and M2)