Add alternative branchless shift implementation (TODO benchmark on ARM)

2025-02-22 03:48:20 +00:00 · 2018-02-16 16:47:52 +01:00 · 2018-02-16 16:47:52 +01:00 · 5886d76ebc
commit 5886d76ebc
parent 6e27069298
3 changed files with 74 additions and 7 deletions
--- a/src/private/utils.nim
+++ b/src/private/utils.nim
@ -58,3 +58,17 @@ macro getSubType*(T: typedesc): untyped =
  ## Returns the subtype of a generic type
  ## MpUint[uint32] --> uint32
  getTypeInst(T)[1][1]
 proc toMpUint*[T: SomeInteger](n: T): auto {.noSideEffect, inline.} =
  ## Reinterpret the integer `n` as the `MpUint` of the same byte size.
  ##
  ## The mapping, resolved at compile time from `T`, is:
  ## - `uint64` --> `MpUint[uint32]`
  ## - `uint32` --> `MpUint[uint16]`
  ## - `uint16` --> `MpUint[uint8]`
  ##
  ## For any other integer type the only statement left in the body is the
  ## `raise` of a `ValueError`.
  # Sometimes direct casting doesn't work and we must cast through a pointer:
  # take the address of `n`, view it as a pointer to the target type, and
  # dereference it.
  when T is uint64:
    return cast[ptr MpUint[uint32]](unsafeAddr n)[]
  elif T is uint32:
    return cast[ptr MpUint[uint16]](unsafeAddr n)[]
  elif T is uint16:
    # Was misspelled `unsfddr`, which broke compilation of this branch only.
    return cast[ptr MpUint[uint8]](unsafeAddr n)[]
  else:
    raise newException(ValueError,
      "You can only cast uint16, uint32 or uint64 to multiprecision integers")
--- a/src/uint_binary_ops.nim
+++ b/src/uint_binary_ops.nim
@ -77,7 +77,7 @@ template naiveMulImpl[T: MpUint](x, y: T): MpUint[T] =
  #     and introduce branching
  #   - More total operations means more register moves
-  let # TODO: should be a const - https://github.com/nim-lang/Nim/pull/5664
+  let  # cannot be const, compile-time sizeof only works for simple types
    size = (T.sizeof * 8)
    halfSize = size div 2
  let
@ -100,7 +100,7 @@ proc naiveMul[T: BaseUint](x, y: T): MpUint[T] {.noSideEffect, noInit, inline.}=
  elif T.sizeof == 8: # uint64 or MpUint[uint32]
    # We cannot double uint64 to uint128
-    naiveMulImpl(cast[MpUint[uint32]](a), cast[MpUint[uint32]](b))
+    naiveMulImpl(x.toMpUint, y.toMpUint)
  else:
    # Case: at least uint128 * uint128 --> uint256
-    naiveMulImpl(a, b)
+    naiveMulImpl(x, y)
--- a/src/uint_bitwise_ops.nim
+++ b/src/uint_bitwise_ops.nim
@ -34,7 +34,7 @@ proc `shl`*[T: MpUint](x: T, y: SomeInteger): T {.noInit, noSideEffect.}=
  if y == 0:
    return x
-  let # TODO: should be a const - https://github.com/nim-lang/Nim/pull/5664
+  let # cannot be const, compile-time sizeof only works for simple types
    size = (T.sizeof * 8)
    halfSize = size div 2
@ -53,7 +53,7 @@ proc `shr`*[T: MpUint](x: T, y: SomeInteger): T {.noInit, noSideEffect.}=
  if y == 0:
    return x
-  let # TODO: should be a const - https://github.com/nim-lang/Nim/pull/5664
+  let # cannot be const, compile-time sizeof only works for simple types
    size = (T.sizeof * 8)
    halfSize = size div 2
@ -65,3 +65,56 @@ proc `shr`*[T: MpUint](x: T, y: SomeInteger): T {.noInit, noSideEffect.}=
  else:
    result.hi = x.hi shr (y - halfSize)
    result.lo = 0.Sub
 # ########################################################################
 # TODO Benchmarks (especially on ARM)
 # Alternative shift implementations without branching
 #
 # Quick testing on MpUint[uint32] on x86_64 with Clang shows that the branchless version is somewhat slower
 # Fast shifting is key to fast division and modulo operations
 # proc `shl`*[T: MpUint](x: T, y: SomeInteger): T {.noInit, noSideEffect.}=
 #   ## Compute the `shift left` operation of x and y
 #   type Sub = getSubType T
 #
 #   let # cannot be const, compile-time sizeof only works for simple types
 #     size = Sub(T.sizeof * 8)
 #     halfSize = size div 2
 #
 #   var S = y.Sub and (size-1) # y mod size
 #
 #   let
 #     M1 = Sub( ((((S + size-1) or S) and halfSize) div halfSize) - 1)
 #     M2 = Sub( (S div halfSize) - 1)
 #
 #   S = S and (halfSize-1) # y mod halfsize
 #
 #   result.hi = (x.lo shl S) and not M2
 #   result.lo = (x.lo shl S) and M2
 #   result.hi = result.hi or ((
 #     x.hi shl S or (x.lo shr (size - S) and M1)
 #   ) and M2)
 # proc `shr`*[T: MpUint](x: T, y: SomeInteger): T {.noInit, noSideEffect.}=
 #   ## Compute the `shift right` operation of x and y
 #   type Sub = getSubType T
 #
 #   let # cannot be const, compile-time sizeof only works for simple types
 #     size = Sub(T.sizeof * 8)
 #     halfSize = size div 2
 #
 #   var S = y.Sub and (size-1) # y mod size
 #
 #   let
 #     M1 = Sub( ((((S + size-1) or S) and halfSize) div halfSize) - 1)
 #     M2 = Sub( (S div halfSize) - 1)
 #
 #   S = S and (halfSize-1) # y mod halfsize
 #
 #   result.lo = (x.hi shr S) and not M2
 #   result.hi = (x.hi shr S) and M2
 #   result.lo = result.lo or ((
 #     x.lo shr S or (x.hi shl (size - S) and M1)
 #   ) and M2)