Add alternative branchless shift implementation (TODO benchmark on ARM)

2025-02-19 18:38:13 +00:00 · 2018-02-16 16:47:52 +01:00 · 2018-02-16 16:47:52 +01:00 · 5886d76ebc
commit 5886d76ebc
parent 6e27069298
3 changed files with 74 additions and 7 deletions
--- a/src/private/utils.nim
+++ b/src/private/utils.nim
@ -57,4 +57,18 @@ proc asDoubleUint*[T: BaseUint](n: T): auto {.noSideEffect, inline.} =
 macro getSubType*(T: typedesc): untyped =
  ## Returns the subtype of a generic type
  ## MpUint[uint32] --> uint32
-  getTypeInst(T)[1][1]
+  getTypeInst(T)[1][1]
+
+
+proc toMpUint*[T: SomeInteger](n: T): auto {.noSideEffect, inline.} =
+  ## Cast an integer to the corresponding size MpUint:
+  ##   uint64 --> MpUint[uint32]
+  ##   uint32 --> MpUint[uint16]
+  ##   uint16 --> MpUint[uint8]
+  ## Any other integer type ends up in the ValueError branch below.
+  # Sometimes direct casting doesn't work and we must cast through a pointer:
+  # take the address of `n`, reinterpret it as a pointer to the matching
+  # MpUint type and dereference it.
+
+  when T is uint64:
+    return (cast[ptr [MpUint[uint32]]](unsafeAddr n))[]
+  elif T is uint32:
+    return (cast[ptr [MpUint[uint16]]](unsafeAddr n))[]
+  elif T is uint16:
+    # Same pointer cast as the other branches (was misspelled `unsfddr`,
+    # an undeclared identifier that broke the uint16 instantiation).
+    return (cast[ptr [MpUint[uint8]]](unsafeAddr n))[]
+  else:
+    raise newException(ValueError, "You can only cast uint16, uint32 or uint64 to multiprecision integers")
--- a/src/uint_binary_ops.nim
+++ b/src/uint_binary_ops.nim
@ -77,7 +77,7 @@ template naiveMulImpl[T: MpUint](x, y: T): MpUint[T] =
  #     and introduce branching
  #   - More total operations means more register moves

-  let # TODO: should be a const - https://github.com/nim-lang/Nim/pull/5664
+  let  # cannot be const, compile-time sizeof only works for simple types
    size = (T.sizeof * 8)
    halfSize = size div 2
  let
@ -100,7 +100,7 @@ proc naiveMul[T: BaseUint](x, y: T): MpUint[T] {.noSideEffect, noInit, inline.}=

  elif T.sizeof == 8: # uint64 or MpUint[uint32]
    # We cannot double uint64 to uint128
-    naiveMulImpl(cast[MpUint[uint32]](a), cast[MpUint[uint32]](b))
+    naiveMulImpl(x.toMpUint, y.toMpUint)
  else:
    # Case: at least uint128 * uint128 --> uint256
-    naiveMulImpl(a, b)
+    naiveMulImpl(x, y)
--- a/src/uint_bitwise_ops.nim
+++ b/src/uint_bitwise_ops.nim
@ -34,7 +34,7 @@ proc `shl`*[T: MpUint](x: T, y: SomeInteger): T {.noInit, noSideEffect.}=
  if y == 0:
    return x

-  let # TODO: should be a const - https://github.com/nim-lang/Nim/pull/5664
+  let # cannot be const, compile-time sizeof only works for simple types
    size = (T.sizeof * 8)
    halfSize = size div 2

@ -53,7 +53,7 @@ proc `shr`*[T: MpUint](x: T, y: SomeInteger): T {.noInit, noSideEffect.}=
  if y == 0:
    return x

-  let # TODO: should be a const - https://github.com/nim-lang/Nim/pull/5664
+  let # cannot be const, compile-time sizeof only works for simple types
    size = (T.sizeof * 8)
    halfSize = size div 2

@ -64,4 +64,57 @@ proc `shr`*[T: MpUint](x: T, y: SomeInteger): T {.noInit, noSideEffect.}=
    result.hi = x.hi shr y
  else:
    result.hi = x.hi shr (y - halfSize)
-    result.lo = 0.Sub
+    result.lo = 0.Sub
+
+
+
+# ########################################################################
+# TODO Benchmarks (especially on ARM)
+# Alternative shift implementations without branching
+#
+# Quick testing on MpUint[uint32] on x86_64 with Clang shows that it is somewhat slower
+# Fast shifting is key to fast division and modulo operations
+
+# proc `shl`*[T: MpUint](x: T, y: SomeInteger): T {.noInit, noSideEffect.}=
+#   ## Compute the `shift left` operation of x and y
+#   type Sub = getSubType T
+#
+#   let # cannot be const, compile-time sizeof only works for simple types
+#     size = Sub(T.sizeof * 8)
+#     halfSize = size div 2
+#
+#   var S = y.Sub and (size-1) # y mod size
+#
+#   let
+#     M1 = Sub( ((((S + size-1) or S) and halfSize) div halfSize) - 1)
+#     M2 = Sub( (S div halfSize) - 1)
+#
+#   S = S and (halfSize-1) # y mod halfsize
+#
+#   result.hi = (x.lo shl S) and not M2
+#   result.lo = (x.lo shl S) and M2
+#   result.hi = result.hi or ((
+#     x.hi shl S or (x.lo shr (size - S) and M1)
+#   ) and M2)
+
+# proc `shr`*[T: MpUint](x: T, y: SomeInteger): T {.noInit, noSideEffect.}=
+#   ## Compute the `shift right` operation of x and y
+#   type Sub = getSubType T
+#
+#   let # cannot be const, compile-time sizeof only works for simple types
+#     size = Sub(T.sizeof * 8)
+#     halfSize = size div 2
+#
+#   var S = y.Sub and (size-1) # y mod size
+#
+#   let
+#     M1 = Sub( ((((S + size-1) or S) and halfSize) div halfSize) - 1)
+#     M2 = Sub( (S div halfSize) - 1)
+#
+#   S = S and (halfSize-1) # y mod halfsize
+#
+#   result.lo = (x.hi shr S) and not M2
+#   result.hi = (x.hi shr S) and M2
+#   result.lo = result.lo or ((
+#     x.lo shr S or (x.lo shl (size - S) and M1)
+#   ) and M2)