diff --git a/src/uint_binary_ops.nim b/src/uint_binary_ops.nim
index 011f634..eb8414b 100644
--- a/src/uint_binary_ops.nim
+++ b/src/uint_binary_ops.nim
@@ -80,7 +80,7 @@ template naiveMulImpl[T: MpUint](x, y: T): MpUint[T] =
 
   let  # cannot be const, compile-time sizeof only works for simple types
     size = T.sizeof * 8
-    halfSize = size div 2
+    halfSize = size shr 1
   let
     z0 = naiveMul(x.lo, y.lo)
     tmp = naiveMul(x.hi, y.lo)
diff --git a/src/uint_bitwise_ops.nim b/src/uint_bitwise_ops.nim
index 0b734ea..366d712 100644
--- a/src/uint_bitwise_ops.nim
+++ b/src/uint_bitwise_ops.nim
@@ -30,94 +30,26 @@ proc `shr`*[T: MpUint](x: T, y: SomeInteger): T {.noInit, noSideEffect.}
 
 proc `shl`*[T: MpUint](x: T, y: SomeInteger): T {.noInit, noSideEffect.}=
   ## Compute the `shift left` operation of x and y
-
-  if y == 0:
-    return x
-
   let
     size = T.sizeof * 8
-    halfSize = size div 2
-
+    halfSize = size shr 1
   type Sub = type x.lo
-
-  if y < halfSize:
-    result.hi = (x.hi shl y) or (x.lo shr (halfSize - y))
-    result.lo = x.lo shl y
-  else:
-    result.hi = x.lo shl (y - halfSize)
-    result.lo = 0.Sub
+  # Needs 0 <= y < size; y == 0 is split out so lo is never shifted by halfSize
+  result.hi = if y == 0: x.hi
+              elif y < halfSize: (x.hi shl y) or (x.lo shr (halfSize - y))
+              else: x.lo shl (y - halfSize)
+  result.lo = if y < halfSize: x.lo shl y else: 0.Sub
+
 
 proc `shr`*[T: MpUint](x: T, y: SomeInteger): T {.noInit, noSideEffect.}=
   ## Compute the `shift right` operation of x and y
-
-  if y == 0:
-    return x
-
   let
     size = T.sizeof * 8
-    halfSize = size div 2
-
+    halfSize = size shr 1
   type Sub = type x.lo
-
-  if y < halfSize:
-    result.lo = (x.lo shr y) or (x.hi shl (halfSize - y))
-    result.hi = x.hi shr y
-  else:
-    result.hi = x.hi shr (y - halfSize)
-    result.lo = 0.Sub
+  # Needs 0 <= y < size; y == 0 is split out so hi is never shifted by halfSize
+  result.lo = if y == 0: x.lo
+              elif y < halfSize: (x.lo shr y) or (x.hi shl (halfSize - y))
+              else: x.hi shr (y - halfSize)
+  result.hi = if y < halfSize: x.hi shr y else: 0.Sub
 
-
-
-# ########################################################################
-# TODO Benchmarks (especially on ARM)
-# Alternative shift implementations without branching
-#
-# Quick testing on MpUint[uint32] on x86_64 with Clang shows that it is somewhat slower
-# Fast shifting is key to fast division and modulo operations
-#
-# Note: Using branchless shift will help preventing timing attacks / be more robust cryptography-wise
-# Note2: It's a mess to maintain/read/update
-
-# proc `shl`*[T: MpUint](x: T, y: SomeInteger): T {.noInit, noSideEffect.}=
-#   ## Compute the `shift left` operation of x and y
-#   type Sub = type x.lo
-#
-#   let # cannot be const, compile-time sizeof only works for simple types
-#     size = Sub(T.sizeof * 8)
-#     halfSize = size div 2
-#
-#   var S = y.Sub and (size-1) # y mod size
-#
-#   let
-#     M1 = Sub( ((((S + size-1) or S) and halfSize) div halfSize) - 1)
-#     M2 = Sub( (S div halfSize) - 1)
-#
-#   S = S and (halfSize-1) # y mod halfsize
-#
-#   result.hi = (x.lo shl S) and not M2
-#   result.lo = (x.lo shl S) and M2
-#   result.hi = result.hi or ((
-#     x.hi shl S or (x.lo shr (halfSize - S) and M1)
-#   ) and M2)
-
-# proc `shr`*[T: MpUint](x: T, y: SomeInteger): T {.noInit, noSideEffect.}=
-#   ## Compute the `shift right` operation of x and y
-#   type Sub = type x.lo
-#
-#   let # cannot be const, compile-time sizeof only works for simple types
-#     size = Sub(T.sizeof * 8)
-#     halfSize = size div 2
-#
-#   var S = y.Sub and (size-1) # y mod size
-#
-#   let
-#     M1 = Sub( ((((S + size-1) or S) and halfSize) div halfSize) - 1)
-#     M2 = Sub( (S div halfSize) - 1)
-#
-#   S = S and (halfSize-1) # y mod halfsize
-#
-#   result.lo = (x.hi shr S) and not M2
-#   result.hi = (x.hi shr S) and M2
-#   result.lo = result.lo or ((
-#     x.lo shr S or (x.hi shl (halfSize - S) and M1)
-#   ) and M2)