From 777a84e9f5825ed57d4496a5caebf08181d081cc Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mamy=20Andr=C3=A9-Ratsimbazafy?= <mamy_github@numforge.co>
Date: Sat, 13 Jun 2020 16:44:13 +0200
Subject: [PATCH] Implement toHex/fromHex and fix `shl`

---
 stint/endians2.nim           | 136 ++++++++++++++++++++++++++++++-----
 stint/io.nim                 |  62 ++++++----------
 stint/private/uint_mul.nim   |   3 +-
 stint/private/uint_shift.nim |  93 ++++++++++++++++++++++++
 stint/uintops.nim            |  92 ++++++++++++------------
 5 files changed, 285 insertions(+), 101 deletions(-)
 create mode 100644 stint/private/uint_shift.nim

diff --git a/stint/endians2.nim b/stint/endians2.nim
index 6232a94..20d78bc 100644
--- a/stint/endians2.nim
+++ b/stint/endians2.nim
@@ -9,27 +9,131 @@
 
 import private/datatypes
 
-import stew/endians2
-export endians2
-
 {.push raises: [IndexError], noInit, gcsafe.}
 
-func toBytes*[bits: static int](x: StUint[bits], endian: Endianness = system.cpuEndian):
-    array[bits div 8, byte] {.inline.} =
-  when endian == system.cpuEndian:
-    for i in 0 ..< x.limbs.len:
-      result[i * sizeof(Word)] = x.limbs[i].toBytes()
+# Serialization
+# ------------------------------------------------------------------------------------------
+
+template toByte(x: SomeUnsignedInt): byte =
+  ## Convert `x` to a `byte`.
+  ## At compile-time (nimvm) the conversion to byte checks the range, so the value is masked with 0xFF first;
+  ## at runtime a plain conversion is used, intended to be a single "mov byte" at the register level.
+  when nimvm:
+    byte(x and 0xFF)
   else:
-    for i in 0 ..< x.limbs.len:
-      result[i * sizeof(Word)] = x.limbs[^i].toBytes()
+    byte(x)
 
-func toBytesLE*[bits: static int](x: StUint[bits]):
-    array[bits div 8, byte] {.inline.} =
-  toBytes(x, littleEndian)
+template blobFrom(dst: var openArray[byte], src: SomeUnsignedInt, startIdx: int, endian: static Endianness) =
+  ## Write the sizeof(src) bytes of `src` into `dst`, starting at `startIdx`, in the requested byte order.
+  ## `src shr (i*8)` is byte i by significance on any CPU, so only `endian` (not cpuEndian) picks the layout.
+  when endian == littleEndian:
+    for i in 0 ..< sizeof(src):
+      dst[startIdx+i] = toByte((src shr (i * 8)))
+  else:
+    for i in 0 ..< sizeof(src):
+      dst[startIdx+sizeof(src)-1-i] = toByte((src shr (i * 8)))
 
-func toBytesBE*[bits: static int](x: StUint[bits]):
-    array[bits div 8, byte] {.inline.} =
-  toBytes(x, bigEndian)
+func toBytesLE*[bits: static int](src: StUint[bits]): array[bits div 8, byte] =
+  ## Serialize `src` into `bits div 8` bytes, least significant byte first.
+  ##
+  ## Limbs are read starting from index 0 upwards on little-endian CPUs and from
+  ## the last index downwards on big-endian CPUs; a limb is written out on the
+  ## iteration after it was loaded, and reads past the limbs yield 0.
+  var
+    src_idx, dst_idx = 0
+    acc: Word = 0
+    acc_len = 0
+
+  when cpuEndian == bigEndian:
+    srcIdx = src.limbs.len - 1
+
+  var tail = result.len
+  while tail > 0:
+    when cpuEndian == littleEndian:
+      let w = if src_idx < src.limbs.len: src.limbs[src_idx]
+              else: 0
+      inc src_idx
+    else:
+      let w = if src_idx >= 0: src.limbs[src_idx]
+              else: 0
+      dec src_idx
+
+    if acc_len == 0:
+      # First iteration only: prime the accumulator, nothing is written yet
+      acc = w
+      acc_len = WordBitWidth
+    else:
+      let lo = acc
+      acc = w
+
+      if tail >= sizeof(Word):
+        # A whole word still fits: copy all of its bytes
+        result.blobFrom(src = lo, dst_idx, littleEndian)
+        dst_idx += sizeof(Word)
+        tail -= sizeof(Word)
+      else:
+        # Fewer than sizeof(Word) bytes remain: write the low `tail` bytes and exit.
+        # `lo shr (i*8)` picks byte i by significance, so one loop serves both
+        # CPU endiannesses (the former big-endian branch shifted by (tail-i)*8,
+        # which skipped the lowest byte).
+        for i in 0 ..< tail:
+          result[dst_idx+i] = toByte(lo shr (i*8))
+        return
+
+func toBytesBE*[bits: static int](src: StUint[bits]): array[bits div 8, byte] {.inline.} =
+  ## Serialize `src` into `bits div 8` bytes, most significant byte first.
+  ##
+  ## Limbs are read starting from index 0 upwards on little-endian CPUs and from
+  ## the last index downwards on big-endian CPUs, while the output is filled
+  ## from its end towards index 0; reads past the limbs yield 0.
+  var
+    src_idx = 0
+    acc: Word = 0
+    acc_len = 0
+
+  when cpuEndian == bigEndian:
+    srcIdx = src.limbs.len - 1
+
+  var tail = result.len
+  while tail > 0:
+    when cpuEndian == littleEndian:
+      let w = if src_idx < src.limbs.len: src.limbs[src_idx]
+              else: 0
+      inc src_idx
+    else:
+      let w = if src_idx >= 0: src.limbs[src_idx]
+              else: 0
+      dec src_idx
+
+    if acc_len == 0:
+      # First iteration only: prime the accumulator, nothing is written yet
+      acc = w
+      acc_len = WordBitWidth
+    else:
+      let lo = acc
+      acc = w
+
+      if tail >= sizeof(Word):
+        # A whole word still fits: it occupies the last unwritten sizeof(Word) bytes
+        tail -= sizeof(Word)
+        result.blobFrom(src = lo, tail, bigEndian)
+      else:
+        # Fewer than sizeof(Word) bytes remain: they fill result[0 ..< tail].
+        # `lo shr (i*8)` picks byte i by significance, so one loop serves both
+        # CPU endiannesses (the former big-endian branch shifted by (tail-i)*8,
+        # which skipped the lowest byte).
+        for i in 0 ..< tail:
+          result[tail-1-i] = toByte(lo shr (i*8))
+        return
+
+func toBytes*[bits: static int](x: StUint[bits], endian: Endianness = system.cpuEndian): array[bits div 8, byte] {.inline.} =
+  ## Serialize `x` in the byte order chosen at runtime by `endian` (defaults to the CPU's own order).
+  case endian
+  of littleEndian: result = toBytesLE(x)
+  of bigEndian: result = toBytesBE(x)
+
+# Deserialization
+# ------------------------------------------------------------------------------------------
 
 func fromBytesBE*[bits: static int](
     T: typedesc[StUint[bits]],
diff --git a/stint/io.nim b/stint/io.nim
index 26bca80..8483a15 100644
--- a/stint/io.nim
+++ b/stint/io.nim
@@ -8,12 +8,18 @@
 # at your option. This file may not be copied, modified, or distributed except according to those terms.
 
 import
+  # Standard library
+  typetraits, algorithm, hashes,
+  # Status libraries
+  # stew/byteutils,
+  # Internal
   ./private/datatypes,
   # ./private/int_negabs,
   # ./private/compiletime_helpers,
   # ./intops,
-  ./uintops, ./endians2,
-  typetraits, algorithm, hashes
+  ./uintops, ./endians2
+
+from stew/byteutils import toHex # Why are we exporting readHexChar in byteutils?
 
 template static_check_size(T: typedesc[SomeInteger], bits: static[int]) =
   # To avoid a costly runtime check, we refuse storing into StUint types smaller
@@ -356,44 +362,20 @@ func hexToUint*[bits: static[int]](hexString: string): StUint[bits] {.inline.} =
 #   ## Leading zeros are stripped. Use dumpHex instead if you need the in-memory representation
 #   toString(num, 16)
 
-# func dumpHex*(x: Stint or StUint, order: static[Endianness] = bigEndian): string =
-#   ## Stringify an int to hex.
-#   ## Note. Leading zeros are not removed. Use toString(n, base = 16)/toHex instead.
-#   ##
-#   ## You can specify bigEndian or littleEndian order.
-#   ## i.e. in bigEndian:
-#   ## - 1.uint64 will be 00000001
-#   ## - (2.uint128)^64 + 1 will be 0000000100000001
-#   ##
-#   ## in littleEndian:
-#   ## - 1.uint64 will be 01000000
-#   ## - (2.uint128)^64 + 1 will be 0100000001000000
-
-#   const
-#     hexChars = "0123456789abcdef"
-#     size = bitsof(x.data) div 8
-
-#   result = newString(2*size)
-
-#   when nimvm:
-#     for i in 0 ..< size:
-#       when order == system.cpuEndian:
-#         let byte = x.data.getByte(i)
-#       else:
-#         let byte = x.data.getByte(size - 1 - i)
-#       result[2*i] = hexChars[int byte shr 4 and 0xF]
-#       result[2*i+1] = hexChars[int byte and 0xF]
-#   else:
-#     {.pragma: restrict, codegenDecl: "$# __restrict $#".}
-#     let bytes {.restrict.}= cast[ptr array[size, byte]](x.unsafeaddr)
-
-#     for i in 0 ..< size:
-#       when order == system.cpuEndian:
-#         result[2*i] = hexChars[int bytes[i] shr 4 and 0xF]
-#         result[2*i+1] = hexChars[int bytes[i] and 0xF]
-#       else:
-#         result[2*i] = hexChars[int bytes[bytes[].high - i] shr 4 and 0xF]
-#         result[2*i+1] = hexChars[int bytes[bytes[].high - i] and 0xF]
+func dumpHex*(a: Stint or StUint, order: static[Endianness] = bigEndian): string =
+  ## Hex dump of the byte serialization of `a`.
+  ## Note. Leading zeros are not removed. Use toString(n, base = 16)/toHex instead.
+  ##
+  ## `order` selects the byte order of the dump: bigEndian (default) or littleEndian.
+  ## i.e. in bigEndian:
+  ## - 1.uint64 will be 00000001
+  ## - (2.uint128)^64 + 1 will be 0000000100000001
+  ##
+  ## in littleEndian:
+  ## - 1.uint64 will be 01000000
+  ## - (2.uint128)^64 + 1 will be 0100000001000000
+  let serialized = toBytes(a, order)
+  toHex(serialized)
 
 proc initFromBytesBE*[bits: static[int]](val: var Stuint[bits], 
                       ba: openarray[byte], 
diff --git a/stint/private/uint_mul.nim b/stint/private/uint_mul.nim
index 2b574f8..1155344 100644
--- a/stint/private/uint_mul.nim
+++ b/stint/private/uint_mul.nim
@@ -11,7 +11,8 @@ import
   ./datatypes,
   ./primitives/extended_precision
 
-# ################### Multiplication ################### #
+# Multiplication
+# --------------------------------------------------------
 {.push raises: [], gcsafe.}
 
 func prod*[rLen, aLen, bLen: static int](r: var Limbs[rLen], a: Limbs[aLen], b: Limbs[bLen]) =
diff --git a/stint/private/uint_shift.nim b/stint/private/uint_shift.nim
new file mode 100644
index 0000000..12eb944
--- /dev/null
+++ b/stint/private/uint_shift.nim
@@ -0,0 +1,93 @@
+# Stint
+# Copyright 2018-Present Status Research & Development GmbH
+# Licensed under either of
+#
+#  * Apache License, version 2.0, ([LICENSE-APACHE](LICENSE-APACHE) or http://www.apache.org/licenses/LICENSE-2.0)
+#  * MIT license ([LICENSE-MIT](LICENSE-MIT) or http://opensource.org/licenses/MIT)
+#
+# at your option. This file may not be copied, modified, or distributed except according to those terms.
+
+import
+  ./datatypes
+
+# Shifts
+# --------------------------------------------------------
+{.push raises: [], gcsafe.}
+
+func shrSmall*(r: var Limbs, a: Limbs, k: SomeInteger) =
+  ## Shift `a` right by `k` bits into `r`, carrying bits between adjacent limbs.
+  ##
+  ## k MUST be in 1 ..< WordBitWidth: with k == 0 the carry term would be
+  ## shifted by a full word, which is not guaranteed to produce 0.
+  # Note: each r[i] is built from a[i] and its more significant neighbour
+  #       only, never from a limb of r written in a previous iteration.
+  let carryShift = WordBitWidth - k
+  when cpuEndian == littleEndian:
+    for i in 0 ..< a.len-1:
+      r[i] = (a[i+1] shl carryShift) or (a[i] shr k)
+    r[^1] = a[^1] shr k
+  else:
+    for i in countdown(a.len-1, 1):
+      r[i] = (a[i-1] shl carryShift) or (a[i] shr k)
+    r[0] = a[0] shr k
+
+func shrLarge*(r: var Limbs, a: Limbs, w, shift: SomeInteger) =
+  ## Shift right by `w` words + `shift` bits, with 0 < shift < WordBitWidth
+  ## Assumes `r` is 0 initialized
+  if w >= a.len: return # every limb is shifted out, r keeps its zeros
+  # Destination limb = source limb moved `w` positions towards the least significant end;
+  # the limb that has no more-significant neighbour receives no carry.
+  when cpuEndian == littleEndian:
+    for i in w ..< a.len-1:
+      r[i-w] = (a[i] shr shift) or (a[i+1] shl (WordBitWidth - shift))
+    r[^(w+1)] = a[^1] shr shift
+  else:
+    for i in countdown(a.len-1, 1+w):
+      r[i] = (a[i-w] shr shift) or (a[i-w-1] shl (WordBitWidth - shift))
+    r[w] = a[0] shr shift
+
+func shrWords*(r: var Limbs, a: Limbs, w: SomeInteger) =
+  ## Shift right by w whole words; limbs of `r` that receive nothing are not written
+  when cpuEndian == littleEndian:
+    for i in 0 ..< a.len-w:
+      r[i] = a[i+w]
+  else:
+    for i in countdown(a.len-1, w): # index 0 is the most significant limb on big-endian
+      r[i] = a[i-w]
+
+func shlSmall*(r: var Limbs, a: Limbs, k: SomeInteger) =
+  ## Shift `a` left by `k` bits into `r`, carrying bits between adjacent limbs.
+  ## k MUST be in 1 ..< WordBitWidth (for k == 0 the carry term is shifted by a full word, not guaranteed to be 0)
+  let carryShift = WordBitWidth - k
+  when cpuEndian == littleEndian:
+    r[0] = a[0] shl k
+    for i in 1 ..< a.len:
+      r[i] = (a[i-1] shr carryShift) or (a[i] shl k)
+  else:
+    r[^1] = a[^1] shl k
+    for i in countdown(a.len-2, 0):
+      r[i] = (a[i+1] shr carryShift) or (a[i] shl k)
+
+func shlLarge*(r: var Limbs, a: Limbs, w, shift: SomeInteger) =
+  ## Shift left by `w` words + `shift` bits, with 0 < shift < WordBitWidth
+  ## Assumes `r` is 0 initialized
+  if w >= a.len: return # every limb is shifted out, r keeps its zeros
+  # Destination limb = source limb moved `w` positions towards the most significant end;
+  # the least significant source limb has no lower neighbour, so it receives no carry.
+  when cpuEndian == littleEndian:
+    r[w] = a[0] shl shift
+    for i in 1+w ..< r.len:
+      r[i] = (a[i-w] shl shift) or (a[i-w-1] shr (WordBitWidth - shift))
+  else:
+    r[^(w+1)] = a[^1] shl shift
+    for i in countdown(a.len-2-w, 0):
+      r[i] = (a[i+w] shl shift) or (a[i+w+1] shr (WordBitWidth - shift))
+
+func shlWords*(r: var Limbs, a: Limbs, w: SomeInteger) =
+  ## Shift left by w whole words; limbs of `r` that receive nothing are not written
+  when cpuEndian == littleEndian:
+    for i in 0 ..< a.len-w:
+      r[i+w] = a[i]
+  else:
+    for i in countdown(a.len-1-w, 0): # the last index is the least significant limb on big-endian
+      r[i] = a[i+w]
diff --git a/stint/uintops.nim b/stint/uintops.nim
index 94ee52d..a227613 100644
--- a/stint/uintops.nim
+++ b/stint/uintops.nim
@@ -12,6 +12,7 @@ import
   stew/bitops2,
   # Internal
   ./private/datatypes,
+  ./private/uint_shift,
   ./private/primitives/addcarry_subborrow
 
 export StUint
@@ -127,69 +128,72 @@ func `xor`*(a, b: Stuint): Stuint =
     wr = wa xor wb
   result.clearExtraBits()
 
-func `shr`*(a: Stuint, k: SomeInteger): Stuint =
-  ## Shift right by k.
-  ##
-  ## k MUST be less than the base word size (2^32 or 2^64)
-  # Note: for speed, loading a[i] and a[i+1]
-  #       instead of a[i-1] and a[i]
-  #       is probably easier to parallelize for the compiler
-  #       (antidependence WAR vs loop-carried dependence RAW)
-  when cpuEndian == littleEndian:
-    for i in 0 ..< a.limbs.len-1:
-      result.limbs[i] = (a.limbs[i] shr k) or (a.limbs[i+1] shl (WordBitWidth - k))
-    result.limbs[^1] = a.limbs[^1] shr k
-  else:
-    for i in countdown(a.limbs.len-1, 1):
-      result.limbs[i] = (a.limbs[i] shr k) or (a.limbs[i-1] shl (WordBitWidth - k))
-    result.limbs[0] = a.limbs[0] shr k
-
-func `shl`*(a: Stuint, k: SomeInteger): Stuint =
-  ## Compute the `shift left` operation of x and k
-  when cpuEndian == littleEndian:
-    result.limbs[0] = a.limbs[0] shl k
-    for i in 1 ..< a.limbs.len:
-      result.limbs[i] = (a.limbs[i] shl k) or (a.limbs[i-1] shr (WordBitWidth - k))
-  else:
-    result.limbs[^1] = a.limbs[^1] shl k
-    for i in countdown(a.limbs.len-2, 0):
-      result.limbs[i] = (a.limbs[i] shl k) or (a.limbs[i+1] shr (WordBitWidth - k))
-  result.clearExtraBits()
-
-func countOnes*(x: Stuint): int {.inline.} =
+func countOnes*(a: Stuint): int {.inline.} = # sum of the per-limb countOnes over all limbs of `a`
   result = 0
-  for wx in leastToMostSig(x):
-    result += countOnes(wx)
+  for wa in leastToMostSig(a):
+    result += countOnes(wa) # word-level countOnes, accumulated limb by limb
 
-func parity*(x: Stuint): int {.inline.} =
-  result = parity(x.limbs[0])
-  for i in 1 ..< x.limbs.len:
-    result = result xor parity(x.limbs[i])
+func parity*(a: Stuint): int {.inline.} = # xor of the word-level parity of every limb
+  result = parity(a.limbs[0])
+  for i in 1 ..< a.limbs.len:
+    result = result xor parity(a.limbs[i]) # fold the remaining limbs into the running parity
 
-func leadingZeros*(x: Stuint): int {.inline.} =
+func leadingZeros*(a: Stuint): int {.inline.} = # accumulates per-limb leadingZeros from the most significant limb down
   result = 0
-  for word in mostToLeastSig(x):
+  for word in mostToLeastSig(a): # returns at the first limb whose count is not a full WordBitWidth
     let zeroCount = word.leadingZeros()
     result += zeroCount
     if zeroCount != WordBitWidth:
       return
 
-func trailingZeros*(x: Stuint): int {.inline.} =
+func trailingZeros*(a: Stuint): int {.inline.} = # number of zero bits below the lowest set bit of `a`
   result = 0
-  for word in leastToMostSig(x):
+  for word in leastToMostSig(a): # returns at the first limb whose count is not a full WordBitWidth
-    let zeroCount = word.leadingZeros()
+    let zeroCount = word.trailingZeros() # count from the low end of the limb; leadingZeros gave 63 for the value 1
     result += zeroCount
     if zeroCount != WordBitWidth:
       return
 
-func firstOne*(x: Stuint): int {.inline.} =
-  result = trailingZeros(x)
-  if result == x.limbs.len * WordBitWidth:
+func firstOne*(a: Stuint): int {.inline.} = # trailingZeros(a) + 1, or 0 when trailingZeros spans every limb
+  result = trailingZeros(a)
+  if result == a.limbs.len * WordBitWidth: # every limb reported a full word of zeros
     result = 0
   else:
     result += 1
 
-{.pop.}
+func `shr`*(a: Stuint, k: SomeInteger): Stuint {.inline.} =
+  ## Shift right by k bits; k == 0 returns `a` unchanged
+  if k == 0: return a # shrSmall's carry term would otherwise be shifted by a full word
+  if k < WordBitWidth:
+    result.limbs.shrSmall(a.limbs, k)
+    return
+  # w = k div WordBitWidth, shift = k mod WordBitWidth
+  let w     = k shr static(log2trunc(uint32(WordBitWidth)))
+  let shift = k and (WordBitWidth - 1)
+  if shift == 0:
+    result.limbs.shrWords(a.limbs, w)
+  else:
+    result.limbs.shrLarge(a.limbs, w, shift)
+
+func `shl`*(a: Stuint, k: SomeInteger): Stuint {.inline.} =
+  ## Shift left by k bits; k == 0 returns `a` unchanged
+  if k == 0: return a # shlSmall's carry term would otherwise be shifted by a full word
+  if k < WordBitWidth:
+    result.limbs.shlSmall(a.limbs, k)
+    result.clearExtraBits()
+    return
+  # w = k div WordBitWidth, shift = k mod WordBitWidth
+  let w     = k shr static(log2trunc(uint32(WordBitWidth)))
+  let shift = k and (WordBitWidth - 1)
+  if shift == 0:
+    result.limbs.shlWords(a.limbs, w)
+  else:
+    result.limbs.shlLarge(a.limbs, w, shift)
+
+  result.clearExtraBits()
+
+{.pop.} # End inline
+
 # Addsub
 # --------------------------------------------------------
 {.push raises: [], inline, noInit, gcsafe.}