From 4660dfe4a4bafe75751790dd46138efab3d311d3 Mon Sep 17 00:00:00 2001
From: Mamy Ratsimbazafy <mamy_github@numforge.co>
Date: Sun, 23 Jan 2022 22:45:47 +0100
Subject: [PATCH] Use littleEndian for limb-endianness: bigEndian archs are
 very rare, untestable in CI and a pain to maintain, and an intermediate
 serialization step instead of casting is cheap

---
 README.md                      |   3 +-
 stint/endians2.nim             |  25 ------
 stint/io.nim                   |  95 ++-------------------
 stint/private/datatypes.nim    | 145 ++++++++++-----------------------
 stint/private/uint_addsub.nim  |  33 +++-----
 stint/private/uint_bitwise.nim |  32 ++++----
 stint/private/uint_div.nim     |  41 +---------
 stint/private/uint_shift.nim   |  76 +++++------------
 stint/uintops.nim              |  51 +++++++-----
 9 files changed, 132 insertions(+), 369 deletions(-)

diff --git a/README.md b/README.md
index 0a5906c..76b4191 100644
--- a/README.md
+++ b/README.md
@@ -20,8 +20,7 @@ Main focus:
     - Uint2048 for Ethereum Bloom filters
   - Ease of use:
     - Use traditional `+`, `-`, `+=`, etc operators like on native types
-    - Representation of numbers in memory is the exact same as native types and endianness aware.
-      - In practice that means that interfacing with binary blobs representing numbers from cryptographic    libraries can be done with a `cast` if it represents a Uint256, Uint512, Uint1024, Uint2048.
+    - converting to and from raw-byte BigInts (also called octet strings in IETF specs)
     - converting to and from Hex
     - converting to and from decimal strings
 
diff --git a/stint/endians2.nim b/stint/endians2.nim
index 20d78bc..ac8796c 100644
--- a/stint/endians2.nim
+++ b/stint/endians2.nim
@@ -245,28 +245,3 @@ func fromBytes*[bits: static int](
     result = fromBytesLE(T, x)
   else:
     result = fromBytesBE(T, x)
-
-# TODO: What is the use-case for all the procs below?
-# ------------------------------------------------------------------------------------------
-
-func toBE*[bits: static int](x: StUint[bits]): StUint[bits] {.inline, deprecated: "Use toByteArrayBE instead".} =
-  ## Convert a native endian value to big endian. Consider toBytesBE instead
-  ## which may prevent some confusion.
-  if cpuEndian == bigEndian: x
-  else: x.swapBytes
-
-func fromBE*[bits: static int](x: StUint[bits]): StUint[bits] {.inline, deprecated: "Use fromBytesBE instead".} =
-  ## Read a big endian value and return the corresponding native endian
-  # there's no difference between this and toBE, except when reading the code
-  toBE(x)
-
-func toLE*[bits: static int](x: StUint[bits]): StUint[bits] {.inline, deprecated.} =
-  ## Convert a native endian value to little endian. Consider toBytesLE instead
-  ## which may prevent some confusion.
-  if cpuEndian == littleEndian: x
-  else: x.swapBytes
-
-func fromLE*[bits: static int](x: StUint[bits]): StUint[bits] {.inline, deprecated: "Use fromBytesLE instead".} =
-  ## Read a little endian value and return the corresponding native endian
-  # there's no difference between this and toLE, except when reading the code
-  toLE(x)
diff --git a/stint/io.nim b/stint/io.nim
index 8483a15..c80fc4f 100644
--- a/stint/io.nim
+++ b/stint/io.nim
@@ -33,22 +33,10 @@ template static_check_size(T: typedesc[SomeInteger], bits: static[int]) =
 
 func stuint*[T: SomeInteger](n: T, bits: static[int]): StUint[bits] {.inline.}=
   ## Converts an integer to an arbitrary precision integer.
-  when cpuEndian == littleEndian:
-    result.limbs[0] = Word(n)
-    when sizeof(n) > sizeof(Word):
-      result.limbs[1] = Word(n) shr WordBitWidth
-  else:
-    result.limbs[^1] = Word(n)
-    when sizeof(n) > sizeof(Word):
-      result.limbs[^2] = Word(n) shr WordBitWidth
+  result.limbs[0] = Word(n)
+  when sizeof(n) > sizeof(Word):
+    result.limbs[1] = Word(n shr WordBitWidth) # shift first: narrowing to Word before shifting drops the high bits
 
-<<<<<<< HEAD
-func to*(x: SomeInteger, T: typedesc[StInt]): T =
-  stint(x, result.bits)
-
-func to*(x: SomeUnsignedInt, T: typedesc[StUint]): T =
-  stuint(x, result.bits)
-=======
 # func stint*[T: SomeInteger](n: T, bits: static[int]): StInt[bits] {.inline.}=
 #   ## Converts an integer to an arbitrary precision signed integer.
 #
@@ -88,8 +76,8 @@ func stuint*(a: StUint, bits: static[int]): StUint[bits] {.inline.} =
   ## unsigned int to unsigned int conversion
   ## smaller to bigger bits conversion will have the same value
   ## bigger to smaller bits conversion, the result is truncated
-  for wr, wa in leastToMostSig(result, a):
-    wr = wa
+  for i in 0 ..< min(result.limbs.len, a.limbs.len): # stop at the shorter operand, as the removed iterator did
+    result[i] = a[i]
 
 # func stuint*(a: StInt, bits: static[int]): StUint[bits] {.inline.} =
 #   ## signed int to unsigned int conversion
@@ -377,82 +365,13 @@ func dumpHex*(a: Stint or StUint, order: static[Endianness] = bigEndian): string
   let bytes = a.toBytes(order)
   result = bytes.toHex()
 
-proc initFromBytesBE*[bits: static[int]](val: var Stuint[bits], 
-                      ba: openarray[byte], 
-                      allowPadding: static[bool] = true) {.deprecated:"Use fromBytesBE instead".}=
-  ## Initializes a UInt[bits] value from a byte buffer storing a big-endian
-  ## representation of a number.
-  ##
-  ## If `allowPadding` is set to false, the input array must be exactly
-  ## (bits div 8) bytes long. Otherwise, it may be shorter and the remaining
-  ## bytes will be assumed to be zero.
-
-  const N = bits div 8
-
-  when not allowPadding:
-    doAssert(ba.len == N)
-  else:
-    doAssert ba.len <= N
-    when system.cpuEndian == bigEndian:
-      let baseIdx = N - val.len
-    else:
-      let baseIdx = ba.len - 1
-
-  when nimvm:
-    when system.cpuEndian == bigEndian:
-      when allowPadding:
-        for i, b in ba: val.data.setByte(baseIdx + i, b)
-      else:
-        for i, b in ba: val.data.setByte(i, b)
-    else:
-      when allowPadding:
-        for i, b in ba: val.data.setByte(baseIdx - i, b)
-      else:
-        for i, b in ba: val.data.setByte(N-1 - i, b)
-  else:
-    {.pragma: restrict, codegenDecl: "$# __restrict $#".}
-    let r_ptr {.restrict.} = cast[ptr array[N, byte]](val.addr)
-
-    when system.cpuEndian == bigEndian:
-      # TODO: due to https://github.com/status-im/nim-stint/issues/38
-      # We can't cast a stack byte array to stuint with a convenient proc signature.
-      when allowPadding:
-        for i, b in ba: r_ptr[baseIdx + i] = b
-      else:
-        for i, b in ba: r_ptr[i] = b
-    else:
-      when allowPadding:
-        for i, b in ba: r_ptr[baseIdx - i] = b
-      else:
-        for i, b in ba: r_ptr[N-1 - i] = b
-
-func significantBytesBE*(val: openArray[byte]): int {.deprecated.}=
-  ## Returns the number of significant trailing bytes in a big endian
-  ## representation of a number.
-  # TODO: move that in https://github.com/status-im/nim-byteutils
-  for i in 0 ..< val.len:
-    if val[i] != 0:
-      return val.len - i
-  return 1
-
-func fromBytesBE*(T: type Stuint, ba: openarray[byte],
-                  allowPadding: static[bool] = true): T {.noInit, inline.} =
-  ## This function provides a convenience wrapper around `initFromBytesBE`.
-  when not allowPadding:
-    {.deprecated: "fromBytesBE without padding is deprecated".}
-    result.initFromBytesBE(ba, allowPadding)
-  else:
-    result = endians2.fromBytesBE(T, ba)
-
 func readUintBE*[bits: static[int]](ba: openarray[byte]): Stuint[bits] {.noInit, inline.}=
   ## Convert a big-endian array of (bits div 8) Bytes to an UInt[bits] (in native host endianness)
   ## Input:
   ##   - a big-endian openArray of size (bits div 8) at least
   ## Returns:
   ##   - A unsigned integer of the same size with `bits` bits
-  ##
-  ## ⚠ If the openarray length is bigger than bits div 8, part converted is undefined behaviour.
-  result = endians2.fromBytesBE(Stuint[bits], ba)
+  result = (typeof result).fromBytesBE(ba)
 
 func toByteArrayBE*[bits: static[int]](n: StUint[bits]): array[bits div 8, byte] {.noInit, inline.}=
   ## Convert a uint[bits] to to a big-endian array of bits div 8 bytes
@@ -460,7 +379,7 @@ func toByteArrayBE*[bits: static[int]](n: StUint[bits]): array[bits div 8, byte]
   ##   - an unsigned integer
   ## Returns:
   ##   - a big-endian array of the same size
-  result = n.toBytes(bigEndian)
+  result = n.toBytesBE()
 
 template hash*(num: StUint|StInt): Hash =
   # TODO:
diff --git a/stint/private/datatypes.nim b/stint/private/datatypes.nim
index 39947fa..1c43049 100644
--- a/stint/private/datatypes.nim
+++ b/stint/private/datatypes.nim
@@ -20,35 +20,22 @@ const WordBitWidth* = sizeof(Word) * 8
 
 func wordsRequired*(bits: int): int {.compileTime.} =
   ## Compute the number of limbs required
-  ## from the **announced** bit length
+  ## for the **announced** bit length
   (bits + WordBitWidth - 1) div WordBitWidth
 
 type
   Limbs*[N: static int] = array[N, Word]
     ## Limbs type
-    ## Large proc like multiplication and division
-    ## should operate at the limb-level
-    ## to avoid duplicate codepaths
-    ## For example for Stuint[16] and Stuint[32]
-    ## or if allowed in the future
-    ## Stuint[254] and Stuint[256]
 
   StUint*[bits: static[int]] = object
     ## Stack-based integer
     ## Unsigned
     limbs*: array[bits.wordsRequired, Word]
-      # TODO: using the limbs type here
-      #       can using StUint[8] of length 2, instead of 1
-      #       in test_uint_bitwise (in the VM)
-      #       unless you put the following instantiation
-      #       at the bottom of this file
-      # static:
-      #   echo StUint[8]()
+      # Limb endianness is little-endian: limbs[0] is the least significant word
 
-  StInt*[bits: static[int]] = object
+  StInt*[bits: static[int]] {.borrow: `.`.} = distinct StUint[bits]
     ## Stack-based integer
     ## Signed
-    limbs*: array[bits.wordsRequired, Word]
 
   Carry* = uint8  # distinct range[0'u8 .. 1]
   Borrow* = uint8 # distinct range[0'u8 .. 1]
@@ -62,25 +49,12 @@ when sizeof(int) == 8 and GCC_Compatible:
   type
     uint128*{.importc: "unsigned __int128".} = object
 
-# Accessors
+# Bithacks
 # --------------------------------------------------------
 
-template leastSignificantWord*(num: SomeInteger): auto =
-  num
+{.push raises: [], inline, noInit, gcsafe.}
 
-template leastSignificantWord*(a: SomeBigInteger): auto =
-  when cpuEndian == littleEndian:
-    a.limbs[0]
-  else:
-    a.limbs[^1]
-
-template mostSignificantWord*(a: SomeBigInteger): auto =
-  when cpuEndian == littleEndian:
-    a.limbs[^1]
-  else:
-    a.limbs[0]
-
-template clearExtraBits*(a: var StUint) =
+template clearExtraBitsOverMSB*(a: var StUint) =
   ## A Stuint is stored in an array of 32 of 64-bit word
   ## If we do bit manipulation at the word level,
   ## for example a 8-bit stuint stored in a 64-bit word
@@ -88,65 +62,35 @@ template clearExtraBits*(a: var StUint) =
   when a.bits != a.limbs.len * WordBitWidth:
     const posExtraBits = a.bits - (a.limbs.len-1) * WordBitWidth
     const mask = (Word(1) shl posExtraBits) - 1
-    mostSignificantWord(a) = mostSignificantWord(a) and mask
+    a[^1] = a[^1] and mask
+
+func usedBitsAndWords*(a: openArray[Word]): tuple[bits, words: int] =
+  ## Returns the number of used bits and words in a bigInt; (0, 0) when every word is zero
+  var clz = 0
+  # Count Leading Zeros
+  for i in countdown(a.len-1, 0):
+    let count = log2trunc(a[i])
+    # debugEcho "count: ", count, ", a[", i, "]: ", a[i].toBin(64)
+    if count == -1:
+      clz += WordBitWidth
+    else:
+      clz += WordBitWidth - count - 1
+      return (a.len*WordBitWidth - clz, i+1)
+  return (0, 0) # all-zero or empty input: `result` is not implicitly zeroed under the pushed `noInit`
+{.pop.}
+
+# Accessors
+# --------------------------------------------------------
+
+template `[]`*(a: SomeBigInteger, i: SomeInteger or BackwardsIndex): Word =
+  a.limbs[i]
+
+template `[]=`*(a: var SomeBigInteger, i: SomeInteger or BackwardsIndex, val: Word) =
+  a.limbs[i] = val
 
 # Iterations
 # --------------------------------------------------------
 
-iterator leastToMostSig*(a: SomeBigInteger): Word =
-  ## Iterate from least to most significant word
-  when cpuEndian == littleEndian:
-    for i in 0 ..< a.limbs.len:
-      yield a.limbs[i]
-  else:
-    for i in countdown(a.limbs.len-1, 0):
-      yield a.limbs[i]
-
-iterator leastToMostSig*(a: var SomeBigInteger): var Word =
-  ## Iterate from least to most significant word
-  when cpuEndian == littleEndian:
-    for i in 0 ..< a.limbs.len:
-      yield a.limbs[i]
-  else:
-    for i in countdown(a.limbs.len-1, 0):
-      yield a.limbs[i]
-
-iterator leastToMostSig*(a, b: SomeBigInteger): (Word, Word) =
-  ## Iterate from least to most significant word
-  when cpuEndian == littleEndian:
-    for i in 0 ..< a.limbs.len:
-      yield (a.limbs[i], b.limbs[i])
-  else:
-    for i in countdown(a.limbs.len-1, 0):
-      yield (a.limbs[i], b.limbs[i])
-
-iterator leastToMostSig*[aBits, bBits](a: var SomeBigInteger[aBits], b: SomeBigInteger[bBits]): (var Word, Word) =
-  ## Iterate from least to most significant word
-  when cpuEndian == littleEndian:
-    for i in 0 ..< min(a.limbs.len, b.limbs.len):
-      yield (a.limbs[i], b.limbs[i])
-  else:
-    for i in countdown(min(a.limbs.len, b.limbs.len)-1, 0):
-      yield (a.limbs[i], b.limbs[i])
-
-iterator leastToMostSig*(c: var SomeBigInteger, a, b: SomeBigInteger): (var Word, Word, Word) =
-  ## Iterate from least to most significant word
-  when cpuEndian == littleEndian:
-    for i in 0 ..< a.limbs.len:
-      yield (c.limbs[i], a.limbs[i], b.limbs[i])
-  else:
-    for i in countdown(a.limbs.len-1, 0):
-      yield (c.limbs[i], a.limbs[i], b.limbs[i])
-
-iterator mostToLeastSig*(a: SomeBigInteger): Word =
-  ## Iterate from most to least significant word
-  when cpuEndian == bigEndian:
-    for i in 0 ..< a.limbs.len:
-      yield a.limbs[i]
-  else:
-    for i in countdown(a.limbs.len-1, 0):
-      yield a.limbs[i]
-
 import std/macros
 
 proc replaceNodes(ast: NimNode, what: NimNode, by: NimNode): NimNode =
@@ -179,20 +123,15 @@ macro staticFor*(idx: untyped{nkIdent}, start, stopEx: static int, body: untyped
 
 # Copy
 # --------------------------------------------------------
+{.push raises: [], inline, noInit, gcsafe.}
 
-func copyFrom*(
-        dst: var SomeBigInteger,
-        src: SomeBigInteger
-      ){.inline.} =
-  ## Copy a BigInteger, truncated to 2^slen if the source
-  ## is larger than the destination
-  when cpuEndian == littleEndian:
-    for i in 0 ..< min(dst.limbs.len, src.limbs.len):
-      dst.limbs[i] = src.limbs[i]
-    for i in src.limbs.len ..< dst.limbs.len:
-      dst.limbs[i] = 0
-  else:
-    for i in countdown(dst.limbs.len-1, src.limbs.len):
-      dst.limbs[i] = 0
-    for i in countdown(src.limbs.len-1, 0):
-      dst.limbs[i] = src.limbs[i]
+func copyWords*(
+       a: var openArray[Word], startA: int,
+       b: openArray[Word], startB: int,
+       numWords: int) =
+  ## Copy a slice of B into A. This properly deals
+  ## with overlaps when A and B are slices of the same buffer
+  for i in countdown(numWords-1, 0):
+    a[startA+i] = b[startB+i]
+
+{.pop.}
\ No newline at end of file
diff --git a/stint/private/uint_addsub.nim b/stint/private/uint_addsub.nim
index 6821d75..3cd4909 100644
--- a/stint/private/uint_addsub.nim
+++ b/stint/private/uint_addsub.nim
@@ -19,40 +19,31 @@ import
 func sum*(r: var Stuint, a, b: Stuint) =
   ## Addition for multi-precision unsigned int
   var carry = Carry(0)
-  for wr, wa, wb in leastToMostSig(r, a, b):
-    addC(carry, wr, wa, wb, carry)
-  r.clearExtraBits()
+  for i in 0 ..< r.limbs.len:
+    addC(carry, r[i], a[i], b[i], carry)
+  r.clearExtraBitsOverMSB()
 
 func `+=`*(a: var Stuint, b: Stuint) =
   ## In-place addition for multi-precision unsigned int
-  var carry = Carry(0)
-  for wa, wb in leastToMostSig(a, b):
-    addC(carry, wa, wa, wb, carry)
-  a.clearExtraBits()
+  a.sum(a, b)
 
 func diff*(r: var Stuint, a, b: Stuint) =
   ## Substraction for multi-precision unsigned int
   var borrow = Borrow(0)
-  for wr, wa, wb in leastToMostSig(r, a, b):
-    subB(borrow, wr, wa, wb, borrow)
-  r.clearExtraBits()
+  for i in 0 ..< r.limbs.len:
+    subB(borrow, r[i], a[i], b[i], borrow)
+  r.clearExtraBitsOverMSB()
 
 func `-=`*(a: var Stuint, b: Stuint) =
   ## In-place substraction for multi-precision unsigned int
-  var borrow = Borrow(0)
-  for wa, wb in leastToMostSig(a, b):
-    subB(borrow, wa, wa, wb, borrow)
-  a.clearExtraBits()
+  a.diff(a, b)
 
 func inc*(a: var Stuint, w: Word = 1) =
   var carry = Carry(0)
-  when cpuEndian == littleEndian:
-    addC(carry, a.limbs[0], a.limbs[0], w, carry)
-    for i in 1 ..< a.limbs.len:
-      addC(carry, a.limbs[i], a.limbs[i], 0, carry)
-  else:
-    {.error: "Not implemented.".}
-  a.clearExtraBits()
+  addC(carry, a.limbs[0], a.limbs[0], w, carry)
+  for i in 1 ..< a.limbs.len:
+    addC(carry, a.limbs[i], a.limbs[i], 0, carry)
+  a.clearExtraBitsOverMSB()
 
 func sum*(r: var Stuint, a: Stuint, b: SomeUnsignedInt) =
   ## Addition for multi-precision unsigned int
diff --git a/stint/private/uint_bitwise.nim b/stint/private/uint_bitwise.nim
index 587b7a4..a3ce42b 100644
--- a/stint/private/uint_bitwise.nim
+++ b/stint/private/uint_bitwise.nim
@@ -20,30 +20,30 @@ import
 func bitnot*(r: var StUint, a: Stuint) =
   ## Bitwise complement of unsigned integer a
   ## i.e. flips all bits of the input
-  for wr, wa in leastToMostSig(r, a):
-    wr = not wa
-  r.clearExtraBits()
+  for i in 0 ..< r.limbs.len:
+    r[i] = not a[i]
+  r.clearExtraBitsOverMSB()
 
 func bitor*(r: var Stuint, a, b: Stuint) =
   ## `Bitwise or` of numbers a and b
-  for wr, wa, wb in leastToMostSig(r, a, b):
-    wr = wa or wb
+  for i in 0 ..< r.limbs.len:
+    r[i] = a[i] or b[i]
 
 func bitand*(r: var Stuint, a, b: Stuint) =
   ## `Bitwise and` of numbers a and b
-  for wr, wa, wb in leastToMostSig(r, a, b):
-    wr = wa and wb
+  for i in 0 ..< r.limbs.len:
+    r[i] = a[i] and b[i]
 
 func bitxor*(r: var Stuint, a, b: Stuint) =
   ## `Bitwise xor` of numbers x and y
-  for wr, wa, wb in leastToMostSig(r, a, b):
-    wr = wa xor wb
-  r.clearExtraBits()
+  for i in 0 ..< r.limbs.len:
+    r[i] = a[i] xor b[i]
+  r.clearExtraBitsOverMSB()
 
 func countOnes*(a: Stuint): int =
   result = 0
-  for wa in leastToMostSig(a):
-    result += countOnes(wa)
+  for i in 0 ..< a.limbs.len:
+    result += countOnes(a[i])
 
 func parity*(a: Stuint): int =
   result = parity(a.limbs[0])
@@ -56,8 +56,8 @@ func leadingZeros*(a: Stuint): int =
   # Adjust when we use only part of the word size
   var extraBits = WordBitWidth * a.limbs.len - a.bits
 
-  for word in mostToLeastSig(a):
-    let zeroCount = word.leadingZeros()
+  for i in countdown(a.len-1, 0):
+    let zeroCount = a.limbs[i].leadingZeros()
     if extraBits > 0:
       result += zeroCount - min(extraBits, WordBitWidth)
       extraBits -= WordBitWidth
@@ -68,8 +68,8 @@ func leadingZeros*(a: Stuint): int =
 
 func trailingZeros*(a: Stuint): int =
   result = 0
-  for word in leastToMostSig(a):
-    let zeroCount = word.trailingZeros()
+  for i in 0 ..< a.limbs.len:
+    let zeroCount = a[i].trailingZeros()
     result += zeroCount
     if zeroCount != WordBitWidth:
       break
diff --git a/stint/private/uint_div.nim b/stint/private/uint_div.nim
index 333234c..bb7ca5d 100644
--- a/stint/private/uint_div.nim
+++ b/stint/private/uint_div.nim
@@ -15,31 +15,6 @@ import
   ./uint_bitwise,
   ./primitives/[addcarry_subborrow, extended_precision]
 
-# Helpers
-# --------------------------------------------------------
-
-func usedBitsAndWords(a: openArray[Word]): tuple[bits, words: int] {.inline.} =
-  ## Returns the number of used words and bits in a bigInt
-  var clz = 0
-  # Count Leading Zeros
-  for i in countdown(a.len-1, 0):
-    let count = log2trunc(a[i])
-    # debugEcho "count: ", count, ", a[", i, "]: ", a[i].toBin(64)
-    if count == -1:
-      clz += WordBitWidth
-    else:
-      clz += WordBitWidth - count - 1
-      return (a.len*WordBitWidth - clz, i+1)
-
-func copyWords(
-       a: var openArray[Word], startA: int,
-       b: openArray[Word], startB: int,
-       numWords: int) =
-  ## Copy a slice of B into A. This properly deals
-  ## with overlaps when A and B are slices of the same buffer
-  for i in countdown(numWords-1, 0):
-    a[startA+i] = b[startB+i]
-
 # Division
 # --------------------------------------------------------
 
@@ -312,7 +287,7 @@ func shlAddMod(a: var openArray[Word], c: Word,
   else:
     return shlAddMod_multi(a, c, M, mBits)
 
-func divRemImpl(
+func divRem*(
        q, r: var openArray[Word],
        a, b: openArray[Word]
      ) =
@@ -350,20 +325,6 @@ func divRemImpl(
     for i in rLen ..< r.len:
       r[i] = 0
 
-func `div`*(x, y: Stuint): Stuint {.inline.} =
-  ## Division operation for multi-precision unsigned uint
-  var tmp{.noInit.}: Stuint
-  divRemImpl(result.limbs, tmp.limbs, x.limbs, y.limbs)
-
-func `mod`*(x, y: Stuint): Stuint {.inline.} =
-  ## Remainder operation for multi-precision unsigned uint
-  var tmp{.noInit.}: Stuint
-  divRemImpl(tmp.limbs, result.limbs, x.limbs, y.limbs)
-
-func divmod*(x, y: Stuint): tuple[quot, rem: Stuint] =
-  ## Division and remainder operations for multi-precision unsigned uint
-  divRemImpl(result.quot.limbs, result.rem.limbs, x.limbs, y.limbs)
-
 # ######################################################################
 # Division implementations
 #
diff --git a/stint/private/uint_shift.nim b/stint/private/uint_shift.nim
index 3d50abf..a0181c2 100644
--- a/stint/private/uint_shift.nim
+++ b/stint/private/uint_shift.nim
@@ -25,14 +25,9 @@ func shrSmall*(r: var Limbs, a: Limbs, k: SomeInteger) =
   #       instead of a[i-1] and a[i]
   #       is probably easier to parallelize for the compiler
   #       (antidependence WAR vs loop-carried dependence RAW)
-  when cpuEndian == littleEndian:
-    for i in 0 ..< a.len-1:
-      r[i] = (a[i] shr k) or (a[i+1] shl (WordBitWidth - k))
-    r[^1] = a[^1] shr k
-  else:
-    for i in countdown(a.len-1, 1):
-      r[i] = (a[i] shr k) or (a[i-1] shl (WordBitWidth - k))
-    r[0] = a[0] shr k
+  for i in 0 ..< a.len-1:
+    r[i] = (a[i] shr k) or (a[i+1] shl (WordBitWidth - k))
+  r[^1] = a[^1] shr k
 
 func shrLarge*(r: var Limbs, a: Limbs, w, shift: SomeInteger) =
   ## Shift right by `w` words + `shift` bits
@@ -40,40 +35,24 @@ func shrLarge*(r: var Limbs, a: Limbs, w, shift: SomeInteger) =
   if w > Limbs.len:
     return
 
-  when cpuEndian == littleEndian:
-    for i in w ..< a.len-1:
-      r[i-w] = (a[i] shr shift) or (a[i+1] shl (WordBitWidth - shift))
-    r[^(1+w)] = a[^1] shr shift
-  else:
-    for i in countdown(a.len-1, 1+w):
-      r[i-w] = (a[i] shr shift) or (a[i-1] shl (WordBitWidth - k))
-    r[0] = a[w] shr shift
+  for i in w ..< a.len-1:
+    r[i-w] = (a[i] shr shift) or (a[i+1] shl (WordBitWidth - shift))
+  r[^(1+w)] = a[^1] shr shift
 
 func shrWords*(r: var Limbs, a: Limbs, w: SomeInteger) =
   ## Shift right by w word
-  when cpuEndian == littleEndian:
-    for i in 0 ..< Limbs.len-w:
-      r[i] = a[i+w]
-    for i in Limbs.len-w ..< Limbs.len:
-      r[i] = 0
-  else:
-    for i in countdown(Limbs.len-1, Limbs.len-w):
-      r[i] = 0
-    for i in countdown(Limbs.len-w, 0):
-      r[i] = a[i+w]
+  for i in 0 ..< Limbs.len-w:
+    r[i] = a[i+w]
+  for i in Limbs.len-w ..< Limbs.len:
+    r[i] = 0
 
 func shlSmall*(r: var Limbs, a: Limbs, k: SomeInteger) =
   ## Compute the `shift left` operation of x and k
   ##
   ## k MUST be less than the base word size (2^32 or 2^64)
-  when cpuEndian == littleEndian:
-    r[0] = a[0] shl k
-    for i in 1 ..< a.len:
-      r[i] = (a[i] shl k) or (a[i-1] shr (WordBitWidth - k))
-  else:
-    r[^1] = a[^1] shl k
-    for i in countdown(a.len-2, 0):
-      r[i] = (a[i] shl k) or (a[i+1] shr (WordBitWidth - k))
+  r[0] = a[0] shl k
+  for i in 1 ..< a.len:
+    r[i] = (a[i] shl k) or (a[i-1] shr (WordBitWidth - k))
 
 func shlLarge*(r: var Limbs, a: Limbs, w, shift: SomeInteger) =
   ## Shift left by `w` words + `shift` bits
@@ -81,27 +60,16 @@ func shlLarge*(r: var Limbs, a: Limbs, w, shift: SomeInteger) =
   if w > Limbs.len:
     return
 
-  when cpuEndian == littleEndian:
-    r[w] = a[0] shl shift
-    for i in 1+w ..< r.len:
-      r[i] = (a[i-w] shl shift) or (a[i-w-1] shr (WordBitWidth - shift))
-  else:
-    r[^1] = a[^w] shl shift
-    for i in countdown(a.len-2-w, 0):
-      r[i+w] = (a[i] shl shift) or (a[i+1] shr (WordBitWidth - shift))
+  r[w] = a[0] shl shift
+  for i in 1+w ..< r.len:
+    r[i] = (a[i-w] shl shift) or (a[i-w-1] shr (WordBitWidth - shift))
 
 func shlWords*(r: var Limbs, a: Limbs, w: SomeInteger) =
   ## Shift left by w word
-  when cpuEndian == littleEndian:
-    for i in 0 ..< w:
-      r[i] = 0
-    for i in 0 ..< Limbs.len-w:
-      r[i+w] = a[i]
-  else:
-    for i in countdown(Limbs.len-1, Limbs.len-w):
-      r[i] = 0
-    for i in countdown(Limbs.len-w-1, 0):
-      r[i] = a[i-w]
+  for i in 0 ..< w:
+    r[i] = 0
+  for i in 0 ..< Limbs.len-w:
+    r[i+w] = a[i]
 
 # Wrappers
 # --------------------------------------------------------
@@ -133,7 +101,7 @@ func shiftLeft*(r: var Stuint, a: Stuint, k: SomeInteger) =
   
   if k < WordBitWidth:
     r.limbs.shlSmall(a.limbs, k)
-    r.clearExtraBits()
+    r.clearExtraBitsOverMSB()
     return
 
   # w = k div WordBitWidth, shift = k mod WordBitWidth
@@ -145,4 +113,4 @@ func shiftLeft*(r: var Stuint, a: Stuint, k: SomeInteger) =
   else:
     r.limbs.shlLarge(a.limbs, w, shift)
 
-  r.clearExtraBits()
+  r.clearExtraBitsOverMSB()
diff --git a/stint/uintops.nim b/stint/uintops.nim
index f196bd2..681616f 100644
--- a/stint/uintops.nim
+++ b/stint/uintops.nim
@@ -30,14 +30,9 @@ func setZero*(a: var StUint) =
 
 func setSmallInt(a: var StUint, k: Word) =
   ## Set ``a`` to k
-  when cpuEndian == littleEndian:
-    a.limbs[0] = k
-    for i in 1 ..< a.limbs.len:
-      a.limbs[i] = 0
-  else:
-    a.limbs[^1] = k
-    for i in 0 ..< a.limb.len - 1:
-      a.limbs[i] = 0
+  a.limbs[0] = k
+  for i in 1 ..< a.limbs.len:
+    a.limbs[i] = 0
 
 func setOne*(a: var StUint) =
   setSmallInt(a, 1)
@@ -51,8 +46,9 @@ func one*[bits: static[int]](T: typedesc[Stuint[bits]]): T {.inline.} =
   result.setOne()
 
 func high*[bits](_: typedesc[Stuint[bits]]): Stuint[bits] {.inline.} =
-  for wr in leastToMostSig(result):
-    wr = high(Word)
+  for i in 0 ..< result.limbs.len:
+    result[i] = high(Word)
+  result.clearExtraBitsOverMSB() # keep bits above `bits` zero when the width is not word-aligned
 func low*[bits](_: typedesc[Stuint[bits]]): Stuint[bits] {.inline.} =
   discard
 
@@ -62,15 +58,15 @@ func low*[bits](_: typedesc[Stuint[bits]]): Stuint[bits] {.inline.} =
 {.push raises: [], inline, noInit, gcsafe.}
 
 func isZero*(a: Stuint): bool =
-  for word in leastToMostSig(a):
-    if word != 0:
+  for i in 0 ..< a.limbs.len:
+    if a[i] != 0:
       return false
   return true
 
 func `==`*(a, b: Stuint): bool {.inline.} =
   ## Unsigned `equal` comparison
-  for wa, wb in leastToMostSig(a, b):
-    if wa != wb:
+  for i in 0 ..< a.limbs.len:
+    if a[i] != b[i]:
       return false
   return true
 
@@ -78,8 +74,8 @@ func `<`*(a, b: Stuint): bool {.inline.} =
   ## Unsigned `less than` comparison
   var diff: Word
   var borrow: Borrow
-  for wa, wb in leastToMostSig(a, b):
-    subB(borrow, diff, wa, wb, borrow)
+  for i in 0 ..< a.limbs.len:
+    subB(borrow, diff, a[i], b[i], borrow)
   return bool(borrow)
 
 func `<=`*(a, b: Stuint): bool {.inline.} =
@@ -89,12 +85,12 @@ func `<=`*(a, b: Stuint): bool {.inline.} =
 func isOdd*(a: Stuint): bool {.inline.} =
   ## Returns true if input is off
   ## false otherwise
-  bool(a.leastSignificantWord and 1)
+  bool(a[0] and 1)
 
 func isEven*(a: Stuint): bool {.inline.} =
   ## Returns true if input is zero
   ## false otherwise
-  not a.isOdd
+  not a.isOdd()
 
 {.pop.}
 # Bitwise operations
@@ -178,7 +174,7 @@ export `+=`
 func `*`*(a, b: Stuint): Stuint =
   ## Integer multiplication
   result.limbs.prod(a.limbs, b.limbs)
-  result.clearExtraBits()
+  result.clearExtraBitsOverMSB()
 
 {.pop.}
 
@@ -228,5 +224,20 @@ func pow*[aBits, eBits](a: Stuint[aBits], e: Stuint[eBits]): Stuint[aBits] =
 
 # Division & Modulo
 # --------------------------------------------------------
+{.push raises: [], inline, noInit, gcsafe.}
 
-export uint_div
\ No newline at end of file
+func `div`*(x, y: Stuint): Stuint =
+  ## Division operation for multi-precision unsigned uint
+  var tmp{.noInit.}: Stuint
+  divRem(result.limbs, tmp.limbs, x.limbs, y.limbs)
+
+func `mod`*(x, y: Stuint): Stuint =
+  ## Remainder operation for multi-precision unsigned uint
+  var tmp{.noInit.}: Stuint
+  divRem(tmp.limbs, result.limbs, x.limbs, y.limbs)
+
+func divmod*(x, y: Stuint): tuple[quot, rem: Stuint] =
+  ## Division and remainder operations for multi-precision unsigned uint
+  divRem(result.quot.limbs, result.rem.limbs, x.limbs, y.limbs)
+
+{.pop.}
\ No newline at end of file