nim-eth/eth/trie/ordered_trie.nim

import ../common/hashes, ../rlp, stew/arraybuf

export hashes

type
  # Reference to an encoded trie node, at most 32 bytes long: either the node
  # encoding itself (when shorter than 32 bytes) or its keccak256 hash - see
  # `toShortHash`.
  ShortHash = ArrayBuf[32, byte]

  OrderedTrieRootBuilder* = object
    ## A special case of hexary trie root building for the case where keys are
    ## sorted integers and the number of entries is known ahead of time.
    ##
    ## The builder must be initialized with the value count by calling `init`.
    ##
    ## In the ethereum MPT, leaf nodes are computed by prefixing the value with
    ## its trie path slice. When the keys are ordered, we can pre-compute the
    ## trie path slice thus avoiding unnecessary storage of leaf values.
    ##
    ## Similar implementations with various tradeoffs exist that cover the
    ## general case:
    ##
    ## * https://github.com/alloy-rs/trie
    ## * https://github.com/rust-ethereum/ethereum/blob/b160820620aa9fd30050d5fcb306be4e12d58c8c/src/util.rs#L152
    ## * https://github.com/ethereum/go-ethereum/blob/master/trie/stacktrie.go
    ##
    ## TODO We don't need to store all leaves - instead, we could for each
    ##      level of the trie store only a hashing state that collects the trie
    ##      built "so far", similar to the StackTrie implementation - this works
    ##      for items 0x80 and up where the rlp-encoded order matches insertion
    ##      order.
    leaves: seq[ShortHash]
      ## One slot per expected item, indexed by the position of the item's key
      ## in the lexicographic order of rlp-encoded keys (see `keyToIndex`).
      ## Each slot holds the encoded leaf node, or its hash when the encoding
      ## is 32 bytes or longer.

    items: int
      ## Number of items added so far (and therefore also the key of the next item)

func init*(T: type OrderedTrieRootBuilder, expected: int): T =
  ## Create a builder with one pre-allocated leaf slot for each of the
  ## `expected` items; `rootHash` asserts that exactly this many were added.
  result.leaves = newSeq[ShortHash](expected)

func toShortHash(v: openArray[byte]): ShortHash =
  ## Turn the encoded node `v` into a node reference: encodings shorter than
  ## 32 bytes are kept verbatim, anything of 32 bytes or more is replaced by
  ## its keccak256 digest.
  if v.len >= 32:
    return ShortHash.initCopyFrom(keccak256(v).data)
  ShortHash.initCopyFrom(v)

func append(w: var RlpWriter, key: ShortHash) =
  ## Write the node reference `key` to `w`. References of 2..31 bytes are
  ## passed to `appendRawBytes`; all other lengths (including 32-byte hashes)
  ## are passed to the regular byte-string `append`.
  if key.len <= 1 or key.len >= 32:
    w.append key.data
  else:
    w.appendRawBytes key.data

func keyAtIndex(b: var OrderedTrieRootBuilder, i: int): RlpIntBuf =
  ## Inverse of `keyToIndex`: given a leaf slot `i`, return the rlp-encoded
  ## integer key that is stored in that slot.
  # Slot occupied by key 0: 0x7f, or the last slot when there are fewer leaves
  let zeroSlot = min(0x7f, b.leaves.len - 1)
  let key =
    if i > 0x7f:
      uint64(i)
    elif i == zeroSlot:
      0'u64
    else:
      # Slots below the zero slot hold keys 1, 2, 3, ...
      uint64(i + 1)
  rlp.encodeInt(key)

func nibble(v: RlpIntBuf, i: int): byte =
  ## The `i`-th 4-bit nibble of `v`: even indices select the high half of a
  ## byte, odd indices the low half.
  let octet = v.data[i shr 1]
  if (i and 1) == 0:
    octet shr 4
  else:
    octet and 0xf

func nibbles(v: RlpIntBuf): int =
  ## Total number of nibbles in `v` (two per byte).
  2 * v.len

func sharedPrefixLen(a, b: RlpIntBuf): int =
  ## Number of leading nibbles that the two buffers have in common.
  let commonBytes = min(a.len, b.len)
  for i in 0 ..< commonBytes:
    if a[i] != b[i]:
      # The bytes differ, but their high nibbles may still agree
      return
        if a.nibble(i * 2) == b.nibble(i * 2):
          i * 2 + 1
        else:
          i * 2
  # No differing byte within the common length: every nibble of it is shared.
  # The result is a nibble count, hence two per byte.
  commonBytes * 2

func hexPrefixEncode(
    r: RlpIntBuf, ibegin, iend: int, isLeaf = false
): ArrayBuf[10, byte] =
  ## Pack the nibbles `ibegin ..< iend` of `r` into bytes, preceded by a flag
  ## nibble: bit 1 of the flag is set for leaves, bit 0 when the nibble count
  ## is odd. With an odd count the first nibble shares byte 0 with the flag,
  ## otherwise the low half of byte 0 stays zero and data starts in byte 1.
  let
    count = iend - ibegin
    isOdd = (count and 1) != 0
    # Output position (in nibbles) of the first path nibble
    firstSlot = if isOdd: 1 else: 2
  result.setLen((count div 2) + 1)
  result[0] = byte((int(isLeaf) * 2 + int(isOdd)) shl 4)

  for k in 0 ..< count:
    let
      slot = firstSlot + k
      value = r.nibble(ibegin + k)
    if (slot and 1) != 0:
      # Odd output position: fill the low half of an already started byte
      result[slot shr 1] = result[slot shr 1] or value
    else:
      # Even output position: start a new byte with the high half
      result[slot shr 1] = value shl 4

func keyToIndex(b: var OrderedTrieRootBuilder, key: uint64): int =
  ## Given a key, compute its position according to the rlp-encoded integer
  ## ordering, ie the order that would result from encoding the key
  ## with RLP, "shortest big endian encoding" and sorting lexicographically -
  ## this lexicographical order determines the location of the key in the trie.
  ##
  ## This is the inverse of `keyAtIndex`; like that function it has no side
  ## effects and only reads the number of leaves from `b`.
  # Key 0 goes into position 0x7f or last, depending on how many there are
  let zeroSlot = min(0x7f, b.leaves.len - 1)
  if key == 0:
    zeroSlot
  elif key <= uint64(zeroSlot):
    # Keys up to the zero slot are shifted down by one to make room for key 0
    int(key - 1)
  else:
    int(key)

proc updateHash(b: var OrderedTrieRootBuilder, key: uint64, v: auto, w: var RlpWriter) =
  ## Encode the leaf node for `key` holding `rlp.encode(v)` and store its node
  ## reference in the leaf slot given by `keyToIndex`. `w` is a scratch writer
  ## that is cleared before use.
  let
    slot = b.keyToIndex(key)
    path = rlp.encodeInt(key)
  b.leaves[slot] =
    try:
      w.clear()
      w.startList(2)

      # The longest nibble prefix shared with either sorted neighbour decides
      # how much of the key remains in the leaf itself during encoding
      let shared =
        if b.leaves.len == 1:
          -1 # A lone leaf carries the whole key as its path
        elif slot + 1 >= b.leaves.len:
          # Last slot: only a preceding neighbour exists
          b.keyAtIndex(slot - 1).sharedPrefixLen(path)
        elif slot <= 0:
          # First slot: only a following neighbour exists
          b.keyAtIndex(slot + 1).sharedPrefixLen(path)
        else:
          max(
            b.keyAtIndex(slot - 1).sharedPrefixLen(path),
            b.keyAtIndex(slot + 1).sharedPrefixLen(path),
          )

      # Skip the shared prefix plus the one nibble after it
      w.append(path.hexPrefixEncode(shared + 1, path.nibbles, isLeaf = true).data())
      w.append(rlp.encode(v))

      toShortHash(w.finish)
    except RlpError:
      raiseAssert "RLP failures not expected"

proc add*[T](b: var OrderedTrieRootBuilder, v: openArray[T]) =
  ## Feed items to the trie root builder; each item's value is computed with
  ## `rlp.encode(item)` and its key is the running count of items added so
  ## far. By the time `rootHash` is called, the total number of items added
  ## must equal the count passed to `init`.
  ##
  ## TODO instead of RLP-encoding the items to bytes, we should be hashing them
  ##      directly:
  ##      * https://github.com/status-im/nim-eth/issues/724
  ##      * https://github.com/status-im/nim-eth/issues/698
  # A single writer is shared by all items; `updateHash` clears it each time
  var writer = initRlpWriter()
  for idx in 0 ..< v.len:
    b.updateHash(uint64 b.items, v[idx], writer)
    b.items += 1

proc computeKey(b: var OrderedTrieRootBuilder, rng: Slice[int], depth: int): ShortHash =
  ## Compute the node reference for the sub-trie made up of the leaf slots in
  ## `rng`, where `depth` is the number of leading key nibbles already
  ## consumed by the nodes above. The result is the encoded node itself when
  ## it is shorter than 32 bytes and its hash otherwise (see `toShortHash`).
  if rng.len == 0:
    ShortHash.initCopyFrom([byte 128]) # RLP of the empty byte string (0x80)
  elif rng.len == 1: # Leaf
    # Leaf nodes were already encoded when the item was added
    b.leaves[rng.a]
  else: # Branch (or extension)
    var p = int.high
    let ka = b.keyAtIndex(rng.a)

    # Find the shortest shared prefix among the given keys - if this is not
    # equal to `depth`, it means an extension node must be introduced among
    # the nodes in the given range. The top level always has a 0 shared length
    # prefix because the encodings for 0 and 1 start with different nibbles.
    if depth == 0:
      p = 0
    else:
      for i in 1 ..< rng.len:
        # TODO We can get rid of this loop by observing what the nibbles in the
        #      RLP integer encoding have in common and adjust accordingly
        p = min(p, sharedPrefixLen(ka, b.keyAtIndex(rng.a + i)))
        if p == depth:
          # Cannot get any shorter - the keys diverge right at this depth
          break

    var w = initRlpWriter()

    if p == depth: # No shared prefix - this is a branch
      w.startList(17)
      # Sub-divide the keys by nibble and recurse
      var pos = rng.a
      for n in 0'u8 .. 15'u8:
        var x: int
        # Pick out the keys that have the asked-for nibble at the given depth
        while pos + x <= rng.b and b.keyAtIndex(pos + x).nibble(depth) == n:
          x += 1

        if x > 0:
          w.append b.computeKey(pos .. pos + x - 1, depth + 1)
        else:
          # No key passes through this nibble: empty child slot
          w.append(openArray[byte]([]))
        pos += x

      w.append(openArray[byte]([])) # No data in branch nodes
    else:
      # Extension node: the shared nibbles `depth ..< p` followed by the
      # reference to the node covering the same range at depth `p`
      w.startList(2)
      w.append(ka.hexPrefixEncode(depth, p, isLeaf = false).data())
      w.append(b.computeKey(rng, p))

    toShortHash(w.finish())

proc rootHash*(b: var OrderedTrieRootBuilder): Root =
  ## Compute the trie root over all added items. Asserts that the number of
  ## items added matches the count given to `init`.
  doAssert b.items == b.leaves.len, "Items added does not match initial length"
  let top = b.computeKey(0 ..< b.leaves.len, 0)
  if top.len != 32:
    # The top-level node was short enough to be kept verbatim: hash it now
    return keccak256(top.data)
  # Already a 32-byte hash
  Root(top.buf)

proc orderedTrieRoot*[T](items: openArray[T]): Root =
  ## One-shot helper that computes the MPT root of `items`, keyed by the
  ## rlp-encoded index of each item.
  ##
  ## Typical examples include the transaction and withdrawal roots that appear
  ## in blocks.
  ##
  ## Each value is rlp-encoded using `rlp.encode`.
  var builder = OrderedTrieRootBuilder.init(items.len)
  builder.add(items)
  builder.rootHash()

when isMainModule: # A small benchmark
  # Builds the same trie root with the ordered builder and with the general
  # hexary trie, checks that they agree and prints both durations followed by
  # their ratio.
  import std/[monotimes, times], eth/trie/[hexary, db]

  let n = 1000000
  echo "Testing ", n
  let values = block:
    # Exactly `n` values, matching the count reported above
    var tmp = newSeqOfCap[uint64](n)
    for i in 0 ..< n:
      tmp.add i.uint64
    tmp

  let x0 = getMonoTime()
  let b1 = block:
    var db = OrderedTrieRootBuilder.init(values.len)

    db.add(values)
    db.rootHash()
  let x1 = getMonoTime()
  let b2 = block:
    var db2 = initHexaryTrie(newMemoryDB())
    for v in values:
      db2.put(rlp.encode(v), rlp.encode(v))

    db2.rootHash()
  let x2 = getMonoTime()

  # Printing happens outside of the timed regions so it does not skew them
  echo b1
  # `doAssert` so the check also runs when compiled with `-d:danger`
  doAssert b1 == b2

  echo (
    (x1 - x0), (x2 - x1), (x1 - x0).inNanoseconds.float / (x2 - x1).inNanoseconds.float
  )