nimbus-eth1/nimbus/sync/snap/worker/db/hexary_envelope.nim

# nimbus-eth1
# Copyright (c) 2021 Status Research & Development GmbH
# Licensed under either of
#  * Apache License, version 2.0, ([LICENSE-APACHE](LICENSE-APACHE) or
#    http://www.apache.org/licenses/LICENSE-2.0)
#  * MIT license ([LICENSE-MIT](LICENSE-MIT) or
#    http://opensource.org/licenses/MIT)
# at your option. This file may not be copied, modified, or distributed
# except according to those terms.

## Envelope tools for nodes and hex encoded *partial paths*
## ========================================================
##
## Envelope
## --------
## Given a hex encoded *partial path*, this is the maximum range of leaf node
## paths (of data type `NodeTag`) that starts with the *partial path*. It is
## obtained by creating an interval (of type `NodeTagRange`) with end points
## starting with the *partial path* and extending it with *zero* nibbles for
## the left end, and *0xf* nibbles for the right end.
##
## Boundary proofs
## ---------------
## The *boundary proof* for a range `iv` of leaf node paths (e.g. account
## hashes) for a given *state root* is a set of nodes enough to construct the
## partial *Merkle Patricia trie* containing the leaves. If the given range
## `iv` is larger than the left or right most leaf node paths, the *boundary
## proof* also implies that there is no other leaf path between the range
## boundary and the left or rightmost leaf path. There is no minimality
## requirement for a *boundary proof*.
##
## Envelope decomposition
## ----------------------
## The idea is to compute the difference of the envelope of a hex encoded
## *partial path* off some range of leaf node paths and express the result as
## a list of envelopes (represented by either nodes or *partial paths*.)
##
## Prerequisites
## ^^^^^^^^^^^^^
## More formally, assume
##
## * ``partialPath`` is a hex encoded *partial path* (of type ``Blob``)
##
## * ``iv`` is a range of leaf node paths (of type ``NodeTagRange``)
##
## and assume further that for `iv` there are left and right *boundary proofs*
## in the database (e.g. as downloaded via the `snap/1` protocol.)
##
## The decomposition
## ^^^^^^^^^^^^^^^^^
## Then there is a (possibly empty) set `W` of *partial paths* (represented by
## nodes or *partial paths*) where the envelope of each *partial path* in `W`
## has no common leaf path in `iv` (i.e. disjunct to the sub-range of `iv`
## where the boundaries are existing node keys.)
##
## Let this set `W` be maximal in the sense that for every *partial path* `p`
## which is prefixed by `partialPath` the envelope of which has no common leaf
## node in `iv` there exists a *partial path* `w` in `W` that prefixes `p`. In
## other words the envelope of `p` is contained in the envelope of `w`.
##
## Formally:
##
## * if ``p = partialPath & p-ext`` with ``(envelope of p) * iv`` has no
##   allocated nodes in the hexary trie database
##
## * then there is a ``w = partialPath & w-ext`` in ``W`` with
##   ``p-ext = w-ext & some-ext``.
##
## Relation to boundary proofs
## ^^^^^^^^^^^^^^^^^^^^^^^^^^^
## Consider the decomposition of an empty *partial path* (the envelope of which
## representing the whole leaf node path range) for a leaf node range `iv`.
## This result is then a `boundary proof` for `iv` according to the definition
## above though it is highly redundant. All *partial path* bottom level nodes
## with envelopes disjunct to `iv` can be removed from `W` for a `boundary
## proof`.
##
import
  std/[algorithm, sequtils, tables],
  eth/[common, trie/nibbles],
  stew/interval_set,
  ../../range_desc,
  "."/[hexary_desc, hexary_error, hexary_nearby, hexary_paths]

{.push raises: [Defect].}

# ------------------------------------------------------------------------------
# Private helpers
# ------------------------------------------------------------------------------

proc `==`(a, b: XNodeObj): bool =
  ## Structural comparison of two node objects. Nodes of different `kind`
  ## never compare equal, nodes of the same `kind` are compared on the
  ## fields that belong to this particular variant.
  if a.kind != b.kind:
    return false
  case a.kind
  of Leaf:
    a.lPfx == b.lPfx and a.lData == b.lData
  of Extension:
    a.ePfx == b.ePfx and a.eLink == b.eLink
  of Branch:
    a.bLink == b.bLink

proc isZeroLink(a: Blob): bool =
  ## Persistent database has `Blob` as key. Such a link counts as unset
  ## if it holds no bytes at all.
  0 == a.len

proc isZeroLink(a: RepairKey): bool =
  ## Variant of `isZeroLink()` for links of type `RepairKey`. The link counts
  ## as unset if `isZero()` holds for it.
  ##
  ## NOTE(review): the previous comment attributed `RepairKey` keys to the
  ## persistent database, presumably they belong to the in-memory database
  ## instead -- to be confirmed.
  isZero(a)

proc convertTo(key: RepairKey; T: type NodeKey): T =
  ## Build a `NodeKey` from the bytes `1 .. 32` of the 33 byte wide `key`
  ## argument. The leading byte is ignored, so this conversion might be
  ## lossy (check before use.) The return code of `init()` is discarded.
  let tail = key.ByteArray33[1 .. 32]
  discard result.init(tail)

proc toNodeSpecs(nodeKey: RepairKey; partialPath: Blob): NodeSpecs =
  ## Combine a `RepairKey` link (converted to `NodeKey`) and a hex encoded
  ## partial path into a `NodeSpecs` descriptor.
  let key = nodeKey.convertTo(NodeKey)
  NodeSpecs(nodeKey: key, partialPath: partialPath)

proc toNodeSpecs(nodeKey: Blob; partialPath: Blob): NodeSpecs =
  ## Variant of `toNodeSpecs()` for a link of type `Blob` (converted to
  ## `NodeKey`.)
  let key = nodeKey.convertTo(NodeKey)
  NodeSpecs(nodeKey: key, partialPath: partialPath)


template noKeyErrorOops(info: static[string]; code: untyped) =
  ## Run the statement list `code` and turn a `KeyError` escaping from it
  ## into an assertion failure. The `info` text becomes part of the assertion
  ## message to identify the call site.
  try:
    code
  except KeyError as e:
    # A `KeyError` is considered a programming error here
    raiseAssert "Impossible KeyError (" & info & "): " & e.msg

template noRlpErrorOops(info: static[string]; code: untyped) =
  ## Run the statement list `code` and turn an `RlpError` escaping from it
  ## into an assertion failure. The `info` text becomes part of the assertion
  ## message to identify the call site.
  try:
    code
  except RlpError as e:
    # An `RlpError` is considered a programming error here
    raiseAssert "Impossible RlpError (" & info & "): " & e.msg

# ------------------------------------------------------------------------------
# Private functions
# ------------------------------------------------------------------------------

proc padPartialPath(pfx: NibblesSeq; dblNibble: byte): NodeKey =
  ## Extend (or cut) the `pfx` nibbles sequence to exactly 64 nibbles and
  ## return the result as a `NodeKey`.
  ##
  ## * A sequence shorter than 64 nibbles is padded on the right with copies
  ##   of the byte `dblNibble` (i.e. two nibbles at a time.) If the number of
  ##   missing nibbles is odd, a single nibble taken from `dblNibble` is
  ##   appended on top of that. The callers in this module use `0` and `255`
  ##   for the minimum and the maximum envelope value where both nibbles of
  ##   `dblNibble` are the same.
  ##
  ## * A sequence longer than 64 nibbles is truncated to its first 64
  ##   nibbles.
  var padded: NibblesSeq

  let padLen = 64 - pfx.len
  if 0 <= padLen:
    # Whole bytes first ...
    padded = pfx & dblNibble.repeat(padLen div 2).initNibbleRange
    if (padLen and 1) == 1:
      # ... then a single nibble: `slice(1)` skips the first nibble of the
      # one byte range
      padded = padded & @[dblNibble].initNibbleRange.slice(1)
  else:
    let nope = seq[byte].default.initNibbleRange
    # The second argument of `slice()` is the exclusive end position, so `64`
    # is needed for keeping the nibbles `0 .. 63` (using `63` would silently
    # drop the last nibble.)
    padded = pfx.slice(0,64) & nope # nope forces re-alignment

  # At this point, `padded` has 64 nibbles which makes 32 bytes
  let bytes = padded.getBytes
  (addr result.ByteArray32[0]).copyMem(unsafeAddr bytes[0], bytes.len)


proc decomposeLeft(
    envPt: RPath|XPath;
    ivPt: RPath|XPath;
      ): Result[seq[NodeSpecs],HexaryError] =
  ## Helper for `hexaryEnvelopeDecompose()`, takes care of the left side of
  ## the envelope of the partial path argument.
  ##
  ## The function returns the non-zero branch links to the left of the path
  ## `ivPt`, starting at the step where `ivPt` parts from `envPt`. If there
  ## is no such step, the error `DecomposeDegenerated` is returned.
  #
  #      partialPath
  #       /     \
  #      /       \
  #    envPt..              -- envelope left end of partial path
  #        |
  #      ivPt..             -- `iv`, not fully covering left of `env`
  #
  let
    envLen = envPt.path.len
    ivLen = ivPt.path.len

  # Find the first step where both paths differ (or where `envPt` ends.)
  #
  # The `node` entries of either `path[n]` step are the same at this
  # position. This is so because the predecessor steps were the same or
  # were the `rootKey` in case n == 0. But then the only way for the
  # `path[n]` steps to differ is in the entry selector `nibble` for a
  # branch node.
  var forkAt = -1
  for n in 0 ..< min(envLen + 1, ivLen):
    if n == envLen or envPt.path[n] != ivPt.path[n]:
      forkAt = n
      break

  if forkAt < 0:
    # Fringe case, e.g. when `partialPath` is an empty prefix (aka `@[0]`)
    # and the database has a single leaf node `(a,some-value)` where the
    # `rootKey` is the hash of this node. In that case, `pMin == 0` and
    # `pMax == high(NodeTag)` and `iv == [a,a]`.
    return err(DecomposeDegenerated)

  var collect: seq[NodeSpecs]
  for m in forkAt ..< ivLen:
    let
      pfx = ivPt.getNibbles(0, m) # common path segment
      top = ivPt.path[m].nibble   # need nibbles smaller than top
    # For a non-`Branch` node, the value `top` is `-1` so the inner loop
    # has nothing to do (which is correct)
    for nibble in 0 ..< top:
      let link = ivPt.path[m].node.bLink[nibble]
      if not link.isZeroLink:
        let childPath = pfx & @[nibble.byte].initNibbleRange.slice(1)
        collect.add link.toNodeSpecs(hexPrefixEncode(childPath, isLeaf=false))

  ok(collect)

proc decomposeRight(
    envPt: RPath|XPath;
    ivPt: RPath|XPath;
      ): Result[seq[NodeSpecs],HexaryError] =
  ## Helper for `hexaryEnvelopeDecompose()`, takes care of the right side of
  ## the envelope of the partial path argument.
  ##
  ## The function returns the non-zero branch links to the right of the path
  ## `ivPt`, starting at the step where `ivPt` parts from `envPt`. If there
  ## is no such step, the error `DecomposeDegenerated` is returned.
  #
  #        partialPath
  #         /     \
  #        /       \
  #           .. envPt     -- envelope right end of partial path
  #              |
  #          .. ivPt       -- `iv`, not fully covering right of `env`
  #
  let
    envLen = envPt.path.len
    ivLen = ivPt.path.len

  # Find the first step where both paths differ (or where `envPt` ends)
  var forkAt = -1
  for n in 0 ..< min(envLen + 1, ivLen):
    if n == envLen or envPt.path[n] != ivPt.path[n]:
      forkAt = n
      break

  if forkAt < 0:
    return err(DecomposeDegenerated)

  var collect: seq[NodeSpecs]
  for m in forkAt ..< ivLen:
    let
      pfx = ivPt.getNibbles(0, m) # common path segment
      base = ivPt.path[m].nibble  # need nibbles greater than base
    if base < 0:
      continue # no entry selector on this step, nothing to collect
    for nibble in base+1 .. 15:
      let link = ivPt.path[m].node.bLink[nibble]
      if not link.isZeroLink:
        let childPath = pfx & @[nibble.byte].initNibbleRange.slice(1)
        collect.add link.toNodeSpecs(hexPrefixEncode(childPath, isLeaf=false))

  ok(collect)


proc decomposeImpl(
    partialPath: Blob;               # Hex encoded partial path
    rootKey: NodeKey;                # State root
    iv: NodeTagRange;                # Proofed range of leaf paths
    db: HexaryGetFn|HexaryTreeDbRef; # Database abstraction
      ): Result[seq[NodeSpecs],HexaryError]
      {.gcsafe, raises: [Defect,RlpError,KeyError].} =
  ## Database agnostic implementation of `hexaryEnvelopeDecompose()`.
  ##
  ## The error `DecomposeDisjuct` is returned if the envelope of
  ## `partialPath` and the range `iv` have no point in common. Errors from
  ## the nearby-node lookup and from the left/right helpers are passed on.
  let env = partialPath.hexaryEnvelope
  if iv.maxPt < env.minPt or env.maxPt < iv.minPt:
    return err(DecomposeDisjuct) # empty result

  # So the ranges do overlap. If the `partialPath` envelope is fully
  # contained in `iv`, none of the two clauses below applies and the
  # result is `@[]`.
  var nodeSpex: seq[NodeSpecs]

  # Envelope sticks out on the left of `iv`
  if env.minPt < iv.minPt:
    let
      envPath = env.minPt.hexaryPath(rootKey, db)
      # Make sure that the min point is the nearest node to the right
      nearby = iv.minPt.hexaryPath(rootKey, db).hexaryNearbyRight(db)
    if nearby.isErr:
      return err(nearby.error)
    let leftRc = envPath.decomposeLeft nearby.value
    if leftRc.isErr:
      return err(leftRc.error)
    nodeSpex &= leftRc.value

  # Envelope sticks out on the right of `iv`
  if iv.maxPt < env.maxPt:
    let
      envPath = env.maxPt.hexaryPath(rootKey, db)
      # Similarly, use the nearest node to the left of the max point
      nearby = iv.maxPt.hexaryPath(rootKey, db).hexaryNearbyLeft(db)
    if nearby.isErr:
      return err(nearby.error)
    let rightRc = envPath.decomposeRight nearby.value
    if rightRc.isErr:
      return err(rightRc.error)
    nodeSpex &= rightRc.value

  ok(nodeSpex)

# ------------------------------------------------------------------------------
# Public functions, envelope constructor
# ------------------------------------------------------------------------------

proc hexaryEnvelope*(partialPath: Blob): NodeTagRange =
  ## Convert the hex encoded `partialPath` argument to the range of all
  ## conceivable node keys starting with this partial path. The left end of
  ## the range is the path padded with `0` bytes, the right end is the path
  ## padded with `255` bytes.
  let
    nibbles = partialPath.hexPrefixDecode[1]
    lowest = nibbles.padPartialPath(0).to(NodeTag)
    highest = nibbles.padPartialPath(255).to(NodeTag)
  NodeTagRange.new(lowest, highest)

proc hexaryEnvelope*(node: NodeSpecs): NodeTagRange =
  ## Variant of `hexaryEnvelope()` working on the `partialPath` field of the
  ## `node` argument.
  hexaryEnvelope(node.partialPath)

# ------------------------------------------------------------------------------
# Public functions, helpers
# ------------------------------------------------------------------------------

proc hexaryEnvelopeUniq*(
    partialPaths: openArray[Blob];
      ): seq[Blob]
      {.gcsafe, raises: [Defect,KeyError].} =
  ## Sort and simplify a list of partial paths by sorting envelopes while
  ## removing nested entries.
  ##
  ## The result contains the partial paths with outermost envelopes only,
  ## sorted by the left end of the envelope. For partial paths with exactly
  ## the same envelope, the last one from the argument list is kept.
  ##
  ## Envelopes of partial paths are either nested or disjunct. In particular,
  ## nested envelopes may share the left or the right end (e.g. the paths
  ## `1` and `10` both start at `1000..`), and the envelope of a 64 nibbles
  ## path is a single point. So the end points cannot be used as unique keys
  ## for begin/end markers.

  # For each left end, register the widest envelope starting there
  var tab: Table[NodeTag,(NodeTag,Blob)]
  for w in partialPaths:
    let iv = w.hexaryEnvelope
    if not tab.hasKey(iv.minPt) or not (iv.maxPt < tab[iv.minPt][0]):
      tab[iv.minPt] = (iv.maxPt, w)

  # When sorted by the left end, a nested entry follows its enclosing entry
  # and ends not later than that one, e.g.
  #
  # 123000000.. (123ffffff.., w0)
  # 123400000.. (1234fffff.., w1)  -- nested in `w0`, to be ignored
  # ...
  # 777000000.. (777ffffff.., w2)
  #
  var
    haveTop = false  # becomes `true` after the first entry was accepted
    topMax: NodeTag  # right end of the most recent accepted entry
  for key in toSeq(tab.keys).sorted(cmp):
    let (maxPt, w) = tab[key]
    if not haveTop or topMax < maxPt:
      result.add w
      topMax = maxPt
      haveTop = true

proc hexaryEnvelopeUniq*(
    nodes: openArray[NodeSpecs];
      ): seq[NodeSpecs]
      {.gcsafe, raises: [Defect,KeyError].} =
  ## Variant of `hexaryEnvelopeUniq` for sorting a `NodeSpecs` list by
  ## partial paths.
  ##
  ## The result contains the entries with outermost partial path envelopes
  ## only, sorted by the left end of the envelope. For entries with exactly
  ## the same envelope, the last one from the argument list is kept.
  ##
  ## Nested envelopes may share the left or the right end, and the envelope
  ## of a 64 nibbles path is a single point. So the end points cannot be used
  ## as unique keys for begin/end markers.

  # For each left end, register the entry with the widest envelope
  var tab: Table[NodeTag,(NodeTag,NodeSpecs)]
  for w in nodes:
    let iv = w.partialPath.hexaryEnvelope
    if not tab.hasKey(iv.minPt) or not (iv.maxPt < tab[iv.minPt][0]):
      tab[iv.minPt] = (iv.maxPt, w)

  # When sorted by the left end, a nested entry follows its enclosing entry
  # and ends not later than that one. Such an entry is ignored.
  var
    haveTop = false  # becomes `true` after the first entry was accepted
    topMax: NodeTag  # right end of the most recent accepted entry
  for key in toSeq(tab.keys).sorted(cmp):
    let (maxPt, w) = tab[key]
    if not haveTop or topMax < maxPt:
      result.add w
      topMax = maxPt
      haveTop = true


proc hexaryEnvelopeTouchedBy*(
    rangeSet: NodeTagRangeSet;          # Set of intervals (aka ranges)
    partialPath: Blob;                  # Partial path for some node
      ): NodeTagRangeSet =
  ## Return the set of all intervals from the argument `rangeSet` which have
  ## at least one point in common with the envelope of the `partialPath`
  ## argument. The result is empty if the envelope is disjunct to all
  ## intervals of `rangeSet`.
  result = NodeTagRangeSet.init()
  let probe = partialPath.hexaryEnvelope

  # Nothing to collect unless the envelope is covered, at least partially
  if not (0 < rangeSet.covered probe):
    return

  # Find a start point for the search, preferably the left end of the
  # rightmost interval that starts before the `probe` interval.
  var startSearch = low(NodeTag)

  # Try the least interval starting within or to the right of `probe`
  let atOrAfter = rangeSet.ge probe.minPt
  if atOrAfter.isOk:
    let before = rangeSet.le atOrAfter.value.minPt
    startSearch =
      if before.isOk:
        # The predecessor interval starts before `probe`, e.g.
        #
        #  .. [..before..] [..atOrAfter..] ..
        #           [....probe....]
        #
        before.value.minPt
      else:
        # No predecessor, so `atOrAfter` is the very first interval, e.g.
        #
        #                  [..atOrAfter..] ..
        #           [....probe....]
        #
        atOrAfter.value.minPt
  else:
    # No interval starts in or after `probe`. So, if an interval ends
    # before the right end of `probe`, it must start before `probe`, e.g.
    #
    #  .. [..endsInside..]
    #           [....probe....]
    #
    # Otherwise the initial value of `startSearch` will do the job, e.g.
    #
    #      [.......................]
    #           [....probe....]
    #
    let endsInside = rangeSet.le probe.maxPt
    if endsInside.isOk:
      startSearch = endsInside.value.minPt

  # Collect the intervals non-disjunct to `probe`, left to right
  for w in increasing[NodeTag,UInt256](rangeSet, startSearch):
    if (w * probe).isOk:
      discard result.merge w
    elif probe.maxPt < w.minPt:
      break # all the `w` following will be disjunct, too

proc hexaryEnvelopeTouchedBy*(
    rangeSet: NodeTagRangeSet;          # Set of intervals (aka ranges)
    node: NodeSpecs;                    # Node w/hex encoded partial path
      ): NodeTagRangeSet =
  ## Variant of `hexaryEnvelopeTouchedBy()` working on the `partialPath`
  ## field of the `node` argument.
  hexaryEnvelopeTouchedBy(rangeSet, node.partialPath)

# ------------------------------------------------------------------------------
# Public functions, complement sub-tries
# ------------------------------------------------------------------------------

proc hexaryEnvelopeDecompose*(
    partialPath: Blob;             # Hex encoded partial path
    rootKey: NodeKey;              # State root
    iv: NodeTagRange;              # Proofed range of leaf paths
    db: HexaryTreeDbRef;           # Database
      ): Result[seq[NodeSpecs],HexaryError]
      {.gcsafe, raises: [Defect,KeyError].} =
  ## Compute the decomposition of the envelope of the argument `partialPath`
  ## relative to the argument range `iv` (see the module header for the
  ## definition.) An `RlpError` is not expected for this database type and
  ## ends up as an assertion failure.
  ##
  ## * Comparison with `hexaryInspect()`
  ##
  ##   The function `hexaryInspect()` implements a breadth-first search for
  ##   dangling nodes starting at the state root (think of the cathode ray
  ##   of a CRT.) For the sake of comparison with this function, that search
  ##   may be amended to ignore nodes the envelope of which is fully
  ##   contained in some range `iv`. For a fully allocated hexary trie, there
  ##   will be at least one sub-trie of length *N* with leafs not in `iv`.
  ##   So the number of nodes visited is *O(16^N)* for some *N* at most 63.
  ##
  ##   The function `hexaryEnvelopeDecompose()` takes the left or rightmost
  ##   leaf path from `iv`, calculates a chain of *N* nodes from the state
  ##   root to the leaf, and for each node collects the links not pointing
  ##   inside the range `iv`. The number of nodes visited is *O(N)*.
  ##
  ##   The results of both functions are not interchangeable, though.
  ##   The function `hexaryInspect()` always returns dangling nodes if there
  ##   are any. In that case the hexary trie is incomplete and there is no
  ##   way to visit all nodes as they simply do not exist. But iteratively
  ##   adding nodes or sub-tries and re-running that algorithm will end up
  ##   with having all nodes visited.
  ##
  ##   The function `hexaryEnvelopeDecompose()` always returns the same
  ##   result where some nodes might be dangling and may be treated as
  ##   discussed in the previous paragraph. It also reveals allocated nodes
  ##   which might be checked for whether they exist fully or partially for
  ##   another state root hexary trie.
  ##
  ##   So both are sort of complementary where `hexaryEnvelopeDecompose()`
  ##   is the fast function and `hexaryInspect()` is the thorough one of
  ##   last resort.
  ##
  noRlpErrorOops("in-memory hexaryEnvelopeDecompose"):
    result = decomposeImpl(partialPath, rootKey, iv, db)

proc hexaryEnvelopeDecompose*(
    partialPath: Blob;             # Hex encoded partial path
    rootKey: NodeKey;              # State root
    iv: NodeTagRange;              # Proofed range of leaf paths
    getFn: HexaryGetFn;            # Database abstraction
      ): Result[seq[NodeSpecs],HexaryError]
      {.gcsafe, raises: [Defect,RlpError].} =
  ## Variant of `hexaryEnvelopeDecompose()` for the persistent database
  ## which is accessed via the `getFn` argument. A `KeyError` is not
  ## expected for this database type and ends up as an assertion failure.
  noKeyErrorOops("persistent hexaryEnvelopeDecompose"):
    result = decomposeImpl(partialPath, rootKey, iv, getFn)

# ------------------------------------------------------------------------------
# End
# ------------------------------------------------------------------------------