nimbus-eth1/nimbus/db/aristo/aristo_compute.nim

# nimbus-eth1
# Copyright (c) 2023-2024 Status Research & Development GmbH
# Licensed under either of
#  * Apache License, version 2.0, ([LICENSE-APACHE](LICENSE-APACHE) or
#    http://www.apache.org/licenses/LICENSE-2.0)
#  * MIT license ([LICENSE-MIT](LICENSE-MIT) or
#    http://opensource.org/licenses/MIT)
# at your option. This file may not be copied, modified, or distributed
# except according to those terms.

{.push raises: [].}

import
  std/strformat,
  chronicles,
  eth/common,
  results,
  "."/[aristo_desc, aristo_get, aristo_walk/persistent],
  ./aristo_desc/desc_backend

type WriteBatch = tuple[writer: PutHdlRef, count: int, depth: int, prefix: uint64]

# Keep write batch size _around_ 1mb, give or take some overhead - this is a
# tradeoff between efficiency and memory usage with diminishing returns the
# larger it is..
const batchSize = 1024 * 1024 div (sizeof(RootedVertexID) + sizeof(HashKey))

proc flush(batch: var WriteBatch, db: AristoDbRef): Result[void, AristoError] =
  if batch.writer != nil:
    ?db.backend.putEndFn batch.writer
    batch.writer = nil
  ok()

proc putVtx(
    batch: var WriteBatch,
    db: AristoDbRef,
    rvid: RootedVertexID,
    vtx: VertexRef,
    key: HashKey,
): Result[void, AristoError] =
  if batch.writer == nil:
    doAssert db.backend != nil, "source data is from the backend"
    batch.writer = ?db.backend.putBegFn()

  db.backend.putVtxFn(batch.writer, rvid, vtx, key)
  batch.count += 1

  ok()

func progress(batch: WriteBatch): string =
  # Return an approximation on how much of the keyspace has been covered by
  # looking at the path prefix that we're currently processing
  &"{(float(batch.prefix) / float(uint64.high)) * 100:02.2f}%"

func enter(batch: var WriteBatch, nibble: uint8) =
  batch.depth += 1
  if batch.depth <= 16:
    batch.prefix += uint64(nibble) shl ((16 - batch.depth) * 4)

func leave(batch: var WriteBatch, nibble: uint8) =
  if batch.depth <= 16:
    batch.prefix -= uint64(nibble) shl ((16 - batch.depth) * 4)
  batch.depth -= 1

proc putKeyAtLevel(
    db: AristoDbRef,
    rvid: RootedVertexID,
    vtx: VertexRef,
    key: HashKey,
    level: int,
    batch: var WriteBatch,
): Result[void, AristoError] =
  ## Store a hash key in the given layer or directly to the underlying database
  ## which helps ensure that memory usage is proportional to the pending change
  ## set (vertex data may have been committed to disk without computing the
  ## corresponding hash!)

  if level == -2:
    ?batch.putVtx(db, rvid, vtx, key)

    if batch.count mod batchSize == 0:
      ?batch.flush(db)

      if batch.count mod (batchSize * 100) == 0:
        info "Writing computeKey cache", keys = batch.count, accounts = batch.progress
      else:
        debug "Writing computeKey cache", keys = batch.count, accounts = batch.progress
  else:
    db.deltaAtLevel(level).sTab[rvid] = vtx
    db.deltaAtLevel(level).kMap[rvid] = key

  ok()

func maxLevel(cur, other: int): int =
  # Compare two levels and return the topmost in the stack, taking into account
  # the odd reversal of order around the zero point
  if cur < 0:
    max(cur, other) # >= 0 is always more topmost than <0
  elif other < 0:
    cur
  else:
    min(cur, other) # Here the order is reversed and 0 is the top layer

template encodeLeaf(w: var RlpWriter, pfx: NibblesBuf, leafData: untyped): HashKey =
  w.startList(2)
  w.append(pfx.toHexPrefix(isLeaf = true).data())
  w.append(leafData)
  w.finish().digestTo(HashKey)

template encodeBranch(w: var RlpWriter, vtx: VertexRef, subKeyForN: untyped): HashKey =
  w.startList(17)
  for (n {.inject.}, subvid {.inject.}) in vtx.allPairs():
    w.append(subKeyForN)
  w.append EmptyBlob
  w.finish().digestTo(HashKey)

template encodeExt(w: var RlpWriter, pfx: NibblesBuf, branchKey: HashKey): HashKey =
  w.startList(2)
  w.append(pfx.toHexPrefix(isLeaf = false).data())
  w.append(branchKey)
  w.finish().digestTo(HashKey)

proc getKey(
    db: AristoDbRef, rvid: RootedVertexID, skipLayers: static bool
): Result[((HashKey, VertexRef), int), AristoError] =
  ok when skipLayers:
    (?db.getKeyUbe(rvid, {GetVtxFlag.PeekCache}), -2)
  else:
    ?db.getKeyRc(rvid, {})

template childVid(v: VertexRef): VertexID =
  # If we have to recurse into a child, where would that recusion start?
  case v.vType
  of Leaf:
    if v.lData.pType == AccountData and v.lData.stoID.isValid:
      v.lData.stoID.vid
    else:
      default(VertexID)
  of Branch:
    v.startVid

proc computeKeyImpl(
    db: AristoDbRef,
    rvid: RootedVertexID,
    batch: var WriteBatch,
    vtx: VertexRef,
    level: int,
    skipLayers: static bool,
): Result[(HashKey, int), AristoError] =
  # The bloom filter available used only when creating the key cache from an
  # empty state

  # Top-most level of all the verticies this hash computation depends on
  var level = level

  # TODO this is the same code as when serializing NodeRef, without the NodeRef
  var writer = initRlpWriter()

  let key =
    case vtx.vType
    of Leaf:
      writer.encodeLeaf(vtx.pfx):
        case vtx.lData.pType
        of AccountData:
          let
            stoID = vtx.lData.stoID
            skey =
              if stoID.isValid:
                let
                  keyvtxl = ?db.getKey((stoID.vid, stoID.vid), skipLayers)
                  (skey, sl) =
                    if keyvtxl[0][0].isValid:
                      (keyvtxl[0][0], keyvtxl[1])
                    else:
                      ?db.computeKeyImpl(
                        (stoID.vid, stoID.vid),
                        batch,
                        keyvtxl[0][1],
                        keyvtxl[1],
                        skipLayers = skipLayers,
                      )
                level = maxLevel(level, sl)
                skey
              else:
                VOID_HASH_KEY

          rlp.encode Account(
            nonce: vtx.lData.account.nonce,
            balance: vtx.lData.account.balance,
            storageRoot: skey.to(Hash32),
            codeHash: vtx.lData.account.codeHash,
          )
        of StoData:
          # TODO avoid memory allocation when encoding storage data
          rlp.encode(vtx.lData.stoData)
    of Branch:
      # For branches, we need to load the vertices before recursing into them
      # to exploit their on-disk order
      var keyvtxs: array[16, ((HashKey, VertexRef), int)]
      for n, subvid in vtx.pairs:
        keyvtxs[n] = ?db.getKey((rvid.root, subvid), skipLayers)

      # Make sure we have keys computed for each hash
      block keysComputed:
        while true:
          # Compute missing keys in the order of the child vid that we have to
          # recurse into, again exploiting on-disk order - this more than
          # doubles computeKey speed on a fresh database!
          var
            minVid = default(VertexID)
            minIdx = keyvtxs.len + 1 # index where the minvid can be found
            n = 0'u8 # number of already-processed keys, for the progress bar

          # The O(n^2) sort/search here is fine given the small size of the list
          for nibble, keyvtx in keyvtxs.mpairs:
            let subvid = vtx.bVid(uint8 nibble)
            if (not subvid.isValid) or keyvtx[0][0].isValid:
              n += 1 # no need to compute key
              continue

            let childVid = keyvtx[0][1].childVid
            if not childVid.isValid:
              # leaf vertex without storage ID - we can compute the key trivially
              (keyvtx[0][0], keyvtx[1]) =
                ?db.computeKeyImpl(
                  (rvid.root, subvid),
                  batch,
                  keyvtx[0][1],
                  keyvtx[1],
                  skipLayers = skipLayers,
                )
              n += 1
              continue

            if minIdx == keyvtxs.len + 1 or childVid < minVid:
              minIdx = nibble
              minVid = childVid

          if minIdx == keyvtxs.len + 1: # no uncomputed key found!
            break keysComputed

          batch.enter(n)
          (keyvtxs[minIdx][0][0], keyvtxs[minIdx][1]) =
            ?db.computeKeyImpl(
              (rvid.root, vtx.bVid(uint8 minIdx)),
              batch,
              keyvtxs[minIdx][0][1],
              keyvtxs[minIdx][1],
              skipLayers = skipLayers,
            )
          batch.leave(n)

      template writeBranch(w: var RlpWriter): HashKey =
        w.encodeBranch(vtx):
          if subvid.isValid:
            level = maxLevel(level, keyvtxs[n][1])
            keyvtxs[n][0][0]
          else:
            VOID_HASH_KEY

      if vtx.pfx.len > 0: # Extension node
        writer.encodeExt(vtx.pfx):
          var bwriter = initRlpWriter()
          bwriter.writeBranch()
      else:
        writer.writeBranch()

  # Cache the hash into the same storage layer as the the top-most value that it
  # depends on (recursively) - this could be an ephemeral in-memory layer or the
  # underlying database backend - typically, values closer to the root are more
  # likely to live in an in-memory layer since any leaf change will lead to the
  # root key also changing while leaves that have never been hashed will see
  # their hash being saved directly to the backend.

  if vtx.vType != Leaf:
    ?db.putKeyAtLevel(rvid, vtx, key, level, batch)
  ok (key, level)

proc computeKeyImpl(
    db: AristoDbRef, rvid: RootedVertexID, skipLayers: static bool
): Result[HashKey, AristoError] =
  let (keyvtx, level) =
    when skipLayers:
      (?db.getKeyUbe(rvid, {GetVtxFlag.PeekCache}), -2)
    else:
      ?db.getKeyRc(rvid, {})

  if keyvtx[0].isValid:
    return ok(keyvtx[0])

  var batch: WriteBatch
  let res = computeKeyImpl(db, rvid, batch, keyvtx[1], level, skipLayers = skipLayers)
  if res.isOk:
    ?batch.flush(db)

    if batch.count > 0:
      if batch.count >= batchSize * 100:
        info "Wrote computeKey cache", keys = batch.count, accounts = "100.00%"
      else:
        debug "Wrote computeKey cache", keys = batch.count, accounts = "100.00%"

  ok (?res)[0]

proc computeKey*(
    db: AristoDbRef, # Database, top layer
    rvid: RootedVertexID, # Vertex to convert
): Result[HashKey, AristoError] =
  ## Compute the key for an arbitrary vertex ID. If successful, the length of
  ## the resulting key might be smaller than 32. If it is used as a root vertex
  ## state/hash, it must be converted to a `Hash32` (using (`.to(Hash32)`) as
  ## in `db.computeKey(rvid).value.to(Hash32)` which always results in a
  ## 32 byte value.
  computeKeyImpl(db, rvid, skipLayers = false)

proc computeKeys*(db: AristoDbRef, root: VertexID): Result[void, AristoError] =
  ## Ensure that key cache is topped up with the latest state root
  discard db.computeKeyImpl((root, root), skipLayers = true)

  ok()

# ------------------------------------------------------------------------------
# End
# ------------------------------------------------------------------------------