nimbus-eth1/nimbus/db/aristo/aristo_hashify.nim
Jordan Hrycaj 8e18e85288
Aristodb remove obsolete and time consuming admin features (#2048)
* Aristo: Reorg `hashify()` using different schedule algorithm

why:
  Directly calculating the search tree top down from the roots turns
  out to be faster than using the cached structures left over by `merge()`
  and `delete()`.
  Time gains is short of 20%

* Aristo: Remove `lTab[]` leaf entry object type

why:
  Not used anymore. It was previously needed to build the schedule for
  `hashify()`.

* Aristo: Avoid unnecessary re-org of the vertex ID recycling list

why:
  This list can become quite large so a heuristic is employed whether
  it makes sense to re-org.

  Also, re-org check is only done by `delete()` functions.

* Aristo: Remove key/reverse lookup table from tx layers

why:
  It is ignored except for handling proof nodes and costs unnecessary
  run time resources.

  This feature was originally needed to accommodate the mental transition
  from the legacy MPT to the `Aristo` trie :).

* Fix copyright year
2024-02-22 08:24:58 +00:00

332 lines
11 KiB
Nim

# nimbus-eth1
# Copyright (c) 2023-2024 Status Research & Development GmbH
# Licensed under either of
# * Apache License, version 2.0, ([LICENSE-APACHE](LICENSE-APACHE) or
# http://www.apache.org/licenses/LICENSE-2.0)
# * MIT license ([LICENSE-MIT](LICENSE-MIT) or
# http://opensource.org/licenses/MIT)
# at your option. This file may not be copied, modified, or distributed
# except according to those terms.
## Aristo DB -- Patricia Trie Merkleisation
## ========================================
##
## For the current state of the `Patricia Trie`, keys (equivalent to hashes)
## are associated with the vertex IDs. Existing key associations are taken
## as-is/unchecked unless the ID is marked a proof node. In the latter case,
## the key is assumed to be correct after re-calculation.
##
## The labelling algorithm works roughly as follows:
##
## * Given a set of start or root vertices, build the forest (of trees)
## downwards towards leafs vertices so that none of these vertices has a
## Merkle hash label.
##
## * Starting at the leaf vertices in width-first fashion, calculate the
## Merkle hashes and label the leaf vertices. Recursively work up labelling
## vertices up until the root nodes are reached.
##
## Note that there are some tweaks for `proof` node vertices which lead to
## incomplete trees in a way that the algoritm handles existing Merkle hash
## labels for missing vertices.
##
{.push raises: [].}
import
std/[algorithm, sequtils, sets, tables],
chronicles,
eth/common,
results,
stew/byteutils,
"."/[aristo_desc, aristo_get, aristo_layers, aristo_serialise, aristo_utils]
type
WidthFirstForest = object
## Collected width first search trees
root: HashSet[VertexID] ## Top level, root targets
pool: Table[VertexID,VertexID] ## Upper links pool
base: Table[VertexID,VertexID] ## Width-first leaf level links
leaf: HashSet[VertexID] ## Stans-alone leaf to process
rev: Table[VertexID,HashSet[VertexID]] ## Reverse look up table
logScope:
topics = "aristo-hashify"
# ------------------------------------------------------------------------------
# Private helpers
# ------------------------------------------------------------------------------
template logTxt(info: static[string]): static[string] =
"Hashify " & info
func getOrVoid(tab: Table[VertexID,VertexID]; vid: VertexID): VertexID =
tab.getOrDefault(vid, VertexID(0))
func contains(wff: WidthFirstForest; vid: VertexID): bool =
vid in wff.base or vid in wff.pool or vid in wff.root
# ------------------------------------------------------------------------------
# Private functions
# ------------------------------------------------------------------------------
func hasValue(
wffTable: Table[VertexID,VertexID];
vid: VertexID;
wff: WidthFirstForest;
): bool =
## Helper for efficient `value` access:
## ::
## wffTable.hasValue(wff, vid)
##
## instead of
## ::
## vid in wffTable.values.toSeq
##
for w in wff.rev.getOrVoid vid:
if w in wffTable:
return true
proc pedigree(
db: AristoDbRef; # Database, top layer
ancestors: HashSet[VertexID]; # Vertex IDs to start connecting from
proofs: HashSet[VertexID]; # Additional proof nodes to start from
): Result[WidthFirstForest,(VertexID,AristoError)] =
## For each vertex ID from the argument set `ancestors` find all un-labelled
## grand child vertices and build a forest (of trees) starting from the
## grand child vertices.
##
var
wff: WidthFirstForest
leafs: HashSet[VertexID]
proc register(wff: var WidthFirstForest; fromVid, toVid: VertexID) =
if toVid in wff.base:
# * there is `toVid->*` in `base[]`
# * so ``toVid->*` moved to `pool[]`
wff.pool[toVid] = wff.base.getOrVoid toVid
wff.base.del toVid
if wff.base.hasValue(fromVid, wff):
# * there is `*->fromVid` in `base[]`
# * so store `fromVid->toVid` in `pool[]`
wff.pool[fromVid] = toVid
else:
# store `fromVid->toVid` in `base[]`
wff.base[fromVid] = toVid
# Register reverse pair for quick table value lookup
wff.rev.withValue(toVid, val):
val[].incl fromVid
do:
wff.rev[toVid] = @[fromVid].toHashSet
# Remove unnecessarey sup-trie roots (e.g. for a storage root)
wff.root.excl fromVid
# Initialise greedy search which will keep a set of current leafs in the
# `leafs{}` set and follow up links in the `pool[]` table, leading all the
# way up to the `root{}` set.
#
# Process root nodes if they are unlabelled
var rootWasDeleted = VertexID(0)
for root in ancestors:
let vtx = db.getVtx root
if vtx.isNil:
if VertexID(LEAST_FREE_VID) <= root:
# There must be a another root, as well (e.g. `$1` for a storage
# root). Only the last one of some will be reported with error code.
rootWasDeleted = root
elif not db.getKey(root).isValid:
# Need to process `root` node
let children = vtx.subVids
if children.len == 0:
# This is an isolated leaf node
wff.leaf.incl root
else:
wff.root.incl root
for child in vtx.subVids:
if not db.getKey(child).isValid:
leafs.incl child
wff.register(child, root)
if rootWasDeleted.isValid and
wff.root.len == 0 and
wff.leaf.len == 0:
return err((rootWasDeleted,HashifyRootVtxUnresolved))
# Initialisation for `proof` nodes which are sort of similar to `root` nodes.
for proof in proofs:
let vtx = db.getVtx proof
if vtx.isNil or not db.getKey(proof).isValid:
return err((proof,HashifyVtxUnresolved))
let children = vtx.subVids
if 0 < children.len:
# To be treated as a root node
wff.root.incl proof
for child in vtx.subVids:
if not db.getKey(child).isValid:
leafs.incl child
wff.register(child, proof)
# Recursively step down and collect unlabelled vertices
while 0 < leafs.len:
var redo: typeof(leafs)
for parent in leafs:
assert parent.isValid
assert not db.getKey(parent).isValid
let vtx = db.getVtx parent
if not vtx.isNil:
let children = vtx.subVids.filterIt(not db.getKey(it).isValid)
if 0 < children.len:
for child in children:
redo.incl child
wff.register(child, parent)
continue
if parent notin wff.base:
# The buck stops here:
# move `(parent,granny)` from `pool[]` to `base[]`
let granny = wff.pool.getOrVoid parent
assert granny.isValid
wff.register(parent, granny)
wff.pool.del parent
redo.swap leafs
ok wff
# ------------------------------------------------------------------------------
# Private functions, tree traversal
# ------------------------------------------------------------------------------
proc createSched(
db: AristoDbRef; # Database, top layer
): Result[WidthFirstForest,(VertexID,AristoError)] =
## Create width-first search schedule (aka forest)
##
var wff = ? db.pedigree(db.dirty, db.pPrf)
if 0 < wff.leaf.len:
for vid in wff.leaf:
let node = db.getVtx(vid).toNode(db, beKeyOk=false).valueOr:
# Make sure that all those nodes are reachable
for needed in error:
if needed notin wff.base and
needed notin wff.pool:
return err((needed,HashifyVtxUnresolved))
continue
db.layersPutKey(VertexID(1), vid, node.digestTo(HashKey))
ok wff
proc processSched(
wff: var WidthFirstForest; # Search tree to process
db: AristoDbRef; # Database, top layer
): Result[void,(VertexID,AristoError)] =
## Traverse width-first schedule and update vertex hash labels.
##
while 0 < wff.base.len:
var
accept = false
redo: typeof(wff.base)
for (vid,toVid) in wff.base.pairs:
let vtx = db.getVtx vid
assert vtx.isValid
# Try to convert the vertex to a node. This is possible only if all
# link references have Merkle hash keys, already.
let node = vtx.toNode(db, stopEarly=false).valueOr:
# Do this vertex later, again
if wff.pool.hasValue(vid, wff):
wff.pool[vid] = toVid
accept = true # `redo[]` will be fifferent from `base[]`
else:
redo[vid] = toVid
continue
# End `valueOr` terminates error clause
# Could resolve => update Merkle hash
db.layersPutKey(VertexID(1), vid, node.digestTo HashKey)
# Set follow up link for next round
let toToVid = wff.pool.getOrVoid toVid
if toToVid.isValid:
if toToVid in redo:
# Got predecessor `(toVid,toToVid)` of `(toToVid,xxx)`,
# so move `(toToVid,xxx)` from `redo[]` to `pool[]`
wff.pool[toToVid] = redo.getOrVoid toToVid
redo.del toToVid
# Move `(toVid,toToVid)` from `pool[]` to `redo[]`
wff.pool.del toVid
redo[toVid] = toToVid
accept = true # `redo[]` will be fifferent from `base[]`
# End `for (vid,toVid)..`
# Make sure that `base[]` is different from `redo[]`
if not accept:
let vid = wff.base.keys.toSeq[0]
return err((vid,HashifyVtxUnresolved))
# Restart `wff.base[]`
wff.base.swap redo
ok()
proc finaliseRoots(
wff: var WidthFirstForest; # Search tree to process
db: AristoDbRef; # Database, top layer
): Result[void,(VertexID,AristoError)] =
## Process root vertices after all other vertices are done.
##
# Make sure that the pool has been exhausted
if 0 < wff.pool.len:
let vid = wff.pool.keys.toSeq.sorted[0]
return err((vid,HashifyVtxUnresolved))
# Update or verify root nodes
for vid in wff.root:
# Calculate hash key
let
node = db.getVtx(vid).toNode(db).valueOr:
return err((vid,HashifyRootVtxUnresolved))
key = node.digestTo(HashKey)
if vid notin db.pPrf:
db.layersPutKey(VertexID(1), vid, key)
elif key != db.getKey vid:
return err((vid,HashifyProofHashMismatch))
ok()
# ------------------------------------------------------------------------------
# Public functions
# ------------------------------------------------------------------------------
proc hashify*(
db: AristoDbRef; # Database, top layer
): Result[void,(VertexID,AristoError)] =
## Add keys to the `Patricia Trie` so that it becomes a `Merkle Patricia
## Tree`. If successful, the function returns the keys (aka Merkle hash) of
## the root vertices.
##
if 0 < db.dirty.len:
# Set up widh-first traversal schedule
var wff = ? db.createSched()
# Traverse tree spanned by `wff` and label remaining vertices.
? wff.processSched db
# Do/complete state root vertices
? wff.finaliseRoots db
db.top.final.dirty.clear # Mark top layer clean
ok()
# ------------------------------------------------------------------------------
# End
# ------------------------------------------------------------------------------