nimbus-eth1/nimbus/db/aristo/aristo_compute.nim
Jacek Sieka 188d689d9d
Speed up initial MPT root computation after import (#2788)
When `nimbus import` runs, we end up with a database without MPT roots
leading to long startup times the first time one is needed.

Computing the state root is slow because the on-disk order based on
VertexID sorting does not match the trie traversal order and therefore
makes lookups inefficient.

Here we introduce a helper that speeds up this computation by traversing
the trie in on-disk order and computing the trie hashes bottom up
instead - even though this leads to some redundant reads of nodes that
we cannot yet compute, it's still a net win as leaves and "bottom"
branches make up the majority of the database.

This PR also addresses a few other sources of inefficiency largely due
to the separation of AriKey and AriVtx into their own column families.

Each column family is its own LSM tree that produces hundreds of SST
files - with a limit of 512 open files, rocksdb must keep closing and
opening files which leads to expensive metadata reads during random
access.

When rocksdb makes a lookup, it has to read several layers of files for
each lookup. Ribbon filters are used to skip over files that don't have the
requested data but when these filters are not in memory, reading them is
slow - this happens in two cases: when opening a file and when the
filter has been evicted from the LRU cache. Addressing the open file
limit solves one source of inefficiency, but we must also increase the
block cache size to deal with this problem.

* rocksdb.max_open_files increased to 2048
* per-file size limits increased so that fewer files are created
* WAL size increased to avoid partial flushes which lead to small files
* rocksdb block cache increased

All these increases of course lead to increased memory usage, but at
least performance is acceptable - in the future, we'll need to explore
options such as joining AriVtx and AriKey and/or reducing the row count
(by grouping branch layers under a single vertexid).

With this PR, the mainnet state root can be computed in ~8 hours (down
from 2-3 days) - not great, but still better.

Further, we write all keys to the database, also those that are less
than 32 bytes - because the mpt path is part of the input, it is very
rare that we actually hit a key like this (about 200k such entries on
mainnet), so the code complexity is not worth the benefit really, in the
current database layout / design.
2024-10-27 11:08:37 +00:00

488 lines
17 KiB
Nim

# nimbus-eth1
# Copyright (c) 2023-2024 Status Research & Development GmbH
# Licensed under either of
# * Apache License, version 2.0, ([LICENSE-APACHE](LICENSE-APACHE) or
# http://www.apache.org/licenses/LICENSE-2.0)
# * MIT license ([LICENSE-MIT](LICENSE-MIT) or
# http://opensource.org/licenses/MIT)
# at your option. This file may not be copied, modified, or distributed
# except according to those terms.
{.push raises: [].}
import
system/ansi_c,
std/[strformat, math, hashes],
stew/staticfor,
chronicles,
eth/common,
results,
"."/[aristo_desc, aristo_get, aristo_serialise, aristo_walk/persistent],
./aristo_desc/desc_backend
type BasicBloomFilter = object
  # School book implementation of a bloom filter based on
  # https://github.com/save-buffer/bloomfilter_benchmarks.
  #
  # In theory, this bloom filter could be turned into a reusable component but
  # it is fairly specialised to the particular use case and gets used in a
  # tight/hot loop in the code - a generalisation would require care so as not
  # to introduce overhead but could of course be further optimised.
  #
  # The filter does not know its own size: `insert` and `query` use the
  # module-level `bloomSize` / `bloomHashes` constants.

  # Raw bit array of `bloomSize` bytes, allocated zeroed by `init` with the C
  # allocator and handed back by `release` - not managed by the Nim GC.
  bytes: ptr UncheckedArray[byte]
proc computeBits(n: int, epsilon: float): int =
  ## Number of bits a bloom filter needs in order to hold `n` elements at a
  ## false positive rate of `epsilon`, rounded to the nearest integer.
  let exactBits = -1.4427 * float(n) * log2(epsilon)
  int(exactBits + 0.5)
proc computeHashFns(epsilon: float): int =
  ## Number of hash functions to use for a bloom filter with the desired false
  ## positive rate `epsilon`, rounded to the nearest integer.
  let exactFns = -log2(epsilon)
  int(exactFns + 0.5)
const
  # Target false positive rate of the bloom filter.
  # The leaf cache computation is fairly sensitive to false positives as these
  # ripple up the branch trie with false positives being amplified by trie
  # branching - this has to be balanced with the cost which
  # goes up fairly quickly with ~13 bits per key at 0.002, meaning ~2gb of
  # memory for the current setting below!
  bloomRate = 0.002

  # Number of bits set / probed per key for the rate above
  bloomHashes = computeHashFns(bloomRate)

  # Expected number of elements in the bloom filter - this is reported as
  # `keys` below and will need adjusting - the value is more or less accurate
  # on mainnet as of block 2100000 (~oct 2024) for the number of leaves
  # present - we use leaf count because bloom filter accuracy is most
  # important for the first round of branches.
  # TODO rocksdb can estimate the number of keys present in the vertex table -
  #      this would provide a reasonable estimate of what the bloom table size
  #      should be, though in reality we want leaf count per above argument -
  #      at the time of writing leaves make up around 3/4 of all vertices
  expectedKeys = 1500000000

  # Size of the filter in bytes: the required bit count rounded up to a whole
  # number of bytes
  bloomSize = uint32((computeBits(expectedKeys, bloomRate) + 7) div 8)
const bloomBits = uint64(bloomSize) * 8
  # Total number of addressable bits in the filter. With the current settings
  # this exceeds 2^32, so bit positions must be computed with 64-bit
  # arithmetic: a position derived from a single 32-bit value can only ever
  # reach the first 2^32 bits (512 MiB) of the table, which leaves the rest of
  # the allocation unused and pushes the false positive rate far above
  # `bloomRate`.

func hashes(v: uint64): (uint64, uint64) =
  # Create two 64-bit hashes of `v` from which any number of bloom probe
  # positions can be derived cheaply per Kirsch and Mitzenmacher:
  # https://www.eecs.harvard.edu/~michaelm/postscripts/tr-02-05.pdf
  #
  # The second hash is obtained by hashing the first one again so that both
  # carry a full 64 bits - enough to cover all of `bloomBits`.
  let
    h1 = uint64(hash(v))
    h2 = uint64(hash(h1))
  (h1, h2)

func bitPos(h1, h2: uint64, i: int): uint64 {.inline.} =
  # Position of the i:th probe, `(h1 + i * h2) mod bloomBits` - the unsigned
  # arithmetic wraps around at 2^64 before the reduction, which is harmless.
  (h1 + uint64(i) * h2) mod bloomBits

func insert(filter: var BasicBloomFilter, v: uint64) =
  ## Add `v` to the filter by setting its `bloomHashes` probe bits.
  let (h1, h2) = hashes(v)

  # The loop is unrolled at compile time, `bloomHashes` being a constant
  staticFor i, 0 ..< bloomHashes:
    let
      pos = bitPos(h1, h2, i)
      byteIdx = int(pos shr 3) # < bloomSize since pos < bloomBits
      bitIdx = uint8(pos and 7)
    filter.bytes[byteIdx] = filter.bytes[byteIdx] or (1'u8 shl bitIdx)

func query(filter: BasicBloomFilter, v: uint64): bool =
  ## Return `false` if `v` was definitely never inserted and `true` if it
  ## probably was (subject to the false positive rate of the filter).
  let (h1, h2) = hashes(v)

  # Accumulate the probed bits without branching - `match` stays 1 only as
  # long as every probed bit is set
  var match = 1'u8

  staticFor i, 0 ..< bloomHashes:
    let
      pos = bitPos(h1, h2, i)
      byteIdx = int(pos shr 3)
      bitIdx = uint8(pos and 7)
    match = match and ((filter.bytes[byteIdx] shr bitIdx) and 1)

  match > 0
proc init(T: type BasicBloomFilter): T =
  ## Allocate a zeroed filter of `bloomSize` bytes; pair with `release`.
  # The C allocator is used (rather than Nim-managed memory) so that the
  # one-off blob can be returned to the operating system at the end of the
  # computation instead of remaining in the hands of the Nim GC.
  # `calloc` hands out zeroed memory, so no explicit clearing is needed.
  let zeroed = c_calloc(csize_t(bloomSize), 1)
  doAssert zeroed != nil, "Could not allocate memory for bloom filter"

  result = T(bytes: cast[ptr UncheckedArray[byte]](zeroed))
proc release(v: BasicBloomFilter) =
  ## Free the memory obtained in `init` - `v` must not be used afterwards.
  # TODO with orc, this could be a destructor
  v.bytes.c_free()
type WriteBatch = tuple
  # State shared by the key computation helpers while writing keys to the
  # backend
  writer: PutHdlRef # open backend write session, `nil` when none is pending
  count: int # number of keys put so far
  depth: int # number of `enter` calls not yet matched by `leave`
  prefix: uint64 # first 16 nibbles of the path being processed (progress)

# Keep write batch size _around_ 1mb, give or take some overhead - this is a
# tradeoff between efficiency and memory usage with diminishing returns the
# larger it is..
const batchSize = (1024 * 1024) div (sizeof(RootedVertexID) + sizeof(HashKey))
proc flush(batch: var WriteBatch, db: AristoDbRef): Result[void, AristoError] =
  ## Finish the pending backend write session of `batch`, if there is one.
  ## A failure from the backend is passed on to the caller.
  if batch.writer == nil:
    # Nothing was put since the last flush
    return ok()

  ?db.backend.putEndFn(batch.writer)
  batch.writer = nil
  ok()
proc putKey(
    batch: var WriteBatch, db: AristoDbRef, rvid: RootedVertexID, key: HashKey
): Result[void, AristoError] =
  ## Queue `key` for `rvid` in the backend write session of `batch`, opening a
  ## new session first if none is pending, and count it.
  if batch.writer.isNil:
    doAssert not db.backend.isNil, "source data is from the backend"
    batch.writer = ?db.backend.putBegFn()

  db.backend.putKeyFn(batch.writer, rvid, key)
  inc batch.count

  ok()
func progress(batch: WriteBatch): string =
  ## Approximate share of the keyspace covered so far, as a percentage string,
  ## judging by the path prefix that is currently being processed.
  result = &"{(float(batch.prefix) / float(uint64.high)) * 100:02.2f}%"
func enter(batch: var WriteBatch, nibble: int) =
  ## Descend one level via branch slot `nibble`, adding it to the tracked path
  ## prefix - only the first 16 levels fit in the 64-bit prefix.
  inc batch.depth
  if batch.depth > 16:
    return

  let shift = (16 - batch.depth) * 4
  batch.prefix += uint64(nibble) shl shift
func leave(batch: var WriteBatch, nibble: int) =
  ## Undo the matching `enter` call: take `nibble` out of the tracked path
  ## prefix (if it was recorded there) and step back up one level.
  if batch.depth <= 16:
    let shift = (16 - batch.depth) * 4
    batch.prefix -= uint64(nibble) shl shift

  dec batch.depth
proc putKeyAtLevel(
    db: AristoDbRef,
    rvid: RootedVertexID,
    key: HashKey,
    level: int,
    batch: var WriteBatch,
): Result[void, AristoError] =
  ## Store a hash key in the given layer or directly to the underlying database
  ## which helps ensure that memory usage is proportional to the pending change
  ## set (vertex data may have been committed to disk without computing the
  ## corresponding hash!)

  # Only put computed keys in the database which keeps churn down by focusing on
  # the ones that do not change!
  if level != -2:
    # Any other level goes to the in-memory delta for that level
    db.deltaAtLevel(level).kMap[rvid] = key
    return ok()

  # Level -2: the key goes to the backend by way of the write batch
  ?batch.putKey(db, rvid, key)

  if batch.count mod batchSize != 0:
    return ok()

  # The batch is full - write it out and report progress, raising the log
  # level for every 100th batch
  ?batch.flush(db)

  let milestone = batch.count mod (batchSize * 100) == 0
  if milestone:
    info "Writing computeKey cache", keys = batch.count, accounts = batch.progress
  else:
    debug "Writing computeKey cache", keys = batch.count, accounts = batch.progress

  ok()
func maxLevel(cur, other: int): int =
  ## Of two levels, return the one that is topmost in the stack. Ordering flips
  ## around zero: any level >= 0 is above every negative level, negative
  ## levels are ordered by value and among levels >= 0 the smaller one wins
  ## (0 being the top layer).
  if cur >= 0 and other >= 0:
    min(cur, other) # both non-negative: reversed order, 0 is the top
  elif cur >= 0:
    cur # non-negative always beats negative
  else:
    max(cur, other) # `cur` is negative: the larger value is further up
template encodeLeaf(w: var RlpWriter, pfx: NibblesBuf, leafData: untyped): HashKey =
  ## Write a leaf node to `w` as a 2-item RLP list - the hex-prefix encoded
  ## path `pfx` (flagged as leaf) followed by `leafData` - and turn the
  ## finished encoding into a `HashKey`.
  ##
  ## `leafData` is an untyped block that is evaluated after the list has been
  ## started and the path has been appended.
  w.startList(2)
  w.append(pfx.toHexPrefix(isLeaf = true).data())
  w.append(leafData)
  w.finish().digestTo(HashKey)
template encodeBranch(w: var RlpWriter, subKeyForN: untyped): HashKey =
  ## Write a branch node to `w` as a 17-item RLP list and turn the finished
  ## encoding into a `HashKey`.
  ##
  ## `subKeyForN` is evaluated once per slot with the slot index available to
  ## it as `n` (0 .. 15); the 17th item is always empty.
  w.startList(17)
  # `n` is injected so that the caller-supplied expression can refer to it
  for n {.inject.} in 0 .. 15:
    w.append(subKeyForN)
  w.append EmptyBlob
  w.finish().digestTo(HashKey)
template encodeExt(w: var RlpWriter, pfx: NibblesBuf, branchKey: HashKey): HashKey =
  ## Write an extension node to `w` as a 2-item RLP list - the hex-prefix
  ## encoded path `pfx` (flagged as non-leaf) followed by `branchKey` - and
  ## turn the finished encoding into a `HashKey`.
  w.startList(2)
  w.append(pfx.toHexPrefix(isLeaf = false).data())
  w.append(branchKey)
  w.finish().digestTo(HashKey)
proc computeKeyImpl(
    db: AristoDbRef,
    rvid: RootedVertexID,
    batch: var WriteBatch,
    bloom: ptr BasicBloomFilter = nil,
): Result[(HashKey, int), AristoError] =
  ## Recursively compute the hash key of the vertex `rvid`, returning it
  ## together with the top-most level among the vertices it depends on.
  ##
  ## Keys that are already cached are returned as they are. Newly computed
  ## keys are stored with `putKeyAtLevel`, using `batch` for those that go to
  ## the backend. Errors from lookups, sub-key computation and storing are
  ## passed on to the caller.

  # The bloom filter, when available, is only used when creating the key cache
  # from an empty state - a miss means that the key has not been written yet,
  # which allows skipping the cache lookup
  if bloom == nil or bloom[].query(uint64(rvid.vid)):
    db.getKeyRc(rvid).isErrOr:
      # Value cached either in layers or database
      return ok value

  let (vtx, vl) = ?db.getVtxRc(rvid, {GetVtxFlag.PeekCache})

  # Top-most level of all the vertices this hash computation depends on
  var level = vl

  # TODO this is the same code as when serializing NodeRef, without the NodeRef
  var writer = initRlpWriter()

  let key =
    case vtx.vType
    of Leaf:
      writer.encodeLeaf(vtx.pfx):
        case vtx.lData.pType
        of AccountData:
          let
            stoID = vtx.lData.stoID
            skey =
              if stoID.isValid:
                # The account key depends on the key of its storage trie whose
                # root vertex also serves as the root id of that trie
                let (skey, sl) =
                  ?db.computeKeyImpl((stoID.vid, stoID.vid), batch, bloom)
                level = maxLevel(level, sl)
                skey
              else:
                VOID_HASH_KEY

          rlp.encode Account(
            nonce: vtx.lData.account.nonce,
            balance: vtx.lData.account.balance,
            storageRoot: skey.to(Hash32),
            codeHash: vtx.lData.account.codeHash,
          )
        of RawData:
          vtx.lData.rawBlob
        of StoData:
          # TODO avoid memory allocation when encoding storage data
          rlp.encode(vtx.lData.stoData)
    of Branch:
      template writeBranch(w: var RlpWriter): HashKey =
        w.encodeBranch:
          # `n` is the branch slot injected by `encodeBranch`
          let vid = vtx.bVid[n]
          if vid.isValid:
            # `enter` / `leave` keep track of the path for progress reporting
            batch.enter(n)
            let (bkey, bl) = ?db.computeKeyImpl((rvid.root, vid), batch, bloom)
            batch.leave(n)
            level = maxLevel(level, bl)
            bkey
          else:
            VOID_HASH_KEY

      if vtx.pfx.len > 0: # Extension node
        # The branch is encoded with a writer of its own and the resulting key
        # is then wrapped in the extension
        writer.encodeExt(vtx.pfx):
          var bwriter = initRlpWriter()
          bwriter.writeBranch()
      else:
        writer.writeBranch()

  # Cache the hash into the same storage layer as the top-most value that it
  # depends on (recursively) - this could be an ephemeral in-memory layer or the
  # underlying database backend - typically, values closer to the root are more
  # likely to live in an in-memory layer since any leaf change will lead to the
  # root key also changing while leaves that have never been hashed will see
  # their hash being saved directly to the backend.
  ?db.putKeyAtLevel(rvid, key, level, batch)

  ok (key, level)
proc computeKeyImpl(
    db: AristoDbRef, rvid: RootedVertexID, bloom: ptr BasicBloomFilter
): Result[HashKey, AristoError] =
  ## Compute the key of `rvid` using a write batch private to this call and
  ## make sure that whatever was batched is written before returning the key.
  ## When the computation fails, its error is returned without flushing.
  var batch: WriteBatch

  let (key, _) = ?computeKeyImpl(db, rvid, batch, bloom)

  ?batch.flush(db)

  if batch.count > 0:
    # Same threshold as for the progress messages: `info` only when a
    # significant number of keys was written
    let significant = batch.count >= batchSize * 100
    if significant:
      info "Wrote computeKey cache", keys = batch.count, accounts = "100.00%"
    else:
      debug "Wrote computeKey cache", keys = batch.count, accounts = "100.00%"

  ok key
proc computeKey*(
    db: AristoDbRef, # Database, top layer
    rvid: RootedVertexID, # Vertex to convert
): Result[HashKey, AristoError] =
  ## Compute the key for an arbitrary vertex ID.
  ##
  ## On success, the resulting key might be shorter than 32 bytes. When it is
  ## to be used as a root vertex state/hash, convert it with `.to(Hash32)`, as
  ## in `db.computeKey(rvid).value.to(Hash32)`, which always results in a
  ## 32 byte value.
  db.computeKeyImpl(rvid, nil) # no bloom filter for regular lookups
proc computeLeafKeysImpl(
    T: type, db: AristoDbRef, root: VertexID
): Result[void, AristoError] =
  ## Seed an empty key cache by walking the backend `T` in on-disk order:
  ## first the leaves, then branches in rounds, and finally a regular trie
  ## traversal from `root` to fill in whatever the rounds could not compute.
  ##
  ## Does nothing if the backend already holds at least one key. A missing
  ## root vertex is not treated as an error; any other failure is returned.
  for x in T.walkKeyBe(db):
    debug "Skipping leaf key computation, cache is not empty"
    return ok()

  # Key computation function that works by iterating over the entries in the
  # database (instead of traversing trie using point lookups) - due to how
  # rocksdb is organised, this cache-friendly traversal order turns out to be
  # more efficient even if we "touch" a lot of irrelevant entries.
  # Computation works bottom-up starting with the leaves and proceeding with
  # branches whose children were computed in the previous round one "layer"
  # at a time until the number of successfully computed nodes grows low.
  # TODO progress indicator
  info "Writing key cache (this may take a while)"

  var batch: WriteBatch

  # Bloom filter keeping track of keys we've added to the database already so
  # as to avoid expensive speculative lookups
  var bloom = BasicBloomFilter.init()
  defer:
    bloom.release()

  var
    # Reuse rlp writers to avoid superfluous memory allocations
    writer = initRlpWriter()
    writer2 = initRlpWriter()
    level = 0

  # Start with leaves - at the time of writing, this is roughly 3/4 of the
  # entries in the database on mainnet - the ratio roughly corresponds to
  # the fill ratio of the deepest branch nodes as nodes close to the MPT root
  # don't come in significant numbers

  for (rvid, vtx) in T.walkVtxBe(db, {Leaf}):
    if vtx.lData.pType == AccountData and vtx.lData.stoID.isValid:
      # Accounts whose key depends on the storage trie typically will not yet
      # have their root node computed and several such contracts are
      # significant in size, meaning that we might as well let their leaves
      # be computed and then top up during regular trie traversal.
      continue

    writer.clear()

    let key = writer.encodeLeaf(vtx.pfx):
      case vtx.lData.pType
      of AccountData:
        writer2.clear()
        writer2.append Account(
          nonce: vtx.lData.account.nonce,
          balance: vtx.lData.account.balance,
          # Accounts with storage were filtered out above - what remains must
          # be encoded exactly like `computeKeyImpl` encodes an account
          # without storage, ie using the storage root of the void key and
          # not an all-zero hash: the keys written here are cached and later
          # returned by `computeKeyImpl` as they are.
          storageRoot: VOID_HASH_KEY.to(Hash32),
          codeHash: vtx.lData.account.codeHash,
        )
        writer2.finish()
      of RawData:
        vtx.lData.rawBlob
      of StoData:
        writer2.clear()
        writer2.append(vtx.lData.stoData)
        writer2.finish()

    ?batch.putKey(db, rvid, key)

    if batch.count mod batchSize == 0:
      ?batch.flush(db)

      if batch.count mod (batchSize * 100) == 0:
        info "Writing leaves", keys = batch.count, level
      else:
        debug "Writing leaves", keys = batch.count, level

    bloom.insert(uint64(rvid.vid))

  let leaves = batch.count

  # The leaves have been written - we'll now proceed to branches expecting
  # diminishing returns for each layer - not only because there are fewer nodes
  # closer to the root in the trie but also because leaves we skipped over lead
  # to larger and larger branch gaps and the advantage of iterating in disk
  # order is lost
  var lastRound = leaves

  level += 1

  # 16*16 looks like "2 levels of MPT" but in reality, the branch nodes close
  # to the leaves are sparse - on average about 4 nodes per branch on mainnet -
  # meaning that we'll do 3-4 levels of branch depending on the network
  while lastRound > (leaves div (16 * 16)):
    info "Starting branch layer", keys = batch.count, lastRound, level
    var round = 0
    for (rvid, vtx) in T.walkVtxBe(db, {Branch}):
      if vtx.pfx.len > 0:
        # TODO there shouldn't be many of these - is it worth the lookup?
        continue

      if level > 1:
        # A hit on the bloom filter here means we **maybe** already computed a
        # key for this branch node - we could verify this with a lookup but
        # the generally low false positive rate makes this check more expensive
        # than simply revisiting the node using trie traversal.
        if bloom.query(uint64(rvid.vid)):
          continue

      block branchKey:
        for b in vtx.bVid:
          if b.isValid and not bloom.query(uint64(b)):
            # If any child is missing from the branch, we can't compute the key
            # trivially
            break branchKey

        writer.clear()
        let key = writer.encodeBranch:
          let vid = vtx.bVid[n]
          if vid.isValid:
            let bkey = db.getKeyUbe((rvid.root, vid)).valueOr:
              # False positive on the bloom filter lookup
              break branchKey
            bkey
          else:
            VOID_HASH_KEY

        ?batch.putKey(db, rvid, key)

        if batch.count mod batchSize == 0:
          ?batch.flush(db)

          if batch.count mod (batchSize * 100) == 0:
            info "Writing branches", keys = batch.count, round, level
          else:
            debug "Writing branches", keys = batch.count, round, level

        round += 1
        bloom.insert(uint64(rvid.vid))

    lastRound = round
    level += 1

  ?batch.flush(db)

  info "Key cache base written",
    keys = batch.count, lastRound, leaves, branches = batch.count - leaves

  # Top up using regular trie traversal - the bloom filter allows skipping the
  # cache lookup for vertices that cannot have been written above
  let rc = computeKeyImpl(db, (root, root), addr bloom)
  if rc.isOk() or rc.error() == GetVtxNotFound:
    # When there's no root vertex, the database is likely empty
    ok()
  else:
    err(rc.error())
proc computeKeys*(db: AristoDbRef, root: VertexID): Result[void, AristoError] =
  ## Pre-processing step that seeds the hash cache when it is empty.
  ##
  ## Filling the cache by traversing the trie can take days because the trie
  ## traversal order does not match the on-disk, VertexID-based sorting.
  ##
  ## This implementation speeds up the initial seeding by walking the full
  ## state in on-disk order and computing hashes bottom-up instead.
  case db.backend.kind
  of BackendVoid:
    # No backend storage, nothing to seed
    ok()
  of BackendMemory:
    computeLeafKeysImpl(MemBackendRef, db, root)
  of BackendRocksDB, BackendRdbHosting:
    computeLeafKeysImpl(RdbBackendRef, db, root)
# ------------------------------------------------------------------------------
# End
# ------------------------------------------------------------------------------