mirror of
synced 2025-03-01 12:20:49 +00:00
Currently, computed hash keys are stored in a separate column family with respect to the MPT data they're generated from - this has several disadvantages:
* A lot of space is wasted because the lookup key (`RootedVertexID`) is repeated in both tables - this is 30% of the `AriKey` content!
* rocksdb must maintain in-memory bloom filters and LRU caches for said keys, doubling its "minimal efficient cache size"
* An extra disk traversal must be made to check for the existence of a cached hash key
* The number of files on disk doubles because each column family is its own set of files

Here, the two CFs are joined such that both key and data are stored in `AriVtx`. This means:
* we save ~30% disk space on repeated lookup keys
* we save ~2gb of memory overhead that can be used to cache data instead of indices
* we can skip storing hash keys for MPT leaf nodes - these are trivial to compute and waste a lot of space - previously they had to be present in the `AriKey` CF to avoid having to look in two tables on the happy path.
* There is a small increase in write amplification because when a hash value is updated for a branch node, we must write both key and branch data - previously we would write only the key
* There's a small shift in CPU usage - instead of performing lookups in the database, hashes for leaf nodes are (re)-computed on the fly
* We can return to slightly smaller on-disk SST files since there are fewer of them, which should reduce disk traffic a bit

Internally, there are also other advantages:
* when clearing keys, we no longer have to store a zero hash in memory - instead, we deduce staleness of the cached key from the presence of an updated VertexRef - this saves ~1gb of mem overhead during import
* hash key cache becomes dedicated to branch keys since leaf keys are no longer stored in memory, reducing churn
* key computation is a lot faster thanks to the skipped second disk traversal - a key computation for mainnet can be completed in 11 hours instead of ~2 days (!) thanks to better cache usage and less read amplification - with additional improvements to the on-disk format, we can probably get rid of the initial full traversal method of seeding the key cache on first start after import

All in all, this PR reduces the size of a mainnet database from 160gb to 110gb and the peak memory footprint during import by ~1-2gb.
522 lines
18 KiB
522 lines
18 KiB
# nimbus-eth1
# Copyright (c) 2023-2024 Status Research & Development GmbH
# Licensed under either of
# * Apache License, version 2.0, ([LICENSE-APACHE](LICENSE-APACHE) or
# http://www.apache.org/licenses/LICENSE-2.0)
# * MIT license ([LICENSE-MIT](LICENSE-MIT) or
# http://opensource.org/licenses/MIT)
# at your option. This file may not be copied, modified, or distributed
# except according to those terms.
{.push raises: [].}
std/[strformat, math, hashes],
"."/[aristo_desc, aristo_get, aristo_serialise, aristo_walk/persistent],
type BasicBloomFilter = object
  ## Textbook bloom filter modelled on
  ## https://github.com/save-buffer/bloomfilter_benchmarks.
  ##
  ## It could in principle be lifted into a reusable component, but it is
  ## tailored to this module's single use case and sits in a tight/hot loop -
  ## generalising it would need care to avoid adding overhead, although there
  ## is certainly room for further optimisation.
  bytes: ptr UncheckedArray[byte]
    ## Unmanaged storage holding the filter bits; a zero-initialised filter
    ## value has no storage attached (`nil`).
func computeBits(n: int, epsilon: float): int =
  ## Number of bits a bloom filter needs in order to hold `n` elements at a
  ## false positive rate of `epsilon`, rounded to the nearest integer.
  ##
  ## This is the textbook sizing `m = -n * log2(epsilon) / ln(2)`, where the
  ## constant `1.4427` approximates `1 / ln(2)`.
  ##
  ## The routine is pure, hence `func`; it stays usable in constant expressions.
  int(-1.4427 * float(n) * log2(epsilon) + 0.5)
func computeHashFns(epsilon: float): int =
  ## Number of hash functions (probes per key) for the desired false positive
  ## rate `epsilon`: `-log2(epsilon)` rounded to the nearest integer.
  ##
  ## The routine is pure, hence `func`; it stays usable in constant expressions.
  int(-log2(epsilon) + 0.5)
const
  # Target false positive rate of the bloom filter.
  # The leaf cache computation is fairly sensitive to false positives as these
  # ripple up the branch trie with false positives being amplified by trie
  # branching - this has to be balanced with the cost which
  # goes up fairly quickly with ~13 bits per key at 0.002, meaning ~2gb of
  # memory for the current setting below!
  bloomRate = 0.002

  # Number of probes per key that follows from the target rate
  bloomHashes = computeHashFns(bloomRate)

  # Expected number of elements in the bloom filter - this is reported as
  # `keys` below and will need adjusting - the value is more or less accurate
  # on mainnet as of ~oct 2024 for the number of leaves present - we use leaf
  # count because bloom filter accuracy is most important for the first round
  # of branches.
  # NOTE(review): the note previously cited "block 2100000" for that date,
  # which looks like a dropped digit (presumably 21000000) - confirm.
  # TODO rocksdb can estimate the number of keys present in the vertex table -
  #      this would provide a reasonable estimate of what the bloom table size
  #      should be, though in reality we want leaf count per above argument -
  #      at the time of writing leaves make up around 3/4 of all vertices
  expectedKeys = 1500000000

  # Filter size in bytes: the required number of bits rounded up to whole
  # bytes. Integer ceiling division (`div`) yields the same value as the
  # former float division followed by truncation, without the float round trip.
  bloomSize = uint32((computeBits(expectedKeys, bloomRate) + 7) div 8)
func hashes(v: uint64): (uint64, uint64) =
  ## Derive the two base hashes from which every probe position of the bloom
  ## filter is generated (double hashing per Kirsch and Mitzenmacher):
  ## https://www.eecs.harvard.edu/~michaelm/postscripts/tr-02-05.pdf
  ##
  ## Both values are full 64-bit quantities: the filter holds more than 2^32
  ## bits, so 32-bit hashes cannot address all of it.
  let
    h1 = uint64(hash(v)) # `hash` for a better spread of bits
    # Second, re-mixed value used as probe stride - forced odd so that it can
    # never be zero, which would make all probes of a key coincide
    h2 = uint64(hash(h1)) or 1'u64
  (h1, h2)

func bloomProbe(
    h1, h2: uint64, i: int
): tuple[byteIdx: uint32, bitIdx: uint8] {.inline.} =
  ## Location of the `i`-th probe: `h1 + i * h2` (wrapping 64-bit arithmetic)
  ## reduced onto the total number of bits in the filter.
  ##
  ## The previous implementation computed the position as a `uint32`, which
  ## meant `position div 8` was always below 2^29 and the `mod bloomSize` had
  ## no effect - only the first 512 MiB of the filter were ever used, pushing
  ## the real false positive rate far above `bloomRate`.
  let bitPos = (h1 + uint64(i) * h2) mod (uint64(bloomSize) * 8)
  # `bitPos shr 3` is below `bloomSize`, hence always fits an `uint32`
  (byteIdx: uint32(bitPos shr 3), bitIdx: uint8(bitPos and 7))

func insert(filter: var BasicBloomFilter, v: uint64) =
  ## Record `v` in the filter by setting the bit of each of its
  ## `bloomHashes` probes.
  let (h1, h2) = hashes(v)

  staticFor i, 0 ..< bloomHashes:
    let (byteIdx, bitIdx) = bloomProbe(h1, h2, i)
    filter.bytes[byteIdx] = filter.bytes[byteIdx] or (1'u8 shl bitIdx)

func query(filter: BasicBloomFilter, v: uint64): bool =
  ## `true` when all probe bits of `v` are set, ie `v` may have been inserted
  ## (false positives possible); `false` when `v` was certainly not inserted.
  let (h1, h2) = hashes(v)

  # Accumulate without branching: stays 1 only while every probed bit is set
  var match = 1'u8

  staticFor i, 0 ..< bloomHashes:
    let (byteIdx, bitIdx) = bloomProbe(h1, h2, i)
    match = match and ((filter.bytes[byteIdx] shr bitIdx) and 1)

  match > 0
proc init(T: type BasicBloomFilter): T =
  ## Create an empty filter backed by `bloomSize` zeroed bytes.
  ##
  ## The storage comes from the C allocator rather than the Nim GC so that the
  ## one-off blob can be returned to the operating system once the computation
  ## is done; `calloc` hands out zeroed memory, so no clearing pass is needed.
  let raw = c_calloc(csize_t(bloomSize), 1)
  doAssert not raw.isNil, "Could not allocate memory for bloom filter"

  T(bytes: cast[ptr UncheckedArray[byte]](raw))
proc release(v: BasicBloomFilter) =
  ## Return the filter's storage to the C allocator it was obtained from in
  ## `init`. The filter must not be used afterwards.
  ##
  ## Without this call the (very large) allocation would never be freed.
  # TODO with orc, this could be a destructor
  c_free(v.bytes)
type
  WriteBatch = tuple
    ## Bookkeeping for writes that are grouped into backend write sessions
    writer: PutHdlRef # currently open backend session, `nil` when none
    count: int # number of vertices written via `putVtx` so far
    depth: int # current nesting depth tracked by `enter`/`leave`
    prefix: uint64 # path prefix accumulated by `enter`/`leave`

const
  # Aim for write batches of _roughly_ 1mb, overhead aside - larger batches
  # trade memory for efficiency, with diminishing returns as they grow..
  batchSize = 1024 * 1024 div (sizeof(RootedVertexID) + sizeof(HashKey))
proc flush(batch: var WriteBatch, db: AristoDbRef): Result[void, AristoError] =
  ## Finish the backend write session held by `batch`, if any, and clear the
  ## handle so that the next `putVtx` opens a fresh session.
  ##
  ## Returns the error reported by `putEndFn`, otherwise success - also when
  ## there was nothing to flush.
  if batch.writer != nil:
    ?db.backend.putEndFn batch.writer
    batch.writer = nil

  # Explicit success: without it the default-initialised result is returned
  ok()
proc putVtx(
    batch: var WriteBatch,
    db: AristoDbRef,
    rvid: RootedVertexID,
    vtx: VertexRef,
    key: HashKey,
): Result[void, AristoError] =
  ## Queue `vtx` together with its hash `key` for writing to the backend,
  ## opening a backend write session first when `batch` does not hold one.
  ##
  ## Returns the error reported by `putBegFn`, otherwise success.
  if batch.writer == nil:
    doAssert db.backend != nil, "source data is from the backend"
    batch.writer = ?db.backend.putBegFn()

  db.backend.putVtxFn(batch.writer, rvid, vtx, key)
  batch.count += 1

  # Explicit success: without it the default-initialised result is returned
  ok()
func progress(batch: WriteBatch): string =
  ## Rough share of the keyspace covered so far, formatted as a percentage.
  ## It is derived from the path prefix currently being processed, expressed
  ## as a fraction of the largest possible 64-bit prefix.
  result = &"{(float(batch.prefix) / float(uint64.high)) * 100:02.2f}%"
func enter(batch: var WriteBatch, nibble: int) =
  ## Descend one level through child `nibble`. The first 16 levels each
  ## contribute 4 bits to `prefix`, most significant first; deeper levels only
  ## bump the depth counter.
  inc batch.depth
  if batch.depth > 16:
    return

  let shift = (16 - batch.depth) * 4
  batch.prefix += uint64(nibble) shl shift
func leave(batch: var WriteBatch, nibble: int) =
  ## Undo the matching `enter`: remove the contribution of `nibble` from
  ## `prefix` (only the first 16 levels have one) and step back up one level.
  if batch.depth <= 16:
    let shift = (16 - batch.depth) * 4
    batch.prefix -= uint64(nibble) shl shift

  dec batch.depth
proc putKeyAtLevel(
    db: AristoDbRef,
    rvid: RootedVertexID,
    vtx: VertexRef,
    key: HashKey,
    level: int,
    batch: var WriteBatch,
): Result[void, AristoError] =
  ## Store a hash key in the given layer or directly to the underlying database
  ## which helps ensure that memory usage is proportional to the pending change
  ## set (vertex data may have been committed to disk without computing the
  ## corresponding hash!)
  ##
  ## `level == -2` takes the backend path via `batch`; any other level updates
  ## the tables of `db.deltaAtLevel(level)`. Returns the first backend error
  ## encountered, otherwise success.
  if level == -2:
    ?batch.putVtx(db, rvid, vtx, key)

    if batch.count mod batchSize == 0:
      # The batch reached its target size - commit it so that pending writes
      # don't pile up in memory
      ?batch.flush(db)

      # Every 100th batch is reported more prominently, the rest at debug level
      if batch.count mod (batchSize * 100) == 0:
        info "Writing computeKey cache", keys = batch.count, accounts = batch.progress
      else:
        debug "Writing computeKey cache", keys = batch.count, accounts = batch.progress
  else:
    # Either/or: values headed for the backend must not also be kept in memory
    db.deltaAtLevel(level).sTab[rvid] = vtx
    db.deltaAtLevel(level).kMap[rvid] = key

  ok()
func maxLevel(cur, other: int): int =
  ## Compare two levels and return the topmost in the stack, taking into account
  ## the odd reversal of order around the zero point:
  ##
  ## * any level `>= 0` is more topmost than any level `< 0`
  ## * among negative levels, the larger one is more topmost
  ## * among non-negative levels the order is reversed - `0` is the top layer
  if cur < 0:
    max(cur, other) # >= 0 is always more topmost than <0
  elif other < 0:
    cur # `cur` is >= 0 here and therefore wins over a negative `other`
  else:
    min(cur, other) # Here the order is reversed and 0 is the top layer
# RLP-encodes a leaf node into `w`: the visible statement appends the
# hex-prefix form of the path `pfx` with the leaf flag set.
# NOTE(review): `leafData` is not used and no `HashKey` is produced by the
# lines visible here - the remaining statements of this template appear to be
# missing from this view; confirm against the complete file.
template encodeLeaf(w: var RlpWriter, pfx: NibblesBuf, leafData: untyped): HashKey =
w.append(pfx.toHexPrefix(isLeaf = true).data())
# RLP-encodes a branch node into `w`. The loop variable `n` (0 .. 15) is
# injected so that the `subKeyForN` expression supplied by the caller can refer
# to the child index; an empty blob is appended as well.
# NOTE(review): the statement using `subKeyForN` and the one producing the
# resulting `HashKey` are not visible in this view - confirm against the
# complete file.
template encodeBranch(w: var RlpWriter, subKeyForN: untyped): HashKey =
for n {.inject.} in 0 .. 15:
w.append EmptyBlob
# RLP-encodes an extension node into `w`: the visible statement appends the
# hex-prefix form of the path `pfx` with the leaf flag cleared.
# NOTE(review): `branchKey` is not used and no `HashKey` is produced by the
# lines visible here - the remaining statements of this template appear to be
# missing from this view; confirm against the complete file.
template encodeExt(w: var RlpWriter, pfx: NibblesBuf, branchKey: HashKey): HashKey =
w.append(pfx.toHexPrefix(isLeaf = false).data())
# Recursively computes the hash key of vertex `rvid`. On success the result is
# the key paired with the top-most level among everything the key depends on
# (combined via `maxLevel`); errors from lookups and nested computations are
# propagated with `?`.
#
# `batch` collects backend writes issued through `putKeyAtLevel`; `bloom` is
# optional and passed on unchanged to the recursive calls.
#
# NOTE(review): this view of the proc is not contiguous - several statements
# (among them whatever binds `value` below, and the calls that actually encode
# leaves, branches and extensions) are not visible. The comments describe only
# what can be seen; confirm against the complete file.
proc computeKeyImpl(
db: AristoDbRef,
rvid: RootedVertexID,
batch: var WriteBatch,
bloom: ptr BasicBloomFilter = nil,
): Result[(HashKey, int), AristoError] =
# The bloom filter is available / used only when creating the key cache from
# an empty state
# Without a filter the cached value is always consulted; with a filter, only
# when the filter reports that the vertex id may have been seen
if bloom == nil or bloom[].query(uint64(rvid.vid)):
# Value cached either in layers or database
return ok value
# Cache miss: fetch the vertex together with the level it was found at
let (vtx, vl) = ?db.getVtxRc(rvid, {GetVtxFlag.PeekCache})
# Top-most level of all the vertices this hash computation depends on
var level = vl
# TODO this is the same code as when serializing NodeRef, without the NodeRef
var writer = initRlpWriter()
let key =
case vtx.vType
of Leaf:
case vtx.lData.pType
of AccountData:
stoID = vtx.lData.stoID
skey =
# An account with a valid storage id depends on the key of its storage trie,
# whose root vertex is addressed as (stoID.vid, stoID.vid)
if stoID.isValid:
let (skey, sl) =
?db.computeKeyImpl((stoID.vid, stoID.vid), batch, bloom)
level = maxLevel(level, sl)
rlp.encode Account(
nonce: vtx.lData.account.nonce,
balance: vtx.lData.account.balance,
storageRoot: skey.to(Hash32),
codeHash: vtx.lData.account.codeHash,
of StoData:
# TODO avoid memory allocation when encoding storage data
of Branch:
template writeBranch(w: var RlpWriter): HashKey =
let vid = vtx.bVid[n]
# Only valid child ids are recursed into, within the same root
if vid.isValid:
let (bkey, bl) = ?db.computeKeyImpl((rvid.root, vid), batch, bloom)
level = maxLevel(level, bl)
if vtx.pfx.len > 0: # Extension node
var bwriter = initRlpWriter()
# Cache the hash into the same storage layer as the top-most value that it
# depends on (recursively) - this could be an ephemeral in-memory layer or the
# underlying database backend - typically, values closer to the root are more
# likely to live in an in-memory layer since any leaf change will lead to the
# root key also changing while leaves that have never been hashed will see
# their hash being saved directly to the backend.
# Leaf keys are never stored
if vtx.vType != Leaf:
?db.putKeyAtLevel(rvid, vtx, key, level, batch)
ok (key, level)
proc computeKeyImpl(
    db: AristoDbRef, rvid: RootedVertexID, bloom: ptr BasicBloomFilter
): Result[HashKey, AristoError] =
  ## Compute the key of `rvid` using a private write batch and return just the
  ## key, dropping the level reported by the recursive implementation.
  ##
  ## On success, writes still pending in the batch are committed before
  ## returning; the error of a failed computation or flush is propagated.
  var batch: WriteBatch
  let res = computeKeyImpl(db, rvid, batch, bloom)
  if res.isOk:
    # Commit the last, partially filled batch - it would otherwise be lost
    ?batch.flush(db)

    if batch.count > 0:
      # Report large runs prominently, small ones at debug level only
      if batch.count >= batchSize * 100:
        info "Wrote computeKey cache", keys = batch.count, accounts = "100.00%"
      else:
        debug "Wrote computeKey cache", keys = batch.count, accounts = "100.00%"

  ok (?res)[0]
proc computeKey*(
    db: AristoDbRef, # Database, top layer
    rvid: RootedVertexID, # Vertex to convert
): Result[HashKey, AristoError] =
  ## Compute the key of an arbitrary vertex ID.
  ##
  ## A key returned on success may be shorter than 32 bytes. When it serves as
  ## a root vertex state/hash, convert it with `.to(Hash32)` - as in
  ## `db.computeKey(rvid).value.to(Hash32)` - which always yields a 32 byte
  ## value.
  db.computeKeyImpl(rvid, nil)
# Seeds the hash key cache for the trie rooted at `root` by walking the backend
# of kind `T` in on-disk order, then finishes with a regular key computation of
# the root vertex.
#
# NOTE(review): this view of the proc is not contiguous - a number of
# statements (variable section keywords, `else` branches, the bloom filter
# inserts, flushes and the final result among them) are not visible. The
# comments describe only what can be seen; confirm against the complete file.
proc computeLeafKeysImpl(
T: type, db: AristoDbRef, root: VertexID
): Result[void, AristoError] =
# Key computation function that works by iterating over the entries in the
# database (instead of traversing trie using point lookups) - due to how
# rocksdb is organised, this cache-friendly traversal order turns out to be
# more efficient even if we "touch" a lot of irrelevant entries.
# Computation works bottom-up starting with the leaves and proceeding with
# branches whose children were computed in the previous round one "layer"
# at a time until the number of successfully computed nodes grows low.
# TODO progress indicator
if db.getKeyUbe((root, root)).isOk():
return ok() # Fast path for when the root is in the database already
# Smoke check to see if we can find lots of branch nodes with keys already
var branches, found: int
for (rvid, vtx) in T.walkVtxBe(db, {Branch}):
branches += 1
if db.getKeyUbe(rvid).isOk:
found += 1
# 10% found on the initial sample.. good enough? Some more randomness
# here would maybe make sense
if branches > 1000:
if found * 10 > branches:
return ok()
info "Writing key cache (this may take a while)"
var batch: WriteBatch
# Bloom filter keeping track of keys we've added to the database already so
# as to avoid expensive speculative lookups
# NOTE(review): no matching `release` of the filter is visible in this view -
# confirm that the allocation is freed on every exit path (e.g. via `defer`)
var bloom = BasicBloomFilter.init()
# Reuse rlp writers to avoid superfluous memory allocations
writer = initRlpWriter()
writer2 = initRlpWriter()
writer3 = initRlpWriter()
level = 0
leaves = 0
# Load leaves into bloom filter so we can quickly skip over branch nodes where
# we know the lookup will fail.
# At the time of writing, this is roughly 3/4 of the entries in the
# database on mainnet - the ratio roughly corresponds to the fill ratio of the
# deepest branch nodes as nodes close to the MPT root don't come in
# significant numbers
# Leaf keys are not computed to save space - instead, if they are needed they
# are computed from the leaf data.
for (rvid, vtx) in T.walkVtxBe(db, {Leaf}):
if vtx.lData.pType == AccountData and vtx.lData.stoID.isValid:
# Accounts whose key depends on the storage trie typically will not yet
# have their root node computed and several such contracts are
# significant in size, meaning that we might as well let their leaves
# be computed and then top up during regular trie traversal.
leaves += 1
# The leaves have been loaded into the bloom filter - we'll now proceed to
# branches expecting diminishing returns for each layer - not only because
# there are fewer nodes closer to the root in the trie but also because leaves
# we skipped over lead to larger and larger branch gaps and the advantage of
# iterating in disk order is lost
var lastRound = leaves
level += 1
# 16*16 looks like "2 levels of MPT" but in reality, the branch nodes close
# to the leaves are sparse - on average about 4 nodes per branch on mainnet -
# meaning that we'll do 3-4 levels of branch depending on the network
var branches = 0
# Keep adding branch layers while the previous round still produced more than
# 1/256th of the leaf count
while lastRound > (leaves div (16 * 16)):
info "Starting branch layer", keys = batch.count, lastRound, level
var round = 0
branches = 0
for (rvid, vtx) in T.walkVtxBe(db, {Branch}):
branches += 1
if vtx.pfx.len > 0:
# TODO there shouldn't be many extension nodes - is it worth the lookup?
if level > 1:
# A hit on the bloom filter here means we **maybe** already computed a
# key for this branch node - we could verify this with a lookup but
# the generally low false positive rate makes this check more expensive
# than simply revisiting the node using trie traversal.
if bloom.query(uint64(rvid.vid)):
# `branchKey` is left early whenever the key of this branch cannot be
# computed from what is available in this round
block branchKey:
for b in vtx.bVid:
if b.isValid and not bloom.query(uint64(b)):
# If any child is missing from the branch, we can't compute the key
# trivially
break branchKey
let key = writer.encodeBranch:
let vid = vtx.bVid[n]
if vid.isValid:
let bkeyOpt =
if level == 1: # No leaf keys in database
Result[HashKey, AristoError].err(GetKeyNotFound)
db.getKeyUbe((rvid.root, vid))
let bvtx = db.getVtxUbe((rvid.root, vid)).valueOr:
# Incomplete database?
break branchKey
if bvtx == nil or (
bvtx.vType == Leaf and bvtx.lData.pType == AccountData and
# It's unlikely storage root key has been computed already, so
# skip
# TODO maybe worth revisiting - a not insignificant number of
#      contracts have only a leaf storage slot so for those we
#      could trivially compute account storage root..
break branchKey
case bvtx.vType
of Leaf:
case bvtx.lData.pType
of AccountData:
writer3.append Account(
nonce: bvtx.lData.account.nonce,
balance: bvtx.lData.account.balance,
# Accounts with storage filtered out above
storageRoot: EMPTY_ROOT_HASH,
codeHash: bvtx.lData.account.codeHash,
of StoData:
of Branch:
break branchKey
# Store the branch together with its freshly computed key
?batch.putVtx(db, rvid, vtx, key)
if batch.count mod batchSize == 0:
if batch.count mod (batchSize * 100) == 0:
info "Writing branches", keys = batch.count, round, level
debug "Writing branches", keys = batch.count, round, level
round += 1
lastRound = round
level += 1
info "Key cache base written",
keys = batch.count, lastRound, leaves, branches
# Top up whatever is still missing with a regular traversal from the root,
# passing the filter along
let rc = computeKeyImpl(db, (root, root), addr bloom)
if rc.isOk() or rc.error() == GetVtxNotFound:
# When there's no root vertex, the database is likely empty
proc computeKeys*(db: AristoDbRef, root: VertexID): Result[void, AristoError] =
  ## Computing the leaf keys is a pre-processing step for when hash cache is
  ## empty.
  ##
  ## Computing it by traversing the trie can take days because of the mismatch
  ## between trie traversal order and the on-disk VertexID-based sorting.
  ##
  ## This implementation speeds up the initial seeding of the cache by
  ## traversing the full state in on-disk order and computing hashes bottom-up
  ## instead.
  ##
  ## The work is dispatched on the kind of backend attached to `db`; the error
  ## of the selected implementation is passed through.
  case db.backend.kind
  of BackendMemory:
    MemBackendRef.computeLeafKeysImpl db, root
  of BackendRocksDB, BackendRdbHosting:
    RdbBackendRef.computeLeafKeysImpl db, root
  of BackendVoid:
    # No backend data to walk - nothing to pre-compute
    ok()
# ------------------------------------------------------------------------------
# End
# ------------------------------------------------------------------------------