# nimbus-eth1
# Copyright (c) 2023-2024 Status Research & Development GmbH
# Licensed under either of
#  * Apache License, version 2.0, ([LICENSE-APACHE](LICENSE-APACHE) or
#    http://www.apache.org/licenses/LICENSE-2.0)
#  * MIT license ([LICENSE-MIT](LICENSE-MIT) or
#    http://opensource.org/licenses/MIT)
# at your option. This file may not be copied, modified, or distributed
# except according to those terms.

{.push raises: [].}

import
  system/ansi_c,
  std/[strformat, math, hashes],
  stew/staticfor,
  chronicles,
  eth/common,
  results,
  "."/[aristo_desc, aristo_get, aristo_serialise, aristo_walk/persistent],
  ./aristo_desc/desc_backend

type BasicBloomFilter = object
  # School book implementation of bloom filter based on
  # https://github.com/save-buffer/bloomfilter_benchmarks.
  #
  # In theory, this bloom filter could be turned into a reusable component but
  # it is fairly specialised to the particular use case and gets used in a
  # tight/hot loop in the code - a generalisation would require care so as not
  # to introduce overhead but could of course be further optimised.
  bytes: ptr UncheckedArray[byte]

proc computeBits(n: int, epsilon: float): int =
  # Number of bits in the bloom filter required for n elements and epsilon
  # false positive rate
  int(-1.4427 * float(n) * log2(epsilon) + 0.5)

proc computeHashFns(epsilon: float): int =
  # Number of hash functions given the desired false positive rate
  int(-log2(epsilon) + 0.5)
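
# The two procs above follow the standard bloom filter sizing formulas: for n
# elements and target false positive rate epsilon, the optimal bit count is
# m = -n * ln(epsilon) / (ln 2)^2 = -1.4427 * n * log2(epsilon), and the
# optimal number of hash functions is k = (m / n) * ln 2 = -log2(epsilon);
# the `+ 0.5` terms round to the nearest integer.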

const
  bloomRate = 0.002
    # The leaf cache computation is fairly sensitive to false positives as these
    # ripple up the branch trie with false positives being amplified by trie
    # branching - this has to be balanced with the cost which
    # goes up fairly quickly with ~13 bits per key at 0.002, meaning ~2gb of
    # memory for the current setting below!
  bloomHashes = computeHashFns(bloomRate)
  expectedKeys = 1500000000
    # expected number of elements in the bloom filter - this is reported as
    # `keys` below and will need adjusting - the value is more or less accurate
    # on mainnet as of block 21000000 (~oct 2024) for the number of leaves
    # present - we use leaf count because bloom filter accuracy is most
    # important for the first round of branches.
    # TODO rocksdb can estimate the number of keys present in the vertex table -
    #   this would provide a reasonable estimate of what the bloom table size
    #   should be, though in reality we want leaf count per the above argument -
    #   at the time of writing leaves make up around 3/4 of all vertices
  bloomSize = uint32((computeBits(expectedKeys, bloomRate) + 7) div 8)
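
# Worked example for the constants above (for illustration only): with
# epsilon = 0.002, log2(epsilon) ~= -8.97, so computeBits(1500000000, 0.002)
# yields ~1.4427 * 1.5e9 * 8.97 ~= 19.4e9 bits (~2.4 GB, i.e. the ~13 bits
# per key mentioned above) and computeHashFns(0.002) yields
# int(8.97 + 0.5) = 9 hash functions.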

func hashes(v: uint64): (uint32, uint32) =
  # Use the two halves of an uint64 to create two independent hash functions
  # for the bloom filter that allow efficiently generating more bloom hash
  # functions per Kirsch and Mitzenmacher:
  # https://www.eecs.harvard.edu/~michaelm/postscripts/tr-02-05.pdf
  let
    v = uint64(hash(v)) # `hash` for a better spread of bits into upper half
    h1 = uint32(v)
    h2 = uint32(v shr 32)
  (h1, h2)
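
# Example of the double-hashing scheme (for illustration only): if the
# rehashed value is 0x0123456789ABCDEF, then h1 = 0x89ABCDEF (low half) and
# h2 = 0x01234567 (high half) - the i-th derived hash used in insert/query
# below is then simply h1 + i * h2.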

func insert(filter: var BasicBloomFilter, v: uint64) =
  let (h1, h2) = hashes(v)

  staticFor i, 0 ..< bloomHashes:
    let
      hash = (h1 + i * h2)
      bitIdx = uint8(hash mod 8)
      byteIdx = (hash div 8) mod bloomSize
    filter.bytes[byteIdx] = filter.bytes[byteIdx] or (1'u8 shl bitIdx)

func query(filter: BasicBloomFilter, v: uint64): bool =
  let (h1, h2) = hashes(v)

  var match = 1'u8

  staticFor i, 0 ..< bloomHashes:
    let
      hash = (h1 + i * h2)
      bitIdx = uint8(hash mod 8)
      byteIdx = (hash div 8) mod bloomSize
    match = match and ((filter.bytes[byteIdx] shr bitIdx) and 1)

  match > 0

proc init(T: type BasicBloomFilter): T =
  # We use the C memory allocator so as to return memory to the operating system
  # at the end of the computation - we don't want the one-off blob to remain in
  # the hands of the Nim GC.
  # `calloc` to get zeroed memory out of the box
  let memory = c_calloc(csize_t(bloomSize), 1)
  doAssert memory != nil, "Could not allocate memory for bloom filter"
  T(bytes: cast[ptr UncheckedArray[byte]](memory))

proc release(v: BasicBloomFilter) =
  # TODO with orc, this could be a destructor
  c_free(v.bytes)

type WriteBatch = tuple[writer: PutHdlRef, count: int, depth: int, prefix: uint64]

# Keep write batch size _around_ 1mb, give or take some overhead - this is a
# tradeoff between efficiency and memory usage with diminishing returns the
# larger it is.
const batchSize = 1024 * 1024 div (sizeof(RootedVertexID) + sizeof(HashKey))
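
# For a rough sense of scale (illustrative, under assumed type sizes):
# if sizeof(RootedVertexID) == 16 (two 8-byte IDs) and sizeof(HashKey) == 33
# (length byte + 32-byte hash), this works out to ~21000 entries per batch -
# the exact figure depends on the actual type layouts and alignment.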

proc flush(batch: var WriteBatch, db: AristoDbRef): Result[void, AristoError] =
  if batch.writer != nil:
    ?db.backend.putEndFn batch.writer
    batch.writer = nil
  ok()

proc putKey(
    batch: var WriteBatch, db: AristoDbRef, rvid: RootedVertexID, key: HashKey
): Result[void, AristoError] =
  if batch.writer == nil:
    doAssert db.backend != nil, "source data is from the backend"
    batch.writer = ?db.backend.putBegFn()

  db.backend.putKeyFn(batch.writer, rvid, key)
  batch.count += 1

  ok()

func progress(batch: WriteBatch): string =
  # Return an approximation of how much of the keyspace has been covered by
  # looking at the path prefix that we're currently processing
  &"{(float(batch.prefix) / float(uint64.high)) * 100:02.2f}%"

func enter(batch: var WriteBatch, nibble: int) =
  batch.depth += 1
  if batch.depth <= 16:
    batch.prefix += uint64(nibble) shl ((16 - batch.depth) * 4)

func leave(batch: var WriteBatch, nibble: int) =
  if batch.depth <= 16:
    batch.prefix -= uint64(nibble) shl ((16 - batch.depth) * 4)
  batch.depth -= 1
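
# Worked example of the prefix bookkeeping above (for illustration only):
# entering nibble 3 at the root (depth 1) sets prefix = 3 shl 60, so
# `progress` reports 3/16 ~= 18.75% - each deeper nibble refines the estimate
# in ever smaller steps, and nibbles below depth 16 no longer affect the
# prefix.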

proc putKeyAtLevel(
    db: AristoDbRef,
    rvid: RootedVertexID,
    key: HashKey,
    level: int,
    batch: var WriteBatch,
): Result[void, AristoError] =
  ## Store a hash key in the given layer or directly to the underlying database
  ## which helps ensure that memory usage is proportional to the pending change
  ## set (vertex data may have been committed to disk without computing the
  ## corresponding hash!)

  # Only put computed keys in the database which keeps churn down by focusing on
  # the ones that do not change!
  if level == -2:
    ?batch.putKey(db, rvid, key)

    if batch.count mod batchSize == 0:
      ?batch.flush(db)

      if batch.count mod (batchSize * 100) == 0:
        info "Writing computeKey cache", keys = batch.count, accounts = batch.progress
      else:
        debug "Writing computeKey cache", keys = batch.count, accounts = batch.progress
  else:
    db.deltaAtLevel(level).kMap[rvid] = key

  ok()

func maxLevel(cur, other: int): int =
  # Compare two levels and return the topmost in the stack, taking into account
  # the odd reversal of order around the zero point
  if cur < 0:
    max(cur, other) # >= 0 is always more topmost than <0
  elif other < 0:
    cur
  else:
    min(cur, other) # Here the order is reversed and 0 is the top layer
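
# For example, maxLevel(-2, 0) == 0 (an in-memory layer at 0 sits above the
# backend database at -2) while maxLevel(1, 3) == 1 (among in-memory layers,
# the lower number is closer to the top of the stack).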

template encodeLeaf(w: var RlpWriter, pfx: NibblesBuf, leafData: untyped): HashKey =
  w.startList(2)
  w.append(pfx.toHexPrefix(isLeaf = true).data())
  w.append(leafData)
  w.finish().digestTo(HashKey)

template encodeBranch(w: var RlpWriter, subKeyForN: untyped): HashKey =
  w.startList(17)
  for n {.inject.} in 0 .. 15:
    w.append(subKeyForN)
  w.append EmptyBlob
  w.finish().digestTo(HashKey)

template encodeExt(w: var RlpWriter, pfx: NibblesBuf, branchKey: HashKey): HashKey =
  w.startList(2)
  w.append(pfx.toHexPrefix(isLeaf = false).data())
  w.append(branchKey)
  w.finish().digestTo(HashKey)
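
# These templates mirror the standard hexary MPT node encodings: leaf and
# extension nodes are 2-item RLP lists of a hex-prefix-encoded path and a
# payload, while a branch is a 17-item list of the 16 child keys plus an
# (unused here) value slot - e.g. an even-length leaf path gets the
# hex-prefix byte 0x20 while an odd-length extension path starts with the
# nibble 0x1.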

proc computeKeyImpl(
    db: AristoDbRef,
    rvid: RootedVertexID,
    batch: var WriteBatch,
    bloom: ptr BasicBloomFilter = nil,
): Result[(HashKey, int), AristoError] =
  # The bloom filter is used only when creating the key cache from an
  # empty state
  if bloom == nil or bloom[].query(uint64(rvid.vid)):
    db.getKeyRc(rvid).isErrOr:
      # Value cached either in layers or database
      return ok value

  let (vtx, vl) = ?db.getVtxRc(rvid, {GetVtxFlag.PeekCache})

  # Top-most level of all the vertices this hash computation depends on
  var level = vl

  # TODO this is the same code as when serializing NodeRef, without the NodeRef
  var writer = initRlpWriter()

  let key =
    case vtx.vType
    of Leaf:
      writer.encodeLeaf(vtx.pfx):
        case vtx.lData.pType
        of AccountData:
          let
            stoID = vtx.lData.stoID
            skey =
              if stoID.isValid:
                let (skey, sl) =
                  ?db.computeKeyImpl((stoID.vid, stoID.vid), batch, bloom)
                level = maxLevel(level, sl)
                skey
              else:
                VOID_HASH_KEY

          rlp.encode Account(
            nonce: vtx.lData.account.nonce,
            balance: vtx.lData.account.balance,
            storageRoot: skey.to(Hash32),
            codeHash: vtx.lData.account.codeHash,
          )
        of RawData:
          vtx.lData.rawBlob
        of StoData:
          # TODO avoid memory allocation when encoding storage data
          rlp.encode(vtx.lData.stoData)
    of Branch:
      template writeBranch(w: var RlpWriter): HashKey =
        w.encodeBranch:
          let vid = vtx.bVid[n]
          if vid.isValid:
            batch.enter(n)
            let (bkey, bl) = ?db.computeKeyImpl((rvid.root, vid), batch, bloom)
            batch.leave(n)

            level = maxLevel(level, bl)
            bkey
          else:
            VOID_HASH_KEY

      if vtx.pfx.len > 0: # Extension node
        writer.encodeExt(vtx.pfx):
          var bwriter = initRlpWriter()
          bwriter.writeBranch()
      else:
        writer.writeBranch()

  # Cache the hash into the same storage layer as the top-most value that it
  # depends on (recursively) - this could be an ephemeral in-memory layer or the
  # underlying database backend - typically, values closer to the root are more
  # likely to live in an in-memory layer since any leaf change will lead to the
  # root key also changing while leaves that have never been hashed will see
  # their hash being saved directly to the backend.
  ?db.putKeyAtLevel(rvid, key, level, batch)

  ok (key, level)
proc computeKeyImpl(
    db: AristoDbRef, rvid: RootedVertexID, bloom: ptr BasicBloomFilter
): Result[HashKey, AristoError] =
  var batch: WriteBatch
  let res = computeKeyImpl(db, rvid, batch, bloom)
  if res.isOk:
    ?batch.flush(db)

    if batch.count > 0:
      if batch.count >= batchSize * 100:
        info "Wrote computeKey cache", keys = batch.count, accounts = "100.00%"
      else:
        debug "Wrote computeKey cache", keys = batch.count, accounts = "100.00%"

  ok (?res)[0]

proc computeKey*(
    db: AristoDbRef, # Database, top layer
    rvid: RootedVertexID, # Vertex to convert
): Result[HashKey, AristoError] =
  ## Compute the key for an arbitrary vertex ID. If successful, the length of
  ## the resulting key might be smaller than 32. If it is used as a root vertex
  ## state/hash, it must be converted to a `Hash32` (using `.to(Hash32)`) as
  ## in `db.computeKey(rvid).value.to(Hash32)` which always results in a
  ## 32 byte value.

  computeKeyImpl(db, rvid, nil)

proc computeLeafKeysImpl(
    T: type, db: AristoDbRef, root: VertexID
): Result[void, AristoError] =
  for x in T.walkKeyBe(db):
    debug "Skipping leaf key computation, cache is not empty"
    return ok()

  # Key computation function that works by iterating over the entries in the
  # database (instead of traversing the trie using point lookups) - due to how
  # rocksdb is organised, this cache-friendly traversal order turns out to be
  # more efficient even if we "touch" a lot of irrelevant entries.
  # Computation works bottom-up starting with the leaves and proceeding with
  # branches whose children were computed in the previous round, one "layer"
  # at a time, until the number of successfully computed nodes grows low.
  # TODO progress indicator
  info "Writing key cache (this may take a while)"

  var batch: WriteBatch

  # Bloom filter keeping track of keys we've added to the database already so
  # as to avoid expensive speculative lookups
  var bloom = BasicBloomFilter.init()
  defer:
    bloom.release()

  var
    # Reuse rlp writers to avoid superfluous memory allocations
    writer = initRlpWriter()
    writer2 = initRlpWriter()
    level = 0

  # Start with leaves - at the time of writing, this is roughly 3/4 of the
  # entries in the database on mainnet - the ratio roughly corresponds to
  # the fill ratio of the deepest branch nodes as nodes close to the MPT root
  # don't come in significant numbers
  for (rvid, vtx) in T.walkVtxBe(db, {Leaf}):
    if vtx.lData.pType == AccountData and vtx.lData.stoID.isValid:
      # Accounts whose key depends on the storage trie typically will not yet
      # have their root node computed and several such contracts are
      # significant in size, meaning that we might as well let their leaves
      # be computed and then top up during regular trie traversal.
      continue

    writer.clear()

    let key = writer.encodeLeaf(vtx.pfx):
      case vtx.lData.pType
      of AccountData:
        writer2.clear()
        writer2.append Account(
          nonce: vtx.lData.account.nonce,
          balance: vtx.lData.account.balance,
          # Accounts with storage filtered out above
          storageRoot: EMPTY_ROOT_HASH,
          codeHash: vtx.lData.account.codeHash,
        )
        writer2.finish()
      of RawData:
        vtx.lData.rawBlob
      of StoData:
        writer2.clear()
        writer2.append(vtx.lData.stoData)
        writer2.finish()

    ?batch.putKey(db, rvid, key)

    if batch.count mod batchSize == 0:
      ?batch.flush(db)

      if batch.count mod (batchSize * 100) == 0:
        info "Writing leaves", keys = batch.count, level
      else:
        debug "Writing leaves", keys = batch.count, level

    bloom.insert(uint64(rvid.vid))

  let leaves = batch.count

  # The leaves have been written - we'll now proceed to branches expecting
  # diminishing returns for each layer - not only because there are fewer nodes
  # closer to the root in the trie but also because leaves we skipped over lead
  # to larger and larger branch gaps and the advantage of iterating in disk
  # order is lost
  var lastRound = leaves

  level += 1

  # 16*16 looks like "2 levels of MPT" but in reality, the branch nodes close
  # to the leaves are sparse - on average about 4 nodes per branch on mainnet -
  # meaning that we'll do 3-4 levels of branch depending on the network
  while lastRound > (leaves div (16 * 16)):
    info "Starting branch layer", keys = batch.count, lastRound, level
    var round = 0
    for (rvid, vtx) in T.walkVtxBe(db, {Branch}):
      if vtx.pfx.len > 0:
        # TODO there shouldn't be many of these - is it worth the lookup?
        continue

      if level > 1:
        # A hit on the bloom filter here means we **maybe** already computed a
        # key for this branch node - we could verify this with a lookup but
        # the generally low false positive rate makes this check more expensive
        # than simply revisiting the node using trie traversal.
        if bloom.query(uint64(rvid.vid)):
          continue

      block branchKey:
        for b in vtx.bVid:
          if b.isValid and not bloom.query(uint64(b)):
            # If any child is missing from the branch, we can't compute the key
            # trivially
            break branchKey

        writer.clear()
        let key = writer.encodeBranch:
          let vid = vtx.bVid[n]
          if vid.isValid:
            let bkey = db.getKeyUbe((rvid.root, vid)).valueOr:
              # False positive on the bloom filter lookup
              break branchKey
            bkey
          else:
            VOID_HASH_KEY

        ?batch.putKey(db, rvid, key)

        if batch.count mod batchSize == 0:
          ?batch.flush(db)
          if batch.count mod (batchSize * 100) == 0:
            info "Writing branches", keys = batch.count, round, level
          else:
            debug "Writing branches", keys = batch.count, round, level

        round += 1

        bloom.insert(uint64(rvid.vid))

    lastRound = round
    level += 1

  ?batch.flush(db)

  info "Key cache base written",
    keys = batch.count, lastRound, leaves, branches = batch.count - leaves

  let rc = computeKeyImpl(db, (root, root), addr bloom)
  if rc.isOk() or rc.error() == GetVtxNotFound:
    # When there's no root vertex, the database is likely empty
    ok()
  else:
    err(rc.error())

proc computeKeys*(db: AristoDbRef, root: VertexID): Result[void, AristoError] =
  ## Computing the leaf keys is a pre-processing step for when the hash cache
  ## is empty.
  ##
  ## Computing it by traversing the trie can take days because of the mismatch
  ## between trie traversal order and the on-disk VertexID-based sorting.
  ##
  ## This implementation speeds up the initial seeding of the cache by
  ## traversing the full state in on-disk order and computing hashes bottom-up
  ## instead.
  case db.backend.kind
  of BackendMemory:
    MemBackendRef.computeLeafKeysImpl db, root
  of BackendRocksDB, BackendRdbHosting:
    RdbBackendRef.computeLeafKeysImpl db, root
  of BackendVoid:
    ok()

# ------------------------------------------------------------------------------
# End
# ------------------------------------------------------------------------------