lru cache updates (#2590)

* replace rocksdb row cache with larger rdb lru caches - these serve the
same purpose but are more efficient because they skip serialization,
locking and rocksdb layering
* don't append freshly written items to the cache - doing so evicts
existing entries and replaces them with low-value ones that might never
be read; during write-heavy processing, the newly added entries were
themselves evicted before being used, still within the store loop (see
the sketch after this list)
* allow tuning rdb lru size at runtime
* add (hidden) option to print lru stats at exit (replacing the
compile-time flag)
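
As a rough sketch of the read/write policy described above - the key/value types and capacity below are made up for illustration, while the real caches are the `KeyedQueue[VertexID,HashKey]` / `KeyedQueue[VertexID,VertexRef]` instances shown in the diff:

```nim
# Minimal sketch, not the actual rdb cache code - uses stew/keyed_queue
# (the same LRU type as the rdb caches) with stand-in uint64/string data.
import results, stew/keyed_queue

const maxEntries = 4 # illustrative capacity; the real caches hold millions

var cache: KeyedQueue[uint64, string]

proc read(key: uint64): string =
  # Read path: serve from the LRU when possible, otherwise load from the
  # backend and append, evicting the least recently used entry when full.
  var rc = cache.lruFetch(key)
  if rc.isOk:
    return rc.value
  let loaded = "value-" & $key # stand-in for the rocksdb lookup
  cache.lruAppend(key, loaded, maxEntries)

proc write(key: uint64, val: string) =
  # Write path: refresh entries that are already cached but never append -
  # appending every stored item would evict entries more likely to be read.
  discard cache.lruUpdate(key, val)

discard read(1)      # miss, then cached
write(1, "updated")  # refreshes the cached entry
write(2, "ignored")  # not cached - key 2 was never read
```

Because the write path never appends, a long run of stores can no longer flush the read-side working set out of the cache.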

pre:
```
INF 2024-09-03 15:07:01.136+02:00 Imported blocks
blockNumber=20012001 blocks=12000 importedSlot=9216851 txs=1837042
mgas=181911.265 bps=11.675 tps=1870.397 mgps=176.819 avgBps=10.288
avgTps=1574.889 avgMGps=155.952 elapsed=19m26s458ms
```

post:
```
INF 2024-09-03 13:54:26.730+02:00 Imported blocks
blockNumber=20012001 blocks=12000 importedSlot=9216851 txs=1837042
mgas=181911.265 bps=11.637 tps=1864.384 mgps=176.250 avgBps=11.202
avgTps=1714.920 avgMGps=169.818 elapsed=17m51s211ms
```

~9% import perf improvement (avgBps 10.288 -> 11.202) on similar mem usage :)
Jacek Sieka 2024-09-05 11:18:32 +02:00 committed by GitHub
parent 3c6400673d
commit d39c589ec3
13 changed files with 178 additions and 104 deletions

View File

@ -410,6 +410,23 @@ type
defaultValueDesc: $defaultBlockCacheSize
name: "debug-rocksdb-block-cache-size".}: int
rdbKeyCacheSize {.
hidden
defaultValue: defaultRdbKeyCacheSize
defaultValueDesc: $defaultRdbKeyCacheSize
name: "debug-rdb-key-cache-size".}: int
rdbVtxCacheSize {.
hidden
defaultValue: defaultRdbVtxCacheSize
defaultValueDesc: $defaultRdbVtxCacheSize
name: "debug-rdb-vtx-cache-size".}: int
rdbPrintStats {.
hidden
desc: "Print RDB statistics at exit"
name: "debug-rdb-print-stats".}: bool
case cmd* {.
command
defaultValue: NimbusCmd.noCommand }: NimbusCmd
@ -790,12 +807,19 @@ func era1Dir*(conf: NimbusConf): OutDir =
func eraDir*(conf: NimbusConf): OutDir =
conf.eraDirOpt.get(OutDir(conf.dataDir.string & "/era"))
func dbOptions*(conf: NimbusConf): DbOptions =
func dbOptions*(conf: NimbusConf, noKeyCache = false): DbOptions =
DbOptions.init(
maxOpenFiles = conf.rocksdbMaxOpenFiles,
writeBufferSize = conf.rocksdbWriteBufferSize,
rowCacheSize = conf.rocksdbRowCacheSize,
blockCacheSize = conf.rocksdbBlockCacheSize,
rdbKeyCacheSize =
if noKeyCache: 0 else: conf.rdbKeyCacheSize ,
rdbVtxCacheSize =
# The import command does not use the key cache - better give it to vtx
if noKeyCache: conf.rdbKeyCacheSize + conf.rdbVtxCacheSize
else: conf.rdbVtxCacheSize,
rdbPrintStats = conf.rdbPrintStats,
)
# KLUDGE: The `load()` template does currently not work within any exception

View File

@ -38,12 +38,13 @@ export
proc newAristoRdbDbRef(
basePath: string;
opts: DbOptions;
dbOpts: DbOptionsRef;
cfOpts: ColFamilyOptionsRef;
guestCFs: openArray[ColFamilyDescriptor];
): Result[(AristoDbRef, seq[ColFamilyReadWrite]), AristoError]=
let
(be, oCfs) = ? rocksDbBackend(basePath, dbOpts, cfOpts, guestCFs)
(be, oCfs) = ? rocksDbBackend(basePath, opts, dbOpts, cfOpts, guestCFs)
vTop = block:
let rc = be.getTuvFn()
if rc.isErr:
@ -62,6 +63,7 @@ proc init*(
T: type AristoDbRef;
B: type RdbBackendRef;
basePath: string;
opts: DbOptions;
dbOpts: DbOptionsRef;
cfOpts: ColFamilyOptionsRef;
guestCFs: openArray[ColFamilyDescriptor];
@ -69,7 +71,7 @@ proc init*(
## Generic constructor, `basePath` argument is ignored for memory backend
## databases (which also unconditionally succeed initialising.)
##
basePath.newAristoRdbDbRef dbOpts, cfOpts, guestCFs
basePath.newAristoRdbDbRef opts, dbOpts, cfOpts, guestCFs
proc activateWrTrigger*(
db: AristoDbRef;

View File

@ -248,6 +248,7 @@ proc putBegHostingFn(db: RdbBackendRef): PutBegFn =
proc rocksDbBackend*(
path: string;
opts: DbOptions;
dbOpts: DbOptionsRef;
cfOpts: ColFamilyOptionsRef;
guestCFs: openArray[ColFamilyDescriptor];
@ -257,7 +258,7 @@ proc rocksDbBackend*(
# Initialise RocksDB
let oCfs = block:
let rc = db.rdb.init(path, dbOpts, cfOpts, guestCFs)
let rc = db.rdb.init(path, opts, dbOpts, cfOpts, guestCFs)
if rc.isErr:
when extraTraceMessages:
trace logTxt "constructor failed",

View File

@ -15,6 +15,7 @@
import
std/os,
std/concurrency/atomics,
eth/common,
rocksdb,
stew/[endians2, keyed_queue],
@ -53,7 +54,9 @@ type
# handling of the longer key.)
#
rdKeyLru*: KeyedQueue[VertexID,HashKey] ## Read cache
rdKeySize*: int
rdVtxLru*: KeyedQueue[VertexID,VertexRef] ## Read cache
rdVtxSize*: int
basePath*: string ## Database directory
trgWriteEvent*: RdbWriteEventCb ## Database piggyback callback handler
@ -64,13 +67,24 @@ type
VtxCF = "AriVtx" ## Vertex column family name
KeyCF = "AriKey" ## Hash key column family name
RdbLruCounter* = array[bool, Atomic[uint64]]
RdbStateType* = enum
Account
World
const
BaseFolder* = "nimbus" ## Same as for Legacy DB
DataFolder* = "aristo" ## Legacy DB has "data"
RdKeyLruMaxSize* = 80000
## Max size of read cache for keys - ~4 levels of MPT
RdVtxLruMaxSize* = 80000
## Max size of read cache for vertex IDs - ~4 levels of MPT
var
# Hit/miss counters for LRU cache - global so as to integrate easily with
# nim-metrics and `uint64` to ensure that increasing them is fast - collection
# happens from a separate thread.
# TODO maybe turn this into more general framework for LRU reporting since
# we have lots of caches of this sort
rdbVtxLruStats*: array[RdbStateType, array[VertexType, RdbLruCounter]]
rdbKeyLruStats*: array[RdbStateType, RdbLruCounter]
# ------------------------------------------------------------------------------
# Public functions
@ -93,6 +107,15 @@ func dataDir*(rdb: RdbInst): string =
template toOpenArray*(xid: AdminTabID): openArray[byte] =
xid.uint64.toBytesBE.toOpenArray(0,7)
template to*(v: RootedVertexID, T: type RdbStateType): RdbStateType =
if v.root == VertexID(1): RdbStateType.World else: RdbStateType.Account
template inc*(v: var RdbLruCounter, hit: bool) =
discard v[hit].fetchAdd(1, moRelaxed)
template get*(v: RdbLruCounter, hit: bool): uint64 =
v[hit].load(moRelaxed)
# ------------------------------------------------------------------------------
# End
# ------------------------------------------------------------------------------

View File

@ -39,21 +39,6 @@ type
RdbVtxLruCounter = ref object of Counter
RdbKeyLruCounter = ref object of Counter
LruCounter = array[bool, Atomic[uint64]]
StateType = enum
Account
World
var
# Hit/miss counters for LRU cache - global so as to integrate easily with
# nim-metrics and `uint64` to ensure that increasing them is fast - collection
# happens from a separate thread.
# TODO maybe turn this into more general framework for LRU reporting since
# we have lots of caches of this sort
rdbVtxLruStats: array[StateType, array[VertexType, LruCounter]]
rdbKeyLruStats: array[StateType, LruCounter]
var
rdbVtxLruStatsMetric {.used.} = RdbVtxLruCounter.newCollector(
"aristo_rdb_vtx_lru_total",
@ -64,21 +49,12 @@ var
"aristo_rdb_key_lru_total", "HashKey LRU lookup", labels = ["state", "hit"]
)
template to(v: RootedVertexID, T: type StateType): StateType =
if v.root == VertexID(1): StateType.World else: StateType.Account
template inc(v: var LruCounter, hit: bool) =
discard v[hit].fetchAdd(1, moRelaxed)
template get(v: LruCounter, hit: bool): uint64 =
v[hit].load(moRelaxed)
method collect*(collector: RdbVtxLruCounter, output: MetricHandler) =
let timestamp = collector.now()
# We don't care about synchronization between each type of metric or between
# the metrics thread and others since small differences like this don't matter
for state in StateType:
for state in RdbStateType:
for vtype in VertexType:
for hit in [false, true]:
output(
@ -92,7 +68,7 @@ method collect*(collector: RdbVtxLruCounter, output: MetricHandler) =
method collect*(collector: RdbKeyLruCounter, output: MetricHandler) =
let timestamp = collector.now()
for state in StateType:
for state in RdbStateType:
for hit in [false, true]:
output(
name = "aristo_rdb_key_lru_total",
@ -129,10 +105,10 @@ proc getKey*(
# Try LRU cache first
var rc = rdb.rdKeyLru.lruFetch(rvid.vid)
if rc.isOK:
rdbKeyLruStats[rvid.to(StateType)].inc(true)
rdbKeyLruStats[rvid.to(RdbStateType)].inc(true)
return ok(move(rc.value))
rdbKeyLruStats[rvid.to(StateType)].inc(false)
rdbKeyLruStats[rvid.to(RdbStateType)].inc(false)
# Otherwise fetch from backend database
# A threadvar is used to avoid allocating an environment for onData
@ -153,17 +129,21 @@ proc getKey*(
return err((RdbHashKeyExpected,"")) # Parsing failed
# Update cache and return
ok rdb.rdKeyLru.lruAppend(rvid.vid, res.value(), RdKeyLruMaxSize)
if rdb.rdKeySize > 0:
ok rdb.rdKeyLru.lruAppend(rvid.vid, res.value(), rdb.rdKeySize)
else:
ok res.value()
proc getVtx*(
rdb: var RdbInst;
rvid: RootedVertexID;
): Result[VertexRef,(AristoError,string)] =
# Try LRU cache first
var rc = rdb.rdVtxLru.lruFetch(rvid.vid)
if rc.isOK:
rdbVtxLruStats[rvid.to(StateType)][rc.value().vType].inc(true)
return ok(move(rc.value))
if rdb.rdVtxSize > 0:
var rc = rdb.rdVtxLru.lruFetch(rvid.vid)
if rc.isOK:
rdbVtxLruStats[rvid.to(RdbStateType)][rc.value().vType].inc(true)
return ok(move(rc.value))
# Otherwise fetch from backend database
# A threadvar is used to avoid allocating an environment for onData
@ -179,61 +159,20 @@ proc getVtx*(
if not gotData:
# As a hack, we count missing data as leaf nodes
rdbVtxLruStats[rvid.to(StateType)][VertexType.Leaf].inc(false)
rdbVtxLruStats[rvid.to(RdbStateType)][VertexType.Leaf].inc(false)
return ok(VertexRef(nil))
if res.isErr():
return err((res.error(), "Parsing failed")) # Parsing failed
rdbVtxLruStats[rvid.to(StateType)][res.value().vType].inc(false)
rdbVtxLruStats[rvid.to(RdbStateType)][res.value().vType].inc(false)
# Update cache and return
ok rdb.rdVtxLru.lruAppend(rvid.vid, res.value(), RdVtxLruMaxSize)
if rdb.rdVtxSize > 0:
ok rdb.rdVtxLru.lruAppend(rvid.vid, res.value(), rdb.rdVtxSize)
else:
ok res.value()
# ------------------------------------------------------------------------------
# End
# ------------------------------------------------------------------------------
when defined(printStatsAtExit):
# Useful hack for printing exact metrics to compare runs with different
# settings
import std/[exitprocs, strformat]
addExitProc(
proc() =
block vtx:
var misses, hits: uint64
echo "vtxLru(", RdVtxLruMaxSize, ")"
echo " state vtype miss hit total hitrate"
for state in StateType:
for vtype in VertexType:
let
(miss, hit) = (
rdbVtxLruStats[state][vtype].get(false),
rdbVtxLruStats[state][vtype].get(true),
)
hitRate = float64(hit * 100) / (float64(hit + miss))
misses += miss
hits += hit
echo &"{state:>8} {vtype:>8} {miss:>10} {hit:>10} {miss+hit:>10} {hitRate:>6.2f}%"
let hitRate = float64(hits * 100) / (float64(hits + misses))
echo &" all all {misses:>10} {hits:>10} {misses+hits:>10} {hitRate:>6.2f}%"
block key:
var misses, hits: uint64
echo "keyLru(", RdKeyLruMaxSize, ") "
echo " state miss hit total hitrate"
for state in StateType:
let
(miss, hit) =
(rdbKeyLruStats[state].get(false), rdbKeyLruStats[state].get(true))
hitRate = float64(hit * 100) / (float64(hit + miss))
misses += miss
hits += hit
echo &"{state:>8} {miss:>10} {hit:>10} {miss+hit:>10} {hitRate:>5.2f}%"
let hitRate = float64(hits * 100) / (float64(hits + misses))
echo &" all {misses:>10} {hits:>10} {misses+hits:>10} {hitRate:>5.2f}%"
)

View File

@ -14,7 +14,7 @@
{.push raises: [].}
import
std/[sets, sequtils, os],
std/[exitprocs, sets, sequtils, strformat, os],
rocksdb,
results,
../../aristo_desc,
@ -25,9 +25,54 @@ import
# Private constructor
# ------------------------------------------------------------------------------
const
lruOverhead = 32
# Approximate LRU cache overhead per entry - although `keyed_queue` which is
# currently used has a much larger overhead, 32 is an easily reachable
# number which likely can be reduced in the future
proc dumpCacheStats(keySize, vtxSize: int) =
block vtx:
var misses, hits: uint64
echo "vtxLru(", vtxSize, ")"
echo " state vtype miss hit total hitrate"
for state in RdbStateType:
for vtype in VertexType:
let
(miss, hit) = (
rdbVtxLruStats[state][vtype].get(false),
rdbVtxLruStats[state][vtype].get(true),
)
hitRate = float64(hit * 100) / (float64(hit + miss))
misses += miss
hits += hit
echo &"{state:>8} {vtype:>8} {miss:>10} {hit:>10} {miss+hit:>10} {hitRate:>6.2f}%"
let hitRate = float64(hits * 100) / (float64(hits + misses))
echo &" all all {misses:>10} {hits:>10} {misses+hits:>10} {hitRate:>6.2f}%"
block key:
var misses, hits: uint64
echo "keyLru(", keySize, ") "
echo " state miss hit total hitrate"
for state in RdbStateType:
let
(miss, hit) =
(rdbKeyLruStats[state].get(false), rdbKeyLruStats[state].get(true))
hitRate = float64(hit * 100) / (float64(hit + miss))
misses += miss
hits += hit
echo &"{state:>8} {miss:>10} {hit:>10} {miss+hit:>10} {hitRate:>5.2f}%"
let hitRate = float64(hits * 100) / (float64(hits + misses))
echo &" all {misses:>10} {hits:>10} {misses+hits:>10} {hitRate:>5.2f}%"
proc initImpl(
rdb: var RdbInst;
basePath: string;
opts: DbOptions;
dbOpts: DbOptionsRef,
cfOpts: ColFamilyOptionsRef;
guestCFs: openArray[ColFamilyDescriptor] = [];
@ -37,6 +82,22 @@ proc initImpl(
rdb.basePath = basePath
# bytes -> entries based on overhead estimates
rdb.rdKeySize =
opts.rdbKeyCacheSize div (sizeof(VertexID) + sizeof(HashKey) + lruOverhead)
rdb.rdVtxSize =
opts.rdbVtxCacheSize div (sizeof(VertexID) + sizeof(default(VertexRef)[]) + lruOverhead)
if opts.rdbPrintStats:
let
ks = rdb.rdKeySize
vs = rdb.rdVtxSize
# TODO instead of dumping at exit, these stats could be logged or written
# to a file for better tracking over time - that said, this is mainly
# a debug utility at this point
addExitProc(proc() =
dumpCacheStats(ks, vs))
let
dataDir = rdb.dataDir
try:
@ -90,12 +151,13 @@ proc initImpl(
proc init*(
rdb: var RdbInst;
basePath: string;
opts: DbOptions;
dbOpts: DbOptionsRef;
cfOpts: ColFamilyOptionsRef;
guestCFs: openArray[ColFamilyDescriptor];
): Result[seq[ColFamilyReadWrite],(AristoError,string)] =
## Temporarily define a guest CF list here.
rdb.initImpl(basePath, dbOpts, cfOpts, guestCFs)
rdb.initImpl(basePath, opts, dbOpts, cfOpts, guestCFs)
proc destroy*(rdb: var RdbInst; eradicate: bool) =
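
For a rough sense of the bytes-to-entries conversion in `initImpl` above, a back-of-the-envelope calculation - the per-entry sizes here are illustrative assumptions (the real code uses `sizeof(VertexID)`, `sizeof(HashKey)` and `sizeof(default(VertexRef)[])` at compile time):

```nim
# Illustrative arithmetic only - assumes sizeof(VertexID) == 8 and
# sizeof(HashKey) == 33, which are not verified here.
const
  lruOverhead = 32                  # per-entry LRU bookkeeping estimate, as above
  keyCacheBytes = 256 * 1024 * 1024 # defaultRdbKeyCacheSize

echo keyCacheBytes div (8 + 33 + lruOverhead) # ~3.7 million cached hash keys
```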

View File

@ -98,9 +98,11 @@ proc putKey*(
trace logTxt "putKey()", vid, error=errSym, info=error
return err((rvid.vid,errSym,error))
# Update cache
if not rdb.rdKeyLru.lruUpdate(rvid.vid, key):
discard rdb.rdKeyLru.lruAppend(rvid.vid, key, RdKeyLruMaxSize)
if rdb.rdKeySize > 0:
# Update existing cached items but don't add new ones since doing so is
# likely to evict more useful items (when putting many items, we might even
# evict those that were just added)
discard rdb.rdKeyLru.lruUpdate(rvid.vid, key)
else:
dsc.delete(rvid.blobify().data(), rdb.keyCol.handle()).isOkOr:
@ -129,9 +131,11 @@ proc putVtx*(
trace logTxt "putVtx()", vid, error=errSym, info=error
return err((rvid.vid,errSym,error))
# Update cache
if not rdb.rdVtxLru.lruUpdate(rvid.vid, vtx):
discard rdb.rdVtxLru.lruAppend(rvid.vid, vtx, RdVtxLruMaxSize)
if rdb.rdVtxSize > 0:
# Update existing cached items but don't add new ones since doing so is
# likely to evict more useful items (when putting many items, we might even
# evict those that were just added)
discard rdb.rdVtxLru.lruUpdate(rvid.vid, vtx)
else:
dsc.delete(rvid.blobify().data(), rdb.vtxCol.handle()).isOkOr:

View File

@ -158,7 +158,7 @@ proc newAristoRocksDbCoreDbRef*(path: string, opts: DbOptions): CoreDbRef =
# Sharing opts means we also share caches between column families!
(dbOpts, cfOpts) = opts.toRocksDb()
guestCFs = RdbInst.guestCFs(cfOpts)
(adb, oCfs) = AristoDbRef.init(use_ari.RdbBackendRef, path, dbOpts, cfOpts, guestCFs).valueOr:
(adb, oCfs) = AristoDbRef.init(use_ari.RdbBackendRef, path, opts, dbOpts, cfOpts, guestCFs).valueOr:
raiseAssert aristoFail & ": " & $error
kdb = KvtDbRef.init(use_kvt.RdbBackendRef, adb, oCfs).valueOr:
raiseAssert kvtFail & ": " & $error
@ -169,7 +169,7 @@ proc newAristoDualRocksDbCoreDbRef*(path: string, opts: DbOptions): CoreDbRef =
## database backend.
let
(dbOpts, cfOpts) = opts.toRocksDb()
(adb, _) = AristoDbRef.init(use_ari.RdbBackendRef, path, dbOpts, cfOpts, []).valueOr:
(adb, _) = AristoDbRef.init(use_ari.RdbBackendRef, path, opts, dbOpts, cfOpts, []).valueOr:
raiseAssert aristoFail & ": " & $error
kdb = KvtDbRef.init(use_kvt.RdbBackendRef, path, dbOpts, cfOpts).valueOr:
raiseAssert kvtFail & ": " & $error

View File

@ -18,14 +18,25 @@ const
# https://github.com/facebook/rocksdb/wiki/Setup-Options-and-Basic-Tuning
defaultMaxOpenFiles* = 512
defaultWriteBufferSize* = 64 * 1024 * 1024
defaultRowCacheSize* = 1024 * 1024 * 1024
defaultRowCacheSize* = 0
## The row cache is disabled by default as the rdb lru caches do a better
## job at a similar abstraction level - ie they work at the same granularity
## as the rocksdb row cache but with less overhead
defaultBlockCacheSize* = 2 * 1024 * 1024 * 1024
defaultRdbVtxCacheSize* = 512 * 1024 * 1024
## Cache of branches and leaves in the state MPTs (world and account)
defaultRdbKeyCacheSize* = 256 * 1024 * 1024
## Hashes of the above
type DbOptions* = object # Options that are transported to the database layer
maxOpenFiles*: int
writeBufferSize*: int
rowCacheSize*: int
blockCacheSize*: int
rdbVtxCacheSize*: int
rdbKeyCacheSize*: int
rdbPrintStats*: bool
func init*(
T: type DbOptions,
@ -33,10 +44,16 @@ func init*(
writeBufferSize = defaultWriteBufferSize,
rowCacheSize = defaultRowCacheSize,
blockCacheSize = defaultBlockCacheSize,
rdbVtxCacheSize = defaultRdbVtxCacheSize,
rdbKeyCacheSize = defaultRdbKeyCacheSize,
rdbPrintStats = false,
): T =
T(
maxOpenFiles: maxOpenFiles,
writeBufferSize: writeBufferSize,
rowCacheSize: rowCacheSize,
blockCacheSize: blockCacheSize,
rdbVtxCacheSize: rdbVtxCacheSize,
rdbKeyCacheSize: rdbKeyCacheSize,
rdbPrintStats: rdbPrintStats,
)
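
For reference, a usage sketch of the extended options - the byte values are arbitrary examples and the import of the options module is omitted:

```nim
# Field names match the DbOptions definition above; values are illustrative.
let opts = DbOptions.init(
  rdbVtxCacheSize = 768 * 1024 * 1024, # more room for branch/leaf vertices
  rdbKeyCacheSize = 128 * 1024 * 1024, # smaller hash-key cache
  rdbPrintStats = true)                # dump LRU hit/miss stats at exit
```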

View File

@ -224,7 +224,9 @@ proc run(nimbus: NimbusNode, conf: NimbusConf) =
# Resolve statically for database type
case conf.chainDbMode:
of Aristo,AriPrune:
AristoDbRocks.newCoreDbRef(string conf.dataDir, conf.dbOptions())
AristoDbRocks.newCoreDbRef(
string conf.dataDir,
conf.dbOptions(noKeyCache = conf.cmd == NimbusCmd.`import`))
setupMetrics(nimbus, conf)

View File

@ -103,7 +103,7 @@ proc dbTriplet(w: LeafQuartet; rdbPath: string): Result[DbTriplet,AristoError] =
let db = block:
if 0 < rdbPath.len:
let (dbOpts, cfOpts) = DbOptions.init().toRocksDb()
let rc = AristoDbRef.init(RdbBackendRef, rdbPath, dbOpts, cfOpts, [])
let rc = AristoDbRef.init(RdbBackendRef, rdbPath, DbOptions.init(), dbOpts, cfOpts, [])
xCheckRc rc.error == 0:
result = err(rc.error)
rc.value()[0]

View File

@ -111,7 +111,7 @@ proc testMergeProofAndKvpList*(
# New DB with disabled filter slots management
if 0 < rdbPath.len:
let (dbOpts, cfOpts) = DbOptions.init().toRocksDb()
let rc = AristoDbRef.init(RdbBackendRef, rdbPath, dbOpts, cfOpts, [])
let rc = AristoDbRef.init(RdbBackendRef, rdbPath, DbOptions.init(), dbOpts, cfOpts, [])
xCheckRc rc.error == 0
rc.value()[0]
else:

View File

@ -261,7 +261,7 @@ proc testTxMergeAndDeleteOneByOne*(
db = block:
if 0 < rdbPath.len:
let (dbOpts, cfOpts) = DbOptions.init().toRocksDb()
let rc = AristoDbRef.init(RdbBackendRef, rdbPath, dbOpts, cfOpts, [])
let rc = AristoDbRef.init(RdbBackendRef, rdbPath, DbOptions.init(), dbOpts, cfOpts, [])
xCheckRc rc.error == 0
rc.value()[0]
else:
@ -368,7 +368,7 @@ proc testTxMergeAndDeleteSubTree*(
db = block:
if 0 < rdbPath.len:
let (dbOpts, cfOpts) = DbOptions.init().toRocksDb()
let rc = AristoDbRef.init(RdbBackendRef, rdbPath, dbOpts, cfOpts, [])
let rc = AristoDbRef.init(RdbBackendRef, rdbPath, DbOptions.init(), dbOpts, cfOpts, [])
xCheckRc rc.error == 0
rc.value()[0]
else: