From 35cc78c86d5a965c8bbf54404f6c4a8ff81b06cc Mon Sep 17 00:00:00 2001 From: Jacek Sieka Date: Mon, 2 Sep 2024 17:34:10 +0200 Subject: [PATCH] add metrics for rdb lru cache (#2586) This is a first step towards measuring the efficiency of the LRU caches over time - metrics can be collected during import or when running regulary. Since `nim-metrics` carries some overhead for its default way of reporting metrics, this PR implements a custom collector over atomic counters, given that this is one of the hottest spots in the block processing pipeline. Using a compile-time flag, the same metrics can be printed on exit which is useful when comparing different strategies for caching - here's a recent run over blocks 16000001-1616384 - this is a good candidate to expose in a better way in the future, maybe: ``` state vtype miss hit total hitrate Account Leaf 4909417 4466215 9375632 47.64% Account Branch 20742574 72015123 92757697 77.64% World Leaf 940483 1140946 2081429 54.82% World Branch 8224151 131496580 139720731 94.11% all all 34816625 209118864 243935489 85.73% ``` --- .../aristo/aristo_init/rocks_db/rdb_get.nim | 128 +++++++++++++++++- 1 file changed, 125 insertions(+), 3 deletions(-) diff --git a/nimbus/db/aristo/aristo_init/rocks_db/rdb_get.nim b/nimbus/db/aristo/aristo_init/rocks_db/rdb_get.nim index f3fad1f54..db3ca338a 100644 --- a/nimbus/db/aristo/aristo_init/rocks_db/rdb_get.nim +++ b/nimbus/db/aristo/aristo_init/rocks_db/rdb_get.nim @@ -20,7 +20,9 @@ import stew/keyed_queue, ../../[aristo_blobify, aristo_desc], ../init_common, - ./rdb_desc + ./rdb_desc, + metrics, + std/concurrency/atomics const extraTraceMessages = false @@ -33,6 +35,73 @@ when extraTraceMessages: logScope: topics = "aristo-rocksdb" +type + RdbVtxLruCounter = ref object of Counter + RdbKeyLruCounter = ref object of Counter + + LruCounter = array[bool, Atomic[uint64]] + + StateType = enum + Account + World + +var + # Hit/miss counters for LRU cache - global so as to integrate easily with + # nim-metrics and `uint64` to ensure that increasing them is fast - collection + # happens from a separate thread. + # TODO maybe turn this into more general framework for LRU reporting since + # we have lots of caches of this sort + rdbVtxLruStats: array[StateType, array[VertexType, LruCounter]] + rdbKeyLruStats: array[StateType, LruCounter] + +var + rdbVtxLruStatsMetric {.used.} = RdbVtxLruCounter.newCollector( + "aristo_rdb_vtx_lru_total", + "Vertex LRU lookup (hit/miss, world/account, branch/leaf)", + labels = ["state", "vtype", "hit"], + ) + rdbKeyLruStatsMetric {.used.} = RdbKeyLruCounter.newCollector( + "aristo_rdb_key_lru_total", "HashKey LRU lookup", labels = ["state", "hit"] + ) + +template to(v: RootedVertexID, T: type StateType): StateType = + if v.root == VertexID(1): StateType.World else: StateType.Account + +template inc(v: var LruCounter, hit: bool) = + discard v[hit].fetchAdd(1, moRelaxed) + +template get(v: LruCounter, hit: bool): uint64 = + v[hit].load(moRelaxed) + +method collect*(collector: RdbVtxLruCounter, output: MetricHandler) = + let timestamp = collector.now() + + # We don't care about synchronization between each type of metric or between + # the metrics thread and others since small differences like this don't matter + for state in StateType: + for vtype in VertexType: + for hit in [false, true]: + output( + name = "aristo_rdb_vtx_lru_total", + value = float64(rdbVtxLruStats[state][vtype].get(hit)), + labels = ["state", "vtype", "hit"], + labelValues = [$state, $vtype, $ord(hit)], + timestamp = timestamp, + ) + +method collect*(collector: RdbKeyLruCounter, output: MetricHandler) = + let timestamp = collector.now() + + for state in StateType: + for hit in [false, true]: + output( + name = "aristo_rdb_key_lru_total", + value = float64(rdbKeyLruStats[state].get(hit)), + labels = ["state", "hit"], + labelValues = [$state, $ord(hit)], + timestamp = timestamp, + ) + # ------------------------------------------------------------------------------ # Public functions # ------------------------------------------------------------------------------ @@ -60,8 +129,11 @@ proc getKey*( # Try LRU cache first var rc = rdb.rdKeyLru.lruFetch(rvid.vid) if rc.isOK: + rdbKeyLruStats[rvid.to(StateType)].inc(true) return ok(move(rc.value)) + rdbKeyLruStats[rvid.to(StateType)].inc(false) + # Otherwise fetch from backend database # A threadvar is used to avoid allocating an environment for onData var res{.threadvar.}: Opt[HashKey] @@ -90,6 +162,7 @@ proc getVtx*( # Try LRU cache first var rc = rdb.rdVtxLru.lruFetch(rvid.vid) if rc.isOK: + rdbVtxLruStats[rvid.to(StateType)][rc.value().vType].inc(true) return ok(move(rc.value)) # Otherwise fetch from backend database @@ -105,13 +178,62 @@ proc getVtx*( return err((errSym,error)) if not gotData: - res.ok(VertexRef(nil)) - elif res.isErr(): + # As a hack, we count missing data as leaf nodes + rdbVtxLruStats[rvid.to(StateType)][VertexType.Leaf].inc(false) + return ok(VertexRef(nil)) + + if res.isErr(): return err((res.error(), "Parsing failed")) # Parsing failed + rdbVtxLruStats[rvid.to(StateType)][res.value().vType].inc(false) + # Update cache and return ok rdb.rdVtxLru.lruAppend(rvid.vid, res.value(), RdVtxLruMaxSize) # ------------------------------------------------------------------------------ # End # ------------------------------------------------------------------------------ + +when defined(printStatsAtExit): + # Useful hack for printing exact metrics to compare runs with different + # settings + import std/[exitprocs, strformat] + addExitProc( + proc() = + block vtx: + var misses, hits: uint64 + echo "vtxLru(", RdVtxLruMaxSize, ")" + echo " state vtype miss hit total hitrate" + for state in StateType: + for vtype in VertexType: + let + (miss, hit) = ( + rdbVtxLruStats[state][vtype].get(false), + rdbVtxLruStats[state][vtype].get(true), + ) + hitRate = float64(hit * 100) / (float64(hit + miss)) + misses += miss + hits += hit + echo &"{state:>8} {vtype:>8} {miss:>10} {hit:>10} {miss+hit:>10} {hitRate:>6.2f}%" + let hitRate = float64(hits * 100) / (float64(hits + misses)) + echo &" all all {misses:>10} {hits:>10} {misses+hits:>10} {hitRate:>6.2f}%" + + block key: + var misses, hits: uint64 + echo "keyLru(", RdKeyLruMaxSize, ") " + + echo " state miss hit total hitrate" + + for state in StateType: + let + (miss, hit) = + (rdbKeyLruStats[state].get(false), rdbKeyLruStats[state].get(true)) + hitRate = float64(hit * 100) / (float64(hit + miss)) + misses += miss + hits += hit + + echo &"{state:>8} {miss:>10} {hit:>10} {miss+hit:>10} {hitRate:>5.2f}%" + + let hitRate = float64(hits * 100) / (float64(hits + misses)) + echo &" all {misses:>10} {hits:>10} {misses+hits:>10} {hitRate:>5.2f}%" + )