add metric for database checkpoint duration (#5897)

Database checkpointing can take seconds, e.g., while Geth is syncing.
Add a debug log + metric for it, and also info log if it takes longer
than 250ms, same as for the existing `State replayed` log. If the log
shows up for a user while the system is not overloaded, it may point
to slow disk speed or thermal issue.
This commit is contained in:
Etan Kissling 2024-02-19 11:00:11 +01:00 committed by GitHub
parent 5fdb06fcd1
commit 92197ce690
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 21 additions and 4 deletions

View File

@ -128,3 +128,5 @@ func fromFloatSeconds*(T: type Duration, f: float): Duration =
of fcInf: InfiniteDuration
chronicles.formatIt Duration: $it
const MinSignificantProcessingDuration* = 250.millis

View File

@ -15,7 +15,7 @@ import
state_transition, validator],
../spec/forks,
../spec/datatypes/[phase0, altair, bellatrix, capella],
".."/[beacon_chain_db, era_db],
".."/[beacon_chain_db, beacon_clock, era_db],
"."/[block_pools_types, block_quarantine]
from ../spec/datatypes/deneb import shortLog
@ -24,6 +24,8 @@ export
eth2_merkleization, eth2_ssz_serialization,
block_pools_types, results, beacon_chain_db
logScope: topics = "chaindag"
# https://github.com/ethereum/beacon-metrics/blob/master/metrics.md#interop-metrics
declareGauge beacon_head_root, "Root of the head block of the beacon chain"
declareGauge beacon_head_slot, "Slot of the head block of the beacon chain"
@ -47,7 +49,7 @@ declareGauge beacon_current_active_validators, "Number of validators in the acti
declareGauge beacon_pending_deposits, "Number of pending deposits (state.eth1_data.deposit_count - state.eth1_deposit_index)" # On block
declareGauge beacon_processed_deposits_total, "Number of total deposits included on chain" # On block
logScope: topics = "chaindag"
declareCounter total_state_replay_seconds, "Total time spent replaying states"
const
EPOCHS_PER_STATE_SNAPSHOT* = 32
@ -1838,9 +1840,10 @@ proc updateState*(
let
assignDur = assignTick - startTick
replayDur = Moment.now() - assignTick
total_state_replay_seconds.inc(replayDur.toFloatSeconds)
# TODO https://github.com/status-im/nim-chronicles/issues/108
if (assignDur + replayDur) >= 250.millis:
if (assignDur + replayDur) >= MinSignificantProcessingDuration:
# This might indicate there's a cache that's not in order or a disk that is
# too slow - for now, it's here for investigative purposes and the cutoff
# time might need tuning

View File

@ -35,6 +35,8 @@ from
import
TopicParams, validateParameters, init
logScope: topics = "beacnde"
# https://github.com/ethereum/eth2.0-metrics/blob/master/metrics.md#interop-metrics
declareGauge beacon_slot, "Latest slot of the beacon chain state"
declareGauge beacon_current_epoch, "Current epoch"
@ -49,7 +51,8 @@ declareGauge ticks_delay,
declareGauge next_action_wait,
"Seconds until the next attestation will be sent"
logScope: topics = "beacnde"
declareCounter total_db_checkpoint_seconds,
"Total time spent checkpointing the database to clear the WAL file"
proc doRunTrustedNodeSync(
db: BeaconChainDB,
@ -1381,10 +1384,19 @@ proc onSlotEnd(node: BeaconNode, slot: Slot) {.async.} =
except Exception:
# TODO upstream
raiseAssert "Unexpected exception during GC collection"
let gcCollectionTick = Moment.now()
# Checkpoint the database to clear the WAL file and make sure changes in
# the database are synced with the filesystem.
node.db.checkpoint()
let
dbCheckpointTick = Moment.now()
dbCheckpointDur = dbCheckpointTick - gcCollectionTick
total_db_checkpoint_seconds.inc(dbCheckpointDur.toFloatSeconds)
if dbCheckpointDur >= MinSignificantProcessingDuration:
info "Database checkpointed", dur = dbCheckpointDur
else:
debug "Database checkpointed", dur = dbCheckpointDur
node.syncCommitteeMsgPool[].pruneData(slot)
if slot.is_epoch: