add metric for database checkpoint duration (#5897)
Database checkpointing can take seconds, e.g., while Geth is syncing. Add a debug log + metric for it, and also info log if it takes longer than 250ms, same as for the existing `State replayed` log. If the log shows up for a user while the system is not overloaded, it may point to slow disk speed or thermal issue.
This commit is contained in:
parent
5fdb06fcd1
commit
92197ce690
|
@ -128,3 +128,5 @@ func fromFloatSeconds*(T: type Duration, f: float): Duration =
|
|||
of fcInf: InfiniteDuration
|
||||
|
||||
chronicles.formatIt Duration: $it
|
||||
|
||||
const MinSignificantProcessingDuration* = 250.millis
|
||||
|
|
|
@ -15,7 +15,7 @@ import
|
|||
state_transition, validator],
|
||||
../spec/forks,
|
||||
../spec/datatypes/[phase0, altair, bellatrix, capella],
|
||||
".."/[beacon_chain_db, era_db],
|
||||
".."/[beacon_chain_db, beacon_clock, era_db],
|
||||
"."/[block_pools_types, block_quarantine]
|
||||
|
||||
from ../spec/datatypes/deneb import shortLog
|
||||
|
@ -24,6 +24,8 @@ export
|
|||
eth2_merkleization, eth2_ssz_serialization,
|
||||
block_pools_types, results, beacon_chain_db
|
||||
|
||||
logScope: topics = "chaindag"
|
||||
|
||||
# https://github.com/ethereum/beacon-metrics/blob/master/metrics.md#interop-metrics
|
||||
declareGauge beacon_head_root, "Root of the head block of the beacon chain"
|
||||
declareGauge beacon_head_slot, "Slot of the head block of the beacon chain"
|
||||
|
@ -47,7 +49,7 @@ declareGauge beacon_current_active_validators, "Number of validators in the acti
|
|||
declareGauge beacon_pending_deposits, "Number of pending deposits (state.eth1_data.deposit_count - state.eth1_deposit_index)" # On block
|
||||
declareGauge beacon_processed_deposits_total, "Number of total deposits included on chain" # On block
|
||||
|
||||
logScope: topics = "chaindag"
|
||||
declareCounter total_state_replay_seconds, "Total time spent replaying states"
|
||||
|
||||
const
|
||||
EPOCHS_PER_STATE_SNAPSHOT* = 32
|
||||
|
@ -1838,9 +1840,10 @@ proc updateState*(
|
|||
let
|
||||
assignDur = assignTick - startTick
|
||||
replayDur = Moment.now() - assignTick
|
||||
total_state_replay_seconds.inc(replayDur.toFloatSeconds)
|
||||
|
||||
# TODO https://github.com/status-im/nim-chronicles/issues/108
|
||||
if (assignDur + replayDur) >= 250.millis:
|
||||
if (assignDur + replayDur) >= MinSignificantProcessingDuration:
|
||||
# This might indicate there's a cache that's not in order or a disk that is
|
||||
# too slow - for now, it's here for investigative purposes and the cutoff
|
||||
# time might need tuning
|
||||
|
|
|
@ -35,6 +35,8 @@ from
|
|||
import
|
||||
TopicParams, validateParameters, init
|
||||
|
||||
logScope: topics = "beacnde"
|
||||
|
||||
# https://github.com/ethereum/eth2.0-metrics/blob/master/metrics.md#interop-metrics
|
||||
declareGauge beacon_slot, "Latest slot of the beacon chain state"
|
||||
declareGauge beacon_current_epoch, "Current epoch"
|
||||
|
@ -49,7 +51,8 @@ declareGauge ticks_delay,
|
|||
declareGauge next_action_wait,
|
||||
"Seconds until the next attestation will be sent"
|
||||
|
||||
logScope: topics = "beacnde"
|
||||
declareCounter total_db_checkpoint_seconds,
|
||||
"Total time spent checkpointing the database to clear the WAL file"
|
||||
|
||||
proc doRunTrustedNodeSync(
|
||||
db: BeaconChainDB,
|
||||
|
@ -1381,10 +1384,19 @@ proc onSlotEnd(node: BeaconNode, slot: Slot) {.async.} =
|
|||
except Exception:
|
||||
# TODO upstream
|
||||
raiseAssert "Unexpected exception during GC collection"
|
||||
let gcCollectionTick = Moment.now()
|
||||
|
||||
# Checkpoint the database to clear the WAL file and make sure changes in
|
||||
# the database are synced with the filesystem.
|
||||
node.db.checkpoint()
|
||||
let
|
||||
dbCheckpointTick = Moment.now()
|
||||
dbCheckpointDur = dbCheckpointTick - gcCollectionTick
|
||||
total_db_checkpoint_seconds.inc(dbCheckpointDur.toFloatSeconds)
|
||||
if dbCheckpointDur >= MinSignificantProcessingDuration:
|
||||
info "Database checkpointed", dur = dbCheckpointDur
|
||||
else:
|
||||
debug "Database checkpointed", dur = dbCheckpointDur
|
||||
|
||||
node.syncCommitteeMsgPool[].pruneData(slot)
|
||||
if slot.is_epoch:
|
||||
|
|
Loading…
Reference in New Issue