From 30eef0a3697cd02554332f9dc221a84c6f6b52a6 Mon Sep 17 00:00:00 2001 From: Jacek Sieka Date: Wed, 6 Apr 2022 11:23:01 +0200 Subject: [PATCH] Validator monitor polish (#3569) * lower "Previous epoch attestation missing" to `NOTICE` for easier filtering * add delay logging to validator monitor logs * simplify delay logging code post-`BeaconTime` --- .../gossip_processing/block_processor.nim | 2 +- beacon_chain/validators/validator_duties.nim | 43 ++++++------------- beacon_chain/validators/validator_monitor.nim | 23 +++++----- docs/the_nimbus_book/src/validator-monitor.md | 6 ++- 4 files changed, 30 insertions(+), 44 deletions(-) diff --git a/beacon_chain/gossip_processing/block_processor.nim b/beacon_chain/gossip_processing/block_processor.nim index 02ab2437e..19bd12af7 100644 --- a/beacon_chain/gossip_processing/block_processor.nim +++ b/beacon_chain/gossip_processing/block_processor.nim @@ -200,7 +200,7 @@ proc storeBlock*( "index has been checked"), attestation.aggregation_bits): vm[].registerAttestationInBlock(attestation.data, validator_index, - trustedBlock.message) + trustedBlock.message.slot) withState(dag[].clearanceState): when stateFork >= BeaconStateFork.Altair and diff --git a/beacon_chain/validators/validator_duties.nim b/beacon_chain/validators/validator_duties.nim index b0af7247f..6e880cc92 100644 --- a/beacon_chain/validators/validator_duties.nim +++ b/beacon_chain/validators/validator_duties.nim @@ -386,20 +386,15 @@ proc createAndSendAttestation(node: BeaconNode, dump(node.config.dumpDirOutgoing, attestation.data, validator.pubkey) - let wallTime = node.beaconClock.now() - let deadline = attestationData.slot.attestation_deadline() - - let (delayStr, delaySecs) = - if wallTime < deadline: - ("-" & $(deadline - wallTime), -toFloatSeconds(deadline - wallTime)) - else: - ($(wallTime - deadline), toFloatSeconds(wallTime - deadline)) + let + wallTime = node.beaconClock.now() + delay = wallTime - attestationData.slot.attestation_deadline() notice "Attestation sent", attestation = shortLog(attestation), validator = shortLog(validator), - delay = delayStr, subnet_id + delay, subnet_id - beacon_attestation_sent_delay.observe(delaySecs) + beacon_attestation_sent_delay.observe(delay.toFloatSeconds()) except CatchableError as exc: # An error could happen here when the signature task fails - we must # not leak the exception because this is an asyncSpawn task @@ -734,20 +729,14 @@ proc createAndSendSyncCommitteeMessage(node: BeaconNode, let wallTime = node.beaconClock.now() - deadline = msg.slot.start_beacon_time() + syncCommitteeMessageSlotOffset - - let (delayStr, delaySecs) = - if wallTime < deadline: - ("-" & $(deadline - wallTime), -toFloatSeconds(deadline - wallTime)) - else: - ($(wallTime - deadline), toFloatSeconds(wallTime - deadline)) + delay = wallTime - msg.slot.sync_committee_message_deadline() notice "Sync committee message sent", message = shortLog(msg), validator = shortLog(validator), - delay = delayStr + delay - beacon_sync_committee_message_sent_delay.observe(delaySecs) + beacon_sync_committee_message_sent_delay.observe(delay.toFloatSeconds()) except CatchableError as exc: # An error could happen here when the signature task fails - we must # not leak the exception because this is an asyncSpawn task @@ -998,8 +987,9 @@ proc sendAggregatedAttestations( node.network.broadcastAggregateAndProof(signedAP) # The subnet on which the attestations (should have) arrived - let subnet_id = compute_subnet_for_attestation( - committees_per_slot, slot, data.committee_index) + let + subnet_id = compute_subnet_for_attestation( + committees_per_slot, slot, data.committee_index) notice "Aggregated attestation sent", aggregate = shortLog(signedAP.message.aggregate), aggregator_index = signedAP.message.aggregator_index, @@ -1233,17 +1223,12 @@ proc sendAttestation*(node: BeaconNode, let wallTime = node.processor.getCurrentBeaconTime() - deadline = attestation.data.slot.attestation_deadline() - (delayStr, delaySecs) = - if wallTime < deadline: - ("-" & $(deadline - wallTime), -toFloatSeconds(deadline - wallTime)) - else: - ($(wallTime - deadline), toFloatSeconds(wallTime - deadline)) + delay = wallTime - attestation.data.slot.attestation_deadline() notice "Attestation sent", - attestation = shortLog(attestation), delay = delayStr, subnet_id + attestation = shortLog(attestation), delay, subnet_id - beacon_attestation_sent_delay.observe(delaySecs) + beacon_attestation_sent_delay.observe(delay.toFloatSeconds()) return SendResult.ok() diff --git a/beacon_chain/validators/validator_monitor.nim b/beacon_chain/validators/validator_monitor.nim index 5f856e319..febcc91fd 100644 --- a/beacon_chain/validators/validator_monitor.nim +++ b/beacon_chain/validators/validator_monitor.nim @@ -8,8 +8,7 @@ import std/[options, tables], metrics, chronicles, - ../spec/[crypto, beaconstate, forks, helpers, presets], - ../spec/datatypes/[phase0, altair], + ../spec/[beaconstate, forks, helpers], ../beacon_clock {.push raises: [Defect].} @@ -461,7 +460,7 @@ proc registerEpochInfo*( validator_monitor_prev_epoch_on_chain_attester_miss.inc(1, [metricId]) validator_monitor_prev_epoch_on_chain_source_attester_miss.inc(1, [metricId]) - warn "Previous epoch attestation missing", + notice "Previous epoch attestation missing", epoch = prev_epoch, validator = id @@ -642,7 +641,7 @@ proc registerAttestation*( if not self.totals: info "Attestation seen", attestation = shortLog(attestation), - src, epoch = slot.epoch, validator = id + src, epoch = slot.epoch, validator = id, delay self.withEpochSummary(monitor, slot.epoch): epochSummary.attestations += 1 @@ -668,7 +667,7 @@ proc registerAggregate*( if not self.totals: info "Aggregated attestion seen", aggregate = shortLog(aggregate_and_proof.aggregate), - src, epoch = slot.epoch, validator = id + src, epoch = slot.epoch, validator = id, delay self.withEpochSummary(monitor, slot.epoch): epochSummary.aggregates += 1 @@ -693,11 +692,11 @@ proc registerAttestationInBlock*( self: var ValidatorMonitor, data: AttestationData, attesting_index: ValidatorIndex, - blck: auto) = + block_slot: Slot) = self.withMonitor(attesting_index): let id = monitor.id - inclusion_lag = (blck.slot - data.slot) - MIN_ATTESTATION_INCLUSION_DELAY + inclusion_lag = (block_slot - data.slot) - MIN_ATTESTATION_INCLUSION_DELAY epoch = data.slot.epoch validator_monitor_attestation_in_block.inc(1, ["block", metricId]) @@ -709,7 +708,7 @@ proc registerAttestationInBlock*( if not self.totals: info "Attestation included in block", attestation_data = shortLog(data), - block_slot = blck.slot, + block_slot, inclusion_lag_slots = inclusion_lag, epoch = epoch, validator = id @@ -722,7 +721,7 @@ proc registerBeaconBlock*( self: var ValidatorMonitor, src: MsgSource, seen_timestamp: BeaconTime, - blck: auto) = + blck: ForkyTrustedBeaconBlock) = self.withMonitor(blck.proposer_index): let id = monitor.id @@ -735,7 +734,7 @@ proc registerBeaconBlock*( if not self.totals: info "Block seen", - blck = shortLog(blck), src, epoch = slot.epoch, validator = id + blck = shortLog(blck), src, epoch = slot.epoch, validator = id, delay proc registerSyncCommitteeMessage*( self: var ValidatorMonitor, @@ -755,7 +754,7 @@ proc registerSyncCommitteeMessage*( if not self.totals: info "Sync committee message seen", syncCommitteeMessage = shortLog(sync_committee_message.beacon_block_root), - src, epoch = slot.epoch, validator = id + src, epoch = slot.epoch, validator = id, delay self.withEpochSummary(monitor, slot.epoch): epochSummary.sync_committee_messages += 1 @@ -782,7 +781,7 @@ proc registerSyncContribution*( if not self.totals: info "Sync contribution seen", contribution = shortLog(contribution_and_proof.contribution), - src, epoch = slot.epoch, validator = id + src, epoch = slot.epoch, validator = id, delay self.withEpochSummary(monitor, slot.epoch): epochSummary.sync_contributions += 1 diff --git a/docs/the_nimbus_book/src/validator-monitor.md b/docs/the_nimbus_book/src/validator-monitor.md index 11d18a07b..85a646dad 100644 --- a/docs/the_nimbus_book/src/validator-monitor.md +++ b/docs/the_nimbus_book/src/validator-monitor.md @@ -4,7 +4,7 @@ The validator monitoring feature allows for tracking the life-cycle and performance of one or more validators in detail. -Monitoring can be carried out for any validator, with slightly more detail for validators that are running through the same beacon node. +Monitoring can be carried out for any validator, with slightly more detail for validators that are running in the same beacon node. Every time the validator performs a duty, the duty is recorded and the monitor keeps track of the reward-related events for having performed it. For example: @@ -13,7 +13,7 @@ Every time the validator performs a duty, the duty is recorded and the monitor k Validator actions can be traced either through logging, or comprehensive metrics that allow for creating alerts in monitoring tools. -The metrics are broadly compatible with [Lighthouse](https://lighthouse-book.sigmaprime.io/validator-monitoring.html), thus dashboards and alerts can be used with either client. +The metrics are broadly compatible with [Lighthouse](https://lighthouse-book.sigmaprime.io/validator-monitoring.html), thus dashboards and alerts can be used with either client with minor adjustments. ## Enabling validator monitoring @@ -54,6 +54,8 @@ Failures at any point are recorded at a higher logging level, such as `NOT`(ice) NOT 2021-11-17 20:53:42.108+01:00 Attestation failed to match head topics="chaindag" epoch=81972 validator=... ``` +Failures are reported with a lag of two epochs (~13 minutes) - to examine the log for potential root causes, the logs from the epoch in the failure message should be looked at. + > ⚠️ It should be noted that metrics are tracked for the current history - in the case of a reorg on the chain - in particular a deep reorg - no attempt is made to revisit previously reported values. In the case that finality is delayed, the risk of stale metrics increases. Likewise, many metrics, such as aggregation inclusion, reflect conditions on the network - it may happen that the same message is counted more than once under certain conditions.