Validator monitor polish (#3569)

* lower "Previous epoch attestation missing" to `NOTICE` for easier
filtering
* add delay logging to validator monitor logs
* simplify delay logging code post-`BeaconTime`
This commit is contained in:
Jacek Sieka 2022-04-06 11:23:01 +02:00 committed by GitHub
parent 6f4fa32c1d
commit 30eef0a369
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 30 additions and 44 deletions

View File

@ -200,7 +200,7 @@ proc storeBlock*(
"index has been checked"),
attestation.aggregation_bits):
vm[].registerAttestationInBlock(attestation.data, validator_index,
trustedBlock.message)
trustedBlock.message.slot)
withState(dag[].clearanceState):
when stateFork >= BeaconStateFork.Altair and

View File

@ -386,20 +386,15 @@ proc createAndSendAttestation(node: BeaconNode,
dump(node.config.dumpDirOutgoing, attestation.data,
validator.pubkey)
let wallTime = node.beaconClock.now()
let deadline = attestationData.slot.attestation_deadline()
let (delayStr, delaySecs) =
if wallTime < deadline:
("-" & $(deadline - wallTime), -toFloatSeconds(deadline - wallTime))
else:
($(wallTime - deadline), toFloatSeconds(wallTime - deadline))
let
wallTime = node.beaconClock.now()
delay = wallTime - attestationData.slot.attestation_deadline()
notice "Attestation sent",
attestation = shortLog(attestation), validator = shortLog(validator),
delay = delayStr, subnet_id
delay, subnet_id
beacon_attestation_sent_delay.observe(delaySecs)
beacon_attestation_sent_delay.observe(delay.toFloatSeconds())
except CatchableError as exc:
# An error could happen here when the signature task fails - we must
# not leak the exception because this is an asyncSpawn task
@ -734,20 +729,14 @@ proc createAndSendSyncCommitteeMessage(node: BeaconNode,
let
wallTime = node.beaconClock.now()
deadline = msg.slot.start_beacon_time() + syncCommitteeMessageSlotOffset
let (delayStr, delaySecs) =
if wallTime < deadline:
("-" & $(deadline - wallTime), -toFloatSeconds(deadline - wallTime))
else:
($(wallTime - deadline), toFloatSeconds(wallTime - deadline))
delay = wallTime - msg.slot.sync_committee_message_deadline()
notice "Sync committee message sent",
message = shortLog(msg),
validator = shortLog(validator),
delay = delayStr
delay
beacon_sync_committee_message_sent_delay.observe(delaySecs)
beacon_sync_committee_message_sent_delay.observe(delay.toFloatSeconds())
except CatchableError as exc:
# An error could happen here when the signature task fails - we must
# not leak the exception because this is an asyncSpawn task
@ -998,8 +987,9 @@ proc sendAggregatedAttestations(
node.network.broadcastAggregateAndProof(signedAP)
# The subnet on which the attestations (should have) arrived
let subnet_id = compute_subnet_for_attestation(
committees_per_slot, slot, data.committee_index)
let
subnet_id = compute_subnet_for_attestation(
committees_per_slot, slot, data.committee_index)
notice "Aggregated attestation sent",
aggregate = shortLog(signedAP.message.aggregate),
aggregator_index = signedAP.message.aggregator_index,
@ -1233,17 +1223,12 @@ proc sendAttestation*(node: BeaconNode,
let
wallTime = node.processor.getCurrentBeaconTime()
deadline = attestation.data.slot.attestation_deadline()
(delayStr, delaySecs) =
if wallTime < deadline:
("-" & $(deadline - wallTime), -toFloatSeconds(deadline - wallTime))
else:
($(wallTime - deadline), toFloatSeconds(wallTime - deadline))
delay = wallTime - attestation.data.slot.attestation_deadline()
notice "Attestation sent",
attestation = shortLog(attestation), delay = delayStr, subnet_id
attestation = shortLog(attestation), delay, subnet_id
beacon_attestation_sent_delay.observe(delaySecs)
beacon_attestation_sent_delay.observe(delay.toFloatSeconds())
return SendResult.ok()

View File

@ -8,8 +8,7 @@
import
std/[options, tables],
metrics, chronicles,
../spec/[crypto, beaconstate, forks, helpers, presets],
../spec/datatypes/[phase0, altair],
../spec/[beaconstate, forks, helpers],
../beacon_clock
{.push raises: [Defect].}
@ -461,7 +460,7 @@ proc registerEpochInfo*(
validator_monitor_prev_epoch_on_chain_attester_miss.inc(1, [metricId])
validator_monitor_prev_epoch_on_chain_source_attester_miss.inc(1, [metricId])
warn "Previous epoch attestation missing",
notice "Previous epoch attestation missing",
epoch = prev_epoch,
validator = id
@ -642,7 +641,7 @@ proc registerAttestation*(
if not self.totals:
info "Attestation seen",
attestation = shortLog(attestation),
src, epoch = slot.epoch, validator = id
src, epoch = slot.epoch, validator = id, delay
self.withEpochSummary(monitor, slot.epoch):
epochSummary.attestations += 1
@ -668,7 +667,7 @@ proc registerAggregate*(
if not self.totals:
info "Aggregated attestion seen",
aggregate = shortLog(aggregate_and_proof.aggregate),
src, epoch = slot.epoch, validator = id
src, epoch = slot.epoch, validator = id, delay
self.withEpochSummary(monitor, slot.epoch):
epochSummary.aggregates += 1
@ -693,11 +692,11 @@ proc registerAttestationInBlock*(
self: var ValidatorMonitor,
data: AttestationData,
attesting_index: ValidatorIndex,
blck: auto) =
block_slot: Slot) =
self.withMonitor(attesting_index):
let
id = monitor.id
inclusion_lag = (blck.slot - data.slot) - MIN_ATTESTATION_INCLUSION_DELAY
inclusion_lag = (block_slot - data.slot) - MIN_ATTESTATION_INCLUSION_DELAY
epoch = data.slot.epoch
validator_monitor_attestation_in_block.inc(1, ["block", metricId])
@ -709,7 +708,7 @@ proc registerAttestationInBlock*(
if not self.totals:
info "Attestation included in block",
attestation_data = shortLog(data),
block_slot = blck.slot,
block_slot,
inclusion_lag_slots = inclusion_lag,
epoch = epoch, validator = id
@ -722,7 +721,7 @@ proc registerBeaconBlock*(
self: var ValidatorMonitor,
src: MsgSource,
seen_timestamp: BeaconTime,
blck: auto) =
blck: ForkyTrustedBeaconBlock) =
self.withMonitor(blck.proposer_index):
let
id = monitor.id
@ -735,7 +734,7 @@ proc registerBeaconBlock*(
if not self.totals:
info "Block seen",
blck = shortLog(blck), src, epoch = slot.epoch, validator = id
blck = shortLog(blck), src, epoch = slot.epoch, validator = id, delay
proc registerSyncCommitteeMessage*(
self: var ValidatorMonitor,
@ -755,7 +754,7 @@ proc registerSyncCommitteeMessage*(
if not self.totals:
info "Sync committee message seen",
syncCommitteeMessage = shortLog(sync_committee_message.beacon_block_root),
src, epoch = slot.epoch, validator = id
src, epoch = slot.epoch, validator = id, delay
self.withEpochSummary(monitor, slot.epoch):
epochSummary.sync_committee_messages += 1
@ -782,7 +781,7 @@ proc registerSyncContribution*(
if not self.totals:
info "Sync contribution seen",
contribution = shortLog(contribution_and_proof.contribution),
src, epoch = slot.epoch, validator = id
src, epoch = slot.epoch, validator = id, delay
self.withEpochSummary(monitor, slot.epoch):
epochSummary.sync_contributions += 1

View File

@ -4,7 +4,7 @@
The validator monitoring feature allows for tracking the life-cycle and performance of one or more validators in detail.
Monitoring can be carried out for any validator, with slightly more detail for validators that are running through the same beacon node.
Monitoring can be carried out for any validator, with slightly more detail for validators that are running in the same beacon node.
Every time the validator performs a duty, the duty is recorded and the monitor keeps track of the reward-related events for having performed it. For example:
@ -13,7 +13,7 @@ Every time the validator performs a duty, the duty is recorded and the monitor k
Validator actions can be traced either through logging, or comprehensive metrics that allow for creating alerts in monitoring tools.
The metrics are broadly compatible with [Lighthouse](https://lighthouse-book.sigmaprime.io/validator-monitoring.html), thus dashboards and alerts can be used with either client.
The metrics are broadly compatible with [Lighthouse](https://lighthouse-book.sigmaprime.io/validator-monitoring.html), thus dashboards and alerts can be used with either client with minor adjustments.
## Enabling validator monitoring
@ -54,6 +54,8 @@ Failures at any point are recorded at a higher logging level, such as `NOT`(ice)
NOT 2021-11-17 20:53:42.108+01:00 Attestation failed to match head topics="chaindag" epoch=81972 validator=...
```
Failures are reported with a lag of two epochs (~13 minutes) - to examine the log for potential root causes, the logs from the epoch in the failure message should be looked at.
> ⚠️ It should be noted that metrics are tracked for the current history - in the case of a reorg on the chain - in particular a deep reorg - no attempt is made to revisit previously reported values. In the case that finality is delayed, the risk of stale metrics increases.
Likewise, many metrics, such as aggregation inclusion, reflect conditions on the network - it may happen that the same message is counted more than once under certain conditions.