Validator monitor polish (#3569)
* lower "Previous epoch attestation missing" to `NOTICE` for easier filtering * add delay logging to validator monitor logs * simplify delay logging code post-`BeaconTime`
This commit is contained in:
parent
6f4fa32c1d
commit
30eef0a369
|
@ -200,7 +200,7 @@ proc storeBlock*(
|
|||
"index has been checked"),
|
||||
attestation.aggregation_bits):
|
||||
vm[].registerAttestationInBlock(attestation.data, validator_index,
|
||||
trustedBlock.message)
|
||||
trustedBlock.message.slot)
|
||||
|
||||
withState(dag[].clearanceState):
|
||||
when stateFork >= BeaconStateFork.Altair and
|
||||
|
|
|
@ -386,20 +386,15 @@ proc createAndSendAttestation(node: BeaconNode,
|
|||
dump(node.config.dumpDirOutgoing, attestation.data,
|
||||
validator.pubkey)
|
||||
|
||||
let wallTime = node.beaconClock.now()
|
||||
let deadline = attestationData.slot.attestation_deadline()
|
||||
|
||||
let (delayStr, delaySecs) =
|
||||
if wallTime < deadline:
|
||||
("-" & $(deadline - wallTime), -toFloatSeconds(deadline - wallTime))
|
||||
else:
|
||||
($(wallTime - deadline), toFloatSeconds(wallTime - deadline))
|
||||
let
|
||||
wallTime = node.beaconClock.now()
|
||||
delay = wallTime - attestationData.slot.attestation_deadline()
|
||||
|
||||
notice "Attestation sent",
|
||||
attestation = shortLog(attestation), validator = shortLog(validator),
|
||||
delay = delayStr, subnet_id
|
||||
delay, subnet_id
|
||||
|
||||
beacon_attestation_sent_delay.observe(delaySecs)
|
||||
beacon_attestation_sent_delay.observe(delay.toFloatSeconds())
|
||||
except CatchableError as exc:
|
||||
# An error could happen here when the signature task fails - we must
|
||||
# not leak the exception because this is an asyncSpawn task
|
||||
|
@ -734,20 +729,14 @@ proc createAndSendSyncCommitteeMessage(node: BeaconNode,
|
|||
|
||||
let
|
||||
wallTime = node.beaconClock.now()
|
||||
deadline = msg.slot.start_beacon_time() + syncCommitteeMessageSlotOffset
|
||||
|
||||
let (delayStr, delaySecs) =
|
||||
if wallTime < deadline:
|
||||
("-" & $(deadline - wallTime), -toFloatSeconds(deadline - wallTime))
|
||||
else:
|
||||
($(wallTime - deadline), toFloatSeconds(wallTime - deadline))
|
||||
delay = wallTime - msg.slot.sync_committee_message_deadline()
|
||||
|
||||
notice "Sync committee message sent",
|
||||
message = shortLog(msg),
|
||||
validator = shortLog(validator),
|
||||
delay = delayStr
|
||||
delay
|
||||
|
||||
beacon_sync_committee_message_sent_delay.observe(delaySecs)
|
||||
beacon_sync_committee_message_sent_delay.observe(delay.toFloatSeconds())
|
||||
except CatchableError as exc:
|
||||
# An error could happen here when the signature task fails - we must
|
||||
# not leak the exception because this is an asyncSpawn task
|
||||
|
@ -998,8 +987,9 @@ proc sendAggregatedAttestations(
|
|||
node.network.broadcastAggregateAndProof(signedAP)
|
||||
|
||||
# The subnet on which the attestations (should have) arrived
|
||||
let subnet_id = compute_subnet_for_attestation(
|
||||
committees_per_slot, slot, data.committee_index)
|
||||
let
|
||||
subnet_id = compute_subnet_for_attestation(
|
||||
committees_per_slot, slot, data.committee_index)
|
||||
notice "Aggregated attestation sent",
|
||||
aggregate = shortLog(signedAP.message.aggregate),
|
||||
aggregator_index = signedAP.message.aggregator_index,
|
||||
|
@ -1233,17 +1223,12 @@ proc sendAttestation*(node: BeaconNode,
|
|||
|
||||
let
|
||||
wallTime = node.processor.getCurrentBeaconTime()
|
||||
deadline = attestation.data.slot.attestation_deadline()
|
||||
(delayStr, delaySecs) =
|
||||
if wallTime < deadline:
|
||||
("-" & $(deadline - wallTime), -toFloatSeconds(deadline - wallTime))
|
||||
else:
|
||||
($(wallTime - deadline), toFloatSeconds(wallTime - deadline))
|
||||
delay = wallTime - attestation.data.slot.attestation_deadline()
|
||||
|
||||
notice "Attestation sent",
|
||||
attestation = shortLog(attestation), delay = delayStr, subnet_id
|
||||
attestation = shortLog(attestation), delay, subnet_id
|
||||
|
||||
beacon_attestation_sent_delay.observe(delaySecs)
|
||||
beacon_attestation_sent_delay.observe(delay.toFloatSeconds())
|
||||
|
||||
return SendResult.ok()
|
||||
|
||||
|
|
|
@ -8,8 +8,7 @@
|
|||
import
|
||||
std/[options, tables],
|
||||
metrics, chronicles,
|
||||
../spec/[crypto, beaconstate, forks, helpers, presets],
|
||||
../spec/datatypes/[phase0, altair],
|
||||
../spec/[beaconstate, forks, helpers],
|
||||
../beacon_clock
|
||||
|
||||
{.push raises: [Defect].}
|
||||
|
@ -461,7 +460,7 @@ proc registerEpochInfo*(
|
|||
validator_monitor_prev_epoch_on_chain_attester_miss.inc(1, [metricId])
|
||||
validator_monitor_prev_epoch_on_chain_source_attester_miss.inc(1, [metricId])
|
||||
|
||||
warn "Previous epoch attestation missing",
|
||||
notice "Previous epoch attestation missing",
|
||||
epoch = prev_epoch,
|
||||
validator = id
|
||||
|
||||
|
@ -642,7 +641,7 @@ proc registerAttestation*(
|
|||
if not self.totals:
|
||||
info "Attestation seen",
|
||||
attestation = shortLog(attestation),
|
||||
src, epoch = slot.epoch, validator = id
|
||||
src, epoch = slot.epoch, validator = id, delay
|
||||
|
||||
self.withEpochSummary(monitor, slot.epoch):
|
||||
epochSummary.attestations += 1
|
||||
|
@ -668,7 +667,7 @@ proc registerAggregate*(
|
|||
if not self.totals:
|
||||
info "Aggregated attestion seen",
|
||||
aggregate = shortLog(aggregate_and_proof.aggregate),
|
||||
src, epoch = slot.epoch, validator = id
|
||||
src, epoch = slot.epoch, validator = id, delay
|
||||
|
||||
self.withEpochSummary(monitor, slot.epoch):
|
||||
epochSummary.aggregates += 1
|
||||
|
@ -693,11 +692,11 @@ proc registerAttestationInBlock*(
|
|||
self: var ValidatorMonitor,
|
||||
data: AttestationData,
|
||||
attesting_index: ValidatorIndex,
|
||||
blck: auto) =
|
||||
block_slot: Slot) =
|
||||
self.withMonitor(attesting_index):
|
||||
let
|
||||
id = monitor.id
|
||||
inclusion_lag = (blck.slot - data.slot) - MIN_ATTESTATION_INCLUSION_DELAY
|
||||
inclusion_lag = (block_slot - data.slot) - MIN_ATTESTATION_INCLUSION_DELAY
|
||||
epoch = data.slot.epoch
|
||||
|
||||
validator_monitor_attestation_in_block.inc(1, ["block", metricId])
|
||||
|
@ -709,7 +708,7 @@ proc registerAttestationInBlock*(
|
|||
if not self.totals:
|
||||
info "Attestation included in block",
|
||||
attestation_data = shortLog(data),
|
||||
block_slot = blck.slot,
|
||||
block_slot,
|
||||
inclusion_lag_slots = inclusion_lag,
|
||||
epoch = epoch, validator = id
|
||||
|
||||
|
@ -722,7 +721,7 @@ proc registerBeaconBlock*(
|
|||
self: var ValidatorMonitor,
|
||||
src: MsgSource,
|
||||
seen_timestamp: BeaconTime,
|
||||
blck: auto) =
|
||||
blck: ForkyTrustedBeaconBlock) =
|
||||
self.withMonitor(blck.proposer_index):
|
||||
let
|
||||
id = monitor.id
|
||||
|
@ -735,7 +734,7 @@ proc registerBeaconBlock*(
|
|||
|
||||
if not self.totals:
|
||||
info "Block seen",
|
||||
blck = shortLog(blck), src, epoch = slot.epoch, validator = id
|
||||
blck = shortLog(blck), src, epoch = slot.epoch, validator = id, delay
|
||||
|
||||
proc registerSyncCommitteeMessage*(
|
||||
self: var ValidatorMonitor,
|
||||
|
@ -755,7 +754,7 @@ proc registerSyncCommitteeMessage*(
|
|||
if not self.totals:
|
||||
info "Sync committee message seen",
|
||||
syncCommitteeMessage = shortLog(sync_committee_message.beacon_block_root),
|
||||
src, epoch = slot.epoch, validator = id
|
||||
src, epoch = slot.epoch, validator = id, delay
|
||||
|
||||
self.withEpochSummary(monitor, slot.epoch):
|
||||
epochSummary.sync_committee_messages += 1
|
||||
|
@ -782,7 +781,7 @@ proc registerSyncContribution*(
|
|||
if not self.totals:
|
||||
info "Sync contribution seen",
|
||||
contribution = shortLog(contribution_and_proof.contribution),
|
||||
src, epoch = slot.epoch, validator = id
|
||||
src, epoch = slot.epoch, validator = id, delay
|
||||
|
||||
self.withEpochSummary(monitor, slot.epoch):
|
||||
epochSummary.sync_contributions += 1
|
||||
|
|
|
@ -4,7 +4,7 @@
|
|||
|
||||
The validator monitoring feature allows for tracking the life-cycle and performance of one or more validators in detail.
|
||||
|
||||
Monitoring can be carried out for any validator, with slightly more detail for validators that are running through the same beacon node.
|
||||
Monitoring can be carried out for any validator, with slightly more detail for validators that are running in the same beacon node.
|
||||
|
||||
Every time the validator performs a duty, the duty is recorded and the monitor keeps track of the reward-related events for having performed it. For example:
|
||||
|
||||
|
@ -13,7 +13,7 @@ Every time the validator performs a duty, the duty is recorded and the monitor k
|
|||
|
||||
Validator actions can be traced either through logging, or comprehensive metrics that allow for creating alerts in monitoring tools.
|
||||
|
||||
The metrics are broadly compatible with [Lighthouse](https://lighthouse-book.sigmaprime.io/validator-monitoring.html), thus dashboards and alerts can be used with either client.
|
||||
The metrics are broadly compatible with [Lighthouse](https://lighthouse-book.sigmaprime.io/validator-monitoring.html), thus dashboards and alerts can be used with either client with minor adjustments.
|
||||
|
||||
## Enabling validator monitoring
|
||||
|
||||
|
@ -54,6 +54,8 @@ Failures at any point are recorded at a higher logging level, such as `NOT`(ice)
|
|||
NOT 2021-11-17 20:53:42.108+01:00 Attestation failed to match head topics="chaindag" epoch=81972 validator=...
|
||||
```
|
||||
|
||||
Failures are reported with a lag of two epochs (~13 minutes) - to examine the log for potential root causes, the logs from the epoch in the failure message should be looked at.
|
||||
|
||||
> ⚠️ It should be noted that metrics are tracked for the current history - in the case of a reorg on the chain - in particular a deep reorg - no attempt is made to revisit previously reported values. In the case that finality is delayed, the risk of stale metrics increases.
|
||||
|
||||
Likewise, many metrics, such as aggregation inclusion, reflect conditions on the network - it may happen that the same message is counted more than once under certain conditions.
|
||||
|
|
Loading…
Reference in New Issue