Validator monitor polish (#3569)
* lower "Previous epoch attestation missing" to `NOTICE` for easier filtering * add delay logging to validator monitor logs * simplify delay logging code post-`BeaconTime`
This commit is contained in:
parent
6f4fa32c1d
commit
30eef0a369
|
@ -200,7 +200,7 @@ proc storeBlock*(
|
||||||
"index has been checked"),
|
"index has been checked"),
|
||||||
attestation.aggregation_bits):
|
attestation.aggregation_bits):
|
||||||
vm[].registerAttestationInBlock(attestation.data, validator_index,
|
vm[].registerAttestationInBlock(attestation.data, validator_index,
|
||||||
trustedBlock.message)
|
trustedBlock.message.slot)
|
||||||
|
|
||||||
withState(dag[].clearanceState):
|
withState(dag[].clearanceState):
|
||||||
when stateFork >= BeaconStateFork.Altair and
|
when stateFork >= BeaconStateFork.Altair and
|
||||||
|
|
|
@ -386,20 +386,15 @@ proc createAndSendAttestation(node: BeaconNode,
|
||||||
dump(node.config.dumpDirOutgoing, attestation.data,
|
dump(node.config.dumpDirOutgoing, attestation.data,
|
||||||
validator.pubkey)
|
validator.pubkey)
|
||||||
|
|
||||||
let wallTime = node.beaconClock.now()
|
let
|
||||||
let deadline = attestationData.slot.attestation_deadline()
|
wallTime = node.beaconClock.now()
|
||||||
|
delay = wallTime - attestationData.slot.attestation_deadline()
|
||||||
let (delayStr, delaySecs) =
|
|
||||||
if wallTime < deadline:
|
|
||||||
("-" & $(deadline - wallTime), -toFloatSeconds(deadline - wallTime))
|
|
||||||
else:
|
|
||||||
($(wallTime - deadline), toFloatSeconds(wallTime - deadline))
|
|
||||||
|
|
||||||
notice "Attestation sent",
|
notice "Attestation sent",
|
||||||
attestation = shortLog(attestation), validator = shortLog(validator),
|
attestation = shortLog(attestation), validator = shortLog(validator),
|
||||||
delay = delayStr, subnet_id
|
delay, subnet_id
|
||||||
|
|
||||||
beacon_attestation_sent_delay.observe(delaySecs)
|
beacon_attestation_sent_delay.observe(delay.toFloatSeconds())
|
||||||
except CatchableError as exc:
|
except CatchableError as exc:
|
||||||
# An error could happen here when the signature task fails - we must
|
# An error could happen here when the signature task fails - we must
|
||||||
# not leak the exception because this is an asyncSpawn task
|
# not leak the exception because this is an asyncSpawn task
|
||||||
|
@ -734,20 +729,14 @@ proc createAndSendSyncCommitteeMessage(node: BeaconNode,
|
||||||
|
|
||||||
let
|
let
|
||||||
wallTime = node.beaconClock.now()
|
wallTime = node.beaconClock.now()
|
||||||
deadline = msg.slot.start_beacon_time() + syncCommitteeMessageSlotOffset
|
delay = wallTime - msg.slot.sync_committee_message_deadline()
|
||||||
|
|
||||||
let (delayStr, delaySecs) =
|
|
||||||
if wallTime < deadline:
|
|
||||||
("-" & $(deadline - wallTime), -toFloatSeconds(deadline - wallTime))
|
|
||||||
else:
|
|
||||||
($(wallTime - deadline), toFloatSeconds(wallTime - deadline))
|
|
||||||
|
|
||||||
notice "Sync committee message sent",
|
notice "Sync committee message sent",
|
||||||
message = shortLog(msg),
|
message = shortLog(msg),
|
||||||
validator = shortLog(validator),
|
validator = shortLog(validator),
|
||||||
delay = delayStr
|
delay
|
||||||
|
|
||||||
beacon_sync_committee_message_sent_delay.observe(delaySecs)
|
beacon_sync_committee_message_sent_delay.observe(delay.toFloatSeconds())
|
||||||
except CatchableError as exc:
|
except CatchableError as exc:
|
||||||
# An error could happen here when the signature task fails - we must
|
# An error could happen here when the signature task fails - we must
|
||||||
# not leak the exception because this is an asyncSpawn task
|
# not leak the exception because this is an asyncSpawn task
|
||||||
|
@ -998,8 +987,9 @@ proc sendAggregatedAttestations(
|
||||||
node.network.broadcastAggregateAndProof(signedAP)
|
node.network.broadcastAggregateAndProof(signedAP)
|
||||||
|
|
||||||
# The subnet on which the attestations (should have) arrived
|
# The subnet on which the attestations (should have) arrived
|
||||||
let subnet_id = compute_subnet_for_attestation(
|
let
|
||||||
committees_per_slot, slot, data.committee_index)
|
subnet_id = compute_subnet_for_attestation(
|
||||||
|
committees_per_slot, slot, data.committee_index)
|
||||||
notice "Aggregated attestation sent",
|
notice "Aggregated attestation sent",
|
||||||
aggregate = shortLog(signedAP.message.aggregate),
|
aggregate = shortLog(signedAP.message.aggregate),
|
||||||
aggregator_index = signedAP.message.aggregator_index,
|
aggregator_index = signedAP.message.aggregator_index,
|
||||||
|
@ -1233,17 +1223,12 @@ proc sendAttestation*(node: BeaconNode,
|
||||||
|
|
||||||
let
|
let
|
||||||
wallTime = node.processor.getCurrentBeaconTime()
|
wallTime = node.processor.getCurrentBeaconTime()
|
||||||
deadline = attestation.data.slot.attestation_deadline()
|
delay = wallTime - attestation.data.slot.attestation_deadline()
|
||||||
(delayStr, delaySecs) =
|
|
||||||
if wallTime < deadline:
|
|
||||||
("-" & $(deadline - wallTime), -toFloatSeconds(deadline - wallTime))
|
|
||||||
else:
|
|
||||||
($(wallTime - deadline), toFloatSeconds(wallTime - deadline))
|
|
||||||
|
|
||||||
notice "Attestation sent",
|
notice "Attestation sent",
|
||||||
attestation = shortLog(attestation), delay = delayStr, subnet_id
|
attestation = shortLog(attestation), delay, subnet_id
|
||||||
|
|
||||||
beacon_attestation_sent_delay.observe(delaySecs)
|
beacon_attestation_sent_delay.observe(delay.toFloatSeconds())
|
||||||
|
|
||||||
return SendResult.ok()
|
return SendResult.ok()
|
||||||
|
|
||||||
|
|
|
@ -8,8 +8,7 @@
|
||||||
import
|
import
|
||||||
std/[options, tables],
|
std/[options, tables],
|
||||||
metrics, chronicles,
|
metrics, chronicles,
|
||||||
../spec/[crypto, beaconstate, forks, helpers, presets],
|
../spec/[beaconstate, forks, helpers],
|
||||||
../spec/datatypes/[phase0, altair],
|
|
||||||
../beacon_clock
|
../beacon_clock
|
||||||
|
|
||||||
{.push raises: [Defect].}
|
{.push raises: [Defect].}
|
||||||
|
@ -461,7 +460,7 @@ proc registerEpochInfo*(
|
||||||
validator_monitor_prev_epoch_on_chain_attester_miss.inc(1, [metricId])
|
validator_monitor_prev_epoch_on_chain_attester_miss.inc(1, [metricId])
|
||||||
validator_monitor_prev_epoch_on_chain_source_attester_miss.inc(1, [metricId])
|
validator_monitor_prev_epoch_on_chain_source_attester_miss.inc(1, [metricId])
|
||||||
|
|
||||||
warn "Previous epoch attestation missing",
|
notice "Previous epoch attestation missing",
|
||||||
epoch = prev_epoch,
|
epoch = prev_epoch,
|
||||||
validator = id
|
validator = id
|
||||||
|
|
||||||
|
@ -642,7 +641,7 @@ proc registerAttestation*(
|
||||||
if not self.totals:
|
if not self.totals:
|
||||||
info "Attestation seen",
|
info "Attestation seen",
|
||||||
attestation = shortLog(attestation),
|
attestation = shortLog(attestation),
|
||||||
src, epoch = slot.epoch, validator = id
|
src, epoch = slot.epoch, validator = id, delay
|
||||||
|
|
||||||
self.withEpochSummary(monitor, slot.epoch):
|
self.withEpochSummary(monitor, slot.epoch):
|
||||||
epochSummary.attestations += 1
|
epochSummary.attestations += 1
|
||||||
|
@ -668,7 +667,7 @@ proc registerAggregate*(
|
||||||
if not self.totals:
|
if not self.totals:
|
||||||
info "Aggregated attestion seen",
|
info "Aggregated attestion seen",
|
||||||
aggregate = shortLog(aggregate_and_proof.aggregate),
|
aggregate = shortLog(aggregate_and_proof.aggregate),
|
||||||
src, epoch = slot.epoch, validator = id
|
src, epoch = slot.epoch, validator = id, delay
|
||||||
|
|
||||||
self.withEpochSummary(monitor, slot.epoch):
|
self.withEpochSummary(monitor, slot.epoch):
|
||||||
epochSummary.aggregates += 1
|
epochSummary.aggregates += 1
|
||||||
|
@ -693,11 +692,11 @@ proc registerAttestationInBlock*(
|
||||||
self: var ValidatorMonitor,
|
self: var ValidatorMonitor,
|
||||||
data: AttestationData,
|
data: AttestationData,
|
||||||
attesting_index: ValidatorIndex,
|
attesting_index: ValidatorIndex,
|
||||||
blck: auto) =
|
block_slot: Slot) =
|
||||||
self.withMonitor(attesting_index):
|
self.withMonitor(attesting_index):
|
||||||
let
|
let
|
||||||
id = monitor.id
|
id = monitor.id
|
||||||
inclusion_lag = (blck.slot - data.slot) - MIN_ATTESTATION_INCLUSION_DELAY
|
inclusion_lag = (block_slot - data.slot) - MIN_ATTESTATION_INCLUSION_DELAY
|
||||||
epoch = data.slot.epoch
|
epoch = data.slot.epoch
|
||||||
|
|
||||||
validator_monitor_attestation_in_block.inc(1, ["block", metricId])
|
validator_monitor_attestation_in_block.inc(1, ["block", metricId])
|
||||||
|
@ -709,7 +708,7 @@ proc registerAttestationInBlock*(
|
||||||
if not self.totals:
|
if not self.totals:
|
||||||
info "Attestation included in block",
|
info "Attestation included in block",
|
||||||
attestation_data = shortLog(data),
|
attestation_data = shortLog(data),
|
||||||
block_slot = blck.slot,
|
block_slot,
|
||||||
inclusion_lag_slots = inclusion_lag,
|
inclusion_lag_slots = inclusion_lag,
|
||||||
epoch = epoch, validator = id
|
epoch = epoch, validator = id
|
||||||
|
|
||||||
|
@ -722,7 +721,7 @@ proc registerBeaconBlock*(
|
||||||
self: var ValidatorMonitor,
|
self: var ValidatorMonitor,
|
||||||
src: MsgSource,
|
src: MsgSource,
|
||||||
seen_timestamp: BeaconTime,
|
seen_timestamp: BeaconTime,
|
||||||
blck: auto) =
|
blck: ForkyTrustedBeaconBlock) =
|
||||||
self.withMonitor(blck.proposer_index):
|
self.withMonitor(blck.proposer_index):
|
||||||
let
|
let
|
||||||
id = monitor.id
|
id = monitor.id
|
||||||
|
@ -735,7 +734,7 @@ proc registerBeaconBlock*(
|
||||||
|
|
||||||
if not self.totals:
|
if not self.totals:
|
||||||
info "Block seen",
|
info "Block seen",
|
||||||
blck = shortLog(blck), src, epoch = slot.epoch, validator = id
|
blck = shortLog(blck), src, epoch = slot.epoch, validator = id, delay
|
||||||
|
|
||||||
proc registerSyncCommitteeMessage*(
|
proc registerSyncCommitteeMessage*(
|
||||||
self: var ValidatorMonitor,
|
self: var ValidatorMonitor,
|
||||||
|
@ -755,7 +754,7 @@ proc registerSyncCommitteeMessage*(
|
||||||
if not self.totals:
|
if not self.totals:
|
||||||
info "Sync committee message seen",
|
info "Sync committee message seen",
|
||||||
syncCommitteeMessage = shortLog(sync_committee_message.beacon_block_root),
|
syncCommitteeMessage = shortLog(sync_committee_message.beacon_block_root),
|
||||||
src, epoch = slot.epoch, validator = id
|
src, epoch = slot.epoch, validator = id, delay
|
||||||
|
|
||||||
self.withEpochSummary(monitor, slot.epoch):
|
self.withEpochSummary(monitor, slot.epoch):
|
||||||
epochSummary.sync_committee_messages += 1
|
epochSummary.sync_committee_messages += 1
|
||||||
|
@ -782,7 +781,7 @@ proc registerSyncContribution*(
|
||||||
if not self.totals:
|
if not self.totals:
|
||||||
info "Sync contribution seen",
|
info "Sync contribution seen",
|
||||||
contribution = shortLog(contribution_and_proof.contribution),
|
contribution = shortLog(contribution_and_proof.contribution),
|
||||||
src, epoch = slot.epoch, validator = id
|
src, epoch = slot.epoch, validator = id, delay
|
||||||
|
|
||||||
self.withEpochSummary(monitor, slot.epoch):
|
self.withEpochSummary(monitor, slot.epoch):
|
||||||
epochSummary.sync_contributions += 1
|
epochSummary.sync_contributions += 1
|
||||||
|
|
|
@ -4,7 +4,7 @@
|
||||||
|
|
||||||
The validator monitoring feature allows for tracking the life-cycle and performance of one or more validators in detail.
|
The validator monitoring feature allows for tracking the life-cycle and performance of one or more validators in detail.
|
||||||
|
|
||||||
Monitoring can be carried out for any validator, with slightly more detail for validators that are running through the same beacon node.
|
Monitoring can be carried out for any validator, with slightly more detail for validators that are running in the same beacon node.
|
||||||
|
|
||||||
Every time the validator performs a duty, the duty is recorded and the monitor keeps track of the reward-related events for having performed it. For example:
|
Every time the validator performs a duty, the duty is recorded and the monitor keeps track of the reward-related events for having performed it. For example:
|
||||||
|
|
||||||
|
@ -13,7 +13,7 @@ Every time the validator performs a duty, the duty is recorded and the monitor k
|
||||||
|
|
||||||
Validator actions can be traced either through logging, or comprehensive metrics that allow for creating alerts in monitoring tools.
|
Validator actions can be traced either through logging, or comprehensive metrics that allow for creating alerts in monitoring tools.
|
||||||
|
|
||||||
The metrics are broadly compatible with [Lighthouse](https://lighthouse-book.sigmaprime.io/validator-monitoring.html), thus dashboards and alerts can be used with either client.
|
The metrics are broadly compatible with [Lighthouse](https://lighthouse-book.sigmaprime.io/validator-monitoring.html), thus dashboards and alerts can be used with either client with minor adjustments.
|
||||||
|
|
||||||
## Enabling validator monitoring
|
## Enabling validator monitoring
|
||||||
|
|
||||||
|
@ -54,6 +54,8 @@ Failures at any point are recorded at a higher logging level, such as `NOT`(ice)
|
||||||
NOT 2021-11-17 20:53:42.108+01:00 Attestation failed to match head topics="chaindag" epoch=81972 validator=...
|
NOT 2021-11-17 20:53:42.108+01:00 Attestation failed to match head topics="chaindag" epoch=81972 validator=...
|
||||||
```
|
```
|
||||||
|
|
||||||
|
Failures are reported with a lag of two epochs (~13 minutes) - to examine the log for potential root causes, the logs from the epoch in the failure message should be looked at.
|
||||||
|
|
||||||
> ⚠️ It should be noted that metrics are tracked for the current history - in the case of a reorg on the chain - in particular a deep reorg - no attempt is made to revisit previously reported values. In the case that finality is delayed, the risk of stale metrics increases.
|
> ⚠️ It should be noted that metrics are tracked for the current history - in the case of a reorg on the chain - in particular a deep reorg - no attempt is made to revisit previously reported values. In the case that finality is delayed, the risk of stale metrics increases.
|
||||||
|
|
||||||
Likewise, many metrics, such as aggregation inclusion, reflect conditions on the network - it may happen that the same message is counted more than once under certain conditions.
|
Likewise, many metrics, such as aggregation inclusion, reflect conditions on the network - it may happen that the same message is counted more than once under certain conditions.
|
||||||
|
|
Loading…
Reference in New Issue