Enable validator monitor by default (#4468)
By enabling the validator monitor, more precise information about the lifecycle of an attestation is logged at the higher `NOTICE` log level while current `sent` messages are logged at `INF` instead, since they are less interesting. In particular, missed attestations and those that vote for the wrong head are now detected and logged at NOTICE. In addition to logging, this feature enables rich metrics around attestation and sync committee performance - by default, validators are tracked in aggregate but a detailed mode exists as well. This feature has been available since early Nimbus days, but it has now been tuned and optimised such that it is safe to enable by default, even for large setups. * enable automatic validator monitoring by default * replace `--validator-monitor-totals` flag with `--validator-monitor-details` - the detailed mode is disabled by default * lower "sent" log level to `INF` for several messages - in particular those that are traced by the validator monitor This is a retake on #3531 which was later reverted in #3578.
This commit is contained in:
parent
7ad0d3e6c2
commit
d8caab500d
|
@ -544,18 +544,23 @@ type
|
|||
name: "terminal-total-difficulty-override" .}: Option[string]
|
||||
|
||||
validatorMonitorAuto* {.
|
||||
desc: "Automatically monitor locally active validators (BETA)"
|
||||
defaultValue: false
|
||||
desc: "Monitor validator activity automatically for validators active on this beacon node"
|
||||
defaultValue: true
|
||||
name: "validator-monitor-auto" .}: bool
|
||||
|
||||
validatorMonitorPubkeys* {.
|
||||
desc: "One or more validators to monitor - works best when --subscribe-all-subnets is enabled (BETA)"
|
||||
desc: "One or more validators to monitor - works best when --subscribe-all-subnets is enabled"
|
||||
name: "validator-monitor-pubkey" .}: seq[ValidatorPubKey]
|
||||
|
||||
validatorMonitorDetails* {.
|
||||
desc: "Publish detailed metrics for each validator individually - may incur significant overhead with large numbers of validators"
|
||||
defaultValue: false
|
||||
name: "validator-monitor-details" .}: bool
|
||||
|
||||
validatorMonitorTotals* {.
|
||||
desc: "Publish metrics to single 'totals' label for better collection performance when monitoring many validators (BETA)"
|
||||
defaultValue: false
|
||||
name: "validator-monitor-totals" .}: bool
|
||||
hidden
|
||||
desc: "Deprecated in favour of --validator-monitor-details"
|
||||
name: "validator-monitor-totals" .}: Option[bool]
|
||||
|
||||
safeSlotsToImportOptimistically* {.
|
||||
# Never unhidden or documented, and deprecated > 22.9.1
|
||||
|
|
|
@ -530,9 +530,14 @@ proc init*(T: type BeaconNode,
|
|||
# Doesn't use std/random directly, but dependencies might
|
||||
randomize(rng[].rand(high(int)))
|
||||
|
||||
# The validatorMonitorTotals flag has been deprecated and should eventually be
|
||||
# removed - until then, it's given priority if set so as not to needlessly
|
||||
# break existing setups
|
||||
let
|
||||
validatorMonitor = newClone(ValidatorMonitor.init(
|
||||
config.validatorMonitorAuto, config.validatorMonitorTotals))
|
||||
config.validatorMonitorAuto,
|
||||
config.validatorMonitorTotals.get(
|
||||
not config.validatorMonitorDetails)))
|
||||
|
||||
for key in config.validatorMonitorPubkeys:
|
||||
validatorMonitor[].addMonitor(key, Opt.none(ValidatorIndex))
|
||||
|
@ -1782,6 +1787,7 @@ proc doRunBeaconNode(config: var BeaconNodeConf, rng: ref HmacDrbgContext) {.rai
|
|||
ignoreDeprecatedOption safeSlotsToImportOptimistically
|
||||
ignoreDeprecatedOption terminalTotalDifficultyOverride
|
||||
ignoreDeprecatedOption optimistic
|
||||
ignoreDeprecatedOption validatorMonitorTotals
|
||||
|
||||
createPidFile(config.dataDir.string / "beacon_node.pid")
|
||||
|
||||
|
|
|
@ -159,7 +159,7 @@ proc routeAttestation*(
|
|||
beacon_attestations_sent.inc()
|
||||
beacon_attestation_sent_delay.observe(delay.toFloatSeconds())
|
||||
|
||||
notice "Attestation sent",
|
||||
info "Attestation sent",
|
||||
attestation = shortLog(attestation), delay, subnet_id
|
||||
else: # "no broadcast" is not a fatal error
|
||||
notice "Attestation not sent",
|
||||
|
@ -223,7 +223,7 @@ proc routeSignedAggregateAndProof*(
|
|||
if res.isOk():
|
||||
beacon_aggregates_sent.inc()
|
||||
|
||||
notice "Aggregated attestation sent",
|
||||
info "Aggregated attestation sent",
|
||||
attestation = shortLog(proof.message.aggregate),
|
||||
aggregator_index = proof.message.aggregator_index,
|
||||
selection_proof = shortLog(proof.message.selection_proof),
|
||||
|
@ -260,7 +260,7 @@ proc routeSyncCommitteeMessage*(
|
|||
beacon_sync_committee_messages_sent.inc()
|
||||
beacon_sync_committee_message_sent_delay.observe(delay.toFloatSeconds())
|
||||
|
||||
notice "Sync committee message sent", message = shortLog(msg), delay
|
||||
info "Sync committee message sent", message = shortLog(msg), delay
|
||||
else: # "no broadcast" is not a fatal error
|
||||
notice "Sync committee message not sent",
|
||||
message = shortLog(msg), error = res.error()
|
||||
|
@ -375,7 +375,7 @@ proc routeSignedContributionAndProof*(
|
|||
let res = await router[].network.broadcastSignedContributionAndProof(msg)
|
||||
if res.isOk():
|
||||
beacon_sync_committee_contributions_sent.inc()
|
||||
notice "Contribution sent",
|
||||
info "Contribution sent",
|
||||
contribution = shortLog(msg.message.contribution),
|
||||
aggregator_index = msg.message.aggregator_index,
|
||||
selection_proof = shortLog(msg.message.selection_proof),
|
||||
|
|
|
@ -473,24 +473,27 @@ proc registerEpochInfo*(
|
|||
epoch = prev_epoch,
|
||||
validator = id
|
||||
|
||||
# Indicates if any on-chain attestation hit the head.
|
||||
if previous_epoch_matched_head:
|
||||
validator_monitor_prev_epoch_on_chain_head_attester_hit.inc(1, [metricId])
|
||||
else:
|
||||
validator_monitor_prev_epoch_on_chain_head_attester_miss.inc(1, [metricId])
|
||||
notice "Attestation failed to match head",
|
||||
epoch = prev_epoch,
|
||||
validator = id
|
||||
|
||||
# Indicates if any on-chain attestation hit the target.
|
||||
if previous_epoch_matched_target:
|
||||
validator_monitor_prev_epoch_on_chain_target_attester_hit.inc(1, [metricId])
|
||||
else:
|
||||
validator_monitor_prev_epoch_on_chain_target_attester_miss.inc(1, [metricId])
|
||||
|
||||
notice "Attestation failed to match target",
|
||||
epoch = prev_epoch,
|
||||
validator = id
|
||||
if previous_epoch_matched_source:
|
||||
notice "Attestation failed to match target and head",
|
||||
epoch = prev_epoch,
|
||||
validator = id
|
||||
|
||||
# Indicates if any on-chain attestation hit the head.
|
||||
if previous_epoch_matched_head:
|
||||
validator_monitor_prev_epoch_on_chain_head_attester_hit.inc(1, [metricId])
|
||||
else:
|
||||
validator_monitor_prev_epoch_on_chain_head_attester_miss.inc(1, [metricId])
|
||||
if previous_epoch_matched_target:
|
||||
notice "Attestation failed to match head",
|
||||
epoch = prev_epoch,
|
||||
validator = id
|
||||
|
||||
|
||||
when state isnot phase0.BeaconState: # altair+
|
||||
# Indicates the number of sync committee signatures that made it into
|
||||
|
|
|
@ -1,6 +1,9 @@
|
|||
# Validator monitoring
|
||||
|
||||
The validator monitoring feature allows for tracking the life-cycle and performance of one or more validators in detail.
|
||||
!!! note ""
|
||||
This feature is available from `v23.1.0` onwards - earlier Nimbus versions included a preview version of this feature behind a feature flag without enabling it by default.
|
||||
|
||||
The validator monitoring feature allows for tracking the life cycle and performance of one or more validators in detail.
|
||||
|
||||
Monitoring can be carried out for any validator, with slightly more detail for validators that are running in the same beacon node.
|
||||
|
||||
|
@ -13,30 +16,35 @@ Validator actions can be traced either through logging, or comprehensive metrics
|
|||
|
||||
The metrics are broadly compatible with [Lighthouse](https://lighthouse-book.sigmaprime.io/validator-monitoring.html), thus dashboards and alerts can be used with either client with minor adjustments.
|
||||
|
||||
## Enabling validator monitoring
|
||||
## Command line options
|
||||
|
||||
The monitor can be enabled either for all keys that are used with a particular beacon node, or for a specific list of validators, or both.
|
||||
The monitor is by default enabled for all keys that are validating via the beacon node. It can also be configured to monitor a specific list of validators, or be disabled entirely with `--validator-monitor-auto=false`.
|
||||
|
||||
```
|
||||
# Enable automatic monitoring of all validators used with this beacon node
|
||||
./run-mainnet-beacon-node.sh --validator-monitor-auto
|
||||
The `--validator-monitor-details` flag can be used to enable the detailed monitor mode - in this mode, the performance of each validator is monitored individually in metrics leading to a more detailed view of performance.
|
||||
|
||||
!!! tip
|
||||
The detailed mode significantly increases the total number of published metrics for each monitored validator - when used with more than 10 validators, it may adversely impact performance of metrics collection and display.
|
||||
|
||||
```sh
|
||||
# Disable automatic monitoring of all validators used with this beacon node
|
||||
./run-mainnet-beacon-node.sh --validator-monitor-auto=false ...
|
||||
|
||||
# Enable monitoring of one or more specific validators
|
||||
./run-mainnet-beacon-node.sh \
|
||||
--validator-monitor-pubkey=0xa1d1ad0714035353258038e964ae9675dc0252ee22cea896825c01458e1807bfad2f9969338798548d9858a571f7425c \
|
||||
--validator-monitor-pubkey=0xb2ff4716ed345b05dd1dfc6a5a9fa70856d8c75dcc9e881dd2f766d5f891326f0d10e96f3a444ce6c912b69c22c6754d
|
||||
--validator-monitor-pubkey=0xb2ff4716ed345b05dd1dfc6a5a9fa70856d8c75dcc9e881dd2f766d5f891326f0d10e96f3a444ce6c912b69c22c6754d ...
|
||||
|
||||
# Publish metrics as totals for all monitored validators instead of each validator separately - used for limiting the load on metrics when monitoring many validators
|
||||
./run-mainnet-beacon-node.sh --validator-monitor-totals
|
||||
# Publish detailed metrics for each monitored validator individually instead of an aggregate totals value
|
||||
./run-mainnet-beacon-node.sh --validator-monitor-details ...
|
||||
```
|
||||
|
||||
## Understanding monitoring
|
||||
|
||||
When a validator performs a duty, such as signing an attestation or a sync committee message, this is broadcast to the network. Other nodes pick it up and package the message into an aggregate and later a block. The block is included in the canonical chain and a reward is given two epochs (~13 minutes) later.
|
||||
|
||||
The monitor tracks these actions and will log each step at the `INF` level. If any step is missed, a `NOT` log is shown instead.
|
||||
The monitor tracks each of these actions and will in detailed mode log each step at the `INF` level. If any step is missed (irrespective of detail mode), a `NOT` log is shown instead.
|
||||
|
||||
The typical lifecycle of an attestation might look something like the following:
|
||||
The typical life cycle of an attestation might look something like the following:
|
||||
|
||||
```
|
||||
INF 2021-11-22 11:32:44.228+01:00 Attestation seen topics="val_mon" attestation="(aggregation_bits: 0b0000000000000000000000000000000000000100000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, data: (slot: 2656363, index: 11, beacon_block_root: \"bbe7fc25\", source: \"83010:a8a1b125\", target: \"83011:6db281cd\"), signature: \"b88ef2f2\")" src=api epoch=83011 validator=b93c290b
|
||||
|
@ -44,7 +52,7 @@ INF 2021-11-22 11:32:51.293+01:00 Attestation included in aggregate top
|
|||
INF 2021-11-22 11:33:07.193+01:00 Attestation included in block attestation_data="(slot: 2656364, index: 9, beacon_block_root: \"c7761767\", source: \"83010:a8a1b125\", target: \"83011:6db281cd\")" block_slot=2656365 inclusion_lag_slots=0 epoch=83011 validator=b65b6e1b
|
||||
```
|
||||
|
||||
The lifecycle of a particular message can be traced by following the `epoch=.... validator=...` fields in the message.
|
||||
The life cycle of a particular message can be traced by following the `epoch=.... validator=...` fields in the message.
|
||||
|
||||
Failures at any point are recorded at a higher logging level, such as `NOT`(ice):
|
||||
|
||||
|
@ -63,6 +71,6 @@ Likewise, many metrics, such as aggregation inclusion, reflect conditions on the
|
|||
|
||||
The full list of metrics supported by the validator monitoring feature can be seen in the [source code](https://github.com/status-im/nimbus-eth2/blob/unstable/beacon_chain/validators/validator_monitor.nim) or by examining the metrics output:
|
||||
|
||||
```
|
||||
```sh
|
||||
curl -s localhost:8008/metrics | grep HELP.*validator_
|
||||
```
|
||||
|
|
Loading…
Reference in New Issue