Assorted logging improvements (#3237)

* log doppelganger detection when it activates and when it causes missed duties
* less prominent eth1 sync progress
* log in-progress sync at notice only when actually missing duties
* better detail in replay log
* don't log finalization checkpoints - this is quite verbose when syncing and already included in "Slot start"
Jacek Sieka 2022-01-03 22:18:49 +01:00 committed by GitHub
parent 3c63a78c01
commit c4ce59e55b
6 changed files with 104 additions and 53 deletions
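
The common thread in the sync and doppelganger changes below is that "skipping
validator duties" is only worth a notice-level log when the current slot is one
where the validator would actually have attested or proposed; otherwise the
message drops to debug. A minimal, self-contained Nim sketch of that gating
(illustrative names only, not the nimbus-eth2 API):

  type Slot = uint64

  const FAR_FUTURE_SLOT = Slot(high(uint64))

  proc skippedDutiesLogLevel(
      slot, nextAttestationSlot, nextProposalSlot: Slot): string =
    ## "notice" when skipping duties at this slot costs the validator an
    ## attestation or proposal, "debug" otherwise.
    if slot == nextAttestationSlot or slot == nextProposalSlot:
      result = "notice"
    else:
      result = "debug"

  when isMainModule:
    # An attestation is due this slot: the skip is worth surfacing.
    doAssert skippedDutiesLogLevel(10, 10, FAR_FUTURE_SLOT) == "notice"
    # Nothing is scheduled this slot: keep the log quiet.
    doAssert skippedDutiesLogLevel(10, 12, FAR_FUTURE_SLOT) == "debug"

The diffs below express the same test as "slot in [nextAttestationSlot,
nextProposalSlot]", with FAR_FUTURE_SLOT standing in when no duty is scheduled.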

View File

@@ -853,7 +853,9 @@ proc updateStateData*(
   # an earlier state must be loaded since there's no way to undo the slot
   # transitions
-  let startTick = Moment.now()
+  let
+    startTick = Moment.now()
+    current {.used.} = state.blck.atSlot(getStateField(state.data, slot))
   var
     ancestors: seq[BlockRef]
@@ -931,7 +933,7 @@
   if not found:
     debug "UpdateStateData cache miss",
-      bs, stateBlock = state.blck, stateSlot = getStateField(state.data, slot)
+      current = shortLog(current), target = shortLog(bs)

   # Either the state is too new or was created by applying a different block.
   # We'll now resort to loading the state from the database then reapplying
@@ -967,8 +969,9 @@
   # Starting state has been assigned, either from memory or database
   let
     assignTick = Moment.now()
-    startSlot {.used.} = getStateField(state.data, slot) # used in logs below
-    startRoot {.used.} = getStateRoot(state.data)
+    ancestor {.used.} = state.blck.atSlot(getStateField(state.data, slot))
+    ancestorRoot {.used.} = getStateRoot(state.data)
   var info: ForkedEpochInfo
   # Time to replay all the blocks between then and now
   for i in countdown(ancestors.len - 1, 0):
@@ -990,27 +993,47 @@
     assignDur = assignTick - startTick
     replayDur = Moment.now() - assignTick

-  logScope:
-    blocks = ancestors.len
-    slots = getStateField(state.data, slot) - startSlot
-    stateRoot = shortLog(getStateRoot(state.data))
-    stateSlot = getStateField(state.data, slot)
-    startRoot = shortLog(startRoot)
-    startSlot
-    blck = shortLog(bs)
-    found
-    assignDur
-    replayDur
+  # TODO https://github.com/status-im/nim-chronicles/issues/108
   if (assignDur + replayDur) >= 250.millis:
     # This might indicate there's a cache that's not in order or a disk that is
     # too slow - for now, it's here for investigative purposes and the cutoff
     # time might need tuning
-    info "State replayed"
+    info "State replayed",
+      blocks = ancestors.len,
+      slots = getStateField(state.data, slot) - ancestor.slot,
+      current = shortLog(current),
+      ancestor = shortLog(ancestor),
+      target = shortLog(bs),
+      ancestorStateRoot = shortLog(ancestorRoot),
+      targetStateRoot = shortLog(getStateRoot(state.data)),
+      found,
+      assignDur,
+      replayDur
   elif ancestors.len > 0:
-    debug "State replayed"
-  else:
-    trace "State advanced" # Normal case!
+    debug "State replayed",
+      blocks = ancestors.len,
+      slots = getStateField(state.data, slot) - ancestor.slot,
+      current = shortLog(current),
+      ancestor = shortLog(ancestor),
+      target = shortLog(bs),
+      ancestorStateRoot = shortLog(ancestorRoot),
+      targetStateRoot = shortLog(getStateRoot(state.data)),
+      found,
+      assignDur,
+      replayDur
+  else: # Normal case!
+    trace "State advanced",
+      blocks = ancestors.len,
+      slots = getStateField(state.data, slot) - ancestor.slot,
+      current = shortLog(current),
+      ancestor = shortLog(ancestor),
+      target = shortLog(bs),
+      ancestorStateRoot = shortLog(ancestorRoot),
+      targetStateRoot = shortLog(getStateRoot(state.data)),
+      found,
+      assignDur,
+      replayDur

 proc delState(dag: ChainDAGRef, bs: BlockSlot) =
   # Delete state state and mapping for a particular block+slot
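
The replay hunk above keeps a single timing-based escalation in place of the
old logScope: the same fields are logged at info when assignment plus replay
takes 250 ms or more, at debug when any blocks were replayed, and at trace on
the common fast path. A rough, standard-library-only Nim sketch of that
pattern (the real code uses chronos Moment and chronicles; only the 250 ms
cutoff is taken from the diff):

  import std/[monotimes, times]

  proc reportReplay(blocksReplayed: int, assignDur, replayDur: Duration) =
    # Escalate the log level based on how expensive the state update was.
    if assignDur + replayDur >= initDuration(milliseconds = 250):
      echo "info: State replayed (slow), took ", assignDur + replayDur
    elif blocksReplayed > 0:
      echo "debug: State replayed, blocks=", blocksReplayed
    else:
      echo "trace: State advanced"  # normal case: no replay was needed

  when isMainModule:
    let startTick = getMonoTime()
    let assignTick = getMonoTime()  # pretend state assignment happened here
    reportReplay(0, assignTick - startTick, getMonoTime() - assignTick)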
@@ -1340,7 +1363,7 @@ proc updateHead*(
     beacon_current_active_validators.set(number_of_active_validators)

   if finalizedHead != dag.finalizedHead:
-    info "Reached new finalization checkpoint",
+    debug "Reached new finalization checkpoint",
       head = shortLog(dag.headState.blck),
       stateRoot = shortLog(getStateRoot(dag.headState.data)),
       justified = shortLog(getStateField(

View File

@@ -1054,7 +1054,7 @@ proc syncBlockRange(m: Eth1Monitor,
       else:
         discard

-    notice "Eth1 sync progress",
+    info "Eth1 sync progress",
       blockNumber = lastBlock.number,
       depositsProcessed = lastBlock.voteData.deposit_count

View File

@@ -152,7 +152,8 @@ proc new*(T: type Eth2Processor,
   (ref Eth2Processor)(
     doppelGangerDetectionEnabled: doppelGangerDetectionEnabled,
     doppelgangerDetection: DoppelgangerProtection(
-      nodeLaunchSlot: getBeaconTime().slotOrZero),
+      nodeLaunchSlot: getBeaconTime().slotOrZero,
+      broadcastStartEpoch: FAR_FUTURE_EPOCH),
     blockProcessor: blockProcessor,
     validatorMonitor: validatorMonitor,
     dag: dag,
@@ -226,6 +227,25 @@
   v

+proc setupDoppelgangerDetection*(self: var Eth2Processor, slot: Slot) =
+  # When another client's already running, this is very likely to detect
+  # potential duplicate validators, which can trigger slashing.
+  #
+  # Every missed attestation costs approximately 3*get_base_reward(), which
+  # can be up to around 10,000 Wei. Thus, skipping attestations isn't cheap
+  # and one should gauge the likelihood of this simultaneous launch to tune
+  # the epoch delay to one's perceived risk.
+  if self.validatorPool[].count() > 0:
+    const duplicateValidatorEpochs = 2
+
+    self.doppelgangerDetection.broadcastStartEpoch =
+      slot.epoch + duplicateValidatorEpochs
+    notice "Setting up doppelganger protection",
+      epoch = slot.epoch,
+      broadcastStartEpoch =
+        self.doppelgangerDetection.broadcastStartEpoch
+
 proc checkForPotentialDoppelganger(
     self: var Eth2Processor, attestation: Attestation,
     attesterIndices: openArray[ValidatorIndex]) =
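
Taken together, the two hunks above form a gate that stays closed until
doppelganger detection is configured: broadcastStartEpoch starts out at
FAR_FUTURE_EPOCH and is only pulled in to "current epoch + 2" once validators
are attached. A small stand-alone Nim sketch of that idea (type and proc names
here are illustrative, not the Eth2Processor API):

  type
    Epoch = uint64
    DoppelgangerGate = object
      broadcastStartEpoch: Epoch

  const
    FAR_FUTURE_EPOCH = Epoch(high(uint64))
    duplicateValidatorEpochs = Epoch(2)

  proc init(T: type DoppelgangerGate): T =
    # Until detection is explicitly set up, never allow broadcasting.
    T(broadcastStartEpoch: FAR_FUTURE_EPOCH)

  proc setup(gate: var DoppelgangerGate, currentEpoch: Epoch) =
    # Observe the network for a couple of epochs before gossiping.
    gate.broadcastStartEpoch = currentEpoch + duplicateValidatorEpochs

  proc mayBroadcast(gate: DoppelgangerGate, currentEpoch: Epoch): bool =
    currentEpoch >= gate.broadcastStartEpoch

  when isMainModule:
    var gate = DoppelgangerGate.init()
    doAssert not gate.mayBroadcast(100)  # blocked until setup runs
    gate.setup(100)
    doAssert not gate.mayBroadcast(101)  # still watching for duplicates
    doAssert gate.mayBroadcast(102)      # two epochs later, gossip is allowed

In the real processor the setup only runs when validatorPool[].count() > 0, so
a node without attached validators never arms the gate.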

View File

@@ -665,23 +665,6 @@ proc removeAltairMessageHandlers(node: BeaconNode, forkDigest: ForkDigest) =
   node.network.unsubscribe(
     getSyncCommitteeContributionAndProofTopic(forkDigest))

-proc setupDoppelgangerDetection(node: BeaconNode, slot: Slot) =
-  # When another client's already running, this is very likely to detect
-  # potential duplicate validators, which can trigger slashing.
-  #
-  # Every missed attestation costs approximately 3*get_base_reward(), which
-  # can be up to around 10,000 Wei. Thus, skipping attestations isn't cheap
-  # and one should gauge the likelihood of this simultaneous launch to tune
-  # the epoch delay to one's perceived risk.
-  const duplicateValidatorEpochs = 2
-
-  node.processor.doppelgangerDetection.broadcastStartEpoch =
-    slot.epoch + duplicateValidatorEpochs
-  debug "Setting up doppelganger protection",
-    epoch = slot.epoch,
-    broadcastStartEpoch =
-      node.processor.doppelgangerDetection.broadcastStartEpoch
-
 proc trackSyncCommitteeTopics*(node: BeaconNode) =
   # TODO
   discard
@@ -743,7 +726,7 @@ proc updateGossipStatus(node: BeaconNode, slot: Slot) {.async.} =
       headSlot = head.slot,
       headDistance, targetGossipState

-    node.setupDoppelgangerDetection(slot)
+    node.processor[].setupDoppelgangerDetection(slot)

   # Specially when waiting for genesis, we'll already be synced on startup -
   # it might also happen on a sufficiently fast restart
@@ -836,12 +819,8 @@ proc onSlotEnd(node: BeaconNode, slot: Slot) {.async.} =
     node.actionTracker.updateActions(epochRef)

   let
-    nextAttestationSlot = getNextValidatorAction(
-      node.actionTracker.attestingSlots,
-      node.actionTracker.lastCalculatedEpoch, slot)
-    nextProposalSlot = getNextValidatorAction(
-      node.actionTracker.proposingSlots,
-      node.actionTracker.lastCalculatedEpoch, slot)
+    nextAttestationSlot = node.actionTracker.getNextAttestationSlot(slot)
+    nextProposalSlot = node.actionTracker.getNextProposalSlot(slot)
     nextActionWaitTime = saturate(fromNow(
       node.beaconClock, min(nextAttestationSlot, nextProposalSlot)))

View File

@@ -186,6 +186,16 @@ func getNextValidatorAction*(
   FAR_FUTURE_SLOT

+func getNextAttestationSlot*(tracker: ActionTracker, slot: Slot): Slot =
+  getNextValidatorAction(
+    tracker.attestingSlots,
+    tracker.lastCalculatedEpoch, slot)
+
+func getNextProposalSlot*(tracker: ActionTracker, slot: Slot): Slot =
+  getNextValidatorAction(
+    tracker.proposingSlots,
+    tracker.lastCalculatedEpoch, slot)
+
 proc updateActions*(tracker: var ActionTracker, epochRef: EpochRef) =
   # Updates the schedule for upcoming attestation and proposal work
   let

View File

@@ -995,8 +995,15 @@ proc handleValidatorDuties*(node: BeaconNode, lastSlot, slot: Slot) {.async.} =
   # await calls, thus we use a local variable to keep the logic straight here
   var head = node.dag.head
   if not node.isSynced(head):
-    notice "Syncing in progress; skipping validator duties for now",
-      slot, headSlot = head.slot
+    let
+      nextAttestationSlot = node.actionTracker.getNextAttestationSlot(slot)
+      nextProposalSlot = node.actionTracker.getNextProposalSlot(slot)
+    if slot in [nextAttestationSlot, nextProposalSlot]:
+      notice "Syncing in progress; skipping validator duties for now",
+        slot, headSlot = head.slot
+    else:
+      debug "Syncing in progress; skipping validator duties for now",
+        slot, headSlot = head.slot

     # Rewards will be growing though, as we sync..
     updateValidatorMetrics(node)
@@ -1007,12 +1014,24 @@
   # If broadcastStartEpoch is 0, it hasn't had time to initialize yet, which
   # means that it'd be okay not to continue, but it won't gossip regardless.
   if curSlot.epoch <
       node.processor[].doppelgangerDetection.broadcastStartEpoch and
       node.config.doppelgangerDetection:
-    debug "Waiting to gossip out to detect potential duplicate validators",
-      broadcastStartEpoch =
-        node.processor[].doppelgangerDetection.broadcastStartEpoch
+    let
+      nextAttestationSlot = node.actionTracker.getNextAttestationSlot(slot)
+      nextProposalSlot = node.actionTracker.getNextProposalSlot(slot)
+    if slot in [nextAttestationSlot, nextProposalSlot]:
+      notice "Doppelganger detection active - skipping validator duties while observing activity on the network",
+        slot, epoch = slot.epoch, nextAttestationSlot, nextProposalSlot,
+        broadcastStartEpoch =
+          node.processor[].doppelgangerDetection.broadcastStartEpoch
+    else:
+      debug "Doppelganger detection active - skipping validator duties while observing activity on the network",
+        slot, epoch = slot.epoch, nextAttestationSlot, nextProposalSlot,
+        broadcastStartEpoch =
+          node.processor[].doppelgangerDetection.broadcastStartEpoch

     return

   # Start by checking if there's work we should have done in the past that we