Assorted logging improvements (#3237)

* log doppelganger detection when it activates and when it causes missed duties
* less prominent eth1 sync progress
* log in-progress sync at notice only when actually missing duties
* better detail in replay log
* don't log finalization checkpoints - this is quite verbose when syncing and already included in "Slot start"
Jacek Sieka 2022-01-03 22:18:49 +01:00 committed by GitHub
parent 3c63a78c01
commit c4ce59e55b
6 changed files with 104 additions and 53 deletions
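
The common thread in the sync and doppelganger changes below is that "skipping
validator duties" is only worth a notice-level log when the current slot is one
where the validator would actually have attested or proposed; otherwise the
message drops to debug. A minimal, self-contained Nim sketch of that gating
(illustrative names only, not the nimbus-eth2 API):

  type Slot = uint64

  const FAR_FUTURE_SLOT = Slot(high(uint64))

  proc skippedDutiesLogLevel(
      slot, nextAttestationSlot, nextProposalSlot: Slot): string =
    ## "notice" when skipping duties at this slot costs the validator an
    ## attestation or proposal, "debug" otherwise.
    if slot == nextAttestationSlot or slot == nextProposalSlot:
      result = "notice"
    else:
      result = "debug"

  when isMainModule:
    # An attestation is due this slot: the skip is worth surfacing.
    doAssert skippedDutiesLogLevel(10, 10, FAR_FUTURE_SLOT) == "notice"
    # Nothing is scheduled this slot: keep the log quiet.
    doAssert skippedDutiesLogLevel(10, 12, FAR_FUTURE_SLOT) == "debug"

The diffs below express the same test as "slot in [nextAttestationSlot,
nextProposalSlot]", with FAR_FUTURE_SLOT standing in when no duty is scheduled.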

View File

@@ -853,7 +853,9 @@ proc updateStateData*(
   # an earlier state must be loaded since there's no way to undo the slot
   # transitions
-  let startTick = Moment.now()
+  let
+    startTick = Moment.now()
+    current {.used.} = state.blck.atSlot(getStateField(state.data, slot))
   var
     ancestors: seq[BlockRef]
@@ -931,7 +933,7 @@
   if not found:
     debug "UpdateStateData cache miss",
-      bs, stateBlock = state.blck, stateSlot = getStateField(state.data, slot)
+      current = shortLog(current), target = shortLog(bs)

   # Either the state is too new or was created by applying a different block.
   # We'll now resort to loading the state from the database then reapplying
@@ -967,8 +969,9 @@
   # Starting state has been assigned, either from memory or database
   let
     assignTick = Moment.now()
-    startSlot {.used.} = getStateField(state.data, slot) # used in logs below
-    startRoot {.used.} = getStateRoot(state.data)
+    ancestor {.used.} = state.blck.atSlot(getStateField(state.data, slot))
+    ancestorRoot {.used.} = getStateRoot(state.data)
   var info: ForkedEpochInfo
   # Time to replay all the blocks between then and now
   for i in countdown(ancestors.len - 1, 0):
@@ -990,27 +993,47 @@
     assignDur = assignTick - startTick
     replayDur = Moment.now() - assignTick

-  logScope:
-    blocks = ancestors.len
-    slots = getStateField(state.data, slot) - startSlot
-    stateRoot = shortLog(getStateRoot(state.data))
-    stateSlot = getStateField(state.data, slot)
-    startRoot = shortLog(startRoot)
-    startSlot
-    blck = shortLog(bs)
-    found
-    assignDur
-    replayDur
+  # TODO https://github.com/status-im/nim-chronicles/issues/108
   if (assignDur + replayDur) >= 250.millis:
     # This might indicate there's a cache that's not in order or a disk that is
     # too slow - for now, it's here for investigative purposes and the cutoff
     # time might need tuning
-    info "State replayed"
+    info "State replayed",
+      blocks = ancestors.len,
+      slots = getStateField(state.data, slot) - ancestor.slot,
+      current = shortLog(current),
+      ancestor = shortLog(ancestor),
+      target = shortLog(bs),
+      ancestorStateRoot = shortLog(ancestorRoot),
+      targetStateRoot = shortLog(getStateRoot(state.data)),
+      found,
+      assignDur,
+      replayDur
   elif ancestors.len > 0:
-    debug "State replayed"
-  else:
-    trace "State advanced" # Normal case!
+    debug "State replayed",
+      blocks = ancestors.len,
+      slots = getStateField(state.data, slot) - ancestor.slot,
+      current = shortLog(current),
+      ancestor = shortLog(ancestor),
+      target = shortLog(bs),
+      ancestorStateRoot = shortLog(ancestorRoot),
+      targetStateRoot = shortLog(getStateRoot(state.data)),
+      found,
+      assignDur,
+      replayDur
+  else: # Normal case!
+    trace "State advanced",
+      blocks = ancestors.len,
+      slots = getStateField(state.data, slot) - ancestor.slot,
+      current = shortLog(current),
+      ancestor = shortLog(ancestor),
+      target = shortLog(bs),
+      ancestorStateRoot = shortLog(ancestorRoot),
+      targetStateRoot = shortLog(getStateRoot(state.data)),
+      found,
+      assignDur,
+      replayDur

 proc delState(dag: ChainDAGRef, bs: BlockSlot) =
   # Delete state state and mapping for a particular block+slot
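
The replay hunk above keeps a single timing-based escalation in place of the
old logScope: the same fields are logged at info when assignment plus replay
takes 250 ms or more, at debug when any blocks were replayed, and at trace on
the common fast path. A rough, standard-library-only Nim sketch of that
pattern (the real code uses chronos Moment and chronicles; only the 250 ms
cutoff is taken from the diff):

  import std/[monotimes, times]

  proc reportReplay(blocksReplayed: int, assignDur, replayDur: Duration) =
    # Escalate the log level based on how expensive the state update was.
    if assignDur + replayDur >= initDuration(milliseconds = 250):
      echo "info: State replayed (slow), took ", assignDur + replayDur
    elif blocksReplayed > 0:
      echo "debug: State replayed, blocks=", blocksReplayed
    else:
      echo "trace: State advanced"  # normal case: no replay was needed

  when isMainModule:
    let startTick = getMonoTime()
    let assignTick = getMonoTime()  # pretend state assignment happened here
    reportReplay(0, assignTick - startTick, getMonoTime() - assignTick)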
@@ -1340,7 +1363,7 @@ proc updateHead*(
     beacon_current_active_validators.set(number_of_active_validators)

   if finalizedHead != dag.finalizedHead:
-    info "Reached new finalization checkpoint",
+    debug "Reached new finalization checkpoint",
       head = shortLog(dag.headState.blck),
       stateRoot = shortLog(getStateRoot(dag.headState.data)),
       justified = shortLog(getStateField(

View File

@@ -1054,7 +1054,7 @@ proc syncBlockRange(m: Eth1Monitor,
       else:
         discard

-    notice "Eth1 sync progress",
+    info "Eth1 sync progress",
       blockNumber = lastBlock.number,
       depositsProcessed = lastBlock.voteData.deposit_count

View File

@@ -152,7 +152,8 @@ proc new*(T: type Eth2Processor,
   (ref Eth2Processor)(
     doppelGangerDetectionEnabled: doppelGangerDetectionEnabled,
     doppelgangerDetection: DoppelgangerProtection(
-      nodeLaunchSlot: getBeaconTime().slotOrZero),
+      nodeLaunchSlot: getBeaconTime().slotOrZero,
+      broadcastStartEpoch: FAR_FUTURE_EPOCH),
     blockProcessor: blockProcessor,
     validatorMonitor: validatorMonitor,
     dag: dag,
@@ -226,6 +227,25 @@
   v

+proc setupDoppelgangerDetection*(self: var Eth2Processor, slot: Slot) =
+  # When another client's already running, this is very likely to detect
+  # potential duplicate validators, which can trigger slashing.
+  #
+  # Every missed attestation costs approximately 3*get_base_reward(), which
+  # can be up to around 10,000 Wei. Thus, skipping attestations isn't cheap
+  # and one should gauge the likelihood of this simultaneous launch to tune
+  # the epoch delay to one's perceived risk.
+  if self.validatorPool[].count() > 0:
+    const duplicateValidatorEpochs = 2
+
+    self.doppelgangerDetection.broadcastStartEpoch =
+      slot.epoch + duplicateValidatorEpochs
+    notice "Setting up doppelganger protection",
+      epoch = slot.epoch,
+      broadcastStartEpoch =
+        self.doppelgangerDetection.broadcastStartEpoch
+
 proc checkForPotentialDoppelganger(
     self: var Eth2Processor, attestation: Attestation,
     attesterIndices: openArray[ValidatorIndex]) =
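
Taken together, the two hunks above form a gate that stays closed until
doppelganger detection is configured: broadcastStartEpoch starts out at
FAR_FUTURE_EPOCH and is only pulled in to "current epoch + 2" once validators
are attached. A small stand-alone Nim sketch of that idea (type and proc names
here are illustrative, not the Eth2Processor API):

  type
    Epoch = uint64
    DoppelgangerGate = object
      broadcastStartEpoch: Epoch

  const
    FAR_FUTURE_EPOCH = Epoch(high(uint64))
    duplicateValidatorEpochs = Epoch(2)

  proc init(T: type DoppelgangerGate): T =
    # Until detection is explicitly set up, never allow broadcasting.
    T(broadcastStartEpoch: FAR_FUTURE_EPOCH)

  proc setup(gate: var DoppelgangerGate, currentEpoch: Epoch) =
    # Observe the network for a couple of epochs before gossiping.
    gate.broadcastStartEpoch = currentEpoch + duplicateValidatorEpochs

  proc mayBroadcast(gate: DoppelgangerGate, currentEpoch: Epoch): bool =
    currentEpoch >= gate.broadcastStartEpoch

  when isMainModule:
    var gate = DoppelgangerGate.init()
    doAssert not gate.mayBroadcast(100)  # blocked until setup runs
    gate.setup(100)
    doAssert not gate.mayBroadcast(101)  # still watching for duplicates
    doAssert gate.mayBroadcast(102)      # two epochs later, gossip is allowed

In the real processor the setup only runs when validatorPool[].count() > 0, so
a node without attached validators never arms the gate.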

View File

@@ -665,23 +665,6 @@ proc removeAltairMessageHandlers(node: BeaconNode, forkDigest: ForkDigest) =
   node.network.unsubscribe(
     getSyncCommitteeContributionAndProofTopic(forkDigest))

-proc setupDoppelgangerDetection(node: BeaconNode, slot: Slot) =
-  # When another client's already running, this is very likely to detect
-  # potential duplicate validators, which can trigger slashing.
-  #
-  # Every missed attestation costs approximately 3*get_base_reward(), which
-  # can be up to around 10,000 Wei. Thus, skipping attestations isn't cheap
-  # and one should gauge the likelihood of this simultaneous launch to tune
-  # the epoch delay to one's perceived risk.
-  const duplicateValidatorEpochs = 2
-
-  node.processor.doppelgangerDetection.broadcastStartEpoch =
-    slot.epoch + duplicateValidatorEpochs
-  debug "Setting up doppelganger protection",
-    epoch = slot.epoch,
-    broadcastStartEpoch =
-      node.processor.doppelgangerDetection.broadcastStartEpoch
-
 proc trackSyncCommitteeTopics*(node: BeaconNode) =
   # TODO
   discard
@@ -743,7 +726,7 @@ proc updateGossipStatus(node: BeaconNode, slot: Slot) {.async.} =
       headSlot = head.slot,
       headDistance, targetGossipState

-    node.setupDoppelgangerDetection(slot)
+    node.processor[].setupDoppelgangerDetection(slot)

   # Specially when waiting for genesis, we'll already be synced on startup -
   # it might also happen on a sufficiently fast restart
@@ -836,12 +819,8 @@ proc onSlotEnd(node: BeaconNode, slot: Slot) {.async.} =
     node.actionTracker.updateActions(epochRef)

   let
-    nextAttestationSlot = getNextValidatorAction(
-      node.actionTracker.attestingSlots,
-      node.actionTracker.lastCalculatedEpoch, slot)
-    nextProposalSlot = getNextValidatorAction(
-      node.actionTracker.proposingSlots,
-      node.actionTracker.lastCalculatedEpoch, slot)
+    nextAttestationSlot = node.actionTracker.getNextAttestationSlot(slot)
+    nextProposalSlot = node.actionTracker.getNextProposalSlot(slot)
     nextActionWaitTime = saturate(fromNow(
       node.beaconClock, min(nextAttestationSlot, nextProposalSlot)))

View File

@@ -186,6 +186,16 @@ func getNextValidatorAction*(
   FAR_FUTURE_SLOT

+func getNextAttestationSlot*(tracker: ActionTracker, slot: Slot): Slot =
+  getNextValidatorAction(
+    tracker.attestingSlots,
+    tracker.lastCalculatedEpoch, slot)
+
+func getNextProposalSlot*(tracker: ActionTracker, slot: Slot): Slot =
+  getNextValidatorAction(
+    tracker.proposingSlots,
+    tracker.lastCalculatedEpoch, slot)
+
 proc updateActions*(tracker: var ActionTracker, epochRef: EpochRef) =
   # Updates the schedule for upcoming attestation and proposal work
   let

View File

@@ -995,8 +995,15 @@ proc handleValidatorDuties*(node: BeaconNode, lastSlot, slot: Slot) {.async.} =
   # await calls, thus we use a local variable to keep the logic straight here
   var head = node.dag.head
   if not node.isSynced(head):
-    notice "Syncing in progress; skipping validator duties for now",
-      slot, headSlot = head.slot
+    let
+      nextAttestationSlot = node.actionTracker.getNextAttestationSlot(slot)
+      nextProposalSlot = node.actionTracker.getNextProposalSlot(slot)
+    if slot in [nextAttestationSlot, nextProposalSlot]:
+      notice "Syncing in progress; skipping validator duties for now",
+        slot, headSlot = head.slot
+    else:
+      debug "Syncing in progress; skipping validator duties for now",
+        slot, headSlot = head.slot

     # Rewards will be growing though, as we sync..
     updateValidatorMetrics(node)
@@ -1007,12 +1014,24 @@
   # If broadcastStartEpoch is 0, it hasn't had time to initialize yet, which
   # means that it'd be okay not to continue, but it won't gossip regardless.
   if curSlot.epoch <
       node.processor[].doppelgangerDetection.broadcastStartEpoch and
       node.config.doppelgangerDetection:
-    debug "Waiting to gossip out to detect potential duplicate validators",
-      broadcastStartEpoch =
-        node.processor[].doppelgangerDetection.broadcastStartEpoch
+    let
+      nextAttestationSlot = node.actionTracker.getNextAttestationSlot(slot)
+      nextProposalSlot = node.actionTracker.getNextProposalSlot(slot)
+    if slot in [nextAttestationSlot, nextProposalSlot]:
+      notice "Doppelganger detection active - skipping validator duties while observing activity on the network",
+        slot, epoch = slot.epoch, nextAttestationSlot, nextProposalSlot,
+        broadcastStartEpoch =
+          node.processor[].doppelgangerDetection.broadcastStartEpoch
+    else:
+      debug "Doppelganger detection active - skipping validator duties while observing activity on the network",
+        slot, epoch = slot.epoch, nextAttestationSlot, nextProposalSlot,
+        broadcastStartEpoch =
+          node.processor[].doppelgangerDetection.broadcastStartEpoch

     return

   # Start by checking if there's work we should have done in the past that we