Assorted logging improvements (#3237)

* log doppelganger detection when it activates and when it causes missed duties
* less prominent eth1 sync progress
* log in-progress sync at notice only when actually missing duties
* better detail in replay log
* don't log finalization checkpoints - this is quite verbose when syncing and already included in "Slot start"
2025-02-20 02:08:12 +00:00 · 2022-01-03 22:18:49 +01:00 · 2022-01-03 22:18:49 +01:00 · c4ce59e55b
commit c4ce59e55b
parent 3c63a78c01
6 changed files with 104 additions and 53 deletions
--- a/beacon_chain/consensus_object_pools/blockchain_dag.nim
+++ b/beacon_chain/consensus_object_pools/blockchain_dag.nim
@ -853,7 +853,9 @@ proc updateStateData*(
  # an earlier state must be loaded since there's no way to undo the slot
  # transitions
-  let startTick = Moment.now()
+  let
    startTick = Moment.now()
    current {.used.} = state.blck.atSlot(getStateField(state.data, slot))
  var
    ancestors: seq[BlockRef]
@ -931,7 +933,7 @@ proc updateStateData*(
  if not found:
    debug "UpdateStateData cache miss",
-      bs, stateBlock = state.blck, stateSlot = getStateField(state.data, slot)
+      current = shortLog(current), target = shortLog(bs)
    # Either the state is too new or was created by applying a different block.
    # We'll now resort to loading the state from the database then reapplying
@ -967,8 +969,9 @@ proc updateStateData*(
  # Starting state has been assigned, either from memory or database
  let
    assignTick = Moment.now()
-    startSlot {.used.} = getStateField(state.data, slot) # used in logs below
+    ancestor {.used.} = state.blck.atSlot(getStateField(state.data, slot))
-    startRoot {.used.} = getStateRoot(state.data)
+    ancestorRoot {.used.} = getStateRoot(state.data)
  var info: ForkedEpochInfo
  # Time to replay all the blocks between then and now
  for i in countdown(ancestors.len - 1, 0):
@ -990,27 +993,47 @@ proc updateStateData*(
    assignDur = assignTick - startTick
    replayDur = Moment.now() - assignTick
-  logScope:
+  # TODO https://github.com/status-im/nim-chronicles/issues/108
    blocks = ancestors.len
    slots = getStateField(state.data, slot) - startSlot
    stateRoot = shortLog(getStateRoot(state.data))
    stateSlot = getStateField(state.data, slot)
    startRoot = shortLog(startRoot)
    startSlot
    blck = shortLog(bs)
    found
    assignDur
    replayDur
  if (assignDur + replayDur) >= 250.millis:
    # This might indicate there's a cache that's not in order or a disk that is
    # too slow - for now, it's here for investigative purposes and the cutoff
    # time might need tuning
-    info "State replayed"
+    info "State replayed",
      blocks = ancestors.len,
      slots = getStateField(state.data, slot) - ancestor.slot,
      current = shortLog(current),
      ancestor = shortLog(ancestor),
      target = shortLog(bs),
      ancestorStateRoot = shortLog(ancestorRoot),
      targetStateRoot = shortLog(getStateRoot(state.data)),
      found,
      assignDur,
      replayDur
  elif ancestors.len > 0:
-    debug "State replayed"
+    debug "State replayed",
-  else:
+      blocks = ancestors.len,
-    trace "State advanced" # Normal case!
+      slots = getStateField(state.data, slot) - ancestor.slot,
      current = shortLog(current),
      ancestor = shortLog(ancestor),
      target = shortLog(bs),
      ancestorStateRoot = shortLog(ancestorRoot),
      targetStateRoot = shortLog(getStateRoot(state.data)),
      found,
      assignDur,
      replayDur
  else: # Normal case!
    trace "State advanced",
      blocks = ancestors.len,
      slots = getStateField(state.data, slot) - ancestor.slot,
      current = shortLog(current),
      ancestor = shortLog(ancestor),
      target = shortLog(bs),
      ancestorStateRoot = shortLog(ancestorRoot),
      targetStateRoot = shortLog(getStateRoot(state.data)),
      found,
      assignDur,
      replayDur
 proc delState(dag: ChainDAGRef, bs: BlockSlot) =
  # Delete state state and mapping for a particular block+slot
@ -1340,7 +1363,7 @@ proc updateHead*(
    beacon_current_active_validators.set(number_of_active_validators)
  if finalizedHead != dag.finalizedHead:
-    info "Reached new finalization checkpoint",
+    debug "Reached new finalization checkpoint",
      head = shortLog(dag.headState.blck),
      stateRoot = shortLog(getStateRoot(dag.headState.data)),
      justified = shortLog(getStateField(
--- a/beacon_chain/eth1/eth1_monitor.nim
+++ b/beacon_chain/eth1/eth1_monitor.nim
@ -1054,7 +1054,7 @@ proc syncBlockRange(m: Eth1Monitor,
      else:
        discard
-      notice "Eth1 sync progress",
+      info "Eth1 sync progress",
        blockNumber = lastBlock.number,
        depositsProcessed = lastBlock.voteData.deposit_count
--- a/beacon_chain/gossip_processing/eth2_processor.nim
+++ b/beacon_chain/gossip_processing/eth2_processor.nim
@ -152,7 +152,8 @@ proc new*(T: type Eth2Processor,
  (ref Eth2Processor)(
    doppelGangerDetectionEnabled: doppelGangerDetectionEnabled,
    doppelgangerDetection: DoppelgangerProtection(
-      nodeLaunchSlot: getBeaconTime().slotOrZero),
+      nodeLaunchSlot: getBeaconTime().slotOrZero,
      broadcastStartEpoch: FAR_FUTURE_EPOCH),
    blockProcessor: blockProcessor,
    validatorMonitor: validatorMonitor,
    dag: dag,
@ -226,6 +227,25 @@ proc blockValidator*(
  v
 proc setupDoppelgangerDetection*(self: var Eth2Processor, slot: Slot) =
  ## Pushes `doppelgangerDetection.broadcastStartEpoch` to
  ## `duplicateValidatorEpochs` epochs past the epoch of `slot` and logs the
  ## new setting. Nothing happens when the validator pool count is zero.
  ##
  ## When another client's already running, this is very likely to detect
  ## potential duplicate validators, which can trigger slashing.
  ##
  ## The delay is not free: every missed attestation costs approximately
  ## 3*get_base_reward(), which can be up to around 10,000 Wei, so the epoch
  ## delay should be tuned to the perceived risk of a simultaneous launch.
  if self.validatorPool[].count() <= 0:
    # Validator pool count is zero - leave the detection state untouched
    return

  const duplicateValidatorEpochs = 2
  let currentEpoch = slot.epoch
  self.doppelgangerDetection.broadcastStartEpoch =
    currentEpoch + duplicateValidatorEpochs

  notice "Setting up doppelganger protection",
    epoch = currentEpoch,
    broadcastStartEpoch =
      self.doppelgangerDetection.broadcastStartEpoch
 proc checkForPotentialDoppelganger(
    self: var Eth2Processor, attestation: Attestation,
    attesterIndices: openArray[ValidatorIndex]) =
--- a/beacon_chain/nimbus_beacon_node.nim
+++ b/beacon_chain/nimbus_beacon_node.nim
@ -665,23 +665,6 @@ proc removeAltairMessageHandlers(node: BeaconNode, forkDigest: ForkDigest) =
  node.network.unsubscribe(
    getSyncCommitteeContributionAndProofTopic(forkDigest))
 proc setupDoppelgangerDetection(node: BeaconNode, slot: Slot) =
  ## Pushes the processor's `doppelgangerDetection.broadcastStartEpoch` to
  ## `duplicateValidatorEpochs` epochs past the epoch of `slot` and logs the
  ## new setting at debug level.
  ##
  ## When another client's already running, this is very likely to detect
  ## potential duplicate validators, which can trigger slashing.
  ##
  ## The delay is not free: every missed attestation costs approximately
  ## 3*get_base_reward(), which can be up to around 10,000 Wei, so the epoch
  ## delay should be tuned to the perceived risk of a simultaneous launch.
  const duplicateValidatorEpochs = 2
  let currentEpoch = slot.epoch

  node.processor.doppelgangerDetection.broadcastStartEpoch =
    currentEpoch + duplicateValidatorEpochs

  debug "Setting up doppelganger protection",
    epoch = currentEpoch,
    broadcastStartEpoch =
      node.processor.doppelgangerDetection.broadcastStartEpoch
 proc trackSyncCommitteeTopics*(node: BeaconNode) =
  # TODO
  discard
@ -743,7 +726,7 @@ proc updateGossipStatus(node: BeaconNode, slot: Slot) {.async.} =
      headSlot = head.slot,
      headDistance, targetGossipState
-    node.setupDoppelgangerDetection(slot)
+    node.processor[].setupDoppelgangerDetection(slot)
    # Specially when waiting for genesis, we'll already be synced on startup -
    # it might also happen on a sufficiently fast restart
@ -836,12 +819,8 @@ proc onSlotEnd(node: BeaconNode, slot: Slot) {.async.} =
    node.actionTracker.updateActions(epochRef)
  let
-    nextAttestationSlot = getNextValidatorAction(
+    nextAttestationSlot = node.actionTracker.getNextAttestationSlot(slot)
-      node.actionTracker.attestingSlots,
+    nextProposalSlot = node.actionTracker.getNextProposalSlot(slot)
      node.actionTracker.lastCalculatedEpoch, slot)
    nextProposalSlot = getNextValidatorAction(
      node.actionTracker.proposingSlots,
      node.actionTracker.lastCalculatedEpoch, slot)
    nextActionWaitTime = saturate(fromNow(
      node.beaconClock, min(nextAttestationSlot, nextProposalSlot)))
--- a/beacon_chain/validators/action_tracker.nim
+++ b/beacon_chain/validators/action_tracker.nim
@ -186,6 +186,16 @@ func getNextValidatorAction*(
  FAR_FUTURE_SLOT
 func getNextAttestationSlot*(tracker: ActionTracker, slot: Slot): Slot =
  ## Convenience wrapper: forwards the tracker's `attestingSlots` and
  ## `lastCalculatedEpoch`, together with `slot`, to `getNextValidatorAction`
  ## and returns its result unchanged.
  tracker.attestingSlots.getNextValidatorAction(
    tracker.lastCalculatedEpoch, slot)
 func getNextProposalSlot*(tracker: ActionTracker, slot: Slot): Slot =
  ## Convenience wrapper: forwards the tracker's `proposingSlots` and
  ## `lastCalculatedEpoch`, together with `slot`, to `getNextValidatorAction`
  ## and returns its result unchanged.
  tracker.proposingSlots.getNextValidatorAction(
    tracker.lastCalculatedEpoch, slot)
 proc updateActions*(tracker: var ActionTracker, epochRef: EpochRef) =
  # Updates the schedule for upcoming attestation and proposal work
  let
--- a/beacon_chain/validators/validator_duties.nim
+++ b/beacon_chain/validators/validator_duties.nim
@ -995,8 +995,15 @@ proc handleValidatorDuties*(node: BeaconNode, lastSlot, slot: Slot) {.async.} =
  # await calls, thus we use a local variable to keep the logic straight here
  var head = node.dag.head
  if not node.isSynced(head):
-    notice "Syncing in progress; skipping validator duties for now",
+    let
-      slot, headSlot = head.slot
+      nextAttestationSlot = node.actionTracker.getNextAttestationSlot(slot)
      nextProposalSlot = node.actionTracker.getNextProposalSlot(slot)
    if slot in [nextAttestationSlot, nextProposalSlot]:
      notice "Syncing in progress; skipping validator duties for now",
        slot, headSlot = head.slot
    else:
      debug "Syncing in progress; skipping validator duties for now",
        slot, headSlot = head.slot
    # Rewards will be growing though, as we sync..
    updateValidatorMetrics(node)
@ -1007,12 +1014,24 @@ proc handleValidatorDuties*(node: BeaconNode, lastSlot, slot: Slot) {.async.} =
  # If broadcastStartEpoch is 0, it hasn't had time to initialize yet, which
  # means that it'd be okay not to continue, but it won't gossip regardless.
-  if  curSlot.epoch <
+  if curSlot.epoch <
        node.processor[].doppelgangerDetection.broadcastStartEpoch and
      node.config.doppelgangerDetection:
-    debug "Waiting to gossip out to detect potential duplicate validators",
+    let
-      broadcastStartEpoch =
+      nextAttestationSlot = node.actionTracker.getNextAttestationSlot(slot)
-        node.processor[].doppelgangerDetection.broadcastStartEpoch
+      nextProposalSlot = node.actionTracker.getNextProposalSlot(slot)
    if slot in [nextAttestationSlot, nextProposalSlot]:
      notice "Doppelganger detection active - skipping validator duties while observing activity on the network",
        slot, epoch = slot.epoch, nextAttestationSlot, nextProposalSlot,
        broadcastStartEpoch =
          node.processor[].doppelgangerDetection.broadcastStartEpoch
    else:
      debug "Doppelganger detection active - skipping validator duties while observing activity on the network",
        slot, epoch = slot.epoch, nextAttestationSlot, nextProposalSlot,
        broadcastStartEpoch =
          node.processor[].doppelgangerDetection.broadcastStartEpoch
    return
  # Start by checking if there's work we should have done in the past that we