Beacon sync mitigate deadlock with bogus sticky peers (#2943)

* Metrics cosmetics * Better naming for error threshold constants * Treating header/body process error different from response errors why: Error handling becomes active not until some consecutive failures appear. As both types of errors may interleave (i.g. no response errors) the counter reset for one type might affect the other. By doing it wrong, a peer might send repeatedly a bogus block so locking in the syncer in an endless loop.
2025-03-01 04:10:45 +00:00 · 2024-12-16 16:26:38 +00:00 · 2024-12-16 16:26:38 +00:00 · 0ce5234231
commit 0ce5234231
parent 650fec5a26
7 changed files with 82 additions and 37 deletions
--- a/nimbus/sync/beacon/worker/blocks_staged.nim
+++ b/nimbus/sync/beacon/worker/blocks_staged.nim
@ -173,6 +173,9 @@ proc blocksStagedCollect*(
    # so that `async` can capture that properly.
    blk = (ref BlocksForImport)()
    # Flag, not to reset error count
    haveError = false
  # nFetchBodiesRequest
  while true:
    # Extract bottom range interval and fetch/stage it
@ -196,14 +199,20 @@ proc blocksStagedCollect*(
      # Throw away first time block fetch data. Keep other data for a
      # partially assembled list.
      if nBlkBlocks == 0:
-        buddy.only.nBdyRespErrors.inc
+        buddy.only.nBdyProcErrors.inc
        haveError = true
-        if (1 < buddy.only.nBdyRespErrors and buddy.ctrl.stopped) or
+        if ((0 < buddy.only.nBdyRespErrors or
-           fetchBodiesReqThresholdCount < buddy.only.nBdyRespErrors:
+             0 < buddy.only.nBdyProcErrors) and buddy.ctrl.stopped) or
           fetchBodiesReqErrThresholdCount < buddy.only.nBdyRespErrors or
           fetchBodiesProcessErrThresholdCount < buddy.only.nBdyProcErrors:
          # Make sure that this peer does not immediately reconnect
          buddy.ctrl.zombie = true
        trace info & ": current block list discarded", peer, iv, ivReq,
-          ctrl=buddy.ctrl.state, nRespErrors=buddy.only.nBdyRespErrors
+          nStaged=ctx.blk.staged.len, ctrl=buddy.ctrl.state,
          bdyErrors=buddy.bdyErrors
        ctx.blocksUnprocCommit(iv.len, iv)
        # At this stage allow a task switch so that some other peer might try
        # to work on the currently returned interval.
@ -239,8 +248,16 @@ proc blocksStagedCollect*(
    raiseAssert info & ": duplicate key on staged queue iv=" & $iv
  qItem.data = blk[]
  # Reset block process errors (not too many consecutive failures this time)
  if not haveError:
    buddy.only.nBdyProcErrors = 0
  trace info & ": staged blocks", peer, bottomBlock=iv.minPt.bnStr,
-    nBlocks=blk.blocks.len, nStaged=ctx.blk.staged.len, ctrl=buddy.ctrl.state
+    nBlocks=blk.blocks.len, nStaged=ctx.blk.staged.len, ctrl=buddy.ctrl.state,
    bdyErrors=buddy.bdyErrors
  # Update, so it can be followed nicely
  ctx.updateMetrics()
  return true
--- a/nimbus/sync/beacon/worker/blocks_staged/bodies.nim
+++ b/nimbus/sync/beacon/worker/blocks_staged/bodies.nim
@ -22,9 +22,12 @@ import
 # Public functions
 # ------------------------------------------------------------------------------
 func bdyErrors*(buddy: BeaconBuddyRef): string =
  $buddy.only.nBdyRespErrors & "/" & $buddy.only.nBdyProcErrors
 proc fetchRegisterError*(buddy: BeaconBuddyRef) =
  buddy.only.nBdyRespErrors.inc
-  if fetchBodiesReqThresholdCount < buddy.only.nBdyRespErrors:
+  if fetchBodiesReqErrThresholdCount < buddy.only.nBdyRespErrors:
    buddy.ctrl.zombie = true # abandon slow peer
 proc bodiesFetch*(
@ -39,8 +42,7 @@ proc bodiesFetch*(
    start = Moment.now()
    nReq = blockHashes.len
-  trace trEthSendSendingGetBlockBodies, peer, nReq,
+  trace trEthSendSendingGetBlockBodies, peer, nReq, bdyErrors=buddy.bdyErrors
    nRespErrors=buddy.only.nBdyRespErrors
  var resp: Option[blockBodiesObj]
  try:
@ -48,7 +50,7 @@ proc bodiesFetch*(
  except CatchableError as e:
    buddy.fetchRegisterError()
    `info` info & " error", peer, nReq, elapsed=(Moment.now() - start).toStr,
-      error=($e.name), msg=e.msg, nRespErrors=buddy.only.nBdyRespErrors
+      error=($e.name), msg=e.msg, bdyErrors=buddy.bdyErrors
    return err()
  let elapsed = Moment.now() - start
@ -57,8 +59,7 @@ proc bodiesFetch*(
  if resp.isNone or buddy.ctrl.stopped:
    buddy.fetchRegisterError()
    trace trEthRecvReceivedBlockBodies, peer, nReq, nResp=0,
-      elapsed=elapsed.toStr, ctrl=buddy.ctrl.state,
+      elapsed=elapsed.toStr, ctrl=buddy.ctrl.state, bdyErrors=buddy.bdyErrors
      nRespErrors=buddy.only.nBdyRespErrors
    return err()
  let b: seq[BlockBody] = resp.get.blocks
@ -71,15 +72,14 @@ proc bodiesFetch*(
  # Ban an overly slow peer for a while when seen in a row. Also there is a
  # mimimum share of the number of requested headers expected, typically 10%.
-  if fetchBodiesReqThresholdZombie < elapsed or
+  if fetchBodiesReqErrThresholdZombie < elapsed or
     b.len.uint64 * 100 < nReq.uint64 * fetchBodiesReqMinResponsePC:
    buddy.fetchRegisterError()
  else:
    buddy.only.nBdyRespErrors = 0 # reset error count
  trace trEthRecvReceivedBlockBodies, peer, nReq, nResp=b.len,
-    elapsed=elapsed.toStr, ctrl=buddy.ctrl.state,
+    elapsed=elapsed.toStr, ctrl=buddy.ctrl.state, bdyErrors=buddy.bdyErrors
      nRespErrors=buddy.only.nBdyRespErrors
  return ok(b)
--- a/nimbus/sync/beacon/worker/headers_staged.nim
+++ b/nimbus/sync/beacon/worker/headers_staged.nim
@ -132,6 +132,9 @@ proc headersStagedCollect*(
    # so that `async` can capture that properly.
    lhc = (ref LinkedHChain)(parentHash: topLink)
    # Flag, not to reset error count
    haveError = false
  while true:
    # Extract top range interval and fetch/stage it
    let
@ -151,15 +154,17 @@ proc headersStagedCollect*(
      # Throw away opportunistic data (or first time header fetch.) Keep
      # other data for a partially assembled list.
      if isOpportunistic or nLhcHeaders == 0:
-        buddy.only.nHdrRespErrors.inc
+        buddy.only.nHdrProcErrors.inc
        haveError = true
-        if (0 < buddy.only.nHdrRespErrors and buddy.ctrl.stopped) or
+        if ((0 < buddy.only.nHdrRespErrors or
-           fetchHeadersReqThresholdCount < buddy.only.nHdrRespErrors:
+             0 < buddy.only.nHdrProcErrors) and buddy.ctrl.stopped) or
           fetchHeadersReqErrThresholdCount < buddy.only.nHdrRespErrors or
           fetchHeadersProcessErrThresholdCount < buddy.only.nHdrProcErrors:
          # Make sure that this peer does not immediately reconnect
          buddy.ctrl.zombie = true
        trace info & ": current header list discarded", peer, iv, ivReq,
-          isOpportunistic,
+          isOpportunistic, ctrl=buddy.ctrl.state, hdrErrors=buddy.hdrErrors
          ctrl=buddy.ctrl.state, nRespErrors=buddy.only.nHdrRespErrors
        ctx.headersUnprocCommit(iv.len, iv)
        # At this stage allow a task switch so that some other peer might try
        # to work on the currently returned interval.
@ -195,9 +200,13 @@ proc headersStagedCollect*(
    raiseAssert info & ": duplicate key on staged queue iv=" & $iv
  qItem.data = lhc[]
-  trace info & ": staged a list of headers", peer,
+  # Reset header process errors (not too many consecutive failures this time)
-    topBlock=iv.maxPt.bnStr, nHeaders=lhc.revHdrs.len,
+  if not haveError:
-    nStaged=ctx.hdr.staged.len, isOpportunistic, ctrl=buddy.ctrl.state
+    buddy.only.nHdrProcErrors = 0
  trace info & ": staged a list of headers", peer, topBlock=iv.maxPt.bnStr,
    nHeaders=lhc.revHdrs.len, nStaged=ctx.hdr.staged.len, isOpportunistic,
    ctrl=buddy.ctrl.state, hdrErrors=buddy.hdrErrors
  return true
--- a/nimbus/sync/beacon/worker/headers_staged/headers.nim
+++ b/nimbus/sync/beacon/worker/headers_staged/headers.nim
@ -25,9 +25,17 @@ import
 proc registerError(buddy: BeaconBuddyRef) =
  buddy.only.nHdrRespErrors.inc
-  if fetchHeadersReqThresholdCount < buddy.only.nHdrRespErrors:
+  if fetchHeadersReqErrThresholdCount < buddy.only.nHdrRespErrors:
    buddy.ctrl.zombie = true # abandon slow peer
 # ------------------------------------------------------------------------------
 # Public debugging & logging helpers
 # ------------------------------------------------------------------------------
 func hdrErrors*(buddy: BeaconBuddyRef): string =
  $buddy.only.nHdrRespErrors & "/" & $buddy.only.nHdrProcErrors
 # ------------------------------------------------------------------------------
 # Public functions
 # ------------------------------------------------------------------------------
@ -63,7 +71,7 @@ proc headersFetchReversed*(
    start = Moment.now()
  trace trEthSendSendingGetBlockHeaders & " reverse", peer, ivReq,
-    nReq=req.maxResults, useHash, nRespErrors=buddy.only.nHdrRespErrors
+    nReq=req.maxResults, useHash, hdrErrors=buddy.hdrErrors
  # Fetch headers from peer
  var resp: Option[blockHeadersObj]
@ -78,7 +86,7 @@ proc headersFetchReversed*(
    buddy.registerError()
    `info` info & " error", peer, ivReq, nReq=req.maxResults, useHash,
      elapsed=(Moment.now() - start).toStr, error=($e.name), msg=e.msg,
-      nRespErrors=buddy.only.nHdrRespErrors
+      hdrErrors=buddy.hdrErrors
    return err()
  let elapsed = Moment.now() - start
@ -88,7 +96,7 @@ proc headersFetchReversed*(
    buddy.registerError()
    trace trEthRecvReceivedBlockHeaders, peer, nReq=req.maxResults, useHash,
      nResp=0, elapsed=elapsed.toStr, ctrl=buddy.ctrl.state,
-      nRespErrors=buddy.only.nHdrRespErrors
+      hdrErrors=buddy.hdrErrors
    return err()
  let h: seq[Header] = resp.get.headers
@ -96,12 +104,12 @@ proc headersFetchReversed*(
    buddy.registerError()
    trace trEthRecvReceivedBlockHeaders, peer, nReq=req.maxResults, useHash,
      nResp=h.len, elapsed=elapsed.toStr, ctrl=buddy.ctrl.state,
-      nRespErrors=buddy.only.nHdrRespErrors
+      hdrErrors=buddy.hdrErrors
    return err()
  # Ban an overly slow peer for a while when seen in a row. Also there is a
  # mimimum share of the number of requested headers expected, typically 10%.
-  if fetchHeadersReqThresholdZombie < elapsed or
+  if fetchHeadersReqErrThresholdZombie < elapsed or
     h.len.uint64 * 100 < req.maxResults * fetchHeadersReqMinResponsePC:
    buddy.registerError()
  else:
@ -109,8 +117,7 @@ proc headersFetchReversed*(
  trace trEthRecvReceivedBlockHeaders, peer, nReq=req.maxResults, useHash,
    ivResp=BnRange.new(h[^1].number,h[0].number), nResp=h.len,
-    elapsed=elapsed.toStr, ctrl=buddy.ctrl.state,
+    elapsed=elapsed.toStr, ctrl=buddy.ctrl.state, hdrErrors=buddy.hdrErrors
    nRespErrors=buddy.only.nHdrRespErrors
  return ok(h)
--- a/nimbus/sync/beacon/worker/update/metrics.nim
+++ b/nimbus/sync/beacon/worker/update/metrics.nim
@ -60,7 +60,10 @@ template updateMetricsImpl(ctx: BeaconCtxRef) =
  metrics.set(beacon_coupler, ctx.layout.coupler.int64)
  metrics.set(beacon_dangling, ctx.layout.dangling.int64)
  metrics.set(beacon_head, ctx.layout.head.int64)
-  metrics.set(beacon_target, ctx.target.consHead.number.int64)
+
  # Show last valid state.
  if 0 < ctx.target.consHead.number:
    metrics.set(beacon_target, ctx.target.consHead.number.int64)
  metrics.set(beacon_header_lists_staged, ctx.headersStagedQueueLen())
  metrics.set(beacon_headers_unprocessed,
--- a/nimbus/sync/beacon/worker_config.nim
+++ b/nimbus/sync/beacon/worker_config.nim
@ -54,12 +54,16 @@ const
    ## On `Geth`, responses to larger requests are all truncted to 1024 header
    ## entries (see `Geth` constant `maxHeadersServe`.)
-  fetchHeadersReqThresholdZombie* = chronos.seconds(2)
+  fetchHeadersReqErrThresholdZombie* = chronos.seconds(2)
-  fetchHeadersReqThresholdCount* = 3
+  fetchHeadersReqErrThresholdCount* = 3
    ## Response time allowance. If the response time for the set of headers
    ## exceeds this threshold for more than `fetchHeadersReqThresholdCount`
    ## times in a row, then this peer will be banned for a while.
  fetchHeadersProcessErrThresholdCount* = 3
    ## Similar to `fetchHeadersReqErrThresholdCount` but for the later part
    ## when errors occur while block headers are queued and further processed.
  fetchHeadersReqMinResponsePC* = 10
    ## Some peers only returned one header at a time. If these peers sit on a
    ## farm, they might collectively slow down the download process. So this
@ -92,10 +96,13 @@ const
  nFetchBodiesRequest* = 128
    ## Similar to `nFetchHeadersRequest`
-  fetchBodiesReqThresholdZombie* = chronos.seconds(4)
+  fetchBodiesReqErrThresholdZombie* = chronos.seconds(4)
-  fetchBodiesReqThresholdCount* = 5
+  fetchBodiesReqErrThresholdCount* = 3
    ## Similar to `fetchHeadersReqThreshold*`
  fetchBodiesProcessErrThresholdCount* = 3
    ## Similar to `fetchHeadersProcessErrThresholdCount`.
  fetchBodiesReqMinResponsePC* = 10
    ## Similar to `fetchHeadersReqMinResponsePC`
--- a/nimbus/sync/beacon/worker_desc.nim
+++ b/nimbus/sync/beacon/worker_desc.nim
@ -126,8 +126,10 @@ type
  BeaconBuddyData* = object
    ## Local descriptor data extension
-    nHdrRespErrors*: int             ## Number of errors/slow responses in a row
+    nHdrRespErrors*: uint8           ## Number of errors/slow responses in a row
-    nBdyRespErrors*: int             ## Ditto for bodies
+    nHdrProcErrors*: uint8           ## Number of post processing errors
    nBdyRespErrors*: uint8           ## Ditto for bodies
    nBdyProcErrors*: uint8
    # Debugging and logging.
    nMultiLoop*: int                 ## Number of runs