From 81690e04469ed83611004920f552bc026d7c529a Mon Sep 17 00:00:00 2001
From: Jordan Hrycaj
Date: Fri, 22 Nov 2024 13:23:53 +0000
Subject: [PATCH] Beacon sync fix overlapping block list import (#2866)

* Ignore `FC` overlapping blocks and the ones <= `base`

why:
  Due to `importBlock()` running concurrently via `newPayload` RPC
  requests, the `FC` module layout might differ when re-visited for
  importing blocks.

* Update logging and docu

details:
  Reduce some logging noise

  Clarify activating/suspending the syncer in log messages
---
 nimbus/sync/beacon/README.md                | 10 +++++++---
 nimbus/sync/beacon/worker/blocks_staged.nim | 17 +++++++++++++++--
 nimbus/sync/beacon/worker/db.nim            |  4 ++--
 nimbus/sync/beacon/worker/start_stop.nim    |  2 --
 nimbus/sync/beacon/worker/update.nim        |  7 +++----
 5 files changed, 27 insertions(+), 13 deletions(-)

diff --git a/nimbus/sync/beacon/README.md b/nimbus/sync/beacon/README.md
index e6b1780cd..6dbcab5eb 100644
--- a/nimbus/sync/beacon/README.md
+++ b/nimbus/sync/beacon/README.md
@@ -66,8 +66,8 @@ Implementation, The Gory Details
 The following diagram depicts a most general state view of the sync and the
 *FC* modules at a given point of time
 
-      0                    L (5)
-      o--------------------o
+      0         B          L (5)
+      o---------o----------o
       | <--- imported ---> |
      C                     D                H
      o---------------------o----------------o
@@ -75,8 +75,12 @@ The following diagram depicts a most general state view of the
 
 where
 
+* *B* -- **base**, current value of this entity (with the same name) of the
+  **FC** module (i.e. the current value when looked up.)
+
 * *C* -- coupler, parent of the left endpoint of the chain of headers or blocks
-  to be fetched and imported.
+  to be fetched and imported. The block number of *C* is somewhere
+  between the ones of *B* and *L* inclusive.
 
 * *L* -- **latest**, current value of this entity (with the same name) of the
   **FC** module (i.e. the current value when looked up.)  *L* need not
diff --git a/nimbus/sync/beacon/worker/blocks_staged.nim b/nimbus/sync/beacon/worker/blocks_staged.nim
index 64520ab76..c2152b710 100644
--- a/nimbus/sync/beacon/worker/blocks_staged.nim
+++ b/nimbus/sync/beacon/worker/blocks_staged.nim
@@ -255,11 +255,10 @@ proc blocksStagedImport*(
   # Fetch least record, accept only if it matches the global ledger state
   block:
     let imported = ctx.chain.latestNumber()
-    if qItem.key != imported + 1:
+    if imported + 1 < qItem.key:
       trace info & ": there is a gap L vs. staged",
         B=ctx.chain.baseNumber.bnStr, L=imported.bnStr, staged=qItem.key.bnStr,
         C=ctx.layout.coupler.bnStr
-      doAssert imported < qItem.key
       return false
 
   # Remove from queue
@@ -275,7 +274,21 @@
   var maxImport = iv.maxPt
   block importLoop:
     for n in 0 ..< nBlocks:
+      # It is known that `key <= imported + 1`. This means that some blocks
+      # potentially overlap with what is already known by `FC` (e.g. due to
+      # concurrently running `importBlock()` by `newPayload` RPC requests.)
+      #
+      # Ignoring such a block cannot be left to `FC`. Passing a block before
+      # the `base` (which also might have changed) is answered by `FC` with
+      # an error. This would cause throwing away all `nBlocks` rather than
+      # ignoring the first few.
+      #
+      let nBn = qItem.data.blocks[n].header.number
+      if nBn <= ctx.chain.baseNumber:
+        trace info & ": ignoring block <= base", n, iv,
+          B=ctx.chain.baseNumber.bnStr, L=ctx.chain.latestNumber.bnStr,
+          nthBn=nBn.bnStr, nthHash=qItem.data.getNthHash(n)
+        continue
       ctx.pool.chain.importBlock(qItem.data.blocks[n]).isOkOr:
         warn info & ": import block error", n, iv,
           B=ctx.chain.baseNumber.bnStr, L=ctx.chain.latestNumber.bnStr,
diff --git a/nimbus/sync/beacon/worker/db.nim b/nimbus/sync/beacon/worker/db.nim
index 6853cec1c..e56defbc4 100644
--- a/nimbus/sync/beacon/worker/db.nim
+++ b/nimbus/sync/beacon/worker/db.nim
@@ -114,9 +114,9 @@ proc dbLoadSyncStateLayout*(ctx: BeaconCtxRef; info: static[string]): bool =
   # Add interval of unprocessed header range `(C,D)` from `README.md`
   ctx.headersUnprocSet(ctx.layout.coupler+1, ctx.layout.dangling-1)
 
-  trace info & ": restored sync state", L=latest.bnStr,
+  trace info & ": restored syncer state", L=latest.bnStr,
     C=ctx.layout.coupler.bnStr, D=ctx.layout.dangling.bnStr,
-    F=ctx.layout.final.bnStr, H=ctx.layout.head.bnStr
+    H=ctx.layout.head.bnStr
 
   true
 
diff --git a/nimbus/sync/beacon/worker/start_stop.nim b/nimbus/sync/beacon/worker/start_stop.nim
index 503202418..43fd9b8a1 100644
--- a/nimbus/sync/beacon/worker/start_stop.nim
+++ b/nimbus/sync/beacon/worker/start_stop.nim
@@ -122,8 +122,6 @@ proc setupDatabase*(ctx: BeaconCtxRef; info: static[string]) =
   # system becomes fully active. Otherwise there is some polling only waiting
   # for a new target so there is reduced service (aka `hibernate`.).
   ctx.hibernate = not ctx.dbLoadSyncStateLayout info
-  if ctx.hibernate:
-    trace info & ": hibernating", latest=ctx.chain.latestNumber.bnStr
 
   # Set blocks batch import value for block import
   if ctx.pool.nBodiesBatch < nFetchBodiesRequest:
diff --git a/nimbus/sync/beacon/worker/update.nim b/nimbus/sync/beacon/worker/update.nim
index 4026190ed..76870ce5a 100644
--- a/nimbus/sync/beacon/worker/update.nim
+++ b/nimbus/sync/beacon/worker/update.nim
@@ -121,8 +121,7 @@ proc startHibernating(ctx: BeaconCtxRef; info: static[string]) =
 
   ctx.hibernate = true
 
-  trace info & ": hibernating, awaiting sync target",
-    L=ctx.chain.latestNumber.bnStr
+  trace info & ": suspending syncer", L=ctx.chain.latestNumber.bnStr
 
   # Update, so it can be followed nicely
   ctx.updateMetrics()
@@ -327,7 +326,7 @@ proc updateFinalBlockHeader*(
     b = ctx.chain.baseNumber()
     f = finHdr.number
   if f < b:
-    trace info & ": finalised number too low",
+    trace info & ": finalised block # too low",
       B=b.bnStr, finalised=f.bnStr, delta=(b - f)
 
     ctx.target.reset
@@ -339,7 +338,7 @@
   # Activate running (unless done yet)
   if ctx.hibernate:
     ctx.hibernate = false
-    trace info & ": activated syncer",
+    trace info & ": activating syncer",
      B=b.bnStr, finalised=f.bnStr, head=ctx.target.consHead.bnStr
 
   # Update, so it can be followed nicely
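
Aside, not part of the patch: the skip logic added to `blocksStagedImport()`
above can be shown in isolation. Below is a minimal, self-contained Nim
sketch; `Block`, `importBatch` and `base` are hypothetical stand-ins rather
than the `FC` module API, and only the control flow mirrors the patch:
blocks numbered <= `base` are ignored instead of being handed to `FC`,
which would answer with an error and throw away the whole batch.

type
  Block = object
    number: uint64                  # stand-in for `header.number`

proc importBatch(blocks: seq[Block]; base: uint64): int =
  ## Count the blocks that would be imported, skipping the ones that
  ## overlap with what is already known, i.e. numbered <= `base`.
  for blk in blocks:
    if blk.number <= base:
      continue                      # overlap: ignore, do not abort the batch
    # a real implementation would call something like importBlock(blk) here
    inc result

when isMainModule:
  let staged = @[Block(number: 5), Block(number: 6), Block(number: 7)]
  # `base` advanced to 5 concurrently: block 5 is skipped, 6 and 7 import
  doAssert staged.importBatch(base = 5) == 2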