fix checkpoint block potentially not getting backfilled into DB (#5863)

When using checkpoint sync, only checkpoint state is available, block is
not downloaded and backfilled later.

`dag.backfill` tracks latest filled `slot`, and latest `parent_root` for
which no block has been synced yet.

In checkpoint sync, this assumption is broken, because there, the start
`dag.backfill.slot` is set based on checkpoint state slot, and the block
is also not available.

However, sync manager in backward mode also requests `dag.backfill.slot`
and `block_clearance` then backfills the checkpoint block once it is
synced. But, there is no guarantee that a peer ever sends us that block.
They could send us all parent blocks and solely omit the checkpoint
block itself. In that situation, we would accept the parent blocks and
advance `dag.backfill`, and subsequently never request the checkpoint
block again, resulting in gap inside blocks DB that is never filled.

To mitigate that, the assumption is restored that `dag.backfill.slot`
is the latest filled `slot`, and `dag.backfill.parent_root` is the next
block that needs to be synced. By setting `slot` to `tail.slot + 1` and
`parent_root` to `tail.root`, we put a fake summary into `dag.backfill`
so that `block_clearance` only proceeds once checkpoint block exists.
This commit is contained in:
Etan Kissling 2024-02-09 11:20:36 +01:00 committed by GitHub
parent 4266e16835
commit 65e6f892de
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
9 changed files with 143 additions and 57 deletions

View File

@ -23,11 +23,12 @@ OK: 1/1 Fail: 0/1 Skip: 0/1
OK: 12/12 Fail: 0/12 Skip: 0/12
## Backfill
```diff
+ Backfill to genesis OK
+ Init without genesis / block OK
+ backfill to genesis OK
+ reload backfill position OK
+ Reload backfill position OK
+ Restart after each block OK
```
OK: 3/3 Fail: 0/3 Skip: 0/3
OK: 4/4 Fail: 0/4 Skip: 0/4
## Beacon chain DB [Preset: mainnet]
```diff
+ empty database [Preset: mainnet] OK

View File

@ -204,6 +204,9 @@ type
slot*: Slot
parent_root*: Eth2Digest
func shortLog*(v: BeaconBlockSummary): auto =
(v.slot, shortLog(v.parent_root))
# Subkeys essentially create "tables" within the key-value store by prefixing
# each entry with a table id

View File

@ -349,7 +349,7 @@ proc addBackfillBlock*(
blockRoot = shortLog(signedBlock.root)
blck = shortLog(signedBlock.message)
signature = shortLog(signedBlock.signature)
backfill = (dag.backfill.slot, shortLog(dag.backfill.parent_root))
backfill = shortLog(dag.backfill)
template blck(): untyped = signedBlock.message # shortcuts without copy
template blockRoot(): untyped = signedBlock.root
@ -393,18 +393,22 @@ proc addBackfillBlock*(
if existing.isSome:
if existing.get().bid.slot == blck.slot and
existing.get().bid.root == blockRoot:
# Special case: when starting with only a checkpoint state, we will not
# have the head block data in the database
if dag.getForkedBlock(existing.get().bid).isNone():
let isDuplicate = dag.containsBlock(existing.get().bid)
if isDuplicate:
debug "Duplicate block"
else:
checkSignature()
debug "Block backfilled (checkpoint)"
debug "Block backfilled (known BlockId)"
dag.putBlock(signedBlock.asTrusted())
return ok()
debug "Duplicate block"
return err(VerifierError.Duplicate)
if blockRoot == dag.backfill.parent_root:
dag.backfill = blck.toBeaconBlockSummary()
return
if isDuplicate:
err(VerifierError.Duplicate)
else:
ok()
# Block is older than finalized, but different from the block in our
# canonical history: it must be from an unviable branch

View File

@ -156,6 +156,12 @@ type
## The backfill points to the oldest block with an unbroken ancestry from
## dag.tail - when backfilling, we'll move backwards in time starting
## with the parent of this block until we reach `frontfill`.
##
## - `backfill.slot` points to the earliest block that has been synced,
## or, if no blocks have been synced yet, to `checkpoint_state.slot + 1`
## which is the earliest slot that may have `parent_root` as ancestor.
## - `backfill.parent_root` is the latest block that is not yet synced.
## - Once backfill completes, `backfill.slot` refers to `GENESIS_SLOT`.
frontfillBlocks*: seq[Eth2Digest]
## A temporary cache of blocks that we could load from era files, once

View File

@ -117,7 +117,7 @@ proc updateFrontfillBlocks*(dag: ChainDAGRef) =
# era files match the chain
if dag.db.db.readOnly: return # TODO abstraction leak - where to put this?
if dag.frontfillBlocks.len == 0 or dag.backfill.slot > 0:
if dag.frontfillBlocks.len == 0 or dag.backfill.slot > GENESIS_SLOT:
return
info "Writing frontfill index", slots = dag.frontfillBlocks.len
@ -259,6 +259,9 @@ proc containsBlock(
cfg: RuntimeConfig, db: BeaconChainDB, slot: Slot, root: Eth2Digest): bool =
db.containsBlock(root, cfg.consensusForkAtEpoch(slot.epoch))
proc containsBlock*(dag: ChainDAGRef, bid: BlockId): bool =
dag.cfg.containsBlock(dag.db, bid.slot, bid.root)
proc getForkedBlock*(db: BeaconChainDB, root: Eth2Digest):
Opt[ForkedTrustedSignedBeaconBlock] =
# When we only have a digest, we don't know which fork it's from so we try
@ -1227,9 +1230,14 @@ proc init*(T: type ChainDAGRef, cfg: RuntimeConfig, db: BeaconChainDB,
db.getBeaconBlockSummary(backfillRoot).expect(
"Backfill block must have a summary: " & $backfillRoot)
else:
elif dag.containsBlock(dag.tail):
db.getBeaconBlockSummary(dag.tail.root).expect(
"Tail block must have a summary: " & $dag.tail.root)
else:
# Checkpoint sync, checkpoint block unavailable
BeaconBlockSummary(
slot: dag.tail.slot + 1,
parent_root: dag.tail.root)
dag.forkDigests = newClone ForkDigests.init(
cfg, getStateField(dag.headState, genesis_validators_root))
@ -1241,8 +1249,9 @@ proc init*(T: type ChainDAGRef, cfg: RuntimeConfig, db: BeaconChainDB,
let finalizedTick = Moment.now()
if dag.backfill.slot > 0: # See if we can frontfill blocks from era files
dag.frontfillBlocks = newSeqOfCap[Eth2Digest](dag.backfill.slot.int)
if dag.backfill.slot > GENESIS_SLOT: # Try frontfill from era files
let backfillSlot = dag.backfill.slot - 1
dag.frontfillBlocks = newSeqOfCap[Eth2Digest](backfillSlot.int)
let
historical_roots = getStateField(dag.headState, historical_roots).asSeq()
@ -1255,7 +1264,7 @@ proc init*(T: type ChainDAGRef, cfg: RuntimeConfig, db: BeaconChainDB,
# blocks from genesis to backfill, if possible.
for bid in dag.era.getBlockIds(
historical_roots, historical_summaries, Slot(0), Eth2Digest()):
if bid.slot >= dag.backfill.slot:
if bid.slot >= backfillSlot:
# If we end up in here, we failed the root comparison just below in
# an earlier iteration
fatal "Era summaries don't lead up to backfill, database or era files corrupt?",
@ -1304,7 +1313,7 @@ proc init*(T: type ChainDAGRef, cfg: RuntimeConfig, db: BeaconChainDB,
head = shortLog(dag.head),
finalizedHead = shortLog(dag.finalizedHead),
tail = shortLog(dag.tail),
backfill = (dag.backfill.slot, shortLog(dag.backfill.parent_root)),
backfill = shortLog(dag.backfill),
loadDur = loadTick - startTick,
summariesDur = summariesTick - loadTick,

View File

@ -28,7 +28,8 @@ proc updateExistingState(
## Wrapper around `updateState` for states expected to exist.
let ok = dag.updateState(state, bsi, save, cache)
if not ok:
error "State failed to load unexpectedly", bsi, tail = dag.tail.slot
error "State failed to load unexpectedly",
bsi, tail = dag.tail.slot, backfill = shortLog(dag.backfill)
doAssert strictVerification notin dag.updateFlags
ok
@ -41,7 +42,8 @@ template withUpdatedExistingState(
dag.withUpdatedState(stateParam, bsiParam) do:
okBody
do:
error "State failed to load unexpectedly", bsi, tail = dag.tail.slot
error "State failed to load unexpectedly",
bsi, tail = dag.tail.slot, backfill = shortLog(dag.backfill)
doAssert strictVerification notin dag.updateFlags
failureBody
@ -49,7 +51,8 @@ proc getExistingBlockIdAtSlot(dag: ChainDAGRef, slot: Slot): Opt[BlockSlotId] =
## Wrapper around `getBlockIdAtSlot` for blocks expected to exist.
let bsi = dag.getBlockIdAtSlot(slot)
if bsi.isNone:
error "Block failed to load unexpectedly", slot, tail = dag.tail.slot
error "Block failed to load unexpectedly",
slot, tail = dag.tail.slot, backfill = shortLog(dag.backfill)
doAssert strictVerification notin dag.updateFlags
bsi
@ -57,7 +60,8 @@ proc existingParent(dag: ChainDAGRef, bid: BlockId): Opt[BlockId] =
## Wrapper around `parent` for parents known to exist.
let parent = dag.parent(bid)
if parent.isNone:
error "Parent failed to load unexpectedly", bid, tail = dag.tail.slot
error "Parent failed to load unexpectedly",
bid, tail = dag.tail.slot, backfill = shortLog(dag.backfill)
doAssert strictVerification notin dag.updateFlags
parent
@ -66,7 +70,8 @@ proc getExistingForkedBlock(
## Wrapper around `getForkedBlock` for blocks expected to exist.
let bdata = dag.getForkedBlock(bid)
if bdata.isNone:
error "Block failed to load unexpectedly", bid, tail = dag.tail.slot
error "Block failed to load unexpectedly",
bid, tail = dag.tail.slot, backfill = shortLog(dag.backfill)
doAssert strictVerification notin dag.updateFlags
bdata
@ -78,7 +83,7 @@ proc existingCurrentSyncCommitteeForPeriod(
let syncCommittee = dag.currentSyncCommitteeForPeriod(tmpState, period)
if syncCommittee.isNone:
error "Current sync committee failed to load unexpectedly",
period, tail = dag.tail.slot
period, tail = dag.tail.slot, backfill = shortLog(dag.backfill)
doAssert strictVerification notin dag.updateFlags
syncCommittee
@ -360,7 +365,7 @@ proc initLightClientUpdateForPeriod(
continue
finalizedSlot = finalizedEpoch.start_slot
finalizedBsi =
if finalizedSlot >= dag.tail.slot:
if finalizedSlot >= max(dag.tail.slot, dag.backfill.slot):
dag.getExistingBlockIdAtSlot(finalizedSlot).valueOr:
dag.handleUnexpectedLightClientError(finalizedSlot)
res.err()
@ -542,7 +547,7 @@ proc assignLightClientData(
when lcDataFork > LightClientDataFork.None:
if finalized_slot == forkyObject.finalized_header.beacon.slot:
forkyObject.finality_branch = attested_data.finality_branch
elif finalized_slot < dag.tail.slot:
elif finalized_slot < max(dag.tail.slot, dag.backfill.slot):
forkyObject.finalized_header.reset()
forkyObject.finality_branch.reset()
else:
@ -632,13 +637,13 @@ proc createLightClientUpdate(
let
finalized_slot = attested_data.finalized_slot
finalized_bsi =
if finalized_slot >= dag.tail.slot:
if finalized_slot >= max(dag.tail.slot, dag.backfill.slot):
dag.getExistingBlockIdAtSlot(finalized_slot)
else:
Opt.none(BlockSlotId)
has_finality =
finalized_bsi.isSome and
finalized_bsi.get.bid.slot >= dag.tail.slot
finalized_bsi.get.bid.slot >= max(dag.tail.slot, dag.backfill.slot)
meta = LightClientUpdateMetadata(
attested_slot: attested_slot,
finalized_slot: finalized_slot,
@ -722,19 +727,7 @@ proc initLightClientDataCache*(dag: ChainDAGRef) =
# State availability, needed for `cacheLightClientData`
dag.tail.slot,
# Block availability, needed for `LightClientHeader.execution_branch`
if dag.backfill.slot != GENESIS_SLOT:
let existing = dag.getBlockIdAtSlot(dag.backfill.slot)
if existing.isSome:
if dag.getForkedBlock(existing.get.bid).isNone:
# Special case: when starting with only a checkpoint state,
# we will not have the head block data in the database
dag.backfill.slot + 1
else:
dag.backfill.slot
else:
dag.backfill.slot
else:
dag.backfill.slot))
dag.backfill.slot))
dag.lcDataStore.cache.tailSlot = max(dag.head.slot, targetTailSlot)
if dag.head.slot < dag.lcDataStore.cache.tailSlot:

View File

@ -323,7 +323,10 @@ proc initFullNode(
dag.finalizedHead.slot
func getBackfillSlot(): Slot =
dag.backfill.slot
if dag.backfill.parent_root != dag.tail.root:
dag.backfill.slot
else:
dag.tail.slot
func getFrontfillSlot(): Slot =
max(dag.frontfill.get(BlockId()).slot, dag.horizon)

View File

@ -407,7 +407,7 @@ proc doTrustedNodeSync*(
let
validatorMonitor = newClone(ValidatorMonitor.init(false, false))
dag = ChainDAGRef.init(cfg, db, validatorMonitor, {}, eraPath = eraDir)
backfillSlot = dag.backfill.slot
backfillSlot = max(dag.backfill.slot, 1.Slot) - 1
horizon = max(dag.horizon, dag.frontfill.valueOr(BlockId()).slot)
let canReindex = if backfillSlot <= horizon:
@ -418,7 +418,7 @@ proc doTrustedNodeSync*(
# detection to kick in, in addBackfillBlock
let missingSlots = dag.backfill.slot - horizon + 1
notice "Downloading historical blocks - you can interrupt this process at any time and it automatically be completed when you start the beacon node",
notice "Downloading historical blocks - you can interrupt this process at any time and it will automatically be completed when you start the beacon node",
backfillSlot, horizon, missingSlots
var # Same averaging as SyncManager

View File

@ -817,7 +817,7 @@ suite "Backfill":
let
db = BeaconChainDB.new("", inMemory = true)
test "backfill to genesis":
test "Backfill to genesis":
let
tailBlock = blocks[^1]
genBlock = get_initial_beacon_block(genState[])
@ -861,15 +861,17 @@ suite "Backfill":
dag.getFinalizedEpochRef() != nil
dag.backfill == tailBlock.phase0Data.message.toBeaconBlockSummary()
# Checkpoint block is unavailable, and should be backfileld first
not dag.containsBlock(dag.tail)
dag.backfill == BeaconBlockSummary(
slot: dag.tail.slot + 1,
parent_root: dag.tail.root)
# Check that we can propose right from the checkpoint state
dag.getProposalState(dag.head, dag.head.slot + 1, cache).isOk()
var
badBlock = blocks[^2].phase0Data
badBlock.signature = blocks[^3].phase0Data.signature
var badBlock = blocks[^1].phase0Data
badBlock.signature = blocks[^2].phase0Data.signature
check:
dag.addBackfillBlock(badBlock) == AddBackRes.err VerifierError.Invalid
@ -879,6 +881,8 @@ suite "Backfill":
dag.addBackfillBlock(genBlock.phase0Data.asSigned()) ==
AddBackRes.err VerifierError.MissingParent
dag.addBackfillBlock(blocks[^2].phase0Data) ==
AddBackRes.err VerifierError.MissingParent
dag.addBackfillBlock(tailBlock.phase0Data).isOk()
check:
@ -920,7 +924,10 @@ suite "Backfill":
check:
dag.getFinalizedEpochRef() != nil
test "reload backfill position":
for i in 0..<blocks.len:
check dag.containsBlock(blocks[i].toBlockId())
test "Reload backfill position":
let
tailBlock = blocks[^1]
@ -932,6 +939,9 @@ suite "Backfill":
dag = init(ChainDAGRef, defaultRuntimeConfig, db, validatorMonitor, {})
check:
dag.addBackfillBlock(blocks[^1].phase0Data).isOk()
dag.backfill == blocks[^1].phase0Data.message.toBeaconBlockSummary()
dag.addBackfillBlock(blocks[^2].phase0Data).isOk()
dag.backfill == blocks[^2].phase0Data.message.toBeaconBlockSummary()
@ -966,6 +976,11 @@ suite "Backfill":
check:
dag.getFinalizedEpochRef() != nil
# Try importing blocks too early
for i in 0..<blocks.len - 1:
check dag.addBackfillBlock(blocks[i].phase0Data) ==
AddBackRes.err VerifierError.MissingParent
for i in 0..<blocks.len:
check: dag.addBackfillBlock(
blocks[blocks.len - i - 1].phase0Data).isOk()
@ -995,6 +1010,50 @@ suite "Backfill":
check:
dag2.head.root == next.root
test "Restart after each block":
ChainDAGRef.preInit(db, tailState[])
for i in 1..blocks.len:
let
validatorMonitor = newClone(ValidatorMonitor.init())
dag = init(ChainDAGRef, defaultRuntimeConfig, db, validatorMonitor, {})
check dag.backfill == (
if i > 1:
blocks[^(i - 1)].phase0Data.message.toBeaconBlockSummary()
else:
BeaconBlockSummary(
slot: blocks[^1].phase0Data.message.slot + 1,
parent_root: blocks[^1].phase0Data.root))
for j in 1..blocks.len:
if j < i:
check dag.addBackfillBlock(blocks[^j].phase0Data) ==
AddBackRes.err VerifierError.Duplicate
elif j > i:
check dag.addBackfillBlock(blocks[^j].phase0Data) ==
AddBackRes.err VerifierError.MissingParent
else:
discard
check:
dag.addBackfillBlock(blocks[^i].phase0Data).isOk()
dag.backfill == blocks[^i].phase0Data.message.toBeaconBlockSummary()
block:
let
validatorMonitor = newClone(ValidatorMonitor.init())
dag = init(ChainDAGRef, defaultRuntimeConfig, db, validatorMonitor, {})
genBlock = get_initial_beacon_block(genState[])
check:
dag.addBackfillBlock(genBlock.phase0Data.asSigned()).isOk()
dag.backfill == default(BeaconBlockSummary)
let
validatorMonitor = newClone(ValidatorMonitor.init())
dag = init(ChainDAGRef, defaultRuntimeConfig, db, validatorMonitor, {})
check dag.backfill == default(BeaconBlockSummary)
suite "Starting states":
setup:
let
@ -1059,14 +1118,17 @@ suite "Starting states":
dag.getFinalizedEpochRef() != nil
dag.backfill == tailBlock.phase0Data.message.toBeaconBlockSummary()
# Checkpoint block is unavailable, and should be backfileld first
not dag.containsBlock(dag.tail)
dag.backfill == BeaconBlockSummary(
slot: dag.tail.slot + 1,
parent_root: dag.tail.root)
# Check that we can propose right from the checkpoint state
dag.getProposalState(dag.head, dag.head.slot + 1, cache).isOk()
var
badBlock = blocks[^2].phase0Data
badBlock.signature = blocks[^3].phase0Data.signature
var badBlock = blocks[^1].phase0Data
badBlock.signature = blocks[^2].phase0Data.signature
check:
dag.addBackfillBlock(badBlock) == AddBackRes.err VerifierError.Invalid
@ -1076,7 +1138,9 @@ suite "Starting states":
dag.addBackfillBlock(genBlock.phase0Data.asSigned()) ==
AddBackRes.err VerifierError.MissingParent
dag.addBackfillBlock(tailBlock.phase0Data) == AddBackRes.ok()
dag.addBackfillBlock(blocks[^2].phase0Data) ==
AddBackRes.err VerifierError.MissingParent
dag.addBackfillBlock(tailBlock.phase0Data).isOk()
check:
dag.addBackfillBlock(blocks[^2].phase0Data).isOk()
@ -1088,6 +1152,9 @@ suite "Starting states":
dag.getBlockId(blocks[^2].root).get().root == blocks[^2].root
dag.getBlockIdAtSlot(dag.tail.slot).get().bid == dag.tail
dag.getBlockIdAtSlot(dag.tail.slot - 1).get() ==
blocks[^2].toBlockId().atSlot()
dag.getBlockIdAtSlot(dag.tail.slot - 2).isNone
dag.backfill == blocks[^2].phase0Data.message.toBeaconBlockSummary()