# beacon_chain
# Copyright (c) 2024 Status Research & Development GmbH
# Licensed and distributed under either of
# * MIT license (license terms in the root directory or at https://opensource.org/licenses/MIT).
# * Apache v2 license (license terms in the root directory or at https://www.apache.org/licenses/LICENSE-2.0).
# at your option. This file may not be copied, modified, or distributed except according to those terms.
{.push raises: [].}
# This module is started when a chain stall is detected, i.e.,
# a long period without any chain progress, while at the same time being
# connected to a healthy number of different peers.
#
# In such a scenario, the network may partition into multiple groups of peers
# that build on separate branches. It is vital to specifically target peers
# from different partitions and download branches which are not necessarily
# popularly served. Branches may be unpopular because they are expensive to
# apply and may have a difficult time propagating despite a large weight of
# attestations backing them. This problem only worsens the longer the split
# view scenario persists.
#
# While sync manager can sync popular chains well, it cannot reliably sync
# minority chains only served by a limited number of peers. This module
# augments sync manager in the split view scenario.
# Note that request manager is not running while sync manager is running.
#
# Note that the canonical chain may not be on the highest slot number,
# as some partitions of the network may have built on top of branches
# with lower validator support while the canonical chain was not visible.
#
# Despite its simplicity and brute-force approach, this module proved highly
# effective in the final month of Goerli. It managed to sync the entire Nimbus
# testnet fleet to the same branch, while also tracking >25 alternate branches.
# Further improvements should be applied:
#
# 1. Progress is currently limited by the size of `block_quarantine` per cycle,
# as this module downloads in backward order into the quarantine before the
# results get applied in forward order. This further limits the concurrency
# to a single peer at a time, because there is only a single quarantine that
# can hold a pending branch history.
#
# This could be addressed by probing the peer about the branch that it's on.
# We could send a by-root request for all our known heads to identify which
# ones they are aware of, followed by a binary search back to the finalized
# slot to determine how far along the peer's progress is. From there on,
# by-range requests would allow forward sync while remembering partial
# progress along the way. We also wouldn't have to be as careful to avoid
# rate-limit disconnections. Empty epoch progress also needs to be remembered
# across syncing sessions, because in a split view scenario there are often
# hundreds of empty epochs, and by-range syncing is highly ineffective.
#
# 2. The peer pool currently provides the best available peer on acquisition.
# Its filtering should be extended to better target interesting peers, i.e.,
# those that claim to know about head roots that we are unaware of and that
# also have a head slot in the past, indicating that sync manager will not
# target those peers and will not manage to pull their branches quickly.
#
# 3. When monitoring gossip, peers that inform about blocks with unknown parent
# roots or aggregates referring to unknown beacon roots should be transferred
# into branch discovery as well. Gossip only propagates through peers that
# have validated the data themselves, so they must have the parent data.
#
# 4. Testing. Beyond Goerli, there is no regular long-lasting low participation
# network that reflects a realistic scenario. The network needs to be huge,
# geographically distributed with a variety of clients and lots of activity.
# Blocks need to take a while to apply to test the slow propagation when
# there are lots of empty epochs between blocks. There must be reorgs of
# hundreds of blocks to reflect EL suddenly going back to optimistic mode.
# A smaller simulation to run in CI may be achievable by intentionally
# setting the `SECONDS_PER_SLOT` to a low value. Furthermore, synthetic
# scenarios can be tested in unit tests by mocking peers and blocks and
# making timers and rate limits configurable.
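#
# For orientation, a typical driving sequence looks roughly like the
# following (an illustrative sketch only; the callback names are
# placeholders, and the actual wiring lives with the caller that detects
# chain stalls):
#
#   let branchDiscovery = BranchDiscovery.new(
#     network.peerPool, getLocalFinalizedSlot, isBlockKnown, blockVerifier)
#   branchDiscovery.start()       # spawn the polling loop (initially active)
#   branchDiscovery.suspend()     # pause while regular sync makes progress
#   branchDiscovery.resume()      # reactivate once the stall is detected again
#   await branchDiscovery.stop()  # cancel the loop and release queued peers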

import
  std/[algorithm, deques],
  chronos, chronicles, metrics, results,
  ../spec/[forks, network],
  ../consensus_object_pools/block_pools_types,
  ../networking/[eth2_network, peer_pool],
  ./sync_protocol

logScope:
  topics = "branchdiscovery"

declareGauge beacon_sync_branchdiscovery_state,
  "Branch discovery module operating state"

declareCounter beacon_sync_branchdiscovery_discovered_blocks,
  "Number of beacon blocks discovered by the branch discovery module"

type
  BranchDiscoveryState* {.pure.} = enum
    Stopped,
    Suspended,
    Active

  GetSlotCallback* =
    proc(): Slot {.gcsafe, raises: [].}

  IsBlockKnownCallback* =
    proc(blockRoot: Eth2Digest): bool {.gcsafe, raises: [].}

  BlockVerifierCallback* = proc(
    signedBlock: ForkedSignedBeaconBlock,
    blobs: Opt[BlobSidecars]
  ): Future[Result[void, VerifierError]] {.async: (raises: [CancelledError]).}

  BranchDiscovery*[A, B] = object
    pool: PeerPool[A, B]                  # source of peers to probe
    getFinalizedSlot: GetSlotCallback     # local finalized slot (lower bound)
    isBlockKnown: IsBlockKnownCallback    # whether a root is already stored
    blockVerifier: BlockVerifierCallback  # applies downloaded blocks / blobs
    isActive: AsyncEvent                  # cleared while suspended
    loopFuture: Future[void].Raising([])  # nil until `start` is called
    peerQueue: Deque[A]                   # peers from `transferOwnership`

proc new*[A, B](
    T: type BranchDiscovery[A, B],
    pool: PeerPool[A, B],
    getFinalizedSlot: GetSlotCallback,
    isBlockKnown: IsBlockKnownCallback,
    blockVerifier: BlockVerifierCallback): ref BranchDiscovery[A, B] =
  let self = (ref BranchDiscovery[A, B])(
    pool: pool,
    getFinalizedSlot: getFinalizedSlot,
    isBlockKnown: isBlockKnown,
    blockVerifier: blockVerifier,
    isActive: newAsyncEvent())
  self[].isActive.fire()  # start in the active state; `suspend` clears this
  self

proc discoverBranch[A, B](
    self: BranchDiscovery[A, B],
    peer: A) {.async: (raises: [CancelledError]).} =
  logScope:
    peer
    peer_score = peer.getScore()

  let oldPeerHeadSlot = peer.getHeadSlot()
  if Moment.now() - peer.getStatusLastTime() >= StatusExpirationTime:
    if not(await peer.updateStatus()):
      peer.updateScore(PeerScoreNoStatus)
      debug "Failed to update status"
      return
  let peerHeadSlot = peer.getHeadSlot()
  if peerHeadSlot != oldPeerHeadSlot:
    peer.updateScore(PeerScoreGoodStatus)
    debug "Peer has synced to a new head", oldPeerHeadSlot, peerHeadSlot

  let finalizedSlot = self.getFinalizedSlot()
  if peerHeadSlot <= finalizedSlot:
    # This peer can sync from other peers; it is useless to us at this time
    peer.updateScore(PeerScoreUseless)
    debug "Peer's head slot is already finalized", peerHeadSlot, finalizedSlot
    return

  var blockRoot = peer.getHeadRoot()
  logScope: blockRoot
  if self.isBlockKnown(blockRoot):
    # This peer may be actively syncing from us; only descore if doing so
    # will not cause a disconnect
    if peer.getScore() >= PeerScoreLowLimit - PeerScoreUseless:
      peer.updateScore(PeerScoreUseless)
    debug "Peer's head block root is already known"
    return

  # Many peers disconnect on rate limiting; we have to avoid triggering it
  # to have a chance of picking up branches that don't propagate well
  const
    maxRequestsPerBurst = 15
    burstDuration = chronos.seconds(30)
  let bucket = TokenBucket.new(maxRequestsPerBurst, burstDuration)

  template consumeTokens(numTokens: int) =
    try:
      await bucket.consume(numTokens)
    except CancelledError as exc:
      raise exc
    except CatchableError as exc:
      raiseAssert "TokenBucket.consume should not fail: " & $exc.msg
  var parentSlot = peerHeadSlot + 1
  logScope: parentSlot
  while true:
    if self.isBlockKnown(blockRoot):
      debug "Branch from peer no longer unknown"
      return
    if peer.getScore() < PeerScoreLowLimit:
      debug "Failed to discover new branch from peer"
      return
    debug "Discovering new branch from peer"

    consumeTokens(1)
    let rsp = await peer.beaconBlocksByRoot_v2(BlockRootsList @[blockRoot])
    if rsp.isErr:
      # `eth2_network` already descored according to the specific error
      debug "Failed to receive block", err = rsp.error
      consumeTokens(5)
      continue
    template blocks: untyped = rsp.get
    # The peer was the one providing us with this block root; it should exist
    if blocks.len == 0:
      peer.updateScore(PeerScoreNoValues)
      debug "Received no blocks", numBlocks = blocks.len
      consumeTokens(5)
      continue
    if blocks.len > 1:
      peer.updateScore(PeerScoreBadResponse)
      debug "Received too many blocks", numBlocks = blocks.len
      return
    template blck: untyped = blocks[0][]
    if blck.slot >= parentSlot:
      # An ancestor must be strictly older than the block that references it
      peer.updateScore(PeerScoreBadResponse)
      debug "Received block that is not older than its child",
        receivedSlot = blck.slot
      return
    if blck.root != blockRoot:
      peer.updateScore(PeerScoreBadResponse)
      debug "Received incorrect block", receivedRoot = blck.root
      return
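
    # From Deneb onward, the block commits to blob sidecars; collect their IDs
    # so the sidecars can be fetched and verified before the block is handed
    # to the block verifier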
    var blobIds: seq[BlobIdentifier]
    withBlck(blck):
      when consensusFork >= ConsensusFork.Deneb:
        for i in 0 ..< forkyBlck.message.body.blob_kzg_commitments.len:
          blobIds.add BlobIdentifier(block_root: blockRoot, index: i.BlobIndex)
    var blobs: Opt[BlobSidecars]
    if blobIds.len > 0:
      while true:
        if self.isBlockKnown(blockRoot):
          debug "Branch from peer no longer unknown"
          return
        if peer.getScore() < PeerScoreLowLimit:
          debug "Failed to discover new branch from peer"
          return

        consumeTokens(1)
        let r = await peer.blobSidecarsByRoot(BlobIdentifierList blobIds)
        if r.isErr:
          # `eth2_network` already descored according to the specific error
          debug "Failed to receive blobs", err = r.error
          consumeTokens(5)
          continue
        template blobSidecars: untyped = r.unsafeGet
        if blobSidecars.len < blobIds.len:
          peer.updateScore(PeerScoreMissingValues)
          debug "Received fewer blobs than expected",
            numBlobs = blobSidecars.len, expectedNumBlobs = blobIds.len
          consumeTokens(5)
          continue
        if blobSidecars.len > blobIds.len:
          peer.updateScore(PeerScoreBadResponse)
          debug "Received too many blobs",
            numBlobs = blobSidecars.len, expectedNumBlobs = blobIds.len
          return
        for i, blobSidecar in blobSidecars:
          let root = hash_tree_root(blobSidecar[].signed_block_header.message)
          if root != blockRoot:
            peer.updateScore(PeerScoreBadResponse)
            debug "Received unexpected blob"
            return
          blobSidecar[].verify_blob_sidecar_inclusion_proof().isOkOr:
            peer.updateScore(PeerScoreBadResponse)
            debug "Received invalid blob"
            return
        blobs = Opt.some distinctBase(blobSidecars).sortedByIt(it.index)
        for i, blobSidecar in blobs.get:
          if blobSidecar[].index != i.BlobIndex:
            peer.updateScore(PeerScoreBadResponse)
            debug "Received duplicate blobs while others are missing"
            return
        break
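
    # Hand the block (plus blobs, if any) to the verifier: `MissingParent`
    # keeps walking backwards along `parent_root`, while `Duplicate` means
    # this branch just connected to history that is already known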
    let err = (await self.blockVerifier(blck, blobs)).errorOr:
      peer.updateScore(PeerScoreGoodBatchValue + PeerScoreGoodValues)
      beacon_sync_branchdiscovery_discovered_blocks.inc()
      info "Discovered new branch from peer"
      break
    case err
    of VerifierError.Invalid:
      peer.updateScore(PeerScoreBadResponse)
      debug "Received invalid block"
      return
    of VerifierError.UnviableFork:
      peer.updateScore(PeerScoreUnviableFork)
      debug "Received unviable block"
      return
    of VerifierError.Duplicate:
      peer.updateScore(PeerScoreGoodValues)
      debug "Connected new branch from peer"
      break
    of VerifierError.MissingParent:
      peer.updateScore(PeerScoreGoodBatchValue)
      parentSlot = blck.slot
      blockRoot = blck.getForkedBlockField(parent_root)
      continue

proc loop(self: ref BranchDiscovery) {.async: (raises: []).} =
  try:
    while true:
      await self[].isActive.wait()

      const pollInterval = chronos.seconds(2)
      await sleepAsync(pollInterval)

      let peer =
        if self[].peerQueue.len > 0:
          self[].peerQueue.popFirst()
        else:
          try:
            self[].pool.acquireNoWait()
          except PeerPoolError as exc:
            debug "Failed to acquire peer", exc = exc.msg
            continue
      defer: self[].pool.release(peer)
      await self[].discoverBranch(peer)
  except CancelledError:
    return
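
# The operating state is `Stopped` until `start` is called, `Suspended` while
# the `isActive` event is cleared, and `Active` otherwise; the
# `beacon_sync_branchdiscovery_state` gauge mirrors this value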
func state*(self: ref BranchDiscovery): BranchDiscoveryState =
  if self[].loopFuture == nil:
    BranchDiscoveryState.Stopped
  elif not self[].isActive.isSet:
    BranchDiscoveryState.Suspended
  else:
    BranchDiscoveryState.Active

proc clearPeerQueue(self: ref BranchDiscovery) =
  while self[].peerQueue.len > 0:
    let peer = self[].peerQueue.popLast()
    self[].pool.release(peer)

proc start*(self: ref BranchDiscovery) =
  doAssert self[].loopFuture == nil
  info "Starting discovery of new branches"
  self[].loopFuture = self.loop()
  beacon_sync_branchdiscovery_state.set(self.state.ord().int64)

proc stop*(self: ref BranchDiscovery) {.async: (raises: []).} =
  if self[].loopFuture != nil:
    info "Stopping discovery of new branches"
    await self[].loopFuture.cancelAndWait()
    self[].loopFuture = nil
    beacon_sync_branchdiscovery_state.set(self.state.ord().int64)
    self.clearPeerQueue()

proc suspend*(self: ref BranchDiscovery) =
  self[].isActive.clear()
  beacon_sync_branchdiscovery_state.set(self.state.ord().int64)
  self.clearPeerQueue()

proc resume*(self: ref BranchDiscovery) =
  self[].isActive.fire()
  beacon_sync_branchdiscovery_state.set(self.state.ord().int64)
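
# Other components (e.g., sync manager or gossip processing) may hand a peer
# over when its status refers to a head root that we don't know. The peer is
# kept in a bounded queue for the discovery loop, or released back to the
# pool right away if it is currently not interesting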
proc transferOwnership*[A, B](self: ref BranchDiscovery[A, B], peer: A) =
  const maxPeersInQueue = 10
  if self.state != BranchDiscoveryState.Active or
      self[].peerQueue.len >= maxPeersInQueue or
      peer.getHeadSlot() <= self[].getFinalizedSlot() or
      self[].isBlockKnown(peer.getHeadRoot()):
    self[].pool.release(peer)
    return

  debug "Peer transferred to branch discovery",
    peer, peer_score = peer.getScore()
  self[].peerQueue.addLast(peer)