Merge branch 'dev/etan/zf-branchpull' into feat/splitview

Etan Kissling 2024-03-27 16:14:38 +01:00
commit b869546524
No known key found for this signature in database
GPG Key ID: B21DA824C5A3D03D
8 changed files with 123 additions and 46 deletions

View File

@@ -85,7 +85,7 @@ type
requestManager*: RequestManager
syncManager*: SyncManager[Peer, PeerId]
backfiller*: SyncManager[Peer, PeerId]
branchDiscovery*: ref BranchDiscovery
branchDiscovery*: ref BranchDiscovery[Peer, PeerId]
genesisSnapshotContent*: string
processor*: ref Eth2Processor
blockProcessor*: ref BlockProcessor

View File

@@ -125,6 +125,12 @@ proc startLightClient*(node: BeaconNode) =
node.lightClient.start()
proc stopLightClient*(node: BeaconNode) {.async: (raises: []).} =
if not node.config.syncLightClient:
return
await node.lightClient.stop()
proc installLightClientMessageValidators*(node: BeaconNode) =
let eth2Processor =
if node.config.lightClientDataServe:

View File

@@ -43,7 +43,7 @@ type
getBeaconTime: GetBeaconTimeFn
store: ref ForkedLightClientStore
processor: ref LightClientProcessor
manager: LightClientManager
manager: ref LightClientManager
gossipState: GossipState
onFinalizedHeader*, onOptimisticHeader*: LightClientHeaderCallback
bootstrapObserver*: LightClientBootstrapObserver
@@ -173,7 +173,7 @@ proc createLightClient(
else:
GENESIS_SLOT.sync_committee_period
lightClient.manager = LightClientManager.init(
lightClient.manager = LightClientManager.new(
lightClient.network, rng, getTrustedBlockRoot,
bootstrapVerifier, updateVerifier, finalityVerifier, optimisticVerifier,
isLightClientStoreInitialized, isNextSyncCommitteeKnown,
@@ -215,10 +215,18 @@ proc createLightClient*(
cfg, forkDigests, getBeaconTime, genesis_validators_root, finalizationMode)
proc start*(lightClient: LightClient) =
if lightClient.manager.isRunning:
return
notice "Starting light client",
trusted_block_root = lightClient.trustedBlockRoot
lightClient.manager.start()
proc stop*(lightClient: LightClient) {.async: (raises: [], raw: true).} =
if not lightClient.manager.isRunning:
return
notice "Stopping light client"
lightClient.manager.stop()
proc resetToFinalizedHeader*(
lightClient: LightClient,
header: ForkedLightClientHeader,

View File

@@ -208,6 +208,9 @@ proc handleStatus(peer: Peer,
await peer.handlePeer()
true
const StatusExpirationTime* = chronos.minutes(2)
## Time it takes for the peer's status information to expire.
proc updateStatus*(peer: Peer): Future[bool] {.async: (raises: [CancelledError]).} =
## Request `status` of remote peer ``peer``.
let

View File

@@ -445,8 +445,8 @@ proc initFullNode(
blockProcessor, node.validatorMonitor, dag, attestationPool,
validatorChangePool, node.attachedValidators, syncCommitteeMsgPool,
lightClientPool, quarantine, blobQuarantine, rng, getBeaconTime, taskpool)
branchDiscovery = BranchDiscovery.new(
node.network, getFirstSlotAtFinalizedEpoch, isBlockKnown,
branchDiscovery = BranchDiscovery[Peer, PeerId].new(
node.network.peerPool, getFirstSlotAtFinalizedEpoch, isBlockKnown,
branchDiscoveryBlockVerifier)
fallbackSyncer = proc(peer: Peer) =
branchDiscovery.transferOwnership(peer)
@@ -1637,6 +1637,12 @@ proc onSlotEnd(node: BeaconNode, slot: Slot) {.async.} =
if not node.syncManager.inProgress:
await node.branchDiscovery.stop()
# Light client is stopped while branch discovery is ongoing
if node.branchDiscovery.state != BranchDiscoveryState.Stopped:
node.startLightClient()
else:
await node.stopLightClient()
func formatNextConsensusFork(
node: BeaconNode, withVanityArt = false): Opt[string] =
let consensusFork =

View File

@@ -27,6 +27,49 @@
# Note that the canonical chain may not be on the highest slot number,
# as some partitions of the network may have built on top of branches
# with lower validator support while the canonical chain was not visible.
#
# Despite its simplicity and brute-force approach, this module has been highly
# effective in the final month of Goerli. It managed to sync the entire Nimbus
# testnet fleet to the same branch, while also tracking >25 alternate branches.
# Further improvements should be applied:
#
# 1. Progress is currently limited by the size of `block_quarantine` per cycle,
# as this module downloads in backward order into the quarantine before the
# results get applied in forward order. This further limits concurrency to a
# single peer at a time, because there is only a single quarantine that can
# hold a pending branch history.
#
# This could be addressed by probing the peer about the branch that it's on.
# We could send a by-root request for all of our known heads to identify which
# ones the peer is aware of, followed by a binary search back to the finalized
# slot to determine how far along the peer's progress is (a sketch of such a
# probe follows this list). From there on, by-range requests allow forward
# sync while remembering partial progress along the way. We also wouldn't have
# to be as careful to avoid rate limit disconnections. Empty-epoch progress
# also needs to be remembered across syncing sessions, because in a split-view
# scenario there are often hundreds of empty epochs, and by-range syncing is
# highly ineffective across them.
#
# 2. The peer pool currently provides the best available peer on acquisition.
# Its filtering should be extended to better target interesting peers, i.e.,
# those that claim to know head roots that we are unaware of while also having
# a head slot in the past, indicating that sync manager will not target those
# peers and will not manage to pull their branches quickly (see the filter
# sketch after this list).
#
# 3. When monitoring gossip, peers that inform about blocks with unknown parent
# roots or aggregates referring to unknown beacon roots should be transferred
# into branch discovery as well. Gossip only propagates through peers that
# have validated the data themselves, so they must have the parent data.
#
# 4. Testing. Beyond Goerli, there is no regular, long-lasting low-participation
# network that reflects a realistic scenario. The network needs to be huge and
# geographically distributed, with a variety of clients and lots of activity.
# Blocks need to take a while to apply to test the slow propagation when there
# are lots of empty epochs between blocks. There must be reorgs of hundreds of
# blocks to reflect the EL suddenly going back into optimistic mode. A smaller
# simulation to run in CI may be achievable by intentionally setting
# `SECONDS_PER_SLOT` to a low value. Furthermore, synthetic scenarios can be
# tested in unit tests by mocking peers and blocks and by making timers and
# rate limits configurable.
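
A minimal, standalone sketch of the probe described in item 1 above, assuming a hypothetical `knownOnPeerBranch` callback that stands in for an asynchronous by-root lookup of the peer's block at a given slot; names and signatures are illustrative only, not part of this commit:

type Slot = uint64

proc findDivergenceSlot(
    finalizedSlot, peerHeadSlot: Slot,
    knownOnPeerBranch: proc(slot: Slot): bool): Slot =
  ## Highest slot at which the peer's branch is still known to us; by-range
  ## forward sync could resume from the next slot, with partial progress
  ## remembered across syncing sessions.
  var
    lo = finalizedSlot  # finalized, therefore shared with the peer
    hi = peerHeadSlot   # assumed unknown to us, otherwise no probe is needed
  while lo + 1 < hi:
    let mid = lo + (hi - lo) div 2
    if knownOnPeerBranch(mid):  # e.g., a by-root request for the peer's block at `mid`
      lo = mid
    else:
      hi = mid
  lo

when isMainModule:
  # Toy check: we share the peer's branch up to slot 120; its head is at slot 200.
  let knownUpTo120 = proc(slot: Slot): bool = slot <= 120
  doAssert findDivergenceSlot(100, 200, knownUpTo120) == 120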
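
Likewise, a sketch of the peer acquisition filter suggested in item 2; the name `isInterestingForBranchDiscovery` and the `BlockRoot` stand-in for `Eth2Digest` are hypothetical:

type
  Slot = uint64
  BlockRoot = array[32, byte]  # stand-in for Eth2Digest

proc isInterestingForBranchDiscovery(
    peerHeadRoot: BlockRoot,
    peerHeadSlot, localHeadSlot: Slot,
    isBlockKnown: proc(root: BlockRoot): bool): bool =
  ## A peer is interesting to branch discovery when it claims a head root that
  ## we do not know and its head slot is not ahead of ours, so sync manager
  ## will not target it and nothing else would pull its branch.
  (not isBlockKnown(peerHeadRoot)) and peerHeadSlot <= localHeadSlot

Such a predicate could be evaluated right after acquisition, releasing uninteresting peers back to the pool.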
import
std/[algorithm, deques],
@@ -62,23 +105,23 @@ type
blobs: Opt[BlobSidecars]
): Future[Result[void, VerifierError]] {.async: (raises: [CancelledError]).}
BranchDiscovery* = object
network: Eth2Node
BranchDiscovery*[A, B] = object
pool: PeerPool[A, B]
getFinalizedSlot: GetSlotCallback
isBlockKnown: IsBlockKnownCallback
blockVerifier: BlockVerifierCallback
isActive: AsyncEvent
loopFuture: Future[void].Raising([])
peerQueue: Deque[Peer]
peerQueue: Deque[A]
proc new*(
T: type BranchDiscovery,
network: Eth2Node,
proc new*[A, B](
T: type BranchDiscovery[A, B],
pool: PeerPool[A, B],
getFinalizedSlot: GetSlotCallback,
isBlockKnown: IsBlockKnownCallback,
blockVerifier: BlockVerifierCallback): ref BranchDiscovery =
let self = (ref BranchDiscovery)(
network: network,
blockVerifier: BlockVerifierCallback): ref BranchDiscovery[A, B] =
let self = (ref BranchDiscovery[A, B])(
pool: pool,
getFinalizedSlot: getFinalizedSlot,
isBlockKnown: isBlockKnown,
blockVerifier: blockVerifier,
@@ -86,16 +129,27 @@ proc new*(
self[].isActive.fire()
self
proc discoverBranch(
self: BranchDiscovery, peer: Peer) {.async: (raises: [CancelledError]).} =
proc discoverBranch[A, B](
self: BranchDiscovery[A, B],
peer: A) {.async: (raises: [CancelledError]).} =
logScope:
peer
peer_score = peer.getScore()
let
finalizedSlot = self.getFinalizedSlot()
peerHeadSlot = peer.getHeadSlot()
let oldPeerHeadSlot = peer.getHeadSlot()
if Moment.now() - peer.getStatusLastTime() >= StatusExpirationTime:
if not(await peer.updateStatus()):
peer.updateScore(PeerScoreNoStatus)
debug "Failed to update status"
return
let peerHeadSlot = peer.getHeadSlot()
if peerHeadSlot != oldPeerHeadSlot:
peer.updateScore(PeerScoreGoodStatus)
debug "Peer has synced to a new head", oldPeerHeadSlot, peerHeadSlot
let finalizedSlot = self.getFinalizedSlot()
if peerHeadSlot <= finalizedSlot:
# This peer can sync from other peers; it is useless to us at this time
peer.updateScore(PeerScoreUseless)
debug "Peer's head slot is already finalized", peerHeadSlot, finalizedSlot
return
@@ -103,11 +157,14 @@ proc discoverBranch(
var blockRoot = peer.getHeadRoot()
logScope: blockRoot
if self.isBlockKnown(blockRoot):
# This peer may be actively syncing from us; only descore if that won't trigger a disconnect
if peer.getScore() >= PeerScoreLowLimit - PeerScoreUseless:
peer.updateScore(PeerScoreUseless)
debug "Peer's head block root is already known"
return
# Many peers disconnect on rate limit; we have to avoid getting hit by it
# to have a chance of picking up branches that don't have good propagation
const
maxRequestsPerBurst = 15
burstDuration = chronos.seconds(30)
@@ -250,11 +307,11 @@ proc loop(self: ref BranchDiscovery) {.async: (raises: []).} =
self[].peerQueue.popFirst()
else:
try:
self[].network.peerPool.acquireNoWait()
self[].pool.acquireNoWait()
except PeerPoolError as exc:
debug "Failed to acquire peer", exc = exc.msg
continue
defer: self[].network.peerPool.release(peer)
defer: self[].pool.release(peer)
await self[].discoverBranch(peer)
except CancelledError:
@@ -271,7 +328,7 @@ func state*(self: ref BranchDiscovery): BranchDiscoveryState =
proc clearPeerQueue(self: ref BranchDiscovery) =
while self[].peerQueue.len > 0:
let peer = self[].peerQueue.popLast()
self[].network.peerPool.release(peer)
self[].pool.release(peer)
proc start*(self: ref BranchDiscovery) =
doAssert self[].loopFuture == nil
@@ -296,13 +353,13 @@ proc resume*(self: ref BranchDiscovery) =
self[].isActive.fire()
beacon_sync_branchdiscovery_state.set(self.state.ord().int64)
proc transferOwnership*(self: ref BranchDiscovery, peer: Peer) =
proc transferOwnership*[A, B](self: ref BranchDiscovery[A, B], peer: A) =
const maxPeersInQueue = 10
if self.state != BranchDiscoveryState.Active or
self[].peerQueue.len >= maxPeersInQueue or
peer.getHeadSlot() <= self[].getFinalizedSlot() or
self[].isBlockKnown(peer.getHeadRoot()):
self[].network.peerPool.release(peer)
self[].pool.release(peer)
return
debug "Peer transferred to branch discovery",

View File

@@ -67,7 +67,7 @@ type
getBeaconTime: GetBeaconTimeFn
loopFuture: Future[void].Raising([CancelledError])
func init*(
func new*(
T: type LightClientManager,
network: Eth2Node,
rng: ref HmacDrbgContext,
@@ -81,9 +81,9 @@ func init*(
getFinalizedPeriod: GetSyncCommitteePeriodCallback,
getOptimisticPeriod: GetSyncCommitteePeriodCallback,
getBeaconTime: GetBeaconTimeFn
): LightClientManager =
): ref LightClientManager =
## Initialize light client manager.
LightClientManager(
(ref LightClientManager)(
network: network,
rng: rng,
getTrustedBlockRoot: getTrustedBlockRoot,
@@ -99,16 +99,16 @@ func init*(
)
proc isGossipSupported*(
self: LightClientManager,
self: ref LightClientManager,
period: SyncCommitteePeriod
): bool =
## Indicate whether the light client is sufficiently synced to accept gossip.
if not self.isLightClientStoreInitialized():
if not self[].isLightClientStoreInitialized():
return false
period.isGossipSupported(
finalizedPeriod = self.getFinalizedPeriod(),
isNextSyncCommitteeKnown = self.isNextSyncCommitteeKnown())
finalizedPeriod = self[].getFinalizedPeriod(),
isNextSyncCommitteeKnown = self[].isNextSyncCommitteeKnown())
# https://github.com/ethereum/consensus-specs/blob/v1.4.0-beta.5/specs/altair/light-client/p2p-interface.md#getlightclientbootstrap
proc doRequest(
@@ -381,13 +381,16 @@ proc loop(self: LightClientManager) {.async: (raises: [CancelledError]).} =
isNextSyncCommitteeKnown = self.isNextSyncCommitteeKnown(),
didLatestSyncTaskProgress = didProgress)
proc start*(self: var LightClientManager) =
## Start light client manager's loop.
doAssert self.loopFuture == nil
self.loopFuture = self.loop()
func isRunning*(self: ref LightClientManager): bool =
self[].loopFuture != nil
proc stop*(self: var LightClientManager) {.async: (raises: []).} =
proc start*(self: ref LightClientManager) =
## Start light client manager's loop.
doAssert self[].loopFuture == nil
self[].loopFuture = self[].loop()
proc stop*(self: ref LightClientManager) {.async: (raises: []).} =
## Stop light client manager's loop.
if self.loopFuture != nil:
await noCancel self.loopFuture.cancelAndWait()
self.loopFuture = nil
if self[].loopFuture != nil:
await noCancel self[].loopFuture.cancelAndWait()
self[].loopFuture = nil

View File

@@ -28,12 +28,6 @@ const
SyncWorkersCount* = 10
## Number of sync workers to spawn
StatusUpdateInterval* = chronos.minutes(1)
## Minimum time between two subsequent calls to update peer's status
StatusExpirationTime* = chronos.minutes(2)
## Time it takes for the peer's status information to expire.
type
PeerSyncer*[T] = proc(peer: T) {.gcsafe, raises: [].}