Make syncing performance more steady. (#6722)

* Initial commit.

* Update tests.

* Update AllTests.

* Post-rebase update AllTests.

* Improve log statements which were hidden because of logScope issues.

* Eliminate duplicate `direction` fields.
Debugging blobs map.

* Fix blobs shortmap generation.

* Add fixes to map generation and add test helpers.

* Fix missing dependency.

* Adjust callbacks functionality for untrusted sync.

* Attempt to fix defect state issue with new flag `setLastStateRoot`.

* Make the possible fix less intrusive.

* Fix forward declaration missing `=`.

* Revert attempt to improve performance.

* Revert some of the changes.

* Remove spaces.

* Remove LFS fixes from the PR.

* Update AllTests.

* Update copyright year.

* Update AllTests.

* Post-rebase fixes.
Eugene Kabanov, 2025-02-04 17:36:25 +02:00, committed by GitHub
parent 426856edcb
commit ba934af925
7 changed files with 2493 additions and 2166 deletions

@@ -984,33 +984,28 @@ OK: 1/1 Fail: 0/1 Skip: 0/1
 OK: 7/7 Fail: 0/7 Skip: 0/7
 ## SyncManager test suite
 ```diff
-+ Process all unviable blocks OK
 + [SyncManager] groupBlobs() test OK
-+ [SyncQueue#Backward] Async unordered push test OK
-+ [SyncQueue#Backward] Async unordered push with rewind test OK
-+ [SyncQueue#Backward] Good response with missing values towards end OK
-+ [SyncQueue#Backward] Handle out-of-band sync progress advancement OK
-+ [SyncQueue#Backward] Pass through established limits test OK
-+ [SyncQueue#Backward] Smoke test OK
-+ [SyncQueue#Backward] Start and finish slots equal OK
-+ [SyncQueue#Backward] Two full requests success/fail OK
++ [SyncQueue# & Backward] Combination of missing parent and good blocks [3 peers] test OK
++ [SyncQueue# & Backward] Failure request push test OK
++ [SyncQueue# & Backward] Invalid block [3 peers] test OK
++ [SyncQueue# & Backward] Smoke [3 peers] test OK
++ [SyncQueue# & Backward] Smoke [single peer] test OK
++ [SyncQueue# & Backward] Unviable block [3 peers] test OK
++ [SyncQueue# & Forward] Combination of missing parent and good blocks [3 peers] test OK
++ [SyncQueue# & Forward] Failure request push test OK
++ [SyncQueue# & Forward] Invalid block [3 peers] test OK
++ [SyncQueue# & Forward] Smoke [3 peers] test OK
++ [SyncQueue# & Forward] Smoke [single peer] test OK
++ [SyncQueue# & Forward] Unviable block [3 peers] test OK
++ [SyncQueue#Backward] Missing parent and exponential rewind [3 peers] test OK
 + [SyncQueue#Backward] getRewindPoint() test OK
-+ [SyncQueue#Forward] Async unordered push test OK
-+ [SyncQueue#Forward] Async unordered push with rewind test OK
-+ [SyncQueue#Forward] Good response with missing values towards end OK
-+ [SyncQueue#Forward] Handle out-of-band sync progress advancement OK
-+ [SyncQueue#Forward] Pass through established limits test OK
-+ [SyncQueue#Forward] Smoke test OK
-+ [SyncQueue#Forward] Start and finish slots equal OK
-+ [SyncQueue#Forward] Two full requests success/fail OK
++ [SyncQueue#Forward] Missing parent and exponential rewind [3 peers] test OK
 + [SyncQueue#Forward] getRewindPoint() test OK
 + [SyncQueue] checkBlobsResponse() test OK
 + [SyncQueue] checkResponse() test OK
-+ [SyncQueue] contains() test OK
-+ [SyncQueue] getLastNonEmptySlot() test OK
 + [SyncQueue] hasEndGap() test OK
 ```
-OK: 25/25 Fail: 0/25 Skip: 0/25
+OK: 20/20 Fail: 0/20 Skip: 0/20
 ## Type helpers
 ```diff
 + BeaconBlock OK
@@ -1154,4 +1149,4 @@ OK: 2/2 Fail: 0/2 Skip: 0/2
 OK: 9/9 Fail: 0/9 Skip: 0/9
 ---TOTAL---
-OK: 783/788 Fail: 0/788 Skip: 5/788
+OK: 778/783 Fail: 0/783 Skip: 5/783

@@ -520,10 +520,6 @@ proc addBackfillBlockData*(
         "database corrupt?", clearanceBlock = shortLog(clearanceBlock)
       return err(VerifierError.MissingParent)
 
-  # dag.clearanceState.setStateRoot(trustedStateRoot)
-  # TODO (cheatfate): This is last part of previous TODO comment, we should
-  # set state's `root` to block's `state_root`.
-
   let proposerVerifyTick = Moment.now()
 
   if not(isNil(onStateUpdated)):

@@ -385,14 +385,11 @@ proc initFullNode(
       else:
         dag.tail.slot
 
-  proc getUntrustedBackfillSlot(): Slot =
+  func getUntrustedBackfillSlot(): Slot =
     if clist.tail.isSome():
       clist.tail.get().blck.slot
     else:
-      getLocalWallSlot()
-
-  func getUntrustedFrontfillSlot(): Slot =
-    getFirstSlotAtFinalizedEpoch()
+      dag.tail.slot
 
   func getFrontfillSlot(): Slot =
     max(dag.frontfill.get(BlockId()).slot, dag.horizon)

@@ -535,7 +532,7 @@ proc initFullNode(
       dag.cfg.DENEB_FORK_EPOCH, dag.cfg.MIN_EPOCHS_FOR_BLOB_SIDECARS_REQUESTS,
       SyncQueueKind.Backward, getLocalHeadSlot,
       getLocalWallSlot, getFirstSlotAtFinalizedEpoch, getUntrustedBackfillSlot,
-      getUntrustedFrontfillSlot, isWithinWeakSubjectivityPeriod,
+      getFrontfillSlot, isWithinWeakSubjectivityPeriod,
       clistPivotSlot, untrustedBlockVerifier, maxHeadAge = 0,
       shutdownEvent = node.shutdownEvent,
      flags = syncManagerFlags)
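
The hunks above change the fallback for the untrusted backfill start: when the chain list has no tail yet, the backward queue now starts from `dag.tail.slot` instead of the wall slot, and the separate untrusted frontfill callback is folded into the regular `getFrontfillSlot`. A minimal sketch of that fallback choice, using simplified stand-in types rather than the real beacon-chain ones:

```nim
import std/options

type Slot = uint64

proc getUntrustedBackfillSlot(clistTail: Option[Slot], dagTail: Slot): Slot =
  # Prefer the untrusted chain-list tail when it exists; otherwise fall back
  # to the DAG tail (previously the fallback was the wall slot, which made
  # the backward queue's starting point drift with the clock).
  if clistTail.isSome():
    clistTail.get()
  else:
    dagTail

echo getUntrustedBackfillSlot(some(Slot(123)), Slot(100))  # 123
echo getUntrustedBackfillSlot(none(Slot), Slot(100))       # 100
```
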

@@ -21,9 +21,6 @@ import
 export phase0, altair, merge, chronos, chronicles, results,
        helpers, peer_scores, sync_queue, forks, sync_protocol
 
-logScope:
-  topics = "syncman"
-
 const
   SyncWorkersCount* = 10
     ## Number of sync workers to spawn

@@ -34,6 +31,12 @@ const
   StatusExpirationTime* = chronos.minutes(2)
     ## Time it takes for the peer's status information to expire.
 
+  ConcurrentRequestsCount* = 3
+    ## Number of requests performed by one peer in single syncing step
+
+  RepeatingFailuresCount* = 2
+    ## Number of repeating errors before starting rewind process.
+
   WeakSubjectivityLogMessage* =
     "Database state missing or too old, cannot sync - resync the client " &
     "using a trusted node or allow lenient long-range syncing with the " &
@@ -81,6 +84,8 @@ type
     direction: SyncQueueKind
     ident*: string
     flags: set[SyncManagerFlag]
+    concurrentRequestsCount: int
+    repeatingFailuresCount: int
 
   SyncMoment* = object
     stamp*: chronos.Moment

@@ -115,8 +120,10 @@ proc initQueue[A, B](man: SyncManager[A, B]) =
   of SyncQueueKind.Forward:
     man.queue = SyncQueue.init(A, man.direction, man.getFirstSlot(),
                                man.getLastSlot(), man.chunkSize,
+                               man.concurrentRequestsCount,
+                               man.repeatingFailuresCount,
                                man.getSafeSlot, man.blockVerifier,
-                               1, man.ident)
+                               man.ident)
   of SyncQueueKind.Backward:
     let
       firstSlot = man.getFirstSlot()

@@ -128,27 +135,34 @@ proc initQueue[A, B](man: SyncManager[A, B]) =
       else:
         firstSlot - 1'u64
     man.queue = SyncQueue.init(A, man.direction, startSlot, lastSlot,
-                               man.chunkSize, man.getSafeSlot,
-                               man.blockVerifier, 1, man.ident)
+                               man.chunkSize,
+                               man.concurrentRequestsCount,
+                               man.repeatingFailuresCount,
+                               man.getSafeSlot,
+                               man.blockVerifier, man.ident)
 
-proc newSyncManager*[A, B](pool: PeerPool[A, B],
-                           denebEpoch: Epoch,
-                           minEpochsForBlobSidecarsRequests: uint64,
-                           direction: SyncQueueKind,
-                           getLocalHeadSlotCb: GetSlotCallback,
-                           getLocalWallSlotCb: GetSlotCallback,
-                           getFinalizedSlotCb: GetSlotCallback,
-                           getBackfillSlotCb: GetSlotCallback,
-                           getFrontfillSlotCb: GetSlotCallback,
-                           weakSubjectivityPeriodCb: GetBoolCallback,
-                           progressPivot: Slot,
-                           blockVerifier: BlockVerifier,
-                           shutdownEvent: AsyncEvent,
-                           maxHeadAge = uint64(SLOTS_PER_EPOCH * 1),
-                           chunkSize = uint64(SLOTS_PER_EPOCH),
-                           flags: set[SyncManagerFlag] = {},
-                           ident = "main"
-                          ): SyncManager[A, B] =
+proc newSyncManager*[A, B](
+    pool: PeerPool[A, B],
+    denebEpoch: Epoch,
+    minEpochsForBlobSidecarsRequests: uint64,
+    direction: SyncQueueKind,
+    getLocalHeadSlotCb: GetSlotCallback,
+    getLocalWallSlotCb: GetSlotCallback,
+    getFinalizedSlotCb: GetSlotCallback,
+    getBackfillSlotCb: GetSlotCallback,
+    getFrontfillSlotCb: GetSlotCallback,
+    weakSubjectivityPeriodCb: GetBoolCallback,
+    progressPivot: Slot,
+    blockVerifier: BlockVerifier,
+    shutdownEvent: AsyncEvent,
+    maxHeadAge = uint64(SLOTS_PER_EPOCH * 1),
+    chunkSize = uint64(SLOTS_PER_EPOCH),
+    flags: set[SyncManagerFlag] = {},
+    concurrentRequestsCount = ConcurrentRequestsCount,
+    repeatingFailuresCount = RepeatingFailuresCount,
+    ident = "main"
+): SyncManager[A, B] =
   let (getFirstSlot, getLastSlot, getSafeSlot) = case direction
   of SyncQueueKind.Forward:
     (getLocalHeadSlotCb, getLocalWallSlotCb, getFinalizedSlotCb)

@@ -173,7 +187,9 @@ proc newSyncManager*[A, B](pool: PeerPool[A, B],
     direction: direction,
     shutdownEvent: shutdownEvent,
     ident: ident,
-    flags: flags
+    flags: flags,
+    concurrentRequestsCount: concurrentRequestsCount,
+    repeatingFailuresCount: repeatingFailuresCount
   )
   res.initQueue()
   res

@@ -182,18 +198,15 @@ proc getBlocks[A, B](man: SyncManager[A, B], peer: A,
                      req: SyncRequest[A]): Future[BeaconBlocksRes] {.
     async: (raises: [CancelledError], raw: true).} =
   mixin getScore, `==`
 
-  logScope:
-    peer_score = peer.getScore()
-    peer_speed = peer.netKbps()
-    sync_ident = man.ident
-    direction = man.direction
-    topics = "syncman"
-
   doAssert(not(req.isEmpty()), "Request must not be empty!")
-  debug "Requesting blocks from peer", request = req
+  debug "Requesting blocks from peer",
+        request = req,
+        peer_score = req.item.getScore(),
+        peer_speed = req.item.netKbps(),
+        sync_ident = man.ident,
+        topics = "syncman"
 
-  beaconBlocksByRange_v2(peer, req.slot, req.count, 1'u64)
+  beaconBlocksByRange_v2(peer, req.data.slot, req.data.count, 1'u64)
 
 proc shouldGetBlobs[A, B](man: SyncManager[A, B], s: Slot): bool =
   let
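
Throughout the remaining hunks, the request's slot range moves behind a `data` field (`req.data.slot`, `req.data.count`) and the owning peer is reached via `req.item`. The following sketch shows the shape this implies; it is inferred from the accessor changes in this diff, not copied from the actual `sync_queue` definition:

```nim
type
  Slot = uint64
  SyncRange = object
    slot: Slot     # first slot of the requested range
    count: uint64  # number of slots requested

  SyncRequest[T] = object
    data: SyncRange  # the range, previously flattened into the request
    item: T          # the peer that owns this request

proc lastSlot[T](req: SyncRequest[T]): Slot =
  # Mirrors `request.data.slot + request.data.count - 1` used in the diff.
  req.data.slot + req.data.count - 1

let req = SyncRequest[string](data: SyncRange(slot: 96, count: 32),
                              item: "peer-1")
echo req.lastSlot()  # 127
```
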
@@ -204,23 +217,23 @@ proc shouldGetBlobs[A, B](man: SyncManager[A, B], s: Slot): bool =
     epoch >= wallEpoch - man.MIN_EPOCHS_FOR_BLOB_SIDECARS_REQUESTS)
 
 proc shouldGetBlobs[A, B](man: SyncManager[A, B], r: SyncRequest[A]): bool =
-  man.shouldGetBlobs(r.slot) or man.shouldGetBlobs(r.slot + (r.count - 1))
+  man.shouldGetBlobs(r.data.slot) or
+  man.shouldGetBlobs(r.data.slot + (r.data.count - 1))
 
 proc getBlobSidecars[A, B](man: SyncManager[A, B], peer: A,
                            req: SyncRequest[A]): Future[BlobSidecarsRes]
                            {.async: (raises: [CancelledError], raw: true).} =
   mixin getScore, `==`
 
-  logScope:
-    peer_score = peer.getScore()
-    peer_speed = peer.netKbps()
-    sync_ident = man.ident
-    direction = man.direction
-    topics = "syncman"
-
   doAssert(not(req.isEmpty()), "Request must not be empty!")
-  debug "Requesting blobs sidecars from peer", request = req
+  debug "Requesting blobs sidecars from peer",
+        request = req,
+        peer_score = req.item.getScore(),
+        peer_speed = req.item.netKbps(),
+        sync_ident = man.ident,
+        topics = "syncman"
 
-  blobSidecarsByRange(peer, req.slot, req.count)
+  blobSidecarsByRange(peer, req.data.slot, req.data.count)

@@ -238,8 +251,8 @@ proc remainingSlots(man: SyncManager): uint64 =
       0'u64
 
 func groupBlobs*(
-    blocks: seq[ref ForkedSignedBeaconBlock],
-    blobs: seq[ref BlobSidecar]
+    blocks: openArray[ref ForkedSignedBeaconBlock],
+    blobs: openArray[ref BlobSidecar]
 ): Result[seq[BlobSidecars], string] =
   var
     grouped = newSeq[BlobSidecars](len(blocks))
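
`groupBlobs` now accepts `openArray` parameters, so callers can pass sequences or plain arrays without copying. Its job is to bucket the flat, slot-ordered blob list per block; below is a simplified sketch that groups by slot only (the real routine also validates blobs against each block's kzg commitments, which is an assumption worth checking in `sync_queue`/`sync_manager` itself):

```nim
import std/sequtils

type
  Slot = uint64
  Blob = object
    slot: Slot

proc groupBySlot(blockSlots: openArray[Slot],
                 blobs: openArray[Blob]): seq[seq[Blob]] =
  # Assumes `blobs` is sorted by slot, as checkBlobsResponse() enforces.
  result = newSeq[seq[Blob]](blockSlots.len)
  var i = 0
  for j, s in blockSlots:
    while i < blobs.len and blobs[i].slot == s:
      result[j].add(blobs[i])
      inc i

let grouped = groupBySlot([Slot(10), Slot(11)],
                          [Blob(slot: 10), Blob(slot: 10), Blob(slot: 11)])
echo grouped.mapIt(it.len)  # @[2, 1]
```
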
@@ -287,13 +300,12 @@ proc getSyncBlockData*[T](
 ): Future[SyncBlockDataRes] {.async: (raises: [CancelledError]).} =
   mixin getScore
 
-  logScope:
-    slot = slot
-    peer_score = peer.getScore()
-    peer_speed = peer.netKbps()
-    topics = "syncman"
-
-  debug "Requesting block from peer"
+  debug "Requesting block from peer",
+        slot = slot,
+        peer = peer,
+        peer_score = peer.getScore(),
+        peer_speed = peer.netKbps(),
+        topics = "syncman"
 
   let blocksRange =
     block:

@@ -312,7 +324,12 @@ proc getSyncBlockData*[T](
       return err("Incorrect number of blocks was returned by peer, " &
                  $len(blocksRange))
 
-  debug "Received block on request"
+  debug "Received block on request",
+        slot = slot,
+        peer = peer,
+        peer_score = peer.getScore(),
+        peer_speed = peer.netKbps(),
+        topics = "syncman"
 
   if blocksRange[0][].slot != slot:
     peer.updateScore(PeerScoreBadResponse)

@@ -349,7 +366,13 @@ proc getSyncBlockData*[T](
       peer.updateScore(PeerScoreBadResponse)
       return err("Incorrect number of received blobs in the requested range")
 
-    debug "Received blobs on request", blobs_count = len(blobData)
+    debug "Received blobs on request",
+          slot = slot,
+          blobs_count = len(blobData),
+          peer = peer,
+          peer_score = peer.getScore(),
+          peer_speed = peer.netKbps(),
+          topics = "syncman"
 
     let groupedBlobs = groupBlobs(blocksRange, blobData).valueOr:
       peer.updateScore(PeerScoreNoValues)

@@ -365,84 +388,204 @@ proc getSyncBlockData*[T](
   ok(SyncBlockData(blocks: blocksRange, blobs: blobsRange))
 
-proc syncStep[A, B](
-    man: SyncManager[A, B], index: int, peer: A
-) {.async: (raises: [CancelledError]).} =
-  logScope:
-    peer_score = peer.getScore()
-    peer_speed = peer.netKbps()
-    index = index
-    sync_ident = man.ident
-    topics = "syncman"
-
-  var
-    headSlot = man.getLocalHeadSlot()
-    wallSlot = man.getLocalWallSlot()
-    peerSlot = peer.getHeadSlot()
-
-  block: # Check that peer status is recent and relevant
-    logScope:
-      peer = peer
-      direction = man.direction
-
-    debug "Peer's syncing status", wall_clock_slot = wallSlot,
-          remote_head_slot = peerSlot, local_head_slot = headSlot
-
-    let
-      peerStatusAge = Moment.now() - peer.getStatusLastTime()
-      needsUpdate =
-        # Latest status we got is old
-        peerStatusAge >= StatusExpirationTime or
-        # The point we need to sync is close to where the peer is
-        man.getFirstSlot() >= peerSlot
-
-    if needsUpdate:
-      man.workers[index].status = SyncWorkerStatus.UpdatingStatus
-
-      # Avoid a stampede of requests, but make them more frequent in case the
-      # peer is "close" to the slot range of interest
-      if peerStatusAge < StatusExpirationTime div 2:
-        await sleepAsync(StatusExpirationTime div 2 - peerStatusAge)
-
-      trace "Updating peer's status information", wall_clock_slot = wallSlot,
-            remote_head_slot = peerSlot, local_head_slot = headSlot
-
-      if not(await peer.updateStatus()):
-        peer.updateScore(PeerScoreNoStatus)
-        debug "Failed to get remote peer's status, exiting",
-              peer_head_slot = peerSlot
-        return
-
-      let newPeerSlot = peer.getHeadSlot()
-      if peerSlot >= newPeerSlot:
-        peer.updateScore(PeerScoreStaleStatus)
-        debug "Peer's status information is stale",
-              wall_clock_slot = wallSlot, remote_old_head_slot = peerSlot,
-              local_head_slot = headSlot, remote_new_head_slot = newPeerSlot
-      else:
-        debug "Peer's status information updated", wall_clock_slot = wallSlot,
-              remote_old_head_slot = peerSlot, local_head_slot = headSlot,
-              remote_new_head_slot = newPeerSlot
-        peer.updateScore(PeerScoreGoodStatus)
-        peerSlot = newPeerSlot
-
-  # Time passed - enough to move slots, if sleep happened
-  headSlot = man.getLocalHeadSlot()
-  wallSlot = man.getLocalWallSlot()
-
-  if man.remainingSlots() <= man.maxHeadAge:
-    logScope:
-      peer = peer
-      direction = man.direction
-
-    case man.direction
-    of SyncQueueKind.Forward:
-      info "We are in sync with network", wall_clock_slot = wallSlot,
-           remote_head_slot = peerSlot, local_head_slot = headSlot
-    of SyncQueueKind.Backward:
-      info "Backfill complete", wall_clock_slot = wallSlot,
-           remote_head_slot = peerSlot, local_head_slot = headSlot
+proc getSyncBlockData[A, B](
+    man: SyncManager[A, B],
+    index: int,
+    sr: SyncRequest[A]
+): Future[SyncBlockDataRes] {.async: (raises: [CancelledError]).} =
+  let
+    peer = sr.item
+    blocks = (await man.getBlocks(peer, sr)).valueOr:
+      peer.updateScore(PeerScoreNoValues)
+      return err("Failed to receive blocks on request, reason: " & $error)
+    blockSlots = mapIt(blocks, it[].slot)
+
+  debug "Received blocks on request",
+        request = sr,
+        peer_score = sr.item.getScore(),
+        peer_speed = sr.item.netKbps(),
+        index = index,
+        blocks_count = len(blocks),
+        blocks_map = getShortMap(sr, blocks.toSeq()),
+        sync_ident = man.ident,
+        topics = "syncman"
+
+  checkResponse(sr, blockSlots).isOkOr:
+    peer.updateScore(PeerScoreBadResponse)
+    return err("Incorrect blocks sequence received, reason: " & $error)
+
+  let
+    shouldGetBlobs =
+      if not(man.shouldGetBlobs(sr)):
+        false
+      else:
+        var hasBlobs = false
+        for blck in blocks:
+          withBlck(blck[]):
+            when consensusFork >= ConsensusFork.Deneb:
+              if len(forkyBlck.message.body.blob_kzg_commitments) > 0:
+                hasBlobs = true
+                break
+        hasBlobs
+    blobs =
+      if shouldGetBlobs:
+        let
+          res = (await man.getBlobSidecars(peer, sr)).valueOr:
+            peer.updateScore(PeerScoreNoValues)
+            return err("Failed to receive blobs on request, reason: " &
+                       $error)
+          blobData = res.asSeq()
+
+        debug "Received blobs on request",
+              request = sr,
+              peer_score = sr.item.getScore(),
+              peer_speed = sr.item.netKbps(),
+              index = index,
+              blobs_count = len(blobData),
+              blobs_map = getShortMap(sr, blobData),
+              sync_ident = man.ident,
+              topics = "syncman"
+
+        if len(blobData) > 0:
+          let blobSlots = mapIt(blobData,
+                                it[].signed_block_header.message.slot)
+          checkBlobsResponse(sr, blobSlots).isOkOr:
+            peer.updateScore(PeerScoreBadResponse)
+            return err("Incorrect blobs sequence received, reason: " & $error)
+
+        let groupedBlobs = groupBlobs(blocks.asSeq(), blobData).valueOr:
+          peer.updateScore(PeerScoreNoValues)
+          return err(
+            "Received blobs sequence is inconsistent, reason: " & error)
+
+        groupedBlobs.checkBlobs().isOkOr:
+          peer.updateScore(PeerScoreBadResponse)
+          return err("Received blobs verification failed, reason: " & error)
+        Opt.some(groupedBlobs)
+      else:
+        Opt.none(seq[BlobSidecars])
+
+  ok(SyncBlockData(blocks: blocks.asSeq(), blobs: blobs))
+
+proc getOrUpdatePeerStatus[A, B](
+    man: SyncManager[A, B], index: int, peer: A
+): Future[Result[Slot, string]] {.async: (raises: [CancelledError]).} =
+  let
+    headSlot = man.getLocalHeadSlot()
+    wallSlot = man.getLocalWallSlot()
+    peerSlot = peer.getHeadSlot()
+
+  debug "Peer's syncing status",
+        peer = peer,
+        peer_score = peer.getScore(),
+        peer_speed = peer.netKbps(),
+        index = index,
+        wall_clock_slot = wallSlot,
+        remote_head_slot = peerSlot,
+        local_head_slot = headSlot,
+        sync_ident = man.ident,
+        direction = man.direction,
+        topics = "syncman"
+
+  let
+    peerStatusAge = Moment.now() - peer.getStatusLastTime()
+    needsUpdate =
+      # Latest status we got is old
+      peerStatusAge >= StatusExpirationTime or
+      # The point we need to sync is close to where the peer is
+      man.getFirstSlot() >= peerSlot
+
+  if not(needsUpdate):
+    return ok(peerSlot)
+
+  man.workers[index].status = SyncWorkerStatus.UpdatingStatus
+
+  # Avoid a stampede of requests, but make them more frequent in case the
+  # peer is "close" to the slot range of interest
+  if peerStatusAge < (StatusExpirationTime div 2):
+    await sleepAsync((StatusExpirationTime div 2) - peerStatusAge)
+
+  trace "Updating peer's status information",
+        peer = peer,
+        peer_score = peer.getScore(),
+        peer_speed = peer.netKbps(),
+        index = index,
+        wall_clock_slot = wallSlot,
+        remote_head_slot = peerSlot,
+        local_head_slot = headSlot,
+        sync_ident = man.ident,
+        direction = man.direction,
+        topics = "syncman"
+
+  if not(await peer.updateStatus()):
+    peer.updateScore(PeerScoreNoStatus)
+    return err("Failed to get remote peer status")
+
+  let newPeerSlot = peer.getHeadSlot()
+  if peerSlot >= newPeerSlot:
+    peer.updateScore(PeerScoreStaleStatus)
+    debug "Peer's status information is stale",
+          peer = peer,
+          peer_score = peer.getScore(),
+          peer_speed = peer.netKbps(),
+          index = index,
+          wall_clock_slot = wallSlot,
+          remote_old_head_slot = peerSlot,
+          local_head_slot = headSlot,
+          remote_new_head_slot = newPeerSlot,
+          sync_ident = man.ident,
+          direction = man.direction,
+          topics = "syncman"
+  else:
+    debug "Peer's status information updated",
+          peer = peer,
+          peer_score = peer.getScore(),
+          peer_speed = peer.netKbps(),
+          index = index,
+          wall_clock_slot = wallSlot,
+          remote_old_head_slot = peerSlot,
+          local_head_slot = headSlot,
+          remote_new_head_slot = newPeerSlot,
+          sync_ident = man.ident,
+          direction = man.direction,
+          topics = "syncman"
+    peer.updateScore(PeerScoreGoodStatus)
+
+  ok(newPeerSlot)
+
+proc syncStep[A, B](
+    man: SyncManager[A, B], index: int, peer: A
+) {.async: (raises: [CancelledError]).} =
+  let
+    peerSlot = (await man.getOrUpdatePeerStatus(index, peer)).valueOr:
+      return
+    headSlot = man.getLocalHeadSlot()
+    wallSlot = man.getLocalWallSlot()
+
+  if man.remainingSlots() <= man.maxHeadAge:
+    case man.direction
+    of SyncQueueKind.Forward:
+      info "We are in sync with network",
+           peer = peer,
+           peer_score = peer.getScore(),
+           peer_speed = peer.netKbps(),
+           index = index,
+           wall_clock_slot = wallSlot,
+           remote_head_slot = peerSlot,
+           local_head_slot = headSlot,
+           direction = man.direction,
+           sync_ident = man.ident,
+           topics = "syncman"
+    of SyncQueueKind.Backward:
+      info "Backfill complete",
+           peer = peer,
+           peer_score = peer.getScore(),
+           peer_speed = peer.netKbps(),
+           index = index,
+           wall_clock_slot = wallSlot,
+           remote_head_slot = peerSlot,
+           local_head_slot = headSlot,
+           direction = man.direction,
+           sync_ident = man.ident,
+           topics = "syncman"
 
     # We clear SyncManager's `notInSyncEvent` so all the workers will become
     # sleeping soon.
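
The refactored `getOrUpdatePeerStatus` keeps the old stampede-avoidance rule: a peer's status is refreshed at most every `StatusExpirationTime div 2`, and if the last status is younger than that half-window the worker sleeps out the remainder first. The arithmetic in isolation, with `std/times` standing in for chronos durations:

```nim
import std/times

let statusExpirationTime = initDuration(minutes = 2)

proc statusRefreshDelay(peerStatusAge: Duration): Duration =
  # Refresh at most every half expiration window: if the last status is
  # younger than the half-window, wait out the remainder before asking again.
  let halfWindow = statusExpirationTime div 2
  if peerStatusAge < halfWindow:
    halfWindow - peerStatusAge
  else:
    initDuration()

echo statusRefreshDelay(initDuration(seconds = 20))  # 40 seconds
```
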
@@ -462,161 +605,103 @@ proc syncStep[A, B](
     # Right now we decreasing peer's score a bit, so it will not be
     # disconnected due to low peer's score, but new fresh peers could replace
     # peers with low latest head.
-    debug "Peer's head slot is lower then local head slot", peer = peer,
-          wall_clock_slot = wallSlot, remote_head_slot = peerSlot,
+    debug "Peer's head slot is lower then local head slot",
+          peer = peer,
+          peer_score = peer.getScore(),
+          peer_speed = peer.netKbps(),
+          index = index,
+          wall_clock_slot = wallSlot,
+          remote_head_slot = peerSlot,
           local_last_slot = man.getLastSlot(),
           local_first_slot = man.getFirstSlot(),
-          direction = man.direction
+          direction = man.direction,
+          sync_ident = man.ident,
+          topics = "syncman"
     peer.updateScore(PeerScoreUseless)
     return
 
   # Wall clock keeps ticking, so we need to update the queue
   man.queue.updateLastSlot(man.getLastSlot())
 
-  man.workers[index].status = SyncWorkerStatus.Requesting
-  let req = man.queue.pop(peerSlot, peer)
-  if req.isEmpty():
-    # SyncQueue could return empty request in 2 cases:
-    # 1. There no more slots in SyncQueue to download (we are synced, but
-    #    our ``notInSyncEvent`` is not yet cleared).
-    # 2. Current peer's known head slot is too low to satisfy request.
-    #
-    # To avoid endless loop we going to wait for RESP_TIMEOUT time here.
-    # This time is enough for all pending requests to finish and it is also
-    # enough for main sync loop to clear ``notInSyncEvent``.
-    debug "Empty request received from queue, exiting", peer = peer,
-          local_head_slot = headSlot, remote_head_slot = peerSlot,
-          queue_input_slot = man.queue.inpSlot,
-          queue_output_slot = man.queue.outSlot,
-          queue_last_slot = man.queue.finalSlot, direction = man.direction
-    await sleepAsync(RESP_TIMEOUT_DUR)
-    return
-
-  debug "Creating new request for peer", wall_clock_slot = wallSlot,
-        remote_head_slot = peerSlot, local_head_slot = headSlot,
-        request = req
-
-  man.workers[index].status = SyncWorkerStatus.Downloading
-
-  let blocks = await man.getBlocks(peer, req)
-  if blocks.isErr():
-    peer.updateScore(PeerScoreNoValues)
-    man.queue.push(req)
-    debug "Failed to receive blocks on request",
-          request = req, err = blocks.error
-    return
-  let blockData = blocks.get().asSeq()
-  debug "Received blocks on request", blocks_count = len(blockData),
-        blocks_map = getShortMap(req, blockData), request = req
-
-  let slots = mapIt(blockData, it[].slot)
-  checkResponse(req, slots).isOkOr:
-    peer.updateScore(PeerScoreBadResponse)
-    man.queue.push(req)
-    warn "Incorrect blocks sequence received",
-         blocks_count = len(blockData),
-         blocks_map = getShortMap(req, blockData),
-         request = req,
-         reason = error
-    return
-
-  let shouldGetBlobs =
-    if not man.shouldGetBlobs(req):
-      false
-    else:
-      var hasBlobs = false
-      for blck in blockData:
-        withBlck(blck[]):
-          when consensusFork >= ConsensusFork.Deneb:
-            if forkyBlck.message.body.blob_kzg_commitments.len > 0:
-              hasBlobs = true
-              break
-      hasBlobs
-
-  let blobData =
-    if shouldGetBlobs:
-      let blobs = await man.getBlobSidecars(peer, req)
-      if blobs.isErr():
-        peer.updateScore(PeerScoreNoValues)
-        man.queue.push(req)
-        debug "Failed to receive blobs on request",
-              request = req, err = blobs.error
-        return
-      let blobData = blobs.get().asSeq()
-      debug "Received blobs on request",
-            blobs_count = len(blobData),
-            blobs_map = getShortMap(req, blobData), request = req
-
-      if len(blobData) > 0:
-        let slots = mapIt(blobData, it[].signed_block_header.message.slot)
-        checkBlobsResponse(req, slots).isOkOr:
-          peer.updateScore(PeerScoreBadResponse)
-          man.queue.push(req)
-          warn "Incorrect blobs sequence received",
-               blobs_count = len(blobData),
-               blobs_map = getShortMap(req, blobData),
-               request = req,
-               reason = error
-          return
-      let groupedBlobs = groupBlobs(blockData, blobData).valueOr:
-        peer.updateScore(PeerScoreNoValues)
-        man.queue.push(req)
-        info "Received blobs sequence is inconsistent",
-             blobs_map = getShortMap(req, blobData),
-             request = req, msg = error
-        return
-      groupedBlobs.checkBlobs().isOkOr:
-        peer.updateScore(PeerScoreBadResponse)
-        man.queue.push(req)
-        warn "Received blobs verification failed",
-             blobs_count = len(blobData),
-             blobs_map = getShortMap(req, blobData),
-             request = req,
-             reason = error
-        return
-      Opt.some(groupedBlobs)
-    else:
-      Opt.none(seq[BlobSidecars])
-
-  if len(blockData) == 0 and man.direction == SyncQueueKind.Backward and
-     req.contains(man.getSafeSlot()):
-    # The sync protocol does not distinguish between:
-    # - All requested slots are empty
-    # - Peer does not have data available about requested range
-    #
-    # However, we include the `backfill` slot in backward sync requests.
-    # If we receive an empty response to a request covering that slot,
-    # we know that the response is incomplete and can descore.
-    peer.updateScore(PeerScoreNoValues)
-    man.queue.push(req)
-    debug "Response does not include known-to-exist block", request = req
-    return
-
-  # Scoring will happen in `syncUpdate`.
-  man.workers[index].status = SyncWorkerStatus.Queueing
-  let
-    peerFinalized = peer.getFinalizedEpoch().start_slot()
-    lastSlot = req.slot + req.count
-    # The peer claims the block is finalized - our own block processing will
-    # verify this point down the line
-    # TODO descore peers that lie
-    maybeFinalized = lastSlot < peerFinalized
-
-  await man.queue.push(req, blockData, blobData, maybeFinalized, proc() =
-    man.workers[index].status = SyncWorkerStatus.Processing)
+  proc processCallback() =
+    man.workers[index].status = SyncWorkerStatus.Processing
+
+  var jobs: seq[Future[void].Raising([CancelledError])]
+
+  try:
+    for rindex in 0 ..< man.concurrentRequestsCount:
+      man.workers[index].status = SyncWorkerStatus.Requesting
+      let request = man.queue.pop(peerSlot, peer)
+      if request.isEmpty():
+        # SyncQueue could return empty request in 2 cases:
+        # 1. There no more slots in SyncQueue to download (we are synced, but
+        #    our ``notInSyncEvent`` is not yet cleared).
+        # 2. Current peer's known head slot is too low to satisfy request.
+        #
+        # To avoid endless loop we going to wait for RESP_TIMEOUT time here.
+        # This time is enough for all pending requests to finish and it is
+        # also enough for main sync loop to clear ``notInSyncEvent``.
+        debug "Empty request received from queue",
+              peer = peer,
+              peer_score = peer.getScore(),
+              peer_speed = peer.netKbps(),
+              index = index,
+              local_head_slot = headSlot,
+              remote_head_slot = peerSlot,
+              queue_input_slot = man.queue.inpSlot,
+              queue_output_slot = man.queue.outSlot,
+              queue_last_slot = man.queue.finalSlot,
+              direction = man.direction,
+              sync_ident = man.ident,
+              topics = "syncman"
+        await sleepAsync(RESP_TIMEOUT_DUR)
+        break
+
+      man.workers[index].status = SyncWorkerStatus.Downloading
+      let data = (await man.getSyncBlockData(index, request)).valueOr:
+        debug "Failed to get block data",
+              peer = peer,
+              peer_score = peer.getScore(),
+              peer_speed = peer.netKbps(),
+              index = index,
+              reason = error,
+              direction = man.direction,
+              sync_ident = man.ident,
+              topics = "syncman"
+        man.queue.push(request)
+        break
+
+      # Scoring will happen in `syncUpdate`.
+      man.workers[index].status = SyncWorkerStatus.Queueing
+      let
+        peerFinalized = peer.getFinalizedEpoch().start_slot()
+        lastSlot = request.data.slot + request.data.count - 1
+        # The peer claims the block is finalized - our own block processing
+        # will verify this point down the line
+        # TODO descore peers that lie
+        maybeFinalized = lastSlot < peerFinalized
+
+      jobs.add(man.queue.push(request, data.blocks, data.blobs,
+                              maybeFinalized, processCallback))
+
+    if len(jobs) > 0:
+      await allFutures(jobs)
+
+  except CancelledError as exc:
+    let pending = jobs.filterIt(not(it.finished)).mapIt(cancelAndWait(it))
+    await noCancel allFutures(pending)
+    raise exc
 
 proc syncWorker[A, B](
     man: SyncManager[A, B], index: int
 ) {.async: (raises: [CancelledError]).} =
   mixin getKey, getScore, getHeadSlot
 
-  logScope:
-    index = index
-    sync_ident = man.ident
-    direction = man.direction
-    topics = "syncman"
-
-  debug "Starting syncing worker"
+  debug "Starting syncing worker",
+        index = index,
+        sync_ident = man.ident,
+        direction = man.direction,
+        topics = "syncman"
 
   var peer: A = nil

@@ -634,7 +719,11 @@ proc syncWorker[A, B](
     if not(isNil(peer)):
       man.pool.release(peer)
 
-  debug "Sync worker stopped"
+  debug "Sync worker stopped",
+        index = index,
+        sync_ident = man.ident,
+        direction = man.direction,
+        topics = "syncman"
 
 proc getWorkersStats[A, B](man: SyncManager[A, B]): tuple[map: string,
                                                           sleeping: int,
@@ -719,18 +808,20 @@ proc syncClose[A, B](
 proc syncLoop[A, B](
     man: SyncManager[A, B]
 ) {.async: (raises: [CancelledError]).} =
-  logScope:
-    sync_ident = man.ident
-    direction = man.direction
-    topics = "syncman"
-
   mixin getKey, getScore
-  var pauseTime = 0
 
+  # Update SyncQueue parameters, because callbacks used to calculate
+  # parameters could provide different values at moment when syncLoop()
+  # started.
+  man.initQueue()
+
   man.startWorkers()
 
-  debug "Synchronization loop started"
+  debug "Synchronization loop started",
+        sync_ident = man.ident,
+        direction = man.direction,
+        start_slot = man.queue.startSlot,
+        finish_slot = man.queue.finalSlot,
+        topics = "syncman"
 
   proc averageSpeedTask() {.async: (raises: [CancelledError]).} =
     while true:

@@ -778,9 +869,11 @@ proc syncLoop[A, B](
               pending_workers_count = pending,
               wall_head_slot = wallSlot,
               local_head_slot = headSlot,
-              pause_time = $chronos.seconds(pauseTime),
               avg_sync_speed = man.avgSyncSpeed.formatBiggestFloat(ffDecimal, 4),
-              ins_sync_speed = man.insSyncSpeed.formatBiggestFloat(ffDecimal, 4)
+              ins_sync_speed = man.insSyncSpeed.formatBiggestFloat(ffDecimal, 4),
+              sync_ident = man.ident,
+              direction = man.direction,
+              topics = "syncman"
       of SyncQueueKind.Backward:
         debug "Current syncing state", workers_map = map,
               sleeping_workers_count = sleeping,

@@ -788,9 +881,11 @@ proc syncLoop[A, B](
               pending_workers_count = pending,
               wall_head_slot = wallSlot,
               backfill_slot = man.getSafeSlot(),
-              pause_time = $chronos.seconds(pauseTime),
              avg_sync_speed = man.avgSyncSpeed.formatBiggestFloat(ffDecimal, 4),
-              ins_sync_speed = man.insSyncSpeed.formatBiggestFloat(ffDecimal, 4)
+              ins_sync_speed = man.insSyncSpeed.formatBiggestFloat(ffDecimal, 4),
+              sync_ident = man.ident,
+              direction = man.direction,
+              topics = "syncman"
 
       let
         pivot = man.progressPivot
         progress =

@@ -855,10 +950,17 @@ proc syncLoop[A, B](
         # all sync workers are in `Sleeping` state.
         if pending > 0:
           debug "Synchronization loop waits for workers completion",
-                wall_head_slot = wallSlot, local_head_slot = headSlot,
-                difference = (wallSlot - headSlot), max_head_age = man.maxHeadAge,
+                wall_head_slot = wallSlot,
+                local_head_slot = headSlot,
+                difference = (wallSlot - headSlot),
+                max_head_age = man.maxHeadAge,
                 sleeping_workers_count = sleeping,
-                waiting_workers_count = waiting, pending_workers_count = pending
+                waiting_workers_count = waiting,
+                pending_workers_count = pending,
+                sync_ident = man.ident,
+                direction = man.direction,
+                topics = "syncman"
 
           # We already synced, so we should reset all the pending workers from
           # any state they have.
           man.queue.clearAndWakeup()

@@ -871,21 +973,33 @@ proc syncLoop[A, B](
             await man.syncClose(averageSpeedTaskFut)
             man.inProgress = false
             debug "Forward synchronization process finished, exiting",
-                  wall_head_slot = wallSlot, local_head_slot = headSlot,
+                  wall_head_slot = wallSlot,
+                  local_head_slot = headSlot,
                   difference = (wallSlot - headSlot),
-                  max_head_age = man.maxHeadAge
+                  max_head_age = man.maxHeadAge,
+                  sync_ident = man.ident,
+                  direction = man.direction,
+                  topics = "syncman"
             break
           else:
            man.inProgress = false
             debug "Forward synchronization process finished, sleeping",
-                  wall_head_slot = wallSlot, local_head_slot = headSlot,
+                  wall_head_slot = wallSlot,
+                  local_head_slot = headSlot,
                   difference = (wallSlot - headSlot),
-                  max_head_age = man.maxHeadAge
+                  max_head_age = man.maxHeadAge,
+                  sync_ident = man.ident,
+                  direction = man.direction,
+                  topics = "syncman"
         else:
-          debug "Synchronization loop sleeping", wall_head_slot = wallSlot,
+          debug "Synchronization loop sleeping",
+                wall_head_slot = wallSlot,
                 local_head_slot = headSlot,
                 difference = (wallSlot - headSlot),
-                max_head_age = man.maxHeadAge
+                max_head_age = man.maxHeadAge,
+                sync_ident = man.ident,
+                direction = man.direction,
+                topics = "syncman"
       of SyncQueueKind.Backward:
         # Backward syncing is going to be executed only once, so we exit loop
         # and stop all pending tasks which belongs to this instance (sync

@@ -893,9 +1007,13 @@ proc syncLoop[A, B](
         await man.syncClose(averageSpeedTaskFut)
         man.inProgress = false
         debug "Backward synchronization process finished, exiting",
-              wall_head_slot = wallSlot, local_head_slot = headSlot,
+              wall_head_slot = wallSlot,
+              local_head_slot = headSlot,
               backfill_slot = man.getLastSlot(),
-              max_head_age = man.maxHeadAge
+              max_head_age = man.maxHeadAge,
+              sync_ident = man.ident,
+              direction = man.direction,
+              topics = "syncman"
         break
     else:
       if not(man.notInSyncEvent.isSet()):

@@ -905,10 +1023,14 @@ proc syncLoop[A, B](
           man.notInSyncEvent.fire()
           man.inProgress = true
           debug "Node lost sync for more then preset period",
-                period = man.maxHeadAge, wall_head_slot = wallSlot,
+                period = man.maxHeadAge,
+                wall_head_slot = wallSlot,
                 local_head_slot = headSlot,
                 missing_slots = man.remainingSlots(),
-                progress = float(man.queue.progress())
+                progress = float(man.queue.progress()),
+                sync_ident = man.ident,
+                direction = man.direction,
+                topics = "syncman"
       else:
         man.notInSyncEvent.fire()
         man.inProgress = true
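
The heart of the change is visible in the `@@ -462,161 +605,103 @@` hunk above: instead of issuing one request per step, a worker now pops up to `concurrentRequestsCount` requests, pushes each result to the queue as an async job, awaits the jobs together, and on cancellation cancels whatever is still pending before re-raising. A minimal chronos sketch of that fan-out/cancel pattern (assumes chronos v4; `job` is a stand-in for the `man.queue.push(...)` future):

```nim
import std/sequtils
import chronos

proc job(i: int): Future[void] {.async: (raises: [CancelledError]).} =
  # Stand-in for `man.queue.push(request, ...)`, which completes once the
  # downloaded range has been processed.
  await sleepAsync(milliseconds(10 * i))

proc step(n: int) {.async: (raises: [CancelledError]).} =
  var jobs: seq[Future[void].Raising([CancelledError])]
  try:
    for i in 0 ..< n:
      jobs.add(job(i))
    if len(jobs) > 0:
      # Wait for every concurrent request's processing job together.
      await allFutures(jobs)
  except CancelledError as exc:
    # Cancel whatever is still pending before propagating the cancellation.
    let pending = jobs.filterIt(not(it.finished)).mapIt(cancelAndWait(it))
    await noCancel allFutures(pending)
    raise exc

waitFor step(3)
```
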

@@ -200,7 +200,7 @@ proc updatePerformance(overseer: SyncOverseerRef, startTick: Moment,
   # Update status string
   overseer.statusMsg = Opt.some(
-    "fill: " & timeleft.toTimeLeftString() & " (" &
+    timeleft.toTimeLeftString() & " (" &
     (done * 100).formatBiggestFloat(ffDecimal, 2) & "%) " &
     overseer.avgSpeed.formatBiggestFloat(ffDecimal, 4) &
     "slots/s (" & $dag.head.slot & ")")

@@ -521,8 +521,6 @@ proc mainLoop*(
       quit 1
     overseer.untrustedInProgress = false
 
-    # Reset status bar
-    overseer.statusMsg = Opt.none(string)
-
     # When we finished state rebuilding process - we could start forward
     # SyncManager which could perform finish sync.

File diff suppressed because it is too large

File diff suppressed because it is too large