# beacon_chain # Copyright (c) 2018-2024 Status Research & Development GmbH # Licensed and distributed under either of # * MIT license (license terms in the root directory or at https://opensource.org/licenses/MIT). # * Apache v2 license (license terms in the root directory or at https://www.apache.org/licenses/LICENSE-2.0). # at your option. This file may not be copied, modified, or distributed except according to those terms. {.push raises: [].} import std/[strutils, sequtils, algorithm] import stew/[results, base10], chronos, chronicles import ../spec/datatypes/[phase0, altair], ../spec/eth2_apis/rest_types, ../spec/[helpers, forks, network, eip7594_helpers], ../networking/[peer_pool, peer_scores, eth2_network], ../gossip_processing/block_processor, ../beacon_clock, "."/[sync_protocol, sync_queue] export phase0, altair, merge, chronos, chronicles, results, helpers, peer_scores, sync_queue, forks, sync_protocol logScope: topics = "syncman" const SyncWorkersCount* = 20 ## Number of sync workers to spawn StatusUpdateInterval* = chronos.minutes(1) ## Minimum time between two subsequent calls to update peer's status StatusExpirationTime* = chronos.minutes(2) ## Time time it takes for the peer's status information to expire. type SyncWorkerStatus* {.pure.} = enum Sleeping, WaitingPeer, UpdatingStatus, Requesting, Downloading, Queueing, Processing SyncManagerFlag* {.pure.} = enum NoMonitor SyncWorker*[A, B] = object future: Future[void].Raising([CancelledError]) status: SyncWorkerStatus SyncManager*[A, B] = ref object pool: PeerPool[A, B] DENEB_FORK_EPOCH: Epoch MIN_EPOCHS_FOR_BLOB_SIDECARS_REQUESTS: uint64 MIN_EPOCHS_FOR_DATA_COLUMN_SIDECARS_REQUESTS: uint64 responseTimeout: chronos.Duration supernode: bool maxHeadAge: uint64 getLocalHeadSlot: GetSlotCallback getLocalWallSlot: GetSlotCallback getSafeSlot: GetSlotCallback getFirstSlot: GetSlotCallback getLastSlot: GetSlotCallback progressPivot: Slot workers: array[SyncWorkersCount, SyncWorker[A, B]] notInSyncEvent: AsyncEvent rangeAge: uint64 chunkSize: uint64 queue: SyncQueue[A] syncFut: Future[void] blockVerifier: BlockVerifier inProgress*: bool insSyncSpeed*: float avgSyncSpeed*: float syncStatus*: string direction: SyncQueueKind ident*: string flags: set[SyncManagerFlag] SyncMoment* = object stamp*: chronos.Moment slots*: uint64 BeaconBlocksRes = NetRes[List[ref ForkedSignedBeaconBlock, Limit MAX_REQUEST_BLOCKS]] BlobSidecarsRes = NetRes[List[ref BlobSidecar, Limit(MAX_REQUEST_BLOB_SIDECARS)]] DataColumnSidecarsRes = NetRes[List[ref DataColumnSidecar, Limit(MAX_REQUEST_DATA_COLUMNS)]] proc now*(sm: typedesc[SyncMoment], slots: uint64): SyncMoment {.inline.} = SyncMoment(stamp: now(chronos.Moment), slots: slots) proc speed*(start, finish: SyncMoment): float {.inline.} = ## Returns number of slots per second. if finish.slots <= start.slots or finish.stamp <= start.stamp: 0.0 # replays for example else: let slots = float(finish.slots - start.slots) dur = toFloatSeconds(finish.stamp - start.stamp) slots / dur proc initQueue[A, B](man: SyncManager[A, B]) = case man.direction of SyncQueueKind.Forward: man.queue = SyncQueue.init(A, man.direction, man.getFirstSlot(), man.getLastSlot(), man.chunkSize, man.getSafeSlot, man.blockVerifier, 1, man.ident) of SyncQueueKind.Backward: let firstSlot = man.getFirstSlot() lastSlot = man.getLastSlot() startSlot = if firstSlot == lastSlot: # This case should never be happened in real life because # there is present check `needsBackfill(). firstSlot else: firstSlot - 1'u64 man.queue = SyncQueue.init(A, man.direction, startSlot, lastSlot, man.chunkSize, man.getSafeSlot, man.blockVerifier, 1, man.ident) proc newSyncManager*[A, B](pool: PeerPool[A, B], denebEpoch: Epoch, minEpochsForBlobSidecarsRequests: uint64, supernode: bool, direction: SyncQueueKind, getLocalHeadSlotCb: GetSlotCallback, getLocalWallSlotCb: GetSlotCallback, getFinalizedSlotCb: GetSlotCallback, getBackfillSlotCb: GetSlotCallback, getFrontfillSlotCb: GetSlotCallback, progressPivot: Slot, blockVerifier: BlockVerifier, maxHeadAge = uint64(SLOTS_PER_EPOCH * 1), chunkSize = uint64(SLOTS_PER_EPOCH), flags: set[SyncManagerFlag] = {}, ident = "main" ): SyncManager[A, B] = let (getFirstSlot, getLastSlot, getSafeSlot) = case direction of SyncQueueKind.Forward: (getLocalHeadSlotCb, getLocalWallSlotCb, getFinalizedSlotCb) of SyncQueueKind.Backward: (getBackfillSlotCb, getFrontfillSlotCb, getBackfillSlotCb) var res = SyncManager[A, B]( pool: pool, DENEB_FORK_EPOCH: denebEpoch, MIN_EPOCHS_FOR_BLOB_SIDECARS_REQUESTS: minEpochsForBlobSidecarsRequests, supernode: supernode, getLocalHeadSlot: getLocalHeadSlotCb, getLocalWallSlot: getLocalWallSlotCb, getSafeSlot: getSafeSlot, getFirstSlot: getFirstSlot, getLastSlot: getLastSlot, progressPivot: progressPivot, maxHeadAge: maxHeadAge, chunkSize: chunkSize, blockVerifier: blockVerifier, notInSyncEvent: newAsyncEvent(), direction: direction, ident: ident, flags: flags ) res.initQueue() res proc getBlocks[A, B](man: SyncManager[A, B], peer: A, req: SyncRequest): Future[BeaconBlocksRes] {. async: (raises: [CancelledError], raw: true).} = mixin getScore, `==` logScope: peer_score = peer.getScore() peer_speed = peer.netKbps() sync_ident = man.ident direction = man.direction topics = "syncman" doAssert(not(req.isEmpty()), "Request must not be empty!") debug "Requesting blocks from peer", request = req beaconBlocksByRange_v2(peer, req.slot, req.count, 1'u64) proc shouldGetBlobs[A, B](man: SyncManager[A, B], e: Epoch): bool = let wallEpoch = man.getLocalWallSlot().epoch e >= man.DENEB_FORK_EPOCH and (wallEpoch < man.MIN_EPOCHS_FOR_BLOB_SIDECARS_REQUESTS or e >= wallEpoch - man.MIN_EPOCHS_FOR_BLOB_SIDECARS_REQUESTS) proc shouldGetDataColumns[A, B](man: SyncManager[A, B], e: Epoch): bool = let wallEpoch = man.getLocalWallSlot().epoch e >= man.DENEB_FORK_EPOCH and (wallEpoch < man.MIN_EPOCHS_FOR_DATA_COLUMN_SIDECARS_REQUESTS or e >= wallEpoch - man.MIN_EPOCHS_FOR_DATA_COLUMN_SIDECARS_REQUESTS) proc getBlobSidecars[A, B](man: SyncManager[A, B], peer: A, req: SyncRequest): Future[BlobSidecarsRes] {.async: (raises: [CancelledError], raw: true).} = mixin getScore, `==` logScope: peer_score = peer.getScore() peer_speed = peer.netKbps() sync_ident = man.ident direction = man.direction topics = "syncman" doAssert(not(req.isEmpty()), "Request must not be empty!") debug "Requesting blobs sidecars from peer", request = req blobSidecarsByRange(peer, req.slot, req.count) proc remainingSlots(man: SyncManager): uint64 = let first = man.getFirstSlot() last = man.getLastSlot() if man.direction == SyncQueueKind.Forward: if last > first: man.getLastSlot() - man.getFirstSlot() else: 0'u64 else: if first > last: man.getFirstSlot() - man.getLastSlot() else: 0'u64 func groupBlobs*[T](req: SyncRequest[T], blocks: seq[ref ForkedSignedBeaconBlock], blobs: seq[ref BlobSidecar]): Result[seq[BlobSidecars], string] = var grouped = newSeq[BlobSidecars](len(blocks)) blob_cursor = 0 for block_idx, blck in blocks: withBlck(blck[]): when consensusFork >= ConsensusFork.Deneb: template kzgs: untyped = forkyBlck.message.body.blob_kzg_commitments if kzgs.len == 0: continue # Clients MUST include all blob sidecars of each block from which they include blob sidecars. # The following blob sidecars, where they exist, MUST be sent in consecutive (slot, index) order. # https://github.com/ethereum/consensus-specs/blob/v1.4.0-beta.5/specs/deneb/p2p-interface.md#blobsidecarsbyrange-v1 let header = forkyBlck.toSignedBeaconBlockHeader() for blob_idx, kzg_commitment in kzgs: if blob_cursor >= blobs.len: return err("BlobSidecar: response too short") let blob_sidecar = blobs[blob_cursor] if blob_sidecar.index != BlobIndex blob_idx: return err("BlobSidecar: unexpected index") if blob_sidecar.kzg_commitment != kzg_commitment: return err("BlobSidecar: unexpected kzg_commitment") if blob_sidecar.signed_block_header != header: return err("BlobSidecar: unexpected signed_block_header") grouped[block_idx].add(blob_sidecar) inc blob_cursor if blob_cursor != len(blobs): # we reached end of blocks without consuming all blobs so either # the peer we got too few blocks in the paired request, or the # peer is sending us spurious blobs. Result[seq[BlobSidecars], string].err "invalid block or blob sequence" else: Result[seq[BlobSidecars], string].ok grouped func checkBlobs(blobs: seq[BlobSidecars]): Result[void, string] = for blob_sidecars in blobs: for blob_sidecar in blob_sidecars: ? blob_sidecar[].verify_blob_sidecar_inclusion_proof() ok() proc getDataColumnSidecars[A, B](man: SyncManager[A, B], peer: A, req: SyncRequest): Future[DataColumnSidecarsRes] {.async: (raises: [CancelledError], raw: true).} = mixin getScore, `==` logScope: peer_score = peer.getScore() peer_speed = peer.netKbps() sync_ident = man.ident direction = man.direction topics = "syncman" doAssert(not(req.isEmpty()), "Request must not be empty!") debug "Requesting data column sidecars from peer", request = req dataColumnSidecarsByRange(peer, req.slot, req.count, req.columns) func groupDataColumns*[T](req: SyncRequest[T], blocks: seq[ref ForkedSignedBeaconBlock], data_columns: seq[ref DataColumnSidecar]): Result[seq[DataColumnSidecars], string] = var grouped = newSeq[DataColumnSidecars](len(blocks)) column_cursor = 0 for block_idx, blck in blocks: withBlck(blck[]): when consensusFork >= ConsensusFork.Deneb: template kzgs: untyped = forkyBlck.message.body.blob_kzg_commitments if kzgs.len == 0: continue # Clients MUST include all data column sidecars of each block from which they include data column sidecars. # The following data column sidecars, where they exist, MUST be sent in consecutive (slot, index) order. # https://github.com/ethereum/consensus-specs/blob/v1.5.0-alpha.3/specs/_features/eip7594/p2p-interface.md let header = forkyBlck.toSignedBeaconBlockHeader() for column_idx, kzg_commitment in kzgs: if column_cursor >= data_columns.len: return err("DataColumnSidecar: response too short") let data_column_sidecar = data_columns[column_cursor] if kzg_commitment notin data_column_sidecar.kzg_commitments: return err("DataColumnSidecar: unexpected kzg_commitment") if data_column_sidecar.signed_block_header != header: return err("DataColumnSidecar: unexpected signed_block_header") grouped[block_idx].add(data_column_sidecar) inc column_cursor if column_cursor != len(data_columns): # we reached end of blocks without consuming all data columns so either # the peer we got too few blocks in the paired request, or the # peer is sending us spurious data columns. Result[seq[DataColumnSidecars], string].err "invalid block or data column sequence" else: Result[seq[DataColumnSidecars], string].ok grouped func checkDataColumns(data_columns: seq[DataColumnSidecars]): Result[void, string] = for data_column_sidecars in data_columns: for data_column_sidecar in data_column_sidecars: ? data_column_sidecar[].verify_data_column_sidecar_inclusion_proof() ok() proc fetchCustodyColumnCountFromRemotePeer*(peer: Peer): uint64 = # Fetches the custody column count from a remote peer # if the peer advertises their custody column count # via the `csc` ENR field. If the peer does NOT, then # the default value is assume, i.e, CUSTODY_REQUIREMENT let enrOpt = peer.enr if enrOpt.isNone: debug "Could not get ENR from peer", peer_id = peer.peerId return 0 else: let enr = enrOpt.get enrFieldOpt = enr.get(enrCustodySubnetCountField, uint64) if not enrFieldOpt.isOk: debug "Issue with fetching `csc` field from ENR", enr = enr else: return(enrFieldOpt.get) proc checkValidPeerCustody*[A, B](man: SyncManager[A, B], peer: A): bool = # `constructValidaCustodyPeers` returns a list of peers that are an overall # superset of the given local node's custody columns and the custody columns # of the remote peers # Get the local custody subnet count let localCustodySubnetCount = if man.supernode: DATA_COLUMN_SIDECAR_SUBNET_COUNT.uint64 else: CUSTODY_REQUIREMENT # Get the local custody columns let localNodeId = peer.getNodeIdFromPeer() localCustodyColumns = localNodeId.get_custody_columns(localCustodySubnetCount).get var validPeerIds: seq[PeerId] # Get the custody subnet counts of the remote peer let remoteCustodySubnetCount = peer.fetchCustodyColumnCountFromRemotePeer() # Extract remote peer's nodeID from peerID # Fetch custody columns fromm remote peer let remoteNodeId = peer.getNodeIdFromPeer() remoteCustodyColumns = remoteNodeId.get_custody_columns(remoteCustodySubnetCount).get # If the remote peer custodies less columns than our local node # We skip it if remoteCustodyColumns.len < localCustodyColumns.len: return false # If the remote peer custodies all the possible columns if remoteCustodyColumns.len == NUMBER_OF_COLUMNS: return true # Filtering out the invalid peers for column in localCustodyColumns: if column notin remoteCustodyColumns: return true # Otherwise add the peer to the set # of valid peerIds # validPeerIds.add(peer.peerId) # validPeerIds return true proc syncStep[A, B](man: SyncManager[A, B], index: int, peer: A) {.async: (raises: [CancelledError]).} = logScope: peer_score = peer.getScore() peer_speed = peer.netKbps() index = index sync_ident = man.ident topics = "syncman" var headSlot = man.getLocalHeadSlot() wallSlot = man.getLocalWallSlot() peerSlot = peer.getHeadSlot() block: # Check that peer status is recent and relevant logScope: peer = peer direction = man.direction debug "Peer's syncing status", wall_clock_slot = wallSlot, remote_head_slot = peerSlot, local_head_slot = headSlot let peerStatusAge = Moment.now() - peer.getStatusLastTime() needsUpdate = # Latest status we got is old peerStatusAge >= StatusExpirationTime or # The point we need to sync is close to where the peer is man.getFirstSlot() >= peerSlot if needsUpdate: man.workers[index].status = SyncWorkerStatus.UpdatingStatus # Avoid a stampede of requests, but make them more frequent in case the # peer is "close" to the slot range of interest if peerStatusAge < StatusExpirationTime div 2: await sleepAsync(StatusExpirationTime div 2 - peerStatusAge) trace "Updating peer's status information", wall_clock_slot = wallSlot, remote_head_slot = peerSlot, local_head_slot = headSlot if not(await peer.updateStatus()): peer.updateScore(PeerScoreNoStatus) debug "Failed to get remote peer's status, exiting", peer_head_slot = peerSlot return let newPeerSlot = peer.getHeadSlot() if peerSlot >= newPeerSlot: peer.updateScore(PeerScoreStaleStatus) debug "Peer's status information is stale", wall_clock_slot = wallSlot, remote_old_head_slot = peerSlot, local_head_slot = headSlot, remote_new_head_slot = newPeerSlot else: debug "Peer's status information updated", wall_clock_slot = wallSlot, remote_old_head_slot = peerSlot, local_head_slot = headSlot, remote_new_head_slot = newPeerSlot peer.updateScore(PeerScoreGoodStatus) peerSlot = newPeerSlot # Time passed - enough to move slots, if sleep happened headSlot = man.getLocalHeadSlot() wallSlot = man.getLocalWallSlot() if man.remainingSlots() <= man.maxHeadAge: logScope: peer = peer direction = man.direction case man.direction of SyncQueueKind.Forward: info "We are in sync with network", wall_clock_slot = wallSlot, remote_head_slot = peerSlot, local_head_slot = headSlot of SyncQueueKind.Backward: info "Backfill complete", wall_clock_slot = wallSlot, remote_head_slot = peerSlot, local_head_slot = headSlot # We clear SyncManager's `notInSyncEvent` so all the workers will become # sleeping soon. man.notInSyncEvent.clear() return # Find out if the peer potentially can give useful blocks - in the case of # forward sync, they can be useful if they have blocks newer than our head - # in the case of backwards sync, they're useful if they have blocks newer than # the backfill point if man.getFirstSlot() >= peerSlot: # This is not very good solution because we should not discriminate and/or # penalize peers which are in sync process too, but their latest head is # lower then our latest head. We should keep connections with such peers # (so this peers are able to get in sync using our data), but we should # not use this peers for syncing because this peers are useless for us. # Right now we decreasing peer's score a bit, so it will not be # disconnected due to low peer's score, but new fresh peers could replace # peers with low latest head. debug "Peer's head slot is lower then local head slot", peer = peer, wall_clock_slot = wallSlot, remote_head_slot = peerSlot, local_last_slot = man.getLastSlot(), local_first_slot = man.getFirstSlot(), direction = man.direction peer.updateScore(PeerScoreUseless) return # Wall clock keeps ticking, so we need to update the queue man.queue.updateLastSlot(man.getLastSlot()) man.workers[index].status = SyncWorkerStatus.Requesting let req = man.queue.pop(peerSlot, peer) if req.isEmpty(): # SyncQueue could return empty request in 2 cases: # 1. There no more slots in SyncQueue to download (we are synced, but # our ``notInSyncEvent`` is not yet cleared). # 2. Current peer's known head slot is too low to satisfy request. # # To avoid endless loop we going to wait for RESP_TIMEOUT time here. # This time is enough for all pending requests to finish and it is also # enough for main sync loop to clear ``notInSyncEvent``. debug "Empty request received from queue, exiting", peer = peer, local_head_slot = headSlot, remote_head_slot = peerSlot, queue_input_slot = man.queue.inpSlot, queue_output_slot = man.queue.outSlot, queue_last_slot = man.queue.finalSlot, direction = man.direction await sleepAsync(RESP_TIMEOUT_DUR) return debug "Creating new request for peer", wall_clock_slot = wallSlot, remote_head_slot = peerSlot, local_head_slot = headSlot, request = req man.workers[index].status = SyncWorkerStatus.Downloading let blocks = await man.getBlocks(peer, req) if blocks.isErr(): peer.updateScore(PeerScoreNoValues) man.queue.push(req) debug "Failed to receive blocks on request", request = req, err = blocks.error return let blockData = blocks.get().asSeq() let blockSmap = getShortMap(req, blockData) debug "Received blocks on request", blocks_count = len(blockData), blocks_map = blockSmap, request = req let slots = mapIt(blockData, it[].slot) if not(checkResponse(req, slots)): peer.updateScore(PeerScoreBadResponse) man.queue.push(req) warn "Received blocks sequence is not in requested range", blocks_count = len(blockData), blocks_map = blockSmap, request = req return # let shouldGetBlobs = # if not man.shouldGetBlobs(req.slot.epoch): # false # else: # var hasBlobs = false # for blck in blockData: # withBlck(blck[]): # when consensusFork >= ConsensusFork.Deneb: # if forkyBlck.message.body.blob_kzg_commitments.len > 0: # hasBlobs = true # break # hasBlobs func combine(acc: seq[Slot], cur: Slot): seq[Slot] = var copy = acc if copy[copy.len-1] != cur: copy.add(cur) copy # let blobData = # if shouldGetBlobs: # let blobs = await man.getBlobSidecars(peer, req) # if blobs.isErr(): # peer.updateScore(PeerScoreNoValues) # man.queue.push(req) # debug "Failed to receive blobs on request", # request = req, err = blobs.error # return # let blobData = blobs.get().asSeq() # let blobSmap = getShortMap(req, blobData) # debug "Received blobs on request", blobs_count = len(blobData), # blobs_map = blobSmap, request = req # if len(blobData) > 0: # let slots = mapIt(blobData, it[].signed_block_header.message.slot) # let uniqueSlots = foldl(slots, combine(a, b), @[slots[0]]) # if not(checkResponse(req, uniqueSlots)): # peer.updateScore(PeerScoreBadResponse) # man.queue.push(req) # warn "Received blobs sequence is not in requested range", # blobs_count = len(blobData), blobs_map = getShortMap(req, blobData), # request = req # return # let groupedBlobs = groupBlobs(req, blockData, blobData) # if groupedBlobs.isErr(): # peer.updateScore(PeerScoreNoValues) # man.queue.push(req) # info "Received blobs sequence is inconsistent", # blobs_map = getShortMap(req, blobData), request = req, msg=groupedBlobs.error() # return # if (let checkRes = groupedBlobs.get.checkBlobs(); checkRes.isErr): # peer.updateScore(PeerScoreBadResponse) # man.queue.push(req) # warn "Received blobs sequence is invalid", # blobs_count = len(blobData), # blobs_map = getShortMap(req, blobData), # request = req, # msg = checkRes.error # return # Opt.some(groupedBlobs.get()) # else: # Opt.none(seq[BlobSidecars]) let shouldGetDataColumns = if not man.shouldGetDataColumns(req.slot.epoch): false else: var hasColumns = false for blck in blockData: withBlck(blck[]): when consensusFork >= ConsensusFork.Deneb: if forkyBlck.message.body.blob_kzg_commitments.len > 0: hasColumns = true break hasColumns let dataColumnData = if shouldGetDataColumns: if man.checkValidPeerCustody(peer): let data_columns = await man.getDataColumnSidecars(peer, req) if data_columns.isErr(): # peer.updateScore(PeerScoreNoValues) man.queue.push(req) debug "Failed to receive data columns on request", request = req, err = data_columns.error return let dataColumnData = data_columns.get().asSeq() let dataColumnSmap = getShortMap(req, dataColumnData) debug "Received data columns on request", data_columns_count = len(dataColumnData), data_columns_map = dataColumnSmap, request = req if len(dataColumnData) > 0: let slots = mapIt(dataColumnData, it[].signed_block_header.message.slot) let uniqueSlots = foldl(slots, combine(a, b), @[slots[0]]) if not(checkResponse(req, uniqueSlots)): # peer.updateScore(PeerScoreBadResponse) man.queue.push(req) warn "Received data columns sequence is not in requested range", data_columns_count = len(dataColumnData), data_columns_map = getShortMap(req, dataColumnData), request = req return let groupedDataColumns = groupDataColumns(req, blockData, dataColumnData) if groupedDataColumns.isErr(): # peer.updateScore(PeerScoreNoValues) man.queue.push(req) # warn "Received data columns is inconsistent", # data_columns_map = getShortMap(req, dataColumnData), request = req, msg=groupedDataColumns.error() return if (let checkRes = groupedDataColumns.get.checkDataColumns(); checkRes.isErr): # peer.updateScore(PeerScoreBadResponse) man.queue.push(req) warn "Received data columns is invalid", data_columns_count = len(dataColumnData), data_columns_map = getShortMap(req, dataColumnData), request = req, msg = checkRes.error return Opt.some(groupedDataColumns.get()) else: Opt.none(seq[DataColumnSidecars]) else: Opt.none(seq[DataColumnSidecars]) if len(blockData) == 0 and man.direction == SyncQueueKind.Backward and req.contains(man.getSafeSlot()): # The sync protocol does not distinguish between: # - All requested slots are empty # - Peer does not have data available about requested range # # However, we include the `backfill` slot in backward sync requests. # If we receive an empty response to a request covering that slot, # we know that the response is incomplete and can descore. peer.updateScore(PeerScoreNoValues) man.queue.push(req) debug "Response does not include known-to-exist block", request = req return # Scoring will happen in `syncUpdate`. man.workers[index].status = SyncWorkerStatus.Queueing let peerFinalized = peer.getFinalizedEpoch().start_slot() lastSlot = req.slot + req.count # The peer claims the block is finalized - our own block processing will # verify this point down the line # TODO descore peers that lie maybeFinalized = lastSlot < peerFinalized await man.queue.push(req, blockData, Opt.none(seq[BlobSidecars]), dataColumnData, maybeFinalized, proc() = man.workers[index].status = SyncWorkerStatus.Processing) proc syncWorker[A, B](man: SyncManager[A, B], index: int) {.async: (raises: [CancelledError]).} = mixin getKey, getScore, getHeadSlot logScope: index = index sync_ident = man.ident direction = man.direction topics = "syncman" debug "Starting syncing worker" var peer: A = nil try: while true: man.workers[index].status = SyncWorkerStatus.Sleeping # This event is going to be set until we are not in sync with network await man.notInSyncEvent.wait() man.workers[index].status = SyncWorkerStatus.WaitingPeer peer = await man.pool.acquire() await man.syncStep(index, peer) man.pool.release(peer) peer = nil finally: if not(isNil(peer)): man.pool.release(peer) debug "Sync worker stopped" proc getWorkersStats[A, B](man: SyncManager[A, B]): tuple[map: string, sleeping: int, waiting: int, pending: int] = var map = newString(len(man.workers)) var sleeping, waiting, pending: int for i in 0 ..< len(man.workers): var ch: char case man.workers[i].status of SyncWorkerStatus.Sleeping: ch = 's' inc(sleeping) of SyncWorkerStatus.WaitingPeer: ch = 'w' inc(waiting) of SyncWorkerStatus.UpdatingStatus: ch = 'U' inc(pending) of SyncWorkerStatus.Requesting: ch = 'R' inc(pending) of SyncWorkerStatus.Downloading: ch = 'D' inc(pending) of SyncWorkerStatus.Queueing: ch = 'Q' inc(pending) of SyncWorkerStatus.Processing: ch = 'P' inc(pending) map[i] = ch (map, sleeping, waiting, pending) proc startWorkers[A, B](man: SyncManager[A, B]) = # Starting all the synchronization workers. for i in 0 ..< len(man.workers): man.workers[i].future = syncWorker[A, B](man, i) proc toTimeLeftString*(d: Duration): string = if d == InfiniteDuration: "--h--m" else: var v = d var res = "" let ndays = chronos.days(v) if ndays > 0: res = res & (if ndays < 10: "0" & $ndays else: $ndays) & "d" v = v - chronos.days(ndays) let nhours = chronos.hours(v) if nhours > 0: res = res & (if nhours < 10: "0" & $nhours else: $nhours) & "h" v = v - chronos.hours(nhours) else: res = res & "00h" let nmins = chronos.minutes(v) if nmins > 0: res = res & (if nmins < 10: "0" & $nmins else: $nmins) & "m" v = v - chronos.minutes(nmins) else: res = res & "00m" res proc syncClose[A, B](man: SyncManager[A, B], speedTaskFut: Future[void]) {.async.} = var pending: seq[FutureBase] if not(speedTaskFut.finished()): pending.add(speedTaskFut.cancelAndWait()) for worker in man.workers: doAssert(worker.status in {Sleeping, WaitingPeer}) pending.add(worker.future.cancelAndWait()) await noCancel allFutures(pending) proc syncLoop[A, B](man: SyncManager[A, B]) {.async.} = logScope: sync_ident = man.ident direction = man.direction topics = "syncman" mixin getKey, getScore var pauseTime = 0 man.startWorkers() debug "Synchronization loop started" proc averageSpeedTask() {.async: (raises: [CancelledError]).} = while true: # Reset sync speeds between each loss-of-sync event man.avgSyncSpeed = 0 man.insSyncSpeed = 0 await man.notInSyncEvent.wait() # Give the node time to connect to peers and get the sync process started await sleepAsync(seconds(SECONDS_PER_SLOT.int64)) var stamp = SyncMoment.now(man.queue.progress()) syncCount = 0 while man.inProgress: await sleepAsync(seconds(SECONDS_PER_SLOT.int64)) let newStamp = SyncMoment.now(man.queue.progress()) slotsPerSec = speed(stamp, newStamp) syncCount += 1 man.insSyncSpeed = slotsPerSec man.avgSyncSpeed = man.avgSyncSpeed + (slotsPerSec - man.avgSyncSpeed) / float(syncCount) stamp = newStamp let averageSpeedTaskFut = averageSpeedTask() while true: let wallSlot = man.getLocalWallSlot() let headSlot = man.getLocalHeadSlot() let (map, sleeping, waiting, pending) = man.getWorkersStats() debug "Current syncing state", workers_map = map, sleeping_workers_count = sleeping, waiting_workers_count = waiting, pending_workers_count = pending, wall_head_slot = wallSlot, local_head_slot = headSlot, pause_time = $chronos.seconds(pauseTime), avg_sync_speed = man.avgSyncSpeed, ins_sync_speed = man.insSyncSpeed let pivot = man.progressPivot progress = case man.queue.kind of SyncQueueKind.Forward: if man.queue.outSlot >= pivot: man.queue.outSlot - pivot else: 0'u64 of SyncQueueKind.Backward: if pivot >= man.queue.outSlot: pivot - man.queue.outSlot else: 0'u64 total = case man.queue.kind of SyncQueueKind.Forward: if man.queue.finalSlot >= pivot: man.queue.finalSlot + 1'u64 - pivot else: 0'u64 of SyncQueueKind.Backward: if pivot >= man.queue.finalSlot: pivot + 1'u64 - man.queue.finalSlot else: 0'u64 remaining = total - progress done = if total > 0: progress.float / total.float else: 1.0 timeleft = if man.avgSyncSpeed >= 0.001: Duration.fromFloatSeconds(remaining.float / man.avgSyncSpeed) else: InfiniteDuration currentSlot = Base10.toString( if man.queue.kind == SyncQueueKind.Forward: max(uint64(man.queue.outSlot), 1'u64) - 1'u64 else: uint64(man.queue.outSlot) + 1'u64 ) # Update status string man.syncStatus = timeleft.toTimeLeftString() & " (" & (done * 100).formatBiggestFloat(ffDecimal, 2) & "%) " & man.avgSyncSpeed.formatBiggestFloat(ffDecimal, 4) & "slots/s (" & map & ":" & currentSlot & ")" if man.remainingSlots() <= man.maxHeadAge: man.notInSyncEvent.clear() # We are marking SyncManager as not working only when we are in sync and # all sync workers are in `Sleeping` state. if pending > 0: debug "Synchronization loop waits for workers completion", wall_head_slot = wallSlot, local_head_slot = headSlot, difference = (wallSlot - headSlot), max_head_age = man.maxHeadAge, sleeping_workers_count = sleeping, waiting_workers_count = waiting, pending_workers_count = pending # We already synced, so we should reset all the pending workers from # any state they have. man.queue.clearAndWakeup() man.inProgress = true else: case man.direction of SyncQueueKind.Forward: if man.inProgress: if SyncManagerFlag.NoMonitor in man.flags: await man.syncClose(averageSpeedTaskFut) man.inProgress = false debug "Forward synchronization process finished, exiting", wall_head_slot = wallSlot, local_head_slot = headSlot, difference = (wallSlot - headSlot), max_head_age = man.maxHeadAge break else: man.inProgress = false debug "Forward synchronization process finished, sleeping", wall_head_slot = wallSlot, local_head_slot = headSlot, difference = (wallSlot - headSlot), max_head_age = man.maxHeadAge else: debug "Synchronization loop sleeping", wall_head_slot = wallSlot, local_head_slot = headSlot, difference = (wallSlot - headSlot), max_head_age = man.maxHeadAge of SyncQueueKind.Backward: # Backward syncing is going to be executed only once, so we exit loop # and stop all pending tasks which belongs to this instance (sync # workers, speed calculation task). await man.syncClose(averageSpeedTaskFut) man.inProgress = false debug "Backward synchronization process finished, exiting", wall_head_slot = wallSlot, local_head_slot = headSlot, backfill_slot = man.getLastSlot(), max_head_age = man.maxHeadAge break else: if not(man.notInSyncEvent.isSet()): # We get here only if we lost sync for more then `maxHeadAge` period. if pending == 0: man.initQueue() man.notInSyncEvent.fire() man.inProgress = true debug "Node lost sync for more then preset period", period = man.maxHeadAge, wall_head_slot = wallSlot, local_head_slot = headSlot, missing_slots = man.remainingSlots(), progress = float(man.queue.progress()) else: man.notInSyncEvent.fire() man.inProgress = true await sleepAsync(chronos.seconds(2)) proc start*[A, B](man: SyncManager[A, B]) = ## Starts SyncManager's main loop. man.syncFut = man.syncLoop()