import chronicles
import options, deques, heapqueue, tables, strutils, sequtils, math, algorithm
import stew/results, chronos, chronicles
import spec/[datatypes, digest, helpers], peer_pool, eth2_network
import ./eth2_processor
import block_pools/block_pools_types
export datatypes, digest, chronos, chronicles, results, block_pools_types

logScope:
  topics = "syncman"

const
  PeerScoreNoStatus* = -100
    ## Peer did not answer `status` request.
  PeerScoreStaleStatus* = -50
    ## Peer's `status` answer does not progress in time.
  PeerScoreUseless* = -10
    ## Peer's latest head is lower than ours.
  PeerScoreGoodStatus* = 50
    ## Peer's `status` answer is fine.
  PeerScoreNoBlocks* = -100
    ## Peer did not respond in time to a `blocksByRange` request.
  PeerScoreGoodBlocks* = 100
    ## Peer's `blocksByRange` answer is fine.
  PeerScoreBadBlocks* = -1000
    ## Peer's response contains incorrect blocks.
  PeerScoreBadResponse* = -1000
    ## Peer's response is not in the requested range.
  PeerScoreMissingBlocks* = -200
    ## Peer's response contains too many empty blocks.

  SyncWorkersCount* = 20
    ## Number of sync workers to spawn

  StatusUpdateInterval* = chronos.minutes(1)
    ## Minimum time between two subsequent calls to update peer's status

  StatusExpirationTime* = chronos.minutes(2)
    ## Time it takes for the peer's status information to expire.

type
  SyncFailureKind* = enum
    StatusInvalid,
    StatusDownload,
    StatusStale,
    EmptyProblem,
    BlockDownload,
    BadResponse

  GetSlotCallback* = proc(): Slot {.gcsafe, raises: [Defect].}

  SyncRequest*[T] = object
    index*: uint64
    slot*: Slot
    count*: uint64
    step*: uint64
    item*: T

  SyncResult*[T] = object
    request*: SyncRequest[T]
    data*: seq[SignedBeaconBlock]

  SyncWaiter*[T] = object
    future: Future[bool]
    request: SyncRequest[T]

  RewindPoint = object
    failSlot: Slot
    epochCount: uint64

  SyncQueue*[T] = ref object
    inpSlot*: Slot
    outSlot*: Slot
    startSlot*: Slot
    lastSlot: Slot
    chunkSize*: uint64
    queueSize*: int
    counter*: uint64
    opcounter*: uint64
    pending*: Table[uint64, SyncRequest[T]]
    waiters: seq[SyncWaiter[T]]
    getFinalizedSlot*: GetSlotCallback
    debtsQueue: HeapQueue[SyncRequest[T]]
    debtsCount: uint64
    readyQueue: HeapQueue[SyncResult[T]]
    rewind: Option[RewindPoint]
    outQueue: AsyncQueue[BlockEntry]

  SyncWorkerStatus* {.pure.} = enum
    Sleeping, WaitingPeer, UpdatingStatus, Requesting, Downloading, Processing

  SyncWorker*[A, B] = object
    future: Future[void]
    status: SyncWorkerStatus

  SyncManager*[A, B] = ref object
    pool: PeerPool[A, B]
    responseTimeout: chronos.Duration
    sleepTime: chronos.Duration
    maxStatusAge: uint64
    maxHeadAge: uint64
    toleranceValue: uint64
    getLocalHeadSlot: GetSlotCallback
    getLocalWallSlot: GetSlotCallback
    getFinalizedSlot: GetSlotCallback
    workers: array[SyncWorkersCount, SyncWorker[A, B]]
    notInSyncEvent: AsyncEvent
    rangeAge: uint64
    inRangeEvent*: AsyncEvent
    notInRangeEvent*: AsyncEvent
    chunkSize: uint64
    queue: SyncQueue[A]
    syncFut: Future[void]
    outQueue: AsyncQueue[BlockEntry]
    inProgress*: bool
    insSyncSpeed*: float
    avgSyncSpeed*: float
    timeLeft*: Duration
    syncCount*: uint64
    syncStatus*: string

  SyncMoment* = object
    stamp*: chronos.Moment
    slot*: Slot

  SyncFailure*[T] = object
    kind*: SyncFailureKind
    peer*: T
    stamp*: chronos.Moment

  SyncManagerError* = object of CatchableError
  BeaconBlocksRes* = NetRes[seq[SignedBeaconBlock]]

proc validate*[T](sq: SyncQueue[T],
                  blk: SignedBeaconBlock): Future[Result[void, BlockError]] {.async.} =
  let sblock = SyncBlock(
    blk: blk,
    resfut: newFuture[Result[void, BlockError]]("sync.manager.validate")
  )
  await sq.outQueue.addLast(BlockEntry(v: sblock))
  return await sblock.resfut
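# Worked example (illustrative, not part of the original module): for a
# request with slot = 10, count = 4, step = 1 and a response containing
# blocks at slots 10 and 12, `getShortMap` below produces the placement map
# "x.x." (block, gap, block, gap), while `checkResponse` accepts the
# response because the returned slots are strictly increasing and stay
# inside the requested range.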
proc getShortMap*[T](req: SyncRequest[T],
                     data: openArray[SignedBeaconBlock]): string =
  ## Returns all slot numbers in ``data`` as a placement map.
  var res = newStringOfCap(req.count)
  var slider = req.slot
  var last = 0
  for i in 0 ..< req.count:
    if last < len(data):
      for k in last ..< len(data):
        if slider == data[k].message.slot:
          res.add('x')
          last = k + 1
          break
        elif slider < data[k].message.slot:
          res.add('.')
          break
    else:
      res.add('.')
    slider = slider + req.step
  result = res

proc contains*[T](req: SyncRequest[T], slot: Slot): bool {.inline.} =
  slot >= req.slot and slot < req.slot + req.count * req.step and
    ((slot - req.slot) mod req.step == 0)

proc cmp*[T](a, b: SyncRequest[T]): int =
  result = cmp(uint64(a.slot), uint64(b.slot))

proc checkResponse*[T](req: SyncRequest[T],
                       data: openArray[SignedBeaconBlock]): bool =
  if len(data) == 0:
    # Impossible to verify an empty response.
    return true

  if uint64(len(data)) > req.count:
    # The number of blocks in the response should be less than or equal to
    # the number of requested blocks.
    return false

  var slot = req.slot
  var rindex = 0'u64
  var dindex = 0

  while (rindex < req.count) and (dindex < len(data)):
    if slot < data[dindex].message.slot:
      discard
    elif slot == data[dindex].message.slot:
      inc(dindex)
    else:
      return false
    slot = slot + req.step
    rindex = rindex + 1'u64

  if dindex == len(data):
    return true
  else:
    return false

proc getFullMap*[T](req: SyncRequest[T],
                    data: openArray[SignedBeaconBlock]): string =
  # Returns all slot numbers in ``data`` as a comma-delimited string.
  result = mapIt(data, $it.message.slot).join(", ")

proc init*[T](t1: typedesc[SyncRequest], t2: typedesc[T], slot: Slot,
              count: uint64): SyncRequest[T] =
  result = SyncRequest[T](slot: slot, count: count, step: 1'u64)

proc init*[T](t1: typedesc[SyncRequest], t2: typedesc[T], start: Slot,
              finish: Slot): SyncRequest[T] =
  let count = finish - start + 1'u64
  result = SyncRequest[T](slot: start, count: count, step: 1'u64)

proc init*[T](t1: typedesc[SyncRequest], t2: typedesc[T], slot: Slot,
              count: uint64, item: T): SyncRequest[T] =
  result = SyncRequest[T](slot: slot, count: count, item: item, step: 1'u64)

proc init*[T](t1: typedesc[SyncRequest], t2: typedesc[T], start: Slot,
              finish: Slot, item: T): SyncRequest[T] =
  let count = finish - start + 1'u64
  result = SyncRequest[T](slot: start, count: count, step: 1'u64, item: item)

proc init*[T](t1: typedesc[SyncFailure], kind: SyncFailureKind,
              peer: T): SyncFailure[T] =
  result = SyncFailure[T](kind: kind, peer: peer, stamp: now(chronos.Moment))

proc empty*[T](t: typedesc[SyncRequest],
               t2: typedesc[T]): SyncRequest[T] {.inline.} =
  result = SyncRequest[T](step: 0'u64, count: 0'u64)

proc setItem*[T](sr: var SyncRequest[T], item: T) =
  sr.item = item

proc isEmpty*[T](sr: SyncRequest[T]): bool {.inline.} =
  result = (sr.step == 0'u64) and (sr.count == 0'u64)
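# Usage sketch for the SyncRequest helpers in this module (an assumption for
# illustration; `SomePeer` is a hypothetical peer type):
#
#   var req = SyncRequest.init(SomePeer, Slot(100), Slot(131))
#   doAssert req.count == 32'u64 and req.step == 1'u64
#   doAssert Slot(115) in req
#   doAssert SyncRequest.empty(SomePeer).isEmpty()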
proc init*[T](t1: typedesc[SyncQueue], t2: typedesc[T],
              start, last: Slot, chunkSize: uint64,
              getFinalizedSlotCb: GetSlotCallback,
              outputQueue: AsyncQueue[BlockEntry],
              queueSize: int = -1): SyncQueue[T] =
  ## Create a new synchronization queue with the given parameters.
  ##
  ## ``start`` and ``last`` are the starting and finishing Slots.
  ##
  ## ``chunkSize`` is the maximum number of slots in one request.
  ##
  ## ``queueSize`` is the maximum queue size for incoming data. If
  ## ``queueSize > 0`` the queue will help keep backpressure under control.
  ## If ``queueSize <= 0`` the queue size is unlimited (default).
  ##
  ## ``updateCb`` is the procedure which will be used to send downloaded
  ## blocks to the consumer. The procedure should return ``false`` only when
  ## it receives incorrect blocks, and ``true`` if the sequence of blocks is
  ## correct.

  # SyncQueue is the core of the sync manager: this data structure
  # distributes requests to peers and manages responses from peers.
  #
  # Because SyncQueue is an async data structure, it manages backpressure and
  # the order of incoming responses, and it also resolves the "joker"
  # problem.
  #
  # Joker's problem
  #
  # According to the current Ethereum 2 network specification
  # > Clients MUST respond with at least one block, if they have it and it
  # > exists in the range. Clients MAY limit the number of blocks in the
  # > response.
  #
  # Such a rule can lead to very uncertain responses. For example, let slots
  # 10 to 12 be non-empty. A client which follows the specification can
  # answer with any response from this list (X - block, `-` empty space):
  #
  # 1. X X X
  # 2. - - X
  # 3. - X -
  # 4. - X X
  # 5. X - -
  # 6. X - X
  # 7. X X -
  #
  # If a peer answers with `1`, everything will be fine and `block_pool` will
  # be able to process all 3 blocks. In case of `2`, `3`, `4`, `6`,
  # `block_pool` will fail immediately on the chunk and report a "parent is
  # missing" error. But in case of `5` and `7`, blocks will be processed by
  # `block_pool` without any problems; however, problems will start right
  # after this uncertain last slot. SyncQueue will keep producing requests
  # for the next blocks, but all the responses from this point will fail with
  # a "parent is missing" error. Let's call such peers "jokers", because they
  # are joking with their responses.
  #
  # To fix the "joker" problem we roll back to the first slot of the latest
  # finalized epoch.
  doAssert(chunkSize > 0'u64, "Chunk size should not be zero")
  result = SyncQueue[T](
    startSlot: start,
    lastSlot: last,
    chunkSize: chunkSize,
    queueSize: queueSize,
    getFinalizedSlot: getFinalizedSlotCb,
    waiters: newSeq[SyncWaiter[T]](),
    counter: 1'u64,
    pending: initTable[uint64, SyncRequest[T]](),
    debtsQueue: initHeapQueue[SyncRequest[T]](),
    inpSlot: start,
    outSlot: start,
    outQueue: outputQueue
  )

proc `<`*[T](a, b: SyncRequest[T]): bool {.inline.} =
  result = (a.slot < b.slot)

proc `<`*[T](a, b: SyncResult[T]): bool {.inline.} =
  result = (a.request.slot < b.request.slot)

proc `==`*[T](a, b: SyncRequest[T]): bool {.inline.} =
  result = ((a.slot == b.slot) and (a.count == b.count) and
            (a.step == b.step))

proc lastSlot*[T](req: SyncRequest[T]): Slot {.inline.} =
  ## Returns the last slot for request ``req``.
  result = req.slot + req.count - 1'u64

proc makePending*[T](sq: SyncQueue[T], req: var SyncRequest[T]) =
  req.index = sq.counter
  sq.counter = sq.counter + 1'u64
  sq.pending[req.index] = req

proc updateLastSlot*[T](sq: SyncQueue[T], last: Slot) {.inline.} =
  ## Update the last slot stored in queue ``sq`` with value ``last``.
  doAssert(sq.lastSlot <= last,
           "Last slot could not be lower than the stored one " &
           $sq.lastSlot & " <= " & $last)
  sq.lastSlot = last

proc wakeupWaiters[T](sq: SyncQueue[T], flag = true) =
  ## Wake up one or all blocked waiters.
  for item in sq.waiters:
    if not(item.future.finished()):
      item.future.complete(flag)

proc waitForChanges[T](sq: SyncQueue[T],
                       req: SyncRequest[T]): Future[bool] {.async.} =
  ## Create a new waiter and wait for completion from `wakeupWaiters()`.
  var waitfut = newFuture[bool]("SyncQueue.waitForChanges")
  let waititem = SyncWaiter[T](future: waitfut, request: req)
  sq.waiters.add(waititem)
  try:
    result = await waitfut
  finally:
    sq.waiters.delete(sq.waiters.find(waititem))

proc wakeupAndWaitWaiters[T](sq: SyncQueue[T]) {.async.} =
  ## This procedure performs wakeupWaiters(false) and blocks until the last
  ## waiter has been awakened.
  var waitChanges = sq.waitForChanges(SyncRequest.empty(T))
  sq.wakeupWaiters(false)
  discard await waitChanges

proc resetWait*[T](sq: SyncQueue[T], toSlot: Option[Slot]) {.async.} =
  ## Perform a reset of all the blocked waiters in the SyncQueue.
  ##
  ## We add one more waiter to the waiters sequence and call
  ## wakeupWaiters(false). Because our waiter is last in the sequence of
  ## waiters, it will be resumed only after all waiters have been awakened
  ## and finished.

  # We are clearing the pending list, so that all requests that are still
  # running around (still downloading, but not yet pushed to the SyncQueue)
  # will be expired. It is important to perform this call first (before
  # await), otherwise you can introduce a race problem.
  sq.pending.clear()

  # We calculate the minimal slot number to which we will be able to reset
  # without missing any blocks. There are 3 sources:
  # 1. Debts queue.
  # 2. Processing queue (`inpSlot`, `outSlot`).
  # 3. Requested slot `toSlot`.
  #
  # Queue's `outSlot` is the lowest slot we added to `block_pool`, but the
  # `toSlot` slot can be less than `outSlot`. `debtsQueue` holds only
  # not-yet-added slot requests, so it can't be bigger than the `outSlot`
  # value.
  var minSlot = sq.outSlot
  if toSlot.isSome():
    minSlot = min(toSlot.get(), sq.outSlot)
  sq.debtsQueue.clear()
  sq.debtsCount = 0
  sq.readyQueue.clear()
  sq.inpSlot = minSlot
  sq.outSlot = minSlot

  # We are going to wake up all the waiters and wait for the last one.
  await sq.wakeupAndWaitWaiters()

proc isEmpty*[T](sr: SyncResult[T]): bool {.inline.} =
  ## Returns ``true`` if the response chain of blocks is empty (has only
  ## empty slots).
  len(sr.data) == 0

proc hasEndGap*[T](sr: SyncResult[T]): bool {.inline.} =
  ## Returns ``true`` if the response chain of blocks has a gap at the end.
  let lastslot = sr.request.slot + sr.request.count - 1'u64
  if len(sr.data) == 0:
    return true
  if sr.data[^1].message.slot != lastslot:
    return true
  return false

proc getLastNonEmptySlot*[T](sr: SyncResult[T]): Slot {.inline.} =
  ## Returns the last non-empty slot from result ``sr``. If the response has
  ## only empty slots, the original request slot will be returned.
  if len(sr.data) == 0:
    # If the response has only empty slots we use the original request slot.
    sr.request.slot
  else:
    sr.data[^1].message.slot

proc toDebtsQueue[T](sq: SyncQueue[T], sr: SyncRequest[T]) =
  sq.debtsQueue.push(sr)
  sq.debtsCount = sq.debtsCount + sr.count
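# Note on the waiter protocol above (summary, not part of the original
# module): a `push` that arrives too far ahead of `outSlot` parks itself in
# `waitForChanges()`; `wakeupWaiters(true)` resumes it to retry once the
# queue has advanced, while `resetWait()` completes every waiter with
# `false`, signalling the sync-workers to drop their stale requests.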
proc getRewindPoint*[T](sq: SyncQueue[T], failSlot: Slot,
                        finalizedSlot: Slot): Slot =
  # Calculate the exponential rewind point in number of epochs.
  let epochCount =
    if sq.rewind.isSome():
      let rewind = sq.rewind.get()
      if failSlot == rewind.failSlot:
        # `MissingParent` happened at the same slot, so we increase the
        # rewind point by a factor of 2.
        let epochs = rewind.epochCount * 2
        sq.rewind = some(RewindPoint(failSlot: failSlot, epochCount: epochs))
        epochs
      else:
        # `MissingParent` happened at a different slot, so we rewind for
        # 1 epoch only.
        sq.rewind = some(RewindPoint(failSlot: failSlot, epochCount: 1'u64))
        1'u64
    else:
      # `MissingParent` happened for the first time.
      sq.rewind = some(RewindPoint(failSlot: failSlot, epochCount: 1'u64))
      1'u64

  # Calculate the latest finalized epoch.
  let finalizedEpoch = compute_epoch_at_slot(finalizedSlot)

  # Calculate the rewind epoch, which should not be less than the latest
  # finalized epoch.
  let rewindEpoch =
    block:
      let failEpoch = compute_epoch_at_slot(failSlot)
      if failEpoch < finalizedEpoch + epochCount:
        finalizedEpoch
      else:
        failEpoch - epochCount

  compute_start_slot_at_epoch(rewindEpoch)
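# Worked example for `getRewindPoint` (illustrative): with a finalized slot
# in epoch 10 and a first `MissingParent` failure in epoch 20, the queue
# rewinds 1 epoch (to the start slot of epoch 19). If the same slot fails
# again, the distance doubles to 2, then 4, 8, ... epochs, but the rewind
# target is never set below the latest finalized epoch.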
proc push*[T](sq: SyncQueue[T], sr: SyncRequest[T],
              data: seq[SignedBeaconBlock]) {.async, gcsafe.} =
  ## Push a successful result to queue ``sq``.
  mixin updateScore

  if sr.index notin sq.pending:
    # If request `sr` is not in our pending list, it only means that
    # SyncQueue.resetWait() happened and all pending requests are expired, so
    # we swallow `old` requests; this way sync-workers are able to get proper
    # new requests from the SyncQueue.
    return

  sq.pending.del(sr.index)

  # This is the backpressure handling algorithm; it blocks all pending `push`
  # requests if `request.slot` is not in the range:
  # [current_queue_slot, current_queue_slot + sq.queueSize * sq.chunkSize].
  var exitNow = false
  while true:
    if (sq.queueSize > 0) and
       (sr.slot >= sq.outSlot + uint64(sq.queueSize) * sq.chunkSize):
      let res = await sq.waitForChanges(sr)
      if res:
        continue
      else:
        # A SyncQueue reset happened. We are exiting to wake up the
        # sync-worker.
        exitNow = true
        break
    let syncres = SyncResult[T](request: sr, data: data)
    sq.readyQueue.push(syncres)
    exitNow = false
    break

  if exitNow:
    return

  while len(sq.readyQueue) > 0:
    let minSlot = sq.readyQueue[0].request.slot
    if sq.outSlot != minSlot:
      break
    let item = sq.readyQueue.pop()

    # Validating received blocks one by one.
    var res: Result[void, BlockError]
    var failSlot: Option[Slot]
    if len(item.data) > 0:
      for blk in item.data:
        trace "Pushing block", block_root = blk.root,
              block_slot = blk.message.slot
        res = await sq.validate(blk)
        if not(res.isOk):
          failSlot = some(blk.message.slot)
          break
    else:
      res = Result[void, BlockError].ok()

    # Increase the progress counter, so the watch task is able to know that
    # we are not stuck.
    inc(sq.opcounter)

    if res.isOk:
      sq.outSlot = sq.outSlot + item.request.count
      if len(item.data) > 0:
        # If there was no error and the response was not empty, we reward the
        # peer with a bonus score.
        item.request.item.updateScore(PeerScoreGoodBlocks)
      sq.wakeupWaiters()
    else:
      debug "Block pool rejected peer's response", peer = item.request.item,
            request_slot = item.request.slot,
            request_count = item.request.count,
            request_step = item.request.step,
            blocks_map = getShortMap(item.request, item.data),
            blocks_count = len(item.data), errCode = res.error,
            topics = "syncman"

      var resetSlot: Option[Slot]

      if res.error == BlockError.MissingParent:
        # If we got `BlockError.MissingParent` it means that the peer
        # returned a chain of blocks with holes, or `block_pool` is in an
        # incomplete state. We are going to rewind to the first slot of the
        # latest finalized epoch.
        let req = item.request
        let finalizedSlot = sq.getFinalizedSlot()
        if finalizedSlot < req.slot:
          let rewindSlot = sq.getRewindPoint(failSlot.get(), finalizedSlot)
          warn "Unexpected missing parent, rewind happens",
               peer = req.item, rewind_to_slot = rewindSlot,
               rewind_epoch_count = sq.rewind.get().epochCount,
               rewind_fail_slot = failSlot.get(),
               finalized_slot = finalizedSlot,
               request_slot = req.slot, request_count = req.count,
               request_step = req.step, blocks_count = len(item.data),
               blocks_map = getShortMap(req, item.data), topics = "syncman"
          resetSlot = some(rewindSlot)
          req.item.updateScore(PeerScoreMissingBlocks)
        else:
          error "Unexpected missing parent at finalized epoch slot",
                peer = req.item, to_slot = finalizedSlot,
                request_slot = req.slot, request_count = req.count,
                request_step = req.step, blocks_count = len(item.data),
                blocks_map = getShortMap(req, item.data), topics = "syncman"
          req.item.updateScore(PeerScoreBadBlocks)
      elif res.error == BlockError.Invalid:
        let req = item.request
        warn "Received invalid sequence of blocks", peer = req.item,
             request_slot = req.slot, request_count = req.count,
             request_step = req.step, blocks_count = len(item.data),
             blocks_map = getShortMap(req, item.data), topics = "syncman"
        req.item.updateScore(PeerScoreBadBlocks)
      else:
        let req = item.request
        warn "Received unexpected response from block_pool", peer = req.item,
             request_slot = req.slot, request_count = req.count,
             request_step = req.step, blocks_count = len(item.data),
             blocks_map = getShortMap(req, item.data), errorCode = res.error,
             topics = "syncman"
        req.item.updateScore(PeerScoreBadBlocks)

      # We need to move the failed response to the debts queue.
      sq.toDebtsQueue(item.request)
      if resetSlot.isSome():
        await sq.resetWait(resetSlot)
        debug "Rewind to slot has happened", reset_slot = resetSlot.get(),
              queue_input_slot = sq.inpSlot, queue_output_slot = sq.outSlot,
              rewind_epoch_count = sq.rewind.get().epochCount,
              rewind_fail_slot = sq.rewind.get().failSlot,
              topics = "syncman"
      break

proc push*[T](sq: SyncQueue[T], sr: SyncRequest[T]) =
  ## Push a failed request back to the queue.
  if sr.index notin sq.pending:
    # If request `sr` is not in our pending list, it only means that
    # SyncQueue.resetWait() happened and all pending requests are expired, so
    # we swallow `old` requests; this way sync-workers are able to get proper
    # new requests from the SyncQueue.
    return
  sq.pending.del(sr.index)
  sq.toDebtsQueue(sr)

proc pop*[T](sq: SyncQueue[T], maxslot: Slot, item: T): SyncRequest[T] =
  if len(sq.debtsQueue) > 0:
    if maxslot < sq.debtsQueue[0].slot:
      return SyncRequest.empty(T)

    var sr = sq.debtsQueue.pop()
    if sr.lastSlot() <= maxslot:
      sq.debtsCount = sq.debtsCount - sr.count
      sr.setItem(item)
      sq.makePending(sr)
      return sr

    var sr1 = SyncRequest.init(T, sr.slot, maxslot, item)
    let sr2 = SyncRequest.init(T, maxslot + 1'u64, sr.lastSlot())
    sq.debtsQueue.push(sr2)
    sq.debtsCount = sq.debtsCount - sr1.count
    sq.makePending(sr1)
    return sr1
  else:
    if maxslot < sq.inpSlot:
      return SyncRequest.empty(T)
    if sq.inpSlot > sq.lastSlot:
      return SyncRequest.empty(T)
    let lastSlot = min(maxslot, sq.lastSlot)
    let count = min(sq.chunkSize, lastSlot + 1'u64 - sq.inpSlot)
    var sr = SyncRequest.init(T, sq.inpSlot, count, item)
    sq.inpSlot = sq.inpSlot + count
    sq.makePending(sr)
    return sr
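# Worked example for `pop` (illustrative): if the debts queue holds a
# request covering slots 100..131 but the peer's head (`maxslot`) is 115,
# the request is split: slots 100..115 are handed out to this peer while a
# new debt covering 116..131 is pushed back for a better-synced peer.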
proc len*[T](sq: SyncQueue[T]): uint64 {.inline.} =
  ## Returns the number of slots left in queue ``sq``.
  if sq.inpSlot > sq.lastSlot:
    result = sq.debtsCount
  else:
    result = sq.lastSlot - sq.inpSlot + 1'u64 - sq.debtsCount

proc total*[T](sq: SyncQueue[T]): uint64 {.inline.} =
  ## Returns the total number of slots in queue ``sq``.
  result = sq.lastSlot - sq.startSlot + 1'u64

proc progress*[T](sq: SyncQueue[T]): uint64 =
  ## Returns queue's ``sq`` progress in percent.
  let curSlot = sq.outSlot - sq.startSlot
  result = (curSlot * 100'u64) div sq.total()

proc now*(sm: typedesc[SyncMoment], slot: Slot): SyncMoment {.inline.} =
  result = SyncMoment(stamp: now(chronos.Moment), slot: slot)

proc speed*(start, finish: SyncMoment): float {.inline.} =
  ## Returns the number of slots per second.
  let slots = finish.slot - start.slot
  let dur = finish.stamp - start.stamp
  let secs = float(chronos.seconds(1).nanoseconds)
  if isZero(dur):
    result = 0.0
  else:
    let v = float(slots) * (secs / float(dur.nanoseconds))
    # We round manually because stdlib.round is deprecated.
    result = round(v * 10000) / 10000

proc newSyncManager*[A, B](pool: PeerPool[A, B],
                           getLocalHeadSlotCb: GetSlotCallback,
                           getLocalWallSlotCb: GetSlotCallback,
                           getFinalizedSlotCb: GetSlotCallback,
                           outputQueue: AsyncQueue[BlockEntry],
                           maxStatusAge = uint64(SLOTS_PER_EPOCH * 4),
                           maxHeadAge = uint64(SLOTS_PER_EPOCH * 1),
                           sleepTime = (int(SLOTS_PER_EPOCH) *
                                        int(SECONDS_PER_SLOT)).seconds,
                           chunkSize = uint64(SLOTS_PER_EPOCH),
                           toleranceValue = uint64(1),
                           rangeAge = uint64(SLOTS_PER_EPOCH * 4)
                           ): SyncManager[A, B] =
  let queue = SyncQueue.init(A, getLocalHeadSlotCb(), getLocalWallSlotCb(),
                             chunkSize, getFinalizedSlotCb, outputQueue, 1)

  result = SyncManager[A, B](
    pool: pool,
    maxStatusAge: maxStatusAge,
    getLocalHeadSlot: getLocalHeadSlotCb,
    getLocalWallSlot: getLocalWallSlotCb,
    getFinalizedSlot: getFinalizedSlotCb,
    maxHeadAge: maxHeadAge,
    sleepTime: sleepTime,
    chunkSize: chunkSize,
    queue: queue,
    outQueue: outputQueue,
    notInSyncEvent: newAsyncEvent(),
    inRangeEvent: newAsyncEvent(),
    notInRangeEvent: newAsyncEvent(),
    rangeAge: rangeAge
  )

proc getBlocks*[A, B](man: SyncManager[A, B], peer: A,
                      req: SyncRequest): Future[BeaconBlocksRes] {.async.} =
  mixin beaconBlocksByRange, getScore, `==`
  doAssert(not(req.isEmpty()), "Request must not be empty!")
  debug "Requesting blocks from peer", peer = peer,
        slot = req.slot, slot_count = req.count, step = req.step,
        peer_score = peer.getScore(), peer_speed = peer.netKbps(),
        topics = "syncman"
  var workFut = awaitne beaconBlocksByRange(peer, req.slot, req.count,
                                            req.step)
  if workFut.failed():
    debug "Error while waiting for getBlocks response", peer = peer,
          slot = req.slot, slot_count = req.count, step = req.step,
          errMsg = workFut.readError().msg, peer_speed = peer.netKbps(),
          topics = "syncman"
  else:
    let res = workFut.read()
    if res.isErr:
      debug "Error while reading getBlocks response",
            peer = peer, slot = req.slot, count = req.count,
            step = req.step, peer_speed = peer.netKbps(),
            topics = "syncman", error = $res.error()
    result = res

template headAge(): uint64 =
  wallSlot - headSlot

template queueAge(): uint64 =
  wallSlot - man.queue.outSlot

template peerStatusAge(): Duration =
  Moment.now() - peer.state(BeaconSync).statusLastTime

func syncQueueLen*[A, B](man: SyncManager[A, B]): uint64 =
  man.queue.len()
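# Illustration of the age templates above (assumption for clarity): with
# wallSlot = 1000 and headSlot = 990, `headAge` is 10; with the default
# `maxHeadAge` of one epoch (SLOTS_PER_EPOCH slots) the node considers
# itself in sync and the worker in `syncStep` below exits early.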
proc syncStep[A, B](man: SyncManager[A, B], index: int, peer: A) {.async.} =
  let wallSlot = man.getLocalWallSlot()
  let headSlot = man.getLocalHeadSlot()
  var peerSlot = peer.getHeadSlot()

  # We update SyncQueue's last slot all the time.
  man.queue.updateLastSlot(wallSlot)

  debug "Peer's syncing status", wall_clock_slot = wallSlot,
        remote_head_slot = peerSlot, local_head_slot = headSlot,
        peer_score = peer.getScore(), peer = peer, index = index,
        peer_speed = peer.netKbps(), topics = "syncman"

  # Check if the peer's head slot is bigger than our wall clock slot.
  if peerSlot > wallSlot + man.toleranceValue:
    warn "Local timer is broken or peer's status information is invalid",
         wall_clock_slot = wallSlot, remote_head_slot = peerSlot,
         local_head_slot = headSlot, peer = peer, index = index,
         tolerance_value = man.toleranceValue, peer_speed = peer.netKbps(),
         peer_score = peer.getScore(), topics = "syncman"
    discard SyncFailure.init(SyncFailureKind.StatusInvalid, peer)
    return

  # Check if we need to update the peer's status information.
  if peerStatusAge >= StatusExpirationTime:
    # Peer's status information is very old; it is time to update it.
    man.workers[index].status = SyncWorkerStatus.UpdatingStatus
    trace "Updating peer's status information", wall_clock_slot = wallSlot,
          remote_head_slot = peerSlot, local_head_slot = headSlot,
          peer = peer, peer_score = peer.getScore(), index = index,
          peer_speed = peer.netKbps(), topics = "syncman"
    try:
      let res = await peer.updateStatus()
      if not(res):
        peer.updateScore(PeerScoreNoStatus)
        debug "Failed to get remote peer's status, exiting", peer = peer,
              peer_score = peer.getScore(), peer_head_slot = peerSlot,
              peer_speed = peer.netKbps(), index = index, topics = "syncman"
        discard SyncFailure.init(SyncFailureKind.StatusDownload, peer)
        return
    except CatchableError as exc:
      debug "Unexpected exception while updating peer's status",
            peer = peer, peer_score = peer.getScore(),
            peer_head_slot = peerSlot, peer_speed = peer.netKbps(),
            index = index, errMsg = exc.msg, topics = "syncman"
      return

    let newPeerSlot = peer.getHeadSlot()
    if peerSlot >= newPeerSlot:
      peer.updateScore(PeerScoreStaleStatus)
      debug "Peer's status information is stale",
            wall_clock_slot = wallSlot, remote_old_head_slot = peerSlot,
            local_head_slot = headSlot, remote_new_head_slot = newPeerSlot,
            peer = peer, peer_score = peer.getScore(), index = index,
            peer_speed = peer.netKbps(), topics = "syncman"
    else:
      debug "Peer's status information updated", wall_clock_slot = wallSlot,
            remote_old_head_slot = peerSlot, local_head_slot = headSlot,
            remote_new_head_slot = newPeerSlot, peer = peer,
            peer_score = peer.getScore(), peer_speed = peer.netKbps(),
            index = index, topics = "syncman"
      peer.updateScore(PeerScoreGoodStatus)
      peerSlot = newPeerSlot

  if headAge <= man.maxHeadAge:
    info "We are in sync with network", wall_clock_slot = wallSlot,
         remote_head_slot = peerSlot, local_head_slot = headSlot,
         peer = peer, peer_score = peer.getScore(), index = index,
         peer_speed = peer.netKbps(), topics = "syncman"
    # We clear SyncManager's `notInSyncEvent` so all the workers will go to
    # sleep soon.
    man.notInSyncEvent.clear()
    return

  if headSlot >= peerSlot - man.maxHeadAge:
    debug "We are in sync with peer; refreshing peer's status information",
          wall_clock_slot = wallSlot, remote_head_slot = peerSlot,
          local_head_slot = headSlot, peer = peer,
          peer_score = peer.getScore(), index = index,
          peer_speed = peer.netKbps(), topics = "syncman"

    man.workers[index].status = SyncWorkerStatus.UpdatingStatus

    if peerStatusAge <= StatusUpdateInterval:
      await sleepAsync(StatusUpdateInterval - peerStatusAge)

    try:
      let res = await peer.updateStatus()
      if not(res):
        peer.updateScore(PeerScoreNoStatus)
        debug "Failed to get remote peer's status, exiting", peer = peer,
              peer_score = peer.getScore(), peer_head_slot = peerSlot,
              peer_speed = peer.netKbps(), index = index, topics = "syncman"
        discard SyncFailure.init(SyncFailureKind.StatusDownload, peer)
        return
    except CatchableError as exc:
      debug "Unexpected exception while updating peer's status",
            peer = peer, peer_score = peer.getScore(),
            peer_head_slot = peerSlot, peer_speed = peer.netKbps(),
            index = index, errMsg = exc.msg, topics = "syncman"
      return

    let newPeerSlot = peer.getHeadSlot()
    if peerSlot >= newPeerSlot:
      peer.updateScore(PeerScoreStaleStatus)
      debug "Peer's status information is stale",
            wall_clock_slot = wallSlot, remote_old_head_slot = peerSlot,
            local_head_slot = headSlot, remote_new_head_slot = newPeerSlot,
            peer = peer, peer_score = peer.getScore(), index = index,
            peer_speed = peer.netKbps(), topics = "syncman"
    else:
      # This is not a very good solution, because we should not discriminate
      # or penalize peers which are also in the sync process but whose latest
      # head is lower than our latest head. We should keep connections with
      # such peers (so these peers are able to get in sync using our data),
      # but we should not use these peers for syncing because they are
      # useless for us. Right now we decrease the peer's score a bit, so it
      # will not be disconnected due to a low score, but new fresh peers
      # could replace peers with a low latest head.
      if headSlot >= newPeerSlot - man.maxHeadAge:
        # Peer's head slot is still lower than ours.
        debug "Peer's head slot is lower than local head slot",
              wall_clock_slot = wallSlot, remote_old_head_slot = peerSlot,
              local_head_slot = headSlot, remote_new_head_slot = newPeerSlot,
              peer = peer, peer_score = peer.getScore(),
              peer_speed = peer.netKbps(), index = index, topics = "syncman"
        peer.updateScore(PeerScoreUseless)
      else:
        debug "Peer's status information updated",
              wall_clock_slot = wallSlot, remote_old_head_slot = peerSlot,
              local_head_slot = headSlot, remote_new_head_slot = newPeerSlot,
              peer = peer, peer_score = peer.getScore(),
              peer_speed = peer.netKbps(), index = index, topics = "syncman"
        peer.updateScore(PeerScoreGoodStatus)
        peerSlot = newPeerSlot

    return

  man.workers[index].status = SyncWorkerStatus.Requesting
  let req = man.queue.pop(peerSlot, peer)
  if req.isEmpty():
    # SyncQueue could return an empty request in 2 cases:
    # 1. There are no more slots in the SyncQueue to download (we are synced,
    #    but our ``notInSyncEvent`` is not yet cleared).
    # 2. The current peer's known head slot is too low to satisfy the
    #    request.
    #
    # To avoid an endless loop we wait for RESP_TIMEOUT here. This time is
    # enough for all pending requests to finish and it is also enough for the
    # main sync loop to clear ``notInSyncEvent``.
debug "Empty request received from queue, exiting", peer = peer, local_head_slot = headSlot, remote_head_slot = peerSlot, queue_input_slot = man.queue.inpSlot, queue_output_slot = man.queue.outSlot, queue_last_slot = man.queue.lastSlot, peer_speed = peer.netKbps(), peer_score = peer.getScore(), index = index, topics = "syncman" await sleepAsync(RESP_TIMEOUT) return debug "Creating new request for peer", wall_clock_slot = wallSlot, remote_head_slot = peerSlot, local_head_slot = headSlot, request_slot = req.slot, request_count = req.count, request_step = req.step, peer = peer, peer_speed = peer.netKbps(), peer_score = peer.getScore(), index = index, topics = "syncman" man.workers[index].status = SyncWorkerStatus.Downloading try: let blocks = await man.getBlocks(peer, req) if blocks.isOk: let data = blocks.get() let smap = getShortMap(req, data) debug "Received blocks on request", blocks_count = len(data), blocks_map = smap, request_slot = req.slot, request_count = req.count, request_step = req.step, peer = peer, peer_score = peer.getScore(), peer_speed = peer.netKbps(), index = index, topics = "syncman" if not(checkResponse(req, data)): peer.updateScore(PeerScoreBadResponse) warn "Received blocks sequence is not in requested range", blocks_count = len(data), blocks_map = smap, request_slot = req.slot, request_count = req.count, request_step = req.step, peer = peer, peer_score = peer.getScore(), peer_speed = peer.netKbps(), index = index, topics = "syncman" discard SyncFailure.init(SyncFailureKind.BadResponse, peer) return # Scoring will happen in `syncUpdate`. man.workers[index].status = SyncWorkerStatus.Processing await man.queue.push(req, data) else: peer.updateScore(PeerScoreNoBlocks) man.queue.push(req) debug "Failed to receive blocks on request", request_slot = req.slot, request_count = req.count, request_step = req.step, peer = peer, index = index, peer_score = peer.getScore(), peer_speed = peer.netKbps(), topics = "syncman" discard SyncFailure.init(SyncFailureKind.BlockDownload, peer) return except CatchableError as exc: debug "Unexpected exception while receiving blocks", request_slot = req.slot, request_count = req.count, request_step = req.step, peer = peer, index = index, peer_score = peer.getScore(), peer_speed = peer.netKbps(), errMsg = exc.msg, topics = "syncman" return proc syncWorker[A, B](man: SyncManager[A, B], index: int) {.async.} = mixin getKey, getScore, getHeadSlot debug "Starting syncing worker", index = index, topics = "syncman" while true: man.workers[index].status = SyncWorkerStatus.Sleeping # This event is going to be set until we are not in sync with network await man.notInSyncEvent.wait() man.workers[index].status = SyncWorkerStatus.WaitingPeer let peer = await man.pool.acquire() await man.syncStep(index, peer) man.pool.release(peer) proc getWorkersStats[A, B](man: SyncManager[A, B]): tuple[map: string, sleeping: int, waiting: int, pending: int] = var map = newString(len(man.workers)) var sleeping, waiting, pending: int for i in 0 ..< len(man.workers): var ch: char case man.workers[i].status of SyncWorkerStatus.Sleeping: ch = 's' inc(sleeping) of SyncWorkerStatus.WaitingPeer: ch = 'w' inc(waiting) of SyncWorkerStatus.UpdatingStatus: ch = 'U' inc(pending) of SyncWorkerStatus.Requesting: ch = 'R' inc(pending) of SyncWorkerStatus.Downloading: ch = 'D' inc(pending) of SyncWorkerStatus.Processing: ch = 'P' inc(pending) map[i] = ch (map, sleeping, waiting, pending) proc guardTask[A, B](man: SyncManager[A, B]) {.async.} = var pending: array[SyncWorkersCount, 
proc guardTask[A, B](man: SyncManager[A, B]) {.async.} =
  var pending: array[SyncWorkersCount, Future[void]]

  # Starting all the synchronization workers.
  for i in 0 ..< len(man.workers):
    let future = syncWorker[A, B](man, i)
    man.workers[i].future = future
    pending[i] = future

  # Wait for a synchronization worker's failure and replace it with a new
  # one.
  while true:
    let failFuture = await one(pending)
    let index = pending.find(failFuture)
    if failFuture.failed():
      warn "Synchronization worker stopped working unexpectedly with an error",
           index = index, errMsg = failFuture.error.msg
    else:
      warn "Synchronization worker stopped working unexpectedly without error",
           index = index

    let future = syncWorker[A, B](man, index)
    man.workers[index].future = future
    pending[index] = future

proc toTimeLeftString(d: Duration): string =
  var v = d
  var res = ""
  let ndays = chronos.days(v)
  if ndays > 0:
    res = res & (if ndays < 10: "0" & $ndays else: $ndays) & "d"
    v = v - chronos.days(ndays)

  let nhours = chronos.hours(v)
  if nhours > 0:
    res = res & (if nhours < 10: "0" & $nhours else: $nhours) & "h"
    v = v - chronos.hours(nhours)
  else:
    res = res & "00h"

  let nmins = chronos.minutes(v)
  if nmins > 0:
    res = res & (if nmins < 10: "0" & $nmins else: $nmins) & "m"
    v = v - chronos.minutes(nmins)
  else:
    res = res & "00m"
  res
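# Format example for `toTimeLeftString` (illustrative): a duration of 1 day,
# 2 hours and 3 minutes renders as "01d02h03m", while 45 minutes renders as
# "00h45m" (days are omitted when zero; hours and minutes are always
# printed).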
proc syncLoop[A, B](man: SyncManager[A, B]) {.async.} =
  mixin getKey, getScore
  var pauseTime = 0

  asyncSpawn man.guardTask()

  debug "Synchronization loop started", topics = "syncman"

  proc watchTask() {.async.} =
    const
      MaxPauseTime = int(SECONDS_PER_SLOT) * int(SLOTS_PER_EPOCH)
      MinPauseTime = int(SECONDS_PER_SLOT)

    pauseTime = MinPauseTime

    while true:
      let op1 = man.queue.opcounter
      await sleepAsync(chronos.seconds(pauseTime))
      let op2 = man.queue.opcounter

      let (_, _, _, pending) = man.getWorkersStats()
      if pending == 0:
        pauseTime = MinPauseTime
      else:
        if (op2 - op1 == 0'u64) and (pending > 1):
          # Syncing is NOT progressing, so we double the `pauseTime` value,
          # but the value cannot be bigger than `MaxPauseTime`.
          if (pauseTime shl 1) > MaxPauseTime:
            pauseTime = MaxPauseTime
          else:
            pauseTime = pauseTime shl 1
          info "Syncing process is not progressing, reset the queue",
               start_op = op1, end_op = op2,
               pending_workers_count = pending,
               reset_to_slot = man.queue.outSlot,
               pause_time = $(chronos.seconds(pauseTime)),
               local_head_slot = man.getLocalHeadSlot(), topics = "syncman"
          await man.queue.resetWait(none[Slot]())
        else:
          # Syncing is progressing, so we halve the `pauseTime` value, but
          # the value cannot be less than `MinPauseTime`.
          if (pauseTime shr 1) < MinPauseTime:
            pauseTime = MinPauseTime
          else:
            pauseTime = pauseTime shr 1

      debug "Synchronization watch loop tick",
            wall_head_slot = man.getLocalWallSlot(),
            local_head_slot = man.getLocalHeadSlot(),
            queue_start_slot = man.queue.startSlot,
            queue_last_slot = man.queue.lastSlot,
            pause_time = $(chronos.seconds(pauseTime)),
            avg_sync_speed = man.avgSyncSpeed,
            ins_sync_speed = man.insSyncSpeed,
            pending_workers_count = pending, topics = "syncman"

  proc averageSpeedTask() {.async.} =
    while true:
      let wallSlot = man.getLocalWallSlot()
      let headSlot = man.getLocalHeadSlot()
      let lsm1 = SyncMoment.now(man.getLocalHeadSlot())
      await sleepAsync(chronos.seconds(int(SECONDS_PER_SLOT)))
      let lsm2 = SyncMoment.now(man.getLocalHeadSlot())
      let bps =
        if lsm2.slot - lsm1.slot == 0'u64:
          0.0
        else:
          speed(lsm1, lsm2)
      inc(man.syncCount)
      man.insSyncSpeed = bps
      man.avgSyncSpeed = man.avgSyncSpeed +
                         (bps - man.avgSyncSpeed) / float(man.syncCount)
      let nsec = (float(wallSlot - headSlot) / man.avgSyncSpeed) *
                 1_000_000_000.0
      man.timeLeft = chronos.nanoseconds(int64(nsec))

  asyncSpawn watchTask()
  asyncSpawn averageSpeedTask()

  while true:
    let wallSlot = man.getLocalWallSlot()
    let headSlot = man.getLocalHeadSlot()

    let (map, sleeping, waiting, pending) = man.getWorkersStats()

    debug "Current syncing state", workers_map = map,
          sleeping_workers_count = sleeping,
          waiting_workers_count = waiting,
          pending_workers_count = pending,
          wall_head_slot = wallSlot, local_head_slot = headSlot,
          pause_time = $chronos.seconds(pauseTime),
          avg_sync_speed = man.avgSyncSpeed,
          ins_sync_speed = man.insSyncSpeed, topics = "syncman"

    # Update the status string.
    man.syncStatus = map & ":" & $pending & ":" &
                     man.insSyncSpeed.formatBiggestFloat(ffDecimal, 4) &
                     ":" &
                     man.avgSyncSpeed.formatBiggestFloat(ffDecimal, 4) &
                     ":" & man.timeLeft.toTimeLeftString() &
                     " (" & $man.queue.outSlot & ")"

    if headAge <= man.maxHeadAge:
      man.notInSyncEvent.clear()
      # We mark SyncManager as not working only when we are in sync and all
      # sync workers are in the `Sleeping` state.
      if pending > 0:
        debug "Synchronization loop waits for workers completion",
              wall_head_slot = wallSlot, local_head_slot = headSlot,
              difference = (wallSlot - headSlot),
              max_head_age = man.maxHeadAge,
              sleeping_workers_count = sleeping,
              waiting_workers_count = waiting,
              pending_workers_count = pending, topics = "syncman"
        man.inProgress = true
      else:
        debug "Synchronization loop sleeping", wall_head_slot = wallSlot,
              local_head_slot = headSlot,
              difference = (wallSlot - headSlot),
              max_head_age = man.maxHeadAge, topics = "syncman"
        man.inProgress = false
    else:
      if not(man.notInSyncEvent.isSet()):
        # We get here only if we have lost sync for more than the
        # `maxHeadAge` period.
        if pending == 0:
          man.queue = SyncQueue.init(A, man.getLocalHeadSlot(),
                                     man.getLocalWallSlot(),
                                     man.chunkSize, man.getFinalizedSlot,
                                     man.outQueue, 1)
          man.notInSyncEvent.fire()
          man.inProgress = true
      else:
        man.notInSyncEvent.fire()
        man.inProgress = true

    if queueAge <= man.rangeAge:
      # We are in the requested range ``man.rangeAge``.
      man.inRangeEvent.fire()
      man.notInRangeEvent.clear()
    else:
      # We are not in the requested range ``man.rangeAge`` anymore.
      man.inRangeEvent.clear()
      man.notInRangeEvent.fire()

    await sleepAsync(chronos.seconds(2))

proc start*[A, B](man: SyncManager[A, B]) =
  ## Starts SyncManager's main loop.
  man.syncFut = man.syncLoop()
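# Minimal wiring sketch (an assumption for illustration only; the `node`
# object, its fields, and the concrete `A`/`B` parameters are hypothetical
# and come from the surrounding application and networking layer):
#
#   let syncMan = newSyncManager[Peer, PeerID](
#     node.network.peerPool,
#     getLocalHeadSlotCb = proc(): Slot = node.chainDag.head.slot,
#     getLocalWallSlotCb = proc(): Slot = node.beaconClock.now().slotOrZero(),
#     getFinalizedSlotCb = proc(): Slot = node.chainDag.finalizedHead.slot,
#     outputQueue = node.blocksQueue)
#   syncMan.start()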