nimbus-eth2/beacon_chain/sync/sync_queue.nim

1099 lines
38 KiB
Nim
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# beacon_chain
# Copyright (c) 2018-2024 Status Research & Development GmbH
# Licensed and distributed under either of
# * MIT license (license terms in the root directory or at https://opensource.org/licenses/MIT).
# * Apache v2 license (license terms in the root directory or at https://www.apache.org/licenses/LICENSE-2.0).
# at your option. This file may not be copied, modified, or distributed except according to those terms.
{.push raises: [].}
import std/[heapqueue, tables, strutils, sequtils, math]
import stew/[results, base10], chronos, chronicles
import
../spec/datatypes/[base, phase0, altair],
../spec/[helpers, forks],
../networking/[peer_pool, eth2_network],
../gossip_processing/block_processor,
../consensus_object_pools/block_pools_types
export base, phase0, altair, merge, chronos, chronicles, results,
block_pools_types, helpers
logScope:
topics = "syncqueue"
type
GetSlotCallback* = proc(): Slot {.gcsafe, raises: [].}
ProcessingCallback* = proc() {.gcsafe, raises: [].}
BlockVerifier* = proc(signedBlock: ForkedSignedBeaconBlock,
blobs: Opt[BlobSidecars], data_columns: Opt[DataColumnSidecars],
maybeFinalized: bool):
Future[Result[void, VerifierError]] {.async: (raises: [CancelledError]).}
SyncQueueKind* {.pure.} = enum
Forward, Backward
SyncRequest*[T] = object
kind*: SyncQueueKind
index*: uint64
slot*: Slot
count*: uint64
item*: T
SyncResult*[T] = object
request*: SyncRequest[T]
data*: seq[ref ForkedSignedBeaconBlock]
blobs*: Opt[seq[BlobSidecars]]
data_columns*: Opt[seq[DataColumnSidecars]]
GapItem*[T] = object
start*: Slot
finish*: Slot
item*: T
SyncWaiter* = ref object
future: Future[void].Raising([CancelledError])
reset: bool
RewindPoint = object
failSlot: Slot
epochCount: uint64
SyncQueue*[T] = ref object
kind*: SyncQueueKind
inpSlot*: Slot
outSlot*: Slot
startSlot*: Slot
finalSlot*: Slot
chunkSize*: uint64
queueSize*: int
counter*: uint64
pending*: Table[uint64, SyncRequest[T]]
gapList*: seq[GapItem[T]]
waiters: seq[SyncWaiter]
getSafeSlot*: GetSlotCallback
debtsQueue: HeapQueue[SyncRequest[T]]
debtsCount: uint64
readyQueue: HeapQueue[SyncResult[T]]
rewind: Option[RewindPoint]
blockVerifier: BlockVerifier
ident*: string
chronicles.formatIt SyncQueueKind: toLowerAscii($it)
template shortLog*[T](req: SyncRequest[T]): string =
Base10.toString(uint64(req.slot)) & ":" &
Base10.toString(req.count) & "@" &
Base10.toString(req.index)
chronicles.expandIt SyncRequest:
`it` = shortLog(it)
peer = shortLog(it.item)
direction = toLowerAscii($it.kind)
proc getShortMap*[T](req: SyncRequest[T],
data: openArray[ref ForkedSignedBeaconBlock]): string =
## Returns all slot numbers in ``data`` as placement map.
var res = newStringOfCap(req.count)
var slider = req.slot
var last = 0
for i in 0 ..< req.count:
if last < len(data):
for k in last ..< len(data):
if slider == data[k][].slot:
res.add('x')
last = k + 1
break
elif slider < data[k][].slot:
res.add('.')
break
else:
res.add('.')
slider = slider + 1
res
proc getShortMap*[T](req: SyncRequest[T],
data: openArray[ref BlobSidecar]): string =
## Returns all slot numbers in ``data`` as placement map.
var res = newStringOfCap(req.count * MAX_BLOBS_PER_BLOCK)
var cur : uint64 = 0
for slot in req.slot..<req.slot+req.count:
if cur >= lenu64(data):
res.add('|')
continue
if slot == data[cur].signed_block_header.message.slot:
for k in cur..<cur+MAX_BLOBS_PER_BLOCK:
if k >= lenu64(data) or slot != data[k].signed_block_header.message.slot:
res.add('|')
break
else:
inc(cur)
res.add('x')
else:
res.add('|')
res
proc getShortMap*[T](req: SyncRequest[T],
data: openArray[ref DataColumnSidecar]): string =
# Returns all slot numbers in ``data`` as a placement map
var res = newStringOfCap(req.count * MAX_BLOBS_PER_BLOCK)
var cur: uint64 = 0
for slot in req.slot..<req.slot+req.count:
if cur >= lenu64(data):
res.add('|')
continue
if slot == data[cur].signed_block_header.message.slot:
for k in cur..<cur+MAX_BLOBS_PER_BLOCK:
if k >= lenu64(data) or slot != data[k].signed_block_header.message.slot:
res.add('|')
break
else:
inc(cur)
res.add('|')
else:
res.add('|')
res
proc contains*[T](req: SyncRequest[T], slot: Slot): bool {.inline.} =
slot >= req.slot and slot < req.slot + req.count
proc cmp*[T](a, b: SyncRequest[T]): int =
cmp(uint64(a.slot), uint64(b.slot))
proc checkResponse*[T](req: SyncRequest[T],
data: openArray[Slot]): bool =
if len(data) == 0:
# Impossible to verify empty response.
return true
if uint64(len(data)) > req.count:
# Number of blocks in response should be less or equal to number of
# requested blocks.
return false
var slot = req.slot
var rindex = 0'u64
var dindex = 0
while (rindex < req.count) and (dindex < len(data)):
if slot < data[dindex]:
discard
elif slot == data[dindex]:
inc(dindex)
else:
return false
slot += 1'u64
rindex += 1'u64
if dindex == len(data):
return true
else:
return false
proc init[T](t1: typedesc[SyncRequest], kind: SyncQueueKind, start: Slot,
finish: Slot, t2: typedesc[T]): SyncRequest[T] =
let count = finish - start + 1'u64
SyncRequest[T](kind: kind, slot: start, count: count)
proc init[T](t1: typedesc[SyncRequest], kind: SyncQueueKind, slot: Slot,
count: uint64, item: T): SyncRequest[T] =
SyncRequest[T](kind: kind, slot: slot, count: count, item: item)
proc init[T](t1: typedesc[SyncRequest], kind: SyncQueueKind, start: Slot,
finish: Slot, item: T): SyncRequest[T] =
let count = finish - start + 1'u64
SyncRequest[T](kind: kind, slot: start, count: count, item: item)
proc empty*[T](t: typedesc[SyncRequest], kind: SyncQueueKind,
t2: typedesc[T]): SyncRequest[T] {.inline.} =
SyncRequest[T](kind: kind, count: 0'u64)
proc setItem*[T](sr: var SyncRequest[T], item: T) =
sr.item = item
proc isEmpty*[T](sr: SyncRequest[T]): bool {.inline.} =
(sr.count == 0'u64)
proc init*[T](t1: typedesc[SyncQueue], t2: typedesc[T],
queueKind: SyncQueueKind,
start, final: Slot, chunkSize: uint64,
getSafeSlotCb: GetSlotCallback,
blockVerifier: BlockVerifier,
syncQueueSize: int = -1,
ident: string = "main"): SyncQueue[T] =
## Create new synchronization queue with parameters
##
## ``start`` and ``final`` are starting and final Slots.
##
## ``chunkSize`` maximum number of slots in one request.
##
## ``syncQueueSize`` maximum queue size for incoming data.
## If ``syncQueueSize > 0`` queue will help to keep backpressure under
## control. If ``syncQueueSize <= 0`` then queue size is unlimited (default).
# SyncQueue is the core of sync manager, this data structure distributes
# requests to peers and manages responses from peers.
#
# Because SyncQueue is async data structure it manages backpressure and
# order of incoming responses and it also resolves "joker's" problem.
#
# Joker's problem
#
# According to pre-v0.12.0 Ethereum consensus network specification
# > Clients MUST respond with at least one block, if they have it and it
# > exists in the range. Clients MAY limit the number of blocks in the
# > response.
# https://github.com/ethereum/consensus-specs/blob/v0.11.3/specs/phase0/p2p-interface.md#L590
#
# Such rule can lead to very uncertain responses, for example let slots from
# 10 to 12 will be not empty. Client which follows specification can answer
# with any response from this list (X - block, `-` empty space):
#
# 1. X X X
# 2. - - X
# 3. - X -
# 4. - X X
# 5. X - -
# 6. X - X
# 7. X X -
#
# If peer answers with `1` everything will be fine and `block_processor`
# will be able to process all 3 blocks.
# In case of `2`, `3`, `4`, `6` - `block_processor` will fail immediately
# with chunk and report "parent is missing" error.
# But in case of `5` and `7` blocks will be processed by `block_processor`
# without any problems, however it will start producing problems right from
# this uncertain last slot. SyncQueue will start producing requests for next
# blocks, but all the responses from this point will fail with "parent is
# missing" error. Lets call such peers "jokers", because they are joking
# with responses.
#
# To fix "joker" problem we going to perform rollback to the latest finalized
# epoch's first slot.
#
# Note that as of spec v0.12.0, well-behaving clients are forbidden from
# answering this way. However, it still makes sense to attempt to handle
# this case to increase compatibility (e.g., with weak subjectivity nodes
# that are still backfilling blocks)
doAssert(chunkSize > 0'u64, "Chunk size should not be zero")
SyncQueue[T](
kind: queueKind,
startSlot: start,
finalSlot: final,
chunkSize: chunkSize,
queueSize: syncQueueSize,
getSafeSlot: getSafeSlotCb,
waiters: newSeq[SyncWaiter](),
counter: 1'u64,
pending: initTable[uint64, SyncRequest[T]](),
debtsQueue: initHeapQueue[SyncRequest[T]](),
inpSlot: start,
outSlot: start,
blockVerifier: blockVerifier,
ident: ident
)
proc `<`*[T](a, b: SyncRequest[T]): bool =
doAssert(a.kind == b.kind)
case a.kind
of SyncQueueKind.Forward:
a.slot < b.slot
of SyncQueueKind.Backward:
a.slot > b.slot
proc `<`*[T](a, b: SyncResult[T]): bool =
doAssert(a.request.kind == b.request.kind)
case a.request.kind
of SyncQueueKind.Forward:
a.request.slot < b.request.slot
of SyncQueueKind.Backward:
a.request.slot > b.request.slot
proc `==`*[T](a, b: SyncRequest[T]): bool =
(a.kind == b.kind) and (a.slot == b.slot) and (a.count == b.count)
proc lastSlot*[T](req: SyncRequest[T]): Slot =
## Returns last slot for request ``req``.
req.slot + req.count - 1'u64
proc makePending*[T](sq: SyncQueue[T], req: var SyncRequest[T]) =
req.index = sq.counter
sq.counter = sq.counter + 1'u64
sq.pending[req.index] = req
proc updateLastSlot*[T](sq: SyncQueue[T], last: Slot) {.inline.} =
## Update last slot stored in queue ``sq`` with value ``last``.
sq.finalSlot = last
proc wakeupWaiters[T](sq: SyncQueue[T], reset = false) =
## Wakeup one or all blocked waiters.
for item in sq.waiters:
if reset:
item.reset = true
if not(item.future.finished()):
item.future.complete()
proc waitForChanges[T](sq: SyncQueue[T]): Future[bool] {.async: (raises: [CancelledError]).} =
## Create new waiter and wait for completion from `wakeupWaiters()`.
let waitfut = Future[void].Raising([CancelledError]).init("SyncQueue.waitForChanges")
let waititem = SyncWaiter(future: waitfut)
sq.waiters.add(waititem)
try:
await waitfut
return waititem.reset
finally:
sq.waiters.delete(sq.waiters.find(waititem))
proc wakeupAndWaitWaiters[T](sq: SyncQueue[T]) {.async: (raises: [CancelledError]).} =
## This procedure will perform wakeupWaiters(true) and blocks until last
## waiter will be awakened.
var waitChanges = sq.waitForChanges()
sq.wakeupWaiters(true)
discard await waitChanges
proc clearAndWakeup*[T](sq: SyncQueue[T]) =
sq.pending.clear()
sq.wakeupWaiters(true)
proc resetWait*[T](sq: SyncQueue[T], toSlot: Option[Slot]) {.async: (raises: [CancelledError]).} =
## Perform reset of all the blocked waiters in SyncQueue.
##
## We adding one more waiter to the waiters sequence and
## call wakeupWaiters(true). Because our waiter is last in sequence of
## waiters it will be resumed only after all waiters will be awakened and
## finished.
# We are clearing pending list, so that all requests that are still running
# around (still downloading, but not yet pushed to the SyncQueue) will be
# expired. Its important to perform this call first (before await), otherwise
# you can introduce race problem.
sq.pending.clear()
# We calculating minimal slot number to which we will be able to reset,
# without missing any blocks. There 3 sources:
# 1. Debts queue.
# 2. Processing queue (`inpSlot`, `outSlot`).
# 3. Requested slot `toSlot`.
#
# Queue's `outSlot` is the lowest slot we added to `block_pool`, but
# `toSlot` slot can be less then `outSlot`. `debtsQueue` holds only not
# added slot requests, so it can't be bigger then `outSlot` value.
let minSlot =
case sq.kind
of SyncQueueKind.Forward:
if toSlot.isSome():
min(toSlot.get(), sq.outSlot)
else:
sq.outSlot
of SyncQueueKind.Backward:
if toSlot.isSome():
toSlot.get()
else:
sq.outSlot
sq.debtsQueue.clear()
sq.debtsCount = 0
sq.readyQueue.clear()
sq.inpSlot = minSlot
sq.outSlot = minSlot
# We are going to wakeup all the waiters and wait for last one.
await sq.wakeupAndWaitWaiters()
proc isEmpty*[T](sr: SyncResult[T]): bool {.inline.} =
## Returns ``true`` if response chain of blocks is empty (has only empty
## slots).
len(sr.data) == 0
proc hasEndGap*[T](sr: SyncResult[T]): bool {.inline.} =
## Returns ``true`` if response chain of blocks has gap at the end.
let lastslot = sr.request.slot + sr.request.count - 1'u64
if len(sr.data) == 0:
return true
if sr.data[^1][].slot != lastslot:
return true
return false
proc getLastNonEmptySlot*[T](sr: SyncResult[T]): Slot {.inline.} =
## Returns last non-empty slot from result ``sr``. If response has only
## empty slots, original request slot will be returned.
if len(sr.data) == 0:
# If response has only empty slots we going to use original request slot
sr.request.slot
else:
sr.data[^1][].slot
proc processGap[T](sq: SyncQueue[T], sr: SyncResult[T]) =
if sr.isEmpty():
let gitem = GapItem[T](start: sr.request.slot,
finish: sr.request.slot + sr.request.count - 1'u64,
item: sr.request.item)
sq.gapList.add(gitem)
else:
if sr.hasEndGap():
let gitem = GapItem[T](start: sr.getLastNonEmptySlot() + 1'u64,
finish: sr.request.slot + sr.request.count - 1'u64,
item: sr.request.item)
sq.gapList.add(gitem)
else:
sq.gapList.reset()
proc rewardForGaps[T](sq: SyncQueue[T], score: int) =
mixin updateScore, getStats
logScope:
sync_ident = sq.ident
direction = sq.kind
topics = "syncman"
for gap in sq.gapList:
if score < 0:
# Every empty response increases penalty by 25%, but not more than 200%.
let
emptyCount = gap.item.getStats(SyncResponseKind.Empty)
goodCount = gap.item.getStats(SyncResponseKind.Good)
if emptyCount <= goodCount:
gap.item.updateScore(score)
else:
let
weight = int(min(emptyCount - goodCount, 8'u64))
newScore = score + score * weight div 4
gap.item.updateScore(newScore)
debug "Peer received gap penalty", peer = gap.item,
penalty = newScore
else:
gap.item.updateScore(score)
proc toDebtsQueue[T](sq: SyncQueue[T], sr: SyncRequest[T]) =
sq.debtsQueue.push(sr)
sq.debtsCount = sq.debtsCount + sr.count
proc getRewindPoint*[T](sq: SyncQueue[T], failSlot: Slot,
safeSlot: Slot): Slot =
logScope:
sync_ident = sq.ident
direction = sq.kind
topics = "syncman"
case sq.kind
of SyncQueueKind.Forward:
# Calculate the latest finalized epoch.
let finalizedEpoch = epoch(safeSlot)
# Calculate failure epoch.
let failEpoch = epoch(failSlot)
# Calculate exponential rewind point in number of epochs.
let epochCount =
if sq.rewind.isSome():
let rewind = sq.rewind.get()
if failSlot == rewind.failSlot:
# `MissingParent` happened at same slot so we increase rewind point by
# factor of 2.
if failEpoch > finalizedEpoch:
let rewindPoint = rewind.epochCount shl 1
if rewindPoint < rewind.epochCount:
# If exponential rewind point produces `uint64` overflow we will
# make rewind to latest finalized epoch.
failEpoch - finalizedEpoch
else:
if (failEpoch < rewindPoint) or
(failEpoch - rewindPoint < finalizedEpoch):
# If exponential rewind point points to position which is far
# behind latest finalized epoch.
failEpoch - finalizedEpoch
else:
rewindPoint
else:
warn "Trying to rewind over the last finalized epoch",
finalized_slot = safeSlot, fail_slot = failSlot,
finalized_epoch = finalizedEpoch, fail_epoch = failEpoch,
rewind_epoch_count = rewind.epochCount,
finalized_epoch = finalizedEpoch
0'u64
else:
# `MissingParent` happened at different slot so we going to rewind for
# 1 epoch only.
if (failEpoch < 1'u64) or (failEpoch - 1'u64 < finalizedEpoch):
warn "Сould not rewind further than the last finalized epoch",
finalized_slot = safeSlot, fail_slot = failSlot,
finalized_epoch = finalizedEpoch, fail_epoch = failEpoch,
rewind_epoch_count = rewind.epochCount,
finalized_epoch = finalizedEpoch
0'u64
else:
1'u64
else:
# `MissingParent` happened first time.
if (failEpoch < 1'u64) or (failEpoch - 1'u64 < finalizedEpoch):
warn "Сould not rewind further than the last finalized epoch",
finalized_slot = safeSlot, fail_slot = failSlot,
finalized_epoch = finalizedEpoch, fail_epoch = failEpoch,
finalized_epoch = finalizedEpoch
0'u64
else:
1'u64
if epochCount == 0'u64:
warn "Unable to continue syncing, please restart the node",
finalized_slot = safeSlot, fail_slot = failSlot,
finalized_epoch = finalizedEpoch, fail_epoch = failEpoch,
finalized_epoch = finalizedEpoch
# Calculate the rewind epoch, which will be equal to last rewind point or
# finalizedEpoch
let rewindEpoch =
if sq.rewind.isNone():
finalizedEpoch
else:
epoch(sq.rewind.get().failSlot) - sq.rewind.get().epochCount
rewindEpoch.start_slot()
else:
# Calculate the rewind epoch, which should not be less than the latest
# finalized epoch.
let rewindEpoch = failEpoch - epochCount
# Update and save new rewind point in SyncQueue.
sq.rewind = some(RewindPoint(failSlot: failSlot, epochCount: epochCount))
rewindEpoch.start_slot()
of SyncQueueKind.Backward:
# While we perform backward sync, the only possible slot we could rewind is
# latest stored block.
if failSlot == safeSlot:
warn "Unable to continue syncing, please restart the node",
safe_slot = safeSlot, fail_slot = failSlot
safeSlot
# This belongs inside the blocks iterator below, but can't be there due to
# https://github.com/nim-lang/Nim/issues/21242
func getOpt(blobs: Opt[seq[BlobSidecars]], i: int): Opt[BlobSidecars] =
if blobs.isSome:
Opt.some(blobs.get()[i])
else:
Opt.none(BlobSidecars)
func getOpt(data_columns: Opt[seq[DataColumnSidecars]], i: int):
Opt[DataColumnSidecars] =
if data_columns.isSome:
Opt.some(data_columns.get()[i])
else:
Opt.none(DataColumnSidecars)
iterator blocks[T](sq: SyncQueue[T],
sr: SyncResult[T]): (ref ForkedSignedBeaconBlock, Opt[BlobSidecars]) =
case sq.kind
of SyncQueueKind.Forward:
for i in countup(0, len(sr.data) - 1):
yield (sr.data[i], sr.blobs.getOpt(i))
of SyncQueueKind.Backward:
for i in countdown(len(sr.data) - 1, 0):
yield (sr.data[i], sr.blobs.getOpt(i))
iterator das_blocks[T](sq: SyncQueue[T],
sr: SyncResult[T]): (ref ForkedSignedBeaconBlock, Opt[DataColumnSidecars]) =
case sq.kind
of SyncQueueKind.Forward:
for i in countup(0, len(sr.data) - 1):
yield (sr.data[i], sr.data_columns.getOpt(i))
of SyncQueueKind.Backward:
for i in countdown(len(sr.data) - 1, 0):
yield (sr.data[i], sr.data_columns.getOpt(i))
proc advanceOutput*[T](sq: SyncQueue[T], number: uint64) =
case sq.kind
of SyncQueueKind.Forward:
sq.outSlot = sq.outSlot + number
of SyncQueueKind.Backward:
sq.outSlot = sq.outSlot - number
proc advanceInput[T](sq: SyncQueue[T], number: uint64) =
case sq.kind
of SyncQueueKind.Forward:
sq.inpSlot = sq.inpSlot + number
of SyncQueueKind.Backward:
sq.inpSlot = sq.inpSlot - number
proc notInRange[T](sq: SyncQueue[T], sr: SyncRequest[T]): bool =
case sq.kind
of SyncQueueKind.Forward:
(sq.queueSize > 0) and (sr.slot > sq.outSlot)
of SyncQueueKind.Backward:
(sq.queueSize > 0) and (sr.lastSlot < sq.outSlot)
func numAlreadyKnownSlots[T](sq: SyncQueue[T], sr: SyncRequest[T]): uint64 =
## Compute the number of slots covered by a given `SyncRequest` that are
## already known and, hence, no longer relevant for sync progression.
let
outSlot = sq.outSlot
lowSlot = sr.slot
highSlot = sr.lastSlot
case sq.kind
of SyncQueueKind.Forward:
if outSlot > highSlot:
# Entire request is no longer relevant.
sr.count
elif outSlot > lowSlot:
# Request is only partially relevant.
outSlot - lowSlot
else:
# Entire request is still relevant.
0
of SyncQueueKind.Backward:
if lowSlot > outSlot:
# Entire request is no longer relevant.
sr.count
elif highSlot > outSlot:
# Request is only partially relevant.
highSlot - outSlot
else:
# Entire request is still relevant.
0
proc push*[T](sq: SyncQueue[T], sr: SyncRequest[T],
data: seq[ref ForkedSignedBeaconBlock],
blobs: Opt[seq[BlobSidecars]],
data_columns: Opt[seq[DataColumnSidecars]],
maybeFinalized: bool = false,
processingCb: ProcessingCallback = nil) {.async: (raises: [CancelledError]).} =
logScope:
sync_ident = sq.ident
topics = "syncman"
## Push successful result to queue ``sq``.
mixin updateScore, updateStats, getStats
if sr.index notin sq.pending:
# If request `sr` not in our pending list, it only means that
# SyncQueue.resetWait() happens and all pending requests are expired, so
# we swallow `old` requests, and in such way sync-workers are able to get
# proper new requests from SyncQueue.
return
sq.pending.del(sr.index)
# This is backpressure handling algorithm, this algorithm is blocking
# all pending `push` requests if `request.slot` not in range.
while true:
if sq.notInRange(sr):
let reset = await sq.waitForChanges()
if reset:
# SyncQueue reset happens. We are exiting to wake up sync-worker.
return
else:
let syncres = SyncResult[T](request: sr, data: data, blobs: blobs, data_columns: data_columns)
sq.readyQueue.push(syncres)
break
while len(sq.readyQueue) > 0:
let reqres =
case sq.kind
of SyncQueueKind.Forward:
let minSlot = sq.readyQueue[0].request.slot
if sq.outSlot < minSlot:
none[SyncResult[T]]()
else:
some(sq.readyQueue.pop())
of SyncQueueKind.Backward:
let maxslot = sq.readyQueue[0].request.slot +
(sq.readyQueue[0].request.count - 1'u64)
if sq.outSlot > maxslot:
none[SyncResult[T]]()
else:
some(sq.readyQueue.pop())
let item =
if reqres.isSome():
reqres.get()
else:
let rewindSlot = sq.getRewindPoint(sq.outSlot, sq.getSafeSlot())
warn "Got incorrect sync result in queue, rewind happens",
blocks_map = getShortMap(sq.readyQueue[0].request,
sq.readyQueue[0].data),
blocks_count = len(sq.readyQueue[0].data),
output_slot = sq.outSlot, input_slot = sq.inpSlot,
rewind_to_slot = rewindSlot, request = sq.readyQueue[0].request
await sq.resetWait(some(rewindSlot))
break
if processingCb != nil:
processingCb()
# Validating received blocks one by one
var
hasInvalidBlock = false
unviableBlock: Option[(Eth2Digest, Slot)]
missingParentSlot: Option[Slot]
goodBlock: Option[Slot]
# TODO when https://github.com/nim-lang/Nim/issues/21306 is fixed in used
# Nim versions, remove workaround and move `res` into for loop
res: Result[void, VerifierError]
# var i=0
# for blk, blb in sq.blocks(item):
# res = await sq.blockVerifier(blk[], blb, Opt.none(DataColumnSidecars), maybeFinalized)
# inc(i)
# if res.isOk():
# goodBlock = some(blk[].slot)
# else:
# case res.error()
# of VerifierError.MissingParent:
# missingParentSlot = some(blk[].slot)
# break
# of VerifierError.Duplicate:
# # Keep going, happens naturally
# discard
# of VerifierError.UnviableFork:
# # Keep going so as to register other unviable blocks with the
# # quarantine
# if unviableBlock.isNone:
# # Remember the first unviable block, so we can log it
# unviableBlock = some((blk[].root, blk[].slot))
# of VerifierError.Invalid:
# hasInvalidBlock = true
# let req = item.request
# notice "Received invalid sequence of blocks", request = req,
# blocks_count = len(item.data),
# blocks_map = getShortMap(req, item.data)
# req.item.updateScore(PeerScoreBadValues)
# break
var counter = 0
for blk, col in sq.das_blocks(item):
res = await sq.blockVerifier(blk[], Opt.none(BlobSidecars), col, maybeFinalized)
inc counter
if res.isOk:
debugEcho "Column into sync queue"
goodBlock = some(blk[].slot)
else:
case res.error()
of VerifierError.MissingParent:
missingParentSlot = some(blk[].slot)
break
of VerifierError.Duplicate:
# Keep going, happens naturally
discard
of VerifierError.UnviableFork:
# Keep going so as to register other unviable blocks with the
# quarantine
if unviableBlock.isNone:
# Remember the first unviable block, so we can log it
unviableBlock = some((blk[].root, blk[].slot))
of VerifierError.Invalid:
hasInvalidBlock = true
let req = item.request
notice "Received invalid sequence of blocks", request = req,
blocks_count = len(item.data),
blocks_map = getShortMap(req, item.data)
# req.item.updateScore(PeerScoreBadValues)
# When errors happen while processing blocks, we retry the same request
# with, hopefully, a different peer
let retryRequest =
hasInvalidBlock or unviableBlock.isSome() or missingParentSlot.isSome()
if not(retryRequest):
let numSlotsAdvanced = item.request.count - sq.numAlreadyKnownSlots(sr)
sq.advanceOutput(numSlotsAdvanced)
if goodBlock.isSome():
# If there no error and response was not empty we should reward peer
# with some bonus score - not for duplicate blocks though.
item.request.item.updateScore(PeerScoreGoodValues)
item.request.item.updateStats(SyncResponseKind.Good, 1'u64)
# BlockProcessor reports good block, so we can reward all the peers
# who sent us empty responses.
sq.rewardForGaps(PeerScoreGoodValues)
sq.gapList.reset()
else:
# Response was empty
item.request.item.updateStats(SyncResponseKind.Empty, 1'u64)
sq.processGap(item)
if numSlotsAdvanced > 0:
sq.wakeupWaiters()
else:
debug "Block pool rejected peer's response", request = item.request,
blocks_map = getShortMap(item.request, item.data),
blocks_count = len(item.data),
ok = goodBlock.isSome(),
unviable = unviableBlock.isSome(),
missing_parent = missingParentSlot.isSome()
# We need to move failed response to the debts queue.
sq.toDebtsQueue(item.request)
if unviableBlock.isSome():
let req = item.request
notice "Received blocks from an unviable fork", request = req,
blockRoot = unviableBlock.get()[0],
blockSlot = unviableBlock.get()[1],
blocks_count = len(item.data),
blocks_map = getShortMap(req, item.data)
req.item.updateScore(PeerScoreUnviableFork)
if missingParentSlot.isSome():
var
resetSlot: Option[Slot]
failSlot = missingParentSlot.get()
# If we got `VerifierError.MissingParent` it means that peer returns
# chain of blocks with holes or `block_pool` is in incomplete state. We
# going to rewind the SyncQueue some distance back (2ⁿ, where n∈[0,∞],
# but no more than `finalized_epoch`).
let
req = item.request
safeSlot = sq.getSafeSlot()
gapsCount = len(sq.gapList)
# We should penalize all the peers which responded with gaps.
sq.rewardForGaps(PeerScoreMissingValues)
sq.gapList.reset()
case sq.kind
of SyncQueueKind.Forward:
if goodBlock.isSome():
# `VerifierError.MissingParent` and `Success` present in response,
# it means that we just need to request this range one more time.
debug "Unexpected missing parent, but no rewind needed",
request = req, finalized_slot = safeSlot,
last_good_slot = goodBlock.get(),
missing_parent_slot = missingParentSlot.get(),
blocks_count = len(item.data),
blocks_map = getShortMap(req, item.data),
gaps_count = gapsCount
req.item.updateScore(PeerScoreMissingValues)
else:
if safeSlot < req.slot:
let rewindSlot = sq.getRewindPoint(failSlot, safeSlot)
debug "Unexpected missing parent, rewind happens",
request = req, rewind_to_slot = rewindSlot,
rewind_point = sq.rewind, finalized_slot = safeSlot,
blocks_count = len(item.data),
blocks_map = getShortMap(req, item.data),
gaps_count = gapsCount
resetSlot = some(rewindSlot)
else:
error "Unexpected missing parent at finalized epoch slot",
request = req, rewind_to_slot = safeSlot,
blocks_count = len(item.data),
blocks_map = getShortMap(req, item.data),
gaps_count = gapsCount
req.item.updateScore(PeerScoreBadValues)
of SyncQueueKind.Backward:
if safeSlot > failSlot:
let rewindSlot = sq.getRewindPoint(failSlot, safeSlot)
# It's quite common peers give us fewer blocks than we ask for
debug "Gap in block range response, rewinding", request = req,
rewind_to_slot = rewindSlot, rewind_fail_slot = failSlot,
finalized_slot = safeSlot, blocks_count = len(item.data),
blocks_map = getShortMap(req, item.data)
resetSlot = some(rewindSlot)
req.item.updateScore(PeerScoreMissingValues)
else:
error "Unexpected missing parent at safe slot", request = req,
to_slot = safeSlot, blocks_count = len(item.data),
blocks_map = getShortMap(req, item.data)
req.item.updateScore(PeerScoreBadValues)
if resetSlot.isSome():
await sq.resetWait(resetSlot)
case sq.kind
of SyncQueueKind.Forward:
debug "Rewind to slot has happened", reset_slot = resetSlot.get(),
queue_input_slot = sq.inpSlot, queue_output_slot = sq.outSlot,
rewind_point = sq.rewind, direction = sq.kind
of SyncQueueKind.Backward:
debug "Rewind to slot has happened", reset_slot = resetSlot.get(),
queue_input_slot = sq.inpSlot, queue_output_slot = sq.outSlot,
direction = sq.kind
break
proc push*[T](sq: SyncQueue[T], sr: SyncRequest[T]) =
## Push failed request back to queue.
if sr.index notin sq.pending:
# If request `sr` not in our pending list, it only means that
# SyncQueue.resetWait() happens and all pending requests are expired, so
# we swallow `old` requests, and in such way sync-workers are able to get
# proper new requests from SyncQueue.
return
sq.pending.del(sr.index)
sq.toDebtsQueue(sr)
proc handlePotentialSafeSlotAdvancement[T](sq: SyncQueue[T]) =
# It may happen that sync progress advanced to a newer `safeSlot`, either
# by a response that started with good values and only had errors late, or
# through an out-of-band mechanism, e.g., VC / REST.
# If that happens, advance to the new `safeSlot` to avoid repeating requests
# for data that is considered immutable and no longer relevant.
let safeSlot = sq.getSafeSlot()
func numSlotsBehindSafeSlot(slot: Slot): uint64 =
case sq.kind
of SyncQueueKind.Forward:
if safeSlot > slot:
safeSlot - slot
else:
0
of SyncQueueKind.Backward:
if slot > safeSlot:
slot - safeSlot
else:
0
let
numOutSlotsAdvanced = sq.outSlot.numSlotsBehindSafeSlot
numInpSlotsAdvanced =
case sq.kind
of SyncQueueKind.Forward:
sq.inpSlot.numSlotsBehindSafeSlot
of SyncQueueKind.Backward:
if sq.inpSlot == 0xFFFF_FFFF_FFFF_FFFF'u64:
0'u64
else:
sq.inpSlot.numSlotsBehindSafeSlot
if numOutSlotsAdvanced != 0 or numInpSlotsAdvanced != 0:
debug "Sync progress advanced out-of-band",
safeSlot, outSlot = sq.outSlot, inpSlot = sq.inpSlot
if numOutSlotsAdvanced != 0:
sq.advanceOutput(numOutSlotsAdvanced)
if numInpSlotsAdvanced != 0:
sq.advanceInput(numInpSlotsAdvanced)
sq.wakeupWaiters()
func updateRequestForNewSafeSlot[T](sq: SyncQueue[T], sr: var SyncRequest[T]) =
# Requests may have originated before the latest `safeSlot` advancement.
# Update it to not request any data prior to `safeSlot`.
let
outSlot = sq.outSlot
lowSlot = sr.slot
highSlot = sr.lastSlot
case sq.kind
of SyncQueueKind.Forward:
if outSlot <= lowSlot:
# Entire request is still relevant.
discard
elif outSlot <= highSlot:
# Request is only partially relevant.
let
numSlotsDone = outSlot - lowSlot
sr.slot += numSlotsDone
sr.count -= numSlotsDone
else:
# Entire request is no longer relevant.
sr.count = 0
of SyncQueueKind.Backward:
if outSlot >= highSlot:
# Entire request is still relevant.
discard
elif outSlot >= lowSlot:
# Request is only partially relevant.
let
numSlotsDone = highSlot - outSlot
sr.count -= numSlotsDone
else:
# Entire request is no longer relevant.
sr.count = 0
proc pop*[T](sq: SyncQueue[T], maxslot: Slot, item: T): SyncRequest[T] =
## Create new request according to current SyncQueue parameters.
sq.handlePotentialSafeSlotAdvancement()
while len(sq.debtsQueue) > 0:
if maxslot < sq.debtsQueue[0].slot:
# Peer's latest slot is less than starting request's slot.
return SyncRequest.empty(sq.kind, T)
if maxslot < sq.debtsQueue[0].lastSlot():
# Peer's latest slot is less than finishing request's slot.
return SyncRequest.empty(sq.kind, T)
var sr = sq.debtsQueue.pop()
sq.debtsCount = sq.debtsCount - sr.count
sq.updateRequestForNewSafeSlot(sr)
if sr.isEmpty:
continue
sr.setItem(item)
sq.makePending(sr)
return sr
case sq.kind
of SyncQueueKind.Forward:
if maxslot < sq.inpSlot:
# Peer's latest slot is less than queue's input slot.
return SyncRequest.empty(sq.kind, T)
if sq.inpSlot > sq.finalSlot:
# Queue's input slot is bigger than queue's final slot.
return SyncRequest.empty(sq.kind, T)
let lastSlot = min(maxslot, sq.finalSlot)
let count = min(sq.chunkSize, lastSlot + 1'u64 - sq.inpSlot)
var sr = SyncRequest.init(sq.kind, sq.inpSlot, count, item)
sq.advanceInput(count)
sq.makePending(sr)
sr
of SyncQueueKind.Backward:
if sq.inpSlot == 0xFFFF_FFFF_FFFF_FFFF'u64:
return SyncRequest.empty(sq.kind, T)
if sq.inpSlot < sq.finalSlot:
return SyncRequest.empty(sq.kind, T)
let (slot, count) =
block:
let baseSlot = sq.inpSlot + 1'u64
if baseSlot - sq.finalSlot < sq.chunkSize:
let count = uint64(baseSlot - sq.finalSlot)
(baseSlot - count, count)
else:
(baseSlot - sq.chunkSize, sq.chunkSize)
if (maxslot + 1'u64) < slot + count:
# Peer's latest slot is less than queue's input slot.
return SyncRequest.empty(sq.kind, T)
var sr = SyncRequest.init(sq.kind, slot, count, item)
sq.advanceInput(count)
sq.makePending(sr)
sr
proc debtLen*[T](sq: SyncQueue[T]): uint64 =
sq.debtsCount
proc pendingLen*[T](sq: SyncQueue[T]): uint64 =
case sq.kind
of SyncQueueKind.Forward:
# When moving forward `outSlot` will be <= of `inpSlot`.
sq.inpSlot - sq.outSlot
of SyncQueueKind.Backward:
# When moving backward `outSlot` will be >= of `inpSlot`
sq.outSlot - sq.inpSlot
proc len*[T](sq: SyncQueue[T]): uint64 {.inline.} =
## Returns number of slots left in queue ``sq``.
case sq.kind
of SyncQueueKind.Forward:
if sq.finalSlot >= sq.outSlot:
sq.finalSlot + 1'u64 - sq.outSlot
else:
0'u64
of SyncQueueKind.Backward:
if sq.outSlot >= sq.finalSlot:
sq.outSlot + 1'u64 - sq.finalSlot
else:
0'u64
proc total*[T](sq: SyncQueue[T]): uint64 {.inline.} =
## Returns total number of slots in queue ``sq``.
case sq.kind
of SyncQueueKind.Forward:
if sq.finalSlot >= sq.startSlot:
sq.finalSlot + 1'u64 - sq.startSlot
else:
0'u64
of SyncQueueKind.Backward:
if sq.startSlot >= sq.finalSlot:
sq.startSlot + 1'u64 - sq.finalSlot
else:
0'u64
proc progress*[T](sq: SyncQueue[T]): uint64 =
## How many useful slots we've synced so far, adjusting for how much has
## become obsolete by time movements
sq.total - sq.len