Fetch by-root request directly from quarantine (#5167)
When the request manager is busy fetching blocks, the queue can fill up with multiple entries for the same root - since there is no deduplication, requests containing the same root several times get sent out. Also, because items potentially sit in the queue for a long time, a request may be stale by the time the manager is done with the previous one. This PR removes the queue and fetches the blocks to download directly from the quarantine, which solves both problems: the quarantine already de-duplicates and holds no stale entries. Removing the queue for blobs is left for a future PR.

Co-authored-by: tersec <tersec@users.noreply.github.com>
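The gist of the problem can be shown with a small standalone Nim sketch (illustrative only - string roots and std containers stand in for Eth2Digest, the old AsyncQueue and the quarantine's root-keyed missing table): a FIFO happily stores the same root twice, while a table keyed by root de-duplicates by construction.

import std/[deques, tables]

var queue: Deque[string]          # stand-in for the old AsyncQueue[FetchRecord]
var missing: Table[string, int]   # stand-in for the quarantine's root-keyed table

for root in ["a", "b", "a"]:
  queue.addLast(root)             # "a" ends up queued twice -> two requests
  missing[root] = 0               # "a" is stored once, keyed by root

assert queue.len == 3
assert missing.len == 2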
@@ -77,7 +77,7 @@ type
 func init*(T: type Quarantine): T =
   T()
 
-func checkMissing*(quarantine: var Quarantine): seq[FetchRecord] =
+func checkMissing*(quarantine: var Quarantine, max: int): seq[FetchRecord] =
   ## Return a list of blocks that we should try to resolve from other client -
   ## to be called periodically but not too often (once per slot?)
   var done: seq[Eth2Digest]
@@ -85,16 +85,17 @@ func checkMissing*(quarantine: var Quarantine): seq[FetchRecord] =
   for k, v in quarantine.missing.mpairs():
     if v.tries > 8:
       done.add(k)
-    else:
-      inc v.tries
 
   for k in done:
     quarantine.missing.del(k)
 
   # simple (simplistic?) exponential backoff for retries..
-  for k, v in quarantine.missing:
+  for k, v in quarantine.missing.mpairs:
+    v.tries += 1
     if countOnes(v.tries.uint64) == 1:
       result.add(FetchRecord(root: k))
+      if result.len >= max:
+        break
 
 # TODO stew/sequtils2
 template anyIt(s, pred: untyped): bool =
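For readers unfamiliar with the countOnes trick: a missing root is re-requested only when its try counter is a power of two, and the entry is dropped once the counter passes 8, so each root is retried on tries 1, 2, 4 and 8 - a simple capped exponential backoff. The new max parameter additionally bounds how many FetchRecords a single poll returns. A standalone worked illustration (not part of the diff):

import std/bitops

var retriedAt: seq[int]
for tries in 1 .. 9:                  # the quarantine drops entries once tries exceeds 8
  if countOnes(tries.uint64) == 1:    # true exactly for powers of two
    retriedAt.add tries

assert retriedAt == @[1, 2, 4, 8]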
@@ -369,6 +369,8 @@ proc initFullNode(
         resfut,
         maybeFinalized = maybeFinalized)
       resfut
+
+
     processor = Eth2Processor.new(
       config.doppelgangerDetection,
       blockProcessor, node.validatorMonitor, dag, attestationPool,
@@ -386,6 +388,10 @@ proc initFullNode(
     router = (ref MessageRouter)(
       processor: processor,
       network: node.network)
+    requestManager = RequestManager.init(
+      node.network, dag.cfg.DENEB_FORK_EPOCH, getBeaconTime,
+      (proc(): bool = syncManager.inProgress),
+      quarantine, blobQuarantine, rmanBlockVerifier)
 
   if node.config.lightClientDataServe:
     proc scheduleSendingLightClientUpdates(slot: Slot) =
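The new inhibit argument is a nullary closure rather than a boolean, so the sync manager's state is read at every poll tick instead of once at construction time. A tiny standalone illustration of that shape (hypothetical names, not part of the diff):

type DummySync = ref object
  inProgress: bool

let
  syncState = DummySync(inProgress: true)
  inhibit = proc(): bool = syncState.inProgress   # same shape as InhibitFn

assert inhibit()              # polling is skipped while syncing
syncState.inProgress = false
assert not inhibit()          # once syncing stops, fetching resumes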
@@ -417,12 +423,7 @@ proc initFullNode(
   node.processor = processor
   node.blockProcessor = blockProcessor
   node.consensusManager = consensusManager
-  node.requestManager = RequestManager.init(node.network,
-                                            dag.cfg.DENEB_FORK_EPOCH,
-                                            getBeaconTime,
-                                            quarantine,
-                                            blobQuarantine,
-                                            rmanBlockVerifier)
+  node.requestManager = requestManager
   node.syncManager = syncManager
   node.backfiller = backfiller
   node.router = router
@@ -1393,16 +1394,9 @@ proc handleMissingBlobs(node: BeaconNode) =
   debug "Requesting detected missing blobs", blobs = shortLog(fetches)
   node.requestManager.fetchMissingBlobs(fetches)
 
-proc handleMissingBlocks(node: BeaconNode) =
-  let missingBlocks = node.quarantine[].checkMissing()
-  if missingBlocks.len > 0:
-    debug "Requesting detected missing blocks", blocks = shortLog(missingBlocks)
-    node.requestManager.fetchAncestorBlocks(missingBlocks)
-
 proc onSecond(node: BeaconNode, time: Moment) =
   ## This procedure will be called once per second.
   if not(node.syncManager.inProgress):
-    node.handleMissingBlocks()
     node.handleMissingBlobs()
 
 # Nim GC metrics (for the main thread)
@@ -34,18 +34,23 @@ const
   BLOB_GOSSIP_WAIT_TIME_NS* = 2 * 1_000_000_000
   ## How long to wait for blobs to arrive over gossip before fetching.
 
+  POLL_FREQUENCY = 1.seconds
+
 type
-  BlockVerifier* =
+  BlockVerifierFn* =
     proc(signedBlock: ForkedSignedBeaconBlock, maybeFinalized: bool):
-      Future[Result[void, VerifierError]] {.gcsafe, raises: [Defect].}
+      Future[Result[void, VerifierError]] {.gcsafe, raises: [].}
+  InhibitFn* = proc: bool {.gcsafe, raises:[].}
+
   RequestManager* = object
     network*: Eth2Node
     inpBlockQueue*: AsyncQueue[FetchRecord]
     inpBlobQueue: AsyncQueue[BlobIdentifier]
     getBeaconTime: GetBeaconTimeFn
+    inhibit: InhibitFn
     quarantine: ref Quarantine
     blobQuarantine: ref BlobQuarantine
-    blockVerifier: BlockVerifier
+    blockVerifier: BlockVerifierFn
     blockLoopFuture: Future[void]
     blobLoopFuture: Future[void]
 
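Together, POLL_FREQUENCY and InhibitFn turn the request manager's block path from queue-driven into poll-driven. A reduced, standalone sketch of that pattern (illustrative only - fixed iteration count, a plain bool gate and a dummy work source instead of the real sync manager and quarantine; all names hypothetical):

import chronos

var syncing = false
proc pendingRoots(max: int): seq[string] = @["a", "b"]   # stand-in for checkMissing

proc pollLoop() {.async.} =
  for _ in 0 ..< 3:                    # the real loop runs `while true`
    await sleepAsync(10.milliseconds)  # POLL_FREQUENCY in the real code
    if syncing:                        # InhibitFn: skip the tick while syncing
      continue
    let batch = pendingRoots(32)
    if batch.len == 0:
      continue
    echo "would request ", batch

waitFor pollLoop()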
@@ -58,14 +63,16 @@ func shortLog*(x: seq[FetchRecord]): string =
 proc init*(T: type RequestManager, network: Eth2Node,
            denebEpoch: Epoch,
            getBeaconTime: GetBeaconTimeFn,
+           inhibit: InhibitFn,
            quarantine: ref Quarantine,
            blobQuarantine: ref BlobQuarantine,
-           blockVerifier: BlockVerifier): RequestManager =
+           blockVerifier: BlockVerifierFn): RequestManager =
   RequestManager(
     network: network,
-    inpBlockQueue: newAsyncQueue[FetchRecord](),
+    # TODO remove this queue and poll the quarantine directly
     inpBlobQueue: newAsyncQueue[BlobIdentifier](),
     getBeaconTime: getBeaconTime,
+    inhibit: inhibit,
     quarantine: quarantine,
     blobQuarantine: blobQuarantine,
     blockVerifier: blockVerifier,
@@ -100,8 +107,7 @@ proc checkResponse(idList: seq[BlobIdentifier],
       return false
   true
 
-proc fetchAncestorBlocksFromNetwork(rman: RequestManager,
-                                    items: seq[Eth2Digest]) {.async.} =
+proc requestBlocksByRoot(rman: RequestManager, items: seq[Eth2Digest]) {.async.} =
   var peer: Peer
   try:
     peer = await rman.network.peerPool.acquire()
@@ -154,21 +160,25 @@ proc fetchAncestorBlocksFromNetwork(rman: RequestManager,
         peer.updateScore(PeerScoreUnviableFork)
       elif gotGoodBlock:
         debug "Request manager got good block",
-          peer = peer, blocks = shortLog(items)
+          peer = peer, blocks = shortLog(items), ublocks = len(ublocks)
 
         # We reward peer only if it returns something.
         peer.updateScore(PeerScoreGoodValues)
 
       else:
+        debug "Mismatching response to blocks by root",
+          peer = peer, blocks = shortLog(items), ublocks = len(ublocks)
         peer.updateScore(PeerScoreBadResponse)
     else:
+      debug "Blocks by root request failed",
+        peer = peer, blocks = shortLog(items), err = blocks.error()
       peer.updateScore(PeerScoreNoValues)
 
   except CancelledError as exc:
     raise exc
   except CatchableError as exc:
     peer.updateScore(PeerScoreNoValues)
-    debug "Error while fetching ancestor blocks", exc = exc.msg,
+    debug "Error while fetching blocks by root", exc = exc.msg,
       items = shortLog(items), peer = peer, peer_score = peer.getScore()
     raise exc
   finally:
@@ -189,6 +199,8 @@ proc fetchBlobsFromNetwork(self: RequestManager,
     if blobs.isOk:
       let ublobs = blobs.get()
       if not checkResponse(idList, ublobs.asSeq()):
+        debug "Mismatched response to blobs by root",
+          peer = peer, blobs = shortLog(idList), ublobs = len(ublobs)
         peer.updateScore(PeerScoreBadResponse)
         return
 
@@ -204,56 +216,64 @@ proc fetchBlobsFromNetwork(self: RequestManager,
         # TODO:
         # If appropriate, return a VerifierError.InvalidBlob from verification,
         # check for it here, and penalize the peer accordingly.
+    else:
+      debug "Blobs by root request failed",
+        peer = peer, blobs = shortLog(idList), err = blobs.error()
+      peer.updateScore(PeerScoreNoValues)
 
   except CancelledError as exc:
     raise exc
   except CatchableError as exc:
     peer.updateScore(PeerScoreNoValues)
-    debug "Error while fetching blobs", exc = exc.msg,
+    debug "Error while fetching blobs by root", exc = exc.msg,
      idList = shortLog(idList), peer = peer, peer_score = peer.getScore()
     raise exc
   finally:
     if not(isNil(peer)):
       self.network.peerPool.release(peer)
 
 
 proc requestManagerBlockLoop(rman: RequestManager) {.async.} =
-  var rootList = newSeq[Eth2Digest]()
-  var workers = newSeq[Future[void]](PARALLEL_REQUESTS)
   while true:
+    # TODO This polling could be replaced with an AsyncEvent that is fired
+    # from the quarantine when there's work to do
+    await sleepAsync(POLL_FREQUENCY)
+
+    if rman.inhibit():
+      continue
+
+    let blocks = mapIt(rman.quarantine[].checkMissing(
+      SYNC_MAX_REQUESTED_BLOCKS), it.root)
+    if blocks.len == 0:
+      continue
+
+    debug "Requesting detected missing blocks", blocks = shortLog(blocks)
     try:
-      rootList.setLen(0)
-      let req = await rman.inpBlockQueue.popFirst()
-      rootList.add(req.root)
-
-      var count = min(SYNC_MAX_REQUESTED_BLOCKS - 1, len(rman.inpBlockQueue))
-      while count > 0:
-        rootList.add(rman.inpBlockQueue.popFirstNoWait().root)
-        dec(count)
-
       let start = SyncMoment.now(0)
 
-      for i in 0 ..< PARALLEL_REQUESTS:
-        workers[i] = rman.fetchAncestorBlocksFromNetwork(rootList)
+      var workers: array[PARALLEL_REQUESTS, Future[void]]
+      for i in 0 ..< PARALLEL_REQUESTS:
+        workers[i] = rman.requestBlocksByRoot(blocks)
 
-      # We do not care about
       await allFutures(workers)
 
-      let finish = SyncMoment.now(uint64(len(rootList)))
+      let finish = SyncMoment.now(uint64(len(blocks)))
 
       var succeed = 0
       for worker in workers:
-        if worker.finished() and not(worker.failed()):
+        if worker.completed():
          inc(succeed)
 
-      debug "Request manager block tick", blocks_count = len(rootList),
+      debug "Request manager block tick", blocks = shortLog(blocks),
            succeed = succeed,
            failed = (len(workers) - succeed),
            queue_size = len(rman.inpBlockQueue),
            sync_speed = speed(start, finish)
 
+    except CancelledError as exc:
+      break
     except CatchableError as exc:
-      debug "Got a problem in request manager", exc = exc.msg
+      warn "Unexpected error in request manager block loop", exc = exc.msg
 
 proc requestManagerBlobLoop(rman: RequestManager) {.async.} =
   var idList = newSeq[BlobIdentifier]()
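Two smaller behavioural changes in the loop above are easy to miss: cancellation now exits the loop cleanly instead of being logged as a generic problem, and per-worker success is judged with completed(), which in chronos is true only for a future that finished with a value - unlike the old finished-and-not-failed check, it does not count cancelled workers as successes. A minimal sketch of the future state involved (illustrative only, arbitrary future name):

import chronos

let fut = newFuture[void]("demo")
fut.complete()

# A completed future is finished, not failed - `completed()` checks
# exactly this single success state.
assert fut.finished()
assert fut.completed()
assert not fut.failed()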
@@ -261,13 +281,13 @@ proc requestManagerBlobLoop(rman: RequestManager) {.async.} =
   while true:
     try:
       idList.setLen(0)
-      let id = await rman.inpBlobQueue.popFirst()
-      idList.add(id)
+      idList.add(await rman.inpBlobQueue.popFirst())
 
-      var count = min(MAX_REQUEST_BLOB_SIDECARS - 1, lenu64(rman.inpBlobQueue))
-      while count > 0:
-        idList.add(rman.inpBlobQueue.popFirstNoWait())
-        dec(count)
+      while len(rman.inpBlobQueue) > 0 and
+          lenu64(idList) < MAX_REQUEST_BLOB_SIDECARS:
+        let id = rman.inpBlobQueue.popFirstNoWait()
+        if id notin idList:
+          idList.add(id)
 
       let start = SyncMoment.now(0)
 
@@ -286,8 +306,10 @@ proc requestManagerBlobLoop(rman: RequestManager) {.async.} =
            failed = (len(workers) - succeed),
            queue_size = len(rman.inpBlobQueue)
 
+    except CancelledError as exc:
+      break
     except CatchableError as exc:
-      debug "Got a problem in request manager", exc = exc.msg
+      warn "Unexpected error in request manager blob loop", exc = exc.msg
 
 proc start*(rman: var RequestManager) =
   ## Start Request Manager's loops.
@@ -309,7 +331,6 @@ proc fetchAncestorBlocks*(rman: RequestManager, roots: seq[FetchRecord]) =
       rman.inpBlockQueue.addLastNoWait(item)
     except AsyncQueueFullError: raiseAssert "unbounded queue"
 
-
 proc fetchMissingBlobs*(rman: RequestManager,
                         recs: seq[BlobFetchRecord]) =
   var idList: seq[BlobIdentifier]
@@ -68,7 +68,7 @@ suite "Block processor" & preset():
     check:
       not dag.containsForkBlock(b2.root) # Unresolved, shouldn't show up
 
-      FetchRecord(root: b1.root) in quarantine[].checkMissing()
+      FetchRecord(root: b1.root) in quarantine[].checkMissing(32)
 
     let
       status = await processor.storeBlock(
@@ -42,11 +42,11 @@ suite "Block quarantine":
 
     quarantine.addMissing(b1.root)
     check:
-      FetchRecord(root: b1.root) in quarantine.checkMissing()
+      FetchRecord(root: b1.root) in quarantine.checkMissing(32)
 
       quarantine.addOrphan(Slot 0, b1).isOk
 
-      FetchRecord(root: b1.root) notin quarantine.checkMissing()
+      FetchRecord(root: b1.root) notin quarantine.checkMissing(32)
 
       quarantine.addOrphan(Slot 0, b2).isOk
       quarantine.addOrphan(Slot 0, b3).isOk