nimbus-eth2/beacon_chain/sync/request_manager.nim
Agnish Ghosh c1cb673d6c fix bug
2024-09-19 17:22:08 +05:30

653 lines
24 KiB
Nim

# beacon_chain
# Copyright (c) 2018-2024 Status Research & Development GmbH
# Licensed and distributed under either of
# * MIT license (license terms in the root directory or at https://opensource.org/licenses/MIT).
# * Apache v2 license (license terms in the root directory or at https://www.apache.org/licenses/LICENSE-2.0).
# at your option. This file may not be copied, modified, or distributed except according to those terms.
{.push raises: [].}
import std/[sequtils, strutils]
import chronos, chronicles
import
../spec/datatypes/[phase0, deneb],
../spec/[forks, network, eip7594_helpers],
../spec/[eth2_ssz_serialization],
../networking/eth2_network,
../consensus_object_pools/block_quarantine,
../consensus_object_pools/blob_quarantine,
../consensus_object_pools/data_column_quarantine,
"."/sync_protocol, "."/sync_manager,
../gossip_processing/block_processor
from ../beacon_clock import GetBeaconTimeFn
export block_quarantine, sync_manager
logScope:
topics = "requman"
const
SYNC_MAX_REQUESTED_BLOCKS* = 32 # Spec allows up to MAX_REQUEST_BLOCKS.
## Maximum number of blocks which will be requested in each
## `beaconBlocksByRoot` invocation.
PARALLEL_REQUESTS* = 2
## Number of peers we using to resolve our request.
PARALLEL_REQUESTS_DATA_COLUMNS* = 16
BLOB_GOSSIP_WAIT_TIME_NS* = 2 * 1_000_000_000
## How long to wait for blobs to arrive over gossip before fetching.
DATA_COLUMN_GOSSIP_WAIT_TIME_NS* = 2 * 1_000_000_000
POLL_INTERVAL = 1.seconds
type
BlockVerifierFn* = proc(
signedBlock: ForkedSignedBeaconBlock,
maybeFinalized: bool
): Future[Result[void, VerifierError]] {.async: (raises: [CancelledError]).}
BlockLoaderFn* = proc(
blockRoot: Eth2Digest
): Opt[ForkedTrustedSignedBeaconBlock] {.gcsafe, raises: [].}
BlobLoaderFn* = proc(
blobId: BlobIdentifier): Opt[ref BlobSidecar] {.gcsafe, raises: [].}
DataColumnLoaderFn* = proc(
columnId: DataColumnIdentifier
): Opt[ref DataColumnSidecar] {.gcsafe, raises: [].}
InhibitFn* = proc: bool {.gcsafe, raises: [].}
RequestManager* = object
network*: Eth2Node
supernode: bool
getBeaconTime: GetBeaconTimeFn
inhibit: InhibitFn
quarantine: ref Quarantine
blobQuarantine: ref BlobQuarantine
dataColumnQuarantine: ref DataColumnQuarantine
blockVerifier: BlockVerifierFn
blockLoader: BlockLoaderFn
blobLoader: BlobLoaderFn
dataColumnLoader: DataColumnLoaderFn
blockLoopFuture: Future[void].Raising([CancelledError])
blobLoopFuture: Future[void].Raising([CancelledError])
dataColumnLoopFuture: Future[void].Raising([CancelledError])
func shortLog*(x: seq[Eth2Digest]): string =
"[" & x.mapIt(shortLog(it)).join(", ") & "]"
func shortLog*(x: seq[FetchRecord]): string =
"[" & x.mapIt(shortLog(it.root)).join(", ") & "]"
proc init*(T: type RequestManager, network: Eth2Node,
supernode: bool,
denebEpoch: Epoch,
getBeaconTime: GetBeaconTimeFn,
inhibit: InhibitFn,
quarantine: ref Quarantine,
blobQuarantine: ref BlobQuarantine,
dataColumnQuarantine: ref DataColumnQuarantine,
blockVerifier: BlockVerifierFn,
blockLoader: BlockLoaderFn = nil,
blobLoader: BlobLoaderFn = nil,
dataColumnLoader: DataColumnLoaderFn = nil): RequestManager =
RequestManager(
network: network,
supernode: supernode,
getBeaconTime: getBeaconTime,
inhibit: inhibit,
quarantine: quarantine,
blobQuarantine: blobQuarantine,
dataColumnQuarantine: dataColumnQuarantine,
blockVerifier: blockVerifier,
blockLoader: blockLoader,
blobLoader: blobLoader,
dataColumnLoader: dataColumnLoader)
proc checkResponse(roots: openArray[Eth2Digest],
blocks: openArray[ref ForkedSignedBeaconBlock]): bool =
## This procedure checks peer's response.
var checks = @roots
if len(blocks) > len(roots):
return false
for blk in blocks:
let res = checks.find(blk[].root)
if res == -1:
return false
else:
checks.del(res)
true
proc checkResponse(idList: seq[BlobIdentifier],
blobs: openArray[ref BlobSidecar]): bool =
if len(blobs) > len(idList):
return false
for blob in blobs:
let block_root = hash_tree_root(blob.signed_block_header.message)
var found = false
for id in idList:
if id.block_root == block_root and id.index == blob.index:
found = true
break
if not found:
return false
blob[].verify_blob_sidecar_inclusion_proof().isOkOr:
return false
true
proc checkResponse(colIdList: seq[DataColumnIdentifier],
columns: openArray[ref DataColumnSidecar]): bool =
if len(columns) > len(colIdList):
return false
for column in columns:
let block_root = hash_tree_root(column.signed_block_header.message)
var found = false
for id in colIdList:
if id.block_root == block_root and id.index == column.index:
found = true
break
if not found:
return false
column[].verify_data_column_sidecar_inclusion_proof().isOkOr:
return false
true
proc requestBlocksByRoot(rman: RequestManager, items: seq[Eth2Digest]) {.async: (raises: [CancelledError]).} =
var peer: Peer
try:
peer = await rman.network.peerPool.acquire()
debug "Requesting blocks by root", peer = peer, blocks = shortLog(items),
peer_score = peer.getScore()
let blocks = (await beaconBlocksByRoot_v2(peer, BlockRootsList items))
if blocks.isOk:
let ublocks = blocks.get()
if checkResponse(items, ublocks.asSeq()):
var
gotGoodBlock = false
gotUnviableBlock = false
for b in ublocks:
let ver = await rman.blockVerifier(b[], false)
if ver.isErr():
case ver.error()
of VerifierError.MissingParent:
# Ignoring because the order of the blocks that
# we requested may be different from the order in which we need
# these blocks to apply.
discard
of VerifierError.Duplicate:
# Ignoring because these errors could occur due to the
# concurrent/parallel requests we made.
discard
of VerifierError.UnviableFork:
# If they're working a different fork, we'll want to descore them
# but also process the other blocks (in case we can register the
# other blocks as unviable)
gotUnviableBlock = true
of VerifierError.Invalid:
# We stop processing blocks because peer is either sending us
# junk or working a different fork
notice "Received invalid block",
peer = peer, blocks = shortLog(items),
peer_score = peer.getScore()
peer.updateScore(PeerScoreBadValues)
return # Stop processing this junk...
else:
gotGoodBlock = true
if gotUnviableBlock:
notice "Received blocks from an unviable fork",
peer = peer, blocks = shortLog(items),
peer_score = peer.getScore()
peer.updateScore(PeerScoreUnviableFork)
elif gotGoodBlock:
debug "Request manager got good block",
peer = peer, blocks = shortLog(items), ublocks = len(ublocks)
# We reward peer only if it returns something.
peer.updateScore(PeerScoreGoodValues)
else:
debug "Mismatching response to blocks by root",
peer = peer, blocks = shortLog(items), ublocks = len(ublocks)
peer.updateScore(PeerScoreBadResponse)
else:
debug "Blocks by root request failed",
peer = peer, blocks = shortLog(items), err = blocks.error()
peer.updateScore(PeerScoreNoValues)
finally:
if not(isNil(peer)):
rman.network.peerPool.release(peer)
proc fetchBlobsFromNetwork(self: RequestManager,
idList: seq[BlobIdentifier])
{.async: (raises: [CancelledError]).} =
var peer: Peer
try:
peer = await self.network.peerPool.acquire()
debug "Requesting blobs by root", peer = peer, blobs = shortLog(idList),
peer_score = peer.getScore()
let blobs = await blobSidecarsByRoot(peer, BlobIdentifierList idList)
if blobs.isOk:
let ublobs = blobs.get()
if not checkResponse(idList, ublobs.asSeq()):
debug "Mismatched response to blobs by root",
peer = peer, blobs = shortLog(idList), ublobs = len(ublobs)
peer.updateScore(PeerScoreBadResponse)
return
for b in ublobs:
self.blobQuarantine[].put(b)
var curRoot: Eth2Digest
for b in ublobs:
let block_root = hash_tree_root(b.signed_block_header.message)
if block_root != curRoot:
curRoot = block_root
if (let o = self.quarantine[].popBlobless(curRoot); o.isSome):
let b = o.unsafeGet()
discard await self.blockVerifier(b, false)
# TODO:
# If appropriate, return a VerifierError.InvalidBlob from
# verification, check for it here, and penalize the peer accordingly
else:
debug "Blobs by root request failed",
peer = peer, blobs = shortLog(idList), err = blobs.error()
peer.updateScore(PeerScoreNoValues)
finally:
if not(isNil(peer)):
self.network.peerPool.release(peer)
proc constructValidCustodyPeers(rman: RequestManager,
peers: openArray[Peer]):
seq[Peer] =
let localCustodySubnetCount =
if rman.supernode:
DATA_COLUMN_SIDECAR_SUBNET_COUNT.uint64
else:
CUSTODY_REQUIREMENT
# Fetching the local cusotrdy columns
let
localNodeId = rman.network.nodeId
localCustodyColumns =
localNodeId.get_custody_columns(max(SAMPLES_PER_SLOT.uint64,
localCustodySubnetCount))
var validPeers: seq[Peer]
for peer in peers:
# Get the custody subnet count of the remote peer
let remoteCustodySubnetCount =
peer.lookupCscFromPeer()
# Extract remote peer's nodeID from peerID
# Fetch custody columns from remote peer
let
remoteNodeId = getNodeIdFromPeer(peer)
remoteCustodyColumns =
remoteNodeId.get_custody_columns(max(SAMPLES_PER_SLOT.uint64,
remoteCustodySubnetCount))
# If the remote peer custodies less columns than
# our local node
# We skip it
if remoteCustodyColumns.len < localCustodyColumns.len:
continue
# If the remote peer custodies all the possible columns
if remoteCustodyColumns.len == NUMBER_OF_COLUMNS:
validPeers.add(peer)
# Filtering out the inval;id peers
for column in localCustodyColumns:
if column notin remoteCustodyColumns:
continue
# Otherwise add the peer to the set of valid peers
validPeers.add(peer)
validPeers
proc fetchDataColumnsFromNetwork(rman: RequestManager,
colIdList: seq[DataColumnIdentifier])
{.async: (raises: [CancelledError]).} =
var peer: Peer
var peers: seq[Peer]
try:
peer = await rman.network.peerPool.acquire()
# Create a peer list, which shall be later trimmed off as to which
# of the peers have the valid custody columns
peers.add(peer)
let validPeers = rman.constructValidCustodyPeers(peers)
if peer in validPeers:
debug "Requesting data columns by root", peer = peer, columns = shortLog(colIdList),
peer_score = peer.getScore()
let columns = await dataColumnSidecarsByRoot(peer, DataColumnIdentifierList colIdList)
if columns.isOk:
let ucolumns = columns.get()
if not checkResponse(colIdList, ucolumns.asSeq()):
debug "Mismatched response to data columns by root",
peer = peer, columns = shortLog(colIdList), ucolumns = len(ucolumns)
# peer.updateScore(PeerScoreBadResponse)
return
for col in ucolumns:
rman.dataColumnQuarantine[].put(col)
var curRoot: Eth2Digest
for col in ucolumns:
let block_root = hash_tree_root(col.signed_block_header.message)
if block_root != curRoot:
curRoot = block_root
if (let o = rman.quarantine[].popColumnless(curRoot); o.isSome):
let col = o.unsafeGet()
discard await rman.blockVerifier(col, false)
else:
debug "Data columns by root request failed",
peer = peer, columns = shortLog(colIdList), err = columns.error()
# peer.updateScore(PeerScoreNoValues)
finally:
if not(isNil(peer)):
rman.network.peerPool.release(peer)
proc requestManagerBlockLoop(
rman: RequestManager) {.async: (raises: [CancelledError]).} =
while true:
# TODO This polling could be replaced with an AsyncEvent that is fired
# from the quarantine when there's work to do
await sleepAsync(POLL_INTERVAL)
if rman.inhibit():
continue
let missingBlockRoots =
rman.quarantine[].checkMissing(SYNC_MAX_REQUESTED_BLOCKS).mapIt(it.root)
if missingBlockRoots.len == 0:
continue
# TODO This logic can be removed if the database schema is extended
# to store non-canonical heads on top of the canonical head!
# If that is done, the database no longer contains extra blocks
# that have not yet been assigned a `BlockRef`
var blockRoots: seq[Eth2Digest]
if rman.blockLoader == nil:
blockRoots = missingBlockRoots
else:
var verifiers:
seq[Future[Result[void, VerifierError]].Raising([CancelledError])]
for blockRoot in missingBlockRoots:
let blck = rman.blockLoader(blockRoot).valueOr:
blockRoots.add blockRoot
continue
debug "Loaded orphaned block from storage", blockRoot
verifiers.add rman.blockVerifier(
blck.asSigned(), maybeFinalized = false)
try:
await allFutures(verifiers)
except CancelledError as exc:
var futs = newSeqOfCap[Future[void].Raising([])](verifiers.len)
for verifier in verifiers:
futs.add verifier.cancelAndWait()
await noCancel allFutures(futs)
raise exc
if blockRoots.len == 0:
continue
debug "Requesting detected missing blocks", blocks = shortLog(blockRoots)
let start = SyncMoment.now(0)
var workers:
array[PARALLEL_REQUESTS, Future[void].Raising([CancelledError])]
for i in 0 ..< PARALLEL_REQUESTS:
workers[i] = rman.requestBlocksByRoot(blockRoots)
await allFutures(workers)
let finish = SyncMoment.now(uint64(len(blockRoots)))
debug "Request manager block tick", blocks = shortLog(blockRoots),
sync_speed = speed(start, finish)
proc getMissingBlobs(rman: RequestManager): seq[BlobIdentifier] =
let
wallTime = rman.getBeaconTime()
wallSlot = wallTime.slotOrZero()
delay = wallTime - wallSlot.start_beacon_time()
waitDur = TimeDiff(nanoseconds: BLOB_GOSSIP_WAIT_TIME_NS)
var
fetches: seq[BlobIdentifier]
ready: seq[Eth2Digest]
for blobless in rman.quarantine[].peekBlobless():
withBlck(blobless):
when consensusFork >= ConsensusFork.Deneb:
# give blobs a chance to arrive over gossip
if forkyBlck.message.slot == wallSlot and delay < waitDur:
debug "Not handling missing blobs early in slot"
continue
if not rman.blobQuarantine[].hasBlobs(forkyBlck):
let missing = rman.blobQuarantine[].blobFetchRecord(forkyBlck)
if len(missing.indices) == 0:
warn "quarantine missing blobs, but missing indices is empty",
blk=blobless.root,
commitments=len(forkyBlck.message.body.blob_kzg_commitments)
for idx in missing.indices:
let id = BlobIdentifier(block_root: blobless.root, index: idx)
if id notin fetches:
fetches.add(id)
else:
# this is a programming error should it occur.
warn "missing blob handler found blobless block with all blobs",
blk=blobless.root,
commitments=len(forkyBlck.message.body.blob_kzg_commitments)
ready.add(blobless.root)
for root in ready:
let blobless = rman.quarantine[].popBlobless(root).valueOr:
continue
discard rman.blockVerifier(blobless, false)
fetches
proc requestManagerBlobLoop(
rman: RequestManager) {.async: (raises: [CancelledError]).} =
while true:
# TODO This polling could be replaced with an AsyncEvent that is fired
# from the quarantine when there's work to do
await sleepAsync(POLL_INTERVAL)
if rman.inhibit():
continue
let missingBlobIds = rman.getMissingBlobs()
if missingBlobIds.len == 0:
continue
# TODO This logic can be removed if the database schema is extended
# to store non-canonical heads on top of the canonical head!
# If that is done, the database no longer contains extra blocks
# that have not yet been assigned a `BlockRef`
var blobIds: seq[BlobIdentifier]
if rman.blobLoader == nil:
blobIds = missingBlobIds
else:
var
blockRoots: seq[Eth2Digest]
curRoot: Eth2Digest
for blobId in missingBlobIds:
if blobId.block_root != curRoot:
curRoot = blobId.block_root
blockRoots.add curRoot
let blob_sidecar = rman.blobLoader(blobId).valueOr:
blobIds.add blobId
if blockRoots.len > 0 and blockRoots[^1] == curRoot:
# A blob is missing, remove from list of fully available blocks
discard blockRoots.pop()
continue
debug "Loaded orphaned blob from storage", blobId
rman.blobQuarantine[].put(blob_sidecar)
var verifiers = newSeqOfCap[
Future[Result[void, VerifierError]]
.Raising([CancelledError])](blockRoots.len)
for blockRoot in blockRoots:
let blck = rman.quarantine[].popBlobless(blockRoot).valueOr:
continue
verifiers.add rman.blockVerifier(blck, maybeFinalized = false)
try:
await allFutures(verifiers)
except CancelledError as exc:
var futs = newSeqOfCap[Future[void].Raising([])](verifiers.len)
for verifier in verifiers:
futs.add verifier.cancelAndWait()
await noCancel allFutures(futs)
raise exc
if blobIds.len > 0:
debug "Requesting detected missing blobs", blobs = shortLog(blobIds)
let start = SyncMoment.now(0)
var workers:
array[PARALLEL_REQUESTS, Future[void].Raising([CancelledError])]
for i in 0 ..< PARALLEL_REQUESTS:
workers[i] = rman.fetchBlobsFromNetwork(blobIds)
await allFutures(workers)
let finish = SyncMoment.now(uint64(len(blobIds)))
debug "Request manager blob tick",
blobs_count = len(blobIds),
sync_speed = speed(start, finish)
proc getMissingDataColumns(rman: RequestManager): seq[DataColumnIdentifier] =
let
wallTime = rman.getBeaconTime()
wallSlot = wallTime.slotOrZero()
delay = wallTime - wallSlot.start_beacon_time()
const waitDur = TimeDiff(nanoseconds: DATA_COLUMN_GOSSIP_WAIT_TIME_NS)
var
fetches: seq[DataColumnIdentifier]
ready: seq[Eth2Digest]
for columnless in rman.quarantine[].peekColumnless():
withBlck(columnless):
when consensusFork >= ConsensusFork.Deneb:
# granting data columns a chance to arrive over gossip
if forkyBlck.message.slot == wallSlot and delay < waitDur:
debug "Not handling missing data columns early in slot"
continue
if not rman.dataColumnQuarantine[].hasDataColumns(forkyBlck):
let missing = rman.dataColumnQuarantine[].dataColumnFetchRecord(forkyBlck)
if len(missing.indices) == 0:
warn "quarantine is missing data columns, but missing indices are empty",
blk = columnless.root,
commitments = len(forkyBlck.message.body.blob_kzg_commitments)
for idx in missing.indices:
let id = DataColumnIdentifier(block_root: columnless.root, index: idx)
if id notin fetches:
fetches.add(id)
else:
# this is a programming error and it not should occur
warn "missing data column handler found columnless block with all data columns",
blk = columnless.root,
commitments=len(forkyBlck.message.body.blob_kzg_commitments)
ready.add(columnless.root)
for root in ready:
let columnless = rman.quarantine[].popColumnless(root).valueOr:
continue
discard rman.blockVerifier(columnless, false)
fetches
proc requestManagerDataColumnLoop(
rman: RequestManager) {.async: (raises: [CancelledError]).} =
while true:
await sleepAsync(POLL_INTERVAL)
if rman.inhibit():
continue
let missingColumnIds = rman.getMissingDataColumns()
if missingColumnIds.len == 0:
continue
var columnIds: seq[DataColumnIdentifier]
if rman.dataColumnLoader == nil:
columnIds = missingColumnIds
else:
var
blockRoots: seq[Eth2Digest]
curRoot: Eth2Digest
for columnId in missingColumnIds:
if columnId.block_root != curRoot:
curRoot = columnId.block_root
blockRoots.add curRoot
let data_column_sidecar = rman.dataColumnLoader(columnId).valueOr:
columnIds.add columnId
if blockRoots.len > 0 and blockRoots[^1] == curRoot:
# A data column is missing, remove from list of fully available data columns
discard blockRoots.pop()
continue
debug "Loaded orphaned data columns from storage", columnId
rman.dataColumnQuarantine[].put(data_column_sidecar)
var verifiers = newSeqOfCap[
Future[Result[void, VerifierError]]
.Raising([CancelledError])](blockRoots.len)
for blockRoot in blockRoots:
let blck = rman.quarantine[].popColumnless(blockRoot).valueOr:
continue
verifiers.add rman.blockVerifier(blck, maybeFinalized = false)
try:
await allFutures(verifiers)
except CancelledError as exc:
var futs = newSeqOfCap[Future[void].Raising([])](verifiers.len)
for verifier in verifiers:
futs.add verifier.cancelAndWait()
await noCancel allFutures(futs)
raise exc
if columnIds.len > 0:
debug "Requesting detected missing data columns", columns = shortLog(columnIds)
let start = SyncMoment.now(0)
var workers:
array[PARALLEL_REQUESTS_DATA_COLUMNS, Future[void].Raising([CancelledError])]
for i in 0..<PARALLEL_REQUESTS_DATA_COLUMNS:
workers[i] = rman.fetchDataColumnsFromNetwork(columnIds)
await allFutures(workers)
let finish = SyncMoment.now(uint64(len(columnIds)))
debug "Request manager data column tick",
data_columns_count = len(columnIds),
sync_speed = speed(start, finish)
proc start*(rman: var RequestManager) =
## Start Request Manager's loops.
rman.blockLoopFuture = rman.requestManagerBlockLoop()
rman.dataColumnLoopFuture = rman.requestManagerDataColumnLoop()
# rman.blobLoopFuture = rman.requestManagerBlobLoop()
proc stop*(rman: RequestManager) =
## Stop Request Manager's loop.
if not(isNil(rman.blockLoopFuture)):
rman.blockLoopFuture.cancelSoon()
if not(isNil(rman.blobLoopFuture)):
rman.blobLoopFuture.cancelSoon()
if not(isNil(rman.dataColumnLoopFuture)):
rman.dataColumnLoopFuture.cancelSoon()