feat: add strategic runtime metrics for block exchange monitoring

- Add codex_block_exchange_discovery_requests_total counter to track peer
  discovery frequency
- Add codex_block_exchange_peer_timeouts_total counter to monitor peer
  reliability issues
- Add codex_block_exchange_requests_failed_total counter to track request
  failure rates
This commit is contained in:
Chrysostomos Nanakos 2025-09-26 15:05:17 +03:00
parent 7f3004b5c0
commit 25a2b3e9ed
No known key found for this signature in database

View File

@ -69,6 +69,18 @@ declareCounter(
codex_block_exchange_spurious_blocks_received,
"codex blockexchange unrequested/duplicate blocks received",
)
# Runtime counters for monitoring the block exchange engine.

# Bumped by `searchForNewPeers` each time a find-blocks discovery request
# is queued (i.e. when the discovery rate limit allows it).
declareCounter(
  codex_block_exchange_discovery_requests_total,
  "Total number of peer discovery requests sent",
)

# Bumped by `downloadInternal` when a scheduled peer times out during a
# block request, right before that peer is dropped.
declareCounter(
  codex_block_exchange_peer_timeouts_total,
  "Total number of peer activity timeouts",
)

# Bumped by `downloadInternal` when a block request gives up because its
# retries are exhausted.
declareCounter(
  codex_block_exchange_requests_failed_total,
  "Total number of block requests that failed after exhausting retries",
)
const
DefaultMaxPeersPerRequest* = 10
@ -211,6 +223,7 @@ proc refreshBlockKnowledge(self: BlockExcEngine) {.async: (raises: [CancelledErr
proc searchForNewPeers(self: BlockExcEngine, cid: Cid) =
if self.lastDiscRequest + DiscoveryRateLimit < Moment.now():
trace "Searching for new peers for", cid = cid
codex_block_exchange_discovery_requests_total.inc()
self.lastDiscRequest = Moment.now() # always refresh before calling await!
self.discovery.queueFindBlocksReq(@[cid])
else:
@ -246,6 +259,7 @@ proc downloadInternal(
if self.pendingBlocks.retriesExhausted(address):
trace "Error retries exhausted"
codex_block_exchange_requests_failed_total.inc()
handle.fail(newException(RetriesExhaustedError, "Error retries exhausted"))
break
@ -310,6 +324,7 @@ proc downloadInternal(
else:
# If the peer timed out, retry immediately.
trace "Peer timed out during block request", peer = scheduledPeer.id
codex_block_exchange_peer_timeouts_total.inc()
await self.network.dropPeer(scheduledPeer.id)
# Evict the peer immediately, or we may end up picking it again on the
# next retry.
@ -320,6 +335,7 @@ proc downloadInternal(
await handle.cancelAndWait()
except RetriesExhaustedError as exc:
warn "Retries exhausted for block", address, exc = exc.msg
codex_block_exchange_requests_failed_total.inc()
if not handle.finished:
handle.fail(exc)
finally: