From 25a2b3e9ed8a60a352a1d1cab21f2257f263d021 Mon Sep 17 00:00:00 2001 From: Chrysostomos Nanakos Date: Fri, 26 Sep 2025 15:05:17 +0300 Subject: [PATCH] feat: add strategic runtime metrics for block exchange monitoring - Add codex_block_exchange_discovery_requests_total counter to track peer discovery frequency - Add codex_block_exchange_peer_timeouts_total counter to monitor peer reliability issues - Add codex_block_exchange_requests_failed_total counter to track request failure rates --- codex/blockexchange/engine/engine.nim | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/codex/blockexchange/engine/engine.nim b/codex/blockexchange/engine/engine.nim index 4952b638..12970917 100644 --- a/codex/blockexchange/engine/engine.nim +++ b/codex/blockexchange/engine/engine.nim @@ -69,6 +69,18 @@ declareCounter( codex_block_exchange_spurious_blocks_received, "codex blockexchange unrequested/duplicate blocks received", ) +declareCounter( + codex_block_exchange_discovery_requests_total, + "Total number of peer discovery requests sent", +) +declareCounter( + codex_block_exchange_peer_timeouts_total, "Total number of peer activity timeouts" +) +declareCounter( + codex_block_exchange_requests_failed_total, + "Total number of block requests that failed after exhausting retries" +) + const DefaultMaxPeersPerRequest* = 10 @@ -211,6 +223,7 @@ proc refreshBlockKnowledge(self: BlockExcEngine) {.async: (raises: [CancelledErr proc searchForNewPeers(self: BlockExcEngine, cid: Cid) = if self.lastDiscRequest + DiscoveryRateLimit < Moment.now(): trace "Searching for new peers for", cid = cid + codex_block_exchange_discovery_requests_total.inc() self.lastDiscRequest = Moment.now() # always refresh before calling await! self.discovery.queueFindBlocksReq(@[cid]) else: @@ -246,6 +259,7 @@ proc downloadInternal( if self.pendingBlocks.retriesExhausted(address): trace "Error retries exhausted" + codex_block_exchange_requests_failed_total.inc() handle.fail(newException(RetriesExhaustedError, "Error retries exhausted")) break @@ -310,6 +324,7 @@ proc downloadInternal( else: # If the peer timed out, retries immediately. trace "Peer timed out during block request", peer = scheduledPeer.id + codex_block_exchange_peer_timeouts_total.inc() await self.network.dropPeer(scheduledPeer.id) # Evicts peer immediately or we may end up picking it again in the # next retry. @@ -320,6 +335,7 @@ proc downloadInternal( await handle.cancelAndWait() except RetriesExhaustedError as exc: warn "Retries exhausted for block", address, exc = exc.msg + codex_block_exchange_requests_failed_total.inc() if not handle.finished: handle.fail(exc) finally: