Chrysostomos Nanakos bb6ab1befa
chore: Block exchange protocol rewrite (#1411)
Signed-off-by: Chrysostomos Nanakos <chris@include.gr>
2026-04-25 00:37:42 +00:00

335 lines
9.4 KiB
Nim

## Logos Storage
## Copyright (c) 2026 Status Research & Development GmbH
## Licensed under either of
## * Apache License, version 2.0, ([LICENSE-APACHE](LICENSE-APACHE))
## * MIT license ([LICENSE-MIT](LICENSE-MIT))
## at your option.
## This file may not be copied, modified, or distributed except according to
## those terms.
import std/[tables, sets, options, random]
import pkg/chronos
import pkg/libp2p/peerid
import ../peers/peerctxstore
import ../peers/peerstats
import ../types
import ../../logutils
import ./peertracker
export peerctxstore, types
# Seed the default random number generator once at module initialisation;
# `selectByBDP` draws from it via `rand`.
randomize()

# Logging scope applied to the log statements of this module.
logScope:
  topics = "logos-storage swarm"
const
  DefaultDeltaMin* = 2
    ## Default `SwarmConfig.deltaMin`: `peersNeeded` reports `shBelowMin`
    ## while fewer non-stale peers than this are tracked.
  DefaultDeltaMax* = 16
    ## Default `SwarmConfig.deltaMax`: `addPeer` refuses peers once this many
    ## are tracked.
  DefaultDeltaTarget* = 8
    ## Default `SwarmConfig.deltaTarget`: `peersNeeded` reports
    ## `shBelowTarget` while fewer non-stale peers than this are tracked.
  PeerStaleTimeout* = 30.seconds
    ## A peer whose `lastSeen` is older than this is considered stale.
  PeerDefaultMaxFailures*: uint32 = 2
    ## Default `SwarmConfig.maxPeerFailures`: failure count at which
    ## `recordPeerFailure` asks for the peer to be removed.
  PeerDefaultMaxTimeouts*: uint32 = 5
    ## Default `SwarmConfig.maxPeerTimeouts`: timeout count at which
    ## `recordPeerTimeout` asks for the peer to be removed.
  ExplorationProbability* = 0.2
    ## Default `explorationProb` of `selectByBDP`.
  TimeoutPenaltyWeight* = 3.0
    ## Multiplied by a peer's `timeoutCount` to obtain the penalty that
    ## `selectPeerForBatch` hands to `selectByBDP`.
type
  SwarmPeer* = ref object
    ## Per-peer bookkeeping kept by a `Swarm`.
    availability*: BlockAvailability ## Availability merged so far for the peer.
    lastSeen*: Moment ## Set on creation and by `touch`; read by `isStale`.
    availabilityUpdated*: Moment ## Set on creation and by `updateAvailability`.
    failureCount*: uint32 ## Incremented by `recordFailure`.
    timeoutCount*: uint32 ## Incremented by `recordTimeout`.

  SwarmConfig* = object
    ## Tunables of a `Swarm`; see `defaultConfig` for the defaults.
    deltaMin*: int ## Below this many active peers the swarm is `shBelowMin`.
    deltaMax*: int ## `addPeer` rejects peers once this many are tracked.
    deltaTarget*: int ## Below this many active peers: `shBelowTarget`.
    maxPeerFailures*: uint32 ## Threshold used by `recordPeerFailure`.
    maxPeerTimeouts*: uint32 ## Threshold used by `recordPeerTimeout`.

  SwarmHealth* = enum
    ## Classification returned by `peersNeeded`.
    shHealthy ## Active peer count is at least `deltaTarget`.
    shBelowTarget ## Active peer count is below `deltaTarget`.
    shBelowMin ## Active peer count is below `deltaMin`.

  PeerSelectionKind* = enum
    ## Outcome tag of `selectPeerForBatch`.
    pskFound ## A peer was selected.
    pskAtCapacity ## Candidates exist but none was left after filtering.
    pskNoPeers ## No peer advertises any block of the requested range.

  PeerSelection* = object
    ## Result of `selectPeerForBatch`; `peer` is only present for `pskFound`.
    case kind*: PeerSelectionKind
    of pskFound:
      peer*: PeerContext
    of pskAtCapacity, pskNoPeers:
      discard

  Swarm* = ref object
    ## Set of tracked peers together with the peers that were banned.
    config*: SwarmConfig
    peers: Table[PeerId, SwarmPeer] ## Tracked peers, keyed by peer id.
    removedPeers: HashSet[PeerId] ## Ids given to `banPeer`; `addPeer` rejects them.
proc new*(T: type SwarmPeer, availability: BlockAvailability): SwarmPeer =
  ## Creates the bookkeeping record for a peer with the given `availability`.
  ## Both timestamps are set to the current moment and both counters start
  ## at zero.
  let createdAt = Moment.now()
  result = SwarmPeer(
    availability: availability, lastSeen: createdAt, availabilityUpdated: createdAt
  )
  # `failureCount` and `timeoutCount` are zero-initialised by the constructor.
proc isStale*(peer: SwarmPeer): bool =
  ## True when more than `PeerStaleTimeout` has passed since `peer.lastSeen`.
  let idleFor = Moment.now() - peer.lastSeen
  idleFor > PeerStaleTimeout
proc touch*(peer: SwarmPeer) =
  ## Marks the peer as seen right now.
  let seenAt = Moment.now()
  peer.lastSeen = seenAt
proc updateAvailability*(peer: SwarmPeer, availability: BlockAvailability) =
  ## Merges `availability` into what is already recorded for the peer,
  ## stamps the update time and refreshes `lastSeen`.
  let merged = peer.availability.merge(availability)
  peer.availability = merged
  peer.availabilityUpdated = Moment.now()
  touch(peer)
proc recordFailure*(peer: SwarmPeer) =
  ## Counts one more failure against the peer.
  inc peer.failureCount
proc recordTimeout*(peer: SwarmPeer) =
  ## Counts one more timeout against the peer.
  inc peer.timeoutCount
proc resetFailures*(peer: SwarmPeer) =
  ## Clears both the timeout and the failure counter of the peer.
  peer.timeoutCount = 0'u32
  peer.failureCount = 0'u32
proc defaultConfig*(_: type SwarmConfig): SwarmConfig =
  ## Configuration populated from the module-level `Default*` /
  ## `PeerDefault*` constants.
  result.deltaMin = DefaultDeltaMin
  result.deltaMax = DefaultDeltaMax
  result.deltaTarget = DefaultDeltaTarget
  result.maxPeerFailures = PeerDefaultMaxFailures
  result.maxPeerTimeouts = PeerDefaultMaxTimeouts
proc new*(T: type Swarm, config: SwarmConfig = SwarmConfig.defaultConfig()): Swarm =
  ## Creates an empty swarm (no tracked peers, no banned peers) using
  ## `config`, which defaults to `SwarmConfig.defaultConfig()`.
  result = Swarm(config: config)
  result.peers = initTable[PeerId, SwarmPeer]()
  result.removedPeers = initHashSet[PeerId]()
proc addPeer*(swarm: Swarm, peerId: PeerId, availability: BlockAvailability): bool =
  ## Starts tracking `peerId` with the given `availability`.
  ##
  ## Returns false, leaving the swarm untouched, when the peer was banned or
  ## when `config.deltaMax` peers are already tracked. Otherwise a fresh
  ## `SwarmPeer` is stored under `peerId` - replacing any entry already
  ## present for that id - and true is returned.
  let
    banned = peerId in swarm.removedPeers
    full = swarm.peers.len >= swarm.config.deltaMax
  if banned or full:
    return false

  swarm.peers[peerId] = SwarmPeer.new(availability)
  true
proc removePeer*(swarm: Swarm, peerId: PeerId): Option[SwarmPeer] =
  ## Stops tracking `peerId`. Returns the removed record, or `none` when the
  ## peer was not tracked.
  var evicted: SwarmPeer
  if swarm.peers.pop(peerId, evicted):
    some(evicted)
  else:
    none(SwarmPeer)
proc banPeer*(swarm: Swarm, peerId: PeerId) =
  ## Remembers `peerId` as removed, so later `addPeer` calls reject it, and
  ## drops it from the tracked peers.
  swarm.removedPeers.incl(peerId)
  swarm.peers.del(peerId)
proc getPeer*(swarm: Swarm, peerId: PeerId): Option[SwarmPeer] =
  ## Looks up the record tracked for `peerId`; `none` when it is not tracked.
  # Stored records are never nil (they are created by `SwarmPeer.new`), so a
  # nil default reliably signals a missing key.
  let found = swarm.peers.getOrDefault(peerId)
  if found.isNil:
    none(SwarmPeer)
  else:
    some(found)
proc updatePeerAvailability*(
    swarm: Swarm, peerId: PeerId, availability: BlockAvailability
) =
  ## Merges `availability` into the record of `peerId`; does nothing when
  ## the peer is not tracked.
  let known = swarm.peers.getOrDefault(peerId)
  if known.isNil:
    return
  known.updateAvailability(availability)
proc recordPeerFailure*(swarm: Swarm, peerId: PeerId): bool =
  ## Counts a failure against `peerId`.
  ##
  ## Returns true if the peer should be removed, i.e. its failure count has
  ## reached `config.maxPeerFailures`. Untracked peers yield false.
  let tracked = swarm.peers.getOrDefault(peerId)
  if tracked.isNil:
    return false

  tracked.recordFailure()
  tracked.failureCount >= swarm.config.maxPeerFailures
proc recordPeerTimeout*(swarm: Swarm, peerId: PeerId): bool =
  ## Counts a timeout against `peerId`.
  ##
  ## Returns true if the peer should be removed, i.e. its timeout count has
  ## reached `config.maxPeerTimeouts`. Untracked peers yield false.
  swarm.peers.withValue(peerId, entry):
    entry[].recordTimeout()
    result = entry[].timeoutCount >= swarm.config.maxPeerTimeouts
  do:
    result = false
proc recordBatchSuccess*(
    swarm: Swarm, peer: PeerContext, rttMicros: uint64, totalBytes: uint64
) =
  ## Registers a successfully completed batch for `peer`: if the peer is
  ## tracked by the swarm its failure/timeout counters are cleared and its
  ## `lastSeen` is refreshed; the request is then recorded in `peer.stats`
  ## with the measured round-trip time and byte count.
  swarm.peers.withValue(peer.id, swarmPeer):
    swarmPeer[].resetFailures()
    swarmPeer[].touch()
  # NOTE(review): the source's indentation was lost; this call is assumed to
  # run whether or not the peer is tracked by the swarm - confirm.
  peer.stats.recordRequest(rttMicros, totalBytes)
proc activePeerCount*(swarm: Swarm): int =
  ## Number of tracked peers that are not stale.
  var live = 0
  for entry in swarm.peers.values:
    if entry.isStale:
      continue
    inc live
  live
proc peerCount*(swarm: Swarm): int =
  ## Number of tracked peers, stale ones included.
  len(swarm.peers)
proc peersNeeded*(swarm: Swarm): SwarmHealth =
  ## Classifies the swarm by its number of active (non-stale) peers:
  ## `shBelowMin` under `config.deltaMin`, `shBelowTarget` under
  ## `config.deltaTarget`, `shHealthy` otherwise.
  let live = swarm.activePeerCount()
  if live < swarm.config.deltaMin:
    return shBelowMin
  if live < swarm.config.deltaTarget:
    return shBelowTarget
  shHealthy
proc connectedPeers*(swarm: Swarm): seq[PeerId] =
  ## Ids of all tracked peers, stale ones included.
  result = newSeqOfCap[PeerId](swarm.peers.len)
  for id in swarm.peers.keys:
    result.add(id)
proc peersWithRange*(swarm: Swarm, start: uint64, count: uint64): seq[PeerId] =
  ## Ids of the non-stale peers whose availability reports the range given
  ## by `start` and `count` via `hasRange`.
  for id, entry in swarm.peers:
    if entry.isStale:
      continue
    if entry.availability.hasRange(start, count):
      result.add(id)
proc peersWithAnyInRange*(swarm: Swarm, start: uint64, count: uint64): seq[PeerId] =
  ## Ids of the non-stale peers whose availability reports a hit for the
  ## range given by `start` and `count` via `hasAnyInRange`.
  for id, entry in swarm.peers:
    let fresh = not entry.isStale
    if fresh and entry.availability.hasAnyInRange(start, count):
      result.add(id)
proc staleUnknownPeers*(swarm: Swarm): seq[PeerId] =
  ## Ids of the peers that are stale and whose availability kind is still
  ## `bakUnknown`.
  for id, entry in swarm.peers:
    if not entry.isStale:
      continue
    if entry.availability.kind == bakUnknown:
      result.add(id)
proc selectByBDP*(
    peers: seq[PeerContext],
    batchBytes: uint64,
    tracker: PeerInFlightTracker,
    penalties: var Table[PeerId, float],
    explorationProb: float = ExplorationProbability,
): Option[PeerContext] {.gcsafe, raises: [].} =
  ## Picks one peer out of `peers` for a batch of `batchBytes` bytes.
  ##
  ## * no peers -> `none`; exactly one peer -> that peer, unconditionally.
  ## * Peers without a throughput measurement that still have pipeline
  ##   capacity are preferred; the least loaded of them wins (first one on
  ##   ties).
  ## * Otherwise, with probability `explorationProb`, a random peer among
  ##   those with spare pipeline capacity is returned, if there is any.
  ## * Otherwise the peer with the lowest `evalBDPScore` wins; ties are
  ##   broken randomly.
  ##
  ## `penalties` supplies the per-peer penalty passed to the score (0.0 when
  ## a peer has no entry); `tracker` supplies the in-flight count per peer.

  template hasCapacity(p: PeerContext): bool =
    # In-flight count is below the peer's pipeline depth for this batch size.
    tracker.count(p.id) < p.optimalPipelineDepth(batchBytes)

  template scoreOf(p: PeerContext): untyped =
    p.evalBDPScore(batchBytes, tracker.count(p.id), penalties.getOrDefault(p.id, 0.0))

  case peers.len
  of 0:
    return none(PeerContext)
  of 1:
    return some(peers[0])
  else:
    discard

  # Phase 1: give unmeasured peers a chance so that they acquire statistics.
  var untried: seq[PeerContext]
  for candidate in peers:
    if candidate.stats.throughputBps().isSome:
      continue
    if hasCapacity(candidate):
      untried.add(candidate)

  if untried.len > 0:
    var
      pick = 0
      pickLoad = tracker.count(untried[0].id)
    for idx in 1 .. untried.high:
      let load = tracker.count(untried[idx].id)
      if load < pickLoad:
        pickLoad = load
        pick = idx
    return some(untried[pick])

  # Phase 2: occasional exploration among peers with spare capacity.
  if rand(1.0) < explorationProb:
    var spare: seq[PeerContext]
    for candidate in peers:
      if hasCapacity(candidate):
        spare.add(candidate)
    if spare.len > 0:
      return some(spare[rand(spare.high)])

  # Phase 3: lowest score wins; equal scores are collected and drawn from.
  var
    lowest = scoreOf(peers[0])
    tied = @[peers[0]]
  for idx in 1 .. peers.high:
    let
      candidate = peers[idx]
      current = scoreOf(candidate)
    if current < lowest:
      lowest = current
      tied = @[candidate]
    elif current == lowest:
      tied.add(candidate)

  if tied.len == 1:
    some(tied[0])
  else:
    some(tied[rand(tied.high)])
proc collectTimeoutPenalties(swarm: Swarm): Table[PeerId, float] =
  ## Scoring penalty for every tracked peer with at least one recorded
  ## timeout: `timeoutCount * TimeoutPenaltyWeight`.
  for peerId, swarmPeer in swarm.peers:
    if swarmPeer.timeoutCount > 0:
      result[peerId] = swarmPeer.timeoutCount.float * TimeoutPenaltyWeight

proc selectFromCandidates(
    swarm: Swarm,
    peers: PeerContextStore,
    candidates: seq[PeerId],
    batchBytes: uint64,
    tracker: PeerInFlightTracker,
): PeerSelection =
  ## Resolves `candidates` to peer contexts, keeps those with spare pipeline
  ## capacity and lets `selectByBDP` choose among them.
  var peerCtxs: seq[PeerContext]
  for peerId in candidates:
    let peer = peers.get(peerId)
    if peer.isNil:
      # peer disconnected - remove from swarm immediately
      discard swarm.removePeer(peerId)
      continue
    if tracker.count(peerId) < peer.optimalPipelineDepth(batchBytes):
      peerCtxs.add(peer)

  if peerCtxs.len == 0:
    # NOTE(review): this is also reached when every candidate turned out to
    # be disconnected (and was removed above), not only when all are busy.
    return PeerSelection(kind: pskAtCapacity)

  # Penalties are only needed once a choice between peers is actually made.
  var penalties = swarm.collectTimeoutPenalties()
  let selected = selectByBDP(peerCtxs, batchBytes, tracker, penalties)
  if selected.isSome:
    PeerSelection(kind: pskFound, peer: selected.get())
  else:
    PeerSelection(kind: pskNoPeers)

proc selectPeerForBatch*(
    swarm: Swarm,
    peers: PeerContextStore,
    start: uint64,
    count: uint64,
    batchBytes: uint64,
    tracker: PeerInFlightTracker,
): PeerSelection =
  ## Chooses the peer that should serve the batch covering the range given
  ## by `start` and `count`.
  ##
  ## Non-stale peers reporting the full range (`peersWithRange`) are
  ## preferred; only when there is none are peers reporting part of it
  ## (`peersWithAnyInRange`) considered. Candidates without a context in
  ## `peers` are dropped from the swarm, and candidates whose in-flight count
  ## in `tracker` has reached their pipeline depth for `batchBytes` are
  ## skipped.
  ##
  ## Returns:
  ## * `pskFound` with the selected peer,
  ## * `pskAtCapacity` when candidates exist but none is left after
  ##   filtering,
  ## * `pskNoPeers` when no tracked peer reports any block of the range.
  let fullCandidates = swarm.peersWithRange(start, count)
  if fullCandidates.len > 0:
    return swarm.selectFromCandidates(peers, fullCandidates, batchBytes, tracker)

  let partialCandidates = swarm.peersWithAnyInRange(start, count)
  trace "No full range peers, checking partial",
    start = start, count = count, partialPeers = partialCandidates.len
  if partialCandidates.len == 0:
    return PeerSelection(kind: pskNoPeers)

  swarm.selectFromCandidates(peers, partialCandidates, batchBytes, tracker)