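## Experimental block exchange swarm. A `Swarm` maintains a set of neighbors
## for a single dataset, exchanges block knowledge with them, and schedules
## block requests until every block in the dataset's manifest is downloaded.
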
import std/algorithm
import std/intsets
import std/sequtils
import std/sugar
import std/tables

import pkg/chronos
import pkg/libp2p
import pkg/questionable
import pkg/questionable/results

import ./network/network

import ../blocktype
import ../discovery
import ../manifest
import ../merkletree/codex
import ../stores
import ../utils/trackedfutures

logScope:
  topics = "blockexchange swarm"

const
  DefaultMinNeighbors = 40
  DefaultMaxNeighbors = 80
  DefaultMaxPendingRequests = 100
  DefaultRefreshPeerCount = 2
  DefaultAdvertisementInterval = 15.minutes
  DefaultMaxBlocksPerBatch = 160 # that's about 10 megabytes per transfer

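# Note: the "10 megabytes" estimate above assumes the default 64 KiB block
# size (160 * 64 KiB = 10 MiB); larger blocks yield larger batches.
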
type
  BlockState* = object
    # Number of outstanding requests for this block
    requests: int
    # Whether the block is already in the local store
    completed: bool

  SwarmPeerCtx* = object
    id*: PeerId
    # Blocks peer has
    blocks*: seq[bool]
    # Number of block requests outstanding to peer
    pendingRequests: int
    # Last time we refreshed block knowledge from peer
    lastRefresh: Moment
    # Pending knowledge update request
    pendingBlockKnowledgeRequest: bool

  Swarm* = ref object of RootObj
    # Dataset Manifest
    manifest: Manifest
    # Min/max neighbors into swarm
    minNeighbors: int
    maxNeighbors: int
    # Max pending requests per peer
    maxPendingRequests: int
    # How many blocks to send at once, at most, to a peer
    maxBlocksPerBatch: int
    # Download state for blocks
    blocks: seq[BlockState]
    downloadedBlocks: int
    # Peers in the swarm (can't use a table because
    # of: https://forum.nim-lang.org/t/12796)
    peers*: Table[PeerId, SwarmPeerCtx]
    # Local block store for this instance
    localStore*: BlockStore
    # Tracks futures of blockexc tasks
    trackedFutures: TrackedFutures
    # Network interface
    network*: BlockExcNetwork
    discovery*: Discovery
    peerEventHandler: PeerEventHandler
    # Completed once every block in the manifest has been downloaded
    completionHandle: Future[?!void]
    lifecycleLock: AsyncLock

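# The swarm piggybacks on the blockexc wire protocol:
#   - a want list with `full = true` and WantHave asks a peer for its full
#     block knowledge,
#   - a presence message answers it,
#   - a want list with WantBlock requests blocks,
#   - a blocks delivery message carries them back.
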
proc `$`(self: SwarmPeerCtx): string =
  return "SwarmPeerCtx(id = " & $self.id & ")"

proc cid(self: Swarm): Cid =
  return self.manifest.treeCid

proc downloadStatus*(self: Swarm): (int, int) =
  ## Returns the number of downloaded blocks and the total block count.
  return (self.downloadedBlocks, self.manifest.blocksCount)

iterator blockIndices(self: SwarmPeerCtx): int =
  ## Iterates over the indices of blocks that the peer has.
  for i in 0 ..< self.blocks.len:
    if self.blocks[i]:
      yield i

proc isDownloaded(self: Swarm): bool =
  return self.downloadedBlocks == self.manifest.blocksCount

proc updateDownloadedBlocks(self: Swarm, value: int) =
  ## Updates the number of downloaded blocks, completing the download
  ## handle if we're done.
  doAssert self.downloadedBlocks <= value and value <= self.manifest.blocksCount

  self.downloadedBlocks = value
  if self.isDownloaded():
    self.completionHandle.complete(success())

proc addNeighbor(self: Swarm, peer: PeerId) =
  trace "Adding neighbor", peer = peer
  if peer in self.peers:
    warn "Neighbor already exists and will not be added again", peer = peer
    return

  self.peers[peer] = SwarmPeerCtx(
    id: peer, lastRefresh: Moment.now(), blocks: newSeq[bool](self.manifest.blocksCount)
  )

proc resample(self: Swarm) {.async: (raises: [CancelledError]).} =
  ## Looks up providers for the dataset and dials the ones we are not
  ## already neighbors with, adding them up to `maxNeighbors`.
  var peers = await self.discovery.find(self.manifest.treeCid)
  trace "Found neighbors", count = peers.len

  peers = peers.filterIt(it.data.peerId notin self.peers)

  let dialed = await allFinished(peers.mapIt(self.network.dialPeer(it.data)))
  for i, f in dialed:
    if f.failed:
      trace "Dial peer failed", peer = peers[i].data.peerId
      await self.discovery.removeProvider(peers[i].data.peerId)
    else:
      if self.peers.len < self.maxNeighbors:
        self.addNeighbor(peers[i].data.peerId)

  trace "Swarm neighbors after resample", count = self.peers.len

proc getStalePeers*(self: Swarm, n: int = DefaultRefreshPeerCount): seq[PeerId] =
  ## Returns up to `n` peers whose block knowledge was refreshed the longest
  ## time ago.
  proc cmpStale(x, y: SwarmPeerCtx): int =
    cmp(x.lastRefresh, y.lastRefresh)

  if self.peers.len == 0:
    return @[]

  var peers = self.peers.values().toSeq
  peers.sort(cmpStale)
  return peers[0 ..< min(n, peers.len)].mapIt(it.id)

proc refreshBlockKnowledge(
    self: Swarm, peer: PeerId
): Future[void] {.async: (raises: [CancelledError]).} =
  ## Exchanges block knowledge with a neighboring peer.
  trace "Asking peer for block knowledge", peer = peer
  try:
    if self.peers[peer].pendingBlockKnowledgeRequest:
      trace "Pending knowledge update already in progress", peer = peer
      return

    trace "Marking block knowledge request as pending"

    self.peers[peer].pendingBlockKnowledgeRequest = true

    # We abuse the want list message to ask for block knowledge.
    await self.network.request.sendWantList(
      peer,
      addresses = @[init(BlockAddress, self.cid)],
      wantType = WantType.WantHave,
      full = true,
    )

    # Ideally we should only update this once we get a reply, and temporarily
    # exclude the peer from the refresh loop while it is pending.
    self.peers[peer].lastRefresh = Moment.now()
  except KeyError:
    trace "Cannot update refresh timestamp for dropped peer", peer = peer

proc neighborMaintenanceLoop*(self: Swarm): Future[void] {.async: (raises: []).} =
  try:
    trace "Starting neighbor maintenance loop", cid = self.cid
    while true:
      # Should check/remove dead peers.
      if self.peers.len < self.minNeighbors:
        trace "Too few neighbors, resampling", neighbors = self.peers.len
        # XXX need backoff when you can't get enough neighbors
        await self.resample()

        trace "Resampled, neighbors", neighbors = self.peers.len

      let peers = self.getStalePeers()
      for peer in peers:
        info "Refreshing block knowledge for peer", peer = peer
        await self.refreshBlockKnowledge(peer)

      # We should use separate refresh timers per peer and run this in an
      # event driven fashion.
      await sleepAsync(1.seconds)
  except CancelledError:
    trace "Swarm neighbor maintenance loop cancelled. Exiting."

proc advertiseLoop*(self: Swarm): Future[void] {.async: (raises: []).} =
  try:
    while true:
      trace "Advertising CID", cid = self.cid
      await self.discovery.provide(self.cid)
      trace "Advertiser going to sleep"
      await sleepAsync(DefaultAdvertisementInterval)
  except CancelledError:
    trace "Advertisement loop cancelled. Exiting."

proc loadBlockKnowledge*(self: Swarm): Future[void] {.async: (raises: []).} =
  ## Load block knowledge from local store
  ##
  info "Loading block knowledge for CID", cid = self.cid
  var totalBlocks = 0

  self.blocks.setLen(self.manifest.blocksCount)
  for blockIndex in 0 ..< self.manifest.blocksCount:
    try:
      without hasBlock =? await self.localStore.hasBlock(self.cid, blockIndex), err:
        error "Failed to check block presence", err = err.msg
        continue
      if hasBlock:
        self.blocks[blockIndex].completed = true
        totalBlocks += 1
    except CatchableError as err:
      error "Failed to check block presence", err = err.msg

  info "Loaded block knowledge for CID", cid = self.cid, count = totalBlocks

  self.updateDownloadedBlocks(totalBlocks)

proc sendBlockRequests(
    self: Swarm, peer: PeerId, requests: seq[int]
) {.async: (raw: true).} =
  return self.network.sendWantList(
    peer,
    requests.mapIt(init(BlockAddress, self.cid, it)),
    wantType = WantType.WantBlock,
  )

proc fillRequests*(self: Swarm, peer: PeerId) {.async: (raises: [CancelledError]).} =
  ## Selects the blocks to request from a neighboring peer, up to a maximum
  ## number of pending requests.
  trace "Fill request schedule for peer", peer = peer

  without peerCtx =? self.peers[peer].catch, err:
    error "Cannot fill request schedule for peer", peer = peer, err = err.msg
    return

  var
    requests: seq[int]
    requested: int = 0

  for blockIndex in peerCtx.blockIndices:
    if peerCtx.pendingRequests + requested >= self.maxPendingRequests:
      trace "Max pending requests reached for peer, not sending new ones", peer = peer
      break

    # Already have the block.
    if self.blocks[blockIndex].completed:
      continue

    # Skip busy blocks. This is not very robust - we should
    # allow busy blocks once we're done with the idle ones. We
    # also need to return failed block requests to idle state.
    if self.blocks[blockIndex].requests > 0:
      continue

    requests.add(blockIndex)
    requested += 1
    self.blocks[blockIndex].requests += 1

  try:
    # One outstanding request per scheduled block; handleBlockDelivery
    # decrements this as blocks arrive.
    self.peers[peer].pendingRequests += requested
  except KeyError:
    error "Cannot update pending requests for peer", peer = peer
    return

  trace "New request schedule for peer", peer = peer, count = requested

  if requested == 0:
    trace "Peer has no blocks of interest", peer = peer
    return

  try:
    # Request new blocks immediately.
    await sendBlockRequests(self, peer, requests)
  except CatchableError as err:
    # For now we just give up
    trace "Failed to send block requests to peer", peer = peer, err = err.msg

proc handleBlockDelivery(
    self: Swarm, peer: PeerId, blocksDelivery: seq[BlockDelivery]
) {.async: (raises: []).} =
  trace "Got blocks from peer", peer = peer, count = blocksDelivery.len
  without peerCtx =? self.peers[peer].catch, err:
    error "Cannot receive blocks from unknown peer", peer = peer
    return

  try:
    for blockDelivery in blocksDelivery:
      # Could be a duplicate receive
      if self.blocks[blockDelivery.address.index].completed:
        trace "Duplicate block received",
          blockIndex = blockDelivery.address.index, peer = peer
        continue

      # Stores block and proof. We don't validate as this is just an experiment.
      if err =? (await self.localStore.putBlock(blockDelivery.blk)).errorOption:
        error "Unable to store block", err = err.msg
        continue

      if blockDelivery.address.leaf:
        without proof =? blockDelivery.proof:
          warn "Proof expected for a leaf block delivery"
          continue
        if err =? (
          await self.localStore.putCidAndProof(
            blockDelivery.address.treeCid, blockDelivery.address.index,
            blockDelivery.blk.cid, proof,
          )
        ).errorOption:
          warn "Unable to store proof and cid for a block"
          continue

      trace "Block received", blockIndex = blockDelivery.address.index, peer = peer
      self.blocks[blockDelivery.address.index].completed = true

      try:
        self.peers[peer].pendingRequests -= 1
      except KeyError:
        error "Cannot update pending requests for peer", peer = peer
        return

      self.updateDownloadedBlocks(self.downloadedBlocks + 1)
      if self.isDownloaded():
        info "Download completed", cid = self.cid
        return

    # Got some idle space, push more requests to peer.
    await self.fillRequests(peer)
  except CatchableError as err:
    trace "Error handling block delivery", err = err.msg

proc setupPeer(self: Swarm, peer: PeerId) {.async: (raises: [CancelledError]).} =
  # If this is an outbound connection, we should already have the peer in our
  # neighbor set. If it's an inbound connection, it might be the first time we
  # see this peer.
  trace "Setting up peer", peer = peer

  self.addNeighbor(peer)
  # Starts by asking peer for block knowledge.
  await self.refreshBlockKnowledge(peer)

proc handleBlockKnowledgeRequest(self: Swarm, peer: PeerId) {.async: (raises: []).} =
  trace "Handling block knowledge request from peer", peer = peer
  var presenceInfo: seq[BlockPresence]
  for blockIndex in 0 ..< self.manifest.blocksCount:
    if self.blocks[blockIndex].completed:
      presenceInfo.add(
        BlockPresence(
          address: init(BlockAddress, self.cid, blockIndex),
          `type`: BlockPresenceType.Have,
        )
      )

  # XXX This will probably be way too expensive to keep sending, even just
  # for prototyping
  trace "Have block presences to send", count = presenceInfo.len

  try:
    await self.network.request.sendPresence(peer, presenceInfo)
  except CancelledError:
    trace "Sending of block presences cancelled", peer = peer

proc handleBlockKnowledgeResponse(
    self: Swarm, peer: PeerId, presence: seq[BlockPresence]
) {.async: (raises: []).} =
  try:
    trace "Received block knowledge from peer", peer = peer
    var count = 0
    for blockPresence in presence:
      # Only Have entries indicate presence; ignore anything else.
      if blockPresence.`type` == BlockPresenceType.Have:
        self.peers[peer].blocks[blockPresence.address.index.int] = true
        count += 1

    trace "Peer has blocks", peer = peer, count = count

    # Learned some potentially new blocks for peer, maybe we can push
    # some more requests.
    await self.fillRequests(peer)
  except KeyError:
    error "Cannot update block presence for peer", peer = peer
    return
  except CancelledError:
    trace "Sending of block requests cancelled", peer = peer
  finally:
    try:
      if not self.peers[peer].pendingBlockKnowledgeRequest:
        trace "Illegal state for pending block knowledge request (already cleared)",
          peer = peer
      self.peers[peer].pendingBlockKnowledgeRequest = false
    except KeyError:
      trace "Error updating pending block knowledge request state (peer dropped)",
        peer = peer

proc handleBlockRequest(
    self: Swarm, peer: PeerId, addresses: seq[BlockAddress]
) {.async: (raises: []).} =
  trace "Got request for blocks from peer", peer = peer, count = addresses.len

  proc localLookup(address: BlockAddress): Future[?!BlockDelivery] {.async.} =
    if address.leaf:
      (await self.localStore.getBlockAndProof(address.treeCid, address.index)).map(
        (blkAndProof: (Block, CodexProof)) =>
          BlockDelivery(
            address: address, blk: blkAndProof[0], proof: blkAndProof[1].some
          )
      )
    else:
      (await self.localStore.getBlock(address)).map(
        (blk: Block) => BlockDelivery(
          address: address, blk: blk, proof: CodexProof.none
        )
      )

  try:
    var
      blocks: seq[BlockDelivery]
      remainingSlots: int = self.maxBlocksPerBatch

    # Sends blocks in batches.
    for address in addresses:
      if remainingSlots == 0:
        trace "Sending batch of blocks to peer", peer = peer, count = blocks.len
        await self.network.request.sendBlocksDelivery(peer, blocks)
        blocks = @[]
        remainingSlots = self.maxBlocksPerBatch

      without aBlock =? (await localLookup(address)), err:
        # XXX This looks harmless but it's not. The receiving peer needs to check
        # that all requested blocks were sent back, and have a retry policy for
        # blocks that didn't get there.
        warn "Failed to lookup block", address = address
        continue
      blocks.add(aBlock)
      remainingSlots -= 1

    # Sends trailing batch.
    if blocks.len > 0:
      trace "Sending last batch of blocks to peer", peer = peer, count = blocks.len
      await self.network.request.sendBlocksDelivery(peer, blocks)
  except CatchableError as err:
    error "Failed to send blocks to peer", err = err.msg
    return

proc dropPeer*(self: Swarm, peer: PeerId) {.raises: [].} =
  ## Cleanup disconnected peer
  ##
  trace "Dropping peer", peer

  # drop the peer from the peers table
  self.peers.del(peer)

# Since this is all very fragile, we add checks to make sure the messages we're
# getting are for the right swarm. This is especially important as we run iterated
# experiments and replace the handlers for each new download.
proc checkBlockAddress(self: Swarm, address: BlockAddress): bool =
  if address.leaf:
    return address.treeCid == self.manifest.treeCid
  else:
    return address.cid == self.manifest.treeCid

proc checkAddresses[T](self: Swarm, messages: seq[T]): bool =
  return messages.allIt(checkBlockAddress(self, it.address))

proc installEventHandlers(self: Swarm) =
  proc peerEventHandler(
      peerId: PeerId, event: PeerEvent
  ): Future[void] {.gcsafe, async: (raises: [CancelledError]).} =
    if event.kind == PeerEventKind.Joined:
      await self.setupPeer(peerId)
    else:
      self.dropPeer(peerId)

  self.peerEventHandler = peerEventHandler

  proc wantListHandler(peer: PeerId, wantList: WantList) {.async: (raises: []).} =
    if not self.checkAddresses(wantList.entries):
      return

    if wantList.full:
      await self.handleBlockKnowledgeRequest(peer)
    else:
      await self.handleBlockRequest(peer, wantList.entries.mapIt(it.address))

  proc blocksPresenceHandler(
      peer: PeerId, presence: seq[BlockPresence]
  ) {.async: (raises: []).} =
    if not self.checkAddresses(presence):
      return

    await self.handleBlockKnowledgeResponse(peer, presence)

  proc blocksDeliveryHandler(
      peer: PeerId, blocksDelivery: seq[BlockDelivery]
  ): Future[void] {.async: (raises: []).} =
    if not self.checkAddresses(blocksDelivery):
      return

    await self.handleBlockDelivery(peer, blocksDelivery)

  if not isNil(self.network.switch):
    self.network.switch.addPeerEventHandler(self.peerEventHandler, PeerEventKind.Joined)
    self.network.switch.addPeerEventHandler(self.peerEventHandler, PeerEventKind.Left)

  self.network.handlers = BlockExcHandlers(
    onWantList: wantListHandler,
    onBlocksDelivery: blocksDeliveryHandler,
    onPresence: blocksPresenceHandler,
    onAccount: nil,
    onPayment: nil,
  )

proc uninstallEventHandlers(self: Swarm) =
  if not isNil(self.network.switch):
    self.network.switch.removePeerEventHandler(
      self.peerEventHandler, PeerEventKind.Joined
    )
    self.network.switch.removePeerEventHandler(
      self.peerEventHandler, PeerEventKind.Left
    )

  self.network.handlers = BlockExcHandlers(
    onWantList: nil,
    onBlocksDelivery: nil,
    onPresence: nil,
    onAccount: nil,
    onPayment: nil,
  )

proc start*(self: Swarm): Future[void] {.async: (raises: []).} =
  trace "Initializing swarm, loading block knowledge", cid = self.cid
  try:
    await self.lifecycleLock.acquire()
    await self.loadBlockKnowledge()

    trace "Joining swarm."
    self.installEventHandlers()
    # Bootstraps the maintenance and advertisement loops.
    self.trackedFutures.track(self.neighborMaintenanceLoop())
    self.trackedFutures.track(self.advertiseLoop())
  except CancelledError:
    return
  finally:
    try:
      self.lifecycleLock.release()
    except AsyncLockError as err:
      # This is probably serious enough that I should raise defect in production code.
      error "Failed to release lock, stopping the swarm might fail", err = err.msg

proc stop*(self: Swarm): Future[void] {.async: (raises: []).} =
  trace "Stopping event loops and uninstalling handlers"
  try:
    await self.lifecycleLock.acquire()
    # We should probably have a way to actively inform the DHT tracker
    # that we're leaving.
    await self.trackedFutures.cancelTracked()
    # Messages that arrive after this will be ignored (or
    # might be delivered to another swarm if we restart a download).
    self.uninstallEventHandlers()
    trace "Left swarm"
  except CancelledError:
    return
  finally:
    try:
      self.lifecycleLock.release()
    except AsyncLockError as err:
      error "Failed to release lock. Restarting this swarm might fail", err = err.msg

proc new*(
    T: type Swarm,
    dataset: Manifest,
    localStore: BlockStore,
    network: BlockExcNetwork,
    discovery: Discovery,
): Swarm =
  return Swarm(
    manifest: dataset,
    localStore: localStore,
    network: network,
    discovery: discovery,
    minNeighbors: DefaultMinNeighbors,
    maxNeighbors: DefaultMaxNeighbors,
    maxPendingRequests: DefaultMaxPendingRequests,
    maxBlocksPerBatch: DefaultMaxBlocksPerBatch,
    downloadedBlocks: 0,
    trackedFutures: TrackedFutures.new(),
    completionHandle: newFuture[?!void]("codex.blockexchange.swarm.start"),
    lifecycleLock: newAsyncLock(),
  )

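# Minimal usage sketch, assuming a `manifest`, `store`, `network` and
# `discovery` have been set up elsewhere (none are provided by this module):
#
#   let swarm = Swarm.new(manifest, store, network, discovery)
#   await swarm.start()
#   while true: # poll until the dataset is complete
#     let (downloaded, total) = swarm.downloadStatus()
#     if downloaded == total:
#       break
#     await sleepAsync(1.seconds)
#   await swarm.stop()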