Evict peer instead of abrupt disconnect and avoid sending unnecessary store requests (#3857)

* peer manager no longer abruptly disconnects peers that have ongoing service-protocol streams
* fix: recv_service delivers store-recovered messages (#3805)
* recv_service now delivers store-recovered messages via MessageReceivedEvent
This commit is contained in:
Ivan FB 2026-05-07 17:28:30 +02:00 committed by GitHub
parent 04b8e8c2a8
commit 35da224d5d
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
9 changed files with 335 additions and 93 deletions

View File

@ -5,4 +5,5 @@ import
./test_node_conf,
./test_api_send,
./test_api_subscription,
./test_api_receive,
./test_api_health

View File

@ -0,0 +1,193 @@
{.used.}
import std/[options, sequtils, net, sets]
import chronos, testutils/unittests, stew/byteutils
import libp2p/[peerid, peerinfo, crypto/crypto]
import ../testlib/[common, wakucore, wakunode, testasync]
import ../waku_archive/archive_utils
import
waku,
waku/[
waku_node,
waku_core,
common/broker/broker_context,
events/message_events,
waku_relay/protocol,
waku_archive,
waku_archive/common as archive_common,
node/delivery_service/delivery_service,
node/delivery_service/recv_service,
]
import waku/factory/waku_conf
import tools/confutils/cli_args
const TestTimeout = chronos.seconds(60)
# Test helper that registers a MessageReceivedEvent listener and lets the test
# await delivery of an expected number of messages.
type ReceiveEventListenerManager = ref object
  brokerCtx: BrokerContext # broker context the listener is registered on
  receivedListener: MessageReceivedEventListener # handle needed for dropListener in teardown()
  receivedEvent: AsyncEvent # fired once targetCount messages have been collected
  receivedMessages: seq[WakuMessage] # messages observed so far, in arrival order
  targetCount: int # number of messages to wait for before firing receivedEvent
proc newReceiveEventListenerManager(
    brokerCtx: BrokerContext, expectedCount: int = 1
): ReceiveEventListenerManager =
  ## Builds a manager that collects messages delivered through
  ## MessageReceivedEvent and fires its internal AsyncEvent once
  ## `expectedCount` messages have been observed.
  let mgr = ReceiveEventListenerManager(
    brokerCtx: brokerCtx,
    receivedMessages: newSeq[WakuMessage](),
    targetCount: expectedCount,
    receivedEvent: newAsyncEvent(),
  )
  # The listener closure captures `mgr` (a ref), so later message arrivals
  # mutate the same accumulator the test polls.
  mgr.receivedListener = MessageReceivedEvent
    .listen(
      brokerCtx,
      proc(event: MessageReceivedEvent) {.async: (raises: []).} =
        mgr.receivedMessages.add(event.message)
        if mgr.receivedMessages.len >= mgr.targetCount:
          mgr.receivedEvent.fire(),
    )
    .expect("Failed to listen to MessageReceivedEvent")
  mgr
proc teardown(manager: ReceiveEventListenerManager) =
  ## Detach the MessageReceivedEvent listener registered at construction time
  ## so it does not leak into subsequent tests sharing the broker context.
  MessageReceivedEvent.dropListener(manager.brokerCtx, manager.receivedListener)
proc waitForEvents(
    manager: ReceiveEventListenerManager, timeout: Duration
): Future[bool] {.async.} =
  ## Waits until the expected number of messages has been received.
  ## Returns false if `timeout` elapses first.
  let fired = manager.receivedEvent.wait()
  result = await fired.withTimeout(timeout)
proc createApiNodeConf(numShards: uint16 = 1): WakuNodeConf =
  ## Returns a Core-mode WakuNodeConf for tests: cluster 3, `numShards`
  ## shards, ephemeral ports, reliability (store recovery) on, REST off.
  result = defaultWakuNodeConf().valueOr:
    raiseAssert error
  result.mode = cli_args.WakuMode.Core
  result.listenAddress = parseIpAddress("0.0.0.0")
  # Port 0 lets the OS pick free TCP/UDP ports, avoiding clashes between tests.
  result.tcpPort = Port(0)
  result.discv5UdpPort = Port(0)
  result.clusterId = 3'u16
  result.numShardsInNetwork = numShards
  result.reliabilityEnabled = true
  result.rest = false
suite "Messaging API, Receive Service (store recovery)":
asyncTest "recv_service delivers store-recovered messages via MessageReceivedEvent":
## Message gets archived before subscriber exists, checkStore() recovers it.
## This is a regression test: it proves that messages recovered via store by
## the RecvService (instead of receiving via a live relay sub) are actually
## delivered via the MessageReceivedEvent API.
let numShards: uint16 = 1
let shards = @[PubsubTopic("/waku/2/rs/3/0")]
let shard = shards[0]
let testTopic = ContentTopic("/waku/2/recv-test/proto")
# no-op relay handler: nodes subscribe only so they join the gossipsub mesh
proc dummyHandler(topic: PubsubTopic, msg: WakuMessage) {.async, gcsafe.} =
discard
# store node has archive, store, relay
# it archives messages from relay and serves them to the
# subscriber's store client when it comes up (later)
var storeNode: WakuNode
lockNewGlobalBrokerContext:
storeNode =
newTestWakuNode(generateSecp256k1Key(), parseIpAddress("0.0.0.0"), Port(0))
storeNode.mountMetadata(3, toSeq(0'u16 ..< numShards)).expect(
"Failed to mount metadata on storeNode"
)
(await storeNode.mountRelay()).expect("Failed to mount relay on storeNode")
let archiveDriver = newSqliteArchiveDriver()
storeNode.mountArchive(archiveDriver).expect("Failed to mount archive")
await storeNode.mountStore()
await storeNode.mountLibp2pPing()
await storeNode.start()
for s in shards:
storeNode.subscribe((kind: PubsubSub, topic: s), dummyHandler).expect(
"Failed to sub storeNode"
)
let storeNodePeerInfo = storeNode.peerInfo.toRemotePeerInfo()
# publisher node (relay)
var publisher: WakuNode
lockNewGlobalBrokerContext:
publisher =
newTestWakuNode(generateSecp256k1Key(), parseIpAddress("0.0.0.0"), Port(0))
publisher.mountMetadata(3, toSeq(0'u16 ..< numShards)).expect(
"Failed to mount metadata on publisher"
)
(await publisher.mountRelay()).expect("Failed to mount relay on publisher")
await publisher.mountLibp2pPing()
await publisher.start()
for s in shards:
publisher.subscribe((kind: PubsubSub, topic: s), dummyHandler).expect(
"Failed to sub publisher"
)
# connect publisher to store so messages get archived
await publisher.connectToNodes(@[storeNodePeerInfo])
# wait for relay mesh
# polls up to 50 x 100 ms (~5 s); proceeds even if the mesh never forms
for _ in 0 ..< 50:
if publisher.wakuRelay.getNumPeersInMesh(shard).valueOr(0) > 0:
break
await sleepAsync(100.milliseconds)
# publish before subscriber exists, gets archived
let missedPayload = "This message was missed".toBytes()
let missedMsg = WakuMessage(
payload: missedPayload, contentTopic: testTopic, version: 0, timestamp: now()
)
discard (await publisher.publish(some(shard), missedMsg)).expect(
"Publish missed msg failed"
)
# wait for archive
block waitArchive:
for _ in 0 ..< 50:
let query = archive_common.ArchiveQuery(
includeData: false, contentTopics: @[testTopic], pubsubTopic: some(shard)
)
let res = await storeNode.wakuArchive.findMessages(query)
if res.isOk() and res.get().hashes.len > 0:
break waitArchive
await sleepAsync(100.milliseconds)
# reached only if the archive never stored the message within ~5 s
raiseAssert "Message was not archived in time"
# create subscriber
var subscriber: Waku
lockNewGlobalBrokerContext:
subscriber = (await createNode(createApiNodeConf(numShards))).expect(
"Failed to create subscriber"
)
(await startWaku(addr subscriber)).expect("Failed to start subscriber")
# connect subscriber to store (not publisher, so msg won't come via relay to it)
await subscriber.node.connectToNodes(@[storeNodePeerInfo])
# subscribe to content topic
(await subscriber.subscribe(testTopic)).expect("Failed to subscribe")
# listen before triggering store check
let eventManager = newReceiveEventListenerManager(subscriber.brokerCtx, 1)
defer:
eventManager.teardown()
# trigger store check, should recover and deliver via MessageReceivedEvent
await subscriber.deliveryService.recvService.checkStore()
let received = await eventManager.waitForEvents(TestTimeout)
check received
check eventManager.receivedMessages.len == 1
if eventManager.receivedMessages.len > 0:
check eventManager.receivedMessages[0].payload == missedPayload
# cleanup
(await subscriber.stop()).expect("Failed to stop subscriber")
await publisher.stop()
await storeNode.stop()

View File

@ -54,6 +54,44 @@ procSuite "Peer Manager":
nodes[0].peerManager.switch.peerStore.connectedness(nodes[1].peerInfo.peerId) ==
Connectedness.Connected
asyncTest "Peer manager tracks active store request state":
let nodes = toSeq(0 ..< 2).mapIt(
newTestWakuNode(generateSecp256k1Key(), parseIpAddress("0.0.0.0"), Port(0))
)
await allFutures(nodes.mapIt(it.start()))
await allFutures(nodes.mapIt(it.mountRelay()))
let peerId = nodes[1].peerInfo.peerId
require (
await nodes[0].peerManager.connectPeer(nodes[1].peerInfo.toRemotePeerInfo())
)
await sleepAsync(chronos.milliseconds(500))
# while a store request is marked active, evictPeer must keep the connection
nodes[0].peerManager.addActiveStoreRequest(peerId)
check:
nodes[0].peerManager.hasActiveStoreRequest(peerId)
await nodes[0].peerManager.evictPeer(peerId)
await sleepAsync(chronos.milliseconds(100))
check:
nodes[0].peerManager.switch.peerStore.connectedness(peerId) ==
Connectedness.Connected
# once the request is finished, eviction disconnects as usual
nodes[0].peerManager.removeActiveStoreRequest(peerId)
check:
not nodes[0].peerManager.hasActiveStoreRequest(peerId)
await nodes[0].peerManager.evictPeer(peerId)
await sleepAsync(chronos.milliseconds(100))
check:
nodes[0].peerManager.switch.peerStore.connectedness(peerId) !=
Connectedness.Connected
await allFutures(nodes.mapIt(it.stop()))
asyncTest "dialPeer() works":
# Create 2 nodes
let nodes = toSeq(0 ..< 2).mapIt(

View File

@ -1,21 +1,5 @@
import waku/waku_core/[message/message, message/digest], waku/common/broker/event_broker
# NOTE(review): this hunk header reads "-1,21 +1,5", i.e. the file shrank from
# 21 to 5 lines in this commit, so the DeliveryDirection/DeliverySuccess/
# DeliveryFeedbackEvent declarations below appear to be the REMOVED side of the
# diff (markers were stripped by the page export) — confirm against the repo.
type DeliveryDirection* {.pure.} = enum
PUBLISHING
RECEIVING
type DeliverySuccess* {.pure.} = enum
SUCCESSFUL
UNSUCCESSFUL
EventBroker:
type DeliveryFeedbackEvent* = ref object
success*: DeliverySuccess
dir*: DeliveryDirection
comment*: string
msgHash*: WakuMessageHash
msg*: WakuMessage
EventBroker:
type OnFilterSubscribeEvent* = object
pubsubTopic*: string

View File

@ -12,7 +12,7 @@ import
type DeliveryService* = ref object
sendService*: SendService
# NOTE(review): the next two lines are the before/after of a one-line diff —
# the recvService field was made public (`*`) so tests can reach checkStore().
# Only the exported variant exists post-commit; confirm against the repo.
recvService: RecvService
recvService*: RecvService
subscriptionManager*: SubscriptionManager
proc new*(

View File

@ -12,7 +12,6 @@ import
waku_store/common,
waku_filter_v2/client,
waku_core/topics,
events/delivery_events,
events/message_events,
waku_node,
common/broker/broker_context,
@ -27,7 +26,8 @@ const PruneOldMsgsPeriod = chronos.minutes(1)
const DelayExtra* = chronos.seconds(5)
## Additional security time to overlap the missing messages queries
type TupleHashAndMsg = tuple[hash: WakuMessageHash, msg: WakuMessage]
type TupleHashAndMsg =
tuple[hash: WakuMessageHash, msg: WakuMessage, pubsubTopic: PubsubTopic]
type RecvMessage = object
msgHash: WakuMessageHash
@ -59,88 +59,82 @@ proc getMissingMsgsFromStore(
return err("getMissingMsgsFromStore: " & $error)
let otherwiseMsg = WakuMessage()
## message to be returned if the Option message is none
let otherwiseTopic = PubsubTopic("")
return ok(
storeResp.messages.mapIt((hash: it.messageHash, msg: it.message.get(otherwiseMsg)))
storeResp.messages.mapIt(
(
hash: it.messageHash,
msg: it.message.get(otherwiseMsg),
pubsubTopic: it.pubsubTopic.get(otherwiseTopic),
)
)
)
proc performDeliveryFeedback(
self: RecvService,
success: DeliverySuccess,
dir: DeliveryDirection,
comment: string,
msgHash: WakuMessageHash,
msg: WakuMessage,
) {.gcsafe, raises: [].} =
## Logs the delivery outcome and re-emits it as a DeliveryFeedbackEvent on
## the service's broker context so API clients can react to missed messages.
## (Removed by this commit in favor of direct MessageReceivedEvent delivery.)
info "recv monitor performDeliveryFeedback",
success, dir, comment, msg_hash = shortLog(msgHash)
DeliveryFeedbackEvent.emit(
brokerCtx = self.brokerCtx,
success = success,
dir = dir,
comment = comment,
msgHash = msgHash,
msg = msg,
)
proc msgChecker(self: RecvService) {.async.} =
## Continuously checks if a message has been received.
## Every StoreCheckPeriod: queries the store for each subscribed topic over
## the [startTimeToCheck - DelayExtra, endTimeToCheck + DelayExtra] window,
## diffs the store hashes against recently received ones, and reports each
## miss via performDeliveryFeedback. (Pre-commit version of this loop.)
while true:
await sleepAsync(StoreCheckPeriod)
self.endTimeToCheck = getNowInNanosecondTime()
var msgHashesInStore = newSeq[WakuMessageHash](0)
for pubsubTopic, contentTopics in self.subscriptionManager.subscribedTopics:
let storeResp: StoreQueryResponse = (
await self.node.wakuStoreClient.queryToAny(
StoreQueryRequest(
includeData: false,
pubsubTopic: some(pubsubTopic),
contentTopics: toSeq(contentTopics),
startTime: some(self.startTimeToCheck - DelayExtra.nanos),
endTime: some(self.endTimeToCheck + DelayExtra.nanos),
)
)
).valueOr:
error "msgChecker failed to get remote msgHashes",
pubsubTopic = pubsubTopic, cTopics = toSeq(contentTopics), error = $error
continue
msgHashesInStore.add(storeResp.messages.mapIt(it.messageHash))
## compare the msgHashes seen from the store vs the ones received directly
let rxMsgHashes = self.recentReceivedMsgs.mapIt(it.msgHash)
let missedHashes: seq[WakuMessageHash] =
msgHashesInStore.filterIt(not rxMsgHashes.contains(it))
## Now retrieve the missed WakuMessages
let missingMsgsRet = await self.getMissingMsgsFromStore(missedHashes)
if missingMsgsRet.isOk():
## Give feedback so that the api client can perform any action with the missed messages
for msgTuple in missingMsgsRet.get():
self.performDeliveryFeedback(
DeliverySuccess.UNSUCCESSFUL, RECEIVING, "Missed message", msgTuple.hash,
msgTuple.msg,
)
else:
error "failed to retrieve missing messages: ", error = $missingMsgsRet.error
## update next check times
self.startTimeToCheck = self.endTimeToCheck
proc processIncomingMessageOfInterest(
self: RecvService, pubsubTopic: string, message: WakuMessage
# NOTE(review): the stripped diff left both the old signature terminator
# `) =` and the new `): bool =` in this export; post-commit the proc
# returns bool — confirm against the repo before reusing this text.
) =
## Resolve an incoming network message that was already filtered by topic.
): bool =
## Deduplicate (by hash), store (saves in recently-seen messages) and emit
## the MAPI MessageReceivedEvent for every unique incoming message.
## Returns true if the message was new and the MessageReceivedEvent was properly emitted.
let msgHash = computeMessageHash(pubsubTopic, message)
if not self.recentReceivedMsgs.anyIt(it.msgHash == msgHash):
let rxMsg = RecvMessage(msgHash: msgHash, rxTime: message.timestamp)
self.recentReceivedMsgs.add(rxMsg)
MessageReceivedEvent.emit(self.brokerCtx, msgHash.to0xHex(), message)
return true
return false
proc checkStore*(self: RecvService) {.async.} =
## Checks the store for messages that were not received directly and
## delivers them via MessageReceivedEvent.
## Queries the window [startTimeToCheck - DelayExtra, endTimeToCheck +
## DelayExtra] per subscribed topic; consecutive windows overlap by
## DelayExtra on both ends so boundary messages are not missed.
self.endTimeToCheck = getNowInNanosecondTime()
## query store and deliver new recovered messages per subscribed topic
for pubsubTopic, contentTopics in self.subscriptionManager.subscribedTopics:
let storeResp: StoreQueryResponse = (
await self.node.wakuStoreClient.queryToAny(
StoreQueryRequest(
includeData: false,
pubsubTopic: some(pubsubTopic),
contentTopics: toSeq(contentTopics),
startTime: some(self.startTimeToCheck - DelayExtra.nanos),
endTime: some(self.endTimeToCheck + DelayExtra.nanos),
)
)
).valueOr:
error "msgChecker failed to get remote msgHashes",
pubsubTopic = pubsubTopic, cTopics = toSeq(contentTopics), error = $error
continue
## compare the msgHashes seen from the store vs the ones received directly
let msgHashesInStore = storeResp.messages.mapIt(it.messageHash)
let rxMsgHashes = self.recentReceivedMsgs.mapIt(it.msgHash)
let missedHashes: seq[WakuMessageHash] =
msgHashesInStore.filterIt(not rxMsgHashes.contains(it))
if missedHashes.len > 0:
info "missed messages detected, checking store for missed messages",
pubsubTopic = pubsubTopic, missedCount = missedHashes.len
## Now retrieve the missing WakuMessages and deliver them
let missingMsgsRet = await self.getMissingMsgsFromStore(missedHashes)
if missingMsgsRet.isOk():
## processIncomingMessageOfInterest dedups by hash and returns true only
## for messages not seen before, so recovered messages are emitted once.
for msgTuple in missingMsgsRet.get():
if self.processIncomingMessageOfInterest(msgTuple.pubsubTopic, msgTuple.msg):
info "recv service store-recovered message",
msg_hash = shortLog(msgTuple.hash), pubsubTopic = msgTuple.pubsubTopic
else:
error "failed to retrieve missing messages: ", error = $missingMsgsRet.error
## update next check times
self.startTimeToCheck = self.endTimeToCheck
proc msgChecker(self: RecvService) {.async.} =
## Background loop: every StoreCheckPeriod, runs checkStore() to recover
## and deliver any messages missed by the live subscription.
while true:
await sleepAsync(StoreCheckPeriod)
await self.checkStore()
proc new*(T: typedesc[RecvService], node: WakuNode, s: SubscriptionManager): T =
## The storeClient will help to acquire any possible missed messages
@ -176,7 +170,7 @@ proc startRecvService*(self: RecvService) =
shard = event.topic, contenttopic = event.message.contentTopic
return
self.processIncomingMessageOfInterest(event.topic, event.message),
discard self.processIncomingMessageOfInterest(event.topic, event.message),
).valueOr:
error "Failed to set MessageSeenEvent listener", error = error
quit(QuitFailure)

View File

@ -107,6 +107,7 @@ type PeerManager* = ref object of RootObj
online: bool ## state managed by online_monitor module
getShards: GetShards
maxConnections: int
activeStoreRequests*: Table[PeerId, int]
#~~~~~~~~~~~~~~~~~~~#
# Helper Functions #
@ -169,6 +170,23 @@ proc addPeer*(
proc getPeer*(pm: PeerManager, peerId: PeerId): RemotePeerInfo =
return pm.switch.peerStore.getPeer(peerId)
proc addActiveStoreRequest*(pm: PeerManager, peerId: PeerId) {.gcsafe.} =
  ## Records one more in-flight store request toward `peerId`.
  ## `mgetOrPut` inserts a 0 entry on first use, so the count starts at 1.
  inc(pm.activeStoreRequests.mgetOrPut(peerId, 0))
proc removeActiveStoreRequest*(pm: PeerManager, peerId: PeerId) {.gcsafe.} =
  ## Marks one in-flight store request toward `peerId` as finished.
  ## The table entry is deleted once the counter reaches zero; calling this
  ## without a matching addActiveStoreRequest is a harmless no-op.
  let current = pm.activeStoreRequests.getOrDefault(peerId, 0)
  if current == 0:
    return
  if current == 1:
    # last outstanding request: drop the entry entirely
    pm.activeStoreRequests.del(peerId)
  else:
    pm.activeStoreRequests[peerId] = current - 1
proc hasActiveStoreRequest*(pm: PeerManager, peerId: PeerId): bool {.gcsafe.} =
  ## True while at least one store request toward `peerId` is in flight.
  peerId in pm.activeStoreRequests
proc loadFromStorage(pm: PeerManager) {.gcsafe.} =
## Load peers from storage, if available
@ -519,6 +537,15 @@ proc connectedPeers*(
return (inPeers, outPeers)
proc evictPeer*(pm: PeerManager, peerId: PeerId) {.async.} =
  ## Policy-based eviction (relay-peer limit, IP colocation, pruning).
  ## The disconnect is skipped while the peer has an in-flight store request,
  ## so active store exchanges are never aborted mid-stream.
  if not pm.hasActiveStoreRequest(peerId):
    await pm.switch.disconnect(peerId)
  else:
    trace "skipping peer eviction: active store request", peerId = peerId
proc capablePeers*(pm: PeerManager, protocol: string): (seq[PeerId], seq[PeerId]) =
## Returns the PeerIds of peers with an active socket connection.
## If a protocol is specified, it returns peers that have identified
@ -770,11 +797,11 @@ proc onPeerEvent(pm: PeerManager, peerId: PeerId, event: PeerEvent) {.async.} =
let inRelayPeers = pm.connectedPeers(WakuRelayCodec)[0]
if inRelayPeers.len > pm.inRelayPeersTarget and
peerStore.hasPeer(peerId, WakuRelayCodec):
info "disconnecting relay peer because reached max num in-relay peers",
info "relay peer limit reached, evicting peer",
peerId = peerId,
inRelayPeers = inRelayPeers.len,
inRelayPeersTarget = pm.inRelayPeersTarget
await pm.switch.disconnect(peerId)
await pm.evictPeer(peerId)
## Apply max ip colocation limit
if (let ip = pm.getPeerIp(peerId); ip.isSome()):
@ -787,7 +814,7 @@ proc onPeerEvent(pm: PeerManager, peerId: PeerId, event: PeerEvent) {.async.} =
if pm.colocationLimit != 0 and peersBehindIp.len > pm.colocationLimit:
for peerId in peersBehindIp[0 ..< (peersBehindIp.len - pm.colocationLimit)]:
info "Pruning connection due to ip colocation", peerId = peerId, ip = ip
asyncSpawn(pm.switch.disconnect(peerId))
asyncSpawn(pm.evictPeer(peerId))
peerStore.delete(peerId)
WakuPeerEvent.emit(pm.brokerCtx, peerId, WakuPeerEventKind.EventConnected)
@ -1100,7 +1127,7 @@ proc pruneInRelayConns(pm: PeerManager, amount: int) {.async.} =
for p in inRelayPeers[0 ..< connsToPrune]:
trace "Pruning Peer", Peer = $p
asyncSpawn(pm.switch.disconnect(p))
asyncSpawn(pm.evictPeer(p))
proc addExtPeerEventHandler*(
pm: PeerManager, eventHandler: PeerEventHandler, eventKind: PeerEventKind
@ -1214,6 +1241,7 @@ proc new*(
pm.serviceSlots = initTable[string, RemotePeerInfo]()
pm.ipTable = initTable[string, seq[PeerId]]()
pm.activeStoreRequests = initTable[PeerId, int]()
if not storage.isNil():
trace "found persistent peer storage"

View File

@ -33,7 +33,9 @@ proc sendStoreRequest(
): Future[StoreQueryResult] {.async, gcsafe.} =
var req = request
self.peerManager.addActiveStoreRequest(connection.peerId)
defer:
self.peerManager.removeActiveStoreRequest(connection.peerId)
await connection.closeWithEof()
if req.requestId == "":

View File

@ -93,7 +93,9 @@ proc initProtocolHandler(self: WakuStore) =
var resBuf: StoreResp
var queryDuration: float
self.peerManager.addActiveStoreRequest(conn.peerId)
defer:
self.peerManager.removeActiveStoreRequest(conn.peerId)
await conn.closeWithEof()
self.requestRateLimiter.checkUsageLimit(WakuStoreCodec, conn):