logos-delivery/tests/node/test_wakunode_health_monitor.nim
Fabiana Cecin dc026bbff1
feat: active filter subscription management for edge nodes (#3773)
feat: active filter subscription management for edge nodes

## Subscription Manager
* edgeFilterSubLoop reconciles desired vs actual filter subscriptions
* edgeFilterHealthLoop pings filter peers, evicts stale ones
* EdgeFilterSubState per-shard tracking of confirmed peers and health
* best-effort unsubscribe on peer removal
* RequestEdgeShardHealth and RequestEdgeFilterPeerCount broker providers

## WakuNode
* Remove old edge health loop (loopEdgeHealth, edgeHealthEvent, calculateEdgeTopicHealth)
* Register MessageSeenEvent push handler on filter client during start
* startDeliveryService now returns `Result[void, string]` and propagates errors

## Health Monitor
* getFilterClientHealth queries RequestEdgeFilterPeerCount via broker
* Shard/content health providers fall back to RequestEdgeShardHealth when relay inactive
* Listen to EventShardTopicHealthChange for health recalculation
* Add missing return p.notReady() on failed edge filter peer count request
* HealthyThreshold constant moved to `connection_status.nim`

## Broker types
* RequestEdgeShardHealth, RequestEdgeFilterPeerCount request types
* EventShardTopicHealthChange event type

## Filter Client
* Add timeout parameter to ping proc

## Tests
* Health monitor event tests with per-node lockNewGlobalBrokerContext
* Edge (light client) health update test
* Edge health driven by confirmed filter subscriptions test
* API subscription tests: sub/receive, failover, peer replacement

Co-authored-by: Ivan FB <128452529+Ivansete-status@users.noreply.github.com>
Co-authored by Zoltan Nagy
2026-03-30 08:30:34 -03:00

435 lines
16 KiB
Nim

{.used.}
import
std/[json, options, sequtils, strutils, tables], testutils/unittests, chronos, results
import
waku/[
waku_core,
common/waku_protocol,
node/waku_node,
node/peer_manager,
node/health_monitor/health_status,
node/health_monitor/connection_status,
node/health_monitor/protocol_health,
node/health_monitor/topic_health,
node/health_monitor/node_health_monitor,
node/delivery_service/delivery_service,
node/delivery_service/subscription_manager,
node/kernel_api/relay,
node/kernel_api/store,
node/kernel_api/lightpush,
node/kernel_api/filter,
events/health_events,
events/peer_events,
waku_archive,
common/broker/broker_context,
]
import ../testlib/[wakunode, wakucore], ../waku_archive/archive_utils
const MockDLow = 4 # Mocked GossipSub DLow value
const TestConnectivityTimeLimit = 3.seconds
proc protoHealthMock(kind: WakuProtocol, health: HealthStatus): ProtocolHealth =
var ph = ProtocolHealth.init(kind)
if health == HealthStatus.READY:
return ph.ready()
else:
return ph.notReady("mock")
suite "Health Monitor - health state calculation":
test "Disconnected, zero peers":
let protocols = @[
protoHealthMock(RelayProtocol, HealthStatus.NOT_READY),
protoHealthMock(StoreClientProtocol, HealthStatus.NOT_READY),
protoHealthMock(FilterClientProtocol, HealthStatus.NOT_READY),
protoHealthMock(LightpushClientProtocol, HealthStatus.NOT_READY),
]
let strength = initTable[WakuProtocol, int]()
let state = calculateConnectionState(protocols, strength, some(MockDLow))
check state == ConnectionStatus.Disconnected
test "PartiallyConnected, weak relay":
let weakCount = MockDLow - 1
let protocols = @[protoHealthMock(RelayProtocol, HealthStatus.READY)]
var strength = initTable[WakuProtocol, int]()
strength[RelayProtocol] = weakCount
let state = calculateConnectionState(protocols, strength, some(MockDLow))
# Partially connected since relay connectivity is weak (> 0, but < dLow)
check state == ConnectionStatus.PartiallyConnected
test "Connected, robust relay":
let protocols = @[protoHealthMock(RelayProtocol, HealthStatus.READY)]
var strength = initTable[WakuProtocol, int]()
strength[RelayProtocol] = MockDLow
let state = calculateConnectionState(protocols, strength, some(MockDLow))
# Fully connected since relay connectivity is ideal (>= dLow)
check state == ConnectionStatus.Connected
test "Connected, robust edge":
let protocols = @[
protoHealthMock(RelayProtocol, HealthStatus.NOT_MOUNTED),
protoHealthMock(LightpushClientProtocol, HealthStatus.READY),
protoHealthMock(FilterClientProtocol, HealthStatus.READY),
protoHealthMock(StoreClientProtocol, HealthStatus.READY),
]
var strength = initTable[WakuProtocol, int]()
strength[LightpushClientProtocol] = HealthyThreshold
strength[FilterClientProtocol] = HealthyThreshold
strength[StoreClientProtocol] = HealthyThreshold
let state = calculateConnectionState(protocols, strength, some(MockDLow))
check state == ConnectionStatus.Connected
test "Disconnected, edge missing store":
let protocols = @[
protoHealthMock(LightpushClientProtocol, HealthStatus.READY),
protoHealthMock(FilterClientProtocol, HealthStatus.READY),
protoHealthMock(StoreClientProtocol, HealthStatus.NOT_READY),
]
var strength = initTable[WakuProtocol, int]()
strength[LightpushClientProtocol] = HealthyThreshold
strength[FilterClientProtocol] = HealthyThreshold
strength[StoreClientProtocol] = 0
let state = calculateConnectionState(protocols, strength, some(MockDLow))
check state == ConnectionStatus.Disconnected
test "PartiallyConnected, edge meets minimum failover requirement":
let weakCount = max(1, HealthyThreshold - 1)
let protocols = @[
protoHealthMock(LightpushClientProtocol, HealthStatus.READY),
protoHealthMock(FilterClientProtocol, HealthStatus.READY),
protoHealthMock(StoreClientProtocol, HealthStatus.READY),
]
var strength = initTable[WakuProtocol, int]()
strength[LightpushClientProtocol] = weakCount
strength[FilterClientProtocol] = weakCount
strength[StoreClientProtocol] = weakCount
let state = calculateConnectionState(protocols, strength, some(MockDLow))
check state == ConnectionStatus.PartiallyConnected
test "Connected, robust relay ignores store server":
let protocols = @[
protoHealthMock(RelayProtocol, HealthStatus.READY),
protoHealthMock(StoreProtocol, HealthStatus.READY),
]
var strength = initTable[WakuProtocol, int]()
strength[RelayProtocol] = MockDLow
strength[StoreProtocol] = 0
let state = calculateConnectionState(protocols, strength, some(MockDLow))
check state == ConnectionStatus.Connected
test "Connected, robust relay ignores store client":
let protocols = @[
protoHealthMock(RelayProtocol, HealthStatus.READY),
protoHealthMock(StoreProtocol, HealthStatus.READY),
protoHealthMock(StoreClientProtocol, HealthStatus.NOT_READY),
]
var strength = initTable[WakuProtocol, int]()
strength[RelayProtocol] = MockDLow
strength[StoreProtocol] = 0
strength[StoreClientProtocol] = 0
let state = calculateConnectionState(protocols, strength, some(MockDLow))
check state == ConnectionStatus.Connected
suite "Health Monitor - events":
asyncTest "Core (relay) health update":
var nodeA: WakuNode
lockNewGlobalBrokerContext:
let nodeAKey = generateSecp256k1Key()
nodeA = newTestWakuNode(nodeAKey, parseIpAddress("127.0.0.1"), Port(0))
(await nodeA.mountRelay()).expect("Node A failed to mount Relay")
await nodeA.start()
let monitorA = NodeHealthMonitor.new(nodeA)
var
lastStatus = ConnectionStatus.Disconnected
callbackCount = 0
healthChangeSignal = newAsyncEvent()
monitorA.onConnectionStatusChange = proc(status: ConnectionStatus) {.async.} =
lastStatus = status
callbackCount.inc()
healthChangeSignal.fire()
monitorA.startHealthMonitor().expect("Health monitor failed to start")
var nodeB: WakuNode
lockNewGlobalBrokerContext:
let nodeBKey = generateSecp256k1Key()
nodeB = newTestWakuNode(nodeBKey, parseIpAddress("127.0.0.1"), Port(0))
let driver = newSqliteArchiveDriver()
nodeB.mountArchive(driver).expect("Node B failed to mount archive")
(await nodeB.mountRelay()).expect("Node B failed to mount relay")
await nodeB.mountStore()
await nodeB.start()
await nodeA.connectToNodes(@[nodeB.switch.peerInfo.toRemotePeerInfo()])
proc dummyHandler(topic: PubsubTopic, msg: WakuMessage): Future[void] {.async.} =
discard
nodeA.subscribe((kind: PubsubSub, topic: DefaultPubsubTopic), dummyHandler).expect(
"Node A failed to subscribe"
)
nodeB.subscribe((kind: PubsubSub, topic: DefaultPubsubTopic), dummyHandler).expect(
"Node B failed to subscribe"
)
let connectTimeLimit = Moment.now() + TestConnectivityTimeLimit
var gotConnected = false
while Moment.now() < connectTimeLimit:
if lastStatus == ConnectionStatus.PartiallyConnected:
gotConnected = true
break
if await healthChangeSignal.wait().withTimeout(connectTimeLimit - Moment.now()):
healthChangeSignal.clear()
check:
gotConnected == true
callbackCount >= 1
lastStatus == ConnectionStatus.PartiallyConnected
healthChangeSignal.clear()
await nodeB.stop()
await nodeA.disconnectNode(nodeB.switch.peerInfo.toRemotePeerInfo())
let disconnectTimeLimit = Moment.now() + TestConnectivityTimeLimit
var gotDisconnected = false
while Moment.now() < disconnectTimeLimit:
if lastStatus == ConnectionStatus.Disconnected:
gotDisconnected = true
break
if await healthChangeSignal.wait().withTimeout(disconnectTimeLimit - Moment.now()):
healthChangeSignal.clear()
check:
gotDisconnected == true
await monitorA.stopHealthMonitor()
await nodeA.stop()
asyncTest "Edge (light client) health update":
var nodeA: WakuNode
lockNewGlobalBrokerContext:
let nodeAKey = generateSecp256k1Key()
nodeA = newTestWakuNode(nodeAKey, parseIpAddress("127.0.0.1"), Port(0))
nodeA.mountLightpushClient()
await nodeA.mountFilterClient()
nodeA.mountStoreClient()
require nodeA.mountAutoSharding(1, 8).isOk
nodeA.mountMetadata(1, @[0'u16]).expect("Node A failed to mount metadata")
await nodeA.start()
let ds =
DeliveryService.new(false, nodeA).expect("Failed to create DeliveryService")
ds.startDeliveryService().expect("Failed to start DeliveryService")
let monitorA = NodeHealthMonitor.new(nodeA)
var
lastStatus = ConnectionStatus.Disconnected
callbackCount = 0
healthChangeSignal = newAsyncEvent()
monitorA.onConnectionStatusChange = proc(status: ConnectionStatus) {.async.} =
lastStatus = status
callbackCount.inc()
healthChangeSignal.fire()
monitorA.startHealthMonitor().expect("Health monitor failed to start")
var nodeB: WakuNode
lockNewGlobalBrokerContext:
let nodeBKey = generateSecp256k1Key()
nodeB = newTestWakuNode(nodeBKey, parseIpAddress("127.0.0.1"), Port(0))
let driver = newSqliteArchiveDriver()
nodeB.mountArchive(driver).expect("Node B failed to mount archive")
(await nodeB.mountRelay()).expect("Node B failed to mount relay")
(await nodeB.mountLightpush()).expect("Node B failed to mount lightpush")
await nodeB.mountFilter()
await nodeB.mountStore()
require nodeB.mountAutoSharding(1, 8).isOk
nodeB.mountMetadata(1, toSeq(0'u16 ..< 8'u16)).expect(
"Node B failed to mount metadata"
)
await nodeB.start()
var metadataFut = newFuture[void]("waitForMetadata")
let metadataLis = WakuPeerEvent
.listen(
nodeA.brokerCtx,
proc(evt: WakuPeerEvent): Future[void] {.async: (raises: []), gcsafe.} =
if not metadataFut.finished and
evt.kind == WakuPeerEventKind.EventMetadataUpdated:
metadataFut.complete()
,
)
.expect("Failed to listen for metadata")
await nodeA.connectToNodes(@[nodeB.switch.peerInfo.toRemotePeerInfo()])
let metadataOk = await metadataFut.withTimeout(TestConnectivityTimeLimit)
WakuPeerEvent.dropListener(nodeA.brokerCtx, metadataLis)
require metadataOk
let connectTimeLimit = Moment.now() + TestConnectivityTimeLimit
var gotConnected = false
while Moment.now() < connectTimeLimit:
if lastStatus == ConnectionStatus.PartiallyConnected:
gotConnected = true
break
if await healthChangeSignal.wait().withTimeout(connectTimeLimit - Moment.now()):
healthChangeSignal.clear()
check:
gotConnected == true
callbackCount >= 1
lastStatus == ConnectionStatus.PartiallyConnected
healthChangeSignal.clear()
await nodeB.stop()
await nodeA.disconnectNode(nodeB.switch.peerInfo.toRemotePeerInfo())
let disconnectTimeLimit = Moment.now() + TestConnectivityTimeLimit
var gotDisconnected = false
while Moment.now() < disconnectTimeLimit:
if lastStatus == ConnectionStatus.Disconnected:
gotDisconnected = true
break
if await healthChangeSignal.wait().withTimeout(disconnectTimeLimit - Moment.now()):
healthChangeSignal.clear()
check:
gotDisconnected == true
lastStatus == ConnectionStatus.Disconnected
await monitorA.stopHealthMonitor()
await ds.stopDeliveryService()
await nodeA.stop()
asyncTest "Edge health driven by confirmed filter subscriptions":
var nodeA: WakuNode
lockNewGlobalBrokerContext:
let nodeAKey = generateSecp256k1Key()
nodeA = newTestWakuNode(nodeAKey, parseIpAddress("127.0.0.1"), Port(0))
await nodeA.mountFilterClient()
nodeA.mountLightpushClient()
nodeA.mountStoreClient()
require nodeA.mountAutoSharding(1, 8).isOk
nodeA.mountMetadata(1, @[0'u16]).expect("Node A failed to mount metadata")
await nodeA.start()
let ds =
DeliveryService.new(false, nodeA).expect("Failed to create DeliveryService")
ds.startDeliveryService().expect("Failed to start DeliveryService")
let subMgr = ds.subscriptionManager
var nodeB: WakuNode
lockNewGlobalBrokerContext:
let nodeBKey = generateSecp256k1Key()
nodeB = newTestWakuNode(nodeBKey, parseIpAddress("127.0.0.1"), Port(0))
let driver = newSqliteArchiveDriver()
nodeB.mountArchive(driver).expect("Node B failed to mount archive")
(await nodeB.mountRelay()).expect("Node B failed to mount relay")
(await nodeB.mountLightpush()).expect("Node B failed to mount lightpush")
await nodeB.mountFilter()
await nodeB.mountStore()
require nodeB.mountAutoSharding(1, 8).isOk
nodeB.mountMetadata(1, toSeq(0'u16 ..< 8'u16)).expect(
"Node B failed to mount metadata"
)
await nodeB.start()
let monitorA = NodeHealthMonitor.new(nodeA)
var
lastStatus = ConnectionStatus.Disconnected
healthSignal = newAsyncEvent()
monitorA.onConnectionStatusChange = proc(status: ConnectionStatus) {.async.} =
lastStatus = status
healthSignal.fire()
monitorA.startHealthMonitor().expect("Health monitor failed to start")
var metadataFut = newFuture[void]("waitForMetadata")
let metadataLis = WakuPeerEvent
.listen(
nodeA.brokerCtx,
proc(evt: WakuPeerEvent): Future[void] {.async: (raises: []), gcsafe.} =
if not metadataFut.finished and
evt.kind == WakuPeerEventKind.EventMetadataUpdated:
metadataFut.complete()
,
)
.expect("Failed to listen for metadata")
await nodeA.connectToNodes(@[nodeB.switch.peerInfo.toRemotePeerInfo()])
let metadataOk = await metadataFut.withTimeout(TestConnectivityTimeLimit)
WakuPeerEvent.dropListener(nodeA.brokerCtx, metadataLis)
require metadataOk
var deadline = Moment.now() + TestConnectivityTimeLimit
while Moment.now() < deadline:
if lastStatus == ConnectionStatus.PartiallyConnected:
break
if await healthSignal.wait().withTimeout(deadline - Moment.now()):
healthSignal.clear()
check lastStatus == ConnectionStatus.PartiallyConnected
var shardHealthFut = newFuture[EventShardTopicHealthChange]("waitForShardHealth")
let shardHealthLis = EventShardTopicHealthChange
.listen(
nodeA.brokerCtx,
proc(
evt: EventShardTopicHealthChange
): Future[void] {.async: (raises: []), gcsafe.} =
if not shardHealthFut.finished and (
evt.health == TopicHealth.MINIMALLY_HEALTHY or
evt.health == TopicHealth.SUFFICIENTLY_HEALTHY
):
shardHealthFut.complete(evt)
,
)
.expect("Failed to listen for shard health")
let contentTopic = ContentTopic("/waku/2/default-content/proto")
subMgr.subscribe(contentTopic).expect("Failed to subscribe")
let shardHealthOk = await shardHealthFut.withTimeout(TestConnectivityTimeLimit)
EventShardTopicHealthChange.dropListener(nodeA.brokerCtx, shardHealthLis)
check shardHealthOk == true
check subMgr.edgeFilterSubStates.len > 0
healthSignal.clear()
deadline = Moment.now() + TestConnectivityTimeLimit
while Moment.now() < deadline:
if lastStatus == ConnectionStatus.PartiallyConnected:
break
if await healthSignal.wait().withTimeout(deadline - Moment.now()):
healthSignal.clear()
check lastStatus == ConnectionStatus.PartiallyConnected
await ds.stopDeliveryService()
await monitorA.stopHealthMonitor()
await nodeB.stop()
await nodeA.stop()