logos-messaging-nim/waku/node/health_monitor/node_health_monitor.nim

{.push raises: [].}

import
  std/[options, sets, random, sequtils, json, strutils, tables],
  chronos,
  chronicles,
  libp2p/protocols/rendezvous,
  libp2p/protocols/pubsub,
  libp2p/protocols/pubsub/rpc/messages,
  waku/[
    waku_relay,
    waku_rln_relay,
    api/types,
    events/health_events,
    events/peer_events,
    node/waku_node,
    node/peer_manager,
    node/kernel_api,
    node/health_monitor/online_monitor,
    node/health_monitor/health_status,
    node/health_monitor/health_report,
    node/health_monitor/connection_status,
    node/health_monitor/protocol_health,
    node/health_monitor/event_loop_monitor,
    requests/health_requests,
  ]

## This module monitors the health state of the local ("self") Waku node.

# randomize initializes std/random's random number generator.
# If it is not called, the outcome of randomization procedures will be the same in every run.
random.randomize()

type NodeHealthMonitor* = ref object
  ## Holds the health state of one Waku node together with the background
  ## tasks and subscriptions that keep that state up to date.
  nodeHealth: HealthStatus
    ## overall node state, written by `setOverallHealth`; `new` initializes it to INITIALIZING.
  node: WakuNode
    ## the node whose protocols and peers are inspected.
  onlineMonitor*: OnlineMonitor
    ## started in `startHealthMonitor` and stopped in `stopHealthMonitor`.
  keepAliveFut: Future[void]
    ## handle of the keepalive loop, assigned by `startKeepalive`.
  healthLoopFut: Future[void]
    ## handle of `healthLoop`, assigned by `startHealthMonitor`.
  eventLoopMonitorFut: Future[void]
    ## handle of the event loop lag monitor, assigned by `startHealthMonitor`.
  healthUpdateEvent: AsyncEvent
    ## fired whenever something that may affect health happened; `healthLoop` waits on it.
  connectionStatus: ConnectionStatus
    ## last connection status computed from `cachedProtocols` and `strength`.
  onConnectionStatusChange*: ConnectionStatusChangeHandler
    ## optional callback awaited by `healthLoop` when `connectionStatus` changes.
  cachedProtocols: seq[ProtocolHealth]
    ## state of each protocol to report.
    ## calculated on last event that can change any protocol's state so fetching a report is fast.
  strength: Table[WakuProtocol, int]
    ## latest known connectivity strength (e.g. connected peer count) metric for each protocol.
    ## if it doesn't make sense for the protocol in question, this is set to zero.
  relayObserver: PubSubObserver
    ## observer added to the relay in `startHealthMonitor` (only when relay is mounted).
  peerEventListener: WakuPeerEventListener
    ## subscription to `WakuPeerEvent`, dropped in `stopHealthMonitor`.
  shardHealthListener: EventShardTopicHealthChangeListener
    ## subscription to `EventShardTopicHealthChange`, dropped in `stopHealthMonitor`.
  eventLoopLagExceeded: bool
    ## set to true when the chronos event loop lag exceeds the severe threshold,
    ## causing the node health to be reported as EVENT_LOOP_LAGGING until lag recovers.

func getHealth*(report: HealthReport, kind: WakuProtocol): ProtocolHealth =
  ## Return the entry of `report.protocolsHealth` whose `protocol` name matches
  ## `$kind`. When the report has no such entry, a freshly initialized
  ## `ProtocolHealth` for `kind` is returned instead.
  let wanted = $kind
  for entry in report.protocolsHealth:
    if entry.protocol == wanted:
      return entry

  # Not expected to be reached; if it is, assume the protocol is not mounted.
  return ProtocolHealth.init(kind)

proc countCapablePeers(hm: NodeHealthMonitor, codec: string): int =
  ## Number of peers the peer manager reports as capable of `codec`.
  ## Yields 0 when the node has no peer manager.
  let peerManager = hm.node.peerManager
  if peerManager.isNil():
    0
  else:
    peerManager.getCapablePeersCount(codec)

proc getRelayHealth(hm: NodeHealthMonitor): ProtocolHealth =
  ## Health of the Relay protocol. Also records the number of connected
  ## pubsub peers as the Relay strength in `hm.strength`.
  const kind = WakuProtocol.RelayProtocol
  var health = ProtocolHealth.init(kind)

  let relay = hm.node.wakuRelay
  if relay.isNil():
    hm.strength[kind] = 0
    return health.notMounted()

  # A failed peer lookup is reported the same way as an unmounted relay.
  let connected = relay.getConnectedPubSubPeers(pubsubTopic = "").valueOr:
    hm.strength[kind] = 0
    return health.notMounted()

  hm.strength[kind] = connected.len
  if connected.len != 0:
    return health.ready()

  return health.notReady("No connected peers")

proc getRlnRelayHealth(hm: NodeHealthMonitor): Future[ProtocolHealth] {.async.} =
  ## Health of the RLN Relay protocol.
  ##
  ## Asks the RLN relay whether it is ready, waiting at most 5 seconds:
  ## - not mounted            -> `notMounted`
  ## - no answer in time      -> `notReady("Ready state check timed out")`
  ## - check failed/cancelled -> `notReady("State cannot be determined")`
  ## - answered `true`        -> `ready`
  ## - answered `false`       -> `synchronizing`
  var p = ProtocolHealth.init(WakuProtocol.RlnRelayProtocol)
  if isNil(hm.node.wakuRlnRelay):
    return p.notMounted()

  const FutIsReadyTimeout = 5.seconds

  let isReadyStateFut = hm.node.wakuRlnRelay.isReady()
  if not await isReadyStateFut.withTimeout(FutIsReadyTimeout):
    return p.notReady("Ready state check timed out")

  # From here on the future has finished within the timeout. It may still have
  # failed or been cancelled, which is not a timeout: let `read()` surface the
  # failure so it is logged and reported as an undeterminable state.
  try:
    if isReadyStateFut.cancelled():
      return p.notReady("State cannot be determined")
    elif isReadyStateFut.read():
      return p.ready()

    return p.synchronizing()
  except:
    # NOTE(review): kept as a catch-all on purpose; narrowing it to
    # CatchableError requires confirming which exceptions `isReady()` declares.
    error "exception reading state: " & getCurrentExceptionMsg()
    return p.notReady("State cannot be determined")

proc getLightpushHealth(
    hm: NodeHealthMonitor, relayHealth: HealthStatus
): ProtocolHealth =
  ## Health of the Lightpush service. It is ready only when `relayHealth` is
  ## READY. The count of Lightpush-capable peers is stored as its strength.
  const kind = WakuProtocol.LightpushProtocol
  var health = ProtocolHealth.init(kind)

  if hm.node.wakuLightPush.isNil():
    hm.strength[kind] = 0
    return health.notMounted()

  hm.strength[kind] = hm.countCapablePeers(WakuLightPushCodec)

  if relayHealth != HealthStatus.READY:
    return health.notReady("Node has no relay peers to fullfill push requests")

  return health.ready()

proc getLegacyLightpushHealth(
    hm: NodeHealthMonitor, relayHealth: HealthStatus
): ProtocolHealth =
  ## Health of the legacy Lightpush service. It is ready only when
  ## `relayHealth` is READY. The count of legacy-Lightpush-capable peers is
  ## stored as its strength.
  const kind = WakuProtocol.LegacyLightpushProtocol
  var health = ProtocolHealth.init(kind)

  if hm.node.wakuLegacyLightPush.isNil():
    hm.strength[kind] = 0
    return health.notMounted()

  hm.strength[kind] = hm.countCapablePeers(WakuLegacyLightPushCodec)

  if relayHealth != HealthStatus.READY:
    return health.notReady("Node has no relay peers to fullfill push requests")

  return health.ready()

proc getFilterHealth(hm: NodeHealthMonitor, relayHealth: HealthStatus): ProtocolHealth =
  ## Health of the Filter service. It is ready only when `relayHealth` is
  ## READY. The count of Filter-subscribe-capable peers is stored as its strength.
  const kind = WakuProtocol.FilterProtocol
  var health = ProtocolHealth.init(kind)

  if hm.node.wakuFilter.isNil():
    hm.strength[kind] = 0
    return health.notMounted()

  hm.strength[kind] = hm.countCapablePeers(WakuFilterSubscribeCodec)

  if relayHealth != HealthStatus.READY:
    return
      health.notReady("Relay is not ready, filter will not be able to sort out messages")

  return health.ready()

proc getStoreHealth(hm: NodeHealthMonitor): ProtocolHealth =
  ## Health of the Store service: ready whenever it is mounted.
  ## The count of Store-capable peers is stored as its strength.
  const kind = WakuProtocol.StoreProtocol
  var health = ProtocolHealth.init(kind)

  if hm.node.wakuStore.isNil():
    hm.strength[kind] = 0
    return health.notMounted()

  hm.strength[kind] = hm.countCapablePeers(WakuStoreCodec)
  return health.ready()

proc getLightpushClientHealth(hm: NodeHealthMonitor): ProtocolHealth =
  ## Health of the Lightpush client: ready once at least one
  ## Lightpush-capable peer is known. That peer count is stored as its strength.
  const kind = WakuProtocol.LightpushClientProtocol
  var health = ProtocolHealth.init(kind)

  if hm.node.wakuLightpushClient.isNil():
    hm.strength[kind] = 0
    return health.notMounted()

  let servicePeers = hm.countCapablePeers(WakuLightPushCodec)
  hm.strength[kind] = servicePeers

  if servicePeers <= 0:
    return health.notReady("No Lightpush service peer available yet")

  return health.ready()

proc getLegacyLightpushClientHealth(hm: NodeHealthMonitor): ProtocolHealth =
  ## Health of the legacy Lightpush client: ready once at least one
  ## legacy-Lightpush-capable peer is known. That peer count is stored as its
  ## strength.
  const kind = WakuProtocol.LegacyLightpushClientProtocol
  var health = ProtocolHealth.init(kind)

  if hm.node.wakuLegacyLightpushClient.isNil():
    hm.strength[kind] = 0
    return health.notMounted()

  let servicePeers = hm.countCapablePeers(WakuLegacyLightPushCodec)
  hm.strength[kind] = servicePeers

  if servicePeers <= 0:
    return health.notReady("No Lightpush service peer available yet")

  return health.ready()

proc getFilterClientHealth(hm: NodeHealthMonitor): ProtocolHealth =
  ## Health of the Filter client.
  ##
  ## Without a mounted relay the edge filter peer count is requested through
  ## the broker first: a positive count makes the client ready, and a failed
  ## request makes it not ready. In every other case the decision falls back to
  ## the number of Filter-subscribe-capable peers known to the peer manager.
  const kind = WakuProtocol.FilterClientProtocol
  var health = ProtocolHealth.init(kind)

  if hm.node.wakuFilterClient.isNil():
    hm.strength[kind] = 0
    return health.notMounted()

  if hm.node.wakuRelay.isNil():
    let edgeRes = RequestEdgeFilterPeerCount.request(hm.node.brokerCtx)
    if edgeRes.isErr():
      error "Failed to request edge filter peer count", error = edgeRes.error
      return
        health.notReady("Failed to request edge filter peer count: " & edgeRes.error)

    let edgePeers = edgeRes.get().peerCount
    if edgePeers > 0:
      hm.strength[kind] = edgePeers
      return health.ready()
    # zero edge peers: fall through to the peer manager based check below

  let servicePeers = hm.countCapablePeers(WakuFilterSubscribeCodec)
  hm.strength[kind] = servicePeers

  if servicePeers <= 0:
    return health.notReady("No Filter service peer available yet")

  return health.ready()

proc getStoreClientHealth(hm: NodeHealthMonitor): ProtocolHealth =
  ## Health of the Store client: ready when a Store-capable peer is known or
  ## when this node mounts the Store service itself. The count of
  ## Store-capable peers is stored as its strength.
  const kind = WakuProtocol.StoreClientProtocol
  var health = ProtocolHealth.init(kind)

  if hm.node.wakuStoreClient.isNil():
    hm.strength[kind] = 0
    return health.notMounted()

  let servicePeers = hm.countCapablePeers(WakuStoreCodec)
  hm.strength[kind] = servicePeers

  let noServicePeer = servicePeers <= 0
  if noServicePeer and hm.node.wakuStore.isNil():
    return health.notReady(
      "No Store service peer available yet, neither Store service set up for the node"
    )

  return health.ready()

proc getPeerExchangeHealth(hm: NodeHealthMonitor): ProtocolHealth =
  ## Health of Peer Exchange: ready whenever it is mounted.
  ## The count of Peer-Exchange-capable peers is stored as its strength.
  const kind = WakuProtocol.PeerExchangeProtocol
  var health = ProtocolHealth.init(kind)

  if hm.node.wakuPeerExchange.isNil():
    hm.strength[kind] = 0
    return health.notMounted()

  hm.strength[kind] = hm.countCapablePeers(WakuPeerExchangeCodec)
  return health.ready()

proc getRendezvousHealth(hm: NodeHealthMonitor): ProtocolHealth =
  ## Health of Rendezvous: ready once at least one Rendezvous-capable peer is
  ## known. That peer count is stored as its strength.
  const kind = WakuProtocol.RendezvousProtocol
  var health = ProtocolHealth.init(kind)

  if hm.node.wakuRendezvous.isNil():
    hm.strength[kind] = 0
    return health.notMounted()

  let rendezvousPeers = hm.countCapablePeers(RendezVousCodec)
  hm.strength[kind] = rendezvousPeers

  if rendezvousPeers != 0:
    return health.ready()

  return health.notReady("No Rendezvous peers are available yet")

proc getMixHealth(hm: NodeHealthMonitor): ProtocolHealth =
  ## Health of the Mix protocol: ready whenever it is mounted.
  ## No strength value is recorded for it.
  var health = ProtocolHealth.init(WakuProtocol.MixProtocol)

  if hm.node.wakuMix.isNil():
    health.notMounted()
  else:
    health.ready()

proc getSyncProtocolHealthInfo*(
    hm: NodeHealthMonitor, protocol: WakuProtocol
): ProtocolHealth =
  ## Synchronously compute the `ProtocolHealth` of a single protocol.
  ##
  ## Service protocols that depend on Relay (Filter, Lightpush, legacy
  ## Lightpush) re-evaluate the Relay health first. RLN Relay can only be
  ## checked asynchronously, so it is reported here as not ready.
  case protocol
  of WakuProtocol.RelayProtocol:
    hm.getRelayHealth()
  of WakuProtocol.StoreProtocol:
    hm.getStoreHealth()
  of WakuProtocol.FilterProtocol:
    hm.getFilterHealth(hm.getRelayHealth().health)
  of WakuProtocol.LightpushProtocol:
    hm.getLightpushHealth(hm.getRelayHealth().health)
  of WakuProtocol.LegacyLightpushProtocol:
    hm.getLegacyLightpushHealth(hm.getRelayHealth().health)
  of WakuProtocol.PeerExchangeProtocol:
    hm.getPeerExchangeHealth()
  of WakuProtocol.RendezvousProtocol:
    hm.getRendezvousHealth()
  of WakuProtocol.MixProtocol:
    hm.getMixHealth()
  of WakuProtocol.StoreClientProtocol:
    hm.getStoreClientHealth()
  of WakuProtocol.FilterClientProtocol:
    hm.getFilterClientHealth()
  of WakuProtocol.LightpushClientProtocol:
    hm.getLightpushClientHealth()
  of WakuProtocol.LegacyLightpushClientProtocol:
    hm.getLegacyLightpushClientHealth()
  of WakuProtocol.RlnRelayProtocol:
    # Blocking with waitFor is deliberately avoided so the main thread is not
    # stalled. A cached value from a previous check would be an alternative.
    var asyncOnly = ProtocolHealth.init(protocol)
    asyncOnly.notReady("RLN Relay health check is async")
  else:
    var unknown = ProtocolHealth.init(protocol)
    unknown.notMounted()

proc getProtocolHealthInfo*(
    hm: NodeHealthMonitor, protocol: WakuProtocol
): Future[ProtocolHealth] {.async.} =
  ## Compute the `ProtocolHealth` of a single protocol. RLN Relay is checked
  ## asynchronously; every other protocol uses the synchronous check.
  if protocol == WakuProtocol.RlnRelayProtocol:
    return await hm.getRlnRelayHealth()

  return hm.getSyncProtocolHealthInfo(protocol)

proc getSyncAllProtocolHealthInfo(hm: NodeHealthMonitor): seq[ProtocolHealth] =
  ## Collect the `ProtocolHealth` of every protocol that can be checked
  ## synchronously (everything except RLN Relay). Relay is evaluated once and
  ## its status is shared by the service protocols that depend on it.
  let relayHealth = hm.getRelayHealth()
  let relayStatus = relayHealth.health

  return @[
    relayHealth,
    # service side
    hm.getLightpushHealth(relayStatus),
    hm.getLegacyLightpushHealth(relayStatus),
    hm.getFilterHealth(relayStatus),
    hm.getStoreHealth(),
    hm.getPeerExchangeHealth(),
    hm.getRendezvousHealth(),
    hm.getMixHealth(),
    # client side
    hm.getLightpushClientHealth(),
    hm.getLegacyLightpushClientHealth(),
    hm.getStoreClientHealth(),
    hm.getFilterClientHealth(),
  ]

proc getAllProtocolHealthInfo(
    hm: NodeHealthMonitor
): Future[seq[ProtocolHealth]] {.async.} =
  ## Collect the `ProtocolHealth` of all protocols: the synchronously
  ## checkable ones first, followed by the RLN Relay entry.
  let syncEntries = hm.getSyncAllProtocolHealthInfo()
  let rlnEntry = await hm.getRlnRelayHealth()

  return syncEntries & rlnEntry

proc calculateConnectionState*(
    protocols: seq[ProtocolHealth],
    strength: Table[WakuProtocol, int], ## latest connectivity strength (e.g. peer count) for a protocol
    dLowOpt: Option[int], ## minimum relay peers for Connected status if in Core (Relay) mode
): ConnectionStatus =
  ## Derive the node's connection status from per-protocol health and strength.
  ##
  ## Only protocols whose health is READY contribute. For each protocol family
  ## (relay, store client, lightpush client, filter client) the highest
  ## strength among its READY members is used.
  ##
  ## - With at least one relay peer: `Connected` when `dLowOpt` is set and the
  ##   relay peer count reaches it, otherwise `PartiallyConnected`.
  ## - Without relay peers (Edge check): `Disconnected` unless lightpush,
  ##   filter and store client all have at least one peer; `Connected` when
  ##   all three reach `HealthyThreshold`, otherwise `PartiallyConnected`.
  var
    relayCount = 0
    lightpushCount = 0
    filterCount = 0
    storeClientCount = 0

  for p in protocols:
    # Entries whose name does not map to a known protocol are ignored.
    let kind =
      try:
        parseEnum[WakuProtocol](p.protocol)
      except ValueError:
        continue

    if p.health != HealthStatus.READY:
      continue

    let peerStrength = strength.getOrDefault(kind, 0)

    if kind in RelayProtocols:
      relayCount = max(relayCount, peerStrength)
    elif kind in StoreClientProtocols:
      storeClientCount = max(storeClientCount, peerStrength)
    elif kind in LightpushClientProtocols:
      lightpushCount = max(lightpushCount, peerStrength)
    elif kind in FilterClientProtocols:
      filterCount = max(filterCount, peerStrength)

  debug "calculateConnectionState",
    relayCount, storeClientCount, lightpushCount, filterCount

  # Relay connectivity should be a sufficient check in Core mode.
  # "Store peers" are relay peers because incoming messages in
  # the relay are input to the store server.
  # But if Store server (or client, even) is not mounted as well, this logic assumes
  # the user knows what they're doing.

  if relayCount > 0:
    # The `relayCount > 0` guard matters when dLow is configured as zero (or
    # less): zero relay peers must never be reported as Connected.
    if dLowOpt.isSome() and relayCount >= dLowOpt.get():
      return ConnectionStatus.Connected

    return ConnectionStatus.PartiallyConnected

  # No relay connectivity. Relay might not be mounted, or may just have zero peers.
  # Fall back to Edge check in any case to be sure.

  let canSend = lightpushCount > 0
  let canReceive = filterCount > 0
  let canStore = storeClientCount > 0

  let meetsMinimum = canSend and canReceive and canStore

  if not meetsMinimum:
    return ConnectionStatus.Disconnected

  let isEdgeRobust =
    (lightpushCount >= HealthyThreshold) and (filterCount >= HealthyThreshold) and
    (storeClientCount >= HealthyThreshold)

  if isEdgeRobust:
    return ConnectionStatus.Connected

  return ConnectionStatus.PartiallyConnected

proc calculateConnectionState*(hm: NodeHealthMonitor): ConnectionStatus =
  ## Compute the connection status from the monitor's cached protocol health
  ## and strength table. The relay's `dLow` parameter is passed along only
  ## when a relay is mounted.
  let relay = hm.node.wakuRelay

  var dLow = none(int)
  if not relay.isNil():
    dLow = some(relay.parameters.dLow)

  calculateConnectionState(hm.cachedProtocols, hm.strength, dLow)

proc getNodeHealthReport*(hm: NodeHealthMonitor): Future[HealthReport] {.async.} =
  ## Build a `HealthReport` covering all protocols.
  ##
  ## While the node is INITIALIZING or SHUTTING_DOWN, only that state and a
  ## Disconnected status are reported. Otherwise the cached protocol states are
  ## returned, computing them first when the cache is still empty.
  var report: HealthReport

  let inTransition =
    hm.nodeHealth in [HealthStatus.INITIALIZING, HealthStatus.SHUTTING_DOWN]
  if inTransition:
    report.nodeHealth = hm.nodeHealth
    report.connectionStatus = ConnectionStatus.Disconnected
    return report

  # Nothing cached yet: evaluate every protocol once so the report is not empty.
  if hm.cachedProtocols.len == 0:
    hm.cachedProtocols = await hm.getAllProtocolHealthInfo()
    hm.connectionStatus = hm.calculateConnectionState()

  if hm.eventLoopLagExceeded:
    report.nodeHealth = HealthStatus.EVENT_LOOP_LAGGING
  else:
    report.nodeHealth = HealthStatus.READY

  report.protocolsHealth = hm.cachedProtocols
  report.connectionStatus = hm.connectionStatus
  return report

proc getSyncNodeHealthReport*(hm: NodeHealthMonitor): HealthReport =
  ## Build a `HealthReport` without awaiting anything.
  ##
  ## While the node is INITIALIZING or SHUTTING_DOWN, only that state and a
  ## Disconnected status are reported. Otherwise the cached protocol states are
  ## returned. When the cache is still empty it is filled with the subset of
  ## protocols that can be checked synchronously.
  var report: HealthReport

  let inTransition =
    hm.nodeHealth in [HealthStatus.INITIALIZING, HealthStatus.SHUTTING_DOWN]
  if inTransition:
    report.nodeHealth = hm.nodeHealth
    report.connectionStatus = ConnectionStatus.Disconnected
    return report

  # Nothing cached yet: evaluate the synchronous protocols once.
  if hm.cachedProtocols.len == 0:
    hm.cachedProtocols = hm.getSyncAllProtocolHealthInfo()
    hm.connectionStatus = hm.calculateConnectionState()

  if hm.eventLoopLagExceeded:
    report.nodeHealth = HealthStatus.EVENT_LOOP_LAGGING
  else:
    report.nodeHealth = HealthStatus.READY

  report.protocolsHealth = hm.cachedProtocols
  report.connectionStatus = hm.connectionStatus
  return report

proc onRelayMsg(
    hm: NodeHealthMonitor, peer: PubSubPeer, msg: var RPCMsg
) {.gcsafe, raises: [].} =
  ## Decide whether a received Relay RPC may affect health in Core (Relay)
  ## mode and, if so, request a health re-evaluation.
  ##
  ## In Core (Relay) mode the relay protocol state dominates connectivity
  ## health, and a remote relay can only influence it through a subscription
  ## change or a mesh (GRAFT/PRUNE) change. Any other message is ignored.
  ##
  var relevant = msg.subscriptions.len != 0

  if not relevant and msg.control.isSome():
    let control = msg.control.get()
    relevant = control.graft.len != 0 or control.prune.len != 0

  if relevant:
    hm.healthUpdateEvent.fire()

proc healthLoop(hm: NodeHealthMonitor) {.async.} =
  ## Re-evaluate the global health state of the node when notified of a potential change,
  ## and call back the application if an actual change from the last notified state happened.
  ##
  ## Runs until cancelled. Each round waits for `healthUpdateEvent`, refreshes
  ## `cachedProtocols`, and, if the derived connection status differs from the
  ## stored one, stores it, emits `EventConnectionStatusChange` and awaits
  ## `onConnectionStatusChange` when a handler is set.
  info "Health monitor loop start"
  while true:
    try:
      await hm.healthUpdateEvent.wait()
      # Clear before recomputing, so a notification fired while the recomputation
      # below is awaiting stays set and triggers another round.
      hm.healthUpdateEvent.clear()

      hm.cachedProtocols = await hm.getAllProtocolHealthInfo()
      let newConnectionStatus = hm.calculateConnectionState()

      if newConnectionStatus != hm.connectionStatus:
        debug "connectionStatus change",
          oldstatus = hm.connectionStatus, newstatus = newConnectionStatus

        hm.connectionStatus = newConnectionStatus

        EventConnectionStatusChange.emit(hm.node.brokerCtx, newConnectionStatus)

        if not isNil(hm.onConnectionStatusChange):
          await hm.onConnectionStatusChange(newConnectionStatus)
    except CancelledError:
      # Cancellation (see stopHealthMonitor) ends the loop.
      break
    except Exception as e:
      # Any other failure is logged and the loop keeps running.
      error "HealthMonitor: error in update loop", error = e.msg

    # safety cooldown to protect from edge cases
    # (it also bounds how often the health state is recomputed)
    await sleepAsync(100.milliseconds)

  info "Health monitor loop end"

proc selectRandomPeersForKeepalive(
    node: WakuNode, outPeers: seq[PeerId], numRandomPeers: int
): Future[seq[PeerId]] {.async.} =
  ## Pick the peers to ping in a "random" keepalive round.
  ##
  ## All relay mesh peers are always selected. The remaining quota (up to
  ## `numRandomPeers` in total) is filled with randomly chosen peers from
  ## `outPeers` that are not in the mesh. Without a relay, or when the mesh
  ## cannot be read, the choice is delegated to `selectRandomPeers`.

  let relay = node.wakuRelay
  if relay.isNil():
    return selectRandomPeers(outPeers, numRandomPeers)

  let inMesh = relay.getPeersInMesh().valueOr:
    error "Failed getting peers in mesh for ping", error = error
    # Mesh unavailable: pick randomly among all outgoing peers instead
    return selectRandomPeers(outPeers, numRandomPeers)

  trace "Mesh peers for keepalive", meshPeers = inMesh

  # Shuffle the outgoing peers that are not part of the mesh ...
  var extras = outPeers.filterIt(it notin inMesh)
  shuffle(extras)

  # ... and keep only as many as are needed to reach numRandomPeers overall
  let quota = max(0, numRandomPeers - inMesh.len)
  extras.setLen(min(extras.len, quota))

  let chosen = inMesh & extras
  trace "Selected peers for keepalive", selected = chosen
  return chosen

proc keepAliveLoop(
    node: WakuNode,
    randomPeersKeepalive: chronos.Duration,
    allPeersKeepAlive: chronos.Duration,
    numRandomPeers = 10,
) {.async.} =
  ## Periodically ping connected outgoing peers to keep connections alive.
  ##
  ## Every `randomPeersKeepalive` a subset of peers (mesh peers plus random
  ## ones, see `selectRandomPeersForKeepalive`) is pinged; roughly every
  ## `allPeersKeepAlive` all outgoing peers are pinged instead.
  ##
  ## All peers are disconnected when either
  ## - the loop has not completed an iteration for more than three random
  ##   intervals (e.g. the host was asleep), or
  ## - more than `maxAllowedConsecutiveFailures` consecutive iterations had
  ##   every single ping fail.
  ##
  ## Runs forever; it is stopped by cancelling the returned future.

  # Calculate how many random peer cycles before pinging all peers.
  # The ratio is computed on nanoseconds with integer division:
  # `Duration.seconds()` truncates to whole seconds, so a sub-second random
  # interval would become a zero divisor.
  let randomIntervalNanos = max(1'i64, randomPeersKeepalive.nanoseconds())
  let randomToAllRatio = int(allPeersKeepAlive.nanoseconds() div randomIntervalNanos)
  var countdownToPingAll = max(0, randomToAllRatio - 1)

  # Sleep detection configuration
  let sleepDetectionInterval = 3 * randomPeersKeepalive

  # Failure tracking
  var consecutiveIterationFailures = 0
  const maxAllowedConsecutiveFailures = 2

  var lastTimeExecuted = Moment.now()

  while true:
    trace "Running keepalive loop"
    await sleepAsync(randomPeersKeepalive)

    if not node.started:
      continue

    let currentTime = Moment.now()

    # Check for sleep detection
    if currentTime - lastTimeExecuted > sleepDetectionInterval:
      warn "Keep alive hasn't been executed recently. Killing all connections"
      await node.peerManager.disconnectAllPeers()
      lastTimeExecuted = currentTime
      consecutiveIterationFailures = 0
      continue

    # Check for consecutive failures
    if consecutiveIterationFailures > maxAllowedConsecutiveFailures:
      warn "Too many consecutive ping failures, node likely disconnected. Killing all connections",
        consecutiveIterationFailures, maxAllowedConsecutiveFailures
      await node.peerManager.disconnectAllPeers()
      consecutiveIterationFailures = 0
      lastTimeExecuted = currentTime
      continue

    # Determine which peers to ping
    let outPeers = node.peerManager.connectedPeers()[1]
    let peersToPing =
      if countdownToPingAll > 0:
        await selectRandomPeersForKeepalive(node, outPeers, numRandomPeers)
      else:
        outPeers

    let numPeersToPing = len(peersToPing)

    if countdownToPingAll > 0:
      trace "Pinging random peers",
        count = numPeersToPing, countdownToPingAll = countdownToPingAll
      countdownToPingAll.dec()
    else:
      trace "Pinging all peers", count = numPeersToPing
      countdownToPingAll = max(0, randomToAllRatio - 1)

    # Execute keepalive pings
    let successfulPings = await parallelPings(node, peersToPing)

    if successfulPings != numPeersToPing:
      waku_node_errors.inc(
        amount = numPeersToPing - successfulPings, labelValues = ["keep_alive_failure"]
      )

    trace "Keepalive results",
      attemptedPings = numPeersToPing, successfulPings = successfulPings

    # Update failure tracking: an iteration only counts as failed when there
    # was someone to ping and nobody answered.
    if numPeersToPing > 0 and successfulPings == 0:
      consecutiveIterationFailures.inc()
      error "All pings failed", consecutiveFailures = consecutiveIterationFailures
    else:
      consecutiveIterationFailures = 0

    lastTimeExecuted = currentTime

# 2 minutes default - 20% of the default chronosstream timeout duration
proc startKeepalive*(
    hm: NodeHealthMonitor,
    randomPeersKeepalive = 10.seconds,
    allPeersKeepalive = 2.minutes,
): Result[void, string] =
  ## Validate the keepalive intervals and launch the keepalive loop, storing
  ## its future in `hm.keepAliveFut`.
  ##
  ## Returns an error (and starts nothing) when either interval is zero or
  ## when `allPeersKeepalive` is shorter than `randomPeersKeepalive`.
  let hasZeroInterval = randomPeersKeepalive.isZero() or allPeersKeepalive.isZero()
  if hasZeroInterval:
    error "startKeepalive: allPeersKeepAlive and randomPeersKeepalive must be greater than 0",
      randomPeersKeepalive = $randomPeersKeepalive,
      allPeersKeepAlive = $allPeersKeepalive
    return err(
      "startKeepalive: allPeersKeepAlive and randomPeersKeepalive must be greater than 0"
    )

  let allShorterThanRandom = allPeersKeepalive < randomPeersKeepalive
  if allShorterThanRandom:
    error "startKeepalive: allPeersKeepAlive can't be less than randomPeersKeepalive",
      allPeersKeepAlive = $allPeersKeepalive,
      randomPeersKeepalive = $randomPeersKeepalive
    return
      err("startKeepalive: allPeersKeepAlive can't be less than randomPeersKeepalive")

  info "starting keepalive",
    randomPeersKeepalive = randomPeersKeepalive, allPeersKeepalive = allPeersKeepalive

  # The loop runs in the background; its handle is kept for later cancellation.
  hm.keepAliveFut = keepAliveLoop(hm.node, randomPeersKeepalive, allPeersKeepalive)
  return ok()

proc setOverallHealth*(hm: NodeHealthMonitor, health: HealthStatus) =
  ## Set the overall node health state. The health reports return early with
  ## this state while it is INITIALIZING or SHUTTING_DOWN.
  hm.nodeHealth = health

proc startHealthMonitor*(hm: NodeHealthMonitor): Result[void, string] =
  ## Start all health monitoring activity for the node:
  ## the online monitor, the relay observer (when relay is mounted), the peer
  ## and shard-health event subscriptions, the health loop, the event loop lag
  ## monitor and the keepalive loop.
  ##
  ## Returns an error when the node has no peer manager, when an event
  ## subscription fails, or when the keepalive loop cannot be started.

  # Check the precondition first so that nothing is started on failure.
  if isNil(hm.node.peerManager):
    return err("startHealthMonitor: no node peerManager to monitor")

  # Create the event before registering any callback that fires it, so a
  # callback can never observe an uninitialized event.
  hm.healthUpdateEvent = newAsyncEvent()

  hm.onlineMonitor.startOnlineMonitor()

  if not isNil(hm.node.wakuRelay):
    hm.relayObserver = PubSubObserver(
      onRecv: proc(peer: PubSubPeer, msgs: var RPCMsg) {.gcsafe, raises: [].} =
        hm.onRelayMsg(peer, msgs)
    )
    hm.node.wakuRelay.addObserver(hm.relayObserver)

  hm.peerEventListener = WakuPeerEvent.listen(
    hm.node.brokerCtx,
    proc(evt: WakuPeerEvent): Future[void] {.async: (raises: []), gcsafe.} =
      ## Recompute health on any peer changing anything (join, leave, identify, metadata update)
      hm.healthUpdateEvent.fire(),
  ).valueOr:
    return err("Failed to subscribe to peer events: " & error)

  hm.shardHealthListener = EventShardTopicHealthChange.listen(
    hm.node.brokerCtx,
    proc(
        evt: EventShardTopicHealthChange
    ): Future[void] {.async: (raises: []), gcsafe.} =
      hm.healthUpdateEvent.fire(),
  ).valueOr:
    return err("Failed to subscribe to shard health events: " & error)

  # Trigger an initial evaluation as soon as the health loop starts running.
  hm.healthUpdateEvent.fire()

  hm.healthLoopFut = hm.healthLoop()
  hm.eventLoopMonitorFut = eventLoopMonitorLoop(
    proc(lagTooHigh: bool) {.gcsafe, raises: [].} =
      hm.eventLoopLagExceeded = lagTooHigh
      hm.healthUpdateEvent.fire()
  )

  hm.startKeepalive().isOkOr:
    return err("startHealthMonitor: failed starting keep alive: " & error)

  return ok()

proc stopHealthMonitor*(hm: NodeHealthMonitor) {.async.} =
  ## Stop everything `startHealthMonitor` set up: the online monitor, the
  ## background loops, the event subscriptions and the relay observer.
  if not hm.onlineMonitor.isNil():
    await hm.onlineMonitor.stopOnlineMonitor()

  # Cancel the background loops one after another: keepalive, health loop,
  # event loop monitor. Loops that were never started are skipped.
  for task in [hm.keepAliveFut, hm.healthLoopFut, hm.eventLoopMonitorFut]:
    if not task.isNil():
      await task.cancelAndWait()

  await WakuPeerEvent.dropListener(hm.node.brokerCtx, hm.peerEventListener)
  await EventShardTopicHealthChange.dropListener(
    hm.node.brokerCtx, hm.shardHealthListener
  )

  let relay = hm.node.wakuRelay
  if not (relay.isNil() or hm.relayObserver.isNil()):
    relay.removeObserver(hm.relayObserver)

proc new*(
    T: type NodeHealthMonitor,
    node: WakuNode,
    dnsNameServers = @[parseIpAddress("1.1.1.1"), parseIpAddress("1.0.0.1")],
): T =
  ## Create a health monitor for `node`.
  ##
  ## An online monitor is created from `dnsNameServers`, given the node's peer
  ## store, and the peer manager's online state observer is registered on it.
  ## The monitor starts as INITIALIZING and Disconnected with an empty
  ## strength table. No background task is started here.
  let onlineMonitor = OnlineMonitor.init(dnsNameServers)
  onlineMonitor.setPeerStoreToOnlineMonitor(node.switch.peerStore)
  onlineMonitor.addOnlineStateObserver(node.peerManager.getOnlineStateObserver())

  return T(
    node: node,
    nodeHealth: INITIALIZING,
    connectionStatus: ConnectionStatus.Disconnected,
    onlineMonitor: onlineMonitor,
    strength: initTable[WakuProtocol, int](),
  )