add event loop lagging as a health status

This commit is contained in:
Ivan FB 2026-04-08 14:45:56 +02:00
parent 36e77c5d89
commit 66513cbed4
No known key found for this signature in database
GPG Key ID: DF0C67A04C543270
3 changed files with 24 additions and 4 deletions

View File

@ -10,7 +10,9 @@ const CheckInterval = 5.seconds
declarePublicGauge event_loop_lag_seconds,
"chronos event loop lag in seconds: difference between actual and expected wake-up interval"
proc eventLoopMonitorLoop*() {.async.} =
type OnLagChange* = proc(lagTooHigh: bool) {.gcsafe, raises: [].}
proc eventLoopMonitorLoop*(onLagChange: OnLagChange = nil) {.async.} =
## Monitors chronos event loop responsiveness.
##
## Schedules a task every `CheckInterval`. Because chronos is single-threaded
@ -22,8 +24,10 @@ proc eventLoopMonitorLoop*() {.async.} =
## actual_elapsed >> CheckInterval → tasks are accumulating / loop is stalling
##
## The lag (actual - expected) is exposed via `event_loop_lag_seconds`.
## When lag transitions above or below `CheckInterval`, `onLagChange` is called.
var lastWakeup = Moment.now()
var lagWasHigh = false
while true:
await sleepAsync(CheckInterval)
@ -34,6 +38,8 @@ proc eventLoopMonitorLoop*() {.async.} =
event_loop_lag_seconds.set(lagSecs)
let lagIsHigh = lag > CheckInterval
if lag > CheckInterval:
warn "chronos event loop severely lagging, many tasks may be accumulating",
expected_secs = CheckInterval.seconds,
@ -45,4 +51,8 @@ proc eventLoopMonitorLoop*() {.async.} =
actual_secs = actualElapsed.nanoseconds.float64 / 1_000_000_000.0,
lag_secs = lagSecs
if not isNil(onLagChange) and lagIsHigh != lagWasHigh:
lagWasHigh = lagIsHigh
onLagChange(lagIsHigh)
lastWakeup = now

View File

@ -7,6 +7,7 @@ type HealthStatus* {.pure.} = enum
NOT_READY
NOT_MOUNTED
SHUTTING_DOWN
EVENT_LOOP_LAGGING
proc init*(t: typedesc[HealthStatus], strRep: string): Result[HealthStatus, string] =
try:

View File

@ -50,6 +50,9 @@ type NodeHealthMonitor* = ref object
relayObserver: PubSubObserver
peerEventListener: WakuPeerEventListener
shardHealthListener: EventShardTopicHealthChangeListener
eventLoopLagExceeded: bool
## set to true when the chronos event loop lag exceeds the severe threshold,
## causing the node health to be reported as EVENT_LOOP_LAGGING until lag recovers.
func getHealth*(report: HealthReport, kind: WakuProtocol): ProtocolHealth =
for h in report.protocolsHealth:
@ -443,7 +446,8 @@ proc getNodeHealthReport*(hm: NodeHealthMonitor): Future[HealthReport] {.async.}
hm.cachedProtocols = await hm.getAllProtocolHealthInfo()
hm.connectionStatus = hm.calculateConnectionState()
report.nodeHealth = HealthStatus.READY
report.nodeHealth =
if hm.eventLoopLagExceeded: HealthStatus.EVENT_LOOP_LAGGING else: HealthStatus.READY
report.connectionStatus = hm.connectionStatus
report.protocolsHealth = hm.cachedProtocols
return report
@ -463,7 +467,8 @@ proc getSyncNodeHealthReport*(hm: NodeHealthMonitor): HealthReport =
hm.cachedProtocols = hm.getSyncAllProtocolHealthInfo()
hm.connectionStatus = hm.calculateConnectionState()
report.nodeHealth = HealthStatus.READY
report.nodeHealth =
if hm.eventLoopLagExceeded: HealthStatus.EVENT_LOOP_LAGGING else: HealthStatus.READY
report.connectionStatus = hm.connectionStatus
report.protocolsHealth = hm.cachedProtocols
return report
@ -696,7 +701,11 @@ proc startHealthMonitor*(hm: NodeHealthMonitor): Result[void, string] =
hm.healthUpdateEvent.fire()
hm.healthLoopFut = hm.healthLoop()
hm.eventLoopMonitorFut = eventLoopMonitorLoop()
hm.eventLoopMonitorFut = eventLoopMonitorLoop(
proc(lagTooHigh: bool) {.gcsafe, raises: [].} =
hm.eventLoopLagExceeded = lagTooHigh
hm.healthUpdateEvent.fire()
)
hm.startKeepalive().isOkOr:
return err("startHealthMonitor: failed starting keep alive: " & error)