diff --git a/waku/node/health_monitor/event_loop_monitor.nim b/waku/node/health_monitor/event_loop_monitor.nim index e423cd2ec..d4b8d98d2 100644 --- a/waku/node/health_monitor/event_loop_monitor.nim +++ b/waku/node/health_monitor/event_loop_monitor.nim @@ -10,7 +10,9 @@ const CheckInterval = 5.seconds declarePublicGauge event_loop_lag_seconds, "chronos event loop lag in seconds: difference between actual and expected wake-up interval" -proc eventLoopMonitorLoop*() {.async.} = +type OnLagChange* = proc(lagTooHigh: bool) {.gcsafe, raises: [].} + +proc eventLoopMonitorLoop*(onLagChange: OnLagChange = nil) {.async.} = ## Monitors chronos event loop responsiveness. ## ## Schedules a task every `CheckInterval`. Because chronos is single-threaded @@ -22,8 +24,10 @@ proc eventLoopMonitorLoop*() {.async.} = ## actual_elapsed >> CheckInterval → tasks are accumulating / loop is stalling ## ## The lag (actual - expected) is exposed via `event_loop_lag_seconds`. + ## When lag transitions above or below `CheckInterval`, `onLagChange` is called. var lastWakeup = Moment.now() + var lagWasHigh = false while true: await sleepAsync(CheckInterval) @@ -34,6 +38,8 @@ proc eventLoopMonitorLoop*() {.async.} = event_loop_lag_seconds.set(lagSecs) + let lagIsHigh = lag > CheckInterval + if lag > CheckInterval: warn "chronos event loop severely lagging, many tasks may be accumulating", expected_secs = CheckInterval.seconds, @@ -45,4 +51,8 @@ proc eventLoopMonitorLoop*() {.async.} = actual_secs = actualElapsed.nanoseconds.float64 / 1_000_000_000.0, lag_secs = lagSecs + if not isNil(onLagChange) and lagIsHigh != lagWasHigh: + lagWasHigh = lagIsHigh + onLagChange(lagIsHigh) + lastWakeup = now diff --git a/waku/node/health_monitor/health_status.nim b/waku/node/health_monitor/health_status.nim index 4dd2bdd9a..91663a507 100644 --- a/waku/node/health_monitor/health_status.nim +++ b/waku/node/health_monitor/health_status.nim @@ -7,6 +7,7 @@ type HealthStatus* {.pure.} = enum NOT_READY NOT_MOUNTED SHUTTING_DOWN + EVENT_LOOP_LAGGING proc init*(t: typedesc[HealthStatus], strRep: string): Result[HealthStatus, string] = try: diff --git a/waku/node/health_monitor/node_health_monitor.nim b/waku/node/health_monitor/node_health_monitor.nim index 966dc2fcc..c92dc1aaf 100644 --- a/waku/node/health_monitor/node_health_monitor.nim +++ b/waku/node/health_monitor/node_health_monitor.nim @@ -50,6 +50,9 @@ type NodeHealthMonitor* = ref object relayObserver: PubSubObserver peerEventListener: WakuPeerEventListener shardHealthListener: EventShardTopicHealthChangeListener + eventLoopLagExceeded: bool + ## set to true when the chronos event loop lag exceeds the severe threshold, + ## causing the node health to be reported as EVENT_LOOP_LAGGING until lag recovers. func getHealth*(report: HealthReport, kind: WakuProtocol): ProtocolHealth = for h in report.protocolsHealth: @@ -443,7 +446,8 @@ proc getNodeHealthReport*(hm: NodeHealthMonitor): Future[HealthReport] {.async.} hm.cachedProtocols = await hm.getAllProtocolHealthInfo() hm.connectionStatus = hm.calculateConnectionState() - report.nodeHealth = HealthStatus.READY + report.nodeHealth = + if hm.eventLoopLagExceeded: HealthStatus.EVENT_LOOP_LAGGING else: HealthStatus.READY report.connectionStatus = hm.connectionStatus report.protocolsHealth = hm.cachedProtocols return report @@ -463,7 +467,8 @@ proc getSyncNodeHealthReport*(hm: NodeHealthMonitor): HealthReport = hm.cachedProtocols = hm.getSyncAllProtocolHealthInfo() hm.connectionStatus = hm.calculateConnectionState() - report.nodeHealth = HealthStatus.READY + report.nodeHealth = + if hm.eventLoopLagExceeded: HealthStatus.EVENT_LOOP_LAGGING else: HealthStatus.READY report.connectionStatus = hm.connectionStatus report.protocolsHealth = hm.cachedProtocols return report @@ -696,7 +701,11 @@ proc startHealthMonitor*(hm: NodeHealthMonitor): Result[void, string] = hm.healthUpdateEvent.fire() hm.healthLoopFut = hm.healthLoop() - hm.eventLoopMonitorFut = eventLoopMonitorLoop() + hm.eventLoopMonitorFut = eventLoopMonitorLoop( + proc(lagTooHigh: bool) {.gcsafe, raises: [].} = + hm.eventLoopLagExceeded = lagTooHigh + hm.healthUpdateEvent.fire() + ) hm.startKeepalive().isOkOr: return err("startHealthMonitor: failed starting keep alive: " & error)