logos-delivery/waku/node/health_monitor/event_loop_monitor.nim
Ivan FB ca7ec3de05
add main loop lag monitor (#3803)
* add loop lagging as health status
2026-04-09 16:51:46 +02:00

59 lines
2.0 KiB
Nim

{.push raises: [].}
import chronos, chronicles, metrics
logScope:
topics = "waku event_loop_monitor"
const CheckInterval = 5.seconds
declarePublicGauge event_loop_lag_seconds,
"chronos event loop lag in seconds: difference between actual and expected wake-up interval"
type OnLagChange* = proc(lagTooHigh: bool) {.gcsafe, raises: [].}
proc eventLoopMonitorLoop*(onLagChange: OnLagChange = nil) {.async.} =
## Monitors chronos event loop responsiveness.
##
## Schedules a task every `CheckInterval`. Because chronos is single-threaded
## and cooperative, the task can only resume after all previously queued work
## completes. The actual elapsed time between iterations therefore reflects
## how saturated the event loop is:
##
## actual_elapsed ≈ CheckInterval → loop is healthy
## actual_elapsed >> CheckInterval → tasks are accumulating / loop is stalling
##
## The lag (actual - expected) is exposed via `event_loop_lag_seconds`.
## When lag transitions above or below `CheckInterval`, `onLagChange` is called.
var lastWakeup = Moment.now()
var lagWasHigh = false
while true:
await sleepAsync(CheckInterval)
let now = Moment.now()
let actualElapsed = now - lastWakeup
let lag = actualElapsed - CheckInterval
let lagSecs = lag.nanoseconds.float64 / 1_000_000_000.0
event_loop_lag_seconds.set(lagSecs)
let lagIsHigh = lag > CheckInterval
if lag > CheckInterval:
warn "chronos event loop severely lagging, many tasks may be accumulating",
expected_secs = CheckInterval.seconds,
actual_secs = actualElapsed.nanoseconds.float64 / 1_000_000_000.0,
lag_secs = lagSecs
elif lag > (CheckInterval div 2):
info "chronos event loop lag detected",
expected_secs = CheckInterval.seconds,
actual_secs = actualElapsed.nanoseconds.float64 / 1_000_000_000.0,
lag_secs = lagSecs
if not isNil(onLagChange) and lagIsHigh != lagWasHigh:
lagWasHigh = lagIsHigh
onLagChange(lagIsHigh)
lastWakeup = now