diff --git a/waku/node/health_monitor/event_loop_monitor.nim b/waku/node/health_monitor/event_loop_monitor.nim index d4b8d98d2..49a2bcc6e 100644 --- a/waku/node/health_monitor/event_loop_monitor.nim +++ b/waku/node/health_monitor/event_loop_monitor.nim @@ -1,58 +1,78 @@ {.push raises: [].} +import std/math import chronos, chronicles, metrics logScope: topics = "waku event_loop_monitor" -const CheckInterval = 5.seconds - -declarePublicGauge event_loop_lag_seconds, - "chronos event loop lag in seconds: difference between actual and expected wake-up interval" +declarePublicGauge event_loop_load, + "chronos event loop load EWMA by window (1.0 = sustained lag at MaxAcceptedLag)", + labels = ["window"] type OnLagChange* = proc(lagTooHigh: bool) {.gcsafe, raises: [].} proc eventLoopMonitorLoop*(onLagChange: OnLagChange = nil) {.async.} = - ## Monitors chronos event loop responsiveness. + ## Monitors chronos event loop responsiveness by measuring how much each + ## iteration oversleeps its `CheckInterval`. ## - ## Schedules a task every `CheckInterval`. Because chronos is single-threaded - ## and cooperative, the task can only resume after all previously queued work - ## completes. The actual elapsed time between iterations therefore reflects - ## how saturated the event loop is: + ## The lag is normalised against `MaxAcceptedLag` and tracked as an EWMA + ## over 1, 5, and 15-minute windows (Unix load-average decay model), + ## exposed via the `event_loop_load` gauge (labelled by window: 1m/5m/15m): ## - ## actual_elapsed ≈ CheckInterval → loop is healthy - ## actual_elapsed >> CheckInterval → tasks are accumulating / loop is stalling + ## load < 1.0 → within budget + ## load = 1.0 → sustained lag at MaxAcceptedLag (fully loaded) + ## load > 1.0 → over budget; e.g. 2.0 means twice the accepted lag ## - ## The lag (actual - expected) is exposed via `event_loop_lag_seconds`. - ## When lag transitions above or below `CheckInterval`, `onLagChange` is called. + ## `onLagChange` is called when instantaneous lag crosses `MaxAcceptedLag`. - var lastWakeup = Moment.now() + const CheckInterval = 5.seconds + const MaxAcceptedLag = 50.milliseconds + + # Decay factors: α = 1 − e^(−CheckInterval_secs / window_secs) + # Mirrors the Unix load-average convention so each EWMA has a half-life equal + # to its named window. + const alpha1m = 1.0 - exp(-5.0 / 60.0) # ≈ 0.0821 + const alpha5m = 1.0 - exp(-5.0 / 300.0) # ≈ 0.0165 + const alpha15m = 1.0 - exp(-5.0 / 900.0) # ≈ 0.0055 + + var ewma1m = 0.0 + var ewma5m = 0.0 + var ewma15m = 0.0 + + var now = Moment.now() var lagWasHigh = false + while true: + let lastWakeup = now await sleepAsync(CheckInterval) + now = Moment.now() - let now = Moment.now() let actualElapsed = now - lastWakeup - let lag = actualElapsed - CheckInterval + let lag = max(ZeroDuration, actualElapsed - CheckInterval) + const maxAcceptedLagSecs = MaxAcceptedLag.nanoseconds.float64 / 1_000_000_000.0 + let lagSecs = lag.nanoseconds.float64 / 1_000_000_000.0 + let load = lagSecs / maxAcceptedLagSecs - event_loop_lag_seconds.set(lagSecs) + ewma1m = alpha1m * load + (1.0 - alpha1m) * ewma1m + ewma5m = alpha5m * load + (1.0 - alpha5m) * ewma5m + ewma15m = alpha15m * load + (1.0 - alpha15m) * ewma15m - let lagIsHigh = lag > CheckInterval + event_loop_load.set(round(ewma1m, 4), labelValues = ["1m"]) + event_loop_load.set(round(ewma5m, 4), labelValues = ["5m"]) + event_loop_load.set(round(ewma15m, 4), labelValues = ["15m"]) + + let lagIsHigh = lag > MaxAcceptedLag if lag > CheckInterval: warn "chronos event loop severely lagging, many tasks may be accumulating", expected_secs = CheckInterval.seconds, - actual_secs = actualElapsed.nanoseconds.float64 / 1_000_000_000.0, - lag_secs = lagSecs - elif lag > (CheckInterval div 2): - info "chronos event loop lag detected", - expected_secs = CheckInterval.seconds, - actual_secs = actualElapsed.nanoseconds.float64 / 1_000_000_000.0, - lag_secs = lagSecs + lag_secs = round(lagSecs, 4), + load_1m = round(ewma1m, 4), + load_5m = round(ewma5m, 4), + load_15m = round(ewma15m, 4) - if not isNil(onLagChange) and lagIsHigh != lagWasHigh: + if not onLagChange.isNil() and lagIsHigh != lagWasHigh: lagWasHigh = lagIsHigh onLagChange(lagIsHigh) - - lastWakeup = now