Merge 37757bb65d80636115363df25fa48c6cc3907b6e into 166dc69c390a95efa82f6b439d002eabe712b57f

This commit is contained in:
Ivan FB 2026-04-13 22:12:11 +02:00 committed by GitHub
commit 376159749f
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -1,58 +1,78 @@
{.push raises: [].}
import std/math
import chronos, chronicles, metrics
logScope:
topics = "waku event_loop_monitor"
const CheckInterval = 5.seconds
declarePublicGauge event_loop_lag_seconds,
"chronos event loop lag in seconds: difference between actual and expected wake-up interval"
declarePublicGauge event_loop_load,
"chronos event loop load EWMA by window (1.0 = sustained lag at MaxAcceptedLag)",
labels = ["window"]
type OnLagChange* = proc(lagTooHigh: bool) {.gcsafe, raises: [].}
proc eventLoopMonitorLoop*(onLagChange: OnLagChange = nil) {.async.} =
## Monitors chronos event loop responsiveness.
## Monitors chronos event loop responsiveness by measuring how much each
## iteration oversleeps its `CheckInterval`.
##
## Schedules a task every `CheckInterval`. Because chronos is single-threaded
## and cooperative, the task can only resume after all previously queued work
## completes. The actual elapsed time between iterations therefore reflects
## how saturated the event loop is:
## The lag is normalised against `MaxAcceptedLag` and tracked as an EWMA
## over 1, 5, and 15-minute windows (Unix load-average decay model),
## exposed via the `event_loop_load` gauge (labelled by window: 1m/5m/15m):
##
## actual_elapsed ≈ CheckInterval → loop is healthy
## actual_elapsed >> CheckInterval → tasks are accumulating / loop is stalling
## load < 1.0 → within budget
## load = 1.0 → sustained lag at MaxAcceptedLag (fully loaded)
## load > 1.0 → over budget; e.g. 2.0 means twice the accepted lag
##
## The lag (actual - expected) is exposed via `event_loop_lag_seconds`.
## When lag transitions above or below `CheckInterval`, `onLagChange` is called.
## `onLagChange` is called when instantaneous lag crosses `MaxAcceptedLag`.
var lastWakeup = Moment.now()
const CheckInterval = 5.seconds
const MaxAcceptedLag = 50.milliseconds
# Decay factors: α = 1 e^(CheckInterval_secs / window_secs)
# Mirrors the Unix load-average convention so each EWMA has a half-life equal
# to its named window.
const alpha1m = 1.0 - exp(-5.0 / 60.0) # ≈ 0.0821
const alpha5m = 1.0 - exp(-5.0 / 300.0) # ≈ 0.0165
const alpha15m = 1.0 - exp(-5.0 / 900.0) # ≈ 0.0055
var ewma1m = 0.0
var ewma5m = 0.0
var ewma15m = 0.0
var now = Moment.now()
var lagWasHigh = false
while true:
let lastWakeup = now
await sleepAsync(CheckInterval)
now = Moment.now()
let now = Moment.now()
let actualElapsed = now - lastWakeup
let lag = actualElapsed - CheckInterval
let lag = max(ZeroDuration, actualElapsed - CheckInterval)
const maxAcceptedLagSecs = MaxAcceptedLag.nanoseconds.float64 / 1_000_000_000.0
let lagSecs = lag.nanoseconds.float64 / 1_000_000_000.0
let load = lagSecs / maxAcceptedLagSecs
event_loop_lag_seconds.set(lagSecs)
ewma1m = alpha1m * load + (1.0 - alpha1m) * ewma1m
ewma5m = alpha5m * load + (1.0 - alpha5m) * ewma5m
ewma15m = alpha15m * load + (1.0 - alpha15m) * ewma15m
let lagIsHigh = lag > CheckInterval
event_loop_load.set(round(ewma1m, 4), labelValues = ["1m"])
event_loop_load.set(round(ewma5m, 4), labelValues = ["5m"])
event_loop_load.set(round(ewma15m, 4), labelValues = ["15m"])
let lagIsHigh = lag > MaxAcceptedLag
if lag > CheckInterval:
warn "chronos event loop severely lagging, many tasks may be accumulating",
expected_secs = CheckInterval.seconds,
actual_secs = actualElapsed.nanoseconds.float64 / 1_000_000_000.0,
lag_secs = lagSecs
elif lag > (CheckInterval div 2):
info "chronos event loop lag detected",
expected_secs = CheckInterval.seconds,
actual_secs = actualElapsed.nanoseconds.float64 / 1_000_000_000.0,
lag_secs = lagSecs
lag_secs = round(lagSecs, 4),
load_1m = round(ewma1m, 4),
load_5m = round(ewma5m, 4),
load_15m = round(ewma15m, 4)
if not isNil(onLagChange) and lagIsHigh != lagWasHigh:
if not onLagChange.isNil() and lagIsHigh != lagWasHigh:
lagWasHigh = lagIsHigh
onLagChange(lagIsHigh)
lastWakeup = now