2026-04-09 16:51:46 +02:00
|
|
|
|
{.push raises: [].}
|
|
|
|
|
|
|
2026-04-20 18:05:44 +02:00
|
|
|
|
import std/math
|
2026-04-09 16:51:46 +02:00
|
|
|
|
import chronos, chronicles, metrics
|
|
|
|
|
|
|
|
|
|
|
|
logScope:
|
|
|
|
|
|
topics = "waku event_loop_monitor"
|
|
|
|
|
|
|
2026-04-20 18:05:44 +02:00
|
|
|
|
declarePublicGauge event_loop_load,
|
|
|
|
|
|
"chronos event loop load EWMA by window (1.0 = sustained lag at MaxAcceptedLag)",
|
|
|
|
|
|
labels = ["window"]
|
2026-04-09 16:51:46 +02:00
|
|
|
|
|
2026-04-30 00:27:38 +02:00
|
|
|
|
declarePublicCounter event_loop_accumulated_lag_secs,
|
|
|
|
|
|
"chronos event loop total accumulated lag in seconds since node start"
|
|
|
|
|
|
|
2026-04-09 16:51:46 +02:00
|
|
|
|
type OnLagChange* = proc(lagTooHigh: bool) {.gcsafe, raises: [].}
|
|
|
|
|
|
|
|
|
|
|
|
proc eventLoopMonitorLoop*(onLagChange: OnLagChange = nil) {.async.} =
|
2026-04-20 18:05:44 +02:00
|
|
|
|
## Monitors chronos event loop responsiveness by measuring how much each
|
|
|
|
|
|
## iteration oversleeps its `CheckInterval`.
|
2026-04-09 16:51:46 +02:00
|
|
|
|
##
|
2026-04-20 18:05:44 +02:00
|
|
|
|
## The lag is normalised against `MaxAcceptedLag` and tracked as an EWMA
|
|
|
|
|
|
## over 1, 5, and 15-minute windows (Unix load-average decay model),
|
|
|
|
|
|
## exposed via the `event_loop_load` gauge (labelled by window: 1m/5m/15m):
|
2026-04-09 16:51:46 +02:00
|
|
|
|
##
|
2026-04-20 18:05:44 +02:00
|
|
|
|
## load < 1.0 → within budget
|
|
|
|
|
|
## load = 1.0 → sustained lag at MaxAcceptedLag (fully loaded)
|
|
|
|
|
|
## load > 1.0 → over budget; e.g. 2.0 means twice the accepted lag
|
2026-04-09 16:51:46 +02:00
|
|
|
|
##
|
2026-04-20 18:05:44 +02:00
|
|
|
|
## `onLagChange` is called when instantaneous lag crosses `MaxAcceptedLag`.
|
|
|
|
|
|
|
|
|
|
|
|
const CheckInterval = 5.seconds
|
|
|
|
|
|
const MaxAcceptedLag = 50.milliseconds
|
|
|
|
|
|
|
|
|
|
|
|
# Decay factors: α = 1 − e^(−CheckInterval_secs / window_secs)
|
|
|
|
|
|
# Mirrors the Unix load-average convention so each EWMA has a half-life equal
|
|
|
|
|
|
# to its named window.
|
|
|
|
|
|
const alpha1m = 1.0 - exp(-5.0 / 60.0) # ≈ 0.0821
|
|
|
|
|
|
const alpha5m = 1.0 - exp(-5.0 / 300.0) # ≈ 0.0165
|
|
|
|
|
|
const alpha15m = 1.0 - exp(-5.0 / 900.0) # ≈ 0.0055
|
2026-04-09 16:51:46 +02:00
|
|
|
|
|
2026-04-20 18:05:44 +02:00
|
|
|
|
var ewma1m = 0.0
|
|
|
|
|
|
var ewma5m = 0.0
|
|
|
|
|
|
var ewma15m = 0.0
|
|
|
|
|
|
|
|
|
|
|
|
var now = Moment.now()
|
2026-04-09 16:51:46 +02:00
|
|
|
|
var lagWasHigh = false
|
2026-04-20 18:05:44 +02:00
|
|
|
|
|
2026-04-09 16:51:46 +02:00
|
|
|
|
while true:
|
2026-04-20 18:05:44 +02:00
|
|
|
|
let lastWakeup = now
|
2026-04-09 16:51:46 +02:00
|
|
|
|
await sleepAsync(CheckInterval)
|
2026-04-20 18:05:44 +02:00
|
|
|
|
now = Moment.now()
|
2026-04-09 16:51:46 +02:00
|
|
|
|
|
|
|
|
|
|
let actualElapsed = now - lastWakeup
|
2026-04-20 18:05:44 +02:00
|
|
|
|
let lag = max(ZeroDuration, actualElapsed - CheckInterval)
|
|
|
|
|
|
const maxAcceptedLagSecs = MaxAcceptedLag.nanoseconds.float64 / 1_000_000_000.0
|
|
|
|
|
|
|
2026-04-09 16:51:46 +02:00
|
|
|
|
let lagSecs = lag.nanoseconds.float64 / 1_000_000_000.0
|
2026-04-20 18:05:44 +02:00
|
|
|
|
let load = lagSecs / maxAcceptedLagSecs
|
|
|
|
|
|
|
2026-04-30 00:27:38 +02:00
|
|
|
|
event_loop_accumulated_lag_secs.inc(lagSecs)
|
|
|
|
|
|
|
2026-04-20 18:05:44 +02:00
|
|
|
|
ewma1m = alpha1m * load + (1.0 - alpha1m) * ewma1m
|
|
|
|
|
|
ewma5m = alpha5m * load + (1.0 - alpha5m) * ewma5m
|
|
|
|
|
|
ewma15m = alpha15m * load + (1.0 - alpha15m) * ewma15m
|
2026-04-09 16:51:46 +02:00
|
|
|
|
|
2026-04-20 18:05:44 +02:00
|
|
|
|
event_loop_load.set(round(ewma1m, 4), labelValues = ["1m"])
|
|
|
|
|
|
event_loop_load.set(round(ewma5m, 4), labelValues = ["5m"])
|
|
|
|
|
|
event_loop_load.set(round(ewma15m, 4), labelValues = ["15m"])
|
2026-04-09 16:51:46 +02:00
|
|
|
|
|
2026-04-20 18:05:44 +02:00
|
|
|
|
let lagIsHigh = lag > MaxAcceptedLag
|
2026-04-09 16:51:46 +02:00
|
|
|
|
|
|
|
|
|
|
if lag > CheckInterval:
|
|
|
|
|
|
warn "chronos event loop severely lagging, many tasks may be accumulating",
|
|
|
|
|
|
expected_secs = CheckInterval.seconds,
|
2026-04-20 18:05:44 +02:00
|
|
|
|
lag_secs = round(lagSecs, 4),
|
|
|
|
|
|
load_1m = round(ewma1m, 4),
|
|
|
|
|
|
load_5m = round(ewma5m, 4),
|
|
|
|
|
|
load_15m = round(ewma15m, 4)
|
2026-04-09 16:51:46 +02:00
|
|
|
|
|
2026-04-20 18:05:44 +02:00
|
|
|
|
if not onLagChange.isNil() and lagIsHigh != lagWasHigh:
|
2026-04-09 16:51:46 +02:00
|
|
|
|
lagWasHigh = lagIsHigh
|
|
|
|
|
|
onLagChange(lagIsHigh)
|