mirror of
https://github.com/logos-messaging/logos-delivery.git
synced 2026-04-14 04:03:20 +00:00
add loop lagging as health status
This commit is contained in:
parent
36e77c5d89
commit
66513cbed4
@ -10,7 +10,9 @@ const CheckInterval = 5.seconds
|
||||
declarePublicGauge event_loop_lag_seconds,
|
||||
"chronos event loop lag in seconds: difference between actual and expected wake-up interval"
|
||||
|
||||
proc eventLoopMonitorLoop*() {.async.} =
|
||||
type OnLagChange* = proc(lagTooHigh: bool) {.gcsafe, raises: [].}
|
||||
|
||||
proc eventLoopMonitorLoop*(onLagChange: OnLagChange = nil) {.async.} =
|
||||
## Monitors chronos event loop responsiveness.
|
||||
##
|
||||
## Schedules a task every `CheckInterval`. Because chronos is single-threaded
|
||||
@ -22,8 +24,10 @@ proc eventLoopMonitorLoop*() {.async.} =
|
||||
## actual_elapsed >> CheckInterval → tasks are accumulating / loop is stalling
|
||||
##
|
||||
## The lag (actual - expected) is exposed via `event_loop_lag_seconds`.
|
||||
## When lag transitions above or below `CheckInterval`, `onLagChange` is called.
|
||||
|
||||
var lastWakeup = Moment.now()
|
||||
var lagWasHigh = false
|
||||
while true:
|
||||
await sleepAsync(CheckInterval)
|
||||
|
||||
@ -34,6 +38,8 @@ proc eventLoopMonitorLoop*() {.async.} =
|
||||
|
||||
event_loop_lag_seconds.set(lagSecs)
|
||||
|
||||
let lagIsHigh = lag > CheckInterval
|
||||
|
||||
if lag > CheckInterval:
|
||||
warn "chronos event loop severely lagging, many tasks may be accumulating",
|
||||
expected_secs = CheckInterval.seconds,
|
||||
@ -45,4 +51,8 @@ proc eventLoopMonitorLoop*() {.async.} =
|
||||
actual_secs = actualElapsed.nanoseconds.float64 / 1_000_000_000.0,
|
||||
lag_secs = lagSecs
|
||||
|
||||
if not isNil(onLagChange) and lagIsHigh != lagWasHigh:
|
||||
lagWasHigh = lagIsHigh
|
||||
onLagChange(lagIsHigh)
|
||||
|
||||
lastWakeup = now
|
||||
|
||||
@ -7,6 +7,7 @@ type HealthStatus* {.pure.} = enum
|
||||
NOT_READY
|
||||
NOT_MOUNTED
|
||||
SHUTTING_DOWN
|
||||
EVENT_LOOP_LAGGING
|
||||
|
||||
proc init*(t: typedesc[HealthStatus], strRep: string): Result[HealthStatus, string] =
|
||||
try:
|
||||
|
||||
@ -50,6 +50,9 @@ type NodeHealthMonitor* = ref object
|
||||
relayObserver: PubSubObserver
|
||||
peerEventListener: WakuPeerEventListener
|
||||
shardHealthListener: EventShardTopicHealthChangeListener
|
||||
eventLoopLagExceeded: bool
|
||||
## set to true when the chronos event loop lag exceeds the severe threshold,
|
||||
## causing the node health to be reported as EVENT_LOOP_LAGGING until lag recovers.
|
||||
|
||||
func getHealth*(report: HealthReport, kind: WakuProtocol): ProtocolHealth =
|
||||
for h in report.protocolsHealth:
|
||||
@ -443,7 +446,8 @@ proc getNodeHealthReport*(hm: NodeHealthMonitor): Future[HealthReport] {.async.}
|
||||
hm.cachedProtocols = await hm.getAllProtocolHealthInfo()
|
||||
hm.connectionStatus = hm.calculateConnectionState()
|
||||
|
||||
report.nodeHealth = HealthStatus.READY
|
||||
report.nodeHealth =
|
||||
if hm.eventLoopLagExceeded: HealthStatus.EVENT_LOOP_LAGGING else: HealthStatus.READY
|
||||
report.connectionStatus = hm.connectionStatus
|
||||
report.protocolsHealth = hm.cachedProtocols
|
||||
return report
|
||||
@ -463,7 +467,8 @@ proc getSyncNodeHealthReport*(hm: NodeHealthMonitor): HealthReport =
|
||||
hm.cachedProtocols = hm.getSyncAllProtocolHealthInfo()
|
||||
hm.connectionStatus = hm.calculateConnectionState()
|
||||
|
||||
report.nodeHealth = HealthStatus.READY
|
||||
report.nodeHealth =
|
||||
if hm.eventLoopLagExceeded: HealthStatus.EVENT_LOOP_LAGGING else: HealthStatus.READY
|
||||
report.connectionStatus = hm.connectionStatus
|
||||
report.protocolsHealth = hm.cachedProtocols
|
||||
return report
|
||||
@ -696,7 +701,11 @@ proc startHealthMonitor*(hm: NodeHealthMonitor): Result[void, string] =
|
||||
hm.healthUpdateEvent.fire()
|
||||
|
||||
hm.healthLoopFut = hm.healthLoop()
|
||||
hm.eventLoopMonitorFut = eventLoopMonitorLoop()
|
||||
hm.eventLoopMonitorFut = eventLoopMonitorLoop(
|
||||
proc(lagTooHigh: bool) {.gcsafe, raises: [].} =
|
||||
hm.eventLoopLagExceeded = lagTooHigh
|
||||
hm.healthUpdateEvent.fire()
|
||||
)
|
||||
|
||||
hm.startKeepalive().isOkOr:
|
||||
return err("startHealthMonitor: failed starting keep alive: " & error)
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user