chore: Separation of node health and initialization state from rln_relay (#2612)

* Separation of node health and initialization state from rln_relay status. Make (only) health endpoint avail early and install others in the last stage of node setup.

* Proper json report from /health, adjusted and fixed test, added convenient script for checking node health

* Stop wakunode2 if configured rest server cannot be started

* Fix wakuRlnRelay protocol existence check

* Fix typo

* Removed unused imports from touched files.

* Added missing /health test for all
This commit is contained in:
NagyZoltanPeter 2024-04-23 18:53:18 +02:00 committed by GitHub
parent 1a23700d54
commit 6d135b0d1b
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
10 changed files with 391 additions and 103 deletions

View File

@ -18,7 +18,8 @@ import
../../waku/common/logging,
../../waku/factory/external_config,
../../waku/factory/networks_config,
../../waku/factory/app
../../waku/factory/app,
../../waku/node/health_monitor
logScope:
topics = "wakunode main"
@ -119,23 +120,43 @@ when isMainModule:
info "Running nwaku node", version = app.git_version
logConfig(conf)
# NOTE: {.threadvar.} is used to make the global variable GC safe for the closure uses it
# It will always be called from main thread anyway.
# Ref: https://nim-lang.org/docs/manual.html#threads-gc-safety
var nodeHealthMonitor {.threadvar.}: WakuNodeHealthMonitor
nodeHealthMonitor = WakuNodeHealthMonitor()
nodeHealthMonitor.setOverallHealth(HealthStatus.INITIALIZING)
let restServerRes = startRestServerEsentials(nodeHealthMonitor, conf)
if restServerRes.isErr():
error "Starting REST server failed.", error = $restServerRes.error()
quit(QuitFailure)
var wakunode2 = App.init(conf).valueOr:
error "App initialization failed", error = error
quit(QuitFailure)
nodeHealthMonitor.setNode(wakunode2.node)
wakunode2.startApp().isOkOr:
error "Starting app failed", error = error
quit(QuitFailure)
if conf.rest and not restServerRes.isErr():
wakunode2.restServer = restServerRes.value
wakunode2.setupMonitoringAndExternalInterfaces().isOkOr:
error "Starting monitoring and external interfaces failed", error = error
quit(QuitFailure)
nodeHealthMonitor.setOverallHealth(HealthStatus.READY)
debug "Setting up shutdown hooks"
## Setup shutdown hooks for this process.
## Stop node gracefully on shutdown.
proc asyncStopper(node: App) {.async: (raises: [Exception]).} =
nodeHealthMonitor.setOverallHealth(HealthStatus.SHUTTING_DOWN)
await node.stop()
quit(QuitSuccess)

55
scripts/chkhealth.sh Executable file
View File

@ -0,0 +1,55 @@
#!/usr/bin/env bash
# optional argument to specgify the ip address
ip_address=$1
plain_text_out=false
# Parse command line arguments
POSITIONAL_ARGS=()
while [[ $# -gt 0 ]]; do
case $1 in
-p|--plain)
plain_text_out=true
shift # past argument
;;
-*|--*)
echo "Unknown option $1"
exit 1
;;
*)
POSITIONAL_ARGS+=("$1") # save positional arg
shift # past argument
;;
esac
done
set -- "${POSITIONAL_ARGS[@]}" # restore positional parameters
# Check if an IP address is provided as an argument
if [[ -n "$1" ]]; then
ip_address="$1"
else
ip_address="localhost:8645"
fi
# check if curl is available
if ! command -v curl &> /dev/null
then
echo "curl could not be found"
exit 1
fi
response=$(curl -s GET http://${ip_address}/health)
if [[ -z "${response}" ]]; then
echo -e "$(date +'%H:%M:%S')\tnode health status is: unknown\n"
exit 1
fi
if ! command -v jq &> /dev/null || [[ "$plain_text_out" = true ]]; then
echo -e "$(date +'%H:%M:%S')\tnode health status is: ${response}\n"
else
echo -e "$(date +'%H:%M:%S')\tnode health status is:\n"
echo "${response}" | jq .
fi

View File

@ -77,7 +77,8 @@ import
./wakunode_rest/test_rest_filter,
./wakunode_rest/test_rest_lightpush,
./wakunode_rest/test_rest_admin,
./wakunode_rest/test_rest_cors
./wakunode_rest/test_rest_cors,
./wakunode_rest/test_rest_health
import ./waku_rln_relay/test_all

View File

@ -19,6 +19,7 @@ import
../../waku/waku_api/rest/health/handlers as health_api,
../../waku/waku_api/rest/health/client as health_api_client,
../../waku/waku_rln_relay,
../../waku/node/health_monitor,
../testlib/common,
../testlib/testutils,
../testlib/wakucore,
@ -35,17 +36,20 @@ proc testWakuNode(): WakuNode =
suite "Waku v2 REST API - health":
# TODO: better test for health
xasyncTest "Get node health info - GET /health":
asyncTest "Get node health info - GET /health":
# Given
let node = testWakuNode()
let healthMonitor = WakuNodeHealthMonitor()
await node.start()
await node.mountRelay()
healthMonitor.setOverallHealth(HealthStatus.INITIALIZING)
let restPort = Port(58001)
let restAddress = parseIpAddress("0.0.0.0")
let restServer = WakuRestServerRef.init(restAddress, restPort).tryGet()
installHealthApiHandler(restServer.router, node)
installHealthApiHandler(restServer.router, healthMonitor)
restServer.start()
let client = newRestHttpClient(initTAddress(restAddress, restPort))
@ -54,9 +58,10 @@ suite "Waku v2 REST API - health":
# Then
check:
response.status == 503
$response.contentType == $MIMETYPE_TEXT
response.data == "Node is not ready"
response.status == 200
$response.contentType == $MIMETYPE_JSON
response.data ==
HealthReport(nodeHealth: HealthStatus.INITIALIZING, protocolsHealth: @[])
# now kick in rln (currently the only check for health)
await node.mountRlnRelay(
@ -67,15 +72,19 @@ suite "Waku v2 REST API - health":
rlnRelayTreePath: genTempPath("rln_tree", "wakunode"),
)
)
healthMonitor.setNode(node)
healthMonitor.setOverallHealth(HealthStatus.READY)
# When
response = await client.healthCheck()
# Then
check:
response.status == 200
$response.contentType == $MIMETYPE_TEXT
response.data == "Node is healthy"
$response.contentType == $MIMETYPE_JSON
response.data.nodeHealth == HealthStatus.READY
response.data.protocolsHealth.len() == 1
response.data.protocolsHealth[0].protocol == "Rln Relay"
response.data.protocolsHealth[0].health == HealthStatus.READY
await restServer.stop()
await restServer.closeWait()

View File

@ -19,16 +19,11 @@ import
metrics,
metrics/chronos_httpserver
import
../../waku/common/utils/nat,
../../waku/common/utils/parse_size_units,
../../waku/common/databases/db_sqlite,
../../waku/waku_archive/driver/builder,
../../waku/waku_archive/retention_policy/builder,
../../waku/waku_core,
../../waku/waku_node,
../../waku/node/waku_metrics,
../../waku/node/peer_manager,
../../waku/node/peer_manager/peer_store/waku_peer_storage,
../../waku/node/health_monitor,
../../waku/waku_api/message_cache,
../../waku/waku_api/handlers,
../../waku/waku_api/rest/server,
@ -43,10 +38,8 @@ import
../../waku/discovery/waku_dnsdisc,
../../waku/discovery/waku_discv5,
../../waku/waku_enr/sharding,
../../waku/waku_peer_exchange,
../../waku/waku_rln_relay,
../../waku/waku_store,
../../waku/waku_lightpush/common,
../../waku/waku_filter_v2,
../../waku/factory/node_factory,
../../waku/factory/internal_config,
@ -70,7 +63,7 @@ type
node: WakuNode
restServer: Option[WakuRestServerRef]
restServer*: Option[WakuRestServerRef]
metricsServer: Option[MetricsHttpServerRef]
AppResult*[T] = Result[T, string]
@ -155,7 +148,7 @@ proc init*(T: type App, conf: WakuNodeConf): Result[App, string] =
## Setup DiscoveryV5
proc setupDiscoveryV5(app: App): WakuDiscoveryV5 =
proc setupDiscoveryV5*(app: App): WakuDiscoveryV5 =
let dynamicBootstrapEnrs =
app.dynamicBootstrapNodes.filterIt(it.hasUdpPort()).mapIt(it.enr.get())
@ -281,12 +274,19 @@ proc startApp*(app: var App): AppResult[void] =
## Monitoring and external interfaces
proc startRestServer(
app: App, address: IpAddress, port: Port, conf: WakuNodeConf
): AppResult[WakuRestServerRef] =
# Used to register api endpoints that are not currently installed as keys,
# values are holding error messages to be returned to the client
var notInstalledTab: Table[string, string] = initTable[string, string]()
# Used to register api endpoints that are not currently installed as keys,
# values are holding error messages to be returned to the client
# NOTE: {.threadvar.} is used to make the global variable GC safe for the closure uses it
# It will always be called from main thread anyway.
# Ref: https://nim-lang.org/docs/manual.html#threads-gc-safety
var restServerNotInstalledTab {.threadvar.}: TableRef[string, string]
restServerNotInstalledTab = newTable[string, string]()
proc startRestServerEsentials*(
nodeHealthMonitor: WakuNodeHealthMonitor, conf: WakuNodeConf
): AppResult[Option[WakuRestServerRef]] =
if not conf.rest:
return ok(none(WakuRestServerRef))
let requestErrorHandler: RestRequestErrorHandler = proc(
error: RestRequestError, request: HttpRequestRef
@ -302,7 +302,7 @@ proc startRestServer(
paths[1]
else:
""
notInstalledTab.withValue(rootPath, errMsg):
restServerNotInstalledTab[].withValue(rootPath, errMsg):
return await request.respond(Http404, errMsg[], HttpTable.init())
do:
return await request.respond(
@ -328,6 +328,8 @@ proc startRestServer(
else:
none(string)
let address = conf.restAddress
let port = Port(conf.restPort + conf.portsShift)
let server =
?newRestHttpServer(
address,
@ -336,37 +338,64 @@ proc startRestServer(
requestErrorHandler = requestErrorHandler,
)
## Health REST API
installHealthApiHandler(server.router, nodeHealthMonitor)
restServerNotInstalledTab["admin"] =
"/admin endpoints are not available while initializing."
restServerNotInstalledTab["debug"] =
"/debug endpoints are not available while initializing."
restServerNotInstalledTab["relay"] =
"/relay endpoints are not available while initializing."
restServerNotInstalledTab["filter"] =
"/filter endpoints are not available while initializing."
restServerNotInstalledTab["lightpush"] =
"/lightpush endpoints are not available while initializing."
restServerNotInstalledTab["store"] =
"/store endpoints are not available while initializing."
server.start()
info "Starting REST HTTP server", url = "http://" & $address & ":" & $port & "/"
ok(some(server))
proc startRestServerProtocolSupport(app: var App): AppResult[void] =
if not app.conf.rest or app.restServer.isNone():
## Maybe we don't need rest server at all, so it is ok.
return ok()
var router = app.restServer.get().router
## Admin REST API
if conf.restAdmin:
installAdminApiHandlers(server.router, app.node)
if app.conf.restAdmin:
installAdminApiHandlers(router, app.node)
else:
restServerNotInstalledTab["admin"] =
"/admin endpoints are not available. Please check your configuration: --rest-admin=true"
## Debug REST API
installDebugApiHandlers(server.router, app.node)
## Health REST API
installHealthApiHandler(server.router, app.node)
installDebugApiHandlers(router, app.node)
## Relay REST API
if conf.relay:
let cache = MessageCache.init(int(conf.restRelayCacheCapacity))
if app.conf.relay:
let cache = MessageCache.init(int(app.conf.restRelayCacheCapacity))
let handler = messageCacheHandler(cache)
for pubsubTopic in conf.pubsubTopics:
for pubsubTopic in app.conf.pubsubTopics:
cache.pubsubSubscribe(pubsubTopic)
app.node.subscribe((kind: PubsubSub, topic: pubsubTopic), some(handler))
for contentTopic in conf.contentTopics:
for contentTopic in app.conf.contentTopics:
cache.contentSubscribe(contentTopic)
app.node.subscribe((kind: ContentSub, topic: contentTopic), some(handler))
installRelayApiHandlers(server.router, app.node, cache)
installRelayApiHandlers(router, app.node, cache)
else:
notInstalledTab["relay"] =
restServerNotInstalledTab["relay"] =
"/relay endpoints are not available. Please check your configuration: --relay"
## Filter REST API
if conf.filternode != "" and app.node.wakuFilterClient != nil:
if app.conf.filternode != "" and app.node.wakuFilterClient != nil:
let filterCache = MessageCache.init()
let filterDiscoHandler =
@ -376,10 +405,10 @@ proc startRestServer(
none(DiscoveryHandler)
rest_filter_api.installFilterRestApiHandlers(
server.router, app.node, filterCache, filterDiscoHandler
router, app.node, filterCache, filterDiscoHandler
)
else:
notInstalledTab["filter"] =
restServerNotInstalledTab["filter"] =
"/filter endpoints are not available. Please check your configuration: --filternode"
## Store REST API
@ -389,10 +418,10 @@ proc startRestServer(
else:
none(DiscoveryHandler)
installStoreApiHandlers(server.router, app.node, storeDiscoHandler)
installStoreApiHandlers(router, app.node, storeDiscoHandler)
## Light push API
if conf.lightpushnode != "" and app.node.wakuLightpushClient != nil:
if app.conf.lightpushnode != "" and app.node.wakuLightpushClient != nil:
let lightDiscoHandler =
if app.wakuDiscv5.isSome():
some(defaultDiscoveryHandler(app.wakuDiscv5.get(), Lightpush))
@ -400,16 +429,14 @@ proc startRestServer(
none(DiscoveryHandler)
rest_lightpush_api.installLightPushRequestHandler(
server.router, app.node, lightDiscoHandler
router, app.node, lightDiscoHandler
)
else:
notInstalledTab["lightpush"] =
restServerNotInstalledTab["lightpush"] =
"/lightpush endpoints are not available. Please check your configuration: --lightpushnode"
server.start()
info "Starting REST HTTP server", url = "http://" & $address & ":" & $port & "/"
ok(server)
info "REST services are installed"
ok()
proc startMetricsServer(
serverIp: IpAddress, serverPort: Port
@ -434,15 +461,11 @@ proc startMetricsLogging(): AppResult[void] =
ok()
proc setupMonitoringAndExternalInterfaces*(app: var App): AppResult[void] =
if app.conf.rest:
let startRestServerRes = startRestServer(
app, app.conf.restAddress, Port(app.conf.restPort + app.conf.portsShift), app.conf
)
if startRestServerRes.isErr():
error "Starting REST server failed. Continuing in current state.",
error = startRestServerRes.error
else:
app.restServer = some(startRestServerRes.value)
if app.conf.rest and app.restServer.isSome():
let restProtocolSupportRes = startRestServerProtocolSupport(app)
if restProtocolSupportRes.isErr():
error "Starting REST server protocol support failed. Continuing in current state.",
error = restProtocolSupportRes.error
if app.conf.metricsServer:
let startMetricsServerRes = startMetricsServer(

View File

@ -0,0 +1,90 @@
when (NimMajor, NimMinor) < (1, 4):
{.push raises: [Defect].}
else:
{.push raises: [].}
import std/[options], chronos
import waku_node, ../waku_rln_relay
type
HealthStatus* = enum
INITIALIZING
SYNCHRONIZING
READY
NOT_READY
NOT_MOUNTED
SHUTTING_DOWN
ProtocolHealth* = object
protocol*: string
health*: HealthStatus
HealthReport* = object
nodeHealth*: HealthStatus
protocolsHealth*: seq[ProtocolHealth]
WakuNodeHealthMonitor* = ref object
nodeHealth: HealthStatus
node: Option[WakuNode]
proc `$`*(t: HealthStatus): string =
result =
case t
of INITIALIZING: "Initializing"
of SYNCHRONIZING: "Synchronizing"
of READY: "Ready"
of NOT_READY: "Not Ready"
of NOT_MOUNTED: "Not Mounted"
of SHUTTING_DOWN: "Shutting Down"
proc init*(
t: typedesc[HealthStatus], strRep: string
): HealthStatus {.raises: [ValueError].} =
case strRep
of "Initializing":
return HealthStatus.INITIALIZING
of "Synchronizing":
return HealthStatus.SYNCHRONIZING
of "Ready":
return HealthStatus.READY
of "Not Ready":
return HealthStatus.NOT_READY
of "Not Mounted":
return HealthStatus.NOT_MOUNTED
of "Shutting Down":
return HealthStatus.SHUTTING_DOWN
else:
raise newException(ValueError, "Invalid HealthStatus string representation")
const FutIsReadyTimout = 5.seconds
proc getNodeHealthReport*(hm: WakuNodeHealthMonitor): Future[HealthReport] {.async.} =
result.nodeHealth = hm.nodeHealth
if hm.node.isSome() and hm.node.get().wakuRlnRelay != nil:
let getRlnRelayHealth = proc(): Future[HealthStatus] {.async.} =
let isReadyStateFut = hm.node.get().wakuRlnRelay.isReady()
if not await isReadyStateFut.withTimeout(FutIsReadyTimout):
return HealthStatus.NOT_READY
try:
if not isReadyStateFut.completed():
return HealthStatus.NOT_READY
elif isReadyStateFut.read():
return HealthStatus.READY
return HealthStatus.SYNCHRONIZING
except:
error "exception reading state: " & getCurrentExceptionMsg()
return HealthStatus.NOT_READY
result.protocolsHealth.add(
ProtocolHealth(protocol: "Rln Relay", health: await getRlnRelayHealth())
)
proc setNode*(hm: WakuNodeHealthMonitor, node: WakuNode) =
hm.node = some(node)
proc setOverallHealth*(hm: WakuNodeHealthMonitor, health: HealthStatus) =
hm.nodeHealth = health

View File

@ -5,11 +5,11 @@ else:
import
chronicles, json_serialization, json_serialization/std/options, presto/[route, client]
import ../serdes, ../responses, ../rest_serdes
import ./types, ../serdes, ../responses, ../rest_serdes, ../../waku/node/health_monitor
logScope:
topics = "waku node rest health_api"
proc healthCheck*(): RestResponse[string] {.
proc healthCheck*(): RestResponse[HealthReport] {.
rest, endpoint: "/health", meth: HttpMethod.MethodGet
.}

View File

@ -4,33 +4,36 @@ else:
{.push raises: [].}
import chronicles, json_serialization, presto/route
import ../../../waku_node, ../responses, ../serdes
import ../../../waku_node, ../responses, ../serdes, ./types
logScope:
topics = "waku node rest health_api"
const ROUTE_HEALTH* = "/health"
const FutIsReadyTimout = 5.seconds
proc installHealthApiHandler*(router: var RestRouter, node: WakuNode) =
## /health endpoint provides information about node readiness to caller.
## Currently it is restricted to checking RLN (if mounted) proper setup
## TODO: Leter to extend it to a broader information about each subsystem state
## report. Rest response to change to JSON structure that can hold exact detailed
## information.
const FutHealthReportTimeout = 5.seconds
proc installHealthApiHandler*(
router: var RestRouter, nodeHealthMonitor: WakuNodeHealthMonitor
) =
router.api(MethodGet, ROUTE_HEALTH) do() -> RestApiResponse:
let isReadyStateFut = node.isReady()
if not await isReadyStateFut.withTimeout(FutIsReadyTimout):
let healthReportFut = nodeHealthMonitor.getNodeHealthReport()
if not await healthReportFut.withTimeout(FutHealthReportTimeout):
return RestApiResponse.internalServerError("Health check timed out")
var msg = "Node is healthy"
var msg = ""
var status = Http200
try:
if not isReadyStateFut.read():
msg = "Node is not ready"
if healthReportFut.completed():
let healthReport = healthReportFut.read()
return RestApiResponse.jsonResponse(healthReport, Http200).valueOr:
debug "An error ocurred while building the json healthReport response",
error = error
return
RestApiResponse.internalServerError("Failed to serialize health report")
else:
msg = "Health check failed"
status = Http503
except:
msg = "exception reading state: " & getCurrentExceptionMsg()

View File

@ -0,0 +1,82 @@
when (NimMajor, NimMinor) < (1, 4):
{.push raises: [Defect].}
else:
{.push raises: [].}
import
chronicles,
json_serialization,
json_serialization/std/options,
std/tables,
std/sequtils
import ../../../waku_node, ../serdes
#### Serialization and deserialization
proc writeValue*(
writer: var JsonWriter[RestJson], value: ProtocolHealth
) {.raises: [IOError].} =
writer.beginRecord()
writer.writeField(value.protocol, $value.health)
writer.endRecord()
proc readValue*(
reader: var JsonReader[RestJson], value: var ProtocolHealth
) {.gcsafe, raises: [SerializationError, IOError].} =
var health: HealthStatus
var fieldCount = 0
for fieldName in readObjectFields(reader):
if fieldCount > 0:
reader.raiseUnexpectedField("Too many fields", "ProtocolHealth")
fieldCount += 1
let fieldValue = reader.readValue(string)
try:
health = HealthStatus.init(fieldValue)
except ValueError:
reader.raiseUnexpectedValue("Invalid `health` value")
value = ProtocolHealth(protocol: fieldName, health: health)
proc writeValue*(
writer: var JsonWriter[RestJson], value: HealthReport
) {.raises: [IOError].} =
writer.beginRecord()
writer.writeField("nodeHealth", $value.nodeHealth)
writer.writeField("protocolsHealth", value.protocolsHealth)
writer.endRecord()
proc readValue*(
reader: var JsonReader[RestJson], value: var HealthReport
) {.raises: [SerializationError, IOError].} =
var
nodeHealth: Option[HealthStatus]
protocolsHealth: Option[seq[ProtocolHealth]]
for fieldName in readObjectFields(reader):
case fieldName
of "nodeHealth":
if nodeHealth.isSome():
reader.raiseUnexpectedField(
"Multiple `nodeHealth` fields found", "HealthReport"
)
try:
nodeHealth = some(HealthStatus.init(reader.readValue(string)))
except ValueError:
reader.raiseUnexpectedValue("Invalid `health` value")
of "protocolsHealth":
if protocolsHealth.isSome():
reader.raiseUnexpectedField(
"Multiple `protocolsHealth` fields found", "HealthReport"
)
protocolsHealth = some(reader.readValue(seq[ProtocolHealth]))
else:
unrecognizedFieldWarning()
if nodeHealth.isNone():
reader.raiseUnexpectedValue("Field `nodeHealth` is missing")
value =
HealthReport(nodeHealth: nodeHealth.get, protocolsHealth: protocolsHealth.get(@[]))

View File

@ -1,3 +1,7 @@
import ./node/config, ./node/waku_switch as switch, ./node/waku_node as node
import
./node/config,
./node/waku_switch as switch,
./node/waku_node as node,
./node/health_monitor as health_monitor
export config, switch, node
export config, switch, node, health_monitor