feat(monitoring): ping bootstrap nodes daily and open issue if some are dead

Creates a check_spr binary that reads an SPR and reports whether that SPR is dialable (for TCP) or pingable (for UDP). The check_spr binary accepts the following arguments and options:
Usage:
  check_spr [options]
  check_spr <spr-uri> [--timeout <secs>]

Options:
  --source <file>    Config file to read SPRs from (default: network_presets.json).
  --network <name>   Only probe the preset with this network name.
  --timeout <secs>   Per-node probe timeout in seconds (default: 10).
  --format <fmt>     Output format: "text" (default) or "json". "text" is the
                     human-readable table; "json" is a pretty-printed summary.
  --out <file>       Write the output to <file> instead of stdout. The content
                     is whichever --format is selected (text or json).
  --help, -h         Show this help and exit.

Arguments:
  <spr-uri>          A single "spr:" URI to probe instead of reading the config
                     file. Prints ALIVE/DEAD and exits with the matching status;
                     this mode ignores --format and --out.
This commit is contained in:
E M 2026-06-19 20:09:55 +10:00
parent acdc0fc325
commit f538083894
No known key found for this signature in database
6 changed files with 574 additions and 58 deletions

View File

@ -0,0 +1,102 @@
name: Bootstrap nodes health check
# Scheduled liveness check for the preset bootstrap nodes. Runs on a
# GitHub-hosted runner (public internet) so nodes advertising private/cloud
# internal IPs are correctly seen as unreachable. On any unreachable node it
# fails the job and opens/updates a tracking issue labelled `bootstrap-health`.
on:
  schedule:
    - cron: "0 6 * * *" # daily 06:00 UTC
  workflow_dispatch:
env:
  nim_version: v2.2.10
concurrency:
  group: ${{ github.workflow }}
  cancel-in-progress: false
permissions:
  contents: read
  issues: write
jobs:
  ping:
    name: Ping preset bootstrap nodes
    runs-on: ubuntu-latest
    steps:
      - name: Checkout sources
        uses: actions/checkout@v6
        with:
          submodules: recursive
      - name: Setup Nimbus Build System
        uses: ./.github/actions/nimbus-build-system
        with:
          os: linux
          nim_version: ${{ env.nim_version }}
      # check_spr exits non-zero when a node is unreachable; keep going so the
      # report and tracking-issue steps still run. The final step fails the job.
      - name: Ping bootstrap nodes
        id: ping
        continue-on-error: true
        run: make CI=true bootstrapHealthCheck
        shell: bash
      - name: Build report
        id: report
        if: always()
        run: |
          json=build/bootstrap-health-report.json
          if [ ! -f "$json" ]; then
            echo "no_output=true" >> "$GITHUB_OUTPUT"
            echo "::error::check_spr produced no output file"
            exit 0
          fi
          dead=$(jq '[.[] | select(.alive==false)] | length' "$json")
          total=$(jq 'length' "$json")
          echo "dead=$dead" >> "$GITHUB_OUTPUT"
          {
            echo "## Bootstrap node liveness ($((total - dead))/$total reachable)"
            echo
            echo "| Network | Result | Address | Reason |"
            echo "|---|---|---|---|"
            # Free-text fields (notably .reason, which carries exception
            # messages) may contain pipes or newlines; flatten and escape them
            # so every node stays on exactly one table row.
            jq -r '
              def cell: tostring | gsub("[\r\n]+"; " ") | gsub("\\|"; "\\|");
              .[] | "| \(.network | cell) | \(if .alive then "✅ ALIVE" else "❌ DEAD" end) | \(.address | cell) | \(.reason | cell) |"
            ' "$json"
          } | tee report.md >> "$GITHUB_STEP_SUMMARY"
        shell: bash
      - name: Open or update tracking issue
        if: always() && steps.report.outputs.dead != '0' && steps.report.outputs.dead != ''
        env:
          GH_TOKEN: ${{ github.token }}
          DEAD: ${{ steps.report.outputs.dead }}
          RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
        run: |
          # Creating the label is best-effort: it fails harmlessly when the
          # label already exists.
          gh label create bootstrap-health --color B60205 \
            --description "Automated bootstrap-node liveness alerts" 2>/dev/null || true
          {
            echo "Scheduled bootstrap-node liveness check found **${DEAD}** unreachable node(s)."
            echo
            echo "Run: ${RUN_URL}"
            echo
            cat report.md
          } > issue-body.md
          # Reuse the open tracking issue when there is one, otherwise open it.
          existing=$(gh issue list --label bootstrap-health --state open --json number --jq '.[0].number')
          if [ -n "$existing" ]; then
            gh issue comment "$existing" --body-file issue-body.md
          else
            gh issue create --title "Bootstrap nodes unreachable" \
              --label bootstrap-health --body-file issue-body.md
          fi
        shell: bash
      - name: Fail if any node is unreachable
        if: always() && ((steps.report.outputs.dead != '0' && steps.report.outputs.dead != '') || steps.report.outputs.no_output == 'true')
        env:
          DEAD: ${{ steps.report.outputs.dead }}
          NO_OUTPUT: ${{ steps.report.outputs.no_output }}
        run: |
          # Distinguish "no report at all" from "report lists dead nodes" so the
          # log line never prints an empty node count.
          if [ "$NO_OUTPUT" = "true" ]; then
            echo "Bootstrap liveness check failed: check_spr produced no report."
          else
            echo "Bootstrap liveness check failed: ${DEAD} unreachable node(s)."
          fi
          exit 1
        shell: bash

View File

@ -152,6 +152,22 @@ testIntegration: | build deps
echo -e $(BUILD_MSG) "build/$@" && \
$(ENV_SCRIPT) nim testIntegration $(TEST_PARAMS) $(NIM_PARAMS) build.nims
# Extra Nim flags for the bootstrapHealthCheck task: the CI value is forwarded
# as a `ci` define when CI is set, otherwise nothing is added.
ifdef CI
BOOTSTRAP_HEALTH_CHECK_PARAMS := -d:ci=$(CI)
else
BOOTSTRAP_HEALTH_CHECK_PARAMS :=
endif

# Builds the check_spr binary through the checkSpr task in build.nims.
checkSpr: | build deps
	echo -e $(BUILD_MSG) "build/check_spr" && \
		$(ENV_SCRIPT) nim checkSpr $(NIM_PARAMS) build.nims

# Pings the preset bootstrap nodes and fails if any are unreachable.
# Must be run from OUTSIDE the fleet VPCs (e.g. on a GitHub-hosted runner);
# only then are nodes advertising private/cloud-internal IPs correctly seen as
# unreachable.
bootstrapHealthCheck: | build deps
	echo -e $(BUILD_MSG) "build/check_spr" && \
		$(ENV_SCRIPT) nim bootstrapHealthCheck $(NIM_PARAMS) $(BOOTSTRAP_HEALTH_CHECK_PARAMS) build.nims
# Builds a C example that uses the libstorage C library and runs it
testLibstorageC: | build deps
$(MAKE) $(if $(ncpu),-j$(ncpu),) libstorage

View File

@ -81,6 +81,26 @@ task mixTools, "build mix tools (mix_pool, mix_relay_dht)":
buildBinary "mix_relay_dht",
outName = "mix_relay_dht", srcDir = "tools/mix/", params = mixParams
task checkSpr, "build check_spr used for checking bootstrap node health":
  # Compiler flags for the checker: release build, chronicles runtime filtering
  # enabled, compile-time log level WARN.
  const checkSprParams =
    "-d:release -d:chronicles_runtime_filtering -d:chronicles_log_level=WARN"
  buildBinary("check_spr", srcDir = "tools/", params = checkSprParams)
task bootstrapHealthCheck, "ping preset bootstrap nodes; non-zero exit if any are unreachable":
  checkSprTask()
  # When make forwards a truthy `ci` define, have check_spr write its JSON
  # summary to a file before exiting, so the scheduled workflow can read it.
  # check_spr exits non-zero when a node is unreachable, failing the workflow
  # run.
  var args = ""
  for i in 2 ..< paramCount():
    # Split into at most key and value, and require both halves: a parameter
    # that merely contains "ci" but has no '=' must not index out of bounds.
    let parts = paramStr(i).split('=', maxsplit = 1)
    if parts.len == 2 and "ci" in parts[0] and truthy parts[1]:
      args = "--format json --out build/bootstrap-health-report.json"
      break
  exec "build/check_spr " & args
task testStorage, "Build & run Logos Storage tests":
  # Delegates to the shared `test` helper; the output binary keeps the task's name.
  test("testStorage", outName = "testStorage")

34
network_presets.json Normal file
View File

@ -0,0 +1,34 @@
{
"presets": [
{
"name": "logos.test",
"description": "Logos testnet",
"records": [
"spr:CiUIAhIhA6rD-Sa1mJqHOoYMk8yad7B4BYDEI_toNwb1z0cYIRu6EgIDARpJCicAJQgCEiEDqsP5JrWYmoc6hgyTzJp3sHgFgMQj-2g3BvXPRxghG7oQwoe70AYaCwoJBEDhQ42RAiOCGgsKCQRA4UONkQIjgipHMEUCIQCulmrBDKTTxL8uBQYtEfp3_n3qDZFbO8lZ8mfIWHrRBAIgNXpVlWD1VlXzbGuJ4t7u8b7ymm3AYwm6-KjUvH6NfKU",
"spr:CiUIAhIhAsFwlXD-3VpX-Pa3taM15wdL3DS75l_dpVCIFhdaIKYREgIDARpJCicAJQgCEiECwXCVcP7dWlf49re1ozXnB0vcNLvmX92lUIgWF1ogphEQ4Ym70AYaCwoJBI5d6vqRAiOCGgsKCQSOXer6kQIjgipGMEQCIGaofSX23DDUcWEMElHtlaFbLAsM0YgrMB4UwOIqPMb8AiBFRodJ_5-bkvoLuPo3K2nMGzKXZqXnII4poJhhopSo8A",
"spr:CiUIAhIhAwSYqf83tfZom9eGFFdXXea-dblO-I7-I8B1kjhfJeEAEgIDARpJCicAJQgCEiEDBJip_ze19mib14YUV1dd5r51uU74jv4jwHWSOF8l4QAQjoi70AYaCwoJBAqAABKRAiOCGgsKCQQKgAASkQIjgipHMEUCIQDEtfOFABgYosMflQ-d_v-qkc5FhwSwd_PTcA414MBYAAIgGbhCTAwbSJr5boiARoVFZ-XrBhfFBc_J5Kk5drdQoTE",
"spr:CiUIAhIhA4T8XrxB6PKor8f7j7eqKxgIXH6mMST0_Uel5hZjSDp2EgIDARpJCicAJQgCEiEDhPxevEHo8qivx_uPt6orGAhcfqYxJPT9R6XmFmNIOnYQrYq70AYaCwoJBAqAAFuRAiOCGgsKCQQKgABbkQIjgipGMEQCIHA1l1NTOh06ca9seLlmAtPsTiNJo9Re0s51WakQTTf1AiAFJkhsi2Qv0fq8hY3AWlibqhhh_WiI3q6QabPGVXzuAA",
"spr:CiUIAhIhAqk6NgpRxbKvI02Up24XP3U-dD3TdKRurXpW-ak3Zvh-EgIDARpJCicAJQgCEiECqTo2ClHFsq8jTZSnbhc_dT50PdN0pG6telb5qTdm-H4Q3oa70AYaCwoJBKwf79KRAiOCGgsKCQSsH-_SkQIjgipHMEUCIQCRubKOjNcLZEJut0Ts6wy_BEij4z-1WO6WiOVzT0svfQIgWKOBWVoopNC7zk1byUJMpNMOi05cKVsLoCBkW3RC9-Q",
"spr:CiUIAhIhAvSGKPkE3mD7MP-ZCWS5AEvzcDNVsM6XFYeCBXNja7h2EgIDARpJCicAJQgCEiEC9IYo-QTeYPsw_5kJZLkAS_NwM1WwzpcVh4IFc2NruHYQ_Yi70AYaCwoJBKwf79ORAiOCGgsKCQSsH-_TkQIjgipHMEUCIQDJuV1B1sDyyxkNs8g3ahZ13GN9r7PEBP7xY4xGlm5n0AIgdyY2JEOyZ1FMdOzN6aZbAWo83AyjCrR-n0sietE1624"
]
},
{
"name": "logos.dev",
"description": "Logos devnet",
"records": [
"spr:CiUIAhIhAwfZDeTtWNlSgRbZlZfvxLI5Bpy0lFEYN7gImS3oHNaSEgIDARpJCicAJQgCEiEDB9kN5O1Y2VKBFtmVl-_EsjkGnLSUURg3uAiZLegc1pIQ__O20AYaCwoJBBiQTsiRAiOCGgsKCQQYkE7IkQIjgipHMEUCIQCIZx-HlVsLXJLhD6SEVx6Zt_1aG9IqMq-Luvz8No_J0wIgc8I9PRtheG4s5tzHjkEJMLcq3Jf09IT_FGkzPcJm8h4",
"spr:CiUIAhIhA8d4LjRirtXO1M-JEmbhVA0CQeA7hHNR9BA7DvFsPKTEEgIDARpJCicAJQgCEiEDx3guNGKu1c7Uz4kSZuFUDQJB4DuEc1H0EDsO8Ww8pMQQhPW20AYaCwoJBCIq5juRAiOCGgsKCQQiKuY7kQIjgipGMEQCIHV_8nJ0iedWjlAxUhBmdAbDPLu5g2RmcnmJBD8cbD98AiAp1w9nAJgLlPIr41aMcdkds_eSoh8ImOVKvq6Idx-Ugg",
"spr:CiUIAhIhA_MocWwn1_t__FEONMqYluUjc9ZVkcvYRLo6C0GzTkbfEgIDARpJCicAJQgCEiED8yhxbCfX-3_8UQ40ypiW5SNz1lWRy9hEujoLQbNORt8QlfO20AYaCwoJBC_u5W-RAiOCGgsKCQQv7uVvkQIjgipGMEQCIHMpQO31gg4FoKYtDyTTQS8xFz1KEmfqH385EeMUNbhPAiBblCkmOfQBmXj6eryaSiXWsftgohE-SPbKwsASZ1Zs3Q"
]
},
{
"name": "codex.dev",
"description": "Codex legacy devnet (deprecated)",
"records": [
"spr:CiUIAhIhA-VlcoiRm02KyIzrcTP-ljFpzTljfBRRKTIvhMIwqBqWEgIDARpJCicAJQgCEiED5WVyiJGbTYrIjOtxM_6WMWnNOWN8FFEpMi-EwjCoGpYQs8n8wQYaCwoJBHTKubmRAnU6GgsKCQR0yrm5kQJ1OipHMEUCIQDwUNsfReB4ty7JFS5WVQ6n1fcko89qVAOfQEHixa03rgIgan2-uFNDT-r4s9TOkLe9YBkCbsRWYCHGGVJ25rLj0QE",
"spr:CiUIAhIhApIj9p6zJDRbw2NoCo-tj98Y760YbppRiEpGIE1yGaMzEgIDARpJCicAJQgCEiECkiP2nrMkNFvDY2gKj62P3xjvrRhumlGISkYgTXIZozMQvcz8wQYaCwoJBAWhF3WRAnVEGgsKCQQFoRd1kQJ1RCpGMEQCIFZB84O_nzPNuViqEGRL1vJTjHBJ-i5ZDgFL5XZxm4HAAiB8rbLHkUdFfWdiOmlencYVn0noSMRHzn4lJYoShuVzlw",
"spr:CiUIAhIhApqRgeWRPSXocTS9RFkQmwTZRG-Cdt7UR2N7POoz606ZEgIDARpJCicAJQgCEiECmpGB5ZE9JehxNL1EWRCbBNlEb4J23tRHY3s86jPrTpkQj8_8wQYaCwoJBAXfEfiRAnVOGgsKCQQF3xH4kQJ1TipGMEQCIGWJMsF57N1iIEQgTH7IrVOgEgv0J2P2v3jvQr5Cjy-RAiAy4aiZ8QtyDvCfl_K_w6SyZ9csFGkRNTpirq_M_QNgKw"
]
}
]
}

View File

@ -4,6 +4,7 @@
# devnet and latest testnet, respectively.
import std/options
import std/strutils
import std/json
import pkg/chronicles
import pkg/codexdht/discv5/protocol
@ -31,7 +32,7 @@ proc init*(
func `$`*(preset: NetworkPreset): string =
  ## Renders a preset as `[<name>]: <description>`.
  "[" & preset.name & "]: " & preset.description
func `$`*[N](presets: array[N, NetworkPreset]): string =
func describePresets(presets: openArray[NetworkPreset]): string =
result = ""
for preset in presets:
result &= $preset & "; "
@ -52,62 +53,32 @@ proc `bootstrapNodes`*(self: NetworkPreset): seq[SignedPeerRecord] =
# it should crash the node.
result.add(parse(SignedPeerRecord, record).tryGet())
const NetworkPresets* = [
NetworkPreset.init(
"logos.test",
"Logos testnet",
@[
"spr:CiUIAhIhA6rD-Sa1mJqHOoYMk8yad7B4BYDEI_toNwb1z0cYIRu6EgIDARpJCicAJQgCEiEDqsP5JrWYmoc6hgyT" &
"zJp3sHgFgMQj-2g3BvXPRxghG7oQwoe70AYaCwoJBEDhQ42RAiOCGgsKCQRA4UONkQIjgipHMEUCIQCulmrBDKTTxL" &
"8uBQYtEfp3_n3qDZFbO8lZ8mfIWHrRBAIgNXpVlWD1VlXzbGuJ4t7u8b7ymm3AYwm6-KjUvH6NfKU",
"spr:CiUIAhIhAsFwlXD-3VpX-Pa3taM15wdL3DS75l_dpVCIFhdaIKYREgIDARpJCicAJQgCEiECwXCVcP7dWlf49re1" &
"ozXnB0vcNLvmX92lUIgWF1ogphEQ4Ym70AYaCwoJBI5d6vqRAiOCGgsKCQSOXer6kQIjgipGMEQCIGaofSX23DDUcW" &
"EMElHtlaFbLAsM0YgrMB4UwOIqPMb8AiBFRodJ_5-bkvoLuPo3K2nMGzKXZqXnII4poJhhopSo8A",
"spr:CiUIAhIhAwSYqf83tfZom9eGFFdXXea-dblO-I7-I8B1kjhfJeEAEgIDARpJCicAJQgCEiEDBJip_ze19mib14YU" &
"V1dd5r51uU74jv4jwHWSOF8l4QAQjoi70AYaCwoJBAqAABKRAiOCGgsKCQQKgAASkQIjgipHMEUCIQDEtfOFABgYo" &
"sMflQ-d_v-qkc5FhwSwd_PTcA414MBYAAIgGbhCTAwbSJr5boiARoVFZ-XrBhfFBc_J5Kk5drdQoTE",
"spr:CiUIAhIhA4T8XrxB6PKor8f7j7eqKxgIXH6mMST0_Uel5hZjSDp2EgIDARpJCicAJQgCEiEDhPxevEHo8qivx_uP" &
"t6orGAhcfqYxJPT9R6XmFmNIOnYQrYq70AYaCwoJBAqAAFuRAiOCGgsKCQQKgABbkQIjgipGMEQCIHA1l1NTOh06ca9s" &
"eLlmAtPsTiNJo9Re0s51WakQTTf1AiAFJkhsi2Qv0fq8hY3AWlibqhhh_WiI3q6QabPGVXzuAA",
"spr:CiUIAhIhAqk6NgpRxbKvI02Up24XP3U-dD3TdKRurXpW-ak3Zvh-EgIDARpJCicAJQgCEiECqTo2ClHFsq8jTZSn" &
"bhc_dT50PdN0pG6telb5qTdm-H4Q3oa70AYaCwoJBKwf79KRAiOCGgsKCQSsH-_SkQIjgipHMEUCIQCRubKOjNcLZEJu" &
"t0Ts6wy_BEij4z-1WO6WiOVzT0svfQIgWKOBWVoopNC7zk1byUJMpNMOi05cKVsLoCBkW3RC9-Q",
"spr:CiUIAhIhAvSGKPkE3mD7MP-ZCWS5AEvzcDNVsM6XFYeCBXNja7h2EgIDARpJCicAJQgCEiEC9IYo-QTeYPsw_5kJ" &
"ZLkAS_NwM1WwzpcVh4IFc2NruHYQ_Yi70AYaCwoJBKwf79ORAiOCGgsKCQSsH-_TkQIjgipHMEUCIQDJuV1B1sDyyx" &
"kNs8g3ahZ13GN9r7PEBP7xY4xGlm5n0AIgdyY2JEOyZ1FMdOzN6aZbAWo83AyjCrR-n0sietE1624",
],
),
NetworkPreset.init(
"logos.dev",
"Logos devnet",
@[
"spr:CiUIAhIhAwfZDeTtWNlSgRbZlZfvxLI5Bpy0lFEYN7gImS3oHNaSEgIDARpJCicAJQgCEiEDB9kN5O1Y2VKBFtmVl-" &
"_EsjkGnLSUURg3uAiZLegc1pIQ__O20AYaCwoJBBiQTsiRAiOCGgsKCQQYkE7IkQIjgipHMEUCIQCIZx-HlVsLXJLhD6SEV" &
"x6Zt_1aG9IqMq-Luvz8No_J0wIgc8I9PRtheG4s5tzHjkEJMLcq3Jf09IT_FGkzPcJm8h4",
"spr:CiUIAhIhA8d4LjRirtXO1M-JEmbhVA0CQeA7hHNR9BA7DvFsPKTEEgIDARpJCicAJQgCEiEDx3guNGKu1c7Uz4kSZu" &
"FUDQJB4DuEc1H0EDsO8Ww8pMQQhPW20AYaCwoJBCIq5juRAiOCGgsKCQQiKuY7kQIjgipGMEQCIHV_8nJ0iedWjlAxUhBm" &
"dAbDPLu5g2RmcnmJBD8cbD98AiAp1w9nAJgLlPIr41aMcdkds_eSoh8ImOVKvq6Idx-Ugg",
"spr:CiUIAhIhA_MocWwn1_t__FEONMqYluUjc9ZVkcvYRLo6C0GzTkbfEgIDARpJCicAJQgCEiED8yhxbCfX-3_8UQ40yp" &
"iW5SNz1lWRy9hEujoLQbNORt8QlfO20AYaCwoJBC_u5W-RAiOCGgsKCQQv7uVvkQIjgipGMEQCIHMpQO31gg4FoKYtDyTT" &
"QS8xFz1KEmfqH385EeMUNbhPAiBblCkmOfQBmXj6eryaSiXWsftgohE-SPbKwsASZ1Zs3Q",
],
),
NetworkPreset.init(
"codex.dev",
"Codex legacy devnet (deprecated)",
@[
"spr:CiUIAhIhA-VlcoiRm02KyIzrcTP-ljFpzTljfBRRKTIvhMIwqBqWEgIDARpJCicAJQgCEiED5WVyiJGbTYrIjOtxM_6" &
"WMWnNOWN8FFEpMi-EwjCoGpYQs8n8wQYaCwoJBHTKubmRAnU6GgsKCQR0yrm5kQJ1OipHMEUCIQDwUNsfReB4ty7JFS" &
"5WVQ6n1fcko89qVAOfQEHixa03rgIgan2-uFNDT-r4s9TOkLe9YBkCbsRWYCHGGVJ25rLj0QE",
"spr:CiUIAhIhApIj9p6zJDRbw2NoCo-tj98Y760YbppRiEpGIE1yGaMzEgIDARpJCicAJQgCEiECkiP2nrMkNFvDY2gKj62P" &
"3xjvrRhumlGISkYgTXIZozMQvcz8wQYaCwoJBAWhF3WRAnVEGgsKCQQFoRd1kQJ1RCpGMEQCIFZB84O_nzPNuViqEGRL" &
"1vJTjHBJ-i5ZDgFL5XZxm4HAAiB8rbLHkUdFfWdiOmlencYVn0noSMRHzn4lJYoShuVzlw",
"spr:CiUIAhIhApqRgeWRPSXocTS9RFkQmwTZRG-Cdt7UR2N7POoz606ZEgIDARpJCicAJQgCEiECmpGB5ZE9JehxNL1EWRCb" &
"BNlEb4J23tRHY3s86jPrTpkQj8_8wQYaCwoJBAXfEfiRAnVOGgsKCQQF3xH4kQJ1TipGMEQCIGWJMsF57N1iIEQgTH7I" &
"rVOgEgv0J2P2v3jvQr5Cjy-RAiAy4aiZ8QtyDvCfl_K_w6SyZ9csFGkRNTpirq_M_QNgKw",
],
),
]
# The bootstrap node SPRs are kept in one source-of-truth config file at the
# repository root. `staticRead` embeds that file into the binary at build time,
# which keeps the node self-contained: the file does not have to sit next to
# the binary at runtime. The bootstrap liveness checker (tools/check_spr) reads
# the very same file at runtime.
const networkPresetsJson = staticRead("../network_presets.json")

proc parsePresetsJson(jsonStr: string): seq[NetworkPreset] =
  ## Builds one `NetworkPreset` per entry of the top-level "presets" array in
  ## `jsonStr`, from the entry's "name", "description" and "records" members.
  for entry in parseJson(jsonStr)["presets"].items:
    var sprs: seq[string]
    for item in entry["records"].items:
      sprs.add(item.getStr)
    result.add(
      NetworkPreset.init(entry["name"].getStr, entry["description"].getStr, sprs)
    )

const NetworkPresets* = parsePresetsJson(networkPresetsJson)
proc loadNetworkPresets*(path: string): seq[NetworkPreset] =
  ## Reads the bootstrap-node config file at `path` at runtime and parses it
  ## with the same parser that produces the compile-time `NetworkPresets`, so
  ## the node and the bootstrap liveness checker share one source of truth.
  let contents = readFile(path)
  result = parsePresetsJson(contents)
proc rawRecords*(self: NetworkPreset): seq[string] =
  ## Returns this preset's `spr:` strings exactly as stored, without parsing
  ## them, so tooling can deal with a parse failure per record rather than
  ## crashing.
  result = self.unparsedRecords
proc `default`*(presets: openArray[NetworkPreset]): NetworkPreset =
  ## The default preset is the first entry of `presets`.
  result = presets[0]
@ -115,7 +86,7 @@ proc `default`*(presets: openArray[NetworkPreset]): NetworkPreset =
# Precomputes those as consts so we can use them in nim-confutils CLI
# help strings.
const
NetworkPresetsDescription* = $NetworkPresets
NetworkPresetsDescription* = describePresets(NetworkPresets)
DefaultNetworkPreset* = NetworkPresets.default
proc find*(presets: openArray[NetworkPreset], p: string): Option[NetworkPreset] =

373
tools/check_spr.nim Normal file
View File

@ -0,0 +1,373 @@
## check_spr - bootstrap-node liveness checker.
##
## Reads the bootstrap SPRs from the shared config file (network_presets.json by
## default) and probes each one. The probe depends on the transport advertised in
## the record:
## * TCP addresses -> a libp2p connection is attempted.
## * UDP addresses -> a discovery v5 (DHT) ping is sent.
## It prints a per-node report — a human-readable table by default, or JSON with
## `--format json` — and exits non-zero if any node is unreachable. A single
## `spr:` URI can also be passed for ad-hoc checks.
##
## IMPORTANT: run this from a host OUTSIDE the fleet VPCs (e.g. a GitHub-hosted
## runner), otherwise nodes advertising private/cloud-internal IPs will appear
## reachable and defeat the purpose.
##
## Usage:
## check_spr [--source <file>] [--network <name>] [--timeout <secs>]
## [--format text|json] [--out <file>]
## check_spr <spr-uri> [--timeout <secs>]
## check_spr --help
##
## Run `check_spr --help` for a full description of every option.
import std/[json, options, os, sequtils, strutils, typetraits, strformat, terminal]
import pkg/chronicles
import pkg/chronos
import pkg/libp2p
import pkg/libp2p/crypto/rng
import pkg/codexdht/discv5/spr
import pkg/codexdht/discv5/node
import pkg/codexdht/discv5/protocol as discv5
import ../storage/presets
const
  # Fallbacks used when --timeout / --source are not given on the command line.
  DefaultTimeoutSecs = 10
  DefaultSource = "network_presets.json"

type
  OutputFormat = enum
    ## The string value of each member is also the spelling accepted by
    ## `--format`, which lets the parser compare against `$member` directly.
    ofText = "text"
    ofJson = "json"

  Verdict = object
    ## Outcome of probing one record: reachable or not, plus a human-readable
    ## explanation.
    alive: bool
    reason: string

  Row = object
    ## One line of the report: the preset the record came from, the identity
    ## and addresses taken from the record, and the probe verdict.
    network: string
    peerId: string
    address: string
    alive: bool
    reason: string
proc hasCodec(addrs: seq[MultiAddress], codec: MultiCodec): bool =
  ## True when at least one address in `addrs` contains `codec`. An address
  ## whose `contains` check yields no value is treated as not containing it.
  for candidate in addrs:
    if candidate.contains(codec).get(false):
      return true
  false
proc setupLogging() =
  ## Configures the chronicles output writers at runtime. The project's
  ## `config.nims` forces `dynamic` chronicles sinks; if their writers are left
  ## unconfigured every log message is dropped with a noisy "dynamic log output
  ## writer not configured" warning. Logs go to stderr so that stdout, which
  ## carries the ALIVE/DEAD verdict, stays clean and parseable.
  when defaultChroniclesStream.outputs.type.arity == 3:
    proc toStderr(logLevel: LogLevel, msg: LogOutputStr) =
      # Best effort: a failing stderr must not take the checker down.
      try:
        stderr.write(msg)
        stderr.flushFile()
      except IOError:
        discard

    proc dropMessage(logLevel: LogLevel, msg: LogOutputStr) =
      discard

    # Only the first output is kept; the other two are silenced.
    defaultChroniclesStream.outputs[2].writer = dropMessage
    defaultChroniclesStream.outputs[1].writer = dropMessage
    defaultChroniclesStream.outputs[0].writer = toStderr
proc buildSwitch(): Switch =
  ## Builds the libp2p switch used for TCP probes: a fresh RNG, a TCP listener
  ## on 0.0.0.0 port 0, Noise, and both Yamux and Mplex. Mplex is included to
  ## match the storage node's switch builder for TCP.
  let listenAddr = MultiAddress.init("/ip4/0.0.0.0/tcp/0").tryGet()
  result = SwitchBuilder
    .new()
    .withRng(newRng())
    .withAddress(listenAddr)
    .withTcpTransport()
    .withNoise()
    .withYamux()
    .withMplex()
    .build()
proc checkLibp2p(
    peerId: PeerId, addresses: seq[MultiAddress], timeout: Duration
): Future[Verdict] {.async.} =
  ## Probes a node over TCP by attempting a libp2p connection to `peerId` at
  ## `addresses`, giving up after `timeout`.
  ##
  ## Always resolves to a `Verdict`: a connection that is established means
  ## alive; a timeout, a connection error, or a failure to start the local
  ## switch all mean not alive, with the cause in `reason`.
  let switch = buildSwitch()
  # A local start failure is reported as a verdict (mirroring how checkDiscv5
  # reports a failed `proto.open()`) instead of escaping and aborting the whole
  # run before the remaining nodes are probed.
  try:
    await switch.start()
  except CatchableError as exc:
    return
      Verdict(alive: false, reason: "failed to start libp2p switch: " & exc.msg)
  # Registered only after a successful start, so stop() pairs with start().
  defer:
    await switch.stop()
  try:
    await switch.connect(peerId, addresses).wait(timeout)
    return Verdict(alive: true, reason: "libp2p connection established")
  except AsyncTimeoutError:
    return Verdict(alive: false, reason: "libp2p connection timed out")
  except CatchableError as exc:
    return Verdict(alive: false, reason: "libp2p connection failed: " & exc.msg)
proc checkDiscv5(
    record: SignedPeerRecord, timeout: Duration
): Future[Verdict] {.async.} =
  ## Probes a node over UDP by sending it a discovery v5 ping and waiting up to
  ## `timeout` for the reply.
  ##
  ## Resolves to an alive `Verdict` when the ping result is ok; every failure
  ## visible here (node construction, opening the protocol, ping error,
  ## timeout) resolves to a not-alive `Verdict` with the cause in `reason`.
  let nodeRes = newNode(record)
  if nodeRes.isErr:
    return Verdict(alive: false, reason: "cannot build discv5 node: " & $nodeRes.error)
  let targetNode = nodeRes.get()
  # The local endpoint gets a freshly generated random key for each probe.
  let rng = newRng()
  let privKey = PrivateKey.random(rng).tryGet()
  # NOTE(review): bindPort = Port(0) presumably lets the OS choose the local
  # port — confirm against the discv5 protocol implementation.
  let proto = discv5.newProtocol(
    privKey, none(IpAddress), none(Port), none(Port), bindPort = Port(0), rng = rng
  )
  try:
    proto.open()
  except CatchableError as exc:
    return Verdict(alive: false, reason: "failed to open discv5: " & exc.msg)
  # Registered only once open() succeeded, so the protocol is closed on every
  # exit path below.
  defer:
    await proto.closeWait()
  try:
    let pong = await discv5.ping(proto, targetNode).wait(timeout)
    if pong.isOk:
      return Verdict(alive: true, reason: "discv5 pong received")
    else:
      return Verdict(alive: false, reason: "discv5 ping failed: " & $pong.error)
  except AsyncTimeoutError:
    return Verdict(alive: false, reason: "discv5 ping timed out")
  except CatchableError as exc:
    return Verdict(alive: false, reason: "discv5 ping failed: " & exc.msg)
proc probe(record: SignedPeerRecord, timeout: Duration): Future[Verdict] {.async.} =
  ## Picks the probe for `record` from the transports it advertises: a libp2p
  ## connection attempt when any address is TCP, otherwise a discv5 ping when
  ## any address is UDP. A record with no addresses, or with neither transport,
  ## is reported as not alive.
  let addrs = record.data.addresses.mapIt(it.address)
  if addrs.len == 0:
    return Verdict(alive: false, reason: "SPR contains no addresses")
  # TCP is checked first, so a record advertising both is probed over libp2p.
  if hasCodec(addrs, multiCodec("tcp")):
    return await checkLibp2p(record.data.peerId, addrs, timeout)
  if hasCodec(addrs, multiCodec("udp")):
    return await checkDiscv5(record, timeout)
  return Verdict(alive: false, reason: "no tcp or udp addresses to probe")
proc probeRecords(
    source, networkFilter: string, timeout: Duration
): Future[seq[Row]] {.async.} =
  ## Loads the presets from the config file `source` and probes their records
  ## one after another, returning one `Row` per record. A non-empty
  ## `networkFilter` restricts probing to the preset with that exact name. A
  ## record that fails to parse is not probed; it yields a not-alive row
  ## carrying the parse error.
  var rows: seq[Row]
  for preset in loadNetworkPresets(source):
    let selected = networkFilter.len == 0 or preset.name == networkFilter
    if not selected:
      continue
    for uri in preset.rawRecords:
      let decoded = SignedPeerRecord.parse(uri)
      if decoded.isOk:
        let spr = decoded.get
        let verdict = await probe(spr, timeout)
        rows.add Row(
          network: preset.name,
          peerId: $spr.data.peerId,
          address: spr.data.addresses.mapIt($it.address).deduplicate.join(", "),
          alive: verdict.alive,
          reason: verdict.reason,
        )
      else:
        rows.add Row(
          network: preset.name,
          alive: false,
          reason: "SPR parse failed: " & decoded.error,
        )
  return rows
const
  # Column widths shared by the header and by every row, so the headings are
  # guaranteed to sit above their columns.
  NetworkColWidth = 12
  StatusColWidth = 6 # wide enough for the "RESULT" heading
  AddressColWidth = 43
  TableHeader =
    "FLEET".alignLeft(NetworkColWidth) & " " & "RESULT".alignLeft(StatusColWidth) &
    " " & "ADDRESS".alignLeft(AddressColWidth) & " " & "REASON"
  TableRule = '-'.repeat(TableHeader.len)

proc formatRow(r: Row): string =
  ## One table line for a probed node. Every cell except the trailing reason is
  ## padded to the same width as its heading in `TableHeader`, so columns line
  ## up regardless of the verdict. ANSI color, when added by the styled
  ## renderer, is zero-width on screen, so the plain-text alignment computed
  ## here stays visually correct.
  let status = if r.alive: "ALIVE" else: "DEAD"
  r.network.alignLeft(NetworkColWidth) & " " & status.alignLeft(StatusColWidth) & " " &
    r.address.alignLeft(AddressColWidth) & " " & r.reason
proc renderTable(rows: seq[Row]): string =
  ## Renders the uncolored table: header, rule, then one line per row, joined
  ## with newlines and without a trailing newline. This form is used for files
  ## and non-interactive stdout, where ANSI escapes would be corruption rather
  ## than decoration.
  let lines = @[TableHeader, TableRule] & rows.mapIt(formatRow(it))
  lines.join("\n")
proc printStyledTable(rows: seq[Row]) =
  ## Prints the table with color for an interactive terminal: bright header,
  ## dim rule, green for alive rows and red for dead ones. `styledEcho` emits
  ## ANSI escapes unconditionally, so the caller is responsible for only using
  ## this on a TTY (see the isatty guard at the call site).
  styledEcho(styleBright, TableHeader)
  styledEcho(styleDim, TableRule)
  for row in rows:
    let color = if row.alive: fgGreen else: fgRed
    styledEcho(color, formatRow(row))
proc rowsToJson(rows: seq[Row]): JsonNode =
  ## Converts the report into a JSON array holding one object per row, with the
  ## keys "network", "peerId", "address", "alive" and "reason" in that order.
  result = newJArray()
  for row in rows:
    let entry = newJObject()
    entry["network"] = %row.network
    entry["peerId"] = %row.peerId
    entry["address"] = %row.address
    entry["alive"] = %row.alive
    entry["reason"] = %row.reason
    result.add(entry)
proc parseTimeout(s: string): int =
  ## Parses the `--timeout` value as a whole number of seconds.
  ##
  ## Exits the program with an error message when `s` is not an integer, or
  ## when it is zero or negative: such a timeout would make every probe time
  ## out at once and report healthy nodes as unreachable.
  result =
    try:
      parseInt(s)
    except ValueError:
      quit("Error: timeout must be an integer number of seconds", QuitFailure)
  if result <= 0:
    quit("Error: timeout must be greater than zero seconds", QuitFailure)
proc parseFormat(s: string): OutputFormat =
  ## Maps a `--format` argument onto its `OutputFormat` member by comparing it
  ## with each member's string value; any other spelling exits the program with
  ## an error message.
  if s == $ofText:
    return ofText
  if s == $ofJson:
    return ofJson
  quit("Error: format must be '" & $ofText & "' or '" & $ofJson & "'", QuitFailure)
proc printHelp() =
  ## Prints the full usage text to stdout. The defaults shown for --source and
  ## --timeout are spliced in from `DefaultSource` and `DefaultTimeoutSecs`, so
  ## the help cannot drift from the values actually used. Sections are
  ## separated by blank lines and option descriptions are aligned in a column.
  echo """check_spr - bootstrap-node liveness checker.

Reads bootstrap SPRs from a config file and probes each one (libp2p connect for
TCP addresses, discv5 ping for UDP). Prints a per-node table and exits non-zero
if any node is unreachable. A single spr: URI can be passed for an ad-hoc check.

IMPORTANT: run from a host OUTSIDE the fleet VPCs (e.g. a GitHub-hosted runner),
otherwise nodes advertising private/cloud-internal IPs appear reachable and
defeat the purpose.

Usage:
  check_spr [options]
  check_spr <spr-uri> [--timeout <secs>]

Options:
  --source <file>    Config file to read SPRs from (default: """ &
    DefaultSource & """).
  --network <name>   Only probe the preset with this network name.
  --timeout <secs>   Per-node probe timeout in seconds (default: """ &
    $DefaultTimeoutSecs & """).
  --format <fmt>     Output format: "text" (default) or "json". "text" is the
                     human-readable table; "json" is a pretty-printed summary.
  --out <file>       Write the output to <file> instead of stdout. The content
                     is whichever --format is selected (text or json).
  --help, -h         Show this help and exit.

Arguments:
  <spr-uri>          A single "spr:" URI to probe instead of reading the config
                     file. Prints ALIVE/DEAD and exits with the matching status;
                     this mode ignores --format and --out."""
when isMainModule:
  # Entry point. Parses the command line, then either probes a single `spr:`
  # URI (prints ALIVE/DEAD and exits with the matching status) or probes every
  # record of the selected presets, emits the report as text or JSON to stdout
  # or a file, and exits non-zero when at least one node is unreachable.

  proc optionValue(args: seq[string], idx: var int): string =
    ## Advances `idx` onto the value that follows the option at `args[idx]` and
    ## returns it. Exits with an error, rather than indexing past the end, when
    ## the option is the last argument on the command line.
    let flag = args[idx]
    inc idx
    if idx >= args.len:
      quit("Error: missing value for " & flag, QuitFailure)
    args[idx]

  setupLogging()
  var
    source = DefaultSource
    networkFilter = ""
    timeoutSecs = DefaultTimeoutSecs
    singleSpr = ""
    format = ofText
    outFile = ""
  let params = commandLineParams()
  var i = 0
  while i < params.len:
    case params[i]
    of "--help", "-h":
      printHelp()
      quit(QuitSuccess)
    of "--source":
      source = optionValue(params, i)
    of "--network":
      networkFilter = optionValue(params, i)
    of "--timeout":
      timeoutSecs = parseTimeout(optionValue(params, i))
    of "--format":
      format = parseFormat(optionValue(params, i))
    of "--out":
      outFile = optionValue(params, i)
    else:
      # The only positional argument accepted is a single `spr:` URI.
      if params[i].startsWith("spr:"):
        singleSpr = params[i]
      else:
        quit("Error: unknown argument: " & params[i], QuitFailure)
    inc i
  let timeout = timeoutSecs.seconds

  # Ad-hoc mode: probe one URI and exit; --format and --out are ignored here.
  if singleSpr.len > 0:
    let parsed = SignedPeerRecord.parse(singleSpr)
    if parsed.isErr:
      quit("Error: " & parsed.error, QuitFailure)
    let v = waitFor probe(parsed.get, timeout)
    echo (if v.alive: "ALIVE: " else: "DEAD: "), v.reason
    quit(if v.alive: QuitSuccess else: QuitFailure)

  # A missing or malformed config file is a usage problem: report it as a
  # one-line error instead of letting the exception escape as a stack trace.
  let rows =
    try:
      waitFor probeRecords(source, networkFilter, timeout)
    except CatchableError as exc:
      quit("Error: cannot probe SPRs from " & source & ": " & exc.msg, QuitFailure)

  if outFile.len > 0:
    # Files (and the JSON form) must stay plain — ANSI color codes would corrupt
    # them — so always write the uncolored rendering. writeFile does not append a
    # trailing newline the way echo does, so add one to keep the file POSIX-clean.
    let output =
      case format
      of ofText:
        renderTable(rows)
      of ofJson:
        pretty(rowsToJson(rows))
    writeFile(outFile, output & "\n")
    # The report is in the file, not on stdout, so point the caller at it. Use
    # the absolute path so the line is unambiguous regardless of the cwd.
    #!fmt: off
    styledEcho bgWhite, fgBlack, styleBright,
      "\n\n ",
      styleUnderscore,
      " BOOTSTRAP HEALTH REPORT \n\n",
      resetStyle, bgWhite, fgBlack, styleBright,
      """ Bootstrap health report for this run will be available at:""",
      resetStyle, bgWhite, fgBlack,
      &"\n\n {absolutePath(outFile)}\n\n",
      resetStyle, bgWhite, fgBlack, styleBright,
      " NOTE: For CI runs, the report will be displayed in the workflow summary\n"
    #!fmt: on
  else:
    case format
    of ofText:
      # Color only when stdout is an interactive terminal; when piped or
      # redirected, fall back to the plain table so escapes never reach the
      # consumer (std/terminal emits ANSI unconditionally on POSIX).
      if isatty(stdout):
        printStyledTable(rows)
      else:
        echo renderTable(rows)
    of ofJson:
      echo pretty(rowsToJson(rows))

  let dead = rows.filterIt(not it.alive)
  # The one-line outcome banner is only printed when the report went to a file;
  # otherwise the report itself is already on stdout.
  if outFile.len > 0:
    if dead.len > 0:
      styledEcho styleBright,
        fgRed,
        &"\n\n[❌ ERROR] ",
        resetStyle,
        "One or more bootstrap nodes are unreachable, see report for details.\n\n"
    else:
      styledEcho styleBright,
        fgGreen,
        &"\n\n[✅ SUCCESS] ",
        resetStyle,
        "All bootstrap nodes are reachable, see report for details.\n\n"
  if dead.len > 0:
    quit(QuitFailure)