mirror of
https://github.com/logos-storage/logos-storage-nim.git
synced 2026-06-26 04:19:30 +00:00
feat(monitoring): ping bootstrap nodes daily and open issue if some are dead
Creates a check_spr binary that reads an SPR and reports whether that SPR is dialable (for TCP) or pingable (for UDP). The check_spr binary takes several args and options:
Usage:
check_spr [options]
check_spr <spr-uri> [--timeout <secs>]
Options:
--source <file> Config file to read SPRs from (default: network_presets.json).
--network <name> Only probe the preset with this network name.
--timeout <secs> Per-node probe timeout in seconds (default: 10).
--format <fmt> Output format: "text" (default) or "json". "text" is the
human-readable table; "json" is a pretty-printed summary.
--out <file> Write the output to <file> instead of stdout. The content
is whichever --format is selected (text or json).
--help, -h Show this help and exit.
Arguments:
<spr-uri> A single "spr:" URI to probe instead of reading the config
file. Prints ALIVE/DEAD and exits with the matching status;
this mode ignores --format and --out.
This commit is contained in:
parent
acdc0fc325
commit
f538083894
102
.github/workflows/bootstrap-health-check.yml
vendored
Normal file
102
.github/workflows/bootstrap-health-check.yml
vendored
Normal file
@ -0,0 +1,102 @@
|
||||
name: Bootstrap nodes health check
|
||||
|
||||
# Scheduled liveness check for the preset bootstrap nodes. Runs on a
|
||||
# GitHub-hosted runner (public internet) so nodes advertising private/cloud
|
||||
# internal IPs are correctly seen as unreachable. On any unreachable node it
|
||||
# fails the job and opens/updates a tracking issue labelled `bootstrap-health`.
|
||||
|
||||
on:
|
||||
push:
|
||||
branches: [feat/bootstrap-health-check] # ← temporary for testing; remove before merge
|
||||
schedule:
|
||||
- cron: "0 6 * * *" # daily 06:00 UTC
|
||||
workflow_dispatch:
|
||||
|
||||
env:
|
||||
nim_version: v2.2.10
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}
|
||||
cancel-in-progress: false
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
issues: write
|
||||
|
||||
jobs:
|
||||
ping:
|
||||
name: Ping preset bootstrap nodes
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Checkout sources
|
||||
uses: actions/checkout@v6
|
||||
with:
|
||||
submodules: recursive
|
||||
|
||||
- name: Setup Nimbus Build System
|
||||
uses: ./.github/actions/nimbus-build-system
|
||||
with:
|
||||
os: linux
|
||||
nim_version: ${{ env.nim_version }}
|
||||
|
||||
- name: Ping bootstrap nodes
|
||||
id: ping
|
||||
continue-on-error: true
|
||||
run: make CI=true bootstrapHealthCheck
|
||||
shell: bash
|
||||
|
||||
- name: Build report
|
||||
id: report
|
||||
if: always()
|
||||
run: |
|
||||
json=build/bootstrap-health-report.json
|
||||
if [ ! -f "$json" ]; then
|
||||
echo "no_output=true" >> "$GITHUB_OUTPUT"
|
||||
echo "::error::check_spr produced no output file"
|
||||
exit 0
|
||||
fi
|
||||
dead=$(jq '[.[] | select(.alive==false)] | length' "$json")
|
||||
total=$(jq 'length' "$json")
|
||||
echo "dead=$dead" >> "$GITHUB_OUTPUT"
|
||||
{
|
||||
echo "## Bootstrap node liveness ($((total - dead))/$total reachable)"
|
||||
echo
|
||||
echo "| Network | Result | Address | Reason |"
|
||||
echo "|---|---|---|---|"
|
||||
jq -r '.[] | "| \(.network) | \(if .alive then "✅ ALIVE" else "❌ DEAD" end) | \(.address) | \(.reason) |"' "$json"
|
||||
} | tee report.md >> "$GITHUB_STEP_SUMMARY"
|
||||
shell: bash
|
||||
|
||||
- name: Open or update tracking issue
|
||||
if: always() && steps.report.outputs.dead != '0' && steps.report.outputs.dead != ''
|
||||
env:
|
||||
GH_TOKEN: ${{ github.token }}
|
||||
DEAD: ${{ steps.report.outputs.dead }}
|
||||
RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
|
||||
run: |
|
||||
gh label create bootstrap-health --color B60205 \
|
||||
--description "Automated bootstrap-node liveness alerts" 2>/dev/null || true
|
||||
{
|
||||
echo "Scheduled bootstrap-node liveness check found **${DEAD}** unreachable node(s)."
|
||||
echo
|
||||
echo "Run: ${RUN_URL}"
|
||||
echo
|
||||
cat report.md
|
||||
} > issue-body.md
|
||||
existing=$(gh issue list --label bootstrap-health --state open --json number --jq '.[0].number')
|
||||
if [ -n "$existing" ]; then
|
||||
gh issue comment "$existing" --body-file issue-body.md
|
||||
else
|
||||
gh issue create --title "Bootstrap nodes unreachable" \
|
||||
--label bootstrap-health --body-file issue-body.md
|
||||
fi
|
||||
shell: bash
|
||||
|
||||
- name: Fail if any node is unreachable
  # Final gate: the ping step runs with continue-on-error, so this step is what
  # actually turns the job red. It fires when the report counted at least one
  # dead node, or when the report step found no JSON file at all.
  if: always() && ((steps.report.outputs.dead != '0' && steps.report.outputs.dead != '') || steps.report.outputs.no_output == 'true')
  env:
    DEAD: ${{ steps.report.outputs.dead }}
    NO_OUTPUT: ${{ steps.report.outputs.no_output }}
  run: |
    # DEAD is empty when no report was produced; say so instead of printing
    # a message with a blank node count.
    if [ "${NO_OUTPUT}" = "true" ]; then
      echo "Bootstrap liveness check failed: check_spr produced no report."
    else
      echo "Bootstrap liveness check failed: ${DEAD} unreachable node(s)."
    fi
    exit 1
  shell: bash
|
||||
16
Makefile
16
Makefile
@ -152,6 +152,22 @@ testIntegration: | build deps
|
||||
echo -e $(BUILD_MSG) "build/$@" && \
|
||||
$(ENV_SCRIPT) nim testIntegration $(TEST_PARAMS) $(NIM_PARAMS) build.nims
|
||||
|
||||
BOOTSTRAP_HEALTH_CHECK_PARAMS :=
|
||||
ifdef CI
|
||||
BOOTSTRAP_HEALTH_CHECK_PARAMS := $(BOOTSTRAP_HEALTH_CHECK_PARAMS) -d:ci=$(CI)
|
||||
endif
|
||||
|
||||
checkSpr: | build deps
|
||||
echo -e $(BUILD_MSG) "build/check_spr" && \
|
||||
$(ENV_SCRIPT) nim checkSpr $(NIM_PARAMS) build.nims
|
||||
|
||||
# Pings the preset bootstrap nodes and fails if any are unreachable.
|
||||
# Run from OUTSIDE the fleet VPCs (e.g. a GitHub-hosted runner) so nodes that
|
||||
# advertise private/cloud-internal IPs are correctly seen as unreachable.
|
||||
bootstrapHealthCheck: | build deps
|
||||
echo -e $(BUILD_MSG) "build/check_spr" && \
|
||||
$(ENV_SCRIPT) nim bootstrapHealthCheck $(NIM_PARAMS) $(BOOTSTRAP_HEALTH_CHECK_PARAMS) build.nims
|
||||
|
||||
# Builds a C example that uses the libstorage C library and runs it
|
||||
testLibstorageC: | build deps
|
||||
$(MAKE) $(if $(ncpu),-j$(ncpu),) libstorage
|
||||
|
||||
20
build.nims
20
build.nims
@ -81,6 +81,26 @@ task mixTools, "build mix tools (mix_pool, mix_relay_dht)":
|
||||
buildBinary "mix_relay_dht",
|
||||
outName = "mix_relay_dht", srcDir = "tools/mix/", params = mixParams
|
||||
|
||||
task checkSpr, "build check_spr used for checking bootstrap node health":
  # Release build with chronicles runtime filtering enabled and the log level
  # set to WARN via compile-time defines.
  const checkSprParams =
    "-d:release -d:chronicles_runtime_filtering -d:chronicles_log_level=WARN"
  buildBinary("check_spr", srcDir = "tools/", params = checkSprParams)
|
||||
|
||||
task bootstrapHealthCheck, "ping preset bootstrap nodes; non-zero exit if any are unreachable":
  # Build the checker first, then run it.
  checkSprTask()

  # The Makefile forwards `CI=<value>` as `-d:ci=<value>`. When that define is
  # present and truthy, have check_spr write the JSON summary to a file so the
  # scheduled workflow can read it.
  #
  # Match the define by exact prefix: a plain substring test for "ci" also hits
  # unrelated parameters (e.g. a path containing "ci"), and indexing the result
  # of split('=') on such a parameter fails when it has no '='.
  const ciPrefixes = ["-d:ci=", "--define:ci="]
  var args = ""
  block findCiDefine:
    for i in 1 .. paramCount():
      let param = paramStr(i)
      for prefix in ciPrefixes:
        if param.startsWith(prefix) and truthy(param[prefix.len .. ^1]):
          args = "--format json --out build/bootstrap-health-report.json"
          break findCiDefine

  # check_spr exits non-zero when a node is unreachable, failing the workflow
  # run.
  exec "build/check_spr " & args
|
||||
|
||||
task testStorage, "Build & run Logos Storage tests":
|
||||
test "testStorage", outName = "testStorage"
|
||||
|
||||
|
||||
34
network_presets.json
Normal file
34
network_presets.json
Normal file
@ -0,0 +1,34 @@
|
||||
{
|
||||
"presets": [
|
||||
{
|
||||
"name": "logos.test",
|
||||
"description": "Logos testnet",
|
||||
"records": [
|
||||
"spr:CiUIAhIhA6rD-Sa1mJqHOoYMk8yad7B4BYDEI_toNwb1z0cYIRu6EgIDARpJCicAJQgCEiEDqsP5JrWYmoc6hgyTzJp3sHgFgMQj-2g3BvXPRxghG7oQwoe70AYaCwoJBEDhQ42RAiOCGgsKCQRA4UONkQIjgipHMEUCIQCulmrBDKTTxL8uBQYtEfp3_n3qDZFbO8lZ8mfIWHrRBAIgNXpVlWD1VlXzbGuJ4t7u8b7ymm3AYwm6-KjUvH6NfKU",
|
||||
"spr:CiUIAhIhAsFwlXD-3VpX-Pa3taM15wdL3DS75l_dpVCIFhdaIKYREgIDARpJCicAJQgCEiECwXCVcP7dWlf49re1ozXnB0vcNLvmX92lUIgWF1ogphEQ4Ym70AYaCwoJBI5d6vqRAiOCGgsKCQSOXer6kQIjgipGMEQCIGaofSX23DDUcWEMElHtlaFbLAsM0YgrMB4UwOIqPMb8AiBFRodJ_5-bkvoLuPo3K2nMGzKXZqXnII4poJhhopSo8A",
|
||||
"spr:CiUIAhIhAwSYqf83tfZom9eGFFdXXea-dblO-I7-I8B1kjhfJeEAEgIDARpJCicAJQgCEiEDBJip_ze19mib14YUV1dd5r51uU74jv4jwHWSOF8l4QAQjoi70AYaCwoJBAqAABKRAiOCGgsKCQQKgAASkQIjgipHMEUCIQDEtfOFABgYosMflQ-d_v-qkc5FhwSwd_PTcA414MBYAAIgGbhCTAwbSJr5boiARoVFZ-XrBhfFBc_J5Kk5drdQoTE",
|
||||
"spr:CiUIAhIhA4T8XrxB6PKor8f7j7eqKxgIXH6mMST0_Uel5hZjSDp2EgIDARpJCicAJQgCEiEDhPxevEHo8qivx_uPt6orGAhcfqYxJPT9R6XmFmNIOnYQrYq70AYaCwoJBAqAAFuRAiOCGgsKCQQKgABbkQIjgipGMEQCIHA1l1NTOh06ca9seLlmAtPsTiNJo9Re0s51WakQTTf1AiAFJkhsi2Qv0fq8hY3AWlibqhhh_WiI3q6QabPGVXzuAA",
|
||||
"spr:CiUIAhIhAqk6NgpRxbKvI02Up24XP3U-dD3TdKRurXpW-ak3Zvh-EgIDARpJCicAJQgCEiECqTo2ClHFsq8jTZSnbhc_dT50PdN0pG6telb5qTdm-H4Q3oa70AYaCwoJBKwf79KRAiOCGgsKCQSsH-_SkQIjgipHMEUCIQCRubKOjNcLZEJut0Ts6wy_BEij4z-1WO6WiOVzT0svfQIgWKOBWVoopNC7zk1byUJMpNMOi05cKVsLoCBkW3RC9-Q",
|
||||
"spr:CiUIAhIhAvSGKPkE3mD7MP-ZCWS5AEvzcDNVsM6XFYeCBXNja7h2EgIDARpJCicAJQgCEiEC9IYo-QTeYPsw_5kJZLkAS_NwM1WwzpcVh4IFc2NruHYQ_Yi70AYaCwoJBKwf79ORAiOCGgsKCQSsH-_TkQIjgipHMEUCIQDJuV1B1sDyyxkNs8g3ahZ13GN9r7PEBP7xY4xGlm5n0AIgdyY2JEOyZ1FMdOzN6aZbAWo83AyjCrR-n0sietE1624"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "logos.dev",
|
||||
"description": "Logos devnet",
|
||||
"records": [
|
||||
"spr:CiUIAhIhAwfZDeTtWNlSgRbZlZfvxLI5Bpy0lFEYN7gImS3oHNaSEgIDARpJCicAJQgCEiEDB9kN5O1Y2VKBFtmVl-_EsjkGnLSUURg3uAiZLegc1pIQ__O20AYaCwoJBBiQTsiRAiOCGgsKCQQYkE7IkQIjgipHMEUCIQCIZx-HlVsLXJLhD6SEVx6Zt_1aG9IqMq-Luvz8No_J0wIgc8I9PRtheG4s5tzHjkEJMLcq3Jf09IT_FGkzPcJm8h4",
|
||||
"spr:CiUIAhIhA8d4LjRirtXO1M-JEmbhVA0CQeA7hHNR9BA7DvFsPKTEEgIDARpJCicAJQgCEiEDx3guNGKu1c7Uz4kSZuFUDQJB4DuEc1H0EDsO8Ww8pMQQhPW20AYaCwoJBCIq5juRAiOCGgsKCQQiKuY7kQIjgipGMEQCIHV_8nJ0iedWjlAxUhBmdAbDPLu5g2RmcnmJBD8cbD98AiAp1w9nAJgLlPIr41aMcdkds_eSoh8ImOVKvq6Idx-Ugg",
|
||||
"spr:CiUIAhIhA_MocWwn1_t__FEONMqYluUjc9ZVkcvYRLo6C0GzTkbfEgIDARpJCicAJQgCEiED8yhxbCfX-3_8UQ40ypiW5SNz1lWRy9hEujoLQbNORt8QlfO20AYaCwoJBC_u5W-RAiOCGgsKCQQv7uVvkQIjgipGMEQCIHMpQO31gg4FoKYtDyTTQS8xFz1KEmfqH385EeMUNbhPAiBblCkmOfQBmXj6eryaSiXWsftgohE-SPbKwsASZ1Zs3Q"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "codex.dev",
|
||||
"description": "Codex legacy devnet (deprecated)",
|
||||
"records": [
|
||||
"spr:CiUIAhIhA-VlcoiRm02KyIzrcTP-ljFpzTljfBRRKTIvhMIwqBqWEgIDARpJCicAJQgCEiED5WVyiJGbTYrIjOtxM_6WMWnNOWN8FFEpMi-EwjCoGpYQs8n8wQYaCwoJBHTKubmRAnU6GgsKCQR0yrm5kQJ1OipHMEUCIQDwUNsfReB4ty7JFS5WVQ6n1fcko89qVAOfQEHixa03rgIgan2-uFNDT-r4s9TOkLe9YBkCbsRWYCHGGVJ25rLj0QE",
|
||||
"spr:CiUIAhIhApIj9p6zJDRbw2NoCo-tj98Y760YbppRiEpGIE1yGaMzEgIDARpJCicAJQgCEiECkiP2nrMkNFvDY2gKj62P3xjvrRhumlGISkYgTXIZozMQvcz8wQYaCwoJBAWhF3WRAnVEGgsKCQQFoRd1kQJ1RCpGMEQCIFZB84O_nzPNuViqEGRL1vJTjHBJ-i5ZDgFL5XZxm4HAAiB8rbLHkUdFfWdiOmlencYVn0noSMRHzn4lJYoShuVzlw",
|
||||
"spr:CiUIAhIhApqRgeWRPSXocTS9RFkQmwTZRG-Cdt7UR2N7POoz606ZEgIDARpJCicAJQgCEiECmpGB5ZE9JehxNL1EWRCbBNlEb4J23tRHY3s86jPrTpkQj8_8wQYaCwoJBAXfEfiRAnVOGgsKCQQF3xH4kQJ1TipGMEQCIGWJMsF57N1iIEQgTH7IrVOgEgv0J2P2v3jvQr5Cjy-RAiAy4aiZ8QtyDvCfl_K_w6SyZ9csFGkRNTpirq_M_QNgKw"
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
@ -4,6 +4,7 @@
|
||||
# devnet and latest testnet, respectively.
|
||||
import std/options
|
||||
import std/strutils
|
||||
import std/json
|
||||
|
||||
import pkg/chronicles
|
||||
import pkg/codexdht/discv5/protocol
|
||||
@ -31,7 +32,7 @@ proc init*(
|
||||
func `$`*(preset: NetworkPreset): string =
|
||||
"[" & preset.name & "]: " & preset.description
|
||||
|
||||
func `$`*[N](presets: array[N, NetworkPreset]): string =
|
||||
func describePresets(presets: openArray[NetworkPreset]): string =
|
||||
result = ""
|
||||
for preset in presets:
|
||||
result &= $preset & "; "
|
||||
@ -52,62 +53,32 @@ proc `bootstrapNodes`*(self: NetworkPreset): seq[SignedPeerRecord] =
|
||||
# it should crash the node.
|
||||
result.add(parse(SignedPeerRecord, record).tryGet())
|
||||
|
||||
const NetworkPresets* = [
|
||||
NetworkPreset.init(
|
||||
"logos.test",
|
||||
"Logos testnet",
|
||||
@[
|
||||
"spr:CiUIAhIhA6rD-Sa1mJqHOoYMk8yad7B4BYDEI_toNwb1z0cYIRu6EgIDARpJCicAJQgCEiEDqsP5JrWYmoc6hgyT" &
|
||||
"zJp3sHgFgMQj-2g3BvXPRxghG7oQwoe70AYaCwoJBEDhQ42RAiOCGgsKCQRA4UONkQIjgipHMEUCIQCulmrBDKTTxL" &
|
||||
"8uBQYtEfp3_n3qDZFbO8lZ8mfIWHrRBAIgNXpVlWD1VlXzbGuJ4t7u8b7ymm3AYwm6-KjUvH6NfKU",
|
||||
"spr:CiUIAhIhAsFwlXD-3VpX-Pa3taM15wdL3DS75l_dpVCIFhdaIKYREgIDARpJCicAJQgCEiECwXCVcP7dWlf49re1" &
|
||||
"ozXnB0vcNLvmX92lUIgWF1ogphEQ4Ym70AYaCwoJBI5d6vqRAiOCGgsKCQSOXer6kQIjgipGMEQCIGaofSX23DDUcW" &
|
||||
"EMElHtlaFbLAsM0YgrMB4UwOIqPMb8AiBFRodJ_5-bkvoLuPo3K2nMGzKXZqXnII4poJhhopSo8A",
|
||||
"spr:CiUIAhIhAwSYqf83tfZom9eGFFdXXea-dblO-I7-I8B1kjhfJeEAEgIDARpJCicAJQgCEiEDBJip_ze19mib14YU" &
|
||||
"V1dd5r51uU74jv4jwHWSOF8l4QAQjoi70AYaCwoJBAqAABKRAiOCGgsKCQQKgAASkQIjgipHMEUCIQDEtfOFABgYo" &
|
||||
"sMflQ-d_v-qkc5FhwSwd_PTcA414MBYAAIgGbhCTAwbSJr5boiARoVFZ-XrBhfFBc_J5Kk5drdQoTE",
|
||||
"spr:CiUIAhIhA4T8XrxB6PKor8f7j7eqKxgIXH6mMST0_Uel5hZjSDp2EgIDARpJCicAJQgCEiEDhPxevEHo8qivx_uP" &
|
||||
"t6orGAhcfqYxJPT9R6XmFmNIOnYQrYq70AYaCwoJBAqAAFuRAiOCGgsKCQQKgABbkQIjgipGMEQCIHA1l1NTOh06ca9s" &
|
||||
"eLlmAtPsTiNJo9Re0s51WakQTTf1AiAFJkhsi2Qv0fq8hY3AWlibqhhh_WiI3q6QabPGVXzuAA",
|
||||
"spr:CiUIAhIhAqk6NgpRxbKvI02Up24XP3U-dD3TdKRurXpW-ak3Zvh-EgIDARpJCicAJQgCEiECqTo2ClHFsq8jTZSn" &
|
||||
"bhc_dT50PdN0pG6telb5qTdm-H4Q3oa70AYaCwoJBKwf79KRAiOCGgsKCQSsH-_SkQIjgipHMEUCIQCRubKOjNcLZEJu" &
|
||||
"t0Ts6wy_BEij4z-1WO6WiOVzT0svfQIgWKOBWVoopNC7zk1byUJMpNMOi05cKVsLoCBkW3RC9-Q",
|
||||
"spr:CiUIAhIhAvSGKPkE3mD7MP-ZCWS5AEvzcDNVsM6XFYeCBXNja7h2EgIDARpJCicAJQgCEiEC9IYo-QTeYPsw_5kJ" &
|
||||
"ZLkAS_NwM1WwzpcVh4IFc2NruHYQ_Yi70AYaCwoJBKwf79ORAiOCGgsKCQSsH-_TkQIjgipHMEUCIQDJuV1B1sDyyx" &
|
||||
"kNs8g3ahZ13GN9r7PEBP7xY4xGlm5n0AIgdyY2JEOyZ1FMdOzN6aZbAWo83AyjCrR-n0sietE1624",
|
||||
],
|
||||
),
|
||||
NetworkPreset.init(
|
||||
"logos.dev",
|
||||
"Logos devnet",
|
||||
@[
|
||||
"spr:CiUIAhIhAwfZDeTtWNlSgRbZlZfvxLI5Bpy0lFEYN7gImS3oHNaSEgIDARpJCicAJQgCEiEDB9kN5O1Y2VKBFtmVl-" &
|
||||
"_EsjkGnLSUURg3uAiZLegc1pIQ__O20AYaCwoJBBiQTsiRAiOCGgsKCQQYkE7IkQIjgipHMEUCIQCIZx-HlVsLXJLhD6SEV" &
|
||||
"x6Zt_1aG9IqMq-Luvz8No_J0wIgc8I9PRtheG4s5tzHjkEJMLcq3Jf09IT_FGkzPcJm8h4",
|
||||
"spr:CiUIAhIhA8d4LjRirtXO1M-JEmbhVA0CQeA7hHNR9BA7DvFsPKTEEgIDARpJCicAJQgCEiEDx3guNGKu1c7Uz4kSZu" &
|
||||
"FUDQJB4DuEc1H0EDsO8Ww8pMQQhPW20AYaCwoJBCIq5juRAiOCGgsKCQQiKuY7kQIjgipGMEQCIHV_8nJ0iedWjlAxUhBm" &
|
||||
"dAbDPLu5g2RmcnmJBD8cbD98AiAp1w9nAJgLlPIr41aMcdkds_eSoh8ImOVKvq6Idx-Ugg",
|
||||
"spr:CiUIAhIhA_MocWwn1_t__FEONMqYluUjc9ZVkcvYRLo6C0GzTkbfEgIDARpJCicAJQgCEiED8yhxbCfX-3_8UQ40yp" &
|
||||
"iW5SNz1lWRy9hEujoLQbNORt8QlfO20AYaCwoJBC_u5W-RAiOCGgsKCQQv7uVvkQIjgipGMEQCIHMpQO31gg4FoKYtDyTT" &
|
||||
"QS8xFz1KEmfqH385EeMUNbhPAiBblCkmOfQBmXj6eryaSiXWsftgohE-SPbKwsASZ1Zs3Q",
|
||||
],
|
||||
),
|
||||
NetworkPreset.init(
|
||||
"codex.dev",
|
||||
"Codex legacy devnet (deprecated)",
|
||||
@[
|
||||
"spr:CiUIAhIhA-VlcoiRm02KyIzrcTP-ljFpzTljfBRRKTIvhMIwqBqWEgIDARpJCicAJQgCEiED5WVyiJGbTYrIjOtxM_6" &
|
||||
"WMWnNOWN8FFEpMi-EwjCoGpYQs8n8wQYaCwoJBHTKubmRAnU6GgsKCQR0yrm5kQJ1OipHMEUCIQDwUNsfReB4ty7JFS" &
|
||||
"5WVQ6n1fcko89qVAOfQEHixa03rgIgan2-uFNDT-r4s9TOkLe9YBkCbsRWYCHGGVJ25rLj0QE",
|
||||
"spr:CiUIAhIhApIj9p6zJDRbw2NoCo-tj98Y760YbppRiEpGIE1yGaMzEgIDARpJCicAJQgCEiECkiP2nrMkNFvDY2gKj62P" &
|
||||
"3xjvrRhumlGISkYgTXIZozMQvcz8wQYaCwoJBAWhF3WRAnVEGgsKCQQFoRd1kQJ1RCpGMEQCIFZB84O_nzPNuViqEGRL" &
|
||||
"1vJTjHBJ-i5ZDgFL5XZxm4HAAiB8rbLHkUdFfWdiOmlencYVn0noSMRHzn4lJYoShuVzlw",
|
||||
"spr:CiUIAhIhApqRgeWRPSXocTS9RFkQmwTZRG-Cdt7UR2N7POoz606ZEgIDARpJCicAJQgCEiECmpGB5ZE9JehxNL1EWRCb" &
|
||||
"BNlEb4J23tRHY3s86jPrTpkQj8_8wQYaCwoJBAXfEfiRAnVOGgsKCQQF3xH4kQJ1TipGMEQCIGWJMsF57N1iIEQgTH7I" &
|
||||
"rVOgEgv0J2P2v3jvQr5Cjy-RAiAy4aiZ8QtyDvCfl_K_w6SyZ9csFGkRNTpirq_M_QNgKw",
|
||||
],
|
||||
),
|
||||
]
|
||||
# Bootstrap node SPRs live in a single source-of-truth config file at the repo
|
||||
# root. staticRead embeds it into the binary at build time, so the node remains
|
||||
# self-contained (the file is not needed alongside the binary at runtime). The
|
||||
# same file is read at runtime by the bootstrap liveness checker (tools/check_spr).
|
||||
const networkPresetsJson = staticRead("../network_presets.json")
|
||||
|
||||
proc parsePresetsJson(jsonStr: string): seq[NetworkPreset] =
  ## Converts the presets config document into `NetworkPreset` values.
  ##
  ## The document is expected to hold a top-level "presets" array whose entries
  ## each carry "name", "description" and a "records" array of SPR strings.
  ## Records are kept as raw strings; they are not parsed here.
  for entry in parseJson(jsonStr)["presets"].items:
    var sprs: seq[string]
    for item in entry["records"].items:
      sprs.add(item.getStr)
    let preset =
      NetworkPreset.init(entry["name"].getStr, entry["description"].getStr, sprs)
    result.add(preset)
|
||||
|
||||
const NetworkPresets* = parsePresetsJson(networkPresetsJson)
|
||||
|
||||
proc loadNetworkPresets*(path: string): seq[NetworkPreset] =
  ## Reads the presets config file at `path` at runtime and parses it with the
  ## same parser that builds the compile-time `NetworkPresets` constant, so the
  ## node and the bootstrap liveness checker share one source of truth.
  let contents = readFile(path)
  parsePresetsJson(contents)
|
||||
|
||||
proc rawRecords*(self: NetworkPreset): seq[string] =
  ## Returns this preset's `spr:` strings exactly as stored, without parsing
  ## them. Intended for tooling that prefers to deal with a parse failure on an
  ## individual record rather than crash.
  result = self.unparsedRecords
|
||||
|
||||
proc `default`*(presets: openArray[NetworkPreset]): NetworkPreset =
|
||||
presets[0]
|
||||
@ -115,7 +86,7 @@ proc `default`*(presets: openArray[NetworkPreset]): NetworkPreset =
|
||||
# Precomputes those as consts so we can use them in nim-confutils CLI
|
||||
# help strings.
|
||||
const
|
||||
NetworkPresetsDescription* = $NetworkPresets
|
||||
NetworkPresetsDescription* = describePresets(NetworkPresets)
|
||||
DefaultNetworkPreset* = NetworkPresets.default
|
||||
|
||||
proc find*(presets: openArray[NetworkPreset], p: string): Option[NetworkPreset] =
|
||||
|
||||
373
tools/check_spr.nim
Normal file
373
tools/check_spr.nim
Normal file
@ -0,0 +1,373 @@
|
||||
## check_spr - bootstrap-node liveness checker.
|
||||
##
|
||||
## Reads the bootstrap SPRs from the shared config file (network_presets.json by
|
||||
## default) and probes each one. The probe depends on the transport advertised in
|
||||
## the record:
|
||||
## * TCP addresses -> a libp2p connection is attempted.
|
||||
## * UDP addresses -> a discovery v5 (DHT) ping is sent.
|
||||
## It prints a per-node report — a human-readable table by default, or JSON with
|
||||
## `--format json` — and exits non-zero if any node is unreachable. A single
|
||||
## `spr:` URI can also be passed for ad-hoc checks.
|
||||
##
|
||||
## IMPORTANT: run this from a host OUTSIDE the fleet VPCs (e.g. a GitHub-hosted
|
||||
## runner), otherwise nodes advertising private/cloud-internal IPs will appear
|
||||
## reachable and defeat the purpose.
|
||||
##
|
||||
## Usage:
|
||||
## check_spr [--source <file>] [--network <name>] [--timeout <secs>]
|
||||
## [--format text|json] [--out <file>]
|
||||
## check_spr <spr-uri> [--timeout <secs>]
|
||||
## check_spr --help
|
||||
##
|
||||
## Run `check_spr --help` for a full description of every option.
|
||||
|
||||
import std/[json, options, os, sequtils, strutils, typetraits, strformat, terminal]
|
||||
|
||||
import pkg/chronicles
|
||||
import pkg/chronos
|
||||
import pkg/libp2p
|
||||
import pkg/libp2p/crypto/rng
|
||||
import pkg/codexdht/discv5/spr
|
||||
import pkg/codexdht/discv5/node
|
||||
import pkg/codexdht/discv5/protocol as discv5
|
||||
|
||||
import ../storage/presets
|
||||
|
||||
const
|
||||
DefaultTimeoutSecs = 10
|
||||
DefaultSource = "network_presets.json"
|
||||
|
||||
type OutputFormat = enum
|
||||
## String values double as the accepted `--format` argument spellings, so the
|
||||
## parser can compare against them directly.
|
||||
ofText = "text"
|
||||
ofJson = "json"
|
||||
|
||||
type Verdict = object
|
||||
alive: bool
|
||||
reason: string
|
||||
|
||||
type Row = object
|
||||
network: string
|
||||
peerId: string
|
||||
address: string
|
||||
alive: bool
|
||||
reason: string
|
||||
|
||||
proc hasCodec(addrs: seq[MultiAddress], codec: MultiCodec): bool =
  ## Reports whether at least one address in `addrs` contains `codec`.
  ## An address whose codec lookup yields no value counts as "does not
  ## contain" (the lookup falls back to `false`).
  for ma in addrs:
    if ma.contains(codec).get(false):
      return true
  false
|
||||
|
||||
proc setupLogging() =
  ## Installs runtime writers for the chronicles log outputs.
  ##
  ## Per the note carried with this tool, the project's `config.nims` forces
  ## `dynamic` chronicles sinks, whose writers have to be assigned at runtime;
  ## otherwise each message is dropped with a "dynamic log output writer not
  ## configured" warning. The first output is sent to stderr so that stdout
  ## (the ALIVE/DEAD verdict) stays clean and parseable; the other two outputs
  ## are silenced. Nothing is configured unless the default stream has exactly
  ## three outputs.
  when defaultChroniclesStream.outputs.type.arity == 3:
    proc writeToStderr(logLevel: LogLevel, msg: LogOutputStr) =
      # Best effort: a failing stderr must not take the tool down.
      try:
        stderr.write(msg)
        stderr.flushFile()
      except IOError:
        discard

    proc dropMessage(logLevel: LogLevel, msg: LogOutputStr) =
      discard

    defaultChroniclesStream.outputs[0].writer = writeToStderr
    defaultChroniclesStream.outputs[1].writer = dropMessage
    defaultChroniclesStream.outputs[2].writer = dropMessage
|
||||
|
||||
proc buildSwitch(): Switch =
  ## Assembles the libp2p switch used for the TCP dial probe: it listens on
  ## `/ip4/0.0.0.0/tcp/0`, uses the TCP transport with Noise, and offers both
  ## Yamux and Mplex.
  let
    rng = newRng()
    listenAddr = MultiAddress.init("/ip4/0.0.0.0/tcp/0").tryGet()

  let builder = SwitchBuilder
    .new()
    .withRng(rng)
    .withAddress(listenAddr)
    .withTcpTransport()
    .withNoise()
    .withYamux()

  # match storage node switch builder, for TCP
  builder.withMplex().build()
|
||||
|
||||
proc checkLibp2p(
    peerId: PeerId, addresses: seq[MultiAddress], timeout: Duration
): Future[Verdict] {.async.} =
  ## Probes a node by attempting a libp2p connection to `peerId` at
  ## `addresses`, waiting at most `timeout`.
  ##
  ## A fresh switch is started for the attempt and stopped again when the proc
  ## exits. A timeout or any other catchable failure during the connect is
  ## turned into a dead verdict carrying the reason; a failure while starting
  ## the switch is not caught here.
  let dialer = buildSwitch()
  await dialer.start()
  defer:
    await dialer.stop()

  try:
    await dialer.connect(peerId, addresses).wait(timeout)
  except AsyncTimeoutError:
    return Verdict(alive: false, reason: "libp2p connection timed out")
  except CatchableError as err:
    return Verdict(alive: false, reason: "libp2p connection failed: " & err.msg)

  return Verdict(alive: true, reason: "libp2p connection established")
|
||||
|
||||
proc checkDiscv5(
    record: SignedPeerRecord, timeout: Duration
): Future[Verdict] {.async.} =
  ## Probes a node by sending it a discovery v5 ping, waiting at most
  ## `timeout` for the answer.
  ##
  ## A throwaway discv5 protocol instance with a freshly generated key is
  ## opened on port 0 for the probe and closed again when the proc exits.
  ## Failing to build the target node, to open the protocol, or to get a pong
  ## each yield a dead verdict with a matching reason.
  let target = newNode(record)
  if target.isErr:
    return Verdict(alive: false, reason: "cannot build discv5 node: " & $target.error)
  let targetNode = target.get()

  let
    rng = newRng()
    localKey = PrivateKey.random(rng).tryGet()
    proto = discv5.newProtocol(
      localKey, none(IpAddress), none(Port), none(Port), bindPort = Port(0), rng = rng
    )

  try:
    proto.open()
  except CatchableError as err:
    return Verdict(alive: false, reason: "failed to open discv5: " & err.msg)
  # Registered only after a successful open, so a failed open is not closed.
  defer:
    await proto.closeWait()

  try:
    let reply = await discv5.ping(proto, targetNode).wait(timeout)
    if reply.isErr:
      return Verdict(alive: false, reason: "discv5 ping failed: " & $reply.error)
    return Verdict(alive: true, reason: "discv5 pong received")
  except AsyncTimeoutError:
    return Verdict(alive: false, reason: "discv5 ping timed out")
  except CatchableError as err:
    return Verdict(alive: false, reason: "discv5 ping failed: " & err.msg)
|
||||
|
||||
proc probe(record: SignedPeerRecord, timeout: Duration): Future[Verdict] {.async.} =
  ## Chooses and runs the probe that fits the transports advertised in
  ## `record`. TCP is checked first: if any address carries the tcp codec the
  ## libp2p dial is used, otherwise a udp codec selects the discv5 ping. A
  ## record with no addresses, or with neither transport, is reported dead
  ## without touching the network.
  let addrs = record.data.addresses.mapIt(it.address)
  if addrs.len == 0:
    return Verdict(alive: false, reason: "SPR contains no addresses")

  if hasCodec(addrs, multiCodec("tcp")):
    return await checkLibp2p(record.data.peerId, addrs, timeout)

  if hasCodec(addrs, multiCodec("udp")):
    return await checkDiscv5(record, timeout)

  return Verdict(alive: false, reason: "no tcp or udp addresses to probe")
|
||||
|
||||
proc probeRecords(
    source, networkFilter: string, timeout: Duration
): Future[seq[Row]] {.async.} =
  ## Loads the presets from the config file `source` and probes their records
  ## one after another, producing one `Row` per record.
  ##
  ## When `networkFilter` is non-empty, only the preset with that exact name is
  ## probed. A record that fails to parse is not fatal: it becomes a dead row
  ## whose peer id and address are left empty.
  var rows: seq[Row]
  for preset in loadNetworkPresets(source):
    let selected = networkFilter.len == 0 or preset.name == networkFilter
    if not selected:
      continue

    for uri in preset.rawRecords:
      let parsed = SignedPeerRecord.parse(uri)
      if parsed.isOk:
        let
          record = parsed.get
          verdict = await probe(record, timeout)
          shownAddrs = record.data.addresses.mapIt($it.address).deduplicate.join(", ")
        rows.add Row(
          network: preset.name,
          peerId: $record.data.peerId,
          address: shownAddrs,
          alive: verdict.alive,
          reason: verdict.reason,
        )
      else:
        rows.add Row(
          network: preset.name,
          alive: false,
          reason: "SPR parse failed: " & parsed.error,
        )
  return rows
|
||||
|
||||
const
|
||||
TableHeader = "FLEET RESULT ADDRESS REASON"
|
||||
TableRule = '-'.repeat(TableHeader.len)
|
||||
|
||||
proc formatRow(r: Row): string =
  ## Renders one probed node as a table line: network, status, address and
  ## reason, in that order. The network and address cells are left-aligned to
  ## fixed widths and the status cell is always five characters ("DEAD " is
  ## padded to the width of "ALIVE"), so columns line up whatever the verdict.
  ## Colour added later by the styled renderer is zero-width on screen and
  ## therefore does not disturb this alignment.
  let cells = [
    r.network.alignLeft(12),
    (if r.alive: "ALIVE" else: "DEAD "),
    r.address.alignLeft(43),
    r.reason,
  ]
  cells.join(" ")
|
||||
|
||||
proc renderTable(rows: seq[Row]): string =
  ## Produces the plain, uncoloured table: header, rule, then one line per row,
  ## joined by newlines with no trailing newline. This is the form used for
  ## files and non-interactive stdout, where ANSI escapes would be corruption
  ## rather than decoration.
  var lines = @[TableHeader, TableRule]
  for row in rows:
    lines.add(formatRow(row))
  lines.join("\n")
|
||||
|
||||
proc printStyledTable(rows: seq[Row]) =
  ## Prints the table with colour for an interactive terminal: bright header,
  ## dim rule, green for alive rows and red for dead ones. `styledEcho` emits
  ## ANSI escapes unconditionally, so the caller is responsible for only using
  ## this when stdout is a TTY.
  styledEcho(styleBright, TableHeader)
  styledEcho(styleDim, TableRule)
  for row in rows:
    let color = if row.alive: fgGreen else: fgRed
    styledEcho(color, formatRow(row))
|
||||
|
||||
proc rowsToJson(rows: seq[Row]): JsonNode =
  ## Converts the probe results into a JSON array holding one object per row,
  ## with the keys "network", "peerId", "address", "alive" and "reason".
  result = newJArray()
  for row in rows:
    let entry = newJObject()
    entry["network"] = %row.network
    entry["peerId"] = %row.peerId
    entry["address"] = %row.address
    entry["alive"] = %row.alive
    entry["reason"] = %row.reason
    result.add(entry)
|
||||
|
||||
proc parseTimeout(s: string): int =
  ## Parses the `--timeout` argument as a whole number of seconds.
  ##
  ## Terminates the program with a failure status when `s` is not an integer,
  ## or when it is zero or negative: such a timeout would make every probe
  ## expire immediately and report healthy nodes as DEAD.
  result =
    try:
      parseInt(s)
    except ValueError:
      quit("Error: timeout must be an integer number of seconds", QuitFailure)
  if result <= 0:
    quit("Error: timeout must be greater than zero seconds", QuitFailure)
|
||||
|
||||
proc parseFormat(s: string): OutputFormat =
  ## Maps the `--format` argument onto an `OutputFormat`. The accepted
  ## spellings are the enum's own string values, compared exactly. Any other
  ## input terminates the program with a failure status.
  for candidate in OutputFormat:
    if s == $candidate:
      return candidate
  quit("Error: format must be '" & $ofText & "' or '" & $ofJson & "'", QuitFailure)
|
||||
|
||||
proc printHelp() =
|
||||
echo """check_spr - bootstrap-node liveness checker.
|
||||
|
||||
Reads bootstrap SPRs from a config file and probes each one (libp2p connect for
|
||||
TCP addresses, discv5 ping for UDP). Prints a per-node table and exits non-zero
|
||||
if any node is unreachable. A single spr: URI can be passed for an ad-hoc check.
|
||||
|
||||
IMPORTANT: run from a host OUTSIDE the fleet VPCs (e.g. a GitHub-hosted runner),
|
||||
otherwise nodes advertising private/cloud-internal IPs appear reachable and
|
||||
defeat the purpose.
|
||||
|
||||
Usage:
|
||||
check_spr [options]
|
||||
check_spr <spr-uri> [--timeout <secs>]
|
||||
|
||||
Options:
|
||||
--source <file> Config file to read SPRs from (default: """ &
|
||||
DefaultSource & """).
|
||||
--network <name> Only probe the preset with this network name.
|
||||
--timeout <secs> Per-node probe timeout in seconds (default: """ &
|
||||
$DefaultTimeoutSecs & """).
|
||||
--format <fmt> Output format: "text" (default) or "json". "text" is the
|
||||
human-readable table; "json" is a pretty-printed summary.
|
||||
--out <file> Write the output to <file> instead of stdout. The content
|
||||
is whichever --format is selected (text or json).
|
||||
--help, -h Show this help and exit.
|
||||
|
||||
Arguments:
|
||||
<spr-uri> A single "spr:" URI to probe instead of reading the config
|
||||
file. Prints ALIVE/DEAD and exits with the matching status;
|
||||
this mode ignores --format and --out."""
|
||||
|
||||
when isMainModule:
  # Entry point. Parses the command line, then either probes a single `spr:`
  # URI (printing ALIVE/DEAD) or probes every record from the config file and
  # prints or writes the report. The exit status is non-zero when any probed
  # node is unreachable, and also when there was nothing to probe.
  setupLogging()

  proc optionValue(args: seq[string], idx: var int): string =
    ## Returns the value that follows the option at `args[idx]` and advances
    ## `idx` onto it. Exits with a usage error when the option is the last
    ## argument, instead of indexing past the end of `args`.
    if idx + 1 >= args.len:
      quit("Error: missing value for option " & args[idx], QuitFailure)
    inc idx
    args[idx]

  var
    source = DefaultSource
    networkFilter = ""
    timeoutSecs = DefaultTimeoutSecs
    singleSpr = ""
    format = ofText
    outFile = ""

  let params = commandLineParams()
  var i = 0
  while i < params.len:
    case params[i]
    of "--help", "-h":
      printHelp()
      quit(QuitSuccess)
    of "--source":
      source = optionValue(params, i)
    of "--network":
      networkFilter = optionValue(params, i)
    of "--timeout":
      timeoutSecs = parseTimeout(optionValue(params, i))
    of "--format":
      format = parseFormat(optionValue(params, i))
    of "--out":
      outFile = optionValue(params, i)
    else:
      # The only positional argument accepted is a single "spr:" URI.
      if params[i].startsWith("spr:"):
        singleSpr = params[i]
      else:
        quit("Error: unknown argument: " & params[i], QuitFailure)
    inc i

  let timeout = timeoutSecs.seconds

  # Single-URI mode: probe it, print the verdict, and exit with the matching
  # status. --format and --out are ignored here.
  if singleSpr.len > 0:
    let parsed = SignedPeerRecord.parse(singleSpr)
    if parsed.isErr:
      quit("Error: " & parsed.error, QuitFailure)
    let v = waitFor probe(parsed.get, timeout)
    echo (if v.alive: "ALIVE: " else: "DEAD: "), v.reason
    quit(if v.alive: QuitSuccess else: QuitFailure)

  let rows = waitFor probeRecords(source, networkFilter, timeout)

  # An empty result means nothing was checked (for example a mistyped --network
  # name or a config file without records). Reporting that as "all reachable"
  # would make the health check pass vacuously, so treat it as an error.
  if rows.len == 0:
    quit(
      "Error: no bootstrap records to probe (check --source and --network)",
      QuitFailure,
    )

  if outFile.len > 0:
    # Files (and the JSON form) must stay plain — ANSI color codes would corrupt
    # them — so always write the uncolored rendering. writeFile does not append a
    # trailing newline the way echo does, so add one to keep the file POSIX-clean.
    let output =
      case format
      of ofText:
        renderTable(rows)
      of ofJson:
        pretty(rowsToJson(rows))
    writeFile(outFile, output & "\n")
    # The report is in the file, not on stdout, so point the caller at it. Use
    # the absolute path so the line is unambiguous regardless of the cwd.
    #!fmt: off
    styledEcho bgWhite, fgBlack, styleBright,
      "\n\n ",
      styleUnderscore,
      "ℹ️ BOOTSTRAP HEALTH REPORT ℹ️\n\n",
      resetStyle, bgWhite, fgBlack, styleBright,
      """ Bootstrap health report for this run will be available at:""",
      resetStyle, bgWhite, fgBlack,
      &"\n\n {absolutePath(outFile)}\n\n",
      resetStyle, bgWhite, fgBlack, styleBright,
      " NOTE: For CI runs, the report will be displayed in the workflow summary\n"
    #!fmt: on
  else:
    case format
    of ofText:
      # Color only when stdout is an interactive terminal; when piped or
      # redirected, fall back to the plain table so escapes never reach the
      # consumer (std/terminal emits ANSI unconditionally on POSIX).
      if isatty(stdout):
        printStyledTable(rows)
      else:
        echo renderTable(rows)
    of ofJson:
      echo pretty(rowsToJson(rows))

  let dead = rows.filterIt(not it.alive)

  # When the report went to a file, stdout only carries a one-line outcome.
  if outFile.len > 0:
    if dead.len > 0:
      styledEcho styleBright,
        fgRed,
        &"\n\n[❌ ERROR] ",
        resetStyle,
        "One or more bootstrap nodes are unreachable, see report for details.\n\n"
    else:
      styledEcho styleBright,
        fgGreen,
        &"\n\n[✅ SUCCESS] ",
        resetStyle,
        "All bootstrap nodes are reachable, see report for details.\n\n"

  if dead.len > 0:
    quit(QuitFailure)
|
||||
Loading…
x
Reference in New Issue
Block a user