diff --git a/.github/workflows/bootstrap-health-check.yml b/.github/workflows/bootstrap-health-check.yml new file mode 100644 index 00000000..7bdda398 --- /dev/null +++ b/.github/workflows/bootstrap-health-check.yml @@ -0,0 +1,102 @@ +name: Bootstrap nodes health check + +# Scheduled liveness check for the preset bootstrap nodes. Runs on a +# GitHub-hosted runner (public internet) so nodes advertising private/cloud +# internal IPs are correctly seen as unreachable. On any unreachable node it +# fails the job and opens/updates a tracking issue labelled `bootstrap-health`. + +on: + push: + branches: [feat/bootstrap-health-check] # ← temporary for testing; remove before merge + schedule: + - cron: "0 6 * * *" # daily 06:00 UTC + workflow_dispatch: + +env: + nim_version: v2.2.10 + +concurrency: + group: ${{ github.workflow }} + cancel-in-progress: false + +permissions: + contents: read + issues: write + +jobs: + ping: + name: Ping preset bootstrap nodes + runs-on: ubuntu-latest + steps: + - name: Checkout sources + uses: actions/checkout@v6 + with: + submodules: recursive + + - name: Setup Nimbus Build System + uses: ./.github/actions/nimbus-build-system + with: + os: linux + nim_version: ${{ env.nim_version }} + + - name: Ping bootstrap nodes + id: ping + continue-on-error: true + run: make CI=true bootstrapHealthCheck + shell: bash + + - name: Build report + id: report + if: always() + run: | + json=build/bootstrap-health-report.json + if [ ! 
-f "$json" ]; then + echo "no_output=true" >> "$GITHUB_OUTPUT" + echo "::error::check_spr produced no output file" + exit 0 + fi + dead=$(jq '[.[] | select(.alive==false)] | length' "$json") + total=$(jq 'length' "$json") + echo "dead=$dead" >> "$GITHUB_OUTPUT" + { + echo "## Bootstrap node liveness ($((total - dead))/$total reachable)" + echo + echo "| Network | Result | Address | Reason |" + echo "|---|---|---|---|" + jq -r '.[] | "| \(.network) | \(if .alive then "✅ ALIVE" else "❌ DEAD" end) | \(.address) | \(.reason) |"' "$json" + } | tee report.md >> "$GITHUB_STEP_SUMMARY" + shell: bash + + - name: Open or update tracking issue + if: always() && steps.report.outputs.dead != '0' && steps.report.outputs.dead != '' + env: + GH_TOKEN: ${{ github.token }} + DEAD: ${{ steps.report.outputs.dead }} + RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }} + run: | + gh label create bootstrap-health --color B60205 \ + --description "Automated bootstrap-node liveness alerts" 2>/dev/null || true + { + echo "Scheduled bootstrap-node liveness check found **${DEAD}** unreachable node(s)." + echo + echo "Run: ${RUN_URL}" + echo + cat report.md + } > issue-body.md + existing=$(gh issue list --label bootstrap-health --state open --json number --jq '.[0].number') + if [ -n "$existing" ]; then + gh issue comment "$existing" --body-file issue-body.md + else + gh issue create --title "Bootstrap nodes unreachable" \ + --label bootstrap-health --body-file issue-body.md + fi + shell: bash + + - name: Fail if any node is unreachable + if: always() && ((steps.report.outputs.dead != '0' && steps.report.outputs.dead != '') || steps.report.outputs.no_output == 'true') + env: + DEAD: ${{ steps.report.outputs.dead }} + run: | + echo "Bootstrap liveness check failed: ${DEAD} unreachable node(s)." 
task bootstrapHealthCheck, "ping preset bootstrap nodes; non-zero exit if any are unreachable":
  checkSprTask()

  # `make CI=<value> bootstrapHealthCheck` forwards the flag to this script as
  # `-d:ci=<value>`. Match that define by prefix: a plain substring test for
  # "ci" also hits unrelated compiler arguments (for example a path that
  # happens to contain "ci"), and indexing the result of split('=') raises
  # when such an argument has no '='.
  var args = ""
  for i in 2 ..< paramCount():
    let param = paramStr(i)
    for prefix in ["-d:ci=", "--define:ci="]:
      if param.startsWith(prefix) and truthy param[prefix.len .. ^1]:
        # In CI, have check_spr write the JSON summary to a file so the
        # scheduled workflow can read it after the tool has exited.
        args = "--format json --out build/bootstrap-health-report.json"
    if args.len > 0:
      break

  # check_spr exits non-zero when a node is unreachable, failing the workflow
  # run.
  exec "build/check_spr " & args
"spr:CiUIAhIhAvSGKPkE3mD7MP-ZCWS5AEvzcDNVsM6XFYeCBXNja7h2EgIDARpJCicAJQgCEiEC9IYo-QTeYPsw_5kJZLkAS_NwM1WwzpcVh4IFc2NruHYQ_Yi70AYaCwoJBKwf79ORAiOCGgsKCQSsH-_TkQIjgipHMEUCIQDJuV1B1sDyyxkNs8g3ahZ13GN9r7PEBP7xY4xGlm5n0AIgdyY2JEOyZ1FMdOzN6aZbAWo83AyjCrR-n0sietE1624" + ] + }, + { + "name": "logos.dev", + "description": "Logos devnet", + "records": [ + "spr:CiUIAhIhAwfZDeTtWNlSgRbZlZfvxLI5Bpy0lFEYN7gImS3oHNaSEgIDARpJCicAJQgCEiEDB9kN5O1Y2VKBFtmVl-_EsjkGnLSUURg3uAiZLegc1pIQ__O20AYaCwoJBBiQTsiRAiOCGgsKCQQYkE7IkQIjgipHMEUCIQCIZx-HlVsLXJLhD6SEVx6Zt_1aG9IqMq-Luvz8No_J0wIgc8I9PRtheG4s5tzHjkEJMLcq3Jf09IT_FGkzPcJm8h4", + "spr:CiUIAhIhA8d4LjRirtXO1M-JEmbhVA0CQeA7hHNR9BA7DvFsPKTEEgIDARpJCicAJQgCEiEDx3guNGKu1c7Uz4kSZuFUDQJB4DuEc1H0EDsO8Ww8pMQQhPW20AYaCwoJBCIq5juRAiOCGgsKCQQiKuY7kQIjgipGMEQCIHV_8nJ0iedWjlAxUhBmdAbDPLu5g2RmcnmJBD8cbD98AiAp1w9nAJgLlPIr41aMcdkds_eSoh8ImOVKvq6Idx-Ugg", + "spr:CiUIAhIhA_MocWwn1_t__FEONMqYluUjc9ZVkcvYRLo6C0GzTkbfEgIDARpJCicAJQgCEiED8yhxbCfX-3_8UQ40ypiW5SNz1lWRy9hEujoLQbNORt8QlfO20AYaCwoJBC_u5W-RAiOCGgsKCQQv7uVvkQIjgipGMEQCIHMpQO31gg4FoKYtDyTTQS8xFz1KEmfqH385EeMUNbhPAiBblCkmOfQBmXj6eryaSiXWsftgohE-SPbKwsASZ1Zs3Q" + ] + }, + { + "name": "codex.dev", + "description": "Codex legacy devnet (deprecated)", + "records": [ + "spr:CiUIAhIhA-VlcoiRm02KyIzrcTP-ljFpzTljfBRRKTIvhMIwqBqWEgIDARpJCicAJQgCEiED5WVyiJGbTYrIjOtxM_6WMWnNOWN8FFEpMi-EwjCoGpYQs8n8wQYaCwoJBHTKubmRAnU6GgsKCQR0yrm5kQJ1OipHMEUCIQDwUNsfReB4ty7JFS5WVQ6n1fcko89qVAOfQEHixa03rgIgan2-uFNDT-r4s9TOkLe9YBkCbsRWYCHGGVJ25rLj0QE", + "spr:CiUIAhIhApIj9p6zJDRbw2NoCo-tj98Y760YbppRiEpGIE1yGaMzEgIDARpJCicAJQgCEiECkiP2nrMkNFvDY2gKj62P3xjvrRhumlGISkYgTXIZozMQvcz8wQYaCwoJBAWhF3WRAnVEGgsKCQQFoRd1kQJ1RCpGMEQCIFZB84O_nzPNuViqEGRL1vJTjHBJ-i5ZDgFL5XZxm4HAAiB8rbLHkUdFfWdiOmlencYVn0noSMRHzn4lJYoShuVzlw", + 
"spr:CiUIAhIhApqRgeWRPSXocTS9RFkQmwTZRG-Cdt7UR2N7POoz606ZEgIDARpJCicAJQgCEiECmpGB5ZE9JehxNL1EWRCbBNlEb4J23tRHY3s86jPrTpkQj8_8wQYaCwoJBAXfEfiRAnVOGgsKCQQF3xH4kQJ1TipGMEQCIGWJMsF57N1iIEQgTH7IrVOgEgv0J2P2v3jvQr5Cjy-RAiAy4aiZ8QtyDvCfl_K_w6SyZ9csFGkRNTpirq_M_QNgKw" + ] + } + ] +} diff --git a/storage/presets.nim b/storage/presets.nim index 81d441a6..983a5372 100644 --- a/storage/presets.nim +++ b/storage/presets.nim @@ -4,6 +4,7 @@ # devnet and latest testnet, respectively. import std/options import std/strutils +import std/json import pkg/chronicles import pkg/codexdht/discv5/protocol @@ -31,7 +32,7 @@ proc init*( func `$`*(preset: NetworkPreset): string = "[" & preset.name & "]: " & preset.description -func `$`*[N](presets: array[N, NetworkPreset]): string = +func describePresets(presets: openArray[NetworkPreset]): string = result = "" for preset in presets: result &= $preset & "; " @@ -52,62 +53,32 @@ proc `bootstrapNodes`*(self: NetworkPreset): seq[SignedPeerRecord] = # it should crash the node. 
result.add(parse(SignedPeerRecord, record).tryGet()) -const NetworkPresets* = [ - NetworkPreset.init( - "logos.test", - "Logos testnet", - @[ - "spr:CiUIAhIhA6rD-Sa1mJqHOoYMk8yad7B4BYDEI_toNwb1z0cYIRu6EgIDARpJCicAJQgCEiEDqsP5JrWYmoc6hgyT" & - "zJp3sHgFgMQj-2g3BvXPRxghG7oQwoe70AYaCwoJBEDhQ42RAiOCGgsKCQRA4UONkQIjgipHMEUCIQCulmrBDKTTxL" & - "8uBQYtEfp3_n3qDZFbO8lZ8mfIWHrRBAIgNXpVlWD1VlXzbGuJ4t7u8b7ymm3AYwm6-KjUvH6NfKU", - "spr:CiUIAhIhAsFwlXD-3VpX-Pa3taM15wdL3DS75l_dpVCIFhdaIKYREgIDARpJCicAJQgCEiECwXCVcP7dWlf49re1" & - "ozXnB0vcNLvmX92lUIgWF1ogphEQ4Ym70AYaCwoJBI5d6vqRAiOCGgsKCQSOXer6kQIjgipGMEQCIGaofSX23DDUcW" & - "EMElHtlaFbLAsM0YgrMB4UwOIqPMb8AiBFRodJ_5-bkvoLuPo3K2nMGzKXZqXnII4poJhhopSo8A", - "spr:CiUIAhIhAwSYqf83tfZom9eGFFdXXea-dblO-I7-I8B1kjhfJeEAEgIDARpJCicAJQgCEiEDBJip_ze19mib14YU" & - "V1dd5r51uU74jv4jwHWSOF8l4QAQjoi70AYaCwoJBAqAABKRAiOCGgsKCQQKgAASkQIjgipHMEUCIQDEtfOFABgYo" & - "sMflQ-d_v-qkc5FhwSwd_PTcA414MBYAAIgGbhCTAwbSJr5boiARoVFZ-XrBhfFBc_J5Kk5drdQoTE", - "spr:CiUIAhIhA4T8XrxB6PKor8f7j7eqKxgIXH6mMST0_Uel5hZjSDp2EgIDARpJCicAJQgCEiEDhPxevEHo8qivx_uP" & - "t6orGAhcfqYxJPT9R6XmFmNIOnYQrYq70AYaCwoJBAqAAFuRAiOCGgsKCQQKgABbkQIjgipGMEQCIHA1l1NTOh06ca9s" & - "eLlmAtPsTiNJo9Re0s51WakQTTf1AiAFJkhsi2Qv0fq8hY3AWlibqhhh_WiI3q6QabPGVXzuAA", - "spr:CiUIAhIhAqk6NgpRxbKvI02Up24XP3U-dD3TdKRurXpW-ak3Zvh-EgIDARpJCicAJQgCEiECqTo2ClHFsq8jTZSn" & - "bhc_dT50PdN0pG6telb5qTdm-H4Q3oa70AYaCwoJBKwf79KRAiOCGgsKCQSsH-_SkQIjgipHMEUCIQCRubKOjNcLZEJu" & - "t0Ts6wy_BEij4z-1WO6WiOVzT0svfQIgWKOBWVoopNC7zk1byUJMpNMOi05cKVsLoCBkW3RC9-Q", - "spr:CiUIAhIhAvSGKPkE3mD7MP-ZCWS5AEvzcDNVsM6XFYeCBXNja7h2EgIDARpJCicAJQgCEiEC9IYo-QTeYPsw_5kJ" & - "ZLkAS_NwM1WwzpcVh4IFc2NruHYQ_Yi70AYaCwoJBKwf79ORAiOCGgsKCQSsH-_TkQIjgipHMEUCIQDJuV1B1sDyyx" & - "kNs8g3ahZ13GN9r7PEBP7xY4xGlm5n0AIgdyY2JEOyZ1FMdOzN6aZbAWo83AyjCrR-n0sietE1624", - ], - ), - NetworkPreset.init( - "logos.dev", - "Logos devnet", - @[ - "spr:CiUIAhIhAwfZDeTtWNlSgRbZlZfvxLI5Bpy0lFEYN7gImS3oHNaSEgIDARpJCicAJQgCEiEDB9kN5O1Y2VKBFtmVl-" & - 
"_EsjkGnLSUURg3uAiZLegc1pIQ__O20AYaCwoJBBiQTsiRAiOCGgsKCQQYkE7IkQIjgipHMEUCIQCIZx-HlVsLXJLhD6SEV" & - "x6Zt_1aG9IqMq-Luvz8No_J0wIgc8I9PRtheG4s5tzHjkEJMLcq3Jf09IT_FGkzPcJm8h4", - "spr:CiUIAhIhA8d4LjRirtXO1M-JEmbhVA0CQeA7hHNR9BA7DvFsPKTEEgIDARpJCicAJQgCEiEDx3guNGKu1c7Uz4kSZu" & - "FUDQJB4DuEc1H0EDsO8Ww8pMQQhPW20AYaCwoJBCIq5juRAiOCGgsKCQQiKuY7kQIjgipGMEQCIHV_8nJ0iedWjlAxUhBm" & - "dAbDPLu5g2RmcnmJBD8cbD98AiAp1w9nAJgLlPIr41aMcdkds_eSoh8ImOVKvq6Idx-Ugg", - "spr:CiUIAhIhA_MocWwn1_t__FEONMqYluUjc9ZVkcvYRLo6C0GzTkbfEgIDARpJCicAJQgCEiED8yhxbCfX-3_8UQ40yp" & - "iW5SNz1lWRy9hEujoLQbNORt8QlfO20AYaCwoJBC_u5W-RAiOCGgsKCQQv7uVvkQIjgipGMEQCIHMpQO31gg4FoKYtDyTT" & - "QS8xFz1KEmfqH385EeMUNbhPAiBblCkmOfQBmXj6eryaSiXWsftgohE-SPbKwsASZ1Zs3Q", - ], - ), - NetworkPreset.init( - "codex.dev", - "Codex legacy devnet (deprecated)", - @[ - "spr:CiUIAhIhA-VlcoiRm02KyIzrcTP-ljFpzTljfBRRKTIvhMIwqBqWEgIDARpJCicAJQgCEiED5WVyiJGbTYrIjOtxM_6" & - "WMWnNOWN8FFEpMi-EwjCoGpYQs8n8wQYaCwoJBHTKubmRAnU6GgsKCQR0yrm5kQJ1OipHMEUCIQDwUNsfReB4ty7JFS" & - "5WVQ6n1fcko89qVAOfQEHixa03rgIgan2-uFNDT-r4s9TOkLe9YBkCbsRWYCHGGVJ25rLj0QE", - "spr:CiUIAhIhApIj9p6zJDRbw2NoCo-tj98Y760YbppRiEpGIE1yGaMzEgIDARpJCicAJQgCEiECkiP2nrMkNFvDY2gKj62P" & - "3xjvrRhumlGISkYgTXIZozMQvcz8wQYaCwoJBAWhF3WRAnVEGgsKCQQFoRd1kQJ1RCpGMEQCIFZB84O_nzPNuViqEGRL" & - "1vJTjHBJ-i5ZDgFL5XZxm4HAAiB8rbLHkUdFfWdiOmlencYVn0noSMRHzn4lJYoShuVzlw", - "spr:CiUIAhIhApqRgeWRPSXocTS9RFkQmwTZRG-Cdt7UR2N7POoz606ZEgIDARpJCicAJQgCEiECmpGB5ZE9JehxNL1EWRCb" & - "BNlEb4J23tRHY3s86jPrTpkQj8_8wQYaCwoJBAXfEfiRAnVOGgsKCQQF3xH4kQJ1TipGMEQCIGWJMsF57N1iIEQgTH7I" & - "rVOgEgv0J2P2v3jvQr5Cjy-RAiAy4aiZ8QtyDvCfl_K_w6SyZ9csFGkRNTpirq_M_QNgKw", - ], - ), -] +# Bootstrap node SPRs live in a single source-of-truth config file at the repo +# root. staticRead embeds it into the binary at build time, so the node remains +# self-contained (the file is not needed alongside the binary at runtime). The +# same file is read at runtime by the bootstrap liveness checker (tools/check_spr). 
proc parsePresetsJson(jsonStr: string): seq[NetworkPreset] =
  ## Builds one `NetworkPreset` per entry of the top-level "presets" array in
  ## `jsonStr`, using each entry's "name", "description" and "records" fields.
  ## The record strings are collected as plain strings; they are not parsed
  ## into peer records here.
  let presetsNode = parseJson(jsonStr)["presets"]
  for entry in presetsNode.items:
    var sprs: seq[string] = @[]
    for item in entry["records"].items:
      sprs.add(item.getStr)
    let preset =
      NetworkPreset.init(entry["name"].getStr, entry["description"].getStr, sprs)
    result.add(preset)
proc hasCodec(addrs: seq[MultiAddress], codec: MultiCodec): bool =
  ## True when at least one address in `addrs` reports that it contains
  ## `codec`. An address whose `contains` check yields no value is treated as
  ## not containing it.
  for ma in addrs:
    if ma.contains(codec).get(false):
      return true
  false
proc checkLibp2p(
    peerId: PeerId, addresses: seq[MultiAddress], timeout: Duration
): Future[Verdict] {.async.} =
  ## Probes a node by dialing it from a throwaway libp2p switch.
  ##
  ## Returns an alive `Verdict` when `switch.connect` completes within
  ## `timeout`; otherwise a dead `Verdict` whose reason says whether the dial
  ## timed out or failed. A failure to start the local switch is likewise
  ## reported as a dead `Verdict` instead of an exception, matching how
  ## `checkDiscv5` reports a failed `proto.open()`, so that one probe cannot
  ## abort the whole run.
  let switch = buildSwitch()

  try:
    await switch.start()
  except CatchableError as exc:
    return Verdict(alive: false, reason: "failed to start libp2p switch: " & exc.msg)

  try:
    await switch.connect(peerId, addresses).wait(timeout)
    return Verdict(alive: true, reason: "libp2p connection established")
  except AsyncTimeoutError:
    return Verdict(alive: false, reason: "libp2p connection timed out")
  except CatchableError as exc:
    return Verdict(alive: false, reason: "libp2p connection failed: " & exc.msg)
  finally:
    # Runs on every exit path above once the switch has been started.
    await switch.stop()
proc probeRecords(
    source, networkFilter: string, timeout: Duration
): Future[seq[Row]] {.async.} =
  ## Loads the presets from `source` and probes their records one after
  ## another, returning one `Row` per record. When `networkFilter` is
  ## non-empty, only presets whose name equals it are probed. A record that
  ## fails to parse produces a dead row carrying the parse error, with no
  ## peer id or address.
  var rows: seq[Row]
  for preset in loadNetworkPresets(source):
    if networkFilter.len == 0 or preset.name == networkFilter:
      for rawSpr in preset.rawRecords:
        let parseRes = SignedPeerRecord.parse(rawSpr)
        if parseRes.isOk:
          let
            spr = parseRes.get
            verdict = await probe(spr, timeout)
            shownAddrs = spr.data.addresses.mapIt($it.address).deduplicate.join(", ")
          rows.add Row(
            network: preset.name,
            peerId: $spr.data.peerId,
            address: shownAddrs,
            alive: verdict.alive,
            reason: verdict.reason,
          )
        else:
          rows.add Row(
            network: preset.name,
            alive: false,
            reason: "SPR parse failed: " & parseRes.error,
          )
  return rows
proc parseTimeout(s: string): int =
  ## Parses the `--timeout` argument as a whole number of seconds.
  ##
  ## Exits the process with `QuitFailure` and an error message when `s` is not
  ## an integer, or when it is zero or negative: such a value is not a usable
  ## per-node probe budget and would only yield misleading DEAD verdicts.
  let secs =
    try:
      parseInt(s)
    except ValueError:
      quit("Error: timeout must be an integer number of seconds", QuitFailure)
  if secs <= 0:
    quit("Error: timeout must be a positive number of seconds", QuitFailure)
  secs
when isMainModule:
  # Entry point. Parses the command line, then either probes a single `spr:`
  # URI or every record from the presets file, prints or writes the report,
  # and exits non-zero when a node is unreachable or nothing could be probed.
  setupLogging()

  var
    source = DefaultSource
    networkFilter = ""
    timeoutSecs = DefaultTimeoutSecs
    singleSpr = ""
    format = ofText
    outFile = ""

  let params = commandLineParams()
  var i = 0

  proc optionValue(): string =
    ## Consumes and returns the value that follows the option at `params[i]`.
    ## Exits with an error when the option is the last argument, instead of
    ## indexing past the end of `params`.
    let flag = params[i]
    inc i
    if i >= params.len:
      quit("Error: missing value for " & flag, QuitFailure)
    params[i]

  while i < params.len:
    case params[i]
    of "--help", "-h":
      printHelp()
      quit(QuitSuccess)
    of "--source":
      source = optionValue()
    of "--network":
      networkFilter = optionValue()
    of "--timeout":
      timeoutSecs = parseTimeout(optionValue())
    of "--format":
      format = parseFormat(optionValue())
    of "--out":
      outFile = optionValue()
    else:
      if params[i].startsWith("spr:"):
        singleSpr = params[i]
      else:
        quit("Error: unknown argument: " & params[i], QuitFailure)
    inc i

  let timeout = timeoutSecs.seconds

  # Ad-hoc mode: probe one SPR, print the verdict and exit with its status.
  if singleSpr.len > 0:
    let parsed = SignedPeerRecord.parse(singleSpr)
    if parsed.isErr:
      quit("Error: " & parsed.error, QuitFailure)
    let v = waitFor probe(parsed.get, timeout)
    echo (if v.alive: "ALIVE: " else: "DEAD: "), v.reason
    quit(if v.alive: QuitSuccess else: QuitFailure)

  # A missing or malformed source file is a usage error, not a crash: report
  # it in one line rather than with an unhandled-exception trace.
  let rows =
    try:
      waitFor probeRecords(source, networkFilter, timeout)
    except CatchableError as exc:
      quit(
        "Error: cannot probe bootstrap records from '" & source & "': " & exc.msg,
        QuitFailure,
      )

  # Nothing probed (mistyped --network, empty presets) must not be reported as
  # "all nodes reachable". Fail before any report is written so callers that
  # look for the output file also see the failure.
  if rows.len == 0:
    quit(
      "Error: no bootstrap records to probe in '" & source & "'" &
        (if networkFilter.len > 0: " for network '" & networkFilter & "'" else: ""),
      QuitFailure,
    )

  if outFile.len > 0:
    # Files (and the JSON form) must stay plain — ANSI color codes would corrupt
    # them — so always write the uncolored rendering. writeFile does not append a
    # trailing newline the way echo does, so add one to keep the file POSIX-clean.
    let output =
      case format
      of ofText:
        renderTable(rows)
      of ofJson:
        pretty(rowsToJson(rows))
    writeFile(outFile, output & "\n")
    # The report is in the file, not on stdout, so point the caller at it. Use
    # the absolute path so the line is unambiguous regardless of the cwd.
    #!fmt: off
    styledEcho bgWhite, fgBlack, styleBright,
      "\n\n ",
      styleUnderscore,
      "ℹ️ BOOTSTRAP HEALTH REPORT ℹ️\n\n",
      resetStyle, bgWhite, fgBlack, styleBright,
      """ Bootstrap health report for this run will be available at:""",
      resetStyle, bgWhite, fgBlack,
      &"\n\n {absolutePath(outFile)}\n\n",
      resetStyle, bgWhite, fgBlack, styleBright,
      " NOTE: For CI runs, the report will be displayed in the workflow summary\n"
    #!fmt: on
  else:
    case format
    of ofText:
      # Color only when stdout is an interactive terminal; when piped or
      # redirected, fall back to the plain table so escapes never reach the
      # consumer (std/terminal emits ANSI unconditionally on POSIX).
      if isatty(stdout):
        printStyledTable(rows)
      else:
        echo renderTable(rows)
    of ofJson:
      echo pretty(rowsToJson(rows))

  let dead = rows.filterIt(not it.alive)

  # When the report went to a file, stdout still gets a one-line outcome.
  if outFile.len > 0:
    if dead.len > 0:
      styledEcho styleBright,
        fgRed,
        &"\n\n[❌ ERROR] ",
        resetStyle,
        "One or more bootstrap nodes are unreachable, see report for details.\n\n"
    else:
      styledEcho styleBright,
        fgGreen,
        &"\n\n[✅ SUCCESS] ",
        resetStyle,
        "All bootstrap nodes are reachable, see report for details.\n\n"

  if dead.len > 0:
    quit(QuitFailure)