## check_spr - bootstrap-node liveness checker.
##
## Reads the bootstrap SPRs from the shared config file (network_presets.json by
## default) and probes each one. The probe depends on the transport advertised in
## the record:
##   * TCP addresses -> a libp2p connection is attempted.
##   * UDP addresses -> a discovery v5 (DHT) ping is sent.
## It prints a per-node report — a human-readable table (alive rows green, dead
## red on a terminal) by default, or JSON with `--format json` — and exits
## non-zero if any node is unreachable. The `--network` filter may be repeated to
## probe several presets. A single `spr:` URI can also be passed for ad-hoc
## checks.
##
## IMPORTANT: run this from a host OUTSIDE the fleet VPCs (e.g. a GitHub-hosted
## runner), otherwise nodes advertising private/cloud-internal IPs will appear
## reachable and defeat the purpose.
##
## Usage:
##   check_spr [--source <file>] [--network <name>]... [--timeout <secs>]
##             [--format table|json] [--out <file>]
##   check_spr <spr-uri> [--timeout <secs>]
##   check_spr --help
##
## Run `check_spr --help` for a full description of every option.

import
  std/[json, net, options, os, sequtils, strutils, typetraits, strformat, terminal]
import pkg/chronicles
import pkg/chronos
import pkg/libp2p
import pkg/libp2p/crypto/rng
import pkg/codexdht/discv5/spr
import pkg/codexdht/discv5/node
import pkg/codexdht/discv5/protocol as discv5

import ../storage/presets

const
  DefaultTimeoutSecs = 10
  DefaultSource = "network_presets.json"

type OutputFormat = enum
  ## String values double as the accepted `--format` argument spellings, so the
  ## parser can compare against them directly.
  ofTable = "table"
  ofJson = "json"

type Verdict = object
  ## Outcome of probing one record: reachable or not, plus a human-readable why.
  alive: bool
  reason: string

type Row = object
  ## One line of the report. `peerId` and `address` stay empty when the SPR
  ## could not even be parsed.
  network: string
  peerId: string
  address: string
  alive: bool
  reason: string

proc hasCodec(addrs: seq[MultiAddress], codec: MultiCodec): bool =
  ## True when at least one address contains `codec`. An address whose
  ## `contains` check itself fails is counted as "does not contain".
  addrs.anyIt(it.contains(codec).get(false))

proc setupLogging() =
  ## The project's `config.nims` forces `dynamic` chronicles sinks, whose output
  ## writers must be configured at runtime. Without this, every log message is
  ## dropped with a noisy "dynamic log output writer not configured" warning.
  ## Route logs to stderr so the tool's stdout (the report — a table or JSON, or
  ## the single-SPR ALIVE/DEAD verdict) stays clean and parseable.
  when defaultChroniclesStream.outputs.type.arity == 3:
    proc noOutput(logLevel: LogLevel, msg: LogOutputStr) =
      discard

    proc stderrFlush(logLevel: LogLevel, msg: LogOutputStr) =
      try:
        stderr.write(msg)
        stderr.flushFile()
      except IOError:
        # Logging is best-effort; a broken stderr must not abort the probe run.
        discard

    # Only the first sink is wired to stderr; the other two are silenced.
    defaultChroniclesStream.outputs[0].writer = stderrFlush
    defaultChroniclesStream.outputs[1].writer = noOutput
    defaultChroniclesStream.outputs[2].writer = noOutput

proc buildSwitch(): Switch =
  ## Build a throwaway TCP libp2p switch listening on an OS-assigned port.
  SwitchBuilder
    .new()
    .withRng(newRng())
    .withAddress(MultiAddress.init("/ip4/0.0.0.0/tcp/0").tryGet())
    .withTcpTransport()
    .withNoise()
    .withYamux() # match storage node switch builder, for TCP
    .withMplex()
    .build()

proc checkLibp2p(
    peerId: PeerId, addresses: seq[MultiAddress], timeout: Duration
): Future[Verdict] {.async.} =
  ## Try to open a libp2p connection to `peerId` at `addresses`, giving up after
  ## `timeout`. Connection failures are reported through the returned Verdict;
  ## the temporary switch is always stopped on the way out.
  let switch = buildSwitch()
  await switch.start()
  defer:
    await switch.stop()

  try:
    await switch.connect(peerId, addresses).wait(timeout)
    return Verdict(alive: true, reason: "libp2p connection established")
  except AsyncTimeoutError:
    return Verdict(alive: false, reason: "libp2p connection timed out")
  except CatchableError as exc:
    return Verdict(alive: false, reason: "libp2p connection failed: " & exc.msg)

proc checkDiscv5(
    record: SignedPeerRecord, timeout: Duration
): Future[Verdict] {.async.} =
  ## Send a discovery v5 ping to the node described by `record`, giving up after
  ## `timeout`. Every failure mode (bad record, socket open error, ping error,
  ## timeout) is reported through the returned Verdict.
  let nodeRes = newNode(record)
  if nodeRes.isErr:
    return
      Verdict(alive: false, reason: "cannot build discv5 node: " & $nodeRes.error)
  let targetNode = nodeRes.get()

  let rng = newRng()
  let privKey = PrivateKey.random(rng).tryGet()
  # Use IPv4_any address as the enrIp param in newProtocol to avoid the
  # warnings. It changes the SPR of the discv5 protocol ping tool (this), but
  # does not affect the SPRs of the target.
  let proto = discv5.newProtocol(
    privKey, IPv4_any().some, none(Port), none(Port), bindPort = Port(0), rng = rng
  )

  try:
    proto.open()
  except CatchableError as exc:
    return Verdict(alive: false, reason: "failed to open discv5: " & exc.msg)
  defer:
    await proto.closeWait()

  try:
    let pong = await discv5.ping(proto, targetNode).wait(timeout)
    if pong.isOk:
      return Verdict(alive: true, reason: "discv5 pong received")
    else:
      return Verdict(alive: false, reason: "discv5 ping failed: " & $pong.error)
  except AsyncTimeoutError:
    return Verdict(alive: false, reason: "discv5 ping timed out")
  except CatchableError as exc:
    return Verdict(alive: false, reason: "discv5 ping failed: " & exc.msg)

proc probe(record: SignedPeerRecord, timeout: Duration): Future[Verdict] {.async.} =
  ## Pick the probe that matches the record's advertised transport: libp2p
  ## connect when any address carries TCP, otherwise a discv5 ping when any
  ## carries UDP. A record with neither (or with no addresses) is reported dead.
  let addresses = record.data.addresses.mapIt(it.address)
  if addresses.len == 0:
    return Verdict(alive: false, reason: "SPR contains no addresses")

  if hasCodec(addresses, multiCodec("tcp")):
    return await checkLibp2p(record.data.peerId, addresses, timeout)
  elif hasCodec(addresses, multiCodec("udp")):
    return await checkDiscv5(record, timeout)
  else:
    return Verdict(alive: false, reason: "no tcp or udp addresses to probe")

proc probeRecords(
    source: string, networkFilters: seq[string], timeout: Duration
): Future[seq[Row]] {.async.} =
  ## Load the presets from `source` and probe every raw record of each selected
  ## preset, one after another. Returns one Row per record; a record that fails
  ## to parse still yields a (dead) Row so it shows up in the report.
  let presets = loadNetworkPresets(source)
  for preset in presets:
    # An empty filter list means "probe everything"; otherwise keep only presets
    # whose name was named on the command line (--network may be repeated).
    if networkFilters.len > 0 and preset.name notin networkFilters:
      continue

    for rec in preset.rawRecords:
      let parsed = SignedPeerRecord.parse(rec)
      if parsed.isErr:
        result.add Row(
          network: preset.name,
          alive: false,
          reason: "SPR parse failed: " & parsed.error,
        )
        continue

      let record = parsed.get
      let v = await probe(record, timeout)
      result.add Row(
        network: preset.name,
        peerId: $record.data.peerId,
        address: record.data.addresses.mapIt($it.address).deduplicate.join(", "),
        alive: v.alive,
        reason: v.reason,
      )

const
  # Column geometry shared by the header and every data row, so the two cannot
  # drift apart. The status column is as wide as its "RESULT" heading.
  ColumnGap = " "
  NetworkWidth = 12
  StatusWidth = 6
  AddressWidth = 43

func tableLine(network, status, address, reason: string): string =
  ## Lay out one table line. Every cell but the last is left-aligned and padded
  ## to its column width; over-long cells are kept whole rather than truncated.
  network.alignLeft(NetworkWidth) & ColumnGap & status.alignLeft(StatusWidth) &
    ColumnGap & address.alignLeft(AddressWidth) & ColumnGap & reason

const
  TableHeader = tableLine("FLEET", "RESULT", "ADDRESS", "REASON")
  TableRule = '-'.repeat(TableHeader.len)

proc formatRow(r: Row): string =
  ## One table line for a probed node. The status cell is padded to a fixed
  ## width so columns line up regardless of the verdict. ANSI color, when added
  ## by the styled renderer, is zero-width on screen, so the plain-text
  ## alignment computed here stays visually correct.
  let status = if r.alive: "ALIVE" else: "DEAD"
  tableLine(r.network, status, r.address, r.reason)

proc renderTable(rows: seq[Row]): string =
  ## Plain (uncolored) table — used for files and non-interactive stdout, where
  ## ANSI escapes would be corruption rather than decoration. No trailing
  ## newline is appended.
  result = TableHeader & "\n" & TableRule
  for r in rows:
    result.add("\n" & formatRow(r))

proc printStyledTable(rows: seq[Row]) =
  ## Colored table for an interactive terminal: alive rows green, dead rows red.
  ## styledEcho writes ANSI escapes unconditionally, so callers must restrict
  ## this to a TTY (see isatty guard at the call site).
  styledEcho(styleBright, TableHeader)
  styledEcho(styleDim, TableRule)
  for r in rows:
    styledEcho(if r.alive: fgGreen else: fgRed, formatRow(r))

proc rowsToJson(rows: seq[Row]): JsonNode =
  ## JSON array with one object per row, carrying every Row field.
  result = newJArray()
  for r in rows:
    result.add %*{
      "network": r.network,
      "peerId": r.peerId,
      "address": r.address,
      "alive": r.alive,
      "reason": r.reason,
    }

proc parseTimeout(s: string): int =
  ## Parse the `--timeout` value as a whole, strictly positive number of
  ## seconds. Exits with a usage error on anything else: a zero (or negative)
  ## timeout would make every probe expire immediately and report every node as
  ## dead, which is never what the caller wants.
  let secs =
    try:
      parseInt(s)
    except ValueError:
      quit("Error: timeout must be an integer number of seconds", QuitFailure)
  if secs <= 0:
    quit("Error: timeout must be a positive number of seconds", QuitFailure)
  secs

proc nextValue(params: seq[string], i: var int, flag: string): string =
  ## Consume the value that follows a value-expecting flag. The flag sits at
  ## params[i]; its value is the next token, so advance and return it. It is a
  ## usage error (not an IndexDefect crash) if there is no next token — a
  ## trailing `--network` — or if the next token is itself a flag, as in
  ## `--network --timeout`, where `--timeout` would otherwise be swallowed as the
  ## value. No value this tool accepts begins with "-", so that is a reliable
  ## flag marker.
  inc i
  if i >= params.len or params[i].startsWith("-"):
    quit("Error: " & flag & " requires a value", QuitFailure)
  params[i]

proc parseFormat(s: string): OutputFormat =
  ## Map a `--format` argument onto OutputFormat; exits with a usage error on
  ## any spelling other than the enum's string values.
  case s
  of $ofTable:
    ofTable
  of $ofJson:
    ofJson
  else:
    quit(
      "Error: format must be '" & $ofTable & "' or '" & $ofJson & "'", QuitFailure
    )

proc printHelp() =
  ## Print the full usage text to stdout.
  echo """check_spr - bootstrap-node liveness checker.

Reads bootstrap SPRs from a config file and probes each one (libp2p connect for
TCP addresses, discv5 ping for UDP). Prints a per-node report (a table by
default, JSON with --format json) and exits non-zero if any node is unreachable.
A single spr: URI can be passed for an ad-hoc check.

IMPORTANT: run from a host OUTSIDE the fleet VPCs (e.g. a GitHub-hosted runner),
otherwise nodes advertising private/cloud-internal IPs appear reachable and
defeat the purpose.

Usage:
  check_spr [options]
  check_spr <spr-uri> [--timeout <secs>]

Options:
  --source <file>    Config file to read SPRs from (default: """ & DefaultSource &
    """).
  --network <name>   Restrict probing to the named preset. Repeat the flag to
                     probe several, e.g.
                       --network logos.test --network logos.dev
                     Omit entirely to probe every preset in the config.
  --timeout <secs>   Per-node probe timeout in seconds (default: """ &
    $DefaultTimeoutSecs &
    """).
  --format <fmt>     Output format: "table" (default) or "json". "table" is the
                     human-readable table; "json" is a pretty-printed summary.
  --out <file>       Write the output to <file> instead of stdout. The content
                     is whichever --format is selected (table or json).
  --help, -h         Show this help and exit.

Arguments:
  <spr-uri>          A single "spr:" URI to probe instead of reading the config
                     file. Prints ALIVE/DEAD and exits with the matching status;
                     this mode ignores --format and --out."""

when isMainModule:
  setupLogging()

  var
    source = DefaultSource
    networkFilters: seq[string] = @[]
    timeoutSecs = DefaultTimeoutSecs
    singleSpr = ""
    format = ofTable
    outFile = ""

  let params = commandLineParams()
  var i = 0
  while i < params.len:
    case params[i]
    of "--help", "-h":
      printHelp()
      quit(QuitSuccess)
    of "--source":
      source = nextValue(params, i, "--source")
    of "--network":
      networkFilters.add nextValue(params, i, "--network")
    of "--timeout":
      timeoutSecs = parseTimeout(nextValue(params, i, "--timeout"))
    of "--format":
      format = parseFormat(nextValue(params, i, "--format"))
    of "--out":
      outFile = nextValue(params, i, "--out")
    else:
      if params[i].startsWith("spr:"):
        singleSpr = params[i]
      else:
        quit("Error: unknown argument: " & params[i], QuitFailure)
    inc i

  let timeout = timeoutSecs.seconds

  # Ad-hoc mode: probe the one SPR given on the command line and exit with the
  # matching status. --format and --out are ignored here.
  if singleSpr.len > 0:
    let parsed = SignedPeerRecord.parse(singleSpr)
    if parsed.isErr:
      quit("Error: " & parsed.error, QuitFailure)
    let v = waitFor probe(parsed.get, timeout)
    echo (if v.alive: "ALIVE: " else: "DEAD: "), v.reason
    quit(if v.alive: QuitSuccess else: QuitFailure)

  let rows = waitFor probeRecords(source, networkFilters, timeout)

  if rows.len == 0:
    # Nothing was probed (empty config, or a --network name that matches no
    # preset). The exit status stays "success" for compatibility, but say so on
    # stderr — leaving stdout clean — so a typo cannot pass as a healthy fleet.
    stderr.writeLine(
      "Warning: no bootstrap records were probed; check --source and --network"
    )

  if outFile.len > 0:
    # Files (and the JSON form) must stay plain — ANSI color codes would corrupt
    # them — so always write the uncolored rendering. writeFile does not append a
    # trailing newline the way echo does, so add one to keep the file POSIX-clean.
    let output =
      case format
      of ofTable:
        renderTable(rows)
      of ofJson:
        pretty(rowsToJson(rows))
    writeFile(outFile, output & "\n")
    # The report is in the file, not on stdout, so point the caller at it. Use
    # the absolute path so the line is unambiguous regardless of the cwd.
    #!fmt: off
    styledEcho bgWhite, fgBlack, styleBright,
      "\n\n ",
      styleUnderscore,
      "ℹ️ BOOTSTRAP HEALTH REPORT ℹ️\n\n",
      resetStyle, bgWhite, fgBlack, styleBright,
      """ Bootstrap health report for this run will be available at:""",
      resetStyle, bgWhite, fgBlack,
      &"\n\n {absolutePath(outFile)}\n\n",
      resetStyle, bgWhite, fgBlack, styleBright,
      " NOTE: For CI runs, the report will be displayed in the workflow summary\n"
    #!fmt: on
  else:
    case format
    of ofTable:
      # Color only when stdout is an interactive terminal; when piped or
      # redirected, fall back to the plain table so escapes never reach the
      # consumer (std/terminal emits ANSI unconditionally on POSIX).
      if isatty(stdout):
        printStyledTable(rows)
      else:
        echo renderTable(rows)
    of ofJson:
      echo pretty(rowsToJson(rows))

  let anyDead = rows.anyIt(not it.alive)

  # With --out the report itself is not on stdout, so print a one-line verdict.
  if outFile.len > 0:
    if anyDead:
      styledEcho styleBright, fgRed, "\n\n[❌ ERROR] ", resetStyle,
        "One or more bootstrap nodes are unreachable, see report for details.\n\n"
    else:
      styledEcho styleBright, fgGreen, "\n\n[✅ SUCCESS] ", resetStyle,
        "All bootstrap nodes are reachable, see report for details.\n\n"

  if anyDead:
    quit(QuitFailure)