## check_spr - bootstrap-node liveness checker.
##
## Reads the bootstrap SPRs from the shared config file (network_presets.json by
## default) and probes each one. The probe depends on the transport advertised in
## the record:
## * TCP addresses -> a libp2p connection is attempted.
## * UDP addresses -> a discovery v5 (DHT) ping is sent.
## It prints a per-node report — a human-readable table (alive rows green, dead
## red on a terminal) by default, or JSON with `--format json` — and exits
## non-zero if any node is unreachable. The `--network` filter may be repeated to
## probe several presets. A single `spr:` URI can also be passed for ad-hoc
## checks.
##
## IMPORTANT: run this from a host OUTSIDE the fleet VPCs (e.g. a GitHub-hosted
## runner), otherwise nodes advertising private/cloud-internal IPs will appear
## reachable and defeat the purpose.
##
## Usage:
##   check_spr [--source <file>] [--network <name>]... [--timeout <secs>]
##             [--format table|json] [--out <file>]
##   check_spr <spr-uri> [--timeout <secs>]
##   check_spr --help
##
## Run `check_spr --help` for a full description of every option.
import std/[json, net, options, os, sequtils, strutils, typetraits, strformat, terminal]
import pkg/chronicles
import pkg/chronos
import pkg/libp2p
import pkg/libp2p/crypto/rng
import pkg/codexdht/discv5/spr
import pkg/codexdht/discv5/node
import pkg/codexdht/discv5/protocol as discv5
import ../storage/presets
const DefaultTimeoutSecs = 10
  ## Per-node probe timeout, in seconds, used when `--timeout` is not given.

const DefaultSource = "network_presets.json"
  ## Config file the SPRs are read from when `--source` is not given.
type
  OutputFormat = enum
    ## Report renderings selectable with `--format`. Each member's string value
    ## is the exact spelling accepted on the command line, which lets the
    ## parser compare the argument against `$member` directly.
    ofTable = "table"
    ofJson = "json"

  Verdict = object
    ## Outcome of probing a single node.
    alive: bool ## true when the probe succeeded
    reason: string ## human-readable explanation of the outcome

  Row = object
    ## One line of the final report: a probed (or unparseable) record together
    ## with the preset it came from.
    network: string ## name of the preset the record belongs to
    peerId: string ## stringified peer id; left empty when the SPR did not parse
    address: string ## advertised addresses, comma-separated
    alive: bool ## copied from the probe verdict
    reason: string ## copied from the probe verdict, or the parse error
proc hasCodec(addrs: seq[MultiAddress], codec: MultiCodec): bool =
  ## True when at least one of `addrs` contains `codec`. An address for which
  ## the `contains` query itself fails is treated as not containing the codec.
  for ma in addrs:
    if ma.contains(codec).get(false):
      return true
  return false
proc setupLogging() =
  ## Install runtime writers for the chronicles log outputs.
  ##
  ## The project's `config.nims` forces `dynamic` chronicles sinks, whose output
  ## writers must be configured at runtime. Without this, every log message is
  ## dropped with a noisy "dynamic log output writer not configured" warning.
  ## Logs are routed to stderr so the tool's stdout (the report — a table or
  ## JSON, or the single-SPR ALIVE/DEAD verdict) stays clean and parseable.
  ##
  ## Nothing is configured unless the default stream has exactly three outputs.
  when defaultChroniclesStream.outputs.type.arity == 3:
    proc writeToStderr(logLevel: LogLevel, msg: LogOutputStr) =
      # Best effort: a failing stderr must not take the tool down.
      try:
        stderr.write(msg)
        stderr.flushFile()
      except IOError:
        discard

    proc discardLog(logLevel: LogLevel, msg: LogOutputStr) =
      discard

    # Only the first output is surfaced; the remaining two are silenced.
    defaultChroniclesStream.outputs[2].writer = discardLog
    defaultChroniclesStream.outputs[1].writer = discardLog
    defaultChroniclesStream.outputs[0].writer = writeToStderr
proc buildSwitch(): Switch =
  ## Build a throwaway libp2p switch used only to dial a target node over TCP.
  ## It listens on the IPv4 wildcard address with port 0.
  let listenAddr = MultiAddress.init("/ip4/0.0.0.0/tcp/0").tryGet()
  let builder = SwitchBuilder.new().withRng(newRng()).withAddress(listenAddr)
  # Yamux first, then Mplex: match storage node switch builder, for TCP.
  builder.withTcpTransport().withNoise().withYamux().withMplex().build()
proc checkLibp2p(
    peerId: PeerId, addresses: seq[MultiAddress], timeout: Duration
): Future[Verdict] {.async.} =
  ## Probe a node by dialing it with a freshly built libp2p switch.
  ##
  ## The verdict is alive when `connect` completes within `timeout`. Otherwise
  ## it is dead, with a reason that distinguishes a timeout from any other
  ## connection error. The switch is stopped again before the proc returns.
  let dialer = buildSwitch()
  await dialer.start()
  defer:
    await dialer.stop()

  var verdict: Verdict
  try:
    await dialer.connect(peerId, addresses).wait(timeout)
    verdict = Verdict(alive: true, reason: "libp2p connection established")
  except AsyncTimeoutError:
    verdict = Verdict(alive: false, reason: "libp2p connection timed out")
  except CatchableError as err:
    verdict = Verdict(alive: false, reason: "libp2p connection failed: " & err.msg)
  return verdict
proc checkDiscv5(
    record: SignedPeerRecord, timeout: Duration
): Future[Verdict] {.async.} =
  ## Probe a node by sending it a discovery v5 ping.
  ##
  ## A temporary discv5 protocol instance with a freshly generated private key
  ## is opened for the ping and closed again before returning. The verdict is
  ## alive only when a pong arrives within `timeout`. Every failure (the node
  ## cannot be built from `record`, the local protocol cannot be opened, the
  ## ping errors or times out) yields a dead verdict with a matching reason.
  let nodeRes = newNode(record)
  if nodeRes.isErr:
    return Verdict(alive: false, reason: "cannot build discv5 node: " & $nodeRes.error)
  let targetNode = nodeRes.get()

  let rng = newRng()
  let privKey = PrivateKey.random(rng).tryGet()

  # Use IPv4_any address as the enrIp param in newProtocol to avoid the
  # warnings. It changes the SPR of the discv5 protocol ping tool (this), but
  # does not affect the SPRs of the target.
  let proto = discv5.newProtocol(
    privKey, IPv4_any().some, none(Port), none(Port), bindPort = Port(0), rng = rng
  )

  try:
    proto.open()
  except CatchableError as exc:
    return Verdict(alive: false, reason: "failed to open discv5: " & exc.msg)
  # Registered only after a successful open, so a failed open is never closed.
  defer:
    await proto.closeWait()

  try:
    let pong = await discv5.ping(proto, targetNode).wait(timeout)
    if pong.isOk:
      return Verdict(alive: true, reason: "discv5 pong received")
    else:
      return Verdict(alive: false, reason: "discv5 ping failed: " & $pong.error)
  except AsyncTimeoutError:
    return Verdict(alive: false, reason: "discv5 ping timed out")
  except CatchableError as exc:
    return Verdict(alive: false, reason: "discv5 ping failed: " & exc.msg)
proc probe(record: SignedPeerRecord, timeout: Duration): Future[Verdict] {.async.} =
  ## Pick the probe matching the transports advertised in `record` and run it.
  ##
  ## TCP takes precedence: if any address carries the tcp codec the libp2p
  ## connection check is used, and only otherwise is a udp address probed with
  ## a discv5 ping. A record with no addresses, or with neither transport, is
  ## reported dead without any network activity.
  let advertised = record.data.addresses.mapIt(it.address)

  if advertised.len == 0:
    return Verdict(alive: false, reason: "SPR contains no addresses")

  if hasCodec(advertised, multiCodec("tcp")):
    return await checkLibp2p(record.data.peerId, advertised, timeout)

  if hasCodec(advertised, multiCodec("udp")):
    return await checkDiscv5(record, timeout)

  return Verdict(alive: false, reason: "no tcp or udp addresses to probe")
proc probeRecords(
    source: string, networkFilters: seq[string], timeout: Duration
): Future[seq[Row]] {.async.} =
  ## Probe every SPR of the selected presets in `source`, one after another,
  ## and return one report row per record in the order they were read.
  ##
  ## `networkFilters` holds the preset names given via `--network`; an empty
  ## list selects every preset. A record that fails to parse still produces a
  ## (dead) row carrying the parse error, so it shows up in the report.
  var rows: seq[Row]
  let probeAll = networkFilters.len == 0
  let presets = loadNetworkPresets(source)

  for preset in presets:
    if not (probeAll or preset.name in networkFilters):
      continue

    for raw in preset.rawRecords:
      let parsed = SignedPeerRecord.parse(raw)
      if parsed.isOk:
        let record = parsed.get
        let verdict = await probe(record, timeout)
        # The same address may be advertised more than once; show it once.
        let shown = record.data.addresses.mapIt($it.address).deduplicate
        rows.add Row(
          network: preset.name,
          peerId: $record.data.peerId,
          address: shown.join(", "),
          alive: verdict.alive,
          reason: verdict.reason,
        )
      else:
        rows.add Row(
          network: preset.name,
          alive: false,
          reason: "SPR parse failed: " & parsed.error,
        )

  return rows
const
  # Column layout shared by the header and every data row, so the two cannot
  # drift apart. The result column is as wide as its own heading, which is
  # wider than either status word.
  NetworkColWidth = 12
  ResultColWidth = "RESULT".len
  AddressColWidth = 43
  ColumnSep = " "
  TableHeader =
    "FLEET".alignLeft(NetworkColWidth) & ColumnSep &
    "RESULT".alignLeft(ResultColWidth) & ColumnSep &
    "ADDRESS".alignLeft(AddressColWidth) & ColumnSep & "REASON"
  TableRule = '-'.repeat(TableHeader.len)

proc formatRow(r: Row): string =
  ## One table line for a probed node, laid out on the same column grid as
  ## `TableHeader`. The status cell is padded to a fixed width, so columns line
  ## up regardless of the verdict. `alignLeft` only pads, so a value wider than
  ## its column is emitted in full and pushes the rest of the line right. ANSI
  ## color, when added by the styled renderer, is zero-width on screen, so the
  ## plain-text alignment computed here stays visually correct.
  let status = if r.alive: "ALIVE" else: "DEAD"
  r.network.alignLeft(NetworkColWidth) & ColumnSep &
    status.alignLeft(ResultColWidth) & ColumnSep &
    r.address.alignLeft(AddressColWidth) & ColumnSep & r.reason
proc renderTable(rows: seq[Row]): string =
  ## Plain (uncolored) table: header, rule, then one line per row, joined with
  ## newlines and without a trailing newline. Used for files and
  ## non-interactive stdout, where ANSI escapes would be corruption rather than
  ## decoration.
  var lines = @[TableHeader, TableRule]
  for row in rows:
    lines.add formatRow(row)
  lines.join("\n")
proc printStyledTable(rows: seq[Row]) =
  ## Print the table to stdout with color: bright header, dim rule, then each
  ## row in green when the node is alive and red when it is dead.
  ##
  ## styledEcho writes ANSI escapes unconditionally, so callers must restrict
  ## this to a TTY (see isatty guard at the call site).
  styledEcho(styleBright, TableHeader)
  styledEcho(styleDim, TableRule)
  for row in rows:
    let rowColor = if row.alive: fgGreen else: fgRed
    styledEcho(rowColor, formatRow(row))
proc rowsToJson(rows: seq[Row]): JsonNode =
  ## Convert the report rows into a JSON array holding one object per row, with
  ## the keys "network", "peerId", "address", "alive" and "reason" in that
  ## order.
  result = newJArray()
  for row in rows:
    let entry = newJObject()
    entry["network"] = %row.network
    entry["peerId"] = %row.peerId
    entry["address"] = %row.address
    entry["alive"] = %row.alive
    entry["reason"] = %row.reason
    result.add entry
proc parseTimeout(s: string): int =
  ## Parse the `--timeout` argument as a whole number of seconds.
  ##
  ## Exits with a usage error when `s` is not an integer, or when it is not
  ## strictly positive. A zero timeout would be passed straight to the probes
  ## as a zero-length wait, which gives no node a chance to answer.
  result =
    try:
      parseInt(s)
    except ValueError:
      quit("Error: timeout must be an integer number of seconds", QuitFailure)
  if result <= 0:
    quit("Error: timeout must be a positive number of seconds", QuitFailure)
proc nextValue(params: seq[string], i: var int, flag: string): string =
  ## Fetch the value belonging to a value-expecting flag.
  ##
  ## On entry `i` indexes the flag itself; on return it indexes the value, which
  ## is the token right after the flag. Two situations are usage errors rather
  ## than crashes: the flag is the last token (a trailing `--network`), or the
  ## following token is itself a flag, as in `--network --timeout`, where
  ## `--timeout` would otherwise be swallowed as the value. No value this tool
  ## accepts begins with "-", so a leading dash reliably marks a flag.
  let valueIdx = i + 1
  i = valueIdx
  if valueIdx < params.len and not params[valueIdx].startsWith("-"):
    return params[valueIdx]
  quit("Error: " & flag & " requires a value", QuitFailure)
proc parseFormat(s: string): OutputFormat =
  ## Map the `--format` argument onto an `OutputFormat`. The accepted spellings
  ## are the enum members' own string values; anything else exits with a usage
  ## error naming both of them.
  if s == $ofTable:
    return ofTable
  if s == $ofJson:
    return ofJson
  quit("Error: format must be '" & $ofTable & "' or '" & $ofJson & "'", QuitFailure)
proc printHelp() =
  ## Print the full usage text to stdout. The defaults shown for `--source` and
  ## `--timeout` are spliced in from `DefaultSource` and `DefaultTimeoutSecs`,
  ## so the help cannot drift from the values the parser actually uses.
  echo """check_spr - bootstrap-node liveness checker.
Reads bootstrap SPRs from a config file and probes each one (libp2p connect for
TCP addresses, discv5 ping for UDP). Prints a per-node report (a table by
default, JSON with --format json) and exits non-zero if any node is unreachable.
A single spr: URI can be passed for an ad-hoc check.
IMPORTANT: run from a host OUTSIDE the fleet VPCs (e.g. a GitHub-hosted runner),
otherwise nodes advertising private/cloud-internal IPs appear reachable and
defeat the purpose.
Usage:
check_spr [options]
check_spr <spr-uri> [--timeout <secs>]
Options:
--source <file> Config file to read SPRs from (default: """ &
    DefaultSource & """).
--network <name> Restrict probing to the named preset. Repeat the flag to
probe several, e.g.
--network logos.test --network logos.dev
Omit entirely to probe every preset in the config.
--timeout <secs> Per-node probe timeout in seconds (default: """ &
    $DefaultTimeoutSecs & """).
--format <fmt> Output format: "table" (default) or "json". "table" is the
human-readable table; "json" is a pretty-printed summary.
--out <file> Write the output to <file> instead of stdout. The content
is whichever --format is selected (table or json).
--help, -h Show this help and exit.
Arguments:
<spr-uri> A single "spr:" URI to probe instead of reading the config
file. Prints ALIVE/DEAD and exits with the matching status;
this mode ignores --format and --out."""
when isMainModule:
  # Entry point: parse the command line, probe either a single SPR or every
  # record of the selected presets, emit the report, and exit non-zero when any
  # probed node is unreachable.
  setupLogging()

  var
    source = DefaultSource
    networkFilters: seq[string] = @[]
    timeoutSecs = DefaultTimeoutSecs
    singleSpr = ""
    format = ofTable
    outFile = ""

  # Hand-rolled argument loop: `i` always indexes the token being examined, and
  # nextValue advances it onto the value of a value-expecting flag.
  let params = commandLineParams()
  var i = 0
  while i < params.len:
    case params[i]
    of "--help", "-h":
      printHelp()
      quit(QuitSuccess)
    of "--source":
      source = nextValue(params, i, "--source")
    of "--network":
      networkFilters.add nextValue(params, i, "--network")
    of "--timeout":
      timeoutSecs = parseTimeout(nextValue(params, i, "--timeout"))
    of "--format":
      format = parseFormat(nextValue(params, i, "--format"))
    of "--out":
      outFile = nextValue(params, i, "--out")
    else:
      # The only accepted positional argument is a single "spr:" URI.
      if params[i].startsWith("spr:"):
        singleSpr = params[i]
      else:
        quit("Error: unknown argument: " & params[i], QuitFailure)
    inc i

  let timeout = timeoutSecs.seconds

  # Ad-hoc mode: probe the one SPR, print the verdict, and exit with the
  # matching status. --source, --network, --format and --out are not consulted.
  if singleSpr.len > 0:
    let parsed = SignedPeerRecord.parse(singleSpr)
    if parsed.isErr:
      quit("Error: " & parsed.error, QuitFailure)
    let v = waitFor probe(parsed.get, timeout)
    echo (if v.alive: "ALIVE: " else: "DEAD: "), v.reason
    quit(if v.alive: QuitSuccess else: QuitFailure)

  let rows = waitFor probeRecords(source, networkFilters, timeout)

  if outFile.len > 0:
    # Files (and the JSON form) must stay plain — ANSI color codes would corrupt
    # them — so always write the uncolored rendering. writeFile does not append a
    # trailing newline the way echo does, so add one to keep the file POSIX-clean.
    let output =
      case format
      of ofTable:
        renderTable(rows)
      of ofJson:
        pretty(rowsToJson(rows))
    # Report an unwritable path like every other usage error instead of letting
    # the IOError escape as an unhandled exception with a traceback.
    try:
      writeFile(outFile, output & "\n")
    except IOError as exc:
      quit("Error: cannot write report to " & outFile & ": " & exc.msg, QuitFailure)
    # The report is in the file, not on stdout, so point the caller at it. Use
    # the absolute path so the line is unambiguous regardless of the cwd.
    #!fmt: off
    styledEcho bgWhite, fgBlack, styleBright,
      "\n\n ",
      styleUnderscore,
      " BOOTSTRAP HEALTH REPORT \n\n",
      resetStyle, bgWhite, fgBlack, styleBright,
      """ Bootstrap health report for this run will be available at:""",
      resetStyle, bgWhite, fgBlack,
      &"\n\n {absolutePath(outFile)}\n\n",
      resetStyle, bgWhite, fgBlack, styleBright,
      " NOTE: For CI runs, the report will be displayed in the workflow summary\n"
    #!fmt: on
  else:
    case format
    of ofTable:
      # Color only when stdout is an interactive terminal; when piped or
      # redirected, fall back to the plain table so escapes never reach the
      # consumer (std/terminal emits ANSI unconditionally on POSIX).
      if isatty(stdout):
        printStyledTable(rows)
      else:
        echo renderTable(rows)
    of ofJson:
      echo pretty(rowsToJson(rows))

  # Only whether some node is dead matters from here on, not which ones.
  let anyDead = rows.anyIt(not it.alive)

  # With --out the details are in the file, so print a one-line summary.
  if outFile.len > 0:
    if anyDead:
      styledEcho styleBright,
        fgRed,
        &"\n\n[❌ ERROR] ",
        resetStyle,
        "One or more bootstrap nodes are unreachable, see report for details.\n\n"
    else:
      styledEcho styleBright,
        fgGreen,
        &"\n\n[✅ SUCCESS] ",
        resetStyle,
        "All bootstrap nodes are reachable, see report for details.\n\n"

  if anyDead:
    quit(QuitFailure)