logos-storage-nim/tools/check_spr.nim
2026-06-23 11:35:41 +00:00

392 lines
14 KiB
Nim
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

## check_spr - bootstrap-node liveness checker.
##
## Reads the bootstrap SPRs from the shared config file (network_presets.json by
## default) and probes each one. The probe depends on the transport advertised in
## the record:
## * TCP addresses -> a libp2p connection is attempted.
## * UDP addresses -> a discovery v5 (DHT) ping is sent.
## It prints a per-node report — a human-readable table (alive rows green, dead
## red on a terminal) by default, or JSON with `--format json` — and exits
## non-zero if any node is unreachable. The `--network` filter may be repeated to
## probe several presets. A single `spr:` URI can also be passed for ad-hoc
## checks.
##
## IMPORTANT: run this from a host OUTSIDE the fleet VPCs (e.g. a GitHub-hosted
## runner), otherwise nodes advertising private/cloud-internal IPs will appear
## reachable and defeat the purpose.
##
## Usage:
## check_spr [--source <file>] [--network <name>]... [--timeout <secs>]
## [--format table|json] [--out <file>]
## check_spr <spr-uri> [--timeout <secs>]
## check_spr --help
##
## Run `check_spr --help` for a full description of every option.
import std/[json, net, options, os, sequtils, strutils, typetraits, strformat, terminal]
import pkg/chronicles
import pkg/chronos
import pkg/libp2p
import pkg/libp2p/crypto/rng
import pkg/codexdht/discv5/spr
import pkg/codexdht/discv5/node
import pkg/codexdht/discv5/protocol as discv5
import ../storage/presets
const DefaultTimeoutSecs = 10
  ## Per-node probe timeout, in seconds, used when `--timeout` is not given.

const DefaultSource = "network_presets.json"
  ## Config file the bootstrap SPRs are read from when `--source` is not given.
type
  OutputFormat = enum
    ## Report renderings selectable with `--format`. Each member's string value
    ## is the exact spelling accepted on the command line, so the argument
    ## parser can compare user input against `$member` directly.
    ofTable = "table"
    ofJson = "json"

  Verdict = object
    ## Outcome of probing a single node.
    alive: bool ## true when the probe succeeded
    reason: string ## human-readable explanation of the outcome

  Row = object
    ## One line of the report: a probed record plus its verdict.
    network: string ## name of the preset the record came from
    peerId: string ## stringified peer id; left empty when the SPR did not parse
    address: string ## advertised addresses, de-duplicated and comma-separated
    alive: bool ## copied from the probe verdict (false on parse failure)
    reason: string ## copied from the probe verdict, or the parse error
proc hasCodec(addrs: seq[MultiAddress], codec: MultiCodec): bool =
  ## Report whether at least one address in `addrs` contains `codec`.
  ##
  ## An address whose containment check yields no value is treated as not
  ## containing the codec (the `get(false)` fallback). An empty list gives
  ## `false`.
  for candidate in addrs:
    if candidate.contains(codec).get(false):
      return true
  false
proc setupLogging() =
  ## Install runtime writers for the chronicles log outputs.
  ##
  ## The project's `config.nims` forces `dynamic` chronicles sinks, and those
  ## need their writers assigned at runtime; otherwise every message is dropped
  ## with a noisy "dynamic log output writer not configured" warning. Log text
  ## is sent to stderr so that stdout carries only the tool's own report (table,
  ## JSON, or the single-SPR ALIVE/DEAD line) and stays parseable.
  ##
  ## Does nothing unless the default stream is configured with exactly three
  ## outputs.
  when defaultChroniclesStream.outputs.type.arity == 3:
    proc writeToStderr(logLevel: LogLevel, msg: LogOutputStr) =
      # Best effort: a failing stderr must not take the tool down.
      try:
        stderr.write(msg)
        stderr.flushFile()
      except IOError:
        discard

    proc dropMessage(logLevel: LogLevel, msg: LogOutputStr) =
      discard

    # Only the first output is surfaced; the other two are silenced.
    defaultChroniclesStream.outputs[2].writer = dropMessage
    defaultChroniclesStream.outputs[1].writer = dropMessage
    defaultChroniclesStream.outputs[0].writer = writeToStderr
proc buildSwitch(): Switch =
  ## Assemble the throw-away libp2p switch used to dial a bootstrap node.
  ##
  ## The switch gets a fresh RNG, the listen address `/ip4/0.0.0.0/tcp/0`, the
  ## TCP transport, Noise, and both the Yamux and Mplex muxers.
  let
    rng = newRng()
    listenAddr = MultiAddress.init("/ip4/0.0.0.0/tcp/0").tryGet()

  let withTransport =
    SwitchBuilder.new().withRng(rng).withAddress(listenAddr).withTcpTransport()
  let secured = withTransport.withNoise()

  # match storage node switch builder, for TCP
  let muxed = secured.withYamux().withMplex()
  muxed.build()
proc checkLibp2p(
    peerId: PeerId, addresses: seq[MultiAddress], timeout: Duration
): Future[Verdict] {.async.} =
  ## Probe a node by attempting a libp2p connection to `peerId` at `addresses`.
  ##
  ## A fresh switch is built and started for the attempt and stopped again when
  ## the proc exits. The connect is bounded by `timeout`. The verdict is alive
  ## when the connect completes; a timeout or any other catchable error yields
  ## a dead verdict whose reason says which of the two happened.
  let dialer = buildSwitch()
  await dialer.start()
  defer:
    await dialer.stop()

  var outcome: Verdict
  try:
    await dialer.connect(peerId, addresses).wait(timeout)
    outcome = Verdict(alive: true, reason: "libp2p connection established")
  except AsyncTimeoutError:
    outcome = Verdict(alive: false, reason: "libp2p connection timed out")
  except CatchableError as exc:
    outcome =
      Verdict(alive: false, reason: "libp2p connection failed: " & exc.msg)
  return outcome
proc checkDiscv5(
    record: SignedPeerRecord, timeout: Duration
): Future[Verdict] {.async.} =
  ## Probe a node by sending it a discovery v5 ping.
  ##
  ## A discv5 node is derived from `record`, and a local protocol instance with
  ## a freshly generated private key is opened for the ping and closed again
  ## when the proc exits. The ping is bounded by `timeout`. Every failure
  ## (cannot build the node, cannot open the protocol, ping error, timeout) is
  ## reported as a dead verdict with a matching reason.
  let built = newNode(record)
  if built.isErr:
    return
      Verdict(alive: false, reason: "cannot build discv5 node: " & $built.error)
  let target = built.get()

  let rng = newRng()
  let localKey = PrivateKey.random(rng).tryGet()
  # Use IPv4_any address as the enrIp param in newProtocol to avoid the
  # warnings. It changes the SPR of the discv5 protocol ping tool (this), but
  # does not affect the SPRs of the target.
  let pinger = discv5.newProtocol(
    localKey, some(IPv4_any()), none(Port), none(Port), bindPort = Port(0), rng = rng
  )

  try:
    pinger.open()
  except CatchableError as exc:
    return Verdict(alive: false, reason: "failed to open discv5: " & exc.msg)
  defer:
    await pinger.closeWait()

  var outcome: Verdict
  try:
    let reply = await discv5.ping(pinger, target).wait(timeout)
    if reply.isOk:
      outcome = Verdict(alive: true, reason: "discv5 pong received")
    else:
      outcome = Verdict(alive: false, reason: "discv5 ping failed: " & $reply.error)
  except AsyncTimeoutError:
    outcome = Verdict(alive: false, reason: "discv5 ping timed out")
  except CatchableError as exc:
    outcome = Verdict(alive: false, reason: "discv5 ping failed: " & exc.msg)
  return outcome
proc probe(record: SignedPeerRecord, timeout: Duration): Future[Verdict] {.async.} =
  ## Pick and run the probe that matches the transports advertised in `record`.
  ##
  ## * no addresses at all -> dead, without touching the network
  ## * any TCP address -> libp2p connection attempt (given every address)
  ## * otherwise any UDP address -> discv5 ping
  ## * neither -> dead
  ##
  ## TCP takes precedence: a record advertising both is probed over libp2p only.
  let advertised = record.data.addresses.mapIt(it.address)

  if advertised.len == 0:
    return Verdict(alive: false, reason: "SPR contains no addresses")

  if hasCodec(advertised, multiCodec("tcp")):
    return await checkLibp2p(record.data.peerId, advertised, timeout)

  if hasCodec(advertised, multiCodec("udp")):
    return await checkDiscv5(record, timeout)

  return Verdict(alive: false, reason: "no tcp or udp addresses to probe")
proc probeRecords(
    source: string, networkFilters: seq[string], timeout: Duration
): Future[seq[Row]] {.async.} =
  ## Load the presets from `source` and probe every bootstrap record in them.
  ##
  ## `networkFilters` lists the preset names to keep; an empty list keeps every
  ## preset. Records are probed one after another, each bounded by `timeout`.
  ## A record that fails to parse still produces a row (dead, with the parse
  ## error as reason, peer id and address left empty), so the result has one
  ## row per record of every selected preset.
  let presets = loadNetworkPresets(source)
  # An empty filter list means "probe everything" (--network may be repeated).
  let keepAll = networkFilters.len == 0

  for preset in presets:
    if not (keepAll or preset.name in networkFilters):
      continue

    for raw in preset.rawRecords:
      let parsed = SignedPeerRecord.parse(raw)
      if parsed.isErr:
        result.add Row(
          network: preset.name,
          alive: false,
          reason: "SPR parse failed: " & parsed.error,
        )
      else:
        let record = parsed.get
        let verdict = await probe(record, timeout)
        let shownAddresses =
          record.data.addresses.mapIt($it.address).deduplicate.join(", ")
        result.add Row(
          network: preset.name,
          peerId: $record.data.peerId,
          address: shownAddresses,
          alive: verdict.alive,
          reason: verdict.reason,
        )
const TableHeader = "FLEET RESULT ADDRESS REASON"
  ## Column titles printed as the first line of the table report.
  ## NOTE(review): the titles are single-space separated, whereas `formatRow`
  ## pads the network cell to 12 and the address cell to 43 characters —
  ## confirm the header spacing is as intended.

const TableRule = repeat('-', len(TableHeader))
  ## Dashed separator line, exactly as wide as `TableHeader`.
proc formatRow(r: Row): string =
  ## Render one probed node as a single table line: network, status, address
  ## and reason, separated by single spaces.
  ##
  ## The network cell is left-aligned to 12 characters and the address cell to
  ## 43; the status cell is always five characters ("DEAD " carries a trailing
  ## space to match "ALIVE"). Any ANSI color is added by the caller around the
  ## finished line, so it does not disturb the padding computed here.
  const StatusCell: array[bool, string] = ["DEAD ", "ALIVE"]
  let cells = [
    r.network.alignLeft(12), StatusCell[r.alive], r.address.alignLeft(43), r.reason
  ]
  cells.join(" ")
proc renderTable(rows: seq[Row]): string =
  ## Build the plain (uncolored) table: header, rule, then one line per row,
  ## joined with newlines and without a trailing newline.
  ##
  ## This is the rendering for files and non-interactive stdout, where ANSI
  ## escapes would be corruption rather than decoration.
  var lines = @[TableHeader, TableRule]
  for row in rows:
    lines.add formatRow(row)
  lines.join("\n")
proc printStyledTable(rows: seq[Row]) =
  ## Print the table to stdout with styling: bright header, dim rule, alive
  ## rows in green and dead rows in red.
  ##
  ## `styledEcho` writes ANSI escapes unconditionally, so callers must only use
  ## this when stdout is a TTY (the call site guards with `isatty`).
  styledEcho(styleBright, TableHeader)
  styledEcho(styleDim, TableRule)
  for row in rows:
    let rowColor = if row.alive: fgGreen else: fgRed
    styledEcho(rowColor, formatRow(row))
proc rowsToJson(rows: seq[Row]): JsonNode =
  ## Convert the report rows into a JSON array with one object per row.
  ##
  ## Each object carries the keys `network`, `peerId`, `address`, `alive` and
  ## `reason`, in that order. An empty `rows` gives an empty array.
  result = newJArray()
  for row in rows:
    let entry = newJObject()
    entry["network"] = %row.network
    entry["peerId"] = %row.peerId
    entry["address"] = %row.address
    entry["alive"] = %row.alive
    entry["reason"] = %row.reason
    result.add entry
proc parseTimeout(s: string): int =
  ## Convert the `--timeout` argument into a whole number of seconds.
  ##
  ## Returns the parsed value when `s` is a positive integer. Otherwise the
  ## process is terminated with a usage error and exit status `QuitFailure`:
  ## * `s` is not an integer at all, or
  ## * `s` is zero or negative. Such a value leaves a probe no time to
  ##   complete, so healthy nodes would presumably all be reported as timed
  ##   out; refusing it up front avoids a misleading all-dead report.
  let secs =
    try:
      parseInt(s)
    except ValueError:
      quit("Error: timeout must be an integer number of seconds", QuitFailure)
  if secs <= 0:
    quit("Error: timeout must be a positive number of seconds", QuitFailure)
  secs
proc nextValue(params: seq[string], i: var int, flag: string): string =
  ## Return the value belonging to a value-expecting flag and step past it.
  ##
  ## On entry `params[i]` is the flag itself. `i` is advanced by one so that it
  ## points at the value, which is returned. Two situations are usage errors
  ## that end the process with `QuitFailure` (instead of an IndexDefect crash
  ## or a silently swallowed option):
  ## * the flag is the last token, e.g. a trailing `--network`;
  ## * the next token is itself a flag, e.g. `--network --timeout`.
  ## No value this tool accepts begins with "-", so a leading "-" reliably
  ## marks a flag. `flag` is only used to name the offender in the message.
  inc i
  if i < params.len and not params[i].startsWith("-"):
    return params[i]
  quit("Error: " & flag & " requires a value", QuitFailure)
proc parseFormat(s: string): OutputFormat =
  ## Map the `--format` argument onto an `OutputFormat` member.
  ##
  ## `s` must equal a member's string value exactly (case-sensitive); anything
  ## else ends the process with a usage error and `QuitFailure`.
  for candidate in OutputFormat.low .. OutputFormat.high:
    if s == $candidate:
      return candidate
  quit("Error: format must be '" & $ofTable & "' or '" & $ofJson & "'", QuitFailure)
proc printHelp() =
  ## Write the full usage text to stdout.
  ##
  ## The defaults quoted in the text are taken from `DefaultSource` and
  ## `DefaultTimeoutSecs`, so the help cannot drift from the actual defaults.
  const helpText =
    """check_spr - bootstrap-node liveness checker.
Reads bootstrap SPRs from a config file and probes each one (libp2p connect for
TCP addresses, discv5 ping for UDP). Prints a per-node report (a table by
default, JSON with --format json) and exits non-zero if any node is unreachable.
A single spr: URI can be passed for an ad-hoc check.
IMPORTANT: run from a host OUTSIDE the fleet VPCs (e.g. a GitHub-hosted runner),
otherwise nodes advertising private/cloud-internal IPs appear reachable and
defeat the purpose.
Usage:
check_spr [options]
check_spr <spr-uri> [--timeout <secs>]
Options:
--source <file> Config file to read SPRs from (default: """ &
    DefaultSource & """).
--network <name> Restrict probing to the named preset. Repeat the flag to
probe several, e.g.
--network logos.test --network logos.dev
Omit entirely to probe every preset in the config.
--timeout <secs> Per-node probe timeout in seconds (default: """ &
    $DefaultTimeoutSecs & """).
--format <fmt> Output format: "table" (default) or "json". "table" is the
human-readable table; "json" is a pretty-printed summary.
--out <file> Write the output to <file> instead of stdout. The content
is whichever --format is selected (table or json).
--help, -h Show this help and exit.
Arguments:
<spr-uri> A single "spr:" URI to probe instead of reading the config
file. Prints ALIVE/DEAD and exits with the matching status;
this mode ignores --format and --out."""
  echo helpText
when isMainModule:
  # Command-line entry point: parse the arguments, then either probe a single
  # `spr:` URI or probe every record from the config file and emit the report.
  # The process exits non-zero when any probed node is unreachable.
  setupLogging()

  var
    source = DefaultSource
    networkFilters: seq[string] = @[]
    timeoutSecs = DefaultTimeoutSecs
    singleSpr = ""
    format = ofTable
    outFile = ""

  # Hand-rolled argument scan; value-taking flags consume the following token
  # through nextValue, which advances the cursor past it.
  let args = commandLineParams()
  var cursor = 0
  while cursor < args.len:
    let arg = args[cursor]
    case arg
    of "--help", "-h":
      printHelp()
      quit(QuitSuccess)
    of "--source":
      source = nextValue(args, cursor, arg)
    of "--network":
      networkFilters.add nextValue(args, cursor, arg)
    of "--timeout":
      timeoutSecs = parseTimeout(nextValue(args, cursor, arg))
    of "--format":
      format = parseFormat(nextValue(args, cursor, arg))
    of "--out":
      outFile = nextValue(args, cursor, arg)
    else:
      # The only accepted positional argument is a single spr: URI.
      if not arg.startsWith("spr:"):
        quit("Error: unknown argument: " & arg, QuitFailure)
      singleSpr = arg
    inc cursor

  let timeout = timeoutSecs.seconds

  # Ad-hoc mode: probe just the given record and exit with its status. This
  # path never looks at `format` or `outFile`.
  if singleSpr.len > 0:
    let parsed = SignedPeerRecord.parse(singleSpr)
    if parsed.isErr:
      quit("Error: " & parsed.error, QuitFailure)
    let verdict = waitFor probe(parsed.get, timeout)
    let label = if verdict.alive: "ALIVE: " else: "DEAD: "
    echo label, verdict.reason
    quit(if verdict.alive: QuitSuccess else: QuitFailure)

  let rows = waitFor probeRecords(source, networkFilters, timeout)
  let anyDead = rows.anyIt(not it.alive)

  if outFile.len > 0:
    # Files (and the JSON form) must stay plain — ANSI color codes would corrupt
    # them — so always write the uncolored rendering. writeFile does not append
    # a trailing newline the way echo does, so add one to keep the file
    # POSIX-clean.
    let report =
      case format
      of ofJson:
        pretty(rowsToJson(rows))
      of ofTable:
        renderTable(rows)
    writeFile(outFile, report & "\n")

    # The report is in the file, not on stdout, so point the caller at it. Use
    # the absolute path so the line is unambiguous regardless of the cwd.
    #!fmt: off
    styledEcho(
      bgWhite, fgBlack, styleBright,
      "\n\n ",
      styleUnderscore,
      " BOOTSTRAP HEALTH REPORT \n\n",
      resetStyle, bgWhite, fgBlack, styleBright,
      " Bootstrap health report for this run will be available at:",
      resetStyle, bgWhite, fgBlack,
      &"\n\n {absolutePath(outFile)}\n\n",
      resetStyle, bgWhite, fgBlack, styleBright,
      " NOTE: For CI runs, the report will be displayed in the workflow summary\n"
    )
    #!fmt: on

    # One-line outcome summary on stdout, since the details live in the file.
    if anyDead:
      styledEcho(
        styleBright,
        fgRed,
        &"\n\n[❌ ERROR] ",
        resetStyle,
        "One or more bootstrap nodes are unreachable, see report for details.\n\n",
      )
    else:
      styledEcho(
        styleBright,
        fgGreen,
        &"\n\n[✅ SUCCESS] ",
        resetStyle,
        "All bootstrap nodes are reachable, see report for details.\n\n",
      )
  else:
    case format
    of ofJson:
      echo pretty(rowsToJson(rows))
    of ofTable:
      # Color only when stdout is an interactive terminal; when piped or
      # redirected, fall back to the plain table so escapes never reach the
      # consumer (std/terminal emits ANSI unconditionally on POSIX).
      if isatty(stdout):
        printStyledTable(rows)
      else:
        echo renderTable(rows)

  if anyDead:
    quit(QuitFailure)