graceful shutdowns

Where possible, do not call `raiseAssert`, as other nodes in the test may already be running. Instead, raise catchable exceptions, catch them in multinodes.nim, and attempt a teardown before failing the test.

`abortOnError` is set to true so that `fail()` quits immediately once teardown has been run.
This commit is contained in:
Eric 2024-02-21 18:27:07 +11:00
parent de4b3bebf7
commit 858b6ae339
No known key found for this signature in database
2 changed files with 70 additions and 43 deletions

View File

@ -32,6 +32,10 @@ type
Provider, Provider,
Validator, Validator,
Hardhat Hardhat
MultiNodeSuiteError = object of CatchableError
proc raiseMultiNodeSuiteError(msg: string) =
raise newException(MultiNodeSuiteError, msg)
proc nextFreePort(startPort: int): Future[int] {.async.} = proc nextFreePort(startPort: int): Future[int] {.async.} =
@ -67,8 +71,6 @@ template multinodesuite*(name: string, body: untyped) =
var accounts {.inject, used.}: seq[Address] var accounts {.inject, used.}: seq[Address]
var snapshot: JsonNode var snapshot: JsonNode
proc teardownImpl(): Future[void] {.gcsafe.}
template test(tname, startNodeConfigs, tbody) = template test(tname, startNodeConfigs, tbody) =
currentTestName = tname currentTestName = tname
nodeConfigs = startNodeConfigs nodeConfigs = startNodeConfigs
@ -111,7 +113,10 @@ template multinodesuite*(name: string, body: untyped) =
args.add "--log-file=" & updatedLogFile args.add "--log-file=" & updatedLogFile
let node = await HardhatProcess.startNode(args, config.debugEnabled, "hardhat") let node = await HardhatProcess.startNode(args, config.debugEnabled, "hardhat")
await node.waitUntilStarted() try:
await node.waitUntilStarted()
except NodeProcessError as e:
raiseMultiNodeSuiteError "hardhat node not started: " & e.msg
trace "hardhat node started" trace "hardhat node started"
return node return node
@ -125,9 +130,8 @@ template multinodesuite*(name: string, body: untyped) =
var config = conf var config = conf
if nodeIdx > accounts.len - 1: if nodeIdx > accounts.len - 1:
await teardownImpl() raiseMultiNodeSuiteError "Cannot start node at nodeIdx " & $nodeIdx &
raiseAssert("Cannot start node at nodeIdx " & $nodeIdx & ", not enough eth accounts."
", not enough eth accounts.")
let datadir = getTempDir() / "Codex" / let datadir = getTempDir() / "Codex" /
sanitize($starttime) / sanitize($starttime) /
@ -146,19 +150,19 @@ template multinodesuite*(name: string, body: untyped) =
config.addCliOption("--disc-port", $ await nextFreePort(8090 + nodeIdx)) config.addCliOption("--disc-port", $ await nextFreePort(8090 + nodeIdx))
except CodexConfigError as e: except CodexConfigError as e:
fatal "invalid cli option", error = e.msg raiseMultiNodeSuiteError "invalid cli option, error: " & e.msg
echo "[FATAL] invalid cli option ", e.msg
await teardownImpl()
fail()
return
let node = await CodexProcess.startNode( let node = await CodexProcess.startNode(
config.cliArgs, config.cliArgs,
config.debugEnabled, config.debugEnabled,
$role & $roleIdx $role & $roleIdx
) )
await node.waitUntilStarted()
trace "node started", nodeName = $role & $roleIdx try:
await node.waitUntilStarted()
trace "node started", nodeName = $role & $roleIdx
except NodeProcessError as e:
raiseMultiNodeSuiteError "node not started, error: " & e.msg
return node return node
@ -215,7 +219,7 @@ template multinodesuite*(name: string, body: untyped) =
return await newCodexProcess(validatorIdx, config, Role.Validator) return await newCodexProcess(validatorIdx, config, Role.Validator)
proc teardownImpl {.async.} = proc teardownImpl() {.async.} =
for nodes in @[validators(), clients(), providers()]: for nodes in @[validators(), clients(), providers()]:
for node in nodes: for node in nodes:
await node.stop() # also stops rest client await node.stop() # also stops rest client
@ -231,10 +235,27 @@ template multinodesuite*(name: string, body: untyped) =
running = @[] running = @[]
template failAndTeardownOnError(message: string, tryBody: untyped) =
try:
tryBody
except CatchableError as er:
fatal message, error=er.msg
echo "[FATAL] ", message, ": ", er.msg
await teardownImpl()
when declared(teardownAllIMPL):
teardownAllIMPL()
fail()
quit(1)
setup: setup:
if var conf =? nodeConfigs.hardhat: if var conf =? nodeConfigs.hardhat:
let node = await startHardhatNode(conf) try:
running.add RunningNode(role: Role.Hardhat, node: node) let node = await startHardhatNode(conf)
running.add RunningNode(role: Role.Hardhat, node: node)
except CatchableError as e:
echo "failed to start hardhat node"
fail()
quit(1)
try: try:
ethProvider = JsonRpcProvider.new("ws://localhost:8545") ethProvider = JsonRpcProvider.new("ws://localhost:8545")
@ -244,37 +265,40 @@ template multinodesuite*(name: string, body: untyped) =
snapshot = await send(ethProvider, "evm_snapshot") snapshot = await send(ethProvider, "evm_snapshot")
accounts = await ethProvider.listAccounts() accounts = await ethProvider.listAccounts()
except CatchableError as e: except CatchableError as e:
fatal "failed to connect to hardhat", error = e.msg echo "Hardhat not running. Run hardhat manually " &
echo "[FATAL] Hardhat not running. Run hardhat manually before executing tests, or include a HardhatConfig in the test setup." "before executing tests, or include a " &
await teardownImpl() "HardhatConfig in the test setup."
fail() fail()
return quit(1)
if var clients =? nodeConfigs.clients: if var clients =? nodeConfigs.clients:
for config in clients.configs: failAndTeardownOnError "failed to start client nodes":
let node = await startClientNode(config) for config in clients.configs:
running.add RunningNode( let node = await startClientNode(config)
role: Role.Client, running.add RunningNode(
node: node role: Role.Client,
) node: node
if clients().len == 1: )
bootstrap = CodexProcess(node).client.info()["spr"].getStr() if clients().len == 1:
bootstrap = CodexProcess(node).client.info()["spr"].getStr()
if var providers =? nodeConfigs.providers: if var providers =? nodeConfigs.providers:
for config in providers.configs.mitems: failAndTeardownOnError "failed to start provider nodes":
let node = await startProviderNode(config) for config in providers.configs.mitems:
running.add RunningNode( let node = await startProviderNode(config)
role: Role.Provider, running.add RunningNode(
node: node role: Role.Provider,
) node: node
)
if var validators =? nodeConfigs.validators: if var validators =? nodeConfigs.validators:
for config in validators.configs.mitems: failAndTeardownOnError "failed to start validator nodes":
let node = await startValidatorNode(config) for config in validators.configs.mitems:
running.add RunningNode( let node = await startValidatorNode(config)
role: Role.Validator, running.add RunningNode(
node: node role: Role.Validator,
) node: node
)
teardown: teardown:
await teardownImpl() await teardownImpl()

View File

@ -23,6 +23,7 @@ type
debug: bool debug: bool
trackedFutures*: TrackedFutures trackedFutures*: TrackedFutures
name*: string name*: string
NodeProcessError* = object of CatchableError
method workingDir(node: NodeProcess): string {.base.} = method workingDir(node: NodeProcess): string {.base.} =
raiseAssert "not implemented" raiseAssert "not implemented"
@ -152,12 +153,14 @@ proc waitUntilStarted*(node: NodeProcess) {.async.} =
try: try:
discard node.captureOutput(node.startedOutput, started).track(node) discard node.captureOutput(node.startedOutput, started).track(node)
await started.wait(35.seconds) # allow enough time for proof generation await started.wait(35.seconds) # allow enough time for proof generation
except AsyncTimeoutError as e: except AsyncTimeoutError:
# attempt graceful shutdown in case node was partially started, prevent # attempt graceful shutdown in case node was partially started, prevent
# zombies # zombies
# TODO: raise error here so that all nodes can be shutdown gracefully
await node.stop() await node.stop()
raiseAssert "node did not output '" & node.startedOutput & "'" # raise error here so that all nodes (not just this one) can be
# shutdown gracefully
raise newException(NodeProcessError, "node did not output '" &
node.startedOutput & "'")
proc restart*(node: NodeProcess) {.async.} = proc restart*(node: NodeProcess) {.async.} =
await node.stop() await node.stop()