Add proper async exception tracking to multinodesuite

commit bd68388d2a
parent faaddb879b
Author: Eric
Date: 2025-02-24 17:29:44 +11:00
No known key found for this signature in database

4 changed files with 162 additions and 73 deletions
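The pattern applied across all four files is chronos's strict exception tracking: each helper moves from a bare {.async.} pragma, whose future may raise any CatchableError, to {.async: (raises: [...]).}, where the compiler rejects any exception that is neither handled in the body nor listed in the annotation. A minimal standalone sketch of the mechanism (illustrative only, not code from this commit):

```nim
import pkg/chronos

# With strict tracking, the raises list becomes part of the future's type:
# anything not listed must be caught inside the proc body.
proc fetch(): Future[int] {.async: (raises: [CancelledError, ValueError]).} =
  await sleepAsync(1.milliseconds) # sleepAsync itself only raises CancelledError
  return 42

proc run() {.async: (raises: []).} =
  # To claim `raises: []`, every possible exception must be handled here.
  try:
    discard await fetch()
  except CancelledError:
    discard
  except ValueError:
    discard

when isMainModule:
  waitFor run()
```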

View File

@@ -85,7 +85,7 @@ proc client*(node: CodexProcess): CodexClient {.raises: [CodexProcessError].} =
   node.client = some client
   return client
 
-method stop*(node: CodexProcess) {.async.} =
+method stop*(node: CodexProcess) {.async: (raises: []).} =
  logScope:
    nodeName = node.name
View File

@@ -101,7 +101,7 @@ proc startNode*(
     debug: string | bool = false,
     name: string,
     onOutputLineCaptured: OnOutputLineCaptured = nil,
-): Future[HardhatProcess] {.async.} =
+): Future[HardhatProcess] {.async: (raises: [CancelledError, NodeProcessError]).} =
  logScope:
    nodeName = name
@@ -132,7 +132,7 @@ proc startNode*(
   return hardhat
 
-method onOutputLineCaptured(node: HardhatProcess, line: string) {.raises: [].} =
+method onOutputLineCaptured(node: HardhatProcess, line: string) =
  logScope:
    nodeName = node.name
@@ -147,7 +147,7 @@ method onOutputLineCaptured(node: HardhatProcess, line: string) {.raises: [].} =
       discard logFile.closeFile()
       node.logFile = none IoHandle
 
-method stop*(node: HardhatProcess) {.async.} =
+method stop*(node: HardhatProcess) {.async: (raises: []).} =
   # terminate the process
   await procCall NodeProcess(node).stop()
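
HardhatProcess.stop can only declare raises: [] here because the base NodeProcess.stop it delegates to (last file below) carries the same annotation; for methods, an override's raises list must stay compatible with the base method's. A standalone sketch of the pattern, using hypothetical Base/Child types:

```nim
import pkg/chronos

type
  Base = ref object of RootObj
  Child = ref object of Base

method stop(node: Base) {.base, async: (raises: []).} =
  discard

method stop(node: Child) {.async: (raises: []).} =
  # procCall bypasses dynamic dispatch and runs the base implementation;
  # its raises: [] annotation is what lets this override claim raises: [] too.
  await procCall Base(node).stop()
```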

View File

@@ -1,3 +1,4 @@
+import std/httpclient
 import std/os
 import std/sequtils
 import std/strutils
@@ -25,6 +26,8 @@ export hardhatconfig
 export codexconfig
 export nodeconfigs
 
+{.push raises: [].}
+
 type
   RunningNode* = ref object
     role*: Role
@@ -37,6 +40,7 @@ type
     Hardhat
 
   MultiNodeSuiteError = object of CatchableError
+  SuiteTimeoutError = object of MultiNodeSuiteError
 
 const HardhatPort {.intdefine.}: int = 8545
 const CodexApiPort {.intdefine.}: int = 8080
@@ -45,7 +49,9 @@ const TestId {.strdefine.}: string = "TestId"
 const DebugCodexNodes {.booldefine.}: bool = false
 const LogsDir {.strdefine.}: string = ""
 
-proc raiseMultiNodeSuiteError(msg: string, parent: ref CatchableError = nil) =
+proc raiseMultiNodeSuiteError(
+    msg: string, parent: ref CatchableError = nil
+) {.raises: [MultiNodeSuiteError].} =
   raise newException(MultiNodeSuiteError, msg, parent)
 
 template withLock(lock: AsyncLock, body: untyped) =
@@ -98,6 +104,7 @@ template multinodesuite*(name: string, body: untyped) =
     var lastUsedCodexApiPort = CodexApiPort
     var lastUsedCodexDiscPort = CodexDiscPort
     var codexPortLock: AsyncLock
+    var futTimeout: Future[void]
 
     template test(tname, startNodeConfigs, tbody) =
       currentTestName = tname
@@ -111,12 +118,25 @@ template multinodesuite*(name: string, body: untyped) =
     proc newHardhatProcess(
         config: HardhatConfig, role: Role
-    ): Future[NodeProcess] {.async.} =
+    ): Future[NodeProcess] {.async: (raises: [MultiNodeSuiteError, CancelledError]).} =
       var args: seq[string] = @[]
       if config.logFile:
-        let updatedLogFile =
-          getLogFile(LogsDir, starttime, name, currentTestName, $role, none int)
-        args.add "--log-file=" & updatedLogFile
+        try:
+          let updatedLogFile =
+            getLogFile(LogsDir, starttime, name, currentTestName, $role, none int)
+          args.add "--log-file=" & updatedLogFile
+        except IOError as e:
+          raiseMultiNodeSuiteError(
+            "failed to start hardhat because logfile path could not be obtained: " &
+              e.msg,
+            e,
+          )
+        except OSError as e:
+          raiseMultiNodeSuiteError(
+            "failed to start hardhat because logfile path could not be obtained: " &
+              e.msg,
+            e,
+          )
 
       let port = await nextFreePort(lastUsedHardhatPort)
       jsonRpcProviderUrl.updatePort(port)
@@ -134,7 +154,7 @@ template multinodesuite*(name: string, body: untyped) =
     proc newCodexProcess(
         roleIdx: int, conf: CodexConfig, role: Role
-    ): Future[NodeProcess] {.async.} =
+    ): Future[NodeProcess] {.async: (raises: [MultiNodeSuiteError, CancelledError]).} =
       let nodeIdx = running.len
       var config = conf
@@ -148,9 +168,22 @@ template multinodesuite*(name: string, body: untyped) =
       try:
         if config.logFile.isSome or DebugCodexNodes:
-          let updatedLogFile =
-            getLogFile(LogsDir, starttime, name, currentTestName, $role, some roleIdx)
-          config.withLogFile(updatedLogFile)
+          try:
+            let updatedLogFile =
+              getLogFile(LogsDir, starttime, name, currentTestName, $role, some roleIdx)
+            config.withLogFile(updatedLogFile)
+          except IOError as e:
+            raiseMultiNodeSuiteError(
+              "failed to start " & $role &
+                " because logfile path could not be obtained: " & e.msg,
+              e,
+            )
+          except OSError as e:
+            raiseMultiNodeSuiteError(
+              "failed to start " & $role &
+                " because logfile path could not be obtained: " & e.msg,
+              e,
+            )
 
         if DebugCodexNodes:
           config.debugEnabled = true
@@ -172,17 +205,17 @@ template multinodesuite*(name: string, body: untyped) =
       except CodexConfigError as e:
         raiseMultiNodeSuiteError "invalid cli option, error: " & e.msg
 
-      let node = await CodexProcess.startNode(
-        config.cliArgs, config.debugEnabled, $role & $roleIdx
-      )
       try:
+        let node = await CodexProcess.startNode(
+          config.cliArgs, config.debugEnabled, $role & $roleIdx
+        )
         await node.waitUntilStarted()
         trace "node started", nodeName = $role & $roleIdx
+        return node
       except CodexConfigError as e:
         raiseMultiNodeSuiteError "failed to get cli args from config: " & e.msg, e
       except NodeProcessError as e:
-        raiseMultiNodeSuiteError "node not started, error: " & e.msg
-      return node
+        raiseMultiNodeSuiteError "node not started, error: " & e.msg, e
 
     proc hardhat(): HardhatProcess =
       for r in running:
@@ -208,7 +241,9 @@ template multinodesuite*(name: string, body: untyped) =
         if r.role == Role.Validator:
           CodexProcess(r.node)
 
-    proc startHardhatNode(config: HardhatConfig): Future[NodeProcess] {.async.} =
+    proc startHardhatNode(
+        config: HardhatConfig
+    ): Future[NodeProcess] {.async: (raises: [MultiNodeSuiteError, CancelledError]).} =
       return await newHardhatProcess(config, Role.Hardhat)
 
     proc startClientNode(conf: CodexConfig): Future[NodeProcess] {.async.} =
@@ -220,44 +255,63 @@ template multinodesuite*(name: string, body: untyped) =
       )
       return await newCodexProcess(clientIdx, config, Role.Client)
 
-    proc startProviderNode(conf: CodexConfig): Future[NodeProcess] {.async.} =
-      let providerIdx = providers().len
-      var config = conf
-      config.addCliOption(StartUpCmd.persistence, "--eth-provider", jsonRpcProviderUrl)
-      config.addCliOption(
-        StartUpCmd.persistence, "--eth-account", $accounts[running.len]
-      )
-      config.addCliOption(
-        PersistenceCmd.prover, "--circom-r1cs",
-        "vendor/codex-contracts-eth/verifier/networks/hardhat/proof_main.r1cs",
-      )
-      config.addCliOption(
-        PersistenceCmd.prover, "--circom-wasm",
-        "vendor/codex-contracts-eth/verifier/networks/hardhat/proof_main.wasm",
-      )
-      config.addCliOption(
-        PersistenceCmd.prover, "--circom-zkey",
-        "vendor/codex-contracts-eth/verifier/networks/hardhat/proof_main.zkey",
-      )
+    proc startProviderNode(
+        conf: CodexConfig
+    ): Future[NodeProcess] {.async: (raises: [MultiNodeSuiteError, CancelledError]).} =
+      try:
+        let providerIdx = providers().len
+        var config = conf
+        config.addCliOption(
+          StartUpCmd.persistence, "--eth-provider", jsonRpcProviderUrl
+        )
+        config.addCliOption(
+          StartUpCmd.persistence, "--eth-account", $accounts[running.len]
+        )
+        config.addCliOption(
+          PersistenceCmd.prover, "--circom-r1cs",
+          "vendor/codex-contracts-eth/verifier/networks/hardhat/proof_main.r1cs",
+        )
+        config.addCliOption(
+          PersistenceCmd.prover, "--circom-wasm",
+          "vendor/codex-contracts-eth/verifier/networks/hardhat/proof_main.wasm",
+        )
+        config.addCliOption(
+          PersistenceCmd.prover, "--circom-zkey",
+          "vendor/codex-contracts-eth/verifier/networks/hardhat/proof_main.zkey",
+        )
 
-      return await newCodexProcess(providerIdx, config, Role.Provider)
+        return await newCodexProcess(providerIdx, config, Role.Provider)
+      except CodexConfigError as e:
+        raiseMultiNodeSuiteError "Failed to start codex node, error adding cli options: " &
+          e.msg, e
 
-    proc startValidatorNode(conf: CodexConfig): Future[NodeProcess] {.async.} =
-      let validatorIdx = validators().len
-      var config = conf
-      config.addCliOption(StartUpCmd.persistence, "--eth-provider", jsonRpcProviderUrl)
-      config.addCliOption(
-        StartUpCmd.persistence, "--eth-account", $accounts[running.len]
-      )
-      config.addCliOption(StartUpCmd.persistence, "--validator")
+    proc startValidatorNode(
+        conf: CodexConfig
+    ): Future[NodeProcess] {.async: (raises: [MultiNodeSuiteError, CancelledError]).} =
+      try:
+        let validatorIdx = validators().len
+        var config = conf
+        config.addCliOption(
+          StartUpCmd.persistence, "--eth-provider", jsonRpcProviderUrl
+        )
+        config.addCliOption(
+          StartUpCmd.persistence, "--eth-account", $accounts[running.len]
+        )
+        config.addCliOption(StartUpCmd.persistence, "--validator")
 
-      return await newCodexProcess(validatorIdx, config, Role.Validator)
+        return await newCodexProcess(validatorIdx, config, Role.Validator)
+      except CodexConfigError as e:
+        raiseMultiNodeSuiteError "Failed to start validator node, error adding cli options: " &
+          e.msg, e
 
-    proc teardownImpl() {.async.} =
+    proc teardownImpl() {.async: (raises: []).} =
       for nodes in @[validators(), clients(), providers()]:
         for node in nodes:
           await node.stop() # also stops rest client
-          node.removeDataDir()
+          try:
+            node.removeDataDir()
+          except CodexProcessError as e:
+            error "Failed to remove data dir during teardown", error = e.msg
 
       # if hardhat was started in the test, kill the node
       # otherwise revert the snapshot taken in the test setup
@@ -265,7 +319,10 @@ template multinodesuite*(name: string, body: untyped) =
       if not hardhat.isNil:
         await hardhat.stop()
       else:
-        discard await send(ethProvider, "evm_revert", @[snapshot])
+        try:
+          discard await noCancel send(ethProvider, "evm_revert", @[snapshot])
+        except ProviderError as e:
+          error "Failed to revert hardhat state during teardown", error = e.msg
 
       await ethProvider.close()
@@ -274,6 +331,8 @@ template multinodesuite*(name: string, body: untyped) =
     template failAndTeardownOnError(message: string, tryBody: untyped) =
       try:
         tryBody
+      except CancelledError as e:
+        raise e
       except CatchableError as er:
         fatal message, error = er.msg
         echo "[FATAL] ", message, ": ", er.msg
@@ -285,18 +344,35 @@ template multinodesuite*(name: string, body: untyped) =
     proc updateBootstrapNodes(
         node: CodexProcess
-    ): Future[void] {.async: (raises: [CatchableError]).} =
-      without ninfo =? await node.client.info():
-        # raise CatchableError instead of Defect (with .get or !) so we
-        # can gracefully shutdown and prevent zombies
-        raiseMultiNodeSuiteError "Failed to get node info"
-      bootstrapNodes.add ninfo["spr"].getStr()
+    ): Future[void] {.async: (raises: [MultiNodeSuiteError]).} =
+      try:
+        without ninfo =? await node.client.info():
+          # raise CatchableError instead of Defect (with .get or !) so we
+          # can gracefully shutdown and prevent zombies
+          raiseMultiNodeSuiteError "Failed to get node info"
+        bootstrapNodes.add ninfo["spr"].getStr()
+      except CatchableError as e:
+        raiseMultiNodeSuiteError "Failed to get node info: " & e.msg, e
 
+    setupAll:
+      proc raiseOnTimeout() {.async: (raises: [CancelledError, SuiteTimeoutError]).} =
+        await sleepAsync(chronos.seconds(10))
+        raise newException(SuiteTimeoutError, "suite timed out")
+
+      failAndTeardownOnError "suite timed out":
+        futTimeout = raiseOnTimeout()
+
+    teardownAll:
+      await futTimeout.cancelAndWait()
+
     setup:
       if var conf =? nodeConfigs.hardhat:
         try:
-          let node = await startHardhatNode(conf)
+          let node = await noCancel startHardhatNode(conf)
           running.add RunningNode(role: Role.Hardhat, node: node)
+        except CancelledError as e:
+          # should not happen because of noCancel, but added for clarity
+          raise e
         except CatchableError as e:
           echo "failed to start hardhat node"
           fail()
@@ -312,6 +388,8 @@ template multinodesuite*(name: string, body: untyped) =
         if nodeConfigs.hardhat.isNone:
           snapshot = await send(ethProvider, "evm_snapshot")
         accounts = await ethProvider.listAccounts()
+      except CancelledError as e:
+        raise e
       except CatchableError as e:
         echo "Hardhat not running. Run hardhat manually " &
           "before executing tests, or include a " & "HardhatConfig in the test setup."
@@ -321,21 +399,21 @@ template multinodesuite*(name: string, body: untyped) =
       if var clients =? nodeConfigs.clients:
         failAndTeardownOnError "failed to start client nodes":
           for config in clients.configs:
-            let node = await startClientNode(config)
+            let node = await noCancel startClientNode(config)
             running.add RunningNode(role: Role.Client, node: node)
             await CodexProcess(node).updateBootstrapNodes()
 
       if var providers =? nodeConfigs.providers:
         failAndTeardownOnError "failed to start provider nodes":
           for config in providers.configs.mitems:
-            let node = await startProviderNode(config)
+            let node = await noCancel startProviderNode(config)
             running.add RunningNode(role: Role.Provider, node: node)
             await CodexProcess(node).updateBootstrapNodes()
 
       if var validators =? nodeConfigs.validators:
         failAndTeardownOnError "failed to start validator nodes":
           for config in validators.configs.mitems:
-            let node = await startValidatorNode(config)
+            let node = await noCancel startValidatorNode(config)
             running.add RunningNode(role: Role.Validator, node: node)
 
       # ensure that we have a recent block with a fresh timestamp
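
Both setup and teardown in this file lean on chronos's noCancel: node startup and the evm_revert call are shielded so that cancelling the surrounding test future cannot abandon a half-started process or skip cleanup. A minimal sketch of the idea, with a hypothetical doCleanup step standing in for stopping a node or reverting state:

```nim
import pkg/chronos

proc doCleanup(): Future[void] {.async: (raises: [CancelledError]).} =
  await sleepAsync(5.milliseconds) # stands in for node shutdown, evm_revert, etc.

proc teardown() {.async: (raises: []).} =
  # noCancel strips CancelledError from the awaited future's raises list:
  # even if teardown() itself is cancelled, the cleanup step runs to completion.
  await noCancel doCleanup()
```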

View File

@@ -47,7 +47,7 @@ method outputLineEndings(node: NodeProcess): string {.base, gcsafe.} =
 method onOutputLineCaptured(node: NodeProcess, line: string) {.base, gcsafe.} =
   raiseAssert "not implemented"
 
-method start*(node: NodeProcess) {.base, async.} =
+method start*(node: NodeProcess) {.base, async: (raises: [CancelledError]).} =
  logScope:
    nodeName = node.name
@@ -104,7 +104,7 @@ proc captureOutput(
 proc startNode*[T: NodeProcess](
     _: type T, args: seq[string], debug: string | bool = false, name: string
-): Future[T] {.async.} =
+): Future[T] {.async: (raises: [CancelledError]).} =
   ## Starts a Codex Node with the specified arguments.
   ## Set debug to 'true' to see output of the node.
   let node = T(
@@ -116,7 +116,9 @@ proc startNode*[T: NodeProcess](
   await node.start()
   return node
 
-method stop*(node: NodeProcess, expectedErrCode: int = -1) {.base, async.} =
+method stop*(
+    node: NodeProcess, expectedErrCode: int = -1
+) {.base, async: (raises: []).} =
  logScope:
    nodeName = node.name
@@ -124,16 +126,14 @@ method stop*(node: NodeProcess, expectedErrCode: int = -1) {.base, async.} =
   if not node.process.isNil:
     trace "terminating node process..."
     try:
-      let exitCode = await node.process.terminateAndWaitForExit(2.seconds)
+      let exitCode = await noCancel node.process.terminateAndWaitForExit(2.seconds)
       if exitCode > 0 and exitCode != 143 and # 143 = SIGTERM (initiated above)
           exitCode != expectedErrCode:
         error "process exited with a non-zero exit code", exitCode
       trace "node stopped", exitCode
-    except CancelledError as error:
-      raise error
     except CatchableError:
       try:
-        let forcedExitCode = await node.process.killAndWaitForExit(3.seconds)
+        let forcedExitCode = await noCancel node.process.killAndWaitForExit(3.seconds)
         trace "node process forcibly killed with exit code: ", exitCode = forcedExitCode
       except CatchableError as e:
         error "failed to kill node process in time, it will be killed when the parent process exits",
@@ -148,7 +148,9 @@ method stop*(node: NodeProcess, expectedErrCode: int = -1) {.base, async.} =
   asyncSpawn closeProcessStreams()
 
-proc waitUntilOutput*(node: NodeProcess, output: string) {.async.} =
+proc waitUntilOutput*(
+    node: NodeProcess, output: string
+) {.async: (raises: [CancelledError, AsyncTimeoutError]).} =
  logScope:
    nodeName = node.name
@@ -158,9 +160,18 @@ proc waitUntilOutput*(node: NodeProcess, output: string) {.async.} =
   let fut = node.captureOutput(output, started)
   node.trackedFutures.track(fut)
   asyncSpawn fut
-  await started.wait(60.seconds) # allow enough time for proof generation
+  try:
+    await started.wait(60.seconds) # allow enough time for proof generation
+  except AsyncTimeoutError as e:
+    raise e
+  except CancelledError as e:
+    raise e
+  except CatchableError as e: # unsure where this originates from
+    error "unexpected error occurred waiting for node output", error = e.msg
 
-proc waitUntilStarted*(node: NodeProcess) {.async.} =
+proc waitUntilStarted*(
+    node: NodeProcess
+) {.async: (raises: [CancelledError, NodeProcessError]).} =
  logScope:
    nodeName = node.name
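
The reworked waitUntilOutput also shows why its catch-all branch exists: wait(fut, timeout) contributes AsyncTimeoutError, but the awaited future is a plain untracked Future, so the compiler assumes it may raise any CatchableError, and a final catch-all is needed before the proc can declare a closed raises list. A standalone sketch with a hypothetical started future:

```nim
import pkg/chronos

proc boundedWait(
    started: Future[void]
) {.async: (raises: [CancelledError, AsyncTimeoutError]).} =
  try:
    # wait() races `started` against a deadline and raises
    # AsyncTimeoutError if the deadline wins.
    await started.wait(60.seconds)
  except AsyncTimeoutError as e:
    raise e # listed in the annotation, so it may propagate
  except CancelledError as e:
    raise e
  except CatchableError as e:
    # a plain Future[void] has an open exception set; swallow the rest here
    echo "unexpected error while waiting: ", e.msg
```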