fix(ci): introduce a number of integration test fixes (#1342)

Signed-off-by: Slava <20563034+veaceslavdoina@users.noreply.github.com>
Co-authored-by: Slava <20563034+veaceslavdoina@users.noreply.github.com>
Co-authored-by: Arnaud <arnaud@status.im>
Co-authored-by: gmega <giuliano.mega@gmail.com>
Eric 2026-01-16 21:47:59 +11:00 committed by GitHub
parent cce002fcbf
commit 1acedcf71c
25 changed files with 830 additions and 246 deletions

View File

@ -29,6 +29,7 @@ runs:
shell: ${{ inputs.shell }} {0}
run: |
curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs/ | sh -s -- --default-toolchain=${{ inputs.rust_version }} -y
echo "$HOME/.cargo/bin" >> "$GITHUB_PATH"
- name: APT (Linux amd64/arm64)
if: inputs.os == 'linux' && (inputs.cpu == 'amd64' || inputs.cpu == 'arm64')
@ -83,7 +84,7 @@ runs:
- name: Install gcc 14 on Linux
# We don't want to install gcc 14 for coverage (Ubuntu 20.04)
if : ${{ inputs.os == 'linux' && inputs.coverage != 'true' }}
if: ${{ inputs.os == 'linux' && inputs.coverage != 'true' }}
shell: ${{ inputs.shell }} {0}
run: |
# Skip for older Ubuntu versions
@ -101,15 +102,22 @@ runs:
if: inputs.os == 'linux' || inputs.os == 'macos'
uses: hendrikmuhs/ccache-action@v1.2
with:
create-symlink: true
key: ${{ inputs.os }}-${{ inputs.builder }}-${{ inputs.cpu }}-${{ inputs.tests }}-${{ inputs.nim_version }}
create-symlink: false
key: ${{ inputs.os }}-${{ inputs.builder }}-${{ inputs.cpu }}-${{ inputs.tests }}-${{ inputs.nim_version }}-${{ github.run_id }}-${{ github.run_number }}-${{ github.run_attempt }}
evict-old-files: 7d
- name: Add ccache to path on Linux/Mac
if: inputs.os == 'linux' || inputs.os == 'macos'
shell: ${{ inputs.shell }} {0}
run: |
echo "/usr/lib/ccache:/usr/local/opt/ccache/libexec" >> "$GITHUB_PATH"
echo "/usr/local/opt/ccache/libexec" >> "$GITHUB_PATH"
- name: Install ccache on Windows
if: inputs.os == 'windows'
uses: hendrikmuhs/ccache-action@v1.2
with:
key: ${{ inputs.os }}-${{ inputs.builder }}-${{ inputs.cpu }}-${{ inputs.tests }}-${{ inputs.nim_version }}
key: ${{ inputs.os }}-${{ inputs.builder }}-${{ inputs.cpu }}-${{ inputs.tests }}-${{ inputs.nim_version }}-${{ github.run_id }}-${{ github.run_number }}-${{ github.run_attempt }}
evict-old-files: 7d
- name: Enable ccache on Windows
@ -117,11 +125,11 @@ runs:
shell: ${{ inputs.shell }} {0}
run: |
CCACHE_DIR=$(dirname $(which ccache))/ccached
mkdir ${CCACHE_DIR}
ln -s $(which ccache) ${CCACHE_DIR}/gcc.exe
ln -s $(which ccache) ${CCACHE_DIR}/g++.exe
ln -s $(which ccache) ${CCACHE_DIR}/cc.exe
ln -s $(which ccache) ${CCACHE_DIR}/c++.exe
mkdir -p ${CCACHE_DIR}
ln -sf $(which ccache) ${CCACHE_DIR}/gcc.exe
ln -sf $(which ccache) ${CCACHE_DIR}/g++.exe
ln -sf $(which ccache) ${CCACHE_DIR}/cc.exe
ln -sf $(which ccache) ${CCACHE_DIR}/c++.exe
echo "export PATH=${CCACHE_DIR}:\$PATH" >> $HOME/.bash_profile # prefix path in MSYS2
- name: Derive environment variables
@ -202,10 +210,10 @@ runs:
- name: Restore Nim toolchain binaries from cache
id: nim-cache
uses: actions/cache@v4
if : ${{ inputs.coverage != 'true' }}
if: ${{ inputs.coverage != 'true' }}
with:
path: NimBinaries
key: ${{ inputs.os }}-${{ inputs.cpu }}-nim-${{ inputs.nim_version }}-cache-${{ env.cache_nonce }}-${{ github.run_id }}
key: ${{ inputs.os }}-${{ inputs.cpu }}-nim-${{ inputs.nim_version }}-cache-${{ env.cache_nonce }}-${{ github.run_id }}-${{ github.run_number }}-${{ github.run_attempt }}
restore-keys: ${{ inputs.os }}-${{ inputs.cpu }}-nim-${{ inputs.nim_version }}-cache-${{ env.cache_nonce }}
- name: Set NIM_COMMIT

View File

@ -54,13 +54,20 @@ jobs:
with:
node-version: 22
- name: Start Ethereum node with Logos Storage contracts
- name: Install Ethereum node dependencies
if: matrix.tests == 'contract' || matrix.tests == 'integration' || matrix.tests == 'tools' || matrix.tests == 'all'
working-directory: vendor/logos-storage-contracts-eth
env:
MSYS2_PATH_TYPE: inherit
run: |
npm ci
- name: Run Ethereum node with Logos Storage contracts
if: matrix.tests == 'contract' || matrix.tests == 'integration' || matrix.tests == 'tools' || matrix.tests == 'all'
working-directory: vendor/logos-storage-contracts-eth
env:
MSYS2_PATH_TYPE: inherit
run: |
npm start &
# Wait for the contracts to be deployed
sleep 5
@ -75,7 +82,7 @@ jobs:
if: matrix.tests == 'integration' || matrix.tests == 'all'
env:
CODEX_INTEGRATION_TEST_INCLUDES: ${{ matrix.includes }}
run: make -j${ncpu} testIntegration
run: make -j${ncpu} DEBUG=${{ runner.debug }} testIntegration
- name: Upload integration tests log files
uses: actions/upload-artifact@v4

View File

@ -18,6 +18,7 @@ concurrency:
jobs:
matrix:
name: Compute matrix
runs-on: ubuntu-latest
outputs:
matrix: ${{ steps.matrix.outputs.matrix }}

View File

@ -140,10 +140,24 @@ testContracts: | build deps
echo -e $(BUILD_MSG) "build/$@" && \
$(ENV_SCRIPT) nim testContracts $(NIM_PARAMS) --define:ws_resubscribe=240 build.nims
TEST_PARAMS :=
ifdef DEBUG
TEST_PARAMS := $(TEST_PARAMS) -d:DebugTestHarness=$(DEBUG)
TEST_PARAMS := $(TEST_PARAMS) -d:NoCodexLogFilters=$(DEBUG)
TEST_PARAMS := $(TEST_PARAMS) -d:ShowContinuousStatusUpdates=$(DEBUG)
TEST_PARAMS := $(TEST_PARAMS) -d:DebugHardhat=$(DEBUG)
endif
ifdef TEST_TIMEOUT
TEST_PARAMS := $(TEST_PARAMS) -d:TestTimeout=$(TEST_TIMEOUT)
endif
ifdef PARALLEL
TEST_PARAMS := $(TEST_PARAMS) -d:EnableParallelTests=$(PARALLEL)
endif
# Builds and runs the integration tests
testIntegration: | build deps
echo -e $(BUILD_MSG) "build/$@" && \
$(ENV_SCRIPT) nim testIntegration $(NIM_PARAMS) --define:ws_resubscribe=240 build.nims
$(ENV_SCRIPT) nim testIntegration $(TEST_PARAMS) $(NIM_PARAMS) --define:ws_resubscribe=240 build.nims
# Builds and runs all tests (except for Taiko L2 tests)
testAll: | build deps

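For local runs, the new TEST_PARAMS switches are ordinary make variables, so they can be set on the command line in the same way the CI workflow above passes DEBUG=${{ runner.debug }}. A minimal sketch (values are illustrative, not defaults):

    # verbose harness/hardhat output, a custom TestTimeout define, and parallel suites
    make -j4 DEBUG=true TEST_TIMEOUT=3600 PARALLEL=true testIntegration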
View File

@ -1,9 +1,20 @@
mode = ScriptMode.Verbose
import std/os except commandLineParams
import std/strutils
### Helper functions
proc buildBinary(srcName: string, outName = os.lastPathPart(srcName), srcDir = "./", params = "", lang = "c") =
proc truthy(val: string): bool =
const truthySwitches = @["yes", "1", "on", "true"]
return val in truthySwitches
proc buildBinary(
srcName: string,
outName = os.lastPathPart(srcName),
srcDir = "./",
params = "",
lang = "c",
) =
if not dirExists "build":
mkDir "build"
@ -43,10 +54,8 @@ proc buildLibrary(name: string, srcDir = "./", params = "", `type` = "dynamic")
exec "nim c" & " --out:build/" & name &
".a --threads:on --app:staticlib --opt:size --noMain --mm:refc --header --d:metrics " &
"--nimMainPrefix:libstorage -d:noSignalHandler " &
"-d:LeopardExtraCompilerFlags=-fPIC " &
"-d:chronicles_runtime_filtering " &
"-d:chronicles_log_level=TRACE " &
params & " " & srcDir & name & ".nim"
"-d:LeopardExtraCompilerFlags=-fPIC " & "-d:chronicles_runtime_filtering " &
"-d:chronicles_log_level=TRACE " & params & " " & srcDir & name & ".nim"
proc test(name: string, outName = name, srcDir = "tests/", params = "", lang = "c") =
buildBinary name, outName, srcDir, params
@ -61,7 +70,8 @@ task toolsCirdl, "build tools/cirdl binary":
buildBinary "tools/cirdl/cirdl"
task testStorage, "Build & run Logos Storage tests":
test "testCodex", outName = "testStorage", params = "-d:storage_enable_proof_failures=true"
test "testCodex",
outName = "testStorage", params = "-d:storage_enable_proof_failures=true"
task testContracts, "Build & run Logos Storage Contract tests":
test "testContracts"
@ -70,11 +80,18 @@ task testIntegration, "Run integration tests":
buildBinary "codex",
outName = "storage",
params =
"-d:chronicles_runtime_filtering -d:chronicles_log_level=TRACE -d:storage_enable_proof_failures=true"
test "testIntegration"
"-d:chronicles_runtime_filtering -d:chronicles_log_level=TRACE -d:chronicles_disabled_topics=JSONRPC-HTTP-CLIENT,websock,libp2p,discv5 -d:codex_enable_proof_failures=true"
var sinks = @["textlines[nocolors,file]"]
for i in 2 ..< paramCount():
if "DebugTestHarness" in paramStr(i) and truthy paramStr(i).split('=')[1]:
sinks.add "textlines[stdout]"
break
var testParams =
"-d:chronicles_log_level=TRACE -d:chronicles_sinks=\"" & sinks.join(",") & "\""
test "testIntegration", params = testParams
# use params to enable logging from the integration test executable
# test "testIntegration", params = "-d:chronicles_sinks=textlines[notimestamps,stdout],textlines[dynamic] " &
# "-d:chronicles_enabled_topics:integration:TRACE"
# "-d:chronicles_enabled_topics:integration:TRACE"
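When the integration tests are driven through the nimscript task directly rather than via make, the same define is honoured: the task scans its parameters for DebugTestHarness and, when the value is truthy (yes/1/on/true), adds a stdout sink next to the file sink. A sketch, assuming the repo's env.sh wrapper (the Makefile's ENV_SCRIPT) and that the build script forwards command-line defines to the compile, as the Makefile's TEST_PARAMS path relies on:

    # mirror test-harness logs to stdout in addition to the log file
    ./env.sh nim testIntegration -d:DebugTestHarness=true build.nims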
task build, "build Logos Storage binary":
storageTask()
@ -139,7 +156,9 @@ task coverage, "generates code coverage report":
nimSrcs
)
echo " ======== Generating HTML coverage report ======== "
exec("genhtml coverage/coverage.f.info --keep-going --output-directory coverage/report ")
exec(
"genhtml coverage/coverage.f.info --keep-going --output-directory coverage/report "
)
echo " ======== Coverage report Done ======== "
task showCoverage, "open coverage html":

View File

@ -4,6 +4,6 @@ description = "p2p data durability engine"
license = "MIT"
binDir = "build"
srcDir = "."
installFiles = @["build.nims"]
installFiles = @["build.nims"]
include "build.nims"

View File

@ -92,6 +92,7 @@ import std/sugar
import std/typetraits
import pkg/chronicles except toJson, `%`
from pkg/chronos import TransportAddress
from pkg/libp2p import Cid, MultiAddress, `$`
import pkg/questionable
import pkg/questionable/results
@ -255,3 +256,5 @@ formatIt(LogFormat.textLines, array[32, byte]):
it.short0xHexLog
formatIt(LogFormat.json, array[32, byte]):
it.to0xHex
formatIt(TransportAddress):
$it

View File

@ -65,8 +65,8 @@ else:
# https://gcc.gnu.org/bugzilla/show_bug.cgi?id=65782
# ("-fno-asynchronous-unwind-tables" breaks Nim's exception raising, sometimes)
switch("passC", "-march=x86-64")
else: switch("passC", "-march=native")
else:
switch("passC", "-march=native")
--tlsEmulation:
off

View File

@ -1,3 +1,13 @@
import pkg/asynctest/chronos/unittest2
export unittest2
export unittest2 except eventually
template eventuallySafe*(
expression: untyped, timeout = 5000, pollInterval = 1000
): bool =
## More sane defaults, for use with HTTP connections
eventually(expression, timeout, pollInterval)
template eventually*(expression: untyped, timeout = 5000, pollInterval = 10): bool =
## Fast defaults, do not use with HTTP connections!
eventually(expression, timeout, pollInterval)

View File

@ -77,15 +77,13 @@ asyncchecksuite "Test proof sampler utils":
)
proc getExpectedIndices(n: int): seq[Natural] =
return collect(
newSeq,
return collect(newSeq):
(;
for i in 1 .. n:
cellIndex(
proofInput.entropy, proofInput.slotRoot, proofInput.nCellsPerSlot, i
)
),
)
)
check:
slotCellIndices(3) == getExpectedIndices(3)

View File

@ -5,6 +5,8 @@ import pkg/chronos
import ./asynctest
import ./checktest
const HardhatPort {.intdefine.}: int = 8545
## Unit testing suite that sets up an Ethereum testing environment.
## Injects an `ethProvider` instance and a list of `accounts`.
## Calls the `evm_snapshot` and `evm_revert` methods to ensure that any
@ -16,7 +18,7 @@ template ethersuite*(name, body) =
var snapshot: JsonNode
setup:
ethProvider = JsonRpcProvider.new("ws://localhost:8545")
ethProvider = JsonRpcProvider.new("ws://localhost:" & $HardhatPort)
snapshot = await send(ethProvider, "evm_snapshot")
accounts = await ethProvider.listAccounts()
teardown:

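Because HardhatPort is a compile-time {.intdefine.}, ethersuite-based tests can target a Hardhat node on a non-default port by passing the define when the test binary is built. A sketch (port value illustrative, env.sh being the Makefile's ENV_SCRIPT):

    ./env.sh nim testContracts -d:HardhatPort=8546 build.nims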
View File

@ -4,6 +4,8 @@ import helpers/templeveldb
import std/times
import std/sequtils, chronos
import ./asynctest
export multisetup, trackers, templeveldb
### taken from libp2p errorhelpers.nim

View File

@ -1,31 +1,91 @@
import std/tempfiles
import std/times
import codex/conf
import codex/utils/fileutils
import ../../asynctest
import ../../checktest
import ../codexprocess
import ../nodeprocess
import ../utils
import ../../examples
const HardhatPort {.intdefine.}: int = 8545
const CodexApiPort {.intdefine.}: int = 8080
const CodexDiscPort {.intdefine.}: int = 8090
const CodexLogToFile {.booldefine.}: bool = false
const CodexLogLevel {.strdefine.}: string = ""
const CodexLogsDir {.strdefine.}: string = ""
asyncchecksuite "Command line interface":
let startTime = now().format("yyyy-MM-dd'_'HH:mm:ss")
let key = "4242424242424242424242424242424242424242424242424242424242424242"
proc startCodex(args: seq[string]): Future[CodexProcess] {.async.} =
return await CodexProcess.startNode(args, false, "cli-test-node")
var currentTestName = ""
var testCount = 0
var nodeCount = 0
template test(tname, tbody) =
inc testCount
currentTestName = tname
test tname:
tbody
proc addLogFile(args: seq[string]): seq[string] =
var args = args
when CodexLogToFile:
args.add(
"--log-file=" &
getLogFile(
CodexLogsDir,
startTime,
"Command line interface",
currentTestName,
"Client",
some nodeCount mod testCount,
)
)
when CodexLogLevel != "":
args.add "--log-level=" & CodexLogLevel
return args
proc startCodex(arguments: seq[string]): Future[CodexProcess] {.async.} =
inc nodeCount
let args = arguments.addLogFile
return await CodexProcess.startNode(
args.concat(
@[
"--api-port=" & $(await nextFreePort(CodexApiPort + nodeCount)),
"--disc-port=" & $(await nextFreePort(CodexDiscPort + nodeCount)),
]
),
debug = false,
"cli-test-node",
)
test "complains when persistence is enabled without ethereum account":
let node = await startCodex(@["persistence"])
await node.waitUntilOutput("Persistence enabled, but no Ethereum account was set")
await node.stop()
# Expect the codex process to return an exit code of 1 indicating the result
# of the operation was unsuccessful.
await node.stop(expectedExitCode = 1)
test "complains when ethereum private key file has wrong permissions":
let unsafeKeyFile = genTempPath("", "")
discard unsafeKeyFile.writeFile(key, 0o666)
let node = await startCodex(@["persistence", "--eth-private-key=" & unsafeKeyFile])
let node = await startCodex(
@[
"persistence",
"--eth-provider=" & "ws://localhost:" & $HardhatPort,
"--eth-private-key=" & unsafeKeyFile,
]
)
await node.waitUntilOutput(
"Ethereum private key file does not have safe file permissions"
)
await node.stop()
# Expect the codex process to return an exit code of 1 indicating the result
# of the operation was unsuccessful.
await node.stop(expectedExitCode = 1)
discard removeFile(unsafeKeyFile)
let
@ -36,25 +96,37 @@ asyncchecksuite "Command line interface":
test "suggests downloading of circuit files when persistence is enabled without accessible r1cs file":
let node = await startCodex(@["persistence", "prover", marketplaceArg])
await node.waitUntilOutput(expectedDownloadInstruction)
await node.stop()
# Expect the codex process to return an exit code of 1 indicating the result
# of the operation was unsuccessful.
await node.stop(expectedExitCode = 1)
test "suggests downloading of circuit files when persistence is enabled without accessible wasm file":
let node = await startCodex(
@[
"persistence", "prover", marketplaceArg,
"persistence",
"--eth-provider=" & "ws://localhost:" & $HardhatPort,
"prover",
marketplaceArg,
"--circom-r1cs=tests/circuits/fixtures/proof_main.r1cs",
]
)
await node.waitUntilOutput(expectedDownloadInstruction)
await node.stop()
# Expect the codex process to return an exit code of 1 indicating the result
# of the operation was unsuccessful.
await node.stop(expectedExitCode = 1)
test "suggests downloading of circuit files when persistence is enabled without accessible zkey file":
let node = await startCodex(
@[
"persistence", "prover", marketplaceArg,
"persistence",
"--eth-provider=" & "ws://localhost:" & $HardhatPort,
"prover",
marketplaceArg,
"--circom-r1cs=tests/circuits/fixtures/proof_main.r1cs",
"--circom-wasm=tests/circuits/fixtures/proof_main.wasm",
]
)
await node.waitUntilOutput(expectedDownloadInstruction)
await node.stop()
# Expect the codex process to return an exit code of 1 indicating the result
# of the operation was unsuccessful.
await node.stop(expectedExitCode = 1)

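The constants introduced at the top of this suite (HardhatPort, CodexApiPort, CodexDiscPort, CodexLogToFile, CodexLogLevel, CodexLogsDir) are plain Nim defines, so a local run can relocate ports and capture per-node log files without editing the test. A sketch with illustrative values, assuming command-line defines are forwarded to the compile as in the Makefile's TEST_PARAMS path:

    ./env.sh nim testIntegration \
      -d:HardhatPort=8546 -d:CodexApiPort=8180 -d:CodexDiscPort=8190 \
      -d:CodexLogToFile=true -d:CodexLogsDir=/tmp/storage-test-logs \
      build.nims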
View File

@ -9,12 +9,12 @@ marketplacesuite(
):
test "should be able to create storage request and download dataset",
NodeConfigs(
clients: CodexConfigs
.init(nodes = 1)
# .debug() # uncomment to enable console log output.debug()
.withLogFile()
# uncomment to output log file to tests/integration/logs/<start_datetime> <suite_name>/<test_name>/<node_role>_<node_idx>.log
.withLogTopics("node", "erasure", "marketplace").some,
clients: CodexConfigs.init(nodes = 1)
# .debug() # uncomment to enable console log output.debug()
# .withLogFile()
# uncomment to output log file to tests/integration/logs/<start_datetime> <suite_name>/<test_name>/<node_role>_<node_idx>.log
# .withLogTopics("node", "erasure", "marketplace")
.some,
providers: CodexConfigs.init(nodes = 0).some,
):
let

View File

@ -6,6 +6,7 @@ import ../../contracts/deployment
import ./../marketplacesuite
import ../twonodes
import ../nodeconfigs
from ../../helpers import eventuallySafe
marketplacesuite(name = "Marketplace", stopOnRequestFail = true):
let marketplaceConfig = NodeConfigs(
@ -122,11 +123,11 @@ marketplacesuite(name = "Marketplace", stopOnRequestFail = true):
# Checking that the hosting node received reward for at least the time between <expiry;end>
let slotSize = slotSize(blocks, ecNodes, ecTolerance)
let pricePerSlotPerSecond = minPricePerBytePerSecond * slotSize
check eventually (await token.balanceOf(hostAccount)) - startBalanceHost >=
check eventuallySafe (await token.balanceOf(hostAccount)) - startBalanceHost >=
(duration - 5 * 60).u256 * pricePerSlotPerSecond * ecNodes.u256
# Checking that client node receives some funds back that were not used for the host nodes
check eventually(
check eventuallySafe(
(await token.balanceOf(clientAccount)) - clientBalanceBeforeFinished > 0,
timeout = 10 * 1000, # give client a bit of time to withdraw its funds
)
@ -296,12 +297,12 @@ marketplacesuite(name = "Marketplace payouts", stopOnRequestFail = true):
let slotSize = slotSize(blocks, ecNodes, ecTolerance)
let pricePerSlotPerSecond = minPricePerBytePerSecond * slotSize
check eventually (
check eventuallySafe (
let endBalanceProvider = (await token.balanceOf(provider.ethAccount))
endBalanceProvider > startBalanceProvider and
endBalanceProvider < startBalanceProvider + expiry.u256 * pricePerSlotPerSecond
)
check eventually(
check eventuallySafe(
(
let endBalanceClient = (await token.balanceOf(client.ethAccount))
let endBalanceProvider = (await token.balanceOf(provider.ethAccount))

View File

@ -28,12 +28,12 @@ marketplacesuite(name = "Validation", stopOnRequestFail = false):
NodeConfigs(
# Uncomment to start Hardhat automatically, typically so logs can be inspected locally
hardhat: HardhatConfig.none,
clients: CodexConfigs
.init(nodes = 1)
# .debug() # uncomment to enable console log output
.withLogFile()
# uncomment to output log file to tests/integration/logs/<start_datetime> <suite_name>/<test_name>/<node_role>_<node_idx>.log
.withLogTopics("purchases", "onchain").some,
clients: CodexConfigs.init(nodes = 1)
# .debug() # uncomment to enable console log output
# .withLogFile()
# uncomment to output log file to tests/integration/logs/<start_datetime> <suite_name>/<test_name>/<node_role>_<node_idx>.log
# .withLogTopics("purchases", "onchain")
.some,
providers: CodexConfigs
.init(nodes = 1)
.withSimulateProofFailures(idx = 0, failEveryNProofs = 1)
@ -47,9 +47,9 @@ marketplacesuite(name = "Validation", stopOnRequestFail = false):
.withValidationGroupIndex(idx = 0, groupIndex = 0)
.withValidationGroupIndex(idx = 1, groupIndex = 1)
# .debug() # uncomment to enable console log output
.withLogFile()
# .withLogFile()
# uncomment to output log file to tests/integration/logs/<start_datetime> <suite_name>/<test_name>/<node_role>_<node_idx>.log
.withLogTopics("validator")
# .withLogTopics("validator")
# each topic as a separate string argument
.some,
):
@ -100,12 +100,12 @@ marketplacesuite(name = "Validation", stopOnRequestFail = false):
NodeConfigs(
# Uncomment to start Hardhat automatically, typically so logs can be inspected locally
hardhat: HardhatConfig.none,
clients: CodexConfigs
.init(nodes = 1)
# .debug() # uncomment to enable console log output
.withLogFile()
# uncomment to output log file to tests/integration/logs/<start_datetime> <suite_name>/<test_name>/<node_role>_<node_idx>.log
.withLogTopics("purchases", "onchain").some,
clients: CodexConfigs.init(nodes = 1)
# .debug() # uncomment to enable console log output
# .withLogFile()
# uncomment to output log file to tests/integration/logs/<start_datetime> <suite_name>/<test_name>/<node_role>_<node_idx>.log
# .withLogTopics("purchases", "onchain")
.some,
providers: CodexConfigs
.init(nodes = 1)
.withSimulateProofFailures(idx = 0, failEveryNProofs = 1)

View File

@ -169,7 +169,8 @@ proc withLogFile*(self: CodexConfigs): CodexConfigs {.raises: [CodexConfigError]
proc withLogFile*(
self: var CodexConfig, logFile: string
) {.raises: [CodexConfigError].} = #: CodexConfigs =
) {.raises: [CodexConfigError].} =
#: CodexConfigs =
## typically called internally from the test suite, sets a log file path to
## be created during the test run, for a specified node in the group
# var config = self

View File

@ -7,6 +7,7 @@ import pkg/ethers
import pkg/libp2p
import std/os
import std/strutils
import std/times
import codex/conf
import ./codexclient
import ./nodeprocess
@ -15,11 +16,28 @@ export codexclient
export chronicles
export nodeprocess
{.push raises: [].}
logScope:
topics = "integration testing codex process"
type CodexProcess* = ref object of NodeProcess
client: ?CodexClient
type
CodexProcess* = ref object of NodeProcess
client: ?CodexClient
CodexProcessError* = object of NodeProcessError
proc raiseCodexProcessError(
msg: string, parent: ref CatchableError
) {.raises: [CodexProcessError].} =
raise newException(CodexProcessError, msg & ": " & parent.msg, parent)
template convertError(msg, body: typed) =
# Don't use this in an async proc, unless body does not raise CancelledError
try:
body
except CatchableError as parent:
raiseCodexProcessError(msg, parent)
method workingDir(node: CodexProcess): string =
return currentSourcePath() / ".." / ".." / ".."
@ -33,44 +51,83 @@ method startedOutput(node: CodexProcess): string =
method processOptions(node: CodexProcess): set[AsyncProcessOption] =
return {AsyncProcessOption.StdErrToStdOut}
method outputLineEndings(node: CodexProcess): string {.raises: [].} =
method outputLineEndings(node: CodexProcess): string =
return "\n"
method onOutputLineCaptured(node: CodexProcess, line: string) {.raises: [].} =
method onOutputLineCaptured(node: CodexProcess, line: string) =
discard
proc dataDir(node: CodexProcess): string =
let config = CodexConf.load(cmdLine = node.arguments, quitOnFailure = false)
return config.dataDir.string
proc config(node: CodexProcess): CodexConf {.raises: [CodexProcessError].} =
# cannot use convertError here as it uses typed parameters which forces type
# resolution, while confutils.load uses untyped parameters and expects type
# resolution not to happen yet. In other words, it won't compile.
try:
return CodexConf.load(
cmdLine = node.arguments, quitOnFailure = false, secondarySources = nil
)
except ConfigurationError as parent:
raiseCodexProcessError "Failed to load node arguments into CodexConf", parent
proc ethAccount*(node: CodexProcess): Address =
let config = CodexConf.load(cmdLine = node.arguments, quitOnFailure = false)
without ethAccount =? config.ethAccount:
proc dataDir(node: CodexProcess): string {.raises: [CodexProcessError].} =
return node.config.dataDir.string
proc ethAccount*(node: CodexProcess): Address {.raises: [CodexProcessError].} =
without ethAccount =? node.config.ethAccount:
raiseAssert "eth account not set"
return Address(ethAccount)
proc apiUrl*(node: CodexProcess): string =
let config = CodexConf.load(cmdLine = node.arguments, quitOnFailure = false)
return
"http://" & config.apiBindAddress.get() & ":" & $config.apiPort & "/api/storage/v1"
proc apiUrl*(node: CodexProcess): string {.raises: [CodexProcessError].} =
let config = node.config
without apiBindAddress =? config.apiBindAddress:
raise
newException(CodexProcessError, "REST API not started: --api-bindaddr not set")
return "http://" & apiBindAddress & ":" & $config.apiPort & "/api/storage/v1"
proc client*(node: CodexProcess): CodexClient =
proc logFile*(node: CodexProcess): ?string {.raises: [CodexProcessError].} =
node.config.logFile
proc client*(node: CodexProcess): CodexClient {.raises: [CodexProcessError].} =
if client =? node.client:
return client
let client = CodexClient.new(node.apiUrl)
node.client = some client
return client
method stop*(node: CodexProcess) {.async.} =
proc updateLogFile(node: CodexProcess, newLogFile: string) =
for arg in node.arguments.mitems:
if arg.startsWith("--log-file="):
arg = "--log-file=" & newLogFile
break
method restart*(node: CodexProcess) {.async.} =
trace "restarting codex"
await node.stop()
if logFile =? node.logFile:
# chronicles truncates the existing log file on start, so change the log-file
# cli param here so that a new file is created for the restarted node
node.updateLogFile(
logFile & "_restartedAt_" & now().format("yyyy-MM-dd'_'HH-mm-ss") & ".log"
)
await node.start()
await node.waitUntilStarted()
trace "codex process restarted"
method stop*(node: CodexProcess) {.async: (raises: []).} =
logScope:
nodeName = node.name
trace "stopping codex client"
await procCall NodeProcess(node).stop()
trace "stopping Storage client"
if not node.process.isNil:
trace "closing node process' streams"
await node.process.closeWait()
trace "node process' streams closed"
if client =? node.client:
await client.close()
node.client = none CodexClient
method removeDataDir*(node: CodexProcess) =
removeDir(node.dataDir)
method removeDataDir*(node: CodexProcess) {.raises: [CodexProcessError].} =
convertError("failed to remove codex node data directory"):
removeDir(node.dataDir)

View File

@ -8,28 +8,38 @@ import pkg/stew/io2
import std/os
import std/sets
import std/sequtils
import std/strformat
import std/strutils
import pkg/codex/conf
import pkg/codex/utils/trackedfutures
import ./codexclient
import ./nodeprocess
import ./utils
export codexclient
export chronicles
export nodeprocess
{.push raises: [].}
logScope:
topics = "integration testing hardhat process"
nodeName = "hardhat"
type HardhatProcess* = ref object of NodeProcess
logFile: ?IoHandle
type
OnOutputLineCaptured = proc(line: string) {.gcsafe, raises: [].}
HardhatProcess* = ref object of NodeProcess
logFile: ?IoHandle
onOutputLine: OnOutputLineCaptured
HardhatProcessError* = object of NodeProcessError
method workingDir(node: HardhatProcess): string =
return
currentSourcePath() / ".." / ".." / ".." / "vendor" / "logos-storage-contracts-eth"
method executable(node: HardhatProcess): string =
return "node_modules" / ".bin" / "hardhat"
return
"node_modules" / ".bin" / (when defined(windows): "hardhat.cmd" else: "hardhat")
method startedOutput(node: HardhatProcess): string =
return "Started HTTP and WebSocket JSON-RPC server at"
@ -37,7 +47,7 @@ method startedOutput(node: HardhatProcess): string =
method processOptions(node: HardhatProcess): set[AsyncProcessOption] =
return {}
method outputLineEndings(node: HardhatProcess): string {.raises: [].} =
method outputLineEndings(node: HardhatProcess): string =
return "\n"
proc openLogFile(node: HardhatProcess, logFilePath: string): IoHandle =
@ -52,7 +62,21 @@ proc openLogFile(node: HardhatProcess, logFilePath: string): IoHandle =
return fileHandle
method start*(node: HardhatProcess) {.async.} =
method start*(
node: HardhatProcess
) {.async: (raises: [CancelledError, NodeProcessError]).} =
logScope:
nodeName = node.name
var executable = ""
try:
executable = absolutePath(node.workingDir / node.executable)
if not fileExists(executable):
raiseAssert "cannot start hardhat, executable doesn't exist (looking for " &
&"{executable}). Try running `npm install` in {node.workingDir}."
except CatchableError as parent:
raiseAssert "failed build path to hardhat executable: " & parent.msg
let poptions = node.processOptions + {AsyncProcessOption.StdErrToStdOut}
trace "starting node",
@ -89,19 +113,37 @@ method start*(node: HardhatProcess) {.async.} =
trace "hardhat post start scripts executed"
except CancelledError as error:
raise error
except CatchableError as e:
error "failed to start hardhat process", error = e.msg
except CatchableError as parent:
raise newException(
HardhatProcessError, "failed to start hardhat process: " & parent.msg, parent
)
proc port(node: HardhatProcess): ?int =
var next = false
for arg in node.arguments:
# TODO: move to constructor
if next:
return parseInt(arg).catch.option
if arg.contains "--port":
next = true
return none int
proc startNode*(
_: type HardhatProcess,
args: seq[string],
debug: string | bool = false,
name: string,
): Future[HardhatProcess] {.async.} =
onOutputLineCaptured: OnOutputLineCaptured = nil,
): Future[HardhatProcess] {.async: (raises: [CancelledError, NodeProcessError]).} =
logScope:
nodeName = name
var logFilePath = ""
var arguments = newSeq[string]()
for arg in args:
# TODO: move to constructor
if arg.contains "--log-file=":
logFilePath = arg.split("=")[1]
else:
@ -114,17 +156,25 @@ proc startNode*(
arguments: arguments,
debug: ($debug != "false"),
trackedFutures: TrackedFutures.new(),
name: "hardhat",
name: name,
onOutputLine: onOutputLineCaptured,
)
await hardhat.start()
# TODO: move to constructor
if logFilePath != "":
hardhat.logFile = some hardhat.openLogFile(logFilePath)
return hardhat
method onOutputLineCaptured(node: HardhatProcess, line: string) =
logScope:
nodeName = node.name
if not node.onOutputLine.isNil:
node.onOutputLine(line)
without logFile =? node.logFile:
return
@ -133,13 +183,49 @@ method onOutputLineCaptured(node: HardhatProcess, line: string) =
discard logFile.closeFile()
node.logFile = none IoHandle
method stop*(node: HardhatProcess) {.async.} =
proc closeProcessStreams(node: HardhatProcess) {.async: (raises: []).} =
when not defined(windows):
if not node.process.isNil:
trace "closing node process' streams"
await node.process.closeWait()
trace "node process' streams closed"
else:
# Windows hangs when attempting to close hardhat's process streams, so try
# to kill the process externally.
without port =? node.port:
error "Failed to get port from Hardhat args"
return
try:
let cmdResult = await forceKillProcess("node.exe", &"--port {port}")
if cmdResult.status > 0:
error "Failed to forcefully kill windows hardhat process",
port, exitCode = cmdResult.status, stderr = cmdResult.stdError
else:
trace "Successfully killed windows hardhat process by force",
port, exitCode = cmdResult.status, stdout = cmdResult.stdOutput
except ValueError, OSError:
let eMsg = getCurrentExceptionMsg()
error "Failed to forcefully kill windows hardhat process, bad path to command",
error = eMsg
except CancelledError as e:
discard
except AsyncProcessError as e:
error "Failed to forcefully kill windows hardhat process", port, error = e.msg
except AsyncProcessTimeoutError as e:
error "Timeout while forcefully killing windows hardhat process",
port, error = e.msg
method stop*(node: HardhatProcess) {.async: (raises: []).} =
# terminate the process
await procCall NodeProcess(node).stop()
await node.closeProcessStreams()
if logFile =? node.logFile:
trace "closing hardhat log file"
discard logFile.closeFile()
node.process = nil
method removeDataDir*(node: HardhatProcess) =
discard

View File

@ -86,7 +86,10 @@ template marketplacesuite*(name: string, stopOnRequestFail: bool, body: untyped)
duration: uint64,
collateralPerByte: UInt256,
minPricePerBytePerSecond: UInt256,
): Future[void] {.async: (raises: [CancelledError, HttpError, ConfigurationError]).} =
): Future[void] {.
async:
(raises: [CancelledError, HttpError, ConfigurationError, CodexProcessError])
.} =
let totalCollateral = datasetSize.u256 * collateralPerByte
# post availability to each provider
for i in 0 ..< providers().len:

View File

@ -1,3 +1,4 @@
import std/httpclient
import std/os
import std/sequtils
import std/strutils
@ -13,6 +14,7 @@ import ./codexprocess
import ./hardhatconfig
import ./hardhatprocess
import ./nodeconfigs
import ./utils
import ../asynctest
import ../checktest
@ -24,6 +26,8 @@ export hardhatconfig
export codexconfig
export nodeconfigs
{.push raises: [].}
type
RunningNode* = ref object
role*: Role
@ -36,31 +40,33 @@ type
Hardhat
MultiNodeSuiteError = object of CatchableError
SuiteTimeoutError = object of MultiNodeSuiteError
const jsonRpcProviderUrl* = "ws://localhost:8545"
const HardhatPort {.intdefine.}: int = 8545
const CodexApiPort {.intdefine.}: int = 8080
const CodexDiscPort {.intdefine.}: int = 8090
const TestId {.strdefine.}: string = "TestId"
const CodexLogToFile {.booldefine.}: bool = false
const CodexLogLevel {.strdefine.}: string = ""
const CodexLogsDir {.strdefine.}: string = ""
proc raiseMultiNodeSuiteError(msg: string) =
raise newException(MultiNodeSuiteError, msg)
proc raiseMultiNodeSuiteError(
msg: string, parent: ref CatchableError = nil
) {.raises: [MultiNodeSuiteError].} =
raise newException(MultiNodeSuiteError, msg, parent)
proc nextFreePort*(startPort: int): Future[int] {.async.} =
proc client(server: StreamServer, transp: StreamTransport) {.async.} =
await transp.closeWait()
template withLock(lock: AsyncLock, body: untyped) =
if lock.isNil:
lock = newAsyncLock()
var port = startPort
while true:
trace "checking if port is free", port
await lock.acquire()
try:
body
finally:
try:
let host = initTAddress("127.0.0.1", port)
# We use ReuseAddr here only to be able to reuse the same IP/Port when
# there's a TIME_WAIT socket. It's useful when running the test multiple
# times or if a test ran previously using the same port.
var server = createStreamServer(host, client, {ReuseAddr})
trace "port is free", port
await server.closeWait()
return port
except TransportOsError:
trace "port is not free", port
inc port
lock.release()
except AsyncLockError as parent:
raiseMultiNodeSuiteError "lock error", parent
proc sanitize(pathSegment: string): string =
var sanitized = pathSegment
@ -71,8 +77,8 @@ proc sanitize(pathSegment: string): string =
proc getTempDirName*(starttime: string, role: Role, roleIdx: int): string =
getTempDir() / "Storage" / sanitize($starttime) / sanitize($role & "_" & $roleIdx)
template multinodesuite*(name: string, body: untyped) =
asyncchecksuite name:
template multinodesuite*(suiteName: string, body: untyped) =
asyncchecksuite suiteName:
# Following the problem described here:
# https://github.com/NomicFoundation/hardhat/issues/2053
# It may be desirable to use http RPC provider.
@ -85,7 +91,7 @@ template multinodesuite*(name: string, body: untyped) =
# If you want to use a different provider url in the nodes, you can
# use withEthProvider config modifier in the node config
# to set the desired provider url. E.g.:
# NodeConfigs(
# NodeConfigs(
# hardhat:
# HardhatConfig.none,
# clients:
@ -93,6 +99,7 @@ template multinodesuite*(name: string, body: untyped) =
# .withEthProvider("ws://localhost:8545")
# .some,
# ...
var jsonRpcProviderUrl = "ws://localhost:" & $HardhatPort
var running {.inject, used.}: seq[RunningNode]
var bootstrapNodes: seq[string]
let starttime = now().format("yyyy-MM-dd'_'HH:mm:ss")
@ -101,6 +108,10 @@ template multinodesuite*(name: string, body: untyped) =
var ethProvider {.inject, used.}: JsonRpcProvider
var accounts {.inject, used.}: seq[Address]
var snapshot: JsonNode
var lastUsedHardhatPort = HardhatPort
var lastUsedCodexApiPort = CodexApiPort
var lastUsedCodexDiscPort = CodexDiscPort
var codexPortLock: AsyncLock
template test(tname, startNodeConfigs, tbody) =
currentTestName = tname
@ -108,47 +119,50 @@ template multinodesuite*(name: string, body: untyped) =
test tname:
tbody
proc sanitize(pathSegment: string): string =
var sanitized = pathSegment
for invalid in invalidFilenameChars.items:
sanitized = sanitized.replace(invalid, '_').replace(' ', '_')
sanitized
proc getLogFile(role: Role, index: ?int): string =
# create log file path, format:
# tests/integration/logs/<start_datetime> <suite_name>/<test_name>/<node_role>_<node_idx>.log
var logDir =
currentSourcePath.parentDir() / "logs" / sanitize($starttime & "__" & name) /
sanitize($currentTestName)
createDir(logDir)
var fn = $role
if idx =? index:
fn &= "_" & $idx
fn &= ".log"
let fileName = logDir / fn
return fileName
proc updatePort(url: var string, port: int) =
let parts = url.split(':')
url = @[parts[0], parts[1], $port].join(":")
proc newHardhatProcess(
config: HardhatConfig, role: Role
): Future[NodeProcess] {.async.} =
): Future[NodeProcess] {.async: (raises: [MultiNodeSuiteError, CancelledError]).} =
var args: seq[string] = @[]
if config.logFile:
let updatedLogFile = getLogFile(role, none int)
args.add "--log-file=" & updatedLogFile
try:
let updatedLogFile = getLogFile(
CodexLogsDir, starttime, suiteName, currentTestName, $role, none int
)
args.add "--log-file=" & updatedLogFile
except IOError as e:
raiseMultiNodeSuiteError(
"failed to start hardhat because logfile path could not be obtained: " &
e.msg,
e,
)
except OSError as e:
raiseMultiNodeSuiteError(
"failed to start hardhat because logfile path could not be obtained: " &
e.msg,
e,
)
let port = await nextFreePort(lastUsedHardhatPort)
jsonRpcProviderUrl.updatePort(port)
args.add("--port")
args.add($port)
lastUsedHardhatPort = port
try:
let node = await HardhatProcess.startNode(args, config.debugEnabled, "hardhat")
await node.waitUntilStarted()
trace "hardhat node started"
return node
except NodeProcessError as e:
raiseMultiNodeSuiteError "cannot start hardhat process: " & e.msg
raiseMultiNodeSuiteError "hardhat node not started: " & e.msg
proc newCodexProcess(
roleIdx: int, conf: CodexConfig, role: Role
): Future[NodeProcess] {.async.} =
): Future[NodeProcess] {.async: (raises: [MultiNodeSuiteError, CancelledError]).} =
let nodeIdx = running.len
var config = conf
@ -156,34 +170,60 @@ template multinodesuite*(name: string, body: untyped) =
raiseMultiNodeSuiteError "Cannot start node at nodeIdx " & $nodeIdx &
", not enough eth accounts."
let datadir = getTempDirName(starttime, role, roleIdx)
let datadir = getDataDir(TestId, currentTestName, $starttime, $role, some roleIdx)
try:
if config.logFile.isSome:
let updatedLogFile = getLogFile(role, some roleIdx)
config.withLogFile(updatedLogFile)
if config.logFile.isSome or CodexLogToFile:
try:
let updatedLogFile = getLogFile(
CodexLogsDir, starttime, suiteName, currentTestName, $role, some roleIdx
)
config.withLogFile(updatedLogFile)
except IOError as e:
raiseMultiNodeSuiteError(
"failed to start " & $role &
" because logfile path could not be obtained: " & e.msg,
e,
)
except OSError as e:
raiseMultiNodeSuiteError(
"failed to start " & $role &
" because logfile path could not be obtained: " & e.msg,
e,
)
when CodexLogLevel != "":
config.addCliOption("--log-level", CodexLogLevel)
var apiPort, discPort: int
withLock(codexPortLock):
apiPort = await nextFreePort(lastUsedCodexApiPort + nodeIdx)
discPort = await nextFreePort(lastUsedCodexDiscPort + nodeIdx)
config.addCliOption("--api-port", $apiPort)
config.addCliOption("--disc-port", $discPort)
lastUsedCodexApiPort = apiPort
lastUsedCodexDiscPort = discPort
for bootstrapNode in bootstrapNodes:
config.addCliOption("--bootstrap-node", bootstrapNode)
config.addCliOption("--api-port", $await nextFreePort(8080 + nodeIdx))
config.addCliOption("--data-dir", datadir)
config.addCliOption("--nat", "none")
config.addCliOption("--listen-addrs", "/ip4/127.0.0.1/tcp/0")
config.addCliOption("--disc-port", $await nextFreePort(8090 + nodeIdx))
except CodexConfigError as e:
raiseMultiNodeSuiteError "invalid cli option, error: " & e.msg
let node = await CodexProcess.startNode(
config.cliArgs, config.debugEnabled, $role & $roleIdx
)
try:
let node = await CodexProcess.startNode(
config.cliArgs, config.debugEnabled, $role & $roleIdx
)
await node.waitUntilStarted()
trace "node started", nodeName = $role & $roleIdx
return node
except CodexConfigError as e:
raiseMultiNodeSuiteError "failed to get cli args from config: " & e.msg, e
except NodeProcessError as e:
raiseMultiNodeSuiteError "node not started, error: " & e.msg
return node
raiseMultiNodeSuiteError "node not started, error: " & e.msg, e
proc hardhat(): HardhatProcess =
for r in running:
@ -209,7 +249,9 @@ template multinodesuite*(name: string, body: untyped) =
if r.role == Role.Validator:
CodexProcess(r.node)
proc startHardhatNode(config: HardhatConfig): Future[NodeProcess] {.async.} =
proc startHardhatNode(
config: HardhatConfig
): Future[NodeProcess] {.async: (raises: [MultiNodeSuiteError, CancelledError]).} =
return await newHardhatProcess(config, Role.Hardhat)
proc startClientNode(conf: CodexConfig): Future[NodeProcess] {.async.} =
@ -221,44 +263,64 @@ template multinodesuite*(name: string, body: untyped) =
)
return await newCodexProcess(clientIdx, config, Role.Client)
proc startProviderNode(conf: CodexConfig): Future[NodeProcess] {.async.} =
let providerIdx = providers().len
var config = conf
config.addCliOption(StartUpCmd.persistence, "--eth-provider", jsonRpcProviderUrl)
config.addCliOption(
StartUpCmd.persistence, "--eth-account", $accounts[running.len]
)
config.addCliOption(
PersistenceCmd.prover, "--circom-r1cs",
"vendor/logos-storage-contracts-eth/verifier/networks/hardhat/proof_main.r1cs",
)
config.addCliOption(
PersistenceCmd.prover, "--circom-wasm",
"vendor/logos-storage-contracts-eth/verifier/networks/hardhat/proof_main.wasm",
)
config.addCliOption(
PersistenceCmd.prover, "--circom-zkey",
"vendor/logos-storage-contracts-eth/verifier/networks/hardhat/proof_main.zkey",
)
proc startProviderNode(
conf: CodexConfig
): Future[NodeProcess] {.async: (raises: [MultiNodeSuiteError, CancelledError]).} =
try:
let providerIdx = providers().len
var config = conf
config.addCliOption(
StartUpCmd.persistence, "--eth-provider", jsonRpcProviderUrl
)
config.addCliOption(
StartUpCmd.persistence, "--eth-account", $accounts[running.len]
)
config.addCliOption(
PersistenceCmd.prover, "--circom-r1cs",
"vendor/logos-storage-contracts-eth/verifier/networks/hardhat/proof_main.r1cs",
)
config.addCliOption(
PersistenceCmd.prover, "--circom-wasm",
"vendor/logos-storage-contracts-eth/verifier/networks/hardhat/proof_main.wasm",
)
config.addCliOption(
PersistenceCmd.prover, "--circom-zkey",
"vendor/logos-storage-contracts-eth/verifier/networks/hardhat/proof_main.zkey",
)
return await newCodexProcess(providerIdx, config, Role.Provider)
return await newCodexProcess(providerIdx, config, Role.Provider)
except CodexConfigError as exc:
raiseMultiNodeSuiteError "Failed to start codex node, error adding cli options: " &
exc.msg, exc
proc startValidatorNode(conf: CodexConfig): Future[NodeProcess] {.async.} =
let validatorIdx = validators().len
var config = conf
config.addCliOption(StartUpCmd.persistence, "--eth-provider", jsonRpcProviderUrl)
config.addCliOption(
StartUpCmd.persistence, "--eth-account", $accounts[running.len]
)
config.addCliOption(StartUpCmd.persistence, "--validator")
proc startValidatorNode(
conf: CodexConfig
): Future[NodeProcess] {.async: (raises: [MultiNodeSuiteError, CancelledError]).} =
try:
let validatorIdx = validators().len
var config = conf
config.addCliOption(
StartUpCmd.persistence, "--eth-provider", jsonRpcProviderUrl
)
config.addCliOption(
StartUpCmd.persistence, "--eth-account", $accounts[running.len]
)
config.addCliOption(StartUpCmd.persistence, "--validator")
return await newCodexProcess(validatorIdx, config, Role.Validator)
return await newCodexProcess(validatorIdx, config, Role.Validator)
except CodexConfigError as e:
raiseMultiNodeSuiteError "Failed to start validator node, error adding cli options: " &
e.msg, e
proc teardownImpl() {.async.} =
proc teardownImpl() {.async: (raises: []).} =
trace "Tearing down test", suite = suiteName, test = currentTestName
for nodes in @[validators(), clients(), providers()]:
for node in nodes:
await node.stop() # also stops rest client
node.removeDataDir()
try:
node.removeDataDir()
except CodexProcessError as e:
error "Failed to remove data dir during teardown", error = e.msg
# if hardhat was started in the test, kill the node
# otherwise revert the snapshot taken in the test setup
@ -266,15 +328,28 @@ template multinodesuite*(name: string, body: untyped) =
if not hardhat.isNil:
await hardhat.stop()
else:
discard await send(ethProvider, "evm_revert", @[snapshot])
try:
discard await noCancel send(ethProvider, "evm_revert", @[snapshot])
except ProviderError as e:
error "Failed to revert hardhat state during teardown", error = e.msg
await ethProvider.close()
# TODO: JsonRpcProvider.close should NOT raise any exceptions
try:
await ethProvider.close()
except CatchableError:
discard
running = @[]
template failAndTeardownOnError(message: string, tryBody: untyped) =
try:
tryBody
except CancelledError as e:
await teardownImpl()
when declared(teardownAllIMPL):
teardownAllIMPL()
fail()
quit(1)
except CatchableError as er:
fatal message, error = er.msg
echo "[FATAL] ", message, ": ", er.msg
@ -286,19 +361,34 @@ template multinodesuite*(name: string, body: untyped) =
proc updateBootstrapNodes(
node: CodexProcess
): Future[void] {.async: (raises: [CatchableError]).} =
without ninfo =? await node.client.info():
# raise CatchableError instead of Defect (with .get or !) so we
# can gracefully shutdown and prevent zombies
raiseMultiNodeSuiteError "Failed to get node info"
bootstrapNodes.add ninfo["spr"].getStr()
): Future[void] {.async: (raises: [MultiNodeSuiteError]).} =
try:
without ninfo =? await node.client.info():
# raise CatchableError instead of Defect (with .get or !) so we
# can gracefully shutdown and prevent zombies
raiseMultiNodeSuiteError "Failed to get node info"
bootstrapNodes.add ninfo["spr"].getStr()
except CatchableError as e:
raiseMultiNodeSuiteError "Failed to get node info: " & e.msg, e
setupAll:
# When this file is run with `-d:chronicles_sinks=textlines[file]`, we
# need to set the log file path at runtime; otherwise chronicles does not seem
# to create a log file, even when given an absolute path
when defaultChroniclesStream.outputs is (FileOutput,) and CodexLogsDir.len > 0:
let logFile =
CodexLogsDir / sanitize(getAppFilename().extractFilename & ".chronicles.log")
let success = defaultChroniclesStream.outputs[0].open(logFile, fmAppend)
doAssert success, "Failed to open log file: " & logFile
setup:
trace "Setting up test", suite = suiteName, test = currentTestName, nodeConfigs
if var conf =? nodeConfigs.hardhat:
try:
let node = await startHardhatNode(conf)
let node = await noCancel startHardhatNode(conf)
running.add RunningNode(role: Role.Hardhat, node: node)
except CatchableError as e:
except CatchableError as e: # CancelledError not raised due to noCancel
echo "failed to start hardhat node"
fail()
quit(1)
@ -307,12 +397,16 @@ template multinodesuite*(name: string, body: untyped) =
# Workaround for https://github.com/NomicFoundation/hardhat/issues/2053
# Do not use websockets, but use http and polling to stop subscriptions
# from being removed after 5 minutes
ethProvider = JsonRpcProvider.new(jsonRpcProviderUrl)
ethProvider = JsonRpcProvider.new(
jsonRpcProviderUrl, pollingInterval = chronos.milliseconds(1000)
)
# if hardhat was NOT started by the test, take a snapshot so it can be
# reverted in the test teardown
if nodeConfigs.hardhat.isNone:
snapshot = await send(ethProvider, "evm_snapshot")
accounts = await ethProvider.listAccounts()
except CancelledError as e:
raise e
except CatchableError as e:
echo "Hardhat not running. Run hardhat manually " &
"before executing tests, or include a " & "HardhatConfig in the test setup."
@ -342,7 +436,10 @@ template multinodesuite*(name: string, body: untyped) =
# ensure that we have a recent block with a fresh timestamp
discard await send(ethProvider, "evm_mine")
trace "Starting test", suite = suiteName, test = currentTestName
teardown:
await teardownImpl()
trace "Test completed", suite = suiteName, test = currentTestName
body

View File

@ -5,6 +5,7 @@ import pkg/chronicles
import pkg/chronos/asyncproc
import pkg/libp2p
import std/os
import std/strformat
import std/strutils
import codex/conf
import codex/utils/exceptions
@ -14,6 +15,8 @@ import ./codexclient
export codexclient
export chronicles
{.push raises: [].}
logScope:
topics = "integration testing node process"
@ -39,24 +42,19 @@ method startedOutput(node: NodeProcess): string {.base, gcsafe.} =
method processOptions(node: NodeProcess): set[AsyncProcessOption] {.base, gcsafe.} =
raiseAssert "not implemented"
method outputLineEndings(node: NodeProcess): string {.base, gcsafe, raises: [].} =
method outputLineEndings(node: NodeProcess): string {.base, gcsafe.} =
raiseAssert "not implemented"
method onOutputLineCaptured(
node: NodeProcess, line: string
) {.base, gcsafe, raises: [].} =
method onOutputLineCaptured(node: NodeProcess, line: string) {.base, gcsafe.} =
raiseAssert "not implemented"
method start*(node: NodeProcess) {.base, async.} =
method start*(node: NodeProcess) {.base, async: (raises: [CancelledError]).} =
logScope:
nodeName = node.name
let poptions = node.processOptions + {AsyncProcessOption.StdErrToStdOut}
trace "starting node",
args = node.arguments,
executable = node.executable,
workingDir = node.workingDir,
processOptions = poptions
args = node.arguments, executable = node.executable, workingDir = node.workingDir
try:
if node.debug:
@ -81,11 +79,13 @@ proc captureOutput(
trace "waiting for output", output
let stream = node.process.stdoutStream
try:
while node.process.running.option == some true:
while (let line = await stream.readLine(0, node.outputLineEndings); line != ""):
while (
let line = await node.process.stdoutStream.readLine(0, node.outputLineEndings)
line != ""
)
:
if node.debug:
# would be nice if chronicles could parse and display with colors
echo line
@ -95,8 +95,8 @@ proc captureOutput(
node.onOutputLineCaptured(line)
await sleepAsync(1.millis)
await sleepAsync(1.millis)
await sleepAsync(1.nanos)
await sleepAsync(1.nanos)
except CancelledError:
discard # do not propagate as captureOutput was asyncSpawned
except AsyncStreamError as e:
@ -104,7 +104,7 @@ proc captureOutput(
proc startNode*[T: NodeProcess](
_: type T, args: seq[string], debug: string | bool = false, name: string
): Future[T] {.async.} =
): Future[T] {.async: (raises: [CancelledError]).} =
## Starts a Logos Storage Node with the specified arguments.
## Set debug to 'true' to see output of the node.
let node = T(
@ -116,34 +116,36 @@ proc startNode*[T: NodeProcess](
await node.start()
return node
method stop*(node: NodeProcess) {.base, async.} =
method stop*(
node: NodeProcess, expectedExitCode: int = 0
) {.base, async: (raises: []).} =
logScope:
nodeName = node.name
await node.trackedFutures.cancelTracked()
if node.process != nil:
if not node.process.isNil:
let processId = node.process.processId
trace "terminating node process...", processId
try:
trace "terminating node process..."
if errCode =? node.process.terminate().errorOption:
error "failed to terminate process", errCode = $errCode
let exitCode = await noCancel node.process.terminateAndWaitForExit(2.seconds)
if exitCode > 0 and exitCode != 143 and # 143 = SIGTERM (initiated above)
exitCode != expectedExitCode:
warn "process exited with a non-zero exit code", exitCode
trace "node process terminated", exitCode
except CatchableError:
try:
let forcedExitCode = await noCancel node.process.killAndWaitForExit(3.seconds)
trace "node process forcibly killed with exit code: ", exitCode = forcedExitCode
except CatchableError as e:
warn "failed to kill node process in time, it will be killed when the parent process exits",
error = e.msg
writeStackTrace()
trace "waiting for node process to exit"
let exitCode = await node.process.waitForExit(3.seconds)
if exitCode > 0:
error "failed to exit process, check for zombies", exitCode
trace "node stopped"
trace "closing node process' streams"
await node.process.closeWait()
except CancelledError as error:
raise error
except CatchableError as e:
error "error stopping node process", error = e.msg
finally:
node.process = nil
trace "node stopped"
proc waitUntilOutput*(node: NodeProcess, output: string) {.async.} =
proc waitUntilOutput*(
node: NodeProcess, output: string
) {.async: (raises: [CancelledError, AsyncTimeoutError]).} =
logScope:
nodeName = node.name
@ -153,9 +155,21 @@ proc waitUntilOutput*(node: NodeProcess, output: string) {.async.} =
let fut = node.captureOutput(output, started)
node.trackedFutures.track(fut)
asyncSpawn fut
await started.wait(60.seconds) # allow enough time for proof generation
try:
await started.wait(60.seconds) # allow enough time for proof generation
except AsyncTimeoutError as e:
raise e
except CancelledError as e:
raise e
except CatchableError as e: # unsure where this originates from
error "unexpected error occurred waiting for node output", error = e.msg
proc waitUntilStarted*(
node: NodeProcess
) {.async: (raises: [CancelledError, NodeProcessError]).} =
logScope:
nodeName = node.name
proc waitUntilStarted*(node: NodeProcess) {.async.} =
try:
await node.waitUntilOutput(node.startedOutput)
trace "node started"
@ -168,10 +182,10 @@ proc waitUntilStarted*(node: NodeProcess) {.async.} =
raise
newException(NodeProcessError, "node did not output '" & node.startedOutput & "'")
proc restart*(node: NodeProcess) {.async.} =
method restart*(node: NodeProcess) {.base, async.} =
await node.stop()
await node.start()
await node.waitUntilStarted()
method removeDataDir*(node: NodeProcess) {.base.} =
method removeDataDir*(node: NodeProcess) {.base, raises: [NodeProcessError].} =
raiseAssert "[removeDataDir] not implemented"

View File

@ -0,0 +1,97 @@
#!/bin/bash
# List all processes with a specific name
list() {
local name=$1
echo "Listing all processes named '$name'..."
powershell.exe -Command "Get-CimInstance Win32_Process -Filter \"name = '$name'\" | Select-Object ProcessId, Name, CommandLine | Format-Table -AutoSize"
}
# Search for processes with a specific name and command line pattern
search() {
local name=$1
local pattern=$2
echo "Searching for '$name' processes with command line matching '$pattern'..."
powershell.exe -Command "
\$processes = Get-CimInstance Win32_Process -Filter \"name = '$name'\" | Where-Object { \$_.CommandLine -match '$pattern' };
if (\$processes) {
\$processes | Select-Object ProcessId, Name, CommandLine | Format-Table -AutoSize;
} else {
Write-Host \"No matching '$name' processes found\";
}
"
}
# Kill all processes with a specific name
killall() {
local name=$1
echo "Finding and killing all '$name' processes..."
powershell.exe -Command "
\$processes = Get-CimInstance Win32_Process -Filter \"name = '$name'\";
if (\$processes) {
foreach (\$process in \$processes) {
Stop-Process -Id \$process.ProcessId -Force;
Write-Host \"Killed process \$(\$process.ProcessId)\";
}
} else {
Write-Host \"No '$name' processes found\";
}
"
}
# Kill processes with a specific name and command line pattern
kill() {
local name=$1
local pattern=$2
echo "Finding and killing '$name' processes with command line matching '$pattern'..."
powershell.exe -Command "
\$processes = Get-CimInstance Win32_Process -Filter \"name = '$name'\" | Where-Object { \$_.CommandLine -match '$pattern' };
if (\$processes) {
foreach (\$process in \$processes) {
Stop-Process -Id \$process.ProcessId -Force;
Write-Host \"Killed process \$(\$process.ProcessId)\";
}
} else {
Write-Host \"No matching '$name' processes found\";
}
"
}
# Check if being run directly or sourced
if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then
# If run directly (not sourced), provide command line interface
case "$1" in
list)
if [ -z "$2" ]; then
echo "Usage: $0 list PROCESS_NAME"
exit 1
fi
list "$2"
;;
search)
if [ -z "$2" ] || [ -z "$3" ]; then
echo "Usage: $0 search PROCESS_NAME COMMANDLINE_PATTERN"
exit 1
fi
search "$2" "$3"
;;
killall)
if [ -z "$2" ]; then
echo "Usage: $0 killall PROCESS_NAME"
exit 1
fi
killall "$2"
;;
kill)
if [ -z "$2" ] || [ -z "$3" ]; then
echo "Usage: $0 kill PROCESS_NAME COMMANDLINE_PATTERN"
exit 1
fi
kill "$2" "$3"
;;
*)
echo "Usage: $0 {list PROCESS_NAME|search PROCESS_NAME COMMANDLINE_PATTERN|killall PROCESS_NAME|kill PROCESS_NAME COMMANDLINE_PATTERN}"
exit 1
;;
esac
fi
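When executed directly (for example from a Git Bash/MSYS2 shell on Windows, since it shells out to powershell.exe), the script takes one of the four subcommands shown in its usage text. The forceKillProcess helper in the integration test utils invokes it like this (port value illustrative):

    ./winkillprocess.sh kill node.exe "--port 8545"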

View File

@ -0,0 +1,92 @@
import std/os
import std/strformat
import pkg/chronos
import pkg/chronos/asyncproc
import pkg/codex/logutils
{.push raises: [].}
proc nextFreePort*(startPort: int): Future[int] {.async: (raises: [CancelledError]).} =
proc client(server: StreamServer, transp: StreamTransport) {.async: (raises: []).} =
await transp.closeWait()
var port = startPort
while true:
trace "checking if port is free", port
try:
let host = initTAddress("127.0.0.1", port)
# We use ReuseAddr here only to be able to reuse the same IP/Port when
# there's a TIME_WAIT socket. It's useful when running the test multiple
# times or if a test ran previously using the same port.
var server = createStreamServer(host, client, {ReuseAddr})
trace "port is free", port
await server.closeWait()
return port
except TransportOsError:
trace "port is not free", port
inc port
except TransportAddressError:
raiseAssert "bad address"
proc sanitize*(pathSegment: string): string =
var sanitized = pathSegment
for invalid in invalidFilenameChars.items:
sanitized = sanitized.replace(invalid, '_').replace(' ', '_')
sanitized
proc getLogFile*(
logDir, startTime, suiteName, testName, role: string, index = int.none
): string {.raises: [IOError, OSError].} =
let logsDir =
if logDir == "":
currentSourcePath.parentDir() / "logs" / sanitize(startTime & "__" & suiteName) /
sanitize(testName)
else:
logDir / sanitize(suiteName) / sanitize(testName)
createDir(logsDir)
var fn = $role
if idx =? index:
fn &= "_" & $idx
fn &= ".log"
let fileName = logsDir / fn
return fileName
proc appendFile*(filename: string, content: string) {.raises: [IOError].} =
## Opens a file named `filename` for writing. Then writes the
## `content` completely to the file and closes the file afterwards.
## Raises an IO exception in case of an error.
var f: File
try:
f = open(filename, fmAppend)
f.write(content)
except IOError as e:
raise newException(IOError, "cannot open and write " & filename & ": " & e.msg)
finally:
close(f)
when defined(windows):
proc forceKillProcess*(
processName, matchingCriteria: string
): Future[CommandExResponse] {.
async: (
raises: [
AsyncProcessError, AsyncProcessTimeoutError, CancelledError, ValueError,
OSError,
]
)
.} =
let path = splitFile(currentSourcePath()).dir / "scripts" / "winkillprocess.sh"
let cmd = &"{absolutePath(path)} kill {processName} \"{matchingCriteria}\""
trace "Forcefully killing windows process", processName, matchingCriteria, cmd
return await execCommandEx(cmd, timeout = 5.seconds)
proc getDataDir*(testId, testName, startTime, role: string, index = int.none): string =
var suffix = role
if idx =? index:
suffix &= "_" & $idx
getTempDir() / "Codex" / sanitize(testId) / sanitize(testName) / sanitize(startTime) /
sanitize(suffix)

vendor/nim-chronos (vendored submodule)

@ -1 +1 @@
Subproject commit c04576d829b8a0a1b12baaa8bc92037501b3a4a0
Subproject commit 0646c444fce7c7ed08ef6f2c9a7abfd172ffe655