From 09522ff2fb333fcbe7d624fcb755a5a7bfde7da1 Mon Sep 17 00:00:00 2001 From: NagyZoltanPeter <113987313+NagyZoltanPeter@users.noreply.github.com> Date: Wed, 12 Jun 2024 07:49:55 +0200 Subject: [PATCH] fix: multi nat initialization causing dead lock in waku tests + serialize test runs to avoid timing and port occupied issues (#2799) * Prevent multiple nat module initialization that cause dead lock in nat refresh thread tear down during tests. * NPROC to 1 to avoid parallel test runs can lead to timing and port allocation issues Co-authored-by: gabrielmer <101006718+gabrielmer@users.noreply.github.com> --- .github/workflows/ci.yml | 8 +++--- waku/common/utils/nat.nim | 56 ++++++++++++++++++++++++++------------- 2 files changed, 41 insertions(+), 23 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 770d4824f..025ef880d 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -111,15 +111,15 @@ jobs: - name: Run tests run: | - if [ ${{ runner.os }} == "Linux" ]; then - sudo docker run --rm -d -e POSTGRES_PASSWORD=test123 -p 5432:5432 postgres:15.4-alpine3.18 - fi - postgres_enabled=0 if [ ${{ runner.os }} == "Linux" ]; then + sudo docker run --rm -d -e POSTGRES_PASSWORD=test123 -p 5432:5432 postgres:15.4-alpine3.18 postgres_enabled=1 fi + export MAKEFLAGS="-j1" + export NIMFLAGS="--colors:off -d:chronicles_colors:none" + make RLN_V${{matrix.rln_version}}=true V=1 LOG_LEVEL=DEBUG QUICK_AND_DIRTY_COMPILER=1 POSTGRES=$postgres_enabled test testwakunode2 build-docker-image: diff --git a/waku/common/utils/nat.nim b/waku/common/utils/nat.nim index e05a345e6..5835a8e7f 100644 --- a/waku/common/utils/nat.nim +++ b/waku/common/utils/nat.nim @@ -9,6 +9,15 @@ import chronicles, eth/net/nat, stew/results, nativesockets logScope: topics = "nat" +## Due to the design of nim-eth/nat module we must ensure it is only initialized once. +## see: https://github.com/waku-org/nwaku/issues/2628 +## Details: nim-eth/nat module starts a meaintenance thread for refreshing the NAT mappings, but everything in the module is global, +## there is no room to store multiple configurations. +## Exact meaning: redirectPorts cannot be called twice in a program lifetime. +## During waku tests we happen to start several node instances in parallel thus resulting in multiple NAT configurations and multiple threads. +## Those threads will dead lock each other in tear down. +var singletonNat: bool = false + proc setupNat*( natConf, clientId: string, tcpPort, udpPort: Port ): Result[ @@ -26,26 +35,35 @@ proc setupNat*( tuple[ip: Option[IpAddress], tcpPort: Option[Port], udpPort: Option[Port]] if strategy != NatNone: - let extIp = getExternalIP(strategy) - if extIP.isSome(): - endpoint.ip = some(extIp.get()) - # RedirectPorts in considered a gcsafety violation - # because it obtains the address of a non-gcsafe proc? - var extPorts: Option[(Port, Port)] - try: - extPorts = ( - {.gcsafe.}: - redirectPorts(tcpPort = tcpPort, udpPort = udpPort, description = clientId) - ) - except CatchableError: - # TODO: nat.nim Error: can raise an unlisted exception: Exception. Isolate here for now. - error "unable to determine external ports" - extPorts = none((Port, Port)) + ## Only initialize the NAT module once + ## redirectPorts cannot be called twice in a program lifetime. + ## We can do it as same happens if getExternalIP fails and returns None + if singletonNat: + warn "NAT already initialized, skipping as cannot be done multiple times" + else: + singletonNat = true + let extIp = getExternalIP(strategy) + if extIP.isSome(): + endpoint.ip = some(extIp.get()) + # RedirectPorts in considered a gcsafety violation + # because it obtains the address of a non-gcsafe proc? + var extPorts: Option[(Port, Port)] + try: + extPorts = ( + {.gcsafe.}: + redirectPorts( + tcpPort = tcpPort, udpPort = udpPort, description = clientId + ) + ) + except CatchableError: + # TODO: nat.nim Error: can raise an unlisted exception: Exception. Isolate here for now. + error "unable to determine external ports" + extPorts = none((Port, Port)) - if extPorts.isSome(): - let (extTcpPort, extUdpPort) = extPorts.get() - endpoint.tcpPort = some(extTcpPort) - endpoint.udpPort = some(extUdpPort) + if extPorts.isSome(): + let (extTcpPort, extUdpPort) = extPorts.get() + endpoint.tcpPort = some(extTcpPort) + endpoint.udpPort = some(extUdpPort) else: # NatNone if not natConf.startsWith("extip:"): return err("not a valid NAT mechanism: " & $natConf)