From 639dffc505f022b223dc409908b619c556387b08 Mon Sep 17 00:00:00 2001
From: Egor Rachkovskii <32649334+at0m1x19@users.noreply.github.com>
Date: Tue, 14 Apr 2026 14:25:36 +0100
Subject: [PATCH] Fix nim waku daily failures (#169)

* Add REST API traffic bypass for network conditions manipulation

- Introduced methods to apply packet loss only to P2P traffic, excluding REST API traffic.
- Simplified test cases to leverage new differentiated packet loss handling.
- Removed unused and legacy metrics/tests for cleaner configuration and coverage.

* Refactor network conditions setup to streamline command execution

* Pin priomap so libp2p traffic actually hits netem

The default prio qdisc priomap routes SO_PRIORITY 6 and 7 to band 0,
which is our REST bypass class 1:1. libp2p/gossipsub packets set a high
SO_PRIORITY on their sockets, so they were silently escaping the netem
impairment via the priomap rather than through the u32 filter. The
result: test_relay_packet_loss_correlated_vs_uncorrelated became green
by accident because no loss was ever applied to relay traffic.

Forcing priomap to 1 1 1 1 ... on all 16 slots routes every SO_PRIORITY
value to band 1 (netem). The u32 filter remains the only path to 1:1,
so REST stays isolated and libp2p now takes the configured loss.

Verified in alpine netns: with SO_PRIORITY=6, 50 packets to a non-REST
port ended up in 1:1 under the old rules (0 drops); with the forced
priomap they land in 1:2 and see the expected ~50% drop rate.

* Refactor P2P traffic loss handling; isolate REST API traffic

- Added `_p2p_iface` to dynamically detect libp2p interface tied to the Waku network.
- Introduced `add_packet_loss_p2p_only` and `add_packet_loss_correlated_p2p_only` for targeted packet loss on libp2p traffic.
- Replaced REST API traffic bypass logic with simplified P2P interface-based tc rules.
- Updated tests to use `clear_p2p` for cleanup, ensuring REST traffic remains unaffected.

---------

Co-authored-by: Egor Rachkovskii <egorrachkovskii@status.im>
---
 src/steps/network_conditions.py      | 66 ++++++++++++++++++++++++++++
 tests/e2e/test_network_conditions.py | 10 ++---
 tests/metrics/test_metrics.py        |  5 ---
 3 files changed, 70 insertions(+), 11 deletions(-)

diff --git a/src/steps/network_conditions.py b/src/steps/network_conditions.py
index 8bbf6f301..b96d9a0cc 100644
--- a/src/steps/network_conditions.py
+++ b/src/steps/network_conditions.py
@@ -1,4 +1,5 @@
 import subprocess
+from src.env_vars import NETWORK_NAME
 from src.libs.custom_logger import get_custom_logger
 
 logger = get_custom_logger(__name__)
@@ -133,3 +134,68 @@ class TrafficController:
             ],
             iface=iface,
         )
+
+    def _p2p_iface(self, node) -> str:
+        """
+        Resolve the container interface that carries libp2p traffic.
+
+        DockerManager attaches each node to two networks: the default bridge
+        (host-published ports, typically `eth0`) and the waku network
+        (inter-container libp2p/gossipsub traffic, typically `eth1`). A
+        packet loss test targeting libp2p must shape the waku interface,
+        because tc on the default bridge only touches the REST control plane.
+
+        The node's IP on the NETWORK_NAME docker network is matched against
+        `ip -o -4 addr` output from inside the container. Raises RuntimeError
+        if the container is missing, unattached, `ip` fails, or nothing matches.
+        """
+        if not node.container:
+            raise RuntimeError("Node container not started yet")
+        node.container.reload()
+        settings = node.container.attrs.get("NetworkSettings", {})
+        attachment = settings.get("Networks", {}).get(NETWORK_NAME)
+        address = attachment.get("IPAddress") if attachment else None
+        if not address:
+            raise RuntimeError(f"Container is not attached to the '{NETWORK_NAME}' docker network")
+
+        status, raw = node.container.exec_run(["ip", "-o", "-4", "addr"])
+        if status != 0:
+            raise RuntimeError(f"ip addr failed inside container: {raw}")
+        # With -o each address is one line: "<index>: <ifname> inet <ip>/<prefix> ...".
+        needle = f" {address}/"
+        for entry in raw.decode().splitlines():
+            fields = entry.split()
+            if needle in entry and len(fields) >= 2:
+                return fields[1]
+        raise RuntimeError(f"No interface inside container holds waku IP {address}")
+
+    def clear_p2p(self, node):
+        """
+        Clear tc rules from the node's waku (libp2p) interface, undoing
+        add_packet_loss_p2p_only / add_packet_loss_correlated_p2p_only.
+        """
+        p2p_dev = self._p2p_iface(node)
+        self.clear(node, iface=p2p_dev)
+
+    def add_packet_loss_p2p_only(self, node, percent: float):
+        """
+        Install uncorrelated netem loss of `percent`% on the node's waku
+        (libp2p) interface only; REST API traffic rides a separate interface.
+        """
+        p2p_dev = self._p2p_iface(node)
+        self.clear(node, iface=p2p_dev)  # drop any earlier rule on this interface first
+        netem_args = ["qdisc", "add", "dev", p2p_dev, "root", "netem", "loss", f"{percent}%"]
+        self._exec(node, netem_args, iface=p2p_dev)
+
+    def add_packet_loss_correlated_p2p_only(self, node, percent: float, correlation: float):
+        """
+        Install netem loss of `percent`% with `correlation`% correlation on
+        the node's waku (libp2p) interface only, after clearing earlier
+        rules on it. REST API traffic rides a separate docker interface
+        and stays unaffected.
+        """
+        p2p_dev = self._p2p_iface(node)
+        self.clear(node, iface=p2p_dev)
+        loss_spec = [f"{percent}%", f"{correlation}%"]
+        netem_args = ["qdisc", "add", "dev", p2p_dev, "root", "netem", "loss", *loss_spec]
+        self._exec(node, netem_args, iface=p2p_dev)
diff --git a/tests/e2e/test_network_conditions.py b/tests/e2e/test_network_conditions.py
index 4ec46654b..fed949313 100644
--- a/tests/e2e/test_network_conditions.py
+++ b/tests/e2e/test_network_conditions.py
@@ -1,12 +1,10 @@
 import pytest
-import logging
 from time import time, sleep
 from src.libs.custom_logger import get_custom_logger
 from src.env_vars import NODE_1, NODE_2
 from src.node.waku_node import WakuNode
 from src.steps.relay import StepsRelay
 from src.libs.common import delay
-from src.steps.common import StepsCommon
 from src.steps.network_conditions import TrafficController
 from src.libs.common import to_base64
 
@@ -426,7 +424,7 @@ class TestNetworkConditions(StepsRelay):
         window_s = 30.0
         loss = 50.0
 
-        self.tc.add_packet_loss(self.node1, percent=loss)
+        self.tc.add_packet_loss_p2p_only(self.node1, percent=loss)
         _ = self.node4.get_relay_messages(self.test_pubsub_topic)
 
         for _ in range(total_msgs):
@@ -434,9 +432,9 @@ class TestNetworkConditions(StepsRelay):
 
         delay(window_s)
         uncorrelated = len(self.node4.get_relay_messages(self.test_pubsub_topic) or [])
-        self.tc.clear(self.node1)
+        self.tc.clear_p2p(self.node1)
 
-        self.tc.add_packet_loss_correlated(self.node1, percent=loss, correlation=75.0)
+        self.tc.add_packet_loss_correlated_p2p_only(self.node1, percent=loss, correlation=75.0)
         _ = self.node4.get_relay_messages(self.test_pubsub_topic)
 
         for _ in range(total_msgs):
@@ -444,7 +442,7 @@ class TestNetworkConditions(StepsRelay):
 
         delay(window_s)
         correlated = len(self.node4.get_relay_messages(self.test_pubsub_topic) or [])
-        self.tc.clear(self.node1)
+        self.tc.clear_p2p(self.node1)
 
         logger.debug(f"uncorrelated={uncorrelated} correlated={correlated}")
         assert uncorrelated >= correlated
diff --git a/tests/metrics/test_metrics.py b/tests/metrics/test_metrics.py
index 25eb9791c..e4e0547aa 100644
--- a/tests/metrics/test_metrics.py
+++ b/tests/metrics/test_metrics.py
@@ -94,11 +94,6 @@ class TestMetrics(StepsRelay, StepsMetrics, StepsFilter, StepsLightPush, StepsSt
         self.check_metric(self.publishing_node1, "waku_histogram_message_size_count", 1)
         self.check_metric(self.publishing_node1, 'waku_node_messages_total{type="relay"}', 1)
         if self.store_node1.is_nwaku():
-            self.check_metric(
-                self.store_node1,
-                f'waku_service_peers{{protocol="/vac/waku/store/2.0.0-beta4",peerId="{self.publishing_node1.get_tcp_address()}"}}',
-                1,
-            )
             self.check_metric(
                 self.store_node1,
                 f'waku_service_peers{{protocol="/vac/waku/store-query/3.0.0",peerId="{self.publishing_node1.get_tcp_address()}"}}',