mirror of
https://github.com/logos-messaging/logos-messaging-interop-tests.git
synced 2026-05-18 08:19:32 +00:00
Fix nim waku daily failures (#169)
* Add REST API traffic bypass for network conditions manipulation
  - Introduced methods to apply packet loss only to P2P traffic, excluding REST API traffic.
  - Simplified test cases to leverage new differentiated packet loss handling.
  - Removed unused and legacy metrics/tests for cleaner configuration and coverage.

* Refactor network conditions setup to streamline command execution

* Pin priomap so libp2p traffic actually hits netem

  The default prio qdisc priomap routes SO_PRIORITY 6 and 7 to band 0, which is our REST bypass class 1:1. libp2p/gossipsub packets set a high SO_PRIORITY on their sockets, so they were silently escaping the netem impairment via the priomap rather than through the u32 filter. The result: test_relay_packet_loss_correlated_vs_uncorrelated became green by accident because no loss was ever applied to relay traffic.

  Forcing priomap to 1 1 1 1 ... on all 16 slots routes every SO_PRIORITY value to band 1 (netem). The u32 filter remains the only path to 1:1, so REST stays isolated and libp2p now takes the configured loss.

  Verified in alpine netns: with SO_PRIORITY=6, 50 packets to a non-REST port ended up in 1:1 under the old rules (0 drops); with the forced priomap they land in 1:2 and see the expected ~50% drop rate.

* Refactor P2P traffic loss handling; isolate REST API traffic
  - Added `_p2p_iface` to dynamically detect libp2p interface tied to the Waku network.
  - Introduced `add_packet_loss_p2p_only` and `add_packet_loss_correlated_p2p_only` for targeted packet loss on libp2p traffic.
  - Replaced REST API traffic bypass logic with simplified P2P interface-based tc rules.
  - Updated tests to use `clear_p2p` for cleanup, ensuring REST traffic remains unaffected.

---------

Co-authored-by: Egor Rachkovskii <egorrachkovskii@status.im>
This commit is contained in:
parent
8644151be3
commit
639dffc505
@ -1,4 +1,5 @@
|
||||
import subprocess
|
||||
from src.env_vars import NETWORK_NAME
|
||||
from src.libs.custom_logger import get_custom_logger
|
||||
|
||||
logger = get_custom_logger(__name__)
|
||||
@ -133,3 +134,68 @@ class TrafficController:
|
||||
],
|
||||
iface=iface,
|
||||
)
|
||||
|
||||
def _p2p_iface(self, node) -> str:
    """
    Return the name of the container interface attached to the waku
    network (where libp2p traffic flows).

    DockerManager attaches each node to two networks: the default bridge
    (where host-published ports land, typically `eth0`) and the waku
    network (where inter-container libp2p/gossipsub traffic flows, typically
    `eth1`). tc on the default bridge only affects REST control plane; for
    a packet loss test targeting libp2p we need the waku interface.

    The interface is resolved by reading the node's IP on the
    `NETWORK_NAME` docker network from the container attributes and
    matching it against `ip -o -4 addr` output from inside the container.

    Args:
        node: Object exposing a `container` attribute (a Docker container
            handle providing `reload()`, `attrs` and `exec_run()`).

    Returns:
        The interface name, i.e. the second whitespace-separated field of
        the `ip -o -4 addr` line that carries the waku IP.

    Raises:
        RuntimeError: If the container is not started, is not attached to
            the waku network, `ip addr` exits non-zero, or no interface
            holds the waku IP.
    """
    if not node.container:
        raise RuntimeError("Node container not started yet")

    # Re-read the container state before inspecting its network settings.
    node.container.reload()

    # Either level may be present but null; treat that the same as missing
    # so the caller gets the RuntimeError below, not an AttributeError.
    settings = node.container.attrs.get("NetworkSettings") or {}
    networks = settings.get("Networks") or {}
    waku_net = networks.get(NETWORK_NAME)
    if not waku_net or not waku_net.get("IPAddress"):
        raise RuntimeError(f"Container is not attached to the '{NETWORK_NAME}' docker network")
    waku_ip = waku_net["IPAddress"]

    exit_code, output = node.container.exec_run(["ip", "-o", "-4", "addr"])
    # Decode once so both the error message and the parser work on text
    # rather than on a bytes repr such as b'...\n'.
    text = (output or b"").decode(errors="replace")
    if exit_code != 0:
        raise RuntimeError(f"ip addr failed inside container: {text.strip()}")

    for line in text.splitlines():
        # One-line format: "<idx>: <ifname>    inet <ip>/<prefix> ...".
        # The surrounding space and slash keep 10.0.0.1 from matching 10.0.0.10.
        if f" {waku_ip}/" in line:
            tokens = line.split()
            if len(tokens) >= 2:
                return tokens[1]
    raise RuntimeError(f"No interface inside container holds waku IP {waku_ip}")
|
||||
|
||||
def clear_p2p(self, node):
    """
    Drop whatever tc rule is currently installed on the node's waku
    (libp2p) interface.

    This is the cleanup counterpart of add_packet_loss_p2p_only and
    add_packet_loss_correlated_p2p_only: the interface is looked up with
    `_p2p_iface` and handed to `clear`.
    """
    waku_iface = self._p2p_iface(node)
    self.clear(node, iface=waku_iface)
|
||||
|
||||
def add_packet_loss_p2p_only(self, node, percent: float):
    """
    Install uncorrelated netem packet loss on the node's waku (libp2p)
    interface only.

    The interface comes from `_p2p_iface`; any existing rule on it is
    cleared first and a root netem qdisc with `loss {percent}%` is then
    added. Per this module's design, REST API traffic uses a different
    docker interface, so the harness's control plane is left untouched.
    """
    dev = self._p2p_iface(node)
    # Start from a clean interface (presumably so the `qdisc add ... root`
    # below does not collide with a rule left by an earlier call).
    self.clear(node, iface=dev)
    tc_args = f"qdisc add dev {dev} root netem loss {percent}%".split()
    self._exec(node, tc_args, iface=dev)
|
||||
|
||||
def add_packet_loss_correlated_p2p_only(self, node, percent: float, correlation: float):
    """
    Install correlated netem packet loss on the node's waku (libp2p)
    interface only.

    Works like add_packet_loss_p2p_only, except that the netem rule also
    carries a correlation value: `loss {percent}% {correlation}%`. The
    interface is resolved via `_p2p_iface` and cleared before the new root
    qdisc is added; per this module's design REST traffic travels over a
    different docker interface and is not impaired.
    """
    dev = self._p2p_iface(node)
    self.clear(node, iface=dev)
    tc_args = f"qdisc add dev {dev} root netem loss {percent}% {correlation}%".split()
    self._exec(node, tc_args, iface=dev)
|
||||
|
||||
@ -1,12 +1,10 @@
|
||||
import pytest
|
||||
import logging
|
||||
from time import time, sleep
|
||||
from src.libs.custom_logger import get_custom_logger
|
||||
from src.env_vars import NODE_1, NODE_2
|
||||
from src.node.waku_node import WakuNode
|
||||
from src.steps.relay import StepsRelay
|
||||
from src.libs.common import delay
|
||||
from src.steps.common import StepsCommon
|
||||
from src.steps.network_conditions import TrafficController
|
||||
from src.libs.common import to_base64
|
||||
|
||||
@ -426,7 +424,7 @@ class TestNetworkConditions(StepsRelay):
|
||||
window_s = 30.0
|
||||
loss = 50.0
|
||||
|
||||
self.tc.add_packet_loss(self.node1, percent=loss)
|
||||
self.tc.add_packet_loss_p2p_only(self.node1, percent=loss)
|
||||
_ = self.node4.get_relay_messages(self.test_pubsub_topic)
|
||||
|
||||
for _ in range(total_msgs):
|
||||
@ -434,9 +432,9 @@ class TestNetworkConditions(StepsRelay):
|
||||
|
||||
delay(window_s)
|
||||
uncorrelated = len(self.node4.get_relay_messages(self.test_pubsub_topic) or [])
|
||||
self.tc.clear(self.node1)
|
||||
self.tc.clear_p2p(self.node1)
|
||||
|
||||
self.tc.add_packet_loss_correlated(self.node1, percent=loss, correlation=75.0)
|
||||
self.tc.add_packet_loss_correlated_p2p_only(self.node1, percent=loss, correlation=75.0)
|
||||
_ = self.node4.get_relay_messages(self.test_pubsub_topic)
|
||||
|
||||
for _ in range(total_msgs):
|
||||
@ -444,7 +442,7 @@ class TestNetworkConditions(StepsRelay):
|
||||
|
||||
delay(window_s)
|
||||
correlated = len(self.node4.get_relay_messages(self.test_pubsub_topic) or [])
|
||||
self.tc.clear(self.node1)
|
||||
self.tc.clear_p2p(self.node1)
|
||||
|
||||
logger.debug(f"uncorrelated={uncorrelated} correlated={correlated}")
|
||||
assert uncorrelated >= correlated
|
||||
|
||||
@ -94,11 +94,6 @@ class TestMetrics(StepsRelay, StepsMetrics, StepsFilter, StepsLightPush, StepsSt
|
||||
self.check_metric(self.publishing_node1, "waku_histogram_message_size_count", 1)
|
||||
self.check_metric(self.publishing_node1, 'waku_node_messages_total{type="relay"}', 1)
|
||||
if self.store_node1.is_nwaku():
|
||||
self.check_metric(
|
||||
self.store_node1,
|
||||
f'waku_service_peers{{protocol="/vac/waku/store/2.0.0-beta4",peerId="{self.publishing_node1.get_tcp_address()}"}}',
|
||||
1,
|
||||
)
|
||||
self.check_metric(
|
||||
self.store_node1,
|
||||
f'waku_service_peers{{protocol="/vac/waku/store-query/3.0.0",peerId="{self.publishing_node1.get_tcp_address()}"}}',
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user