2026-04-27 10:42:46 +02:00

705 lines
29 KiB
Python

from concurrent.futures import ThreadPoolExecutor
from time import time_ns
import pytest
from src.env_vars import NODE_2
from src.steps.common import StepsCommon
from src.libs.common import delay, to_base64
from src.libs.custom_logger import get_custom_logger
from src.node.waku_node import WakuNode
from src.node.wrappers_manager import WrapperManager
from src.node.wrapper_helpers import (
EventCollector,
create_message_bindings,
get_node_multiaddr,
wait_for_propagated,
wait_for_sent,
wait_for_error,
)
from src.steps.store import StepsStore
# Module-level logger for the tests in this file.
logger = get_custom_logger(__name__)

# Timeout (seconds) passed to wait_for_propagated() when a propagated event is expected.
PROPAGATED_TIMEOUT_S = 30.0
# Timeout (seconds) passed to wait_for_sent() in S17 (sent event expected) and
# S22 (sent event must not show up within this window).
SENT_TIMEOUT_S = 10.0
# Observation window (seconds) used in S19 to check that no sent event arrives
# before the store peer is started.
NO_SENT_OBSERVATION_S = 5.0
# Timeout (seconds) used in S19 for the sent event after the store peer has been added.
SENT_AFTER_STORE_TIMEOUT_S = 60.0
# Observation window (seconds) used in S23/S24 to check that no sent event arrives at all.
NO_STORE_OBSERVATION_S = 60.0
# MaxTimeInCache from send_service.nim.
MAX_TIME_IN_CACHE_S = 60.0
# Extra slack to cover the background retry loop tick after the window expires.
CACHE_EXPIRY_SLACK_S = 10.0
# Timeout (seconds) used in S21 while waiting for the error event: cache window plus slack.
ERROR_AFTER_CACHE_EXPIRY_TIMEOUT_S = MAX_TIME_IN_CACHE_S + CACHE_EXPIRY_SLACK_S
# Exact value S21 expects in the "error" field of the collected error event.
RETRY_WINDOW_EXPIRED_MSG = "Unable to send within retry time window"
# S30: concurrent sends on the same content topic during initial auto-subscribe.
# Number of messages sent in parallel (also the thread-pool size) in S30.
S30_CONCURRENT_SENDS = 5
# Content topic shared by every S30 message.
S30_CONTENT_TOPIC = "/test/1/s30-concurrent/proto"
class TestSendBeforeRelay(StepsStore):
    """Send-API delivery scenarios S17, S19, S21-S24, S26 and S30.

    Every test starts a sender through ``WrapperManager.create_and_start`` with
    an ``EventCollector`` registered as the event callback, calls
    ``send_message()`` and then asserts which of the propagated / sent / error
    events are (or are not) collected for the returned request id.
    """

    def test_s17_send_before_relay_peers_joins(self, node_config):
        """
        S17: sender starts isolated, calls send(), then a relay peer joins.
        - send() returns Ok(RequestId) immediately
        - Propagated event eventually arrives
        - Sent event arrives as well (the joining relay peer has store enabled)
        """
        sender_collector = EventCollector()
        node_config.update(
            {
                "relay": True,
                "store": False,
                "discv5Discovery": False,
                "numShardsInNetwork": 1,
            }
        )
        sender_result = WrapperManager.create_and_start(
            config=node_config,
            event_cb=sender_collector.event_callback,
        )
        assert sender_result.is_ok(), f"Failed to start sender: {sender_result.err()}"

        with sender_result.ok_value as sender_node:
            # Step 1: send while the sender has no peers at all.
            message = create_message_bindings()
            send_result = sender_node.send_message(message=message)
            assert send_result.is_ok(), f"send() must return Ok(RequestId) even with no peers, got: {send_result.err()}"
            request_id = send_result.ok_value
            assert request_id, "send() returned an empty RequestId"

            # Step 2: start a relay peer with store enabled.
            relay_config = {
                **node_config,
                "staticnodes": [get_node_multiaddr(sender_node)],
                "portsshift": 1,
                "store": True,
            }
            relay_result = WrapperManager.create_and_start(config=relay_config)
            assert relay_result.is_ok(), f"Failed to start relay peer: {relay_result.err()}"

            with relay_result.ok_value:
                propagated_event = wait_for_propagated(
                    collector=sender_collector,
                    request_id=request_id,
                    timeout_s=PROPAGATED_TIMEOUT_S,
                )
                assert propagated_event is not None, (
                    f"No MessagePropagatedEvent received within {PROPAGATED_TIMEOUT_S}s "
                    f"after relay peer joined. Collected events: {sender_collector.events}"
                )

                sent_event = wait_for_sent(
                    collector=sender_collector,
                    request_id=request_id,
                    timeout_s=SENT_TIMEOUT_S,
                )
                assert sent_event is not None, (
                    f"No MessageSentEvent received within {SENT_TIMEOUT_S}s "
                    f"from a store-enabled relay peer. Collected events: {sender_collector.events}"
                )

    def test_s23_no_sent_event_when_relay_has_no_store(self, node_config):
        """
        S23: non-ephemeral message, reliability enabled, no store peer ever reachable.
        - Expected: Ok(RequestId), Propagated event only, no Sent and no terminal error.
        """
        sender_collector = EventCollector()
        node_config.update(
            {
                "relay": True,
                "store": False,
                "discv5Discovery": False,
                "numShardsInNetwork": 1,
                "reliabilityEnabled": True,
            }
        )
        sender_result = WrapperManager.create_and_start(
            config=node_config,
            event_cb=sender_collector.event_callback,
        )
        assert sender_result.is_ok(), f"Failed to start sender: {sender_result.err()}"

        with sender_result.ok_value as sender_node:
            message = create_message_bindings(ephemeral=False)
            send_result = sender_node.send_message(message=message)
            assert send_result.is_ok(), f"send() must return Ok(RequestId) even with no peers, got: {send_result.err()}"
            request_id = send_result.ok_value
            assert request_id, "send() returned an empty RequestId"

            # The only peer that ever joins is a relay peer without store.
            relay_config = {
                **node_config,
                "staticnodes": [get_node_multiaddr(sender_node)],
                "portsshift": 1,
                "store": False,
            }
            relay_result = WrapperManager.create_and_start(config=relay_config)
            assert relay_result.is_ok(), f"Failed to start relay peer: {relay_result.err()}"

            with relay_result.ok_value:
                propagated_event = wait_for_propagated(
                    collector=sender_collector,
                    request_id=request_id,
                    timeout_s=PROPAGATED_TIMEOUT_S,
                )
                assert propagated_event is not None, (
                    f"No MessagePropagatedEvent received within {PROPAGATED_TIMEOUT_S}s "
                    f"after relay peer joined. Collected events: {sender_collector.events}"
                )

                # Negative check: watch for the whole observation window.
                sent_event = wait_for_sent(
                    collector=sender_collector,
                    request_id=request_id,
                    timeout_s=NO_STORE_OBSERVATION_S,
                )
                assert sent_event is None, (
                    f"Unexpected MessageSentEvent within {NO_STORE_OBSERVATION_S}s "
                    "when relay peer has store=false.\n"
                    f"Sent event: {sent_event}\n"
                    f"Collected events: {sender_collector.events}"
                )

                # Regression guard: current behavior must NOT convert "no store
                # reachable" into an immediate terminal error. If a future change
                # starts emitting one, this assertion will catch it.
                error_event = wait_for_error(
                    collector=sender_collector,
                    request_id=request_id,
                    timeout_s=0,
                )
                assert error_event is None, (
                    "Unexpected terminal error event when no store peer is reachable. "
                    "S23 expects silent behavior (Propagated only).\n"
                    f"Error event: {error_event}\n"
                    f"Collected events: {sender_collector.events}"
                )

    def test_s19_store_peer_appears_after_propagation(self, node_config):
        """
        S19: a store peer comes online later.
        - send() returns Ok(RequestId) immediately
        - Propagated --- relay peer
        - Sent when store peer is reachable

        TODO: open question for Zoltan - is reliability = true mandatory for
        the store peer, and what is the effect of reliability here?
        """
        sender_collector = EventCollector()
        # NOTE: "p2preliability" is intentionally not set on any node here;
        # the option is not supported.
        node_config.update(
            {
                "relay": True,
                "store": False,
                "discv5Discovery": False,
                "numShardsInNetwork": 1,
            }
        )
        sender_result = WrapperManager.create_and_start(
            config=node_config,
            event_cb=sender_collector.event_callback,
        )
        assert sender_result.is_ok(), f"Failed to start sender: {sender_result.err()}"

        with sender_result.ok_value as sender_node:
            # Relay peer (no store).
            relay_config = {
                **node_config,
                "staticnodes": [get_node_multiaddr(sender_node)],
                "portsshift": 1,
                "store": False,
            }
            relay_result = WrapperManager.create_and_start(config=relay_config)
            assert relay_result.is_ok(), f"Failed to start relay peer: {relay_result.err()}"

            with relay_result.ok_value as relay_peer:
                # send(). Must return Ok(RequestId) immediately.
                message = create_message_bindings()
                send_result = sender_node.send_message(message=message)
                assert send_result.is_ok(), f"send() must return Ok(RequestId), got: {send_result.err()}"
                request_id = send_result.ok_value
                assert request_id, "send() returned an empty RequestId"

                # Propagated should arrive via the relay peer.
                propagated_event = wait_for_propagated(
                    collector=sender_collector,
                    request_id=request_id,
                    timeout_s=PROPAGATED_TIMEOUT_S,
                )
                assert propagated_event is not None, (
                    f"No MessagePropagatedEvent received within {PROPAGATED_TIMEOUT_S}s. "
                    f"Collected events: {sender_collector.events}"
                )

                # No store peer exists yet, so no sent event may show up.
                early_sent_event = wait_for_sent(
                    collector=sender_collector,
                    request_id=request_id,
                    timeout_s=NO_SENT_OBSERVATION_S,
                )
                assert early_sent_event is None, (
                    "MessageSentEvent arrived before any store peer was reachable. "
                    f"Event: {early_sent_event}"
                )

                # Store peer, connected to both the relay peer and the sender.
                # NOTE(review): store_node is never stopped in this test -
                # presumably torn down by a fixture; confirm.
                store_node = WakuNode(NODE_2, "store_node")
                store_node.start(relay="true", store="true", discv5_discovery="false")
                store_node.set_relay_subscriptions([self.test_pubsub_topic])
                relay_multiaddr = get_node_multiaddr(relay_peer)
                sender_multiaddr = get_node_multiaddr(sender_node)
                store_node.add_peers([relay_multiaddr, sender_multiaddr])
                delay(3)

                sent_event = wait_for_sent(
                    collector=sender_collector,
                    request_id=request_id,
                    timeout_s=SENT_AFTER_STORE_TIMEOUT_S,
                )
                assert sent_event is not None, (
                    f"No MessageSentEvent received within {SENT_AFTER_STORE_TIMEOUT_S}s "
                    f"after store peer joined. Collected events: {sender_collector.events}"
                )

                self.check_published_message_is_stored(
                    store_node=store_node,
                    pubsub_topic=self.test_pubsub_topic,
                    messages_to_check=[message],
                    page_size=5,
                    ascending="true",
                )

    def test_s21_error_when_retry_window_expires(self, node_config):
        """
        S21: delivery retry window expires before any valid path recovers.
        - send() returns Ok(RequestId)
        - an error event carrying RETRY_WINDOW_EXPIRED_MSG eventually arrives
        """
        sender_collector = EventCollector()
        node_config.update(
            {
                "relay": True,
                "store": False,
                "lightpush": False,
                "filter": False,
                "discv5Discovery": False,
                "numShardsInNetwork": 1,
            }
        )
        sender_result = WrapperManager.create_and_start(
            config=node_config,
            event_cb=sender_collector.event_callback,
        )
        assert sender_result.is_ok(), f"Failed to start sender: {sender_result.err()}"

        with sender_result.ok_value as sender_node:
            message = create_message_bindings()
            send_result = sender_node.send_message(message=message)
            assert send_result.is_ok(), f"send() must return Ok(RequestId) even with no peers, got: {send_result.err()}"
            request_id = send_result.ok_value
            assert request_id, "send() returned an empty RequestId"

            # No peer is ever started: just wait for the error event.
            error_event = wait_for_error(
                collector=sender_collector,
                request_id=request_id,
                timeout_s=ERROR_AFTER_CACHE_EXPIRY_TIMEOUT_S,
            )
            assert error_event is not None, (
                f"No MessageErrorEvent received within {ERROR_AFTER_CACHE_EXPIRY_TIMEOUT_S}s "
                f"(MaxTimeInCache={MAX_TIME_IN_CACHE_S}s + slack). "
                f"Collected events: {sender_collector.events}"
            )
            logger.info(f"S21 received error event: {error_event}")

            assert error_event.get("error") == RETRY_WINDOW_EXPIRED_MSG, (
                "Unexpected error message in message_error event.\n"
                f"Expected: {RETRY_WINDOW_EXPIRED_MSG!r}\n"
                f"Got: {error_event.get('error')!r}\n"
                f"Full event: {error_event}"
            )

    def test_s22_non_ephemeral_message_with_reliability_disabled(self, node_config):
        """
        S22: non-ephemeral message with reliabilityEnabled disabled.
        - propagation path exists, reliabilityEnabled = false.
        - Expected: Ok(RequestId), Propagated event only, no Sent event.
        Note: S17 already covers the positive path of this test with reliabilityEnabled=True.
        """
        sender_collector = EventCollector()
        node_config.update(
            {
                "relay": True,
                "store": False,
                "discv5Discovery": False,
                "numShardsInNetwork": 1,
                "reliabilityEnabled": False,
            }
        )
        sender_result = WrapperManager.create_and_start(
            config=node_config,
            event_cb=sender_collector.event_callback,
        )
        assert sender_result.is_ok(), f"Failed to start sender: {sender_result.err()}"

        with sender_result.ok_value as sender_node:
            relay_config = {
                **node_config,
                "staticnodes": [get_node_multiaddr(sender_node)],
                "portsshift": 1,
                "store": True,
            }
            relay_result = WrapperManager.create_and_start(config=relay_config)
            assert relay_result.is_ok(), f"Failed to start relay peer: {relay_result.err()}"

            with relay_result.ok_value:
                message = create_message_bindings(ephemeral=False)
                send_result = sender_node.send_message(message=message)
                assert send_result.is_ok(), f"send() must return Ok(RequestId), got: {send_result.err()}"
                request_id = send_result.ok_value
                assert request_id, "send() returned an empty RequestId"

                propagated_event = wait_for_propagated(
                    collector=sender_collector,
                    request_id=request_id,
                    timeout_s=PROPAGATED_TIMEOUT_S,
                )
                assert propagated_event is not None, (
                    f"No MessagePropagatedEvent received within {PROPAGATED_TIMEOUT_S}s. "
                    f"Collected events: {sender_collector.events}"
                )

                sent_event = wait_for_sent(
                    collector=sender_collector,
                    request_id=request_id,
                    timeout_s=SENT_TIMEOUT_S,
                )
                assert sent_event is None, (
                    "Unexpected MessageSentEvent received when reliabilityEnabled is disabled.\n"
                    f"Sent event: {sent_event}\n"
                    f"Collected events: {sender_collector.events}"
                )

    def test_s24_ephemeral_message_with_reachable_store(self, node_config):
        """
        S24: ephemeral message, reliability enabled, reachable store peer.
        - Setup: propagation path exists, relay peer has store=True (reachable),
        - Expected: Ok(RequestId), Propagated event only, no Sent event.
        """
        sender_collector = EventCollector()
        node_config.update(
            {
                "relay": True,
                "store": False,
                "discv5Discovery": False,
                "numShardsInNetwork": 1,
                "reliabilityEnabled": True,
            }
        )
        sender_result = WrapperManager.create_and_start(
            config=node_config,
            event_cb=sender_collector.event_callback,
        )
        assert sender_result.is_ok(), f"Failed to start sender: {sender_result.err()}"

        with sender_result.ok_value as sender_node:
            relay_config = {
                **node_config,
                "staticnodes": [get_node_multiaddr(sender_node)],
                "portsshift": 1,
                "store": True,
            }
            relay_result = WrapperManager.create_and_start(config=relay_config)
            assert relay_result.is_ok(), f"Failed to start relay peer: {relay_result.err()}"

            with relay_result.ok_value:
                message = create_message_bindings(ephemeral=True)
                send_result = sender_node.send_message(message=message)
                assert send_result.is_ok(), f"send() must return Ok(RequestId), got: {send_result.err()}"
                request_id = send_result.ok_value
                assert request_id, "send() returned an empty RequestId"

                propagated_event = wait_for_propagated(
                    collector=sender_collector,
                    request_id=request_id,
                    timeout_s=PROPAGATED_TIMEOUT_S,
                )
                assert propagated_event is not None, (
                    f"No MessagePropagatedEvent received within {PROPAGATED_TIMEOUT_S}s. "
                    f"Collected events: {sender_collector.events}"
                )

                sent_event = wait_for_sent(
                    collector=sender_collector,
                    request_id=request_id,
                    timeout_s=NO_STORE_OBSERVATION_S,
                )
                assert sent_event is None, (
                    "Unexpected MessageSentEvent for an ephemeral message. "
                    "Ephemeral messages must never be store-validated.\n"
                    f"Sent event: {sent_event}\n"
                    f"Collected events: {sender_collector.events}"
                )

    # NOTE(review): `s26_note` is a custom marker; unless it is registered in the
    # pytest configuration it raises PytestUnknownMarkWarning (an error under
    # --strict-markers) - confirm it is registered.
    @pytest.mark.s26_note("LightPush not used, falls back to Relay")
    def test_s26_lightpush_peer_churn_alternate_remains(self, node_config):
        """
        S26: multiple lightpush peers, the selected one disappears,
        an alternate remains.
        - Propagated event eventually arrives (via P2)
        - no message_error
        """
        sender_collector = EventCollector()

        # Two lightpush server peers: relay+lightpush, connected to each other.
        peer1_config = {
            **node_config,
            "relay": True,
            "lightpush": True,
            "store": False,
            "filter": False,
            "discv5Discovery": True,
            "numShardsInNetwork": 1,
            "portsshift": 1,
        }
        peer1_result = WrapperManager.create_and_start(config=peer1_config)
        assert peer1_result.is_ok(), f"Failed to start lightpush peer1: {peer1_result.err()}"
        peer1 = peer1_result.ok_value

        # peer1 is stopped by hand in the middle of the test, so it is not
        # wrapped in a ``with`` block. Track whether that explicit stop was
        # reached so that a failure before it does not leak the running node.
        peer1_stopped = False
        try:
            peer2_config = {
                **peer1_config,
                "staticnodes": [get_node_multiaddr(peer1)],
                "portsshift": 2,
            }
            peer2_result = WrapperManager.create_and_start(config=peer2_config)
            assert peer2_result.is_ok(), f"Failed to start lightpush peer2: {peer2_result.err()}"

            with peer2_result.ok_value as peer2:
                # Sender is a lightpush client: lightpush enabled,
                # both peers in staticnodes so the sender has a choice.
                sender_config = {
                    **node_config,
                    "lightpushnode": get_node_multiaddr(peer1),
                    "relay": True,
                    "lightpush": True,
                    "store": False,
                    "filter": False,
                    "discv5Discovery": False,
                    "numShardsInNetwork": 1,
                    "portsshift": 3,
                    "staticnodes": [
                        get_node_multiaddr(peer1),
                        get_node_multiaddr(peer2),
                    ],
                }
                sender_result = WrapperManager.create_and_start(
                    config=sender_config,
                    event_cb=sender_collector.event_callback,
                )
                assert sender_result.is_ok(), f"Failed to start sender: {sender_result.err()}"

                with sender_result.ok_value as sender_node:
                    delay(2)

                    # Churn: take the configured lightpush node away before sending.
                    stop_result = peer1.stop_and_destroy()
                    # The stop was attempted; never call it a second time in `finally`.
                    peer1_stopped = True
                    assert stop_result.is_ok(), f"Failed to stop peer1: {stop_result.err()}"

                    message = create_message_bindings()
                    send_result = sender_node.send_message(message=message)
                    assert send_result.is_ok(), (
                        "send() must return Ok(RequestId) during peer churn, "
                        f"got: {send_result.err()}"
                    )
                    request_id = send_result.ok_value
                    assert request_id, "send() returned an empty RequestId"

                    # Expect Propagated via the surviving lightpush peer (peer2).
                    propagated_event = wait_for_propagated(
                        collector=sender_collector,
                        request_id=request_id,
                        timeout_s=PROPAGATED_TIMEOUT_S,
                    )
                    assert propagated_event is not None, (
                        f"No MessagePropagatedEvent within {PROPAGATED_TIMEOUT_S}s "
                        "after the selected lightpush peer disappeared. "
                        f"Collected events: {sender_collector.events}"
                    )

                    error_event = wait_for_error(
                        collector=sender_collector,
                        request_id=request_id,
                        timeout_s=0,
                    )
                    assert error_event is None, f"Unexpected message_error event during peer churn: {error_event}"
        finally:
            if not peer1_stopped:
                # Best-effort cleanup on an early failure; report but do not
                # mask the original assertion error.
                cleanup_result = peer1.stop_and_destroy()
                if not cleanup_result.is_ok():
                    logger.warning(f"S26 cleanup: failed to stop peer1: {cleanup_result.err()}")

    def test_s30_concurrent_sends_during_auto_subscribe(self, node_config):
        """
        S30: concurrent sends on the same content topic during initial auto-subscribe.
        - Each call must return Ok(RequestId) with a unique id.
        - Each request id gets its own Propagated event and no error event.
        - No collected event carries a request id that was not issued here.
        """
        sender_collector = EventCollector()
        node_config.update(
            {
                "relay": True,
                "store": False,
                "discv5Discovery": False,
                "numShardsInNetwork": 1,
            }
        )
        sender_result = WrapperManager.create_and_start(
            config=node_config,
            event_cb=sender_collector.event_callback,
        )
        assert sender_result.is_ok(), f"Failed to start sender: {sender_result.err()}"

        with sender_result.ok_value as sender_node:
            # Relay peer so the sender has a propagation path.
            relay_config = {
                **node_config,
                "staticnodes": [get_node_multiaddr(sender_node)],
                "portsshift": 1,
            }
            relay_result = WrapperManager.create_and_start(config=relay_config)
            assert relay_result.is_ok(), f"Failed to start relay peer: {relay_result.err()}"

            with relay_result.ok_value:
                # Build one message per send, with distinct payloads so we can
                # detect any cross-association between request ids and events.
                messages = [
                    create_message_bindings(
                        contentTopic=S30_CONTENT_TOPIC,
                        payload=to_base64(f"s30-concurrent-{i}"),
                    )
                    for i in range(S30_CONCURRENT_SENDS)
                ]

                # Fire all sends concurrently. The sender is not yet subscribed
                # to S30_CONTENT_TOPIC, so this exercises the auto-subscribe path
                # under contention.
                with ThreadPoolExecutor(max_workers=S30_CONCURRENT_SENDS) as pool:
                    send_results = list(pool.map(sender_node.send_message, messages))

                # Every send must return Ok(RequestId).
                request_ids = []
                for i, send_result in enumerate(send_results):
                    assert send_result.is_ok(), f"Concurrent send #{i} failed: {send_result.err()}"
                    request_id = send_result.ok_value
                    assert request_id, f"Concurrent send #{i} returned an empty RequestId"
                    request_ids.append(request_id)

                # Request ids must be unique across concurrent sends.
                assert len(set(request_ids)) == len(request_ids), f"Duplicate RequestIds returned by concurrent sends: {request_ids}"

                # Each request id must get its own propagated event and no error.
                for request_id in request_ids:
                    propagated_event = wait_for_propagated(
                        collector=sender_collector,
                        request_id=request_id,
                        timeout_s=PROPAGATED_TIMEOUT_S,
                    )
                    assert propagated_event is not None, (
                        f"No MessagePropagatedEvent for request_id={request_id} "
                        f"within {PROPAGATED_TIMEOUT_S}s. "
                        f"Collected events: {sender_collector.events}"
                    )

                    error_event = wait_for_error(
                        collector=sender_collector,
                        request_id=request_id,
                        timeout_s=0,
                    )
                    assert error_event is None, f"Unexpected message_error for request_id={request_id}: {error_event}"

                # Cross-association guard: every event with a requestId must
                # belong to exactly one of the request ids we issued.
                issued = set(request_ids)
                for event in sender_collector.events:
                    event_request_id = event.get("requestId")
                    if event_request_id is None:
                        continue
                    assert event_request_id in issued, (
                        f"Event carries an unknown requestId={event_request_id!r}, "
                        f"not in issued set {issued}. Event: {event}"
                    )
class TestS06CoreSenderRelayOnly(StepsCommon):
    """
    S06 — Core sender with relay peers only, no store.
    Sender has local relay enabled and is connected to one relay peer.
    Expected: send() returns Ok(RequestId), message_propagated event arrives,
    no message_sent (store disabled), no message_error.
    """

    def test_s06_relay_propagation_without_store(self, node_config):
        """
        Send one message through a relay-only sender/peer pair and check that
        only the propagated event is collected for the returned request id.
        """
        sender_collector = EventCollector()
        node_config.update(
            {
                "relay": True,
                "store": False,
                "lightpush": False,
                "filter": False,
                "discv5Discovery": False,
                "numShardsInNetwork": 1,
            }
        )
        sender_result = WrapperManager.create_and_start(
            config=node_config,
            event_cb=sender_collector.event_callback,
        )
        assert sender_result.is_ok(), f"Failed to start sender: {sender_result.err()}"

        with sender_result.ok_value as sender:
            # Relay peer that dials the sender; it inherits store=False.
            peer_config = {
                **node_config,
                "staticnodes": [get_node_multiaddr(sender)],
                "portsshift": 1,
            }
            peer_result = WrapperManager.create_and_start(config=peer_config)
            assert peer_result.is_ok(), f"Failed to start relay peer: {peer_result.err()}"

            with peer_result.ok_value:
                message = self.create_message(
                    payload=to_base64("S06 relay-only test payload"),
                    contentTopic="/test/1/s06-relay-only/proto",
                )
                send_result = sender.send_message(message=message)
                assert send_result.is_ok(), f"send() failed: {send_result.err()}"
                request_id = send_result.ok_value
                assert request_id, "send() returned an empty RequestId"

                propagated = wait_for_propagated(
                    collector=sender_collector,
                    request_id=request_id,
                    timeout_s=PROPAGATED_TIMEOUT_S,
                )
                assert propagated is not None, (
                    f"No message_propagated event within {PROPAGATED_TIMEOUT_S}s. "
                    f"Collected events: {sender_collector.events}"
                )
                assert propagated["requestId"] == request_id, (
                    f"message_propagated event carries requestId={propagated['requestId']!r}, "
                    f"expected {request_id!r}"
                )

                # timeout_s=0: only inspect what has already been collected.
                error = wait_for_error(
                    collector=sender_collector,
                    request_id=request_id,
                    timeout_s=0,
                )
                assert error is None, f"Unexpected message_error event: {error}"

                sent = wait_for_sent(
                    collector=sender_collector,
                    request_id=request_id,
                    timeout_s=0,
                )
                assert sent is None, f"Unexpected message_sent event (store is disabled): {sent}"