From 4ba30a8aff2a52c8b60573e9bc1a7d455ab80821 Mon Sep 17 00:00:00 2001 From: AYAHASSAN287 <49167455+AYAHASSAN287@users.noreply.github.com> Date: Thu, 14 May 2026 15:48:14 +0300 Subject: [PATCH 1/9] e2e part3 (#181) * add test s17 * Add temp changes * Add s17 positive / negative scenarios * add S19 * Add S06 relay-only test and fix wrapper helpers (#173) * - Add S06 relay-only test case for testing message propagation without a store. - Update `wrapper_helpers` for clearer event type handling and type annotations (`Optional[...]` usage). - Simplify `get_node_multiaddr` to retrieve addresses via `get_node_info_raw`. - Refactor `wrappers_manager` to adjust bindings path to `vendor` directory and add `get_node_info_raw` method. - Update `.gitignore` to exclude `store.sqlite3*`. * Refactor S06 relay-only test: replace try-finally blocks with context managers for clarity and conciseness. * Migrate S06 relay-only test to `test_send_e2e.py` and refactor with `StepsCommon` for reusability. --------- Co-authored-by: Egor Rachkovskii * Modify S19 test * Adding S21 * Fix review comments * Adding S22/S23 * Adding S24 * Add S26 * Add S30 * Add S31 * Improve `wait_for_event` loop logic and add `assert_event_invariants` helper (#178) - Refactored the `wait_for_event` function for clarity and to ensure proper deadline handling within the loop. - Introduced `assert_event_invariants` to validate per-request event properties, enforcing invariants like correct `requestId`, no duplicate terminal events, and proper timing between `Propagated` and `Sent`. - Added tests for `assert_event_invariants` enforcement in `S14` and `S15` lightpush scenarios. Co-authored-by: Egor Rachkovskii * Add S07 and S10 send API tests with event invariants helper (#176) * Add `assert_event_invariants` to enforce per-request event constraints and integrate into relevant tests * Integrate `assert_event_invariants` into edge and store tests * Remove redundant comments from `test_send_e2e.py` --------- Co-authored-by: Egor Rachkovskii * Fix some tests * Add S02/S12 send API tests and PR CI pipeline (#174) * Add tests for auto-subscribe on first send and isolated sender with no peers * Add PR CI workflow with tiered test strategy - pr_tests.yml: build job with cache, wrapper-tests, smoke-tests, and label-triggered full-suite - test_common.yml: add deploy_allure/send_discord inputs so PR runs skip reporting side effects - Add docker_required marker to S19 (needs Docker, excluded from wrapper-only CI job) - Register docker_required marker in pytest.ini * Document PR CI test workflows in README * Refine PR CI test strategy: - Exclude `docker_required` tests from smoke set in `pr_tests.yml`. - Add `wait_for_connected` helper for connection state checks. - Update S19 test to dynamically create and clean up the store node setup. - General simplifications and improved test stability. * Add `wait_for_connected` assertion to ensure sender connection state before propagation test * Refine tests and CI workflows: - Replace `ERROR_TIMEOUT_S` with `ERROR_AFTER_CACHE_EXPIRY_TIMEOUT_S` in `test_send_e2e.py`. - Adjust timeout assertion for better clarity and accuracy. - Update `pr_tests.yml` to add retries (`--reruns`) and ignore wrapper tests in smoke tests. - Change `test_common.yml` default Discord reporting to `false`. * Normalize `portsshift` to `portsShift` in `test_send_e2e.py` configuration definitions. --------- Co-authored-by: Egor Rachkovskii * Add relay-to-lightpush fallback integration tests (S08/S09) (#180) Co-authored-by: Egor Rachkovskii * Ignore S19 * fix s26 * Ignore s20 / s31 for errors * Change image name * fix xfail syntax error * rename test file * FIx flaky tests * comment the skipped tests * Fix review comments * revert tag in yml in latest * commenting lightpush * Modify the PR * Fix the ports conflict * Modify S20 * fix portsshift option * remove the /true from yml to allow errors to exist * Modify the yml to continue on error * First set of review comments * adding xfail mark for failed tests * address review comments about xfail * cleanup unused lines * event collector fix * Address review comment about delay constant * fix the timeout review comment * Add assert_event_invariants * enhance comment on S26 test * mark the waku tests as docker_required * Add S01 * add S01 second scenario * Add S03 * Add S04 * Adding S11 * modify s11 scenario to pass * Adding test S05 * Adding the new tests in part3 file * Fix the yml file error * Add the new test file to the PR job * bump logos-delivery-python-bindings to include destroy_keep_ctx * modify the S01 test * mark S01 with xfail * mark the second S01 test as xfail too * use skip instead of xfail * comment the skip line to try S01 again * restore the xfail mark again * remove the wrapped text code from test file * Changing the test files names * skip S01 again * removed extra comments * Update logos-delivery-python-bindings submodule --------- Co-authored-by: Egor Rachkovskii <32649334+at0m1x19@users.noreply.github.com> Co-authored-by: Egor Rachkovskii --- .github/workflows/pr_tests.yml | 32 +- .github/workflows/test_common.yml | 2 +- src/node/waku_node.py | 46 -- src/node/wrappers_manager.py | 4 + .../helpers/send_invalid_handle.py | 148 ++++ .../test_send_errors_and_concurrency.py | 445 +++++++++++ .../test_send_handle_and_subscription.py | 394 ++++++++++ ...rt2.py => test_send_lightpush_and_edge.py} | 487 ++++++------ ...art1.py => test_send_relay_propagation.py} | 710 +++++------------- vendor/logos-delivery-python-bindings | 2 +- 10 files changed, 1437 insertions(+), 833 deletions(-) create mode 100644 tests/wrappers_tests/helpers/send_invalid_handle.py create mode 100644 tests/wrappers_tests/test_send_errors_and_concurrency.py create mode 100644 tests/wrappers_tests/test_send_handle_and_subscription.py rename tests/wrappers_tests/{test_send_e2e_part2.py => test_send_lightpush_and_edge.py} (66%) rename tests/wrappers_tests/{test_send_e2e_part1.py => test_send_relay_propagation.py} (52%) diff --git a/.github/workflows/pr_tests.yml b/.github/workflows/pr_tests.yml index 1ac65b970..d9948c8e3 100644 --- a/.github/workflows/pr_tests.yml +++ b/.github/workflows/pr_tests.yml @@ -159,25 +159,45 @@ jobs: --reruns 2 \ --junit-xml=wrapper-results-basic.xml - - name: Run wrapper tests - send e2e part 1 + - name: Run wrapper tests - send handle and subscription continue-on-error: true env: PYTHONPATH: ${{ github.workspace }}/vendor/logos-delivery-python-bindings/waku run: | - pytest tests/wrappers_tests/test_send_e2e_part1.py \ + pytest tests/wrappers_tests/test_send_handle_and_subscription.py \ -m "not docker_required" \ --reruns 2 \ - --junit-xml=wrapper-results-send-part1.xml + --junit-xml=wrapper-results-send-handle-and-subscription.xml - - name: Run wrapper tests - send e2e part 2 + - name: Run wrapper tests - send relay propagation continue-on-error: true env: PYTHONPATH: ${{ github.workspace }}/vendor/logos-delivery-python-bindings/waku run: | - pytest tests/wrappers_tests/test_send_e2e_part2.py \ + pytest tests/wrappers_tests/test_send_relay_propagation.py \ -m "not docker_required" \ --reruns 2 \ - --junit-xml=wrapper-results-send-part2.xml + --junit-xml=wrapper-results-send-relay-propagation.xml + + - name: Run wrapper tests - send lightpush and edge + continue-on-error: true + env: + PYTHONPATH: ${{ github.workspace }}/vendor/logos-delivery-python-bindings/waku + run: | + pytest tests/wrappers_tests/test_send_lightpush_and_edge.py \ + -m "not docker_required" \ + --reruns 2 \ + --junit-xml=wrapper-results-send-lightpush-and-edge.xml + + - name: Run wrapper tests - send errors and concurrency + continue-on-error: true + env: + PYTHONPATH: ${{ github.workspace }}/vendor/logos-delivery-python-bindings/waku + run: | + pytest tests/wrappers_tests/test_send_errors_and_concurrency.py \ + -m "not docker_required" \ + --reruns 2 \ + --junit-xml=wrapper-results-send-errors-and-concurrency.xml - name: Test Report if: always() diff --git a/.github/workflows/test_common.yml b/.github/workflows/test_common.yml index 079be507c..695b3e019 100644 --- a/.github/workflows/test_common.yml +++ b/.github/workflows/test_common.yml @@ -52,7 +52,7 @@ jobs: strategy: fail-fast: false matrix: - shard: [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17] + shard: [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17] # total number of shards =18 means tests will split into 18 thread and run in parallel to increase execution speed # command for sharding : # pytest --shard-id= --num-shards= diff --git a/src/node/waku_node.py b/src/node/waku_node.py index 7afe881c6..d42391693 100644 --- a/src/node/waku_node.py +++ b/src/node/waku_node.py @@ -229,52 +229,6 @@ class WakuNode: logger.error(f"REST service did not become ready in time: {ex}") raise - def _start_wrapper(self, default_args, wait_for_node_sec): - logger.debug("Starting node using wrappers") - wrapper_config = self._default_args_to_wrapper_config(default_args) - - result = WrapperManager.create_and_start(config=wrapper_config, timeout_s=wait_for_node_sec) - if result.is_err(): - raise RuntimeError(f"Failed to start wrapper node: {result.err()}") - self._wrapper_node = result.ok_value - - logger.debug(f"Started wrapper node. REST: {self._rest_port}") - DS.waku_nodes.append(self) - delay(1) - try: - self.ensure_ready(timeout_duration=wait_for_node_sec) - except Exception as ex: - logger.error(f"REST service did not become ready in time: {ex}") - raise - - def _default_args_to_wrapper_config(self, default_args): - def _bool(key, default="false"): - return default_args.get(key, default).lower() == "true" - - bootstrap = default_args.get("discv5-bootstrap-node") - - return { - "logLevel": default_args.get("log-level", "DEBUG"), - "mode": "Core", - "networkingConfig": { - "listenIpv4": default_args.get("listen-address", "0.0.0.0"), - "p2pTcpPort": int(default_args["tcp-port"]), - "discv5UdpPort": int(default_args["discv5-udp-port"]), - "restPort": int(default_args["rest-port"]), - "restAddress": default_args.get("rest-address", "0.0.0.0"), - }, - "protocolsConfig": { - "clusterId": int(default_args.get("cluster-id", DEFAULT_CLUSTER_ID)), - "relay": _bool("relay"), - "store": _bool("store"), - "filter": _bool("filter"), - "lightpush": _bool("lightpush"), - "peerExchange": _bool("peer-exchange"), - "discv5Discovery": _bool("discv5-discovery", "true"), - "discv5BootstrapNodes": [bootstrap] if bootstrap else [], - }, - } - @retry(stop=stop_after_delay(250), wait=wait_fixed(0.1), reraise=True) def register_rln(self, **kwargs): logger.debug("Registering RLN credentials...") diff --git a/src/node/wrappers_manager.py b/src/node/wrappers_manager.py index a0d20e0d6..2a0e84982 100644 --- a/src/node/wrappers_manager.py +++ b/src/node/wrappers_manager.py @@ -62,6 +62,10 @@ class WrapperManager: def stop_and_destroy(self, *, timeout_s: float = 20.0) -> Result[int, str]: return self._node.stop_and_destroy(timeout_s=timeout_s) + def destroy_keep_ctx(self, *, timeout_s: float = 20.0) -> Result[int, str]: + """Pass-through for NodeWrapper.destroy_keep_ctx — see that method.""" + return self._node.destroy_keep_ctx(timeout_s=timeout_s) + def subscribe_content_topic(self, content_topic: str, *, timeout_s: float = 20.0) -> Result[int, str]: return self._node.subscribe_content_topic(content_topic, timeout_s=timeout_s) diff --git a/tests/wrappers_tests/helpers/send_invalid_handle.py b/tests/wrappers_tests/helpers/send_invalid_handle.py new file mode 100644 index 000000000..bf6280df3 --- /dev/null +++ b/tests/wrappers_tests/helpers/send_invalid_handle.py @@ -0,0 +1,148 @@ +"""S01 helpers: invoke send() against an invalid handle in an isolated process. + +Run via: + python -m tests.wrappers_tests.helpers.send_on_invalid_handle nil + python -m tests.wrappers_tests.helpers.send_on_invalid_handle destroyed + +Prints a single line to stdout starting with , followed by a JSON +payload describing the outcome. Runs in its own process so that a missing +C-ABI guard (which can SIGSEGV) fails the parent test cleanly instead of +taking the pytest runner down. + +Cases: +- nil: send() on a wrapper built with ctx=ffi.NULL. +- destroyed: send() after destroy_keep_ctx() — self.ctx stays non-nil so the + call reaches the C side with the original (now-stale) pointer. +""" + +import json +import sys +from pathlib import Path + + +def _ensure_bindings_on_path() -> None: + # The wrapper module lives outside the project tree, under vendor/. + # Helper file is at /tests/wrappers_tests/helpers/.py. + project_root = Path(__file__).resolve().parents[3] + bindings_path = project_root / "vendor" / "logos-delivery-python-bindings" / "waku" + if str(bindings_path) not in sys.path: + sys.path.insert(0, str(bindings_path)) + if str(project_root) not in sys.path: + sys.path.insert(0, str(project_root)) + + +def _emit(marker: str, payload: dict) -> None: + print(marker + json.dumps(payload)) + + +def _run_nil_handle(marker: str) -> None: + from wrapper import NodeWrapper, ffi # type: ignore[import-not-found] + from src.node.wrappers_manager import WrapperManager + from src.node.wrapper_helpers import create_message_bindings + + sender = WrapperManager(NodeWrapper(ctx=ffi.NULL, config_buffer=None, event_cb_handler=None)) + send_result = sender.send_message(message=create_message_bindings()) + + _emit( + marker, + { + "is_ok": send_result.is_ok(), + "ok": send_result.ok_value if send_result.is_ok() else None, + "err": send_result.err() if send_result.is_err() else None, + }, + ) + + +def _run_destroyed_handle(marker: str) -> None: + from src.node.wrappers_manager import WrapperManager + from src.node.wrapper_helpers import EventCollector, create_message_bindings + from tests.wrappers_tests.conftest import build_node_config + + collector = EventCollector() + + create_result = WrapperManager.create_and_start( + config=build_node_config(), + event_cb=collector.event_callback, + ) + if create_result.is_err(): + _emit( + marker, + { + "stage": "create_and_start", + "is_ok": False, + "ok": None, + "err": create_result.err(), + "events_after_send": [], + }, + ) + return + + sender = create_result.ok_value + + stop_result = sender.stop_node() + if stop_result.is_err(): + _emit( + marker, + { + "stage": "stop_node", + "is_ok": False, + "ok": None, + "err": stop_result.err(), + "events_after_send": [], + }, + ) + return + + destroy_result = sender.destroy_keep_ctx() + if destroy_result.is_err(): + _emit( + marker, + { + "stage": "destroy_keep_ctx", + "is_ok": False, + "ok": None, + "err": destroy_result.err(), + "events_after_send": [], + }, + ) + return + + events_before_send = len(collector.events) + + send_result = sender.send_message(message=create_message_bindings()) + + new_events = collector.events[events_before_send:] + + _emit( + marker, + { + "stage": "send_message", + "is_ok": send_result.is_ok(), + "ok": send_result.ok_value if send_result.is_ok() else None, + "err": send_result.err() if send_result.is_err() else None, + "events_after_send": [str(e) for e in new_events], + }, + ) + + +CASES = { + "nil": _run_nil_handle, + "destroyed": _run_destroyed_handle, +} + + +def main() -> int: + if len(sys.argv) != 3 or sys.argv[1] not in CASES: + cases = "|".join(CASES) + print(f"usage: send_on_invalid_handle <{cases}> ", file=sys.stderr) + return 2 + + case, marker = sys.argv[1], sys.argv[2] + + _ensure_bindings_on_path() + CASES[case](marker) + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/tests/wrappers_tests/test_send_errors_and_concurrency.py b/tests/wrappers_tests/test_send_errors_and_concurrency.py new file mode 100644 index 000000000..eaa895a60 --- /dev/null +++ b/tests/wrappers_tests/test_send_errors_and_concurrency.py @@ -0,0 +1,445 @@ +from concurrent.futures import ThreadPoolExecutor + +import pytest +from src.env_vars import NODE_2 +from src.steps.common import StepsCommon +from src.steps.store import StepsStore +from src.libs.common import delay, to_base64 +from src.libs.custom_logger import get_custom_logger +from src.node.waku_node import WakuNode +from src.node.wrappers_manager import WrapperManager +from src.node.wrapper_helpers import ( + EventCollector, + assert_event_invariants, + create_message_bindings, + get_node_multiaddr, + wait_for_propagated, + wait_for_sent, + wait_for_error, +) + +logger = get_custom_logger(__name__) + +PROPAGATED_TIMEOUT_S = 30.0 +RECOVERY_TIMEOUT_S = 45.0 + +# MaxTimeInCache from send_service.nim. +MAX_TIME_IN_CACHE_S = 60.0 +# Extra slack to cover the background retry loop tick after the window expires. +CACHE_EXPIRY_SLACK_S = 10.0 +ERROR_AFTER_CACHE_EXPIRY_TIMEOUT_S = MAX_TIME_IN_CACHE_S + CACHE_EXPIRY_SLACK_S +RETRY_WINDOW_EXPIRED_MSG = "Unable to send within retry time window" + +# S30: concurrent sends on the same content topic during initial auto-subscribe. +S30_CONCURRENT_SENDS = 5 +S30_CONTENT_TOPIC = "/test/1/s30-concurrent/proto" + +# S31: concurrent sends across mixed topics during peer churn. +S31_BURST_SIZE = 8 +S31_CONTENT_TOPICS = [ + "/test/1/s31-topic-a/proto", + "/test/1/s31-topic-b/proto", + "/test/1/s31-topic-c/proto", + "/test/1/s31-topic-d/proto", + "/test/1/s31-topic-e/proto", + "/test/1/s31-topic-f/proto", + "/test/1/s31-topic-g/proto", + "/test/1/s31-topic-h/proto", +] + + +class TestS12IsolatedSenderNoPeers(StepsCommon): + """ + S12 — Isolated sender, no peers. + Sender has relay enabled but zero relay peers and zero lightpush peers. + Expected: send() returns Ok(RequestId), but eventually a message_error + event arrives (no route to propagate). + """ + + def test_s12_send_with_no_peers_produces_error(self, node_config): + sender_collector = EventCollector() + + node_config.update( + { + "relay": True, + "store": False, + "lightpush": False, + "filter": False, + "discv5Discovery": False, + "numShardsInNetwork": 1, + } + ) + + sender_result = WrapperManager.create_and_start( + config=node_config, + event_cb=sender_collector.event_callback, + ) + assert sender_result.is_ok(), f"Failed to start sender: {sender_result.err()}" + + with sender_result.ok_value as sender: + message = create_message_bindings( + payload=to_base64("S12 isolated sender payload"), + contentTopic="/test/1/s12-isolated/proto", + ) + + send_result = sender.send_message(message=message) + assert send_result.is_ok(), f"send() must return Ok(RequestId) even with no peers, got: {send_result.err()}" + + request_id = send_result.ok_value + assert request_id, "send() returned an empty RequestId" + + error = wait_for_error( + collector=sender_collector, + request_id=request_id, + timeout_s=ERROR_AFTER_CACHE_EXPIRY_TIMEOUT_S, + ) + assert error is not None, ( + f"No message_error event within {ERROR_AFTER_CACHE_EXPIRY_TIMEOUT_S}s " + f"(MaxTimeInCache={MAX_TIME_IN_CACHE_S}s + slack) for isolated sender. " + f"Collected events: {sender_collector.events}" + ) + assert error["requestId"] == request_id + + propagated = wait_for_propagated(sender_collector, request_id, timeout_s=0) + assert propagated is None, f"Unexpected message_propagated event for isolated sender: {propagated}" + + +class TestS21ErrorWhenRetryWindowExpires(StepsCommon): + """ + S21: delivery retry window expires before any valid path recovers. + """ + + def test_s21_error_when_retry_window_expires(self, node_config): + sender_collector = EventCollector() + + node_config.update( + { + "relay": True, + "store": False, + "lightpush": False, + "filter": False, + "discv5Discovery": False, + "numShardsInNetwork": 1, + } + ) + + sender_result = WrapperManager.create_and_start( + config=node_config, + event_cb=sender_collector.event_callback, + ) + assert sender_result.is_ok(), f"Failed to start sender: {sender_result.err()}" + + with sender_result.ok_value as sender_node: + message = create_message_bindings() + send_result = sender_node.send_message(message=message) + assert send_result.is_ok(), f"send() must return Ok(RequestId) even with no peers, got: {send_result.err()}" + + request_id = send_result.ok_value + assert request_id, "send() returned an empty RequestId" + + # No peer + error_event = wait_for_error( + collector=sender_collector, + request_id=request_id, + timeout_s=ERROR_AFTER_CACHE_EXPIRY_TIMEOUT_S, + ) + assert error_event is not None, ( + f"No MessageErrorEvent received within {ERROR_AFTER_CACHE_EXPIRY_TIMEOUT_S}s " + f"(MaxTimeInCache={MAX_TIME_IN_CACHE_S}s + slack). " + f"Collected events: {sender_collector.events}" + ) + logger.info(f"S21 received error event: {error_event}") + + assert error_event.get("error") == RETRY_WINDOW_EXPIRED_MSG, ( + f"Unexpected error message in message_error event.\n" + f"Expected: {RETRY_WINDOW_EXPIRED_MSG!r}\n" + f"Got: {error_event.get('error')!r}\n" + f"Full event: {error_event}" + ) + + assert_event_invariants(sender_collector, request_id) + + +class TestS30ConcurrentSendsDuringAutoSubscribe(StepsCommon): + """ + S30: concurrent sends on the same content topic during initial auto-subscribe. + - Sender starts unsubscribed to the target topic. + - Several send() calls are issued at nearly the same time. + - Each call must return Ok(RequestId) with a unique id. + - Each request id must get its own propagated event, + with no dropped or cross-associated events. + """ + + def test_s30_concurrent_sends_during_auto_subscribe(self, node_config): + sender_collector = EventCollector() + + node_config.update( + { + "relay": True, + "store": False, + "discv5Discovery": False, + "numShardsInNetwork": 1, + } + ) + + sender_result = WrapperManager.create_and_start( + config=node_config, + event_cb=sender_collector.event_callback, + ) + assert sender_result.is_ok(), f"Failed to start sender: {sender_result.err()}" + + with sender_result.ok_value as sender_node: + # Relay peer so the sender has a propagation path. + relay_config = { + **node_config, + "staticnodes": [get_node_multiaddr(sender_node)], + "portsShift": 1, + } + + relay_result = WrapperManager.create_and_start(config=relay_config) + assert relay_result.is_ok(), f"Failed to start relay peer: {relay_result.err()}" + + with relay_result.ok_value: + # Build one message per send, with distinct payloads so we can + # detect any cross-association between request ids and events. + messages = [ + create_message_bindings( + contentTopic=S30_CONTENT_TOPIC, + payload=to_base64(f"s30-concurrent-{i}"), + ) + for i in range(S30_CONCURRENT_SENDS) + ] + + # Fire all sends concurrently. The sender is not yet subscribed + # to S30_CONTENT_TOPIC, so this exercises the auto-subscribe path + # under contention. + with ThreadPoolExecutor(max_workers=S30_CONCURRENT_SENDS) as pool: + send_results = list(pool.map(sender_node.send_message, messages)) + + # Every send must return Ok(RequestId). + request_ids = [] + for i, send_result in enumerate(send_results): + assert send_result.is_ok(), f"Concurrent send #{i} failed: {send_result.err()}" + request_id = send_result.ok_value + assert request_id, f"Concurrent send #{i} returned an empty RequestId" + request_ids.append(request_id) + + # Request ids must be unique across concurrent sends. + assert len(set(request_ids)) == len(request_ids), f"Duplicate RequestIds returned by concurrent sends: {request_ids}" + + # Each request id must get its own propagated event and no error. + for request_id in request_ids: + propagated_event = wait_for_propagated( + collector=sender_collector, + request_id=request_id, + timeout_s=PROPAGATED_TIMEOUT_S, + ) + assert propagated_event is not None, ( + f"No MessagePropagatedEvent for request_id={request_id} " + f"within {PROPAGATED_TIMEOUT_S}s. " + f"Collected events: {sender_collector.events}" + ) + + error_event = wait_for_error( + collector=sender_collector, + request_id=request_id, + timeout_s=0, + ) + assert error_event is None, f"Unexpected message_error for request_id={request_id}: {error_event}" + + # Cross-association guard: every event with a requestId must + # belong to exactly one of the request ids we issued. + issued = set(request_ids) + for event in sender_collector.snapshot(): + event_request_id = event.get("requestId") + if event_request_id is None: + continue + assert event_request_id in issued, ( + f"Event carries an unknown requestId={event_request_id!r}, " f"not in issued set {issued}. Event: {event}" + ) + + # Per-request invariants apply to every concurrent send + # (correct requestId, no duplicate terminal events, + # Sent never before Propagated). + for request_id in request_ids: + assert_event_invariants(sender_collector, request_id) + + +class TestS31ConcurrentSendsMixedTopicsDuringChurn(StepsStore): + """ + S31: concurrent sends across mixed content topics during peer churn. + """ + + @pytest.mark.docker_required + def test_s31_concurrent_sends_mixed_topics_during_churn(self, node_config): + sender_collector = EventCollector() + + relay_peer = WakuNode(NODE_2, f"s31_relay_peer_{self.test_id}") + relay_peer.start(relay="true", discv5_discovery="false") + relay_peer.set_relay_subscriptions([self.test_pubsub_topic]) + + lightpush_peer = WakuNode(NODE_2, f"s31_lightpush_peer_{self.test_id}") + lightpush_peer.start(relay="true", lightpush="true", discv5_discovery="false") + lightpush_peer.set_relay_subscriptions([self.test_pubsub_topic]) + + store_peer = WakuNode(NODE_2, f"s31_store_peer_{self.test_id}") + store_peer.start(relay="true", store="true", discv5_discovery="false") + store_peer.set_relay_subscriptions([self.test_pubsub_topic]) + + churn_peers = [relay_peer, lightpush_peer, store_peer] + + # Mesh docker peers so a lightpushed message can fan out to the store peer. + peer_multiaddrs = [p.get_multiaddr_with_id() for p in churn_peers] + for peer in churn_peers: + others = [a for a in peer_multiaddrs if a != peer.get_multiaddr_with_id()] + peer.add_peers(others) + + node_config.update( + { + "mode": "Edge", + "relay": True, + "lightpush": True, + "store": False, + "discv5Discovery": False, + "numShardsInNetwork": 1, + "lightpushnode": lightpush_peer.get_multiaddr_with_id(), + } + ) + + sender_result = WrapperManager.create_and_start( + config=node_config, + event_cb=sender_collector.event_callback, + ) + assert sender_result.is_ok(), f"Failed to start sender: {sender_result.err()}" + + with sender_result.ok_value as sender_node: + sender_multiaddr = get_node_multiaddr(sender_node) + for peer in churn_peers: + peer.add_peers([sender_multiaddr]) + delay(3) # let docker peers connect to the sender + + all_request_ids: list[str] = [] + phase1_ids = self._s31_fire_burst(sender_node, phase_label="phase1") + all_request_ids.extend(phase1_ids) + + for peer in churn_peers: + peer.restart() + delay(1) # small window so the restart is actually in-flight + phase2_ids = self._s31_fire_burst(sender_node, phase_label="phase2") + all_request_ids.extend(phase2_ids) + + # Wait for all peers to be ready again and re-attach the sender. + for peer in churn_peers: + peer.ensure_ready(timeout_duration=20) + peer.add_peers([sender_multiaddr]) + + peer_multiaddrs = [p.get_multiaddr_with_id() for p in churn_peers] + for peer in churn_peers: + others = [a for a in peer_multiaddrs if a != peer.get_multiaddr_with_id()] + peer.add_peers(others) + delay(3) + + phase3_ids = self._s31_fire_burst(sender_node, phase_label="phase3") + all_request_ids.extend(phase3_ids) + + assert len(set(all_request_ids)) == len(all_request_ids), f"Duplicate RequestIds across bursts: {all_request_ids}" + + # Phase 1 ran before any churn, so the mesh was stable — standard timeout. + # Phase 3 ran right after restart + re-attach, so the mesh needed to + # re-stabilize — use the recovery timeout to avoid CI flakiness. + phase_timeouts = [ + (phase1_ids, PROPAGATED_TIMEOUT_S), + (phase3_ids, RECOVERY_TIMEOUT_S), + ] + for request_ids, timeout_s in phase_timeouts: + for request_id in request_ids: + propagated_event = wait_for_propagated( + collector=sender_collector, + request_id=request_id, + timeout_s=timeout_s, + ) + assert propagated_event is not None, ( + f"No MessagePropagatedEvent for stable-phase " + f"request_id={request_id} within {timeout_s}s. " + f"Collected events: {sender_collector.events}" + ) + + error_event = wait_for_error( + collector=sender_collector, + request_id=request_id, + timeout_s=0, + ) + assert error_event is None, f"Unexpected message_error event for stable-phase " f"request_id={request_id}: {error_event}" + + for request_id in phase2_ids: + error_event = wait_for_error( + collector=sender_collector, + request_id=request_id, + timeout_s=0, + ) + assert error_event is None, f"Unexpected terminal message_error for phase-2 " f"request_id={request_id} after recovery: {error_event}" + + issued = set(all_request_ids) + for event in sender_collector.snapshot(): + event_request_id = event.get("requestId") + if event_request_id is None: + continue + assert event_request_id in issued, ( + f"Event carries an unknown requestId={event_request_id!r}, " f"not in issued set {issued}. Event: {event}" + ) + + # Use the hash the wrapper emitted on message_sent so the store + # lookup matches the exact bytes that were actually published. + phase3_hashes = [] + for request_id in phase3_ids: + sent_event = wait_for_sent( + collector=sender_collector, + request_id=request_id, + timeout_s=RECOVERY_TIMEOUT_S, + ) + assert sent_event is not None, ( + f"No message_sent event for phase-3 request_id={request_id} " + f"within {RECOVERY_TIMEOUT_S}s. Collected events: {sender_collector.events}" + ) + msg_hash = sent_event.get("messageHash") + assert msg_hash, f"message_sent event missing messageHash: {sent_event}" + phase3_hashes.append(msg_hash) + + # 3 phases × S31_BURST_SIZE messages, so the page must fit them all, + # otherwise phase-3 hashes (which sort last in ascending order) get cut off. + self.check_sent_message_is_stored( + expected_hashes=phase3_hashes, + store_node=store_peer, + pubsub_topic=self.test_pubsub_topic, + page_size=S31_BURST_SIZE * 3, + ascending="true", + ) + + # Per-request invariants apply across all phases, including the + # retry-path bursts (phase 2). If retries ever emit duplicate + # Propagated events or reorder Sent before Propagated, this catches it. + for request_id in all_request_ids: + assert_event_invariants(sender_collector, request_id) + + def _s31_fire_burst(self, sender_node, *, phase_label: str) -> list[str]: + """Fire S31_BURST_SIZE concurrent sends, one per topic in S31_CONTENT_TOPICS. + Returns the list of RequestIds. Asserts every send returned Ok.""" + messages = [ + self.create_message( + contentTopic=S31_CONTENT_TOPICS[i], + payload=to_base64(f"s31-{phase_label}-{i}"), + ) + for i in range(S31_BURST_SIZE) + ] + + with ThreadPoolExecutor(max_workers=S31_BURST_SIZE) as pool: + send_results = list(pool.map(sender_node.send_message, messages)) + + request_ids = [] + for i, send_result in enumerate(send_results): + assert send_result.is_ok(), f"{phase_label}: concurrent send #{i} failed: {send_result.err()}" + request_id = send_result.ok_value + assert request_id, f"{phase_label}: concurrent send #{i} returned an empty RequestId" + request_ids.append(request_id) + + return request_ids diff --git a/tests/wrappers_tests/test_send_handle_and_subscription.py b/tests/wrappers_tests/test_send_handle_and_subscription.py new file mode 100644 index 000000000..ee086d9cf --- /dev/null +++ b/tests/wrappers_tests/test_send_handle_and_subscription.py @@ -0,0 +1,394 @@ +import json +import subprocess +import sys +import pytest +from src.steps.common import StepsCommon +from src.libs.common import to_base64 +from src.libs.custom_logger import get_custom_logger +from src.node.wrappers_manager import WrapperManager +from src.node.wrapper_helpers import ( + EventCollector, + assert_event_invariants, + create_message_bindings, + get_node_multiaddr, + wait_for_connected, + wait_for_propagated, + wait_for_sent, + wait_for_error, +) + +logger = get_custom_logger(__name__) + +PROPAGATED_TIMEOUT_S = 30.0 + +S01_EXPECTED_ERROR_FRAGMENT = "not initialized" +# Destroyed-handle path fails synchronously in the C layer (no callback), +# so the binding surfaces a different string than the nil-handle path. +S01_DESTROYED_HANDLE_ERROR_FRAGMENT = "immediate call failed" +S01_SUBPROCESS_TIMEOUT_S = 30 +S01_RESULT_MARKER = "__S01_RESULT__" +SEND_AFTER_DESTROY_RESULT_MARKER = "__SEND_AFTER_DESTROY_RESULT__" +SEND_AFTER_DESTROY_SUBPROCESS_TIMEOUT_S = 60 + +S01_INVALID_HANDLE_HELPER = "tests.wrappers_tests.helpers.send_invalid_handle" + +# S05: malformed content topics +S05_EXPECTED_ERROR_FRAGMENT = "Failed to auto-subscribe" +S05_MALFORMED_CONTENT_TOPICS = [ + # No leading slash — parser rejects with "must start with slash". + ("s05-invalid-no-leading-slash", "no-leading-slash"), + # Empty string — parser rejects empty content topic. + ("", "empty"), + # Only 3 segments — content topics need /app/version/name/encoding. + ("/app/1/name", "missing-encoding-segment"), + # Empty middle segment between slashes. + ("/app//name/proto", "empty-middle-segment"), +] + + +class TestS01NilOrUninitializedHandle(StepsCommon): + """S01 — send() on a nil/destroyed handle must Err, no events, no crash.""" + + @pytest.mark.skip(reason="see https://github.com/logos-messaging/logos-delivery/issues/3873") + def test_s01_send_on_uninitialized_handle(self): + completed = subprocess.run( + [sys.executable, "-m", S01_INVALID_HANDLE_HELPER, "nil", S01_RESULT_MARKER], + capture_output=True, + text=True, + timeout=S01_SUBPROCESS_TIMEOUT_S, + ) + + assert completed.returncode == 0, ( + f"send() crashed on a nil handle (returncode={completed.returncode}). " f"stdout={completed.stdout!r} stderr={completed.stderr!r}" + ) + + result_line = next( + (l for l in completed.stdout.splitlines() if l.startswith(S01_RESULT_MARKER)), + None, + ) + assert result_line, f"missing result marker. stdout={completed.stdout!r} stderr={completed.stderr!r}" + + result = json.loads(result_line[len(S01_RESULT_MARKER) :]) + + assert result["is_ok"] is False, f"expected Err, got Ok({result['ok']!r})" + assert S01_EXPECTED_ERROR_FRAGMENT in ( + result["err"] or "" + ), f"expected error to mention {S01_EXPECTED_ERROR_FRAGMENT!r}, got: {result['err']!r}" + + @pytest.mark.skip(reason="see https://github.com/logos-messaging/logos-delivery/issues/3863") + def test_s01_send_on_destroyed_handle(self): + completed = subprocess.run( + [sys.executable, "-m", S01_INVALID_HANDLE_HELPER, "destroyed", SEND_AFTER_DESTROY_RESULT_MARKER], + capture_output=True, + text=True, + timeout=SEND_AFTER_DESTROY_SUBPROCESS_TIMEOUT_S, + ) + + assert completed.returncode == 0, ( + f"send() crashed on a destroyed handle (returncode={completed.returncode}). " f"stdout={completed.stdout!r} stderr={completed.stderr!r}" + ) + + result_line = next( + (l for l in completed.stdout.splitlines() if l.startswith(SEND_AFTER_DESTROY_RESULT_MARKER)), + None, + ) + assert result_line, f"missing result marker. stdout={completed.stdout!r} stderr={completed.stderr!r}" + + result = json.loads(result_line[len(SEND_AFTER_DESTROY_RESULT_MARKER) :]) + + assert result["stage"] == "send_message", f"setup failed at stage {result['stage']!r}: {result['err']!r}" + assert result["is_ok"] is False, f"expected Err, got Ok({result['ok']!r})" + err_msg = result["err"] or "" + assert S01_EXPECTED_ERROR_FRAGMENT in err_msg or S01_DESTROYED_HANDLE_ERROR_FRAGMENT in err_msg, ( + f"expected error to mention {S01_EXPECTED_ERROR_FRAGMENT!r} " f"or {S01_DESTROYED_HANDLE_ERROR_FRAGMENT!r}, got: {result['err']!r}" + ) + assert result["events_after_send"] == [], f"expected no events after send(), got: {result['events_after_send']}" + + +class TestS02AutoSubscribeOnFirstSend(StepsCommon): + """ + S02 — Auto-subscribe on first send. + Sender never calls subscribe_content_topic() before send(). + The send API must auto-subscribe to the content topic used in the message. + Expected: send() returns Ok(RequestId), message_propagated arrives. + """ + + def test_s02_send_without_explicit_subscribe(self, node_config): + sender_collector = EventCollector() + + node_config.update( + { + "relay": True, + "store": False, + "lightpush": False, + "filter": False, + "discv5Discovery": False, + "numShardsInNetwork": 1, + } + ) + + sender_result = WrapperManager.create_and_start( + config=node_config, + event_cb=sender_collector.event_callback, + ) + assert sender_result.is_ok(), f"Failed to start sender: {sender_result.err()}" + + with sender_result.ok_value as sender: + peer_config = { + **node_config, + "staticnodes": [get_node_multiaddr(sender)], + "portsShift": 1, + } + + peer_result = WrapperManager.create_and_start(config=peer_config) + assert peer_result.is_ok(), f"Failed to start relay peer: {peer_result.err()}" + + with peer_result.ok_value: + assert wait_for_connected(sender_collector) is not None, "Sender did not reach Connected/PartiallyConnected state" + + message = create_message_bindings( + payload=to_base64("S02 auto-subscribe test payload"), + contentTopic="/test/1/s02-auto-subscribe/proto", + ) + + send_result = sender.send_message(message=message) + assert send_result.is_ok(), f"send() failed: {send_result.err()}" + + request_id = send_result.ok_value + assert request_id, "send() returned an empty RequestId" + + propagated = wait_for_propagated( + collector=sender_collector, + request_id=request_id, + timeout_s=PROPAGATED_TIMEOUT_S, + ) + assert propagated is not None, ( + f"No message_propagated event within {PROPAGATED_TIMEOUT_S}s. " f"Collected events: {sender_collector.events}" + ) + assert propagated["requestId"] == request_id + + error = wait_for_error(sender_collector, request_id, timeout_s=0) + assert error is None, f"Unexpected message_error event: {error}" + + +class TestS03SendOnAlreadySubscribedTopic(StepsCommon): + """ + S03 — Send on already-subscribed content topic. + Sender explicitly calls subscribe_content_topic() before send(). + The send path must behave identically to the auto-subscribe case: + Propagated arrives, no Sent (store disabled), no Error. + Topology mirrors S06 (relay-only sender + relay peer, no store). + Purpose: proves the send path is identical when auto-subscription is skipped. + """ + + def test_s03_send_on_already_subscribed_content_topic(self, node_config): + sender_collector = EventCollector() + content_topic = "/test/1/s03-already-subscribed/proto" + + node_config.update( + { + "relay": True, + "store": False, + "lightpush": False, + "filter": False, + "discv5Discovery": False, + "numShardsInNetwork": 1, + "reliabilityEnabled": True, + } + ) + + sender_result = WrapperManager.create_and_start( + config=node_config, + event_cb=sender_collector.event_callback, + ) + assert sender_result.is_ok(), f"Failed to start sender: {sender_result.err()}" + + with sender_result.ok_value as sender: + peer_config = { + **node_config, + "staticnodes": [get_node_multiaddr(sender)], + "portsShift": 1, + } + + peer_result = WrapperManager.create_and_start(config=peer_config) + assert peer_result.is_ok(), f"Failed to start relay peer: {peer_result.err()}" + + with peer_result.ok_value: + assert wait_for_connected(sender_collector) is not None, "Sender did not reach Connected/PartiallyConnected state" + + # Explicit subscribe before send — this is what S03 is about. + # The send path must still return Ok(RequestId) and emit the + # same events as the auto-subscribe topology in S06. + subscribe_result = sender.subscribe_content_topic(content_topic) + assert subscribe_result.is_ok(), f"subscribe_content_topic failed: {subscribe_result.err()}" + + message = create_message_bindings( + payload=to_base64("S03 already-subscribed test payload"), + contentTopic=content_topic, + ) + + send_result = sender.send_message(message=message) + assert send_result.is_ok(), f"send() failed: {send_result.err()}" + + request_id = send_result.ok_value + assert request_id, "send() returned an empty RequestId" + + propagated = wait_for_propagated( + collector=sender_collector, + request_id=request_id, + timeout_s=PROPAGATED_TIMEOUT_S, + ) + assert propagated is not None, ( + f"No message_propagated event within {PROPAGATED_TIMEOUT_S}s. " f"Collected events: {sender_collector.events}" + ) + assert propagated["requestId"] == request_id + + error = wait_for_error(sender_collector, request_id, timeout_s=0) + assert error is None, f"Unexpected message_error event: {error}" + + sent = wait_for_sent(sender_collector, request_id, timeout_s=0) + assert sent is None, f"Unexpected message_sent event (store is disabled): {sent}" + + assert_event_invariants(sender_collector, request_id) + + +class TestS04UnsubscribeThenSendSameTopic(StepsCommon): + """ + S04 — Unsubscribe, then send the same content topic again. + Sender subscribes to topic A, unsubscribes from A, then sends on A. + The send path must re-establish topic interest and deliver normally. + Topology mirrors S06 (relay-only sender + relay peer, no store). + Expected: send() returns Ok(RequestId), Propagated arrives, + no Sent (store disabled), no Error. + Purpose: verifies send() re-establishes topic interest after local unsubscribe. + """ + + def test_s04_unsubscribe_then_send_same_content_topic(self, node_config): + sender_collector = EventCollector() + content_topic = "/test/1/s04-unsubscribe-resend/proto" + + node_config.update( + { + "relay": True, + "store": False, + "lightpush": False, + "filter": False, + "discv5Discovery": False, + "numShardsInNetwork": 1, + "reliabilityEnabled": True, + } + ) + + sender_result = WrapperManager.create_and_start( + config=node_config, + event_cb=sender_collector.event_callback, + ) + assert sender_result.is_ok(), f"Failed to start sender: {sender_result.err()}" + + with sender_result.ok_value as sender: + peer_config = { + **node_config, + "staticnodes": [get_node_multiaddr(sender)], + "portsShift": 1, + } + + peer_result = WrapperManager.create_and_start(config=peer_config) + assert peer_result.is_ok(), f"Failed to start relay peer: {peer_result.err()}" + + with peer_result.ok_value: + assert wait_for_connected(sender_collector) is not None, "Sender did not reach Connected/PartiallyConnected state" + + # subscribe -> unsubscribe -> send: send() must re-establish + # topic interest internally. + subscribe_result = sender.subscribe_content_topic(content_topic) + assert subscribe_result.is_ok(), f"subscribe_content_topic failed: {subscribe_result.err()}" + + unsubscribe_result = sender.unsubscribe_content_topic(content_topic) + assert unsubscribe_result.is_ok(), f"unsubscribe_content_topic failed: {unsubscribe_result.err()}" + + message = create_message_bindings( + payload=to_base64("S04 unsubscribe-then-send test payload"), + contentTopic=content_topic, + ) + + send_result = sender.send_message(message=message) + assert send_result.is_ok(), f"send() failed after unsubscribe: {send_result.err()}" + + request_id = send_result.ok_value + assert request_id, "send() returned an empty RequestId" + + propagated = wait_for_propagated( + collector=sender_collector, + request_id=request_id, + timeout_s=PROPAGATED_TIMEOUT_S, + ) + assert propagated is not None, ( + f"No message_propagated event within {PROPAGATED_TIMEOUT_S}s " + f"after unsubscribe + send. Collected events: {sender_collector.events}" + ) + assert propagated["requestId"] == request_id + + error = wait_for_error(sender_collector, request_id, timeout_s=0) + assert error is None, f"Unexpected message_error event: {error}" + + sent = wait_for_sent(sender_collector, request_id, timeout_s=0) + assert sent is None, f"Unexpected message_sent event (store is disabled): {sent}" + + assert_event_invariants(sender_collector, request_id) + + +class TestS05AutoSubscribeFailureBeforeTaskCreation(StepsCommon): + """ + S05 — Auto-subscribe failure before task creation. + Sender is initialized but auto-subscription is forced to fail by using a + malformed content topic that breaks shard resolution inside + SubscriptionManager.subscribe(). + Expected: send() returns Err with an "auto-subscribe" message, no events. + Purpose: covers the last synchronous error path before request ID creation, + across several distinct validator branches. + """ + + @pytest.mark.parametrize( + "content_topic", + [topic for topic, _ in S05_MALFORMED_CONTENT_TOPICS], + ids=[case_id for _, case_id in S05_MALFORMED_CONTENT_TOPICS], + ) + def test_s05_send_fails_when_auto_subscribe_fails(self, node_config, content_topic): + sender_collector = EventCollector() + + node_config.update( + { + "relay": True, + "numShardsInNetwork": 1, + } + ) + + sender_result = WrapperManager.create_and_start( + config=node_config, + event_cb=sender_collector.event_callback, + ) + assert sender_result.is_ok(), f"Failed to start sender: {sender_result.err()}" + + with sender_result.ok_value as sender: + # Malformed content topic — SubscriptionManager.subscribe() cannot + # resolve it to a shard, so auto-subscribe inside send() fails. + message = create_message_bindings( + payload=to_base64("S05 auto-subscribe failure payload"), + contentTopic=content_topic, + ) + + send_result = sender.send_message(message=message) + + assert send_result.is_err(), ( + f"send() must return Err when auto-subscribe fails for " f"content_topic={content_topic!r}, got Ok({send_result.ok_value!r})" + ) + + error_message = send_result.err() or "" + assert S05_EXPECTED_ERROR_FRAGMENT in error_message, ( + f"expected error to mention {S05_EXPECTED_ERROR_FRAGMENT!r} " f"for content_topic={content_topic!r}, got: {error_message!r}" + ) + + # No request id was created, so no events should be emitted. + assert sender_collector.events == [] or all( + event.get("eventType") == "connection_status_change" for event in sender_collector.events + ), f"Unexpected events after a pre-task-creation failure: {sender_collector.events}" diff --git a/tests/wrappers_tests/test_send_e2e_part2.py b/tests/wrappers_tests/test_send_lightpush_and_edge.py similarity index 66% rename from tests/wrappers_tests/test_send_e2e_part2.py rename to tests/wrappers_tests/test_send_lightpush_and_edge.py index f94a89829..917ecd918 100644 --- a/tests/wrappers_tests/test_send_e2e_part2.py +++ b/tests/wrappers_tests/test_send_lightpush_and_edge.py @@ -1,253 +1,36 @@ import base64 import pytest +from src.env_vars import NODE_1 from src.steps.common import StepsCommon from src.libs.common import delay, to_base64 from src.libs.custom_logger import get_custom_logger +from src.node.waku_node import WakuNode from src.node.wrappers_manager import WrapperManager from src.node.wrapper_helpers import ( EventCollector, assert_event_invariants, create_message_bindings, get_node_multiaddr, - wait_for_connected, wait_for_propagated, wait_for_sent, wait_for_error, ) -from tests.wrappers_tests.conftest import build_node_config +from src.test_data import DEFAULT_CLUSTER_ID +from tests.wrappers_tests.conftest import build_node_config, free_port logger = get_custom_logger(__name__) PROPAGATED_TIMEOUT_S = 30.0 -SENT_TIMEOUT_S = 10.0 NO_SENT_OBSERVATION_S = 5.0 SENT_AFTER_STORE_TIMEOUT_S = 60.0 OVERSIZED_PAYLOAD_BYTES = 200 * 1024 RECOVERY_TIMEOUT_S = 45.0 SERVICE_DOWN_SETTLE_S = 3.0 -# MaxTimeInCache from send_service.nim. -MAX_TIME_IN_CACHE_S = 60.0 -# Extra slack to cover the background retry loop tick after the window expires. -CACHE_EXPIRY_SLACK_S = 10.0 -ERROR_AFTER_CACHE_EXPIRY_TIMEOUT_S = MAX_TIME_IN_CACHE_S + CACHE_EXPIRY_SLACK_S -RETRY_WINDOW_EXPIRED_MSG = "Unable to send within retry time window" - - -class TestS02AutoSubscribeOnFirstSend(StepsCommon): - """ - S02 — Auto-subscribe on first send. - Sender never calls subscribe_content_topic() before send(). - The send API must auto-subscribe to the content topic used in the message. - Expected: send() returns Ok(RequestId), message_propagated arrives. - """ - - def test_s02_send_without_explicit_subscribe(self, node_config): - sender_collector = EventCollector() - - node_config.update( - { - "relay": True, - "store": False, - "lightpush": False, - "filter": False, - "discv5Discovery": False, - "numShardsInNetwork": 1, - } - ) - - sender_result = WrapperManager.create_and_start( - config=node_config, - event_cb=sender_collector.event_callback, - ) - assert sender_result.is_ok(), f"Failed to start sender: {sender_result.err()}" - - with sender_result.ok_value as sender: - peer_config = { - **node_config, - "staticnodes": [get_node_multiaddr(sender)], - "portsShift": 1, - } - - peer_result = WrapperManager.create_and_start(config=peer_config) - assert peer_result.is_ok(), f"Failed to start relay peer: {peer_result.err()}" - - with peer_result.ok_value: - assert wait_for_connected(sender_collector) is not None, "Sender did not reach Connected/PartiallyConnected state" - - message = create_message_bindings( - payload=to_base64("S02 auto-subscribe test payload"), - contentTopic="/test/1/s02-auto-subscribe/proto", - ) - - send_result = sender.send_message(message=message) - assert send_result.is_ok(), f"send() failed: {send_result.err()}" - - request_id = send_result.ok_value - assert request_id, "send() returned an empty RequestId" - - propagated = wait_for_propagated( - collector=sender_collector, - request_id=request_id, - timeout_s=PROPAGATED_TIMEOUT_S, - ) - assert propagated is not None, ( - f"No message_propagated event within {PROPAGATED_TIMEOUT_S}s. " f"Collected events: {sender_collector.events}" - ) - assert propagated["requestId"] == request_id - - error = wait_for_error(sender_collector, request_id, timeout_s=0) - assert error is None, f"Unexpected message_error event: {error}" - - -class TestS06CoreSenderRelayOnly(StepsCommon): - """ - S06 — Core sender with relay peers only, no store. - Sender has local relay enabled and is connected to one relay peer. - Expected: send() returns Ok(RequestId), message_propagated event arrives, - no message_sent (store disabled), no message_error. - """ - - def test_s06_relay_propagation_without_store(self, node_config): - sender_collector = EventCollector() - - node_config.update( - { - "relay": True, - "store": False, - "lightpush": False, - "filter": False, - "discv5Discovery": False, - "numShardsInNetwork": 1, - "reliabilityEnabled": True, - } - ) - - sender_result = WrapperManager.create_and_start( - config=node_config, - event_cb=sender_collector.event_callback, - ) - assert sender_result.is_ok(), f"Failed to start sender: {sender_result.err()}" - - with sender_result.ok_value as sender: - peer_config = { - **node_config, - "staticnodes": [get_node_multiaddr(sender)], - "portsShift": 1, - } - - peer_result = WrapperManager.create_and_start(config=peer_config) - assert peer_result.is_ok(), f"Failed to start relay peer: {peer_result.err()}" - - with peer_result.ok_value: - assert wait_for_connected(sender_collector) is not None, "Sender did not reach Connected/PartiallyConnected state" - - message = create_message_bindings( - payload=to_base64("S06 relay-only test payload"), - contentTopic="/test/1/s06-relay-only/proto", - ) - - send_result = sender.send_message(message=message) - assert send_result.is_ok(), f"send() failed: {send_result.err()}" - - request_id = send_result.ok_value - assert request_id, "send() returned an empty RequestId" - - propagated = wait_for_propagated( - collector=sender_collector, - request_id=request_id, - timeout_s=PROPAGATED_TIMEOUT_S, - ) - assert propagated is not None, ( - f"No message_propagated event within {PROPAGATED_TIMEOUT_S}s. " f"Collected events: {sender_collector.events}" - ) - assert propagated["requestId"] == request_id - - error = wait_for_error(sender_collector, request_id, timeout_s=0) - assert error is None, f"Unexpected message_error event: {error}" - - sent = wait_for_sent(sender_collector, request_id, timeout_s=0) - assert sent is None, f"Unexpected message_sent event (store is disabled): {sent}" - - assert_event_invariants(sender_collector, request_id) - - -class TestS07CoreSenderRelayAndStore(StepsCommon): - """ - S07 — Core sender with relay peers and store peer, reliability enabled. - Sender relays message to a store-capable peer; delivery service validates - the message reached the store via p2p reliability check. - Expected: Propagated, then Sent. - """ - - def test_s07_relay_propagation_with_store_validation(self, node_config): - sender_collector = EventCollector() - - node_config.update( - { - "relay": True, - "store": False, - "lightpush": False, - "filter": False, - "discv5Discovery": False, - "numShardsInNetwork": 1, - "reliabilityEnabled": True, - } - ) - - sender_result = WrapperManager.create_and_start( - config=node_config, - event_cb=sender_collector.event_callback, - ) - assert sender_result.is_ok(), f"Failed to start sender: {sender_result.err()}" - - with sender_result.ok_value as sender: - peer_config = { - **node_config, - "staticnodes": [get_node_multiaddr(sender)], - "portsShift": 1, - "store": True, - } - - peer_result = WrapperManager.create_and_start(config=peer_config) - assert peer_result.is_ok(), f"Failed to start store peer: {peer_result.err()}" - - with peer_result.ok_value: - message = create_message_bindings( - payload=to_base64("S07 relay+store test payload"), - contentTopic="/test/1/s07-relay-store/proto", - ) - - send_result = sender.send_message(message=message) - assert send_result.is_ok(), f"send() failed: {send_result.err()}" - - request_id = send_result.ok_value - assert request_id, "send() returned an empty RequestId" - - propagated = wait_for_propagated( - collector=sender_collector, - request_id=request_id, - timeout_s=PROPAGATED_TIMEOUT_S, - ) - assert propagated is not None, ( - f"No message_propagated event within {PROPAGATED_TIMEOUT_S}s. " f"Collected events: {sender_collector.events}" - ) - assert propagated["requestId"] == request_id - - sent = wait_for_sent( - collector=sender_collector, - request_id=request_id, - timeout_s=SENT_TIMEOUT_S, - ) - assert sent is not None, ( - f"No message_sent event within {SENT_TIMEOUT_S}s after propagation. " f"Collected events: {sender_collector.events}" - ) - assert sent["requestId"] == request_id - - error = wait_for_error(sender_collector, request_id, timeout_s=0) - assert error is None, f"Unexpected message_error event: {error}" - - assert_event_invariants(sender_collector, request_id) +# Default-cluster shard-0 pubsub topic; used to subscribe the S11 docker store +# peer so it joins the same relay mesh as the wrapper nodes (wrapper config +# uses numShardsInNetwork=1 => shard 0). +STORE_PEER_PUBSUB_TOPIC = f"/waku/2/rs/{DEFAULT_CLUSTER_ID}/0" class TestRelayToLightpushFallback(StepsCommon): @@ -417,7 +200,7 @@ class TestS10EdgeSenderLightpushOnly(StepsCommon): """ @pytest.mark.xfail(reason="lightpush peer discovery via staticnodes is broken, see https://github.com/logos-messaging/logos-delivery/issues/3847") - def test_s10_edge_lightpush_propagation(self, node_config): + def test_s10_edge_lightpush_propagation(self): sender_collector = EventCollector() common = { @@ -488,60 +271,109 @@ class TestS10EdgeSenderLightpushOnly(StepsCommon): assert_event_invariants(sender_collector, request_id) -class TestS12IsolatedSenderNoPeers(StepsCommon): +class TestS11EdgeSenderLightpushAndStore(StepsCommon): """ - S12 — Isolated sender, no peers. - Sender has relay enabled but zero relay peers and zero lightpush peers. - Expected: send() returns Ok(RequestId), but eventually a message_error - event arrives (no route to propagate). + S11 — Edge sender with lightpush path and store validation. + Edge sender has no local relay; it publishes via a wrapper lightpush peer + and validates delivery via a docker store peer. Reliability enabled. + Topology: + [LightpushPeer] wrapper, relay=True, lightpush=True, store=False + [StorePeer] docker WakuNode, relay=true, store=true, + dials the lightpush peer via add_peers and subscribes + to the same shard-0 pubsub topic so it joins the + relay mesh and archives propagated messages. + [Edge] wrapper, mode="Edge", + staticnodes=[lightpush_peer], + storenode=store_peer, + reliabilityEnabled=True + Expected: send() returns Ok(RequestId), Propagated arrives, then Sent + (store validation succeeds), no Error. + Purpose: edge-mode fully validated success path against a real docker + store node (cross-implementation check). """ - def test_s12_send_with_no_peers_produces_error(self, node_config): + def test_s11_edge_lightpush_with_store_validation(self): sender_collector = EventCollector() - node_config.update( - { - "relay": True, - "store": False, - "lightpush": False, - "filter": False, - "discv5Discovery": False, - "numShardsInNetwork": 1, - } + common = { + "filter": False, + "discv5Discovery": True, + "numShardsInNetwork": 1, + } + + lightpush_config = build_node_config( + relay=True, + lightpush=True, + store=False, + **common, ) - sender_result = WrapperManager.create_and_start( - config=node_config, - event_cb=sender_collector.event_callback, - ) - assert sender_result.is_ok(), f"Failed to start sender: {sender_result.err()}" + lightpush_result = WrapperManager.create_and_start(config=lightpush_config) + assert lightpush_result.is_ok(), f"Failed to start lightpush peer: {lightpush_result.err()}" - with sender_result.ok_value as sender: - message = create_message_bindings( - payload=to_base64("S12 isolated sender payload"), - contentTopic="/test/1/s12-isolated/proto", + with lightpush_result.ok_value as lightpush_peer: + lightpush_multiaddr = get_node_multiaddr(lightpush_peer) + + # Docker store peer — real nwaku node running as the store backend. + # Dial the lightpush peer via add_peers and subscribe to the same + # shard-0 pubsub topic so it joins the relay mesh and archives + # messages propagated by the lightpush peer. + store_peer = WakuNode(NODE_1, f"s11_store_{self.test_id}") + store_peer.start(relay="true", store="true") + self.add_node_peer(store_peer, [lightpush_multiaddr]) + store_peer.set_relay_subscriptions([STORE_PEER_PUBSUB_TOPIC]) + store_multiaddr = store_peer.get_multiaddr_with_id() + + edge_config = build_node_config( + mode="Edge", + # assuming disc5v already happened + staticnodes=[lightpush_multiaddr, store_multiaddr], + reliabilityEnabled=True, + **common, ) - send_result = sender.send_message(message=message) - assert send_result.is_ok(), f"send() must return Ok(RequestId) even with no peers, got: {send_result.err()}" - - request_id = send_result.ok_value - assert request_id, "send() returned an empty RequestId" - - error = wait_for_error( - collector=sender_collector, - request_id=request_id, - timeout_s=ERROR_AFTER_CACHE_EXPIRY_TIMEOUT_S, + edge_result = WrapperManager.create_and_start( + config=edge_config, + event_cb=sender_collector.event_callback, ) - assert error is not None, ( - f"No message_error event within {ERROR_AFTER_CACHE_EXPIRY_TIMEOUT_S}s " - f"(MaxTimeInCache={MAX_TIME_IN_CACHE_S}s + slack) for isolated sender. " - f"Collected events: {sender_collector.events}" - ) - assert error["requestId"] == request_id + assert edge_result.is_ok(), f"Failed to start edge sender: {edge_result.err()}" - propagated = wait_for_propagated(sender_collector, request_id, timeout_s=0) - assert propagated is None, f"Unexpected message_propagated event for isolated sender: {propagated}" + with edge_result.ok_value as edge_sender: + message = create_message_bindings( + payload=to_base64("S11 edge lightpush + store test payload"), + contentTopic="/test/1/s11-edge-lightpush-store/proto", + ) + + send_result = edge_sender.send_message(message=message) + assert send_result.is_ok(), f"send() failed: {send_result.err()}" + + request_id = send_result.ok_value + assert request_id, "send() returned an empty RequestId" + + propagated = wait_for_propagated( + collector=sender_collector, + request_id=request_id, + timeout_s=PROPAGATED_TIMEOUT_S, + ) + assert propagated is not None, ( + f"No message_propagated event within {PROPAGATED_TIMEOUT_S}s. " f"Collected events: {sender_collector.events}" + ) + assert propagated["requestId"] == request_id + + sent = wait_for_sent( + collector=sender_collector, + request_id=request_id, + timeout_s=SENT_AFTER_STORE_TIMEOUT_S, + ) + assert sent is not None, ( + f"No message_sent event within {SENT_AFTER_STORE_TIMEOUT_S}s " f"after propagation. Collected events: {sender_collector.events}" + ) + assert sent["requestId"] == request_id + + error = wait_for_error(sender_collector, request_id, timeout_s=0) + assert error is None, f"Unexpected message_error event: {error}" + + assert_event_invariants(sender_collector, request_id) class TestS14LightpushNonRetryableError(StepsCommon): @@ -698,3 +530,122 @@ class TestS15LightpushRetryableErrorRecovery(StepsCommon): assert error is None, f"Unexpected message_error after recovery: {error}" assert_event_invariants(sender_collector, request_id) + + +class TestS26LightpushPeerChurn(StepsCommon): + """ + S26: multiple lightpush peers, the selected one disappears, + an alternate remains. + + Topology (3 peers + sender): + - peer1: relay + lightpush. The lightpush server initially selected + by the sender. Stopped mid-test to simulate churn. + - relay_peer: relay-only. Kept alive throughout the test as a + stable gossipsub mesh neighbour, so that after peer1 disappears + peer2 still has a relay path to propagate the message. + - peer2: relay + lightpush. The surviving lightpush server that + must take over once peer1 is gone. + - sender: edge node with peer1 and peer2 as static lightpush peers. + """ + + def test_s26_lightpush_peer_churn_alternate_remains(self, node_config): + sender_collector = EventCollector() + peer1_config = { + **node_config, + "relay": True, + "lightpush": True, + "store": False, + "filter": False, + "discv5Discovery": True, + "numShardsInNetwork": 1, + "portsShift": 1, + "discv5UdpPort": free_port(), + } + peer1_result = WrapperManager.create_and_start(config=peer1_config) + assert peer1_result.is_ok(), f"Failed to start lightpush peer1: {peer1_result.err()}" + peer1 = peer1_result.ok_value + + relay_config = { + **node_config, + "relay": True, + "lightpush": False, + "store": False, + "filter": False, + "discv5Discovery": False, + "numShardsInNetwork": 1, + "portsShift": 4, + } + + relay_result = WrapperManager.create_and_start(config=relay_config) + assert relay_result.is_ok(), f"Failed to start relay peer: {relay_result.err()}" + + with relay_result.ok_value as relay_peer: + peer2_config = { + **peer1_config, + "staticnodes": [ + get_node_multiaddr(peer1), + get_node_multiaddr(relay_peer), + ], + "portsShift": 2, + "discv5UdpPort": free_port(), + } + + peer2_result = WrapperManager.create_and_start(config=peer2_config) + assert peer2_result.is_ok(), f"Failed to start lightpush peer2: {peer2_result.err()}" + + with peer2_result.ok_value as peer2: + sender_config = { + **node_config, + "mode": "Edge", + "relay": True, + "lightpush": True, + "store": False, + "filter": False, + "discv5Discovery": False, + "numShardsInNetwork": 1, + "portsShift": 3, + "staticnodes": [ + get_node_multiaddr(peer1), + get_node_multiaddr(peer2), + ], + } + + sender_result = WrapperManager.create_and_start( + config=sender_config, + event_cb=sender_collector.event_callback, + ) + assert sender_result.is_ok(), f"Failed to start sender: {sender_result.err()}" + + with sender_result.ok_value as sender_node: + delay(2) + stop_result = peer1.stop_and_destroy() + assert stop_result.is_ok(), f"Failed to stop peer1: {stop_result.err()}" + delay(2) + + message = create_message_bindings() + send_result = sender_node.send_message(message=message) + assert send_result.is_ok(), f"send() must return Ok(RequestId) during peer churn, got: {send_result.err()}" + + request_id = send_result.ok_value + assert request_id, "send() returned an empty RequestId" + + # Expect Propagated via the surviving lightpush peer (peer2). + propagated_event = wait_for_propagated( + collector=sender_collector, + request_id=request_id, + timeout_s=PROPAGATED_TIMEOUT_S, + ) + assert propagated_event is not None, ( + f"No MessagePropagatedEvent within {PROPAGATED_TIMEOUT_S}s " + f"after the selected lightpush peer disappeared. " + f"Collected events: {sender_collector.events}" + ) + + error_event = wait_for_error( + collector=sender_collector, + request_id=request_id, + timeout_s=0, + ) + assert error_event is None, f"Unexpected message_error event during peer churn: {error_event}" + + assert_event_invariants(sender_collector, request_id) diff --git a/tests/wrappers_tests/test_send_e2e_part1.py b/tests/wrappers_tests/test_send_relay_propagation.py similarity index 52% rename from tests/wrappers_tests/test_send_e2e_part1.py rename to tests/wrappers_tests/test_send_relay_propagation.py index 6a94adf81..a1cfce07c 100644 --- a/tests/wrappers_tests/test_send_e2e_part1.py +++ b/tests/wrappers_tests/test_send_relay_propagation.py @@ -1,8 +1,7 @@ -from concurrent.futures import ThreadPoolExecutor - import pytest from src.env_vars import NODE_2 from src.steps.common import StepsCommon +from src.steps.store import StepsStore from src.libs.common import delay, to_base64 from src.libs.custom_logger import get_custom_logger from src.node.waku_node import WakuNode @@ -17,55 +16,179 @@ from src.node.wrapper_helpers import ( wait_for_sent, wait_for_error, ) -from src.steps.store import StepsStore from tests.wrappers_tests.conftest import free_port logger = get_custom_logger(__name__) -## max time to wait after sending the message PROPAGATED_TIMEOUT_S = 30.0 SENT_TIMEOUT_S = 10.0 NO_SENT_OBSERVATION_S = 5.0 SENT_AFTER_STORE_TIMEOUT_S = 60.0 NO_STORE_OBSERVATION_S = 60.0 -RECOVERY_TIMEOUT_S = 45.0 # S20 stabilization delays for gossipsub mesh formation. MESH_STABILIZATION_S = 10 STORE_JOIN_STABILIZATION_S = 10 -# MaxTimeInCache from send_service.nim. -MAX_TIME_IN_CACHE_S = 60.0 -# Extra slack to cover the background retry loop tick after the window expires. -CACHE_EXPIRY_SLACK_S = 10.0 -ERROR_AFTER_CACHE_EXPIRY_TIMEOUT_S = MAX_TIME_IN_CACHE_S + CACHE_EXPIRY_SLACK_S -RETRY_WINDOW_EXPIRED_MSG = "Unable to send within retry time window" -# S30: concurrent sends on the same content topic during initial auto-subscribe. -S30_CONCURRENT_SENDS = 5 -S30_CONTENT_TOPIC = "/test/1/s30-concurrent/proto" +class TestS06CoreSenderRelayOnly(StepsCommon): + """ + S06 — Core sender with relay peers only, no store. + Sender has local relay enabled and is connected to one relay peer. + Expected: send() returns Ok(RequestId), message_propagated event arrives, + no message_sent (store disabled), no message_error. + """ -# S31: concurrent sends across mixed topics during peer churn. -S31_BURST_SIZE = 8 -S31_CONTENT_TOPICS = [ - "/test/1/s31-topic-a/proto", - "/test/1/s31-topic-b/proto", - "/test/1/s31-topic-c/proto", - "/test/1/s31-topic-d/proto", - "/test/1/s31-topic-e/proto", - "/test/1/s31-topic-f/proto", - "/test/1/s31-topic-g/proto", - "/test/1/s31-topic-h/proto", -] + def test_s06_relay_propagation_without_store(self, node_config): + sender_collector = EventCollector() + + node_config.update( + { + "relay": True, + "store": False, + "lightpush": False, + "filter": False, + "discv5Discovery": False, + "numShardsInNetwork": 1, + "reliabilityEnabled": True, + } + ) + + sender_result = WrapperManager.create_and_start( + config=node_config, + event_cb=sender_collector.event_callback, + ) + assert sender_result.is_ok(), f"Failed to start sender: {sender_result.err()}" + + with sender_result.ok_value as sender: + peer_config = { + **node_config, + "staticnodes": [get_node_multiaddr(sender)], + "portsShift": 1, + } + + peer_result = WrapperManager.create_and_start(config=peer_config) + assert peer_result.is_ok(), f"Failed to start relay peer: {peer_result.err()}" + + with peer_result.ok_value: + assert wait_for_connected(sender_collector) is not None, "Sender did not reach Connected/PartiallyConnected state" + + message = create_message_bindings( + payload=to_base64("S06 relay-only test payload"), + contentTopic="/test/1/s06-relay-only/proto", + ) + + send_result = sender.send_message(message=message) + assert send_result.is_ok(), f"send() failed: {send_result.err()}" + + request_id = send_result.ok_value + assert request_id, "send() returned an empty RequestId" + + propagated = wait_for_propagated( + collector=sender_collector, + request_id=request_id, + timeout_s=PROPAGATED_TIMEOUT_S, + ) + assert propagated is not None, ( + f"No message_propagated event within {PROPAGATED_TIMEOUT_S}s. " f"Collected events: {sender_collector.events}" + ) + assert propagated["requestId"] == request_id + + error = wait_for_error(sender_collector, request_id, timeout_s=0) + assert error is None, f"Unexpected message_error event: {error}" + + sent = wait_for_sent(sender_collector, request_id, timeout_s=0) + assert sent is None, f"Unexpected message_sent event (store is disabled): {sent}" + + assert_event_invariants(sender_collector, request_id) -class TestSendBeforeRelay(StepsStore): +class TestS07CoreSenderRelayAndStore(StepsCommon): + """ + S07 — Core sender with relay peers and store peer, reliability enabled. + Sender relays message to a store-capable peer; delivery service validates + the message reached the store via p2p reliability check. + Expected: Propagated, then Sent. + """ + + def test_s07_relay_propagation_with_store_validation(self, node_config): + sender_collector = EventCollector() + + node_config.update( + { + "relay": True, + "store": False, + "lightpush": False, + "filter": False, + "discv5Discovery": False, + "numShardsInNetwork": 1, + "reliabilityEnabled": True, + } + ) + + sender_result = WrapperManager.create_and_start( + config=node_config, + event_cb=sender_collector.event_callback, + ) + assert sender_result.is_ok(), f"Failed to start sender: {sender_result.err()}" + + with sender_result.ok_value as sender: + peer_config = { + **node_config, + "staticnodes": [get_node_multiaddr(sender)], + "portsShift": 1, + "store": True, + } + + peer_result = WrapperManager.create_and_start(config=peer_config) + assert peer_result.is_ok(), f"Failed to start store peer: {peer_result.err()}" + + with peer_result.ok_value: + message = create_message_bindings( + payload=to_base64("S07 relay+store test payload"), + contentTopic="/test/1/s07-relay-store/proto", + ) + + send_result = sender.send_message(message=message) + assert send_result.is_ok(), f"send() failed: {send_result.err()}" + + request_id = send_result.ok_value + assert request_id, "send() returned an empty RequestId" + + propagated = wait_for_propagated( + collector=sender_collector, + request_id=request_id, + timeout_s=PROPAGATED_TIMEOUT_S, + ) + assert propagated is not None, ( + f"No message_propagated event within {PROPAGATED_TIMEOUT_S}s. " f"Collected events: {sender_collector.events}" + ) + assert propagated["requestId"] == request_id + + sent = wait_for_sent( + collector=sender_collector, + request_id=request_id, + timeout_s=SENT_TIMEOUT_S, + ) + assert sent is not None, ( + f"No message_sent event within {SENT_TIMEOUT_S}s after propagation. " f"Collected events: {sender_collector.events}" + ) + assert sent["requestId"] == request_id + + error = wait_for_error(sender_collector, request_id, timeout_s=0) + assert error is None, f"Unexpected message_error event: {error}" + + assert_event_invariants(sender_collector, request_id) + + +class TestS17SendBeforeRelayPeersJoin(StepsCommon): + """ + S17: sender starts isolated, calls send() + - send() returns Ok(RequestId) immediately + - Propagated event eventually arrives after a relay peer joins + """ + def test_s17_send_before_relay_peers_joins(self, node_config): - """ - S17: sender starts isolated, calls send() - - send() returns Ok(RequestId) immediately - - Propagated event eventually arrives - """ sender_collector = EventCollector() node_config.update( @@ -132,15 +255,18 @@ class TestSendBeforeRelay(StepsStore): assert_event_invariants(sender_collector, request_id) + +class TestS19StorePeerAppearsAfterPropagation(StepsStore): + """ + S19: a store peer comes online later. + - send() returns Ok(RequestId) immediately + - Propagated --- relay peer + - Sent when store peer is reachable + """ + @pytest.mark.docker_required @pytest.mark.xfail(reason="fails to republish after store peer joins mesh see https://github.com/logos-messaging/logos-delivery/issues/3848") def test_s19_store_peer_appears_after_propagation(self, node_config): - """ - S19: a store peer comes online later. - - send() returns Ok(RequestId) immediately - - Propagated --- relay peer - - Sent when store peer is reachable - """ sender_collector = EventCollector() node_config.update({"relay": True, "store": False, "discv5Discovery": False, "numShardsInNetwork": 1, "reliabilityEnabled": True}) @@ -228,20 +354,22 @@ class TestSendBeforeRelay(StepsStore): assert_event_invariants(sender_collector, request_id) + +class TestS20StoreMissesInitiallyThenRetrySucceeds(StepsStore): + """ + S20: relay propagation succeeds, the first store query misses + (the store peer is reachable but does not yet have the message), + a later retry republishes through the relay mesh, and the store + peer then archives it. + + Covers state flow: + SuccessfullyPropagated -> NextRoundRetry + -> SuccessfullyPropagated -> SuccessfullyValidated + """ + @pytest.mark.docker_required @pytest.mark.skip(reason="Forcing the miss store round not possible") def test_s20_store_misses_initially_then_retry_succeeds(self, node_config): - """ - S20: relay propagation succeeds, the first store query misses - (the store peer is reachable but does not yet have the message), - a later retry republishes through the relay mesh, and the store - peer then archives it. - - Covers state flow: - SuccessfullyPropagated -> NextRoundRetry - -> SuccessfullyPropagated -> SuccessfullyValidated - - """ sender_collector = EventCollector() store_node = WakuNode(NODE_2, f"s20_store_node_{self.test_id}") store_node.start( @@ -350,66 +478,16 @@ class TestSendBeforeRelay(StepsStore): assert_event_invariants(sender_collector, request_id) - def test_s21_error_when_retry_window_expires(self, node_config): - """ - S21: delivery retry window expires before any valid path recovers. - """ - sender_collector = EventCollector() - node_config.update( - { - "relay": True, - "store": False, - "lightpush": False, - "filter": False, - "discv5Discovery": False, - "numShardsInNetwork": 1, - } - ) - - sender_result = WrapperManager.create_and_start( - config=node_config, - event_cb=sender_collector.event_callback, - ) - assert sender_result.is_ok(), f"Failed to start sender: {sender_result.err()}" - - with sender_result.ok_value as sender_node: - message = create_message_bindings() - send_result = sender_node.send_message(message=message) - assert send_result.is_ok(), f"send() must return Ok(RequestId) even with no peers, got: {send_result.err()}" - - request_id = send_result.ok_value - assert request_id, "send() returned an empty RequestId" - - # No peer - error_event = wait_for_error( - collector=sender_collector, - request_id=request_id, - timeout_s=ERROR_AFTER_CACHE_EXPIRY_TIMEOUT_S, - ) - assert error_event is not None, ( - f"No MessageErrorEvent received within {ERROR_AFTER_CACHE_EXPIRY_TIMEOUT_S}s " - f"(MaxTimeInCache={MAX_TIME_IN_CACHE_S}s + slack). " - f"Collected events: {sender_collector.events}" - ) - logger.info(f"S21 received error event: {error_event}") - - assert error_event.get("error") == RETRY_WINDOW_EXPIRED_MSG, ( - f"Unexpected error message in message_error event.\n" - f"Expected: {RETRY_WINDOW_EXPIRED_MSG!r}\n" - f"Got: {error_event.get('error')!r}\n" - f"Full event: {error_event}" - ) - - assert_event_invariants(sender_collector, request_id) +class TestS22NonEphemeralWithReliabilityDisabled(StepsCommon): + """ + S22: non-ephemeral message with reliabilityEnabled disabled. + - propagation path exists ,reliabilityEnabled = false. + - Expected: Ok(RequestId), Propagated event only, no Sent event. + Note: S17 already covers the positive path of this test with reliabilityEnabled=True. + """ def test_s22_non_ephemeral_message_with_reliability_disabled(self, node_config): - """ - S22: non-ephemeral message with reliabilityEnabled disabled. - - propagation path exists ,reliabilityEnabled = false. - - Expected: Ok(RequestId), Propagated event only, no Sent event. - Note: S17 already covers the positive path of this test with reliabilityEnabled=True. - """ sender_collector = EventCollector() node_config.update( @@ -476,11 +554,14 @@ class TestSendBeforeRelay(StepsStore): assert_event_invariants(sender_collector, request_id) + +class TestS23NoSentEventWhenRelayHasNoStore(StepsCommon): + """ + S23: non-ephemeral message, reliability enabled, no store peer ever reachable. + - Expected: Ok(RequestId), Propagated event only, no Sent and no terminal error. + """ + def test_s23_no_sent_event_when_relay_has_no_store(self, node_config): - """ - S23: non-ephemeral message, reliability enabled, no store peer ever reachable. - - Expected: Ok(RequestId), Propagated event only, no Sent and no terminal error. - """ sender_collector = EventCollector() node_config.update( @@ -557,13 +638,15 @@ class TestSendBeforeRelay(StepsStore): assert_event_invariants(sender_collector, request_id) - def test_s24_ephemeral_message_with_reachable_store(self, node_config): - """ - S24: ephemeral message, reliability enabled, reachable store peer. - - Setup: propagation path exists, relay peer has store=True (reachable), - - Expected: Ok(RequestId), Propagated event only, no Sent event. - """ +class TestS24EphemeralMessageWithReachableStore(StepsCommon): + """ + S24: ephemeral message, reliability enabled, reachable store peer. + - Setup: propagation path exists, relay peer has store=True (reachable), + - Expected: Ok(RequestId), Propagated event only, no Sent event. + """ + + def test_s24_ephemeral_message_with_reachable_store(self, node_config): sender_collector = EventCollector() node_config.update( @@ -623,398 +706,3 @@ class TestSendBeforeRelay(StepsStore): ) assert_event_invariants(sender_collector, request_id) - - def test_s26_lightpush_peer_churn_alternate_remains(self, node_config): - """ - S26: multiple lightpush peers, the selected one disappears, - an alternate remains. - - Topology (3 peers + sender): - - peer1: relay + lightpush. The lightpush server initially selected - by the sender. Stopped mid-test to simulate churn. - - relay_peer: relay-only. Kept alive throughout the test as a - stable gossipsub mesh neighbour, so that after peer1 disappears - peer2 still has a relay path to propagate the message. - - peer2: relay + lightpush. The surviving lightpush server that - must take over once peer1 is gone. - - sender: edge node with peer1 and peer2 as static lightpush peers. - """ - sender_collector = EventCollector() - peer1_config = { - **node_config, - "relay": True, - "lightpush": True, - "store": False, - "filter": False, - "discv5Discovery": True, - "numShardsInNetwork": 1, - "portsShift": 1, - "discv5UdpPort": free_port(), - } - peer1_result = WrapperManager.create_and_start(config=peer1_config) - assert peer1_result.is_ok(), f"Failed to start lightpush peer1: {peer1_result.err()}" - peer1 = peer1_result.ok_value - - relay_config = { - **node_config, - "relay": True, - "lightpush": False, - "store": False, - "filter": False, - "discv5Discovery": False, - "numShardsInNetwork": 1, - "portsShift": 4, - } - - relay_result = WrapperManager.create_and_start(config=relay_config) - assert relay_result.is_ok(), f"Failed to start relay peer: {relay_result.err()}" - - with relay_result.ok_value as relay_peer: - peer2_config = { - **peer1_config, - "staticnodes": [ - get_node_multiaddr(peer1), - get_node_multiaddr(relay_peer), - ], - "portsShift": 2, - "discv5UdpPort": free_port(), - } - - peer2_result = WrapperManager.create_and_start(config=peer2_config) - assert peer2_result.is_ok(), f"Failed to start lightpush peer2: {peer2_result.err()}" - - with peer2_result.ok_value as peer2: - sender_config = { - **node_config, - "mode": "Edge", - "relay": True, - "lightpush": True, - "store": False, - "filter": False, - "discv5Discovery": False, - "numShardsInNetwork": 1, - "portsShift": 3, - "staticnodes": [ - get_node_multiaddr(peer1), - get_node_multiaddr(peer2), - ], - } - - sender_result = WrapperManager.create_and_start( - config=sender_config, - event_cb=sender_collector.event_callback, - ) - assert sender_result.is_ok(), f"Failed to start sender: {sender_result.err()}" - - with sender_result.ok_value as sender_node: - delay(2) - stop_result = peer1.stop_and_destroy() - assert stop_result.is_ok(), f"Failed to stop peer1: {stop_result.err()}" - delay(2) - - message = create_message_bindings() - send_result = sender_node.send_message(message=message) - assert send_result.is_ok(), f"send() must return Ok(RequestId) during peer churn, got: {send_result.err()}" - - request_id = send_result.ok_value - assert request_id, "send() returned an empty RequestId" - - # Expect Propagated via the surviving lightpush peer (peer2). - propagated_event = wait_for_propagated( - collector=sender_collector, - request_id=request_id, - timeout_s=PROPAGATED_TIMEOUT_S, - ) - assert propagated_event is not None, ( - f"No MessagePropagatedEvent within {PROPAGATED_TIMEOUT_S}s " - f"after the selected lightpush peer disappeared. " - f"Collected events: {sender_collector.events}" - ) - - error_event = wait_for_error( - collector=sender_collector, - request_id=request_id, - timeout_s=0, - ) - assert error_event is None, f"Unexpected message_error event during peer churn: {error_event}" - - assert_event_invariants(sender_collector, request_id) - - def test_s30_concurrent_sends_during_auto_subscribe(self, node_config): - """ - S30: concurrent sends on the same content topic during initial auto-subscribe. - - Sender starts unsubscribed to the target topic. - - Several send() calls are issued at nearly the same time. - - Each call must return Ok(RequestId) with a unique id. - - Each request id must get its own propagated event, - with no dropped or cross-associated events. - """ - sender_collector = EventCollector() - - node_config.update( - { - "relay": True, - "store": False, - "discv5Discovery": False, - "numShardsInNetwork": 1, - } - ) - - sender_result = WrapperManager.create_and_start( - config=node_config, - event_cb=sender_collector.event_callback, - ) - assert sender_result.is_ok(), f"Failed to start sender: {sender_result.err()}" - - with sender_result.ok_value as sender_node: - # Relay peer so the sender has a propagation path. - relay_config = { - **node_config, - "staticnodes": [get_node_multiaddr(sender_node)], - "portsShift": 1, - } - - relay_result = WrapperManager.create_and_start(config=relay_config) - assert relay_result.is_ok(), f"Failed to start relay peer: {relay_result.err()}" - - with relay_result.ok_value: - # Build one message per send, with distinct payloads so we can - # detect any cross-association between request ids and events. - messages = [ - create_message_bindings( - contentTopic=S30_CONTENT_TOPIC, - payload=to_base64(f"s30-concurrent-{i}"), - ) - for i in range(S30_CONCURRENT_SENDS) - ] - - # Fire all sends concurrently. The sender is not yet subscribed - # to S30_CONTENT_TOPIC, so this exercises the auto-subscribe path - # under contention. - with ThreadPoolExecutor(max_workers=S30_CONCURRENT_SENDS) as pool: - send_results = list(pool.map(sender_node.send_message, messages)) - - # Every send must return Ok(RequestId). - request_ids = [] - for i, send_result in enumerate(send_results): - assert send_result.is_ok(), f"Concurrent send #{i} failed: {send_result.err()}" - request_id = send_result.ok_value - assert request_id, f"Concurrent send #{i} returned an empty RequestId" - request_ids.append(request_id) - - # Request ids must be unique across concurrent sends. - assert len(set(request_ids)) == len(request_ids), f"Duplicate RequestIds returned by concurrent sends: {request_ids}" - - # Each request id must get its own propagated event and no error. - for request_id in request_ids: - propagated_event = wait_for_propagated( - collector=sender_collector, - request_id=request_id, - timeout_s=PROPAGATED_TIMEOUT_S, - ) - assert propagated_event is not None, ( - f"No MessagePropagatedEvent for request_id={request_id} " - f"within {PROPAGATED_TIMEOUT_S}s. " - f"Collected events: {sender_collector.events}" - ) - - error_event = wait_for_error( - collector=sender_collector, - request_id=request_id, - timeout_s=0, - ) - assert error_event is None, f"Unexpected message_error for request_id={request_id}: {error_event}" - - # Cross-association guard: every event with a requestId must - # belong to exactly one of the request ids we issued. - issued = set(request_ids) - for event in sender_collector.snapshot(): - event_request_id = event.get("requestId") - if event_request_id is None: - continue - assert event_request_id in issued, ( - f"Event carries an unknown requestId={event_request_id!r}, " f"not in issued set {issued}. Event: {event}" - ) - - # Per-request invariants apply to every concurrent send - # (correct requestId, no duplicate terminal events, - # Sent never before Propagated). - for request_id in request_ids: - assert_event_invariants(sender_collector, request_id) - - @pytest.mark.docker_required - def test_s31_concurrent_sends_mixed_topics_during_churn(self, node_config): - """ - S31: concurrent sends across mixed content topics during peer churn. - """ - sender_collector = EventCollector() - - relay_peer = WakuNode(NODE_2, f"s31_relay_peer_{self.test_id}") - relay_peer.start(relay="true", discv5_discovery="false") - relay_peer.set_relay_subscriptions([self.test_pubsub_topic]) - - lightpush_peer = WakuNode(NODE_2, f"s31_lightpush_peer_{self.test_id}") - lightpush_peer.start(relay="true", lightpush="true", discv5_discovery="false") - lightpush_peer.set_relay_subscriptions([self.test_pubsub_topic]) - - store_peer = WakuNode(NODE_2, f"s31_store_peer_{self.test_id}") - store_peer.start(relay="true", store="true", discv5_discovery="false") - store_peer.set_relay_subscriptions([self.test_pubsub_topic]) - - churn_peers = [relay_peer, lightpush_peer, store_peer] - - # Mesh docker peers so a lightpushed message can fan out to the store peer. - peer_multiaddrs = [p.get_multiaddr_with_id() for p in churn_peers] - for peer in churn_peers: - others = [a for a in peer_multiaddrs if a != peer.get_multiaddr_with_id()] - peer.add_peers(others) - - node_config.update( - { - "mode": "Edge", - "relay": True, - "lightpush": True, - "store": False, - "discv5Discovery": False, - "numShardsInNetwork": 1, - "lightpushnode": lightpush_peer.get_multiaddr_with_id(), - } - ) - - sender_result = WrapperManager.create_and_start( - config=node_config, - event_cb=sender_collector.event_callback, - ) - assert sender_result.is_ok(), f"Failed to start sender: {sender_result.err()}" - - with sender_result.ok_value as sender_node: - sender_multiaddr = get_node_multiaddr(sender_node) - for peer in churn_peers: - peer.add_peers([sender_multiaddr]) - delay(3) # let docker peers connect to the sender - - all_request_ids: list[str] = [] - phase1_ids = self._s31_fire_burst(sender_node, phase_label="phase1") - all_request_ids.extend(phase1_ids) - - for peer in churn_peers: - peer.restart() - delay(1) # small window so the restart is actually in-flight - phase2_ids = self._s31_fire_burst(sender_node, phase_label="phase2") - all_request_ids.extend(phase2_ids) - - # Wait for all peers to be ready again and re-attach the sender. - for peer in churn_peers: - peer.ensure_ready(timeout_duration=20) - peer.add_peers([sender_multiaddr]) - - peer_multiaddrs = [p.get_multiaddr_with_id() for p in churn_peers] - for peer in churn_peers: - others = [a for a in peer_multiaddrs if a != peer.get_multiaddr_with_id()] - peer.add_peers(others) - delay(3) - - phase3_ids = self._s31_fire_burst(sender_node, phase_label="phase3") - all_request_ids.extend(phase3_ids) - - assert len(set(all_request_ids)) == len(all_request_ids), f"Duplicate RequestIds across bursts: {all_request_ids}" - - # Phase 1 ran before any churn, so the mesh was stable — standard timeout. - # Phase 3 ran right after restart + re-attach, so the mesh needed to - # re-stabilize — use the recovery timeout to avoid CI flakiness. - phase_timeouts = [ - (phase1_ids, PROPAGATED_TIMEOUT_S), - (phase3_ids, RECOVERY_TIMEOUT_S), - ] - for request_ids, timeout_s in phase_timeouts: - for request_id in request_ids: - propagated_event = wait_for_propagated( - collector=sender_collector, - request_id=request_id, - timeout_s=timeout_s, - ) - assert propagated_event is not None, ( - f"No MessagePropagatedEvent for stable-phase " - f"request_id={request_id} within {timeout_s}s. " - f"Collected events: {sender_collector.events}" - ) - - error_event = wait_for_error( - collector=sender_collector, - request_id=request_id, - timeout_s=0, - ) - assert error_event is None, f"Unexpected message_error event for stable-phase " f"request_id={request_id}: {error_event}" - - for request_id in phase2_ids: - error_event = wait_for_error( - collector=sender_collector, - request_id=request_id, - timeout_s=0, - ) - assert error_event is None, f"Unexpected terminal message_error for phase-2 " f"request_id={request_id} after recovery: {error_event}" - - issued = set(all_request_ids) - for event in sender_collector.snapshot(): - event_request_id = event.get("requestId") - if event_request_id is None: - continue - assert event_request_id in issued, ( - f"Event carries an unknown requestId={event_request_id!r}, " f"not in issued set {issued}. Event: {event}" - ) - - # Use the hash the wrapper emitted on message_sent so the store - # lookup matches the exact bytes that were actually published. - phase3_hashes = [] - for request_id in phase3_ids: - sent_event = wait_for_sent( - collector=sender_collector, - request_id=request_id, - timeout_s=RECOVERY_TIMEOUT_S, - ) - assert sent_event is not None, ( - f"No message_sent event for phase-3 request_id={request_id} " - f"within {RECOVERY_TIMEOUT_S}s. Collected events: {sender_collector.events}" - ) - msg_hash = sent_event.get("messageHash") - assert msg_hash, f"message_sent event missing messageHash: {sent_event}" - phase3_hashes.append(msg_hash) - - # 3 phases × S31_BURST_SIZE messages, so the page must fit them all, - # otherwise phase-3 hashes (which sort last in ascending order) get cut off. - self.check_sent_message_is_stored( - expected_hashes=phase3_hashes, - store_node=store_peer, - pubsub_topic=self.test_pubsub_topic, - page_size=S31_BURST_SIZE * 3, - ascending="true", - ) - - # Per-request invariants apply across all phases, including the - # retry-path bursts (phase 2). If retries ever emit duplicate - # Propagated events or reorder Sent before Propagated, this catches it. - for request_id in all_request_ids: - assert_event_invariants(sender_collector, request_id) - - def _s31_fire_burst(self, sender_node, *, phase_label: str) -> list[str]: - """Fire S31_BURST_SIZE concurrent sends, one per topic in S31_CONTENT_TOPICS. - Returns the list of RequestIds. Asserts every send returned Ok.""" - messages = [ - self.create_message( - contentTopic=S31_CONTENT_TOPICS[i], - payload=to_base64(f"s31-{phase_label}-{i}"), - ) - for i in range(S31_BURST_SIZE) - ] - - with ThreadPoolExecutor(max_workers=S31_BURST_SIZE) as pool: - send_results = list(pool.map(sender_node.send_message, messages)) - - request_ids = [] - for i, send_result in enumerate(send_results): - assert send_result.is_ok(), f"{phase_label}: concurrent send #{i} failed: {send_result.err()}" - request_id = send_result.ok_value - assert request_id, f"{phase_label}: concurrent send #{i} returned an empty RequestId" - request_ids.append(request_id) - - return request_ids diff --git a/vendor/logos-delivery-python-bindings b/vendor/logos-delivery-python-bindings index fd4d0a5a7..370693695 160000 --- a/vendor/logos-delivery-python-bindings +++ b/vendor/logos-delivery-python-bindings @@ -1 +1 @@ -Subproject commit fd4d0a5a7efcd56b395024f558afa416d7e27a2b +Subproject commit 370693695f8843a26108aa34c915bf6bd9f9f2d5 From 938e76194736c5de530649dff92f33039341e9ee Mon Sep 17 00:00:00 2001 From: AYAHASSAN287 <49167455+AYAHASSAN287@users.noreply.github.com> Date: Mon, 18 May 2026 17:04:16 +0300 Subject: [PATCH 2/9] E2e s13 s16 (#183) * Add S13 * Adding S16 * Add error message check for S13 * msrk s16 as xfail --- .../test_send_errors_and_concurrency.py | 81 +++++++++++++++++ .../test_send_lightpush_and_edge.py | 88 +++++++++++++++++++ 2 files changed, 169 insertions(+) diff --git a/tests/wrappers_tests/test_send_errors_and_concurrency.py b/tests/wrappers_tests/test_send_errors_and_concurrency.py index eaa895a60..7955f70b3 100644 --- a/tests/wrappers_tests/test_send_errors_and_concurrency.py +++ b/tests/wrappers_tests/test_send_errors_and_concurrency.py @@ -1,3 +1,4 @@ +import base64 from concurrent.futures import ThreadPoolExecutor import pytest @@ -13,6 +14,7 @@ from src.node.wrapper_helpers import ( assert_event_invariants, create_message_bindings, get_node_multiaddr, + wait_for_connected, wait_for_propagated, wait_for_sent, wait_for_error, @@ -30,6 +32,12 @@ CACHE_EXPIRY_SLACK_S = 10.0 ERROR_AFTER_CACHE_EXPIRY_TIMEOUT_S = MAX_TIME_IN_CACHE_S + CACHE_EXPIRY_SLACK_S RETRY_WINDOW_EXPIRED_MSG = "Unable to send within retry time window" +# Payload above DefaultMaxWakuMessageSize (150KiB), so the relay publish +# rejects it instead of failing with NO_PEERS_TO_RELAY. +OVERSIZED_PAYLOAD_BYTES = 200 * 1024 +ERROR_TIMEOUT_S = 30.0 +MESSAGE_SIZE_EXCEEDED_MSG = "Message size exceeded" + # S30: concurrent sends on the same content topic during initial auto-subscribe. S30_CONCURRENT_SENDS = 5 S30_CONTENT_TOPIC = "/test/1/s30-concurrent/proto" @@ -160,6 +168,79 @@ class TestS21ErrorWhenRetryWindowExpires(StepsCommon): assert_event_invariants(sender_collector, request_id) +class TestS13RelayHardFailureWithoutFallback(StepsCommon): + """ + S13: relay path is reachable (a relay peer is connected, so the publish + gets past NO_PEERS_TO_RELAY), but the relay publish fails for another + reason. An oversized payload is used so the relay processor rejects the + message immediately. No lightpush fallback is configured. + - Expected: Ok(RequestId), then a message_error event. + """ + + def test_s13_relay_hard_failure_without_fallback(self, node_config): + sender_collector = EventCollector() + + node_config.update( + { + "relay": True, + "numShardsInNetwork": 1, + } + ) + + sender_result = WrapperManager.create_and_start( + config=node_config, + event_cb=sender_collector.event_callback, + ) + assert sender_result.is_ok(), f"Failed to start sender: {sender_result.err()}" + + with sender_result.ok_value as sender_node: + relay_config = { + **node_config, + "staticnodes": [get_node_multiaddr(sender_node)], + "portsShift": 1, + } + + relay_result = WrapperManager.create_and_start(config=relay_config) + assert relay_result.is_ok(), f"Failed to start relay peer: {relay_result.err()}" + + with relay_result.ok_value: + # A connected relay peer means the publish gets past + # NO_PEERS_TO_RELAY and actually reaches the relay processor. + assert wait_for_connected(sender_collector) is not None, ( + f"Sender did not reach Connected/PartiallyConnected. " f"Collected events: {sender_collector.events}" + ) + + oversized_payload = base64.b64encode(b"x" * OVERSIZED_PAYLOAD_BYTES).decode() + message = create_message_bindings( + payload=oversized_payload, + contentTopic="/test/1/s13-relay-hard-failure/proto", + ) + + send_result = sender_node.send_message(message=message) + assert send_result.is_ok(), f"send() must return Ok(RequestId), got: {send_result.err()}" + + request_id = send_result.ok_value + assert request_id, "send() returned an empty RequestId" + + error_event = wait_for_error( + collector=sender_collector, + request_id=request_id, + timeout_s=ERROR_TIMEOUT_S, + ) + assert error_event is not None, ( + f"No message_error event within {ERROR_TIMEOUT_S}s from the " f"relay processor. Collected events: {sender_collector.events}" + ) + assert error_event["requestId"] == request_id + assert MESSAGE_SIZE_EXCEEDED_MSG in (error_event.get("error") or ""), ( + f"Expected error to contain {MESSAGE_SIZE_EXCEEDED_MSG!r}.\n" f"Got: {error_event.get('error')!r}\n" f"Full event: {error_event}" + ) + + propagated = wait_for_propagated(sender_collector, request_id, timeout_s=0) + assert propagated is None, f"Unexpected message_propagated event for a failed relay publish: {propagated}" + + assert_event_invariants(sender_collector, request_id) + + class TestS30ConcurrentSendsDuringAutoSubscribe(StepsCommon): """ S30: concurrent sends on the same content topic during initial auto-subscribe. diff --git a/tests/wrappers_tests/test_send_lightpush_and_edge.py b/tests/wrappers_tests/test_send_lightpush_and_edge.py index 917ecd918..529f5035a 100644 --- a/tests/wrappers_tests/test_send_lightpush_and_edge.py +++ b/tests/wrappers_tests/test_send_lightpush_and_edge.py @@ -532,6 +532,94 @@ class TestS15LightpushRetryableErrorRecovery(StepsCommon): assert_event_invariants(sender_collector, request_id) +class TestS16LightpushPeerAppearsLater(StepsCommon): + """ + S16 — No delivery peers at T0, lightpush peer appears later. + The edge sender has the lightpush service in its staticnodes, but the + service is stopped before the sender starts, so there is no reachable + delivery peer at T0. send() is called while the service is down. The + service is restarted during the retry window; the sender connects to it + and a later retry delivers the message. + Expected: send() returns Ok(RequestId), then eventually Propagated. + """ + + @pytest.mark.xfail(reason="binding cannot restart a node or add peers at runtime") + def test_s16_lightpush_peer_appears_later(self): + sender_collector = EventCollector() + + common = { + "store": False, + "filter": False, + "discv5Discovery": False, + "numShardsInNetwork": 1, + } + + # Start the lightpush service once to obtain its multiaddr, then stop + # it so the sender has no reachable peer at T0. The same node object + # is restarted later, so the address stays valid. + service_config = build_node_config(relay=True, lightpush=True, **common) + service_result = WrapperManager.create_and_start(config=service_config) + assert service_result.is_ok(), f"Failed to start lightpush peer: {service_result.err()}" + + with service_result.ok_value as service: + service_multiaddr = get_node_multiaddr(service) + + stop_result = service.stop_node() + assert stop_result.is_ok(), f"Failed to stop lightpush peer: {stop_result.err()}" + delay(SERVICE_DOWN_SETTLE_S) + + # Edge sender is a lightpush client; its only peer is the service, + # which is currently down. + edge_config = build_node_config( + mode="Edge", + lightpush=True, + staticnodes=[service_multiaddr], + **common, + ) + edge_result = WrapperManager.create_and_start( + config=edge_config, + event_cb=sender_collector.event_callback, + ) + assert edge_result.is_ok(), f"Failed to start edge sender: {edge_result.err()}" + + with edge_result.ok_value as edge_sender: + # send() is invoked while the service is down. + msg = create_message_bindings( + payload=to_base64("S16 lightpush peer appears later"), + contentTopic="/test/1/s16-late-lightpush/proto", + ) + send_result = edge_sender.send_message(message=msg) + assert send_result.is_ok(), f"send() failed: {send_result.err()}" + request_id = send_result.ok_value + assert request_id, "send() returned an empty RequestId" + + delay(SERVICE_DOWN_SETTLE_S) + + early_propagated = wait_for_propagated(sender_collector, request_id, timeout_s=0) + assert early_propagated is None, f"message_propagated arrived before the lightpush peer was reachable: {early_propagated}" + + # The lightpush peer comes back during the retry window. + restart_result = service.start_node() + assert restart_result.is_ok(), f"Failed to restart lightpush peer: {restart_result.err()}" + + propagated = wait_for_propagated( + collector=sender_collector, + request_id=request_id, + timeout_s=RECOVERY_TIMEOUT_S, + ) + assert propagated is not None, ( + f"No message_propagated within {RECOVERY_TIMEOUT_S}s " + f"after the lightpush peer joined. " + f"Collected events: {sender_collector.events}" + ) + assert propagated["requestId"] == request_id + + error = wait_for_error(sender_collector, request_id, timeout_s=0) + assert error is None, f"Unexpected message_error after recovery: {error}" + + assert_event_invariants(sender_collector, request_id) + + class TestS26LightpushPeerChurn(StepsCommon): """ S26: multiple lightpush peers, the selected one disappears, From abe86825e832c0a6bd0c5b9efea82677dc4331ad Mon Sep 17 00:00:00 2001 From: Aya Hassan Date: Tue, 19 May 2026 16:41:35 +0200 Subject: [PATCH 3/9] S18/S25 added --- .../test_send_lightpush_and_edge.py | 272 ++++++++++++++++++ 1 file changed, 272 insertions(+) diff --git a/tests/wrappers_tests/test_send_lightpush_and_edge.py b/tests/wrappers_tests/test_send_lightpush_and_edge.py index 529f5035a..d22568df5 100644 --- a/tests/wrappers_tests/test_send_lightpush_and_edge.py +++ b/tests/wrappers_tests/test_send_lightpush_and_edge.py @@ -737,3 +737,275 @@ class TestS26LightpushPeerChurn(StepsCommon): assert error_event is None, f"Unexpected message_error event during peer churn: {error_event}" assert_event_invariants(sender_collector, request_id) + + +class TestS18StagedTopologyReformation(StepsCommon): + """ + S18: relay absent at T0, lightpush absent at T0, both appear in stages. + + The sender has relay and lightpush enabled but starts fully isolated: + no relay peers and no lightpush peers. send() is called before any + peer exists. Peers then appear one stage at a time. The message must + succeed as soon as any valid path becomes available. + + Two orderings are covered, one per test method: + - lightpush appears first, then relay. + - relay appears first, then lightpush. + + Both prove the send service recovers from arbitrary staged topology + reformation. + + Common topology per stage: + [Sender] relay=True, lightpush=True, isolated at T0. + [LightpushPeer] relay=True, lightpush=True, staticnodes=[sender]. + [RelayPeer] relay=True, staticnodes=[sender]. + """ + + # Common config shared by the sender and both staged peers. + _COMMON = { + "relay": True, + "lightpush": True, + "store": False, + "filter": False, + "discv5Discovery": False, + "numShardsInNetwork": 1, + } + + def test_s18_lightpush_first_then_relay(self, node_config): + """Sender isolated at T0; lightpush peer appears, then relay peer.""" + sender_collector = EventCollector() + + node_config.update(self._COMMON) + + sender_result = WrapperManager.create_and_start( + config=node_config, + event_cb=sender_collector.event_callback, + ) + assert sender_result.is_ok(), f"Failed to start sender: {sender_result.err()}" + + with sender_result.ok_value as sender: + # send() before any peer exists must still return Ok(RequestId). + message = create_message_bindings( + payload=to_base64("S18 lightpush-first staged recovery"), + contentTopic="/test/1/s18-lightpush-first/proto", + ) + send_result = sender.send_message(message=message) + assert send_result.is_ok(), f"send() must return Ok(RequestId) while isolated, got: {send_result.err()}" + + request_id = send_result.ok_value + assert request_id, "send() returned an empty RequestId" + + # No peers yet: the message must not propagate. + delay(SERVICE_DOWN_SETTLE_S) + early_propagated = wait_for_propagated(sender_collector, request_id, timeout_s=0) + assert early_propagated is None, f"message_propagated arrived before any peer joined: {early_propagated}" + + # Stage 1: lightpush peer appears. + lightpush_config = { + **node_config, + "staticnodes": [get_node_multiaddr(sender)], + "portsShift": 1, + } + lightpush_result = WrapperManager.create_and_start(config=lightpush_config) + assert lightpush_result.is_ok(), f"Failed to start lightpush peer: {lightpush_result.err()}" + + with lightpush_result.ok_value: + # Stage 2: relay peer appears. + relay_config = { + **node_config, + "lightpush": False, + "staticnodes": [get_node_multiaddr(sender)], + "portsShift": 2, + } + relay_result = WrapperManager.create_and_start(config=relay_config) + assert relay_result.is_ok(), f"Failed to start relay peer: {relay_result.err()}" + + with relay_result.ok_value: + # The message must succeed once any valid path is available. + propagated = wait_for_propagated( + collector=sender_collector, + request_id=request_id, + timeout_s=RECOVERY_TIMEOUT_S, + ) + assert propagated is not None, ( + f"No message_propagated within {RECOVERY_TIMEOUT_S}s " + f"after peers appeared in stages. " + f"Collected events: {sender_collector.events}" + ) + assert propagated["requestId"] == request_id + + error = wait_for_error(sender_collector, request_id, timeout_s=0) + assert error is None, f"Unexpected message_error after staged recovery: {error}" + + assert_event_invariants(sender_collector, request_id) + + def test_s18_relay_first_then_lightpush(self, node_config): + """Sender isolated at T0; relay peer appears, then lightpush peer.""" + sender_collector = EventCollector() + + node_config.update(self._COMMON) + + sender_result = WrapperManager.create_and_start( + config=node_config, + event_cb=sender_collector.event_callback, + ) + assert sender_result.is_ok(), f"Failed to start sender: {sender_result.err()}" + + with sender_result.ok_value as sender: + # send() before any peer exists must still return Ok(RequestId). + message = create_message_bindings( + payload=to_base64("S18 relay-first staged recovery"), + contentTopic="/test/1/s18-relay-first/proto", + ) + send_result = sender.send_message(message=message) + assert send_result.is_ok(), f"send() must return Ok(RequestId) while isolated, got: {send_result.err()}" + + request_id = send_result.ok_value + assert request_id, "send() returned an empty RequestId" + + # No peers yet: the message must not propagate. + delay(SERVICE_DOWN_SETTLE_S) + early_propagated = wait_for_propagated(sender_collector, request_id, timeout_s=0) + assert early_propagated is None, f"message_propagated arrived before any peer joined: {early_propagated}" + + # Stage 1: relay peer appears. + relay_config = { + **node_config, + "lightpush": False, + "staticnodes": [get_node_multiaddr(sender)], + "portsShift": 1, + } + relay_result = WrapperManager.create_and_start(config=relay_config) + assert relay_result.is_ok(), f"Failed to start relay peer: {relay_result.err()}" + + with relay_result.ok_value: + # Stage 2: lightpush peer appears. + lightpush_config = { + **node_config, + "staticnodes": [get_node_multiaddr(sender)], + "portsShift": 2, + } + lightpush_result = WrapperManager.create_and_start(config=lightpush_config) + assert lightpush_result.is_ok(), f"Failed to start lightpush peer: {lightpush_result.err()}" + + with lightpush_result.ok_value: + # The message must succeed once any valid path is available. + propagated = wait_for_propagated( + collector=sender_collector, + request_id=request_id, + timeout_s=RECOVERY_TIMEOUT_S, + ) + assert propagated is not None, ( + f"No message_propagated within {RECOVERY_TIMEOUT_S}s " + f"after peers appeared in stages. " + f"Collected events: {sender_collector.events}" + ) + assert propagated["requestId"] == request_id + + error = wait_for_error(sender_collector, request_id, timeout_s=0) + assert error is None, f"Unexpected message_error after staged recovery: {error}" + + assert_event_invariants(sender_collector, request_id) + + +class TestS25EphemeralLightpushWithStore(StepsCommon): + """ + S25 — Ephemeral message over lightpush with a reachable store peer. + + The sender is an Edge node, so it has no local relay and publishes only + via lightpush. A docker store peer is reachable and joined to the + lightpush peer's relay mesh, so the message does reach a store. + + Because ephemeral messages are never store-validated, the expected result + is Propagated only — no Sent — even though the store peer is reachable. + This is the lightpush-transport counterpart of S24 and proves the + ephemeral rule is transport-independent. + + Topology mirrors S11: + [LightpushPeer] wrapper, relay=True, lightpush=True, store=False + [StorePeer] docker WakuNode, relay=true, store=true, joined to + the lightpush peer's shard-0 relay mesh + [Edge] wrapper, mode="Edge", reliabilityEnabled=True, + staticnodes=[lightpush_peer, store_peer] + """ + + def test_s25_ephemeral_lightpush_with_store(self): + sender_collector = EventCollector() + + common = { + "numShardsInNetwork": 1, + } + + lightpush_config = build_node_config( + relay=True, + lightpush=True, + **common, + ) + + lightpush_result = WrapperManager.create_and_start(config=lightpush_config) + assert lightpush_result.is_ok(), f"Failed to start lightpush peer: {lightpush_result.err()}" + + with lightpush_result.ok_value as lightpush_peer: + lightpush_multiaddr = get_node_multiaddr(lightpush_peer) + + # Docker store peer joined to the lightpush peer's shard-0 relay + # mesh, so messages propagated by the lightpush peer are archived. + store_peer = WakuNode(NODE_1, f"s25_store_{self.test_id}") + store_peer.start(relay="true", store="true") + self.add_node_peer(store_peer, [lightpush_multiaddr]) + store_peer.set_relay_subscriptions([STORE_PEER_PUBSUB_TOPIC]) + store_multiaddr = store_peer.get_multiaddr_with_id() + + edge_config = build_node_config( + mode="Edge", + staticnodes=[lightpush_multiaddr, store_multiaddr], + **common, + ) + + edge_result = WrapperManager.create_and_start( + config=edge_config, + event_cb=sender_collector.event_callback, + ) + assert edge_result.is_ok(), f"Failed to start edge sender: {edge_result.err()}" + + with edge_result.ok_value as edge_sender: + message = create_message_bindings( + payload=to_base64("S25 ephemeral lightpush + store payload"), + contentTopic="/test/1/s25-ephemeral-lightpush/proto", + ephemeral=True, + ) + + send_result = edge_sender.send_message(message=message) + assert send_result.is_ok(), f"send() failed: {send_result.err()}" + + request_id = send_result.ok_value + assert request_id, "send() returned an empty RequestId" + + propagated = wait_for_propagated( + collector=sender_collector, + request_id=request_id, + timeout_s=PROPAGATED_TIMEOUT_S, + ) + assert propagated is not None, ( + f"No message_propagated event within {PROPAGATED_TIMEOUT_S}s. " f"Collected events: {sender_collector.events}" + ) + assert propagated["requestId"] == request_id + + # Ephemeral messages are never store-validated, so no Sent + # event must arrive even though the store peer is reachable. + sent = wait_for_sent( + collector=sender_collector, + request_id=request_id, + timeout_s=NO_SENT_OBSERVATION_S, + ) + assert sent is None, ( + f"Unexpected message_sent event for an ephemeral message. " + f"Ephemeral messages must never be store-validated.\n" + f"Sent event: {sent}\n" + f"Collected events: {sender_collector.events}" + ) + + error = wait_for_error(sender_collector, request_id, timeout_s=0) + assert error is None, f"Unexpected message_error event: {error}" + + assert_event_invariants(sender_collector, request_id) From 27fd9df46a97f141d3a662b6b128aa0fa593ced7 Mon Sep 17 00:00:00 2001 From: Aya Hassan Date: Tue, 19 May 2026 16:49:42 +0200 Subject: [PATCH 4/9] Revert "S18/S25 added" This reverts commit abe86825e832c0a6bd0c5b9efea82677dc4331ad. --- .../test_send_lightpush_and_edge.py | 272 ------------------ 1 file changed, 272 deletions(-) diff --git a/tests/wrappers_tests/test_send_lightpush_and_edge.py b/tests/wrappers_tests/test_send_lightpush_and_edge.py index d22568df5..529f5035a 100644 --- a/tests/wrappers_tests/test_send_lightpush_and_edge.py +++ b/tests/wrappers_tests/test_send_lightpush_and_edge.py @@ -737,275 +737,3 @@ class TestS26LightpushPeerChurn(StepsCommon): assert error_event is None, f"Unexpected message_error event during peer churn: {error_event}" assert_event_invariants(sender_collector, request_id) - - -class TestS18StagedTopologyReformation(StepsCommon): - """ - S18: relay absent at T0, lightpush absent at T0, both appear in stages. - - The sender has relay and lightpush enabled but starts fully isolated: - no relay peers and no lightpush peers. send() is called before any - peer exists. Peers then appear one stage at a time. The message must - succeed as soon as any valid path becomes available. - - Two orderings are covered, one per test method: - - lightpush appears first, then relay. - - relay appears first, then lightpush. - - Both prove the send service recovers from arbitrary staged topology - reformation. - - Common topology per stage: - [Sender] relay=True, lightpush=True, isolated at T0. - [LightpushPeer] relay=True, lightpush=True, staticnodes=[sender]. - [RelayPeer] relay=True, staticnodes=[sender]. - """ - - # Common config shared by the sender and both staged peers. - _COMMON = { - "relay": True, - "lightpush": True, - "store": False, - "filter": False, - "discv5Discovery": False, - "numShardsInNetwork": 1, - } - - def test_s18_lightpush_first_then_relay(self, node_config): - """Sender isolated at T0; lightpush peer appears, then relay peer.""" - sender_collector = EventCollector() - - node_config.update(self._COMMON) - - sender_result = WrapperManager.create_and_start( - config=node_config, - event_cb=sender_collector.event_callback, - ) - assert sender_result.is_ok(), f"Failed to start sender: {sender_result.err()}" - - with sender_result.ok_value as sender: - # send() before any peer exists must still return Ok(RequestId). - message = create_message_bindings( - payload=to_base64("S18 lightpush-first staged recovery"), - contentTopic="/test/1/s18-lightpush-first/proto", - ) - send_result = sender.send_message(message=message) - assert send_result.is_ok(), f"send() must return Ok(RequestId) while isolated, got: {send_result.err()}" - - request_id = send_result.ok_value - assert request_id, "send() returned an empty RequestId" - - # No peers yet: the message must not propagate. - delay(SERVICE_DOWN_SETTLE_S) - early_propagated = wait_for_propagated(sender_collector, request_id, timeout_s=0) - assert early_propagated is None, f"message_propagated arrived before any peer joined: {early_propagated}" - - # Stage 1: lightpush peer appears. - lightpush_config = { - **node_config, - "staticnodes": [get_node_multiaddr(sender)], - "portsShift": 1, - } - lightpush_result = WrapperManager.create_and_start(config=lightpush_config) - assert lightpush_result.is_ok(), f"Failed to start lightpush peer: {lightpush_result.err()}" - - with lightpush_result.ok_value: - # Stage 2: relay peer appears. - relay_config = { - **node_config, - "lightpush": False, - "staticnodes": [get_node_multiaddr(sender)], - "portsShift": 2, - } - relay_result = WrapperManager.create_and_start(config=relay_config) - assert relay_result.is_ok(), f"Failed to start relay peer: {relay_result.err()}" - - with relay_result.ok_value: - # The message must succeed once any valid path is available. - propagated = wait_for_propagated( - collector=sender_collector, - request_id=request_id, - timeout_s=RECOVERY_TIMEOUT_S, - ) - assert propagated is not None, ( - f"No message_propagated within {RECOVERY_TIMEOUT_S}s " - f"after peers appeared in stages. " - f"Collected events: {sender_collector.events}" - ) - assert propagated["requestId"] == request_id - - error = wait_for_error(sender_collector, request_id, timeout_s=0) - assert error is None, f"Unexpected message_error after staged recovery: {error}" - - assert_event_invariants(sender_collector, request_id) - - def test_s18_relay_first_then_lightpush(self, node_config): - """Sender isolated at T0; relay peer appears, then lightpush peer.""" - sender_collector = EventCollector() - - node_config.update(self._COMMON) - - sender_result = WrapperManager.create_and_start( - config=node_config, - event_cb=sender_collector.event_callback, - ) - assert sender_result.is_ok(), f"Failed to start sender: {sender_result.err()}" - - with sender_result.ok_value as sender: - # send() before any peer exists must still return Ok(RequestId). - message = create_message_bindings( - payload=to_base64("S18 relay-first staged recovery"), - contentTopic="/test/1/s18-relay-first/proto", - ) - send_result = sender.send_message(message=message) - assert send_result.is_ok(), f"send() must return Ok(RequestId) while isolated, got: {send_result.err()}" - - request_id = send_result.ok_value - assert request_id, "send() returned an empty RequestId" - - # No peers yet: the message must not propagate. - delay(SERVICE_DOWN_SETTLE_S) - early_propagated = wait_for_propagated(sender_collector, request_id, timeout_s=0) - assert early_propagated is None, f"message_propagated arrived before any peer joined: {early_propagated}" - - # Stage 1: relay peer appears. - relay_config = { - **node_config, - "lightpush": False, - "staticnodes": [get_node_multiaddr(sender)], - "portsShift": 1, - } - relay_result = WrapperManager.create_and_start(config=relay_config) - assert relay_result.is_ok(), f"Failed to start relay peer: {relay_result.err()}" - - with relay_result.ok_value: - # Stage 2: lightpush peer appears. - lightpush_config = { - **node_config, - "staticnodes": [get_node_multiaddr(sender)], - "portsShift": 2, - } - lightpush_result = WrapperManager.create_and_start(config=lightpush_config) - assert lightpush_result.is_ok(), f"Failed to start lightpush peer: {lightpush_result.err()}" - - with lightpush_result.ok_value: - # The message must succeed once any valid path is available. - propagated = wait_for_propagated( - collector=sender_collector, - request_id=request_id, - timeout_s=RECOVERY_TIMEOUT_S, - ) - assert propagated is not None, ( - f"No message_propagated within {RECOVERY_TIMEOUT_S}s " - f"after peers appeared in stages. " - f"Collected events: {sender_collector.events}" - ) - assert propagated["requestId"] == request_id - - error = wait_for_error(sender_collector, request_id, timeout_s=0) - assert error is None, f"Unexpected message_error after staged recovery: {error}" - - assert_event_invariants(sender_collector, request_id) - - -class TestS25EphemeralLightpushWithStore(StepsCommon): - """ - S25 — Ephemeral message over lightpush with a reachable store peer. - - The sender is an Edge node, so it has no local relay and publishes only - via lightpush. A docker store peer is reachable and joined to the - lightpush peer's relay mesh, so the message does reach a store. - - Because ephemeral messages are never store-validated, the expected result - is Propagated only — no Sent — even though the store peer is reachable. - This is the lightpush-transport counterpart of S24 and proves the - ephemeral rule is transport-independent. - - Topology mirrors S11: - [LightpushPeer] wrapper, relay=True, lightpush=True, store=False - [StorePeer] docker WakuNode, relay=true, store=true, joined to - the lightpush peer's shard-0 relay mesh - [Edge] wrapper, mode="Edge", reliabilityEnabled=True, - staticnodes=[lightpush_peer, store_peer] - """ - - def test_s25_ephemeral_lightpush_with_store(self): - sender_collector = EventCollector() - - common = { - "numShardsInNetwork": 1, - } - - lightpush_config = build_node_config( - relay=True, - lightpush=True, - **common, - ) - - lightpush_result = WrapperManager.create_and_start(config=lightpush_config) - assert lightpush_result.is_ok(), f"Failed to start lightpush peer: {lightpush_result.err()}" - - with lightpush_result.ok_value as lightpush_peer: - lightpush_multiaddr = get_node_multiaddr(lightpush_peer) - - # Docker store peer joined to the lightpush peer's shard-0 relay - # mesh, so messages propagated by the lightpush peer are archived. - store_peer = WakuNode(NODE_1, f"s25_store_{self.test_id}") - store_peer.start(relay="true", store="true") - self.add_node_peer(store_peer, [lightpush_multiaddr]) - store_peer.set_relay_subscriptions([STORE_PEER_PUBSUB_TOPIC]) - store_multiaddr = store_peer.get_multiaddr_with_id() - - edge_config = build_node_config( - mode="Edge", - staticnodes=[lightpush_multiaddr, store_multiaddr], - **common, - ) - - edge_result = WrapperManager.create_and_start( - config=edge_config, - event_cb=sender_collector.event_callback, - ) - assert edge_result.is_ok(), f"Failed to start edge sender: {edge_result.err()}" - - with edge_result.ok_value as edge_sender: - message = create_message_bindings( - payload=to_base64("S25 ephemeral lightpush + store payload"), - contentTopic="/test/1/s25-ephemeral-lightpush/proto", - ephemeral=True, - ) - - send_result = edge_sender.send_message(message=message) - assert send_result.is_ok(), f"send() failed: {send_result.err()}" - - request_id = send_result.ok_value - assert request_id, "send() returned an empty RequestId" - - propagated = wait_for_propagated( - collector=sender_collector, - request_id=request_id, - timeout_s=PROPAGATED_TIMEOUT_S, - ) - assert propagated is not None, ( - f"No message_propagated event within {PROPAGATED_TIMEOUT_S}s. " f"Collected events: {sender_collector.events}" - ) - assert propagated["requestId"] == request_id - - # Ephemeral messages are never store-validated, so no Sent - # event must arrive even though the store peer is reachable. - sent = wait_for_sent( - collector=sender_collector, - request_id=request_id, - timeout_s=NO_SENT_OBSERVATION_S, - ) - assert sent is None, ( - f"Unexpected message_sent event for an ephemeral message. " - f"Ephemeral messages must never be store-validated.\n" - f"Sent event: {sent}\n" - f"Collected events: {sender_collector.events}" - ) - - error = wait_for_error(sender_collector, request_id, timeout_s=0) - assert error is None, f"Unexpected message_error event: {error}" - - assert_event_invariants(sender_collector, request_id) From d2afce79f4af4ee308c918b93361d2e1589ee668 Mon Sep 17 00:00:00 2001 From: Roman Zajic Date: Wed, 20 May 2026 13:28:57 +0800 Subject: [PATCH 5/9] fix: Enable RLN registration for fleet tests (#185) * fix: enable RLN registration with v0.38.1 * fix: use NODE_1 instead of DEFAULT_NWAKU for registration --- .github/workflows/fleet_tests.yml | 12 ++++++------ src/steps/rln.py | 4 ++-- tests/conftest.py | 4 ++-- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/.github/workflows/fleet_tests.yml b/.github/workflows/fleet_tests.yml index fda82f69e..13a6552f2 100644 --- a/.github/workflows/fleet_tests.yml +++ b/.github/workflows/fleet_tests.yml @@ -13,25 +13,25 @@ on: required: true description: "Node that usually publishes messages. Used for all tests" type: string - default: "wakuorg/nwaku:latest" + default: "wakuorg/nwaku:v0.38.1" node2: required: true description: "Node that usually queries for published messages. Used for all tests" type: string - default: "wakuorg/nwaku:latest" + default: "wakuorg/nwaku:v0.38.1" additional_nodes: required: false description: "Additional optional nodes used in e2e tests, separated by ," type: string - default: "wakuorg/nwaku:latest,wakuorg/nwaku:latest,wakuorg/nwaku:latest" + default: "wakuorg/nwaku:v0.38.1,wakuorg/nwaku:v0.38.1,wakuorg/nwaku:v0.38.1" jobs: test-common: uses: ./.github/workflows/test_common.yml secrets: inherit with: - node1: ${{ inputs.node1 || 'wakuorg/nwaku:latest' }} - node2: ${{ inputs.node2 || 'wakuorg/nwaku:latest' }} - additional_nodes: ${{ inputs.additional_nodes || 'wakuorg/nwaku:latest,wakuorg/nwaku:latest,wakuorg/nwaku:latest' }} + node1: ${{ inputs.node1 || 'wakuorg/nwaku:v0.38.1' }} + node2: ${{ inputs.node2 || 'wakuorg/nwaku:v0.38.1' }} + additional_nodes: ${{ inputs.additional_nodes || 'wakuorg/nwaku:v0.38.1,wakuorg/nwaku:v0.38.1,wakuorg/nwaku:v0.38.1' }} fleet_tests: true caller: "fleet" \ No newline at end of file diff --git a/src/steps/rln.py b/src/steps/rln.py index 1bf051df2..3fb3b09c7 100644 --- a/src/steps/rln.py +++ b/src/steps/rln.py @@ -8,7 +8,7 @@ import allure from src.steps.common import StepsCommon from src.test_data import PUBSUB_TOPICS_RLN -from src.env_vars import DEFAULT_NWAKU, RLN_CREDENTIALS, NODE_1, NODE_2, ADDITIONAL_NODES +from src.env_vars import RLN_CREDENTIALS, NODE_1, NODE_2, ADDITIONAL_NODES from src.libs.common import gen_step_id, delay from src.libs.custom_logger import get_custom_logger from src.node.waku_node import WakuNode, rln_credential_store_ready @@ -135,7 +135,7 @@ class StepsRLN(StepsCommon): @allure.step def register_rln_single_node(self, prefix="", **kwargs): logger.debug("Registering RLN credentials for single node") - self.node = WakuNode(DEFAULT_NWAKU, f"node_{gen_step_id()}") + self.node = WakuNode(NODE_1, f"node_{gen_step_id()}") return self.node.register_rln(rln_keystore_prefix=prefix, rln_creds_source=kwargs["rln_creds_source"], rln_creds_id=kwargs["rln_creds_id"]) @allure.step diff --git a/tests/conftest.py b/tests/conftest.py index 28f87dd73..b68f1e3d8 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -50,7 +50,7 @@ def fleet_rln_state(request): return from src.node.waku_node import WakuNode - from src.env_vars import RLN_CREDENTIALS, DEFAULT_NWAKU + from src.env_vars import RLN_CREDENTIALS, NODE_1 if not RLN_CREDENTIALS: logger.info("Fleet RLN: RLN_CREDENTIALS not set – nodes will start without RLN") @@ -61,7 +61,7 @@ def fleet_rln_state(request): try: for i in range(2): prefix = "".join(random.choices(string.ascii_lowercase, k=4)) - node = WakuNode(DEFAULT_NWAKU, f"rln_reg_{i + 1}_{gen_step_id()}") + node = WakuNode(NODE_1, f"rln_reg_{i + 1}_{gen_step_id()}") membership_index = node.register_rln( rln_keystore_prefix=prefix, rln_creds_source=RLN_CREDENTIALS, From 5f2bbc08372a05e5452b44fc19999656b5b70f40 Mon Sep 17 00:00:00 2001 From: AYAHASSAN287 <49167455+AYAHASSAN287@users.noreply.github.com> Date: Thu, 21 May 2026 14:28:21 +0300 Subject: [PATCH 6/9] S18/S25/S29 added (#184) * Revert "Revert "S18/S25 added"" This reverts commit 27fd9df46a97f141d3a662b6b128aa0fa593ced7. * Adding S29 --- .../test_send_lightpush_and_edge.py | 272 ++++++++++++++++++ .../test_send_relay_propagation.py | 102 +++++++ 2 files changed, 374 insertions(+) diff --git a/tests/wrappers_tests/test_send_lightpush_and_edge.py b/tests/wrappers_tests/test_send_lightpush_and_edge.py index 529f5035a..d22568df5 100644 --- a/tests/wrappers_tests/test_send_lightpush_and_edge.py +++ b/tests/wrappers_tests/test_send_lightpush_and_edge.py @@ -737,3 +737,275 @@ class TestS26LightpushPeerChurn(StepsCommon): assert error_event is None, f"Unexpected message_error event during peer churn: {error_event}" assert_event_invariants(sender_collector, request_id) + + +class TestS18StagedTopologyReformation(StepsCommon): + """ + S18: relay absent at T0, lightpush absent at T0, both appear in stages. + + The sender has relay and lightpush enabled but starts fully isolated: + no relay peers and no lightpush peers. send() is called before any + peer exists. Peers then appear one stage at a time. The message must + succeed as soon as any valid path becomes available. + + Two orderings are covered, one per test method: + - lightpush appears first, then relay. + - relay appears first, then lightpush. + + Both prove the send service recovers from arbitrary staged topology + reformation. + + Common topology per stage: + [Sender] relay=True, lightpush=True, isolated at T0. + [LightpushPeer] relay=True, lightpush=True, staticnodes=[sender]. + [RelayPeer] relay=True, staticnodes=[sender]. + """ + + # Common config shared by the sender and both staged peers. + _COMMON = { + "relay": True, + "lightpush": True, + "store": False, + "filter": False, + "discv5Discovery": False, + "numShardsInNetwork": 1, + } + + def test_s18_lightpush_first_then_relay(self, node_config): + """Sender isolated at T0; lightpush peer appears, then relay peer.""" + sender_collector = EventCollector() + + node_config.update(self._COMMON) + + sender_result = WrapperManager.create_and_start( + config=node_config, + event_cb=sender_collector.event_callback, + ) + assert sender_result.is_ok(), f"Failed to start sender: {sender_result.err()}" + + with sender_result.ok_value as sender: + # send() before any peer exists must still return Ok(RequestId). + message = create_message_bindings( + payload=to_base64("S18 lightpush-first staged recovery"), + contentTopic="/test/1/s18-lightpush-first/proto", + ) + send_result = sender.send_message(message=message) + assert send_result.is_ok(), f"send() must return Ok(RequestId) while isolated, got: {send_result.err()}" + + request_id = send_result.ok_value + assert request_id, "send() returned an empty RequestId" + + # No peers yet: the message must not propagate. + delay(SERVICE_DOWN_SETTLE_S) + early_propagated = wait_for_propagated(sender_collector, request_id, timeout_s=0) + assert early_propagated is None, f"message_propagated arrived before any peer joined: {early_propagated}" + + # Stage 1: lightpush peer appears. + lightpush_config = { + **node_config, + "staticnodes": [get_node_multiaddr(sender)], + "portsShift": 1, + } + lightpush_result = WrapperManager.create_and_start(config=lightpush_config) + assert lightpush_result.is_ok(), f"Failed to start lightpush peer: {lightpush_result.err()}" + + with lightpush_result.ok_value: + # Stage 2: relay peer appears. + relay_config = { + **node_config, + "lightpush": False, + "staticnodes": [get_node_multiaddr(sender)], + "portsShift": 2, + } + relay_result = WrapperManager.create_and_start(config=relay_config) + assert relay_result.is_ok(), f"Failed to start relay peer: {relay_result.err()}" + + with relay_result.ok_value: + # The message must succeed once any valid path is available. + propagated = wait_for_propagated( + collector=sender_collector, + request_id=request_id, + timeout_s=RECOVERY_TIMEOUT_S, + ) + assert propagated is not None, ( + f"No message_propagated within {RECOVERY_TIMEOUT_S}s " + f"after peers appeared in stages. " + f"Collected events: {sender_collector.events}" + ) + assert propagated["requestId"] == request_id + + error = wait_for_error(sender_collector, request_id, timeout_s=0) + assert error is None, f"Unexpected message_error after staged recovery: {error}" + + assert_event_invariants(sender_collector, request_id) + + def test_s18_relay_first_then_lightpush(self, node_config): + """Sender isolated at T0; relay peer appears, then lightpush peer.""" + sender_collector = EventCollector() + + node_config.update(self._COMMON) + + sender_result = WrapperManager.create_and_start( + config=node_config, + event_cb=sender_collector.event_callback, + ) + assert sender_result.is_ok(), f"Failed to start sender: {sender_result.err()}" + + with sender_result.ok_value as sender: + # send() before any peer exists must still return Ok(RequestId). + message = create_message_bindings( + payload=to_base64("S18 relay-first staged recovery"), + contentTopic="/test/1/s18-relay-first/proto", + ) + send_result = sender.send_message(message=message) + assert send_result.is_ok(), f"send() must return Ok(RequestId) while isolated, got: {send_result.err()}" + + request_id = send_result.ok_value + assert request_id, "send() returned an empty RequestId" + + # No peers yet: the message must not propagate. + delay(SERVICE_DOWN_SETTLE_S) + early_propagated = wait_for_propagated(sender_collector, request_id, timeout_s=0) + assert early_propagated is None, f"message_propagated arrived before any peer joined: {early_propagated}" + + # Stage 1: relay peer appears. + relay_config = { + **node_config, + "lightpush": False, + "staticnodes": [get_node_multiaddr(sender)], + "portsShift": 1, + } + relay_result = WrapperManager.create_and_start(config=relay_config) + assert relay_result.is_ok(), f"Failed to start relay peer: {relay_result.err()}" + + with relay_result.ok_value: + # Stage 2: lightpush peer appears. + lightpush_config = { + **node_config, + "staticnodes": [get_node_multiaddr(sender)], + "portsShift": 2, + } + lightpush_result = WrapperManager.create_and_start(config=lightpush_config) + assert lightpush_result.is_ok(), f"Failed to start lightpush peer: {lightpush_result.err()}" + + with lightpush_result.ok_value: + # The message must succeed once any valid path is available. + propagated = wait_for_propagated( + collector=sender_collector, + request_id=request_id, + timeout_s=RECOVERY_TIMEOUT_S, + ) + assert propagated is not None, ( + f"No message_propagated within {RECOVERY_TIMEOUT_S}s " + f"after peers appeared in stages. " + f"Collected events: {sender_collector.events}" + ) + assert propagated["requestId"] == request_id + + error = wait_for_error(sender_collector, request_id, timeout_s=0) + assert error is None, f"Unexpected message_error after staged recovery: {error}" + + assert_event_invariants(sender_collector, request_id) + + +class TestS25EphemeralLightpushWithStore(StepsCommon): + """ + S25 — Ephemeral message over lightpush with a reachable store peer. + + The sender is an Edge node, so it has no local relay and publishes only + via lightpush. A docker store peer is reachable and joined to the + lightpush peer's relay mesh, so the message does reach a store. + + Because ephemeral messages are never store-validated, the expected result + is Propagated only — no Sent — even though the store peer is reachable. + This is the lightpush-transport counterpart of S24 and proves the + ephemeral rule is transport-independent. + + Topology mirrors S11: + [LightpushPeer] wrapper, relay=True, lightpush=True, store=False + [StorePeer] docker WakuNode, relay=true, store=true, joined to + the lightpush peer's shard-0 relay mesh + [Edge] wrapper, mode="Edge", reliabilityEnabled=True, + staticnodes=[lightpush_peer, store_peer] + """ + + def test_s25_ephemeral_lightpush_with_store(self): + sender_collector = EventCollector() + + common = { + "numShardsInNetwork": 1, + } + + lightpush_config = build_node_config( + relay=True, + lightpush=True, + **common, + ) + + lightpush_result = WrapperManager.create_and_start(config=lightpush_config) + assert lightpush_result.is_ok(), f"Failed to start lightpush peer: {lightpush_result.err()}" + + with lightpush_result.ok_value as lightpush_peer: + lightpush_multiaddr = get_node_multiaddr(lightpush_peer) + + # Docker store peer joined to the lightpush peer's shard-0 relay + # mesh, so messages propagated by the lightpush peer are archived. + store_peer = WakuNode(NODE_1, f"s25_store_{self.test_id}") + store_peer.start(relay="true", store="true") + self.add_node_peer(store_peer, [lightpush_multiaddr]) + store_peer.set_relay_subscriptions([STORE_PEER_PUBSUB_TOPIC]) + store_multiaddr = store_peer.get_multiaddr_with_id() + + edge_config = build_node_config( + mode="Edge", + staticnodes=[lightpush_multiaddr, store_multiaddr], + **common, + ) + + edge_result = WrapperManager.create_and_start( + config=edge_config, + event_cb=sender_collector.event_callback, + ) + assert edge_result.is_ok(), f"Failed to start edge sender: {edge_result.err()}" + + with edge_result.ok_value as edge_sender: + message = create_message_bindings( + payload=to_base64("S25 ephemeral lightpush + store payload"), + contentTopic="/test/1/s25-ephemeral-lightpush/proto", + ephemeral=True, + ) + + send_result = edge_sender.send_message(message=message) + assert send_result.is_ok(), f"send() failed: {send_result.err()}" + + request_id = send_result.ok_value + assert request_id, "send() returned an empty RequestId" + + propagated = wait_for_propagated( + collector=sender_collector, + request_id=request_id, + timeout_s=PROPAGATED_TIMEOUT_S, + ) + assert propagated is not None, ( + f"No message_propagated event within {PROPAGATED_TIMEOUT_S}s. " f"Collected events: {sender_collector.events}" + ) + assert propagated["requestId"] == request_id + + # Ephemeral messages are never store-validated, so no Sent + # event must arrive even though the store peer is reachable. + sent = wait_for_sent( + collector=sender_collector, + request_id=request_id, + timeout_s=NO_SENT_OBSERVATION_S, + ) + assert sent is None, ( + f"Unexpected message_sent event for an ephemeral message. " + f"Ephemeral messages must never be store-validated.\n" + f"Sent event: {sent}\n" + f"Collected events: {sender_collector.events}" + ) + + error = wait_for_error(sender_collector, request_id, timeout_s=0) + assert error is None, f"Unexpected message_error event: {error}" + + assert_event_invariants(sender_collector, request_id) diff --git a/tests/wrappers_tests/test_send_relay_propagation.py b/tests/wrappers_tests/test_send_relay_propagation.py index a1cfce07c..f758ef2ba 100644 --- a/tests/wrappers_tests/test_send_relay_propagation.py +++ b/tests/wrappers_tests/test_send_relay_propagation.py @@ -16,6 +16,7 @@ from src.node.wrapper_helpers import ( wait_for_sent, wait_for_error, ) +from src.test_data import CONTENT_TOPICS_DIFFERENT_SHARDS from tests.wrappers_tests.conftest import free_port logger = get_custom_logger(__name__) @@ -706,3 +707,104 @@ class TestS24EphemeralMessageWithReachableStore(StepsCommon): ) assert_event_invariants(sender_collector, request_id) + + +class TestS29SendOnTopicsMappingToDifferentShards(StepsCommon): + """ + S29 — Send on two different content topics that map to different shards. + Sender has a relay peer reachable on shard X and shard Y; topic A maps to + shard X and topic B maps to shard Y. Two independent sends, one per topic. + Expected: both sends return Ok(RequestId), and each request gets its own + message_propagated event following the availability of its own shard. + Purpose: ensures shard derivation and delivery behavior are topic-specific. + """ + + # Topic A -> shard 0, Topic B -> shard 1 (per CONTENT_TOPICS_DIFFERENT_SHARDS). + TOPIC_A = CONTENT_TOPICS_DIFFERENT_SHARDS[0] + TOPIC_B = CONTENT_TOPICS_DIFFERENT_SHARDS[1] + + def test_s29_send_on_topics_mapping_to_different_shards(self, node_config): + sender_collector = EventCollector() + + # numShardsInNetwork=8 so the two topics resolve to distinct shards + # (shard 0 and shard 1) instead of being collapsed onto shard 0. + node_config.update( + { + "relay": True, + "store": False, + "lightpush": False, + "filter": False, + "discv5Discovery": False, + "numShardsInNetwork": 8, + "reliabilityEnabled": True, + } + ) + + sender_result = WrapperManager.create_and_start( + config=node_config, + event_cb=sender_collector.event_callback, + ) + assert sender_result.is_ok(), f"Failed to start sender: {sender_result.err()}" + + with sender_result.ok_value as sender: + peer_config = { + **node_config, + "staticnodes": [get_node_multiaddr(sender)], + "portsShift": 1, + } + + peer_result = WrapperManager.create_and_start(config=peer_config) + assert peer_result.is_ok(), f"Failed to start relay peer: {peer_result.err()}" + + with peer_result.ok_value: + assert wait_for_connected(sender_collector) is not None, "Sender did not reach Connected/PartiallyConnected state" + + message_a = create_message_bindings( + payload=to_base64("S29 shard X payload"), + contentTopic=self.TOPIC_A, + ) + send_a = sender.send_message(message=message_a) + assert send_a.is_ok(), f"send() on TOPIC_A failed: {send_a.err()}" + request_id_a = send_a.ok_value + assert request_id_a, "send() on TOPIC_A returned an empty RequestId" + + # Send on topic B (shard Y). + message_b = create_message_bindings( + payload=to_base64("S29 shard Y payload"), + contentTopic=self.TOPIC_B, + ) + send_b = sender.send_message(message=message_b) + assert send_b.is_ok(), f"send() on TOPIC_B failed: {send_b.err()}" + request_id_b = send_b.ok_value + assert request_id_b, "send() on TOPIC_B returned an empty RequestId" + + assert request_id_a != request_id_b, "Each send must produce a distinct RequestId" + + # Each request propagates over its own shard's mesh independently. + propagated_a = wait_for_propagated( + collector=sender_collector, + request_id=request_id_a, + timeout_s=PROPAGATED_TIMEOUT_S, + ) + assert propagated_a is not None, ( + f"No message_propagated event for TOPIC_A within {PROPAGATED_TIMEOUT_S}s. " f"Collected events: {sender_collector.events}" + ) + assert propagated_a["requestId"] == request_id_a + + propagated_b = wait_for_propagated( + collector=sender_collector, + request_id=request_id_b, + timeout_s=PROPAGATED_TIMEOUT_S, + ) + assert propagated_b is not None, ( + f"No message_propagated event for TOPIC_B within {PROPAGATED_TIMEOUT_S}s. " f"Collected events: {sender_collector.events}" + ) + assert propagated_b["requestId"] == request_id_b + + # No cross-talk: neither request should produce an error. + for request_id in (request_id_a, request_id_b): + error = wait_for_error(sender_collector, request_id, timeout_s=0) + assert error is None, f"Unexpected message_error event for {request_id}: {error}" + + assert_event_invariants(sender_collector, request_id_a) + assert_event_invariants(sender_collector, request_id_b) From bb1ff0f2bcf17d6e346e95b9783e312b9484eae5 Mon Sep 17 00:00:00 2001 From: Roman Zajic Date: Tue, 26 May 2026 15:21:19 +0800 Subject: [PATCH 7/9] chore: change fleet tests to run once a week (#187) --- .github/workflows/fleet_tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/fleet_tests.yml b/.github/workflows/fleet_tests.yml index 13a6552f2..64b3909b1 100644 --- a/.github/workflows/fleet_tests.yml +++ b/.github/workflows/fleet_tests.yml @@ -6,7 +6,7 @@ concurrency: on: schedule: - - cron: '0 2 * * *' + - cron: '0 2 * * 0' workflow_dispatch: inputs: node1: From ec86cc47756f2cdd4ba084af432f7592fb655799 Mon Sep 17 00:00:00 2001 From: Ivan FB <128452529+Ivansete-status@users.noreply.github.com> Date: Thu, 4 Jun 2026 13:47:35 +0200 Subject: [PATCH 8/9] ci: fix Nim checksum mismatch when building liblogosdelivery (#188) * Add docker mark for docker nodes tests * Add the ports allocation tests * trigger CI job * update the .so * chore: update logos-delivery-python-bindings * bump bindings submodule for fresh CI build * bump bindings to pull in logos-delivery #3828 * ci: pin nimble 0.22.3 to fix Nim checksum mismatch on liblogosdelivery build The "Build liblogosdelivery.so" job ran a bare `nimble install -y` inside the logos-delivery checkout, resolving the package's locked deps with whatever nimble choosenim ships. That nimble recomputes the locked Nim package checksum differently from the nimble that generated nimble.lock (0.22.3), so the build aborted with: Downloaded package checksum does not correspond to that in the lock file: Package: nim@v.2.2.4@r.911e0dbb... Expected checksum: 68bb85cb... Mirror logos-delivery's own CI: pin nimble 0.22.3 after installing Nim, and drop the redundant `nimble install -y` / no-op `make setup`. `make liblogosdelivery` already resolves the locked deps via its build-deps prerequisite (`nimble setup --localdeps`). Co-Authored-By: Claude Opus 4.8 * Bump bindings submodule to pull in logos-delivery latest master Points vendor/logos-delivery-python-bindings at the bump-logos-delivery-master branch (logos-delivery-python-bindings#6), which advances logos-delivery to master 4099ff26. Merge that bindings PR before this one so the pointer lands on bindings master. Co-Authored-By: Claude Opus 4.8 --------- Co-authored-by: Aya Hassan Co-authored-by: Claude Opus 4.8 Co-authored-by: AYAHASSAN287 <49167455+AYAHASSAN287@users.noreply.github.com> --- .github/workflows/pr_tests.yml | 26 +++-- .../test_send_lightpush_and_edge.py | 2 + .../test_wrapper_corner_cases.py | 96 +++++++++++++++++++ vendor/logos-delivery-python-bindings | 2 +- 4 files changed, 119 insertions(+), 7 deletions(-) create mode 100644 tests/wrappers_tests/test_wrapper_corner_cases.py diff --git a/.github/workflows/pr_tests.yml b/.github/workflows/pr_tests.yml index d9948c8e3..040d49a87 100644 --- a/.github/workflows/pr_tests.yml +++ b/.github/workflows/pr_tests.yml @@ -44,7 +44,7 @@ jobs: run: | BINDINGS_HASH=$(git rev-parse HEAD:vendor/logos-delivery-python-bindings) DELIVERY_HASH=$(git -C vendor/logos-delivery-python-bindings rev-parse HEAD:vendor/logos-delivery) - echo "key=liblogosdelivery-${{ runner.os }}-nim2.2.4-${BINDINGS_HASH}-${DELIVERY_HASH}" >> "$GITHUB_OUTPUT" + echo "key=liblogosdelivery-${{ runner.os }}-nim2.2.4-v2-${BINDINGS_HASH}-${DELIVERY_HASH}" >> "$GITHUB_OUTPUT" - name: Cache liblogosdelivery.so id: cache-lib @@ -71,7 +71,7 @@ jobs: gcc \ g++ - - name: Install Nim 2.2.4 + - name: Install Nim 2.2.4 and Nimble 0.22.3 if: steps.cache-lib.outputs.cache-hit != 'true' run: | set -euo pipefail @@ -79,6 +79,10 @@ jobs: echo "$HOME/.nimble/bin" >> "$GITHUB_PATH" export PATH="$HOME/.nimble/bin:$PATH" choosenim 2.2.4 + # Pin the nimble that generated logos-delivery's nimble.lock. A newer + # nimble recomputes the locked Nim package checksum differently, which + # makes `nimble setup --localdeps` abort with a checksum mismatch. + (cd /tmp && nimble install "nimble@0.22.3" -y) nim --version nimble --version @@ -98,10 +102,10 @@ jobs: ln -sf waku.nimble waku.nims - nimble install -y - - make setup - + # `make liblogosdelivery` resolves the locked deps via + # `nimble setup --localdeps` (build-deps prerequisite); a bare + # `nimble install -y` here is redundant and pulls Nim from the lock, + # which is what triggered the checksum mismatch. make liblogosdelivery SO_PATH="$(find . -type f -name 'liblogosdelivery.so' | head -n 1)" @@ -199,6 +203,16 @@ jobs: --reruns 2 \ --junit-xml=wrapper-results-send-errors-and-concurrency.xml + - name: Run wrapper tests - corner cases + continue-on-error: true + env: + PYTHONPATH: ${{ github.workspace }}/vendor/logos-delivery-python-bindings/waku + run: | + pytest tests/wrappers_tests/test_wrapper_corner_cases.py \ + -m "not docker_required" \ + --reruns 2 \ + --junit-xml=wrapper-results-corner-cases.xml + - name: Test Report if: always() uses: dorny/test-reporter@95058abb17504553158e70e2c058fe1fda4392c2 diff --git a/tests/wrappers_tests/test_send_lightpush_and_edge.py b/tests/wrappers_tests/test_send_lightpush_and_edge.py index d22568df5..507dbbed5 100644 --- a/tests/wrappers_tests/test_send_lightpush_and_edge.py +++ b/tests/wrappers_tests/test_send_lightpush_and_edge.py @@ -292,6 +292,7 @@ class TestS11EdgeSenderLightpushAndStore(StepsCommon): store node (cross-implementation check). """ + @pytest.mark.docker_required def test_s11_edge_lightpush_with_store_validation(self): sender_collector = EventCollector() @@ -929,6 +930,7 @@ class TestS25EphemeralLightpushWithStore(StepsCommon): staticnodes=[lightpush_peer, store_peer] """ + @pytest.mark.docker_required def test_s25_ephemeral_lightpush_with_store(self): sender_collector = EventCollector() diff --git a/tests/wrappers_tests/test_wrapper_corner_cases.py b/tests/wrappers_tests/test_wrapper_corner_cases.py new file mode 100644 index 000000000..8aa7d639a --- /dev/null +++ b/tests/wrappers_tests/test_wrapper_corner_cases.py @@ -0,0 +1,96 @@ +import re +import pytest +from src.steps.common import StepsCommon +from src.libs.custom_logger import get_custom_logger +from src.node.wrappers_manager import WrapperManager +from src.node.wrapper_helpers import ( + EventCollector, + create_message_bindings, + get_node_multiaddr, + wait_for_propagated, +) +from tests.wrappers_tests.conftest import build_node_config + +logger = get_custom_logger(__name__) + +PROPAGATED_TIMEOUT_S = 30.0 + +# Matches the /tcp// segment in a libp2p multiaddr. +TCP_PORT_RE = re.compile(r"/tcp/(\d+)/") + + +def _extract_tcp_port(multiaddr: str) -> int: + match = TCP_PORT_RE.search(multiaddr) + assert match, f"multiaddr missing /tcp// segment: {multiaddr!r}" + return int(match.group(1)) + + +class TestWrapperAutoPortAllocation(StepsCommon): + """Corner case: port 0 triggers auto-port allocation. + + Tracks logos-messaging/logos-delivery#3828. Per that PR, auto-port is + opt-in (caller passes 0 explicitly) and only applies to tcpPort, + discv5UdpPort and webSocketPort. restPort and metricsServerPort still + require a concrete value, so they are not exercised here. + """ + + def test_auto_port_starts_node_with_tcp_and_discv5_zero(self): + config = build_node_config(tcpPort=0, discv5UdpPort=0) + + result = WrapperManager.create_and_start(config=config) + assert result.is_ok(), f"create_and_start failed with tcpPort=0, discv5UdpPort=0: " f"{result.err()}" + + with result.ok_value as node: + multiaddr = get_node_multiaddr(node) + tcp_port = _extract_tcp_port(multiaddr) + + assert tcp_port != 0, f"multiaddr still reports tcp port 0; auto-port did not " f"happen. multiaddr={multiaddr!r}" + assert 1024 <= tcp_port <= 65535, f"auto-allocated tcp port out of range: {tcp_port} " f"(multiaddr={multiaddr!r})" + + def test_auto_port_node_can_propagate_message(self): + # End-to-end: two nodes, sender uses auto-port for tcp + discv5. + # restPort stays concrete because REST does not support auto-port. + sender_collector = EventCollector() + sender_config = build_node_config(tcpPort=0, discv5UdpPort=0) + + sender_result = WrapperManager.create_and_start( + config=sender_config, + event_cb=sender_collector.event_callback, + ) + assert sender_result.is_ok(), f"sender start failed: {sender_result.err()}" + + with sender_result.ok_value as sender: + sender_addr = get_node_multiaddr(sender) + + peer_config = build_node_config( + tcpPort=0, + discv5UdpPort=0, + staticnodes=[sender_addr], + ) + peer_result = WrapperManager.create_and_start(config=peer_config) + assert peer_result.is_ok(), f"peer start failed: {peer_result.err()}" + + with peer_result.ok_value: + message = create_message_bindings() + send_result = sender.send_message(message=message) + assert send_result.is_ok(), f"send failed: {send_result.err()}" + + request_id = send_result.ok_value + assert request_id, "send returned empty RequestId" + + propagated = wait_for_propagated(sender_collector, request_id, PROPAGATED_TIMEOUT_S) + assert propagated is not None, f"no message_propagated with auto-allocated ports. " f"Events: {sender_collector.events}" + + @pytest.mark.parametrize("port_field", ["tcpPort", "discv5UdpPort"]) + def test_auto_port_per_field(self, port_field): + # Each auto-port-capable field set to 0 in isolation. restPort and + # metricsServerPort are intentionally excluded (see class docstring). + config = build_node_config(**{port_field: 0}) + + result = WrapperManager.create_and_start(config=config) + assert result.is_ok(), f"create_and_start failed with {port_field}=0: {result.err()}" + + with result.ok_value as node: + multiaddr = get_node_multiaddr(node) + tcp_port = _extract_tcp_port(multiaddr) + assert tcp_port != 0, f"tcp port is 0 in multiaddr with {port_field}=0; " f"multiaddr={multiaddr!r}" diff --git a/vendor/logos-delivery-python-bindings b/vendor/logos-delivery-python-bindings index 370693695..36041dae7 160000 --- a/vendor/logos-delivery-python-bindings +++ b/vendor/logos-delivery-python-bindings @@ -1 +1 @@ -Subproject commit 370693695f8843a26108aa34c915bf6bd9f9f2d5 +Subproject commit 36041dae76a53fff91fe53253933d8eb2ab385f9 From c89c967b9ad4bc8cfcfee4828a9bb36ec624b5d9 Mon Sep 17 00:00:00 2001 From: AYAHASSAN287 <49167455+AYAHASSAN287@users.noreply.github.com> Date: Sun, 7 Jun 2026 19:34:20 +0300 Subject: [PATCH 9/9] Add ports related tests (#186) * Add docker mark for docker nodes tests * Add the ports allocation tests * trigger CI job * update the .so * chore: update logos-delivery-python-bindings * bump bindings submodule for fresh CI build * bump bindings to pull in logos-delivery #3828 * bump bindings to latest logos-delivery master Co-Authored-By: Claude Opus 4.7 (1M context) * retrigger CI job * fix the number of shards issue * Add tests for other ports * fix the yml error * Fix review comments --------- Co-authored-by: Claude Opus 4.7 (1M context) --- .github/workflows/pr_tests.yml | 14 +- src/node/wrapper_helpers.py | 49 +++++++ .../test_wrapper_corner_cases.py | 126 +++++++++++------- 3 files changed, 134 insertions(+), 55 deletions(-) diff --git a/.github/workflows/pr_tests.yml b/.github/workflows/pr_tests.yml index 040d49a87..7e2a5c395 100644 --- a/.github/workflows/pr_tests.yml +++ b/.github/workflows/pr_tests.yml @@ -44,7 +44,7 @@ jobs: run: | BINDINGS_HASH=$(git rev-parse HEAD:vendor/logos-delivery-python-bindings) DELIVERY_HASH=$(git -C vendor/logos-delivery-python-bindings rev-parse HEAD:vendor/logos-delivery) - echo "key=liblogosdelivery-${{ runner.os }}-nim2.2.4-v2-${BINDINGS_HASH}-${DELIVERY_HASH}" >> "$GITHUB_OUTPUT" + echo "key=liblogosdelivery-${{ runner.os }}-nim2.2.4-${BINDINGS_HASH}-${DELIVERY_HASH}" >> "$GITHUB_OUTPUT" - name: Cache liblogosdelivery.so id: cache-lib @@ -79,9 +79,7 @@ jobs: echo "$HOME/.nimble/bin" >> "$GITHUB_PATH" export PATH="$HOME/.nimble/bin:$PATH" choosenim 2.2.4 - # Pin the nimble that generated logos-delivery's nimble.lock. A newer - # nimble recomputes the locked Nim package checksum differently, which - # makes `nimble setup --localdeps` abort with a checksum mismatch. + # Pin the nimble that generated nimble.lock; newer ones fail the checksum. (cd /tmp && nimble install "nimble@0.22.3" -y) nim --version nimble --version @@ -102,11 +100,9 @@ jobs: ln -sf waku.nimble waku.nims - # `make liblogosdelivery` resolves the locked deps via - # `nimble setup --localdeps` (build-deps prerequisite); a bare - # `nimble install -y` here is redundant and pulls Nim from the lock, - # which is what triggered the checksum mismatch. - make liblogosdelivery + # Portable build (no -march=native) so the cached .so doesn't SIGILL + # on runners with older CPUs; via env so the Makefile's NIM_PARAMS apply. + NIM_PARAMS="-d:disableMarchNative -d:marchOptimized" make liblogosdelivery SO_PATH="$(find . -type f -name 'liblogosdelivery.so' | head -n 1)" diff --git a/src/node/wrapper_helpers.py b/src/node/wrapper_helpers.py index 7c9dd1a2d..972f29d0a 100644 --- a/src/node/wrapper_helpers.py +++ b/src/node/wrapper_helpers.py @@ -1,6 +1,8 @@ from __future__ import annotations +import base64 import json +import re import threading import time from typing import Optional @@ -195,6 +197,53 @@ def get_node_multiaddr(node) -> str: return addr +# Matches the /tcp// segment in a libp2p multiaddr. +TCP_PORT_RE = re.compile(r"/tcp/(\d+)/") + + +def get_node_tcp_port(node) -> int: + """Return the TCP port the node advertises in its multiaddr.""" + multiaddr = get_node_multiaddr(node) + match = TCP_PORT_RE.search(multiaddr) + if not match: + raise RuntimeError(f"multiaddr missing /tcp// segment: {multiaddr!r}") + return int(match.group(1)) + + +def get_node_bound_ports(node) -> dict: + """Return the MyBoundPorts debug info . + + Keys: tcp, webSocket, rest, discv5Udp, metrics. A value of 0 means the + service is disabled or did not bind. + """ + result = node.get_node_info_raw("MyBoundPorts") + if result.is_err(): + raise RuntimeError(f"MyBoundPorts query failed: {result.err()}") + return json.loads(result.ok_value) + + +def enr_udp_port(enr_uri: str) -> int: + """Extract the advertised udp port from a text-encoded ENR. + + An ENR is "enr:" + base64url(RLP). Instead of pulling in a full RLP + decoder, find the "udp" key in the raw bytes and read the value after it. + """ + if not enr_uri.startswith("enr:"): + raise RuntimeError(f"not an ENR URI: {enr_uri!r}") + b64 = enr_uri[len("enr:") :] + payload = base64.urlsafe_b64decode(b64 + "=" * (-len(b64) % 4)) + + key = payload.find(b"\x83udp") # "udp" encoded as a 3-byte RLP string + if key == -1: + raise RuntimeError(f"ENR has no udp entry: {enr_uri!r}") + + prefix = payload[key + 4] + if prefix < 0x80: # values < 128 are encoded as a single byte + return prefix + size = prefix - 0x80 # short string: prefix is 0x80 + length + return int.from_bytes(payload[key + 5 : key + 5 + size], "big") + + def create_message_bindings(**overrides) -> dict: envelope = { "contentTopic": DEFAULT_CONTENT_TOPIC, diff --git a/tests/wrappers_tests/test_wrapper_corner_cases.py b/tests/wrappers_tests/test_wrapper_corner_cases.py index 8aa7d639a..57418cfc0 100644 --- a/tests/wrappers_tests/test_wrapper_corner_cases.py +++ b/tests/wrappers_tests/test_wrapper_corner_cases.py @@ -1,4 +1,3 @@ -import re import pytest from src.steps.common import StepsCommon from src.libs.custom_logger import get_custom_logger @@ -6,91 +5,126 @@ from src.node.wrappers_manager import WrapperManager from src.node.wrapper_helpers import ( EventCollector, create_message_bindings, + enr_udp_port, + get_node_bound_ports, get_node_multiaddr, + get_node_tcp_port, wait_for_propagated, ) -from tests.wrappers_tests.conftest import build_node_config +from tests.wrappers_tests.conftest import build_node_config, free_port logger = get_custom_logger(__name__) PROPAGATED_TIMEOUT_S = 30.0 -# Matches the /tcp// segment in a libp2p multiaddr. -TCP_PORT_RE = re.compile(r"/tcp/(\d+)/") - - -def _extract_tcp_port(multiaddr: str) -> int: - match = TCP_PORT_RE.search(multiaddr) - assert match, f"multiaddr missing /tcp// segment: {multiaddr!r}" - return int(match.group(1)) +# The five service ports covered by logos-messaging/logos-delivery#3828: +# (config field, key in MyBoundPorts, config to enable the service, default port) +SERVICE_PORTS = [ + ("tcpPort", "tcp", {}, 60000), + ("discv5UdpPort", "discv5Udp", {"discv5Discovery": True}, 9000), + ("websocketPort", "webSocket", {"websocketSupport": True}, 8000), + ("restPort", "rest", {"rest": True}, 8645), + ("metricsServerPort", "metrics", {"metricsServer": True}, 8008), +] class TestWrapperAutoPortAllocation(StepsCommon): """Corner case: port 0 triggers auto-port allocation. - Tracks logos-messaging/logos-delivery#3828. Per that PR, auto-port is - opt-in (caller passes 0 explicitly) and only applies to tcpPort, - discv5UdpPort and webSocketPort. restPort and metricsServerPort still - require a concrete value, so they are not exercised here. + Tracks logos-messaging/logos-delivery#3828: + - any service port set to 0 gets a free port auto-assigned at bind time + - defaults remain concrete, so auto-port is opt-in via an explicit 0 + - bound values are exposed through MyBoundPorts (0 = service disabled) + - the ENR is rebuilt after discv5 startup, so it advertises the + actually-bound UDP port instead of the configured 0 """ def test_auto_port_starts_node_with_tcp_and_discv5_zero(self): config = build_node_config(tcpPort=0, discv5UdpPort=0) - result = WrapperManager.create_and_start(config=config) - assert result.is_ok(), f"create_and_start failed with tcpPort=0, discv5UdpPort=0: " f"{result.err()}" + assert result.is_ok(), f"create_and_start failed: {result.err()}" with result.ok_value as node: - multiaddr = get_node_multiaddr(node) - tcp_port = _extract_tcp_port(multiaddr) - - assert tcp_port != 0, f"multiaddr still reports tcp port 0; auto-port did not " f"happen. multiaddr={multiaddr!r}" - assert 1024 <= tcp_port <= 65535, f"auto-allocated tcp port out of range: {tcp_port} " f"(multiaddr={multiaddr!r})" + tcp_port = get_node_tcp_port(node) + assert tcp_port != 0, "multiaddr still reports port 0; auto-port did not happen" + assert get_node_bound_ports(node)["tcp"] == tcp_port, "MyBoundPorts disagrees with the multiaddr port" def test_auto_port_node_can_propagate_message(self): - # End-to-end: two nodes, sender uses auto-port for tcp + discv5. - # restPort stays concrete because REST does not support auto-port. - sender_collector = EventCollector() - sender_config = build_node_config(tcpPort=0, discv5UdpPort=0) - - sender_result = WrapperManager.create_and_start( - config=sender_config, - event_cb=sender_collector.event_callback, - ) + # End-to-end: two auto-port nodes exchange a message. + # numShardsInNetwork=1 enables autosharding, required by the send API. + collector = EventCollector() + sender_config = build_node_config(tcpPort=0, discv5UdpPort=0, numShardsInNetwork=1) + sender_result = WrapperManager.create_and_start(config=sender_config, event_cb=collector.event_callback) assert sender_result.is_ok(), f"sender start failed: {sender_result.err()}" with sender_result.ok_value as sender: - sender_addr = get_node_multiaddr(sender) - peer_config = build_node_config( tcpPort=0, discv5UdpPort=0, - staticnodes=[sender_addr], + numShardsInNetwork=1, + staticnodes=[get_node_multiaddr(sender)], ) peer_result = WrapperManager.create_and_start(config=peer_config) assert peer_result.is_ok(), f"peer start failed: {peer_result.err()}" with peer_result.ok_value: - message = create_message_bindings() - send_result = sender.send_message(message=message) + send_result = sender.send_message(message=create_message_bindings()) assert send_result.is_ok(), f"send failed: {send_result.err()}" request_id = send_result.ok_value assert request_id, "send returned empty RequestId" - propagated = wait_for_propagated(sender_collector, request_id, PROPAGATED_TIMEOUT_S) - assert propagated is not None, f"no message_propagated with auto-allocated ports. " f"Events: {sender_collector.events}" - - @pytest.mark.parametrize("port_field", ["tcpPort", "discv5UdpPort"]) - def test_auto_port_per_field(self, port_field): - # Each auto-port-capable field set to 0 in isolation. restPort and - # metricsServerPort are intentionally excluded (see class docstring). - config = build_node_config(**{port_field: 0}) + propagated = wait_for_propagated(collector, request_id, PROPAGATED_TIMEOUT_S) + assert propagated is not None, f"no message_propagated event. Events: {collector.events}" + @pytest.mark.parametrize("port_field, bound_key, enable_service, default_port", SERVICE_PORTS) + def test_auto_port_per_field(self, port_field, bound_key, enable_service, default_port): + # Each service port set to 0 in isolation, with its service enabled. + config = build_node_config(**{port_field: 0}, **enable_service) result = WrapperManager.create_and_start(config=config) assert result.is_ok(), f"create_and_start failed with {port_field}=0: {result.err()}" with result.ok_value as node: - multiaddr = get_node_multiaddr(node) - tcp_port = _extract_tcp_port(multiaddr) - assert tcp_port != 0, f"tcp port is 0 in multiaddr with {port_field}=0; " f"multiaddr={multiaddr!r}" + port = get_node_bound_ports(node)[bound_key] + assert port != 0, f"{port_field}=0 but nothing bound; auto-port did not happen" + assert port != default_port, f"{port_field}=0 bound the default {default_port}; expected an ephemeral port" + + def test_concrete_ports_are_respected(self): + # Auto-port is opt-in: non-zero ports must bind exactly as requested. + requested = {field: free_port() for field, _, _, _ in SERVICE_PORTS} + enable_all = {key: value for _, _, enable, _ in SERVICE_PORTS for key, value in enable.items()} + + config = build_node_config(**requested, **enable_all) + result = WrapperManager.create_and_start(config=config) + assert result.is_ok(), f"create_and_start failed: {result.err()}" + + with result.ok_value as node: + bound = get_node_bound_ports(node) + for field, bound_key, _, _ in SERVICE_PORTS: + assert bound[bound_key] == requested[field], f"{field}: requested {requested[field]}, bound {bound[bound_key]}" + + def test_bound_ports_zero_for_disabled_services(self): + # build_node_config leaves websocket, REST, metrics and discv5 off. + result = WrapperManager.create_and_start(config=build_node_config()) + assert result.is_ok(), f"create_and_start failed: {result.err()}" + + with result.ok_value as node: + bound = get_node_bound_ports(node) + assert bound["tcp"] != 0, "tcp is enabled but reports port 0" + for key in ("webSocket", "rest", "discv5Udp", "metrics"): + assert bound[key] == 0, f"{key} is disabled but reports port {bound[key]}" + + def test_enr_advertises_bound_discv5_port(self): + # #3828 rebuilds the ENR after discv5 startup so it advertises the + # actually-bound UDP port, not the configured 0. + config = build_node_config(discv5UdpPort=0, discv5Discovery=True) + result = WrapperManager.create_and_start(config=config) + assert result.is_ok(), f"create_and_start failed: {result.err()}" + + with result.ok_value as node: + discv5_port = get_node_bound_ports(node)["discv5Udp"] + assert discv5_port != 0, "discv5 enabled with port 0 but nothing bound" + + enr_result = node.get_node_info_raw("MyENR") + assert enr_result.is_ok(), f"MyENR query failed: {enr_result.err()}" + assert enr_udp_port(enr_result.ok_value.strip()) == discv5_port, "ENR was not rebuilt after discv5 startup"