From a6a04403128f8ae6bf3189a3180d8d9379e57b3d Mon Sep 17 00:00:00 2001 From: Florin Barbu Date: Tue, 21 Nov 2023 09:29:48 +0200 Subject: [PATCH] Relay Publish: multiple nodes (#4) * github actions report summary * use env instead of inputs * multiple nodes tests * fix warm up * fix warm up * small fix after CI run * small fix after CI run 2 * add new multi-node test * self review --- .github/workflows/test.yml | 23 ++++++- src/env_vars.py | 2 + src/steps/relay.py | 101 ++++++++++++++++++++--------- tests/conftest.py | 6 ++ tests/relay/test_multiple_nodes.py | 19 ++++++ tests/relay/test_publish.py | 18 ++--- 6 files changed, 128 insertions(+), 41 deletions(-) create mode 100644 tests/relay/test_multiple_nodes.py diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 16d8b78adb..64032efe98 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -9,13 +9,20 @@ on: workflow_dispatch: inputs: node1: - required: false + required: true + description: "Node that usually publishes messages. Used for all tests" type: string default: "wakuorg/nwaku:latest" node2: - required: false + required: true + description: "Node that usually queries for published messages. 
Used for all tests" type: string default: "wakuorg/go-waku:latest" + additional_nodes: + required: false + description: "Additional optional nodes used in e2e tests, separated by ," + type: string + default: "wakuorg/nwaku:latest,wakuorg/go-waku:latest" protocol: description: "Protocol used to comunicate inside the network" required: true @@ -29,6 +36,7 @@ env: FORCE_COLOR: "1" NODE_1: ${{ inputs.node1 }} NODE_2: ${{ inputs.node2 }} + ADDITIONAL_NODES: ${{ inputs.additional_nodes }} PROTOCOL: ${{ inputs.protocol || 'REST' }} jobs: @@ -75,3 +83,14 @@ jobs: github_token: ${{ secrets.GITHUB_TOKEN }} publish_branch: gh-pages publish_dir: allure-history + + - name: Create job summary + if: always() + run: | + echo "## Run Information" >> $GITHUB_STEP_SUMMARY + echo "- **Node1**: ${{ env.NODE_1 }}" >> $GITHUB_STEP_SUMMARY + echo "- **Node2**: ${{ env.NODE_2 }}" >> $GITHUB_STEP_SUMMARY + echo "- **Additional Nodes**: ${{ env.ADDITIONAL_NODES }}" >> $GITHUB_STEP_SUMMARY + echo "- **Protocol**: ${{ env.PROTOCOL }}" >> $GITHUB_STEP_SUMMARY + echo "## Test Results" >> $GITHUB_STEP_SUMMARY + echo "Allure report will be available at: https://waku-org.github.io/waku-interop-tests/${{ github.run_number }}" >> $GITHUB_STEP_SUMMARY diff --git a/src/env_vars.py b/src/env_vars.py index 3cf33609a0..a996661cc3 100644 --- a/src/env_vars.py +++ b/src/env_vars.py @@ -16,6 +16,8 @@ def get_env_var(var_name, default=None): # Configuration constants. 
Need to be upercase to appear in reports NODE_1 = get_env_var("NODE_1", "wakuorg/go-waku:latest") NODE_2 = get_env_var("NODE_2", "wakuorg/nwaku:latest") +ADDITIONAL_NODES = get_env_var("ADDITIONAL_NODES", "wakuorg/nwaku:latest,wakuorg/go-waku:latest") +# more nodes need to follow the NODE_X pattern DOCKER_LOG_DIR = get_env_var("DOCKER_LOG_DIR", "./log/docker") NETWORK_NAME = get_env_var("NETWORK_NAME", "waku") SUBNET = get_env_var("SUBNET", "172.18.0.0/16") diff --git a/src/steps/relay.py b/src/steps/relay.py index 720232d243..d9bafdbadd 100644 --- a/src/steps/relay.py +++ b/src/steps/relay.py @@ -1,3 +1,4 @@ +import inspect from src.libs.custom_logger import get_custom_logger import math from time import time @@ -5,7 +6,7 @@ import pytest import allure from src.libs.common import to_base64, delay from src.data_classes import message_rpc_response_schema -from src.env_vars import NODE_1, NODE_2, NODEKEY +from src.env_vars import NODE_1, NODE_2, ADDITIONAL_NODES, NODEKEY, RUNNING_IN_CI from src.node.waku_node import WakuNode from tenacity import retry, stop_after_delay, wait_fixed @@ -13,36 +14,81 @@ logger = get_custom_logger(__name__) class StepsRelay: - @pytest.fixture(scope="function", autouse=True) - def setup_nodes(self, request): - self.node1 = WakuNode(NODE_1, "node1_" + request.cls.test_id) - self.node1.start(relay="true", discv5_discovery="true", peer_exchange="true", nodekey=NODEKEY) - enr_uri = self.node1.info()["enrUri"] - self.node2 = WakuNode(NODE_2, "node2_" + request.cls.test_id) - self.node2.start(relay="true", discv5_discovery="true", discv5_bootstrap_node=enr_uri, peer_exchange="true") - self.test_pubsub_topic = "/waku/2/rs/18/1" - self.test_content_topic = "/test/1/waku-relay/proto" - self.test_payload = "Relay works!!" 
- self.node1.set_subscriptions([self.test_pubsub_topic]) - self.node2.set_subscriptions([self.test_pubsub_topic]) + test_pubsub_topic = "/waku/2/rs/18/1" + test_content_topic = "/test/1/waku-relay/proto" + test_payload = "Relay works!!" - @pytest.fixture(scope="function", autouse=True) - def network_warm_up(self, setup_nodes): + @pytest.fixture(scope="function") + def setup_main_relay_nodes(self, request): + logger.debug(f"Running fixture setup: {inspect.currentframe().f_code.co_name}") + self.node1 = WakuNode(NODE_1, f"node1_{request.cls.test_id}") + self.node1.start(relay="true", discv5_discovery="true", peer_exchange="true", nodekey=NODEKEY) + self.enr_uri = self.node1.info()["enrUri"] + self.node2 = WakuNode(NODE_2, f"node2_{request.cls.test_id}") + self.node2.start(relay="true", discv5_discovery="true", discv5_bootstrap_node=self.enr_uri, peer_exchange="true") + self.main_nodes = [self.node1, self.node2] + self.optional_nodes = [] + + @pytest.fixture(scope="function") + def setup_optional_relay_nodes(self, request): + logger.debug(f"Running fixture setup: {inspect.currentframe().f_code.co_name}") + if ADDITIONAL_NODES: + nodes = [node.strip() for node in ADDITIONAL_NODES.split(",")] + else: + pytest.skip("ADDITIONAL_NODES is empty, cannot run test") + for index, node in enumerate(nodes): + node = WakuNode(node, f"node{index + 3}_{request.cls.test_id}") + node.start(relay="true", discv5_discovery="true", discv5_bootstrap_node=self.enr_uri, peer_exchange="true") + self.optional_nodes.append(node) + + @pytest.fixture(scope="function") + def subscribe_main_relay_nodes(self): + logger.debug(f"Running fixture setup: {inspect.currentframe().f_code.co_name}") + self.ensure_subscriptions_on_nodes(self.main_nodes, [self.test_pubsub_topic]) + + @pytest.fixture(scope="function") + def subscribe_optional_relay_nodes(self): + logger.debug(f"Running fixture setup: {inspect.currentframe().f_code.co_name}") + self.ensure_subscriptions_on_nodes(self.optional_nodes, 
[self.test_pubsub_topic]) + + @pytest.fixture(scope="function") + def relay_warm_up(self): try: - self.wait_for_published_message_to_reach_peer(120) - logger.info("WARM UP successful !!") + self.wait_for_published_message_to_reach_peer() + logger.info("WARM UP successful!!") except Exception as ex: raise TimeoutError(f"WARM UP FAILED WITH: {ex}") + # this method should be used only for the tests that use the warm_up fixture + # otherwise use wait_for_published_message_to_reach_peer @allure.step - def check_published_message_reaches_peer(self, message, pubsub_topic=None, message_propagation_delay=0.1): - self.node1.send_message(message, pubsub_topic or self.test_pubsub_topic) + def check_published_message_reaches_peer(self, message, pubsub_topic=None, message_propagation_delay=0.1, sender=None, peer_list=None): + if not sender: + sender = self.node1 + if not peer_list: + peer_list = self.main_nodes + self.optional_nodes + sender.send_message(message, pubsub_topic or self.test_pubsub_topic) delay(message_propagation_delay) - get_messages_response = self.node2.get_messages(pubsub_topic or self.test_pubsub_topic) - assert get_messages_response, "Peer node couldn't find any messages" - received_message = message_rpc_response_schema.load(get_messages_response[0]) - self.assert_received_message(message, received_message) + for index, peer in enumerate(peer_list): + logger.debug(f"Checking that peer NODE_{index + 1}:{peer.image} can find the published message") + get_messages_response = peer.get_messages(pubsub_topic or self.test_pubsub_topic) + assert get_messages_response, f"Peer NODE_{index + 1}:{peer.image} couldn't find any messages" + received_message = message_rpc_response_schema.load(get_messages_response[0]) + self.assert_received_message(message, received_message) + # we need much bigger timeout in CI because we run tests in parallel there and the machine itself is slower + @allure.step + def wait_for_published_message_to_reach_peer( + self, timeout_duration=120 if 
RUNNING_IN_CI else 20, time_between_retries=1, sender=None, peer_list=None + ): + @retry(stop=stop_after_delay(timeout_duration), wait=wait_fixed(time_between_retries), reraise=True) + def check_peer_connection(): + message = {"payload": to_base64(self.test_payload), "contentTopic": self.test_content_topic, "timestamp": int(time() * 1e9)} + self.check_published_message_reaches_peer(message, sender=sender, peer_list=peer_list) + + check_peer_connection() + + @allure.step def assert_received_message(self, sent_message, received_message): def assert_fail_message(field_name): return f"Incorrect field: {field_name}. Published: {sent_message[field_name]} Received: {getattr(received_message, field_name)}" @@ -63,14 +109,7 @@ class StepsRelay: if "rateLimitProof" in sent_message: assert str(received_message.rateLimitProof) == str(sent_message["rateLimitProof"]), assert_fail_message("rateLimitProof") - def wait_for_published_message_to_reach_peer(self, timeout_duration, time_between_retries=1): - @retry(stop=stop_after_delay(timeout_duration), wait=wait_fixed(time_between_retries), reraise=True) - def check_peer_connection(): - message = {"payload": to_base64(self.test_payload), "contentTopic": self.test_content_topic, "timestamp": int(time() * 1e9)} - self.check_published_message_reaches_peer(message) - - check_peer_connection() - + @allure.step def ensure_subscriptions_on_nodes(self, node_list, pubsub_topic_list): for node in node_list: node.set_subscriptions(pubsub_topic_list) diff --git a/tests/conftest.py b/tests/conftest.py index b412a29713..2e26b97cd4 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,4 +1,5 @@ # -*- coding: utf-8 -*- +import inspect import glob from src.libs.custom_logger import get_custom_logger import os @@ -28,6 +29,7 @@ def pytest_runtest_makereport(item): def set_allure_env_variables(): yield if os.path.isdir("allure-results") and not os.path.isfile(os.path.join("allure-results", "environment.properties")): + logger.debug(f"Running 
fixture teardown: {inspect.currentframe().f_code.co_name}") with open(os.path.join("allure-results", "environment.properties"), "w") as outfile: for attribute_name in dir(env_vars): if attribute_name.isupper(): @@ -38,6 +40,7 @@ def set_allure_env_variables(): @pytest.fixture(scope="function", autouse=True) def test_id(request): # setting up an unique test id to be used where needed + logger.debug(f"Running fixture setup: {inspect.currentframe().f_code.co_name}") request.cls.test_id = f"{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}__{str(uuid4())}" @@ -45,6 +48,7 @@ def test_id(request): def test_setup(request, test_id): logger.debug(f"Running test: {request.node.name} with id: {request.cls.test_id}") yield + logger.debug(f"Running fixture teardown: {inspect.currentframe().f_code.co_name}") for file in glob.glob(os.path.join(env_vars.DOCKER_LOG_DIR, "*")): if os.path.getmtime(file) < time() - 3600: logger.debug(f"Deleting old log file: {file}") @@ -58,6 +62,7 @@ def test_setup(request, test_id): def attach_logs_on_fail(request): yield if env_vars.RUNNING_IN_CI and hasattr(request.node, "rep_call") and request.node.rep_call.failed: + logger.debug(f"Running fixture teardown: {inspect.currentframe().f_code.co_name}") logger.debug("Test failed, attempting to attach logs to the allure reports") for file in glob.glob(os.path.join(env_vars.DOCKER_LOG_DIR, "*" + request.cls.test_id + "*")): attach_allure_file(file) @@ -67,6 +72,7 @@ def attach_logs_on_fail(request): def close_open_nodes(attach_logs_on_fail): DS.waku_nodes = [] yield + logger.debug(f"Running fixture teardown: {inspect.currentframe().f_code.co_name}") crashed_containers = [] for node in DS.waku_nodes: try: diff --git a/tests/relay/test_multiple_nodes.py b/tests/relay/test_multiple_nodes.py new file mode 100644 index 0000000000..aa7cc121e5 --- /dev/null +++ b/tests/relay/test_multiple_nodes.py @@ -0,0 +1,19 @@ +import pytest +from src.steps.relay import StepsRelay + + 
+@pytest.mark.usefixtures("setup_main_relay_nodes", "setup_optional_relay_nodes", "subscribe_main_relay_nodes") +class TestMultipleNodes(StepsRelay): + def test_first_node_to_start_publishes(self, subscribe_optional_relay_nodes, relay_warm_up): + self.check_published_message_reaches_peer(self.create_message()) + + def test_last_node_to_start_publishes(self, subscribe_optional_relay_nodes, relay_warm_up): + self.check_published_message_reaches_peer(self.create_message(), sender=self.optional_nodes[-1]) + + def test_optional_nodes_not_subscribed_to_same_pubsub_topic(self): + self.wait_for_published_message_to_reach_peer(peer_list=self.main_nodes) + try: + self.check_published_message_reaches_peer(self.create_message(), peer_list=self.optional_nodes) + raise AssertionError("Non subscribed nodes received the message!!") + except Exception as ex: + assert "Not Found" in str(ex), "Expected 404 Not Found when the message is not found" diff --git a/tests/relay/test_publish.py b/tests/relay/test_publish.py index 3d49d19933..0f45ff9819 100644 --- a/tests/relay/test_publish.py +++ b/tests/relay/test_publish.py @@ -1,3 +1,4 @@ +import pytest from src.libs.custom_logger import get_custom_logger from time import time from src.libs.common import delay, to_base64 @@ -8,6 +9,7 @@ from src.data_classes import message_rpc_response_schema logger = get_custom_logger(__name__) +@pytest.mark.usefixtures("setup_main_relay_nodes", "subscribe_main_relay_nodes", "relay_warm_up") class TestRelayPublish(StepsRelay): def test_publish_with_valid_payloads(self): failed_payloads = [] @@ -55,7 +57,7 @@ class TestRelayPublish(StepsRelay): self.check_published_message_reaches_peer(message, message_propagation_delay=2) raise AssertionError("Duplicate message was retrieved twice") except Exception as ex: - assert "Peer node couldn't find any messages" in str(ex) + assert "couldn't find any messages" in str(ex) def test_publish_with_valid_content_topics(self): failed_content_topics = [] @@ -90,7 +92,7 
@@ class TestRelayPublish(StepsRelay): assert "Bad Request" in str(ex) or "Internal Server Error" in str(ex) def test_publish_on_multiple_pubsub_topics(self): - self.ensure_subscriptions_on_nodes([self.node1, self.node2], VALID_PUBSUB_TOPICS) + self.ensure_subscriptions_on_nodes(self.main_nodes, VALID_PUBSUB_TOPICS) failed_pubsub_topics = [] for pubsub_topic in VALID_PUBSUB_TOPICS: logger.debug(f"Running test with pubsub topic {pubsub_topic}") @@ -102,7 +104,7 @@ class TestRelayPublish(StepsRelay): assert not failed_pubsub_topics, f"PubusubTopic failed: {failed_pubsub_topics}" def test_message_published_on_different_pubsub_topic_is_not_retrieved(self): - self.ensure_subscriptions_on_nodes([self.node1, self.node2], VALID_PUBSUB_TOPICS) + self.ensure_subscriptions_on_nodes(self.main_nodes, VALID_PUBSUB_TOPICS) self.node1.send_message(self.create_message(), VALID_PUBSUB_TOPICS[0]) delay(0.1) messages = self.node2.get_messages(VALID_PUBSUB_TOPICS[1]) @@ -194,7 +196,7 @@ class TestRelayPublish(StepsRelay): self.check_published_message_reaches_peer(message) raise AssertionError("Duplicate message was retrieved twice") except Exception as ex: - assert "Peer node couldn't find any messages" in str(ex) + assert "couldn't find any messages" in str(ex) def test_publish_while_peer_is_paused(self): message = self.create_message() @@ -218,14 +220,14 @@ class TestRelayPublish(StepsRelay): def test_publish_after_node1_restarts(self): self.check_published_message_reaches_peer(self.create_message()) self.node1.restart() - self.ensure_subscriptions_on_nodes([self.node1, self.node2], [self.test_pubsub_topic]) - self.wait_for_published_message_to_reach_peer(20) + self.ensure_subscriptions_on_nodes(self.main_nodes, [self.test_pubsub_topic]) + self.wait_for_published_message_to_reach_peer() def test_publish_after_node2_restarts(self): self.check_published_message_reaches_peer(self.create_message()) self.node2.restart() - self.ensure_subscriptions_on_nodes([self.node1, self.node2], 
[self.test_pubsub_topic]) - self.wait_for_published_message_to_reach_peer(20) + self.ensure_subscriptions_on_nodes(self.main_nodes, [self.test_pubsub_topic]) + self.wait_for_published_message_to_reach_peer() def test_publish_and_retrieve_100_messages(self): num_messages = 100 # if increase this number make sure to also increase rest-relay-cache-capacity flag