diff --git a/README.md b/README.md index 592ddb9..5e13480 100644 --- a/README.md +++ b/README.md @@ -19,10 +19,10 @@ pytest Licensed and distributed under either of -- MIT license: [LICENSE-MIT](https://github.com/waku-org/js-waku/blob/master/LICENSE-MIT) or http://opensource.org/licenses/MIT +- MIT license: [LICENSE-MIT](http://opensource.org/licenses/MIT) or -- Apache License, Version 2.0, ([LICENSE-APACHE-v2](https://github.com/waku-org/js-waku/blob/master/LICENSE-APACHE-v2) or http://www.apache.org/licenses/LICENSE-2.0) +- Apache License, Version 2.0, [LICENSE-APACHE-v2](http://www.apache.org/licenses/LICENSE-2.0) at your option. These files may not be copied, modified, or distributed except according to those terms. diff --git a/cluster_config/cfgsync.yaml b/cluster_config/cfgsync.yaml new file mode 100644 index 0000000..10840a5 --- /dev/null +++ b/cluster_config/cfgsync.yaml @@ -0,0 +1,31 @@ +port: 4400 +n_hosts: 2 +timeout: 30 + +# ConsensusConfig related parameters +security_param: 10 +active_slot_coeff: 0.9 + +# DaConfig related parameters +subnetwork_size: 2 +dispersal_factor: 2 +num_samples: 1 +num_subnets: 2 +old_blobs_check_interval_secs: 5 +blobs_validity_duration_secs: 60 +global_params_path: "/kzgrs_test_params" + +# Tracing +tracing_settings: + logger: Stdout + tracing: !Otlp + endpoint: http://tempo:4317/ + sample_ratio: 0.5 + service_name: node + filter: !EnvFilter + filters: + nomos: debug + metrics: !Otlp + endpoint: http://prometheus:9090/api/v1/otlp/v1/metrics + host_identifier: node + level: INFO \ No newline at end of file diff --git a/cluster_config/scripts/run_cfgsync.sh b/cluster_config/scripts/run_cfgsync.sh new file mode 100755 index 0000000..d0f2659 --- /dev/null +++ b/cluster_config/scripts/run_cfgsync.sh @@ -0,0 +1,5 @@ +#!/bin/sh + +set -e + +exec /usr/bin/cfgsync-server /etc/nomos/cfgsync.yaml diff --git a/cluster_config/scripts/run_nomos_executor.sh b/cluster_config/scripts/run_nomos_executor.sh new file mode 100755 index 0000000..9fab2a6 --- /dev/null +++ b/cluster_config/scripts/run_nomos_executor.sh @@ -0,0 +1,14 @@ +#!/bin/sh + +set -e + +export CFG_FILE_PATH="/config.yaml" \ + CFG_SERVER_ADDR="http://cfgsync:4400" \ + CFG_HOST_IP=$(hostname -i) \ + CFG_HOST_KIND="executor" \ + CFG_HOST_IDENTIFIER="executor-$(hostname -i)" \ + LOG_LEVEL="INFO" \ + RISC0_DEV_MODE=true + +/usr/bin/cfgsync-client && \ + exec /usr/bin/nomos-executor /config.yaml diff --git a/cluster_config/scripts/run_nomos_node.sh b/cluster_config/scripts/run_nomos_node.sh new file mode 100755 index 0000000..d8c7338 --- /dev/null +++ b/cluster_config/scripts/run_nomos_node.sh @@ -0,0 +1,13 @@ +#!/bin/sh + +set -e + +export CFG_FILE_PATH="/config.yaml" \ + CFG_SERVER_ADDR="http://cfgsync:4400" \ + CFG_HOST_IP=$(hostname -i) \ + CFG_HOST_IDENTIFIER="validator-$(hostname -i)" \ + LOG_LEVEL="INFO" \ + RISC0_DEV_MODE=true + +/usr/bin/cfgsync-client && \ + exec /usr/bin/nomos-node /config.yaml diff --git a/requirements.txt b/requirements.txt index 48ed199..2965db5 100644 --- a/requirements.txt +++ b/requirements.txt @@ -31,7 +31,7 @@ pytest-xdist==3.5.0 python-dotenv==1.0.1 pytest-dependency==0.6.0 PyYAML==6.0.1 -requests==2.32.0 +requests==2.31.0 setuptools==70.0.0 tenacity==8.2.3 typeguard==4.1.5 diff --git a/src/data_storage.py b/src/data_storage.py new file mode 100644 index 0000000..c6215e2 --- /dev/null +++ b/src/data_storage.py @@ -0,0 +1,3 @@ +# We use this class for global variables +class DS: + nomos_nodes = [] diff --git a/src/env_vars.py b/src/env_vars.py index bf4e97a..78c051d 100644 --- a/src/env_vars.py +++ b/src/env_vars.py @@ -15,10 +15,15 @@ def get_env_var(var_name, default=None): # Configuration constants. Need to be upercase to appear in reports -DEFAULT_NOMOS = "nomos:latest" -NODE_1 = get_env_var("NODE_1", DEFAULT_NOMOS) -NODE_2 = get_env_var("NODE_2", DEFAULT_NOMOS) -ADDITIONAL_NODES = get_env_var("ADDITIONAL_NODES", f"{DEFAULT_NOMOS},{DEFAULT_NOMOS}") +NOMOS = "nomos" +NOMOS_EXECUTOR = "nomos_executor" +CFGSYNC = "cfgsync" + +NODE_1 = get_env_var("NODE_1", NOMOS) +NODE_2 = get_env_var("NODE_2", NOMOS_EXECUTOR) +NODE_3 = get_env_var("NODE_3", CFGSYNC) + +ADDITIONAL_NODES = get_env_var("ADDITIONAL_NODES", f"{NOMOS},{NOMOS}") # more nodes need to follow the NODE_X pattern DOCKER_LOG_DIR = get_env_var("DOCKER_LOG_DIR", "./log/docker") NETWORK_NAME = get_env_var("NETWORK_NAME", "nomos") diff --git a/src/node/api_clients/rest.py b/src/node/api_clients/rest.py index c2197da..a899d83 100644 --- a/src/node/api_clients/rest.py +++ b/src/node/api_clients/rest.py @@ -20,6 +20,6 @@ class REST(BaseClient): headers = {"accept": "text/plain"} return self.make_request(method, url, headers=headers, data=payload) - def status(self): - info_response = self.rest_call("get", "cl/status") - return info_response.json() + def info(self): + status_response = self.rest_call("get", "cryptarchia/info") + return status_response.json() diff --git a/src/node/docker_mananger.py b/src/node/docker_mananger.py index 6318e8c..074fb41 100644 --- a/src/node/docker_mananger.py +++ b/src/node/docker_mananger.py @@ -35,7 +35,7 @@ class DockerManager: logger.debug(f"Network {network_name} created") return network - def start_container(self, image_name, ports, args, log_path, container_ip, volumes, remove_container=True): + def start_container(self, image_name, port_bindings, args, log_path, volumes, entrypoint, remove_container=True, name=None): cli_args = [] for key, value in args.items(): if isinstance(value, list): # Check if value is a list @@ -45,18 +45,21 @@ class DockerManager: else: cli_args.append(f"--{key}={value}") # Add a single command - port_bindings = {f"{port}/tcp": ("", port) for port in ports} - port_bindings_for_log = " ".join(f"-p {port}:{port}" for port in ports) cli_args_str_for_log = " ".join(cli_args) - logger.debug(f"docker run -i -t {port_bindings_for_log} {image_name} {cli_args_str_for_log}") + logger.debug(f"docker run -i -t {port_bindings} {image_name} {cli_args_str_for_log}") container = self._client.containers.run( - image_name, command=cli_args, ports=port_bindings, detach=True, remove=remove_container, auto_remove=remove_container, volumes=volumes + image_name, + command=cli_args, + ports=port_bindings, + detach=True, + remove=remove_container, + auto_remove=remove_container, + volumes=volumes, + entrypoint=entrypoint, + name=name, + network=NETWORK_NAME, ) - network = self._client.networks.get(NETWORK_NAME) - logger.debug(f"docker network connect --ip {container_ip} {NETWORK_NAME} {container.id}") - network.connect(container, ipv4_address=container_ip) - logger.debug(f"Container started with ID {container.short_id}. Setting up logs at {log_path}") log_thread = threading.Thread(target=self._log_container_output, args=(container, log_path)) log_thread.daemon = True @@ -105,7 +108,7 @@ class DockerManager: @staticmethod def generate_random_ext_ip(): - base_ip_fragments = ["172", "18"] + base_ip_fragments = ["172", "19"] ext_ip = ".".join(base_ip_fragments + [str(random.randint(0, 255)) for _ in range(2)]) logger.debug(f"Generated random external IP {ext_ip}") return ext_ip diff --git a/src/node/node_vars.py b/src/node/node_vars.py new file mode 100644 index 0000000..333a749 --- /dev/null +++ b/src/node/node_vars.py @@ -0,0 +1,20 @@ +nomos_nodes = { + "nomos": { + "image": "nomos:latest", + "volumes": ["cluster_config:/etc/nomos", "./kzgrs/kzgrs_test_params:/kzgrs_test_params:z"], + "ports": ["3000/udp", "18080/tcp"], + "entrypoint": "/etc/nomos/scripts/run_nomos_node.sh", + }, + "nomos_executor": { + "image": "nomos:latest", + "volumes": ["cluster_config:/etc/nomos", "./kzgrs/kzgrs_test_params:/kzgrs_test_params:z"], + "ports": ["3000/udp", "18080/tcp"], + "entrypoint": "/etc/nomos/scripts/run_nomos_executor.sh", + }, + "cfgsync": { + "image": "nomos:latest", + "volumes": ["cluster_config:/etc/nomos"], + "ports": "", + "entrypoint": "/etc/nomos/scripts/run_cfgsync.sh", + }, +} diff --git a/src/node/nomos_node.py b/src/node/nomos_node.py index 246106d..cd3f079 100644 --- a/src/node/nomos_node.py +++ b/src/node/nomos_node.py @@ -1,10 +1,14 @@ import os +from src.data_storage import DS from src.libs.custom_logger import get_custom_logger from tenacity import retry, stop_after_delay, wait_fixed + from src.node.api_clients.rest import REST from src.node.docker_mananger import DockerManager from src.env_vars import DOCKER_LOG_DIR +from src.node.node_vars import nomos_nodes +from src.test_data import LOG_ERROR_KEYWORDS logger = get_custom_logger(__name__) @@ -19,13 +23,125 @@ def sanitize_docker_flags(input_flags): class NomosNode: - def __init__(self, docker_image, docker_log_prefix=""): - self._image_name = docker_image - self._log_path = os.path.join(DOCKER_LOG_DIR, f"{docker_log_prefix}__{self._image_name.replace('/', '_')}.log") + def __init__(self, node_type, container_name=""): + logger.debug(f"Node is going to be initialized with this config {nomos_nodes[node_type]}") + self._image_name = nomos_nodes[node_type]["image"] + self._internal_ports = nomos_nodes[node_type]["ports"] + self._volumes = nomos_nodes[node_type]["volumes"] + self._entrypoint = nomos_nodes[node_type]["entrypoint"] + + self._log_path = os.path.join(DOCKER_LOG_DIR, f"{container_name}__{self._image_name.replace('/', '_')}.log") self._docker_manager = DockerManager(self._image_name) + self._container_name = container_name self._container = None + + cwd = os.getcwd() + for i, volume in enumerate(self._volumes): + self._volumes[i] = cwd + "/" + volume + logger.debug(f"NomosNode instance initialized with log path {self._log_path}") @retry(stop=stop_after_delay(60), wait=wait_fixed(0.1), reraise=True) - def start(self, wait_for_node_sec=20, **kwargs): + def start(self, wait_for_node_sec=120, **kwargs): logger.debug("Starting Node...") + self._docker_manager.create_network() + self._ext_ip = self._docker_manager.generate_random_ext_ip() + + number_of_ports = len(self._internal_ports) + self._port_map = {} + + if number_of_ports > 0: + self._external_ports = self._docker_manager.generate_ports(count=number_of_ports) + self._udp_port = self._external_ports[0] + self._tcp_port = self._external_ports[1] + self._api = REST(self._tcp_port) + + logger.debug(f"Internal ports {self._internal_ports}") + + for i, port in enumerate(self._internal_ports): + self._port_map[port] = int(self._external_ports[i]) + + default_args = {} + + logger.debug(f"Using volumes {self._volumes}") + + logger.debug(f"Port map {self._port_map}") + + self._container = self._docker_manager.start_container( + self._docker_manager.image, + port_bindings=self._port_map, + args=default_args, + log_path=self._log_path, + volumes=self._volumes, + entrypoint=self._entrypoint, + remove_container=True, + name=self._container_name, + ) + + logger.debug(f"Started container from image {self._image_name}. " f"REST: {getattr(self, '_tcp_port', 'N/A')}") + + DS.nomos_nodes.append(self) + + @retry(stop=stop_after_delay(5), wait=wait_fixed(0.1), reraise=True) + def stop(self): + if self._container: + logger.debug(f"Stopping container with id {self._container.short_id}") + self._container.stop() + try: + self._container.remove() + except: + pass + self._container = None + logger.debug("Container stopped.") + + @retry(stop=stop_after_delay(5), wait=wait_fixed(0.1), reraise=True) + def kill(self): + if self._container: + logger.debug(f"Killing container with id {self._container.short_id}") + self._container.kill() + try: + self._container.remove() + except: + pass + self._container = None + logger.debug("Container killed.") + + def restart(self): + if self._container: + logger.debug(f"Restarting container with id {self._container.short_id}") + self._container.restart() + + def pause(self): + if self._container: + logger.debug(f"Pausing container with id {self._container.short_id}") + self._container.pause() + + def unpause(self): + if self._container: + logger.debug(f"Unpause container with id {self._container.short_id}") + self._container.unpause() + + def ensure_ready(self, timeout_duration=10): + @retry(stop=stop_after_delay(timeout_duration), wait=wait_fixed(0.1), reraise=True) + def check_ready(node=self): + node.info_response = node.info() + logger.info("REST service is ready !!") + + if self.is_nomos(): + check_ready() + + def is_nomos(self): + return "nomos" in self._container_name + + def info(self): + return self._api.info() + + def check_nomos_log_errors(self, whitelist=None): + keywords = LOG_ERROR_KEYWORDS + + # If a whitelist is provided, remove those keywords from the keywords list + if whitelist: + keywords = [keyword for keyword in keywords if keyword not in whitelist] + + matches = self._docker_manager.search_log_for_keywords(self._log_path, keywords, False) + assert not matches, f"Found errors {matches}" diff --git a/src/test_data.py b/src/test_data.py new file mode 100644 index 0000000..e4d7665 --- /dev/null +++ b/src/test_data.py @@ -0,0 +1,26 @@ +from time import time +from datetime import datetime, timedelta + +LOG_ERROR_KEYWORDS = [ + "crash", + "fatal", + "panic", + "abort", + "segfault", + "corrupt", + "terminated", + "unhandled", + "stacktrace", + "deadlock", + "SIGSEGV", + "SIGABRT", + "stack overflow", + "index out of bounds", + "nil pointer dereference", + "goroutine exit", + "nil pointer", + "runtime error", + "goexit", + "race condition", + "double free", +] diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 0000000..66d83d4 --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,92 @@ +# -*- coding: utf-8 -*- +import inspect +import glob +from src.libs.custom_logger import get_custom_logger +import os +import pytest +from datetime import datetime +from time import time +from uuid import uuid4 +from src.libs.common import attach_allure_file +import src.env_vars as env_vars +from src.data_storage import DS + +logger = get_custom_logger(__name__) + + +# See https://docs.pytest.org/en/latest/example/simple.html#making-test-result-information-available-in-fixtures +@pytest.hookimpl(hookwrapper=True, tryfirst=True) +def pytest_runtest_makereport(item): + outcome = yield + rep = outcome.get_result() + if rep.when == "call": + setattr(item, "rep_call", rep) + return rep + return None + + +@pytest.fixture(scope="session", autouse=True) +def set_allure_env_variables(): + yield + if os.path.isdir("allure-results") and not os.path.isfile(os.path.join("allure-results", "environment.properties")): + logger.debug(f"Running fixture teardown: {inspect.currentframe().f_code.co_name}") + with open(os.path.join("allure-results", "environment.properties"), "w") as outfile: + for attribute_name in dir(env_vars): + if attribute_name.isupper(): + attribute_value = getattr(env_vars, attribute_name) + outfile.write(f"{attribute_name}={attribute_value}\n") + + +@pytest.fixture(scope="function", autouse=True) +def test_id(request): + # setting up an unique test id to be used where needed + logger.debug(f"Running fixture setup: {inspect.currentframe().f_code.co_name}") + request.cls.test_id = f"{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}__{str(uuid4())}" + + +@pytest.fixture(scope="function", autouse=True) +def test_setup(request, test_id): + logger.debug(f"Running test: {request.node.name} with id: {request.cls.test_id}") + yield + logger.debug(f"Running fixture teardown: {inspect.currentframe().f_code.co_name}") + for file in glob.glob(os.path.join(env_vars.DOCKER_LOG_DIR, "*")): + if os.path.getmtime(file) < time() - 3600: + logger.debug(f"Deleting old log file: {file}") + try: + os.remove(file) + except: + logger.error("Could not delete file") + + +@pytest.fixture(scope="function", autouse=True) +def attach_logs_on_fail(request): + yield + if env_vars.RUNNING_IN_CI and hasattr(request.node, "rep_call") and request.node.rep_call.failed: + logger.debug(f"Running fixture teardown: {inspect.currentframe().f_code.co_name}") + logger.debug("Test failed, attempting to attach logs to the allure reports") + for file in glob.glob(os.path.join(env_vars.DOCKER_LOG_DIR, "*" + request.cls.test_id + "*")): + attach_allure_file(file) + + +@pytest.fixture(scope="function", autouse=True) +def close_open_nodes(attach_logs_on_fail): + DS.nomos_nodes = [] + yield + logger.debug(f"Running fixture teardown: {inspect.currentframe().f_code.co_name}") + crashed_containers = [] + for node in DS.nomos_nodes: + try: + node.stop() + except Exception as ex: + if "No such container" in str(ex): + crashed_containers.append(node.image) + logger.error(f"Failed to stop container because of error {ex}") + assert not crashed_containers, f"Containers {crashed_containers} crashed during the test!!!" + + +@pytest.fixture(scope="function", autouse=True) +def check_nomos_log_errors(): + yield + logger.debug(f"Running fixture teardown: {inspect.currentframe().f_code.co_name}") + for node in DS.nomos_nodes: + node.check_nomos_log_errors() diff --git a/tests/data_integrity/__init__.py b/tests/data_integrity/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/e2e/__init__.py b/tests/e2e/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/e2e/test_2node_alive.py b/tests/e2e/test_2node_alive.py new file mode 100644 index 0000000..9553332 --- /dev/null +++ b/tests/e2e/test_2node_alive.py @@ -0,0 +1,24 @@ +from src.env_vars import CFGSYNC, NOMOS, NOMOS_EXECUTOR +from src.libs.custom_logger import get_custom_logger +from src.node.nomos_node import NomosNode + +logger = get_custom_logger(__name__) + + +class Test2NodeClAlive: + def test_cluster_start(self): + + self.node1 = NomosNode(CFGSYNC, "cfgsync") + self.node2 = NomosNode(NOMOS, "nomos_node_0") + self.node3 = NomosNode(NOMOS_EXECUTOR, "nomos_node_1") + + self.node1.start() + self.node2.start() + self.node3.start() + + try: + self.node2.ensure_ready() + self.node3.ensure_ready() + except Exception as ex: + logger.error(f"REST service did not become ready in time: {ex}") + raise