From 3491b066f7ac546b3a42171e0ccf7b484e5a7ba1 Mon Sep 17 00:00:00 2001
From: Roman
Date: Thu, 26 Jun 2025 15:19:15 +0800
Subject: [PATCH 1/5] test: initial TF-IDF based log parsing

---
 requirements.txt       |  5 +++-
 src/node/nomos_node.py | 19 +++-------
 src/tfidf/__init__.py  |  0
 src/tfidf/tfidf.py     | 72 ++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 81 insertions(+), 15 deletions(-)
 create mode 100644 src/tfidf/__init__.py
 create mode 100644 src/tfidf/tfidf.py

diff --git a/requirements.txt b/requirements.txt
index 1b260ef..a9ef857 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -34,6 +34,7 @@ pytest-dependency==0.6.0
 PyYAML==6.0.1
 requests==2.31.0
 ruamel.yaml==0.17.21
+scikit-learn~=1.6.1
 setuptools==70.0.0
 tenacity==8.2.3
 typeguard==4.1.5
@@ -43,4 +44,6 @@ urllib3==2.2.2
 virtualenv==20.25.0
 Jinja2~=3.1.5
 psutil~=7.0.0
-pytest-shard==0.1.2
\ No newline at end of file
+pytest-shard==0.1.2
+pandas~=2.3.0
+nltk~=3.9.1
\ No newline at end of file
diff --git a/src/node/nomos_node.py b/src/node/nomos_node.py
index a65f0e4..663f09b 100644
--- a/src/node/nomos_node.py
+++ b/src/node/nomos_node.py
@@ -12,6 +12,7 @@ from src.docker_manager import DockerManager, stop, kill
 from src.env_vars import DOCKER_LOG_DIR
 from src.node.node_vars import nomos_nodes
 from src.test_data import LOG_ERROR_KEYWORDS
+from src.tfidf.tfidf import LogTfidf
 
 logger = get_custom_logger(__name__)
 
@@ -146,22 +147,12 @@ class NomosNode:
             return internal_port.replace("/tcp", "")
         return None
 
-    def check_nomos_log_errors(self, whitelist=None):
+    def check_nomos_log_errors(self):
         keywords = LOG_ERROR_KEYWORDS
 
-        # If a whitelist is provided, remove those keywords from the keywords list
-        if whitelist:
-            keywords = [keyword for keyword in keywords if keyword not in whitelist]
-
-        matches_found = self._docker_manager.search_log_for_keywords(self._log_path, keywords, False)
-
-        logger.info(f"Printing log matches for {self.name()}")
-        if matches_found:
-            for keyword, log_lines in matches_found.items():
-                for line in log_lines:
-                    logger.debug(f"Log line matching keyword '{keyword}': {line}")
-        else:
-            logger.debug("No keyword matches found in the logs.")
+        logger.debug(f"Parsing log for node {self.name()}")
+        log_tfidf = LogTfidf()
+        log_tfidf.parse_log(self._log_path, f"{self._log_path}.parsed", keywords, True)
 
     def extract_config(self, target_file):
         # Copy the config file from first node
diff --git a/src/tfidf/__init__.py b/src/tfidf/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/tfidf/tfidf.py b/src/tfidf/tfidf.py
new file mode 100644
index 0000000..829c8db
--- /dev/null
+++ b/src/tfidf/tfidf.py
@@ -0,0 +1,72 @@
+import re
+
+import sklearn.feature_extraction.text as ext
+import pandas as pd
+from nltk import word_tokenize
+from nltk.stem.porter import PorterStemmer
+from sklearn.feature_extraction._stop_words import ENGLISH_STOP_WORDS
+
+
+def normalize_log_message(text):
+    # Remove timestamps (e.g., "2023-10-01 12:34:56")
+    text = re.sub(r"\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}", "", text)
+    # Remove numeric IDs (e.g., "user123", "session456")
+    text = re.sub(r"\b\w*\d+\w*\b", "", text)
+    return " ".join(text.split())
+
+
+class LogTfidf:
+    def __init__(self):
+        self.stemmer = PorterStemmer()
+        self.stop_words = self._generate_stop_words()
+
+    def _generate_stop_words(self):
+        stop_words = [self.stemmer.stem(word) for word in ENGLISH_STOP_WORDS if word.isalpha()]
+        # Add the stemmed tokens that scikit-learn's stop-word consistency warning reports as missing
+        stop_words.extend(["anywh", "becau", "el", "elsewh", "everywh", "ind", "otherwi", "plea", "somewh"])
+        return stop_words
+
+    def get_stemmed_tokens(self, tokens):
+        return [self.stemmer.stem(token) for token in tokens if token.isalpha()]
+
+    def get_tokens(self, text):
+        tokens = word_tokenize(text.lower())
+        return self.get_stemmed_tokens(tokens)
+
+    def parse_log(self, input_file, output_file, keywords, print_to_stdout=True):
+        vectorizer = ext.CountVectorizer(tokenizer=self.get_tokens, stop_words=self.stop_words, token_pattern=None)
+        with open(input_file, "r") as file:
+            lines = [line.rstrip() for line in file]
+            line_nos = dict(zip(range(1, len(lines) + 1), lines))
+            doc_matrix = vectorizer.fit_transform(lines)
+
+        tf_idf_transformer = ext.TfidfTransformer().fit(doc_matrix)
+        sparse = tf_idf_transformer.transform(doc_matrix).toarray()
+
+        per_line_score = []
+        for row in sparse:
+            nonzero_count = len(row.nonzero()[0])
+            score = row.sum() / nonzero_count if nonzero_count > 0 else 0
+            per_line_score.append(score)
+
+        line_scores = dict(zip(range(1, len(lines) + 1), per_line_score))
+
+        # Filter by keywords and sort according to rarity
+        df = pd.DataFrame([line_nos, line_scores]).T
+        df.columns = ["d1", "d2"]  # d1: log line text, d2: mean TF-IDF score
+        df = df.sort_values(by="d2", ascending=False)
+        pattern = "|".join(keywords)
+        df = df[~((df["d1"].str.contains("INFO")) & (~df["d1"].str.contains(pattern)))]
+
+        # Normalize and deduplicate
+        df["d1_normalized"] = df["d1"].apply(normalize_log_message)
+        df = df.drop_duplicates(subset="d1_normalized", keep="first")
+        df = df.drop(columns="d1_normalized")
+
+        with open(output_file, "w") as out_file:
+            for index, row in df.iterrows():
+                line = "{0}\n"
+                line = line.format(row["d1"])
+                out_file.write(line)
+                if print_to_stdout:
+                    print(line)
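The score computed in `parse_log` is the mean of a line's nonzero TF-IDF weights (`row.sum() / nonzero_count`). A minimal, self-contained sketch of that statistic on invented log lines, using sklearn's default tokenizer rather than the stemming tokenizer the class plugs in:

```python
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

# Invented stand-ins for node log lines, not real Nomos output
lines = [
    "INFO connection established with peer",
    "INFO connection established with peer",
    "INFO connection established with peer",
    "ERROR node panicked",
]

counts = CountVectorizer().fit_transform(lines)
weights = TfidfTransformer().fit_transform(counts).toarray()

for text, row in zip(lines, weights):
    nonzero = row[row > 0]
    score = nonzero.mean() if nonzero.size else 0.0  # the statistic parse_log uses
    print(f"{score:.3f}  {text}")
```

The three repeated boilerplate lines share one score while the distinct ERROR line gets a higher one, which is what the descending sort on `d2` in `parse_log` exploits.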
"ind", "otherwi", "plea", "somewh"]) + return stop_words + + def get_stemmed_tokens(self, tokens): + return [self.stemmer.stem(token) for token in tokens if token.isalpha()] + + def get_tokens(self, text): + tokens = word_tokenize(text.lower()) + return self.get_stemmed_tokens(tokens) + + def parse_log(self, input_file, output_file, keywords, print_to_stdout=True): + vectorizer = ext.CountVectorizer(tokenizer=self.get_tokens, stop_words=self.stop_words, token_pattern=None) + with open(input_file, "r") as file: + lines = [line.rstrip() for line in file] + line_nos = dict(zip(range(1, len(lines)), lines)) + doc_matrix = vectorizer.fit_transform(lines) + + tf_idf_transformer = ext.TfidfTransformer().fit(doc_matrix) + sparse = tf_idf_transformer.transform(doc_matrix).toarray() + + per_line_score = [] + for row in sparse: + nonzero_count = len(row.nonzero()[0]) + score = row.sum() / nonzero_count if nonzero_count > 0 else 0 + per_line_score.append(score) + + line_scores = dict(zip(range(1, len(lines)), per_line_score)) + + # Filter by keywords and sort according to rarity + df = pd.DataFrame([line_nos, line_scores]).T + df.columns = ["d1", "d2"] # Simplified column naming for clarity + df = df.sort_values(by="d2", ascending=False) + pattern = "|".join(keywords) + df = df[~((df["d1"].str.contains("INFO")) & (~df["d1"].str.contains(pattern)))] + + # Normalize and deduplicate + df["d1_normalized"] = df["d1"].apply(normalize_log_message) + df = df.drop_duplicates(subset="d1_normalized", keep="first") + df = df.drop(columns="d1_normalized") + + with open(output_file, "w") as out_file: + for index, row in df.iterrows(): + line = "{0}\n" + line = line.format(row["d1"]) + out_file.write(line) + if print_to_stdout: + print(line) From 9a95908fb7feb3c92ed20eba8d407cd049263353 Mon Sep 17 00:00:00 2001 From: Roman Date: Thu, 26 Jun 2025 16:06:55 +0800 Subject: [PATCH 2/5] fix: git ignore MacOs files --- .gitignore | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.gitignore b/.gitignore index ff4e95e..6c1f16b 100644 --- a/.gitignore +++ b/.gitignore @@ -105,6 +105,9 @@ dmypy.json # Pyre type checker .pyre/ +# Apple +.DS_Store + log/ kzgrs/ cluster_config/cfgsync.yaml \ No newline at end of file From 71a481c27fa27dd72a90dc7ab4e7875ed6a651c5 Mon Sep 17 00:00:00 2001 From: Roman Date: Fri, 27 Jun 2025 09:52:03 +0800 Subject: [PATCH 3/5] fix: skip DEBUG and TRACE lines without matching keywords --- src/tfidf/tfidf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/tfidf/tfidf.py b/src/tfidf/tfidf.py index 829c8db..71b4ac2 100644 --- a/src/tfidf/tfidf.py +++ b/src/tfidf/tfidf.py @@ -56,7 +56,7 @@ class LogTfidf: df.columns = ["d1", "d2"] # Simplified column naming for clarity df = df.sort_values(by="d2", ascending=False) pattern = "|".join(keywords) - df = df[~((df["d1"].str.contains("INFO")) & (~df["d1"].str.contains(pattern)))] + df = df[~((df["d1"].str.contains("INFO|DEBUG|TRACE")) & (~df["d1"].str.contains(pattern)))] # Normalize and deduplicate df["d1_normalized"] = df["d1"].apply(normalize_log_message) From c55e4ff838e3c20d2be496fc625e6d6c1668b715 Mon Sep 17 00:00:00 2001 From: Roman Date: Fri, 27 Jun 2025 10:23:17 +0800 Subject: [PATCH 4/5] fix: add nltk resources download --- README.md | 31 ++++++++++++++++++++++++++++--- download_nltk_resources.py | 10 ++++++++++ 2 files changed, 38 insertions(+), 3 deletions(-) create mode 100644 download_nltk_resources.py diff --git a/README.md b/README.md index a01d94a..ec63024 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # 
From c55e4ff838e3c20d2be496fc625e6d6c1668b715 Mon Sep 17 00:00:00 2001
From: Roman
Date: Fri, 27 Jun 2025 10:23:17 +0800
Subject: [PATCH 4/5] fix: add nltk resources download

---
 README.md                  | 31 ++++++++++++++++++++++++++++---
 download_nltk_resources.py | 10 ++++++++++
 2 files changed, 38 insertions(+), 3 deletions(-)
 create mode 100644 download_nltk_resources.py

diff --git a/README.md b/README.md
index a01d94a..ec63024 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,6 @@
 # nomos-e2e-tests
 
-Nomos e2e framework used to test various implementations of the Nomos node.
+Nomos E2E framework used to test various implementations of the Nomos node.
 
 ## Setup and contribute
 
@@ -14,14 +14,39 @@
 mkdir -p kzgrs
 wget https://raw.githubusercontent.com/logos-co/nomos-node/master/tests/kzgrs/kzgrs_test_params -O kzgrs/kzgrs_test_params
 pre-commit install
 (optional) Overwrite default vars from src/env_vars.py via env vars or by adding a .env file
+(optional) python download_nltk_resources.py # Used when CHECK_LOG_ERRORS=True
 pytest
 ```
 
-Set optional environment variable to search logs for errors after each tests:
+### Additional instructions for dispersal resilience tests
+
+1. Build prerequisites
+```sh
+git clone https://github.com/logos-co/nomos-security-tests.git
+cd nomos-security-tests
+git fetch; git switch test-dispersal-resilience
+
+git checkout d8bbc464420ef86337df963c64ac2f7c3fd97008
+docker build --no-cache -f testnet/Dockerfile.debug -t nomos-mod-da-d8bbc46:testnet .
+# (x86_64) docker build --no-cache -f testnet/Dockerfile -t nomos-mod-da-d8bbc46:testnet .
+
+git checkout d19a1f3d8c80f654e6cf6139641519f16fe670ec
+docker build --no-cache -f testnet/Dockerfile.debug -t nomos-executor-mod-da-d19a1f3:testnet .
+
+git checkout 7f54114b6c320dc32577b0e8bb85c2d543b4bd56
+docker build --no-cache -f testnet/Dockerfile.debug -t nomos-executor-mod-da-7f54114:testnet .
+
+git checkout 4a58376ac4956d87502b9fd72b64a756396f2a8d
+docker build --no-cache -f testnet/Dockerfile.debug -t nomos-executor-mod-da-4a58376:testnet .
+```
+
+2. Run tests with `pytest --run-with-mod-da-node tests/dispersal_resilience/test_dispersal_resilience.py`
+
+### Enable node log search with an environment variable
 
 ```shell
 export CHECK_LOG_ERRORS=True
 ```
-
 ## License
diff --git a/download_nltk_resources.py b/download_nltk_resources.py
new file mode 100644
index 0000000..50c6221
--- /dev/null
+++ b/download_nltk_resources.py
@@ -0,0 +1,10 @@
+import nltk
+
+
+def main():
+    nltk.download("punkt")
+    nltk.download("punkt_tab")
+
+
+if __name__ == "__main__":
+    main()
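`word_tokenize` depends on the punkt models that `download_nltk_resources.py` fetches into the local nltk data directory. Once they are present, the tokenize-and-stem pipeline behind `LogTfidf.get_tokens` can be checked in isolation (the input string below is illustrative):

```python
from nltk import word_tokenize
from nltk.stem.porter import PorterStemmer

stemmer = PorterStemmer()
tokens = word_tokenize("Connection refused: retrying peers in 5s".lower())
# Non-alphabetic tokens such as ":" and "5s" are dropped, the rest are stemmed
print([stemmer.stem(token) for token in tokens if token.isalpha()])
# e.g. ['connect', 'refus', 'retri', 'peer', 'in']
```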
From 62a2a3c3f766bb4e467fa3522457e9a1f2cb01d2 Mon Sep 17 00:00:00 2001
From: Roman
Date: Fri, 27 Jun 2025 16:31:30 +0800
Subject: [PATCH 5/5] fix: disable parsed log output to file

---
 src/node/nomos_node.py |  2 +-
 src/tfidf/tfidf.py     | 20 ++++++++++++--------
 2 files changed, 13 insertions(+), 9 deletions(-)

diff --git a/src/node/nomos_node.py b/src/node/nomos_node.py
index 663f09b..e2646a9 100644
--- a/src/node/nomos_node.py
+++ b/src/node/nomos_node.py
@@ -152,7 +152,7 @@ class NomosNode:
 
         logger.debug(f"Parsing log for node {self.name()}")
         log_tfidf = LogTfidf()
-        log_tfidf.parse_log(self._log_path, f"{self._log_path}.parsed", keywords, True)
+        log_tfidf.parse_log(self._log_path, keywords, None)
 
     def extract_config(self, target_file):
         # Copy the config file from first node
diff --git a/src/tfidf/tfidf.py b/src/tfidf/tfidf.py
index 71b4ac2..cf41bbe 100644
--- a/src/tfidf/tfidf.py
+++ b/src/tfidf/tfidf.py
@@ -15,6 +15,16 @@ def normalize_log_message(text):
     return " ".join(text.split())
 
 
+def write_output(df, output_file=None):
+    lines = df["d1"].astype(str) + "\n"
+
+    if output_file:
+        with open(output_file, "w") as out_file:
+            out_file.writelines(lines)
+
+    print("".join(lines), end="")
+
+
 class LogTfidf:
     def __init__(self):
         self.stemmer = PorterStemmer()
@@ -33,7 +43,7 @@ class LogTfidf:
         tokens = word_tokenize(text.lower())
         return self.get_stemmed_tokens(tokens)
 
-    def parse_log(self, input_file, output_file, keywords, print_to_stdout=True):
+    def parse_log(self, input_file, keywords, output_file=None):
         vectorizer = ext.CountVectorizer(tokenizer=self.get_tokens, stop_words=self.stop_words, token_pattern=None)
         with open(input_file, "r") as file:
             lines = [line.rstrip() for line in file]
@@ -63,10 +73,4 @@ class LogTfidf:
         df = df.drop_duplicates(subset="d1_normalized", keep="first")
         df = df.drop(columns="d1_normalized")
 
-        with open(output_file, "w") as out_file:
-            for index, row in df.iterrows():
-                line = "{0}\n"
-                line = line.format(row["d1"])
-                out_file.write(line)
-                if print_to_stdout:
-                    print(line)
+        write_output(df, output_file)
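After this change `output_file` is optional: passing `None` (as `check_nomos_log_errors` now does) prints the ranked, deduplicated lines to stdout only, while a path additionally writes them to disk. A hypothetical invocation, with placeholder file names:

```python
from src.test_data import LOG_ERROR_KEYWORDS
from src.tfidf.tfidf import LogTfidf

log_tfidf = LogTfidf()
log_tfidf.parse_log("nomos_node_0.log", LOG_ERROR_KEYWORDS, None)  # stdout only
log_tfidf.parse_log("nomos_node_0.log", LOG_ERROR_KEYWORDS, "nomos_node_0.log.parsed")  # also write a file
```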