From 3491b066f7ac546b3a42171e0ccf7b484e5a7ba1 Mon Sep 17 00:00:00 2001
From: Roman
Date: Thu, 26 Jun 2025 15:19:15 +0800
Subject: [PATCH 1/5] test: initial TF-IDF based log parsing

---
 requirements.txt       |  5 +++-
 src/node/nomos_node.py | 19 +++-------
 src/tfidf/__init__.py  |  0
 src/tfidf/tfidf.py     | 72 ++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 81 insertions(+), 15 deletions(-)
 create mode 100644 src/tfidf/__init__.py
 create mode 100644 src/tfidf/tfidf.py

diff --git a/requirements.txt b/requirements.txt
index 1b260ef..a9ef857 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -34,6 +34,7 @@ pytest-dependency==0.6.0
 PyYAML==6.0.1
 requests==2.31.0
 ruamel.yaml==0.17.21
+scikit-learn~=1.6.1
 setuptools==70.0.0
 tenacity==8.2.3
 typeguard==4.1.5
@@ -43,4 +44,6 @@ urllib3==2.2.2
 virtualenv==20.25.0
 Jinja2~=3.1.5
 psutil~=7.0.0
-pytest-shard==0.1.2
\ No newline at end of file
+pytest-shard==0.1.2
+pandas~=2.3.0
+nltk~=3.9.1
\ No newline at end of file
diff --git a/src/node/nomos_node.py b/src/node/nomos_node.py
index a65f0e4..663f09b 100644
--- a/src/node/nomos_node.py
+++ b/src/node/nomos_node.py
@@ -12,6 +12,7 @@ from src.docker_manager import DockerManager, stop, kill
 from src.env_vars import DOCKER_LOG_DIR
 from src.node.node_vars import nomos_nodes
 from src.test_data import LOG_ERROR_KEYWORDS
+from src.tfidf.tfidf import LogTfidf
 
 logger = get_custom_logger(__name__)
 
@@ -146,22 +147,12 @@ class NomosNode:
             return internal_port.replace("/tcp", "")
         return None
 
-    def check_nomos_log_errors(self, whitelist=None):
+    def check_nomos_log_errors(self):
         keywords = LOG_ERROR_KEYWORDS
 
-        # If a whitelist is provided, remove those keywords from the keywords list
-        if whitelist:
-            keywords = [keyword for keyword in keywords if keyword not in whitelist]
-
-        matches_found = self._docker_manager.search_log_for_keywords(self._log_path, keywords, False)
-
-        logger.info(f"Printing log matches for {self.name()}")
-        if matches_found:
-            for keyword, log_lines in matches_found.items():
-                for line in log_lines:
-                    logger.debug(f"Log line matching keyword '{keyword}': {line}")
-        else:
-            logger.debug("No keyword matches found in the logs.")
+        logger.debug(f"Parsing log for node {self.name()}")
+        log_tfidf = LogTfidf()
+        log_tfidf.parse_log(self._log_path, f"{self._log_path}.parsed", keywords, True)
 
     def extract_config(self, target_file):
         # Copy the config file from first node
diff --git a/src/tfidf/__init__.py b/src/tfidf/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/tfidf/tfidf.py b/src/tfidf/tfidf.py
new file mode 100644
index 0000000..829c8db
--- /dev/null
+++ b/src/tfidf/tfidf.py
@@ -0,0 +1,72 @@
+import re
+
+import sklearn.feature_extraction.text as ext
+import pandas as pd
+from nltk import word_tokenize
+from nltk.stem.porter import PorterStemmer
+from sklearn.feature_extraction._stop_words import ENGLISH_STOP_WORDS
+
+
+def normalize_log_message(text):
+    # Remove timestamps (e.g., "2023-10-01 12:34:56")
+    text = re.sub(r"\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}", "", text)
+    # Remove numeric IDs (e.g., "user123", "session456")
+    text = re.sub(r"\b\w*\d+\w*\b", "", text)
+    return " ".join(text.split())
+
+
+class LogTfidf:
+    def __init__(self):
+        self.stemmer = PorterStemmer()
+        self.stop_words = self._generate_stop_words()
+
+    def _generate_stop_words(self):
+        stop_words = [self.stemmer.stem(word) for word in ENGLISH_STOP_WORDS if word.isalpha()]
+        # Add the stemmed tokens that scikit-learn's stop-word consistency warning reports as missing
+        stop_words.extend(["anywh", "becau", "el", "elsewh", "everywh", "ind", "otherwi", "plea", "somewh"])
+        return stop_words
+
+    def get_stemmed_tokens(self, tokens):
+        return [self.stemmer.stem(token) for token in tokens if token.isalpha()]
+
+    def get_tokens(self, text):
+        tokens = word_tokenize(text.lower())
+        return self.get_stemmed_tokens(tokens)
+
+    def parse_log(self, input_file, output_file, keywords, print_to_stdout=True):
+        vectorizer = ext.CountVectorizer(tokenizer=self.get_tokens, stop_words=self.stop_words, token_pattern=None)
+        with open(input_file, "r") as file:
+            lines = [line.rstrip() for line in file]
+            line_nos = dict(zip(range(1, len(lines) + 1), lines))
+            doc_matrix = vectorizer.fit_transform(lines)
+
+        tf_idf_transformer = ext.TfidfTransformer().fit(doc_matrix)
+        sparse = tf_idf_transformer.transform(doc_matrix).toarray()
+
+        per_line_score = []
+        for row in sparse:
+            nonzero_count = len(row.nonzero()[0])
+            score = row.sum() / nonzero_count if nonzero_count > 0 else 0
+            per_line_score.append(score)
+
+        line_scores = dict(zip(range(1, len(lines) + 1), per_line_score))
+
+        # Filter by keywords and sort according to rarity
+        df = pd.DataFrame([line_nos, line_scores]).T
+        df.columns = ["d1", "d2"]  # d1: log line text, d2: mean TF-IDF score
+        df = df.sort_values(by="d2", ascending=False)
+        pattern = "|".join(keywords)
+        df = df[~((df["d1"].str.contains("INFO")) & (~df["d1"].str.contains(pattern)))]
+
+        # Normalize and deduplicate
+        df["d1_normalized"] = df["d1"].apply(normalize_log_message)
+        df = df.drop_duplicates(subset="d1_normalized", keep="first")
+        df = df.drop(columns="d1_normalized")
+
+        with open(output_file, "w") as out_file:
+            for index, row in df.iterrows():
+                line = "{0}\n"
+                line = line.format(row["d1"])
+                out_file.write(line)
+                if print_to_stdout:
+                    print(line)
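The score computed in `parse_log` is the mean of a line's nonzero TF-IDF weights (`row.sum() / nonzero_count`). A minimal, self-contained sketch of that statistic on invented log lines, using sklearn's default tokenizer rather than the stemming tokenizer the class plugs in:

```python
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

# Invented stand-ins for node log lines, not real Nomos output
lines = [
    "INFO connection established with peer",
    "INFO connection established with peer",
    "INFO connection established with peer",
    "ERROR node panicked",
]

counts = CountVectorizer().fit_transform(lines)
weights = TfidfTransformer().fit_transform(counts).toarray()

for text, row in zip(lines, weights):
    nonzero = row[row > 0]
    score = nonzero.mean() if nonzero.size else 0.0  # the statistic parse_log uses
    print(f"{score:.3f}  {text}")
```

The three repeated boilerplate lines share one score while the distinct ERROR line gets a higher one, which is what the descending sort on `d2` in `parse_log` exploits.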
"ind", "otherwi", "plea", "somewh"]) + return stop_words + + def get_stemmed_tokens(self, tokens): + return [self.stemmer.stem(token) for token in tokens if token.isalpha()] + + def get_tokens(self, text): + tokens = word_tokenize(text.lower()) + return self.get_stemmed_tokens(tokens) + + def parse_log(self, input_file, output_file, keywords, print_to_stdout=True): + vectorizer = ext.CountVectorizer(tokenizer=self.get_tokens, stop_words=self.stop_words, token_pattern=None) + with open(input_file, "r") as file: + lines = [line.rstrip() for line in file] + line_nos = dict(zip(range(1, len(lines)), lines)) + doc_matrix = vectorizer.fit_transform(lines) + + tf_idf_transformer = ext.TfidfTransformer().fit(doc_matrix) + sparse = tf_idf_transformer.transform(doc_matrix).toarray() + + per_line_score = [] + for row in sparse: + nonzero_count = len(row.nonzero()[0]) + score = row.sum() / nonzero_count if nonzero_count > 0 else 0 + per_line_score.append(score) + + line_scores = dict(zip(range(1, len(lines)), per_line_score)) + + # Filter by keywords and sort according to rarity + df = pd.DataFrame([line_nos, line_scores]).T + df.columns = ["d1", "d2"] # Simplified column naming for clarity + df = df.sort_values(by="d2", ascending=False) + pattern = "|".join(keywords) + df = df[~((df["d1"].str.contains("INFO")) & (~df["d1"].str.contains(pattern)))] + + # Normalize and deduplicate + df["d1_normalized"] = df["d1"].apply(normalize_log_message) + df = df.drop_duplicates(subset="d1_normalized", keep="first") + df = df.drop(columns="d1_normalized") + + with open(output_file, "w") as out_file: + for index, row in df.iterrows(): + line = "{0}\n" + line = line.format(row["d1"]) + out_file.write(line) + if print_to_stdout: + print(line) From 9a95908fb7feb3c92ed20eba8d407cd049263353 Mon Sep 17 00:00:00 2001 From: Roman Date: Thu, 26 Jun 2025 16:06:55 +0800 Subject: [PATCH 2/5] fix: git ignore MacOs files --- .gitignore | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.gitignore b/.gitignore index ff4e95e..6c1f16b 100644 --- a/.gitignore +++ b/.gitignore @@ -105,6 +105,9 @@ dmypy.json # Pyre type checker .pyre/ +# Apple +.DS_Store + log/ kzgrs/ cluster_config/cfgsync.yaml \ No newline at end of file From 71a481c27fa27dd72a90dc7ab4e7875ed6a651c5 Mon Sep 17 00:00:00 2001 From: Roman Date: Fri, 27 Jun 2025 09:52:03 +0800 Subject: [PATCH 3/5] fix: skip DEBUG and TRACE lines without matching keywords --- src/tfidf/tfidf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/tfidf/tfidf.py b/src/tfidf/tfidf.py index 829c8db..71b4ac2 100644 --- a/src/tfidf/tfidf.py +++ b/src/tfidf/tfidf.py @@ -56,7 +56,7 @@ class LogTfidf: df.columns = ["d1", "d2"] # Simplified column naming for clarity df = df.sort_values(by="d2", ascending=False) pattern = "|".join(keywords) - df = df[~((df["d1"].str.contains("INFO")) & (~df["d1"].str.contains(pattern)))] + df = df[~((df["d1"].str.contains("INFO|DEBUG|TRACE")) & (~df["d1"].str.contains(pattern)))] # Normalize and deduplicate df["d1_normalized"] = df["d1"].apply(normalize_log_message) From c55e4ff838e3c20d2be496fc625e6d6c1668b715 Mon Sep 17 00:00:00 2001 From: Roman Date: Fri, 27 Jun 2025 10:23:17 +0800 Subject: [PATCH 4/5] fix: add nltk resources download --- README.md | 31 ++++++++++++++++++++++++++++--- download_nltk_resources.py | 10 ++++++++++ 2 files changed, 38 insertions(+), 3 deletions(-) create mode 100644 download_nltk_resources.py diff --git a/README.md b/README.md index a01d94a..ec63024 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # 
From c55e4ff838e3c20d2be496fc625e6d6c1668b715 Mon Sep 17 00:00:00 2001
From: Roman
Date: Fri, 27 Jun 2025 10:23:17 +0800
Subject: [PATCH 4/5] fix: add nltk resources download

---
 README.md                  | 31 ++++++++++++++++++++++++++++---
 download_nltk_resources.py | 10 ++++++++++
 2 files changed, 38 insertions(+), 3 deletions(-)
 create mode 100644 download_nltk_resources.py

diff --git a/README.md b/README.md
index a01d94a..ec63024 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,6 @@
 # nomos-e2e-tests
 
-Nomos e2e framework used to test various implementations of the Nomos node.
+Nomos E2E framework used to test various implementations of the Nomos node.
 
 ## Setup and contribute
 
@@ -14,14 +14,39 @@
 mkdir -p kzgrs
 wget https://raw.githubusercontent.com/logos-co/nomos-node/master/tests/kzgrs/kzgrs_test_params -O kzgrs/kzgrs_test_params
 pre-commit install
 (optional) Overwrite default vars from src/env_vars.py via env vars or by adding a .env file
+(optional) python download_nltk_resources.py # Used when CHECK_LOG_ERRORS=True
 pytest
 ```
 
-Set optional environment variable to search logs for errors after each tests:
+### Additional instructions for dispersal resilience tests
+
+1. Build prerequisites
+```sh
+git clone https://github.com/logos-co/nomos-security-tests.git
+cd nomos-security-tests
+git fetch; git switch test-dispersal-resilience
+
+git checkout d8bbc464420ef86337df963c64ac2f7c3fd97008
+docker build --no-cache -f testnet/Dockerfile.debug -t nomos-mod-da-d8bbc46:testnet .
+# (x86_64) docker build --no-cache -f testnet/Dockerfile -t nomos-mod-da-d8bbc46:testnet .
+
+git checkout d19a1f3d8c80f654e6cf6139641519f16fe670ec
+docker build --no-cache -f testnet/Dockerfile.debug -t nomos-executor-mod-da-d19a1f3:testnet .
+
+git checkout 7f54114b6c320dc32577b0e8bb85c2d543b4bd56
+docker build --no-cache -f testnet/Dockerfile.debug -t nomos-executor-mod-da-7f54114:testnet .
+
+git checkout 4a58376ac4956d87502b9fd72b64a756396f2a8d
+docker build --no-cache -f testnet/Dockerfile.debug -t nomos-executor-mod-da-4a58376:testnet .
+```
+
+2. Run tests with `pytest --run-with-mod-da-node tests/dispersal_resilience/test_dispersal_resilience.py`
+
+### Enable node log search with an environment variable
 
 ```shell
 export CHECK_LOG_ERRORS=True
 ```
-
 ## License
diff --git a/download_nltk_resources.py b/download_nltk_resources.py
new file mode 100644
index 0000000..50c6221
--- /dev/null
+++ b/download_nltk_resources.py
@@ -0,0 +1,10 @@
+import nltk
+
+
+def main():
+    nltk.download("punkt")
+    nltk.download("punkt_tab")
+
+
+if __name__ == "__main__":
+    main()
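`word_tokenize` depends on the punkt models that `download_nltk_resources.py` fetches into the local nltk data directory. Once they are present, the tokenize-and-stem pipeline behind `LogTfidf.get_tokens` can be checked in isolation (the input string below is illustrative):

```python
from nltk import word_tokenize
from nltk.stem.porter import PorterStemmer

stemmer = PorterStemmer()
tokens = word_tokenize("Connection refused: retrying peers in 5s".lower())
# Non-alphabetic tokens such as ":" and "5s" are dropped, the rest are stemmed
print([stemmer.stem(token) for token in tokens if token.isalpha()])
# e.g. ['connect', 'refus', 'retri', 'peer', 'in']
```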
From 62a2a3c3f766bb4e467fa3522457e9a1f2cb01d2 Mon Sep 17 00:00:00 2001
From: Roman
Date: Fri, 27 Jun 2025 16:31:30 +0800
Subject: [PATCH 5/5] fix: disable parsed log output to file

---
 src/node/nomos_node.py |  2 +-
 src/tfidf/tfidf.py     | 20 ++++++++++++--------
 2 files changed, 13 insertions(+), 9 deletions(-)

diff --git a/src/node/nomos_node.py b/src/node/nomos_node.py
index 663f09b..e2646a9 100644
--- a/src/node/nomos_node.py
+++ b/src/node/nomos_node.py
@@ -152,7 +152,7 @@ class NomosNode:
 
         logger.debug(f"Parsing log for node {self.name()}")
         log_tfidf = LogTfidf()
-        log_tfidf.parse_log(self._log_path, f"{self._log_path}.parsed", keywords, True)
+        log_tfidf.parse_log(self._log_path, keywords, None)
 
     def extract_config(self, target_file):
         # Copy the config file from first node
diff --git a/src/tfidf/tfidf.py b/src/tfidf/tfidf.py
index 71b4ac2..cf41bbe 100644
--- a/src/tfidf/tfidf.py
+++ b/src/tfidf/tfidf.py
@@ -15,6 +15,16 @@ def normalize_log_message(text):
     return " ".join(text.split())
 
 
+def write_output(df, output_file=None):
+    lines = df["d1"].astype(str) + "\n"
+
+    if output_file:
+        with open(output_file, "w") as out_file:
+            out_file.writelines(lines)
+
+    print("".join(lines), end="")
+
+
 class LogTfidf:
     def __init__(self):
         self.stemmer = PorterStemmer()
@@ -33,7 +43,7 @@ class LogTfidf:
         tokens = word_tokenize(text.lower())
         return self.get_stemmed_tokens(tokens)
 
-    def parse_log(self, input_file, output_file, keywords, print_to_stdout=True):
+    def parse_log(self, input_file, keywords, output_file=None):
         vectorizer = ext.CountVectorizer(tokenizer=self.get_tokens, stop_words=self.stop_words, token_pattern=None)
         with open(input_file, "r") as file:
             lines = [line.rstrip() for line in file]
@@ -63,10 +73,4 @@ class LogTfidf:
         df = df.drop_duplicates(subset="d1_normalized", keep="first")
         df = df.drop(columns="d1_normalized")
 
-        with open(output_file, "w") as out_file:
-            for index, row in df.iterrows():
-                line = "{0}\n"
-                line = line.format(row["d1"])
-                out_file.write(line)
-                if print_to_stdout:
-                    print(line)
+        write_output(df, output_file)
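After this change `output_file` is optional: passing `None` (as `check_nomos_log_errors` now does) prints the ranked, deduplicated lines to stdout only, while a path additionally writes them to disk. A hypothetical invocation, with placeholder file names:

```python
from src.test_data import LOG_ERROR_KEYWORDS
from src.tfidf.tfidf import LogTfidf

log_tfidf = LogTfidf()
log_tfidf.parse_log("nomos_node_0.log", LOG_ERROR_KEYWORDS, None)  # stdout only
log_tfidf.parse_log("nomos_node_0.log", LOG_ERROR_KEYWORDS, "nomos_node_0.log.parsed")  # also write a file
```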