test: initial Tfidf based log parsing

Roman 2025-06-26 15:19:15 +08:00
parent ed65b67a31
commit 3491b066f7
No known key found for this signature in database
GPG Key ID: 583BDF43C238B83E
4 changed files with 82 additions and 15 deletions


@@ -34,6 +34,7 @@ pytest-dependency==0.6.0
PyYAML==6.0.1
requests==2.31.0
ruamel.yaml==0.17.21
scikit-learn~=1.6.1
setuptools==70.0.0
tenacity==8.2.3
typeguard==4.1.5
@@ -43,4 +44,7 @@ urllib3==2.2.2
virtualenv==20.25.0
Jinja2~=3.1.5
psutil~=7.0.0
pytest-shard==0.1.2
pytest-shard==0.1.2
learn~=1.0.0
pandas~=2.3.0
nltk~=3.9.1


@@ -12,6 +12,7 @@ from src.docker_manager import DockerManager, stop, kill
from src.env_vars import DOCKER_LOG_DIR
from src.node.node_vars import nomos_nodes
from src.test_data import LOG_ERROR_KEYWORDS
from src.tfidf.tfidf import LogTfidf
logger = get_custom_logger(__name__)
@@ -146,22 +147,12 @@ class NomosNode:
return internal_port.replace("/tcp", "")
return None
def check_nomos_log_errors(self, whitelist=None):
def check_nomos_log_errors(self):
keywords = LOG_ERROR_KEYWORDS
# If a whitelist is provided, remove those keywords from the keywords list
if whitelist:
keywords = [keyword for keyword in keywords if keyword not in whitelist]
matches_found = self._docker_manager.search_log_for_keywords(self._log_path, keywords, False)
logger.info(f"Printing log matches for {self.name()}")
if matches_found:
for keyword, log_lines in matches_found.items():
for line in log_lines:
logger.debug(f"Log line matching keyword '{keyword}': {line}")
else:
logger.debug("No keyword matches found in the logs.")
logger.debug(f"Parsing log for node {self.name()}")
log_tfidf = LogTfidf()
log_tfidf.parse_log(self._log_path, f"{self._log_path}.parsed", keywords, True)
def extract_config(self, target_file):
# Copy the config file from first node

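The new check_nomos_log_errors no longer greps and prints keyword matches itself; it hands the raw log to LogTfidf.parse_log (added below), which treats every log line as a document, scores it by the mean of its nonzero TF-IDF weights, sorts by that score, and drops INFO lines that match none of the error keywords. A minimal, self-contained sketch of just the scoring step, using hypothetical sample lines rather than a real node log:

import sklearn.feature_extraction.text as ext

# Hypothetical log lines standing in for a node's output
lines = [
    "INFO connection established",
    "INFO connection established",
    "ERROR failed to dial peer",
]

counts = ext.CountVectorizer().fit_transform(lines)
weights = ext.TfidfTransformer().fit_transform(counts).toarray()

# Per-line score: mean of the nonzero TF-IDF weights in that line's row
for line, row in zip(lines, weights):
    nonzero = row[row > 0]
    score = nonzero.mean() if nonzero.size else 0.0
    print(f"{score:.3f}  {line}")

In parse_log the same per-line scores drive the sort order before keyword filtering and deduplication.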
src/tfidf/__init__.py (new file, 0 lines)

src/tfidf/tfidf.py (new file, 72 lines)

@@ -0,0 +1,72 @@
import re

import pandas as pd
import sklearn.feature_extraction.text as ext
from nltk import word_tokenize
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS


def normalize_log_message(text):
    # Remove timestamps (e.g., "2023-10-01 12:34:56")
    text = re.sub(r"\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}", "", text)
    # Remove tokens containing digits (e.g., "user123", "session456")
    text = re.sub(r"\b\w*\d+\w*\b", "", text)
    return " ".join(text.split())


class LogTfidf:
    def __init__(self):
        self.stemmer = PorterStemmer()
        self.stop_words = self._generate_stop_words()

    def _generate_stop_words(self):
        stop_words = [self.stemmer.stem(word) for word in ENGLISH_STOP_WORDS if word.isalpha()]
        # Add stemmed forms flagged by scikit-learn's stop-word consistency warning
        stop_words.extend(["anywh", "becau", "el", "elsewh", "everywh", "ind", "otherwi", "plea", "somewh"])
        return stop_words

    def get_stemmed_tokens(self, tokens):
        return [self.stemmer.stem(token) for token in tokens if token.isalpha()]

    def get_tokens(self, text):
        tokens = word_tokenize(text.lower())
        return self.get_stemmed_tokens(tokens)

    def parse_log(self, input_file, output_file, keywords, print_to_stdout=True):
        vectorizer = ext.CountVectorizer(tokenizer=self.get_tokens, stop_words=self.stop_words, token_pattern=None)
        with open(input_file, "r") as file:
            lines = [line.rstrip() for line in file]
        line_nos = dict(zip(range(1, len(lines) + 1), lines))
        doc_matrix = vectorizer.fit_transform(lines)
        tf_idf_transformer = ext.TfidfTransformer().fit(doc_matrix)
        weights = tf_idf_transformer.transform(doc_matrix).toarray()
        # Score each line by the mean of its nonzero TF-IDF weights
        per_line_score = []
        for row in weights:
            nonzero_count = len(row.nonzero()[0])
            score = row.sum() / nonzero_count if nonzero_count > 0 else 0
            per_line_score.append(score)
        line_scores = dict(zip(range(1, len(lines) + 1), per_line_score))
        # Filter by keywords and sort according to rarity
        df = pd.DataFrame({"line": line_nos, "score": line_scores})
        df = df.sort_values(by="score", ascending=False)
        pattern = "|".join(keywords)
        # Drop INFO lines unless they contain one of the keywords
        df = df[~((df["line"].str.contains("INFO")) & (~df["line"].str.contains(pattern)))]
        # Normalize and deduplicate
        df["normalized"] = df["line"].apply(normalize_log_message)
        df = df.drop_duplicates(subset="normalized", keep="first")
        df = df.drop(columns="normalized")
        with open(output_file, "w") as out_file:
            for _, row in df.iterrows():
                out_file.write(f"{row['line']}\n")
                if print_to_stdout:
                    print(row["line"])
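A possible way to exercise the new module directly, outside the NomosNode hook; the file paths and keyword list here are hypothetical stand-ins, and word_tokenize needs NLTK's Punkt tokenizer data downloaded once (newer NLTK releases may also ask for "punkt_tab"):

import nltk

from src.tfidf.tfidf import LogTfidf

nltk.download("punkt")  # one-time download of the Punkt tokenizer data used by word_tokenize

keywords = ["ERROR", "panic"]  # hypothetical stand-in for src.test_data.LOG_ERROR_KEYWORDS
LogTfidf().parse_log("nomos_node.log", "nomos_node.log.parsed", keywords, print_to_stdout=False)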