mirror of https://github.com/logos-blockchain/logos-blockchain-e2e-tests.git
synced 2026-01-04 06:03:12 +00:00

Merge pull request #20 from logos-co/chore-tf-idf-log-parsing

chore: TF-IDF based log parsing

Commit 99fe7327c5
.gitignore (vendored): 3 additions
@@ -105,6 +105,9 @@ dmypy.json
 # Pyre type checker
 .pyre/
 
+# Apple
+.DS_Store
+
 log/
 kzgrs/
 cluster_config/cfgsync.yaml
@@ -14,6 +14,7 @@ mkdir -p kzgrs
 wget https://raw.githubusercontent.com/logos-co/nomos-node/master/tests/kzgrs/kzgrs_test_params -O kzgrs/kzgrs_test_params
 pre-commit install
 (optional) Overwrite default vars from src/env_vars.py via env vars or by adding a .env file
+(optional) python download_nltk_resources.py # Used when CHECK_LOG_ERRORS=True
 pytest
 ```
download_nltk_resources.py (new file): 10 additions
@@ -0,0 +1,10 @@
+import nltk
+
+
+def main():
+    nltk.download("punkt")
+    nltk.download("punkt_tab")
+
+
+if __name__ == "__main__":
+    main()
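The punkt and punkt_tab resources back nltk.word_tokenize, which the new TF-IDF parser calls on every log line. A minimal sketch, not part of the commit, of checking that the data is present before enabling CHECK_LOG_ERRORS; the helper name is hypothetical:

```python
import nltk


def punkt_installed() -> bool:
    # nltk.data.find raises LookupError when a resource has not been downloaded yet.
    try:
        nltk.data.find("tokenizers/punkt")
        nltk.data.find("tokenizers/punkt_tab")
        return True
    except LookupError:
        return False


if __name__ == "__main__":
    print("punkt available:", punkt_installed())
```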
@@ -34,6 +34,7 @@ pytest-dependency==0.6.0
 PyYAML==6.0.1
 requests==2.31.0
 ruamel.yaml==0.17.21
+scikit-learn~=1.6.1
 setuptools==70.0.0
 tenacity==8.2.3
 typeguard==4.1.5
@@ -43,4 +44,7 @@ urllib3==2.2.2
 virtualenv==20.25.0
 Jinja2~=3.1.5
 psutil~=7.0.0
 pytest-shard==0.1.2
+learn~=1.0.0
+pandas~=2.3.0
+nltk~=3.9.1
@@ -12,6 +12,7 @@ from src.docker_manager import DockerManager, stop, kill
 from src.env_vars import DOCKER_LOG_DIR
 from src.node.node_vars import nomos_nodes
 from src.test_data import LOG_ERROR_KEYWORDS
+from src.tfidf.tfidf import LogTfidf
 
 logger = get_custom_logger(__name__)
 
@@ -146,22 +147,12 @@ class NomosNode:
             return internal_port.replace("/tcp", "")
         return None
 
-    def check_nomos_log_errors(self, whitelist=None):
+    def check_nomos_log_errors(self):
         keywords = LOG_ERROR_KEYWORDS
 
-        # If a whitelist is provided, remove those keywords from the keywords list
-        if whitelist:
-            keywords = [keyword for keyword in keywords if keyword not in whitelist]
-
-        matches_found = self._docker_manager.search_log_for_keywords(self._log_path, keywords, False)
-
-        logger.info(f"Printing log matches for {self.name()}")
-        if matches_found:
-            for keyword, log_lines in matches_found.items():
-                for line in log_lines:
-                    logger.debug(f"Log line matching keyword '{keyword}': {line}")
-        else:
-            logger.debug("No keyword matches found in the logs.")
+        logger.debug(f"Parsing log for node {self.name()}")
+        log_tfidf = LogTfidf()
+        log_tfidf.parse_log(self._log_path, keywords, None)
 
     def extract_config(self, target_file):
         # Copy the config file from first node
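For orientation, not part of the diff, the new check is roughly equivalent to calling the parser directly; the log path and keyword list below are hypothetical stand-ins for the node's Docker log file and src.test_data.LOG_ERROR_KEYWORDS:

```python
from src.tfidf.tfidf import LogTfidf

# Hypothetical inputs for illustration only.
log_path = "log/nomos_node_0.log"
keywords = ["error", "panic"]

# What check_nomos_log_errors now does: TF-IDF-score every line, keep lines that
# match a keyword or are not INFO/DEBUG/TRACE, deduplicate, and print by score.
LogTfidf().parse_log(log_path, keywords, None)
```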
src/tfidf/__init__.py (new file, empty)

src/tfidf/tfidf.py (new file): 76 additions
@@ -0,0 +1,76 @@
+import re
+
+import sklearn.feature_extraction.text as ext
+import pandas as pd
+from nltk import word_tokenize
+from nltk.stem.porter import PorterStemmer
+from sklearn.feature_extraction._stop_words import ENGLISH_STOP_WORDS
+
+
+def normalize_log_message(text):
+    # Remove timestamps (e.g., "2023-10-01 12:34:56")
+    text = re.sub(r"\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}", "", text)
+    # Remove numeric IDs (e.g., "user123", "session456")
+    text = re.sub(r"\b\w*\d+\w*\b", "", text)
+    return " ".join(text.split())
+
+
+def write_output(df, output_file=None):
+    lines = df["d1"].astype(str) + "\n"
+
+    if output_file:
+        with open(output_file, "w") as out_file:
+            out_file.writelines(lines)
+
+    print("".join(lines), end="")
+
+
+class LogTfidf:
+    def __init__(self):
+        self.stemmer = PorterStemmer()
+        self.stop_words = self._generate_stop_words()
+
+    def _generate_stop_words(self):
+        stop_words = [self.stemmer.stem(word) for word in ENGLISH_STOP_WORDS if word.isalpha()]
+        # Add any missing stemmed tokens from the warning
+        stop_words.extend(["anywh", "becau", "el", "elsewh", "everywh", "ind", "otherwi", "plea", "somewh"])
+        return stop_words
+
+    def get_stemmed_tokens(self, tokens):
+        return [self.stemmer.stem(token) for token in tokens if token.isalpha()]
+
+    def get_tokens(self, text):
+        tokens = word_tokenize(text.lower())
+        return self.get_stemmed_tokens(tokens)
+
+    def parse_log(self, input_file, keywords, output_file=None):
+        vectorizer = ext.CountVectorizer(tokenizer=self.get_tokens, stop_words=self.stop_words, token_pattern=None)
+        with open(input_file, "r") as file:
+            lines = [line.rstrip() for line in file]
+        line_nos = dict(zip(range(1, len(lines)), lines))
+        doc_matrix = vectorizer.fit_transform(lines)
+
+        tf_idf_transformer = ext.TfidfTransformer().fit(doc_matrix)
+        sparse = tf_idf_transformer.transform(doc_matrix).toarray()
+
+        per_line_score = []
+        for row in sparse:
+            nonzero_count = len(row.nonzero()[0])
+            score = row.sum() / nonzero_count if nonzero_count > 0 else 0
+            per_line_score.append(score)
+
+        line_scores = dict(zip(range(1, len(lines)), per_line_score))
+
+        # Filter by keywords and sort according to rarity
+        df = pd.DataFrame([line_nos, line_scores]).T
+        df.columns = ["d1", "d2"]  # Simplified column naming for clarity
+        df = df.sort_values(by="d2", ascending=False)
+        pattern = "|".join(keywords)
+        df = df[~((df["d1"].str.contains("INFO|DEBUG|TRACE")) & (~df["d1"].str.contains(pattern)))]
+
+        # Normalize and deduplicate
+        df["d1_normalized"] = df["d1"].apply(normalize_log_message)
+        df = df.drop_duplicates(subset="d1_normalized", keep="first")
+        df = df.drop(columns="d1_normalized")
+
+        write_output(df, output_file)
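The per-line score in parse_log is the mean TF-IDF weight of a line's terms (sum of the nonzero weights divided by their count), computed from scikit-learn's CountVectorizer and TfidfTransformer output. A self-contained sketch of that scoring, using the default tokenizer and toy log lines instead of the NLTK/Porter pipeline above:

```python
import sklearn.feature_extraction.text as ext

# Toy lines for illustration only; the real parser reads a node's log file.
lines = [
    "INFO connection established to peer",
    "INFO connection established to peer",
    "ERROR unexpected end of stream",
]

counts = ext.CountVectorizer().fit_transform(lines)
weights = ext.TfidfTransformer().fit(counts).transform(counts).toarray()

# Same scoring as LogTfidf.parse_log: mean of each line's nonzero TF-IDF weights.
for line, row in zip(lines, weights):
    nonzero_count = len(row.nonzero()[0])
    score = row.sum() / nonzero_count if nonzero_count > 0 else 0
    print(f"{score:.3f}  {line}")
```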