From bf844a630552b75fefea5ffcc5f0bbe236511fd0 Mon Sep 17 00:00:00 2001
From: gmega
Date: Thu, 12 Dec 2024 12:24:39 -0300
Subject: [PATCH] output proper extension for jsonl data

---
 benchmarks/cli.py                     | 7 ++++---
 benchmarks/core/logging.py            | 7 +++++--
 benchmarks/core/tests/test_logging.py | 4 ++--
 3 files changed, 11 insertions(+), 7 deletions(-)

diff --git a/benchmarks/cli.py b/benchmarks/cli.py
index c6611ed..51fa49a 100644
--- a/benchmarks/cli.py
+++ b/benchmarks/cli.py
@@ -8,7 +8,7 @@ from pydantic_core import ValidationError
 
 from benchmarks.core.config import ConfigParser, ExperimentBuilder
 from benchmarks.core.experiments.experiments import Experiment
-from benchmarks.core.logging import basic_log_parser, LogSplitter, LogEntry
+from benchmarks.core.logging import basic_log_parser, LogSplitter, LogEntry, LogSplitterFormats
 from benchmarks.deluge.config import DelugeExperimentConfig
 from benchmarks.deluge.logging import DelugeTorrentDownload
 
@@ -61,11 +61,12 @@ def cmd_logs(log: Path, output: Path):
 
     output.mkdir(exist_ok=True)
 
-    def output_factory(event_type: str):
-        return (output / f'{event_type}.csv').open('w', encoding='utf-8')
+    def output_factory(event_type: str, format: LogSplitterFormats):
+        return (output / f'{event_type}.{format.value}').open('w', encoding='utf-8')
 
     with (log.open('r', encoding='utf-8') as istream,
           LogSplitter(output_factory) as splitter):
+        splitter.set_format(DECLogEntry, LogSplitterFormats.jsonl)
         splitter.split(log_parser.parse(istream))
 
 
diff --git a/benchmarks/core/logging.py b/benchmarks/core/logging.py
index 296ed2a..df23af0 100644
--- a/benchmarks/core/logging.py
+++ b/benchmarks/core/logging.py
@@ -74,6 +74,7 @@ class AdaptedLogEntry(LogEntry, ABC):
     def recover_instance(self) -> SnakeCaseModel:
         pass
 
+
 class LogParser:
     """:class:`LogParser` will pick up log entries from a stream and parse them into :class:`LogEntry` instances.
     It works by trying to find a special marker (>>>) in the log line, and then parsing the JSON that follows it.
@@ -123,7 +124,8 @@ class LogSplitter:
     """:class:`LogSplitter` will split parsed logs into different files based on the
     entry type. The output format can be set for each entry type."""
 
-    def __init__(self, output_factory=Callable[[str], TextIO], output_entry_type=False) -> None:
+    def __init__(self, output_factory=Callable[[str, LogSplitterFormats], TextIO],
+                 output_entry_type=False) -> None:
         self.output_factory = output_factory
         self.outputs: Dict[str, Tuple[Callable[[LogEntry], None], TextIO]] = {}
         self.formats: Dict[str, LogSplitterFormats] = {}
@@ -137,8 +139,9 @@ class LogSplitter:
         write, _ = self.outputs.get(entry.entry_type, (None, None))
 
         if write is None:
-            output_stream = self.output_factory(entry.entry_type)
             output_format = self.formats.get(entry.entry_type, LogSplitterFormats.csv)
+            output_stream = self.output_factory(entry.entry_type, output_format)
+
             write = self._formatting_writer(entry, output_stream, output_format)
             self.outputs[entry.entry_type] = write, output_stream
 
diff --git a/benchmarks/core/tests/test_logging.py b/benchmarks/core/tests/test_logging.py
index 50441a3..c33bbd5 100644
--- a/benchmarks/core/tests/test_logging.py
+++ b/benchmarks/core/tests/test_logging.py
@@ -147,7 +147,7 @@ def test_should_split_intertwined_logs_by_entry_type():
 
     outputs = defaultdict(StringIO)
     splitter = LogSplitter(
-        output_factory=lambda entry_type: outputs[entry_type],
+        output_factory=lambda entry_type, _: outputs[entry_type],
     )
 
     splitter.split(parser.parse(log))
@@ -202,7 +202,7 @@ def test_should_store_split_logs_as_jsonl_for_requested_types():
 
     outputs = defaultdict(StringIO)
     splitter = LogSplitter(
-        output_factory=lambda entry_type: outputs[entry_type],
+        output_factory=lambda entry_type, _: outputs[entry_type],
     )
 
     splitter.set_format(SimpleEvent, LogSplitterFormats.jsonl)
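---

Note: a minimal usage sketch of the reworked output_factory contract, mirroring
cmd_logs above. The factory now receives the output format alongside the entry
type, so the file extension can follow the format (csv by default, jsonl where
requested via set_format) instead of being hardcoded to .csv. The paths, the
zero-argument basic_log_parser() call, and the choice of DelugeTorrentDownload
as the jsonl-formatted type are illustrative assumptions, not part of this patch.

    from pathlib import Path

    from benchmarks.core.logging import LogSplitter, LogSplitterFormats, basic_log_parser
    from benchmarks.deluge.logging import DelugeTorrentDownload

    log, out = Path('experiment.log'), Path('out')  # illustrative paths
    out.mkdir(exist_ok=True)

    log_parser = basic_log_parser()  # assumed to mirror cli.py's module-level parser

    # One output file per entry type; the extension now tracks the format
    # that LogSplitter resolves for that type.
    def output_factory(event_type: str, format: LogSplitterFormats):
        return (out / f'{event_type}.{format.value}').open('w', encoding='utf-8')

    with (log.open('r', encoding='utf-8') as istream,
          LogSplitter(output_factory) as splitter):
        # Types registered here are written as .jsonl; all others fall
        # back to LogSplitterFormats.csv.
        splitter.set_format(DelugeTorrentDownload, LogSplitterFormats.jsonl)
        splitter.split(log_parser.parse(istream))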