output proper extension for jsonl data

gmega 2024-12-12 12:24:39 -03:00
parent d716af5d8b
commit bf844a6305
GPG Key ID: 6290D34EAD824B18
3 changed files with 11 additions and 7 deletions

View File

@@ -8,7 +8,7 @@ from pydantic_core import ValidationError
 from benchmarks.core.config import ConfigParser, ExperimentBuilder
 from benchmarks.core.experiments.experiments import Experiment
-from benchmarks.core.logging import basic_log_parser, LogSplitter, LogEntry
+from benchmarks.core.logging import basic_log_parser, LogSplitter, LogEntry, LogSplitterFormats
 from benchmarks.deluge.config import DelugeExperimentConfig
 from benchmarks.deluge.logging import DelugeTorrentDownload
@@ -61,11 +61,12 @@ def cmd_logs(log: Path, output: Path):
     output.mkdir(exist_ok=True)

-    def output_factory(event_type: str):
-        return (output / f'{event_type}.csv').open('w', encoding='utf-8')
+    def output_factory(event_type: str, format: LogSplitterFormats):
+        return (output / f'{event_type}.{format.value}').open('w', encoding='utf-8')

     with (log.open('r', encoding='utf-8') as istream,
           LogSplitter(output_factory) as splitter):
+        splitter.set_format(DECLogEntry, LogSplitterFormats.jsonl)
         splitter.split(log_parser.parse(istream))
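The new factory signature is what lets the extension track the requested format: entry types registered for JSONL now land in `.jsonl` files instead of JSON rows written into a `.csv`. A minimal runnable sketch of the contract, assuming LogSplitterFormats is a str-valued Enum whose values double as file extensions (which `format.value` above implies); the `out` directory name is illustrative:

from enum import Enum
from pathlib import Path
from typing import TextIO

class LogSplitterFormats(Enum):
    # Assumed shape: enum values double as file extensions.
    csv = "csv"
    jsonl = "jsonl"

def output_factory(event_type: str, format: LogSplitterFormats) -> TextIO:
    # The extension now follows the format instead of a hard-coded ".csv".
    out = Path("out")
    out.mkdir(exist_ok=True)
    return (out / f"{event_type}.{format.value}").open("w", encoding="utf-8")

output_factory("node_status", LogSplitterFormats.csv)      # writes out/node_status.csv
output_factory("dec_log_entry", LogSplitterFormats.jsonl)  # writes out/dec_log_entry.jsonl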

View File

@@ -74,6 +74,7 @@ class AdaptedLogEntry(LogEntry, ABC):
     def recover_instance(self) -> SnakeCaseModel:
         pass

 class LogParser:
     """:class:`LogParser` will pick up log entries from a stream and parse them into :class:`LogEntry` instances.
     It works by trying to find a special marker (>>>) in the log line, and then parsing the JSON that follows it.
@@ -123,7 +124,8 @@ class LogSplitter:
     """:class:`LogSplitter` will split parsed logs into different files based on the entry type.
     The output format can be set for each entry type."""

-    def __init__(self, output_factory=Callable[[str], TextIO], output_entry_type=False) -> None:
+    def __init__(self, output_factory=Callable[[str, LogSplitterFormats], TextIO],
+                 output_entry_type=False) -> None:
         self.output_factory = output_factory
         self.outputs: Dict[str, Tuple[Callable[[LogEntry], None], TextIO]] = {}
         self.formats: Dict[str, LogSplitterFormats] = {}
@@ -137,8 +139,9 @@ class LogSplitter:
         write, _ = self.outputs.get(entry.entry_type, (None, None))

         if write is None:
-            output_stream = self.output_factory(entry.entry_type)
+            output_format = self.formats.get(entry.entry_type, LogSplitterFormats.csv)
+            output_stream = self.output_factory(entry.entry_type, output_format)
             write = self._formatting_writer(entry, output_stream, output_format)
             self.outputs[entry.entry_type] = write, output_stream
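LogSplitter resolves the output format before calling the factory, so the stream it opens can already carry the right extension; formats default to CSV unless set_format was called for that entry type. A condensed sketch of that dispatch (a hypothetical MiniSplitter, not the project's class, reusing the LogSplitterFormats sketch above and omitting the CSV/JSONL writer machinery):

from typing import Callable, Dict, TextIO

class MiniSplitter:
    def __init__(self, output_factory: Callable[[str, LogSplitterFormats], TextIO]) -> None:
        self.output_factory = output_factory
        self.formats: Dict[str, LogSplitterFormats] = {}
        self.outputs: Dict[str, TextIO] = {}

    def set_format(self, entry_type: str, fmt: LogSplitterFormats) -> None:
        # The real set_format takes a LogEntry subclass; a plain string key
        # keeps this sketch self-contained.
        self.formats[entry_type] = fmt

    def stream_for(self, entry_type: str) -> TextIO:
        stream = self.outputs.get(entry_type)
        if stream is None:
            # Resolve the format first so the factory can pick the matching
            # extension -- the point of this commit.
            fmt = self.formats.get(entry_type, LogSplitterFormats.csv)
            stream = self.output_factory(entry_type, fmt)
            self.outputs[entry_type] = stream
        return stream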

View File

@@ -147,7 +147,7 @@ def test_should_split_intertwined_logs_by_entry_type():
     outputs = defaultdict(StringIO)

     splitter = LogSplitter(
-        output_factory=lambda entry_type: outputs[entry_type],
+        output_factory=lambda entry_type, _: outputs[entry_type],
     )

     splitter.split(parser.parse(log))
@@ -202,7 +202,7 @@ def test_should_store_split_logs_as_jsonl_for_requested_types():
     outputs = defaultdict(StringIO)

     splitter = LogSplitter(
-        output_factory=lambda entry_type: outputs[entry_type],
+        output_factory=lambda entry_type, _: outputs[entry_type],
     )

     splitter.set_format(SimpleEvent, LogSplitterFormats.jsonl)
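The updated tests simply discard the format argument, but a factory can also record it to check the routing end to end. A short usage sketch against the hypothetical MiniSplitter above (again keyed by entry-type name rather than by class):

from collections import defaultdict
from io import StringIO

outputs = defaultdict(StringIO)
seen = {}

def recording_factory(entry_type: str, fmt: LogSplitterFormats) -> StringIO:
    seen[entry_type] = fmt.value  # the extension the factory would use
    return outputs[entry_type]

splitter = MiniSplitter(recording_factory)
splitter.set_format("simple_event", LogSplitterFormats.jsonl)
splitter.stream_for("simple_event")
splitter.stream_for("other_event")

assert seen == {"simple_event": "jsonl", "other_event": "csv"}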