output proper extension for jsonl data

gmega 2024-12-12 12:24:39 -03:00
parent d716af5d8b
commit bf844a6305
GPG Key ID: 6290D34EAD824B18
3 changed files with 11 additions and 7 deletions

View File

@@ -8,7 +8,7 @@ from pydantic_core import ValidationError
 from benchmarks.core.config import ConfigParser, ExperimentBuilder
 from benchmarks.core.experiments.experiments import Experiment
-from benchmarks.core.logging import basic_log_parser, LogSplitter, LogEntry
+from benchmarks.core.logging import basic_log_parser, LogSplitter, LogEntry, LogSplitterFormats
 from benchmarks.deluge.config import DelugeExperimentConfig
 from benchmarks.deluge.logging import DelugeTorrentDownload
@@ -61,11 +61,12 @@ def cmd_logs(log: Path, output: Path):
     output.mkdir(exist_ok=True)

-    def output_factory(event_type: str):
-        return (output / f'{event_type}.csv').open('w', encoding='utf-8')
+    def output_factory(event_type: str, format: LogSplitterFormats):
+        return (output / f'{event_type}.{format.value}').open('w', encoding='utf-8')

     with (log.open('r', encoding='utf-8') as istream,
           LogSplitter(output_factory) as splitter):
+        splitter.set_format(DECLogEntry, LogSplitterFormats.jsonl)
         splitter.split(log_parser.parse(istream))
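The new factory signature is what lets the extension track the requested format: entry types registered for JSONL now land in `.jsonl` files instead of JSON rows written into a `.csv`. A minimal runnable sketch of the contract, assuming LogSplitterFormats is a str-valued Enum whose values double as file extensions (which `format.value` above implies); the `out` directory name is illustrative:

from enum import Enum
from pathlib import Path
from typing import TextIO

class LogSplitterFormats(Enum):
    # Assumed shape: enum values double as file extensions.
    csv = "csv"
    jsonl = "jsonl"

def output_factory(event_type: str, format: LogSplitterFormats) -> TextIO:
    # The extension now follows the format instead of a hard-coded ".csv".
    out = Path("out")
    out.mkdir(exist_ok=True)
    return (out / f"{event_type}.{format.value}").open("w", encoding="utf-8")

output_factory("node_status", LogSplitterFormats.csv)      # writes out/node_status.csv
output_factory("dec_log_entry", LogSplitterFormats.jsonl)  # writes out/dec_log_entry.jsonl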

View File

@@ -74,6 +74,7 @@ class AdaptedLogEntry(LogEntry, ABC):
     def recover_instance(self) -> SnakeCaseModel:
         pass

 class LogParser:
     """:class:`LogParser` will pick up log entries from a stream and parse them into :class:`LogEntry` instances.
     It works by trying to find a special marker (>>>) in the log line, and then parsing the JSON that follows it.
@@ -123,7 +124,8 @@ class LogSplitter:
     """:class:`LogSplitter` will split parsed logs into different files based on the entry type.
     The output format can be set for each entry type."""

-    def __init__(self, output_factory=Callable[[str], TextIO], output_entry_type=False) -> None:
+    def __init__(self, output_factory=Callable[[str, LogSplitterFormats], TextIO],
+                 output_entry_type=False) -> None:
         self.output_factory = output_factory
         self.outputs: Dict[str, Tuple[Callable[[LogEntry], None], TextIO]] = {}
         self.formats: Dict[str, LogSplitterFormats] = {}
@@ -137,8 +139,9 @@ class LogSplitter:
         write, _ = self.outputs.get(entry.entry_type, (None, None))

         if write is None:
-            output_stream = self.output_factory(entry.entry_type)
+            output_format = self.formats.get(entry.entry_type, LogSplitterFormats.csv)
+            output_stream = self.output_factory(entry.entry_type, output_format)
             write = self._formatting_writer(entry, output_stream, output_format)
             self.outputs[entry.entry_type] = write, output_stream
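LogSplitter resolves the output format before calling the factory, so the stream it opens can already carry the right extension; formats default to CSV unless set_format was called for that entry type. A condensed sketch of that dispatch (a hypothetical MiniSplitter, not the project's class, reusing the LogSplitterFormats sketch above and omitting the CSV/JSONL writer machinery):

from typing import Callable, Dict, TextIO

class MiniSplitter:
    def __init__(self, output_factory: Callable[[str, LogSplitterFormats], TextIO]) -> None:
        self.output_factory = output_factory
        self.formats: Dict[str, LogSplitterFormats] = {}
        self.outputs: Dict[str, TextIO] = {}

    def set_format(self, entry_type: str, fmt: LogSplitterFormats) -> None:
        # The real set_format takes a LogEntry subclass; a plain string key
        # keeps this sketch self-contained.
        self.formats[entry_type] = fmt

    def stream_for(self, entry_type: str) -> TextIO:
        stream = self.outputs.get(entry_type)
        if stream is None:
            # Resolve the format first so the factory can pick the matching
            # extension -- the point of this commit.
            fmt = self.formats.get(entry_type, LogSplitterFormats.csv)
            stream = self.output_factory(entry_type, fmt)
            self.outputs[entry_type] = stream
        return stream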

View File

@@ -147,7 +147,7 @@ def test_should_split_intertwined_logs_by_entry_type():
     outputs = defaultdict(StringIO)

     splitter = LogSplitter(
-        output_factory=lambda entry_type: outputs[entry_type],
+        output_factory=lambda entry_type, _: outputs[entry_type],
     )

     splitter.split(parser.parse(log))
@@ -202,7 +202,7 @@ def test_should_store_split_logs_as_jsonl_for_requested_types():
     outputs = defaultdict(StringIO)

     splitter = LogSplitter(
-        output_factory=lambda entry_type: outputs[entry_type],
+        output_factory=lambda entry_type, _: outputs[entry_type],
     )

     splitter.set_format(SimpleEvent, LogSplitterFormats.jsonl)
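The updated tests simply discard the format argument, but a factory can also record it to check the routing end to end. A short usage sketch against the hypothetical MiniSplitter above (again keyed by entry-type name rather than by class):

from collections import defaultdict
from io import StringIO

outputs = defaultdict(StringIO)
seen = {}

def recording_factory(entry_type: str, fmt: LogSplitterFormats) -> StringIO:
    seen[entry_type] = fmt.value  # the extension the factory would use
    return outputs[entry_type]

splitter = MiniSplitter(recording_factory)
splitter.set_format("simple_event", LogSplitterFormats.jsonl)
splitter.stream_for("simple_event")
splitter.stream_for("other_event")

assert seen == {"simple_event": "jsonl", "other_event": "csv"}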