import datetime
import json
import logging
from csv import DictWriter
from enum import Enum
from json import JSONDecodeError
from typing import Type, TextIO, Iterable, Callable, Dict, Tuple

from pydantic import ValidationError, computed_field, Field

from benchmarks.core.pydantic import SnakeCaseModel

MARKER = '>>'

logger = logging.getLogger(__name__)


class LogEntry(SnakeCaseModel):
    """Base class for structured log entries. An entry renders itself as the
    MARKER followed by its JSON dump, so it can be recovered later from a
    mixed log stream."""

    def __str__(self):
        return f"{MARKER}{self.model_dump_json()}"

    @computed_field  # type: ignore
    @property
    def entry_type(self) -> str:
        return self.alias()
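

# A sketch of the wire format produced by LogEntry.__str__ and consumed by
# LogParser.parse, assuming SnakeCaseModel.alias() derives the type tag from
# the class name (e.g. Metric -> "metric"); the payload is illustrative:
#
#   some log prefix >>{"name": "dl", "value": 3.5, "entry_type": "metric"}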


class LogParser:
    """Extracts registered LogEntry types from a text stream, skipping lines
    that carry no marked entry or whose type tag is unknown."""

    def __init__(self):
        self.entry_types: Dict[str, Type[SnakeCaseModel]] = {}
        self.warn_counts = 10

    def register(self, entry_type: Type[SnakeCaseModel]):
        self.entry_types[entry_type.alias()] = entry_type

    def parse(self, log: TextIO) -> Iterable[LogEntry]:
        marker_len = len(MARKER)
        for line in log:
            index = line.find(MARKER)
            if index == -1:
                continue

            type_tag = ''  # just to calm down mypy
            try:
                # Should probably test this against a regex for the type tag to see which is faster.
                json_line = json.loads(line[index + marker_len:])
                type_tag = json_line.get('entry_type')
                if not type_tag or (type_tag not in self.entry_types):
                    continue
                yield self.entry_types[type_tag].model_validate(json_line)
            except JSONDecodeError:
                pass
            except ValidationError as err:
                # This is usually something we want to know about: if the message has a type_tag
                # that we know, then we should probably be able to parse it.
                self.warn_counts -= 1  # avoid flooding everything with warnings
                if self.warn_counts > 0:
                    logger.warning(f"Schema failed for line with known type tag {type_tag}: {err}")
                elif self.warn_counts == 0:
                    logger.warning("Too many errors: suppressing further schema warnings.")
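

# A usage sketch for LogParser (hypothetical stream contents): any line without
# the marker, or with an unknown entry type, is silently skipped.
#
#   import io
#   parser = basic_log_parser()
#   line = str(Event(node="node-1", name="experiment_start"))
#   for entry in parser.parse(io.StringIO(f"some prefix {line}\n")):
#       print(entry.name)  # -> "experiment_start"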


class LogSplitter:
    """Routes parsed LogEntry objects into one CSV writer per entry type,
    creating outputs on demand through output_factory."""

    def __init__(self, output_factory: Callable[[str], TextIO],
                 output_entry_type: bool = False) -> None:
        self.output_factory = output_factory
        self.dump = (
            (lambda model: model.model_dump())
            if output_entry_type
            else (lambda model: model.model_dump(exclude={'entry_type'}))
        )

        self.outputs: Dict[str, Tuple[DictWriter, TextIO]] = {}

    def split(self, log: Iterable[LogEntry]):
        for entry in log:
            writer, _ = self.outputs.get(entry.entry_type, (None, None))
            entry_dict = self.dump(entry)

            if writer is None:
                output = self.output_factory(entry.entry_type)
                writer = DictWriter(output, fieldnames=entry_dict.keys())
                self.outputs[entry.entry_type] = writer, output
                writer.writeheader()

            writer.writerow(entry_dict)

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        for _, output in self.outputs.values():
            output.close()
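

# A usage sketch for LogSplitter: one CSV file per entry type, named after the
# type tag (the log file name is hypothetical).
#
#   with LogSplitter(lambda tag: open(f"{tag}.csv", "w", newline="")) as splitter:
#       with open("experiment.log") as log:
#           splitter.split(basic_log_parser().parse(log))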


type NodeId = str


class Event(LogEntry):
    node: NodeId
    name: str  # XXX this ends up being redundant for custom event schemas... need to think of a better solution.
    timestamp: datetime.datetime = Field(
        default_factory=lambda: datetime.datetime.now(datetime.UTC)
    )


class Metric(Event):
    value: int | float


class RequestEventType(Enum):
    start = 'start'
    end = 'end'


class RequestEvent(Event):
    destination: NodeId
    request_id: str
    type: RequestEventType


def basic_log_parser() -> LogParser:
    parser = LogParser()
    parser.register(Event)
    parser.register(Metric)
    parser.register(RequestEvent)
    return parser
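

# A minimal end-to-end sketch, assuming SnakeCaseModel.alias() derives type
# tags from class names (Metric -> "metric"). Round-trips two entries through
# the parser and splitter using in-memory streams.
if __name__ == "__main__":
    import io

    lines = io.StringIO(
        f"{Metric(node='node-1', name='download_speed', value=3.5)}\n"
        f"{RequestEvent(node='node-1', destination='node-2', request_id='r1', name='upload', type=RequestEventType.start)}\n"
    )

    # In-memory stand-in for a real file-opening factory.
    buffers: Dict[str, io.StringIO] = {}

    def make_output(tag: str) -> TextIO:
        return buffers.setdefault(tag, io.StringIO())

    splitter = LogSplitter(make_output)
    splitter.split(basic_log_parser().parse(lines))

    for tag, buffer in buffers.items():
        print(f"--- {tag} ---")
        print(buffer.getvalue())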