import datetime
import json
import logging
from csv import DictWriter
from enum import Enum
from json import JSONDecodeError
from typing import Type, TextIO, Iterable, Callable, Dict, Tuple

from pydantic import ValidationError, computed_field, Field

from benchmarks.core.pydantic import SnakeCaseModel

MARKER = '>>'
logger = logging.getLogger(__name__)


class LogEntry(SnakeCaseModel):
    def __str__(self):
        return f"{MARKER}{self.model_dump_json()}"

    @computed_field  # type: ignore
    @property
    def entry_type(self) -> str:
        return self.alias()
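
# A LogEntry subclass serializes itself as a single marker-prefixed JSON line.
# Sketch (assumes SnakeCaseModel.alias() derives 'my_entry' from the class name):
#
#   class MyEntry(LogEntry):
#       foo: str
#
#   print(MyEntry(foo='bar'))  # >>{"foo":"bar","entry_type":"my_entry"}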


class LogParser:
    """Parses marker-prefixed JSON log lines into registered LogEntry types."""

    def __init__(self):
        self.entry_types: Dict[str, Type[LogEntry]] = {}
        self.warn_counts = 10

    def register(self, entry_type: Type[LogEntry]):
        self.entry_types[entry_type.alias()] = entry_type
    def parse(self, log: TextIO) -> Iterable[LogEntry]:
        marker_len = len(MARKER)
        for line in log:
            index = line.find(MARKER)
            if index == -1:
                continue

            type_tag = ''  # just to calm down mypy
            try:
                # Should probably test this against a regex for the type tag to see which is faster.
                json_line = json.loads(line[index + marker_len:])
                type_tag = json_line.get('entry_type')
                if not type_tag or (type_tag not in self.entry_types):
                    continue
                yield self.entry_types[type_tag].model_validate(json_line)
            except JSONDecodeError:
                pass
            except ValidationError as err:
                # This is usually something we want to know about: if the message carries
                # a type tag that we know, then we should probably be able to parse it.
                self.warn_counts -= 1  # avoid flooding the logs with warnings
                if self.warn_counts > 0:
                    logger.warning(f"Schema failed for line with known type tag {type_tag}: {err}")
                elif self.warn_counts == 0:
                    logger.warning("Too many errors: suppressing further schema warnings.")


class LogSplitter:
    """Splits a stream of LogEntry objects into one CSV output per entry type."""

    def __init__(self, output_factory: Callable[[str], TextIO], output_entry_type: bool = False) -> None:
        self.output_factory = output_factory
        self.dump = (
            (lambda model: model.model_dump())
            if output_entry_type
            else (lambda model: model.model_dump(exclude={'entry_type'}))
        )
        self.outputs: Dict[str, Tuple[DictWriter, TextIO]] = {}
    def split(self, log: Iterable[LogEntry]):
        for entry in log:
            writer, _ = self.outputs.get(entry.entry_type, (None, None))
            entry_dict = self.dump(entry)
            if writer is None:
                output = self.output_factory(entry.entry_type)
                writer = DictWriter(output, fieldnames=entry_dict.keys())
                self.outputs[entry.entry_type] = writer, output
                writer.writeheader()
            writer.writerow(entry_dict)
    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        for _, output in self.outputs.values():
            output.close()
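
# Example (sketch, file naming is illustrative): one CSV per entry type, with the
# redundant entry_type column dropped by default, since it is already encoded in
# the file chosen by output_factory.
#
#   with LogSplitter(lambda entry_type: open(f'{entry_type}.csv', 'w', newline='')) as splitter:
#       splitter.split(entries)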


type NodeId = str


class Event(LogEntry):
    node: NodeId
    name: str  # XXX this ends up being redundant for custom event schemas... need to think of a better solution.
    timestamp: datetime.datetime = Field(default_factory=lambda: datetime.datetime.now(datetime.UTC))


class Metric(Event):
    value: int | float


class RequestEventType(Enum):
    start = 'start'
    end = 'end'


class RequestEvent(Event):
    destination: NodeId
    request_id: str
    type: RequestEventType
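
# Wire-format sketch for the events above (field values are illustrative, and the
# tag assumes alias() yields 'request_event'):
#
#   >>{"node":"node-1","name":"download","timestamp":"2024-01-01T00:00:00Z",
#      "destination":"node-2","request_id":"r1","type":"start",
#      "entry_type":"request_event"}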


def basic_log_parser() -> LogParser:
    parser = LogParser()
    parser.register(Event)
    parser.register(Metric)
    parser.register(RequestEvent)
    return parser
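
# End-to-end sketch (file names are illustrative): parse a raw node log and split
# the recognized entries into one CSV per entry type.
#
#   parser = basic_log_parser()
#   with open('node.log') as log, LogSplitter(
#       lambda entry_type: open(f'{entry_type}.csv', 'w', newline='')
#   ) as splitter:
#       splitter.split(parser.parse(log))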