import json
import logging
import re
from collections.abc import Iterator
from datetime import datetime
from functools import total_ordering
from heapq import heapify, heappop, heappush
from json import JSONDecodeError
from typing import Callable, List, Optional, TextIO, Tuple

from benchmarks.logging.sources.sources import LogSource, ExperimentId, NodeId, RawLine

logger = logging.getLogger(__name__)

_POD_NAME_REGEX = re.compile(r'"pod_name":"(?P<pod_name>[^"]+)"')
_TIMESTAMP_REGEX = re.compile(r'"timestamp":"(?P<timestamp>[^"]+)"')
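
# Illustrative shape of a single Vector JSONL record, as this module assumes it
# (field values are hypothetical). Note that matching below is plain substring
# and regex matching against the raw line, so it relies on a compact JSON
# serialization with no spaces after colons:
#
#   {"timestamp":"2024-01-01T12:00:00","message":"<raw log line>",
#    "kubernetes":{"pod_name":"myapp-0","pod_labels":{
#      "app.kubernetes.io/name":"myapp","app.kubernetes.io/part-of":"<group-id>",
#      "app.kubernetes.io/instance":"<experiment-id>"}}}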


@total_ordering
class PodLog:
    """:class:`PodLog` allows us to iterate separately over the logs of the various pods even when
    they are merged into the same file. This is useful when sorting the logs of a Vector file dump,
    as those are guaranteed to be sorted per-pod, but not across pods."""

    def __init__(self, pod_name: str, file: TextIO) -> None:
        self.pod_name = pod_name
        self.file = file
        self.pointer: int = 0
        self.next_line: Optional[Tuple[str, datetime]] = self._scan_next()

    @property
    def timestamp(self) -> datetime:
        if not self.next_line:
            raise ValueError("Cannot compare: log has run out of entries")
        return self.next_line[1]

    def has_next(self) -> bool:
        """Returns True if there are more logs to read for this pod."""
        return self.next_line is not None

    def __eq__(self, other: object) -> bool:
        if not isinstance(other, PodLog):
            return NotImplemented
        return self.timestamp == other.timestamp

    def __lt__(self, other: object) -> bool:
        if not isinstance(other, PodLog):
            return NotImplemented
        return self.timestamp < other.timestamp

    def __next__(self) -> Tuple[str, datetime]:
        if self.next_line is None:
            raise StopIteration()
        value = self.next_line
        self.next_line = self._scan_next()
        return value

    def _iter_file(self) -> Iterator[str]:
        """Iterates over the file from this pod's saved offset, yielding lines."""
        self.file.seek(self.pointer)
        for line in iter(self.file.readline, ""):
            self.pointer = self.file.tell()
            yield line

    def _scan_next(self) -> Optional[Tuple[str, datetime]]:
        """Advances to the next line belonging to this pod, returning it with its timestamp."""
        pod_name = f'"pod_name":"{self.pod_name}"'
        for line in self._iter_file():
            if pod_name not in line:
                continue
            timestamp = _TIMESTAMP_REGEX.search(line)
            if not timestamp:
                logger.error(f"Log line contains no timestamp: {line}")
                continue
            return line, datetime.fromisoformat(timestamp.group("timestamp"))
        return None
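
# A minimal sketch of the k-way merge that PodLog enables: each PodLog is a
# per-pod cursor into the shared file, and a heap keyed on each cursor's next
# timestamp interleaves the pods into one globally ordered stream. The sample
# lines are made up but follow the field layout this module expects:
#
#   import io
#
#   dump = io.StringIO(
#       '{"pod_name":"pod-a","timestamp":"2024-01-01T00:00:01"}\n'
#       '{"pod_name":"pod-b","timestamp":"2024-01-01T00:00:00"}\n'
#       '{"pod_name":"pod-a","timestamp":"2024-01-01T00:00:02"}\n'
#   )
#   logs = [PodLog("pod-a", dump), PodLog("pod-b", dump)]
#   heapify(logs)
#   while logs:
#       log = heappop(logs)
#       line, ts = next(log)  # earliest remaining line across all pods
#       if log.has_next():
#           heappush(logs, log)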


class VectorFlatFileSource(LogSource):
    """Log source for flat JSONL files produced by [Vector](https://vector.dev/). This is typically
    used when running experiments locally within, say, Minikube or Kind."""

    def __init__(self, file: TextIO, app_name: str, sorted: bool = False):
        self.file = file
        self.app_name = app_name
        self.sorted = sorted

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.file.close()

    def experiments(self, group_id: str) -> Iterator[str]:
        """
        Retrieves all experiment IDs within an experiment group. This can be quite slow, as this
        source supports no indexing or aggregation.

        See also: :meth:`LogSource.experiments`.
        """
        app_label = f'"app.kubernetes.io/name":"{self.app_name}"'
        group_label = f'"app.kubernetes.io/part-of":"{group_id}"'
        seen = set()
        self.file.seek(0)
        for line in self.file:
            # Cheap substring filter before paying for a full JSON parse.
            if app_label not in line or group_label not in line:
                continue
            parsed = json.loads(line)
            experiment_id = parsed["kubernetes"]["pod_labels"][
                "app.kubernetes.io/instance"
            ]
            if experiment_id in seen:
                continue
            seen.add(experiment_id)
            yield experiment_id

    def _sorted_logs(self, line_predicate: Callable[[str], bool]) -> Iterator[str]:
        # k-way merge: repeatedly pop the pod whose next line has the earliest
        # timestamp, emit it, and push the pod back if it still has lines left.
        sources = [
            source for source in self._pod_logs(line_predicate) if source.has_next()
        ]
        heapify(sources)
        while sources:
            log = heappop(sources)
            yield next(log)[0]
            if log.has_next():
                heappush(sources, log)

    def _unsorted_logs(self, line_predicate: Callable[[str], bool]) -> Iterator[str]:
        self.file.seek(0)
        for line in self.file:
            if not line_predicate(line):
                continue
            yield line

    def _pod_logs(self, line_predicate: Callable[[str], bool]) -> List[PodLog]:
        logger.info("Identifying pod logs.")
        self.file.seek(0)
        pod_logs = {}
        for line in self.file:
            if not line_predicate(line):
                continue
            match = _POD_NAME_REGEX.search(line)
            if not match:
                logger.error(f"Log line contains no pod name: {line}")
                continue
            pod_name = match.group("pod_name")
            if pod_name not in pod_logs:
                logger.info(f"Pod found: {pod_name}")
                # Each PodLog shares this source's file handle but keeps its own offset.
                pod_logs[pod_name] = PodLog(pod_name, self.file)
        return list(pod_logs.values())

    def logs(
        self, group_id: str, experiment_id: Optional[str] = None
    ) -> Iterator[Tuple[ExperimentId, NodeId, RawLine]]:
        """Retrieves logs for either all experiments within a group, or for a specific experiment.
        Again, since this source supports no indexing, each query is a full pass over the file and
        can be quite slow. I strongly encourage not retrieving logs for experiments individually.
        """
        app_label = f'"app.kubernetes.io/name":"{self.app_name}"'
        group_label = f'"app.kubernetes.io/part-of":"{group_id}"'
        experiment_label = f'"app.kubernetes.io/instance":"{experiment_id}"'

        def line_predicate(line: str) -> bool:
            return (
                app_label in line
                and group_label in line
                and (experiment_id is None or experiment_label in line)
            )

        logs = (
            self._sorted_logs(line_predicate)
            if self.sorted
            else self._unsorted_logs(line_predicate)
        )

        for line in logs:
            try:
                parsed = json.loads(line)
            except JSONDecodeError as err:
                logger.error(f"Failed to parse line from vector source: {line}", exc_info=err)
                continue
            k8s = parsed["kubernetes"]
            yield (
                k8s["pod_labels"]["app.kubernetes.io/instance"],
                k8s["pod_name"],
                parsed["message"],
            )

    def __str__(self):
        return f"VectorFlatFileSource({self.app_name})"