mirror of
https://github.com/logos-storage/bittorrent-benchmarks.git
synced 2026-01-03 21:43:09 +00:00
199 lines
6.9 KiB
Python
199 lines
6.9 KiB
Python
import json
|
|
from collections.abc import Iterator
|
|
from datetime import datetime
|
|
from functools import total_ordering
|
|
from heapq import heapify, heappush, heappop
|
|
from json import JSONDecodeError
|
|
from typing import TextIO, Optional, Tuple, List, Callable
|
|
import logging
|
|
import re
|
|
|
|
from benchmarks.logging.sources.sources import LogSource, ExperimentId, NodeId, RawLine
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
_POD_NAME_REGEX = re.compile(r'"pod_name":"(?P<pod_name>[^"]+)"')
|
|
_TIMESTAMP_REGEX = re.compile(r'"timestamp":"(?P<timestamp>[^"]+)"')
|
|
|
|
|
|
@total_ordering
|
|
class PodLog(object):
|
|
""":class:`PodLog` allows us to iterate separately over the logs of the various pods even when they
|
|
are merged into the same file. This is useful when trying to sort the logs of a vector file dump as
|
|
those are guaranteed to be sorted per-pod, but not across pods."""
|
|
|
|
def __init__(self, pod_name: str, file: TextIO) -> None:
|
|
self.pod_name = pod_name
|
|
self.file = file
|
|
self.pointer: int = 0
|
|
|
|
self.next_line: Optional[Tuple[str, datetime]] = self._scan_next()
|
|
|
|
@property
|
|
def timestamp(self) -> datetime:
|
|
if not self.next_line:
|
|
raise ValueError("Cannot compare: log has run out of entries")
|
|
return self.next_line[1]
|
|
|
|
def has_next(self) -> bool:
|
|
"""Returns true if there are more logs to read for this pod."""
|
|
return self.next_line is not None
|
|
|
|
def __eq__(self, other: object) -> bool:
|
|
if not isinstance(other, PodLog):
|
|
return NotImplemented
|
|
return self.timestamp == other.timestamp
|
|
|
|
def __lt__(self, other: object) -> bool:
|
|
if not isinstance(other, PodLog):
|
|
return NotImplemented
|
|
return self.timestamp < other.timestamp
|
|
|
|
def __next__(self) -> Tuple[str, datetime]:
|
|
if self.next_line is None:
|
|
raise StopIteration()
|
|
value = self.next_line
|
|
self.next_line = self._scan_next()
|
|
return value
|
|
|
|
def _iter_file(self) -> Iterator[str]:
|
|
"""Iterates over the file, yielding lines."""
|
|
self.file.seek(self.pointer)
|
|
for line in iter(self.file.readline, ""):
|
|
self.pointer = self.file.tell()
|
|
yield line
|
|
|
|
def _scan_next(self) -> Optional[Tuple[str, datetime]]:
|
|
pod_name = f'"pod_name":"{self.pod_name}"'
|
|
for line in self._iter_file():
|
|
self.pointer = self.file.tell()
|
|
if pod_name not in line:
|
|
continue
|
|
|
|
timestamp = _TIMESTAMP_REGEX.search(line)
|
|
if not timestamp:
|
|
logger.error(f"Log line contains no timestamp {line}")
|
|
continue
|
|
|
|
return line, datetime.fromisoformat(timestamp.group("timestamp"))
|
|
return None
|
|
|
|
|
|
class VectorFlatFileSource(LogSource):
|
|
"""Log source for flat JSONL files produced by [Vector](https://vector.dev/). This is typically used when running
|
|
experiments locally within, say, Minikube or Kind."""
|
|
|
|
def __init__(self, file: TextIO, app_name: str, sorted=False):
|
|
self.file = file
|
|
self.app_name = app_name
|
|
self.sorted = sorted
|
|
|
|
def __enter__(self):
|
|
return self
|
|
|
|
def __exit__(self, exc_type, exc_val, exc_tb):
|
|
self.file.close()
|
|
|
|
def experiments(self, group_id: str) -> Iterator[str]:
|
|
"""
|
|
Retrieves all experiment IDs within an experiment group. Can be quite slow as this source supports
|
|
no indexing or aggregation.
|
|
|
|
See also: :meth:`LogSource.experiments`.
|
|
"""
|
|
app_label = f'"app.kubernetes.io/name":"{self.app_name}"'
|
|
group_label = f'"app.kubernetes.io/part-of":"{group_id}"'
|
|
seen = set()
|
|
|
|
self.file.seek(0)
|
|
for line in self.file:
|
|
if app_label not in line or group_label not in line:
|
|
continue
|
|
|
|
parsed = json.loads(line)
|
|
experiment_id = parsed["kubernetes"]["pod_labels"][
|
|
"app.kubernetes.io/instance"
|
|
]
|
|
if experiment_id in seen:
|
|
continue
|
|
seen.add(experiment_id)
|
|
yield experiment_id
|
|
|
|
def _sorted_logs(self, line_predicate: Callable[[str], bool]) -> Iterator[str]:
|
|
sources = [
|
|
source for source in self._pod_logs(line_predicate) if source.has_next()
|
|
]
|
|
heapify(sources)
|
|
while sources:
|
|
log = heappop(sources)
|
|
yield next(log)[0]
|
|
if log.has_next():
|
|
heappush(sources, log)
|
|
|
|
def _unsorted_logs(self, line_predicate: Callable[[str], bool]) -> Iterator[str]:
|
|
self.file.seek(0)
|
|
for line in self.file:
|
|
if not line_predicate(line):
|
|
continue
|
|
yield line
|
|
|
|
def _pod_logs(self, line_predicate: Callable[[str], bool]) -> List[PodLog]:
|
|
logger.info("Identifying pod logs.")
|
|
self.file.seek(0)
|
|
pod_logs = {}
|
|
for line in self.file:
|
|
if not line_predicate(line):
|
|
continue
|
|
match = _POD_NAME_REGEX.search(line)
|
|
if not match:
|
|
logger.error(f"Log line contains no pod name {line}")
|
|
continue
|
|
pod_name = match.group("pod_name")
|
|
if pod_name not in pod_logs:
|
|
logger.info(f"Pod found: {pod_name}")
|
|
pod_logs[pod_name] = PodLog(pod_name, self.file)
|
|
|
|
return list(pod_logs.values())
|
|
|
|
def logs(
|
|
self, group_id: str, experiment_id: Optional[str] = None
|
|
) -> Iterator[Tuple[ExperimentId, NodeId, RawLine]]:
|
|
"""Retrieves logs for either all experiments within a group, or a specific experiments. Again, since this
|
|
source supports no indexing this can be quite slow, as each query represents a full pass on the file.
|
|
I strongly encourage not attempting to retrieve logs for experiments individually.
|
|
"""
|
|
app_label = f'"app.kubernetes.io/name":"{self.app_name}"'
|
|
group_label = f'"app.kubernetes.io/part-of":"{group_id}"'
|
|
experiment_label = f'"app.kubernetes.io/instance":"{experiment_id}"'
|
|
|
|
def line_predicate(line: str) -> bool:
|
|
return (
|
|
app_label in line
|
|
and group_label in line
|
|
and (experiment_id is None or experiment_label in line)
|
|
)
|
|
|
|
logs = (
|
|
self._sorted_logs(line_predicate)
|
|
if self.sorted
|
|
else self._unsorted_logs(line_predicate)
|
|
)
|
|
for line in logs:
|
|
try:
|
|
parsed = json.loads(line)
|
|
except JSONDecodeError as err:
|
|
logger.error(
|
|
f"Failed to parse line from vector from source {line}", err
|
|
)
|
|
continue
|
|
|
|
k8s = parsed["kubernetes"]
|
|
yield (
|
|
k8s["pod_labels"]["app.kubernetes.io/instance"],
|
|
k8s["pod_name"],
|
|
parsed["message"],
|
|
)
|
|
|
|
def __str__(self):
|
|
return f"VectorFlatFileSource({self.app_name})"
|