344 lines
11 KiB
Python
Raw Normal View History

import argparse
import logging
2024-11-28 15:15:05 -03:00
import sys
from pathlib import Path
2024-12-11 15:43:52 -03:00
from typing import Dict
2024-11-28 15:15:05 -03:00
import uvicorn
from elasticsearch import Elasticsearch
from pydantic import IPvAnyAddress
2024-11-28 15:15:05 -03:00
from pydantic_core import ValidationError
from typing_extensions import TypeVar
2024-11-28 15:15:05 -03:00
from benchmarks.core.agent import AgentBuilder
from benchmarks.core.config import ConfigParser, Builder
2025-01-17 08:34:49 -03:00
from benchmarks.core.experiments.experiments import Experiment, ExperimentBuilder
from benchmarks.deluge.agent.api import DelugeAgentConfig
from benchmarks.deluge.config import DelugeExperimentConfig
from benchmarks.deluge.logging import DelugeTorrentDownload
from benchmarks.logging.logging import (
2024-12-14 06:34:11 -03:00
basic_log_parser,
LogSplitter,
LogEntry,
LogSplitterFormats,
)
from benchmarks.logging.sources.logstash import LogstashSource
from benchmarks.logging.sources.sources import (
FSOutputManager,
split_logs_in_source,
LogSource,
)
2025-01-21 08:42:26 -03:00
from benchmarks.logging.sources.vector_flat_file import VectorFlatFileSource
2024-11-28 15:15:05 -03:00
# Config parsers: each concrete experiment/agent config type must be
# registered before the corresponding YAML/JSON can be parsed.
experiment_config_parser = ConfigParser[ExperimentBuilder]()
experiment_config_parser.register(DelugeExperimentConfig)

agent_config_parser = ConfigParser[AgentBuilder]()
agent_config_parser.register(DelugeAgentConfig)

# Log parser knows about torrent-download events plus experiment configs,
# which are logged as structured entries (DECLogEntry) so a run can later
# be matched back to its parameters.
log_parser = basic_log_parser()
log_parser.register(DelugeTorrentDownload)

DECLogEntry = LogEntry.adapt(DelugeExperimentConfig)
log_parser.register(DECLogEntry)

logger = logging.getLogger(__name__)

def cmd_list_experiment(experiments: Dict[str, ExperimentBuilder[Experiment]], _):
    """Print the name of every experiment declared in the config file."""
    print("Available experiments are:")
    for name in experiments:
        print(f" - {name}")

def cmd_run_experiment(experiments: Dict[str, ExperimentBuilder[Experiment]], args):
    """Build and run the experiment named in ``args.experiment``.

    Exits the process with -1 when the name is not present in the config.
    """
    builder = experiments.get(args.experiment)
    if builder is None:
        print(f"Experiment {args.experiment} not found.")
        sys.exit(-1)

    # Record the experiment's own config as a structured log entry before
    # running, so the run can be correlated with its parameters later.
    logger.info(DECLogEntry.adapt_instance(builder))
    builder.build().run()
    print(f"Experiment {args.experiment} completed successfully.")

def cmd_describe_experiment(args):
    """Print the JSON schema for one experiment type, or list all types."""
    known_types = experiment_config_parser.experiment_types
    if not args.type:
        print("Available experiment types are:")
        for name in known_types:
            print(f" - {name}")
        return
    print(known_types[args.type].schema_json(indent=2))

def cmd_parse_single_log(log: Path, output: Path):
    """Split one raw log file into per-event-type files under *output*.

    Exits with -1 if the input log or the parent of the output folder
    does not exist.
    """
    if not log.exists():
        print(f"Log file {log} does not exist.")
        sys.exit(-1)
    if not output.parent.exists():
        print(f"Folder {output.parent} does not exist.")
        sys.exit(-1)
    output.mkdir(exist_ok=True)

    def open_output(event_type: str, format: LogSplitterFormats):
        # One output file per event type, named "<event>.<format>".
        return (output / f"{event_type}.{format.value}").open("w", encoding="utf-8")

    with (
        log.open("r", encoding="utf-8") as istream,
        LogSplitter(open_output) as splitter,
    ):
        # Experiment-config entries are emitted as JSONL rather than the
        # splitter's default format.
        splitter.set_format(DECLogEntry, LogSplitterFormats.jsonl)
        splitter.split(log_parser.parse(istream))

def cmd_split_log_source(source: LogSource, group_id: str, output_dir: Path):
    """Split every log in *source* for one experiment group into files.

    Exits with -1 when the parent of *output_dir* does not exist.
    """
    if not output_dir.parent.exists():
        print(f"Folder {output_dir.parent} does not exist.")
        sys.exit(-1)
    output_dir.mkdir(exist_ok=True)

    with source as log_source, FSOutputManager(output_dir) as output_manager:
        split_logs_in_source(
            log_source,
            log_parser,
            output_manager,
            group_id,
            # Experiment-config entries go out as JSONL.
            formats=[(DECLogEntry, LogSplitterFormats.jsonl)],
        )

def cmd_dump_single_experiment(source: LogSource, group_id: str, experiment_id: str):
    """Dump every raw log line of a single experiment to stdout."""
    with source as log_source:
        entries = log_source.logs(group_id=group_id, experiment_id=experiment_id)
        for _, node_id, raw_line in entries:
            sys.stdout.write(f"[[{node_id}]] {raw_line}\n")

def cmd_run_agent(agents: Dict[str, AgentBuilder], args):
    """Start a uvicorn server running the agent named in ``args.agent``.

    Exits with -1 when the agent name is not present in the parsed config.
    """
    if args.agent not in agents:
        # FIX: the original interpolated args.experiment, which is not defined
        # on the "agent" subcommand's namespace and raised AttributeError
        # instead of printing this error.
        print(f"Agent type {args.agent} not found.")
        sys.exit(-1)

    uvicorn.run(
        agents[args.agent].build(),
        host=str(args.host),
        port=args.port,
        reload=False,
        workers=1,
    )

T = TypeVar("T")


def _parse_config(
    config: Path, parser: ConfigParser[Builder[T]]
) -> Dict[str, Builder[T]]:
    """Parse *config* with *parser*, exiting the process on any error.

    Returns a mapping from entry name to its builder. Exits with -1 if the
    file is missing or fails validation.
    """
    if not config.exists():
        print(f"Config file {config} does not exist.")
        sys.exit(-1)

    try:
        with config.open(encoding="utf-8") as infile:
            return parser.parse(infile)
    except ValidationError as e:
        # Opening the file cannot raise ValidationError, so wrapping the
        # whole "with" is equivalent to the narrower original form.
        print("There were errors parsing the config file.")
        for error in e.errors():
            print(f' - {error["loc"]}: {error["msg"]} {error["input"]}')
        sys.exit(-1)

def _configure_logstash_source(args, structured_only=True):
    """Build a LogstashSource from the Elasticsearch URL in *args*."""
    import urllib3

    # Cert verification is disabled on purpose below, so silence urllib3's
    # insecure-request warnings.
    urllib3.disable_warnings()

    client = Elasticsearch(args.es_url, verify_certs=False)
    return LogstashSource(
        client,
        chronological=args.chronological,
        structured_only=structured_only,
        slices=args.slices,
    )

def _configure_vector_source(args):
    """Build a VectorFlatFileSource reading from ``args.source_file``.

    Exits with -1 when the file does not exist. The opened file handle is
    handed off to (and owned by) the source.
    """
    source_file = args.source_file
    if not source_file.exists():
        print(f"Log source file {source_file} does not exist.")
        sys.exit(-1)

    return VectorFlatFileSource(
        app_name="codex-benchmarks", file=source_file.open(encoding="utf-8")
    )

def _init_logging():
    """Configure root logging at INFO level with a timestamped format.

    Note: ``logging.basicConfig`` is a no-op if the root logger already has
    handlers configured.
    """
    # FIX: dropped the redundant function-local "import logging" — the
    # module already imports logging at the top of the file.
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    )

def main():
    """CLI entry point: build the argparse tree and dispatch to a command."""
    # TODO this is getting unwieldy, need pull this apart into submodules. For now we get away
    # with title comments.
    parser = argparse.ArgumentParser()
    commands = parser.add_subparsers(required=True)

    ###########################################################################
    # Experiments                                                             #
    ###########################################################################
    experiments = commands.add_parser(
        "experiments", help="List or run experiments in config file."
    )
    experiments.add_argument(
        "config", type=Path, help="Path to the experiment configuration file."
    )
    experiment_commands = experiments.add_subparsers(required=True)

    list_cmd = experiment_commands.add_parser(
        "list", help="Lists available experiments."
    )
    list_cmd.set_defaults(
        # FIX: _parse_config requires the config parser as its second
        # argument; the original omitted it (cf. the "run" lambda below),
        # so "experiments <cfg> list" raised TypeError at runtime.
        func=lambda args: cmd_list_experiment(
            _parse_config(args.config, experiment_config_parser), args
        )
    )

    run_cmd = experiment_commands.add_parser("run", help="Runs an experiment")
    run_cmd.add_argument("experiment", type=str, help="Name of the experiment to run.")
    run_cmd.set_defaults(
        func=lambda args: cmd_run_experiment(
            _parse_config(args.config, experiment_config_parser), args
        )
    )

    describe_cmd = commands.add_parser(
        "describe", help="Shows the JSON schema for the various experiment types."
    )
    describe_cmd.add_argument(
        "type",
        type=str,
        help="Type of the experiment to describe.",
        choices=experiment_config_parser.experiment_types.keys(),
        nargs="?",
    )
    describe_cmd.set_defaults(func=cmd_describe_experiment)

    ###########################################################################
    # Logs                                                                    #
    ###########################################################################
    logs_cmd = commands.add_parser("logs", help="Parse logs.")
    log_subcommands = logs_cmd.add_subparsers(required=True)

    single_log_cmd = log_subcommands.add_parser(
        "single", help="Parse a single log file."
    )
    single_log_cmd.add_argument("log", type=Path, help="Path to the log file.")
    single_log_cmd.add_argument(
        "output_dir", type=Path, help="Path to an output folder."
    )
    single_log_cmd.set_defaults(
        func=lambda args: cmd_parse_single_log(args.log, args.output_dir)
    )

    log_source_cmd = log_subcommands.add_parser(
        "source", help="Parse logs from a log source."
    )
    log_source_cmd.add_argument(
        "group_id", type=str, help="ID of experiment group to parse."
    )

    single_or_split = log_source_cmd.add_mutually_exclusive_group(required=True)
    single_or_split.add_argument(
        "--experiment-id",
        type=str,
        help="Dumps logs for a single experiment onto stdout.",
    )
    single_or_split.add_argument(
        "--output-dir",
        type=Path,
        help="Splits logs for the entire group into the specified folder.",
    )
    # args.source is installed by the source-type subparsers below; its second
    # argument selects structured-only parsing (False when dumping raw lines).
    single_or_split.set_defaults(
        func=lambda args: cmd_dump_single_experiment(
            args.source(args, False), args.group_id, args.experiment_id
        )
        if args.experiment_id
        else cmd_split_log_source(
            args.source(args, True), args.group_id, args.output_dir
        )
    )

    source_type = log_source_cmd.add_subparsers(required=True)

    es_source = source_type.add_parser("logstash", help="Logstash source.")
    es_source.add_argument(
        "es_url", type=str, help="URL to a logstash Elasticsearch instance."
    )
    es_source.add_argument(
        "--chronological", action="store_true", help="Sort logs chronologically."
    )
    es_source.add_argument(
        "--slices",
        type=int,
        help="Number of scroll slices to use when reading the log.",
        default=2,
    )
    es_source.set_defaults(
        source=lambda args, structured_only: _configure_logstash_source(
            args, structured_only=structured_only
        )
    )

    vector_source = source_type.add_parser("vector", help="Vector flat file source.")
    vector_source.add_argument(
        "source_file", type=Path, help="Vector log file to parse from."
    )
    vector_source.set_defaults(source=lambda args, _: _configure_vector_source(args))

    ###########################################################################
    # Agents                                                                  #
    ###########################################################################
    agent_cmd = commands.add_parser("agent", help="Starts a local agent.")
    agent_cmd.add_argument(
        "config", type=Path, help="Path to the agent configuration file."
    )
    agent_cmd.add_argument("agent", type=str, help="Name of the agent to run.")
    agent_cmd.add_argument(
        "--host",
        type=IPvAnyAddress,
        help="IP address to bind to.",
        default=IPvAnyAddress("0.0.0.0"),
    )
    agent_cmd.add_argument(
        "--port", type=int, help="Port to listen to connections.", default=9001
    )
    agent_cmd.set_defaults(
        func=lambda args: cmd_run_agent(
            _parse_config(args.config, agent_config_parser), args
        )
    )

    args = parser.parse_args()
    _init_logging()
    args.func(args)

# Script entry point: only dispatch to main() when executed directly.
if __name__ == "__main__":
    main()