feat: add Codex prometheus targets in experiments

This commit is contained in:
gmega 2025-06-23 17:27:21 -03:00
parent a0af4c4c90
commit 8cc5ee6896
No known key found for this signature in database
GPG Key ID: 6290D34EAD824B18
7 changed files with 65 additions and 25 deletions

View File

@ -2,9 +2,5 @@
LIB_SRC=${LIB_SRC:-$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)}
# shellcheck source=./src/utils.bash
source "${LIB_SRC}/utils.bash"
# shellcheck source=./src/procmon.bash
source "${LIB_SRC}/procmon.bash"
# shellcheck source=./src/codex.bash
source "${LIB_SRC}/codex.bash"
# shellcheck source=./src/experiment.bash
source "${LIB_SRC}/experiment.bash"

View File

@ -136,7 +136,7 @@ cdx_launch_node() {
cmd_array=()
IFS=' ' read -r -a cmd_array <<<"$codex_cmd"
pm_async "${cmd_array[@]}" -%- "codex"
pm_async "${cmd_array[@]}" -%- "codex" "${node_index}"
_cdx_pids[$node_index]=$!
cdx_ensure_ready "$node_index"

View File

@ -7,6 +7,8 @@ LIB_SRC=${LIB_SRC:-$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)}
source "${LIB_SRC}/utils.bash"
# shellcheck source=./src/codex.bash
source "${LIB_SRC}/codex.bash"
# shellcheck source=./src/prometheus.bash
source "${LIB_SRC}/prometheus.bash"
_experiment_type=""
_experiment_id=""
@ -14,7 +16,7 @@ _experiment_id=""
exp_start() {
local experiment_id experiment_type="$1"
experiment_id="${experiment_type}-$(date +%s)-${RANDOM}" || return 1
experiment_id="$(date +%s)-${RANDOM}" || return 1
# FIXME: this is pretty clumsy/confusing. We're "initing" the
# harness just so it sets the base output folder, and then
@ -28,4 +30,33 @@ exp_start() {
clh_init "${_clh_output}/${_experiment_id}" || return 1
cdx_add_defaultopts "--metrics"
pm_register_callback "codex" _codex_target_changed
}
_codex_target_changed() {
local event="$1"
if [ "$event" = "start" ]; then
shift 3
_add_target "$@"
elif [ "$event" = "exit" ]; then
shift 4
_remove_target "$@"
fi
}
_add_target() {
local node_index="$1" metrics_port
metrics_port=$(_cdx_metrics_port "$node_index") || return 1
prom_add "${metrics_port}" "${_experiment_type}" "${_experiment_id}"\
"${node_index}" "codex"
}
_remove_target() {
local node_index="$1" metrics_port
metrics_port=$(_cdx_metrics_port "$node_index") || return 1
prom_remove "${metrics_port}" "${_experiment_type}" "${_experiment_id}"\
"${node_index}" "codex"
}

View File

@ -215,8 +215,8 @@ pm_async() {
(
set +e
_pm_job_started "${BASHPID}" "$proc_type" "$@"
trap '_pm_job_exited "${BASHPID}" "$proc_type" "killed"' TERM
trap '_pm_job_exited "${BASHPID}" "$proc_type" "$?"' EXIT
trap '_pm_job_exited "${BASHPID}" "$proc_type" "killed" "$@"' TERM
trap '_pm_job_exited "${BASHPID}" "$proc_type" "$?" "$@"' EXIT
"${command[@]}"
) &
result=("$!")
@ -257,6 +257,8 @@ _pm_job_exited() {
proc_type=$2\
exit_code=$3
shift 3
local pid_file="${_pm_output}/${pid}.pid"
# If the process is not tracked, don't write down an exit code.
@ -265,7 +267,7 @@ _pm_job_exited() {
else
echo "$exit_code" > "$pid_file"
fi
_pm_invoke_callback "exit" "$proc_type" "$pid" "$exit_code"
_pm_invoke_callback "exit" "$proc_type" "$pid" "$exit_code" "$@"
}
pm_register_callback() {

View File

@ -27,7 +27,8 @@ echoerr() {
shift_arr () {
local -n arr_ref="$1"
arr_ref=("${arr_ref[@]:1}")
local shifts="${2:-1}"
arr_ref=("${arr_ref[@]:shifts}")
}
sha1() {

View File

@ -19,18 +19,25 @@ setup() {
assert [ -d "${_clh_output}" ]
}
# @test "should add a prometheus target for each Codex node when requested" {
# pm_start
@test "should launch Codex nodes with metrics enabled when there is an experiment in scope" {
exp_start "experiment-type"
# cdx_enable_prometheus "anexperiment" "84858"
[[ "$(cdx_cmdline 0)" =~ "--metrics-port=8290 --metrics-address=0.0.0.0" ]]
}
# cdx_launch_node 0
# config_file="${_prom_output}/8290-anexperiment-84858-node-1-codex.json"
# assert [ -f "$config_file" ]
@test "should add a prometheus target for each Codex node when there is an experiment in scope" {
exp_start "k-node"
# cdx_destroy_node 0
# assert [ ! -f "$config_file" ]
pm_start
cdx_launch_node 0
# pm_stop
# }
config_file="${_prom_output}/8290-k-node-${_experiment_id}-0-codex.json"
assert [ -f "$config_file" ]
cdx_destroy_node 0
assert [ ! -f "$config_file" ]
}
teardown() {
pm_stop
}

View File

@ -162,14 +162,16 @@ callback() {
if [ "$event" = "start" ]; then
shift 3
if [ "$#" -gt 0 ]; then
echo "$*" > "${_pm_output}/${pid}-${proc_type}-start-args"
fi
touch "${_pm_output}/${pid}-${proc_type}-start"
elif [ "$event" = "exit" ]; then
exit_code="$4"
shift 4
touch "${_pm_output}/${pid}-${proc_type}-${exit_code}-exit"
fi
if [ "$#" -gt 0 ]; then
echo "$*" > "${_pm_output}/${pid}-${proc_type}-${event}-args"
fi
}
@test "should call lifecycle callbacks when processes start and stop" {
@ -231,4 +233,5 @@ callback() {
await "$pid"
assert_equal "$(cat "${_pm_output}/${pid}-sleepy-start-args")" "arg1 arg2"
assert_equal "$(cat "${_pm_output}/${pid}-sleepy-exit-args")" "arg1 arg2"
}