feat: add recursive kill; improve procmon state and process handling

This commit is contained in:
gmega 2025-06-18 19:28:34 -03:00
parent 2d2c2dda6f
commit cfdea04de5
No known key found for this signature in database
GPG Key ID: 6290D34EAD824B18
5 changed files with 123 additions and 48 deletions

View File

@ -55,6 +55,7 @@ cdx_cmdline() {
return 1
fi
# shellcheck disable=SC2140
echo "${cdx_cmd}"\
" --log-file=${_cdx_logs}/codex-${node_index}.log --data-dir=${_cdx_data}/codex-${node_index}"\
" --api-port=${api_port} --disc-port=${disc_port} --loglevel=INFO"
@ -89,16 +90,14 @@ cdx_launch_node() {
}
cdx_ensure_ready() {
local node_index="$1" timeout=${2:-$_cdx_node_start_timeout} start now
start=$(date +%s)
local node_index="$1" timeout=${2:-$_cdx_node_start_timeout} start="${SECONDS}"
while true; do
if cdx_get_spr "$node_index"; then
echoerr "Codex node $node_index is ready."
return 0
fi
now=$(date +%s)
if (( now - start > timeout )); then
if (( SECONDS - start > timeout )); then
echoerr "Codex node $node_index did not start within ${timeout} seconds."
return 1
fi

View File

@ -1,5 +1,4 @@
#!/usr/bin/env bash
LIB_SRC=${LIB_SRC:-$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)}
# shellcheck source=./src/config.bash
@ -9,7 +8,6 @@ source "${LIB_SRC}/utils.bash"
_pm_output=$(clh_output_folder "pm")
_pm_pid=""
_pm_stop_mode=""
_pm_init_output() {
rm -rf "${_pm_output}" || true
@ -17,21 +15,12 @@ _pm_init_output() {
}
pm_start() {
if [ -n "$_pm_pid" ]; then
echoerr "[procmon] process monitor already started"
return 1
fi
_pm_assert_state_not "running" || return 1
_pm_init_output
_pm_stop_mode="$1"
local pid=$$
_pm_pgid=$(ps -o pgid= -p ${pid} | sed 's/ //g')
export _pm_pgid
export _pm_output
echoerr "[procmon] starting process monitor"
export _pm_output
(
_pm_pid=${BASHPID}
while true; do
@ -65,6 +54,8 @@ pm_start() {
}
pm_track_last_job() {
_pm_assert_state "running" || return 1
local pid=$!
if [ ! -f "${_pm_output}/${pid}.pid" ]; then
touch "${_pm_output}/${pid}.pid"
@ -72,6 +63,8 @@ pm_track_last_job() {
}
pm_known_pids() {
_pm_assert_state "running" || return 1
local base_name pid
result=()
for pid_file in "${_pm_output}"/*.pid; do
@ -106,27 +99,19 @@ pm_state() {
}
_pm_halt() {
if [ -z "$_pm_pid" ]; then
echoerr "[procmon] process monitor not started"
return 1
fi
_pm_assert_state "running" || return 1
if ! kill -0 "$_pm_pid"; then
echoerr "[procmon] process monitor not running"
return 1
fi
pm_known_pids
pids=("${result[@]}")
for pid in "${pids[@]}"; do
pm_kill_rec "${pid}"
done
echo "$1" > "${_pm_output}/pm_exit_code"
if [ "$_pm_stop_mode" = "kill_on_exit" ]; then
echoerr "[procmon] stop process group. This will halt the script."
kill -s TERM "-$_pm_pgid"
else
echoerr "[procmon] stop monitor only. Children will be left behind."
kill -s TERM "$_pm_pid"
await "$_pm_pid"
return 0
fi
# last but not least, harakiri
pm_kill_rec "$_pm_pid"
}
pm_stop() {
@ -135,7 +120,52 @@ pm_stop() {
pm_job_exit() {
exit_code=$1
echoerr "[procmon] $BASHPID exit with code $exit_code"
echo "$exit_code" > "${_pm_output}/${BASHPID}.pid"
exit "$exit_code"
}
}
pm_kill_rec() {
local parent="$1" descendant
pm_list_descendants "$parent"
for descendant in "${result[@]}"; do
echo "[procmon] killing process $descendant"
kill -s TERM "$descendant" 2> /dev/null || true
done
return 0
}
pm_list_descendants() {
result=()
_pm_list_descendants "$@"
}
_pm_list_descendants() {
local parent="$1"
result+=("${parent}")
for pid in $(ps -o pid --ppid "$parent" | tail -n +2 | tr -d ' '); do
_pm_list_descendants "$pid"
done
}
_pm_assert_state_not() {
local state="$1" current_state
current_state=$(pm_state)
if [ "$current_state" = "$state" ]; then
echoerr "[procmon] illegal state: $current_state"
return 1
fi
}
_pm_assert_state() {
local state="$1" current_state
current_state=$(pm_state)
if [ "$current_state" != "$state" ]; then
echoerr "[procmon] illegal state: $current_state"
return 1
fi
}

View File

@ -14,8 +14,13 @@ echoerr() {
}
await() {
local pid=$1
local pid=$1 timeout=${2:-30} start="${SECONDS}"
while kill -0 "$pid"; do
if ((SECONDS - start > timeout)); then
echoerr "Error: timeout waiting for process $pid to exit"
return 1
fi
sleep 0.1
done
return 0
}

View File

@ -50,14 +50,15 @@ setup() {
rm -rf "$data_dir"
}
#@test "should launch a Codex node" {
# export CLH_CODEX_BINARY="${BATS_TEST_DIRNAME}/codex/build/codex"
#
# assert cdx_launch_node 0
#
# # Node must be running and ready.
# assert [ ! -z "$(cdx_get_spr 0)" ]
# # We should see a log file and a data directory.
# assert [ -f "${_cdx_output}/logs/codex-0.log" ]
# assert [ -d "${_cdx_output}/data/codex-0" ]
#}
# @test "should launch a Codex node" {
# export CLH_CODEX_BINARY="${BATS_TEST_DIRNAME}/codex/build/codex"
# assert cdx_launch_node 0
# assert cdx_ensure_ready 0 1
# # We should see a log file and a data directory.
# assert [ -f "${_cdx_output}/logs/codex-0.log" ]
# assert [ -d "${_cdx_output}/data/codex-0" ]
# cdx_destroy_node 0
# }

View File

@ -5,6 +5,46 @@ setup() {
source "${LIB_SRC}/procmon.bash"
}
@test "should kill processes recursively" {
# Note that this is fragile. We need to structure
# the process tree such that the parent does not exit
# before the child, or the child will be reparented to
# init and won't be killed. The defensive thing to do
# here is to wait on any child you'd like cleaned before
# exiting its parent subshell.
(
# each backgrounded process generates two processes:
# the process itself, and its subshell.
sleep 500 &
sl1=$!
(
sleep 500 &
await $!
) &
sh1=$!
(
sleep 500 &
await $!
) &
sh2=$!
await $sl1
await $sh1
await $sh2
) &
parent=$!
pm_list_descendants "$parent"
assert_equal "${#result[@]}" 9
pm_kill_rec "$parent"
await "$parent" 5
pm_list_descendants "$parent"
# the parent will still show amongst its descendants,
# even though it is dead.
assert_equal "${#result[@]}" 1
}
@test "should not start process monitor twice" {
assert_equal $(pm_state) "halted"