From cfdea04de5772f43a7197675df72241ccd166d6c Mon Sep 17 00:00:00 2001 From: gmega Date: Wed, 18 Jun 2025 19:28:34 -0300 Subject: [PATCH] feat: add recursive kill; improve procmon state and process handling --- src/codex.bash | 7 ++-- src/procmon.bash | 94 ++++++++++++++++++++++++++++-------------- src/utils.bash | 7 +++- test/test_codex.bats | 23 ++++++----- test/test_procmon.bats | 40 ++++++++++++++++++ 5 files changed, 123 insertions(+), 48 deletions(-) diff --git a/src/codex.bash b/src/codex.bash index c2370ff..91cc1fe 100644 --- a/src/codex.bash +++ b/src/codex.bash @@ -55,6 +55,7 @@ cdx_cmdline() { return 1 fi + # shellcheck disable=SC2140 echo "${cdx_cmd}"\ " --log-file=${_cdx_logs}/codex-${node_index}.log --data-dir=${_cdx_data}/codex-${node_index}"\ " --api-port=${api_port} --disc-port=${disc_port} --loglevel=INFO" @@ -89,16 +90,14 @@ cdx_launch_node() { } cdx_ensure_ready() { - local node_index="$1" timeout=${2:-$_cdx_node_start_timeout} start now - start=$(date +%s) + local node_index="$1" timeout=${2:-$_cdx_node_start_timeout} start="${SECONDS}" while true; do if cdx_get_spr "$node_index"; then echoerr "Codex node $node_index is ready." return 0 fi - now=$(date +%s) - if (( now - start > timeout )); then + if (( SECONDS - start > timeout )); then echoerr "Codex node $node_index did not start within ${timeout} seconds." return 1 fi diff --git a/src/procmon.bash b/src/procmon.bash index 34ef0f0..d62b837 100644 --- a/src/procmon.bash +++ b/src/procmon.bash @@ -1,5 +1,4 @@ #!/usr/bin/env bash - LIB_SRC=${LIB_SRC:-$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)} # shellcheck source=./src/config.bash @@ -9,7 +8,6 @@ source "${LIB_SRC}/utils.bash" _pm_output=$(clh_output_folder "pm") _pm_pid="" -_pm_stop_mode="" _pm_init_output() { rm -rf "${_pm_output}" || true @@ -17,21 +15,12 @@ _pm_init_output() { } pm_start() { - if [ -n "$_pm_pid" ]; then - echoerr "[procmon] process monitor already started" - return 1 - fi - + _pm_assert_state_not "running" || return 1 _pm_init_output - _pm_stop_mode="$1" - - local pid=$$ - _pm_pgid=$(ps -o pgid= -p ${pid} | sed 's/ //g') - export _pm_pgid - export _pm_output echoerr "[procmon] starting process monitor" + export _pm_output ( _pm_pid=${BASHPID} while true; do @@ -65,6 +54,8 @@ pm_start() { } pm_track_last_job() { + _pm_assert_state "running" || return 1 + local pid=$! if [ ! -f "${_pm_output}/${pid}.pid" ]; then touch "${_pm_output}/${pid}.pid" @@ -72,6 +63,8 @@ pm_track_last_job() { } pm_known_pids() { + _pm_assert_state "running" || return 1 + local base_name pid result=() for pid_file in "${_pm_output}"/*.pid; do @@ -106,27 +99,19 @@ pm_state() { } _pm_halt() { - if [ -z "$_pm_pid" ]; then - echoerr "[procmon] process monitor not started" - return 1 - fi + _pm_assert_state "running" || return 1 - if ! kill -0 "$_pm_pid"; then - echoerr "[procmon] process monitor not running" - return 1 - fi + pm_known_pids + pids=("${result[@]}") + + for pid in "${pids[@]}"; do + pm_kill_rec "${pid}" + done echo "$1" > "${_pm_output}/pm_exit_code" - if [ "$_pm_stop_mode" = "kill_on_exit" ]; then - echoerr "[procmon] stop process group. This will halt the script." - kill -s TERM "-$_pm_pgid" - else - echoerr "[procmon] stop monitor only. Children will be left behind." - kill -s TERM "$_pm_pid" - await "$_pm_pid" - return 0 - fi + # last but not least, harakiri + pm_kill_rec "$_pm_pid" } pm_stop() { @@ -135,7 +120,52 @@ pm_stop() { pm_job_exit() { exit_code=$1 - echoerr "[procmon] $BASHPID exit with code $exit_code" echo "$exit_code" > "${_pm_output}/${BASHPID}.pid" exit "$exit_code" -} \ No newline at end of file +} + +pm_kill_rec() { + local parent="$1" descendant + + pm_list_descendants "$parent" + for descendant in "${result[@]}"; do + echo "[procmon] killing process $descendant" + kill -s TERM "$descendant" 2> /dev/null || true + done + + return 0 +} + +pm_list_descendants() { + result=() + _pm_list_descendants "$@" +} + +_pm_list_descendants() { + local parent="$1" + result+=("${parent}") + + for pid in $(ps -o pid --ppid "$parent" | tail -n +2 | tr -d ' '); do + _pm_list_descendants "$pid" + done +} + +_pm_assert_state_not() { + local state="$1" current_state + current_state=$(pm_state) + + if [ "$current_state" = "$state" ]; then + echoerr "[procmon] illegal state: $current_state" + return 1 + fi +} + +_pm_assert_state() { + local state="$1" current_state + current_state=$(pm_state) + + if [ "$current_state" != "$state" ]; then + echoerr "[procmon] illegal state: $current_state" + return 1 + fi +} diff --git a/src/utils.bash b/src/utils.bash index da78c52..d9c9fb9 100644 --- a/src/utils.bash +++ b/src/utils.bash @@ -14,8 +14,13 @@ echoerr() { } await() { - local pid=$1 + local pid=$1 timeout=${2:-30} start="${SECONDS}" while kill -0 "$pid"; do + if ((SECONDS - start > timeout)); then + echoerr "Error: timeout waiting for process $pid to exit" + return 1 + fi sleep 0.1 done + return 0 } \ No newline at end of file diff --git a/test/test_codex.bats b/test/test_codex.bats index 51d31d1..257d155 100755 --- a/test/test_codex.bats +++ b/test/test_codex.bats @@ -50,14 +50,15 @@ setup() { rm -rf "$data_dir" } -#@test "should launch a Codex node" { -# export CLH_CODEX_BINARY="${BATS_TEST_DIRNAME}/codex/build/codex" -# -# assert cdx_launch_node 0 -# -# # Node must be running and ready. -# assert [ ! -z "$(cdx_get_spr 0)" ] -# # We should see a log file and a data directory. -# assert [ -f "${_cdx_output}/logs/codex-0.log" ] -# assert [ -d "${_cdx_output}/data/codex-0" ] -#} \ No newline at end of file +# @test "should launch a Codex node" { +# export CLH_CODEX_BINARY="${BATS_TEST_DIRNAME}/codex/build/codex" + +# assert cdx_launch_node 0 +# assert cdx_ensure_ready 0 1 + +# # We should see a log file and a data directory. +# assert [ -f "${_cdx_output}/logs/codex-0.log" ] +# assert [ -d "${_cdx_output}/data/codex-0" ] + +# cdx_destroy_node 0 +# } \ No newline at end of file diff --git a/test/test_procmon.bats b/test/test_procmon.bats index 96a6a9f..5029d22 100644 --- a/test/test_procmon.bats +++ b/test/test_procmon.bats @@ -5,6 +5,46 @@ setup() { source "${LIB_SRC}/procmon.bash" } +@test "should kill processes recursively" { + # Note that this is fragile. We need to structure + # the process tree such that the parent does not exit + # before the child, or the child will be reparented to + # init and won't be killed. The defensive thing to do + # here is to wait on any child you'd like cleaned before + # exiting its parent subshell. + ( + # each backgrounded process generates two processes: + # the process itself, and its subshell. + sleep 500 & + sl1=$! + ( + sleep 500 & + await $! + ) & + sh1=$! + ( + sleep 500 & + await $! + ) & + sh2=$! + await $sl1 + await $sh1 + await $sh2 + ) & + parent=$! + + pm_list_descendants "$parent" + assert_equal "${#result[@]}" 9 + + pm_kill_rec "$parent" + await "$parent" 5 + + pm_list_descendants "$parent" + # the parent will still show amongst its descendants, + # even though it is dead. + assert_equal "${#result[@]}" 1 +} + @test "should not start process monitor twice" { assert_equal $(pm_state) "halted"