From 77ef4d1aa2d877d883b0a4c201af7dd1a3062855 Mon Sep 17 00:00:00 2001 From: gmega Date: Tue, 17 Jun 2025 20:57:03 -0300 Subject: [PATCH] feat: add halt all processes on single failure --- src/process_monitor.bash | 78 ++++++++++++++++++++++++++++------ test/test_process_monitor.bats | 56 +++++++++++++++++++----- 2 files changed, 110 insertions(+), 24 deletions(-) diff --git a/src/process_monitor.bash b/src/process_monitor.bash index 989a524..3b3f92c 100644 --- a/src/process_monitor.bash +++ b/src/process_monitor.bash @@ -9,6 +9,7 @@ source "${LIB_SRC}/utils.bash" _procmon_output=$(clh_output_folder "procmon") _procmon_pid="" +_procmon_stop_mode="" _procmon_init_output() { rm -rf "${_procmon_output}" || true @@ -16,13 +17,14 @@ _procmon_init_output() { } clh_start_process_monitor() { - _procmon_init_output - if [ -n "$_procmon_pid" ]; then echoerr "[procmon] process monitor already started" return 1 fi + _procmon_init_output + _procmon_stop_mode="$1" + local pid=$$ _pgid=$(ps -o pgid= -p ${pid} | sed 's/ //g') export _pgid @@ -31,16 +33,30 @@ clh_start_process_monitor() { echoerr "[procmon] start" ( - shutdown=false - while ! $shutdown; do + _procmon_pid=${BASHPID} + while true; do clh_get_tracked_pids for pid in "${result[@]}"; do - if ! kill -0 "${pid}"; then - echoerr "[procmon] ${pid} is dead" - rm "${_procmon_output}/${pid}.pid" + if kill -0 "${pid}"; then + continue fi - sleep 1 + + exit_code=$(cat "${_procmon_output}/${pid}.pid") + if [ -z "$exit_code" ]; then + echoerr "[procmon] ${pid} died with unknown exit code. Aborting." + _clh_halt "halted_no_return" + fi + + if [ "$exit_code" -eq 0 ]; then + echoerr "[procmon] ${pid} died with exit code $exit_code." + rm "${_procmon_output}/${pid}.pid" + continue + fi + + echoerr "[procmon] ${pid} is dead with exit code $exit_code. Aborting." + _clh_halt "halted_process_failure" done + sleep 1 done ) & _procmon_pid=$! @@ -65,7 +81,30 @@ clh_get_tracked_pids() { done } -clh_stop_process_monitor() { +clh_monitor_state() { + # No process ID, process monitor never ran. + if [ -z "$_procmon_pid" ]; then + echo "halted" + return 0 + fi + + # Process ID is set and process is running. + if kill -0 "$_procmon_pid"; then + echo "running" + return 0 + fi + + if [ -f "${_procmon_output}/procmon_exit_code" ]; then + exit_code=$(cat "${_procmon_output}/procmon_exit_code") + echo "$exit_code" + return 0 + fi + + echo "error_no_exit_code" + return 1 +} + +_clh_halt() { if [ -z "$_procmon_pid" ]; then echoerr "[procmon] process monitor not started" return 1 @@ -76,13 +115,26 @@ clh_stop_process_monitor() { return 1 fi - if [ "$1" = "monitor_only" ]; then + echo "$1" > "${_procmon_output}/procmon_exit_code" + + if [ "$_procmon_stop_mode" = "kill_on_exit" ]; then + echoerr "[procmon] stop process group. This will halt the script." + kill -s TERM "-$_pgid" + else echoerr "[procmon] stop monitor only. Children will be left behind." kill -s TERM "$_procmon_pid" await "$_procmon_pid" return 0 - else - echoerr "[procmon] stop process group. This will halt the script." - kill -s TERM "-$_pgid" fi +} + +clh_stop_process_monitor() { + _clh_halt "halted" +} + +clh_exit() { + exit_code=$1 + echoerr "[procmon] $BASHPID exit with code $exit_code" + echo "$exit_code" > "${_procmon_output}/${BASHPID}.pid" + exit "$exit_code" } \ No newline at end of file diff --git a/test/test_process_monitor.bats b/test/test_process_monitor.bats index e58d649..b9c2a9d 100644 --- a/test/test_process_monitor.bats +++ b/test/test_process_monitor.bats @@ -9,9 +9,15 @@ setup() { } @test "should not start process monitor twice" { + assert_equal $(clh_monitor_state) "halted" + assert clh_start_process_monitor + assert_equal $(clh_monitor_state) "running" + refute clh_start_process_monitor - assert clh_stop_process_monitor "monitor_only" + + assert clh_stop_process_monitor + assert_equal $(clh_monitor_state) "halted" } @test "should not stop the process monitor if it wasn't started" { @@ -19,24 +25,25 @@ setup() { } @test "should keep track of process IDs" { - echo "hi" assert clh_start_process_monitor clh_get_tracked_pids assert [ ${#result[@]} -eq 0 ] ( - while true; do + while [ ! -f "${_procmon_output}/sync" ]; do sleep 0.1 done + clh_exit 0 ) & clh_track_last_background_job p1=$! ( - while true; do + while [ ! -f "${_procmon_output}/sync" ]; do sleep 0.1 done + clh_exit 0 ) & clh_track_last_background_job p2=$! @@ -44,22 +51,49 @@ setup() { clh_get_tracked_pids assert [ ${#result[@]} -eq 2 ] - kill -s TERM "$p1" - kill -s TERM "$p2" + touch "${_procmon_output}/sync" - echo "Kill issued" > killissued - - # This will hang the bats runner for some reason. await "$p1" await "$p2" # This should be more than enough for the process monitor to # catch the exits. The alternative would be implementing temporal # predicates. - sleep 3 + sleep 1 clh_get_tracked_pids assert [ ${#result[@]} -eq 0 ] - clh_stop_process_monitor "monitor_only" + clh_stop_process_monitor } + +@test "should stop the monitor and all other processes if one process fails" { + assert clh_start_process_monitor + + ( + while [ ! -f "${_procmon_output}/sync" ]; do + sleep 0.1 + done + clh_exit 1 + ) & + clh_track_last_background_job + p1=$! + + ( + while [ ! -f "${_procmon_output}/sync" ]; do + sleep 1 + done + clh_exit 0 + ) & + clh_track_last_background_job + p2=$! + + touch "${_procmon_output}/sync" + + await "$p1" + await "$p2" + + sleep 1 + + assert_equal $(clh_monitor_state) "halted_process_failure" +} \ No newline at end of file