feat: halt all processes when a single process fails

This commit is contained in:
gmega 2025-06-17 20:57:03 -03:00
parent 0c6f2b3b34
commit 77ef4d1aa2
No known key found for this signature in database
GPG Key ID: 6290D34EAD824B18
2 changed files with 110 additions and 24 deletions

View File

@ -9,6 +9,7 @@ source "${LIB_SRC}/utils.bash"
# Folder holding the process monitor's state files: clh_exit writes one
# "<pid>.pid" file per exiting process there, and the halt reason is stored
# in "procmon_exit_code".
_procmon_output=$(clh_output_folder "procmon")
# PID of the background monitor subshell; empty means the monitor was never
# started.
_procmon_pid=""
# Stop mode, taken from the first argument of clh_start_process_monitor.
# NOTE(review): "kill_on_exit" appears to make a halt signal the whole process
# group instead of only the monitor - confirm against _clh_halt.
_procmon_stop_mode=""
_procmon_init_output() {
rm -rf "${_procmon_output}" || true
@ -16,13 +17,14 @@ _procmon_init_output() {
}
clh_start_process_monitor() {
_procmon_init_output
if [ -n "$_procmon_pid" ]; then
echoerr "[procmon] process monitor already started"
return 1
fi
_procmon_init_output
_procmon_stop_mode="$1"
local pid=$$
_pgid=$(ps -o pgid= -p ${pid} | sed 's/ //g')
export _pgid
@ -31,16 +33,30 @@ clh_start_process_monitor() {
echoerr "[procmon] start"
(
shutdown=false
while ! $shutdown; do
_procmon_pid=${BASHPID}
while true; do
clh_get_tracked_pids
for pid in "${result[@]}"; do
if ! kill -0 "${pid}"; then
echoerr "[procmon] ${pid} is dead"
rm "${_procmon_output}/${pid}.pid"
if kill -0 "${pid}"; then
continue
fi
sleep 1
exit_code=$(cat "${_procmon_output}/${pid}.pid")
if [ -z "$exit_code" ]; then
echoerr "[procmon] ${pid} died with unknown exit code. Aborting."
_clh_halt "halted_no_return"
fi
if [ "$exit_code" -eq 0 ]; then
echoerr "[procmon] ${pid} died with exit code $exit_code."
rm "${_procmon_output}/${pid}.pid"
continue
fi
echoerr "[procmon] ${pid} is dead with exit code $exit_code. Aborting."
_clh_halt "halted_process_failure"
done
sleep 1
done
) &
_procmon_pid=$!
@ -65,7 +81,30 @@ clh_get_tracked_pids() {
done
}
clh_stop_process_monitor() {
# Reports the current state of the process monitor on stdout.
#
# Globals:
#   _procmon_pid    (read) PID of the monitor; empty if it never ran.
#   _procmon_output (read) folder containing the "procmon_exit_code" file.
# Outputs (stdout), in order of precedence:
#   "halted"              the monitor was never started;
#   "running"             the monitor process can still be signalled;
#   <file contents>       the monitor is gone; this is whatever was recorded
#                         in "procmon_exit_code";
#   "error_no_exit_code"  the monitor is gone and left no record.
# Returns:
#   1 when "error_no_exit_code" is printed, 0 otherwise.
clh_monitor_state() {
  # Keep these local so a direct (non-subshell) call does not clobber the
  # caller's variables of the same name.
  local exit_code
  local exit_code_file="${_procmon_output}/procmon_exit_code"

  # No process ID, process monitor never ran.
  if [ -z "$_procmon_pid" ]; then
    echo "halted"
    return 0
  fi

  # Process ID is set and process is running. Signal 0 only probes for
  # existence; a missing process is an expected outcome here, so the
  # diagnostic kill would print is discarded.
  if kill -0 "$_procmon_pid" 2> /dev/null; then
    echo "running"
    return 0
  fi

  # The monitor is gone: report the reason it recorded before stopping.
  if [ -f "$exit_code_file" ]; then
    exit_code=$(cat "$exit_code_file")
    echo "$exit_code"
    return 0
  fi

  echo "error_no_exit_code"
  return 1
}
_clh_halt() {
if [ -z "$_procmon_pid" ]; then
echoerr "[procmon] process monitor not started"
return 1
@ -76,13 +115,26 @@ clh_stop_process_monitor() {
return 1
fi
if [ "$1" = "monitor_only" ]; then
echo "$1" > "${_procmon_output}/procmon_exit_code"
if [ "$_procmon_stop_mode" = "kill_on_exit" ]; then
echoerr "[procmon] stop process group. This will halt the script."
kill -s TERM "-$_pgid"
else
echoerr "[procmon] stop monitor only. Children will be left behind."
kill -s TERM "$_procmon_pid"
await "$_procmon_pid"
return 0
else
echoerr "[procmon] stop process group. This will halt the script."
kill -s TERM "-$_pgid"
fi
}
# Public entry point for stopping the process monitor: delegates to _clh_halt
# with the reason "halted". Arguments given to this function are ignored, and
# the return status is whatever _clh_halt returns.
clh_stop_process_monitor() {
  _clh_halt 'halted'
  return "$?"
}
# Terminates the calling (sub)shell with the given exit code, first recording
# that code in "<BASHPID>.pid" inside the process monitor's output folder.
#
# Globals:
#   _procmon_output (read)    folder that receives the ".pid" file.
#   exit_code       (written) set to the requested exit code.
# Arguments:
#   $1 - exit code to record and exit with.
clh_exit() {
  exit_code="${1}"
  local status_file="${_procmon_output}/${BASHPID}.pid"

  echoerr "[procmon] $BASHPID exit with code $exit_code"
  # Written before exiting so the code is on disk once this process is gone.
  echo "${exit_code}" > "${status_file}"
  exit "${exit_code}"
}

View File

@ -9,9 +9,15 @@ setup() {
}
# Checks that a second clh_start_process_monitor call is refused while the
# monitor is running, and that the reported state goes
# "halted" -> "running" -> "halted" around start and stop.
@test "should not start process monitor twice" {
# Before the first start the monitor reports "halted".
assert_equal $(clh_monitor_state) "halted"
assert clh_start_process_monitor
assert_equal $(clh_monitor_state) "running"
# Starting again while the monitor is running must fail.
refute clh_start_process_monitor
# NOTE(review): two consecutive stop calls are shown here; this looks like the
# pre-change and post-change line of a diff displayed together - confirm which
# one belongs in the file.
assert clh_stop_process_monitor "monitor_only"
assert clh_stop_process_monitor
assert_equal $(clh_monitor_state) "halted"
}
@test "should not stop the process monitor if it wasn't started" {
@ -19,24 +25,25 @@ setup() {
}
@test "should keep track of process IDs" {
echo "hi"
assert clh_start_process_monitor
clh_get_tracked_pids
assert [ ${#result[@]} -eq 0 ]
(
while true; do
while [ ! -f "${_procmon_output}/sync" ]; do
sleep 0.1
done
clh_exit 0
) &
clh_track_last_background_job
p1=$!
(
while true; do
while [ ! -f "${_procmon_output}/sync" ]; do
sleep 0.1
done
clh_exit 0
) &
clh_track_last_background_job
p2=$!
@ -44,22 +51,49 @@ setup() {
clh_get_tracked_pids
assert [ ${#result[@]} -eq 2 ]
kill -s TERM "$p1"
kill -s TERM "$p2"
touch "${_procmon_output}/sync"
echo "Kill issued" > killissued
# This will hang the bats runner for some reason.
await "$p1"
await "$p2"
# This should be more than enough for the process monitor to
# catch the exits. The alternative would be implementing temporal
# predicates.
sleep 3
sleep 1
clh_get_tracked_pids
assert [ ${#result[@]} -eq 0 ]
clh_stop_process_monitor "monitor_only"
clh_stop_process_monitor
}
# Scenario: two tracked background jobs wait for a "sync" file to appear; the
# first then exits with code 1 and the second with code 0. The monitor is
# expected to end up in the "halted_process_failure" state.
# NOTE(review): only the monitor's state is asserted here; nothing checks that
# the other process was actually stopped - confirm whether that is intended.
@test "should stop the monitor and all other processes if one process fails" {
  assert clh_start_process_monitor

  # Failing job: exits with a non-zero code once released.
  (
    while [ ! -f "${_procmon_output}/sync" ]; do
      sleep 0.1
    done
    clh_exit 1
  ) &
  clh_track_last_background_job
  p1=$!

  # Healthy job: exits with code 0 once released.
  (
    while [ ! -f "${_procmon_output}/sync" ]; do
      sleep 1
    done
    clh_exit 0
  ) &
  clh_track_last_background_job
  p2=$!

  # Release both jobs at once.
  touch "${_procmon_output}/sync"

  await "$p1"
  await "$p2"

  # Give the monitor a moment to record the halt reason.
  sleep 1

  # Quoted so an empty or multi-word state is compared as a single argument
  # instead of being word-split into assert_equal's parameter list.
  assert_equal "$(clh_monitor_state)" "halted_process_failure"
}