mirror of
https://github.com/logos-storage/logos-storage-local-harness.git
synced 2026-01-02 13:33:11 +00:00
feat: add halt all processes on single failure
This commit is contained in:
parent
0c6f2b3b34
commit
77ef4d1aa2
@ -9,6 +9,7 @@ source "${LIB_SRC}/utils.bash"
|
||||
|
||||
_procmon_output=$(clh_output_folder "procmon")
|
||||
_procmon_pid=""
|
||||
_procmon_stop_mode=""
|
||||
|
||||
_procmon_init_output() {
|
||||
rm -rf "${_procmon_output}" || true
|
||||
@ -16,13 +17,14 @@ _procmon_init_output() {
|
||||
}
|
||||
|
||||
clh_start_process_monitor() {
|
||||
_procmon_init_output
|
||||
|
||||
if [ -n "$_procmon_pid" ]; then
|
||||
echoerr "[procmon] process monitor already started"
|
||||
return 1
|
||||
fi
|
||||
|
||||
_procmon_init_output
|
||||
_procmon_stop_mode="$1"
|
||||
|
||||
local pid=$$
|
||||
_pgid=$(ps -o pgid= -p ${pid} | sed 's/ //g')
|
||||
export _pgid
|
||||
@ -31,16 +33,30 @@ clh_start_process_monitor() {
|
||||
echoerr "[procmon] start"
|
||||
|
||||
(
|
||||
shutdown=false
|
||||
while ! $shutdown; do
|
||||
_procmon_pid=${BASHPID}
|
||||
while true; do
|
||||
clh_get_tracked_pids
|
||||
for pid in "${result[@]}"; do
|
||||
if ! kill -0 "${pid}"; then
|
||||
echoerr "[procmon] ${pid} is dead"
|
||||
rm "${_procmon_output}/${pid}.pid"
|
||||
if kill -0 "${pid}"; then
|
||||
continue
|
||||
fi
|
||||
sleep 1
|
||||
|
||||
exit_code=$(cat "${_procmon_output}/${pid}.pid")
|
||||
if [ -z "$exit_code" ]; then
|
||||
echoerr "[procmon] ${pid} died with unknown exit code. Aborting."
|
||||
_clh_halt "halted_no_return"
|
||||
fi
|
||||
|
||||
if [ "$exit_code" -eq 0 ]; then
|
||||
echoerr "[procmon] ${pid} died with exit code $exit_code."
|
||||
rm "${_procmon_output}/${pid}.pid"
|
||||
continue
|
||||
fi
|
||||
|
||||
echoerr "[procmon] ${pid} is dead with exit code $exit_code. Aborting."
|
||||
_clh_halt "halted_process_failure"
|
||||
done
|
||||
sleep 1
|
||||
done
|
||||
) &
|
||||
_procmon_pid=$!
|
||||
@ -65,7 +81,30 @@ clh_get_tracked_pids() {
|
||||
done
|
||||
}
|
||||
|
||||
clh_stop_process_monitor() {
|
||||
#######################################
# Reports the current state of the process monitor on stdout.
# Globals:
#   _procmon_pid    (read) PID of the monitor; empty if it never ran.
#   _procmon_output (read) folder holding the "procmon_exit_code" file.
# Outputs:
#   "halted"             - no monitor PID is recorded.
#   "running"            - the recorded monitor PID is still alive.
#   <file contents>      - the monitor is gone and "procmon_exit_code" exists;
#                          its contents (the recorded halt reason) are echoed.
#   "error_no_exit_code" - the monitor is gone and left no such file.
# Returns:
#   1 for "error_no_exit_code", 0 in every other case.
#######################################
clh_monitor_state() {
  # Keep the scratch variable out of the caller's (global) scope.
  local exit_code

  # No process ID, process monitor never ran.
  if [ -z "$_procmon_pid" ]; then
    echo "halted"
    return 0
  fi

  # Process ID is set and process is running. kill -0 only probes for
  # existence; its "No such process" diagnostic is noise for a state query,
  # so it is discarded.
  if kill -0 "$_procmon_pid" 2> /dev/null; then
    echo "running"
    return 0
  fi

  # The monitor is gone: report whatever reason was recorded on its way out.
  if [ -f "${_procmon_output}/procmon_exit_code" ]; then
    exit_code=$(cat "${_procmon_output}/procmon_exit_code")
    echo "$exit_code"
    return 0
  fi

  echo "error_no_exit_code"
  return 1
}
|
||||
|
||||
_clh_halt() {
|
||||
if [ -z "$_procmon_pid" ]; then
|
||||
echoerr "[procmon] process monitor not started"
|
||||
return 1
|
||||
@ -76,13 +115,26 @@ clh_stop_process_monitor() {
|
||||
return 1
|
||||
fi
|
||||
|
||||
if [ "$1" = "monitor_only" ]; then
|
||||
echo "$1" > "${_procmon_output}/procmon_exit_code"
|
||||
|
||||
if [ "$_procmon_stop_mode" = "kill_on_exit" ]; then
|
||||
echoerr "[procmon] stop process group. This will halt the script."
|
||||
kill -s TERM "-$_pgid"
|
||||
else
|
||||
echoerr "[procmon] stop monitor only. Children will be left behind."
|
||||
kill -s TERM "$_procmon_pid"
|
||||
await "$_procmon_pid"
|
||||
return 0
|
||||
else
|
||||
echoerr "[procmon] stop process group. This will halt the script."
|
||||
kill -s TERM "-$_pgid"
|
||||
fi
|
||||
}
|
||||
|
||||
#######################################
# Public entry point for stopping the process monitor.
# Delegates to _clh_halt with the reason "halted".
# Arguments:
#   None (any arguments given are ignored).
# Returns:
#   The exit status of _clh_halt.
#######################################
clh_stop_process_monitor() {
  _clh_halt "halted"
}
|
||||
|
||||
#######################################
# Exits the current (sub)shell, first recording its exit code in a file
# named after its PID.
# Globals:
#   _procmon_output (read) folder receiving the "<BASHPID>.pid" file.
# Arguments:
#   $1 - exit code to record and exit with.
# Outputs:
#   Logs the PID and exit code through echoerr; writes the exit code to
#   "${_procmon_output}/${BASHPID}.pid".
# Returns:
#   Does not return; exits with $1.
#######################################
clh_exit() {
  local exit_code=$1

  echoerr "[procmon] $BASHPID exit with code $exit_code"
  # Must be written before exiting: the file is all that is left behind to
  # tell how this process ended once it is gone.
  echo "$exit_code" > "${_procmon_output}/${BASHPID}.pid"
  exit "$exit_code"
}
|
||||
@ -9,9 +9,15 @@ setup() {
|
||||
}
|
||||
|
||||
@test "should not start process monitor twice" {
|
||||
assert_equal $(clh_monitor_state) "halted"
|
||||
|
||||
assert clh_start_process_monitor
|
||||
assert_equal $(clh_monitor_state) "running"
|
||||
|
||||
refute clh_start_process_monitor
|
||||
assert clh_stop_process_monitor "monitor_only"
|
||||
|
||||
assert clh_stop_process_monitor
|
||||
assert_equal $(clh_monitor_state) "halted"
|
||||
}
|
||||
|
||||
@test "should not stop the process monitor if it wasn't started" {
|
||||
@ -19,24 +25,25 @@ setup() {
|
||||
}
|
||||
|
||||
@test "should keep track of process IDs" {
|
||||
echo "hi"
|
||||
assert clh_start_process_monitor
|
||||
|
||||
clh_get_tracked_pids
|
||||
assert [ ${#result[@]} -eq 0 ]
|
||||
|
||||
(
|
||||
while true; do
|
||||
while [ ! -f "${_procmon_output}/sync" ]; do
|
||||
sleep 0.1
|
||||
done
|
||||
clh_exit 0
|
||||
) &
|
||||
clh_track_last_background_job
|
||||
p1=$!
|
||||
|
||||
(
|
||||
while true; do
|
||||
while [ ! -f "${_procmon_output}/sync" ]; do
|
||||
sleep 0.1
|
||||
done
|
||||
clh_exit 0
|
||||
) &
|
||||
clh_track_last_background_job
|
||||
p2=$!
|
||||
@ -44,22 +51,49 @@ setup() {
|
||||
clh_get_tracked_pids
|
||||
assert [ ${#result[@]} -eq 2 ]
|
||||
|
||||
kill -s TERM "$p1"
|
||||
kill -s TERM "$p2"
|
||||
touch "${_procmon_output}/sync"
|
||||
|
||||
echo "Kill issued" > killissued
|
||||
|
||||
# This will hang the bats runner for some reason.
|
||||
await "$p1"
|
||||
await "$p2"
|
||||
|
||||
# This should be more than enough for the process monitor to
|
||||
# catch the exits. The alternative would be implementing temporal
|
||||
# predicates.
|
||||
sleep 3
|
||||
sleep 1
|
||||
|
||||
clh_get_tracked_pids
|
||||
assert [ ${#result[@]} -eq 0 ]
|
||||
|
||||
clh_stop_process_monitor "monitor_only"
|
||||
clh_stop_process_monitor
|
||||
}
|
||||
|
||||
@test "should stop the monitor and all other processes if one process fails" {
|
||||
assert clh_start_process_monitor
|
||||
|
||||
(
|
||||
while [ ! -f "${_procmon_output}/sync" ]; do
|
||||
sleep 0.1
|
||||
done
|
||||
clh_exit 1
|
||||
) &
|
||||
clh_track_last_background_job
|
||||
p1=$!
|
||||
|
||||
(
|
||||
while [ ! -f "${_procmon_output}/sync" ]; do
|
||||
sleep 1
|
||||
done
|
||||
clh_exit 0
|
||||
) &
|
||||
clh_track_last_background_job
|
||||
p2=$!
|
||||
|
||||
touch "${_procmon_output}/sync"
|
||||
|
||||
await "$p1"
|
||||
await "$p2"
|
||||
|
||||
sleep 1
|
||||
|
||||
assert_equal $(clh_monitor_state) "halted_process_failure"
|
||||
}
|
||||
Loading…
x
Reference in New Issue
Block a user