mirror of
https://github.com/logos-storage/logos-storage-local-harness.git
synced 2026-01-02 13:33:11 +00:00
feat: add recursive kill; improve procmon state and process handling
This commit is contained in:
parent
2d2c2dda6f
commit
cfdea04de5
@ -55,6 +55,7 @@ cdx_cmdline() {
|
||||
return 1
|
||||
fi
|
||||
|
||||
# shellcheck disable=SC2140
|
||||
echo "${cdx_cmd}"\
|
||||
" --log-file=${_cdx_logs}/codex-${node_index}.log --data-dir=${_cdx_data}/codex-${node_index}"\
|
||||
" --api-port=${api_port} --disc-port=${disc_port} --loglevel=INFO"
|
||||
@ -89,16 +90,14 @@ cdx_launch_node() {
|
||||
}
|
||||
|
||||
cdx_ensure_ready() {
|
||||
local node_index="$1" timeout=${2:-$_cdx_node_start_timeout} start now
|
||||
start=$(date +%s)
|
||||
local node_index="$1" timeout=${2:-$_cdx_node_start_timeout} start="${SECONDS}"
|
||||
while true; do
|
||||
if cdx_get_spr "$node_index"; then
|
||||
echoerr "Codex node $node_index is ready."
|
||||
return 0
|
||||
fi
|
||||
|
||||
now=$(date +%s)
|
||||
if (( now - start > timeout )); then
|
||||
if (( SECONDS - start > timeout )); then
|
||||
echoerr "Codex node $node_index did not start within ${timeout} seconds."
|
||||
return 1
|
||||
fi
|
||||
|
||||
@ -1,5 +1,4 @@
|
||||
#!/usr/bin/env bash
|
||||
|
||||
LIB_SRC=${LIB_SRC:-$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)}
|
||||
|
||||
# shellcheck source=./src/config.bash
|
||||
@ -9,7 +8,6 @@ source "${LIB_SRC}/utils.bash"
|
||||
|
||||
_pm_output=$(clh_output_folder "pm")
|
||||
_pm_pid=""
|
||||
_pm_stop_mode=""
|
||||
|
||||
_pm_init_output() {
|
||||
rm -rf "${_pm_output}" || true
|
||||
@ -17,21 +15,12 @@ _pm_init_output() {
|
||||
}
|
||||
|
||||
pm_start() {
|
||||
if [ -n "$_pm_pid" ]; then
|
||||
echoerr "[procmon] process monitor already started"
|
||||
return 1
|
||||
fi
|
||||
|
||||
_pm_assert_state_not "running" || return 1
|
||||
_pm_init_output
|
||||
_pm_stop_mode="$1"
|
||||
|
||||
local pid=$$
|
||||
_pm_pgid=$(ps -o pgid= -p ${pid} | sed 's/ //g')
|
||||
export _pm_pgid
|
||||
export _pm_output
|
||||
|
||||
echoerr "[procmon] starting process monitor"
|
||||
|
||||
export _pm_output
|
||||
(
|
||||
_pm_pid=${BASHPID}
|
||||
while true; do
|
||||
@ -65,6 +54,8 @@ pm_start() {
|
||||
}
|
||||
|
||||
pm_track_last_job() {
|
||||
_pm_assert_state "running" || return 1
|
||||
|
||||
local pid=$!
|
||||
if [ ! -f "${_pm_output}/${pid}.pid" ]; then
|
||||
touch "${_pm_output}/${pid}.pid"
|
||||
@ -72,6 +63,8 @@ pm_track_last_job() {
|
||||
}
|
||||
|
||||
pm_known_pids() {
|
||||
_pm_assert_state "running" || return 1
|
||||
|
||||
local base_name pid
|
||||
result=()
|
||||
for pid_file in "${_pm_output}"/*.pid; do
|
||||
@ -106,27 +99,19 @@ pm_state() {
|
||||
}
|
||||
|
||||
_pm_halt() {
|
||||
if [ -z "$_pm_pid" ]; then
|
||||
echoerr "[procmon] process monitor not started"
|
||||
return 1
|
||||
fi
|
||||
_pm_assert_state "running" || return 1
|
||||
|
||||
if ! kill -0 "$_pm_pid"; then
|
||||
echoerr "[procmon] process monitor not running"
|
||||
return 1
|
||||
fi
|
||||
pm_known_pids
|
||||
pids=("${result[@]}")
|
||||
|
||||
for pid in "${pids[@]}"; do
|
||||
pm_kill_rec "${pid}"
|
||||
done
|
||||
|
||||
echo "$1" > "${_pm_output}/pm_exit_code"
|
||||
|
||||
if [ "$_pm_stop_mode" = "kill_on_exit" ]; then
|
||||
echoerr "[procmon] stop process group. This will halt the script."
|
||||
kill -s TERM "-$_pm_pgid"
|
||||
else
|
||||
echoerr "[procmon] stop monitor only. Children will be left behind."
|
||||
kill -s TERM "$_pm_pid"
|
||||
await "$_pm_pid"
|
||||
return 0
|
||||
fi
|
||||
# last but not least, harakiri
|
||||
pm_kill_rec "$_pm_pid"
|
||||
}
|
||||
|
||||
pm_stop() {
|
||||
@ -135,7 +120,52 @@ pm_stop() {
|
||||
|
||||
pm_job_exit() {
|
||||
exit_code=$1
|
||||
echoerr "[procmon] $BASHPID exit with code $exit_code"
|
||||
echo "$exit_code" > "${_pm_output}/${BASHPID}.pid"
|
||||
exit "$exit_code"
|
||||
}
|
||||
}
|
||||
|
||||
pm_kill_rec() {
|
||||
local parent="$1" descendant
|
||||
|
||||
pm_list_descendants "$parent"
|
||||
for descendant in "${result[@]}"; do
|
||||
echo "[procmon] killing process $descendant"
|
||||
kill -s TERM "$descendant" 2> /dev/null || true
|
||||
done
|
||||
|
||||
return 0
|
||||
}
|
||||
|
||||
pm_list_descendants() {
|
||||
result=()
|
||||
_pm_list_descendants "$@"
|
||||
}
|
||||
|
||||
_pm_list_descendants() {
|
||||
local parent="$1"
|
||||
result+=("${parent}")
|
||||
|
||||
for pid in $(ps -o pid --ppid "$parent" | tail -n +2 | tr -d ' '); do
|
||||
_pm_list_descendants "$pid"
|
||||
done
|
||||
}
|
||||
|
||||
_pm_assert_state_not() {
|
||||
local state="$1" current_state
|
||||
current_state=$(pm_state)
|
||||
|
||||
if [ "$current_state" = "$state" ]; then
|
||||
echoerr "[procmon] illegal state: $current_state"
|
||||
return 1
|
||||
fi
|
||||
}
|
||||
|
||||
_pm_assert_state() {
|
||||
local state="$1" current_state
|
||||
current_state=$(pm_state)
|
||||
|
||||
if [ "$current_state" != "$state" ]; then
|
||||
echoerr "[procmon] illegal state: $current_state"
|
||||
return 1
|
||||
fi
|
||||
}
|
||||
|
||||
@ -14,8 +14,13 @@ echoerr() {
|
||||
}
|
||||
|
||||
await() {
|
||||
local pid=$1
|
||||
local pid=$1 timeout=${2:-30} start="${SECONDS}"
|
||||
while kill -0 "$pid"; do
|
||||
if ((SECONDS - start > timeout)); then
|
||||
echoerr "Error: timeout waiting for process $pid to exit"
|
||||
return 1
|
||||
fi
|
||||
sleep 0.1
|
||||
done
|
||||
return 0
|
||||
}
|
||||
@ -50,14 +50,15 @@ setup() {
|
||||
rm -rf "$data_dir"
|
||||
}
|
||||
|
||||
#@test "should launch a Codex node" {
|
||||
# export CLH_CODEX_BINARY="${BATS_TEST_DIRNAME}/codex/build/codex"
|
||||
#
|
||||
# assert cdx_launch_node 0
|
||||
#
|
||||
# # Node must be running and ready.
|
||||
# assert [ ! -z "$(cdx_get_spr 0)" ]
|
||||
# # We should see a log file and a data directory.
|
||||
# assert [ -f "${_cdx_output}/logs/codex-0.log" ]
|
||||
# assert [ -d "${_cdx_output}/data/codex-0" ]
|
||||
#}
|
||||
# @test "should launch a Codex node" {
|
||||
# export CLH_CODEX_BINARY="${BATS_TEST_DIRNAME}/codex/build/codex"
|
||||
|
||||
# assert cdx_launch_node 0
|
||||
# assert cdx_ensure_ready 0 1
|
||||
|
||||
# # We should see a log file and a data directory.
|
||||
# assert [ -f "${_cdx_output}/logs/codex-0.log" ]
|
||||
# assert [ -d "${_cdx_output}/data/codex-0" ]
|
||||
|
||||
# cdx_destroy_node 0
|
||||
# }
|
||||
@ -5,6 +5,46 @@ setup() {
|
||||
source "${LIB_SRC}/procmon.bash"
|
||||
}
|
||||
|
||||
@test "should kill processes recursively" {
|
||||
# Note that this is fragile. We need to structure
|
||||
# the process tree such that the parent does not exit
|
||||
# before the child, or the child will be reparented to
|
||||
# init and won't be killed. The defensive thing to do
|
||||
# here is to wait on any child you'd like cleaned before
|
||||
# exiting its parent subshell.
|
||||
(
|
||||
# each backgrounded process generates two processes:
|
||||
# the process itself, and its subshell.
|
||||
sleep 500 &
|
||||
sl1=$!
|
||||
(
|
||||
sleep 500 &
|
||||
await $!
|
||||
) &
|
||||
sh1=$!
|
||||
(
|
||||
sleep 500 &
|
||||
await $!
|
||||
) &
|
||||
sh2=$!
|
||||
await $sl1
|
||||
await $sh1
|
||||
await $sh2
|
||||
) &
|
||||
parent=$!
|
||||
|
||||
pm_list_descendants "$parent"
|
||||
assert_equal "${#result[@]}" 9
|
||||
|
||||
pm_kill_rec "$parent"
|
||||
await "$parent" 5
|
||||
|
||||
pm_list_descendants "$parent"
|
||||
# the parent will still show amongst its descendants,
|
||||
# even though it is dead.
|
||||
assert_equal "${#result[@]}" 1
|
||||
}
|
||||
|
||||
@test "should not start process monitor twice" {
|
||||
assert_equal $(pm_state) "halted"
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user