mirror of
https://github.com/logos-storage/logos-storage-local-harness.git
synced 2026-01-07 16:03:08 +00:00
feat: add recursive kill; improve procmon state and process handling
This commit is contained in:
parent
2d2c2dda6f
commit
cfdea04de5
@ -55,6 +55,7 @@ cdx_cmdline() {
|
|||||||
return 1
|
return 1
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
# shellcheck disable=SC2140
|
||||||
echo "${cdx_cmd}"\
|
echo "${cdx_cmd}"\
|
||||||
" --log-file=${_cdx_logs}/codex-${node_index}.log --data-dir=${_cdx_data}/codex-${node_index}"\
|
" --log-file=${_cdx_logs}/codex-${node_index}.log --data-dir=${_cdx_data}/codex-${node_index}"\
|
||||||
" --api-port=${api_port} --disc-port=${disc_port} --loglevel=INFO"
|
" --api-port=${api_port} --disc-port=${disc_port} --loglevel=INFO"
|
||||||
@ -89,16 +90,14 @@ cdx_launch_node() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
cdx_ensure_ready() {
|
cdx_ensure_ready() {
|
||||||
local node_index="$1" timeout=${2:-$_cdx_node_start_timeout} start now
|
local node_index="$1" timeout=${2:-$_cdx_node_start_timeout} start="${SECONDS}"
|
||||||
start=$(date +%s)
|
|
||||||
while true; do
|
while true; do
|
||||||
if cdx_get_spr "$node_index"; then
|
if cdx_get_spr "$node_index"; then
|
||||||
echoerr "Codex node $node_index is ready."
|
echoerr "Codex node $node_index is ready."
|
||||||
return 0
|
return 0
|
||||||
fi
|
fi
|
||||||
|
|
||||||
now=$(date +%s)
|
if (( SECONDS - start > timeout )); then
|
||||||
if (( now - start > timeout )); then
|
|
||||||
echoerr "Codex node $node_index did not start within ${timeout} seconds."
|
echoerr "Codex node $node_index did not start within ${timeout} seconds."
|
||||||
return 1
|
return 1
|
||||||
fi
|
fi
|
||||||
|
|||||||
@ -1,5 +1,4 @@
|
|||||||
#!/usr/bin/env bash
|
#!/usr/bin/env bash
|
||||||
|
|
||||||
LIB_SRC=${LIB_SRC:-$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)}
|
LIB_SRC=${LIB_SRC:-$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)}
|
||||||
|
|
||||||
# shellcheck source=./src/config.bash
|
# shellcheck source=./src/config.bash
|
||||||
@ -9,7 +8,6 @@ source "${LIB_SRC}/utils.bash"
|
|||||||
|
|
||||||
_pm_output=$(clh_output_folder "pm")
|
_pm_output=$(clh_output_folder "pm")
|
||||||
_pm_pid=""
|
_pm_pid=""
|
||||||
_pm_stop_mode=""
|
|
||||||
|
|
||||||
_pm_init_output() {
|
_pm_init_output() {
|
||||||
rm -rf "${_pm_output}" || true
|
rm -rf "${_pm_output}" || true
|
||||||
@ -17,21 +15,12 @@ _pm_init_output() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
pm_start() {
|
pm_start() {
|
||||||
if [ -n "$_pm_pid" ]; then
|
_pm_assert_state_not "running" || return 1
|
||||||
echoerr "[procmon] process monitor already started"
|
|
||||||
return 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
_pm_init_output
|
_pm_init_output
|
||||||
_pm_stop_mode="$1"
|
|
||||||
|
|
||||||
local pid=$$
|
|
||||||
_pm_pgid=$(ps -o pgid= -p ${pid} | sed 's/ //g')
|
|
||||||
export _pm_pgid
|
|
||||||
export _pm_output
|
|
||||||
|
|
||||||
echoerr "[procmon] starting process monitor"
|
echoerr "[procmon] starting process monitor"
|
||||||
|
|
||||||
|
export _pm_output
|
||||||
(
|
(
|
||||||
_pm_pid=${BASHPID}
|
_pm_pid=${BASHPID}
|
||||||
while true; do
|
while true; do
|
||||||
@ -65,6 +54,8 @@ pm_start() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
pm_track_last_job() {
|
pm_track_last_job() {
|
||||||
|
_pm_assert_state "running" || return 1
|
||||||
|
|
||||||
local pid=$!
|
local pid=$!
|
||||||
if [ ! -f "${_pm_output}/${pid}.pid" ]; then
|
if [ ! -f "${_pm_output}/${pid}.pid" ]; then
|
||||||
touch "${_pm_output}/${pid}.pid"
|
touch "${_pm_output}/${pid}.pid"
|
||||||
@ -72,6 +63,8 @@ pm_track_last_job() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
pm_known_pids() {
|
pm_known_pids() {
|
||||||
|
_pm_assert_state "running" || return 1
|
||||||
|
|
||||||
local base_name pid
|
local base_name pid
|
||||||
result=()
|
result=()
|
||||||
for pid_file in "${_pm_output}"/*.pid; do
|
for pid_file in "${_pm_output}"/*.pid; do
|
||||||
@ -106,27 +99,19 @@ pm_state() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
_pm_halt() {
|
_pm_halt() {
|
||||||
if [ -z "$_pm_pid" ]; then
|
_pm_assert_state "running" || return 1
|
||||||
echoerr "[procmon] process monitor not started"
|
|
||||||
return 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
if ! kill -0 "$_pm_pid"; then
|
pm_known_pids
|
||||||
echoerr "[procmon] process monitor not running"
|
pids=("${result[@]}")
|
||||||
return 1
|
|
||||||
fi
|
for pid in "${pids[@]}"; do
|
||||||
|
pm_kill_rec "${pid}"
|
||||||
|
done
|
||||||
|
|
||||||
echo "$1" > "${_pm_output}/pm_exit_code"
|
echo "$1" > "${_pm_output}/pm_exit_code"
|
||||||
|
|
||||||
if [ "$_pm_stop_mode" = "kill_on_exit" ]; then
|
# last but not least, harakiri
|
||||||
echoerr "[procmon] stop process group. This will halt the script."
|
pm_kill_rec "$_pm_pid"
|
||||||
kill -s TERM "-$_pm_pgid"
|
|
||||||
else
|
|
||||||
echoerr "[procmon] stop monitor only. Children will be left behind."
|
|
||||||
kill -s TERM "$_pm_pid"
|
|
||||||
await "$_pm_pid"
|
|
||||||
return 0
|
|
||||||
fi
|
|
||||||
}
|
}
|
||||||
|
|
||||||
pm_stop() {
|
pm_stop() {
|
||||||
@ -135,7 +120,52 @@ pm_stop() {
|
|||||||
|
|
||||||
pm_job_exit() {
|
pm_job_exit() {
|
||||||
exit_code=$1
|
exit_code=$1
|
||||||
echoerr "[procmon] $BASHPID exit with code $exit_code"
|
|
||||||
echo "$exit_code" > "${_pm_output}/${BASHPID}.pid"
|
echo "$exit_code" > "${_pm_output}/${BASHPID}.pid"
|
||||||
exit "$exit_code"
|
exit "$exit_code"
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pm_kill_rec() {
|
||||||
|
local parent="$1" descendant
|
||||||
|
|
||||||
|
pm_list_descendants "$parent"
|
||||||
|
for descendant in "${result[@]}"; do
|
||||||
|
echo "[procmon] killing process $descendant"
|
||||||
|
kill -s TERM "$descendant" 2> /dev/null || true
|
||||||
|
done
|
||||||
|
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
|
||||||
|
pm_list_descendants() {
|
||||||
|
result=()
|
||||||
|
_pm_list_descendants "$@"
|
||||||
|
}
|
||||||
|
|
||||||
|
_pm_list_descendants() {
|
||||||
|
local parent="$1"
|
||||||
|
result+=("${parent}")
|
||||||
|
|
||||||
|
for pid in $(ps -o pid --ppid "$parent" | tail -n +2 | tr -d ' '); do
|
||||||
|
_pm_list_descendants "$pid"
|
||||||
|
done
|
||||||
|
}
|
||||||
|
|
||||||
|
_pm_assert_state_not() {
|
||||||
|
local state="$1" current_state
|
||||||
|
current_state=$(pm_state)
|
||||||
|
|
||||||
|
if [ "$current_state" = "$state" ]; then
|
||||||
|
echoerr "[procmon] illegal state: $current_state"
|
||||||
|
return 1
|
||||||
|
fi
|
||||||
|
}
|
||||||
|
|
||||||
|
_pm_assert_state() {
|
||||||
|
local state="$1" current_state
|
||||||
|
current_state=$(pm_state)
|
||||||
|
|
||||||
|
if [ "$current_state" != "$state" ]; then
|
||||||
|
echoerr "[procmon] illegal state: $current_state"
|
||||||
|
return 1
|
||||||
|
fi
|
||||||
|
}
|
||||||
|
|||||||
@ -14,8 +14,13 @@ echoerr() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
await() {
|
await() {
|
||||||
local pid=$1
|
local pid=$1 timeout=${2:-30} start="${SECONDS}"
|
||||||
while kill -0 "$pid"; do
|
while kill -0 "$pid"; do
|
||||||
|
if ((SECONDS - start > timeout)); then
|
||||||
|
echoerr "Error: timeout waiting for process $pid to exit"
|
||||||
|
return 1
|
||||||
|
fi
|
||||||
sleep 0.1
|
sleep 0.1
|
||||||
done
|
done
|
||||||
|
return 0
|
||||||
}
|
}
|
||||||
@ -50,14 +50,15 @@ setup() {
|
|||||||
rm -rf "$data_dir"
|
rm -rf "$data_dir"
|
||||||
}
|
}
|
||||||
|
|
||||||
#@test "should launch a Codex node" {
|
# @test "should launch a Codex node" {
|
||||||
# export CLH_CODEX_BINARY="${BATS_TEST_DIRNAME}/codex/build/codex"
|
# export CLH_CODEX_BINARY="${BATS_TEST_DIRNAME}/codex/build/codex"
|
||||||
#
|
|
||||||
# assert cdx_launch_node 0
|
# assert cdx_launch_node 0
|
||||||
#
|
# assert cdx_ensure_ready 0 1
|
||||||
# # Node must be running and ready.
|
|
||||||
# assert [ ! -z "$(cdx_get_spr 0)" ]
|
# # We should see a log file and a data directory.
|
||||||
# # We should see a log file and a data directory.
|
# assert [ -f "${_cdx_output}/logs/codex-0.log" ]
|
||||||
# assert [ -f "${_cdx_output}/logs/codex-0.log" ]
|
# assert [ -d "${_cdx_output}/data/codex-0" ]
|
||||||
# assert [ -d "${_cdx_output}/data/codex-0" ]
|
|
||||||
#}
|
# cdx_destroy_node 0
|
||||||
|
# }
|
||||||
@ -5,6 +5,46 @@ setup() {
|
|||||||
source "${LIB_SRC}/procmon.bash"
|
source "${LIB_SRC}/procmon.bash"
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@test "should kill processes recursively" {
|
||||||
|
# Note that this is fragile. We need to structure
|
||||||
|
# the process tree such that the parent does not exit
|
||||||
|
# before the child, or the child will be reparented to
|
||||||
|
# init and won't be killed. The defensive thing to do
|
||||||
|
# here is to wait on any child you'd like cleaned before
|
||||||
|
# exiting its parent subshell.
|
||||||
|
(
|
||||||
|
# each backgrounded process generates two processes:
|
||||||
|
# the process itself, and its subshell.
|
||||||
|
sleep 500 &
|
||||||
|
sl1=$!
|
||||||
|
(
|
||||||
|
sleep 500 &
|
||||||
|
await $!
|
||||||
|
) &
|
||||||
|
sh1=$!
|
||||||
|
(
|
||||||
|
sleep 500 &
|
||||||
|
await $!
|
||||||
|
) &
|
||||||
|
sh2=$!
|
||||||
|
await $sl1
|
||||||
|
await $sh1
|
||||||
|
await $sh2
|
||||||
|
) &
|
||||||
|
parent=$!
|
||||||
|
|
||||||
|
pm_list_descendants "$parent"
|
||||||
|
assert_equal "${#result[@]}" 9
|
||||||
|
|
||||||
|
pm_kill_rec "$parent"
|
||||||
|
await "$parent" 5
|
||||||
|
|
||||||
|
pm_list_descendants "$parent"
|
||||||
|
# the parent will still show amongst its descendants,
|
||||||
|
# even though it is dead.
|
||||||
|
assert_equal "${#result[@]}" 1
|
||||||
|
}
|
||||||
|
|
||||||
@test "should not start process monitor twice" {
|
@test "should not start process monitor twice" {
|
||||||
assert_equal $(pm_state) "halted"
|
assert_equal $(pm_state) "halted"
|
||||||
|
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user