Mirror of https://github.com/logos-storage/logos-storage-local-harness.git (synced 2026-01-02 13:33:11 +00:00)

feat: add timing logs to downloads

commit 7d6e1da293 (parent 9fcc15f79b)
@@ -24,12 +24,24 @@ fi

# Output folders
_cdx_output=$(clh_output_folder "codex")
# generated files
_cdx_genfiles="${_cdx_output}/genfiles"
# downloaded files, per node. File names are CIDs
_cdx_downloads="${_cdx_output}/downloads"
# SHA1 of uploaded files, per node. File names are CIDs
_cdx_uploads="${_cdx_output}/uploads"
# Codex node logs, per node
_cdx_logs="${_cdx_output}/logs"
# Codex data directories, per node
_cdx_data="${_cdx_output}/data"

# Partial timings, per operation per node
_cdx_timing_partials="${_cdx_output}/timing"
# Custom prefix for timing logs
_cdx_timing_prefix=""
# Log file where timings are aggregated
_cdx_timing_log="/dev/null"

# Base ports and timeouts
_cdx_base_api_port=8080
_cdx_base_disc_port=8190
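With these variables in place, each timed download produces one CSV row (written first to a partial file under the timing folder and later flushed into the aggregated log): the caller-supplied prefix, the operation, the node index, the CID, and the elapsed/user/system times reported by bash's time keyword. A hypothetical aggregated row, assuming the prefix "experiment-0,100MB," from the test below and with the CID and numbers made up, would look like:

experiment-0,100MB,download,1,some-cid,1.302,0.010,0.004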
@@ -104,7 +116,8 @@ cdx_get_spr() {
cdx_launch_node() {
  local node_index="$1"

  _cdx_ensure_outputs "${node_index}" || return 1
  _cdx_init_global_outputs || return 1
  _cdx_init_node_outputs "${node_index}" || return 1

  local codex_cmd
  codex_cmd=$(cdx_cmdline "$@") || return 1
@@ -172,15 +185,22 @@ cdx_ensure_ready() {
  done
}

_cdx_ensure_outputs() {
_cdx_init_node_outputs() {
  local node_index="$1"
  mkdir -p "${_cdx_logs}" || return 1
  mkdir -p "${_cdx_data}/codex-${node_index}" || return 1
  mkdir -p "${_cdx_genfiles}" || return 1
  mkdir -p "${_cdx_downloads}/codex-${node_index}" || return 1
  mkdir -p "${_cdx_uploads}/codex-${node_index}" || return 1
}

# XXX: output initialization is a bit of a pain. Right now it's
# being piggybacked on cdx_launch_node and cdx_log_timings_start
# so we don't have to add extra initialization calls.
_cdx_init_global_outputs() {
  mkdir -p "${_cdx_logs}" || return 1
  mkdir -p "${_cdx_genfiles}" || return 1
  mkdir -p "${_cdx_timing_partials}" || return 1
}

cdx_generate_file() {
  local size_mb="${1}" filename
  filename="${_cdx_genfiles}/file-$(date +%s).bin"
@@ -208,19 +228,21 @@ cdx_upload_file() {
}

cdx_download_file() {
  local node_index="$1" cid="$2"
  curl --silent --fail\
  local node_index="$1" cid="$2" timestamp
  timestamp="$(date +%s)" || return 1

  TIMEFORMAT="${_cdx_timing_prefix}download,${node_index},${cid},%E,%U,%S"
  # Note that timing partial filenames are constructed so that lexicographic sorting
  # puts the most recent entries first, while at the same time breaking ties arbitrarily
  # for entries that happen within the same second.
  { time curl --silent --fail\
    -XGET "http://localhost:$(_cdx_api_port "$node_index")/api/codex/v1/data/$cid/network/stream"\
    -o "${_cdx_downloads}/codex-${node_index}/$cid" || return 1
    -o "${_cdx_downloads}/codex-${node_index}/$cid" ; } 2> \
    "${_cdx_timing_partials}/codex-${node_index}-${timestamp}-${RANDOM}.csv"
}
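As an aside, the construct above relies on bash's time keyword honoring TIMEFORMAT and writing its report to the stderr of the enclosing group. A minimal standalone sketch of the same pattern, using bash's documented %R/%U/%S specifiers and a throwaway command rather than the harness code, would be:

TIMEFORMAT="download,0,fake-cid,%R,%U,%S"
{ time sleep 0.2 ; } 2> /tmp/partial.csv
cat /tmp/partial.csv    # prints something like: download,0,fake-cid,0.203,0.001,0.000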

cdx_download_file_async() {
  (
    cdx_download_file "$@"
    pm_job_exit $?
  ) &
  pm_track_last_job
  echo $!
  pm_async cdx_download_file "$@"
}

cdx_upload_sha1() {
@@ -250,4 +272,32 @@ cdx_check_download() {
    return 1
  fi
  return 0
}

cdx_log_timings_start() {
  _cdx_init_global_outputs || return 1

  local log_file="$1" prefix="$2"

  touch "$log_file" || return 1

  _cdx_timing_log="$log_file"
if [[ ! "$prefix" =~ ',$' ]]; then
    prefix="$prefix,"
  fi
  _cdx_timing_prefix="$prefix"
}

cdx_flush_partial_timings() {
  for file in "${_cdx_timing_partials}"/*; do
    cat "$file" >> "${_cdx_timing_log}" || return 1
    rm "$file"
  done
}

cdx_log_timings_end() {
  cdx_flush_partial_timings

  _cdx_timing_log="/dev/null"
  _cdx_timing_prefix=""
}
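Putting the pieces together, a hedged usage sketch of the timing API (mirroring the new bats test further down; the cid variable is assumed to hold a previously uploaded CID) looks like:

cdx_log_timings_start "${_cdx_output}/experiment-0.csv" "experiment-0,100MB"
cdx_download_file 1 "$cid"   # each download drops a partial CSV under ${_cdx_timing_partials}
cdx_log_timings_end          # flushes all partials into experiment-0.csv and resets the log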
@@ -1,4 +1,10 @@
#!/usr/bin/env bash
#
# procmon is a process monitor that tracks a set (group) of processes
# and kills the entire process group and all of its descendants if one
# of them fails or gets killed. It is used to ensure that no processes
# from failed experiments are left behind.
#
set -o pipefail

LIB_SRC=${LIB_SRC:-$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)}
@@ -32,14 +38,14 @@ pm_start() {
  while true; do
    pm_known_pids
    for pid in "${result[@]}"; do
      if kill -0 "${pid}"; then
      if kill -0 "${pid}" 2> /dev/null; then
        continue
      fi

      exit_code=$(cat "${_pm_output}/${pid}.pid")
      # If the cat fails, this means the file was deleted. This will typically
      # only happen this way if the process was removed from tracking and then
      # killed right after.
      # If the cat fails, this means the file was deleted, which means
      # the process is no longer being tracked but the call to pm_stop_tracking
      # happened after we called pm_known_pids last. Simply ignore the process.
      #
      # shellcheck disable=SC2181
      if [ $? -ne 0 ]; then
@@ -47,17 +53,22 @@ pm_start() {
        continue
      fi

      # Parent process crashed or got killed. We won't get a return code in
      # these cases.
      if [ -z "$exit_code" ]; then
        echoerr "[procmon] ${pid} died with unknown exit code. Aborting."
        _pm_halt "halted_no_return"
      fi

      # Parent process exited successfully, all good.
      if [ "$exit_code" -eq 0 ]; then
        echoerr "[procmon] ${pid} died with exit code $exit_code."
        rm "${_pm_output}/${pid}.pid"
        continue
      fi

      # If we got thus far, the parent process died with a non-zero exit code,
      # so we kill the whole process group.
      echoerr "[procmon] ${pid} is dead with exit code $exit_code. Aborting."
      _pm_halt "halted_process_failure"
    done
@@ -89,17 +100,13 @@ pm_track_last_job() {
  fi
}

# Stops tracking a given PID.
# Stops tracking a given PID. This means that the process dying or exiting
# with an error code will no longer stop the whole process group.
# Arguments:
#   $1: PID to stop tracking
# Returns:
#   1 if the process monitor is not running
#   0 otherwise
# Note:
#   This function is flaky. The process monitor
#   might still see the PID as tracked after this
#   function returns for a short period of time,
#   so do not rely too much on it.
pm_stop_tracking() {
  _pm_assert_state "running" || return 1

@@ -172,10 +179,18 @@ pm_stop() {
  _pm_halt "halted"
}

# Waits for the process monitor to exit. Returns immediately if
# the process monitor is not running.
# Arguments:
#   $1: timeout in seconds
pm_join() {
  await "$_pm_pid" "$1"
}

# This function is called by the shell running a background job before
# it exits to communicate the exit code to the process monitor.
# Arguments:
#   $1: exit code
pm_job_exit() {
  local pid_file="${_pm_output}/${BASHPID}.pid" exit_code=$1
  # If the process is not tracked, don't write down an exit code.
@@ -187,6 +202,10 @@ pm_job_exit() {
  exit "$exit_code"
}

# Kills a process and all of its descendants. This is full of caveats
# so make sure you see `test_procmon` for an example of how to use it.
# Arguments:
#   $1: process ID
pm_kill_rec() {
  local parent="$1" descendant

@@ -209,6 +228,15 @@ pm_list_descendants() {
  _pm_list_descendants "$@"
}

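# Runs a command as a background job tracked by the process monitor,
# reporting its exit code through pm_job_exit and echoing the job's PID
# so callers can await it.
# Arguments:
#   $@: command (and arguments) to run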
pm_async() {
  (
    "$@"
    pm_job_exit "$?"
  ) &
  pm_track_last_job
  echo $!
}

_pm_list_descendants() {
  local parent="$1"
  result+=("${parent}")

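As a rough usage sketch (the command name below is a placeholder, not part of this diff): a caller starts the monitor, launches work through pm_async, and then waits on the echoed PID.

pm_start
handle=$(pm_async some_long_running_command --flag)
await "$handle" 30 || echoerr "job did not finish in time"
pm_stop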
@@ -28,13 +28,14 @@ echoerr() {

await() {
  local pid=$1 timeout=${2:-30} start="${SECONDS}"
  while kill -0 "$pid"; do
  while kill -0 "$pid" 2> /dev/null; do
    if ((SECONDS - start > timeout)); then
      echoerr "Error: timeout waiting for process $pid to exit"
      return 1
    fi
    sleep 0.1
  done
  echoerr "Process $pid exited"
  return 0
}

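Note that await polls the PID with kill -0 every 0.1 seconds and gives up after the timeout (30 seconds by default), so callers can branch on its return code, for example:

await "$pid" 5 || echoerr "process $pid is still alive after 5 seconds"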
@@ -4,10 +4,12 @@ setup() {
  load test_helper/common_setup
  common_setup

  # shellcheck source=./src/codex.bash
  source "${LIB_SRC}/codex.bash"
}

@test "should generate the correct Codex command line for node 0" {
  # shellcheck disable=SC2140
  assert_equal "$(cdx_cmdline 0)" "${_cdx_binary} --nat:none"\
" --log-file=${_cdx_output}/logs/codex-0.log"\
" --data-dir=${_cdx_output}/data/codex-0"\
@@ -15,6 +17,7 @@ setup() {
}

@test "should generate the correct Codex command line for node 1" {
  # shellcheck disable=SC2140
  assert_equal "$(cdx_cmdline 1 '--bootstrap-node' 'node-spr')" "${_cdx_binary} --nat:none"\
" --bootstrap-node=node-spr --log-file=${_cdx_output}/logs/codex-1.log"\
" --data-dir=${_cdx_output}/data/codex-1"\
@@ -27,6 +30,7 @@ setup() {
}

@test "should generate metrics options when metrics enabled for node" {
  # shellcheck disable=SC2140
  assert_equal "$(cdx_cmdline 0 --metrics)" "${_cdx_binary} --nat:none"\
" --metrics --metrics-port=8290 --metrics-address=0.0.0.0"\
" --log-file=${_cdx_output}/logs/codex-0.log"\
@@ -69,7 +73,8 @@ setup() {
  refute [ -f "${_cdx_output}/logs/codex-0.log" ]
  assert [ -z "${_cdx_pids[0]}" ]

  assert $(! kill -0 "$pid")
  # Node should already be dead.
  refute kill -0 "$pid"

  pm_stop
}
@@ -81,7 +86,7 @@ setup() {

  filename=$(cdx_generate_file 10)

  echo "$(sha1 "$filename")" > "${_cdx_uploads}/codex-0/fakecid.sha1"
  sha1 "$filename" > "${_cdx_uploads}/codex-0/fakecid.sha1"
  cp "$filename" "${_cdx_downloads}/codex-1/fakecid"

  # Checks that the file uploaded at 0 matches the file downloaded at 1.
@@ -114,7 +119,7 @@ setup() {
  cid=$(cdx_upload_file 0 "$filename")

  handle=$(cdx_download_file_async 0 "$cid")
  await $handle 3
  await "$handle" 3

  assert cdx_check_download 0 0 "$cid"

@@ -131,7 +136,7 @@ setup() {

  handles=()
  for i in {1..4}; do
    handles+=($(cdx_download_file_async "$i" "$cid"))
    handles+=("$(cdx_download_file_async "$i" "$cid")")
  done

  assert await_all "${handles[@]}"
@@ -143,6 +148,48 @@ setup() {
  pm_stop
}

@test "should log download timing information when requested" {
|
||||
pm_start
|
||||
|
||||
cdx_log_timings_start "${_cdx_output}/experiment-0.csv" "experiment-0,100MB"
|
||||
|
||||
assert cdx_launch_network 5
|
||||
|
||||
filename=$(cdx_generate_file 10)
|
||||
cid=$(cdx_upload_file 0 "$filename")
|
||||
|
||||
handles=()
|
||||
for i in {1..4}; do
|
||||
handles+=("$(cdx_download_file_async "$i" "$cid")")
|
||||
done
|
||||
|
||||
assert await_all "${handles[@]}"
|
||||
|
||||
for i in {1..4}; do
|
||||
assert cdx_check_download 0 "$i" "$cid"
|
||||
done
|
||||
|
||||
cdx_log_timings_end
|
||||
|
||||
pm_stop
|
||||
|
||||
decimal_regex='^[0-9]+(\.[0-9]+)?$'
|
||||
|
||||
while IFS=',' read -r experiment file_size operation node_index recorded_cid wallclock user system; do
|
||||
assert [ "$experiment" = "experiment-0" ]
|
||||
assert [ "$file_size" = "100MB" ]
|
||||
assert [ "$recorded_cid" = "$cid" ]
|
||||
assert [ "$operation" = "download" ]
|
||||
|
||||
# We can't use asserts for regex matches so use "raw" bats
|
||||
# assertions.
|
||||
[[ "$node_index" =~ [1-4] ]]
|
||||
[[ "$wallclock" =~ $decimal_regex ]]
|
||||
[[ "$user" =~ $decimal_regex ]]
|
||||
[[ "$system" =~ $decimal_regex ]]
|
||||
done < "${_cdx_output}/experiment-0.csv"
|
||||
}
|
||||
|
||||
teardown() {
|
||||
clh_clear_outputs
|
||||
}
|
||||
@@ -2,6 +2,7 @@ setup() {
  load test_helper/common_setup
  common_setup

  # shellcheck source=./src/procmon.bash
  source "${LIB_SRC}/procmon.bash"
}

@@ -46,15 +47,15 @@ setup() {
}

@test "should not start process monitor twice" {
  assert_equal $(pm_state) "halted"
  assert_equal "$(pm_state)" "halted"

  assert pm_start
  assert_equal $(pm_state) "running"
  assert_equal "$(pm_state)" "running"

  refute pm_start

  assert pm_stop
  assert_equal $(pm_state) "halted"
  assert_equal "$(pm_state)" "halted"
}

@test "should not stop the process monitor if it wasn't started" {
@@ -132,7 +133,7 @@ setup() {

  pm_join 3

  assert_equal $(pm_state) "halted_process_failure"
  assert_equal "$(pm_state)" "halted_process_failure"
}

@test "should no longer track a process if requested" {
@@ -158,5 +159,6 @@ setup() {
  pm_stop
  pm_join 3

  assert_equal $(pm_state) "halted"
}
  assert_equal "$(pm_state)" "halted"
}