From 7d6e1da293417ad8cb3190a832cbf6c8e491e357 Mon Sep 17 00:00:00 2001 From: gmega Date: Fri, 20 Jun 2025 14:35:51 -0300 Subject: [PATCH] feat: add timing logs to downloads --- src/codex.bash | 76 ++++++++++++++++++++++++++++++++++-------- src/procmon.bash | 48 ++++++++++++++++++++------ src/utils.bash | 3 +- test/test_codex.bats | 57 ++++++++++++++++++++++++++++--- test/test_procmon.bats | 14 ++++---- 5 files changed, 163 insertions(+), 35 deletions(-) diff --git a/src/codex.bash b/src/codex.bash index 06d6bf3..b49fed3 100644 --- a/src/codex.bash +++ b/src/codex.bash @@ -24,12 +24,24 @@ fi # Output folders _cdx_output=$(clh_output_folder "codex") +# generated files _cdx_genfiles="${_cdx_output}/genfiles" +# downloaded files, per node. File names are CIDs _cdx_downloads="${_cdx_output}/downloads" +# SHA1 of uploaded files, per node. File names are CIDs _cdx_uploads="${_cdx_output}/uploads" +# Codex node logs, per node _cdx_logs="${_cdx_output}/logs" +# Codex data directories, per node _cdx_data="${_cdx_output}/data" +# Partial timings, per operation per node +_cdx_timing_partials="${_cdx_output}/timing" +# Custom prefix for timing logs +_cdx_timing_prefix="" +# Log file where timings are aggregated +_cdx_timing_log="/dev/null" + # Base ports and timeouts _cdx_base_api_port=8080 _cdx_base_disc_port=8190 @@ -104,7 +116,8 @@ cdx_get_spr() { cdx_launch_node() { local node_index="$1" - _cdx_ensure_outputs "${node_index}" || return 1 + _cdx_init_global_outputs || return 1 + _cdx_init_node_outputs "${node_index}" || return 1 local codex_cmd codex_cmd=$(cdx_cmdline "$@") || return 1 @@ -172,15 +185,22 @@ cdx_ensure_ready() { done } -_cdx_ensure_outputs() { +_cdx_init_node_outputs() { local node_index="$1" - mkdir -p "${_cdx_logs}" || return 1 mkdir -p "${_cdx_data}/codex-${node_index}" || return 1 - mkdir -p "${_cdx_genfiles}" || return 1 mkdir -p "${_cdx_downloads}/codex-${node_index}" || return 1 mkdir -p "${_cdx_uploads}/codex-${node_index}" || return 1 } +# XXX: output initialization is a bit of a pain. Right now it's +# being piggybacked on cdx_launch_node and cdx_log_timings_start +# so we don't have to add extra initialization calls. +_cdx_init_global_outputs() { + mkdir -p "${_cdx_logs}" || return 1 + mkdir -p "${_cdx_genfiles}" || return 1 + mkdir -p "${_cdx_timing_partials}" || return 1 +} + cdx_generate_file() { local size_mb="${1}" filename filename="${_cdx_genfiles}/file-$(date +%s).bin" @@ -208,19 +228,21 @@ cdx_upload_file() { } cdx_download_file() { - local node_index="$1" cid="$2" - curl --silent --fail\ + local node_index="$1" cid="$2" timestamp + timestamp="$(date +%s)" || return 1 + + TIMEFORMAT="${_cdx_timing_prefix}download,${node_index},${cid},%E,%U,%S" + # Note that timing partial filenames are constructed so that lexicographic sorting + # puts the most recent entries first, while at the same time breaking ties arbitrarily + # for entries that happen within the same second. + { time curl --silent --fail\ -XGET "http://localhost:$(_cdx_api_port "$node_index")/api/codex/v1/data/$cid/network/stream"\ - -o "${_cdx_downloads}/codex-${node_index}/$cid" || return 1 + -o "${_cdx_downloads}/codex-${node_index}/$cid" ; } 2> \ + "${_cdx_timing_partials}/codex-${node_index}-${timestamp}-${RANDOM}.csv" } cdx_download_file_async() { - ( - cdx_download_file "$@" - pm_job_exit $? - ) & - pm_track_last_job - echo $! + pm_async cdx_download_file "$@" } cdx_upload_sha1() { @@ -250,4 +272,32 @@ cdx_check_download() { return 1 fi return 0 +} + +cdx_log_timings_start() { + _cdx_init_global_outputs || return 1 + + local log_file="$1" prefix="$2" + + touch "$log_file" || return 1 + + _cdx_timing_log="$log_file" + if [[ ! "$prefix" =~ ',$' ]]; then + prefix="$prefix," + fi + _cdx_timing_prefix="$prefix" +} + +cdx_flush_partial_timings() { + for file in "${_cdx_timing_partials}"/*; do + cat "$file" >> "${_cdx_timing_log}" || return 1 + rm "$file" + done +} + +cdx_log_timings_end() { + cdx_flush_partial_timings + + _cdx_timing_log="/dev/null" + _cdx_timing_prefix="" } \ No newline at end of file diff --git a/src/procmon.bash b/src/procmon.bash index 4a033a6..e4b0a2f 100644 --- a/src/procmon.bash +++ b/src/procmon.bash @@ -1,4 +1,10 @@ #!/usr/bin/env bash +# +# procmon is a process monitor that tracks a set (group) of processes +# and kills the entire process group and all of its descendants if one +# of them fails or gets killed. It is used to ensure that no processes +# from failed experiments are left behind. +# set -o pipefail LIB_SRC=${LIB_SRC:-$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)} @@ -32,14 +38,14 @@ pm_start() { while true; do pm_known_pids for pid in "${result[@]}"; do - if kill -0 "${pid}"; then + if kill -0 "${pid}" 2> /dev/null; then continue fi exit_code=$(cat "${_pm_output}/${pid}.pid") - # If the cat fails, this means the file was deleted. This will typically - # only happen this way if the process was removed from tracking and then - # killed right after. + # If the cat fails, this means the file was deleted, which means + # the process is no longer being tracked but the call to pm_stop_tracking + # happened after we called pm_known_pids last. Simply ignore the process. # # shellcheck disable=SC2181 if [ $? -ne 0 ]; then @@ -47,17 +53,22 @@ pm_start() { continue fi + # Parent process crashed or got killed. We won't get a return code in + # these cases. if [ -z "$exit_code" ]; then echoerr "[procmon] ${pid} died with unknown exit code. Aborting." _pm_halt "halted_no_return" fi + # Parent process exited successfully, all good. if [ "$exit_code" -eq 0 ]; then echoerr "[procmon] ${pid} died with exit code $exit_code." rm "${_pm_output}/${pid}.pid" continue fi + # If we got thus far, the parent process died with a non-zero exit code, + # so we kill the whole process group. echoerr "[procmon] ${pid} is dead with exit code $exit_code. Aborting." _pm_halt "halted_process_failure" done @@ -89,17 +100,13 @@ pm_track_last_job() { fi } -# Stops tracking a given PID. +# Stops tracking a given PID. This means that the process dying or exiting +# with an error code will no longer stop the whole process group. # Arguments: # $1: PID to stop tracking # Returns: # 1 if the process monitor is not running # 0 otherwise -# Note: -# This function is flaky. The process monitor -# might still see the PID as tracked after this -# function returns for a short period of time, -# so do not rely too much on it. pm_stop_tracking() { _pm_assert_state "running" || return 1 @@ -172,10 +179,18 @@ pm_stop() { _pm_halt "halted" } +# Waits for the process monitor to exit. Returns immediately if +# the process monitor is not running. +# Arguments: +# $1: timeout in seconds pm_join() { await "$_pm_pid" "$1" } +# This function is called by the shell running a background job before +# it exits to communicate the exit code to the process monitor. +# Arguments: +# $1: exit code pm_job_exit() { local pid_file="${_pm_output}/${BASHPID}.pid" exit_code=$1 # If the process is not tracked, don't write down an exit code. @@ -187,6 +202,10 @@ pm_job_exit() { exit "$exit_code" } +# Kills a process and all of its descendants. This is full of caveats +# so make sure you see `test_procmon` for an example of how to use it. +# Arguments: +# $1: process ID pm_kill_rec() { local parent="$1" descendant @@ -209,6 +228,15 @@ pm_list_descendants() { _pm_list_descendants "$@" } +pm_async() { + ( + "$@" + pm_job_exit "$?" + ) & + pm_track_last_job + echo $! +} + _pm_list_descendants() { local parent="$1" result+=("${parent}") diff --git a/src/utils.bash b/src/utils.bash index fa8e3bd..49779ba 100644 --- a/src/utils.bash +++ b/src/utils.bash @@ -28,13 +28,14 @@ echoerr() { await() { local pid=$1 timeout=${2:-30} start="${SECONDS}" - while kill -0 "$pid"; do + while kill -0 "$pid" 2> /dev/null; do if ((SECONDS - start > timeout)); then echoerr "Error: timeout waiting for process $pid to exit" return 1 fi sleep 0.1 done + echoerr "Process $pid exited" return 0 } diff --git a/test/test_codex.bats b/test/test_codex.bats index 27999a6..05161ca 100755 --- a/test/test_codex.bats +++ b/test/test_codex.bats @@ -4,10 +4,12 @@ setup() { load test_helper/common_setup common_setup + # shellcheck source=./src/codex.bash source "${LIB_SRC}/codex.bash" } @test "should generate the correct Codex command line for node 0" { + # shellcheck disable=SC2140 assert_equal "$(cdx_cmdline 0)" "${_cdx_binary} --nat:none"\ " --log-file=${_cdx_output}/logs/codex-0.log"\ " --data-dir=${_cdx_output}/data/codex-0"\ @@ -15,6 +17,7 @@ setup() { } @test "should generate the correct Codex command line for node 1" { + # shellcheck disable=SC2140 assert_equal "$(cdx_cmdline 1 '--bootstrap-node' 'node-spr')" "${_cdx_binary} --nat:none"\ " --bootstrap-node=node-spr --log-file=${_cdx_output}/logs/codex-1.log"\ " --data-dir=${_cdx_output}/data/codex-1"\ @@ -27,6 +30,7 @@ setup() { } @test "should generate metrics options when metrics enabled for node" { + # shellcheck disable=SC2140 assert_equal "$(cdx_cmdline 0 --metrics)" "${_cdx_binary} --nat:none"\ " --metrics --metrics-port=8290 --metrics-address=0.0.0.0"\ " --log-file=${_cdx_output}/logs/codex-0.log"\ @@ -69,7 +73,8 @@ setup() { refute [ -f "${_cdx_output}/logs/codex-0.log" ] assert [ -z "${_cdx_pids[0]}" ] - assert $(! kill -0 "$pid") + # Node should already be dead. + refute kill -0 "$pid" pm_stop } @@ -81,7 +86,7 @@ setup() { filename=$(cdx_generate_file 10) - echo "$(sha1 "$filename")" > "${_cdx_uploads}/codex-0/fakecid.sha1" + sha1 "$filename" > "${_cdx_uploads}/codex-0/fakecid.sha1" cp "$filename" "${_cdx_downloads}/codex-1/fakecid" # Checks that the file uploaded at 0 matches the file downloaded at 1. @@ -114,7 +119,7 @@ setup() { cid=$(cdx_upload_file 0 "$filename") handle=$(cdx_download_file_async 0 "$cid") - await $handle 3 + await "$handle" 3 assert cdx_check_download 0 0 "$cid" @@ -131,7 +136,7 @@ setup() { handles=() for i in {1..4}; do - handles+=($(cdx_download_file_async "$i" "$cid")) + handles+=("$(cdx_download_file_async "$i" "$cid")") done assert await_all "${handles[@]}" @@ -143,6 +148,48 @@ setup() { pm_stop } +@test "should log download timing information when requested" { + pm_start + + cdx_log_timings_start "${_cdx_output}/experiment-0.csv" "experiment-0,100MB" + + assert cdx_launch_network 5 + + filename=$(cdx_generate_file 10) + cid=$(cdx_upload_file 0 "$filename") + + handles=() + for i in {1..4}; do + handles+=("$(cdx_download_file_async "$i" "$cid")") + done + + assert await_all "${handles[@]}" + + for i in {1..4}; do + assert cdx_check_download 0 "$i" "$cid" + done + + cdx_log_timings_end + + pm_stop + + decimal_regex='^[0-9]+(\.[0-9]+)?$' + + while IFS=',' read -r experiment file_size operation node_index recorded_cid wallclock user system; do + assert [ "$experiment" = "experiment-0" ] + assert [ "$file_size" = "100MB" ] + assert [ "$recorded_cid" = "$cid" ] + assert [ "$operation" = "download" ] + + # We can't use asserts for regex matches so use "raw" bats + # assertions. + [[ "$node_index" =~ [1-4] ]] + [[ "$wallclock" =~ $decimal_regex ]] + [[ "$user" =~ $decimal_regex ]] + [[ "$system" =~ $decimal_regex ]] + done < "${_cdx_output}/experiment-0.csv" +} + teardown() { clh_clear_outputs -} \ No newline at end of file +} diff --git a/test/test_procmon.bats b/test/test_procmon.bats index eae54b8..9a7efcf 100644 --- a/test/test_procmon.bats +++ b/test/test_procmon.bats @@ -2,6 +2,7 @@ setup() { load test_helper/common_setup common_setup + # shellcheck source=./src/procmon.bash source "${LIB_SRC}/procmon.bash" } @@ -46,15 +47,15 @@ setup() { } @test "should not start process monitor twice" { - assert_equal $(pm_state) "halted" + assert_equal "$(pm_state)" "halted" assert pm_start - assert_equal $(pm_state) "running" + assert_equal "$(pm_state)" "running" refute pm_start assert pm_stop - assert_equal $(pm_state) "halted" + assert_equal "$(pm_state)" "halted" } @test "should not stop the process monitor if it wasn't started" { @@ -132,7 +133,7 @@ setup() { pm_join 3 - assert_equal $(pm_state) "halted_process_failure" + assert_equal "$(pm_state)" "halted_process_failure" } @test "should no longer track a process if requested" { @@ -158,5 +159,6 @@ setup() { pm_stop pm_join 3 - assert_equal $(pm_state) "halted" -} \ No newline at end of file + assert_equal "$(pm_state)" "halted" +} +