feat: add timing logs to downloads

gmega 2025-06-20 14:35:51 -03:00
parent 9fcc15f79b
commit 7d6e1da293
5 changed files with 163 additions and 35 deletions

View File

@ -24,12 +24,24 @@ fi
# Output folders
_cdx_output=$(clh_output_folder "codex")
# generated files
_cdx_genfiles="${_cdx_output}/genfiles"
# downloaded files, per node. File names are CIDs
_cdx_downloads="${_cdx_output}/downloads"
# SHA1 of uploaded files, per node. File names are CIDs
_cdx_uploads="${_cdx_output}/uploads"
# Codex node logs, per node
_cdx_logs="${_cdx_output}/logs"
# Codex data directories, per node
_cdx_data="${_cdx_output}/data"
# Partial timings, per operation per node
_cdx_timing_partials="${_cdx_output}/timing"
# Custom prefix for timing logs
_cdx_timing_prefix=""
# Log file where timings are aggregated
_cdx_timing_log="/dev/null"
# Base ports and timeouts
_cdx_base_api_port=8080
_cdx_base_disc_port=8190
@ -104,7 +116,8 @@ cdx_get_spr() {
cdx_launch_node() {
local node_index="$1"
_cdx_ensure_outputs "${node_index}" || return 1
_cdx_init_global_outputs || return 1
_cdx_init_node_outputs "${node_index}" || return 1
local codex_cmd
codex_cmd=$(cdx_cmdline "$@") || return 1
@ -172,15 +185,22 @@ cdx_ensure_ready() {
done
}
_cdx_ensure_outputs() {
_cdx_init_node_outputs() {
local node_index="$1"
mkdir -p "${_cdx_logs}" || return 1
mkdir -p "${_cdx_data}/codex-${node_index}" || return 1
mkdir -p "${_cdx_genfiles}" || return 1
mkdir -p "${_cdx_downloads}/codex-${node_index}" || return 1
mkdir -p "${_cdx_uploads}/codex-${node_index}" || return 1
}
# XXX: output initialization is a bit of a pain. Right now it's
# being piggybacked on cdx_launch_node and cdx_log_timings_start
# so we don't have to add extra initialization calls.
_cdx_init_global_outputs() {
mkdir -p "${_cdx_logs}" || return 1
mkdir -p "${_cdx_genfiles}" || return 1
mkdir -p "${_cdx_timing_partials}" || return 1
}
cdx_generate_file() {
local size_mb="${1}" filename
filename="${_cdx_genfiles}/file-$(date +%s).bin"
@ -208,19 +228,21 @@ cdx_upload_file() {
}
cdx_download_file() {
local node_index="$1" cid="$2"
curl --silent --fail\
local node_index="$1" cid="$2" timestamp
timestamp="$(date +%s)" || return 1
TIMEFORMAT="${_cdx_timing_prefix}download,${node_index},${cid},%R,%U,%S"
# Note that timing partial filenames embed the node index, a Unix timestamp and
# a random suffix, so that lexicographic sorting groups entries by node and
# orders them chronologically, with the random suffix breaking ties for entries
# recorded within the same second.
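# For illustration (hypothetical values): a download by node 1 would leave a
# partial file such as
#   ${_cdx_timing_partials}/codex-1-1750440951-21748.csv
# containing a single record of the shape
#   <timing prefix>download,<node index>,<cid>,<real>,<user>,<sys>
# which cdx_flush_partial_timings later appends to the aggregated timing log.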
{ time curl --silent --fail\
-XGET "http://localhost:$(_cdx_api_port "$node_index")/api/codex/v1/data/$cid/network/stream"\
-o "${_cdx_downloads}/codex-${node_index}/$cid" || return 1
-o "${_cdx_downloads}/codex-${node_index}/$cid" ; } 2> \
"${_cdx_timing_partials}/codex-${node_index}-${timestamp}-${RANDOM}.csv"
}
cdx_download_file_async() {
(
cdx_download_file "$@"
pm_job_exit $?
) &
pm_track_last_job
echo $!
pm_async cdx_download_file "$@"
}
cdx_upload_sha1() {
@ -250,4 +272,32 @@ cdx_check_download() {
return 1
fi
return 0
}
cdx_log_timings_start() {
_cdx_init_global_outputs || return 1
local log_file="$1" prefix="$2"
touch "$log_file" || return 1
_cdx_timing_log="$log_file"
if [[ ! "$prefix" =~ ',$' ]]; then
prefix="$prefix,"
fi
_cdx_timing_prefix="$prefix"
}
cdx_flush_partial_timings() {
for file in "${_cdx_timing_partials}"/*; do
[ -e "$file" ] || continue
cat "$file" >> "${_cdx_timing_log}" || return 1
rm "$file"
done
}
cdx_log_timings_end() {
cdx_flush_partial_timings
_cdx_timing_log="/dev/null"
_cdx_timing_prefix=""
}
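# A minimal sketch of the intended flow, mirroring the bats test added further
# down; the log path, prefix, sizes and timeouts are illustrative:
#   pm_start
#   cdx_log_timings_start "${_cdx_output}/experiment-0.csv" "experiment-0,100MB"
#   cdx_launch_network 5
#   filename=$(cdx_generate_file 10)
#   cid=$(cdx_upload_file 0 "$filename")
#   handle=$(cdx_download_file_async 1 "$cid")  # writes a timing partial when done
#   await "$handle" 30
#   cdx_log_timings_end  # flushes partials into experiment-0.csv
#   pm_stop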

View File

@ -1,4 +1,10 @@
#!/usr/bin/env bash
#
# procmon is a process monitor that tracks a set (group) of processes
# and kills the entire process group and all of its descendants if one
# of them fails or gets killed. It is used to ensure that no processes
# from failed experiments are left behind.
#
set -o pipefail
LIB_SRC=${LIB_SRC:-$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)}
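# A rough usage sketch, based on the tests in this commit; some_long_running_task
# stands in for any shell function, and the timeouts are illustrative:
#   pm_start                                   # start the monitor
#   handle=$(pm_async some_long_running_task)  # run a tracked background job
#   await "$handle" 30                         # wait for the job itself
#   pm_stop                                    # shut the monitor down
#   pm_join 3                                  # wait for the monitor to exit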
@ -32,14 +38,14 @@ pm_start() {
while true; do
pm_known_pids
for pid in "${result[@]}"; do
if kill -0 "${pid}"; then
if kill -0 "${pid}" 2> /dev/null; then
continue
fi
exit_code=$(cat "${_pm_output}/${pid}.pid")
# If the cat fails, this means the file was deleted. This will typically
# only happen this way if the process was removed from tracking and then
# killed right after.
# If the cat fails, the file was deleted: the process is no longer being
# tracked, but pm_stop_tracking was called after our last call to
# pm_known_pids. Simply ignore the process.
#
# shellcheck disable=SC2181
if [ $? -ne 0 ]; then
@ -47,17 +53,22 @@ pm_start() {
continue
fi
# Parent process crashed or got killed. We won't get a return code in
# these cases.
if [ -z "$exit_code" ]; then
echoerr "[procmon] ${pid} died with unknown exit code. Aborting."
_pm_halt "halted_no_return"
fi
# Parent process exited successfully, all good.
if [ "$exit_code" -eq 0 ]; then
echoerr "[procmon] ${pid} died with exit code $exit_code."
rm "${_pm_output}/${pid}.pid"
continue
fi
# If we've made it this far, the parent process died with a non-zero exit code,
# so we kill the whole process group.
echoerr "[procmon] ${pid} is dead with exit code $exit_code. Aborting."
_pm_halt "halted_process_failure"
done
@ -89,17 +100,13 @@ pm_track_last_job() {
fi
}
# Stops tracking a given PID.
# Stops tracking a given PID. This means that the process dying or exiting
# with an error code will no longer bring down the whole process group.
# Arguments:
# $1: PID to stop tracking
# Returns:
# 1 if the process monitor is not running
# 0 otherwise
# Note:
# This function is flaky. The process monitor
# might still see the PID as tracked after this
# function returns for a short period of time,
# so do not rely too much on it.
pm_stop_tracking() {
_pm_assert_state "running" || return 1
@ -172,10 +179,18 @@ pm_stop() {
_pm_halt "halted"
}
# Waits for the process monitor to exit. Returns immediately if
# the process monitor is not running.
# Arguments:
# $1: timeout in seconds
pm_join() {
await "$_pm_pid" "$1"
}
# This function is called by the shell running a background job before
# it exits to communicate the exit code to the process monitor.
# Arguments:
# $1: exit code
pm_job_exit() {
local pid_file="${_pm_output}/${BASHPID}.pid" exit_code=$1
# If the process is not tracked, don't write down an exit code.
@ -187,6 +202,10 @@ pm_job_exit() {
exit "$exit_code"
}
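# Background on the pid-file handshake, summarised from the monitor loop in
# pm_start above (how the pid file gets created when a job is first tracked is
# not shown in this diff):
#   - every tracked job has a file ${_pm_output}/<pid>.pid
#   - pm_job_exit writes the job's exit code into that file right before exiting
#   - the monitor treats a missing file as "no longer tracked", an empty file as
#     "crashed or got killed", 0 as a clean exit, and any other value as a
#     failure that brings the whole group down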
# Kills a process and all of its descendants. This is full of caveats,
# so see `test_procmon` for an example of how to use it.
# Arguments:
# $1: process ID
pm_kill_rec() {
local parent="$1" descendant
@ -209,6 +228,15 @@ pm_list_descendants() {
_pm_list_descendants "$@"
}
pm_async() {
(
"$@"
pm_job_exit "$?"
) &
pm_track_last_job
echo $!
}
_pm_list_descendants() {
local parent="$1"
result+=("${parent}")

View File

@ -28,13 +28,14 @@ echoerr() {
await() {
local pid=$1 timeout=${2:-30} start="${SECONDS}"
while kill -0 "$pid"; do
while kill -0 "$pid" 2> /dev/null; do
if ((SECONDS - start > timeout)); then
echoerr "Error: timeout waiting for process $pid to exit"
return 1
fi
sleep 0.1
done
echoerr "Process $pid exited"
return 0
}
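# For reference, this is how await is used elsewhere in this commit; the timeout
# is illustrative, and $cid is assumed to hold a previously uploaded CID. await
# returns 0 once the process has exited and 1 on timeout:
#   handle=$(cdx_download_file_async 1 "$cid")
#   await "$handle" 30 || echoerr "download did not finish in time"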

View File

@ -4,10 +4,12 @@ setup() {
load test_helper/common_setup
common_setup
# shellcheck source=./src/codex.bash
source "${LIB_SRC}/codex.bash"
}
@test "should generate the correct Codex command line for node 0" {
# shellcheck disable=SC2140
assert_equal "$(cdx_cmdline 0)" "${_cdx_binary} --nat:none"\
" --log-file=${_cdx_output}/logs/codex-0.log"\
" --data-dir=${_cdx_output}/data/codex-0"\
@ -15,6 +17,7 @@ setup() {
}
@test "should generate the correct Codex command line for node 1" {
# shellcheck disable=SC2140
assert_equal "$(cdx_cmdline 1 '--bootstrap-node' 'node-spr')" "${_cdx_binary} --nat:none"\
" --bootstrap-node=node-spr --log-file=${_cdx_output}/logs/codex-1.log"\
" --data-dir=${_cdx_output}/data/codex-1"\
@ -27,6 +30,7 @@ setup() {
}
@test "should generate metrics options when metrics enabled for node" {
# shellcheck disable=SC2140
assert_equal "$(cdx_cmdline 0 --metrics)" "${_cdx_binary} --nat:none"\
" --metrics --metrics-port=8290 --metrics-address=0.0.0.0"\
" --log-file=${_cdx_output}/logs/codex-0.log"\
@ -69,7 +73,8 @@ setup() {
refute [ -f "${_cdx_output}/logs/codex-0.log" ]
assert [ -z "${_cdx_pids[0]}" ]
assert $(! kill -0 "$pid")
# Node should already be dead.
refute kill -0 "$pid"
pm_stop
}
@ -81,7 +86,7 @@ setup() {
filename=$(cdx_generate_file 10)
echo "$(sha1 "$filename")" > "${_cdx_uploads}/codex-0/fakecid.sha1"
sha1 "$filename" > "${_cdx_uploads}/codex-0/fakecid.sha1"
cp "$filename" "${_cdx_downloads}/codex-1/fakecid"
# Checks that the file uploaded at 0 matches the file downloaded at 1.
@ -114,7 +119,7 @@ setup() {
cid=$(cdx_upload_file 0 "$filename")
handle=$(cdx_download_file_async 0 "$cid")
await $handle 3
await "$handle" 3
assert cdx_check_download 0 0 "$cid"
@ -131,7 +136,7 @@ setup() {
handles=()
for i in {1..4}; do
handles+=($(cdx_download_file_async "$i" "$cid"))
handles+=("$(cdx_download_file_async "$i" "$cid")")
done
assert await_all "${handles[@]}"
@ -143,6 +148,48 @@ setup() {
pm_stop
}
@test "should log download timing information when requested" {
pm_start
cdx_log_timings_start "${_cdx_output}/experiment-0.csv" "experiment-0,100MB"
assert cdx_launch_network 5
filename=$(cdx_generate_file 10)
cid=$(cdx_upload_file 0 "$filename")
handles=()
for i in {1..4}; do
handles+=("$(cdx_download_file_async "$i" "$cid")")
done
assert await_all "${handles[@]}"
for i in {1..4}; do
assert cdx_check_download 0 "$i" "$cid"
done
cdx_log_timings_end
pm_stop
decimal_regex='^[0-9]+(\.[0-9]+)?$'
while IFS=',' read -r experiment file_size operation node_index recorded_cid wallclock user system; do
assert [ "$experiment" = "experiment-0" ]
assert [ "$file_size" = "100MB" ]
assert [ "$recorded_cid" = "$cid" ]
assert [ "$operation" = "download" ]
# We can't use asserts for regex matches, so fall back to plain
# bash tests.
[[ "$node_index" =~ [1-4] ]]
[[ "$wallclock" =~ $decimal_regex ]]
[[ "$user" =~ $decimal_regex ]]
[[ "$system" =~ $decimal_regex ]]
done < "${_cdx_output}/experiment-0.csv"
}
teardown() {
clh_clear_outputs
}
}

View File

@ -2,6 +2,7 @@ setup() {
load test_helper/common_setup
common_setup
# shellcheck source=./src/procmon.bash
source "${LIB_SRC}/procmon.bash"
}
@ -46,15 +47,15 @@ setup() {
}
@test "should not start process monitor twice" {
assert_equal $(pm_state) "halted"
assert_equal "$(pm_state)" "halted"
assert pm_start
assert_equal $(pm_state) "running"
assert_equal "$(pm_state)" "running"
refute pm_start
assert pm_stop
assert_equal $(pm_state) "halted"
assert_equal "$(pm_state)" "halted"
}
@test "should not stop the process monitor if it wasn't started" {
@ -132,7 +133,7 @@ setup() {
pm_join 3
assert_equal $(pm_state) "halted_process_failure"
assert_equal "$(pm_state)" "halted_process_failure"
}
@test "should no longer track a process if requested" {
@ -158,5 +159,6 @@ setup() {
pm_stop
pm_join 3
assert_equal $(pm_state) "halted"
}
assert_equal "$(pm_state)" "halted"
}