From 3cdb1c26011c299744438de1f2444a793e18c6de Mon Sep 17 00:00:00 2001
From: gmega
Date: Fri, 20 Oct 2023 13:48:01 -0300
Subject: [PATCH] add log helper scripts and snippets, update analysis

---
 .gitignore                                 |  1 +
 analysis/analysis.Rmd                      | 56 ++++++++++++++++++++++
 bin/csv-concat.sh                          |  8 ++++
 bin/pull-all-logs.sh                       | 29 +++++++++++
 bin/{pull_pod_logs.sh => pull-pod-logs.sh} | 11 +++--
 bin/snippets/README.md                     |  4 ++
 bin/snippets/upload-bug.sh                 | 26 ++++++++++
 7 files changed, 130 insertions(+), 5 deletions(-)
 create mode 100755 bin/csv-concat.sh
 create mode 100755 bin/pull-all-logs.sh
 rename bin/{pull_pod_logs.sh => pull-pod-logs.sh} (51%)
 create mode 100644 bin/snippets/README.md
 create mode 100644 bin/snippets/upload-bug.sh

diff --git a/.gitignore b/.gitignore
index 5537745..031f1c6 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,6 +4,7 @@ dist
 **/__pycache__
 .Rproj.user
 .RData
+.Rhistory
 *.html
 *.log
 *.csv
diff --git a/analysis/analysis.Rmd b/analysis/analysis.Rmd
index fc4b618..e406770 100644
--- a/analysis/analysis.Rmd
+++ b/analysis/analysis.Rmd
@@ -8,6 +8,8 @@ library(tidyverse)
 library(lubridate)
 ```
 
+# Node Crashing on Upload
+
 ```{r}
 uploads <- read_csv('./codex-continuous-tests-0codex3-5-77bdb95dc7-j7f46_codex3-5-uploads.csv')
 ```
@@ -150,3 +152,57 @@ ggplot(
   theme_minimal()
 ```
 
+
+# Whole-Cluster
+
+```{r}
+cluster_uploads <- read_csv('../data/20/pods/uploads/all_uploads.csv') |> filter(source != 'source')
+```
+
+```{r}
+cluster_upload_durations <- cluster_uploads |> group_by(source, upload) |> arrange(timestamp) |> summarise(duration = as.numeric(timestamp[n()] - timestamp[1]))
+```
+
+```{r fig.width=12}
+ggplot(cluster_upload_durations) +
+  geom_line(aes(x = upload, y = duration, col = source)) +
+  theme_minimal() +
+  facet_wrap(. ~ source) +
+  guides(color = FALSE)
+```
+
+```{r}
+cluster_interlog_intervals <- cluster_uploads |>
+  group_by(source, upload) |>
+  arrange(timestamp) |>
+  mutate(log_interval = as.numeric(timestamp - lag(timestamp))) |>
+  ungroup()
+```
+
+```{r fig.width=10}
+cluster_interlog_intervals |>
+  group_by(source, upload) |>
+  summarise(
+    mean_li = mean(log_interval, na.rm=TRUE),
+    median_li = median(log_interval, na.rm=TRUE),
+    max_li = max(log_interval, na.rm=TRUE),
+  ) |>
+  pivot_longer(-c(source, upload)) %>% {
+    ggplot(.) +
+      geom_line(aes(x = upload, y = value, col = name)) +
+      scale_y_log10() +
+      theme_minimal() +
+      ylab('interval between log messages (logscale, seconds)') +
+      facet_wrap(. ~ source)
+  }
+```
+
+
+```{r}
+ggplot(cluster_interlog_intervals) +
+  geom_line(aes(x = upload, y = log_interval, col = source)) +
+  theme_minimal() +
+  facet_wrap(. ~ source) +
+  guides(color = FALSE)
+```
+
diff --git a/bin/csv-concat.sh b/bin/csv-concat.sh
new file mode 100755
index 0000000..d4ae906
--- /dev/null
+++ b/bin/csv-concat.sh
@@ -0,0 +1,8 @@
+#!/usr/bin/env bash
+# Concatenates CSV files that have identical headers by removing the header from all but the first file. This is
+# meant to be used after a call to `cat`; e.g., cat csv1.csv csv2.csv | csv-concat.sh
+set -e
+
+IFS= read -r header # read (not head) so no bytes beyond the first line are consumed from the pipe
+echo "$header"
+grep -Fxv "$header"
\ No newline at end of file
diff --git a/bin/pull-all-logs.sh b/bin/pull-all-logs.sh
new file mode 100755
index 0000000..94f3d26
--- /dev/null
+++ b/bin/pull-all-logs.sh
@@ -0,0 +1,29 @@
+#!/usr/bin/env bash
+# Given a namespace and a base folder containing the runner logs for continuous tests, creates
+# a storage area (folder) and:
+#
+# 1. pulls pod logs into storage_area/pods
+# 2. copies runner logs to storage_area/runner
+#
+# Make sure you delete the original runner logs once this is done, as otherwise they might get copied into more
+# than one storage area.
+set -e
+
+namespace=${1}
+runner_log_source=${2}
+
+if [ -z "$namespace" ] || [ -z "$runner_log_source" ]; then
+  echo "Usage: bin/pull-all-logs.sh <namespace> <runner log folder>"
+  exit 1
+fi
+
+run_id=$(date +'%Y-%m-%d-%H%M%S')
+logs="data/logs/$run_id"
+pod_logs="$logs/pods"
+runner_logs="$logs/runner"
+
+mkdir -p "$pod_logs"
+bash bin/pull-pod-logs.sh "$namespace" "$pod_logs"
+
+mkdir -p "$runner_logs"
+cp -r "$runner_log_source"/. "$runner_logs"
\ No newline at end of file
diff --git a/bin/pull_pod_logs.sh b/bin/pull-pod-logs.sh
similarity index 51%
rename from bin/pull_pod_logs.sh
rename to bin/pull-pod-logs.sh
index 95546bc..f21d230 100755
--- a/bin/pull_pod_logs.sh
+++ b/bin/pull-pod-logs.sh
@@ -1,22 +1,23 @@
 #!/bin/bash
-NAMESPACE=${1:-"codex-continuous-tests"}
+namespace=${1:-"codex-continuous-tests"}
+output_folder=${2:-./}
 
 # List all pods in the namespace
-pods=$(kubectl get pods -n $NAMESPACE -o jsonpath='{.items[*].metadata.name}')
+pods=$(kubectl get pods -n "$namespace" -o jsonpath='{.items[*].metadata.name}')
 
 for pod in $pods; do
     echo "Fetching logs for $pod..."
 
     # Handle pods with multiple containers
-    containers=$(kubectl get pod $pod -n $NAMESPACE -o jsonpath='{.spec.containers[*].name}')
+    containers=$(kubectl get pod "$pod" -n "$namespace" -o jsonpath='{.spec.containers[*].name}')
 
     for container in $containers; do
         if [ "$container" == "$pod" ]; then
             # If there's only one container, name the log file after the pod
-            kubectl logs $pod -n $NAMESPACE > "${1}${pod}.log"
+            kubectl logs "$pod" -n "$namespace" > "${output_folder}/${pod}.log"
         else
             # If there are multiple containers, name the log file after the pod and container
-            kubectl logs $pod -c $container -n $NAMESPACE > "${1}${pod}_${container}.log"
+            kubectl logs "$pod" -c "$container" -n "$namespace" > "${output_folder}/${pod}_${container}.log"
         fi
     done
 done
diff --git a/bin/snippets/README.md b/bin/snippets/README.md
new file mode 100644
index 0000000..0aa9d74
--- /dev/null
+++ b/bin/snippets/README.md
@@ -0,0 +1,4 @@
+# Snippets
+
+Ad hoc snippets which reshape data for one-off analyses; not worth the trouble of turning into proper scripts.
+
diff --git a/bin/snippets/upload-bug.sh b/bin/snippets/upload-bug.sh
new file mode 100644
index 0000000..ffdfd72
--- /dev/null
+++ b/bin/snippets/upload-bug.sh
@@ -0,0 +1,26 @@
+set -e
+
+base_folder=${1:-"./data/20"}
+mkdir -p "${base_folder}/pods/uploads"
+
+# tags uploads with an id
+for i in "${base_folder}"/pods/codex-continuous-tests-0codex*; do
+  python -m adhoc.identify_uploads < "$i" > "${i%/*}/uploads/${i##*/}"
+done
+
+# transforms raw logs into a single CSV
+for i in "${base_folder}"/pods/uploads/codex-continuous-tests-0codex*; do
+  python -m logtools.cli.to_csv < "$i" \
+    --extract-fields upload \
+    --constant-column \
+    source="$(basename "${i%.*}")" >> "${base_folder}"/pods/uploads/all_uploads.csv.temp
+done
+
+./bin/csv-concat.sh < "${base_folder}"/pods/uploads/all_uploads.csv.temp > "${base_folder}"/pods/uploads/all_uploads.csv
+rm "${base_folder}"/pods/uploads/all_uploads.csv.temp
+
+# extracts debug endpoint data and looks into wantlist sizes
+grep -h 'Before upload\|After download' "${base_folder}"/runner/*.log | \
+  sed -En 's/\[(.{28})\] <([A-Z]+[0-9]+)> (Before upload|After download): (.*)$/\4/p' > "${base_folder}"/runner/merged.jsonl
+
+jq '.pendingBlocks' < "${base_folder}"/runner/merged.jsonl | uniq # should print 0
\ No newline at end of file
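
For reference, a usage sketch of the flow the two log-pulling scripts implement. The namespace is the default baked into pull-pod-logs.sh; `./runner-logs` is an illustrative path standing in for wherever the test runner wrote its logs:

```bash
# Pull pod logs and archive runner logs into a fresh storage area,
# data/logs/<timestamp>/{pods,runner}, as described in the pull-all-logs.sh header.
bash bin/pull-all-logs.sh codex-continuous-tests ./runner-logs

# Per the warning in the script header, delete the originals afterwards so a
# later run does not archive them a second time.
rm -r ./runner-logs/*
```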
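
And csv-concat.sh in action, with two made-up CSV chunks shaped like the snippet's output (a `source` column plus the extracted `upload` field):

```bash
# Two chunks sharing an identical header line.
printf 'source,upload\ncodex1,1\n' > part1.csv
printf 'source,upload\ncodex2,2\n' > part2.csv

# The script echoes the first header and drops every later line that is
# exactly equal to it.
cat part1.csv part2.csv | ./bin/csv-concat.sh
# source,upload
# codex1,1
# codex2,2
```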
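
Finally, a sketch of the runner-log extraction at the end of upload-bug.sh. The log line below is fabricated to fit the sed pattern, which only pins down the shape: 28 characters between square brackets, a node tag like `<CODEX1>`, the event name, then the JSON payload from the debug endpoint. The real runner timestamp format may differ:

```bash
line='[2023-10-20 13:48:01.1234567Z] <CODEX1> Before upload: {"pendingBlocks": 0}'

# Group 4 of the pattern is the JSON payload; -n plus the p flag prints only
# lines where the substitution actually matched.
echo "$line" | sed -En 's/\[(.{28})\] <([A-Z]+[0-9]+)> (Before upload|After download): (.*)$/\4/p'
# {"pendingBlocks": 0}
```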