remove stuff that no longer belongs here
This commit is contained in:
parent
71c40244d3
commit
73a7ed7480
|
@ -1,18 +0,0 @@
|
||||||
"""Ad-hoc script which tags uploads with a sequential number."""
|
|
||||||
import sys
|
|
||||||
|
|
||||||
uploading = False
|
|
||||||
upload_no = 0
|
|
||||||
for line in sys.stdin:
|
|
||||||
if 'Handling file upload' in line:
|
|
||||||
upload_no += 1
|
|
||||||
uploading = True
|
|
||||||
|
|
||||||
if uploading:
|
|
||||||
line = line.strip()
|
|
||||||
parts = line.rsplit(' ', maxsplit=1)
|
|
||||||
line = ' '.join([parts[0], f'upload={upload_no}', parts[1]])
|
|
||||||
print(line)
|
|
||||||
|
|
||||||
if 'Uploaded file' in line:
|
|
||||||
uploading = False
|
|
|
@ -1,208 +0,0 @@
|
||||||
---
title: "R Notebook"
output: html_notebook
---

```{r}
library(tidyverse)
library(lubridate)
```

# Node Crashing on Upload

```{r}
uploads <- read_csv('./codex-continuous-tests-0codex3-5-77bdb95dc7-j7f46_codex3-5-uploads.csv')
```

```{r}
# Per-upload wall-clock duration: the span between the first and last log line
# tagged with that upload id.
# NOTE(review): `arrange(count)` assumes the CSV carries a `count` column that
# orders lines chronologically — confirm against the CSV schema.
durations <- uploads |>
  arrange(count) |>
  group_by(upload) |>
  summarise(
    start = timestamp[1],
    end = timestamp[n()]
  ) |>
  mutate(duration = end - start)
```

How long are uploads taking?

```{r}
ggplot(durations, aes(x = upload, y = duration)) +
  geom_point() +
  geom_line() +
  ylab('upload duration') +
  xlab('upload number') +
  theme_minimal()
```

Are all uploads completing?

```{r}
uploads |>
  filter(message == 'Got data from stream') |>
  group_by(upload) |>
  count(name = 'blocks')
```

Does the end of the upload coincide with the last chunk that gets stored?

```{r}
uploads |>
  filter(grepl('Got data from stream', message)) |>
  group_by(upload) |>
  summarise(
    last_store = max(timestamp)
  ) |>
  inner_join(durations, by = 'upload')
```

```{r}
durations
```

```{r}
uploads |> filter(grepl('Exception', message)) |> group_by(message) |> count() |> arrange(n)
```

```{r}
uploads |> filter(upload == 18) |> group_by(message) |> count() |> arrange(n)
```

```{r}
uploads |> filter(upload == 17) |> group_by(message) |> count() |> arrange(n)
```

```{r}
# Messages frequent enough (> 100 occurrences) to be worth plotting below.
messages <- uploads |> group_by(message) |> count() |> filter(n > 100) |> pull(message)
```

```{r fig.height = 10}
uploads |> filter(message %in% messages) |> group_by(upload, message) |> count() %>% {
  ggplot(.) +
    geom_point(aes(x = message, y = n, color = as.factor(upload))) + theme_minimal() + theme(axis.text.x = element_text(angle = 45, hjust=1)) +
    ylab('count') +
    scale_color_manual(values=c('18'='red'))
}
```

```{r}
# Time elapsed between consecutive log lines, computed per upload.
interlog_intervals <- uploads |>
  group_by(upload) |>
  arrange(timestamp) |>
  mutate(log_interval = as.numeric(timestamp - lag(timestamp))) |>
  ungroup()
```

```{r}
interlog_intervals |>
  group_by(upload) |>
  summarise(
    mean_li = mean(log_interval, na.rm = TRUE),
    median_li = median(log_interval, na.rm = TRUE),
    max_li = max(log_interval, na.rm = TRUE)
  ) |>
  pivot_longer(-upload) %>% {
    ggplot(.) +
      geom_line(aes(x = upload, y = value, col = name)) +
      scale_y_log10() +
      theme_minimal() +
      ylab('duration (logscale, seconds)')
  }
```

```{r}
interlog_intervals |> group_by(upload) |> count() |> arrange(desc(n))
```

```{r fig.height=5}
interlog_intervals |>
  group_by(upload) |>
  arrange(log_interval) |>
  mutate(rank = seq_along(log_interval)) |> ungroup() %>% {
    ggplot(.) +
      geom_point(aes(x = rank, y = log_interval, col = as.factor(upload))) +
      theme_minimal() +
      xlab('rank') +
      ylab('time between two consecutive log messages') +
      guides(col = guide_legend(title = 'upload #'))
  }
```

```{r}
# Zoom into upload 18 (the suspect one): raw intervals plus a bucketed mean and
# 95th percentile over 5-second buckets.
ggplot(
  interlog_intervals |>
    filter(upload == 18) |>
    mutate(bucket = floor_date(timestamp, unit = '5 seconds')) |>
    group_by(bucket) |>
    mutate(
      mean_interval = mean(log_interval),
      # FIX: renamed p_70 -> p_95 — it computes the 0.95 quantile. The first
      # interval of each bucket is dropped: it straddles the bucket boundary.
      p_95 = quantile(log_interval[-1], probs = c(0.95))
    ) |>
    ungroup()
) +
  geom_point(aes(x = timestamp, y = log_interval)) +
  geom_line(aes(x = bucket, y = mean_interval), col = 'red', lwd = 2) +
  geom_line(aes(x = bucket, y = p_95), col = 'orange', lwd = 2) +
  theme_minimal()
```

# Whole-Cluster

```{r}
# `filter(source != 'source')` drops the repeated CSV headers left over from
# naive concatenation of the per-pod CSVs.
cluster_uploads <- read_csv('../data/20/pods/uploads/all_uploads.csv') |> filter(source != 'source')
```

```{r}
cluster_upload_durations <- cluster_uploads |> group_by(source, upload) |> arrange(timestamp) |> summarise(duration = as.numeric(timestamp[n()] - timestamp[1]))
```

```{r fig.width=12}
ggplot(cluster_upload_durations) +
  geom_line(aes(x = upload, y = duration, col = source)) +
  theme_minimal() +
  facet_wrap(. ~ source) +
  guides(color = 'none')
```

```{r}
# Same inter-log-line intervals as above, but across every pod in the cluster.
cluster_interlog_intervals <- cluster_uploads |>
  group_by(source, upload) |>
  arrange(timestamp) |>
  mutate(log_interval = as.numeric(timestamp - lag(timestamp))) |>
  ungroup()
```

```{r fig.width=10}
cluster_interlog_intervals |>
  group_by(source, upload) |>
  summarise(
    mean_li = mean(log_interval, na.rm = TRUE),
    median_li = median(log_interval, na.rm = TRUE),
    max_li = max(log_interval, na.rm = TRUE)
  ) |>
  pivot_longer(-c(source, upload)) %>% {
    ggplot(.) +
      geom_line(aes(x = upload, y = value, col = name)) +
      scale_y_log10() +
      theme_minimal() +
      ylab('interval between log messages (logscale, seconds)') +
      facet_wrap(. ~ source)
  }
```

```{r}
# FIX: this chunk referenced `duration`, which does not exist in
# `cluster_interlog_intervals` (that frame carries `log_interval`), so it
# errored when run; plot the intervals it actually holds.
ggplot(cluster_interlog_intervals) +
  geom_line(aes(x = upload, y = log_interval, col = source)) +
  theme_minimal() +
  facet_wrap(. ~ source) +
  guides(color = 'none')
```
|
|
||||||
|
|
|
@ -1,13 +0,0 @@
|
||||||
Version: 1.0

RestoreWorkspace: Default
SaveWorkspace: Default
AlwaysSaveHistory: Default

EnableCodeIndexing: Yes
UseSpacesForTab: Yes
NumSpacesForTab: 2
Encoding: UTF-8

RnwWeave: Sweave
LaTeX: pdfLaTeX
|
|
|
@ -1,8 +0,0 @@
|
||||||
#!/usr/bin/env bash
# Concatenates CSV files that have identical headers by removing the header from all but the
# first file. This is meant to be used after a call to `cat`; e.g.,
#   cat csv1.csv csv2.csv | csv-concat.sh
set -e

# The first line of the combined stream is the canonical header.
header=$(head -n 1)
echo "$header"
# FIX: added -x so only lines that are *exactly* the header are dropped; the
# original -Fv alone also discarded any data line merely containing the header
# text. `|| true` keeps `set -e` from aborting when no data lines remain
# (grep exits 1 on zero matches).
grep -Fxv "$header" || true
|
|
|
@ -1,29 +0,0 @@
|
||||||
#!/usr/bin/env bash
# Given a namespace and a base folder containing the runner logs for continuous tests, creates
# a storage area (folder) and:
#
#   1. pulls pod logs into storage_area/pods
#   2. copies runner logs to storage_area/runner
#
# Make sure you delete the original runner logs once this is done, as otherwise they might get
# copied into more than one storage area.
set -e

ns=${1}
runner_src=${2}

# Both arguments are mandatory.
if [ -z "$ns" ] || [ -z "$runner_src" ]; then
  echo "Usage: bin/process_logs.sh <namespace> <runner_logs>"
  exit 1
fi

# Storage areas are keyed by a timestamp so successive runs never collide.
stamp=$(date +'%Y-%m-%d-%H%M%S')
area="data/logs/$stamp"

# 1. Pod logs.
mkdir -p "$area/pods"
bash ./bin/pull-pod-logs.sh "$ns" "$area/pods"

# 2. Runner logs.
mkdir -p "$area/runner"
cp "$runner_src"/* "$area/runner/"
|
|
|
@ -1,30 +0,0 @@
|
||||||
#!/bin/bash
# Fetches the logs of every pod in a namespace, writing one file per pod (or one
# per pod/container pair when a pod runs several containers) into output_folder.

namespace=${1:-"codex-continuous-tests"}
output_folder=${2:-./}

# List all pods in the namespace
pods=$(kubectl get pods -n "$namespace" -o jsonpath='{.items[*].metadata.name}')

if [ -z "$pods" ]; then
  echo "No pods found in namespace $namespace."
  exit 1
fi

for p in $pods; do
  echo "Fetching logs for $p..."

  # A pod may run more than one container; enumerate them all.
  for c in $(kubectl get pod "$p" -n "$namespace" -o jsonpath='{.spec.containers[*].name}'); do
    if [ "$c" == "$p" ]; then
      # Single container (conventionally named after the pod): name the log
      # file after the pod alone.
      kubectl logs "$p" -n "$namespace" > "${output_folder}/${p}.log"
    else
      # Multiple containers: disambiguate with the container name.
      kubectl logs "$p" -c "$c" -n "$namespace" > "${output_folder}/${p}_${c}.log"
    fi
  done
done

echo "Done fetching logs."
|
|
|
@ -1,4 +0,0 @@
|
||||||
# Snippets

Ad-hoc snippets which reshape data for one-off analysis, not worth the trouble of making into scripts.
|
|
||||||
|
|
|
@ -1,26 +0,0 @@
|
||||||
# Post-processes continuous-test logs under $base_folder: tags upload ids,
# flattens everything into a single CSV, and sanity-checks the debug-endpoint
# data captured by the runner.
set -e

base_folder=${1:-"./data/20"}
mkdir -p "${base_folder}/pods/uploads"

# tags uploads with id
for i in "${base_folder}"/pods/codex-continuous-tests-0codex*; do
  python -m adhoc.identify_uploads < "$i" > "${i%/*}/uploads/${i##*/}"
done

# transforms raw logs into single CSV
for i in "${base_folder}"/pods/uploads/codex-continuous-tests-0codex*; do
  # FIX: bash has no nested parameter expansion — ${${i##*/}%.*} is a
  # "bad substitution" error (zsh-only syntax). Strip the directory and the
  # extension in two steps instead.
  base_name=${i##*/}
  python -m logtools.cli.to_csv < "$i" \
    --extract-fields upload \
    --constant-column \
    source="${base_name%.*}" >> "${base_folder}"/pods/uploads/all_uploads.csv.temp
done

./bin/csv-concat.sh < "${base_folder}"/pods/uploads/all_uploads.csv.temp > "${base_folder}"/pods/uploads/all_uploads.csv
rm "${base_folder}"/pods/uploads/all_uploads.csv.temp

# extracts debug endpoint data and looks into wantlist sizes
# FIX: added -n; without it sed auto-prints every (already grep-matched) line
# once and the `p` flag prints it again, duplicating each JSON record.
grep -h 'Before upload\|After download' "${base_folder}"/runner/*.log | \
  sed -nE 's/\[(.{28})\] <([A-Z]+[0-9]+)> (Before upload|After download): (.*)$/\4/p' > "${base_folder}"/runner/merged.jsonl

jq '.pendingBlocks' < "${base_folder}"/runner/merged.jsonl | uniq # should print 0
|
|
Loading…
Reference in New Issue