From 73a7ed748074de2147ddb49f09b2bb0392141609 Mon Sep 17 00:00:00 2001
From: gmega
Date: Tue, 14 Nov 2023 12:18:57 -0300
Subject: [PATCH] remove stuff that no longer belongs here

---
 adhoc/__init__.py          |   0
 adhoc/identify_uploads.py  |  18 ----
 analysis/analysis.Rmd      | 208 -------------------------------------
 analysis/analysis.Rproj    |  13 ---
 bin/csv-concat.sh          |   8 --
 bin/pull-all-logs.sh       |  29 ------
 bin/pull-pod-logs.sh       |  30 ------
 bin/snippets/README.md     |   4 -
 bin/snippets/upload-bug.sh |  26 -----
 9 files changed, 336 deletions(-)
 delete mode 100644 adhoc/__init__.py
 delete mode 100644 adhoc/identify_uploads.py
 delete mode 100644 analysis/analysis.Rmd
 delete mode 100644 analysis/analysis.Rproj
 delete mode 100755 bin/csv-concat.sh
 delete mode 100755 bin/pull-all-logs.sh
 delete mode 100755 bin/pull-pod-logs.sh
 delete mode 100644 bin/snippets/README.md
 delete mode 100644 bin/snippets/upload-bug.sh

diff --git a/adhoc/__init__.py b/adhoc/__init__.py
deleted file mode 100644
index e69de29..0000000
diff --git a/adhoc/identify_uploads.py b/adhoc/identify_uploads.py
deleted file mode 100644
index ebb5b62..0000000
--- a/adhoc/identify_uploads.py
+++ /dev/null
@@ -1,18 +0,0 @@
-"""Ad-hoc script which tags uploads with a sequential number."""
-import sys
-
-uploading = False
-upload_no = 0
-for line in sys.stdin:
-    if 'Handling file upload' in line:
-        upload_no += 1
-        uploading = True
-
-    if uploading:
-        line = line.strip()
-        parts = line.rsplit(' ', maxsplit=1)
-        line = ' '.join([parts[0], f'upload={upload_no}', parts[1]])
-        print(line)
-
-    if 'Uploaded file' in line:
-        uploading = False
diff --git a/analysis/analysis.Rmd b/analysis/analysis.Rmd
deleted file mode 100644
index e406770..0000000
--- a/analysis/analysis.Rmd
+++ /dev/null
@@ -1,208 +0,0 @@
----
-title: "R Notebook"
-output: html_notebook
----
-
-```{r}
-library(tidyverse)
-library(lubridate)
-```
-
-# Node Crashing on Upload
-
-```{r}
-uploads <- read_csv('./codex-continuous-tests-0codex3-5-77bdb95dc7-j7f46_codex3-5-uploads.csv')
-```
-
-
-```{r}
-durations <- uploads |>
-  arrange(count) |>
-  group_by(upload) |>
-  summarise(
-    start = timestamp[1],
-    end = timestamp[n()],
-  ) |>
-  mutate(duration = end - start)
-```
-
-How long are uploads taking?
-
-```{r}
-ggplot(durations, aes(x = upload, y = duration)) +
-  geom_point() +
-  geom_line() +
-  ylab('upload duration') +
-  xlab('upload number') +
-  theme_minimal()
-```
-Are all uploads completing?
-
-```{r}
-uploads |>
-  filter(message == 'Got data from stream') |>
-  group_by(upload) |>
-  count(name = 'blocks')
-```
-
-Does the end of the upload coincide with the last chunk that gets stored?
-
-```{r}
-uploads |>
-  filter(grepl('Got data from stream', message)) |>
-  group_by(upload) |>
-  summarise(
-    last_store = max(timestamp)
-  ) |>
-  inner_join(durations, by='upload')
-```
-
-```{r}
-durations
-```
-
-```{r}
-uploads |> filter(grepl('Exception', message)) |> group_by(message) |> count() |> arrange(n)
-```
-
-```{r}
-uploads |> filter(upload == 18) |> group_by(message) |> count() |> arrange(n)
-```
-
-```{r}
-uploads |> filter(upload == 17) |> group_by(message) |> count() |> arrange(n)
-```
-
-```{r}
-messages <- uploads |> group_by(message) |> count() |> filter(n > 100) |> pull(message)
-```
-
-
-```{r fig.height = 10}
-uploads |> filter(message %in% messages) |> group_by(upload, message) |> count() %>% {
-  ggplot(.) +
-    geom_point(aes(x = message, y = n, color = as.factor(upload))) + theme_minimal() + theme(axis.text.x = element_text(angle = 45, hjust=1)) +
-    ylab('count') +
-    scale_color_manual(values=c('18'='red'))
-}
-```
-
-
-```{r}
-interlog_intervals <- uploads |>
-  group_by(upload) |>
-  arrange(timestamp) |>
-  mutate(log_interval = as.numeric(timestamp - lag(timestamp))) |>
-  ungroup()
-```
-
-
-```{r}
-interlog_intervals |>
-  group_by(upload) |>
-  summarise(
-    mean_li = mean(log_interval, na.rm=TRUE),
-    median_li = median(log_interval, na.rm=TRUE),
-    max_li = max(log_interval, na.rm=TRUE),
-  ) |>
-  pivot_longer(-upload) %>% {
-    ggplot(.) +
-      geom_line(aes(x = upload, y = value, col = name)) +
-      scale_y_log10() +
-      theme_minimal() +
-      ylab('duration (logscale, seconds)')
-  }
-```
-
-```{r}
-interlog_intervals |> group_by(upload) |> count() |> arrange(desc(n))
-```
-
-
-```{r fig.height=5}
-interlog_intervals |>
-  group_by(upload) |>
-  arrange(log_interval) |>
-  mutate(rank = seq_along(log_interval)) |> ungroup() %>% {
-    ggplot(.) +
-      geom_point(aes(x = rank, y = log_interval, col = as.factor(upload))) +
-      theme_minimal() +
-      xlab('rank') +
-      ylab('time between two consecutive log messages') +
-      guides(col = guide_legend(title = 'upload #'))
-  }
-```
-
-```{r}
-ggplot(
-  interlog_intervals |>
-    filter(upload == 18
-    ) |>
-    mutate(bucket = floor_date(timestamp, unit = '5 seconds')) |>
-    group_by(bucket) |>
-    mutate(
-      mean_interval = mean(log_interval),
-      p_95 = quantile(log_interval[-1], probs = c(0.95))
-    ) |>
-    ungroup()
-  ) +
-  geom_point(aes(x = timestamp, y = log_interval)) +
-  geom_line(aes(x = bucket, y = mean_interval), col = 'red', lwd = 2) +
-  geom_line(aes(x = bucket, y = p_95), col = 'orange', lwd = 2) +
-  theme_minimal()
-```
-
-
-# Whole-Cluster
-
-```{r}
-cluster_uploads <- read_csv('../data/20/pods/uploads/all_uploads.csv') |> filter(source != 'source')
-```
-
-```{r}
-cluster_upload_durations <- cluster_uploads |> group_by(source, upload) |> arrange(timestamp) |> summarise(duration = as.numeric(timestamp[n()] - timestamp[1]))
-```
-
-```{r fig.width=12}
-ggplot(cluster_upload_durations) +
-  geom_line(aes(x = upload, y = duration, col = source)) +
-  theme_minimal() +
-  facet_wrap(. ~ source) +
-  guides(color = FALSE)
-```
-
-```{r}
-cluster_interlog_intervals <- cluster_uploads |>
-  group_by(source, upload) |>
-  arrange(timestamp) |>
-  mutate(log_interval = as.numeric(timestamp - lag(timestamp))) |>
-  ungroup()
-```
-
-```{r fig.width=10}
-cluster_interlog_intervals |>
-  group_by(source, upload) |>
-  summarise(
-    mean_li = mean(log_interval, na.rm=TRUE),
-    median_li = median(log_interval, na.rm=TRUE),
-    max_li = max(log_interval, na.rm=TRUE),
-  ) |>
-  pivot_longer(-c(source, upload)) %>% {
-    ggplot(.) +
-      geom_line(aes(x = upload, y = value, col = name)) +
-      scale_y_log10() +
-      theme_minimal() +
-      ylab('interval between log messages (logscale, seconds)') +
-      facet_wrap(. ~ source)
-  }
-```
-
-
-```{r}
-ggplot(cluster_interlog_intervals) +
-  geom_line(aes(x = upload, y = log_interval, col = source)) +
-  theme_minimal() +
-  facet_wrap(. ~ source) +
-  guides(color = FALSE)
-```
-
diff --git a/analysis/analysis.Rproj b/analysis/analysis.Rproj
deleted file mode 100644
index 8e3c2eb..0000000
--- a/analysis/analysis.Rproj
+++ /dev/null
@@ -1,13 +0,0 @@
-Version: 1.0
-
-RestoreWorkspace: Default
-SaveWorkspace: Default
-AlwaysSaveHistory: Default
-
-EnableCodeIndexing: Yes
-UseSpacesForTab: Yes
-NumSpacesForTab: 2
-Encoding: UTF-8
-
-RnwWeave: Sweave
-LaTeX: pdfLaTeX
diff --git a/bin/csv-concat.sh b/bin/csv-concat.sh
deleted file mode 100755
index d4ae906..0000000
--- a/bin/csv-concat.sh
+++ /dev/null
@@ -1,8 +0,0 @@
-#!/usr/bin/env bash
-# Concatenates CSV files that have identical headers by removing the header from all but the first file. This is
-# meant to be used after a call to `cat`; e.g., cat csv1.csv csv2.csv | csv-concat.sh
-set -e
-
-header=$(head -n 1)
-echo "$header"
-grep "$header" -Fv
\ No newline at end of file
diff --git a/bin/pull-all-logs.sh b/bin/pull-all-logs.sh
deleted file mode 100755
index 33ebee2..0000000
--- a/bin/pull-all-logs.sh
+++ /dev/null
@@ -1,29 +0,0 @@
-#!/usr/bin/env bash
-# Given a namespace and a base folder containing the runner logs for continuous tests, creates
-# a storage area (folder) and:
-#
-#   1. pulls pod logs into storage_area/pods
-#   2. copies runner logs to storage_area/runner
-#
-# Make sure you delete the original runner logs once this is done, as otherwise they might get copied into more
-# than one storage area.
-set -e
-
-namespace=${1}
-runner_log_source=${2}
-
-if [ -z "$namespace" ] || [ -z "$runner_log_source" ]; then
-  echo "Usage: bin/pull-all-logs.sh <namespace> <runner-log-source>"
-  exit 1
-fi
-
-run_id=$(date +'%Y-%m-%d-%H%M%S')
-logs="data/logs/$run_id"
-pod_logs="$logs/pods"
-runner_logs="$logs/runner"
-
-mkdir -p "$pod_logs"
-bash ./bin/pull-pod-logs.sh "$namespace" "$pod_logs"
-
-mkdir -p "$runner_logs"
-cp "$runner_log_source"/* "$runner_logs/"
\ No newline at end of file
diff --git a/bin/pull-pod-logs.sh b/bin/pull-pod-logs.sh
deleted file mode 100755
index 64463b6..0000000
--- a/bin/pull-pod-logs.sh
+++ /dev/null
@@ -1,30 +0,0 @@
-#!/bin/bash
-
-namespace=${1:-"codex-continuous-tests"}
-output_folder=${2:-./}
-
-# List all pods in the namespace
-pods=$(kubectl get pods -n "$namespace" -o jsonpath='{.items[*].metadata.name}')
-
-if [ -z "$pods" ]; then
-  echo "No pods found in namespace $namespace."
-  exit 1
-fi
-
-for pod in $pods; do
-  echo "Fetching logs for $pod..."
-
-  # Handle pods with multiple containers
-  containers=$(kubectl get pod "$pod" -n "$namespace" -o jsonpath='{.spec.containers[*].name}')
-  for container in $containers; do
-    if [ "$container" == "$pod" ]; then
-      # If there's only one container, name the log file after the pod
-      kubectl logs "$pod" -n "$namespace" > "${output_folder}/${pod}.log"
-    else
-      # If there are multiple containers, name the log file after the pod and container
-      kubectl logs "$pod" -c "$container" -n "$namespace" > "${output_folder}/${pod}_${container}.log"
-    fi
-  done
-done
-
-echo "Done fetching logs."
diff --git a/bin/snippets/README.md b/bin/snippets/README.md
deleted file mode 100644
index 0aa9d74..0000000
--- a/bin/snippets/README.md
+++ /dev/null
@@ -1,4 +0,0 @@
-# Snippets
-
-Ad hoc snippets which reshape data for one-off analysis, not worth the trouble of making into scripts.
-
diff --git a/bin/snippets/upload-bug.sh b/bin/snippets/upload-bug.sh
deleted file mode 100644
index ffdfd72..0000000
--- a/bin/snippets/upload-bug.sh
+++ /dev/null
@@ -1,26 +0,0 @@
-set -e
-
-base_folder=${1:-"./data/20"}
-mkdir -p "${base_folder}/pods/uploads"
-
-# tags uploads with id
-for i in "${base_folder}"/pods/codex-continuous-tests-0codex*; do
-  python -m adhoc.identify_uploads < "$i" > "${i%/*}/uploads/${i##*/}"
-done
-
-# transforms raw logs into single CSV
-for i in "${base_folder}"/pods/uploads/codex-continuous-tests-0codex*; do
-  python -m logtools.cli.to_csv < "$i" \
-    --extract-fields upload \
-    --constant-column \
-    source="$(basename "${i%.*}")" >> "${base_folder}"/pods/uploads/all_uploads.csv.temp
-done
-
-./bin/csv-concat.sh < "${base_folder}"/pods/uploads/all_uploads.csv.temp > "${base_folder}"/pods/uploads/all_uploads.csv
-rm "${base_folder}"/pods/uploads/all_uploads.csv.temp
-
-# extracts debug endpoint data and looks into wantlist sizes
-grep -h 'Before upload\|After download' "${base_folder}"/runner/*.log | \
-  sed -nE 's/\[(.{28})\] <([A-Z]+[0-9]+)> (Before upload|After download): (.*)$/\4/p' > "${base_folder}"/runner/merged.jsonl
-
-jq '.pendingBlocks' < "${base_folder}"/runner/merged.jsonl | uniq # should print 0
\ No newline at end of file
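
Note for posterity, since this removes the only copy of the upload-tagging logic: the sketch below captures what adhoc/identify_uploads.py did, reworked as a self-contained generator rather than a stdin filter. The sample log lines are made up for illustration (the real input was Codex pod logs, whose exact format isn't preserved here); the tagging behavior mirrors the deleted script, including dropping lines that fall outside an upload window.

# Sketch of the removed upload-tagging pass (adhoc/identify_uploads.py).
# Assumes log lines end in a `key=value` token, as the real logs appeared
# to (the original spliced `upload=N` in before the last token).
def tag_uploads(lines):
    uploading = False
    upload_no = 0
    for line in lines:
        if 'Handling file upload' in line:
            upload_no += 1
            uploading = True
        if uploading:
            # Splice `upload=N` in before the last whitespace-separated token.
            head, _, tail = line.strip().rpartition(' ')
            yield f'{head} upload={upload_no} {tail}'
        if 'Uploaded file' in line:
            uploading = False

# Hypothetical input: two log lines bracketing a single upload.
sample = [
    'TRC 2023-11-14 Handling file upload count=1',
    'TRC 2023-11-14 Uploaded file count=2',
]
for tagged in tag_uploads(sample):
    print(tagged)
# TRC 2023-11-14 Handling file upload upload=1 count=1
# TRC 2023-11-14 Uploaded file upload=1 count=2

This is the pass that bin/snippets/upload-bug.sh ran over each pod log before handing the result to logtools.cli.to_csv.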