From 73a7ed748074de2147ddb49f09b2bb0392141609 Mon Sep 17 00:00:00 2001
From: gmega
Date: Tue, 14 Nov 2023 12:18:57 -0300
Subject: [PATCH] remove stuff that no longer belongs here

---
 adhoc/__init__.py          |   0
 adhoc/identify_uploads.py  |  18 ----
 analysis/analysis.Rmd      | 208 -------------------------------------
 analysis/analysis.Rproj    |  13 ---
 bin/csv-concat.sh          |   8 --
 bin/pull-all-logs.sh       |  29 ------
 bin/pull-pod-logs.sh       |  30 ------
 bin/snippets/README.md     |   4 -
 bin/snippets/upload-bug.sh |  26 -----
 9 files changed, 336 deletions(-)
 delete mode 100644 adhoc/__init__.py
 delete mode 100644 adhoc/identify_uploads.py
 delete mode 100644 analysis/analysis.Rmd
 delete mode 100644 analysis/analysis.Rproj
 delete mode 100755 bin/csv-concat.sh
 delete mode 100755 bin/pull-all-logs.sh
 delete mode 100755 bin/pull-pod-logs.sh
 delete mode 100644 bin/snippets/README.md
 delete mode 100644 bin/snippets/upload-bug.sh

diff --git a/adhoc/__init__.py b/adhoc/__init__.py
deleted file mode 100644
index e69de29..0000000
diff --git a/adhoc/identify_uploads.py b/adhoc/identify_uploads.py
deleted file mode 100644
index ebb5b62..0000000
--- a/adhoc/identify_uploads.py
+++ /dev/null
@@ -1,18 +0,0 @@
-"""Ad-hoc script which tags uploads with a sequential number."""
-import sys
-
-uploading = False
-upload_no = 0
-for line in sys.stdin:
-    if 'Handling file upload' in line:
-        upload_no += 1
-        uploading = True
-
-    if uploading:
-        line = line.strip()
-        parts = line.rsplit(' ', maxsplit=1)
-        line = ' '.join([parts[0], f'upload={upload_no}', parts[1]])
-        print(line)
-
-    if 'Uploaded file' in line:
-        uploading = False
diff --git a/analysis/analysis.Rmd b/analysis/analysis.Rmd
deleted file mode 100644
index e406770..0000000
--- a/analysis/analysis.Rmd
+++ /dev/null
@@ -1,208 +0,0 @@
----
-title: "R Notebook"
-output: html_notebook
----
-
-```{r}
-library(tidyverse)
-library(lubridate)
-```
-
-# Node Crashing on Upload
-
-```{r}
-uploads <- read_csv('./codex-continuous-tests-0codex3-5-77bdb95dc7-j7f46_codex3-5-uploads.csv')
-```
-
-
-```{r}
-durations <- uploads |>
-  arrange(count) |>
-  group_by(upload) |>
-  summarise(
-    start = timestamp[1],
-    end = timestamp[n()],
-  ) |>
-  mutate(duration = end - start)
-```
-
-How long are uploads taking?
-
-```{r}
-ggplot(durations, aes(x = upload, y = duration)) +
-  geom_point() +
-  geom_line() +
-  ylab('upload duration') +
-  xlab('upload number') +
-  theme_minimal()
-```
-Are all uploads completing?
-
-```{r}
-uploads |>
-  filter(message == 'Got data from stream') |>
-  group_by(upload) |>
-  count(name = 'blocks')
-```
-
-Does the end of the upload coincide with the last chunk that gets stored?
-
-```{r}
-uploads |>
-  filter(grepl('Got data from stream', message)) |>
-  group_by(upload) |>
-  summarise(
-    last_store = max(timestamp)
-  ) |>
-  inner_join(durations, by='upload')
-```
-
-```{r}
-durations
-```
-
-```{r}
-uploads |> filter(grepl('Exception', message)) |> group_by(message) |> count() |> arrange(n)
-```
-
-```{r}
-uploads |> filter(upload == 18) |> group_by(message) |> count() |> arrange(n)
-```
-
-```{r}
-uploads |> filter(upload == 17) |> group_by(message) |> count() |> arrange(n)
-```
-
-```{r}
-messages <- uploads |> group_by(message) |> count() |> filter(n > 100) |> pull(message)
-```
-
-
-```{r fig.height = 10}
-uploads |> filter(message %in% messages) |> group_by(upload, message) |> count() %>% {
-  ggplot(.) +
-    geom_point(aes(x = message, y = n, color = as.factor(upload))) + theme_minimal() + theme(axis.text.x = element_text(angle = 45, hjust=1)) +
-    ylab('count') +
-    scale_color_manual(values=c('18'='red'))
-}
-```
-
-
-```{r}
-interlog_intervals <- uploads |>
-  group_by(upload) |>
-  arrange(timestamp) |>
-  mutate(log_interval = as.numeric(timestamp - lag(timestamp))) |>
-  ungroup()
-```
-
-
-```{r}
-interlog_intervals |>
-  group_by(upload) |>
-  summarise(
-    mean_li = mean(log_interval, na.rm=TRUE),
-    median_li = median(log_interval, na.rm=TRUE),
-    max_li = max(log_interval, na.rm=TRUE),
-  ) |>
-  pivot_longer(-upload) %>% {
-    ggplot(.) +
-      geom_line(aes(x = upload, y = value, col = name)) +
-      scale_y_log10() +
-      theme_minimal() +
-      ylab('duration (logscale, seconds)')
-  }
-```
-
-```{r}
-interlog_intervals |> group_by(upload) |> count() |> arrange(desc(n))
-```
-
-
-```{r fig.height=5}
-interlog_intervals |>
-  group_by(upload) |>
-  arrange(log_interval) |>
-  mutate(rank = seq_along(log_interval)) |> ungroup() %>% {
-    ggplot(.) +
-      geom_point(aes(x = rank, y = log_interval, col = as.factor(upload))) +
-      theme_minimal() +
-      xlab('rank') +
-      ylab('time between two consecutive log messages') +
-      guides(col = guide_legend(title = 'upload #'))
-  }
-```
-
-```{r}
-ggplot(
-  interlog_intervals |>
-    filter(upload == 18
-    ) |>
-    mutate(bucket = floor_date(timestamp, unit = '5 seconds')) |>
-    group_by(bucket) |>
-    mutate(
-      mean_interval = mean(log_interval),
-      p_95 = quantile(log_interval[-1], probs = c(0.95))
-    ) |>
-    ungroup()
-  ) +
-  geom_point(aes(x = timestamp, y = log_interval)) +
-  geom_line(aes(x = bucket, y = mean_interval), col = 'red', lwd = 2) +
-  geom_line(aes(x = bucket, y = p_95), col = 'orange', lwd = 2) +
-  theme_minimal()
-```
-
-
-# Whole-Cluster
-
-```{r}
-cluster_uploads <- read_csv('../data/20/pods/uploads/all_uploads.csv') |> filter(source != 'source')
-```
-
-```{r}
-cluster_upload_durations <- cluster_uploads |> group_by(source, upload) |> arrange(timestamp) |> summarise(duration = as.numeric(timestamp[n()] - timestamp[1]))
-```
-
-```{r fig.width=12}
-ggplot(cluster_upload_durations) +
-  geom_line(aes(x = upload, y = duration, col = source)) +
-  theme_minimal() +
-  facet_wrap(. ~ source) +
-  guides(color = FALSE)
-```
-
-```{r}
-cluster_interlog_intervals <- cluster_uploads |>
-  group_by(source, upload) |>
-  arrange(timestamp) |>
-  mutate(log_interval = as.numeric(timestamp - lag(timestamp))) |>
-  ungroup()
-```
-
-```{r fig.width=10}
-cluster_interlog_intervals |>
-  group_by(source, upload) |>
-  summarise(
-    mean_li = mean(log_interval, na.rm=TRUE),
-    median_li = median(log_interval, na.rm=TRUE),
-    max_li = max(log_interval, na.rm=TRUE),
-  ) |>
-  pivot_longer(-c(source, upload)) %>% {
-    ggplot(.) +
-      geom_line(aes(x = upload, y = value, col = name)) +
-      scale_y_log10() +
-      theme_minimal() +
-      ylab('interval between log messages (logscale, seconds)') +
-      facet_wrap(. ~ source)
-  }
-```
-
-
-```{r}
-ggplot(cluster_interlog_intervals) +
-  geom_line(aes(x = upload, y = log_interval, col = source)) +
-  theme_minimal() +
-  facet_wrap(. ~ source) +
-  guides(color = FALSE)
-```
-
diff --git a/analysis/analysis.Rproj b/analysis/analysis.Rproj
deleted file mode 100644
index 8e3c2eb..0000000
--- a/analysis/analysis.Rproj
+++ /dev/null
@@ -1,13 +0,0 @@
-Version: 1.0
-
-RestoreWorkspace: Default
-SaveWorkspace: Default
-AlwaysSaveHistory: Default
-
-EnableCodeIndexing: Yes
-UseSpacesForTab: Yes
-NumSpacesForTab: 2
-Encoding: UTF-8
-
-RnwWeave: Sweave
-LaTeX: pdfLaTeX
diff --git a/bin/csv-concat.sh b/bin/csv-concat.sh
deleted file mode 100755
index d4ae906..0000000
--- a/bin/csv-concat.sh
+++ /dev/null
@@ -1,8 +0,0 @@
-#!/usr/bin/env bash
-# Concatenates CSV files that have identical headers by removing the header from all but the first file. This is
-# meant to be used after a call to `cat`; e.g., cat csv1.csv csv2.csv | csv-concat.sh
-set -e
-
-header=$(head -n 1)
-echo "$header"
-grep "$header" -Fv
\ No newline at end of file
diff --git a/bin/pull-all-logs.sh b/bin/pull-all-logs.sh
deleted file mode 100755
index 33ebee2..0000000
--- a/bin/pull-all-logs.sh
+++ /dev/null
@@ -1,29 +0,0 @@
-#!/usr/bin/env bash
-# Given a namespace and a base folder containing the runner logs for continuous tests, creates
-# a storage area (folder) and:
-#
-#   1. pulls pod logs into storage_area/pods
-#   2. copies runner logs to storage_area/runner
-#
-# Make sure you delete the original runner logs once this is done, as otherwise they might get copied into more
-# than one storage area.
-set -e
-
-namespace=${1}
-runner_log_source=${2}
-
-if [ -z "$namespace" ] || [ -z "$runner_log_source" ]; then
-  echo "Usage: bin/pull-all-logs.sh <namespace> <runner-log-source>"
-  exit 1
-fi
-
-run_id=$(date +'%Y-%m-%d-%H%M%S')
-logs="data/logs/$run_id"
-pod_logs="$logs/pods"
-runner_logs="$logs/runner"
-
-mkdir -p "$pod_logs"
-bash ./bin/pull-pod-logs.sh "$namespace" "$pod_logs"
-
-mkdir -p "$runner_logs"
-cp "$runner_log_source"/* "$runner_logs/"
\ No newline at end of file
diff --git a/bin/pull-pod-logs.sh b/bin/pull-pod-logs.sh
deleted file mode 100755
index 64463b6..0000000
--- a/bin/pull-pod-logs.sh
+++ /dev/null
@@ -1,30 +0,0 @@
-#!/bin/bash
-
-namespace=${1:-"codex-continuous-tests"}
-output_folder=${2:-./}
-
-# List all pods in the namespace
-pods=$(kubectl get pods -n "$namespace" -o jsonpath='{.items[*].metadata.name}')
-
-if [ -z "$pods" ]; then
-  echo "No pods found in namespace $namespace."
-  exit 1
-fi
-
-for pod in $pods; do
-  echo "Fetching logs for $pod..."
-
-  # Handle pods with multiple containers
-  containers=$(kubectl get pod "$pod" -n "$namespace" -o jsonpath='{.spec.containers[*].name}')
-  for container in $containers; do
-    if [ "$container" == "$pod" ]; then
-      # If there's only one container, name the log file after the pod
-      kubectl logs "$pod" -n "$namespace" > "${output_folder}/${pod}.log"
-    else
-      # If there are multiple containers, name the log file after the pod and container
-      kubectl logs "$pod" -c "$container" -n "$namespace" > "${output_folder}/${pod}_${container}.log"
-    fi
-  done
-done
-
-echo "Done fetching logs."
diff --git a/bin/snippets/README.md b/bin/snippets/README.md
deleted file mode 100644
index 0aa9d74..0000000
--- a/bin/snippets/README.md
+++ /dev/null
@@ -1,4 +0,0 @@
-# Snippets
-
-Ad hoc snippets which reshape data for one-off analysis, not worth the trouble of making into scripts.
-
diff --git a/bin/snippets/upload-bug.sh b/bin/snippets/upload-bug.sh
deleted file mode 100644
index ffdfd72..0000000
--- a/bin/snippets/upload-bug.sh
+++ /dev/null
@@ -1,26 +0,0 @@
-set -e
-
-base_folder=${1:-"./data/20"}
-mkdir -p "${base_folder}/pods/uploads"
-
-# tags uploads with id
-for i in "${base_folder}"/pods/codex-continuous-tests-0codex*; do
-  python -m adhoc.identify_uploads < "$i" > "${i%/*}/uploads/${i##*/}"
-done
-
-# transforms raw logs into single CSV
-for i in "${base_folder}"/pods/uploads/codex-continuous-tests-0codex*; do
-  python -m logtools.cli.to_csv < "$i" \
-    --extract-fields upload \
-    --constant-column \
-    source="$(basename "${i%.*}")" >> "${base_folder}"/pods/uploads/all_uploads.csv.temp
-done
-
-./bin/csv-concat.sh < "${base_folder}"/pods/uploads/all_uploads.csv.temp > "${base_folder}"/pods/uploads/all_uploads.csv
-rm "${base_folder}"/pods/uploads/all_uploads.csv.temp
-
-# extracts debug endpoint data and looks into wantlist sizes
-grep -h 'Before upload\|After download' "${base_folder}"/runner/*.log | \
-  sed -nE 's/\[(.{28})\] <([A-Z]+[0-9]+)> (Before upload|After download): (.*)$/\4/p' > "${base_folder}"/runner/merged.jsonl
-
-jq '.pendingBlocks' < "${base_folder}"/runner/merged.jsonl | uniq # should print 0
\ No newline at end of file
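
Note for posterity, since this removes the only copy of the upload-tagging logic: the sketch below captures what adhoc/identify_uploads.py did, reworked as a self-contained generator rather than a stdin filter. The sample log lines are made up for illustration (the real input was Codex pod logs, whose exact format isn't preserved here); the tagging behavior mirrors the deleted script, including dropping lines that fall outside an upload window.

# Sketch of the removed upload-tagging pass (adhoc/identify_uploads.py).
# Assumes log lines end in a `key=value` token, as the real logs appeared
# to (the original spliced `upload=N` in before the last token).
def tag_uploads(lines):
    uploading = False
    upload_no = 0
    for line in lines:
        if 'Handling file upload' in line:
            upload_no += 1
            uploading = True
        if uploading:
            # Splice `upload=N` in before the last whitespace-separated token.
            head, _, tail = line.strip().rpartition(' ')
            yield f'{head} upload={upload_no} {tail}'
        if 'Uploaded file' in line:
            uploading = False

# Hypothetical input: two log lines bracketing a single upload.
sample = [
    'TRC 2023-11-14 Handling file upload count=1',
    'TRC 2023-11-14 Uploaded file count=2',
]
for tagged in tag_uploads(sample):
    print(tagged)
# TRC 2023-11-14 Handling file upload upload=1 count=1
# TRC 2023-11-14 Uploaded file upload=1 count=2

This is the pass that bin/snippets/upload-bug.sh ran over each pod log before handing the result to logtools.cli.to_csv.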