---
title: "R Notebook"
output: html_notebook
---

```{r}
library(tidyverse)
library(lubridate)
```

# Node Crashing on Upload

```{r}
uploads <- read_csv('./codex-continuous-tests-0codex3-5-77bdb95dc7-j7f46_codex3-5-uploads.csv')
```

```{r}
# Per-upload duration: timestamp of the last log row minus the first one,
# with rows ordered by `count`.
# NOTE(review): presumably `count` is a monotonically increasing log-line
# counter - confirm against the CSV.
durations <- uploads |>
  arrange(count) |>
  group_by(upload) |>
  summarise(
    start = timestamp[1],
    end = timestamp[n()]
  ) |>
  mutate(duration = end - start)
```

How long are uploads taking?

```{r}
ggplot(durations, aes(x = upload, y = duration)) +
  geom_point() +
  geom_line() +
  ylab('upload duration') +
  xlab('upload number') +
  theme_minimal()
```

Are all uploads completing?

```{r}
# Number of 'Got data from stream' log rows per upload.
uploads |>
  filter(message == 'Got data from stream') |>
  group_by(upload) |>
  count(name = 'blocks')
```

Does the end of the upload coincide with the last chunk that gets stored?

```{r}
# Latest 'Got data from stream' timestamp per upload, next to the upload's
# start/end/duration for side-by-side comparison.
uploads |>
  filter(grepl('Got data from stream', message)) |>
  group_by(upload) |>
  summarise(
    last_store = max(timestamp)
  ) |>
  inner_join(durations, by = 'upload')
```

```{r}
durations
```

```{r}
# Frequency of each distinct message containing 'Exception'.
uploads |>
  filter(grepl('Exception', message)) |>
  group_by(message) |>
  count() |>
  arrange(n)
```

```{r}
# Message frequencies for upload 18.
uploads |>
  filter(upload == 18) |>
  group_by(message) |>
  count() |>
  arrange(n)
```

```{r}
# Message frequencies for upload 17.
uploads |>
  filter(upload == 17) |>
  group_by(message) |>
  count() |>
  arrange(n)
```

```{r}
# Messages that occur more than 100 times across all uploads.
messages <- uploads |>
  group_by(message) |>
  count() |>
  filter(n > 100) |>
  pull(message)
```

```{r fig.height = 10}
# Per-upload counts of the frequent messages. Only upload 18 is given an
# explicit colour in the manual scale, so it stands out from the others.
uploads |>
  filter(message %in% messages) |>
  group_by(upload, message) |>
  count() %>% {
    ggplot(.) +
      geom_point(aes(x = message, y = n, color = as.factor(upload))) +
      theme_minimal() +
      theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
      ylab('count') +
      scale_color_manual(values = c('18' = 'red'))
  }
```

```{r}
# Time between consecutive log rows within each upload.
# Units are pinned to seconds: a bare `timestamp - lag(timestamp)` lets
# difftime pick its unit automatically per group, and as.numeric() would then
# drop that unit, making groups incomparable.
interlog_intervals <- uploads |>
  group_by(upload) |>
  arrange(timestamp) |>
  mutate(
    log_interval = as.numeric(
      difftime(timestamp, lag(timestamp), units = 'secs')
    )
  ) |>
  ungroup()
```

```{r}
# Mean / median / max inter-log interval per upload, on a log scale.
interlog_intervals |>
  group_by(upload) |>
  summarise(
    mean_li = mean(log_interval, na.rm = TRUE),
    median_li = median(log_interval, na.rm = TRUE),
    max_li = max(log_interval, na.rm = TRUE)
  ) |>
  pivot_longer(-upload) %>% {
    ggplot(.) +
      geom_line(aes(x = upload, y = value, col = name)) +
      scale_y_log10() +
      theme_minimal() +
      ylab('duration (logscale, seconds)')
  }
```

```{r}
# Number of log rows per upload, largest first.
interlog_intervals |>
  group_by(upload) |>
  count() |>
  arrange(desc(n))
```

```{r fig.height=5}
# Rank plot of inter-log intervals. arrange() sorts the whole table by
# interval, so the within-upload running index is the interval's rank
# inside that upload.
interlog_intervals |>
  group_by(upload) |>
  arrange(log_interval) |>
  mutate(rank = seq_along(log_interval)) |>
  ungroup() %>% {
    ggplot(.) +
      geom_point(aes(x = rank, y = log_interval, col = as.factor(upload))) +
      theme_minimal() +
      xlab('rank') +
      ylab('time between two consecutive log messages') +
      guides(col = guide_legend(title = 'upload #'))
  }
```

```{r}
# Upload 18: raw intervals plus the mean (red) and 95th percentile (orange)
# of the intervals in each 5-second bucket.
# na.rm = TRUE skips the NA produced by lag() on the first row; this replaces
# the previous `log_interval[-1]`, which dropped the first row of every
# bucket, not only the NA one.
ggplot(
  interlog_intervals |>
    filter(upload == 18) |>
    mutate(bucket = floor_date(timestamp, unit = '5 seconds')) |>
    group_by(bucket) |>
    mutate(
      mean_interval = mean(log_interval, na.rm = TRUE),
      p_95 = quantile(log_interval, probs = 0.95, na.rm = TRUE)
    ) |>
    ungroup()
) +
  geom_point(aes(x = timestamp, y = log_interval)) +
  geom_line(aes(x = bucket, y = mean_interval), col = 'red', lwd = 2) +
  geom_line(aes(x = bucket, y = p_95), col = 'orange', lwd = 2) +
  theme_minimal()
```

# Whole-Cluster

```{r}
# Drops rows whose `source` is the literal string 'source'.
# NOTE(review): presumably these are header lines repeated inside a
# concatenated CSV - confirm.
cluster_uploads <- read_csv('../data/20/pods/uploads/all_uploads.csv') |>
  filter(source != 'source')
```

```{r}
# Duration of every upload on every source, in seconds (explicit units so
# that short and long uploads are on the same scale).
cluster_upload_durations <- cluster_uploads |>
  group_by(source, upload) |>
  arrange(timestamp) |>
  summarise(
    duration = as.numeric(
      difftime(timestamp[n()], timestamp[1], units = 'secs')
    ),
    .groups = 'drop'
  )
```

```{r fig.width=12}
# One panel per source; the colour legend is redundant with the facets.
ggplot(cluster_upload_durations) +
  geom_line(aes(x = upload, y = duration, col = source)) +
  theme_minimal() +
  facet_wrap(. ~ source) +
  guides(color = 'none')
```

```{r}
# Time between consecutive log rows within each (source, upload), in seconds.
cluster_interlog_intervals <- cluster_uploads |>
  group_by(source, upload) |>
  arrange(timestamp) |>
  mutate(
    log_interval = as.numeric(
      difftime(timestamp, lag(timestamp), units = 'secs')
    )
  ) |>
  ungroup()
```

```{r fig.width=10}
# Mean / median / max inter-log interval per upload, one panel per source.
cluster_interlog_intervals |>
  group_by(source, upload) |>
  summarise(
    mean_li = mean(log_interval, na.rm = TRUE),
    median_li = median(log_interval, na.rm = TRUE),
    max_li = max(log_interval, na.rm = TRUE),
    .groups = 'drop'
  ) |>
  pivot_longer(-c(source, upload)) %>% {
    ggplot(.) +
      geom_line(aes(x = upload, y = value, col = name)) +
      scale_y_log10() +
      theme_minimal() +
      ylab('interval between log messages (logscale, seconds)') +
      facet_wrap(. ~ source)
  }
```

```{r}
# NOTE(review): this chunk previously mapped y to `duration`, a column this
# notebook never creates on `cluster_interlog_intervals`; `log_interval` is
# the column computed for this data frame. Confirm this is the intended plot.
ggplot(cluster_interlog_intervals) +
  geom_line(aes(x = upload, y = log_interval, col = source)) +
  theme_minimal() +
  facet_wrap(. ~ source) +
  guides(color = 'none')
```