diff --git a/analysis/final/.Rbuildignore b/analysis/final/.Rbuildignore index d821302..46fffb2 100644 --- a/analysis/final/.Rbuildignore +++ b/analysis/final/.Rbuildignore @@ -2,3 +2,5 @@ ^renv\.lock$ ^.*\.Rproj$ ^\.Rproj\.user$ +^\.positai$ +^\.claude$ diff --git a/analysis/final/R/analysis.R b/analysis/final/R/analysis.R index 34a6f87..4e4cba2 100644 --- a/analysis/final/R/analysis.R +++ b/analysis/final/R/analysis.R @@ -94,13 +94,21 @@ compute_download_times <- function(meta, request_event, download_metric, group_i ) |> group_by(node, run, seed_set) |> mutate( - time_to_first_byte = min(timestamp), - lookup_time = as.numeric(time_to_first_byte - seed_request_time) + # The time elapsed between the instant we start the download and the time + # we see the first log entry is approximated as "time to first byte". + # In reality, this factors lookup time, swarm bootstrap time, and + # part of the download itself. + first_byte_t = min(timestamp), + first_byte = as.numeric(first_byte_t - seed_request_time), + # "Transfer" time is the total download time minus the lookup time. Again, + # this is approximated, and likely reflects a shorter download time than + # the real download time. 
+ transfer = as.numeric(max(timestamp) - first_byte_t) ) |> ungroup() if (nrow(download_times |> - filter(elapsed_download_time < 0 | lookup_time < 0)) > 0) { + filter(elapsed_download_time < 0 | first_byte < 0 | transfer < 0)) > 0) { stop('Calculation for download times contains negative numbers') } @@ -137,36 +145,49 @@ download_times <- function(experiment, piece_count_distinct, discard_incomplete completion_time_stats <- function(download_times, meta) { - completion_times <- download_times |> + filtered <- download_times |> filter(!is.na(elapsed_download_time), - is_completed(completed)) |> - pull(elapsed_download_time) + is_completed(completed)) n_experiments <- meta$repetitions * meta$seeder_sets n_leechers <- meta$nodes$network_size - meta$seeders n_points <- n_experiments * n_leechers tibble( - n = length(completion_times), + n = nrow(filtered), expected_n = n_points, missing = expected_n - n, - min = min(completion_times), - p05 = quantile(completion_times, p = 0.05), - p10 = quantile(completion_times, p = 0.10), - p20 = quantile(completion_times, p = 0.20), - p25 = quantile(completion_times, p = 0.25), - median = median(completion_times), - p75 = quantile(completion_times, p = 0.75), - p80 = quantile(completion_times, p = 0.80), - p90 = quantile(completion_times, p = 0.90), - p95 = quantile(completion_times, p = 0.95), - max = max(completion_times), + completion = distributional_stats(filtered$elapsed_download_time), + first_byte = distributional_stats(filtered$first_byte), + transfer = distributional_stats(filtered$transfer) + ) |> + unnest( + cols = c(completion, first_byte, transfer), + names_sep = '_' + ) +} + +distributional_stats <- function(x) { + n <- length(x) + tibble( + min = min(x), + p05 = quantile(x, probs = 0.05), + p10 = quantile(x, probs = 0.10), + p20 = quantile(x, probs = 0.20), + p25 = quantile(x, probs = 0.25), + median = median(x), + p75 = quantile(x, probs = 0.75), + p80 = quantile(x, probs = 0.80), + p90 = quantile(x, probs = 0.90), + p95 = quantile(x, probs = 
0.95), + max = max(x), iqr = p75 - p25, # This gives us roughly a 95% ci for comparing medians. ci = (1.58 * iqr) / sqrt(n), w_top = median + ci, w_bottom = median - ci ) + } check_seeder_count <- function(download_times, seeders) { @@ -212,13 +233,12 @@ compute_speedups <- function(benchmarks, baseline, compare) { baseline_data <- benchmarks |> filter(label == baseline) |> select( - experiment_type, label, network_size, seeders, leechers, file_size, median + experiment_type, label, network_size, seeders, leechers, file_size, completion_median ) |> - rename(baseline_median = median) + rename(baseline_median = completion_median) lapply(compare, function(compare_label) { - browser() benchmarks |> filter(label == compare_label) |> inner_join( @@ -226,7 +246,7 @@ compute_speedups <- function(benchmarks, baseline, compare) { by = c('network_size', 'seeders', 'leechers', 'file_size') ) |> mutate( - relative_median = median / baseline_median + relative_median = completion_median / baseline_median ) |> mutate(label = label.x) |> select(-baseline_median, -label.y, -label.x) diff --git a/analysis/final/static-dissemination.Rmd b/analysis/final/static-dissemination.Rmd index abe6b12..07a2b98 100644 --- a/analysis/final/static-dissemination.Rmd +++ b/analysis/final/static-dissemination.Rmd @@ -31,9 +31,10 @@ devtools::load_all() ``` ```{r message = FALSE, include = !knitr::is_html_output()} -experiments <- read_all_experiments('./data/devnet/optimized/g1761924045/', label = 'deluge') |> - merge_experiments(read_all_experiments('./data/devnet/optimized/g1762505060/', label ='codex-baseline')) |> - merge_experiments(read_all_experiments('./data/devnet/optimized/g1761729711/', label = 'codex-optimized')) +experiments <- read_all_experiments('./data/do/g1761924045/', label = 'deluge') |> + merge_experiments(read_all_experiments('./data/do/g1762505060/', label = 'codex-baseline')) |> + merge_experiments(read_all_experiments('./data/do/g1761729711/', label = 'codex-optimized')) |> + 
merge_experiments(read_all_experiments('./data/do/g1775565300/', label = 'new-protocol')) ``` ```{r include = !knitr::is_html_output()} @@ -45,7 +46,7 @@ COUNT_DISTINCT = list( ```{r message = FALSE, include = !knitr::is_html_output()} benchmarks <- lapply(experiments, function(experiment) { - print(glue::glue('Process {experiment$experiment_id}')) + print(glue::glue('Process {experiment$experiment_id} - {experiment$label}')) download_time_stats <- tryCatch({ meta <- experiment$meta completion <- experiment |> @@ -77,9 +78,12 @@ benchmarks <- lapply(experiments, function(experiment) { levels = rlang::parse_bytes(as.character( unique(file_size[order(file_size, decreasing = TRUE)])))), seeder_ratio = seeders / network_size, - median_speed = file_size_bytes / median, - p25_speed = file_size_bytes / p25, - p75_speed = file_size_bytes / p75 + completion_median_speed = file_size_bytes / completion_median, + completion_p25_speed = file_size_bytes / completion_p25, + completion_p75_speed = file_size_bytes / completion_p75, + transfer_median_speed = file_size_bytes / transfer_median, + transfer_p25_speed = file_size_bytes / transfer_p25, + transfer_p75_speed = file_size_bytes / transfer_p75 ) |> relocate(file_size, network_size, seeders, leechers, file_size_bytes) ``` @@ -114,58 +118,62 @@ DT::datatable( relative_performance <- compute_speedups( benchmarks = benchmarks, base = 'deluge', - compare = c('codex-baseline', 'codex-optimized') + compare = c('codex-baseline', 'codex-optimized', 'new-protocol') ) ``` ## Median Download Speed ```{r fig.cap='Median download speed for Deluge and Codex', fig.width = 11, message = FALSE, echo = FALSE} -ggplot(benchmarks, aes(col = label, fill = label, group = label)) + - geom_ribbon(aes(ymin = p25_speed, ymax = p75_speed, x = network_size, fill = label, alpha = 0.5), col = 'lightgray') + - geom_point(aes(x = network_size, y = p25_speed), col = 'darkgray', size=10.0, shape='-') + - geom_point(aes(x = network_size, y = p75_speed), col = 
'darkgray', size=10.0, shape='-') + - geom_line(aes(x = network_size, y = median_speed)) + - geom_point(aes(x = network_size, y = median_speed)) + - ylab('median download speed (bytes/second)') + - xlab('network size') + - theme_minimal(base_size=15) + - scale_y_continuous(labels = function(x) paste0(scales::label_bytes()(x), '/s')) + - facet_grid( - file_size ~ seeder_ratio, - labeller = labeller( - seeder_ratio = as_labeller(function(x) { - paste0("seeder ratio: ", scales::percent(as.numeric(x))) - })) - ) + - scale_color_discrete(name = '') + - guides(fill = 'none', alpha = 'none') +comparison_plot( + benchmarks, + completion_p25_speed, + completion_p75_speed, + completion_median_speed, + ylab = 'median download speed (bytes/second)', + free_y = TRUE +) + Y_BPS +``` + +## Median Transfer Speed + +```{r fig.width = 11, message = FALSE, echo = FALSE} +comparison_plot( + benchmarks, + transfer_p25_speed, + transfer_p75_speed, + transfer_median_speed, + ylab = 'median transfer speed (bytes/second)', + free_y = TRUE +) + Y_BPS ``` ## Median Download Time - ```{r fig.cap='Median time to download a whole file for Deluge and Codex', fig.width = 11, message = FALSE, echo = FALSE} -ggplot(benchmarks, aes(col = label, fill = label, group = label)) + - geom_ribbon(aes(ymin = p25, ymax = p75, x = network_size, fill = experiment_type, alpha = 0.5), col = 'lightgray') + - geom_point(aes(x = network_size, y = p25), col = 'darkgray', size=10.0, shape='-') + - geom_point(aes(x = network_size, y = p75), col = 'darkgray', size=10.0, shape='-') + - geom_line(aes(x = network_size, y = median)) + - geom_point(aes(x = network_size, y = median)) + - ylab('median download time') + - xlab('network size') + - theme_minimal(base_size = 15) + - scale_y_continuous(labels = scales::label_timespan()) + - facet_grid( - scales = 'free_y', - file_size ~ seeder_ratio, - labeller = labeller( - seeder_ratio = as_labeller(function(x) { - paste0("seeder ratio: ", scales::percent(as.numeric(x))) - })) - 
) + - scale_color_discrete(name = '') + - guides(fill = 'none', alpha = 'none') +comparison_plot( + benchmarks, + completion_p25, + completion_p75, + completion_median, + ylab = 'median download time', + free_y = TRUE +) + Y_TIMESPAN +``` + +## Median Time to First Byte + +The time elapsed from the moment in which we ask a node to download a file to the time in which it logs having downloaded the first $x\%$ of the file -- whatever the logging granularity is -- marks our time to first byte. This is actually an approximation which factors in _i)_ DHT lookup latency; _ii)_ swarm bootstrap latency; _iii)_ a fraction, typically $1/100^{th}$, of the download time. This should impact smaller files more than it impacts larger files. + +```{r fig.cap='Median time-to-first-byte for Deluge and Codex', fig.width = 11, message = FALSE, echo = FALSE} +comparison_plot( + benchmarks, + first_byte_p25, + first_byte_p75, + first_byte_median, + ylab = 'median time to first byte', + free_y = TRUE +) + Y_TIMESPAN ``` ## Median Download Time Ratio