extend analysis with time to first byte and transfer time

2026-07-25 16:03:22 +00:00 · 2026-04-28 14:05:14 -03:00 · 2026-04-28 14:05:14 -03:00 · 74e6486d56
commit 74e6486d56
parent 8a834ecfee
3 changed files with 100 additions and 70 deletions
--- a/analysis/final/.Rbuildignore
+++ b/analysis/final/.Rbuildignore
@ -2,3 +2,5 @@
 ^renv\.lock$
 ^.*\.Rproj$
 ^\.Rproj\.user$
+^\.positai$
+^\.claude$
--- a/analysis/final/R/analysis.R
+++ b/analysis/final/R/analysis.R
@ -94,13 +94,21 @@ compute_download_times <- function(meta, request_event, download_metric, group_i
    ) |>
    group_by(node, run, seed_set) |>
    mutate(
-      time_to_first_byte = min(timestamp),
-      lookup_time = as.numeric(time_to_first_byte - seed_request_time)
+      # The time elapsed between the instant we start the download and the time
+      # we see the first log entry is approximated as "time to first byte".
+      # In reality, this factors lookup time, swarm bootstrap time, and
+      # part of the download itself.
+      first_byte_t = min(timestamp),
+      first_byte = as.numeric(first_byte_t - seed_request_time),
+      # "Transfer" time is the total download time minus the time to first
+      # byte. Again, this is approximated, and likely reflects a shorter
+      # download time than the real download time.
+      transfer = as.numeric(max(timestamp) - first_byte_t)
    ) |>
    ungroup()

  if (nrow(download_times |>
-           filter(elapsed_download_time < 0 | lookup_time < 0)) > 0) {
+           filter(elapsed_download_time < 0 | first_byte < 0 | transfer < 0)) > 0) {
    stop('Calculation for download times contains negative numbers')
  }

@ -137,36 +145,49 @@ download_times <- function(experiment, piece_count_distinct, discard_incomplete


 completion_time_stats <- function(download_times, meta) {
-  completion_times <- download_times |>
+  filtered <- download_times |>
    filter(!is.na(elapsed_download_time),
-           is_completed(completed)) |>
-    pull(elapsed_download_time)
+           is_completed(completed))

  n_experiments <- meta$repetitions * meta$seeder_sets
  n_leechers <- meta$nodes$network_size - meta$seeders
  n_points <- n_experiments * n_leechers

  tibble(
-    n = length(completion_times),
+    n = nrow(filtered),
    expected_n = n_points,
    missing = expected_n - n,
-    min = min(completion_times),
-    p05 = quantile(completion_times, p = 0.05),
-    p10 = quantile(completion_times, p = 0.10),
-    p20 = quantile(completion_times, p = 0.20),
-    p25 = quantile(completion_times, p = 0.25),
-    median = median(completion_times),
-    p75 = quantile(completion_times, p = 0.75),
-    p80 = quantile(completion_times, p = 0.80),
-    p90 = quantile(completion_times, p = 0.90),
-    p95 = quantile(completion_times, p = 0.95),
-    max = max(completion_times),
+    completion = distributional_stats(filtered$elapsed_download_time),
+    first_byte = distributional_stats(filtered$first_byte),
+    transfer = distributional_stats(filtered$transfer)
+  ) |>
+    unnest(
+      cols = c(completion, first_byte, transfer),
+      names_sep = '_'
+    )
+}
+
+# Summarises the distribution of `x`: extremes, quantiles, IQR and a median CI.
+distributional_stats <- function(x) {
+  n <- length(x)
+  tibble(
+    min = min(x),
+    p05 = quantile(x, probs = 0.05),
+    p10 = quantile(x, probs = 0.10),
+    p20 = quantile(x, probs = 0.20),
+    p25 = quantile(x, probs = 0.25),
+    median = median(x),
+    p75 = quantile(x, probs = 0.75),
+    p80 = quantile(x, probs = 0.80),
+    p90 = quantile(x, probs = 0.90),
+    p95 = quantile(x, probs = 0.95),
+    max = max(x),
    iqr = p75 - p25,
    # This gives us roughly a 95% ci for comparing medians.
    ci = (1.58 * iqr) / sqrt(n),
    w_top = median + ci,
    w_bottom = median - ci
  )
 }

 check_seeder_count <- function(download_times, seeders) {
@ -212,13 +233,12 @@ compute_speedups <- function(benchmarks, baseline, compare) {
  baseline_data <- benchmarks |>
    filter(label == baseline) |>
    select(
-      experiment_type, label, network_size, seeders, leechers, file_size, median
+      experiment_type, label, network_size, seeders, leechers, file_size, completion_median
    ) |>
-    rename(baseline_median = median)
+    rename(baseline_median = completion_median)


  lapply(compare, function(compare_label) {
-    browser()
    benchmarks |>
      filter(label == compare_label) |>
      inner_join(
@ -226,7 +246,7 @@ compute_speedups <- function(benchmarks, baseline, compare) {
        by = c('network_size', 'seeders', 'leechers', 'file_size')
      ) |>
      mutate(
-        relative_median = median / baseline_median
+        relative_median = completion_median / baseline_median
      ) |>
      mutate(label = label.x) |>
      select(-baseline_median, -label.y, -label.x)
--- a/analysis/final/static-dissemination.Rmd
+++ b/analysis/final/static-dissemination.Rmd
@ -31,9 +31,10 @@ devtools::load_all()
 ```

 ```{r message = FALSE, include = !knitr::is_html_output()}
-experiments <- read_all_experiments('./data/devnet/optimized/g1761924045/', label = 'deluge') |>
-  merge_experiments(read_all_experiments('./data/devnet/optimized/g1762505060/', label ='codex-baseline')) |>
-  merge_experiments(read_all_experiments('./data/devnet/optimized/g1761729711/', label = 'codex-optimized'))
+experiments <- read_all_experiments('./data/do/g1761924045/', label = 'deluge') |>
+  merge_experiments(read_all_experiments('./data/do/g1762505060/', label = 'codex-baseline')) |>
+  merge_experiments(read_all_experiments('./data/do/g1761729711/', label = 'codex-optimized')) |>
+  merge_experiments(read_all_experiments('./data/do/g1775565300/', label = 'new-protocol'))
 ```

 ```{r include = !knitr::is_html_output()}
@ -45,7 +46,7 @@ COUNT_DISTINCT = list(

 ```{r message = FALSE, include = !knitr::is_html_output()}
 benchmarks <- lapply(experiments, function(experiment) {
-  print(glue::glue('Process {experiment$experiment_id}'))
+  print(glue::glue('Process {experiment$experiment_id} - {experiment$label}'))
  download_time_stats <- tryCatch({
    meta <- experiment$meta
    completion <- experiment |>
@ -77,9 +78,12 @@ benchmarks <- lapply(experiments, function(experiment) {
                        levels = rlang::parse_bytes(as.character(
                          unique(file_size[order(file_size, decreasing = TRUE)])))),
    seeder_ratio = seeders / network_size,
-    median_speed = file_size_bytes / median,
-    p25_speed = file_size_bytes / p25,
-    p75_speed = file_size_bytes / p75
+    completion_median_speed = file_size_bytes / completion_median,
+    completion_p25_speed = file_size_bytes / completion_p25,
+    completion_p75_speed = file_size_bytes / completion_p75,
+    transfer_median_speed = file_size_bytes / transfer_median,
+    transfer_p25_speed = file_size_bytes / transfer_p25,
+    transfer_p75_speed = file_size_bytes / transfer_p75
  ) |>
  relocate(file_size, network_size, seeders, leechers, file_size_bytes)
 ```
@ -114,58 +118,62 @@ DT::datatable(
 relative_performance <- compute_speedups(
  benchmarks = benchmarks,
  base = 'deluge',
-  compare = c('codex-baseline', 'codex-optimized')
+  compare = c('codex-baseline', 'codex-optimized', 'new-protocol')
 )
 ```

 ## Median Download Speed

 ```{r fig.cap='Median download speed for Deluge and Codex', fig.width = 11, message = FALSE, echo = FALSE}
-ggplot(benchmarks, aes(col = label, fill = label, group = label)) +
-  geom_ribbon(aes(ymin = p25_speed, ymax = p75_speed, x = network_size, fill = label, alpha = 0.5), col = 'lightgray') +
-  geom_point(aes(x = network_size, y = p25_speed), col = 'darkgray', size=10.0, shape='-') +
-  geom_point(aes(x = network_size, y = p75_speed), col = 'darkgray', size=10.0, shape='-') +
-  geom_line(aes(x = network_size, y = median_speed)) +
-  geom_point(aes(x = network_size, y = median_speed)) +
-  ylab('median download speed (bytes/second)') +
-  xlab('network size') +
-  theme_minimal(base_size=15) +
-  scale_y_continuous(labels = function(x) paste0(scales::label_bytes()(x), '/s')) +
-  facet_grid(
-    file_size ~ seeder_ratio,
-    labeller = labeller(
-      seeder_ratio = as_labeller(function(x) {
-        paste0("seeder ratio: ", scales::percent(as.numeric(x)))
-      }))
-  ) +
-  scale_color_discrete(name = '') +
-  guides(fill = 'none', alpha = 'none')
+comparison_plot(
+  benchmarks,
+  completion_p25_speed,
+  completion_p75_speed,
+  completion_median_speed,
+  ylab = 'median download speed (bytes/second)',
+  free_y = TRUE
+) + Y_BPS
+```
+
+## Median Transfer Speed
+
+```{r fig.width = 11, message = FALSE, echo = FALSE}
+comparison_plot(
+  benchmarks,
+  transfer_p25_speed,
+  transfer_p75_speed,
+  transfer_median_speed,
+  ylab = 'median transfer speed (bytes/second)',
+  free_y = TRUE
+) + Y_BPS
 ```

 ## Median Download Time

-
 ```{r fig.cap='Median time to download a whole file for Deluge and Codex', fig.width = 11, message = FALSE, echo = FALSE}
-ggplot(benchmarks, aes(col = label, fill = label, group = label)) +
-  geom_ribbon(aes(ymin = p25, ymax = p75, x = network_size, fill = experiment_type, alpha = 0.5), col = 'lightgray') +
-  geom_point(aes(x = network_size, y = p25), col = 'darkgray', size=10.0, shape='-') +
-  geom_point(aes(x = network_size, y = p75), col = 'darkgray', size=10.0, shape='-') +
-  geom_line(aes(x = network_size, y = median)) +
-  geom_point(aes(x = network_size, y = median)) +
-  ylab('median download time') +
-  xlab('network size') +
-  theme_minimal(base_size = 15) +
-  scale_y_continuous(labels = scales::label_timespan()) +
-  facet_grid(
-    scales = 'free_y',
-    file_size ~ seeder_ratio,
-    labeller = labeller(
-      seeder_ratio = as_labeller(function(x) {
-        paste0("seeder ratio: ", scales::percent(as.numeric(x)))
-      }))
-  ) +
-  scale_color_discrete(name = '') +
-  guides(fill = 'none', alpha = 'none')
+comparison_plot(
+  benchmarks,
+  completion_p25,
+  completion_p75,
+  completion_median,
+  ylab = 'median download time',
+  free_y = TRUE
+) + Y_TIMESPAN
+```
+
+## Median Time to First Byte
+
+The time elapsed from the moment at which we ask a node to download a file to the moment at which it logs having downloaded the first $x\%$ of the file -- whatever the logging granularity is -- is our time to first byte. This is an approximation which factors in _i)_ DHT lookup latency; _ii)_ swarm bootstrap latency; and _iii)_ a fraction, typically $1/100$, of the download time. This should impact smaller files more than it impacts larger files.
+
+```{r fig.cap='Median time-to-first-byte for Deluge and Codex', fig.width = 11, message = FALSE, echo = FALSE}
+comparison_plot(
+  benchmarks,
+  first_byte_p25,
+  first_byte_p75,
+  first_byte_median,
+  ylab = 'median time to first byte',  # axis shows the first_byte_* columns
+  free_y = TRUE
+) + Y_TIMESPAN
 ```

 ## Median Download Time Ratio