feat: generalize final analysis to Codex experiments

gmega 2025-02-18 17:44:25 -03:00
parent d1077d0972
commit ad6e94db7d
3 changed files with 109 additions and 86 deletions

View File

@@ -1,13 +1,17 @@
 PIECE_SIZE <- 262144
 is_completed <- function(completion) 1.0 - completion > -1e-7
 piece_count <- function(experiment_meta) {
   experiment_meta$file_size / PIECE_SIZE
 }
-extract_repetitions <- function(deluge_torrent_download) {
-  deluge_torrent_download |>
+#' Extracts repetition id and seed set id from the dataset name,
+#' which should be in the format `dataset-<seed_set>-<repetition>`.
+#'
+#' @param download_metric
+#' @param meta
+#'
+#' @returns
+#' @export
+extract_repetitions <- function(download_metric) {
+  download_metric |>
     mutate(
-      temp = str_remove(torrent_name, '^dataset-'),
+      temp = str_remove(dataset_name, '^dataset-'),
       seed_set = as.numeric(str_extract(temp, '^\\d+')),
       run = as.numeric(str_extract(temp, '\\d+$'))
     ) |>
@@ -15,35 +19,42 @@ extract_repetitions <- function(deluge_torrent_download) {
     select(-temp, -name)
 }
-compute_pieces <- function(deluge_torrent_download, n_pieces) {
-  deluge_torrent_download |>
+#' Computes the progress, in percentage, of the download. The underlying
+#' assumption is that downloads are logged as discrete chunks of the same size,
+#' and that the `value` column contains something that identifies this chunk.
+#'
+#' This makes it compatible with BitTorrent, which logs piece ids, whereas with
+#' other systems we can simply use a byte count, provided the logger is smart
+#' enough to log at equally-sized, discrete intervals.
+#'
+compute_progress <- function(download_metric, meta, count_distinct) {
+  download_metric |>
     group_by(node, seed_set, run) |>
     arrange(timestamp) |>
     mutate(
-      piece_count = seq_along(timestamp)
+      piece_count = if (count_distinct) seq_along(timestamp) else piece
     ) |>
     ungroup() |>
-    mutate(completed = piece_count / n_pieces)
+    mutate(completed = (piece_count * meta$download_metric_unit_bytes) / meta$file_size)
 }
-process_incomplete_downloads <- function(deluge_torrent_download, n_pieces, discard_incomplete) {
-  incomplete_downloads <- deluge_torrent_download |>
+process_incomplete_downloads <- function(download_metric, discard_incomplete) {
+  incomplete_downloads <- download_metric |>
     group_by(node, seed_set, run) |>
-    count() |>
-    ungroup() |>
-    filter(n != n_pieces)
+    summarise(completed = max(completed)) |>
+    filter(!is_completed(completed))
   if(nrow(incomplete_downloads) > 0) {
     (if (!discard_incomplete) stop else warning)(
       'Experiment contained incomplete downloads.')
   }
-  deluge_torrent_download |> anti_join(
+  download_metric |> anti_join(
     incomplete_downloads, by = c('node', 'seed_set', 'run'))
 }
-process_incomplete_repetitions <- function(deluge_torrent_download, repetitions, allow_missing) {
-  mismatching_repetitions <- deluge_torrent_download |>
+process_incomplete_repetitions <- function(download_metric, repetitions, allow_missing) {
+  mismatching_repetitions <- download_metric |>
     select(seed_set, node, run) |>
     distinct() |>
     group_by(seed_set, node) |>
@@ -55,10 +66,10 @@ process_incomplete_repetitions <- function(deluge_torrent_download, repetitions,
       'Experiment data did not have all repetitions.')
   }
-  deluge_torrent_download
+  download_metric
 }
-compute_download_times <- function(meta, request_event, deluge_torrent_download, group_id) {
+compute_download_times <- function(meta, request_event, download_metric, group_id) {
   n_leechers <- meta$nodes$network_size - meta$seeders
   download_start <- request_event |>
@@ -68,14 +79,15 @@ compute_download_times <- function(meta, request_event, deluge_torrent_download,
       # We didn't log those on the runner side so I have to reconstruct them.
       run = rep(rep(
        1:meta$repetitions - 1,
-        each = n_leechers), times=meta$seeder_sets),
+        each = n_leechers), times = meta$seeder_sets),
       seed_set = rep(
         1:meta$seeder_sets - 1,
         each = n_leechers * meta$repetitions),
+      destination = gsub('"', '', destination) # sometimes we get double-quoted strings in logs
     ) |>
     transmute(node = destination, run, seed_set, seed_request_time = timestamp)
-  download_times <- deluge_torrent_download |>
+  download_times <- download_metric |>
     left_join(download_start, by = c('node', 'run', 'seed_set')) |>
     mutate(
       elapsed_download_time = as.numeric(timestamp - seed_request_time)
@@ -95,38 +107,39 @@ compute_download_times <- function(meta, request_event, deluge_torrent_download,
   download_times
 }
-check_seeder_count <- function(download_times, seeders) {
-  mismatching_seeders <- download_times |>
-    filter(is.na(seed_request_time)) |>
-    select(node, seed_set, run) |>
-    distinct() |>
-    group_by(seed_set, run) |>
-    count() |>
-    filter(n != seeders)
-  nrow(mismatching_seeders) == 0
+download_times <- function(experiment, piece_count_distinct, discard_incomplete = TRUE, allow_missing = TRUE) {
+  meta <- experiment$meta
+  downloads <- experiment$download_metric |>
+    extract_repetitions() |>
+    compute_progress(meta, count_distinct = piece_count_distinct)
+  downloads <- process_incomplete_downloads(
+    downloads,
+    discard_incomplete
+  ) |>
+    process_incomplete_repetitions(meta$repetitions, allow_missing)
+  download_times <- compute_download_times(
+    meta,
+    experiment$request_event,
+    downloads,
+    group_id
+  )
+  if (!check_seeder_count(download_times, meta$seeders)) {
+    warning(glue::glue('Undefined download times do not match seeder count'))
+    return(NULL)
+  }
+  download_times
 }
+download_stats <- function(download_times) {
+  download_times |>
+    filter(!is.na(elapsed_download_time)) |>
+    group_by(piece_count, completed) |>
+    summarise(
+      mean = mean(elapsed_download_time),
+      median = median(elapsed_download_time),
+      max = max(elapsed_download_time),
+      min = min(elapsed_download_time),
+      p90 = quantile(elapsed_download_time, p = 0.95),
+      p10 = quantile(elapsed_download_time, p = 0.05),
+      .groups = 'drop'
+    )
+}
 completion_time_stats <- function(download_times, meta) {
   n_pieces <- meta |> piece_count()
   completion_times <- download_times |>
     filter(!is.na(elapsed_download_time),
-           piece_count == n_pieces) |>
+           is_completed(completed)) |>
     pull(elapsed_download_time)
   n_experiments <- meta$repetitions * meta$seeder_sets
@@ -156,35 +169,32 @@ completion_time_stats <- function(download_times, meta) {
   )
 }
-download_times <- function(experiment, discard_incomplete = TRUE, allow_missing = TRUE) {
-  meta <- experiment$meta
-  pieces <- experiment$meta |> piece_count()
-  downloads <- experiment$deluge_torrent_download |>
-    extract_repetitions() |>
-    compute_pieces(pieces)
+check_seeder_count <- function(download_times, seeders) {
+  mismatching_seeders <- download_times |>
+    filter(is.na(seed_request_time)) |>
+    select(node, seed_set, run) |>
+    distinct() |>
+    group_by(seed_set, run) |>
+    count() |>
+    filter(n != seeders)
-  downloads <- process_incomplete_downloads(
-    downloads,
-    pieces,
-    discard_incomplete
-  ) |>
-    process_incomplete_repetitions(meta$repetitions, allow_missing)
-  download_times <- compute_download_times(
-    meta,
-    experiment$request_event,
-    downloads,
-    group_id
-  )
-  if (!check_seeder_count(download_times, meta$seeders)) {
-    warning(glue::glue('Undefined download times do not match seeder count'))
-    return(NULL)
-  }
-  download_times
+  nrow(mismatching_seeders) == 0
 }
-download_stats <- function(download_times) {
-  download_times |>
-    filter(!is.na(elapsed_download_time)) |>
-    group_by(piece_count, completed) |>
-    summarise(
-      mean = mean(elapsed_download_time),
-      median = median(elapsed_download_time),
-      max = max(elapsed_download_time),
-      min = min(elapsed_download_time),
-      p90 = quantile(elapsed_download_time, p = 0.95),
-      p10 = quantile(elapsed_download_time, p = 0.05),
-      .groups = 'drop'
-    )
-}
 compute_compact_summary <- function(download_ecdf) {
   lapply(c(0.05, 0.5, 0.95), function(p)

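The centerpiece of this file is `compute_progress`, which generalizes the old BitTorrent-only `compute_pieces`. Below is a minimal sketch of how a download log flows through it, assuming the functions from this diff are loaded (e.g. via `devtools::load_all()`); the data frame and `meta` values are invented for illustration.

```r
library(dplyr)

# Hypothetical log for one node fetching a 1 MiB file as four 256 KiB
# chunks. `piece` holds whatever the system logs per chunk: a piece id
# for Deluge, a cumulative chunk count for Codex.
metric <- tibble(
  node = 'node-1', seed_set = 0, run = 0,
  timestamp = Sys.time() + 1:4,
  piece = 1:4
)
meta <- list(download_metric_unit_bytes = 262144, file_size = 4 * 262144)

# Deluge-style logs: every row is a distinct piece, so progress is the
# running row count. Codex-style logs would pass count_distinct = FALSE
# to use the cumulative `piece` value directly.
compute_progress(metric, meta, count_distinct = TRUE)
# `completed` should step through 0.25, 0.50, 0.75, 1.00.
```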
View File

@@ -1,9 +1,9 @@
-read_all_experiments <- function(base_path, skip_incomplete = TRUE) {
+read_all_experiments <- function(base_path, skip_incomplete = TRUE, prefix = '') {
   roots <- list.files(base_path,
                       include.dirs = TRUE, no.. = TRUE, full.names = TRUE)
   experiments <- lapply(roots, read_single_experiment)
-  names(experiments) <- sapply(roots, basename)
+  names(experiments) <- paste0(prefix, sapply(roots, basename))
   # Validates that no experiment has missing data.
   key_sets <- lapply(experiments, ls) |> unique()
@@ -24,8 +24,7 @@ read_all_experiments <- function(base_path, skip_incomplete = TRUE) {
   experiments[!is.null(experiments)]
 }
-merge_experiments <- function(set_1, set_2, prefix) {
-  maxid <- max(as.integer(sub(pattern = 'e', '', ls(deluge))))
+merge_experiments <- function(set_1, set_2) {
   merged <- list()
   for (set_1_id in ls(set_1)) {
@@ -33,7 +32,10 @@ merge_experiments <- function(set_1, set_2, prefix) {
   }
   for (set_2_id in ls(set_2)) {
-    merged[[paste0(prefix, set_2_id)]] <- set_2[[set_2_id]]
+    if (set_2_id %in% names(merged)) {
+      stop(glue::glue('Duplicate experiment ID {set_2_id}. Cannot merge.'))
+    }
+    merged[[set_2_id]] <- set_2[[set_2_id]]
   }
   merged

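With ID prefixing moved into `read_all_experiments`, merging no longer renames anything and instead fails fast on collisions. A usage sketch under the new signatures (paths taken from the notebook; the `prefix` value is illustrative):

```r
# Load two experiment groups, prefixing the second set's IDs at read
# time so they cannot collide with the first.
set_1 <- read_all_experiments('./data/g1738145663/')
set_2 <- read_all_experiments('./data/g1738248455/', prefix = 's')

# merge_experiments() now stops on a duplicate experiment ID instead of
# silently prefixing it.
all_experiments <- merge_experiments(set_1, set_2)
```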
View File

@@ -20,25 +20,34 @@ devtools::load_all()
 This is data that's been pre-parsed from an experiment [log source](https://github.com/codex-storage/bittorrent-benchmarks/blob/1ee8ea8a35a2c0fccea6e7c955183c4ed03eebb3/benchmarks/logging/sources.py#L27).
 ```{r}
-deluge <- read_all_experiments('./data/g1738145663/') |>
-  merge_experiments(
-    read_all_experiments('./data/g1738248455/'), prefix = 's')
+experiments <- read_all_experiments('./data/g1739826980')
 ```
+```{r}
+COUNT_DISTINCT = list(
+  'codex_static_dissemination' = FALSE,
+  'deluge_static_dissemination' = TRUE
+)
+```
 Computes the benchmark statistics from raw download logs.
 ```{r}
-benchmarks <- lapply(deluge, function(experiment) {
+benchmarks <- lapply(experiments, function(experiment) {
   print(glue::glue('Process {experiment$experiment_id}'))
   download_time_stats <- tryCatch({
     meta <- experiment$meta
     completion <- experiment |>
-      download_times() |>
+      download_times(
+        piece_count_distinct = COUNT_DISTINCT[[meta$experiment_type]]) |>
       completion_time_stats(meta)
     if (is.null(completion)) {
       NULL
     } else {
       completion |> mutate(
+        experiment_type = meta$experiment_type,
         network_size = meta$nodes$network_size,
         seeders = meta$seeders,
         leechers = network_size - meta$seeders,
@@ -71,7 +80,7 @@ benchmarks
 We then plot the median by network size, and facet it by seeder ratio and file size to see if it looks sane:
 ```{r fig.width = 10, warning=FALSE, message=FALSE}
-ggplot(benchmarks) +
+ggplot(benchmarks, aes(col = experiment_type, fill = experiment_type)) +
   geom_ribbon(aes(ymin = p25, ymax = p75, x = network_size),
               fill = scales::alpha('blue', 0.5), col = 'lightgray') +
   geom_point(aes(x = network_size, y = p25), col = 'darkgray', size=10.0, shape='-') +
@@ -90,5 +99,7 @@ ggplot(benchmarks) +
       paste0("seeder ratio: ", scales::percent(as.numeric(x)))
     }))
   ) +
+  scale_color_discrete(name = 'experiment type') +
+  guides(fill = 'none') +
   ylim(c(0,NA))
 ```
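The `COUNT_DISTINCT` table is what makes the notebook system-agnostic: each experiment type maps to the progress-counting mode that `download_times` forwards to `compute_progress`. A toy illustration of the dispatch (metadata values invented):

```r
COUNT_DISTINCT <- list(
  'codex_static_dissemination'  = FALSE, # value column is a cumulative chunk count
  'deluge_static_dissemination' = TRUE   # value column is a piece id; count rows
)

# Each experiment's metadata selects its own mode:
meta <- list(experiment_type = 'deluge_static_dissemination')
COUNT_DISTINCT[[meta$experiment_type]]
# TRUE
```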