feat: add multi-experiment analysis

gmega 2025-01-10 17:48:11 -03:00
parent 84bac4594d
commit 0acd2e3086
15 changed files with 1662 additions and 5 deletions


@@ -9,9 +9,9 @@ locally, however, using [Minikube](https://minikube.sigs.k8s.io/) (or Kind, or D

## Limits

-When running experiments locally in a Linux machine, you will bump onto a number of
-limitations. I have documented those here. I won't go into how to make those changes
-permanent within your system as there's significant variation across distributions.
+When running experiments locally on a Linux machine, you will likely need to adjust several
+of the default OS limits. I won't go into how to make those changes permanent within your
+system as there's significant variation across distributions.

**ARP Cache.** The default size for the ARP cache is too small. You should bump it
significantly, e.g.:
@@ -22,7 +22,7 @@ echo 8192 | sudo tee /proc/sys/net/ipv4/neigh/default/gc_thresh2
echo 16384 | sudo tee /proc/sys/net/ipv4/neigh/default/gc_thresh3
```

-**Inotify.** Kubernetes seems to enjoy watching the filesystem, so
+**inotify.** Kubernetes seems to enjoy watching the filesystem, so
you should increase inotify limits across the board:
```bash
@@ -31,7 +31,7 @@ sudo sysctl -w fs.inotify.max_queued_events=2099999999
sudo sysctl -w fs.inotify.max_user_watches=2099999999
```

-**Kernel key retention service.* Kubernetes also places a large number of keys
+**Kernel key retention service.** Kubernetes also places a large number of keys
within the kernel. Make sure you have enough room:

```bash
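# Illustrative values only -- the actual limits are not shown in this hunk.
sudo sysctl -w kernel.keys.maxkeys=2000000
sudo sysctl -w kernel.keys.maxbytes=2000000000
```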


@@ -0,0 +1,4 @@
^renv$
^renv\.lock$
^.*\.Rproj$
^\.Rproj\.user$


@@ -0,0 +1 @@
source("renv/activate.R")

analysis/final.analysis/.gitignore

@@ -0,0 +1 @@
data


@@ -0,0 +1,16 @@
Package: final.analysis
Type: Package
Title: What the Package Does (Title Case)
Version: 0.1.0
Authors@R: c(
    person(
      "Jane", "Doe",
      email = "jane@example.com",
      role = c("aut", "cre")
    )
  )
Description: More about what it does (maybe more than one line).
    Continuation lines should be indented.
License: What license is it under?
Encoding: UTF-8
LazyData: true


@@ -0,0 +1 @@
exportPattern("^[[:alpha:]]+")


@@ -0,0 +1,106 @@
extract_repetitions <- function(deluge_torrent_download) {
  deluge_torrent_download |>
    mutate(
      # Torrent names look like 'dataset-<seed_set>...<run>'.
      temp = str_remove(torrent_name, '^dataset-'),
      seed_set = as.numeric(str_extract(temp, '^\\d+')),
      run = as.numeric(str_extract(temp, '\\d+$'))
    ) |>
    rename(piece = value) |>
    select(-temp, -name)
}
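For illustration, this is how those patterns take a torrent name apart. The `dataset-<seed_set>...<run>` scheme is inferred from the regexes above, and the concrete name below is made up:

```r
library(stringr)

# Hypothetical torrent name following the inferred scheme.
temp <- str_remove('dataset-3-file-7', '^dataset-')  # "3-file-7"
as.numeric(str_extract(temp, '^\\d+'))               # seed_set: 3
as.numeric(str_extract(temp, '\\d+$'))               # run: 7
```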
compute_pieces <- function(deluge_torrent_download, n_pieces) {
  deluge_torrent_download |>
    group_by(node, seed_set, run) |>
    arrange(timestamp) |>
    mutate(
      # Each row is one downloaded piece, so the running row count is the
      # number of pieces held at that timestamp.
      piece_count = seq_along(timestamp)
    ) |>
    ungroup() |>
    mutate(completed = piece_count / n_pieces)
}
check_incomplete_downloads <- function(deluge_torrent_download, n_pieces) {
  incomplete_downloads <- deluge_torrent_download |>
    group_by(node, seed_set, run) |>
    count() |>
    ungroup() |>
    filter(n != n_pieces)
  nrow(incomplete_downloads) == 0
}

check_mismatching_repetitions <- function(deluge_torrent_download, repetitions) {
  mismatching_repetitions <- deluge_torrent_download |>
    select(seed_set, node, run) |>
    distinct() |>
    group_by(seed_set, node) |>
    count() |>
    filter(n != repetitions)
  nrow(mismatching_repetitions) == 0
}
compute_download_times <- function(meta, request_event, deluge_torrent_download, group_id) {
  n_leechers <- meta$nodes$network_size - meta$seeders
  download_start <- request_event |>
    select(-request_id) |>
    filter(name == 'leech', type == 'RequestEventType.end') |>
    mutate(
      # We didn't log run/seed_set on the runner side, so I have to reconstruct
      # them positionally. This assumes events appear in execution order: one
      # event per leecher, grouped by run, then by seed set.
      run = rep(rep(
        0:(meta$repetitions - 1),
        each = n_leechers), times = meta$seeder_sets),
      seed_set = rep(
        0:(meta$seeder_sets - 1),
        each = n_leechers * meta$repetitions)
    ) |>
    transmute(node = destination, run, seed_set, seed_request_time = timestamp)

  download_times <- deluge_torrent_download |>
    # FIXME remove this once we fix the chart
    mutate(node = sub(pattern = glue::glue('-{group_id}$'), replacement = '', x = node)) |>
    left_join(download_start, by = c('node', 'run', 'seed_set')) |>
    mutate(
      elapsed_download_time = as.numeric(timestamp - seed_request_time)
    ) |>
    group_by(node, run, seed_set) |>
    mutate(lookup_time = as.numeric(min(timestamp) - seed_request_time)) |>
    ungroup()

  if (nrow(download_times |>
           filter(elapsed_download_time < 0 | lookup_time < 0)) > 0) {
    stop('Calculation for download times contains negative numbers')
  }
  download_times
}
check_seeder_count <- function(download_times, seeders) {
  # Seeders never issue a leech request, so they are exactly the nodes left
  # with an undefined seed_request_time after the join.
  mismatching_seeders <- download_times |>
    filter(is.na(seed_request_time)) |>
    select(node, seed_set, run) |>
    distinct() |>
    group_by(seed_set, run) |>
    count() |>
    filter(n != seeders)
  nrow(mismatching_seeders) == 0
}
download_time_stats <- function(download_times) {
  download_times |>
    filter(!is.na(elapsed_download_time)) |>
    group_by(piece_count, completed) |>
    summarise(
      mean = mean(elapsed_download_time),
      median = median(elapsed_download_time),
      max = max(elapsed_download_time),
      min = min(elapsed_download_time),
      p90 = quantile(elapsed_download_time, probs = 0.90),
      p10 = quantile(elapsed_download_time, probs = 0.10),
      .groups = 'drop'
    )
}


@@ -0,0 +1,25 @@
read_all_experiments <- function(base_path, skip_incomplete = TRUE) {
  roots <- list.files(base_path,
                      include.dirs = TRUE, no.. = TRUE, full.names = TRUE)
  experiments <- lapply(roots, read_single_experiment)
  names(experiments) <- sapply(roots, basename)

  # Validates that no experiment has missing data.
  key_sets <- lapply(experiments, names) |> unique()
  # Selects the largest key set, which is presumably the most complete.
  key_set <- key_sets[[order(sapply(key_sets, length), decreasing = TRUE)[1]]]
  # Discards any experiment that doesn't have all keys.
  experiments <- lapply(experiments, function(experiment) {
    if (!all(key_set %in% names(experiment))) {
      warning(glue::glue('Experiment {experiment$experiment_id} is missing ',
                         'some keys and will be discarded.'))
      NULL
    } else {
      experiment
    }
  })

  drop_nulls(experiments)
}
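A minimal usage sketch; the path mirrors the call in the analysis notebook, and the result shape follows from the code above:

```r
# A named list keyed by experiment folder basename, with experiments missing
# tables (relative to the largest key set) already dropped.
experiments <- read_all_experiments('./data/deluge')
names(experiments)
```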


@@ -0,0 +1,36 @@
read_single_experiment <- function(experiment_folder) {
  # This is a structural assumption: the base folder for the experiment
  # corresponds to its ID.
  experiment_id <- basename(experiment_folder)
  print(glue::glue('Reading experiment {experiment_id}'))
  meta <- jsonlite::read_json(.lookup_experiment_config(experiment_folder))

  table_files <- list.files(path = experiment_folder, '\\.csv$')
  data <- lapply(table_files, function(table_file) {
    read_csv(
      file.path(experiment_folder, table_file),
      show_col_types = FALSE
    ) |>
      mutate(
        experiment_id = !!experiment_id
      )
  })
  names(data) <- gsub('(\\..*)$', '', table_files)

  data$meta <- meta
  data$experiment_id <- experiment_id
  data
}

.lookup_experiment_config <- function(experiment_folder) {
  candidates <- list.files(path = experiment_folder,
                           pattern = '_experiment_config_log_entry.jsonl$')
  if (length(candidates) != 1) {
    stop(glue::glue(
      'Cannot establish the correct config file at {experiment_folder}.'))
  }
  file.path(experiment_folder, candidates)
}
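Judging from how the notebook consumes the result, each experiment comes back as a plain list: one tibble per CSV, named after the file minus its extension, plus `meta` and `experiment_id`. A sketch, with a hypothetical folder name:

```r
experiment <- read_single_experiment('./data/deluge/some-experiment')
experiment$experiment_id            # 'some-experiment'
experiment$meta$repetitions         # parsed from the JSON config log entry
experiment$deluge_torrent_download  # read from deluge_torrent_download.csv
```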


@@ -0,0 +1,4 @@
drop_nulls <- function(a_list) {
  a_list[!vapply(a_list, is.null, logical(1))]
}
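`is.null()` is not vectorized over list elements, hence the element-wise `vapply()` above; for example:

```r
drop_nulls(list(a = 1, b = NULL, c = 3))
#> $a
#> [1] 1
#>
#> $c
#> [1] 3
```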


@@ -0,0 +1,23 @@
{
  "R": {
    "Version": "4.3.3",
    "Repositories": [
      {
        "Name": "CRAN",
        "URL": "https://cloud.r-project.org"
      }
    ]
  },
  "Packages": {
    "renv": {
      "Package": "renv",
      "Version": "1.0.11",
      "Source": "Repository",
      "Repository": "CRAN",
      "Requirements": [
        "utils"
      ],
      "Hash": "47623f66b4e80b3b0587bc5d7b309888"
    }
  }
}


@@ -0,0 +1,7 @@
library/
local/
cellar/
lock/
python/
sandbox/
staging/

File diff suppressed because it is too large


@@ -0,0 +1,19 @@
{
  "bioconductor.version": null,
  "external.libraries": [],
  "ignored.packages": [],
  "package.dependency.fields": [
    "Imports",
    "Depends",
    "LinkingTo"
  ],
  "ppm.enabled": null,
  "ppm.ignored.urls": [],
  "r.version": null,
  "snapshot.type": "explicit",
  "use.cache": true,
  "vcs.ignore.cellar": true,
  "vcs.ignore.library": true,
  "vcs.ignore.local": true,
  "vcs.manage.ignores": true
}


@@ -0,0 +1,109 @@
---
title: "static-dissemination.Rmd"
output: html_document
date: "2025-01-10"
---
```{r}
library(tidyverse)
devtools::load_all()
```
```{r}
group_id <- 'g1736505161'
```
```{r}
deluge <- read_all_experiments('./data/deluge')
```
The torrent piece size is set at torrent creation time by [torrentool](https://github.com/idlesign/torrentool/blob/5f37d6dcc304758bae46d01c63e5be0f0a348bfc/torrentool/torrent.py#L354).
```{r}
PIECE_SIZE <- 262144
```
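As a quick sanity check, with a made-up file size: a 100 MiB file splits into 400 pieces of 256 KiB.

```{r}
# Hypothetical 100 MiB file; 100 * 1024^2 / 262144 = 400 pieces.
100 * 1024^2 / PIECE_SIZE
```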
```{r}
piece_count <- function(experiment) {
  # The last piece may be partial, hence the ceiling.
  ceiling(experiment$meta$file_size / PIECE_SIZE)
}
```
```{r}
compute_download_time_stats <- function(experiment) {
  meta <- experiment$meta
  pieces <- experiment |> piece_count()
  downloads <- experiment$deluge_torrent_download |>
    extract_repetitions() |>
    compute_pieces(pieces)

  if (!check_incomplete_downloads(downloads, pieces)) {
    warning(glue::glue('Discard experiment {experiment$experiment_id} ',
                       'due to incomplete downloads'))
    return(NULL)
  }

  if (!check_mismatching_repetitions(downloads, meta$repetitions)) {
    warning(glue::glue('Discard experiment {experiment$experiment_id} ',
                       'due to mismatching repetitions'))
    return(NULL)
  }

  download_times <- compute_download_times(
    meta,
    experiment$request_event,
    downloads,
    group_id  # defined at the top of this notebook
  )

  if (!check_seeder_count(download_times, meta$seeders)) {
    warning(glue::glue('Discard experiment {experiment$experiment_id} ',
                       'because undefined download times do not match the seeder count'))
    return(NULL)
  }

  network_size <- meta$nodes$network_size
  download_times |>
    download_time_stats() |>
    mutate(
      network_size = network_size,
      seeders = meta$seeders,
      leechers = network_size - meta$seeders,
      file_size = meta$file_size
    )
}

# Median download time at 5%, 50% and 95% completion, one column per level.
compute_compact_summary <- function(download_ecdf) {
  lapply(c(0.05, 0.5, 0.95), function(p)
    download_ecdf |>
      filter(completed >= p) |>
      slice_min(completed)
  ) |>
    bind_rows() |>
    select(completed, network_size, file_size, seeders, leechers, median) |>
    pivot_wider(id_cols = c('file_size', 'network_size', 'seeders', 'leechers'),
                names_from = completed, values_from = median)
}
```
```{r}
lapply(deluge, function(experiment) {
  print(glue::glue('Process {experiment$experiment_id}'))
  download_time_stats <- tryCatch(
    compute_download_time_stats(experiment),
    error = function(e) NULL
  )
  if (is.null(download_time_stats)) {
    NULL
  } else {
    compute_compact_summary(download_time_stats)
  }
}) |>
  drop_nulls() |>
  bind_rows() |>
  arrange(file_size, network_size, seeders)
```