feat: add lossy analysis

gmega 2025-01-30 11:07:47 -03:00
parent 94893c0f93
commit df087127de
2 changed files with 38 additions and 25 deletions

View File

@@ -1,7 +1,7 @@
 PIECE_SIZE <- 262144

-piece_count <- function(experiment) {
-  experiment$meta$file_size / PIECE_SIZE
+piece_count <- function(experiment_meta) {
+  experiment_meta$file_size / PIECE_SIZE
 }

 extract_repetitions <- function(deluge_torrent_download) {
@@ -26,17 +26,23 @@ compute_pieces <- function(deluge_torrent_download, n_pieces) {
     mutate(completed = piece_count / n_pieces)
 }

-check_incomplete_downloads <- function(deluge_torrent_download, n_pieces) {
+process_incomplete_downloads <- function(deluge_torrent_download, n_pieces, discard_incomplete) {
   incomplete_downloads <- deluge_torrent_download |>
     group_by(node, seed_set, run) |>
     count() |>
     ungroup() |>
     filter(n != n_pieces)

-  nrow(incomplete_downloads) == 0
+  if (nrow(incomplete_downloads) > 0) {
+    (if (!discard_incomplete) stop else warning)(
+      'Experiment contained incomplete downloads.')
+  }
+
+  deluge_torrent_download |> anti_join(
+    incomplete_downloads, by = c('node', 'seed_set', 'run'))
 }

-check_mismatching_repetitions <- function(deluge_torrent_download, repetitions) {
+process_incomplete_repetitions <- function(deluge_torrent_download, repetitions, allow_missing) {
   mismatching_repetitions <- deluge_torrent_download |>
     select(seed_set, node, run) |>
     distinct() |>
@@ -44,7 +50,12 @@ check_mismatching_repetitions <- function(deluge_torrent_download, repetitions)
     count() |>
     filter(n != repetitions)

-  nrow(mismatching_repetitions) == 0
+  if (nrow(mismatching_repetitions) > 0) {
+    (if (!allow_missing) stop else warning)(
+      'Experiment data did not have all repetitions.')
+  }
+
+  deluge_torrent_download
 }

 compute_download_times <- function(meta, request_event, deluge_torrent_download, group_id) {
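
Both new `process_*` helpers rely on the same escalate-or-report idiom: because `if` is an expression in R, it can select either `stop` or `warning` as the function to call. A minimal standalone sketch of that pattern; the `report_problem` and `fail_hard` names are illustrative, not part of the diff:

```r
# Sketch of the stop/warning dispatch used by the process_* helpers.
# `report_problem` and `fail_hard` are hypothetical names for illustration.
report_problem <- function(fail_hard, msg) {
  # `if` evaluates to a value in R, so it can pick which function to
  # call: stop() aborts, warning() reports and lets execution continue.
  (if (fail_hard) stop else warning)(msg)
}

report_problem(FALSE, 'incomplete data')  # warns, execution continues
# report_problem(TRUE, 'incomplete data') # would abort with an error
```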
@@ -111,13 +122,21 @@ download_stats <- function(download_times) {
   )
 }

-completion_time_stats <- function(download_times) {
+completion_time_stats <- function(download_times, meta) {
+  n_pieces <- meta |> piece_count()
   completion_times <- download_times |>
-    filter(!is.na(elapsed_download_time)) |>
+    filter(!is.na(elapsed_download_time),
+           piece_count == n_pieces) |>
     pull(elapsed_download_time)

+  n_experiments <- meta$repetitions * meta$seeder_sets
+  n_leechers <- meta$nodes$network_size - meta$seeders
+  n_points <- n_experiments * n_leechers
+
   tibble(
     n = length(completion_times),
+    expected_n = n_points,
+    missing = expected_n - n,
     min = min(completion_times),
     p05 = quantile(completion_times, p = 0.05),
     p10 = quantile(completion_times, p = 0.10),
@@ -126,28 +145,23 @@
     p80 = quantile(completion_times, p = 0.80),
     p90 = quantile(completion_times, p = 0.90),
     p95 = quantile(completion_times, p = 0.95),
-    max = max(completion_times)
+    max = max(completion_times),
   )
 }

-download_times <- function(experiment) {
+download_times <- function(experiment, discard_incomplete = TRUE, allow_missing = TRUE) {
   meta <- experiment$meta
-  pieces <- experiment |> piece_count()
+  pieces <- experiment$meta |> piece_count()
   downloads <- experiment$deluge_torrent_download |>
     extract_repetitions() |>
     compute_pieces(pieces)

-  if (!check_incomplete_downloads(downloads, pieces)) {
-    warning(glue::glue('Discard experiment {experiment$experiment_id} ',
-                       'due to incomplete downloads'))
-    return(NULL)
-  }
-
-  if (!check_mismatching_repetitions(downloads, meta$repetitions)) {
-    warning(glue::glue('Discard experiment {experiment$experiment_id} ',
-                       'due to mismatching repetitions'))
-    return(NULL)
-  }
+  downloads <- process_incomplete_downloads(
+    downloads,
+    pieces,
+    discard_incomplete
+  ) |>
+    process_incomplete_repetitions(meta$repetitions, allow_missing)

   download_times <- compute_download_times(
     meta,
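
Taken together, these changes make `download_times` lossy rather than all-or-nothing: instead of discarding a whole experiment, it drops incomplete downloads and tolerates missing repetitions, warning as it goes. A hedged usage sketch; the `experiment` object is assumed to be one element of the list returned by `read_all_experiments`, as in the notebook below:

```r
# Hypothetical usage; `experiment` is assumed to come from
# read_all_experiments().

# Default (lossy): warn and drop incomplete downloads, warn on missing
# repetitions, then proceed with whatever data remains.
times <- download_times(experiment)

# Strict: abort instead, restoring the old all-or-nothing behaviour.
times <- download_times(experiment,
                        discard_incomplete = FALSE,
                        allow_missing = FALSE)
```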

View File

@@ -20,7 +20,7 @@ devtools::load_all()
 This is data that's been pre-parsed from an experiment [log source](https://github.com/codex-storage/bittorrent-benchmarks/blob/1ee8ea8a35a2c0fccea6e7c955183c4ed03eebb3/benchmarks/logging/sources.py#L27).

 ```{r}
-deluge <- read_all_experiments('./data/deluge-g1737553501/')
+deluge <- read_all_experiments('./data/g1738145663/')
 ```

 Computes the benchmark statistics from raw download logs.
@@ -31,7 +31,7 @@ benchmarks <- lapply(deluge, function(experiment) {
   meta <- experiment$meta
   completion <- experiment |>
     download_times() |>
-    completion_time_stats()
+    completion_time_stats(meta)

   if (is.null(completion)) {
     NULL
@@ -83,4 +83,3 @@ ggplot(benchmarks) +
   ) +
   ylim(c(0,NA))
 ```
-The data looks largely sane: a larger seeder ratio makes performance somewhat better, though not nearly as consistently as one would hope, at least in this data, and performance degrades linearly as the network grows larger. The $100\text{MB}$ file also yields much better-behaved data than the $1\text{GB}$ case, with both trends (seeder ratio improving performance, network size linearly degrading it) being more pronounced.
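
The new `expected_n` and `missing` columns make the lossiness measurable: `expected_n` is the number of completion times the experiment should have produced (repetitions × seeder sets × leechers), and `missing` is how many were lost. A hedged sketch of how one might act on them; `stats` is assumed to be the tibble returned by `completion_time_stats`:

```r
# Hypothetical post-hoc check; `stats` is assumed to be the tibble
# returned by completion_time_stats(download_times, meta).
loss_ratio <- stats$missing / stats$expected_n
if (loss_ratio > 0.1) {
  # Flag experiments where more than 10% of the expected completion
  # times were lost to incomplete downloads or missing repetitions.
  warning(glue::glue('{round(100 * loss_ratio, 1)}% of expected ',
                     'data points are missing.'))
}
```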