diff --git a/analysis/final/R/analysis.R b/analysis/final/R/analysis.R
index cf6b833..c466b09 100644
--- a/analysis/final/R/analysis.R
+++ b/analysis/final/R/analysis.R
@@ -1,13 +1,17 @@
-PIECE_SIZE <- 262144
+is_completed <- function(completion) 1.0 - completion < 1e-7
 
-piece_count <- function(experiment_meta) {
-  experiment_meta$file_size / PIECE_SIZE
-}
-
-extract_repetitions <- function(deluge_torrent_download) {
-  deluge_torrent_download |>
+#' Extracts the repetition id and seed set id from the dataset name, which
+#' should be in the format `dataset-<seed_set>-<run>`.
+#'
+#' @param download_metric a data frame of download log entries containing a
+#'   `dataset_name` column.
+#'
+#' @returns the input data frame with numeric `seed_set` and `run` columns
+#'   added, and the `name` column dropped.
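+#' @examples
+#' # Hypothetical one-row input: only the `dataset_name` and `name` columns
+#' # are required here; the remaining columns are illustrative assumptions.
+#' extract_repetitions(tibble::tibble(
+#'   name = 'download_metric', dataset_name = 'dataset-3-12',
+#'   node = 'node-1', timestamp = 1))
+#' # => adds seed_set = 3 and run = 12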
+#' @export
+extract_repetitions <- function(download_metric) {
+  download_metric |>
     mutate(
-      temp = str_remove(torrent_name, '^dataset-'),
+      temp = str_remove(dataset_name, '^dataset-'),
       seed_set = as.numeric(str_extract(temp, '^\\d+')),
       run = as.numeric(str_extract(temp, '\\d+$'))
     ) |>
@@ -15,35 +19,42 @@ extract_repetitions <- function(deluge_torrent_download) {
     select(-temp, -name)
 }
 
-compute_pieces <- function(deluge_torrent_download, n_pieces) {
-  deluge_torrent_download |>
+#' Computes the progress of the download as a fraction of the file size. The
+#' underlying assumption is that downloads are logged as discrete chunks of
+#' the same size, and that the `piece` column contains something that
+#' identifies this chunk.
+#'
+#' This makes it compatible with BitTorrent, which logs piece ids, whereas with
+#' other systems we can simply use a byte count, provided the logger is smart
+#' enough to log at equally-sized, discrete intervals.
+#'
+compute_progress <- function(download_metric, meta, count_distinct) {
+  download_metric |>
     group_by(node, seed_set, run) |>
     arrange(timestamp) |>
     mutate(
-      piece_count = seq_along(timestamp)
+      piece_count = if (count_distinct) seq_along(timestamp) else piece
     ) |>
     ungroup() |>
-    mutate(completed = piece_count / n_pieces)
+    mutate(completed = (piece_count * meta$download_metric_unit_bytes) / meta$file_size)
 }
 
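+# Worked example (hypothetical numbers): for BitTorrent-style logs we'd have
+# count_distinct = TRUE and download_metric_unit_bytes = 262144 (one 256 KiB
+# piece per log entry), so for a 26214400-byte file the 50th logged piece
+# yields completed = (50 * 262144) / 26214400 = 0.5. For byte-count loggers,
+# count_distinct = FALSE and `piece` already carries the running count.
+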
-process_incomplete_downloads <- function(deluge_torrent_download, n_pieces, discard_incomplete) {
-  incomplete_downloads <- deluge_torrent_download |>
+process_incomplete_downloads <- function(download_metric, discard_incomplete) {
+  incomplete_downloads <- download_metric |>
     group_by(node, seed_set, run) |>
-    count() |>
-    ungroup() |>
-    filter(n != n_pieces)
+    summarise(completed = max(completed), .groups = 'drop') |>
+    filter(!is_completed(completed))
 
   if(nrow(incomplete_downloads) > 0) {
     (if (!discard_incomplete) stop else warning)(
       'Experiment contained incomplete downloads.')
   }
 
-  deluge_torrent_download |> anti_join(
+  download_metric |> anti_join(
     incomplete_downloads, by = c('node', 'seed_set', 'run'))
 }
 
-process_incomplete_repetitions <- function(deluge_torrent_download, repetitions, allow_missing) {
-  mismatching_repetitions <- deluge_torrent_download |>
+process_incomplete_repetitions <- function(download_metric, repetitions, allow_missing) {
+  mismatching_repetitions <- download_metric |>
     select(seed_set, node, run) |>
     distinct() |>
     group_by(seed_set, node) |>
@@ -55,10 +66,10 @@ process_incomplete_repetitions <- function(deluge_torrent_download, repetitions,
       'Experiment data did not have all repetitions.')
   }
 
-  deluge_torrent_download
+  download_metric
 }
 
-compute_download_times <- function(meta, request_event, deluge_torrent_download, group_id) {
+compute_download_times <- function(meta, request_event, download_metric, group_id) {
   n_leechers <- meta$nodes$network_size - meta$seeders
 
   download_start <- request_event |>
@@ -68,14 +79,15 @@ compute_download_times <- function(meta, request_event, deluge_torrent_download,
       # We didn't log those on the runner side so I have to reconstruct them.
       run = rep(rep(
         1:meta$repetitions - 1,
-        each = n_leechers), times=meta$seeder_sets),
+        each = n_leechers), times = meta$seeder_sets),
       seed_set = rep(
         1:meta$seeder_sets - 1,
         each = n_leechers * meta$repetitions),
+      destination = gsub('"', '', destination) # sometimes we get double-quoted strings in logs
     ) |>
     transmute(node = destination, run, seed_set, seed_request_time = timestamp)
 
-  download_times <- deluge_torrent_download |>
+  download_times <- download_metric |>
     left_join(download_start, by = c('node', 'run', 'seed_set')) |>
     mutate(
       elapsed_download_time = as.numeric(timestamp - seed_request_time)
@@ -95,38 +107,39 @@ compute_download_times <- function(meta, request_event, deluge_torrent_download,
   download_times
 }
 
-check_seeder_count <- function(download_times, seeders) {
-  mismatching_seeders <- download_times |>
-    filter(is.na(seed_request_time)) |>
-    select(node, seed_set, run) |>
-    distinct() |>
-    group_by(seed_set, run) |>
-    count() |>
-    filter(n != seeders)
-
-  nrow(mismatching_seeders) == 0
+download_times <- function(experiment, piece_count_distinct, discard_incomplete = TRUE, allow_missing = TRUE) {
+  meta <- experiment$meta
+  downloads <- experiment$download_metric |>
+    extract_repetitions() |>
+    compute_progress(meta, count_distinct = piece_count_distinct)
+
+  downloads <- process_incomplete_downloads(
+    downloads,
+    discard_incomplete
+  ) |>
+    process_incomplete_repetitions(meta$repetitions, allow_missing)
+
+  download_times <- compute_download_times(
+    meta,
+    experiment$request_event,
+    downloads,
+    group_id
+  )
+
+  if (!check_seeder_count(download_times, meta$seeders)) {
+    warning('Number of nodes with undefined download times does not match the seeder count.')
+    return(NULL)
+  }
+
+  download_times
 }
 
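+# Intended usage (sketch; assumes an experiment loaded via read_all_experiments
+# and BitTorrent-style piece logs, hence piece_count_distinct = TRUE):
+#
+#   download_times(experiment, piece_count_distinct = TRUE) |>
+#     completion_time_stats(experiment$meta)
+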
-download_stats <- function(download_times) {
-  download_times |>
-    filter(!is.na(elapsed_download_time)) |>
-    group_by(piece_count, completed) |>
-    summarise(
-      mean = mean(elapsed_download_time),
-      median = median(elapsed_download_time),
-      max = max(elapsed_download_time),
-      min = min(elapsed_download_time),
-      p90 = quantile(elapsed_download_time, p = 0.95),
-      p10 = quantile(elapsed_download_time, p = 0.05),
-      .groups = 'drop'
-    )
-}
 
 completion_time_stats <- function(download_times, meta) {
-  n_pieces <- meta |> piece_count()
   completion_times <- download_times |>
     filter(!is.na(elapsed_download_time),
-           piece_count == n_pieces) |>
+           is_completed(completed)) |>
     pull(elapsed_download_time)
 
   n_experiments <- meta$repetitions * meta$seeder_sets
@@ -156,35 +169,32 @@ completion_time_stats <- function(download_times, meta) {
   )
 }
 
-download_times <- function(experiment, discard_incomplete = TRUE, allow_missing = TRUE) {
-  meta <- experiment$meta
-  pieces <- experiment$meta |> piece_count()
-  downloads <- experiment$deluge_torrent_download |>
-    extract_repetitions() |>
-    compute_pieces(pieces)
+check_seeder_count <- function(download_times, seeders) {
+  mismatching_seeders <- download_times |>
+    filter(is.na(seed_request_time)) |>
+    select(node, seed_set, run) |>
+    distinct() |>
+    group_by(seed_set, run) |>
+    count() |>
+    filter(n != seeders)
 
-  downloads <- process_incomplete_downloads(
-    downloads,
-    pieces,
-    discard_incomplete
-  ) |>
-    process_incomplete_repetitions(meta$repetitions, allow_missing)
-
-  download_times <- compute_download_times(
-    meta,
-    experiment$request_event,
-    downloads,
-    group_id
-  )
-
-  if (!check_seeder_count(download_times, meta$seeders)) {
-    warning(glue::glue('Undefined download times do not match seeder count'))
-    return(NULL)
-  }
-
-  download_times
+  nrow(mismatching_seeders) == 0
 }
 
+download_stats <- function(download_times) {
+  download_times |>
+    filter(!is.na(elapsed_download_time)) |>
+    group_by(piece_count, completed) |>
+    summarise(
+      mean = mean(elapsed_download_time),
+      median = median(elapsed_download_time),
+      max = max(elapsed_download_time),
+      min = min(elapsed_download_time),
+      p90 = quantile(elapsed_download_time, probs = 0.90),
+      p10 = quantile(elapsed_download_time, probs = 0.10),
+      .groups = 'drop'
+    )
+}
 
 compute_compact_summary <- function(download_ecdf) {
   lapply(c(0.05, 0.5, 0.95), function(p)
diff --git a/analysis/final/R/read-all-experiments.R b/analysis/final/R/read-all-experiments.R
index 4c885b5..4c9157b 100644
--- a/analysis/final/R/read-all-experiments.R
+++ b/analysis/final/R/read-all-experiments.R
@@ -1,9 +1,9 @@
-read_all_experiments <- function(base_path, skip_incomplete = TRUE) {
+read_all_experiments <- function(base_path, skip_incomplete = TRUE, prefix = '') {
   roots <- list.files(base_path, include.dirs = TRUE,
                       no.. = TRUE, full.names = TRUE)
 
   experiments <- lapply(roots, read_single_experiment)
-  names(experiments) <- sapply(roots, basename)
+  names(experiments) <- paste0(prefix, sapply(roots, basename))
 
   # Validates that no experiment has missing data.
   key_sets <- lapply(experiments, ls) |> unique()
@@ -24,8 +24,7 @@ read_all_experiments <- function(base_path, skip_incomplete = TRUE) {
   experiments[!is.null(experiments)]
 }
 
-merge_experiments <- function(set_1, set_2, prefix) {
-  maxid <- max(as.integer(sub(pattern = 'e', '', ls(deluge))))
+merge_experiments <- function(set_1, set_2) {
   merged <- list()
 
   for (set_1_id in ls(set_1)) {
@@ -33,7 +32,10 @@ merge_experiments <- function(set_1, set_2) {
   }
 
   for (set_2_id in ls(set_2)) {
-    merged[[paste0(prefix, set_2_id)]] <- set_2[[set_2_id]]
+    if (set_2_id %in% names(merged)) {
+      stop(glue::glue('Duplicate experiment ID {set_2_id}. Cannot merge.'))
+    }
+    merged[[set_2_id]] <- set_2[[set_2_id]]
   }
 
   merged
diff --git a/analysis/final/static-dissemination.Rmd b/analysis/final/static-dissemination.Rmd
index d1914cc..f7760e7 100644
--- a/analysis/final/static-dissemination.Rmd
+++ b/analysis/final/static-dissemination.Rmd
@@ -20,25 +20,34 @@ devtools::load_all()
 
 This is data that's been pre-parsed from an experiment [log source](https://github.com/codex-storage/bittorrent-benchmarks/blob/1ee8ea8a35a2c0fccea6e7c955183c4ed03eebb3/benchmarks/logging/sources.py#L27).
 
 ```{r}
-deluge <- read_all_experiments('./data/g1738145663/') |>
-  merge_experiments(
-    read_all_experiments('./data/g1738248455/'), prefix = 's')
+experiments <- read_all_experiments('./data/g1739826980')
 ```
 
+Whether a download log entry identifies a distinct chunk (a BitTorrent piece
+id) or carries a running count varies per system:
+
+```{r}
+COUNT_DISTINCT <- list(
+  'codex_static_dissemination' = FALSE,
+  'deluge_static_dissemination' = TRUE
+)
+```
+
 Computes the benchmark statistics from raw download logs.
 
 ```{r}
-benchmarks <- lapply(deluge, function(experiment) {
+benchmarks <- lapply(experiments, function(experiment) {
   print(glue::glue('Process {experiment$experiment_id}'))
   download_time_stats <- tryCatch({
     meta <- experiment$meta
     completion <- experiment |>
-      download_times() |>
+      download_times(
+        piece_count_distinct = COUNT_DISTINCT[[meta$experiment_type]]) |>
       completion_time_stats(meta)
 
     if (is.null(completion)) {
       NULL
     } else {
       completion |> mutate(
+        experiment_type = meta$experiment_type,
         network_size = meta$nodes$network_size,
         seeders = meta$seeders,
         leechers = network_size - meta$seeders,
@@ -71,7 +80,7 @@ benchmarks
 
 We then plot the median by network size, and facet it by seeder ratio and file size to see if it looks sane:
 
 ```{r fig.width = 10, warning=FALSE, message=FALSE}
-ggplot(benchmarks) +
+ggplot(benchmarks, aes(col = experiment_type, fill = experiment_type)) +
   geom_ribbon(aes(ymin = p25, ymax = p75, x = network_size),
-              fill = scales::alpha('blue', 0.5), col = 'lightgray') +
+              alpha = 0.5, col = 'lightgray') +
   geom_point(aes(x = network_size, y = p25), col = 'darkgray', size=10.0, shape='-') +
@@ -90,5 +99,7 @@ ggplot(benchmarks) +
       paste0("seeder ratio: ", scales::percent(as.numeric(x)))
     }))
   ) +
+  scale_color_discrete(name = 'experiment type') +
+  guides(fill = 'none') +
   ylim(c(0,NA))
 ```
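+
+As a quick numeric cross-check of the plot (a rough sketch; it assumes
+`benchmarks` was bound into a single data frame carrying the `median` and
+`experiment_type` columns added above):
+
+```{r}
+benchmarks |>
+  group_by(experiment_type, network_size) |>
+  summarise(median_download_time = median(median), .groups = 'drop')
+```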