diff --git a/analysis/final/R/analysis.R b/analysis/final/R/analysis.R index e06b347..7aa598e 100644 --- a/analysis/final/R/analysis.R +++ b/analysis/final/R/analysis.R @@ -1,7 +1,7 @@ PIECE_SIZE <- 262144 -piece_count <- function(experiment) { - experiment$meta$file_size / PIECE_SIZE +piece_count <- function(experiment_meta) { + experiment_meta$file_size / PIECE_SIZE } extract_repetitions <- function(deluge_torrent_download) { @@ -26,17 +26,23 @@ compute_pieces <- function(deluge_torrent_download, n_pieces) { mutate(completed = piece_count / n_pieces) } -check_incomplete_downloads <- function(deluge_torrent_download, n_pieces) { +process_incomplete_downloads <- function(deluge_torrent_download, n_pieces, discard_incomplete) { incomplete_downloads <- deluge_torrent_download |> group_by(node, seed_set, run) |> count() |> ungroup() |> filter(n != n_pieces) - nrow(incomplete_downloads) == 0 + if(nrow(incomplete_downloads) > 0) { + (if (!discard_incomplete) stop else warning)( + 'Experiment contained incomplete downloads.') + } + + deluge_torrent_download |> anti_join( + incomplete_downloads, by = c('node', 'seed_set', 'run')) } -check_mismatching_repetitions <- function(deluge_torrent_download, repetitions) { +process_incomplete_repetitions <- function(deluge_torrent_download, repetitions, allow_missing) { mismatching_repetitions <- deluge_torrent_download |> select(seed_set, node, run) |> distinct() |> @@ -44,7 +50,12 @@ check_mismatching_repetitions <- function(deluge_torrent_download, repetitions) count() |> filter(n != repetitions) - nrow(mismatching_repetitions) == 0 + if(nrow(mismatching_repetitions) > 0) { + (if (!allow_missing) stop else warning)( + 'Experiment data did not have all repetitions.') + } + + deluge_torrent_download } compute_download_times <- function(meta, request_event, deluge_torrent_download, group_id) { @@ -111,13 +122,21 @@ download_stats <- function(download_times) { ) } -completion_time_stats <- function(download_times) { +completion_time_stats <- function(download_times, meta) { + n_pieces <- meta |> piece_count() completion_times <- download_times |> - filter(!is.na(elapsed_download_time)) |> + filter(!is.na(elapsed_download_time), + piece_count == n_pieces) |> pull(elapsed_download_time) + n_experiments <- meta$repetitions * meta$seeder_sets + n_leechers <- meta$nodes$network_size - meta$seeders + n_points <- n_experiments * n_leechers tibble( + n = length(completion_times), + expected_n = n_points, + missing = expected_n - n, min = min(completion_times), p05 = quantile(completion_times, p = 0.05), p10 = quantile(completion_times, p = 0.10), @@ -126,28 +145,23 @@ completion_time_stats <- function(download_times) { p80 = quantile(completion_times, p = 0.80), p90 = quantile(completion_times, p = 0.90), p95 = quantile(completion_times, p = 0.95), - max = max(completion_times) + max = max(completion_times), ) } -download_times <- function(experiment) { +download_times <- function(experiment, discard_incomplete = TRUE, allow_missing = TRUE) { meta <- experiment$meta - pieces <- experiment |> piece_count() + pieces <- experiment$meta |> piece_count() downloads <- experiment$deluge_torrent_download |> extract_repetitions() |> compute_pieces(pieces) - if (!check_incomplete_downloads(downloads, pieces)) { - warning(glue::glue('Discard experiment {experiment$experiment_id} ', - 'due to incomplete downloads')) - return(NULL) - } - - if (!check_mismatching_repetitions(downloads, meta$repetitions)) { - warning(glue::glue('Discard experiment {experiment$experiment_id} ', - 'due to mismatching repetitions')) - return(NULL) - } + downloads <- process_incomplete_downloads( + downloads, + pieces, + discard_incomplete + ) |> + process_incomplete_repetitions(meta$repetitions, allow_missing) download_times <- compute_download_times( meta, diff --git a/analysis/final/static-dissemination.Rmd b/analysis/final/static-dissemination.Rmd index f4bbac0..7ce627e 100644 --- a/analysis/final/static-dissemination.Rmd +++ b/analysis/final/static-dissemination.Rmd @@ -20,7 +20,7 @@ devtools::load_all() This is data that's been pre-parsed from an experiment [log source](https://github.com/codex-storage/bittorrent-benchmarks/blob/1ee8ea8a35a2c0fccea6e7c955183c4ed03eebb3/benchmarks/logging/sources.py#L27). ```{r} -deluge <- read_all_experiments('./data/deluge-g1737553501/') +deluge <- read_all_experiments('./data/g1738145663/') ``` Computes the benchmark statistics from raw download logs. @@ -31,7 +31,7 @@ benchmarks <- lapply(deluge, function(experiment) { meta <- experiment$meta completion <- experiment |> download_times() |> - completion_time_stats() + completion_time_stats(meta) if (is.null(completion)) { NULL @@ -83,4 +83,3 @@ ggplot(benchmarks) + ) + ylim(c(0,NA)) ``` -The data looks largely sane: a larger seeder ratio makes performance somewhat better; though not nearly as consistently as one would hope, at least in this data, and there is a linear performance degradation trend as the network grows larger. Also, the $100\text{MB}$ file seems to generate much better-behaved data than the $1\text{GB}$ case with those trends; i.e., larger seeder ratio improving performance, and network size linearly degrading it, being more pronounced.