feat: add lossy analysis

Instead of discarding an entire experiment when it contains incomplete downloads or missing repetitions, the analysis now drops the offending rows (with a warning) and keeps the rest. The new `discard_incomplete` and `allow_missing` flags on `download_times()` control whether these conditions warn or abort, and `completion_time_stats()` now reports how many completion times were expected versus observed (`n`, `expected_n`, `missing`).

Author: gmega
Date: 2025-01-30 11:07:47 -03:00
Parent: 94893c0f93
Commit: df087127de
GPG Key ID: 6290D34EAD824B18 (no known key found for this signature in database)
2 changed files with 38 additions and 25 deletions

View File

@@ -1,7 +1,7 @@
 PIECE_SIZE <- 262144
 
-piece_count <- function(experiment) {
-  experiment$meta$file_size / PIECE_SIZE
+piece_count <- function(experiment_meta) {
+  experiment_meta$file_size / PIECE_SIZE
 }
 
 extract_repetitions <- function(deluge_torrent_download) {
@@ -26,17 +26,23 @@ compute_pieces <- function(deluge_torrent_download, n_pieces) {
     mutate(completed = piece_count / n_pieces)
 }
 
-check_incomplete_downloads <- function(deluge_torrent_download, n_pieces) {
+process_incomplete_downloads <- function(deluge_torrent_download, n_pieces, discard_incomplete) {
   incomplete_downloads <- deluge_torrent_download |>
     group_by(node, seed_set, run) |>
     count() |>
     ungroup() |>
     filter(n != n_pieces)
 
-  nrow(incomplete_downloads) == 0
+  if(nrow(incomplete_downloads) > 0) {
+    (if (!discard_incomplete) stop else warning)(
+      'Experiment contained incomplete downloads.')
+  }
+
+  deluge_torrent_download |> anti_join(
+    incomplete_downloads, by = c('node', 'seed_set', 'run'))
 }
 
-check_mismatching_repetitions <- function(deluge_torrent_download, repetitions) {
+process_incomplete_repetitions <- function(deluge_torrent_download, repetitions, allow_missing) {
   mismatching_repetitions <- deluge_torrent_download |>
     select(seed_set, node, run) |>
     distinct() |>
@@ -44,7 +50,12 @@ check_mismatching_repetitions <- function(deluge_torrent_download, repetitions)
     count() |>
     filter(n != repetitions)
 
-  nrow(mismatching_repetitions) == 0
+  if(nrow(mismatching_repetitions) > 0) {
+    (if (!allow_missing) stop else warning)(
+      'Experiment data did not have all repetitions.')
+  }
+
+  deluge_torrent_download
 }
 
 compute_download_times <- function(meta, request_event, deluge_torrent_download, group_id) {
@@ -111,13 +122,21 @@ download_stats <- function(download_times) {
   )
 }
 
-completion_time_stats <- function(download_times) {
+completion_time_stats <- function(download_times, meta) {
+  n_pieces <- meta |> piece_count()
+
   completion_times <- download_times |>
-    filter(!is.na(elapsed_download_time)) |>
+    filter(!is.na(elapsed_download_time),
+           piece_count == n_pieces) |>
     pull(elapsed_download_time)
 
+  n_experiments <- meta$repetitions * meta$seeder_sets
+  n_leechers <- meta$nodes$network_size - meta$seeders
+  n_points <- n_experiments * n_leechers
+
   tibble(
+    n = length(completion_times),
+    expected_n = n_points,
+    missing = expected_n - n,
     min = min(completion_times),
     p05 = quantile(completion_times, p = 0.05),
     p10 = quantile(completion_times, p = 0.10),
@@ -126,28 +145,23 @@ completion_time_stats <- function(download_times) {
     p80 = quantile(completion_times, p = 0.80),
     p90 = quantile(completion_times, p = 0.90),
     p95 = quantile(completion_times, p = 0.95),
-    max = max(completion_times)
+    max = max(completion_times),
   )
 }
 
-download_times <- function(experiment) {
+download_times <- function(experiment, discard_incomplete = TRUE, allow_missing = TRUE) {
   meta <- experiment$meta
-  pieces <- experiment |> piece_count()
+  pieces <- experiment$meta |> piece_count()
   downloads <- experiment$deluge_torrent_download |>
     extract_repetitions() |>
     compute_pieces(pieces)
 
-  if (!check_incomplete_downloads(downloads, pieces)) {
-    warning(glue::glue('Discard experiment {experiment$experiment_id} ',
-                       'due to incomplete downloads'))
-    return(NULL)
-  }
-
-  if (!check_mismatching_repetitions(downloads, meta$repetitions)) {
-    warning(glue::glue('Discard experiment {experiment$experiment_id} ',
-                       'due to mismatching repetitions'))
-    return(NULL)
-  }
+  downloads <- process_incomplete_downloads(
+    downloads,
+    pieces,
+    discard_incomplete
+  ) |>
+    process_incomplete_repetitions(meta$repetitions, allow_missing)
 
   download_times <- compute_download_times(
     meta,
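
For reference, here is a minimal usage sketch (not part of this commit) of the reworked entry point; it assumes `experiment` is one element of the list returned by `read_all_experiments()`, as in the notebook below.

```r
# Sketch only: `experiment` is assumed to come from read_all_experiments().

# Default, "lossy" behaviour: rows from incomplete downloads are dropped with a
# warning, and missing repetitions are tolerated with a warning.
times <- download_times(experiment)

# Strict behaviour: either condition aborts the analysis via stop().
times_strict <- download_times(
  experiment,
  discard_incomplete = FALSE,
  allow_missing = FALSE
)
```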

View File

@@ -20,7 +20,7 @@ devtools::load_all()
 This is data that's been pre-parsed from an experiment [log source](https://github.com/codex-storage/bittorrent-benchmarks/blob/1ee8ea8a35a2c0fccea6e7c955183c4ed03eebb3/benchmarks/logging/sources.py#L27).
 
 ```{r}
-deluge <- read_all_experiments('./data/deluge-g1737553501/')
+deluge <- read_all_experiments('./data/g1738145663/')
 ```
 
 Computes the benchmark statistics from raw download logs.
@@ -31,7 +31,7 @@ benchmarks <- lapply(deluge, function(experiment) {
   meta <- experiment$meta
   completion <- experiment |>
     download_times() |>
-    completion_time_stats()
+    completion_time_stats(meta)
 
   if (is.null(completion)) {
     NULL
@@ -83,4 +83,3 @@ ggplot(benchmarks) +
   ) +
   ylim(c(0,NA))
 ```
-The data looks largely sane: a larger seeder ratio makes performance somewhat better; though not nearly as consistently as one would hope, at least in this data, and there is a linear performance degradation trend as the network grows larger. Also, the $100\text{MB}$ file seems to generate much better-behaved data than the $1\text{GB}$ case with those trends; i.e., larger seeder ratio improving performance, and network size linearly degrading it, being more pronounced.
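
Since the analysis is now lossy, the new `n`, `expected_n`, and `missing` columns emitted by `completion_time_stats()` make it easy to see how much data a given run actually lost. A minimal sketch, assuming `benchmarks` has already been collapsed into a single tibble (as the `ggplot(benchmarks)` call in the notebook implies):

```r
library(dplyr)

# Sketch only: list experiments where fewer completion times were observed
# than expected, i.e. where the lossy analysis dropped data points.
benchmarks |>
  select(n, expected_n, missing) |>
  filter(missing > 0) |>
  arrange(desc(missing))
```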