diff --git a/analysis/exploratory/collect-logs.sh b/analysis/exploratory/collect-logs.sh
index 7fbd99f..be8f217 100755
--- a/analysis/exploratory/collect-logs.sh
+++ b/analysis/exploratory/collect-logs.sh
@@ -1,16 +1,19 @@
 #!/usr/bin/env bash
 set -e
 
-if [ "$#" -ne 1 ]; then
-  echo "Usage: $0 <output>"
+if [ "$#" -ne 2 ]; then
+  echo "Usage: $0 <group_id> <output>"
   exit 1
 fi
 
+group_id="${1}"
+output="${2}"
+
 # TODO build auto naming for experiment folders based on metadata
-mkdir -p "${1}"
+mkdir -p "${output}"
 
 echo "Collect"
-kubectl logs --prefix -n codex-benchmarks -l "app in (deluge-nodes,testrunner)" --tail=-1 > "${1}/raw-logs.log"
+kubectl logs --prefix -n codex-benchmarks -l "app.kubernetes.io/part-of=${group_id}" --all-containers --tail=-1 > "${output}/raw-logs.log"
 
 echo "Parse"
-python -m benchmarks.cli logs "${1}/raw-logs.log" "${1}/parsed"
\ No newline at end of file
+python -m benchmarks.cli logs single "${output}/raw-logs.log" "${output}/parsed"
\ No newline at end of file
diff --git a/analysis/exploratory/deluge-exploratory-analysis.Rmd b/analysis/exploratory/exploratory-analysis.Rmd
similarity index 58%
rename from analysis/exploratory/deluge-exploratory-analysis.Rmd
rename to analysis/exploratory/exploratory-analysis.Rmd
index c91433d..9e51403 100644
--- a/analysis/exploratory/deluge-exploratory-analysis.Rmd
+++ b/analysis/exploratory/exploratory-analysis.Rmd
@@ -1,68 +1,66 @@
 ---
-title: "Deluge Download Times -- Exploratory Analysis"
+title: "Static Network Experiment"
+subtitle: "Download Times -- Single Experiment Exploratory Analysis"
 output:
   bookdown::html_notebook2:
     number_sections: TRUE
     toc: TRUE
 ---
 
-$$
-\newcommand{\addtorrent}{\text{AddTorrent}}
-$$
+## Goals
+
+The goal of this notebook is to provide a simple analysis of download time distributions over repeated runs of our [static dissemination experiment](https://github.com/codex-storage/bittorrent-benchmarks/blob/95651ad9d7e5ac4fb7050767cbac94ea75c8c07b/benchmarks/core/experiments/static_experiment.py#L22) over a _single parameter set_; i.e., a set of experiments for which:
+
+* network size;
+* number of seeders;
+* number of leechers;
+* file size;
+
+remain constant.
 
 ```{r, warning=FALSE, message=FALSE}
 library(tidyverse)
 library(jsonlite)
 ```
 
-
 ## Experiment Parameters
 
 ```{r}
-EXPERIMENT <- '10-network-4-seeders-4-seeder_sets-100MB-filesize'
-experiment_file <- function(filename) file.path(EXPERIMENT, 'parsed', filename)
+EXPERIMENT_ROOT <- 'data/codex/parsed'
+experiment_file <- function(filename) file.path(EXPERIMENT_ROOT, filename)
 ```
 
-The torrent piece size is set at torrent creation time by [torrentool](https://github.com/idlesign/torrentool/blob/5f37d6dcc304758bae46d01c63e5be0f0a348bfc/torrentool/torrent.py#L354).
-
-```{r}
-PIECE_SIZE <- 262144
-```
-
 ```{r}
-experiment_meta <- jsonlite::read_json(experiment_file('deluge_experiment_config_log_entry.jsonl'))
+experiment_meta <- jsonlite::read_json(fs::dir_ls(EXPERIMENT_ROOT, glob='*.jsonl'))
 ```
 
-```{r results='asis'}
-n_pieces <- experiment_meta$file_size / PIECE_SIZE
-cat(paste0("File has ", rlang::as_bytes(experiment_meta$file_size), " and ", n_pieces, " pieces (blocks)."))
-```
-
-```{r results='asis'}
+```{r}
 n_leechers <- length(experiment_meta$nodes$nodes) - experiment_meta$seeders
-cat(paste0("Network has ", length(experiment_meta$nodes$nodes), " nodes with ", experiment_meta$seeders, " seeders and ", n_leechers, " leechers."))
-
+tribble(
+  ~parameter, ~value,
+  'network size', length(experiment_meta$nodes$nodes),
+  'number of seeders', experiment_meta$seeders,
+  'number of leechers', n_leechers,
+  'file size (bytes)', experiment_meta$file_size,
+)
 ```
 
-## Logs
-
-Read and extract run id and seed set from the dataset name.
+## Download Logs
 
 ```{r}
 downloads <- read_csv(
-    experiment_file('deluge_torrent_download.csv'),
+    fs::dir_ls(EXPERIMENT_ROOT, glob='*download*'),
     show_col_types = FALSE,
   ) |>
   mutate(
-    temp = str_remove(torrent_name, '^dataset-'),
+    temp = str_remove(dataset_name, '^dataset-'),
     seed_set = as.numeric(str_extract(temp, '^\\d+')),
-    run = as.numeric(str_extract(temp, '\\d+$'))
+    run = as.numeric(str_extract(temp, '\\d+$')),
+    value = value * experiment_meta$download_metric_unit_bytes
   ) |>
-  rename(piece = value) |>
   select(-temp, -name)
 ```
 
-Since what we get are piece indices and they might be out of order, we need to actually count how many pieces were downloaded by the node up until a given instant:
-
 ```{r}
 downloads <- downloads |>
   group_by(node, seed_set, run) |>
@@ -71,12 +69,14 @@ downloads <- downloads |>
     piece_count = seq_along(timestamp)
   ) |>
   ungroup() |>
-  mutate(completed = piece_count / n_pieces)
+  mutate(completed = value / experiment_meta$file_size)
 ```
 
-We can have a brief look at the data to see that it makes sense.
+## Results
 
-```{r fig.width=10, fig.height=10}
+### Sanity Checks and Loss Statistics
+
+```{r fig.cap='Raw data plot, per experiment.', fig.width=10, fig.height=10}
 ggplot(downloads |>
          filter(seed_set < 3) |>
         group_by(seed_set, run) |>
@@ -92,20 +92,13 @@ ggplot(downloads |>
   theme_bw(base_size = 15)
 ```
 
-As we can see, the data seems to make sense. To the left we see the "download times" for seeders, which is almost instantaneous, followed by the downloads for the leechers. We see some variability across experiments, with some nodes seemingly struggling to complete their downloads at times.
-
-## Results
-
-### Sanity Checks
-
 Have any nodes failed to download the entire file?
 
 ```{r}
 downloads |>
   group_by(node, seed_set, run) |>
-  count() |>
-  ungroup() |>
-  filter(n != n_pieces)
+  summarise(completed = max(completed)) |>
+  filter(completed < 1.0)
 ```
 
 Do we have as many runs and seed sets as we expect?
 
@@ -122,15 +115,16 @@ downloads |>
 
 ### Computing Download Times
 
-We define the _download time_ for a Deluge node $d$ as the time elapsed from the client's response to an $\addtorrent$ request and the time at which the client reports having received the last piece of the downloaded file. Since seeders are already in possession of the file by construction, we only measure download times at _leechers_.
+We define the _download time_ for a node $d$ as the time elapsed between the client's response to a [request to download a dataset](https://github.com/codex-storage/bittorrent-benchmarks/blob/95651ad9d7e5ac4fb7050767cbac94ea75c8c07b/benchmarks/core/network.py#L58) and the time at which the client reports having received the last piece of the downloaded file. Both the form of this request and how it is issued are client-specific, but typically involve an RPC or REST API call of one flavour or another. Since seeders are already in possession of the file by construction, we only measure download times at _leechers_.
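+
+In symbols, writing $t_d^{\text{req}}$ for the instant at which the download request to leecher $d$ completes, and $t_d^{\text{last}}$ for the instant at which $d$ reports receipt of the last piece, the download time $T_d$ is:
+
+$$
+T_d = t_d^{\text{last}} - t_d^{\text{req}}
+$$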
 
 ```{r}
-add_torrent_requests <- read_csv(
-  experiment_file('request_event.csv'), show_col_types = FALSE)
+download_requests <- read_csv(
+  experiment_file('request_event.csv'), show_col_types = FALSE) |>
+  mutate(destination = gsub("\"", "", destination))
 ```
 
 ```{r}
-download_start <- add_torrent_requests |>
+download_start <- download_requests |>
   select(-request_id) |>
   filter(name == 'leech', type == 'RequestEventType.end') |>
   mutate(
@@ -188,6 +182,7 @@ ggplot(download_time_stats) +
   theme_minimal() +
   ylab("completion") +
   xlab("time (seconds)") +
-  ggtitle(paste0('download time (Deluge, ',rlang::as_bytes(experiment_meta$file_size),' file)'))
+  scale_y_continuous(labels = scales::percent) +
+  ggtitle(paste0('download time (Codex, ', rlang::as_bytes(experiment_meta$file_size), ' file)'))
 ```
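+
+As a rough numerical companion to the completion-over-time plot above, the sketch below tabulates median and tail download times per seed set. It assumes a `download_times` frame with one row per leecher download and a numeric `download_time` column in seconds; these names are illustrative and may differ from the notebook's actual intermediate frames:
+
+```{r}
+# Hypothetical summary; `download_times` and `download_time` are assumed
+# names for the per-download frame computed in the previous section.
+download_times |>
+  group_by(seed_set) |>
+  summarise(
+    median_seconds = median(download_time),
+    p95_seconds = quantile(download_time, 0.95),
+    .groups = 'drop'
+  )
+```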