extend analysis with time to first byte and transfer time

This commit is contained in:
gmega 2026-04-28 14:05:14 -03:00
parent 8a834ecfee
commit 74e6486d56
No known key found for this signature in database
GPG Key ID: 6290D34EAD824B18
3 changed files with 100 additions and 70 deletions

View File

@ -2,3 +2,5 @@
^renv\.lock$
^.*\.Rproj$
^\.Rproj\.user$
^\.positai$
^\.claude$

View File

@ -94,13 +94,21 @@ compute_download_times <- function(meta, request_event, download_metric, group_i
) |>
group_by(node, run, seed_set) |>
mutate(
time_to_first_byte = min(timestamp),
lookup_time = as.numeric(time_to_first_byte - seed_request_time)
# The time elapsed between the instant we start the download and the time
# we see the first log entry is approximated as "time to first byte".
# In reality, this factors in lookup time, swarm bootstrap time, and
# part of the download itself.
first_byte_t = min(timestamp),
first_byte = as.numeric(first_byte_t - seed_request_time),
# "Transfer" time is the time elapsed between the first and the last log
# entries, i.e., roughly the total download time minus the time to first
# byte. Again, this is approximated, and likely reflects a shorter download
# time than the real download time.
transfer = as.numeric(max(timestamp) - first_byte_t)
) |>
ungroup()
if (nrow(download_times |>
filter(elapsed_download_time < 0 | lookup_time < 0)) > 0) {
filter(elapsed_download_time < 0 | first_byte < 0 | transfer < 0)) > 0) {
stop('Calculation for download times contains negative numbers')
}
@ -137,36 +145,49 @@ download_times <- function(experiment, piece_count_distinct, discard_incomplete
completion_time_stats <- function(download_times, meta) {
completion_times <- download_times |>
filtered <- download_times |>
filter(!is.na(elapsed_download_time),
is_completed(completed)) |>
pull(elapsed_download_time)
is_completed(completed))
n_experiments <- meta$repetitions * meta$seeder_sets
n_leechers <- meta$nodes$network_size - meta$seeders
n_points <- n_experiments * n_leechers
tibble(
n = length(completion_times),
n = nrow(filtered),
expected_n = n_points,
missing = expected_n - n,
min = min(completion_times),
p05 = quantile(completion_times, p = 0.05),
p10 = quantile(completion_times, p = 0.10),
p20 = quantile(completion_times, p = 0.20),
p25 = quantile(completion_times, p = 0.25),
median = median(completion_times),
p75 = quantile(completion_times, p = 0.75),
p80 = quantile(completion_times, p = 0.80),
p90 = quantile(completion_times, p = 0.90),
p95 = quantile(completion_times, p = 0.95),
max = max(completion_times),
completion = distributional_stats(filtered$elapsed_download_time),
first_byte = distributional_stats(filtered$first_byte),
transfer = distributional_stats(filtered$transfer)
) |>
unnest(
cols = c(completion, first_byte, transfer),
names_sep = '_'
)
}
# Summarises a numeric vector into a one-row tibble of distributional
# statistics.
#
# Args:
#   x: numeric vector of observations. It must not contain NA, as no `na.rm`
#     handling is applied and `quantile()` rejects missing values by default.
#
# Returns:
#   A one-row tibble with columns `min`, `p05`, `p10`, `p20`, `p25`, `median`,
#   `p75`, `p80`, `p90`, `p95`, `max`, `iqr` (p75 - p25), `ci` (half-width of
#   the notch interval around the median), and `w_top` / `w_bottom`
#   (median +/- ci).
distributional_stats <- function(x) {
  n <- length(x)

  # `probs` is spelled out (no partial argument matching) and `names = FALSE`
  # stops the "5%"-style names from leaking into the tibble columns and into
  # everything derived from them (iqr, ci, w_top, w_bottom).
  q <- function(prob) quantile(x, probs = prob, names = FALSE)

  tibble(
    min = min(x),
    p05 = q(0.05),
    p10 = q(0.10),
    p20 = q(0.20),
    p25 = q(0.25),
    median = median(x),
    p75 = q(0.75),
    p80 = q(0.80),
    p90 = q(0.90),
    p95 = q(0.95),
    max = max(x),
    # Later columns refer to the columns defined above.
    iqr = p75 - p25,
    # This gives us roughly a 95% ci for comparing medians.
    ci = (1.58 * iqr) / sqrt(n),
    w_top = median + ci,
    w_bottom = median - ci
  )
}
check_seeder_count <- function(download_times, seeders) {
@ -212,13 +233,12 @@ compute_speedups <- function(benchmarks, baseline, compare) {
baseline_data <- benchmarks |>
filter(label == baseline) |>
select(
experiment_type, label, network_size, seeders, leechers, file_size, median
experiment_type, label, network_size, seeders, leechers, file_size, completion_median
) |>
rename(baseline_median = median)
rename(baseline_median = completion_median)
lapply(compare, function(compare_label) {
browser()
benchmarks |>
filter(label == compare_label) |>
inner_join(
@ -226,7 +246,7 @@ compute_speedups <- function(benchmarks, baseline, compare) {
by = c('network_size', 'seeders', 'leechers', 'file_size')
) |>
mutate(
relative_median = median / baseline_median
relative_median = completion_median / baseline_median
) |>
mutate(label = label.x) |>
select(-baseline_median, -label.y, -label.x)

View File

@ -31,9 +31,10 @@ devtools::load_all()
```
```{r message = FALSE, include = !knitr::is_html_output()}
experiments <- read_all_experiments('./data/devnet/optimized/g1761924045/', label = 'deluge') |>
merge_experiments(read_all_experiments('./data/devnet/optimized/g1762505060/', label ='codex-baseline')) |>
merge_experiments(read_all_experiments('./data/devnet/optimized/g1761729711/', label = 'codex-optimized'))
experiments <- read_all_experiments('./data/do/g1761924045/', label = 'deluge') |>
merge_experiments(read_all_experiments('./data/do/g1762505060/', label = 'codex-baseline')) |>
merge_experiments(read_all_experiments('./data/do/g1761729711/', label = 'codex-optimized')) |>
merge_experiments(read_all_experiments('./data/do/g1775565300/', label = 'new-protocol'))
```
```{r include = !knitr::is_html_output()}
@ -45,7 +46,7 @@ COUNT_DISTINCT = list(
```{r message = FALSE, include = !knitr::is_html_output()}
benchmarks <- lapply(experiments, function(experiment) {
print(glue::glue('Process {experiment$experiment_id}'))
print(glue::glue('Process {experiment$experiment_id} - {experiment$label}'))
download_time_stats <- tryCatch({
meta <- experiment$meta
completion <- experiment |>
@ -77,9 +78,12 @@ benchmarks <- lapply(experiments, function(experiment) {
levels = rlang::parse_bytes(as.character(
unique(file_size[order(file_size, decreasing = TRUE)])))),
seeder_ratio = seeders / network_size,
median_speed = file_size_bytes / median,
p25_speed = file_size_bytes / p25,
p75_speed = file_size_bytes / p75
completion_median_speed = file_size_bytes / completion_median,
completion_p25_speed = file_size_bytes / completion_p25,
completion_p75_speed = file_size_bytes / completion_p75,
transfer_median_speed = file_size_bytes / transfer_median,
transfer_p25_speed = file_size_bytes / transfer_p25,
transfer_p75_speed = file_size_bytes / transfer_p75
) |>
relocate(file_size, network_size, seeders, leechers, file_size_bytes)
```
@ -114,58 +118,62 @@ DT::datatable(
relative_performance <- compute_speedups(
benchmarks = benchmarks,
base = 'deluge',
compare = c('codex-baseline', 'codex-optimized')
compare = c('codex-baseline', 'codex-optimized', 'new-protocol')
)
```
## Median Download Speed
```{r fig.cap='Median download speed for Deluge and Codex', fig.width = 11, message = FALSE, echo = FALSE}
ggplot(benchmarks, aes(col = label, fill = label, group = label)) +
geom_ribbon(aes(ymin = p25_speed, ymax = p75_speed, x = network_size, fill = label, alpha = 0.5), col = 'lightgray') +
geom_point(aes(x = network_size, y = p25_speed), col = 'darkgray', size=10.0, shape='-') +
geom_point(aes(x = network_size, y = p75_speed), col = 'darkgray', size=10.0, shape='-') +
geom_line(aes(x = network_size, y = median_speed)) +
geom_point(aes(x = network_size, y = median_speed)) +
ylab('median download speed (bytes/second)') +
xlab('network size') +
theme_minimal(base_size=15) +
scale_y_continuous(labels = function(x) paste0(scales::label_bytes()(x), '/s')) +
facet_grid(
file_size ~ seeder_ratio,
labeller = labeller(
seeder_ratio = as_labeller(function(x) {
paste0("seeder ratio: ", scales::percent(as.numeric(x)))
}))
) +
scale_color_discrete(name = '') +
guides(fill = 'none', alpha = 'none')
comparison_plot(
benchmarks,
completion_p25_speed,
completion_p75_speed,
completion_median_speed,
ylab = 'median download speed (bytes/second)',
free_y = TRUE
) + Y_BPS
```
## Median Transfer Speed
```{r fig.width = 11, message = FALSE, echo = FALSE}
comparison_plot(
benchmarks,
transfer_p25_speed,
transfer_p75_speed,
transfer_median_speed,
ylab = 'median transfer speed (bytes/second)',
free_y = TRUE
) + Y_BPS
```
## Median Download Time
```{r fig.cap='Median time to download a whole file for Deluge and Codex', fig.width = 11, message = FALSE, echo = FALSE}
ggplot(benchmarks, aes(col = label, fill = label, group = label)) +
geom_ribbon(aes(ymin = p25, ymax = p75, x = network_size, fill = experiment_type, alpha = 0.5), col = 'lightgray') +
geom_point(aes(x = network_size, y = p25), col = 'darkgray', size=10.0, shape='-') +
geom_point(aes(x = network_size, y = p75), col = 'darkgray', size=10.0, shape='-') +
geom_line(aes(x = network_size, y = median)) +
geom_point(aes(x = network_size, y = median)) +
ylab('median download time') +
xlab('network size') +
theme_minimal(base_size = 15) +
scale_y_continuous(labels = scales::label_timespan()) +
facet_grid(
scales = 'free_y',
file_size ~ seeder_ratio,
labeller = labeller(
seeder_ratio = as_labeller(function(x) {
paste0("seeder ratio: ", scales::percent(as.numeric(x)))
}))
) +
scale_color_discrete(name = '') +
guides(fill = 'none', alpha = 'none')
comparison_plot(
benchmarks,
completion_p25,
completion_p75,
completion_median,
ylab = 'median download time',
free_y = TRUE
) + Y_TIMESPAN
```
## Median Time to First Byte
The time elapsed from the moment at which we ask a node to download a file to the moment at which it logs having downloaded the first $x\%$ of the file -- where $x$ is whatever the logging granularity is -- marks our time to first byte. This is actually an approximation which factors in _i)_ DHT lookup latency; _ii)_ swarm bootstrap latency; and _iii)_ a fraction, typically $1/100^{th}$, of the download time. This should impact smaller files more than it impacts larger files.
```{r fig.cap='Median time-to-first-byte for Deluge and Codex', fig.width = 11, message = FALSE, echo = FALSE}
# Time-to-first-byte plot: passes the p25, p75 and median first-byte columns
# of `benchmarks` to comparison_plot(). The y-axis label names the plotted
# metric (time to first byte), matching the section heading and figure caption.
comparison_plot(
  benchmarks,
  first_byte_p25,
  first_byte_p75,
  first_byte_median,
  ylab = 'median time to first byte',
  free_y = TRUE
) + Y_TIMESPAN
```
## Median Download Time Ratio