add latency and diameter analysis

This commit is contained in:
gmega 2023-09-13 14:14:46 -03:00
parent 3ef22a70bc
commit 68353ad6d2
13 changed files with 3609 additions and 27 deletions

View File

@ -0,0 +1 @@
source("renv/activate.R")

View File

@ -17,7 +17,8 @@ Depends:
Hmisc,
playaxr,
plotly,
DT
DT,
pbmcapply
Remotes: gmega/playaxr
Suggests:
devtools,

View File

@ -0,0 +1,4 @@
# Generated by roxygen2: do not edit by hand
S3method(filter,list)
S3method(pull,list)

View File

@ -0,0 +1,26 @@
#' Keeps list items that match a condition
#'
#' List version of dplyr's filter verb.
#'
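#' @examples
#' # A minimal usage sketch, mirroring the package tests (assumes dplyr is
#' # attached so that `filter` dispatches on lists):
#' items <- list(list(a = 1, b = 2), list(a = 2, b = 1))
#' items |> filter(a == 1)  # keeps only the first element
#'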
#' @export
filter.list <- function(x, pred, ...) {
expr <- substitute(pred)
caller <- rlang::caller_env()
Filter(function(item) eval(expr, envir = list2env(item, parent = caller)), x)
}
#' Extract a single column from a list of lists
#'
#' List version of dplyr's pull verb. Note that because vectors cannot hold
#' mixed types, we are not always able to return a vector. Expect a vector
#' only for simple types, or for types that define their own method for the
#' concatenation operator `c()`.
#'
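#' @examples
#' # A minimal usage sketch, mirroring the package tests; missing entries
#' # come back as NA:
#' records <- list(list(a = 1, b = 2), list(a = 2), list(b = 3))
#' records |> pull(a)  # c(1, 2, NA)
#'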
#' @export
pull.list <- function(x, col, ...) {
colname <- deparse(substitute(col))
do.call("c", lapply(x, function(element) {
value <- element[[colname]]
if (is.null(value)) NA else value
}))
}

View File

@ -1,7 +1,7 @@
quantile_df <- function(x, probs = c(0.25, 0.5, 0.75)) {
tibble(
val = quantile(x, probs, na.rm = TRUE),
quant = probs
quant = formatted_factor(probs, function(x) glue('{x*100}'))
)
}
@ -11,7 +11,20 @@ formatted_factor <- function(x, formatter) {
factor(formatter(x), levels)
}
dataset <- function(symbol, block, storage = "csv", reload = FALSE) {
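# Storage back-ends for dataset(), keyed by format name.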
.CODEC = list(
csv = list(
read = function(file) read_csv(file = file, show_col_types = FALSE),
write = write_csv
),
rds = list(
read = read_rds,
write = write_rds
)
)
.CODEC$csv.bz2 = .CODEC$csv
dataset <- function(symbol, block, storage = "csv", reload = FALSE, recalc = FALSE) {
varname <- deparse(substitute(symbol))
env <- rlang::caller_env()
if ((varname %in% names(env)) && !reload) {
@ -19,15 +32,22 @@ dataset <- function(symbol, block, storage = "csv", reload = FALSE) {
return()
}
fname <- glue('./data/{varname}.{storage}')
env[[varname]] <- if (file.exists(fname)) {
env[[varname]] <- if (file.exists(fname) && !recalc) {
message(glue("Reading cached dataset from {fname}"))
read_csv(fname, show_col_types = FALSE)
.CODEC[[storage]]$read(file = fname)
} else {
message("Evaluating dataset expression.")
if (!dir.exists("./data")) dir.create("./data")
contents <- block
message(glue("Write dataset {fname}."))
write_csv(contents, file = fname)
.CODEC[[storage]]$write(contents, file = fname)
contents
}
}
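# Hypothetical usage sketch (`expensive` and `slow_simulation` are made-up
# names): the first call evaluates the block, writes ./data/expensive.rds and
# binds the result to `expensive` in the calling environment; later calls
# reuse the in-memory value (unless reload = TRUE) or the cached file (unless
# recalc = TRUE).
#
#   dataset(expensive, storage = 'rds', slow_simulation())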
timeit <- function(block) {
start <- Sys.time()
result <- block
# Force seconds explicitly: `Sys.time() - start` picks difftime units
# automatically, so minutes could otherwise print as if they were seconds.
elapsed <- difftime(Sys.time(), start, units = "secs")
print(glue('Took {elapsed} seconds.'))
result
}
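# Usage sketch (hypothetical block): prints the elapsed time, returns 42.
#   x <- timeit({ Sys.sleep(1); 42 })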

View File

@ -0,0 +1,11 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/dplyr-extras.R
\name{filter.list}
\alias{filter.list}
\title{Keeps list items that match a condition}
\usage{
\method{filter}{list}(x, pred, ...)
}
\description{
List version of dplyr's filter verb.
}

View File

@ -0,0 +1,13 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/dplyr-extras.R
\name{pull.list}
\alias{pull.list}
\title{Extract a single column from a list of lists}
\usage{
\method{pull}{list}(x, col, ...)
}
\description{
List version of dplyr's pull verb. Note that because there is no type
enforcement for lists of lists, we'll return a vector only if the actual
type of all elements in the column is the same. Otherwise we return a list.
}

File diff suppressed because it is too large

View File

@ -0,0 +1,7 @@
library/
local/
cellar/
lock/
python/
sandbox/
staging/

File diff suppressed because it is too large

View File

@ -9,11 +9,30 @@ bibliography: [bibliography.bib]
link-citations: true
---
```{css zoom-lib-src, echo = FALSE}
script src = "https://ajax.googleapis.com/ajax/libs/jquery/3.4.1/jquery.min.js"
```
```{js zoom-jquery, echo = FALSE}
$(document).ready(function() {
$('body').prepend('<div class=\"zoomDiv\"><img src=\"\" class=\"zoomImg\"></div>');
// onClick function for all plots (img's)
$('img:not(.zoomImg)').click(function() {
$('.zoomImg').attr('src', $(this).attr('src')).css({width: '100%'});
$('.zoomDiv').css({opacity: '1', width: 'auto', border: '1px solid white', borderRadius: '5px', position: 'fixed', top: '50%', left: '50%', marginRight: '-50%', transform: 'translate(-50%, -50%)', boxShadow: '0px 0px 50px #888888', zIndex: '50', overflow: 'auto', maxHeight: '100%'});
});
// onClick function for zoomImg
$('img.zoomImg').click(function() {
$('.zoomDiv').css({opacity: '0', width: '0%'});
});
});
```
```{r cache=FALSE, echo=FALSE, warning=FALSE, message=FALSE}
knitr::read_chunk('R/swarmoverlay.R')
knitr::read_chunk('R/dissemination.R')
devtools::load_all()
knitr::opts_chunk$set(cache = TRUE, warning = FALSE, message = FALSE, cache.lazy = FALSE)
```
# Context
@ -185,38 +204,138 @@ This is all, to a certain degree, obvious, as the probability that a node gets s
The less obvious choice would be to have nodes reject neighbor requests once a threshold is met, effectively truncating the tail of the degree distribution. This could make the bootstrap procedure more complex and slower, as a node would then have to go back to the bootstrap service to request more candidate neighbors. We will keep these options in mind for the next iteration.
## How fast should we expect a block to _percolate_ over the network?
In the absence of a link capacity and/or network delay model, graph topology should dominate dissemination time. The simplest case to analyse is to assume that nodes are able to broadcast the packet to _all of their neighbors_. The main appeal is that this is easy to implement and can already provide some insight.
```{r disseminate-broadcast}
```
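The empty chunk above pulls the actual `disseminate_broadcast` implementation in from `R/dissemination.R` (via the `knitr::read_chunk` calls in the setup chunk). As a rough mental model, broadcast dissemination from a set of sources amounts to a multi-source breadth-first search: a node receives the block at its hop distance to the nearest source. A minimal sketch of that idea using `igraph` follows; the real implementation may differ in its details.
```{r eval=FALSE}
# Sketch only: hop-count latency of broadcast dissemination. Each node's
# latency is its shortest-path (BFS) distance to the closest source.
disseminate_broadcast_sketch <- function(g, sources) {
  hops <- igraph::distances(g, v = sources)  # |sources| x |V| hop matrix
  apply(hops, 2, min)                        # earliest arrival, per node
}
```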
We will take the overlays we had from before and run a simple experiment where we pick $1$, $2$, $3$, and $4$ starting nodes at random in the overlays, and compute the average dissemination times for each.
```{r}
n_sources_max <- 4
```
```{r eval=FALSE}
map(parameters[1], function(parameter) {
map(1:n_sources_max, function(n_sources) {
latencies <- map(1:n_samples, function(instance) {
sources <- sample(1:parameter$v, size = n_sources, replace = FALSE)
latencies <- edge_lists |>
filter(
d == parameter$d,
v == parameter$v,
instance == !!instance
) |>
as_overlay_graph() |>
disseminate_broadcast(sources)
}) |>
list_c() |>
quantile_df(c(0, 0.1, 0.25, 0.50, 0.75, 0.9, 0.95, 1)) |>
mutate(d = parameter$d, v = parameter$v, sources = n_sources)
}) |> bind_rows()
}) |> bind_rows()
```
```{r cache = TRUE}
# Pre-generates graphs as otherwise this will murder simulation performance.
dataset(
graphs,
storage = 'rds',
map(parameters, function(parameter) {
pbmclapply(
1:n_samples,
function(instance) {
list(
d = parameter$d,
v = parameter$v,
instance = instance,
g = edge_lists |>
filter(d == parameter$d, v == parameter$v, instance == !!instance) |>
as_overlay_graph()
)
},
mc.cores = 2 # don't go overboard or it will suck up your RAM and crash your machine :-)
)
}) |> list_c()
)
```
```{r}
dataset(
latency_stats,
storage = 'csv',
map(parameters, function(parameter) {
graph_instances <- graphs |> filter(v == parameter$v & d == parameter$d)
map(1:n_sources_max, function(n_sources) {
raw_latencies <- map(graph_instances, function(graph) {
sources <- sample(1:graph$v, size = n_sources, replace = FALSE)
disseminate_broadcast(graph$g, sources)
}) |>
list_c()
raw_latencies |>
quantile_df(c(0, 0.1, 0.25, 0.50, 0.75, 0.9, 0.95, 1)) |>
rename(stat = quant) |>
bind_rows(
tibble(
stat = c('mean', 'variance', 'sd'),
val = c(mean(raw_latencies), var(raw_latencies), sd(raw_latencies))
)
) |>
mutate(
d = parameter$d,
v = parameter$v,
sources = n_sources
)
}) |> bind_rows()
}) |> bind_rows()
)
```
Fig. \@ref(fig:latencies) shows a grid in which each cell contains a latency $\times$ swarm size (log scale) subplot, filtered by a combination of latency percentile (columns) and initial bootstrap degree $d$ (rows). This means that at cell `percentile: 50` $\times$ `d: 2` we should see the median ($50^{th}$ percentile) latencies for swarms with bootstrap degree $d = 2$. Lines are then coloured based on the number of cache nodes present in the swarm ($1$, $2$, $3$, or $4$).
With that in mind, what we see is that:
1. dissemination times appear to increase logarithmically with swarm size at all percentiles, as evidenced by the straight lines in the log plots;
2. increasing $d$ changes the slope of that increase (base of the log), with the most visibly dramatic change happening from $d = 1$ to $d = 2$. In particular, latencies stay below $10$ hops for all swarm sizes once $d \geq 2$, and the median latency is below $5$ hops;
3. adding cache nodes appears to shift the intercept of the curve, even when considering a very small amount of nodes.
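The logarithmic growth in (1) is what one would expect if these overlays behave like sparse random graphs, whose diameters grow on the order of $\log n / \log d$ for $n$ vertices and average degree $d$; since broadcast latency is bounded by the hop distance to the nearest source, it should inherit that growth. We have not verified this formally for our bootstrap construction, so the plots remain the primary evidence.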
```{r latencies, fig.width = 15, fig.height = 12, fig.cap="**(click to zoom)** Latency percentiles as a function of swarm size and bootstrap degree (d)."}
ggplot(latency_stats |> filter(!stat %in% c('mean', 'variance', 'sd')) |> rename(percentile = stat)) +
geom_hline(yintercept = 5, lty = 2, col = 'orange') +
geom_line(aes(x = v, y = val, col = as.factor(sources)), lwd = 1.1) +
facet_grid(d ~ percentile, labeller = label_both) +
xlab("swarm size (log)") +
ylab("latency (hops)") +
theme_playax() +
theme(legend.position = "bottom") +
labs(col = 'cache nodes') +
scale_x_log10() +
big_fonts(15)
```
Assuming a small block size and a per-hop round-trip time of $s$, this would mean a propagation time proportional to $5 \times s$ for a single block in the absence of congested links; with $s = 100\,\text{ms}$, say, the median node would receive a block in roughly $500\,\text{ms}$. Finally, we see that this loosely tracks the shape of the curves for the diameters of graphs of a given size (Fig. \@ref(fig:diameters)), which is unsurprising.
```{r}
dataset(
graph_diameters,
storage = 'csv',
map(graphs, function(graph)
tibble(d = graph$d,
v = graph$v,
instance = graph$instance,
diameter = diameter(graph$g))
) |>
bind_rows()
)
```
```{r diameters, fig.width = 10, fig.height = 4, fig.cap = "Swarm diameter percentiles as a function of their size."}
ggplot(
graph_diameters |> group_by(d, v) |> reframe(quantile_df(diameter, probs = c(0, 0.1, 0.25, 0.50, 0.75, 0.9, 0.95, 1)))
) +
geom_line(aes(x = v, y = val, col = as.factor(d))) +
facet_grid(.~quant) +
xlab("swarm size (log)") +
ylab("diameter") +
scale_x_log10() +
theme_playax() +
theme(legend.position = 'bottom') +
labs(col = 'bootstrap degree (d)')
```
# Next Steps
There are a few things that we could do next:
1. tweak the bootstrap protocol so that it mitigates swarm age bias -- where older nodes get selected more often as neighbors as part of the bootstrap procedure -- thus reducing degree imbalances. This involves adding some intelligence to the DHT tracker so that it biases samples towards nodes that have been sampled less;
2. run simulations with more realistic proportions of cache nodes. We constrained this to $1$--$4$ nodes for efficiency and to get results out faster, but sizing the cache-node population as a percentage of the swarm may yield more interesting numbers;
3. add more sophisticated network dynamics (e.g. churn, link delays + multi-block simulations) to have more accurate estimates of performance and load.
Other issues potentially worth exploring:
1. performance of compressed bitsets vs. other set reconciliation data structures like inverted Bloom filters;
2. techniques for achieving Byzantine fault tolerance on random overlays.
# References

View File

@ -0,0 +1,12 @@
# This file is part of the standard setup for testthat.
# It is recommended that you do not modify it.
#
# Where should you do additional test configuration?
# Learn more about the roles of various files in:
# * https://r-pkgs.org/testing-design.html#sec-tests-files-overview
# * https://testthat.r-lib.org/articles/special-files.html
library(testthat)
library(`swarm-overlay-sims`)
test_check("swarm-overlay-sims")

View File

@ -0,0 +1,58 @@
test_that(
"should filter a list by predicate", {
a_list <- list(
list(a = 1, b = 2, d = 1),
list(a = 2, b = 1, d = 2),
list(a = 1, b = 2, d = 3)
)
expect_equal(a_list |> filter(a == 1 & b == 2), list(
list(a = 1, b = 2, d = 1),
list(a = 1, b = 2, d = 3)
))
}
)
test_that(
"should factor caller context in predicate evaluation", {
a_list <- list(
list(a = 1, b = 2, d = 1),
list(a = 2, b = 1, d = 2)
)
x <- 1
expect_equal(a_list |> filter(a == 2 & b == x), list(
list(a = 2, b = 1, d = 2)
))
}
)
test_that(
"should pull attribute as vector", {
a_list <- list(
list(a = 1, b = 2, d = 1),
list(a = 2, b = 1, d = 2),
list(a = 1, b = 2, d = 3),
list(b = 2, d = 3)
)
expect_equal(a_list |> pull(a), c(1, 2, 1, NA))
}
)
test_that(
"should return vector when c.XXX is defined", {
a_list <- list(
list(a = 1, date = as.Date('2003-03-03')),
list(a = 2, date = as.Date('2003-03-04')),
list(a = 1, date = as.Date('2003-03-05'))
)
column <- a_list |> pull(date)
expect_equal(class(column), "Date")
expect_equal(column, c(
as.Date('2003-03-03'), as.Date('2003-03-04'), as.Date('2003-03-05')))
}
)