mirror of
https://github.com/codex-storage/codex-research.git
synced 2025-01-09 10:22:38 +00:00
add latency and diameter analysis
This commit is contained in:
parent
3ef22a70bc
commit
68353ad6d2
1
analysis/swarm-overlay-sims/.Rprofile
Normal file
1
analysis/swarm-overlay-sims/.Rprofile
Normal file
@ -0,0 +1 @@
|
||||
source("renv/activate.R")
|
@ -17,7 +17,8 @@ Depends:
|
||||
Hmisc,
|
||||
playaxr,
|
||||
plotly,
|
||||
DT
|
||||
DT,
|
||||
pbmcapply
|
||||
Remotes: gmega/playaxr
|
||||
Suggests:
|
||||
devtools,
|
||||
|
4
analysis/swarm-overlay-sims/NAMESPACE
Normal file
4
analysis/swarm-overlay-sims/NAMESPACE
Normal file
@ -0,0 +1,4 @@
|
||||
# Generated by roxygen2: do not edit by hand
|
||||
|
||||
S3method(filter,list)
|
||||
S3method(pull,list)
|
26
analysis/swarm-overlay-sims/R/dplyr-extras.R
Normal file
26
analysis/swarm-overlay-sims/R/dplyr-extras.R
Normal file
@ -0,0 +1,26 @@
|
||||
#' Keeps list items that match a condition
|
||||
#'
|
||||
#' List version of dplyr's filter verb.
|
||||
#'
|
||||
#' @export
|
||||
filter.list <- function(x, pred, ...) {
|
||||
expr <- substitute(pred)
|
||||
caller <- rlang::caller_env()
|
||||
Filter(function(item) eval(expr, envir = list2env(item, parent = caller)), x)
|
||||
}
|
||||
|
||||
#' Extract a single column from a list of lists
|
||||
#'
|
||||
#' List version of dplyr's pull verb. Note that because not all objects define
|
||||
#' their own object vectors and because vectors cannot have mixed types, we are
|
||||
#' not always able to return a vector. So expect vectors only with simple types
|
||||
#' or types that define their own version of the concatenation operator.
|
||||
#'
|
||||
#' @export
|
||||
pull.list <- function(x, col, ...) {
|
||||
colname <- deparse(substitute(col))
|
||||
do.call("c", lapply(x, function(element) {
|
||||
value <- element[[colname]]
|
||||
if (is.null(value)) NA else value
|
||||
}))
|
||||
}
|
@ -1,7 +1,7 @@
|
||||
quantile_df <- function(x, probs = c(0.25, 0.5, 0.75)) {
|
||||
tibble(
|
||||
val = quantile(x, probs, na.rm = TRUE),
|
||||
quant = probs
|
||||
quant = formatted_factor(probs, function(x) glue('{x*100}'))
|
||||
)
|
||||
}
|
||||
|
||||
@ -11,7 +11,20 @@ formatted_factor <- function(x, formatter) {
|
||||
factor(formatter(x), levels)
|
||||
}
|
||||
|
||||
dataset <- function(symbol, block, storage = "csv", reload = FALSE) {
|
||||
.CODEC = list(
|
||||
csv = list(
|
||||
read = function(file) read_csv(file = file, show_col_types = FALSE),
|
||||
write = write_csv
|
||||
),
|
||||
rds = list(
|
||||
read = read_rds,
|
||||
write = write_rds
|
||||
)
|
||||
)
|
||||
|
||||
.CODEC$csv.bz2 = .CODEC$csv
|
||||
|
||||
dataset <- function(symbol, block, storage = "csv", reload = FALSE, recalc = FALSE) {
|
||||
varname <- deparse(substitute(symbol))
|
||||
env <- rlang::caller_env()
|
||||
if ((varname %in% names(env)) && !reload) {
|
||||
@ -19,15 +32,22 @@ dataset <- function(symbol, block, storage = "csv", reload = FALSE) {
|
||||
return()
|
||||
}
|
||||
fname <- glue('./data/{varname}.{storage}')
|
||||
env[[varname]] <- if (file.exists(fname)) {
|
||||
env[[varname]] <- if (file.exists(fname) && !recalc) {
|
||||
message(glue("Reading cached dataset from {fname}"))
|
||||
read_csv(fname, show_col_types = FALSE)
|
||||
.CODEC[[storage]]$read(file = fname)
|
||||
} else {
|
||||
message("Evaluating dataset expression.")
|
||||
if (!dir.exists("./data")) dir.create("./data")
|
||||
contents <- block
|
||||
message(glue("Write dataset {fname}."))
|
||||
write_csv(contents, file = fname)
|
||||
.CODEC[[storage]]$write(content, file = fname)
|
||||
contents
|
||||
}
|
||||
}
|
||||
|
||||
timeit <- function(block) {
|
||||
start <- Sys.time()
|
||||
result <- block
|
||||
print(glue('Took {Sys.time() - start} seconds.'))
|
||||
result
|
||||
}
|
11
analysis/swarm-overlay-sims/man/filter.list.Rd
Normal file
11
analysis/swarm-overlay-sims/man/filter.list.Rd
Normal file
@ -0,0 +1,11 @@
|
||||
% Generated by roxygen2: do not edit by hand
|
||||
% Please edit documentation in R/dplyr-extras.R
|
||||
\name{filter.list}
|
||||
\alias{filter.list}
|
||||
\title{Keeps list items that match a condition}
|
||||
\usage{
|
||||
\method{filter}{list}(x, pred, ...)
|
||||
}
|
||||
\description{
|
||||
List version of dplyr's filter verb.
|
||||
}
|
13
analysis/swarm-overlay-sims/man/pull.list.Rd
Normal file
13
analysis/swarm-overlay-sims/man/pull.list.Rd
Normal file
@ -0,0 +1,13 @@
|
||||
% Generated by roxygen2: do not edit by hand
|
||||
% Please edit documentation in R/dplyr-extras.R
|
||||
\name{pull.list}
|
||||
\alias{pull.list}
|
||||
\title{Extract a single column from a list of lists}
|
||||
\usage{
|
||||
\method{pull}{list}(x, col, ...)
|
||||
}
|
||||
\description{
|
||||
List version of dplyr's pull verb. Note that because there is no type
|
||||
enforcement for lists of lists, we'll return a vector only if the actual
|
||||
type of all elements in the column is the same. Otherwise we return a list.
|
||||
}
|
2109
analysis/swarm-overlay-sims/renv.lock
Normal file
2109
analysis/swarm-overlay-sims/renv.lock
Normal file
File diff suppressed because it is too large
Load Diff
7
analysis/swarm-overlay-sims/renv/.gitignore
vendored
Normal file
7
analysis/swarm-overlay-sims/renv/.gitignore
vendored
Normal file
@ -0,0 +1,7 @@
|
||||
library/
|
||||
local/
|
||||
cellar/
|
||||
lock/
|
||||
python/
|
||||
sandbox/
|
||||
staging/
|
1201
analysis/swarm-overlay-sims/renv/activate.R
Normal file
1201
analysis/swarm-overlay-sims/renv/activate.R
Normal file
File diff suppressed because it is too large
Load Diff
@ -9,11 +9,30 @@ bibliography: [bibliography.bib]
|
||||
link-citations: true
|
||||
---
|
||||
|
||||
```{css zoom-lib-src, echo = FALSE}
|
||||
script src = "https://ajax.googleapis.com/ajax/libs/jquery/3.4.1/jquery.min.js"
|
||||
```
|
||||
|
||||
```{js zoom-jquery, echo = FALSE}
|
||||
$(document).ready(function() {
|
||||
$('body').prepend('<div class=\"zoomDiv\"><img src=\"\" class=\"zoomImg\"></div>');
|
||||
// onClick function for all plots (img's)
|
||||
$('img:not(.zoomImg)').click(function() {
|
||||
$('.zoomImg').attr('src', $(this).attr('src')).css({width: '100%'});
|
||||
$('.zoomDiv').css({opacity: '1', width: 'auto', border: '1px solid white', borderRadius: '5px', position: 'fixed', top: '50%', left: '50%', marginRight: '-50%', transform: 'translate(-50%, -50%)', boxShadow: '0px 0px 50px #888888', zIndex: '50', overflow: 'auto', maxHeight: '100%'});
|
||||
});
|
||||
// onClick function for zoomImg
|
||||
$('img.zoomImg').click(function() {
|
||||
$('.zoomDiv').css({opacity: '0', width: '0%'});
|
||||
});
|
||||
});
|
||||
```
|
||||
|
||||
```{r cache=FALSE, echo=FALSE, warning=FALSE, message=FALSE}
|
||||
knitr::read_chunk('R/swarmoverlay.R')
|
||||
knitr::read_chunk('R/dissemination.R')
|
||||
devtools::load_all()
|
||||
knitr::opts_chunk$set(cache = TRUE, warning = FALSE, message = FALSE, cache.lazy = FALSE)
|
||||
```
|
||||
|
||||
# Context
|
||||
@ -185,38 +204,138 @@ This is all, to a certain degree, obvious, as the probability that a node gets s
|
||||
|
||||
The less obvious choice would be to have nodes reject neighbor requests once a threshold is met, effectively truncating the tail of the degree distribution. This could make the bootstrap procedure more complex/slower as a node would have to request more nodes from the bootstrap service again. We will keep those in mind for the next iteration.
|
||||
|
||||
## How fast should we expect a block to _percolate_ over the network?
|
||||
## How fast should we expect a block to _percolate_ over the network?
|
||||
|
||||
In the absence of a link capacity and/or network delay model, graph topology should dominate dissemination time. The simplest case to analyse is to assume that nodes are able to broadcast the packet to _all of its neighbors_. The main appeal is that this is easy to implement, and can already provide some insight.
|
||||
|
||||
```{r disseminate-broadcast}
|
||||
```
|
||||
|
||||
|
||||
We will take the overlays we had from before and run a simple experiment where we pick $1, 2, 3$ and $4$ starting nodes chosen at random in the overlays, and compute the average dissemination times for those.
|
||||
|
||||
```{r}
|
||||
n_sources_max <- 4
|
||||
```
|
||||
|
||||
```{r eval=FALSE}
|
||||
map(parameters[1], function(parameter) {
|
||||
map(1:n_sources_max, function(n_sources) {
|
||||
latencies <- map(1:n_samples, function(instance) {
|
||||
sources <- sample(1:parameter$v, size = n_sources, replace = FALSE)
|
||||
latencies <- edge_lists |>
|
||||
filter(
|
||||
d == parameter$d,
|
||||
v == parameter$v,
|
||||
instance == !!instance
|
||||
) |>
|
||||
as_overlay_graph() |>
|
||||
disseminate_broadcast(sources)
|
||||
}) |>
|
||||
list_c() |>
|
||||
quantile_df(c(0, 0.1, 0.25, 0.50, 0.75, 0.9, 0.95, 1)) |>
|
||||
mutate(d = parameter$d, v = parameter$v, sources = n_sources)
|
||||
}) |> bind_rows()
|
||||
}) |> bind_rows()
|
||||
```{r cache = TRUE}
|
||||
# Pre-generates graphs as otherwise this will murder simulation performance.
|
||||
dataset(
|
||||
graphs,
|
||||
storage = 'rds',
|
||||
map(parameters, function(parameter) {
|
||||
pbmclapply(
|
||||
1:n_samples,
|
||||
function(instance) {
|
||||
list(
|
||||
d = parameter$d,
|
||||
v = parameter$v,
|
||||
instance = instance,
|
||||
g = edge_lists |>
|
||||
filter(d == parameter$d, v == parameter$v, instance == !!instance) |>
|
||||
as_overlay_graph()
|
||||
)
|
||||
},
|
||||
mc.cores = 2 # don't go overboard or it will suck up your RAM and crash your machine :-)
|
||||
)
|
||||
}) |> list_c()
|
||||
)
|
||||
```
|
||||
|
||||
```{r}
|
||||
dataset(
|
||||
latency_stats,
|
||||
storage = 'csv',
|
||||
map(parameters, function(parameter) {
|
||||
graph_instances <- graphs |> filter(v == parameter$v & d == parameter$d)
|
||||
map(1:n_sources_max, function(n_sources) {
|
||||
raw_latencies <- map(graph_instances, function(graph) {
|
||||
sources <- sample(1:graph$v, size = n_sources, replace = FALSE)
|
||||
disseminate_broadcast(graph$g, sources)
|
||||
}) |>
|
||||
list_c()
|
||||
|
||||
raw_latencies |>
|
||||
quantile_df(c(0, 0.1, 0.25, 0.50, 0.75, 0.9, 0.95, 1)) |>
|
||||
rename(stat = quant) |>
|
||||
bind_rows(
|
||||
tibble(
|
||||
stat = c('mean', 'variance', 'sd'),
|
||||
val = mean(raw_latencies), var(raw_latencies), sd(raw_latencies)
|
||||
)
|
||||
) |>
|
||||
mutate(
|
||||
d = parameter$d,
|
||||
v = parameter$v,
|
||||
sources = n_sources
|
||||
)
|
||||
}) |> bind_rows()
|
||||
}) |> bind_rows()
|
||||
)
|
||||
```
|
||||
|
||||
Fig. \@ref(fig:latencies) shows a grid in which each cell contains a latency $\times$ swarm size (log scale) subplot filtered by a combination of latency percentile (columns) and initial bootstrap degree, or $d$ (rows). This means that at cell `percentile: 0.5` $\times$ `d: 2` we should see the median ($50^{th}$ percentile) latencies for swarms with bootstrap degree $d = 2$. Lines are then coloured based on the number of cache nodes present in the swarm ($1$, $2$, $3$, or $4$).
|
||||
|
||||
With that in mind, what we see is that:
|
||||
|
||||
1. dissemination times appear to increase logarithmically with swarm size at all percentiles, as evidenced by the straight lines in the log plots;
|
||||
2. increasing $d$ changes the slope of that increase (base of the log), with the most visibly dramatic change happening from $d = 1$ to $d = 2$. In particular, latencies stay below $10$ hops for all swarm sizes once $d \geq 2$, and the median latency is below $5$ hops;
|
||||
3. adding cache nodes appears to shift the intercept of the curve, even when considering a very small amount of nodes.
|
||||
|
||||
```{r latencies, fig.width = 15, fig.height = 12, fig.cap="**(click to zoom)** Latency percentiles as a function of swarm size and bootstrap degree (d)."}
|
||||
ggplot(latency_stats |> rename(percentile = quant)) +
|
||||
geom_hline(yintercept = 5, lty = 2, col = 'orange') +
|
||||
geom_line(aes(x = v, y = val, col = as.factor(sources)), lwd = 1.1) +
|
||||
facet_grid(d ~ percentile, labeller = label_both) +
|
||||
xlab("swarm size (log)") +
|
||||
ylab("latency (hops)") +
|
||||
theme_playax() +
|
||||
theme(legend.position = "bottom") +
|
||||
labs(col = 'cache nodes') +
|
||||
scale_x_log10() +
|
||||
big_fonts(15)
|
||||
```
|
||||
|
||||
Assuming a small block size and an RTT of $s$, this would mean a propagation time proportional to $5 \times s$ for a single block in the absence of congested links. Finally, we see that this loosely tracks the shape of the curves for the diameters of graphs of a given size (\@ref(fig:diameters)), which is unsurprising.
|
||||
|
||||
```{r}
|
||||
dataset(
|
||||
graph_diameters,
|
||||
storage = 'csv',
|
||||
map(graphs, function(graph)
|
||||
tibble(d = graph$d,
|
||||
v = graph$v,
|
||||
instance = graph$instance,
|
||||
diameter = diameter(graph$g))
|
||||
) |>
|
||||
bind_rows()
|
||||
)
|
||||
```
|
||||
|
||||
```{r diameters, fig.width = 10, fig.height = 4, fig.cap = "Swarm diameter percentiles as a function of their size."}
|
||||
ggplot(
|
||||
graph_diameters |> group_by(d, v) |> reframe(quantile_df(diameter, probs = c(0, 0.1, 0.25, 0.50, 0.75, 0.9, 0.95, 1)))
|
||||
) +
|
||||
geom_line(aes(x = v, y = val, col = as.factor(d))) +
|
||||
facet_grid(.~quant) +
|
||||
xlab("swarm size (log)") +
|
||||
ylab("diameter") +
|
||||
scale_x_log10() +
|
||||
theme_playax() +
|
||||
theme(legend.position = 'bottom') +
|
||||
labs(col = 'bootstrap degree (d)')
|
||||
```
|
||||
|
||||
# Next Steps
|
||||
|
||||
There are a few things that we could do next:
|
||||
|
||||
1. tweak the bootstrap protocol so that it mitigates swarm age bias -- where older nodes get selected more often as neighbors as part of the bootstrap procedure -- thus reducing degree imbalances. This involves adding some intelligence to the DHT tracker so that it biases samples towards nodes that have been sampled less;
|
||||
2. run simulations with more realistic proportions of cache nodes. We constrained this to $1--4$ for efficiency and to get results out faster, but having swarm percentages may yield more interesting numbers;
|
||||
3. add more sophisticated network dynamics (e.g. churn, link delays + multi-block simulations) to have more accurate estimates of performance and load.
|
||||
|
||||
Other issues potentially worth exploring:
|
||||
|
||||
1. performance of compressed bitsets vs. other set reconciliation data structures like inverted Bloom filters;
|
||||
2. techniques for achieving Byzantine fault tolerance on random overlays.
|
||||
|
||||
# References
|
||||
|
12
analysis/swarm-overlay-sims/tests/testthat.R
Normal file
12
analysis/swarm-overlay-sims/tests/testthat.R
Normal file
@ -0,0 +1,12 @@
|
||||
# This file is part of the standard setup for testthat.
|
||||
# It is recommended that you do not modify it.
|
||||
#
|
||||
# Where should you do additional test configuration?
|
||||
# Learn more about the roles of various files in:
|
||||
# * https://r-pkgs.org/testing-design.html#sec-tests-files-overview
|
||||
# * https://testthat.r-lib.org/articles/special-files.html
|
||||
|
||||
library(testthat)
|
||||
library(`swarm-overlay-sims`)
|
||||
|
||||
test_check("swarm-overlay-sims")
|
@ -0,0 +1,58 @@
|
||||
test_that(
|
||||
"should filter a list by predicate", {
|
||||
a_list <- list(
|
||||
list(a = 1, b = 2, d = 1),
|
||||
list(a = 2, b = 1, d = 2),
|
||||
list(a = 1, b = 2, d = 3)
|
||||
)
|
||||
|
||||
expect_equal(a_list |> filter(a == 1 & b == 2), list(
|
||||
list(a = 1, b = 2, d = 1),
|
||||
list(a = 1, b = 2, d = 3)
|
||||
))
|
||||
}
|
||||
)
|
||||
|
||||
test_that(
|
||||
"should factor caller context in predicate evaluation", {
|
||||
a_list <- list(
|
||||
list(a = 1, b = 2, d = 1),
|
||||
list(a = 2, b = 1, d = 2)
|
||||
)
|
||||
|
||||
x <- 1
|
||||
|
||||
expect_equal(a_list |> filter(a == 2 & b == x), list(
|
||||
list(a = 2, b = 1, d = 2)
|
||||
))
|
||||
}
|
||||
)
|
||||
|
||||
test_that(
|
||||
"should pull attribute as vector", {
|
||||
a_list <- list(
|
||||
list(a = 1, b = 2, d = 1),
|
||||
list(a = 2, b = 1, d = 2),
|
||||
list(a = 1, b = 2, d = 3),
|
||||
list(b = 2, d = 3)
|
||||
)
|
||||
|
||||
expect_equal(a_list |> pull(a), c(1, 2, 1, NA))
|
||||
}
|
||||
)
|
||||
|
||||
test_that(
|
||||
"should return vector when c.XXX is defined", {
|
||||
a_list <- list(
|
||||
list(a = 1, date = as.Date('2003-03-03')),
|
||||
list(a = 2, date = as.Date('2003-03-04')),
|
||||
list(a = 1, date = as.Date('2003-03-05'))
|
||||
)
|
||||
|
||||
column <- a_list |> pull(a)
|
||||
|
||||
expect_equal(class(column), "Date")
|
||||
expect_equal(column, c(
|
||||
as.Date('2003-03-03'), as.Date('2003-03-04'), as.Date('2003-03-05')))
|
||||
}
|
||||
)
|
Loading…
x
Reference in New Issue
Block a user