Mirror of https://github.com/codex-storage/codex-research.git (synced 2025-01-09 18:26:07 +00:00)
add latency and diameter analysis
commit 68353ad6d2 (parent 3ef22a70bc)
analysis/swarm-overlay-sims/.Rprofile (new file, +1)

@@ -0,0 +1 @@
+source("renv/activate.R")
analysis/swarm-overlay-sims/DESCRIPTION (modified)

@@ -17,7 +17,8 @@ Depends:
     Hmisc,
     playaxr,
     plotly,
-    DT
+    DT,
+    pbmcapply
 Remotes: gmega/playaxr
 Suggests:
     devtools,
analysis/swarm-overlay-sims/NAMESPACE (new file, +4)

@@ -0,0 +1,4 @@
+# Generated by roxygen2: do not edit by hand
+
+S3method(filter,list)
+S3method(pull,list)
analysis/swarm-overlay-sims/R/dplyr-extras.R (new file, +26)

@@ -0,0 +1,26 @@
+#' Keeps list items that match a condition
+#'
+#' List version of dplyr's filter verb.
+#'
+#' @export
+filter.list <- function(x, pred, ...) {
+  expr <- substitute(pred)
+  caller <- rlang::caller_env()
+  Filter(function(item) eval(expr, envir = list2env(item, parent = caller)), x)
+}
+
+#' Extract a single column from a list of lists
+#'
+#' List version of dplyr's pull verb. Note that because vectors cannot have
+#' mixed types, we are not always able to return a vector. Expect vectors
+#' only for simple types, or for types that define their own version of the
+#' concatenation operator.
+#'
+#' @export
+pull.list <- function(x, col, ...) {
+  colname <- deparse(substitute(col))
+  do.call("c", lapply(x, function(element) {
+    value <- element[[colname]]
+    if (is.null(value)) NA else value
+  }))
+}
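These S3 methods let plain lists of records be queried with dplyr's verbs. A minimal usage sketch (illustration only, with made-up data; assumes dplyr and this package are attached, e.g. via `devtools::load_all()`):

```r
# Illustration only -- `runs` and `target_degree` are hypothetical names.
library(dplyr)

runs <- list(
  list(d = 1, v = 100,  latency = 7),
  list(d = 2, v = 100,  latency = 4),
  list(d = 2, v = 1000, latency = 6)
)

# Predicates can mix record fields with variables from the caller's
# environment, as exercised by the tests further down.
target_degree <- 2
runs |> filter(d == target_degree)  # keeps the last two records

# pull() concatenates with c(), so simple types come back as a vector.
runs |> pull(latency)               # c(7, 4, 6)
```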
@@ -1,7 +1,7 @@
 quantile_df <- function(x, probs = c(0.25, 0.5, 0.75)) {
   tibble(
     val = quantile(x, probs, na.rm = TRUE),
-    quant = probs
+    quant = formatted_factor(probs, function(x) glue('{x*100}'))
   )
 }
 
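The visible effect of this change (a sketch, not part of the commit; assumes the helpers above plus `tibble` and `glue` are loaded): `quant` now comes back as a factor of percentage labels, ordered by the underlying probabilities, which keeps facet ordering stable in the plots further down.

```r
# Sketch of the new behaviour (not part of the commit).
quantile_df(0:100)
#> # A tibble: 3 x 2
#>     val quant
#>   <dbl> <fct>
#> 1    25 25
#> 2    50 50
#> 3    75 75
```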
@@ -11,7 +11,20 @@ formatted_factor <- function(x, formatter) {
   factor(formatter(x), levels)
 }
 
+.CODEC = list(
+  csv = list(
+    read = function(file) read_csv(file = file, show_col_types = FALSE),
+    write = write_csv
+  ),
+  rds = list(
+    read = read_rds,
+    write = write_rds
+  )
+)
+
+.CODEC$csv.bz2 = .CODEC$csv
+
-dataset <- function(symbol, block, storage = "csv", reload = FALSE) {
+dataset <- function(symbol, block, storage = "csv", reload = FALSE, recalc = FALSE) {
   varname <- deparse(substitute(symbol))
   env <- rlang::caller_env()
   if ((varname %in% names(env)) && !reload) {
@@ -19,15 +32,22 @@ dataset <- function(symbol, block, storage = "csv", reload = FALSE) {
     return()
   }
   fname <- glue('./data/{varname}.{storage}')
-  env[[varname]] <- if (file.exists(fname)) {
+  env[[varname]] <- if (file.exists(fname) && !recalc) {
     message(glue("Reading cached dataset from {fname}"))
-    read_csv(fname, show_col_types = FALSE)
+    .CODEC[[storage]]$read(file = fname)
   } else {
     message("Evaluating dataset expression.")
     if (!dir.exists("./data")) dir.create("./data")
     contents <- block
     message(glue("Write dataset {fname}."))
-    write_csv(contents, file = fname)
+    .CODEC[[storage]]$write(contents, file = fname)
     contents
   }
 }
+
+timeit <- function(block) {
+  start <- Sys.time()
+  result <- block
+  print(glue('Took {Sys.time() - start} seconds.'))
+  result
+}
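A usage sketch for the extended helper (illustration only; `node_stats` is a hypothetical name). The first call evaluates the block, writes `./data/node_stats.csv`, and binds the result to `node_stats` in the caller's environment; later calls read the cache. `reload = TRUE` skips the in-memory short-circuit, and `recalc = TRUE` additionally ignores the file on disk, forcing the block to be re-evaluated and re-written. `storage = 'rds'` routes through `read_rds`/`write_rds`, which is what allows non-tabular objects such as graphs to be cached.

```r
# Illustration only -- `node_stats` is a hypothetical dataset name.
dataset(
  node_stats,
  storage = "csv",
  tibble(node = 1:3, degree = c(2, 4, 4))  # evaluated only on a cache miss
)

# Re-evaluate from scratch: reload bypasses the in-memory binding, and
# recalc bypasses the cached file.
dataset(
  node_stats,
  storage = "csv",
  reload = TRUE,
  recalc = TRUE,
  tibble(node = 1:3, degree = c(2, 4, 4))
)
```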
analysis/swarm-overlay-sims/man/filter.list.Rd (new file, +11)

@@ -0,0 +1,11 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/dplyr-extras.R
+\name{filter.list}
+\alias{filter.list}
+\title{Keeps list items that match a condition}
+\usage{
+\method{filter}{list}(x, pred, ...)
+}
+\description{
+List version of dplyr's filter verb.
+}
analysis/swarm-overlay-sims/man/pull.list.Rd (new file, +13)

@@ -0,0 +1,13 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/dplyr-extras.R
+\name{pull.list}
+\alias{pull.list}
+\title{Extract a single column from a list of lists}
+\usage{
+\method{pull}{list}(x, col, ...)
+}
+\description{
+List version of dplyr's pull verb. Note that because there is no type
+enforcement for lists of lists, we'll return a vector only if the actual
+type of all elements in the column is the same. Otherwise we return a list.
+}
analysis/swarm-overlay-sims/renv.lock (new file, +2109; diff suppressed because it is too large)
analysis/swarm-overlay-sims/renv/.gitignore (vendored, new file, +7)

@@ -0,0 +1,7 @@
+library/
+local/
+cellar/
+lock/
+python/
+sandbox/
+staging/

analysis/swarm-overlay-sims/renv/activate.R (new file, +1201; diff suppressed because it is too large)
@@ -9,11 +9,30 @@ bibliography: [bibliography.bib]
 link-citations: true
 ---
 
+```{css zoom-lib-src, echo = FALSE}
+script src = "https://ajax.googleapis.com/ajax/libs/jquery/3.4.1/jquery.min.js"
+```
+
+```{js zoom-jquery, echo = FALSE}
+$(document).ready(function() {
+  $('body').prepend('<div class=\"zoomDiv\"><img src=\"\" class=\"zoomImg\"></div>');
+  // onClick function for all plots (img's)
+  $('img:not(.zoomImg)').click(function() {
+    $('.zoomImg').attr('src', $(this).attr('src')).css({width: '100%'});
+    $('.zoomDiv').css({opacity: '1', width: 'auto', border: '1px solid white', borderRadius: '5px', position: 'fixed', top: '50%', left: '50%', marginRight: '-50%', transform: 'translate(-50%, -50%)', boxShadow: '0px 0px 50px #888888', zIndex: '50', overflow: 'auto', maxHeight: '100%'});
+  });
+  // onClick function for zoomImg
+  $('img.zoomImg').click(function() {
+    $('.zoomDiv').css({opacity: '0', width: '0%'});
+  });
+});
+```
+
 ```{r cache=FALSE, echo=FALSE, warning=FALSE, message=FALSE}
 knitr::read_chunk('R/swarmoverlay.R')
 knitr::read_chunk('R/dissemination.R')
 devtools::load_all()
+knitr::opts_chunk$set(cache = TRUE, warning = FALSE, message = FALSE, cache.lazy = FALSE)
 ```
 
 # Context
@@ -185,38 +204,138 @@ This is all, to a certain degree, obvious, as the probability that a node gets s
 
 The less obvious choice would be to have nodes reject neighbor requests once a threshold is met, effectively truncating the tail of the degree distribution. This could make the bootstrap procedure more complex/slower, as a node would have to request more nodes from the bootstrap service again. We will keep those in mind for the next iteration.
 
 ## How fast should we expect a block to _percolate_ over the network?
 
 In the absence of a link capacity and/or network delay model, graph topology should dominate dissemination time. The simplest case to analyse is to assume that nodes are able to broadcast the packet to _all of their neighbors_. The main appeal is that this is easy to implement, and it can already provide some insight.
 
 ```{r disseminate-broadcast}
 ```
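The `disseminate-broadcast` chunk above is populated from `R/dissemination.R` via `knitr::read_chunk`, so its body does not appear in this diff. As a rough sketch of the model being described -- a reconstruction under the stated broadcast assumption, not the committed implementation -- dissemination from a set of sources is a multi-source breadth-first search, where a node's latency in hops is its distance to the nearest source:

```r
# Rough sketch only (not the committed code in R/dissemination.R): under the
# broadcast assumption, every informed node forwards the block to all of its
# neighbors each round, so a node receives the block after as many rounds as
# its hop distance to the closest source.
library(igraph)

disseminate_broadcast_sketch <- function(g, sources) {
  # distances() performs a BFS per source on an unweighted graph; the
  # per-node minimum over sources is the arrival time, in hops, at each node.
  apply(distances(g, v = sources), 2, min)
}
```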
 
 We will take the overlays we had from before and run a simple experiment where we pick $1, 2, 3$ and $4$ starting nodes at random in the overlays, and compute the average dissemination times for those.
 
 ```{r}
 n_sources_max <- 4
 ```
 
-```{r eval=FALSE}
-map(parameters[1], function(parameter) {
-  map(1:n_sources_max, function(n_sources) {
-    latencies <- map(1:n_samples, function(instance) {
-      sources <- sample(1:parameter$v, size = n_sources, replace = FALSE)
-      latencies <- edge_lists |>
-        filter(
-          d == parameter$d,
-          v == parameter$v,
-          instance == !!instance
-        ) |>
-        as_overlay_graph() |>
-        disseminate_broadcast(sources)
-    }) |>
-      list_c() |>
-      quantile_df(c(0, 0.1, 0.25, 0.50, 0.75, 0.9, 0.95, 1)) |>
-      mutate(d = parameter$d, v = parameter$v, sources = n_sources)
-  }) |> bind_rows()
-}) |> bind_rows()
+```{r cache = TRUE}
+# Pre-generates graphs as otherwise this will murder simulation performance.
+dataset(
+  graphs,
+  storage = 'rds',
+  map(parameters, function(parameter) {
+    pbmclapply(
+      1:n_samples,
+      function(instance) {
+        list(
+          d = parameter$d,
+          v = parameter$v,
+          instance = instance,
+          g = edge_lists |>
+            filter(d == parameter$d, v == parameter$v, instance == !!instance) |>
+            as_overlay_graph()
+        )
+      },
+      mc.cores = 2 # don't go overboard or it will suck up your RAM and crash your machine :-)
+    )
+  }) |> list_c()
+)
 ```
 
+```{r}
+dataset(
+  latency_stats,
+  storage = 'csv',
+  map(parameters, function(parameter) {
+    graph_instances <- graphs |> filter(v == parameter$v & d == parameter$d)
+    map(1:n_sources_max, function(n_sources) {
+      raw_latencies <- map(graph_instances, function(graph) {
+        sources <- sample(1:graph$v, size = n_sources, replace = FALSE)
+        disseminate_broadcast(graph$g, sources)
+      }) |>
+        list_c()
+
+      raw_latencies |>
+        quantile_df(c(0, 0.1, 0.25, 0.50, 0.75, 0.9, 0.95, 1)) |>
+        rename(stat = quant) |>
+        bind_rows(
+          tibble(
+            stat = c('mean', 'variance', 'sd'),
+            val = c(mean(raw_latencies), var(raw_latencies), sd(raw_latencies))
+          )
+        ) |>
+        mutate(
+          d = parameter$d,
+          v = parameter$v,
+          sources = n_sources
+        )
+    }) |> bind_rows()
+  }) |> bind_rows()
+)
+```
 
+Fig. \@ref(fig:latencies) shows a grid in which each cell contains a latency $\times$ swarm size (log scale) subplot, filtered by a combination of latency percentile (columns) and initial bootstrap degree, or $d$ (rows). This means that at cell `percentile: 50` $\times$ `d: 2` we should see the median ($50^{th}$ percentile) latencies for swarms with bootstrap degree $d = 2$. Lines are then coloured based on the number of cache nodes present in the swarm ($1$, $2$, $3$, or $4$).
+
+With that in mind, what we see is that:
+
+1. dissemination times appear to increase logarithmically with swarm size at all percentiles, as evidenced by the straight lines in the log plots;
+2. increasing $d$ changes the slope of that increase (the base of the log), with the most visibly dramatic change happening from $d = 1$ to $d = 2$. In particular, latencies stay below $10$ hops for all swarm sizes once $d \geq 2$, and the median latency is below $5$ hops;
+3. adding cache nodes appears to shift the intercept of the curve downwards, even with a very small number of cache nodes.
+
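The logarithmic growth in (1) and the role of $d$ in (2) are what one would expect if these overlays behave like sparse random graphs. As a back-of-the-envelope reference (not part of the simulations), the diameter of a random $d$-regular graph on $v$ vertices with $d \geq 3$ is known to concentrate around

$$\mathrm{diam}(G) \approx \frac{\log v}{\log(d - 1)}$$

so hop counts grow as $\log v$, and increasing the degree changes the base of that logarithm -- that is, the slope of the lines in the log-scale plots.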
+```{r latencies, fig.width = 15, fig.height = 12, fig.cap="**(click to zoom)** Latency percentiles as a function of swarm size and bootstrap degree (d)."}
+ggplot(
+  latency_stats |>
+    # Plot the percentile rows only; the mean/variance/sd summary rows live
+    # in the same table.
+    filter(!(stat %in% c('mean', 'variance', 'sd'))) |>
+    rename(percentile = stat)
+) +
+  geom_hline(yintercept = 5, lty = 2, col = 'orange') +
+  geom_line(aes(x = v, y = val, col = as.factor(sources)), lwd = 1.1) +
+  facet_grid(d ~ percentile, labeller = label_both) +
+  xlab("swarm size (log)") +
+  ylab("latency (hops)") +
+  theme_playax() +
+  theme(legend.position = "bottom") +
+  labs(col = 'cache nodes') +
+  scale_x_log10() +
+  big_fonts(15)
+```
+
+Assuming a small block size and an RTT of $s$, this would mean a propagation time proportional to $5 \times s$ for a single block in the absence of congested links. Finally, we see that this loosely tracks the shape of the curves for the diameters of graphs of a given size (Fig. \@ref(fig:diameters)), which is unsurprising.
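To make that concrete with an illustrative number: at $s = 100\,ms$, a median of $5$ hops would put single-block propagation at around $5 \times 100\,ms = 500\,ms$.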
+
+```{r}
+dataset(
+  graph_diameters,
+  storage = 'csv',
+  map(graphs, function(graph)
+    tibble(
+      d = graph$d,
+      v = graph$v,
+      instance = graph$instance,
+      diameter = diameter(graph$g)
+    )
+  ) |>
+    bind_rows()
+)
+```
+
+```{r diameters, fig.width = 10, fig.height = 4, fig.cap = "Swarm diameter percentiles as a function of their size."}
+ggplot(
+  graph_diameters |>
+    group_by(d, v) |>
+    reframe(quantile_df(diameter, probs = c(0, 0.1, 0.25, 0.50, 0.75, 0.9, 0.95, 1)))
+) +
+  geom_line(aes(x = v, y = val, col = as.factor(d))) +
+  facet_grid(. ~ quant) +
+  xlab("swarm size (log)") +
+  ylab("diameter") +
+  scale_x_log10() +
+  theme_playax() +
+  theme(legend.position = 'bottom') +
+  labs(col = 'bootstrap degree (d)')
+```
+
+# Next Steps
+
+There are a few things that we could do next:
+
+1. tweak the bootstrap protocol so that it mitigates swarm age bias -- where older nodes get selected more often as neighbors as part of the bootstrap procedure -- thus reducing degree imbalances. This involves adding some intelligence to the DHT tracker so that it biases samples towards nodes that have been sampled less;
+2. run simulations with more realistic proportions of cache nodes. We constrained this to $1--4$ cache nodes for efficiency and to get results out faster, but expressing cache nodes as a percentage of swarm size may yield more interesting numbers;
+3. add more sophisticated network dynamics (e.g. churn, link delays, and multi-block simulations) to obtain more accurate estimates of performance and load.
+
+Other issues potentially worth exploring:
+
+1. performance of compressed bitsets vs. other set-reconciliation data structures, like inverted Bloom filters;
+2. techniques for achieving Byzantine fault tolerance on random overlays.
+
 # References
analysis/swarm-overlay-sims/tests/testthat.R (new file, +12)

@@ -0,0 +1,12 @@
+# This file is part of the standard setup for testthat.
+# It is recommended that you do not modify it.
+#
+# Where should you do additional test configuration?
+# Learn more about the roles of various files in:
+# * https://r-pkgs.org/testing-design.html#sec-tests-files-overview
+# * https://testthat.r-lib.org/articles/special-files.html
+
+library(testthat)
+library(`swarm-overlay-sims`)
+
+test_check("swarm-overlay-sims")
@@ -0,0 +1,58 @@
+test_that(
+  "should filter a list by predicate", {
+    a_list <- list(
+      list(a = 1, b = 2, d = 1),
+      list(a = 2, b = 1, d = 2),
+      list(a = 1, b = 2, d = 3)
+    )
+
+    expect_equal(a_list |> filter(a == 1 & b == 2), list(
+      list(a = 1, b = 2, d = 1),
+      list(a = 1, b = 2, d = 3)
+    ))
+  }
+)
+
+test_that(
+  "should factor caller context in predicate evaluation", {
+    a_list <- list(
+      list(a = 1, b = 2, d = 1),
+      list(a = 2, b = 1, d = 2)
+    )
+
+    x <- 1
+
+    expect_equal(a_list |> filter(a == 2 & b == x), list(
+      list(a = 2, b = 1, d = 2)
+    ))
+  }
+)
+
+test_that(
+  "should pull attribute as vector", {
+    a_list <- list(
+      list(a = 1, b = 2, d = 1),
+      list(a = 2, b = 1, d = 2),
+      list(a = 1, b = 2, d = 3),
+      list(b = 2, d = 3)
+    )
+
+    expect_equal(a_list |> pull(a), c(1, 2, 1, NA))
+  }
+)
+
+test_that(
+  "should return vector when c.XXX is defined", {
+    a_list <- list(
+      list(a = 1, date = as.Date('2003-03-03')),
+      list(a = 2, date = as.Date('2003-03-04')),
+      list(a = 1, date = as.Date('2003-03-05'))
+    )
+
+    column <- a_list |> pull(date)
+
+    expect_equal(class(column), "Date")
+    expect_equal(column, c(
+      as.Date('2003-03-03'), as.Date('2003-03-04'), as.Date('2003-03-05')))
+  }
+)