add latency and diameter analysis

This commit is contained in:
gmega 2023-09-13 14:14:46 -03:00
parent 3ef22a70bc
commit 68353ad6d2
13 changed files with 3609 additions and 27 deletions

View File

@ -0,0 +1 @@
source("renv/activate.R")

View File

@ -17,7 +17,8 @@ Depends:
Hmisc,
playaxr,
plotly,
DT
DT,
pbmcapply
Remotes: gmega/playaxr
Suggests:
devtools,

View File

@ -0,0 +1,4 @@
# Generated by roxygen2: do not edit by hand
S3method(filter,list)
S3method(pull,list)

View File

@ -0,0 +1,26 @@
#' Keeps list items that match a condition
#'
#' List version of dplyr's filter verb.
#'
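#' @examples
#' # A minimal usage sketch, mirroring the package tests (assumes dplyr is
#' # attached so that `filter` dispatches on lists):
#' items <- list(list(a = 1, b = 2), list(a = 2, b = 1))
#' items |> filter(a == 1)  # keeps only the first element
#'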
#' @export
filter.list <- function(x, pred, ...) {
expr <- substitute(pred)
caller <- rlang::caller_env()
Filter(function(item) eval(expr, envir = list2env(item, parent = caller)), x)
}
#' Extract a single column from a list of lists
#'
#' List version of dplyr's pull verb. Note that because vectors cannot hold
#' mixed types, we are not always able to return a vector. Expect a vector
#' only for simple types, or for types that define their own method for the
#' concatenation operator `c()`.
#'
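#' @examples
#' # A minimal usage sketch, mirroring the package tests; missing entries
#' # come back as NA:
#' records <- list(list(a = 1, b = 2), list(a = 2), list(b = 3))
#' records |> pull(a)  # c(1, 2, NA)
#'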
#' @export
pull.list <- function(x, col, ...) {
colname <- deparse(substitute(col))
do.call("c", lapply(x, function(element) {
value <- element[[colname]]
if (is.null(value)) NA else value
}))
}

View File

@ -1,7 +1,7 @@
quantile_df <- function(x, probs = c(0.25, 0.5, 0.75)) {
tibble(
val = quantile(x, probs, na.rm = TRUE),
quant = probs
quant = formatted_factor(probs, function(x) glue('{x*100}'))
)
}
@ -11,7 +11,20 @@ formatted_factor <- function(x, formatter) {
factor(formatter(x), levels)
}
dataset <- function(symbol, block, storage = "csv", reload = FALSE) {
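# Storage back-ends for dataset(), keyed by format name.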
.CODEC = list(
csv = list(
read = function(file) read_csv(file = file, show_col_types = FALSE),
write = write_csv
),
rds = list(
read = read_rds,
write = write_rds
)
)
.CODEC$csv.bz2 = .CODEC$csv
dataset <- function(symbol, block, storage = "csv", reload = FALSE, recalc = FALSE) {
varname <- deparse(substitute(symbol))
env <- rlang::caller_env()
if ((varname %in% names(env)) && !reload) {
@ -19,15 +32,22 @@ dataset <- function(symbol, block, storage = "csv", reload = FALSE) {
return()
}
fname <- glue('./data/{varname}.{storage}')
env[[varname]] <- if (file.exists(fname)) {
env[[varname]] <- if (file.exists(fname) && !recalc) {
message(glue("Reading cached dataset from {fname}"))
read_csv(fname, show_col_types = FALSE)
.CODEC[[storage]]$read(file = fname)
} else {
message("Evaluating dataset expression.")
if (!dir.exists("./data")) dir.create("./data")
contents <- block
message(glue("Write dataset {fname}."))
write_csv(contents, file = fname)
.CODEC[[storage]]$write(contents, file = fname)
contents
}
}
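# Hypothetical usage sketch (`expensive` and `slow_simulation` are made-up
# names): the first call evaluates the block, writes ./data/expensive.rds and
# binds the result to `expensive` in the calling environment; later calls
# reuse the in-memory value (unless reload = TRUE) or the cached file (unless
# recalc = TRUE).
#
#   dataset(expensive, storage = 'rds', slow_simulation())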
timeit <- function(block) {
start <- Sys.time()
result <- block
# Force seconds explicitly: `Sys.time() - start` picks difftime units
# automatically, so minutes could otherwise print as if they were seconds.
elapsed <- difftime(Sys.time(), start, units = "secs")
print(glue('Took {elapsed} seconds.'))
result
}
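# Usage sketch (hypothetical block): prints the elapsed time, returns 42.
#   x <- timeit({ Sys.sleep(1); 42 })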

View File

@ -0,0 +1,11 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/dplyr-extras.R
\name{filter.list}
\alias{filter.list}
\title{Keeps list items that match a condition}
\usage{
\method{filter}{list}(x, pred, ...)
}
\description{
List version of dplyr's filter verb.
}

View File

@ -0,0 +1,13 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/dplyr-extras.R
\name{pull.list}
\alias{pull.list}
\title{Extract a single column from a list of lists}
\usage{
\method{pull}{list}(x, col, ...)
}
\description{
List version of dplyr's pull verb. Note that because there is no type
enforcement for lists of lists, we'll return a vector only if the actual
type of all elements in the column is the same. Otherwise we return a list.
}

File diff suppressed because it is too large

View File

@ -0,0 +1,7 @@
library/
local/
cellar/
lock/
python/
sandbox/
staging/

File diff suppressed because it is too large

View File

@ -9,11 +9,30 @@ bibliography: [bibliography.bib]
link-citations: true
---
```{css zoom-lib-src, echo = FALSE}
script src = "https://ajax.googleapis.com/ajax/libs/jquery/3.4.1/jquery.min.js"
```
```{js zoom-jquery, echo = FALSE}
$(document).ready(function() {
$('body').prepend('<div class=\"zoomDiv\"><img src=\"\" class=\"zoomImg\"></div>');
// onClick function for all plots (img's)
$('img:not(.zoomImg)').click(function() {
$('.zoomImg').attr('src', $(this).attr('src')).css({width: '100%'});
$('.zoomDiv').css({opacity: '1', width: 'auto', border: '1px solid white', borderRadius: '5px', position: 'fixed', top: '50%', left: '50%', marginRight: '-50%', transform: 'translate(-50%, -50%)', boxShadow: '0px 0px 50px #888888', zIndex: '50', overflow: 'auto', maxHeight: '100%'});
});
// onClick function for zoomImg
$('img.zoomImg').click(function() {
$('.zoomDiv').css({opacity: '0', width: '0%'});
});
});
```
```{r cache=FALSE, echo=FALSE, warning=FALSE, message=FALSE}
knitr::read_chunk('R/swarmoverlay.R')
knitr::read_chunk('R/dissemination.R')
devtools::load_all()
knitr::opts_chunk$set(cache = TRUE, warning = FALSE, message = FALSE, cache.lazy = FALSE)
```
# Context
@ -185,38 +204,138 @@ This is all, to a certain degree, obvious, as the probability that a node gets s
The less obvious choice would be to have nodes reject neighbor requests once a threshold is met, effectively truncating the tail of the degree distribution. This could make the bootstrap procedure more complex and slower, as a node would then have to go back to the bootstrap service to request more candidate neighbors. We will keep these options in mind for the next iteration.
## How fast should we expect a block to _percolate_ over the network?
In the absence of a link capacity and/or network delay model, graph topology should dominate dissemination time. The simplest case to analyse is to assume that nodes are able to broadcast the packet to _all of their neighbors_. The main appeal is that this is easy to implement and can already provide some insight.
```{r disseminate-broadcast}
```
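The empty chunk above pulls the actual `disseminate_broadcast` implementation in from `R/dissemination.R` (via the `knitr::read_chunk` calls in the setup chunk). As a rough mental model, broadcast dissemination from a set of sources amounts to a multi-source breadth-first search: a node receives the block at its hop distance to the nearest source. A minimal sketch of that idea using `igraph` follows; the real implementation may differ in its details.
```{r eval=FALSE}
# Sketch only: hop-count latency of broadcast dissemination. Each node's
# latency is its shortest-path (BFS) distance to the closest source.
disseminate_broadcast_sketch <- function(g, sources) {
  hops <- igraph::distances(g, v = sources)  # |sources| x |V| hop matrix
  apply(hops, 2, min)                        # earliest arrival, per node
}
```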
We will take the overlays we had from before and run a simple experiment where we pick $1$, $2$, $3$, and $4$ starting nodes at random in the overlays, and compute the average dissemination times for each.
```{r}
n_sources_max <- 4
```
```{r eval=FALSE}
map(parameters[1], function(parameter) {
map(1:n_sources_max, function(n_sources) {
latencies <- map(1:n_samples, function(instance) {
sources <- sample(1:parameter$v, size = n_sources, replace = FALSE)
latencies <- edge_lists |>
filter(
d == parameter$d,
v == parameter$v,
instance == !!instance
) |>
as_overlay_graph() |>
disseminate_broadcast(sources)
}) |>
list_c() |>
quantile_df(c(0, 0.1, 0.25, 0.50, 0.75, 0.9, 0.95, 1)) |>
mutate(d = parameter$d, v = parameter$v, sources = n_sources)
}) |> bind_rows()
}) |> bind_rows()
```
```{r cache = TRUE}
# Pre-generates graphs as otherwise this will murder simulation performance.
dataset(
graphs,
storage = 'rds',
map(parameters, function(parameter) {
pbmclapply(
1:n_samples,
function(instance) {
list(
d = parameter$d,
v = parameter$v,
instance = instance,
g = edge_lists |>
filter(d == parameter$d, v == parameter$v, instance == !!instance) |>
as_overlay_graph()
)
},
mc.cores = 2 # don't go overboard or it will suck up your RAM and crash your machine :-)
)
}) |> list_c()
)
```
```{r}
dataset(
latency_stats,
storage = 'csv',
map(parameters, function(parameter) {
graph_instances <- graphs |> filter(v == parameter$v & d == parameter$d)
map(1:n_sources_max, function(n_sources) {
raw_latencies <- map(graph_instances, function(graph) {
sources <- sample(1:graph$v, size = n_sources, replace = FALSE)
disseminate_broadcast(graph$g, sources)
}) |>
list_c()
raw_latencies |>
quantile_df(c(0, 0.1, 0.25, 0.50, 0.75, 0.9, 0.95, 1)) |>
rename(stat = quant) |>
bind_rows(
tibble(
stat = c('mean', 'variance', 'sd'),
val = c(mean(raw_latencies), var(raw_latencies), sd(raw_latencies))
)
) |>
mutate(
d = parameter$d,
v = parameter$v,
sources = n_sources
)
}) |> bind_rows()
}) |> bind_rows()
)
```
Fig. \@ref(fig:latencies) shows a grid in which each cell contains a latency $\times$ swarm size (log scale) subplot, filtered by a combination of latency percentile (columns) and initial bootstrap degree $d$ (rows). This means that at cell `percentile: 50` $\times$ `d: 2` we should see the median ($50^{th}$ percentile) latencies for swarms with bootstrap degree $d = 2$. Lines are then coloured based on the number of cache nodes present in the swarm ($1$, $2$, $3$, or $4$).
With that in mind, what we see is that:
1. dissemination times appear to increase logarithmically with swarm size at all percentiles, as evidenced by the straight lines in the log plots;
2. increasing $d$ changes the slope of that increase (base of the log), with the most visibly dramatic change happening from $d = 1$ to $d = 2$. In particular, latencies stay below $10$ hops for all swarm sizes once $d \geq 2$, and the median latency is below $5$ hops;
3. adding cache nodes appears to shift the intercept of the curve, even when considering a very small amount of nodes.
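The logarithmic growth in (1) is what one would expect if these overlays behave like sparse random graphs, whose diameters grow on the order of $\log n / \log d$ for $n$ vertices and average degree $d$; since broadcast latency is bounded by the hop distance to the nearest source, it should inherit that growth. We have not verified this formally for our bootstrap construction, so the plots remain the primary evidence.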
```{r latencies, fig.width = 15, fig.height = 12, fig.cap="**(click to zoom)** Latency percentiles as a function of swarm size and bootstrap degree (d)."}
ggplot(latency_stats |> filter(!stat %in% c('mean', 'variance', 'sd')) |> rename(percentile = stat)) +
geom_hline(yintercept = 5, lty = 2, col = 'orange') +
geom_line(aes(x = v, y = val, col = as.factor(sources)), lwd = 1.1) +
facet_grid(d ~ percentile, labeller = label_both) +
xlab("swarm size (log)") +
ylab("latency (hops)") +
theme_playax() +
theme(legend.position = "bottom") +
labs(col = 'cache nodes') +
scale_x_log10() +
big_fonts(15)
```
Assuming a small block size and a per-hop round-trip time of $s$, this would mean a propagation time proportional to $5 \times s$ for a single block in the absence of congested links; with $s = 100\,\text{ms}$, say, the median node would receive a block in roughly $500\,\text{ms}$. Finally, we see that this loosely tracks the shape of the curves for the diameters of graphs of a given size (Fig. \@ref(fig:diameters)), which is unsurprising.
```{r}
dataset(
graph_diameters,
storage = 'csv',
map(graphs, function(graph)
tibble(d = graph$d,
v = graph$v,
instance = graph$instance,
diameter = diameter(graph$g))
) |>
bind_rows()
)
```
```{r diameters, fig.width = 10, fig.height = 4, fig.cap = "Swarm diameter percentiles as a function of their size."}
ggplot(
graph_diameters |> group_by(d, v) |> reframe(quantile_df(diameter, probs = c(0, 0.1, 0.25, 0.50, 0.75, 0.9, 0.95, 1)))
) +
geom_line(aes(x = v, y = val, col = as.factor(d))) +
facet_grid(.~quant) +
xlab("swarm size (log)") +
ylab("diameter") +
scale_x_log10() +
theme_playax() +
theme(legend.position = 'bottom') +
labs(col = 'bootstrap degree (d)')
```
# Next Steps
There are a few things that we could do next:
1. tweak the bootstrap protocol so that it mitigates swarm age bias -- where older nodes get selected more often as neighbors as part of the bootstrap procedure -- thus reducing degree imbalances. This involves adding some intelligence to the DHT tracker so that it biases samples towards nodes that have been sampled less;
2. run simulations with more realistic proportions of cache nodes. We constrained this to $1$--$4$ nodes for efficiency and to get results out faster, but sizing the cache-node population as a percentage of the swarm may yield more interesting numbers;
3. add more sophisticated network dynamics (e.g. churn, link delays + multi-block simulations) to have more accurate estimates of performance and load.
Other issues potentially worth exploring:
1. performance of compressed bitsets vs. other set reconciliation data structures like inverted Bloom filters;
2. techniques for achieving Byzantine fault tolerance on random overlays.
# References

View File

@ -0,0 +1,12 @@
# This file is part of the standard setup for testthat.
# It is recommended that you do not modify it.
#
# Where should you do additional test configuration?
# Learn more about the roles of various files in:
# * https://r-pkgs.org/testing-design.html#sec-tests-files-overview
# * https://testthat.r-lib.org/articles/special-files.html
library(testthat)
library(`swarm-overlay-sims`)
test_check("swarm-overlay-sims")

View File

@ -0,0 +1,58 @@
test_that(
"should filter a list by predicate", {
a_list <- list(
list(a = 1, b = 2, d = 1),
list(a = 2, b = 1, d = 2),
list(a = 1, b = 2, d = 3)
)
expect_equal(a_list |> filter(a == 1 & b == 2), list(
list(a = 1, b = 2, d = 1),
list(a = 1, b = 2, d = 3)
))
}
)
test_that(
"should factor caller context in predicate evaluation", {
a_list <- list(
list(a = 1, b = 2, d = 1),
list(a = 2, b = 1, d = 2)
)
x <- 1
expect_equal(a_list |> filter(a == 2 & b == x), list(
list(a = 2, b = 1, d = 2)
))
}
)
test_that(
"should pull attribute as vector", {
a_list <- list(
list(a = 1, b = 2, d = 1),
list(a = 2, b = 1, d = 2),
list(a = 1, b = 2, d = 3),
list(b = 2, d = 3)
)
expect_equal(a_list |> pull(a), c(1, 2, 1, NA))
}
)
test_that(
"should return vector when c.XXX is defined", {
a_list <- list(
list(a = 1, date = as.Date('2003-03-03')),
list(a = 2, date = as.Date('2003-03-04')),
list(a = 1, date = as.Date('2003-03-05'))
)
column <- a_list |> pull(date)
expect_equal(class(column), "Date")
expect_equal(column, c(
as.Date('2003-03-03'), as.Date('2003-03-04'), as.Date('2003-03-05')))
}
)