Mirror of https://github.com/codex-storage/codex-research.git (synced 2025-01-09 18:26:07 +00:00)
add latency and diameter analysis
commit 68353ad6d2 (parent 3ef22a70bc)
analysis/swarm-overlay-sims/.Rprofile (new file, +1)

@@ -0,0 +1 @@
+source("renv/activate.R")
analysis/swarm-overlay-sims/DESCRIPTION (modified)

@@ -17,7 +17,8 @@ Depends:
     Hmisc,
     playaxr,
     plotly,
-    DT
+    DT,
+    pbmcapply
 Remotes: gmega/playaxr
 Suggests:
     devtools,
analysis/swarm-overlay-sims/NAMESPACE (new file, +4)

@@ -0,0 +1,4 @@
+# Generated by roxygen2: do not edit by hand
+
+S3method(filter,list)
+S3method(pull,list)
analysis/swarm-overlay-sims/R/dplyr-extras.R (new file, +26)

@@ -0,0 +1,26 @@
+#' Keeps list items that match a condition
+#'
+#' List version of dplyr's filter verb.
+#'
+#' @export
+filter.list <- function(x, pred, ...) {
+  expr <- substitute(pred)
+  caller <- rlang::caller_env()
+  Filter(function(item) eval(expr, envir = list2env(item, parent = caller)), x)
+}
+
+#' Extract a single column from a list of lists
+#'
+#' List version of dplyr's pull verb. Note that because vectors cannot have
+#' mixed types, we are not always able to return a vector. Expect vectors
+#' only for simple types, or for types that define their own version of the
+#' concatenation operator.
+#'
+#' @export
+pull.list <- function(x, col, ...) {
+  colname <- deparse(substitute(col))
+  do.call("c", lapply(x, function(element) {
+    value <- element[[colname]]
+    if (is.null(value)) NA else value
+  }))
+}
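These S3 methods let plain lists of records be queried with dplyr's verbs. A minimal usage sketch (illustration only, with made-up data; assumes dplyr and this package are attached, e.g. via `devtools::load_all()`):

```r
# Illustration only -- `runs` and `target_degree` are hypothetical names.
library(dplyr)

runs <- list(
  list(d = 1, v = 100,  latency = 7),
  list(d = 2, v = 100,  latency = 4),
  list(d = 2, v = 1000, latency = 6)
)

# Predicates can mix record fields with variables from the caller's
# environment, as exercised by the tests further down.
target_degree <- 2
runs |> filter(d == target_degree)  # keeps the last two records

# pull() concatenates with c(), so simple types come back as a vector.
runs |> pull(latency)               # c(7, 4, 6)
```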
@@ -1,7 +1,7 @@
 quantile_df <- function(x, probs = c(0.25, 0.5, 0.75)) {
   tibble(
     val = quantile(x, probs, na.rm = TRUE),
-    quant = probs
+    quant = formatted_factor(probs, function(x) glue('{x*100}'))
   )
 }
 
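The visible effect of this change (a sketch, not part of the commit; assumes the helpers above plus `tibble` and `glue` are loaded): `quant` now comes back as a factor of percentage labels, ordered by the underlying probabilities, which keeps facet ordering stable in the plots further down.

```r
# Sketch of the new behaviour (not part of the commit).
quantile_df(0:100)
#> # A tibble: 3 x 2
#>     val quant
#>   <dbl> <fct>
#> 1    25 25
#> 2    50 50
#> 3    75 75
```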
@@ -11,7 +11,20 @@ formatted_factor <- function(x, formatter) {
   factor(formatter(x), levels)
 }
 
+.CODEC = list(
+  csv = list(
+    read = function(file) read_csv(file = file, show_col_types = FALSE),
+    write = write_csv
+  ),
+  rds = list(
+    read = read_rds,
+    write = write_rds
+  )
+)
+
+.CODEC$csv.bz2 = .CODEC$csv
+
-dataset <- function(symbol, block, storage = "csv", reload = FALSE) {
+dataset <- function(symbol, block, storage = "csv", reload = FALSE, recalc = FALSE) {
   varname <- deparse(substitute(symbol))
   env <- rlang::caller_env()
   if ((varname %in% names(env)) && !reload) {
@@ -19,15 +32,22 @@ dataset <- function(symbol, block, storage = "csv", reload = FALSE) {
     return()
   }
   fname <- glue('./data/{varname}.{storage}')
-  env[[varname]] <- if (file.exists(fname)) {
+  env[[varname]] <- if (file.exists(fname) && !recalc) {
     message(glue("Reading cached dataset from {fname}"))
-    read_csv(fname, show_col_types = FALSE)
+    .CODEC[[storage]]$read(file = fname)
   } else {
     message("Evaluating dataset expression.")
     if (!dir.exists("./data")) dir.create("./data")
     contents <- block
     message(glue("Write dataset {fname}."))
-    write_csv(contents, file = fname)
+    .CODEC[[storage]]$write(contents, file = fname)
     contents
   }
 }
+
+timeit <- function(block) {
+  start <- Sys.time()
+  result <- block
+  print(glue('Took {Sys.time() - start} seconds.'))
+  result
+}
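A usage sketch for the extended helper (illustration only; `node_stats` is a hypothetical name). The first call evaluates the block, writes `./data/node_stats.csv`, and binds the result to `node_stats` in the caller's environment; later calls read the cache. `reload = TRUE` skips the in-memory short-circuit, and `recalc = TRUE` additionally ignores the file on disk, forcing the block to be re-evaluated and re-written. `storage = 'rds'` routes through `read_rds`/`write_rds`, which is what allows non-tabular objects such as graphs to be cached.

```r
# Illustration only -- `node_stats` is a hypothetical dataset name.
dataset(
  node_stats,
  storage = "csv",
  tibble(node = 1:3, degree = c(2, 4, 4))  # evaluated only on a cache miss
)

# Re-evaluate from scratch: reload bypasses the in-memory binding, and
# recalc bypasses the cached file.
dataset(
  node_stats,
  storage = "csv",
  reload = TRUE,
  recalc = TRUE,
  tibble(node = 1:3, degree = c(2, 4, 4))
)
```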
analysis/swarm-overlay-sims/man/filter.list.Rd (new file, +11)

@@ -0,0 +1,11 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/dplyr-extras.R
+\name{filter.list}
+\alias{filter.list}
+\title{Keeps list items that match a condition}
+\usage{
+\method{filter}{list}(x, pred, ...)
+}
+\description{
+List version of dplyr's filter verb.
+}
analysis/swarm-overlay-sims/man/pull.list.Rd (new file, +13)

@@ -0,0 +1,13 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/dplyr-extras.R
+\name{pull.list}
+\alias{pull.list}
+\title{Extract a single column from a list of lists}
+\usage{
+\method{pull}{list}(x, col, ...)
+}
+\description{
+List version of dplyr's pull verb. Note that because there is no type
+enforcement for lists of lists, we'll return a vector only if the actual
+type of all elements in the column is the same. Otherwise we return a list.
+}
analysis/swarm-overlay-sims/renv.lock (new file, +2109; diff suppressed because it is too large)
analysis/swarm-overlay-sims/renv/.gitignore (vendored, new file, +7)

@@ -0,0 +1,7 @@
+library/
+local/
+cellar/
+lock/
+python/
+sandbox/
+staging/

analysis/swarm-overlay-sims/renv/activate.R (new file, +1201; diff suppressed because it is too large)
@@ -9,11 +9,30 @@ bibliography: [bibliography.bib]
 link-citations: true
 ---
 
+```{css zoom-lib-src, echo = FALSE}
+script src = "https://ajax.googleapis.com/ajax/libs/jquery/3.4.1/jquery.min.js"
+```
+
+```{js zoom-jquery, echo = FALSE}
+$(document).ready(function() {
+  $('body').prepend('<div class=\"zoomDiv\"><img src=\"\" class=\"zoomImg\"></div>');
+  // onClick function for all plots (img's)
+  $('img:not(.zoomImg)').click(function() {
+    $('.zoomImg').attr('src', $(this).attr('src')).css({width: '100%'});
+    $('.zoomDiv').css({opacity: '1', width: 'auto', border: '1px solid white', borderRadius: '5px', position: 'fixed', top: '50%', left: '50%', marginRight: '-50%', transform: 'translate(-50%, -50%)', boxShadow: '0px 0px 50px #888888', zIndex: '50', overflow: 'auto', maxHeight: '100%'});
+  });
+  // onClick function for zoomImg
+  $('img.zoomImg').click(function() {
+    $('.zoomDiv').css({opacity: '0', width: '0%'});
+  });
+});
+```
+
 ```{r cache=FALSE, echo=FALSE, warning=FALSE, message=FALSE}
 knitr::read_chunk('R/swarmoverlay.R')
 knitr::read_chunk('R/dissemination.R')
 devtools::load_all()
+knitr::opts_chunk$set(cache = TRUE, warning = FALSE, message = FALSE, cache.lazy = FALSE)
 ```
 
 # Context
@@ -185,38 +204,138 @@ This is all, to a certain degree, obvious, as the probability that a node gets s
 
 The less obvious choice would be to have nodes reject neighbor requests once a threshold is met, effectively truncating the tail of the degree distribution. This could make the bootstrap procedure more complex/slower, as a node would have to request more nodes from the bootstrap service again. We will keep those in mind for the next iteration.
 
 ## How fast should we expect a block to _percolate_ over the network?
 
 In the absence of a link capacity and/or network delay model, graph topology should dominate dissemination time. The simplest case to analyse is to assume that nodes are able to broadcast the packet to _all of their neighbors_. The main appeal is that this is easy to implement, and it can already provide some insight.
 
 ```{r disseminate-broadcast}
 ```
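The `disseminate-broadcast` chunk above is populated from `R/dissemination.R` via `knitr::read_chunk`, so its body does not appear in this diff. As a rough sketch of the model being described -- a reconstruction under the stated broadcast assumption, not the committed implementation -- dissemination from a set of sources is a multi-source breadth-first search, where a node's latency in hops is its distance to the nearest source:

```r
# Rough sketch only (not the committed code in R/dissemination.R): under the
# broadcast assumption, every informed node forwards the block to all of its
# neighbors each round, so a node receives the block after as many rounds as
# its hop distance to the closest source.
library(igraph)

disseminate_broadcast_sketch <- function(g, sources) {
  # distances() performs a BFS per source on an unweighted graph; the
  # per-node minimum over sources is the arrival time, in hops, at each node.
  apply(distances(g, v = sources), 2, min)
}
```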
 
 We will take the overlays we had from before and run a simple experiment where we pick $1, 2, 3$ and $4$ starting nodes at random in the overlays, and compute the average dissemination times for those.
 
 ```{r}
 n_sources_max <- 4
 ```
 
-```{r eval=FALSE}
-map(parameters[1], function(parameter) {
-  map(1:n_sources_max, function(n_sources) {
-    latencies <- map(1:n_samples, function(instance) {
-      sources <- sample(1:parameter$v, size = n_sources, replace = FALSE)
-      latencies <- edge_lists |>
-        filter(
-          d == parameter$d,
-          v == parameter$v,
-          instance == !!instance
-        ) |>
-        as_overlay_graph() |>
-        disseminate_broadcast(sources)
-    }) |>
-      list_c() |>
-      quantile_df(c(0, 0.1, 0.25, 0.50, 0.75, 0.9, 0.95, 1)) |>
-      mutate(d = parameter$d, v = parameter$v, sources = n_sources)
-  }) |> bind_rows()
-}) |> bind_rows()
+```{r cache = TRUE}
+# Pre-generates graphs as otherwise this will murder simulation performance.
+dataset(
+  graphs,
+  storage = 'rds',
+  map(parameters, function(parameter) {
+    pbmclapply(
+      1:n_samples,
+      function(instance) {
+        list(
+          d = parameter$d,
+          v = parameter$v,
+          instance = instance,
+          g = edge_lists |>
+            filter(d == parameter$d, v == parameter$v, instance == !!instance) |>
+            as_overlay_graph()
+        )
+      },
+      mc.cores = 2 # don't go overboard or it will suck up your RAM and crash your machine :-)
+    )
+  }) |> list_c()
+)
 ```
 
+```{r}
+dataset(
+  latency_stats,
+  storage = 'csv',
+  map(parameters, function(parameter) {
+    graph_instances <- graphs |> filter(v == parameter$v & d == parameter$d)
+    map(1:n_sources_max, function(n_sources) {
+      raw_latencies <- map(graph_instances, function(graph) {
+        sources <- sample(1:graph$v, size = n_sources, replace = FALSE)
+        disseminate_broadcast(graph$g, sources)
+      }) |>
+        list_c()
+
+      raw_latencies |>
+        quantile_df(c(0, 0.1, 0.25, 0.50, 0.75, 0.9, 0.95, 1)) |>
+        rename(stat = quant) |>
+        bind_rows(
+          tibble(
+            stat = c('mean', 'variance', 'sd'),
+            val = c(mean(raw_latencies), var(raw_latencies), sd(raw_latencies))
+          )
+        ) |>
+        mutate(
+          d = parameter$d,
+          v = parameter$v,
+          sources = n_sources
+        )
+    }) |> bind_rows()
+  }) |> bind_rows()
+)
+```
 
+Fig. \@ref(fig:latencies) shows a grid in which each cell contains a latency $\times$ swarm size (log scale) subplot, filtered by a combination of latency percentile (columns) and initial bootstrap degree, or $d$ (rows). This means that at cell `percentile: 50` $\times$ `d: 2` we should see the median ($50^{th}$ percentile) latencies for swarms with bootstrap degree $d = 2$. Lines are then coloured based on the number of cache nodes present in the swarm ($1$, $2$, $3$, or $4$).
+
+With that in mind, what we see is that:
+
+1. dissemination times appear to increase logarithmically with swarm size at all percentiles, as evidenced by the straight lines in the log plots;
+2. increasing $d$ changes the slope of that increase (the base of the log), with the most visibly dramatic change happening from $d = 1$ to $d = 2$. In particular, latencies stay below $10$ hops for all swarm sizes once $d \geq 2$, and the median latency is below $5$ hops;
+3. adding cache nodes appears to shift the intercept of the curve downwards, even with a very small number of cache nodes.
+
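The logarithmic growth in (1) and the role of $d$ in (2) are what one would expect if these overlays behave like sparse random graphs. As a back-of-the-envelope reference (not part of the simulations), the diameter of a random $d$-regular graph on $v$ vertices with $d \geq 3$ is known to concentrate around

$$\mathrm{diam}(G) \approx \frac{\log v}{\log(d - 1)}$$

so hop counts grow as $\log v$, and increasing the degree changes the base of that logarithm -- that is, the slope of the lines in the log-scale plots.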
+```{r latencies, fig.width = 15, fig.height = 12, fig.cap="**(click to zoom)** Latency percentiles as a function of swarm size and bootstrap degree (d)."}
+ggplot(
+  latency_stats |>
+    # Plot the percentile rows only; the mean/variance/sd summary rows live
+    # in the same table.
+    filter(!(stat %in% c('mean', 'variance', 'sd'))) |>
+    rename(percentile = stat)
+) +
+  geom_hline(yintercept = 5, lty = 2, col = 'orange') +
+  geom_line(aes(x = v, y = val, col = as.factor(sources)), lwd = 1.1) +
+  facet_grid(d ~ percentile, labeller = label_both) +
+  xlab("swarm size (log)") +
+  ylab("latency (hops)") +
+  theme_playax() +
+  theme(legend.position = "bottom") +
+  labs(col = 'cache nodes') +
+  scale_x_log10() +
+  big_fonts(15)
+```
+
+Assuming a small block size and an RTT of $s$, this would mean a propagation time proportional to $5 \times s$ for a single block in the absence of congested links. Finally, we see that this loosely tracks the shape of the curves for the diameters of graphs of a given size (Fig. \@ref(fig:diameters)), which is unsurprising.
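To make that concrete with an illustrative number: at $s = 100\,ms$, a median of $5$ hops would put single-block propagation at around $5 \times 100\,ms = 500\,ms$.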
+
+```{r}
+dataset(
+  graph_diameters,
+  storage = 'csv',
+  map(graphs, function(graph)
+    tibble(
+      d = graph$d,
+      v = graph$v,
+      instance = graph$instance,
+      diameter = diameter(graph$g)
+    )
+  ) |>
+    bind_rows()
+)
+```
+
+```{r diameters, fig.width = 10, fig.height = 4, fig.cap = "Swarm diameter percentiles as a function of their size."}
+ggplot(
+  graph_diameters |>
+    group_by(d, v) |>
+    reframe(quantile_df(diameter, probs = c(0, 0.1, 0.25, 0.50, 0.75, 0.9, 0.95, 1)))
+) +
+  geom_line(aes(x = v, y = val, col = as.factor(d))) +
+  facet_grid(. ~ quant) +
+  xlab("swarm size (log)") +
+  ylab("diameter") +
+  scale_x_log10() +
+  theme_playax() +
+  theme(legend.position = 'bottom') +
+  labs(col = 'bootstrap degree (d)')
+```
+
+# Next Steps
+
+There are a few things that we could do next:
+
+1. tweak the bootstrap protocol so that it mitigates swarm age bias -- where older nodes get selected more often as neighbors as part of the bootstrap procedure -- thus reducing degree imbalances. This involves adding some intelligence to the DHT tracker so that it biases samples towards nodes that have been sampled less;
+2. run simulations with more realistic proportions of cache nodes. We constrained this to $1--4$ cache nodes for efficiency and to get results out faster, but expressing cache nodes as a percentage of swarm size may yield more interesting numbers;
+3. add more sophisticated network dynamics (e.g. churn, link delays, and multi-block simulations) to obtain more accurate estimates of performance and load.
+
+Other issues potentially worth exploring:
+
+1. performance of compressed bitsets vs. other set-reconciliation data structures, like inverted Bloom filters;
+2. techniques for achieving Byzantine fault tolerance on random overlays.
+
 # References
analysis/swarm-overlay-sims/tests/testthat.R (new file, +12)

@@ -0,0 +1,12 @@
+# This file is part of the standard setup for testthat.
+# It is recommended that you do not modify it.
+#
+# Where should you do additional test configuration?
+# Learn more about the roles of various files in:
+# * https://r-pkgs.org/testing-design.html#sec-tests-files-overview
+# * https://testthat.r-lib.org/articles/special-files.html
+
+library(testthat)
+library(`swarm-overlay-sims`)
+
+test_check("swarm-overlay-sims")
@@ -0,0 +1,58 @@
+test_that(
+  "should filter a list by predicate", {
+    a_list <- list(
+      list(a = 1, b = 2, d = 1),
+      list(a = 2, b = 1, d = 2),
+      list(a = 1, b = 2, d = 3)
+    )
+
+    expect_equal(a_list |> filter(a == 1 & b == 2), list(
+      list(a = 1, b = 2, d = 1),
+      list(a = 1, b = 2, d = 3)
+    ))
+  }
+)
+
+test_that(
+  "should factor caller context in predicate evaluation", {
+    a_list <- list(
+      list(a = 1, b = 2, d = 1),
+      list(a = 2, b = 1, d = 2)
+    )
+
+    x <- 1
+
+    expect_equal(a_list |> filter(a == 2 & b == x), list(
+      list(a = 2, b = 1, d = 2)
+    ))
+  }
+)
+
+test_that(
+  "should pull attribute as vector", {
+    a_list <- list(
+      list(a = 1, b = 2, d = 1),
+      list(a = 2, b = 1, d = 2),
+      list(a = 1, b = 2, d = 3),
+      list(b = 2, d = 3)
+    )
+
+    expect_equal(a_list |> pull(a), c(1, 2, 1, NA))
+  }
+)
+
+test_that(
+  "should return vector when c.XXX is defined", {
+    a_list <- list(
+      list(a = 1, date = as.Date('2003-03-03')),
+      list(a = 2, date = as.Date('2003-03-04')),
+      list(a = 1, date = as.Date('2003-03-05'))
+    )
+
+    column <- a_list |> pull(date)
+
+    expect_equal(class(column), "Date")
+    expect_equal(column, c(
+      as.Date('2003-03-03'), as.Date('2003-03-04'), as.Date('2003-03-05')))
+  }
+)