Block discovery simulator and analysis (#175)

* add block discovery simulator
* add analysis document for simpler cases of block discovery

parent 7ace179f2f
commit 1d23b31461

.Rbuildignore
@@ -0,0 +1,4 @@
^renv$
^renv\.lock$
^.*\.Rproj$
^\.Rproj\.user$

.Rprofile
@@ -0,0 +1 @@
source("renv/activate.R")

.gitignore
@@ -0,0 +1,5 @@
.Rproj.user
.RData
.Rhistory
*nb.html
rsconnect

DESCRIPTION
@@ -0,0 +1,18 @@
Package: blockdiscoverysim
Title: Block Discovery Simulator
Version: 0.0.0.9000
Description: Simple Simulation for Block Discovery
Encoding: UTF-8
Roxygen: list(markdown = TRUE)
RoxygenNote: 7.2.3
Depends:
    shiny (>= 1.7.4.1),
    tidyverse (>= 2.0.0),
    purrr (>= 1.0.1),
    VGAM (>= 1.1-8),
    R6 (>= 2.2.2),
    plotly (>= 4.10.2)
Suggests:
    devtools,
    testthat (>= 3.0.0)
Config/testthat/edition: 3

R/collate.R
@@ -0,0 +1,19 @@
# We do this hack because rsconnect doesn't seem to like us bundling the app
# as a package.

order <- c(
  'R/partition.R',
  'R/stats.R',
  'R/node.R',
  'R/sim.R'
)

library(R6)
library(purrr)
library(tidyverse)

lapply(order, source)

run <- function() {
  rmarkdown::run('./block-discovery-sim.Rmd')
}

R/node.R
@@ -0,0 +1,14 @@
Node <- R6Class(
  'Node',
  public = list(
    node_id = NULL,
    storage = NULL,

    initialize = function(node_id, storage) {
      self$node_id = node_id
      self$storage = storage
    },

    name = function() paste0('node ', self$node_id)
  )
)

R/partition.R
@@ -0,0 +1,18 @@
#' Generates a random partition of a block array among a set of nodes. The
#' partitioning follows the supplied distribution.
#'
#' @param block_array a vector containing blocks
#' @param network_size the number of nodes in the network
#' @param distribution a sample generator which generates a vector of n
#'   samples when called as distribution(n).
#'
partition <- function(block_array, network_size, distribution) {
  buckets <- distribution(length(block_array))

  # We won't attempt to shift the data, instead just checking that it is
  # positive.
  stopifnot(all(buckets >= 0))

  buckets <- trunc(buckets * (network_size - 1) / max(buckets)) + 1
  sapply(1:network_size, function(i) which(buckets == i))
}

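As a usage sketch (not part of the commit; assumes the package code is loaded, e.g. via `devtools::load_all()`), `partition()` can be exercised directly:

```r
# Partition 10 blocks among 3 nodes, drawing bucket weights from a
# uniform distribution.
partitions <- partition(
  block_array = 1:10,
  network_size = 3,
  distribution = runif
)
partitions  # a list: partitions[[i]] holds the block indices held by node i
```
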
R/sim.R
@@ -0,0 +1,30 @@
run_download_simulation <- function(swarm, max_steps, coding_rate) {
  total_blocks <- sum(sapply(swarm, function(node) length(node$storage)))
  required_blocks <- round(total_blocks * coding_rate)
  completed_blocks <- 0
  storage <- c()

  step <- 1
  stats <- Stats$new()
  while ((step < max_steps) && (completed_blocks < required_blocks)) {
    neighbor <- swarm |> select_neighbor()
    storage <- neighbor |> download_blocks(storage)

    completed_blocks <- length(storage)
    stats$add_stat(
      step = step,
      selected_neighbor = neighbor$node_id,
      total_blocks = total_blocks,
      required_blocks = required_blocks,
      completed_blocks = completed_blocks
    )

    step <- step + 1
  }

  stats$as_tibble()
}

select_neighbor <- function(neighborhood) neighborhood[[sample(1:length(neighborhood), size = 1)]]

download_blocks <- function(neighbor, storage) unique(c(neighbor$storage, storage))

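A usage sketch (again not part of the commit; assumes the package code is loaded): build a small swarm of 4 nodes holding 25 blocks each and run one simulation at a 50% coding rate.

```r
swarm <- lapply(1:4, function(i) {
  Node$new(node_id = i, storage = ((i - 1) * 25 + 1):(i * 25))
})
# Returns a tibble with one row per contact: step, selected_neighbor,
# total_blocks, required_blocks, completed_blocks.
run_download_simulation(swarm, max_steps = 100, coding_rate = 0.5)
```
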
R/stats.R
@@ -0,0 +1,17 @@
Stats <- R6Class(
  'Stats',
  public = list(
    stats = NULL,

    initialize = function() {
      self$stats = list(list())
    },

    add_stat = function(...) {
      self$stats <- c(self$stats, list(rlang::dots_list(...)))
      self
    },

    as_tibble = function() purrr::map_df(self$stats, as_tibble)
  )
)

README.md
@@ -0,0 +1,37 @@
Simple Block Discovery Simulator
================================

A simple simulator for understanding block discovery dynamics.

## Hosted Version

You can access the block discovery simulator on [shinyapps](https://gmega.shinyapps.io/block-discovery-sim/).

## Running

You will need R 4.1.2 with [renv](https://rstudio.github.io/renv/) installed. I also strongly recommend running this
from [RStudio](https://posit.co/products/open-source/rstudio/), as you will otherwise need to [install pandoc and set it up manually before running](https://stackoverflow.com/questions/28432607/pandoc-version-1-12-3-or-higher-is-required-and-was-not-found-r-shiny).

Once that's cared for and you are at the R prompt (the Console, in RStudio), you will need to first install the dependencies:

```R
> renv::install()
```

If you are outside RStudio, you will then need to restart your R session. After that, you should load the package:

```R
devtools::load_all()
```

run the tests:

```R
testthat::test_package('blockdiscoverysim')
```

and, if all goes well, launch the simulator:

```R
run()
```

block-discovery-sim.Rmd
@@ -0,0 +1,202 @@
---
title: "Block Discovery Sim"
output: html_document
runtime: shiny

# rsconnect uses this
resource_files:
- R/node.R
- R/partition.R
- R/sim.R
- R/stats.R
---

## Goal

The goal of this experiment is to understand, under different assumptions about how blocks are partitioned among nodes, how long a hypothetical downloader would take to discover enough blocks for a successful download by randomly sampling the swarm. We therefore do not account for download times or network latency; we just measure how many times the node samples the swarm before figuring out where enough of the blocks are.

```{r echo = FALSE, message = FALSE}
library(shiny)
library(plotly)

source('R/collate.R')

knitr::opts_chunk$set(echo = FALSE, message = FALSE)
```

```{r}
runs <- 10
max_steps <- Inf
```

```{r}
DISTRIBUTIONS <- list(
  'uniform' = runif,
  'exponential' = rexp,
  'pareto' = VGAM::rparetoI
)
```

## Network

* Select the parameters of the network you would like to use in the experiments.
* Preview the shape of the partitions by looking at the chart.
* Generate more random partitions by clicking "Generate Another".

```{r}
fluidPage(
  sidebarPanel(
    numericInput(
      'swarm_size',
      label = 'size of the swarm',
      value = 20,
      min = 1,
      max = 10000
    ),
    numericInput(
      'file_size',
      label = 'number of blocks in the file',
      value = 1000,
      min = 1,
      max = 1e6
    ),
    selectInput(
      'partition_distribution',
      label = 'shape of the distribution for the partitions',
      choices = names(DISTRIBUTIONS)
    ),
    actionButton(
      'generate_network',
      label = 'Generate Another'
    )
  ),
  mainPanel(
    plotOutput('network_sample')
  )
)
```

```{r}
observe({
  input$generate_network
  output$network_sample <- renderPlot({
    purrr::map_dfr(
      generate_network(
        number_of_blocks = input$file_size,
        network_size = input$swarm_size,
        partition_distribution = input$partition_distribution
      ),
      function(node) tibble(node_id = node$node_id, blocks = length(node$storage))
    ) %>%
      ggplot() +
      geom_bar(
        aes(x = node_id, y = blocks),
        stat = 'identity',
        col = 'black',
        fill = 'lightgray'
      ) +
      labs(x = 'node') +
      theme_minimal()
  })
})
```

## Experiment

Select the number of experiment runs. Each experiment will generate a network and then simulate a download operation in which a hypothetical node:

1. joins the swarm;
2. samples one neighbor per round, in a round-based download protocol, and asks for its block list.

The experiment ends when the downloading node has discovered "enough" blocks. If we let the total number of blocks in the file be $n$ and the coding rate be $r$, then the simulation ends when the set of blocks $D$ discovered by the downloading node satisfies $\left|D\right| \geq n \times r$.

We then show a "discovery curve": the curve that emerges as we plot the percentage of blocks the downloader has discovered so far as a function of the number of contacts it has made.

The curve is actually an average over all experiments, meaning that a point $(5, 10\%)$ should be interpreted as: "on average, after $5$ contacts, a downloader will have discovered $10\%$ of the blocks it needs for a successful download". We show the $5^{th}$ and $95^{th}$ percentiles of the experiments as error bands around the average.

```{r}
fluidPage(
  fluidRow(
    class = 'well',
    column(
      width = 6,
      sliderInput('runs', 'How many experiments to run', min = 10, max = 10000, value = 10),
      actionButton('do_run', 'Run')
    ),
    column(
      width = 6,
      numericInput('coding_rate', 'Coding rate (fraction of blocks required for a successful download)',
                   min = 0.1, max = 1.0, step = 0.05, value = 0.5)
    )
  )
)
```

```{r}
experiment_results <- reactive({
  lapply(1:input$runs, function(i) {
    generate_network(
      number_of_blocks = input$file_size,
      network_size = input$swarm_size,
      partition_distribution = input$partition_distribution
    ) |> run_experiment(run_id = i, coding_rate = input$coding_rate)
  })
}) |> bindEvent(
  input$do_run,
  ignoreNULL = TRUE,
  ignoreInit = TRUE
)
```

```{r}
renderPlotly({
  plot_results(do.call(rbind, experiment_results()))
})
```

```{r}
generate_network <- function(number_of_blocks, network_size, partition_distribution) {
  block_array <- sample(1:number_of_blocks, replace = FALSE)

  partitions <- partition(block_array, network_size, DISTRIBUTIONS[[partition_distribution]])
  sapply(1:network_size, function(i) Node$new(
    node_id = i,
    storage = partitions[[i]])
  )
}
```

```{r}
run_experiment <- function(network, coding_rate, run_id = 0) {
  run_download_simulation(
    swarm = network,
    coding_rate = coding_rate,
    max_steps = max_steps
  ) |> mutate(
    run = run_id
  )
}
```

```{r}
plot_results <- function(results) {
  stats <- results |>
    mutate(completion = pmin(1.0, completed_blocks / required_blocks)) |>
    group_by(step) |>
    summarise(
      average = mean(completion),
      p_95 = quantile(completion, 0.95),
      p_05 = quantile(completion, 0.05),
      .groups = 'drop'
    )

  plotly::ggplotly(
    ggplot(stats, aes(x = step)) +
      geom_line(aes(y = average), col = 'black', lwd = 1) +
      geom_ribbon(aes(ymin = p_05, ymax = p_95), fill = 'grey80', alpha = 0.5) +
      labs(x = 'contacts', y = 'blocks discovered (%)') +
      scale_y_continuous(labels = scales::percent_format()) +
      theme_minimal()
  )
}
```

blockdiscoverysim.Rproj
@@ -0,0 +1,17 @@
Version: 1.0

RestoreWorkspace: Default
SaveWorkspace: Default
AlwaysSaveHistory: Default

EnableCodeIndexing: Yes
UseSpacesForTab: Yes
NumSpacesForTab: 2
Encoding: UTF-8

RnwWeave: Sweave
LaTeX: pdfLaTeX

BuildType: Package
PackageUseDevtools: Yes
PackageInstallArgs: --no-multiarch --with-keep.source

renv.lock (file diff suppressed because it is too large)

renv/.gitignore
@@ -0,0 +1,7 @@
library/
local/
cellar/
lock/
python/
sandbox/
staging/

renv/activate.R (file diff suppressed because it is too large)

renv/settings.json
@@ -0,0 +1,17 @@
{
  "bioconductor.version": null,
  "external.libraries": [],
  "ignored.packages": [],
  "package.dependency.fields": [
    "Imports",
    "Depends",
    "LinkingTo"
  ],
  "r.version": null,
  "snapshot.type": "implicit",
  "use.cache": true,
  "vcs.ignore.cellar": true,
  "vcs.ignore.library": true,
  "vcs.ignore.local": true,
  "vcs.manage.ignores": true
}

tests/testthat.R
@@ -0,0 +1,11 @@
# This file is part of the standard setup for testthat.
# It is recommended that you do not modify it.
#
# Where should you do additional test configuration?
# Learn more about the roles of various files in:
# * https://r-pkgs.org/tests.html
# * https://testthat.r-lib.org/reference/test_package.html#special-files

library(testthat)

test_check("blockdiscoverysim")

tests/testthat/test_partition.R
@@ -0,0 +1,18 @@
test_that(
  "should partition into linearly scaled buckets", {
    samples <- c(1, 100, 500, 800, 850)

    partitions <- partition(
      block_array = 1:5,
      network_size = 4,
      distribution = function(n) samples[1:n]
    )

    expect_equal(partitions, list(
      c(1, 2),
      c(3),
      c(4),
      c(5))
    )
  }
)

tests/testthat/test_stats.R
@@ -0,0 +1,17 @@
test_that(
  "should collect stats as they are input", {
    stats <- Stats$new()

    stats$add_stat(a = 1, b = 2, name = 'hello')
    stats$add_stat(a = 1, b = 3, name = 'world')

    expect_equal(
      stats$as_tibble(),
      tribble(
        ~a, ~b, ~name,
        1, 2, 'hello',
        1, 3, 'world',
      )
    )
  }
)

@@ -0,0 +1,104 @@
---
title: "Block Discovery Problem"
output:
  bookdown::gitbook:
    number_sections: false
---

$$
\newcommand{\rv}[1]{\textbf{#1}}
\newcommand{\imin}{\rv{I}_{\text{min}}}
$$

## Problem Statement

Let $F = \left\{b_1, \cdots, b_m\right\}$ be an erasure-coded file, and let $O = \left\{o_1, \cdots, o_n\right\}$ be a set of nodes storing that file. We define a _storage function_ $s : O \longrightarrow 2^F$ as a function mapping each node in $O$ to the subset of $F$ it stores.

In the simplified block discovery problem, we have a _downloader node_ which is attempting to construct a subset $D \subseteq F$ of blocks by repeatedly sampling nodes from $O$. "Discovery", in this context, can be seen as the downloader node running a round-based protocol where, at round $i$, it samples a random contact $o_i$ and learns about $s(o_i)$.

To make this slightly more formal, we denote by $D_i \subseteq F$ the set of blocks that the downloader has learned after the $i^{th}$ contact. By the way the protocol works, we have that:

$$
\begin{equation}
D_i = D_{i - 1} \cup s(o_i)
(\#eq:discovery)
\end{equation}
$$

Since the file is erasure coded, the goal of the downloader is to learn some $D_i$ such that:

$$
\begin{equation}
\left|D_i\right| \geq c \times \left|F\right|
(\#eq:complete)
\end{equation}
$$

where $0 < c \leq 1$ is the coding rate; i.e., the fraction of blocks required for a successful download. When $D_i$ satisfies Eq. \@ref(eq:complete), we say that $D_i$ is _complete_. We can then state the problem as follows.

**Statement.** Let $\imin$ be a random variable representing the first round at which $D_i$ is complete. We want to estimate $F(i) = \mathbb{P}(\imin \leq i)$; namely, the probability that the downloader has discovered enough blocks to complete the download by round $i$.

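Although the sections below treat special cases analytically, $F(i)$ can always be estimated empirically. A minimal Monte Carlo sketch (illustrative only; `estimate_completion_cdf` is a hypothetical helper, not part of this commit):

```r
# Monte Carlo estimate of F(i) = P(I_min <= i). `s` is the storage function,
# represented as a list mapping each node to the set of blocks it stores.
estimate_completion_cdf <- function(s, m, c_rate, runs = 1000, max_rounds = 1000) {
  first_complete <- replicate(runs, {
    D <- integer(0)
    rounds <- 0
    while (length(D) < c_rate * m && rounds < max_rounds) {
      rounds <- rounds + 1
      D <- union(D, s[[sample(length(s), 1)]])  # contact a random node
    }
    rounds  # first round at which D_i was complete (capped at max_rounds)
  })
  ecdf(first_complete)
}

F_hat <- estimate_completion_cdf(s = list(1:40, 41:70, 71:100), m = 100, c_rate = 0.5)
F_hat(2)  # estimated probability of completion within 2 contacts
```
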
## Case (1) - Erasure Coding but no Replication

If we assume there is no replication then, unless we contact the same node twice, every node we contact contributes new information. Indeed, the absence of replication implies, necessarily, that the partitions are pairwise disjoint:

$$
\begin{equation}
s(o) \cap s(o') = \emptyset, \quad \forall o \neq o' \in O
(\#eq:disjoint)
\end{equation}
$$

So that, if we are contacting a new node at round $i$, we must necessarily have that:

$$
\begin{equation}
\left|D_{i}\right| \stackrel{1}{=} \left|D_{i - 1} \cup s(o_i)\right| \stackrel{2}{=} \left|D_{i - 1}\right| + \left|s(o_i)\right|
(\#eq:monotonic)
\end{equation}
$$

where (1) follows from Eq. \@ref(eq:discovery), and (2) follows from the $s(o_i)$ being pairwise disjoint (Eq. \@ref(eq:disjoint)). This leads to the corollary:

**Corollary 1.** If the partitions are additionally of equal size $m/n$, then after $\lceil c \times n\rceil$ contacts to distinct nodes the downloader will necessarily have learned enough blocks to download $F$,

which follows from Eq. \@ref(eq:monotonic): every new contact adds $m/n$ blocks, so that $\left|D_{\lceil c \times n\rceil}\right| \geq \lceil c \times n\rceil \times m/n \geq c \times m$; i.e., $D_{\lceil c \times n\rceil}$ must be complete. $\blacksquare$

As for $F(i)$, note that we can estimate the probability of completion by estimating the probability that $|D_i|$ exceeds the completion threshold (Eq. \@ref(eq:complete)). What exactly that looks like, and how tractable it is, depends on the formulation we give it.

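A quick sanity check of Corollary 1 (illustrative only, under the equal-partition assumption above):

```r
# n disjoint, equal-sized partitions of m blocks: contacting ceiling(c * n)
# distinct nodes always yields at least c * m blocks.
n <- 10; m <- 100; c_rate <- 0.5
partitions <- split(1:m, rep(1:n, each = m / n))
contacts <- sample(n, ceiling(c_rate * n))          # distinct nodes
length(unlist(partitions[contacts])) >= c_rate * m  # always TRUE
```
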
### Independent Partition Sizes

Suppose we knew the distribution of partition sizes in $O$, i.e., we knew that the number of blocks assigned to a node in $O$ follows some distribution $\mathcal{B}$ (e.g., a truncated Gaussian).

If we have a "large enough" network, this means we would be able to approximate the number of blocks assigned to each node as $n$ independent random variables $\rv{Y}_i$, where $\rv{Y}_i \sim \mathcal{B}$. In that case, we would be able to express the total number of blocks learned by the downloader by round $i$ as a random variable $\rv{L}_i$ which is the sum of the iid random variables $\rv{Y}_j \sim \mathcal{B}$:

$$
\begin{equation}
\rv{L}_i \sim \sum_{j = 1}^{i} \rv{Y}_j
(\#eq:learning-sum)
\end{equation}
$$

The shape of the distribution would be the $i$-fold convolution of $\mathcal{B}$ with itself, which can be tractable for some distributions.

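As a concrete illustration (the Poisson choice is an assumption made here precisely because its convolutions are tractable, not something the simulator uses): if $\mathcal{B} = \text{Poisson}(\lambda)$, then $\rv{L}_i \sim \text{Poisson}(i\lambda)$, which we can check against a Monte Carlo estimate.

```r
lambda <- 12; i <- 5
mc <- replicate(10000, sum(rpois(i, lambda)))  # Monte Carlo draws of L_i
ppois(60, i * lambda)  # closed-form P(L_i <= 60)...
mean(mc <= 60)         # ...agrees with the simulated estimate
```
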
More interestingly, though, Eq. \@ref(eq:learning-sum) allows us to express a $\mathcal{B}$-independent estimate of the average number of rounds a downloader will undergo before completing a download. We have that:

$$
\mathbb{E}(\rv{L}_i) = \sum_{j = 1}^i \mathbb{E}(\rv{Y}_j) = i\,\mathbb{E}(\rv{Y}) = i\times \mu_{\rv{Y}}
$$

We can then plug this into the completion condition (Eq. \@ref(eq:complete)) and solve for $i$:

$$
\begin{equation}
i \times \mu_{\rv{Y}} \geq c \times |F| \iff i \geq \frac{c \times |F|}{\mu_{\rv{Y}}}
(\#eq:average-completion)
\end{equation}
$$

Note that this is intuitive to the point of being trivial. If we let $c = 1$, we get $i \geq |F|/\mu_{\rv{Y}}$, which just means that, on average, the node will have to sample a number of nodes equal to the number of blocks divided by the average partition size. In practice, we can use $\overline{\mu_{\rv{Y}}} = \frac{1}{n}\sum_{i=1}^{n} \left|s(o_i)\right|$ instead of $\mu_{\rv{Y}}$ to estimate what $i$ can look like.

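Plugging in numbers (hypothetical partition sizes, for illustration only):

```r
# Average rounds to completion from Eq. (average-completion), using the
# empirical mean partition size.
partition_sizes <- c(120, 80, 95, 110, 70, 105, 90, 85, 115, 130)  # |s(o_i)|
mu_hat <- mean(partition_sizes)  # empirical mean partition size
m <- sum(partition_sizes)        # total blocks (partitions are disjoint)
c_rate <- 0.5                    # coding rate
ceiling(c_rate * m / mu_hat)     # => 5 contacts on average
```
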
### Non-Independent Partition Sizes

If we cannot approximate partition sizes as independent random variables, then the problem changes. Stripping it down, we can cast it as follows. We have a set of integers $P = \{p_1, \cdots, p_n\}$ representing the sizes of each partition. We then want to understand the distribution of the partial sums of random permutations of $P$.

As I understand it, there is no good way of addressing this without running simulations. The difference is that if we assume disjoint partitions, the simulations are a lot simpler, as we do not need to track the contents of $D_i$.

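A minimal sketch of that simpler simulation (illustrative only; `rounds_to_complete` is a hypothetical helper, not part of this commit):

```r
# With disjoint partitions, a run reduces to the partial sums of a random
# permutation of the partition sizes; we never track D_i's contents.
rounds_to_complete <- function(sizes, c_rate) {
  target <- c_rate * sum(sizes)
  sums <- cumsum(sample(sizes))  # partial sums of a random permutation
  which(sums >= target)[1]       # first round at which D_i is complete
}

sizes <- c(1, 100, 500, 800, 850)
i_min <- replicate(10000, rounds_to_complete(sizes, c_rate = 0.5))
mean(i_min <= 2)  # Monte Carlo estimate of F(2)
```
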