Block discovery simulator and analysis (#175)

* add block discovery simulator
* add analysis document for simpler cases of block discovery

This commit is contained in:
  parent 7ace179f2f
  commit 1d23b31461

analysis/block-discovery-sim/.Rbuildignore (new file, 4 lines)
@@ -0,0 +1,4 @@
^renv$
^renv\.lock$
^.*\.Rproj$
^\.Rproj\.user$

analysis/block-discovery-sim/.Rprofile (new file, 1 line)
@@ -0,0 +1 @@
source("renv/activate.R")

analysis/block-discovery-sim/.gitignore (new file, vendored, 5 lines)
@@ -0,0 +1,5 @@
.Rproj.user
.RData
.Rhistory
*nb.html
rsconnect

analysis/block-discovery-sim/DESCRIPTION (new file, 18 lines)
@@ -0,0 +1,18 @@
Package: blockdiscoverysim
Title: Block Discovery Simulator
Version: 0.0.0.9000
Description: Simple Simulation for Block Discovery
Encoding: UTF-8
Roxygen: list(markdown = TRUE)
RoxygenNote: 7.2.3
Depends:
    shiny (>= 1.7.4.1),
    tidyverse (>= 2.0.0),
    purrr (>= 1.0.1),
    VGAM (>= 1.1-8),
    R6 (>= 2.2.2),
    plotly (>= 4.10.2)
Suggests:
    devtools,
    testthat (>= 3.0.0)
Config/testthat/edition: 3

analysis/block-discovery-sim/R/collate.R (new file, 19 lines)
@@ -0,0 +1,19 @@
# We do this hack because rsconnect doesn't seem to like us bundling the app
# as a package.

order <- c(
  'R/partition.R',
  'R/stats.R',
  'R/node.R',
  'R/sim.R'
)

library(R6)
library(purrr)
library(tidyverse)

lapply(order, source)

run <- function() {
  rmarkdown::run('./block-discovery-sim.Rmd')
}

analysis/block-discovery-sim/R/node.R (new file, 14 lines)
@@ -0,0 +1,14 @@
Node <- R6Class(
  'Node',
  public = list(
    node_id = NULL,
    storage = NULL,

    initialize = function(node_id, storage) {
      self$node_id = node_id
      self$storage = storage
    },

    name = function() paste0('node ', self$node_id)
  )
)

analysis/block-discovery-sim/R/partition.R (new file, 18 lines)
@@ -0,0 +1,18 @@
#' Generates a random partition of a block array among a set of nodes. The
#' partitioning follows the supplied distribution.
#'
#' @param block_array a vector containing blocks
#' @param network_size the number of nodes in the network
#' @param distribution a sample generator which generates a vector of n
#'   samples when called as distribution(n).
#'
partition <- function(block_array, network_size, distribution) {
  buckets <- distribution(length(block_array))

  # We won't attempt to shift the data, instead just checking that it is
  # positive.
  stopifnot(all(buckets >= 0))

  buckets <- trunc(buckets * (network_size - 1) / max(buckets)) + 1
  sapply(1:network_size, function(i) which(buckets == i))
}
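
A quick way to see what `partition` does is to drive it with one of the sample generators the simulator uses. A minimal usage sketch (all sizes here are illustrative):

```R
# Spread 100 blocks over 5 nodes, with bucket assignment driven by uniform
# samples; each list entry holds the indices of the blocks given to that node.
partitions <- partition(block_array = 1:100, network_size = 5, distribution = runif)
lengths(partitions)  # how many blocks landed on each node
```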

analysis/block-discovery-sim/R/sim.R (new file, 30 lines)
@@ -0,0 +1,30 @@
run_download_simulation <- function(swarm, max_steps, coding_rate) {
  total_blocks <- sum(sapply(swarm, function(node) length(node$storage)))
  required_blocks <- round(total_blocks * coding_rate)
  completed_blocks <- 0
  storage <- c()

  step <- 1
  stats <- Stats$new()
  while ((step < max_steps) && (completed_blocks < required_blocks)) {
    neighbor <- swarm |> select_neighbor()
    storage <- neighbor |> download_blocks(storage)

    completed_blocks <- length(storage)
    stats$add_stat(
      step = step,
      selected_neighbor = neighbor$node_id,
      total_blocks = total_blocks,
      required_blocks = required_blocks,
      completed_blocks = completed_blocks
    )

    step <- step + 1
  }

  stats$as_tibble()
}

select_neighbor <- function(neighborhood) neighborhood[[sample(1:length(neighborhood), size = 1)]]

download_blocks <- function(neighbor, storage) unique(c(neighbor$storage, storage))
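
A minimal usage sketch for the simulation loop, assuming the package code is loaded (e.g. via `devtools::load_all()`) so that `Node`, `Stats`, and `run_download_simulation` are in scope; the swarm below is made up for illustration:

```R
# Ten nodes holding ten disjoint blocks each (100 blocks total). With a
# coding rate of 0.5, the run stops once 50 distinct blocks are discovered.
swarm <- lapply(1:10, function(i) Node$new(node_id = i, storage = ((i - 1) * 10 + 1):(i * 10)))
trace <- run_download_simulation(swarm, max_steps = Inf, coding_rate = 0.5)
tail(trace, 1)  # one row per contact: step, selected_neighbor, completed_blocks, ...
```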

analysis/block-discovery-sim/R/stats.R (new file, 17 lines)
@@ -0,0 +1,17 @@
Stats <- R6Class(
  'Stats',
  public = list(
    stats = NULL,

    initialize = function() {
      self$stats = list(list())
    },

    add_stat = function(...) {
      self$stats <- c(self$stats, list(rlang::dots_list(...)))
      self
    },

    as_tibble = function() purrr::map_df(self$stats, as_tibble)
  )
)

analysis/block-discovery-sim/README.md (new file, 37 lines)
@@ -0,0 +1,37 @@
Simple Block Discovery Simulator
================================

A simple simulator for understanding block discovery dynamics.

## Hosted Version

You can access the block discovery simulator on [shinyapps](https://gmega.shinyapps.io/block-discovery-sim/).

## Running

You will need R 4.1.2 with [renv](https://rstudio.github.io/renv/) installed. I also strongly recommend you run this
from [RStudio](https://posit.co/products/open-source/rstudio/), as you will otherwise need to [install pandoc and set it up manually before running](https://stackoverflow.com/questions/28432607/pandoc-version-1-12-3-or-higher-is-required-and-was-not-found-r-shiny).

Once that's taken care of and you are in the R terminal (Console in RStudio), you will first need to install the dependencies:

```R
> renv::install()
```

If you are outside RStudio, you will then need to restart your R session. After that, you should load the package:

```R
devtools::load_all()
```

run the tests:

```R
testthat::test_package('blockdiscoverysim')
```

and, if all goes well, launch the simulator:

```R
run()
```

analysis/block-discovery-sim/block-discovery-sim.Rmd (new file, 202 lines)
@@ -0,0 +1,202 @@
---
title: "Block Discovery Sim"
output: html_document
runtime: shiny

# rsconnect uses this
resource_files:
- R/node.R
- R/partition.R
- R/sim.R
- R/stats.R
---

## Goal

The goal of this experiment is to understand -- under different assumptions about how blocks are partitioned among nodes -- how long a hypothetical downloader would take to discover enough blocks to make a successful download from storage nodes by randomly sampling the swarm. We therefore do not account for download times or network latency -- we just measure how many times the node randomly samples the swarm before figuring out where enough of the blocks are.

```{r echo = FALSE, message = FALSE}
library(shiny)
library(plotly)

source('R/collate.R')

knitr::opts_chunk$set(echo = FALSE, message = FALSE)
```

```{r}
runs <- 10
max_steps <- Inf
```

```{r}
DISTRIBUTIONS <- list(
  'uniform' = runif,
  'exponential' = rexp,
  'pareto' = VGAM::rparetoI
)
```

## Network

* Select the parameters of the network you would like to use in the experiments.
* Preview the shape of the partitions by looking at the chart.
* Generate more random partitions by clicking "Generate Another".

```{r}
fluidPage(
  sidebarPanel(
    numericInput(
      'swarm_size',
      label = 'size of the swarm',
      value = 20,
      min = 1,
      max = 10000
    ),
    numericInput(
      'file_size',
      label = 'number of blocks in the file',
      value = 1000,
      min = 1,
      max = 1e6
    ),
    selectInput(
      'partition_distribution',
      label = 'shape of the distribution for the partitions',
      choices = names(DISTRIBUTIONS)
    ),
    actionButton(
      'generate_network',
      label = 'Generate Another'
    )
  ),
  mainPanel(
    plotOutput('network_sample')
  )
)
```

```{r}
observe({
  input$generate_network
  output$network_sample <- renderPlot({
    purrr::map_dfr(
      generate_network(
        number_of_blocks = input$file_size,
        network_size = input$swarm_size,
        partition_distribution = input$partition_distribution
      ),
      function(node) tibble(node_id = node$node_id, blocks = length(node$storage))
    ) %>%
      ggplot() +
      geom_bar(
        aes(x = node_id, y = blocks),
        stat = 'identity',
        col = 'black',
        fill = 'lightgray'
      ) +
      labs(x = 'node') +
      theme_minimal()
  })
})
```

## Experiment

Select the number of experiment runs. Each experiment will generate a network and then simulate a download operation where a hypothetical node:

1. joins the swarm;
2. samples one neighbor per round in a round-based download protocol and asks for its block list.

The experiment ends when the downloading node recovers "enough" blocks. If we let the total number of blocks in the file be $n$ and the coding rate $r$, then the simulation ends when the set of blocks $D$ discovered by the downloading node satisfies $\left|D\right| \geq n \times r$.

We then show a "discovery curve": the curve that emerges as we look at the percentage of blocks the downloader has discovered so far as a function of the number of contacts it has made.

The curve is actually an average over all experiments, meaning that a point $(5, 10\%)$ should be interpreted as: "on average, after $5$ contacts, a downloader will have discovered $10\%$ of the blocks it needs for a successful download". We show the $5^{th}$ and $95^{th}$ percentiles of the experiments as error bands around the average.
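
As a concrete instance of this stopping rule (numbers chosen purely for illustration): with $n = 1000$ blocks and a coding rate $r = 0.5$, a run ends as soon as

$$
\left|D\right| \geq 1000 \times 0.5 = 500
$$

distinct blocks have been discovered.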

```{r}
fluidPage(
  fluidRow(
    class = 'well',
    column(
      width = 6,
      sliderInput('runs', 'How many experiments to run', min = 10, max = 10000, value = 10),
      actionButton('do_run', 'Run')
    ),
    column(
      width = 6,
      numericInput('coding_rate', 'Coding rate (percentage of blocks required for a successful download)',
                   min = 0.1, max = 1.0, step = 0.05, value = 0.5)
    )
  )
)
```

```{r}
experiment_results <- reactive({
  lapply(1:input$runs, function(i) {
    generate_network(
      number_of_blocks = input$file_size,
      network_size = input$swarm_size,
      partition_distribution = input$partition_distribution
    ) |> run_experiment(run_id = i, coding_rate = input$coding_rate)
  })
}) |> bindEvent(
  input$do_run,
  ignoreNULL = TRUE,
  ignoreInit = TRUE
)
```

```{r}
renderPlotly({
  plot_results(do.call(rbind, experiment_results()))
})
```

```{r}
generate_network <- function(number_of_blocks, network_size, partition_distribution) {
  block_array <- sample(1:number_of_blocks, replace = FALSE)

  partitions <- partition(block_array, network_size, DISTRIBUTIONS[[partition_distribution]])
  sapply(1:network_size, function(i) Node$new(
    node_id = i,
    storage = partitions[[i]])
  )
}
```

```{r}
run_experiment <- function(network, coding_rate, run_id = 0) {
  run_download_simulation(
    swarm = network,
    coding_rate = coding_rate,
    max_steps = max_steps
  ) |> mutate(
    run = run_id
  )
}
```

```{r}
plot_results <- function(results) {
  stats <- results |>
    mutate(completion = pmin(1.0, completed_blocks / required_blocks)) |>
    group_by(step) |>
    summarise(
      average = mean(completion),
      p_95 = quantile(completion, 0.95),
      p_05 = quantile(completion, 0.05),
      .groups = 'drop'
    )

  plotly::ggplotly(ggplot(stats, aes(x = step)) +
    geom_line(aes(y = average), col = 'black', lwd = 1) +
    geom_ribbon(aes(ymin = p_05, ymax = p_95), fill = 'grey80', alpha = 0.5) +
    labs(x = 'contacts', y = 'blocks discovered (%)') +
    scale_y_continuous(labels = scales::percent_format()) +
    theme_minimal())
}
```

analysis/block-discovery-sim/block-discovery-sim.Rproj (new file, 17 lines)
@@ -0,0 +1,17 @@
Version: 1.0

RestoreWorkspace: Default
SaveWorkspace: Default
AlwaysSaveHistory: Default

EnableCodeIndexing: Yes
UseSpacesForTab: Yes
NumSpacesForTab: 2
Encoding: UTF-8

RnwWeave: Sweave
LaTeX: pdfLaTeX

BuildType: Package
PackageUseDevtools: Yes
PackageInstallArgs: --no-multiarch --with-keep.source

analysis/block-discovery-sim/renv.lock (new file, 2194 lines)
File diff suppressed because it is too large.

analysis/block-discovery-sim/renv/.gitignore (new file, vendored, 7 lines)
@@ -0,0 +1,7 @@
library/
local/
cellar/
lock/
python/
sandbox/
staging/

analysis/block-discovery-sim/renv/activate.R (new file, 1032 lines)
File diff suppressed because it is too large.

analysis/block-discovery-sim/renv/settings.json (new file, 17 lines)
@@ -0,0 +1,17 @@
{
  "bioconductor.version": null,
  "external.libraries": [],
  "ignored.packages": [],
  "package.dependency.fields": [
    "Imports",
    "Depends",
    "LinkingTo"
  ],
  "r.version": null,
  "snapshot.type": "implicit",
  "use.cache": true,
  "vcs.ignore.cellar": true,
  "vcs.ignore.library": true,
  "vcs.ignore.local": true,
  "vcs.manage.ignores": true
}

analysis/block-discovery-sim/tests/testthat.R (new file, 11 lines)
@@ -0,0 +1,11 @@
# This file is part of the standard setup for testthat.
# It is recommended that you do not modify it.
#
# Where should you do additional test configuration?
# Learn more about the roles of various files in:
# * https://r-pkgs.org/tests.html
# * https://testthat.r-lib.org/reference/test_package.html#special-files

library(testthat)

test_check("blockdiscoverysim")

analysis/block-discovery-sim/tests/testthat/test-partition.R (new file, 18 lines)
@@ -0,0 +1,18 @@
test_that(
  "should partition into linearly scaled buckets", {
    samples <- c(1, 100, 500, 800, 850)

    partitions <- partition(
      block_array = 1:5,
      network_size = 4,
      distribution = function(n) samples[1:n]
    )

    expect_equal(partitions, list(
      c(1, 2),
      c(3),
      c(4),
      c(5))
    )
  }
)

analysis/block-discovery-sim/tests/testthat/test-stats.R (new file, 17 lines)
@@ -0,0 +1,17 @@
test_that(
  "should collect stats as they are input", {
    stats <- Stats$new()

    stats$add_stat(a = 1, b = 2, name = 'hello')
    stats$add_stat(a = 1, b = 3, name = 'world')

    expect_equal(
      stats$as_tibble(),
      tribble(
        ~a, ~b, ~name,
        1, 2, 'hello',
        1, 3, 'world'
      )
    )
  }
)

analysis/block-discovery.Rmd (new file, 104 lines)
@@ -0,0 +1,104 @@
---
title: "Block Discovery Problem"
output:
  bookdown::gitbook:
    number_sections: false
---

$$
\newcommand{\rv}[1]{\textbf{#1}}
\newcommand{\imin}{\rv{I}_{\text{min}}}
$$

## Problem Statement

Let $F = \left\{b_1, \cdots, b_m\right\}$ be an erasure-coded file, and let $O = \left\{o_1, \cdots, o_n\right\}$ be a set of nodes storing that file. We define a _storage function_ $s : O \longrightarrow 2^F$ as a function mapping nodes in $O$ to subsets of $F$.

In the simplified block discovery problem, we have a _downloader node_ which is attempting to construct a subset $D \subseteq F$ of blocks by repeatedly sampling nodes from $O$. "Discovery", in this context, can be seen as the downloader node running a round-based protocol where, at round $i$, it samples a random contact $o_i$ and learns about $s(o_i)$.

To make this slightly more formal, we denote by $D_i \subseteq F$ the set of blocks that the downloader has learned about after the $i^{th}$ contact. By the way the protocol works, we have that:

$$
\begin{equation}
D_i = D_{i - 1} \cup s(o_i)
(\#eq:discovery)
\end{equation}
$$

Since the file is erasure coded, the goal of the downloader is to learn some $D_i$ such that:

$$
\begin{equation}
\left|D_i\right| \geq c \times \left|F\right|
(\#eq:complete)
\end{equation}
$$

When $D_i$ satisfies Eq. \@ref(eq:complete), we say that $D_i$ is _complete_. We can then state the problem as follows.

**Statement.** Let $\imin$ be a random variable representing the first round at which $D_i$ is complete. We want to estimate $F(i) = \mathbb{P}(\imin \leq i)$; namely, the probability that the downloader has discovered enough blocks for a successful download by round $i$.
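
Since $F(i)$ is exactly what a simulation can estimate empirically, a minimal sketch of that estimation may help fix ideas. Here $s$ is represented as a plain list mapping node indices to block IDs; all names and numbers are illustrative rather than part of the analysis:

```R
# One discovery run: sample random contacts until D_i is complete, returning
# I_min, the first round at which |D_i| >= c * m.
discovery_rounds <- function(s, m, c) {
  D <- integer(0)
  i <- 0
  while (length(D) < c * m) {
    o_i <- sample(seq_along(s), 1)  # random contact at round i
    D <- union(D, s[[o_i]])         # D_i = D_{i-1} U s(o_i)
    i <- i + 1
  }
  i
}

# Empirical F(i) = P(I_min <= i) over many runs, for a made-up s:
s <- split(1:100, rep(1:10, each = 10))  # 10 nodes, 10 disjoint blocks each
imin_samples <- replicate(1000, discovery_rounds(s, m = 100, c = 0.5))
F_hat <- function(i) mean(imin_samples <= i)
```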

## Case (1) - Erasure Coding but no Replication

If we assume there is no replication then, unless we contact the same node twice, every node we contact contributes new information. Indeed, the absence of replication implies, necessarily, that the partitions are pairwise disjoint:

$$
\begin{equation}
s(o) \cap s(o') = \emptyset, \quad \forall o \neq o' \in O
(\#eq:disjoint)
\end{equation}
$$

so that, if we are contacting a new node at round $i$, we must necessarily have that:

$$
\begin{equation}
\left|D_{i}\right| \stackrel{1}{=} \left|D_{i - 1} \cup s(o_i)\right| \stackrel{2}{=} \left|D_{i - 1}\right| + \left|s(o_i)\right|
(\#eq:monotonic)
\end{equation}
$$

where (1) follows from Eq. \@ref(eq:discovery), and (2) follows from the $s(o_i)$ being pairwise disjoint (Eq. \@ref(eq:disjoint)). This leads to the corollary:

**Corollary 1.** After $\lceil c \times n\rceil$ rounds, the downloader will necessarily have learned enough blocks to download $F$,

which follows from Eq. \@ref(eq:monotonic) and the implication that $D_{\lceil c \times n\rceil}$ must then be complete. $\blacksquare$

As for $F(i)$, note that we can estimate the probability of completion by estimating the probability that $|D_i|$ exceeds the completion threshold (Eq. \@ref(eq:complete)). What exactly that looks like, and how tractable it is, depends on the formulation we give it.

### Independent Partition Sizes

Suppose we knew the distribution of partition sizes in $O$; i.e., we knew that the number of blocks assigned to a node in $O$ follows some distribution $\mathcal{B}$ (e.g., a truncated Gaussian).

If we have a "large enough" network, this means we would be able to approximate the numbers of blocks assigned to the nodes as $n$ independent random variables $\rv{Y}_i$, where $\rv{Y}_i \sim \mathcal{B}$. In that case, we would be able to express the total number of blocks learned by the downloader by round $i$ as a random variable $\rv{L}_i$ given by the sum of the iid random variables $\rv{Y}_j \sim \mathcal{B}$:

$$
\begin{equation}
\rv{L}_i \sim \sum_{j = 1}^{i} \rv{Y}_j
(\#eq:learning-sum)
\end{equation}
$$

The shape of the distribution would be the $i$-fold convolution of $\mathcal{B}$ with itself, which can be tractable for some distributions.
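
When the convolution is not tractable analytically, the distribution of $\rv{L}_i$ is straightforward to approximate by simulation. A minimal sketch, with an illustrative choice of $\mathcal{B}$ (a Gaussian truncated at zero; nothing in the analysis prescribes this choice):

```R
# Sample the i-fold sum L_i = Y_1 + ... + Y_i for iid partition sizes Y_j ~ B.
rB <- function(n) pmax(0, round(rnorm(n, mean = 50, sd = 20)))  # illustrative B
sample_L <- function(i, reps = 10000) replicate(reps, sum(rB(i)))

# Empirical P(L_i >= c * |F|), e.g. completion probability after i = 12
# contacts for c = 0.5 and |F| = 1000 (all numbers made up):
mean(sample_L(12) >= 0.5 * 1000)
```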

More interestingly, though, Eq. \@ref(eq:learning-sum) allows us to express a $\mathcal{B}$-independent estimate of the average number of rounds a downloader will undergo before completing a download. We have that:

$$
\mathbb{E}(\rv{L}_i) = \sum_{j = 1}^i \mathbb{E}(\rv{Y}_j) = i\,\mathbb{E}(\rv{Y}) = i \times \mu_{\rv{Y}}
$$

We can then plug this into the completion condition (Eq. \@ref(eq:complete)) and solve for $i$ to get:

$$
\begin{equation}
i \times \mu_{\rv{Y}} \geq c \times |F| \iff i \geq \frac{c \times |F|}{\mu_{\rv{Y}}}
(\#eq:average-completion)
\end{equation}
$$

Note that this is intuitive to the point of being trivial: if we let $c = 1$, we get $i \geq |F|/\mu_{\rv{Y}}$, which just means that, on average, the node will have to sample a number of nodes equal to the number of blocks divided by the average partition size. In practice, we can use $\overline{\mu_\rv{Y}} = \frac{1}{n}\sum_{i=1}^{n} \left|s(o_i)\right|$ instead of $\mu_{\rv{Y}}$ to estimate what $i$ can look like.
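
As a worked instance of Eq. \@ref(eq:average-completion), with made-up numbers: for $c = 0.5$, $|F| = 1000$ blocks, and an average partition size $\mu_{\rv{Y}} = 50$,

$$
i \geq \frac{0.5 \times 1000}{50} = 10,
$$

so the downloader needs, on average, at least $10$ contacts to complete the download.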

### Non-Independent Partition Sizes

If we cannot approximate partition sizes as independent random variables, then the problem changes. Stripping it down, we can cast it as follows: we have a set of integers $P = \{p_1, \cdots, p_n\}$ representing the sizes of each partition, and we want to understand the distribution of the partial sums for random permutations of $P$.

As I understand it, there is no good way of addressing this without running simulations. The difference is that if we assume disjoint partitions, then the simulations are a lot simpler, as we do not need to track the contents of $D_i$.
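
A minimal simulation sketch for this case, matching the partial-sum formulation above (the partition sizes and coding rate are made up; contacting nodes maps to drawing a random permutation of $P$):

```R
# Distribution of I_min for fixed, disjoint partitions: permute P at random,
# accumulate sizes, and find the first index where the partial sum crosses
# the completion threshold c * |F|.
P <- c(1, 5, 10, 50, 200, 734)  # made-up partition sizes; |F| = sum(P) = 1000
c_rate <- 0.5
imin <- replicate(10000, which(cumsum(sample(P)) >= c_rate * sum(P))[1])
quantile(imin, c(0.05, 0.5, 0.95))  # spread of completion rounds
```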