# Mirror of https://github.com/logos-storage/bittorrent-benchmarks.git
# (synced 2026-01-02 13:03:13 +00:00; 536 lines, 20 KiB, YAML).
apiVersion: argoproj.io/v1alpha1
kind: Workflow
metadata:
  generateName: codex-benchmark-
spec:
  serviceAccountName: codex-benchmarks-workflows
  entrypoint: benchmark-workflow
  arguments:
    parameters:
      ######################################## Global Settings ##############################################
      # What are we benchmarking (one of: codex, deluge)?
      - name: system
        value: "codex"

      ################################ Experiment Parameter Matrix ##########################################
      # Parameters in the experiment parameter matrix will be expanded, and can be set
      # to lists or constrained lists of arguments.

      # How many times should we repeat the experiment for each parameter set/random seeder set?
      - name: repetitions
        value: "5"
      # How many random seeder sets should we have?
      - name: seederSets
        value: "2"
      # What file size are we benchmarking?
      - name: fileSize
        value: '["100MB", "1GB", "5GB"]'
      # What values for network size vs seeders should we use?
      - name: constrained__networkSize_seeders
        value: "[[2, 1], [8, [1, 2, 4]], [16, [1, 2, 4, 8]], [32, [1, 2, 4, 8, 16]]]"
      # What's the maximum duration for this whole batch of experiments?
      # Passed as --timeout to kubectl-wait-job in wait-for-experiment.
      - name: maxExperimentDuration
        value: "144h"
      # Which tag to use for the node (Codex, Deluge, whatnot) images? Useful if you want to run something
      # off of a branch.
      - name: nodeTag
        value: "latest"
      # Which tag to use for the experiment runner. Useful if you want to run something off of a branch.
      - name: runnerTag
        value: "latest"
      # Which tag to use for the workflow runner. Useful if you want to run something off of a branch.
      - name: workflowRunnerTag
        value: "latest"
      # Forwarded to the Helm chart as deployment.removeData (see deploy-experiment).
      - name: removeData
        value: "false"
      # Forwarded to the Helm chart as experiment.codexLogLevel (see deploy-experiment).
      - name: codexLogLevel
        value: "INFO"
        # value: "DEBUG;trace:swarm\\,blockexcnetworkpeer" # make sure to escape commas or helm will fail

      ###################################### Experiment Retries #############################################
      # Allow the workflow to replay failed experiments from a previous run instead of running a new set.

      # If set to an existing group ID (e.g. "g1740079931"), will replay the failed experiments
      # in this group, ignoring all other parameters in the parameter matrix. Requires the Argo
      # workflow service name to be known.
      - name: retryGroup
        value: ""

      # You need to set the name for the argo workflows service in the k8s cluster manually, and this might differ
      # between our cluster and minikube. This is a pain point, and should fade away as we move towards
      # Hera (https://github.com/argoproj-labs/hera). I could set this similarly to what we do on define-image-settings,
      # for the minikube env, but not worth it.
      - name: argoService
        value: "argo-workflows-server" # -> cluster
        # value: "argo-server" # -> minikube

      ################################## Performance and Resource Usage #####################################
      # Affects cluster resource usage (how many nodes can we use?) and overall performance.

      # How many experiments should we run at a time? In clusters with lots of resources, more parallelism
      # means experiments can run a lot faster.
      # - name: experimentParallelism
      #   value: 1
      # XXX Unfortunately Argo won't expand those properly for sub-workflows, so you have to set it manually all over.
      #  Another pain point that should disappear as we move to Hera.

      # Groups the expansion such that all experiments with a given networkSize run together, smallest
      # to largest. This can save significant amounts of time when running on a cluster with autoscaling.
      # If you plan to run experiments in parallel, optimizing for smallest accrued VM usage might be
      # more important.
      - name: orderBy
        value: '["fileSize", "seeders", "networkSize"]'
      # The region in which to deploy machines. Leave empty to deploy to default.
      - name: region
        value: "hel1"

      ###################################### Dev. and Debugging #############################################
      # Settings for running experiments locally or debugging failures.

      # Set this to true to run workflows on Minikube.
      - name: minikubeEnv
        value: "false"
      # If set to false, leaves pods for failed experiments behind so they can be inspected.
      - name: cleanupOnFailure
        value: "true"
      # If set to false, does not parse/upload logs at the end of the experiment. You'll probably want to
      # disable this when running local experiments.
      - name: parseLogs
        value: "false"

      #######################################################################################################

  templates:
    # Entrypoint. Resolves the runner image settings, then runs exactly one of:
    # a fresh batch (retryGroup empty) or a replay of a previous group (retryGroup set).
    - name: benchmark-workflow
      dag:
        tasks:
          - name: define-image-settings
            template: define-image-settings

          - name: new-benchmark-run
            template: new-benchmark-run
            arguments:
              parameters:
                - name: runnerImage
                  value: "{{tasks.define-image-settings.outputs.parameters.image}}"
                - name: imagePullPolicy
                  value: "{{tasks.define-image-settings.outputs.parameters.imagePullPolicy}}"
            depends: "define-image-settings.Succeeded"
            when: '"{{workflow.parameters.retryGroup}}" == ""'

          - name: retry-benchmark-run
            template: retry-benchmark-run
            arguments:
              parameters:
                - name: runnerImage
                  value: "{{tasks.define-image-settings.outputs.parameters.image}}"
                - name: imagePullPolicy
                  value: "{{tasks.define-image-settings.outputs.parameters.imagePullPolicy}}"
            depends: "define-image-settings.Succeeded"
            when: '"{{workflow.parameters.retryGroup}}" != ""'

    # Fresh batch: generates a group ID, expands the parameter matrix, runs one wrapped
    # experiment per expanded item, then optionally parses logs for the group.
    # parallelism is hard-coded here because the experimentParallelism parameter cannot be
    # used for sub-workflows (see the XXX note in the arguments section).
    - name: new-benchmark-run
      parallelism: 1
      inputs:
        parameters:
          - name: runnerImage
          - name: imagePullPolicy
      dag:
        tasks:
          - name: generate-group-id
            template: generate-group-id
            arguments:
              parameters:
                - name: runnerImage
                  value: "{{inputs.parameters.runnerImage}}"
                - name: imagePullPolicy
                  value: "{{inputs.parameters.imagePullPolicy}}"

          - name: expand-parameter-matrix
            template: expand-parameter-matrix
            arguments:
              parameters:
                - name: runnerImage
                  value: "{{inputs.parameters.runnerImage}}"
                - name: imagePullPolicy
                  value: "{{inputs.parameters.imagePullPolicy}}"
            depends: "generate-group-id.Succeeded"

          # Fans out over the JSON list printed by expand-parameter-matrix.
          - name: benchmark-experiment
            template: wrapped-benchmark-experiment
            arguments:
              parameters:
                - name: groupId
                  value: "{{tasks.generate-group-id.outputs.result}}"
                - name: runId
                  value: "{{item.runId}}"
                - name: fileSize
                  value: "{{item.fileSize}}"
                - name: seederSets
                  value: "{{item.seederSets}}"
                - name: networkSize
                  value: "{{item.networkSize}}"
                - name: seeders
                  value: "{{item.seeders}}"
                - name: repetitions
                  value: "{{item.repetitions}}"
                - name: runnerImage
                  value: "{{inputs.parameters.runnerImage}}"
                - name: imagePullPolicy
                  value: "{{inputs.parameters.imagePullPolicy}}"

            withParam: "{{tasks.expand-parameter-matrix.outputs.result}}"
            depends: "expand-parameter-matrix.Succeeded"

          # Runs as soon as at least one experiment succeeded, and only if parseLogs is true.
          - name: parse-logs
            template: parse-logs
            arguments:
              parameters:
                - name: groupId
                  value: "{{tasks.generate-group-id.outputs.result}}"
            depends: "benchmark-experiment.AnySucceeded"
            when: '{{workflow.parameters.parseLogs}} == true'

    # Replay: collects the inputs of the failed experiments in retryGroup, bumps the retry
    # counter, and re-runs one wrapped experiment per collected item. The group ID used for
    # the replayed experiments is whatever increment-retry-counter prints.
    - name: retry-benchmark-run
      parallelism: 1
      inputs:
        parameters:
          - name: runnerImage
          - name: imagePullPolicy
      dag:
        tasks:
          - name: collect-failed-parameters
            template: collect-failed-parameters
            arguments:
              parameters:
                - name: groupId
                  value: "{{workflow.parameters.retryGroup}}"

          - name: increment-retry-counter
            template: increment-retry-counter
            arguments:
              parameters:
                - name: groupId
                  value: "{{workflow.parameters.retryGroup}}"
            depends: "collect-failed-parameters.Succeeded"

          # Fans out over the JSON list printed by collect-failed-parameters.
          - name: benchmark-experiment
            template: wrapped-benchmark-experiment
            arguments:
              parameters:
                - name: groupId
                  value: "{{tasks.increment-retry-counter.outputs.result}}"
                - name: runId
                  value: "{{item.runId}}"
                - name: fileSize
                  value: "{{item.fileSize}}"
                - name: seederSets
                  value: "{{item.seederSets}}"
                - name: networkSize
                  value: "{{item.networkSize}}"
                - name: seeders
                  value: "{{item.seeders}}"
                - name: repetitions
                  value: "{{item.repetitions}}"
                - name: runnerImage
                  value: "{{inputs.parameters.runnerImage}}"
                - name: imagePullPolicy
                  value: "{{inputs.parameters.imagePullPolicy}}"

            withParam: "{{tasks.collect-failed-parameters.outputs.result}}"
            depends: "increment-retry-counter.Succeeded"

          # Runs as soon as at least one experiment succeeded, and only if parseLogs is true.
          - name: parse-logs
            template: parse-logs
            arguments:
              parameters:
                - name: groupId
                  value: "{{tasks.increment-retry-counter.outputs.result}}"
            depends: "benchmark-experiment.AnySucceeded"
            when: '{{workflow.parameters.parseLogs}} == true'

    # Creates a separate Workflow from the log-parsing-workflow WorkflowTemplate for the given
    # experiment group, and tracks that workflow's status.phase to decide success or failure.
    - name: parse-logs
      inputs:
        parameters:
          - name: groupId
      resource:
        action: create
        manifest: |
          apiVersion: argoproj.io/v1alpha1
          kind: Workflow
          metadata:
            generateName: log-parsing-
          spec:
            workflowTemplateRef:
              name: log-parsing-workflow
            arguments:
              parameters:
                - name: experimentGroupId
                  value: "{{inputs.parameters.groupId}}"
        successCondition: status.phase == Succeeded
        failureCondition: status.phase in (Failed, Error)

    # Argo has no direct way to derive an image name and a pull policy from a workflow
    # parameter, so this step writes both to files and exposes them as output parameters:
    #   image           - workflow runner image (local Minikube build or the tagged registry image)
    #   imagePullPolicy - "Never" on Minikube, "Always" otherwise
    - name: define-image-settings
      script:
        image: busybox:latest
        # Argo appends the path of the staged script as the last argument, so plain "sh"
        # runs it directly; "sh -c <path>" would depend on the file's execute bit.
        command: ["sh"]
        source: |
          # POSIX "=" rather than "==": this runs under sh, not bash.
          if [ "{{workflow.parameters.minikubeEnv}}" = "true" ]; then
            echo "Using Minikube env"
            echo "bittorrent-benchmarks-workflows:minikube" > /tmp/image.txt
            echo "Never" > /tmp/imagePullPolicy.txt
          else
            echo "NOT using Minikube env"
            echo "codexstorage/bittorrent-benchmarks-workflows:{{workflow.parameters.workflowRunnerTag}}" > /tmp/image.txt
            echo "Always" > /tmp/imagePullPolicy.txt
          fi
      outputs:
        parameters:
          - name: image
            valueFrom:
              path: /tmp/image.txt
          - name: imagePullPolicy
            valueFrom:
              path: /tmp/imagePullPolicy.txt

    # Runs the parameter_expander module on the full workflow parameter list (as JSON).
    # Its output is consumed through outputs.result as the withParam list in new-benchmark-run.
    - name: expand-parameter-matrix
      inputs:
        parameters:
          - name: runnerImage
          - name: imagePullPolicy
      script:
        image: '{{inputs.parameters.runnerImage}}'
        imagePullPolicy: '{{inputs.parameters.imagePullPolicy}}'
        command: ["python", "-m", "parameter_expander"]
        args:
          - "{{ workflow.parameters.json }}"

    # Runs benchmarks.k8s.collect_failed_inputs for a previous group. Its output is consumed
    # through outputs.result as the withParam list in retry-benchmark-run.
    # NOTE(review): the positional arguments look like (group ID, template name, Argo server
    # host, Argo server port) - confirm against the module.
    - name: collect-failed-parameters
      inputs:
        parameters:
          - name: groupId
      script:
        image: codexstorage/bittorrent-benchmarks:latest
        command: ["python", "-m", "benchmarks.k8s.collect_failed_inputs"]
        args:
          - "{{inputs.parameters.groupId}}"
          - "wrapped-benchmark-experiment"
          - "{{workflow.parameters.argoService}}.argo.svc.cluster.local"
          - "2746"

    # Runs benchmarks.k8s.increment_retry_counter for a previous group. Its output is used
    # as the groupId of the replayed experiments in retry-benchmark-run.
    - name: increment-retry-counter
      inputs:
        parameters:
          - name: groupId
      script:
        image: codexstorage/bittorrent-benchmarks:latest
        command: ["python", "-m", "benchmarks.k8s.increment_retry_counter"]
        args:
          - "{{inputs.parameters.groupId}}"

    # Prints a new group ID of the form g<unix timestamp in seconds>.
    - name: generate-group-id
      inputs:
        parameters:
          - name: runnerImage
          - name: imagePullPolicy
      script:
        image: '{{inputs.parameters.runnerImage}}'
        imagePullPolicy: '{{inputs.parameters.imagePullPolicy}}'
        command: ["/bin/bash"]
        source: |
          # The ID starts with a "g" as otherwise we can't use it as a label value in k8s.
          echo "g$(date +%s)"

    # We "wrap" the benchmark workflow with a dummy workflow so exit handlers behave properly. If we
    # were to call benchmark-experiment directly from the main flow, the exit handlers would be run
    # only when the entire set of experiments is done, not when each individual experiment is done.
    - name: wrapped-benchmark-experiment
      inputs:
        parameters:
          - name: runnerImage
          - name: imagePullPolicy
          - name: groupId
          - name: runId
          - name: fileSize
          - name: seederSets
          - name: networkSize
          - name: seeders
          - name: repetitions

      dag:
        tasks:
          - name: benchmark-experiment
            template: benchmark-experiment
            arguments:
              parameters:
                - name: groupId
                  value: "{{inputs.parameters.groupId}}"
                - name: runId
                  value: "{{inputs.parameters.runId}}"
                - name: fileSize
                  value: "{{inputs.parameters.fileSize}}"
                - name: seederSets
                  value: "{{inputs.parameters.seederSets}}"
                - name: networkSize
                  value: "{{inputs.parameters.networkSize}}"
                - name: seeders
                  value: "{{inputs.parameters.seeders}}"
                - name: repetitions
                  value: "{{inputs.parameters.repetitions}}"
                - name: runnerImage
                  value: "{{inputs.parameters.runnerImage}}"
                - name: imagePullPolicy
                  value: "{{inputs.parameters.imagePullPolicy}}"

          - name: cleanup-success
            template: cleanup
            arguments:
              parameters:
                - name: runId
                  value: "{{inputs.parameters.runId}}"
                - name: runnerImage
                  value: "{{inputs.parameters.runnerImage}}"
                - name: imagePullPolicy
                  value: "{{inputs.parameters.imagePullPolicy}}"

            depends: "benchmark-experiment.Succeeded"

          - name: cleanup-failure
            template: cleanup
            arguments:
              parameters:
                - name: runId
                  value: "{{inputs.parameters.runId}}"
                - name: runnerImage
                  value: "{{inputs.parameters.runnerImage}}"
                - name: imagePullPolicy
                  value: "{{inputs.parameters.imagePullPolicy}}"

            # Also covers the Error phase; depending on Failed alone would leave the Helm
            # release behind whenever the experiment errors out instead of failing.
            depends: "benchmark-experiment.Failed || benchmark-experiment.Errored"
            when: '{{workflow.parameters.cleanupOnFailure}} == true'

    # A single experiment: install the Helm release, then wait for the experiment job.
    - name: benchmark-experiment
      inputs:
        parameters:
          - name: runnerImage
          - name: imagePullPolicy
          - name: groupId
          - name: runId
          - name: fileSize
          - name: seederSets
          - name: networkSize
          - name: seeders
          - name: repetitions

      steps:
        - - name: deploy-experiment
            template: deploy-experiment
            arguments:
              parameters:
                - name: runnerImage
                  value: "{{inputs.parameters.runnerImage}}"
                - name: imagePullPolicy
                  value: "{{inputs.parameters.imagePullPolicy}}"
                - name: groupId
                  value: "{{inputs.parameters.groupId}}"
                - name: runId
                  value: "{{inputs.parameters.runId}}"
                - name: fileSize
                  value: "{{inputs.parameters.fileSize}}"
                - name: seederSets
                  value: "{{inputs.parameters.seederSets}}"
                - name: networkSize
                  value: "{{inputs.parameters.networkSize}}"
                - name: seeders
                  value: "{{inputs.parameters.seeders}}"
                - name: repetitions
                  value: "{{inputs.parameters.repetitions}}"

        - - name: wait-for-experiment
            template: wait-for-experiment
            arguments:
              parameters:
                - name: runnerImage
                  value: "{{inputs.parameters.runnerImage}}"
                - name: imagePullPolicy
                  value: "{{inputs.parameters.imagePullPolicy}}"
                - name: groupId
                  value: "{{inputs.parameters.groupId}}"
                - name: runId
                  value: "{{inputs.parameters.runId}}"

    # Installs the Helm chart for the selected system as release e<runId> in the
    # codex-benchmarks namespace, passing experiment and deployment settings via --set.
    - name: deploy-experiment
      inputs:
        parameters:
          - name: groupId
          - name: runId
          - name: fileSize
          - name: seederSets
          - name: networkSize
          - name: seeders
          - name: repetitions
          - name: runnerImage
          - name: imagePullPolicy

      script:
        image: '{{inputs.parameters.runnerImage}}'
        imagePullPolicy: '{{inputs.parameters.imagePullPolicy}}'
        command: ["/bin/bash"]
        source: |
          set -e

          # Extra helm arguments. Stays empty unless minikubeEnv is "false", in which case
          # the devnet cluster values file is added.
          VALUE_FILE=()
          if [[ "{{workflow.parameters.minikubeEnv}}" == "false" ]]; then
            echo "Using devnet cluster values for deploy."
            VALUE_FILE=(-f "./k8s/clusters/devnet/{{workflow.parameters.system}}-chart-values.yaml")
          fi

          # Every substituted value is quoted so that an unexpected space or glob character
          # in a parameter cannot split or expand an argument.
          helm install "e{{inputs.parameters.runId}}" "./k8s/charts/{{workflow.parameters.system}}" \
            --namespace codex-benchmarks "${VALUE_FILE[@]}" \
            --set "experiment.groupId={{inputs.parameters.groupId}}" \
            --set "experiment.repetitions={{inputs.parameters.repetitions}}" \
            --set "experiment.fileSize={{inputs.parameters.fileSize}}" \
            --set "experiment.networkSize={{inputs.parameters.networkSize}}" \
            --set "experiment.seeders={{inputs.parameters.seeders}}" \
            --set "experiment.codexLogLevel={{workflow.parameters.codexLogLevel}}" \
            --set "experiment.seederSets={{inputs.parameters.seederSets}}" \
            --set "deployment.minikubeEnv={{workflow.parameters.minikubeEnv}}" \
            --set "deployment.removeData={{workflow.parameters.removeData}}" \
            --set "deployment.nodeTag={{workflow.parameters.nodeTag}}" \
            --set "deployment.runnerTag={{workflow.parameters.runnerTag}}" \
            --set "deployment.region={{workflow.parameters.region}}"

    # Blocks on ./docker/bin/kubectl-wait-job for the experiment runner job of release
    # e<runId> in the given group, for at most maxExperimentDuration.
    - name: wait-for-experiment
      inputs:
        parameters:
          - name: runnerImage
          - name: imagePullPolicy
          - name: groupId
          - name: runId
      script:
        image: '{{inputs.parameters.runnerImage}}'
        imagePullPolicy: '{{inputs.parameters.imagePullPolicy}}'
        command: ["/bin/bash"]
        source: |
          # The label selector is assembled in a variable: splitting it across
          # backslash-continued lines only works when those lines carry no leading
          # whitespace, which is easy to break when re-indenting this file.
          SELECTOR="app.kubernetes.io/component={{workflow.parameters.system}}-experiment-runner"
          SELECTOR="${SELECTOR},app.kubernetes.io/instance=e{{inputs.parameters.runId}}"
          SELECTOR="${SELECTOR},app.kubernetes.io/part-of={{inputs.parameters.groupId}}"

          ./docker/bin/kubectl-wait-job \
            --selector="${SELECTOR}" \
            --timeout={{workflow.parameters.maxExperimentDuration}} \
            -n codex-benchmarks

    # Uninstalls the Helm release e<runId> from the codex-benchmarks namespace.
    - name: cleanup
      inputs:
        parameters:
          - name: runnerImage
          - name: imagePullPolicy
          - name: runId
      script:
        image: '{{inputs.parameters.runnerImage}}'
        imagePullPolicy: '{{inputs.parameters.imagePullPolicy}}'
        command: ["/bin/bash"]
        source: |
          helm uninstall e{{inputs.parameters.runId}} -n codex-benchmarks