# Argo Workflow that runs a batch of benchmark experiments. It either expands a
# parameter matrix into a fresh set of experiments, or replays the failed
# experiments of an earlier group when `retryGroup` is set.
apiVersion: argoproj.io/v1alpha1
kind: Workflow
metadata:
  generateName: codex-benchmark-
spec:
  serviceAccountName: codex-benchmarks-workflows
  entrypoint: benchmark-workflow
  arguments:
    parameters:
      ################################## Global Settings ##################################
      # The system under benchmark (one of: codex, deluge).
      - name: system
        value: 'deluge'

      ############################ Experiment Parameter Matrix ############################
      # Parameters in the experiment parameter matrix get expanded, and may be set to
      # lists or constrained lists of arguments.

      # Number of times the experiment is repeated for each parameter set/random
      # seeder set.
      - name: repetitions
        value: 5
      # Number of random seeder sets.
      - name: seederSets
        value: 2
      # File sizes to benchmark.
      - name: fileSize
        value: '["100MB", "1GB", "5GB"]'
      # Combinations of network size vs. seeders to use.
      - name: constrained__networkSize_seeders
        value: '[[2, 1], [8, [1, 2, 4]], [16, [1, 2, 4, 8]], [32, [1, 2, 4, 8, 16]]]'
      # Maximum duration for this whole batch of experiments.
      - name: maxExperimentDuration
        value: '144h'
      # Tag to use for the node (Codex, Deluge, whatnot) images. Useful when you want
      # to run something off of a branch.
      - name: nodeTag
        value: 'latest'

      ################################ Experiment Retries #################################
      # Lets the workflow replay failed experiments from a previous run instead of
      # running a new set. When set to an existing group ID (e.g. "g1740079931"), the
      # failed experiments in that group are replayed and all other parameters in the
      # parameter matrix are ignored. Requires the Argo workflow service name to be
      # known.
      - name: retryGroup
        value: ''
      # The name of the Argo Workflows service in the k8s cluster has to be set by
      # hand, and it might differ between our cluster and minikube. This is a pain
      # point that should fade away as we move towards Hera
      # (https://github.com/argoproj-labs/hera). It could be derived the way
      # define-image-settings does it for the minikube env, but that is not worth it.
      - name: argoService
        value: 'argo-workflows-server'  # -> cluster
        # value: "argo-server"  # -> minikube

      ########################## Performance and Resource Usage ###########################
      # Affects cluster resource usage (how many nodes can we use?) and overall
      # performance.

      # How many experiments should run at a time? In clusters with lots of resources,
      # more parallelism means experiments can run a lot faster.
      # - name: experimentParallelism
      #   value: 1
      # XXX Unfortunately Argo won't expand those properly for sub-workflows, so the
      # value has to be set manually all over (see `parallelism` on the templates
      # below). Another pain point that should disappear with the move to Hera.

      # Groups the expansion such that all experiments with a given networkSize run
      # together, smallest to largest. This can save significant amounts of time when
      # running on a cluster with autoscaling. If you plan to run experiments in
      # parallel, optimizing for the smallest accrued VM usage might matter more.
      - name: orderBy
        value: '["fileSize", "seeders", "networkSize"]'
      # Region in which machines are deployed. Leave empty to deploy to the default.
      - name: region
        value: 'hel1'

      ################################ Dev. and Debugging #################################
      # Settings for running experiments locally or debugging failures.

      # Set to "true" to run workflows on Minikube.
      - name: minikubeEnv
        value: 'false'
      # When "false", pods of failed experiments are left behind so they can be
      # inspected.
      - name: cleanupOnFailure
        value: 'true'
      # When "false", logs are not parsed/uploaded at the end of the experiment. You
      # will probably want to disable this when running local experiments.
      - name: parseLogs
        value: 'true'

  #####################################################################################
  templates:
    # Entrypoint: resolves the runner image settings, then takes exactly one of the
    # two branches depending on whether `retryGroup` is empty.
    - name: benchmark-workflow
      dag:
        tasks:
          - name: define-image-settings
            template: define-image-settings

          # Fresh run: taken when no retry group was given.
          - name: new-benchmark-run
            template: new-benchmark-run
            depends: 'define-image-settings.Succeeded'
            when: '"{{workflow.parameters.retryGroup}}" == ""'
            arguments:
              parameters:
                - name: runnerImage
                  value: '{{tasks.define-image-settings.outputs.parameters.image}}'
                - name: imagePullPolicy
                  value: '{{tasks.define-image-settings.outputs.parameters.imagePullPolicy}}'

          # Replay: taken when a retry group was given.
          - name: retry-benchmark-run
            template: retry-benchmark-run
            depends: 'define-image-settings.Succeeded'
            when: '"{{workflow.parameters.retryGroup}}" != ""'
            arguments:
              parameters:
                - name: runnerImage
                  value: '{{tasks.define-image-settings.outputs.parameters.image}}'
                - name: imagePullPolicy
                  value: '{{tasks.define-image-settings.outputs.parameters.imagePullPolicy}}'

    # Runs a new experiment group: generates a group ID, expands the parameter matrix,
    # fans out one wrapped experiment per expanded item, then parses logs.
    - name: new-benchmark-run
      # Set by hand because the experimentParallelism parameter cannot be used here.
      parallelism: 1
      inputs:
        parameters:
          - name: runnerImage
          - name: imagePullPolicy
      dag:
        tasks:
          - name: generate-group-id
            template: generate-group-id
            arguments:
              parameters:
                - name: runnerImage
                  value: '{{inputs.parameters.runnerImage}}'
                - name: imagePullPolicy
                  value: '{{inputs.parameters.imagePullPolicy}}'

          - name: expand-parameter-matrix
            template: expand-parameter-matrix
            depends: 'generate-group-id.Succeeded'
            arguments:
              parameters:
                - name: runnerImage
                  value: '{{inputs.parameters.runnerImage}}'
                - name: imagePullPolicy
                  value: '{{inputs.parameters.imagePullPolicy}}'

          # One task instance per item emitted by expand-parameter-matrix.
          - name: benchmark-experiment
            template: wrapped-benchmark-experiment
            depends: 'expand-parameter-matrix.Succeeded'
            withParam: '{{tasks.expand-parameter-matrix.outputs.result}}'
            arguments:
              parameters:
                - name: groupId
                  value: '{{tasks.generate-group-id.outputs.result}}'
                - name: runId
                  value: '{{item.runId}}'
                - name: fileSize
                  value: '{{item.fileSize}}'
                - name: seederSets
                  value: '{{item.seederSets}}'
                - name: networkSize
                  value: '{{item.networkSize}}'
                - name: seeders
                  value: '{{item.seeders}}'
                - name: repetitions
                  value: '{{item.repetitions}}'
                - name: runnerImage
                  value: '{{inputs.parameters.runnerImage}}'
                - name: imagePullPolicy
                  value: '{{inputs.parameters.imagePullPolicy}}'

          # Log parsing is requested as soon as any experiment succeeded, and only
          # when the parseLogs workflow parameter is "true".
          - name: parse-logs
            template: parse-logs
            depends: 'benchmark-experiment.AnySucceeded'
            when: '{{workflow.parameters.parseLogs}} == true'
            arguments:
              parameters:
                - name: groupId
                  value: '{{tasks.generate-group-id.outputs.result}}'

    # Replays the failed experiments of an existing group: collects their parameters,
    # bumps the retry counter, then fans out the experiments again.
    - name: retry-benchmark-run
      # NOTE(review): new-benchmark-run uses parallelism 1; confirm that 2 is intended.
      parallelism: 2
      inputs:
        parameters:
          - name: runnerImage
          - name: imagePullPolicy
      dag:
        tasks:
          - name: collect-failed-parameters
            template: collect-failed-parameters
            arguments:
              parameters:
                - name: groupId
                  value: '{{workflow.parameters.retryGroup}}'

          - name: increment-retry-counter
            template: increment-retry-counter
            depends: 'collect-failed-parameters.Succeeded'
            arguments:
              parameters:
                - name: groupId
                  value: '{{workflow.parameters.retryGroup}}'

          # One task instance per item emitted by collect-failed-parameters; the group
          # ID passed down is the output of increment-retry-counter.
          - name: benchmark-experiment
            template: wrapped-benchmark-experiment
            depends: 'increment-retry-counter.Succeeded'
            withParam: '{{tasks.collect-failed-parameters.outputs.result}}'
            arguments:
              parameters:
                - name: groupId
                  value: '{{tasks.increment-retry-counter.outputs.result}}'
                - name: runId
                  value: '{{item.runId}}'
                - name: fileSize
                  value: '{{item.fileSize}}'
                - name: seederSets
                  value: '{{item.seederSets}}'
                - name: networkSize
                  value: '{{item.networkSize}}'
                - name: seeders
                  value: '{{item.seeders}}'
                - name: repetitions
                  value: '{{item.repetitions}}'
                - name: runnerImage
                  value: '{{inputs.parameters.runnerImage}}'
                - name: imagePullPolicy
                  value: '{{inputs.parameters.imagePullPolicy}}'

          - name: parse-logs
            template: parse-logs
            depends: 'benchmark-experiment.AnySucceeded'
            when: '{{workflow.parameters.parseLogs}} == true'
            arguments:
              parameters:
                - name: groupId
                  value: '{{tasks.increment-retry-counter.outputs.result}}'

    # Creates a child Workflow from the `log-parsing-workflow` template for the given
    # group and tracks its phase through the success/failure conditions.
    - name: parse-logs
      inputs:
        parameters:
          - name: groupId
      resource:
        action: create
        successCondition: status.phase == Succeeded
        failureCondition: status.phase in (Failed, Error)
        manifest: |
          apiVersion: argoproj.io/v1alpha1
          kind: Workflow
          metadata:
            generateName: log-parsing-
          spec:
            workflowTemplateRef:
              name: log-parsing-workflow
            arguments:
              parameters:
                - name: experimentGroupId
                  value: "{{inputs.parameters.groupId}}"

    # Selecting an image name and a pull policy from a workflow parameter takes a
    # helper script in Argo Workflows: this one writes both values to files, which
    # are then exposed as output parameters.
    - name: define-image-settings
      script:
        image: busybox:latest
        command: ['sh', '-c']
        source: |
          if [ "{{workflow.parameters.minikubeEnv}}" == "true" ]; then
            echo "Using Minikube env"
            echo "bittorrent-benchmarks-workflows:minikube" > /tmp/image.txt
            echo "Never" > /tmp/imagePullPolicy.txt
          else
            echo "NOT using Minikube env"
            echo "codexstorage/bittorrent-benchmarks-workflows:latest" > /tmp/image.txt
            echo "Always" > /tmp/imagePullPolicy.txt
          fi
      outputs:
        parameters:
          - name: image
            valueFrom:
              path: /tmp/image.txt
          - name: imagePullPolicy
            valueFrom:
              path: /tmp/imagePullPolicy.txt

    # Runs the `parameter_expander` module on the workflow parameters (passed as JSON);
    # its result feeds the `withParam` fan-out in new-benchmark-run.
    - name: expand-parameter-matrix
      inputs:
        parameters:
          - name: runnerImage
          - name: imagePullPolicy
      script:
        image: '{{inputs.parameters.runnerImage}}'
        imagePullPolicy: '{{inputs.parameters.imagePullPolicy}}'
        command: ['python', '-m', 'parameter_expander']
        args:
          - '{{ workflow.parameters.json }}'

    # Runs `benchmarks.k8s.collect_failed_inputs` for the given group; its result
    # feeds the `withParam` fan-out in retry-benchmark-run.
    # NOTE(review): uses a fixed image rather than the runnerImage input - confirm
    # this is intended.
    - name: collect-failed-parameters
      inputs:
        parameters:
          - name: groupId
      script:
        image: codexstorage/bittorrent-benchmarks:latest
        command: ['python', '-m', 'benchmarks.k8s.collect_failed_inputs']
        args:
          - '{{inputs.parameters.groupId}}'
          - 'wrapped-benchmark-experiment'
          - '{{workflow.parameters.argoService}}.argo.svc.cluster.local'
          - '2746'

    # Runs `benchmarks.k8s.increment_retry_counter` for the given group; its result is
    # used as the group ID of the replayed experiments.
    - name: increment-retry-counter
      inputs:
        parameters:
          - name: groupId
      script:
        image: codexstorage/bittorrent-benchmarks:latest
        command: ['python', '-m', 'benchmarks.k8s.increment_retry_counter']
        args:
          - '{{inputs.parameters.groupId}}'

    # Prints a new group ID made of the letter "g" followed by the current Unix time.
    - name: generate-group-id
      inputs:
        parameters:
          - name: runnerImage
          - name: imagePullPolicy
      script:
        image: '{{inputs.parameters.runnerImage}}'
        imagePullPolicy: '{{inputs.parameters.imagePullPolicy}}'
        command: ['/bin/bash']
        source: |
          # The ID starts with a "g" as otherwise we can't use it as a label value in k8s.
          echo "g$(date +%s)"

    # The benchmark workflow is "wrapped" in a dummy workflow so that exit handlers
    # behave properly. Calling benchmark-experiment directly from the main flow would
    # run the exit handlers only once the entire set of experiments is done, not as
    # each individual experiment finishes.
    - name: wrapped-benchmark-experiment
      inputs:
        parameters:
          - name: runnerImage
          - name: imagePullPolicy
          - name: groupId
          - name: runId
          - name: fileSize
          - name: seederSets
          - name: networkSize
          - name: seeders
          - name: repetitions
      dag:
        tasks:
          - name: benchmark-experiment
            template: benchmark-experiment
            arguments:
              parameters:
                - name: groupId
                  value: '{{inputs.parameters.groupId}}'
                - name: runId
                  value: '{{inputs.parameters.runId}}'
                - name: fileSize
                  value: '{{inputs.parameters.fileSize}}'
                - name: seederSets
                  value: '{{inputs.parameters.seederSets}}'
                - name: networkSize
                  value: '{{inputs.parameters.networkSize}}'
                - name: seeders
                  value: '{{inputs.parameters.seeders}}'
                - name: repetitions
                  value: '{{inputs.parameters.repetitions}}'
                - name: runnerImage
                  value: '{{inputs.parameters.runnerImage}}'
                - name: imagePullPolicy
                  value: '{{inputs.parameters.imagePullPolicy}}'

          # Always clean up after a successful experiment.
          - name: cleanup-success
            template: cleanup
            depends: 'benchmark-experiment.Succeeded'
            arguments:
              parameters:
                - name: runId
                  value: '{{inputs.parameters.runId}}'
                - name: runnerImage
                  value: '{{inputs.parameters.runnerImage}}'
                - name: imagePullPolicy
                  value: '{{inputs.parameters.imagePullPolicy}}'

          # After a failed experiment, clean up only when cleanupOnFailure is "true".
          - name: cleanup-failure
            template: cleanup
            depends: 'benchmark-experiment.Failed'
            when: '{{workflow.parameters.cleanupOnFailure}} == true'
            arguments:
              parameters:
                - name: runId
                  value: '{{inputs.parameters.runId}}'
                - name: runnerImage
                  value: '{{inputs.parameters.runnerImage}}'
                - name: imagePullPolicy
                  value: '{{inputs.parameters.imagePullPolicy}}'

    # A single experiment: deploy it, then wait for it to finish.
    - name: benchmark-experiment
      inputs:
        parameters:
          - name: runnerImage
          - name: imagePullPolicy
          - name: groupId
          - name: runId
          - name: fileSize
          - name: seederSets
          - name: networkSize
          - name: seeders
          - name: repetitions
      steps:
        - - name: deploy-experiment
            template: deploy-experiment
            arguments:
              parameters:
                - name: runnerImage
                  value: '{{inputs.parameters.runnerImage}}'
                - name: imagePullPolicy
                  value: '{{inputs.parameters.imagePullPolicy}}'
                - name: groupId
                  value: '{{inputs.parameters.groupId}}'
                - name: runId
                  value: '{{inputs.parameters.runId}}'
                - name: fileSize
                  value: '{{inputs.parameters.fileSize}}'
                - name: seederSets
                  value: '{{inputs.parameters.seederSets}}'
                - name: networkSize
                  value: '{{inputs.parameters.networkSize}}'
                - name: seeders
                  value: '{{inputs.parameters.seeders}}'
                - name: repetitions
                  value: '{{inputs.parameters.repetitions}}'

        - - name: wait-for-experiment
            template: wait-for-experiment
            arguments:
              parameters:
                - name: runnerImage
                  value: '{{inputs.parameters.runnerImage}}'
                - name: imagePullPolicy
                  value: '{{inputs.parameters.imagePullPolicy}}'
                - name: groupId
                  value: '{{inputs.parameters.groupId}}'
                - name: runId
                  value: '{{inputs.parameters.runId}}'

    # Installs the Helm chart of the selected system as release `e<runId>` in the
    # codex-benchmarks namespace. The devnet values file is added only when
    # minikubeEnv is "false"; otherwise VALUE_FILE stays unset and expands to nothing.
    - name: deploy-experiment
      inputs:
        parameters:
          - name: groupId
          - name: runId
          - name: fileSize
          - name: seederSets
          - name: networkSize
          - name: seeders
          - name: repetitions
          - name: runnerImage
          - name: imagePullPolicy
      script:
        image: '{{inputs.parameters.runnerImage}}'
        imagePullPolicy: '{{inputs.parameters.imagePullPolicy}}'
        command: ['/bin/bash']
        source: |
          set -e

          if [[ "{{workflow.parameters.minikubeEnv}}" == "false" ]]; then
            echo "Using devnet cluster values for deploy."
            VALUE_FILE=(-f "./k8s/clusters/devnet/{{workflow.parameters.system}}-chart-values.yaml")
          fi

          helm install e{{inputs.parameters.runId}} ./k8s/charts/{{workflow.parameters.system}}\
            --namespace codex-benchmarks "${VALUE_FILE[@]}"\
            --set experiment.groupId={{inputs.parameters.groupId}}\
            --set experiment.repetitions={{inputs.parameters.repetitions}}\
            --set experiment.fileSize={{inputs.parameters.fileSize}}\
            --set experiment.networkSize={{inputs.parameters.networkSize}}\
            --set experiment.seeders={{inputs.parameters.seeders}}\
            --set experiment.seederSets={{inputs.parameters.seederSets}}\
            --set deployment.minikubeEnv={{workflow.parameters.minikubeEnv}}\
            --set deployment.nodeTag={{workflow.parameters.nodeTag}}\
            --set deployment.region={{workflow.parameters.region}}

    # Waits, for at most maxExperimentDuration, on the job matching the experiment's
    # component/instance/part-of labels in the codex-benchmarks namespace.
    - name: wait-for-experiment
      inputs:
        parameters:
          - name: runnerImage
          - name: imagePullPolicy
          - name: groupId
          - name: runId
      script:
        image: '{{inputs.parameters.runnerImage}}'
        imagePullPolicy: '{{inputs.parameters.imagePullPolicy}}'
        command: ['/bin/bash']
        # The selector is split over three lines with backslash-newlines. Those
        # continuation lines must stay at the script's base indentation: any leading
        # whitespace would break the selector into separate arguments.
        source: |
          ./docker/bin/kubectl-wait-job\
            --selector=app.kubernetes.io/component={{workflow.parameters.system}}-experiment-runner,\
          app.kubernetes.io/instance=e{{inputs.parameters.runId}},\
          app.kubernetes.io/part-of={{inputs.parameters.groupId}}\
            --timeout={{workflow.parameters.maxExperimentDuration}}\
            -n codex-benchmarks

    # Uninstalls the Helm release `e<runId>` from the codex-benchmarks namespace.
    - name: cleanup
      inputs:
        parameters:
          - name: runnerImage
          - name: imagePullPolicy
          - name: runId
      script:
        image: '{{inputs.parameters.runnerImage}}'
        imagePullPolicy: '{{inputs.parameters.imagePullPolicy}}'
        command: ['/bin/bash']
        source: |
          helm uninstall e{{inputs.parameters.runId}} -n codex-benchmarks