feat: allow keeping pods around on failure, add optional log parsing at end of experiment run

This commit is contained in:
gmega 2025-01-29 07:20:23 -03:00
parent 7ed29ddb4c
commit a29c010e7a
No known key found for this signature in database
GPG Key ID: 6290D34EAD824B18

View File

@ -15,8 +15,6 @@ spec:
value: '["100MB", "1GB"]' value: '["100MB", "1GB"]'
- name: constrained__networkSize_seeders - name: constrained__networkSize_seeders
value: "[[2, 1], [8, [1, 2, 4]], [16, [1, 2, 4, 8]], [32, [1, 2, 4, 8, 16]]]" value: "[[2, 1], [8, [1, 2, 4]], [16, [1, 2, 4, 8]], [32, [1, 2, 4, 8, 16]]]"
- name: minikubeEnv
value: "false"
- name: maxExperimentDuration - name: maxExperimentDuration
value: 144h value: 144h
@ -24,57 +22,99 @@ spec:
# to largest. This can save significant amounts of time when running on a cluster with autoscaling. # to largest. This can save significant amounts of time when running on a cluster with autoscaling.
- name: orderBy - name: orderBy
value: '["networkSize", "seeders", "fileSize"]' value: '["networkSize", "seeders", "fileSize"]'
# Set this to true to run workflows on Minikube.
- name: minikubeEnv
value: "false"
# If set to false, leaves pods for failed experiments behind so they can be inspected.
- name: cleanupOnFailure
value: "false"
# If set to false, does not parse/upload logs at the end of the experiment.
- name: parseLogs
value: "true"
templates: templates:
- name: benchmark-workflow - name: benchmark-workflow
parallelism: 1 parallelism: 1
steps: dag:
- - name: define-image-settings tasks:
template: define-image-settings - name: define-image-settings
template: define-image-settings
- - name: generate-group-id - name: generate-group-id
template: generate-group-id template: generate-group-id
arguments:
parameters:
- name: runnerImage
value: "{{tasks.define-image-settings.outputs.parameters.image}}"
- name: imagePullPolicy
value: "{{tasks.define-image-settings.outputs.parameters.imagePullPolicy}}"
depends: "define-image-settings.Succeeded"
- name: expand-parameter-matrix
template: expand-parameter-matrix
arguments:
parameters:
- name: runnerImage
value: "{{tasks.define-image-settings.outputs.parameters.image}}"
- name: imagePullPolicy
value: "{{tasks.define-image-settings.outputs.parameters.imagePullPolicy}}"
depends: "generate-group-id.Succeeded"
- name: benchmark-experiment
template: wrapped-benchmark-experiment
arguments:
parameters:
- name: groupId
value: "{{tasks.generate-group-id.outputs.result}}"
- name: runId
value: "{{item.runId}}"
- name: fileSize
value: "{{item.fileSize}}"
- name: seederSets
value: "{{item.seederSets}}"
- name: networkSize
value: "{{item.networkSize}}"
- name: seeders
value: "{{item.seeders}}"
- name: repetitions
value: "{{item.repetitions}}"
- name: runnerImage
value: "{{tasks.define-image-settings.outputs.parameters.image}}"
- name: imagePullPolicy
value: "{{tasks.define-image-settings.outputs.parameters.imagePullPolicy}}"
withParam: "{{tasks.expand-parameter-matrix.outputs.result}}"
depends: "expand-parameter-matrix.Succeeded"
- name: parse-logs
template: parse-logs
arguments:
parameters:
- name: groupId
value: "{{tasks.generate-group-id.outputs.result}}"
depends: "benchmark-experiment.AnySucceeded"
when: '{{workflow.parameters.parseLogs}} == true'
- name: parse-logs
inputs:
parameters:
- name: groupId
resource:
action: create
manifest: |
apiVersion: argoproj.io/v1alpha1
kind: Workflow
metadata:
generateName: log-parsing-
spec:
workflowTemplateRef:
name: log-parsing-workflow
arguments: arguments:
parameters: parameters:
- name: runnerImage - name: experimentGroupId
value: "{{steps.define-image-settings.outputs.parameters.image}}" value: "{{inputs.parameters.groupId}}"
- name: imagePullPolicy successCondition: status.phase == Succeeded
value: "{{steps.define-image-settings.outputs.parameters.imagePullPolicy}}" failureCondition: status.phase in (Failed, Error)
- - name: expand-parameter-matrix
template: expand-parameter-matrix
arguments:
parameters:
- name: runnerImage
value: "{{steps.define-image-settings.outputs.parameters.image}}"
- name: imagePullPolicy
value: "{{steps.define-image-settings.outputs.parameters.imagePullPolicy}}"
- - name: benchmark-experiment
template: wrapped-benchmark-experiment
arguments:
parameters:
- name: groupId
value: "{{steps.generate-group-id.outputs.result}}"
- name: runId
value: "{{item.runId}}"
- name: fileSize
value: "{{item.fileSize}}"
- name: seederSets
value: "{{item.seederSets}}"
- name: networkSize
value: "{{item.networkSize}}"
- name: seeders
value: "{{item.seeders}}"
- name: repetitions
value: "{{item.repetitions}}"
- name: runnerImage
value: "{{steps.define-image-settings.outputs.parameters.image}}"
- name: imagePullPolicy
value: "{{steps.define-image-settings.outputs.parameters.imagePullPolicy}}"
withParam: "{{steps.expand-parameter-matrix.outputs.result}}"
- name: define-image-settings - name: define-image-settings
# I think this goes to show just how clumsy Argo Workflows is. If I want to select # I think this goes to show just how clumsy Argo Workflows is. If I want to select
@ -121,7 +161,8 @@ spec:
imagePullPolicy: '{{inputs.parameters.imagePullPolicy}}' imagePullPolicy: '{{inputs.parameters.imagePullPolicy}}'
command: [ "/bin/bash" ] command: [ "/bin/bash" ]
source: | source: |
echo "$(date +%s)" # The ID starts with a "g" as otherwise we can't use it as a label value in k8s.
echo "g$(date +%s)"
# We "wrap" the benchmark workflow with a dummy workflow so exit handlers behave properly. If we # We "wrap" the benchmark workflow with a dummy workflow so exit handlers behave properly. If we
# were to call benchmark-experiment directly from the main flow, the exit handlers would be run # were to call benchmark-experiment directly from the main flow, the exit handlers would be run
@ -138,20 +179,11 @@ spec:
- name: networkSize - name: networkSize
- name: seeders - name: seeders
- name: repetitions - name: repetitions
steps:
- - name: benchmark-experiment dag:
tasks:
- name: benchmark-experiment
template: benchmark-experiment template: benchmark-experiment
hooks:
exit:
template: cleanup
arguments:
parameters:
- name: runId
value: "{{inputs.parameters.runId}}"
- name: runnerImage
value: "{{inputs.parameters.runnerImage}}"
- name: imagePullPolicy
value: "{{inputs.parameters.imagePullPolicy}}"
arguments: arguments:
parameters: parameters:
- name: groupId - name: groupId
@ -173,6 +205,32 @@ spec:
- name: imagePullPolicy - name: imagePullPolicy
value: "{{inputs.parameters.imagePullPolicy}}" value: "{{inputs.parameters.imagePullPolicy}}"
- name: cleanup-success
template: cleanup
arguments:
parameters:
- name: runId
value: "{{inputs.parameters.runId}}"
- name: runnerImage
value: "{{inputs.parameters.runnerImage}}"
- name: imagePullPolicy
value: "{{inputs.parameters.imagePullPolicy}}"
depends: "benchmark-experiment.Succeeded"
- name: cleanup-failure
template: cleanup
arguments:
parameters:
- name: runId
value: "{{inputs.parameters.runId}}"
- name: runnerImage
value: "{{inputs.parameters.runnerImage}}"
- name: imagePullPolicy
value: "{{inputs.parameters.imagePullPolicy}}"
depends: "benchmark-experiment.Failed"
when: '{{workflow.parameters.cleanupOnFailure}} == true'
- name: benchmark-experiment - name: benchmark-experiment
inputs: inputs:
@ -251,7 +309,7 @@ spec:
helm install e{{inputs.parameters.runId}} ./k8s/charts/deluge\ helm install e{{inputs.parameters.runId}} ./k8s/charts/deluge\
--namespace codex-benchmarks "${VALUE_FILE[@]}"\ --namespace codex-benchmarks "${VALUE_FILE[@]}"\
--set experiment.groupId=g{{inputs.parameters.groupId}}\ --set experiment.groupId={{inputs.parameters.groupId}}\
--set experiment.repetitions={{inputs.parameters.repetitions}}\ --set experiment.repetitions={{inputs.parameters.repetitions}}\
--set experiment.fileSize={{inputs.parameters.fileSize}}\ --set experiment.fileSize={{inputs.parameters.fileSize}}\
--set experiment.networkSize={{inputs.parameters.networkSize}}\ --set experiment.networkSize={{inputs.parameters.networkSize}}\
@ -273,7 +331,7 @@ spec:
./docker/bin/kubectl-wait-job\ ./docker/bin/kubectl-wait-job\
--selector=app.kubernetes.io/component=deluge-experiment-runner,\ --selector=app.kubernetes.io/component=deluge-experiment-runner,\
app.kubernetes.io/instance=e{{inputs.parameters.runId}},\ app.kubernetes.io/instance=e{{inputs.parameters.runId}},\
app.kubernetes.io/part-of=g{{inputs.parameters.groupId}}\ app.kubernetes.io/part-of={{inputs.parameters.groupId}}\
--timeout={{workflow.parameters.maxExperimentDuration}}\ --timeout={{workflow.parameters.maxExperimentDuration}}\
-n codex-benchmarks -n codex-benchmarks