feat: add argo workflow sketch

This commit is contained in:
gmega 2024-12-18 14:34:22 -03:00
parent 1bddfc7426
commit bea51a5adf
No known key found for this signature in database
GPG Key ID: 6290D34EAD824B18
15 changed files with 265 additions and 5 deletions

49
.github/workflows/argo.yaml vendored Normal file
View File

@ -0,0 +1,49 @@
# Builds the Argo Workflows runner image and pushes it to Docker Hub.
name: Build Argo Workflows Runner Image

# Triggered by pushes to master and by manual dispatch.
on:
  push:
    branches:
      - master
  workflow_dispatch:

env:
  DOCKER_FILE: ./docker/bittorrent-benchmarks-workflows.Dockerfile
  DOCKER_REPO: codexstorage/bittorrent-benchmarks-workflows

jobs:
  test-and-build:
    runs-on: ubuntu-22.04
    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3

      # Credentials are read from repository secrets.
      - name: Login to Docker Hub
        uses: docker/login-action@v3
        with:
          username: ${{ secrets.DOCKERHUB_USERNAME }}
          password: ${{ secrets.DOCKERHUB_TOKEN }}

      # Computes the tags/labels consumed by the build step below via
      # `steps.meta.outputs`.
      - name: Setup Docker Metadata
        id: meta
        uses: docker/metadata-action@v5
        with:
          images: ${{ env.DOCKER_REPO }}
          flavor: |
            latest=true
          tags: |
            type=sha

      # `push` is false only for pull_request events; neither trigger
      # configured above is a pull_request, so the image is always pushed.
      - name: Build and Push Prod. Image
        uses: docker/build-push-action@v6
        with:
          context: .
          file: ${{ env.DOCKER_FILE }}
          push: ${{ github.event_name != 'pull_request' }}
          tags: ${{ steps.meta.outputs.tags }}
          labels: ${{ steps.meta.outputs.labels }}

View File

@ -1,4 +1,4 @@
name: Test and Build
name: Test and Build Experiment Runner Image
on:
push:
@ -18,9 +18,6 @@ jobs:
- name: Checkout
uses: actions/checkout@v4
- name: Use Docker in rootless mode.
uses: ScribeMD/rootless-docker@0.2.2
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
@ -74,4 +71,4 @@ jobs:
file: ${{ env.DOCKER_FILE }}
push: ${{ github.event_name != 'pull_request' }}
tags: ${{ steps.meta.outputs.tags }}
labels: ${{ steps.meta.outputs.labels }}
labels: ${{ steps.meta.outputs.labels }}

101
docker/bin/kubectl-wait-job Normal file
View File

@ -0,0 +1,101 @@
#!/bin/bash
#
# This is copied from: https://github.com/brianpursley/kubectl-wait-job
#
# This code is licensed under the Creative Commons Attribution-ShareAlike 4.0 International License.
# To view a copy of this license, visit http://creativecommons.org/licenses/by-sa/4.0/
#
# Attribution: This code was inspired by an answer on Stack Overflow licensed under CC BY-SA 4.0.
# Original answer: https://stackoverflow.com/a/60286538/5074828 by Sebastian N (https://stackoverflow.com/users/3745474/sebastian-n)
#
# Print the usage text and exit with status 0 as soon as a literal --help
# shows up anywhere in the argument list.
for opt in "$@"; do
  case "$opt" in
    --help)
      echo "Usage: kubectl wait-job [ARGS] [OPTIONS]"
      echo ""
      echo "This plugin waits for a Kubernetes job to either complete or fail."
      echo ""
      echo "Arguments:"
      echo " [kubectl args] Any args will be passed to kubectl wait."
      echo ""
      echo "Options:"
      echo " [kubectl options] Any options will be passed to kubectl wait."
      echo ""
      echo "Example:"
      echo " kubectl wait-job job-name"
      echo ""
      exit 0
      ;;
  esac
done
# Reject any user-supplied --for flag (either `--for VALUE` or `--for=VALUE`):
# this script appends its own --for=condition=... to each kubectl wait call.
# The diagnostic goes to stderr; exit status 2 signals the usage error.
for arg in "$@"; do
  case "$arg" in
    --for|--for=*)
      echo "Error: The '--for' flag cannot be used with this plugin." >&2
      exit 2
      ;;
  esac
done
# Exit handler: delete the stderr capture files and stop any kubectl wait
# process that is still running. Variables that were never set (empty) are
# skipped, so this is safe to run at any point of the script.
cleanup() {
  local capture_file watcher_pid

  for capture_file in "$COMPLETE_STDERR" "$FAILED_STDERR"; do
    [[ -z $capture_file ]] || rm -f "$capture_file" 2> /dev/null
  done

  for watcher_pid in "$COMPLETE_PID" "$FAILED_PID"; do
    [[ -z $watcher_pid ]] || kill "$watcher_pid" 2> /dev/null
  done
}
trap cleanup EXIT
# Main logic. Exit statuses:
#   0 - the job reached condition=complete
#   1 - the job reached condition=failed
#   3 - a temp file could not be created, or a kubectl wait process exited
#       with a non-zero status before either condition was observed
# Diagnostics (including the captured kubectl stderr) are written to stderr;
# only the "Job completed successfully" / "Job failed" result goes to stdout.

# Create temporary files to store stderr output
COMPLETE_STDERR=$(mktemp -t kubectl-wait-job-stderr.XXXXXXXXXX) || { echo "error: failed to create temp file" >&2; exit 3; }
FAILED_STDERR=$(mktemp -t kubectl-wait-job-stderr.XXXXXXXXXX) || { echo "error: failed to create temp file" >&2; exit 3; }

# Wait for complete and failed conditions in parallel
kubectl wait job "$@" --for=condition=complete > /dev/null 2> "$COMPLETE_STDERR" &
COMPLETE_PID=$!
kubectl wait job "$@" --for=condition=failed > /dev/null 2> "$FAILED_STDERR" &
FAILED_PID=$!

# Wait for one of the processes to exit (using loop instead of wait -n for compatibility)
while true; do
  # Check if the process waiting for the job to complete has exited
  unset COMPLETE_RESULT
  if ! kill -0 "$COMPLETE_PID" 2>/dev/null; then
    # The process is gone; `wait` collects its exit status.
    wait "$COMPLETE_PID"
    COMPLETE_RESULT=$?
    if [[ $COMPLETE_RESULT -eq 0 ]]; then
      echo "Job completed successfully"
      exit 0
    fi
  fi

  # Check if the process waiting for the job to fail has exited
  unset FAILED_RESULT
  if ! kill -0 "$FAILED_PID" 2>/dev/null; then
    wait "$FAILED_PID"
    FAILED_RESULT=$?
    if [[ $FAILED_RESULT -eq 0 ]]; then
      echo "Job failed"
      exit 1
    fi
  fi

  # Reaching this point with a result set means that process exited with a
  # non-zero status: print the captured stderr of both processes and exit.
  if [[ -n $COMPLETE_RESULT || -n $FAILED_RESULT ]]; then
    cat "$COMPLETE_STDERR" >&2 2> /dev/null
    cat "$FAILED_STDERR" >&2 2> /dev/null
    echo "error: kubectl wait failed" >&2
    exit 3
  fi

  # Sleep for a short time before checking again
  sleep 0.1
done

View File

@ -0,0 +1,16 @@
# Runner image: kubectl + helm on Debian, plus the repository's k8s/ and
# docker/ trees under /opt/bittorrent-benchmarks.

# Stage used only as the source of the kubectl binary.
FROM bitnami/kubectl:1.31.1 AS kubectl

FROM debian:bookworm-slim

COPY --from=kubectl /opt/bitnami/kubectl/bin/kubectl /usr/local/bin/kubectl

# Install helm via the upstream installer script in a single layer, then drop
# the installer and the apt package lists so they do not bloat the image.
# ca-certificates is needed for HTTPS downloads once recommended packages are
# no longer pulled in implicitly.
# NOTE(review): openssl is listed explicitly because the helm installer appears
# to use it for checksum verification - confirm against the script.
# NOTE(review): the installer is fetched from the `main` branch and is not
# pinned to a revision or helm version - consider pinning for reproducibility.
RUN apt-get update \
    && apt-get install -y --no-install-recommends ca-certificates curl openssl \
    && curl -fsSL -o get_helm.sh https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 \
    && chmod 700 get_helm.sh \
    && ./get_helm.sh \
    && rm -f get_helm.sh \
    && rm -rf /var/lib/apt/lists/*

# WORKDIR creates the directory when it does not exist.
WORKDIR /opt/bittorrent-benchmarks

# COPY of a directory copies its *contents*, not the directory itself, so the
# destination must name the directory to keep k8s/... and docker/... paths
# resolvable relative to the working directory.
COPY ./k8s ./k8s
COPY ./docker ./docker

View File

@ -0,0 +1,49 @@
# Service account (plus the role and bindings in the documents that follow)
# carrying the permissions needed to run the Codex workflows. For now, this
# manifest must be applied by hand to the cluster running Argo Workflows.
kind: ServiceAccount
apiVersion: v1
metadata:
  namespace: argo
  name: codex-benchmarks-workflows
---
# Permissions for the workflow runner, scoped to the `argo` namespace.
# NOTE(review): `namespaces` is a cluster-scoped resource; a namespaced Role
# may not grant the intended access to it - confirm whether a ClusterRole is
# needed.
# NOTE(review): as a Role in `argo`, these rules presumably do not apply to
# resources created in other namespaces - verify against where the workflows
# deploy.
kind: Role
apiVersion: rbac.authorization.k8s.io/v1
metadata:
  namespace: argo
  name: codex-workflows-runner
rules:
  # Core API group: all verbs.
  - apiGroups:
      - ""
    resources:
      - "namespaces"
      - "persistentvolumeclaims"
      - "pods"
      - "services"
    verbs:
      - "*"
  # apps API group: all verbs.
  - apiGroups:
      - "apps"
    resources:
      - "deployments"
      - "statefulsets"
    verbs:
      - "*"
---
# Grants the codex-workflows-runner Role to the codex-benchmarks-workflows
# service account within the `argo` namespace.
kind: RoleBinding
apiVersion: rbac.authorization.k8s.io/v1
metadata:
  namespace: argo
  name: codex-workflows-runner
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: Role
  name: codex-workflows-runner
subjects:
  - kind: ServiceAccount
    namespace: argo
    name: codex-benchmarks-workflows
---
# Grants the `executor` Role to the codex-benchmarks-workflows service account
# within the `argo` namespace.
# NOTE(review): the `executor` Role is not defined in this manifest; presumably
# it is provided by the Argo Workflows installation - verify it exists.
kind: RoleBinding
apiVersion: rbac.authorization.k8s.io/v1
metadata:
  namespace: argo
  name: codex-workflows-runner-executor
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: Role
  name: executor
subjects:
  - kind: ServiceAccount
    namespace: argo
    name: codex-benchmarks-workflows

View File

@ -0,0 +1,48 @@
# Deluge benchmark: deploys the experiment with helm, waits for the test
# runner pod to become Ready, waits for the test runner job to finish, and
# uninstalls the helm release when the workflow exits.
# Relative paths in the scripts resolve against the image's working directory.
apiVersion: argoproj.io/v1alpha1
kind: Workflow
metadata:
  generateName: deluge-benchmark-
spec:
  serviceAccountName: codex-benchmarks-workflows
  entrypoint: deluge-benchmark-workflow
  # Exit handler: runs the cleanup template when the workflow ends, whether it
  # succeeded or failed, so the `e1` release does not linger and block the
  # next `helm install e1`.
  onExit: cleanup
  templates:
    # Each `- -` entry is its own step group, so the three steps run
    # sequentially.
    - name: deluge-benchmark-workflow
      steps:
        - - name: deploy-experiment
            template: deploy-experiment
        - - name: wait-for-testrunner
            template: wait-for-testrunner
        - - name: wait-for-completion
            template: wait-for-completion

    - name: deploy-experiment
      script:
        image: codexstorage/bittorrent-benchmarks-workflows:latest
        command: ["/bin/bash"]
        source: |
          helm install e1 k8s/charts/deluge --namespace codex-benchmarks

    - name: wait-for-testrunner
      script:
        image: codexstorage/bittorrent-benchmarks-workflows:latest
        command: ["/bin/bash"]
        source: |
          kubectl wait --for=condition=Ready --selector=app=testrunner pod -n codex-benchmarks --timeout=300s

    # NOTE(review): no --timeout is passed here, so kubectl wait's default
    # timeout applies - confirm it is long enough for a benchmark run.
    - name: wait-for-completion
      script:
        image: codexstorage/bittorrent-benchmarks-workflows:latest
        command: ["/bin/bash"]
        source: |
          set -e
          # Run through bash so the call does not depend on the script's
          # executable bit.
          bash ./docker/bin/kubectl-wait-job --selector=app=testrunner -n codex-benchmarks

    - name: cleanup
      script:
        image: codexstorage/bittorrent-benchmarks-workflows:latest
        command: ["/bin/bash"]
        source: |
          helm uninstall e1 -n codex-benchmarks