E M 185aa06514
delete terraform state lock
When the workflow is cancelled — either manually, or automatically by a long-running step hitting its timeout — the Terraform state lock had to be deleted by hand, or else the next workflow run would never succeed. This change ensures that the state lock file is always deleted after each run.
2026-04-30 18:15:57 +10:00

472 lines
19 KiB
YAML

name: Release

on:
  push:
    tags:
      - 'v*.*.*'
    branches:
      - master
  workflow_dispatch:
    inputs:
      branch:
        description: 'dist-tests branch to run tests from'
        required: false
        default: 'master'
        type: string

env:
  cache_nonce: 0 # Allows for easily busting actions/cache caches
  nim_version: pinned
  storage_binary_base: logos-storage
  c_bindings_lib_base: libstorage
  build_dir: build
  nim_flags: ''
  windows_libs: 'libstdc++-6.dll libgomp-1.dll libgcc_s_seh-1.dll libwinpthread-1.dll'
jobs:
  # Matrix: computes the os/cpu/builder/nim_version combinations consumed
  # by the (currently commented-out) build job via fromJson.
  matrix:
    runs-on: ubuntu-latest
    outputs:
      matrix: ${{ steps.matrix.outputs.matrix }}
    steps:
      - name: Compute matrix
        id: matrix
        uses: fabiocaccamo/create-matrix-action@v5
        with:
          matrix: |
            os {linux}, cpu {amd64}, builder {ubuntu-22.04}, nim_version {${{ env.nim_version }}}, shell {bash --noprofile --norc -e -o pipefail}
            os {linux}, cpu {arm64}, builder {ubuntu-22.04-arm}, nim_version {${{ env.nim_version }}}, shell {bash --noprofile --norc -e -o pipefail}
            os {macos}, cpu {arm64}, builder {macos-14}, nim_version {${{ env.nim_version }}}, shell {bash --noprofile --norc -e -o pipefail}
            os {windows}, cpu {amd64}, builder {windows-latest}, nim_version {${{ env.nim_version }}}, shell {msys2}
# Build
# build:
# needs: matrix
# strategy:
# fail-fast: false
# matrix:
# include: ${{fromJson(needs.matrix.outputs.matrix)}}
# defaults:
# run:
# shell: ${{ matrix.shell }} {0}
# name: ${{ matrix.os }}-${{ matrix.cpu }}-${{ matrix.nim_version }}
# runs-on: ${{ matrix.builder }}
# timeout-minutes: 80
# steps:
# - name: Set conditional env variables
# shell: bash
# run: |
# if [[ "${{ github.ref_type }}" == "tag" ]]; then
# echo "VERSION=${{ github.ref_name }}" >> $GITHUB_ENV
# echo "TAGGED_RELEASE=true" >> $GITHUB_ENV
# else
# echo "VERSION=${GITHUB_SHA::7}" >> $GITHUB_ENV
# echo "TAGGED_RELEASE=false" >> $GITHUB_ENV
# fi
# - name: Checkout sources
# uses: actions/checkout@v4
# with:
# submodules: recursive
# - name: Setup Nimbus Build System
# uses: ./.github/actions/nimbus-build-system
# with:
# os: ${{ matrix.os }}
# cpu: ${{ matrix.cpu }}
# shell: ${{ matrix.shell }}
# nim_version: ${{ matrix.nim_version }}
# - name: Compute binary name
# run: |
# case ${{ matrix.os }} in
# linux*) os_name="linux" ;;
# macos*) os_name="darwin" ;;
# windows*) os_name="windows" ;;
# esac
# storage_binary="${{ env.storage_binary_base }}-${os_name}-${{ matrix.cpu }}-${{ env.VERSION }}"
# c_bindings_lib="${{ env.c_bindings_lib_base }}-${os_name}-${{ matrix.cpu }}-${{ env.VERSION }}"
# if [[ ${os_name} == "windows" ]]; then
# storage_binary="${storage_binary}.exe"
# fi
# echo "storage_binary=${storage_binary}" >>$GITHUB_ENV
# echo "c_bindings_lib=${c_bindings_lib}" >>$GITHUB_ENV
# - name: Build Logos Storage binary
# run: |
# make NIMFLAGS="--out:${{ env.build_dir }}/${{ env.storage_binary }} ${{ env.nim_flags }}"
# - name: Package ${{ env.storage_binary_base }} Linux (compress and preserve perms)
# if: matrix.os == 'linux'
# run: |
# sudo apt-get update && sudo apt-get install -y zip
# zip -j "${{ env.build_dir }}/${{env.storage_binary}}.zip" ./${{ env.build_dir }}/*
# - name: Package ${{ env.storage_binary_base }} MacOS (compress and preserve perms)
# if: matrix.os == 'macos'
# run: |
# zip -j "${{ env.build_dir }}/${{env.storage_binary}}.zip" ./${{ env.build_dir }}/*
# - name: Package ${{ env.storage_binary_base }} Windows (compress and preserve perms)
# if: matrix.os == 'windows'
# shell: msys2 {0}
# run: |
# 7z a -tzip "${{ env.build_dir }}/${{env.storage_binary}}.zip" ./${{ env.build_dir }}/*
# - name: Upload Logos Storage binary to workflow artifacts
# uses: actions/upload-artifact@v4
# with:
# name: ${{ env.storage_binary }}.zip
# path: ${{ env.build_dir }}/${{ env.storage_binary }}.zip
# retention-days: 30
# - name: Copy and zip Windows dlls to build/dlls dir (Windows)
# if: matrix.os == 'windows'
# run: |
# mkdir -p "${{ env.build_dir }}/dlls"
# for lib in ${{ env.windows_libs }}; do
# cp -v "${MINGW_PREFIX}/bin/${lib}" "${{ env.build_dir }}/dlls"
# done
# 7z a -tzip "${{ env.build_dir }}/${{ env.storage_binary }}-dlls.zip" ./${{ env.build_dir }}/dlls/*.dll
# - name: Upload Windows dlls to workflow artifacts
# if: matrix.os == 'windows'
# uses: actions/upload-artifact@v4
# with:
# name: ${{ env.storage_binary }}-dlls.zip
# path: ${{ env.build_dir }}/${{ env.storage_binary }}-dlls.zip
# retention-days: 30
# - name: Build ${{ env.c_bindings_lib_base }} (Linux)
# if: matrix.os == 'linux'
# run: |
# make -j${ncpu} update
# make -j${ncpu} libstorage
# - name: Build ${{ env.c_bindings_lib_base }} (MacOS)
# if: matrix.os == 'macos'
# run: |
# make -j${ncpu} update
# STORAGE_LIB_PARAMS="--passL:\"-Wl,-install_name,@rpath/${{ env.c_bindings_lib_base }}.dylib\"" make -j${ncpu} libstorage
# - name: Build ${{ env.c_bindings_lib_base }} (Windows)
# if: matrix.os == 'windows'
# shell: msys2 {0}
# run: |
# make -j${ncpu} update
# make -j${ncpu} libstorage
# - name: Package ${{ env.c_bindings_lib_base }} Linux
# if: matrix.os == 'linux'
# run: |
# sudo apt-get update && sudo apt-get install -y zip
# zip -j "${{ env.build_dir }}/${{ env.c_bindings_lib }}.zip" ${{ env.build_dir }}/${{ env.c_bindings_lib_base }}.so
# zip -j "${{ env.build_dir }}/${{ env.c_bindings_lib }}.zip" library/${{ env.c_bindings_lib_base }}.h
# - name: Package ${{ env.c_bindings_lib_base }} MacOS
# if: matrix.os == 'macos'
# run: |
# zip -j "${{ env.build_dir }}/${{ env.c_bindings_lib }}.zip" ${{ env.build_dir }}/${{ env.c_bindings_lib_base }}.dylib
# zip -j "${{ env.build_dir }}/${{ env.c_bindings_lib }}.zip" library/${{ env.c_bindings_lib_base }}.h
# - name: Package ${{ env.c_bindings_lib_base }} (Windows)
# if: matrix.os == 'windows'
# shell: msys2 {0}
# run: |
# 7z a -tzip "${{ env.build_dir }}/${{ env.c_bindings_lib }}.zip" ./${{ env.build_dir }}/${{ env.c_bindings_lib_base }}.dll
# 7z a -tzip "${{ env.build_dir }}/${{ env.c_bindings_lib }}.zip" ./library/${{ env.c_bindings_lib_base }}.h
# - name: Upload ${{ env.c_bindings_lib_base }} to workflow artifacts
# uses: actions/upload-artifact@v4
# with:
# name: ${{ env.c_bindings_lib }}.zip
# path: ${{ env.build_dir }}/${{ env.c_bindings_lib }}.zip
# if-no-files-found: error
# # Build Docker logosstorage/logos-storage-nim:latest-dist-tests image for Logos Storage nodes in the cluster
# build-docker-dist-tests:
# name: Build Docker dist-tests image
# if: github.ref_type == 'tag' || github.event_name == 'workflow_dispatch'
# uses: ./.github/workflows/docker-reusable.yml
# with:
# nimflags: '-d:disableMarchNative -d:storage_enable_api_debug_peers=true -d:storage_enable_log_counter=true'
# nat_ip_auto: true
# tag_latest: false
# tag_stable: false
# tag_suffix: dist-tests
# tag_sha: false
# secrets: inherit
# Release tests
release-tests:
name: Release Tests
runs-on: ubuntu-latest
if: github.ref_type == 'tag' || github.event_name == 'workflow_dispatch'
# needs: build-docker-dist-tests
timeout-minutes: 60
permissions:
id-token: write
contents: read
env:
TF_VAR_project: ${{ vars.RELEASE_TESTS_GCP_PROJECT }}
TF_VAR_region: europe-west4
TF_VAR_zone: europe-west4-a
TF_PLUGIN_CACHE_DIR: /home/runner/.terraform.d/plugin-cache
STORAGEDOCKERIMAGE: ${{ github.ref_type == 'tag' && format('logosstorage/logos-storage-nim:{0}-dist-tests', github.ref_name) || 'logosstorage/logos-storage-nim:latest-dist-tests' }}
TEST_TYPE: release-tests
BRANCH: ${{ inputs.branch || 'master' }}
SOURCE: https://github.com/logos-storage/logos-storage-nim-cs-dist-tests
TF_DIR: .github/release/clusters/logos-storage-rel-tests-gcp-europe-west4
CLUSTER_NAME: logos-storage-rel-tests-gcp-europe-west4 # should always match the cluster_name variable in TF_DIR
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Create Terraform plugin cache dir
run: mkdir -p /home/runner/.terraform.d/plugin-cache
- name: Cache Terraform plugins
uses: actions/cache@v4
with:
path: ~/.terraform.d/plugin-cache
key: terraform-google-${{ hashFiles(format('{0}/.terraform.lock.hcl', env.TF_DIR)) }}
restore-keys: terraform-google-
- name: Authenticate to GCP
uses: google-github-actions/auth@v2
with:
workload_identity_provider: ${{ secrets.RELEASE_TESTS_GCP_WORKLOAD_IDENTITY_PROVIDER }}
service_account: ${{ secrets.RELEASE_TESTS_GCP_SERVICE_ACCOUNT }}
- name: Setup gcloud
uses: google-github-actions/setup-gcloud@v2
with:
install_components: gke-gcloud-auth-plugin
- name: Setup Terraform
uses: hashicorp/setup-terraform@v3
- name: Setup kubectl
uses: azure/setup-kubectl@v4
with:
version: v1.36.0
- name: Terraform init
working-directory: ${{ env.TF_DIR }}
run: terraform init -backend-config="bucket=${{ vars.RELEASE_TESTS_TF_STATE_BUCKET }}"
- name: Terraform apply
id: tf-apply
working-directory: ${{ env.TF_DIR }}
run: terraform apply -auto-approve
- name: Get kubeconfig
run: |
gcloud container clusters get-credentials $CLUSTER_NAME \
--zone ${{ env.TF_VAR_zone }} \
--project ${{ vars.RELEASE_TESTS_GCP_PROJECT }}
- name: Wait for runners-ci node to be Ready
run: |
kubectl wait \
--for=condition=Ready \
node \
-l workload-type=tests-runners-ci \
--timeout=300s
- name: Create in-cluster app kubeconfig secret
run: |
kubectl create serviceaccount release-tests-runner -n default
kubectl create clusterrolebinding release-tests-runner \
--clusterrole=cluster-admin \
--serviceaccount=default:release-tests-runner
export TOKEN=$(kubectl create token release-tests-runner -n default --duration=2h)
export SERVER=$(kubectl config view --minify -o jsonpath='{.clusters[0].cluster.server}')
export CA=$(kubectl config view --minify --raw -o jsonpath='{.clusters[0].cluster.certificate-authority-data}')
kubectl create secret generic storage-dist-tests-app-kubeconfig \
--from-file=kubeconfig.yaml=<(envsubst < .github/release/kubeconfig-template.yaml) \
-n default
- name: Set run variables
run: |
RUNID=$(date +%Y%m%d-%H%M%S)
echo "RUNID=${RUNID}" >> $GITHUB_ENV
echo "NAMEPREFIX=r-tests-${RUNID}" >> $GITHUB_ENV
echo "TESTID=$(git rev-parse --short HEAD)" >> $GITHUB_ENV
echo "JOB_START_TIME=$(date -u +"%Y-%m-%dT%H:%M:%SZ")" >> $GITHUB_ENV
- name: Deploy test job
run: |
envsubst < .github/release/job-release-tests.yaml | kubectl apply -f -
echo "--- Job ---"
kubectl get job $NAMEPREFIX -n default
echo "--- Pods ---"
kubectl get pods -n default
echo "--- Job events ---"
kubectl describe job $NAMEPREFIX -n default
- name: Print storage node log link
run: |
QUERY=$(printf '%s\n%s\n%s' \
'resource.type="k8s_container"' \
"resource.labels.cluster_name=\"${CLUSTER_NAME}\"" \
"labels.\"k8s-pod/runid\"=\"${RUNID}\"")
ENCODED=$(python3 -c "import urllib.parse,sys; print(urllib.parse.quote(sys.stdin.read(), safe=''))" <<< "$QUERY")
URL="https://console.cloud.google.com/logs/query;query=${ENCODED};startTime=${JOB_START_TIME}?project=${{ vars.RELEASE_TESTS_GCP_PROJECT }}"
echo "Storage node logs: $URL"
echo "## Storage Node Logs" >> "$GITHUB_STEP_SUMMARY"
echo "" >> "$GITHUB_STEP_SUMMARY"
echo "Run ID: \`${RUNID}\`" >> "$GITHUB_STEP_SUMMARY" `
echo "" >> "$GITHUB_STEP_SUMMARY"
echo "[View in Cloud Logging]($URL)" >> "$GITHUB_STEP_SUMMARY"
echo "" >> "$GITHUB_STEP_SUMMARY"
echo "Filter: \`labels.\"k8s-pod/runid\"=\"${RUNID}\"\`" >> "$GITHUB_STEP_SUMMARY"`
- name: Wait for test pod to start
run: |
echo "Waiting for test pod to reach Running state..."
deadline=$((SECONDS + 300))
last_describe=0
while [[ $SECONDS -lt $deadline ]]; do
phase=$(kubectl get pods \
-l job-name=$NAMEPREFIX \
-n default \
-o jsonpath='{range .items[*]}{.status.phase}{end}' 2>/dev/null)
echo "Pod phase: ${phase:-not yet created}"
if [[ "$phase" == "Running" ]]; then break; fi
if [[ $((SECONDS - last_describe)) -ge 60 ]]; then
echo "--- kubectl describe job $NAMEPREFIX ---"
kubectl describe job $NAMEPREFIX -n default
last_describe=$SECONDS
fi
sleep 10
done
if [[ "$phase" != "Running" ]]; then
echo "Timed out waiting for pod to reach Running state"
exit 1
fi
- name: Stream test logs
run: |
POD=$(kubectl get pods -l job-name=$NAMEPREFIX -n default \
-o jsonpath='{.items[0].metadata.name}')
echo "Streaming logs for pod: $POD"
# Use pod name (not label selector) so the stream survives long silences
# between test completions. || true so the step doesn't fail if the
# API server closes the connection before the pod exits.
kubectl logs $POD -n default --follow || true
- name: Check job status
run: |
# kubectl logs may have exited early (API server closed the stream).
# Wait for the job to reach a terminal state before checking the result.
kubectl wait job/$NAMEPREFIX -n default \
--for=condition=Complete \
--timeout=300s \
|| kubectl wait job/$NAMEPREFIX -n default \
--for=condition=Failed \
--timeout=0s
job_status=$(kubectl get jobs $NAMEPREFIX -n default \
-o jsonpath='{.status.conditions[0].type}')
echo "Job status: $job_status"
[[ "$job_status" == "SuccessCriteriaMet" ]] || exit 1
- name: Generate test summary
if: always() && steps.tf-apply.conclusion == 'success'
env:
GCP_PROJECT: ${{ vars.RELEASE_TESTS_GCP_PROJECT }}
run: |
export JOB_START=$(kubectl get job "$NAMEPREFIX" -n default \
-o jsonpath='{.status.startTime}' 2>/dev/null || true)
export JOB_END=$(kubectl get job "$NAMEPREFIX" -n default \
-o jsonpath='{.status.completionTime}' 2>/dev/null || true)
FILTER="resource.type=\"k8s_container\""
FILTER+=" AND resource.labels.namespace_name=\"default\""
FILTER+=" AND resource.labels.container_name=\"runner\""
FILTER+=" AND labels.\"k8s-pod/runid\"=\"${RUNID}\""
FILTER+=" AND jsonPayload.type=\"test-result\""
export ENTRIES_FILE=$(mktemp)
for attempt in $(seq 1 12); do
gcloud logging read "$FILTER" --order=asc --limit=1000 --format=json \
--project="$GCP_PROJECT" > "$ENTRIES_FILE" 2>/dev/null || echo "[]" > "$ENTRIES_FILE"
[[ $(cat "$ENTRIES_FILE") != "[]" ]] && break
echo "Attempt $attempt/12: waiting for test results in Cloud Logging..."
[[ $attempt -lt 12 ]] && sleep 10
done
python3 .github/scripts/generate_test_summary.py
rm -f "$ENTRIES_FILE"
- name: Delete PVCs before cluster teardown
if: always() && steps.tf-apply.conclusion != 'skipped'
run: |
# Delete all PVCs so the CSI driver can release GCE PDs before the cluster is destroyed.
# Without this, terraform destroy orphans the PDs and they consume SSD quota indefinitely.
kubectl delete pvc --all --all-namespaces --wait=false 2>/dev/null || true
- name: Terraform destroy
if: always() && steps.tf-apply.conclusion != 'skipped'
working-directory: ${{ env.TF_DIR }}
run: terraform destroy -auto-approve
- name: Release Terraform state lock
if: always()
run: |
gcloud storage rm \
"gs://${{ vars.RELEASE_TESTS_TF_STATE_BUCKET }}/clusters/${CLUSTER_NAME}/default.tflock" \
2>/dev/null || true
# Release
# release:
# runs-on: ubuntu-latest
# needs: [build, release-tests]
# if: needs.build.result == 'success' && needs.release-tests.result == 'success'
# steps:
# - name: Set conditional env variables
# shell: bash
# run: |
# if [[ "${{ github.ref_type }}" == "tag" ]]; then
# echo "VERSION=${{ github.ref_name }}" >> $GITHUB_ENV
# echo "TAGGED_RELEASE=true" >> $GITHUB_ENV
# else
# echo "VERSION=${GITHUB_SHA::7}" >> $GITHUB_ENV
# echo "TAGGED_RELEASE=false" >> $GITHUB_ENV
# fi
# - name: Download binaries from workflow artifacts into temp folder
# uses: actions/download-artifact@v4
# with:
# pattern: ${{ env.storage_binary_base }}*
# merge-multiple: true
# path: /tmp/release
# - name: Download ${{ env.c_bindings_lib_base }} from workflow artifacts into temp folder
# uses: actions/download-artifact@v4
# with:
# pattern: ${{ env.c_bindings_lib_base }}*
# merge-multiple: true
# path: /tmp/release
# - name: Create GH release
# uses: softprops/action-gh-release@v2
# if: env.TAGGED_RELEASE == 'true'
# with:
# files: |
# /tmp/release/*-*
# make_latest: true
# - name: Generate Python SDK
# uses: peter-evans/repository-dispatch@v3
# if: env.TAGGED_RELEASE == 'true'
# with:
# token: ${{ secrets.DISPATCH_PAT }}
# repository: logos-storage/logos-storage-py-api-client
# event-type: generate
# client-payload: '{"openapi_url": "https://raw.githubusercontent.com/logos-storage/logos-storage-nim/${{ github.ref }}/openapi.yaml"}'