feat: end-to-end release test pipeline on GKE with structured logging (#1439)

Co-authored-by: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Eric 2026-06-05 16:27:55 +10:00 committed by GitHub
parent b892379ff1
commit d61512a5b7
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
21 changed files with 680 additions and 36 deletions

View File

@ -1,4 +1,13 @@
{
"permissions": {
"allow": [
"Bash(npm install)",
"WebSearch",
"Bash(npm test:*)",
"WebFetch(domain:github.com)",
"WebFetch(domain:api.github.com)"
]
},
"env": {
"MCP_TIMEOUT": "120000"
}

View File

@ -88,7 +88,7 @@ runs:
- name: Install ccache on Linux/Mac
if: inputs.os == 'linux' || inputs.os == 'macos'
uses: hendrikmuhs/ccache-action@v1.2
uses: hendrikmuhs/ccache-action@v1.2.23
with:
create-symlink: false
key: ${{ inputs.os }}-${{ inputs.builder }}-${{ inputs.cpu }}-${{ inputs.tests }}-${{ inputs.nim_version }}-${{ github.run_id }}-${{ github.run_number }}-${{ github.run_attempt }}
@ -103,7 +103,7 @@ runs:
- name: Install ccache on Windows
if: inputs.os == 'windows'
uses: hendrikmuhs/ccache-action@v1.2
uses: hendrikmuhs/ccache-action@v1.2.23
with:
key: ${{ inputs.os }}-${{ inputs.builder }}-${{ inputs.cpu }}-${{ inputs.tests }}-${{ inputs.nim_version }}-${{ github.run_id }}-${{ github.run_number }}-${{ github.run_attempt }}
evict-old-files: 7d
@ -197,7 +197,7 @@ runs:
- name: Restore Nim toolchain binaries from cache
id: nim-cache
uses: actions/cache@v4
uses: actions/cache@v5
if: ${{ inputs.coverage != 'true' }}
with:
path: NimBinaries

View File

@ -0,0 +1,7 @@
# Remote state for the release-tests cluster is kept in a GCS bucket.
# The bucket name is not committed; supply it when initialising:
#   terraform init -backend-config="bucket=<bucket-name>"
terraform {
  backend "gcs" {
    prefix = "clusters/logos-storage-rel-tests-gcp-europe-west4"
  }
}

View File

@ -0,0 +1,27 @@
# Release-tests GKE cluster.
# The runner pool and the tests pool are both passed to the module, which
# declares them inline on the cluster so GCP provisions them in parallel.
module "gke" {
  source = "../modules/gke"

  name    = "logos-storage-rel-tests"
  project = var.project
  region  = var.region
  zone    = var.zone

  # Pool that hosts the test-runner Job (selected via workload-type).
  node_pool_name         = "runners-ci-e2-standard-2"
  node_pool_machine_type = "e2-standard-2"
  node_pool_count        = 1
  node_pool_labels = {
    default-pool  = "true"
    scaling-type  = "fixed"
    workload-type = "tests-runners-ci"
  }

  # Pool for the pods created by the tests themselves.
  tests_pool_name         = "tests-e2-medium"
  tests_pool_machine_type = "e2-medium"
  tests_pool_count        = 5
  tests_pool_labels = {
    default-pool  = "false"
    scaling-type  = "fixed"
    workload-type = "tests-pods"
  }
}

View File

@ -0,0 +1,15 @@
# Providers

provider "google" {
  project = var.project
  region  = var.region
}

# Short-lived OAuth access token taken from the active gcloud credentials.
# It authenticates the kubernetes provider against the cluster that is
# created in this same apply.
data "google_client_config" "default" {}

# NOTE(review): this provider is configured from outputs of a cluster managed
# in the same configuration; confirm this stacking is acceptable for the
# create/destroy cycle used here.
provider "kubernetes" {
  host                   = "https://${module.gke.endpoint}"
  token                  = data.google_client_config.default.access_token
  cluster_ca_certificate = base64decode(module.gke.ca_certificate)
}

View File

@ -0,0 +1,33 @@
# In-cluster RBAC for the release-tests runner.
#
# The dist-tests Job runs inside this cluster and programmatically creates and
# deletes Kubernetes resources (storage-node pods) for each test, so it needs
# API credentials. The Job runs under the release-tests-runner ServiceAccount,
# which Kubernetes automatically mounts as a short-lived projected token - no
# static kubeconfig or token Secret is required.

resource "kubernetes_service_account" "release_tests_runner" {
  metadata {
    namespace = "default"
    name      = "release-tests-runner"
  }
}

# Binds the ServiceAccount above to the built-in cluster-admin ClusterRole.
# NOTE(review): cluster-admin is cluster-wide; confirm the tests need more
# than namespaced permissions before keeping this on a longer-lived cluster.
resource "kubernetes_cluster_role_binding" "release_tests_runner" {
  metadata {
    name = "release-tests-runner"
  }

  role_ref {
    api_group = "rbac.authorization.k8s.io"
    kind      = "ClusterRole"
    name      = "cluster-admin"
  }

  subject {
    kind      = "ServiceAccount"
    namespace = kubernetes_service_account.release_tests_runner.metadata[0].namespace
    name      = kubernetes_service_account.release_tests_runner.metadata[0].name
  }
}

View File

@ -0,0 +1,14 @@
# Inputs of the cluster configuration. None has a default, so each must be
# supplied by the caller (for example through TF_VAR_* environment variables).

variable "project" {
  type        = string
  description = "GCP project ID"
}

variable "region" {
  type        = string
  description = "GCP region (e.g. europe-west4)"
}

variable "zone" {
  type        = string
  description = "GCP zone for the cluster (e.g. europe-west4-b)"
}

View File

@ -0,0 +1,14 @@
# Terraform settings: CLI and provider version constraints for the cluster
# configuration (google for the cluster itself, kubernetes for in-cluster RBAC).
terraform {
  required_version = "~> 1.0"

  required_providers {
    kubernetes = {
      source  = "hashicorp/kubernetes"
      version = "~> 2.0"
    }
    google = {
      source  = "hashicorp/google"
      version = "~> 6.0"
    }
  }
}

View File

@ -0,0 +1,3 @@
locals {
  # Resource name: "<name>-gcp-<region>".
  name = "${var.name}-gcp-${var.region}"
}

View File

@ -0,0 +1,48 @@
# Zonal GKE cluster with two fixed-size node pools.
#
# The pools are declared inline (rather than as separate
# google_container_node_pool resources) so GCP provisions them in parallel
# during cluster creation instead of one after the other.
resource "google_container_cluster" "this" {
  project  = var.project
  name     = local.name
  location = var.zone

  deletion_protection = false

  # Send pod stdout/stderr to Cloud Logging automatically.
  logging_service    = "logging.googleapis.com/kubernetes"
  monitoring_service = "monitoring.googleapis.com/kubernetes"

  timeouts {
    create = "20m"
  }

  # Default pool.
  node_pool {
    name       = var.node_pool_name
    node_count = var.node_pool_count

    node_config {
      machine_type = var.node_pool_machine_type
      disk_size_gb = 50
      labels       = var.node_pool_labels
      oauth_scopes = ["https://www.googleapis.com/auth/cloud-platform"]
    }
  }

  # Tests pool.
  node_pool {
    name       = var.tests_pool_name
    node_count = var.tests_pool_count

    node_config {
      machine_type = var.tests_pool_machine_type
      disk_size_gb = 20
      labels       = var.tests_pool_labels
      oauth_scopes = ["https://www.googleapis.com/auth/cloud-platform"]
    }
  }
}

View File

@ -0,0 +1,21 @@
# Kubernetes cluster outputs.

output "kubernetes_cluster_id" {
  description = "The fully-qualified ID of the GKE cluster."
  value       = google_container_cluster.this.id
}

output "kubernetes_cluster_name" {
  description = "The name of the GKE cluster."
  value       = google_container_cluster.this.name
}

# Bare address; callers add the "https://" scheme themselves.
output "endpoint" {
  description = "The IP address of the cluster's Kubernetes API server."
  value       = google_container_cluster.this.endpoint
}

# Still base64-encoded; callers must base64decode() it before use.
output "ca_certificate" {
  description = "Base64-encoded public CA certificate of the cluster's API server."
  value       = google_container_cluster.this.master_auth[0].cluster_ca_certificate
  sensitive   = true
}

View File

@ -0,0 +1,67 @@
# Main

variable "name" {
  type        = string
  description = "A name for the created resources."
}

variable "project" {
  type        = string
  description = "The GCP project ID."
}

# Within this module the region only contributes to the generated resource
# name ("<name>-gcp-<region>"); the cluster location is taken from var.zone.
variable "region" {
  type        = string
  description = "The GCP region; used as the suffix of the generated cluster name."
}

variable "zone" {
  type        = string
  description = "The GCP zone for the cluster. Using a single zone avoids the longer provisioning time of a regional (multi-zone) cluster."
}

# Kubernetes default Node Pool

variable "node_pool_name" {
  type        = string
  description = "A name for the default node pool."
}

variable "node_pool_machine_type" {
  type        = string
  description = "The GCE machine type for nodes in the default pool."
}

variable "node_pool_count" {
  type        = number
  description = "Fixed number of nodes in the default pool."
}

variable "node_pool_labels" {
  type        = map(string)
  description = "A map of key/value pairs to apply as Kubernetes labels to nodes in the default pool."
  default = {
    default-pool = "true"
    scaling-type = "fixed"
  }
}

# Tests node pool (fixed size, single zone)

variable "tests_pool_name" {
  type        = string
  description = "Name for the tests node pool."
}

variable "tests_pool_machine_type" {
  type        = string
  description = "The GCE machine type for nodes in the tests pool."
}

variable "tests_pool_count" {
  type        = number
  description = "Fixed number of nodes in the tests pool (no autoscaling; this is a transient cluster)."
}

variable "tests_pool_labels" {
  type        = map(string)
  description = "Kubernetes labels to apply to nodes in the tests pool."
  default     = {}
}

View File

@ -0,0 +1,9 @@
# Terraform settings: the module only needs the google provider.
terraform {
  required_providers {
    google = {
      version = "~> 6.0"
      source  = "hashicorp/google"
    }
  }
}

55
.github/release/job-release-tests.yaml vendored Normal file
View File

@ -0,0 +1,55 @@
# Release-tests runner Job (template).
#
# The ${...} placeholders are substituted before the manifest is applied.
# Every templated scalar is quoted so the substituted value is always read as
# a string, even if it happens to look like a number, boolean or null.
apiVersion: batch/v1
kind: Job
metadata:
  name: "${NAMEPREFIX}"
  namespace: default
  labels:
    name: "${NAMEPREFIX}"
    runid: "${RUNID}"
spec:
  # Keep the finished Job around for one day, and never retry a failed run.
  ttlSecondsAfterFinished: 86400
  backoffLimit: 0
  template:
    metadata:
      name: "${NAMEPREFIX}"
      labels:
        app: "${TEST_TYPE}-runner"
        name: "${NAMEPREFIX}"
        runid: "${RUNID}"
    spec:
      # Schedule the runner on the node pool labelled for CI runners.
      nodeSelector:
        workload-type: "tests-runners-ci"
      containers:
        - name: runner
          image: logosstorage/logos-storage-dist-tests:latest
          imagePullPolicy: Always
          resources:
            requests:
              memory: "4Gi"
            limits:
              memory: "6Gi"
          env:
            - name: LOGPATH
              value: "/var/log/storage-${TEST_TYPE}"
            - name: BRANCH
              value: "${BRANCH}"
            - name: SOURCE
              value: "${SOURCE}"
            - name: RUNID
              value: "${RUNID}"
            - name: STORAGEDOCKERIMAGE
              value: "${STORAGEDOCKERIMAGE}"
            - name: TESTID
              value: "${TESTID}"
            - name: TESTS_TYPE
              value: "${TEST_TYPE}"
          volumeMounts:
            - name: logs
              mountPath: "/var/log/storage-${TEST_TYPE}"
          args: ["dotnet", "test", "Tests/LogosStorageReleaseTests", "-p:BuildInParallel=false"]
      # ServiceAccount whose mounted token the tests use to talk to the API server.
      serviceAccountName: release-tests-runner
      restartPolicy: Never
      volumes:
        - name: logs
          hostPath:
            path: "/var/log/storage-${TEST_TYPE}"

View File

@ -0,0 +1,86 @@
"""
Writes a Markdown summary of a release-tests run to $GITHUB_STEP_SUMMARY.

Input is a JSON array of test-result entries shaped like Cloud Logging
output: each entry is an object whose "jsonPayload" carries at least the
"fixture" and "status" fields of one test method. Results are aggregated per
fixture and each fixture is linked to a pre-filtered Cloud Logging query.

Expected env vars (all set by the workflow before calling this script):
  ENTRIES_FILE        - path to the JSON file with the test-result entries
  RUNID               - the test run ID (e.g. 20260430-060144)
  CLUSTER_NAME        - GKE cluster name
  GCP_PROJECT         - GCP project ID
  JOB_START_TIME      - ISO timestamp used as the Cloud Logging URL startTime
  JOB_START           - job startTime from kubectl (for duration calc)
  JOB_END             - job completionTime from kubectl (for duration calc)
  GITHUB_STEP_SUMMARY - path to the GHA step summary file
"""
import json
import os
import sys
import urllib.parse
from datetime import datetime

# Markers shown in front of each fixture in the summary list.
PASSED_ICON = "\u2705"  # white heavy check mark
FAILED_ICON = "\u274c"  # cross mark

# Timestamp layout of JOB_START / JOB_END.
TIMESTAMP_FORMAT = "%Y-%m-%dT%H:%M:%SZ"


def aggregate_fixtures(entries):
    """Collapse per-method results into one status per fixture.

    Returns ``(statuses, order)``: ``statuses`` maps fixture name to status and
    ``order`` lists fixtures in first-seen order. A fixture takes the status of
    its first entry and is switched to "Failed" as soon as any later entry
    failed. Entries without a fixture name are ignored.
    """
    statuses, order = {}, []
    for entry in entries:
        # "jsonPayload" may be missing or null; treat both as an empty payload.
        payload = entry.get("jsonPayload") or {}
        fixture = payload.get("fixture", "")
        status = payload.get("status", "")
        if not fixture:
            continue
        if fixture not in statuses:
            order.append(fixture)
            statuses[fixture] = status
        elif status == "Failed":
            statuses[fixture] = status
    return statuses, order


def format_duration(job_start, job_end):
    """Return " in <m>m <s>s" for the job, or "" when a timestamp is unusable.

    The duration is best-effort: a missing or malformed timestamp (for example
    an empty JOB_END) must not prevent the summary from being written.
    """
    try:
        elapsed = datetime.strptime(job_end, TIMESTAMP_FORMAT) - datetime.strptime(
            job_start, TIMESTAMP_FORMAT
        )
    except (TypeError, ValueError):
        return ""
    secs = int(elapsed.total_seconds())
    return f" in {secs // 60}m {secs % 60}s"


def log_url(fixture, cluster, runid, start, project):
    """Build the Cloud Logging console URL filtered to one fixture of this run."""
    query = "\n".join([
        'resource.type="k8s_container"',
        f'resource.labels.cluster_name="{cluster}"',
        f'labels."k8s-pod/runid"="{runid}"',
        # The fixture name is lower-cased for the pod-label filter.
        f'labels."k8s-pod/fixturename"="{fixture.lower()}"',
    ])
    encoded = urllib.parse.quote(query, safe="")
    return (
        f"https://console.cloud.google.com/logs/query"
        f";query={encoded}"
        f";startTime={start}"
        f"?project={project}"
    )


def main():
    """Read the entries file and append the Markdown summary."""
    with open(os.environ["ENTRIES_FILE"], encoding="utf-8") as f:
        entries = json.load(f)

    runid = os.environ["RUNID"]
    cluster = os.environ["CLUSTER_NAME"]
    project = os.environ["GCP_PROJECT"]
    start = os.environ["JOB_START_TIME"]
    summary_path = os.environ["GITHUB_STEP_SUMMARY"]

    if not entries:
        with open(summary_path, "a", encoding="utf-8") as f:
            f.write(f"No test results found for run `{runid}`\n")
        return

    # Aggregate by fixture in run order; mark Failed if any method failed.
    fixtures, order = aggregate_fixtures(entries)

    # Job duration (empty when the job timestamps are unavailable).
    duration = format_duration(
        os.environ.get("JOB_START", ""), os.environ.get("JOB_END", "")
    )

    passed = sum(1 for s in fixtures.values() if s == "Passed")
    total = len(fixtures)

    lines = ["## Test logs", "", "Filtered run logs by fixture", ""]
    for fixture in order:
        icon = PASSED_ICON if fixtures[fixture] == "Passed" else FAILED_ICON
        url = log_url(fixture, cluster, runid, start, project)
        lines.append(f"- {icon} [{fixture}]({url})")
    lines += ["", f"**{passed}/{total} tests passed{duration}**"]

    with open(summary_path, "a", encoding="utf-8") as f:
        f.write("\n".join(lines) + "\n")


if __name__ == "__main__":
    main()
    sys.exit(0)

View File

@ -29,7 +29,7 @@ jobs:
timeout-minutes: 90
steps:
- name: Checkout sources
uses: actions/checkout@v4
uses: actions/checkout@v6
with:
submodules: recursive
ref: ${{ github.event.pull_request.head.sha }}
@ -56,7 +56,7 @@ jobs:
run: make -j${ncpu} DEBUG=${{ runner.debug }} testIntegration
- name: Upload integration tests log files
uses: actions/upload-artifact@v4
uses: actions/upload-artifact@v7
if: (matrix.tests == 'integration' || matrix.tests == 'all') && always()
with:
name: ${{ matrix.os }}-${{ matrix.cpu }}-${{ matrix.nim_version }}-${{ matrix.job_number }}-integration-tests-logs

View File

@ -25,7 +25,7 @@ jobs:
cache_nonce: ${{ env.cache_nonce }}
steps:
- name: Checkout sources
uses: actions/checkout@v4
uses: actions/checkout@v6
- name: Compute matrix
id: matrix
run: |
@ -44,7 +44,7 @@ jobs:
runs-on: ubuntu-latest
if: github.event_name == 'pull_request'
steps:
- uses: actions/checkout@v4
- uses: actions/checkout@v6
- name: Check `nph` formatting
uses: arnetheduck/nph-action@v1
with:
@ -57,7 +57,7 @@ jobs:
runs-on: ubuntu-latest
steps:
- name: Checkout sources
uses: actions/checkout@v4
uses: actions/checkout@v6
with:
submodules: recursive
ref: ${{ github.event.pull_request.head.sha }}

View File

@ -128,26 +128,26 @@ jobs:
PLATFORM: ${{ format('{0}/{1}', 'linux', matrix.target.arch) }}
steps:
- name: Checkout
uses: actions/checkout@v4
uses: actions/checkout@v6
- name: Docker - Meta
id: meta
uses: docker/metadata-action@v5
uses: docker/metadata-action@v6
with:
images: ${{ env.DOCKER_REPO }}
- name: Docker - Set up Buildx
uses: docker/setup-buildx-action@v3
uses: docker/setup-buildx-action@v4
- name: Docker - Login to Docker Hub
uses: docker/login-action@v3
uses: docker/login-action@v4
with:
username: ${{ secrets.DOCKERHUB_USERNAME }}
password: ${{ secrets.DOCKERHUB_TOKEN }}
- name: Docker - Build and Push by digest
id: build
uses: docker/build-push-action@v5
uses: docker/build-push-action@v7
with:
context: .
file: ${{ env.DOCKER_FILE }}
@ -167,7 +167,7 @@ jobs:
touch "/tmp/digests/${digest#sha256:}"
- name: Docker - Upload digest
uses: actions/upload-artifact@v4
uses: actions/upload-artifact@v7
with:
name: digests-${{ needs.compute.outputs.build_id }}-${{ matrix.target.arch }}
path: /tmp/digests/*
@ -208,18 +208,18 @@ jobs:
fi
- name: Docker - Download digests
uses: actions/download-artifact@v4
uses: actions/download-artifact@v8
with:
pattern: digests-${{ needs.compute.outputs.build_id }}-*
merge-multiple: true
path: /tmp/digests
- name: Docker - Set up Buildx
uses: docker/setup-buildx-action@v3
uses: docker/setup-buildx-action@v4
- name: Docker - Meta
id: meta
uses: docker/metadata-action@v5
uses: docker/metadata-action@v6
with:
images: ${{ env.DOCKER_REPO }}
flavor: |
@ -232,7 +232,7 @@ jobs:
type=sha,enable=${{ env.TAG_SHA }}
- name: Docker - Login to Docker Hub
uses: docker/login-action@v3
uses: docker/login-action@v4
with:
username: ${{ secrets.DOCKERHUB_USERNAME }}
password: ${{ secrets.DOCKERHUB_TOKEN }}

View File

@ -26,13 +26,13 @@ jobs:
runs-on: ubuntu-latest
steps:
- name: Checkout
uses: actions/checkout@v4
uses: actions/checkout@v6
with:
fetch-depth: 0
- uses: actions/setup-node@v4
- uses: actions/setup-node@v6
with:
node-version: 18
node-version: 24
- name: Lint OpenAPI
run: npx @redocly/cli lint openapi.yaml
@ -43,13 +43,13 @@ jobs:
if: startsWith(github.ref, 'refs/tags/')
steps:
- name: Checkout
uses: actions/checkout@v4
uses: actions/checkout@v6
with:
fetch-depth: 0
- uses: actions/setup-node@v4
- uses: actions/setup-node@v6
with:
node-version: 18
node-version: 24
- name: Build OpenAPI
run: npx @redocly/cli build-docs openapi.yaml --output openapi/index.html --title "Logos Storage API"
@ -63,4 +63,4 @@ jobs:
path: openapi
- name: Deploy to GitHub Pages
uses: actions/deploy-pages@v4
uses: actions/deploy-pages@v5

View File

@ -16,7 +16,7 @@ jobs:
cache_nonce: ${{ env.cache_nonce }}
steps:
- name: Checkout sources
uses: actions/checkout@v4
uses: actions/checkout@v6
- name: Compute matrix
id: matrix
run: |

View File

@ -7,6 +7,12 @@ on:
branches:
- master
workflow_dispatch:
inputs:
branch:
description: 'dist-tests branch to run tests from'
required: false
default: 'master'
type: string
env:
cache_nonce: 0 # Allows for easily busting actions/cache caches
@ -61,7 +67,7 @@ jobs:
echo "TAGGED_RELEASE=false" >> $GITHUB_ENV
fi
- name: Checkout sources
uses: actions/checkout@v4
uses: actions/checkout@v6
with:
submodules: recursive
@ -110,7 +116,7 @@ jobs:
7z a -tzip "${{ env.build_dir }}/${{env.storage_binary}}.zip" ./${{ env.build_dir }}/*
- name: Upload Logos Storage binary to workflow artifacts
uses: actions/upload-artifact@v4
uses: actions/upload-artifact@v7
with:
name: ${{ env.storage_binary }}.zip
path: ${{ env.build_dir }}/${{ env.storage_binary }}.zip
@ -127,7 +133,7 @@ jobs:
- name: Upload Windows dlls to workflow artifacts
if: matrix.os == 'windows'
uses: actions/upload-artifact@v4
uses: actions/upload-artifact@v7
with:
name: ${{ env.storage_binary }}-dlls.zip
path: ${{ env.build_dir }}/${{ env.storage_binary }}-dlls.zip
@ -173,17 +179,247 @@ jobs:
7z a -tzip "${{ env.build_dir }}/${{ env.c_bindings_lib }}.zip" ./library/${{ env.c_bindings_lib_base }}.h
- name: Upload ${{ env.c_bindings_lib_base }} to workflow artifacts
uses: actions/upload-artifact@v4
uses: actions/upload-artifact@v7
with:
name: ${{ env.c_bindings_lib }}.zip
path: ${{ env.build_dir }}/${{ env.c_bindings_lib }}.zip
if-no-files-found: error
# Builds the "dist-tests"-suffixed logosstorage/logos-storage-nim image that
# the Logos Storage nodes in the test cluster run. Only for tags and manual runs.
build-docker-dist-tests:
  name: Build Docker dist-tests image
  if: github.ref_type == 'tag' || github.event_name == 'workflow_dispatch'
  uses: ./.github/workflows/docker-reusable.yml
  secrets: inherit
  with:
    tag_suffix: dist-tests
    tag_latest: false
    tag_stable: false
    tag_sha: false
    nat_ip_auto: true
    nimflags: '-d:disableMarchNative -d:storage_enable_api_debug_peers=true -d:storage_enable_log_counter=true'
# Release tests: provision a GKE cluster with Terraform, run the release-tests
# Job inside it, then destroy the cluster again.
release-tests:
  name: Release Tests
  needs: build-docker-dist-tests
  if: github.ref_type == 'tag' || github.event_name == 'workflow_dispatch'
  runs-on: ubuntu-latest
  timeout-minutes: 90
  permissions:
    contents: read
    id-token: write
  env:
    # Terraform inputs (picked up as var.project / var.region / var.zone).
    TF_VAR_project: ${{ vars.RELEASE_TESTS_GCP_PROJECT }}
    TF_VAR_region: europe-west4
    TF_VAR_zone: europe-west4-a
    TF_PLUGIN_CACHE_DIR: /home/runner/.terraform.d/plugin-cache
    TF_DIR: .github/release/clusters/logos-storage-rel-tests-gcp-europe-west4
    # Must stay equal to "<module name>-gcp-<TF_VAR_region>", which is how the
    # gke module derives the cluster name, and to the last segment of the
    # Terraform backend prefix (used when releasing the state lock).
    CLUSTER_NAME: logos-storage-rel-tests-gcp-europe-west4
    # Values substituted into the runner Job manifest.
    STORAGEDOCKERIMAGE: ${{ github.ref_type == 'tag' && format('logosstorage/logos-storage-nim:{0}-dist-tests', github.ref_name) || 'logosstorage/logos-storage-nim:latest-dist-tests' }}
    TEST_TYPE: release-tests
    BRANCH: ${{ inputs.branch || 'master' }}
    SOURCE: https://github.com/logos-storage/logos-storage-nim-cs-dist-tests
  steps:
- name: Checkout
  uses: actions/checkout@v6

# The plugin cache location is defined once, in the job-level
# TF_PLUGIN_CACHE_DIR variable that Terraform itself reads; both steps below
# reuse it so the directory, the cache path and Terraform cannot drift apart.
- name: Create Terraform plugin cache dir
  run: mkdir -p "$TF_PLUGIN_CACHE_DIR"

- name: Cache Terraform plugins
  uses: actions/cache@v5
  with:
    path: ${{ env.TF_PLUGIN_CACHE_DIR }}
    key: terraform-google-${{ hashFiles(format('{0}/.terraform.lock.hcl', env.TF_DIR)) }}
    restore-keys: terraform-google-
# Keyless authentication: the provider and service account come from secrets.
- name: Authenticate to GCP
  uses: google-github-actions/auth@v3
  with:
    service_account: ${{ secrets.RELEASE_TESTS_GCP_SERVICE_ACCOUNT }}
    workload_identity_provider: ${{ secrets.RELEASE_TESTS_GCP_WORKLOAD_IDENTITY_PROVIDER }}

# gke-gcloud-auth-plugin is installed alongside gcloud for kubectl access.
- name: Setup gcloud
  uses: google-github-actions/setup-gcloud@v3
  with:
    install_components: gke-gcloud-auth-plugin

- name: Setup Terraform
  uses: hashicorp/setup-terraform@v4

- name: Setup kubectl
  uses: azure/setup-kubectl@v5
  with:
    version: v1.36.0
# The state bucket is not committed; it is injected into the backend here.
- name: Terraform init
  working-directory: ${{ env.TF_DIR }}
  run: terraform init -backend-config="bucket=${{ vars.RELEASE_TESTS_TF_STATE_BUCKET }}"

# The step id lets the cleanup steps check whether apply was ever attempted.
- name: Terraform apply
  id: tf-apply
  working-directory: ${{ env.TF_DIR }}
  run: terraform apply -auto-approve
- name: Get kubeconfig
  run: |
    gcloud container clusters get-credentials "$CLUSTER_NAME" \
      --zone ${{ env.TF_VAR_zone }} \
      --project ${{ vars.RELEASE_TESTS_GCP_PROJECT }}

# The runner Job is pinned to this pool, so wait (up to 5 minutes) until its
# node reports Ready before deploying anything.
- name: Wait for runners-ci node to be Ready
  run: |
    kubectl wait node \
      -l workload-type=tests-runners-ci \
      --for=condition=Ready \
      --timeout=300s
# Exports the per-run identifiers used by every later step:
#   RUNID          timestamp-based run ID
#   NAMEPREFIX     name of the Kubernetes Job for this run
#   TESTID         short SHA of the checked-out commit
#   JOB_START_TIME UTC timestamp used as startTime in Cloud Logging links
- name: Set run variables
  run: |
    RUNID=$(date +%Y%m%d-%H%M%S)
    {
      echo "RUNID=${RUNID}"
      echo "NAMEPREFIX=r-tests-${RUNID}"
      echo "TESTID=$(git rev-parse --short HEAD)"
      echo "JOB_START_TIME=$(date -u +"%Y-%m-%dT%H:%M:%SZ")"
    } >> $GITHUB_ENV
# Renders the Job template with the current environment and applies it, then
# prints the Job, pod list and Job events as a starting point for debugging.
- name: Deploy test job
  run: |
    envsubst < .github/release/job-release-tests.yaml | kubectl apply -f -

    echo "--- Job ---"
    kubectl get job "$NAMEPREFIX" -n default

    echo "--- Pods ---"
    kubectl get pods -n default

    echo "--- Job events ---"
    kubectl describe job "$NAMEPREFIX" -n default
# Builds a Cloud Logging console link filtered to this cluster and run ID,
# prints it to the step log and starts the step summary with it.
- name: Print storage node log link
  run: |
    QUERY=$(printf '%s\n%s\n%s' \
      'resource.type="k8s_container"' \
      "resource.labels.cluster_name=\"${CLUSTER_NAME}\"" \
      "labels.\"k8s-pod/runid\"=\"${RUNID}\"")
    ENCODED=$(python3 -c "import urllib.parse,sys; print(urllib.parse.quote(sys.stdin.read(), safe=''))" <<< "$QUERY")
    URL="https://console.cloud.google.com/logs/query;query=${ENCODED};startTime=${JOB_START_TIME}?project=${{ vars.RELEASE_TESTS_GCP_PROJECT }}"
    echo "Storage node logs: $URL"
    # Everything inside the group is appended to the step summary.
    {
      echo "## Summary"
      echo ""
      echo "Run ID: \`${RUNID}\`"
      echo ""
      echo "[Logs for entire run]($URL)"
      echo ""
      echo "> [!TIP]"
      echo "> To see the runner logs, add filter \`resource.labels.container_name=\"runner\"\` or use the filters on the left-side panel"
      echo ""
      LOG_RETENTION_DATE=$(date -u -d "${JOB_START_TIME} + 30 days" +"%Y-%m-%dT%H:%M:%SZ")
      echo "> [!IMPORTANT]"
      echo "> Logs are retained until ${LOG_RETENTION_DATE} UTC (30 days)"
    } >> "$GITHUB_STEP_SUMMARY"
# Polls the runner pod (up to 5 minutes) until it has started.
#
# A pod that already reached a terminal phase (Succeeded/Failed) has started
# too, so those phases end the wait as well; the later steps then collect its
# logs and report the Job result instead of this step timing out. A transient
# kubectl error yields an empty phase and is simply retried on the next poll.
- name: Wait for runner pod to start
  run: |
    echo "Waiting for runner pod to reach Running state..."
    deadline=$((SECONDS + 300))
    last_describe=0
    phase=""
    started=false
    while [[ $SECONDS -lt $deadline ]]; do
      phase=$(kubectl get pods \
        -l "job-name=$NAMEPREFIX" \
        -n default \
        -o jsonpath='{range .items[*]}{.status.phase}{end}' 2>/dev/null || true)
      echo "Pod phase: ${phase:-not yet created}"
      case "$phase" in
        Running|Succeeded|Failed)
          started=true
          break
          ;;
      esac
      # Dump the Job description once a minute to help diagnose a stuck pod.
      if [[ $((SECONDS - last_describe)) -ge 60 ]]; then
        echo "--- kubectl describe job $NAMEPREFIX ---"
        kubectl describe job "$NAMEPREFIX" -n default || true
        last_describe=$SECONDS
      fi
      sleep 10
    done
    if [[ "$started" != "true" ]]; then
      echo "Timed out waiting for pod to reach Running state"
      exit 1
    fi
- name: Run tests and stream logs
  timeout-minutes: 60
  run: |
    POD=$(kubectl get pods -n default -l "job-name=$NAMEPREFIX" \
      -o jsonpath='{.items[0].metadata.name}')
    echo "Streaming logs for pod: $POD"
    # Follow by pod name rather than by label selector so the stream survives
    # long silences between test completions. The trailing "|| true" keeps
    # the step green if the API server closes the connection before the pod
    # exits; the Job result is checked in a later step.
    kubectl logs "$POD" -n default --follow || true
# Collects the per-test results and renders them into the step summary.
- name: Generate test summary
  env:
    GCP_PROJECT: ${{ vars.RELEASE_TESTS_GCP_PROJECT }}
  run: |
    # Job timestamps for the duration line; left empty if unavailable.
    JOB_START=$(kubectl get job "$NAMEPREFIX" -n default \
      -o jsonpath='{.status.startTime}' 2>/dev/null || true)
    JOB_END=$(kubectl get job "$NAMEPREFIX" -n default \
      -o jsonpath='{.status.completionTime}' 2>/dev/null || true)
    ENTRIES_FILE=$(mktemp)
    export JOB_START JOB_END ENTRIES_FILE
    # Test results are read from the ConfigMaps labelled with this run ID
    # (written by TearDownDistTest); each result is wrapped as
    # {"jsonPayload": ...}, the shape the summary script expects.
    # Falls back to an empty list when jq fails.
    kubectl get configmaps -n default -l "runid=${RUNID},app=test-result" \
      -o jsonpath='{range .items[*]}{.data.result}{"\n"}{end}' 2>/dev/null \
      | jq -s '[.[] | {jsonPayload: .}]' > "$ENTRIES_FILE" || echo "[]" > "$ENTRIES_FILE"
    python3 .github/scripts/generate_test_summary.py
    rm -f "$ENTRIES_FILE"
# Fails the workflow unless the test Job finished successfully.
#
# kubectl logs may have exited early (API server closed the stream), so poll
# for up to 5 minutes until the Job reports a terminal condition. Conditions
# are matched by type ("Complete" / "Failed") instead of reading
# conditions[0], so the result does not depend on the order in which the
# cluster's Kubernetes version appends conditions, and a failed Job is
# reported immediately rather than after the full timeout.
- name: Check job status
  run: |
    deadline=$((SECONDS + 300))
    job_status=""
    while [[ $SECONDS -lt $deadline ]]; do
      complete=$(kubectl get job "$NAMEPREFIX" -n default \
        -o jsonpath='{.status.conditions[?(@.type=="Complete")].status}' || true)
      failed=$(kubectl get job "$NAMEPREFIX" -n default \
        -o jsonpath='{.status.conditions[?(@.type=="Failed")].status}' || true)
      if [[ "$complete" == "True" ]]; then
        job_status="Complete"
        break
      fi
      if [[ "$failed" == "True" ]]; then
        job_status="Failed"
        break
      fi
      sleep 10
    done
    echo "Job status: ${job_status:-Unknown (no terminal condition within 300s)}"
    [[ "$job_status" == "Complete" ]] || exit 1
# Always tear the cluster down, even after a failure, unless the apply step
# was skipped (in which case nothing was created by this run).
- name: Terraform destroy
  if: always() && steps.tf-apply.conclusion != 'skipped'
  working-directory: ${{ env.TF_DIR }}
  run: terraform destroy -auto-approve
- name: Delete orphaned GCE disks
  if: always() && steps.tf-apply.conclusion != 'skipped'
  env:
    GCP_PROJECT: ${{ vars.RELEASE_TESTS_GCP_PROJECT }}
  run: |
    # Safety net: remove any pvc-* disks the CSI driver did not release
    # before the cluster was destroyed. This runs after terraform destroy so
    # the disks are guaranteed detached (GCE rejects deletes on attached
    # disks). The releaseTestsDiskCleaner IAM role is granted out-of-band
    # (not via Terraform) so it persists across cluster lifecycles - see
    # CLAUDE.md for details.
    # NOTE(review): the filter matches every pvc-* disk in the project, not
    # only disks from this run - confirm the project is dedicated to these tests.
    while IFS=$'\t' read -r name zone; do
      # Skip blank or incomplete rows.
      [[ -n "$name" && -n "$zone" ]] || continue
      # Best effort: one failed delete must not stop the remaining ones.
      gcloud compute disks delete "$name" --zone="$zone" \
        --project="$GCP_PROJECT" --quiet || true
    done < <(gcloud compute disks list \
      --project="$GCP_PROJECT" \
      --filter="name~^pvc-" \
      --format="value(name,zone.basename())")
# Best-effort removal of the state lock object so an aborted run cannot block
# the next one; errors (including "lock does not exist") are ignored.
# NOTE(review): this runs unconditionally and would also remove a lock held by
# a concurrently running workflow - confirm runs are serialised.
- name: Release Terraform state lock
  if: always()
  run: |
    gcloud storage rm \
      "gs://${{ vars.RELEASE_TESTS_TF_STATE_BUCKET }}/clusters/${CLUSTER_NAME}/default.tflock" \
      2>/dev/null || true
# Release
release:
runs-on: ubuntu-latest
needs: build
if: success() || failure()
needs: [build, release-tests]
if: needs.build.result == 'success' && needs.release-tests.result == 'success'
steps:
- name: Set conditional env variables
shell: bash
@ -196,21 +432,21 @@ jobs:
echo "TAGGED_RELEASE=false" >> $GITHUB_ENV
fi
- name: Download binaries from workflow artifacts into temp folder
uses: actions/download-artifact@v4
uses: actions/download-artifact@v8
with:
pattern: ${{ env.storage_binary_base }}*
merge-multiple: true
path: /tmp/release
- name: Download ${{ env.c_bindings_lib_base }} from workflow artifacts into temp folder
uses: actions/download-artifact@v5
uses: actions/download-artifact@v8
with:
pattern: ${{ env.c_bindings_lib_base }}*
merge-multiple: true
path: /tmp/release
- name: Create GH release
uses: softprops/action-gh-release@v2
uses: softprops/action-gh-release@v3
if: env.TAGGED_RELEASE == 'true'
with:
files: |
@ -218,7 +454,7 @@ jobs:
make_latest: true
- name: Generate Python SDK
uses: peter-evans/repository-dispatch@v3
uses: peter-evans/repository-dispatch@v4
if: env.TAGGED_RELEASE == 'true'
with:
token: ${{ secrets.DISPATCH_PAT }}