chore: reduce GKE release test cluster provisioning time and cost

- Configure runners-ci node pool inline in the cluster resource instead
  of using remove_default_node_pool=true, eliminating the
  provision-then-delete cycle that added ~5 min to terraform apply
- Remove the separate infra pool; runners-ci is now the only pool on
  the critical path of cluster creation
- Set tests-pods pool min_node_count=0 so no node is provisioned at
  apply time — nodes scale up only when test pods are scheduled
- Enable spot instances on the tests-pods pool for ~60-91% cost saving
- Add 60 min job timeout to release-tests to bound hung cluster cost
- Add Terraform plugin cache keyed on the lock file to skip provider
  re-downloads on subsequent runs (~30-60s saved)
- Install gke-gcloud-auth-plugin via setup-gcloud to fix kubectl auth

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
E M 2026-04-24 09:46:59 +10:00
parent 00a6264030
commit 8f13be1dc4
No known key found for this signature in database
3 changed files with 37 additions and 56 deletions

View File

@@ -1,4 +1,4 @@
# Kubernetes cluster
# Kubernetes cluster — the runners-ci pool is configured inline in the module
module "gke" {
source = "../modules/gke"
@@ -7,40 +7,15 @@ module "gke" {
region = var.region
zone = var.zone
kubernetes_release_channel = "STABLE"
node_pool_name = "infra-e2-standard-4"
node_pool_machine_type = "e2-standard-4"
node_pool_name = "runners-ci-e2-standard-2"
node_pool_machine_type = "e2-standard-2"
node_pool_min = 1
node_pool_max = 3
node_pool_max = 5
node_pool_labels = {
default-pool = "true"
scaling-type = "auto"
workload-type = "infra"
}
}
# Node pool - Runners CI
resource "google_container_node_pool" "runners-ci" {
name = "runners-ci-e2-standard-2"
cluster = module.gke.kubernetes_cluster_id
location = var.zone
project = var.project
autoscaling {
min_node_count = 1
max_node_count = 5
}
node_config {
machine_type = "e2-standard-2"
labels = {
allow-tests-pods = "false"
default-pool = "false"
scaling-type = "auto"
workload-type = "tests-runners-ci"
}
oauth_scopes = [
"https://www.googleapis.com/auth/cloud-platform",
]
allow-tests-pods = "false"
default-pool = "true"
scaling-type = "auto"
workload-type = "tests-runners-ci"
}
}
@@ -52,12 +27,13 @@ resource "google_container_node_pool" "tests-pods" {
project = var.project
autoscaling {
min_node_count = 1
min_node_count = 0
max_node_count = 10
}
node_config {
machine_type = "e2-medium"
spot = true
labels = {
allow-tests-pods = "true"
default-pool = "false"

View File

@@ -1,13 +1,10 @@
# Kubernetes cluster
# Kubernetes cluster — runners-ci pool is configured inline to avoid the
# remove_default_node_pool create-then-delete cycle that adds ~5 min.
resource "google_container_cluster" "this" {
name = local.name
location = var.zone
project = var.project
# Create an empty cluster — all node pools are managed as separate resources
remove_default_node_pool = true
initial_node_count = 1
deletion_protection = false
release_channel {
@@ -22,26 +19,23 @@ resource "google_container_cluster" "this" {
# Send pod stdout/stderr to Cloud Logging automatically
logging_service = "logging.googleapis.com/kubernetes"
monitoring_service = "monitoring.googleapis.com/kubernetes"
}
# Default (infra) node pool
resource "google_container_node_pool" "default" {
name = var.node_pool_name
cluster = google_container_cluster.this.id
location = var.zone
project = var.project
node_pool {
name = var.node_pool_name
initial_node_count = var.node_pool_min
autoscaling {
min_node_count = var.node_pool_min
max_node_count = var.node_pool_max
}
autoscaling {
min_node_count = var.node_pool_min
max_node_count = var.node_pool_max
}
node_config {
machine_type = var.node_pool_machine_type
labels = var.node_pool_labels
node_config {
machine_type = var.node_pool_machine_type
labels = var.node_pool_labels
oauth_scopes = [
"https://www.googleapis.com/auth/cloud-platform",
]
oauth_scopes = [
"https://www.googleapis.com/auth/cloud-platform",
]
}
}
}

View File

@@ -205,6 +205,7 @@ jobs:
runs-on: ubuntu-latest
if: github.ref_type == 'tag' || github.event_name == 'workflow_dispatch'
needs: build-docker-dist-tests
timeout-minutes: 60
permissions:
id-token: write
contents: read
@@ -212,6 +213,7 @@ jobs:
TF_VAR_project: ${{ secrets.RELEASE_TESTS_GCP_PROJECT }}
TF_VAR_region: europe-west4
TF_VAR_zone: europe-west4-b
TF_PLUGIN_CACHE_DIR: ~/.terraform.d/plugin-cache
STORAGEDOCKERIMAGE: ${{ needs.build-docker-dist-tests.outputs.logos_storage_image }}
TEST_TYPE: release-tests
BRANCH: ${{ inputs.branch || 'master' }}
@@ -221,6 +223,13 @@ jobs:
- name: Checkout
uses: actions/checkout@v4
- name: Cache Terraform plugins
uses: actions/cache@v4
with:
path: ~/.terraform.d/plugin-cache
key: terraform-google-${{ hashFiles(format('{0}/.terraform.lock.hcl', env.TF_DIR)) }}
restore-keys: terraform-google-
- name: Authenticate to GCP
uses: google-github-actions/auth@v2
with:
@@ -229,6 +238,8 @@ jobs:
- name: Setup gcloud
uses: google-github-actions/setup-gcloud@v2
with:
install_components: gke-gcloud-auth-plugin
- name: Setup Terraform
uses: hashicorp/setup-terraform@v3