feat(testing): Release tests -- force pod spread to one pod per node (#1445)

This commit is contained in:
Eric 2026-06-15 16:17:51 +10:00 committed by GitHub
parent d65f32f819
commit cb928aacdd
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
6 changed files with 158 additions and 2 deletions

View File

@ -0,0 +1,55 @@
# VPC Architecture
## Purpose
The VPC was originally created so that the number of pods could grow beyond 8. An anti-affinity constraint enforces one pod per node, so the number of pods equals the number of nodes, and without a VPC each node needs its own external IP — for which Google's default quota is 8. With the VPC, nodes no longer need their own external IPs, so the node count can scale to the limits of the VPC rather than to the limits of the external IP quota. The VPC still allows the nodes to communicate with the wider internet, outbound only, for functions like pulling Docker images and dependency management.
## Architecture design
```ascii
Internet
│ (public endpoint, no
│ master_authorized_networks)
┌──────────────────────────┐
│ GKE Control Plane │
│ (Google-managed, peered)│
│ 172.16.0.0/28 │
└────────────┬─────────────┘
│ private peering
┌───────────────────────────────────── │ ──────────────────────────────────┐
│ VPC: logos-storage-rel-tests-vpc │ │
│ (custom, auto_create_subnetworks=false) │
│ ▼ │
│ ┌───────────────────────────────────────────────────────────────────┐ │
│ │ Subnet: logos-storage-rel-tests-subnet (europe-west4) │ │
│ │ primary range: 10.10.0.0/20 ← node internal IPs │ │
│ │ secondary "pods": 10.20.0.0/14 ← pod IPs (VPC-native) │ │
│ │ secondary "services": 10.30.0.0/20 ← ClusterIP services │ │
│ │ │ │
│ │ ┌─────────────┐ ┌─────────────┐ ┌─────────────┐ │ │
│ │ │ GKE node 1 │ │ GKE node 2 │ ... │ GKE node N │ │ │
│ │ │ 10.10.0.x │ │ 10.10.0.x │ │ 10.10.0.x │ │ │
│ │ │ no ext IP │ │ no ext IP │ │ no ext IP │ │ │
│ │ │ pods:10.20.x│ │ pods:10.20.x│ │ pods:10.20.x│ │ │
│ │ └─────┬───────┘ └─────┬───────┘ └─────┬───────┘ │ │
│ └─────────┼────────────────┼─────────────────────┼──────────────────┘ │
│ └────────────────┴───────────┬─────────┘ │
│ node-to-node / pod-to-pod traffic, all internal │
│ │ │
│ ▼ │
│ ┌──────────────────────────────┐ │
│ │ Cloud Router + Cloud NAT │ │
│ │ (logos-storage-rel-tests-*) │ │
│ └──────────────┬───────────────┘ │
└─────────────────────────────────────────│────────────────────────────────┘
Internet
(image pulls, package mirrors,
outbound only — no inbound)
```
- Custom VPC + subnet replace the project's default network, giving us a dedicated address space with the secondary ranges GKE's VPC-native (alias-IP) mode requires.
- Three non-overlapping ranges on one subnet: node IPs (/20), pod IPs (/14), service IPs (/20) — ip_allocation_policy points the cluster at the pods/services secondary ranges.
- Nodes have no external IPs (enable_private_nodes = true) — node-to-node and pod-to-pod traffic stays entirely inside the VPC, satisfying the test framework's "real network" requirement without touching the constrained IN_USE_ADDRESSES quota.
- Cloud Router + Cloud NAT give the otherwise IP-less nodes outbound-only internet access (pulling container images, etc.), with no inbound exposure.
- Control plane keeps its public endpoint (enable_private_endpoint = false) — only the nodes are private, so the GitHub-hosted CI runner can still kubectl/terraform apply against the cluster's API server over the internet.

View File

@ -0,0 +1,3 @@
# Base name shared by the resources in this configuration; other resources
# derive their own names from it (e.g. "${local.name}-vpc").
locals {
  name = "logos-storage-rel-tests"
}

View File

@ -2,11 +2,19 @@
module "gke" {
source = "../modules/gke"
name = "logos-storage-rel-tests"
name = local.name
project = var.project
region = var.region
zone = var.zone
network = google_compute_network.this.id
subnetwork = google_compute_subnetwork.this.id
pods_range_name = "pods"
services_range_name = "services"
master_ipv4_cidr_block = "172.16.0.0/28"
node_pool_name = "runners-ci-e2-standard-2"
node_pool_machine_type = "e2-standard-2"
node_pool_count = 1
@ -18,7 +26,7 @@ module "gke" {
tests_pool_name = "tests-e2-medium"
tests_pool_machine_type = "e2-medium"
tests_pool_count = 5
tests_pool_count = 11
tests_pool_labels = {
default-pool = "false"
scaling-type = "fixed"

View File

@ -0,0 +1,45 @@
# Custom-mode VPC for the cluster. Private GKE nodes (enable_private_nodes on
# the cluster) need VPC-native pod/service IPs; on the default network there
# would be no secondary ranges available for them.
resource "google_compute_network" "this" {
  project = var.project
  name    = "${local.name}-vpc"

  # The subnet is declared explicitly (google_compute_subnetwork.this)
  # instead of being auto-created.
  auto_create_subnetworks = false
}
# Single subnet inside the custom VPC. It carries a primary range plus two
# named secondary ranges ("pods" and "services"); the three CIDRs do not
# overlap.
resource "google_compute_subnetwork" "this" {
  project = var.project
  region  = var.region
  network = google_compute_network.this.id
  name    = "${local.name}-subnet"

  # Primary range of the subnet.
  ip_cidr_range = "10.10.0.0/20"

  # Secondary range named "pods".
  secondary_ip_range {
    ip_cidr_range = "10.20.0.0/14"
    range_name    = "pods"
  }

  # Secondary range named "services".
  secondary_ip_range {
    ip_cidr_range = "10.30.0.0/20"
    range_name    = "services"
  }
}
# Cloud Router for the custom VPC. Together with Cloud NAT it provides the
# private nodes, which have no external IPs, with outbound internet access
# (pulling container images, apt packages, etc.).
resource "google_compute_router" "this" {
  project = var.project
  region  = var.region
  network = google_compute_network.this.id
  name    = "${local.name}-router"
}
# Cloud NAT configuration attached to google_compute_router.this.
resource "google_compute_router_nat" "this" {
  project = var.project
  region  = var.region
  router  = google_compute_router.this.name
  name    = "${local.name}-nat"

  # NAT external addresses are allocated automatically rather than listed here.
  nat_ip_allocate_option = "AUTO_ONLY"

  # Apply NAT to every subnetwork and every IP range (primary and secondary).
  source_subnetwork_ip_ranges_to_nat = "ALL_SUBNETWORKS_ALL_IP_RANGES"
}

View File

@ -8,6 +8,25 @@ resource "google_container_cluster" "this" {
deletion_protection = false
network = var.network
subnetwork = var.subnetwork
# VPC-native cluster, required for private nodes.
ip_allocation_policy {
cluster_secondary_range_name = var.pods_range_name
services_secondary_range_name = var.services_range_name
}
# Nodes get only internal IPs, avoiding the per-region IN_USE_ADDRESSES
# quota. The control plane keeps its public endpoint (no
# master_authorized_networks_config) so the GitHub-hosted CI runner can
# still reach it.
private_cluster_config {
enable_private_nodes = true
enable_private_endpoint = false
master_ipv4_cidr_block = var.master_ipv4_cidr_block
}
# Send pod stdout/stderr to Cloud Logging automatically
logging_service = "logging.googleapis.com/kubernetes"
monitoring_service = "monitoring.googleapis.com/kubernetes"

View File

@ -19,6 +19,32 @@ variable "zone" {
description = "The GCP zone for the cluster. Using a single zone avoids the longer provisioning time of a regional (multi-zone) cluster."
}
# Networking (private nodes)
# VPC network for the cluster; forwarded to the cluster's `network` argument.
variable "network" {
  description = "Self link or ID of the VPC network the cluster's nodes run in."
  type        = string
}
# Subnetwork for the cluster; forwarded to the cluster's `subnetwork` argument.
variable "subnetwork" {
  description = "Self link or ID of the subnetwork the cluster's nodes run in."
  type        = string
}
# Used as cluster_secondary_range_name in the cluster's ip_allocation_policy.
variable "pods_range_name" {
  description = "Name of the subnetwork secondary IP range to use for Pod IPs."
  type        = string
}
# Used as services_secondary_range_name in the cluster's ip_allocation_policy.
variable "services_range_name" {
  description = "Name of the subnetwork secondary IP range to use for Service IPs."
  type        = string
}
# Address range for the control plane's private endpoint; passed through to
# private_cluster_config.master_ipv4_cidr_block on the cluster.
variable "master_ipv4_cidr_block" {
  description = "/28 CIDR range for the GKE control plane's private endpoint."
  type        = string

  # Reject malformed or wrongly sized ranges at plan time instead of letting
  # the GKE API fail the apply. cidrnetmask() only succeeds for valid IPv4
  # CIDR notation; the regex pins the prefix length to /28.
  validation {
    condition = (
      can(cidrnetmask(var.master_ipv4_cidr_block)) &&
      can(regex("/28$", var.master_ipv4_cidr_block))
    )
    error_message = "The master_ipv4_cidr_block value must be a valid IPv4 CIDR block with a /28 prefix, e.g. 172.16.0.0/28."
  }
}
# Kubernetes default Node Pool
variable "node_pool_name" {
type = string