diff --git a/.github/release/clusters/logos-storage-rel-tests-gcp-europe-west4/NETWORK.md b/.github/release/clusters/logos-storage-rel-tests-gcp-europe-west4/NETWORK.md new file mode 100644 index 00000000..aef59588 --- /dev/null +++ b/.github/release/clusters/logos-storage-rel-tests-gcp-europe-west4/NETWORK.md @@ -0,0 +1,55 @@ +# VPC Architecture + +## Purpose + +The VPC was originally created so that the number of pods could grow beyond 8. An anti-affinity constraint enforces one pod per node, so more pods require more nodes, and while every node needed its own external IP the node count was capped at 8, the default quota for external IPs given by Google. With the VPC, each node no longer needs its own external IP, so the number of nodes can be scaled to the limits of the VPC rather than to the limits of the external IP quota. Cloud Router + Cloud NAT, attached to the VPC, let the nodes communicate with the wider internet, outbound only, for functions like pulling Docker images and dependency management. + +## Architecture design +```ascii + Internet + │ + │ (public endpoint, no + │ master_authorized_networks) + ▼ + ┌──────────────────────────┐ + │ GKE Control Plane │ + │ (Google-managed, peered)│ + │ 172.16.0.0/28 │ + └────────────┬─────────────┘ + │ private peering +┌───────────────────────────────────── │ ──────────────────────────────────┐ +│ VPC: logos-storage-rel-tests-vpc │ │ +│ (custom, auto_create_subnetworks=false) │ +│ ▼ │ +│ ┌───────────────────────────────────────────────────────────────────┐ │ +│ │ Subnet: logos-storage-rel-tests-subnet (europe-west4) │ │ +│ │ primary range: 10.10.0.0/20 ← node internal IPs │ │ +│ │ secondary "pods": 10.20.0.0/14 ← pod IPs (VPC-native) │ │ +│ │ secondary "services": 10.30.0.0/20 ← ClusterIP services │ │ +│ │ │ │ +│ │ ┌─────────────┐ ┌─────────────┐ ┌─────────────┐ │ │ +│ │ │ GKE node 1 │ │ GKE node 2 │ ...
│ GKE node N │ │ │ +│ │ │ 10.10.0.x │ │ 10.10.0.x │ │ 10.10.0.x │ │ │ +│ │ │ no ext IP │ │ no ext IP │ │ no ext IP │ │ │ +│ │ │ pods:10.20.x│ │ pods:10.20.x│ │ pods:10.20.x│ │ │ +│ │ └─────┬───────┘ └─────┬───────┘ └─────┬───────┘ │ │ +│ └─────────┼────────────────┼─────────────────────┼──────────────────┘ │ +│ └────────────────┴───────────┬─────────┘ │ +│ node-to-node / pod-to-pod traffic, all internal │ +│ │ │ +│ ▼ │ +│ ┌──────────────────────────────┐ │ +│ │ Cloud Router + Cloud NAT │ │ +│ │ (logos-storage-rel-tests-*) │ │ +│ └──────────────┬───────────────┘ │ +└─────────────────────────────────────────│────────────────────────────────┘ + ▼ + Internet + (image pulls, package mirrors, + outbound only — no inbound) +``` +- Custom VPC + subnet replace the project's default network, giving us a dedicated address space with the secondary ranges GKE's VPC-native (alias-IP) mode requires. +- Three non-overlapping ranges on one subnet: node IPs (/20), pod IPs (/14), service IPs (/20) — ip_allocation_policy points the cluster at the pods/services secondary ranges. +- Nodes have no external IPs (enable_private_nodes = true) — node-to-node and pod-to-pod traffic stays entirely inside the VPC, satisfying the test framework's "real network" requirement without touching the constrained IN_USE_ADDRESSES quota. +- Cloud Router + Cloud NAT give the otherwise IP-less nodes outbound-only internet access (pulling container images, etc.), with no inbound exposure. +- Control plane keeps its public endpoint (enable_private_endpoint = false) — only the nodes are private, so the GitHub-hosted CI runner can still kubectl/terraform apply against the cluster's API server over the internet. 
\ No newline at end of file diff --git a/.github/release/clusters/logos-storage-rel-tests-gcp-europe-west4/locals.tf b/.github/release/clusters/logos-storage-rel-tests-gcp-europe-west4/locals.tf new file mode 100644 index 00000000..1da5c3a5 --- /dev/null +++ b/.github/release/clusters/logos-storage-rel-tests-gcp-europe-west4/locals.tf @@ -0,0 +1,3 @@ +locals { + name = "logos-storage-rel-tests" +} diff --git a/.github/release/clusters/logos-storage-rel-tests-gcp-europe-west4/main.tf b/.github/release/clusters/logos-storage-rel-tests-gcp-europe-west4/main.tf index 7f51416a..e4798599 100644 --- a/.github/release/clusters/logos-storage-rel-tests-gcp-europe-west4/main.tf +++ b/.github/release/clusters/logos-storage-rel-tests-gcp-europe-west4/main.tf @@ -2,11 +2,19 @@ module "gke" { source = "../modules/gke" - name = "logos-storage-rel-tests" + name = local.name project = var.project region = var.region zone = var.zone + network = google_compute_network.this.id + subnetwork = google_compute_subnetwork.this.id + + pods_range_name = "pods" # must match the secondary_ip_range range_name in network.tf + services_range_name = "services" # must match the secondary_ip_range range_name in network.tf + + master_ipv4_cidr_block = "172.16.0.0/28" # /28 reserved for the GKE control plane + node_pool_name = "runners-ci-e2-standard-2" node_pool_machine_type = "e2-standard-2" node_pool_count = 1 @@ -18,7 +26,7 @@ module "gke" { tests_pool_name = "tests-e2-medium" tests_pool_machine_type = "e2-medium" - tests_pool_count = 5 + tests_pool_count = 11 # above the default external-IP quota of 8 (see NETWORK.md); works because nodes have no external IPs tests_pool_labels = { default-pool = "false" scaling-type = "fixed" diff --git a/.github/release/clusters/logos-storage-rel-tests-gcp-europe-west4/network.tf b/.github/release/clusters/logos-storage-rel-tests-gcp-europe-west4/network.tf new file mode 100644 index 00000000..a0a2bbb7 --- /dev/null +++ b/.github/release/clusters/logos-storage-rel-tests-gcp-europe-west4/network.tf @@ -0,0 +1,45 @@ +# Custom VPC + subnet, required for private GKE nodes (enable_private_nodes +# in ../modules/gke/main.tf). Without this, nodes would use the default network with no +# secondary ranges available for VPC-native pod/service IPs.
+resource "google_compute_network" "this" { + name = "${local.name}-vpc" + project = var.project + auto_create_subnetworks = false # custom-mode VPC; the subnet is declared explicitly below +} + +resource "google_compute_subnetwork" "this" { + name = "${local.name}-subnet" + project = var.project + region = var.region + network = google_compute_network.this.id + ip_cidr_range = "10.10.0.0/20" # primary range: node internal IPs + + secondary_ip_range { + range_name = "pods" # must match pods_range_name passed to module "gke" in main.tf + ip_cidr_range = "10.20.0.0/14" # pod IPs (VPC-native) + } + + secondary_ip_range { + range_name = "services" # must match services_range_name passed to module "gke" in main.tf + ip_cidr_range = "10.30.0.0/20" # ClusterIP service IPs + } +} + +# Cloud Router + NAT: gives private nodes outbound internet access (pulling +# container images, apt packages, etc.) since they have no external IPs. +resource "google_compute_router" "this" { + name = "${local.name}-router" + project = var.project + region = var.region + network = google_compute_network.this.id +} + +resource "google_compute_router_nat" "this" { + name = "${local.name}-nat" + project = var.project + router = google_compute_router.this.name + region = var.region + + nat_ip_allocate_option = "AUTO_ONLY" + source_subnetwork_ip_ranges_to_nat = "ALL_SUBNETWORKS_ALL_IP_RANGES" +} diff --git a/.github/release/clusters/modules/gke/main.tf b/.github/release/clusters/modules/gke/main.tf index bc5344dc..42ee57fb 100644 --- a/.github/release/clusters/modules/gke/main.tf +++ b/.github/release/clusters/modules/gke/main.tf @@ -8,6 +8,25 @@ resource "google_container_cluster" "this" { deletion_protection = false + network = var.network + subnetwork = var.subnetwork + + # VPC-native cluster, required for private nodes. + ip_allocation_policy { + cluster_secondary_range_name = var.pods_range_name + services_secondary_range_name = var.services_range_name + } + + # Nodes get only internal IPs, avoiding the per-region IN_USE_ADDRESSES + # quota. The control plane keeps its public endpoint (no + # master_authorized_networks_config) so the GitHub-hosted CI runner can + # still reach it.
+ private_cluster_config { + enable_private_nodes = true # nodes get internal IPs only + enable_private_endpoint = false # control plane keeps its public endpoint + master_ipv4_cidr_block = var.master_ipv4_cidr_block + } + # Send pod stdout/stderr to Cloud Logging automatically logging_service = "logging.googleapis.com/kubernetes" monitoring_service = "monitoring.googleapis.com/kubernetes" diff --git a/.github/release/clusters/modules/gke/variables.tf b/.github/release/clusters/modules/gke/variables.tf index 54eb33b7..ad955f95 100644 --- a/.github/release/clusters/modules/gke/variables.tf +++ b/.github/release/clusters/modules/gke/variables.tf @@ -19,6 +19,32 @@ variable "zone" { description = "The GCP zone for the cluster. Using a single zone avoids the longer provisioning time of a regional (multi-zone) cluster." } +# Networking (private nodes). These five variables have no default, so every caller of this module must set them. +variable "network" { + type = string + description = "Self link or ID of the VPC network the cluster's nodes run in." +} + +variable "subnetwork" { + type = string + description = "Self link or ID of the subnetwork the cluster's nodes run in." +} + +variable "pods_range_name" { + type = string + description = "Name of the subnetwork secondary IP range to use for Pod IPs." +} + +variable "services_range_name" { + type = string + description = "Name of the subnetwork secondary IP range to use for Service IPs." +} + +variable "master_ipv4_cidr_block" { + type = string + description = "/28 CIDR range for the GKE control plane's private endpoint." +} + # Kubernetes default Node Pool variable "node_pool_name" { type = string