From a3aa416de5a589941b2e7e934968725c188edbf6 Mon Sep 17 00:00:00 2001 From: andrussal Date: Tue, 16 Dec 2025 11:36:43 +0100 Subject: [PATCH] k8s: support in-image KZG and dynamic grafana NodePort --- .../runners/k8s/helm/nomos-runner/values.yaml | 2 +- .../runners/k8s/src/deployer/orchestrator.rs | 60 ++++++++++++++++--- .../runners/k8s/src/infrastructure/assets.rs | 29 +++++++-- .../runners/k8s/src/infrastructure/cluster.rs | 8 +++ 4 files changed, 86 insertions(+), 13 deletions(-) diff --git a/testing-framework/runners/k8s/helm/nomos-runner/values.yaml b/testing-framework/runners/k8s/helm/nomos-runner/values.yaml index 8d25abf..af59078 100644 --- a/testing-framework/runners/k8s/helm/nomos-runner/values.yaml +++ b/testing-framework/runners/k8s/helm/nomos-runner/values.yaml @@ -49,4 +49,4 @@ grafana: adminPassword: admin service: type: NodePort - nodePort: 30030 + nodePort: null diff --git a/testing-framework/runners/k8s/src/deployer/orchestrator.rs b/testing-framework/runners/k8s/src/deployer/orchestrator.rs index a9eb990..888cb44 100644 --- a/testing-framework/runners/k8s/src/deployer/orchestrator.rs +++ b/testing-framework/runners/k8s/src/deployer/orchestrator.rs @@ -1,6 +1,7 @@ use anyhow::Error; use async_trait::async_trait; -use kube::Client; +use k8s_openapi::api::core::v1::Service; +use kube::{Client, api::Api}; use testing_framework_core::{ scenario::{BlockFeedTask, CleanupGuard, Deployer, MetricsError, RunContext, Runner, Scenario}, topology::generation::GeneratedTopology, @@ -150,15 +151,45 @@ impl Deployer for K8sDeployer { prometheus_url = %format!("http://{}:{}/", node_host, prometheus_port), "prometheus endpoint available on host" ); - info!( - grafana_url = %format!("http://{}:{}/", node_host, 30030), - "grafana dashboard available via NodePort" - ); + if let Some(grafana_port) = cluster_grafana_node_port( + &client, + cluster + .as_ref() + .expect("cluster must be available") + .namespace(), + cluster + .as_ref() + .expect("cluster must be available") + .release(), + ) + .await + { + info!( + grafana_url = %format!("http://{}:{}/", node_host, grafana_port), + "grafana dashboard available via NodePort" + ); + } if std::env::var("TESTNET_PRINT_ENDPOINTS").is_ok() { + let grafana_port = cluster_grafana_node_port( + &client, + cluster + .as_ref() + .expect("cluster must be available") + .namespace(), + cluster + .as_ref() + .expect("cluster must be available") + .release(), + ) + .await; println!( - "TESTNET_ENDPOINTS prometheus=http://{}:{}/ grafana=http://{}:{}/", - node_host, prometheus_port, node_host, 30030 + "TESTNET_ENDPOINTS prometheus=http://{}:{}/ grafana={}", + node_host, + prometheus_port, + grafana_port + .map(|port| format!("http://{}:{}/", node_host, port)) + .unwrap_or_else(|| "".to_string()) ); for (idx, client) in node_clients.validator_clients().iter().enumerate() { @@ -216,6 +247,21 @@ fn cluster_prometheus_port(cluster: &Option) -> u16 { .prometheus_port() } +async fn cluster_grafana_node_port(client: &Client, namespace: &str, release: &str) -> Option { + let services: Api = Api::namespaced(client.clone(), namespace); + let service_name = format!("{release}-grafana"); + let service = services.get(&service_name).await.ok()?; + let spec = service.spec?; + if spec.type_.as_deref() != Some("NodePort") { + return None; + } + let ports = spec.ports?; + ports.into_iter().find_map(|port| { + let node_port = port.node_port?; + u16::try_from(node_port).ok() + }) +} + async fn fail_cluster(cluster: &mut Option, reason: &str) { if let Some(env) = cluster.as_mut() { env.fail(reason).await; diff --git a/testing-framework/runners/k8s/src/infrastructure/assets.rs b/testing-framework/runners/k8s/src/infrastructure/assets.rs index 7809eeb..8ad573e 100644 --- a/testing-framework/runners/k8s/src/infrastructure/assets.rs +++ b/testing-framework/runners/k8s/src/infrastructure/assets.rs @@ -94,7 +94,8 @@ pub fn prepare_assets(topology: &GeneratedTopology) -> Result Result Some(validate_kzg_params(&root)?), KzgMode::InImage => None, @@ -145,13 +145,23 @@ pub fn prepare_assets(topology: &GeneratedTopology) -> Result Result { +fn render_cfgsync_config( + root: &Path, + topology: &GeneratedTopology, + kzg_mode: KzgMode, +) -> Result { let cfgsync_template_path = stack_assets_root(root).join("cfgsync.yaml"); debug!(path = %cfgsync_template_path.display(), "loading cfgsync template"); let mut cfg = load_cfgsync_template(&cfgsync_template_path) .map_err(|source| AssetsError::Cfgsync { source })?; - apply_topology_overrides(&mut cfg, topology, true); + apply_topology_overrides(&mut cfg, topology, kzg_mode == KzgMode::HostPath); + if kzg_mode == KzgMode::InImage { + cfg.global_params_path = env::var("NOMOS_KZGRS_PARAMS_PATH") + .ok() + .unwrap_or_else(|| DEFAULT_IN_IMAGE_KZG_PARAMS_PATH.to_string()); + } cfg.timeout = cfg.timeout.max(CFGSYNC_K8S_TIMEOUT_SECS); render_cfgsync_yaml(&cfg).map_err(|source| AssetsError::Cfgsync { source }) } @@ -321,6 +331,15 @@ fn build_values(topology: &GeneratedTopology) -> HelmValues { let pol_mode = pol_proof_mode(); let image_pull_policy = env::var("NOMOS_TESTNET_IMAGE_PULL_POLICY").unwrap_or_else(|_| "IfNotPresent".into()); + let grafana_node_port = match kzg_mode() { + KzgMode::HostPath => Some(DEFAULT_GRAFANA_NODE_PORT), + KzgMode::InImage => env::var("NOMOS_GRAFANA_NODE_PORT").ok().and_then(|value| { + value + .parse::() + .ok() + .filter(|port| *port >= 30000 && *port <= 32767) + }), + }; let grafana = GrafanaValues { enabled: true, image: "grafana/grafana:10.4.1".into(), @@ -329,7 +348,7 @@ fn build_values(topology: &GeneratedTopology) -> HelmValues { admin_password: "admin".into(), service: GrafanaServiceValues { type_field: "NodePort".into(), - node_port: Some(DEFAULT_GRAFANA_NODE_PORT), + node_port: grafana_node_port, }, }; debug!(pol_mode, "rendering Helm values for k8s stack"); diff --git a/testing-framework/runners/k8s/src/infrastructure/cluster.rs b/testing-framework/runners/k8s/src/infrastructure/cluster.rs index 4670e3d..6546e46 100644 --- a/testing-framework/runners/k8s/src/infrastructure/cluster.rs +++ b/testing-framework/runners/k8s/src/infrastructure/cluster.rs @@ -89,6 +89,14 @@ impl ClusterEnvironment { ) } + pub fn namespace(&self) -> &str { + &self.namespace + } + + pub fn release(&self) -> &str { + &self.release + } + pub fn prometheus_port(&self) -> u16 { self.prometheus_port }