diff --git a/Cargo.lock b/Cargo.lock index 3605aff..99a0dc8 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -7267,6 +7267,7 @@ dependencies = [ "key-management-system-service", "nomos-core", "nomos-ledger", + "nomos-tracing", "nomos-tracing-service", "reqwest", "serde", @@ -7290,6 +7291,7 @@ dependencies = [ "async-trait", "k8s-openapi", "kube", + "nomos-tracing", "nomos-tracing-service", "reqwest", "serde", diff --git a/book/src/architecture-overview.md b/book/src/architecture-overview.md index 7e445c7..1ab8e09 100644 --- a/book/src/architecture-overview.md +++ b/book/src/architecture-overview.md @@ -90,7 +90,7 @@ Three deployer implementations: | `K8sDeployer` | Kubernetes Helm | Cluster + image loaded | Not yet | **Compose-specific features:** -- Includes Prometheus at `http://localhost:9090` (override via `TEST_FRAMEWORK_PROMETHEUS_PORT`) +- Observability is external (set `NOMOS_METRICS_QUERY_URL` / `NOMOS_METRICS_OTLP_INGEST_URL` / `NOMOS_GRAFANA_URL` as needed) - Optional OTLP trace/metrics endpoints (`NOMOS_OTLP_ENDPOINT`, `NOMOS_OTLP_METRICS_ENDPOINT`) - Node control for chaos testing (restart validators/executors) @@ -112,9 +112,9 @@ KZG parameters required for DA workloads: ### Compose Stack Templates and configs in `testing-framework/runners/compose/assets/`: -- `docker-compose.yml.tera` — Stack template (validators, executors, Prometheus, Grafana) +- `docker-compose.yml.tera` — Stack template (validators, executors) - Cfgsync config: `testing-framework/assets/stack/cfgsync.yaml` -- Monitoring: `testing-framework/assets/stack/monitoring/prometheus.yml` +- Monitoring assets (not deployed by the framework): `testing-framework/assets/stack/monitoring/` ## Logging Architecture @@ -134,14 +134,14 @@ Templates and configs in `testing-framework/runners/compose/assets/`: ## Observability -**Prometheus (Compose + K8s):** -- Exposed at `http://localhost:9090` (configurable) -- Scrapes all validator and executor metrics -- Accessible in expectations: 
`ctx.telemetry().prometheus().map(|p| p.base_url())` +**Prometheus-compatible metrics querying (optional):** +- The framework does **not** deploy Prometheus/Grafana. +- Provide a Prometheus-compatible base URL (PromQL API) via `NOMOS_METRICS_QUERY_URL`. +- Accessible in expectations when configured: `ctx.telemetry().prometheus().map(|p| p.base_url())` -**Grafana dashboards (Compose + K8s):** -- Provisioned automatically; URL is printed in `TESTNET_ENDPOINTS` when using `scripts/run-examples.sh` -- Default credentials: `admin` / `admin` +**Grafana dashboards (optional):** +- Dashboards live in `testing-framework/assets/stack/monitoring/grafana/dashboards/` and can be imported into your Grafana. +- If you set `NOMOS_GRAFANA_URL`, the deployer prints it in `TESTNET_ENDPOINTS`. **Node APIs:** - HTTP endpoints per node for consensus info, network status, DA membership diff --git a/book/src/operations.md b/book/src/operations.md index 93e2fcc..a7eb5bc 100644 --- a/book/src/operations.md +++ b/book/src/operations.md @@ -181,7 +181,9 @@ cargo run -p runner-examples --bin compose_runner - `POL_PROOF_DEV_MODE=true` — **Required** for all runners - `NOMOS_DEMO_VALIDATORS=3` / `NOMOS_DEMO_EXECUTORS=2` / `NOMOS_DEMO_RUN_SECS=120` — Topology overrides - `COMPOSE_NODE_PAIRS=1x1` — Alternative topology format: "validators×executors" -- `TEST_FRAMEWORK_PROMETHEUS_PORT=9091` — Override Prometheus port (default: 9090) +- `NOMOS_METRICS_QUERY_URL` — Prometheus-compatible base URL for the runner process to query (optional) +- `NOMOS_METRICS_OTLP_INGEST_URL` — Full OTLP HTTP ingest URL for node metrics export (optional) +- `NOMOS_GRAFANA_URL` — Grafana base URL for printing/logging (optional) - `COMPOSE_RUNNER_HOST=127.0.0.1` — Host address for port mappings - `COMPOSE_RUNNER_PRESERVE=1` — Keep containers running after test - `NOMOS_LOG_LEVEL=debug` / `NOMOS_LOG_FILTER=...` — Control node log verbosity (stdout/stderr) @@ -189,7 +191,7 @@ cargo run -p runner-examples --bin compose_runner 
**Compose-specific features:** - **Node control support**: Only runner that supports chaos testing (`.enable_node_control()` + chaos workloads) -- **Prometheus observability**: Metrics at `http://localhost:9090` +- **Observability is external**: Set `NOMOS_METRICS_*` / `NOMOS_GRAFANA_URL` to enable telemetry links and querying **Important:** - Containers expect KZG parameters at `/kzgrs_test_params/kzgrs_test_params` (note the repeated filename) @@ -229,21 +231,30 @@ cargo run -p runner-examples --bin k8s_runner - `NOMOS_TESTNET_IMAGE` — Image tag (required) - `POL_PROOF_DEV_MODE=true` — **Required** for all runners - `NOMOS_DEMO_VALIDATORS` / `NOMOS_DEMO_EXECUTORS` / `NOMOS_DEMO_RUN_SECS` — Topology overrides -- `K8S_RUNNER_EXTERNAL_PROMETHEUS_URL` (or `NOMOS_EXTERNAL_PROMETHEUS_URL`) — Reuse an existing Prometheus and skip deploying the in-chart Prometheus; also points node OTLP metrics export and the in-cluster Grafana datasource at that Prometheus +- `NOMOS_METRICS_QUERY_URL` — Prometheus-compatible base URL for the runner process to query (PromQL) +- `NOMOS_METRICS_OTLP_INGEST_URL` — Full OTLP HTTP ingest URL for node metrics export (optional) +- `NOMOS_GRAFANA_URL` — Grafana base URL for printing/logging (optional) -**External Prometheus (optional):** +**Metrics + Grafana (optional):** ```bash -export K8S_RUNNER_EXTERNAL_PROMETHEUS_URL=http://your-prometheus:9090 +export NOMOS_METRICS_QUERY_URL=http://your-prometheus:9090 +# Prometheus OTLP receiver example: +export NOMOS_METRICS_OTLP_INGEST_URL=http://your-prometheus:9090/api/v1/otlp/v1/metrics +# Optional: print a Grafana link in TESTNET_ENDPOINTS +export NOMOS_GRAFANA_URL=http://your-grafana:3000 cargo run -p runner-examples --bin k8s_runner ``` Notes: -- The runner config expects Prometheus to accept OTLP metrics at `/api/v1/otlp/v1/metrics` (the in-chart Prometheus is started with `--web.enable-otlp-receiver` and `--enable-feature=otlp-write-receiver`). 
-- Use a URL reachable from inside the cluster (for example a `Service` DNS name like `http://prometheus.monitoring:9090`). +- `NOMOS_METRICS_QUERY_URL` must be reachable from the runner process (often via `kubectl port-forward`). +- `NOMOS_METRICS_OTLP_INGEST_URL` must be reachable from nodes (pods/containers) and is backend-specific (Prometheus vs VictoriaMetrics paths differ). **Via `scripts/run-examples.sh` (optional):** ```bash -scripts/run-examples.sh -t 60 -v 1 -e 1 k8s --external-prometheus http://your-prometheus:9090 +scripts/run-examples.sh -t 60 -v 1 -e 1 k8s \ + --metrics-query-url http://your-prometheus:9090 \ + --metrics-otlp-ingest-url http://your-prometheus:9090/api/v1/otlp/v1/metrics \ + --grafana-url http://your-grafana:3000 ``` **In code (optional):** @@ -252,7 +263,8 @@ use testing_framework_core::scenario::ScenarioBuilder; use testing_framework_workflows::ObservabilityBuilderExt as _; let plan = ScenarioBuilder::with_node_counts(1, 1) - .with_external_prometheus_str("http://your-prometheus:9090") + .with_metrics_query_url_str("http://your-prometheus:9090") + .with_metrics_otlp_ingest_url_str("http://your-prometheus:9090/api/v1/otlp/v1/metrics") .build(); ``` @@ -484,7 +496,6 @@ cargo run -p runner-examples --bin compose_runner - `COMPOSE_RUNNER_HOST=127.0.0.1` — host used for readiness probes (override for remote Docker daemons / VM networking) - `COMPOSE_RUNNER_HOST_GATEWAY=host.docker.internal:host-gateway` — controls the `extra_hosts` entry injected into compose (set to `disable` to omit) - `TESTNET_RUNNER_PRESERVE=1` — alias for `COMPOSE_RUNNER_PRESERVE=1` -- `COMPOSE_GRAFANA_PORT=` — pin Grafana to a fixed host port instead of ephemeral assignment - `COMPOSE_RUNNER_HTTP_TIMEOUT_SECS=` — override compose node HTTP readiness timeout **Note:** Container names follow pattern `nomos-compose-{uuid}-validator-{index}-1` where `{uuid}` changes per run. 
@@ -553,15 +564,15 @@ cargo run -p runner-examples --bin local_runner Runners expose metrics and node HTTP endpoints for expectation code and debugging: -**Prometheus (Compose + K8s):** -- Default: `http://localhost:9090` -- Override: `TEST_FRAMEWORK_PROMETHEUS_PORT=9091` -- Note: the host port can vary if `9090` is unavailable; prefer the printed `TESTNET_ENDPOINTS` line as the source of truth. -- Access from expectations: `ctx.telemetry().prometheus().map(|p| p.base_url())` +**Prometheus-compatible metrics querying (optional):** +- The framework does **not** deploy Prometheus. +- Provide `NOMOS_METRICS_QUERY_URL` (PromQL base URL) to enable `ctx.telemetry()` queries. +- Access from expectations when configured: `ctx.telemetry().prometheus().map(|p| p.base_url())` -**Grafana dashboards (Compose + K8s):** -- The deployer prints the Grafana base URL in `TESTNET_ENDPOINTS`. -- Default credentials are `admin` / `admin`. +**Grafana (optional):** +- The framework does **not** deploy Grafana. +- If you set `NOMOS_GRAFANA_URL`, the deployer prints it in `TESTNET_ENDPOINTS`. +- Dashboards live in `testing-framework/assets/stack/monitoring/grafana/dashboards/` for import into your Grafana. 
**Node APIs:** - Access from expectations: `ctx.node_clients().validator_clients().get(0)` diff --git a/examples/src/bin/compose_runner.rs b/examples/src/bin/compose_runner.rs index e817e49..45f2411 100644 --- a/examples/src/bin/compose_runner.rs +++ b/examples/src/bin/compose_runner.rs @@ -109,7 +109,7 @@ async fn run_compose_case( }; if !runner.context().telemetry().is_configured() { - warn!("compose runner should expose prometheus metrics"); + warn!("metrics querying is disabled; set NOMOS_METRICS_QUERY_URL to enable PromQL queries"); } info!("running scenario"); diff --git a/examples/src/bin/k8s_runner.rs b/examples/src/bin/k8s_runner.rs index 150ff76..7c74e84 100644 --- a/examples/src/bin/k8s_runner.rs +++ b/examples/src/bin/k8s_runner.rs @@ -104,7 +104,7 @@ async fn run_k8s_case(validators: usize, executors: usize, run_duration: Duratio }; if !runner.context().telemetry().is_configured() { - warn!("k8s runner should expose prometheus metrics"); + warn!("metrics querying is disabled; set NOMOS_METRICS_QUERY_URL to enable PromQL queries"); } let validator_clients = runner.context().node_clients().validator_clients().to_vec(); diff --git a/scripts/run-examples.sh b/scripts/run-examples.sh index 5c749d0..5ad2b91 100755 --- a/scripts/run-examples.sh +++ b/scripts/run-examples.sh @@ -38,12 +38,13 @@ Options: -v, --validators N Number of validators (required) -e, --executors N Number of executors (required) --bundle PATH Convenience alias for setting NOMOS_BINARIES_TAR=PATH - --metrics-query-url URL (k8s) PromQL base URL the runner process can query (often localhost port-forward) - --metrics-query-grafana-url URL (k8s) PromQL base URL reachable from inside the cluster (Grafana datasource) - --metrics-otlp-ingest-url URL (k8s) Full OTLP HTTP ingest URL for node metrics export - --external-prometheus URL (k8s) Alias for --metrics-query-url - --external-prometheus-grafana-url URL (k8s) Alias for --metrics-query-grafana-url - --external-otlp-metrics-endpoint URL (k8s) 
Alias for --metrics-otlp-ingest-url + --metrics-query-url URL PromQL base URL the runner process can query (optional) + --metrics-query-grafana-url URL PromQL base URL for a Grafana datasource (optional) + --metrics-otlp-ingest-url URL Full OTLP HTTP ingest URL for node metrics export (optional) + --grafana-url URL Grafana base URL for printing/logging (optional) + --external-prometheus URL Alias for --metrics-query-url + --external-prometheus-grafana-url URL Alias for --metrics-query-grafana-url + --external-otlp-metrics-endpoint URL Alias for --metrics-otlp-ingest-url --local Use a local Docker image tag (default for docker-desktop k8s) --ecr Use an ECR image reference (default for non-docker-desktop k8s) --no-image-build Skip rebuilding the compose/k8s image (sets NOMOS_SKIP_IMAGE_BUILD=1) @@ -64,6 +65,8 @@ Environment: NOMOS_METRICS_QUERY_GRAFANA_URL Alias for K8S_RUNNER_METRICS_QUERY_GRAFANA_URL K8S_RUNNER_METRICS_OTLP_INGEST_URL Full OTLP HTTP ingest URL for node metrics export NOMOS_METRICS_OTLP_INGEST_URL Alias for K8S_RUNNER_METRICS_OTLP_INGEST_URL + K8S_RUNNER_GRAFANA_URL Grafana base URL for printing/logging (optional) + NOMOS_GRAFANA_URL Alias for K8S_RUNNER_GRAFANA_URL Deprecated env vars (still supported): K8S_RUNNER_EXTERNAL_PROMETHEUS_URL, NOMOS_EXTERNAL_PROMETHEUS_URL @@ -115,6 +118,7 @@ run_examples::parse_args() { METRICS_QUERY_URL="" METRICS_QUERY_GRAFANA_URL="" METRICS_OTLP_INGEST_URL="" + GRAFANA_URL="" RUN_SECS_RAW_SPECIFIED="" @@ -184,6 +188,14 @@ run_examples::parse_args() { METRICS_OTLP_INGEST_URL="${1#*=}" shift ;; + --grafana-url) + GRAFANA_URL="${2:-}" + shift 2 + ;; + --grafana-url=*) + GRAFANA_URL="${1#*=}" + shift + ;; --external-prometheus) METRICS_QUERY_URL="${2:-}" shift 2 @@ -262,18 +274,6 @@ run_examples::parse_args() { run_examples::fail_with_usage "executors must be a non-negative integer (pass -e/--executors)" fi - if [ -n "${METRICS_QUERY_URL}" ] && [ "${MODE}" != "k8s" ]; then - echo "Warning: --metrics-query-url is only 
used in k8s mode; ignoring." >&2 - METRICS_QUERY_URL="" - fi - if [ -n "${METRICS_QUERY_GRAFANA_URL}" ] && [ "${MODE}" != "k8s" ]; then - echo "Warning: --metrics-query-grafana-url is only used in k8s mode; ignoring." >&2 - METRICS_QUERY_GRAFANA_URL="" - fi - if [ -n "${METRICS_OTLP_INGEST_URL}" ] && [ "${MODE}" != "k8s" ]; then - echo "Warning: --metrics-otlp-ingest-url is only used in k8s mode; ignoring." >&2 - METRICS_OTLP_INGEST_URL="" - fi } run_examples::select_image() { @@ -582,17 +582,21 @@ run_examples::run() { export NOMOS_DEMO_VALIDATORS="${DEMO_VALIDATORS}" export NOMOS_DEMO_EXECUTORS="${DEMO_EXECUTORS}" - if [ "${MODE}" = "k8s" ] && [ -n "${METRICS_QUERY_URL}" ]; then - export K8S_RUNNER_METRICS_QUERY_URL="${METRICS_QUERY_URL}" + if [ -n "${METRICS_QUERY_URL}" ]; then export NOMOS_METRICS_QUERY_URL="${METRICS_QUERY_URL}" + export K8S_RUNNER_METRICS_QUERY_URL="${METRICS_QUERY_URL}" fi - if [ "${MODE}" = "k8s" ] && [ -n "${METRICS_QUERY_GRAFANA_URL}" ]; then - export K8S_RUNNER_METRICS_QUERY_GRAFANA_URL="${METRICS_QUERY_GRAFANA_URL}" + if [ -n "${METRICS_QUERY_GRAFANA_URL}" ]; then export NOMOS_METRICS_QUERY_GRAFANA_URL="${METRICS_QUERY_GRAFANA_URL}" + export K8S_RUNNER_METRICS_QUERY_GRAFANA_URL="${METRICS_QUERY_GRAFANA_URL}" fi - if [ "${MODE}" = "k8s" ] && [ -n "${METRICS_OTLP_INGEST_URL}" ]; then - export K8S_RUNNER_METRICS_OTLP_INGEST_URL="${METRICS_OTLP_INGEST_URL}" + if [ -n "${METRICS_OTLP_INGEST_URL}" ]; then export NOMOS_METRICS_OTLP_INGEST_URL="${METRICS_OTLP_INGEST_URL}" + export K8S_RUNNER_METRICS_OTLP_INGEST_URL="${METRICS_OTLP_INGEST_URL}" + fi + if [ -n "${GRAFANA_URL}" ]; then + export NOMOS_GRAFANA_URL="${GRAFANA_URL}" + export K8S_RUNNER_GRAFANA_URL="${GRAFANA_URL}" fi echo "==> Running ${BIN} for ${RUN_SECS}s (mode=${MODE}, image=${IMAGE})" diff --git a/testing-framework/assets/stack/cfgsync.yaml b/testing-framework/assets/stack/cfgsync.yaml index ab721bd..1b52e40 100644 --- a/testing-framework/assets/stack/cfgsync.yaml +++ 
b/testing-framework/assets/stack/cfgsync.yaml @@ -42,8 +42,6 @@ tracing_settings: filters: nomos: debug cryptarchia: debug - metrics: !Otlp - endpoint: http://prometheus:9090/api/v1/otlp/v1/metrics - host_identifier: node + metrics: None console: None level: INFO diff --git a/testing-framework/core/src/constants.rs b/testing-framework/core/src/constants.rs index fdbf3ba..fb1e2ec 100644 --- a/testing-framework/core/src/constants.rs +++ b/testing-framework/core/src/constants.rs @@ -21,18 +21,6 @@ pub const DEFAULT_NODE_HTTP_PROBE_TIMEOUT: Duration = Duration::from_secs(30); /// Default Kubernetes deployment readiness timeout. pub const DEFAULT_K8S_DEPLOYMENT_TIMEOUT: Duration = Duration::from_secs(180); -/// Default Prometheus HTTP port. -pub const DEFAULT_PROMETHEUS_HTTP_PORT: u16 = 9090; - -/// Default Prometheus HTTP timeout. -pub const DEFAULT_PROMETHEUS_HTTP_TIMEOUT: Duration = Duration::from_secs(240); - -/// Default Prometheus HTTP probe timeout for NodePort checks. -pub const DEFAULT_PROMETHEUS_HTTP_PROBE_TIMEOUT: Duration = Duration::from_secs(30); - -/// Default Prometheus service name. -pub const DEFAULT_PROMETHEUS_SERVICE_NAME: &str = "prometheus"; - /// Default API port used by nodes. pub const DEFAULT_API_PORT: u16 = 18080; diff --git a/testing-framework/core/src/scenario/capabilities.rs b/testing-framework/core/src/scenario/capabilities.rs index 9ecc867..c30c50a 100644 --- a/testing-framework/core/src/scenario/capabilities.rs +++ b/testing-framework/core/src/scenario/capabilities.rs @@ -8,9 +8,6 @@ use super::DynError; pub struct NodeControlCapability; /// Optional observability settings attached to a scenario. -/// -/// Runners may use this to decide whether to provision in-cluster Prometheus or -/// reuse an existing endpoint. 
#[derive(Clone, Debug, Default)] pub struct ObservabilityCapability { /// Prometheus-compatible base URL used by the *runner process* to query @@ -24,6 +21,8 @@ pub struct ObservabilityCapability { /// Full OTLP HTTP metrics ingest endpoint used by *nodes* to export metrics /// (backend-specific host and path). pub metrics_otlp_ingest_url: Option<Url>, + /// Optional Grafana base URL for printing/logging (human access). + pub grafana_url: Option<Url>, } /// Trait implemented by scenario capability markers to signal whether node diff --git a/testing-framework/core/src/scenario/mod.rs b/testing-framework/core/src/scenario/mod.rs index 412e38a..88b94e9 100644 --- a/testing-framework/core/src/scenario/mod.rs +++ b/testing-framework/core/src/scenario/mod.rs @@ -5,6 +5,7 @@ pub mod cfgsync; mod definition; mod expectation; pub mod http_probe; +mod observability; mod runtime; mod workload; @@ -15,6 +16,7 @@ pub use capabilities::{ }; pub use definition::{Builder, Scenario, ScenarioBuilder, TopologyConfigurator}; pub use expectation::Expectation; +pub use observability::{ObservabilityCapabilityProvider, ObservabilityInputs}; pub use runtime::{ BlockFeed, BlockFeedTask, BlockRecord, BlockStats, CleanupGuard, Deployer, NodeClients, RunContext, RunHandle, RunMetrics, Runner, ScenarioError, diff --git a/testing-framework/core/src/scenario/observability.rs b/testing-framework/core/src/scenario/observability.rs new file mode 100644 index 0000000..b9e4768 --- /dev/null +++ b/testing-framework/core/src/scenario/observability.rs @@ -0,0 +1,145 @@ +use std::env; + +use reqwest::Url; + +use super::{Metrics, MetricsError, NodeControlCapability, ObservabilityCapability}; + +/// Observability configuration inputs shared by deployers/runners. +/// +/// All fields are optional; missing values only matter when a caller needs the +/// corresponding capability (e.g. querying metrics from the runner process). 
+#[derive(Clone, Debug, Default)] +pub struct ObservabilityInputs { + /// Prometheus-compatible base URL used by the runner process to query + /// metrics (PromQL API endpoints). + pub metrics_query_url: Option<Url>, + /// Prometheus-compatible base URL intended for an in-cluster Grafana + /// datasource. + pub metrics_query_grafana_url: Option<Url>, + /// Full OTLP HTTP metrics ingest endpoint used by nodes to export metrics + /// (backend-specific host and path). + pub metrics_otlp_ingest_url: Option<Url>, + /// Optional Grafana base URL for printing/logging (human access). + pub grafana_url: Option<Url>, +} + +/// Capability helper for deployers that are generic over scenario capability +/// markers. +pub trait ObservabilityCapabilityProvider { + fn observability_capability(&self) -> Option<&ObservabilityCapability>; +} + +impl ObservabilityCapabilityProvider for () { + fn observability_capability(&self) -> Option<&ObservabilityCapability> { + None + } +} + +impl ObservabilityCapabilityProvider for NodeControlCapability { + fn observability_capability(&self) -> Option<&ObservabilityCapability> { + None + } +} + +impl ObservabilityCapabilityProvider for ObservabilityCapability { + fn observability_capability(&self) -> Option<&ObservabilityCapability> { + Some(self) + } +} + +impl ObservabilityInputs { + #[must_use] + pub fn from_capability(capabilities: &ObservabilityCapability) -> Self { + Self { + metrics_query_url: capabilities.metrics_query_url.clone(), + metrics_query_grafana_url: capabilities.metrics_query_grafana_url.clone(), + metrics_otlp_ingest_url: capabilities.metrics_otlp_ingest_url.clone(), + grafana_url: capabilities.grafana_url.clone(), + } + } + + /// Load observability inputs from environment variables. + /// + /// The `NOMOS_*` namespace applies to all deployers. Runner-specific env + /// vars are also accepted as aliases for backwards compatibility. 
+ pub fn from_env() -> Result<Self, MetricsError> { + Ok(Self { + metrics_query_url: read_url_var(&[ + "NOMOS_METRICS_QUERY_URL", + "K8S_RUNNER_METRICS_QUERY_URL", + // Back-compat: + "K8S_RUNNER_EXTERNAL_PROMETHEUS_URL", + "NOMOS_EXTERNAL_PROMETHEUS_URL", + ])?, + metrics_query_grafana_url: read_url_var(&[ + "NOMOS_METRICS_QUERY_GRAFANA_URL", + "K8S_RUNNER_METRICS_QUERY_GRAFANA_URL", + // Back-compat: + "K8S_RUNNER_EXTERNAL_PROMETHEUS_GRAFANA_URL", + "NOMOS_EXTERNAL_PROMETHEUS_GRAFANA_URL", + ])?, + metrics_otlp_ingest_url: read_url_var(&[ + "NOMOS_METRICS_OTLP_INGEST_URL", + "K8S_RUNNER_METRICS_OTLP_INGEST_URL", + // Back-compat: + "K8S_RUNNER_EXTERNAL_OTLP_METRICS_ENDPOINT", + "NOMOS_EXTERNAL_OTLP_METRICS_ENDPOINT", + ])?, + grafana_url: read_url_var(&["NOMOS_GRAFANA_URL", "K8S_RUNNER_GRAFANA_URL"])?, + }) + } + + /// Apply defaults and fallbacks (pure function). + /// + /// Currently, the only fallback is using `metrics_query_url` as the Grafana + /// datasource URL when `metrics_query_grafana_url` is unset. + #[must_use] + pub fn normalized(mut self) -> Self { + if self.metrics_query_grafana_url.is_none() { + self.metrics_query_grafana_url = self.metrics_query_url.clone(); + } + self + } + + /// Overlay non-empty values from `overrides` onto `self`. + #[must_use] + pub fn with_overrides(mut self, overrides: Self) -> Self { + if overrides.metrics_query_url.is_some() { + self.metrics_query_url = overrides.metrics_query_url; + } + if overrides.metrics_query_grafana_url.is_some() { + self.metrics_query_grafana_url = overrides.metrics_query_grafana_url; + } + if overrides.metrics_otlp_ingest_url.is_some() { + self.metrics_otlp_ingest_url = overrides.metrics_otlp_ingest_url; + } + if overrides.grafana_url.is_some() { + self.grafana_url = overrides.grafana_url; + } + self + } + + /// Build the telemetry handle exposed in `RunContext::telemetry()`. 
+ pub fn telemetry_handle(&self) -> Result<Metrics, MetricsError> { + match self.metrics_query_url.clone() { + Some(url) => Metrics::from_prometheus(url), + None => Ok(Metrics::empty()), + } + } +} + +fn read_url_var(keys: &[&'static str]) -> Result<Option<Url>, MetricsError> { + for key in keys { + let Some(raw) = env::var(key).ok() else { + continue; + }; + let raw = raw.trim(); + if raw.is_empty() { + continue; + } + return Url::parse(raw) + .map(Some) + .map_err(|err| MetricsError::new(format!("invalid {key}: {err}"))); + } + Ok(None) +} diff --git a/testing-framework/deployers/compose/Cargo.toml b/testing-framework/deployers/compose/Cargo.toml index a00a0c6..3be1030 100644 --- a/testing-framework/deployers/compose/Cargo.toml +++ b/testing-framework/deployers/compose/Cargo.toml @@ -16,6 +16,8 @@ workspace = true anyhow = "1" async-trait = { workspace = true } cfgsync = { workspace = true } +nomos-tracing = { workspace = true } +nomos-tracing-service = { workspace = true } reqwest = { workspace = true, features = ["json"] } serde = { workspace = true, features = ["derive"] } tempfile = { workspace = true } @@ -32,6 +34,5 @@ groth16 = { workspace = true } key-management-system-service = { workspace = true } nomos-core = { workspace = true } nomos-ledger = { workspace = true } -nomos-tracing-service = { workspace = true } tests = { workspace = true } zksign = { workspace = true } diff --git a/testing-framework/deployers/compose/assets/docker-compose.yml.tera b/testing-framework/deployers/compose/assets/docker-compose.yml.tera index ac36c5f..83098c5 100644 --- a/testing-framework/deployers/compose/assets/docker-compose.yml.tera +++ b/testing-framework/deployers/compose/assets/docker-compose.yml.tera @@ -1,37 +1,4 @@ services: - prometheus: - image: prom/prometheus:v3.0.1 -{% if prometheus.platform %} platform: {{ prometheus.platform }} -{% endif %} command: - - --config.file=/etc/prometheus/prometheus.yml - - --storage.tsdb.retention.time=7d - - --web.enable-otlp-receiver - - 
--enable-feature=otlp-write-receiver - volumes: - - ./stack/monitoring/prometheus.yml:/etc/prometheus/prometheus.yml:z - ports: - - {{ prometheus.host_port }} - restart: on-failure - - grafana: - image: grafana/grafana:10.4.1 - environment: - GF_PATHS_CONFIG: /etc/grafana/grafana.ini - GF_SECURITY_ADMIN_USER: admin - GF_SECURITY_ADMIN_PASSWORD: admin - ports: - - {{ grafana.host_port }} - volumes: - - ./stack/monitoring/grafana/datasources.yaml:/etc/grafana/provisioning/datasources/datasources.yaml:z - - ./stack/monitoring/grafana/dashboards.yml:/etc/grafana/provisioning/dashboards/dashboards.yaml:z - - ./stack/monitoring/grafana/dashboards:/var/lib/grafana/dashboards:ro - - ./stack/monitoring/grafana/grafana.ini:/etc/grafana/grafana.ini:ro - env_file: - - ./stack/monitoring/grafana/plugins.env - depends_on: - - prometheus - restart: on-failure - {% for node in validators %} {{ node.name }}: image: {{ node.image }} diff --git a/testing-framework/deployers/compose/src/deployer/mod.rs b/testing-framework/deployers/compose/src/deployer/mod.rs index 4060acf..6ebc0d0 100644 --- a/testing-framework/deployers/compose/src/deployer/mod.rs +++ b/testing-framework/deployers/compose/src/deployer/mod.rs @@ -6,7 +6,8 @@ pub mod setup; use async_trait::async_trait; use testing_framework_core::scenario::{ - BlockFeedTask, CleanupGuard, Deployer, RequiresNodeControl, Runner, Scenario, + BlockFeedTask, CleanupGuard, Deployer, ObservabilityCapabilityProvider, RequiresNodeControl, + Runner, Scenario, }; use crate::{errors::ComposeRunnerError, lifecycle::cleanup::RunnerCleanup}; @@ -41,7 +42,7 @@ impl ComposeDeployer { #[async_trait] impl Deployer for ComposeDeployer where - Caps: RequiresNodeControl + Send + Sync, + Caps: RequiresNodeControl + ObservabilityCapabilityProvider + Send + Sync, { type Error = ComposeRunnerError; diff --git a/testing-framework/deployers/compose/src/deployer/orchestrator.rs b/testing-framework/deployers/compose/src/deployer/orchestrator.rs index 
598a0ae..f08ba90 100644 --- a/testing-framework/deployers/compose/src/deployer/orchestrator.rs +++ b/testing-framework/deployers/compose/src/deployer/orchestrator.rs @@ -1,7 +1,8 @@ use std::sync::Arc; use testing_framework_core::scenario::{ - NodeControlHandle, RequiresNodeControl, RunContext, Runner, Scenario, + NodeControlHandle, ObservabilityCapabilityProvider, ObservabilityInputs, RequiresNodeControl, + RunContext, Runner, Scenario, }; use tracing::info; @@ -20,7 +21,6 @@ use crate::{ environment::StackEnvironment, ports::{HostPortMapping, compose_runner_host}, }, - lifecycle::readiness::metrics_handle_from_port, }; pub struct DeploymentOrchestrator { @@ -37,21 +37,35 @@ impl DeploymentOrchestrator { scenario: &Scenario, ) -> Result where - Caps: RequiresNodeControl + Send + Sync, + Caps: RequiresNodeControl + ObservabilityCapabilityProvider + Send + Sync, { let setup = DeploymentSetup::new(scenario.topology()); setup.validate_environment().await?; + let env_inputs = ObservabilityInputs::from_env()?; + let cap_inputs = scenario + .capabilities() + .observability_capability() + .map(ObservabilityInputs::from_capability) + .unwrap_or_default(); + let observability = env_inputs.with_overrides(cap_inputs).normalized(); + let DeploymentContext { mut environment, descriptors, - } = setup.prepare_workspace().await?; + } = setup.prepare_workspace(&observability).await?; tracing::info!( validators = descriptors.validators().len(), executors = descriptors.executors().len(), duration_secs = scenario.duration().as_secs(), readiness_checks = self.deployer.readiness_checks, + metrics_query_url = observability.metrics_query_url.as_ref().map(|u| u.as_str()), + metrics_otlp_ingest_url = observability + .metrics_otlp_ingest_url + .as_ref() + .map(|u| u.as_str()), + grafana_url = observability.grafana_url.as_ref().map(|u| u.as_str()), "compose deployment starting" ); @@ -71,26 +85,31 @@ impl DeploymentOrchestrator { let node_clients = client_builder 
.build_node_clients(&descriptors, &host_ports, &host, &mut environment) .await?; - let telemetry = metrics_handle_from_port(environment.prometheus_port(), &host)?; + let telemetry = observability.telemetry_handle()?; let node_control = self.maybe_node_control::(&environment); - info!( - prometheus_url = %format!("http://{}:{}/", host, environment.prometheus_port()), - "prometheus endpoint available on host" - ); - info!( - grafana_url = %format!("http://{}:{}/", host, environment.grafana_port()), - "grafana dashboard available on host" - ); + if let Some(url) = observability.metrics_query_url.as_ref() { + info!(metrics_query_url = %url.as_str(), "metrics query endpoint configured"); + } + if let Some(url) = observability.grafana_url.as_ref() { + info!(grafana_url = %url.as_str(), "grafana url configured"); + } log_profiling_urls(&host, &host_ports); if std::env::var("TESTNET_PRINT_ENDPOINTS").is_ok() { + let prometheus = observability + .metrics_query_url + .as_ref() + .map(|u| u.as_str().to_string()) + .unwrap_or_else(|| "".to_string()); + let grafana = observability + .grafana_url + .as_ref() + .map(|u| u.as_str().to_string()) + .unwrap_or_else(|| "".to_string()); println!( - "TESTNET_ENDPOINTS prometheus=http://{}:{}/ grafana=http://{}:{}/", - host, - environment.prometheus_port(), - host, - environment.grafana_port() + "TESTNET_ENDPOINTS prometheus={} grafana={}", + prometheus, grafana ); print_profiling_urls(&host, &host_ports); diff --git a/testing-framework/deployers/compose/src/deployer/ports.rs b/testing-framework/deployers/compose/src/deployer/ports.rs index 624259a..2ca9708 100644 --- a/testing-framework/deployers/compose/src/deployer/ports.rs +++ b/testing-framework/deployers/compose/src/deployer/ports.rs @@ -26,7 +26,6 @@ impl PortManager { info!( validator_ports = ?mapping.validator_api_ports(), executor_ports = ?mapping.executor_api_ports(), - prometheus_port = environment.prometheus_port(), "resolved container host ports" ); Ok(mapping) diff --git 
a/testing-framework/deployers/compose/src/deployer/setup.rs b/testing-framework/deployers/compose/src/deployer/setup.rs index 3d9036d..3cccee7 100644 --- a/testing-framework/deployers/compose/src/deployer/setup.rs +++ b/testing-framework/deployers/compose/src/deployer/setup.rs @@ -1,22 +1,16 @@ -use std::{ - env, - net::{Ipv4Addr, TcpListener as StdTcpListener}, +use testing_framework_core::{ + scenario::ObservabilityInputs, topology::generation::GeneratedTopology, }; - -use testing_framework_core::topology::generation::GeneratedTopology; -use tracing::{debug, info}; +use tracing::info; use crate::{ docker::ensure_docker_available, errors::ComposeRunnerError, infrastructure::environment::{ - PortReservation, StackEnvironment, ensure_supported_topology, prepare_environment, + StackEnvironment, ensure_supported_topology, prepare_environment, }, }; -pub const PROMETHEUS_PORT_ENV: &str = "TEST_FRAMEWORK_PROMETHEUS_PORT"; -pub const DEFAULT_PROMETHEUS_PORT: u16 = 9090; - pub struct DeploymentSetup { descriptors: GeneratedTopology, } @@ -46,27 +40,15 @@ impl DeploymentSetup { Ok(()) } - pub async fn prepare_workspace(self) -> Result { - let prometheus_env = env::var(PROMETHEUS_PORT_ENV) - .ok() - .and_then(|raw| raw.parse::().ok()); - if prometheus_env.is_some() { - info!(port = prometheus_env, "using prometheus port from env"); - } - - let prometheus_port = prometheus_env - .and_then(|port| reserve_port(port)) - .or_else(|| allocate_prometheus_port()) - .unwrap_or_else(|| PortReservation::new(DEFAULT_PROMETHEUS_PORT, None)); - - debug!( - prometheus_port = prometheus_port.port(), - "selected prometheus port" - ); - - let environment = - prepare_environment(&self.descriptors, prometheus_port, prometheus_env.is_some()) - .await?; + pub async fn prepare_workspace( + self, + observability: &ObservabilityInputs, + ) -> Result { + let environment = prepare_environment( + &self.descriptors, + observability.metrics_otlp_ingest_url.as_ref(), + ) + .await?; info!( compose_file = 
%environment.compose_path().display(), @@ -81,13 +63,3 @@ impl DeploymentSetup { }) } } - -fn allocate_prometheus_port() -> Option { - reserve_port(DEFAULT_PROMETHEUS_PORT).or_else(|| reserve_port(0)) -} - -fn reserve_port(port: u16) -> Option { - let listener = StdTcpListener::bind((Ipv4Addr::LOCALHOST, port)).ok()?; - let actual_port = listener.local_addr().ok()?.port(); - Some(PortReservation::new(actual_port, Some(listener))) -} diff --git a/testing-framework/deployers/compose/src/descriptor/mod.rs b/testing-framework/deployers/compose/src/descriptor/mod.rs index 8b0317b..08921cb 100644 --- a/testing-framework/deployers/compose/src/descriptor/mod.rs +++ b/testing-framework/deployers/compose/src/descriptor/mod.rs @@ -1,6 +1,6 @@ use serde::Serialize; use testing_framework_core::{ - constants::{DEFAULT_CFGSYNC_PORT, DEFAULT_PROMETHEUS_HTTP_PORT, kzg_container_path}, + constants::{DEFAULT_CFGSYNC_PORT, kzg_container_path}, topology::generation::{GeneratedNodeConfig, GeneratedTopology}, }; @@ -10,18 +10,9 @@ mod node; pub use node::{EnvEntry, NodeDescriptor}; -/// Errors building a compose descriptor from the topology. -#[derive(Debug, thiserror::Error)] -pub enum DescriptorBuildError { - #[error("prometheus port is not configured for compose descriptor")] - MissingPrometheusPort, -} - /// Top-level docker-compose descriptor built from a GeneratedTopology. 
#[derive(Clone, Debug, Serialize)] pub struct ComposeDescriptor { - prometheus: PrometheusTemplate, - grafana: GrafanaTemplate, validators: Vec, executors: Vec, } @@ -50,8 +41,6 @@ pub struct ComposeDescriptorBuilder<'a> { topology: &'a GeneratedTopology, use_kzg_mount: bool, cfgsync_port: Option, - prometheus_port: Option, - grafana_port: Option, } impl<'a> ComposeDescriptorBuilder<'a> { @@ -60,8 +49,6 @@ impl<'a> ComposeDescriptorBuilder<'a> { topology, use_kzg_mount: false, cfgsync_port: None, - prometheus_port: None, - grafana_port: None, } } @@ -79,34 +66,12 @@ impl<'a> ComposeDescriptorBuilder<'a> { self } + /// Finish building the descriptor. #[must_use] - /// Set host port mapping for Prometheus. - pub const fn with_prometheus_port(mut self, port: u16) -> Self { - self.prometheus_port = Some(port); - self - } - - #[must_use] - /// Set host port mapping for Grafana. - pub const fn with_grafana_port(mut self, port: u16) -> Self { - self.grafana_port = Some(port); - self - } - - /// Finish building the descriptor, erroring if required fields are missing. - pub fn build(self) -> Result { + pub fn build(self) -> ComposeDescriptor { let cfgsync_port = self.cfgsync_port.unwrap_or(DEFAULT_CFGSYNC_PORT); - let prometheus_host_port = self - .prometheus_port - .ok_or(DescriptorBuildError::MissingPrometheusPort)?; - let grafana_host_port = self.grafana_port.unwrap_or(0); let (image, platform) = resolve_image(); - // Prometheus image is x86_64-only on some tags; set platform when on arm hosts. 
- let prometheus_platform = match std::env::consts::ARCH { - "aarch64" | "arm64" => Some(String::from("linux/arm64")), - _ => None, - }; let validators = build_nodes( self.topology.validators(), @@ -126,49 +91,13 @@ impl<'a> ComposeDescriptorBuilder<'a> { cfgsync_port, ); - Ok(ComposeDescriptor { - prometheus: PrometheusTemplate::new(prometheus_host_port, prometheus_platform), - grafana: GrafanaTemplate::new(grafana_host_port), + ComposeDescriptor { validators, executors, - }) - } -} - -/// Minimal Prometheus service mapping used in the compose template. -#[derive(Clone, Debug, Serialize)] -pub struct PrometheusTemplate { - host_port: String, - #[serde(skip_serializing_if = "Option::is_none")] - platform: Option, -} - -impl PrometheusTemplate { - fn new(port: u16, platform: Option) -> Self { - Self { - host_port: format!("127.0.0.1:{port}:{}", DEFAULT_PROMETHEUS_HTTP_PORT), - platform, } } } -/// Minimal Grafana service mapping used in the compose template. -#[derive(Clone, Debug, Serialize)] -pub struct GrafanaTemplate { - host_port: String, -} - -impl GrafanaTemplate { - fn new(port: u16) -> Self { - let host_port = match port { - 0 => "127.0.0.1::3000".to_string(), // docker assigns host port - _ => format!("127.0.0.1:{port}:3000"), - }; - - Self { host_port } - } -} - #[derive(Clone, Copy)] pub(crate) enum ComposeNodeKind { Validator, diff --git a/testing-framework/deployers/compose/src/docker/workspace.rs b/testing-framework/deployers/compose/src/docker/workspace.rs index 8200475..752eaa0 100644 --- a/testing-framework/deployers/compose/src/docker/workspace.rs +++ b/testing-framework/deployers/compose/src/docker/workspace.rs @@ -49,24 +49,6 @@ impl ComposeWorkspace { copy_dir_recursive(&scripts_source, &temp.path().join("stack/scripts"))?; } - // Ensure Prometheus config is a file (Docker bind mount fails if a directory - // exists). 
- let prometheus_src = stack_source.join("monitoring/prometheus.yml"); - let prometheus_dst = temp.path().join("stack/monitoring/prometheus.yml"); - if prometheus_dst.exists() && prometheus_dst.is_dir() { - fs::remove_dir_all(&prometheus_dst) - .with_context(|| format!("removing bogus dir {}", prometheus_dst.display()))?; - } - if !prometheus_dst.exists() { - fs::copy(&prometheus_src, &prometheus_dst).with_context(|| { - format!( - "copying prometheus.yml {} -> {}", - prometheus_src.display(), - prometheus_dst.display() - ) - })?; - } - let kzg_source = repo_root.join("testing-framework/assets/stack/kzgrs_test_params"); let target = temp.path().join("kzgrs_test_params"); if kzg_source.exists() { diff --git a/testing-framework/deployers/compose/src/errors.rs b/testing-framework/deployers/compose/src/errors.rs index fbe0e52..9653eb4 100644 --- a/testing-framework/deployers/compose/src/errors.rs +++ b/testing-framework/deployers/compose/src/errors.rs @@ -9,10 +9,7 @@ use testing_framework_core::{ }; use url::ParseError; -use crate::{ - descriptor::DescriptorBuildError, docker::commands::ComposeCommandError, - infrastructure::template::TemplateError, -}; +use crate::{docker::commands::ComposeCommandError, infrastructure::template::TemplateError}; #[derive(Debug, thiserror::Error)] /// Top-level compose runner errors. 
@@ -94,11 +91,6 @@ pub enum ConfigError { #[source] source: anyhow::Error, }, - #[error("failed to build compose descriptor: {source}")] - Descriptor { - #[source] - source: DescriptorBuildError, - }, #[error("failed to render compose template: {source}")] Template { #[source] diff --git a/testing-framework/deployers/compose/src/infrastructure/cfgsync.rs b/testing-framework/deployers/compose/src/infrastructure/cfgsync.rs index 224d4d7..8786cea 100644 --- a/testing-framework/deployers/compose/src/infrastructure/cfgsync.rs +++ b/testing-framework/deployers/compose/src/infrastructure/cfgsync.rs @@ -1,5 +1,8 @@ use std::{path::Path, process::Command as StdCommand}; +use nomos_tracing::metrics::otlp::OtlpMetricsConfig; +use nomos_tracing_service::MetricsLayer; +use reqwest::Url; use testing_framework_core::{ scenario::cfgsync::{apply_topology_overrides, load_cfgsync_template, write_cfgsync_template}, topology::generation::GeneratedTopology, @@ -60,6 +63,7 @@ pub fn update_cfgsync_config( topology: &GeneratedTopology, use_kzg_mount: bool, port: u16, + metrics_otlp_ingest_url: Option<&Url>, ) -> anyhow::Result<()> { debug!( path = %path.display(), @@ -72,6 +76,12 @@ pub fn update_cfgsync_config( let mut cfg = load_cfgsync_template(path)?; cfg.port = port; apply_topology_overrides(&mut cfg, topology, use_kzg_mount); + if let Some(endpoint) = metrics_otlp_ingest_url.cloned() { + cfg.tracing_settings.metrics = MetricsLayer::Otlp(OtlpMetricsConfig { + endpoint, + host_identifier: "node".into(), + }); + } write_cfgsync_template(path, &cfg)?; Ok(()) } diff --git a/testing-framework/deployers/compose/src/infrastructure/environment.rs b/testing-framework/deployers/compose/src/infrastructure/environment.rs index 2a6a894..07fdee3 100644 --- a/testing-framework/deployers/compose/src/infrastructure/environment.rs +++ b/testing-framework/deployers/compose/src/infrastructure/environment.rs @@ -1,20 +1,19 @@ use std::{ - env, net::{Ipv4Addr, TcpListener as StdTcpListener}, path::{Path, 
PathBuf}, time::Duration, }; -use anyhow::{Context as _, anyhow}; +use anyhow::anyhow; +use reqwest::Url; use testing_framework_core::{ adjust_timeout, scenario::CleanupGuard, topology::generation::GeneratedTopology, }; -use tokio::{process::Command, time::timeout}; -use tracing::{debug, error, info, warn}; +use tokio::process::Command; +use tracing::{debug, error, info}; use uuid::Uuid; use crate::{ - deployer::setup::DEFAULT_PROMETHEUS_PORT, descriptor::ComposeDescriptor, docker::{ commands::{compose_up, dump_compose_logs, run_docker_command}, @@ -31,8 +30,6 @@ use crate::{ }; const CFGSYNC_START_TIMEOUT: Duration = Duration::from_secs(180); -const COMPOSE_PORT_DISCOVERY_TIMEOUT: Duration = Duration::from_secs(30); -const STACK_BRINGUP_MAX_ATTEMPTS: usize = 3; /// Paths and flags describing the prepared compose workspace. pub struct WorkspaceState { @@ -49,8 +46,6 @@ pub struct StackEnvironment { root: PathBuf, workspace: Option, cfgsync_handle: Option, - prometheus_port: u16, - grafana_port: u16, } impl StackEnvironment { @@ -60,8 +55,6 @@ impl StackEnvironment { compose_path: PathBuf, project_name: String, cfgsync_handle: Option, - prometheus_port: u16, - grafana_port: u16, ) -> Self { let WorkspaceState { workspace, root, .. @@ -73,8 +66,6 @@ impl StackEnvironment { root, workspace: Some(workspace), cfgsync_handle, - prometheus_port, - grafana_port, } } @@ -82,16 +73,6 @@ impl StackEnvironment { &self.compose_path } - /// Host port exposed by Prometheus. - pub const fn prometheus_port(&self) -> u16 { - self.prometheus_port - } - - /// Host port exposed by Grafana. - pub const fn grafana_port(&self) -> u16 { - self.grafana_port - } - /// Docker compose project name. pub fn project_name(&self) -> &str { &self.project_name @@ -138,27 +119,6 @@ impl StackEnvironment { } } -/// Represents a claimed port, optionally guarded by an open socket. 
-pub struct PortReservation { - port: u16, - _guard: Option, -} - -impl PortReservation { - /// Holds a port and an optional socket guard to keep it reserved. - pub const fn new(port: u16, guard: Option) -> Self { - Self { - port, - _guard: guard, - } - } - - /// The reserved port number. - pub const fn port(&self) -> u16 { - self.port - } -} - /// Verifies the topology has at least one validator so compose can start. pub fn ensure_supported_topology( descriptors: &GeneratedTopology, @@ -210,10 +170,17 @@ pub fn update_cfgsync_logged( workspace: &WorkspaceState, descriptors: &GeneratedTopology, cfgsync_port: u16, + metrics_otlp_ingest_url: Option<&Url>, ) -> Result<(), ComposeRunnerError> { info!(cfgsync_port, "updating cfgsync configuration"); - configure_cfgsync(workspace, descriptors, cfgsync_port).map_err(Into::into) + configure_cfgsync( + workspace, + descriptors, + cfgsync_port, + metrics_otlp_ingest_url, + ) + .map_err(Into::into) } /// Start the cfgsync server container using the generated config. 
@@ -232,12 +199,14 @@ pub fn configure_cfgsync( workspace: &WorkspaceState, descriptors: &GeneratedTopology, cfgsync_port: u16, + metrics_otlp_ingest_url: Option<&Url>, ) -> Result<(), ConfigError> { update_cfgsync_config( &workspace.cfgsync_path, descriptors, workspace.use_kzg, cfgsync_port, + metrics_otlp_ingest_url, ) .map_err(|source| ConfigError::Cfgsync { path: workspace.cfgsync_path.clone(), @@ -328,23 +297,16 @@ pub fn write_compose_artifacts( workspace: &WorkspaceState, descriptors: &GeneratedTopology, cfgsync_port: u16, - prometheus_port: u16, - grafana_port: u16, ) -> Result { debug!( cfgsync_port, - prometheus_port, - grafana_port, workspace_root = %workspace.root.display(), "building compose descriptor" ); let descriptor = ComposeDescriptor::builder(descriptors) .with_kzg_mount(workspace.use_kzg) .with_cfgsync_port(cfgsync_port) - .with_prometheus_port(prometheus_port) - .with_grafana_port(grafana_port) - .build() - .map_err(|source| ConfigError::Descriptor { source })?; + .build(); let compose_path = workspace.root.join("compose.generated.yml"); write_compose_file(&descriptor, &compose_path) @@ -358,21 +320,9 @@ pub fn render_compose_logged( workspace: &WorkspaceState, descriptors: &GeneratedTopology, cfgsync_port: u16, - prometheus_port: u16, - grafana_port: u16, ) -> Result { - info!( - cfgsync_port, - prometheus_port, grafana_port, "rendering compose file with ports" - ); - write_compose_artifacts( - workspace, - descriptors, - cfgsync_port, - prometheus_port, - grafana_port, - ) - .map_err(Into::into) + info!(cfgsync_port, "rendering compose file"); + write_compose_artifacts(workspace, descriptors, cfgsync_port).map_err(Into::into) } /// Bring up docker compose; shut down cfgsync if start-up fails. @@ -404,172 +354,46 @@ pub async fn bring_up_stack_logged( /// Prepare workspace, cfgsync, compose artifacts, and launch the stack. 
pub async fn prepare_environment( descriptors: &GeneratedTopology, - mut prometheus_port: PortReservation, - prometheus_port_locked: bool, + metrics_otlp_ingest_url: Option<&Url>, ) -> Result { let workspace = prepare_workspace_logged()?; let cfgsync_port = allocate_cfgsync_port()?; - - let grafana_env = env::var("COMPOSE_GRAFANA_PORT") - .ok() - .and_then(|raw| raw.parse::().ok()); - if let Some(port) = grafana_env { - info!(port, "using grafana port from env"); - } - - update_cfgsync_logged(&workspace, descriptors, cfgsync_port)?; + update_cfgsync_logged( + &workspace, + descriptors, + cfgsync_port, + metrics_otlp_ingest_url, + )?; ensure_compose_image().await?; + let compose_path = render_compose_logged(&workspace, descriptors, cfgsync_port)?; - let attempts = if prometheus_port_locked { - 1 - } else { - STACK_BRINGUP_MAX_ATTEMPTS - }; - let mut last_err = None; + let project_name = format!("nomos-compose-{}", Uuid::new_v4()); + let mut cfgsync_handle = start_cfgsync_stage(&workspace, cfgsync_port).await?; - for _ in 0..attempts { - let prometheus_port_value = prometheus_port.port(); - let grafana_port_value = grafana_env.unwrap_or(0); - let compose_path = render_compose_logged( - &workspace, - descriptors, - cfgsync_port, - prometheus_port_value, - grafana_port_value, - )?; - - let project_name = format!("nomos-compose-{}", Uuid::new_v4()); - let mut cfgsync_handle = start_cfgsync_stage(&workspace, cfgsync_port).await?; - - drop(prometheus_port); - - match bring_up_stack_logged( - &compose_path, - &project_name, - &workspace.root, - &mut cfgsync_handle, - ) - .await - { - Ok(()) => { - let grafana_port_resolved = resolve_service_port( - &compose_path, - &project_name, - &workspace.root, - "grafana", - 3000, - ) - .await - .unwrap_or(grafana_port_value); - - info!( - project = %project_name, - compose_file = %compose_path.display(), - cfgsync_port, - prometheus_port = prometheus_port_value, - grafana_port = grafana_port_resolved, - "compose stack is up" - ); - 
return Ok(StackEnvironment::from_workspace( - workspace, - compose_path, - project_name, - Some(cfgsync_handle), - prometheus_port_value, - grafana_port_resolved, - )); - } - Err(err) => { - // Attempt to capture container logs even when bring-up fails early. - dump_compose_logs(&compose_path, &project_name, &workspace.root).await; - cfgsync_handle.shutdown(); - last_err = Some(err); - if prometheus_port_locked { - break; - } - warn!( - error = %last_err.as_ref().unwrap(), - "compose bring-up failed; retrying with a new prometheus port" - ); - prometheus_port = allocate_prometheus_port() - .unwrap_or_else(|| PortReservation::new(DEFAULT_PROMETHEUS_PORT, None)); - debug!( - next_prometheus_port = prometheus_port.port(), - "retrying compose bring-up" - ); - } - } + if let Err(err) = bring_up_stack_logged( + &compose_path, + &project_name, + &workspace.root, + &mut cfgsync_handle, + ) + .await + { + dump_compose_logs(&compose_path, &project_name, &workspace.root).await; + cfgsync_handle.shutdown(); + return Err(err); } - Err(last_err.expect("prepare_environment should return or fail with error")) -} - -fn allocate_prometheus_port() -> Option { - reserve_prometheus_port(DEFAULT_PROMETHEUS_PORT).or_else(|| reserve_prometheus_port(0)) -} - -fn reserve_prometheus_port(port: u16) -> Option { - let listener = StdTcpListener::bind((Ipv4Addr::LOCALHOST, port)).ok()?; - let actual_port = listener.local_addr().ok()?.port(); - Some(PortReservation::new(actual_port, Some(listener))) -} - -async fn resolve_service_port( - compose_file: &Path, - project_name: &str, - root: &Path, - service: &str, - container_port: u16, -) -> Result { - let mut cmd = Command::new("docker"); - cmd.arg("compose") - .arg("-f") - .arg(compose_file) - .arg("-p") - .arg(project_name) - .arg("port") - .arg(service) - .arg(container_port.to_string()) - .current_dir(root); - - let output = timeout(adjust_timeout(COMPOSE_PORT_DISCOVERY_TIMEOUT), cmd.output()) - .await - .map_err(|_| 
ComposeRunnerError::PortDiscovery { - service: service.to_owned(), - container_port, - source: anyhow!("docker compose port timed out"), - })? - .with_context(|| format!("running docker compose port {service} {container_port}")) - .map_err(|source| ComposeRunnerError::PortDiscovery { - service: service.to_owned(), - container_port, - source, - })?; - - if !output.status.success() { - return Err(ComposeRunnerError::PortDiscovery { - service: service.to_owned(), - container_port, - source: anyhow!("docker compose port exited with {}", output.status), - }); - } - - let stdout = String::from_utf8_lossy(&output.stdout); - for line in stdout.lines() { - let line = line.trim(); - if line.is_empty() { - continue; - } - if let Some(port_str) = line.rsplit(':').next() - && let Ok(port) = port_str.trim().parse::() - { - return Ok(port); - } - } - - Err(ComposeRunnerError::PortDiscovery { - service: service.to_owned(), - container_port, - source: anyhow!("unable to parse docker compose port output: {stdout}"), - }) + info!( + project = %project_name, + compose_file = %compose_path.display(), + cfgsync_port, + "compose stack is up" + ); + + Ok(StackEnvironment::from_workspace( + workspace, + compose_path, + project_name, + Some(cfgsync_handle), + )) } diff --git a/testing-framework/deployers/compose/src/lifecycle/readiness.rs b/testing-framework/deployers/compose/src/lifecycle/readiness.rs index d3d535c..7c7082c 100644 --- a/testing-framework/deployers/compose/src/lifecycle/readiness.rs +++ b/testing-framework/deployers/compose/src/lifecycle/readiness.rs @@ -3,7 +3,7 @@ use std::time::Duration; use reqwest::Url; use testing_framework_core::{ nodes::ApiClient, - scenario::{Metrics, MetricsError, NodeClients, http_probe::NodeRole as HttpNodeRole}, + scenario::{NodeClients, http_probe::NodeRole as HttpNodeRole}, topology::generation::{GeneratedTopology, NodeRole as TopologyNodeRole}, }; use tokio::time::sleep; @@ -16,13 +16,6 @@ use crate::{ const DISABLED_READINESS_SLEEP: 
Duration = Duration::from_secs(5); -/// Build a metrics client from host/port, validating the URL. -pub fn metrics_handle_from_port(port: u16, host: &str) -> Result { - let url = Url::parse(&format!("http://{host}:{port}/")) - .map_err(|err| MetricsError::new(format!("invalid prometheus url: {err}")))?; - Metrics::from_prometheus(url) -} - /// Wait until all validators respond on their API ports. pub async fn ensure_validators_ready_with_ports(ports: &[u16]) -> Result<(), StackReadinessError> { if ports.is_empty() { diff --git a/testing-framework/deployers/k8s/Cargo.toml b/testing-framework/deployers/k8s/Cargo.toml index d6ecbca..9ca0f41 100644 --- a/testing-framework/deployers/k8s/Cargo.toml +++ b/testing-framework/deployers/k8s/Cargo.toml @@ -17,6 +17,7 @@ anyhow = "1" async-trait = { workspace = true } k8s-openapi = { version = "0.20", features = ["latest"] } kube = { version = "0.87", default-features = false, features = ["client", "runtime", "rustls-tls"] } +nomos-tracing = { workspace = true } nomos-tracing-service = { workspace = true } reqwest = { workspace = true, features = ["json"] } serde = { version = "1", features = ["derive"] } diff --git a/testing-framework/deployers/k8s/helm/nomos-runner/grafana/dashboards/.gitignore b/testing-framework/deployers/k8s/helm/nomos-runner/grafana/dashboards/.gitignore deleted file mode 100644 index 0827618..0000000 --- a/testing-framework/deployers/k8s/helm/nomos-runner/grafana/dashboards/.gitignore +++ /dev/null @@ -1,2 +0,0 @@ -*.json -!.gitignore diff --git a/testing-framework/deployers/k8s/helm/nomos-runner/templates/_helpers.tpl b/testing-framework/deployers/k8s/helm/nomos-runner/templates/_helpers.tpl index 7d1b4b8..cd8406f 100644 --- a/testing-framework/deployers/k8s/helm/nomos-runner/templates/_helpers.tpl +++ b/testing-framework/deployers/k8s/helm/nomos-runner/templates/_helpers.tpl @@ -37,9 +37,3 @@ app.kubernetes.io/instance: {{ $root.Release.Name }} nomos/logical-role: executor nomos/executor-index: "{{ 
$index }}" {{- end -}} - -{{- define "nomos-runner.prometheusLabels" -}} -app.kubernetes.io/name: {{ include "nomos-runner.chart" . }} -app.kubernetes.io/instance: {{ .Release.Name }} -nomos/logical-role: prometheus -{{- end -}} diff --git a/testing-framework/deployers/k8s/helm/nomos-runner/templates/grafana-configmap.yaml b/testing-framework/deployers/k8s/helm/nomos-runner/templates/grafana-configmap.yaml deleted file mode 100644 index 6cf2938..0000000 --- a/testing-framework/deployers/k8s/helm/nomos-runner/templates/grafana-configmap.yaml +++ /dev/null @@ -1,33 +0,0 @@ -{{- if .Values.grafana.enabled }} -apiVersion: v1 -kind: ConfigMap -metadata: - name: {{ include "nomos-runner.fullname" . }}-grafana-config -data: - datasources.yaml: | - apiVersion: 1 - datasources: - - name: Prometheus - type: prometheus - access: proxy - isDefault: true - uid: PBFA97CFB590B2093 - orgId: 1 - url: {{ if .Values.prometheus.enabled }}http://prometheus:9090{{ else }}{{ required "prometheus.externalUrl must be set when prometheus.enabled=false" .Values.prometheus.externalUrl }}{{ end }} - editable: true - dashboards.yaml: | - apiVersion: 1 - providers: - - name: 'default' - orgId: 1 - folder: 'Nomos' - type: file - disableDeletion: false - editable: true - options: - path: /var/lib/grafana/dashboards -{{ range $path, $_ := .Files.Glob "grafana/dashboards/*.json" }} - {{ base $path }}: | -{{ $.Files.Get $path | indent 4 }} -{{ end }} -{{- end }} diff --git a/testing-framework/deployers/k8s/helm/nomos-runner/templates/grafana-deployment.yaml b/testing-framework/deployers/k8s/helm/nomos-runner/templates/grafana-deployment.yaml deleted file mode 100644 index 8f0448d..0000000 --- a/testing-framework/deployers/k8s/helm/nomos-runner/templates/grafana-deployment.yaml +++ /dev/null @@ -1,64 +0,0 @@ -{{- if .Values.grafana.enabled }} -apiVersion: apps/v1 -kind: Deployment -metadata: - name: {{ include "nomos-runner.fullname" . }}-grafana - labels: - app: {{ include "nomos-runner.name" . 
}} - chart: {{ include "nomos-runner.chart" . }} - release: {{ .Release.Name }} - heritage: {{ .Release.Service }} -spec: - replicas: 1 - selector: - matchLabels: - app: {{ include "nomos-runner.name" . }} - component: grafana - template: - metadata: - labels: - app: {{ include "nomos-runner.name" . }} - component: grafana - spec: - containers: - - name: grafana - image: {{ .Values.grafana.image }} - imagePullPolicy: {{ .Values.grafana.imagePullPolicy }} - env: - - name: GF_SECURITY_ADMIN_USER - value: {{ .Values.grafana.adminUser | quote }} - - name: GF_SECURITY_ADMIN_PASSWORD - value: {{ .Values.grafana.adminPassword | quote }} - ports: - - containerPort: 3000 - name: http - volumeMounts: - - name: grafana-config - mountPath: /etc/grafana/provisioning/datasources/datasources.yaml - subPath: datasources.yaml - readOnly: true - - name: grafana-config - mountPath: /etc/grafana/provisioning/dashboards/dashboards.yaml - subPath: dashboards.yaml - readOnly: true - - name: grafana-dashboards - mountPath: /var/lib/grafana/dashboards - readOnly: true - volumes: - - name: grafana-config - configMap: - name: {{ include "nomos-runner.fullname" . }}-grafana-config - items: - - key: datasources.yaml - path: datasources.yaml - - key: dashboards.yaml - path: dashboards.yaml - - name: grafana-dashboards - configMap: - name: {{ include "nomos-runner.fullname" . }}-grafana-config - items: -{{ range $path, $_ := .Files.Glob "grafana/dashboards/*.json" }} - - key: {{ base $path }} - path: {{ base $path }} -{{ end }} -{{- end }} diff --git a/testing-framework/deployers/k8s/helm/nomos-runner/templates/grafana-service.yaml b/testing-framework/deployers/k8s/helm/nomos-runner/templates/grafana-service.yaml deleted file mode 100644 index b430a2a..0000000 --- a/testing-framework/deployers/k8s/helm/nomos-runner/templates/grafana-service.yaml +++ /dev/null @@ -1,22 +0,0 @@ -{{- if .Values.grafana.enabled }} -apiVersion: v1 -kind: Service -metadata: - name: {{ include "nomos-runner.fullname" . 
}}-grafana - labels: - app: {{ include "nomos-runner.name" . }} - component: grafana -spec: - type: {{ .Values.grafana.service.type }} - ports: - - port: 3000 - targetPort: http - protocol: TCP - name: http - {{- if and (eq .Values.grafana.service.type "NodePort") .Values.grafana.service.nodePort }} - nodePort: {{ .Values.grafana.service.nodePort }} - {{- end }} - selector: - app: {{ include "nomos-runner.name" . }} - component: grafana -{{- end }} diff --git a/testing-framework/deployers/k8s/helm/nomos-runner/templates/prometheus-configmap.yaml b/testing-framework/deployers/k8s/helm/nomos-runner/templates/prometheus-configmap.yaml deleted file mode 100644 index 7eaa16a..0000000 --- a/testing-framework/deployers/k8s/helm/nomos-runner/templates/prometheus-configmap.yaml +++ /dev/null @@ -1,16 +0,0 @@ -{{- if .Values.prometheus.enabled }} ---- -apiVersion: v1 -kind: ConfigMap -metadata: - name: prometheus - labels: - {{- include "nomos-runner.prometheusLabels" . | nindent 4 }} -data: - prometheus.yml: | -{{- if .Values.prometheus.config }} -{{ .Values.prometheus.config | indent 4 }} -{{- else }} -{{ "" | indent 4 }} -{{- end }} -{{- end }} diff --git a/testing-framework/deployers/k8s/helm/nomos-runner/templates/prometheus-deployment.yaml b/testing-framework/deployers/k8s/helm/nomos-runner/templates/prometheus-deployment.yaml deleted file mode 100644 index 4cda1c1..0000000 --- a/testing-framework/deployers/k8s/helm/nomos-runner/templates/prometheus-deployment.yaml +++ /dev/null @@ -1,38 +0,0 @@ -{{- if .Values.prometheus.enabled }} ---- -apiVersion: apps/v1 -kind: Deployment -metadata: - name: prometheus - labels: - {{- include "nomos-runner.prometheusLabels" . | nindent 4 }} -spec: - replicas: 1 - selector: - matchLabels: - {{- include "nomos-runner.prometheusLabels" . | nindent 6 }} - template: - metadata: - labels: - {{- include "nomos-runner.prometheusLabels" . 
| nindent 8 }} - spec: - containers: - - name: prometheus - image: {{ .Values.prometheus.image }} - imagePullPolicy: {{ .Values.prometheus.imagePullPolicy | default "IfNotPresent" }} - args: - - --config.file=/etc/prometheus/prometheus.yml - - --storage.tsdb.retention.time={{ .Values.prometheus.retention }} - - --web.enable-otlp-receiver - - --enable-feature=otlp-write-receiver - ports: - - containerPort: 9090 - name: http - volumeMounts: - - name: prometheus-config - mountPath: /etc/prometheus - volumes: - - name: prometheus-config - configMap: - name: prometheus -{{- end }} diff --git a/testing-framework/deployers/k8s/helm/nomos-runner/templates/prometheus-service.yaml b/testing-framework/deployers/k8s/helm/nomos-runner/templates/prometheus-service.yaml deleted file mode 100644 index c0d90e2..0000000 --- a/testing-framework/deployers/k8s/helm/nomos-runner/templates/prometheus-service.yaml +++ /dev/null @@ -1,20 +0,0 @@ -{{- if .Values.prometheus.enabled }} ---- -apiVersion: v1 -kind: Service -metadata: - name: prometheus - labels: - {{- include "nomos-runner.prometheusLabels" . | nindent 4 }} -spec: - type: {{ .Values.prometheus.service.type | default "NodePort" }} - selector: - {{- include "nomos-runner.prometheusLabels" . | nindent 4 }} - ports: - - name: http - port: 9090 - targetPort: http - {{- if and (eq (default "NodePort" .Values.prometheus.service.type) "NodePort") .Values.prometheus.service.nodePort }} - nodePort: {{ .Values.prometheus.service.nodePort }} - {{- end }} -{{- end }} diff --git a/testing-framework/deployers/k8s/helm/nomos-runner/templates/pv.yaml b/testing-framework/deployers/k8s/helm/nomos-runner/templates/pv.yaml deleted file mode 100644 index 6372cde..0000000 --- a/testing-framework/deployers/k8s/helm/nomos-runner/templates/pv.yaml +++ /dev/null @@ -1,18 +0,0 @@ -{{- if eq .Values.kzg.mode "hostPath" }} -apiVersion: v1 -kind: PersistentVolume -metadata: - name: {{ include "nomos-runner.fullname" . 
}}-kzg - labels: - {{- include "nomos-runner.labels" . | nindent 4 }} -spec: - capacity: - storage: {{ .Values.kzg.storageSize }} - accessModes: - - ReadOnlyMany - persistentVolumeReclaimPolicy: Delete - storageClassName: manual - hostPath: - path: {{ .Values.kzg.hostPath }} - type: {{ .Values.kzg.hostPathType }} -{{- end }} diff --git a/testing-framework/deployers/k8s/helm/nomos-runner/templates/pvc.yaml b/testing-framework/deployers/k8s/helm/nomos-runner/templates/pvc.yaml deleted file mode 100644 index 93f4081..0000000 --- a/testing-framework/deployers/k8s/helm/nomos-runner/templates/pvc.yaml +++ /dev/null @@ -1,16 +0,0 @@ -{{- if eq .Values.kzg.mode "hostPath" }} -apiVersion: v1 -kind: PersistentVolumeClaim -metadata: - name: {{ include "nomos-runner.fullname" . }}-kzg - labels: - {{- include "nomos-runner.labels" . | nindent 4 }} -spec: - accessModes: - - ReadOnlyMany - storageClassName: manual - volumeName: {{ include "nomos-runner.fullname" . }}-kzg - resources: - requests: - storage: {{ .Values.kzg.storageSize }} -{{- end }} diff --git a/testing-framework/deployers/k8s/helm/nomos-runner/values.yaml b/testing-framework/deployers/k8s/helm/nomos-runner/values.yaml index 4465321..bb6c63d 100644 --- a/testing-framework/deployers/k8s/helm/nomos-runner/values.yaml +++ b/testing-framework/deployers/k8s/helm/nomos-runner/values.yaml @@ -26,28 +26,3 @@ kzg: hostPath: "/var/lib/nomos/kzgrs_test_params" hostPathType: "Directory" storageSize: "1Gi" - -prometheus: - enabled: true - externalUrl: "" - image: "prom/prometheus:v3.0.1" - imagePullPolicy: IfNotPresent - retention: "7d" - service: - type: NodePort - nodePort: null - config: | - global: - evaluation_interval: 15s - external_labels: - monitor: "NomosRunner" - -grafana: - enabled: true - image: "grafana/grafana:10.4.1" - imagePullPolicy: IfNotPresent - adminUser: admin - adminPassword: admin - service: - type: NodePort - nodePort: null diff --git a/testing-framework/deployers/k8s/src/deployer/orchestrator.rs 
b/testing-framework/deployers/k8s/src/deployer/orchestrator.rs index c1873ca..c3517c0 100644 --- a/testing-framework/deployers/k8s/src/deployer/orchestrator.rs +++ b/testing-framework/deployers/k8s/src/deployer/orchestrator.rs @@ -1,11 +1,10 @@ use anyhow::Error; use async_trait::async_trait; use kube::Client; -use reqwest::Url; use testing_framework_core::{ scenario::{ - BlockFeedTask, CleanupGuard, Deployer, MetricsError, ObservabilityCapability, RunContext, - Runner, Scenario, + BlockFeedTask, CleanupGuard, Deployer, MetricsError, ObservabilityCapability, + ObservabilityInputs, RunContext, Runner, Scenario, }, topology::generation::GeneratedTopology, }; @@ -17,13 +16,12 @@ use crate::{ cluster::{ ClusterEnvironment, NodeClientError, PortSpecs, RemoteReadinessError, build_node_clients, cluster_identifiers, collect_port_specs, ensure_cluster_readiness, - install_stack, kill_port_forwards, metrics_handle_from_endpoint, - metrics_handle_from_url, wait_for_ports_or_cleanup, + install_stack, kill_port_forwards, wait_for_ports_or_cleanup, }, helm::HelmError, }, lifecycle::{block_feed::spawn_block_feed_with, cleanup::RunnerCleanup}, - wait::{ClusterWaitError, HostPort, PortForwardHandle}, + wait::{ClusterWaitError, PortForwardHandle}, }; /// Deploys a scenario into Kubernetes using Helm charts and port-forwards. 
@@ -93,7 +91,7 @@ impl Deployer for K8sDeployer { type Error = K8sRunnerError; async fn deploy(&self, scenario: &Scenario) -> Result { - deploy_with_observability(self, scenario, None, None, None).await + deploy_with_observability(self, scenario, None).await } } @@ -105,31 +103,10 @@ impl Deployer for K8sDeployer { &self, scenario: &Scenario, ) -> Result { - deploy_with_observability( - self, - scenario, - scenario.capabilities().metrics_query_url.clone(), - scenario.capabilities().metrics_query_grafana_url.clone(), - scenario.capabilities().metrics_otlp_ingest_url.clone(), - ) - .await + deploy_with_observability(self, scenario, Some(scenario.capabilities())).await } } -fn cluster_prometheus_endpoint(cluster: &Option) -> Option<&HostPort> { - cluster - .as_ref() - .expect("cluster must be available") - .prometheus_endpoint() -} - -fn cluster_grafana_endpoint(cluster: &Option) -> Option<&HostPort> { - cluster - .as_ref() - .expect("cluster must be available") - .grafana_endpoint() -} - async fn fail_cluster(cluster: &mut Option, reason: &str) { if let Some(env) = cluster.as_mut() { env.fail(reason).await; @@ -157,59 +134,13 @@ fn ensure_supported_topology(descriptors: &GeneratedTopology) -> Result<(), K8sR async fn deploy_with_observability( deployer: &K8sDeployer, scenario: &Scenario, - metrics_query_url: Option, - metrics_query_grafana_url: Option, - metrics_otlp_ingest_url: Option, + observability: Option<&ObservabilityCapability>, ) -> Result { - let external_prometheus = match metrics_query_url { - Some(url) => Some(url), - None => match std::env::var("K8S_RUNNER_METRICS_QUERY_URL") - .ok() - .or_else(|| std::env::var("NOMOS_METRICS_QUERY_URL").ok()) - // Back-compat: - .or_else(|| std::env::var("K8S_RUNNER_EXTERNAL_PROMETHEUS_URL").ok()) - .or_else(|| std::env::var("NOMOS_EXTERNAL_PROMETHEUS_URL").ok()) - { - Some(raw) if !raw.trim().is_empty() => { - Some(Url::parse(raw.trim()).map_err(|err| { - MetricsError::new(format!("invalid metrics query url: {err}")) 
- })?) - } - _ => None, - }, - }; - - let external_prometheus_grafana_url = match metrics_query_grafana_url { - Some(url) => Some(url), - None => match std::env::var("K8S_RUNNER_METRICS_QUERY_GRAFANA_URL") - .ok() - .or_else(|| std::env::var("NOMOS_METRICS_QUERY_GRAFANA_URL").ok()) - // Back-compat: - .or_else(|| std::env::var("K8S_RUNNER_EXTERNAL_PROMETHEUS_GRAFANA_URL").ok()) - .or_else(|| std::env::var("NOMOS_EXTERNAL_PROMETHEUS_GRAFANA_URL").ok()) - { - Some(raw) if !raw.trim().is_empty() => Some(Url::parse(raw.trim()).map_err(|err| { - MetricsError::new(format!("invalid metrics query grafana url: {err}")) - })?), - _ => None, - }, - }; - - let external_otlp_metrics_endpoint = match metrics_otlp_ingest_url { - Some(url) => Some(url), - None => match std::env::var("K8S_RUNNER_METRICS_OTLP_INGEST_URL") - .ok() - .or_else(|| std::env::var("NOMOS_METRICS_OTLP_INGEST_URL").ok()) - // Back-compat: - .or_else(|| std::env::var("K8S_RUNNER_EXTERNAL_OTLP_METRICS_ENDPOINT").ok()) - .or_else(|| std::env::var("NOMOS_EXTERNAL_OTLP_METRICS_ENDPOINT").ok()) - { - Some(raw) if !raw.trim().is_empty() => Some(Url::parse(raw.trim()).map_err(|err| { - MetricsError::new(format!("invalid metrics OTLP ingest url: {err}")) - })?), - _ => None, - }, - }; + let env_inputs = ObservabilityInputs::from_env()?; + let cap_inputs = observability + .map(ObservabilityInputs::from_capability) + .unwrap_or_default(); + let observability = env_inputs.with_overrides(cap_inputs).normalized(); let descriptors = scenario.topology().clone(); let validator_count = descriptors.validators().len(); @@ -225,9 +156,16 @@ async fn deploy_with_observability( executors = executor_count, duration_secs = scenario.duration().as_secs(), readiness_checks = deployer.readiness_checks, - metrics_query_url = external_prometheus.as_ref().map(|u| u.as_str()), - metrics_query_grafana_url = external_prometheus_grafana_url.as_ref().map(|u| u.as_str()), - metrics_otlp_ingest_url = 
external_otlp_metrics_endpoint.as_ref().map(|u| u.as_str()), + metrics_query_url = observability.metrics_query_url.as_ref().map(|u| u.as_str()), + metrics_query_grafana_url = observability + .metrics_query_grafana_url + .as_ref() + .map(|u| u.as_str()), + metrics_otlp_ingest_url = observability + .metrics_otlp_ingest_url + .as_ref() + .map(|u| u.as_str()), + grafana_url = observability.grafana_url.as_ref().map(|u| u.as_str()), "starting k8s deployment" ); @@ -238,9 +176,7 @@ async fn deploy_with_observability( &port_specs, &descriptors, deployer.readiness_checks, - external_prometheus.as_ref(), - external_prometheus_grafana_url.as_ref(), - external_otlp_metrics_endpoint.as_ref(), + &observability, ) .await?, ); @@ -259,23 +195,11 @@ async fn deploy_with_observability( } }; - let telemetry = match external_prometheus.clone() { - Some(url) => metrics_handle_from_url(url), - None => cluster - .as_ref() - .and_then(|cluster| cluster.prometheus_endpoint()) - .ok_or_else(|| MetricsError::new("prometheus endpoint unavailable")) - .and_then(metrics_handle_from_endpoint), - }; - let telemetry = match telemetry { + let telemetry = match observability.telemetry_handle() { Ok(handle) => handle, Err(err) => { - fail_cluster( - &mut cluster, - "failed to configure prometheus metrics handle", - ) - .await; - error!(error = ?err, "failed to configure prometheus metrics handle"); + fail_cluster(&mut cluster, "failed to configure metrics telemetry handle").await; + error!(error = ?err, "failed to configure metrics telemetry handle"); return Err(err.into()); } }; @@ -289,36 +213,29 @@ async fn deploy_with_observability( } }; - if let Some(url) = external_prometheus.as_ref() { - info!(prometheus_url = %url.as_str(), "using external prometheus endpoint"); - } else if let Some(prometheus) = cluster_prometheus_endpoint(&cluster) { + if let Some(url) = observability.metrics_query_url.as_ref() { info!( - prometheus_url = %format!("http://{}:{}/", prometheus.host, prometheus.port), - 
"prometheus endpoint available on host" + metrics_query_url = %url.as_str(), + "metrics query endpoint configured" ); } - if let Some(grafana) = cluster_grafana_endpoint(&cluster) { - info!( - grafana_url = %format!("http://{}:{}/", grafana.host, grafana.port), - "grafana dashboard available on host" - ); + if let Some(url) = observability.grafana_url.as_ref() { + info!(grafana_url = %url.as_str(), "grafana url configured"); } if std::env::var("TESTNET_PRINT_ENDPOINTS").is_ok() { - let prometheus = external_prometheus + let prometheus = observability + .metrics_query_url .as_ref() .map(|u| u.as_str().to_string()) - .or_else(|| { - cluster_prometheus_endpoint(&cluster) - .map(|endpoint| format!("http://{}:{}/", endpoint.host, endpoint.port)) - }) .unwrap_or_else(|| "".to_string()); - let grafana = cluster_grafana_endpoint(&cluster); println!( "TESTNET_ENDPOINTS prometheus={} grafana={}", prometheus, - grafana - .map(|endpoint| format!("http://{}:{}/", endpoint.host, endpoint.port)) + observability + .grafana_url + .as_ref() + .map(|u| u.as_str().to_string()) .unwrap_or_else(|| "".to_string()) ); @@ -374,16 +291,9 @@ async fn setup_cluster( specs: &PortSpecs, descriptors: &GeneratedTopology, readiness_checks: bool, - external_prometheus: Option<&Url>, - external_prometheus_grafana_url: Option<&Url>, - external_otlp_metrics_endpoint: Option<&Url>, + observability: &ObservabilityInputs, ) -> Result { - let assets = prepare_assets( - descriptors, - external_prometheus, - external_prometheus_grafana_url, - external_otlp_metrics_endpoint, - )?; + let assets = prepare_assets(descriptors, observability.metrics_otlp_ingest_url.as_ref())?; let validators = descriptors.validators().len(); let executors = descriptors.executors().len(); @@ -394,19 +304,8 @@ async fn setup_cluster( Some(install_stack(client, &assets, &namespace, &release, validators, executors).await?); info!("waiting for helm-managed services to become ready"); - let cluster_ready = wait_for_ports_or_cleanup( - 
client, - &namespace, - &release, - specs, - external_prometheus.is_none() && external_prometheus_grafana_url.is_none(), - &mut cleanup_guard, - ) - .await?; - - if let Some(prometheus) = cluster_ready.ports.prometheus.as_ref() { - info!(prometheus = ?prometheus, "discovered prometheus endpoint"); - } + let cluster_ready = + wait_for_ports_or_cleanup(client, &namespace, &release, specs, &mut cleanup_guard).await?; let environment = ClusterEnvironment::new( client.clone(), diff --git a/testing-framework/deployers/k8s/src/infrastructure/assets.rs b/testing-framework/deployers/k8s/src/infrastructure/assets.rs index fcc6060..c8fcc56 100644 --- a/testing-framework/deployers/k8s/src/infrastructure/assets.rs +++ b/testing-framework/deployers/k8s/src/infrastructure/assets.rs @@ -5,6 +5,7 @@ use std::{ }; use anyhow::{Context as _, Result as AnyResult}; +use nomos_tracing::metrics::otlp::OtlpMetricsConfig; use nomos_tracing_service::MetricsLayer; use reqwest::Url; use serde::Serialize; @@ -55,8 +56,6 @@ pub enum AssetsError { MissingKzg { path: PathBuf }, #[error("missing Helm chart at {path}; ensure the repository is up-to-date")] MissingChart { path: PathBuf }, - #[error("missing Grafana dashboards source at {path}")] - MissingGrafanaDashboards { path: PathBuf }, #[error("failed to create temporary directory for rendered assets: {source}")] TempDir { #[source] @@ -92,9 +91,7 @@ fn kzg_mode() -> KzgMode { /// topology. 
pub fn prepare_assets( topology: &GeneratedTopology, - external_prometheus: Option<&Url>, - external_prometheus_grafana_url: Option<&Url>, - external_otlp_metrics_endpoint: Option<&Url>, + metrics_otlp_ingest_url: Option<&Url>, ) -> Result { info!( validators = topology.validators().len(), @@ -104,13 +101,7 @@ pub fn prepare_assets( let root = workspace_root().map_err(|source| AssetsError::WorkspaceRoot { source })?; let kzg_mode = kzg_mode(); - let cfgsync_yaml = render_cfgsync_config( - &root, - topology, - kzg_mode, - external_prometheus, - external_otlp_metrics_endpoint, - )?; + let cfgsync_yaml = render_cfgsync_config(&root, topology, kzg_mode, metrics_otlp_ingest_url)?; let tempdir = tempfile::Builder::new() .prefix("nomos-helm-") @@ -124,12 +115,7 @@ pub fn prepare_assets( KzgMode::InImage => None, }; let chart_path = helm_chart_path()?; - sync_grafana_dashboards(&root, &chart_path)?; - let values_yaml = render_values_yaml( - topology, - external_prometheus, - external_prometheus_grafana_url, - )?; + let values_yaml = render_values_yaml(topology)?; let values_file = write_temp_file(tempdir.path(), "values.yaml", values_yaml)?; let image = env::var("NOMOS_TESTNET_IMAGE") .unwrap_or_else(|_| String::from("public.ecr.aws/r4s5t9y4/logos/logos-blockchain:test")); @@ -164,81 +150,13 @@ pub fn prepare_assets( } const CFGSYNC_K8S_TIMEOUT_SECS: u64 = 300; -const DEFAULT_GRAFANA_NODE_PORT: u16 = 30030; const DEFAULT_IN_IMAGE_KZG_PARAMS_PATH: &str = "/opt/nomos/kzg-params/kzgrs_test_params"; -fn sync_grafana_dashboards(root: &Path, chart_path: &Path) -> Result<(), AssetsError> { - let source_dir = stack_assets_root(root).join("monitoring/grafana/dashboards"); - let dest_dir = chart_path.join("grafana/dashboards"); - - if !source_dir.exists() { - return Err(AssetsError::MissingGrafanaDashboards { path: source_dir }); - } - - fs::create_dir_all(&dest_dir).map_err(|source| AssetsError::Io { - path: dest_dir.clone(), - source, - })?; - - let mut removed = 0usize; - for 
entry in fs::read_dir(&dest_dir).map_err(|source| AssetsError::Io { - path: dest_dir.clone(), - source, - })? { - let entry = entry.map_err(|source| AssetsError::Io { - path: dest_dir.clone(), - source, - })?; - let path = entry.path(); - if path.extension().and_then(|ext| ext.to_str()) != Some("json") { - continue; - } - fs::remove_file(&path).map_err(|source| AssetsError::Io { - path: path.clone(), - source, - })?; - removed += 1; - } - - let mut copied = 0usize; - for entry in fs::read_dir(&source_dir).map_err(|source| AssetsError::Io { - path: source_dir.clone(), - source, - })? { - let entry = entry.map_err(|source| AssetsError::Io { - path: source_dir.clone(), - source, - })?; - let path = entry.path(); - if path.extension().and_then(|ext| ext.to_str()) != Some("json") { - continue; - } - let file_name = path.file_name().unwrap_or_default(); - let dest_path = dest_dir.join(file_name); - fs::copy(&path, &dest_path).map_err(|source| AssetsError::Io { - path: dest_path.clone(), - source, - })?; - copied += 1; - } - - debug!( - source = %source_dir.display(), - dest = %dest_dir.display(), - removed, - copied, - "synced Grafana dashboards into Helm chart" - ); - - Ok(()) -} - fn render_cfgsync_config( root: &Path, topology: &GeneratedTopology, kzg_mode: KzgMode, - external_prometheus: Option<&Url>, - external_otlp_metrics_endpoint: Option<&Url>, + metrics_otlp_ingest_url: Option<&Url>, ) -> Result { let cfgsync_template_path = stack_assets_root(root).join("cfgsync.yaml"); debug!(path = %cfgsync_template_path.display(), "loading cfgsync template"); @@ -254,15 +172,11 @@ fn render_cfgsync_config( .unwrap_or_else(|| DEFAULT_IN_IMAGE_KZG_PARAMS_PATH.to_string()); } - let external_metrics_endpoint = match external_otlp_metrics_endpoint { - Some(endpoint) => Some(Ok(endpoint.clone())), - None => external_prometheus.map(derive_prometheus_otlp_metrics_endpoint), - }; - - if let Some(endpoint) = external_metrics_endpoint.transpose()? 
{ - if let MetricsLayer::Otlp(ref mut config) = cfg.tracing_settings.metrics { - config.endpoint = endpoint; - } + if let Some(endpoint) = metrics_otlp_ingest_url.cloned() { + cfg.tracing_settings.metrics = MetricsLayer::Otlp(OtlpMetricsConfig { + endpoint, + host_identifier: "node".into(), + }); } cfg.timeout = cfg.timeout.max(CFGSYNC_K8S_TIMEOUT_SECS); @@ -270,16 +184,6 @@ fn render_cfgsync_config( render_cfgsync_yaml(&cfg).map_err(|source| AssetsError::Cfgsync { source }) } -fn derive_prometheus_otlp_metrics_endpoint(base: &Url) -> Result { - let base = base.as_str().trim_end_matches('/'); - let otlp_metrics = format!("{base}/api/v1/otlp/v1/metrics"); - Url::parse(&otlp_metrics).map_err(|source| AssetsError::Cfgsync { - source: anyhow::anyhow!( - "invalid OTLP metrics endpoint derived from external Prometheus url '{base}': {source}" - ), - }) -} - struct ScriptPaths { run_cfgsync: PathBuf, run_shared: PathBuf, @@ -337,16 +241,8 @@ fn helm_chart_path() -> Result { } } -fn render_values_yaml( - topology: &GeneratedTopology, - external_prometheus: Option<&Url>, - external_prometheus_grafana_url: Option<&Url>, -) -> Result { - let values = build_values( - topology, - external_prometheus, - external_prometheus_grafana_url, - ); +fn render_values_yaml(topology: &GeneratedTopology) -> Result { + let values = build_values(topology); serde_yaml::to_string(&values).map_err(|source| AssetsError::Values { source }) } @@ -402,8 +298,6 @@ struct HelmValues { cfgsync: CfgsyncValues, validators: NodeGroup, executors: NodeGroup, - prometheus: PrometheusValues, - grafana: GrafanaValues, } #[derive(Serialize)] @@ -426,72 +320,13 @@ struct NodeValues { env: BTreeMap, } -#[derive(Serialize)] -struct PrometheusValues { - enabled: bool, - #[serde(rename = "externalUrl", skip_serializing_if = "Option::is_none")] - external_url: Option, -} - -#[derive(Serialize)] -struct GrafanaValues { - enabled: bool, - image: String, - #[serde(rename = "imagePullPolicy")] - image_pull_policy: String, 
- #[serde(rename = "adminUser")] - admin_user: String, - #[serde(rename = "adminPassword")] - admin_password: String, - service: GrafanaServiceValues, -} - -#[derive(Serialize)] -struct GrafanaServiceValues { - #[serde(rename = "type")] - type_field: String, - #[serde(rename = "nodePort")] - node_port: Option, -} - -fn build_values( - topology: &GeneratedTopology, - external_prometheus: Option<&Url>, - external_prometheus_grafana_url: Option<&Url>, -) -> HelmValues { +fn build_values(topology: &GeneratedTopology) -> HelmValues { let cfgsync = CfgsyncValues { port: cfgsync_port(), }; let pol_mode = pol_proof_mode(); let image_pull_policy = env::var("NOMOS_TESTNET_IMAGE_PULL_POLICY").unwrap_or_else(|_| "IfNotPresent".into()); - let grafana_node_port = match kzg_mode() { - KzgMode::HostPath => Some(DEFAULT_GRAFANA_NODE_PORT), - KzgMode::InImage => env::var("NOMOS_GRAFANA_NODE_PORT").ok().and_then(|value| { - value - .parse::() - .ok() - .filter(|port| *port >= 30000 && *port <= 32767) - }), - }; - let grafana = GrafanaValues { - enabled: true, - image: "grafana/grafana:10.4.1".into(), - image_pull_policy: "IfNotPresent".into(), - admin_user: "admin".into(), - admin_password: "admin".into(), - service: GrafanaServiceValues { - type_field: "NodePort".into(), - node_port: grafana_node_port, - }, - }; - let prometheus_external_url = external_prometheus_grafana_url - .or(external_prometheus) - .map(|url| url.as_str().trim_end_matches('/').to_string()); - let prometheus = PrometheusValues { - enabled: prometheus_external_url.is_none(), - external_url: prometheus_external_url, - }; debug!(pol_mode, "rendering Helm values for k8s stack"); let validators = topology .validators() @@ -578,8 +413,6 @@ fn build_values( count: topology.executors().len(), nodes: executors, }, - prometheus, - grafana, } } diff --git a/testing-framework/deployers/k8s/src/infrastructure/cluster.rs b/testing-framework/deployers/k8s/src/infrastructure/cluster.rs index 4a7234c..1b964f9 100644 --- 
a/testing-framework/deployers/k8s/src/infrastructure/cluster.rs +++ b/testing-framework/deployers/k8s/src/infrastructure/cluster.rs @@ -4,7 +4,7 @@ use kube::Client; use reqwest::Url; use testing_framework_core::{ nodes::ApiClient, - scenario::{CleanupGuard, Metrics, MetricsError, NodeClients, http_probe::NodeRole}, + scenario::{CleanupGuard, NodeClients, http_probe::NodeRole}, topology::{generation::GeneratedTopology, readiness::ReadinessError}, }; use tracing::{debug, info}; @@ -15,8 +15,7 @@ use crate::{ infrastructure::assets::RunnerAssets, lifecycle::{cleanup::RunnerCleanup, logs::dump_namespace_logs}, wait::{ - ClusterPorts, ClusterReady, HostPort, NodeConfigPorts, PortForwardHandle, - wait_for_cluster_ready, + ClusterPorts, ClusterReady, NodeConfigPorts, PortForwardHandle, wait_for_cluster_ready, }, }; @@ -38,8 +37,6 @@ pub struct ClusterEnvironment { validator_testing_ports: Vec, executor_api_ports: Vec, executor_testing_ports: Vec, - prometheus: Option, - grafana: Option, port_forwards: Vec, } @@ -68,8 +65,6 @@ impl ClusterEnvironment { validator_testing_ports, executor_api_ports, executor_testing_ports, - prometheus: ports.prometheus.clone(), - grafana: ports.grafana.clone(), port_forwards, } } @@ -105,14 +100,6 @@ impl ClusterEnvironment { &self.release } - pub fn prometheus_endpoint(&self) -> Option<&HostPort> { - self.prometheus.as_ref() - } - - pub fn grafana_endpoint(&self) -> Option<&HostPort> { - self.grafana.as_ref() - } - pub fn validator_ports(&self) -> (&[u16], &[u16]) { (&self.validator_api_ports, &self.validator_testing_ports) } @@ -229,16 +216,6 @@ pub fn build_node_clients(cluster: &ClusterEnvironment) -> Result Result { - let url = cluster_host_url(&endpoint.host, endpoint.port) - .map_err(|err| MetricsError::new(format!("invalid prometheus url: {err}")))?; - Metrics::from_prometheus(url) -} - -pub fn metrics_handle_from_url(url: Url) -> Result { - Metrics::from_prometheus(url) -} - pub async fn ensure_cluster_readiness( descriptors: 
&GeneratedTopology, cluster: &ClusterEnvironment, @@ -324,7 +301,6 @@ pub async fn wait_for_ports_or_cleanup( namespace: &str, release: &str, specs: &PortSpecs, - prometheus_enabled: bool, cleanup_guard: &mut Option, ) -> Result { info!( @@ -340,13 +316,11 @@ pub async fn wait_for_ports_or_cleanup( release, &specs.validators, &specs.executors, - prometheus_enabled, ) .await { Ok(ports) => { info!( - prometheus = ?ports.ports.prometheus, validator_ports = ?ports.ports.validators, executor_ports = ?ports.ports.executors, "cluster port-forwards established" diff --git a/testing-framework/deployers/k8s/src/lifecycle/wait/grafana.rs b/testing-framework/deployers/k8s/src/lifecycle/wait/grafana.rs deleted file mode 100644 index b0241b1..0000000 --- a/testing-framework/deployers/k8s/src/lifecycle/wait/grafana.rs +++ /dev/null @@ -1,36 +0,0 @@ -use tokio::time::sleep; - -use super::{ClusterWaitError, node_http_probe_timeout, node_http_timeout}; -use crate::host::node_host; - -const GRAFANA_HTTP_POLL_INTERVAL: std::time::Duration = std::time::Duration::from_secs(1); - -pub async fn wait_for_grafana_http_nodeport(port: u16) -> Result<(), ClusterWaitError> { - let host = node_host(); - wait_for_grafana_http(&host, port, node_http_probe_timeout()).await -} - -pub async fn wait_for_grafana_http_port_forward(port: u16) -> Result<(), ClusterWaitError> { - wait_for_grafana_http("127.0.0.1", port, node_http_timeout()).await -} - -async fn wait_for_grafana_http( - host: &str, - port: u16, - timeout: std::time::Duration, -) -> Result<(), ClusterWaitError> { - let client = reqwest::Client::new(); - let url = format!("http://{host}:{port}/api/health"); - - let attempts = timeout.as_secs(); - for _ in 0..attempts { - if let Ok(resp) = client.get(&url).send().await - && resp.status().is_success() - { - return Ok(()); - } - sleep(GRAFANA_HTTP_POLL_INTERVAL).await; - } - - Err(ClusterWaitError::GrafanaTimeout { port }) -} diff --git 
a/testing-framework/deployers/k8s/src/lifecycle/wait/mod.rs b/testing-framework/deployers/k8s/src/lifecycle/wait/mod.rs index 544d9da..53353f6 100644 --- a/testing-framework/deployers/k8s/src/lifecycle/wait/mod.rs +++ b/testing-framework/deployers/k8s/src/lifecycle/wait/mod.rs @@ -4,9 +4,7 @@ use kube::Error as KubeError; use testing_framework_core::{ constants::{ DEFAULT_HTTP_POLL_INTERVAL, DEFAULT_K8S_DEPLOYMENT_TIMEOUT, - DEFAULT_NODE_HTTP_PROBE_TIMEOUT, DEFAULT_NODE_HTTP_TIMEOUT, DEFAULT_PROMETHEUS_HTTP_PORT, - DEFAULT_PROMETHEUS_HTTP_PROBE_TIMEOUT, DEFAULT_PROMETHEUS_HTTP_TIMEOUT, - DEFAULT_PROMETHEUS_SERVICE_NAME, + DEFAULT_NODE_HTTP_PROBE_TIMEOUT, DEFAULT_NODE_HTTP_TIMEOUT, }, scenario::http_probe::NodeRole, }; @@ -14,11 +12,9 @@ use thiserror::Error; mod deployment; mod forwarding; -mod grafana; mod http_probe; mod orchestrator; mod ports; -mod prometheus; pub use forwarding::PortForwardHandle; pub use orchestrator::wait_for_cluster_ready; @@ -44,15 +40,13 @@ pub struct HostPort { pub port: u16, } -/// All port assignments for the cluster plus Prometheus. +/// All port assignments for the cluster. #[derive(Debug)] pub struct ClusterPorts { pub validators: Vec, pub executors: Vec, pub validator_host: String, pub executor_host: String, - pub prometheus: Option, - pub grafana: Option, } /// Success result from waiting for the cluster: host ports and forward handles. 
@@ -96,10 +90,6 @@ pub enum ClusterWaitError { port: u16, timeout: Duration, }, - #[error("timeout waiting for prometheus readiness on NodePort {port}")] - PrometheusTimeout { port: u16 }, - #[error("timeout waiting for grafana readiness on port {port}")] - GrafanaTimeout { port: u16 }, #[error("failed to start port-forward for service {service} port {port}: {source}")] PortForward { service: String, @@ -149,32 +139,6 @@ pub(crate) fn http_poll_interval() -> Duration { *HTTP_POLL_INTERVAL } -pub(crate) const PROMETHEUS_HTTP_PORT: u16 = DEFAULT_PROMETHEUS_HTTP_PORT; - -static PROMETHEUS_HTTP_TIMEOUT: LazyLock = LazyLock::new(|| { - env_duration_secs( - "K8S_RUNNER_PROMETHEUS_HTTP_TIMEOUT_SECS", - DEFAULT_PROMETHEUS_HTTP_TIMEOUT, - ) -}); - -static PROMETHEUS_HTTP_PROBE_TIMEOUT: LazyLock = LazyLock::new(|| { - env_duration_secs( - "K8S_RUNNER_PROMETHEUS_HTTP_PROBE_TIMEOUT_SECS", - DEFAULT_PROMETHEUS_HTTP_PROBE_TIMEOUT, - ) -}); - -pub(crate) fn prometheus_http_timeout() -> Duration { - *PROMETHEUS_HTTP_TIMEOUT -} - -pub(crate) fn prometheus_http_probe_timeout() -> Duration { - *PROMETHEUS_HTTP_PROBE_TIMEOUT -} - -pub(crate) const PROMETHEUS_SERVICE_NAME: &str = DEFAULT_PROMETHEUS_SERVICE_NAME; - fn env_duration_secs(key: &str, default: Duration) -> Duration { env::var(key) .ok() diff --git a/testing-framework/deployers/k8s/src/lifecycle/wait/orchestrator.rs b/testing-framework/deployers/k8s/src/lifecycle/wait/orchestrator.rs index 0ca66cb..3db58e0 100644 --- a/testing-framework/deployers/k8s/src/lifecycle/wait/orchestrator.rs +++ b/testing-framework/deployers/k8s/src/lifecycle/wait/orchestrator.rs @@ -1,31 +1,20 @@ use kube::Client; use testing_framework_core::scenario::http_probe::NodeRole; -use super::{ - ClusterPorts, ClusterReady, ClusterWaitError, HostPort, NodeConfigPorts, PROMETHEUS_HTTP_PORT, - PROMETHEUS_SERVICE_NAME, prometheus_http_probe_timeout, -}; +use super::{ClusterPorts, ClusterReady, ClusterWaitError, NodeConfigPorts}; use crate::lifecycle::wait::{ 
deployment::wait_for_deployment_ready, - forwarding::{ - PortForwardHandle, PortForwardSpawn, kill_port_forwards, port_forward_group, - port_forward_service, - }, - grafana::{wait_for_grafana_http_nodeport, wait_for_grafana_http_port_forward}, + forwarding::{PortForwardHandle, kill_port_forwards, port_forward_group}, http_probe::{wait_for_node_http_nodeport, wait_for_node_http_port_forward}, - ports::{discover_node_ports, find_node_port}, - prometheus::{wait_for_prometheus_http_nodeport, wait_for_prometheus_http_port_forward}, + ports::discover_node_ports, }; -const GRAFANA_HTTP_PORT: u16 = 3000; - pub async fn wait_for_cluster_ready( client: &Client, namespace: &str, release: &str, validator_ports: &[NodeConfigPorts], executor_ports: &[NodeConfigPorts], - prometheus_enabled: bool, ) -> Result { if validator_ports.is_empty() { return Err(ClusterWaitError::MissingValidator); @@ -108,75 +97,12 @@ pub async fn wait_for_cluster_ready( } } - let mut prometheus = None; - if prometheus_enabled { - let mut prometheus_port = find_node_port( - client, - namespace, - PROMETHEUS_SERVICE_NAME, - PROMETHEUS_HTTP_PORT, - ) - .await?; - let mut prometheus_host = crate::host::node_host(); - if wait_for_prometheus_http_nodeport(prometheus_port, prometheus_http_probe_timeout()) - .await - .is_err() - { - let PortForwardSpawn { local_port, handle } = - port_forward_service(namespace, PROMETHEUS_SERVICE_NAME, PROMETHEUS_HTTP_PORT) - .map_err(|err| { - kill_port_forwards(&mut port_forwards); - err - })?; - prometheus_port = local_port; - prometheus_host = "127.0.0.1".to_owned(); - port_forwards.push(handle); - if let Err(err) = wait_for_prometheus_http_port_forward(prometheus_port).await { - return Err(cleanup_port_forwards(&mut port_forwards, err)); - } - } - prometheus = Some(HostPort { - host: prometheus_host, - port: prometheus_port, - }); - } - - let mut grafana = None; - let grafana_service = format!("{release}-grafana"); - if let Ok(node_port) = - find_node_port(client, 
namespace, &grafana_service, GRAFANA_HTTP_PORT).await - { - let mut grafana_host = crate::host::node_host(); - let mut grafana_port = node_port; - if wait_for_grafana_http_nodeport(grafana_port).await.is_err() { - let PortForwardSpawn { local_port, handle } = - port_forward_service(namespace, &grafana_service, GRAFANA_HTTP_PORT).map_err( - |err| { - kill_port_forwards(&mut port_forwards); - err - }, - )?; - grafana_host = "127.0.0.1".to_owned(); - grafana_port = local_port; - port_forwards.push(handle); - if let Err(err) = wait_for_grafana_http_port_forward(grafana_port).await { - return Err(cleanup_port_forwards(&mut port_forwards, err)); - } - } - grafana = Some(HostPort { - host: grafana_host, - port: grafana_port, - }); - } - Ok(ClusterReady { ports: ClusterPorts { validators: validator_allocations, executors: executor_allocations, validator_host, executor_host, - prometheus, - grafana, }, port_forwards, }) diff --git a/testing-framework/deployers/k8s/src/lifecycle/wait/prometheus.rs b/testing-framework/deployers/k8s/src/lifecycle/wait/prometheus.rs deleted file mode 100644 index 328eab3..0000000 --- a/testing-framework/deployers/k8s/src/lifecycle/wait/prometheus.rs +++ /dev/null @@ -1,39 +0,0 @@ -use tokio::time::sleep; - -use super::{ClusterWaitError, prometheus_http_timeout}; -use crate::host::node_host; - -const PROMETHEUS_HTTP_POLL_INTERVAL: std::time::Duration = std::time::Duration::from_secs(1); - -pub async fn wait_for_prometheus_http_nodeport( - port: u16, - timeout: std::time::Duration, -) -> Result<(), ClusterWaitError> { - let host = node_host(); - wait_for_prometheus_http(&host, port, timeout).await -} - -pub async fn wait_for_prometheus_http_port_forward(port: u16) -> Result<(), ClusterWaitError> { - wait_for_prometheus_http("127.0.0.1", port, prometheus_http_timeout()).await -} - -async fn wait_for_prometheus_http( - host: &str, - port: u16, - timeout: std::time::Duration, -) -> Result<(), ClusterWaitError> { - let client = 
reqwest::Client::new(); - let url = format!("http://{host}:{port}/-/ready"); - - let attempts = timeout.as_secs(); - for _ in 0..attempts { - if let Ok(resp) = client.get(&url).send().await - && resp.status().is_success() - { - return Ok(()); - } - sleep(PROMETHEUS_HTTP_POLL_INTERVAL).await; - } - - Err(ClusterWaitError::PrometheusTimeout { port }) -} diff --git a/testing-framework/workflows/src/builder/mod.rs b/testing-framework/workflows/src/builder/mod.rs index b9e233c..267a482 100644 --- a/testing-framework/workflows/src/builder/mod.rs +++ b/testing-framework/workflows/src/builder/mod.rs @@ -132,6 +132,12 @@ pub trait ObservabilityBuilderExt: Sized { url: &str, ) -> CoreScenarioBuilder; + /// Optional Grafana base URL for printing/logging (human access). + fn with_grafana_url(self, url: reqwest::Url) -> CoreScenarioBuilder; + + /// Convenience wrapper that parses a URL string (panics if invalid). + fn with_grafana_url_str(self, url: &str) -> CoreScenarioBuilder; + #[deprecated(note = "use with_metrics_query_url")] fn with_external_prometheus( self, @@ -190,6 +196,7 @@ impl ObservabilityBuilderExt for CoreScenarioBuilder<()> { metrics_query_url: Some(url), metrics_query_grafana_url: None, metrics_otlp_ingest_url: None, + grafana_url: None, }) } @@ -206,6 +213,7 @@ impl ObservabilityBuilderExt for CoreScenarioBuilder<()> { metrics_query_url: None, metrics_query_grafana_url: None, metrics_otlp_ingest_url: Some(url), + grafana_url: None, }) } @@ -225,6 +233,7 @@ impl ObservabilityBuilderExt for CoreScenarioBuilder<()> { metrics_query_url: None, metrics_query_grafana_url: Some(url), metrics_otlp_ingest_url: None, + grafana_url: None, }) } @@ -235,6 +244,20 @@ impl ObservabilityBuilderExt for CoreScenarioBuilder<()> { let parsed = reqwest::Url::parse(url).expect("metrics query grafana url must be valid"); self.with_metrics_query_grafana_url(parsed) } + + fn with_grafana_url(self, url: reqwest::Url) -> CoreScenarioBuilder { + 
self.with_capabilities(ObservabilityCapability { + metrics_query_url: None, + metrics_query_grafana_url: None, + metrics_otlp_ingest_url: None, + grafana_url: Some(url), + }) + } + + fn with_grafana_url_str(self, url: &str) -> CoreScenarioBuilder { + let parsed = reqwest::Url::parse(url).expect("grafana url must be valid"); + self.with_grafana_url(parsed) + } } impl ObservabilityBuilderExt for CoreScenarioBuilder { @@ -282,6 +305,19 @@ impl ObservabilityBuilderExt for CoreScenarioBuilder { let parsed = reqwest::Url::parse(url).expect("metrics query grafana url must be valid"); self.with_metrics_query_grafana_url(parsed) } + + fn with_grafana_url( + mut self, + url: reqwest::Url, + ) -> CoreScenarioBuilder { + self.capabilities_mut().grafana_url = Some(url); + self + } + + fn with_grafana_url_str(self, url: &str) -> CoreScenarioBuilder { + let parsed = reqwest::Url::parse(url).expect("grafana url must be valid"); + self.with_grafana_url(parsed) + } } /// Builder for transaction workloads.