mirror of
https://github.com/logos-blockchain/logos-blockchain-testing.git
synced 2026-01-02 13:23:13 +00:00
refactor(observability): remove embedded prometheus/grafana
Deployers no longer provision Prometheus/Grafana; metrics query/ingest now come from explicit URLs via env/flags.
This commit is contained in:
parent
3965669f12
commit
e05bf5e0bd
2
Cargo.lock
generated
2
Cargo.lock
generated
@ -7267,6 +7267,7 @@ dependencies = [
|
||||
"key-management-system-service",
|
||||
"nomos-core",
|
||||
"nomos-ledger",
|
||||
"nomos-tracing",
|
||||
"nomos-tracing-service",
|
||||
"reqwest",
|
||||
"serde",
|
||||
@ -7290,6 +7291,7 @@ dependencies = [
|
||||
"async-trait",
|
||||
"k8s-openapi",
|
||||
"kube",
|
||||
"nomos-tracing",
|
||||
"nomos-tracing-service",
|
||||
"reqwest",
|
||||
"serde",
|
||||
|
||||
@ -90,7 +90,7 @@ Three deployer implementations:
|
||||
| `K8sDeployer` | Kubernetes Helm | Cluster + image loaded | Not yet |
|
||||
|
||||
**Compose-specific features:**
|
||||
- Includes Prometheus at `http://localhost:9090` (override via `TEST_FRAMEWORK_PROMETHEUS_PORT`)
|
||||
- Observability is external (set `NOMOS_METRICS_QUERY_URL` / `NOMOS_METRICS_OTLP_INGEST_URL` / `NOMOS_GRAFANA_URL` as needed)
|
||||
- Optional OTLP trace/metrics endpoints (`NOMOS_OTLP_ENDPOINT`, `NOMOS_OTLP_METRICS_ENDPOINT`)
|
||||
- Node control for chaos testing (restart validators/executors)
|
||||
|
||||
@ -112,9 +112,9 @@ KZG parameters required for DA workloads:
|
||||
|
||||
### Compose Stack
|
||||
Templates and configs in `testing-framework/runners/compose/assets/`:
|
||||
- `docker-compose.yml.tera` — Stack template (validators, executors, Prometheus, Grafana)
|
||||
- `docker-compose.yml.tera` — Stack template (validators, executors)
|
||||
- Cfgsync config: `testing-framework/assets/stack/cfgsync.yaml`
|
||||
- Monitoring: `testing-framework/assets/stack/monitoring/prometheus.yml`
|
||||
- Monitoring assets (not deployed by the framework): `testing-framework/assets/stack/monitoring/`
|
||||
|
||||
## Logging Architecture
|
||||
|
||||
@ -134,14 +134,14 @@ Templates and configs in `testing-framework/runners/compose/assets/`:
|
||||
|
||||
## Observability
|
||||
|
||||
**Prometheus (Compose + K8s):**
|
||||
- Exposed at `http://localhost:9090` (configurable)
|
||||
- Scrapes all validator and executor metrics
|
||||
- Accessible in expectations: `ctx.telemetry().prometheus().map(|p| p.base_url())`
|
||||
**Prometheus-compatible metrics querying (optional):**
|
||||
- The framework does **not** deploy Prometheus/Grafana.
|
||||
- Provide a Prometheus-compatible base URL (PromQL API) via `NOMOS_METRICS_QUERY_URL`.
|
||||
- Accessible in expectations when configured: `ctx.telemetry().prometheus().map(|p| p.base_url())`
|
||||
|
||||
**Grafana dashboards (Compose + K8s):**
|
||||
- Provisioned automatically; URL is printed in `TESTNET_ENDPOINTS` when using `scripts/run-examples.sh`
|
||||
- Default credentials: `admin` / `admin`
|
||||
**Grafana dashboards (optional):**
|
||||
- Dashboards live in `testing-framework/assets/stack/monitoring/grafana/dashboards/` and can be imported into your Grafana.
|
||||
- If you set `NOMOS_GRAFANA_URL`, the deployer prints it in `TESTNET_ENDPOINTS`.
|
||||
|
||||
**Node APIs:**
|
||||
- HTTP endpoints per node for consensus info, network status, DA membership
|
||||
|
||||
@ -181,7 +181,9 @@ cargo run -p runner-examples --bin compose_runner
|
||||
- `POL_PROOF_DEV_MODE=true` — **Required** for all runners
|
||||
- `NOMOS_DEMO_VALIDATORS=3` / `NOMOS_DEMO_EXECUTORS=2` / `NOMOS_DEMO_RUN_SECS=120` — Topology overrides
|
||||
- `COMPOSE_NODE_PAIRS=1x1` — Alternative topology format: "validators×executors"
|
||||
- `TEST_FRAMEWORK_PROMETHEUS_PORT=9091` — Override Prometheus port (default: 9090)
|
||||
- `NOMOS_METRICS_QUERY_URL` — Prometheus-compatible base URL for the runner process to query (optional)
|
||||
- `NOMOS_METRICS_OTLP_INGEST_URL` — Full OTLP HTTP ingest URL for node metrics export (optional)
|
||||
- `NOMOS_GRAFANA_URL` — Grafana base URL for printing/logging (optional)
|
||||
- `COMPOSE_RUNNER_HOST=127.0.0.1` — Host address for port mappings
|
||||
- `COMPOSE_RUNNER_PRESERVE=1` — Keep containers running after test
|
||||
- `NOMOS_LOG_LEVEL=debug` / `NOMOS_LOG_FILTER=...` — Control node log verbosity (stdout/stderr)
|
||||
@ -189,7 +191,7 @@ cargo run -p runner-examples --bin compose_runner
|
||||
|
||||
**Compose-specific features:**
|
||||
- **Node control support**: Only runner that supports chaos testing (`.enable_node_control()` + chaos workloads)
|
||||
- **Prometheus observability**: Metrics at `http://localhost:9090`
|
||||
- **Observability is external**: Set `NOMOS_METRICS_*` / `NOMOS_GRAFANA_URL` to enable telemetry links and querying
|
||||
|
||||
**Important:**
|
||||
- Containers expect KZG parameters at `/kzgrs_test_params/kzgrs_test_params` (note the repeated filename)
|
||||
@ -229,21 +231,30 @@ cargo run -p runner-examples --bin k8s_runner
|
||||
- `NOMOS_TESTNET_IMAGE` — Image tag (required)
|
||||
- `POL_PROOF_DEV_MODE=true` — **Required** for all runners
|
||||
- `NOMOS_DEMO_VALIDATORS` / `NOMOS_DEMO_EXECUTORS` / `NOMOS_DEMO_RUN_SECS` — Topology overrides
|
||||
- `K8S_RUNNER_EXTERNAL_PROMETHEUS_URL` (or `NOMOS_EXTERNAL_PROMETHEUS_URL`) — Reuse an existing Prometheus and skip deploying the in-chart Prometheus; also points node OTLP metrics export and the in-cluster Grafana datasource at that Prometheus
|
||||
- `NOMOS_METRICS_QUERY_URL` — Prometheus-compatible base URL for the runner process to query (PromQL)
|
||||
- `NOMOS_METRICS_OTLP_INGEST_URL` — Full OTLP HTTP ingest URL for node metrics export (optional)
|
||||
- `NOMOS_GRAFANA_URL` — Grafana base URL for printing/logging (optional)
|
||||
|
||||
**External Prometheus (optional):**
|
||||
**Metrics + Grafana (optional):**
|
||||
```bash
|
||||
export K8S_RUNNER_EXTERNAL_PROMETHEUS_URL=http://your-prometheus:9090
|
||||
export NOMOS_METRICS_QUERY_URL=http://your-prometheus:9090
|
||||
# Prometheus OTLP receiver example:
|
||||
export NOMOS_METRICS_OTLP_INGEST_URL=http://your-prometheus:9090/api/v1/otlp/v1/metrics
|
||||
# Optional: print a Grafana link in TESTNET_ENDPOINTS
|
||||
export NOMOS_GRAFANA_URL=http://your-grafana:3000
|
||||
cargo run -p runner-examples --bin k8s_runner
|
||||
```
|
||||
|
||||
Notes:
|
||||
- The runner config expects Prometheus to accept OTLP metrics at `/api/v1/otlp/v1/metrics` (the in-chart Prometheus is started with `--web.enable-otlp-receiver` and `--enable-feature=otlp-write-receiver`).
|
||||
- Use a URL reachable from inside the cluster (for example a `Service` DNS name like `http://prometheus.monitoring:9090`).
|
||||
- `NOMOS_METRICS_QUERY_URL` must be reachable from the runner process (often via `kubectl port-forward`).
|
||||
- `NOMOS_METRICS_OTLP_INGEST_URL` must be reachable from nodes (pods/containers) and is backend-specific (Prometheus vs VictoriaMetrics paths differ).
|
||||
|
||||
**Via `scripts/run-examples.sh` (optional):**
|
||||
```bash
|
||||
scripts/run-examples.sh -t 60 -v 1 -e 1 k8s --external-prometheus http://your-prometheus:9090
|
||||
scripts/run-examples.sh -t 60 -v 1 -e 1 k8s \
|
||||
--metrics-query-url http://your-prometheus:9090 \
|
||||
--metrics-otlp-ingest-url http://your-prometheus:9090/api/v1/otlp/v1/metrics \
|
||||
--grafana-url http://your-grafana:3000
|
||||
```
|
||||
|
||||
**In code (optional):**
|
||||
@ -252,7 +263,8 @@ use testing_framework_core::scenario::ScenarioBuilder;
|
||||
use testing_framework_workflows::ObservabilityBuilderExt as _;
|
||||
|
||||
let plan = ScenarioBuilder::with_node_counts(1, 1)
|
||||
.with_external_prometheus_str("http://your-prometheus:9090")
|
||||
.with_metrics_query_url_str("http://your-prometheus:9090")
|
||||
.with_metrics_otlp_ingest_url_str("http://your-prometheus:9090/api/v1/otlp/v1/metrics")
|
||||
.build();
|
||||
```
|
||||
|
||||
@ -484,7 +496,6 @@ cargo run -p runner-examples --bin compose_runner
|
||||
- `COMPOSE_RUNNER_HOST=127.0.0.1` — host used for readiness probes (override for remote Docker daemons / VM networking)
|
||||
- `COMPOSE_RUNNER_HOST_GATEWAY=host.docker.internal:host-gateway` — controls the `extra_hosts` entry injected into compose (set to `disable` to omit)
|
||||
- `TESTNET_RUNNER_PRESERVE=1` — alias for `COMPOSE_RUNNER_PRESERVE=1`
|
||||
- `COMPOSE_GRAFANA_PORT=<port>` — pin Grafana to a fixed host port instead of ephemeral assignment
|
||||
- `COMPOSE_RUNNER_HTTP_TIMEOUT_SECS=<secs>` — override compose node HTTP readiness timeout
|
||||
|
||||
**Note:** Container names follow pattern `nomos-compose-{uuid}-validator-{index}-1` where `{uuid}` changes per run.
|
||||
@ -553,15 +564,15 @@ cargo run -p runner-examples --bin local_runner
|
||||
|
||||
Runners expose metrics and node HTTP endpoints for expectation code and debugging:
|
||||
|
||||
**Prometheus (Compose + K8s):**
|
||||
- Default: `http://localhost:9090`
|
||||
- Override: `TEST_FRAMEWORK_PROMETHEUS_PORT=9091`
|
||||
- Note: the host port can vary if `9090` is unavailable; prefer the printed `TESTNET_ENDPOINTS` line as the source of truth.
|
||||
- Access from expectations: `ctx.telemetry().prometheus().map(|p| p.base_url())`
|
||||
**Prometheus-compatible metrics querying (optional):**
|
||||
- The framework does **not** deploy Prometheus.
|
||||
- Provide `NOMOS_METRICS_QUERY_URL` (PromQL base URL) to enable `ctx.telemetry()` queries.
|
||||
- Access from expectations when configured: `ctx.telemetry().prometheus().map(|p| p.base_url())`
|
||||
|
||||
**Grafana dashboards (Compose + K8s):**
|
||||
- The deployer prints the Grafana base URL in `TESTNET_ENDPOINTS`.
|
||||
- Default credentials are `admin` / `admin`.
|
||||
**Grafana (optional):**
|
||||
- The framework does **not** deploy Grafana.
|
||||
- If you set `NOMOS_GRAFANA_URL`, the deployer prints it in `TESTNET_ENDPOINTS`.
|
||||
- Dashboards live in `testing-framework/assets/stack/monitoring/grafana/dashboards/` for import into your Grafana.
|
||||
|
||||
**Node APIs:**
|
||||
- Access from expectations: `ctx.node_clients().validator_clients().get(0)`
|
||||
|
||||
@ -109,7 +109,7 @@ async fn run_compose_case(
|
||||
};
|
||||
|
||||
if !runner.context().telemetry().is_configured() {
|
||||
warn!("compose runner should expose prometheus metrics");
|
||||
warn!("metrics querying is disabled; set NOMOS_METRICS_QUERY_URL to enable PromQL queries");
|
||||
}
|
||||
|
||||
info!("running scenario");
|
||||
|
||||
@ -104,7 +104,7 @@ async fn run_k8s_case(validators: usize, executors: usize, run_duration: Duratio
|
||||
};
|
||||
|
||||
if !runner.context().telemetry().is_configured() {
|
||||
warn!("k8s runner should expose prometheus metrics");
|
||||
warn!("metrics querying is disabled; set NOMOS_METRICS_QUERY_URL to enable PromQL queries");
|
||||
}
|
||||
|
||||
let validator_clients = runner.context().node_clients().validator_clients().to_vec();
|
||||
|
||||
@ -38,12 +38,13 @@ Options:
|
||||
-v, --validators N Number of validators (required)
|
||||
-e, --executors N Number of executors (required)
|
||||
--bundle PATH Convenience alias for setting NOMOS_BINARIES_TAR=PATH
|
||||
--metrics-query-url URL (k8s) PromQL base URL the runner process can query (often localhost port-forward)
|
||||
--metrics-query-grafana-url URL (k8s) PromQL base URL reachable from inside the cluster (Grafana datasource)
|
||||
--metrics-otlp-ingest-url URL (k8s) Full OTLP HTTP ingest URL for node metrics export
|
||||
--external-prometheus URL (k8s) Alias for --metrics-query-url
|
||||
--external-prometheus-grafana-url URL (k8s) Alias for --metrics-query-grafana-url
|
||||
--external-otlp-metrics-endpoint URL (k8s) Alias for --metrics-otlp-ingest-url
|
||||
--metrics-query-url URL PromQL base URL the runner process can query (optional)
|
||||
--metrics-query-grafana-url URL PromQL base URL for a Grafana datasource (optional)
|
||||
--metrics-otlp-ingest-url URL Full OTLP HTTP ingest URL for node metrics export (optional)
|
||||
--grafana-url URL Grafana base URL for printing/logging (optional)
|
||||
--external-prometheus URL Alias for --metrics-query-url
|
||||
--external-prometheus-grafana-url URL Alias for --metrics-query-grafana-url
|
||||
--external-otlp-metrics-endpoint URL Alias for --metrics-otlp-ingest-url
|
||||
--local Use a local Docker image tag (default for docker-desktop k8s)
|
||||
--ecr Use an ECR image reference (default for non-docker-desktop k8s)
|
||||
--no-image-build Skip rebuilding the compose/k8s image (sets NOMOS_SKIP_IMAGE_BUILD=1)
|
||||
@ -64,6 +65,8 @@ Environment:
|
||||
NOMOS_METRICS_QUERY_GRAFANA_URL Alias for K8S_RUNNER_METRICS_QUERY_GRAFANA_URL
|
||||
K8S_RUNNER_METRICS_OTLP_INGEST_URL Full OTLP HTTP ingest URL for node metrics export
|
||||
NOMOS_METRICS_OTLP_INGEST_URL Alias for K8S_RUNNER_METRICS_OTLP_INGEST_URL
|
||||
K8S_RUNNER_GRAFANA_URL Grafana base URL for printing/logging (optional)
|
||||
NOMOS_GRAFANA_URL Alias for K8S_RUNNER_GRAFANA_URL
|
||||
|
||||
Deprecated env vars (still supported):
|
||||
K8S_RUNNER_EXTERNAL_PROMETHEUS_URL, NOMOS_EXTERNAL_PROMETHEUS_URL
|
||||
@ -115,6 +118,7 @@ run_examples::parse_args() {
|
||||
METRICS_QUERY_URL=""
|
||||
METRICS_QUERY_GRAFANA_URL=""
|
||||
METRICS_OTLP_INGEST_URL=""
|
||||
GRAFANA_URL=""
|
||||
|
||||
RUN_SECS_RAW_SPECIFIED=""
|
||||
|
||||
@ -184,6 +188,14 @@ run_examples::parse_args() {
|
||||
METRICS_OTLP_INGEST_URL="${1#*=}"
|
||||
shift
|
||||
;;
|
||||
--grafana-url)
|
||||
GRAFANA_URL="${2:-}"
|
||||
shift 2
|
||||
;;
|
||||
--grafana-url=*)
|
||||
GRAFANA_URL="${1#*=}"
|
||||
shift
|
||||
;;
|
||||
--external-prometheus)
|
||||
METRICS_QUERY_URL="${2:-}"
|
||||
shift 2
|
||||
@ -262,18 +274,6 @@ run_examples::parse_args() {
|
||||
run_examples::fail_with_usage "executors must be a non-negative integer (pass -e/--executors)"
|
||||
fi
|
||||
|
||||
if [ -n "${METRICS_QUERY_URL}" ] && [ "${MODE}" != "k8s" ]; then
|
||||
echo "Warning: --metrics-query-url is only used in k8s mode; ignoring." >&2
|
||||
METRICS_QUERY_URL=""
|
||||
fi
|
||||
if [ -n "${METRICS_QUERY_GRAFANA_URL}" ] && [ "${MODE}" != "k8s" ]; then
|
||||
echo "Warning: --metrics-query-grafana-url is only used in k8s mode; ignoring." >&2
|
||||
METRICS_QUERY_GRAFANA_URL=""
|
||||
fi
|
||||
if [ -n "${METRICS_OTLP_INGEST_URL}" ] && [ "${MODE}" != "k8s" ]; then
|
||||
echo "Warning: --metrics-otlp-ingest-url is only used in k8s mode; ignoring." >&2
|
||||
METRICS_OTLP_INGEST_URL=""
|
||||
fi
|
||||
}
|
||||
|
||||
run_examples::select_image() {
|
||||
@ -582,17 +582,21 @@ run_examples::run() {
|
||||
export NOMOS_DEMO_VALIDATORS="${DEMO_VALIDATORS}"
|
||||
export NOMOS_DEMO_EXECUTORS="${DEMO_EXECUTORS}"
|
||||
|
||||
if [ "${MODE}" = "k8s" ] && [ -n "${METRICS_QUERY_URL}" ]; then
|
||||
export K8S_RUNNER_METRICS_QUERY_URL="${METRICS_QUERY_URL}"
|
||||
if [ -n "${METRICS_QUERY_URL}" ]; then
|
||||
export NOMOS_METRICS_QUERY_URL="${METRICS_QUERY_URL}"
|
||||
export K8S_RUNNER_METRICS_QUERY_URL="${METRICS_QUERY_URL}"
|
||||
fi
|
||||
if [ "${MODE}" = "k8s" ] && [ -n "${METRICS_QUERY_GRAFANA_URL}" ]; then
|
||||
export K8S_RUNNER_METRICS_QUERY_GRAFANA_URL="${METRICS_QUERY_GRAFANA_URL}"
|
||||
if [ -n "${METRICS_QUERY_GRAFANA_URL}" ]; then
|
||||
export NOMOS_METRICS_QUERY_GRAFANA_URL="${METRICS_QUERY_GRAFANA_URL}"
|
||||
export K8S_RUNNER_METRICS_QUERY_GRAFANA_URL="${METRICS_QUERY_GRAFANA_URL}"
|
||||
fi
|
||||
if [ "${MODE}" = "k8s" ] && [ -n "${METRICS_OTLP_INGEST_URL}" ]; then
|
||||
export K8S_RUNNER_METRICS_OTLP_INGEST_URL="${METRICS_OTLP_INGEST_URL}"
|
||||
if [ -n "${METRICS_OTLP_INGEST_URL}" ]; then
|
||||
export NOMOS_METRICS_OTLP_INGEST_URL="${METRICS_OTLP_INGEST_URL}"
|
||||
export K8S_RUNNER_METRICS_OTLP_INGEST_URL="${METRICS_OTLP_INGEST_URL}"
|
||||
fi
|
||||
if [ -n "${GRAFANA_URL}" ]; then
|
||||
export NOMOS_GRAFANA_URL="${GRAFANA_URL}"
|
||||
export K8S_RUNNER_GRAFANA_URL="${GRAFANA_URL}"
|
||||
fi
|
||||
|
||||
echo "==> Running ${BIN} for ${RUN_SECS}s (mode=${MODE}, image=${IMAGE})"
|
||||
|
||||
@ -42,8 +42,6 @@ tracing_settings:
|
||||
filters:
|
||||
nomos: debug
|
||||
cryptarchia: debug
|
||||
metrics: !Otlp
|
||||
endpoint: http://prometheus:9090/api/v1/otlp/v1/metrics
|
||||
host_identifier: node
|
||||
metrics: None
|
||||
console: None
|
||||
level: INFO
|
||||
|
||||
@ -21,18 +21,6 @@ pub const DEFAULT_NODE_HTTP_PROBE_TIMEOUT: Duration = Duration::from_secs(30);
|
||||
/// Default Kubernetes deployment readiness timeout.
|
||||
pub const DEFAULT_K8S_DEPLOYMENT_TIMEOUT: Duration = Duration::from_secs(180);
|
||||
|
||||
/// Default Prometheus HTTP port.
|
||||
pub const DEFAULT_PROMETHEUS_HTTP_PORT: u16 = 9090;
|
||||
|
||||
/// Default Prometheus HTTP timeout.
|
||||
pub const DEFAULT_PROMETHEUS_HTTP_TIMEOUT: Duration = Duration::from_secs(240);
|
||||
|
||||
/// Default Prometheus HTTP probe timeout for NodePort checks.
|
||||
pub const DEFAULT_PROMETHEUS_HTTP_PROBE_TIMEOUT: Duration = Duration::from_secs(30);
|
||||
|
||||
/// Default Prometheus service name.
|
||||
pub const DEFAULT_PROMETHEUS_SERVICE_NAME: &str = "prometheus";
|
||||
|
||||
/// Default API port used by nodes.
|
||||
pub const DEFAULT_API_PORT: u16 = 18080;
|
||||
|
||||
|
||||
@ -8,9 +8,6 @@ use super::DynError;
|
||||
pub struct NodeControlCapability;
|
||||
|
||||
/// Optional observability settings attached to a scenario.
|
||||
///
|
||||
/// Runners may use this to decide whether to provision in-cluster Prometheus or
|
||||
/// reuse an existing endpoint.
|
||||
#[derive(Clone, Debug, Default)]
|
||||
pub struct ObservabilityCapability {
|
||||
/// Prometheus-compatible base URL used by the *runner process* to query
|
||||
@ -24,6 +21,8 @@ pub struct ObservabilityCapability {
|
||||
/// Full OTLP HTTP metrics ingest endpoint used by *nodes* to export metrics
|
||||
/// (backend-specific host and path).
|
||||
pub metrics_otlp_ingest_url: Option<Url>,
|
||||
/// Optional Grafana base URL for printing/logging (human access).
|
||||
pub grafana_url: Option<Url>,
|
||||
}
|
||||
|
||||
/// Trait implemented by scenario capability markers to signal whether node
|
||||
|
||||
@ -5,6 +5,7 @@ pub mod cfgsync;
|
||||
mod definition;
|
||||
mod expectation;
|
||||
pub mod http_probe;
|
||||
mod observability;
|
||||
mod runtime;
|
||||
mod workload;
|
||||
|
||||
@ -15,6 +16,7 @@ pub use capabilities::{
|
||||
};
|
||||
pub use definition::{Builder, Scenario, ScenarioBuilder, TopologyConfigurator};
|
||||
pub use expectation::Expectation;
|
||||
pub use observability::{ObservabilityCapabilityProvider, ObservabilityInputs};
|
||||
pub use runtime::{
|
||||
BlockFeed, BlockFeedTask, BlockRecord, BlockStats, CleanupGuard, Deployer, NodeClients,
|
||||
RunContext, RunHandle, RunMetrics, Runner, ScenarioError,
|
||||
|
||||
145
testing-framework/core/src/scenario/observability.rs
Normal file
145
testing-framework/core/src/scenario/observability.rs
Normal file
@ -0,0 +1,145 @@
|
||||
use std::env;
|
||||
|
||||
use reqwest::Url;
|
||||
|
||||
use super::{Metrics, MetricsError, NodeControlCapability, ObservabilityCapability};
|
||||
|
||||
/// Observability configuration inputs shared by deployers/runners.
|
||||
///
|
||||
/// All fields are optional; missing values only matter when a caller needs the
|
||||
/// corresponding capability (e.g. querying metrics from the runner process).
|
||||
#[derive(Clone, Debug, Default)]
|
||||
pub struct ObservabilityInputs {
|
||||
/// Prometheus-compatible base URL used by the runner process to query
|
||||
/// metrics (PromQL API endpoints).
|
||||
pub metrics_query_url: Option<Url>,
|
||||
/// Prometheus-compatible base URL intended for an in-cluster Grafana
|
||||
/// datasource.
|
||||
pub metrics_query_grafana_url: Option<Url>,
|
||||
/// Full OTLP HTTP metrics ingest endpoint used by nodes to export metrics
|
||||
/// (backend-specific host and path).
|
||||
pub metrics_otlp_ingest_url: Option<Url>,
|
||||
/// Optional Grafana base URL for printing/logging (human access).
|
||||
pub grafana_url: Option<Url>,
|
||||
}
|
||||
|
||||
/// Capability helper for deployers that are generic over scenario capability
|
||||
/// markers.
|
||||
pub trait ObservabilityCapabilityProvider {
|
||||
fn observability_capability(&self) -> Option<&ObservabilityCapability>;
|
||||
}
|
||||
|
||||
impl ObservabilityCapabilityProvider for () {
|
||||
fn observability_capability(&self) -> Option<&ObservabilityCapability> {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
impl ObservabilityCapabilityProvider for NodeControlCapability {
|
||||
fn observability_capability(&self) -> Option<&ObservabilityCapability> {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
impl ObservabilityCapabilityProvider for ObservabilityCapability {
|
||||
fn observability_capability(&self) -> Option<&ObservabilityCapability> {
|
||||
Some(self)
|
||||
}
|
||||
}
|
||||
|
||||
impl ObservabilityInputs {
|
||||
#[must_use]
|
||||
pub fn from_capability(capabilities: &ObservabilityCapability) -> Self {
|
||||
Self {
|
||||
metrics_query_url: capabilities.metrics_query_url.clone(),
|
||||
metrics_query_grafana_url: capabilities.metrics_query_grafana_url.clone(),
|
||||
metrics_otlp_ingest_url: capabilities.metrics_otlp_ingest_url.clone(),
|
||||
grafana_url: capabilities.grafana_url.clone(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Load observability inputs from environment variables.
|
||||
///
|
||||
/// The `NOMOS_*` namespace applies to all deployers. Runner-specific env
|
||||
/// vars are also accepted as aliases for backwards compatibility.
|
||||
pub fn from_env() -> Result<Self, MetricsError> {
|
||||
Ok(Self {
|
||||
metrics_query_url: read_url_var(&[
|
||||
"NOMOS_METRICS_QUERY_URL",
|
||||
"K8S_RUNNER_METRICS_QUERY_URL",
|
||||
// Back-compat:
|
||||
"K8S_RUNNER_EXTERNAL_PROMETHEUS_URL",
|
||||
"NOMOS_EXTERNAL_PROMETHEUS_URL",
|
||||
])?,
|
||||
metrics_query_grafana_url: read_url_var(&[
|
||||
"NOMOS_METRICS_QUERY_GRAFANA_URL",
|
||||
"K8S_RUNNER_METRICS_QUERY_GRAFANA_URL",
|
||||
// Back-compat:
|
||||
"K8S_RUNNER_EXTERNAL_PROMETHEUS_GRAFANA_URL",
|
||||
"NOMOS_EXTERNAL_PROMETHEUS_GRAFANA_URL",
|
||||
])?,
|
||||
metrics_otlp_ingest_url: read_url_var(&[
|
||||
"NOMOS_METRICS_OTLP_INGEST_URL",
|
||||
"K8S_RUNNER_METRICS_OTLP_INGEST_URL",
|
||||
// Back-compat:
|
||||
"K8S_RUNNER_EXTERNAL_OTLP_METRICS_ENDPOINT",
|
||||
"NOMOS_EXTERNAL_OTLP_METRICS_ENDPOINT",
|
||||
])?,
|
||||
grafana_url: read_url_var(&["NOMOS_GRAFANA_URL", "K8S_RUNNER_GRAFANA_URL"])?,
|
||||
})
|
||||
}
|
||||
|
||||
/// Apply defaults and fallbacks (pure function).
|
||||
///
|
||||
/// Currently, the only fallback is using `metrics_query_url` as the Grafana
|
||||
/// datasource URL when `metrics_query_grafana_url` is unset.
|
||||
#[must_use]
|
||||
pub fn normalized(mut self) -> Self {
|
||||
if self.metrics_query_grafana_url.is_none() {
|
||||
self.metrics_query_grafana_url = self.metrics_query_url.clone();
|
||||
}
|
||||
self
|
||||
}
|
||||
|
||||
/// Overlay non-empty values from `overrides` onto `self`.
|
||||
#[must_use]
|
||||
pub fn with_overrides(mut self, overrides: Self) -> Self {
|
||||
if overrides.metrics_query_url.is_some() {
|
||||
self.metrics_query_url = overrides.metrics_query_url;
|
||||
}
|
||||
if overrides.metrics_query_grafana_url.is_some() {
|
||||
self.metrics_query_grafana_url = overrides.metrics_query_grafana_url;
|
||||
}
|
||||
if overrides.metrics_otlp_ingest_url.is_some() {
|
||||
self.metrics_otlp_ingest_url = overrides.metrics_otlp_ingest_url;
|
||||
}
|
||||
if overrides.grafana_url.is_some() {
|
||||
self.grafana_url = overrides.grafana_url;
|
||||
}
|
||||
self
|
||||
}
|
||||
|
||||
/// Build the telemetry handle exposed in `RunContext::telemetry()`.
|
||||
pub fn telemetry_handle(&self) -> Result<Metrics, MetricsError> {
|
||||
match self.metrics_query_url.clone() {
|
||||
Some(url) => Metrics::from_prometheus(url),
|
||||
None => Ok(Metrics::empty()),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn read_url_var(keys: &[&'static str]) -> Result<Option<Url>, MetricsError> {
|
||||
for key in keys {
|
||||
let Some(raw) = env::var(key).ok() else {
|
||||
continue;
|
||||
};
|
||||
let raw = raw.trim();
|
||||
if raw.is_empty() {
|
||||
continue;
|
||||
}
|
||||
return Url::parse(raw)
|
||||
.map(Some)
|
||||
.map_err(|err| MetricsError::new(format!("invalid {key}: {err}")));
|
||||
}
|
||||
Ok(None)
|
||||
}
|
||||
@ -16,6 +16,8 @@ workspace = true
|
||||
anyhow = "1"
|
||||
async-trait = { workspace = true }
|
||||
cfgsync = { workspace = true }
|
||||
nomos-tracing = { workspace = true }
|
||||
nomos-tracing-service = { workspace = true }
|
||||
reqwest = { workspace = true, features = ["json"] }
|
||||
serde = { workspace = true, features = ["derive"] }
|
||||
tempfile = { workspace = true }
|
||||
@ -32,6 +34,5 @@ groth16 = { workspace = true }
|
||||
key-management-system-service = { workspace = true }
|
||||
nomos-core = { workspace = true }
|
||||
nomos-ledger = { workspace = true }
|
||||
nomos-tracing-service = { workspace = true }
|
||||
tests = { workspace = true }
|
||||
zksign = { workspace = true }
|
||||
|
||||
@ -1,37 +1,4 @@
|
||||
services:
|
||||
prometheus:
|
||||
image: prom/prometheus:v3.0.1
|
||||
{% if prometheus.platform %} platform: {{ prometheus.platform }}
|
||||
{% endif %} command:
|
||||
- --config.file=/etc/prometheus/prometheus.yml
|
||||
- --storage.tsdb.retention.time=7d
|
||||
- --web.enable-otlp-receiver
|
||||
- --enable-feature=otlp-write-receiver
|
||||
volumes:
|
||||
- ./stack/monitoring/prometheus.yml:/etc/prometheus/prometheus.yml:z
|
||||
ports:
|
||||
- {{ prometheus.host_port }}
|
||||
restart: on-failure
|
||||
|
||||
grafana:
|
||||
image: grafana/grafana:10.4.1
|
||||
environment:
|
||||
GF_PATHS_CONFIG: /etc/grafana/grafana.ini
|
||||
GF_SECURITY_ADMIN_USER: admin
|
||||
GF_SECURITY_ADMIN_PASSWORD: admin
|
||||
ports:
|
||||
- {{ grafana.host_port }}
|
||||
volumes:
|
||||
- ./stack/monitoring/grafana/datasources.yaml:/etc/grafana/provisioning/datasources/datasources.yaml:z
|
||||
- ./stack/monitoring/grafana/dashboards.yml:/etc/grafana/provisioning/dashboards/dashboards.yaml:z
|
||||
- ./stack/monitoring/grafana/dashboards:/var/lib/grafana/dashboards:ro
|
||||
- ./stack/monitoring/grafana/grafana.ini:/etc/grafana/grafana.ini:ro
|
||||
env_file:
|
||||
- ./stack/monitoring/grafana/plugins.env
|
||||
depends_on:
|
||||
- prometheus
|
||||
restart: on-failure
|
||||
|
||||
{% for node in validators %}
|
||||
{{ node.name }}:
|
||||
image: {{ node.image }}
|
||||
|
||||
@ -6,7 +6,8 @@ pub mod setup;
|
||||
|
||||
use async_trait::async_trait;
|
||||
use testing_framework_core::scenario::{
|
||||
BlockFeedTask, CleanupGuard, Deployer, RequiresNodeControl, Runner, Scenario,
|
||||
BlockFeedTask, CleanupGuard, Deployer, ObservabilityCapabilityProvider, RequiresNodeControl,
|
||||
Runner, Scenario,
|
||||
};
|
||||
|
||||
use crate::{errors::ComposeRunnerError, lifecycle::cleanup::RunnerCleanup};
|
||||
@ -41,7 +42,7 @@ impl ComposeDeployer {
|
||||
#[async_trait]
|
||||
impl<Caps> Deployer<Caps> for ComposeDeployer
|
||||
where
|
||||
Caps: RequiresNodeControl + Send + Sync,
|
||||
Caps: RequiresNodeControl + ObservabilityCapabilityProvider + Send + Sync,
|
||||
{
|
||||
type Error = ComposeRunnerError;
|
||||
|
||||
|
||||
@ -1,7 +1,8 @@
|
||||
use std::sync::Arc;
|
||||
|
||||
use testing_framework_core::scenario::{
|
||||
NodeControlHandle, RequiresNodeControl, RunContext, Runner, Scenario,
|
||||
NodeControlHandle, ObservabilityCapabilityProvider, ObservabilityInputs, RequiresNodeControl,
|
||||
RunContext, Runner, Scenario,
|
||||
};
|
||||
use tracing::info;
|
||||
|
||||
@ -20,7 +21,6 @@ use crate::{
|
||||
environment::StackEnvironment,
|
||||
ports::{HostPortMapping, compose_runner_host},
|
||||
},
|
||||
lifecycle::readiness::metrics_handle_from_port,
|
||||
};
|
||||
|
||||
pub struct DeploymentOrchestrator {
|
||||
@ -37,21 +37,35 @@ impl DeploymentOrchestrator {
|
||||
scenario: &Scenario<Caps>,
|
||||
) -> Result<Runner, ComposeRunnerError>
|
||||
where
|
||||
Caps: RequiresNodeControl + Send + Sync,
|
||||
Caps: RequiresNodeControl + ObservabilityCapabilityProvider + Send + Sync,
|
||||
{
|
||||
let setup = DeploymentSetup::new(scenario.topology());
|
||||
setup.validate_environment().await?;
|
||||
|
||||
let env_inputs = ObservabilityInputs::from_env()?;
|
||||
let cap_inputs = scenario
|
||||
.capabilities()
|
||||
.observability_capability()
|
||||
.map(ObservabilityInputs::from_capability)
|
||||
.unwrap_or_default();
|
||||
let observability = env_inputs.with_overrides(cap_inputs).normalized();
|
||||
|
||||
let DeploymentContext {
|
||||
mut environment,
|
||||
descriptors,
|
||||
} = setup.prepare_workspace().await?;
|
||||
} = setup.prepare_workspace(&observability).await?;
|
||||
|
||||
tracing::info!(
|
||||
validators = descriptors.validators().len(),
|
||||
executors = descriptors.executors().len(),
|
||||
duration_secs = scenario.duration().as_secs(),
|
||||
readiness_checks = self.deployer.readiness_checks,
|
||||
metrics_query_url = observability.metrics_query_url.as_ref().map(|u| u.as_str()),
|
||||
metrics_otlp_ingest_url = observability
|
||||
.metrics_otlp_ingest_url
|
||||
.as_ref()
|
||||
.map(|u| u.as_str()),
|
||||
grafana_url = observability.grafana_url.as_ref().map(|u| u.as_str()),
|
||||
"compose deployment starting"
|
||||
);
|
||||
|
||||
@ -71,26 +85,31 @@ impl DeploymentOrchestrator {
|
||||
let node_clients = client_builder
|
||||
.build_node_clients(&descriptors, &host_ports, &host, &mut environment)
|
||||
.await?;
|
||||
let telemetry = metrics_handle_from_port(environment.prometheus_port(), &host)?;
|
||||
let telemetry = observability.telemetry_handle()?;
|
||||
let node_control = self.maybe_node_control::<Caps>(&environment);
|
||||
|
||||
info!(
|
||||
prometheus_url = %format!("http://{}:{}/", host, environment.prometheus_port()),
|
||||
"prometheus endpoint available on host"
|
||||
);
|
||||
info!(
|
||||
grafana_url = %format!("http://{}:{}/", host, environment.grafana_port()),
|
||||
"grafana dashboard available on host"
|
||||
);
|
||||
if let Some(url) = observability.metrics_query_url.as_ref() {
|
||||
info!(metrics_query_url = %url.as_str(), "metrics query endpoint configured");
|
||||
}
|
||||
if let Some(url) = observability.grafana_url.as_ref() {
|
||||
info!(grafana_url = %url.as_str(), "grafana url configured");
|
||||
}
|
||||
log_profiling_urls(&host, &host_ports);
|
||||
|
||||
if std::env::var("TESTNET_PRINT_ENDPOINTS").is_ok() {
|
||||
let prometheus = observability
|
||||
.metrics_query_url
|
||||
.as_ref()
|
||||
.map(|u| u.as_str().to_string())
|
||||
.unwrap_or_else(|| "<disabled>".to_string());
|
||||
let grafana = observability
|
||||
.grafana_url
|
||||
.as_ref()
|
||||
.map(|u| u.as_str().to_string())
|
||||
.unwrap_or_else(|| "<disabled>".to_string());
|
||||
println!(
|
||||
"TESTNET_ENDPOINTS prometheus=http://{}:{}/ grafana=http://{}:{}/",
|
||||
host,
|
||||
environment.prometheus_port(),
|
||||
host,
|
||||
environment.grafana_port()
|
||||
"TESTNET_ENDPOINTS prometheus={} grafana={}",
|
||||
prometheus, grafana
|
||||
);
|
||||
|
||||
print_profiling_urls(&host, &host_ports);
|
||||
|
||||
@ -26,7 +26,6 @@ impl PortManager {
|
||||
info!(
|
||||
validator_ports = ?mapping.validator_api_ports(),
|
||||
executor_ports = ?mapping.executor_api_ports(),
|
||||
prometheus_port = environment.prometheus_port(),
|
||||
"resolved container host ports"
|
||||
);
|
||||
Ok(mapping)
|
||||
|
||||
@ -1,22 +1,16 @@
|
||||
use std::{
|
||||
env,
|
||||
net::{Ipv4Addr, TcpListener as StdTcpListener},
|
||||
use testing_framework_core::{
|
||||
scenario::ObservabilityInputs, topology::generation::GeneratedTopology,
|
||||
};
|
||||
|
||||
use testing_framework_core::topology::generation::GeneratedTopology;
|
||||
use tracing::{debug, info};
|
||||
use tracing::info;
|
||||
|
||||
use crate::{
|
||||
docker::ensure_docker_available,
|
||||
errors::ComposeRunnerError,
|
||||
infrastructure::environment::{
|
||||
PortReservation, StackEnvironment, ensure_supported_topology, prepare_environment,
|
||||
StackEnvironment, ensure_supported_topology, prepare_environment,
|
||||
},
|
||||
};
|
||||
|
||||
pub const PROMETHEUS_PORT_ENV: &str = "TEST_FRAMEWORK_PROMETHEUS_PORT";
|
||||
pub const DEFAULT_PROMETHEUS_PORT: u16 = 9090;
|
||||
|
||||
pub struct DeploymentSetup {
|
||||
descriptors: GeneratedTopology,
|
||||
}
|
||||
@ -46,27 +40,15 @@ impl DeploymentSetup {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub async fn prepare_workspace(self) -> Result<DeploymentContext, ComposeRunnerError> {
|
||||
let prometheus_env = env::var(PROMETHEUS_PORT_ENV)
|
||||
.ok()
|
||||
.and_then(|raw| raw.parse::<u16>().ok());
|
||||
if prometheus_env.is_some() {
|
||||
info!(port = prometheus_env, "using prometheus port from env");
|
||||
}
|
||||
|
||||
let prometheus_port = prometheus_env
|
||||
.and_then(|port| reserve_port(port))
|
||||
.or_else(|| allocate_prometheus_port())
|
||||
.unwrap_or_else(|| PortReservation::new(DEFAULT_PROMETHEUS_PORT, None));
|
||||
|
||||
debug!(
|
||||
prometheus_port = prometheus_port.port(),
|
||||
"selected prometheus port"
|
||||
);
|
||||
|
||||
let environment =
|
||||
prepare_environment(&self.descriptors, prometheus_port, prometheus_env.is_some())
|
||||
.await?;
|
||||
pub async fn prepare_workspace(
|
||||
self,
|
||||
observability: &ObservabilityInputs,
|
||||
) -> Result<DeploymentContext, ComposeRunnerError> {
|
||||
let environment = prepare_environment(
|
||||
&self.descriptors,
|
||||
observability.metrics_otlp_ingest_url.as_ref(),
|
||||
)
|
||||
.await?;
|
||||
|
||||
info!(
|
||||
compose_file = %environment.compose_path().display(),
|
||||
@ -81,13 +63,3 @@ impl DeploymentSetup {
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
fn allocate_prometheus_port() -> Option<PortReservation> {
|
||||
reserve_port(DEFAULT_PROMETHEUS_PORT).or_else(|| reserve_port(0))
|
||||
}
|
||||
|
||||
fn reserve_port(port: u16) -> Option<PortReservation> {
|
||||
let listener = StdTcpListener::bind((Ipv4Addr::LOCALHOST, port)).ok()?;
|
||||
let actual_port = listener.local_addr().ok()?.port();
|
||||
Some(PortReservation::new(actual_port, Some(listener)))
|
||||
}
|
||||
|
||||
@ -1,6 +1,6 @@
|
||||
use serde::Serialize;
|
||||
use testing_framework_core::{
|
||||
constants::{DEFAULT_CFGSYNC_PORT, DEFAULT_PROMETHEUS_HTTP_PORT, kzg_container_path},
|
||||
constants::{DEFAULT_CFGSYNC_PORT, kzg_container_path},
|
||||
topology::generation::{GeneratedNodeConfig, GeneratedTopology},
|
||||
};
|
||||
|
||||
@ -10,18 +10,9 @@ mod node;
|
||||
|
||||
pub use node::{EnvEntry, NodeDescriptor};
|
||||
|
||||
/// Errors building a compose descriptor from the topology.
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
pub enum DescriptorBuildError {
|
||||
#[error("prometheus port is not configured for compose descriptor")]
|
||||
MissingPrometheusPort,
|
||||
}
|
||||
|
||||
/// Top-level docker-compose descriptor built from a GeneratedTopology.
|
||||
#[derive(Clone, Debug, Serialize)]
|
||||
pub struct ComposeDescriptor {
|
||||
prometheus: PrometheusTemplate,
|
||||
grafana: GrafanaTemplate,
|
||||
validators: Vec<NodeDescriptor>,
|
||||
executors: Vec<NodeDescriptor>,
|
||||
}
|
||||
@ -50,8 +41,6 @@ pub struct ComposeDescriptorBuilder<'a> {
|
||||
topology: &'a GeneratedTopology,
|
||||
use_kzg_mount: bool,
|
||||
cfgsync_port: Option<u16>,
|
||||
prometheus_port: Option<u16>,
|
||||
grafana_port: Option<u16>,
|
||||
}
|
||||
|
||||
impl<'a> ComposeDescriptorBuilder<'a> {
|
||||
@ -60,8 +49,6 @@ impl<'a> ComposeDescriptorBuilder<'a> {
|
||||
topology,
|
||||
use_kzg_mount: false,
|
||||
cfgsync_port: None,
|
||||
prometheus_port: None,
|
||||
grafana_port: None,
|
||||
}
|
||||
}
|
||||
|
||||
@ -79,34 +66,12 @@ impl<'a> ComposeDescriptorBuilder<'a> {
|
||||
self
|
||||
}
|
||||
|
||||
/// Finish building the descriptor.
|
||||
#[must_use]
|
||||
/// Set host port mapping for Prometheus.
|
||||
pub const fn with_prometheus_port(mut self, port: u16) -> Self {
|
||||
self.prometheus_port = Some(port);
|
||||
self
|
||||
}
|
||||
|
||||
#[must_use]
|
||||
/// Set host port mapping for Grafana.
|
||||
pub const fn with_grafana_port(mut self, port: u16) -> Self {
|
||||
self.grafana_port = Some(port);
|
||||
self
|
||||
}
|
||||
|
||||
/// Finish building the descriptor, erroring if required fields are missing.
|
||||
pub fn build(self) -> Result<ComposeDescriptor, DescriptorBuildError> {
|
||||
pub fn build(self) -> ComposeDescriptor {
|
||||
let cfgsync_port = self.cfgsync_port.unwrap_or(DEFAULT_CFGSYNC_PORT);
|
||||
let prometheus_host_port = self
|
||||
.prometheus_port
|
||||
.ok_or(DescriptorBuildError::MissingPrometheusPort)?;
|
||||
let grafana_host_port = self.grafana_port.unwrap_or(0);
|
||||
|
||||
let (image, platform) = resolve_image();
|
||||
// Prometheus image is x86_64-only on some tags; set platform when on arm hosts.
|
||||
let prometheus_platform = match std::env::consts::ARCH {
|
||||
"aarch64" | "arm64" => Some(String::from("linux/arm64")),
|
||||
_ => None,
|
||||
};
|
||||
|
||||
let validators = build_nodes(
|
||||
self.topology.validators(),
|
||||
@ -126,49 +91,13 @@ impl<'a> ComposeDescriptorBuilder<'a> {
|
||||
cfgsync_port,
|
||||
);
|
||||
|
||||
Ok(ComposeDescriptor {
|
||||
prometheus: PrometheusTemplate::new(prometheus_host_port, prometheus_platform),
|
||||
grafana: GrafanaTemplate::new(grafana_host_port),
|
||||
ComposeDescriptor {
|
||||
validators,
|
||||
executors,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
/// Minimal Prometheus service mapping used in the compose template.
|
||||
#[derive(Clone, Debug, Serialize)]
|
||||
pub struct PrometheusTemplate {
|
||||
host_port: String,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
platform: Option<String>,
|
||||
}
|
||||
|
||||
impl PrometheusTemplate {
|
||||
fn new(port: u16, platform: Option<String>) -> Self {
|
||||
Self {
|
||||
host_port: format!("127.0.0.1:{port}:{}", DEFAULT_PROMETHEUS_HTTP_PORT),
|
||||
platform,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Minimal Grafana service mapping used in the compose template.
|
||||
#[derive(Clone, Debug, Serialize)]
|
||||
pub struct GrafanaTemplate {
|
||||
host_port: String,
|
||||
}
|
||||
|
||||
impl GrafanaTemplate {
|
||||
fn new(port: u16) -> Self {
|
||||
let host_port = match port {
|
||||
0 => "127.0.0.1::3000".to_string(), // docker assigns host port
|
||||
_ => format!("127.0.0.1:{port}:3000"),
|
||||
};
|
||||
|
||||
Self { host_port }
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone, Copy)]
|
||||
pub(crate) enum ComposeNodeKind {
|
||||
Validator,
|
||||
|
||||
@ -49,24 +49,6 @@ impl ComposeWorkspace {
|
||||
copy_dir_recursive(&scripts_source, &temp.path().join("stack/scripts"))?;
|
||||
}
|
||||
|
||||
// Ensure Prometheus config is a file (Docker bind mount fails if a directory
|
||||
// exists).
|
||||
let prometheus_src = stack_source.join("monitoring/prometheus.yml");
|
||||
let prometheus_dst = temp.path().join("stack/monitoring/prometheus.yml");
|
||||
if prometheus_dst.exists() && prometheus_dst.is_dir() {
|
||||
fs::remove_dir_all(&prometheus_dst)
|
||||
.with_context(|| format!("removing bogus dir {}", prometheus_dst.display()))?;
|
||||
}
|
||||
if !prometheus_dst.exists() {
|
||||
fs::copy(&prometheus_src, &prometheus_dst).with_context(|| {
|
||||
format!(
|
||||
"copying prometheus.yml {} -> {}",
|
||||
prometheus_src.display(),
|
||||
prometheus_dst.display()
|
||||
)
|
||||
})?;
|
||||
}
|
||||
|
||||
let kzg_source = repo_root.join("testing-framework/assets/stack/kzgrs_test_params");
|
||||
let target = temp.path().join("kzgrs_test_params");
|
||||
if kzg_source.exists() {
|
||||
|
||||
@ -9,10 +9,7 @@ use testing_framework_core::{
|
||||
};
|
||||
use url::ParseError;
|
||||
|
||||
use crate::{
|
||||
descriptor::DescriptorBuildError, docker::commands::ComposeCommandError,
|
||||
infrastructure::template::TemplateError,
|
||||
};
|
||||
use crate::{docker::commands::ComposeCommandError, infrastructure::template::TemplateError};
|
||||
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
/// Top-level compose runner errors.
|
||||
@ -94,11 +91,6 @@ pub enum ConfigError {
|
||||
#[source]
|
||||
source: anyhow::Error,
|
||||
},
|
||||
#[error("failed to build compose descriptor: {source}")]
|
||||
Descriptor {
|
||||
#[source]
|
||||
source: DescriptorBuildError,
|
||||
},
|
||||
#[error("failed to render compose template: {source}")]
|
||||
Template {
|
||||
#[source]
|
||||
|
||||
@ -1,5 +1,8 @@
|
||||
use std::{path::Path, process::Command as StdCommand};
|
||||
|
||||
use nomos_tracing::metrics::otlp::OtlpMetricsConfig;
|
||||
use nomos_tracing_service::MetricsLayer;
|
||||
use reqwest::Url;
|
||||
use testing_framework_core::{
|
||||
scenario::cfgsync::{apply_topology_overrides, load_cfgsync_template, write_cfgsync_template},
|
||||
topology::generation::GeneratedTopology,
|
||||
@ -60,6 +63,7 @@ pub fn update_cfgsync_config(
|
||||
topology: &GeneratedTopology,
|
||||
use_kzg_mount: bool,
|
||||
port: u16,
|
||||
metrics_otlp_ingest_url: Option<&Url>,
|
||||
) -> anyhow::Result<()> {
|
||||
debug!(
|
||||
path = %path.display(),
|
||||
@ -72,6 +76,12 @@ pub fn update_cfgsync_config(
|
||||
let mut cfg = load_cfgsync_template(path)?;
|
||||
cfg.port = port;
|
||||
apply_topology_overrides(&mut cfg, topology, use_kzg_mount);
|
||||
if let Some(endpoint) = metrics_otlp_ingest_url.cloned() {
|
||||
cfg.tracing_settings.metrics = MetricsLayer::Otlp(OtlpMetricsConfig {
|
||||
endpoint,
|
||||
host_identifier: "node".into(),
|
||||
});
|
||||
}
|
||||
write_cfgsync_template(path, &cfg)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@ -1,20 +1,19 @@
|
||||
use std::{
|
||||
env,
|
||||
net::{Ipv4Addr, TcpListener as StdTcpListener},
|
||||
path::{Path, PathBuf},
|
||||
time::Duration,
|
||||
};
|
||||
|
||||
use anyhow::{Context as _, anyhow};
|
||||
use anyhow::anyhow;
|
||||
use reqwest::Url;
|
||||
use testing_framework_core::{
|
||||
adjust_timeout, scenario::CleanupGuard, topology::generation::GeneratedTopology,
|
||||
};
|
||||
use tokio::{process::Command, time::timeout};
|
||||
use tracing::{debug, error, info, warn};
|
||||
use tokio::process::Command;
|
||||
use tracing::{debug, error, info};
|
||||
use uuid::Uuid;
|
||||
|
||||
use crate::{
|
||||
deployer::setup::DEFAULT_PROMETHEUS_PORT,
|
||||
descriptor::ComposeDescriptor,
|
||||
docker::{
|
||||
commands::{compose_up, dump_compose_logs, run_docker_command},
|
||||
@ -31,8 +30,6 @@ use crate::{
|
||||
};
|
||||
|
||||
const CFGSYNC_START_TIMEOUT: Duration = Duration::from_secs(180);
|
||||
const COMPOSE_PORT_DISCOVERY_TIMEOUT: Duration = Duration::from_secs(30);
|
||||
const STACK_BRINGUP_MAX_ATTEMPTS: usize = 3;
|
||||
|
||||
/// Paths and flags describing the prepared compose workspace.
|
||||
pub struct WorkspaceState {
|
||||
@ -49,8 +46,6 @@ pub struct StackEnvironment {
|
||||
root: PathBuf,
|
||||
workspace: Option<ComposeWorkspace>,
|
||||
cfgsync_handle: Option<CfgsyncServerHandle>,
|
||||
prometheus_port: u16,
|
||||
grafana_port: u16,
|
||||
}
|
||||
|
||||
impl StackEnvironment {
|
||||
@ -60,8 +55,6 @@ impl StackEnvironment {
|
||||
compose_path: PathBuf,
|
||||
project_name: String,
|
||||
cfgsync_handle: Option<CfgsyncServerHandle>,
|
||||
prometheus_port: u16,
|
||||
grafana_port: u16,
|
||||
) -> Self {
|
||||
let WorkspaceState {
|
||||
workspace, root, ..
|
||||
@ -73,8 +66,6 @@ impl StackEnvironment {
|
||||
root,
|
||||
workspace: Some(workspace),
|
||||
cfgsync_handle,
|
||||
prometheus_port,
|
||||
grafana_port,
|
||||
}
|
||||
}
|
||||
|
||||
@ -82,16 +73,6 @@ impl StackEnvironment {
|
||||
&self.compose_path
|
||||
}
|
||||
|
||||
/// Host port exposed by Prometheus.
|
||||
pub const fn prometheus_port(&self) -> u16 {
|
||||
self.prometheus_port
|
||||
}
|
||||
|
||||
/// Host port exposed by Grafana.
|
||||
pub const fn grafana_port(&self) -> u16 {
|
||||
self.grafana_port
|
||||
}
|
||||
|
||||
/// Docker compose project name.
|
||||
pub fn project_name(&self) -> &str {
|
||||
&self.project_name
|
||||
@ -138,27 +119,6 @@ impl StackEnvironment {
|
||||
}
|
||||
}
|
||||
|
||||
/// Represents a claimed port, optionally guarded by an open socket.
|
||||
pub struct PortReservation {
|
||||
port: u16,
|
||||
_guard: Option<StdTcpListener>,
|
||||
}
|
||||
|
||||
impl PortReservation {
|
||||
/// Holds a port and an optional socket guard to keep it reserved.
|
||||
pub const fn new(port: u16, guard: Option<StdTcpListener>) -> Self {
|
||||
Self {
|
||||
port,
|
||||
_guard: guard,
|
||||
}
|
||||
}
|
||||
|
||||
/// The reserved port number.
|
||||
pub const fn port(&self) -> u16 {
|
||||
self.port
|
||||
}
|
||||
}
|
||||
|
||||
/// Verifies the topology has at least one validator so compose can start.
|
||||
pub fn ensure_supported_topology(
|
||||
descriptors: &GeneratedTopology,
|
||||
@ -210,10 +170,17 @@ pub fn update_cfgsync_logged(
|
||||
workspace: &WorkspaceState,
|
||||
descriptors: &GeneratedTopology,
|
||||
cfgsync_port: u16,
|
||||
metrics_otlp_ingest_url: Option<&Url>,
|
||||
) -> Result<(), ComposeRunnerError> {
|
||||
info!(cfgsync_port, "updating cfgsync configuration");
|
||||
|
||||
configure_cfgsync(workspace, descriptors, cfgsync_port).map_err(Into::into)
|
||||
configure_cfgsync(
|
||||
workspace,
|
||||
descriptors,
|
||||
cfgsync_port,
|
||||
metrics_otlp_ingest_url,
|
||||
)
|
||||
.map_err(Into::into)
|
||||
}
|
||||
|
||||
/// Start the cfgsync server container using the generated config.
|
||||
@ -232,12 +199,14 @@ pub fn configure_cfgsync(
|
||||
workspace: &WorkspaceState,
|
||||
descriptors: &GeneratedTopology,
|
||||
cfgsync_port: u16,
|
||||
metrics_otlp_ingest_url: Option<&Url>,
|
||||
) -> Result<(), ConfigError> {
|
||||
update_cfgsync_config(
|
||||
&workspace.cfgsync_path,
|
||||
descriptors,
|
||||
workspace.use_kzg,
|
||||
cfgsync_port,
|
||||
metrics_otlp_ingest_url,
|
||||
)
|
||||
.map_err(|source| ConfigError::Cfgsync {
|
||||
path: workspace.cfgsync_path.clone(),
|
||||
@ -328,23 +297,16 @@ pub fn write_compose_artifacts(
|
||||
workspace: &WorkspaceState,
|
||||
descriptors: &GeneratedTopology,
|
||||
cfgsync_port: u16,
|
||||
prometheus_port: u16,
|
||||
grafana_port: u16,
|
||||
) -> Result<PathBuf, ConfigError> {
|
||||
debug!(
|
||||
cfgsync_port,
|
||||
prometheus_port,
|
||||
grafana_port,
|
||||
workspace_root = %workspace.root.display(),
|
||||
"building compose descriptor"
|
||||
);
|
||||
let descriptor = ComposeDescriptor::builder(descriptors)
|
||||
.with_kzg_mount(workspace.use_kzg)
|
||||
.with_cfgsync_port(cfgsync_port)
|
||||
.with_prometheus_port(prometheus_port)
|
||||
.with_grafana_port(grafana_port)
|
||||
.build()
|
||||
.map_err(|source| ConfigError::Descriptor { source })?;
|
||||
.build();
|
||||
|
||||
let compose_path = workspace.root.join("compose.generated.yml");
|
||||
write_compose_file(&descriptor, &compose_path)
|
||||
@ -358,21 +320,9 @@ pub fn render_compose_logged(
|
||||
workspace: &WorkspaceState,
|
||||
descriptors: &GeneratedTopology,
|
||||
cfgsync_port: u16,
|
||||
prometheus_port: u16,
|
||||
grafana_port: u16,
|
||||
) -> Result<PathBuf, ComposeRunnerError> {
|
||||
info!(
|
||||
cfgsync_port,
|
||||
prometheus_port, grafana_port, "rendering compose file with ports"
|
||||
);
|
||||
write_compose_artifacts(
|
||||
workspace,
|
||||
descriptors,
|
||||
cfgsync_port,
|
||||
prometheus_port,
|
||||
grafana_port,
|
||||
)
|
||||
.map_err(Into::into)
|
||||
info!(cfgsync_port, "rendering compose file");
|
||||
write_compose_artifacts(workspace, descriptors, cfgsync_port).map_err(Into::into)
|
||||
}
|
||||
|
||||
/// Bring up docker compose; shut down cfgsync if start-up fails.
|
||||
@ -404,172 +354,46 @@ pub async fn bring_up_stack_logged(
|
||||
/// Prepare workspace, cfgsync, compose artifacts, and launch the stack.
|
||||
pub async fn prepare_environment(
|
||||
descriptors: &GeneratedTopology,
|
||||
mut prometheus_port: PortReservation,
|
||||
prometheus_port_locked: bool,
|
||||
metrics_otlp_ingest_url: Option<&Url>,
|
||||
) -> Result<StackEnvironment, ComposeRunnerError> {
|
||||
let workspace = prepare_workspace_logged()?;
|
||||
let cfgsync_port = allocate_cfgsync_port()?;
|
||||
|
||||
let grafana_env = env::var("COMPOSE_GRAFANA_PORT")
|
||||
.ok()
|
||||
.and_then(|raw| raw.parse::<u16>().ok());
|
||||
if let Some(port) = grafana_env {
|
||||
info!(port, "using grafana port from env");
|
||||
}
|
||||
|
||||
update_cfgsync_logged(&workspace, descriptors, cfgsync_port)?;
|
||||
update_cfgsync_logged(
|
||||
&workspace,
|
||||
descriptors,
|
||||
cfgsync_port,
|
||||
metrics_otlp_ingest_url,
|
||||
)?;
|
||||
ensure_compose_image().await?;
|
||||
let compose_path = render_compose_logged(&workspace, descriptors, cfgsync_port)?;
|
||||
|
||||
let attempts = if prometheus_port_locked {
|
||||
1
|
||||
} else {
|
||||
STACK_BRINGUP_MAX_ATTEMPTS
|
||||
};
|
||||
let mut last_err = None;
|
||||
let project_name = format!("nomos-compose-{}", Uuid::new_v4());
|
||||
let mut cfgsync_handle = start_cfgsync_stage(&workspace, cfgsync_port).await?;
|
||||
|
||||
for _ in 0..attempts {
|
||||
let prometheus_port_value = prometheus_port.port();
|
||||
let grafana_port_value = grafana_env.unwrap_or(0);
|
||||
let compose_path = render_compose_logged(
|
||||
&workspace,
|
||||
descriptors,
|
||||
cfgsync_port,
|
||||
prometheus_port_value,
|
||||
grafana_port_value,
|
||||
)?;
|
||||
|
||||
let project_name = format!("nomos-compose-{}", Uuid::new_v4());
|
||||
let mut cfgsync_handle = start_cfgsync_stage(&workspace, cfgsync_port).await?;
|
||||
|
||||
drop(prometheus_port);
|
||||
|
||||
match bring_up_stack_logged(
|
||||
&compose_path,
|
||||
&project_name,
|
||||
&workspace.root,
|
||||
&mut cfgsync_handle,
|
||||
)
|
||||
.await
|
||||
{
|
||||
Ok(()) => {
|
||||
let grafana_port_resolved = resolve_service_port(
|
||||
&compose_path,
|
||||
&project_name,
|
||||
&workspace.root,
|
||||
"grafana",
|
||||
3000,
|
||||
)
|
||||
.await
|
||||
.unwrap_or(grafana_port_value);
|
||||
|
||||
info!(
|
||||
project = %project_name,
|
||||
compose_file = %compose_path.display(),
|
||||
cfgsync_port,
|
||||
prometheus_port = prometheus_port_value,
|
||||
grafana_port = grafana_port_resolved,
|
||||
"compose stack is up"
|
||||
);
|
||||
return Ok(StackEnvironment::from_workspace(
|
||||
workspace,
|
||||
compose_path,
|
||||
project_name,
|
||||
Some(cfgsync_handle),
|
||||
prometheus_port_value,
|
||||
grafana_port_resolved,
|
||||
));
|
||||
}
|
||||
Err(err) => {
|
||||
// Attempt to capture container logs even when bring-up fails early.
|
||||
dump_compose_logs(&compose_path, &project_name, &workspace.root).await;
|
||||
cfgsync_handle.shutdown();
|
||||
last_err = Some(err);
|
||||
if prometheus_port_locked {
|
||||
break;
|
||||
}
|
||||
warn!(
|
||||
error = %last_err.as_ref().unwrap(),
|
||||
"compose bring-up failed; retrying with a new prometheus port"
|
||||
);
|
||||
prometheus_port = allocate_prometheus_port()
|
||||
.unwrap_or_else(|| PortReservation::new(DEFAULT_PROMETHEUS_PORT, None));
|
||||
debug!(
|
||||
next_prometheus_port = prometheus_port.port(),
|
||||
"retrying compose bring-up"
|
||||
);
|
||||
}
|
||||
}
|
||||
if let Err(err) = bring_up_stack_logged(
|
||||
&compose_path,
|
||||
&project_name,
|
||||
&workspace.root,
|
||||
&mut cfgsync_handle,
|
||||
)
|
||||
.await
|
||||
{
|
||||
dump_compose_logs(&compose_path, &project_name, &workspace.root).await;
|
||||
cfgsync_handle.shutdown();
|
||||
return Err(err);
|
||||
}
|
||||
|
||||
Err(last_err.expect("prepare_environment should return or fail with error"))
|
||||
}
|
||||
|
||||
fn allocate_prometheus_port() -> Option<PortReservation> {
|
||||
reserve_prometheus_port(DEFAULT_PROMETHEUS_PORT).or_else(|| reserve_prometheus_port(0))
|
||||
}
|
||||
|
||||
fn reserve_prometheus_port(port: u16) -> Option<PortReservation> {
|
||||
let listener = StdTcpListener::bind((Ipv4Addr::LOCALHOST, port)).ok()?;
|
||||
let actual_port = listener.local_addr().ok()?.port();
|
||||
Some(PortReservation::new(actual_port, Some(listener)))
|
||||
}
|
||||
|
||||
async fn resolve_service_port(
|
||||
compose_file: &Path,
|
||||
project_name: &str,
|
||||
root: &Path,
|
||||
service: &str,
|
||||
container_port: u16,
|
||||
) -> Result<u16, ComposeRunnerError> {
|
||||
let mut cmd = Command::new("docker");
|
||||
cmd.arg("compose")
|
||||
.arg("-f")
|
||||
.arg(compose_file)
|
||||
.arg("-p")
|
||||
.arg(project_name)
|
||||
.arg("port")
|
||||
.arg(service)
|
||||
.arg(container_port.to_string())
|
||||
.current_dir(root);
|
||||
|
||||
let output = timeout(adjust_timeout(COMPOSE_PORT_DISCOVERY_TIMEOUT), cmd.output())
|
||||
.await
|
||||
.map_err(|_| ComposeRunnerError::PortDiscovery {
|
||||
service: service.to_owned(),
|
||||
container_port,
|
||||
source: anyhow!("docker compose port timed out"),
|
||||
})?
|
||||
.with_context(|| format!("running docker compose port {service} {container_port}"))
|
||||
.map_err(|source| ComposeRunnerError::PortDiscovery {
|
||||
service: service.to_owned(),
|
||||
container_port,
|
||||
source,
|
||||
})?;
|
||||
|
||||
if !output.status.success() {
|
||||
return Err(ComposeRunnerError::PortDiscovery {
|
||||
service: service.to_owned(),
|
||||
container_port,
|
||||
source: anyhow!("docker compose port exited with {}", output.status),
|
||||
});
|
||||
}
|
||||
|
||||
let stdout = String::from_utf8_lossy(&output.stdout);
|
||||
for line in stdout.lines() {
|
||||
let line = line.trim();
|
||||
if line.is_empty() {
|
||||
continue;
|
||||
}
|
||||
if let Some(port_str) = line.rsplit(':').next()
|
||||
&& let Ok(port) = port_str.trim().parse::<u16>()
|
||||
{
|
||||
return Ok(port);
|
||||
}
|
||||
}
|
||||
|
||||
Err(ComposeRunnerError::PortDiscovery {
|
||||
service: service.to_owned(),
|
||||
container_port,
|
||||
source: anyhow!("unable to parse docker compose port output: {stdout}"),
|
||||
})
|
||||
info!(
|
||||
project = %project_name,
|
||||
compose_file = %compose_path.display(),
|
||||
cfgsync_port,
|
||||
"compose stack is up"
|
||||
);
|
||||
|
||||
Ok(StackEnvironment::from_workspace(
|
||||
workspace,
|
||||
compose_path,
|
||||
project_name,
|
||||
Some(cfgsync_handle),
|
||||
))
|
||||
}
|
||||
|
||||
@ -3,7 +3,7 @@ use std::time::Duration;
|
||||
use reqwest::Url;
|
||||
use testing_framework_core::{
|
||||
nodes::ApiClient,
|
||||
scenario::{Metrics, MetricsError, NodeClients, http_probe::NodeRole as HttpNodeRole},
|
||||
scenario::{NodeClients, http_probe::NodeRole as HttpNodeRole},
|
||||
topology::generation::{GeneratedTopology, NodeRole as TopologyNodeRole},
|
||||
};
|
||||
use tokio::time::sleep;
|
||||
@ -16,13 +16,6 @@ use crate::{
|
||||
|
||||
const DISABLED_READINESS_SLEEP: Duration = Duration::from_secs(5);
|
||||
|
||||
/// Build a metrics client from host/port, validating the URL.
|
||||
pub fn metrics_handle_from_port(port: u16, host: &str) -> Result<Metrics, MetricsError> {
|
||||
let url = Url::parse(&format!("http://{host}:{port}/"))
|
||||
.map_err(|err| MetricsError::new(format!("invalid prometheus url: {err}")))?;
|
||||
Metrics::from_prometheus(url)
|
||||
}
|
||||
|
||||
/// Wait until all validators respond on their API ports.
|
||||
pub async fn ensure_validators_ready_with_ports(ports: &[u16]) -> Result<(), StackReadinessError> {
|
||||
if ports.is_empty() {
|
||||
|
||||
@ -17,6 +17,7 @@ anyhow = "1"
|
||||
async-trait = { workspace = true }
|
||||
k8s-openapi = { version = "0.20", features = ["latest"] }
|
||||
kube = { version = "0.87", default-features = false, features = ["client", "runtime", "rustls-tls"] }
|
||||
nomos-tracing = { workspace = true }
|
||||
nomos-tracing-service = { workspace = true }
|
||||
reqwest = { workspace = true, features = ["json"] }
|
||||
serde = { version = "1", features = ["derive"] }
|
||||
|
||||
@ -1,2 +0,0 @@
|
||||
*.json
|
||||
!.gitignore
|
||||
@ -37,9 +37,3 @@ app.kubernetes.io/instance: {{ $root.Release.Name }}
|
||||
nomos/logical-role: executor
|
||||
nomos/executor-index: "{{ $index }}"
|
||||
{{- end -}}
|
||||
|
||||
{{- define "nomos-runner.prometheusLabels" -}}
|
||||
app.kubernetes.io/name: {{ include "nomos-runner.chart" . }}
|
||||
app.kubernetes.io/instance: {{ .Release.Name }}
|
||||
nomos/logical-role: prometheus
|
||||
{{- end -}}
|
||||
|
||||
@ -1,33 +0,0 @@
|
||||
{{- if .Values.grafana.enabled }}
|
||||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
metadata:
|
||||
name: {{ include "nomos-runner.fullname" . }}-grafana-config
|
||||
data:
|
||||
datasources.yaml: |
|
||||
apiVersion: 1
|
||||
datasources:
|
||||
- name: Prometheus
|
||||
type: prometheus
|
||||
access: proxy
|
||||
isDefault: true
|
||||
uid: PBFA97CFB590B2093
|
||||
orgId: 1
|
||||
url: {{ if .Values.prometheus.enabled }}http://prometheus:9090{{ else }}{{ required "prometheus.externalUrl must be set when prometheus.enabled=false" .Values.prometheus.externalUrl }}{{ end }}
|
||||
editable: true
|
||||
dashboards.yaml: |
|
||||
apiVersion: 1
|
||||
providers:
|
||||
- name: 'default'
|
||||
orgId: 1
|
||||
folder: 'Nomos'
|
||||
type: file
|
||||
disableDeletion: false
|
||||
editable: true
|
||||
options:
|
||||
path: /var/lib/grafana/dashboards
|
||||
{{ range $path, $_ := .Files.Glob "grafana/dashboards/*.json" }}
|
||||
{{ base $path }}: |
|
||||
{{ $.Files.Get $path | indent 4 }}
|
||||
{{ end }}
|
||||
{{- end }}
|
||||
@ -1,64 +0,0 @@
|
||||
{{- if .Values.grafana.enabled }}
|
||||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
metadata:
|
||||
name: {{ include "nomos-runner.fullname" . }}-grafana
|
||||
labels:
|
||||
app: {{ include "nomos-runner.name" . }}
|
||||
chart: {{ include "nomos-runner.chart" . }}
|
||||
release: {{ .Release.Name }}
|
||||
heritage: {{ .Release.Service }}
|
||||
spec:
|
||||
replicas: 1
|
||||
selector:
|
||||
matchLabels:
|
||||
app: {{ include "nomos-runner.name" . }}
|
||||
component: grafana
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
app: {{ include "nomos-runner.name" . }}
|
||||
component: grafana
|
||||
spec:
|
||||
containers:
|
||||
- name: grafana
|
||||
image: {{ .Values.grafana.image }}
|
||||
imagePullPolicy: {{ .Values.grafana.imagePullPolicy }}
|
||||
env:
|
||||
- name: GF_SECURITY_ADMIN_USER
|
||||
value: {{ .Values.grafana.adminUser | quote }}
|
||||
- name: GF_SECURITY_ADMIN_PASSWORD
|
||||
value: {{ .Values.grafana.adminPassword | quote }}
|
||||
ports:
|
||||
- containerPort: 3000
|
||||
name: http
|
||||
volumeMounts:
|
||||
- name: grafana-config
|
||||
mountPath: /etc/grafana/provisioning/datasources/datasources.yaml
|
||||
subPath: datasources.yaml
|
||||
readOnly: true
|
||||
- name: grafana-config
|
||||
mountPath: /etc/grafana/provisioning/dashboards/dashboards.yaml
|
||||
subPath: dashboards.yaml
|
||||
readOnly: true
|
||||
- name: grafana-dashboards
|
||||
mountPath: /var/lib/grafana/dashboards
|
||||
readOnly: true
|
||||
volumes:
|
||||
- name: grafana-config
|
||||
configMap:
|
||||
name: {{ include "nomos-runner.fullname" . }}-grafana-config
|
||||
items:
|
||||
- key: datasources.yaml
|
||||
path: datasources.yaml
|
||||
- key: dashboards.yaml
|
||||
path: dashboards.yaml
|
||||
- name: grafana-dashboards
|
||||
configMap:
|
||||
name: {{ include "nomos-runner.fullname" . }}-grafana-config
|
||||
items:
|
||||
{{ range $path, $_ := .Files.Glob "grafana/dashboards/*.json" }}
|
||||
- key: {{ base $path }}
|
||||
path: {{ base $path }}
|
||||
{{ end }}
|
||||
{{- end }}
|
||||
@ -1,22 +0,0 @@
|
||||
{{- if .Values.grafana.enabled }}
|
||||
apiVersion: v1
|
||||
kind: Service
|
||||
metadata:
|
||||
name: {{ include "nomos-runner.fullname" . }}-grafana
|
||||
labels:
|
||||
app: {{ include "nomos-runner.name" . }}
|
||||
component: grafana
|
||||
spec:
|
||||
type: {{ .Values.grafana.service.type }}
|
||||
ports:
|
||||
- port: 3000
|
||||
targetPort: http
|
||||
protocol: TCP
|
||||
name: http
|
||||
{{- if and (eq .Values.grafana.service.type "NodePort") .Values.grafana.service.nodePort }}
|
||||
nodePort: {{ .Values.grafana.service.nodePort }}
|
||||
{{- end }}
|
||||
selector:
|
||||
app: {{ include "nomos-runner.name" . }}
|
||||
component: grafana
|
||||
{{- end }}
|
||||
@ -1,16 +0,0 @@
|
||||
{{- if .Values.prometheus.enabled }}
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
metadata:
|
||||
name: prometheus
|
||||
labels:
|
||||
{{- include "nomos-runner.prometheusLabels" . | nindent 4 }}
|
||||
data:
|
||||
prometheus.yml: |
|
||||
{{- if .Values.prometheus.config }}
|
||||
{{ .Values.prometheus.config | indent 4 }}
|
||||
{{- else }}
|
||||
{{ "" | indent 4 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
@ -1,38 +0,0 @@
|
||||
{{- if .Values.prometheus.enabled }}
|
||||
---
|
||||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
metadata:
|
||||
name: prometheus
|
||||
labels:
|
||||
{{- include "nomos-runner.prometheusLabels" . | nindent 4 }}
|
||||
spec:
|
||||
replicas: 1
|
||||
selector:
|
||||
matchLabels:
|
||||
{{- include "nomos-runner.prometheusLabels" . | nindent 6 }}
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
{{- include "nomos-runner.prometheusLabels" . | nindent 8 }}
|
||||
spec:
|
||||
containers:
|
||||
- name: prometheus
|
||||
image: {{ .Values.prometheus.image }}
|
||||
imagePullPolicy: {{ .Values.prometheus.imagePullPolicy | default "IfNotPresent" }}
|
||||
args:
|
||||
- --config.file=/etc/prometheus/prometheus.yml
|
||||
- --storage.tsdb.retention.time={{ .Values.prometheus.retention }}
|
||||
- --web.enable-otlp-receiver
|
||||
- --enable-feature=otlp-write-receiver
|
||||
ports:
|
||||
- containerPort: 9090
|
||||
name: http
|
||||
volumeMounts:
|
||||
- name: prometheus-config
|
||||
mountPath: /etc/prometheus
|
||||
volumes:
|
||||
- name: prometheus-config
|
||||
configMap:
|
||||
name: prometheus
|
||||
{{- end }}
|
||||
@ -1,20 +0,0 @@
|
||||
{{- if .Values.prometheus.enabled }}
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: Service
|
||||
metadata:
|
||||
name: prometheus
|
||||
labels:
|
||||
{{- include "nomos-runner.prometheusLabels" . | nindent 4 }}
|
||||
spec:
|
||||
type: {{ .Values.prometheus.service.type | default "NodePort" }}
|
||||
selector:
|
||||
{{- include "nomos-runner.prometheusLabels" . | nindent 4 }}
|
||||
ports:
|
||||
- name: http
|
||||
port: 9090
|
||||
targetPort: http
|
||||
{{- if and (eq (default "NodePort" .Values.prometheus.service.type) "NodePort") .Values.prometheus.service.nodePort }}
|
||||
nodePort: {{ .Values.prometheus.service.nodePort }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
@ -1,18 +0,0 @@
|
||||
{{- if eq .Values.kzg.mode "hostPath" }}
|
||||
apiVersion: v1
|
||||
kind: PersistentVolume
|
||||
metadata:
|
||||
name: {{ include "nomos-runner.fullname" . }}-kzg
|
||||
labels:
|
||||
{{- include "nomos-runner.labels" . | nindent 4 }}
|
||||
spec:
|
||||
capacity:
|
||||
storage: {{ .Values.kzg.storageSize }}
|
||||
accessModes:
|
||||
- ReadOnlyMany
|
||||
persistentVolumeReclaimPolicy: Delete
|
||||
storageClassName: manual
|
||||
hostPath:
|
||||
path: {{ .Values.kzg.hostPath }}
|
||||
type: {{ .Values.kzg.hostPathType }}
|
||||
{{- end }}
|
||||
@ -1,16 +0,0 @@
|
||||
{{- if eq .Values.kzg.mode "hostPath" }}
|
||||
apiVersion: v1
|
||||
kind: PersistentVolumeClaim
|
||||
metadata:
|
||||
name: {{ include "nomos-runner.fullname" . }}-kzg
|
||||
labels:
|
||||
{{- include "nomos-runner.labels" . | nindent 4 }}
|
||||
spec:
|
||||
accessModes:
|
||||
- ReadOnlyMany
|
||||
storageClassName: manual
|
||||
volumeName: {{ include "nomos-runner.fullname" . }}-kzg
|
||||
resources:
|
||||
requests:
|
||||
storage: {{ .Values.kzg.storageSize }}
|
||||
{{- end }}
|
||||
@ -26,28 +26,3 @@ kzg:
|
||||
hostPath: "/var/lib/nomos/kzgrs_test_params"
|
||||
hostPathType: "Directory"
|
||||
storageSize: "1Gi"
|
||||
|
||||
prometheus:
|
||||
enabled: true
|
||||
externalUrl: ""
|
||||
image: "prom/prometheus:v3.0.1"
|
||||
imagePullPolicy: IfNotPresent
|
||||
retention: "7d"
|
||||
service:
|
||||
type: NodePort
|
||||
nodePort: null
|
||||
config: |
|
||||
global:
|
||||
evaluation_interval: 15s
|
||||
external_labels:
|
||||
monitor: "NomosRunner"
|
||||
|
||||
grafana:
|
||||
enabled: true
|
||||
image: "grafana/grafana:10.4.1"
|
||||
imagePullPolicy: IfNotPresent
|
||||
adminUser: admin
|
||||
adminPassword: admin
|
||||
service:
|
||||
type: NodePort
|
||||
nodePort: null
|
||||
|
||||
@ -1,11 +1,10 @@
|
||||
use anyhow::Error;
|
||||
use async_trait::async_trait;
|
||||
use kube::Client;
|
||||
use reqwest::Url;
|
||||
use testing_framework_core::{
|
||||
scenario::{
|
||||
BlockFeedTask, CleanupGuard, Deployer, MetricsError, ObservabilityCapability, RunContext,
|
||||
Runner, Scenario,
|
||||
BlockFeedTask, CleanupGuard, Deployer, MetricsError, ObservabilityCapability,
|
||||
ObservabilityInputs, RunContext, Runner, Scenario,
|
||||
},
|
||||
topology::generation::GeneratedTopology,
|
||||
};
|
||||
@ -17,13 +16,12 @@ use crate::{
|
||||
cluster::{
|
||||
ClusterEnvironment, NodeClientError, PortSpecs, RemoteReadinessError,
|
||||
build_node_clients, cluster_identifiers, collect_port_specs, ensure_cluster_readiness,
|
||||
install_stack, kill_port_forwards, metrics_handle_from_endpoint,
|
||||
metrics_handle_from_url, wait_for_ports_or_cleanup,
|
||||
install_stack, kill_port_forwards, wait_for_ports_or_cleanup,
|
||||
},
|
||||
helm::HelmError,
|
||||
},
|
||||
lifecycle::{block_feed::spawn_block_feed_with, cleanup::RunnerCleanup},
|
||||
wait::{ClusterWaitError, HostPort, PortForwardHandle},
|
||||
wait::{ClusterWaitError, PortForwardHandle},
|
||||
};
|
||||
|
||||
/// Deploys a scenario into Kubernetes using Helm charts and port-forwards.
|
||||
@ -93,7 +91,7 @@ impl Deployer for K8sDeployer {
|
||||
type Error = K8sRunnerError;
|
||||
|
||||
async fn deploy(&self, scenario: &Scenario) -> Result<Runner, Self::Error> {
|
||||
deploy_with_observability(self, scenario, None, None, None).await
|
||||
deploy_with_observability(self, scenario, None).await
|
||||
}
|
||||
}
|
||||
|
||||
@ -105,31 +103,10 @@ impl Deployer<ObservabilityCapability> for K8sDeployer {
|
||||
&self,
|
||||
scenario: &Scenario<ObservabilityCapability>,
|
||||
) -> Result<Runner, Self::Error> {
|
||||
deploy_with_observability(
|
||||
self,
|
||||
scenario,
|
||||
scenario.capabilities().metrics_query_url.clone(),
|
||||
scenario.capabilities().metrics_query_grafana_url.clone(),
|
||||
scenario.capabilities().metrics_otlp_ingest_url.clone(),
|
||||
)
|
||||
.await
|
||||
deploy_with_observability(self, scenario, Some(scenario.capabilities())).await
|
||||
}
|
||||
}
|
||||
|
||||
fn cluster_prometheus_endpoint(cluster: &Option<ClusterEnvironment>) -> Option<&HostPort> {
|
||||
cluster
|
||||
.as_ref()
|
||||
.expect("cluster must be available")
|
||||
.prometheus_endpoint()
|
||||
}
|
||||
|
||||
fn cluster_grafana_endpoint(cluster: &Option<ClusterEnvironment>) -> Option<&HostPort> {
|
||||
cluster
|
||||
.as_ref()
|
||||
.expect("cluster must be available")
|
||||
.grafana_endpoint()
|
||||
}
|
||||
|
||||
async fn fail_cluster(cluster: &mut Option<ClusterEnvironment>, reason: &str) {
|
||||
if let Some(env) = cluster.as_mut() {
|
||||
env.fail(reason).await;
|
||||
@ -157,59 +134,13 @@ fn ensure_supported_topology(descriptors: &GeneratedTopology) -> Result<(), K8sR
|
||||
async fn deploy_with_observability<Caps>(
|
||||
deployer: &K8sDeployer,
|
||||
scenario: &Scenario<Caps>,
|
||||
metrics_query_url: Option<Url>,
|
||||
metrics_query_grafana_url: Option<Url>,
|
||||
metrics_otlp_ingest_url: Option<Url>,
|
||||
observability: Option<&ObservabilityCapability>,
|
||||
) -> Result<Runner, K8sRunnerError> {
|
||||
let external_prometheus = match metrics_query_url {
|
||||
Some(url) => Some(url),
|
||||
None => match std::env::var("K8S_RUNNER_METRICS_QUERY_URL")
|
||||
.ok()
|
||||
.or_else(|| std::env::var("NOMOS_METRICS_QUERY_URL").ok())
|
||||
// Back-compat:
|
||||
.or_else(|| std::env::var("K8S_RUNNER_EXTERNAL_PROMETHEUS_URL").ok())
|
||||
.or_else(|| std::env::var("NOMOS_EXTERNAL_PROMETHEUS_URL").ok())
|
||||
{
|
||||
Some(raw) if !raw.trim().is_empty() => {
|
||||
Some(Url::parse(raw.trim()).map_err(|err| {
|
||||
MetricsError::new(format!("invalid metrics query url: {err}"))
|
||||
})?)
|
||||
}
|
||||
_ => None,
|
||||
},
|
||||
};
|
||||
|
||||
let external_prometheus_grafana_url = match metrics_query_grafana_url {
|
||||
Some(url) => Some(url),
|
||||
None => match std::env::var("K8S_RUNNER_METRICS_QUERY_GRAFANA_URL")
|
||||
.ok()
|
||||
.or_else(|| std::env::var("NOMOS_METRICS_QUERY_GRAFANA_URL").ok())
|
||||
// Back-compat:
|
||||
.or_else(|| std::env::var("K8S_RUNNER_EXTERNAL_PROMETHEUS_GRAFANA_URL").ok())
|
||||
.or_else(|| std::env::var("NOMOS_EXTERNAL_PROMETHEUS_GRAFANA_URL").ok())
|
||||
{
|
||||
Some(raw) if !raw.trim().is_empty() => Some(Url::parse(raw.trim()).map_err(|err| {
|
||||
MetricsError::new(format!("invalid metrics query grafana url: {err}"))
|
||||
})?),
|
||||
_ => None,
|
||||
},
|
||||
};
|
||||
|
||||
let external_otlp_metrics_endpoint = match metrics_otlp_ingest_url {
|
||||
Some(url) => Some(url),
|
||||
None => match std::env::var("K8S_RUNNER_METRICS_OTLP_INGEST_URL")
|
||||
.ok()
|
||||
.or_else(|| std::env::var("NOMOS_METRICS_OTLP_INGEST_URL").ok())
|
||||
// Back-compat:
|
||||
.or_else(|| std::env::var("K8S_RUNNER_EXTERNAL_OTLP_METRICS_ENDPOINT").ok())
|
||||
.or_else(|| std::env::var("NOMOS_EXTERNAL_OTLP_METRICS_ENDPOINT").ok())
|
||||
{
|
||||
Some(raw) if !raw.trim().is_empty() => Some(Url::parse(raw.trim()).map_err(|err| {
|
||||
MetricsError::new(format!("invalid metrics OTLP ingest url: {err}"))
|
||||
})?),
|
||||
_ => None,
|
||||
},
|
||||
};
|
||||
let env_inputs = ObservabilityInputs::from_env()?;
|
||||
let cap_inputs = observability
|
||||
.map(ObservabilityInputs::from_capability)
|
||||
.unwrap_or_default();
|
||||
let observability = env_inputs.with_overrides(cap_inputs).normalized();
|
||||
|
||||
let descriptors = scenario.topology().clone();
|
||||
let validator_count = descriptors.validators().len();
|
||||
@ -225,9 +156,16 @@ async fn deploy_with_observability<Caps>(
|
||||
executors = executor_count,
|
||||
duration_secs = scenario.duration().as_secs(),
|
||||
readiness_checks = deployer.readiness_checks,
|
||||
metrics_query_url = external_prometheus.as_ref().map(|u| u.as_str()),
|
||||
metrics_query_grafana_url = external_prometheus_grafana_url.as_ref().map(|u| u.as_str()),
|
||||
metrics_otlp_ingest_url = external_otlp_metrics_endpoint.as_ref().map(|u| u.as_str()),
|
||||
metrics_query_url = observability.metrics_query_url.as_ref().map(|u| u.as_str()),
|
||||
metrics_query_grafana_url = observability
|
||||
.metrics_query_grafana_url
|
||||
.as_ref()
|
||||
.map(|u| u.as_str()),
|
||||
metrics_otlp_ingest_url = observability
|
||||
.metrics_otlp_ingest_url
|
||||
.as_ref()
|
||||
.map(|u| u.as_str()),
|
||||
grafana_url = observability.grafana_url.as_ref().map(|u| u.as_str()),
|
||||
"starting k8s deployment"
|
||||
);
|
||||
|
||||
@ -238,9 +176,7 @@ async fn deploy_with_observability<Caps>(
|
||||
&port_specs,
|
||||
&descriptors,
|
||||
deployer.readiness_checks,
|
||||
external_prometheus.as_ref(),
|
||||
external_prometheus_grafana_url.as_ref(),
|
||||
external_otlp_metrics_endpoint.as_ref(),
|
||||
&observability,
|
||||
)
|
||||
.await?,
|
||||
);
|
||||
@ -259,23 +195,11 @@ async fn deploy_with_observability<Caps>(
|
||||
}
|
||||
};
|
||||
|
||||
let telemetry = match external_prometheus.clone() {
|
||||
Some(url) => metrics_handle_from_url(url),
|
||||
None => cluster
|
||||
.as_ref()
|
||||
.and_then(|cluster| cluster.prometheus_endpoint())
|
||||
.ok_or_else(|| MetricsError::new("prometheus endpoint unavailable"))
|
||||
.and_then(metrics_handle_from_endpoint),
|
||||
};
|
||||
let telemetry = match telemetry {
|
||||
let telemetry = match observability.telemetry_handle() {
|
||||
Ok(handle) => handle,
|
||||
Err(err) => {
|
||||
fail_cluster(
|
||||
&mut cluster,
|
||||
"failed to configure prometheus metrics handle",
|
||||
)
|
||||
.await;
|
||||
error!(error = ?err, "failed to configure prometheus metrics handle");
|
||||
fail_cluster(&mut cluster, "failed to configure metrics telemetry handle").await;
|
||||
error!(error = ?err, "failed to configure metrics telemetry handle");
|
||||
return Err(err.into());
|
||||
}
|
||||
};
|
||||
@ -289,36 +213,29 @@ async fn deploy_with_observability<Caps>(
|
||||
}
|
||||
};
|
||||
|
||||
if let Some(url) = external_prometheus.as_ref() {
|
||||
info!(prometheus_url = %url.as_str(), "using external prometheus endpoint");
|
||||
} else if let Some(prometheus) = cluster_prometheus_endpoint(&cluster) {
|
||||
if let Some(url) = observability.metrics_query_url.as_ref() {
|
||||
info!(
|
||||
prometheus_url = %format!("http://{}:{}/", prometheus.host, prometheus.port),
|
||||
"prometheus endpoint available on host"
|
||||
metrics_query_url = %url.as_str(),
|
||||
"metrics query endpoint configured"
|
||||
);
|
||||
}
|
||||
if let Some(grafana) = cluster_grafana_endpoint(&cluster) {
|
||||
info!(
|
||||
grafana_url = %format!("http://{}:{}/", grafana.host, grafana.port),
|
||||
"grafana dashboard available on host"
|
||||
);
|
||||
if let Some(url) = observability.grafana_url.as_ref() {
|
||||
info!(grafana_url = %url.as_str(), "grafana url configured");
|
||||
}
|
||||
|
||||
if std::env::var("TESTNET_PRINT_ENDPOINTS").is_ok() {
|
||||
let prometheus = external_prometheus
|
||||
let prometheus = observability
|
||||
.metrics_query_url
|
||||
.as_ref()
|
||||
.map(|u| u.as_str().to_string())
|
||||
.or_else(|| {
|
||||
cluster_prometheus_endpoint(&cluster)
|
||||
.map(|endpoint| format!("http://{}:{}/", endpoint.host, endpoint.port))
|
||||
})
|
||||
.unwrap_or_else(|| "<disabled>".to_string());
|
||||
let grafana = cluster_grafana_endpoint(&cluster);
|
||||
println!(
|
||||
"TESTNET_ENDPOINTS prometheus={} grafana={}",
|
||||
prometheus,
|
||||
grafana
|
||||
.map(|endpoint| format!("http://{}:{}/", endpoint.host, endpoint.port))
|
||||
observability
|
||||
.grafana_url
|
||||
.as_ref()
|
||||
.map(|u| u.as_str().to_string())
|
||||
.unwrap_or_else(|| "<disabled>".to_string())
|
||||
);
|
||||
|
||||
@ -374,16 +291,9 @@ async fn setup_cluster(
|
||||
specs: &PortSpecs,
|
||||
descriptors: &GeneratedTopology,
|
||||
readiness_checks: bool,
|
||||
external_prometheus: Option<&Url>,
|
||||
external_prometheus_grafana_url: Option<&Url>,
|
||||
external_otlp_metrics_endpoint: Option<&Url>,
|
||||
observability: &ObservabilityInputs,
|
||||
) -> Result<ClusterEnvironment, K8sRunnerError> {
|
||||
let assets = prepare_assets(
|
||||
descriptors,
|
||||
external_prometheus,
|
||||
external_prometheus_grafana_url,
|
||||
external_otlp_metrics_endpoint,
|
||||
)?;
|
||||
let assets = prepare_assets(descriptors, observability.metrics_otlp_ingest_url.as_ref())?;
|
||||
let validators = descriptors.validators().len();
|
||||
let executors = descriptors.executors().len();
|
||||
|
||||
@ -394,19 +304,8 @@ async fn setup_cluster(
|
||||
Some(install_stack(client, &assets, &namespace, &release, validators, executors).await?);
|
||||
|
||||
info!("waiting for helm-managed services to become ready");
|
||||
let cluster_ready = wait_for_ports_or_cleanup(
|
||||
client,
|
||||
&namespace,
|
||||
&release,
|
||||
specs,
|
||||
external_prometheus.is_none() && external_prometheus_grafana_url.is_none(),
|
||||
&mut cleanup_guard,
|
||||
)
|
||||
.await?;
|
||||
|
||||
if let Some(prometheus) = cluster_ready.ports.prometheus.as_ref() {
|
||||
info!(prometheus = ?prometheus, "discovered prometheus endpoint");
|
||||
}
|
||||
let cluster_ready =
|
||||
wait_for_ports_or_cleanup(client, &namespace, &release, specs, &mut cleanup_guard).await?;
|
||||
|
||||
let environment = ClusterEnvironment::new(
|
||||
client.clone(),
|
||||
|
||||
@ -5,6 +5,7 @@ use std::{
|
||||
};
|
||||
|
||||
use anyhow::{Context as _, Result as AnyResult};
|
||||
use nomos_tracing::metrics::otlp::OtlpMetricsConfig;
|
||||
use nomos_tracing_service::MetricsLayer;
|
||||
use reqwest::Url;
|
||||
use serde::Serialize;
|
||||
@ -55,8 +56,6 @@ pub enum AssetsError {
|
||||
MissingKzg { path: PathBuf },
|
||||
#[error("missing Helm chart at {path}; ensure the repository is up-to-date")]
|
||||
MissingChart { path: PathBuf },
|
||||
#[error("missing Grafana dashboards source at {path}")]
|
||||
MissingGrafanaDashboards { path: PathBuf },
|
||||
#[error("failed to create temporary directory for rendered assets: {source}")]
|
||||
TempDir {
|
||||
#[source]
|
||||
@ -92,9 +91,7 @@ fn kzg_mode() -> KzgMode {
|
||||
/// topology.
|
||||
pub fn prepare_assets(
|
||||
topology: &GeneratedTopology,
|
||||
external_prometheus: Option<&Url>,
|
||||
external_prometheus_grafana_url: Option<&Url>,
|
||||
external_otlp_metrics_endpoint: Option<&Url>,
|
||||
metrics_otlp_ingest_url: Option<&Url>,
|
||||
) -> Result<RunnerAssets, AssetsError> {
|
||||
info!(
|
||||
validators = topology.validators().len(),
|
||||
@ -104,13 +101,7 @@ pub fn prepare_assets(
|
||||
|
||||
let root = workspace_root().map_err(|source| AssetsError::WorkspaceRoot { source })?;
|
||||
let kzg_mode = kzg_mode();
|
||||
let cfgsync_yaml = render_cfgsync_config(
|
||||
&root,
|
||||
topology,
|
||||
kzg_mode,
|
||||
external_prometheus,
|
||||
external_otlp_metrics_endpoint,
|
||||
)?;
|
||||
let cfgsync_yaml = render_cfgsync_config(&root, topology, kzg_mode, metrics_otlp_ingest_url)?;
|
||||
|
||||
let tempdir = tempfile::Builder::new()
|
||||
.prefix("nomos-helm-")
|
||||
@ -124,12 +115,7 @@ pub fn prepare_assets(
|
||||
KzgMode::InImage => None,
|
||||
};
|
||||
let chart_path = helm_chart_path()?;
|
||||
sync_grafana_dashboards(&root, &chart_path)?;
|
||||
let values_yaml = render_values_yaml(
|
||||
topology,
|
||||
external_prometheus,
|
||||
external_prometheus_grafana_url,
|
||||
)?;
|
||||
let values_yaml = render_values_yaml(topology)?;
|
||||
let values_file = write_temp_file(tempdir.path(), "values.yaml", values_yaml)?;
|
||||
let image = env::var("NOMOS_TESTNET_IMAGE")
|
||||
.unwrap_or_else(|_| String::from("public.ecr.aws/r4s5t9y4/logos/logos-blockchain:test"));
|
||||
@ -164,81 +150,13 @@ pub fn prepare_assets(
|
||||
}
|
||||
|
||||
const CFGSYNC_K8S_TIMEOUT_SECS: u64 = 300;
|
||||
const DEFAULT_GRAFANA_NODE_PORT: u16 = 30030;
|
||||
const DEFAULT_IN_IMAGE_KZG_PARAMS_PATH: &str = "/opt/nomos/kzg-params/kzgrs_test_params";
|
||||
|
||||
fn sync_grafana_dashboards(root: &Path, chart_path: &Path) -> Result<(), AssetsError> {
|
||||
let source_dir = stack_assets_root(root).join("monitoring/grafana/dashboards");
|
||||
let dest_dir = chart_path.join("grafana/dashboards");
|
||||
|
||||
if !source_dir.exists() {
|
||||
return Err(AssetsError::MissingGrafanaDashboards { path: source_dir });
|
||||
}
|
||||
|
||||
fs::create_dir_all(&dest_dir).map_err(|source| AssetsError::Io {
|
||||
path: dest_dir.clone(),
|
||||
source,
|
||||
})?;
|
||||
|
||||
let mut removed = 0usize;
|
||||
for entry in fs::read_dir(&dest_dir).map_err(|source| AssetsError::Io {
|
||||
path: dest_dir.clone(),
|
||||
source,
|
||||
})? {
|
||||
let entry = entry.map_err(|source| AssetsError::Io {
|
||||
path: dest_dir.clone(),
|
||||
source,
|
||||
})?;
|
||||
let path = entry.path();
|
||||
if path.extension().and_then(|ext| ext.to_str()) != Some("json") {
|
||||
continue;
|
||||
}
|
||||
fs::remove_file(&path).map_err(|source| AssetsError::Io {
|
||||
path: path.clone(),
|
||||
source,
|
||||
})?;
|
||||
removed += 1;
|
||||
}
|
||||
|
||||
let mut copied = 0usize;
|
||||
for entry in fs::read_dir(&source_dir).map_err(|source| AssetsError::Io {
|
||||
path: source_dir.clone(),
|
||||
source,
|
||||
})? {
|
||||
let entry = entry.map_err(|source| AssetsError::Io {
|
||||
path: source_dir.clone(),
|
||||
source,
|
||||
})?;
|
||||
let path = entry.path();
|
||||
if path.extension().and_then(|ext| ext.to_str()) != Some("json") {
|
||||
continue;
|
||||
}
|
||||
let file_name = path.file_name().unwrap_or_default();
|
||||
let dest_path = dest_dir.join(file_name);
|
||||
fs::copy(&path, &dest_path).map_err(|source| AssetsError::Io {
|
||||
path: dest_path.clone(),
|
||||
source,
|
||||
})?;
|
||||
copied += 1;
|
||||
}
|
||||
|
||||
debug!(
|
||||
source = %source_dir.display(),
|
||||
dest = %dest_dir.display(),
|
||||
removed,
|
||||
copied,
|
||||
"synced Grafana dashboards into Helm chart"
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn render_cfgsync_config(
|
||||
root: &Path,
|
||||
topology: &GeneratedTopology,
|
||||
kzg_mode: KzgMode,
|
||||
external_prometheus: Option<&Url>,
|
||||
external_otlp_metrics_endpoint: Option<&Url>,
|
||||
metrics_otlp_ingest_url: Option<&Url>,
|
||||
) -> Result<String, AssetsError> {
|
||||
let cfgsync_template_path = stack_assets_root(root).join("cfgsync.yaml");
|
||||
debug!(path = %cfgsync_template_path.display(), "loading cfgsync template");
|
||||
@ -254,15 +172,11 @@ fn render_cfgsync_config(
|
||||
.unwrap_or_else(|| DEFAULT_IN_IMAGE_KZG_PARAMS_PATH.to_string());
|
||||
}
|
||||
|
||||
let external_metrics_endpoint = match external_otlp_metrics_endpoint {
|
||||
Some(endpoint) => Some(Ok(endpoint.clone())),
|
||||
None => external_prometheus.map(derive_prometheus_otlp_metrics_endpoint),
|
||||
};
|
||||
|
||||
if let Some(endpoint) = external_metrics_endpoint.transpose()? {
|
||||
if let MetricsLayer::Otlp(ref mut config) = cfg.tracing_settings.metrics {
|
||||
config.endpoint = endpoint;
|
||||
}
|
||||
if let Some(endpoint) = metrics_otlp_ingest_url.cloned() {
|
||||
cfg.tracing_settings.metrics = MetricsLayer::Otlp(OtlpMetricsConfig {
|
||||
endpoint,
|
||||
host_identifier: "node".into(),
|
||||
});
|
||||
}
|
||||
|
||||
cfg.timeout = cfg.timeout.max(CFGSYNC_K8S_TIMEOUT_SECS);
|
||||
@ -270,16 +184,6 @@ fn render_cfgsync_config(
|
||||
render_cfgsync_yaml(&cfg).map_err(|source| AssetsError::Cfgsync { source })
|
||||
}
|
||||
|
||||
fn derive_prometheus_otlp_metrics_endpoint(base: &Url) -> Result<Url, AssetsError> {
|
||||
let base = base.as_str().trim_end_matches('/');
|
||||
let otlp_metrics = format!("{base}/api/v1/otlp/v1/metrics");
|
||||
Url::parse(&otlp_metrics).map_err(|source| AssetsError::Cfgsync {
|
||||
source: anyhow::anyhow!(
|
||||
"invalid OTLP metrics endpoint derived from external Prometheus url '{base}': {source}"
|
||||
),
|
||||
})
|
||||
}
|
||||
|
||||
struct ScriptPaths {
|
||||
run_cfgsync: PathBuf,
|
||||
run_shared: PathBuf,
|
||||
@ -337,16 +241,8 @@ fn helm_chart_path() -> Result<PathBuf, AssetsError> {
|
||||
}
|
||||
}
|
||||
|
||||
fn render_values_yaml(
|
||||
topology: &GeneratedTopology,
|
||||
external_prometheus: Option<&Url>,
|
||||
external_prometheus_grafana_url: Option<&Url>,
|
||||
) -> Result<String, AssetsError> {
|
||||
let values = build_values(
|
||||
topology,
|
||||
external_prometheus,
|
||||
external_prometheus_grafana_url,
|
||||
);
|
||||
fn render_values_yaml(topology: &GeneratedTopology) -> Result<String, AssetsError> {
|
||||
let values = build_values(topology);
|
||||
serde_yaml::to_string(&values).map_err(|source| AssetsError::Values { source })
|
||||
}
|
||||
|
||||
@ -402,8 +298,6 @@ struct HelmValues {
|
||||
cfgsync: CfgsyncValues,
|
||||
validators: NodeGroup,
|
||||
executors: NodeGroup,
|
||||
prometheus: PrometheusValues,
|
||||
grafana: GrafanaValues,
|
||||
}
|
||||
|
||||
#[derive(Serialize)]
|
||||
@ -426,72 +320,13 @@ struct NodeValues {
|
||||
env: BTreeMap<String, String>,
|
||||
}
|
||||
|
||||
#[derive(Serialize)]
|
||||
struct PrometheusValues {
|
||||
enabled: bool,
|
||||
#[serde(rename = "externalUrl", skip_serializing_if = "Option::is_none")]
|
||||
external_url: Option<String>,
|
||||
}
|
||||
|
||||
#[derive(Serialize)]
|
||||
struct GrafanaValues {
|
||||
enabled: bool,
|
||||
image: String,
|
||||
#[serde(rename = "imagePullPolicy")]
|
||||
image_pull_policy: String,
|
||||
#[serde(rename = "adminUser")]
|
||||
admin_user: String,
|
||||
#[serde(rename = "adminPassword")]
|
||||
admin_password: String,
|
||||
service: GrafanaServiceValues,
|
||||
}
|
||||
|
||||
#[derive(Serialize)]
|
||||
struct GrafanaServiceValues {
|
||||
#[serde(rename = "type")]
|
||||
type_field: String,
|
||||
#[serde(rename = "nodePort")]
|
||||
node_port: Option<u16>,
|
||||
}
|
||||
|
||||
fn build_values(
|
||||
topology: &GeneratedTopology,
|
||||
external_prometheus: Option<&Url>,
|
||||
external_prometheus_grafana_url: Option<&Url>,
|
||||
) -> HelmValues {
|
||||
fn build_values(topology: &GeneratedTopology) -> HelmValues {
|
||||
let cfgsync = CfgsyncValues {
|
||||
port: cfgsync_port(),
|
||||
};
|
||||
let pol_mode = pol_proof_mode();
|
||||
let image_pull_policy =
|
||||
env::var("NOMOS_TESTNET_IMAGE_PULL_POLICY").unwrap_or_else(|_| "IfNotPresent".into());
|
||||
let grafana_node_port = match kzg_mode() {
|
||||
KzgMode::HostPath => Some(DEFAULT_GRAFANA_NODE_PORT),
|
||||
KzgMode::InImage => env::var("NOMOS_GRAFANA_NODE_PORT").ok().and_then(|value| {
|
||||
value
|
||||
.parse::<u16>()
|
||||
.ok()
|
||||
.filter(|port| *port >= 30000 && *port <= 32767)
|
||||
}),
|
||||
};
|
||||
let grafana = GrafanaValues {
|
||||
enabled: true,
|
||||
image: "grafana/grafana:10.4.1".into(),
|
||||
image_pull_policy: "IfNotPresent".into(),
|
||||
admin_user: "admin".into(),
|
||||
admin_password: "admin".into(),
|
||||
service: GrafanaServiceValues {
|
||||
type_field: "NodePort".into(),
|
||||
node_port: grafana_node_port,
|
||||
},
|
||||
};
|
||||
let prometheus_external_url = external_prometheus_grafana_url
|
||||
.or(external_prometheus)
|
||||
.map(|url| url.as_str().trim_end_matches('/').to_string());
|
||||
let prometheus = PrometheusValues {
|
||||
enabled: prometheus_external_url.is_none(),
|
||||
external_url: prometheus_external_url,
|
||||
};
|
||||
debug!(pol_mode, "rendering Helm values for k8s stack");
|
||||
let validators = topology
|
||||
.validators()
|
||||
@ -578,8 +413,6 @@ fn build_values(
|
||||
count: topology.executors().len(),
|
||||
nodes: executors,
|
||||
},
|
||||
prometheus,
|
||||
grafana,
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -4,7 +4,7 @@ use kube::Client;
|
||||
use reqwest::Url;
|
||||
use testing_framework_core::{
|
||||
nodes::ApiClient,
|
||||
scenario::{CleanupGuard, Metrics, MetricsError, NodeClients, http_probe::NodeRole},
|
||||
scenario::{CleanupGuard, NodeClients, http_probe::NodeRole},
|
||||
topology::{generation::GeneratedTopology, readiness::ReadinessError},
|
||||
};
|
||||
use tracing::{debug, info};
|
||||
@ -15,8 +15,7 @@ use crate::{
|
||||
infrastructure::assets::RunnerAssets,
|
||||
lifecycle::{cleanup::RunnerCleanup, logs::dump_namespace_logs},
|
||||
wait::{
|
||||
ClusterPorts, ClusterReady, HostPort, NodeConfigPorts, PortForwardHandle,
|
||||
wait_for_cluster_ready,
|
||||
ClusterPorts, ClusterReady, NodeConfigPorts, PortForwardHandle, wait_for_cluster_ready,
|
||||
},
|
||||
};
|
||||
|
||||
@ -38,8 +37,6 @@ pub struct ClusterEnvironment {
|
||||
validator_testing_ports: Vec<u16>,
|
||||
executor_api_ports: Vec<u16>,
|
||||
executor_testing_ports: Vec<u16>,
|
||||
prometheus: Option<HostPort>,
|
||||
grafana: Option<HostPort>,
|
||||
port_forwards: Vec<PortForwardHandle>,
|
||||
}
|
||||
|
||||
@ -68,8 +65,6 @@ impl ClusterEnvironment {
|
||||
validator_testing_ports,
|
||||
executor_api_ports,
|
||||
executor_testing_ports,
|
||||
prometheus: ports.prometheus.clone(),
|
||||
grafana: ports.grafana.clone(),
|
||||
port_forwards,
|
||||
}
|
||||
}
|
||||
@ -105,14 +100,6 @@ impl ClusterEnvironment {
|
||||
&self.release
|
||||
}
|
||||
|
||||
pub fn prometheus_endpoint(&self) -> Option<&HostPort> {
|
||||
self.prometheus.as_ref()
|
||||
}
|
||||
|
||||
pub fn grafana_endpoint(&self) -> Option<&HostPort> {
|
||||
self.grafana.as_ref()
|
||||
}
|
||||
|
||||
pub fn validator_ports(&self) -> (&[u16], &[u16]) {
|
||||
(&self.validator_api_ports, &self.validator_testing_ports)
|
||||
}
|
||||
@ -229,16 +216,6 @@ pub fn build_node_clients(cluster: &ClusterEnvironment) -> Result<NodeClients, N
|
||||
Ok(NodeClients::new(validators, executors))
|
||||
}
|
||||
|
||||
pub fn metrics_handle_from_endpoint(endpoint: &HostPort) -> Result<Metrics, MetricsError> {
|
||||
let url = cluster_host_url(&endpoint.host, endpoint.port)
|
||||
.map_err(|err| MetricsError::new(format!("invalid prometheus url: {err}")))?;
|
||||
Metrics::from_prometheus(url)
|
||||
}
|
||||
|
||||
pub fn metrics_handle_from_url(url: Url) -> Result<Metrics, MetricsError> {
|
||||
Metrics::from_prometheus(url)
|
||||
}
|
||||
|
||||
pub async fn ensure_cluster_readiness(
|
||||
descriptors: &GeneratedTopology,
|
||||
cluster: &ClusterEnvironment,
|
||||
@ -324,7 +301,6 @@ pub async fn wait_for_ports_or_cleanup(
|
||||
namespace: &str,
|
||||
release: &str,
|
||||
specs: &PortSpecs,
|
||||
prometheus_enabled: bool,
|
||||
cleanup_guard: &mut Option<RunnerCleanup>,
|
||||
) -> Result<ClusterReady, crate::deployer::K8sRunnerError> {
|
||||
info!(
|
||||
@ -340,13 +316,11 @@ pub async fn wait_for_ports_or_cleanup(
|
||||
release,
|
||||
&specs.validators,
|
||||
&specs.executors,
|
||||
prometheus_enabled,
|
||||
)
|
||||
.await
|
||||
{
|
||||
Ok(ports) => {
|
||||
info!(
|
||||
prometheus = ?ports.ports.prometheus,
|
||||
validator_ports = ?ports.ports.validators,
|
||||
executor_ports = ?ports.ports.executors,
|
||||
"cluster port-forwards established"
|
||||
|
||||
@ -1,36 +0,0 @@
|
||||
use tokio::time::sleep;
|
||||
|
||||
use super::{ClusterWaitError, node_http_probe_timeout, node_http_timeout};
|
||||
use crate::host::node_host;
|
||||
|
||||
const GRAFANA_HTTP_POLL_INTERVAL: std::time::Duration = std::time::Duration::from_secs(1);
|
||||
|
||||
pub async fn wait_for_grafana_http_nodeport(port: u16) -> Result<(), ClusterWaitError> {
|
||||
let host = node_host();
|
||||
wait_for_grafana_http(&host, port, node_http_probe_timeout()).await
|
||||
}
|
||||
|
||||
pub async fn wait_for_grafana_http_port_forward(port: u16) -> Result<(), ClusterWaitError> {
|
||||
wait_for_grafana_http("127.0.0.1", port, node_http_timeout()).await
|
||||
}
|
||||
|
||||
async fn wait_for_grafana_http(
|
||||
host: &str,
|
||||
port: u16,
|
||||
timeout: std::time::Duration,
|
||||
) -> Result<(), ClusterWaitError> {
|
||||
let client = reqwest::Client::new();
|
||||
let url = format!("http://{host}:{port}/api/health");
|
||||
|
||||
let attempts = timeout.as_secs();
|
||||
for _ in 0..attempts {
|
||||
if let Ok(resp) = client.get(&url).send().await
|
||||
&& resp.status().is_success()
|
||||
{
|
||||
return Ok(());
|
||||
}
|
||||
sleep(GRAFANA_HTTP_POLL_INTERVAL).await;
|
||||
}
|
||||
|
||||
Err(ClusterWaitError::GrafanaTimeout { port })
|
||||
}
|
||||
@ -4,9 +4,7 @@ use kube::Error as KubeError;
|
||||
use testing_framework_core::{
|
||||
constants::{
|
||||
DEFAULT_HTTP_POLL_INTERVAL, DEFAULT_K8S_DEPLOYMENT_TIMEOUT,
|
||||
DEFAULT_NODE_HTTP_PROBE_TIMEOUT, DEFAULT_NODE_HTTP_TIMEOUT, DEFAULT_PROMETHEUS_HTTP_PORT,
|
||||
DEFAULT_PROMETHEUS_HTTP_PROBE_TIMEOUT, DEFAULT_PROMETHEUS_HTTP_TIMEOUT,
|
||||
DEFAULT_PROMETHEUS_SERVICE_NAME,
|
||||
DEFAULT_NODE_HTTP_PROBE_TIMEOUT, DEFAULT_NODE_HTTP_TIMEOUT,
|
||||
},
|
||||
scenario::http_probe::NodeRole,
|
||||
};
|
||||
@ -14,11 +12,9 @@ use thiserror::Error;
|
||||
|
||||
mod deployment;
|
||||
mod forwarding;
|
||||
mod grafana;
|
||||
mod http_probe;
|
||||
mod orchestrator;
|
||||
mod ports;
|
||||
mod prometheus;
|
||||
|
||||
pub use forwarding::PortForwardHandle;
|
||||
pub use orchestrator::wait_for_cluster_ready;
|
||||
@ -44,15 +40,13 @@ pub struct HostPort {
|
||||
pub port: u16,
|
||||
}
|
||||
|
||||
/// All port assignments for the cluster plus Prometheus.
|
||||
/// All port assignments for the cluster.
|
||||
#[derive(Debug)]
|
||||
pub struct ClusterPorts {
|
||||
pub validators: Vec<NodePortAllocation>,
|
||||
pub executors: Vec<NodePortAllocation>,
|
||||
pub validator_host: String,
|
||||
pub executor_host: String,
|
||||
pub prometheus: Option<HostPort>,
|
||||
pub grafana: Option<HostPort>,
|
||||
}
|
||||
|
||||
/// Success result from waiting for the cluster: host ports and forward handles.
|
||||
@ -96,10 +90,6 @@ pub enum ClusterWaitError {
|
||||
port: u16,
|
||||
timeout: Duration,
|
||||
},
|
||||
#[error("timeout waiting for prometheus readiness on NodePort {port}")]
|
||||
PrometheusTimeout { port: u16 },
|
||||
#[error("timeout waiting for grafana readiness on port {port}")]
|
||||
GrafanaTimeout { port: u16 },
|
||||
#[error("failed to start port-forward for service {service} port {port}: {source}")]
|
||||
PortForward {
|
||||
service: String,
|
||||
@ -149,32 +139,6 @@ pub(crate) fn http_poll_interval() -> Duration {
|
||||
*HTTP_POLL_INTERVAL
|
||||
}
|
||||
|
||||
pub(crate) const PROMETHEUS_HTTP_PORT: u16 = DEFAULT_PROMETHEUS_HTTP_PORT;
|
||||
|
||||
static PROMETHEUS_HTTP_TIMEOUT: LazyLock<Duration> = LazyLock::new(|| {
|
||||
env_duration_secs(
|
||||
"K8S_RUNNER_PROMETHEUS_HTTP_TIMEOUT_SECS",
|
||||
DEFAULT_PROMETHEUS_HTTP_TIMEOUT,
|
||||
)
|
||||
});
|
||||
|
||||
static PROMETHEUS_HTTP_PROBE_TIMEOUT: LazyLock<Duration> = LazyLock::new(|| {
|
||||
env_duration_secs(
|
||||
"K8S_RUNNER_PROMETHEUS_HTTP_PROBE_TIMEOUT_SECS",
|
||||
DEFAULT_PROMETHEUS_HTTP_PROBE_TIMEOUT,
|
||||
)
|
||||
});
|
||||
|
||||
pub(crate) fn prometheus_http_timeout() -> Duration {
|
||||
*PROMETHEUS_HTTP_TIMEOUT
|
||||
}
|
||||
|
||||
pub(crate) fn prometheus_http_probe_timeout() -> Duration {
|
||||
*PROMETHEUS_HTTP_PROBE_TIMEOUT
|
||||
}
|
||||
|
||||
pub(crate) const PROMETHEUS_SERVICE_NAME: &str = DEFAULT_PROMETHEUS_SERVICE_NAME;
|
||||
|
||||
fn env_duration_secs(key: &str, default: Duration) -> Duration {
|
||||
env::var(key)
|
||||
.ok()
|
||||
|
||||
@ -1,31 +1,20 @@
|
||||
use kube::Client;
|
||||
use testing_framework_core::scenario::http_probe::NodeRole;
|
||||
|
||||
use super::{
|
||||
ClusterPorts, ClusterReady, ClusterWaitError, HostPort, NodeConfigPorts, PROMETHEUS_HTTP_PORT,
|
||||
PROMETHEUS_SERVICE_NAME, prometheus_http_probe_timeout,
|
||||
};
|
||||
use super::{ClusterPorts, ClusterReady, ClusterWaitError, NodeConfigPorts};
|
||||
use crate::lifecycle::wait::{
|
||||
deployment::wait_for_deployment_ready,
|
||||
forwarding::{
|
||||
PortForwardHandle, PortForwardSpawn, kill_port_forwards, port_forward_group,
|
||||
port_forward_service,
|
||||
},
|
||||
grafana::{wait_for_grafana_http_nodeport, wait_for_grafana_http_port_forward},
|
||||
forwarding::{PortForwardHandle, kill_port_forwards, port_forward_group},
|
||||
http_probe::{wait_for_node_http_nodeport, wait_for_node_http_port_forward},
|
||||
ports::{discover_node_ports, find_node_port},
|
||||
prometheus::{wait_for_prometheus_http_nodeport, wait_for_prometheus_http_port_forward},
|
||||
ports::discover_node_ports,
|
||||
};
|
||||
|
||||
const GRAFANA_HTTP_PORT: u16 = 3000;
|
||||
|
||||
pub async fn wait_for_cluster_ready(
|
||||
client: &Client,
|
||||
namespace: &str,
|
||||
release: &str,
|
||||
validator_ports: &[NodeConfigPorts],
|
||||
executor_ports: &[NodeConfigPorts],
|
||||
prometheus_enabled: bool,
|
||||
) -> Result<ClusterReady, ClusterWaitError> {
|
||||
if validator_ports.is_empty() {
|
||||
return Err(ClusterWaitError::MissingValidator);
|
||||
@ -108,75 +97,12 @@ pub async fn wait_for_cluster_ready(
|
||||
}
|
||||
}
|
||||
|
||||
let mut prometheus = None;
|
||||
if prometheus_enabled {
|
||||
let mut prometheus_port = find_node_port(
|
||||
client,
|
||||
namespace,
|
||||
PROMETHEUS_SERVICE_NAME,
|
||||
PROMETHEUS_HTTP_PORT,
|
||||
)
|
||||
.await?;
|
||||
let mut prometheus_host = crate::host::node_host();
|
||||
if wait_for_prometheus_http_nodeport(prometheus_port, prometheus_http_probe_timeout())
|
||||
.await
|
||||
.is_err()
|
||||
{
|
||||
let PortForwardSpawn { local_port, handle } =
|
||||
port_forward_service(namespace, PROMETHEUS_SERVICE_NAME, PROMETHEUS_HTTP_PORT)
|
||||
.map_err(|err| {
|
||||
kill_port_forwards(&mut port_forwards);
|
||||
err
|
||||
})?;
|
||||
prometheus_port = local_port;
|
||||
prometheus_host = "127.0.0.1".to_owned();
|
||||
port_forwards.push(handle);
|
||||
if let Err(err) = wait_for_prometheus_http_port_forward(prometheus_port).await {
|
||||
return Err(cleanup_port_forwards(&mut port_forwards, err));
|
||||
}
|
||||
}
|
||||
prometheus = Some(HostPort {
|
||||
host: prometheus_host,
|
||||
port: prometheus_port,
|
||||
});
|
||||
}
|
||||
|
||||
let mut grafana = None;
|
||||
let grafana_service = format!("{release}-grafana");
|
||||
if let Ok(node_port) =
|
||||
find_node_port(client, namespace, &grafana_service, GRAFANA_HTTP_PORT).await
|
||||
{
|
||||
let mut grafana_host = crate::host::node_host();
|
||||
let mut grafana_port = node_port;
|
||||
if wait_for_grafana_http_nodeport(grafana_port).await.is_err() {
|
||||
let PortForwardSpawn { local_port, handle } =
|
||||
port_forward_service(namespace, &grafana_service, GRAFANA_HTTP_PORT).map_err(
|
||||
|err| {
|
||||
kill_port_forwards(&mut port_forwards);
|
||||
err
|
||||
},
|
||||
)?;
|
||||
grafana_host = "127.0.0.1".to_owned();
|
||||
grafana_port = local_port;
|
||||
port_forwards.push(handle);
|
||||
if let Err(err) = wait_for_grafana_http_port_forward(grafana_port).await {
|
||||
return Err(cleanup_port_forwards(&mut port_forwards, err));
|
||||
}
|
||||
}
|
||||
grafana = Some(HostPort {
|
||||
host: grafana_host,
|
||||
port: grafana_port,
|
||||
});
|
||||
}
|
||||
|
||||
Ok(ClusterReady {
|
||||
ports: ClusterPorts {
|
||||
validators: validator_allocations,
|
||||
executors: executor_allocations,
|
||||
validator_host,
|
||||
executor_host,
|
||||
prometheus,
|
||||
grafana,
|
||||
},
|
||||
port_forwards,
|
||||
})
|
||||
|
||||
@ -1,39 +0,0 @@
|
||||
use tokio::time::sleep;
|
||||
|
||||
use super::{ClusterWaitError, prometheus_http_timeout};
|
||||
use crate::host::node_host;
|
||||
|
||||
const PROMETHEUS_HTTP_POLL_INTERVAL: std::time::Duration = std::time::Duration::from_secs(1);
|
||||
|
||||
pub async fn wait_for_prometheus_http_nodeport(
|
||||
port: u16,
|
||||
timeout: std::time::Duration,
|
||||
) -> Result<(), ClusterWaitError> {
|
||||
let host = node_host();
|
||||
wait_for_prometheus_http(&host, port, timeout).await
|
||||
}
|
||||
|
||||
pub async fn wait_for_prometheus_http_port_forward(port: u16) -> Result<(), ClusterWaitError> {
|
||||
wait_for_prometheus_http("127.0.0.1", port, prometheus_http_timeout()).await
|
||||
}
|
||||
|
||||
async fn wait_for_prometheus_http(
|
||||
host: &str,
|
||||
port: u16,
|
||||
timeout: std::time::Duration,
|
||||
) -> Result<(), ClusterWaitError> {
|
||||
let client = reqwest::Client::new();
|
||||
let url = format!("http://{host}:{port}/-/ready");
|
||||
|
||||
let attempts = timeout.as_secs();
|
||||
for _ in 0..attempts {
|
||||
if let Ok(resp) = client.get(&url).send().await
|
||||
&& resp.status().is_success()
|
||||
{
|
||||
return Ok(());
|
||||
}
|
||||
sleep(PROMETHEUS_HTTP_POLL_INTERVAL).await;
|
||||
}
|
||||
|
||||
Err(ClusterWaitError::PrometheusTimeout { port })
|
||||
}
|
||||
@ -132,6 +132,12 @@ pub trait ObservabilityBuilderExt: Sized {
|
||||
url: &str,
|
||||
) -> CoreScenarioBuilder<ObservabilityCapability>;
|
||||
|
||||
/// Optional Grafana base URL for printing/logging (human access).
|
||||
fn with_grafana_url(self, url: reqwest::Url) -> CoreScenarioBuilder<ObservabilityCapability>;
|
||||
|
||||
/// Convenience wrapper that parses a URL string (panics if invalid).
|
||||
fn with_grafana_url_str(self, url: &str) -> CoreScenarioBuilder<ObservabilityCapability>;
|
||||
|
||||
#[deprecated(note = "use with_metrics_query_url")]
|
||||
fn with_external_prometheus(
|
||||
self,
|
||||
@ -190,6 +196,7 @@ impl ObservabilityBuilderExt for CoreScenarioBuilder<()> {
|
||||
metrics_query_url: Some(url),
|
||||
metrics_query_grafana_url: None,
|
||||
metrics_otlp_ingest_url: None,
|
||||
grafana_url: None,
|
||||
})
|
||||
}
|
||||
|
||||
@ -206,6 +213,7 @@ impl ObservabilityBuilderExt for CoreScenarioBuilder<()> {
|
||||
metrics_query_url: None,
|
||||
metrics_query_grafana_url: None,
|
||||
metrics_otlp_ingest_url: Some(url),
|
||||
grafana_url: None,
|
||||
})
|
||||
}
|
||||
|
||||
@ -225,6 +233,7 @@ impl ObservabilityBuilderExt for CoreScenarioBuilder<()> {
|
||||
metrics_query_url: None,
|
||||
metrics_query_grafana_url: Some(url),
|
||||
metrics_otlp_ingest_url: None,
|
||||
grafana_url: None,
|
||||
})
|
||||
}
|
||||
|
||||
@ -235,6 +244,20 @@ impl ObservabilityBuilderExt for CoreScenarioBuilder<()> {
|
||||
let parsed = reqwest::Url::parse(url).expect("metrics query grafana url must be valid");
|
||||
self.with_metrics_query_grafana_url(parsed)
|
||||
}
|
||||
|
||||
fn with_grafana_url(self, url: reqwest::Url) -> CoreScenarioBuilder<ObservabilityCapability> {
|
||||
self.with_capabilities(ObservabilityCapability {
|
||||
metrics_query_url: None,
|
||||
metrics_query_grafana_url: None,
|
||||
metrics_otlp_ingest_url: None,
|
||||
grafana_url: Some(url),
|
||||
})
|
||||
}
|
||||
|
||||
fn with_grafana_url_str(self, url: &str) -> CoreScenarioBuilder<ObservabilityCapability> {
|
||||
let parsed = reqwest::Url::parse(url).expect("grafana url must be valid");
|
||||
self.with_grafana_url(parsed)
|
||||
}
|
||||
}
|
||||
|
||||
impl ObservabilityBuilderExt for CoreScenarioBuilder<ObservabilityCapability> {
|
||||
@ -282,6 +305,19 @@ impl ObservabilityBuilderExt for CoreScenarioBuilder<ObservabilityCapability> {
|
||||
let parsed = reqwest::Url::parse(url).expect("metrics query grafana url must be valid");
|
||||
self.with_metrics_query_grafana_url(parsed)
|
||||
}
|
||||
|
||||
fn with_grafana_url(
|
||||
mut self,
|
||||
url: reqwest::Url,
|
||||
) -> CoreScenarioBuilder<ObservabilityCapability> {
|
||||
self.capabilities_mut().grafana_url = Some(url);
|
||||
self
|
||||
}
|
||||
|
||||
fn with_grafana_url_str(self, url: &str) -> CoreScenarioBuilder<ObservabilityCapability> {
|
||||
let parsed = reqwest::Url::parse(url).expect("grafana url must be valid");
|
||||
self.with_grafana_url(parsed)
|
||||
}
|
||||
}
|
||||
|
||||
/// Builder for transaction workloads.
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user