mirror of
https://github.com/logos-blockchain/logos-blockchain-testing.git
synced 2026-01-04 06:13:09 +00:00
refactor(observability): remove embedded prometheus/grafana
Deployers no longer provision Prometheus/Grafana; metrics query/ingest now come from explicit URLs via env/flags.
This commit is contained in:
parent
3965669f12
commit
e05bf5e0bd
2
Cargo.lock
generated
2
Cargo.lock
generated
@ -7267,6 +7267,7 @@ dependencies = [
|
|||||||
"key-management-system-service",
|
"key-management-system-service",
|
||||||
"nomos-core",
|
"nomos-core",
|
||||||
"nomos-ledger",
|
"nomos-ledger",
|
||||||
|
"nomos-tracing",
|
||||||
"nomos-tracing-service",
|
"nomos-tracing-service",
|
||||||
"reqwest",
|
"reqwest",
|
||||||
"serde",
|
"serde",
|
||||||
@ -7290,6 +7291,7 @@ dependencies = [
|
|||||||
"async-trait",
|
"async-trait",
|
||||||
"k8s-openapi",
|
"k8s-openapi",
|
||||||
"kube",
|
"kube",
|
||||||
|
"nomos-tracing",
|
||||||
"nomos-tracing-service",
|
"nomos-tracing-service",
|
||||||
"reqwest",
|
"reqwest",
|
||||||
"serde",
|
"serde",
|
||||||
|
|||||||
@ -90,7 +90,7 @@ Three deployer implementations:
|
|||||||
| `K8sDeployer` | Kubernetes Helm | Cluster + image loaded | Not yet |
|
| `K8sDeployer` | Kubernetes Helm | Cluster + image loaded | Not yet |
|
||||||
|
|
||||||
**Compose-specific features:**
|
**Compose-specific features:**
|
||||||
- Includes Prometheus at `http://localhost:9090` (override via `TEST_FRAMEWORK_PROMETHEUS_PORT`)
|
- Observability is external (set `NOMOS_METRICS_QUERY_URL` / `NOMOS_METRICS_OTLP_INGEST_URL` / `NOMOS_GRAFANA_URL` as needed)
|
||||||
- Optional OTLP trace/metrics endpoints (`NOMOS_OTLP_ENDPOINT`, `NOMOS_OTLP_METRICS_ENDPOINT`)
|
- Optional OTLP trace/metrics endpoints (`NOMOS_OTLP_ENDPOINT`, `NOMOS_OTLP_METRICS_ENDPOINT`)
|
||||||
- Node control for chaos testing (restart validators/executors)
|
- Node control for chaos testing (restart validators/executors)
|
||||||
|
|
||||||
@ -112,9 +112,9 @@ KZG parameters required for DA workloads:
|
|||||||
|
|
||||||
### Compose Stack
|
### Compose Stack
|
||||||
Templates and configs in `testing-framework/runners/compose/assets/`:
|
Templates and configs in `testing-framework/runners/compose/assets/`:
|
||||||
- `docker-compose.yml.tera` — Stack template (validators, executors, Prometheus, Grafana)
|
- `docker-compose.yml.tera` — Stack template (validators, executors)
|
||||||
- Cfgsync config: `testing-framework/assets/stack/cfgsync.yaml`
|
- Cfgsync config: `testing-framework/assets/stack/cfgsync.yaml`
|
||||||
- Monitoring: `testing-framework/assets/stack/monitoring/prometheus.yml`
|
- Monitoring assets (not deployed by the framework): `testing-framework/assets/stack/monitoring/`
|
||||||
|
|
||||||
## Logging Architecture
|
## Logging Architecture
|
||||||
|
|
||||||
@ -134,14 +134,14 @@ Templates and configs in `testing-framework/runners/compose/assets/`:
|
|||||||
|
|
||||||
## Observability
|
## Observability
|
||||||
|
|
||||||
**Prometheus (Compose + K8s):**
|
**Prometheus-compatible metrics querying (optional):**
|
||||||
- Exposed at `http://localhost:9090` (configurable)
|
- The framework does **not** deploy Prometheus/Grafana.
|
||||||
- Scrapes all validator and executor metrics
|
- Provide a Prometheus-compatible base URL (PromQL API) via `NOMOS_METRICS_QUERY_URL`.
|
||||||
- Accessible in expectations: `ctx.telemetry().prometheus().map(|p| p.base_url())`
|
- Accessible in expectations when configured: `ctx.telemetry().prometheus().map(|p| p.base_url())`
|
||||||
|
|
||||||
**Grafana dashboards (Compose + K8s):**
|
**Grafana dashboards (optional):**
|
||||||
- Provisioned automatically; URL is printed in `TESTNET_ENDPOINTS` when using `scripts/run-examples.sh`
|
- Dashboards live in `testing-framework/assets/stack/monitoring/grafana/dashboards/` and can be imported into your Grafana.
|
||||||
- Default credentials: `admin` / `admin`
|
- If you set `NOMOS_GRAFANA_URL`, the deployer prints it in `TESTNET_ENDPOINTS`.
|
||||||
|
|
||||||
**Node APIs:**
|
**Node APIs:**
|
||||||
- HTTP endpoints per node for consensus info, network status, DA membership
|
- HTTP endpoints per node for consensus info, network status, DA membership
|
||||||
|
|||||||
@ -181,7 +181,9 @@ cargo run -p runner-examples --bin compose_runner
|
|||||||
- `POL_PROOF_DEV_MODE=true` — **Required** for all runners
|
- `POL_PROOF_DEV_MODE=true` — **Required** for all runners
|
||||||
- `NOMOS_DEMO_VALIDATORS=3` / `NOMOS_DEMO_EXECUTORS=2` / `NOMOS_DEMO_RUN_SECS=120` — Topology overrides
|
- `NOMOS_DEMO_VALIDATORS=3` / `NOMOS_DEMO_EXECUTORS=2` / `NOMOS_DEMO_RUN_SECS=120` — Topology overrides
|
||||||
- `COMPOSE_NODE_PAIRS=1x1` — Alternative topology format: "validators×executors"
|
- `COMPOSE_NODE_PAIRS=1x1` — Alternative topology format: "validators×executors"
|
||||||
- `TEST_FRAMEWORK_PROMETHEUS_PORT=9091` — Override Prometheus port (default: 9090)
|
- `NOMOS_METRICS_QUERY_URL` — Prometheus-compatible base URL for the runner process to query (optional)
|
||||||
|
- `NOMOS_METRICS_OTLP_INGEST_URL` — Full OTLP HTTP ingest URL for node metrics export (optional)
|
||||||
|
- `NOMOS_GRAFANA_URL` — Grafana base URL for printing/logging (optional)
|
||||||
- `COMPOSE_RUNNER_HOST=127.0.0.1` — Host address for port mappings
|
- `COMPOSE_RUNNER_HOST=127.0.0.1` — Host address for port mappings
|
||||||
- `COMPOSE_RUNNER_PRESERVE=1` — Keep containers running after test
|
- `COMPOSE_RUNNER_PRESERVE=1` — Keep containers running after test
|
||||||
- `NOMOS_LOG_LEVEL=debug` / `NOMOS_LOG_FILTER=...` — Control node log verbosity (stdout/stderr)
|
- `NOMOS_LOG_LEVEL=debug` / `NOMOS_LOG_FILTER=...` — Control node log verbosity (stdout/stderr)
|
||||||
@ -189,7 +191,7 @@ cargo run -p runner-examples --bin compose_runner
|
|||||||
|
|
||||||
**Compose-specific features:**
|
**Compose-specific features:**
|
||||||
- **Node control support**: Only runner that supports chaos testing (`.enable_node_control()` + chaos workloads)
|
- **Node control support**: Only runner that supports chaos testing (`.enable_node_control()` + chaos workloads)
|
||||||
- **Prometheus observability**: Metrics at `http://localhost:9090`
|
- **Observability is external**: Set `NOMOS_METRICS_*` / `NOMOS_GRAFANA_URL` to enable telemetry links and querying
|
||||||
|
|
||||||
**Important:**
|
**Important:**
|
||||||
- Containers expect KZG parameters at `/kzgrs_test_params/kzgrs_test_params` (note the repeated filename)
|
- Containers expect KZG parameters at `/kzgrs_test_params/kzgrs_test_params` (note the repeated filename)
|
||||||
@ -229,21 +231,30 @@ cargo run -p runner-examples --bin k8s_runner
|
|||||||
- `NOMOS_TESTNET_IMAGE` — Image tag (required)
|
- `NOMOS_TESTNET_IMAGE` — Image tag (required)
|
||||||
- `POL_PROOF_DEV_MODE=true` — **Required** for all runners
|
- `POL_PROOF_DEV_MODE=true` — **Required** for all runners
|
||||||
- `NOMOS_DEMO_VALIDATORS` / `NOMOS_DEMO_EXECUTORS` / `NOMOS_DEMO_RUN_SECS` — Topology overrides
|
- `NOMOS_DEMO_VALIDATORS` / `NOMOS_DEMO_EXECUTORS` / `NOMOS_DEMO_RUN_SECS` — Topology overrides
|
||||||
- `K8S_RUNNER_EXTERNAL_PROMETHEUS_URL` (or `NOMOS_EXTERNAL_PROMETHEUS_URL`) — Reuse an existing Prometheus and skip deploying the in-chart Prometheus; also points node OTLP metrics export and the in-cluster Grafana datasource at that Prometheus
|
- `NOMOS_METRICS_QUERY_URL` — Prometheus-compatible base URL for the runner process to query (PromQL)
|
||||||
|
- `NOMOS_METRICS_OTLP_INGEST_URL` — Full OTLP HTTP ingest URL for node metrics export (optional)
|
||||||
|
- `NOMOS_GRAFANA_URL` — Grafana base URL for printing/logging (optional)
|
||||||
|
|
||||||
**External Prometheus (optional):**
|
**Metrics + Grafana (optional):**
|
||||||
```bash
|
```bash
|
||||||
export K8S_RUNNER_EXTERNAL_PROMETHEUS_URL=http://your-prometheus:9090
|
export NOMOS_METRICS_QUERY_URL=http://your-prometheus:9090
|
||||||
|
# Prometheus OTLP receiver example:
|
||||||
|
export NOMOS_METRICS_OTLP_INGEST_URL=http://your-prometheus:9090/api/v1/otlp/v1/metrics
|
||||||
|
# Optional: print a Grafana link in TESTNET_ENDPOINTS
|
||||||
|
export NOMOS_GRAFANA_URL=http://your-grafana:3000
|
||||||
cargo run -p runner-examples --bin k8s_runner
|
cargo run -p runner-examples --bin k8s_runner
|
||||||
```
|
```
|
||||||
|
|
||||||
Notes:
|
Notes:
|
||||||
- The runner config expects Prometheus to accept OTLP metrics at `/api/v1/otlp/v1/metrics` (the in-chart Prometheus is started with `--web.enable-otlp-receiver` and `--enable-feature=otlp-write-receiver`).
|
- `NOMOS_METRICS_QUERY_URL` must be reachable from the runner process (often via `kubectl port-forward`).
|
||||||
- Use a URL reachable from inside the cluster (for example a `Service` DNS name like `http://prometheus.monitoring:9090`).
|
- `NOMOS_METRICS_OTLP_INGEST_URL` must be reachable from nodes (pods/containers) and is backend-specific (Prometheus vs VictoriaMetrics paths differ).
|
||||||
|
|
||||||
**Via `scripts/run-examples.sh` (optional):**
|
**Via `scripts/run-examples.sh` (optional):**
|
||||||
```bash
|
```bash
|
||||||
scripts/run-examples.sh -t 60 -v 1 -e 1 k8s --external-prometheus http://your-prometheus:9090
|
scripts/run-examples.sh -t 60 -v 1 -e 1 k8s \
|
||||||
|
--metrics-query-url http://your-prometheus:9090 \
|
||||||
|
--metrics-otlp-ingest-url http://your-prometheus:9090/api/v1/otlp/v1/metrics \
|
||||||
|
--grafana-url http://your-grafana:3000
|
||||||
```
|
```
|
||||||
|
|
||||||
**In code (optional):**
|
**In code (optional):**
|
||||||
@ -252,7 +263,8 @@ use testing_framework_core::scenario::ScenarioBuilder;
|
|||||||
use testing_framework_workflows::ObservabilityBuilderExt as _;
|
use testing_framework_workflows::ObservabilityBuilderExt as _;
|
||||||
|
|
||||||
let plan = ScenarioBuilder::with_node_counts(1, 1)
|
let plan = ScenarioBuilder::with_node_counts(1, 1)
|
||||||
.with_external_prometheus_str("http://your-prometheus:9090")
|
.with_metrics_query_url_str("http://your-prometheus:9090")
|
||||||
|
.with_metrics_otlp_ingest_url_str("http://your-prometheus:9090/api/v1/otlp/v1/metrics")
|
||||||
.build();
|
.build();
|
||||||
```
|
```
|
||||||
|
|
||||||
@ -484,7 +496,6 @@ cargo run -p runner-examples --bin compose_runner
|
|||||||
- `COMPOSE_RUNNER_HOST=127.0.0.1` — host used for readiness probes (override for remote Docker daemons / VM networking)
|
- `COMPOSE_RUNNER_HOST=127.0.0.1` — host used for readiness probes (override for remote Docker daemons / VM networking)
|
||||||
- `COMPOSE_RUNNER_HOST_GATEWAY=host.docker.internal:host-gateway` — controls the `extra_hosts` entry injected into compose (set to `disable` to omit)
|
- `COMPOSE_RUNNER_HOST_GATEWAY=host.docker.internal:host-gateway` — controls the `extra_hosts` entry injected into compose (set to `disable` to omit)
|
||||||
- `TESTNET_RUNNER_PRESERVE=1` — alias for `COMPOSE_RUNNER_PRESERVE=1`
|
- `TESTNET_RUNNER_PRESERVE=1` — alias for `COMPOSE_RUNNER_PRESERVE=1`
|
||||||
- `COMPOSE_GRAFANA_PORT=<port>` — pin Grafana to a fixed host port instead of ephemeral assignment
|
|
||||||
- `COMPOSE_RUNNER_HTTP_TIMEOUT_SECS=<secs>` — override compose node HTTP readiness timeout
|
- `COMPOSE_RUNNER_HTTP_TIMEOUT_SECS=<secs>` — override compose node HTTP readiness timeout
|
||||||
|
|
||||||
**Note:** Container names follow pattern `nomos-compose-{uuid}-validator-{index}-1` where `{uuid}` changes per run.
|
**Note:** Container names follow pattern `nomos-compose-{uuid}-validator-{index}-1` where `{uuid}` changes per run.
|
||||||
@ -553,15 +564,15 @@ cargo run -p runner-examples --bin local_runner
|
|||||||
|
|
||||||
Runners expose metrics and node HTTP endpoints for expectation code and debugging:
|
Runners expose metrics and node HTTP endpoints for expectation code and debugging:
|
||||||
|
|
||||||
**Prometheus (Compose + K8s):**
|
**Prometheus-compatible metrics querying (optional):**
|
||||||
- Default: `http://localhost:9090`
|
- The framework does **not** deploy Prometheus.
|
||||||
- Override: `TEST_FRAMEWORK_PROMETHEUS_PORT=9091`
|
- Provide `NOMOS_METRICS_QUERY_URL` (PromQL base URL) to enable `ctx.telemetry()` queries.
|
||||||
- Note: the host port can vary if `9090` is unavailable; prefer the printed `TESTNET_ENDPOINTS` line as the source of truth.
|
- Access from expectations when configured: `ctx.telemetry().prometheus().map(|p| p.base_url())`
|
||||||
- Access from expectations: `ctx.telemetry().prometheus().map(|p| p.base_url())`
|
|
||||||
|
|
||||||
**Grafana dashboards (Compose + K8s):**
|
**Grafana (optional):**
|
||||||
- The deployer prints the Grafana base URL in `TESTNET_ENDPOINTS`.
|
- The framework does **not** deploy Grafana.
|
||||||
- Default credentials are `admin` / `admin`.
|
- If you set `NOMOS_GRAFANA_URL`, the deployer prints it in `TESTNET_ENDPOINTS`.
|
||||||
|
- Dashboards live in `testing-framework/assets/stack/monitoring/grafana/dashboards/` for import into your Grafana.
|
||||||
|
|
||||||
**Node APIs:**
|
**Node APIs:**
|
||||||
- Access from expectations: `ctx.node_clients().validator_clients().get(0)`
|
- Access from expectations: `ctx.node_clients().validator_clients().get(0)`
|
||||||
|
|||||||
@ -109,7 +109,7 @@ async fn run_compose_case(
|
|||||||
};
|
};
|
||||||
|
|
||||||
if !runner.context().telemetry().is_configured() {
|
if !runner.context().telemetry().is_configured() {
|
||||||
warn!("compose runner should expose prometheus metrics");
|
warn!("metrics querying is disabled; set NOMOS_METRICS_QUERY_URL to enable PromQL queries");
|
||||||
}
|
}
|
||||||
|
|
||||||
info!("running scenario");
|
info!("running scenario");
|
||||||
|
|||||||
@ -104,7 +104,7 @@ async fn run_k8s_case(validators: usize, executors: usize, run_duration: Duratio
|
|||||||
};
|
};
|
||||||
|
|
||||||
if !runner.context().telemetry().is_configured() {
|
if !runner.context().telemetry().is_configured() {
|
||||||
warn!("k8s runner should expose prometheus metrics");
|
warn!("metrics querying is disabled; set NOMOS_METRICS_QUERY_URL to enable PromQL queries");
|
||||||
}
|
}
|
||||||
|
|
||||||
let validator_clients = runner.context().node_clients().validator_clients().to_vec();
|
let validator_clients = runner.context().node_clients().validator_clients().to_vec();
|
||||||
|
|||||||
@ -38,12 +38,13 @@ Options:
|
|||||||
-v, --validators N Number of validators (required)
|
-v, --validators N Number of validators (required)
|
||||||
-e, --executors N Number of executors (required)
|
-e, --executors N Number of executors (required)
|
||||||
--bundle PATH Convenience alias for setting NOMOS_BINARIES_TAR=PATH
|
--bundle PATH Convenience alias for setting NOMOS_BINARIES_TAR=PATH
|
||||||
--metrics-query-url URL (k8s) PromQL base URL the runner process can query (often localhost port-forward)
|
--metrics-query-url URL PromQL base URL the runner process can query (optional)
|
||||||
--metrics-query-grafana-url URL (k8s) PromQL base URL reachable from inside the cluster (Grafana datasource)
|
--metrics-query-grafana-url URL PromQL base URL for a Grafana datasource (optional)
|
||||||
--metrics-otlp-ingest-url URL (k8s) Full OTLP HTTP ingest URL for node metrics export
|
--metrics-otlp-ingest-url URL Full OTLP HTTP ingest URL for node metrics export (optional)
|
||||||
--external-prometheus URL (k8s) Alias for --metrics-query-url
|
--grafana-url URL Grafana base URL for printing/logging (optional)
|
||||||
--external-prometheus-grafana-url URL (k8s) Alias for --metrics-query-grafana-url
|
--external-prometheus URL Alias for --metrics-query-url
|
||||||
--external-otlp-metrics-endpoint URL (k8s) Alias for --metrics-otlp-ingest-url
|
--external-prometheus-grafana-url URL Alias for --metrics-query-grafana-url
|
||||||
|
--external-otlp-metrics-endpoint URL Alias for --metrics-otlp-ingest-url
|
||||||
--local Use a local Docker image tag (default for docker-desktop k8s)
|
--local Use a local Docker image tag (default for docker-desktop k8s)
|
||||||
--ecr Use an ECR image reference (default for non-docker-desktop k8s)
|
--ecr Use an ECR image reference (default for non-docker-desktop k8s)
|
||||||
--no-image-build Skip rebuilding the compose/k8s image (sets NOMOS_SKIP_IMAGE_BUILD=1)
|
--no-image-build Skip rebuilding the compose/k8s image (sets NOMOS_SKIP_IMAGE_BUILD=1)
|
||||||
@ -64,6 +65,8 @@ Environment:
|
|||||||
NOMOS_METRICS_QUERY_GRAFANA_URL Alias for K8S_RUNNER_METRICS_QUERY_GRAFANA_URL
|
NOMOS_METRICS_QUERY_GRAFANA_URL Alias for K8S_RUNNER_METRICS_QUERY_GRAFANA_URL
|
||||||
K8S_RUNNER_METRICS_OTLP_INGEST_URL Full OTLP HTTP ingest URL for node metrics export
|
K8S_RUNNER_METRICS_OTLP_INGEST_URL Full OTLP HTTP ingest URL for node metrics export
|
||||||
NOMOS_METRICS_OTLP_INGEST_URL Alias for K8S_RUNNER_METRICS_OTLP_INGEST_URL
|
NOMOS_METRICS_OTLP_INGEST_URL Alias for K8S_RUNNER_METRICS_OTLP_INGEST_URL
|
||||||
|
K8S_RUNNER_GRAFANA_URL Grafana base URL for printing/logging (optional)
|
||||||
|
NOMOS_GRAFANA_URL Alias for K8S_RUNNER_GRAFANA_URL
|
||||||
|
|
||||||
Deprecated env vars (still supported):
|
Deprecated env vars (still supported):
|
||||||
K8S_RUNNER_EXTERNAL_PROMETHEUS_URL, NOMOS_EXTERNAL_PROMETHEUS_URL
|
K8S_RUNNER_EXTERNAL_PROMETHEUS_URL, NOMOS_EXTERNAL_PROMETHEUS_URL
|
||||||
@ -115,6 +118,7 @@ run_examples::parse_args() {
|
|||||||
METRICS_QUERY_URL=""
|
METRICS_QUERY_URL=""
|
||||||
METRICS_QUERY_GRAFANA_URL=""
|
METRICS_QUERY_GRAFANA_URL=""
|
||||||
METRICS_OTLP_INGEST_URL=""
|
METRICS_OTLP_INGEST_URL=""
|
||||||
|
GRAFANA_URL=""
|
||||||
|
|
||||||
RUN_SECS_RAW_SPECIFIED=""
|
RUN_SECS_RAW_SPECIFIED=""
|
||||||
|
|
||||||
@ -184,6 +188,14 @@ run_examples::parse_args() {
|
|||||||
METRICS_OTLP_INGEST_URL="${1#*=}"
|
METRICS_OTLP_INGEST_URL="${1#*=}"
|
||||||
shift
|
shift
|
||||||
;;
|
;;
|
||||||
|
--grafana-url)
|
||||||
|
GRAFANA_URL="${2:-}"
|
||||||
|
shift 2
|
||||||
|
;;
|
||||||
|
--grafana-url=*)
|
||||||
|
GRAFANA_URL="${1#*=}"
|
||||||
|
shift
|
||||||
|
;;
|
||||||
--external-prometheus)
|
--external-prometheus)
|
||||||
METRICS_QUERY_URL="${2:-}"
|
METRICS_QUERY_URL="${2:-}"
|
||||||
shift 2
|
shift 2
|
||||||
@ -262,18 +274,6 @@ run_examples::parse_args() {
|
|||||||
run_examples::fail_with_usage "executors must be a non-negative integer (pass -e/--executors)"
|
run_examples::fail_with_usage "executors must be a non-negative integer (pass -e/--executors)"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
if [ -n "${METRICS_QUERY_URL}" ] && [ "${MODE}" != "k8s" ]; then
|
|
||||||
echo "Warning: --metrics-query-url is only used in k8s mode; ignoring." >&2
|
|
||||||
METRICS_QUERY_URL=""
|
|
||||||
fi
|
|
||||||
if [ -n "${METRICS_QUERY_GRAFANA_URL}" ] && [ "${MODE}" != "k8s" ]; then
|
|
||||||
echo "Warning: --metrics-query-grafana-url is only used in k8s mode; ignoring." >&2
|
|
||||||
METRICS_QUERY_GRAFANA_URL=""
|
|
||||||
fi
|
|
||||||
if [ -n "${METRICS_OTLP_INGEST_URL}" ] && [ "${MODE}" != "k8s" ]; then
|
|
||||||
echo "Warning: --metrics-otlp-ingest-url is only used in k8s mode; ignoring." >&2
|
|
||||||
METRICS_OTLP_INGEST_URL=""
|
|
||||||
fi
|
|
||||||
}
|
}
|
||||||
|
|
||||||
run_examples::select_image() {
|
run_examples::select_image() {
|
||||||
@ -582,17 +582,21 @@ run_examples::run() {
|
|||||||
export NOMOS_DEMO_VALIDATORS="${DEMO_VALIDATORS}"
|
export NOMOS_DEMO_VALIDATORS="${DEMO_VALIDATORS}"
|
||||||
export NOMOS_DEMO_EXECUTORS="${DEMO_EXECUTORS}"
|
export NOMOS_DEMO_EXECUTORS="${DEMO_EXECUTORS}"
|
||||||
|
|
||||||
if [ "${MODE}" = "k8s" ] && [ -n "${METRICS_QUERY_URL}" ]; then
|
if [ -n "${METRICS_QUERY_URL}" ]; then
|
||||||
export K8S_RUNNER_METRICS_QUERY_URL="${METRICS_QUERY_URL}"
|
|
||||||
export NOMOS_METRICS_QUERY_URL="${METRICS_QUERY_URL}"
|
export NOMOS_METRICS_QUERY_URL="${METRICS_QUERY_URL}"
|
||||||
|
export K8S_RUNNER_METRICS_QUERY_URL="${METRICS_QUERY_URL}"
|
||||||
fi
|
fi
|
||||||
if [ "${MODE}" = "k8s" ] && [ -n "${METRICS_QUERY_GRAFANA_URL}" ]; then
|
if [ -n "${METRICS_QUERY_GRAFANA_URL}" ]; then
|
||||||
export K8S_RUNNER_METRICS_QUERY_GRAFANA_URL="${METRICS_QUERY_GRAFANA_URL}"
|
|
||||||
export NOMOS_METRICS_QUERY_GRAFANA_URL="${METRICS_QUERY_GRAFANA_URL}"
|
export NOMOS_METRICS_QUERY_GRAFANA_URL="${METRICS_QUERY_GRAFANA_URL}"
|
||||||
|
export K8S_RUNNER_METRICS_QUERY_GRAFANA_URL="${METRICS_QUERY_GRAFANA_URL}"
|
||||||
fi
|
fi
|
||||||
if [ "${MODE}" = "k8s" ] && [ -n "${METRICS_OTLP_INGEST_URL}" ]; then
|
if [ -n "${METRICS_OTLP_INGEST_URL}" ]; then
|
||||||
export K8S_RUNNER_METRICS_OTLP_INGEST_URL="${METRICS_OTLP_INGEST_URL}"
|
|
||||||
export NOMOS_METRICS_OTLP_INGEST_URL="${METRICS_OTLP_INGEST_URL}"
|
export NOMOS_METRICS_OTLP_INGEST_URL="${METRICS_OTLP_INGEST_URL}"
|
||||||
|
export K8S_RUNNER_METRICS_OTLP_INGEST_URL="${METRICS_OTLP_INGEST_URL}"
|
||||||
|
fi
|
||||||
|
if [ -n "${GRAFANA_URL}" ]; then
|
||||||
|
export NOMOS_GRAFANA_URL="${GRAFANA_URL}"
|
||||||
|
export K8S_RUNNER_GRAFANA_URL="${GRAFANA_URL}"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
echo "==> Running ${BIN} for ${RUN_SECS}s (mode=${MODE}, image=${IMAGE})"
|
echo "==> Running ${BIN} for ${RUN_SECS}s (mode=${MODE}, image=${IMAGE})"
|
||||||
|
|||||||
@ -42,8 +42,6 @@ tracing_settings:
|
|||||||
filters:
|
filters:
|
||||||
nomos: debug
|
nomos: debug
|
||||||
cryptarchia: debug
|
cryptarchia: debug
|
||||||
metrics: !Otlp
|
metrics: None
|
||||||
endpoint: http://prometheus:9090/api/v1/otlp/v1/metrics
|
|
||||||
host_identifier: node
|
|
||||||
console: None
|
console: None
|
||||||
level: INFO
|
level: INFO
|
||||||
|
|||||||
@ -21,18 +21,6 @@ pub const DEFAULT_NODE_HTTP_PROBE_TIMEOUT: Duration = Duration::from_secs(30);
|
|||||||
/// Default Kubernetes deployment readiness timeout.
|
/// Default Kubernetes deployment readiness timeout.
|
||||||
pub const DEFAULT_K8S_DEPLOYMENT_TIMEOUT: Duration = Duration::from_secs(180);
|
pub const DEFAULT_K8S_DEPLOYMENT_TIMEOUT: Duration = Duration::from_secs(180);
|
||||||
|
|
||||||
/// Default Prometheus HTTP port.
|
|
||||||
pub const DEFAULT_PROMETHEUS_HTTP_PORT: u16 = 9090;
|
|
||||||
|
|
||||||
/// Default Prometheus HTTP timeout.
|
|
||||||
pub const DEFAULT_PROMETHEUS_HTTP_TIMEOUT: Duration = Duration::from_secs(240);
|
|
||||||
|
|
||||||
/// Default Prometheus HTTP probe timeout for NodePort checks.
|
|
||||||
pub const DEFAULT_PROMETHEUS_HTTP_PROBE_TIMEOUT: Duration = Duration::from_secs(30);
|
|
||||||
|
|
||||||
/// Default Prometheus service name.
|
|
||||||
pub const DEFAULT_PROMETHEUS_SERVICE_NAME: &str = "prometheus";
|
|
||||||
|
|
||||||
/// Default API port used by nodes.
|
/// Default API port used by nodes.
|
||||||
pub const DEFAULT_API_PORT: u16 = 18080;
|
pub const DEFAULT_API_PORT: u16 = 18080;
|
||||||
|
|
||||||
|
|||||||
@ -8,9 +8,6 @@ use super::DynError;
|
|||||||
pub struct NodeControlCapability;
|
pub struct NodeControlCapability;
|
||||||
|
|
||||||
/// Optional observability settings attached to a scenario.
|
/// Optional observability settings attached to a scenario.
|
||||||
///
|
|
||||||
/// Runners may use this to decide whether to provision in-cluster Prometheus or
|
|
||||||
/// reuse an existing endpoint.
|
|
||||||
#[derive(Clone, Debug, Default)]
|
#[derive(Clone, Debug, Default)]
|
||||||
pub struct ObservabilityCapability {
|
pub struct ObservabilityCapability {
|
||||||
/// Prometheus-compatible base URL used by the *runner process* to query
|
/// Prometheus-compatible base URL used by the *runner process* to query
|
||||||
@ -24,6 +21,8 @@ pub struct ObservabilityCapability {
|
|||||||
/// Full OTLP HTTP metrics ingest endpoint used by *nodes* to export metrics
|
/// Full OTLP HTTP metrics ingest endpoint used by *nodes* to export metrics
|
||||||
/// (backend-specific host and path).
|
/// (backend-specific host and path).
|
||||||
pub metrics_otlp_ingest_url: Option<Url>,
|
pub metrics_otlp_ingest_url: Option<Url>,
|
||||||
|
/// Optional Grafana base URL for printing/logging (human access).
|
||||||
|
pub grafana_url: Option<Url>,
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Trait implemented by scenario capability markers to signal whether node
|
/// Trait implemented by scenario capability markers to signal whether node
|
||||||
|
|||||||
@ -5,6 +5,7 @@ pub mod cfgsync;
|
|||||||
mod definition;
|
mod definition;
|
||||||
mod expectation;
|
mod expectation;
|
||||||
pub mod http_probe;
|
pub mod http_probe;
|
||||||
|
mod observability;
|
||||||
mod runtime;
|
mod runtime;
|
||||||
mod workload;
|
mod workload;
|
||||||
|
|
||||||
@ -15,6 +16,7 @@ pub use capabilities::{
|
|||||||
};
|
};
|
||||||
pub use definition::{Builder, Scenario, ScenarioBuilder, TopologyConfigurator};
|
pub use definition::{Builder, Scenario, ScenarioBuilder, TopologyConfigurator};
|
||||||
pub use expectation::Expectation;
|
pub use expectation::Expectation;
|
||||||
|
pub use observability::{ObservabilityCapabilityProvider, ObservabilityInputs};
|
||||||
pub use runtime::{
|
pub use runtime::{
|
||||||
BlockFeed, BlockFeedTask, BlockRecord, BlockStats, CleanupGuard, Deployer, NodeClients,
|
BlockFeed, BlockFeedTask, BlockRecord, BlockStats, CleanupGuard, Deployer, NodeClients,
|
||||||
RunContext, RunHandle, RunMetrics, Runner, ScenarioError,
|
RunContext, RunHandle, RunMetrics, Runner, ScenarioError,
|
||||||
|
|||||||
145
testing-framework/core/src/scenario/observability.rs
Normal file
145
testing-framework/core/src/scenario/observability.rs
Normal file
@ -0,0 +1,145 @@
|
|||||||
|
use std::env;
|
||||||
|
|
||||||
|
use reqwest::Url;
|
||||||
|
|
||||||
|
use super::{Metrics, MetricsError, NodeControlCapability, ObservabilityCapability};
|
||||||
|
|
||||||
|
/// Observability configuration inputs shared by deployers/runners.
|
||||||
|
///
|
||||||
|
/// All fields are optional; missing values only matter when a caller needs the
|
||||||
|
/// corresponding capability (e.g. querying metrics from the runner process).
|
||||||
|
#[derive(Clone, Debug, Default)]
|
||||||
|
pub struct ObservabilityInputs {
|
||||||
|
/// Prometheus-compatible base URL used by the runner process to query
|
||||||
|
/// metrics (PromQL API endpoints).
|
||||||
|
pub metrics_query_url: Option<Url>,
|
||||||
|
/// Prometheus-compatible base URL intended for an in-cluster Grafana
|
||||||
|
/// datasource.
|
||||||
|
pub metrics_query_grafana_url: Option<Url>,
|
||||||
|
/// Full OTLP HTTP metrics ingest endpoint used by nodes to export metrics
|
||||||
|
/// (backend-specific host and path).
|
||||||
|
pub metrics_otlp_ingest_url: Option<Url>,
|
||||||
|
/// Optional Grafana base URL for printing/logging (human access).
|
||||||
|
pub grafana_url: Option<Url>,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Capability helper for deployers that are generic over scenario capability
|
||||||
|
/// markers.
|
||||||
|
pub trait ObservabilityCapabilityProvider {
|
||||||
|
fn observability_capability(&self) -> Option<&ObservabilityCapability>;
|
||||||
|
}
|
||||||
|
|
||||||
|
impl ObservabilityCapabilityProvider for () {
|
||||||
|
fn observability_capability(&self) -> Option<&ObservabilityCapability> {
|
||||||
|
None
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl ObservabilityCapabilityProvider for NodeControlCapability {
|
||||||
|
fn observability_capability(&self) -> Option<&ObservabilityCapability> {
|
||||||
|
None
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl ObservabilityCapabilityProvider for ObservabilityCapability {
|
||||||
|
fn observability_capability(&self) -> Option<&ObservabilityCapability> {
|
||||||
|
Some(self)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl ObservabilityInputs {
|
||||||
|
#[must_use]
|
||||||
|
pub fn from_capability(capabilities: &ObservabilityCapability) -> Self {
|
||||||
|
Self {
|
||||||
|
metrics_query_url: capabilities.metrics_query_url.clone(),
|
||||||
|
metrics_query_grafana_url: capabilities.metrics_query_grafana_url.clone(),
|
||||||
|
metrics_otlp_ingest_url: capabilities.metrics_otlp_ingest_url.clone(),
|
||||||
|
grafana_url: capabilities.grafana_url.clone(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Load observability inputs from environment variables.
|
||||||
|
///
|
||||||
|
/// The `NOMOS_*` namespace applies to all deployers. Runner-specific env
|
||||||
|
/// vars are also accepted as aliases for backwards compatibility.
|
||||||
|
pub fn from_env() -> Result<Self, MetricsError> {
|
||||||
|
Ok(Self {
|
||||||
|
metrics_query_url: read_url_var(&[
|
||||||
|
"NOMOS_METRICS_QUERY_URL",
|
||||||
|
"K8S_RUNNER_METRICS_QUERY_URL",
|
||||||
|
// Back-compat:
|
||||||
|
"K8S_RUNNER_EXTERNAL_PROMETHEUS_URL",
|
||||||
|
"NOMOS_EXTERNAL_PROMETHEUS_URL",
|
||||||
|
])?,
|
||||||
|
metrics_query_grafana_url: read_url_var(&[
|
||||||
|
"NOMOS_METRICS_QUERY_GRAFANA_URL",
|
||||||
|
"K8S_RUNNER_METRICS_QUERY_GRAFANA_URL",
|
||||||
|
// Back-compat:
|
||||||
|
"K8S_RUNNER_EXTERNAL_PROMETHEUS_GRAFANA_URL",
|
||||||
|
"NOMOS_EXTERNAL_PROMETHEUS_GRAFANA_URL",
|
||||||
|
])?,
|
||||||
|
metrics_otlp_ingest_url: read_url_var(&[
|
||||||
|
"NOMOS_METRICS_OTLP_INGEST_URL",
|
||||||
|
"K8S_RUNNER_METRICS_OTLP_INGEST_URL",
|
||||||
|
// Back-compat:
|
||||||
|
"K8S_RUNNER_EXTERNAL_OTLP_METRICS_ENDPOINT",
|
||||||
|
"NOMOS_EXTERNAL_OTLP_METRICS_ENDPOINT",
|
||||||
|
])?,
|
||||||
|
grafana_url: read_url_var(&["NOMOS_GRAFANA_URL", "K8S_RUNNER_GRAFANA_URL"])?,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Apply defaults and fallbacks (pure function).
|
||||||
|
///
|
||||||
|
/// Currently, the only fallback is using `metrics_query_url` as the Grafana
|
||||||
|
/// datasource URL when `metrics_query_grafana_url` is unset.
|
||||||
|
#[must_use]
|
||||||
|
pub fn normalized(mut self) -> Self {
|
||||||
|
if self.metrics_query_grafana_url.is_none() {
|
||||||
|
self.metrics_query_grafana_url = self.metrics_query_url.clone();
|
||||||
|
}
|
||||||
|
self
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Overlay non-empty values from `overrides` onto `self`.
|
||||||
|
#[must_use]
|
||||||
|
pub fn with_overrides(mut self, overrides: Self) -> Self {
|
||||||
|
if overrides.metrics_query_url.is_some() {
|
||||||
|
self.metrics_query_url = overrides.metrics_query_url;
|
||||||
|
}
|
||||||
|
if overrides.metrics_query_grafana_url.is_some() {
|
||||||
|
self.metrics_query_grafana_url = overrides.metrics_query_grafana_url;
|
||||||
|
}
|
||||||
|
if overrides.metrics_otlp_ingest_url.is_some() {
|
||||||
|
self.metrics_otlp_ingest_url = overrides.metrics_otlp_ingest_url;
|
||||||
|
}
|
||||||
|
if overrides.grafana_url.is_some() {
|
||||||
|
self.grafana_url = overrides.grafana_url;
|
||||||
|
}
|
||||||
|
self
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Build the telemetry handle exposed in `RunContext::telemetry()`.
|
||||||
|
pub fn telemetry_handle(&self) -> Result<Metrics, MetricsError> {
|
||||||
|
match self.metrics_query_url.clone() {
|
||||||
|
Some(url) => Metrics::from_prometheus(url),
|
||||||
|
None => Ok(Metrics::empty()),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn read_url_var(keys: &[&'static str]) -> Result<Option<Url>, MetricsError> {
|
||||||
|
for key in keys {
|
||||||
|
let Some(raw) = env::var(key).ok() else {
|
||||||
|
continue;
|
||||||
|
};
|
||||||
|
let raw = raw.trim();
|
||||||
|
if raw.is_empty() {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
return Url::parse(raw)
|
||||||
|
.map(Some)
|
||||||
|
.map_err(|err| MetricsError::new(format!("invalid {key}: {err}")));
|
||||||
|
}
|
||||||
|
Ok(None)
|
||||||
|
}
|
||||||
@ -16,6 +16,8 @@ workspace = true
|
|||||||
anyhow = "1"
|
anyhow = "1"
|
||||||
async-trait = { workspace = true }
|
async-trait = { workspace = true }
|
||||||
cfgsync = { workspace = true }
|
cfgsync = { workspace = true }
|
||||||
|
nomos-tracing = { workspace = true }
|
||||||
|
nomos-tracing-service = { workspace = true }
|
||||||
reqwest = { workspace = true, features = ["json"] }
|
reqwest = { workspace = true, features = ["json"] }
|
||||||
serde = { workspace = true, features = ["derive"] }
|
serde = { workspace = true, features = ["derive"] }
|
||||||
tempfile = { workspace = true }
|
tempfile = { workspace = true }
|
||||||
@ -32,6 +34,5 @@ groth16 = { workspace = true }
|
|||||||
key-management-system-service = { workspace = true }
|
key-management-system-service = { workspace = true }
|
||||||
nomos-core = { workspace = true }
|
nomos-core = { workspace = true }
|
||||||
nomos-ledger = { workspace = true }
|
nomos-ledger = { workspace = true }
|
||||||
nomos-tracing-service = { workspace = true }
|
|
||||||
tests = { workspace = true }
|
tests = { workspace = true }
|
||||||
zksign = { workspace = true }
|
zksign = { workspace = true }
|
||||||
|
|||||||
@ -1,37 +1,4 @@
|
|||||||
services:
|
services:
|
||||||
prometheus:
|
|
||||||
image: prom/prometheus:v3.0.1
|
|
||||||
{% if prometheus.platform %} platform: {{ prometheus.platform }}
|
|
||||||
{% endif %} command:
|
|
||||||
- --config.file=/etc/prometheus/prometheus.yml
|
|
||||||
- --storage.tsdb.retention.time=7d
|
|
||||||
- --web.enable-otlp-receiver
|
|
||||||
- --enable-feature=otlp-write-receiver
|
|
||||||
volumes:
|
|
||||||
- ./stack/monitoring/prometheus.yml:/etc/prometheus/prometheus.yml:z
|
|
||||||
ports:
|
|
||||||
- {{ prometheus.host_port }}
|
|
||||||
restart: on-failure
|
|
||||||
|
|
||||||
grafana:
|
|
||||||
image: grafana/grafana:10.4.1
|
|
||||||
environment:
|
|
||||||
GF_PATHS_CONFIG: /etc/grafana/grafana.ini
|
|
||||||
GF_SECURITY_ADMIN_USER: admin
|
|
||||||
GF_SECURITY_ADMIN_PASSWORD: admin
|
|
||||||
ports:
|
|
||||||
- {{ grafana.host_port }}
|
|
||||||
volumes:
|
|
||||||
- ./stack/monitoring/grafana/datasources.yaml:/etc/grafana/provisioning/datasources/datasources.yaml:z
|
|
||||||
- ./stack/monitoring/grafana/dashboards.yml:/etc/grafana/provisioning/dashboards/dashboards.yaml:z
|
|
||||||
- ./stack/monitoring/grafana/dashboards:/var/lib/grafana/dashboards:ro
|
|
||||||
- ./stack/monitoring/grafana/grafana.ini:/etc/grafana/grafana.ini:ro
|
|
||||||
env_file:
|
|
||||||
- ./stack/monitoring/grafana/plugins.env
|
|
||||||
depends_on:
|
|
||||||
- prometheus
|
|
||||||
restart: on-failure
|
|
||||||
|
|
||||||
{% for node in validators %}
|
{% for node in validators %}
|
||||||
{{ node.name }}:
|
{{ node.name }}:
|
||||||
image: {{ node.image }}
|
image: {{ node.image }}
|
||||||
|
|||||||
@ -6,7 +6,8 @@ pub mod setup;
|
|||||||
|
|
||||||
use async_trait::async_trait;
|
use async_trait::async_trait;
|
||||||
use testing_framework_core::scenario::{
|
use testing_framework_core::scenario::{
|
||||||
BlockFeedTask, CleanupGuard, Deployer, RequiresNodeControl, Runner, Scenario,
|
BlockFeedTask, CleanupGuard, Deployer, ObservabilityCapabilityProvider, RequiresNodeControl,
|
||||||
|
Runner, Scenario,
|
||||||
};
|
};
|
||||||
|
|
||||||
use crate::{errors::ComposeRunnerError, lifecycle::cleanup::RunnerCleanup};
|
use crate::{errors::ComposeRunnerError, lifecycle::cleanup::RunnerCleanup};
|
||||||
@ -41,7 +42,7 @@ impl ComposeDeployer {
|
|||||||
#[async_trait]
|
#[async_trait]
|
||||||
impl<Caps> Deployer<Caps> for ComposeDeployer
|
impl<Caps> Deployer<Caps> for ComposeDeployer
|
||||||
where
|
where
|
||||||
Caps: RequiresNodeControl + Send + Sync,
|
Caps: RequiresNodeControl + ObservabilityCapabilityProvider + Send + Sync,
|
||||||
{
|
{
|
||||||
type Error = ComposeRunnerError;
|
type Error = ComposeRunnerError;
|
||||||
|
|
||||||
|
|||||||
@ -1,7 +1,8 @@
|
|||||||
use std::sync::Arc;
|
use std::sync::Arc;
|
||||||
|
|
||||||
use testing_framework_core::scenario::{
|
use testing_framework_core::scenario::{
|
||||||
NodeControlHandle, RequiresNodeControl, RunContext, Runner, Scenario,
|
NodeControlHandle, ObservabilityCapabilityProvider, ObservabilityInputs, RequiresNodeControl,
|
||||||
|
RunContext, Runner, Scenario,
|
||||||
};
|
};
|
||||||
use tracing::info;
|
use tracing::info;
|
||||||
|
|
||||||
@ -20,7 +21,6 @@ use crate::{
|
|||||||
environment::StackEnvironment,
|
environment::StackEnvironment,
|
||||||
ports::{HostPortMapping, compose_runner_host},
|
ports::{HostPortMapping, compose_runner_host},
|
||||||
},
|
},
|
||||||
lifecycle::readiness::metrics_handle_from_port,
|
|
||||||
};
|
};
|
||||||
|
|
||||||
pub struct DeploymentOrchestrator {
|
pub struct DeploymentOrchestrator {
|
||||||
@ -37,21 +37,35 @@ impl DeploymentOrchestrator {
|
|||||||
scenario: &Scenario<Caps>,
|
scenario: &Scenario<Caps>,
|
||||||
) -> Result<Runner, ComposeRunnerError>
|
) -> Result<Runner, ComposeRunnerError>
|
||||||
where
|
where
|
||||||
Caps: RequiresNodeControl + Send + Sync,
|
Caps: RequiresNodeControl + ObservabilityCapabilityProvider + Send + Sync,
|
||||||
{
|
{
|
||||||
let setup = DeploymentSetup::new(scenario.topology());
|
let setup = DeploymentSetup::new(scenario.topology());
|
||||||
setup.validate_environment().await?;
|
setup.validate_environment().await?;
|
||||||
|
|
||||||
|
let env_inputs = ObservabilityInputs::from_env()?;
|
||||||
|
let cap_inputs = scenario
|
||||||
|
.capabilities()
|
||||||
|
.observability_capability()
|
||||||
|
.map(ObservabilityInputs::from_capability)
|
||||||
|
.unwrap_or_default();
|
||||||
|
let observability = env_inputs.with_overrides(cap_inputs).normalized();
|
||||||
|
|
||||||
let DeploymentContext {
|
let DeploymentContext {
|
||||||
mut environment,
|
mut environment,
|
||||||
descriptors,
|
descriptors,
|
||||||
} = setup.prepare_workspace().await?;
|
} = setup.prepare_workspace(&observability).await?;
|
||||||
|
|
||||||
tracing::info!(
|
tracing::info!(
|
||||||
validators = descriptors.validators().len(),
|
validators = descriptors.validators().len(),
|
||||||
executors = descriptors.executors().len(),
|
executors = descriptors.executors().len(),
|
||||||
duration_secs = scenario.duration().as_secs(),
|
duration_secs = scenario.duration().as_secs(),
|
||||||
readiness_checks = self.deployer.readiness_checks,
|
readiness_checks = self.deployer.readiness_checks,
|
||||||
|
metrics_query_url = observability.metrics_query_url.as_ref().map(|u| u.as_str()),
|
||||||
|
metrics_otlp_ingest_url = observability
|
||||||
|
.metrics_otlp_ingest_url
|
||||||
|
.as_ref()
|
||||||
|
.map(|u| u.as_str()),
|
||||||
|
grafana_url = observability.grafana_url.as_ref().map(|u| u.as_str()),
|
||||||
"compose deployment starting"
|
"compose deployment starting"
|
||||||
);
|
);
|
||||||
|
|
||||||
@ -71,26 +85,31 @@ impl DeploymentOrchestrator {
|
|||||||
let node_clients = client_builder
|
let node_clients = client_builder
|
||||||
.build_node_clients(&descriptors, &host_ports, &host, &mut environment)
|
.build_node_clients(&descriptors, &host_ports, &host, &mut environment)
|
||||||
.await?;
|
.await?;
|
||||||
let telemetry = metrics_handle_from_port(environment.prometheus_port(), &host)?;
|
let telemetry = observability.telemetry_handle()?;
|
||||||
let node_control = self.maybe_node_control::<Caps>(&environment);
|
let node_control = self.maybe_node_control::<Caps>(&environment);
|
||||||
|
|
||||||
info!(
|
if let Some(url) = observability.metrics_query_url.as_ref() {
|
||||||
prometheus_url = %format!("http://{}:{}/", host, environment.prometheus_port()),
|
info!(metrics_query_url = %url.as_str(), "metrics query endpoint configured");
|
||||||
"prometheus endpoint available on host"
|
}
|
||||||
);
|
if let Some(url) = observability.grafana_url.as_ref() {
|
||||||
info!(
|
info!(grafana_url = %url.as_str(), "grafana url configured");
|
||||||
grafana_url = %format!("http://{}:{}/", host, environment.grafana_port()),
|
}
|
||||||
"grafana dashboard available on host"
|
|
||||||
);
|
|
||||||
log_profiling_urls(&host, &host_ports);
|
log_profiling_urls(&host, &host_ports);
|
||||||
|
|
||||||
if std::env::var("TESTNET_PRINT_ENDPOINTS").is_ok() {
|
if std::env::var("TESTNET_PRINT_ENDPOINTS").is_ok() {
|
||||||
|
let prometheus = observability
|
||||||
|
.metrics_query_url
|
||||||
|
.as_ref()
|
||||||
|
.map(|u| u.as_str().to_string())
|
||||||
|
.unwrap_or_else(|| "<disabled>".to_string());
|
||||||
|
let grafana = observability
|
||||||
|
.grafana_url
|
||||||
|
.as_ref()
|
||||||
|
.map(|u| u.as_str().to_string())
|
||||||
|
.unwrap_or_else(|| "<disabled>".to_string());
|
||||||
println!(
|
println!(
|
||||||
"TESTNET_ENDPOINTS prometheus=http://{}:{}/ grafana=http://{}:{}/",
|
"TESTNET_ENDPOINTS prometheus={} grafana={}",
|
||||||
host,
|
prometheus, grafana
|
||||||
environment.prometheus_port(),
|
|
||||||
host,
|
|
||||||
environment.grafana_port()
|
|
||||||
);
|
);
|
||||||
|
|
||||||
print_profiling_urls(&host, &host_ports);
|
print_profiling_urls(&host, &host_ports);
|
||||||
|
|||||||
@ -26,7 +26,6 @@ impl PortManager {
|
|||||||
info!(
|
info!(
|
||||||
validator_ports = ?mapping.validator_api_ports(),
|
validator_ports = ?mapping.validator_api_ports(),
|
||||||
executor_ports = ?mapping.executor_api_ports(),
|
executor_ports = ?mapping.executor_api_ports(),
|
||||||
prometheus_port = environment.prometheus_port(),
|
|
||||||
"resolved container host ports"
|
"resolved container host ports"
|
||||||
);
|
);
|
||||||
Ok(mapping)
|
Ok(mapping)
|
||||||
|
|||||||
@ -1,22 +1,16 @@
|
|||||||
use std::{
|
use testing_framework_core::{
|
||||||
env,
|
scenario::ObservabilityInputs, topology::generation::GeneratedTopology,
|
||||||
net::{Ipv4Addr, TcpListener as StdTcpListener},
|
|
||||||
};
|
};
|
||||||
|
use tracing::info;
|
||||||
use testing_framework_core::topology::generation::GeneratedTopology;
|
|
||||||
use tracing::{debug, info};
|
|
||||||
|
|
||||||
use crate::{
|
use crate::{
|
||||||
docker::ensure_docker_available,
|
docker::ensure_docker_available,
|
||||||
errors::ComposeRunnerError,
|
errors::ComposeRunnerError,
|
||||||
infrastructure::environment::{
|
infrastructure::environment::{
|
||||||
PortReservation, StackEnvironment, ensure_supported_topology, prepare_environment,
|
StackEnvironment, ensure_supported_topology, prepare_environment,
|
||||||
},
|
},
|
||||||
};
|
};
|
||||||
|
|
||||||
pub const PROMETHEUS_PORT_ENV: &str = "TEST_FRAMEWORK_PROMETHEUS_PORT";
|
|
||||||
pub const DEFAULT_PROMETHEUS_PORT: u16 = 9090;
|
|
||||||
|
|
||||||
pub struct DeploymentSetup {
|
pub struct DeploymentSetup {
|
||||||
descriptors: GeneratedTopology,
|
descriptors: GeneratedTopology,
|
||||||
}
|
}
|
||||||
@ -46,26 +40,14 @@ impl DeploymentSetup {
|
|||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
pub async fn prepare_workspace(self) -> Result<DeploymentContext, ComposeRunnerError> {
|
pub async fn prepare_workspace(
|
||||||
let prometheus_env = env::var(PROMETHEUS_PORT_ENV)
|
self,
|
||||||
.ok()
|
observability: &ObservabilityInputs,
|
||||||
.and_then(|raw| raw.parse::<u16>().ok());
|
) -> Result<DeploymentContext, ComposeRunnerError> {
|
||||||
if prometheus_env.is_some() {
|
let environment = prepare_environment(
|
||||||
info!(port = prometheus_env, "using prometheus port from env");
|
&self.descriptors,
|
||||||
}
|
observability.metrics_otlp_ingest_url.as_ref(),
|
||||||
|
)
|
||||||
let prometheus_port = prometheus_env
|
|
||||||
.and_then(|port| reserve_port(port))
|
|
||||||
.or_else(|| allocate_prometheus_port())
|
|
||||||
.unwrap_or_else(|| PortReservation::new(DEFAULT_PROMETHEUS_PORT, None));
|
|
||||||
|
|
||||||
debug!(
|
|
||||||
prometheus_port = prometheus_port.port(),
|
|
||||||
"selected prometheus port"
|
|
||||||
);
|
|
||||||
|
|
||||||
let environment =
|
|
||||||
prepare_environment(&self.descriptors, prometheus_port, prometheus_env.is_some())
|
|
||||||
.await?;
|
.await?;
|
||||||
|
|
||||||
info!(
|
info!(
|
||||||
@ -81,13 +63,3 @@ impl DeploymentSetup {
|
|||||||
})
|
})
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn allocate_prometheus_port() -> Option<PortReservation> {
|
|
||||||
reserve_port(DEFAULT_PROMETHEUS_PORT).or_else(|| reserve_port(0))
|
|
||||||
}
|
|
||||||
|
|
||||||
fn reserve_port(port: u16) -> Option<PortReservation> {
|
|
||||||
let listener = StdTcpListener::bind((Ipv4Addr::LOCALHOST, port)).ok()?;
|
|
||||||
let actual_port = listener.local_addr().ok()?.port();
|
|
||||||
Some(PortReservation::new(actual_port, Some(listener)))
|
|
||||||
}
|
|
||||||
|
|||||||
@ -1,6 +1,6 @@
|
|||||||
use serde::Serialize;
|
use serde::Serialize;
|
||||||
use testing_framework_core::{
|
use testing_framework_core::{
|
||||||
constants::{DEFAULT_CFGSYNC_PORT, DEFAULT_PROMETHEUS_HTTP_PORT, kzg_container_path},
|
constants::{DEFAULT_CFGSYNC_PORT, kzg_container_path},
|
||||||
topology::generation::{GeneratedNodeConfig, GeneratedTopology},
|
topology::generation::{GeneratedNodeConfig, GeneratedTopology},
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -10,18 +10,9 @@ mod node;
|
|||||||
|
|
||||||
pub use node::{EnvEntry, NodeDescriptor};
|
pub use node::{EnvEntry, NodeDescriptor};
|
||||||
|
|
||||||
/// Errors building a compose descriptor from the topology.
|
|
||||||
#[derive(Debug, thiserror::Error)]
|
|
||||||
pub enum DescriptorBuildError {
|
|
||||||
#[error("prometheus port is not configured for compose descriptor")]
|
|
||||||
MissingPrometheusPort,
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Top-level docker-compose descriptor built from a GeneratedTopology.
|
/// Top-level docker-compose descriptor built from a GeneratedTopology.
|
||||||
#[derive(Clone, Debug, Serialize)]
|
#[derive(Clone, Debug, Serialize)]
|
||||||
pub struct ComposeDescriptor {
|
pub struct ComposeDescriptor {
|
||||||
prometheus: PrometheusTemplate,
|
|
||||||
grafana: GrafanaTemplate,
|
|
||||||
validators: Vec<NodeDescriptor>,
|
validators: Vec<NodeDescriptor>,
|
||||||
executors: Vec<NodeDescriptor>,
|
executors: Vec<NodeDescriptor>,
|
||||||
}
|
}
|
||||||
@ -50,8 +41,6 @@ pub struct ComposeDescriptorBuilder<'a> {
|
|||||||
topology: &'a GeneratedTopology,
|
topology: &'a GeneratedTopology,
|
||||||
use_kzg_mount: bool,
|
use_kzg_mount: bool,
|
||||||
cfgsync_port: Option<u16>,
|
cfgsync_port: Option<u16>,
|
||||||
prometheus_port: Option<u16>,
|
|
||||||
grafana_port: Option<u16>,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<'a> ComposeDescriptorBuilder<'a> {
|
impl<'a> ComposeDescriptorBuilder<'a> {
|
||||||
@ -60,8 +49,6 @@ impl<'a> ComposeDescriptorBuilder<'a> {
|
|||||||
topology,
|
topology,
|
||||||
use_kzg_mount: false,
|
use_kzg_mount: false,
|
||||||
cfgsync_port: None,
|
cfgsync_port: None,
|
||||||
prometheus_port: None,
|
|
||||||
grafana_port: None,
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -79,34 +66,12 @@ impl<'a> ComposeDescriptorBuilder<'a> {
|
|||||||
self
|
self
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Finish building the descriptor.
|
||||||
#[must_use]
|
#[must_use]
|
||||||
/// Set host port mapping for Prometheus.
|
pub fn build(self) -> ComposeDescriptor {
|
||||||
pub const fn with_prometheus_port(mut self, port: u16) -> Self {
|
|
||||||
self.prometheus_port = Some(port);
|
|
||||||
self
|
|
||||||
}
|
|
||||||
|
|
||||||
#[must_use]
|
|
||||||
/// Set host port mapping for Grafana.
|
|
||||||
pub const fn with_grafana_port(mut self, port: u16) -> Self {
|
|
||||||
self.grafana_port = Some(port);
|
|
||||||
self
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Finish building the descriptor, erroring if required fields are missing.
|
|
||||||
pub fn build(self) -> Result<ComposeDescriptor, DescriptorBuildError> {
|
|
||||||
let cfgsync_port = self.cfgsync_port.unwrap_or(DEFAULT_CFGSYNC_PORT);
|
let cfgsync_port = self.cfgsync_port.unwrap_or(DEFAULT_CFGSYNC_PORT);
|
||||||
let prometheus_host_port = self
|
|
||||||
.prometheus_port
|
|
||||||
.ok_or(DescriptorBuildError::MissingPrometheusPort)?;
|
|
||||||
let grafana_host_port = self.grafana_port.unwrap_or(0);
|
|
||||||
|
|
||||||
let (image, platform) = resolve_image();
|
let (image, platform) = resolve_image();
|
||||||
// Prometheus image is x86_64-only on some tags; set platform when on arm hosts.
|
|
||||||
let prometheus_platform = match std::env::consts::ARCH {
|
|
||||||
"aarch64" | "arm64" => Some(String::from("linux/arm64")),
|
|
||||||
_ => None,
|
|
||||||
};
|
|
||||||
|
|
||||||
let validators = build_nodes(
|
let validators = build_nodes(
|
||||||
self.topology.validators(),
|
self.topology.validators(),
|
||||||
@ -126,47 +91,11 @@ impl<'a> ComposeDescriptorBuilder<'a> {
|
|||||||
cfgsync_port,
|
cfgsync_port,
|
||||||
);
|
);
|
||||||
|
|
||||||
Ok(ComposeDescriptor {
|
ComposeDescriptor {
|
||||||
prometheus: PrometheusTemplate::new(prometheus_host_port, prometheus_platform),
|
|
||||||
grafana: GrafanaTemplate::new(grafana_host_port),
|
|
||||||
validators,
|
validators,
|
||||||
executors,
|
executors,
|
||||||
})
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Minimal Prometheus service mapping used in the compose template.
|
|
||||||
#[derive(Clone, Debug, Serialize)]
|
|
||||||
pub struct PrometheusTemplate {
|
|
||||||
host_port: String,
|
|
||||||
#[serde(skip_serializing_if = "Option::is_none")]
|
|
||||||
platform: Option<String>,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl PrometheusTemplate {
|
|
||||||
fn new(port: u16, platform: Option<String>) -> Self {
|
|
||||||
Self {
|
|
||||||
host_port: format!("127.0.0.1:{port}:{}", DEFAULT_PROMETHEUS_HTTP_PORT),
|
|
||||||
platform,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Minimal Grafana service mapping used in the compose template.
|
|
||||||
#[derive(Clone, Debug, Serialize)]
|
|
||||||
pub struct GrafanaTemplate {
|
|
||||||
host_port: String,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl GrafanaTemplate {
|
|
||||||
fn new(port: u16) -> Self {
|
|
||||||
let host_port = match port {
|
|
||||||
0 => "127.0.0.1::3000".to_string(), // docker assigns host port
|
|
||||||
_ => format!("127.0.0.1:{port}:3000"),
|
|
||||||
};
|
|
||||||
|
|
||||||
Self { host_port }
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Clone, Copy)]
|
#[derive(Clone, Copy)]
|
||||||
|
|||||||
@ -49,24 +49,6 @@ impl ComposeWorkspace {
|
|||||||
copy_dir_recursive(&scripts_source, &temp.path().join("stack/scripts"))?;
|
copy_dir_recursive(&scripts_source, &temp.path().join("stack/scripts"))?;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Ensure Prometheus config is a file (Docker bind mount fails if a directory
|
|
||||||
// exists).
|
|
||||||
let prometheus_src = stack_source.join("monitoring/prometheus.yml");
|
|
||||||
let prometheus_dst = temp.path().join("stack/monitoring/prometheus.yml");
|
|
||||||
if prometheus_dst.exists() && prometheus_dst.is_dir() {
|
|
||||||
fs::remove_dir_all(&prometheus_dst)
|
|
||||||
.with_context(|| format!("removing bogus dir {}", prometheus_dst.display()))?;
|
|
||||||
}
|
|
||||||
if !prometheus_dst.exists() {
|
|
||||||
fs::copy(&prometheus_src, &prometheus_dst).with_context(|| {
|
|
||||||
format!(
|
|
||||||
"copying prometheus.yml {} -> {}",
|
|
||||||
prometheus_src.display(),
|
|
||||||
prometheus_dst.display()
|
|
||||||
)
|
|
||||||
})?;
|
|
||||||
}
|
|
||||||
|
|
||||||
let kzg_source = repo_root.join("testing-framework/assets/stack/kzgrs_test_params");
|
let kzg_source = repo_root.join("testing-framework/assets/stack/kzgrs_test_params");
|
||||||
let target = temp.path().join("kzgrs_test_params");
|
let target = temp.path().join("kzgrs_test_params");
|
||||||
if kzg_source.exists() {
|
if kzg_source.exists() {
|
||||||
|
|||||||
@ -9,10 +9,7 @@ use testing_framework_core::{
|
|||||||
};
|
};
|
||||||
use url::ParseError;
|
use url::ParseError;
|
||||||
|
|
||||||
use crate::{
|
use crate::{docker::commands::ComposeCommandError, infrastructure::template::TemplateError};
|
||||||
descriptor::DescriptorBuildError, docker::commands::ComposeCommandError,
|
|
||||||
infrastructure::template::TemplateError,
|
|
||||||
};
|
|
||||||
|
|
||||||
#[derive(Debug, thiserror::Error)]
|
#[derive(Debug, thiserror::Error)]
|
||||||
/// Top-level compose runner errors.
|
/// Top-level compose runner errors.
|
||||||
@ -94,11 +91,6 @@ pub enum ConfigError {
|
|||||||
#[source]
|
#[source]
|
||||||
source: anyhow::Error,
|
source: anyhow::Error,
|
||||||
},
|
},
|
||||||
#[error("failed to build compose descriptor: {source}")]
|
|
||||||
Descriptor {
|
|
||||||
#[source]
|
|
||||||
source: DescriptorBuildError,
|
|
||||||
},
|
|
||||||
#[error("failed to render compose template: {source}")]
|
#[error("failed to render compose template: {source}")]
|
||||||
Template {
|
Template {
|
||||||
#[source]
|
#[source]
|
||||||
|
|||||||
@ -1,5 +1,8 @@
|
|||||||
use std::{path::Path, process::Command as StdCommand};
|
use std::{path::Path, process::Command as StdCommand};
|
||||||
|
|
||||||
|
use nomos_tracing::metrics::otlp::OtlpMetricsConfig;
|
||||||
|
use nomos_tracing_service::MetricsLayer;
|
||||||
|
use reqwest::Url;
|
||||||
use testing_framework_core::{
|
use testing_framework_core::{
|
||||||
scenario::cfgsync::{apply_topology_overrides, load_cfgsync_template, write_cfgsync_template},
|
scenario::cfgsync::{apply_topology_overrides, load_cfgsync_template, write_cfgsync_template},
|
||||||
topology::generation::GeneratedTopology,
|
topology::generation::GeneratedTopology,
|
||||||
@ -60,6 +63,7 @@ pub fn update_cfgsync_config(
|
|||||||
topology: &GeneratedTopology,
|
topology: &GeneratedTopology,
|
||||||
use_kzg_mount: bool,
|
use_kzg_mount: bool,
|
||||||
port: u16,
|
port: u16,
|
||||||
|
metrics_otlp_ingest_url: Option<&Url>,
|
||||||
) -> anyhow::Result<()> {
|
) -> anyhow::Result<()> {
|
||||||
debug!(
|
debug!(
|
||||||
path = %path.display(),
|
path = %path.display(),
|
||||||
@ -72,6 +76,12 @@ pub fn update_cfgsync_config(
|
|||||||
let mut cfg = load_cfgsync_template(path)?;
|
let mut cfg = load_cfgsync_template(path)?;
|
||||||
cfg.port = port;
|
cfg.port = port;
|
||||||
apply_topology_overrides(&mut cfg, topology, use_kzg_mount);
|
apply_topology_overrides(&mut cfg, topology, use_kzg_mount);
|
||||||
|
if let Some(endpoint) = metrics_otlp_ingest_url.cloned() {
|
||||||
|
cfg.tracing_settings.metrics = MetricsLayer::Otlp(OtlpMetricsConfig {
|
||||||
|
endpoint,
|
||||||
|
host_identifier: "node".into(),
|
||||||
|
});
|
||||||
|
}
|
||||||
write_cfgsync_template(path, &cfg)?;
|
write_cfgsync_template(path, &cfg)?;
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|||||||
@ -1,20 +1,19 @@
|
|||||||
use std::{
|
use std::{
|
||||||
env,
|
|
||||||
net::{Ipv4Addr, TcpListener as StdTcpListener},
|
net::{Ipv4Addr, TcpListener as StdTcpListener},
|
||||||
path::{Path, PathBuf},
|
path::{Path, PathBuf},
|
||||||
time::Duration,
|
time::Duration,
|
||||||
};
|
};
|
||||||
|
|
||||||
use anyhow::{Context as _, anyhow};
|
use anyhow::anyhow;
|
||||||
|
use reqwest::Url;
|
||||||
use testing_framework_core::{
|
use testing_framework_core::{
|
||||||
adjust_timeout, scenario::CleanupGuard, topology::generation::GeneratedTopology,
|
adjust_timeout, scenario::CleanupGuard, topology::generation::GeneratedTopology,
|
||||||
};
|
};
|
||||||
use tokio::{process::Command, time::timeout};
|
use tokio::process::Command;
|
||||||
use tracing::{debug, error, info, warn};
|
use tracing::{debug, error, info};
|
||||||
use uuid::Uuid;
|
use uuid::Uuid;
|
||||||
|
|
||||||
use crate::{
|
use crate::{
|
||||||
deployer::setup::DEFAULT_PROMETHEUS_PORT,
|
|
||||||
descriptor::ComposeDescriptor,
|
descriptor::ComposeDescriptor,
|
||||||
docker::{
|
docker::{
|
||||||
commands::{compose_up, dump_compose_logs, run_docker_command},
|
commands::{compose_up, dump_compose_logs, run_docker_command},
|
||||||
@ -31,8 +30,6 @@ use crate::{
|
|||||||
};
|
};
|
||||||
|
|
||||||
const CFGSYNC_START_TIMEOUT: Duration = Duration::from_secs(180);
|
const CFGSYNC_START_TIMEOUT: Duration = Duration::from_secs(180);
|
||||||
const COMPOSE_PORT_DISCOVERY_TIMEOUT: Duration = Duration::from_secs(30);
|
|
||||||
const STACK_BRINGUP_MAX_ATTEMPTS: usize = 3;
|
|
||||||
|
|
||||||
/// Paths and flags describing the prepared compose workspace.
|
/// Paths and flags describing the prepared compose workspace.
|
||||||
pub struct WorkspaceState {
|
pub struct WorkspaceState {
|
||||||
@ -49,8 +46,6 @@ pub struct StackEnvironment {
|
|||||||
root: PathBuf,
|
root: PathBuf,
|
||||||
workspace: Option<ComposeWorkspace>,
|
workspace: Option<ComposeWorkspace>,
|
||||||
cfgsync_handle: Option<CfgsyncServerHandle>,
|
cfgsync_handle: Option<CfgsyncServerHandle>,
|
||||||
prometheus_port: u16,
|
|
||||||
grafana_port: u16,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
impl StackEnvironment {
|
impl StackEnvironment {
|
||||||
@ -60,8 +55,6 @@ impl StackEnvironment {
|
|||||||
compose_path: PathBuf,
|
compose_path: PathBuf,
|
||||||
project_name: String,
|
project_name: String,
|
||||||
cfgsync_handle: Option<CfgsyncServerHandle>,
|
cfgsync_handle: Option<CfgsyncServerHandle>,
|
||||||
prometheus_port: u16,
|
|
||||||
grafana_port: u16,
|
|
||||||
) -> Self {
|
) -> Self {
|
||||||
let WorkspaceState {
|
let WorkspaceState {
|
||||||
workspace, root, ..
|
workspace, root, ..
|
||||||
@ -73,8 +66,6 @@ impl StackEnvironment {
|
|||||||
root,
|
root,
|
||||||
workspace: Some(workspace),
|
workspace: Some(workspace),
|
||||||
cfgsync_handle,
|
cfgsync_handle,
|
||||||
prometheus_port,
|
|
||||||
grafana_port,
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -82,16 +73,6 @@ impl StackEnvironment {
|
|||||||
&self.compose_path
|
&self.compose_path
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Host port exposed by Prometheus.
|
|
||||||
pub const fn prometheus_port(&self) -> u16 {
|
|
||||||
self.prometheus_port
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Host port exposed by Grafana.
|
|
||||||
pub const fn grafana_port(&self) -> u16 {
|
|
||||||
self.grafana_port
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Docker compose project name.
|
/// Docker compose project name.
|
||||||
pub fn project_name(&self) -> &str {
|
pub fn project_name(&self) -> &str {
|
||||||
&self.project_name
|
&self.project_name
|
||||||
@ -138,27 +119,6 @@ impl StackEnvironment {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Represents a claimed port, optionally guarded by an open socket.
|
|
||||||
pub struct PortReservation {
|
|
||||||
port: u16,
|
|
||||||
_guard: Option<StdTcpListener>,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl PortReservation {
|
|
||||||
/// Holds a port and an optional socket guard to keep it reserved.
|
|
||||||
pub const fn new(port: u16, guard: Option<StdTcpListener>) -> Self {
|
|
||||||
Self {
|
|
||||||
port,
|
|
||||||
_guard: guard,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// The reserved port number.
|
|
||||||
pub const fn port(&self) -> u16 {
|
|
||||||
self.port
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Verifies the topology has at least one validator so compose can start.
|
/// Verifies the topology has at least one validator so compose can start.
|
||||||
pub fn ensure_supported_topology(
|
pub fn ensure_supported_topology(
|
||||||
descriptors: &GeneratedTopology,
|
descriptors: &GeneratedTopology,
|
||||||
@ -210,10 +170,17 @@ pub fn update_cfgsync_logged(
|
|||||||
workspace: &WorkspaceState,
|
workspace: &WorkspaceState,
|
||||||
descriptors: &GeneratedTopology,
|
descriptors: &GeneratedTopology,
|
||||||
cfgsync_port: u16,
|
cfgsync_port: u16,
|
||||||
|
metrics_otlp_ingest_url: Option<&Url>,
|
||||||
) -> Result<(), ComposeRunnerError> {
|
) -> Result<(), ComposeRunnerError> {
|
||||||
info!(cfgsync_port, "updating cfgsync configuration");
|
info!(cfgsync_port, "updating cfgsync configuration");
|
||||||
|
|
||||||
configure_cfgsync(workspace, descriptors, cfgsync_port).map_err(Into::into)
|
configure_cfgsync(
|
||||||
|
workspace,
|
||||||
|
descriptors,
|
||||||
|
cfgsync_port,
|
||||||
|
metrics_otlp_ingest_url,
|
||||||
|
)
|
||||||
|
.map_err(Into::into)
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Start the cfgsync server container using the generated config.
|
/// Start the cfgsync server container using the generated config.
|
||||||
@ -232,12 +199,14 @@ pub fn configure_cfgsync(
|
|||||||
workspace: &WorkspaceState,
|
workspace: &WorkspaceState,
|
||||||
descriptors: &GeneratedTopology,
|
descriptors: &GeneratedTopology,
|
||||||
cfgsync_port: u16,
|
cfgsync_port: u16,
|
||||||
|
metrics_otlp_ingest_url: Option<&Url>,
|
||||||
) -> Result<(), ConfigError> {
|
) -> Result<(), ConfigError> {
|
||||||
update_cfgsync_config(
|
update_cfgsync_config(
|
||||||
&workspace.cfgsync_path,
|
&workspace.cfgsync_path,
|
||||||
descriptors,
|
descriptors,
|
||||||
workspace.use_kzg,
|
workspace.use_kzg,
|
||||||
cfgsync_port,
|
cfgsync_port,
|
||||||
|
metrics_otlp_ingest_url,
|
||||||
)
|
)
|
||||||
.map_err(|source| ConfigError::Cfgsync {
|
.map_err(|source| ConfigError::Cfgsync {
|
||||||
path: workspace.cfgsync_path.clone(),
|
path: workspace.cfgsync_path.clone(),
|
||||||
@ -328,23 +297,16 @@ pub fn write_compose_artifacts(
|
|||||||
workspace: &WorkspaceState,
|
workspace: &WorkspaceState,
|
||||||
descriptors: &GeneratedTopology,
|
descriptors: &GeneratedTopology,
|
||||||
cfgsync_port: u16,
|
cfgsync_port: u16,
|
||||||
prometheus_port: u16,
|
|
||||||
grafana_port: u16,
|
|
||||||
) -> Result<PathBuf, ConfigError> {
|
) -> Result<PathBuf, ConfigError> {
|
||||||
debug!(
|
debug!(
|
||||||
cfgsync_port,
|
cfgsync_port,
|
||||||
prometheus_port,
|
|
||||||
grafana_port,
|
|
||||||
workspace_root = %workspace.root.display(),
|
workspace_root = %workspace.root.display(),
|
||||||
"building compose descriptor"
|
"building compose descriptor"
|
||||||
);
|
);
|
||||||
let descriptor = ComposeDescriptor::builder(descriptors)
|
let descriptor = ComposeDescriptor::builder(descriptors)
|
||||||
.with_kzg_mount(workspace.use_kzg)
|
.with_kzg_mount(workspace.use_kzg)
|
||||||
.with_cfgsync_port(cfgsync_port)
|
.with_cfgsync_port(cfgsync_port)
|
||||||
.with_prometheus_port(prometheus_port)
|
.build();
|
||||||
.with_grafana_port(grafana_port)
|
|
||||||
.build()
|
|
||||||
.map_err(|source| ConfigError::Descriptor { source })?;
|
|
||||||
|
|
||||||
let compose_path = workspace.root.join("compose.generated.yml");
|
let compose_path = workspace.root.join("compose.generated.yml");
|
||||||
write_compose_file(&descriptor, &compose_path)
|
write_compose_file(&descriptor, &compose_path)
|
||||||
@ -358,21 +320,9 @@ pub fn render_compose_logged(
|
|||||||
workspace: &WorkspaceState,
|
workspace: &WorkspaceState,
|
||||||
descriptors: &GeneratedTopology,
|
descriptors: &GeneratedTopology,
|
||||||
cfgsync_port: u16,
|
cfgsync_port: u16,
|
||||||
prometheus_port: u16,
|
|
||||||
grafana_port: u16,
|
|
||||||
) -> Result<PathBuf, ComposeRunnerError> {
|
) -> Result<PathBuf, ComposeRunnerError> {
|
||||||
info!(
|
info!(cfgsync_port, "rendering compose file");
|
||||||
cfgsync_port,
|
write_compose_artifacts(workspace, descriptors, cfgsync_port).map_err(Into::into)
|
||||||
prometheus_port, grafana_port, "rendering compose file with ports"
|
|
||||||
);
|
|
||||||
write_compose_artifacts(
|
|
||||||
workspace,
|
|
||||||
descriptors,
|
|
||||||
cfgsync_port,
|
|
||||||
prometheus_port,
|
|
||||||
grafana_port,
|
|
||||||
)
|
|
||||||
.map_err(Into::into)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Bring up docker compose; shut down cfgsync if start-up fails.
|
/// Bring up docker compose; shut down cfgsync if start-up fails.
|
||||||
@ -404,46 +354,23 @@ pub async fn bring_up_stack_logged(
|
|||||||
/// Prepare workspace, cfgsync, compose artifacts, and launch the stack.
|
/// Prepare workspace, cfgsync, compose artifacts, and launch the stack.
|
||||||
pub async fn prepare_environment(
|
pub async fn prepare_environment(
|
||||||
descriptors: &GeneratedTopology,
|
descriptors: &GeneratedTopology,
|
||||||
mut prometheus_port: PortReservation,
|
metrics_otlp_ingest_url: Option<&Url>,
|
||||||
prometheus_port_locked: bool,
|
|
||||||
) -> Result<StackEnvironment, ComposeRunnerError> {
|
) -> Result<StackEnvironment, ComposeRunnerError> {
|
||||||
let workspace = prepare_workspace_logged()?;
|
let workspace = prepare_workspace_logged()?;
|
||||||
let cfgsync_port = allocate_cfgsync_port()?;
|
let cfgsync_port = allocate_cfgsync_port()?;
|
||||||
|
update_cfgsync_logged(
|
||||||
let grafana_env = env::var("COMPOSE_GRAFANA_PORT")
|
|
||||||
.ok()
|
|
||||||
.and_then(|raw| raw.parse::<u16>().ok());
|
|
||||||
if let Some(port) = grafana_env {
|
|
||||||
info!(port, "using grafana port from env");
|
|
||||||
}
|
|
||||||
|
|
||||||
update_cfgsync_logged(&workspace, descriptors, cfgsync_port)?;
|
|
||||||
ensure_compose_image().await?;
|
|
||||||
|
|
||||||
let attempts = if prometheus_port_locked {
|
|
||||||
1
|
|
||||||
} else {
|
|
||||||
STACK_BRINGUP_MAX_ATTEMPTS
|
|
||||||
};
|
|
||||||
let mut last_err = None;
|
|
||||||
|
|
||||||
for _ in 0..attempts {
|
|
||||||
let prometheus_port_value = prometheus_port.port();
|
|
||||||
let grafana_port_value = grafana_env.unwrap_or(0);
|
|
||||||
let compose_path = render_compose_logged(
|
|
||||||
&workspace,
|
&workspace,
|
||||||
descriptors,
|
descriptors,
|
||||||
cfgsync_port,
|
cfgsync_port,
|
||||||
prometheus_port_value,
|
metrics_otlp_ingest_url,
|
||||||
grafana_port_value,
|
|
||||||
)?;
|
)?;
|
||||||
|
ensure_compose_image().await?;
|
||||||
|
let compose_path = render_compose_logged(&workspace, descriptors, cfgsync_port)?;
|
||||||
|
|
||||||
let project_name = format!("nomos-compose-{}", Uuid::new_v4());
|
let project_name = format!("nomos-compose-{}", Uuid::new_v4());
|
||||||
let mut cfgsync_handle = start_cfgsync_stage(&workspace, cfgsync_port).await?;
|
let mut cfgsync_handle = start_cfgsync_stage(&workspace, cfgsync_port).await?;
|
||||||
|
|
||||||
drop(prometheus_port);
|
if let Err(err) = bring_up_stack_logged(
|
||||||
|
|
||||||
match bring_up_stack_logged(
|
|
||||||
&compose_path,
|
&compose_path,
|
||||||
&project_name,
|
&project_name,
|
||||||
&workspace.root,
|
&workspace.root,
|
||||||
@ -451,125 +378,22 @@ pub async fn prepare_environment(
|
|||||||
)
|
)
|
||||||
.await
|
.await
|
||||||
{
|
{
|
||||||
Ok(()) => {
|
dump_compose_logs(&compose_path, &project_name, &workspace.root).await;
|
||||||
let grafana_port_resolved = resolve_service_port(
|
cfgsync_handle.shutdown();
|
||||||
&compose_path,
|
return Err(err);
|
||||||
&project_name,
|
}
|
||||||
&workspace.root,
|
|
||||||
"grafana",
|
|
||||||
3000,
|
|
||||||
)
|
|
||||||
.await
|
|
||||||
.unwrap_or(grafana_port_value);
|
|
||||||
|
|
||||||
info!(
|
info!(
|
||||||
project = %project_name,
|
project = %project_name,
|
||||||
compose_file = %compose_path.display(),
|
compose_file = %compose_path.display(),
|
||||||
cfgsync_port,
|
cfgsync_port,
|
||||||
prometheus_port = prometheus_port_value,
|
|
||||||
grafana_port = grafana_port_resolved,
|
|
||||||
"compose stack is up"
|
"compose stack is up"
|
||||||
);
|
);
|
||||||
return Ok(StackEnvironment::from_workspace(
|
|
||||||
|
Ok(StackEnvironment::from_workspace(
|
||||||
workspace,
|
workspace,
|
||||||
compose_path,
|
compose_path,
|
||||||
project_name,
|
project_name,
|
||||||
Some(cfgsync_handle),
|
Some(cfgsync_handle),
|
||||||
prometheus_port_value,
|
))
|
||||||
grafana_port_resolved,
|
|
||||||
));
|
|
||||||
}
|
|
||||||
Err(err) => {
|
|
||||||
// Attempt to capture container logs even when bring-up fails early.
|
|
||||||
dump_compose_logs(&compose_path, &project_name, &workspace.root).await;
|
|
||||||
cfgsync_handle.shutdown();
|
|
||||||
last_err = Some(err);
|
|
||||||
if prometheus_port_locked {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
warn!(
|
|
||||||
error = %last_err.as_ref().unwrap(),
|
|
||||||
"compose bring-up failed; retrying with a new prometheus port"
|
|
||||||
);
|
|
||||||
prometheus_port = allocate_prometheus_port()
|
|
||||||
.unwrap_or_else(|| PortReservation::new(DEFAULT_PROMETHEUS_PORT, None));
|
|
||||||
debug!(
|
|
||||||
next_prometheus_port = prometheus_port.port(),
|
|
||||||
"retrying compose bring-up"
|
|
||||||
);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
Err(last_err.expect("prepare_environment should return or fail with error"))
|
|
||||||
}
|
|
||||||
|
|
||||||
fn allocate_prometheus_port() -> Option<PortReservation> {
|
|
||||||
reserve_prometheus_port(DEFAULT_PROMETHEUS_PORT).or_else(|| reserve_prometheus_port(0))
|
|
||||||
}
|
|
||||||
|
|
||||||
fn reserve_prometheus_port(port: u16) -> Option<PortReservation> {
|
|
||||||
let listener = StdTcpListener::bind((Ipv4Addr::LOCALHOST, port)).ok()?;
|
|
||||||
let actual_port = listener.local_addr().ok()?.port();
|
|
||||||
Some(PortReservation::new(actual_port, Some(listener)))
|
|
||||||
}
|
|
||||||
|
|
||||||
async fn resolve_service_port(
|
|
||||||
compose_file: &Path,
|
|
||||||
project_name: &str,
|
|
||||||
root: &Path,
|
|
||||||
service: &str,
|
|
||||||
container_port: u16,
|
|
||||||
) -> Result<u16, ComposeRunnerError> {
|
|
||||||
let mut cmd = Command::new("docker");
|
|
||||||
cmd.arg("compose")
|
|
||||||
.arg("-f")
|
|
||||||
.arg(compose_file)
|
|
||||||
.arg("-p")
|
|
||||||
.arg(project_name)
|
|
||||||
.arg("port")
|
|
||||||
.arg(service)
|
|
||||||
.arg(container_port.to_string())
|
|
||||||
.current_dir(root);
|
|
||||||
|
|
||||||
let output = timeout(adjust_timeout(COMPOSE_PORT_DISCOVERY_TIMEOUT), cmd.output())
|
|
||||||
.await
|
|
||||||
.map_err(|_| ComposeRunnerError::PortDiscovery {
|
|
||||||
service: service.to_owned(),
|
|
||||||
container_port,
|
|
||||||
source: anyhow!("docker compose port timed out"),
|
|
||||||
})?
|
|
||||||
.with_context(|| format!("running docker compose port {service} {container_port}"))
|
|
||||||
.map_err(|source| ComposeRunnerError::PortDiscovery {
|
|
||||||
service: service.to_owned(),
|
|
||||||
container_port,
|
|
||||||
source,
|
|
||||||
})?;
|
|
||||||
|
|
||||||
if !output.status.success() {
|
|
||||||
return Err(ComposeRunnerError::PortDiscovery {
|
|
||||||
service: service.to_owned(),
|
|
||||||
container_port,
|
|
||||||
source: anyhow!("docker compose port exited with {}", output.status),
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
let stdout = String::from_utf8_lossy(&output.stdout);
|
|
||||||
for line in stdout.lines() {
|
|
||||||
let line = line.trim();
|
|
||||||
if line.is_empty() {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
if let Some(port_str) = line.rsplit(':').next()
|
|
||||||
&& let Ok(port) = port_str.trim().parse::<u16>()
|
|
||||||
{
|
|
||||||
return Ok(port);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
Err(ComposeRunnerError::PortDiscovery {
|
|
||||||
service: service.to_owned(),
|
|
||||||
container_port,
|
|
||||||
source: anyhow!("unable to parse docker compose port output: {stdout}"),
|
|
||||||
})
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -3,7 +3,7 @@ use std::time::Duration;
|
|||||||
use reqwest::Url;
|
use reqwest::Url;
|
||||||
use testing_framework_core::{
|
use testing_framework_core::{
|
||||||
nodes::ApiClient,
|
nodes::ApiClient,
|
||||||
scenario::{Metrics, MetricsError, NodeClients, http_probe::NodeRole as HttpNodeRole},
|
scenario::{NodeClients, http_probe::NodeRole as HttpNodeRole},
|
||||||
topology::generation::{GeneratedTopology, NodeRole as TopologyNodeRole},
|
topology::generation::{GeneratedTopology, NodeRole as TopologyNodeRole},
|
||||||
};
|
};
|
||||||
use tokio::time::sleep;
|
use tokio::time::sleep;
|
||||||
@ -16,13 +16,6 @@ use crate::{
|
|||||||
|
|
||||||
const DISABLED_READINESS_SLEEP: Duration = Duration::from_secs(5);
|
const DISABLED_READINESS_SLEEP: Duration = Duration::from_secs(5);
|
||||||
|
|
||||||
/// Build a metrics client from host/port, validating the URL.
|
|
||||||
pub fn metrics_handle_from_port(port: u16, host: &str) -> Result<Metrics, MetricsError> {
|
|
||||||
let url = Url::parse(&format!("http://{host}:{port}/"))
|
|
||||||
.map_err(|err| MetricsError::new(format!("invalid prometheus url: {err}")))?;
|
|
||||||
Metrics::from_prometheus(url)
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Wait until all validators respond on their API ports.
|
/// Wait until all validators respond on their API ports.
|
||||||
pub async fn ensure_validators_ready_with_ports(ports: &[u16]) -> Result<(), StackReadinessError> {
|
pub async fn ensure_validators_ready_with_ports(ports: &[u16]) -> Result<(), StackReadinessError> {
|
||||||
if ports.is_empty() {
|
if ports.is_empty() {
|
||||||
|
|||||||
@ -17,6 +17,7 @@ anyhow = "1"
|
|||||||
async-trait = { workspace = true }
|
async-trait = { workspace = true }
|
||||||
k8s-openapi = { version = "0.20", features = ["latest"] }
|
k8s-openapi = { version = "0.20", features = ["latest"] }
|
||||||
kube = { version = "0.87", default-features = false, features = ["client", "runtime", "rustls-tls"] }
|
kube = { version = "0.87", default-features = false, features = ["client", "runtime", "rustls-tls"] }
|
||||||
|
nomos-tracing = { workspace = true }
|
||||||
nomos-tracing-service = { workspace = true }
|
nomos-tracing-service = { workspace = true }
|
||||||
reqwest = { workspace = true, features = ["json"] }
|
reqwest = { workspace = true, features = ["json"] }
|
||||||
serde = { version = "1", features = ["derive"] }
|
serde = { version = "1", features = ["derive"] }
|
||||||
|
|||||||
@ -1,2 +0,0 @@
|
|||||||
*.json
|
|
||||||
!.gitignore
|
|
||||||
@ -37,9 +37,3 @@ app.kubernetes.io/instance: {{ $root.Release.Name }}
|
|||||||
nomos/logical-role: executor
|
nomos/logical-role: executor
|
||||||
nomos/executor-index: "{{ $index }}"
|
nomos/executor-index: "{{ $index }}"
|
||||||
{{- end -}}
|
{{- end -}}
|
||||||
|
|
||||||
{{- define "nomos-runner.prometheusLabels" -}}
|
|
||||||
app.kubernetes.io/name: {{ include "nomos-runner.chart" . }}
|
|
||||||
app.kubernetes.io/instance: {{ .Release.Name }}
|
|
||||||
nomos/logical-role: prometheus
|
|
||||||
{{- end -}}
|
|
||||||
|
|||||||
@ -1,33 +0,0 @@
|
|||||||
{{- if .Values.grafana.enabled }}
|
|
||||||
apiVersion: v1
|
|
||||||
kind: ConfigMap
|
|
||||||
metadata:
|
|
||||||
name: {{ include "nomos-runner.fullname" . }}-grafana-config
|
|
||||||
data:
|
|
||||||
datasources.yaml: |
|
|
||||||
apiVersion: 1
|
|
||||||
datasources:
|
|
||||||
- name: Prometheus
|
|
||||||
type: prometheus
|
|
||||||
access: proxy
|
|
||||||
isDefault: true
|
|
||||||
uid: PBFA97CFB590B2093
|
|
||||||
orgId: 1
|
|
||||||
url: {{ if .Values.prometheus.enabled }}http://prometheus:9090{{ else }}{{ required "prometheus.externalUrl must be set when prometheus.enabled=false" .Values.prometheus.externalUrl }}{{ end }}
|
|
||||||
editable: true
|
|
||||||
dashboards.yaml: |
|
|
||||||
apiVersion: 1
|
|
||||||
providers:
|
|
||||||
- name: 'default'
|
|
||||||
orgId: 1
|
|
||||||
folder: 'Nomos'
|
|
||||||
type: file
|
|
||||||
disableDeletion: false
|
|
||||||
editable: true
|
|
||||||
options:
|
|
||||||
path: /var/lib/grafana/dashboards
|
|
||||||
{{ range $path, $_ := .Files.Glob "grafana/dashboards/*.json" }}
|
|
||||||
{{ base $path }}: |
|
|
||||||
{{ $.Files.Get $path | indent 4 }}
|
|
||||||
{{ end }}
|
|
||||||
{{- end }}
|
|
||||||
@ -1,64 +0,0 @@
|
|||||||
{{- if .Values.grafana.enabled }}
|
|
||||||
apiVersion: apps/v1
|
|
||||||
kind: Deployment
|
|
||||||
metadata:
|
|
||||||
name: {{ include "nomos-runner.fullname" . }}-grafana
|
|
||||||
labels:
|
|
||||||
app: {{ include "nomos-runner.name" . }}
|
|
||||||
chart: {{ include "nomos-runner.chart" . }}
|
|
||||||
release: {{ .Release.Name }}
|
|
||||||
heritage: {{ .Release.Service }}
|
|
||||||
spec:
|
|
||||||
replicas: 1
|
|
||||||
selector:
|
|
||||||
matchLabels:
|
|
||||||
app: {{ include "nomos-runner.name" . }}
|
|
||||||
component: grafana
|
|
||||||
template:
|
|
||||||
metadata:
|
|
||||||
labels:
|
|
||||||
app: {{ include "nomos-runner.name" . }}
|
|
||||||
component: grafana
|
|
||||||
spec:
|
|
||||||
containers:
|
|
||||||
- name: grafana
|
|
||||||
image: {{ .Values.grafana.image }}
|
|
||||||
imagePullPolicy: {{ .Values.grafana.imagePullPolicy }}
|
|
||||||
env:
|
|
||||||
- name: GF_SECURITY_ADMIN_USER
|
|
||||||
value: {{ .Values.grafana.adminUser | quote }}
|
|
||||||
- name: GF_SECURITY_ADMIN_PASSWORD
|
|
||||||
value: {{ .Values.grafana.adminPassword | quote }}
|
|
||||||
ports:
|
|
||||||
- containerPort: 3000
|
|
||||||
name: http
|
|
||||||
volumeMounts:
|
|
||||||
- name: grafana-config
|
|
||||||
mountPath: /etc/grafana/provisioning/datasources/datasources.yaml
|
|
||||||
subPath: datasources.yaml
|
|
||||||
readOnly: true
|
|
||||||
- name: grafana-config
|
|
||||||
mountPath: /etc/grafana/provisioning/dashboards/dashboards.yaml
|
|
||||||
subPath: dashboards.yaml
|
|
||||||
readOnly: true
|
|
||||||
- name: grafana-dashboards
|
|
||||||
mountPath: /var/lib/grafana/dashboards
|
|
||||||
readOnly: true
|
|
||||||
volumes:
|
|
||||||
- name: grafana-config
|
|
||||||
configMap:
|
|
||||||
name: {{ include "nomos-runner.fullname" . }}-grafana-config
|
|
||||||
items:
|
|
||||||
- key: datasources.yaml
|
|
||||||
path: datasources.yaml
|
|
||||||
- key: dashboards.yaml
|
|
||||||
path: dashboards.yaml
|
|
||||||
- name: grafana-dashboards
|
|
||||||
configMap:
|
|
||||||
name: {{ include "nomos-runner.fullname" . }}-grafana-config
|
|
||||||
items:
|
|
||||||
{{ range $path, $_ := .Files.Glob "grafana/dashboards/*.json" }}
|
|
||||||
- key: {{ base $path }}
|
|
||||||
path: {{ base $path }}
|
|
||||||
{{ end }}
|
|
||||||
{{- end }}
|
|
||||||
@ -1,22 +0,0 @@
|
|||||||
{{- if .Values.grafana.enabled }}
|
|
||||||
apiVersion: v1
|
|
||||||
kind: Service
|
|
||||||
metadata:
|
|
||||||
name: {{ include "nomos-runner.fullname" . }}-grafana
|
|
||||||
labels:
|
|
||||||
app: {{ include "nomos-runner.name" . }}
|
|
||||||
component: grafana
|
|
||||||
spec:
|
|
||||||
type: {{ .Values.grafana.service.type }}
|
|
||||||
ports:
|
|
||||||
- port: 3000
|
|
||||||
targetPort: http
|
|
||||||
protocol: TCP
|
|
||||||
name: http
|
|
||||||
{{- if and (eq .Values.grafana.service.type "NodePort") .Values.grafana.service.nodePort }}
|
|
||||||
nodePort: {{ .Values.grafana.service.nodePort }}
|
|
||||||
{{- end }}
|
|
||||||
selector:
|
|
||||||
app: {{ include "nomos-runner.name" . }}
|
|
||||||
component: grafana
|
|
||||||
{{- end }}
|
|
||||||
@ -1,16 +0,0 @@
|
|||||||
{{- if .Values.prometheus.enabled }}
|
|
||||||
---
|
|
||||||
apiVersion: v1
|
|
||||||
kind: ConfigMap
|
|
||||||
metadata:
|
|
||||||
name: prometheus
|
|
||||||
labels:
|
|
||||||
{{- include "nomos-runner.prometheusLabels" . | nindent 4 }}
|
|
||||||
data:
|
|
||||||
prometheus.yml: |
|
|
||||||
{{- if .Values.prometheus.config }}
|
|
||||||
{{ .Values.prometheus.config | indent 4 }}
|
|
||||||
{{- else }}
|
|
||||||
{{ "" | indent 4 }}
|
|
||||||
{{- end }}
|
|
||||||
{{- end }}
|
|
||||||
@ -1,38 +0,0 @@
|
|||||||
{{- if .Values.prometheus.enabled }}
|
|
||||||
---
|
|
||||||
apiVersion: apps/v1
|
|
||||||
kind: Deployment
|
|
||||||
metadata:
|
|
||||||
name: prometheus
|
|
||||||
labels:
|
|
||||||
{{- include "nomos-runner.prometheusLabels" . | nindent 4 }}
|
|
||||||
spec:
|
|
||||||
replicas: 1
|
|
||||||
selector:
|
|
||||||
matchLabels:
|
|
||||||
{{- include "nomos-runner.prometheusLabels" . | nindent 6 }}
|
|
||||||
template:
|
|
||||||
metadata:
|
|
||||||
labels:
|
|
||||||
{{- include "nomos-runner.prometheusLabels" . | nindent 8 }}
|
|
||||||
spec:
|
|
||||||
containers:
|
|
||||||
- name: prometheus
|
|
||||||
image: {{ .Values.prometheus.image }}
|
|
||||||
imagePullPolicy: {{ .Values.prometheus.imagePullPolicy | default "IfNotPresent" }}
|
|
||||||
args:
|
|
||||||
- --config.file=/etc/prometheus/prometheus.yml
|
|
||||||
- --storage.tsdb.retention.time={{ .Values.prometheus.retention }}
|
|
||||||
- --web.enable-otlp-receiver
|
|
||||||
- --enable-feature=otlp-write-receiver
|
|
||||||
ports:
|
|
||||||
- containerPort: 9090
|
|
||||||
name: http
|
|
||||||
volumeMounts:
|
|
||||||
- name: prometheus-config
|
|
||||||
mountPath: /etc/prometheus
|
|
||||||
volumes:
|
|
||||||
- name: prometheus-config
|
|
||||||
configMap:
|
|
||||||
name: prometheus
|
|
||||||
{{- end }}
|
|
||||||
@ -1,20 +0,0 @@
|
|||||||
{{- if .Values.prometheus.enabled }}
|
|
||||||
---
|
|
||||||
apiVersion: v1
|
|
||||||
kind: Service
|
|
||||||
metadata:
|
|
||||||
name: prometheus
|
|
||||||
labels:
|
|
||||||
{{- include "nomos-runner.prometheusLabels" . | nindent 4 }}
|
|
||||||
spec:
|
|
||||||
type: {{ .Values.prometheus.service.type | default "NodePort" }}
|
|
||||||
selector:
|
|
||||||
{{- include "nomos-runner.prometheusLabels" . | nindent 4 }}
|
|
||||||
ports:
|
|
||||||
- name: http
|
|
||||||
port: 9090
|
|
||||||
targetPort: http
|
|
||||||
{{- if and (eq (default "NodePort" .Values.prometheus.service.type) "NodePort") .Values.prometheus.service.nodePort }}
|
|
||||||
nodePort: {{ .Values.prometheus.service.nodePort }}
|
|
||||||
{{- end }}
|
|
||||||
{{- end }}
|
|
||||||
@ -1,18 +0,0 @@
|
|||||||
{{- if eq .Values.kzg.mode "hostPath" }}
|
|
||||||
apiVersion: v1
|
|
||||||
kind: PersistentVolume
|
|
||||||
metadata:
|
|
||||||
name: {{ include "nomos-runner.fullname" . }}-kzg
|
|
||||||
labels:
|
|
||||||
{{- include "nomos-runner.labels" . | nindent 4 }}
|
|
||||||
spec:
|
|
||||||
capacity:
|
|
||||||
storage: {{ .Values.kzg.storageSize }}
|
|
||||||
accessModes:
|
|
||||||
- ReadOnlyMany
|
|
||||||
persistentVolumeReclaimPolicy: Delete
|
|
||||||
storageClassName: manual
|
|
||||||
hostPath:
|
|
||||||
path: {{ .Values.kzg.hostPath }}
|
|
||||||
type: {{ .Values.kzg.hostPathType }}
|
|
||||||
{{- end }}
|
|
||||||
@ -1,16 +0,0 @@
|
|||||||
{{- if eq .Values.kzg.mode "hostPath" }}
|
|
||||||
apiVersion: v1
|
|
||||||
kind: PersistentVolumeClaim
|
|
||||||
metadata:
|
|
||||||
name: {{ include "nomos-runner.fullname" . }}-kzg
|
|
||||||
labels:
|
|
||||||
{{- include "nomos-runner.labels" . | nindent 4 }}
|
|
||||||
spec:
|
|
||||||
accessModes:
|
|
||||||
- ReadOnlyMany
|
|
||||||
storageClassName: manual
|
|
||||||
volumeName: {{ include "nomos-runner.fullname" . }}-kzg
|
|
||||||
resources:
|
|
||||||
requests:
|
|
||||||
storage: {{ .Values.kzg.storageSize }}
|
|
||||||
{{- end }}
|
|
||||||
@ -26,28 +26,3 @@ kzg:
|
|||||||
hostPath: "/var/lib/nomos/kzgrs_test_params"
|
hostPath: "/var/lib/nomos/kzgrs_test_params"
|
||||||
hostPathType: "Directory"
|
hostPathType: "Directory"
|
||||||
storageSize: "1Gi"
|
storageSize: "1Gi"
|
||||||
|
|
||||||
prometheus:
|
|
||||||
enabled: true
|
|
||||||
externalUrl: ""
|
|
||||||
image: "prom/prometheus:v3.0.1"
|
|
||||||
imagePullPolicy: IfNotPresent
|
|
||||||
retention: "7d"
|
|
||||||
service:
|
|
||||||
type: NodePort
|
|
||||||
nodePort: null
|
|
||||||
config: |
|
|
||||||
global:
|
|
||||||
evaluation_interval: 15s
|
|
||||||
external_labels:
|
|
||||||
monitor: "NomosRunner"
|
|
||||||
|
|
||||||
grafana:
|
|
||||||
enabled: true
|
|
||||||
image: "grafana/grafana:10.4.1"
|
|
||||||
imagePullPolicy: IfNotPresent
|
|
||||||
adminUser: admin
|
|
||||||
adminPassword: admin
|
|
||||||
service:
|
|
||||||
type: NodePort
|
|
||||||
nodePort: null
|
|
||||||
|
|||||||
@ -1,11 +1,10 @@
|
|||||||
use anyhow::Error;
|
use anyhow::Error;
|
||||||
use async_trait::async_trait;
|
use async_trait::async_trait;
|
||||||
use kube::Client;
|
use kube::Client;
|
||||||
use reqwest::Url;
|
|
||||||
use testing_framework_core::{
|
use testing_framework_core::{
|
||||||
scenario::{
|
scenario::{
|
||||||
BlockFeedTask, CleanupGuard, Deployer, MetricsError, ObservabilityCapability, RunContext,
|
BlockFeedTask, CleanupGuard, Deployer, MetricsError, ObservabilityCapability,
|
||||||
Runner, Scenario,
|
ObservabilityInputs, RunContext, Runner, Scenario,
|
||||||
},
|
},
|
||||||
topology::generation::GeneratedTopology,
|
topology::generation::GeneratedTopology,
|
||||||
};
|
};
|
||||||
@ -17,13 +16,12 @@ use crate::{
|
|||||||
cluster::{
|
cluster::{
|
||||||
ClusterEnvironment, NodeClientError, PortSpecs, RemoteReadinessError,
|
ClusterEnvironment, NodeClientError, PortSpecs, RemoteReadinessError,
|
||||||
build_node_clients, cluster_identifiers, collect_port_specs, ensure_cluster_readiness,
|
build_node_clients, cluster_identifiers, collect_port_specs, ensure_cluster_readiness,
|
||||||
install_stack, kill_port_forwards, metrics_handle_from_endpoint,
|
install_stack, kill_port_forwards, wait_for_ports_or_cleanup,
|
||||||
metrics_handle_from_url, wait_for_ports_or_cleanup,
|
|
||||||
},
|
},
|
||||||
helm::HelmError,
|
helm::HelmError,
|
||||||
},
|
},
|
||||||
lifecycle::{block_feed::spawn_block_feed_with, cleanup::RunnerCleanup},
|
lifecycle::{block_feed::spawn_block_feed_with, cleanup::RunnerCleanup},
|
||||||
wait::{ClusterWaitError, HostPort, PortForwardHandle},
|
wait::{ClusterWaitError, PortForwardHandle},
|
||||||
};
|
};
|
||||||
|
|
||||||
/// Deploys a scenario into Kubernetes using Helm charts and port-forwards.
|
/// Deploys a scenario into Kubernetes using Helm charts and port-forwards.
|
||||||
@ -93,7 +91,7 @@ impl Deployer for K8sDeployer {
|
|||||||
type Error = K8sRunnerError;
|
type Error = K8sRunnerError;
|
||||||
|
|
||||||
async fn deploy(&self, scenario: &Scenario) -> Result<Runner, Self::Error> {
|
async fn deploy(&self, scenario: &Scenario) -> Result<Runner, Self::Error> {
|
||||||
deploy_with_observability(self, scenario, None, None, None).await
|
deploy_with_observability(self, scenario, None).await
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -105,31 +103,10 @@ impl Deployer<ObservabilityCapability> for K8sDeployer {
|
|||||||
&self,
|
&self,
|
||||||
scenario: &Scenario<ObservabilityCapability>,
|
scenario: &Scenario<ObservabilityCapability>,
|
||||||
) -> Result<Runner, Self::Error> {
|
) -> Result<Runner, Self::Error> {
|
||||||
deploy_with_observability(
|
deploy_with_observability(self, scenario, Some(scenario.capabilities())).await
|
||||||
self,
|
|
||||||
scenario,
|
|
||||||
scenario.capabilities().metrics_query_url.clone(),
|
|
||||||
scenario.capabilities().metrics_query_grafana_url.clone(),
|
|
||||||
scenario.capabilities().metrics_otlp_ingest_url.clone(),
|
|
||||||
)
|
|
||||||
.await
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn cluster_prometheus_endpoint(cluster: &Option<ClusterEnvironment>) -> Option<&HostPort> {
|
|
||||||
cluster
|
|
||||||
.as_ref()
|
|
||||||
.expect("cluster must be available")
|
|
||||||
.prometheus_endpoint()
|
|
||||||
}
|
|
||||||
|
|
||||||
fn cluster_grafana_endpoint(cluster: &Option<ClusterEnvironment>) -> Option<&HostPort> {
|
|
||||||
cluster
|
|
||||||
.as_ref()
|
|
||||||
.expect("cluster must be available")
|
|
||||||
.grafana_endpoint()
|
|
||||||
}
|
|
||||||
|
|
||||||
async fn fail_cluster(cluster: &mut Option<ClusterEnvironment>, reason: &str) {
|
async fn fail_cluster(cluster: &mut Option<ClusterEnvironment>, reason: &str) {
|
||||||
if let Some(env) = cluster.as_mut() {
|
if let Some(env) = cluster.as_mut() {
|
||||||
env.fail(reason).await;
|
env.fail(reason).await;
|
||||||
@ -157,59 +134,13 @@ fn ensure_supported_topology(descriptors: &GeneratedTopology) -> Result<(), K8sR
|
|||||||
async fn deploy_with_observability<Caps>(
|
async fn deploy_with_observability<Caps>(
|
||||||
deployer: &K8sDeployer,
|
deployer: &K8sDeployer,
|
||||||
scenario: &Scenario<Caps>,
|
scenario: &Scenario<Caps>,
|
||||||
metrics_query_url: Option<Url>,
|
observability: Option<&ObservabilityCapability>,
|
||||||
metrics_query_grafana_url: Option<Url>,
|
|
||||||
metrics_otlp_ingest_url: Option<Url>,
|
|
||||||
) -> Result<Runner, K8sRunnerError> {
|
) -> Result<Runner, K8sRunnerError> {
|
||||||
let external_prometheus = match metrics_query_url {
|
let env_inputs = ObservabilityInputs::from_env()?;
|
||||||
Some(url) => Some(url),
|
let cap_inputs = observability
|
||||||
None => match std::env::var("K8S_RUNNER_METRICS_QUERY_URL")
|
.map(ObservabilityInputs::from_capability)
|
||||||
.ok()
|
.unwrap_or_default();
|
||||||
.or_else(|| std::env::var("NOMOS_METRICS_QUERY_URL").ok())
|
let observability = env_inputs.with_overrides(cap_inputs).normalized();
|
||||||
// Back-compat:
|
|
||||||
.or_else(|| std::env::var("K8S_RUNNER_EXTERNAL_PROMETHEUS_URL").ok())
|
|
||||||
.or_else(|| std::env::var("NOMOS_EXTERNAL_PROMETHEUS_URL").ok())
|
|
||||||
{
|
|
||||||
Some(raw) if !raw.trim().is_empty() => {
|
|
||||||
Some(Url::parse(raw.trim()).map_err(|err| {
|
|
||||||
MetricsError::new(format!("invalid metrics query url: {err}"))
|
|
||||||
})?)
|
|
||||||
}
|
|
||||||
_ => None,
|
|
||||||
},
|
|
||||||
};
|
|
||||||
|
|
||||||
let external_prometheus_grafana_url = match metrics_query_grafana_url {
|
|
||||||
Some(url) => Some(url),
|
|
||||||
None => match std::env::var("K8S_RUNNER_METRICS_QUERY_GRAFANA_URL")
|
|
||||||
.ok()
|
|
||||||
.or_else(|| std::env::var("NOMOS_METRICS_QUERY_GRAFANA_URL").ok())
|
|
||||||
// Back-compat:
|
|
||||||
.or_else(|| std::env::var("K8S_RUNNER_EXTERNAL_PROMETHEUS_GRAFANA_URL").ok())
|
|
||||||
.or_else(|| std::env::var("NOMOS_EXTERNAL_PROMETHEUS_GRAFANA_URL").ok())
|
|
||||||
{
|
|
||||||
Some(raw) if !raw.trim().is_empty() => Some(Url::parse(raw.trim()).map_err(|err| {
|
|
||||||
MetricsError::new(format!("invalid metrics query grafana url: {err}"))
|
|
||||||
})?),
|
|
||||||
_ => None,
|
|
||||||
},
|
|
||||||
};
|
|
||||||
|
|
||||||
let external_otlp_metrics_endpoint = match metrics_otlp_ingest_url {
|
|
||||||
Some(url) => Some(url),
|
|
||||||
None => match std::env::var("K8S_RUNNER_METRICS_OTLP_INGEST_URL")
|
|
||||||
.ok()
|
|
||||||
.or_else(|| std::env::var("NOMOS_METRICS_OTLP_INGEST_URL").ok())
|
|
||||||
// Back-compat:
|
|
||||||
.or_else(|| std::env::var("K8S_RUNNER_EXTERNAL_OTLP_METRICS_ENDPOINT").ok())
|
|
||||||
.or_else(|| std::env::var("NOMOS_EXTERNAL_OTLP_METRICS_ENDPOINT").ok())
|
|
||||||
{
|
|
||||||
Some(raw) if !raw.trim().is_empty() => Some(Url::parse(raw.trim()).map_err(|err| {
|
|
||||||
MetricsError::new(format!("invalid metrics OTLP ingest url: {err}"))
|
|
||||||
})?),
|
|
||||||
_ => None,
|
|
||||||
},
|
|
||||||
};
|
|
||||||
|
|
||||||
let descriptors = scenario.topology().clone();
|
let descriptors = scenario.topology().clone();
|
||||||
let validator_count = descriptors.validators().len();
|
let validator_count = descriptors.validators().len();
|
||||||
@ -225,9 +156,16 @@ async fn deploy_with_observability<Caps>(
|
|||||||
executors = executor_count,
|
executors = executor_count,
|
||||||
duration_secs = scenario.duration().as_secs(),
|
duration_secs = scenario.duration().as_secs(),
|
||||||
readiness_checks = deployer.readiness_checks,
|
readiness_checks = deployer.readiness_checks,
|
||||||
metrics_query_url = external_prometheus.as_ref().map(|u| u.as_str()),
|
metrics_query_url = observability.metrics_query_url.as_ref().map(|u| u.as_str()),
|
||||||
metrics_query_grafana_url = external_prometheus_grafana_url.as_ref().map(|u| u.as_str()),
|
metrics_query_grafana_url = observability
|
||||||
metrics_otlp_ingest_url = external_otlp_metrics_endpoint.as_ref().map(|u| u.as_str()),
|
.metrics_query_grafana_url
|
||||||
|
.as_ref()
|
||||||
|
.map(|u| u.as_str()),
|
||||||
|
metrics_otlp_ingest_url = observability
|
||||||
|
.metrics_otlp_ingest_url
|
||||||
|
.as_ref()
|
||||||
|
.map(|u| u.as_str()),
|
||||||
|
grafana_url = observability.grafana_url.as_ref().map(|u| u.as_str()),
|
||||||
"starting k8s deployment"
|
"starting k8s deployment"
|
||||||
);
|
);
|
||||||
|
|
||||||
@ -238,9 +176,7 @@ async fn deploy_with_observability<Caps>(
|
|||||||
&port_specs,
|
&port_specs,
|
||||||
&descriptors,
|
&descriptors,
|
||||||
deployer.readiness_checks,
|
deployer.readiness_checks,
|
||||||
external_prometheus.as_ref(),
|
&observability,
|
||||||
external_prometheus_grafana_url.as_ref(),
|
|
||||||
external_otlp_metrics_endpoint.as_ref(),
|
|
||||||
)
|
)
|
||||||
.await?,
|
.await?,
|
||||||
);
|
);
|
||||||
@ -259,23 +195,11 @@ async fn deploy_with_observability<Caps>(
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
let telemetry = match external_prometheus.clone() {
|
let telemetry = match observability.telemetry_handle() {
|
||||||
Some(url) => metrics_handle_from_url(url),
|
|
||||||
None => cluster
|
|
||||||
.as_ref()
|
|
||||||
.and_then(|cluster| cluster.prometheus_endpoint())
|
|
||||||
.ok_or_else(|| MetricsError::new("prometheus endpoint unavailable"))
|
|
||||||
.and_then(metrics_handle_from_endpoint),
|
|
||||||
};
|
|
||||||
let telemetry = match telemetry {
|
|
||||||
Ok(handle) => handle,
|
Ok(handle) => handle,
|
||||||
Err(err) => {
|
Err(err) => {
|
||||||
fail_cluster(
|
fail_cluster(&mut cluster, "failed to configure metrics telemetry handle").await;
|
||||||
&mut cluster,
|
error!(error = ?err, "failed to configure metrics telemetry handle");
|
||||||
"failed to configure prometheus metrics handle",
|
|
||||||
)
|
|
||||||
.await;
|
|
||||||
error!(error = ?err, "failed to configure prometheus metrics handle");
|
|
||||||
return Err(err.into());
|
return Err(err.into());
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
@ -289,36 +213,29 @@ async fn deploy_with_observability<Caps>(
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
if let Some(url) = external_prometheus.as_ref() {
|
if let Some(url) = observability.metrics_query_url.as_ref() {
|
||||||
info!(prometheus_url = %url.as_str(), "using external prometheus endpoint");
|
|
||||||
} else if let Some(prometheus) = cluster_prometheus_endpoint(&cluster) {
|
|
||||||
info!(
|
info!(
|
||||||
prometheus_url = %format!("http://{}:{}/", prometheus.host, prometheus.port),
|
metrics_query_url = %url.as_str(),
|
||||||
"prometheus endpoint available on host"
|
"metrics query endpoint configured"
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
if let Some(grafana) = cluster_grafana_endpoint(&cluster) {
|
if let Some(url) = observability.grafana_url.as_ref() {
|
||||||
info!(
|
info!(grafana_url = %url.as_str(), "grafana url configured");
|
||||||
grafana_url = %format!("http://{}:{}/", grafana.host, grafana.port),
|
|
||||||
"grafana dashboard available on host"
|
|
||||||
);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if std::env::var("TESTNET_PRINT_ENDPOINTS").is_ok() {
|
if std::env::var("TESTNET_PRINT_ENDPOINTS").is_ok() {
|
||||||
let prometheus = external_prometheus
|
let prometheus = observability
|
||||||
|
.metrics_query_url
|
||||||
.as_ref()
|
.as_ref()
|
||||||
.map(|u| u.as_str().to_string())
|
.map(|u| u.as_str().to_string())
|
||||||
.or_else(|| {
|
|
||||||
cluster_prometheus_endpoint(&cluster)
|
|
||||||
.map(|endpoint| format!("http://{}:{}/", endpoint.host, endpoint.port))
|
|
||||||
})
|
|
||||||
.unwrap_or_else(|| "<disabled>".to_string());
|
.unwrap_or_else(|| "<disabled>".to_string());
|
||||||
let grafana = cluster_grafana_endpoint(&cluster);
|
|
||||||
println!(
|
println!(
|
||||||
"TESTNET_ENDPOINTS prometheus={} grafana={}",
|
"TESTNET_ENDPOINTS prometheus={} grafana={}",
|
||||||
prometheus,
|
prometheus,
|
||||||
grafana
|
observability
|
||||||
.map(|endpoint| format!("http://{}:{}/", endpoint.host, endpoint.port))
|
.grafana_url
|
||||||
|
.as_ref()
|
||||||
|
.map(|u| u.as_str().to_string())
|
||||||
.unwrap_or_else(|| "<disabled>".to_string())
|
.unwrap_or_else(|| "<disabled>".to_string())
|
||||||
);
|
);
|
||||||
|
|
||||||
@ -374,16 +291,9 @@ async fn setup_cluster(
|
|||||||
specs: &PortSpecs,
|
specs: &PortSpecs,
|
||||||
descriptors: &GeneratedTopology,
|
descriptors: &GeneratedTopology,
|
||||||
readiness_checks: bool,
|
readiness_checks: bool,
|
||||||
external_prometheus: Option<&Url>,
|
observability: &ObservabilityInputs,
|
||||||
external_prometheus_grafana_url: Option<&Url>,
|
|
||||||
external_otlp_metrics_endpoint: Option<&Url>,
|
|
||||||
) -> Result<ClusterEnvironment, K8sRunnerError> {
|
) -> Result<ClusterEnvironment, K8sRunnerError> {
|
||||||
let assets = prepare_assets(
|
let assets = prepare_assets(descriptors, observability.metrics_otlp_ingest_url.as_ref())?;
|
||||||
descriptors,
|
|
||||||
external_prometheus,
|
|
||||||
external_prometheus_grafana_url,
|
|
||||||
external_otlp_metrics_endpoint,
|
|
||||||
)?;
|
|
||||||
let validators = descriptors.validators().len();
|
let validators = descriptors.validators().len();
|
||||||
let executors = descriptors.executors().len();
|
let executors = descriptors.executors().len();
|
||||||
|
|
||||||
@ -394,19 +304,8 @@ async fn setup_cluster(
|
|||||||
Some(install_stack(client, &assets, &namespace, &release, validators, executors).await?);
|
Some(install_stack(client, &assets, &namespace, &release, validators, executors).await?);
|
||||||
|
|
||||||
info!("waiting for helm-managed services to become ready");
|
info!("waiting for helm-managed services to become ready");
|
||||||
let cluster_ready = wait_for_ports_or_cleanup(
|
let cluster_ready =
|
||||||
client,
|
wait_for_ports_or_cleanup(client, &namespace, &release, specs, &mut cleanup_guard).await?;
|
||||||
&namespace,
|
|
||||||
&release,
|
|
||||||
specs,
|
|
||||||
external_prometheus.is_none() && external_prometheus_grafana_url.is_none(),
|
|
||||||
&mut cleanup_guard,
|
|
||||||
)
|
|
||||||
.await?;
|
|
||||||
|
|
||||||
if let Some(prometheus) = cluster_ready.ports.prometheus.as_ref() {
|
|
||||||
info!(prometheus = ?prometheus, "discovered prometheus endpoint");
|
|
||||||
}
|
|
||||||
|
|
||||||
let environment = ClusterEnvironment::new(
|
let environment = ClusterEnvironment::new(
|
||||||
client.clone(),
|
client.clone(),
|
||||||
|
|||||||
@ -5,6 +5,7 @@ use std::{
|
|||||||
};
|
};
|
||||||
|
|
||||||
use anyhow::{Context as _, Result as AnyResult};
|
use anyhow::{Context as _, Result as AnyResult};
|
||||||
|
use nomos_tracing::metrics::otlp::OtlpMetricsConfig;
|
||||||
use nomos_tracing_service::MetricsLayer;
|
use nomos_tracing_service::MetricsLayer;
|
||||||
use reqwest::Url;
|
use reqwest::Url;
|
||||||
use serde::Serialize;
|
use serde::Serialize;
|
||||||
@ -55,8 +56,6 @@ pub enum AssetsError {
|
|||||||
MissingKzg { path: PathBuf },
|
MissingKzg { path: PathBuf },
|
||||||
#[error("missing Helm chart at {path}; ensure the repository is up-to-date")]
|
#[error("missing Helm chart at {path}; ensure the repository is up-to-date")]
|
||||||
MissingChart { path: PathBuf },
|
MissingChart { path: PathBuf },
|
||||||
#[error("missing Grafana dashboards source at {path}")]
|
|
||||||
MissingGrafanaDashboards { path: PathBuf },
|
|
||||||
#[error("failed to create temporary directory for rendered assets: {source}")]
|
#[error("failed to create temporary directory for rendered assets: {source}")]
|
||||||
TempDir {
|
TempDir {
|
||||||
#[source]
|
#[source]
|
||||||
@ -92,9 +91,7 @@ fn kzg_mode() -> KzgMode {
|
|||||||
/// topology.
|
/// topology.
|
||||||
pub fn prepare_assets(
|
pub fn prepare_assets(
|
||||||
topology: &GeneratedTopology,
|
topology: &GeneratedTopology,
|
||||||
external_prometheus: Option<&Url>,
|
metrics_otlp_ingest_url: Option<&Url>,
|
||||||
external_prometheus_grafana_url: Option<&Url>,
|
|
||||||
external_otlp_metrics_endpoint: Option<&Url>,
|
|
||||||
) -> Result<RunnerAssets, AssetsError> {
|
) -> Result<RunnerAssets, AssetsError> {
|
||||||
info!(
|
info!(
|
||||||
validators = topology.validators().len(),
|
validators = topology.validators().len(),
|
||||||
@ -104,13 +101,7 @@ pub fn prepare_assets(
|
|||||||
|
|
||||||
let root = workspace_root().map_err(|source| AssetsError::WorkspaceRoot { source })?;
|
let root = workspace_root().map_err(|source| AssetsError::WorkspaceRoot { source })?;
|
||||||
let kzg_mode = kzg_mode();
|
let kzg_mode = kzg_mode();
|
||||||
let cfgsync_yaml = render_cfgsync_config(
|
let cfgsync_yaml = render_cfgsync_config(&root, topology, kzg_mode, metrics_otlp_ingest_url)?;
|
||||||
&root,
|
|
||||||
topology,
|
|
||||||
kzg_mode,
|
|
||||||
external_prometheus,
|
|
||||||
external_otlp_metrics_endpoint,
|
|
||||||
)?;
|
|
||||||
|
|
||||||
let tempdir = tempfile::Builder::new()
|
let tempdir = tempfile::Builder::new()
|
||||||
.prefix("nomos-helm-")
|
.prefix("nomos-helm-")
|
||||||
@ -124,12 +115,7 @@ pub fn prepare_assets(
|
|||||||
KzgMode::InImage => None,
|
KzgMode::InImage => None,
|
||||||
};
|
};
|
||||||
let chart_path = helm_chart_path()?;
|
let chart_path = helm_chart_path()?;
|
||||||
sync_grafana_dashboards(&root, &chart_path)?;
|
let values_yaml = render_values_yaml(topology)?;
|
||||||
let values_yaml = render_values_yaml(
|
|
||||||
topology,
|
|
||||||
external_prometheus,
|
|
||||||
external_prometheus_grafana_url,
|
|
||||||
)?;
|
|
||||||
let values_file = write_temp_file(tempdir.path(), "values.yaml", values_yaml)?;
|
let values_file = write_temp_file(tempdir.path(), "values.yaml", values_yaml)?;
|
||||||
let image = env::var("NOMOS_TESTNET_IMAGE")
|
let image = env::var("NOMOS_TESTNET_IMAGE")
|
||||||
.unwrap_or_else(|_| String::from("public.ecr.aws/r4s5t9y4/logos/logos-blockchain:test"));
|
.unwrap_or_else(|_| String::from("public.ecr.aws/r4s5t9y4/logos/logos-blockchain:test"));
|
||||||
@ -164,81 +150,13 @@ pub fn prepare_assets(
|
|||||||
}
|
}
|
||||||
|
|
||||||
const CFGSYNC_K8S_TIMEOUT_SECS: u64 = 300;
|
const CFGSYNC_K8S_TIMEOUT_SECS: u64 = 300;
|
||||||
const DEFAULT_GRAFANA_NODE_PORT: u16 = 30030;
|
|
||||||
const DEFAULT_IN_IMAGE_KZG_PARAMS_PATH: &str = "/opt/nomos/kzg-params/kzgrs_test_params";
|
const DEFAULT_IN_IMAGE_KZG_PARAMS_PATH: &str = "/opt/nomos/kzg-params/kzgrs_test_params";
|
||||||
|
|
||||||
fn sync_grafana_dashboards(root: &Path, chart_path: &Path) -> Result<(), AssetsError> {
|
|
||||||
let source_dir = stack_assets_root(root).join("monitoring/grafana/dashboards");
|
|
||||||
let dest_dir = chart_path.join("grafana/dashboards");
|
|
||||||
|
|
||||||
if !source_dir.exists() {
|
|
||||||
return Err(AssetsError::MissingGrafanaDashboards { path: source_dir });
|
|
||||||
}
|
|
||||||
|
|
||||||
fs::create_dir_all(&dest_dir).map_err(|source| AssetsError::Io {
|
|
||||||
path: dest_dir.clone(),
|
|
||||||
source,
|
|
||||||
})?;
|
|
||||||
|
|
||||||
let mut removed = 0usize;
|
|
||||||
for entry in fs::read_dir(&dest_dir).map_err(|source| AssetsError::Io {
|
|
||||||
path: dest_dir.clone(),
|
|
||||||
source,
|
|
||||||
})? {
|
|
||||||
let entry = entry.map_err(|source| AssetsError::Io {
|
|
||||||
path: dest_dir.clone(),
|
|
||||||
source,
|
|
||||||
})?;
|
|
||||||
let path = entry.path();
|
|
||||||
if path.extension().and_then(|ext| ext.to_str()) != Some("json") {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
fs::remove_file(&path).map_err(|source| AssetsError::Io {
|
|
||||||
path: path.clone(),
|
|
||||||
source,
|
|
||||||
})?;
|
|
||||||
removed += 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
let mut copied = 0usize;
|
|
||||||
for entry in fs::read_dir(&source_dir).map_err(|source| AssetsError::Io {
|
|
||||||
path: source_dir.clone(),
|
|
||||||
source,
|
|
||||||
})? {
|
|
||||||
let entry = entry.map_err(|source| AssetsError::Io {
|
|
||||||
path: source_dir.clone(),
|
|
||||||
source,
|
|
||||||
})?;
|
|
||||||
let path = entry.path();
|
|
||||||
if path.extension().and_then(|ext| ext.to_str()) != Some("json") {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
let file_name = path.file_name().unwrap_or_default();
|
|
||||||
let dest_path = dest_dir.join(file_name);
|
|
||||||
fs::copy(&path, &dest_path).map_err(|source| AssetsError::Io {
|
|
||||||
path: dest_path.clone(),
|
|
||||||
source,
|
|
||||||
})?;
|
|
||||||
copied += 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
debug!(
|
|
||||||
source = %source_dir.display(),
|
|
||||||
dest = %dest_dir.display(),
|
|
||||||
removed,
|
|
||||||
copied,
|
|
||||||
"synced Grafana dashboards into Helm chart"
|
|
||||||
);
|
|
||||||
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
|
|
||||||
fn render_cfgsync_config(
|
fn render_cfgsync_config(
|
||||||
root: &Path,
|
root: &Path,
|
||||||
topology: &GeneratedTopology,
|
topology: &GeneratedTopology,
|
||||||
kzg_mode: KzgMode,
|
kzg_mode: KzgMode,
|
||||||
external_prometheus: Option<&Url>,
|
metrics_otlp_ingest_url: Option<&Url>,
|
||||||
external_otlp_metrics_endpoint: Option<&Url>,
|
|
||||||
) -> Result<String, AssetsError> {
|
) -> Result<String, AssetsError> {
|
||||||
let cfgsync_template_path = stack_assets_root(root).join("cfgsync.yaml");
|
let cfgsync_template_path = stack_assets_root(root).join("cfgsync.yaml");
|
||||||
debug!(path = %cfgsync_template_path.display(), "loading cfgsync template");
|
debug!(path = %cfgsync_template_path.display(), "loading cfgsync template");
|
||||||
@ -254,15 +172,11 @@ fn render_cfgsync_config(
|
|||||||
.unwrap_or_else(|| DEFAULT_IN_IMAGE_KZG_PARAMS_PATH.to_string());
|
.unwrap_or_else(|| DEFAULT_IN_IMAGE_KZG_PARAMS_PATH.to_string());
|
||||||
}
|
}
|
||||||
|
|
||||||
let external_metrics_endpoint = match external_otlp_metrics_endpoint {
|
if let Some(endpoint) = metrics_otlp_ingest_url.cloned() {
|
||||||
Some(endpoint) => Some(Ok(endpoint.clone())),
|
cfg.tracing_settings.metrics = MetricsLayer::Otlp(OtlpMetricsConfig {
|
||||||
None => external_prometheus.map(derive_prometheus_otlp_metrics_endpoint),
|
endpoint,
|
||||||
};
|
host_identifier: "node".into(),
|
||||||
|
});
|
||||||
if let Some(endpoint) = external_metrics_endpoint.transpose()? {
|
|
||||||
if let MetricsLayer::Otlp(ref mut config) = cfg.tracing_settings.metrics {
|
|
||||||
config.endpoint = endpoint;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
cfg.timeout = cfg.timeout.max(CFGSYNC_K8S_TIMEOUT_SECS);
|
cfg.timeout = cfg.timeout.max(CFGSYNC_K8S_TIMEOUT_SECS);
|
||||||
@ -270,16 +184,6 @@ fn render_cfgsync_config(
|
|||||||
render_cfgsync_yaml(&cfg).map_err(|source| AssetsError::Cfgsync { source })
|
render_cfgsync_yaml(&cfg).map_err(|source| AssetsError::Cfgsync { source })
|
||||||
}
|
}
|
||||||
|
|
||||||
fn derive_prometheus_otlp_metrics_endpoint(base: &Url) -> Result<Url, AssetsError> {
|
|
||||||
let base = base.as_str().trim_end_matches('/');
|
|
||||||
let otlp_metrics = format!("{base}/api/v1/otlp/v1/metrics");
|
|
||||||
Url::parse(&otlp_metrics).map_err(|source| AssetsError::Cfgsync {
|
|
||||||
source: anyhow::anyhow!(
|
|
||||||
"invalid OTLP metrics endpoint derived from external Prometheus url '{base}': {source}"
|
|
||||||
),
|
|
||||||
})
|
|
||||||
}
|
|
||||||
|
|
||||||
struct ScriptPaths {
|
struct ScriptPaths {
|
||||||
run_cfgsync: PathBuf,
|
run_cfgsync: PathBuf,
|
||||||
run_shared: PathBuf,
|
run_shared: PathBuf,
|
||||||
@ -337,16 +241,8 @@ fn helm_chart_path() -> Result<PathBuf, AssetsError> {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn render_values_yaml(
|
fn render_values_yaml(topology: &GeneratedTopology) -> Result<String, AssetsError> {
|
||||||
topology: &GeneratedTopology,
|
let values = build_values(topology);
|
||||||
external_prometheus: Option<&Url>,
|
|
||||||
external_prometheus_grafana_url: Option<&Url>,
|
|
||||||
) -> Result<String, AssetsError> {
|
|
||||||
let values = build_values(
|
|
||||||
topology,
|
|
||||||
external_prometheus,
|
|
||||||
external_prometheus_grafana_url,
|
|
||||||
);
|
|
||||||
serde_yaml::to_string(&values).map_err(|source| AssetsError::Values { source })
|
serde_yaml::to_string(&values).map_err(|source| AssetsError::Values { source })
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -402,8 +298,6 @@ struct HelmValues {
|
|||||||
cfgsync: CfgsyncValues,
|
cfgsync: CfgsyncValues,
|
||||||
validators: NodeGroup,
|
validators: NodeGroup,
|
||||||
executors: NodeGroup,
|
executors: NodeGroup,
|
||||||
prometheus: PrometheusValues,
|
|
||||||
grafana: GrafanaValues,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Serialize)]
|
#[derive(Serialize)]
|
||||||
@ -426,72 +320,13 @@ struct NodeValues {
|
|||||||
env: BTreeMap<String, String>,
|
env: BTreeMap<String, String>,
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Serialize)]
|
fn build_values(topology: &GeneratedTopology) -> HelmValues {
|
||||||
struct PrometheusValues {
|
|
||||||
enabled: bool,
|
|
||||||
#[serde(rename = "externalUrl", skip_serializing_if = "Option::is_none")]
|
|
||||||
external_url: Option<String>,
|
|
||||||
}
|
|
||||||
|
|
||||||
#[derive(Serialize)]
|
|
||||||
struct GrafanaValues {
|
|
||||||
enabled: bool,
|
|
||||||
image: String,
|
|
||||||
#[serde(rename = "imagePullPolicy")]
|
|
||||||
image_pull_policy: String,
|
|
||||||
#[serde(rename = "adminUser")]
|
|
||||||
admin_user: String,
|
|
||||||
#[serde(rename = "adminPassword")]
|
|
||||||
admin_password: String,
|
|
||||||
service: GrafanaServiceValues,
|
|
||||||
}
|
|
||||||
|
|
||||||
#[derive(Serialize)]
|
|
||||||
struct GrafanaServiceValues {
|
|
||||||
#[serde(rename = "type")]
|
|
||||||
type_field: String,
|
|
||||||
#[serde(rename = "nodePort")]
|
|
||||||
node_port: Option<u16>,
|
|
||||||
}
|
|
||||||
|
|
||||||
fn build_values(
|
|
||||||
topology: &GeneratedTopology,
|
|
||||||
external_prometheus: Option<&Url>,
|
|
||||||
external_prometheus_grafana_url: Option<&Url>,
|
|
||||||
) -> HelmValues {
|
|
||||||
let cfgsync = CfgsyncValues {
|
let cfgsync = CfgsyncValues {
|
||||||
port: cfgsync_port(),
|
port: cfgsync_port(),
|
||||||
};
|
};
|
||||||
let pol_mode = pol_proof_mode();
|
let pol_mode = pol_proof_mode();
|
||||||
let image_pull_policy =
|
let image_pull_policy =
|
||||||
env::var("NOMOS_TESTNET_IMAGE_PULL_POLICY").unwrap_or_else(|_| "IfNotPresent".into());
|
env::var("NOMOS_TESTNET_IMAGE_PULL_POLICY").unwrap_or_else(|_| "IfNotPresent".into());
|
||||||
let grafana_node_port = match kzg_mode() {
|
|
||||||
KzgMode::HostPath => Some(DEFAULT_GRAFANA_NODE_PORT),
|
|
||||||
KzgMode::InImage => env::var("NOMOS_GRAFANA_NODE_PORT").ok().and_then(|value| {
|
|
||||||
value
|
|
||||||
.parse::<u16>()
|
|
||||||
.ok()
|
|
||||||
.filter(|port| *port >= 30000 && *port <= 32767)
|
|
||||||
}),
|
|
||||||
};
|
|
||||||
let grafana = GrafanaValues {
|
|
||||||
enabled: true,
|
|
||||||
image: "grafana/grafana:10.4.1".into(),
|
|
||||||
image_pull_policy: "IfNotPresent".into(),
|
|
||||||
admin_user: "admin".into(),
|
|
||||||
admin_password: "admin".into(),
|
|
||||||
service: GrafanaServiceValues {
|
|
||||||
type_field: "NodePort".into(),
|
|
||||||
node_port: grafana_node_port,
|
|
||||||
},
|
|
||||||
};
|
|
||||||
let prometheus_external_url = external_prometheus_grafana_url
|
|
||||||
.or(external_prometheus)
|
|
||||||
.map(|url| url.as_str().trim_end_matches('/').to_string());
|
|
||||||
let prometheus = PrometheusValues {
|
|
||||||
enabled: prometheus_external_url.is_none(),
|
|
||||||
external_url: prometheus_external_url,
|
|
||||||
};
|
|
||||||
debug!(pol_mode, "rendering Helm values for k8s stack");
|
debug!(pol_mode, "rendering Helm values for k8s stack");
|
||||||
let validators = topology
|
let validators = topology
|
||||||
.validators()
|
.validators()
|
||||||
@ -578,8 +413,6 @@ fn build_values(
|
|||||||
count: topology.executors().len(),
|
count: topology.executors().len(),
|
||||||
nodes: executors,
|
nodes: executors,
|
||||||
},
|
},
|
||||||
prometheus,
|
|
||||||
grafana,
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -4,7 +4,7 @@ use kube::Client;
|
|||||||
use reqwest::Url;
|
use reqwest::Url;
|
||||||
use testing_framework_core::{
|
use testing_framework_core::{
|
||||||
nodes::ApiClient,
|
nodes::ApiClient,
|
||||||
scenario::{CleanupGuard, Metrics, MetricsError, NodeClients, http_probe::NodeRole},
|
scenario::{CleanupGuard, NodeClients, http_probe::NodeRole},
|
||||||
topology::{generation::GeneratedTopology, readiness::ReadinessError},
|
topology::{generation::GeneratedTopology, readiness::ReadinessError},
|
||||||
};
|
};
|
||||||
use tracing::{debug, info};
|
use tracing::{debug, info};
|
||||||
@ -15,8 +15,7 @@ use crate::{
|
|||||||
infrastructure::assets::RunnerAssets,
|
infrastructure::assets::RunnerAssets,
|
||||||
lifecycle::{cleanup::RunnerCleanup, logs::dump_namespace_logs},
|
lifecycle::{cleanup::RunnerCleanup, logs::dump_namespace_logs},
|
||||||
wait::{
|
wait::{
|
||||||
ClusterPorts, ClusterReady, HostPort, NodeConfigPorts, PortForwardHandle,
|
ClusterPorts, ClusterReady, NodeConfigPorts, PortForwardHandle, wait_for_cluster_ready,
|
||||||
wait_for_cluster_ready,
|
|
||||||
},
|
},
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -38,8 +37,6 @@ pub struct ClusterEnvironment {
|
|||||||
validator_testing_ports: Vec<u16>,
|
validator_testing_ports: Vec<u16>,
|
||||||
executor_api_ports: Vec<u16>,
|
executor_api_ports: Vec<u16>,
|
||||||
executor_testing_ports: Vec<u16>,
|
executor_testing_ports: Vec<u16>,
|
||||||
prometheus: Option<HostPort>,
|
|
||||||
grafana: Option<HostPort>,
|
|
||||||
port_forwards: Vec<PortForwardHandle>,
|
port_forwards: Vec<PortForwardHandle>,
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -68,8 +65,6 @@ impl ClusterEnvironment {
|
|||||||
validator_testing_ports,
|
validator_testing_ports,
|
||||||
executor_api_ports,
|
executor_api_ports,
|
||||||
executor_testing_ports,
|
executor_testing_ports,
|
||||||
prometheus: ports.prometheus.clone(),
|
|
||||||
grafana: ports.grafana.clone(),
|
|
||||||
port_forwards,
|
port_forwards,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -105,14 +100,6 @@ impl ClusterEnvironment {
|
|||||||
&self.release
|
&self.release
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn prometheus_endpoint(&self) -> Option<&HostPort> {
|
|
||||||
self.prometheus.as_ref()
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn grafana_endpoint(&self) -> Option<&HostPort> {
|
|
||||||
self.grafana.as_ref()
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn validator_ports(&self) -> (&[u16], &[u16]) {
|
pub fn validator_ports(&self) -> (&[u16], &[u16]) {
|
||||||
(&self.validator_api_ports, &self.validator_testing_ports)
|
(&self.validator_api_ports, &self.validator_testing_ports)
|
||||||
}
|
}
|
||||||
@ -229,16 +216,6 @@ pub fn build_node_clients(cluster: &ClusterEnvironment) -> Result<NodeClients, N
|
|||||||
Ok(NodeClients::new(validators, executors))
|
Ok(NodeClients::new(validators, executors))
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn metrics_handle_from_endpoint(endpoint: &HostPort) -> Result<Metrics, MetricsError> {
|
|
||||||
let url = cluster_host_url(&endpoint.host, endpoint.port)
|
|
||||||
.map_err(|err| MetricsError::new(format!("invalid prometheus url: {err}")))?;
|
|
||||||
Metrics::from_prometheus(url)
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn metrics_handle_from_url(url: Url) -> Result<Metrics, MetricsError> {
|
|
||||||
Metrics::from_prometheus(url)
|
|
||||||
}
|
|
||||||
|
|
||||||
pub async fn ensure_cluster_readiness(
|
pub async fn ensure_cluster_readiness(
|
||||||
descriptors: &GeneratedTopology,
|
descriptors: &GeneratedTopology,
|
||||||
cluster: &ClusterEnvironment,
|
cluster: &ClusterEnvironment,
|
||||||
@ -324,7 +301,6 @@ pub async fn wait_for_ports_or_cleanup(
|
|||||||
namespace: &str,
|
namespace: &str,
|
||||||
release: &str,
|
release: &str,
|
||||||
specs: &PortSpecs,
|
specs: &PortSpecs,
|
||||||
prometheus_enabled: bool,
|
|
||||||
cleanup_guard: &mut Option<RunnerCleanup>,
|
cleanup_guard: &mut Option<RunnerCleanup>,
|
||||||
) -> Result<ClusterReady, crate::deployer::K8sRunnerError> {
|
) -> Result<ClusterReady, crate::deployer::K8sRunnerError> {
|
||||||
info!(
|
info!(
|
||||||
@ -340,13 +316,11 @@ pub async fn wait_for_ports_or_cleanup(
|
|||||||
release,
|
release,
|
||||||
&specs.validators,
|
&specs.validators,
|
||||||
&specs.executors,
|
&specs.executors,
|
||||||
prometheus_enabled,
|
|
||||||
)
|
)
|
||||||
.await
|
.await
|
||||||
{
|
{
|
||||||
Ok(ports) => {
|
Ok(ports) => {
|
||||||
info!(
|
info!(
|
||||||
prometheus = ?ports.ports.prometheus,
|
|
||||||
validator_ports = ?ports.ports.validators,
|
validator_ports = ?ports.ports.validators,
|
||||||
executor_ports = ?ports.ports.executors,
|
executor_ports = ?ports.ports.executors,
|
||||||
"cluster port-forwards established"
|
"cluster port-forwards established"
|
||||||
|
|||||||
@ -1,36 +0,0 @@
|
|||||||
use tokio::time::sleep;
|
|
||||||
|
|
||||||
use super::{ClusterWaitError, node_http_probe_timeout, node_http_timeout};
|
|
||||||
use crate::host::node_host;
|
|
||||||
|
|
||||||
const GRAFANA_HTTP_POLL_INTERVAL: std::time::Duration = std::time::Duration::from_secs(1);
|
|
||||||
|
|
||||||
pub async fn wait_for_grafana_http_nodeport(port: u16) -> Result<(), ClusterWaitError> {
|
|
||||||
let host = node_host();
|
|
||||||
wait_for_grafana_http(&host, port, node_http_probe_timeout()).await
|
|
||||||
}
|
|
||||||
|
|
||||||
pub async fn wait_for_grafana_http_port_forward(port: u16) -> Result<(), ClusterWaitError> {
|
|
||||||
wait_for_grafana_http("127.0.0.1", port, node_http_timeout()).await
|
|
||||||
}
|
|
||||||
|
|
||||||
async fn wait_for_grafana_http(
|
|
||||||
host: &str,
|
|
||||||
port: u16,
|
|
||||||
timeout: std::time::Duration,
|
|
||||||
) -> Result<(), ClusterWaitError> {
|
|
||||||
let client = reqwest::Client::new();
|
|
||||||
let url = format!("http://{host}:{port}/api/health");
|
|
||||||
|
|
||||||
let attempts = timeout.as_secs();
|
|
||||||
for _ in 0..attempts {
|
|
||||||
if let Ok(resp) = client.get(&url).send().await
|
|
||||||
&& resp.status().is_success()
|
|
||||||
{
|
|
||||||
return Ok(());
|
|
||||||
}
|
|
||||||
sleep(GRAFANA_HTTP_POLL_INTERVAL).await;
|
|
||||||
}
|
|
||||||
|
|
||||||
Err(ClusterWaitError::GrafanaTimeout { port })
|
|
||||||
}
|
|
||||||
@ -4,9 +4,7 @@ use kube::Error as KubeError;
|
|||||||
use testing_framework_core::{
|
use testing_framework_core::{
|
||||||
constants::{
|
constants::{
|
||||||
DEFAULT_HTTP_POLL_INTERVAL, DEFAULT_K8S_DEPLOYMENT_TIMEOUT,
|
DEFAULT_HTTP_POLL_INTERVAL, DEFAULT_K8S_DEPLOYMENT_TIMEOUT,
|
||||||
DEFAULT_NODE_HTTP_PROBE_TIMEOUT, DEFAULT_NODE_HTTP_TIMEOUT, DEFAULT_PROMETHEUS_HTTP_PORT,
|
DEFAULT_NODE_HTTP_PROBE_TIMEOUT, DEFAULT_NODE_HTTP_TIMEOUT,
|
||||||
DEFAULT_PROMETHEUS_HTTP_PROBE_TIMEOUT, DEFAULT_PROMETHEUS_HTTP_TIMEOUT,
|
|
||||||
DEFAULT_PROMETHEUS_SERVICE_NAME,
|
|
||||||
},
|
},
|
||||||
scenario::http_probe::NodeRole,
|
scenario::http_probe::NodeRole,
|
||||||
};
|
};
|
||||||
@ -14,11 +12,9 @@ use thiserror::Error;
|
|||||||
|
|
||||||
mod deployment;
|
mod deployment;
|
||||||
mod forwarding;
|
mod forwarding;
|
||||||
mod grafana;
|
|
||||||
mod http_probe;
|
mod http_probe;
|
||||||
mod orchestrator;
|
mod orchestrator;
|
||||||
mod ports;
|
mod ports;
|
||||||
mod prometheus;
|
|
||||||
|
|
||||||
pub use forwarding::PortForwardHandle;
|
pub use forwarding::PortForwardHandle;
|
||||||
pub use orchestrator::wait_for_cluster_ready;
|
pub use orchestrator::wait_for_cluster_ready;
|
||||||
@ -44,15 +40,13 @@ pub struct HostPort {
|
|||||||
pub port: u16,
|
pub port: u16,
|
||||||
}
|
}
|
||||||
|
|
||||||
/// All port assignments for the cluster plus Prometheus.
|
/// All port assignments for the cluster.
|
||||||
#[derive(Debug)]
|
#[derive(Debug)]
|
||||||
pub struct ClusterPorts {
|
pub struct ClusterPorts {
|
||||||
pub validators: Vec<NodePortAllocation>,
|
pub validators: Vec<NodePortAllocation>,
|
||||||
pub executors: Vec<NodePortAllocation>,
|
pub executors: Vec<NodePortAllocation>,
|
||||||
pub validator_host: String,
|
pub validator_host: String,
|
||||||
pub executor_host: String,
|
pub executor_host: String,
|
||||||
pub prometheus: Option<HostPort>,
|
|
||||||
pub grafana: Option<HostPort>,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Success result from waiting for the cluster: host ports and forward handles.
|
/// Success result from waiting for the cluster: host ports and forward handles.
|
||||||
@ -96,10 +90,6 @@ pub enum ClusterWaitError {
|
|||||||
port: u16,
|
port: u16,
|
||||||
timeout: Duration,
|
timeout: Duration,
|
||||||
},
|
},
|
||||||
#[error("timeout waiting for prometheus readiness on NodePort {port}")]
|
|
||||||
PrometheusTimeout { port: u16 },
|
|
||||||
#[error("timeout waiting for grafana readiness on port {port}")]
|
|
||||||
GrafanaTimeout { port: u16 },
|
|
||||||
#[error("failed to start port-forward for service {service} port {port}: {source}")]
|
#[error("failed to start port-forward for service {service} port {port}: {source}")]
|
||||||
PortForward {
|
PortForward {
|
||||||
service: String,
|
service: String,
|
||||||
@ -149,32 +139,6 @@ pub(crate) fn http_poll_interval() -> Duration {
|
|||||||
*HTTP_POLL_INTERVAL
|
*HTTP_POLL_INTERVAL
|
||||||
}
|
}
|
||||||
|
|
||||||
pub(crate) const PROMETHEUS_HTTP_PORT: u16 = DEFAULT_PROMETHEUS_HTTP_PORT;
|
|
||||||
|
|
||||||
static PROMETHEUS_HTTP_TIMEOUT: LazyLock<Duration> = LazyLock::new(|| {
|
|
||||||
env_duration_secs(
|
|
||||||
"K8S_RUNNER_PROMETHEUS_HTTP_TIMEOUT_SECS",
|
|
||||||
DEFAULT_PROMETHEUS_HTTP_TIMEOUT,
|
|
||||||
)
|
|
||||||
});
|
|
||||||
|
|
||||||
static PROMETHEUS_HTTP_PROBE_TIMEOUT: LazyLock<Duration> = LazyLock::new(|| {
|
|
||||||
env_duration_secs(
|
|
||||||
"K8S_RUNNER_PROMETHEUS_HTTP_PROBE_TIMEOUT_SECS",
|
|
||||||
DEFAULT_PROMETHEUS_HTTP_PROBE_TIMEOUT,
|
|
||||||
)
|
|
||||||
});
|
|
||||||
|
|
||||||
pub(crate) fn prometheus_http_timeout() -> Duration {
|
|
||||||
*PROMETHEUS_HTTP_TIMEOUT
|
|
||||||
}
|
|
||||||
|
|
||||||
pub(crate) fn prometheus_http_probe_timeout() -> Duration {
|
|
||||||
*PROMETHEUS_HTTP_PROBE_TIMEOUT
|
|
||||||
}
|
|
||||||
|
|
||||||
pub(crate) const PROMETHEUS_SERVICE_NAME: &str = DEFAULT_PROMETHEUS_SERVICE_NAME;
|
|
||||||
|
|
||||||
fn env_duration_secs(key: &str, default: Duration) -> Duration {
|
fn env_duration_secs(key: &str, default: Duration) -> Duration {
|
||||||
env::var(key)
|
env::var(key)
|
||||||
.ok()
|
.ok()
|
||||||
|
|||||||
@ -1,31 +1,20 @@
|
|||||||
use kube::Client;
|
use kube::Client;
|
||||||
use testing_framework_core::scenario::http_probe::NodeRole;
|
use testing_framework_core::scenario::http_probe::NodeRole;
|
||||||
|
|
||||||
use super::{
|
use super::{ClusterPorts, ClusterReady, ClusterWaitError, NodeConfigPorts};
|
||||||
ClusterPorts, ClusterReady, ClusterWaitError, HostPort, NodeConfigPorts, PROMETHEUS_HTTP_PORT,
|
|
||||||
PROMETHEUS_SERVICE_NAME, prometheus_http_probe_timeout,
|
|
||||||
};
|
|
||||||
use crate::lifecycle::wait::{
|
use crate::lifecycle::wait::{
|
||||||
deployment::wait_for_deployment_ready,
|
deployment::wait_for_deployment_ready,
|
||||||
forwarding::{
|
forwarding::{PortForwardHandle, kill_port_forwards, port_forward_group},
|
||||||
PortForwardHandle, PortForwardSpawn, kill_port_forwards, port_forward_group,
|
|
||||||
port_forward_service,
|
|
||||||
},
|
|
||||||
grafana::{wait_for_grafana_http_nodeport, wait_for_grafana_http_port_forward},
|
|
||||||
http_probe::{wait_for_node_http_nodeport, wait_for_node_http_port_forward},
|
http_probe::{wait_for_node_http_nodeport, wait_for_node_http_port_forward},
|
||||||
ports::{discover_node_ports, find_node_port},
|
ports::discover_node_ports,
|
||||||
prometheus::{wait_for_prometheus_http_nodeport, wait_for_prometheus_http_port_forward},
|
|
||||||
};
|
};
|
||||||
|
|
||||||
const GRAFANA_HTTP_PORT: u16 = 3000;
|
|
||||||
|
|
||||||
pub async fn wait_for_cluster_ready(
|
pub async fn wait_for_cluster_ready(
|
||||||
client: &Client,
|
client: &Client,
|
||||||
namespace: &str,
|
namespace: &str,
|
||||||
release: &str,
|
release: &str,
|
||||||
validator_ports: &[NodeConfigPorts],
|
validator_ports: &[NodeConfigPorts],
|
||||||
executor_ports: &[NodeConfigPorts],
|
executor_ports: &[NodeConfigPorts],
|
||||||
prometheus_enabled: bool,
|
|
||||||
) -> Result<ClusterReady, ClusterWaitError> {
|
) -> Result<ClusterReady, ClusterWaitError> {
|
||||||
if validator_ports.is_empty() {
|
if validator_ports.is_empty() {
|
||||||
return Err(ClusterWaitError::MissingValidator);
|
return Err(ClusterWaitError::MissingValidator);
|
||||||
@ -108,75 +97,12 @@ pub async fn wait_for_cluster_ready(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
let mut prometheus = None;
|
|
||||||
if prometheus_enabled {
|
|
||||||
let mut prometheus_port = find_node_port(
|
|
||||||
client,
|
|
||||||
namespace,
|
|
||||||
PROMETHEUS_SERVICE_NAME,
|
|
||||||
PROMETHEUS_HTTP_PORT,
|
|
||||||
)
|
|
||||||
.await?;
|
|
||||||
let mut prometheus_host = crate::host::node_host();
|
|
||||||
if wait_for_prometheus_http_nodeport(prometheus_port, prometheus_http_probe_timeout())
|
|
||||||
.await
|
|
||||||
.is_err()
|
|
||||||
{
|
|
||||||
let PortForwardSpawn { local_port, handle } =
|
|
||||||
port_forward_service(namespace, PROMETHEUS_SERVICE_NAME, PROMETHEUS_HTTP_PORT)
|
|
||||||
.map_err(|err| {
|
|
||||||
kill_port_forwards(&mut port_forwards);
|
|
||||||
err
|
|
||||||
})?;
|
|
||||||
prometheus_port = local_port;
|
|
||||||
prometheus_host = "127.0.0.1".to_owned();
|
|
||||||
port_forwards.push(handle);
|
|
||||||
if let Err(err) = wait_for_prometheus_http_port_forward(prometheus_port).await {
|
|
||||||
return Err(cleanup_port_forwards(&mut port_forwards, err));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
prometheus = Some(HostPort {
|
|
||||||
host: prometheus_host,
|
|
||||||
port: prometheus_port,
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
let mut grafana = None;
|
|
||||||
let grafana_service = format!("{release}-grafana");
|
|
||||||
if let Ok(node_port) =
|
|
||||||
find_node_port(client, namespace, &grafana_service, GRAFANA_HTTP_PORT).await
|
|
||||||
{
|
|
||||||
let mut grafana_host = crate::host::node_host();
|
|
||||||
let mut grafana_port = node_port;
|
|
||||||
if wait_for_grafana_http_nodeport(grafana_port).await.is_err() {
|
|
||||||
let PortForwardSpawn { local_port, handle } =
|
|
||||||
port_forward_service(namespace, &grafana_service, GRAFANA_HTTP_PORT).map_err(
|
|
||||||
|err| {
|
|
||||||
kill_port_forwards(&mut port_forwards);
|
|
||||||
err
|
|
||||||
},
|
|
||||||
)?;
|
|
||||||
grafana_host = "127.0.0.1".to_owned();
|
|
||||||
grafana_port = local_port;
|
|
||||||
port_forwards.push(handle);
|
|
||||||
if let Err(err) = wait_for_grafana_http_port_forward(grafana_port).await {
|
|
||||||
return Err(cleanup_port_forwards(&mut port_forwards, err));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
grafana = Some(HostPort {
|
|
||||||
host: grafana_host,
|
|
||||||
port: grafana_port,
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
Ok(ClusterReady {
|
Ok(ClusterReady {
|
||||||
ports: ClusterPorts {
|
ports: ClusterPorts {
|
||||||
validators: validator_allocations,
|
validators: validator_allocations,
|
||||||
executors: executor_allocations,
|
executors: executor_allocations,
|
||||||
validator_host,
|
validator_host,
|
||||||
executor_host,
|
executor_host,
|
||||||
prometheus,
|
|
||||||
grafana,
|
|
||||||
},
|
},
|
||||||
port_forwards,
|
port_forwards,
|
||||||
})
|
})
|
||||||
|
|||||||
@ -1,39 +0,0 @@
|
|||||||
use tokio::time::sleep;
|
|
||||||
|
|
||||||
use super::{ClusterWaitError, prometheus_http_timeout};
|
|
||||||
use crate::host::node_host;
|
|
||||||
|
|
||||||
const PROMETHEUS_HTTP_POLL_INTERVAL: std::time::Duration = std::time::Duration::from_secs(1);
|
|
||||||
|
|
||||||
pub async fn wait_for_prometheus_http_nodeport(
|
|
||||||
port: u16,
|
|
||||||
timeout: std::time::Duration,
|
|
||||||
) -> Result<(), ClusterWaitError> {
|
|
||||||
let host = node_host();
|
|
||||||
wait_for_prometheus_http(&host, port, timeout).await
|
|
||||||
}
|
|
||||||
|
|
||||||
pub async fn wait_for_prometheus_http_port_forward(port: u16) -> Result<(), ClusterWaitError> {
|
|
||||||
wait_for_prometheus_http("127.0.0.1", port, prometheus_http_timeout()).await
|
|
||||||
}
|
|
||||||
|
|
||||||
async fn wait_for_prometheus_http(
|
|
||||||
host: &str,
|
|
||||||
port: u16,
|
|
||||||
timeout: std::time::Duration,
|
|
||||||
) -> Result<(), ClusterWaitError> {
|
|
||||||
let client = reqwest::Client::new();
|
|
||||||
let url = format!("http://{host}:{port}/-/ready");
|
|
||||||
|
|
||||||
let attempts = timeout.as_secs();
|
|
||||||
for _ in 0..attempts {
|
|
||||||
if let Ok(resp) = client.get(&url).send().await
|
|
||||||
&& resp.status().is_success()
|
|
||||||
{
|
|
||||||
return Ok(());
|
|
||||||
}
|
|
||||||
sleep(PROMETHEUS_HTTP_POLL_INTERVAL).await;
|
|
||||||
}
|
|
||||||
|
|
||||||
Err(ClusterWaitError::PrometheusTimeout { port })
|
|
||||||
}
|
|
||||||
@ -132,6 +132,12 @@ pub trait ObservabilityBuilderExt: Sized {
|
|||||||
url: &str,
|
url: &str,
|
||||||
) -> CoreScenarioBuilder<ObservabilityCapability>;
|
) -> CoreScenarioBuilder<ObservabilityCapability>;
|
||||||
|
|
||||||
|
/// Optional Grafana base URL for printing/logging (human access).
|
||||||
|
fn with_grafana_url(self, url: reqwest::Url) -> CoreScenarioBuilder<ObservabilityCapability>;
|
||||||
|
|
||||||
|
/// Convenience wrapper that parses a URL string (panics if invalid).
|
||||||
|
fn with_grafana_url_str(self, url: &str) -> CoreScenarioBuilder<ObservabilityCapability>;
|
||||||
|
|
||||||
#[deprecated(note = "use with_metrics_query_url")]
|
#[deprecated(note = "use with_metrics_query_url")]
|
||||||
fn with_external_prometheus(
|
fn with_external_prometheus(
|
||||||
self,
|
self,
|
||||||
@ -190,6 +196,7 @@ impl ObservabilityBuilderExt for CoreScenarioBuilder<()> {
|
|||||||
metrics_query_url: Some(url),
|
metrics_query_url: Some(url),
|
||||||
metrics_query_grafana_url: None,
|
metrics_query_grafana_url: None,
|
||||||
metrics_otlp_ingest_url: None,
|
metrics_otlp_ingest_url: None,
|
||||||
|
grafana_url: None,
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -206,6 +213,7 @@ impl ObservabilityBuilderExt for CoreScenarioBuilder<()> {
|
|||||||
metrics_query_url: None,
|
metrics_query_url: None,
|
||||||
metrics_query_grafana_url: None,
|
metrics_query_grafana_url: None,
|
||||||
metrics_otlp_ingest_url: Some(url),
|
metrics_otlp_ingest_url: Some(url),
|
||||||
|
grafana_url: None,
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -225,6 +233,7 @@ impl ObservabilityBuilderExt for CoreScenarioBuilder<()> {
|
|||||||
metrics_query_url: None,
|
metrics_query_url: None,
|
||||||
metrics_query_grafana_url: Some(url),
|
metrics_query_grafana_url: Some(url),
|
||||||
metrics_otlp_ingest_url: None,
|
metrics_otlp_ingest_url: None,
|
||||||
|
grafana_url: None,
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -235,6 +244,20 @@ impl ObservabilityBuilderExt for CoreScenarioBuilder<()> {
|
|||||||
let parsed = reqwest::Url::parse(url).expect("metrics query grafana url must be valid");
|
let parsed = reqwest::Url::parse(url).expect("metrics query grafana url must be valid");
|
||||||
self.with_metrics_query_grafana_url(parsed)
|
self.with_metrics_query_grafana_url(parsed)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn with_grafana_url(self, url: reqwest::Url) -> CoreScenarioBuilder<ObservabilityCapability> {
|
||||||
|
self.with_capabilities(ObservabilityCapability {
|
||||||
|
metrics_query_url: None,
|
||||||
|
metrics_query_grafana_url: None,
|
||||||
|
metrics_otlp_ingest_url: None,
|
||||||
|
grafana_url: Some(url),
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
fn with_grafana_url_str(self, url: &str) -> CoreScenarioBuilder<ObservabilityCapability> {
|
||||||
|
let parsed = reqwest::Url::parse(url).expect("grafana url must be valid");
|
||||||
|
self.with_grafana_url(parsed)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl ObservabilityBuilderExt for CoreScenarioBuilder<ObservabilityCapability> {
|
impl ObservabilityBuilderExt for CoreScenarioBuilder<ObservabilityCapability> {
|
||||||
@ -282,6 +305,19 @@ impl ObservabilityBuilderExt for CoreScenarioBuilder<ObservabilityCapability> {
|
|||||||
let parsed = reqwest::Url::parse(url).expect("metrics query grafana url must be valid");
|
let parsed = reqwest::Url::parse(url).expect("metrics query grafana url must be valid");
|
||||||
self.with_metrics_query_grafana_url(parsed)
|
self.with_metrics_query_grafana_url(parsed)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn with_grafana_url(
|
||||||
|
mut self,
|
||||||
|
url: reqwest::Url,
|
||||||
|
) -> CoreScenarioBuilder<ObservabilityCapability> {
|
||||||
|
self.capabilities_mut().grafana_url = Some(url);
|
||||||
|
self
|
||||||
|
}
|
||||||
|
|
||||||
|
fn with_grafana_url_str(self, url: &str) -> CoreScenarioBuilder<ObservabilityCapability> {
|
||||||
|
let parsed = reqwest::Url::parse(url).expect("grafana url must be valid");
|
||||||
|
self.with_grafana_url(parsed)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Builder for transaction workloads.
|
/// Builder for transaction workloads.
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user