From b78add7792edccd2f412db56e07577dea49f4753 Mon Sep 17 00:00:00 2001 From: andrussal Date: Wed, 17 Dec 2025 17:30:17 +0100 Subject: [PATCH 1/2] docs(book): update metrics url options --- book/src/operations.md | 35 +++++++++++++++++++++++++++++++---- book/src/troubleshooting.md | 2 ++ 2 files changed, 33 insertions(+), 4 deletions(-) diff --git a/book/src/operations.md b/book/src/operations.md index 93e2fcc..e633d27 100644 --- a/book/src/operations.md +++ b/book/src/operations.md @@ -229,21 +229,24 @@ cargo run -p runner-examples --bin k8s_runner - `NOMOS_TESTNET_IMAGE` — Image tag (required) - `POL_PROOF_DEV_MODE=true` — **Required** for all runners - `NOMOS_DEMO_VALIDATORS` / `NOMOS_DEMO_EXECUTORS` / `NOMOS_DEMO_RUN_SECS` — Topology overrides -- `K8S_RUNNER_EXTERNAL_PROMETHEUS_URL` (or `NOMOS_EXTERNAL_PROMETHEUS_URL`) — Reuse an existing Prometheus and skip deploying the in-chart Prometheus; also points node OTLP metrics export and the in-cluster Grafana datasource at that Prometheus +- `K8S_RUNNER_METRICS_QUERY_URL` (or `NOMOS_METRICS_QUERY_URL`) — PromQL base URL the *runner process* can query (e.g. localhost port-forward or public LB) +- `K8S_RUNNER_METRICS_QUERY_GRAFANA_URL` (or `NOMOS_METRICS_QUERY_GRAFANA_URL`) — PromQL base URL the *Grafana pod* can query (cluster-reachable); defaults to `K8S_RUNNER_METRICS_QUERY_URL` if unset +- `K8S_RUNNER_METRICS_OTLP_INGEST_URL` (or `NOMOS_METRICS_OTLP_INGEST_URL`) — Full OTLP HTTP ingest URL used by *nodes* to export metrics (backend-specific path) **External Prometheus (optional):** ```bash -export K8S_RUNNER_EXTERNAL_PROMETHEUS_URL=http://your-prometheus:9090 +export K8S_RUNNER_METRICS_QUERY_URL=http://your-prometheus:9090 cargo run -p runner-examples --bin k8s_runner ``` Notes: - The runner config expects Prometheus to accept OTLP metrics at `/api/v1/otlp/v1/metrics` (the in-chart Prometheus is started with `--web.enable-otlp-receiver` and `--enable-feature=otlp-write-receiver`). - Use a URL reachable from inside the cluster (for example a `Service` DNS name like `http://prometheus.monitoring:9090`). +- If you set `K8S_RUNNER_METRICS_QUERY_URL` to a localhost port-forward (e.g. `http://127.0.0.1:8428`), also set `K8S_RUNNER_METRICS_QUERY_GRAFANA_URL` to a cluster-reachable `Service` DNS name so Grafana can query metrics. **Via `scripts/run-examples.sh` (optional):** ```bash -scripts/run-examples.sh -t 60 -v 1 -e 1 k8s --external-prometheus http://your-prometheus:9090 +scripts/run-examples.sh -t 60 -v 1 -e 1 k8s --metrics-query-url http://your-prometheus:9090 ``` **In code (optional):** @@ -252,7 +255,7 @@ use testing_framework_core::scenario::ScenarioBuilder; use testing_framework_workflows::ObservabilityBuilderExt as _; let plan = ScenarioBuilder::with_node_counts(1, 1) - .with_external_prometheus_str("http://your-prometheus:9090") + .with_metrics_query_url_str("http://your-prometheus:9090") .build(); ``` @@ -408,6 +411,30 @@ Common target prefixes for `NOMOS_LOG_FILTER`: NOMOS_LOG_FILTER="cryptarchia=trace,nomos_da_sampling=debug,chain_service=info,chain_network=info,chain_leader=info" ``` +### Keep Metrics Out of Log Output + +If you see “metric-like” fields (for example `counter.*`, `gauge.*`) showing up in node logs, it usually means the metric update is being emitted as a `tracing` event and then formatted by the logging layer. + +Preferred approach: emit metrics via a metrics API (OpenTelemetry meter / `metrics` crate), and keep `tracing` for logs/traces only. + +If you must emit metrics as `tracing` events, route them to a dedicated target and disable that target for the **log formatting** layer: + +```rust +tracing::info!( + target: "nomos_metrics", + counter.blend_connection_events_total = 1u64, + event = event_name, +); +``` + +Then add a filter directive to keep that target out of logs: + +```bash +NOMOS_LOG_FILTER="nomos_metrics=off,cryptarchia=trace,nomos_da_sampling=debug" +``` + +Note: this only works if the node’s subscriber applies the filter to the log/`fmt` layer (not globally), so your OTLP metrics pipeline still receives the event/instrumentation. + ### Accessing Logs Per Runner #### Local Runner diff --git a/book/src/troubleshooting.md b/book/src/troubleshooting.md index d9c0f2a..c4645ef 100644 --- a/book/src/troubleshooting.md +++ b/book/src/troubleshooting.md @@ -207,6 +207,8 @@ NOMOS_LOG_FILTER="cryptarchia=trace,nomos_da_sampling=debug" \ cargo run -p runner-examples --bin local_runner ``` +If metric updates are polluting your logs (fields like `counter.*` / `gauge.*`), move those events to a dedicated `tracing` target (e.g. `target: "nomos_metrics"`) and set `NOMOS_LOG_FILTER="nomos_metrics=off,..."` so they don’t get formatted into log output. + ### 5. Verify Observability Endpoints If expectations report observability issues: From 96fcea1f96ce6dc9639f28032d39214d83c0811f Mon Sep 17 00:00:00 2001 From: andrussal Date: Wed, 17 Dec 2025 17:34:32 +0100 Subject: [PATCH 2/2] workflows: retry tx submission Also bump pinned NOMOS_NODE_REV. --- .../workflows/src/workloads/util.rs | 36 ++++++++++++------- versions.env | 2 +- 2 files changed, 24 insertions(+), 14 deletions(-) diff --git a/testing-framework/workflows/src/workloads/util.rs b/testing-framework/workflows/src/workloads/util.rs index 498140d..2e9f9a5 100644 --- a/testing-framework/workflows/src/workloads/util.rs +++ b/testing-framework/workflows/src/workloads/util.rs @@ -1,4 +1,4 @@ -use std::sync::Arc; +use std::{sync::Arc, time::Duration}; use nomos_core::{ block::Block, @@ -11,6 +11,9 @@ use rand::{seq::SliceRandom as _, thread_rng}; use testing_framework_core::scenario::{DynError, RunContext}; use tracing::debug; +const SUBMIT_RETRIES: usize = 5; +const SUBMIT_RETRY_DELAY: Duration = Duration::from_millis(500); + /// Scans a block and invokes the matcher for every operation until it returns /// `Some(...)`. Returns `None` when no matching operation is found. pub fn find_channel_op(block: &Block, matcher: &mut F) -> Option @@ -51,22 +54,29 @@ pub async fn submit_transaction_via_cluster( executor_clients.shuffle(&mut thread_rng()); let clients = validator_clients.into_iter().chain(executor_clients); + let mut clients: Vec<_> = clients.collect(); let mut last_err = None; - for client in clients { - let url = client.base_url().clone(); - debug!(?tx_hash, %url, "submitting transaction to client"); - match client - .submit_transaction(&tx) - .await - .map_err(|err| -> DynError { err.into() }) - { - Ok(()) => return Ok(()), - Err(err) => { - debug!(?tx_hash, %url, "transaction submission failed"); - last_err = Some(err); + for attempt in 0..SUBMIT_RETRIES { + clients.shuffle(&mut thread_rng()); + + for client in &clients { + let url = client.base_url().clone(); + debug!(?tx_hash, %url, attempt, "submitting transaction to client"); + match client + .submit_transaction(&tx) + .await + .map_err(|err| -> DynError { err.into() }) + { + Ok(()) => return Ok(()), + Err(err) => { + debug!(?tx_hash, %url, attempt, "transaction submission failed"); + last_err = Some(err); + } } } + + tokio::time::sleep(SUBMIT_RETRY_DELAY).await; } Err(last_err.unwrap_or_else(|| "cluster client exhausted all nodes".into())) diff --git a/versions.env b/versions.env index ac50f41..a5239fc 100644 --- a/versions.env +++ b/versions.env @@ -1,7 +1,7 @@ VERSION=v0.3.1 NOMOS_BUNDLE_VERSION=v4 # Pinned nomos-node revision used for CI builds and binary bundles. -NOMOS_NODE_REV=ad104981ca79da20183550b5aced9e49773fb6d5 +NOMOS_NODE_REV=6bdb09567d21cd1e53527846a9cd48493ad49387 # Optional: local nomos-node checkout override (do not commit absolute paths). # NOMOS_NODE_PATH=