From b78add7792edccd2f412db56e07577dea49f4753 Mon Sep 17 00:00:00 2001
From: andrussal <salumets.andrus@gmail.com>
Date: Wed, 17 Dec 2025 17:30:17 +0100
Subject: [PATCH 1/2] docs(book): update metrics url options

---
 book/src/operations.md      | 35 +++++++++++++++++++++++++++++++----
 book/src/troubleshooting.md |  2 ++
 2 files changed, 33 insertions(+), 4 deletions(-)

diff --git a/book/src/operations.md b/book/src/operations.md
index 93e2fcc..e633d27 100644
--- a/book/src/operations.md
+++ b/book/src/operations.md
@@ -229,21 +229,24 @@ cargo run -p runner-examples --bin k8s_runner
 - `NOMOS_TESTNET_IMAGE` — Image tag (required)
 - `POL_PROOF_DEV_MODE=true` — **Required** for all runners
 - `NOMOS_DEMO_VALIDATORS` / `NOMOS_DEMO_EXECUTORS` / `NOMOS_DEMO_RUN_SECS` — Topology overrides
-- `K8S_RUNNER_EXTERNAL_PROMETHEUS_URL` (or `NOMOS_EXTERNAL_PROMETHEUS_URL`) — Reuse an existing Prometheus and skip deploying the in-chart Prometheus; also points node OTLP metrics export and the in-cluster Grafana datasource at that Prometheus
+- `K8S_RUNNER_METRICS_QUERY_URL` (or `NOMOS_METRICS_QUERY_URL`) — PromQL base URL the *runner process* can query (e.g. localhost port-forward or public LB)
+- `K8S_RUNNER_METRICS_QUERY_GRAFANA_URL` (or `NOMOS_METRICS_QUERY_GRAFANA_URL`) — PromQL base URL the *Grafana pod* can query (cluster-reachable); defaults to `K8S_RUNNER_METRICS_QUERY_URL` if unset
+- `K8S_RUNNER_METRICS_OTLP_INGEST_URL` (or `NOMOS_METRICS_OTLP_INGEST_URL`) — Full OTLP HTTP ingest URL that *nodes* export metrics to, including the backend-specific path (e.g. `http://your-prometheus:9090/api/v1/otlp/v1/metrics` for Prometheus)
 
 **External Prometheus (optional):**
 ```bash
-export K8S_RUNNER_EXTERNAL_PROMETHEUS_URL=http://your-prometheus:9090
+export K8S_RUNNER_METRICS_QUERY_URL=http://your-prometheus:9090
 cargo run -p runner-examples --bin k8s_runner
 ```
 
 Notes:
 - The runner config expects Prometheus to accept OTLP metrics at `/api/v1/otlp/v1/metrics` (the in-chart Prometheus is started with `--web.enable-otlp-receiver` and `--enable-feature=otlp-write-receiver`).
 - Use a URL reachable from inside the cluster (for example a `Service` DNS name like `http://prometheus.monitoring:9090`).
+- If you set `K8S_RUNNER_METRICS_QUERY_URL` to a localhost port-forward (e.g. `http://127.0.0.1:8428`), also set `K8S_RUNNER_METRICS_QUERY_GRAFANA_URL` to a cluster-reachable `Service` DNS name so Grafana can query metrics.
 
 **Via `scripts/run-examples.sh` (optional):**
 ```bash
-scripts/run-examples.sh -t 60 -v 1 -e 1 k8s --external-prometheus http://your-prometheus:9090
+scripts/run-examples.sh -t 60 -v 1 -e 1 k8s --metrics-query-url http://your-prometheus:9090
 ```
 
 **In code (optional):**
@@ -252,7 +255,7 @@ use testing_framework_core::scenario::ScenarioBuilder;
 use testing_framework_workflows::ObservabilityBuilderExt as _;
 
 let plan = ScenarioBuilder::with_node_counts(1, 1)
-    .with_external_prometheus_str("http://your-prometheus:9090")
+    .with_metrics_query_url_str("http://your-prometheus:9090")
     .build();
 ```
 
@@ -408,6 +411,30 @@ Common target prefixes for `NOMOS_LOG_FILTER`:
 NOMOS_LOG_FILTER="cryptarchia=trace,nomos_da_sampling=debug,chain_service=info,chain_network=info,chain_leader=info"
 ```
 
+### Keep Metrics Out of Log Output
+
+If you see “metric-like” fields (for example `counter.*`, `gauge.*`) showing up in node logs, it usually means the metric update is being emitted as a `tracing` event and then formatted by the logging layer.
+
+Preferred approach: emit metrics via a metrics API (OpenTelemetry meter / `metrics` crate), and keep `tracing` for logs/traces only.
+
+If you must emit metrics as `tracing` events, route them to a dedicated target and disable that target for the **log formatting** layer:
+
+```rust
+tracing::info!(
+    target: "nomos_metrics",
+    counter.blend_connection_events_total = 1u64,
+    event = event_name,
+);
+```
+
+Then add a filter directive to keep that target out of logs:
+
+```bash
+NOMOS_LOG_FILTER="nomos_metrics=off,cryptarchia=trace,nomos_da_sampling=debug"
+```
+
+Note: this only works if the node’s subscriber applies the filter to the log/`fmt` layer alone (per-layer filtering). If the filter is applied globally, the `nomos_metrics` events are dropped before they reach the OTLP metrics pipeline as well.
+
 ### Accessing Logs Per Runner
 
 #### Local Runner
diff --git a/book/src/troubleshooting.md b/book/src/troubleshooting.md
index d9c0f2a..c4645ef 100644
--- a/book/src/troubleshooting.md
+++ b/book/src/troubleshooting.md
@@ -207,6 +207,8 @@ NOMOS_LOG_FILTER="cryptarchia=trace,nomos_da_sampling=debug" \
 cargo run -p runner-examples --bin local_runner
 ```
 
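+If metric updates are polluting your logs (fields like `counter.*` / `gauge.*`), move those events to a dedicated `tracing` target (e.g. `target: "nomos_metrics"`) and set `NOMOS_LOG_FILTER="nomos_metrics=off,..."` so they don’t get formatted into log output. This only helps if the node applies the filter to the log formatting layer rather than globally; a global filter would also drop the events from the metrics pipeline.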
+
 ### 5. Verify Observability Endpoints
 
 If expectations report observability issues:

From 96fcea1f96ce6dc9639f28032d39214d83c0811f Mon Sep 17 00:00:00 2001
From: andrussal <salumets.andrus@gmail.com>
Date: Wed, 17 Dec 2025 17:34:32 +0100
Subject: [PATCH 2/2] workflows: retry tx submission

Also bump pinned NOMOS_NODE_REV.
---
 .../workflows/src/workloads/util.rs           | 36 ++++++++++++-------
 versions.env                                  |  2 +-
 2 files changed, 24 insertions(+), 14 deletions(-)

diff --git a/testing-framework/workflows/src/workloads/util.rs b/testing-framework/workflows/src/workloads/util.rs
index 498140d..2e9f9a5 100644
--- a/testing-framework/workflows/src/workloads/util.rs
+++ b/testing-framework/workflows/src/workloads/util.rs
@@ -1,4 +1,4 @@
-use std::sync::Arc;
+use std::{sync::Arc, time::Duration};
 
 use nomos_core::{
     block::Block,
@@ -11,6 +11,9 @@ use rand::{seq::SliceRandom as _, thread_rng};
 use testing_framework_core::scenario::{DynError, RunContext};
 use tracing::debug;
 
+const SUBMIT_RETRIES: usize = 5;
+const SUBMIT_RETRY_DELAY: Duration = Duration::from_millis(500);
+
 /// Scans a block and invokes the matcher for every operation until it returns
 /// `Some(...)`. Returns `None` when no matching operation is found.
 pub fn find_channel_op<F>(block: &Block<SignedMantleTx>, matcher: &mut F) -> Option<MsgId>
@@ -51,22 +54,29 @@ pub async fn submit_transaction_via_cluster(
     executor_clients.shuffle(&mut thread_rng());
 
     let clients = validator_clients.into_iter().chain(executor_clients);
+    let mut clients: Vec<_> = clients.collect();
     let mut last_err = None;
 
-    for client in clients {
-        let url = client.base_url().clone();
-        debug!(?tx_hash, %url, "submitting transaction to client");
-        match client
-            .submit_transaction(&tx)
-            .await
-            .map_err(|err| -> DynError { err.into() })
-        {
-            Ok(()) => return Ok(()),
-            Err(err) => {
-                debug!(?tx_hash, %url, "transaction submission failed");
-                last_err = Some(err);
+    for attempt in 0..SUBMIT_RETRIES {
+        clients.shuffle(&mut thread_rng());
+
+        for client in &clients {
+            let url = client.base_url().clone();
+            debug!(?tx_hash, %url, attempt, "submitting transaction to client");
+            match client
+                .submit_transaction(&tx)
+                .await
+                .map_err(|err| -> DynError { err.into() })
+            {
+                Ok(()) => return Ok(()),
+                Err(err) => {
+                    debug!(?tx_hash, %url, attempt, "transaction submission failed");
+                    last_err = Some(err);
+                }
             }
         }
+
+        tokio::time::sleep(SUBMIT_RETRY_DELAY).await;
     }
 
     Err(last_err.unwrap_or_else(|| "cluster client exhausted all nodes".into()))
diff --git a/versions.env b/versions.env
index ac50f41..a5239fc 100644
--- a/versions.env
+++ b/versions.env
@@ -1,7 +1,7 @@
 VERSION=v0.3.1
 NOMOS_BUNDLE_VERSION=v4
 # Pinned nomos-node revision used for CI builds and binary bundles.
-NOMOS_NODE_REV=ad104981ca79da20183550b5aced9e49773fb6d5
+NOMOS_NODE_REV=6bdb09567d21cd1e53527846a9cd48493ad49387
 
 # Optional: local nomos-node checkout override (do not commit absolute paths).
 # NOMOS_NODE_PATH=