refactor: remove embedded observability

This commit is contained in:
andrussal 2025-12-18 13:05:40 +01:00
parent d8be8e589a
commit 91c9044abb
33 changed files with 941 additions and 73 deletions

1
.gitignore vendored
View File

@ -3,6 +3,7 @@
.tmp/ .tmp/
/.tmp*/ /.tmp*/
tmp-local-logs/ tmp-local-logs/
tmp/node-logs/
# IDE / OS cruft # IDE / OS cruft
.idea/ .idea/
.DS_Store .DS_Store

View File

@ -4,7 +4,7 @@ members = [
"examples/doc-snippets", "examples/doc-snippets",
"testing-framework/configs", "testing-framework/configs",
"testing-framework/core", "testing-framework/core",
"testing-framework/cucumber_ext", "testing-framework/cucumber",
"testing-framework/deployers/compose", "testing-framework/deployers/compose",
"testing-framework/deployers/k8s", "testing-framework/deployers/k8s",
"testing-framework/deployers/local", "testing-framework/deployers/local",

View File

@ -26,6 +26,7 @@
- [Operations](operations.md) - [Operations](operations.md)
- [Part III — Developer Reference](part-iii.md) - [Part III — Developer Reference](part-iii.md)
- [Scenario Model (Developer Level)](scenario-model.md) - [Scenario Model (Developer Level)](scenario-model.md)
- [API Levels: Builder DSL vs. Direct](api-levels.md)
- [Extending the Framework](extending.md) - [Extending the Framework](extending.md)
- [Example: New Workload & Expectation (Rust)](custom-workload-example.md) - [Example: New Workload & Expectation (Rust)](custom-workload-example.md)
- [Internal Crate Reference](internal-crate-reference.md) - [Internal Crate Reference](internal-crate-reference.md)

131
book/src/api-levels.md Normal file
View File

@ -0,0 +1,131 @@
# API Levels: Builder DSL vs. Direct Instantiation
The framework supports two styles for constructing scenarios:
1. **High-level Builder DSL** (recommended): fluent helper methods (e.g. `.transactions_with(...)`)
2. **Low-level direct instantiation**: construct workload/expectation types explicitly, then attach them
Both styles produce the same runtime behavior because they ultimately call the same core builder APIs.
## High-Level Builder DSL (Recommended)
The DSL is implemented as extension traits (primarily `testing_framework_workflows::ScenarioBuilderExt`) on the core scenario builder.
```rust
use std::time::Duration;
use testing_framework_core::scenario::ScenarioBuilder;
use testing_framework_workflows::ScenarioBuilderExt;
let plan = ScenarioBuilder::topology_with(|t| t.network_star().validators(3).executors(2))
.wallets(5)
.transactions_with(|txs| txs.rate(5).users(3))
.da_with(|da| da.channel_rate(1).blob_rate(1).headroom_percent(20))
.expect_consensus_liveness()
.with_run_duration(Duration::from_secs(60))
.build();
```
**When to use:**
- Most test code (smoke, regression, CI)
- When you want sensible defaults and minimal boilerplate
## Low-Level Direct Instantiation
Direct instantiation gives you explicit control over the concrete types you attach:
```rust
use std::{
num::{NonZeroU64, NonZeroUsize},
time::Duration,
};
use testing_framework_core::scenario::ScenarioBuilder;
use testing_framework_workflows::{
expectations::ConsensusLiveness,
workloads::{da, transaction},
};
let tx_workload = transaction::Workload::with_rate(5)
.expect("transaction rate must be non-zero")
.with_user_limit(NonZeroUsize::new(3));
let da_workload = da::Workload::with_rate(
NonZeroU64::new(1).unwrap(), // blob rate per block
NonZeroU64::new(1).unwrap(), // channel rate per block
da::Workload::default_headroom_percent(),
);
let plan = ScenarioBuilder::topology_with(|t| t.network_star().validators(3).executors(2))
.wallets(5)
.with_workload(tx_workload)
.with_workload(da_workload)
.with_expectation(ConsensusLiveness::default())
.with_run_duration(Duration::from_secs(60))
.build();
```
**When to use:**
- Custom workload/expectation implementations
- Reusing preconfigured workload instances across multiple scenarios
- Debugging / exploring the underlying workload types
## Method Correspondence
| High-Level DSL | Low-Level Direct |
|----------------|------------------|
| `.transactions_with(\|txs\| txs.rate(5).users(3))` | `.with_workload(transaction::Workload::with_rate(5).expect(...).with_user_limit(...))` |
| `.da_with(\|da\| da.blob_rate(1).channel_rate(1))` | `.with_workload(da::Workload::with_rate(...))` |
| `.expect_consensus_liveness()` | `.with_expectation(ConsensusLiveness::default())` |
## Bundled Expectations (Important)
Workloads can bundle expectations by implementing `Workload::expectations()`.
These bundled expectations are attached automatically whenever you call `.with_workload(...)` — including via the DSL — because the core builder expands each workload's expectations during attachment.
## Mixing Both Styles
Mixing is common: use the DSL for built-ins, and direct instantiation for custom pieces.
```rust
use std::time::Duration;
use testing_framework_core::scenario::ScenarioBuilder;
use testing_framework_workflows::ScenarioBuilderExt;
let custom_workload = MyCustomWorkload::new(config);
let plan = ScenarioBuilder::topology_with(|t| t.network_star().validators(3).executors(2))
.transactions_with(|txs| txs.rate(5).users(3)) // DSL
.with_workload(custom_workload) // direct
.expect_consensus_liveness() // DSL
.with_run_duration(Duration::from_secs(60))
.build();
```
## Implementation Detail (How the DSL Works)
The DSL methods are thin wrappers. For example:
```rust
builder.transactions_with(|txs| txs.rate(5).users(3))
```
is roughly equivalent to:
```rust
builder.transactions().rate(5).users(3).apply()
```
## Troubleshooting
**DSL method not found**
- Ensure the extension traits are in scope, e.g. `use testing_framework_workflows::ScenarioBuilderExt;`
- Cross-check method names in [Builder API Quick Reference](dsl-cheat-sheet.md)
## See Also
- [Builder API Quick Reference](dsl-cheat-sheet.md)
- [Example: New Workload & Expectation (Rust)](custom-workload-example.md)
- [Extending the Framework](extending.md)

View File

@ -1,31 +1,311 @@
# Extending the Framework # Extending the Framework
## Adding a workload This guide shows how to extend the framework with custom workloads, expectations, runners, and topology helpers. Each section includes the trait outline and a minimal code example.
1) Implement `testing_framework_core::scenario::Workload`:
- Provide a name and any bundled expectations.
- In `init`, derive inputs from `GeneratedTopology` and `RunMetrics`; fail
fast if prerequisites are missing (e.g., wallet data, node addresses).
- In `start`, drive async traffic using the `RunContext` clients.
2) Expose the workload from a module under `testing-framework/workflows` and
consider adding a DSL helper for ergonomic wiring.
## Adding an expectation ## Adding a Workload
1) Implement `testing_framework_core::scenario::Expectation`:
- Use `start_capture` to snapshot baseline metrics.
- Use `evaluate` to assert outcomes after workloads finish; return all errors
so the runner can aggregate them.
2) Export it from `testing-framework/workflows` if it is reusable.
## Adding a runner **Steps:**
1) Implement `testing_framework_core::scenario::Deployer` for your backend. 1. Implement `testing_framework_core::scenario::Workload`
- Produce a `RunContext` with `NodeClients`, metrics endpoints, and optional 2. Provide a name and any bundled expectations
`NodeControlHandle`. 3. Use `init` to derive inputs from topology/metrics; fail fast if prerequisites are missing
- Guard cleanup with `CleanupGuard` to reclaim resources even on failures. 4. Use `start` to drive async traffic using `RunContext` clients
2) Mirror the readiness and block-feed probes used by the existing runners so 5. Expose from `testing-framework/workflows` and optionally add a DSL helper
workloads can rely on consistent signals.
## Adding topology helpers **Trait outline:**
- Extend `testing_framework_core::topology::config::TopologyBuilder` with new layouts or
configuration presets (e.g., specialized DA parameters). Keep defaults safe: ```rust
ensure at least one participant and clamp dispersal factors as the current use async_trait::async_trait;
helpers do. use testing_framework_core::scenario::{
DynError, Expectation, RunContext, RunMetrics, Workload,
};
use testing_framework_core::topology::generation::GeneratedTopology;
pub struct MyWorkload {
// Configuration fields
target_rate: u64,
}
impl MyWorkload {
pub fn new(target_rate: u64) -> Self {
Self { target_rate }
}
}
#[async_trait]
impl Workload for MyWorkload {
fn name(&self) -> &str {
"my_workload"
}
fn expectations(&self) -> Vec<Box<dyn Expectation>> {
// Return bundled expectations that should run with this workload
vec![Box::new(MyExpectation::new(self.target_rate))]
}
fn init(
&mut self,
topology: &GeneratedTopology,
_run_metrics: &RunMetrics,
) -> Result<(), DynError> {
// Validate prerequisites (e.g., enough nodes, wallet data present)
if topology.validators().is_empty() {
return Err("no validators available".into());
}
Ok(())
}
async fn start(&self, ctx: &RunContext) -> Result<(), DynError> {
// Drive async activity: submit transactions, query nodes, etc.
let clients = ctx.node_clients().validator_clients();
for client in clients {
let info = client.consensus_info().await?;
tracing::info!(?info, "workload queried node");
}
Ok(())
}
}
```
**Key points:**
- `name()` identifies the workload in logs
- `expectations()` bundles default checks (can be empty)
- `init()` validates topology before run starts
- `start()` executes concurrently with other workloads; it should complete before the run duration expires
See [Example: New Workload & Expectation](custom-workload-example.md) for a complete, runnable example.
## Adding an Expectation
**Steps:**
1. Implement `testing_framework_core::scenario::Expectation`
2. Use `start_capture` to snapshot baseline metrics (optional)
3. Use `evaluate` to assert outcomes after workloads finish
4. Return descriptive errors; the runner aggregates them
5. Export from `testing-framework/workflows` if reusable
**Trait outline:**
```rust
use async_trait::async_trait;
use testing_framework_core::scenario::{DynError, Expectation, RunContext};
pub struct MyExpectation {
expected_value: u64,
captured_baseline: Option<u64>,
}
impl MyExpectation {
pub fn new(expected_value: u64) -> Self {
Self {
expected_value,
captured_baseline: None,
}
}
}
#[async_trait]
impl Expectation for MyExpectation {
fn name(&self) -> &str {
"my_expectation"
}
async fn start_capture(&mut self, ctx: &RunContext) -> Result<(), DynError> {
// Optional: capture baseline state before workloads start
let client = ctx.node_clients().validator_clients().first()
.ok_or("no validators")?;
let info = client.consensus_info().await?;
self.captured_baseline = Some(info.current_block_id.slot);
tracing::info!(baseline = self.captured_baseline, "captured baseline");
Ok(())
}
async fn evaluate(&mut self, ctx: &RunContext) -> Result<(), DynError> {
// Assert the expected condition holds after workloads finish
let client = ctx.node_clients().validator_clients().first()
.ok_or("no validators")?;
let info = client.consensus_info().await?;
let final_slot = info.current_block_id.slot;
let baseline = self.captured_baseline.unwrap_or(0);
let delta = final_slot.saturating_sub(baseline);
if delta < self.expected_value {
return Err(format!(
"expected at least {} blocks, got {}",
self.expected_value, delta
).into());
}
tracing::info!(delta, "expectation passed");
Ok(())
}
}
```
**Key points:**
- `name()` identifies the expectation in logs
- `start_capture()` runs before workloads start (optional)
- `evaluate()` runs after workloads finish; return descriptive errors
- Expectations run sequentially; keep them fast
## Adding a Runner (Deployer)
**Steps:**
1. Implement `testing_framework_core::scenario::Deployer<Caps>` for your capability type
2. Deploy infrastructure and return a `Runner`
3. Construct `NodeClients` and spawn a `BlockFeed`
4. Build a `RunContext` and provide a `CleanupGuard` for teardown
**Trait outline:**
```rust
use async_trait::async_trait;
use testing_framework_core::scenario::{
CleanupGuard, Deployer, DynError, Metrics, NodeClients, RunContext, Runner, Scenario,
spawn_block_feed,
};
use testing_framework_core::topology::deployment::Topology;
pub struct MyDeployer {
// Configuration: cluster connection details, etc.
}
impl MyDeployer {
pub fn new() -> Self {
Self {}
}
}
#[async_trait]
impl Deployer<()> for MyDeployer {
type Error = DynError;
async fn deploy(&self, scenario: &Scenario<()>) -> Result<Runner, Self::Error> {
// 1. Launch nodes using scenario.topology()
// 2. Wait for readiness (e.g., consensus info endpoint responds)
// 3. Build NodeClients for validators/executors
// 4. Spawn a block feed for expectations (optional but recommended)
// 5. Create NodeControlHandle if you support restarts (optional)
// 6. Return a Runner wrapping RunContext + CleanupGuard
tracing::info!("deploying scenario with MyDeployer");
let topology: Option<Topology> = None; // Some(topology) if you spawned one
let node_clients = NodeClients::default(); // Or NodeClients::from_topology(...)
let (block_feed, block_feed_guard) = spawn_block_feed(&node_clients).await?;
let telemetry = Metrics::empty(); // or Metrics::from_prometheus(...)
let node_control = None; // or Some(Arc<dyn NodeControlHandle>)
let context = RunContext::new(
scenario.topology().clone(),
topology,
node_clients,
scenario.duration(),
telemetry,
block_feed,
node_control,
);
// If you also have other resources to clean up (containers/pods/etc),
// wrap them in your own CleanupGuard implementation and call
// CleanupGuard::cleanup(Box::new(block_feed_guard)) inside it.
Ok(Runner::new(context, Some(Box::new(block_feed_guard))))
}
}
```
**Key points:**
- `deploy()` must return a fully prepared `Runner`
- Block until nodes are ready before returning (avoid false negatives)
- Use a `CleanupGuard` to tear down resources on failure (and on `RunHandle` drop)
- If you want chaos workloads, also provide a `NodeControlHandle` via `RunContext`
## Adding Topology Helpers
**Steps:**
1. Extend `testing_framework_core::topology::config::TopologyBuilder` with new layouts
2. Keep defaults safe: ensure at least one participant, clamp dispersal factors
3. Consider adding configuration presets for specialized parameters
**Example:**
```rust
use testing_framework_core::topology::config::TopologyBuilder;
impl TopologyBuilder {
/// Creates a "ring" topology where each node connects to its neighbors
pub fn network_ring(&mut self) -> &mut Self {
// Configure peer connections in a ring layout
self.with_network_layout(|layout| {
// Implement ring connection logic
layout.ring_peers()
});
self
}
/// Preset for high-throughput DA configuration
pub fn da_high_throughput(&mut self) -> &mut Self {
self.with_da_params(|params| {
params
.dispersal_factor(8)
.replication_factor(16)
.chunk_size(4096)
});
self
}
}
```
**Key points:**
- Maintain method chaining (return `&mut Self`)
- Validate inputs: clamp factors, enforce minimums
- Document assumptions (e.g., "requires at least 4 nodes")
## Adding a DSL Helper
To expose your custom workload through the high-level DSL, add a trait extension:
```rust
use testing_framework_core::scenario::Builder as ScenarioBuilder;
pub trait MyWorkloadDsl {
fn my_workload_with(
self,
f: impl FnOnce(MyWorkloadBuilder) -> MyWorkloadBuilder,
) -> Self;
}
impl<Caps> MyWorkloadDsl for ScenarioBuilder<Caps> {
fn my_workload_with(
self,
f: impl FnOnce(MyWorkloadBuilder) -> MyWorkloadBuilder,
) -> Self {
let builder = f(MyWorkloadBuilder::default());
self.with_workload(builder.build())
}
}
```
Users can then call:
```rust
ScenarioBuilder::topology_with(|t| { /* ... */ })
.my_workload_with(|w| {
w.target_rate(10)
.some_option(true)
})
.build()
```
## See Also
- [API Levels: Builder DSL vs. Direct](api-levels.md) - Understanding the two API levels
- [Custom Workload Example](custom-workload-example.md) - Complete runnable example
- [Internal Crate Reference](internal-crate-reference.md) - Where to add new code

View File

@ -37,7 +37,7 @@ Both **LocalDeployer** and **ComposeDeployer** work in CI environments:
**ComposeDeployer in CI (recommended):** **ComposeDeployer in CI (recommended):**
- Better isolation (containerized) - Better isolation (containerized)
- Reproducible environment - Reproducible environment
- Includes Prometheus/observability - Can integrate with external Prometheus/Grafana (optional)
- **Trade-off:** Slower startup (Docker image build) - **Trade-off:** Slower startup (Docker image build)
- **Trade-off:** Requires Docker daemon - **Trade-off:** Requires Docker daemon
@ -60,7 +60,21 @@ scripts/run-examples.sh -t 60 -v 1 -e 1 compose
scripts/run-examples.sh -t 60 -v 1 -e 1 k8s scripts/run-examples.sh -t 60 -v 1 -e 1 k8s
``` ```
This script handles circuit setup, binary building/bundling, image building, and execution. This script handles circuit setup, binary building/bundling, (local) image building, and execution.
Note: for `k8s` runs against non-local clusters (e.g. EKS), the cluster pulls images from a registry,
so a local `docker build` is not used. In that case, build + push your image separately (see
`scripts/build_test_image.sh`) and set `NOMOS_TESTNET_IMAGE` to the pushed reference.
### Quick Smoke Matrix (Host/Compose/K8s)
For a small “does everything still run?” matrix (including `--no-image-build` variants where relevant), use:
```bash
scripts/run-test-matrix.sh -t 120 -v 1 -e 1
```
This is useful after making runner/image/script changes, and it forwards `--metrics-*` options through to `scripts/run-examples.sh`.
**Environment overrides:** **Environment overrides:**
- `VERSION=v0.3.1` — Circuit version - `VERSION=v0.3.1` — Circuit version
@ -192,6 +206,7 @@ cargo run -p runner-examples --bin compose_runner
**Compose-specific features:** **Compose-specific features:**
- **Node control support**: Only runner that supports chaos testing (`.enable_node_control()` + chaos workloads) - **Node control support**: Only runner that supports chaos testing (`.enable_node_control()` + chaos workloads)
- **Observability is external**: Set `NOMOS_METRICS_*` / `NOMOS_GRAFANA_URL` to enable telemetry links and querying - **Observability is external**: Set `NOMOS_METRICS_*` / `NOMOS_GRAFANA_URL` to enable telemetry links and querying
- Quickstart: `scripts/setup-observability.sh compose up` then `scripts/setup-observability.sh compose env`
**Important:** **Important:**
- Containers expect KZG parameters at `/kzgrs_test_params/kzgrs_test_params` (note the repeated filename) - Containers expect KZG parameters at `/kzgrs_test_params/kzgrs_test_params` (note the repeated filename)
@ -248,13 +263,13 @@ cargo run -p runner-examples --bin k8s_runner
Notes: Notes:
- `NOMOS_METRICS_QUERY_URL` must be reachable from the runner process (often via `kubectl port-forward`). - `NOMOS_METRICS_QUERY_URL` must be reachable from the runner process (often via `kubectl port-forward`).
- `NOMOS_METRICS_OTLP_INGEST_URL` must be reachable from nodes (pods/containers) and is backend-specific (Prometheus vs VictoriaMetrics paths differ). - `NOMOS_METRICS_OTLP_INGEST_URL` must be reachable from nodes (pods/containers) and is backend-specific (Prometheus vs VictoriaMetrics paths differ).
- Quickstart installer: `scripts/setup-observability.sh k8s install` then `scripts/setup-observability.sh k8s env` (optional dashboards: `scripts/setup-observability.sh k8s dashboards`)
**Via `scripts/run-examples.sh` (optional):** **Via `scripts/run-examples.sh` (optional):**
```bash ```bash
scripts/run-examples.sh -t 60 -v 1 -e 1 k8s \ scripts/run-examples.sh -t 60 -v 1 -e 1 k8s \
--metrics-query-url http://your-prometheus:9090 \ --metrics-query-url http://your-prometheus:9090 \
--metrics-otlp-ingest-url http://your-prometheus:9090/api/v1/otlp/v1/metrics \ --metrics-otlp-ingest-url http://your-prometheus:9090/api/v1/otlp/v1/metrics
--grafana-url http://your-grafana:3000
``` ```
**In code (optional):** **In code (optional):**
@ -565,12 +580,15 @@ cargo run -p runner-examples --bin local_runner
Runners expose metrics and node HTTP endpoints for expectation code and debugging: Runners expose metrics and node HTTP endpoints for expectation code and debugging:
**Prometheus-compatible metrics querying (optional):** **Prometheus-compatible metrics querying (optional):**
- The framework does **not** deploy Prometheus. - Runners do **not** provision Prometheus automatically.
- For a ready-to-run stack, use `scripts/setup-observability.sh`:
- Compose: `scripts/setup-observability.sh compose up` then `scripts/setup-observability.sh compose env`
- K8s: `scripts/setup-observability.sh k8s install` then `scripts/setup-observability.sh k8s env`
- Provide `NOMOS_METRICS_QUERY_URL` (PromQL base URL) to enable `ctx.telemetry()` queries. - Provide `NOMOS_METRICS_QUERY_URL` (PromQL base URL) to enable `ctx.telemetry()` queries.
- Access from expectations when configured: `ctx.telemetry().prometheus().map(|p| p.base_url())` - Access from expectations when configured: `ctx.telemetry().prometheus().map(|p| p.base_url())`
**Grafana (optional):** **Grafana (optional):**
- The framework does **not** deploy Grafana. - Runners do **not** provision Grafana automatically (but `scripts/setup-observability.sh` can).
- If you set `NOMOS_GRAFANA_URL`, the deployer prints it in `TESTNET_ENDPOINTS`. - If you set `NOMOS_GRAFANA_URL`, the deployer prints it in `TESTNET_ENDPOINTS`.
- Dashboards live in `testing-framework/assets/stack/monitoring/grafana/dashboards/` for import into your Grafana. - Dashboards live in `testing-framework/assets/stack/monitoring/grafana/dashboards/` for import into your Grafana.

View File

@ -163,7 +163,7 @@ pub fn step_5_run_duration() -> testing_framework_core::scenario::Builder<()> {
} }
``` ```
Run for 60 seconds (~27 blocks with default 2s slots, 0.9 coefficient). Framework ensures this is at least 2× the consensus slot duration. Run for 60 seconds (~27 blocks with default 2s slots, 0.9 coefficient). Framework ensures this is at least 2× the consensus slot duration. Adjust consensus timing via `CONSENSUS_SLOT_TIME` and `CONSENSUS_ACTIVE_SLOT_COEFF`.
### 6. Deploy and Execute ### 6. Deploy and Execute
@ -239,7 +239,18 @@ POL_PROOF_DEV_MODE=true \
cargo run -p runner-examples --bin compose_runner cargo run -p runner-examples --bin compose_runner
``` ```
**Benefit:** Reproducible containerized environment with Prometheus at `http://localhost:9090`. **Benefit:** Reproducible containerized environment (Dockerized nodes, repeatable deployments).
**Optional: Prometheus + Grafana**
The runner can integrate with external observability endpoints. For a ready-to-run local stack:
```bash
scripts/setup-observability.sh compose up
eval "$(scripts/setup-observability.sh compose env)"
```
Then run your compose scenario as usual (the environment variables enable PromQL querying and node OTLP metrics export).
**Note:** Compose expects KZG parameters at `/kzgrs_test_params/kzgrs_test_params` inside containers (the directory name is repeated as the filename). **Note:** Compose expects KZG parameters at `/kzgrs_test_params/kzgrs_test_params` inside containers (the directory name is repeated as the filename).

View File

@ -36,7 +36,7 @@ Reason in **blocks** and **consensus intervals**, not wall-clock seconds.
**Consensus defaults:** **Consensus defaults:**
- Slot duration: 2 seconds (NTP-synchronized, configurable via `CONSENSUS_SLOT_TIME`) - Slot duration: 2 seconds (NTP-synchronized, configurable via `CONSENSUS_SLOT_TIME`)
- Active slot coefficient: 0.9 (90% block probability per slot) - Active slot coefficient: 0.9 (90% block probability per slot, configurable via `CONSENSUS_ACTIVE_SLOT_COEFF`)
- Expected rate: ~27 blocks per minute - Expected rate: ~27 blocks per minute
```rust ```rust
@ -159,7 +159,7 @@ pub fn minimum_run_windows() {
**Note:** Block counts assume default consensus parameters: **Note:** Block counts assume default consensus parameters:
- Slot duration: 2 seconds (configurable via `CONSENSUS_SLOT_TIME`) - Slot duration: 2 seconds (configurable via `CONSENSUS_SLOT_TIME`)
- Active slot coefficient: 0.9 (90% block probability per slot) - Active slot coefficient: 0.9 (90% block probability per slot, configurable via `CONSENSUS_ACTIVE_SLOT_COEFF`)
- Formula: `blocks ≈ (duration / slot_duration) × active_slot_coeff` - Formula: `blocks ≈ (duration / slot_duration) × active_slot_coeff`
If upstream changes these parameters, adjust your duration expectations accordingly. If upstream changes these parameters, adjust your duration expectations accordingly.

View File

@ -12,7 +12,7 @@ version = "0.1.0"
[dependencies] [dependencies]
anyhow = "1" anyhow = "1"
cucumber = { version = "0.22.0" } cucumber = { version = "0.22.0" }
cucumber_ext = { path = "../testing-framework/cucumber_ext" } cucumber_ext = { path = "../testing-framework/cucumber" }
testing-framework-core = { workspace = true } testing-framework-core = { workspace = true }
testing-framework-runner-compose = { workspace = true } testing-framework-runner-compose = { workspace = true }
testing-framework-runner-k8s = { workspace = true } testing-framework-runner-k8s = { workspace = true }

View File

@ -1,4 +1,9 @@
use std::{env, process, time::Duration}; use std::{
env, fs,
path::{Path, PathBuf},
process,
time::Duration,
};
use anyhow::{Context as _, Result}; use anyhow::{Context as _, Result};
use runner_examples::{ChaosBuilderExt as _, ScenarioBuilderExt as _, read_env_any}; use runner_examples::{ChaosBuilderExt as _, ScenarioBuilderExt as _, read_env_any};
@ -25,6 +30,8 @@ const DA_BLOB_RATE: u64 = 1;
#[tokio::main] #[tokio::main]
async fn main() { async fn main() {
init_node_log_dir_defaults();
// Compose containers mount KZG params at /kzgrs_test_params; ensure the // Compose containers mount KZG params at /kzgrs_test_params; ensure the
// generated configs point there unless the caller overrides explicitly. // generated configs point there unless the caller overrides explicitly.
if env::var("NOMOS_KZGRS_PARAMS_PATH").is_err() { if env::var("NOMOS_KZGRS_PARAMS_PATH").is_err() {
@ -57,6 +64,35 @@ async fn main() {
} }
} }
fn init_node_log_dir_defaults() {
if env::var_os("NOMOS_LOG_DIR").is_some() {
return;
}
let repo_root = repo_root();
let host_dir = repo_root.join("tmp").join("node-logs");
let _ = fs::create_dir_all(&host_dir);
// In compose mode, node processes run inside containers; configs should
// point to the container path, while the compose deployer mounts the host
// repo's `tmp/node-logs` there.
unsafe {
env::set_var("NOMOS_LOG_DIR", "/tmp/node-logs");
}
}
fn repo_root() -> PathBuf {
env::var("CARGO_WORKSPACE_DIR")
.map(PathBuf::from)
.ok()
.or_else(|| {
Path::new(env!("CARGO_MANIFEST_DIR"))
.parent()
.map(Path::to_path_buf)
})
.expect("repo root must be discoverable from CARGO_WORKSPACE_DIR or CARGO_MANIFEST_DIR")
}
async fn run_compose_case( async fn run_compose_case(
validators: usize, validators: usize,
executors: usize, executors: usize,

View File

@ -1,8 +1,11 @@
use runner_examples::cucumber::{Mode, init_logging_defaults, init_tracing, run}; use runner_examples::cucumber::{
Mode, init_logging_defaults, init_node_log_dir_defaults, init_tracing, run,
};
#[tokio::main(flavor = "current_thread")] #[tokio::main(flavor = "current_thread")]
async fn main() { async fn main() {
init_logging_defaults(); init_logging_defaults();
init_node_log_dir_defaults(Mode::Compose);
init_tracing(); init_tracing();
run(Mode::Compose).await; run(Mode::Compose).await;

View File

@ -1,8 +1,11 @@
use runner_examples::cucumber::{Mode, init_logging_defaults, init_tracing, run}; use runner_examples::cucumber::{
Mode, init_logging_defaults, init_node_log_dir_defaults, init_tracing, run,
};
#[tokio::main(flavor = "current_thread")] #[tokio::main(flavor = "current_thread")]
async fn main() { async fn main() {
init_logging_defaults(); init_logging_defaults();
init_node_log_dir_defaults(Mode::Host);
init_tracing(); init_tracing();
run(Mode::Host).await; run(Mode::Host).await;

View File

@ -1,4 +1,9 @@
use std::{env, process, time::Duration}; use std::{
env, fs,
path::{Path, PathBuf},
process,
time::Duration,
};
use anyhow::{Context as _, Result}; use anyhow::{Context as _, Result};
use runner_examples::{ScenarioBuilderExt as _, read_env_any}; use runner_examples::{ScenarioBuilderExt as _, read_env_any};
@ -12,11 +17,13 @@ const DEFAULT_RUN_SECS: u64 = 60;
const MIXED_TXS_PER_BLOCK: u64 = 5; const MIXED_TXS_PER_BLOCK: u64 = 5;
const TOTAL_WALLETS: usize = 1000; const TOTAL_WALLETS: usize = 1000;
const TRANSACTION_WALLETS: usize = 500; const TRANSACTION_WALLETS: usize = 500;
const DA_BLOB_RATE: u64 = 1; const DA_BLOB_RATE: u64 = 3;
const SMOKE_RUN_SECS_MAX: u64 = 30; const SMOKE_RUN_SECS_MAX: u64 = 30;
#[tokio::main] #[tokio::main]
async fn main() { async fn main() {
init_node_log_dir_defaults();
tracing_subscriber::fmt::init(); tracing_subscriber::fmt::init();
if env::var("POL_PROOF_DEV_MODE").is_err() { if env::var("POL_PROOF_DEV_MODE").is_err() {
@ -39,6 +46,30 @@ async fn main() {
} }
} }
fn init_node_log_dir_defaults() {
if env::var_os("NOMOS_LOG_DIR").is_some() {
return;
}
let host_dir = repo_root().join("tmp").join("node-logs");
let _ = fs::create_dir_all(&host_dir);
unsafe {
env::set_var("NOMOS_LOG_DIR", host_dir);
}
}
fn repo_root() -> PathBuf {
env::var("CARGO_WORKSPACE_DIR")
.map(PathBuf::from)
.ok()
.or_else(|| {
Path::new(env!("CARGO_MANIFEST_DIR"))
.parent()
.map(Path::to_path_buf)
})
.expect("repo root must be discoverable from CARGO_WORKSPACE_DIR or CARGO_MANIFEST_DIR")
}
async fn run_local_case(validators: usize, executors: usize, run_duration: Duration) -> Result<()> { async fn run_local_case(validators: usize, executors: usize, run_duration: Duration) -> Result<()> {
info!( info!(
validators, validators,

View File

@ -1,3 +1,8 @@
use std::{
env, fs,
path::{Path, PathBuf},
};
use cucumber::World; use cucumber::World;
use cucumber_ext::TestingFrameworkWorld; use cucumber_ext::TestingFrameworkWorld;
use tracing_subscriber::{EnvFilter, fmt}; use tracing_subscriber::{EnvFilter, fmt};
@ -31,11 +36,36 @@ fn is_compose(
pub fn init_logging_defaults() { pub fn init_logging_defaults() {
set_default_env("POL_PROOF_DEV_MODE", "true"); set_default_env("POL_PROOF_DEV_MODE", "true");
set_default_env("NOMOS_TESTS_KEEP_LOGS", "1"); set_default_env("NOMOS_TESTS_KEEP_LOGS", "1");
set_default_env("NOMOS_LOG_DIR", ".tmp/cucumber-logs");
set_default_env("NOMOS_LOG_LEVEL", "info"); set_default_env("NOMOS_LOG_LEVEL", "info");
set_default_env("RUST_LOG", "info"); set_default_env("RUST_LOG", "info");
} }
pub fn init_node_log_dir_defaults(mode: Mode) {
if env::var_os("NOMOS_LOG_DIR").is_some() {
return;
}
let host_dir = repo_root().join("tmp").join("node-logs");
let _ = fs::create_dir_all(&host_dir);
match mode {
Mode::Host => set_default_env("NOMOS_LOG_DIR", &host_dir.display().to_string()),
Mode::Compose => set_default_env("NOMOS_LOG_DIR", "/tmp/node-logs"),
}
}
fn repo_root() -> PathBuf {
env::var("CARGO_WORKSPACE_DIR")
.map(PathBuf::from)
.ok()
.or_else(|| {
Path::new(env!("CARGO_MANIFEST_DIR"))
.parent()
.map(Path::to_path_buf)
})
.expect("repo root must be discoverable from CARGO_WORKSPACE_DIR or CARGO_MANIFEST_DIR")
}
pub fn init_tracing() { pub fn init_tracing() {
let filter = EnvFilter::try_from_default_env().unwrap_or_else(|_| EnvFilter::new("info")); let filter = EnvFilter::try_from_default_env().unwrap_or_else(|_| EnvFilter::new("info"));
let _ = fmt().with_env_filter(filter).with_target(true).try_init(); let _ = fmt().with_env_filter(filter).with_target(true).try_init();

View File

@ -196,6 +196,24 @@ build_bundle::clean_cargo_linux_cache() {
rm -rf "${ROOT_DIR}/.tmp/cargo-linux/registry" "${ROOT_DIR}/.tmp/cargo-linux/git" rm -rf "${ROOT_DIR}/.tmp/cargo-linux/registry" "${ROOT_DIR}/.tmp/cargo-linux/git"
} }
build_bundle::docker_platform_suffix() {
  # Map a docker platform string (e.g. linux/amd64) to a filesystem-safe suffix
  # used for arch-specific target dirs, to avoid mixing build artifacts between
  # different container architectures. Empty or bare "linux" inputs map to "".
  local raw="${1:-}"
  raw="${raw#linux/}"
  raw="${raw//\//-}"
  case "${raw}" in
    "" | linux)
      echo ""
      ;;
    *)
      echo "-${raw}"
      ;;
  esac
  return 0
}
build_bundle::maybe_run_linux_build_in_docker() { build_bundle::maybe_run_linux_build_in_docker() {
# With `set -e`, this function must return 0 when no Docker cross-build is needed. # With `set -e`, this function must return 0 when no Docker cross-build is needed.
if [ "${PLATFORM}" != "linux" ] || [ "$(uname -s)" = "Linux" ] || [ -n "${BUNDLE_IN_CONTAINER:-}" ]; then if [ "${PLATFORM}" != "linux" ] || [ "$(uname -s)" = "Linux" ] || [ -n "${BUNDLE_IN_CONTAINER:-}" ]; then
@ -224,7 +242,10 @@ build_bundle::maybe_run_linux_build_in_docker() {
echo "==> Building Linux bundle inside Docker" echo "==> Building Linux bundle inside Docker"
local container_output="/workspace${OUTPUT#"${ROOT_DIR}"}" local container_output="/workspace${OUTPUT#"${ROOT_DIR}"}"
mkdir -p "${ROOT_DIR}/.tmp/cargo-linux" "${ROOT_DIR}/.tmp/nomos-node-linux-target" local target_suffix
target_suffix="$(build_bundle::docker_platform_suffix "${DOCKER_PLATFORM}")"
local host_target_dir="${ROOT_DIR}/.tmp/nomos-node-linux-target${target_suffix}"
mkdir -p "${ROOT_DIR}/.tmp/cargo-linux" "${host_target_dir}"
local -a features_args=() local -a features_args=()
if [ -n "${NOMOS_EXTRA_FEATURES:-}" ]; then if [ -n "${NOMOS_EXTRA_FEATURES:-}" ]; then
@ -242,15 +263,16 @@ build_bundle::maybe_run_linux_build_in_docker() {
-e VERSION="${VERSION}" \ -e VERSION="${VERSION}" \
-e NOMOS_NODE_REV="${NOMOS_NODE_REV}" \ -e NOMOS_NODE_REV="${NOMOS_NODE_REV}" \
-e NOMOS_NODE_PATH="${node_path_env}" \ -e NOMOS_NODE_PATH="${node_path_env}" \
-e NOMOS_BUNDLE_DOCKER_PLATFORM="${DOCKER_PLATFORM}" \
-e NOMOS_CIRCUITS="/workspace/.tmp/nomos-circuits-linux" \ -e NOMOS_CIRCUITS="/workspace/.tmp/nomos-circuits-linux" \
-e STACK_DIR="/workspace/.tmp/nomos-circuits-linux" \ -e STACK_DIR="/workspace/.tmp/nomos-circuits-linux" \
-e HOST_DIR="/workspace/.tmp/nomos-circuits-linux" \ -e HOST_DIR="/workspace/.tmp/nomos-circuits-linux" \
-e NOMOS_EXTRA_FEATURES="${NOMOS_EXTRA_FEATURES:-}" \ -e NOMOS_EXTRA_FEATURES="${NOMOS_EXTRA_FEATURES:-}" \
-e BUNDLE_IN_CONTAINER=1 \ -e BUNDLE_IN_CONTAINER=1 \
-e CARGO_HOME=/workspace/.tmp/cargo-linux \ -e CARGO_HOME=/workspace/.tmp/cargo-linux \
-e CARGO_TARGET_DIR=/workspace/.tmp/nomos-node-linux-target \ -e CARGO_TARGET_DIR="/workspace/.tmp/nomos-node-linux-target${target_suffix}" \
-v "${ROOT_DIR}/.tmp/cargo-linux":/workspace/.tmp/cargo-linux \ -v "${ROOT_DIR}/.tmp/cargo-linux":/workspace/.tmp/cargo-linux \
-v "${ROOT_DIR}/.tmp/nomos-node-linux-target":/workspace/.tmp/nomos-node-linux-target \ -v "${host_target_dir}:/workspace/.tmp/nomos-node-linux-target${target_suffix}" \
-v "${ROOT_DIR}:/workspace" \ -v "${ROOT_DIR}:/workspace" \
"${extra_mounts[@]}" \ "${extra_mounts[@]}" \
-w /workspace \ -w /workspace \
@ -267,7 +289,14 @@ build_bundle::prepare_circuits() {
NODE_TARGET="${ROOT_DIR}/.tmp/nomos-node-host-target" NODE_TARGET="${ROOT_DIR}/.tmp/nomos-node-host-target"
else else
CIRCUITS_DIR="${ROOT_DIR}/.tmp/nomos-circuits-linux" CIRCUITS_DIR="${ROOT_DIR}/.tmp/nomos-circuits-linux"
NODE_TARGET="${ROOT_DIR}/.tmp/nomos-node-linux-target" # When building Linux bundles in Docker, avoid reusing the same target dir
# across different container architectures (e.g. linux/arm64 vs linux/amd64),
# as the native-host `target/debug` layout would otherwise get mixed.
local target_suffix=""
if [ -n "${BUNDLE_IN_CONTAINER:-}" ]; then
target_suffix="$(build_bundle::docker_platform_suffix "${NOMOS_BUNDLE_DOCKER_PLATFORM:-}")"
fi
NODE_TARGET="${ROOT_DIR}/.tmp/nomos-node-linux-target${target_suffix}"
fi fi
NODE_SRC_DEFAULT="${ROOT_DIR}/.tmp/nomos-node-${PLATFORM}-src" NODE_SRC_DEFAULT="${ROOT_DIR}/.tmp/nomos-node-${PLATFORM}-src"

View File

@ -0,0 +1,38 @@
# Standalone observability stack for compose-based test runs: Prometheus
# (with OTLP ingest enabled) plus Grafana, provisioned from the shared
# testing-framework monitoring assets.
services:
  prometheus:
    image: prom/prometheus:v2.53.0
    command:
      - --config.file=/etc/prometheus/prometheus.yml
      - --storage.tsdb.path=/prometheus
      # Exposes OTLP HTTP ingest at /api/v1/otlp/v1/metrics
      - --enable-feature=otlp-write-receiver
      # Allow runtime config reload and admin endpoints (e.g. TSDB cleanup).
      - --web.enable-lifecycle
      - --web.enable-admin-api
    volumes:
      - ./prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro
      - prometheus-data:/prometheus
    ports:
      - "9090:9090"
  grafana:
    image: grafana/grafana:11.4.0
    depends_on:
      - prometheus
    # Plugin list shared with the k8s monitoring assets.
    env_file:
      - ../../../testing-framework/assets/stack/monitoring/grafana/plugins.env
    environment:
      # Local-only credentials; not intended for anything beyond test stacks.
      GF_SECURITY_ADMIN_USER: admin
      GF_SECURITY_ADMIN_PASSWORD: admin
      GF_USERS_ALLOW_SIGN_UP: "false"
    volumes:
      - grafana-data:/var/lib/grafana
      - ../../../testing-framework/assets/stack/monitoring/grafana/grafana.ini:/etc/grafana/grafana.ini:ro
      - ../../../testing-framework/assets/stack/monitoring/grafana/datasources.yaml:/etc/grafana/provisioning/datasources/datasources.yaml:ro
      - ../../../testing-framework/assets/stack/monitoring/grafana/dashboards.yml:/etc/grafana/provisioning/dashboards/dashboards.yml:ro
      - ../../../testing-framework/assets/stack/monitoring/grafana/dashboards:/var/lib/grafana/dashboards:ro
    ports:
      - "3000:3000"
volumes:
  prometheus-data: {}
  grafana-data: {}

View File

@ -0,0 +1,10 @@
# Minimal Prometheus configuration for the local observability stack.
global:
  scrape_interval: 15s
  evaluation_interval: 15s
  external_labels:
    monitor: "NomosTesting"
# Only Prometheus itself is scraped; node metrics are pushed via the OTLP
# receiver enabled on the server (--enable-feature=otlp-write-receiver).
scrape_configs:
  - job_name: prometheus
    static_configs:
      - targets: ["prometheus:9090"]

View File

@ -0,0 +1,19 @@
# Helm values for prometheus-community/kube-prometheus-stack as installed by
# scripts/setup-observability.sh (namespace "nomos-observability").
prometheus:
  prometheusSpec:
    # Accept pushed OTLP metrics from the nodes under test.
    enableOTLPReceiver: true
    additionalArgs:
      - name: web.enable-admin-api
    # Basic OTLP → Prometheus translation defaults are fine for most setups.
    # See: https://prometheus.io/docs/guides/opentelemetry/
    otlp: {}
grafana:
  # Local-only credentials; not intended for anything beyond test stacks.
  adminUser: admin
  adminPassword: admin
  sidecar:
    # Pick up dashboards from ConfigMaps labeled grafana_dashboard=1
    # (created by `setup-observability.sh k8s dashboards`).
    dashboards:
      enabled: true
      label: grafana_dashboard
      labelValue: "1"
    datasources:
      enabled: true

View File

@ -23,6 +23,15 @@ readonly DEFAULT_PRIVATE_AWS_REGION="ap-southeast-2"
readonly DEFAULT_PULL_POLICY_LOCAL="IfNotPresent" readonly DEFAULT_PULL_POLICY_LOCAL="IfNotPresent"
readonly DEFAULT_PULL_POLICY_ECR="Always" readonly DEFAULT_PULL_POLICY_ECR="Always"
readonly DOCKER_DESKTOP_CONTEXT="docker-desktop" readonly DOCKER_DESKTOP_CONTEXT="docker-desktop"
readonly DEFAULT_K8S_ECR_SKIP_IMAGE_BUILD="1"
run_examples::cleanup() {
rm -f "${SETUP_OUT:-}" 2>/dev/null || true
}
# Avoid inheriting environment-provided EXIT traps (e.g., from BASH_ENV) that can
# reference missing functions and fail at script termination.
trap run_examples::cleanup EXIT
run_examples::usage() { run_examples::usage() {
cat <<EOF cat <<EOF
@ -40,7 +49,6 @@ Options:
--bundle PATH Convenience alias for setting NOMOS_BINARIES_TAR=PATH --bundle PATH Convenience alias for setting NOMOS_BINARIES_TAR=PATH
--metrics-query-url URL PromQL base URL the runner process can query (optional) --metrics-query-url URL PromQL base URL the runner process can query (optional)
--metrics-otlp-ingest-url URL Full OTLP HTTP ingest URL for node metrics export (optional) --metrics-otlp-ingest-url URL Full OTLP HTTP ingest URL for node metrics export (optional)
--grafana-url URL Grafana base URL for printing/logging (optional)
--external-prometheus URL Alias for --metrics-query-url --external-prometheus URL Alias for --metrics-query-url
--external-otlp-metrics-endpoint URL Alias for --metrics-otlp-ingest-url --external-otlp-metrics-endpoint URL Alias for --metrics-otlp-ingest-url
--local Use a local Docker image tag (default for docker-desktop k8s) --local Use a local Docker image tag (default for docker-desktop k8s)
@ -48,6 +56,8 @@ Options:
Environment: Environment:
VERSION Circuits version (default from versions.env) VERSION Circuits version (default from versions.env)
CONSENSUS_SLOT_TIME Consensus slot duration in seconds (default 2)
CONSENSUS_ACTIVE_SLOT_COEFF Probability a slot is active (default 0.9); expected block interval ≈ slot_time / coeff
NOMOS_TESTNET_IMAGE Image reference (overridden by --local/--ecr selection) NOMOS_TESTNET_IMAGE Image reference (overridden by --local/--ecr selection)
ECR_IMAGE Full image reference for --ecr (overrides ECR_REGISTRY/ECR_REPO/TAG) ECR_IMAGE Full image reference for --ecr (overrides ECR_REGISTRY/ECR_REPO/TAG)
ECR_REGISTRY Registry hostname for --ecr (default ${DEFAULT_PUBLIC_ECR_REGISTRY}) ECR_REGISTRY Registry hostname for --ecr (default ${DEFAULT_PUBLIC_ECR_REGISTRY})
@ -56,9 +66,16 @@ Environment:
NOMOS_TESTNET_IMAGE_PULL_POLICY K8s imagePullPolicy (default ${DEFAULT_PULL_POLICY_LOCAL}; set to ${DEFAULT_PULL_POLICY_ECR} for --ecr) NOMOS_TESTNET_IMAGE_PULL_POLICY K8s imagePullPolicy (default ${DEFAULT_PULL_POLICY_LOCAL}; set to ${DEFAULT_PULL_POLICY_ECR} for --ecr)
NOMOS_BINARIES_TAR Path to prebuilt binaries/circuits tarball (default .tmp/nomos-binaries-<platform>-<version>.tar.gz) NOMOS_BINARIES_TAR Path to prebuilt binaries/circuits tarball (default .tmp/nomos-binaries-<platform>-<version>.tar.gz)
NOMOS_SKIP_IMAGE_BUILD Set to 1 to skip rebuilding the compose/k8s image NOMOS_SKIP_IMAGE_BUILD Set to 1 to skip rebuilding the compose/k8s image
NOMOS_FORCE_IMAGE_BUILD Set to 1 to force image rebuild even for k8s ECR mode
NOMOS_METRICS_QUERY_URL PromQL base URL for the runner process (optional) NOMOS_METRICS_QUERY_URL PromQL base URL for the runner process (optional)
NOMOS_METRICS_OTLP_INGEST_URL Full OTLP HTTP ingest URL for node metrics export (optional) NOMOS_METRICS_OTLP_INGEST_URL Full OTLP HTTP ingest URL for node metrics export (optional)
NOMOS_GRAFANA_URL Grafana base URL for printing/logging (optional) NOMOS_GRAFANA_URL Grafana base URL for printing/logging (optional)
Notes:
- For k8s runs on non-docker-desktop clusters (e.g. EKS), a locally built Docker image is not
visible to the cluster. By default, this script skips local image rebuilds in that case.
If you need a custom image, run scripts/build_test_image.sh and push it to a registry the
cluster can pull from, then set NOMOS_TESTNET_IMAGE accordingly.
EOF EOF
} }
@ -104,7 +121,6 @@ run_examples::parse_args() {
IMAGE_SELECTION_MODE="auto" IMAGE_SELECTION_MODE="auto"
METRICS_QUERY_URL="" METRICS_QUERY_URL=""
METRICS_OTLP_INGEST_URL="" METRICS_OTLP_INGEST_URL=""
GRAFANA_URL=""
RUN_SECS_RAW_SPECIFIED="" RUN_SECS_RAW_SPECIFIED=""
@ -166,14 +182,6 @@ run_examples::parse_args() {
METRICS_OTLP_INGEST_URL="${1#*=}" METRICS_OTLP_INGEST_URL="${1#*=}"
shift shift
;; ;;
--grafana-url)
GRAFANA_URL="${2:-}"
shift 2
;;
--grafana-url=*)
GRAFANA_URL="${1#*=}"
shift
;;
--external-prometheus) --external-prometheus)
METRICS_QUERY_URL="${2:-}" METRICS_QUERY_URL="${2:-}"
shift 2 shift 2
@ -279,12 +287,20 @@ run_examples::select_image() {
run_examples::fail_with_usage "Unknown image selection mode: ${selection}" run_examples::fail_with_usage "Unknown image selection mode: ${selection}"
fi fi
export NOMOS_IMAGE_SELECTION="${selection}"
export IMAGE_TAG="${IMAGE}" export IMAGE_TAG="${IMAGE}"
export NOMOS_TESTNET_IMAGE="${IMAGE}" export NOMOS_TESTNET_IMAGE="${IMAGE}"
if [ "${MODE}" = "k8s" ]; then if [ "${MODE}" = "k8s" ]; then
if [ "${selection}" = "ecr" ]; then if [ "${selection}" = "ecr" ]; then
export NOMOS_KZG_MODE="${NOMOS_KZG_MODE:-inImage}" export NOMOS_KZG_MODE="${NOMOS_KZG_MODE:-inImage}"
# A locally built Docker image isn't visible to remote clusters (e.g. EKS). Default to
# skipping the local rebuild, unless the user explicitly set NOMOS_SKIP_IMAGE_BUILD or
# overrides via NOMOS_FORCE_IMAGE_BUILD=1.
if [ "${NOMOS_FORCE_IMAGE_BUILD:-0}" != "1" ]; then
NOMOS_SKIP_IMAGE_BUILD="${NOMOS_SKIP_IMAGE_BUILD:-${DEFAULT_K8S_ECR_SKIP_IMAGE_BUILD}}"
export NOMOS_SKIP_IMAGE_BUILD
fi
else else
export NOMOS_KZG_MODE="${NOMOS_KZG_MODE:-hostPath}" export NOMOS_KZG_MODE="${NOMOS_KZG_MODE:-hostPath}"
fi fi
@ -548,9 +564,6 @@ run_examples::run() {
if [ -n "${METRICS_OTLP_INGEST_URL}" ]; then if [ -n "${METRICS_OTLP_INGEST_URL}" ]; then
export NOMOS_METRICS_OTLP_INGEST_URL="${METRICS_OTLP_INGEST_URL}" export NOMOS_METRICS_OTLP_INGEST_URL="${METRICS_OTLP_INGEST_URL}"
fi fi
if [ -n "${GRAFANA_URL}" ]; then
export NOMOS_GRAFANA_URL="${GRAFANA_URL}"
fi
echo "==> Running ${BIN} for ${RUN_SECS}s (mode=${MODE}, image=${IMAGE})" echo "==> Running ${BIN} for ${RUN_SECS}s (mode=${MODE}, image=${IMAGE})"
cd "${ROOT_DIR}" cd "${ROOT_DIR}"
@ -576,8 +589,6 @@ run_examples::main() {
echo "==> Using restored circuits/binaries bundle" echo "==> Using restored circuits/binaries bundle"
SETUP_OUT="$(common::tmpfile nomos-setup-output.XXXXXX)" SETUP_OUT="$(common::tmpfile nomos-setup-output.XXXXXX)"
cleanup() { rm -f "${SETUP_OUT}" 2>/dev/null || true; }
trap cleanup EXIT
run_examples::maybe_rebuild_image run_examples::maybe_rebuild_image
run_examples::maybe_restore_host_after_image run_examples::maybe_restore_host_after_image

View File

@ -25,7 +25,6 @@ Options:
--force-k8s-image-build Allow the k8s "rebuild image" run even on non-docker-desktop clusters --force-k8s-image-build Allow the k8s "rebuild image" run even on non-docker-desktop clusters
--metrics-query-url URL Forwarded to scripts/run-examples.sh (optional) --metrics-query-url URL Forwarded to scripts/run-examples.sh (optional)
--metrics-otlp-ingest-url URL Forwarded to scripts/run-examples.sh (optional) --metrics-otlp-ingest-url URL Forwarded to scripts/run-examples.sh (optional)
--grafana-url URL Forwarded to scripts/run-examples.sh (optional)
-h, --help Show this help -h, --help Show this help
Notes: Notes:
@ -51,7 +50,6 @@ matrix::parse_args() {
FORCE_K8S_IMAGE_BUILD=0 FORCE_K8S_IMAGE_BUILD=0
METRICS_QUERY_URL="" METRICS_QUERY_URL=""
METRICS_OTLP_INGEST_URL="" METRICS_OTLP_INGEST_URL=""
GRAFANA_URL=""
while [ "$#" -gt 0 ]; do while [ "$#" -gt 0 ]; do
case "$1" in case "$1" in
@ -71,8 +69,6 @@ matrix::parse_args() {
--metrics-query-url=*) METRICS_QUERY_URL="${1#*=}"; shift ;; --metrics-query-url=*) METRICS_QUERY_URL="${1#*=}"; shift ;;
--metrics-otlp-ingest-url) METRICS_OTLP_INGEST_URL="${2:-}"; shift 2 ;; --metrics-otlp-ingest-url) METRICS_OTLP_INGEST_URL="${2:-}"; shift 2 ;;
--metrics-otlp-ingest-url=*) METRICS_OTLP_INGEST_URL="${1#*=}"; shift ;; --metrics-otlp-ingest-url=*) METRICS_OTLP_INGEST_URL="${1#*=}"; shift ;;
--grafana-url) GRAFANA_URL="${2:-}"; shift 2 ;;
--grafana-url=*) GRAFANA_URL="${1#*=}"; shift ;;
*) matrix::die "Unknown argument: $1" ;; *) matrix::die "Unknown argument: $1" ;;
esac esac
done done
@ -104,9 +100,6 @@ matrix::forwarded_args() {
if [ -n "${METRICS_OTLP_INGEST_URL}" ]; then if [ -n "${METRICS_OTLP_INGEST_URL}" ]; then
args+=(--metrics-otlp-ingest-url "${METRICS_OTLP_INGEST_URL}") args+=(--metrics-otlp-ingest-url "${METRICS_OTLP_INGEST_URL}")
fi fi
if [ -n "${GRAFANA_URL}" ]; then
args+=(--grafana-url "${GRAFANA_URL}")
fi
printf '%s\0' "${args[@]}" printf '%s\0' "${args[@]}"
} }
@ -148,6 +141,7 @@ matrix::k8s_context() {
matrix::main() { matrix::main() {
ROOT_DIR="$(common::repo_root)" ROOT_DIR="$(common::repo_root)"
export ROOT_DIR export ROOT_DIR
export RUST_LOG="${RUST_LOG:-info}"
matrix::parse_args "$@" matrix::parse_args "$@"
matrix::split_modes matrix::split_modes
@ -211,11 +205,17 @@ matrix::main() {
fi fi
if [ "${ctx}" = "docker-desktop" ] || [ "${FORCE_K8S_IMAGE_BUILD}" -eq 1 ]; then if [ "${ctx}" = "docker-desktop" ] || [ "${FORCE_K8S_IMAGE_BUILD}" -eq 1 ]; then
# On non-docker-desktop clusters, run-examples.sh defaults to skipping local image builds
# since the cluster can't see them. Honor the matrix "force" option by overriding.
if [ "${ctx}" != "docker-desktop" ] && [ "${FORCE_K8S_IMAGE_BUILD}" -eq 1 ]; then
export NOMOS_FORCE_IMAGE_BUILD=1
fi
matrix::run_case "k8s.image_build" \ matrix::run_case "k8s.image_build" \
"${ROOT_DIR}/scripts/run-examples.sh" \ "${ROOT_DIR}/scripts/run-examples.sh" \
-t "${RUN_SECS}" -v "${VALIDATORS}" -e "${EXECUTORS}" \ -t "${RUN_SECS}" -v "${VALIDATORS}" -e "${EXECUTORS}" \
"${forward[@]}" \ "${forward[@]}" \
k8s k8s
unset NOMOS_FORCE_IMAGE_BUILD || true
else else
echo "==> [k8s] Detected context '${ctx}'; skipping image-build variant (use --force-k8s-image-build to override)" echo "==> [k8s] Detected context '${ctx}'; skipping image-build variant (use --force-k8s-image-build to override)"
fi fi
@ -259,4 +259,3 @@ matrix::main() {
if [[ "${BASH_SOURCE[0]}" == "$0" ]]; then if [[ "${BASH_SOURCE[0]}" == "$0" ]]; then
matrix::main "$@" matrix::main "$@"
fi fi

168
scripts/setup-observability.sh Executable file
View File

@ -0,0 +1,168 @@
#!/usr/bin/env bash
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# shellcheck disable=SC1091
. "${SCRIPT_DIR}/common.sh"
common::ensure_bash "$@"
ROOT="$(common::repo_root)"
usage() {
  # Print CLI usage to stdout. The quoted heredoc delimiter ('USAGE')
  # prevents any expansion inside the help text.
  cat <<'USAGE'
Usage:
scripts/setup-observability.sh compose up|down|logs|env
scripts/setup-observability.sh k8s install|uninstall|dashboards|env
Compose:
- Runs Prometheus (+ OTLP receiver) and Grafana via docker compose.
- Prints NOMOS_METRICS_* / NOMOS_GRAFANA_URL exports to wire into runs.
Kubernetes:
- Installs prometheus-community/kube-prometheus-stack into namespace
"nomos-observability" and optionally loads Nomos Grafana dashboards.
- Prints port-forward commands + NOMOS_METRICS_* / NOMOS_GRAFANA_URL exports.
USAGE
}
require_cmd() {
  # Abort with a clear error when a required executable is absent from PATH.
  if ! command -v "$1" >/dev/null 2>&1; then
    common::die "Missing required command: $1"
  fi
}
compose_file() {
  # Absolute path of the observability docker-compose definition.
  echo "${ROOT}/scripts/observability/compose/docker-compose.yml"
}
compose_run() {
  # Run `docker compose` against the observability stack definition,
  # forwarding all arguments (up/down/logs/...). Dies if the file is missing.
  local compose_yml
  compose_yml="$(compose_file)"
  common::require_file "${compose_yml}"
  docker compose -f "${compose_yml}" "$@"
}
compose_env() {
  # Print shell exports wiring a test run to this stack. Quoted heredoc: the
  # URLs are emitted verbatim. NOTE(review): the OTLP URL uses
  # host.docker.internal — presumably because nodes push metrics from inside
  # containers; confirm against the compose deployer.
  cat <<'EOF'
export NOMOS_METRICS_QUERY_URL=http://localhost:9090
export NOMOS_METRICS_OTLP_INGEST_URL=http://host.docker.internal:9090/api/v1/otlp/v1/metrics
export NOMOS_GRAFANA_URL=http://localhost:3000
EOF
}
# Fixed names/paths for the kube-prometheus-stack deployment.
k8s_namespace() { echo "nomos-observability"; }
k8s_release() { echo "nomos-observability"; }
k8s_values() { echo "${ROOT}/scripts/observability/k8s/kube-prometheus-stack.values.yaml"; }
k8s_install() {
  # Install (or upgrade) kube-prometheus-stack into the observability
  # namespace, then wait best-effort for its workloads to come up.
  require_cmd kubectl
  require_cmd helm
  local ns release values
  ns="$(k8s_namespace)"
  release="$(k8s_release)"
  values="$(k8s_values)"
  common::require_file "${values}"
  # Idempotent namespace creation.
  kubectl get ns "${ns}" >/dev/null 2>&1 || kubectl create ns "${ns}"
  # Register the chart repo only once; `helm repo list` output is
  # whitespace-delimited, hence the anchored grep.
  if ! helm repo list | grep -q '^prometheus-community[[:space:]]'; then
    helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
  fi
  helm repo update prometheus-community
  helm upgrade --install "${release}" prometheus-community/kube-prometheus-stack \
    -n "${ns}" \
    -f "${values}"
  # Waits are advisory (|| true): components may label differently or still
  # be settling; callers can re-check with kubectl.
  kubectl -n "${ns}" wait --for=condition=Available deploy -l "release=${release}" --timeout=10m || true
  kubectl -n "${ns}" wait --for=condition=Ready pod -l "release=${release}" --timeout=10m || true
}
k8s_uninstall() {
  # Tear down the Helm release and its namespace; both steps tolerate the
  # resources already being gone.
  require_cmd kubectl
  require_cmd helm
  local ns release
  ns="$(k8s_namespace)"
  release="$(k8s_release)"
  helm uninstall "${release}" -n "${ns}" 2>/dev/null || true
  kubectl delete ns "${ns}" --ignore-not-found
}
k8s_apply_dashboards() {
  # Load each Nomos Grafana dashboard JSON into the observability namespace as
  # a ConfigMap labeled `grafana_dashboard=1`, which the Grafana sidecar
  # (enabled in the Helm values) discovers automatically.
  require_cmd kubectl
  local ns dash_dir
  ns="$(k8s_namespace)"
  dash_dir="${ROOT}/testing-framework/assets/stack/monitoring/grafana/dashboards"
  [ -d "${dash_dir}" ] || common::die "Missing dashboards directory: ${dash_dir}"
  local file base name
  for file in "${dash_dir}"/*.json; do
    # With default (non-nullglob) shell options an unmatched glob is passed
    # through literally; skip it instead of feeding kubectl a bogus path,
    # which would abort the script under `set -e`.
    [ -e "${file}" ] || continue
    base="$(basename "${file}" .json)"
    # ConfigMap names must be DNS-safe; replace anything else with '-'.
    name="nomos-dashboard-${base//[^a-zA-Z0-9-]/-}"
    # `--dry-run=client -o yaml | kubectl apply` makes the operation idempotent.
    kubectl -n "${ns}" create configmap "${name}" \
      --from-file="$(basename "${file}")=${file}" \
      --dry-run=client -o yaml | kubectl apply -f -
    kubectl -n "${ns}" label configmap "${name}" grafana_dashboard=1 --overwrite >/dev/null
  done
}
k8s_env() {
  # Print copy-pasteable port-forward commands and NOMOS_* exports for a
  # cluster install. The heredoc delimiter is intentionally unquoted so
  # ${ns}/${release} expand into the emitted text.
  local ns release
  ns="$(k8s_namespace)"
  release="$(k8s_release)"
  cat <<EOF
# Prometheus (runner-side): port-forward then set:
kubectl -n ${ns} port-forward svc/${release}-kube-p-prometheus 9090:9090
export NOMOS_METRICS_QUERY_URL=http://localhost:9090
# Grafana (runner-side): port-forward then set:
kubectl -n ${ns} port-forward svc/${release}-grafana 3000:80
export NOMOS_GRAFANA_URL=http://localhost:3000
# Prometheus OTLP ingest (node-side inside the cluster):
export NOMOS_METRICS_OTLP_INGEST_URL=http://${release}-kube-p-prometheus.${ns}:9090/api/v1/otlp/v1/metrics
EOF
}
main() {
  # Dispatch: the first argument selects the stack flavour (compose|k8s), the
  # second the action; anything unrecognized prints usage or dies.
  local target="${1:-}"
  local action="${2:-}"
  case "${target}" in
    compose)
      require_cmd docker
      case "${action}" in
        up) compose_run up -d ;;
        # `down -v` also removes the prometheus/grafana data volumes.
        down) compose_run down -v ;;
        logs) compose_run logs -f ;;
        env) compose_env ;;
        ""|help|-h|--help) usage ;;
        *) common::die "Unknown compose action: ${action}" ;;
      esac
      ;;
    k8s)
      case "${action}" in
        install) k8s_install ;;
        uninstall) k8s_uninstall ;;
        dashboards) k8s_apply_dashboards ;;
        env) k8s_env ;;
        ""|help|-h|--help) usage ;;
        *) common::die "Unknown k8s action: ${action}" ;;
      esac
      ;;
    ""|help|-h|--help)
      usage
      ;;
    *)
      common::die "Unknown target: ${target}"
      ;;
  esac
}
main "$@"

View File

@ -1,5 +1,7 @@
use std::{ use std::{
env,
num::{NonZero, NonZeroU64}, num::{NonZero, NonZeroU64},
str::FromStr as _,
sync::Arc, sync::Arc,
}; };
@ -35,8 +37,30 @@ pub struct ConsensusParams {
} }
impl ConsensusParams { impl ConsensusParams {
const DEFAULT_ACTIVE_SLOT_COEFF: f64 = 0.9;
const CONSENSUS_ACTIVE_SLOT_COEFF_VAR: &str = "CONSENSUS_ACTIVE_SLOT_COEFF";
#[must_use] #[must_use]
pub const fn default_for_participants(n_participants: usize) -> Self { pub fn default_for_participants(n_participants: usize) -> Self {
let active_slot_coeff = env::var(Self::CONSENSUS_ACTIVE_SLOT_COEFF_VAR)
.map(|s| {
f64::from_str(&s).unwrap_or_else(|err| {
panic!(
"invalid {}='{}' (expected a float in (0.0, 1.0]): {err}",
Self::CONSENSUS_ACTIVE_SLOT_COEFF_VAR,
s
)
})
})
.unwrap_or(Self::DEFAULT_ACTIVE_SLOT_COEFF);
assert!(
(0.0..=1.0).contains(&active_slot_coeff) && active_slot_coeff > 0.0,
"{} must be in (0.0, 1.0], got {}",
Self::CONSENSUS_ACTIVE_SLOT_COEFF_VAR,
active_slot_coeff
);
Self { Self {
n_participants, n_participants,
// by setting the slot coeff to 1, we also increase the probability of multiple blocks // by setting the slot coeff to 1, we also increase the probability of multiple blocks
@ -45,7 +69,7 @@ impl ConsensusParams {
// deciding on the longest chain. // deciding on the longest chain.
security_param: NonZero::new(10).unwrap(), security_param: NonZero::new(10).unwrap(),
// a block should be produced (on average) every slot // a block should be produced (on average) every slot
active_slot_coeff: 0.9, active_slot_coeff,
} }
} }
} }

View File

@ -27,6 +27,7 @@ pub fn default_time_config() -> GeneralTimeConfig {
let slot_duration = std::env::var(CONSENSUS_SLOT_TIME_VAR) let slot_duration = std::env::var(CONSENSUS_SLOT_TIME_VAR)
.map(|s| <u64>::from_str(&s).unwrap()) .map(|s| <u64>::from_str(&s).unwrap())
.unwrap_or(DEFAULT_SLOT_TIME); .unwrap_or(DEFAULT_SLOT_TIME);
GeneralTimeConfig { GeneralTimeConfig {
slot_duration: Duration::from_secs(slot_duration), slot_duration: Duration::from_secs(slot_duration),
chain_start_time: OffsetDateTime::now_utc(), chain_start_time: OffsetDateTime::now_utc(),

View File

@ -1,3 +1,8 @@
use std::{
env,
path::{Path, PathBuf},
};
use serde::Serialize; use serde::Serialize;
use testing_framework_core::{ use testing_framework_core::{
constants::{DEFAULT_CFGSYNC_PORT, kzg_container_path}, constants::{DEFAULT_CFGSYNC_PORT, kzg_container_path},
@ -150,9 +155,26 @@ fn base_volumes(use_kzg_mount: bool) -> Vec<String> {
if use_kzg_mount { if use_kzg_mount {
volumes.push("./kzgrs_test_params:/kzgrs_test_params:z".into()); volumes.push("./kzgrs_test_params:/kzgrs_test_params:z".into());
} }
if let Some(host_log_dir) = repo_root()
.map(|root| root.join("tmp").join("node-logs"))
.map(|dir| dir.display().to_string())
{
volumes.push(format!("{host_log_dir}:/tmp/node-logs"));
}
volumes volumes
} }
/// Best-effort repository root discovery.
///
/// `CARGO_WORKSPACE_DIR` wins when set; otherwise walk three levels up from
/// this crate's manifest directory (crate -> deployers -> testing-framework
/// -> repo root — see the workspace member layout).
fn repo_root() -> Option<PathBuf> {
    env::var("CARGO_WORKSPACE_DIR")
        .map(PathBuf::from)
        .ok()
        .or_else(|| {
            let manifest = Path::new(env!("CARGO_MANIFEST_DIR"));
            // `ancestors().nth(3)` == parent of parent of parent; yields None
            // if the path is too short, matching the original chain.
            manifest.ancestors().nth(3).map(Path::to_path_buf)
        })
}
fn default_extra_hosts() -> Vec<String> { fn default_extra_hosts() -> Vec<String> {
host_gateway_entry().into_iter().collect() host_gateway_entry().into_iter().collect()
} }

View File

@ -311,6 +311,7 @@ pub fn write_compose_artifacts(
let compose_path = workspace.root.join("compose.generated.yml"); let compose_path = workspace.root.join("compose.generated.yml");
write_compose_file(&descriptor, &compose_path) write_compose_file(&descriptor, &compose_path)
.map_err(|source| ConfigError::Template { source })?; .map_err(|source| ConfigError::Template { source })?;
debug!(compose_file = %compose_path.display(), "rendered compose file"); debug!(compose_file = %compose_path.display(), "rendered compose file");
Ok(compose_path) Ok(compose_path)
} }

View File

@ -302,6 +302,7 @@ impl<Caps> TransactionFlowBuilder<Caps> {
let workload = transaction::Workload::with_rate(self.rate.get()) let workload = transaction::Workload::with_rate(self.rate.get())
.expect("transaction rate must be non-zero") .expect("transaction rate must be non-zero")
.with_user_limit(self.users); .with_user_limit(self.users);
tracing::info!( tracing::info!(
rate = self.rate.get(), rate = self.rate.get(),
users = self.users.map(|u| u.get()), users = self.users.map(|u| u.get()),