diff --git a/.cargo-deny.toml b/.cargo-deny.toml index f4542dd..4cb03f9 100644 --- a/.cargo-deny.toml +++ b/.cargo-deny.toml @@ -46,6 +46,11 @@ expression = "MIT AND ISC" license-files = [{ hash = 0xbd0eed23, path = "LICENSE" }] name = "ring" +[[licenses.clarify]] +expression = "MIT" +license-files = [{ hash = 0xcb90f5db, path = "LICENSE" }] +name = "jsonpath-rust" + [sources] allow-git = ["https://github.com/EspressoSystems/jellyfish.git"] unknown-git = "deny" diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index f1c365f..d933e10 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -11,7 +11,7 @@ repos: - id: clippy args: ["--all", "--all-targets", "--all-features", "--", "-D", "warnings"] - repo: https://github.com/EmbarkStudios/cargo-deny - rev: 0.18.2 + rev: 0.18.9 hooks: - id: cargo-deny args: diff --git a/Cargo.toml b/Cargo.toml index 7ccb54a..c29b430 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -16,7 +16,7 @@ resolver = "2" [workspace.package] categories = [] -description = "Nomos testing framework workspace (split out from nomos-node)" +description = "Logos testing framework workspace (split out from nomos-node)" edition = "2024" keywords = ["framework", "nomos", "testing"] license = "MIT OR Apache-2.0" @@ -40,7 +40,7 @@ testing-framework-runner-k8s = { default-features = false, path = "testing-f testing-framework-runner-local = { default-features = false, path = "testing-framework/deployers/local" } testing-framework-workflows = { default-features = false, path = "testing-framework/workflows" } -# Nomos git dependencies (pinned to latest master) +# Logos git dependencies (pinned to latest master) broadcast-service = { default-features = false, git = "https://github.com/logos-co/nomos-node.git", rev = "1fce2dc3f482c16361316eb2a1b6ccd1206aa917" } cfgsync = { default-features = false, path = "testing-framework/tools/cfgsync" } chain-leader = { default-features = false, git = "https://github.com/logos-co/nomos-node.git", rev = 
"1fce2dc3f482c16361316eb2a1b6ccd1206aa917", features = [ diff --git a/README.md b/README.md index 82cfe32..3f8534c 100644 --- a/README.md +++ b/README.md @@ -150,7 +150,7 @@ This project is part of the Logos blockchain implementation. - **Documentation**: https://logos-blockchain.github.io/logos-blockchain-testing/ - **Logos Project**: https://github.com/logos-co -- **Nomos Node**: https://github.com/logos-co/nomos-node +- **Logos Node (repo: nomos-node)**: https://github.com/logos-co/nomos-node ## Support diff --git a/book/README.md b/book/README.md index 736450b..af21ddd 100644 --- a/book/README.md +++ b/book/README.md @@ -419,7 +419,7 @@ rg "github.com.*404" src/ rg "(TODO|FIXME|XXX)" src/ # Check for inconsistent terminology -rg "(Nomos node|nomos blockchain)" src/ # Should be "Logos" +rg "(Nomos node|nomos blockchain)" src/ # Should be "Logos node|Logos blockchain" ``` --- diff --git a/book/book.toml b/book/book.toml index cc919e8..bb2fe50 100644 --- a/book/book.toml +++ b/book/book.toml @@ -1,5 +1,5 @@ [book] -authors = ["Nomos Testing"] +authors = ["Logos Testing"] language = "en" src = "src" title = "Logos Blockchain Testing Framework Book" diff --git a/book/src/SUMMARY.md b/book/src/SUMMARY.md index 113c647..abdea2e 100644 --- a/book/src/SUMMARY.md +++ b/book/src/SUMMARY.md @@ -18,6 +18,7 @@ - [Usage Patterns](usage-patterns.md) - [Examples](examples.md) - [Advanced & Artificial Examples](examples-advanced.md) + - [Cucumber/BDD Interface](cucumber-bdd.md) - [Running Scenarios](running-scenarios.md) - [Runners](runners.md) - [RunContext: BlockFeed & Node Control](node-control.md) @@ -29,14 +30,14 @@ - [Extending the Framework](extending.md) - [Example: New Workload & Expectation (Rust)](custom-workload-example.md) - [Internal Crate Reference](internal-crate-reference.md) -- [Part V — Operations & Deployment](part-v.md) +- [Part IV — Operations & Deployment](part-iv.md) - [Overview](operations-overview.md) - [Prerequisites & Setup](prerequisites.md) - 
[Running Examples](running-examples.md) - [CI Integration](ci-integration.md) - [Environment Variables](environment-variables.md) - [Logging & Observability](logging-observability.md) -- [Part VI — Appendix](part-vi.md) +- [Part V — Appendix](part-v.md) - [Builder API Quick Reference](dsl-cheat-sheet.md) - [Troubleshooting Scenarios](troubleshooting.md) - [FAQ](faq.md) diff --git a/book/src/authoring-scenarios.md b/book/src/authoring-scenarios.md index 363e552..5db7240 100644 --- a/book/src/authoring-scenarios.md +++ b/book/src/authoring-scenarios.md @@ -1,20 +1,389 @@ # Authoring Scenarios -Creating a scenario is a declarative exercise: +Creating a scenario is a declarative exercise. This page walks you through the core authoring loop with concrete examples, explains the units and timing model, and shows how to structure scenarios in Rust test suites. -1. **Shape the topology**: decide how many validators and executors to run, and - what high-level network and data-availability characteristics matter for the - test. -2. **Attach workloads**: pick traffic generators that align with your goals - (transactions, data-availability blobs, or chaos for resilience probes). -3. **Define expectations**: specify the health signals that must hold when the - run finishes (e.g., consensus liveness, inclusion of submitted activity; see - [Core Content: Workloads & Expectations](workloads.md)). -4. **Set duration**: choose a run window long enough to observe meaningful - block progression and the effects of your workloads. -5. **Choose a runner**: target local processes for fast iteration, Docker - Compose for reproducible multi-node stacks, or Kubernetes for cluster-grade - validation. For environment considerations, see [Operations Overview](operations-overview.md). +--- -Keep scenarios small and explicit: make the intended behavior and the success -criteria clear so failures are easy to interpret and act upon. 
+## The Core Authoring Loop + +Every scenario follows the same pattern: + +```mermaid +flowchart LR + A[1. Topology] --> B[2. Workloads] + B --> C[3. Expectations] + C --> D[4. Duration] + D --> E[5. Deploy & Run] +``` + +1. **Shape the topology** — How many nodes, what roles, what network shape +2. **Attach workloads** — What traffic to generate (transactions, blobs, chaos) +3. **Define expectations** — What success looks like (liveness, inclusion, recovery) +4. **Set duration** — How long to run the experiment +5. **Choose a runner** — Where to execute (local, compose, k8s) + +--- + +## Hello Scenario: Your First Test + +Let's build a minimal consensus liveness test step-by-step. + +### Step 1: Shape the Topology + +```rust,ignore +use testing_framework_core::scenario::ScenarioBuilder; +use testing_framework_workflows::ScenarioBuilderExt; + +let scenario = ScenarioBuilder::topology_with(|t| { + t.network_star() // Star network (one gateway + nodes) + .validators(3) // 3 validator nodes + .executors(1) // 1 executor node +}) +``` + +**What goes in topology?** +- Node counts (validators, executors) +- Network shape (`network_star()` is currently the only built-in layout) +- Role split (validators vs. executors) + +**What does NOT go in topology?** +- Traffic rates (that's workloads) +- Success criteria (that's expectations) +- Runtime configuration (that's duration/runner) + +### Step 2: Attach Workloads + +```rust,ignore +.wallets(20) // Seed funded wallet accounts for transaction workloads +.transactions_with(|tx| { + tx.rate(10) // 10 transactions per block + .users(5) // distributed across 5 wallets +}) +``` + +**What goes in workloads?** +- Transaction traffic (rate, users) +- DA traffic (channels, blobs) +- Chaos injection (restarts, delays) + +**Units explained:** +- `.rate(10)` = **10 transactions per block** (not per second!) 
+- `.users(5)` = use 5 distinct wallet accounts +- The framework adapts to block time automatically + +### Step 3: Define Expectations + +```rust,ignore +.expect_consensus_liveness() +``` + +**What goes in expectations?** +- Health checks that run after the scenario completes +- Liveness (blocks produced) +- Inclusion (workload activity landed on-chain) +- Recovery (system survived chaos) + +**When do expectations run?** +After the duration window ends, during the **evaluation phase** of the scenario lifecycle. + +### Step 4: Set Duration + +```rust,ignore +use std::time::Duration; + +.with_run_duration(Duration::from_secs(60)) +``` + +**How long is enough?** +- Minimum: 2× the expected block time × number of blocks you want +- For consensus liveness: 30-60 seconds +- For transaction inclusion: 60-120 seconds +- For chaos recovery: 2-5 minutes + +**What happens during this window?** +- Nodes are running +- Workloads generate traffic +- Metrics/logs are collected +- BlockFeed broadcasts observations in real-time + +### Step 5: Build and Deploy + +```rust,ignore +.build(); + +// Choose a runner +use testing_framework_core::scenario::Deployer; +use testing_framework_runner_local::LocalDeployer; + +let deployer = LocalDeployer::default(); +let runner = deployer.deploy(&scenario).await?; +let _result = runner.run(&mut scenario).await?; +``` + +--- + +## Complete "Hello Scenario" + +Putting it all together: + +```rust,ignore +use std::time::Duration; + +use anyhow::Result; +use testing_framework_core::scenario::{Deployer, ScenarioBuilder}; +use testing_framework_runner_local::LocalDeployer; +use testing_framework_workflows::ScenarioBuilderExt; + +#[tokio::test] +async fn hello_consensus_liveness() -> Result<()> { + let mut scenario = ScenarioBuilder::topology_with(|t| { + t.network_star() + .validators(3) + .executors(1) + }) + .wallets(20) + .transactions_with(|tx| tx.rate(10).users(5)) + .expect_consensus_liveness() + .with_run_duration(Duration::from_secs(60)) + 
.build(); + + let deployer = LocalDeployer::default(); + let runner = deployer.deploy(&scenario).await?; + runner.run(&mut scenario).await?; + + Ok(()) +} +``` + +**Run it:** +```bash +POL_PROOF_DEV_MODE=true cargo test hello_consensus_liveness +``` + +--- + +## Understanding Units & Timing + +### Transaction Rate: Per-Block, Not Per-Second + +**Wrong mental model:** `.rate(10)` = 10 tx/second + +**Correct mental model:** `.rate(10)` = 10 tx/block + +**Why?** The blockchain produces blocks at variable rates depending on consensus timing. The framework submits the configured rate **per block** to ensure predictable load regardless of block time. + +**Example:** +- Block time = 2 seconds +- `.rate(10)` → 10 tx/block → 5 tx/second average +- Block time = 5 seconds +- `.rate(10)` → 10 tx/block → 2 tx/second average + +### Duration: Wall-Clock Time + +`.with_run_duration(Duration::from_secs(60))` means the scenario runs for **60 seconds of real time**, not 60 blocks. + +**How many blocks will be produced?** +Depends on consensus timing (slot time, active slot coefficient). Typical: 1-2 seconds per block. 
+ +**Rule of thumb:** +- 60 seconds → ~30-60 blocks +- 120 seconds → ~60-120 blocks + +--- + +## Structuring Scenarios in a Test Suite + +### Pattern 1: Integration Test Module + +```rust,ignore +// tests/integration_test.rs +use std::time::Duration; + +use anyhow::Result; +use testing_framework_core::scenario::{Deployer, ScenarioBuilder}; +use testing_framework_runner_local::LocalDeployer; +use testing_framework_workflows::ScenarioBuilderExt; + +#[tokio::test] +async fn test_consensus_liveness() -> Result<()> { + let mut scenario = ScenarioBuilder::topology_with(|t| { + t.network_star().validators(3).executors(1) + }) + .expect_consensus_liveness() + .with_run_duration(Duration::from_secs(30)) + .build(); + + let deployer = LocalDeployer::default(); + let runner = deployer.deploy(&scenario).await?; + runner.run(&mut scenario).await?; + Ok(()) +} + +#[tokio::test] +async fn test_transaction_inclusion() -> Result<()> { + let mut scenario = ScenarioBuilder::topology_with(|t| { + t.network_star().validators(2).executors(1) + }) + .wallets(10) + .transactions_with(|tx| tx.rate(5).users(5)) + .expect_consensus_liveness() + .with_run_duration(Duration::from_secs(60)) + .build(); + + let deployer = LocalDeployer::default(); + let runner = deployer.deploy(&scenario).await?; + runner.run(&mut scenario).await?; + Ok(()) +} +``` + +### Pattern 2: Shared Scenario Builders + +Extract common topology patterns: + +```rust,ignore +// tests/helpers.rs +use testing_framework_core::scenario::ScenarioBuilder; +use testing_framework_workflows::ScenarioBuilderExt; + +pub fn minimal_topology() -> ScenarioBuilder { + ScenarioBuilder::topology_with(|t| { + t.network_star().validators(2).executors(1) + }) +} + +pub fn production_like_topology() -> ScenarioBuilder { + ScenarioBuilder::topology_with(|t| { + t.network_star().validators(7).executors(3) + }) +} + +// tests/consensus_tests.rs +use std::time::Duration; + +use helpers::*; + +#[tokio::test] +async fn small_cluster_liveness() -> 
anyhow::Result<()> { + let mut scenario = minimal_topology() + .expect_consensus_liveness() + .with_run_duration(Duration::from_secs(30)) + .build(); + // ... deploy and run + Ok(()) +} + +#[tokio::test] +async fn large_cluster_liveness() -> anyhow::Result<()> { + let mut scenario = production_like_topology() + .expect_consensus_liveness() + .with_run_duration(Duration::from_secs(60)) + .build(); + // ... deploy and run + Ok(()) +} +``` + +### Pattern 3: Parameterized Scenarios + +Test the same behavior across different scales: + +```rust,ignore +use std::time::Duration; + +use anyhow::Result; +use testing_framework_core::scenario::{Deployer, ScenarioBuilder}; +use testing_framework_runner_local::LocalDeployer; +use testing_framework_workflows::ScenarioBuilderExt; + +async fn test_liveness_with_topology(validators: usize, executors: usize) -> Result<()> { + let mut scenario = ScenarioBuilder::topology_with(|t| { + t.network_star() + .validators(validators) + .executors(executors) + }) + .expect_consensus_liveness() + .with_run_duration(Duration::from_secs(60)) + .build(); + + let deployer = LocalDeployer::default(); + let runner = deployer.deploy(&scenario).await?; + runner.run(&mut scenario).await?; + Ok(()) +} + +#[tokio::test] +async fn liveness_small() -> Result<()> { + test_liveness_with_topology(2, 1).await +} + +#[tokio::test] +async fn liveness_medium() -> Result<()> { + test_liveness_with_topology(5, 2).await +} + +#[tokio::test] +async fn liveness_large() -> Result<()> { + test_liveness_with_topology(10, 3).await +} +``` + +--- + +## What Belongs Where? + +### Topology + +**Do include:** +- Node counts (`.validators(3)`, `.executors(1)`) +- Network shape (`.network_star()`) +- Role split (validators vs. 
executors) + +**Don't include:** +- Traffic rates (workload concern) +- Expected outcomes (expectation concern) +- Runtime behavior (runner/duration concern) + +### Workloads + +**Do include:** +- Transaction traffic (`.transactions_with(|tx| ...)`) +- DA traffic (`.da_with(|da| ...)`) +- Chaos injection (`.with_workload(RandomRestartWorkload::new(...))`) +- Rates, users, timing + +**Don't include:** +- Node configuration (topology concern) +- Success criteria (expectation concern) + +### Expectations + +**Do include:** +- Health checks (`.expect_consensus_liveness()`) +- Inclusion verification (built-in to workloads) +- Custom assertions (`.with_expectation(MyExpectation::new())`) + +**Don't include:** +- Traffic generation (workload concern) +- Cluster shape (topology concern) + +--- + +## Best Practices + +1. **Keep scenarios focused**: One scenario = one behavior under test +2. **Start small**: 2-3 validators, 1 executor, 30-60 seconds +3. **Use descriptive names**: `test_consensus_survives_validator_restart` not `test_1` +4. **Extract common patterns**: Shared topology builders, helper functions +5. **Document intent**: Add comments explaining what you're testing and why +6. **Mind the units**: `.rate(N)` is per-block, `.with_run_duration()` is wall-clock +7. **Set realistic durations**: Allow enough time for multiple blocks + workload effects + +--- + +## Next Steps + +- **[Core Content: Workloads & Expectations](workloads.md)** — Comprehensive reference for built-in workloads and expectations +- **[Examples](examples.md)** — More scenario patterns (DA, chaos, advanced topologies) +- **[Running Scenarios](running-scenarios.md)** — How execution works, artifacts produced, per-runner details +- **[API Levels](api-levels.md)** — When to use builder DSL vs. 
direct instantiation diff --git a/book/src/best-practices.md b/book/src/best-practices.md index 5564c3d..57dcd47 100644 --- a/book/src/best-practices.md +++ b/book/src/best-practices.md @@ -65,7 +65,7 @@ pub const LONG_RUN_DURATION: Duration = Duration::from_secs(300); - Use block statistics (`block_feed.stats().total_transactions()`) to verify inclusion **Collect metrics** -- Set up Prometheus/Grafana via `scripts/observability/deploy.sh -t compose -a up` for visualizing node behavior +- Set up Prometheus/Grafana via `scripts/setup/setup-observability.sh compose up` for visualizing node behavior - Use metrics to identify bottlenecks before adding more load - Monitor mempool size, block size, and consensus timing diff --git a/book/src/cucumber-bdd.md b/book/src/cucumber-bdd.md new file mode 100644 index 0000000..d084646 --- /dev/null +++ b/book/src/cucumber-bdd.md @@ -0,0 +1,85 @@ +# Cucumber/BDD Interface + +The Logos testing repo includes a small Cucumber (Gherkin) harness for “smoke” scenarios. It is useful when you want readable acceptance-style checks, but it intentionally exposes a limited surface area compared to Rust scenarios. + +--- + +## What Exists Today + +- Step definitions live in `testing-framework/cucumber`. +- The runnable entrypoints are binaries in `examples` (crate `runner-examples`): + - `cucumber_host` (local/host deployer) + - `cucumber_compose` (compose deployer) +- Feature files live in `examples/cucumber/features/`. +- Supported deployers: `local` and `compose` (no k8s runner integration in Cucumber yet). 
+ +--- + +## Example Feature (Matches Current Steps) + +This is the shape used by the repo’s smoke features: + +```gherkin +Feature: Testing Framework - Local Runner + + Scenario: Run a local smoke scenario (tx + DA + liveness) + Given deployer is "local" + And topology has 1 validators and 1 executors + And run duration is 60 seconds + And wallets total funds is 1000000000 split across 50 users + And transactions rate is 1 per block + And data availability channel rate is 1 per block and blob rate is 1 per block + And expect consensus liveness + When run scenario + Then scenario should succeed +``` + +--- + +## Running The Smoke Features + +Local runner smoke: + +```bash +POL_PROOF_DEV_MODE=true \ +cargo run -p runner-examples --bin cucumber_host +``` + +Compose runner smoke: + +```bash +POL_PROOF_DEV_MODE=true \ +cargo run -p runner-examples --bin cucumber_compose +``` + +--- + +## Available Steps (Current) + +Topology / runner selection: +- `Given deployer is "local"|"compose"` +- `Given topology has <int> validators and <int> executors` + +Run configuration: +- `Given run duration is <int> seconds` +- `Given wallets total funds is <int> split across <int> users` + +Workloads: +- `Given transactions rate is <int> per block` +- `Given transactions rate is <int> per block using <int> users` +- `Given data availability channel rate is <int> per block and blob rate is <int> per block` + +Expectations: +- `Given expect consensus liveness` +- `Given consensus liveness lag allowance is <int>` + +Execution + assertion: +- `When run scenario` +- `Then scenario should succeed` + +--- + +## Notes + +- The Cucumber harness builds scenarios using the same core + workflow builder APIs as the Rust examples, so the same prerequisites apply (notably `POL_PROOF_DEV_MODE=true` for practical runs). +- If you need more flexibility (custom workloads/expectations, richer checks, node control/chaos), write Rust scenarios instead: see [Examples](examples.md) and [Extending the Framework](extending.md).
diff --git a/book/src/glossary.md b/book/src/glossary.md index 32a2649..39152d4 100644 --- a/book/src/glossary.md +++ b/book/src/glossary.md @@ -55,4 +55,4 @@ ## External Resources -- **[Nomos Project Documentation](https://nomos-tech.notion.site/project)** — Protocol specifications, node internals, and architecture details +- **[Logos Project Documentation](https://nomos-tech.notion.site/project)** — Protocol specifications, node internals, and architecture details diff --git a/book/src/introduction.md b/book/src/introduction.md index e5b90ca..ae9c3c2 100644 --- a/book/src/introduction.md +++ b/book/src/introduction.md @@ -1,6 +1,6 @@ # Introduction -The Nomos Testing Framework is a purpose-built toolkit for exercising Logos in +The Logos Testing Framework is a purpose-built toolkit for exercising Logos in realistic, multi-node environments. It solves the gap between small, isolated tests and full-system validation by letting teams describe a cluster layout, drive meaningful traffic, and assert the outcomes in one coherent plan. @@ -43,4 +43,4 @@ runner.run(&mut scenario).await?; This pattern—topology, workloads, expectations, duration—repeats across all scenarios in this book. -**Learn more:** For protocol-level documentation and node internals, see the [Nomos Project Documentation](https://nomos-tech.notion.site/project). +**Learn more:** For protocol-level documentation and node internals, see the [Logos Project Documentation](https://nomos-tech.notion.site/project). 
diff --git a/book/src/logging-observability.md b/book/src/logging-observability.md index 016f2a0..88c2f0f 100644 --- a/book/src/logging-observability.md +++ b/book/src/logging-observability.md @@ -273,15 +273,21 @@ scripts/run/run-examples.sh -t 60 -v 3 -e 1 compose - Runners do **not** provision Grafana automatically (but `scripts/setup/setup-observability.sh` can) - If you set `NOMOS_GRAFANA_URL`, the deployer prints it in `TESTNET_ENDPOINTS` -- Dashboards live in `testing-framework/assets/stack/monitoring/grafana/dashboards/` for import into your Grafana +- Dashboards live in `testing-framework/assets/stack/monitoring/grafana/dashboards/` (the bundled stack auto-provisions them) **Example:** ```bash +# Bring up the bundled Prometheus+Grafana stack (optional) +scripts/setup/setup-observability.sh compose up +eval $(scripts/setup/setup-observability.sh compose env) + export NOMOS_GRAFANA_URL=http://localhost:3000 POL_PROOF_DEV_MODE=true scripts/run/run-examples.sh -t 60 -v 3 -e 1 compose ``` +**Default bundled Grafana login:** `admin` / `admin` (see `scripts/observability/compose/docker-compose.yml`). + ### Node APIs - Access from expectations: `ctx.node_clients().validator_clients().get(0)` diff --git a/book/src/part-iv.md b/book/src/part-iv.md new file mode 100644 index 0000000..2b61617 --- /dev/null +++ b/book/src/part-iv.md @@ -0,0 +1,44 @@ +# Part IV — Operations & Deployment + +This section covers operational aspects of running the testing framework: prerequisites, deployment configuration, continuous integration, and observability. 
+ +## What You'll Learn + +- **Prerequisites & Setup**: Required files, binaries, circuit assets, and environment configuration +- **Running Examples**: How to execute scenarios across host, compose, and k8s runners +- **CI Integration**: Automating tests in continuous integration pipelines with caching and matrix testing +- **Environment Variables**: Complete reference of all configuration variables +- **Logging & Observability**: Log collection strategies, metrics integration, and debugging techniques + +## Who This Section Is For + +- **Operators** setting up the framework for the first time +- **DevOps Engineers** integrating tests into CI/CD pipelines +- **Developers** debugging test failures or performance issues +- **Platform Engineers** deploying across different environments (local, Docker, Kubernetes) + +## Navigation + +This section is organized for progressive depth: + +1. Start with [Operations Overview](operations-overview.md) for the big picture +2. Follow [Prerequisites & Setup](prerequisites.md) to prepare your environment +3. Use [Running Examples](running-examples.md) to execute your first scenarios +4. Integrate with [CI Integration](ci-integration.md) for automated testing +5. Reference [Environment Variables](environment-variables.md) for complete configuration options +6. Debug with [Logging & Observability](logging-observability.md) when issues arise + +## Key Principles + +**Operational Hygiene:** Assets present, prerequisites satisfied, observability reachable + +**Environment Fit:** Choose the right deployment target based on isolation, reproducibility, and resource needs + +**Clear Signals:** Verify runners report node readiness before starting workloads + +**Failure Triage:** Map failures to specific causes—missing prerequisites, platform issues, or unmet expectations + +--- + +Ready to get started? 
Begin with [Operations Overview](operations-overview.md) → + diff --git a/book/src/part-v.md b/book/src/part-v.md index 6f8b082..2ff5245 100644 --- a/book/src/part-v.md +++ b/book/src/part-v.md @@ -1,44 +1,28 @@ -# Part V — Operations & Deployment +# Part V — Appendix -This section covers operational aspects of running the testing framework: prerequisites, deployment configuration, continuous integration, and observability. +Quick reference materials, troubleshooting guides, and supplementary information. -## What You'll Learn +## Contents -- **Prerequisites & Setup**: Required files, binaries, circuit assets, and environment configuration -- **Running Examples**: How to execute scenarios across host, compose, and k8s runners -- **CI Integration**: Automating tests in continuous integration pipelines with caching and matrix testing -- **Environment Variables**: Complete reference of all configuration variables -- **Logging & Observability**: Log collection strategies, metrics integration, and debugging techniques +- **Builder API Quick Reference**: Cheat sheet for DSL methods +- **Troubleshooting Scenarios**: Common issues and their solutions, including "What Failure Looks Like" with realistic examples +- **FAQ**: Frequently asked questions +- **Glossary**: Terminology reference -## Who This Section Is For +## When to Use This Section -- **Operators** setting up the framework for the first time -- **DevOps Engineers** integrating tests into CI/CD pipelines -- **Developers** debugging test failures or performance issues -- **Platform Engineers** deploying across different environments (local, Docker, Kubernetes) +- **Quick lookups**: Find DSL method signatures without reading full guides +- **Debugging failures**: Match symptoms to known issues and fixes +- **Clarifying concepts**: Look up unfamiliar terms in the glossary +- **Common questions**: Check FAQ before asking for help -## Navigation - -This section is organized for progressive depth: - -1. 
Start with [Operations Overview](operations-overview.md) for the big picture -2. Follow [Prerequisites & Setup](prerequisites.md) to prepare your environment -3. Use [Running Examples](running-examples.md) to execute your first scenarios -4. Integrate with [CI Integration](ci-integration.md) for automated testing -5. Reference [Environment Variables](environment-variables.md) for complete configuration options -6. Debug with [Logging & Observability](logging-observability.md) when issues arise - -## Key Principles - -**Operational Hygiene:** Assets present, prerequisites satisfied, observability reachable - -**Environment Fit:** Choose the right deployment target based on isolation, reproducibility, and resource needs - -**Clear Signals:** Verify runners report node readiness before starting workloads - -**Failure Triage:** Map failures to specific causes—missing prerequisites, platform issues, or unmet expectations +This section complements the main documentation with practical reference materials that you'll return to frequently during development and operations. --- -Ready to get started? Begin with [Operations Overview](operations-overview.md) → +Jump to: +- [Builder API Quick Reference](dsl-cheat-sheet.md) +- [Troubleshooting Scenarios](troubleshooting.md) +- [FAQ](faq.md) +- [Glossary](glossary.md) diff --git a/book/src/part-vi.md b/book/src/part-vi.md deleted file mode 100644 index 3e81d96..0000000 --- a/book/src/part-vi.md +++ /dev/null @@ -1,28 +0,0 @@ -# Part VI — Appendix - -Quick reference materials, troubleshooting guides, and supplementary information. 
- -## Contents - -- **Builder API Quick Reference**: Cheat sheet for DSL methods -- **Troubleshooting Scenarios**: Common issues and their solutions, including "What Failure Looks Like" with realistic examples -- **FAQ**: Frequently asked questions -- **Glossary**: Terminology reference - -## When to Use This Section - -- **Quick lookups**: Find DSL method signatures without reading full guides -- **Debugging failures**: Match symptoms to known issues and fixes -- **Clarifying concepts**: Look up unfamiliar terms in the glossary -- **Common questions**: Check FAQ before asking for help - -This section complements the main documentation with practical reference materials that you'll return to frequently during development and operations. - ---- - -Jump to: -- [Builder API Quick Reference](dsl-cheat-sheet.md) -- [Troubleshooting Scenarios](troubleshooting.md) -- [FAQ](faq.md) -- [Glossary](glossary.md) - diff --git a/book/src/project-context-primer.md b/book/src/project-context-primer.md index 5db6c1e..ef5b3f3 100644 --- a/book/src/project-context-primer.md +++ b/book/src/project-context-primer.md @@ -1,8 +1,8 @@ -# Nomos Testing Framework +# Logos Testing Framework **Declarative, multi-node blockchain testing for the Logos network** -The Nomos Testing Framework enables you to test consensus, data availability, and transaction workloads across local processes, Docker Compose, and Kubernetes deployments—all with a unified scenario API. +The Logos Testing Framework enables you to test consensus, data availability, and transaction workloads across local processes, Docker Compose, and Kubernetes deployments—all with a unified scenario API. [**Get Started**](quickstart.md) @@ -131,9 +131,9 @@ Check the **[Developer Reference](part-iii.md)** to implement custom workloads, These roles interact tightly, which is why meaningful testing must be performed in multi-node environments that include real networking, timing, and DA interaction. 
-The Nomos Testing Framework provides the infrastructure to orchestrate these multi-node scenarios reliably across development, CI, and production-like environments. +The Logos Testing Framework provides the infrastructure to orchestrate these multi-node scenarios reliably across development, CI, and production-like environments. -**Learn more about the protocol:** [Nomos Project Documentation](https://nomos-tech.notion.site/project) +**Learn more about the protocol:** [Logos Project Documentation](https://nomos-tech.notion.site/project) --- @@ -144,8 +144,8 @@ The Nomos Testing Framework provides the infrastructure to orchestrate these mul | **[Foundations](part-i.md)** | Architecture, philosophy, and design principles | | **[User Guide](part-ii.md)** | Writing and running scenarios, workloads, and expectations | | **[Developer Reference](part-iii.md)** | Extending the framework with custom components | -| **[Operations & Deployment](part-v.md)** | Setup, CI integration, and environment configuration | -| **[Appendix](part-vi.md)** | Quick reference, troubleshooting, FAQ, and glossary | +| **[Operations & Deployment](part-iv.md)** | Setup, CI integration, and environment configuration | +| **[Appendix](part-v.md)** | Quick reference, troubleshooting, FAQ, and glossary | --- diff --git a/book/src/running-scenarios.md b/book/src/running-scenarios.md index 9b144d1..87f948e 100644 --- a/book/src/running-scenarios.md +++ b/book/src/running-scenarios.md @@ -1,18 +1,118 @@ # Running Scenarios -Running a scenario follows the same conceptual flow regardless of environment: +This page focuses on how scenarios are executed (deploy → run → evaluate → cleanup), what artifacts you get back, and how that differs across runners. -1. Select or author a scenario plan that pairs a topology with workloads, - expectations, and a suitable run window. -2. Choose a deployer aligned with your environment (local, compose, or k8s) and - ensure its prerequisites are available. -3. 
Deploy the plan through the deployer, which provisions infrastructure and - returns a runner. -4. The runner orchestrates workload execution for the planned duration; keep - observability signals visible so you can correlate outcomes. -5. The runner evaluates expectations and captures results as the primary - pass/fail signal. +For “just run something that works” commands, see [Running Examples](running-examples.md). -Use the same plan across different deployers to compare behavior between local -development and CI or cluster settings. For environment prerequisites and -flags, see [Prerequisites & Setup](prerequisites.md) and [Environment Variables](environment-variables.md). +--- + +## Execution Flow (High Level) + +When you run a built scenario via a deployer, the run follows the same shape: + +```mermaid +flowchart TD + Build[Scenario built] --> Deploy[Deploy] + Deploy --> Capture[Capture] + Capture --> Execute[Execute] + Execute --> Evaluate[Evaluate] + Evaluate --> Cleanup[Cleanup] +``` + +- **Deploy**: provision infrastructure and start nodes (processes/containers/pods) +- **Capture**: establish clients/observability and capture initial state +- **Execute**: run workloads for the configured wall-clock duration +- **Evaluate**: run expectations (after the execution window ends) +- **Cleanup**: stop resources and finalize artifacts + +--- + +## The Core API + +```rust,ignore +use std::time::Duration; + +use testing_framework_core::scenario::{Deployer as _, ScenarioBuilder}; +use testing_framework_runner_local::LocalDeployer; +use testing_framework_workflows::ScenarioBuilderExt; + +async fn run_once() -> anyhow::Result<()> { + let mut scenario = ScenarioBuilder::topology_with(|t| t.network_star().validators(3).executors(1)) + .wallets(20) + .transactions_with(|tx| tx.rate(1).users(5)) + .expect_consensus_liveness() + .with_run_duration(Duration::from_secs(60)) + .build()?; + + let runner = LocalDeployer::default().deploy(&scenario).await?; + runner.run(&mut 
scenario).await?; + + Ok(()) +} +``` + +Notes: +- `with_run_duration(...)` is wall-clock time, not “number of blocks”. +- `.transactions_with(...)` rates are per-block. +- Most users should run scenarios via `scripts/run/run-examples.sh` unless they are embedding the framework in their own test crate. + +--- + +## Runner Differences + +### Local (Host) Runner + +- **Best for**: fast iteration and debugging +- **Logs/state**: stored under a temporary run directory unless you set `NOMOS_TESTS_KEEP_LOGS=1` and/or `NOMOS_LOG_DIR=...` +- **Limitations**: no node-control capability (chaos workflows that require node control won’t work here) + +Run the built-in local examples: + +```bash +POL_PROOF_DEV_MODE=true \ +scripts/run/run-examples.sh -t 60 -v 3 -e 1 host +``` + +### Compose Runner + +- **Best for**: reproducible multi-node environments and node control +- **Logs**: primarily via `docker compose logs` (and any node-level log configuration you apply) +- **Debugging**: set `COMPOSE_RUNNER_PRESERVE=1` to keep the environment up after a run + +Run the built-in compose examples: + +```bash +POL_PROOF_DEV_MODE=true \ +scripts/run/run-examples.sh -t 60 -v 3 -e 1 compose +``` + +### K8s Runner + +- **Best for**: production-like behavior, cluster scheduling/networking +- **Logs**: `kubectl logs ...` +- **Debugging**: set `K8S_RUNNER_PRESERVE=1` and `K8S_RUNNER_NAMESPACE=...` to keep resources around + +Run the built-in k8s examples: + +```bash +POL_PROOF_DEV_MODE=true \ +scripts/run/run-examples.sh -t 60 -v 3 -e 1 k8s +``` + +--- + +## Artifacts & Where to Look + +- **Node logs**: configure via `NOMOS_LOG_DIR`, `NOMOS_LOG_LEVEL`, `NOMOS_LOG_FILTER` (see [Logging & Observability](logging-observability.md)) +- **Runner logs**: controlled by `RUST_LOG` (runner process only) +- **Keep run directories**: set `NOMOS_TESTS_KEEP_LOGS=1` +- **Compose environment preservation**: set `COMPOSE_RUNNER_PRESERVE=1` +- **K8s environment preservation**: set `K8S_RUNNER_PRESERVE=1` + +--- + 
+## See Also + +- [Scenario Lifecycle](scenario-lifecycle.md) +- [Running Examples](running-examples.md) +- [Troubleshooting Scenarios](troubleshooting.md) diff --git a/book/src/troubleshooting.md b/book/src/troubleshooting.md index c42a54e..db396cd 100644 --- a/book/src/troubleshooting.md +++ b/book/src/troubleshooting.md @@ -481,17 +481,17 @@ cargo run -p runner-examples --bin local_runner When a test fails, check these in order: -1. ✅ **`POL_PROOF_DEV_MODE=true` is set** (REQUIRED for all runners) -2. ✅ **`versions.env` exists at repo root** -3. ✅ **KZG circuit assets present** (for DA workloads): `testing-framework/assets/stack/kzgrs_test_params/kzgrs_test_params` -4. ✅ **Node binaries available** (`NOMOS_NODE_BIN` / `NOMOS_EXECUTOR_BIN` set, or using `run-examples.sh`) -5. ✅ **Docker daemon running** (for compose/k8s) -6. ✅ **Docker image built** (`logos-blockchain-testing:local` exists for compose/k8s) -7. ✅ **No port conflicts** (`lsof -i :18080`, kill orphaned processes) -8. ✅ **Sufficient wallets** (`.wallets(N)` ≥ `.users(M)`) -9. ✅ **Enough resources** (Docker memory 8GB+, ulimit -n 4096) -10. ✅ **Run duration appropriate** (long enough for consensus timing) -11. ✅ **Logs persisted** (`NOMOS_LOG_DIR` + `NOMOS_TESTS_KEEP_LOGS=1` if needed) +1. **`POL_PROOF_DEV_MODE=true` is set** (REQUIRED for all runners) +2. **`versions.env` exists at repo root** +3. **KZG circuit assets present** (for DA workloads): `testing-framework/assets/stack/kzgrs_test_params/kzgrs_test_params` +4. **Node binaries available** (`NOMOS_NODE_BIN` / `NOMOS_EXECUTOR_BIN` set, or using `run-examples.sh`) +5. **Docker daemon running** (for compose/k8s) +6. **Docker image built** (`logos-blockchain-testing:local` exists for compose/k8s) +7. **No port conflicts** (`lsof -i :18080`, kill orphaned processes) +8. **Sufficient wallets** (`.wallets(N)` ≥ `.users(M)`) +9. **Enough resources** (Docker memory 8GB+, ulimit -n 4096) +10. 
**Run duration appropriate** (long enough for consensus timing) +11. **Logs persisted** (`NOMOS_LOG_DIR` + `NOMOS_TESTS_KEEP_LOGS=1` if needed) **Still stuck?** Check node logs (see [Where to Find Logs](#where-to-find-logs)) for the actual error. diff --git a/book/src/what-you-will-learn.md b/book/src/what-you-will-learn.md index d5e97a8..23a5f63 100644 --- a/book/src/what-you-will-learn.md +++ b/book/src/what-you-will-learn.md @@ -56,7 +56,7 @@ without changing the plan. ## What This Book Does NOT Cover -- **Nomos node internals** — This book focuses on testing infrastructure, not the blockchain protocol implementation. See the Nomos node repository for protocol documentation. +- **Logos node internals** — This book focuses on testing infrastructure, not the blockchain protocol implementation. See the Logos node repository (`nomos-node`) for protocol documentation. - **Consensus algorithm theory** — We assume familiarity with basic blockchain concepts (validators, blocks, transactions, data availability). - **Rust language basics** — Examples use Rust, but we don't teach the language. See [The Rust Book](https://doc.rust-lang.org/book/) if you're new to Rust. - **Kubernetes administration** — We show how to use the K8s runner, but don't cover cluster setup, networking, or operations. diff --git a/book/src/workloads.md b/book/src/workloads.md index b552410..ddcd0f2 100644 --- a/book/src/workloads.md +++ b/book/src/workloads.md @@ -1,27 +1,10 @@ # Core Content: Workloads & Expectations -Workloads describe the activity a scenario generates; expectations describe the -signals that must hold when that activity completes. Both are pluggable so -scenarios stay readable and purpose-driven. +Workloads describe the activity a scenario generates; expectations describe the signals that must hold when that activity completes. 
This page is the **canonical reference** for all built-in workloads and expectations, including configuration knobs, defaults, prerequisites, and debugging guidance. -## Workloads +--- -- **Transaction workload**: submits user-level transactions at a configurable - rate and can limit how many distinct actors participate. -- **Data-availability workload**: drives blob and channel activity to exercise - data-availability paths. -- **Chaos workload**: triggers controlled node restarts to test resilience and - recovery behaviors (requires a runner that can control nodes). - -## Expectations - -- **Consensus liveness**: verifies the system continues to produce blocks in - line with the planned workload and timing window. -- **Workload-specific checks**: each workload can attach its own success - criteria (e.g., inclusion of submitted activity) so scenarios remain concise. - -Together, workloads and expectations let you express both the pressure applied -to the system and the definition of "healthy" for that run. +## Overview ```mermaid flowchart TD @@ -31,8 +14,511 @@ flowchart TD Collect --> Eval[Expectations evaluate] ``` +**Key concepts:** +- **Workloads** run during the **execution phase** (generate traffic) +- **Expectations** run during the **evaluation phase** (check health signals) +- Each workload can attach its own expectations automatically +- Expectations can also be added explicitly + +--- + +## Built-in Workloads + +### 1. Transaction Workload + +Submits user-level transactions at a configurable rate to exercise transaction processing and inclusion paths. + +**Import:** +```rust,ignore +use testing_framework_workflows::workloads::transaction::Workload; +``` + +#### Configuration + +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `rate` | `u64` | **Required** | Transactions per block (not per second!) 
| +| `users` | `Option` | All wallets | Number of distinct wallet accounts to use | + +#### DSL Usage + +```rust,ignore +use testing_framework_workflows::ScenarioBuilderExt; + +ScenarioBuilder::topology_with(|t| t.network_star().validators(3).executors(1)) + .wallets(20) // Seed 20 wallet accounts + .transactions_with(|tx| { + tx.rate(10) // 10 transactions per block + .users(5) // Use only 5 of the 20 wallets + }) + .with_run_duration(Duration::from_secs(60)) + .build(); +``` + +#### Direct Instantiation + +```rust,ignore +use testing_framework_workflows::workloads::transaction; + +let tx_workload = transaction::Workload::with_rate(10) + .expect("transaction rate must be non-zero"); + +ScenarioBuilder::topology_with(|t| t.network_star().validators(3).executors(1)) + .wallets(20) + .with_workload(tx_workload) + .with_run_duration(Duration::from_secs(60)) + .build(); +``` + +#### Prerequisites + +1. **Wallet accounts must be seeded:** + ```rust,ignore + .wallets(N) // Before .transactions_with() + ``` + The workload will fail during `init()` if no wallets are configured. + +2. **Proof generation must be fast:** + ```bash + export POL_PROOF_DEV_MODE=true + ``` + Without this, proof generation takes ~30-60 seconds per transaction, causing timeouts. + +3. **Circuit artifacts must be available:** + - Automatically staged by `scripts/run/run-examples.sh` + - Or manually via `scripts/setup/setup-circuits-stack.sh` (recommended) / `scripts/setup/setup-nomos-circuits.sh` + +#### Attached Expectation + +**TxInclusionExpectation** — Verifies that submitted transactions were included in blocks. 
+
+**What it checks:**
+- At least `N` transactions were included on-chain (where N = rate × expected block count; see the worked example below)
+- Uses BlockFeed to count transactions across all observed blocks
+
+**Failure modes:**
+- "Expected >= X transactions, observed Y" (Y < X)
+- Common causes: proof generation timeouts, node crashes, insufficient duration
+
+#### What Failure Looks Like
+
+```text
+Error: Expectation failed: TxInclusionExpectation
+  Expected: >= 600 transactions (10 tx/block × 60 blocks)
+  Observed: 127 transactions
+
+  Possible causes:
+  - POL_PROOF_DEV_MODE not set (proof generation too slow)
+  - Duration too short (nodes still syncing)
+  - Node crashes (check logs for panics/OOM)
+  - Wallet accounts not seeded (check topology config)
+```
+
+**How to debug:**
+1. Check logs for proof generation timing:
+   ```bash
+   grep "proof generation" $NOMOS_LOG_DIR/executor-0/*.log
+   ```
+2. Verify `POL_PROOF_DEV_MODE=true` was set
+3. Increase duration: `.with_run_duration(Duration::from_secs(120))`
+4. Reduce rate: `.rate(5)` instead of `.rate(10)`
+
+---
+
+### 2. Data Availability (DA) Workload
+
+Drives blob and channel activity to exercise data availability paths and storage.
+ +**Import:** +```rust,ignore +use testing_framework_workflows::workloads::da::Workload; +``` + +#### Configuration + +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `blob_rate_per_block` | `NonZeroU64` | **Required** | Blobs to publish per block | +| `channel_rate_per_block` | `NonZeroU64` | **Required** | Channels to create per block | +| `headroom_percent` | `u64` | `20` | Extra capacity for channel planning (avoids saturation) | + +#### DSL Usage + +```rust,ignore +use testing_framework_workflows::ScenarioBuilderExt; + +ScenarioBuilder::topology_with(|t| t.network_star().validators(3).executors(2)) + .da_with(|da| { + da.channel_rate(2) // 2 channels per block + .blob_rate(4) // 4 blobs per block + }) + .with_run_duration(Duration::from_secs(120)) + .build(); +``` + +#### Direct Instantiation + +```rust,ignore +use std::num::NonZeroU64; +use testing_framework_workflows::workloads::da; + +let da_workload = da::Workload::with_rate( + NonZeroU64::new(4).unwrap(), // blob_rate_per_block + NonZeroU64::new(2).unwrap(), // channel_rate_per_block + 20, // headroom_percent +); + +ScenarioBuilder::topology_with(|t| t.network_star().validators(3).executors(2)) + .with_workload(da_workload) + .with_run_duration(Duration::from_secs(120)) + .build(); +``` + +#### Prerequisites + +1. **Executors must be present:** + ```rust,ignore + .executors(N) // At least 1 executor + ``` + DA workload requires executor nodes to handle blob publishing. + +2. **Sufficient duration:** + Channel creation and blob publishing are slower than transaction submission. Allow 120+ seconds. + +3. **Circuit artifacts:** + Same as transaction workload (POL_PROOF_DEV_MODE, circuits staged). + +#### Attached Expectation + +**DaWorkloadExpectation** — Verifies blobs and channels were created and published. 
+
+**What it checks:**
+- At least `N` channels were created (where N = channel_rate × expected blocks)
+- At least `M` blobs were published (where M = blob_rate × expected blocks, adjusted for `headroom_percent` planning slack)
+- Uses BlockFeed and executor API to verify
+
+**Failure modes:**
+- "Expected >= X channels, observed Y" (Y < X)
+- "Expected >= X blobs, observed Y" (Y < X)
+- Common causes: executor crashes, insufficient duration, DA saturation
+
+#### What Failure Looks Like
+
+```text
+Error: Expectation failed: DaWorkloadExpectation
+  Expected: >= 60 channels (2 channels/block × 30 blocks)
+  Observed: 23 channels
+
+  Possible causes:
+  - Executors crashed or restarted (check executor logs)
+  - Duration too short (channels still being created)
+  - Blob publishing failed (check executor API errors)
+  - Network issues (check validator/executor connectivity)
+```
+
+**How to debug:**
+1. Check executor logs:
+   ```bash
+   grep "channel\|blob" $NOMOS_LOG_DIR/executor-0/*.log
+   ```
+2. Verify executors stayed running:
+   ```bash
+   grep "panic\|killed" $NOMOS_LOG_DIR/executor-*/*.log
+   ```
+3. Increase duration: `.with_run_duration(Duration::from_secs(180))`
+4. Reduce rates: `.channel_rate(1).blob_rate(2)`
+
+---
+
+### 3. Chaos Workload (Random Restart)
+
+Triggers controlled node restarts to test resilience and recovery behaviors.
+ +**Import:** +```rust,ignore +use testing_framework_workflows::workloads::chaos::RandomRestartWorkload; +``` + +#### Configuration + +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `min_delay` | `Duration` | **Required** | Minimum time between restart attempts | +| `max_delay` | `Duration` | **Required** | Maximum time between restart attempts | +| `target_cooldown` | `Duration` | **Required** | Minimum time before restarting same node again | +| `include_validators` | `bool` | **Required** | Whether to restart validators | +| `include_executors` | `bool` | **Required** | Whether to restart executors | + +#### Usage + +```rust,ignore +use std::time::Duration; + +use testing_framework_core::scenario::ScenarioBuilder; +use testing_framework_workflows::{ScenarioBuilderExt, workloads::chaos::RandomRestartWorkload}; + +let scenario = ScenarioBuilder::topology_with(|t| { + t.network_star().validators(3).executors(2) +}) +.enable_node_control() // REQUIRED for chaos +.with_workload(RandomRestartWorkload::new( + Duration::from_secs(45), // min_delay + Duration::from_secs(75), // max_delay + Duration::from_secs(120), // target_cooldown + true, // include_validators + true, // include_executors +)) +.expect_consensus_liveness() +.with_run_duration(Duration::from_secs(180)) +.build(); +``` + +#### Prerequisites + +1. **Node control must be enabled:** + ```rust,ignore + .enable_node_control() + ``` + This adds `NodeControlCapability` to the scenario. + +2. **Runner must support node control:** + - **Compose runner:** Supported + - **Local runner:** Not supported + - **K8s runner:** Not yet implemented + +3. **Sufficient topology:** + - For validators: Need >1 validator (workload skips if only 1) + - For executors: Can restart all executors + +4. **Realistic timing:** + - Total duration should be 2-3× the max_delay + cooldown + - Example: max_delay=75s, cooldown=120s → duration >= 180s + +#### Attached Expectation + +None. 
You must explicitly add expectations (typically `.expect_consensus_liveness()`). + +**Why?** Chaos workloads are about testing recovery under disruption. The appropriate expectation depends on what you're testing: +- Consensus survives restarts → `.expect_consensus_liveness()` +- Height converges after chaos → Custom expectation checking BlockFeed + +#### What Failure Looks Like + +```text +Error: Workload failed: chaos_restart + Cause: NodeControlHandle not available + + Possible causes: + - Forgot .enable_node_control() in scenario builder + - Using local runner (doesn't support node control) + - Using k8s runner (doesn't support node control) +``` + +**Or:** + +```text +Error: Expectation failed: ConsensusLiveness + Expected: >= 20 blocks + Observed: 8 blocks + + Possible causes: + - Restart frequency too high (nodes can't recover) + - Consensus timing too slow (increase duration) + - Too many validators restarted simultaneously + - Nodes crashed after restart (check logs) +``` + +**How to debug:** +1. Check restart events in logs: + ```bash + grep "restarting\|restart complete" $NOMOS_LOG_DIR/*/*.log + ``` +2. Verify node control is enabled: + ```bash + grep "NodeControlHandle" $NOMOS_LOG_DIR/*/*.log + ``` +3. Increase cooldown: `Duration::from_secs(180)` +4. Reduce restart scope: `include_validators = false` (test executors only) +5. Increase duration: `.with_run_duration(Duration::from_secs(300))` + +--- + +## Built-in Expectations + +### 1. Consensus Liveness + +Verifies the system continues to produce blocks during the execution window. 
+ +**Import:** +```rust,ignore +use testing_framework_workflows::ScenarioBuilderExt; +``` + +#### DSL Usage + +```rust,ignore +ScenarioBuilder::topology_with(|t| t.network_star().validators(3).executors(1)) + .expect_consensus_liveness() + .with_run_duration(Duration::from_secs(60)) + .build(); +``` + +#### What It Checks + +- At least `N` blocks were produced (where N = duration / expected_block_time) +- Uses BlockFeed to count observed blocks +- Compares against a minimum threshold (typically 50% of theoretical max) + +#### Failure Modes + +```text +Error: Expectation failed: ConsensusLiveness + Expected: >= 30 blocks + Observed: 3 blocks + + Possible causes: + - Nodes crashed or never started (check logs) + - Consensus timing misconfigured (CONSENSUS_SLOT_TIME too high) + - Insufficient validators (need >= 2 for BFT consensus) + - Duration too short (nodes still syncing) +``` + +#### How to Debug + +1. Check if nodes started: + ```bash + grep "node started\|listening on" $NOMOS_LOG_DIR/*/*.log + ``` +2. Check block production: + ```bash + grep "block.*height" $NOMOS_LOG_DIR/validator-*/*.log + ``` +3. Check consensus participation: + ```bash + grep "consensus.*slot\|proposal" $NOMOS_LOG_DIR/validator-*/*.log + ``` +4. Increase duration: `.with_run_duration(Duration::from_secs(120))` +5. Check env vars: `echo $CONSENSUS_SLOT_TIME $CONSENSUS_ACTIVE_SLOT_COEFF` + +--- + +### 2. Workload-Specific Expectations + +Each workload automatically attaches its own expectation: + +| Workload | Expectation | What It Checks | +|----------|-------------|----------------| +| Transaction | `TxInclusionExpectation` | Transactions were included in blocks | +| DA | `DaWorkloadExpectation` | Blobs and channels were created/published | +| Chaos | (None) | Add `.expect_consensus_liveness()` explicitly | + +These expectations are added automatically when using the DSL (`.transactions_with()`, `.da_with()`). 
+ +--- + +## Configuration Quick Reference + +### Transaction Workload + +```rust,ignore +.wallets(20) +.transactions_with(|tx| tx.rate(10).users(5)) +``` + +| What | Value | Unit | +|------|-------|------| +| Rate | 10 | tx/block | +| Users | 5 | wallet accounts | +| Wallets | 20 | total seeded | + +### DA Workload + +```rust,ignore +.da_with(|da| da.channel_rate(2).blob_rate(4)) +``` + +| What | Value | Unit | +|------|-------|------| +| Channel rate | 2 | channels/block | +| Blob rate | 4 | blobs/block | +| Headroom | 20 | percent | + +### Chaos Workload + +```rust,ignore +.enable_node_control() +.with_workload(RandomRestartWorkload::new( + Duration::from_secs(45), // min + Duration::from_secs(75), // max + Duration::from_secs(120), // cooldown + true, // validators + true, // executors +)) +``` + +--- + +## Common Patterns + +### Pattern 1: Multiple Workloads + +```rust,ignore +ScenarioBuilder::topology_with(|t| t.network_star().validators(3).executors(2)) + .wallets(20) + .transactions_with(|tx| tx.rate(5).users(10)) + .da_with(|da| da.channel_rate(2).blob_rate(2)) + .expect_consensus_liveness() + .with_run_duration(Duration::from_secs(120)) + .build(); +``` + +All workloads run concurrently. Expectations for each workload run after the execution window ends. + +### Pattern 2: Custom Expectation + +```rust,ignore +use testing_framework_core::scenario::Expectation; + +struct MyCustomExpectation; + +#[async_trait] +impl Expectation for MyCustomExpectation { + async fn evaluate(&self, ctx: &RunContext) -> Result<(), DynError> { + // Access BlockFeed, metrics, topology, etc. 
+ let block_count = ctx.block_feed()?.count(); + if block_count < 10 { + return Err("Not enough blocks".into()); + } + Ok(()) + } +} + +ScenarioBuilder::topology_with(|t| t.network_star().validators(3).executors(1)) + .with_expectation(MyCustomExpectation) + .with_run_duration(Duration::from_secs(60)) + .build(); +``` + +--- + +## Debugging Checklist + +When a workload or expectation fails: + +1. Check logs: `$NOMOS_LOG_DIR/*/` or `docker compose logs` or `kubectl logs` +2. Verify environment variables: `POL_PROOF_DEV_MODE`, `NOMOS_NODE_BIN`, etc. +3. Check prerequisites: wallets, executors, node control, circuits +4. Increase duration: Double the run duration and retry +5. Reduce rates: Half the traffic rates and retry +6. Check metrics: Prometheus queries for block height, tx count, DA stats +7. Reproduce locally: Use local runner for faster iteration + +--- + ## See Also -- **[RunContext: BlockFeed & Node Control](node-control.md)** — Learn how to use BlockFeed in expectations to observe blocks in real-time, and how to access node control for chaos testing +- **[Authoring Scenarios](authoring-scenarios.md)** — Step-by-step tutorial for building scenarios +- **[RunContext: BlockFeed & Node Control](node-control.md)** — Learn how to use BlockFeed in expectations and access node control - **[Examples](examples.md)** — Concrete scenario patterns combining workloads and expectations - **[Extending the Framework](extending.md)** — Implement custom workloads and expectations +- **[Troubleshooting](troubleshooting.md)** — Common failure scenarios and fixes diff --git a/docs/_config.yml b/docs/_config.yml index efa17fd..cacc7d2 100644 --- a/docs/_config.yml +++ b/docs/_config.yml @@ -1,3 +1,3 @@ -title: Nomos Testing Docs +title: Logos Testing Docs description: Redirect to mdBook documentation theme: jekyll-theme-primer diff --git a/examples/src/bin/cucumber_compose.rs b/examples/src/bin/cucumber_compose.rs index 6defa41..fdf079a 100644 --- 
a/examples/src/bin/cucumber_compose.rs +++ b/examples/src/bin/cucumber_compose.rs @@ -1,13 +1,14 @@ -use runner_examples::{ - cucumber::run, - defaults::{Mode, init_logging_defaults, init_node_log_dir_defaults, init_tracing}, +use cucumber::World; +use cucumber_ext::TestingFrameworkWorld; +use runner_examples::defaults::{ + Mode, init_logging_defaults, init_node_log_dir_defaults, init_tracing, }; -#[tokio::main(flavor = "current_thread")] +#[tokio::main] async fn main() { init_logging_defaults(); init_node_log_dir_defaults(Mode::Compose); init_tracing(); - run(Mode::Compose).await; + TestingFrameworkWorld::run("examples/cucumber/features/compose_smoke.feature").await; } diff --git a/examples/src/bin/cucumber_host.rs b/examples/src/bin/cucumber_host.rs index fb5670d..074dc7e 100644 --- a/examples/src/bin/cucumber_host.rs +++ b/examples/src/bin/cucumber_host.rs @@ -1,13 +1,14 @@ -use runner_examples::{ - cucumber::run, - defaults::{Mode, init_logging_defaults, init_node_log_dir_defaults, init_tracing}, +use cucumber::World; +use cucumber_ext::TestingFrameworkWorld; +use runner_examples::defaults::{ + Mode, init_logging_defaults, init_node_log_dir_defaults, init_tracing, }; -#[tokio::main(flavor = "current_thread")] +#[tokio::main] async fn main() { init_logging_defaults(); init_node_log_dir_defaults(Mode::Host); init_tracing(); - run(Mode::Host).await; + TestingFrameworkWorld::run("examples/cucumber/features/local_smoke.feature").await; } diff --git a/scripts/build/build_test_image.sh b/scripts/build/build_test_image.sh index aea541c..9619a40 100755 --- a/scripts/build/build_test_image.sh +++ b/scripts/build/build_test_image.sh @@ -126,7 +126,7 @@ build_test_image::print_config() { echo "Dockerfile: ${DOCKERFILE_PATH}" echo "Base image tag: ${BASE_IMAGE_TAG}" echo "Base Dockerfile: ${BASE_DOCKERFILE_PATH}" - echo "Nomos node rev: ${NOMOS_NODE_REV}" + echo "Logos node rev: ${NOMOS_NODE_REV}" echo "Circuits override: ${CIRCUITS_OVERRIDE:-}" echo "Circuits version 
(download fallback): ${VERSION}" echo "Circuits platform: ${CIRCUITS_PLATFORM}" diff --git a/scripts/setup/setup-observability.sh b/scripts/setup/setup-observability.sh index 99f8ca0..a8f9d03 100755 --- a/scripts/setup/setup-observability.sh +++ b/scripts/setup/setup-observability.sh @@ -21,7 +21,7 @@ Compose: Kubernetes: - Installs prometheus-community/kube-prometheus-stack into namespace - "nomos-observability" and optionally loads Nomos Grafana dashboards. + "logos-observability" and optionally loads Logos Grafana dashboards. - Prints port-forward commands + NOMOS_METRICS_* / NOMOS_GRAFANA_URL exports. USAGE } @@ -49,8 +49,8 @@ export NOMOS_GRAFANA_URL=http://localhost:3000 EOF } -k8s_namespace() { echo "nomos-observability"; } -k8s_release() { echo "nomos-observability"; } +k8s_namespace() { echo "${LOGOS_OBSERVABILITY_NAMESPACE:-${NOMOS_OBSERVABILITY_NAMESPACE:-logos-observability}}"; } +k8s_release() { echo "${LOGOS_OBSERVABILITY_RELEASE:-${NOMOS_OBSERVABILITY_RELEASE:-logos-observability}}"; } k8s_values() { echo "${ROOT}/scripts/observability/k8s/kube-prometheus-stack.values.yaml"; } k8s_install() { @@ -103,7 +103,7 @@ k8s_apply_dashboards() { local file base name for file in "${dash_dir}"/*.json; do base="$(basename "${file}" .json)" - name="nomos-dashboard-${base//[^a-zA-Z0-9-]/-}" + name="logos-dashboard-${base//[^a-zA-Z0-9-]/-}" kubectl -n "${ns}" create configmap "${name}" \ --from-file="$(basename "${file}")=${file}" \ --dry-run=client -o yaml | kubectl apply -f - diff --git a/testing-framework/assets/stack/Dockerfile.base b/testing-framework/assets/stack/Dockerfile.base index ac1d306..ad2e179 100644 --- a/testing-framework/assets/stack/Dockerfile.base +++ b/testing-framework/assets/stack/Dockerfile.base @@ -20,7 +20,7 @@ ARG CIRCUITS_PLATFORM LABEL maintainer="augustinas@status.im" \ source="https://github.com/logos-co/nomos-node" \ - description="Nomos testnet build image" + description="Logos testnet build image" WORKDIR /workspace COPY . . 
@@ -61,7 +61,7 @@ FROM ubuntu:24.04 AS base LABEL maintainer="augustinas@status.im" \ source="https://github.com/logos-co/nomos-node" \ - description="Nomos base runtime image (testing)" + description="Logos base runtime image (testing)" RUN apt-get update && apt-get install -yq \ libstdc++6 \ diff --git a/testing-framework/assets/stack/Dockerfile.runtime b/testing-framework/assets/stack/Dockerfile.runtime index aa050fa..1599f22 100644 --- a/testing-framework/assets/stack/Dockerfile.runtime +++ b/testing-framework/assets/stack/Dockerfile.runtime @@ -4,6 +4,6 @@ ARG BASE_IMAGE=logos-blockchain-testing:base FROM ${BASE_IMAGE} -LABEL description="Nomos runtime image for compose/k8s testing" +LABEL description="Logos runtime image for compose/k8s testing" ENTRYPOINT ["/usr/bin/nomos-node"] diff --git a/testing-framework/assets/stack/Dockerfile.testnet b/testing-framework/assets/stack/Dockerfile.testnet index 07cae93..ef5f46e 100644 --- a/testing-framework/assets/stack/Dockerfile.testnet +++ b/testing-framework/assets/stack/Dockerfile.testnet @@ -4,6 +4,6 @@ ARG BASE_IMAGE=logos-blockchain-testing:base FROM ${BASE_IMAGE} -LABEL description="Nomos testnet image (publishable)" +LABEL description="Logos testnet image (publishable)" ENTRYPOINT ["/usr/bin/nomos-node"] diff --git a/testing-framework/assets/stack/monitoring/grafana/dashboards/overview-dashboard.json b/testing-framework/assets/stack/monitoring/grafana/dashboards/overview-dashboard.json index c591d69..a3a26fe 100644 --- a/testing-framework/assets/stack/monitoring/grafana/dashboards/overview-dashboard.json +++ b/testing-framework/assets/stack/monitoring/grafana/dashboards/overview-dashboard.json @@ -1019,7 +1019,7 @@ }, "timepicker": {}, "timezone": "", - "title": "Nomos Overview Dashboard", + "title": "Logos Overview Dashboard", "uid": "overview-dashboard", "version": 1, "weekStart": "" diff --git a/testing-framework/cucumber/src/world.rs b/testing-framework/cucumber/src/world.rs index f4ffaae..b8ee0d6 100644 
--- a/testing-framework/cucumber/src/world.rs +++ b/testing-framework/cucumber/src/world.rs @@ -243,7 +243,7 @@ impl TestingFrameworkWorld { if !(node_ok && exec_ok) { return Err(StepError::Preflight { - message: "Missing Nomos host binaries. Set NOMOS_NODE_BIN and NOMOS_EXECUTOR_BIN, or run `scripts/run/run-examples.sh host` to restore them into `testing-framework/assets/stack/bin`.".to_owned(), + message: "Missing Logos host binaries. Set NOMOS_NODE_BIN and NOMOS_EXECUTOR_BIN, or run `scripts/run/run-examples.sh host` to restore them into `testing-framework/assets/stack/bin`.".to_owned(), }); } } diff --git a/testing-framework/deployers/compose/src/deployer/mod.rs b/testing-framework/deployers/compose/src/deployer/mod.rs index 22c6acb..27dff56 100644 --- a/testing-framework/deployers/compose/src/deployer/mod.rs +++ b/testing-framework/deployers/compose/src/deployer/mod.rs @@ -12,7 +12,7 @@ use testing_framework_core::scenario::{ use crate::{errors::ComposeRunnerError, lifecycle::cleanup::RunnerCleanup}; -/// Docker Compose-based deployer for Nomos test scenarios. +/// Docker Compose-based deployer for Logos test scenarios. #[derive(Clone, Copy)] pub struct ComposeDeployer { readiness_checks: bool, diff --git a/testing-framework/deployers/k8s/helm/nomos-runner/Chart.yaml b/testing-framework/deployers/k8s/helm/nomos-runner/Chart.yaml index 1785e7e..4174927 100644 --- a/testing-framework/deployers/k8s/helm/nomos-runner/Chart.yaml +++ b/testing-framework/deployers/k8s/helm/nomos-runner/Chart.yaml @@ -1,6 +1,6 @@ apiVersion: v2 name: nomos-runner -description: Helm chart for Nomos integration test runner assets +description: Helm chart for Logos integration test runner assets type: application version: 0.1.0 appVersion: "0.1.0"