diff --git a/book/book.toml b/book/book.toml new file mode 100644 index 0000000..385e6fb --- /dev/null +++ b/book/book.toml @@ -0,0 +1,13 @@ +[book] +authors = ["Nomos Testing"] +language = "en" +multilingual = false +src = "src" +title = "Nomos Testing Book" + +[build] +# Keep book output in target/ to avoid polluting the workspace root. +build-dir = "../target/book" + +[output.html] +default-theme = "light" diff --git a/book/combined.md b/book/combined.md new file mode 100644 index 0000000..6909b9f --- /dev/null +++ b/book/combined.md @@ -0,0 +1,549 @@ +# Nomos Testing Framework — Combined Reference + +## Project Context Primer +This book focuses on the Nomos Testing Framework. It assumes familiarity with +the Nomos architecture, but for completeness, here is a short primer. + +- **Nomos** is a modular blockchain protocol composed of validators, executors, + and a data-availability (DA) subsystem. +- **Validators** participate in consensus and produce blocks. +- **Executors** run application logic or off-chain computations referenced by + blocks. +- **Data Availability (DA)** ensures that data referenced in blocks is + published and retrievable, including blobs or channel data used by workloads. + +These roles interact tightly, which is why meaningful testing must be performed +in multi-node environments that include real networking, timing, and DA +interaction. + +## What You Will Learn +This book gives you a clear mental model for Nomos multi-node testing, shows how +to author scenarios that pair realistic workloads with explicit expectations, +and guides you to run them across local, containerized, and cluster environments +without changing the plan. + +## Part I — Foundations + +### Introduction +The Nomos Testing Framework is a purpose-built toolkit for exercising Nomos in +realistic, multi-node environments. It solves the gap between small, isolated +tests and full-system validation by letting teams describe a cluster layout, +drive meaningful traffic, and assert the outcomes in one coherent plan. + +It is for protocol engineers, infrastructure operators, and QA teams who need +repeatable confidence that validators, executors, and data-availability +components work together under network and timing constraints. + +Multi-node integration testing is required because many Nomos behaviors—block +progress, data availability, liveness under churn—only emerge when several +roles interact over real networking and time. This framework makes those checks +declarative, observable, and portable across environments. + +### Architecture Overview +The framework follows a clear flow: **Topology → Scenario → Runner → Workloads → Expectations**. + +- **Topology** describes the cluster: how many nodes, their roles, and the high-level network and data-availability parameters they should follow. +- **Scenario** combines that topology with the activities to run and the checks to perform, forming a single plan. +- **Deployer/Runner** pair turns the plan into a live environment on the chosen backend (local processes, Docker Compose, or Kubernetes) and brokers readiness. +- **Workloads** generate traffic and conditions that exercise the system. +- **Expectations** observe the run and judge success or failure once activity completes. + +Conceptual diagram: +``` +Topology → Scenario → Runner → Workloads → Expectations + (shape (plan) (deploy (drive (verify + cluster) & orchestrate) traffic) outcomes) +``` + +Mermaid view: +```mermaid +flowchart LR + A(Topology
shape cluster) --> B(Scenario
plan) + B --> C(Deployer/Runner
deploy & orchestrate) + C --> D(Workloads
drive traffic) + D --> E(Expectations
verify outcomes) +``` + +Each layer has a narrow responsibility so that cluster shape, deployment choice, traffic generation, and health checks can evolve independently while fitting together predictably. + +### Testing Philosophy +- **Declarative over imperative**: describe the desired cluster shape, traffic, and success criteria; let the framework orchestrate the run. +- **Observable health signals**: prefer liveness and inclusion signals that reflect real user impact instead of internal debug state. +- **Determinism first**: default scenarios aim for repeatable outcomes with fixed topologies and traffic rates; variability is opt-in. +- **Targeted non-determinism**: introduce randomness (e.g., restarts) only when probing resilience or operational robustness. +- **Protocol time, not wall time**: reason in blocks and protocol-driven intervals to reduce dependence on host speed or scheduler noise. +- **Minimum run window**: always allow enough block production to make assertions meaningful; very short runs risk false confidence. +- **Use chaos with intent**: chaos workloads are for recovery and fault-tolerance validation, not for baseline functional checks. + +### Scenario Lifecycle (Conceptual) +1. **Build the plan**: Declare a topology, attach workloads and expectations, and set the run window. The plan is the single source of truth for what will happen. +2. **Deploy**: Hand the plan to a runner. It provisions the environment on the chosen backend and waits for nodes to signal readiness. +3. **Drive workloads**: Start traffic and behaviors (transactions, data-availability activity, restarts) for the planned duration. +4. **Observe blocks and signals**: Track block progression and other high-level metrics during or after the run window to ground assertions in protocol time. +5. **Evaluate expectations**: Once activity stops (and optional cooldown completes), check liveness and workload-specific outcomes to decide pass or fail. +6. **Cleanup**: Tear down resources so successive runs start fresh and do not inherit leaked state. + +Conceptual lifecycle diagram: +``` +Plan → Deploy → Readiness → Drive Workloads → Observe → Evaluate → Cleanup +``` + +Mermaid view: +```mermaid +flowchart LR + P[Plan
topology + workloads + expectations] --> D[Deploy
runner provisions] + D --> R[Readiness
wait for nodes] + R --> W[Drive Workloads] + W --> O[Observe
blocks/metrics] + O --> E[Evaluate Expectations] + E --> C[Cleanup] +``` + +### Design Rationale +- **Modular crates** keep configuration, orchestration, workloads, and runners decoupled so each can evolve without breaking the others. +- **Pluggable runners** let the same scenario run on a laptop, a Docker host, or a Kubernetes cluster, making validation portable across environments. +- **Separated workloads and expectations** clarify intent: what traffic to generate versus how to judge success. This simplifies review and reuse. +- **Declarative topology** makes cluster shape explicit and repeatable, reducing surprise when moving between CI and developer machines. +- **Maintainability through predictability**: a clear flow from plan to deployment to verification lowers the cost of extending the framework and interpreting failures. + +## Part II — User Guide + +### Workspace Layout +The workspace focuses on multi-node integration testing and sits alongside a `nomos-node` checkout. Its crates separate concerns to keep scenarios repeatable and portable: + +- **Configs**: prepares high-level node, network, tracing, and wallet settings used across test environments. +- **Core scenario orchestration**: the engine that holds topology descriptions, scenario plans, runtimes, workloads, and expectations. +- **Workflows**: ready-made workloads (transactions, data-availability, chaos) and reusable expectations assembled into a user-facing DSL. +- **Runners**: deployment backends for local processes, Docker Compose, and Kubernetes, all consuming the same scenario plan. +- **Test workflows**: example scenarios and integration checks that show how the pieces fit together. + +This split keeps configuration, orchestration, reusable traffic patterns, and deployment adapters loosely coupled while sharing one mental model for tests. + +### Annotated Tree +High-level view of the workspace and how pieces relate: +``` +nomos-testing/ +├─ testing-framework/ +│ ├─ configs/ # shared configuration helpers +│ ├─ core/ # scenario model, runtime, topology +│ ├─ workflows/ # workloads, expectations, DSL extensions +│ └─ runners/ # local, compose, k8s deployment backends +├─ tests/ # integration scenarios using the framework +└─ scripts/ # supporting setup utilities (e.g., assets) +``` + +Each area maps to a responsibility: describe configs, orchestrate scenarios, package common traffic and assertions, adapt to environments, and demonstrate end-to-end usage. + +### Authoring Scenarios +Creating a scenario is a declarative exercise: + +1. **Shape the topology**: decide how many validators and executors to run, and what high-level network and data-availability characteristics matter for the test. +2. **Attach workloads**: pick traffic generators that align with your goals (transactions, data-availability blobs, or chaos for resilience probes). +3. **Define expectations**: specify the health signals that must hold when the run finishes (e.g., consensus liveness, inclusion of submitted activity; see [Core Content: Workloads & Expectations](workloads.md)). +4. **Set duration**: choose a run window long enough to observe meaningful block progression and the effects of your workloads. +5. **Choose a runner**: target local processes for fast iteration, Docker Compose for reproducible multi-node stacks, or Kubernetes for cluster-grade validation. For environment considerations, see [Operations](operations.md). 
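+
+Expressed with the builder DSL (see the DSL Cheat Sheet in the appendix), the
+five steps map onto a plan like the sketch below. Method names follow the
+patterns documented in this book; the import paths and the exact runner
+handoff are illustrative, not authoritative:
+
+```rust
+use std::time::Duration;
+
+// Illustrative paths; check the workflows and runner crates for real exports.
+use testing_framework_workflows::ScenarioBuilder;
+use testing_framework_runners::LocalDeployer;
+
+async fn two_validator_smoke() -> Result<(), Box<dyn std::error::Error>> {
+    // 1. Shape the topology: two validators, no executors.
+    let mut scenario = ScenarioBuilder::with_node_counts(2, 0)
+        // 2. Attach a workload: light transaction traffic from seeded wallets.
+        .wallets(4)
+        .transactions()
+        .rate(2)
+        .users(4)
+        .apply()
+        // 3. Define expectations: block production must stay live.
+        .expect_consensus_liveness()
+        // 4. Set a run window long enough to observe several blocks.
+        .with_run_duration(Duration::from_secs(60))
+        .build();
+
+    // 5. Choose a runner: local processes give the fastest feedback loop.
+    let runner = LocalDeployer::default().deploy(&scenario).await?;
+    runner.run(&mut scenario).await?; // pass/fail comes from expectations
+    Ok(())
+}
+```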
+ +Keep scenarios small and explicit: make the intended behavior and the success criteria clear so failures are easy to interpret and act upon. + +### Core Content: Workloads & Expectations +Workloads describe the activity a scenario generates; expectations describe the signals that must hold when that activity completes. Both are pluggable so scenarios stay readable and purpose-driven. + +#### Workloads +- **Transaction workload**: submits user-level transactions at a configurable rate and can limit how many distinct actors participate. +- **Data-availability workload**: drives blob and channel activity to exercise data-availability paths. +- **Chaos workload**: triggers controlled node restarts to test resilience and recovery behaviors (requires a runner that can control nodes). + +#### Expectations +- **Consensus liveness**: verifies the system continues to produce blocks in line with the planned workload and timing window. +- **Workload-specific checks**: each workload can attach its own success criteria (e.g., inclusion of submitted activity) so scenarios remain concise. + +Together, workloads and expectations let you express both the pressure applied to the system and the definition of “healthy” for that run. + +Workload pipeline (conceptual): +``` +Inputs (topology + wallets + rates) + │ + ▼ +Workload init → Drive traffic → Collect signals + │ + ▼ + Expectations evaluate +``` + +Mermaid view: +```mermaid +flowchart TD + I[Inputs
(topology + wallets + rates)] --> Init[Workload init] + Init --> Drive[Drive traffic] + Drive --> Collect[Collect signals] + Collect --> Eval[Expectations evaluate] +``` + +### Core Content: ScenarioBuilderExt Patterns +Patterns that keep scenarios readable and reusable: + +- **Topology-first**: start by shaping the cluster (counts, layout) so later steps inherit a clear foundation. +- **Bundle defaults**: use the DSL helpers to attach common expectations (like liveness) whenever you add a matching workload, reducing forgotten checks. +- **Intentional rates**: express traffic in per-block terms to align with protocol timing rather than wall-clock assumptions. +- **Opt-in chaos**: enable restart patterns only in scenarios meant to probe resilience; keep functional smoke tests deterministic. +- **Wallet clarity**: seed only the number of actors you need; it keeps transaction scenarios deterministic and interpretable. + +These patterns make scenario definitions self-explanatory while staying aligned with the framework’s block-oriented timing model. + +### Best Practices +- **State your intent**: document the goal of each scenario (throughput, DA validation, resilience) so expectation choices are obvious. +- **Keep runs meaningful**: choose durations that allow multiple blocks and make timing-based assertions trustworthy. +- **Separate concerns**: start with deterministic workloads for functional checks; add chaos in dedicated resilience scenarios to avoid noisy failures. +- **Reuse patterns**: standardize on shared topology and workload presets so results are comparable across environments and teams. +- **Observe first, tune second**: rely on liveness and inclusion signals to interpret outcomes before tweaking rates or topology. +- **Environment fit**: pick runners that match the feedback loop you need—local for speed, compose for reproducible stacks, k8s for cluster-grade fidelity. +- **Minimal surprises**: seed only necessary wallets and keep configuration deltas explicit when moving between CI and developer machines. + +### Examples +Concrete scenario shapes that illustrate how to combine topologies, workloads, and expectations. Adjust counts, rates, and durations to fit your environment. + +#### Simple 2-validator transaction workload +- **Topology**: two validators. +- **Workload**: transaction submissions at a modest per-block rate with a small set of wallet actors. +- **Expectations**: consensus liveness and inclusion of submitted activity. +- **When to use**: smoke tests for consensus and transaction flow on minimal hardware. + +#### DA + transaction workload +- **Topology**: validators plus executors if available. +- **Workloads**: data-availability blobs/channels and transactions running together to stress both paths. +- **Expectations**: consensus liveness and workload-level inclusion/availability checks. +- **When to use**: end-to-end coverage of transaction and DA layers in one run. + +#### Chaos + liveness check +- **Topology**: validators (optionally executors) with node control enabled. +- **Workloads**: baseline traffic (transactions or DA) plus chaos restarts on selected roles. +- **Expectations**: consensus liveness to confirm the system keeps progressing despite restarts; workload-specific inclusion if traffic is present. +- **When to use**: resilience validation and operational readiness drills. + +### Advanced & Artificial Examples +These illustrative scenarios stretch the framework to show how to build new workloads, expectations, deployers, and topology tricks. 
They are intentionally “synthetic” to teach capabilities rather than prescribe production tests. + +#### Synthetic Delay Workload (Network Latency Simulation) +- **Idea**: inject fake latency between node interactions using internal timers, not OS-level tooling. +- **Demonstrates**: sequencing control inside a workload, verifying protocol progression under induced lag, using timers to pace submissions. +- **Shape**: wrap submissions in delays that mimic slow peers; ensure the expectation checks blocks still progress. + +#### Oscillating Load Workload (Traffic Waves) +- **Idea**: traffic rate changes every block or N seconds (e.g., blocks 1–3 low, 4–5 high, 6–7 zero, repeat). +- **Demonstrates**: dynamic, stateful workloads that use `RunMetrics` to time phases; modeling real-world burstiness. +- **Shape**: schedule per-phase rates; confirm inclusion/liveness across peaks and troughs. + +#### Byzantine Behavior Mock +- **Idea**: a workload that drops half its planned submissions, sometimes double-submits, and intentionally triggers expectation failures. +- **Demonstrates**: negative testing, resilience checks, and the value of clear expectations when behavior is adversarial by design. +- **Shape**: parameterize drop/double-submit probabilities; pair with an expectation that documents what “bad” looks like. + +#### Custom Expectation: Block Finality Drift +- **Idea**: assert the last few blocks differ and block time stays within a tolerated drift budget. +- **Demonstrates**: consuming `BlockFeed` or time-series metrics to validate protocol cadence; crafting post-run assertions around block diversity and timing. +- **Shape**: collect recent blocks, confirm no duplicates, and compare observed intervals to a drift threshold. + +#### Custom Deployer: Dry-Run Deployer +- **Idea**: a deployer that never starts nodes; it emits configs, simulates readiness, provides fake blockfeed/metrics. +- **Demonstrates**: full power of the deployer interface for CI dry-runs, config verification, and ultra-fast feedback without Nomos binaries. +- **Shape**: produce logs/artifacts, stub readiness, and feed synthetic blocks so expectations can still run. + +#### Stochastic Topology Generator +- **Idea**: topology parameters change at runtime (random validators, DA settings, network shapes). +- **Demonstrates**: randomized property testing and fuzzing approaches to topology building. +- **Shape**: pick roles and network layouts randomly per run; keep expectations tolerant to variability while still asserting core liveness. + +#### Multi-Phase Scenario (“Pipelines”) +- **Idea**: scenario runs in phases (e.g., phase 1 transactions, phase 2 DA, phase 3 restarts, phase 4 sync check). +- **Demonstrates**: multi-stage tests, modular scenario assembly, and deliberate lifecycle control. +- **Shape**: drive phase-specific workloads/expectations sequentially; enforce clear boundaries and post-phase checks. + +### Running Scenarios +Running a scenario follows the same conceptual flow regardless of environment: + +1. Select or author a scenario plan that pairs a topology with workloads, expectations, and a suitable run window. +2. Choose a runner aligned with your environment (local, compose, or k8s) and ensure its prerequisites are available. +3. Deploy the plan through the runner; wait for readiness signals before starting workloads. +4. Let workloads drive activity for the planned duration; keep observability signals visible so you can correlate outcomes. +5. 
Evaluate expectations and capture results as the primary pass/fail signal. + +Use the same plan across different runners to compare behavior between local development and CI or cluster settings. For environment prerequisites and flags, see [Operations](operations.md). + +### Runners +Runners turn a scenario plan into a live environment while keeping the plan unchanged. Choose based on feedback speed, reproducibility, and fidelity. For environment and operational considerations, see [Operations](operations.md): + +#### Local runner +- Launches node processes directly on the host. +- Fastest feedback loop and minimal orchestration overhead. +- Best for development-time iteration and debugging. + +#### Docker Compose runner +- Starts nodes in containers to provide a reproducible multi-node stack on a single machine. +- Discovers service ports and wires observability for convenient inspection. +- Good balance between fidelity and ease of setup. + +#### Kubernetes runner +- Deploys nodes onto a cluster for higher-fidelity, longer-running scenarios. +- Suits CI or shared environments where cluster behavior and scheduling matter. + +#### Common expectations +- All runners require at least one validator and, for transaction scenarios, access to seeded wallets. +- Readiness probes gate workload start so traffic begins only after nodes are reachable. +- Environment flags can relax timeouts or increase tracing when diagnostics are needed. + +Runner abstraction: +``` +Scenario Plan + │ + ▼ +Runner (local | compose | k8s) + │ provisions env + readiness + ▼ +Runtime + Observability + │ + ▼ +Workloads / Expectations execute +``` + +Mermaid view: +```mermaid +flowchart TD + Plan[Scenario Plan] --> RunSel{Runner
(local | compose | k8s)} + RunSel --> Provision[Provision & readiness] + Provision --> Runtime[Runtime + observability] + Runtime --> Exec[Workloads & Expectations execute] +``` + +### Operations +Operational readiness focuses on prerequisites, environment fit, and clear signals: + +- **Prerequisites**: keep a sibling `nomos-node` checkout available; ensure the chosen runner’s platform needs are met (local binaries for host runs, Docker for compose, cluster access for k8s). +- **Artifacts**: some scenarios depend on prover or circuit assets; fetch them ahead of time with the provided helper scripts when needed. +- **Environment flags**: use slow-environment toggles to relax timeouts, enable tracing when debugging, and adjust observability ports to avoid clashes. +- **Readiness checks**: verify runners report node readiness before starting workloads; this avoids false negatives from starting too early. +- **Failure triage**: map failures to missing prerequisites (wallet seeding, node control availability), runner platform issues, or unmet expectations. Start with liveness signals, then dive into workload-specific assertions. + +Treat operational hygiene—assets present, prerequisites satisfied, observability reachable—as the first step to reliable scenario outcomes. + +Metrics and observability flow: +``` +Runner exposes endpoints/ports + │ + ▼ +Runtime collects block/health signals + │ + ▼ +Expectations consume signals to decide pass/fail + │ + ▼ +Operators inspect logs/metrics when failures arise +``` + +Mermaid view: +```mermaid +flowchart TD + Expose[Runner exposes endpoints/ports] --> Collect[Runtime collects block/health signals] + Collect --> Consume[Expectations consume signals
decide pass/fail] + Consume --> Inspect[Operators inspect logs/metrics
when failures arise] +``` + +## Part III — Developer Reference + +### Scenario Model (Developer Level) +The scenario model defines clear, composable responsibilities: + +- **Topology**: a declarative description of the cluster—how many nodes, their roles, and the broad network and data-availability characteristics. It represents the intended shape of the system under test. +- **Scenario**: a plan combining topology, workloads, expectations, and a run window. Building a scenario validates prerequisites (like seeded wallets) and ensures the run lasts long enough to observe meaningful block progression. +- **Workloads**: asynchronous tasks that generate traffic or conditions. They use shared context to interact with the deployed cluster and may bundle default expectations. +- **Expectations**: post-run assertions. They can capture baselines before workloads start and evaluate success once activity stops. +- **Runtime**: coordinates workloads and expectations for the configured duration, enforces cooldowns when control actions occur, and ensures cleanup so runs do not leak resources. + +Developers extending the model should keep these boundaries strict: topology describes, scenarios assemble, runners deploy, workloads drive, and expectations judge outcomes. For guidance on adding new capabilities, see [Extending the Framework](extending.md). + +### Extending the Framework +#### Adding a workload +1) Implement the workload contract: provide a name, optional bundled expectations, validate prerequisites up front, and drive asynchronous activity against the deployed cluster. +2) Export it through the workflows layer and consider adding DSL helpers for ergonomic wiring. + +#### Adding an expectation +1) Implement the expectation contract: capture baselines if needed and evaluate outcomes after workloads finish; report meaningful errors to aid debugging. +2) Expose reusable expectations from the workflows layer so scenarios can attach them declaratively. + +#### Adding a runner +1) Implement the deployer contract for the target backend, producing a runtime context with client access, metrics endpoints, and optional node control. +2) Preserve cleanup guarantees so resources are reclaimed even when runs fail; mirror readiness and observation signals used by existing runners for consistency. + +#### Adding topology helpers +Extend the topology description with new layouts or presets while keeping defaults safe and predictable; favor declarative inputs over ad hoc logic so scenarios stay reviewable. + +### Internal Crate Reference +High-level roles of the crates that make up the framework: + +- **Configs**: prepares reusable configuration primitives for nodes, networking, tracing, data availability, and wallets, shared by all scenarios and runners. +- **Core scenario orchestration**: houses the topology and scenario model, runtime coordination, node clients, and readiness/health probes. +- **Workflows**: packages workloads and expectations into reusable building blocks and offers a fluent DSL to assemble them. +- **Runners**: implements deployment backends (local host, Docker Compose, Kubernetes) that all consume the same scenario plan. +- **Test workflows**: example scenarios and integration checks that exercise the framework end to end and serve as living documentation. + +Use this map to locate where to add new capabilities: configuration primitives in configs, orchestration changes in core, reusable traffic/assertions in workflows, environment adapters in runners, and demonstrations in tests. 
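+
+To ground the runner-extension steps above, here is a compressed sketch of the
+“dry-run deployer” idea from the advanced examples. The `Deployer` trait shape
+is an assumption modeled on how `deploy(&scenario)` is used throughout this
+book, and the `Scenario`/`Runner` stand-ins below are local placeholders, not
+the framework’s real types:
+
+```rust
+use async_trait::async_trait;
+
+// Local placeholders so the sketch stands alone; in the framework these are
+// the real plan and runtime types from the core and runner crates.
+type DynError = Box<dyn std::error::Error + Send + Sync>;
+struct Scenario; // stand-in for the scenario plan
+struct Runner;   // stand-in for the live runtime handle
+
+// Assumed contract: turn a plan into a runtime the scenario can drive.
+#[async_trait]
+trait Deployer {
+    async fn deploy(&self, scenario: &Scenario) -> Result<Runner, DynError>;
+}
+
+/// A deployer that never starts nodes: it would emit configs, stub readiness,
+/// and feed synthetic blocks so expectations can still run.
+struct DryRunDeployer;
+
+#[async_trait]
+impl Deployer for DryRunDeployer {
+    async fn deploy(&self, _scenario: &Scenario) -> Result<Runner, DynError> {
+        // 1. Render node configs from the generated topology as artifacts.
+        // 2. Report readiness immediately; there are no processes to await.
+        // 3. Hand back a runtime wired to a synthetic block feed.
+        Ok(Runner)
+    }
+}
+```
+
+A real implementation must also preserve the cleanup guarantees described
+above so failed runs do not leak resources.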
+ +### Example: New Workload & Expectation (Rust) +A minimal, end-to-end illustration of adding a custom workload and matching expectation. This shows the shape of the traits and where to plug into the framework; expand the logic to fit your real test. + +#### Workload: simple reachability probe +Key ideas: +- **name**: identifies the workload in logs. +- **expectations**: workloads can bundle defaults so callers don’t forget checks. +- **init**: derive inputs from the generated topology (e.g., pick a target node). +- **start**: drive async activity using the shared `RunContext`. + +```rust +use std::sync::Arc; +use async_trait::async_trait; +use testing_framework_core::scenario::{ + DynError, Expectation, RunContext, RunMetrics, Workload, +}; +use testing_framework_core::topology::GeneratedTopology; + +pub struct ReachabilityWorkload { + target_idx: usize, + bundled: Vec>, +} + +impl ReachabilityWorkload { + pub fn new(target_idx: usize) -> Self { + Self { + target_idx, + bundled: vec![Box::new(ReachabilityExpectation::new(target_idx))], + } + } +} + +#[async_trait] +impl Workload for ReachabilityWorkload { + fn name(&self) -> &'static str { + "reachability_workload" + } + + fn expectations(&self) -> Vec> { + self.bundled.clone() + } + + fn init( + &mut self, + topology: &GeneratedTopology, + _metrics: &RunMetrics, + ) -> Result<(), DynError> { + if topology.validators().get(self.target_idx).is_none() { + return Err("no validator at requested index".into()); + } + Ok(()) + } + + async fn start(&self, ctx: &RunContext) -> Result<(), DynError> { + let client = ctx + .clients() + .validators() + .get(self.target_idx) + .ok_or("missing target client")?; + + // Pseudo-action: issue a lightweight RPC to prove reachability. + client.health_check().await.map_err(|e| e.into()) + } +} +``` + +#### Expectation: confirm the target stayed reachable +Key ideas: +- **start_capture**: snapshot baseline if needed (not used here). +- **evaluate**: assert the condition after workloads finish. + +```rust +use async_trait::async_trait; +use testing_framework_core::scenario::{DynError, Expectation, RunContext}; + +pub struct ReachabilityExpectation { + target_idx: usize, +} + +impl ReachabilityExpectation { + pub fn new(target_idx: usize) -> Self { + Self { target_idx } + } +} + +#[async_trait] +impl Expectation for ReachabilityExpectation { + fn name(&self) -> &str { + "target_reachable" + } + + async fn evaluate(&mut self, ctx: &RunContext) -> Result<(), DynError> { + let client = ctx + .clients() + .validators() + .get(self.target_idx) + .ok_or("missing target client")?; + + client.health_check().await.map_err(|e| { + format!("target became unreachable during run: {e}").into() + }) + } +} +``` + +#### How to wire it +- Build your scenario as usual and call `.with_workload(ReachabilityWorkload::new(0))`. +- The bundled expectation is attached automatically; you can add more with `.with_expectation(...)` if needed. +- Keep the logic minimal and fast for smoke tests; grow it into richer probes for deeper scenarios. + +## Part IV — Appendix + +### DSL Cheat Sheet +The framework offers a fluent builder style to keep scenarios readable. Common knobs: + +- **Topology shaping**: set validator and executor counts, pick a network layout style, and adjust high-level data-availability traits. +- **Wallet seeding**: define how many users participate and the total funds available for transaction workloads. 
+- **Workload tuning**: configure transaction rates, data-availability channel and blob rates, and whether chaos restarts should include validators, executors, or both. +- **Expectations**: attach liveness and workload-specific checks so success is explicit. +- **Run window**: set a minimum duration long enough for multiple blocks to be observed and verified. + +Use these knobs to express intent clearly, keeping scenario definitions concise and consistent across teams. + +### Troubleshooting Scenarios +Common symptoms and likely causes: + +- **No or slow block progression**: runner started workloads before readiness, insufficient run window, or environment too slow—extend duration or enable slow-environment tuning. +- **Transactions not included**: missing or insufficient wallet seeding, misaligned transaction rate with block cadence, or network instability—reduce rate and verify wallet setup. +- **Chaos stalls the run**: node control not available for the chosen runner or restart cadence too aggressive—enable control capability and widen restart intervals. +- **Observability gaps**: metrics or logs unreachable because ports clash or services are not exposed—adjust observability ports and confirm runner wiring. +- **Flaky behavior across runs**: mixing chaos with functional smoke tests or inconsistent topology between environments—separate deterministic and chaos scenarios and standardize topology presets. + +### FAQ +**Why block-oriented timing?** +Using block cadence reduces dependence on host speed and keeps assertions aligned with protocol behavior. + +**Can I reuse the same scenario across runners?** +Yes. The plan stays the same; swap runners (local, compose, k8s) to target different environments. + +**When should I enable chaos workloads?** +Only when testing resilience or operational recovery; keep functional smoke tests deterministic. + +**How long should runs be?** +Long enough for multiple blocks so liveness and inclusion checks are meaningful; very short runs risk false confidence. + +**Do I always need seeded wallets?** +Only for transaction scenarios. Data-availability or pure chaos scenarios may not require them, but liveness checks still need validators producing blocks. + +**What if expectations fail but workloads “look fine”?** +Trust expectations first—they capture the intended success criteria. Use the observability signals and runner logs to pinpoint why the system missed the target. + +### Glossary +- **Validator**: node role responsible for participating in consensus and block production. +- **Executor**: node role that processes transactions or workloads delegated by validators. +- **DA (Data Availability)**: subsystem ensuring blobs or channel data are published and retrievable for validation. +- **Workload**: traffic or behavior generator that exercises the system during a scenario run. +- **Expectation**: post-run assertion that judges whether the system met the intended success criteria. +- **Topology**: declarative description of the cluster shape, roles, and high-level parameters for a scenario. +- **Blockfeed**: stream of block observations used for liveness or inclusion signals during a run. +- **Control capability**: the ability for a runner to start, stop, or restart nodes, used by chaos workloads. 
diff --git a/book/nomos_testing_framework_book_v4.md b/book/nomos_testing_framework_book_v4.md new file mode 100644 index 0000000..5c94a95 --- /dev/null +++ b/book/nomos_testing_framework_book_v4.md @@ -0,0 +1,1711 @@ +# Nomos Testing Framework — Complete Reference + +> **GitBook Structure Note**: This document is organized with `` markers indicating how to split for GitBook deployment. + +--- + + + +# Nomos Testing Framework + +A purpose-built toolkit for exercising Nomos in realistic, multi-node environments. + +## Quick Links + +- [5-Minute Quickstart](#5-minute-quickstart) — Get running immediately +- [Foundations](#part-i--foundations) — Core concepts and architecture +- [User Guide](#part-ii--user-guide) — Authoring and running scenarios +- [Developer Reference](#part-iii--developer-reference) — Extending the framework +- [Recipes](#part-v--scenario-recipes) — Copy-paste runnable examples + +## Reading Guide by Role + +| If you are... | Start with... | Then read... | +|---------------|---------------|--------------| +| **Protocol/Core Engineer** | Quickstart → Testing Philosophy | Workloads & Expectations → Recipes | +| **Infra/DevOps** | Quickstart → Runners | Operations → Configuration Sync → Troubleshooting | +| **Test Designer** | Quickstart → Authoring Scenarios | DSL Cheat Sheet → Recipes → Extending | + +## Prerequisites + +This book assumes: + +- Rust competency (async/await, traits, cargo) +- Basic familiarity with Nomos architecture (validators, executors, DA) +- Docker knowledge (for Compose runner) +- Optional: Kubernetes access (for K8s runner) + +--- + + + +# 5-Minute Quickstart + +Get a scenario running in under 5 minutes. + +## Step 1: Clone and Build + +```bash +# Clone the testing framework (assumes nomos-node sibling checkout) +# Note: If the testing framework lives inside the main Nomos monorepo, +# adjust the clone URL and paths accordingly. +git clone https://github.com/logos-co/nomos-testing.git +cd nomos-testing + +# Build the testing framework crates +cargo build -p testing-framework-core -p testing-framework-workflows +``` + +> **Build modes**: Node binaries use `--release` for realistic performance. Framework crates use debug for faster iteration. For pure development speed, you can build everything in debug mode. + +## Step 2: Run the Simplest Scenario + +```bash +# Run a local 2-validator smoke test +cargo test --package tests-workflows --test local_runner -- local_runner_mixed_workloads --nocapture +``` + +## Step 3: What Good Output Looks Like + +``` +running 1 test +[INFO] Spawning validator 0 on port 18800 +[INFO] Spawning validator 1 on port 18810 +[INFO] Waiting for network readiness... +[INFO] Network ready: all peers connected +[INFO] Waiting for membership readiness... +[INFO] Membership ready for session 0 +[INFO] Starting workloads... +[INFO] Transaction workload submitting at 5 tx/block +[INFO] DA workload: channel inscription submitted +[INFO] Block 1 observed: 3 transactions +[INFO] Block 2 observed: 5 transactions +... +[INFO] Workloads complete, evaluating expectations +[INFO] consensus_liveness: target=8, observed heights=[12, 11] ✓ +[INFO] tx_inclusion_expectation: 42/50 included (84%) ✓ +test local_runner_mixed_workloads ... ok +``` + +## Step 4: What Failure Looks Like + +``` +[ERROR] consensus_liveness violated (target=8): +- validator-0 height 2 below target 8 +- validator-1 height 3 below target 8 + +test local_runner_mixed_workloads ... FAILED +``` + +Common causes: run duration too short, readiness not complete, node crashed. 
+ +## Step 5: Modify a Scenario + +Open `tests/workflows/tests/local_runner.rs`: + +```rust +// Change this: +const RUN_DURATION: Duration = Duration::from_secs(60); + +// To this for a longer run: +const RUN_DURATION: Duration = Duration::from_secs(120); + +// Or change validator count: +const VALIDATORS: usize = 3; // was 2 +``` + +Re-run: + +```bash +cargo test --package tests-workflows --test local_runner -- --nocapture +``` + +You're now ready to explore the framework! + +--- + + + +# Part I — Foundations + +## Introduction + +The Nomos Testing Framework solves the gap between small, isolated unit tests and full-system validation by letting teams: + +1. **Describe** a cluster layout (topology) +2. **Drive** meaningful traffic (workloads) +3. **Assert** outcomes (expectations) + +...all in one coherent, portable plan (a `Scenario` in code terms). + +### Why Multi-Node Testing? + +Many Nomos behaviors only emerge when multiple roles interact: + +``` +┌─────────────────────────────────────────────────────────────────┐ +│ BEHAVIORS REQUIRING MULTI-NODE │ +├─────────────────────────────────────────────────────────────────┤ +│ • Block progression across validators │ +│ • Data availability sampling and dispersal │ +│ • Consensus under network partitions │ +│ • Liveness recovery after node restarts │ +│ • Transaction propagation and inclusion │ +│ • Membership and session transitions │ +└─────────────────────────────────────────────────────────────────┘ +``` + +Unit tests can't catch these. This framework makes multi-node checks declarative, observable, and repeatable. + +### Target Audience + +| Role | Primary Concerns | +|------|------------------| +| **Protocol Engineers** | Consensus correctness, DA behavior, block progression | +| **Infrastructure/DevOps** | Runners, CI integration, logs, failure triage | +| **QA/Test Designers** | Scenario composition, workload tuning, coverage | + +--- + + + +## Architecture Overview + +The framework follows a clear pipeline: + +``` +┌──────────┐ ┌──────────┐ ┌──────────┐ ┌──────────┐ ┌─────────────┐ +│ TOPOLOGY │───▶│ SCENARIO │───▶│ RUNNER │───▶│ WORKLOADS│───▶│EXPECTATIONS │ +│ │ │ │ │ │ │ │ │ │ +│ Shape │ │ Assemble │ │ Deploy & │ │ Drive │ │ Verify │ +│ cluster │ │ plan │ │ wait │ │ traffic │ │ outcomes │ +└──────────┘ └──────────┘ └──────────┘ └──────────┘ └─────────────┘ +``` + +### Component Responsibilities + +| Component | Responsibility | Key Types | +|-----------|----------------|-----------| +| **Topology** | Declares cluster shape: node counts, network layout, DA parameters | `TopologyConfig`, `GeneratedTopology`, `TopologyBuilder` | +| **Scenario** | Assembles topology + workloads + expectations + duration | `Scenario`, `ScenarioBuilder` | +| **Runner** | Deploys to environment, waits for readiness, provides `RunContext` | `Runner`, `LocalDeployer`, `ComposeRunner`, `K8sRunner` | +| **Workloads** | Generate traffic/conditions during the run | `Workload` trait, `TransactionWorkload`, `DaWorkload`, `RandomRestartWorkload` | +| **Expectations** | Judge success/failure after workloads complete | `Expectation` trait, `ConsensusLiveness`, `TxInclusionExpectation` | + +### Type Flow Diagram + +``` +TopologyConfig + │ + │ TopologyBuilder::new() + ▼ +TopologyBuilder ──.build()──▶ GeneratedTopology + │ + │ contains + ▼ + GeneratedNodeConfig[] + │ + │ Runner spawns + ▼ + Topology (live nodes) + │ + │ provides + ▼ + NodeClients + │ + │ wrapped in + ▼ + RunContext +``` + +``` +ScenarioBuilder + │ + │ .with_workload() / .with_expectation() / 
.with_run_duration() + │ + │ .build() + ▼ +Scenario + │ + │ Deployer::deploy() + ▼ +Runner + │ + │ .run(&mut scenario) + ▼ +RunHandle (success) or ScenarioError (failure) +``` + +--- + + + +## Testing Philosophy + +### Core Principles + +1. **Declarative over imperative** + - Describe desired state, let framework orchestrate + - Scenarios are data, not scripts + +2. **Observable health signals** + - Prefer liveness/inclusion signals over internal debug state + - If users can't see it, don't assert on it + +3. **Determinism first** + - Fixed topologies and traffic rates by default + - Variability is opt-in (chaos workloads) + +4. **Protocol time, not wall time** + - Reason in blocks and slots + - Reduces host speed dependence + +5. **Minimum run window** + - Always allow enough blocks for meaningful assertions + - Framework enforces minimum 2 blocks + +6. **Chaos with intent** + - Chaos workloads for resilience testing only + - Avoid chaos in basic functional smoke tests; reserve it for dedicated resilience scenarios + +### Testing Spectrum + +``` +┌────────────────────────────────────────────────────────────────┐ +│ WHERE THIS FRAMEWORK FITS │ +├──────────────┬────────────────────┬────────────────────────────┤ +│ UNIT TESTS │ INTEGRATION │ MULTI-NODE SCENARIOS │ +│ │ │ │ +│ Fast │ Single process │ ◀── THIS FRAMEWORK │ +│ Isolated │ Mock network │ │ +│ Deterministic│ No real timing │ Real networking │ +│ │ │ Protocol timing │ +│ ~1000s/sec │ ~100s/sec │ ~1-10/hour │ +└──────────────┴────────────────────┴────────────────────────────┘ +``` + +--- + + + +## Scenario Lifecycle + +### Phase Overview + +``` +┌─────────┐ ┌─────────┐ ┌───────────┐ ┌─────────┐ ┌──────────┐ ┌──────────┐ ┌─────────┐ +│ PLAN │──▶│ DEPLOY │──▶│ READINESS │──▶│ DRIVE │──▶│ COOLDOWN │──▶│ EVALUATE │──▶│ CLEANUP │ +└─────────┘ └─────────┘ └───────────┘ └─────────┘ └──────────┘ └──────────┘ └─────────┘ +``` + +### Detailed Timeline + +``` +Time ──────────────────────────────────────────────────────────────────────▶ + + │ PLAN │ DEPLOY │ READY │ WORKLOADS │COOL│ EVAL │ + │ │ │ │ │DOWN│ │ + │ Build │ Spawn │ Network │ Traffic runs │ │Check │ + │ scenario │ nodes │ DA │ Blocks produce │ 5× │ all │ + │ │ (local/ │ Member │ │blk │expect│ + │ │ docker/k8s) │ ship │ │ │ │ + │ │ │ │ │ │ │ + ▼ ▼ ▼ ▼ ▼ ▼ ▼ + t=0 t=5s t=30s t=35s t=95s t=100s t=105s + │ + (example │ + 60s run) ▼ + CLEANUP +``` + +### Phase Details + +| Phase | What Happens | Code Entry Point | +|-------|--------------|------------------| +| **Plan** | Declare topology, attach workloads/expectations, set duration | `ScenarioBuilder::build()` | +| **Deploy** | Runner provisions environment | `deployer.deploy(&scenario)` | +| **Readiness** | Wait for network peers, DA balancer, membership | `wait_network_ready()`, `wait_membership_ready()`, `wait_da_balancer_ready()` | +| **Drive** | Workloads run concurrently for configured duration | `workload.start(ctx)` inside `Runner::run_workloads()` | +| **Cooldown** | Stabilization period (5× block interval, 30s min if chaos used) | Automatic in `Runner::cooldown()` | +| **Evaluate** | All expectations run; failures **aggregated** (not short-circuited) | `expectation.evaluate(ctx)` | +| **Cleanup** | Resources reclaimed via `CleanupGuard` | `Drop` impl on `Runner` | + +### Readiness Phases (Detail) + +Runners perform three distinct readiness checks: + +``` +┌─────────────────┐ ┌─────────────────┐ ┌─────────────────┐ +│ NETWORK │────▶│ MEMBERSHIP │────▶│ DA BALANCER │ +│ │ │ │ │ │ +│ libp2p peers │ │ Session 0 │ │ Dispersal peers 
│ +│ connected │ │ assignments │ │ available │ +│ │ │ propagated │ │ │ +│ Timeout: 60s │ │ Timeout: 60s │ │ Timeout: 60s │ +└─────────────────┘ └─────────────────┘ └─────────────────┘ +``` + +--- + + + +# Part II — User Guide + +## Authoring Scenarios + +### The 5-Step Process + +``` +┌─────────────────────────────────────────────────────────────────┐ +│ SCENARIO AUTHORING FLOW │ +├─────────────────────────────────────────────────────────────────┤ +│ │ +│ 1. SHAPE TOPOLOGY 2. ATTACH WORKLOADS │ +│ ┌─────────────┐ ┌─────────────┐ │ +│ │ Validators │ │ Transactions│ │ +│ │ Executors │ │ DA blobs │ │ +│ │ Network │ │ Chaos │ │ +│ │ DA params │ └─────────────┘ │ +│ └─────────────┘ │ +│ │ +│ 3. DEFINE EXPECTATIONS 4. SET DURATION │ +│ ┌─────────────┐ ┌─────────────┐ │ +│ │ Liveness │ │ See duration│ │ +│ │ Inclusion │ │ heuristics │ │ +│ │ Custom │ │ table below │ │ +│ └─────────────┘ └─────────────┘ │ +│ │ +│ 5. CHOOSE RUNNER │ +│ ┌─────────┐ ┌─────────┐ ┌─────────┐ │ +│ │ Local │ │ Compose │ │ K8s │ │ +│ └─────────┘ └─────────┘ └─────────┘ │ +│ │ +└─────────────────────────────────────────────────────────────────┘ +``` + +### Duration Heuristics + +Use protocol time (blocks), not wall time. With default 2-second slots and active slot coefficient of 0.9, expect roughly one block every ~2–3 seconds (subject to randomness). Individual topologies may override these defaults. + +| Scenario Type | Min Blocks | Recommended Duration | Notes | +|---------------|------------|---------------------|-------| +| Smoke test | 5-10 | 30-60s | Quick validation | +| Tx throughput | 20-50 | 2-3 min | Capture steady state | +| DA + tx combined | 30-50 | 3-5 min | Observe interaction | +| Chaos/resilience | 50-100 | 5-10 min | Allow restart recovery | +| Long-run stability | 100+ | 10-30 min | Trend validation | + +> **Note**: The framework enforces a minimum of 2 blocks. Very short durations are clamped automatically. + +### Builder Pattern Overview + +```rust +ScenarioBuilder::with_node_counts(validators, executors) + // 1. Topology sub-builder + .topology() + .network_star() + .validators(n) + .executors(n) + .apply() // Returns to main builder + + // 2. Wallet seeding + .wallets(user_count) + + // 3. Workload sub-builders + .transactions() + .rate(per_block) + .users(actors) + .apply() + + .da() + .channel_rate(n) + .blob_rate(n) + .apply() + + // 4. Optional chaos (changes Caps type) + .enable_node_control() + .chaos_random_restart() + .validators(true) + .executors(true) + .min_delay(Duration) + .max_delay(Duration) + .target_cooldown(Duration) + .apply() + + // 5. Duration and expectations + .with_run_duration(duration) + .expect_consensus_liveness() + + // 6. Build + .build() +``` + +--- + + + +## Workloads + +Workloads generate traffic and conditions during a scenario run. + +### Available Workloads + +| Workload | Purpose | Key Config | Bundled Expectation | +|----------|---------|------------|---------------------| +| **Transaction** | Submit transactions at configurable rate | `rate`, `users` | `TxInclusionExpectation` | +| **DA** | Create channels, publish blobs | `channel_rate`, `blob_rate` | `DaWorkloadExpectation` | +| **Chaos** | Restart nodes randomly | `min_delay`, `max_delay`, `target_cooldown` | None (use `ConsensusLiveness`) | + +### Transaction Workload + +Submits user-level transactions at a configurable rate. 
+ +```rust +.transactions() + .rate(5) // 5 transactions per block opportunity + .users(8) // Use 8 distinct wallet actors + .apply() +``` + +**Requires**: Seeded wallets (`.wallets(n)`) + +### DA Workload + +Drives data-availability paths: channel inscriptions and blob publishing. + +```rust +.da() + .channel_rate(1) // 1 channel operation per block + .blob_rate(1) // 1 blob per channel + .apply() +``` + +**Requires**: At least one executor for blob publishing. + +### Chaos Workload + +Triggers controlled node restarts to test resilience. + +```rust +.enable_node_control() // Required capability +.chaos_random_restart() + .validators(true) // Include validators + .executors(true) // Include executors + .min_delay(Duration::from_secs(45)) // Min time between restarts + .max_delay(Duration::from_secs(75)) // Max time between restarts + .target_cooldown(Duration::from_secs(120)) // Per-node cooldown + .apply() +``` + +**Safety behavior**: If only one validator is configured, the chaos workload automatically skips validator restarts to avoid halting consensus. + +**Cooldown behavior**: After chaos workloads, the runner adds a minimum 30-second cooldown before evaluating expectations. + +--- + + + +## Expectations + +Expectations are post-run assertions that judge success or failure. + +### Available Expectations + +| Expectation | Asserts | Default Tolerance | +|-------------|---------|-------------------| +| **ConsensusLiveness** | All validators reach minimum block height | 80% of expected blocks | +| **TxInclusionExpectation** | Submitted transactions appear in blocks | 50% inclusion ratio | +| **DaWorkloadExpectation** | Planned channels/blobs were included | 80% inclusion ratio | +| **PrometheusBlockProduction** | Prometheus metrics show block production | Exact minimum | + +### ConsensusLiveness + +The primary health check. Polls each validator's HTTP consensus info. + +```rust +// With default 80% tolerance: +.expect_consensus_liveness() + +// Or with specific minimum: +.with_expectation(ConsensusLiveness::with_minimum(10)) + +// Or with custom tolerance: +.with_expectation(ConsensusLiveness::with_tolerance(0.9)) +``` + +> **Note for advanced users**: There are two `ConsensusLiveness` implementations in the codebase: +> - `testing_framework_workflows::ConsensusLiveness` — HTTP-based, checks heights via `consensus_info()` API. This is what `.expect_consensus_liveness()` uses. +> - `testing_framework_core::scenario::expectations::ConsensusLiveness` — Also HTTP-based but with different tolerance semantics. +> +> There's also `PrometheusBlockProduction` in core for Prometheus-based metrics checks when telemetry is configured. 
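+
+Whichever implementation you attach, custom expectations follow one trait
+shape. A minimal no-op skeleton, with signatures mirroring the worked examples
+elsewhere in this book (`start_capture` can be omitted when no baseline is
+needed):
+
+```rust
+use async_trait::async_trait;
+use testing_framework_core::scenario::{DynError, Expectation, RunContext};
+
+/// No-op skeleton marking where each lifecycle phase below lands.
+pub struct NoopExpectation;
+
+#[async_trait]
+impl Expectation for NoopExpectation {
+    fn name(&self) -> &str {
+        "noop_expectation"
+    }
+
+    // Before workloads start: snapshot a baseline if the check needs one.
+    async fn start_capture(&mut self, _ctx: &RunContext) -> Result<(), DynError> {
+        Ok(())
+    }
+
+    // After workloads (and any cooldown): return Err to fail the scenario.
+    async fn evaluate(&mut self, _ctx: &RunContext) -> Result<(), DynError> {
+        Ok(())
+    }
+}
+```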
+ +### Expectation Lifecycle + +``` +┌─────────────┐ ┌─────────────┐ ┌─────────────┐ +│ init() │────▶│start_capture│────▶│ evaluate() │ +│ │ │ () │ │ │ +│ Validate │ │ Snapshot │ │ Assert │ +│ prereqs │ │ baseline │ │ conditions │ +│ │ │ (optional) │ │ │ +└─────────────┘ └─────────────┘ └─────────────┘ + │ │ │ + ▼ ▼ ▼ + At build() Before workloads After workloads +``` + +### Common Expectation Mistakes + +| Mistake | Why It Fails | Fix | +|---------|--------------|-----| +| Expecting inclusion too soon | Transactions need blocks to be included | Increase duration | +| Wall-clock timing assertions | Host speed varies | Use block counts via `RunMetrics` | +| Duration too short | Not enough blocks observed | Use duration heuristics table | +| Skipping `start_capture()` | Baseline not established | Implement if comparing before/after | +| Asserting on internal state | Framework can't observe it | Use `consensus_info()` or `BlockFeed` | + +--- + + + +## BlockFeed Deep Dive + +The `BlockFeed` is the primary mechanism for observing block production during a run. + +### What BlockFeed Provides + +```rust +pub struct BlockFeed { + // Subscribe to receive block notifications + pub fn subscribe(&self) -> broadcast::Receiver>; + + // Access aggregate statistics + pub fn stats(&self) -> Arc; +} + +pub struct BlockRecord { + pub header: HeaderId, // Block header ID + pub block: Arc>, // Full block with transactions +} + +pub struct BlockStats { + // Total transactions observed across all blocks + pub fn total_transactions(&self) -> u64; +} +``` + +### How It Works + +``` +┌────────────────┐ ┌────────────────┐ ┌────────────────┐ +│ BlockScanner │────▶│ BlockFeed │────▶│ Subscribers │ +│ │ │ │ │ │ +│ Polls validator│ │ broadcast │ │ Workloads │ +│ consensus_info │ │ channel │ │ Expectations │ +│ every 1 second │ │ (1024 buffer) │ │ │ +│ │ │ │ │ │ +│ Fetches blocks │ │ Records stats │ │ │ +│ via storage_ │ │ │ │ │ +│ block() │ │ │ │ │ +└────────────────┘ └────────────────┘ └────────────────┘ +``` + +### Using BlockFeed in Workloads + +```rust +async fn start(&self, ctx: &RunContext) -> Result<(), DynError> { + let mut receiver = ctx.block_feed().subscribe(); + + loop { + match receiver.recv().await { + Ok(record) => { + // Process block + let height = record.block.header().slot().into(); + let tx_count = record.block.transactions().len(); + + // Check for specific transactions + for tx in record.block.transactions() { + // ... 
examine transaction + } + } + Err(broadcast::error::RecvError::Lagged(n)) => { + // Fell behind, n messages skipped + continue; + } + Err(broadcast::error::RecvError::Closed) => { + return Err("block feed closed".into()); + } + } + } +} +``` + +### Using BlockFeed in Expectations + +```rust +async fn start_capture(&mut self, ctx: &RunContext) -> Result<(), DynError> { + let mut receiver = ctx.block_feed().subscribe(); + let observed = Arc::new(Mutex::new(HashSet::new())); + let observed_clone = Arc::clone(&observed); + + // Spawn background task to collect observations + tokio::spawn(async move { + while let Ok(record) = receiver.recv().await { + // Record what we observe + let mut guard = observed_clone.lock().unwrap(); + for tx in record.block.transactions() { + guard.insert(tx.hash()); + } + } + }); + + self.observed = Some(observed); + Ok(()) +} + +async fn evaluate(&mut self, ctx: &RunContext) -> Result<(), DynError> { + let observed = self.observed.as_ref().ok_or("not captured")?; + let guard = observed.lock().unwrap(); + + // Compare observed vs expected + if guard.len() < self.expected_count { + return Err(format!( + "insufficient inclusions: {} < {}", + guard.len(), self.expected_count + ).into()); + } + Ok(()) +} +``` + +--- + + + +# Runner: Local + +Runs node binaries as local processes on the host. + +## What It Does + +- Spawns validators/executors directly on the host with ephemeral data dirs. +- Binds HTTP/libp2p ports on localhost; no containers involved. +- Fastest feedback loop; best for unit-level scenarios and debugging. + +## Prerequisites + +- Rust toolchain installed. +- No ports in use on the default ranges (see runner config if you need to override). + +## How to Run + +```bash +cargo test -p tests-workflows --test local_runner -- local_runner_mixed_workloads --nocapture +``` + +Adjust validator/executor counts inside the test file or via the scenario builder. + +## Troubleshooting + +- Port already in use → change base ports in the test or stop the conflicting process. +- Slow start on first run → binaries need to be built; reruns are faster. +- No blocks → ensure workloads enabled and duration long enough (≥60s default). + +--- + + + +# Runner: Docker Compose + +Runs validators/executors in Docker containers using docker-compose. + +## What It Does + +- Builds/pulls the node image, then creates a network and one container per role. +- Uses Compose health checks for readiness, then runs workloads/expectations. +- Cleans up containers and network unless preservation is requested. + +## Prerequisites + +- Docker with the Compose plugin. +- Built node image available locally (default `nomos-testnet:local`). + - Build from repo root: `testnet/scripts/build_test_image.sh` +- Optional env vars: + - `NOMOS_TESTNET_IMAGE` (override tag) + - `COMPOSE_NODE_PAIRS=1x1` (validators x executors) + - `COMPOSE_RUNNER_PRESERVE=1` to keep the stack for inspection + +## How to Run + +```bash +POL_PROOF_DEV_MODE=true COMPOSE_NODE_PAIRS=1x1 \ +cargo test -p tests-workflows compose_runner_mixed_workloads -- --nocapture +``` + +## Troubleshooting + +- Image not found → set `NOMOS_TESTNET_IMAGE` to a built/pulled tag. +- Peers not connecting → inspect `docker compose logs` for validator/executor. +- Stack left behind → `docker compose -p down` and remove the network. + +--- + + + +# Runner: Kubernetes + +Deploys validators/executors as a Helm release into the current Kubernetes context. 
+ +## What It Does + +- Builds/pulls the node image, packages Helm assets, installs into a unique namespace. +- Waits for pod readiness and validator HTTP endpoint, then drives workloads. +- Tears down the namespace unless preservation is requested. + +## Prerequisites + +- kubectl and Helm on PATH; a running Kubernetes cluster/context (e.g., Docker Desktop, kind). +- Docker buildx to build the node image for your arch. +- Built image tag exported: + - Build: `testnet/scripts/build_test_image.sh` (default tag `nomos-testnet:local`) + - Export: `export NOMOS_TESTNET_IMAGE=nomos-testnet:local` +- Optional: `K8S_RUNNER_PRESERVE=1` to keep the namespace for debugging. + +## How to Run + +```bash +NOMOS_TESTNET_IMAGE=nomos-testnet:local \ +cargo test -p tests-workflows demo_k8s_runner_tx_workload -- --nocapture +``` + +## Troubleshooting + +- Timeout waiting for validator HTTP → check pod logs: `kubectl logs -n deploy/validator`. +- No peers/tx inclusion → inspect rendered `/config.yaml` in the pod and cfgsync logs. +- Cleanup stuck → `kubectl delete namespace ` from the preserved namespace name. + +--- + + + +## Runners + +Runners deploy scenarios to different environments. + +### Runner Decision Matrix + +| Goal | Recommended Runner | Why | +|------|-------------------|-----| +| Fast local iteration | `LocalDeployer` | No container overhead | +| Reproducible e2e checks | `ComposeRunner` | Stable multi-node isolation | +| High fidelity / CI | `K8sRunner` | Real cluster behavior | +| Config validation only | Dry-run (future) | Catch errors before nodes | + +### Runner Comparison + +| Aspect | LocalDeployer | ComposeRunner | K8sRunner | +|--------|---------------|---------------|-----------| +| **Speed** | ⚡ Fastest | 🔄 Medium | 🏗️ Slowest | +| **Setup** | Binaries only | Docker daemon | Cluster access | +| **Isolation** | Process-level | Container-level | Pod-level | +| **Port discovery** | Direct | Auto via Docker | NodePort | +| **Node control** | Full | Via container restart | Via pod restart | +| **Observability** | Local files | Container logs | Prometheus + logs | +| **CI suitability** | Dev only | Good | Best | + +### LocalDeployer + +Spawns nodes as host processes. + +```rust +let deployer = LocalDeployer::default(); +// Or skip membership check for faster startup: +let deployer = LocalDeployer::new().with_membership_check(false); + +let runner = deployer.deploy(&scenario).await?; +``` + +### ComposeRunner + +Starts nodes in Docker containers via Docker Compose. + +```rust +let deployer = ComposeRunner::default(); +let runner = deployer.deploy(&scenario).await?; +``` + +**Uses Configuration Sync (cfgsync)** — see Operations section. + +### K8sRunner + +Deploys to a Kubernetes cluster. 
+ +```rust +let deployer = K8sRunner::new(); +let runner = match deployer.deploy(&scenario).await { + Ok(r) => r, + Err(K8sRunnerError::ClientInit { source }) => { + // Cluster unavailable + return; + } + Err(e) => panic!("deployment failed: {e}"), +}; +``` + +--- + + + +## Operations + +### Prerequisites Checklist + +``` +□ nomos-node checkout available (sibling directory) +□ Binaries built: cargo build -p nomos-node -p nomos-executor +□ Runner platform ready: + □ Local: binaries in target/debug/ + □ Compose: Docker daemon running + □ K8s: kubectl configured, cluster accessible +□ KZG prover assets fetched (for DA scenarios) +□ Ports available (default ranges: 18800+, 4400 for cfgsync) +``` + +### Environment Variables + +| Variable | Effect | Default | +|----------|--------|---------| +| `SLOW_TEST_ENV=true` | 2× timeout multiplier for all readiness checks | `false` | +| `NOMOS_TESTS_TRACING=true` | Enable debug tracing output | `false` | +| `NOMOS_TESTS_KEEP_LOGS=1` | Preserve temp directories after run | Delete | +| `NOMOS_TESTNET_IMAGE` | Docker image for Compose/K8s runners | `nomos-testnet:local` | +| `COMPOSE_RUNNER_PRESERVE=1` | Keep Compose resources after run | Delete | +| `TEST_FRAMEWORK_PROMETHEUS_PORT` | Host port for Prometheus (Compose) | `9090` | + +### Configuration Synchronization (cfgsync) + +When running in Docker Compose or Kubernetes, the framework uses **dynamic configuration injection** instead of static config files. + +``` +┌─────────────────┐ ┌─────────────────┐ +│ RUNNER HOST │ │ NODE CONTAINER │ +│ │ │ │ +│ ┌─────────────┐ │ HTTP :4400 │ ┌─────────────┐ │ +│ │ cfgsync │◀├───────────────────┤│ cfgsync │ │ +│ │ server │ │ │ │ client │ │ +│ │ │ │ 1. Request config │ │ │ │ +│ │ Holds │ │ 2. Receive YAML │ │ Fetches │ │ +│ │ generated │ │ 3. 
Start node │ │ config at │ │ +│ │ topology │ │ │ │ startup │ │ +│ └─────────────┘ │ │ └─────────────┘ │ +└─────────────────┘ └─────────────────┘ +``` + +**Why cfgsync?** +- Handles dynamic port discovery +- Injects cryptographic keys +- Supports topology changes without rebuilding images + +**Troubleshooting cfgsync:** + +| Symptom | Cause | Fix | +|---------|-------|-----| +| Containers stuck at startup | cfgsync server unreachable | Check port 4400 is not blocked | +| "connection refused" in logs | Server not started | Verify runner started cfgsync | +| Config mismatch errors | Stale cfgsync template | Clean temp directories | + +--- + + + +# Part IV — Reference + +## Troubleshooting + +### Error Messages and Fixes + +#### Readiness Timeout + +``` +Error: readiness probe failed: timed out waiting for network readiness: + validator#0@18800: 0 peers (expected 1) + validator#1@18810: 0 peers (expected 1) +``` + +**Causes:** +- Nodes not fully started +- Network configuration mismatch +- Ports blocked + +**Fixes:** +- Set `SLOW_TEST_ENV=true` for 2× timeout +- Check node logs for startup errors +- Verify ports are available + +#### Consensus Liveness Violation + +``` +Error: expectations failed: +consensus liveness violated (target=8): +- validator-0 height 2 below target 8 +- validator-1 height 3 below target 8 +``` + +**Causes:** +- Run duration too short +- Node crashed during run +- Consensus stalled + +**Fixes:** +- Increase `with_run_duration()` +- Check node logs for panics +- Verify network connectivity + +#### Transaction Inclusion Below Threshold + +``` +Error: tx_inclusion_expectation: observed 15 below required 25 +``` + +**Causes:** +- Wallet not seeded +- Transaction rate too high +- Mempool full + +**Fixes:** +- Add `.wallets(n)` to scenario +- Reduce `.rate()` in transaction workload +- Increase duration for more blocks + +#### Chaos Workload No Targets + +``` +Error: chaos restart workload has no eligible targets +``` + +**Causes:** +- No validators or executors configured +- Only one validator (skipped for safety) +- Chaos disabled for both roles + +**Fixes:** +- Add more validators (≥2) for chaos +- Enable `.executors(true)` if executors present +- Use different workload for single-validator tests + +#### BlockFeed Closed + +``` +Error: block feed closed while waiting for channel operations +``` + +**Causes:** +- Source validator crashed +- Network partition +- Run ended prematurely + +**Fixes:** +- Check validator logs +- Increase run duration +- Verify readiness completed + +### Log Locations + +| Runner | Log Location | +|--------|--------------| +| Local | Temp directory (printed at startup), or set `NOMOS_TESTS_KEEP_LOGS=1` | +| Compose | `docker logs ` | +| K8s | `kubectl logs ` | + +### Debugging Flow + +``` +┌─────────────────┐ +│ Scenario fails │ +└────────┬────────┘ + ▼ +┌────────────────────────────────────────┐ +│ 1. Check error message category │ +│ - Readiness? → Check startup logs │ +│ - Workload? → Check workload config │ +│ - Expectation? → Check assertions │ +└────────┬───────────────────────────────┘ + ▼ +┌────────────────────────────────────────┐ +│ 2. Check node logs │ +│ - Panics? → Bug in node │ +│ - Connection errors? → Network │ +│ - Config errors? → cfgsync issue │ +└────────┬───────────────────────────────┘ + ▼ +┌────────────────────────────────────────┐ +│ 3. Reproduce with tracing │ +│ NOMOS_TESTS_TRACING=true cargo test │ +└────────┬───────────────────────────────┘ + ▼ +┌────────────────────────────────────────┐ +│ 4. 
Simplify scenario │ +│ - Reduce validators │ +│ - Remove workloads one by one │ +│ - Increase duration │ +└────────────────────────────────────────┘ +``` + +--- + + + +## DSL Cheat Sheet + +### Complete Builder Reference + +```rust +// ═══════════════════════════════════════════════════════════════ +// TOPOLOGY +// ═══════════════════════════════════════════════════════════════ + +ScenarioBuilder::with_node_counts(validators, executors) + + .topology() + .network_star() // Star layout (hub-spoke) + .validators(count) // Validator count + .executors(count) // Executor count + .apply() // Return to main builder + +// ═══════════════════════════════════════════════════════════════ +// WALLET SEEDING +// ═══════════════════════════════════════════════════════════════ + + .wallets(user_count) // Uniform: 100 funds/user + .with_wallet_config(custom) // Custom WalletConfig + +// ═══════════════════════════════════════════════════════════════ +// TRANSACTION WORKLOAD +// ═══════════════════════════════════════════════════════════════ + + .transactions() + .rate(txs_per_block) // NonZeroU64 + .users(actor_count) // NonZeroUsize + .apply() + +// ═══════════════════════════════════════════════════════════════ +// DA WORKLOAD +// ═══════════════════════════════════════════════════════════════ + + .da() + .channel_rate(ops_per_block) // Channel inscriptions + .blob_rate(blobs_per_chan) // Blobs per channel + .apply() + +// ═══════════════════════════════════════════════════════════════ +// CHAOS WORKLOAD (requires .enable_node_control()) +// ═══════════════════════════════════════════════════════════════ + + .enable_node_control() // Required first! + + .chaos_random_restart() + .validators(bool) // Restart validators? + .executors(bool) // Restart executors? + .min_delay(Duration) // Min between restarts + .max_delay(Duration) // Max between restarts + .target_cooldown(Duration) // Per-node cooldown + .apply() + +// ═══════════════════════════════════════════════════════════════ +// DURATION & EXPECTATIONS +// ═══════════════════════════════════════════════════════════════ + + .with_run_duration(Duration) // Clamped to ≥2 blocks + + .expect_consensus_liveness() // Default 80% tolerance + + .with_expectation(custom) // Add custom Expectation + .with_workload(custom) // Add custom Workload + +// ═══════════════════════════════════════════════════════════════ +// BUILD +// ═══════════════════════════════════════════════════════════════ + + .build() // Returns Scenario +``` + +### Quick Patterns + +```rust +// Minimal smoke test +ScenarioBuilder::with_node_counts(2, 0) + .with_run_duration(Duration::from_secs(30)) + .expect_consensus_liveness() + .build() + +// Transaction throughput +ScenarioBuilder::with_node_counts(2, 0) + .wallets(64) + .transactions().rate(10).users(8).apply() + .with_run_duration(Duration::from_secs(120)) + .expect_consensus_liveness() + .build() + +// DA + transactions +ScenarioBuilder::with_node_counts(1, 1) + .wallets(64) + .transactions().rate(5).users(4).apply() + .da().channel_rate(1).blob_rate(1).apply() + .with_run_duration(Duration::from_secs(180)) + .expect_consensus_liveness() + .build() + +// Chaos resilience +ScenarioBuilder::with_node_counts(3, 1) + .enable_node_control() + .wallets(64) + .transactions().rate(3).users(4).apply() + .chaos_random_restart() + .validators(true).executors(true) + .min_delay(Duration::from_secs(45)) + .max_delay(Duration::from_secs(75)) + .target_cooldown(Duration::from_secs(120)) + .apply() + .with_run_duration(Duration::from_secs(300)) + 
.expect_consensus_liveness()
+    .build()
+```
+
+---
+
+
+## API Quick Reference
+
+Signatures below are abbreviated for quick reference; confirm exact generic
+parameters against the crate docs.
+
+### RunContext
+
+```rust
+impl RunContext {
+    // ─────────────────────────────────────────────────────────────
+    // TOPOLOGY ACCESS
+    // ─────────────────────────────────────────────────────────────
+
+    /// Static topology configuration
+    pub fn descriptors(&self) -> &GeneratedTopology;
+
+    /// Live node handles (if available)
+    pub fn topology(&self) -> Option<&Topology>;
+
+    // ─────────────────────────────────────────────────────────────
+    // CLIENT ACCESS
+    // ─────────────────────────────────────────────────────────────
+
+    /// All node clients
+    pub fn node_clients(&self) -> &NodeClients;
+
+    /// Random node client
+    pub fn random_node_client(&self) -> Option<&ApiClient>;
+
+    /// Cluster client with retry logic
+    pub fn cluster_client(&self) -> ClusterClient<'_>;
+
+    // ─────────────────────────────────────────────────────────────
+    // WALLET ACCESS
+    // ─────────────────────────────────────────────────────────────
+
+    /// Seeded wallet accounts
+    pub fn wallet_accounts(&self) -> &[WalletAccount];
+
+    // ─────────────────────────────────────────────────────────────
+    // OBSERVABILITY
+    // ─────────────────────────────────────────────────────────────
+
+    /// Block observation stream
+    pub fn block_feed(&self) -> BlockFeed;
+
+    /// Prometheus metrics (if configured)
+    pub fn telemetry(&self) -> &Metrics;
+
+    // ─────────────────────────────────────────────────────────────
+    // TIMING
+    // ─────────────────────────────────────────────────────────────
+
+    /// Configured run duration
+    pub fn run_duration(&self) -> Duration;
+
+    /// Expected block count for this run
+    pub fn expected_blocks(&self) -> u64;
+
+    /// Full timing metrics
+    pub fn run_metrics(&self) -> RunMetrics;
+
+    // ─────────────────────────────────────────────────────────────
+    // NODE CONTROL (CHAOS)
+    // ─────────────────────────────────────────────────────────────
+
+    /// Node control handle (if enabled)
+    pub fn node_control(&self) -> Option<Arc<dyn NodeControlHandle>>;
+}
+```
+
+### NodeClients
+
+```rust
+impl NodeClients {
+    pub fn validator_clients(&self) -> &[ApiClient];
+    pub fn executor_clients(&self) -> &[ApiClient];
+    pub fn random_validator(&self) -> Option<&ApiClient>;
+    pub fn random_executor(&self) -> Option<&ApiClient>;
+    pub fn all_clients(&self) -> impl Iterator<Item = &ApiClient>;
+    pub fn any_client(&self) -> Option<&ApiClient>;
+    pub fn cluster_client(&self) -> ClusterClient<'_>;
+}
+```
+
+### ApiClient
+
+```rust
+impl ApiClient {
+    // Consensus
+    pub async fn consensus_info(&self) -> reqwest::Result<CryptarchiaInfo>;
+
+    // Network
+    pub async fn network_info(&self) -> reqwest::Result<NetworkInfo>;
+
+    // Transactions
+    pub async fn submit_transaction(&self, tx: &SignedMantleTx) -> reqwest::Result<()>;
+
+    // Storage
+    pub async fn storage_block(&self, id: &HeaderId)
+        -> reqwest::Result<Option<Block<SignedMantleTx>>>;
+
+    // DA
+    pub async fn balancer_stats(&self) -> reqwest::Result<BalancerStats>;
+    pub async fn monitor_stats(&self) -> reqwest::Result<MonitorStats>;
+    pub async fn da_get_membership(&self, session: &SessionNumber)
+        -> reqwest::Result<Membership>;
+
+    // URLs
+    pub fn base_url(&self) -> &Url;
+}
+```
+
+### CryptarchiaInfo
+
+```rust
+pub struct CryptarchiaInfo {
+    pub height: u64,   // Current block height
+    pub slot: Slot,    // Current slot number
+    pub tip: HeaderId, // Tip of the chain
+    // ... additional fields
+}
+```
+
+### Key Traits
+
+```rust
+#[async_trait]
+pub trait Workload: Send + Sync {
+    fn name(&self) -> &str;
+    fn expectations(&self) -> Vec<Box<dyn Expectation>> { vec![] }
+    fn init(&mut self, topology: &GeneratedTopology, metrics: &RunMetrics)
+        -> Result<(), DynError> { Ok(()) }
+    async fn start(&self, ctx: &RunContext) -> Result<(), DynError>;
+}
+
+#[async_trait]
+pub trait Expectation: Send + Sync {
+    fn name(&self) -> &str;
+    fn init(&mut self, topology: &GeneratedTopology, metrics: &RunMetrics)
+        -> Result<(), DynError> { Ok(()) }
+    async fn start_capture(&mut self, ctx: &RunContext) -> Result<(), DynError> { Ok(()) }
+    async fn evaluate(&mut self, ctx: &RunContext) -> Result<(), DynError>;
+}
+
+#[async_trait]
+pub trait Deployer: Send + Sync {
+    type Error;
+    async fn deploy(&self, scenario: &Scenario) -> Result<Runner, Self::Error>;
+}
+
+#[async_trait]
+pub trait NodeControlHandle: Send + Sync {
+    async fn restart_validator(&self, index: usize) -> Result<(), DynError>;
+    async fn restart_executor(&self, index: usize) -> Result<(), DynError>;
+}
+```
+
+---
+
+
+## Glossary
+
+### Protocol Terms
+
+| Term | Definition |
+|------|------------|
+| **Slot** | Fixed time interval in the consensus protocol (default: 2 seconds) |
+| **Block** | Unit of consensus; contains transactions and header |
+| **Active Slot Coefficient** | Probability of block production per slot (default: 0.5) |
+| **Protocol Interval** | Expected time between blocks: `slot_duration / active_slot_coeff` |
+
+### Framework Terms
+
+| Term | Definition |
+|------|------------|
+| **Topology** | Declarative description of cluster shape, roles, and parameters |
+| **GeneratedTopology** | Concrete topology with generated configs, ports, and keys |
+| **Scenario** | Plan combining topology + workloads + expectations + duration |
+| **Workload** | Traffic/behavior generator during a run |
+| **Expectation** | Post-run assertion judging success/failure |
+| **BlockFeed** | Stream of block observations for workloads/expectations |
+| **RunContext** | Shared context with clients, metrics, observability |
+| **RunMetrics** | Computed timing: expected blocks, block interval, duration |
+| **NodeClients** | Collection of API clients for validators and executors |
+| **ApiClient** | HTTP client for node consensus, network, and DA endpoints |
+| **cfgsync** | Dynamic configuration injection for distributed runners |
+
+### Runner Terms
+
+| Term | Definition |
+|------|------------|
+| **Deployer** | Creates a `Runner` from a `Scenario` |
+| **Runner** | Manages execution: workloads, expectations, cleanup |
+| **RunHandle** | Returned on success; holds context and cleanup |
+| **CleanupGuard** | Ensures resources are reclaimed on drop |
+| **NodeControlHandle** | Interface for restarting nodes (chaos) |
+
+---
+
+
+# Part V — Scenario Recipes
+
+Complete, copy-paste runnable scenarios.
+
+## Recipe 1: Minimal Smoke Test
+
+**Goal**: Verify basic consensus works with minimal setup.
+ +```rust +use std::time::Duration; +use testing_framework_core::scenario::{Deployer as _, ScenarioBuilder}; +use testing_framework_runner_local::LocalDeployer; + +#[tokio::test] +async fn smoke_test_consensus() { + // Minimal: 2 validators, no workloads, just check blocks produced + let mut plan = ScenarioBuilder::with_node_counts(2, 0) + .topology() + .network_star() + .validators(2) + .executors(0) + .apply() + .with_run_duration(Duration::from_secs(30)) + .expect_consensus_liveness() + .build(); + + let deployer = LocalDeployer::default(); + let runner = deployer.deploy(&plan).await.expect("deployment"); + runner.run(&mut plan).await.expect("scenario passed"); +} +``` + +**Expected output**: +``` +[INFO] consensus_liveness: target=4, observed heights=[6, 5] ✓ +``` + +**Common failures**: +- `height 0 below target`: Nodes didn't start, check binaries exist +- Timeout: Increase to 60s or set `SLOW_TEST_ENV=true` + +--- + +## Recipe 2: Transaction Throughput Baseline + +**Goal**: Measure transaction inclusion under load. + +```rust +use std::time::Duration; +use testing_framework_core::scenario::{Deployer as _, ScenarioBuilder}; +use testing_framework_runner_local::LocalDeployer; +use tests_workflows::ScenarioBuilderExt as _; + +const VALIDATORS: usize = 2; +const TX_RATE: u64 = 10; +const USERS: usize = 8; +const WALLETS: usize = 64; +const DURATION: Duration = Duration::from_secs(120); + +#[tokio::test] +async fn transaction_throughput_baseline() { + let mut plan = ScenarioBuilder::with_node_counts(VALIDATORS, 0) + .topology() + .network_star() + .validators(VALIDATORS) + .executors(0) + .apply() + .wallets(WALLETS) + .transactions() + .rate(TX_RATE) + .users(USERS) + .apply() + .with_run_duration(DURATION) + .expect_consensus_liveness() + .build(); + + let deployer = LocalDeployer::default(); + let runner = deployer.deploy(&plan).await.expect("deployment"); + + let handle = runner.run(&mut plan).await.expect("scenario passed"); + + // Optional: Check stats + let stats = handle.context().block_feed().stats(); + println!("Total transactions included: {}", stats.total_transactions()); +} +``` + +**Expected output**: +``` +[INFO] tx_inclusion_expectation: 180/200 included (90%) ✓ +[INFO] consensus_liveness: target=15, observed heights=[18, 17] ✓ +Total transactions included: 180 +``` + +**Common failures**: +- `observed 0 below required`: Forgot `.wallets()` +- Low inclusion: Reduce `TX_RATE` or increase `DURATION` + +--- + +## Recipe 3: DA + Transaction Combined Stress + +**Goal**: Exercise both transaction and data-availability paths. 
+ +```rust +use std::time::Duration; +use testing_framework_core::scenario::{Deployer as _, ScenarioBuilder}; +use testing_framework_runner_local::LocalDeployer; +use tests_workflows::ScenarioBuilderExt as _; + +#[tokio::test] +async fn da_tx_combined_stress() { + let mut plan = ScenarioBuilder::with_node_counts(1, 1) // Need executor for DA + .topology() + .network_star() + .validators(1) + .executors(1) + .apply() + .wallets(64) + .transactions() + .rate(5) + .users(4) + .apply() + .da() + .channel_rate(2) // 2 channel inscriptions per block + .blob_rate(1) // 1 blob per channel + .apply() + .with_run_duration(Duration::from_secs(180)) + .expect_consensus_liveness() + .build(); + + let deployer = LocalDeployer::default(); + let runner = deployer.deploy(&plan).await.expect("deployment"); + runner.run(&mut plan).await.expect("scenario passed"); +} +``` + +**Expected output**: +``` +[INFO] da_workload_inclusions: 2/2 channels inscribed ✓ +[INFO] tx_inclusion_expectation: 45/50 included (90%) ✓ +[INFO] consensus_liveness: target=22, observed heights=[25, 24] ✓ +``` + +**Common failures**: +- `da workload requires at least one executor`: Add executor to topology +- Blob publish failures: Check DA balancer readiness + +--- + +## Recipe 4: Chaos Resilience Test + +**Goal**: Verify system recovers from node restarts. + +```rust +use std::time::Duration; +use testing_framework_core::scenario::{Deployer as _, ScenarioBuilder}; +use testing_framework_runner_local::LocalDeployer; +use tests_workflows::{ChaosBuilderExt as _, ScenarioBuilderExt as _}; + +#[tokio::test] +async fn chaos_resilience_test() { + let mut plan = ScenarioBuilder::with_node_counts(3, 1) // Need >1 validator for chaos + .enable_node_control() // Required for chaos! + .topology() + .network_star() + .validators(3) + .executors(1) + .apply() + .wallets(64) + .transactions() + .rate(3) // Lower rate for stability during chaos + .users(4) + .apply() + .chaos_random_restart() + .validators(true) + .executors(true) + .min_delay(Duration::from_secs(45)) + .max_delay(Duration::from_secs(75)) + .target_cooldown(Duration::from_secs(120)) + .apply() + .with_run_duration(Duration::from_secs(300)) // 5 minutes + .expect_consensus_liveness() + .build(); + + let deployer = LocalDeployer::default(); + let runner = deployer.deploy(&plan).await.expect("deployment"); + runner.run(&mut plan).await.expect("chaos scenario passed"); +} +``` + +**Expected output**: +``` +[INFO] Restarting validator-1 +[INFO] Restarting executor-0 +[INFO] Restarting validator-2 +[INFO] consensus_liveness: target=35, observed heights=[42, 38, 40, 39] ✓ +``` + +**Common failures**: +- `no eligible targets`: Need ≥2 validators (safety skips single validator) +- Liveness violation: Increase `target_cooldown`, reduce restart frequency + +--- + +## Recipe 5: Docker Compose Reproducible Test + +**Goal**: Run in containers for CI reproducibility. 
+ +```rust +use std::time::Duration; +use testing_framework_core::scenario::{Deployer as _, ScenarioBuilder}; +use testing_framework_runner_compose::ComposeRunner; +use tests_workflows::ScenarioBuilderExt as _; + +#[tokio::test] +#[ignore = "requires Docker"] +async fn compose_reproducible_test() { + let mut plan = ScenarioBuilder::with_node_counts(2, 1) + .topology() + .network_star() + .validators(2) + .executors(1) + .apply() + .wallets(64) + .transactions() + .rate(5) + .users(8) + .apply() + .da() + .channel_rate(1) + .blob_rate(1) + .apply() + .with_run_duration(Duration::from_secs(120)) + .expect_consensus_liveness() + .build(); + + let deployer = ComposeRunner::default(); + let runner = deployer.deploy(&plan).await.expect("compose deployment"); + + // Verify Prometheus is available + assert!(runner.context().telemetry().is_configured()); + + runner.run(&mut plan).await.expect("compose scenario passed"); +} +``` + +**Required environment**: +```bash +# Build the Docker image first +docker build -t nomos-testnet:local . + +# Or use custom image +export NOMOS_TESTNET_IMAGE=myregistry/nomos-testnet:v1.0 +``` + +**Common failures**: +- `cfgsync connection refused`: Check port 4400 is accessible +- Image not found: Build or pull `nomos-testnet:local` + +--- + + + +## FAQ + +**Q: Why does chaos skip validators when only one is configured?** + +A: Restarting the only validator would halt consensus entirely. The framework protects against this by requiring ≥2 validators for chaos to restart validators. See `RandomRestartWorkload::targets()`. + +**Q: Can I run the same scenario on different runners?** + +A: Yes! The `Scenario` is runner-agnostic. Just swap the deployer: + +```rust +let plan = build_my_scenario(); // Same plan + +// Local +let runner = LocalDeployer::default().deploy(&plan).await?; + +// Or Compose +let runner = ComposeRunner::default().deploy(&plan).await?; + +// Or K8s +let runner = K8sRunner::new().deploy(&plan).await?; +``` + +**Q: How do I debug a flaky scenario?** + +A: +1. Enable tracing: `NOMOS_TESTS_TRACING=true` +2. Keep logs: `NOMOS_TESTS_KEEP_LOGS=1` +3. Increase duration +4. Simplify (remove workloads one by one) + +**Q: Why are expectations evaluated after all workloads, not during?** + +A: This ensures the system has reached steady state. If you need continuous assertions, implement them inside your workload using `BlockFeed`. + +**Q: How long should my scenario run?** + +A: See the [Duration Heuristics](#duration-heuristics) table. Rule of thumb: enough blocks to observe your workload's effects plus margin for variability. + +**Q: What's the difference between `Plan` and `Scenario`?** + +A: In the code, `ScenarioBuilder` builds a `Scenario`. The term "plan" is informal shorthand for "fully constructed scenario ready for deployment." 
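+
+A minimal illustration of that relationship (assuming `Scenario` is exported
+alongside `ScenarioBuilder`; adjust the import to your crate layout):
+
+```rust
+use std::time::Duration;
+use testing_framework_core::scenario::{Scenario, ScenarioBuilder};
+
+// The builder's output *is* the Scenario; "plan" is just what we call it.
+let plan: Scenario = ScenarioBuilder::with_node_counts(2, 0)
+    .with_run_duration(Duration::from_secs(60))
+    .expect_consensus_liveness()
+    .build();
+```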
+ +--- + +## Changelog + +### v3 (Current) + +**New sections:** +- 5-Minute Quickstart +- Reading Guide by Role +- Duration Heuristics table +- BlockFeed Deep Dive +- Configuration Sync (cfgsync) documentation +- Environment Variables reference +- Complete Scenario Recipes (5 recipes) +- Common Expectation Mistakes table +- Debugging Flow diagram +- GitBook structure markers + +**Fixes from v2:** +- All API method names verified against codebase +- Error messages taken from actual error types +- Environment variables verified in source + +**Improvements:** +- More diagrams (timeline, readiness phases, type flow) +- Troubleshooting with actual error messages +- FAQ expanded with common questions diff --git a/book/src/SUMMARY.md b/book/src/SUMMARY.md new file mode 100644 index 0000000..d21deb9 --- /dev/null +++ b/book/src/SUMMARY.md @@ -0,0 +1,31 @@ +# Summary +- [Project Context Primer](project-context-primer.md) +- [What You Will Learn](what-you-will-learn.md) +- [Part I — Foundations](part-i.md) + - [Introduction](introduction.md) + - [Architecture Overview](architecture-overview.md) + - [Testing Philosophy](testing-philosophy.md) + - [Scenario Lifecycle (Conceptual)](scenario-lifecycle.md) + - [Design Rationale](design-rationale.md) +- [Part II — User Guide](part-ii.md) + - [Workspace Layout](workspace-layout.md) + - [Annotated Tree](annotated-tree.md) + - [Authoring Scenarios](authoring-scenarios.md) + - [Core Content: Workloads & Expectations](workloads.md) + - [Core Content: ScenarioBuilderExt Patterns](scenario-builder-ext-patterns.md) + - [Best Practices](best-practices.md) + - [Examples](examples.md) + - [Advanced & Artificial Examples](examples-advanced.md) + - [Running Scenarios](running-scenarios.md) + - [Runners](runners.md) + - [Operations](operations.md) +- [Part III — Developer Reference](part-iii.md) + - [Scenario Model (Developer Level)](scenario-model.md) + - [Extending the Framework](extending.md) + - [Example: New Workload & Expectation (Rust)](custom-workload-example.md) + - [Internal Crate Reference](internal-crate-reference.md) +- [Part IV — Appendix](part-iv.md) + - [DSL Cheat Sheet](dsl-cheat-sheet.md) + - [Troubleshooting Scenarios](troubleshooting.md) + - [FAQ](faq.md) + - [Glossary](glossary.md) diff --git a/book/src/annotated-tree.md b/book/src/annotated-tree.md new file mode 100644 index 0000000..7a62a24 --- /dev/null +++ b/book/src/annotated-tree.md @@ -0,0 +1,17 @@ +# Annotated Tree + +High-level view of the workspace and how pieces relate: +``` +nomos-testing/ +├─ testing-framework/ +│ ├─ configs/ # shared configuration helpers +│ ├─ core/ # scenario model, runtime, topology +│ ├─ workflows/ # workloads, expectations, DSL extensions +│ └─ runners/ # local, compose, k8s deployment backends +├─ tests/ # integration scenarios using the framework +└─ scripts/ # supporting setup utilities (e.g., assets) +``` + +Each area maps to a responsibility: describe configs, orchestrate scenarios, +package common traffic and assertions, adapt to environments, and demonstrate +end-to-end usage. diff --git a/book/src/architecture-overview.md b/book/src/architecture-overview.md new file mode 100644 index 0000000..76ac1cf --- /dev/null +++ b/book/src/architecture-overview.md @@ -0,0 +1,29 @@ +# Architecture Overview + +The framework follows a clear flow: **Topology → Scenario → Runner → Workloads → Expectations**. + +- **Topology** describes the cluster: how many nodes, their roles, and the high-level network and data-availability parameters they should follow. 
+- **Scenario** combines that topology with the activities to run and the checks to perform, forming a single plan. +- **Deployer/Runner** pair turns the plan into a live environment on the chosen backend (local processes, Docker Compose, or Kubernetes) and brokers readiness. +- **Workloads** generate traffic and conditions that exercise the system. +- **Expectations** observe the run and judge success or failure once activity completes. + +Conceptual diagram: +``` +Topology → Scenario → Runner → Workloads → Expectations + (shape (plan) (deploy (drive (verify + cluster) & orchestrate) traffic) outcomes) +``` + +Mermaid view: +```mermaid +flowchart LR + A(Topology
shape cluster) --> B(Scenario
plan) + B --> C(Deployer/Runner
deploy & orchestrate) + C --> D(Workloads
drive traffic) + D --> E(Expectations
verify outcomes) +``` + +Each layer has a narrow responsibility so that cluster shape, deployment choice, +traffic generation, and health checks can evolve independently while fitting +together predictably. diff --git a/book/src/authoring-scenarios.md b/book/src/authoring-scenarios.md new file mode 100644 index 0000000..a7035e0 --- /dev/null +++ b/book/src/authoring-scenarios.md @@ -0,0 +1,20 @@ +# Authoring Scenarios + +Creating a scenario is a declarative exercise: + +1. **Shape the topology**: decide how many validators and executors to run, and + what high-level network and data-availability characteristics matter for the + test. +2. **Attach workloads**: pick traffic generators that align with your goals + (transactions, data-availability blobs, or chaos for resilience probes). +3. **Define expectations**: specify the health signals that must hold when the + run finishes (e.g., consensus liveness, inclusion of submitted activity; see + [Core Content: Workloads & Expectations](workloads.md)). +4. **Set duration**: choose a run window long enough to observe meaningful + block progression and the effects of your workloads. +5. **Choose a runner**: target local processes for fast iteration, Docker + Compose for reproducible multi-node stacks, or Kubernetes for cluster-grade + validation. For environment considerations, see [Operations](operations.md). + +Keep scenarios small and explicit: make the intended behavior and the success +criteria clear so failures are easy to interpret and act upon. diff --git a/book/src/best-practices.md b/book/src/best-practices.md new file mode 100644 index 0000000..eb3bf08 --- /dev/null +++ b/book/src/best-practices.md @@ -0,0 +1,16 @@ +# Best Practices + +- **State your intent**: document the goal of each scenario (throughput, DA + validation, resilience) so expectation choices are obvious. +- **Keep runs meaningful**: choose durations that allow multiple blocks and make + timing-based assertions trustworthy. +- **Separate concerns**: start with deterministic workloads for functional + checks; add chaos in dedicated resilience scenarios to avoid noisy failures. +- **Reuse patterns**: standardize on shared topology and workload presets so + results are comparable across environments and teams. +- **Observe first, tune second**: rely on liveness and inclusion signals to + interpret outcomes before tweaking rates or topology. +- **Environment fit**: pick runners that match the feedback loop you need—local + for speed, compose for reproducible stacks, k8s for cluster-grade fidelity. +- **Minimal surprises**: seed only necessary wallets and keep configuration + deltas explicit when moving between CI and developer machines. diff --git a/book/src/custom-workload-example.md b/book/src/custom-workload-example.md new file mode 100644 index 0000000..341527b --- /dev/null +++ b/book/src/custom-workload-example.md @@ -0,0 +1,116 @@ +# Example: New Workload & Expectation (Rust) + +A minimal, end-to-end illustration of adding a custom workload and matching +expectation. This shows the shape of the traits and where to plug into the +framework; expand the logic to fit your real test. + +## Workload: simple reachability probe + +Key ideas: +- **name**: identifies the workload in logs. +- **expectations**: workloads can bundle defaults so callers don’t forget checks. +- **init**: derive inputs from the generated topology (e.g., pick a target node). +- **start**: drive async activity using the shared `RunContext`. 
+
+```rust
+use async_trait::async_trait;
+use testing_framework_core::scenario::{
+    DynError, Expectation, RunContext, RunMetrics, Workload,
+};
+use testing_framework_core::topology::GeneratedTopology;
+
+pub struct ReachabilityWorkload {
+    target_idx: usize,
+}
+
+impl ReachabilityWorkload {
+    pub fn new(target_idx: usize) -> Self {
+        Self { target_idx }
+    }
+}
+
+#[async_trait]
+impl Workload for ReachabilityWorkload {
+    fn name(&self) -> &'static str {
+        "reachability_workload"
+    }
+
+    fn expectations(&self) -> Vec<Box<dyn Expectation>> {
+        // Boxed trait objects are not Clone, so build the bundled
+        // defaults fresh on every call.
+        vec![Box::new(ReachabilityExpectation::new(self.target_idx))]
+    }
+
+    fn init(
+        &mut self,
+        topology: &GeneratedTopology,
+        _metrics: &RunMetrics,
+    ) -> Result<(), DynError> {
+        if topology.validators().get(self.target_idx).is_none() {
+            return Err("no validator at requested index".into());
+        }
+        Ok(())
+    }
+
+    async fn start(&self, ctx: &RunContext) -> Result<(), DynError> {
+        let client = ctx
+            .clients()
+            .validators()
+            .get(self.target_idx)
+            .ok_or("missing target client")?;
+
+        // Pseudo-action: issue a lightweight RPC to prove reachability.
+        client.health_check().await.map_err(|e| e.into())
+    }
+}
+```
+
+## Expectation: confirm the target stayed reachable
+
+Key ideas:
+- **start_capture**: snapshot baseline if needed (not used here).
+- **evaluate**: assert the condition after workloads finish.
+
+```rust
+use async_trait::async_trait;
+use testing_framework_core::scenario::{DynError, Expectation, RunContext};
+
+pub struct ReachabilityExpectation {
+    target_idx: usize,
+}
+
+impl ReachabilityExpectation {
+    pub fn new(target_idx: usize) -> Self {
+        Self { target_idx }
+    }
+}
+
+#[async_trait]
+impl Expectation for ReachabilityExpectation {
+    fn name(&self) -> &str {
+        "target_reachable"
+    }
+
+    async fn evaluate(&mut self, ctx: &RunContext) -> Result<(), DynError> {
+        let client = ctx
+            .clients()
+            .validators()
+            .get(self.target_idx)
+            .ok_or("missing target client")?;
+
+        client.health_check().await.map_err(|e| {
+            format!("target became unreachable during run: {e}").into()
+        })
+    }
+}
+```
+
+## How to wire it
+- Build your scenario as usual and call `.with_workload(ReachabilityWorkload::new(0))`.
+- The bundled expectation is attached automatically; you can add more with
+  `.with_expectation(...)` if needed.
+- Keep the logic minimal and fast for smoke tests; grow it into richer probes
+  for deeper scenarios.
diff --git a/book/src/design-rationale.md b/book/src/design-rationale.md
new file mode 100644
index 0000000..94961b6
--- /dev/null
+++ b/book/src/design-rationale.md
@@ -0,0 +1,7 @@
+# Design Rationale
+
+- **Modular crates** keep configuration, orchestration, workloads, and runners decoupled so each can evolve without breaking the others.
+- **Pluggable runners** let the same scenario run on a laptop, a Docker host, or a Kubernetes cluster, making validation portable across environments.
+- **Separated workloads and expectations** clarify intent: what traffic to generate versus how to judge success. This simplifies review and reuse.
+- **Declarative topology** makes cluster shape explicit and repeatable, reducing surprise when moving between CI and developer machines.
+- **Maintainability through predictability**: a clear flow from plan to deployment to verification lowers the cost of extending the framework and interpreting failures.
diff --git a/book/src/dsl-cheat-sheet.md b/book/src/dsl-cheat-sheet.md new file mode 100644 index 0000000..6b5a770 --- /dev/null +++ b/book/src/dsl-cheat-sheet.md @@ -0,0 +1,19 @@ +# Core Content: DSL Cheat Sheet + +The framework offers a fluent builder style to keep scenarios readable. Common +knobs: + +- **Topology shaping**: set validator and executor counts, pick a network layout + style, and adjust high-level data-availability traits. +- **Wallet seeding**: define how many users participate and the total funds + available for transaction workloads. +- **Workload tuning**: configure transaction rates, data-availability channel + and blob rates, and whether chaos restarts should include validators, + executors, or both. +- **Expectations**: attach liveness and workload-specific checks so success is + explicit. +- **Run window**: set a minimum duration long enough for multiple blocks to be + observed and verified. + +Use these knobs to express intent clearly, keeping scenario definitions concise +and consistent across teams. diff --git a/book/src/examples-advanced.md b/book/src/examples-advanced.md new file mode 100644 index 0000000..2f3a7c4 --- /dev/null +++ b/book/src/examples-advanced.md @@ -0,0 +1,62 @@ +# Advanced & Artificial Examples + +These illustrative scenarios stretch the framework to show how to build new +workloads, expectations, deployers, and topology tricks. They are intentionally +“synthetic” to teach capabilities rather than prescribe production tests. + +## Synthetic Delay Workload (Network Latency Simulation) +- **Idea**: inject fake latency between node interactions using internal timers, + not OS-level tooling. +- **Demonstrates**: sequencing control inside a workload, verifying protocol + progression under induced lag, using timers to pace submissions. +- **Shape**: wrap submissions in delays that mimic slow peers; ensure the + expectation checks blocks still progress. + +## Oscillating Load Workload (Traffic Waves) +- **Idea**: traffic rate changes every block or N seconds (e.g., blocks 1–3 low, + 4–5 high, 6–7 zero, repeat). +- **Demonstrates**: dynamic, stateful workloads that use `RunMetrics` to time + phases; modeling real-world burstiness. +- **Shape**: schedule per-phase rates; confirm inclusion/liveness across peaks + and troughs. + +## Byzantine Behavior Mock +- **Idea**: a workload that drops half its planned submissions, sometimes + double-submits, and intentionally triggers expectation failures. +- **Demonstrates**: negative testing, resilience checks, and the value of clear + expectations when behavior is adversarial by design. +- **Shape**: parameterize drop/double-submit probabilities; pair with an + expectation that documents what “bad” looks like. + +## Custom Expectation: Block Finality Drift +- **Idea**: assert the last few blocks differ and block time stays within a + tolerated drift budget. +- **Demonstrates**: consuming `BlockFeed` or time-series metrics to validate + protocol cadence; crafting post-run assertions around block diversity and + timing. +- **Shape**: collect recent blocks, confirm no duplicates, and compare observed + intervals to a drift threshold. + +## Custom Deployer: Dry-Run Deployer +- **Idea**: a deployer that never starts nodes; it emits configs, simulates + readiness, provides fake blockfeed/metrics. +- **Demonstrates**: full power of the deployer interface for CI dry-runs, + config verification, and ultra-fast feedback without Nomos binaries. 
+- **Shape**: produce logs/artifacts, stub readiness, and feed synthetic blocks + so expectations can still run. + +## Stochastic Topology Generator +- **Idea**: topology parameters change at runtime (random validators, DA + settings, network shapes). +- **Demonstrates**: randomized property testing and fuzzing approaches to + topology building. +- **Shape**: pick roles and network layouts randomly per run; keep expectations + tolerant to variability while still asserting core liveness. + +## Multi-Phase Scenario (“Pipelines”) +- **Idea**: scenario runs in phases (e.g., phase 1 transactions, phase 2 DA, + phase 3 restarts, phase 4 sync check). +- **Demonstrates**: multi-stage tests, modular scenario assembly, and deliberate + lifecycle control. +- **Shape**: drive phase-specific workloads/expectations sequentially; enforce + clear boundaries and post-phase checks. diff --git a/book/src/examples.md b/book/src/examples.md new file mode 100644 index 0000000..d8156be --- /dev/null +++ b/book/src/examples.md @@ -0,0 +1,28 @@ +# Examples + +Concrete scenario shapes that illustrate how to combine topologies, workloads, +and expectations. Adjust counts, rates, and durations to fit your environment. + +## Simple 2-validator transaction workload +- **Topology**: two validators. +- **Workload**: transaction submissions at a modest per-block rate with a small + set of wallet actors. +- **Expectations**: consensus liveness and inclusion of submitted activity. +- **When to use**: smoke tests for consensus and transaction flow on minimal + hardware. + +## DA + transaction workload +- **Topology**: validators plus executors if available. +- **Workloads**: data-availability blobs/channels and transactions running + together to stress both paths. +- **Expectations**: consensus liveness and workload-level inclusion/availability + checks. +- **When to use**: end-to-end coverage of transaction and DA layers in one run. + +## Chaos + liveness check +- **Topology**: validators (optionally executors) with node control enabled. +- **Workloads**: baseline traffic (transactions or DA) plus chaos restarts on + selected roles. +- **Expectations**: consensus liveness to confirm the system keeps progressing + despite restarts; workload-specific inclusion if traffic is present. +- **When to use**: resilience validation and operational readiness drills. diff --git a/book/src/extending.md b/book/src/extending.md new file mode 100644 index 0000000..f96d9c2 --- /dev/null +++ b/book/src/extending.md @@ -0,0 +1,31 @@ +# Extending the Framework + +## Adding a workload +1) Implement `testing_framework_core::scenario::Workload`: + - Provide a name and any bundled expectations. + - In `init`, derive inputs from `GeneratedTopology` and `RunMetrics`; fail + fast if prerequisites are missing (e.g., wallet data, node addresses). + - In `start`, drive async traffic using the `RunContext` clients. +2) Expose the workload from a module under `testing-framework/workflows` and + consider adding a DSL helper for ergonomic wiring. + +## Adding an expectation +1) Implement `testing_framework_core::scenario::Expectation`: + - Use `start_capture` to snapshot baseline metrics. + - Use `evaluate` to assert outcomes after workloads finish; return all errors + so the runner can aggregate them. +2) Export it from `testing-framework/workflows` if it is reusable. + +## Adding a runner +1) Implement `testing_framework_core::scenario::Deployer` for your backend. 
+ - Produce a `RunContext` with `NodeClients`, metrics endpoints, and optional + `NodeControlHandle`. + - Guard cleanup with `CleanupGuard` to reclaim resources even on failures. +2) Mirror the readiness and block-feed probes used by the existing runners so + workloads can rely on consistent signals. + +## Adding topology helpers +- Extend `testing_framework_core::topology::TopologyBuilder` with new layouts or + configuration presets (e.g., specialized DA parameters). Keep defaults safe: + ensure at least one participant and clamp dispersal factors as the current + helpers do. diff --git a/book/src/faq.md b/book/src/faq.md new file mode 100644 index 0000000..0cf7dd3 --- /dev/null +++ b/book/src/faq.md @@ -0,0 +1,26 @@ +# FAQ + +**Why block-oriented timing?** +Using block cadence reduces dependence on host speed and keeps assertions aligned +with protocol behavior. + +**Can I reuse the same scenario across runners?** +Yes. The plan stays the same; swap runners (local, compose, k8s) to target +different environments. + +**When should I enable chaos workloads?** +Only when testing resilience or operational recovery; keep functional smoke +tests deterministic. + +**How long should runs be?** +Long enough for multiple blocks so liveness and inclusion checks are +meaningful; very short runs risk false confidence. + +**Do I always need seeded wallets?** +Only for transaction scenarios. Data-availability or pure chaos scenarios may +not require them, but liveness checks still need validators producing blocks. + +**What if expectations fail but workloads “look fine”?** +Trust expectations first—they capture the intended success criteria. Use the +observability signals and runner logs to pinpoint why the system missed the +target. diff --git a/book/src/glossary.md b/book/src/glossary.md new file mode 100644 index 0000000..fbed85c --- /dev/null +++ b/book/src/glossary.md @@ -0,0 +1,18 @@ +# Glossary + +- **Validator**: node role responsible for participating in consensus and block + production. +- **Executor**: node role that processes transactions or workloads delegated by + validators. +- **DA (Data Availability)**: subsystem ensuring blobs or channel data are + published and retrievable for validation. +- **Workload**: traffic or behavior generator that exercises the system during a + scenario run. +- **Expectation**: post-run assertion that judges whether the system met the + intended success criteria. +- **Topology**: declarative description of the cluster shape, roles, and + high-level parameters for a scenario. +- **Blockfeed**: stream of block observations used for liveness or inclusion + signals during a run. +- **Control capability**: the ability for a runner to start, stop, or restart + nodes, used by chaos workloads. diff --git a/book/src/internal-crate-reference.md b/book/src/internal-crate-reference.md new file mode 100644 index 0000000..80d45fc --- /dev/null +++ b/book/src/internal-crate-reference.md @@ -0,0 +1,18 @@ +# Internal Crate Reference + +High-level roles of the crates that make up the framework: + +- **Configs**: prepares reusable configuration primitives for nodes, networking, + tracing, data availability, and wallets, shared by all scenarios and runners. +- **Core scenario orchestration**: houses the topology and scenario model, + runtime coordination, node clients, and readiness/health probes. +- **Workflows**: packages workloads and expectations into reusable building + blocks and offers a fluent DSL to assemble them. 
+- **Runners**: implements deployment backends (local host, Docker Compose, + Kubernetes) that all consume the same scenario plan. +- **Test workflows**: example scenarios and integration checks that exercise the + framework end to end and serve as living documentation. + +Use this map to locate where to add new capabilities: configuration primitives +in configs, orchestration changes in core, reusable traffic/assertions in +workflows, environment adapters in runners, and demonstrations in tests. diff --git a/book/src/introduction.md b/book/src/introduction.md new file mode 100644 index 0000000..d153b67 --- /dev/null +++ b/book/src/introduction.md @@ -0,0 +1,15 @@ +# Introduction + +The Nomos Testing Framework is a purpose-built toolkit for exercising Nomos in +realistic, multi-node environments. It solves the gap between small, isolated +tests and full-system validation by letting teams describe a cluster layout, +drive meaningful traffic, and assert the outcomes in one coherent plan. + +It is for protocol engineers, infrastructure operators, and QA teams who need +repeatable confidence that validators, executors, and data-availability +components work together under network and timing constraints. + +Multi-node integration testing is required because many Nomos behaviors—block +progress, data availability, liveness under churn—only emerge when several +roles interact over real networking and time. This framework makes those checks +declarative, observable, and portable across environments. diff --git a/book/src/operations.md b/book/src/operations.md new file mode 100644 index 0000000..208446c --- /dev/null +++ b/book/src/operations.md @@ -0,0 +1,42 @@ +# Operations + +Operational readiness focuses on prerequisites, environment fit, and clear +signals: + +- **Prerequisites**: keep a sibling `nomos-node` checkout available; ensure the + chosen runner’s platform needs are met (local binaries for host runs, Docker + for compose, cluster access for k8s). +- **Artifacts**: some scenarios depend on prover or circuit assets; fetch them + ahead of time with the provided helper scripts when needed. +- **Environment flags**: use slow-environment toggles to relax timeouts, enable + tracing when debugging, and adjust observability ports to avoid clashes. +- **Readiness checks**: verify runners report node readiness before starting + workloads; this avoids false negatives from starting too early. +- **Failure triage**: map failures to missing prerequisites (wallet seeding, + node control availability), runner platform issues, or unmet expectations. + Start with liveness signals, then dive into workload-specific assertions. + +Treat operational hygiene—assets present, prerequisites satisfied, observability +reachable—as the first step to reliable scenario outcomes. + +Metrics and observability flow: +``` +Runner exposes endpoints/ports + │ + ▼ +Runtime collects block/health signals + │ + ▼ +Expectations consume signals to decide pass/fail + │ + ▼ +Operators inspect logs/metrics when failures arise +``` + +Mermaid view: +```mermaid +flowchart TD + Expose[Runner exposes endpoints/ports] --> Collect[Runtime collects block/health signals] + Collect --> Consume[Expectations consume signals
decide pass/fail] + Consume --> Inspect[Operators inspect logs/metrics
when failures arise] +``` diff --git a/book/src/part-i.md b/book/src/part-i.md new file mode 100644 index 0000000..74e4ac6 --- /dev/null +++ b/book/src/part-i.md @@ -0,0 +1,4 @@ +# Part I — Foundations + +Conceptual chapters that establish the mental model for the framework and how +it approaches multi-node testing. diff --git a/book/src/part-ii.md b/book/src/part-ii.md new file mode 100644 index 0000000..36eb205 --- /dev/null +++ b/book/src/part-ii.md @@ -0,0 +1,4 @@ +# Part II — User Guide + +Practical guidance for shaping scenarios, combining workloads and expectations, +and running them across different environments. diff --git a/book/src/part-iii.md b/book/src/part-iii.md new file mode 100644 index 0000000..107c890 --- /dev/null +++ b/book/src/part-iii.md @@ -0,0 +1,4 @@ +# Part III — Developer Reference + +Deep dives for contributors who extend the framework, evolve its abstractions, +or maintain the crate set. diff --git a/book/src/part-iv.md b/book/src/part-iv.md new file mode 100644 index 0000000..51b08b6 --- /dev/null +++ b/book/src/part-iv.md @@ -0,0 +1,4 @@ +# Part IV — Appendix + +Quick-reference material and supporting guidance to keep scenarios discoverable, +debuggable, and consistent. diff --git a/book/src/project-context-primer.md b/book/src/project-context-primer.md new file mode 100644 index 0000000..3cbe0b8 --- /dev/null +++ b/book/src/project-context-primer.md @@ -0,0 +1,16 @@ +# Project Context Primer + +This book focuses on the Nomos Testing Framework. It assumes familiarity with +the Nomos architecture, but for completeness, here is a short primer. + +- **Nomos** is a modular blockchain protocol composed of validators, executors, + and a data-availability (DA) subsystem. +- **Validators** participate in consensus and produce blocks. +- **Executors** run application logic or off-chain computations referenced by + blocks. +- **Data Availability (DA)** ensures that data referenced in blocks is + published and retrievable, including blobs or channel data used by workloads. + +These roles interact tightly, which is why meaningful testing must be performed +in multi-node environments that include real networking, timing, and DA +interaction. diff --git a/book/src/runners.md b/book/src/runners.md new file mode 100644 index 0000000..2f09fd0 --- /dev/null +++ b/book/src/runners.md @@ -0,0 +1,51 @@ +# Runners + +Runners turn a scenario plan into a live environment while keeping the plan +unchanged. Choose based on feedback speed, reproducibility, and fidelity. For +environment and operational considerations, see [Operations](operations.md): + +## Local runner +- Launches node processes directly on the host. +- Fastest feedback loop and minimal orchestration overhead. +- Best for development-time iteration and debugging. + +## Docker Compose runner +- Starts nodes in containers to provide a reproducible multi-node stack on a + single machine. +- Discovers service ports and wires observability for convenient inspection. +- Good balance between fidelity and ease of setup. + +## Kubernetes runner +- Deploys nodes onto a cluster for higher-fidelity, longer-running scenarios. +- Suits CI or shared environments where cluster behavior and scheduling matter. + +### Common expectations +- All runners require at least one validator and, for transaction scenarios, + access to seeded wallets. +- Readiness probes gate workload start so traffic begins only after nodes are + reachable. +- Environment flags can relax timeouts or increase tracing when diagnostics are + needed. 
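+
+Because the plan itself never changes, switching environments is a one-line
+swap. A minimal sketch (deployer names as used elsewhere in this book;
+`build_my_scenario` is a placeholder for your own construction code):
+
+```rust
+// Build the scenario once; reuse it across every backend.
+let plan = build_my_scenario();
+
+// Pick the backend that matches the feedback loop you need.
+let runner = LocalDeployer::default().deploy(&plan).await?; // fast iteration
+// let runner = ComposeRunner::default().deploy(&plan).await?; // reproducible stack
+// let runner = K8sRunner::new().deploy(&plan).await?; // cluster fidelity
+```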
+
+Runner abstraction:
+```
+Scenario Plan
+      │
+      ▼
+Runner (local | compose | k8s)
+      │ provisions env + readiness
+      ▼
+Runtime + Observability
+      │
+      ▼
+Workloads / Expectations execute
+```
+
+Mermaid view:
+```mermaid
+flowchart TD
+    Plan[Scenario Plan] --> RunSel{"Runner<br/>(local | compose | k8s)"}
+    RunSel --> Provision[Provision & readiness]
+    Provision --> Runtime[Runtime + observability]
+    Runtime --> Exec[Workloads & Expectations execute]
+```
diff --git a/book/src/running-scenarios.md b/book/src/running-scenarios.md
new file mode 100644
index 0000000..bf0776a
--- /dev/null
+++ b/book/src/running-scenarios.md
@@ -0,0 +1,17 @@
+# Running Scenarios
+
+Running a scenario follows the same conceptual flow regardless of environment:
+
+1. Select or author a scenario plan that pairs a topology with workloads,
+   expectations, and a suitable run window.
+2. Choose a runner aligned with your environment (local, compose, or k8s) and
+   ensure its prerequisites are available.
+3. Deploy the plan through the runner; wait for readiness signals before
+   starting workloads.
+4. Let workloads drive activity for the planned duration; keep observability
+   signals visible so you can correlate outcomes.
+5. Evaluate expectations and capture results as the primary pass/fail signal.
+
+Use the same plan across different runners to compare behavior between local
+development and CI or cluster settings. For environment prerequisites and
+flags, see [Operations](operations.md).
diff --git a/book/src/scenario-builder-ext-patterns.md b/book/src/scenario-builder-ext-patterns.md
new file mode 100644
index 0000000..e365e72
--- /dev/null
+++ b/book/src/scenario-builder-ext-patterns.md
@@ -0,0 +1,17 @@
+# Core Content: ScenarioBuilderExt Patterns
+
+Patterns that keep scenarios readable and reusable:
+
+- **Topology-first**: start by shaping the cluster (counts, layout) so later
+  steps inherit a clear foundation.
+- **Bundle defaults**: use the DSL helpers to attach common expectations (like
+  liveness) whenever you add a matching workload, reducing forgotten checks.
+- **Intentional rates**: express traffic in per-block terms to align with
+  protocol timing rather than wall-clock assumptions.
+- **Opt-in chaos**: enable restart patterns only in scenarios meant to probe
+  resilience; keep functional smoke tests deterministic.
+- **Wallet clarity**: seed only the number of actors you need; it keeps
+  transaction scenarios deterministic and interpretable.
+
+These patterns make scenario definitions self-explanatory while staying aligned
+with the framework’s block-oriented timing model.
diff --git a/book/src/scenario-lifecycle.md b/book/src/scenario-lifecycle.md
new file mode 100644
index 0000000..00d7f80
--- /dev/null
+++ b/book/src/scenario-lifecycle.md
@@ -0,0 +1,24 @@
+# Scenario Lifecycle (Conceptual)
+
+1. **Build the plan**: Declare a topology, attach workloads and expectations, and set the run window. The plan is the single source of truth for what will happen.
+2. **Deploy**: Hand the plan to a runner. It provisions the environment on the chosen backend and waits for nodes to signal readiness.
+3. **Drive workloads**: Start traffic and behaviors (transactions, data-availability activity, restarts) for the planned duration.
+4. **Observe blocks and signals**: Track block progression and other high-level metrics during or after the run window to ground assertions in protocol time.
+5. **Evaluate expectations**: Once activity stops (and optional cooldown completes), check liveness and workload-specific outcomes to decide pass or fail.
+6. **Cleanup**: Tear down resources so successive runs start fresh and do not inherit leaked state (see the sketch below).
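+
+A compact sketch of the lifecycle in code (deployer and handle names follow the
+developer reference; `build_my_scenario` is a placeholder for your own builder
+chain, so treat this as illustrative rather than a complete program):
+
+```rust
+// 1. Build the plan: topology + workloads + expectations + run window.
+let mut plan = build_my_scenario();
+
+// 2-5. Deploy, wait for readiness, drive workloads, evaluate expectations;
+// `run` returns a handle only once evaluation has passed.
+let runner = LocalDeployer::default().deploy(&plan).await?;
+let handle = runner.run(&mut plan).await?;
+
+// 6. Cleanup: dropping the handle reclaims resources via its cleanup guard.
+drop(handle);
+```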
+ +Conceptual lifecycle diagram: +``` +Plan → Deploy → Readiness → Drive Workloads → Observe → Evaluate → Cleanup +``` + +Mermaid view: +```mermaid +flowchart LR + P[Plan
topology + workloads + expectations] --> D[Deploy
runner provisions] + D --> R[Readiness
wait for nodes] + R --> W[Drive Workloads] + W --> O[Observe
blocks/metrics] + O --> E[Evaluate Expectations] + E --> C[Cleanup] +``` diff --git a/book/src/scenario-model.md b/book/src/scenario-model.md new file mode 100644 index 0000000..85480f2 --- /dev/null +++ b/book/src/scenario-model.md @@ -0,0 +1,23 @@ +# Scenario Model (Developer Level) + +The scenario model defines clear, composable responsibilities: + +- **Topology**: a declarative description of the cluster—how many nodes, their + roles, and the broad network and data-availability characteristics. It + represents the intended shape of the system under test. +- **Scenario**: a plan combining topology, workloads, expectations, and a run + window. Building a scenario validates prerequisites (like seeded wallets) and + ensures the run lasts long enough to observe meaningful block progression. +- **Workloads**: asynchronous tasks that generate traffic or conditions. They + use shared context to interact with the deployed cluster and may bundle + default expectations. +- **Expectations**: post-run assertions. They can capture baselines before + workloads start and evaluate success once activity stops. +- **Runtime**: coordinates workloads and expectations for the configured + duration, enforces cooldowns when control actions occur, and ensures cleanup + so runs do not leak resources. + +Developers extending the model should keep these boundaries strict: topology +describes, scenarios assemble, runners deploy, workloads drive, and expectations +judge outcomes. For guidance on adding new capabilities, see +[Extending the Framework](extending.md). diff --git a/book/src/testing-philosophy.md b/book/src/testing-philosophy.md new file mode 100644 index 0000000..4c5efbc --- /dev/null +++ b/book/src/testing-philosophy.md @@ -0,0 +1,9 @@ +# Testing Philosophy + +- **Declarative over imperative**: describe the desired cluster shape, traffic, and success criteria; let the framework orchestrate the run. +- **Observable health signals**: prefer liveness and inclusion signals that reflect real user impact instead of internal debug state. +- **Determinism first**: default scenarios aim for repeatable outcomes with fixed topologies and traffic rates; variability is opt-in. +- **Targeted non-determinism**: introduce randomness (e.g., restarts) only when probing resilience or operational robustness. +- **Protocol time, not wall time**: reason in blocks and protocol-driven intervals to reduce dependence on host speed or scheduler noise. +- **Minimum run window**: always allow enough block production to make assertions meaningful; very short runs risk false confidence. +- **Use chaos with intent**: chaos workloads are for recovery and fault-tolerance validation, not for baseline functional checks. diff --git a/book/src/troubleshooting.md b/book/src/troubleshooting.md new file mode 100644 index 0000000..3dfe2a0 --- /dev/null +++ b/book/src/troubleshooting.md @@ -0,0 +1,9 @@ +# Troubleshooting Scenarios + +Common symptoms and likely causes: + +- **No or slow block progression**: runner started workloads before readiness, insufficient run window, or environment too slow—extend duration or enable slow-environment tuning. +- **Transactions not included**: missing or insufficient wallet seeding, misaligned transaction rate with block cadence, or network instability—reduce rate and verify wallet setup. +- **Chaos stalls the run**: node control not available for the chosen runner or restart cadence too aggressive—enable control capability and widen restart intervals. 
+- **Observability gaps**: metrics or logs unreachable because ports clash or services are not exposed—adjust observability ports and confirm runner wiring. +- **Flaky behavior across runs**: mixing chaos with functional smoke tests or inconsistent topology between environments—separate deterministic and chaos scenarios and standardize topology presets. diff --git a/book/src/usage-patterns.md b/book/src/usage-patterns.md new file mode 100644 index 0000000..d95ba77 --- /dev/null +++ b/book/src/usage-patterns.md @@ -0,0 +1,7 @@ +# Usage Patterns + +- **Shape a topology, pick a runner**: choose local for quick iteration, compose for reproducible multi-node stacks with observability, or k8s for cluster-grade validation. +- **Compose workloads deliberately**: pair transactions and data-availability traffic for end-to-end coverage; add chaos only when assessing recovery and resilience. +- **Align expectations with goals**: use liveness-style checks to confirm the system keeps up with planned activity, and add workload-specific assertions for inclusion or availability. +- **Reuse plans across environments**: keep the scenario constant while swapping runners to compare behavior between developer machines and CI clusters. +- **Iterate with clear signals**: treat expectation outcomes as the primary pass/fail indicator, and adjust topology or workloads based on what those signals reveal. diff --git a/book/src/what-you-will-learn.md b/book/src/what-you-will-learn.md new file mode 100644 index 0000000..294339b --- /dev/null +++ b/book/src/what-you-will-learn.md @@ -0,0 +1,6 @@ +# What You Will Learn + +This book gives you a clear mental model for Nomos multi-node testing, shows how +to author scenarios that pair realistic workloads with explicit expectations, +and guides you to run them across local, containerized, and cluster environments +without changing the plan. diff --git a/book/src/workloads.md b/book/src/workloads.md new file mode 100644 index 0000000..3fb5064 --- /dev/null +++ b/book/src/workloads.md @@ -0,0 +1,42 @@ +# Core Content: Workloads & Expectations + +Workloads describe the activity a scenario generates; expectations describe the +signals that must hold when that activity completes. Both are pluggable so +scenarios stay readable and purpose-driven. + +## Workloads +- **Transaction workload**: submits user-level transactions at a configurable + rate and can limit how many distinct actors participate. +- **Data-availability workload**: drives blob and channel activity to exercise + data-availability paths. +- **Chaos workload**: triggers controlled node restarts to test resilience and + recovery behaviors (requires a runner that can control nodes). + +## Expectations +- **Consensus liveness**: verifies the system continues to produce blocks in + line with the planned workload and timing window. +- **Workload-specific checks**: each workload can attach its own success + criteria (e.g., inclusion of submitted activity) so scenarios remain concise. + +Together, workloads and expectations let you express both the pressure applied +to the system and the definition of “healthy” for that run. + +Workload pipeline (conceptual): +``` +Inputs (topology + wallets + rates) + │ + ▼ +Workload init → Drive traffic → Collect signals + │ + ▼ + Expectations evaluate +``` + +Mermaid view: +```mermaid +flowchart TD + I[Inputs<br/>(topology + wallets + rates)] --> Init[Workload init] + Init --> Drive[Drive traffic] + Drive --> Collect[Collect signals] + Collect --> Eval[Expectations evaluate] +```
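+The pipeline above can also be expressed in code. The following is a
+self-contained sketch; the `Workload` and `Expectation` traits and every other
+name here are illustrative stand-ins, not the framework's real interfaces:
+
+```rust
+// Illustrative stand-ins for the workload/expectation split described above.
+trait Workload {
+    fn drive(&self) -> u64; // returns how much activity was submitted
+}
+trait Expectation {
+    fn evaluate(&self, observed: u64) -> bool;
+}
+
+struct TxWorkload {
+    rate_per_block: u64,
+    blocks: u64,
+}
+
+impl Workload for TxWorkload {
+    fn drive(&self) -> u64 {
+        self.rate_per_block * self.blocks // pressure applied over the run window
+    }
+}
+
+// A workload can bundle a default check, e.g. inclusion of what it submitted.
+struct InclusionCheck {
+    min_included: u64,
+}
+
+impl Expectation for InclusionCheck {
+    fn evaluate(&self, observed: u64) -> bool {
+        observed >= self.min_included
+    }
+}
+
+fn main() {
+    let workload = TxWorkload { rate_per_block: 5, blocks: 20 };
+    let submitted = workload.drive();
+    let check = InclusionCheck { min_included: submitted };
+    println!("run healthy: {}", check.evaluate(submitted));
+}
+```
+
+A workload reports the pressure it applied; its bundled expectation turns that
+observation into the run's pass/fail signal.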
diff --git a/book/src/workspace-layout.md b/book/src/workspace-layout.md new file mode 100644 index 0000000..2ed180d --- /dev/null +++ b/book/src/workspace-layout.md @@ -0,0 +1,19 @@ +# Workspace Layout + +The workspace focuses on multi-node integration testing and sits alongside a +`nomos-node` checkout. Its crates separate concerns to keep scenarios +repeatable and portable: + +- **Configs**: prepares high-level node, network, tracing, and wallet settings + used across test environments. +- **Core scenario orchestration**: the engine that holds topology descriptions, + scenario plans, runtimes, workloads, and expectations. +- **Workflows**: ready-made workloads (transactions, data-availability, chaos) + and reusable expectations assembled into a user-facing DSL. +- **Runners**: deployment backends for local processes, Docker Compose, and + Kubernetes, all consuming the same scenario plan. +- **Test workflows**: example scenarios and integration checks that show how + the pieces fit together. + +This split keeps configuration, orchestration, reusable traffic patterns, and +deployment adapters loosely coupled while sharing one mental model for tests. diff --git a/scripts/build-rapidsnark.sh b/scripts/build-rapidsnark.sh index 4f5fc90..2410c31 100755 --- a/scripts/build-rapidsnark.sh +++ b/scripts/build-rapidsnark.sh @@ -21,6 +21,20 @@ if [ ! -d "$CIRCUITS_DIR" ]; then exit 1 fi +system_gmp_package() { + local multiarch + multiarch="$(gcc -print-multiarch 2>/dev/null || echo aarch64-linux-gnu)" + local lib_path="/usr/lib/${multiarch}/libgmp.a" + if [ ! -f "$lib_path" ]; then + echo "system libgmp.a not found at $lib_path" >&2 + return 1 + fi + mkdir -p depends/gmp/package_aarch64/lib depends/gmp/package_aarch64/include + cp "$lib_path" depends/gmp/package_aarch64/lib/ + # Headers are small; copy the public ones the build expects. + cp /usr/include/gmp*.h depends/gmp/package_aarch64/include/ || true +} + case "$TARGET_ARCH" in arm64 | aarch64) ;; @@ -41,12 +55,23 @@ git submodule update --init --recursive >&2 if [ "${RAPIDSNARK_BUILD_GMP:-1}" = "1" ]; then GMP_TARGET="${RAPIDSNARK_GMP_TARGET:-aarch64}" ./build_gmp.sh "$GMP_TARGET" >&2 +else + echo "Using system libgmp to satisfy rapidsnark dependencies" >&2 + system_gmp_package fi -MAKE_TARGET="${RAPIDSNARK_MAKE_TARGET:-host_arm64}" PACKAGE_DIR="${RAPIDSNARK_PACKAGE_DIR:-package_arm64}" -make "$MAKE_TARGET" -j"$(nproc)" >&2 +rm -rf build_prover_arm64 +mkdir build_prover_arm64 +cd build_prover_arm64 +cmake .. \ -DTARGET_PLATFORM=aarch64 \ -DCMAKE_BUILD_TYPE=Release \ -DCMAKE_INSTALL_PREFIX="../${PACKAGE_DIR}" \ -DBUILD_SHARED_LIBS=OFF >&2 +cmake --build . 
--target prover verifier -- -j"$(nproc)" >&2 -install -m 0755 "${PACKAGE_DIR}/bin/prover" "$CIRCUITS_DIR/prover" +install -m 0755 "src/prover" "$CIRCUITS_DIR/prover" +install -m 0755 "src/verifier" "$CIRCUITS_DIR/verifier" echo "rapidsnark prover installed to $CIRCUITS_DIR/prover" >&2 diff --git a/scripts/setup-nomos-circuits.sh b/scripts/setup-nomos-circuits.sh index 8057058..bbd2dd1 100755 --- a/scripts/setup-nomos-circuits.sh +++ b/scripts/setup-nomos-circuits.sh @@ -121,7 +121,7 @@ download_release() { print_error "Please check that version ${VERSION} exists for platform ${platform}" print_error "Available releases: https://github.com/${REPO}/releases" rm -rf "$temp_dir" - exit 1 + return 1 fi print_success "Download complete" @@ -132,7 +132,7 @@ download_release() { if ! tar -xzf "${temp_dir}/${artifact}" -C "$INSTALL_DIR" --strip-components=1; then print_error "Failed to extract archive" rm -rf "$temp_dir" - exit 1 + return 1 fi rm -rf "$temp_dir" @@ -171,8 +171,18 @@ main() { # Check existing installation check_existing_installation - # Download and extract - download_release "$platform" + # Download and extract (retry with x86_64 bundle on aarch64 if needed) + if ! download_release "$platform"; then + if [[ "$platform" == linux-aarch64 ]]; then + print_warning "Falling back to linux-x86_64 circuits bundle; will rebuild prover for aarch64." + rm -rf "$INSTALL_DIR" + if ! download_release "linux-x86_64"; then + exit 1 + fi + else + exit 1 + fi + fi # Handle macOS quarantine if needed if [[ "$platform" == macos-* ]]; then diff --git a/testing-framework/configs/src/nodes/executor.rs b/testing-framework/configs/src/nodes/executor.rs index eebbd3b..334cf29 100644 --- a/testing-framework/configs/src/nodes/executor.rs +++ b/testing-framework/configs/src/nodes/executor.rs @@ -82,7 +82,7 @@ pub fn create_executor_config(config: GeneralConfig) -> ExecutorConfig { // non-string keys and keep services alive. 
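        // The change below stops hard-coding a 3s prolonged bootstrap period
        // and reads it from config.bootstrapping_config instead, so slower
        // environments can extend bootstrap timing without patching node code.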
recovery_file: PathBuf::new(), bootstrap: chain_service::BootstrapConfig { - prolonged_bootstrap_period: Duration::from_secs(3), + prolonged_bootstrap_period: config.bootstrapping_config.prolonged_bootstrap_period, force_bootstrap: false, offline_grace_period: chain_service::OfflineGracePeriodConfig { grace_period: Duration::from_secs(20 * 60), diff --git a/testing-framework/runners/k8s/src/assets.rs b/testing-framework/runners/k8s/src/assets.rs index 7117a8c..9258998 100644 --- a/testing-framework/runners/k8s/src/assets.rs +++ b/testing-framework/runners/k8s/src/assets.rs @@ -204,7 +204,8 @@ fn build_values(topology: &GeneratedTopology) -> HelmValues { let validators = topology .validators() .iter() - .map(|validator| { + .enumerate() + .map(|(index, validator)| { let mut env = BTreeMap::new(); env.insert( "CFG_NETWORK_PORT".into(), @@ -225,6 +226,8 @@ fn build_values(topology: &GeneratedTopology) -> HelmValues { .port() .to_string(), ); + env.insert("CFG_HOST_KIND".into(), "validator".into()); + env.insert("CFG_HOST_IDENTIFIER".into(), format!("validator-{index}")); NodeValues { api_port: validator.general.api_config.address.port(), @@ -237,7 +240,8 @@ fn build_values(topology: &GeneratedTopology) -> HelmValues { let executors = topology .executors() .iter() - .map(|executor| { + .enumerate() + .map(|(index, executor)| { let mut env = BTreeMap::new(); env.insert( "CFG_NETWORK_PORT".into(), @@ -258,6 +262,8 @@ fn build_values(topology: &GeneratedTopology) -> HelmValues { .port() .to_string(), ); + env.insert("CFG_HOST_KIND".into(), "executor".into()); + env.insert("CFG_HOST_IDENTIFIER".into(), format!("executor-{index}")); NodeValues { api_port: executor.general.api_config.address.port(), diff --git a/testing-framework/runners/k8s/src/runner.rs b/testing-framework/runners/k8s/src/runner.rs index c917d7f..6241ad5 100644 --- a/testing-framework/runners/k8s/src/runner.rs +++ b/testing-framework/runners/k8s/src/runner.rs @@ -22,7 +22,7 @@ use crate::{ helm::{HelmError, install_release}, host::node_host, logs::dump_namespace_logs, - wait::{ClusterPorts, ClusterWaitError, NodeConfigPorts, wait_for_cluster_ready}, + wait::{ClusterPorts, ClusterReady, ClusterWaitError, NodeConfigPorts, wait_for_cluster_ready}, }; pub struct K8sRunner { @@ -66,6 +66,7 @@ struct ClusterEnvironment { executor_api_ports: Vec<u16>, executor_testing_ports: Vec<u16>, prometheus_port: u16, + port_forwards: Vec<std::process::Child>, } impl ClusterEnvironment { @@ -75,6 +76,7 @@ release: String, cleanup: RunnerCleanup, ports: &ClusterPorts, + port_forwards: Vec<std::process::Child>, ) -> Self { Self { client, @@ -86,6 +88,7 @@ executor_api_ports: ports.executors.iter().map(|ports| ports.api).collect(), executor_testing_ports: ports.executors.iter().map(|ports| ports.testing).collect(), prometheus_port: ports.prometheus, + port_forwards, } } @@ -97,15 +100,17 @@ "k8s stack failure; collecting diagnostics" ); dump_namespace_logs(&self.client, &self.namespace).await; + kill_port_forwards(&mut self.port_forwards); if let Some(guard) = self.cleanup.take() { Box::new(guard).cleanup(); } } - fn into_cleanup(mut self) -> RunnerCleanup { - self.cleanup .take() .expect("cleanup guard should be available") + fn into_cleanup(self) -> (RunnerCleanup, Vec<std::process::Child>) { + ( self.cleanup.expect("cleanup guard should be available"), self.port_forwards, ) + } } @@ -264,12 +269,15 @@ impl Deployer for K8sRunner { return Err(err); } }; - let cleanup = cluster + let (cleanup, port_forwards) = cluster .take()
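            // Consuming the environment hands back both the cleanup guard and
            // the live kubectl port-forward children, so the guard can reap
            // the forwards when the run tears down.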
.expect("cluster should still be available") .into_cleanup(); - let cleanup_guard: Box = - Box::new(K8sCleanupGuard::new(cleanup, block_feed_guard)); + let cleanup_guard: Box = Box::new(K8sCleanupGuard::new( + cleanup, + block_feed_guard, + port_forwards, + )); let context = RunContext::new( descriptors, None, @@ -301,6 +309,14 @@ fn ensure_supported_topology(descriptors: &GeneratedTopology) -> Result<(), K8sR Ok(()) } +fn kill_port_forwards(handles: &mut Vec) { + for handle in handles.iter_mut() { + let _ = handle.kill(); + let _ = handle.wait(); + } + handles.clear(); +} + fn collect_port_specs(descriptors: &GeneratedTopology) -> PortSpecs { let validators = descriptors .validators() @@ -386,11 +402,11 @@ async fn setup_cluster( let mut cleanup_guard = Some(install_stack(client, &assets, &namespace, &release, validators, executors).await?); - let cluster_ports = + let cluster_ready = wait_for_ports_or_cleanup(client, &namespace, &release, specs, &mut cleanup_guard).await?; info!( - prometheus_port = cluster_ports.prometheus, + prometheus_port = cluster_ready.ports.prometheus, "discovered prometheus endpoint" ); @@ -401,7 +417,8 @@ async fn setup_cluster( cleanup_guard .take() .expect("cleanup guard must exist after successful cluster startup"), - &cluster_ports, + &cluster_ready.ports, + cluster_ready.port_forwards, ); if readiness_checks { @@ -448,7 +465,7 @@ async fn wait_for_ports_or_cleanup( release: &str, specs: &PortSpecs, cleanup_guard: &mut Option, -) -> Result { +) -> Result { match wait_for_cluster_ready( client, namespace, @@ -498,13 +515,19 @@ async fn ensure_cluster_readiness( struct K8sCleanupGuard { cleanup: RunnerCleanup, block_feed: Option, + port_forwards: Vec, } impl K8sCleanupGuard { - const fn new(cleanup: RunnerCleanup, block_feed: BlockFeedTask) -> Self { + const fn new( + cleanup: RunnerCleanup, + block_feed: BlockFeedTask, + port_forwards: Vec, + ) -> Self { Self { cleanup, block_feed: Some(block_feed), + port_forwards, } } } @@ -514,6 +537,7 @@ impl CleanupGuard for K8sCleanupGuard { if let Some(block_feed) = self.block_feed.take() { CleanupGuard::cleanup(Box::new(block_feed)); } + kill_port_forwards(&mut self.port_forwards); CleanupGuard::cleanup(Box::new(self.cleanup)); } } diff --git a/testing-framework/runners/k8s/src/wait.rs b/testing-framework/runners/k8s/src/wait.rs index 3453198..f983acf 100644 --- a/testing-framework/runners/k8s/src/wait.rs +++ b/testing-framework/runners/k8s/src/wait.rs @@ -1,4 +1,9 @@ -use std::time::Duration; +use std::{ + net::{Ipv4Addr, TcpListener, TcpStream}, + process::{Command as StdCommand, Stdio}, + thread, + time::Duration, +}; use k8s_openapi::api::{apps::v1::Deployment, core::v1::Service}; use kube::{Api, Client, Error as KubeError}; @@ -9,7 +14,12 @@ use tokio::time::sleep; use crate::host::node_host; const DEPLOYMENT_TIMEOUT: Duration = Duration::from_secs(180); +const NODE_HTTP_TIMEOUT: Duration = Duration::from_secs(240); +const NODE_HTTP_PROBE_TIMEOUT: Duration = Duration::from_secs(30); +const HTTP_POLL_INTERVAL: Duration = Duration::from_secs(1); const PROMETHEUS_HTTP_PORT: u16 = 9090; +const PROMETHEUS_HTTP_TIMEOUT: Duration = Duration::from_secs(240); +const PROMETHEUS_HTTP_PROBE_TIMEOUT: Duration = Duration::from_secs(30); const PROMETHEUS_SERVICE_NAME: &str = "prometheus"; #[derive(Clone, Copy)] @@ -30,6 +40,11 @@ pub struct ClusterPorts { pub prometheus: u16, } +pub struct ClusterReady { + pub ports: ClusterPorts, + pub port_forwards: Vec, +} + #[derive(Debug, Error)] pub enum ClusterWaitError { 
#[error("deployment {name} in namespace {namespace} did not become ready within {timeout:?}")] @@ -62,6 +77,13 @@ pub enum ClusterWaitError { }, #[error("timeout waiting for prometheus readiness on NodePort {port}")] PrometheusTimeout { port: u16 }, + #[error("failed to start port-forward for service {service} port {port}: {source}")] + PortForward { + service: String, + port: u16, + #[source] + source: anyhow::Error, + }, } pub async fn wait_for_deployment_ready( @@ -159,7 +181,7 @@ pub async fn wait_for_cluster_ready( release: &str, validator_ports: &[NodeConfigPorts], executor_ports: &[NodeConfigPorts], -) -> Result { +) -> Result { if validator_ports.is_empty() { return Err(ClusterWaitError::MissingValidator); } @@ -177,11 +199,40 @@ pub async fn wait_for_cluster_ready( }); } + let mut port_forwards = Vec::new(); + let validator_api_ports: Vec = validator_allocations .iter() .map(|ports| ports.api) .collect(); - wait_for_node_http(&validator_api_ports, NodeRole::Validator).await?; + if wait_for_node_http_nodeport( + &validator_api_ports, + NodeRole::Validator, + NODE_HTTP_PROBE_TIMEOUT, + ) + .await + .is_err() + { + // Fall back to port-forwarding when NodePorts are unreachable from the host. + validator_allocations.clear(); + port_forwards = port_forward_group( + namespace, + release, + "validator", + validator_ports, + &mut validator_allocations, + )?; + let validator_api_ports: Vec = validator_allocations + .iter() + .map(|ports| ports.api) + .collect(); + if let Err(err) = + wait_for_node_http_port_forward(&validator_api_ports, NodeRole::Validator).await + { + kill_port_forwards(&mut port_forwards); + return Err(err); + } + } let mut executor_allocations = Vec::with_capacity(executor_ports.len()); for (index, ports) in executor_ports.iter().enumerate() { @@ -195,39 +246,102 @@ pub async fn wait_for_cluster_ready( }); } - if !executor_allocations.is_empty() { + let executor_api_ports: Vec = executor_allocations.iter().map(|ports| ports.api).collect(); + if !executor_allocations.is_empty() + && wait_for_node_http_nodeport( + &executor_api_ports, + NodeRole::Executor, + NODE_HTTP_PROBE_TIMEOUT, + ) + .await + .is_err() + { + executor_allocations.clear(); + match port_forward_group( + namespace, + release, + "executor", + executor_ports, + &mut executor_allocations, + ) { + Ok(forwards) => port_forwards.extend(forwards), + Err(err) => { + kill_port_forwards(&mut port_forwards); + return Err(err); + } + } let executor_api_ports: Vec = executor_allocations.iter().map(|ports| ports.api).collect(); - wait_for_node_http(&executor_api_ports, NodeRole::Executor).await?; + if let Err(err) = + wait_for_node_http_port_forward(&executor_api_ports, NodeRole::Executor).await + { + kill_port_forwards(&mut port_forwards); + return Err(err); + } } - let prometheus_port = find_node_port( + let mut prometheus_port = find_node_port( client, namespace, PROMETHEUS_SERVICE_NAME, PROMETHEUS_HTTP_PORT, ) .await?; - wait_for_prometheus_http(prometheus_port).await?; + if wait_for_prometheus_http_nodeport(prometheus_port, PROMETHEUS_HTTP_PROBE_TIMEOUT) + .await + .is_err() + { + let (local_port, forward) = + port_forward_service(namespace, PROMETHEUS_SERVICE_NAME, PROMETHEUS_HTTP_PORT) + .map_err(|err| { + kill_port_forwards(&mut port_forwards); + err + })?; + prometheus_port = local_port; + port_forwards.push(forward); + if let Err(err) = + wait_for_prometheus_http_port_forward(prometheus_port, PROMETHEUS_HTTP_TIMEOUT).await + { + kill_port_forwards(&mut port_forwards); + return Err(err); + } + } - 
Ok(ClusterPorts { - validators: validator_allocations, - executors: executor_allocations, - prometheus: prometheus_port, + Ok(ClusterReady { + ports: ClusterPorts { + validators: validator_allocations, + executors: executor_allocations, + prometheus: prometheus_port, + }, + port_forwards, }) } -async fn wait_for_node_http(ports: &[u16], role: NodeRole) -> Result<(), ClusterWaitError> { +async fn wait_for_node_http_nodeport( + ports: &[u16], + role: NodeRole, + timeout: Duration, +) -> Result<(), ClusterWaitError> { let host = node_host(); - http_probe::wait_for_http_ports_with_host( - ports, - role, - &host, - Duration::from_secs(240), - Duration::from_secs(1), - ) - .await - .map_err(map_http_error) + wait_for_node_http_on_host(ports, role, &host, timeout).await +} + +async fn wait_for_node_http_port_forward( + ports: &[u16], + role: NodeRole, +) -> Result<(), ClusterWaitError> { + wait_for_node_http_on_host(ports, role, "127.0.0.1", NODE_HTTP_TIMEOUT).await +} + +async fn wait_for_node_http_on_host( + ports: &[u16], + role: NodeRole, + host: &str, + timeout: Duration, +) -> Result<(), ClusterWaitError> { + http_probe::wait_for_http_ports_with_host(ports, role, host, timeout, HTTP_POLL_INTERVAL) + .await + .map_err(map_http_error) } const fn map_http_error(error: HttpReadinessError) -> ClusterWaitError { @@ -238,11 +352,30 @@ } } -pub async fn wait_for_prometheus_http(port: u16) -> Result<(), ClusterWaitError> { - let client = reqwest::Client::new(); - let url = format!("http://{}:{port}/-/ready", node_host()); +pub async fn wait_for_prometheus_http_nodeport( + port: u16, + timeout: Duration, +) -> Result<(), ClusterWaitError> { + let host = node_host(); + wait_for_prometheus_http(&host, port, timeout).await +} - for _ in 0..240 { +pub async fn wait_for_prometheus_http_port_forward( + port: u16, + timeout: Duration, +) -> Result<(), ClusterWaitError> { + wait_for_prometheus_http("127.0.0.1", port, timeout).await +} + +pub async fn wait_for_prometheus_http( + host: &str, + port: u16, + timeout: Duration, +) -> Result<(), ClusterWaitError> { + let client = reqwest::Client::new(); + let url = format!("http://{host}:{port}/-/ready"); + + for _ in 0..timeout.as_secs() { if let Ok(resp) = client.get(&url).send().await && resp.status().is_success() { @@ -253,3 +386,101 @@ pub async fn wait_for_prometheus_http(port: u16) -> Result<(), ClusterWaitError> Err(ClusterWaitError::PrometheusTimeout { port }) } + +fn port_forward_group( + namespace: &str, + release: &str, + kind: &str, + ports: &[NodeConfigPorts], + allocations: &mut Vec<NodePortAllocation>, +) -> Result<Vec<std::process::Child>, ClusterWaitError> { + let mut forwards = Vec::new(); + for (index, ports) in ports.iter().enumerate() { + let service = format!("{release}-{kind}-{index}"); + let (api_port, api_forward) = match port_forward_service(namespace, &service, ports.api) { + Ok(forward) => forward, + Err(err) => { + kill_port_forwards(&mut forwards); + return Err(err); + } + }; + let (testing_port, testing_forward) = + match port_forward_service(namespace, &service, ports.testing) { + Ok(forward) => forward, + Err(err) => { + kill_port_forwards(&mut forwards); + return Err(err); + } + }; + allocations.push(NodePortAllocation { + api: api_port, + testing: testing_port, + }); + forwards.push(api_forward); + forwards.push(testing_forward); + } + Ok(forwards) +} + +fn port_forward_service( + namespace: &str, + service: &str, + remote_port: u16, +) -> Result<(u16, std::process::Child), ClusterWaitError> { + let
local_port = allocate_local_port().map_err(|source| ClusterWaitError::PortForward { service: service.to_owned(), port: remote_port, source, })?; + + let mut child = StdCommand::new("kubectl") + .arg("port-forward") + .arg("-n") + .arg(namespace) + .arg(format!("svc/{service}")) + .arg(format!("{local_port}:{remote_port}")) + .stdout(Stdio::null()) + .stderr(Stdio::null()) + .spawn() + .map_err(|source| ClusterWaitError::PortForward { + service: service.to_owned(), + port: remote_port, + source: source.into(), + })?; + + for _ in 0..20 { + if let Ok(Some(status)) = child.try_wait() { + return Err(ClusterWaitError::PortForward { + service: service.to_owned(), + port: remote_port, + source: anyhow::anyhow!("kubectl exited with {status}"), + }); + } + if TcpStream::connect((Ipv4Addr::LOCALHOST, local_port)).is_ok() { + return Ok((local_port, child)); + } + thread::sleep(Duration::from_millis(250)); + } + + let _ = child.kill(); + Err(ClusterWaitError::PortForward { + service: service.to_owned(), + port: remote_port, + source: anyhow::anyhow!("port-forward did not become ready"), + }) +} + +fn allocate_local_port() -> anyhow::Result<u16> { + let listener = TcpListener::bind((Ipv4Addr::LOCALHOST, 0))?; + let port = listener.local_addr()?.port(); + drop(listener); + Ok(port) +} + +fn kill_port_forwards(handles: &mut Vec<std::process::Child>) { + for handle in handles.iter_mut() { + let _ = handle.kill(); + let _ = handle.wait(); + } + handles.clear(); +} diff --git a/testnet/Dockerfile b/testnet/Dockerfile index 1a6338a..4955e5f 100644 --- a/testnet/Dockerfile +++ b/testnet/Dockerfile @@ -2,7 +2,8 @@ # check=skip=SecretsUsedInArgOrEnv # Ignore warnings about sensitive information as this is test data. -ARG VERSION=v0.2.0 +ARG VERSION=v0.3.1 +ARG CIRCUITS_OVERRIDE # =========================== # BUILD IMAGE # =========================== FROM rust:1.91.0-slim-bookworm AS builder ARG VERSION +ARG CIRCUITS_OVERRIDE LABEL maintainer="augustinas@status.im" \ source="https://github.com/logos-co/nomos-node" \ description="Nomos testnet build image" -WORKDIR /nomos +WORKDIR /workspace COPY . . # Install dependencies needed for building RocksDB. RUN apt-get update && apt-get install -yq \ - git gcc g++ clang libssl-dev pkg-config ca-certificates curl + git gcc g++ clang make cmake m4 xz-utils libgmp-dev libssl-dev pkg-config ca-certificates curl wget -RUN chmod +x scripts/setup-nomos-circuits.sh && \ - scripts/setup-nomos-circuits.sh "$VERSION" "/opt/circuits" +RUN mkdir -p /opt/circuits && \ + select_circuits_source() { \ + # Prefer an explicit override when it exists (file or directory). \ + if [ -n "$CIRCUITS_OVERRIDE" ] && [ -e "/workspace/${CIRCUITS_OVERRIDE}" ]; then \ + echo "/workspace/${CIRCUITS_OVERRIDE}"; \ + return 0; \ + fi; \ + # Fall back to the workspace bundle shipped with the repo. \ + if [ -e "/workspace/tests/kzgrs/kzgrs_test_params" ]; then \ + echo "/workspace/tests/kzgrs/kzgrs_test_params"; \ + return 0; \ + fi; \ + return 1; \ + }; \ + if CIRCUITS_PATH="$(select_circuits_source)"; then \ + echo "Using prebuilt circuits bundle from ${CIRCUITS_PATH#/workspace/}"; \ + if [ -d "$CIRCUITS_PATH" ]; then \ + cp -R "${CIRCUITS_PATH}/." /opt/circuits; \ + else \ + cp "${CIRCUITS_PATH}" /opt/circuits/; \ + fi; \ + fi; \ + if [ ! 
-f "/opt/circuits/pol/verification_key.json" ]; then \ + echo "Local circuits missing pol artifacts; downloading ${VERSION} bundle and rebuilding"; \ + chmod +x scripts/setup-nomos-circuits.sh && \ + NOMOS_CIRCUITS_REBUILD_RAPIDSNARK=1 \ + RAPIDSNARK_BUILD_GMP=1 \ + scripts/setup-nomos-circuits.sh "$VERSION" "/opt/circuits"; \ + fi ENV NOMOS_CIRCUITS=/opt/circuits +ENV CARGO_TARGET_DIR=/workspace/target -RUN cargo build --release --all-features +# Fetch the nomos-node sources pinned in Cargo.lock and build the runtime binaries. +RUN git clone https://github.com/logos-co/nomos-node.git /workspace/nomos-node && \ + cd /workspace/nomos-node && \ + git fetch --depth 1 origin 2f60a0372c228968c3526c341ebc7e58bbd178dd && \ + git checkout 2f60a0372c228968c3526c341ebc7e58bbd178dd && \ + cargo build --release --all-features --bins + +# Build cfgsync binaries from this workspace. +RUN cargo build --release --locked --manifest-path /workspace/testnet/cfgsync/Cargo.toml --bins # =========================== # NODE IMAGE @@ -50,11 +88,11 @@ RUN apt-get update && apt-get install -yq \ COPY --from=builder /opt/circuits /opt/circuits -COPY --from=builder /nomos/target/release/nomos-node /usr/bin/nomos-node -COPY --from=builder /nomos/target/release/nomos-executor /usr/bin/nomos-executor -COPY --from=builder /nomos/target/release/nomos-cli /usr/bin/nomos-cli -COPY --from=builder /nomos/target/release/cfgsync-server /usr/bin/cfgsync-server -COPY --from=builder /nomos/target/release/cfgsync-client /usr/bin/cfgsync-client +COPY --from=builder /workspace/target/release/nomos-node /usr/bin/nomos-node +COPY --from=builder /workspace/target/release/nomos-executor /usr/bin/nomos-executor +COPY --from=builder /workspace/target/release/nomos-cli /usr/bin/nomos-cli +COPY --from=builder /workspace/target/release/cfgsync-server /usr/bin/cfgsync-server +COPY --from=builder /workspace/target/release/cfgsync-client /usr/bin/cfgsync-client ENV NOMOS_CIRCUITS=/opt/circuits diff --git a/testnet/scripts/build_test_image.sh b/testnet/scripts/build_test_image.sh new file mode 100755 index 0000000..f25dbd9 --- /dev/null +++ b/testnet/scripts/build_test_image.sh @@ -0,0 +1,38 @@ +#!/bin/bash +set -euo pipefail + +# Builds the testnet image with circuits. Prefers a local circuits bundle +# (tests/kzgrs/kzgrs_test_params) or a custom override; otherwise downloads +# from logos-co/nomos-circuits. + +ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)" +IMAGE_TAG="${IMAGE_TAG:-nomos-testnet:local}" +VERSION="${VERSION:-v0.3.1}" +CIRCUITS_OVERRIDE="${CIRCUITS_OVERRIDE:-tests/kzgrs/kzgrs_test_params}" + +echo "Workspace root: ${ROOT_DIR}" +echo "Image tag: ${IMAGE_TAG}" +echo "Circuits override: ${CIRCUITS_OVERRIDE:-}" +echo "Circuits version (fallback download): ${VERSION}" + +build_args=( + -f "${ROOT_DIR}/testnet/Dockerfile" + -t "${IMAGE_TAG}" + "${ROOT_DIR}" +) + +# Pass override/version args to the Docker build. 
+if [ -n "${CIRCUITS_OVERRIDE}" ]; then + build_args+=(--build-arg "CIRCUITS_OVERRIDE=${CIRCUITS_OVERRIDE}") +fi +build_args+=(--build-arg "VERSION=${VERSION}") + +echo "Running: docker build ${build_args[*]}" +docker build "${build_args[@]}" + +cat <&2; exit 1 ;; + esac + case "$(uname -m)" in + x86_64) arch="x86_64" ;; + aarch64|arm64) arch="aarch64" ;; + *) echo "Unsupported architecture: $(uname -m)" >&2; exit 1 ;; + esac + echo "${os}-${arch}" +} + +download_release() { + local platform="$1" + local artifact="nomos-circuits-${VERSION}-${platform}.tar.gz" + local url="https://github.com/${REPO}/releases/download/${VERSION}/${artifact}" + local temp_dir + temp_dir=$(mktemp -d) + + echo "Downloading nomos-circuits ${VERSION} for ${platform}..." + if [ -n "${GITHUB_TOKEN:-}" ]; then + auth_header="Authorization: Bearer ${GITHUB_TOKEN}" + else + auth_header="" + fi + + if ! curl -L ${auth_header:+-H "$auth_header"} -o "${temp_dir}/${artifact}" "${url}"; then + echo "Failed to download release artifact from ${url}" >&2 + rm -rf "${temp_dir}" + exit 1 + fi + + echo "Extracting to ${INSTALL_DIR}..." + rm -rf "${INSTALL_DIR}" + mkdir -p "${INSTALL_DIR}" + if ! tar -xzf "${temp_dir}/${artifact}" -C "${INSTALL_DIR}" --strip-components=1; then + echo "Failed to extract ${artifact}" >&2 + rm -rf "${temp_dir}" + exit 1 + fi + rm -rf "${temp_dir}" +} + +platform=$(detect_platform) +echo "Setting up nomos-circuits ${VERSION} for ${platform}" +echo "Installing to ${INSTALL_DIR}" + +download_release "${platform}" + +echo "Installation complete. Circuits installed at: ${INSTALL_DIR}" +echo "If using a custom directory, set NOMOS_CIRCUITS=${INSTALL_DIR}"
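+# Example follow-up (illustrative; use the directory echoed above):
+#   export NOMOS_CIRCUITS="${INSTALL_DIR}"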