demo-apps: kvstore, queue, and openraft_kv

Andrus Salumets 2026-04-14 21:10:18 +07:00 committed by GitHub
commit 8700bd5a6c
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
102 changed files with 7454 additions and 74 deletions

View File

@ -6,7 +6,11 @@ exclude-dev = true
no-default-features = true
[advisories]
-ignore = []
+ignore = [
+# Existing workspace dependencies still resolve rand 0.8 via tera/tokio-retry.
+# Track removal when those upstream edges move to a fixed release.
+"RUSTSEC-2026-0097",
+]
yanked = "deny"
[bans]

Cargo.lock (generated): 766 changed lines
File diff suppressed because it is too large

View File

@ -4,6 +4,18 @@ members = [
"cfgsync/artifacts",
"cfgsync/core",
"cfgsync/runtime",
"examples/kvstore/examples",
"examples/kvstore/kvstore-node",
"examples/kvstore/testing/integration",
"examples/kvstore/testing/workloads",
"examples/openraft_kv/examples",
"examples/openraft_kv/openraft-kv-node",
"examples/openraft_kv/testing/integration",
"examples/openraft_kv/testing/workloads",
"examples/queue/examples",
"examples/queue/queue-node",
"examples/queue/testing/integration",
"examples/queue/testing/workloads",
"examples/metrics_counter/examples",
"examples/metrics_counter/metrics-counter-node",
"examples/metrics_counter/testing/integration",
@ -56,6 +68,8 @@ bytes = { default-features = false, version = "1.3" }
hex = { default-features = false, version = "0.4.3" }
libp2p = { default-features = false, version = "0.55" }
num-bigint = { default-features = false, version = "0.4" }
openraft = { default-features = true, features = ["serde", "type-alias"], version = "0.10.0-alpha.17" }
openraft-memstore = { default-features = true, version = "0.10.0-alpha.17" }
parking_lot = { default-features = false, version = "0.12" }
rand = { default-features = false, features = ["std", "std_rng"], version = "0.8" }
reqwest = { default-features = false, version = "0.12" }

View File

@ -0,0 +1,314 @@
# Observation Runtime Plan
## Why this work exists
TF is good at deployment plumbing. It is weak at continuous observation.
Today, the same problems are solved repeatedly with custom loops:
- TF block feed logic in Logos
- Cucumber manual-cluster polling loops
- ad hoc catch-up scans for wallet and chain state
- app-local state polling in expectations
That is the gap this work should close.
The goal is not a generic "distributed systems DSL".
The goal is one reusable observation runtime that:
- continuously collects data from dynamic sources
- keeps typed materialized state
- exposes both current snapshot and delta/history views
- fits naturally in TF scenarios and Cucumber manual-cluster code
## Constraints
### TF constraints
- TF abstractions must stay universal and simple.
- TF must not know app semantics like blocks, wallets, leaders, jobs, or topics.
- TF must remain useful for simple apps such as `openraft_kv`, not only Logos.
### App constraints
- Apps must be able to build richer abstractions on top of TF.
- Logos must be able to support:
- current block-feed replacement
- fork-aware chain state
- public-peer sync targets
- multi-wallet UTXO tracking
- Apps must be able to adopt this incrementally.
### Migration constraints
- We do not want a flag-day rewrite.
- Existing loops can coexist with the new runtime until replacements are proven.
## Non-goals
This work should not:
- put feed back onto the base `Application` trait
- build app-specific semantics into TF core
- replace filesystem blockchain snapshots used for startup/restore
- force every app to use continuous observation
- introduce a large public abstraction stack that nobody can explain
## Core idea
Introduce one TF-level observation runtime.
That runtime owns:
- source refresh
- scheduling
- polling/ingestion
- bounded history
- latest snapshot caching
- delta publication
- freshness/error tracking
- lifecycle hooks for TF and Cucumber
Apps own:
- source types
- raw observation logic
- materialized state
- snapshot shape
- delta/event shape
- higher-level projections such as wallet state
## Public TF surface
The TF public surface should stay small.
### `ObservedSource<S>`
A named source instance.
Used for:
- local node clients
- public peer endpoints
- any other app-owned source type
### `SourceProvider<S>`
Returns the current source set.
This must support dynamic source lists because:
- manual cluster nodes come and go
- Cucumber worlds may attach public peers
- node control may restart or replace sources during a run
### `Observer`
App-owned observation logic.
It defines:
- `Source`
- `State`
- `Snapshot`
- `Event`
And it implements:
- `init(...)`
- `poll(...)`
- `snapshot(...)`
The important boundary is:
- TF owns the runtime
- app code owns materialization
### `ObservationRuntime`
The engine that:
- starts the loop
- refreshes sources
- calls `poll(...)`
- stores history
- publishes deltas
- updates latest snapshot
- tracks last error and freshness
### `ObservationHandle`
The read-side interface for workloads, expectations, and Cucumber steps.
It should expose at least:
- latest snapshot
- delta subscription
- bounded history
- last error
## Intended shape
```rust
pub struct ObservedSource<S> {
pub name: String,
pub source: S,
}
#[async_trait]
pub trait SourceProvider<S>: Send + Sync + 'static {
async fn sources(&self) -> Vec<ObservedSource<S>>;
}
#[async_trait]
pub trait Observer: Send + Sync + 'static {
type Source: Clone + Send + Sync + 'static;
type State: Send + Sync + 'static;
type Snapshot: Clone + Send + Sync + 'static;
type Event: Clone + Send + Sync + 'static;
async fn init(
&self,
sources: &[ObservedSource<Self::Source>],
) -> Result<Self::State, DynError>;
async fn poll(
&self,
sources: &[ObservedSource<Self::Source>],
state: &mut Self::State,
) -> Result<Vec<Self::Event>, DynError>;
fn snapshot(&self, state: &Self::State) -> Self::Snapshot;
}
```
This is enough.
If more helper layers are needed, they should stay internal first.
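The intended-shape block stops before `ObservationHandle`. One plausible reading of its read side, assuming deltas ride on a tokio broadcast channel; nothing here is committed API:
```rust
use std::{
    collections::VecDeque,
    sync::{Arc, RwLock},
};

use tokio::sync::broadcast;

// Sketch only: every field and method name beyond the bullet list above is
// an assumption, not a committed API.
pub struct ObservationHandle<O: Observer> {
    latest: Arc<RwLock<Option<O::Snapshot>>>,
    history: Arc<RwLock<VecDeque<O::Event>>>, // bounded by the runtime
    last_error: Arc<RwLock<Option<String>>>,
    events: broadcast::Sender<O::Event>,
}

impl<O: Observer> ObservationHandle<O> {
    /// Latest materialized snapshot, if the observer has produced one.
    pub fn latest_snapshot(&self) -> Option<O::Snapshot> {
        self.latest.read().expect("observation lock poisoned").clone()
    }

    /// Subscribes to deltas published after this call.
    pub fn subscribe(&self) -> broadcast::Receiver<O::Event> {
        self.events.subscribe()
    }

    /// Bounded history of recent deltas.
    pub fn history(&self) -> Vec<O::Event> {
        self.history
            .read()
            .expect("observation lock poisoned")
            .iter()
            .cloned()
            .collect()
    }

    /// Most recent poll error, if any.
    pub fn last_error(&self) -> Option<String> {
        self.last_error.read().expect("observation lock poisoned").clone()
    }
}
```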
## How current use cases fit
### `openraft_kv`
Use one simple observer.
- sources: node clients
- state: latest per-node Raft state
- snapshot: sorted node-state view
- events: optional deltas, possibly empty at first
This is the simplest proving case.
It validates the runtime without dragging in Logos complexity.
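As a shape check, here is roughly what that observer could look like against the traits sketched above. `RaftNodeClient` and `RaftNodeState` are illustrative placeholders, not existing types:
```rust
use std::collections::BTreeMap;

use async_trait::async_trait;

// `Observer`, `ObservedSource`, and `DynError` refer to the intended-shape
// sketch above; the two types below are invented for illustration.
#[derive(Clone)]
pub struct RaftNodeClient; // wraps one node's HTTP endpoint

#[derive(Clone, Debug, PartialEq)]
pub struct RaftNodeState; // role, leader, membership, log progress

impl RaftNodeClient {
    /// Placeholder for a `GET /state` call against one node.
    pub async fn fetch_state(&self) -> Result<RaftNodeState, DynError> {
        Ok(RaftNodeState)
    }
}

pub struct OpenRaftKvObserver;

#[async_trait]
impl Observer for OpenRaftKvObserver {
    type Source = RaftNodeClient;
    type State = BTreeMap<String, RaftNodeState>;
    type Snapshot = Vec<(String, RaftNodeState)>;
    type Event = (); // no deltas at first, matching the plan

    async fn init(
        &self,
        _sources: &[ObservedSource<Self::Source>],
    ) -> Result<Self::State, DynError> {
        Ok(BTreeMap::new())
    }

    async fn poll(
        &self,
        sources: &[ObservedSource<Self::Source>],
        state: &mut Self::State,
    ) -> Result<Vec<Self::Event>, DynError> {
        for source in sources {
            // Overwrite the previous view of this node; the BTreeMap keeps
            // the "sorted node-state view" ordering for free.
            let node_state = source.source.fetch_state().await?;
            state.insert(source.name.clone(), node_state);
        }
        Ok(Vec::new())
    }

    fn snapshot(&self, state: &Self::State) -> Self::Snapshot {
        state
            .iter()
            .map(|(name, node_state)| (name.clone(), node_state.clone()))
            .collect()
    }
}
```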
### Logos block feed replacement
Use one shared chain observer.
- sources: local node clients
- state:
- node heads
- block graph
- heights
- seen headers
- recent history
- snapshot:
- current head/lib/graph summary
- events:
- newly discovered blocks
This covers both existing Logos feed use cases:
- current snapshot consumers
- delta/subscription consumers
### Cucumber manual-cluster sync
Use the same observer runtime with a different source set.
- sources:
- local manual-cluster node clients
- public peer endpoints
- state:
- local consensus views
- public consensus views
- derived majority public target
- snapshot:
- current local and public sync picture
This removes custom poll/sleep loops from steps.
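A step helper then reduces to a predicate over the handle's latest snapshot. A hedged sketch, reusing the `ObservationHandle` shape above; `SyncObserver` and its snapshot fields are invented for illustration:
```rust
use std::time::Duration;

// The generic wait loop lives once in shared code instead of being
// re-implemented in every step definition.
async fn wait_until_synced(
    handle: &ObservationHandle<SyncObserver>,
    timeout: Duration,
) -> Result<(), DynError> {
    let deadline = tokio::time::Instant::now() + timeout;
    while tokio::time::Instant::now() < deadline {
        if let Some(snapshot) = handle.latest_snapshot() {
            // The observer derives the majority public target; the step only
            // states the condition it is waiting for.
            if snapshot.local_height >= snapshot.public_target_height {
                return Ok(());
            }
        }
        tokio::time::sleep(Duration::from_millis(200)).await;
    }
    Err("local nodes did not reach the public sync target in time".into())
}
```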
### Multi-wallet fork-aware tracking
This should not be a TF concept.
It should be a Logos projection built on top of the shared chain observer.
- input: chain observer state
- output: per-header wallet state cache keyed by block header
- property: naturally fork-aware because it follows actual ancestry
That replaces repeated backward scans from tip with continuous maintained state.
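At signature level the projection is just a fold over newly observed blocks; every type below is a hypothetical Logos-side stand-in, not TF API:
```rust
use std::collections::HashMap;

#[derive(Clone, Copy, PartialEq, Eq, Hash)]
struct HeaderId(u64);

struct Block {
    header: HeaderId,
    parent: HeaderId,
    // transactions elided
}

#[derive(Clone, Default)]
struct WalletState; // per-wallet UTXO view, elided

impl WalletState {
    fn apply(&self, _block: &Block) -> WalletState {
        // Fold the block's transactions into this wallet view.
        self.clone()
    }
}

/// Folds newly observed blocks into a per-header wallet cache. Keying by
/// header and starting from the parent's entry makes the cache fork-aware:
/// divergent branches get divergent entries.
fn update_wallet_cache(cache: &mut HashMap<HeaderId, WalletState>, new_blocks: &[Block]) {
    for block in new_blocks {
        let parent_state = cache.get(&block.parent).cloned().unwrap_or_default();
        cache.insert(block.header, parent_state.apply(block));
    }
}
```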
## Logos layering
Logos should not put every concern into one giant impl.
Recommended layering:
1. **Chain source adapter**
- local node reads
- public peer reads
2. **Shared chain observer**
- catch-up
- continuous ingestion
- graph/history materialization
3. **Logos projections**
- head view
- public sync target
- fork graph queries
- wallet state
- tx inclusion helpers
TF provides the runtime.
Logos provides the domain model built on top.
## Adoption plan
### Phase 1: add TF observation runtime
- add `ObservedSource`, `SourceProvider`, `Observer`, `ObservationRuntime`, `ObservationHandle`
- keep the public API small
- no app migrations yet
### Phase 2: prove it on `openraft_kv`
- add one simple observer over `/state`
- migrate one expectation to use the observation handle
- validate local, compose, and k8s
### Phase 3: add Logos shared chain observer
- implement it alongside current feed/loops
- do not remove existing consumers yet
- prove snapshot and delta outputs are useful
### Phase 4: migrate one Logos consumer at a time
Suggested order:
1. fork/head snapshot consumer
2. tx inclusion consumer
3. Cucumber sync-to-public-chain logic
4. wallet/UTXO tracking
### Phase 5: delete old loops and feed paths
- only after the new runtime has replaced real consumers cleanly
## Validation gates
Each phase should have clear checks.
### Runtime-level
- crate-level `cargo check`
- targeted tests for runtime lifecycle and history retention
- explicit tests for dynamic source refresh
### App-level
- `openraft_kv`:
- local failover
- compose failover
- k8s failover
- Logos:
- one snapshot consumer migrated
- one delta consumer migrated
- Cucumber:
- one manual-cluster sync path migrated
## Open questions
These should stay open until implementation forces a decision:
- whether `ObservationHandle` should expose full history directly or only cursor/subscription access
- how much error/freshness metadata belongs in the generic runtime vs app snapshot types
- whether multiple observers should share one scheduler/runtime instance or simply run independently first
## Design guardrails
When implementing this work:
- keep TF public abstractions minimal
- keep app semantics out of TF core
- do not chase a generic testing DSL
- build from reusable blocks, not one-off mega impls
- keep migration incremental
- prefer simple, explainable runtime behavior over clever abstraction

View File

@ -0,0 +1,28 @@
# Build stage
FROM rustlang/rust:nightly-bookworm AS builder
WORKDIR /build
# Copy all workspace files required for workspace build.
COPY Cargo.toml Cargo.lock ./
COPY cfgsync/ ./cfgsync/
COPY examples/ ./examples/
COPY testing-framework/ ./testing-framework/
# Build kvstore-node in release mode.
RUN cargo build --release -p kvstore-node
# Runtime stage
FROM debian:bookworm-slim
RUN apt-get update && \
apt-get install -y ca-certificates && \
rm -rf /var/lib/apt/lists/*
COPY --from=builder /build/target/release/kvstore-node /usr/local/bin/kvstore-node
RUN mkdir -p /etc/kvstore
WORKDIR /app
ENTRYPOINT ["/usr/local/bin/kvstore-node"]
CMD ["--config", "/etc/kvstore/config.yaml"]

View File

@ -0,0 +1,64 @@
# KV Store Example
This example runs a small replicated key-value store. Replication is a simple
pull-based gossip: each node periodically fetches its peers' snapshots and
merges them with last-writer-wins semantics.
The usual scenario writes keys through one node and checks that the other nodes
eventually return the same values.
## How TF runs this
Each example follows the same pattern:
- TF starts a small deployment of kvstore nodes
- a workload writes keys through one node
- an expectation keeps reading from all nodes until they agree on the values
## Scenarios
- `basic_convergence` runs the convergence check locally
- `compose_convergence` runs the same check in Docker Compose
- `k8s_convergence` runs it on Kubernetes
- `k8s_manual_convergence` starts the nodes through the k8s manual cluster API, restarts one node, and checks convergence again
## API
Each node exposes:
- `PUT /kv/:key` to write a value
- `GET /kv/:key` to read a value
- `GET /internal/snapshot` to read the local replicated state (peers poll this endpoint during sync)
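The `kvstore-node` crate also exports a small `KvHttpClient` over these routes. A read sketch; the port is illustrative, and the response structs are redeclared locally (they mirror the server's types):
```rust
use kvstore_node::KvHttpClient;
use reqwest::Url;
use serde::Deserialize;

// Mirrors the node's GET /kv/:key response shape.
#[derive(Deserialize)]
struct ValueRecord {
    value: String,
    version: u64,
    origin: u64,
}

#[derive(Deserialize)]
struct GetResponse {
    key: String,
    record: Option<ValueRecord>,
}

async fn read_demo_key() -> anyhow::Result<()> {
    // Assumes a node is listening locally on port 8080.
    let client = KvHttpClient::new(Url::parse("http://127.0.0.1:8080")?);
    let response: GetResponse = client.get("/kv/demo-0").await?;
    if let Some(record) = response.record {
        println!(
            "{} = {} (version {}, origin node {})",
            response.key, record.value, record.version, record.origin
        );
    }
    Ok(())
}
```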
## Run locally
```bash
cargo run -p kvstore-examples --bin kvstore_basic_convergence
```
## Run with Docker Compose
```bash
cargo run -p kvstore-examples --bin kvstore_compose_convergence
```
Set `KVSTORE_IMAGE` to override the default compose image tag.
## Run with Kubernetes
```bash
docker build -t kvstore-node:local -f examples/kvstore/Dockerfile .
cargo run -p kvstore-examples --bin kvstore_k8s_convergence
```
Prerequisites:
- `kubectl` configured with a reachable cluster
- `helm` installed
Optional image override:
- `KVSTORE_K8S_IMAGE` (falls back to `KVSTORE_IMAGE`, then `kvstore-node:local`)
## Run with Kubernetes manual cluster
```bash
docker build -t kvstore-node:local -f examples/kvstore/Dockerfile .
cargo run -p kvstore-examples --bin kvstore_k8s_manual_convergence
```

View File

@ -0,0 +1,35 @@
[package]
edition.workspace = true
license.workspace = true
name = "kvstore-examples"
version.workspace = true
[[bin]]
name = "kvstore_basic_convergence"
path = "src/bin/basic_convergence.rs"
[[bin]]
name = "kvstore_compose_convergence"
path = "src/bin/compose_convergence.rs"
[[bin]]
name = "kvstore_k8s_convergence"
path = "src/bin/k8s_convergence.rs"
[[bin]]
name = "kvstore_k8s_manual_convergence"
path = "src/bin/k8s_manual_convergence.rs"
[dependencies]
kvstore-node = { path = "../kvstore-node" }
kvstore-runtime-ext = { path = "../testing/integration" }
kvstore-runtime-workloads = { path = "../testing/workloads" }
testing-framework-core = { workspace = true }
testing-framework-runner-compose = { workspace = true }
testing-framework-runner-k8s = { workspace = true }
anyhow = "1.0"
serde = { workspace = true }
tokio = { workspace = true, features = ["full"] }
tracing = { workspace = true }
tracing-subscriber = { version = "0.3", features = ["env-filter"] }

View File

@ -0,0 +1,31 @@
use std::time::Duration;
use kvstore_runtime_ext::KvLocalDeployer;
use kvstore_runtime_workloads::{
KvBuilderExt, KvConverges, KvScenarioBuilder, KvTopology, KvWriteWorkload,
};
use testing_framework_core::scenario::Deployer;
#[tokio::main]
async fn main() -> anyhow::Result<()> {
tracing_subscriber::fmt()
.with_env_filter(tracing_subscriber::EnvFilter::from_default_env())
.init();
let mut scenario = KvScenarioBuilder::deployment_with(|_| KvTopology::new(3))
.with_run_duration(Duration::from_secs(30))
.with_workload(
KvWriteWorkload::new()
.operations(300)
.key_count(30)
.rate_per_sec(30)
.key_prefix("demo"),
)
.with_expectation(KvConverges::new("demo", 30).timeout(Duration::from_secs(25)))
.build()?;
let deployer = KvLocalDeployer::default();
let runner = deployer.deploy(&scenario).await?;
runner.run(&mut scenario).await?;
Ok(())
}

View File

@ -0,0 +1,44 @@
use std::time::Duration;
use anyhow::{Context as _, Result};
use kvstore_runtime_workloads::{
KvBuilderExt, KvConverges, KvScenarioBuilder, KvTopology, KvWriteWorkload,
};
use testing_framework_core::scenario::Deployer;
use testing_framework_runner_compose::ComposeRunnerError;
use tracing::{info, warn};
#[tokio::main]
async fn main() -> Result<()> {
tracing_subscriber::fmt()
.with_env_filter(tracing_subscriber::EnvFilter::from_default_env())
.init();
let mut scenario = KvScenarioBuilder::deployment_with(|_| KvTopology::new(3))
.with_run_duration(Duration::from_secs(30))
.with_workload(
KvWriteWorkload::new()
.operations(200)
.key_count(20)
.rate_per_sec(20),
)
.with_expectation(KvConverges::new("kv-demo", 20).timeout(Duration::from_secs(25)))
.build()?;
let deployer = kvstore_runtime_ext::KvComposeDeployer::new();
let runner = match deployer.deploy(&scenario).await {
Ok(runner) => runner,
Err(ComposeRunnerError::DockerUnavailable) => {
warn!("docker unavailable; skipping compose kv run");
return Ok(());
}
Err(error) => return Err(anyhow::Error::new(error)).context("deploying kv compose stack"),
};
info!("running kv compose convergence scenario");
runner
.run(&mut scenario)
.await
.context("running kv compose scenario")?;
Ok(())
}

View File

@ -0,0 +1,58 @@
use std::time::Duration;
use anyhow::{Context as _, Result};
use kvstore_runtime_ext::KvK8sDeployer;
use kvstore_runtime_workloads::{
KvBuilderExt, KvConverges, KvScenarioBuilder, KvTopology, KvWriteWorkload,
};
use testing_framework_core::scenario::Deployer;
use testing_framework_runner_k8s::K8sRunnerError;
use tracing::{info, warn};
#[tokio::main]
async fn main() -> Result<()> {
tracing_subscriber::fmt()
.with_env_filter(tracing_subscriber::EnvFilter::from_default_env())
.init();
let mut scenario = KvScenarioBuilder::deployment_with(|_| KvTopology::new(3))
.with_run_duration(Duration::from_secs(30))
.with_workload(
KvWriteWorkload::new()
.operations(200)
.key_count(20)
.rate_per_sec(20),
)
.with_expectation(KvConverges::new("kv-demo", 20).timeout(Duration::from_secs(25)))
.build()?;
let deployer = KvK8sDeployer::new();
let runner = match deployer.deploy(&scenario).await {
Ok(runner) => runner,
Err(K8sRunnerError::ClientInit { source }) => {
warn!("k8s unavailable ({source}); skipping kv k8s run");
return Ok(());
}
Err(K8sRunnerError::InstallStack { source })
if k8s_cluster_unavailable(&source.to_string()) =>
{
warn!("k8s unavailable ({source}); skipping kv k8s run");
return Ok(());
}
Err(error) => return Err(anyhow::Error::new(error)).context("deploying kv k8s stack"),
};
info!("running kv k8s convergence scenario");
runner
.run(&mut scenario)
.await
.context("running kv k8s scenario")?;
Ok(())
}
fn k8s_cluster_unavailable(message: &str) -> bool {
message.contains("Unable to connect to the server")
|| message.contains("TLS handshake timeout")
|| message.contains("connection refused")
}

View File

@ -0,0 +1,155 @@
use std::time::Duration;
use anyhow::{Context as _, Result, anyhow};
use kvstore_node::KvHttpClient;
use kvstore_runtime_ext::{KvK8sDeployer, KvTopology};
use serde::{Deserialize, Serialize};
use testing_framework_runner_k8s::ManualClusterError;
use tracing::{info, warn};
#[derive(Serialize)]
struct PutRequest {
value: String,
expected_version: Option<u64>,
}
#[derive(Deserialize)]
struct PutResponse {
applied: bool,
}
#[derive(Clone, Debug, Deserialize, Eq, PartialEq)]
struct ValueRecord {
value: String,
version: u64,
origin: u64,
}
#[derive(Deserialize)]
struct GetResponse {
record: Option<ValueRecord>,
}
#[tokio::main]
async fn main() -> Result<()> {
tracing_subscriber::fmt()
.with_env_filter(tracing_subscriber::EnvFilter::from_default_env())
.init();
let deployer = KvK8sDeployer::new();
let cluster = match deployer
.manual_cluster_from_descriptors(KvTopology::new(3))
.await
{
Ok(cluster) => cluster,
Err(ManualClusterError::ClientInit { source }) => {
warn!("k8s unavailable ({source}); skipping kv k8s manual run");
return Ok(());
}
Err(ManualClusterError::InstallStack { source })
if k8s_cluster_unavailable(&source.to_string()) =>
{
warn!("k8s unavailable ({source}); skipping kv k8s manual run");
return Ok(());
}
Err(error) => {
return Err(anyhow::Error::new(error)).context("creating kv k8s manual cluster");
}
};
let node0 = cluster.start_node("node-0").await?.client;
let node1 = cluster.start_node("node-1").await?.client;
let node2 = cluster.start_node("node-2").await?.client;
cluster.wait_network_ready().await?;
write_keys(&node0, "kv-manual", 12).await?;
wait_for_convergence(
&[node0.clone(), node1.clone(), node2.clone()],
"kv-manual",
12,
)
.await?;
info!("restarting node-2 in manual cluster");
cluster.restart_node("node-2").await?;
cluster.wait_network_ready().await?;
let node2 = cluster
.node_client("node-2")
.ok_or_else(|| anyhow!("node-2 client missing after restart"))?;
wait_for_convergence(&[node0, node1, node2], "kv-manual", 12).await?;
cluster.stop_all();
Ok(())
}
async fn write_keys(client: &KvHttpClient, prefix: &str, key_count: usize) -> Result<()> {
for index in 0..key_count {
let key = format!("{prefix}-{index}");
let response: PutResponse = client
.put(
&format!("/kv/{key}"),
&PutRequest {
value: format!("value-{index}"),
expected_version: None,
},
)
.await
.map_err(|error| anyhow!(error.to_string()))
.with_context(|| format!("writing key {key}"))?;
if !response.applied {
return Err(anyhow!("write rejected for key {key}"));
}
}
Ok(())
}
async fn wait_for_convergence(
clients: &[KvHttpClient],
prefix: &str,
key_count: usize,
) -> Result<()> {
let deadline = tokio::time::Instant::now() + Duration::from_secs(30);
while tokio::time::Instant::now() < deadline {
if is_converged(clients, prefix, key_count).await? {
info!(key_count, "kv manual cluster converged");
return Ok(());
}
tokio::time::sleep(Duration::from_millis(500)).await;
}
Err(anyhow!("kv manual cluster did not converge within timeout"))
}
async fn is_converged(clients: &[KvHttpClient], prefix: &str, key_count: usize) -> Result<bool> {
for index in 0..key_count {
let key = format!("{prefix}-{index}");
let first = read_key(&clients[0], &key).await?;
for client in &clients[1..] {
if read_key(client, &key).await? != first {
return Ok(false);
}
}
}
Ok(true)
}
async fn read_key(client: &KvHttpClient, key: &str) -> Result<Option<ValueRecord>> {
let response: GetResponse = client
.get(&format!("/kv/{key}"))
.await
.map_err(|error| anyhow!(error.to_string()))
.with_context(|| format!("reading key {key}"))?;
Ok(response.record)
}
fn k8s_cluster_unavailable(message: &str) -> bool {
message.contains("Unable to connect to the server")
|| message.contains("TLS handshake timeout")
|| message.contains("connection refused")
}

View File

@ -0,0 +1,24 @@
[package]
edition.workspace = true
license.workspace = true
name = "kvstore-node"
version.workspace = true
[[bin]]
name = "kvstore-node"
path = "src/main.rs"
[dependencies]
axum = "0.7"
tower-http = { version = "0.6", features = ["trace"] }
serde = { workspace = true }
serde_yaml = { workspace = true }
tokio = { workspace = true, features = ["full"] }
tracing = { workspace = true }
tracing-subscriber = { version = "0.3", features = ["env-filter"] }
anyhow = "1.0"
clap = { version = "4.0", features = ["derive"] }
reqwest = { workspace = true, features = ["json"] }

View File

@ -0,0 +1,40 @@
use reqwest::Url;
use serde::Serialize;
/// Small HTTP client for the kv example node's public API.
#[derive(Clone)]
pub struct KvHttpClient {
base_url: Url,
client: reqwest::Client,
}
impl KvHttpClient {
#[must_use]
pub fn new(base_url: Url) -> Self {
Self {
base_url,
client: reqwest::Client::new(),
}
}
pub async fn get<T: serde::de::DeserializeOwned>(&self, path: &str) -> anyhow::Result<T> {
let url = self.base_url.join(path)?;
let response = self.client.get(url).send().await?.error_for_status()?;
Ok(response.json().await?)
}
pub async fn put<B: Serialize, T: serde::de::DeserializeOwned>(
&self,
path: &str,
body: &B,
) -> anyhow::Result<T> {
let url = self.base_url.join(path)?;
let response = self
.client
.put(url)
.json(body)
.send()
.await?
.error_for_status()?;
Ok(response.json().await?)
}
}

View File

@ -0,0 +1,30 @@
use std::{fs, path::Path};
use serde::{Deserialize, Serialize};
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct PeerInfo {
pub node_id: u64,
pub http_address: String,
}
/// Static node config written by TF for one kv node process.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct KvConfig {
/// Stable node identifier, also recorded as the origin of local writes.
pub node_id: u64,
/// HTTP port bound by the node process.
pub http_port: u16,
/// Peers this node periodically pulls snapshots from.
pub peers: Vec<PeerInfo>,
/// Interval between sync rounds, in milliseconds.
#[serde(default = "default_sync_interval_ms")]
pub sync_interval_ms: u64,
}
impl KvConfig {
pub fn load(path: &Path) -> anyhow::Result<Self> {
let raw = fs::read_to_string(path)?;
let config = serde_yaml::from_str(&raw)?;
Ok(config)
}
}
const fn default_sync_interval_ms() -> u64 {
1000
}

View File

@ -0,0 +1,3 @@
pub mod client;
pub use client::KvHttpClient;

View File

@ -0,0 +1,36 @@
mod config;
mod server;
mod state;
mod sync;
use std::path::PathBuf;
use clap::Parser;
use tracing_subscriber::{layer::SubscriberExt, util::SubscriberInitExt};
use crate::{config::KvConfig, state::KvState, sync::SyncService};
#[derive(Parser, Debug)]
#[command(name = "kvstore-node")]
struct Args {
#[arg(short, long)]
config: PathBuf,
}
#[tokio::main]
async fn main() -> anyhow::Result<()> {
tracing_subscriber::registry()
.with(
tracing_subscriber::EnvFilter::try_from_default_env()
.unwrap_or_else(|_| "kvstore_node=info,tower_http=debug".into()),
)
.with(tracing_subscriber::fmt::layer())
.init();
let args = Args::parse();
let config = KvConfig::load(&args.config)?;
let state = KvState::new(config.node_id);
SyncService::new(config.clone(), state.clone()).start();
server::start_server(config, state).await
}

View File

@ -0,0 +1,112 @@
use std::net::SocketAddr;
use axum::{
Router,
extract::{Path, State},
http::StatusCode,
response::Json,
routing::get,
};
use serde::{Deserialize, Serialize};
use tower_http::trace::TraceLayer;
use crate::{
config::KvConfig,
state::{KvState, Snapshot, ValueRecord},
};
#[derive(Serialize)]
struct HealthResponse {
status: &'static str,
}
#[derive(Deserialize)]
struct PutRequest {
value: String,
expected_version: Option<u64>,
}
#[derive(Serialize)]
struct PutResponse {
applied: bool,
version: u64,
}
#[derive(Serialize)]
struct GetResponse {
key: String,
record: Option<ValueRecord>,
}
pub async fn start_server(config: KvConfig, state: KvState) -> anyhow::Result<()> {
let app = Router::new()
.route("/health/live", get(health_live))
.route("/health/ready", get(health_ready))
.route("/kv/:key", get(get_key).put(put_key))
.route("/internal/snapshot", get(get_snapshot))
.layer(TraceLayer::new_for_http())
.with_state(state.clone());
let addr = SocketAddr::from(([0, 0, 0, 0], config.http_port));
let listener = tokio::net::TcpListener::bind(addr).await?;
state.set_ready(true).await;
tracing::info!(node_id = state.node_id(), %addr, "kv node ready");
axum::serve(listener, app).await?;
Ok(())
}
async fn health_live() -> (StatusCode, Json<HealthResponse>) {
(StatusCode::OK, Json(HealthResponse { status: "alive" }))
}
async fn health_ready(State(state): State<KvState>) -> (StatusCode, Json<HealthResponse>) {
if state.is_ready().await {
(StatusCode::OK, Json(HealthResponse { status: "ready" }))
} else {
(
StatusCode::SERVICE_UNAVAILABLE,
Json(HealthResponse {
status: "not-ready",
}),
)
}
}
async fn get_key(Path(key): Path<String>, State(state): State<KvState>) -> Json<GetResponse> {
let record = state.get(&key).await;
Json(GetResponse { key, record })
}
async fn put_key(
Path(key): Path<String>,
State(state): State<KvState>,
Json(request): Json<PutRequest>,
) -> (StatusCode, Json<PutResponse>) {
let outcome = state
.put_local(key, request.value, request.expected_version)
.await;
if outcome.applied {
(
StatusCode::OK,
Json(PutResponse {
applied: true,
version: outcome.current_version,
}),
)
} else {
(
StatusCode::CONFLICT,
Json(PutResponse {
applied: false,
version: outcome.current_version,
}),
)
}
}
async fn get_snapshot(State(state): State<KvState>) -> Json<Snapshot> {
Json(state.snapshot().await)
}

View File

@ -0,0 +1,111 @@
use std::{collections::HashMap, sync::Arc};
use serde::{Deserialize, Serialize};
use tokio::sync::RwLock;
#[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq)]
pub struct ValueRecord {
pub value: String,
pub version: u64,
pub origin: u64,
}
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct Snapshot {
pub node_id: u64,
pub entries: HashMap<String, ValueRecord>,
}
#[derive(Clone, Debug)]
pub struct PutOutcome {
pub applied: bool,
pub current_version: u64,
}
#[derive(Clone)]
pub struct KvState {
node_id: u64,
ready: Arc<RwLock<bool>>,
entries: Arc<RwLock<HashMap<String, ValueRecord>>>,
}
impl KvState {
pub fn new(node_id: u64) -> Self {
Self {
node_id,
ready: Arc::new(RwLock::new(false)),
entries: Arc::new(RwLock::new(HashMap::new())),
}
}
pub const fn node_id(&self) -> u64 {
self.node_id
}
pub async fn set_ready(&self, value: bool) {
*self.ready.write().await = value;
}
pub async fn is_ready(&self) -> bool {
*self.ready.read().await
}
pub async fn get(&self, key: &str) -> Option<ValueRecord> {
self.entries.read().await.get(key).cloned()
}
/// Applies a local write, optionally guarded by `expected_version`
/// (compare-and-set semantics).
pub async fn put_local(
&self,
key: String,
value: String,
expected_version: Option<u64>,
) -> PutOutcome {
let mut entries = self.entries.write().await;
let current_version = entries.get(&key).map_or(0, |record| record.version);
if expected_version.is_some_and(|expected| expected != current_version) {
return PutOutcome {
applied: false,
current_version,
};
}
let next_version = current_version.saturating_add(1);
entries.insert(
key,
ValueRecord {
value,
version: next_version,
origin: self.node_id,
},
);
PutOutcome {
applied: true,
current_version: next_version,
}
}
/// Merges a peer snapshot, keeping the newer record for each key.
pub async fn merge_snapshot(&self, snapshot: Snapshot) {
let mut local = self.entries.write().await;
for (key, incoming) in snapshot.entries {
match local.get(&key) {
Some(existing) if !is_newer_record(&incoming, existing) => {}
_ => {
local.insert(key, incoming);
}
}
}
}
pub async fn snapshot(&self) -> Snapshot {
Snapshot {
node_id: self.node_id,
entries: self.entries.read().await.clone(),
}
}
}
/// Last-writer-wins ordering: the higher version wins, with the origin node
/// id as a deterministic tie-breaker so every node converges on one record.
fn is_newer_record(candidate: &ValueRecord, existing: &ValueRecord) -> bool {
(candidate.version, candidate.origin) > (existing.version, existing.origin)
}

View File

@ -0,0 +1,103 @@
use std::{collections::HashMap, sync::Arc, time::Duration};
use reqwest::Client;
use tokio::sync::Mutex;
use tracing::{debug, warn};
use crate::{
config::KvConfig,
state::{KvState, Snapshot},
};
const WARN_AFTER_CONSECUTIVE_FAILURES: u32 = 5;
/// Pull-based anti-entropy loop: periodically fetches each peer's
/// `/internal/snapshot` and merges it into local state.
#[derive(Clone)]
pub struct SyncService {
config: Arc<KvConfig>,
state: KvState,
client: Client,
failures_by_peer: Arc<Mutex<HashMap<String, u32>>>,
}
impl SyncService {
pub fn new(config: KvConfig, state: KvState) -> Self {
Self {
config: Arc::new(config),
state,
client: Client::new(),
failures_by_peer: Arc::new(Mutex::new(HashMap::new())),
}
}
pub fn start(&self) {
let service = self.clone();
tokio::spawn(async move {
service.run().await;
});
}
async fn run(self) {
let interval = Duration::from_millis(self.config.sync_interval_ms.max(100));
loop {
self.sync_once().await;
tokio::time::sleep(interval).await;
}
}
async fn sync_once(&self) {
for peer in &self.config.peers {
match self.fetch_snapshot(&peer.http_address).await {
Ok(snapshot) => {
self.state.merge_snapshot(snapshot).await;
self.clear_failure_counter(&peer.http_address).await;
}
Err(error) => {
self.record_sync_failure(&peer.http_address, &error).await;
}
}
}
}
async fn fetch_snapshot(&self, peer_address: &str) -> anyhow::Result<Snapshot> {
let url = format!("http://{peer_address}/internal/snapshot");
let snapshot = self
.client
.get(url)
.send()
.await?
.error_for_status()?
.json()
.await?;
Ok(snapshot)
}
async fn clear_failure_counter(&self, peer_address: &str) {
let mut failures = self.failures_by_peer.lock().await;
failures.remove(peer_address);
}
async fn record_sync_failure(&self, peer_address: &str, error: &anyhow::Error) {
let consecutive_failures = {
let mut failures = self.failures_by_peer.lock().await;
let entry = failures.entry(peer_address.to_owned()).or_insert(0);
*entry += 1;
*entry
};
if consecutive_failures >= WARN_AFTER_CONSECUTIVE_FAILURES {
warn!(
peer = %peer_address,
%error,
consecutive_failures,
"kv sync repeatedly failing"
);
} else {
debug!(
peer = %peer_address,
%error,
consecutive_failures,
"kv sync failed"
);
}
}
}

View File

@ -0,0 +1,15 @@
[package]
edition.workspace = true
license.workspace = true
name = "kvstore-runtime-ext"
version.workspace = true
[dependencies]
testing-framework-core = { workspace = true }
testing-framework-runner-compose = { workspace = true }
testing-framework-runner-k8s = { workspace = true }
testing-framework-runner-local = { workspace = true }
async-trait = { workspace = true }
kvstore-node = { path = "../../kvstore-node" }
serde = { workspace = true }

View File

@ -0,0 +1,75 @@
use std::io::Error;
use async_trait::async_trait;
use kvstore_node::KvHttpClient;
use serde::{Deserialize, Serialize};
use testing_framework_core::scenario::{
Application, ClusterNodeConfigApplication, ClusterNodeView, ClusterPeerView, DynError,
NodeAccess, serialize_cluster_yaml_config,
};
pub type KvTopology = testing_framework_core::topology::ClusterTopology;
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct KvPeerInfo {
pub node_id: u64,
pub http_address: String,
}
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct KvNodeConfig {
pub node_id: u64,
pub http_port: u16,
pub peers: Vec<KvPeerInfo>,
pub sync_interval_ms: u64,
}
pub struct KvEnv;
#[async_trait]
impl Application for KvEnv {
type Deployment = KvTopology;
type NodeClient = KvHttpClient;
type NodeConfig = KvNodeConfig;
fn build_node_client(access: &NodeAccess) -> Result<Self::NodeClient, DynError> {
Ok(KvHttpClient::new(access.api_base_url()?))
}
fn node_readiness_path() -> &'static str {
"/health/ready"
}
}
impl ClusterNodeConfigApplication for KvEnv {
type ConfigError = Error;
fn static_network_port() -> u16 {
8080
}
fn build_cluster_node_config(
node: &ClusterNodeView,
peers: &[ClusterPeerView],
) -> Result<Self::NodeConfig, Self::ConfigError> {
let peers = peers
.iter()
.map(|peer| KvPeerInfo {
node_id: peer.index() as u64,
http_address: peer.authority(),
})
.collect::<Vec<_>>();
Ok(KvNodeConfig {
node_id: node.index() as u64,
http_port: node.network_port(),
peers,
sync_interval_ms: 500,
})
}
fn serialize_cluster_node_config(
config: &Self::NodeConfig,
) -> Result<String, Self::ConfigError> {
serialize_cluster_yaml_config(config).map_err(Error::other)
}
}

View File

@ -0,0 +1,15 @@
use testing_framework_runner_compose::{BinaryConfigNodeSpec, ComposeBinaryApp};
use crate::KvEnv;
const NODE_CONFIG_PATH: &str = "/etc/kvstore/config.yaml";
impl ComposeBinaryApp for KvEnv {
fn compose_node_spec() -> BinaryConfigNodeSpec {
BinaryConfigNodeSpec::conventional(
"/usr/local/bin/kvstore-node",
NODE_CONFIG_PATH,
vec![8080, 8081],
)
}
}

View File

@ -0,0 +1,21 @@
use testing_framework_runner_k8s::{BinaryConfigK8sSpec, K8sBinaryApp};
use crate::KvEnv;
const CONTAINER_CONFIG_PATH: &str = "/etc/kvstore/config.yaml";
const CONTAINER_HTTP_PORT: u16 = 8080;
const SERVICE_TESTING_PORT: u16 = 8081;
const NODE_NAME_PREFIX: &str = "kvstore-node";
impl K8sBinaryApp for KvEnv {
fn k8s_binary_spec() -> BinaryConfigK8sSpec {
BinaryConfigK8sSpec::conventional(
"kvstore",
NODE_NAME_PREFIX,
"/usr/local/bin/kvstore-node",
CONTAINER_CONFIG_PATH,
CONTAINER_HTTP_PORT,
SERVICE_TESTING_PORT,
)
}
}

View File

@ -0,0 +1,12 @@
mod app;
mod compose_env;
mod k8s_env;
mod local_env;
pub mod scenario;
pub use app::*;
pub use scenario::{KvBuilderExt, KvScenarioBuilder};
pub type KvLocalDeployer = testing_framework_runner_local::ProcessDeployer<KvEnv>;
pub type KvComposeDeployer = testing_framework_runner_compose::ComposeDeployer<KvEnv>;
pub type KvK8sDeployer = testing_framework_runner_k8s::K8sDeployer<KvEnv>;

View File

@ -0,0 +1,41 @@
use std::collections::HashMap;
use testing_framework_core::scenario::{DynError, StartNodeOptions};
use testing_framework_runner_local::{
LocalBinaryApp, LocalNodePorts, LocalPeerNode, LocalProcessSpec,
build_local_cluster_node_config, yaml_node_config,
};
use crate::{KvEnv, KvNodeConfig};
impl LocalBinaryApp for KvEnv {
fn initial_node_name_prefix() -> &'static str {
"kv-node"
}
fn build_local_node_config_with_peers(
_topology: &Self::Deployment,
index: usize,
ports: &LocalNodePorts,
peers: &[LocalPeerNode],
_peer_ports_by_name: &HashMap<String, u16>,
_options: &StartNodeOptions<Self>,
_template_config: Option<
&<Self as testing_framework_core::scenario::Application>::NodeConfig,
>,
) -> Result<<Self as testing_framework_core::scenario::Application>::NodeConfig, DynError> {
build_local_cluster_node_config::<Self>(index, ports, peers)
}
fn local_process_spec() -> LocalProcessSpec {
LocalProcessSpec::new("KVSTORE_NODE_BIN", "kvstore-node").with_rust_log("kvstore_node=info")
}
fn render_local_config(config: &KvNodeConfig) -> Result<Vec<u8>, DynError> {
yaml_node_config(config)
}
fn http_api_port(config: &KvNodeConfig) -> u16 {
config.http_port
}
}

View File

@ -0,0 +1,15 @@
use testing_framework_core::scenario::ScenarioBuilder;
use crate::{KvEnv, KvTopology};
pub type KvScenarioBuilder = ScenarioBuilder<KvEnv>;
pub trait KvBuilderExt: Sized {
fn deployment_with(f: impl FnOnce(KvTopology) -> KvTopology) -> Self;
}
impl KvBuilderExt for KvScenarioBuilder {
fn deployment_with(f: impl FnOnce(KvTopology) -> KvTopology) -> Self {
KvScenarioBuilder::with_deployment(f(KvTopology::new(3)))
}
}

View File

@ -0,0 +1,15 @@
[package]
edition.workspace = true
license.workspace = true
name = "kvstore-runtime-workloads"
version.workspace = true
[dependencies]
kvstore-node = { path = "../../kvstore-node" }
kvstore-runtime-ext = { path = "../integration" }
testing-framework-core = { workspace = true }
async-trait = { workspace = true }
serde = { workspace = true }
tokio = { workspace = true, features = ["full"] }
tracing = { workspace = true }

View File

@ -0,0 +1,100 @@
use std::time::Duration;
use async_trait::async_trait;
use kvstore_runtime_ext::KvEnv;
use serde::Deserialize;
use testing_framework_core::scenario::{DynError, Expectation, RunContext};
use tracing::info;
#[derive(Clone)]
pub struct KvConverges {
key_prefix: String,
key_count: usize,
timeout: Duration,
poll_interval: Duration,
}
#[derive(Deserialize, Clone, Debug, Eq, PartialEq)]
struct ValueRecord {
value: String,
version: u64,
origin: u64,
}
#[derive(Deserialize)]
struct GetResponse {
record: Option<ValueRecord>,
}
impl KvConverges {
#[must_use]
pub fn new(key_prefix: impl Into<String>, key_count: usize) -> Self {
Self {
key_prefix: key_prefix.into(),
key_count,
timeout: Duration::from_secs(20),
poll_interval: Duration::from_millis(500),
}
}
#[must_use]
pub const fn timeout(mut self, timeout: Duration) -> Self {
self.timeout = timeout;
self
}
}
#[async_trait]
impl Expectation<KvEnv> for KvConverges {
fn name(&self) -> &str {
"kv_converges"
}
async fn evaluate(&mut self, ctx: &RunContext<KvEnv>) -> Result<(), DynError> {
let clients = ctx.node_clients().snapshot();
if clients.is_empty() {
return Err("no kv node clients available".into());
}
let deadline = tokio::time::Instant::now() + self.timeout;
while tokio::time::Instant::now() < deadline {
if self.is_converged(&clients).await? {
info!(key_count = self.key_count, "kv convergence reached");
return Ok(());
}
tokio::time::sleep(self.poll_interval).await;
}
Err(format!(
"kv convergence not reached within {:?} for {} keys",
self.timeout, self.key_count
)
.into())
}
}
impl KvConverges {
async fn is_converged(&self, clients: &[kvstore_node::KvHttpClient]) -> Result<bool, DynError> {
for key_idx in 0..self.key_count {
let key = format!("{}-{key_idx}", self.key_prefix);
let first = read_key(clients, &key, 0).await?;
for node_idx in 1..clients.len() {
let current = read_key(clients, &key, node_idx).await?;
if current != first {
return Ok(false);
}
}
}
Ok(true)
}
}
async fn read_key(
clients: &[kvstore_node::KvHttpClient],
key: &str,
index: usize,
) -> Result<Option<ValueRecord>, DynError> {
let response: GetResponse = clients[index].get(&format!("/kv/{key}")).await?;
Ok(response.record)
}

View File

@ -0,0 +1,6 @@
mod expectations;
mod write;
pub use expectations::KvConverges;
pub use kvstore_runtime_ext::{KvBuilderExt, KvEnv, KvScenarioBuilder, KvTopology};
pub use write::KvWriteWorkload;

View File

@ -0,0 +1,135 @@
use std::time::Duration;
use async_trait::async_trait;
use kvstore_runtime_ext::KvEnv;
use serde::{Deserialize, Serialize};
use testing_framework_core::scenario::{DynError, RunContext, Workload};
use tracing::info;
#[derive(Clone)]
pub struct KvWriteWorkload {
operations: usize,
key_count: usize,
rate_per_sec: Option<usize>,
key_prefix: String,
}
#[derive(Serialize)]
struct PutRequest {
value: String,
expected_version: Option<u64>,
}
#[derive(Deserialize)]
struct PutResponse {
applied: bool,
version: u64,
}
impl KvWriteWorkload {
#[must_use]
pub fn new() -> Self {
Self {
operations: 200,
key_count: 20,
rate_per_sec: Some(25),
key_prefix: "kv-demo".to_owned(),
}
}
#[must_use]
pub const fn operations(mut self, value: usize) -> Self {
self.operations = value;
self
}
#[must_use]
pub const fn key_count(mut self, value: usize) -> Self {
self.key_count = value;
self
}
#[must_use]
pub const fn rate_per_sec(mut self, value: usize) -> Self {
self.rate_per_sec = Some(value);
self
}
#[must_use]
pub fn key_prefix(mut self, value: impl Into<String>) -> Self {
self.key_prefix = value.into();
self
}
}
impl Default for KvWriteWorkload {
fn default() -> Self {
Self::new()
}
}
#[async_trait]
impl Workload<KvEnv> for KvWriteWorkload {
fn name(&self) -> &str {
"kv_write_workload"
}
async fn start(&self, ctx: &RunContext<KvEnv>) -> Result<(), DynError> {
let clients = ctx.node_clients().snapshot();
let Some(leader) = clients.first() else {
return Err("no kv node clients available".into());
};
if self.key_count == 0 {
return Err("kv workload key_count must be > 0".into());
}
let interval = self.rate_per_sec.and_then(compute_interval);
info!(
operations = self.operations,
key_count = self.key_count,
rate_per_sec = ?self.rate_per_sec,
"starting kv write workload"
);
for idx in 0..self.operations {
let key = format!("{}-{}", self.key_prefix, idx % self.key_count);
let value = format!("value-{idx}");
let response: PutResponse = leader
.put(
&format!("/kv/{key}"),
&PutRequest {
value,
expected_version: None,
},
)
.await?;
if !response.applied {
return Err(format!("leader rejected write for key {key}").into());
}
if (idx + 1) % 25 == 0 {
info!(
completed = idx + 1,
version = response.version,
"kv write progress"
);
}
if let Some(delay) = interval {
tokio::time::sleep(delay).await;
}
}
Ok(())
}
}
/// Converts a writes-per-second rate into a per-write delay (1ms floor).
/// A zero rate means unthrottled, so no delay is returned.
fn compute_interval(rate_per_sec: usize) -> Option<Duration> {
if rate_per_sec == 0 {
return None;
}
Some(Duration::from_millis((1000 / rate_per_sec as u64).max(1)))
}

View File

@ -31,7 +31,7 @@ Each node exposes:
```bash
LOGOS_BLOCKCHAIN_METRICS_QUERY_URL=http://127.0.0.1:19091 \
-cargo run -p metrics-counter-examples --bin compose_prometheus_expectation
+cargo run -p metrics-counter-examples --bin metrics_counter_compose_prometheus_expectation
```
## Run with Kubernetes
@ -39,7 +39,7 @@ cargo run -p metrics-counter-examples --bin compose_prometheus_expectation
```bash
docker build -t metrics-counter-node:local -f examples/metrics_counter/Dockerfile .
LOGOS_BLOCKCHAIN_METRICS_QUERY_URL=http://127.0.0.1:30991 \
-cargo run -p metrics-counter-examples --bin k8s_prometheus_expectation
+cargo run -p metrics-counter-examples --bin metrics_counter_k8s_prometheus_expectation
```
Overrides:
@ -51,5 +51,5 @@ Overrides:
```bash
docker build -t metrics-counter-node:local -f examples/metrics_counter/Dockerfile .
LOGOS_BLOCKCHAIN_METRICS_QUERY_URL=http://127.0.0.1:30991 \
-cargo run -p metrics-counter-examples --bin k8s_manual_prometheus
+cargo run -p metrics-counter-examples --bin metrics_counter_k8s_manual_prometheus
```

View File

@ -4,6 +4,18 @@ license.workspace = true
name = "metrics-counter-examples"
version.workspace = true
[[bin]]
name = "metrics_counter_compose_prometheus_expectation"
path = "src/bin/compose_prometheus_expectation.rs"
[[bin]]
name = "metrics_counter_k8s_prometheus_expectation"
path = "src/bin/k8s_prometheus_expectation.rs"
[[bin]]
name = "metrics_counter_k8s_manual_prometheus"
path = "src/bin/k8s_manual_prometheus.rs"
[dependencies]
anyhow = "1.0"
metrics-counter-node = { path = "../metrics-counter-node" }

View File

@ -23,23 +23,23 @@ Each example follows the same pattern:
## Run locally
```bash
-cargo run -p nats-examples --bin basic_roundtrip
+cargo run -p nats-examples --bin nats_basic_roundtrip
```
If `nats-server` is not on `PATH`:
```bash
-NATS_SERVER_BIN=/path/to/nats-server cargo run -p nats-examples --bin basic_roundtrip
+NATS_SERVER_BIN=/path/to/nats-server cargo run -p nats-examples --bin nats_basic_roundtrip
```
## Run with Docker Compose
```bash
-cargo run -p nats-examples --bin compose_roundtrip
+cargo run -p nats-examples --bin nats_compose_roundtrip
```
## Run the parity check
```bash
-cargo run -p nats-examples --bin parity_check
+cargo run -p nats-examples --bin nats_parity_check
```

View File

@ -4,6 +4,18 @@ license.workspace = true
name = "nats-examples"
version.workspace = true
[[bin]]
name = "nats_basic_roundtrip"
path = "src/bin/basic_roundtrip.rs"
[[bin]]
name = "nats_compose_roundtrip"
path = "src/bin/compose_roundtrip.rs"
[[bin]]
name = "nats_parity_check"
path = "src/bin/parity_check.rs"
[dependencies]
anyhow = "1.0"
nats-runtime-ext = { path = "../testing/integration" }

View File

@ -0,0 +1,25 @@
# Build stage
FROM rustlang/rust:nightly-bookworm AS builder
WORKDIR /build
COPY Cargo.toml Cargo.lock ./
COPY cfgsync/ ./cfgsync/
COPY examples/ ./examples/
COPY testing-framework/ ./testing-framework/
RUN cargo build --release -p openraft-kv-node
FROM debian:bookworm-slim
RUN apt-get update && \
apt-get install -y ca-certificates && \
rm -rf /var/lib/apt/lists/*
COPY --from=builder /build/target/release/openraft-kv-node /usr/local/bin/openraft-kv-node
RUN mkdir -p /etc/openraft-kv
WORKDIR /app
ENTRYPOINT ["/usr/local/bin/openraft-kv-node"]
CMD ["--config", "/etc/openraft-kv/config.yaml"]

View File

@ -0,0 +1,87 @@
# OpenRaft KV Example
This example runs a small key-value service built on top of `OpenRaft`.
The main scenario does four things:
- bootstraps node 0 as a one-node cluster
- adds nodes 1 and 2 as learners and promotes them to voters
- writes one batch of keys through the current leader
- restarts that leader, waits for a new leader, writes again, and then checks
that all three nodes expose the same replicated state
## How TF runs this
- TF starts three OpenRaft nodes
- the workload bootstraps the cluster through the admin API
- the workload writes a first batch, restarts the current leader, waits for failover, and writes again
- the expectation checks that all three nodes converge on the same key/value state and membership
## Scenarios
- `basic_failover` runs the leader-restart flow locally
- `compose_failover` runs the same flow in Docker Compose
- `k8s_failover` runs the same flow against a manual Kubernetes cluster deployment
## API
Each node exposes:
- `GET /healthz` for readiness
- `GET /state` for current Raft role, leader, membership, log progress, and replicated key/value data
- `POST /kv/write` to submit a write through the local Raft node
- `POST /kv/read` to read a key from the local state machine
- `POST /admin/init` to initialize a single-node cluster
- `POST /admin/add-learner` to add a new Raft learner
- `POST /admin/change-membership` to promote learners into the voting set
The node also exposes internal Raft RPC endpoints used only for replication:
- `POST /raft/vote`
- `POST /raft/append`
- `POST /raft/snapshot`
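The `openraft-kv-node` crate exports `OpenRaftKvClient`, which wraps both the public and admin endpoints. A minimal bootstrap-and-write sketch; the port and the learner's advertised address are illustrative:
```rust
use openraft_kv_node::OpenRaftKvClient;
use reqwest::Url;

async fn demo() -> anyhow::Result<()> {
    // Assumes node 0 is listening locally; the port is illustrative.
    let node0 = OpenRaftKvClient::new(Url::parse("http://127.0.0.1:8080/")?);
    node0.init_self().await?; // POST /admin/init: one-node cluster
    // Advertised Raft address of node 1 (illustrative).
    node0.add_learner(1, "127.0.0.1:8081").await?;
    node0.change_membership([0, 1]).await?; // promote node 1 to voter
    node0.write("raft-key-0", "value-0", 0).await?; // serial 0
    let value = node0.read("raft-key-0").await?;
    assert_eq!(value.as_deref(), Some("value-0"));
    Ok(())
}
```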
## Run locally
```bash
OPENRAFT_KV_NODE_BIN="$(pwd)/target/debug/openraft-kv-node" \
cargo run -p openraft-kv-examples --bin openraft_kv_basic_failover
```
Build the node first if you have not done that yet:
```bash
cargo build -p openraft-kv-node
```
## Run with Docker Compose
Build the image first:
```bash
docker build -t openraft-kv-node:local -f examples/openraft_kv/Dockerfile .
```
Then run:
```bash
cargo run -p openraft-kv-examples --bin openraft_kv_compose_failover
```
Set `OPENRAFT_KV_IMAGE` to override the default compose image tag.
## Run on Kubernetes
Build the same image first:
```bash
docker build -t openraft-kv-node:local -f examples/openraft_kv/Dockerfile .
```
Then run:
```bash
cargo run -p openraft-kv-examples --bin openraft_kv_k8s_failover
```
If no cluster is available, the example exits early and prints a skip message.

View File

@ -0,0 +1,28 @@
[package]
edition.workspace = true
license.workspace = true
name = "openraft-kv-examples"
version.workspace = true
[[bin]]
name = "openraft_kv_basic_failover"
path = "src/bin/basic_failover.rs"
[[bin]]
name = "openraft_kv_compose_failover"
path = "src/bin/compose_failover.rs"
[[bin]]
name = "openraft_kv_k8s_failover"
path = "src/bin/k8s_failover.rs"
[dependencies]
anyhow = "1.0"
openraft-kv-node = { path = "../openraft-kv-node" }
openraft-kv-runtime-ext = { path = "../testing/integration" }
openraft-kv-runtime-workloads = { path = "../testing/workloads" }
testing-framework-core = { workspace = true }
testing-framework-runner-k8s = { workspace = true }
tokio = { workspace = true, features = ["full"] }
tracing = { workspace = true }
tracing-subscriber = { version = "0.3", features = ["env-filter"] }

View File

@ -0,0 +1,20 @@
use std::time::Duration;
use openraft_kv_examples::build_failover_scenario;
use openraft_kv_runtime_ext::OpenRaftKvLocalDeployer;
use testing_framework_core::scenario::Deployer;
#[tokio::main]
async fn main() -> anyhow::Result<()> {
tracing_subscriber::fmt()
.with_env_filter(tracing_subscriber::EnvFilter::from_default_env())
.init();
let mut scenario = build_failover_scenario(Duration::from_secs(45), Duration::from_secs(30))?;
let deployer = OpenRaftKvLocalDeployer::default();
let runner = deployer.deploy(&scenario).await?;
runner.run(&mut scenario).await?;
Ok(())
}

View File

@ -0,0 +1,20 @@
use std::time::Duration;
use openraft_kv_examples::build_failover_scenario;
use openraft_kv_runtime_ext::OpenRaftKvComposeDeployer;
use testing_framework_core::scenario::Deployer;
#[tokio::main]
async fn main() -> anyhow::Result<()> {
tracing_subscriber::fmt()
.with_env_filter(tracing_subscriber::EnvFilter::from_default_env())
.init();
let mut scenario = build_failover_scenario(Duration::from_secs(60), Duration::from_secs(40))?;
let deployer = OpenRaftKvComposeDeployer::new();
let runner = deployer.deploy(&scenario).await?;
runner.run(&mut scenario).await?;
Ok(())
}

View File

@ -0,0 +1,195 @@
use std::{sync::Arc, time::Duration};
use anyhow::{Context as _, Result, anyhow};
use openraft_kv_examples::{
INITIAL_WRITE_BATCH, RAFT_KEY_PREFIX, SECOND_WRITE_BATCH, TOTAL_WRITES,
};
use openraft_kv_node::OpenRaftKvClient;
use openraft_kv_runtime_ext::{
OpenRaftClusterObserver, OpenRaftKvEnv, OpenRaftKvK8sDeployer, OpenRaftKvTopology,
OpenRaftManualClusterSourceProvider,
};
use openraft_kv_runtime_workloads::{
OpenRaftMembership, expected_kv, wait_for_observed_leader, wait_for_observed_membership,
wait_for_observed_replication, write_batch,
};
use testing_framework_core::observation::{ObservationHandle, ObservationRuntime};
use testing_framework_runner_k8s::{ManualCluster, ManualClusterError};
use tracing::{info, warn};
#[tokio::main]
async fn main() -> Result<()> {
tracing_subscriber::fmt()
.with_env_filter(tracing_subscriber::EnvFilter::from_default_env())
.init();
let deployer = OpenRaftKvK8sDeployer::new();
let cluster = match deployer
.manual_cluster_from_descriptors(OpenRaftKvTopology::new(3))
.await
{
Ok(cluster) => cluster,
Err(ManualClusterError::ClientInit { source }) => {
warn!("k8s unavailable ({source}); skipping openraft k8s run");
return Ok(());
}
Err(ManualClusterError::InstallStack { source })
if k8s_cluster_unavailable(&source.to_string()) =>
{
warn!("k8s unavailable ({source}); skipping openraft k8s run");
return Ok(());
}
Err(error) => {
return Err(anyhow::Error::new(error)).context("creating openraft k8s cluster");
}
};
run_failover(Arc::new(cluster), Duration::from_secs(40)).await
}
async fn run_failover(cluster: Arc<ManualCluster<OpenRaftKvEnv>>, timeout: Duration) -> Result<()> {
start_cluster(cluster.as_ref()).await?;
let observation_runtime = start_observer(Arc::clone(&cluster)).await?;
let observer = observation_runtime.handle();
client_for_node(cluster.as_ref(), 0)?.init_self().await?;
let initial_leader = wait_for_observed_leader(&observer, timeout, None).await?;
let membership = current_membership(&observer)?;
add_learners_and_promote(
cluster.as_ref(),
&observer,
initial_leader,
&membership,
timeout,
)
.await?;
write_initial_batch(cluster.as_ref(), initial_leader).await?;
restart_leader(cluster.as_ref(), initial_leader).await?;
let new_leader = wait_for_observed_leader(&observer, timeout, Some(initial_leader)).await?;
write_second_batch(cluster.as_ref(), new_leader).await?;
let expected = expected_kv(RAFT_KEY_PREFIX, TOTAL_WRITES);
wait_for_observed_replication(&observer, &expected, timeout).await?;
cluster.stop_all();
Ok(())
}
async fn start_cluster(cluster: &ManualCluster<OpenRaftKvEnv>) -> Result<()> {
cluster.start_node("node-0").await?;
cluster.start_node("node-1").await?;
cluster.start_node("node-2").await?;
cluster.wait_network_ready().await?;
Ok(())
}
async fn start_observer(
cluster: Arc<ManualCluster<OpenRaftKvEnv>>,
) -> Result<ObservationRuntime<OpenRaftClusterObserver>> {
let provider = OpenRaftManualClusterSourceProvider::new(cluster, 3);
ObservationRuntime::start(
provider,
OpenRaftClusterObserver,
OpenRaftClusterObserver::config(),
)
.await
.map_err(anyhow::Error::new)
.context("starting openraft k8s observer")
}
async fn add_learners_and_promote(
cluster: &ManualCluster<OpenRaftKvEnv>,
observer: &ObservationHandle<OpenRaftClusterObserver>,
leader_id: u64,
membership: &OpenRaftMembership,
timeout: Duration,
) -> Result<()> {
let leader = client_for_node(cluster, leader_id)?;
for learner in membership.learner_targets(leader_id) {
info!(
target = learner.node_id,
addr = %learner.public_addr,
"adding learner"
);
leader
.add_learner(learner.node_id, &learner.public_addr)
.await?;
}
let voter_ids = membership.voter_ids();
leader.change_membership(voter_ids.iter().copied()).await?;
wait_for_observed_membership(observer, &voter_ids, timeout).await?;
Ok(())
}
async fn write_initial_batch(cluster: &ManualCluster<OpenRaftKvEnv>, leader_id: u64) -> Result<()> {
let leader = client_for_node(cluster, leader_id)?;
write_batch(&leader, RAFT_KEY_PREFIX, 0, INITIAL_WRITE_BATCH).await?;
Ok(())
}
async fn write_second_batch(cluster: &ManualCluster<OpenRaftKvEnv>, leader_id: u64) -> Result<()> {
let leader = client_for_node(cluster, leader_id)?;
write_batch(
&leader,
RAFT_KEY_PREFIX,
INITIAL_WRITE_BATCH,
SECOND_WRITE_BATCH,
)
.await?;
Ok(())
}
async fn restart_leader(cluster: &ManualCluster<OpenRaftKvEnv>, leader_id: u64) -> Result<()> {
let leader_name = format!("node-{leader_id}");
info!(%leader_name, "restarting current leader");
cluster.restart_node(&leader_name).await?;
cluster.wait_network_ready().await?;
Ok(())
}
fn current_membership(
observer: &ObservationHandle<OpenRaftClusterObserver>,
) -> Result<OpenRaftMembership> {
let snapshot = observer
.latest_snapshot()
.ok_or_else(|| anyhow!("openraft observer has not produced a snapshot yet"))?;
Ok(OpenRaftMembership::from_states(snapshot.value.states()))
}
fn client_for_node(
cluster: &ManualCluster<OpenRaftKvEnv>,
node_id: u64,
) -> Result<OpenRaftKvClient> {
cluster
.node_client(&format!("node-{node_id}"))
.ok_or_else(|| anyhow!("node-{node_id} client missing"))
}
fn k8s_cluster_unavailable(message: &str) -> bool {
message.contains("Unable to connect to the server")
|| message.contains("TLS handshake timeout")
|| message.contains("connection refused")
}

View File

@ -0,0 +1,41 @@
use std::time::Duration;
use openraft_kv_runtime_ext::{OpenRaftKvBuilderExt, OpenRaftKvEnv, OpenRaftKvScenarioBuilder};
use openraft_kv_runtime_workloads::{OpenRaftKvConverges, OpenRaftKvFailoverWorkload};
use testing_framework_core::scenario::{NodeControlCapability, Scenario};
/// Number of writes issued before the leader restart.
pub const INITIAL_WRITE_BATCH: usize = 8;
/// Number of writes issued after the leader restart.
pub const SECOND_WRITE_BATCH: usize = 8;
/// Total write count expected after the scenario completes.
pub const TOTAL_WRITES: usize = INITIAL_WRITE_BATCH + SECOND_WRITE_BATCH;
/// Key prefix shared by the failover workload and convergence expectation.
pub const RAFT_KEY_PREFIX: &str = "raft-key";
/// Builds the standard failover scenario used by the local and compose
/// binaries.
pub fn build_failover_scenario(
run_duration: Duration,
workload_timeout: Duration,
) -> anyhow::Result<Scenario<OpenRaftKvEnv, NodeControlCapability>> {
Ok(
OpenRaftKvScenarioBuilder::deployment_with(|deployment| deployment)
.with_cluster_observer()
.enable_node_control()
.with_run_duration(run_duration)
.with_workload(
OpenRaftKvFailoverWorkload::new()
.first_batch(INITIAL_WRITE_BATCH)
.second_batch(SECOND_WRITE_BATCH)
.timeout(workload_timeout)
.key_prefix(RAFT_KEY_PREFIX),
)
.with_expectation(
OpenRaftKvConverges::new(TOTAL_WRITES)
.timeout(run_duration)
.key_prefix(RAFT_KEY_PREFIX),
)
.build()?,
)
}

View File

@ -0,0 +1,23 @@
[package]
edition.workspace = true
license.workspace = true
name = "openraft-kv-node"
version.workspace = true
[[bin]]
name = "openraft-kv-node"
path = "src/main.rs"
[dependencies]
anyhow = "1.0"
axum = "0.7"
clap = { version = "4.0", features = ["derive"] }
openraft = { workspace = true }
openraft-memstore = { workspace = true }
reqwest = { workspace = true, features = ["json"] }
serde = { workspace = true }
serde_yaml = { workspace = true }
tokio = { workspace = true, features = ["full"] }
tower-http = { version = "0.6", features = ["trace"] }
tracing = { workspace = true }
tracing-subscriber = { version = "0.3", features = ["env-filter"] }

View File

@ -0,0 +1,136 @@
use std::{collections::BTreeSet, time::Duration};
use reqwest::Url;
use serde::{Serialize, de::DeserializeOwned};
use crate::types::{
AddLearnerRequest, AddLearnerResult, ChangeMembershipRequest, ChangeMembershipResult,
InitResult, OpenRaftKvReadRequest, OpenRaftKvReadResponse, OpenRaftKvState,
OpenRaftKvWriteRequest, OpenRaftKvWriteResponse,
};
/// Small HTTP client for the OpenRaft example node and its admin endpoints.
#[derive(Clone)]
pub struct OpenRaftKvClient {
base_url: Url,
client: reqwest::Client,
}
impl OpenRaftKvClient {
/// Builds a client for one node base URL.
#[must_use]
pub fn new(base_url: Url) -> Self {
Self {
base_url,
client: reqwest::Client::builder()
.timeout(Duration::from_secs(2))
.connect_timeout(Duration::from_secs(2))
.build()
.expect("openraft kv client timeout configuration is valid"),
}
}
/// Fetches the node's current Raft and application state.
pub async fn state(&self) -> anyhow::Result<OpenRaftKvState> {
self.get("state").await
}
/// Replicates one key/value write through the current leader.
pub async fn write(
&self,
key: &str,
value: &str,
serial: u64,
) -> anyhow::Result<OpenRaftKvWriteResponse> {
self.post_result(
"kv/write",
&OpenRaftKvWriteRequest {
key: key.to_owned(),
value: value.to_owned(),
serial,
},
)
.await
}
/// Reads one key from the replicated state machine.
pub async fn read(&self, key: &str) -> anyhow::Result<Option<String>> {
let response: OpenRaftKvReadResponse = self
.post_result(
"kv/read",
&OpenRaftKvReadRequest {
key: key.to_owned(),
},
)
.await?;
Ok(response.value)
}
/// Bootstraps a one-node cluster on this node.
///
/// The admin endpoints reply with `Result<(), String>`; the `Err` payload
/// (for example, an already-initialized cluster) is discarded by these
/// helpers, so repeated bootstrap calls return `Ok`.
pub async fn init_self(&self) -> anyhow::Result<()> {
let _: InitResult = self.post("admin/init", &()).await?;
Ok(())
}
/// Registers another node as a learner with the current leader.
pub async fn add_learner(&self, node_id: u64, addr: &str) -> anyhow::Result<()> {
let _: AddLearnerResult = self
.post(
"admin/add-learner",
&AddLearnerRequest {
node_id,
addr: addr.to_owned(),
},
)
.await?;
Ok(())
}
/// Promotes the cluster to the provided voter set.
pub async fn change_membership(
&self,
voters: impl IntoIterator<Item = u64>,
) -> anyhow::Result<()> {
let voters = normalize_voters(voters);
let request = ChangeMembershipRequest { voters };
let _: ChangeMembershipResult = self.post("admin/change-membership", &request).await?;
Ok(())
}
async fn get<T: DeserializeOwned>(&self, path: &str) -> anyhow::Result<T> {
let url = self.base_url.join(path)?;
let response = self.client.get(url).send().await?;
let response = response.error_for_status()?;
Ok(response.json().await?)
}
async fn post<B: Serialize, T: DeserializeOwned>(
&self,
path: &str,
body: &B,
) -> anyhow::Result<T> {
let url = self.base_url.join(path)?;
let response = self.client.post(url).json(body).send().await?;
let response = response.error_for_status()?;
Ok(response.json().await?)
}
async fn post_result<B: Serialize, T: DeserializeOwned>(
&self,
path: &str,
body: &B,
) -> anyhow::Result<T> {
let result: Result<T, String> = self.post(path, body).await?;
result.map_err(anyhow::Error::msg)
}
}
fn normalize_voters(voters: impl IntoIterator<Item = u64>) -> Vec<u64> {
let unique_voters = voters.into_iter().collect::<BTreeSet<_>>();
unique_voters.into_iter().collect()
}

View File

@ -0,0 +1,46 @@
use std::{collections::BTreeMap, fs, path::Path};
use serde::{Deserialize, Serialize};
/// Static node config written by TF for one OpenRaft node process.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct OpenRaftKvNodeConfig {
/// Stable OpenRaft node identifier.
pub node_id: u64,
/// HTTP port bound by the node process.
pub http_port: u16,
/// Advertised Raft address for this node.
pub public_addr: String,
/// Advertised Raft addresses for the other known nodes.
#[serde(default)]
pub peer_addrs: BTreeMap<u64, String>,
/// Heartbeat interval passed to the OpenRaft config.
#[serde(default = "default_heartbeat_interval_ms")]
pub heartbeat_interval_ms: u64,
/// Lower election timeout bound passed to OpenRaft.
#[serde(default = "default_election_timeout_min_ms")]
pub election_timeout_min_ms: u64,
/// Upper election timeout bound passed to OpenRaft.
#[serde(default = "default_election_timeout_max_ms")]
pub election_timeout_max_ms: u64,
}
impl OpenRaftKvNodeConfig {
/// Loads one node config from YAML on disk.
pub fn load(path: &Path) -> anyhow::Result<Self> {
let raw = fs::read_to_string(path)?;
Ok(serde_yaml::from_str(&raw)?)
}
}
const fn default_heartbeat_interval_ms() -> u64 {
500
}
const fn default_election_timeout_min_ms() -> u64 {
1_500
}
const fn default_election_timeout_max_ms() -> u64 {
3_000
}

View File

@ -0,0 +1,25 @@
//! OpenRaft-backed key-value node used by the `examples-simple-clusters`
//! branch.
/// HTTP client for interacting with one OpenRaft node.
pub mod client;
/// YAML node configuration used by TF and the node binary.
pub mod config;
mod network;
/// Axum server bootstrap and request handlers for one node process.
pub mod server;
/// Shared request, response, and state payload types.
pub mod types;
/// Re-export of the node HTTP client.
pub use client::OpenRaftKvClient;
/// Re-export of the node YAML config type.
pub use config::OpenRaftKvNodeConfig;
/// Re-export of the public request and state payloads.
pub use types::{
AddLearnerRequest, ChangeMembershipRequest, OpenRaftKvReadRequest, OpenRaftKvReadResponse,
OpenRaftKvState, OpenRaftKvWriteRequest, OpenRaftKvWriteResponse,
};
/// OpenRaft type configuration shared by the in-memory log and state machine.
pub type TypeConfig = openraft_memstore::TypeConfig;

View File

@ -0,0 +1,24 @@
use std::path::PathBuf;
use clap::Parser;
use openraft_kv_node::{config::OpenRaftKvNodeConfig, server::run_server};
use tracing_subscriber::EnvFilter;
#[derive(Parser, Clone, Debug)]
#[command(author, version, about)]
struct Opt {
#[arg(long)]
config: PathBuf,
}
#[tokio::main]
async fn main() -> anyhow::Result<()> {
tracing_subscriber::fmt()
.with_env_filter(EnvFilter::from_default_env())
.with_ansi(false)
.init();
let options = Opt::parse();
let config = OpenRaftKvNodeConfig::load(&options.config)?;
run_server(config).await
}

View File

@ -0,0 +1,158 @@
//! HTTP transport used by OpenRaft to replicate between example nodes.
use std::{collections::BTreeMap, sync::Arc};
use openraft::{
RaftNetworkFactory, RaftNetworkV2,
alias::{SnapshotOf, VoteOf},
errors::{RPCError, StreamingError, Unreachable},
network::RPCOption,
};
use reqwest::Url;
use tokio::sync::RwLock;
use crate::{
TypeConfig,
types::{InstallFullSnapshotBody, SnapshotRpcResult},
};
/// Shared node-address book used by Raft RPC clients.
#[derive(Clone, Default)]
pub struct HttpNetworkFactory {
client: reqwest::Client,
known_nodes: Arc<RwLock<BTreeMap<u64, String>>>,
}
/// Per-target HTTP client used for Raft replication traffic.
pub struct HttpNetworkClient {
client: reqwest::Client,
target: u64,
target_addr: Option<String>,
}
impl HttpNetworkFactory {
/// Creates a network factory backed by one shared node-address map.
#[must_use]
pub fn new(known_nodes: Arc<RwLock<BTreeMap<u64, String>>>) -> Self {
Self {
client: reqwest::Client::new(),
known_nodes,
}
}
}
impl RaftNetworkFactory<TypeConfig> for HttpNetworkFactory {
type Network = HttpNetworkClient;
async fn new_client(&mut self, target: u64, _node: &()) -> Self::Network {
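// The target address is captured once when this client is built; later
// updates to the shared address book only affect clients created afterwards.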
let target_addr = self.known_nodes.read().await.get(&target).cloned();
HttpNetworkClient {
client: self.client.clone(),
target,
target_addr,
}
}
}
impl RaftNetworkV2<TypeConfig> for HttpNetworkClient {
async fn append_entries(
&mut self,
rpc: openraft::raft::AppendEntriesRequest<TypeConfig>,
_option: RPCOption,
) -> Result<openraft::raft::AppendEntriesResponse<TypeConfig>, RPCError<TypeConfig>> {
self.post_rpc("raft/append", &rpc).await
}
async fn vote(
&mut self,
rpc: openraft::raft::VoteRequest<TypeConfig>,
_option: RPCOption,
) -> Result<openraft::raft::VoteResponse<TypeConfig>, RPCError<TypeConfig>> {
self.post_rpc("raft/vote", &rpc).await
}
async fn full_snapshot(
&mut self,
vote: VoteOf<TypeConfig>,
snapshot: SnapshotOf<TypeConfig>,
_cancel: impl std::future::Future<Output = openraft::errors::ReplicationClosed>
+ openraft::OptionalSend
+ 'static,
_option: RPCOption,
) -> Result<openraft::raft::SnapshotResponse<TypeConfig>, StreamingError<TypeConfig>> {
let body = InstallFullSnapshotBody {
vote,
meta: snapshot.meta,
data: snapshot.snapshot.into_inner(),
};
self.post_snapshot("raft/snapshot", &body).await
}
}
impl HttpNetworkClient {
async fn post_rpc<B, T>(&self, path: &str, body: &B) -> Result<T, RPCError<TypeConfig>>
where
B: serde::Serialize,
T: serde::de::DeserializeOwned,
{
let url = self.endpoint_url(path)?;
let response = self
.client
.post(url)
.json(body)
.send()
.await
.map_err(|err| RPCError::Unreachable(Unreachable::new(&err)))?
.error_for_status()
.map_err(|err| RPCError::Unreachable(Unreachable::new(&err)))?;
let result: Result<T, String> = response
.json()
.await
.map_err(|err| RPCError::Unreachable(Unreachable::new(&err)))?;
result.map_err(|err| RPCError::Unreachable(Unreachable::from_string(err)))
}
async fn post_snapshot(
&self,
path: &str,
body: &InstallFullSnapshotBody,
) -> Result<openraft::raft::SnapshotResponse<TypeConfig>, StreamingError<TypeConfig>> {
let url = self
.endpoint_url(path)
.map_err(|err| StreamingError::Unreachable(Unreachable::new(&err)))?;
let response = self
.client
.post(url)
.json(body)
.send()
.await
.map_err(|err| StreamingError::Unreachable(Unreachable::new(&err)))?
.error_for_status()
.map_err(|err| StreamingError::Unreachable(Unreachable::new(&err)))?;
let result: SnapshotRpcResult = response
.json()
.await
.map_err(|err| StreamingError::Unreachable(Unreachable::new(&err)))?;
result.map_err(|err| StreamingError::Unreachable(Unreachable::from_string(err)))
}
fn endpoint_url(&self, path: &str) -> Result<Url, Unreachable<TypeConfig>> {
let Some(addr) = &self.target_addr else {
return Err(Unreachable::from_string(format!(
"target {} has no known address",
self.target
)));
};
let mut url =
Url::parse(&format!("http://{addr}/")).map_err(|err| Unreachable::new(&err))?;
url.set_path(path);
Ok(url)
}
}

View File

@ -0,0 +1,276 @@
//! Axum server that exposes the OpenRaft example node and its admin endpoints.
use std::{
collections::{BTreeMap, BTreeSet},
sync::Arc,
};
use axum::{
Json, Router,
extract::State,
http::StatusCode,
routing::{get, post},
};
use openraft::{Config, Raft, SnapshotPolicy, type_config::async_runtime::WatchReceiver};
use openraft_memstore::{ClientRequest, MemLogStore, MemStateMachine, new_mem_store};
use tokio::sync::RwLock;
use tower_http::trace::TraceLayer;
use tracing::info;
use crate::{
TypeConfig,
config::OpenRaftKvNodeConfig,
network::HttpNetworkFactory,
types::{
AddLearnerRequest, AddLearnerResult, AppendRpcResult, ChangeMembershipRequest,
ChangeMembershipResult, InitResult, InstallSnapshotBody, MetricsResult,
OpenRaftKvReadRequest, OpenRaftKvReadResponse, OpenRaftKvState, OpenRaftKvWriteRequest,
OpenRaftKvWriteResponse, SnapshotRpcResult, VoteRpcResult,
},
};
type KnownNodes = Arc<RwLock<BTreeMap<u64, String>>>;
/// Shared state used by the HTTP handlers exposed by one node.
#[derive(Clone)]
pub struct AppState {
config: OpenRaftKvNodeConfig,
raft: Raft<TypeConfig, Arc<MemStateMachine>>,
state_machine: Arc<MemStateMachine>,
known_nodes: KnownNodes,
}
impl AppState {
/// Builds the application state for one node process.
pub fn new(
config: OpenRaftKvNodeConfig,
raft: Raft<TypeConfig, Arc<MemStateMachine>>,
state_machine: Arc<MemStateMachine>,
known_nodes: KnownNodes,
) -> Self {
Self {
config,
raft,
state_machine,
known_nodes,
}
}
}
/// Starts one OpenRaft-backed HTTP node.
pub async fn run_server(config: OpenRaftKvNodeConfig) -> anyhow::Result<()> {
let raft_config = Arc::new(
Config {
cluster_name: "openraft-kv".to_owned(),
heartbeat_interval: config.heartbeat_interval_ms,
election_timeout_min: config.election_timeout_min_ms,
election_timeout_max: config.election_timeout_max_ms,
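// No automatic snapshot builds; the example keeps the whole log in memory
// and only transfers a full snapshot when OpenRaft explicitly requests one.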
snapshot_policy: SnapshotPolicy::Never,
..Default::default()
}
.validate()?,
);
let known_nodes = Arc::new(RwLock::new(known_nodes(&config)));
let (log_store, state_machine): (Arc<MemLogStore>, Arc<MemStateMachine>) = new_mem_store();
let network = HttpNetworkFactory::new(known_nodes.clone());
let raft = Raft::new(
config.node_id,
raft_config,
network,
log_store,
state_machine.clone(),
)
.await?;
let app_state = AppState::new(config.clone(), raft, state_machine, known_nodes);
let app = router(app_state);
let address = std::net::SocketAddr::from(([0, 0, 0, 0], config.http_port));
info!(
node_id = config.node_id,
public_addr = %config.public_addr,
peers = ?config.peer_addrs,
%address,
"starting openraft kv node"
);
let listener = tokio::net::TcpListener::bind(address).await?;
axum::serve(listener, app).await?;
Ok(())
}
fn router(app_state: AppState) -> Router {
let app_routes = Router::new()
.route("/healthz", get(healthz))
.route("/state", get(cluster_state))
.route("/kv/write", post(write))
.route("/kv/read", post(read));
let admin_routes = Router::new()
.route("/admin/init", post(init))
.route("/admin/add-learner", post(add_learner))
.route("/admin/change-membership", post(change_membership))
.route("/admin/metrics", get(metrics));
let raft_routes = Router::new()
.route("/raft/vote", post(vote))
.route("/raft/append", post(append))
.route("/raft/snapshot", post(snapshot));
app_routes
.merge(admin_routes)
.merge(raft_routes)
.layer(TraceLayer::new_for_http())
.with_state(app_state)
}
async fn healthz() -> &'static str {
"ok"
}
async fn cluster_state(State(app): State<AppState>) -> Result<Json<OpenRaftKvState>, StatusCode> {
let metrics = app.raft.metrics().borrow_watched().clone();
let sm = app.state_machine.get_state_machine().await;
let voters = metrics
.membership_config
.membership()
.voter_ids()
.collect::<Vec<_>>();
let kv = sm.client_status.into_iter().collect::<BTreeMap<_, _>>();
Ok(Json(OpenRaftKvState {
node_id: app.config.node_id,
public_addr: app.config.public_addr.clone(),
role: format!("{:?}", metrics.state),
current_leader: metrics.current_leader,
current_term: metrics.current_term,
last_log_index: metrics.last_log_index,
last_applied_index: metrics.last_applied.as_ref().map(|log_id| log_id.index()),
voters,
kv,
}))
}
async fn metrics(State(app): State<AppState>) -> Json<MetricsResult> {
Json(Ok(app.raft.metrics().borrow_watched().clone()))
}
async fn init(State(app): State<AppState>) -> Json<InitResult> {
let members = BTreeSet::from([app.config.node_id]);
Json(
app.raft
.initialize(members)
.await
.map_err(|err| err.to_string()),
)
}
async fn add_learner(
State(app): State<AppState>,
Json(request): Json<AddLearnerRequest>,
) -> Json<AddLearnerResult> {
let mut known_nodes = app.known_nodes.write().await;
known_nodes.insert(request.node_id, request.addr.clone());
drop(known_nodes);
Json(
app.raft
.add_learner(request.node_id, (), true)
.await
.map(|_| ())
.map_err(|err| err.to_string()),
)
}
async fn change_membership(
State(app): State<AppState>,
Json(request): Json<ChangeMembershipRequest>,
) -> Json<ChangeMembershipResult> {
Json(
app.raft
.change_membership(request.voters.into_iter().collect::<BTreeSet<_>>(), false)
.await
.map(|_| ())
.map_err(|err| err.to_string()),
)
}
async fn write(
State(app): State<AppState>,
Json(request): Json<OpenRaftKvWriteRequest>,
) -> Json<Result<OpenRaftKvWriteResponse, String>> {
let result = app
.raft
.client_write(ClientRequest {
client: request.key,
serial: request.serial,
status: request.value,
})
.await
.map(|response| OpenRaftKvWriteResponse {
previous: response.response().0.clone(),
})
.map_err(|err| err.to_string());
Json(result)
}
async fn read(
State(app): State<AppState>,
Json(request): Json<OpenRaftKvReadRequest>,
) -> Json<Result<OpenRaftKvReadResponse, String>> {
let sm = app.state_machine.get_state_machine().await;
Json(Ok(OpenRaftKvReadResponse {
value: sm.client_status.get(&request.key).cloned(),
}))
}
async fn vote(
State(app): State<AppState>,
Json(request): Json<openraft::raft::VoteRequest<TypeConfig>>,
) -> Json<VoteRpcResult> {
Json(app.raft.vote(request).await.map_err(|err| err.to_string()))
}
async fn append(
State(app): State<AppState>,
Json(request): Json<openraft::raft::AppendEntriesRequest<TypeConfig>>,
) -> Json<AppendRpcResult> {
Json(
app.raft
.append_entries(request)
.await
.map_err(|err| err.to_string()),
)
}
async fn snapshot(
State(app): State<AppState>,
Json(request): Json<InstallSnapshotBody>,
) -> Json<SnapshotRpcResult> {
let snapshot = openraft::alias::SnapshotOf::<TypeConfig> {
meta: request.meta,
snapshot: std::io::Cursor::new(request.data),
};
Json(
app.raft
.install_full_snapshot(request.vote, snapshot)
.await
.map_err(|err| err.to_string()),
)
}
fn known_nodes(config: &OpenRaftKvNodeConfig) -> BTreeMap<u64, String> {
let mut known_nodes = config.peer_addrs.clone();
known_nodes.insert(config.node_id, config.public_addr.clone());
known_nodes
}

View File

@ -0,0 +1,112 @@
use std::collections::BTreeMap;
use openraft::{
RaftMetrics,
alias::{SnapshotMetaOf, VoteOf},
raft::InstallSnapshotRequest,
};
use serde::{Deserialize, Serialize};
use crate::TypeConfig;
/// Result shape used by the simple admin endpoints in this example.
pub type OpenRaftResult<T> = Result<T, String>;
/// Request body for a replicated write submitted through the leader.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct OpenRaftKvWriteRequest {
/// Application key to write.
pub key: String,
/// Value stored for the key.
pub value: String,
/// Client-side serial used by OpenRaft's example state machine.
pub serial: u64,
}
/// Response body returned after a replicated write is committed.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct OpenRaftKvWriteResponse {
/// Previous value stored under the key, if any.
pub previous: Option<String>,
}
/// Request body for a key lookup.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct OpenRaftKvReadRequest {
/// Application key to look up.
pub key: String,
}
/// Response body returned by a key lookup.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct OpenRaftKvReadResponse {
/// Current value stored under the key, if any.
pub value: Option<String>,
}
/// Admin request used to register a learner in the current cluster.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct AddLearnerRequest {
/// OpenRaft node identifier for the learner.
pub node_id: u64,
/// Advertised Raft address for the learner.
pub addr: String,
}
/// Admin request used to promote the cluster to a concrete voter set.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct ChangeMembershipRequest {
/// Full voter set that should own the cluster after the change.
pub voters: Vec<u64>,
}
/// Snapshot of one node's externally visible Raft and application state.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct OpenRaftKvState {
/// Stable OpenRaft node identifier.
pub node_id: u64,
/// Advertised Raft address for this node.
pub public_addr: String,
/// Current OpenRaft role rendered as text.
pub role: String,
/// Leader known by this node, if any.
pub current_leader: Option<u64>,
/// Current term reported by this node.
pub current_term: u64,
/// Highest log index stored locally.
pub last_log_index: Option<u64>,
/// Highest log index applied to the state machine.
pub last_applied_index: Option<u64>,
/// Current voter set reported by this node.
pub voters: Vec<u64>,
/// Application state machine contents.
pub kv: BTreeMap<String, String>,
}
/// JSON representation used for full-snapshot replication over HTTP.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct InstallFullSnapshotBody {
/// Vote bundled with the snapshot transfer.
pub vote: VoteOf<TypeConfig>,
/// Snapshot metadata describing the transferred state.
pub meta: SnapshotMetaOf<TypeConfig>,
/// Serialized state machine bytes.
pub data: Vec<u8>,
}
/// Serialized result of a vote RPC.
pub type VoteRpcResult = OpenRaftResult<openraft::raft::VoteResponse<TypeConfig>>;
/// Serialized result of an append-entries RPC.
pub type AppendRpcResult = OpenRaftResult<openraft::raft::AppendEntriesResponse<TypeConfig>>;
/// Serialized result of a full-snapshot RPC.
pub type SnapshotRpcResult = OpenRaftResult<openraft::raft::SnapshotResponse<TypeConfig>>;
/// JSON payload returned by the metrics endpoint.
pub type MetricsResult = OpenRaftResult<RaftMetrics<TypeConfig>>;
/// JSON payload returned by `/admin/init`.
pub type InitResult = OpenRaftResult<()>;
/// JSON payload returned by `/admin/add-learner`.
pub type AddLearnerResult = OpenRaftResult<()>;
/// JSON payload returned by `/admin/change-membership`.
pub type ChangeMembershipResult = OpenRaftResult<()>;
/// Request type accepted by the snapshot endpoint.
pub type InstallSnapshotBody = InstallSnapshotRequest<TypeConfig>;

View File

@ -0,0 +1,14 @@
[package]
edition.workspace = true
license.workspace = true
name = "openraft-kv-runtime-ext"
version.workspace = true
[dependencies]
async-trait = { workspace = true }
openraft-kv-node = { path = "../../openraft-kv-node" }
reqwest = { workspace = true }
testing-framework-core = { workspace = true }
testing-framework-runner-compose = { workspace = true }
testing-framework-runner-k8s = { workspace = true }
testing-framework-runner-local = { workspace = true }

View File

@ -0,0 +1,59 @@
use std::io::Error;
use openraft_kv_node::{OpenRaftKvClient, OpenRaftKvNodeConfig};
use testing_framework_core::scenario::{
Application, ClusterNodeConfigApplication, ClusterNodeView, ClusterPeerView, DynError,
NodeAccess, serialize_cluster_yaml_config,
};
/// Cluster topology used by the OpenRaft example scenarios (three nodes by default).
pub type OpenRaftKvTopology = testing_framework_core::topology::ClusterTopology;
/// Application environment wiring for the OpenRaft-backed key-value example.
pub struct OpenRaftKvEnv;
impl Application for OpenRaftKvEnv {
type Deployment = OpenRaftKvTopology;
type NodeClient = OpenRaftKvClient;
type NodeConfig = OpenRaftKvNodeConfig;
fn build_node_client(access: &NodeAccess) -> Result<Self::NodeClient, DynError> {
Ok(OpenRaftKvClient::new(access.api_base_url()?))
}
fn node_readiness_path() -> &'static str {
"/healthz"
}
}
impl ClusterNodeConfigApplication for OpenRaftKvEnv {
type ConfigError = Error;
fn static_network_port() -> u16 {
8080
}
fn build_cluster_node_config(
node: &ClusterNodeView,
peers: &[ClusterPeerView],
) -> Result<Self::NodeConfig, Self::ConfigError> {
Ok(OpenRaftKvNodeConfig {
node_id: node.index() as u64,
http_port: node.network_port(),
public_addr: node.authority(),
peer_addrs: peers
.iter()
.map(|peer| (peer.index() as u64, peer.authority()))
.collect(),
heartbeat_interval_ms: 500,
election_timeout_min_ms: 1_500,
election_timeout_max_ms: 3_000,
})
}
fn serialize_cluster_node_config(
config: &Self::NodeConfig,
) -> Result<String, Self::ConfigError> {
serialize_cluster_yaml_config(config).map_err(Error::other)
}
}

View File

@ -0,0 +1,112 @@
use std::{fs, path::Path};
use testing_framework_core::{
cfgsync::StaticNodeConfigProvider,
scenario::{Application, DynError},
topology::DeploymentDescriptor,
};
use testing_framework_runner_compose::{
BinaryConfigNodeSpec, ComposeDeployEnv, ComposeDescriptor, NodeDescriptor,
binary_config_node_runtime_spec, node_identifier,
};
use crate::OpenRaftKvEnv;
const NODE_CONFIG_PATH: &str = "/etc/openraft-kv/config.yaml";
const COMPOSE_HTTP_PORT_BASE: u16 = 47_080;
fn compose_node_spec() -> BinaryConfigNodeSpec {
BinaryConfigNodeSpec::conventional(
"/usr/local/bin/openraft-kv-node",
NODE_CONFIG_PATH,
vec![8080],
)
}
fn fixed_loopback_port_binding(host_port: u16, container_port: u16) -> String {
format!("127.0.0.1:{host_port}:{container_port}")
}
impl ComposeDeployEnv for OpenRaftKvEnv {
fn prepare_compose_configs(
path: &Path,
topology: &<Self as Application>::Deployment,
_cfgsync_port: u16,
_metrics_otlp_ingest_url: Option<&reqwest::Url>,
) -> Result<(), DynError> {
let hostnames = Self::cfgsync_hostnames(topology);
let stack_dir = path
.parent()
.ok_or_else(|| std::io::Error::other("compose config path has no parent"))?;
let configs_dir = stack_dir.join("configs");
fs::create_dir_all(&configs_dir)?;
for index in 0..topology.node_count() {
let mut config = Self::build_node_config(topology, index)?;
Self::rewrite_for_hostnames(topology, index, &hostnames, &mut config)?;
let rendered = Self::serialize_node_config(&config)?;
fs::write(
configs_dir.join(Self::static_node_config_file_name(index)),
rendered,
)?;
}
Ok(())
}
fn static_node_config_file_name(index: usize) -> String {
format!("node-{index}.yaml")
}
fn binary_config_node_spec(
_topology: &<Self as Application>::Deployment,
_index: usize,
) -> Result<Option<BinaryConfigNodeSpec>, DynError> {
Ok(Some(compose_node_spec()))
}
fn compose_descriptor(
topology: &<Self as Application>::Deployment,
_cfgsync_port: u16,
) -> Result<ComposeDescriptor, DynError> {
let spec = compose_node_spec();
let nodes = (0..topology.node_count())
.map(|index| {
let runtime = binary_config_node_runtime_spec(index, &spec);
let file_name = Self::static_node_config_file_name(index);
let host_port = COMPOSE_HTTP_PORT_BASE + index as u16;
let ports = compose_node_ports(host_port, &runtime.container_ports);
NodeDescriptor::new(
node_identifier(index),
runtime.image,
runtime.entrypoint,
vec![format!(
"./stack/configs/{file_name}:{}:ro",
spec.config_container_path
)],
runtime.extra_hosts,
ports,
runtime.container_ports,
runtime.environment,
runtime.platform,
)
})
.collect();
Ok(ComposeDescriptor::new(nodes))
}
}
fn compose_node_ports(host_port: u16, container_ports: &[u16]) -> Vec<String> {
container_ports
.iter()
.map(|port| {
// OpenRaft failover restarts the leader. Fixed host ports keep TF
// clients stable across `docker compose restart`.
fixed_loopback_port_binding(host_port, *port)
})
.collect()
}

View File

@ -0,0 +1,21 @@
use testing_framework_runner_k8s::{BinaryConfigK8sSpec, K8sBinaryApp};
use crate::OpenRaftKvEnv;
const CONTAINER_CONFIG_PATH: &str = "/etc/openraft-kv/config.yaml";
const CONTAINER_HTTP_PORT: u16 = 8080;
const SERVICE_TESTING_PORT: u16 = 8081;
const NODE_NAME_PREFIX: &str = "openraft-kv-node";
impl K8sBinaryApp for OpenRaftKvEnv {
fn k8s_binary_spec() -> BinaryConfigK8sSpec {
BinaryConfigK8sSpec::conventional(
"openraft-kv",
NODE_NAME_PREFIX,
"/usr/local/bin/openraft-kv-node",
CONTAINER_CONFIG_PATH,
CONTAINER_HTTP_PORT,
SERVICE_TESTING_PORT,
)
}
}

View File

@ -0,0 +1,18 @@
mod app;
mod compose_env;
mod k8s_env;
mod local_env;
mod observation;
pub mod scenario;
pub use app::*;
pub use observation::*;
pub use scenario::{OpenRaftKvBuilderExt, OpenRaftKvScenarioBuilder};
/// Local process deployer for the OpenRaft example app.
pub type OpenRaftKvLocalDeployer = testing_framework_runner_local::ProcessDeployer<OpenRaftKvEnv>;
/// Docker Compose deployer for the OpenRaft example app.
pub type OpenRaftKvComposeDeployer =
testing_framework_runner_compose::ComposeDeployer<OpenRaftKvEnv>;
/// Kubernetes deployer for the OpenRaft example app.
pub type OpenRaftKvK8sDeployer = testing_framework_runner_k8s::K8sDeployer<OpenRaftKvEnv>;

View File

@ -0,0 +1,125 @@
use std::collections::{BTreeMap, HashMap};
use openraft_kv_node::OpenRaftKvNodeConfig;
use testing_framework_core::{
scenario::{DynError, StartNodeOptions},
topology::DeploymentDescriptor,
};
use testing_framework_runner_local::{
BuiltNodeConfig, LocalDeployerEnv, LocalNodePorts, LocalProcessSpec, NodeConfigEntry,
reserve_local_node_ports, yaml_node_config,
};
use crate::OpenRaftKvEnv;
impl LocalDeployerEnv for OpenRaftKvEnv {
fn build_node_config_from_template(
_topology: &Self::Deployment,
index: usize,
_peer_ports_by_name: &HashMap<String, u16>,
_options: &StartNodeOptions<Self>,
peer_ports: &[u16],
template_config: Option<&OpenRaftKvNodeConfig>,
) -> Result<BuiltNodeConfig<OpenRaftKvNodeConfig>, DynError> {
let mut reserved = reserve_local_node_ports(1, &[], "node")
.map_err(|source| -> DynError { source.into() })?;
let ports = reserved
.pop()
.ok_or_else(|| std::io::Error::other("failed to reserve local node ports"))?;
let mut config = template_config
.cloned()
.unwrap_or_else(|| local_node_config(index, ports.network_port(), BTreeMap::new()));
// OpenRaft peer config is index-sensitive, so local restarts must rebuild
// the full peer map from the current reserved port set.
let network_port = ports.network_port();
config.node_id = index as u64;
config.http_port = network_port;
config.public_addr = local_addr(network_port);
config.peer_addrs = peer_addrs_from_ports(peer_ports, index);
Ok(BuiltNodeConfig {
config,
network_port,
})
}
fn build_initial_node_configs(
topology: &Self::Deployment,
) -> Result<
Vec<NodeConfigEntry<OpenRaftKvNodeConfig>>,
testing_framework_runner_local::process::ProcessSpawnError,
> {
let reserved_ports = reserve_local_node_ports(topology.node_count(), &[], "node")?;
let peer_ports = reserved_ports
.iter()
.map(LocalNodePorts::network_port)
.collect::<Vec<_>>();
// Build every node from the same reserved port view so the initial
// cluster starts with a consistent peer list on all nodes.
Ok(reserved_ports
.iter()
.enumerate()
.map(|(index, ports)| NodeConfigEntry {
name: format!("node-{index}"),
config: local_node_config(
index,
ports.network_port(),
peer_addrs_from_ports(&peer_ports, index),
),
})
.collect())
}
fn initial_node_name_prefix() -> &'static str {
"node"
}
fn local_process_spec() -> Option<LocalProcessSpec> {
Some(
LocalProcessSpec::new("OPENRAFT_KV_NODE_BIN", "openraft-kv-node").with_rust_log("info"),
)
}
fn render_local_config(config: &OpenRaftKvNodeConfig) -> Result<Vec<u8>, DynError> {
yaml_node_config(config)
}
fn http_api_port(config: &OpenRaftKvNodeConfig) -> Option<u16> {
Some(config.http_port)
}
}
fn local_node_config(
index: usize,
network_port: u16,
peer_addrs: BTreeMap<u64, String>,
) -> OpenRaftKvNodeConfig {
OpenRaftKvNodeConfig {
node_id: index as u64,
http_port: network_port,
public_addr: local_addr(network_port),
peer_addrs,
heartbeat_interval_ms: 500,
election_timeout_min_ms: 1_500,
election_timeout_max_ms: 3_000,
}
}
fn peer_addrs_from_ports(peer_ports: &[u16], local_index: usize) -> BTreeMap<u64, String> {
peer_ports
.iter()
.enumerate()
.filter(|(peer_index, _)| *peer_index != local_index)
.map(|(peer_index, peer_port)| (peer_index as u64, local_addr(*peer_port)))
.collect()
}
fn local_addr(port: u16) -> String {
format!("127.0.0.1:{port}")
}

View File

@ -0,0 +1,262 @@
use std::{
collections::{BTreeMap, BTreeSet},
sync::Arc,
time::Duration,
};
use async_trait::async_trait;
use openraft_kv_node::{OpenRaftKvClient, OpenRaftKvState};
use testing_framework_core::{
observation::{
BoxedSourceProvider, ObservationConfig, ObservedSource, Observer, StaticSourceProvider,
},
scenario::{Application, DynError, NodeClients},
};
use testing_framework_runner_k8s::ManualCluster;
use crate::OpenRaftKvEnv;
const OBSERVATION_INTERVAL: Duration = Duration::from_millis(250);
const OBSERVATION_HISTORY_LIMIT: usize = 16;
/// Materialized OpenRaft cluster state built from the latest node polls.
#[derive(Clone, Debug, Default)]
pub struct OpenRaftClusterSnapshot {
states: Vec<OpenRaftKvState>,
failures: Vec<OpenRaftSourceFailure>,
}
impl OpenRaftClusterSnapshot {
/// Returns the successfully observed node states sorted by node id.
#[must_use]
pub fn states(&self) -> &[OpenRaftKvState] {
&self.states
}
/// Returns `true` when the snapshot contains no successful node states.
#[must_use]
pub fn is_empty(&self) -> bool {
self.states.is_empty()
}
/// Returns the unique observed leader when all responding nodes agree.
#[must_use]
pub fn agreed_leader(&self, different_from: Option<u64>) -> Option<u64> {
let observed = self
.states
.iter()
.filter_map(|state| state.current_leader)
.collect::<BTreeSet<_>>();
let leader = observed.iter().next().copied()?;
(observed.len() == 1 && different_from != Some(leader)).then_some(leader)
}
/// Returns `true` when every observed node reports the expected voter set.
#[must_use]
pub fn all_voters_match(&self, expected_voters: &BTreeSet<u64>) -> bool {
!self.states.is_empty()
&& self.failures.is_empty()
&& self.states.iter().all(|state| {
state.voters.iter().copied().collect::<BTreeSet<_>>() == *expected_voters
})
}
/// Returns `true` when every observed node exposes the expected replicated
/// key/value data.
#[must_use]
pub fn all_kv_match(
&self,
expected: &BTreeMap<String, String>,
full_voter_set: &[u64],
) -> bool {
!self.states.is_empty()
&& self.failures.is_empty()
&& self.states.iter().all(|state| {
state.current_leader.is_some()
&& state.voters == full_voter_set
&& expected
.iter()
.all(|(key, value)| state.kv.get(key) == Some(value))
})
}
/// Returns a concise summary for timeout and validation errors.
#[must_use]
pub fn summary(&self) -> String {
let mut lines = self
.states
.iter()
.map(|state| {
format!(
"node={} leader={:?} voters={:?} keys={}",
state.node_id,
state.current_leader,
state.voters,
state.kv.len()
)
})
.collect::<Vec<_>>();
lines.extend(self.failures.iter().map(OpenRaftSourceFailure::summary));
if lines.is_empty() {
return "no state observed yet".to_owned();
}
lines.join("; ")
}
}
/// One failed source read captured during an observation cycle.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct OpenRaftSourceFailure {
source_name: String,
message: String,
}
impl OpenRaftSourceFailure {
fn new(source_name: &str, message: &str) -> Self {
Self {
source_name: source_name.to_owned(),
message: message.to_owned(),
}
}
fn summary(&self) -> String {
format!("source={} error={}", self.source_name, self.message)
}
}
/// Observer that keeps the latest per-node OpenRaft state.
#[derive(Clone, Debug, Default)]
pub struct OpenRaftClusterObserver;
impl OpenRaftClusterObserver {
/// Default runtime configuration for the OpenRaft example observer.
#[must_use]
pub fn config() -> ObservationConfig {
ObservationConfig {
interval: OBSERVATION_INTERVAL,
history_limit: OBSERVATION_HISTORY_LIMIT,
}
}
}
/// Captures one best-effort OpenRaft cluster snapshot from the provided node
/// clients.
pub async fn capture_openraft_cluster_snapshot(
clients: &[OpenRaftKvClient],
) -> OpenRaftClusterSnapshot {
capture_cluster_snapshot(&named_sources(clients.to_vec())).await
}
#[async_trait]
impl Observer for OpenRaftClusterObserver {
type Source = OpenRaftKvClient;
type State = OpenRaftClusterSnapshot;
type Snapshot = OpenRaftClusterSnapshot;
type Event = ();
async fn init(
&self,
sources: &[ObservedSource<Self::Source>],
) -> Result<Self::State, DynError> {
Ok(capture_cluster_snapshot(sources).await)
}
async fn poll(
&self,
sources: &[ObservedSource<Self::Source>],
state: &mut Self::State,
) -> Result<Vec<Self::Event>, DynError> {
*state = capture_cluster_snapshot(sources).await;
Ok(Vec::new())
}
fn snapshot(&self, state: &Self::State) -> Self::Snapshot {
state.clone()
}
}
/// Builds the fixed source provider used by the scenario-based OpenRaft
/// examples.
pub fn openraft_cluster_source_provider(
_deployment: &<OpenRaftKvEnv as Application>::Deployment,
node_clients: NodeClients<OpenRaftKvEnv>,
) -> Result<BoxedSourceProvider<OpenRaftKvClient>, DynError> {
Ok(Box::new(StaticSourceProvider::new(named_sources(
node_clients.snapshot(),
))))
}
/// Dynamic source provider backed by a manual cluster.
///
/// This keeps observation aligned with the latest client handles after manual
/// node restarts.
#[derive(Clone)]
pub struct OpenRaftManualClusterSourceProvider {
cluster: Arc<ManualCluster<OpenRaftKvEnv>>,
node_names: Vec<String>,
}
impl OpenRaftManualClusterSourceProvider {
/// Builds a provider for the fixed node names used by the OpenRaft
/// examples.
#[must_use]
pub fn new(cluster: Arc<ManualCluster<OpenRaftKvEnv>>, node_count: usize) -> Self {
Self {
cluster,
node_names: (0..node_count)
.map(|index| format!("node-{index}"))
.collect(),
}
}
}
#[async_trait]
impl testing_framework_core::observation::SourceProvider<OpenRaftKvClient>
for OpenRaftManualClusterSourceProvider
{
async fn sources(&self) -> Result<Vec<ObservedSource<OpenRaftKvClient>>, DynError> {
Ok(self
.node_names
.iter()
.filter_map(|name| {
self.cluster
.node_client(name)
.map(|client| ObservedSource::new(name, client))
})
.collect())
}
}
fn named_sources(clients: Vec<OpenRaftKvClient>) -> Vec<ObservedSource<OpenRaftKvClient>> {
clients
.into_iter()
.enumerate()
.map(|(index, client)| ObservedSource::new(&format!("node-{index}"), client))
.collect()
}
async fn capture_cluster_snapshot(
sources: &[ObservedSource<OpenRaftKvClient>],
) -> OpenRaftClusterSnapshot {
let mut states = Vec::with_capacity(sources.len());
let mut failures = Vec::new();
for source in sources {
match source.source.state().await {
Ok(state) => states.push(state),
Err(error) => {
failures.push(OpenRaftSourceFailure::new(&source.name, &error.to_string()))
}
}
}
states.sort_by_key(|state| state.node_id);
OpenRaftClusterSnapshot { states, failures }
}

View File

@ -0,0 +1,32 @@
use testing_framework_core::scenario::{CoreBuilderExt, ScenarioBuilder};
use crate::{
OpenRaftClusterObserver, OpenRaftKvEnv, OpenRaftKvTopology, openraft_cluster_source_provider,
};
/// Scenario builder alias used by the OpenRaft example binaries.
pub type OpenRaftKvScenarioBuilder = ScenarioBuilder<OpenRaftKvEnv>;
/// Convenience helpers for constructing the fixed three-node OpenRaft topology.
pub trait OpenRaftKvBuilderExt: Sized {
/// Starts from the default three-node deployment and lets callers adjust
/// it.
fn deployment_with(f: impl FnOnce(OpenRaftKvTopology) -> OpenRaftKvTopology) -> Self;
/// Attaches the default OpenRaft cluster observer to the scenario.
fn with_cluster_observer(self) -> Self;
}
impl OpenRaftKvBuilderExt for OpenRaftKvScenarioBuilder {
fn deployment_with(f: impl FnOnce(OpenRaftKvTopology) -> OpenRaftKvTopology) -> Self {
OpenRaftKvScenarioBuilder::with_deployment(f(OpenRaftKvTopology::new(3)))
}
fn with_cluster_observer(self) -> Self {
self.with_observer(
OpenRaftClusterObserver,
openraft_cluster_source_provider,
OpenRaftClusterObserver::config(),
)
}
}

View File

@ -0,0 +1,15 @@
[package]
edition.workspace = true
license.workspace = true
name = "openraft-kv-runtime-workloads"
version.workspace = true
[dependencies]
anyhow = "1.0"
async-trait = { workspace = true }
openraft-kv-node = { path = "../../openraft-kv-node" }
openraft-kv-runtime-ext = { path = "../integration" }
testing-framework-core = { workspace = true }
thiserror = "2.0"
tokio = { workspace = true, features = ["full"] }
tracing = { workspace = true }

View File

@ -0,0 +1,61 @@
use std::time::Duration;
use async_trait::async_trait;
use openraft_kv_runtime_ext::{OpenRaftClusterObserver, OpenRaftKvEnv};
use testing_framework_core::{
observation::ObservationHandle,
scenario::{DynError, Expectation, RunContext},
};
use crate::support::{expected_kv, wait_for_observed_replication};
/// Expectation that waits for the full voter set and the writes from this run
/// to converge on every node.
#[derive(Clone)]
pub struct OpenRaftKvConverges {
total_writes: usize,
timeout: Duration,
key_prefix: String,
}
impl OpenRaftKvConverges {
/// Creates a convergence check for the given number of replicated writes.
#[must_use]
pub fn new(total_writes: usize) -> Self {
Self {
total_writes,
timeout: Duration::from_secs(30),
key_prefix: "raft-key".to_owned(),
}
}
/// Overrides the key prefix used to derive expected writes.
#[must_use]
pub fn key_prefix(mut self, value: &str) -> Self {
self.key_prefix = value.to_owned();
self
}
/// Overrides the convergence timeout.
#[must_use]
pub const fn timeout(mut self, value: Duration) -> Self {
self.timeout = value;
self
}
}
#[async_trait]
impl Expectation<OpenRaftKvEnv> for OpenRaftKvConverges {
fn name(&self) -> &str {
"openraft_kv_converges"
}
async fn evaluate(&mut self, ctx: &RunContext<OpenRaftKvEnv>) -> Result<(), DynError> {
let expected = expected_kv(&self.key_prefix, self.total_writes);
let observer = ctx.require_extension::<ObservationHandle<OpenRaftClusterObserver>>()?;
wait_for_observed_replication(&observer, &expected, self.timeout).await?;
Ok(())
}
}

View File

@ -0,0 +1,207 @@
use std::time::Duration;
use async_trait::async_trait;
use openraft_kv_node::OpenRaftKvClient;
use openraft_kv_runtime_ext::{OpenRaftClusterObserver, OpenRaftKvEnv};
use testing_framework_core::{
observation::ObservationHandle,
scenario::{DynError, RunContext, Workload},
};
use tracing::info;
use crate::support::{
OpenRaftMembership, ensure_cluster_size, resolve_client_for_node, wait_for_observed_leader,
wait_for_observed_membership, write_batch,
};
/// Workload that bootstraps the cluster, expands it to three voters, writes one
/// batch, restarts the leader, then writes a second batch through the new
/// leader.
#[derive(Clone)]
pub struct OpenRaftKvFailoverWorkload {
first_batch: usize,
second_batch: usize,
timeout: Duration,
key_prefix: String,
}
impl OpenRaftKvFailoverWorkload {
/// Creates the default failover workload configuration.
#[must_use]
pub fn new() -> Self {
Self {
first_batch: 8,
second_batch: 8,
timeout: Duration::from_secs(30),
key_prefix: "raft-key".to_owned(),
}
}
/// Sets the number of writes issued before the leader restart.
#[must_use]
pub const fn first_batch(mut self, value: usize) -> Self {
self.first_batch = value;
self
}
/// Sets the number of writes issued after the leader restart.
#[must_use]
pub const fn second_batch(mut self, value: usize) -> Self {
self.second_batch = value;
self
}
/// Overrides the key prefix used for generated writes.
#[must_use]
pub fn key_prefix(mut self, value: &str) -> Self {
self.key_prefix = value.to_owned();
self
}
/// Overrides the timeout used for leader and membership transitions.
#[must_use]
pub const fn timeout(mut self, value: Duration) -> Self {
self.timeout = value;
self
}
}
impl Default for OpenRaftKvFailoverWorkload {
fn default() -> Self {
Self::new()
}
}
#[async_trait]
impl Workload<OpenRaftKvEnv> for OpenRaftKvFailoverWorkload {
fn name(&self) -> &str {
"openraft_kv_failover_workload"
}
async fn start(&self, ctx: &RunContext<OpenRaftKvEnv>) -> Result<(), DynError> {
let clients = ctx.node_clients().snapshot();
let observer = ctx.require_extension::<ObservationHandle<OpenRaftClusterObserver>>()?;
ensure_cluster_size(&clients, 3)?;
self.bootstrap_cluster(&clients).await?;
let initial_leader = wait_for_observed_leader(&observer, self.timeout, None).await?;
let membership = OpenRaftMembership::discover(&clients).await?;
self.promote_cluster(&observer, &clients, initial_leader, &membership)
.await?;
self.write_initial_batch(&clients, initial_leader).await?;
let new_leader = self
.restart_leader_and_wait_for_failover(ctx, &observer, initial_leader)
.await?;
self.write_second_batch(&clients, new_leader).await?;
Ok(())
}
}
impl OpenRaftKvFailoverWorkload {
async fn bootstrap_cluster(&self, clients: &[OpenRaftKvClient]) -> Result<(), DynError> {
info!("initializing openraft cluster");
clients[0].init_self().await?;
Ok(())
}
async fn promote_cluster(
&self,
observer: &ObservationHandle<OpenRaftClusterObserver>,
clients: &[OpenRaftKvClient],
leader_id: u64,
membership: &OpenRaftMembership,
) -> Result<(), DynError> {
let leader = resolve_client_for_node(clients, leader_id, self.timeout).await?;
for learner in membership.learner_targets(leader_id) {
info!(
target = learner.node_id,
addr = %learner.public_addr,
"adding learner"
);
leader
.add_learner(learner.node_id, &learner.public_addr)
.await?;
}
let voter_ids = membership.voter_ids();
leader.change_membership(voter_ids.iter().copied()).await?;
wait_for_observed_membership(observer, &voter_ids, self.timeout).await?;
Ok(())
}
async fn write_initial_batch(
&self,
clients: &[OpenRaftKvClient],
leader_id: u64,
) -> Result<(), DynError> {
info!(
leader = leader_id,
writes = self.first_batch,
"writing initial batch"
);
let leader = resolve_client_for_node(clients, leader_id, self.timeout).await?;
write_batch(&leader, &self.key_prefix, 0, self.first_batch).await?;
Ok(())
}
async fn restart_leader_and_wait_for_failover(
&self,
ctx: &RunContext<OpenRaftKvEnv>,
observer: &ObservationHandle<OpenRaftClusterObserver>,
leader_id: u64,
) -> Result<u64, DynError> {
let Some(control) = ctx.node_control() else {
return Err("openraft failover workload requires node control".into());
};
let leader_name = format!("node-{leader_id}");
info!(%leader_name, "restarting current leader");
control.restart_node(&leader_name).await?;
let new_leader = wait_for_observed_leader(observer, self.timeout, Some(leader_id)).await?;
info!(
old_leader = leader_id,
new_leader, "leader changed after restart"
);
Ok(new_leader)
}
async fn write_second_batch(
&self,
clients: &[OpenRaftKvClient],
leader_id: u64,
) -> Result<(), DynError> {
info!(
leader = leader_id,
writes = self.second_batch,
"writing second batch"
);
let leader = resolve_client_for_node(clients, leader_id, self.timeout).await?;
write_batch(
&leader,
&self.key_prefix,
self.first_batch,
self.second_batch,
)
.await?;
Ok(())
}
}

View File

@ -0,0 +1,14 @@
mod convergence;
mod failover;
mod support;
/// Replication expectation used by the OpenRaft example binaries.
pub use convergence::OpenRaftKvConverges;
/// Failover workload used by the OpenRaft example binaries.
pub use failover::OpenRaftKvFailoverWorkload;
/// Shared cluster helpers used by the OpenRaft workload and manual k8s example.
pub use support::{
FULL_VOTER_SET, OpenRaftClusterError, OpenRaftMembership, ensure_cluster_size, expected_kv,
resolve_client_for_node, wait_for_leader, wait_for_membership, wait_for_observed_leader,
wait_for_observed_membership, wait_for_observed_replication, wait_for_replication, write_batch,
};

View File

@ -0,0 +1,328 @@
use std::{
collections::{BTreeMap, BTreeSet},
time::Duration,
};
use openraft_kv_node::{OpenRaftKvClient, OpenRaftKvState};
use openraft_kv_runtime_ext::{
OpenRaftClusterObserver, OpenRaftClusterSnapshot, capture_openraft_cluster_snapshot,
};
use testing_framework_core::observation::{ObservationHandle, ObservationSnapshot};
use thiserror::Error;
use tokio::time::{Instant, sleep};
const POLL_INTERVAL: Duration = Duration::from_millis(250);
const CLIENT_RESOLUTION_INTERVAL: Duration = Duration::from_millis(200);
/// Fixed voter set used by the example cluster.
pub const FULL_VOTER_SET: [u64; 3] = [0, 1, 2];
/// One learner candidate discovered from cluster state.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct LearnerTarget {
/// Node identifier used by OpenRaft membership.
pub node_id: u64,
/// Public address advertised for Raft traffic.
pub public_addr: String,
}
/// Membership view captured from the current node states.
#[derive(Clone, Debug)]
pub struct OpenRaftMembership {
states: Vec<OpenRaftKvState>,
}
impl OpenRaftMembership {
/// Builds a membership view from already observed node states.
#[must_use]
pub fn from_states(states: &[OpenRaftKvState]) -> Self {
let mut states = states.to_vec();
states.sort_by_key(|state| state.node_id);
Self { states }
}
/// Reads and sorts the current node states by id.
pub async fn discover(clients: &[OpenRaftKvClient]) -> Result<Self, OpenRaftClusterError> {
let mut states = Vec::with_capacity(clients.len());
for client in clients {
states.push(client.state().await.map_err(OpenRaftClusterError::Client)?);
}
Ok(Self::from_states(&states))
}
/// Returns the full voter set implied by the discovered nodes.
#[must_use]
pub fn voter_ids(&self) -> BTreeSet<u64> {
self.states.iter().map(|state| state.node_id).collect()
}
/// Returns every non-leader node as a learner target.
#[must_use]
pub fn learner_targets(&self, leader_id: u64) -> Vec<LearnerTarget> {
self.states
.iter()
.filter(|state| state.node_id != leader_id)
.map(|state| LearnerTarget {
node_id: state.node_id,
public_addr: state.public_addr.clone(),
})
.collect()
}
}
/// Errors raised by the OpenRaft example cluster helpers.
#[derive(Debug, Error)]
pub enum OpenRaftClusterError {
#[error("openraft example requires at least {expected} node clients, got {actual}")]
InsufficientClients { expected: usize, actual: usize },
#[error("failed to query openraft node state: {0}")]
Client(#[source] anyhow::Error),
#[error("openraft cluster observation is not available yet")]
MissingObservation,
#[error(
"timed out waiting for {action} after {timeout:?}; last observation: {last_observation}"
)]
Timeout {
action: &'static str,
timeout: Duration,
last_observation: String,
},
#[error("timed out resolving node client for {node_id} after {timeout:?}")]
ClientResolution { node_id: u64, timeout: Duration },
}
/// Ensures the example cluster has the expected number of node clients.
pub fn ensure_cluster_size(
clients: &[OpenRaftKvClient],
expected: usize,
) -> Result<(), OpenRaftClusterError> {
if clients.len() < expected {
return Err(OpenRaftClusterError::InsufficientClients {
expected,
actual: clients.len(),
});
}
Ok(())
}
/// Waits until the cluster converges on one leader.
pub async fn wait_for_leader(
clients: &[OpenRaftKvClient],
timeout: Duration,
different_from: Option<u64>,
) -> Result<u64, OpenRaftClusterError> {
let deadline = Instant::now() + timeout;
loop {
let last_observation = capture_openraft_cluster_snapshot(clients).await;
if let Some(leader) = last_observation.agreed_leader(different_from) {
return Ok(leader);
}
if Instant::now() >= deadline {
return Err(OpenRaftClusterError::Timeout {
action: "leader agreement",
timeout,
last_observation: last_observation.summary(),
});
}
sleep(POLL_INTERVAL).await;
}
}
/// Waits until every node reports the expected voter set.
pub async fn wait_for_membership(
clients: &[OpenRaftKvClient],
expected_voters: &BTreeSet<u64>,
timeout: Duration,
) -> Result<(), OpenRaftClusterError> {
let deadline = Instant::now() + timeout;
loop {
let last_observation = capture_openraft_cluster_snapshot(clients).await;
if last_observation.all_voters_match(expected_voters) {
return Ok(());
}
if Instant::now() >= deadline {
return Err(OpenRaftClusterError::Timeout {
action: "membership convergence",
timeout,
last_observation: last_observation.summary(),
});
}
sleep(POLL_INTERVAL).await;
}
}
/// Waits until every node reports the full replicated key set.
pub async fn wait_for_replication(
clients: &[OpenRaftKvClient],
expected: &BTreeMap<String, String>,
timeout: Duration,
) -> Result<(), OpenRaftClusterError> {
let deadline = Instant::now() + timeout;
loop {
let last_observation = capture_openraft_cluster_snapshot(clients).await;
if last_observation.all_kv_match(expected, &FULL_VOTER_SET) {
return Ok(());
}
if Instant::now() >= deadline {
return Err(OpenRaftClusterError::Timeout {
action: "replicated state convergence",
timeout,
last_observation: last_observation.summary(),
});
}
sleep(POLL_INTERVAL).await;
}
}
/// Waits until the observer reports one agreed leader.
pub async fn wait_for_observed_leader(
handle: &ObservationHandle<OpenRaftClusterObserver>,
timeout: Duration,
different_from: Option<u64>,
) -> Result<u64, OpenRaftClusterError> {
let snapshot =
wait_for_observed_snapshot(handle, timeout, "observed leader agreement", |snapshot| {
snapshot.agreed_leader(different_from).is_some()
})
.await?;
snapshot
.value
.agreed_leader(different_from)
.ok_or(OpenRaftClusterError::MissingObservation)
}
/// Waits until the observer reports the expected voter set on every node.
pub async fn wait_for_observed_membership(
handle: &ObservationHandle<OpenRaftClusterObserver>,
expected_voters: &BTreeSet<u64>,
timeout: Duration,
) -> Result<(), OpenRaftClusterError> {
wait_for_observed_snapshot(
handle,
timeout,
"observed membership convergence",
|snapshot| snapshot.all_voters_match(expected_voters),
)
.await?;
Ok(())
}
/// Waits until the observer reports the full replicated key set.
pub async fn wait_for_observed_replication(
handle: &ObservationHandle<OpenRaftClusterObserver>,
expected: &BTreeMap<String, String>,
timeout: Duration,
) -> Result<(), OpenRaftClusterError> {
wait_for_observed_snapshot(
handle,
timeout,
"observed replicated state convergence",
|snapshot| snapshot.all_kv_match(expected, &FULL_VOTER_SET),
)
.await?;
Ok(())
}
/// Resolves the client handle that currently identifies as `node_id`.
pub async fn resolve_client_for_node(
clients: &[OpenRaftKvClient],
node_id: u64,
timeout: Duration,
) -> Result<OpenRaftKvClient, OpenRaftClusterError> {
let deadline = Instant::now() + timeout;
loop {
for client in clients {
let Ok(state) = client.state().await else {
continue;
};
if state.node_id == node_id {
return Ok(client.clone());
}
}
if Instant::now() >= deadline {
return Err(OpenRaftClusterError::ClientResolution { node_id, timeout });
}
sleep(CLIENT_RESOLUTION_INTERVAL).await;
}
}
/// Issues a contiguous batch of writes through the current leader.
pub async fn write_batch(
leader: &OpenRaftKvClient,
prefix: &str,
start: usize,
count: usize,
) -> Result<(), OpenRaftClusterError> {
for index in start..(start + count) {
let key = format!("{prefix}-{index}");
let value = format!("value-{index}");
leader
.write(&key, &value, index as u64 + 1)
.await
.map_err(OpenRaftClusterError::Client)?;
}
Ok(())
}
/// Builds the replicated key/value map expected after the workload completes.
#[must_use]
pub fn expected_kv(prefix: &str, total_writes: usize) -> BTreeMap<String, String> {
(0..total_writes)
.map(|index| (format!("{prefix}-{index}"), format!("value-{index}")))
.collect()
}
async fn wait_for_observed_snapshot(
handle: &ObservationHandle<OpenRaftClusterObserver>,
timeout: Duration,
action: &'static str,
matches: impl Fn(&OpenRaftClusterSnapshot) -> bool,
) -> Result<ObservationSnapshot<OpenRaftClusterSnapshot>, OpenRaftClusterError> {
let deadline = Instant::now() + timeout;
let mut last_summary = "no state observed yet".to_owned();
loop {
if let Some(snapshot) = handle.latest_snapshot() {
last_summary = snapshot.value.summary();
if matches(&snapshot.value) {
return Ok(snapshot);
}
}
if Instant::now() >= deadline {
return Err(OpenRaftClusterError::Timeout {
action,
timeout,
last_observation: last_summary,
});
}
sleep(POLL_INTERVAL).await;
}
}

View File

@ -26,15 +26,15 @@ Each example follows the same pattern:
## Run locally
```bash
cargo run -p pubsub-examples --bin basic_ws_roundtrip
cargo run -p pubsub-examples --bin basic_ws_reconnect
cargo run -p pubsub-examples --bin pubsub_basic_ws_roundtrip
cargo run -p pubsub-examples --bin pubsub_basic_ws_reconnect
```
## Run with Docker Compose
```bash
cargo run -p pubsub-examples --bin compose_ws_roundtrip
cargo run -p pubsub-examples --bin compose_ws_reconnect
cargo run -p pubsub-examples --bin pubsub_compose_ws_roundtrip
cargo run -p pubsub-examples --bin pubsub_compose_ws_reconnect
```
Set `PUBSUB_IMAGE` to override the default compose image tag.
@ -43,7 +43,7 @@ Set `PUBSUB_IMAGE` to override the default compose image tag.
```bash
docker build -t pubsub-node:local -f examples/pubsub/Dockerfile .
cargo run -p pubsub-examples --bin k8s_ws_roundtrip
cargo run -p pubsub-examples --bin pubsub_k8s_ws_roundtrip
```
Prerequisites:
@ -57,5 +57,5 @@ Optional image override:
```bash
docker build -t pubsub-node:local -f examples/pubsub/Dockerfile .
cargo run -p pubsub-examples --bin k8s_manual_ws_roundtrip
cargo run -p pubsub-examples --bin pubsub_k8s_manual_ws_roundtrip
```

View File

@ -4,6 +4,30 @@ license.workspace = true
name = "pubsub-examples"
version.workspace = true
[[bin]]
name = "pubsub_basic_ws_roundtrip"
path = "src/bin/basic_ws_roundtrip.rs"
[[bin]]
name = "pubsub_basic_ws_reconnect"
path = "src/bin/basic_ws_reconnect.rs"
[[bin]]
name = "pubsub_compose_ws_roundtrip"
path = "src/bin/compose_ws_roundtrip.rs"
[[bin]]
name = "pubsub_compose_ws_reconnect"
path = "src/bin/compose_ws_reconnect.rs"
[[bin]]
name = "pubsub_k8s_ws_roundtrip"
path = "src/bin/k8s_ws_roundtrip.rs"
[[bin]]
name = "pubsub_k8s_manual_ws_roundtrip"
path = "src/bin/k8s_manual_ws_roundtrip.rs"
[dependencies]
anyhow = "1.0"
pubsub-node = { path = "../pubsub-node" }

examples/queue/Dockerfile Normal file
View File

@ -0,0 +1,24 @@
FROM rustlang/rust:nightly-bookworm AS builder
WORKDIR /build
COPY Cargo.toml Cargo.lock ./
COPY cfgsync/ ./cfgsync/
COPY examples/ ./examples/
COPY testing-framework/ ./testing-framework/
RUN cargo build --release -p queue-node
FROM debian:bookworm-slim
RUN apt-get update && \
apt-get install -y ca-certificates && \
rm -rf /var/lib/apt/lists/*
COPY --from=builder /build/target/release/queue-node /usr/local/bin/queue-node
RUN mkdir -p /etc/queue
WORKDIR /app
ENTRYPOINT ["/usr/local/bin/queue-node"]
CMD ["--config", "/etc/queue/config.yaml"]

examples/queue/README.md Normal file
View File

@ -0,0 +1,47 @@
# Queue Example
This example runs a small replicated FIFO queue.
The scenarios enqueue messages, dequeue them again, and check that queue state
either converges or drains as expected.
## How TF runs this
Each example follows the same pattern:
- TF starts a small deployment of queue nodes
- a workload produces messages, or produces and consumes them
- an expectation checks either that queue state converges or that the queue drains
## Scenarios
- `basic_convergence` produces messages and checks that queue state converges locally
- `basic_roundtrip` produces and consumes messages locally until the queue drains
- `basic_restart_chaos` injects controlled, rotating local node restarts during the run
- `compose_convergence` and `compose_roundtrip` run the same checks in Docker Compose
## API
Each node exposes the following endpoints; a `curl` sketch follows the list:
- `POST /queue/enqueue` to add a message
- `POST /queue/dequeue` to remove a message
- `GET /queue/state` to inspect the current queue state
- `GET /internal/snapshot` to read the local replicated state
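A minimal manual smoke test against one node, sketched with `curl`; the port and the JSON body shape are illustrative assumptions, not taken from the node's request types:
```bash
# Enqueue one message (the JSON field name is an assumption).
curl -s -X POST http://127.0.0.1:8080/queue/enqueue \
  -H 'content-type: application/json' \
  -d '{"payload": "demo-0"}'
# Dequeue the oldest message.
curl -s -X POST http://127.0.0.1:8080/queue/dequeue
# Inspect the queue state and the local replicated snapshot.
curl -s http://127.0.0.1:8080/queue/state
curl -s http://127.0.0.1:8080/internal/snapshot
```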
## Run locally
```bash
cargo run -p queue-examples --bin queue_basic_convergence
cargo run -p queue-examples --bin queue_basic_roundtrip
cargo run -p queue-examples --bin queue_basic_restart_chaos
```
## Run with Docker Compose
```bash
cargo run -p queue-examples --bin queue_compose_convergence
cargo run -p queue-examples --bin queue_compose_roundtrip
```
Set `QUEUE_IMAGE` to override the default compose image tag.
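For example, to build a local image and run the compose convergence scenario against it (the `queue-node:local` tag is just an illustration):
```bash
docker build -t queue-node:local -f examples/queue/Dockerfile .
QUEUE_IMAGE=queue-node:local cargo run -p queue-examples --bin queue_compose_convergence
```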

View File

@ -0,0 +1,36 @@
[package]
edition.workspace = true
license.workspace = true
name = "queue-examples"
version.workspace = true
[[bin]]
name = "queue_basic_convergence"
path = "src/bin/basic_convergence.rs"
[[bin]]
name = "queue_basic_restart_chaos"
path = "src/bin/basic_restart_chaos.rs"
[[bin]]
name = "queue_basic_roundtrip"
path = "src/bin/basic_roundtrip.rs"
[[bin]]
name = "queue_compose_convergence"
path = "src/bin/compose_convergence.rs"
[[bin]]
name = "queue_compose_roundtrip"
path = "src/bin/compose_roundtrip.rs"
[dependencies]
anyhow = "1.0"
async-trait = { workspace = true }
queue-runtime-ext = { path = "../testing/integration" }
queue-runtime-workloads = { path = "../testing/workloads" }
testing-framework-core = { workspace = true }
testing-framework-runner-compose = { workspace = true }
tokio = { workspace = true, features = ["full"] }
tracing = { workspace = true }
tracing-subscriber = { version = "0.3", features = ["env-filter"] }

View File

@ -0,0 +1,32 @@
use std::time::Duration;
use queue_runtime_ext::QueueLocalDeployer;
use queue_runtime_workloads::{
QueueBuilderExt, QueueConverges, QueueProduceWorkload, QueueScenarioBuilder, QueueTopology,
};
use testing_framework_core::scenario::Deployer;
#[tokio::main]
async fn main() -> anyhow::Result<()> {
tracing_subscriber::fmt()
.with_env_filter(tracing_subscriber::EnvFilter::from_default_env())
.init();
let operations = 300;
let mut scenario = QueueScenarioBuilder::deployment_with(|_| QueueTopology::new(3))
.with_run_duration(Duration::from_secs(30))
.with_workload(
QueueProduceWorkload::new()
.operations(operations)
.rate_per_sec(30)
.payload_prefix("demo"),
)
.with_expectation(QueueConverges::new(operations).timeout(Duration::from_secs(25)))
.build()?;
let deployer = QueueLocalDeployer::default();
let runner = deployer.deploy(&scenario).await?;
runner.run(&mut scenario).await?;
Ok(())
}

View File

@ -0,0 +1,84 @@
use std::time::Duration;
use async_trait::async_trait;
use queue_runtime_ext::QueueLocalDeployer;
use queue_runtime_workloads::{
QueueBuilderExt, QueueConverges, QueueProduceWorkload, QueueScenarioBuilder, QueueTopology,
};
use testing_framework_core::{
scenario::{Deployer, DynError, RunContext, Workload},
topology::DeploymentDescriptor,
};
use tracing::info;
#[derive(Clone)]
struct FixedRestartChaosWorkload {
restarts: usize,
delay: Duration,
}
impl FixedRestartChaosWorkload {
const fn new(restarts: usize, delay: Duration) -> Self {
Self { restarts, delay }
}
}
#[async_trait]
impl Workload<queue_runtime_workloads::QueueEnv> for FixedRestartChaosWorkload {
fn name(&self) -> &str {
"fixed_restart_chaos"
}
async fn start(
&self,
ctx: &RunContext<queue_runtime_workloads::QueueEnv>,
) -> Result<(), DynError> {
let Some(control) = ctx.node_control() else {
return Err("fixed restart chaos requires node control".into());
};
let node_count = ctx.descriptors().node_count();
if node_count == 0 {
return Err("fixed restart chaos requires at least one node".into());
}
for step in 0..self.restarts {
tokio::time::sleep(self.delay).await;
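// Round-robin over nodes 1..node_count so node-0, which the other
// workloads talk to, stays out of the restart rotation.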
let target_index = if node_count > 1 {
(step % (node_count - 1)) + 1
} else {
0
};
let target = format!("node-{target_index}");
info!(step, %target, "triggering controlled chaos restart");
control.restart_node(&target).await?;
}
Ok(())
}
}
#[tokio::main]
async fn main() -> anyhow::Result<()> {
tracing_subscriber::fmt()
.with_env_filter(tracing_subscriber::EnvFilter::from_default_env())
.init();
let mut scenario = QueueScenarioBuilder::deployment_with(|_| QueueTopology::new(3))
.enable_node_control()
.with_workload(FixedRestartChaosWorkload::new(3, Duration::from_secs(8)))
.with_run_duration(Duration::from_secs(30))
.with_workload(
QueueProduceWorkload::new()
.operations(400)
.rate_per_sec(40)
.payload_prefix("queue-chaos"),
)
.with_expectation(QueueConverges::new(200).timeout(Duration::from_secs(30)))
.build()?;
let deployer = QueueLocalDeployer::default();
let runner = deployer.deploy(&scenario).await?;
runner.run(&mut scenario).await?;
Ok(())
}

View File

@ -0,0 +1,31 @@
use std::time::Duration;
use queue_runtime_ext::QueueLocalDeployer;
use queue_runtime_workloads::{
QueueBuilderExt, QueueDrained, QueueRoundTripWorkload, QueueScenarioBuilder, QueueTopology,
};
use testing_framework_core::scenario::Deployer;
#[tokio::main]
async fn main() -> anyhow::Result<()> {
tracing_subscriber::fmt()
.with_env_filter(tracing_subscriber::EnvFilter::from_default_env())
.init();
let operations = 200;
let mut scenario = QueueScenarioBuilder::deployment_with(|_| QueueTopology::new(3))
.with_run_duration(Duration::from_secs(30))
.with_workload(
QueueRoundTripWorkload::new()
.operations(operations)
.rate_per_sec(25),
)
.with_expectation(QueueDrained::new().timeout(Duration::from_secs(25)))
.build()?;
let deployer = QueueLocalDeployer::default();
let runner = deployer.deploy(&scenario).await?;
runner.run(&mut scenario).await?;
Ok(())
}

View File

@ -0,0 +1,47 @@
use std::time::Duration;
use anyhow::{Context as _, Result};
use queue_runtime_workloads::{
QueueBuilderExt, QueueConverges, QueueProduceWorkload, QueueScenarioBuilder, QueueTopology,
};
use testing_framework_core::scenario::Deployer;
use testing_framework_runner_compose::ComposeRunnerError;
use tracing::{info, warn};
#[tokio::main]
async fn main() -> Result<()> {
tracing_subscriber::fmt()
.with_env_filter(tracing_subscriber::EnvFilter::from_default_env())
.init();
let operations = 200;
let mut scenario = QueueScenarioBuilder::deployment_with(|_| QueueTopology::new(3))
.with_run_duration(Duration::from_secs(30))
.with_workload(
QueueProduceWorkload::new()
.operations(operations)
.rate_per_sec(20),
)
.with_expectation(QueueConverges::new(operations).timeout(Duration::from_secs(25)))
.build()?;
let deployer = queue_runtime_ext::QueueComposeDeployer::new();
let runner = match deployer.deploy(&scenario).await {
Ok(runner) => runner,
Err(ComposeRunnerError::DockerUnavailable) => {
warn!("docker unavailable; skipping compose queue run");
return Ok(());
}
Err(error) => {
return Err(anyhow::Error::new(error)).context("deploying queue compose stack");
}
};
info!("running queue compose convergence scenario");
runner
.run(&mut scenario)
.await
.context("running queue compose scenario")?;
Ok(())
}

View File

@ -0,0 +1,48 @@
use std::time::Duration;
use anyhow::{Context as _, Result};
use queue_runtime_workloads::{
QueueBuilderExt, QueueDrained, QueueRoundTripWorkload, QueueScenarioBuilder, QueueTopology,
};
use testing_framework_core::scenario::Deployer;
use testing_framework_runner_compose::ComposeRunnerError;
use tracing::{info, warn};
#[tokio::main]
async fn main() -> Result<()> {
tracing_subscriber::fmt()
.with_env_filter(tracing_subscriber::EnvFilter::from_default_env())
.init();
let operations = 200;
let mut scenario = QueueScenarioBuilder::deployment_with(|_| QueueTopology::new(3))
.with_run_duration(Duration::from_secs(30))
.with_workload(
QueueRoundTripWorkload::new()
.operations(operations)
.rate_per_sec(20),
)
.with_expectation(QueueDrained::new().timeout(Duration::from_secs(25)))
.build()?;
let deployer = queue_runtime_ext::QueueComposeDeployer::new();
let runner = match deployer.deploy(&scenario).await {
Ok(runner) => runner,
Err(ComposeRunnerError::DockerUnavailable) => {
warn!("docker unavailable; skipping compose queue roundtrip run");
return Ok(());
}
Err(error) => {
return Err(anyhow::Error::new(error))
.context("deploying queue compose roundtrip stack");
}
};
info!("running queue compose roundtrip scenario");
runner
.run(&mut scenario)
.await
.context("running queue compose roundtrip scenario")?;
Ok(())
}

View File

@ -0,0 +1,21 @@
[package]
edition.workspace = true
license.workspace = true
name = "queue-node"
version.workspace = true
[[bin]]
name = "queue-node"
path = "src/main.rs"
[dependencies]
anyhow = "1.0"
axum = "0.7"
clap = { version = "4.0", features = ["derive"] }
reqwest = { workspace = true, features = ["json"] }
serde = { workspace = true }
serde_yaml = { workspace = true }
tokio = { workspace = true, features = ["full"] }
tower-http = { version = "0.6", features = ["trace"] }
tracing = { workspace = true }
tracing-subscriber = { version = "0.3", features = ["env-filter"] }

View File

@ -0,0 +1,40 @@
use reqwest::Url;
use serde::Serialize;
#[derive(Clone)]
pub struct QueueHttpClient {
base_url: Url,
client: reqwest::Client,
}
impl QueueHttpClient {
#[must_use]
pub fn new(base_url: Url) -> Self {
Self {
base_url,
client: reqwest::Client::new(),
}
}
pub async fn get<T: serde::de::DeserializeOwned>(&self, path: &str) -> anyhow::Result<T> {
let url = self.base_url.join(path)?;
let response = self.client.get(url).send().await?.error_for_status()?;
Ok(response.json().await?)
}
pub async fn post<B: Serialize, T: serde::de::DeserializeOwned>(
&self,
path: &str,
body: &B,
) -> anyhow::Result<T> {
let url = self.base_url.join(path)?;
let response = self
.client
.post(url)
.json(body)
.send()
.await?
.error_for_status()?;
Ok(response.json().await?)
}
}

View File

@ -0,0 +1,29 @@
use std::{fs, path::Path};
use serde::{Deserialize, Serialize};
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct PeerInfo {
pub node_id: u64,
pub http_address: String,
}
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct QueueConfig {
pub node_id: u64,
pub http_port: u16,
pub peers: Vec<PeerInfo>,
#[serde(default = "default_sync_interval_ms")]
pub sync_interval_ms: u64,
}
impl QueueConfig {
pub fn load(path: &Path) -> anyhow::Result<Self> {
let raw = fs::read_to_string(path)?;
Ok(serde_yaml::from_str(&raw)?)
}
}
const fn default_sync_interval_ms() -> u64 {
1000
}
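#[cfg(test)]
mod tests {
    use super::QueueConfig;

    // Hedged sketch, not shipped in this commit, of the YAML shape the node
    // consumes; `sync_interval_ms` may be omitted and defaults to 1000.
    #[test]
    fn parses_minimal_yaml() {
        let raw = r#"
node_id: 0
http_port: 8080
peers:
  - node_id: 1
    http_address: "127.0.0.1:8081"
"#;
        let config: QueueConfig = serde_yaml::from_str(raw).expect("config should parse");
        assert_eq!(config.node_id, 0);
        assert_eq!(config.peers.len(), 1);
        assert_eq!(config.sync_interval_ms, 1000);
    }
}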

View File

@ -0,0 +1,3 @@
pub mod client;
pub use client::QueueHttpClient;

View File

@ -0,0 +1,36 @@
mod config;
mod server;
mod state;
mod sync;
use std::path::PathBuf;
use clap::Parser;
use tracing_subscriber::{layer::SubscriberExt, util::SubscriberInitExt};
use crate::{config::QueueConfig, state::QueueState, sync::SyncService};
#[derive(Parser, Debug)]
#[command(name = "queue-node")]
struct Args {
#[arg(short, long)]
config: PathBuf,
}
#[tokio::main]
async fn main() -> anyhow::Result<()> {
tracing_subscriber::registry()
.with(
tracing_subscriber::EnvFilter::try_from_default_env()
.unwrap_or_else(|_| "queue_node=info,tower_http=debug".into()),
)
.with(tracing_subscriber::fmt::layer())
.init();
let args = Args::parse();
let config = QueueConfig::load(&args.config)?;
let state = QueueState::new(config.node_id);
SyncService::new(config.clone(), state.clone()).start();
server::start_server(config, state).await
}

View File

@ -0,0 +1,115 @@
use std::net::SocketAddr;
use axum::{
Router,
extract::State,
http::StatusCode,
response::Json,
routing::{get, post},
};
use serde::{Deserialize, Serialize};
use tower_http::trace::TraceLayer;
use crate::{
config::QueueConfig,
state::{QueueMessage, QueueRevision, QueueState, QueueStateView, Snapshot},
};
#[derive(Serialize)]
struct HealthResponse {
status: &'static str,
}
#[derive(Deserialize)]
struct EnqueueRequest {
payload: String,
}
#[derive(Serialize)]
struct EnqueueResponse {
accepted: bool,
id: u64,
queue_len: usize,
revision: QueueRevision,
}
#[derive(Serialize)]
struct DequeueResponse {
message: Option<QueueMessage>,
queue_len: usize,
revision: QueueRevision,
}
pub async fn start_server(config: QueueConfig, state: QueueState) -> anyhow::Result<()> {
let app = Router::new()
.route("/health/live", get(health_live))
.route("/health/ready", get(health_ready))
.route("/queue/enqueue", post(enqueue))
.route("/queue/dequeue", post(dequeue))
.route("/queue/state", get(queue_state))
.route("/internal/snapshot", get(get_snapshot))
.layer(TraceLayer::new_for_http())
.with_state(state.clone());
let addr = SocketAddr::from(([0, 0, 0, 0], config.http_port));
let listener = tokio::net::TcpListener::bind(addr).await?;
state.set_ready(true).await;
tracing::info!(node_id = state.node_id(), %addr, "queue node ready");
axum::serve(listener, app).await?;
Ok(())
}
async fn health_live() -> (StatusCode, Json<HealthResponse>) {
(StatusCode::OK, Json(HealthResponse { status: "alive" }))
}
async fn health_ready(State(state): State<QueueState>) -> (StatusCode, Json<HealthResponse>) {
if state.is_ready().await {
(StatusCode::OK, Json(HealthResponse { status: "ready" }))
} else {
(
StatusCode::SERVICE_UNAVAILABLE,
Json(HealthResponse {
status: "not-ready",
}),
)
}
}
async fn enqueue(
State(state): State<QueueState>,
Json(request): Json<EnqueueRequest>,
) -> (StatusCode, Json<EnqueueResponse>) {
let outcome = state.enqueue_local(request.payload).await;
(
StatusCode::OK,
Json(EnqueueResponse {
accepted: outcome.accepted,
id: outcome.id,
queue_len: outcome.queue_len,
revision: outcome.revision,
}),
)
}
async fn dequeue(State(state): State<QueueState>) -> (StatusCode, Json<DequeueResponse>) {
let outcome = state.dequeue_local().await;
(
StatusCode::OK,
Json(DequeueResponse {
message: outcome.message,
queue_len: outcome.queue_len,
revision: outcome.revision,
}),
)
}
async fn queue_state(State(state): State<QueueState>) -> Json<QueueStateView> {
Json(state.queue_state().await)
}
async fn get_snapshot(State(state): State<QueueState>) -> Json<Snapshot> {
Json(state.snapshot().await)
}

View File

@ -0,0 +1,151 @@
use std::{collections::VecDeque, sync::Arc};
use serde::{Deserialize, Serialize};
use tokio::sync::RwLock;
#[derive(Clone, Copy, Debug, Default, Eq, PartialEq, Serialize, Deserialize)]
pub struct QueueRevision {
pub version: u64,
pub origin: u64,
}
#[derive(Clone, Debug, Eq, PartialEq, Serialize, Deserialize)]
pub struct QueueMessage {
pub id: u64,
pub payload: String,
}
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct Snapshot {
pub node_id: u64,
pub revision: QueueRevision,
pub messages: Vec<QueueMessage>,
}
#[derive(Clone, Debug, Eq, PartialEq, Serialize)]
pub struct QueueStateView {
pub revision: QueueRevision,
pub queue_len: usize,
pub head_id: Option<u64>,
pub tail_id: Option<u64>,
}
#[derive(Clone, Debug)]
pub struct EnqueueOutcome {
pub accepted: bool,
pub id: u64,
pub queue_len: usize,
pub revision: QueueRevision,
}
#[derive(Clone, Debug)]
pub struct DequeueOutcome {
pub message: Option<QueueMessage>,
pub queue_len: usize,
pub revision: QueueRevision,
}
#[derive(Debug, Default)]
struct QueueData {
revision: QueueRevision,
messages: VecDeque<QueueMessage>,
}
#[derive(Clone)]
pub struct QueueState {
node_id: u64,
ready: Arc<RwLock<bool>>,
data: Arc<RwLock<QueueData>>,
}
impl QueueState {
pub fn new(node_id: u64) -> Self {
Self {
node_id,
ready: Arc::new(RwLock::new(false)),
data: Arc::new(RwLock::new(QueueData::default())),
}
}
pub const fn node_id(&self) -> u64 {
self.node_id
}
pub async fn set_ready(&self, value: bool) {
*self.ready.write().await = value;
}
pub async fn is_ready(&self) -> bool {
*self.ready.read().await
}
pub async fn enqueue_local(&self, payload: String) -> EnqueueOutcome {
let mut data = self.data.write().await;
let id = next_message_id(&data.messages);
data.messages.push_back(QueueMessage { id, payload });
bump_revision(&mut data.revision, self.node_id);
EnqueueOutcome {
accepted: true,
id,
queue_len: data.messages.len(),
revision: data.revision,
}
}
pub async fn dequeue_local(&self) -> DequeueOutcome {
let mut data = self.data.write().await;
let message = data.messages.pop_front();
if message.is_some() {
bump_revision(&mut data.revision, self.node_id);
}
DequeueOutcome {
message,
queue_len: data.messages.len(),
revision: data.revision,
}
}
pub async fn queue_state(&self) -> QueueStateView {
let data = self.data.read().await;
QueueStateView {
revision: data.revision,
queue_len: data.messages.len(),
head_id: data.messages.front().map(|message| message.id),
tail_id: data.messages.back().map(|message| message.id),
}
}
pub async fn merge_snapshot(&self, snapshot: Snapshot) {
let mut data = self.data.write().await;
if is_newer_revision(snapshot.revision, data.revision) {
data.revision = snapshot.revision;
data.messages = snapshot.messages.into();
}
}
pub async fn snapshot(&self) -> Snapshot {
let data = self.data.read().await;
Snapshot {
node_id: self.node_id,
revision: data.revision,
messages: data.messages.iter().cloned().collect(),
}
}
}
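/// Derives the next id from the current tail, so ids restart at 1 once the
/// queue fully drains; uniqueness therefore only holds within a single
/// produce/consume pass, which is all the demo workloads rely on.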
fn next_message_id(messages: &VecDeque<QueueMessage>) -> u64 {
messages
.back()
.map_or(1, |message| message.id.saturating_add(1))
}
fn bump_revision(revision: &mut QueueRevision, node_id: u64) {
revision.version = revision.version.saturating_add(1);
revision.origin = node_id;
}
fn is_newer_revision(candidate: QueueRevision, existing: QueueRevision) -> bool {
(candidate.version, candidate.origin) > (existing.version, existing.origin)
}
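#[cfg(test)]
mod revision_tests {
    use super::{QueueRevision, is_newer_revision};

    // Hedged illustration, not in the original commit, of the merge rule:
    // a higher `version` wins, and `origin` breaks ties so two nodes that
    // bump to the same version still converge on one deterministic winner.
    #[test]
    fn version_then_origin_orders_revisions() {
        let local = QueueRevision { version: 3, origin: 2 };
        let remote = QueueRevision { version: 4, origin: 1 };
        assert!(is_newer_revision(remote, local));

        let a = QueueRevision { version: 4, origin: 1 };
        let b = QueueRevision { version: 4, origin: 2 };
        assert!(is_newer_revision(b, a));
        assert!(!is_newer_revision(a, b));
    }
}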

View File

@ -0,0 +1,103 @@
use std::{collections::HashMap, sync::Arc, time::Duration};
use reqwest::Client;
use tokio::sync::Mutex;
use tracing::{debug, warn};
use crate::{
config::QueueConfig,
state::{QueueState, Snapshot},
};
const WARN_AFTER_CONSECUTIVE_FAILURES: u32 = 5;
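/// Pull-based anti-entropy: on a fixed interval each node fetches its peers'
/// `/internal/snapshot` and, through `QueueState::merge_snapshot`, adopts any
/// snapshot that carries a newer revision wholesale.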
#[derive(Clone)]
pub struct SyncService {
config: Arc<QueueConfig>,
state: QueueState,
client: Client,
failures_by_peer: Arc<Mutex<HashMap<String, u32>>>,
}
impl SyncService {
pub fn new(config: QueueConfig, state: QueueState) -> Self {
Self {
config: Arc::new(config),
state,
client: Client::new(),
failures_by_peer: Arc::new(Mutex::new(HashMap::new())),
}
}
pub fn start(&self) {
let service = self.clone();
tokio::spawn(async move {
service.run().await;
});
}
async fn run(self) {
let interval = Duration::from_millis(self.config.sync_interval_ms.max(100));
loop {
self.sync_once().await;
tokio::time::sleep(interval).await;
}
}
async fn sync_once(&self) {
for peer in &self.config.peers {
match self.fetch_snapshot(&peer.http_address).await {
Ok(snapshot) => {
self.state.merge_snapshot(snapshot).await;
self.clear_failure_counter(&peer.http_address).await;
}
Err(error) => {
self.record_sync_failure(&peer.http_address, &error).await;
}
}
}
}
async fn fetch_snapshot(&self, peer_address: &str) -> anyhow::Result<Snapshot> {
let url = format!("http://{peer_address}/internal/snapshot");
let snapshot = self
.client
.get(url)
.send()
.await?
.error_for_status()?
.json()
.await?;
Ok(snapshot)
}
async fn clear_failure_counter(&self, peer_address: &str) {
let mut failures = self.failures_by_peer.lock().await;
failures.remove(peer_address);
}
async fn record_sync_failure(&self, peer_address: &str, error: &anyhow::Error) {
let consecutive_failures = {
let mut failures = self.failures_by_peer.lock().await;
let entry = failures.entry(peer_address.to_owned()).or_insert(0);
*entry += 1;
*entry
};
if consecutive_failures >= WARN_AFTER_CONSECUTIVE_FAILURES {
warn!(
peer = %peer_address,
%error,
consecutive_failures,
"queue sync repeatedly failing"
);
} else {
debug!(
peer = %peer_address,
%error,
consecutive_failures,
"queue sync failed"
);
}
}
}

View File

@ -0,0 +1,13 @@
[package]
edition.workspace = true
license.workspace = true
name = "queue-runtime-ext"
version.workspace = true
[dependencies]
async-trait = { workspace = true }
queue-node = { path = "../../queue-node" }
serde = { workspace = true }
testing-framework-core = { workspace = true }
testing-framework-runner-compose = { workspace = true }
testing-framework-runner-local = { workspace = true }

View File

@ -0,0 +1,75 @@
use std::io::Error;
use async_trait::async_trait;
use queue_node::QueueHttpClient;
use serde::{Deserialize, Serialize};
use testing_framework_core::scenario::{
Application, ClusterNodeConfigApplication, ClusterNodeView, ClusterPeerView, DynError,
NodeAccess, serialize_cluster_yaml_config,
};
pub type QueueTopology = testing_framework_core::topology::ClusterTopology;
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct QueuePeerInfo {
pub node_id: u64,
pub http_address: String,
}
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct QueueNodeConfig {
pub node_id: u64,
pub http_port: u16,
pub peers: Vec<QueuePeerInfo>,
pub sync_interval_ms: u64,
}
pub struct QueueEnv;
#[async_trait]
impl Application for QueueEnv {
type Deployment = QueueTopology;
type NodeClient = QueueHttpClient;
type NodeConfig = QueueNodeConfig;
fn build_node_client(access: &NodeAccess) -> Result<Self::NodeClient, DynError> {
Ok(QueueHttpClient::new(access.api_base_url()?))
}
fn node_readiness_path() -> &'static str {
"/health/ready"
}
}
impl ClusterNodeConfigApplication for QueueEnv {
type ConfigError = Error;
fn static_network_port() -> u16 {
8080
}
fn build_cluster_node_config(
node: &ClusterNodeView,
peers: &[ClusterPeerView],
) -> Result<Self::NodeConfig, Self::ConfigError> {
let peers = peers
.iter()
.map(|peer| QueuePeerInfo {
node_id: peer.index() as u64,
http_address: peer.authority(),
})
.collect::<Vec<_>>();
Ok(QueueNodeConfig {
node_id: node.index() as u64,
http_port: node.network_port(),
peers,
sync_interval_ms: 500,
})
}
fn serialize_cluster_node_config(
config: &Self::NodeConfig,
) -> Result<String, Self::ConfigError> {
serialize_cluster_yaml_config(config).map_err(Error::other)
}
}

View File

@ -0,0 +1,15 @@
use testing_framework_runner_compose::{BinaryConfigNodeSpec, ComposeBinaryApp};
use crate::QueueEnv;
const NODE_CONFIG_PATH: &str = "/etc/queue/config.yaml";
impl ComposeBinaryApp for QueueEnv {
fn compose_node_spec() -> BinaryConfigNodeSpec {
BinaryConfigNodeSpec::conventional(
"/usr/local/bin/queue-node",
NODE_CONFIG_PATH,
vec![8080, 8081],
)
}
}

View File

@ -0,0 +1,10 @@
mod app;
mod compose_env;
mod local_env;
pub mod scenario;
pub use app::*;
pub use scenario::{QueueBuilderExt, QueueScenarioBuilder};
pub type QueueLocalDeployer = testing_framework_runner_local::ProcessDeployer<QueueEnv>;
pub type QueueComposeDeployer = testing_framework_runner_compose::ComposeDeployer<QueueEnv>;

View File

@ -0,0 +1,41 @@
use std::collections::HashMap;
use testing_framework_core::scenario::{DynError, StartNodeOptions};
use testing_framework_runner_local::{
LocalBinaryApp, LocalNodePorts, LocalPeerNode, LocalProcessSpec,
build_local_cluster_node_config, yaml_node_config,
};
use crate::{QueueEnv, QueueNodeConfig};
impl LocalBinaryApp for QueueEnv {
fn initial_node_name_prefix() -> &'static str {
"queue-node"
}
fn build_local_node_config_with_peers(
_topology: &Self::Deployment,
index: usize,
ports: &LocalNodePorts,
peers: &[LocalPeerNode],
_peer_ports_by_name: &HashMap<String, u16>,
_options: &StartNodeOptions<Self>,
_template_config: Option<
&<Self as testing_framework_core::scenario::Application>::NodeConfig,
>,
) -> Result<<Self as testing_framework_core::scenario::Application>::NodeConfig, DynError> {
build_local_cluster_node_config::<Self>(index, ports, peers)
}
fn local_process_spec() -> LocalProcessSpec {
LocalProcessSpec::new("QUEUE_NODE_BIN", "queue-node").with_rust_log("queue_node=info")
}
fn render_local_config(config: &QueueNodeConfig) -> Result<Vec<u8>, DynError> {
yaml_node_config(config)
}
fn http_api_port(config: &QueueNodeConfig) -> u16 {
config.http_port
}
}

View File

@ -0,0 +1,15 @@
use testing_framework_core::scenario::ScenarioBuilder;
use crate::{QueueEnv, QueueTopology};
pub type QueueScenarioBuilder = ScenarioBuilder<QueueEnv>;
pub trait QueueBuilderExt: Sized {
fn deployment_with(f: impl FnOnce(QueueTopology) -> QueueTopology) -> Self;
}
impl QueueBuilderExt for QueueScenarioBuilder {
fn deployment_with(f: impl FnOnce(QueueTopology) -> QueueTopology) -> Self {
QueueScenarioBuilder::with_deployment(f(QueueTopology::new(3)))
}
}

View File

@ -0,0 +1,14 @@
[package]
edition.workspace = true
license.workspace = true
name = "queue-runtime-workloads"
version.workspace = true
[dependencies]
async-trait = { workspace = true }
queue-node = { path = "../../queue-node" }
queue-runtime-ext = { path = "../integration" }
serde = { workspace = true }
testing-framework-core = { workspace = true }
tokio = { workspace = true, features = ["full"] }
tracing = { workspace = true }

View File

@ -0,0 +1,104 @@
use std::time::Duration;
use async_trait::async_trait;
use queue_runtime_ext::QueueEnv;
use serde::Deserialize;
use testing_framework_core::scenario::{DynError, Expectation, RunContext};
use tracing::info;
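/// Passes once every node reports an identical, fully drained `/queue/state`
/// view within the timeout.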
#[derive(Clone)]
pub struct QueueDrained {
timeout: Duration,
poll_interval: Duration,
}
#[derive(Clone, Debug, Deserialize, Eq, PartialEq)]
struct QueueRevision {
version: u64,
origin: u64,
}
#[derive(Clone, Debug, Deserialize, Eq, PartialEq)]
struct QueueStateResponse {
revision: QueueRevision,
queue_len: usize,
head_id: Option<u64>,
tail_id: Option<u64>,
}
impl QueueDrained {
#[must_use]
pub fn new() -> Self {
Self {
timeout: Duration::from_secs(20),
poll_interval: Duration::from_millis(500),
}
}
#[must_use]
pub const fn timeout(mut self, timeout: Duration) -> Self {
self.timeout = timeout;
self
}
}
impl Default for QueueDrained {
fn default() -> Self {
Self::new()
}
}
#[async_trait]
impl Expectation<QueueEnv> for QueueDrained {
fn name(&self) -> &str {
"queue_drained"
}
async fn evaluate(&mut self, ctx: &RunContext<QueueEnv>) -> Result<(), DynError> {
let clients = ctx.node_clients().snapshot();
if clients.is_empty() {
return Err("no queue node clients available".into());
}
let deadline = tokio::time::Instant::now() + self.timeout;
while tokio::time::Instant::now() < deadline {
if is_drained_and_converged(&clients).await? {
info!("queue drained and converged");
return Ok(());
}
tokio::time::sleep(self.poll_interval).await;
}
Err(format!("queue not drained within {:?}", self.timeout).into())
}
}
async fn is_drained_and_converged(
clients: &[queue_node::QueueHttpClient],
) -> Result<bool, DynError> {
let Some((first, rest)) = clients.split_first() else {
return Ok(false);
};
let baseline = read_state(first).await?;
if !is_drained(&baseline) {
return Ok(false);
}
for client in rest {
let current = read_state(client).await?;
if current != baseline {
return Ok(false);
}
}
Ok(true)
}
fn is_drained(state: &QueueStateResponse) -> bool {
state.queue_len == 0 && state.head_id.is_none() && state.tail_id.is_none()
}
async fn read_state(client: &queue_node::QueueHttpClient) -> Result<QueueStateResponse, DynError> {
Ok(client.get("/queue/state").await?)
}

View File

@ -0,0 +1,106 @@
use std::time::Duration;
use async_trait::async_trait;
use queue_runtime_ext::QueueEnv;
use serde::Deserialize;
use testing_framework_core::scenario::{DynError, Expectation, RunContext};
use tracing::info;
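/// Passes once every node reports the same `/queue/state` view holding at
/// least `min_queue_len` messages.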
#[derive(Clone)]
pub struct QueueConverges {
min_queue_len: usize,
timeout: Duration,
poll_interval: Duration,
}
#[derive(Clone, Debug, Deserialize, Eq, PartialEq)]
struct QueueRevision {
version: u64,
origin: u64,
}
#[derive(Clone, Debug, Deserialize, Eq, PartialEq)]
struct QueueStateResponse {
revision: QueueRevision,
queue_len: usize,
head_id: Option<u64>,
tail_id: Option<u64>,
}
impl QueueConverges {
#[must_use]
pub fn new(min_queue_len: usize) -> Self {
Self {
min_queue_len,
timeout: Duration::from_secs(20),
poll_interval: Duration::from_millis(500),
}
}
#[must_use]
pub const fn timeout(mut self, timeout: Duration) -> Self {
self.timeout = timeout;
self
}
}
#[async_trait]
impl Expectation<QueueEnv> for QueueConverges {
fn name(&self) -> &str {
"queue_converges"
}
async fn evaluate(&mut self, ctx: &RunContext<QueueEnv>) -> Result<(), DynError> {
let clients = ctx.node_clients().snapshot();
if clients.is_empty() {
return Err("no queue node clients available".into());
}
let deadline = tokio::time::Instant::now() + self.timeout;
while tokio::time::Instant::now() < deadline {
if self.is_converged(&clients).await? {
info!(
min_queue_len = self.min_queue_len,
"queue convergence reached"
);
return Ok(());
}
tokio::time::sleep(self.poll_interval).await;
}
Err(format!(
"queue convergence not reached within {:?} (min_queue_len={})",
self.timeout, self.min_queue_len
)
.into())
}
}
impl QueueConverges {
async fn is_converged(
&self,
clients: &[queue_node::QueueHttpClient],
) -> Result<bool, DynError> {
let Some((first, rest)) = clients.split_first() else {
return Ok(false);
};
let baseline = read_state(first).await?;
if baseline.queue_len < self.min_queue_len {
return Ok(false);
}
for client in rest {
let current = read_state(client).await?;
if current != baseline {
return Ok(false);
}
}
Ok(true)
}
}
async fn read_state(client: &queue_node::QueueHttpClient) -> Result<QueueStateResponse, DynError> {
Ok(client.get("/queue/state").await?)
}

View File

@ -0,0 +1,10 @@
mod drained;
mod expectations;
mod produce;
mod roundtrip;
pub use drained::QueueDrained;
pub use expectations::QueueConverges;
pub use produce::QueueProduceWorkload;
pub use queue_runtime_ext::{QueueBuilderExt, QueueEnv, QueueScenarioBuilder, QueueTopology};
pub use roundtrip::QueueRoundTripWorkload;

View File

@ -0,0 +1,116 @@
use std::time::Duration;
use async_trait::async_trait;
use queue_runtime_ext::QueueEnv;
use serde::{Deserialize, Serialize};
use testing_framework_core::scenario::{DynError, RunContext, Workload};
use tracing::info;
#[derive(Clone)]
pub struct QueueProduceWorkload {
operations: usize,
rate_per_sec: Option<usize>,
payload_prefix: String,
}
#[derive(Serialize)]
struct EnqueueRequest {
payload: String,
}
#[derive(Deserialize)]
struct EnqueueResponse {
accepted: bool,
id: u64,
queue_len: usize,
}
impl QueueProduceWorkload {
#[must_use]
pub fn new() -> Self {
Self {
operations: 200,
rate_per_sec: Some(25),
payload_prefix: "queue-demo".to_owned(),
}
}
#[must_use]
pub const fn operations(mut self, value: usize) -> Self {
self.operations = value;
self
}
#[must_use]
pub const fn rate_per_sec(mut self, value: usize) -> Self {
self.rate_per_sec = Some(value);
self
}
#[must_use]
pub fn payload_prefix(mut self, value: impl Into<String>) -> Self {
self.payload_prefix = value.into();
self
}
}
impl Default for QueueProduceWorkload {
fn default() -> Self {
Self::new()
}
}
#[async_trait]
impl Workload<QueueEnv> for QueueProduceWorkload {
fn name(&self) -> &str {
"queue_produce_workload"
}
async fn start(&self, ctx: &RunContext<QueueEnv>) -> Result<(), DynError> {
let clients = ctx.node_clients().snapshot();
let Some(producer) = clients.first() else {
return Err("no queue node clients available".into());
};
let interval = self.rate_per_sec.and_then(compute_interval);
info!(
operations = self.operations,
rate_per_sec = ?self.rate_per_sec,
"starting queue produce workload"
);
for idx in 0..self.operations {
let payload = format!("{}-{idx}", self.payload_prefix);
let response: EnqueueResponse = producer
.post("/queue/enqueue", &EnqueueRequest { payload })
.await?;
if !response.accepted {
return Err(format!("node rejected enqueue at operation {idx}").into());
}
if (idx + 1) % 25 == 0 {
info!(
completed = idx + 1,
last_id = response.id,
queue_len = response.queue_len,
"queue produce progress"
);
}
if let Some(delay) = interval {
tokio::time::sleep(delay).await;
}
}
Ok(())
}
}
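// Maps a target ops/sec onto a per-operation delay; a rate of 0 disables
// pacing (the workload enqueues as fast as responses return).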
fn compute_interval(rate_per_sec: usize) -> Option<Duration> {
if rate_per_sec == 0 {
return None;
}
Some(Duration::from_millis((1000 / rate_per_sec as u64).max(1)))
}

View File

@ -0,0 +1,179 @@
use std::{collections::HashSet, time::Duration};
use async_trait::async_trait;
use queue_runtime_ext::QueueEnv;
use serde::{Deserialize, Serialize};
use testing_framework_core::scenario::{DynError, RunContext, Workload};
use tokio::time::{Instant, sleep};
use tracing::info;
#[derive(Clone)]
pub struct QueueRoundTripWorkload {
operations: usize,
rate_per_sec: Option<usize>,
payload_prefix: String,
drain_timeout: Duration,
empty_poll_interval: Duration,
}
#[derive(Serialize)]
struct EnqueueRequest {
payload: String,
}
#[derive(Deserialize)]
struct EnqueueResponse {
accepted: bool,
id: u64,
}
#[derive(Serialize)]
struct DequeueRequest {}
#[derive(Deserialize)]
struct QueueMessage {
id: u64,
payload: String,
}
#[derive(Deserialize)]
struct DequeueResponse {
message: Option<QueueMessage>,
}
impl QueueRoundTripWorkload {
#[must_use]
pub fn new() -> Self {
Self {
operations: 200,
rate_per_sec: Some(25),
payload_prefix: "queue-roundtrip".to_owned(),
drain_timeout: Duration::from_secs(20),
empty_poll_interval: Duration::from_millis(100),
}
}
#[must_use]
pub const fn operations(mut self, value: usize) -> Self {
self.operations = value;
self
}
#[must_use]
pub const fn rate_per_sec(mut self, value: usize) -> Self {
self.rate_per_sec = Some(value);
self
}
#[must_use]
pub fn payload_prefix(mut self, value: impl Into<String>) -> Self {
self.payload_prefix = value.into();
self
}
#[must_use]
pub const fn drain_timeout(mut self, value: Duration) -> Self {
self.drain_timeout = value;
self
}
}
impl Default for QueueRoundTripWorkload {
fn default() -> Self {
Self::new()
}
}
#[async_trait]
impl Workload<QueueEnv> for QueueRoundTripWorkload {
fn name(&self) -> &str {
"queue_roundtrip_workload"
}
async fn start(&self, ctx: &RunContext<QueueEnv>) -> Result<(), DynError> {
let clients = ctx.node_clients().snapshot();
let Some(driver) = clients.first() else {
return Err("no queue node clients available".into());
};
let interval = self.rate_per_sec.and_then(compute_interval);
let mut produced_ids = HashSet::with_capacity(self.operations);
info!(
operations = self.operations,
"queue roundtrip: produce phase"
);
for idx in 0..self.operations {
let payload = format!("{}-{idx}", self.payload_prefix);
let response: EnqueueResponse = driver
.post("/queue/enqueue", &EnqueueRequest { payload })
.await?;
if !response.accepted {
return Err(format!("enqueue rejected at operation {idx}").into());
}
if !produced_ids.insert(response.id) {
return Err(format!("duplicate enqueue id observed: {}", response.id).into());
}
if let Some(delay) = interval {
sleep(delay).await;
}
}
info!(
operations = self.operations,
"queue roundtrip: consume phase"
);
let mut consumed = 0usize;
let deadline = Instant::now() + self.drain_timeout;
while consumed < self.operations && Instant::now() < deadline {
let response: DequeueResponse =
driver.post("/queue/dequeue", &DequeueRequest {}).await?;
match response.message {
Some(message) => {
if !message.payload.starts_with(&self.payload_prefix) {
return Err(format!("unexpected payload: {}", message.payload).into());
}
if !produced_ids.remove(&message.id) {
return Err(
format!("unknown or duplicate dequeue id: {}", message.id).into()
);
}
consumed += 1;
}
None => sleep(self.empty_poll_interval).await,
}
}
if consumed != self.operations {
return Err(format!(
"queue roundtrip timed out: consumed {consumed}/{} messages",
self.operations
)
.into());
}
if !produced_ids.is_empty() {
return Err(format!(
"queue roundtrip ended with {} undrained produced ids",
produced_ids.len()
)
.into());
}
info!(operations = self.operations, "queue roundtrip finished");
Ok(())
}
}
fn compute_interval(rate_per_sec: usize) -> Option<Duration> {
if rate_per_sec == 0 {
return None;
}
Some(Duration::from_millis((1000 / rate_per_sec as u64).max(1)))
}

View File

@ -25,11 +25,11 @@ Each example follows the same pattern:
## Run with Docker Compose
```bash
cargo run -p redis-streams-examples --bin compose_roundtrip
cargo run -p redis-streams-examples --bin redis_streams_compose_roundtrip
```
## Run the reclaim scenario
```bash
cargo run -p redis-streams-examples --bin compose_failover
cargo run -p redis-streams-examples --bin redis_streams_compose_failover
```

View File

@ -4,6 +4,14 @@ license.workspace = true
name = "redis-streams-examples"
version.workspace = true
[[bin]]
name = "redis_streams_compose_roundtrip"
path = "src/bin/compose_roundtrip.rs"
[[bin]]
name = "redis_streams_compose_failover"
path = "src/bin/compose_failover.rs"
[dependencies]
anyhow = "1.0"
redis-streams-runtime-ext = { path = "../testing/integration" }

View File

@ -29,5 +29,5 @@ reqwest = { features = ["json"], workspace = true }
serde = { workspace = true }
serde_yaml = { workspace = true }
thiserror = { workspace = true }
tokio = { features = ["macros", "process", "rt-multi-thread", "time"], workspace = true }
tokio = { features = ["macros", "process", "rt-multi-thread", "sync", "time"], workspace = true }
tracing = { workspace = true }

View File

@ -1,5 +1,6 @@
pub mod cfgsync;
pub mod env;
pub mod observation;
pub mod runtime;
pub mod scenario;
pub mod topology;

View File

@ -0,0 +1,161 @@
use std::{marker::PhantomData, sync::Arc};
use async_trait::async_trait;
use super::{
ObservationConfig, ObservationHandle, ObservationRuntime, ObservedSource, Observer,
SourceProvider,
};
use crate::scenario::{
Application, DynError, NodeClients, PreparedRuntimeExtension, RuntimeExtensionFactory,
};
/// Boxed source provider used by observation factories.
pub type BoxedSourceProvider<S> = Box<dyn SourceProvider<S>>;
/// Builds an observation source provider once node clients are available.
pub trait SourceProviderFactory<E: Application, S>: Send + Sync + 'static {
/// Builds the source provider for one scenario run.
fn build_source_provider(
&self,
deployment: &E::Deployment,
node_clients: NodeClients<E>,
) -> Result<BoxedSourceProvider<S>, DynError>;
}
impl<E, S, F> SourceProviderFactory<E, S> for F
where
E: Application,
S: Clone + Send + Sync + 'static,
F: Fn(&E::Deployment, NodeClients<E>) -> Result<BoxedSourceProvider<S>, DynError>
+ Send
+ Sync
+ 'static,
{
fn build_source_provider(
&self,
deployment: &E::Deployment,
node_clients: NodeClients<E>,
) -> Result<BoxedSourceProvider<S>, DynError> {
self(deployment, node_clients)
}
}
/// Fixed source provider for scenario runs with a stable source set.
#[derive(Clone, Debug)]
pub struct StaticSourceProvider<S> {
sources: Vec<ObservedSource<S>>,
}
impl<S> StaticSourceProvider<S> {
/// Builds a provider from a fixed source list.
#[must_use]
pub fn new(sources: Vec<ObservedSource<S>>) -> Self {
Self { sources }
}
}
#[async_trait]
impl<S> SourceProvider<S> for StaticSourceProvider<S>
where
S: Clone + Send + Sync + 'static,
{
async fn sources(&self) -> Result<Vec<ObservedSource<S>>, DynError> {
Ok(self.sources.clone())
}
}
/// Runtime extension factory that starts one observer and stores its handle in
/// `RunContext`.
pub struct ObservationExtensionFactory<E: Application, O: Observer> {
observer_builder: Arc<dyn Fn() -> O + Send + Sync>,
source_provider_factory: Arc<dyn SourceProviderFactory<E, O::Source>>,
config: ObservationConfig,
env_marker: PhantomData<E>,
}
impl<E: Application, O: Observer> ObservationExtensionFactory<E, O> {
/// Builds an observation extension factory from builders.
#[must_use]
pub fn from_parts(
observer_builder: impl Fn() -> O + Send + Sync + 'static,
source_provider_factory: impl SourceProviderFactory<E, O::Source>,
config: ObservationConfig,
) -> Self {
Self {
observer_builder: Arc::new(observer_builder),
source_provider_factory: Arc::new(source_provider_factory),
config,
env_marker: PhantomData,
}
}
}
impl<E, O> ObservationExtensionFactory<E, O>
where
E: Application,
O: Observer + Clone,
{
/// Builds an observation extension factory from one clonable observer.
#[must_use]
pub fn new(
observer: O,
source_provider_factory: impl SourceProviderFactory<E, O::Source>,
config: ObservationConfig,
) -> Self {
Self::from_parts(move || observer.clone(), source_provider_factory, config)
}
}
#[async_trait]
impl<E, O> RuntimeExtensionFactory<E> for ObservationExtensionFactory<E, O>
where
E: Application,
O: Observer,
{
async fn prepare(
&self,
deployment: &E::Deployment,
node_clients: NodeClients<E>,
) -> Result<PreparedRuntimeExtension, DynError> {
let source_provider = self
.source_provider_factory
.build_source_provider(deployment, node_clients)?;
let observer = (self.observer_builder)();
let runtime =
ObservationRuntime::start(source_provider, observer, self.config.clone()).await?;
let (handle, task) = runtime.into_parts();
Ok(PreparedRuntimeExtension::from_task(handle, task))
}
}
#[async_trait]
impl<S, P> SourceProvider<S> for Box<P>
where
S: Clone + Send + Sync + 'static,
P: SourceProvider<S> + ?Sized,
{
async fn sources(&self) -> Result<Vec<ObservedSource<S>>, DynError> {
(**self).sources().await
}
}
#[async_trait]
impl<S, P> SourceProvider<S> for Arc<P>
where
S: Clone + Send + Sync + 'static,
P: SourceProvider<S> + ?Sized,
{
async fn sources(&self) -> Result<Vec<ObservedSource<S>>, DynError> {
(**self).sources().await
}
}
impl<O: Observer> From<ObservationHandle<O>> for PreparedRuntimeExtension {
fn from(handle: ObservationHandle<O>) -> Self {
PreparedRuntimeExtension::new(handle)
}
}
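#[cfg(test)]
mod static_provider_tests {
    use super::{ObservedSource, SourceProvider, StaticSourceProvider};

    // Hedged sketch, not part of the original commit: a static provider
    // hands back the same fixed source set on every refresh cycle.
    #[tokio::test]
    async fn returns_fixed_sources() {
        let provider = StaticSourceProvider::new(vec![ObservedSource::new("node-0", 7u64)]);
        let sources = provider.sources().await.expect("static sources");
        assert_eq!(sources.len(), 1);
        assert_eq!(sources[0].name, "node-0");
        assert_eq!(sources[0].source, 7);
    }
}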

View File

@ -0,0 +1,503 @@
//! Generic continuous observation runtime.
//!
//! This module provides the reusable runtime needed by both TF scenarios and
//! manual-cluster consumers such as Cucumber worlds. It does not know any app
//! semantics. Apps provide source types, observation logic, materialized state,
//! snapshots, and delta events.
mod factory;
use std::{
any::type_name,
collections::VecDeque,
sync::Arc,
time::{Duration, SystemTime},
};
use async_trait::async_trait;
pub use factory::{
BoxedSourceProvider, ObservationExtensionFactory, SourceProviderFactory, StaticSourceProvider,
};
use parking_lot::Mutex;
use tokio::{
sync::broadcast,
task::JoinHandle,
time::{MissedTickBehavior, interval},
};
use tracing::{debug, info, warn};
use crate::scenario::DynError;
/// Configuration for a background observation runtime.
#[derive(Clone, Debug)]
pub struct ObservationConfig {
/// Time between observation cycles.
pub interval: Duration,
/// Maximum number of non-empty event batches retained in memory.
pub history_limit: usize,
}
impl Default for ObservationConfig {
fn default() -> Self {
Self {
interval: Duration::from_secs(1),
history_limit: 64,
}
}
}
/// One named observation source.
#[derive(Clone, Debug)]
pub struct ObservedSource<S> {
/// Human-readable source name used in logs and app-level reporting.
pub name: String,
/// App-owned source handle.
pub source: S,
}
impl<S> ObservedSource<S> {
/// Builds one named observation source.
#[must_use]
pub fn new(name: &str, source: S) -> Self {
Self {
name: name.to_owned(),
source,
}
}
}
/// Supplies the current observation source set.
#[async_trait]
pub trait SourceProvider<S>: Send + Sync + 'static {
/// Returns the current source set for the next observation cycle.
async fn sources(&self) -> Result<Vec<ObservedSource<S>>, DynError>;
}
/// App-owned observation logic.
#[async_trait]
pub trait Observer: Send + Sync + 'static {
/// App-owned source type.
type Source: Clone + Send + Sync + 'static;
/// App-owned retained materialized state.
type State: Send + Sync + 'static;
/// App-owned current snapshot view.
type Snapshot: Clone + Send + Sync + 'static;
/// App-owned delta event type emitted per cycle.
type Event: Clone + Send + Sync + 'static;
/// Builds the initial retained state from the current source set.
async fn init(&self, sources: &[ObservedSource<Self::Source>])
-> Result<Self::State, DynError>;
/// Advances retained state by one cycle and returns any new delta events.
async fn poll(
&self,
sources: &[ObservedSource<Self::Source>],
state: &mut Self::State,
) -> Result<Vec<Self::Event>, DynError>;
/// Builds the current snapshot view from retained state.
fn snapshot(&self, state: &Self::State) -> Self::Snapshot;
}
/// One materialized snapshot emitted by the runtime.
#[derive(Clone, Debug)]
pub struct ObservationSnapshot<S> {
/// Monotonic cycle number.
pub cycle: u64,
/// Capture timestamp.
pub observed_at: SystemTime,
/// Number of sources used for this snapshot.
pub source_count: usize,
/// App-owned snapshot payload.
pub value: S,
}
/// One delta batch emitted by a successful observation cycle.
#[derive(Clone, Debug)]
pub struct ObservationBatch<E> {
/// Monotonic cycle number.
pub cycle: u64,
/// Capture timestamp.
pub observed_at: SystemTime,
/// Number of sources used for this batch.
pub source_count: usize,
/// App-owned delta events discovered in this cycle.
pub events: Vec<E>,
}
/// Observation runtime failure stage.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum ObservationFailureStage {
/// Source refresh failed before a poll could run.
SourceRefresh,
/// Observer poll failed after sources were refreshed.
Poll,
}
/// Last failed observation cycle.
#[derive(Clone, Debug)]
pub struct ObservationFailure {
/// Monotonic cycle number.
pub cycle: u64,
/// Failure timestamp.
pub observed_at: SystemTime,
/// Number of sources involved in the failed cycle.
pub source_count: usize,
/// Runtime stage that failed.
pub stage: ObservationFailureStage,
/// Human-readable failure message.
pub message: String,
}
/// Errors returned while starting an observation runtime.
#[derive(Debug, thiserror::Error)]
pub enum ObservationRuntimeError {
/// The configured interval is invalid.
#[error("observation interval must be greater than zero")]
InvalidInterval,
/// Source discovery failed during runtime startup.
#[error("failed to refresh observation sources during startup: {source}")]
SourceRefresh {
#[source]
source: DynError,
},
/// Observer state initialization failed during runtime startup.
#[error("failed to initialize observation state: {source}")]
ObserverInit {
#[source]
source: DynError,
},
}
/// Read-side handle for one running observer.
pub struct ObservationHandle<O: Observer> {
shared: Arc<Mutex<SharedObservationState<O>>>,
batches: broadcast::Sender<Arc<ObservationBatch<O::Event>>>,
}
impl<O: Observer> Clone for ObservationHandle<O> {
fn clone(&self) -> Self {
Self {
shared: Arc::clone(&self.shared),
batches: self.batches.clone(),
}
}
}
impl<O: Observer> ObservationHandle<O> {
/// Returns the latest successful snapshot, if one has been produced.
#[must_use]
pub fn latest_snapshot(&self) -> Option<ObservationSnapshot<O::Snapshot>> {
self.shared.lock().latest_snapshot.clone()
}
/// Returns retained non-empty event batches.
#[must_use]
pub fn history(&self) -> Vec<Arc<ObservationBatch<O::Event>>> {
self.shared.lock().history.iter().cloned().collect()
}
/// Returns the most recent cycle failure, if any.
#[must_use]
pub fn last_error(&self) -> Option<ObservationFailure> {
self.shared.lock().last_error.clone()
}
/// Subscribes to future non-empty event batches.
#[must_use]
pub fn subscribe(&self) -> broadcast::Receiver<Arc<ObservationBatch<O::Event>>> {
self.batches.subscribe()
}
}
/// Lifecycle owner for one background observation runtime.
pub struct ObservationRuntime<O: Observer> {
handle: ObservationHandle<O>,
task: Option<JoinHandle<()>>,
}
impl<O: Observer> ObservationRuntime<O> {
/// Starts one background observation runtime.
pub async fn start<P>(
provider: P,
observer: O,
config: ObservationConfig,
) -> Result<Self, ObservationRuntimeError>
where
P: SourceProvider<O::Source>,
{
ensure_positive_interval(config.interval)?;
let sources = provider
.sources()
.await
.map_err(|source| ObservationRuntimeError::SourceRefresh { source })?;
let source_count = sources.len();
let state = observer
.init(&sources)
.await
.map_err(|source| ObservationRuntimeError::ObserverInit { source })?;
let snapshot = build_snapshot(0, source_count, &observer, &state);
let batches = broadcast::channel(config.history_limit.max(1)).0;
let shared = Arc::new(Mutex::new(SharedObservationState::new(snapshot)));
let handle = ObservationHandle {
shared: Arc::clone(&shared),
batches,
};
info!(
observer = type_name::<O>(),
interval_ms = config.interval.as_millis(),
history_limit = config.history_limit,
source_count,
"starting observation runtime"
);
let runtime_handle = handle.clone();
let task = tokio::spawn(run_observation_loop(
provider,
observer,
config,
shared,
runtime_handle.batches.clone(),
state,
));
Ok(Self {
handle: runtime_handle,
task: Some(task),
})
}
/// Returns a read-side handle for the running observer.
#[must_use]
pub fn handle(&self) -> ObservationHandle<O> {
self.handle.clone()
}
/// Splits the runtime into its handle and background task.
#[must_use]
pub fn into_parts(mut self) -> (ObservationHandle<O>, JoinHandle<()>) {
let task = self
.task
.take()
.expect("observation runtime task is always present before into_parts");
(self.handle.clone(), task)
}
/// Aborts the background task.
pub fn abort(&mut self) {
if let Some(task) = self.task.take() {
task.abort();
}
}
}
impl<O: Observer> Drop for ObservationRuntime<O> {
fn drop(&mut self) {
self.abort();
}
}
struct SharedObservationState<O: Observer> {
latest_snapshot: Option<ObservationSnapshot<O::Snapshot>>,
history: VecDeque<Arc<ObservationBatch<O::Event>>>,
last_error: Option<ObservationFailure>,
}
impl<O: Observer> SharedObservationState<O> {
fn new(snapshot: ObservationSnapshot<O::Snapshot>) -> Self {
Self {
latest_snapshot: Some(snapshot),
history: VecDeque::new(),
last_error: None,
}
}
}
async fn run_observation_loop<O, P>(
provider: P,
observer: O,
config: ObservationConfig,
shared: Arc<Mutex<SharedObservationState<O>>>,
batches: broadcast::Sender<Arc<ObservationBatch<O::Event>>>,
mut state: O::State,
) where
O: Observer,
P: SourceProvider<O::Source>,
{
let mut ticker = build_interval(config.interval);
let mut cycle = 1u64;
ticker.tick().await;
loop {
ticker.tick().await;
let cycle_outcome = observe_cycle(&provider, &observer, cycle, &mut state).await;
match cycle_outcome {
Ok(success) => record_cycle_success(&shared, &batches, &config, success),
Err(failure) => record_cycle_failure(&shared, failure),
}
cycle += 1;
}
}
struct CycleSuccess<O: Observer> {
snapshot: ObservationSnapshot<O::Snapshot>,
batch: Option<Arc<ObservationBatch<O::Event>>>,
}
async fn observe_cycle<O, P>(
provider: &P,
observer: &O,
cycle: u64,
state: &mut O::State,
) -> Result<CycleSuccess<O>, ObservationFailure>
where
O: Observer,
P: SourceProvider<O::Source>,
{
let sources = provider.sources().await.map_err(|source| {
build_failure(cycle, 0, ObservationFailureStage::SourceRefresh, source)
})?;
let source_count = sources.len();
let events = observer.poll(&sources, state).await.map_err(|source| {
build_failure(cycle, source_count, ObservationFailureStage::Poll, source)
})?;
let snapshot = build_snapshot(cycle, source_count, observer, state);
let batch = build_batch(cycle, source_count, events);
Ok(CycleSuccess { snapshot, batch })
}
fn record_cycle_success<O: Observer>(
shared: &Arc<Mutex<SharedObservationState<O>>>,
batches: &broadcast::Sender<Arc<ObservationBatch<O::Event>>>,
config: &ObservationConfig,
success: CycleSuccess<O>,
) {
debug!(
observer = type_name::<O>(),
cycle = success.snapshot.cycle,
source_count = success.snapshot.source_count,
event_count = success.batch.as_ref().map_or(0, |batch| batch.events.len()),
"observation cycle completed"
);
let mut state = shared.lock();
state.latest_snapshot = Some(success.snapshot);
state.last_error = None;
let Some(batch) = success.batch else {
return;
};
push_history(&mut state.history, Arc::clone(&batch), config.history_limit);
drop(state);
let _ = batches.send(batch);
}
fn record_cycle_failure<O: Observer>(
shared: &Arc<Mutex<SharedObservationState<O>>>,
failure: ObservationFailure,
) {
warn!(
observer = type_name::<O>(),
cycle = failure.cycle,
source_count = failure.source_count,
stage = ?failure.stage,
message = %failure.message,
"observation cycle failed"
);
shared.lock().last_error = Some(failure);
}
fn ensure_positive_interval(interval: Duration) -> Result<(), ObservationRuntimeError> {
if interval.is_zero() {
return Err(ObservationRuntimeError::InvalidInterval);
}
Ok(())
}
fn build_interval(period: Duration) -> tokio::time::Interval {
let mut ticker = interval(period);
ticker.set_missed_tick_behavior(MissedTickBehavior::Delay);
ticker
}
fn build_snapshot<O: Observer>(
cycle: u64,
source_count: usize,
observer: &O,
state: &O::State,
) -> ObservationSnapshot<O::Snapshot> {
ObservationSnapshot {
cycle,
observed_at: SystemTime::now(),
source_count,
value: observer.snapshot(state),
}
}
fn build_batch<E>(
cycle: u64,
source_count: usize,
events: Vec<E>,
) -> Option<Arc<ObservationBatch<E>>> {
if events.is_empty() {
return None;
}
Some(Arc::new(ObservationBatch {
cycle,
observed_at: SystemTime::now(),
source_count,
events,
}))
}
fn build_failure(
cycle: u64,
source_count: usize,
stage: ObservationFailureStage,
source: DynError,
) -> ObservationFailure {
ObservationFailure {
cycle,
observed_at: SystemTime::now(),
source_count,
stage,
message: source.to_string(),
}
}
fn push_history<E>(
history: &mut VecDeque<Arc<ObservationBatch<E>>>,
batch: Arc<ObservationBatch<E>>,
history_limit: usize,
) {
if history_limit == 0 {
return;
}
history.push_back(batch);
while history.len() > history_limit {
history.pop_front();
}
}
#[cfg(test)]
mod tests;
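#[cfg(test)]
mod usage_sketch {
    //! Hedged end-to-end sketch of the public API, compiled as a test. The
    //! observer below is illustrative only: its snapshot is just the current
    //! source count, and it never emits delta events.
    use std::time::Duration;

    use async_trait::async_trait;

    use super::{
        ObservationConfig, ObservationRuntime, ObservedSource, Observer, StaticSourceProvider,
    };
    use crate::scenario::DynError;

    struct SourceCountObserver;

    #[async_trait]
    impl Observer for SourceCountObserver {
        type Source = ();
        type State = usize;
        type Snapshot = usize;
        type Event = usize;

        async fn init(
            &self,
            sources: &[ObservedSource<Self::Source>],
        ) -> Result<Self::State, DynError> {
            Ok(sources.len())
        }

        async fn poll(
            &self,
            sources: &[ObservedSource<Self::Source>],
            state: &mut Self::State,
        ) -> Result<Vec<Self::Event>, DynError> {
            *state = sources.len();
            Ok(Vec::new()) // empty batches are neither retained nor broadcast
        }

        fn snapshot(&self, state: &Self::State) -> Self::Snapshot {
            *state
        }
    }

    #[tokio::test]
    async fn startup_snapshot_is_available_immediately() {
        let provider = StaticSourceProvider::new(vec![ObservedSource::new("node-0", ())]);
        // A long interval keeps the background loop idle for the duration of
        // this test, so only the startup snapshot (cycle 0) is observable.
        let runtime = ObservationRuntime::start(
            provider,
            SourceCountObserver,
            ObservationConfig {
                interval: Duration::from_secs(60),
                history_limit: 4,
            },
        )
        .await
        .expect("runtime should start");
        let snapshot = runtime
            .handle()
            .latest_snapshot()
            .expect("startup snapshot should exist");
        assert_eq!(snapshot.cycle, 0);
        assert_eq!(snapshot.source_count, 1);
        assert_eq!(snapshot.value, 1);
    }
}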

View File

@ -0,0 +1,250 @@
use std::{
sync::{
Arc,
atomic::{AtomicUsize, Ordering},
},
time::Duration,
};
use async_trait::async_trait;
use parking_lot::Mutex;
use tokio::time::{Instant, sleep};
use super::{
ObservationConfig, ObservationFailureStage, ObservationRuntime, ObservedSource, Observer,
SourceProvider,
};
use crate::scenario::DynError;
#[derive(Clone)]
struct TestSourceProvider {
sources: Arc<Mutex<Vec<ObservedSource<u64>>>>,
fail_refreshes: Arc<AtomicUsize>,
}
impl TestSourceProvider {
fn new(sources: Vec<ObservedSource<u64>>) -> Self {
Self {
sources: Arc::new(Mutex::new(sources)),
fail_refreshes: Arc::new(AtomicUsize::new(0)),
}
}
fn replace_sources(&self, sources: Vec<ObservedSource<u64>>) {
*self.sources.lock() = sources;
}
fn fail_next_refresh(&self) {
self.fail_refreshes.store(1, Ordering::SeqCst);
}
}
#[async_trait]
impl SourceProvider<u64> for TestSourceProvider {
async fn sources(&self) -> Result<Vec<ObservedSource<u64>>, DynError> {
if self.fail_refreshes.swap(0, Ordering::SeqCst) == 1 {
return Err("refresh failed".into());
}
Ok(self.sources.lock().clone())
}
}
#[derive(Clone, Debug, Eq, PartialEq)]
struct TestSnapshot {
total_sources_seen: u64,
last_source_count: usize,
}
#[derive(Clone, Debug, Eq, PartialEq)]
struct TestEvent {
total_sources_seen: u64,
}
#[derive(Default)]
struct TestState {
total_sources_seen: u64,
last_source_count: usize,
}
struct CountingObserver;
#[async_trait]
impl Observer for CountingObserver {
type Source = u64;
type State = TestState;
type Snapshot = TestSnapshot;
type Event = TestEvent;
async fn init(
&self,
sources: &[ObservedSource<Self::Source>],
) -> Result<Self::State, DynError> {
Ok(TestState {
total_sources_seen: sources.iter().map(|source| source.source).sum(),
last_source_count: sources.len(),
})
}
async fn poll(
&self,
sources: &[ObservedSource<Self::Source>],
state: &mut Self::State,
) -> Result<Vec<Self::Event>, DynError> {
state.total_sources_seen += sources.iter().map(|source| source.source).sum::<u64>();
state.last_source_count = sources.len();
Ok(vec![TestEvent {
total_sources_seen: state.total_sources_seen,
}])
}
fn snapshot(&self, state: &Self::State) -> Self::Snapshot {
TestSnapshot {
total_sources_seen: state.total_sources_seen,
last_source_count: state.last_source_count,
}
}
}
#[tokio::test]
async fn runtime_updates_snapshot_and_history() {
let provider = TestSourceProvider::new(vec![ObservedSource::new("node-0", 2)]);
let runtime = ObservationRuntime::start(
provider,
CountingObserver,
ObservationConfig {
interval: Duration::from_millis(25),
history_limit: 2,
},
)
.await
.expect("runtime should start");
let handle = runtime.handle();
wait_for_cycle(&handle, 2).await;
let snapshot = handle.latest_snapshot().expect("snapshot should exist");
assert!(snapshot.cycle >= 2);
assert_eq!(snapshot.source_count, 1);
assert_eq!(snapshot.value.last_source_count, 1);
assert!(snapshot.value.total_sources_seen >= 6);
let history = handle.history();
assert_eq!(history.len(), 2);
assert!(history.iter().all(|batch| !batch.events.is_empty()));
}
#[tokio::test]
async fn runtime_refreshes_sources_each_cycle() {
let provider = TestSourceProvider::new(vec![ObservedSource::new("node-0", 1)]);
let runtime = ObservationRuntime::start(
provider.clone(),
CountingObserver,
ObservationConfig {
interval: Duration::from_millis(25),
history_limit: 4,
},
)
.await
.expect("runtime should start");
let handle = runtime.handle();
wait_for_cycle(&handle, 1).await;
provider.replace_sources(vec![
ObservedSource::new("node-0", 1),
ObservedSource::new("node-1", 3),
]);
wait_for_snapshot_source_count(&handle, 2).await;
let snapshot = handle.latest_snapshot().expect("snapshot should exist");
assert_eq!(snapshot.source_count, 2);
assert_eq!(snapshot.value.last_source_count, 2);
}
#[tokio::test]
async fn runtime_records_cycle_failures() {
let provider = TestSourceProvider::new(vec![ObservedSource::new("node-0", 1)]);
let runtime = ObservationRuntime::start(
provider.clone(),
CountingObserver,
ObservationConfig {
interval: Duration::from_millis(25),
history_limit: 2,
},
)
.await
.expect("runtime should start");
let handle = runtime.handle();
provider.fail_next_refresh();
wait_for_failure(&handle).await;
let failure = handle.last_error().expect("failure should exist");
assert_eq!(failure.stage, ObservationFailureStage::SourceRefresh);
assert_eq!(failure.message, "refresh failed");
}
async fn wait_for_cycle(handle: &super::ObservationHandle<CountingObserver>, cycle: u64) {
let deadline = Instant::now() + Duration::from_secs(2);
loop {
let Some(snapshot) = handle.latest_snapshot() else {
sleep(Duration::from_millis(10)).await;
continue;
};
if snapshot.cycle >= cycle {
return;
}
assert!(
Instant::now() < deadline,
"timed out waiting for cycle {cycle}"
);
sleep(Duration::from_millis(10)).await;
}
}
async fn wait_for_snapshot_source_count(
handle: &super::ObservationHandle<CountingObserver>,
source_count: usize,
) {
let deadline = Instant::now() + Duration::from_secs(2);
loop {
let Some(snapshot) = handle.latest_snapshot() else {
sleep(Duration::from_millis(10)).await;
continue;
};
if snapshot.source_count == source_count {
return;
}
assert!(
Instant::now() < deadline,
"timed out waiting for source_count {source_count}"
);
sleep(Duration::from_millis(10)).await;
}
}
async fn wait_for_failure(handle: &super::ObservationHandle<CountingObserver>) {
let deadline = Instant::now() + Duration::from_secs(2);
loop {
if handle.last_error().is_some() {
return;
}
assert!(Instant::now() < deadline, "timed out waiting for failure");
sleep(Duration::from_millis(10)).await;
}
}

View File

@ -4,7 +4,12 @@ use super::{
Application, CleanupPolicy, DeploymentPolicy, Expectation, HttpReadinessRequirement,
RetryPolicy, RuntimeExtensionFactory, Workload, internal::CoreBuilderAccess,
};
use crate::topology::{DeploymentProvider, DeploymentSeed};
use crate::{
observation::{
ObservationConfig, ObservationExtensionFactory, Observer, SourceProviderFactory,
},
topology::{DeploymentProvider, DeploymentSeed},
};
type DeploymentProviderHandle<E> = Box<dyn DeploymentProvider<<E as Application>::Deployment>>;
@ -60,6 +65,48 @@ pub trait CoreBuilderExt: CoreBuilderAccess + Sized {
self.map_core_builder(|builder| builder.with_runtime_extension_factory(extension))
}
/// Registers one clonable observer as a runtime extension.
#[must_use]
fn with_observer<O>(
self,
observer: O,
source_provider_factory: impl SourceProviderFactory<Self::Env, O::Source>,
config: ObservationConfig,
) -> Self
where
O: Observer + Clone,
Self::Env: Application,
{
let extension = ObservationExtensionFactory::<Self::Env, O>::new(
observer,
source_provider_factory,
config,
);
self.with_runtime_extension_factory(Box::new(extension))
}
/// Registers one observer built lazily per run as a runtime extension.
#[must_use]
fn with_observer_factory<O>(
self,
observer_builder: impl Fn() -> O + Send + Sync + 'static,
source_provider_factory: impl SourceProviderFactory<Self::Env, O::Source>,
config: ObservationConfig,
) -> Self
where
O: Observer,
Self::Env: Application,
{
let extension = ObservationExtensionFactory::<Self::Env, O>::from_parts(
observer_builder,
source_provider_factory,
config,
);
self.with_runtime_extension_factory(Box::new(extension))
}
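    // Hedged usage sketch; `MyEnv`, `my_observer`, and `my_source_factory`
    // are stand-ins for app-defined items, not part of this crate:
    //
    //     ScenarioBuilder::<MyEnv>::with_deployment(topology)
    //         .with_observer(my_observer, my_source_factory, ObservationConfig::default())
    //         .with_run_duration(Duration::from_secs(30))
    //         .build()?;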
#[must_use]
fn with_run_duration(self, duration: Duration) -> Self {
self.map_core_builder(|builder| builder.with_run_duration(duration))

Some files were not shown because too many files have changed in this diff.