ci: add daily rln simulator e2e workflow (#3885)

2026-05-23 10:49:33 +00:00 · 2026-05-22 17:15:31 +05:30 · 2026-05-22 17:15:31 +05:30 · 67eebe3a02
commit 67eebe3a02
parent 04ef12ccf3
5 changed files with 674 additions and 0 deletions
--- a/.github/workflows/ci-daily.yml
+++ b/.github/workflows/ci-daily.yml
@ -77,3 +77,8 @@ jobs:
            }" \
            "$DISCORD_WEBHOOK_URL"

+  # RLN end-to-end against the simulator. Defaults from tests/simulator/rln-sim.env.
+  # Called without inputs, so every run parameter falls back to that file.
+  # Secrets are inherited so the called workflow can read DISCORD_WEBHOOK_URL.
+  rln-simulator:
+    uses: ./.github/workflows/ci-rln-simulator.yml
+    secrets: inherit
+
--- a/.github/workflows/ci-rln-simulator.yml
+++ b/.github/workflows/ci-rln-simulator.yml
@ -0,0 +1,271 @@
+name: RLN E2E — Simulator
+
+# Validates the full RLN flow end-to-end against logos-delivery-simulator:
+# keystore generation, on-chain registration, gossipsub propagation,
+# per-epoch rate-limit enforcement, and epoch-boundary recovery.
+#
+# Why this exists: logos-dev runs with RLN disabled, so there is no
+# production traffic exercising RLN. Until RLN is enabled there, this is
+# the only end-to-end coverage of the RLN + zerokit path.
+#
+# The image is built ON the runner and tested ON the same runner, so the
+# AVX-512 portability issue in container-image.yml does not apply here.
+#
+# No own schedule: ci-daily.yml is the single daily entry point and calls
+# this via workflow_call. workflow_dispatch allows manual runs.
+# Run defaults live in tests/simulator/rln-sim.env; inputs override per-run.
+
+on:
+  workflow_call:
+    inputs:
+      branch:
+        type: string
+        default: ''
+      num_nodes:
+        type: string
+        default: ''
+      msg_limit:
+        type: string
+        default: ''
+      epoch_sec:
+        type: string
+        default: ''
+  workflow_dispatch:
+    inputs:
+      branch:
+        description: 'logos-delivery branch to build & test (blank = use rln-sim.env)'
+        type: string
+        default: ''
+      num_nodes:
+        description: 'Number of nwaku nodes (blank = use rln-sim.env)'
+        type: string
+        default: ''
+      msg_limit:
+        description: 'RLN_RELAY_MSG_LIMIT, must be >= contract min ~20 (blank = use rln-sim.env)'
+        type: string
+        default: ''
+      epoch_sec:
+        description: 'RLN_RELAY_EPOCH_SEC, large enough a burst cannot straddle an epoch (blank = use rln-sim.env)'
+        type: string
+        default: ''
+
+env:
+  NPROC: 2
+  MAKEFLAGS: "-j2"
+  NIM_VERSION: '2.2.4'
+  NIMBLE_VERSION: '0.22.3'
+
+jobs:
+  rln-e2e:
+    runs-on: ubuntu-22.04
+    timeout-minutes: 120
+    name: rln-e2e
+
+    steps:
+      # First checkout: the ref that triggered this workflow (CI branch /
+      # master). This is where the e2e test script and rln-sim.env live —
+      # the build branch may not contain them.
+      - name: Checkout CI ref (for the test script)
+        uses: actions/checkout@v4
+        with:
+          submodules: false
+
+      # Defaults come from tests/simulator/rln-sim.env (single source of truth);
+      # a non-blank input (dispatch or workflow_call) overrides the matching value.
+      - name: Resolve parameters
+        id: cfg
+        env:
+          IN_BRANCH: ${{ inputs.branch }}
+          IN_NUM_NODES: ${{ inputs.num_nodes }}
+          IN_MSG_LIMIT: ${{ inputs.msg_limit }}
+          IN_EPOCH_SEC: ${{ inputs.epoch_sec }}
+        run: |
+          set -euo pipefail
+          # Export every assignment in the env file, then let non-blank inputs win.
+          set -a; . tests/simulator/rln-sim.env; set +a
+          branch="${IN_BRANCH:-$BRANCH}"
+          num_nodes="${IN_NUM_NODES:-$NUM_NODES}"
+          msg_limit="${IN_MSG_LIMIT:-$MSG_LIMIT}"
+          epoch_sec="${IN_EPOCH_SEC:-$EPOCH_SEC}"
+          # The numeric outputs are later expanded unquoted into shell scripts
+          # and the simulator .env, so reject anything but a positive integer
+          # here instead of failing (or misbehaving) several steps later.
+          for pair in "num_nodes=$num_nodes" "msg_limit=$msg_limit" "epoch_sec=$epoch_sec"; do
+            case "${pair#*=}" in
+              ''|0*|*[!0-9]*)
+                echo "::error::${pair%%=*} must be a positive integer, got '${pair#*=}'"
+                exit 1
+                ;;
+            esac
+          done
+          {
+            echo "branch=$branch"
+            echo "num_nodes=$num_nodes"
+            echo "msg_limit=$msg_limit"
+            echo "epoch_sec=$epoch_sec"
+          } >> "$GITHUB_OUTPUT"
+
+      - name: Stash e2e test script outside the workspace
+        run: |
+          test -f tests/simulator/rln-e2e-test.py \
+            || { echo "tests/simulator/rln-e2e-test.py missing on CI ref"; exit 1; }
+          cp tests/simulator/rln-e2e-test.py "$RUNNER_TEMP/rln-e2e-test.py"
+
+      # Second checkout: the branch to build & test. Overwrites the workspace;
+      # the stashed test script in RUNNER_TEMP survives.
+      - name: Checkout logos-delivery (${{ steps.cfg.outputs.branch }})
+        uses: actions/checkout@v4
+        with:
+          ref: ${{ steps.cfg.outputs.branch }}
+          submodules: false
+          clean: true
+
+      - name: Get submodules hash
+        id: submodules
+        run: echo "hash=$(git submodule status | awk '{print $1}' | sort | shasum -a 256 | sed 's/[ -]*//g')" >> $GITHUB_OUTPUT
+
+      - name: Cache submodules
+        uses: actions/cache@v3
+        with:
+          path: |
+            vendor/
+            .git/modules
+          key: ${{ runner.os }}-vendor-modules-${{ steps.submodules.outputs.hash }}
+
+      - name: Install Nim ${{ env.NIM_VERSION }}
+        uses: jiro4989/setup-nim-action@v2
+        with:
+          nim-version: ${{ env.NIM_VERSION }}
+          repo-token: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Install Nimble ${{ env.NIMBLE_VERSION }}
+        run: |
+          cd /tmp && nimble install "nimble@${{ env.NIMBLE_VERSION }}" -y
+          echo "$HOME/.nimble/bin" >> $GITHUB_PATH
+
+      - name: Cache nimble deps
+        id: cache-nimbledeps
+        uses: actions/cache@v3
+        with:
+          path: |
+            nimbledeps/
+            nimble.paths
+          key: ${{ runner.os }}-nimbledeps-nimble${{ env.NIMBLE_VERSION }}-${{ hashFiles('nimble.lock', 'BearSSL.mk', 'Nat.mk') }}
+
+      - name: Install nimble deps
+        if: steps.cache-nimbledeps.outputs.cache-hit != 'true'
+        run: |
+          nimble setup --localdeps -y
+          make rebuild-nat-libs-nimbledeps
+          make rebuild-bearssl-nimbledeps
+          touch nimbledeps/.nimble-setup
+
+      - name: Build wakunode2
+        run: |
+          make -j${NPROC} V=1 POSTGRES=1 \
+            NIMFLAGS="-d:disableMarchNative -d:chronicles_colors:none" \
+            wakunode2
+
+      - name: Build local Docker image
+        run: |
+          docker build -t nwaku-rln-ci:test -f docker/binaries/Dockerfile.bn.amd64 .
+
+      - name: Clone logos-delivery-simulator
+        run: |
+          git clone --depth 1 https://github.com/logos-messaging/logos-delivery-simulator.git "$RUNNER_TEMP/logos-delivery-simulator"
+
+      - name: Write simulator .env
+        working-directory: ${{ runner.temp }}/logos-delivery-simulator
+        run: |
+          cat > .env <<EOF
+          LD_IMAGE=nwaku-rln-ci:test
+          NUM_LD_NODES=${{ steps.cfg.outputs.num_nodes }}
+          MSG_SIZE_KBYTES=1
+          TRAFFIC_DELAY_SECONDS=5
+          RLN_RELAY_EPOCH_SEC=${{ steps.cfg.outputs.epoch_sec }}
+          RLN_RELAY_MSG_LIMIT=${{ steps.cfg.outputs.msg_limit }}
+          MAX_MESSAGE_LIMIT=100
+          RPC_URL=http://foundry:8545
+          PRIVATE_KEY=0xac0974bec39a17e36ba4a6b4d238ff944bacb478cbed5efcae784d7bf4f2ff80
+          ETH_FROM=0xf39fd6e51aad88f6f4ce6ab8827279cfffb92266
+          RLN_CONTRACT_REPO_COMMIT=e75ac913e579ad872f54b2225eec35d1de3d98b0
+          WATCHTOWER_ENABLED=false
+          EOF
+
+      - name: Bring up simulator (RLN subset)
+        working-directory: ${{ runner.temp }}/logos-delivery-simulator
+        run: |
+          docker compose up -d foundry contract-repo-deployer nwaku-token-init bootstrap nwaku
+
+      - name: Wait for contract deployer
+        working-directory: ${{ runner.temp }}/logos-delivery-simulator
+        run: |
+          # Poll for up to 60 x 15s = 15 min until the one-shot deployer exits.
+          st=missing
+          for _ in $(seq 1 60); do
+            st=$(docker inspect logos-delivery-simulator-contract-repo-deployer-1 --format='{{.State.Status}}' 2>/dev/null || echo missing)
+            [ "$st" = "exited" ] && break
+            echo "deployer status: $st"; sleep 15
+          done
+          # A container that has not exited yet reports ExitCode 0, so a timeout
+          # must be detected from the status, not from the exit code below.
+          if [ "$st" != "exited" ]; then
+            echo "deployer did not exit within 15 minutes (last status: $st)"
+            docker logs logos-delivery-simulator-contract-repo-deployer-1 2>&1 | tail -50 || true
+            exit 1
+          fi
+          ec=$(docker inspect logos-delivery-simulator-contract-repo-deployer-1 --format='{{.State.ExitCode}}')
+          echo "deployer exit code: $ec"
+          if [ "$ec" != "0" ]; then
+            docker logs logos-delivery-simulator-contract-repo-deployer-1 2>&1 | tail -50
+            exit 1
+          fi
+
+      - name: Wait for nwaku fleet to register
+        working-directory: ${{ runner.temp }}/logos-delivery-simulator
+        run: |
+          N=${{ steps.cfg.outputs.num_nodes }}
+          up=0
+          for _ in $(seq 1 60); do
+            # Count only the nwaku-<index> replicas: the docker name filter is a
+            # substring match, so on its own it would also count a running
+            # nwaku-token-init container.
+            up=$(docker ps --filter 'name=logos-delivery-simulator-nwaku-' --filter 'status=running' --format '{{.Names}}' \
+              | grep -cE '^logos-delivery-simulator-nwaku-[0-9]+$' || true)
+            echo "nwaku running: $up/$N"
+            [ "$up" -ge "$N" ] && break
+            sleep 15
+          done
+          # Fail here with a clear message instead of letting the e2e script
+          # discover the missing nodes after its own health deadline.
+          if [ "$up" -lt "$N" ]; then
+            echo "only $up/$N nwaku containers running after 15 minutes"
+            exit 1
+          fi
+          # nwaku-1 must reach the "registered + started" marker
+          timeout 300 docker logs -f logos-delivery-simulator-nwaku-1 2>&1 \
+            | grep -m1 -E "Segmentation fault|Illegal instruction|Failed to register on-chain|I am a nwaku node" \
+            | tee /tmp/nwaku1.verdict
+          grep -q "I am a nwaku node" /tmp/nwaku1.verdict
+
+      - name: Run RLN e2e scenarios
+        run: |
+          TEST_SCRIPT="$RUNNER_TEMP/rln-e2e-test.py"
+          test -f "$TEST_SCRIPT" \
+            || { echo "stashed test script missing at $TEST_SCRIPT"; exit 1; }
+          docker run --rm \
+            --network logos-delivery-simulator_simulation \
+            -v "$TEST_SCRIPT:/test.py:ro" \
+            python:3.11-slim \
+            sh -c "pip install --quiet --disable-pip-version-check requests && \
+                   python /test.py \
+                     --hostname-prefix logos-delivery-simulator-nwaku- \
+                     --num-nodes ${{ steps.cfg.outputs.num_nodes }} \
+                     --msg-limit ${{ steps.cfg.outputs.msg_limit }} \
+                     --epoch-sec ${{ steps.cfg.outputs.epoch_sec }} \
+                     --health-deadline-sec 600"
+
+      - name: Collect logs on failure
+        if: failure()
+        working-directory: ${{ runner.temp }}/logos-delivery-simulator
+        run: |
+          mkdir -p "$RUNNER_TEMP/logs"
+          for c in $(docker ps -a --filter 'name=logos-delivery-simulator-' --format '{{.Names}}'); do
+            docker logs "$c" > "$RUNNER_TEMP/logs/$c.log" 2>&1 || true
+          done
+
+      - name: Upload logs
+        if: failure()
+        uses: actions/upload-artifact@v4
+        with:
+          name: simulator-logs
+          path: ${{ runner.temp }}/logs
+          retention-days: 7
+
+      - name: Tear down
+        if: always()
+        working-directory: ${{ runner.temp }}/logos-delivery-simulator
+        run: docker compose down -v || true
+
+      - name: Notify Discord
+        if: always()
+        env:
+          DISCORD_WEBHOOK_URL: ${{ secrets.DISCORD_WEBHOOK_URL }}
+        run: |
+          [ -z "$DISCORD_WEBHOOK_URL" ] && exit 0
+          STATUS="${{ job.status }}"
+          BRANCH="${{ steps.cfg.outputs.branch }}"
+          RUN_URL="https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}"
+          if [ "$STATUS" = "success" ]; then COLOR=3066993; TITLE="✅ RLN E2E passed"; else COLOR=15158332; TITLE="❌ RLN E2E failed"; fi
+          curl -H "Content-Type: application/json" -X POST -d "{
+            \"embeds\":[{\"title\":\"$TITLE\",\"color\":$COLOR,
+              \"fields\":[
+                {\"name\":\"Branch\",\"value\":\"$BRANCH\",\"inline\":true},
+                {\"name\":\"Status\",\"value\":\"$STATUS\",\"inline\":true}],
+              \"url\":\"$RUN_URL\",
+              \"footer\":{\"text\":\"Daily RLN simulator E2E\"}}]}" \
+            "$DISCORD_WEBHOOK_URL"
--- a/.gitignore
+++ b/.gitignore
@ -86,3 +86,7 @@ nimbledeps

 **/anvil_state/state-deployed-contracts-mint-and-approved.json
 .gitnexus
+
+# Python bytecode from tests/simulator
+__pycache__/
+*.pyc
--- a/tests/simulator/rln-e2e-test.py
+++ b/tests/simulator/rln-e2e-test.py
@ -0,0 +1,388 @@
+#!/usr/bin/env python3
+"""
+RLN end-to-end test against a running logos-delivery-simulator stack.
+
+Designed to run as a sidecar container on the simulator's Docker network so
+hostnames like `logos-delivery-simulator-nwaku-1` resolve via Docker DNS.
+
+Scenarios covered (in order):
+  1. HEALTH       - every node responds to /debug/v1/info with an enrUri
+  2. SUBSCRIBE    - every node REST-subscribes to the pubsub topic
+  3. WARMUP       - every node publishes successfully at least once (retried)
+  4. PROPAGATION  - one sender's message lands in all peers' inboxes
+  5. RATE_LIMIT   - per-node burst of msg_limit+3 messages: accepted count must
+                    stay <= msg_limit and be followed by 500 (rate-limit hit)
+  6. EPOCH_RESET  - after epoch_sec, every node can send 1 more -> 200
+
+Not covered yet: SAME_MESSAGE_ID (sending the same message_id twice in the
+same epoch, the slashable signal) has no scenario in this script.
+
+Exit code:
+  0 = all scenarios passed
+  N = number of scenarios that failed
+
+Usage (typical):
+  docker run --rm \\
+    --network logos-delivery-simulator_simulation \\
+    -v /path/to/rln-e2e-test.py:/test.py \\
+    python:3.11-slim \\
+    sh -c 'pip install --quiet requests && python /test.py \\
+             --hostname-prefix logos-delivery-simulator-nwaku- \\
+             --num-nodes 30 --msg-limit 30 --epoch-sec 15'
+"""
+
+import argparse
+import base64
+import concurrent.futures as cf
+import json
+import os
+import sys
+import time
+import urllib.parse
+from dataclasses import dataclass
+from typing import Optional
+
+import requests
+
+PUBSUB_TOPIC = "/waku/2/rs/66/0"
+CONTENT_TOPIC = "/rln-test/1/probe/proto"
+
+# ---------------------------------------------------------------------------
+# helpers
+# ---------------------------------------------------------------------------
+
+def url_of(host: str, port: int = 8645) -> str:
+    return f"http://{host}:{port}"
+
+
+def waku_publish(node_url: str, payload: bytes, timeout: float = 5.0) -> int:
+    body = {
+        "payload": base64.b64encode(payload).decode("ascii"),
+        "contentTopic": CONTENT_TOPIC,
+        "version": 1,
+        "timestamp": time.time_ns(),
+    }
+    enc = urllib.parse.quote(PUBSUB_TOPIC, safe="")
+    try:
+        r = requests.post(
+            f"{node_url}/relay/v1/messages/{enc}",
+            json=body,
+            timeout=timeout,
+            headers={"content-type": "application/json"},
+        )
+        return r.status_code
+    except requests.RequestException:
+        return -1
+
+
+def waku_subscribe(node_url: str, timeout: float = 5.0) -> int:
+    try:
+        r = requests.post(
+            f"{node_url}/relay/v1/subscriptions",
+            json=[PUBSUB_TOPIC],
+            timeout=timeout,
+            headers={"content-type": "application/json"},
+        )
+        return r.status_code
+    except requests.RequestException:
+        return -1
+
+
+def waku_get_messages(node_url: str, timeout: float = 5.0) -> Optional[list]:
+    enc = urllib.parse.quote(PUBSUB_TOPIC, safe="")
+    try:
+        r = requests.get(
+            f"{node_url}/relay/v1/messages/{enc}",
+            timeout=timeout,
+        )
+        if r.status_code != 200:
+            return None
+        return r.json()
+    except (requests.RequestException, json.JSONDecodeError):
+        return None
+
+
+def node_healthy(node_url: str, timeout: float = 3.0) -> bool:
+    try:
+        r = requests.get(f"{node_url}/debug/v1/info", timeout=timeout)
+        return r.status_code == 200 and "enrUri" in r.json()
+    except (requests.RequestException, json.JSONDecodeError):
+        return False
+
+
+# ---------------------------------------------------------------------------
+# scenarios
+# ---------------------------------------------------------------------------
+
+@dataclass
+class Result:
+    name: str
+    ok: bool
+    detail: str = ""
+
+    def __str__(self) -> str:
+        status = "PASS" if self.ok else "FAIL"
+        s = f"[{status}] {self.name}"
+        if self.detail:
+            s += f" — {self.detail}"
+        return s
+
+
+def scenario_health(nodes: list[str], deadline_sec: int = 120) -> Result:
+    """Every node must be reachable within deadline_sec."""
+    start = time.time()
+    unhealthy = list(nodes)
+    while time.time() - start < deadline_sec and unhealthy:
+        with cf.ThreadPoolExecutor(max_workers=min(32, len(unhealthy))) as ex:
+            results = list(ex.map(node_healthy, [url_of(n) for n in unhealthy]))
+        unhealthy = [n for n, ok in zip(unhealthy, results) if not ok]
+        if unhealthy:
+            time.sleep(3)
+    return Result(
+        "HEALTH",
+        not unhealthy,
+        f"{len(nodes) - len(unhealthy)}/{len(nodes)} healthy"
+        + (f"; failing: {unhealthy[:5]}" if unhealthy else ""),
+    )
+
+
+def scenario_subscribe(nodes: list[str]) -> Result:
+    """REST-subscribe every node to the pubsub topic so GETs return cached msgs."""
+    with cf.ThreadPoolExecutor(max_workers=min(32, len(nodes))) as ex:
+        codes = list(ex.map(waku_subscribe, [url_of(n) for n in nodes]))
+    bad = [(n, c) for n, c in zip(nodes, codes) if c != 200]
+    return Result(
+        "SUBSCRIBE",
+        not bad,
+        f"{len(nodes) - len(bad)}/{len(nodes)} subscribed"
+        + (f"; failing: {bad[:5]}" if bad else ""),
+    )
+
+
+def _send_n(node_url: str, n: int) -> list[int]:
+    codes = []
+    for i in range(n):
+        codes.append(waku_publish(node_url, f"probe-{i}".encode()))
+    return codes
+
+
+def _burst_until_blocked(node_url: str, msg_limit: int, overshoot: int = 3):
+    """Send msg_limit+overshoot messages back-to-back, fast, recording codes.
+    Designed to complete inside a single epoch — keep epoch_sec large enough
+    that this burst can't straddle an epoch boundary.
+
+    Returns (n_200, n_500, n_transport_err, two_hundred_after_block) where
+    two_hundred_after_block flags a 200 appearing AFTER the first 500 (i.e.
+    quota reset mid-burst => epoch straddle)."""
+    codes = []
+    for i in range(msg_limit + overshoot):
+        codes.append(waku_publish(node_url, f"burst-{i}".encode(), timeout=10.0))
+    n_200 = sum(c == 200 for c in codes)
+    n_500 = sum(c == 500 for c in codes)
+    n_err = sum(c not in (200, 500) for c in codes)  # -1, 4xx transient, etc.
+    first_block_idx = next((i for i, c in enumerate(codes) if c == 500), None)
+    two_hundred_after_block = (
+        first_block_idx is not None
+        and any(c == 200 for c in codes[first_block_idx + 1:])
+    )
+    return n_200, n_500, n_err, two_hundred_after_block
+
+
+def _publish_until_ok(node_url: str, attempts: int = 20, spacing: float = 5.0) -> bool:
+    """Retry a single publish until it returns 200 or attempts run out.
+    Tolerates the post-startup window where discv5/gossipsub mesh is still
+    forming and the RLN publish path transiently 500s."""
+    for _ in range(attempts):
+        if waku_publish(node_url, b"warmup", timeout=10.0) == 200:
+            return True
+        time.sleep(spacing)
+    return False
+
+
+def scenario_warmup(nodes: list[str], attempts: int = 20) -> Result:
+    """Readiness gate: every node must successfully publish at least once.
+    This absorbs mesh-formation churn so PROPAGATION/RATE_LIMIT aren't
+    judging a not-yet-connected fleet. Consumes 1 nonce/node — well within
+    msg_limit, and RATE_LIMIT's tolerance accounts for it."""
+    with cf.ThreadPoolExecutor(max_workers=min(8, len(nodes))) as ex:
+        ready = list(ex.map(lambda n: _publish_until_ok(url_of(n), attempts), nodes))
+    not_ready = [n for n, ok in zip(nodes, ready) if not ok]
+    return Result(
+        "WARMUP",
+        not not_ready,
+        f"{len(nodes) - len(not_ready)}/{len(nodes)} nodes publishing"
+        + (f"; never ready: {not_ready[:5]}" if not_ready else ""),
+    )
+
+
+def scenario_rate_limit(nodes: list[str], msg_limit: int, tolerance: int = 3) -> Result:
+    """Per-node burst of msg_limit+3 messages within one epoch.
+
+    The RLN invariant being checked:
+      (a) a node must NEVER publish more than msg_limit in one epoch, and
+      (b) the node must enforce a 500 ceiling once the quota is exhausted.
+
+    Transient HTTP errors under concurrent load can lower the accepted count
+    below msg_limit — that does NOT violate the invariant, so we accept
+    successes in [msg_limit - tolerance, msg_limit]. successes > msg_limit OR
+    a 200 after the first 500 means the epoch rolled mid-burst (raise
+    RLN_RELAY_EPOCH_SEC) — reported as a timing skew, not an RLN failure.
+
+    Classification is first-match per node: the timing-skew test runs before
+    the "no 500 ceiling" test, so a node that accepts more than msg_limit
+    messages and never returns 500 is also reported as a timing skew.
+
+    Returns a RATE_LIMIT Result that passes only when no node was classified
+    as a rate failure or a timing skew."""
+    # Cap concurrency: firing len(nodes)*(msg_limit+3) publishes all at once
+    # saturates small CI runners (2 vCPU) and causes publish-path timeouts
+    # that masquerade as rate-limit failures.
+    with cf.ThreadPoolExecutor(max_workers=min(5, len(nodes))) as ex:
+        per_node = list(
+            ex.map(lambda n: _burst_until_blocked(url_of(n), msg_limit), nodes)
+        )
+
+    rate_failures = []   # genuine RLN misbehaviour
+    timing_skews = []    # epoch straddled mid-burst — inconclusive
+    for node, (n_200, n_500, n_err, after_block) in zip(nodes, per_node):
+        # Checked first, so it takes precedence over both branches below.
+        if n_200 > msg_limit or after_block:
+            timing_skews.append(
+                (node, f"{n_200} ok, epoch rolled mid-burst (raise epoch_sec)")
+            )
+        # Reached only when n_200 <= msg_limit: the node never answered 500.
+        elif n_500 == 0:
+            rate_failures.append((node, f"no 500 ceiling ({n_200} ok, {n_err} err)"))
+        # A 500 was seen, but too few publishes were accepted before it.
+        elif n_200 < msg_limit - tolerance:
+            rate_failures.append(
+                (node, f"only {n_200}/{msg_limit} ok ({n_err} transport err)")
+            )
+
+    # Only timing skews: still a failing Result, but labelled inconclusive.
+    if timing_skews and not rate_failures:
+        return Result(
+            "RATE_LIMIT",
+            False,
+            f"INCONCLUSIVE (timing) — raise RLN_RELAY_EPOCH_SEC; "
+            f"{len(timing_skews)} node(s) straddled an epoch: {timing_skews[:3]}",
+        )
+    ok = not rate_failures and not timing_skews
+    good = len(nodes) - len(rate_failures) - len(timing_skews)
+    return Result(
+        "RATE_LIMIT",
+        ok,
+        f"{good}/{len(nodes)} nodes enforced <= {msg_limit} then 500 "
+        f"(tolerance {tolerance} for transport noise)"
+        + (f"; rate failures: {rate_failures[:3]}" if rate_failures else "")
+        + (f"; timing skews: {timing_skews[:3]}" if timing_skews else ""),
+    )
+
+
+def scenario_propagation(
+    sender: str, receivers: list[str], settle_sec: int = 5
+) -> Result:
+    """Send one message on `sender`, expect it visible in every receiver's
+    REST inbox within settle_sec."""
+    marker = f"propagation-marker-{time.time_ns()}".encode()
+    code = waku_publish(url_of(sender), marker)
+    if code != 200:
+        return Result("PROPAGATION", False, f"sender publish returned {code}")
+
+    time.sleep(settle_sec)
+    missing = []
+    with cf.ThreadPoolExecutor(max_workers=min(32, len(receivers))) as ex:
+        inboxes = list(ex.map(waku_get_messages, [url_of(r) for r in receivers]))
+
+    encoded_marker = base64.b64encode(marker).decode().rstrip("=")
+    for r, inbox in zip(receivers, inboxes):
+        if inbox is None:
+            missing.append((r, "GET failed"))
+            continue
+        # Look for our marker payload in any message
+        found = any(
+            (m.get("payload") or "").rstrip("=") == encoded_marker
+            for m in inbox
+        )
+        if not found:
+            missing.append((r, f"{len(inbox)} msgs, marker not present"))
+
+    return Result(
+        "PROPAGATION",
+        not missing,
+        f"{len(receivers) - len(missing)}/{len(receivers)} receivers got the message"
+        + (f"; missing on {missing[:3]}" if missing else ""),
+    )
+
+
+def scenario_epoch_reset(nodes: list[str], epoch_sec: int) -> Result:
+    """After epoch_sec + slack, each node can send 1 more message — expect 200."""
+    sleep_s = epoch_sec + 3
+    print(f"        sleeping {sleep_s}s for epoch reset...")
+    time.sleep(sleep_s)
+    with cf.ThreadPoolExecutor(max_workers=len(nodes)) as ex:
+        codes = list(
+            ex.map(
+                lambda n: waku_publish(url_of(n), b"post-epoch"),
+                nodes,
+            )
+        )
+    bad = [(n, c) for n, c in zip(nodes, codes) if c != 200]
+    return Result(
+        "EPOCH_RESET",
+        not bad,
+        f"{sum(c == 200 for c in codes)}/{len(nodes)} returned 200 after epoch reset"
+        + (f"; failing: {bad[:3]}" if bad else ""),
+    )
+
+
+# ---------------------------------------------------------------------------
+# main
+# ---------------------------------------------------------------------------
+
+def main() -> int:
+    ap = argparse.ArgumentParser(description=__doc__)
+    ap.add_argument("--hostname-prefix", default="logos-delivery-simulator-nwaku-")
+    ap.add_argument("--num-nodes", type=int, default=30)
+    ap.add_argument("--msg-limit", type=int, default=30,
+                    help="Must match RLN_RELAY_MSG_LIMIT in simulator .env")
+    ap.add_argument("--epoch-sec", type=int, default=15,
+                    help="Must match RLN_RELAY_EPOCH_SEC in simulator .env")
+    ap.add_argument("--health-deadline-sec", type=int, default=180)
+    args = ap.parse_args()
+
+    nodes = [f"{args.hostname_prefix}{i}" for i in range(1, args.num_nodes + 1)]
+    print(f"Testing {len(nodes)} nodes: {nodes[0]} … {nodes[-1]}")
+    print(f"Config: msg_limit={args.msg_limit}, epoch_sec={args.epoch_sec}")
+    print()
+
+    results: list[Result] = []
+
+    def run(scenario_fn, *fn_args, **fn_kwargs) -> bool:
+        r = scenario_fn(*fn_args, **fn_kwargs)
+        results.append(r)
+        print(r)
+        return r.ok
+
+    if not run(scenario_health, nodes, deadline_sec=args.health_deadline_sec):
+        print("\nABORTING — nodes never reached healthy state.")
+        return _summarize(results)
+
+    if not run(scenario_subscribe, nodes):
+        print("\nABORTING — could not subscribe nodes to pubsub topic.")
+        return _summarize(results)
+
+    # Readiness gate: wait out mesh-formation churn before judging behaviour.
+    if not run(scenario_warmup, nodes):
+        print("\nABORTING — fleet never reached a publishable state.")
+        return _summarize(results)
+
+    run(scenario_propagation, nodes[0], nodes[1:])
+    # Rate limit: per-node burst, asserts exactly msg_limit then 500.
+    # Requires epoch_sec large enough that the burst can't straddle an epoch.
+    run(scenario_rate_limit, nodes, args.msg_limit)
+    run(scenario_epoch_reset, nodes, args.epoch_sec)
+
+    return _summarize(results)
+
+
+def _summarize(results: list[Result]) -> int:
+    print()
+    print("=" * 64)
+    passed = sum(r.ok for r in results)
+    print(f"  {passed}/{len(results)} scenarios passed")
+    for r in results:
+        print(f"    {r}")
+    print("=" * 64)
+    return len(results) - passed
+
+
+if __name__ == "__main__":
+    sys.exit(main())
--- a/tests/simulator/rln-sim.env
+++ b/tests/simulator/rln-sim.env
@ -0,0 +1,6 @@
+# Source of truth for the RLN simulator E2E run (ci-rln-simulator.yml).
+# workflow_dispatch / workflow_call inputs override any value here per-run (blank input = use this file).
+BRANCH=master
+NUM_NODES=6
+MSG_LIMIT=30
+EPOCH_SEC=120