mirror of
https://github.com/logos-messaging/logos-messaging-nim.git
synced 2026-05-23 10:49:33 +00:00
ci: add daily rln simulator e2e workflow (#3885)
This commit is contained in:
parent
04ef12ccf3
commit
67eebe3a02
5
.github/workflows/ci-daily.yml
vendored
5
.github/workflows/ci-daily.yml
vendored
@ -77,3 +77,8 @@ jobs:
|
||||
}" \
|
||||
"$DISCORD_WEBHOOK_URL"
|
||||
|
||||
# Daily RLN end-to-end run against the simulator. No inputs are passed, so the
# called workflow falls back to the defaults in tests/simulator/rln-sim.env.
rln-simulator:
  secrets: inherit
  uses: "./.github/workflows/ci-rln-simulator.yml"
|
||||
|
||||
|
||||
271
.github/workflows/ci-rln-simulator.yml
vendored
Normal file
271
.github/workflows/ci-rln-simulator.yml
vendored
Normal file
@ -0,0 +1,271 @@
|
||||
name: RLN E2E — Simulator
|
||||
|
||||
# Validates the full RLN flow end-to-end against logos-delivery-simulator:
|
||||
# keystore generation, on-chain registration, gossipsub propagation,
|
||||
# per-epoch rate-limit enforcement, and epoch-boundary recovery.
|
||||
#
|
||||
# Why this exists: logos-dev runs with RLN disabled, so there is no
|
||||
# production traffic exercising RLN. Until RLN is enabled there, this is
|
||||
# the only end-to-end coverage of the RLN + zerokit path.
|
||||
#
|
||||
# The image is built ON the runner and tested ON the same runner, so the
|
||||
# AVX-512 portability issue in container-image.yml does not apply here.
|
||||
#
|
||||
# No own schedule: ci-daily.yml is the single daily entry point and calls
|
||||
# this via workflow_call. workflow_dispatch allows manual runs.
|
||||
# Run defaults live in tests/simulator/rln-sim.env; inputs override per-run.
|
||||
|
||||
on:
  # Reusable entry point (invoked from ci-daily.yml). Every input is an
  # optional string; an empty value means "use the default from rln-sim.env".
  workflow_call:
    inputs:
      branch:
        type: string
        default: ""
      num_nodes:
        type: string
        default: ""
      msg_limit:
        type: string
        default: ""
      epoch_sec:
        type: string
        default: ""

  # Manual entry point. Same four inputs, with descriptions for the run form.
  workflow_dispatch:
    inputs:
      branch:
        description: "logos-delivery branch to build & test (blank = use rln-sim.env)"
        type: string
        default: ""
      num_nodes:
        description: "Number of nwaku nodes (blank = use rln-sim.env)"
        type: string
        default: ""
      msg_limit:
        description: "RLN_RELAY_MSG_LIMIT, must be >= contract min ~20 (blank = use rln-sim.env)"
        type: string
        default: ""
      epoch_sec:
        description: "RLN_RELAY_EPOCH_SEC, large enough a burst cannot straddle an epoch (blank = use rln-sim.env)"
        type: string
        default: ""
|
||||
|
||||
# Workflow-wide environment: build parallelism and pinned toolchain versions.
# Values are quoted so they are always read as strings (e.g. "2.2.4" must not
# be retyped by a YAML parser).
env:
  NPROC: "2"
  MAKEFLAGS: "-j2"
  NIM_VERSION: "2.2.4"
  NIMBLE_VERSION: "0.22.3"
|
||||
|
||||
jobs:
  # Single job: build wakunode2 from the requested branch, package it into a
  # local Docker image, start the simulator stack on that image and run the
  # scenarios in tests/simulator/rln-e2e-test.py against it.
  rln-e2e:
    runs-on: ubuntu-22.04
    timeout-minutes: 120
    name: rln-e2e

    steps:
      # First checkout: the ref that triggered this workflow (CI branch /
      # master). This is where the e2e test script and rln-sim.env live —
      # the build branch may not contain them.
      - name: Checkout CI ref (for the test script)
        uses: actions/checkout@v4
        with:
          submodules: false

      # Defaults come from tests/simulator/rln-sim.env (single source of truth);
      # a non-blank input (dispatch or workflow_call) overrides the matching value.
      # The resolved values are validated here because later steps substitute
      # them into shell commands and into the simulator .env file.
      - name: Resolve parameters
        id: cfg
        env:
          IN_BRANCH: ${{ inputs.branch }}
          IN_NUM_NODES: ${{ inputs.num_nodes }}
          IN_MSG_LIMIT: ${{ inputs.msg_limit }}
          IN_EPOCH_SEC: ${{ inputs.epoch_sec }}
        run: |
          set -euo pipefail
          set -a; . tests/simulator/rln-sim.env; set +a
          branch="${IN_BRANCH:-$BRANCH}"
          num_nodes="${IN_NUM_NODES:-$NUM_NODES}"
          msg_limit="${IN_MSG_LIMIT:-$MSG_LIMIT}"
          epoch_sec="${IN_EPOCH_SEC:-$EPOCH_SEC}"

          # A ref name can never contain whitespace; this also blocks newline
          # injection into $GITHUB_OUTPUT.
          case "$branch" in
            ''|*[[:space:]]*)
              echo "::error::branch must be a non-empty ref name without whitespace"
              exit 1
              ;;
          esac
          # Numeric knobs must be plain positive integers (no sign, no leading 0).
          for kv in "num_nodes=$num_nodes" "msg_limit=$msg_limit" "epoch_sec=$epoch_sec"; do
            case "${kv#*=}" in
              ''|0*|*[!0-9]*)
                echo "::error::${kv%%=*} must be a positive integer, got '${kv#*=}'"
                exit 1
                ;;
            esac
          done

          {
            echo "branch=$branch"
            echo "num_nodes=$num_nodes"
            echo "msg_limit=$msg_limit"
            echo "epoch_sec=$epoch_sec"
          } >> "$GITHUB_OUTPUT"

      - name: Stash e2e test script outside the workspace
        run: |
          test -f tests/simulator/rln-e2e-test.py \
            || { echo "tests/simulator/rln-e2e-test.py missing on CI ref"; exit 1; }
          cp tests/simulator/rln-e2e-test.py "$RUNNER_TEMP/rln-e2e-test.py"

      # Second checkout: the branch to build & test. Overwrites the workspace;
      # the stashed test script in RUNNER_TEMP survives.
      - name: Checkout logos-delivery (${{ steps.cfg.outputs.branch }})
        uses: actions/checkout@v4
        with:
          ref: ${{ steps.cfg.outputs.branch }}
          submodules: false
          clean: true

      # Hash of the submodule commit list, used only as the cache key below.
      - name: Get submodules hash
        id: submodules
        run: echo "hash=$(git submodule status | awk '{print $1}' | sort | shasum -a 256 | sed 's/[ -]*//g')" >> "$GITHUB_OUTPUT"

      - name: Cache submodules
        uses: actions/cache@v3
        with:
          path: |
            vendor/
            .git/modules
          key: ${{ runner.os }}-vendor-modules-${{ steps.submodules.outputs.hash }}

      - name: Install Nim ${{ env.NIM_VERSION }}
        uses: jiro4989/setup-nim-action@v2
        with:
          nim-version: ${{ env.NIM_VERSION }}
          repo-token: ${{ secrets.GITHUB_TOKEN }}

      - name: Install Nimble ${{ env.NIMBLE_VERSION }}
        run: |
          cd /tmp && nimble install "nimble@${{ env.NIMBLE_VERSION }}" -y
          echo "$HOME/.nimble/bin" >> "$GITHUB_PATH"

      - name: Cache nimble deps
        id: cache-nimbledeps
        uses: actions/cache@v3
        with:
          path: |
            nimbledeps/
            nimble.paths
          key: ${{ runner.os }}-nimbledeps-nimble${{ env.NIMBLE_VERSION }}-${{ hashFiles('nimble.lock', 'BearSSL.mk', 'Nat.mk') }}

      - name: Install nimble deps
        if: steps.cache-nimbledeps.outputs.cache-hit != 'true'
        run: |
          nimble setup --localdeps -y
          make rebuild-nat-libs-nimbledeps
          make rebuild-bearssl-nimbledeps
          touch nimbledeps/.nimble-setup

      - name: Build wakunode2
        run: |
          make -j${NPROC} V=1 POSTGRES=1 \
            NIMFLAGS="-d:disableMarchNative -d:chronicles_colors:none" \
            wakunode2

      - name: Build local Docker image
        run: |
          docker build -t nwaku-rln-ci:test -f docker/binaries/Dockerfile.bn.amd64 .

      - name: Clone logos-delivery-simulator
        run: |
          git clone --depth 1 https://github.com/logos-messaging/logos-delivery-simulator.git "$RUNNER_TEMP/logos-delivery-simulator"

      # Resolved values reach the script through env, not ${{ }} substitution,
      # so they are expanded by the shell as data rather than spliced into code.
      # NOTE(review): PRIVATE_KEY / ETH_FROM look like the well-known default
      # Foundry/Anvil dev account, not a real secret — confirm.
      - name: Write simulator .env
        working-directory: ${{ runner.temp }}/logos-delivery-simulator
        env:
          NUM_NODES: ${{ steps.cfg.outputs.num_nodes }}
          MSG_LIMIT: ${{ steps.cfg.outputs.msg_limit }}
          EPOCH_SEC: ${{ steps.cfg.outputs.epoch_sec }}
        run: |
          cat > .env <<EOF
          LD_IMAGE=nwaku-rln-ci:test
          NUM_LD_NODES=${NUM_NODES}
          MSG_SIZE_KBYTES=1
          TRAFFIC_DELAY_SECONDS=5
          RLN_RELAY_EPOCH_SEC=${EPOCH_SEC}
          RLN_RELAY_MSG_LIMIT=${MSG_LIMIT}
          MAX_MESSAGE_LIMIT=100
          RPC_URL=http://foundry:8545
          PRIVATE_KEY=0xac0974bec39a17e36ba4a6b4d238ff944bacb478cbed5efcae784d7bf4f2ff80
          ETH_FROM=0xf39fd6e51aad88f6f4ce6ab8827279cfffb92266
          RLN_CONTRACT_REPO_COMMIT=e75ac913e579ad872f54b2225eec35d1de3d98b0
          WATCHTOWER_ENABLED=false
          EOF

      - name: Bring up simulator (RLN subset)
        working-directory: ${{ runner.temp }}/logos-delivery-simulator
        run: |
          docker compose up -d foundry contract-repo-deployer nwaku-token-init bootstrap nwaku

      # Poll (60 x 15s) until the deployer container has exited, then require
      # exit code 0. A container that is still running reports ExitCode 0, so
      # the final status must be checked before the exit code is trusted.
      - name: Wait for contract deployer
        working-directory: ${{ runner.temp }}/logos-delivery-simulator
        run: |
          deployer=logos-delivery-simulator-contract-repo-deployer-1
          st=missing
          for _ in $(seq 1 60); do
            st=$(docker inspect "$deployer" --format='{{.State.Status}}' 2>/dev/null || echo missing)
            [ "$st" = "exited" ] && break
            echo "deployer status: $st"; sleep 15
          done
          if [ "$st" != "exited" ]; then
            echo "::error::deployer did not finish in time (last status: $st)"
            docker logs "$deployer" 2>&1 | tail -50 || true
            exit 1
          fi
          ec=$(docker inspect "$deployer" --format='{{.State.ExitCode}}')
          echo "deployer exit code: $ec"
          if [ "$ec" != "0" ]; then
            docker logs "$deployer" 2>&1 | tail -50
            exit 1
          fi

      - name: Wait for nwaku fleet to register
        working-directory: ${{ runner.temp }}/logos-delivery-simulator
        env:
          NUM_NODES: ${{ steps.cfg.outputs.num_nodes }}
        run: |
          up=0
          for _ in $(seq 1 60); do
            # The docker name filter is a substring match and would also hit
            # the nwaku-token-init container, so keep only numbered replicas.
            # grep -c exits 1 on a zero count; "|| true" keeps the step alive.
            up=$(docker ps --filter 'name=logos-delivery-simulator-nwaku-' --filter 'status=running' --format '{{.Names}}' \
              | grep -cE -- '-nwaku-[0-9]+$' || true)
            echo "nwaku running: $up/$NUM_NODES"
            [ "$up" -ge "$NUM_NODES" ] && break
            sleep 15
          done
          if [ "$up" -lt "$NUM_NODES" ]; then
            echo "::error::only $up/$NUM_NODES nwaku containers are running"
            exit 1
          fi
          # nwaku-1 must reach the "registered + started" marker
          timeout 300 docker logs -f logos-delivery-simulator-nwaku-1 2>&1 \
            | grep -m1 -E "Segmentation fault|Illegal instruction|Failed to register on-chain|I am a nwaku node" \
            | tee /tmp/nwaku1.verdict
          grep -q "I am a nwaku node" /tmp/nwaku1.verdict

      # The test runs as a sidecar on the simulator network so node hostnames
      # resolve. The numeric arguments are expanded by the outer shell from env.
      - name: Run RLN e2e scenarios
        env:
          NUM_NODES: ${{ steps.cfg.outputs.num_nodes }}
          MSG_LIMIT: ${{ steps.cfg.outputs.msg_limit }}
          EPOCH_SEC: ${{ steps.cfg.outputs.epoch_sec }}
        run: |
          TEST_SCRIPT="$RUNNER_TEMP/rln-e2e-test.py"
          test -f "$TEST_SCRIPT" \
            || { echo "stashed test script missing at $TEST_SCRIPT"; exit 1; }
          docker run --rm \
            --network logos-delivery-simulator_simulation \
            -v "$TEST_SCRIPT:/test.py:ro" \
            python:3.11-slim \
            sh -c "pip install --quiet --disable-pip-version-check requests && \
                   python /test.py \
                     --hostname-prefix logos-delivery-simulator-nwaku- \
                     --num-nodes $NUM_NODES \
                     --msg-limit $MSG_LIMIT \
                     --epoch-sec $EPOCH_SEC \
                     --health-deadline-sec 600"

      # No working-directory here: the simulator clone may not exist if the job
      # failed before the clone step, and docker ps/logs do not need it.
      - name: Collect logs on failure
        if: failure()
        run: |
          mkdir -p "$RUNNER_TEMP/logs"
          for c in $(docker ps -a --filter 'name=logos-delivery-simulator-' --format '{{.Names}}'); do
            docker logs "$c" > "$RUNNER_TEMP/logs/$c.log" 2>&1 || true
          done

      - name: Upload logs
        if: failure()
        uses: actions/upload-artifact@v4
        with:
          name: simulator-logs
          path: ${{ runner.temp }}/logs
          retention-days: 7

      # Best-effort cleanup; a missing clone (early failure) is not an error.
      - name: Tear down
        if: always()
        run: |
          cd "$RUNNER_TEMP/logos-delivery-simulator" 2>/dev/null || exit 0
          docker compose down -v || true

      # The payload is built with jq so branch/status text is JSON-escaped.
      - name: Notify Discord
        if: always()
        env:
          DISCORD_WEBHOOK_URL: ${{ secrets.DISCORD_WEBHOOK_URL }}
          STATUS: ${{ job.status }}
          BRANCH: ${{ steps.cfg.outputs.branch }}
          RUN_URL: https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}
        run: |
          [ -z "$DISCORD_WEBHOOK_URL" ] && exit 0
          if [ "$STATUS" = "success" ]; then COLOR=3066993; TITLE="✅ RLN E2E passed"; else COLOR=15158332; TITLE="❌ RLN E2E failed"; fi
          # BRANCH is empty when "Resolve parameters" itself failed.
          payload=$(jq -n \
            --arg title "$TITLE" \
            --argjson color "$COLOR" \
            --arg branch "${BRANCH:-unknown}" \
            --arg status "$STATUS" \
            --arg url "$RUN_URL" \
            '{embeds: [{title: $title, color: $color,
                        fields: [{name: "Branch", value: $branch, inline: true},
                                 {name: "Status", value: $status, inline: true}],
                        url: $url,
                        footer: {text: "Daily RLN simulator E2E"}}]}')
          curl -H "Content-Type: application/json" -X POST -d "$payload" \
            "$DISCORD_WEBHOOK_URL"
|
||||
4
.gitignore
vendored
4
.gitignore
vendored
@ -86,3 +86,7 @@ nimbledeps
|
||||
|
||||
**/anvil_state/state-deployed-contracts-mint-and-approved.json
|
||||
.gitnexus
|
||||
|
||||
# Python bytecode from tests/simulator
|
||||
__pycache__/
|
||||
*.pyc
|
||||
|
||||
388
tests/simulator/rln-e2e-test.py
Executable file
388
tests/simulator/rln-e2e-test.py
Executable file
@ -0,0 +1,388 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
RLN end-to-end test against a running logos-delivery-simulator stack.
|
||||
|
||||
Designed to run as a sidecar container on the simulator's Docker network so
|
||||
hostnames like `logos-delivery-simulator-nwaku-1` resolve via Docker DNS.
|
||||
|
||||
Scenarios covered (in order):
  1. HEALTH       - every node responds to /debug/v1/info with an enrUri
  2. SUBSCRIBE    - every node REST-subscribes to the pubsub topic
  3. WARMUP       - every node publishes successfully at least once (retried)
  4. PROPAGATION  - one sender's message lands in all peers' inboxes
  5. RATE_LIMIT   - each node bursts msg_limit+3 messages in one epoch; at most
                    msg_limit may return 200 and a 500 must appear
  6. EPOCH_RESET  - after epoch_sec, every node can send 1 more -> 200

Not covered yet:
  SAME_MESSAGE_ID - sending same message_id twice in same epoch is the
                    slashable signal (would be verified by checking node logs)
|
||||
|
||||
Exit code:
|
||||
0 = all scenarios passed
|
||||
N = number of scenarios that failed
|
||||
|
||||
Usage (typical):
|
||||
docker run --rm \\
|
||||
--network logos-delivery-simulator_simulation \\
|
||||
-v /path/to/rln-e2e-test.py:/test.py \\
|
||||
python:3.11-slim \\
|
||||
sh -c 'pip install --quiet requests && python /test.py \\
|
||||
--hostname-prefix logos-delivery-simulator-nwaku- \\
|
||||
--num-nodes 30 --msg-limit 30 --epoch-sec 15'
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import base64
|
||||
import concurrent.futures as cf
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
import urllib.parse
|
||||
from dataclasses import dataclass
|
||||
from typing import Optional
|
||||
|
||||
import requests
|
||||
|
||||
# Pubsub topic that every node is subscribed to and published on.
PUBSUB_TOPIC = "/waku/2/rs/66/0"

# Content topic stamped on every message this script sends.
CONTENT_TOPIC = "/rln-test/1/probe/proto"
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def url_of(host: str, port: int = 8645) -> str:
    """Return the plain-HTTP base URL for ``host`` on ``port`` (default 8645)."""
    return "http://{}:{}".format(host, port)
|
||||
|
||||
|
||||
def waku_publish(node_url: str, payload: bytes, timeout: float = 5.0) -> int:
    """Publish one relay message through a node's REST API.

    The payload is base64-encoded and POSTed to the relay messages endpoint
    for PUBSUB_TOPIC, tagged with CONTENT_TOPIC and the current time in ns.

    Returns the HTTP status code, or -1 if the request raised a
    ``requests.RequestException`` (timeout, connection error, ...).
    """
    message = {
        "payload": base64.b64encode(payload).decode("ascii"),
        "contentTopic": CONTENT_TOPIC,
        "version": 1,
        "timestamp": time.time_ns(),
    }
    # The topic contains slashes, so it is fully percent-encoded for the path.
    topic_path = urllib.parse.quote(PUBSUB_TOPIC, safe="")
    endpoint = f"{node_url}/relay/v1/messages/{topic_path}"
    try:
        response = requests.post(
            endpoint,
            json=message,
            timeout=timeout,
            headers={"content-type": "application/json"},
        )
    except requests.RequestException:
        return -1
    return response.status_code
|
||||
|
||||
|
||||
def waku_subscribe(node_url: str, timeout: float = 5.0) -> int:
    """Subscribe a node to PUBSUB_TOPIC through its REST API.

    Returns the HTTP status code, or -1 if the request raised a
    ``requests.RequestException``.
    """
    endpoint = f"{node_url}/relay/v1/subscriptions"
    try:
        response = requests.post(
            endpoint,
            json=[PUBSUB_TOPIC],
            timeout=timeout,
            headers={"content-type": "application/json"},
        )
    except requests.RequestException:
        return -1
    return response.status_code
|
||||
|
||||
|
||||
def waku_get_messages(node_url: str, timeout: float = 5.0) -> Optional[list]:
    """Fetch the messages a node currently holds for PUBSUB_TOPIC.

    Returns the decoded JSON body on HTTP 200. Returns None on any other
    status, on a request error, or when the body is not valid JSON.
    """
    topic_path = urllib.parse.quote(PUBSUB_TOPIC, safe="")
    endpoint = f"{node_url}/relay/v1/messages/{topic_path}"
    try:
        response = requests.get(endpoint, timeout=timeout)
        if response.status_code == 200:
            return response.json()
        return None
    except (requests.RequestException, json.JSONDecodeError):
        return None
|
||||
|
||||
|
||||
def node_healthy(node_url: str, timeout: float = 3.0) -> bool:
    """Return True when /debug/v1/info answers 200 with an ``enrUri`` key.

    Request errors and undecodable JSON count as "not healthy".
    """
    try:
        response = requests.get(f"{node_url}/debug/v1/info", timeout=timeout)
        if response.status_code != 200:
            return False
        return "enrUri" in response.json()
    except (requests.RequestException, json.JSONDecodeError):
        return False
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# scenarios
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
@dataclass
class Result:
    """Outcome of one scenario: its name, pass/fail flag and optional detail."""

    name: str
    ok: bool
    detail: str = ""

    def __str__(self) -> str:
        """Render as ``[PASS] NAME`` or ``[FAIL] NAME``, plus `` — detail`` if set."""
        label = "PASS" if self.ok else "FAIL"
        headline = f"[{label}] {self.name}"
        if not self.detail:
            return headline
        return headline + f" — {self.detail}"
|
||||
|
||||
|
||||
def scenario_health(nodes: list[str], deadline_sec: int = 120) -> Result:
    """HEALTH: poll until every node passes ``node_healthy`` or time runs out.

    Only the nodes that are still failing are re-probed on each round (in
    parallel, at most 32 at a time), with a 3 second pause between rounds.
    The result lists up to five nodes that never became healthy.
    """
    started = time.time()
    pending = list(nodes)
    while pending and time.time() - started < deadline_sec:
        with cf.ThreadPoolExecutor(max_workers=min(32, len(pending))) as pool:
            verdicts = list(pool.map(node_healthy, [url_of(n) for n in pending]))
        pending = [n for n, healthy in zip(pending, verdicts) if not healthy]
        if pending:
            time.sleep(3)

    detail = f"{len(nodes) - len(pending)}/{len(nodes)} healthy"
    if pending:
        detail += f"; failing: {pending[:5]}"
    return Result("HEALTH", not pending, detail)
|
||||
|
||||
|
||||
def scenario_subscribe(nodes: list[str]) -> Result:
    """SUBSCRIBE: REST-subscribe every node to the pubsub topic.

    Subscribing is what makes later GETs return cached messages. Any status
    other than 200 (including -1 for a request error) is a failure; up to
    five failing (node, code) pairs are reported.
    """
    urls = [url_of(n) for n in nodes]
    with cf.ThreadPoolExecutor(max_workers=min(32, len(nodes))) as pool:
        statuses = list(pool.map(waku_subscribe, urls))

    failed = [(n, status) for n, status in zip(nodes, statuses) if status != 200]
    detail = f"{len(nodes) - len(failed)}/{len(nodes)} subscribed"
    if failed:
        detail += f"; failing: {failed[:5]}"
    return Result("SUBSCRIBE", not failed, detail)
|
||||
|
||||
|
||||
def _send_n(node_url: str, n: int) -> list[int]:
|
||||
codes = []
|
||||
for i in range(n):
|
||||
codes.append(waku_publish(node_url, f"probe-{i}".encode()))
|
||||
return codes
|
||||
|
||||
|
||||
def _burst_until_blocked(node_url: str, msg_limit: int, overshoot: int = 3):
|
||||
"""Send msg_limit+overshoot messages back-to-back, fast, recording codes.
|
||||
Designed to complete inside a single epoch — keep epoch_sec large enough
|
||||
that this burst can't straddle an epoch boundary.
|
||||
|
||||
Returns (n_200, n_500, n_transport_err, two_hundred_after_block) where
|
||||
two_hundred_after_block flags a 200 appearing AFTER the first 500 (i.e.
|
||||
quota reset mid-burst => epoch straddle)."""
|
||||
codes = []
|
||||
for i in range(msg_limit + overshoot):
|
||||
codes.append(waku_publish(node_url, f"burst-{i}".encode(), timeout=10.0))
|
||||
n_200 = sum(c == 200 for c in codes)
|
||||
n_500 = sum(c == 500 for c in codes)
|
||||
n_err = sum(c not in (200, 500) for c in codes) # -1, 4xx transient, etc.
|
||||
first_block_idx = next((i for i, c in enumerate(codes) if c == 500), None)
|
||||
two_hundred_after_block = (
|
||||
first_block_idx is not None
|
||||
and any(c == 200 for c in codes[first_block_idx + 1:])
|
||||
)
|
||||
return n_200, n_500, n_err, two_hundred_after_block
|
||||
|
||||
|
||||
def _publish_until_ok(node_url: str, attempts: int = 20, spacing: float = 5.0) -> bool:
|
||||
"""Retry a single publish until it returns 200 or attempts run out.
|
||||
Tolerates the post-startup window where discv5/gossipsub mesh is still
|
||||
forming and the RLN publish path transiently 500s."""
|
||||
for _ in range(attempts):
|
||||
if waku_publish(node_url, b"warmup", timeout=10.0) == 200:
|
||||
return True
|
||||
time.sleep(spacing)
|
||||
return False
|
||||
|
||||
|
||||
def scenario_warmup(nodes: list[str], attempts: int = 20) -> Result:
    """WARMUP: readiness gate — every node must publish successfully once.

    Waiting here absorbs mesh-formation churn so that PROPAGATION and
    RATE_LIMIT do not judge a fleet that is not yet connected. Each node
    spends one nonce on this, which is well within msg_limit and is covered
    by RATE_LIMIT's tolerance.
    """
    def _ready(node: str) -> bool:
        return _publish_until_ok(url_of(node), attempts)

    with cf.ThreadPoolExecutor(max_workers=min(8, len(nodes))) as pool:
        flags = list(pool.map(_ready, nodes))

    stuck = [n for n, is_ready in zip(nodes, flags) if not is_ready]
    detail = f"{len(nodes) - len(stuck)}/{len(nodes)} nodes publishing"
    if stuck:
        detail += f"; never ready: {stuck[:5]}"
    return Result("WARMUP", not stuck, detail)
|
||||
|
||||
|
||||
def scenario_rate_limit(nodes: list[str], msg_limit: int, tolerance: int = 3) -> Result:
    """RATE_LIMIT: burst msg_limit+3 messages per node within one epoch.

    Invariant under test:
      (a) a node never gets more than msg_limit publishes accepted per epoch;
      (b) once the quota is used up the node answers 500.

    Transient HTTP errors under load may push the accepted count below
    msg_limit without violating the invariant, so any count in
    [msg_limit - tolerance, msg_limit] is accepted. More than msg_limit
    accepted, or a 200 after the first 500, means the epoch rolled over
    mid-burst (raise RLN_RELAY_EPOCH_SEC); that is reported as a timing skew
    rather than an RLN failure, but the scenario still does not pass.
    """
    def _burst(node: str):
        return _burst_until_blocked(url_of(node), msg_limit)

    # Keep concurrency low: launching every node's burst at once saturates
    # small (2 vCPU) CI runners, and the resulting publish timeouts look like
    # rate-limit failures.
    with cf.ThreadPoolExecutor(max_workers=min(5, len(nodes))) as pool:
        tallies = list(pool.map(_burst, nodes))

    rate_failures = []  # genuine RLN misbehaviour
    timing_skews = []   # epoch straddled mid-burst — inconclusive
    floor = msg_limit - tolerance
    for node, tally in zip(nodes, tallies):
        n_200, n_500, n_err, after_block = tally
        if after_block or n_200 > msg_limit:
            timing_skews.append(
                (node, f"{n_200} ok, epoch rolled mid-burst (raise epoch_sec)")
            )
            continue
        if n_500 == 0:
            rate_failures.append((node, f"no 500 ceiling ({n_200} ok, {n_err} err)"))
            continue
        if n_200 < floor:
            rate_failures.append(
                (node, f"only {n_200}/{msg_limit} ok ({n_err} transport err)")
            )

    # Only timing skews and no real failure: call the run inconclusive.
    if not rate_failures and timing_skews:
        return Result(
            "RATE_LIMIT",
            False,
            f"INCONCLUSIVE (timing) — raise RLN_RELAY_EPOCH_SEC; "
            f"{len(timing_skews)} node(s) straddled an epoch: {timing_skews[:3]}",
        )

    good = len(nodes) - len(rate_failures) - len(timing_skews)
    detail = (
        f"{good}/{len(nodes)} nodes enforced <= {msg_limit} then 500 "
        f"(tolerance {tolerance} for transport noise)"
    )
    if rate_failures:
        detail += f"; rate failures: {rate_failures[:3]}"
    if timing_skews:
        detail += f"; timing skews: {timing_skews[:3]}"
    return Result("RATE_LIMIT", not (rate_failures or timing_skews), detail)
|
||||
|
||||
|
||||
def scenario_propagation(
    sender: str, receivers: list[str], settle_sec: int = 5
) -> Result:
    """PROPAGATION: publish one marker on ``sender`` and look for it elsewhere.

    A unique marker payload is published on ``sender``. After ``settle_sec``
    seconds every receiver's REST inbox is fetched in parallel and scanned
    for that payload.

    Args:
        sender: hostname of the publishing node.
        receivers: hostnames expected to receive the message. May be empty
            (single-node run); nothing is then checked beyond the publish.
        settle_sec: seconds to wait between publishing and reading inboxes.

    Returns:
        A ``Result`` named "PROPAGATION". It fails when the publish does not
        return 200 or when any receiver's GET fails or lacks the marker; up
        to three offending receivers are listed in the detail.
    """
    marker = f"propagation-marker-{time.time_ns()}".encode()
    code = waku_publish(url_of(sender), marker)
    if code != 200:
        return Result("PROPAGATION", False, f"sender publish returned {code}")

    # With no receivers there is nothing to wait for, and ThreadPoolExecutor
    # rejects max_workers=0, so report the vacuous pass directly.
    if not receivers:
        return Result("PROPAGATION", True, "0/0 receivers got the message")

    time.sleep(settle_sec)
    with cf.ThreadPoolExecutor(max_workers=min(32, len(receivers))) as ex:
        inboxes = list(ex.map(waku_get_messages, [url_of(r) for r in receivers]))

    # Compare without base64 padding so padded and unpadded payloads match.
    encoded_marker = base64.b64encode(marker).decode().rstrip("=")

    def _carries_marker(msg) -> bool:
        # Skip entries that are not JSON objects or whose payload is not text
        # instead of crashing the whole scenario on an unexpected body.
        if not isinstance(msg, dict):
            return False
        payload = msg.get("payload")
        return isinstance(payload, str) and payload.rstrip("=") == encoded_marker

    missing = []
    for r, inbox in zip(receivers, inboxes):
        if inbox is None:
            missing.append((r, "GET failed"))
            continue
        if not any(_carries_marker(m) for m in inbox):
            missing.append((r, f"{len(inbox)} msgs, marker not present"))

    return Result(
        "PROPAGATION",
        not missing,
        f"{len(receivers) - len(missing)}/{len(receivers)} receivers got the message"
        + (f"; missing on {missing[:3]}" if missing else ""),
    )
|
||||
|
||||
|
||||
def scenario_epoch_reset(nodes: list[str], epoch_sec: int) -> Result:
    """EPOCH_RESET: after a full epoch every node must accept one publish.

    Sleeps ``epoch_sec + 3`` seconds (3 s of slack), then publishes
    ``b"post-epoch"`` once on every node in parallel and expects 200 from
    each one.

    Args:
        nodes: hostnames of the nodes to probe.
        epoch_sec: RLN epoch length in seconds (must match the simulator).

    Returns:
        A ``Result`` named "EPOCH_RESET"; up to three failing (node, code)
        pairs are listed in the detail.
    """
    sleep_s = epoch_sec + 3
    print(f" sleeping {sleep_s}s for epoch reset...")
    time.sleep(sleep_s)

    # Cap the pool like the other scenarios do (and keep it >= 1, since
    # ThreadPoolExecutor rejects max_workers=0 for an empty node list).
    workers = max(1, min(32, len(nodes)))
    with cf.ThreadPoolExecutor(max_workers=workers) as ex:
        codes = list(
            ex.map(lambda n: waku_publish(url_of(n), b"post-epoch"), nodes)
        )

    bad = [(n, c) for n, c in zip(nodes, codes) if c != 200]
    return Result(
        "EPOCH_RESET",
        not bad,
        f"{sum(c == 200 for c in codes)}/{len(nodes)} returned 200 after epoch reset"
        + (f"; failing: {bad[:3]}" if bad else ""),
    )
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# main
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def main() -> int:
    """Parse CLI arguments, run the scenarios in order, print a summary.

    HEALTH, SUBSCRIBE and WARMUP are gates: if one fails the run stops there.
    PROPAGATION, RATE_LIMIT and EPOCH_RESET always run once the gates pass.

    Returns:
        The number of failed scenarios (0 when everything passed), intended
        to be used as the process exit code. Invalid arguments make argparse
        exit with its usual status 2.
    """
    ap = argparse.ArgumentParser(
        description=__doc__,
        # The module docstring holds aligned lists and a usage snippet; keep
        # its line breaks instead of letting argparse reflow it.
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    ap.add_argument("--hostname-prefix", default="logos-delivery-simulator-nwaku-")
    ap.add_argument("--num-nodes", type=int, default=30)
    ap.add_argument("--msg-limit", type=int, default=30,
                    help="Must match RLN_RELAY_MSG_LIMIT in simulator .env")
    ap.add_argument("--epoch-sec", type=int, default=15,
                    help="Must match RLN_RELAY_EPOCH_SEC in simulator .env")
    ap.add_argument("--health-deadline-sec", type=int, default=180)
    args = ap.parse_args()

    # At least one node is needed: nodes[0] is the propagation sender.
    if args.num_nodes < 1:
        ap.error("--num-nodes must be >= 1")

    nodes = [f"{args.hostname_prefix}{i}" for i in range(1, args.num_nodes + 1)]
    print(f"Testing {len(nodes)} nodes: {nodes[0]} … {nodes[-1]}")
    print(f"Config: msg_limit={args.msg_limit}, epoch_sec={args.epoch_sec}")
    print()

    results: list[Result] = []

    def run(scenario_fn, *fn_args, **fn_kwargs) -> bool:
        # Run one scenario, record and print its Result, report pass/fail.
        r = scenario_fn(*fn_args, **fn_kwargs)
        results.append(r)
        print(r)
        return r.ok

    if not run(scenario_health, nodes, deadline_sec=args.health_deadline_sec):
        print("\nABORTING — nodes never reached healthy state.")
        return _summarize(results)

    if not run(scenario_subscribe, nodes):
        print("\nABORTING — could not subscribe nodes to pubsub topic.")
        return _summarize(results)

    # Readiness gate: wait out mesh-formation churn before judging behaviour.
    if not run(scenario_warmup, nodes):
        print("\nABORTING — fleet never reached a publishable state.")
        return _summarize(results)

    run(scenario_propagation, nodes[0], nodes[1:])
    # Rate limit: per-node burst; at most msg_limit publishes may be accepted
    # (a small shortfall is tolerated) and a 500 must follow.
    # Requires epoch_sec large enough that the burst can't straddle an epoch.
    run(scenario_rate_limit, nodes, args.msg_limit)
    run(scenario_epoch_reset, nodes, args.epoch_sec)

    return _summarize(results)
|
||||
|
||||
|
||||
def _summarize(results: list[Result]) -> int:
|
||||
print()
|
||||
print("=" * 64)
|
||||
passed = sum(r.ok for r in results)
|
||||
print(f" {passed}/{len(results)} scenarios passed")
|
||||
for r in results:
|
||||
print(f" {r}")
|
||||
print("=" * 64)
|
||||
return len(results) - passed
|
||||
|
||||
|
||||
# Script entry point: the process exit status is the number of failed scenarios.
if __name__ == "__main__":
    raise SystemExit(main())
|
||||
6
tests/simulator/rln-sim.env
Normal file
6
tests/simulator/rln-sim.env
Normal file
@ -0,0 +1,6 @@
|
||||
# Source of truth for the RLN simulator E2E run (ci-rln-simulator.yml).
|
||||
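# Non-blank workflow_dispatch or workflow_call inputs override any value here
# per-run (blank input = use this file). The workflow sources this file from a
# shell, so keep it to plain KEY=VALUE lines.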
|
||||
BRANCH=master
|
||||
NUM_NODES=6
|
||||
MSG_LIMIT=30
|
||||
EPOCH_SEC=120
|
||||
Loading…
x
Reference in New Issue
Block a user