From 222436ed8dd090d5c1ee2cd9c302c70509fd96cd Mon Sep 17 00:00:00 2001 From: andrussal Date: Thu, 18 Dec 2025 17:26:02 +0100 Subject: [PATCH] Reorganize scripts into subdirectories Move helper scripts under scripts/{run,build,setup,ops,lib} and update all references across docs, CI, Docker, and Rust call sites. --- README.md | 14 +- book/src/SUMMARY.md | 12 +- book/src/annotated-tree.md | 2 +- book/src/architecture-overview.md | 10 +- book/src/authoring-scenarios.md | 2 +- book/src/ci-integration.md | 423 ++++++++++++ book/src/environment-variables.md | 395 ++++++++++++ book/src/examples-advanced.md | 5 + book/src/examples.md | 7 +- book/src/logging-observability.md | 365 +++++++++++ book/src/node-control.md | 300 ++++++++- book/src/operations-overview.md | 63 ++ book/src/operations.md | 605 ------------------ book/src/part-v.md | 44 ++ book/src/part-vi.md | 28 + book/src/prerequisites.md | 286 +++++++++ book/src/quickstart.md | 16 +- book/src/runners.md | 8 +- book/src/running-examples.md | 307 +++++++++ book/src/running-scenarios.md | 2 +- book/src/troubleshooting.md | 489 +++++++++++++- book/src/workloads.md | 8 +- scripts/{ => build}/build-bundle.sh | 8 +- scripts/{ => build}/build-linux-binaries.sh | 8 +- scripts/{ => build}/build-rapidsnark.sh | 2 +- scripts/{ => build}/build_test_image.sh | 4 +- scripts/{ => lib}/common.sh | 0 scripts/{ => ops}/clean.sh | 4 +- scripts/{ => ops}/push-ecr-test.sh | 4 +- scripts/{ => ops}/update-nomos-rev.sh | 8 +- scripts/{ => run}/checks.sh | 8 +- scripts/{ => run}/run-examples.sh | 10 +- scripts/{ => run}/run-test-matrix.sh | 31 +- scripts/{ => setup}/setup-circuits-stack.sh | 8 +- scripts/{ => setup}/setup-nomos-circuits.sh | 4 +- scripts/{ => setup}/setup-observability.sh | 6 +- .../assets/stack/Dockerfile.base | 4 +- .../stack/scripts/docker/prepare_circuits.sh | 5 +- testing-framework/cucumber/src/world.rs | 2 +- .../deployers/compose/src/docker/mod.rs | 8 +- 40 files changed, 2805 insertions(+), 710 deletions(-) create 
mode 100644 book/src/ci-integration.md create mode 100644 book/src/environment-variables.md create mode 100644 book/src/logging-observability.md create mode 100644 book/src/operations-overview.md delete mode 100644 book/src/operations.md create mode 100644 book/src/part-v.md create mode 100644 book/src/part-vi.md create mode 100644 book/src/prerequisites.md create mode 100644 book/src/running-examples.md rename scripts/{ => build}/build-bundle.sh (97%) rename scripts/{ => build}/build-linux-binaries.sh (95%) rename scripts/{ => build}/build-rapidsnark.sh (98%) rename scripts/{ => build}/build_test_image.sh (98%) rename scripts/{ => lib}/common.sh (100%) rename scripts/{ => ops}/clean.sh (96%) rename scripts/{ => ops}/push-ecr-test.sh (91%) rename scripts/{ => ops}/update-nomos-rev.sh (96%) rename scripts/{ => run}/checks.sh (97%) rename scripts/{ => run}/run-examples.sh (98%) rename scripts/{ => run}/run-test-matrix.sh (87%) rename scripts/{ => setup}/setup-circuits-stack.sh (95%) rename scripts/{ => setup}/setup-nomos-circuits.sh (98%) rename scripts/{ => setup}/setup-observability.sh (96%) diff --git a/README.md b/README.md index d9a26b9..82cfe32 100644 --- a/README.md +++ b/README.md @@ -26,13 +26,13 @@ This framework enables you to define, deploy, and execute integration tests for ```bash # Host mode (local processes) - fastest iteration -scripts/run-examples.sh -t 60 -v 1 -e 1 host +scripts/run/run-examples.sh -t 60 -v 1 -e 1 host # Compose mode (Docker containers) - reproducible environment -scripts/run-examples.sh -t 60 -v 1 -e 1 compose +scripts/run/run-examples.sh -t 60 -v 1 -e 1 compose # K8s mode (Kubernetes cluster) - production-like fidelity -scripts/run-examples.sh -t 60 -v 1 -e 1 k8s +scripts/run/run-examples.sh -t 60 -v 1 -e 1 k8s ``` The script handles circuit setup, binary building, image preparation, and scenario execution automatically. 
@@ -101,7 +101,7 @@ cd book && mdbook serve cargo test # Run integration examples -scripts/run-examples.sh -t 60 -v 2 -e 1 host +scripts/run/run-examples.sh -t 60 -v 2 -e 1 host ``` ### Creating Prebuilt Bundles @@ -110,11 +110,11 @@ For compose/k8s deployments, you can create prebuilt bundles to speed up image b ```bash # Build Linux bundle (required for compose/k8s) -scripts/build-bundle.sh --platform linux +scripts/build/build-bundle.sh --platform linux # Use the bundle when building images export NOMOS_BINARIES_TAR=.tmp/nomos-binaries-linux-v0.3.1.tar.gz -scripts/build_test_image.sh +scripts/build/build_test_image.sh ``` ## Environment Variables @@ -123,7 +123,7 @@ Key environment variables for customization: | Variable | Purpose | Default | |----------|---------|---------| -| `POL_PROOF_DEV_MODE=true` | **Required** — Disable expensive proof generation (set automatically by `scripts/run-examples.sh`) | (none) | +| `POL_PROOF_DEV_MODE=true` | **Required** — Disable expensive proof generation (set automatically by `scripts/run/run-examples.sh`) | (none) | | `NOMOS_TESTNET_IMAGE` | Docker image tag for compose/k8s | `logos-blockchain-testing:local` | | `NOMOS_DEMO_VALIDATORS` | Number of validator nodes | Varies by example | | `NOMOS_DEMO_EXECUTORS` | Number of executor nodes | Varies by example | diff --git a/book/src/SUMMARY.md b/book/src/SUMMARY.md index e62d711..113c647 100644 --- a/book/src/SUMMARY.md +++ b/book/src/SUMMARY.md @@ -20,17 +20,23 @@ - [Advanced & Artificial Examples](examples-advanced.md) - [Running Scenarios](running-scenarios.md) - [Runners](runners.md) - - [Node Control & RunContext](node-control.md) + - [RunContext: BlockFeed & Node Control](node-control.md) - [Chaos Workloads](chaos.md) - [Topology & Chaos Patterns](topology-chaos.md) - - [Operations](operations.md) - [Part III — Developer Reference](part-iii.md) - [Scenario Model (Developer Level)](scenario-model.md) - [API Levels: Builder DSL vs. 
Direct](api-levels.md) - [Extending the Framework](extending.md) - [Example: New Workload & Expectation (Rust)](custom-workload-example.md) - [Internal Crate Reference](internal-crate-reference.md) -- [Part IV — Appendix](part-iv.md) +- [Part V — Operations & Deployment](part-v.md) + - [Overview](operations-overview.md) + - [Prerequisites & Setup](prerequisites.md) + - [Running Examples](running-examples.md) + - [CI Integration](ci-integration.md) + - [Environment Variables](environment-variables.md) + - [Logging & Observability](logging-observability.md) +- [Part VI — Appendix](part-vi.md) - [Builder API Quick Reference](dsl-cheat-sheet.md) - [Troubleshooting Scenarios](troubleshooting.md) - [FAQ](faq.md) diff --git a/book/src/annotated-tree.md b/book/src/annotated-tree.md index 6dbfc93..7a644ac 100644 --- a/book/src/annotated-tree.md +++ b/book/src/annotated-tree.md @@ -90,7 +90,7 @@ Helper utilities: - `NOMOS_LOG_FILTER` — Target-specific filtering (e.g., `cryptarchia=trace,nomos_da_sampling=debug`) - `NOMOS_TESTS_TRACING` — Enable file logging for local runner -See [Logging and Observability](operations.md#logging-and-observability) for details. +See [Logging & Observability](logging-observability.md) for details. 
## Navigation Guide diff --git a/book/src/architecture-overview.md b/book/src/architecture-overview.md index 1ab8e09..eac3bf8 100644 --- a/book/src/architecture-overview.md +++ b/book/src/architecture-overview.md @@ -37,7 +37,7 @@ The framework is consumed via **runnable example binaries** in `examples/src/bin **Recommended:** Use the convenience script: ```bash -scripts/run-examples.sh -t <duration> -v <validators> -e <executors> <mode> +scripts/run/run-examples.sh -t <duration> -v <validators> -e <executors> <mode> # mode: host, compose, or k8s ``` @@ -97,18 +97,18 @@ Three deployer implementations: ## Assets and Images ### Docker Image -Built via `scripts/build_test_image.sh`: +Built via `scripts/build/build_test_image.sh`: - Embeds KZG circuit parameters and binaries from `testing-framework/assets/stack/kzgrs_test_params/kzgrs_test_params` - Includes runner scripts: `run_nomos_node.sh`, `run_nomos_executor.sh` - Tagged as `NOMOS_TESTNET_IMAGE` (default: `logos-blockchain-testing:local`) -- **Recommended:** Use prebuilt bundle via `scripts/build-bundle.sh --platform linux` and set `NOMOS_BINARIES_TAR` before building image +- **Recommended:** Use prebuilt bundle via `scripts/build/build-bundle.sh --platform linux` and set `NOMOS_BINARIES_TAR` before building image ### Circuit Assets KZG parameters required for DA workloads: - **Host path:** `testing-framework/assets/stack/kzgrs_test_params/kzgrs_test_params` (note repeated filename—directory contains file `kzgrs_test_params`) - **Container path:** `/kzgrs_test_params/kzgrs_test_params` (for compose/k8s) - **Override:** `NOMOS_KZGRS_PARAMS_PATH=/custom/path/to/file` (must point to file) -- **Fetch via:** `scripts/setup-nomos-circuits.sh v0.3.1 /tmp/circuits` or use `scripts/run-examples.sh` +- **Fetch via:** `scripts/setup/setup-nomos-circuits.sh v0.3.1 /tmp/circuits` or use `scripts/run/run-examples.sh` ### Compose Stack Templates and configs in `testing-framework/runners/compose/assets/`: @@ -152,4 +152,4 @@ Templates and configs in `testing-framework/runners/compose/assets/`: - Metrics
endpoint: `NOMOS_OTLP_METRICS_ENDPOINT=http://localhost:4318` - Disabled by default (no noise if unset) -For detailed logging configuration, see [Logging and Observability](operations.md#logging-and-observability). +For detailed logging configuration, see [Logging & Observability](logging-observability.md). diff --git a/book/src/authoring-scenarios.md b/book/src/authoring-scenarios.md index a7035e0..363e552 100644 --- a/book/src/authoring-scenarios.md +++ b/book/src/authoring-scenarios.md @@ -14,7 +14,7 @@ Creating a scenario is a declarative exercise: block progression and the effects of your workloads. 5. **Choose a runner**: target local processes for fast iteration, Docker Compose for reproducible multi-node stacks, or Kubernetes for cluster-grade - validation. For environment considerations, see [Operations](operations.md). + validation. For environment considerations, see [Operations Overview](operations-overview.md). Keep scenarios small and explicit: make the intended behavior and the success criteria clear so failures are easy to interpret and act upon. diff --git a/book/src/ci-integration.md b/book/src/ci-integration.md new file mode 100644 index 0000000..d494a90 --- /dev/null +++ b/book/src/ci-integration.md @@ -0,0 +1,423 @@ +# CI Integration + +Both **LocalDeployer** and **ComposeDeployer** work well in CI environments. Choose based on your tradeoffs. 
+ +## Runner Comparison for CI + +**LocalDeployer (Host Runner):** +- Faster startup (no Docker overhead) +- Good for quick smoke tests +- **Trade-off:** Less isolation (processes share host resources) + +**ComposeDeployer (Recommended for CI):** +- Better isolation (containerized) +- Reproducible environment +- Can integrate with external Prometheus/Grafana (optional) +- **Trade-offs:** Slower startup (Docker image build), requires Docker daemon + +**K8sDeployer:** +- Production-like environment +- Full resource isolation +- **Trade-offs:** Slowest (cluster setup + image loading), requires cluster access +- Best for nightly/weekly runs or production validation + +**Existing Examples:** + +See `.github/workflows/lint.yml` (jobs: `host_smoke`, `compose_smoke`) for CI examples running the demo scenarios in this repository. + +## Complete CI Workflow Example + +Here's a comprehensive GitHub Actions workflow demonstrating host and compose runners with caching, matrix testing, and log collection: + +```yaml +name: Testing Framework CI + +on: + push: + branches: [main, develop] + pull_request: + branches: [main] + +env: + POL_PROOF_DEV_MODE: true + CARGO_TERM_COLOR: always + RUST_BACKTRACE: 1 + +jobs: + # Quick smoke test with host runner (no Docker) + host_smoke: + name: Host Runner Smoke Test + runs-on: ubuntu-latest + timeout-minutes: 15 + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Set up Rust toolchain + uses: actions-rs/toolchain@v1 + with: + profile: minimal + toolchain: nightly + override: true + + - name: Cache Rust dependencies + uses: actions/cache@v3 + with: + path: | + ~/.cargo/bin/ + ~/.cargo/registry/index/ + ~/.cargo/registry/cache/ + ~/.cargo/git/db/ + target/ + key: ${{ runner.os }}-cargo-host-${{ hashFiles('**/Cargo.lock') }} + restore-keys: | + ${{ runner.os }}-cargo-host- + + - name: Cache nomos-node build + uses: actions/cache@v3 + with: + path: | + ../nomos-node/target/release/nomos-node + 
../nomos-node/target/release/nomos-executor + key: ${{ runner.os }}-nomos-${{ hashFiles('../nomos-node/**/Cargo.lock') }} + restore-keys: | + ${{ runner.os }}-nomos- + + - name: Run host smoke test + run: | + # Use run-examples.sh which handles setup automatically + scripts/run/run-examples.sh -t 120 -v 3 -e 1 host + + - name: Upload logs on failure + if: failure() + uses: actions/upload-artifact@v3 + with: + name: host-runner-logs + path: | + .tmp/ + *.log + retention-days: 7 + + # Compose runner matrix (with Docker) + compose_matrix: + name: Compose Runner (${{ matrix.topology }}) + runs-on: ubuntu-latest + timeout-minutes: 25 + + strategy: + fail-fast: false + matrix: + topology: + - "3v1e" + - "5v1e" + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Set up Rust toolchain + uses: actions-rs/toolchain@v1 + with: + profile: minimal + toolchain: nightly + override: true + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v2 + + - name: Cache Rust dependencies + uses: actions/cache@v3 + with: + path: | + ~/.cargo/bin/ + ~/.cargo/registry/index/ + ~/.cargo/registry/cache/ + ~/.cargo/git/db/ + target/ + key: ${{ runner.os }}-cargo-compose-${{ hashFiles('**/Cargo.lock') }} + restore-keys: | + ${{ runner.os }}-cargo-compose- + + - name: Cache Docker layers + uses: actions/cache@v3 + with: + path: /tmp/.buildx-cache + key: ${{ runner.os }}-buildx-${{ hashFiles('Dockerfile', 'scripts/build/build_test_image.sh') }} + restore-keys: | + ${{ runner.os }}-buildx- + + - name: Run compose test + env: + TOPOLOGY: ${{ matrix.topology }} + run: | + # Build and run with the specified topology ("<N>v<M>e", e.g. "10v2e" = 10 validators, 2 executors) + v=${TOPOLOGY%%v*} e=${TOPOLOGY#*v}; scripts/run/run-examples.sh -t 120 -v "$v" -e "${e%e}" compose + + - name: Collect Docker logs on failure + if: failure() + run: | + mkdir -p logs + for container in $(docker ps -a --filter "name=nomos-compose-" -q); do + docker logs $container > logs/$(docker inspect --format='{{.Name}}' $container).log 2>&1 + done + + - name:
Upload logs and artifacts + if: failure() + uses: actions/upload-artifact@v3 + with: + name: compose-${{ matrix.topology }}-logs + path: | + logs/ + .tmp/ + retention-days: 7 + + - name: Clean up Docker resources + if: always() + run: | + docker compose down -v 2>/dev/null || true + docker ps -a --filter "name=nomos-compose-" -q | xargs -r docker rm -f + + # Cucumber/BDD integration tests (if enabled) + cucumber_tests: + name: Cucumber BDD Tests + runs-on: ubuntu-latest + timeout-minutes: 20 + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Set up Rust toolchain + uses: actions-rs/toolchain@v1 + with: + profile: minimal + toolchain: nightly + override: true + + - name: Cache dependencies + uses: actions/cache@v3 + with: + path: | + ~/.cargo/bin/ + ~/.cargo/registry/index/ + ~/.cargo/registry/cache/ + ~/.cargo/git/db/ + target/ + key: ${{ runner.os }}-cargo-cucumber-${{ hashFiles('**/Cargo.lock') }} + restore-keys: | + ${{ runner.os }}-cargo-cucumber- + + - name: Run Cucumber tests + run: | + # Build prerequisites + scripts/build/build-bundle.sh --platform linux + export NOMOS_BINARIES_TAR=$(ls -t .tmp/nomos-binaries-linux-*.tar.gz | head -1) + + # Run Cucumber tests (host runner) + cargo test -p runner-examples --bin cucumber_host + + - name: Upload test report + if: always() + uses: actions/upload-artifact@v3 + with: + name: cucumber-report + path: | + target/cucumber-reports/ + retention-days: 14 + + # Summary job (requires all tests to pass) + ci_success: + name: CI Success + needs: [host_smoke, compose_matrix, cucumber_tests] + runs-on: ubuntu-latest + if: always() + + steps: + - name: Check all jobs + run: | + if [[ "${{ needs.host_smoke.result }}" != "success" ]] || \ + [[ "${{ needs.compose_matrix.result }}" != "success" ]] || \ + [[ "${{ needs.cucumber_tests.result }}" != "success" ]]; then + echo "One or more CI jobs failed" + exit 1 + fi + echo "All CI jobs passed!" +``` + +## Workflow Features + +1. 
**Matrix Testing:** Runs compose tests with different topologies (`3v1e`, `5v1e`) +2. **Caching:** Caches Rust dependencies, Docker layers, and nomos-node builds for faster runs +3. **Log Collection:** Automatically uploads logs and artifacts when tests fail +4. **Timeout Protection:** Reasonable timeouts prevent jobs from hanging indefinitely +5. **Cucumber Integration:** Shows how to integrate BDD tests into CI +6. **Clean Teardown:** Ensures Docker resources are cleaned up even on failure + +## Customization Points + +**Topology Matrix:** + +Add more topologies for comprehensive testing: + +```yaml +matrix: + topology: + - "3v1e" + - "5v1e" + - "10v2e" # Larger scale +``` + +**Timeout Adjustments:** + +Increase `timeout-minutes` for longer-running scenarios or slower environments: + +```yaml +timeout-minutes: 30 # Instead of 15 +``` + +**Artifact Retention:** + +Change `retention-days` based on your storage needs: + +```yaml +retention-days: 14 # Keep logs for 2 weeks +``` + +**Conditional Execution:** + +Run expensive tests only on merge to main: + +```yaml +if: github.event_name == 'push' && github.ref == 'refs/heads/main' +``` + +## Best Practices + +### Required: Set POL_PROOF_DEV_MODE + +**Always set `POL_PROOF_DEV_MODE=true` globally** in your workflow env: + +```yaml +env: + POL_PROOF_DEV_MODE: true # REQUIRED! +``` + +Without this, tests will hang due to expensive proof generation. + +### Use Helper Scripts + +Prefer `scripts/run/run-examples.sh` which handles all setup automatically: + +```bash +scripts/run/run-examples.sh -t 120 -v 3 -e 1 host +``` + +This is more reliable than manual `cargo run` commands. 
+ +### Cache Aggressively + +Cache Rust dependencies, nomos-node builds, and Docker layers to speed up CI: + +```yaml +- name: Cache Rust dependencies + uses: actions/cache@v3 + with: + path: | + ~/.cargo/bin/ + ~/.cargo/registry/index/ + ~/.cargo/registry/cache/ + ~/.cargo/git/db/ + target/ + key: ${{ runner.os }}-cargo-${{ hashFiles('**/Cargo.lock') }} +``` + +### Collect Logs on Failure + +Always upload logs when tests fail for easier debugging: + +```yaml +- name: Upload logs on failure + if: failure() + uses: actions/upload-artifact@v3 + with: + name: test-logs + path: | + .tmp/ + *.log + retention-days: 7 +``` + +### Split Workflows for Faster Iteration + +For large projects, split host/compose/k8s into separate workflow files: + +- `.github/workflows/test-host.yml` — Fast smoke tests +- `.github/workflows/test-compose.yml` — Reproducible integration tests +- `.github/workflows/test-k8s.yml` — Production-like validation (nightly) + +### Run K8s Tests Less Frequently + +K8s tests are slower. 
Consider running them only on the main branch or on a schedule: + +```yaml +on: + push: + branches: [main] + schedule: + - cron: '0 2 * * *' # Daily at 02:00 UTC +``` + +## Platform-Specific Notes + +### Ubuntu Runners + +- Docker pre-installed and running +- Best for compose/k8s runners +- Most common choice + +### macOS Runners + +- Docker Desktop not installed by default +- Slower and more expensive +- Use only if testing macOS-specific issues + +### Self-Hosted Runners + +- Cache Docker images locally for faster builds +- Set resource limits; on slow hosts, extend readiness timeouts with `SLOW_TEST_ENV=true` if needed +- Ensure cleanup scripts run (`docker system prune`) + +## Debugging CI Failures + +### Enable Debug Logging + +Add debug environment variables temporarily: + +```yaml +env: + RUST_LOG: debug + NOMOS_LOG_LEVEL: debug +``` + +### Preserve Containers (Compose) + +Set `COMPOSE_RUNNER_PRESERVE=1` to keep containers running for inspection: + +```yaml +- name: Run compose test (preserve on failure) + env: + COMPOSE_RUNNER_PRESERVE: 1 + run: scripts/run/run-examples.sh -t 120 -v 3 -e 1 compose +``` + +### Access Artifacts + +Download uploaded artifacts from the GitHub Actions UI to inspect logs locally. + +## Next Steps + +- [Running Examples](running-examples.md) — Manual execution for local development +- [Environment Variables](environment-variables.md) — Full variable reference +- [Troubleshooting](troubleshooting.md) — Common CI-specific issues + diff --git a/book/src/environment-variables.md b/book/src/environment-variables.md new file mode 100644 index 0000000..0a3922b --- /dev/null +++ b/book/src/environment-variables.md @@ -0,0 +1,395 @@ +# Environment Variables Reference + +Complete reference of environment variables used by the testing framework, organized by category. + +## Critical Variables + +These MUST be set for successful test runs: + +| Variable | Required | Default | Effect | +|----------|----------|---------|--------| +| `POL_PROOF_DEV_MODE` | **YES** | — | **REQUIRED for all runners**.
Set to `true` to use fast dev-mode proving instead of expensive Groth16. Without this, tests will hang/timeout. | + +**Example:** + +```bash +export POL_PROOF_DEV_MODE=true +``` + +Or add to your shell profile (`~/.bashrc`, `~/.zshrc`): + +```bash +# Required for nomos-testing framework +export POL_PROOF_DEV_MODE=true +``` + +--- + +## Runner Selection & Topology + +Control which runner to use and the test topology: + +| Variable | Default | Effect | +|----------|---------|--------| +| `NOMOS_DEMO_VALIDATORS` | 1 | Number of validators (all runners) | +| `NOMOS_DEMO_EXECUTORS` | 1 | Number of executors (all runners) | +| `NOMOS_DEMO_RUN_SECS` | 60 | Run duration in seconds (all runners) | +| `LOCAL_DEMO_VALIDATORS` | — | Legacy: Number of validators (host runner only) | +| `LOCAL_DEMO_EXECUTORS` | — | Legacy: Number of executors (host runner only) | +| `LOCAL_DEMO_RUN_SECS` | — | Legacy: Run duration (host runner only) | +| `COMPOSE_NODE_PAIRS` | — | Compose-specific topology format: "validators×executors" (e.g., `3x2`) | + +**Example:** + +```bash +# Run with 5 validators, 2 executors, for 120 seconds +NOMOS_DEMO_VALIDATORS=5 \ +NOMOS_DEMO_EXECUTORS=2 \ +NOMOS_DEMO_RUN_SECS=120 \ +scripts/run/run-examples.sh -t 120 -v 5 -e 2 host +``` + +--- + +## Node Binaries (Host Runner) + +Required for host runner when not using helper scripts: + +| Variable | Required | Default | Effect | +|----------|----------|---------|--------| +| `NOMOS_NODE_BIN` | Yes (host) | — | Path to `nomos-node` binary | +| `NOMOS_EXECUTOR_BIN` | Yes (host) | — | Path to `nomos-executor` binary | +| `NOMOS_NODE_PATH` | No | — | Path to nomos-node git checkout (dev workflow) | + +**Example:** + +```bash +export NOMOS_NODE_BIN=/path/to/nomos-node/target/release/nomos-node +export NOMOS_EXECUTOR_BIN=/path/to/nomos-node/target/release/nomos-executor +``` + +--- + +## Docker Images (Compose / K8s) + +Required for compose and k8s runners: + +| Variable | Required | Default | Effect | 
+|----------|----------|---------|--------| +| `NOMOS_TESTNET_IMAGE` | Yes (compose/k8s) | `logos-blockchain-testing:local` | Docker image tag for node containers | +| `NOMOS_BINARIES_TAR` | No | — | Path to prebuilt bundle (`.tar.gz`) for image build | +| `NOMOS_SKIP_IMAGE_BUILD` | No | 0 | Skip image rebuild (compose/k8s); assumes image already exists | + +**Example:** + +```bash +# Using prebuilt bundle +export NOMOS_BINARIES_TAR=.tmp/nomos-binaries-linux-v0.3.1.tar.gz +export NOMOS_TESTNET_IMAGE=logos-blockchain-testing:local +scripts/build/build_test_image.sh + +# Using pre-existing image (skip build) +export NOMOS_SKIP_IMAGE_BUILD=1 +scripts/run/run-examples.sh -t 60 -v 3 -e 1 compose +``` + +--- + +## Circuit Assets (KZG Parameters) + +Circuit asset configuration for DA workloads: + +| Variable | Default | Effect | +|----------|---------|--------| +| `NOMOS_KZGRS_PARAMS_PATH` | `testing-framework/assets/stack/kzgrs_test_params/kzgrs_test_params` | Path to KZG proving key file | +| `NOMOS_KZG_DIR_REL` | `testing-framework/assets/stack/kzgrs_test_params` | Directory containing KZG assets (relative to workspace root) | +| `VERSION` | From `versions.env` | Circuit release tag (used by helper scripts) | + +**Example:** + +```bash +# Use custom circuit assets +NOMOS_KZGRS_PARAMS_PATH=/custom/path/to/kzgrs_test_params \ +cargo run -p runner-examples --bin local_runner +``` + +--- + +## Node Logging + +Control node log output (not framework runner logs): + +| Variable | Default | Effect | +|----------|---------|--------| +| `NOMOS_LOG_LEVEL` | `info` | Global log level: `error`, `warn`, `info`, `debug`, `trace` | +| `NOMOS_LOG_FILTER` | — | Fine-grained module filtering (e.g., `cryptarchia=trace,nomos_da_sampling=debug`) | +| `NOMOS_LOG_DIR` | — | Host runner: directory for per-node log files (persistent). Compose/k8s: use `cfgsync.yaml` for file logging. 
| +| `NOMOS_TESTS_KEEP_LOGS` | 0 | Keep per-run temporary directories (useful for debugging/CI artifacts) | +| `NOMOS_TESTS_TRACING` | false | Enable debug tracing preset (combine with `NOMOS_LOG_DIR` unless external tracing backends configured) | + +**Important:** Nodes ignore `RUST_LOG` and only respond to `NOMOS_*` variables. + +**Example:** + +```bash +# Debug logging to files +NOMOS_LOG_DIR=/tmp/test-logs \ +NOMOS_LOG_LEVEL=debug \ +NOMOS_LOG_FILTER="cryptarchia=trace,nomos_da_sampling=debug" \ +POL_PROOF_DEV_MODE=true \ +cargo run -p runner-examples --bin local_runner + +# Inspect logs +ls /tmp/test-logs/ +# nomos-node-0.2024-12-18T14-30-00.log +# nomos-node-1.2024-12-18T14-30-00.log +``` + +**Common filter targets:** + +| Target Prefix | Subsystem | +|---------------|-----------| +| `cryptarchia` | Consensus (Cryptarchia) | +| `nomos_da_sampling` | DA sampling service | +| `nomos_da_dispersal` | DA dispersal service | +| `nomos_da_verifier` | DA verification | +| `nomos_blend` | Mix network/privacy layer | +| `chain_service` | Chain service (node APIs/state) | +| `chain_network` | P2P networking | +| `chain_leader` | Leader election | + +--- + +## Observability & Metrics + +Optional observability integration: + +| Variable | Default | Effect | +|----------|---------|--------| +| `NOMOS_METRICS_QUERY_URL` | — | Prometheus-compatible base URL for runner to query (e.g., `http://localhost:9090`) | +| `NOMOS_METRICS_OTLP_INGEST_URL` | — | Full OTLP HTTP ingest URL for node metrics export (e.g., `http://localhost:9090/api/v1/otlp/v1/metrics`) | +| `NOMOS_GRAFANA_URL` | — | Grafana base URL for printing/logging (e.g., `http://localhost:3000`) | +| `NOMOS_OTLP_ENDPOINT` | — | OTLP trace endpoint (optional) | +| `NOMOS_OTLP_METRICS_ENDPOINT` | — | OTLP metrics endpoint (optional) | + +**Example:** + +```bash +# Enable Prometheus querying +export NOMOS_METRICS_QUERY_URL=http://localhost:9090 +export 
NOMOS_METRICS_OTLP_INGEST_URL=http://localhost:9090/api/v1/otlp/v1/metrics +export NOMOS_GRAFANA_URL=http://localhost:3000 + +scripts/run/run-examples.sh -t 60 -v 3 -e 1 compose +``` + +--- + +## Compose Runner Specific + +Variables specific to Docker Compose deployment: + +| Variable | Default | Effect | +|----------|---------|--------| +| `COMPOSE_RUNNER_HOST` | `127.0.0.1` | Host address for port mappings | +| `COMPOSE_RUNNER_PRESERVE` | 0 | Keep containers running after test (for debugging) | +| `COMPOSE_RUNNER_HTTP_TIMEOUT_SECS` | — | Override HTTP readiness timeout (seconds) | +| `COMPOSE_RUNNER_HOST_GATEWAY` | `host.docker.internal:host-gateway` | Controls `extra_hosts` entry injected into compose (set to `disable` to omit) | +| `TESTNET_RUNNER_PRESERVE` | — | Alias for `COMPOSE_RUNNER_PRESERVE` | + +**Example:** + +```bash +# Keep containers after test for debugging +COMPOSE_RUNNER_PRESERVE=1 \ +scripts/run/run-examples.sh -t 60 -v 3 -e 1 compose + +# Containers remain running +docker ps --filter "name=nomos-compose-" +docker logs +``` + +--- + +## K8s Runner Specific + +Variables specific to Kubernetes deployment: + +| Variable | Default | Effect | +|----------|---------|--------| +| `K8S_RUNNER_NAMESPACE` | Random UUID | Kubernetes namespace (pin for debugging) | +| `K8S_RUNNER_RELEASE` | Random UUID | Helm release name (pin for debugging) | +| `K8S_RUNNER_NODE_HOST` | — | NodePort host resolution for non-local clusters | +| `K8S_RUNNER_DEBUG` | 0 | Log Helm stdout/stderr for install commands | +| `K8S_RUNNER_PRESERVE` | 0 | Keep namespace/release after run (for debugging) | +| `K8S_RUNNER_DEPLOYMENT_TIMEOUT_SECS` | — | Override deployment readiness timeout | +| `K8S_RUNNER_HTTP_TIMEOUT_SECS` | — | Override HTTP readiness timeout (port-forwards) | +| `K8S_RUNNER_HTTP_PROBE_TIMEOUT_SECS` | — | Override HTTP readiness timeout (NodePort probes) | +| `K8S_RUNNER_PROMETHEUS_HTTP_TIMEOUT_SECS` | — | Override Prometheus readiness timeout | +| 
`K8S_RUNNER_PROMETHEUS_HTTP_PROBE_TIMEOUT_SECS` | — | Override Prometheus NodePort probe timeout | + +**Example:** + +```bash +# Pin namespace for debugging +K8S_RUNNER_NAMESPACE=nomos-test-debug \ +K8S_RUNNER_PRESERVE=1 \ +K8S_RUNNER_DEBUG=1 \ +scripts/run/run-examples.sh -t 60 -v 3 -e 1 k8s + +# Inspect resources +kubectl get pods -n nomos-test-debug +kubectl logs -n nomos-test-debug -l nomos/logical-role=validator +``` + +--- + +## Platform & Build Configuration + +Platform-specific build configuration: + +| Variable | Default | Effect | +|----------|---------|--------| +| `NOMOS_BUNDLE_DOCKER_PLATFORM` | Host arch | Docker platform for bundle builds: `linux/arm64` or `linux/amd64` (macOS/Windows hosts) | +| `COMPOSE_CIRCUITS_PLATFORM` | Host arch | Circuits platform for image builds: `linux-aarch64` or `linux-x86_64` | + +**macOS / Apple Silicon:** + +```bash +# Native performance (recommended for local testing) +export NOMOS_BUNDLE_DOCKER_PLATFORM=linux/arm64 + +# Or target amd64 (slower via emulation) +export NOMOS_BUNDLE_DOCKER_PLATFORM=linux/amd64 +``` + +--- + +## Timeouts & Performance + +Timeout and performance tuning: + +| Variable | Default | Effect | +|----------|---------|--------| +| `SLOW_TEST_ENV` | false | Doubles built-in readiness timeouts (useful in CI / constrained laptops) | +| `TESTNET_PRINT_ENDPOINTS` | 0 | Print `TESTNET_ENDPOINTS` / `TESTNET_PPROF` lines during deploy (set automatically by `scripts/run/run-examples.sh`) | + +**Example:** + +```bash +# Increase timeouts for slow environments +SLOW_TEST_ENV=true \ +scripts/run/run-examples.sh -t 120 -v 5 -e 2 compose +``` + +--- + +## Node Configuration (Advanced) + +Node-level configuration passed through to nomos-node/nomos-executor: + +| Variable | Default | Effect | +|----------|---------|--------| +| `CONSENSUS_SLOT_TIME` | — | Consensus slot time (seconds) | +| `CONSENSUS_ACTIVE_SLOT_COEFF` | — | Active slot coefficient (0.0-1.0) | + +**Example:** + +```bash +# Faster block 
production +CONSENSUS_SLOT_TIME=5 \ +CONSENSUS_ACTIVE_SLOT_COEFF=0.9 \ +POL_PROOF_DEV_MODE=true \ +cargo run -p runner-examples --bin local_runner +``` + +--- + +## Framework Runner Logging (Not Node Logs) + +Control framework runner process logs (uses `RUST_LOG`, not `NOMOS_*`): + +| Variable | Default | Effect | +|----------|---------|--------| +| `RUST_LOG` | — | Framework runner log level (e.g., `debug`, `info`) | +| `RUST_BACKTRACE` | — | Enable Rust backtraces on panic (`1` or `full`) | +| `CARGO_TERM_COLOR` | — | Cargo output color (`always`, `never`, `auto`) | + +**Example:** + +```bash +# Debug framework runner (not nodes) +RUST_LOG=debug \ +RUST_BACKTRACE=1 \ +cargo run -p runner-examples --bin local_runner +``` + +--- + +## Helper Script Variables + +Variables used by helper scripts (`scripts/run/run-examples.sh`, etc.): + +| Variable | Default | Effect | +|----------|---------|--------| +| `NOMOS_NODE_REV` | From `versions.env` | nomos-node git revision to build/fetch | +| `NOMOS_BUNDLE_VERSION` | From `versions.env` | Bundle schema version | + +--- + +## Quick Reference Examples + +### Minimal Host Run + +```bash +POL_PROOF_DEV_MODE=true \ +scripts/run/run-examples.sh -t 60 -v 3 -e 1 host +``` + +### Debug Logging (Host) + +```bash +POL_PROOF_DEV_MODE=true \ +NOMOS_LOG_DIR=/tmp/logs \ +NOMOS_LOG_LEVEL=debug \ +NOMOS_LOG_FILTER="cryptarchia=trace" \ +scripts/run/run-examples.sh -t 60 -v 3 -e 1 host +``` + +### Compose with Observability + +```bash +POL_PROOF_DEV_MODE=true \ +NOMOS_METRICS_QUERY_URL=http://localhost:9090 \ +NOMOS_GRAFANA_URL=http://localhost:3000 \ +scripts/run/run-examples.sh -t 60 -v 3 -e 1 compose +``` + +### K8s with Debug + +```bash +POL_PROOF_DEV_MODE=true \ +K8S_RUNNER_NAMESPACE=nomos-debug \ +K8S_RUNNER_DEBUG=1 \ +K8S_RUNNER_PRESERVE=1 \ +scripts/run/run-examples.sh -t 60 -v 3 -e 1 k8s +``` + +### CI Environment + +```yaml +env: + POL_PROOF_DEV_MODE: true + RUST_BACKTRACE: 1 + NOMOS_TESTS_KEEP_LOGS: 1 +``` + +--- + +## See Also + 
+- [Prerequisites & Setup](prerequisites.md) — Required files and setup +- [Running Examples](running-examples.md) — How to run scenarios +- [Logging & Observability](logging-observability.md) — Log collection details +- [CI Integration](ci-integration.md) — CI-specific variables +- [Troubleshooting](troubleshooting.md) — Common issues with variables diff --git a/book/src/examples-advanced.md b/book/src/examples-advanced.md index 4d63526..8bff59c 100644 --- a/book/src/examples-advanced.md +++ b/book/src/examples-advanced.md @@ -2,6 +2,11 @@ Realistic advanced scenarios demonstrating framework capabilities for production testing. +**Adapt from Complete Source:** +- [compose_runner.rs](https://github.com/logos-co/nomos-node/blob/master/testnet/testing-framework/runner-examples/src/bin/compose_runner.rs) — Compose examples with workloads +- [k8s_runner.rs](https://github.com/logos-co/nomos-node/blob/master/testnet/testing-framework/runner-examples/src/bin/k8s_runner.rs) — K8s production patterns +- [Chaos testing patterns](https://github.com/logos-co/nomos-node/blob/master/testnet/testing-framework/workflows/src/chaos.rs) — Node control implementation + ## Summary | Example | Topology | Workloads | Deployer | Key Feature | diff --git a/book/src/examples.md b/book/src/examples.md index 5867596..966e112 100644 --- a/book/src/examples.md +++ b/book/src/examples.md @@ -3,12 +3,17 @@ Concrete scenario shapes that illustrate how to combine topologies, workloads, and expectations. 
+**View Complete Source Code:** +- [local_runner.rs](https://github.com/logos-co/nomos-node/blob/master/testnet/testing-framework/runner-examples/src/bin/local_runner.rs) — Host processes (local) +- [compose_runner.rs](https://github.com/logos-co/nomos-node/blob/master/testnet/testing-framework/runner-examples/src/bin/compose_runner.rs) — Docker Compose +- [k8s_runner.rs](https://github.com/logos-co/nomos-node/blob/master/testnet/testing-framework/runner-examples/src/bin/k8s_runner.rs) — Kubernetes + **Runnable examples:** The repo includes complete binaries in `examples/src/bin/`: - `local_runner.rs` — Host processes (local) - `compose_runner.rs` — Docker Compose (requires image built) - `k8s_runner.rs` — Kubernetes (requires cluster access and image loaded) -**Recommended:** Use `scripts/run-examples.sh -t <duration> -v <validators> -e <executors> <mode>` where mode is `host`, `compose`, or `k8s`. +**Recommended:** Use `scripts/run/run-examples.sh -t <duration> -v <validators> -e <executors> <mode>` where mode is `host`, `compose`, or `k8s`. **Alternative:** Direct cargo run: `POL_PROOF_DEV_MODE=true cargo run -p runner-examples --bin <name>` diff --git a/book/src/logging-observability.md b/book/src/logging-observability.md new file mode 100644 index 0000000..f1722ae --- /dev/null +++ b/book/src/logging-observability.md @@ -0,0 +1,365 @@ +# Logging & Observability + +Comprehensive guide to log collection, metrics, and debugging across all runners. + +## Node Logging vs Framework Logging + +**Critical distinction:** Node logs and framework logs use different configuration mechanisms.
+ +| Component | Controlled By | Purpose | +|-----------|--------------|---------| +| **Framework binaries** (`cargo run -p runner-examples --bin local_runner`) | `RUST_LOG` | Runner orchestration, deployment logs | +| **Node processes** (validators, executors spawned by runner) | `NOMOS_LOG_LEVEL`, `NOMOS_LOG_FILTER` (+ `NOMOS_LOG_DIR` on host runner) | Consensus, DA, mempool, network logs | + +**Common mistake:** Setting `RUST_LOG=debug` only increases verbosity of the runner binary itself. Node logs remain at their default level unless you also set `NOMOS_LOG_LEVEL=debug`. + +**Example:** + +```bash +# This only makes the RUNNER verbose, not the nodes: +RUST_LOG=debug cargo run -p runner-examples --bin local_runner + +# This makes the NODES verbose: +NOMOS_LOG_LEVEL=debug cargo run -p runner-examples --bin local_runner + +# Both verbose (typically not needed): +RUST_LOG=debug NOMOS_LOG_LEVEL=debug cargo run -p runner-examples --bin local_runner +``` + +## Logging Environment Variables + +See [Environment Variables Reference](environment-variables.md) for complete details. Quick summary: + +| Variable | Default | Effect | +|----------|---------|--------| +| `NOMOS_LOG_DIR` | None (console only) | Host runner: directory for per-node log files. 
Compose/k8s: use `cfgsync.yaml` | +| `NOMOS_LOG_LEVEL` | `info` | Global log level: `error`, `warn`, `info`, `debug`, `trace` | +| `NOMOS_LOG_FILTER` | None | Fine-grained target filtering (e.g., `cryptarchia=trace,nomos_da_sampling=debug`) | +| `NOMOS_TESTS_TRACING` | false | Enable debug tracing preset | +| `NOMOS_OTLP_ENDPOINT` | None | OTLP trace endpoint (optional) | +| `NOMOS_OTLP_METRICS_ENDPOINT` | None | OTLP metrics endpoint (optional) | + +**Example:** Full debug logging to files: + +```bash +NOMOS_TESTS_TRACING=true \ +NOMOS_LOG_DIR=/tmp/test-logs \ +NOMOS_LOG_LEVEL=debug \ +NOMOS_LOG_FILTER="cryptarchia=trace,nomos_da_sampling=debug,nomos_da_dispersal=debug,nomos_da_verifier=debug" \ +POL_PROOF_DEV_MODE=true \ +cargo run -p runner-examples --bin local_runner +``` + +## Per-Node Log Files + +When `NOMOS_LOG_DIR` is set, each node writes logs to separate files: + +**File naming pattern:** +- **Validators**: Prefix `nomos-node-0`, `nomos-node-1`, etc. (may include timestamp suffix) +- **Executors**: Prefix `nomos-executor-0`, `nomos-executor-1`, etc. (may include timestamp suffix) + +**Example filenames:** +- `nomos-node-0.2024-12-18T14-30-00.log` +- `nomos-node-1.2024-12-18T14-30-00.log` +- `nomos-executor-0.2024-12-18T14-30-00.log` + +**Local runner note:** The local runner uses per-run temporary directories under the current working directory and removes them after the run unless `NOMOS_TESTS_KEEP_LOGS=1`. Use `NOMOS_LOG_DIR=/path/to/logs` to write per-node log files to a stable location. 
+ +## Filter Target Names + +Common target prefixes for `NOMOS_LOG_FILTER`: + +| Target Prefix | Subsystem | +|---------------|-----------| +| `cryptarchia` | Consensus (Cryptarchia) | +| `nomos_da_sampling` | DA sampling service | +| `nomos_da_dispersal` | DA dispersal service | +| `nomos_da_verifier` | DA verification | +| `nomos_blend` | Mix network/privacy layer | +| `chain_service` | Chain service (node APIs/state) | +| `chain_network` | P2P networking | +| `chain_leader` | Leader election | + +**Example filter:** + +```bash +NOMOS_LOG_FILTER="cryptarchia=trace,nomos_da_sampling=debug,chain_service=info,chain_network=info" +``` + +--- + +## Accessing Logs by Runner + +### Local Runner (Host Processes) + +**Default (temporary directories, auto-cleanup):** + +```bash +POL_PROOF_DEV_MODE=true cargo run -p runner-examples --bin local_runner +# Logs written to temporary directories in working directory +# Automatically cleaned up after test completes +``` + +**Persistent file output:** + +```bash +NOMOS_LOG_DIR=/tmp/local-logs \ +POL_PROOF_DEV_MODE=true \ +cargo run -p runner-examples --bin local_runner + +# After test completes: +ls /tmp/local-logs/ +# Files with prefix: nomos-node-0*, nomos-node-1*, nomos-executor-0* +# May include timestamps in filename +``` + +**Tip:** Use `NOMOS_LOG_DIR` for persistent per-node log files, and `NOMOS_TESTS_KEEP_LOGS=1` if you want to keep the per-run temporary directories (configs/state) for post-mortem inspection. 
+ +### Compose Runner (Docker Containers) + +**Via Docker logs (default, recommended):** + +```bash +# List containers (note the UUID prefix in names) +docker ps --filter "name=nomos-compose-" + +# Stream logs from specific container +docker logs -f <container-id> + +# Or use name pattern matching: +docker logs -f $(docker ps --filter "name=nomos-compose-.*-validator-0" -q | head -1) + +# Show last 100 lines +docker logs --tail 100 <container-id> +``` + +**Via file collection (advanced):** + +To write per-node log files inside containers, set `tracing_settings.logger: !File` in `testing-framework/assets/stack/cfgsync.yaml` (and ensure the directory is writable). To access them, you must either: + +1. **Copy files out after the run:** + +```bash +# Ensure cfgsync.yaml is configured to log to /logs +NOMOS_TESTNET_IMAGE=logos-blockchain-testing:local \ +POL_PROOF_DEV_MODE=true \ +cargo run -p runner-examples --bin compose_runner + +# After test, copy files from containers: +docker ps --filter "name=nomos-compose-" +docker cp <container-id>:/logs/node* /tmp/ +``` + +2. **Mount a host volume** (requires modifying compose template): + +```yaml +volumes: + - /tmp/host-logs:/logs # Add to docker-compose.yml.tera +``` + +**Recommendation:** Use `docker logs` by default. File collection inside containers is complex and rarely needed.
+ +**Keep containers for debugging:** + +```bash +COMPOSE_RUNNER_PRESERVE=1 \ +NOMOS_TESTNET_IMAGE=logos-blockchain-testing:local \ +cargo run -p runner-examples --bin compose_runner +# Containers remain running after test—inspect with docker logs or docker exec +``` + +**Compose debugging variables:** +- `COMPOSE_RUNNER_HOST=127.0.0.1` — host used for readiness probes +- `COMPOSE_RUNNER_HOST_GATEWAY=host.docker.internal:host-gateway` — controls `extra_hosts` entry (set to `disable` to omit) +- `TESTNET_RUNNER_PRESERVE=1` — alias for `COMPOSE_RUNNER_PRESERVE=1` +- `COMPOSE_RUNNER_HTTP_TIMEOUT_SECS=` — override HTTP readiness timeout + +**Note:** Container names follow pattern `nomos-compose-{uuid}-validator-{index}-1` where `{uuid}` changes per run. + +### K8s Runner (Kubernetes Pods) + +**Via kubectl logs (use label selectors):** + +```bash +# List pods +kubectl get pods + +# Stream logs using label selectors (recommended) +# Helm chart labels: +# - nomos/logical-role=validator|executor +# - nomos/validator-index / nomos/executor-index +kubectl logs -l nomos/logical-role=validator -f +kubectl logs -l nomos/logical-role=executor -f + +# Stream logs from specific pod +kubectl logs -f nomos-validator-0 + +# Previous logs from crashed pods +kubectl logs --previous -l nomos/logical-role=validator +``` + +**Download logs for offline analysis:** + +```bash +# Using label selectors +kubectl logs -l nomos/logical-role=validator --tail=1000 > all-validators.log +kubectl logs -l nomos/logical-role=executor --tail=1000 > all-executors.log + +# Specific pods +kubectl logs nomos-validator-0 > validator-0.log +kubectl logs nomos-executor-1 > executor-1.log +``` + +**K8s debugging variables:** +- `K8S_RUNNER_DEBUG=1` — logs Helm stdout/stderr for install commands +- `K8S_RUNNER_PRESERVE=1` — keep namespace/release after run +- `K8S_RUNNER_NODE_HOST=` — override NodePort host resolution +- `K8S_RUNNER_NAMESPACE=` / `K8S_RUNNER_RELEASE=` — pin namespace/release (useful for 
debugging) + +**Specify namespace (if not using default):** + +```bash +kubectl logs -n my-namespace -l nomos/logical-role=validator -f +``` + +**Note:** K8s runner is optimized for local clusters (Docker Desktop K8s, minikube, kind). Remote clusters require additional setup. + +--- + +## OTLP and Telemetry + +**OTLP exporters are optional.** If you see errors about unreachable OTLP endpoints, it's safe to ignore them unless you're actively collecting traces/metrics. + +**To enable OTLP:** + +```bash +NOMOS_OTLP_ENDPOINT=http://localhost:4317 \ +NOMOS_OTLP_METRICS_ENDPOINT=http://localhost:4318 \ +cargo run -p runner-examples --bin local_runner +``` + +**To silence OTLP errors:** Simply leave these variables unset (the default). + +--- + +## Observability: Prometheus and Node APIs + +Runners expose metrics and node HTTP endpoints for expectation code and debugging. + +### Prometheus-Compatible Metrics Querying (Optional) + +- Runners do **not** provision Prometheus automatically +- For a ready-to-run stack, use `scripts/setup/setup-observability.sh`: + - Compose: `scripts/setup/setup-observability.sh compose up` then `scripts/setup/setup-observability.sh compose env` + - K8s: `scripts/setup/setup-observability.sh k8s install` then `scripts/setup/setup-observability.sh k8s env` +- Provide `NOMOS_METRICS_QUERY_URL` (PromQL base URL) to enable `ctx.telemetry()` queries +- Access from expectations when configured: `ctx.telemetry().prometheus().map(|p| p.base_url())` + +**Example:** + +```bash +# Start observability stack (Compose) +scripts/setup/setup-observability.sh compose up + +# Get environment variables +eval $(scripts/setup/setup-observability.sh compose env) + +# Run scenario with metrics +POL_PROOF_DEV_MODE=true \ +scripts/run/run-examples.sh -t 60 -v 3 -e 1 compose +``` + +### Grafana (Optional) + +- Runners do **not** provision Grafana automatically (but `scripts/setup/setup-observability.sh` can) +- If you set `NOMOS_GRAFANA_URL`, the deployer prints it in 
`TESTNET_ENDPOINTS` +- Dashboards live in `testing-framework/assets/stack/monitoring/grafana/dashboards/` for import into your Grafana + +**Example:** + +```bash +export NOMOS_GRAFANA_URL=http://localhost:3000 +POL_PROOF_DEV_MODE=true scripts/run/run-examples.sh -t 60 -v 3 -e 1 compose +``` + +### Node APIs + +- Access from expectations: `ctx.node_clients().validator_clients().get(0)` +- Endpoints: consensus info, network info, DA membership, etc. +- See `testing-framework/core/src/nodes/api_client.rs` for available methods + +**Example usage in expectations:** + +```rust +async fn evaluate(&self, ctx: &RunContext) -> Result<(), DynError> { + let clients = ctx.node_clients().validator_clients(); + let client = &clients[0]; + + let info = client.consensus_info().await?; + tracing::info!(?info, "consensus info from validator 0"); + + Ok(()) +} +``` + +--- + +## Observability Flow + +```mermaid +flowchart TD + Expose[Runner exposes endpoints/ports] --> Collect[Runtime collects block/health signals] + Collect --> Consume[Expectations consume signals
<br/>decide pass/fail] + Consume --> Inspect[Operators inspect logs/metrics<br/>
when failures arise] +``` + +--- + +## Quick Reference + +### Debug Logging (Host) + +```bash +NOMOS_LOG_DIR=/tmp/logs \ +NOMOS_LOG_LEVEL=debug \ +NOMOS_LOG_FILTER="cryptarchia=trace" \ +POL_PROOF_DEV_MODE=true \ +scripts/run/run-examples.sh -t 60 -v 3 -e 1 host +``` + +### Compose with Observability + +```bash +# Start observability stack +scripts/setup/setup-observability.sh compose up +eval $(scripts/setup/setup-observability.sh compose env) + +# Run with metrics +POL_PROOF_DEV_MODE=true \ +scripts/run/run-examples.sh -t 60 -v 3 -e 1 compose + +# Access Grafana at http://localhost:3000 +``` + +### K8s with Debug + +```bash +K8S_RUNNER_NAMESPACE=nomos-debug \ +K8S_RUNNER_DEBUG=1 \ +K8S_RUNNER_PRESERVE=1 \ +POL_PROOF_DEV_MODE=true \ +scripts/run/run-examples.sh -t 60 -v 3 -e 1 k8s + +# Inspect logs +kubectl logs -n nomos-debug -l nomos/logical-role=validator +``` + +--- + +## See Also + +- [Environment Variables](environment-variables.md) — Complete variable reference +- [Troubleshooting](troubleshooting.md) — Log-related debugging (see "Where to Find Logs") +- [Running Examples](running-examples.md) — Runner-specific logging details +- [Prerequisites & Setup](prerequisites.md) — Setup before running + diff --git a/book/src/node-control.md b/book/src/node-control.md index 95adbe5..1a7f525 100644 --- a/book/src/node-control.md +++ b/book/src/node-control.md @@ -1,4 +1,4 @@ -# Node Control & RunContext +# RunContext: BlockFeed & Node Control The deployer supplies a `RunContext` that workloads and expectations share. It provides: @@ -8,6 +8,304 @@ provides: - Metrics (`RunMetrics`, `Metrics`) and block feed - Optional `NodeControlHandle` for managing nodes +## BlockFeed: Observing Block Production + +The `BlockFeed` is a broadcast stream of block observations that allows workloads and expectations to monitor blockchain progress in real-time. It polls a validator node continuously and broadcasts new blocks to all subscribers. 
+ +### What BlockFeed Provides + +**Real-time block stream:** +- Subscribe to receive `BlockRecord` notifications as blocks are produced +- Each record includes the block header (`HeaderId`) and full block payload +- Backed by a background task that polls node storage every second + +**Block statistics:** +- Track total transactions across all observed blocks +- Access via `block_feed.stats().total_transactions()` + +**Broadcast semantics:** +- Multiple subscribers can receive the same blocks independently +- Late subscribers start receiving from current block (no history replay) +- Lagged subscribers skip missed blocks automatically + +### Accessing BlockFeed + +BlockFeed is available through `RunContext`: + +```rust +let block_feed = ctx.block_feed(); +``` + +### Usage in Expectations + +Expectations typically use BlockFeed to verify block production and inclusion of transactions/data. + +**Example: Counting blocks during a run** + +```rust +use std::sync::{Arc, atomic::{AtomicU64, Ordering}}; +use async_trait::async_trait; +use testing_framework_core::scenario::{DynError, Expectation, RunContext}; +use tokio::{spawn, time::sleep, select, pin}; + +struct MinimumBlocksExpectation { + min_blocks: u64, + captured_blocks: Option<Arc<AtomicU64>>, +} + +#[async_trait] +impl Expectation for MinimumBlocksExpectation { + fn name(&self) -> &'static str { + "minimum_blocks" + } + + async fn start_capture(&mut self, ctx: &RunContext) -> Result<(), DynError> { + let block_count = Arc::new(AtomicU64::new(0)); + let block_count_task = Arc::clone(&block_count); + + // Subscribe to block feed + let mut receiver = ctx.block_feed().subscribe(); + + // Spawn a task to count blocks + spawn(async move { + loop { + match receiver.recv().await { + Ok(_record) => { + block_count_task.fetch_add(1, Ordering::Relaxed); + } + Err(tokio::sync::broadcast::error::RecvError::Lagged(skipped)) => { + tracing::debug!(skipped, "receiver lagged, skipping blocks"); + } +
Err(tokio::sync::broadcast::error::RecvError::Closed) => { + tracing::debug!("block feed closed"); + break; + } + } + } + }); + + self.captured_blocks = Some(block_count); + Ok(()) + } + + async fn evaluate(&mut self, ctx: &RunContext) -> Result<(), DynError> { + let blocks = self.captured_blocks + .as_ref() + .expect("start_capture must be called first") + .load(Ordering::Relaxed); + + if blocks < self.min_blocks { + return Err(format!( + "expected at least {} blocks, observed {}", + self.min_blocks, blocks + ).into()); + } + + tracing::info!(blocks, min = self.min_blocks, "minimum blocks expectation passed"); + Ok(()) + } +} +``` + +**Example: Inspecting block contents** + +```rust +use testing_framework_core::scenario::{BlockRecord, RunContext}; + +async fn start_capture(&mut self, ctx: &RunContext) -> Result<(), DynError> { + let mut receiver = ctx.block_feed().subscribe(); + + spawn(async move { + loop { + match receiver.recv().await { + Ok(record) => { + // Access block header + let header_id = &record.header; + + // Access full block + let block = &record.block; + let tx_count = block.transactions().len(); + + tracing::debug!( + ?header_id, + tx_count, + "observed block" + ); + + // Process transactions, DA blobs, etc. + } + Err(tokio::sync::broadcast::error::RecvError::Closed) => break, + Err(_) => continue, + } + } + }); + + Ok(()) +} +``` + +### Usage in Workloads + +Workloads can use BlockFeed to coordinate timing or wait for specific conditions before proceeding. 
+ +**Example: Wait for N blocks before starting** + +```rust +use async_trait::async_trait; +use testing_framework_core::scenario::{DynError, RunContext, Workload}; + +struct DelayedWorkload { + wait_blocks: usize, +} + +#[async_trait] +impl Workload for DelayedWorkload { + fn name(&self) -> &str { + "delayed_workload" + } + + async fn start(&self, ctx: &RunContext) -> Result<(), DynError> { + tracing::info!(wait_blocks = self.wait_blocks, "waiting for blocks before starting"); + + // Subscribe to block feed + let mut receiver = ctx.block_feed().subscribe(); + let mut count = 0; + + // Wait for N blocks + while count < self.wait_blocks { + match receiver.recv().await { + Ok(_) => count += 1, + Err(tokio::sync::broadcast::error::RecvError::Lagged(_)) => continue, + Err(tokio::sync::broadcast::error::RecvError::Closed) => { + return Err("block feed closed before reaching target".into()); + } + } + } + + tracing::info!("warmup complete, starting actual workload"); + + // Now do the actual work + // ... + + Ok(()) + } +} +``` + +**Example: Rate limiting based on block production** + +```rust +async fn start(&self, ctx: &RunContext) -> Result<(), DynError> { + let clients = ctx.node_clients().validator_clients(); + let mut receiver = ctx.block_feed().subscribe(); + let mut pending_txs = Vec::new(); + + loop { + tokio::select! { + // Send batch on new block + Ok(_record) = receiver.recv() => { + if !pending_txs.is_empty() { + tracing::debug!(count = pending_txs.len(), "sending batch on new block"); + for tx in pending_txs.drain(..) 
{ + clients[0].send_transaction(tx).await?; + } + } + } + + // Generate transactions continuously + Some(tx) = generate_transaction() => { + pending_txs.push(tx); + } + } + } +} +``` + +### BlockFeed vs Direct Polling + +**Use BlockFeed when:** +- You need to react to blocks as they're produced +- Multiple components need to observe the same blocks +- You want automatic retry/reconnect logic +- You're tracking statistics across many blocks + +**Use direct polling when:** +- You need to query specific historical blocks +- You're checking final state after workloads complete +- You need transaction receipts or other indexed data +- You're implementing a one-time health check + +Example direct polling in expectations: + +```rust +async fn evaluate(&mut self, ctx: &RunContext) -> Result<(), DynError> { + let client = &ctx.node_clients().validator_clients()[0]; + + // Poll current height once + let info = client.consensus_info().await?; + tracing::info!(height = info.height, "final block height"); + + // This is simpler than BlockFeed for one-time checks + Ok(()) +} +``` + +### Block Statistics + +Access aggregated statistics without subscribing to the feed: + +```rust +async fn evaluate(&mut self, ctx: &RunContext) -> Result<(), DynError> { + let stats = ctx.block_feed().stats(); + let total_txs = stats.total_transactions(); + + tracing::info!(total_txs, "transactions observed across all blocks"); + + if total_txs < self.expected_min { + return Err(format!( + "expected at least {} transactions, observed {}", + self.expected_min, total_txs + ).into()); + } + + Ok(()) +} +``` + +### Important Notes + +**Subscription timing:** +- Subscribe in `start_capture()` for expectations +- Subscribe in `start()` for workloads +- Late subscribers miss historical blocks (no replay) + +**Lagged receivers:** +- If your subscriber is too slow, it may lag behind +- Handle `RecvError::Lagged(skipped)` gracefully +- Consider increasing processing speed or reducing block rate + +**Feed 
lifetime:** +- BlockFeed runs for the entire scenario duration +- Automatically cleaned up when the run completes +- Closed channels signal graceful shutdown + +**Performance:** +- BlockFeed polls nodes every 1 second +- Broadcasts to all subscribers with minimal overhead +- Suitable for scenarios with hundreds of blocks + +### Real-World Examples + +The framework's built-in expectations use BlockFeed extensively: + +- **`ConsensusLiveness`**: Doesn't directly subscribe but uses block feed stats to verify progress +- **`DataAvailabilityExpectation`**: Subscribes to inspect DA blobs in each block and track inscription/dispersal +- **`TransactionInclusion`**: Subscribes to find specific transactions in blocks + +See [Examples](examples.md) and [Workloads & Expectations](workloads.md) for more patterns. + +--- + ## Current Chaos Capabilities and Limitations The framework currently supports **process-level chaos** (node restarts) for diff --git a/book/src/operations-overview.md b/book/src/operations-overview.md new file mode 100644 index 0000000..321a369 --- /dev/null +++ b/book/src/operations-overview.md @@ -0,0 +1,63 @@ +# Operations & Deployment Overview + +Operational readiness focuses on prerequisites, environment fit, and clear signals that ensure your test scenarios run reliably across different deployment targets. 
+ +## Core Principles + +- **Prerequisites First**: Ensure all required files, binaries, and assets are in place before attempting to run scenarios +- **Environment Fit**: Choose the right deployment target (host, compose, k8s) based on your isolation, reproducibility, and resource needs +- **Clear Signals**: Verify runners report node readiness before starting workloads to avoid false negatives +- **Failure Triage**: Map failures to specific causes—missing prerequisites, platform issues, or unmet expectations + +## Key Operational Concerns + +**Prerequisites:** +- `versions.env` file at repository root (required by helper scripts) +- Node binaries (`nomos-node`, `nomos-executor`) available or built on demand +- Platform requirements met (Docker for compose, cluster access for k8s) +- Circuit assets for DA workloads + +**Artifacts:** +- KZG parameters (circuit assets) for Data Availability scenarios +- Docker images for compose/k8s deployments +- Binary bundles for reproducible builds + +**Environment Configuration:** +- `POL_PROOF_DEV_MODE=true` is **REQUIRED for all runners** to avoid expensive proof generation +- Logging configured via `NOMOS_LOG_*` variables +- Observability endpoints (Prometheus, Grafana) optional but useful + +**Readiness & Health:** +- Runners verify node readiness before starting workloads +- Health checks prevent premature workload execution +- Consensus liveness expectations validate basic operation + +## Operational Workflow + +```mermaid +flowchart LR + Setup[Prerequisites & Setup] --> Run[Run Scenarios] + Run --> Monitor[Monitor & Observe] + Monitor --> Debug{Success?} + Debug -->|No| Triage[Failure Triage] + Triage --> Setup + Debug -->|Yes| Done[Complete] +``` + +1. **Setup**: Verify prerequisites, configure environment, prepare assets +2. **Run**: Execute scenarios using appropriate runner (host/compose/k8s) +3. **Monitor**: Collect logs, metrics, and observability signals +4. 
**Triage**: When failures occur, map to root causes and fix prerequisites + +## Documentation Structure + +This Operations & Deployment section covers: + +- [Prerequisites & Setup](prerequisites.md) — Required files, binaries, and environment setup +- [Running Examples](running-examples.md) — How to run scenarios across different runners +- [CI Integration](ci-integration.md) — Automating tests in continuous integration pipelines +- [Environment Variables](environment-variables.md) — Complete reference of configuration variables +- [Logging & Observability](logging-observability.md) — Log collection, metrics, and debugging + +**Philosophy:** Treat operational hygiene—assets present, prerequisites satisfied, observability reachable—as the first step to reliable scenario outcomes. + diff --git a/book/src/operations.md b/book/src/operations.md deleted file mode 100644 index a7ca56a..0000000 --- a/book/src/operations.md +++ /dev/null @@ -1,605 +0,0 @@ -# Operations - -Operational readiness focuses on prerequisites, environment fit, and clear -signals: - -- **Prerequisites**: - - **`versions.env` file** at repository root (required by helper scripts; defines VERSION, NOMOS_NODE_REV, NOMOS_BUNDLE_VERSION) - - Keep a sibling `nomos-node` checkout available, or use `scripts/run-examples.sh` which clones/builds on demand - - Ensure the chosen runner's platform needs are met (Docker for compose, cluster access for k8s) - - CI uses prebuilt binary artifacts from the `build-binaries` workflow -- **Artifacts**: DA scenarios require KZG parameters (circuit assets) located at - `testing-framework/assets/stack/kzgrs_test_params`. Fetch them via - `scripts/setup-nomos-circuits.sh` or override the path with `NOMOS_KZGRS_PARAMS_PATH`. -- **Environment flags**: `POL_PROOF_DEV_MODE=true` is **required for all runners** - (local, compose, k8s) unless you want expensive Groth16 proof generation that - will cause tests to timeout. 
Configure logging via `NOMOS_LOG_DIR`, `NOMOS_LOG_LEVEL`, - and `NOMOS_LOG_FILTER` (see [Logging and Observability](#logging-and-observability) - for details). Note that nodes ignore `RUST_LOG` and only respond to `NOMOS_*` variables. -- **Readiness checks**: verify runners report node readiness before starting - workloads; this avoids false negatives from starting too early. -- **Failure triage**: map failures to missing prerequisites (wallet seeding, - node control availability), runner platform issues, or unmet expectations. - Start with liveness signals, then dive into workload-specific assertions. - -Treat operational hygiene—assets present, prerequisites satisfied, observability -reachable—as the first step to reliable scenario outcomes. - -## CI Usage - -Both **LocalDeployer** and **ComposeDeployer** work in CI environments: - -**LocalDeployer in CI:** -- Faster (no Docker overhead) -- Good for quick smoke tests -- **Trade-off:** Less isolation (processes share host) - -**ComposeDeployer in CI (recommended):** -- Better isolation (containerized) -- Reproducible environment -- Can integrate with external Prometheus/Grafana (optional) -- **Trade-off:** Slower startup (Docker image build) -- **Trade-off:** Requires Docker daemon - -See `.github/workflows/lint.yml` (jobs: `host_smoke`, `compose_smoke`) for CI examples running the demo scenarios. - -## Running Examples - -The framework provides three runner modes: **host** (local processes), **compose** (Docker Compose), and **k8s** (Kubernetes). - -**Recommended:** Use `scripts/run-examples.sh` for all modes: - -```bash -# Host mode (local processes) -scripts/run-examples.sh -t 60 -v 1 -e 1 host - -# Compose mode (Docker Compose) -scripts/run-examples.sh -t 60 -v 1 -e 1 compose - -# K8s mode (Kubernetes) -scripts/run-examples.sh -t 60 -v 1 -e 1 k8s -``` - -This script handles circuit setup, binary building/bundling, (local) image building, and execution. - -Note: for `k8s` runs against non-local clusters (e.g. 
EKS), the cluster pulls images from a registry, -so a local `docker build` is not used. In that case, build + push your image separately (see -`scripts/build_test_image.sh`) and set `NOMOS_TESTNET_IMAGE` to the pushed reference. - -### Quick Smoke Matrix (Host/Compose/K8s) - -For a small “does everything still run?” matrix (including `--no-image-build` variants where relevant), use: - -```bash -scripts/run-test-matrix.sh -t 120 -v 1 -e 1 -``` - -This is useful after making runner/image/script changes, and it forwards `--metrics-*` options through to `scripts/run-examples.sh`. - -**Environment overrides:** -- `VERSION=v0.3.1` — Circuit version -- `NOMOS_NODE_REV=` — nomos-node git revision -- `NOMOS_BINARIES_TAR=path/to/bundle.tar.gz` — Use prebuilt bundle -- `NOMOS_SKIP_IMAGE_BUILD=1` — Skip image rebuild (compose/k8s) -- `NOMOS_BUNDLE_DOCKER_PLATFORM=linux/arm64|linux/amd64` — Docker platform used when building a Linux bundle on non-Linux hosts (macOS/Windows) -- `COMPOSE_CIRCUITS_PLATFORM=linux-aarch64|linux-x86_64` — Circuits platform used when building the compose/k8s image (defaults based on host arch) -- `SLOW_TEST_ENV=true` — Doubles built-in readiness timeouts (useful in slower CI / constrained laptops) -- `TESTNET_PRINT_ENDPOINTS=1` — Print `TESTNET_ENDPOINTS` / `TESTNET_PPROF` lines during deploy (set automatically by `scripts/run-examples.sh`) -- `COMPOSE_RUNNER_HTTP_TIMEOUT_SECS=` — Override compose node HTTP readiness timeout -- `K8S_RUNNER_DEPLOYMENT_TIMEOUT_SECS=` — Override k8s deployment readiness timeout -- `K8S_RUNNER_HTTP_TIMEOUT_SECS=` — Override k8s HTTP readiness timeout for port-forwards -- `K8S_RUNNER_HTTP_PROBE_TIMEOUT_SECS=` — Override k8s HTTP readiness timeout for NodePort probes -- `K8S_RUNNER_PROMETHEUS_HTTP_TIMEOUT_SECS=` — Override k8s Prometheus readiness timeout -- `K8S_RUNNER_PROMETHEUS_HTTP_PROBE_TIMEOUT_SECS=` — Override k8s Prometheus NodePort probe timeout - -### Updating `nomos-node` Revision (Dev Workflow) - -The repo pins 
a `nomos-node` revision in `versions.env` for reproducible builds. To update it (or point to a local checkout), use the helper script: - -```bash -# Pin to a new git revision (updates versions.env + Cargo.toml git revs) -scripts/update-nomos-rev.sh --rev - -# Use a local nomos-node checkout instead (for development) -scripts/update-nomos-rev.sh --path /path/to/nomos-node - -# If Cargo.toml was marked skip-worktree, clear it -scripts/update-nomos-rev.sh --unskip-worktree -``` - -Notes: -- Don’t commit absolute `NOMOS_NODE_PATH` values; prefer `--rev` for shared history/CI. -- After changing rev/path, expect `Cargo.lock` to update on the next `cargo build`/`cargo test`. - -### Cleanup Helper - -If you hit Docker build failures, mysterious I/O errors, or are running out of disk space: - -```bash -scripts/clean.sh -``` - -For extra Docker cache cleanup: - -```bash -scripts/clean.sh --docker -``` - -### Host Runner (Direct Cargo Run) - -For manual control, you can run the `local_runner` binary directly: - -```bash -POL_PROOF_DEV_MODE=true \ -NOMOS_NODE_BIN=/path/to/nomos-node \ -NOMOS_EXECUTOR_BIN=/path/to/nomos-executor \ -cargo run -p runner-examples --bin local_runner -``` - -**Environment variables:** -- `NOMOS_DEMO_VALIDATORS=3` — Number of validators (default: 1, or use legacy `LOCAL_DEMO_VALIDATORS`) -- `NOMOS_DEMO_EXECUTORS=2` — Number of executors (default: 1, or use legacy `LOCAL_DEMO_EXECUTORS`) -- `NOMOS_DEMO_RUN_SECS=120` — Run duration in seconds (default: 60, or use legacy `LOCAL_DEMO_RUN_SECS`) -- `NOMOS_NODE_BIN` / `NOMOS_EXECUTOR_BIN` — Paths to binaries (required for direct run) -- `NOMOS_LOG_DIR=/tmp/logs` — Directory for per-node log files (host runner). For compose/k8s, set `tracing_settings.logger: !File` in `testing-framework/assets/stack/cfgsync.yaml`. 
-- `NOMOS_TESTS_KEEP_LOGS=1` — Keep per-run temporary directories (useful for debugging/CI artifacts) -- `NOMOS_TESTS_TRACING=true` — Enable the debug tracing preset (optional; combine with `NOMOS_LOG_DIR` unless you have external tracing backends configured) -- `NOMOS_LOG_LEVEL=debug` — Set log level (default: info) -- `NOMOS_LOG_FILTER="cryptarchia=trace,nomos_da_sampling=debug"` — Fine-grained module filtering - -**Note:** Requires circuit assets and host binaries. Use `scripts/run-examples.sh host` to handle setup automatically. - -### Compose Runner (Direct Cargo Run) - -For manual control, you can run the `compose_runner` binary directly. Compose requires a Docker image with embedded assets. - -**Recommended setup:** Use a prebuilt bundle: - -```bash -# Build a Linux bundle (includes binaries + circuits) -scripts/build-bundle.sh --platform linux -# Creates .tmp/nomos-binaries-linux-v0.3.1.tar.gz - -# Build image (embeds bundle assets) -export NOMOS_BINARIES_TAR=.tmp/nomos-binaries-linux-v0.3.1.tar.gz -scripts/build_test_image.sh - -# Run -NOMOS_TESTNET_IMAGE=logos-blockchain-testing:local \ -POL_PROOF_DEV_MODE=true \ -cargo run -p runner-examples --bin compose_runner -``` - -**Platform note (macOS / Apple silicon):** -- Docker Desktop runs a `linux/arm64` engine. If Linux bundle builds are slow/unstable when producing `.tmp/nomos-binaries-linux-*.tar.gz`, prefer `NOMOS_BUNDLE_DOCKER_PLATFORM=linux/arm64` for local compose/k8s runs. -- If you need amd64 images/binaries specifically (e.g., deploying to amd64-only environments), set `NOMOS_BUNDLE_DOCKER_PLATFORM=linux/amd64` and expect slower builds via emulation. 
- -**Alternative:** Manual circuit/image setup (rebuilds during image build): - -```bash -# Fetch and copy circuits -scripts/setup-nomos-circuits.sh v0.3.1 /tmp/nomos-circuits -cp -r /tmp/nomos-circuits/* testing-framework/assets/stack/kzgrs_test_params/ - -# Build image -scripts/build_test_image.sh - -# Run -NOMOS_TESTNET_IMAGE=logos-blockchain-testing:local \ -POL_PROOF_DEV_MODE=true \ -cargo run -p runner-examples --bin compose_runner -``` - -**Environment variables:** -- `NOMOS_TESTNET_IMAGE=logos-blockchain-testing:local` — Image tag (required, must match built image) -- `POL_PROOF_DEV_MODE=true` — **Required** for all runners -- `NOMOS_DEMO_VALIDATORS=3` / `NOMOS_DEMO_EXECUTORS=2` / `NOMOS_DEMO_RUN_SECS=120` — Topology overrides -- `COMPOSE_NODE_PAIRS=1x1` — Alternative topology format: "validators×executors" -- `NOMOS_METRICS_QUERY_URL` — Prometheus-compatible base URL for the runner process to query (optional) -- `NOMOS_METRICS_OTLP_INGEST_URL` — Full OTLP HTTP ingest URL for node metrics export (optional) -- `NOMOS_GRAFANA_URL` — Grafana base URL for printing/logging (optional) -- `COMPOSE_RUNNER_HOST=127.0.0.1` — Host address for port mappings -- `COMPOSE_RUNNER_PRESERVE=1` — Keep containers running after test -- `NOMOS_LOG_LEVEL=debug` / `NOMOS_LOG_FILTER=...` — Control node log verbosity (stdout/stderr) -- `testing-framework/assets/stack/cfgsync.yaml` (`tracing_settings.logger`) — Switch node logs between stdout/stderr and file output - -**Compose-specific features:** -- **Node control support**: Only runner that supports chaos testing (`.enable_node_control()` + chaos workloads) -- **Observability is external**: Set `NOMOS_METRICS_*` / `NOMOS_GRAFANA_URL` to enable telemetry links and querying - - Quickstart: `scripts/setup-observability.sh compose up` then `scripts/setup-observability.sh compose env` - -**Important:** -- Containers expect KZG parameters at `/kzgrs_test_params/kzgrs_test_params` (note the repeated filename) -- Use 
`scripts/run-examples.sh compose` to handle all setup automatically - -### K8s Runner (Direct Cargo Run) - -For manual control, you can run the `k8s_runner` binary directly. K8s requires the same image setup as Compose. - -**Prerequisites:** -1. **Kubernetes cluster** with `kubectl` configured -2. **Test image built** (same as Compose, preferably with prebuilt bundle) -3. **Image available in cluster** (loaded or pushed to registry) - -**Build and load image:** -```bash -# Build image with bundle (recommended) -scripts/build-bundle.sh --platform linux -export NOMOS_BINARIES_TAR=.tmp/nomos-binaries-linux-v0.3.1.tar.gz -scripts/build_test_image.sh - -# Load into cluster -export NOMOS_TESTNET_IMAGE=logos-blockchain-testing:local -kind load docker-image logos-blockchain-testing:local # For kind -# OR: minikube image load logos-blockchain-testing:local # For minikube -# OR: docker push your-registry/logos-blockchain-testing:local # For remote -``` - -**Run the example:** -```bash -export NOMOS_TESTNET_IMAGE=logos-blockchain-testing:local -export POL_PROOF_DEV_MODE=true -cargo run -p runner-examples --bin k8s_runner -``` - -**Environment variables:** -- `NOMOS_TESTNET_IMAGE` — Image tag (required) -- `POL_PROOF_DEV_MODE=true` — **Required** for all runners -- `NOMOS_DEMO_VALIDATORS` / `NOMOS_DEMO_EXECUTORS` / `NOMOS_DEMO_RUN_SECS` — Topology overrides -- `NOMOS_METRICS_QUERY_URL` — Prometheus-compatible base URL for the runner process to query (PromQL) -- `NOMOS_METRICS_OTLP_INGEST_URL` — Full OTLP HTTP ingest URL for node metrics export (optional) -- `NOMOS_GRAFANA_URL` — Grafana base URL for printing/logging (optional) - -**Metrics + Grafana (optional):** -```bash -export NOMOS_METRICS_QUERY_URL=http://your-prometheus:9090 -# Prometheus OTLP receiver example: -export NOMOS_METRICS_OTLP_INGEST_URL=http://your-prometheus:9090/api/v1/otlp/v1/metrics -# Optional: print a Grafana link in TESTNET_ENDPOINTS -export NOMOS_GRAFANA_URL=http://your-grafana:3000 -cargo run -p 
runner-examples --bin k8s_runner -``` - -Notes: -- `NOMOS_METRICS_QUERY_URL` must be reachable from the runner process (often via `kubectl port-forward`). -- `NOMOS_METRICS_OTLP_INGEST_URL` must be reachable from nodes (pods/containers) and is backend-specific (Prometheus vs VictoriaMetrics paths differ). - - Quickstart installer: `scripts/setup-observability.sh k8s install` then `scripts/setup-observability.sh k8s env` (optional dashboards: `scripts/setup-observability.sh k8s dashboards`) - -**Via `scripts/run-examples.sh` (optional):** -```bash -scripts/run-examples.sh -t 60 -v 1 -e 1 k8s \ - --metrics-query-url http://your-prometheus:9090 \ - --metrics-otlp-ingest-url http://your-prometheus:9090/api/v1/otlp/v1/metrics -``` - -**In code (optional):** -```rust -use testing_framework_core::scenario::ScenarioBuilder; -use testing_framework_workflows::ObservabilityBuilderExt as _; - -let plan = ScenarioBuilder::with_node_counts(1, 1) - .with_metrics_query_url_str("http://your-prometheus:9090") - .with_metrics_otlp_ingest_url_str("http://your-prometheus:9090/api/v1/otlp/v1/metrics") - .build(); -``` - -**Important:** -- K8s runner mounts `testing-framework/assets/stack/kzgrs_test_params` as a hostPath volume with file `/kzgrs_test_params/kzgrs_test_params` inside pods -- **No node control support yet**: Chaos workloads (`.enable_node_control()`) will fail -- Use `scripts/run-examples.sh k8s` to handle all setup automatically - -## Circuit Assets (KZG Parameters) - -DA workloads require KZG cryptographic parameters for polynomial commitment schemes. - -### Asset Location - -**Default path:** `testing-framework/assets/stack/kzgrs_test_params/kzgrs_test_params` - -Note the repeated filename: the directory `kzgrs_test_params/` contains a file named `kzgrs_test_params`. This is the actual proving key file. 
- -**Container path** (compose/k8s): `/kzgrs_test_params/kzgrs_test_params` - -**Override:** Set `NOMOS_KZGRS_PARAMS_PATH` to use a custom location (must point to the file): -```bash -NOMOS_KZGRS_PARAMS_PATH=/path/to/custom/params cargo run -p runner-examples --bin local_runner -``` - -### Directory vs File (KZG) - -The system uses KZG assets in two distinct ways: - -| Concept | Used by | Meaning | -|--------|---------|---------| -| **KZG directory** | deployers/scripts | A directory that contains the KZG file (and related artifacts). Defaults to `testing-framework/assets/stack/kzgrs_test_params` and is controlled by `NOMOS_KZG_DIR_REL` (relative to the workspace root). | -| **KZG file path** | node processes | A single file path passed to nodes via `NOMOS_KZGRS_PARAMS_PATH` (inside containers/pods this is typically `/kzgrs_test_params/kzgrs_test_params`). | - -### Getting Circuit Assets - -**Option 1: Use helper script** (recommended): -```bash -# From the repository root -chmod +x scripts/setup-nomos-circuits.sh -scripts/setup-nomos-circuits.sh v0.3.1 /tmp/nomos-circuits - -# Copy to default location -cp -r /tmp/nomos-circuits/* testing-framework/assets/stack/kzgrs_test_params/ -``` - -**Option 2: Build locally** (advanced): -```bash -# This repository does not provide a `make kzgrs_test_params` target. -# If you need to regenerate KZG params from source, follow upstream tooling -# instructions (unspecified here) or use the helper scripts above to fetch a -# known-good bundle. -``` - -### CI Workflow - -The CI automatically fetches and places assets: -```yaml -- name: Install circuits for host build - run: | - scripts/setup-nomos-circuits.sh v0.3.1 "$TMPDIR/nomos-circuits" - cp -a "$TMPDIR/nomos-circuits"/. testing-framework/assets/stack/kzgrs_test_params/ -``` - -### When Are Assets Needed? 
- -| Runner | When Required | -|--------|---------------| -| **Local** | Always (for DA workloads) | -| **Compose** | During image build (baked into `NOMOS_TESTNET_IMAGE`) | -| **K8s** | During image build + deployed to cluster via hostPath volume | - -**Error without assets:** -``` -Error: missing KZG parameters at testing-framework/assets/stack/kzgrs_test_params/kzgrs_test_params -``` - -If you see this error, the file `kzgrs_test_params` is missing from the directory. Use `scripts/run-examples.sh` or `scripts/setup-nomos-circuits.sh` to fetch it. - -## Logging and Observability - -### Node Logging vs Framework Logging - -**Critical distinction:** Node logs and framework logs use different configuration mechanisms. - -| Component | Controlled By | Purpose | -|-----------|--------------|---------| -| **Framework binaries** (`cargo run -p runner-examples --bin local_runner`) | `RUST_LOG` | Runner orchestration, deployment logs | -| **Node processes** (validators, executors spawned by runner) | `NOMOS_LOG_LEVEL`, `NOMOS_LOG_FILTER` (+ `NOMOS_LOG_DIR` on host runner) | Consensus, DA, mempool, network logs | - -**Common mistake:** Setting `RUST_LOG=debug` only increases verbosity of the runner binary itself. Node logs remain at their default level unless you also set `NOMOS_LOG_LEVEL=debug`. - -**Example:** -```bash -# This only makes the RUNNER verbose, not the nodes: -RUST_LOG=debug cargo run -p runner-examples --bin local_runner - -# This makes the NODES verbose: -NOMOS_LOG_LEVEL=debug cargo run -p runner-examples --bin local_runner - -# Both verbose (typically not needed): -RUST_LOG=debug NOMOS_LOG_LEVEL=debug cargo run -p runner-examples --bin local_runner -``` - -### Logging Environment Variables - -| Variable | Default | Effect | -|----------|---------|--------| -| `NOMOS_LOG_DIR` | None (console only) | Host runner: directory for per-node log files. 
Compose/k8s: use `testing-framework/assets/stack/cfgsync.yaml` (`tracing_settings.logger: !File`) and mount a writable directory. | -| `NOMOS_LOG_LEVEL` | `info` | Global log level: `error`, `warn`, `info`, `debug`, `trace` | -| `NOMOS_LOG_FILTER` | None | Fine-grained target filtering (e.g., `cryptarchia=trace,nomos_da_sampling=debug`) | -| `NOMOS_TESTS_TRACING` | `false` | Enable the debug tracing preset (optional; combine with `NOMOS_LOG_DIR` unless you have external tracing backends configured) | -| `NOMOS_OTLP_ENDPOINT` | None | OTLP trace endpoint (optional, disables OTLP noise if unset) | -| `NOMOS_OTLP_METRICS_ENDPOINT` | None | OTLP metrics endpoint (optional) | - -**Example:** Full debug logging to files: -```bash -NOMOS_TESTS_TRACING=true \ -NOMOS_LOG_DIR=/tmp/test-logs \ -NOMOS_LOG_LEVEL=debug \ -NOMOS_LOG_FILTER="cryptarchia=trace,nomos_da_sampling=debug,nomos_da_dispersal=debug,nomos_da_verifier=debug,nomos_blend=debug,chain_service=info,chain_network=info,chain_leader=info" \ -POL_PROOF_DEV_MODE=true \ -cargo run -p runner-examples --bin local_runner -``` - -### Per-Node Log Files - -When `NOMOS_LOG_DIR` is set, each node writes logs to separate files: - -**File naming pattern:** -- **Validators**: Prefix `nomos-node-0`, `nomos-node-1`, etc. (may include timestamp suffix) -- **Executors**: Prefix `nomos-executor-0`, `nomos-executor-1`, etc. (may include timestamp suffix) - -**Local runner note:** The local runner uses per-run temporary directories under the current working directory and removes them after the run unless `NOMOS_TESTS_KEEP_LOGS=1`. Use `NOMOS_LOG_DIR=/path/to/logs` to write per-node log files to a stable location. 
- -### Filter Target Names - -Common target prefixes for `NOMOS_LOG_FILTER`: - -| Target Prefix | Subsystem | -|---------------|-----------| -| `cryptarchia` | Consensus (Cryptarchia) | -| `nomos_da_sampling` | DA sampling service | -| `nomos_da_dispersal` | DA dispersal service | -| `nomos_da_verifier` | DA verification | -| `nomos_blend` | Mix network/privacy layer | -| `chain_service` | Chain service (node APIs/state) | -| `chain_network` | P2P networking | -| `chain_leader` | Leader election | - -**Example filter:** -```bash -NOMOS_LOG_FILTER="cryptarchia=trace,nomos_da_sampling=debug,chain_service=info,chain_network=info,chain_leader=info" -``` - -### Accessing Logs Per Runner - -#### Local Runner - -**Default (temporary directories, auto-cleanup):** -```bash -POL_PROOF_DEV_MODE=true cargo run -p runner-examples --bin local_runner -# Logs written to temporary directories in working directory -# Automatically cleaned up after test completes -``` - -**Persistent file output:** -```bash -NOMOS_LOG_DIR=/tmp/local-logs \ -POL_PROOF_DEV_MODE=true \ -cargo run -p runner-examples --bin local_runner - -# After test completes: -ls /tmp/local-logs/ -# Files with prefix: nomos-node-0*, nomos-node-1*, nomos-executor-0* -# May include timestamps in filename -``` - -**Tip:** Use `NOMOS_LOG_DIR` for persistent per-node log files, and `NOMOS_TESTS_KEEP_LOGS=1` if you want to keep the per-run temporary directories (configs/state) for post-mortem inspection. 
- -#### Compose Runner - -**Via Docker logs (default, recommended):** -```bash -# List containers (note the UUID prefix in names) -docker ps --filter "name=nomos-compose-" - -# Stream logs from specific container -docker logs -f <container-id> - -# Or use name pattern matching: -docker logs -f $(docker ps --filter "name=nomos-compose-.*-validator-0" -q | head -1) -``` - -**Via file collection (advanced):** - -To write per-node log files inside containers, set `tracing_settings.logger: !File` in `testing-framework/assets/stack/cfgsync.yaml` (and ensure the directory is writable). To access them, you must either: - -1. **Copy files out after the run:** -```bash -# Ensure `testing-framework/assets/stack/cfgsync.yaml` is configured to log to `/logs` -# via `tracing_settings.logger: !File`. -NOMOS_TESTNET_IMAGE=logos-blockchain-testing:local \ -POL_PROOF_DEV_MODE=true \ -cargo run -p runner-examples --bin compose_runner - -# After test, copy files from containers: -docker ps --filter "name=nomos-compose-" -docker cp <container-id>:/logs/node* /tmp/ -``` - -2. **Mount a host volume** (requires modifying compose template): -```yaml -volumes: - - /tmp/host-logs:/logs # Add to docker-compose.yml.tera -``` - -**Recommendation:** Use `docker logs` by default. File collection inside containers is complex and rarely needed.
- -**Keep containers for debugging:** -```bash -COMPOSE_RUNNER_PRESERVE=1 \ -NOMOS_TESTNET_IMAGE=logos-blockchain-testing:local \ -cargo run -p runner-examples --bin compose_runner -# Containers remain running after test—inspect with docker logs or docker exec -``` - -**Compose networking/debug knobs:** -- `COMPOSE_RUNNER_HOST=127.0.0.1` — host used for readiness probes (override for remote Docker daemons / VM networking) -- `COMPOSE_RUNNER_HOST_GATEWAY=host.docker.internal:host-gateway` — controls the `extra_hosts` entry injected into compose (set to `disable` to omit) -- `TESTNET_RUNNER_PRESERVE=1` — alias for `COMPOSE_RUNNER_PRESERVE=1` -- `COMPOSE_RUNNER_HTTP_TIMEOUT_SECS=` — override compose node HTTP readiness timeout - -**Note:** Container names follow pattern `nomos-compose-{uuid}-validator-{index}-1` where `{uuid}` changes per run. - -#### K8s Runner - -**Via kubectl logs (use label selectors):** -```bash -# List pods -kubectl get pods - -# Stream logs using label selectors (recommended) -# Helm chart labels: -# - nomos/logical-role=validator|executor -# - nomos/validator-index / nomos/executor-index -kubectl logs -l nomos/logical-role=validator -f -kubectl logs -l nomos/logical-role=executor -f - -# Stream logs from specific pod -kubectl logs -f nomos-validator-0 - -# Previous logs from crashed pods -kubectl logs --previous -l nomos/logical-role=validator -``` - -**Download logs for offline analysis:** -```bash -# Using label selectors -kubectl logs -l nomos/logical-role=validator --tail=1000 > all-validators.log -kubectl logs -l nomos/logical-role=executor --tail=1000 > all-executors.log - -# Specific pods -kubectl logs nomos-validator-0 > validator-0.log -kubectl logs nomos-executor-1 > executor-1.log -``` - -**K8s environment notes:** -- The k8s runner is optimized for local clusters (Docker Desktop Kubernetes / minikube / kind): - - The default image `logos-blockchain-testing:local` must be available on the cluster’s nodes (Docker Desktop shares the 
local daemon; kind/minikube often requires an explicit image load step). - - The Helm chart mounts KZG params via a `hostPath` to your workspace path; this typically won’t work on remote/managed clusters without replacing it with a PV/CSI volume or baking the params into an image. -- Debug helpers: - - `K8S_RUNNER_DEBUG=1` — logs Helm stdout/stderr for install commands. - - `K8S_RUNNER_PRESERVE=1` — keep the namespace/release after the run. -- `K8S_RUNNER_NODE_HOST=` — override NodePort host resolution for non-local clusters. -- `K8S_RUNNER_NAMESPACE=` / `K8S_RUNNER_RELEASE=` — pin namespace/release instead of random IDs (useful for debugging) - -**Specify namespace (if not using default):** -```bash -kubectl logs -n my-namespace -l nomos/logical-role=validator -f -``` - -### OTLP and Telemetry - -**OTLP exporters are optional.** If you see errors about unreachable OTLP endpoints, it's safe to ignore them unless you're actively collecting traces/metrics. - -**To enable OTLP:** -```bash -NOMOS_OTLP_ENDPOINT=http://localhost:4317 \ -NOMOS_OTLP_METRICS_ENDPOINT=http://localhost:4318 \ -cargo run -p runner-examples --bin local_runner -``` - -**To silence OTLP errors:** Simply leave these variables unset (the default). - -### Observability: Prometheus and Node APIs - -Runners expose metrics and node HTTP endpoints for expectation code and debugging: - -**Prometheus-compatible metrics querying (optional):** -- Runners do **not** provision Prometheus automatically. -- For a ready-to-run stack, use `scripts/setup-observability.sh`: - - Compose: `scripts/setup-observability.sh compose up` then `scripts/setup-observability.sh compose env` - - K8s: `scripts/setup-observability.sh k8s install` then `scripts/setup-observability.sh k8s env` -- Provide `NOMOS_METRICS_QUERY_URL` (PromQL base URL) to enable `ctx.telemetry()` queries. 
-- Access from expectations when configured: `ctx.telemetry().prometheus().map(|p| p.base_url())` - -**Grafana (optional):** -- Runners do **not** provision Grafana automatically (but `scripts/setup-observability.sh` can). -- If you set `NOMOS_GRAFANA_URL`, the deployer prints it in `TESTNET_ENDPOINTS`. -- Dashboards live in `testing-framework/assets/stack/monitoring/grafana/dashboards/` for import into your Grafana. - -**Node APIs:** -- Access from expectations: `ctx.node_clients().validator_clients().get(0)` -- Endpoints: consensus info, network info, DA membership, etc. -- See `testing-framework/core/src/nodes/api_client.rs` for available methods - -```mermaid -flowchart TD - Expose[Runner exposes endpoints/ports] --> Collect[Runtime collects block/health signals] - Collect --> Consume[Expectations consume signals<br/>decide pass/fail] - Consume --> Inspect[Operators inspect logs/metrics<br/>when failures arise] -``` diff --git a/book/src/part-v.md b/book/src/part-v.md new file mode 100644 index 0000000..6f8b082 --- /dev/null +++ b/book/src/part-v.md @@ -0,0 +1,44 @@ +# Part V — Operations & Deployment + +This section covers operational aspects of running the testing framework: prerequisites, deployment configuration, continuous integration, and observability. + +## What You'll Learn + +- **Prerequisites & Setup**: Required files, binaries, circuit assets, and environment configuration +- **Running Examples**: How to execute scenarios across host, compose, and k8s runners +- **CI Integration**: Automating tests in continuous integration pipelines with caching and matrix testing +- **Environment Variables**: Complete reference of all configuration variables +- **Logging & Observability**: Log collection strategies, metrics integration, and debugging techniques + +## Who This Section Is For + +- **Operators** setting up the framework for the first time +- **DevOps Engineers** integrating tests into CI/CD pipelines +- **Developers** debugging test failures or performance issues +- **Platform Engineers** deploying across different environments (local, Docker, Kubernetes) + +## Navigation + +This section is organized for progressive depth: + +1. Start with [Operations Overview](operations-overview.md) for the big picture +2. Follow [Prerequisites & Setup](prerequisites.md) to prepare your environment +3. Use [Running Examples](running-examples.md) to execute your first scenarios +4. Integrate with [CI Integration](ci-integration.md) for automated testing +5. Reference [Environment Variables](environment-variables.md) for complete configuration options +6. 
Debug with [Logging & Observability](logging-observability.md) when issues arise + +## Key Principles + +**Operational Hygiene:** Assets present, prerequisites satisfied, observability reachable + +**Environment Fit:** Choose the right deployment target based on isolation, reproducibility, and resource needs + +**Clear Signals:** Verify runners report node readiness before starting workloads + +**Failure Triage:** Map failures to specific causes—missing prerequisites, platform issues, or unmet expectations + +--- + +Ready to get started? Begin with [Operations Overview](operations-overview.md) → + diff --git a/book/src/part-vi.md b/book/src/part-vi.md new file mode 100644 index 0000000..3e81d96 --- /dev/null +++ b/book/src/part-vi.md @@ -0,0 +1,28 @@ +# Part VI — Appendix + +Quick reference materials, troubleshooting guides, and supplementary information. + +## Contents + +- **Builder API Quick Reference**: Cheat sheet for DSL methods +- **Troubleshooting Scenarios**: Common issues and their solutions, including "What Failure Looks Like" with realistic examples +- **FAQ**: Frequently asked questions +- **Glossary**: Terminology reference + +## When to Use This Section + +- **Quick lookups**: Find DSL method signatures without reading full guides +- **Debugging failures**: Match symptoms to known issues and fixes +- **Clarifying concepts**: Look up unfamiliar terms in the glossary +- **Common questions**: Check FAQ before asking for help + +This section complements the main documentation with practical reference materials that you'll return to frequently during development and operations. 
+ +--- + +Jump to: +- [Builder API Quick Reference](dsl-cheat-sheet.md) +- [Troubleshooting Scenarios](troubleshooting.md) +- [FAQ](faq.md) +- [Glossary](glossary.md) + diff --git a/book/src/prerequisites.md b/book/src/prerequisites.md new file mode 100644 index 0000000..d246a01 --- /dev/null +++ b/book/src/prerequisites.md @@ -0,0 +1,286 @@ +# Prerequisites & Setup + +This page covers everything you need before running your first scenario. + +## Required Files + +### `versions.env` (Required) + +All helper scripts require a `versions.env` file at the repository root: + +```bash +VERSION=v0.3.1 +NOMOS_NODE_REV=abc123def456789 +NOMOS_BUNDLE_VERSION=v1 +``` + +**What it defines:** +- `VERSION` — Circuit release tag for KZG parameters +- `NOMOS_NODE_REV` — Git revision of nomos-node to build/fetch +- `NOMOS_BUNDLE_VERSION` — Bundle schema version + +**Where it's used:** +- `scripts/run/run-examples.sh` +- `scripts/build/build-bundle.sh` +- `scripts/setup/setup-nomos-circuits.sh` +- CI workflows + +**Error if missing:** +``` +ERROR: versions.env not found at repository root +This file is required and should define: + VERSION= + NOMOS_NODE_REV= + NOMOS_BUNDLE_VERSION= +``` + +**Fix:** Ensure you're in the repository root. The file should already exist in the checked-out repo. + +## Node Binaries + +Scenarios need compiled `nomos-node` and `nomos-executor` binaries. 
+ +### Option 1: Use Helper Scripts (Recommended) + +```bash +scripts/run/run-examples.sh -t 60 -v 3 -e 1 host +``` + +This automatically: +- Clones/updates nomos-node checkout +- Builds required binaries +- Sets `NOMOS_NODE_BIN` / `NOMOS_EXECUTOR_BIN` + +### Option 2: Manual Build + +If you have a sibling `nomos-node` checkout: + +```bash +cd ../nomos-node +cargo build --release --bin nomos-node --bin nomos-executor + +# Set environment variables +export NOMOS_NODE_BIN=$PWD/target/release/nomos-node +export NOMOS_EXECUTOR_BIN=$PWD/target/release/nomos-executor + +# Return to testing framework +cd ../nomos-testing +``` + +### Option 3: Prebuilt Bundles (CI) + +CI workflows use prebuilt artifacts: + +```yaml +- name: Download nomos binaries + uses: actions/download-artifact@v3 + with: + name: nomos-binaries-linux + path: .tmp/ + +- name: Extract bundle + run: | + tar -xzf .tmp/nomos-binaries-linux-*.tar.gz -C .tmp/ + echo "NOMOS_NODE_BIN=$PWD/.tmp/nomos-node" >> "$GITHUB_ENV" + echo "NOMOS_EXECUTOR_BIN=$PWD/.tmp/nomos-executor" >> "$GITHUB_ENV" +``` + +## Circuit Assets (KZG Parameters) + +Data Availability (DA) workloads require KZG cryptographic parameters. + +### Asset Location + +**Default path:** `testing-framework/assets/stack/kzgrs_test_params/kzgrs_test_params` + +Note: The directory `kzgrs_test_params/` contains a file named `kzgrs_test_params`. This is the proving key file (~120MB). 
+ +**Container path (compose/k8s):** `/kzgrs_test_params/kzgrs_test_params` + +### Getting Assets + +**Option 1: Use helper script** (recommended): + +```bash +# Fetch circuits +scripts/setup/setup-nomos-circuits.sh v0.3.1 /tmp/nomos-circuits + +# Copy to default location +mkdir -p testing-framework/assets/stack/kzgrs_test_params +cp -r /tmp/nomos-circuits/* testing-framework/assets/stack/kzgrs_test_params/ + +# Verify (should be ~120MB) +ls -lh testing-framework/assets/stack/kzgrs_test_params/kzgrs_test_params +``` + +**Option 2: Let `run-examples.sh` handle it**: + +```bash +scripts/run/run-examples.sh -t 60 -v 3 -e 1 host +``` + +This automatically fetches and places assets. + +### Override Path + +Set `NOMOS_KZGRS_PARAMS_PATH` to use a custom location: + +```bash +NOMOS_KZGRS_PARAMS_PATH=/custom/path/to/kzgrs_test_params \ +cargo run -p runner-examples --bin local_runner +``` + +### When Are Assets Needed? + +| Runner | When Required | +|--------|---------------| +| **Host (local)** | Always (for DA workloads) | +| **Compose** | During image build (baked into image) | +| **K8s** | During image build + mounted via hostPath | + +**Error without assets:** + +``` +Error: Custom { kind: NotFound, error: "Circuit file not found at: testing-framework/assets/stack/kzgrs_test_params/kzgrs_test_params" } +``` + +## Platform Requirements + +### Host Runner (Local Processes) + +**Requires:** +- Rust nightly toolchain +- Node binaries built +- KZG circuit assets (for DA workloads) +- Available ports (18080+, 3100+, etc.) 
+ +**No Docker required.** + +**Best for:** +- Quick iteration +- Development +- Smoke tests + +### Compose Runner (Docker Compose) + +**Requires:** +- Docker daemon running +- Docker image built: `logos-blockchain-testing:local` +- KZG assets baked into image +- Docker Desktop (macOS) or Docker Engine (Linux) + +**Platform notes (macOS / Apple silicon):** +- Prefer `NOMOS_BUNDLE_DOCKER_PLATFORM=linux/arm64` for native performance +- Use `linux/amd64` only if targeting amd64 environments (slower via emulation) + +**Best for:** +- Reproducible environments +- CI testing +- Chaos workloads (node control support) + +### K8s Runner (Kubernetes) + +**Requires:** +- Kubernetes cluster (Docker Desktop K8s, minikube, kind, or remote) +- `kubectl` configured +- Docker image built and loaded/pushed +- KZG assets baked into image + mounted via hostPath + +**Local cluster setup:** + +```bash +# Docker Desktop: Enable Kubernetes in settings + +# OR: Use kind +kind create cluster +kind load docker-image logos-blockchain-testing:local + +# OR: Use minikube +minikube start +minikube image load logos-blockchain-testing:local +``` + +**Remote cluster:** Push image to registry and set `NOMOS_TESTNET_IMAGE`. + +**Best for:** +- Production-like testing +- Resource isolation +- Large topologies + +## Critical Environment Variable + +**`POL_PROOF_DEV_MODE=true` is REQUIRED for ALL runners!** + +Without this, proof generation uses expensive Groth16 proving, causing: +- Tests "hang" for minutes +- CPU spikes to 100% +- Timeouts and failures + +**Always set:** + +```bash +POL_PROOF_DEV_MODE=true cargo run -p runner-examples --bin local_runner +POL_PROOF_DEV_MODE=true scripts/run/run-examples.sh -t 60 -v 3 -e 1 compose +# etc. +``` + +**Or add to your shell profile:** + +```bash +# ~/.bashrc or ~/.zshrc +export POL_PROOF_DEV_MODE=true +``` + +## Quick Setup Check + +Run this checklist before your first scenario: + +```bash +# 1. Verify versions.env exists +cat versions.env + +# 2. 
Check circuit assets (for DA workloads) +ls -lh testing-framework/assets/stack/kzgrs_test_params/kzgrs_test_params + +# 3. Verify POL_PROOF_DEV_MODE is set +echo $POL_PROOF_DEV_MODE # Should print: true + +# 4. For compose/k8s: verify Docker is running +docker ps + +# 5. For compose/k8s: verify image exists +docker images | grep logos-blockchain-testing + +# 6. For host runner: verify node binaries (if not using scripts) +$NOMOS_NODE_BIN --version +$NOMOS_EXECUTOR_BIN --version +``` + +## Recommended: Use Helper Scripts + +The easiest path is to let the helper scripts handle everything: + +```bash +# Host runner +scripts/run/run-examples.sh -t 60 -v 3 -e 1 host + +# Compose runner +scripts/run/run-examples.sh -t 60 -v 3 -e 1 compose + +# K8s runner +scripts/run/run-examples.sh -t 60 -v 3 -e 1 k8s +``` + +These scripts: +- Verify `versions.env` exists +- Clone/build nomos-node if needed +- Fetch circuit assets if missing +- Build Docker images (compose/k8s) +- Load images into cluster (k8s) +- Run the scenario with proper environment + +**Next Steps:** +- [Running Examples](running-examples.md) — Learn how to run scenarios +- [Environment Variables](environment-variables.md) — Full variable reference +- [Troubleshooting](troubleshooting.md) — Common issues and fixes + diff --git a/book/src/quickstart.md b/book/src/quickstart.md index a51c02e..837b4b7 100644 --- a/book/src/quickstart.md +++ b/book/src/quickstart.md @@ -23,7 +23,7 @@ The framework ships with runnable example binaries in `examples/src/bin/`. ```bash # From the logos-blockchain-testing directory -scripts/run-examples.sh -t 60 -v 1 -e 1 host +scripts/run/run-examples.sh -t 60 -v 1 -e 1 host ``` This handles circuit setup, binary building, and runs a complete scenario: 1 validator + 1 executor, mixed transaction + DA workload (5 tx/block + 1 channel + 1 blob), 60s duration. 
@@ -191,7 +191,7 @@ pub async fn step_6_deploy_and_execute() -> Result<()> { ```bash # Scale up to 3 validators + 2 executors, run for 2 minutes -scripts/run-examples.sh -t 120 -v 3 -e 2 host +scripts/run/run-examples.sh -t 120 -v 3 -e 2 host ``` **With direct cargo run:** @@ -212,7 +212,7 @@ Use the same API with a different deployer for reproducible containerized enviro **Recommended:** Use the convenience script (handles everything): ```bash -scripts/run-examples.sh -t 60 -v 1 -e 1 compose +scripts/run/run-examples.sh -t 60 -v 1 -e 1 compose ``` This automatically: @@ -225,13 +225,13 @@ This automatically: ```bash # Option 1: Use prebuilt bundle (recommended for compose/k8s) -scripts/build-bundle.sh --platform linux # Creates .tmp/nomos-binaries-linux-v0.3.1.tar.gz +scripts/build/build-bundle.sh --platform linux # Creates .tmp/nomos-binaries-linux-v0.3.1.tar.gz export NOMOS_BINARIES_TAR=.tmp/nomos-binaries-linux-v0.3.1.tar.gz # Option 2: Manual circuit/image setup (rebuilds during image build) -scripts/setup-nomos-circuits.sh v0.3.1 /tmp/nomos-circuits +scripts/setup/setup-nomos-circuits.sh v0.3.1 /tmp/nomos-circuits cp -r /tmp/nomos-circuits/* testing-framework/assets/stack/kzgrs_test_params/ -scripts/build_test_image.sh +scripts/build/build_test_image.sh # Run with Compose NOMOS_TESTNET_IMAGE=logos-blockchain-testing:local \ @@ -246,8 +246,8 @@ cargo run -p runner-examples --bin compose_runner The runner can integrate with external observability endpoints. For a ready-to-run local stack: ```bash -scripts/setup-observability.sh compose up -eval "$(scripts/setup-observability.sh compose env)" +scripts/setup/setup-observability.sh compose up +eval "$(scripts/setup/setup-observability.sh compose env)" ``` Then run your compose scenario as usual (the environment variables enable PromQL querying and node OTLP metrics export). 
diff --git a/book/src/runners.md b/book/src/runners.md index ea309d7..4ea2df3 100644 --- a/book/src/runners.md +++ b/book/src/runners.md @@ -2,7 +2,7 @@ Runners turn a scenario plan into a live environment while keeping the plan unchanged. Choose based on feedback speed, reproducibility, and fidelity. For -environment and operational considerations, see [Operations](operations.md). +environment and operational considerations, see [Operations Overview](operations-overview.md). **Important:** All runners require `POL_PROOF_DEV_MODE=true` to avoid expensive Groth16 proof generation that causes timeouts. @@ -14,7 +14,7 @@ environment and operational considerations, see [Operations](operations.md). - **Can run in CI** for fast smoke tests. - **Node control:** Not supported (chaos workloads not available) -**Run with:** `scripts/run-examples.sh -t 60 -v 1 -e 1 host` +**Run with:** `scripts/run/run-examples.sh -t 60 -v 1 -e 1 host` ## Docker Compose runner - Starts nodes in containers to provide a reproducible multi-node stack on a @@ -25,7 +25,7 @@ environment and operational considerations, see [Operations](operations.md). - **Recommended for CI pipelines** (isolated environment, reproducible). - **Node control:** Supported (can restart nodes for chaos testing) -**Run with:** `scripts/run-examples.sh -t 60 -v 1 -e 1 compose` +**Run with:** `scripts/run/run-examples.sh -t 60 -v 1 -e 1 compose` ## Kubernetes runner - Deploys nodes onto a cluster for higher-fidelity, longer-running scenarios (via `K8sDeployer`). @@ -34,7 +34,7 @@ environment and operational considerations, see [Operations](operations.md). and scheduling matter. 
- **Node control:** Not supported yet (chaos workloads not available) -**Run with:** `scripts/run-examples.sh -t 60 -v 1 -e 1 k8s` +**Run with:** `scripts/run/run-examples.sh -t 60 -v 1 -e 1 k8s` ### Common expectations - All runners require at least one validator and, for transaction scenarios, diff --git a/book/src/running-examples.md b/book/src/running-examples.md new file mode 100644 index 0000000..60606c5 --- /dev/null +++ b/book/src/running-examples.md @@ -0,0 +1,307 @@ +# Running Examples + +The framework provides three runner modes: **host** (local processes), **compose** (Docker Compose), and **k8s** (Kubernetes). + +## Quick Start (Recommended) + +Use `scripts/run/run-examples.sh` for all modes—it handles all setup automatically: + +```bash +# Host mode (local processes) +scripts/run/run-examples.sh -t 60 -v 3 -e 1 host + +# Compose mode (Docker Compose) +scripts/run/run-examples.sh -t 60 -v 3 -e 1 compose + +# K8s mode (Kubernetes) +scripts/run/run-examples.sh -t 60 -v 3 -e 1 k8s +``` + +**Parameters:** +- `-t 60` — Run duration in seconds +- `-v 3` — Number of validators +- `-e 1` — Number of executors +- `host|compose|k8s` — Deployment mode + +This script handles: +- Circuit asset setup +- Binary building/bundling +- Image building (compose/k8s) +- Image loading into cluster (k8s) +- Execution with proper environment + +**Note:** For `k8s` runs against non-local clusters (e.g. EKS), the cluster pulls images from a registry. In that case, build + push your image separately (see `scripts/build/build_test_image.sh`) and set `NOMOS_TESTNET_IMAGE` to the pushed reference. + +## Quick Smoke Matrix + +For a small "does everything still run?" matrix across all runners: + +```bash +scripts/run/run-test-matrix.sh -t 120 -v 1 -e 1 +``` + +This runs host, compose, and k8s modes with various image-build configurations. Useful after making runner/image/script changes. Forwards `--metrics-*` options through to `scripts/run/run-examples.sh`. 
+ +**Environment overrides:** +- `VERSION=v0.3.1` — Circuit version +- `NOMOS_NODE_REV=<commit>` — nomos-node git revision +- `NOMOS_BINARIES_TAR=path/to/bundle.tar.gz` — Use prebuilt bundle +- `NOMOS_SKIP_IMAGE_BUILD=1` — Skip image rebuild (compose/k8s) +- `NOMOS_BUNDLE_DOCKER_PLATFORM=linux/arm64|linux/amd64` — Docker platform for bundle builds (macOS/Windows) +- `COMPOSE_CIRCUITS_PLATFORM=linux-aarch64|linux-x86_64` — Circuits platform for image builds +- `SLOW_TEST_ENV=true` — Doubles built-in readiness timeouts (useful in CI / constrained laptops) +- `TESTNET_PRINT_ENDPOINTS=1` — Print `TESTNET_ENDPOINTS` / `TESTNET_PPROF` lines during deploy + +## Dev Workflow: Updating nomos-node Revision + +The repo pins a `nomos-node` revision in `versions.env` for reproducible builds. To update it or point to a local checkout: + +```bash +# Pin to a new git revision (updates versions.env + Cargo.toml git revs) +scripts/ops/update-nomos-rev.sh --rev <git_sha> + +# Use a local nomos-node checkout instead (for development) +scripts/ops/update-nomos-rev.sh --path /path/to/nomos-node + +# If Cargo.toml was marked skip-worktree, clear it +scripts/ops/update-nomos-rev.sh --unskip-worktree +``` + +**Notes:** +- Don't commit absolute `NOMOS_NODE_PATH` values; prefer `--rev` for shared history/CI +- After changing rev/path, expect `Cargo.lock` to update on the next `cargo build`/`cargo test` + +## Cleanup Helper + +If you hit Docker build failures, I/O errors, or disk space issues: + +```bash +scripts/ops/clean.sh +``` + +For extra Docker cache cleanup: + +```bash +scripts/ops/clean.sh --docker +``` + +--- + +## Host Runner (Direct Cargo Run) + +For manual control, run the `local_runner` binary directly: + +```bash +POL_PROOF_DEV_MODE=true \ +NOMOS_NODE_BIN=/path/to/nomos-node \ +NOMOS_EXECUTOR_BIN=/path/to/nomos-executor \ +cargo run -p runner-examples --bin local_runner +``` + +### Host Runner Environment Variables + +| Variable | Default | Effect | +|----------|---------|--------| +| 
`NOMOS_DEMO_VALIDATORS` | 1 | Number of validators (legacy: `LOCAL_DEMO_VALIDATORS`) | +| `NOMOS_DEMO_EXECUTORS` | 1 | Number of executors (legacy: `LOCAL_DEMO_EXECUTORS`) | +| `NOMOS_DEMO_RUN_SECS` | 60 | Run duration in seconds (legacy: `LOCAL_DEMO_RUN_SECS`) | +| `NOMOS_NODE_BIN` | — | Path to nomos-node binary (required) | +| `NOMOS_EXECUTOR_BIN` | — | Path to nomos-executor binary (required) | +| `NOMOS_LOG_DIR` | None | Directory for per-node log files | +| `NOMOS_TESTS_KEEP_LOGS` | 0 | Keep per-run temporary directories (useful for debugging/CI) | +| `NOMOS_TESTS_TRACING` | false | Enable debug tracing preset | +| `NOMOS_LOG_LEVEL` | info | Global log level: error, warn, info, debug, trace | +| `NOMOS_LOG_FILTER` | None | Fine-grained module filtering (e.g., `cryptarchia=trace,nomos_da_sampling=debug`) | +| `POL_PROOF_DEV_MODE` | — | **REQUIRED**: Set to `true` for all runners | + +**Note:** Requires circuit assets and host binaries. Use `scripts/run/run-examples.sh host` to handle setup automatically. + +--- + +## Compose Runner (Direct Cargo Run) + +For manual control, run the `compose_runner` binary directly. Compose requires a Docker image with embedded assets. + +### Option 1: Prebuilt Bundle (Recommended) + +```bash +# 1. Build a Linux bundle (includes binaries + circuits) +scripts/build/build-bundle.sh --platform linux +# Creates .tmp/nomos-binaries-linux-v0.3.1.tar.gz + +# 2. Build image (embeds bundle assets) +export NOMOS_BINARIES_TAR=.tmp/nomos-binaries-linux-v0.3.1.tar.gz +scripts/build/build_test_image.sh + +# 3. 
Run +NOMOS_TESTNET_IMAGE=logos-blockchain-testing:local \ +POL_PROOF_DEV_MODE=true \ +cargo run -p runner-examples --bin compose_runner +``` + +### Option 2: Manual Circuit/Image Setup + +```bash +# Fetch and copy circuits +scripts/setup/setup-nomos-circuits.sh v0.3.1 /tmp/nomos-circuits +cp -r /tmp/nomos-circuits/* testing-framework/assets/stack/kzgrs_test_params/ + +# Build image +scripts/build/build_test_image.sh + +# Run +NOMOS_TESTNET_IMAGE=logos-blockchain-testing:local \ +POL_PROOF_DEV_MODE=true \ +cargo run -p runner-examples --bin compose_runner +``` + +### Platform Note (macOS / Apple Silicon) + +- Docker Desktop runs a `linux/arm64` engine by default +- For native performance: `NOMOS_BUNDLE_DOCKER_PLATFORM=linux/arm64` (recommended for local testing) +- For amd64 targets: `NOMOS_BUNDLE_DOCKER_PLATFORM=linux/amd64` (slower via emulation) + +### Compose Runner Environment Variables + +| Variable | Default | Effect | +|----------|---------|--------| +| `NOMOS_TESTNET_IMAGE` | — | Image tag (required, must match built image) | +| `POL_PROOF_DEV_MODE` | — | **REQUIRED**: Set to `true` for all runners | +| `NOMOS_DEMO_VALIDATORS` | 1 | Number of validators | +| `NOMOS_DEMO_EXECUTORS` | 1 | Number of executors | +| `NOMOS_DEMO_RUN_SECS` | 60 | Run duration in seconds | +| `COMPOSE_NODE_PAIRS` | — | Alternative topology format: "validators×executors" (e.g., `3x2`) | +| `NOMOS_METRICS_QUERY_URL` | None | Prometheus-compatible base URL for runner to query | +| `NOMOS_METRICS_OTLP_INGEST_URL` | None | Full OTLP HTTP ingest URL for node metrics export | +| `NOMOS_GRAFANA_URL` | None | Grafana base URL for printing/logging | +| `COMPOSE_RUNNER_HOST` | 127.0.0.1 | Host address for port mappings | +| `COMPOSE_RUNNER_PRESERVE` | 0 | Keep containers running after test | +| `NOMOS_LOG_LEVEL` | info | Node log level (stdout/stderr) | +| `NOMOS_LOG_FILTER` | None | Fine-grained module filtering | + +**Config file option:** `testing-framework/assets/stack/cfgsync.yaml` 
(`tracing_settings.logger`) — Switch node logs between stdout/stderr and file output + +### Compose-Specific Features + +- **Node control support**: Only runner that supports chaos testing (`.enable_node_control()` + chaos workloads) +- **External observability**: Set `NOMOS_METRICS_*` / `NOMOS_GRAFANA_URL` to enable telemetry links and querying + - Quickstart: `scripts/setup/setup-observability.sh compose up` then `scripts/setup/setup-observability.sh compose env` + +**Important:** +- Containers expect KZG parameters at `/kzgrs_test_params/kzgrs_test_params` (note the repeated filename) +- Use `scripts/run/run-examples.sh compose` to handle all setup automatically + +--- + +## K8s Runner (Direct Cargo Run) + +For manual control, run the `k8s_runner` binary directly. K8s requires the same image setup as Compose. + +### Prerequisites + +1. **Kubernetes cluster** with `kubectl` configured +2. **Test image built** (same as Compose, preferably with prebuilt bundle) +3. **Image available in cluster** (loaded or pushed to registry) + +### Build and Load Image + +```bash +# 1. Build image with bundle (recommended) +scripts/build/build-bundle.sh --platform linux +export NOMOS_BINARIES_TAR=.tmp/nomos-binaries-linux-v0.3.1.tar.gz +scripts/build/build_test_image.sh + +# 2. 
Load into cluster (choose one) +export NOMOS_TESTNET_IMAGE=logos-blockchain-testing:local + +# For kind: +kind load docker-image logos-blockchain-testing:local + +# For minikube: +minikube image load logos-blockchain-testing:local + +# For remote cluster (push to registry): +docker tag logos-blockchain-testing:local your-registry/logos-blockchain-testing:latest +docker push your-registry/logos-blockchain-testing:latest +export NOMOS_TESTNET_IMAGE=your-registry/logos-blockchain-testing:latest +``` + +### Run the Example + +```bash +export NOMOS_TESTNET_IMAGE=logos-blockchain-testing:local +export POL_PROOF_DEV_MODE=true +cargo run -p runner-examples --bin k8s_runner +``` + +### K8s Runner Environment Variables + +| Variable | Default | Effect | +|----------|---------|--------| +| `NOMOS_TESTNET_IMAGE` | — | Image tag (required) | +| `POL_PROOF_DEV_MODE` | — | **REQUIRED**: Set to `true` for all runners | +| `NOMOS_DEMO_VALIDATORS` | 1 | Number of validators | +| `NOMOS_DEMO_EXECUTORS` | 1 | Number of executors | +| `NOMOS_DEMO_RUN_SECS` | 60 | Run duration in seconds | +| `NOMOS_METRICS_QUERY_URL` | None | Prometheus-compatible base URL for runner to query (PromQL) | +| `NOMOS_METRICS_OTLP_INGEST_URL` | None | Full OTLP HTTP ingest URL for node metrics export | +| `NOMOS_GRAFANA_URL` | None | Grafana base URL for printing/logging | +| `K8S_RUNNER_NAMESPACE` | Random | Kubernetes namespace (pin for debugging) | +| `K8S_RUNNER_RELEASE` | Random | Helm release name (pin for debugging) | +| `K8S_RUNNER_NODE_HOST` | — | NodePort host resolution for non-local clusters | +| `K8S_RUNNER_DEBUG` | 0 | Log Helm stdout/stderr for install commands | +| `K8S_RUNNER_PRESERVE` | 0 | Keep namespace/release after run (for debugging) | + +### K8s + Observability (Optional) + +```bash +export NOMOS_METRICS_QUERY_URL=http://your-prometheus:9090 +# Prometheus OTLP receiver example: +export NOMOS_METRICS_OTLP_INGEST_URL=http://your-prometheus:9090/api/v1/otlp/v1/metrics +# Optional: print 
Grafana link in TESTNET_ENDPOINTS +export NOMOS_GRAFANA_URL=http://your-grafana:3000 +cargo run -p runner-examples --bin k8s_runner +``` + +**Notes:** +- `NOMOS_METRICS_QUERY_URL` must be reachable from the runner process (often via `kubectl port-forward`) +- `NOMOS_METRICS_OTLP_INGEST_URL` must be reachable from nodes (pods/containers) and is backend-specific + - Quickstart installer: `scripts/setup/setup-observability.sh k8s install` then `scripts/setup/setup-observability.sh k8s env` + - Optional dashboards: `scripts/setup/setup-observability.sh k8s dashboards` + +### Via `scripts/run/run-examples.sh` (Recommended) + +```bash +scripts/run/run-examples.sh -t 60 -v 3 -e 1 k8s \ + --metrics-query-url http://your-prometheus:9090 \ + --metrics-otlp-ingest-url http://your-prometheus:9090/api/v1/otlp/v1/metrics +``` + +### In Code (Optional) + +```rust +use testing_framework_core::scenario::ScenarioBuilder; +use testing_framework_workflows::ObservabilityBuilderExt as _; + +let plan = ScenarioBuilder::with_node_counts(1, 1) + .with_metrics_query_url_str("http://your-prometheus:9090") + .with_metrics_otlp_ingest_url_str("http://your-prometheus:9090/api/v1/otlp/v1/metrics") + .build(); +``` + +### Important K8s Notes + +- K8s runner mounts `testing-framework/assets/stack/kzgrs_test_params` as a hostPath volume +- File path inside pods: `/kzgrs_test_params/kzgrs_test_params` +- **No node control support yet**: Chaos workloads (`.enable_node_control()`) will fail +- Optimized for local clusters (Docker Desktop K8s / minikube / kind) + - Remote clusters require additional setup (registry push, PV/CSI for assets, etc.) 
+- Use `scripts/run/run-examples.sh k8s` to handle all setup automatically + +## Next Steps + +- [CI Integration](ci-integration.md) — Automate tests in continuous integration +- [Environment Variables](environment-variables.md) — Full variable reference +- [Logging & Observability](logging-observability.md) — Log collection and metrics +- [Troubleshooting](troubleshooting.md) — Common issues and fixes + diff --git a/book/src/running-scenarios.md b/book/src/running-scenarios.md index 6c7d598..9b144d1 100644 --- a/book/src/running-scenarios.md +++ b/book/src/running-scenarios.md @@ -15,4 +15,4 @@ Running a scenario follows the same conceptual flow regardless of environment: Use the same plan across different deployers to compare behavior between local development and CI or cluster settings. For environment prerequisites and -flags, see [Operations](operations.md). +flags, see [Prerequisites & Setup](prerequisites.md) and [Environment Variables](environment-variables.md). diff --git a/book/src/troubleshooting.md b/book/src/troubleshooting.md index d9c0f2a..9488fa0 100644 --- a/book/src/troubleshooting.md +++ b/book/src/troubleshooting.md @@ -9,10 +9,10 @@ - **macOS + Docker Desktop (Apple silicon):** prefer `NOMOS_BUNDLE_DOCKER_PLATFORM=linux/arm64` for local compose/k8s runs to avoid slow/fragile amd64 emulation builds. - **Disk space:** bundle/image builds are storage-heavy. If you see I/O errors or Docker build failures, check free space and prune old artifacts (`.tmp/`, `target/`, and Docker build cache) before retrying. - **K8s runner scope:** the default Helm chart mounts KZG params via `hostPath` and uses a local image tag (`logos-blockchain-testing:local`). This is intended for local clusters (Docker Desktop / minikube / kind), not remote managed clusters without additional setup. - - Quick cleanup: `scripts/clean.sh` (and `scripts/clean.sh --docker` if needed). 
- - Destructive cleanup (last resort): `scripts/clean.sh --docker-system --dangerous` (add `--volumes` if you also want to prune Docker volumes). + - Quick cleanup: `scripts/ops/clean.sh` (and `scripts/ops/clean.sh --docker` if needed). + - Destructive cleanup (last resort): `scripts/ops/clean.sh --docker-system --dangerous` (add `--volumes` if you also want to prune Docker volumes). -**Recommended:** Use `scripts/run-examples.sh` which handles all setup automatically. +**Recommended:** Use `scripts/run/run-examples.sh` which handles all setup automatically. ## Quick Symptom Guide @@ -24,6 +24,475 @@ Common symptoms and likely causes: - **Observability gaps**: metrics or logs unreachable because ports clash or services are not exposed—adjust observability ports and confirm runner wiring. - **Flaky behavior across runs**: mixing chaos with functional smoke tests or inconsistent topology between environments—separate deterministic and chaos scenarios and standardize topology presets. +## What Failure Looks Like + +This section shows what you'll actually see when common issues occur. Each example includes realistic console output and the fix. + +### 1. Missing `POL_PROOF_DEV_MODE=true` (Most Common!) + +**Symptoms:** +- Test "hangs" with no visible progress +- CPU usage spikes to 100% +- Eventually hits timeout after several minutes +- Nodes appear to start but blocks aren't produced + +**What you'll see:** + +``` +$ cargo run -p runner-examples --bin local_runner + Finished dev [unoptimized + debuginfo] target(s) in 0.48s + Running `target/debug/local_runner` +[INFO runner_examples::local_runner] Starting local runner scenario +[INFO testing_framework_runner_local] Launching 3 validators +[INFO testing_framework_runner_local] Waiting for node readiness... +(hangs here for 5+ minutes, CPU at 100%) +thread 'main' panicked at 'readiness timeout expired' +``` + +**Root Cause:** Groth16 proof generation is extremely slow without dev mode. 
The system tries to compute real cryptographic proofs, which can take minutes per block. + +**Fix:** + +```bash +POL_PROOF_DEV_MODE=true cargo run -p runner-examples --bin local_runner +``` + +**Prevention:** Set this in your shell profile or `.env` file so you never forget it. + +--- + +### 2. Missing `versions.env` File + +**Symptoms:** +- Helper scripts fail immediately +- Error about missing file at repo root +- Scripts can't determine which circuit/node versions to use + +**What you'll see:** + +``` +$ scripts/run/run-examples.sh -t 60 -v 1 -e 1 host +ERROR: versions.env not found at repository root +This file is required and should define: + VERSION=<circuit release tag> + NOMOS_NODE_REV=<nomos-node git revision> + NOMOS_BUNDLE_VERSION=<bundle schema version> +``` + +**Root Cause:** Helper scripts need `versions.env` to know which versions to build/fetch. + +**Fix:** Ensure you're in the repository root directory. The `versions.env` file should already exist—verify it's present: + +```bash +cat versions.env +# Should show: +# VERSION=v0.3.1 +# NOMOS_NODE_REV=abc123def456 +# NOMOS_BUNDLE_VERSION=v1 +``` + +--- + +### 3. Missing KZG Circuit Assets (DA Workloads) + +**Symptoms:** +- DA workload tests fail +- Error messages about missing circuit files +- Nodes crash during DA operations + +**What you'll see:** + +``` +$ POL_PROOF_DEV_MODE=true cargo run -p runner-examples --bin local_runner +[INFO testing_framework_runner_local] Starting DA workload +[ERROR nomos_da_dispersal] Failed to load KZG parameters +Error: Custom { kind: NotFound, error: "Circuit file not found at: testing-framework/assets/stack/kzgrs_test_params/kzgrs_test_params" } +thread 'main' panicked at 'workload init failed' +``` + +**Root Cause:** DA (Data Availability) workloads require KZG cryptographic parameters. The file must exist at: `testing-framework/assets/stack/kzgrs_test_params/kzgrs_test_params` (note the repeated filename). 
+ +**Fix (recommended):** + +```bash +# Use run-examples.sh which handles setup automatically +scripts/run/run-examples.sh -t 60 -v 1 -e 1 host +``` + +**Fix (manual):** + +```bash +# Fetch circuits +scripts/setup/setup-nomos-circuits.sh v0.3.1 /tmp/nomos-circuits + +# Copy to expected location +mkdir -p testing-framework/assets/stack/kzgrs_test_params +cp -r /tmp/nomos-circuits/* testing-framework/assets/stack/kzgrs_test_params/ + +# Verify (should be ~120MB) +ls -lh testing-framework/assets/stack/kzgrs_test_params/kzgrs_test_params +``` + +--- + +### 4. Node Binaries Not Found + +**Symptoms:** +- Error about missing `nomos-node` or `nomos-executor` binary +- "file not found" or "no such file or directory" +- Environment variables `NOMOS_NODE_BIN` / `NOMOS_EXECUTOR_BIN` not set + +**What you'll see:** + +``` +$ POL_PROOF_DEV_MODE=true cargo run -p runner-examples --bin local_runner +[INFO testing_framework_runner_local] Spawning validator 0 +Error: Os { code: 2, kind: NotFound, message: "No such file or directory" } +thread 'main' panicked at 'failed to spawn nomos-node process' +``` + +**Root Cause:** The local runner needs compiled `nomos-node` and `nomos-executor` binaries, but doesn't know where they are. + +**Fix (recommended):** + +```bash +# Use run-examples.sh which builds binaries automatically +scripts/run/run-examples.sh -t 60 -v 1 -e 1 host +``` + +**Fix (manual - set paths explicitly):** + +```bash +# Build binaries first +cd ../nomos-node # or wherever your nomos-node checkout is +cargo build --release --bin nomos-node --bin nomos-executor + +# Set environment variables +export NOMOS_NODE_BIN=$PWD/target/release/nomos-node +export NOMOS_EXECUTOR_BIN=$PWD/target/release/nomos-executor + +# Return to testing framework +cd ../nomos-testing +POL_PROOF_DEV_MODE=true cargo run -p runner-examples --bin local_runner +``` + +--- + +### 5. 
Docker Daemon Not Running (Compose) + +**Symptoms:** +- Compose tests fail immediately +- "Cannot connect to Docker daemon" +- Docker commands don't work + +**What you'll see:** + +``` +$ scripts/run/run-examples.sh -t 60 -v 1 -e 1 compose +[INFO runner_examples::compose_runner] Starting compose deployment +Error: Cannot connect to the Docker daemon at unix:///var/run/docker.sock. Is the docker daemon running? +thread 'main' panicked at 'compose deployment failed' +``` + +**Root Cause:** Docker Desktop isn't running, or your user doesn't have permission to access Docker. + +**Fix:** + +```bash +# macOS: Start Docker Desktop application +open -a Docker + +# Linux: Start Docker daemon +sudo systemctl start docker + +# Verify Docker is working +docker ps + +# If permission denied, add your user to docker group (Linux) +sudo usermod -aG docker $USER +# Then log out and log back in +``` + +--- + +### 6. Image Not Found (Compose/K8s) + +**Symptoms:** +- Compose/K8s tests fail during deployment +- "Image not found: logos-blockchain-testing:local" +- Containers fail to start + +**What you'll see:** + +``` +$ POL_PROOF_DEV_MODE=true cargo run -p runner-examples --bin compose_runner +[INFO testing_framework_runner_compose] Starting compose deployment +Error: Failed to pull image 'logos-blockchain-testing:local': No such image +thread 'main' panicked at 'compose deployment failed' +``` + +**Root Cause:** The Docker image hasn't been built yet, or was pruned. + +**Fix (recommended):** + +```bash +# Use run-examples.sh which builds the image automatically +scripts/run/run-examples.sh -t 60 -v 1 -e 1 compose +``` + +**Fix (manual):** + +```bash +# 1. Build Linux bundle +scripts/build/build-bundle.sh --platform linux + +# 2. Set bundle path +export NOMOS_BINARIES_TAR=$(ls -t .tmp/nomos-binaries-linux-*.tar.gz | head -1) + +# 3. Build Docker image +scripts/build/build_test_image.sh + +# 4. Verify image exists +docker images | grep logos-blockchain-testing + +# 5. 
For kind/minikube: load image into cluster +kind load docker-image logos-blockchain-testing:local +# OR: minikube image load logos-blockchain-testing:local +``` + +--- + +### 7. Port Conflicts + +**Symptoms:** +- "Address already in use" errors +- Tests fail during node startup +- Observability stack (Prometheus/Grafana) won't start + +**What you'll see:** + +``` +$ POL_PROOF_DEV_MODE=true cargo run -p runner-examples --bin local_runner +[INFO testing_framework_runner_local] Launching validator 0 on port 18080 +Error: Os { code: 48, kind: AddrInUse, message: "Address already in use" } +thread 'main' panicked at 'failed to bind port 18080' +``` + +**Root Cause:** Previous test didn't clean up properly, or another service is using the port. + +**Fix:** + +```bash +# Find processes using the port +lsof -i :18080 # macOS/Linux +netstat -ano | findstr :18080 # Windows + +# Kill orphaned nomos processes +pkill nomos-node +pkill nomos-executor + +# For compose: ensure containers are stopped +docker compose down +docker ps -a --filter "name=nomos-compose-" -q | xargs docker rm -f + +# Check if port is now free +lsof -i :18080 # Should return nothing +``` + +**For Observability Stack Port Conflicts:** + +```bash +# Edit ports in observability compose file +vim scripts/observability/compose/docker-compose.yml + +# Change conflicting port mappings: +# ports: +# - "9090:9090" # Prometheus - change to "19090:9090" if needed +# - "3000:3000" # Grafana - change to "13000:3000" if needed +``` + +--- + +### 8. 
Wallet Seeding Failed (Insufficient Funds) + +**Symptoms:** +- Transaction workload reports wallet issues +- "Insufficient funds" errors +- Transactions aren't being submitted + +**What you'll see:** + +``` +$ POL_PROOF_DEV_MODE=true cargo run -p runner-examples --bin local_runner +[INFO testing_framework_workflows] Starting transaction workload with 10 users +[ERROR testing_framework_workflows] Wallet seeding failed: requested 10 users but only 3 wallets available +thread 'main' panicked at 'workload init failed: insufficient wallets' +``` + +**Root Cause:** Topology configured fewer wallets than the workload needs. Transaction workload has `.users(M)` but topology only has `.wallets(N)` where N < M. + +**Fix:** + +```rust +// In your scenario: +let scenario = Scenario::builder("my_test") + .topology( + Topology::preset_3v1e() + .wallets(20) // ← Increase wallet count + ) + .workload(TransactionWorkload::new() + .users(10) // ← Must be ≤ wallets(20) + .rate(5.0)) + .build(); +``` + +--- + +### 9. Resource Exhaustion (OOM / CPU) + +**Symptoms:** +- Nodes crash randomly +- "OOM Killed" messages +- Test becomes flaky under load +- Docker containers restart repeatedly + +**What you'll see:** + +``` +$ docker ps --filter "name=nomos-compose-" +CONTAINER ID STATUS +abc123def456 Restarting (137) 30 seconds ago # 137 = OOM killed + +$ docker logs abc123def456 +[INFO nomos_node] Starting validator +[INFO consensus] Processing block +Killed # ← OOM killer terminated the process +``` + +**Root Cause:** Too many nodes, too much workload traffic, or insufficient Docker resources. + +**Fix:** + +```bash +# 1. Reduce topology size +# In your scenario: +# .topology(Topology::preset_3v1e()) # Instead of preset_10v2e() + +# 2. Reduce workload rates +# .workload(TransactionWorkload::new().rate(5.0)) # Instead of rate(100.0) + +# 3. 
Increase Docker resources (Docker Desktop) +# Settings → Resources → Memory: 8GB minimum (12GB+ recommended for large topologies) +# Settings → Resources → CPUs: 4+ cores recommended + +# 4. Increase file descriptor limits (Linux/macOS) +ulimit -n 4096 + +# 5. Close other heavy applications (browsers, IDEs, etc.) +``` + +--- + +### 10. Logs Disappear After Run + +**Symptoms:** +- Test completes but no logs on disk +- Can't debug failures because logs are gone +- Temporary directories cleaned up automatically + +**What you'll see:** + +``` +$ POL_PROOF_DEV_MODE=true cargo run -p runner-examples --bin local_runner +[INFO runner_examples] Test complete, cleaning up +[INFO testing_framework_runner_local] Removing temporary directories +$ ls .tmp/ +# Empty or missing +``` + +**Root Cause:** Framework cleans up temporary directories by default to avoid disk bloat. + +**Fix:** + +```bash +# Persist logs to a specific directory +NOMOS_LOG_DIR=/tmp/test-logs \ +NOMOS_TESTS_KEEP_LOGS=1 \ +POL_PROOF_DEV_MODE=true \ +cargo run -p runner-examples --bin local_runner + +# Logs persist after run +ls /tmp/test-logs/ +# nomos-node-0.2024-12-18T14-30-00.log +# nomos-node-1.2024-12-18T14-30-00.log +# ... +``` + +--- + +### 11. Consensus Timing Too Tight / Run Duration Too Short + +**Symptoms:** +- "Consensus liveness expectation failed" +- Only 1-2 blocks produced (or zero) +- Nodes appear healthy but not making progress + +**What you'll see:** + +``` +$ POL_PROOF_DEV_MODE=true cargo run -p runner-examples --bin local_runner +[INFO testing_framework_core] Starting workloads +[INFO testing_framework_core] Run window: 10 seconds +[INFO testing_framework_core] Evaluating expectations +[ERROR testing_framework_core] Consensus liveness expectation failed: expected min 5 blocks, got 1 +thread 'main' panicked at 'expectations failed' +``` + +**Root Cause:** Run duration too short for consensus parameters. 
If `CONSENSUS_SLOT_TIME=20s` but run duration is only `10s`, you can't produce many blocks. + +**Fix:** + +```rust +// Increase run duration to allow more blocks +let scenario = Scenario::builder("my_test") + .topology(Topology::preset_3v1e()) + .with_run_duration(Duration::from_secs(120)) // ← Give more time + .expectation(ConsensusLiveness::new() + .min_blocks(5) // ← Adjust expectation to match duration + ) + .build(); +``` + +**Or adjust consensus timing (if you control node config):** + +```bash +# Faster block production (shorter slot time) +CONSENSUS_SLOT_TIME=5 \ +CONSENSUS_ACTIVE_SLOT_COEFF=0.9 \ +POL_PROOF_DEV_MODE=true \ +cargo run -p runner-examples --bin local_runner +``` + +--- + +## Summary: Quick Checklist for Failed Runs + +When a test fails, check these in order: + +1. ✅ **`POL_PROOF_DEV_MODE=true` is set** (REQUIRED for all runners) +2. ✅ **`versions.env` exists at repo root** +3. ✅ **KZG circuit assets present** (for DA workloads): `testing-framework/assets/stack/kzgrs_test_params/kzgrs_test_params` +4. ✅ **Node binaries available** (`NOMOS_NODE_BIN` / `NOMOS_EXECUTOR_BIN` set, or using `run-examples.sh`) +5. ✅ **Docker daemon running** (for compose/k8s) +6. ✅ **Docker image built** (`logos-blockchain-testing:local` exists for compose/k8s) +7. ✅ **No port conflicts** (`lsof -i :18080`, kill orphaned processes) +8. ✅ **Sufficient wallets** (`.wallets(N)` ≥ `.users(M)`) +9. ✅ **Enough resources** (Docker memory 8GB+, ulimit -n 4096) +10. ✅ **Run duration appropriate** (long enough for consensus timing) +11. ✅ **Logs persisted** (`NOMOS_LOG_DIR` + `NOMOS_TESTS_KEEP_LOGS=1` if needed) + +**Still stuck?** Check node logs (see [Where to Find Logs](#where-to-find-logs)) for the actual error. 
+ ## Where to Find Logs ### Log Location Quick Reference @@ -87,7 +556,7 @@ POL_PROOF_DEV_MODE=true \ cargo run -p runner-examples --bin compose_runner # OR: Use run-examples.sh (handles setup automatically) -COMPOSE_RUNNER_PRESERVE=1 scripts/run-examples.sh -t 60 -v 1 -e 1 compose +COMPOSE_RUNNER_PRESERVE=1 scripts/run/run-examples.sh -t 60 -v 1 -e 1 compose # After test failure, containers remain running: docker ps --filter "name=nomos-compose-" @@ -289,12 +758,12 @@ Run a minimal baseline test (e.g., 2 validators, consensus liveness only). If it baked into the image. - **Fix (recommended)**: Use run-examples.sh which handles everything: ```bash - scripts/run-examples.sh -t 60 -v 1 -e 1 compose + scripts/run/run-examples.sh -t 60 -v 1 -e 1 compose ``` - **Fix (manual)**: - 1. Build bundle: `scripts/build-bundle.sh --platform linux` + 1. Build bundle: `scripts/build/build-bundle.sh --platform linux` 2. Set bundle path: `export NOMOS_BINARIES_TAR=.tmp/nomos-binaries-linux-v0.3.1.tar.gz` - 3. Build image: `scripts/build_test_image.sh` + 3. Build image: `scripts/build/build_test_image.sh` 4. **kind/minikube:** load the image into the cluster nodes (e.g. `kind load docker-image logos-blockchain-testing:local`, or `minikube image load ...`), or push to a registry and set `NOMOS_TESTNET_IMAGE` accordingly. ### "Failed to load KZG parameters" or "Circuit file not found" @@ -302,12 +771,12 @@ Run a minimal baseline test (e.g., 2 validators, consensus liveness only). If it - **Cause**: DA workload requires KZG circuit assets. The file `testing-framework/assets/stack/kzgrs_test_params/kzgrs_test_params` (note repeated filename) must exist. Inside containers, it's at `/kzgrs_test_params/kzgrs_test_params`. - **Fix (recommended)**: Use run-examples.sh which handles setup: ```bash - scripts/run-examples.sh -t 60 -v 1 -e 1 + scripts/run/run-examples.sh -t 60 -v 1 -e 1 ``` - **Fix (manual)**: - 1. Fetch assets: `scripts/setup-nomos-circuits.sh v0.3.1 /tmp/nomos-circuits` + 1. 
Fetch assets: `scripts/setup/setup-nomos-circuits.sh v0.3.1 /tmp/nomos-circuits` 2. Copy to expected path: `cp -r /tmp/nomos-circuits/* testing-framework/assets/stack/kzgrs_test_params/` 3. Verify file exists: `ls -lh testing-framework/assets/stack/kzgrs_test_params/kzgrs_test_params` 4. For Compose/K8s: rebuild image with assets baked in -For detailed logging configuration and observability setup, see [Operations](operations.md). +For detailed logging configuration and observability setup, see [Logging & Observability](logging-observability.md). diff --git a/book/src/workloads.md b/book/src/workloads.md index 99f1414..b552410 100644 --- a/book/src/workloads.md +++ b/book/src/workloads.md @@ -21,7 +21,7 @@ scenarios stay readable and purpose-driven. criteria (e.g., inclusion of submitted activity) so scenarios remain concise. Together, workloads and expectations let you express both the pressure applied -to the system and the definition of “healthy” for that run. +to the system and the definition of "healthy" for that run. ```mermaid flowchart TD @@ -30,3 +30,9 @@ flowchart TD Drive --> Collect[Collect signals] Collect --> Eval[Expectations evaluate] ``` + +## See Also + +- **[RunContext: BlockFeed & Node Control](node-control.md)** — Learn how to use BlockFeed in expectations to observe blocks in real-time, and how to access node control for chaos testing +- **[Examples](examples.md)** — Concrete scenario patterns combining workloads and expectations +- **[Extending the Framework](extending.md)** — Implement custom workloads and expectations diff --git a/scripts/build-bundle.sh b/scripts/build/build-bundle.sh similarity index 97% rename from scripts/build-bundle.sh rename to scripts/build/build-bundle.sh index 8007c6e..b9a71b2 100755 --- a/scripts/build-bundle.sh +++ b/scripts/build/build-bundle.sh @@ -6,7 +6,7 @@ if [ -z "${BASH_VERSION:-}" ]; then fi # shellcheck disable=SC1091 -. "$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)/common.sh" +. 
"$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)/../lib/common.sh" readonly DOCKER_RUST_IMAGE="rust:1.80-bullseye" declare -ar DOCKER_APT_PACKAGES=( @@ -25,7 +25,7 @@ declare -ar DOCKER_APT_PACKAGES=( build_bundle::usage() { cat <<'USAGE' -Usage: scripts/build-bundle.sh [--platform host|linux] [--output PATH] +Usage: scripts/build/build-bundle.sh [--platform host|linux] [--output PATH] Options: --platform Target platform for binaries (default: host) @@ -277,7 +277,7 @@ build_bundle::maybe_run_linux_build_in_docker() { "${extra_mounts[@]}" \ -w /workspace \ "${DOCKER_RUST_IMAGE}" \ - bash -c "apt-get update && apt-get install -y ${DOCKER_APT_PACKAGES[*]} && ./scripts/build-bundle.sh --platform linux --output \"${container_output}\" ${src_args[*]} ${features_args[*]}" + bash -c "apt-get update && apt-get install -y ${DOCKER_APT_PACKAGES[*]} && ./scripts/build/build-bundle.sh --platform linux --output \"${container_output}\" ${src_args[*]} ${features_args[*]}" exit 0 } @@ -316,7 +316,7 @@ build_bundle::prepare_circuits() { echo "Circuits already present at ${CIRCUITS_DIR}; skipping download" else STACK_DIR="${CIRCUITS_DIR}" HOST_DIR="${CIRCUITS_DIR}" \ - "${ROOT_DIR}/scripts/setup-circuits-stack.sh" "${VERSION}" Building Linux bundle" - VERSION="${VERSION}" "${ROOT_DIR}/scripts/build-bundle.sh" "${BUILD_ARGS[@]}" + VERSION="${VERSION}" "${ROOT_DIR}/scripts/build/build-bundle.sh" "${BUILD_ARGS[@]}" BUNDLE_TAR="${OUTPUT_TAR}" } diff --git a/scripts/build-rapidsnark.sh b/scripts/build/build-rapidsnark.sh similarity index 98% rename from scripts/build-rapidsnark.sh rename to scripts/build/build-rapidsnark.sh index f9abd39..cc1775d 100755 --- a/scripts/build-rapidsnark.sh +++ b/scripts/build/build-rapidsnark.sh @@ -2,7 +2,7 @@ # # Rebuild the rapidsnark prover for the current architecture. 
# -# Usage: ./scripts/build-rapidsnark.sh +# Usage: ./scripts/build/build-rapidsnark.sh set -euo pipefail diff --git a/scripts/build_test_image.sh b/scripts/build/build_test_image.sh similarity index 98% rename from scripts/build_test_image.sh rename to scripts/build/build_test_image.sh index e71e836..aea541c 100755 --- a/scripts/build_test_image.sh +++ b/scripts/build/build_test_image.sh @@ -6,11 +6,11 @@ if [ -z "${BASH_VERSION:-}" ]; then fi # shellcheck disable=SC1091 -. "$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)/common.sh" +. "$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)/../lib/common.sh" build_test_image::usage() { cat <<'USAGE' -Usage: scripts/build_test_image.sh [options] +Usage: scripts/build/build_test_image.sh [options] Builds the compose/k8s test image (bakes in binaries + circuit assets). diff --git a/scripts/common.sh b/scripts/lib/common.sh similarity index 100% rename from scripts/common.sh rename to scripts/lib/common.sh diff --git a/scripts/clean.sh b/scripts/ops/clean.sh similarity index 96% rename from scripts/clean.sh rename to scripts/ops/clean.sh index 63c3b92..f106de0 100755 --- a/scripts/clean.sh +++ b/scripts/ops/clean.sh @@ -6,11 +6,11 @@ if [ -z "${BASH_VERSION:-}" ]; then fi # shellcheck disable=SC1091 -. "$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)/common.sh" +. "$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)/../lib/common.sh" clean::usage() { cat <<'USAGE' -Usage: scripts/clean.sh [options] +Usage: scripts/ops/clean.sh [options] Removes local build artifacts that commonly cause disk pressure and flaky Docker builds. diff --git a/scripts/push-ecr-test.sh b/scripts/ops/push-ecr-test.sh similarity index 91% rename from scripts/push-ecr-test.sh rename to scripts/ops/push-ecr-test.sh index 80a7924..172ad0c 100755 --- a/scripts/push-ecr-test.sh +++ b/scripts/ops/push-ecr-test.sh @@ -1,7 +1,7 @@ #!/usr/bin/env bash set -euo pipefail -ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." 
&& pwd)" +ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)" readonly DEFAULT_TAG="test" readonly DEFAULT_ECR_IMAGE_REPO="public.ecr.aws/r4s5t9y4/logos/logos-blockchain" @@ -32,7 +32,7 @@ export DOCKER_DEFAULT_PLATFORM="${DEFAULT_DOCKER_PLATFORM}" export CIRCUITS_PLATFORM="${CIRCUITS_PLATFORM:-${DEFAULT_CIRCUITS_PLATFORM}}" export IMAGE_TAG="${REMOTE_IMAGE}" - "${ROOT_DIR}/scripts/build_test_image.sh" --dockerfile "${ROOT_DIR}/testing-framework/assets/stack/Dockerfile.testnet" + "${ROOT_DIR}/scripts/build/build_test_image.sh" --dockerfile "${ROOT_DIR}/testing-framework/assets/stack/Dockerfile.testnet" if [[ "${ECR_IMAGE_REPO}" == ${PUBLIC_ECR_HOST}/* ]]; then aws ecr-public get-login-password --region "${AWS_REGION}" \ diff --git a/scripts/update-nomos-rev.sh b/scripts/ops/update-nomos-rev.sh similarity index 96% rename from scripts/update-nomos-rev.sh rename to scripts/ops/update-nomos-rev.sh index ef5f098..3e75512 100755 --- a/scripts/update-nomos-rev.sh +++ b/scripts/ops/update-nomos-rev.sh @@ -6,14 +6,14 @@ if [ -z "${BASH_VERSION:-}" ]; then fi # shellcheck disable=SC1091 -. "$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)/common.sh" +. "$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)/../lib/common.sh" update_nomos_rev::usage() { cat <<'EOF' Usage: - scripts/update-nomos-rev.sh --rev - scripts/update-nomos-rev.sh --path - scripts/update-nomos-rev.sh --unskip-worktree + scripts/ops/update-nomos-rev.sh --rev + scripts/ops/update-nomos-rev.sh --path + scripts/ops/update-nomos-rev.sh --unskip-worktree Notes: --rev sets NOMOS_NODE_REV and updates Cargo.toml revs diff --git a/scripts/checks.sh b/scripts/run/checks.sh similarity index 97% rename from scripts/checks.sh rename to scripts/run/checks.sh index 90b478b..e7af547 100755 --- a/scripts/checks.sh +++ b/scripts/run/checks.sh @@ -6,11 +6,11 @@ if [ -z "${BASH_VERSION:-}" ]; then fi # shellcheck disable=SC1091 -. "$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)/common.sh" +. 
"$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)/../lib/common.sh" checks::usage() { cat <<'USAGE' -Usage: scripts/checks.sh [--help] +Usage: scripts/run/checks.sh [--help] Runs a best-effort local environment sanity check for the testing framework (assets, Rust, Docker, Kubernetes). @@ -97,7 +97,7 @@ checks::print_kzg_params() { if [ -f "${host_kzg_path}" ]; then checks::ok "KZG params file exists" else - checks::warn "KZG params file missing (DA workloads will fail); run: scripts/run-examples.sh (auto) or scripts/setup-nomos-circuits.sh" + checks::warn "KZG params file missing (DA workloads will fail); run: scripts/run/run-examples.sh (auto) or scripts/setup/setup-nomos-circuits.sh" fi } @@ -295,7 +295,7 @@ checks::main() { checks::print_debug_flags checks::section "Done" - checks::say "If something looks off, start with: scripts/run-examples.sh -t 60 -v 1 -e 1" + checks::say "If something looks off, start with: scripts/run/run-examples.sh -t 60 -v 1 -e 1" } if [[ "${BASH_SOURCE[0]}" == "$0" ]]; then diff --git a/scripts/run-examples.sh b/scripts/run/run-examples.sh similarity index 98% rename from scripts/run-examples.sh rename to scripts/run/run-examples.sh index 45c05bd..0515d18 100755 --- a/scripts/run-examples.sh +++ b/scripts/run/run-examples.sh @@ -6,7 +6,7 @@ if [ -z "${BASH_VERSION:-}" ]; then fi # shellcheck disable=SC1091 -. "$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)/common.sh" +. 
"$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)/../lib/common.sh" readonly DEFAULT_KZG_DIR_REL="testing-framework/assets/stack/kzgrs_test_params" readonly DEFAULT_KZG_FILE="kzgrs_test_params" @@ -35,7 +35,7 @@ trap run_examples::cleanup EXIT run_examples::usage() { cat < Building fresh binaries bundle (${platform}) at ${tar_path}" - "${ROOT_DIR}/scripts/build-bundle.sh" --platform "${platform}" --output "${tar_path}" --rev "${NOMOS_NODE_REV}" + "${ROOT_DIR}/scripts/build/build-bundle.sh" --platform "${platform}" --output "${tar_path}" --rev "${NOMOS_NODE_REV}" } run_examples::prepare_bundles() { @@ -480,7 +480,7 @@ run_examples::maybe_rebuild_image() { echo "==> Rebuilding testnet image (${IMAGE})" IMAGE_TAG="${IMAGE}" COMPOSE_CIRCUITS_PLATFORM="${COMPOSE_CIRCUITS_PLATFORM:-}" \ - bash "${ROOT_DIR}/scripts/build_test_image.sh" + bash "${ROOT_DIR}/scripts/build/build_test_image.sh" } run_examples::maybe_restore_host_after_image() { diff --git a/scripts/run-test-matrix.sh b/scripts/run/run-test-matrix.sh similarity index 87% rename from scripts/run-test-matrix.sh rename to scripts/run/run-test-matrix.sh index 814f5d6..a8da2ac 100755 --- a/scripts/run-test-matrix.sh +++ b/scripts/run/run-test-matrix.sh @@ -6,11 +6,11 @@ if [ -z "${BASH_VERSION:-}" ]; then fi # shellcheck disable=SC1091 -. "$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)/common.sh" +. "$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)/../lib/common.sh" matrix::usage() { cat <<'USAGE' -Usage: scripts/run-test-matrix.sh [options] +Usage: scripts/run/run-test-matrix.sh [options] Runs a small matrix of runner examples (host/compose/k8s) with and without image rebuilds (where it makes sense), after cleaning and rebuilding bundles. 
@@ -20,11 +20,11 @@ Options: -v, --validators N Validators (default: 1) -e, --executors N Executors (default: 1) --modes LIST Comma-separated: host,compose,k8s (default: host,compose,k8s) - --no-clean Skip scripts/clean.sh step - --no-bundles Skip scripts/build-bundle.sh (uses existing .tmp tarballs) + --no-clean Skip scripts/ops/clean.sh step + --no-bundles Skip scripts/build/build-bundle.sh (uses existing .tmp tarballs) --force-k8s-image-build Allow the k8s "rebuild image" run even on non-docker-desktop clusters - --metrics-query-url URL Forwarded to scripts/run-examples.sh (optional) - --metrics-otlp-ingest-url URL Forwarded to scripts/run-examples.sh (optional) + --metrics-query-url URL Forwarded to scripts/run/run-examples.sh (optional) + --metrics-otlp-ingest-url URL Forwarded to scripts/run/run-examples.sh (optional) -h, --help Show this help Notes: @@ -108,7 +108,8 @@ matrix::run_case() { shift local log="${LOG_DIR}/${name}.log" - echo "==> [${name}] $(date -Is)" + mkdir -p "$(dirname "${log}")" + echo "==> [${name}] $(date -u +'%Y-%m-%dT%H:%M:%SZ')" echo "==> [${name}] cmd: $*" local start end status @@ -156,13 +157,13 @@ matrix::main() { if [ "${DO_CLEAN}" -eq 1 ]; then echo "==> Cleaning workspace artifacts" - "${ROOT_DIR}/scripts/clean.sh" --tmp --target --docker + "${ROOT_DIR}/scripts/ops/clean.sh" --tmp --target --docker fi if [ "${DO_BUNDLES}" -eq 1 ]; then echo "==> Building bundles (host + linux)" - "${ROOT_DIR}/scripts/build-bundle.sh" --platform host - "${ROOT_DIR}/scripts/build-bundle.sh" --platform linux + "${ROOT_DIR}/scripts/build/build-bundle.sh" --platform host + "${ROOT_DIR}/scripts/build/build-bundle.sh" --platform linux fi CASE_NAMES=() @@ -177,20 +178,20 @@ matrix::main() { case "${mode}" in host) matrix::run_case "host" \ - "${ROOT_DIR}/scripts/run-examples.sh" \ + "${ROOT_DIR}/scripts/run/run-examples.sh" \ -t "${RUN_SECS}" -v "${VALIDATORS}" -e "${EXECUTORS}" \ "${forward[@]}" \ host ;; compose) matrix::run_case "compose.image_build" 
\ - "${ROOT_DIR}/scripts/run-examples.sh" \ + "${ROOT_DIR}/scripts/run/run-examples.sh" \ -t "${RUN_SECS}" -v "${VALIDATORS}" -e "${EXECUTORS}" \ "${forward[@]}" \ compose matrix::run_case "compose.skip_image_build" \ - "${ROOT_DIR}/scripts/run-examples.sh" \ + "${ROOT_DIR}/scripts/run/run-examples.sh" \ --no-image-build \ -t "${RUN_SECS}" -v "${VALIDATORS}" -e "${EXECUTORS}" \ "${forward[@]}" \ @@ -211,7 +212,7 @@ matrix::main() { export NOMOS_FORCE_IMAGE_BUILD=1 fi matrix::run_case "k8s.image_build" \ - "${ROOT_DIR}/scripts/run-examples.sh" \ + "${ROOT_DIR}/scripts/run/run-examples.sh" \ -t "${RUN_SECS}" -v "${VALIDATORS}" -e "${EXECUTORS}" \ "${forward[@]}" \ k8s @@ -221,7 +222,7 @@ matrix::main() { fi matrix::run_case "k8s.skip_image_build" \ - "${ROOT_DIR}/scripts/run-examples.sh" \ + "${ROOT_DIR}/scripts/run/run-examples.sh" \ --no-image-build \ -t "${RUN_SECS}" -v "${VALIDATORS}" -e "${EXECUTORS}" \ "${forward[@]}" \ diff --git a/scripts/setup-circuits-stack.sh b/scripts/setup/setup-circuits-stack.sh similarity index 95% rename from scripts/setup-circuits-stack.sh rename to scripts/setup/setup-circuits-stack.sh index b4ffcb4..358fc12 100755 --- a/scripts/setup-circuits-stack.sh +++ b/scripts/setup/setup-circuits-stack.sh @@ -6,7 +6,7 @@ if [ -z "${BASH_VERSION:-}" ]; then fi # shellcheck disable=SC1091 -. "$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)/common.sh" +. "$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)/../lib/common.sh" readonly DEFAULT_CIRCUITS_VERSION="v0.3.1" readonly DEFAULT_LINUX_PLATFORM="linux-x86_64" @@ -18,7 +18,7 @@ readonly RAW_GITHUB_BASE_URL="https://raw.githubusercontent.com" setup_circuits_stack::usage() { cat <<'EOF' -Usage: scripts/setup-circuits-stack.sh [VERSION] +Usage: scripts/setup/setup-circuits-stack.sh [VERSION] Prepares circuits for both the Docker image (Linux/x86_64) and the host (for witness generators). 
@@ -73,7 +73,7 @@ setup_circuits_stack::fetch_bundle() { NOMOS_CIRCUITS_PLATFORM="${platform}" \ NOMOS_CIRCUITS_REBUILD_RAPIDSNARK="${rebuild}" \ - "${ROOT_DIR}/scripts/setup-nomos-circuits.sh" "${VERSION}" "${dest}" + "${ROOT_DIR}/scripts/setup/setup-nomos-circuits.sh" "${VERSION}" "${dest}" } setup_circuits_stack::fetch_kzg_params() { @@ -166,7 +166,7 @@ setup_circuits_stack::main() { Done. - For Docker/compose: rebuild the image to bake the Linux bundle: - scripts/build_test_image.sh + scripts/build/build_test_image.sh - For host runs (e.g., compose_runner): ensure NOMOS_CIRCUITS points to the host bundle above. EOF } diff --git a/scripts/setup-nomos-circuits.sh b/scripts/setup/setup-nomos-circuits.sh similarity index 98% rename from scripts/setup-nomos-circuits.sh rename to scripts/setup/setup-nomos-circuits.sh index 198a5b3..1098fe9 100755 --- a/scripts/setup-nomos-circuits.sh +++ b/scripts/setup/setup-nomos-circuits.sh @@ -28,7 +28,7 @@ readonly ICON_ERR="✗" setup_nomos_circuits::usage() { cat < { warn!(image, status = ?code, "test image build failed"); Err(ComposeRunnerError::Compose(ComposeCommandError::Failed { - command: String::from("scripts/build_test_image.sh"), + command: String::from("scripts/build/build_test_image.sh"), status: code, })) }