diff --git a/book/src/operations.md b/book/src/operations.md index a547d4e..f72eb31 100644 --- a/book/src/operations.md +++ b/book/src/operations.md @@ -427,6 +427,7 @@ cargo run -p runner-examples --bin compose_runner - `COMPOSE_RUNNER_HOST_GATEWAY=host.docker.internal:host-gateway` — controls the `extra_hosts` entry injected into compose (set to `disable` to omit) - `TESTNET_RUNNER_PRESERVE=1` — alias for `COMPOSE_RUNNER_PRESERVE=1` - `COMPOSE_GRAFANA_PORT=` — pin Grafana to a fixed host port instead of ephemeral assignment +- `COMPOSE_RUNNER_HTTP_TIMEOUT_SECS=` — override compose node HTTP readiness timeout **Note:** Container names follow pattern `nomos-compose-{uuid}-validator-{index}-1` where `{uuid}` changes per run. @@ -466,7 +467,8 @@ kubectl logs nomos-executor-1 > executor-1.log - Debug helpers: - `K8S_RUNNER_DEBUG=1` — logs Helm stdout/stderr for install commands. - `K8S_RUNNER_PRESERVE=1` — keep the namespace/release after the run. - - `K8S_RUNNER_NODE_HOST=` — override NodePort host resolution for non-local clusters. +- `K8S_RUNNER_NODE_HOST=` — override NodePort host resolution for non-local clusters. +- `K8S_RUNNER_NAMESPACE=` / `K8S_RUNNER_RELEASE=` — pin namespace/release instead of random IDs (useful for debugging) **Specify namespace (if not using default):** ```bash diff --git a/book/src/troubleshooting.md b/book/src/troubleshooting.md index 21d6c1b..2a37006 100644 --- a/book/src/troubleshooting.md +++ b/book/src/troubleshooting.md @@ -10,6 +10,7 @@ - **Disk space:** bundle/image builds are storage-heavy. If you see I/O errors or Docker build failures, check free space and prune old artifacts (`.tmp/`, `target/`, and Docker build cache) before retrying. - **K8s runner scope:** the default Helm chart mounts KZG params via `hostPath` and uses a local image tag (`logos-blockchain-testing:local`). This is intended for local clusters (Docker Desktop / minikube / kind), not remote managed clusters without additional setup. - Quick cleanup: `scripts/clean` (and `scripts/clean --docker` if needed). + - Destructive cleanup (last resort): `scripts/clean --docker-system --dangerous` (add `--volumes` if you also want to prune Docker volumes). **Recommended:** Use `scripts/run-examples.sh` which handles all setup automatically. diff --git a/examples/src/bin/compose_runner.rs b/examples/src/bin/compose_runner.rs index 8e48ba3..f19c7de 100644 --- a/examples/src/bin/compose_runner.rs +++ b/examples/src/bin/compose_runner.rs @@ -94,6 +94,7 @@ async fn run_compose_case( let deployer = ComposeDeployer::new(); info!("deploying compose stack"); + let runner: Runner = match deployer.deploy(&plan).await { Ok(runner) => runner, Err(ComposeRunnerError::DockerUnavailable) => { diff --git a/examples/src/bin/k8s_runner.rs b/examples/src/bin/k8s_runner.rs index e2fa81c..33d0b22 100644 --- a/examples/src/bin/k8s_runner.rs +++ b/examples/src/bin/k8s_runner.rs @@ -67,6 +67,7 @@ async fn run_k8s_case( let deployer = K8sDeployer::new(); info!("deploying k8s stack"); + let runner: Runner = match deployer.deploy(&plan).await { Ok(runner) => runner, Err(K8sRunnerError::ClientInit { source }) => { diff --git a/examples/src/bin/local_runner.rs b/examples/src/bin/local_runner.rs index e985848..e980ba4 100644 --- a/examples/src/bin/local_runner.rs +++ b/examples/src/bin/local_runner.rs @@ -18,6 +18,7 @@ async fn main() { if std::env::var("POL_PROOF_DEV_MODE").is_err() { warn!("POL_PROOF_DEV_MODE=true is required for the local runner demo"); + std::process::exit(1); } @@ -41,6 +42,7 @@ async fn main() { if let Err(err) = run_local_case(validators, executors, Duration::from_secs(run_secs)).await { warn!("local runner demo failed: {err}"); + std::process::exit(1); } } diff --git a/scripts/checks b/scripts/checks index 8a8bfe2..587e47b 100755 --- a/scripts/checks +++ b/scripts/checks @@ -162,6 +162,49 @@ else warn "helm not found (k8s runner uses helm)" fi +section "K8s Image Visibility" +image="${NOMOS_TESTNET_IMAGE:-logos-blockchain-testing:local}" +if [ -n "${ctx:-}" ]; then + case "${ctx}" in + docker-desktop) + ok "docker-desktop context shares local Docker images" + ;; + kind-*) + if [[ "${image}" == *":local" ]]; then + warn "kind cluster won't see local Docker images by default" + say "Suggested: kind load docker-image ${image}" + fi + ;; + minikube) + if [[ "${image}" == *":local" ]]; then + warn "minikube may not see local Docker images by default" + say "Suggested: minikube image load ${image}" + fi + ;; + *) + if [[ "${image}" == *":local" ]]; then + warn "current context is ${ctx}; a :local image tag may not be reachable by cluster nodes" + say "Suggested: push to a registry and set NOMOS_TESTNET_IMAGE, or load into the cluster if supported" + fi + ;; + esac +fi + +section "Docker Desktop Kubernetes Health (best-effort)" +if have kubectl && [ "${ctx:-}" = "docker-desktop" ]; then + if ! kubectl -n kube-system get pod storage-provisioner >/dev/null 2>&1; then + warn "storage-provisioner pod not found" + else + phase="$(kubectl -n kube-system get pod storage-provisioner -o jsonpath='{.status.phase}' 2>/dev/null || true)" + reason="$(kubectl -n kube-system get pod storage-provisioner -o jsonpath='{.status.containerStatuses[0].state.waiting.reason}' 2>/dev/null || true)" + if [ "${phase}" = "Running" ] || [ "${phase}" = "Succeeded" ]; then + ok "storage-provisioner: ${phase}" + else + warn "storage-provisioner: ${phase:-} ${reason}" + fi + fi +fi + section "Runner Debug Flags (optional)" say "SLOW_TEST_ENV=${SLOW_TEST_ENV:-} (if true: doubles readiness timeouts)" say "NOMOS_SKIP_IMAGE_BUILD=${NOMOS_SKIP_IMAGE_BUILD:-} (compose/k8s)" @@ -170,6 +213,7 @@ say "K8S_RUNNER_PRESERVE=${K8S_RUNNER_PRESERVE:-} (k8s)" say "K8S_RUNNER_DEBUG=${K8S_RUNNER_DEBUG:-} (k8s helm debug)" say "COMPOSE_RUNNER_HOST=${COMPOSE_RUNNER_HOST:-} (compose readiness host override)" say "K8S_RUNNER_NODE_HOST=${K8S_RUNNER_NODE_HOST:-} (k8s NodePort host override)" +say "K8S_RUNNER_NAMESPACE=${K8S_RUNNER_NAMESPACE:-} (k8s fixed namespace)" section "Done" say "If something looks off, start with: scripts/run-examples.sh -t 60 -v 1 -e 1" diff --git a/scripts/clean b/scripts/clean index e5acd4f..5d5dac1 100755 --- a/scripts/clean +++ b/scripts/clean @@ -13,6 +13,9 @@ Options: --tmp Remove .tmp (default) --target Remove target (default) --docker Prune Docker builder cache (docker builder prune -f) + --docker-system Prune Docker system objects (requires --dangerous) + --volumes With --docker-system, also prune volumes + --dangerous Required for --docker-system (destructive) --all Equivalent to --tmp --target --docker -h, --help Show this help EOF @@ -21,6 +24,9 @@ EOF DO_TMP=0 DO_TARGET=0 DO_DOCKER=0 +DO_DOCKER_SYSTEM=0 +DO_VOLUMES=0 +DANGEROUS=0 if [ "$#" -eq 0 ]; then DO_TMP=1 @@ -32,6 +38,9 @@ while [ "$#" -gt 0 ]; do --tmp) DO_TMP=1; shift ;; --target) DO_TARGET=1; shift ;; --docker) DO_DOCKER=1; shift ;; + --docker-system) DO_DOCKER_SYSTEM=1; shift ;; + --volumes) DO_VOLUMES=1; shift ;; + --dangerous) DANGEROUS=1; shift ;; --all) DO_TMP=1; DO_TARGET=1; DO_DOCKER=1; shift ;; -h|--help) usage; exit 0 ;; *) echo "Unknown argument: $1" >&2; usage; exit 2 ;; @@ -60,5 +69,22 @@ if [ "${DO_DOCKER}" -eq 1 ]; then fi fi -echo "Done." +if [ "${DO_DOCKER_SYSTEM}" -eq 1 ]; then + if [ "${DANGEROUS}" -ne 1 ]; then + echo "ERROR: --docker-system requires --dangerous" >&2 + exit 2 + fi + if command -v docker >/dev/null 2>&1; then + echo "==> Pruning Docker system objects" + if [ "${DO_VOLUMES}" -eq 1 ]; then + docker system prune -af --volumes >/dev/null + else + docker system prune -af >/dev/null + fi + echo "==> Docker system prune complete" + else + echo "WARN: docker not found; skipping Docker system prune" >&2 + fi +fi +echo "Done." diff --git a/scripts/run-examples.sh b/scripts/run-examples.sh index 1d8d388..5546afa 100755 --- a/scripts/run-examples.sh +++ b/scripts/run-examples.sh @@ -30,6 +30,7 @@ Options: -t, --run-seconds N Duration to run the demo (required) -v, --validators N Number of validators (required) -e, --executors N Number of executors (required) + --no-image-build Skip rebuilding the compose/k8s image (sets NOMOS_SKIP_IMAGE_BUILD=1) Environment: VERSION Circuits version (default v0.3.1) @@ -38,6 +39,13 @@ Environment: NOMOS_CIRCUITS_REBUILD_RAPIDSNARK Force rapidsnark rebuild NOMOS_BINARIES_TAR Path to prebuilt binaries/circuits tarball (required) NOMOS_SKIP_IMAGE_BUILD Set to 1 to skip rebuilding the compose/k8s image + TESTNET_PRINT_ENDPOINTS If set, runners print TESTNET_ENDPOINTS/TESTNET_PPROF (set automatically) + COMPOSE_RUNNER_HTTP_TIMEOUT_SECS Compose readiness timeout override + K8S_RUNNER_DEPLOYMENT_TIMEOUT_SECS K8s deployment readiness timeout override + K8S_RUNNER_HTTP_TIMEOUT_SECS K8s port-forward readiness timeout override + K8S_RUNNER_HTTP_PROBE_TIMEOUT_SECS K8s NodePort readiness timeout override + K8S_RUNNER_PROMETHEUS_HTTP_TIMEOUT_SECS K8s Prometheus port-forward readiness timeout override + K8S_RUNNER_PROMETHEUS_HTTP_PROBE_TIMEOUT_SECS K8s Prometheus NodePort probe timeout override EOF } @@ -90,6 +98,10 @@ while [ "$#" -gt 0 ]; do DEMO_VALIDATORS="${2:-}"; shift 2 ;; -e|--executors) DEMO_EXECUTORS="${2:-}"; shift 2 ;; + --no-image-build) + NOMOS_SKIP_IMAGE_BUILD=1 + export NOMOS_SKIP_IMAGE_BUILD + shift ;; compose|host|k8s) MODE="$1"; shift ;; *) diff --git a/testing-framework/runners/k8s/src/infrastructure/cluster.rs b/testing-framework/runners/k8s/src/infrastructure/cluster.rs index 9d1e74b..8fce680 100644 --- a/testing-framework/runners/k8s/src/infrastructure/cluster.rs +++ b/testing-framework/runners/k8s/src/infrastructure/cluster.rs @@ -231,6 +231,16 @@ pub async fn ensure_cluster_readiness( } pub fn cluster_identifiers() -> (String, String) { + if let Ok(namespace) = env::var("K8S_RUNNER_NAMESPACE") + && !namespace.is_empty() + { + let release = env::var("K8S_RUNNER_RELEASE") + .ok() + .filter(|value| !value.is_empty()) + .unwrap_or_else(|| namespace.clone()); + return (namespace, release); + } + let run_id = Uuid::new_v4().simple().to_string(); let namespace = format!("nomos-k8s-{run_id}"); (namespace.clone(), namespace) diff --git a/testing-framework/runners/local/src/runner.rs b/testing-framework/runners/local/src/runner.rs index c737054..fc4e26a 100644 --- a/testing-framework/runners/local/src/runner.rs +++ b/testing-framework/runners/local/src/runner.rs @@ -106,6 +106,7 @@ impl LocalDeployer { let skip_membership = !membership_check; if let Err(source) = wait_for_readiness(&topology, skip_membership).await { debug!(error = ?source, "local readiness failed"); + return Err(LocalDeployerError::ReadinessFailed { source }); } @@ -134,6 +135,7 @@ async fn wait_for_readiness( } info!("waiting for membership readiness"); topology.wait_membership_ready().await?; + info!("waiting for DA balancer readiness"); topology.wait_da_balancer_ready().await } @@ -154,6 +156,7 @@ async fn spawn_block_feed_with( })?; info!("starting block feed"); + spawn_block_feed(block_source_client) .await .map_err(|source| LocalDeployerError::WorkloadFailed { diff --git a/testing-framework/workflows/src/builder/mod.rs b/testing-framework/workflows/src/builder/mod.rs index ed4c810..034e4e4 100644 --- a/testing-framework/workflows/src/builder/mod.rs +++ b/testing-framework/workflows/src/builder/mod.rs @@ -149,6 +149,7 @@ impl TransactionFlowBuilder { users = self.users.map(|u| u.get()), "attaching transaction workload" ); + self.builder = self.builder.with_workload(workload); self.builder } @@ -225,6 +226,7 @@ impl DataAvailabilityFlowBuilder { headroom_percent = self.headroom_percent, "attaching data-availability workload" ); + self.builder = self.builder.with_workload(workload); self.builder }