Improve environment robustness (checks/clean/timeouts)

andrussal 2025-12-15 20:38:58 +01:00
parent ca2f2785ad
commit be0c1ba91e
15 changed files with 258 additions and 28 deletions

View File

@@ -70,6 +70,27 @@ This script handles circuit setup, binary building/bundling, image building, and
- `NOMOS_BUNDLE_DOCKER_PLATFORM=linux/arm64|linux/amd64` — Docker platform used when building a Linux bundle on non-Linux hosts (macOS/Windows)
- `COMPOSE_CIRCUITS_PLATFORM=linux-aarch64|linux-x86_64` — Circuits platform used when building the compose/k8s image (defaults based on host arch)
- `SLOW_TEST_ENV=true` — Doubles built-in readiness timeouts (useful on slow CI runners or constrained laptops)
- `TESTNET_PRINT_ENDPOINTS=1` — Print `TESTNET_ENDPOINTS` / `TESTNET_PPROF` lines during deploy (set automatically by `scripts/run-examples.sh`)
- `COMPOSE_RUNNER_HTTP_TIMEOUT_SECS=<secs>` — Override compose node HTTP readiness timeout
- `K8S_RUNNER_DEPLOYMENT_TIMEOUT_SECS=<secs>` — Override k8s deployment readiness timeout
- `K8S_RUNNER_HTTP_TIMEOUT_SECS=<secs>` — Override k8s HTTP readiness timeout for port-forwards
- `K8S_RUNNER_HTTP_PROBE_TIMEOUT_SECS=<secs>` — Override k8s HTTP readiness timeout for NodePort probes
- `K8S_RUNNER_PROMETHEUS_HTTP_TIMEOUT_SECS=<secs>` — Override k8s Prometheus readiness timeout
- `K8S_RUNNER_PROMETHEUS_HTTP_PROBE_TIMEOUT_SECS=<secs>` — Override k8s Prometheus NodePort probe timeout
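
For example, a run on a slow laptop or constrained CI box could combine several of these overrides (values are illustrative, not recommendations):

```bash
# Double the built-in timeouts and widen the compose/k8s readiness windows.
SLOW_TEST_ENV=true \
COMPOSE_RUNNER_HTTP_TIMEOUT_SECS=300 \
K8S_RUNNER_DEPLOYMENT_TIMEOUT_SECS=600 \
scripts/run-examples.sh
```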
### Cleanup Helper
If you hit Docker build failures or mysterious I/O errors, or are running low on disk space:
```bash
scripts/clean
```
For extra Docker cache cleanup:
```bash
scripts/clean --docker
```
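Or, as a shorthand for `--tmp --target --docker` in one pass:
```bash
scripts/clean --all
```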
### Host Runner (Direct Cargo Run)

View File

@@ -9,6 +9,7 @@
- **macOS + Docker Desktop (Apple silicon):** prefer `NOMOS_BUNDLE_DOCKER_PLATFORM=linux/arm64` for local compose/k8s runs to avoid slow/fragile amd64 emulation builds (see the sketch after this list).
- **Disk space:** bundle/image builds are storage-heavy. If you see I/O errors or Docker build failures, check free space and prune old artifacts (`.tmp/`, `target/`, and Docker build cache) before retrying.
- **K8s runner scope:** the default Helm chart mounts KZG params via `hostPath` and uses a local image tag (`logos-blockchain-testing:local`). This is intended for local clusters (Docker Desktop / minikube / kind), not remote managed clusters without additional setup.
- Quick cleanup: `scripts/clean` (and `scripts/clean --docker` if needed).
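For example, an Apple-silicon compose run pinned to the native platform might look like this (an illustrative invocation):

```bash
# Avoid slow/fragile amd64 emulation on Apple silicon Docker Desktop.
NOMOS_BUNDLE_DOCKER_PLATFORM=linux/arm64 \
COMPOSE_CIRCUITS_PLATFORM=linux-aarch64 \
scripts/run-examples.sh
```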
**Recommended:** Use `scripts/run-examples.sh`, which handles all setup automatically.

View File

@@ -9,7 +9,7 @@ set -euo pipefail
# --rev nomos-node git revision to build (overrides NOMOS_NODE_REV)
# --path Use local nomos-node checkout at DIR (skip fetch/checkout)
# --features Extra cargo features to enable (comma-separated); base always includes "testing"
# --docker-platform Docker platform for Linux bundle when running on non-Linux host (default: linux/amd64)
# --docker-platform Docker platform for Linux bundle when running on non-Linux host (default: auto; linux/arm64 on Apple silicon Docker Desktop, else linux/amd64)
# Always run under bash; bail out if someone invokes via sh.
if [ -z "${BASH_VERSION:-}" ]; then
@@ -26,7 +26,7 @@ Options:
--rev nomos-node git revision to build (overrides NOMOS_NODE_REV)
--path Use local nomos-node checkout at DIR (skip fetch/checkout)
--features Extra cargo features to enable (comma-separated); base always includes "testing"
--docker-platform Docker platform for Linux bundle when running on non-Linux host (default: linux/amd64)
--docker-platform Docker platform for Linux bundle when running on non-Linux host (default: auto; linux/arm64 on Apple silicon Docker Desktop, else linux/amd64)
Notes:
- For compose/k8s, use platform=linux. If running on macOS, this script will
@@ -61,13 +61,24 @@ PLATFORM="host"
OUTPUT=""
REV_OVERRIDE=""
PATH_OVERRIDE=""
DOCKER_PLATFORM="${NOMOS_BUNDLE_DOCKER_PLATFORM:-${NOMOS_BIN_PLATFORM:-linux/amd64}}"
DOCKER_PLATFORM="${NOMOS_BUNDLE_DOCKER_PLATFORM:-${NOMOS_BIN_PLATFORM:-}}"
BUNDLE_RUSTUP_TOOLCHAIN="${BUNDLE_RUSTUP_TOOLCHAIN:-}"
if [ -z "${BUNDLE_RUSTUP_TOOLCHAIN}" ] && command -v rustup >/dev/null 2>&1 && [ -f "${ROOT_DIR}/rust-toolchain.toml" ]; then
BUNDLE_RUSTUP_TOOLCHAIN="$(awk -F '\"' '/^[[:space:]]*channel[[:space:]]*=/{print $2; exit}' "${ROOT_DIR}/rust-toolchain.toml")"
fi
# Default Docker platform to the engine architecture when possible.
if [ -z "${DOCKER_PLATFORM}" ] && command -v docker >/dev/null 2>&1; then
docker_arch="$(docker version --format '{{.Server.Arch}}' 2>/dev/null || true)"
case "${docker_arch}" in
arm64|aarch64) DOCKER_PLATFORM="linux/arm64" ;;
amd64|x86_64) DOCKER_PLATFORM="linux/amd64" ;;
*) DOCKER_PLATFORM="linux/amd64" ;;
esac
fi
DOCKER_PLATFORM="${DOCKER_PLATFORM:-linux/amd64}"
# To avoid confusing cache corruption errors inside the Dockerized Linux build,
# always start from a clean cargo registry/git cache for the cross-build.
rm -rf "${ROOT_DIR}/.tmp/cargo-linux/registry" "${ROOT_DIR}/.tmp/cargo-linux/git"

View File

@@ -102,8 +102,18 @@ if have docker; then
warn "could not query docker engine arch (is Docker running?)"
fi
bundle_platform="${NOMOS_BUNDLE_DOCKER_PLATFORM:-${NOMOS_BIN_PLATFORM:-linux/amd64}}"
say "NOMOS_BUNDLE_DOCKER_PLATFORM=${bundle_platform}"
bundle_platform="${NOMOS_BUNDLE_DOCKER_PLATFORM:-${NOMOS_BIN_PLATFORM:-}}"
if [ -z "${bundle_platform}" ]; then
say "NOMOS_BUNDLE_DOCKER_PLATFORM=<auto>"
if [[ "${server_arch}" == *"linux/arm64"* ]]; then
say "bundle docker platform (auto): linux/arm64"
else
say "bundle docker platform (auto): linux/amd64"
fi
bundle_platform="auto"
else
say "NOMOS_BUNDLE_DOCKER_PLATFORM=${bundle_platform}"
fi
if [[ "${server_arch}" == *"linux/arm64"* ]] && [ "${bundle_platform}" = "linux/amd64" ]; then
warn "Docker engine is linux/arm64 but bundle platform is linux/amd64 (emulation). If builds are slow/flaky, set: NOMOS_BUNDLE_DOCKER_PLATFORM=linux/arm64"

scripts/clean (new executable file, 64 lines)
View File

@@ -0,0 +1,64 @@
#!/usr/bin/env bash
set -euo pipefail
ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
usage() {
cat <<'EOF'
Usage: scripts/clean [options]
Removes local build artifacts that commonly cause disk pressure and flaky Docker builds.
Options:
--tmp Remove .tmp (default)
--target Remove target (default)
--docker Prune Docker builder cache (docker builder prune -f)
--all Equivalent to --tmp --target --docker
-h, --help Show this help
EOF
}
DO_TMP=0
DO_TARGET=0
DO_DOCKER=0
if [ "$#" -eq 0 ]; then
DO_TMP=1
DO_TARGET=1
fi
while [ "$#" -gt 0 ]; do
case "$1" in
--tmp) DO_TMP=1; shift ;;
--target) DO_TARGET=1; shift ;;
--docker) DO_DOCKER=1; shift ;;
--all) DO_TMP=1; DO_TARGET=1; DO_DOCKER=1; shift ;;
-h|--help) usage; exit 0 ;;
*) echo "Unknown argument: $1" >&2; usage; exit 2 ;;
esac
done
echo "Workspace: ${ROOT_DIR}"
if [ "${DO_TMP}" -eq 1 ]; then
echo "==> Removing ${ROOT_DIR}/.tmp"
rm -rf "${ROOT_DIR}/.tmp"
fi
if [ "${DO_TARGET}" -eq 1 ]; then
echo "==> Removing ${ROOT_DIR}/target"
rm -rf "${ROOT_DIR}/target"
fi
if [ "${DO_DOCKER}" -eq 1 ]; then
if command -v docker >/dev/null 2>&1; then
echo "==> Pruning Docker builder cache"
docker builder prune -f >/dev/null
echo "==> Docker builder cache pruned"
else
echo "WARN: docker not found; skipping Docker prune" >&2
fi
fi
echo "Done."

View File

@@ -400,6 +400,7 @@ if [ -n "${DEMO_EXECUTORS}" ]; then
export NOMOS_DEMO_EXECUTORS="${DEMO_EXECUTORS}"
fi
POL_PROOF_DEV_MODE=true \
TESTNET_PRINT_ENDPOINTS=1 \
NOMOS_TESTNET_IMAGE="${IMAGE}" \
NOMOS_CIRCUITS="${HOST_BUNDLE_PATH}" \
NOMOS_KZGRS_PARAMS_PATH="${KZG_PATH}" \

View File

@ -84,8 +84,16 @@ impl DeploymentOrchestrator {
);
log_profiling_urls(&host, &host_ports);
// Log profiling endpoints (profiling feature must be enabled in the binaries).
log_profiling_urls(&host, &host_ports);
if std::env::var("TESTNET_PRINT_ENDPOINTS").is_ok() {
println!(
"TESTNET_ENDPOINTS prometheus=http://{}:{}/ grafana=http://{}:{}/",
host,
environment.prometheus_port(),
host,
environment.grafana_port()
);
print_profiling_urls(&host, &host_ports);
}
let (block_feed, block_feed_guard) = client_builder
.start_block_feed(&node_clients, &mut environment)
@@ -152,3 +160,18 @@ fn log_profiling_urls(host: &str, ports: &HostPortMapping) {
);
}
}
fn print_profiling_urls(host: &str, ports: &HostPortMapping) {
for (idx, node) in ports.validators.iter().enumerate() {
println!(
"TESTNET_PPROF validator_{}=http://{}:{}/debug/pprof/profile?seconds=15&format=proto",
idx, host, node.api
);
}
for (idx, node) in ports.executors.iter().enumerate() {
println!(
"TESTNET_PPROF executor_{}=http://{}:{}/debug/pprof/profile?seconds=15&format=proto",
idx, host, node.api
);
}
}
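Because `scripts/run-examples.sh` sets `TESTNET_PRINT_ENDPOINTS=1`, downstream tooling can scrape these machine-readable lines from the deploy output. A minimal sketch, assuming the lines land on stdout:

```bash
scripts/run-examples.sh | tee deploy.log
grep '^TESTNET_ENDPOINTS ' deploy.log   # prometheus/grafana URLs
grep '^TESTNET_PPROF ' deploy.log       # per-node pprof profile URLs
```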

View File

@@ -19,12 +19,13 @@ pub async fn wait_for_executors(ports: &[u16]) -> Result<(), HttpReadinessError>
async fn wait_for_ports(ports: &[u16], role: NodeRole) -> Result<(), HttpReadinessError> {
let host = compose_runner_host();
let timeout = compose_http_timeout();
info!(role = ?role, ports = ?ports, host, "waiting for compose HTTP readiness");
http_probe::wait_for_http_ports_with_host(
ports,
role,
&host,
adjust_timeout(DEFAULT_WAIT),
adjust_timeout(timeout),
POLL_INTERVAL,
)
.await
@@ -35,3 +36,11 @@ fn compose_runner_host() -> String {
debug!(host, "compose runner host resolved");
host
}
fn compose_http_timeout() -> Duration {
env::var("COMPOSE_RUNNER_HTTP_TIMEOUT_SECS")
.ok()
.and_then(|raw| raw.parse::<u64>().ok())
.map(Duration::from_secs)
.unwrap_or(DEFAULT_WAIT)
}

View File

@@ -151,10 +151,38 @@ impl Deployer for K8sDeployer {
}
};
tracing::info!(
grafana_url = %format!("http://{}:{}/", crate::host::node_host(), 30030),
let node_host = crate::host::node_host();
info!(
prometheus_url = %format!("http://{}:{}/", node_host, cluster.as_ref().expect("cluster ready").prometheus_port()),
"prometheus endpoint available on host"
);
info!(
grafana_url = %format!("http://{}:{}/", node_host, 30030),
"grafana dashboard available via NodePort"
);
if std::env::var("TESTNET_PRINT_ENDPOINTS").is_ok() {
println!(
"TESTNET_ENDPOINTS prometheus=http://{}:{}/ grafana=http://{}:{}/",
node_host,
cluster.as_ref().expect("cluster ready").prometheus_port(),
node_host,
30030
);
for (idx, client) in node_clients.validator_clients().iter().enumerate() {
println!(
"TESTNET_PPROF validator_{}={}/debug/pprof/profile?seconds=15&format=proto",
idx,
client.base_url()
);
}
for (idx, client) in node_clients.executor_clients().iter().enumerate() {
println!(
"TESTNET_PPROF executor_{}={}/debug/pprof/profile?seconds=15&format=proto",
idx,
client.base_url()
);
}
}
let (cleanup, port_forwards) = cluster
.take()
.expect("cluster should still be available")

View File

@@ -2,7 +2,7 @@ use k8s_openapi::api::apps::v1::Deployment;
use kube::{Api, Client};
use tokio::time::sleep;
use super::{ClusterWaitError, DEPLOYMENT_TIMEOUT};
use super::{ClusterWaitError, deployment_timeout};
pub async fn wait_for_deployment_ready(
client: &Client,
@@ -12,7 +12,8 @@ pub async fn wait_for_deployment_ready(
let mut elapsed = std::time::Duration::ZERO;
let interval = std::time::Duration::from_secs(2);
while elapsed <= DEPLOYMENT_TIMEOUT {
let timeout = deployment_timeout();
while elapsed <= timeout {
match Api::<Deployment>::namespaced(client.clone(), namespace)
.get(name)
.await
@@ -47,6 +48,6 @@ pub async fn wait_for_deployment_ready(
Err(ClusterWaitError::DeploymentTimeout {
name: name.to_owned(),
namespace: namespace.to_owned(),
timeout: DEPLOYMENT_TIMEOUT,
timeout,
})
}

View File

@@ -1,6 +1,6 @@
use testing_framework_core::scenario::http_probe::{self, HttpReadinessError, NodeRole};
use super::{ClusterWaitError, HTTP_POLL_INTERVAL, NODE_HTTP_PROBE_TIMEOUT, NODE_HTTP_TIMEOUT};
use super::{ClusterWaitError, http_poll_interval, node_http_probe_timeout, node_http_timeout};
use crate::host::node_host;
pub async fn wait_for_node_http_nodeport(
@@ -8,14 +8,14 @@ pub async fn wait_for_node_http_nodeport(
role: NodeRole,
) -> Result<(), ClusterWaitError> {
let host = node_host();
wait_for_node_http_on_host(ports, role, &host, NODE_HTTP_PROBE_TIMEOUT).await
wait_for_node_http_on_host(ports, role, &host, node_http_probe_timeout()).await
}
pub async fn wait_for_node_http_port_forward(
ports: &[u16],
role: NodeRole,
) -> Result<(), ClusterWaitError> {
wait_for_node_http_on_host(ports, role, "127.0.0.1", NODE_HTTP_TIMEOUT).await
wait_for_node_http_on_host(ports, role, "127.0.0.1", node_http_timeout()).await
}
async fn wait_for_node_http_on_host(
@@ -24,7 +24,7 @@ async fn wait_for_node_http_on_host(
host: &str,
timeout: std::time::Duration,
) -> Result<(), ClusterWaitError> {
http_probe::wait_for_http_ports_with_host(ports, role, host, timeout, HTTP_POLL_INTERVAL)
http_probe::wait_for_http_ports_with_host(ports, role, host, timeout, http_poll_interval())
.await
.map_err(map_http_error)
}

View File

@@ -1,4 +1,4 @@
use std::time::Duration;
use std::{env, sync::LazyLock, time::Duration};
use kube::Error as KubeError;
use testing_framework_core::{
@@ -95,11 +95,71 @@ pub enum ClusterWaitError {
},
}
pub(crate) const DEPLOYMENT_TIMEOUT: Duration = DEFAULT_K8S_DEPLOYMENT_TIMEOUT;
pub(crate) const NODE_HTTP_TIMEOUT: Duration = DEFAULT_NODE_HTTP_TIMEOUT;
pub(crate) const NODE_HTTP_PROBE_TIMEOUT: Duration = DEFAULT_NODE_HTTP_PROBE_TIMEOUT;
pub(crate) const HTTP_POLL_INTERVAL: Duration = DEFAULT_HTTP_POLL_INTERVAL;
static DEPLOYMENT_TIMEOUT: LazyLock<Duration> = LazyLock::new(|| {
env_duration_secs(
"K8S_RUNNER_DEPLOYMENT_TIMEOUT_SECS",
DEFAULT_K8S_DEPLOYMENT_TIMEOUT,
)
});
static NODE_HTTP_TIMEOUT: LazyLock<Duration> =
LazyLock::new(|| env_duration_secs("K8S_RUNNER_HTTP_TIMEOUT_SECS", DEFAULT_NODE_HTTP_TIMEOUT));
static NODE_HTTP_PROBE_TIMEOUT: LazyLock<Duration> = LazyLock::new(|| {
env_duration_secs(
"K8S_RUNNER_HTTP_PROBE_TIMEOUT_SECS",
DEFAULT_NODE_HTTP_PROBE_TIMEOUT,
)
});
static HTTP_POLL_INTERVAL: LazyLock<Duration> = LazyLock::new(|| {
env_duration_secs(
"K8S_RUNNER_HTTP_POLL_INTERVAL_SECS",
DEFAULT_HTTP_POLL_INTERVAL,
)
});
pub(crate) fn deployment_timeout() -> Duration {
*DEPLOYMENT_TIMEOUT
}
pub(crate) fn node_http_timeout() -> Duration {
*NODE_HTTP_TIMEOUT
}
pub(crate) fn node_http_probe_timeout() -> Duration {
*NODE_HTTP_PROBE_TIMEOUT
}
pub(crate) fn http_poll_interval() -> Duration {
*HTTP_POLL_INTERVAL
}
pub(crate) const PROMETHEUS_HTTP_PORT: u16 = DEFAULT_PROMETHEUS_HTTP_PORT;
pub(crate) const PROMETHEUS_HTTP_TIMEOUT: Duration = DEFAULT_PROMETHEUS_HTTP_TIMEOUT;
pub(crate) const PROMETHEUS_HTTP_PROBE_TIMEOUT: Duration = DEFAULT_PROMETHEUS_HTTP_PROBE_TIMEOUT;
static PROMETHEUS_HTTP_TIMEOUT: LazyLock<Duration> = LazyLock::new(|| {
env_duration_secs(
"K8S_RUNNER_PROMETHEUS_HTTP_TIMEOUT_SECS",
DEFAULT_PROMETHEUS_HTTP_TIMEOUT,
)
});
static PROMETHEUS_HTTP_PROBE_TIMEOUT: LazyLock<Duration> = LazyLock::new(|| {
env_duration_secs(
"K8S_RUNNER_PROMETHEUS_HTTP_PROBE_TIMEOUT_SECS",
DEFAULT_PROMETHEUS_HTTP_PROBE_TIMEOUT,
)
});
pub(crate) fn prometheus_http_timeout() -> Duration {
*PROMETHEUS_HTTP_TIMEOUT
}
pub(crate) fn prometheus_http_probe_timeout() -> Duration {
*PROMETHEUS_HTTP_PROBE_TIMEOUT
}
pub(crate) const PROMETHEUS_SERVICE_NAME: &str = DEFAULT_PROMETHEUS_SERVICE_NAME;
fn env_duration_secs(key: &str, default: Duration) -> Duration {
env::var(key)
.ok()
.and_then(|raw| raw.parse::<u64>().ok())
.map(Duration::from_secs)
.unwrap_or(default)
}
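Note that each `LazyLock` reads its variable once per process, and `env_duration_secs` accepts whole seconds only: a value that fails to parse as a `u64` silently falls back to the default. Illustrative:

```bash
export K8S_RUNNER_DEPLOYMENT_TIMEOUT_SECS=600   # applied: 600-second timeout
export K8S_RUNNER_HTTP_POLL_INTERVAL_SECS=1.5   # ignored: not a u64, so the default applies
```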

View File

@ -3,7 +3,7 @@ use testing_framework_core::scenario::http_probe::NodeRole;
use super::{
ClusterPorts, ClusterReady, ClusterWaitError, NodeConfigPorts, PROMETHEUS_HTTP_PORT,
PROMETHEUS_HTTP_PROBE_TIMEOUT, PROMETHEUS_SERVICE_NAME,
PROMETHEUS_SERVICE_NAME, prometheus_http_probe_timeout,
};
use crate::lifecycle::wait::{
deployment::wait_for_deployment_ready,
@@ -108,7 +108,7 @@ pub async fn wait_for_cluster_ready(
PROMETHEUS_HTTP_PORT,
)
.await?;
if wait_for_prometheus_http_nodeport(prometheus_port, PROMETHEUS_HTTP_PROBE_TIMEOUT)
if wait_for_prometheus_http_nodeport(prometheus_port, prometheus_http_probe_timeout())
.await
.is_err()
{

View File

@@ -1,6 +1,6 @@
use tokio::time::sleep;
use super::{ClusterWaitError, PROMETHEUS_HTTP_TIMEOUT};
use super::{ClusterWaitError, prometheus_http_timeout};
use crate::host::node_host;
pub async fn wait_for_prometheus_http_nodeport(
@@ -12,7 +12,7 @@ pub async fn wait_for_prometheus_http_nodeport(
}
pub async fn wait_for_prometheus_http_port_forward(port: u16) -> Result<(), ClusterWaitError> {
wait_for_prometheus_http("127.0.0.1", port, PROMETHEUS_HTTP_TIMEOUT).await
wait_for_prometheus_http("127.0.0.1", port, prometheus_http_timeout()).await
}
async fn wait_for_prometheus_http(

View File

@@ -28,6 +28,7 @@ non_zero_rate_fn!(
transaction_rate_checked,
"transaction rate must be non-zero"
);
non_zero_rate_fn!(channel_rate_checked, "channel rate must be non-zero");
non_zero_rate_fn!(blob_rate_checked, "blob rate must be non-zero");