use std::time::Duration;

use async_trait::async_trait;
use nomos_core::header::HeaderId;
use testing_framework_core::{
    nodes::ApiClient,
    scenario::{DynError, Expectation, RunContext},
};
use thiserror::Error;
use tokio::time::sleep;

/// Baseline number of blocks a node may trail the highest observed height.
const LAG_ALLOWANCE: u64 = 2;
/// The network as a whole must have produced at least this many blocks.
const MIN_PROGRESS_BLOCKS: u64 = 5;
/// How many times `consensus_info` is retried per node before giving up.
const REQUEST_RETRIES: usize = 5;
/// Delay between consecutive `consensus_info` retries.
const REQUEST_RETRY_DELAY: Duration = Duration::from_secs(2);
/// Upper bound on the lag allowance, regardless of configuration or target.
const MAX_LAG_ALLOWANCE: u64 = 5;

/// Checks that every node reaches near the highest observed height within an
/// allowance.
#[derive(Clone, Copy, Debug)]
pub struct ConsensusLiveness {
    lag_allowance: u64,
}

impl Default for ConsensusLiveness {
    fn default() -> Self {
        Self {
            lag_allowance: LAG_ALLOWANCE,
        }
    }
}

#[async_trait]
impl Expectation for ConsensusLiveness {
    fn name(&self) -> &'static str {
        "consensus_liveness"
    }

    async fn evaluate(&mut self, ctx: &RunContext) -> Result<(), DynError> {
        Self::ensure_participants(ctx)?;
        let target_hint = Self::target_blocks(ctx);
        tracing::info!(target_hint, "consensus liveness: collecting samples");
        let check = Self::collect_results(ctx).await;
        (*self).report(target_hint, check)
    }
}

const fn consensus_target_blocks(ctx: &RunContext) -> u64 {
    ctx.expected_blocks()
}

#[derive(Debug, Error)]
enum ConsensusLivenessIssue {
    #[error("{node} height {height} below target {target}")]
    HeightBelowTarget {
        node: String,
        height: u64,
        target: u64,
    },
    #[error("{node} consensus_info failed: {source}")]
    RequestFailed {
        node: String,
        #[source]
        source: DynError,
    },
}

#[derive(Debug, Error)]
enum ConsensusLivenessError {
    #[error("consensus liveness requires at least one validator or executor")]
    MissingParticipants,
    #[error("consensus liveness violated (target={target}):\n{details}")]
    Violations {
        target: u64,
        #[source]
        details: ViolationIssues,
    },
}

#[derive(Debug, Error)]
#[error("{message}")]
struct ViolationIssues {
    issues: Vec<ConsensusLivenessIssue>,
    message: String,
}

impl ConsensusLiveness {
    const fn target_blocks(ctx: &RunContext) -> u64 {
        consensus_target_blocks(ctx)
    }

    fn ensure_participants(ctx: &RunContext) -> Result<(), DynError> {
        if ctx.node_clients().all_clients().count() == 0 {
            Err(Box::new(ConsensusLivenessError::MissingParticipants))
        } else {
            Ok(())
        }
    }

    async fn collect_results(ctx: &RunContext) -> LivenessCheck {
        let clients: Vec<_> = ctx.node_clients().all_clients().collect();
        let mut samples = Vec::with_capacity(clients.len());
        let mut issues = Vec::new();
        for (idx, client) in clients.iter().enumerate() {
            // Retry each node a fixed number of times; only the final
            // failure is recorded as an issue.
            for attempt in 0..REQUEST_RETRIES {
                match Self::fetch_cluster_info(client).await {
                    Ok((height, tip)) => {
                        let label = format!("node-{idx}");
                        tracing::debug!(node = %label, height, tip = ?tip, attempt, "consensus_info collected");
                        samples.push(NodeSample { label, height, tip });
                        break;
                    }
                    Err(err) if attempt + 1 == REQUEST_RETRIES => {
                        tracing::warn!(node = %format!("node-{idx}"), %err, "consensus_info failed after retries");
                        issues.push(ConsensusLivenessIssue::RequestFailed {
                            node: format!("node-{idx}"),
                            source: err,
                        });
                    }
                    Err(_) => sleep(REQUEST_RETRY_DELAY).await,
                }
            }
        }
        LivenessCheck { samples, issues }
    }

    async fn fetch_cluster_info(client: &ApiClient) -> Result<(u64, HeaderId), DynError> {
        client
            .consensus_info()
            .await
            .map(|info| (info.height, info.tip))
            .map_err(|err| -> DynError { err.into() })
    }

    /// Adjusts how many blocks behind the leader a node may be before
    /// failing.
    #[must_use]
    pub const fn with_lag_allowance(mut self, lag_allowance: u64) -> Self {
        self.lag_allowance = lag_allowance;
        self
    }

    fn effective_lag_allowance(&self, target: u64) -> u64 {
        // Cap the configured allowance so `clamp` never receives a minimum
        // greater than its maximum, which would panic.
        let min = self.lag_allowance.min(MAX_LAG_ALLOWANCE);
        (target / 10).clamp(min, MAX_LAG_ALLOWANCE)
    }

    fn report(self, target_hint: u64, mut check: LivenessCheck) -> Result<(), DynError> {
        if check.samples.is_empty() {
            return Err(Box::new(ConsensusLivenessError::MissingParticipants));
        }
        let max_height = check
            .samples
            .iter()
            .map(|sample| sample.height)
            .max()
            .unwrap_or(0);
        // Fall back to the highest observed height when the hint is absent
        // or overshoots what the network actually produced.
        let mut target = target_hint;
        if target == 0 || target > max_height {
            target = max_height;
        }
        let lag_allowance = self.effective_lag_allowance(target);
        if max_height < MIN_PROGRESS_BLOCKS {
            check
                .issues
                .push(ConsensusLivenessIssue::HeightBelowTarget {
                    node: "network".to_owned(),
                    height: max_height,
                    target: MIN_PROGRESS_BLOCKS,
                });
        }
        for sample in &check.samples {
            if sample.height + lag_allowance < target {
                check
                    .issues
                    .push(ConsensusLivenessIssue::HeightBelowTarget {
                        node: sample.label.clone(),
                        height: sample.height,
                        target,
                    });
            }
        }
        if check.issues.is_empty() {
            let observed_heights: Vec<_> = check.samples.iter().map(|s| s.height).collect();
            let observed_tips: Vec<_> = check.samples.iter().map(|s| s.tip).collect();
            tracing::info!(
                target,
                samples = check.samples.len(),
                heights = ?observed_heights,
                tips = ?observed_tips,
                "consensus liveness expectation satisfied"
            );
            Ok(())
        } else {
            for issue in &check.issues {
                tracing::warn!(?issue, "consensus liveness issue");
            }
            Err(Box::new(ConsensusLivenessError::Violations {
                target,
                details: check.issues.into(),
            }))
        }
    }
}
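// Illustrative unit tests, added as a sketch rather than taken from the
// original module: they show how `effective_lag_allowance` combines the
// configured allowance with the scaled target. The expected values follow
// directly from `LAG_ALLOWANCE` and `MAX_LAG_ALLOWANCE` above.
#[cfg(test)]
mod lag_allowance_tests {
    use super::*;

    #[test]
    fn lag_allowance_scales_with_target_within_bounds() {
        let liveness = ConsensusLiveness::default();
        // Small targets fall back to the configured minimum of 2.
        assert_eq!(liveness.effective_lag_allowance(0), 2);
        // target / 10 is used once it exceeds the minimum.
        assert_eq!(liveness.effective_lag_allowance(30), 3);
        // Large targets are capped at MAX_LAG_ALLOWANCE.
        assert_eq!(liveness.effective_lag_allowance(100), 5);
    }

    #[test]
    fn oversized_configured_allowance_is_capped_instead_of_panicking() {
        let liveness = ConsensusLiveness::default().with_lag_allowance(10);
        assert_eq!(liveness.effective_lag_allowance(0), MAX_LAG_ALLOWANCE);
    }
}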
struct NodeSample {
    label: String,
    height: u64,
    tip: HeaderId,
}

struct LivenessCheck {
    samples: Vec<NodeSample>,
    issues: Vec<ConsensusLivenessIssue>,
}

impl From<Vec<ConsensusLivenessIssue>> for ViolationIssues {
    fn from(issues: Vec<ConsensusLivenessIssue>) -> Self {
        // Render each issue as a bulleted line so the aggregate error is
        // readable in test output.
        let mut message = String::new();
        for issue in &issues {
            if !message.is_empty() {
                message.push('\n');
            }
            message.push_str("- ");
            message.push_str(&issue.to_string());
        }
        Self { issues, message }
    }
}
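// An illustrative test, added as a sketch, showing the bulleted message that
// `ViolationIssues` builds from a list of issues; the expected strings
// mirror the `#[error(...)]` formats declared above.
#[cfg(test)]
mod violation_message_tests {
    use super::*;

    #[test]
    fn issues_render_as_bulleted_lines() {
        let issues = vec![
            ConsensusLivenessIssue::HeightBelowTarget {
                node: "node-0".to_owned(),
                height: 3,
                target: 10,
            },
            ConsensusLivenessIssue::HeightBelowTarget {
                node: "node-1".to_owned(),
                height: 4,
                target: 10,
            },
        ];
        let rendered = ViolationIssues::from(issues);
        assert_eq!(
            rendered.to_string(),
            "- node-0 height 3 below target 10\n- node-1 height 4 below target 10"
        );
    }
}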