From f60580b29a4bd1082e26be266a9ff067dfa998a5 Mon Sep 17 00:00:00 2001
From: andrussal
Date: Thu, 11 Dec 2025 07:53:44 +0100
Subject: [PATCH] Revert "Add metrics plan and improve consensus liveness
 logging"

This reverts commit a7c9f45dd9aa2ab249d64098a638bac198e6a4b9.
---
 metrics-plan.md                                    | 53 -------------------
 .../src/expectations/consensus_liveness.rs         | 17 ++----
 2 files changed, 5 insertions(+), 65 deletions(-)
 delete mode 100644 metrics-plan.md

diff --git a/metrics-plan.md b/metrics-plan.md
deleted file mode 100644
index 37e1485..0000000
--- a/metrics-plan.md
+++ /dev/null
@@ -1,53 +0,0 @@
-# Node Metrics Blueprint
-
-Prometheus-friendly metrics organized by domain. Use low-cardinality labels: `node_id`, `role` (validator/executor), `network_id`, plus `service/component`, `direction`, `status/type`, `endpoint/op` (coarse). Avoid per-peer labels on hot paths.
-
-## Consensus
-- Gauges: `consensus_height`, `consensus_finalized_height`, `consensus_round`, `consensus_active_validators`, `consensus_forks`, `consensus_lag_blocks`.
-- Counters: `consensus_blocks_proposed_total`, `consensus_blocks_committed_total`, `consensus_blocks_rejected_total`, `consensus_view_changes_total`, `consensus_equivocation_events_total`, `consensus_leader_election_rounds_total`, `consensus_fork_detections_total`.
-- Histograms: `consensus_block_production_duration_seconds`, `consensus_block_validation_duration_seconds`, `consensus_finality_time_seconds`, `consensus_fork_resolution_duration_seconds`, `consensus_message_size_bytes`, `consensus_message_latency_seconds`.
-- Derived: `consensus_participation_rate`.
-
-## Data Availability (DA)
-- Gauges: `da_slot_height`, `da_storage_utilization_bytes`, `da_replication_factor`.
-- Counters: `da_blobs_committed_total`, `da_blob_failures_total{reason}`, `da_sampling_requests_total`, `da_sampling_success_total`, `da_reconstruction_success_total`, `da_reconstruction_failures_total`, `da_erasure_coding_failures_total`.
-- Histograms: `da_blob_dispersal_duration_seconds`, `da_blob_reconstruction_duration_seconds`, `da_sampling_duration_seconds`, `da_kzg_proof_generation_duration_seconds`, `da_kzg_proof_verification_duration_seconds`, `da_download_duration_seconds`, `da_network_message_size_bytes`.
-- Derived: `da_availability_rate`, `da_sampling_success_rate`, `da_reconstruction_success_rate`.
-
-## Blend / Subnet Balancer
-- Gauges: `blend_subnets_total`, `blend_active_subnets`, `blend_peer_assignments`, `blend_anonymity_set_size` (if measurable).
-- Counters: `blend_rebalances_total`, `blend_subnet_membership_changes_total`, `blend_subnet_join_failures_total`, `blend_cover_traffic_messages_total`.
-- Histograms: `blend_rebalance_duration_seconds`, `blend_message_mix_duration_seconds`, `blend_layer_processing_duration_seconds`, `blend_zk_proof_generation_duration_seconds`.
-
-## Networking (per service: libp2p, consensus, DA, API transport)
-- Gauges: `network_peer_count{service}`, `network_connections{service,state}`, `network_dials_inflight{service}`.
-- Counters: `network_messages_total{service,direction,type}`, `network_bytes_total{service,direction}`, `network_connect_failures_total{service,reason}`, `network_dial_attempts_total{service}`.
-- Histograms: `network_message_size_bytes{service}`, `network_message_latency_seconds{service}`, `network_connection_duration_seconds{service}`, `network_rtt_seconds` (sampled).
-
-## Storage
-- Gauges: `storage_db_size_bytes{db}`, `storage_column_size_bytes{db,column}` (if cheap), `storage_state_size_bytes`, `storage_cache_hit_ratio`.
-- Counters: `storage_ops_total{db,op}`, `storage_errors_total{db,op,reason}`, `storage_compactions_total{db}`.
-- Histograms: `storage_op_duration_seconds{db,op}`, `storage_compaction_duration_seconds{db}`.
-
-## Mempool / Transactions
-- Gauges: `mempool_tx_count`, `mempool_size_bytes`, `mempool_utilization_ratio`.
-- Counters: `mempool_txs_submitted_total`, `mempool_txs_committed_total`, `mempool_txs_rejected_total{reason}`, `mempool_txs_evicted_total{reason}`, `mempool_txs_broadcast_total`.
-- Histograms: `mempool_enqueue_duration_seconds`, `mempool_tx_lifetime_seconds`, `tx_validation_duration_seconds`, `tx_processing_duration_seconds`.
-- Derived: `tx_throughput_tps`, `tx_rejection_rate`, `tx_latency_seconds` (end-to-end).
-
-## API (HTTP/gRPC)
-- Counters: `api_requests_total{method,endpoint,status_class}`, `api_errors_total{endpoint,reason}`, `api_auth_failures_total`.
-- Histograms: `api_request_duration_seconds{endpoint}`, `api_request_size_bytes{endpoint}`, `api_response_size_bytes{endpoint}`.
-- Gauges: `api_concurrent_connections`.
-
-## General Node Health
-- Gauges/counters: `node_uptime_seconds` (counter), `node_restart_total`, `node_cpu_seconds_total`, `node_memory_bytes`, `node_threads`, `node_fd_used`, `node_logical_disk_usage_bytes`.
-
-## Cross-Domain Signals
-- Counters: `consensus_blocks_missing_da_total`, `consensus_blocks_missing_witness_total`, `da_proof_verification_failures_total`.
-- Gauges: `consensus_sync_catchup_in_progress` (0/1).
-
-## Label Guidance
-- Core: `node_id`, `role` (validator/executor), `network_id`, `version`.
-- Domain: `service`/`component`, `direction` (ingress/egress), `status_class` (2xx/4xx/5xx), `op`/`endpoint`, `reason` (coarse buckets).
-- Avoid per-peer labels on hot metrics; aggregate where possible.
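The blueprint above maps directly onto gauge, counter, and histogram types in a Prometheus client library. A minimal sketch of what wiring up two of the Consensus metrics could look like, assuming the Rust `prometheus` crate and a shared `Registry` (the crate choice, registry wiring, and label values here are illustrative, not part of the reverted file):

```rust
use prometheus::{IntCounterVec, IntGauge, Opts, Registry};

/// Registers one gauge and one counter from the blueprint's Consensus section.
/// Only low-cardinality labels are used, per the Label Guidance section.
fn register_consensus_metrics(registry: &Registry) -> prometheus::Result<(IntGauge, IntCounterVec)> {
    let height = IntGauge::new("consensus_height", "Latest consensus height observed by this node")?;
    let committed = IntCounterVec::new(
        Opts::new("consensus_blocks_committed_total", "Total blocks committed by consensus"),
        &["node_id", "role", "network_id"],
    )?;
    registry.register(Box::new(height.clone()))?;
    registry.register(Box::new(committed.clone()))?;
    Ok((height, committed))
}

fn main() -> prometheus::Result<()> {
    let registry = Registry::new();
    let (height, committed) = register_consensus_metrics(&registry)?;
    // Hot-path updates are cheap: set the gauge, bump the labeled counter.
    height.set(42);
    committed.with_label_values(&["node-0", "validator", "testnet"]).inc();
    Ok(())
}
```

Keeping the label set fixed at registration time, as above, is what makes the plan's "avoid per-peer labels on hot paths" guidance enforceable: any new label value creates a new time series, so only coarse, bounded values belong there.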
diff --git a/testing-framework/workflows/src/expectations/consensus_liveness.rs b/testing-framework/workflows/src/expectations/consensus_liveness.rs
index 20591c3..2d95dcc 100644
--- a/testing-framework/workflows/src/expectations/consensus_liveness.rs
+++ b/testing-framework/workflows/src/expectations/consensus_liveness.rs
@@ -39,7 +39,6 @@ impl Expectation for ConsensusLiveness {
     async fn evaluate(&mut self, ctx: &RunContext) -> Result<(), DynError> {
         Self::ensure_participants(ctx)?;
         let target_hint = Self::target_blocks(ctx);
-        tracing::info!(target_hint, "consensus liveness: collecting samples");
         let check = Self::collect_results(ctx).await;
         (*self).report(target_hint, check)
     }
@@ -106,13 +105,14 @@ impl ConsensusLiveness {
         for attempt in 0..REQUEST_RETRIES {
             match Self::fetch_cluster_info(client).await {
                 Ok((height, tip)) => {
-                    let label = format!("node-{idx}");
-                    tracing::debug!(node = %label, height, tip = ?tip, attempt, "consensus_info collected");
-                    samples.push(NodeSample { label, height, tip });
+                    samples.push(NodeSample {
+                        label: format!("node-{idx}"),
+                        height,
+                        tip,
+                    });
                     break;
                 }
                 Err(err) if attempt + 1 == REQUEST_RETRIES => {
-                    tracing::warn!(node = %format!("node-{idx}"), %err, "consensus_info failed after retries");
                     issues.push(ConsensusLivenessIssue::RequestFailed {
                         node: format!("node-{idx}"),
                         source: err,
@@ -150,13 +150,6 @@ impl ConsensusLiveness {
             return Err(Box::new(ConsensusLivenessError::MissingParticipants));
         }
 
-        tracing::info!(
-            target_hint,
-            samples = check.samples.len(),
-            issues = check.issues.len(),
-            "consensus liveness report"
-        );
-
         let max_height = check
             .samples
             .iter()
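For reviewers skimming the second hunk: the post-revert control flow keeps the per-node retry behavior and drops only the tracing calls. A self-contained sketch of that pattern, with stand-in types (`fetch` is a hypothetical placeholder for `Self::fetch_cluster_info`, the `tip` and error types are simplified, and the catch-all retry arm is assumed since it sits outside the hunk context):

```rust
const REQUEST_RETRIES: usize = 3; // illustrative; the real constant lives in consensus_liveness.rs

struct NodeSample {
    label: String,
    height: u64,
    tip: Option<String>, // stand-in for the real tip type
}

enum Issue {
    RequestFailed { node: String, source: String },
}

// Hypothetical stand-in for Self::fetch_cluster_info(client).
async fn fetch(_idx: usize) -> Result<(u64, Option<String>), String> {
    Ok((0, None))
}

async fn sample_node(idx: usize, samples: &mut Vec<NodeSample>, issues: &mut Vec<Issue>) {
    for attempt in 0..REQUEST_RETRIES {
        match fetch(idx).await {
            Ok((height, tip)) => {
                // Success: record the sample and stop retrying this node.
                samples.push(NodeSample {
                    label: format!("node-{idx}"),
                    height,
                    tip,
                });
                break;
            }
            // Final attempt failed: record an issue instead of aborting the run,
            // so one unreachable node does not sink the whole liveness check.
            Err(err) if attempt + 1 == REQUEST_RETRIES => {
                issues.push(Issue::RequestFailed {
                    node: format!("node-{idx}"),
                    source: err,
                });
            }
            // Assumed: earlier failures fall through and the loop retries.
            Err(_) => {}
        }
    }
}
```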