From 722c58ab8b4a3d74e04b0d433e4c4fbfff05a409 Mon Sep 17 00:00:00 2001 From: Gabriel mermelstein Date: Wed, 24 Apr 2024 17:52:22 +0300 Subject: [PATCH] Having first alert working --- monitoring/alert-rules.yml | 40 +------------------------------- monitoring/prometheus-config.yml | 3 +++ 2 files changed, 4 insertions(+), 39 deletions(-) diff --git a/monitoring/alert-rules.yml b/monitoring/alert-rules.yml index 9478457..24d3dbc 100644 --- a/monitoring/alert-rules.yml +++ b/monitoring/alert-rules.yml @@ -1,49 +1,11 @@ ---- groups: - name: waku rules: - - alert: HighNimWakuNodeTraffic - expr: > - abs( - netdata_net_net_kilobits_persec_average{ - family!~"^(wg|veth|br|docker).*", - fleet=~"(shards|waku|status|wakuv2)\\..*" - }) > 40000 - for: 5m - annotations: - summary: "Too high traffic for {{ $labels.instance }}" - description: "Host {{ $labels.instance }} running nim-waku has {{ $labels.dimension }} traffic higher 40Mbps" - - alert: HighNimWakuMemUsage expr: > - nim_gc_mem_bytes{ - fleet=~"(shards|waku|status|wakuv2)\\..*" - } > 1073741824 + nim_gc_mem_bytes{} > 1073741824 for: 5m annotations: summary: "Too high memory usage for {{ $labels.instance }}" description: "Host {{ $labels.instance }} running nim-waku has GC memory usage higher 1GB" - current_value: "{{ $value }}" - - - alert: HighNimWakuNodeOpenSockets - expr: > - netdata_ipv4_sockstat_tcp_sockets_sockets_average{ - fleet=~"(shards|waku|status|wakuv2)\\..*" - } > 300 - for: 5m - annotations: - summary: "Too high open sockets for {{ $labels.instance }}" - description: "Host {{ $labels.instance }} running nim-waku has more than 300 open sockets" - current_value: "{{ $value }}" - - - record: job:waku_libp2p_peers - expr: libp2p_peers{fleet=~"(shards|waku|status|wakuv2)\\..*"} - - - alert: NimWakuPeersDecrease - expr: > - (job:waku_libp2p_peers / avg_over_time(job:waku_libp2p_peers[12h])) < 0.50 - for: 15m - annotations: - summary: "Drop of libp2p_peers on {{ $labels.instance }}" - description: "Host {{ $labels.instance }} running nim-waku has more than 50% drop of peers compared to 12h average" current_value: "{{ $value }}" \ No newline at end of file diff --git a/monitoring/prometheus-config.yml b/monitoring/prometheus-config.yml index 5368395..322f305 100644 --- a/monitoring/prometheus-config.yml +++ b/monitoring/prometheus-config.yml @@ -10,6 +10,9 @@ alerting: static_configs: - targets: [ 'alertmanager:9093' ] +rule_files: + - "./alert-rules.yml" + scrape_configs: - job_name: cadvisor scrape_interval: 5s