From 8551bffaf002259164d262ee3f65388c419f3304 Mon Sep 17 00:00:00 2001
From: Gabriel mermelstein
Date: Wed, 24 Apr 2024 14:25:03 +0300
Subject: [PATCH] adding alert manager service

Add an Alertmanager container to the simulation stack and wire
Prometheus to it:

- docker-compose.yml: new `alertmanager` service on the `simulation`
  network; Prometheus mounts the alert rules file and starts after
  Alertmanager.
- monitoring/alert-rules.yml: nim-waku alerts for traffic, GC memory,
  open sockets and libp2p peer drops, plus the recording rule the
  peer-drop alert reads from.
- monitoring/alertmanager-config.yml: single route to a `discord`
  receiver.
- monitoring/prometheus-config.yml: load the alert rules file and send
  alerts to `alertmanager:9093`.
---
Notes (not part of the commit message):

- prometheus-config.yml now declares `rule_files`; without it the
  mounted alert-rules.yml is never evaluated and no alert can fire.
- The `discord` receiver still has no `discord_configs`, so
  notifications are dropped until a webhook is configured.
- Context-line indentation was restored from a whitespace-collapsed copy
  of this patch; if it does not match the target tree exactly, apply
  with `git apply --ignore-whitespace`.
- The blob hashes on the `index` lines are carried over unchanged and
  are stale for the files whose content differs from the original
  export.

 docker-compose.yml                 | 15 +++++++++
 monitoring/alert-rules.yml         | 49 ++++++++++++++++++++++++++++++
 monitoring/alertmanager-config.yml | 15 +++++++++
 monitoring/prometheus-config.yml   | 10 ++++++
 4 files changed, 89 insertions(+)
 create mode 100644 monitoring/alert-rules.yml
 create mode 100644 monitoring/alertmanager-config.yml

diff --git a/docker-compose.yml b/docker-compose.yml
index 346644d..8927623 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -108,12 +108,15 @@ services:
     image: prom/prometheus:latest
     volumes:
       - ./monitoring/prometheus-config.yml:/etc/prometheus/prometheus.yml:z
+      - ./monitoring/alert-rules.yml:/etc/prometheus/alert-rules.yml:z
     command:
       - --config.file=/etc/prometheus/prometheus.yml
       - --storage.tsdb.retention.time=7d
     ports:
       - 127.0.0.1:9090:9090
     restart: on-failure
+    depends_on:
+      - alertmanager
     networks:
       - simulation
 
@@ -212,5 +215,17 @@ services:
       - mongodb
       - redis
       - foundry
+    networks:
+      - simulation
+
+  alertmanager:
+    image: prom/alertmanager:latest
+    volumes:
+      - ./monitoring/alertmanager-config.yml:/etc/alertmanager/alertmanager.yml:z
+    command:
+      - --config.file=/etc/alertmanager/alertmanager.yml
+    ports:
+      - 127.0.0.1:9093:9093
+    restart: on-failure
     networks:
       - simulation
\ No newline at end of file
diff --git a/monitoring/alert-rules.yml b/monitoring/alert-rules.yml
new file mode 100644
index 0000000..9478457
--- /dev/null
+++ b/monitoring/alert-rules.yml
@@ -0,0 +1,49 @@
+---
+groups:
+  - name: waku
+    rules:
+      - alert: HighNimWakuNodeTraffic
+        expr: >
+          abs(
+            netdata_net_net_kilobits_persec_average{
+              family!~"^(wg|veth|br|docker).*",
+              fleet=~"(shards|waku|status|wakuv2)\\..*"
+            }) > 40000
+        for: 5m
+        annotations:
+          summary: "Too high traffic for {{ $labels.instance }}"
+          description: "Host {{ $labels.instance }} running nim-waku has {{ $labels.dimension }} traffic higher 40Mbps"
+
+      - alert: HighNimWakuMemUsage
+        expr: >
+          nim_gc_mem_bytes{
+            fleet=~"(shards|waku|status|wakuv2)\\..*"
+          } > 1073741824
+        for: 5m
+        annotations:
+          summary: "Too high memory usage for {{ $labels.instance }}"
+          description: "Host {{ $labels.instance }} running nim-waku has GC memory usage higher 1GB"
+          current_value: "{{ $value }}"
+
+      - alert: HighNimWakuNodeOpenSockets
+        expr: >
+          netdata_ipv4_sockstat_tcp_sockets_sockets_average{
+            fleet=~"(shards|waku|status|wakuv2)\\..*"
+          } > 300
+        for: 5m
+        annotations:
+          summary: "Too high open sockets for {{ $labels.instance }}"
+          description: "Host {{ $labels.instance }} running nim-waku has more than 300 open sockets"
+          current_value: "{{ $value }}"
+
+      - record: job:waku_libp2p_peers
+        expr: libp2p_peers{fleet=~"(shards|waku|status|wakuv2)\\..*"}
+
+      - alert: NimWakuPeersDecrease
+        expr: >
+          (job:waku_libp2p_peers / avg_over_time(job:waku_libp2p_peers[12h])) < 0.50
+        for: 15m
+        annotations:
+          summary: "Drop of libp2p_peers on {{ $labels.instance }}"
+          description: "Host {{ $labels.instance }} running nim-waku has more than 50% drop of peers compared to 12h average"
+          current_value: "{{ $value }}"
diff --git a/monitoring/alertmanager-config.yml b/monitoring/alertmanager-config.yml
new file mode 100644
index 0000000..742e72e
--- /dev/null
+++ b/monitoring/alertmanager-config.yml
@@ -0,0 +1,15 @@
+global:
+  resolve_timeout: 5m
+
+route:
+  group_by: ['alertname']
+  group_wait: 10s
+  group_interval: 10s
+  repeat_interval: 1h
+  receiver: 'discord'
+
+receivers:
+# NOTE: this receiver has no discord_configs yet, so every notification
+# routed to it is dropped. Add a discord_configs entry with the webhook
+# before relying on these alerts.
+- name: 'discord'
diff --git a/monitoring/prometheus-config.yml b/monitoring/prometheus-config.yml
index 69102ab..5368395 100644
--- a/monitoring/prometheus-config.yml
+++ b/monitoring/prometheus-config.yml
@@ -4,6 +4,16 @@ global:
   external_labels:
     monitor: "Monitoring"
 
+# Container path; docker-compose.yml mounts monitoring/alert-rules.yml there.
+rule_files:
+  - /etc/prometheus/alert-rules.yml
+
+alerting:
+  alertmanagers:
+    - scheme: http
+      static_configs:
+        - targets: [ 'alertmanager:9093' ]
+
 scrape_configs:
   - job_name: cadvisor
     scrape_interval: 5s