From 8551bffaf002259164d262ee3f65388c419f3304 Mon Sep 17 00:00:00 2001 From: Gabriel mermelstein Date: Wed, 24 Apr 2024 14:25:03 +0300 Subject: [PATCH 1/7] adding alert manager service --- docker-compose.yml | 15 +++++++++ monitoring/alert-rules.yml | 49 ++++++++++++++++++++++++++++++ monitoring/alertmanager-config.yml | 13 ++++++++ monitoring/prometheus-config.yml | 6 ++++ 4 files changed, 83 insertions(+) create mode 100644 monitoring/alert-rules.yml create mode 100644 monitoring/alertmanager-config.yml diff --git a/docker-compose.yml b/docker-compose.yml index 346644d..8927623 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -108,12 +108,15 @@ services: image: prom/prometheus:latest volumes: - ./monitoring/prometheus-config.yml:/etc/prometheus/prometheus.yml:z + - ./monitoring/alert-rules.yml:/etc/prometheus/alert-rules.yml:z command: - --config.file=/etc/prometheus/prometheus.yml - --storage.tsdb.retention.time=7d ports: - 127.0.0.1:9090:9090 restart: on-failure + depends_on: + - alertmanager networks: - simulation @@ -212,5 +215,17 @@ services: - mongodb - redis - foundry + networks: + - simulation + + alertmanager: + image: prom/alertmanager:latest + volumes: + - ./monitoring/alertmanager-config.yml:/etc/alertmanager/alertmanager.yml:z + command: + - --config.file=/etc/alertmanager/alertmanager.yml + ports: + - 127.0.0.1:9093:9093 + restart: on-failure networks: - simulation \ No newline at end of file diff --git a/monitoring/alert-rules.yml b/monitoring/alert-rules.yml new file mode 100644 index 0000000..9478457 --- /dev/null +++ b/monitoring/alert-rules.yml @@ -0,0 +1,49 @@ +--- +groups: + - name: waku + rules: + - alert: HighNimWakuNodeTraffic + expr: > + abs( + netdata_net_net_kilobits_persec_average{ + family!~"^(wg|veth|br|docker).*", + fleet=~"(shards|waku|status|wakuv2)\\..*" + }) > 40000 + for: 5m + annotations: + summary: "Too high traffic for {{ $labels.instance }}" + description: "Host {{ $labels.instance }} running nim-waku has {{ $labels.dimension }} traffic higher 40Mbps" + + - alert: HighNimWakuMemUsage + expr: > + nim_gc_mem_bytes{ + fleet=~"(shards|waku|status|wakuv2)\\..*" + } > 1073741824 + for: 5m + annotations: + summary: "Too high memory usage for {{ $labels.instance }}" + description: "Host {{ $labels.instance }} running nim-waku has GC memory usage higher 1GB" + current_value: "{{ $value }}" + + - alert: HighNimWakuNodeOpenSockets + expr: > + netdata_ipv4_sockstat_tcp_sockets_sockets_average{ + fleet=~"(shards|waku|status|wakuv2)\\..*" + } > 300 + for: 5m + annotations: + summary: "Too high open sockets for {{ $labels.instance }}" + description: "Host {{ $labels.instance }} running nim-waku has more than 300 open sockets" + current_value: "{{ $value }}" + + - record: job:waku_libp2p_peers + expr: libp2p_peers{fleet=~"(shards|waku|status|wakuv2)\\..*"} + + - alert: NimWakuPeersDecrease + expr: > + (job:waku_libp2p_peers / avg_over_time(job:waku_libp2p_peers[12h])) < 0.50 + for: 15m + annotations: + summary: "Drop of libp2p_peers on {{ $labels.instance }}" + description: "Host {{ $labels.instance }} running nim-waku has more than 50% drop of peers compared to 12h average" + current_value: "{{ $value }}" \ No newline at end of file diff --git a/monitoring/alertmanager-config.yml b/monitoring/alertmanager-config.yml new file mode 100644 index 0000000..742e72e --- /dev/null +++ b/monitoring/alertmanager-config.yml @@ -0,0 +1,13 @@ +global: + resolve_timeout: 5m + +route: + group_by: ['alertname'] + group_wait: 10s + group_interval: 10s + repeat_interval: 1h + receiver: 'discord' + +receivers: +- name: 'discord' + \ No newline at end of file diff --git a/monitoring/prometheus-config.yml b/monitoring/prometheus-config.yml index 69102ab..5368395 100644 --- a/monitoring/prometheus-config.yml +++ b/monitoring/prometheus-config.yml @@ -4,6 +4,12 @@ global: external_labels: monitor: "Monitoring" +alerting: + alertmanagers: + - scheme: http + static_configs: + - targets: [ 'alertmanager:9093' ] + scrape_configs: - job_name: cadvisor scrape_interval: 5s From 722c58ab8b4a3d74e04b0d433e4c4fbfff05a409 Mon Sep 17 00:00:00 2001 From: Gabriel mermelstein Date: Wed, 24 Apr 2024 17:52:22 +0300 Subject: [PATCH 2/7] Having first alert working --- monitoring/alert-rules.yml | 40 +------------------------------- monitoring/prometheus-config.yml | 3 +++ 2 files changed, 4 insertions(+), 39 deletions(-) diff --git a/monitoring/alert-rules.yml b/monitoring/alert-rules.yml index 9478457..24d3dbc 100644 --- a/monitoring/alert-rules.yml +++ b/monitoring/alert-rules.yml @@ -1,49 +1,11 @@ ---- groups: - name: waku rules: - - alert: HighNimWakuNodeTraffic - expr: > - abs( - netdata_net_net_kilobits_persec_average{ - family!~"^(wg|veth|br|docker).*", - fleet=~"(shards|waku|status|wakuv2)\\..*" - }) > 40000 - for: 5m - annotations: - summary: "Too high traffic for {{ $labels.instance }}" - description: "Host {{ $labels.instance }} running nim-waku has {{ $labels.dimension }} traffic higher 40Mbps" - - alert: HighNimWakuMemUsage expr: > - nim_gc_mem_bytes{ - fleet=~"(shards|waku|status|wakuv2)\\..*" - } > 1073741824 + nim_gc_mem_bytes{} > 1073741824 for: 5m annotations: summary: "Too high memory usage for {{ $labels.instance }}" description: "Host {{ $labels.instance }} running nim-waku has GC memory usage higher 1GB" - current_value: "{{ $value }}" - - - alert: HighNimWakuNodeOpenSockets - expr: > - netdata_ipv4_sockstat_tcp_sockets_sockets_average{ - fleet=~"(shards|waku|status|wakuv2)\\..*" - } > 300 - for: 5m - annotations: - summary: "Too high open sockets for {{ $labels.instance }}" - description: "Host {{ $labels.instance }} running nim-waku has more than 300 open sockets" - current_value: "{{ $value }}" - - - record: job:waku_libp2p_peers - expr: libp2p_peers{fleet=~"(shards|waku|status|wakuv2)\\..*"} - - - alert: NimWakuPeersDecrease - expr: > - (job:waku_libp2p_peers / avg_over_time(job:waku_libp2p_peers[12h])) < 0.50 - for: 15m - annotations: - summary: "Drop of libp2p_peers on {{ $labels.instance }}" - description: "Host {{ $labels.instance }} running nim-waku has more than 50% drop of peers compared to 12h average" current_value: "{{ $value }}" \ No newline at end of file diff --git a/monitoring/prometheus-config.yml b/monitoring/prometheus-config.yml index 5368395..322f305 100644 --- a/monitoring/prometheus-config.yml +++ b/monitoring/prometheus-config.yml @@ -10,6 +10,9 @@ alerting: static_configs: - targets: [ 'alertmanager:9093' ] +rule_files: + - "./alert-rules.yml" + scrape_configs: - job_name: cadvisor scrape_interval: 5s From 8a0c7084b74e1c3b16e7fe924c4d4b38136db9a2 Mon Sep 17 00:00:00 2001 From: Gabriel mermelstein Date: Thu, 25 Apr 2024 14:09:32 +0300 Subject: [PATCH 3/7] adding libp2p_peers drop alert --- monitoring/alert-rules.yml | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/monitoring/alert-rules.yml b/monitoring/alert-rules.yml index 24d3dbc..5d02aaa 100644 --- a/monitoring/alert-rules.yml +++ b/monitoring/alert-rules.yml @@ -8,4 +8,16 @@ groups: annotations: summary: "Too high memory usage for {{ $labels.instance }}" description: "Host {{ $labels.instance }} running nim-waku has GC memory usage higher 1GB" + current_value: "{{ $value }}" + + - record: job:waku_libp2p_peers + expr: libp2p_peers{} + + - alert: NimWakuPeersDecrease + expr: > + (job:waku_libp2p_peers / avg_over_time(job:waku_libp2p_peers[12h])) < 0.50 + for: 15m + annotations: + summary: "Drop of libp2p_peers on {{ $labels.instance }}" + description: "Host {{ $labels.instance }} running nim-waku has more than 50% drop of peers compared to 12h average" current_value: "{{ $value }}" \ No newline at end of file From 83ae90030a7a7aade000237d4957b6f730e04e2d Mon Sep 17 00:00:00 2001 From: Gabriel mermelstein Date: Thu, 25 Apr 2024 14:29:50 +0300 Subject: [PATCH 4/7] improving alertmanager-config --- monitoring/alertmanager-config.yml | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/monitoring/alertmanager-config.yml b/monitoring/alertmanager-config.yml index 742e72e..49b7e90 100644 --- a/monitoring/alertmanager-config.yml +++ b/monitoring/alertmanager-config.yml @@ -1,11 +1,8 @@ -global: - resolve_timeout: 5m - route: group_by: ['alertname'] - group_wait: 10s - group_interval: 10s - repeat_interval: 1h + group_wait: 60s + group_interval: 5m + repeat_interval: 1d receiver: 'discord' receivers: From 60c9f3523b43fc515ecca63414a2499028600261 Mon Sep 17 00:00:00 2001 From: Gabriel mermelstein Date: Thu, 25 Apr 2024 14:31:32 +0300 Subject: [PATCH 5/7] adding to do comment --- monitoring/alertmanager-config.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/monitoring/alertmanager-config.yml b/monitoring/alertmanager-config.yml index 49b7e90..be1af7c 100644 --- a/monitoring/alertmanager-config.yml +++ b/monitoring/alertmanager-config.yml @@ -5,6 +5,7 @@ route: repeat_interval: 1d receiver: 'discord' +# TODO: fill data about Discord receiver receivers: - name: 'discord' \ No newline at end of file From fc0247055cf3df5800506cf3022380fdc5501089 Mon Sep 17 00:00:00 2001 From: Gabriel mermelstein Date: Mon, 29 Apr 2024 17:49:23 +0300 Subject: [PATCH 6/7] setting up discord receiver --- monitoring/alertmanager-config.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/monitoring/alertmanager-config.yml b/monitoring/alertmanager-config.yml index be1af7c..77bf5fa 100644 --- a/monitoring/alertmanager-config.yml +++ b/monitoring/alertmanager-config.yml @@ -5,7 +5,7 @@ route: repeat_interval: 1d receiver: 'discord' -# TODO: fill data about Discord receiver receivers: -- name: 'discord' - \ No newline at end of file + - name: 'discord' + discord_configs: + - webhook_url: # TO DO: fill discord webhook \ No newline at end of file From 59fce1f32474a75eef53efa18e2e25ce23ccc67c Mon Sep 17 00:00:00 2001 From: Gabriel mermelstein Date: Tue, 30 Apr 2024 17:21:06 +0300 Subject: [PATCH 7/7] adding support for discord webhook in env --- .gitignore | 1 + docker-compose.yml | 17 ++++++++++++++--- monitoring/{ => alert-manager}/alert-rules.yml | 2 +- .../alertmanager-config.yml.template | 11 +++++++++++ monitoring/alertmanager-config.yml | 11 ----------- 5 files changed, 27 insertions(+), 15 deletions(-) rename monitoring/{ => alert-manager}/alert-rules.yml (95%) create mode 100644 monitoring/alert-manager/alertmanager-config.yml.template delete mode 100644 monitoring/alertmanager-config.yml diff --git a/.gitignore b/.gitignore index 047d548..65e2a1a 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ **/.DS_Store *.env !wakusim.env +alertmanager-config.yml diff --git a/docker-compose.yml b/docker-compose.yml index 8927623..cbd99d8 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -108,7 +108,7 @@ services: image: prom/prometheus:latest volumes: - ./monitoring/prometheus-config.yml:/etc/prometheus/prometheus.yml:z - - ./monitoring/alert-rules.yml:/etc/prometheus/alert-rules.yml:z + - ./monitoring/alert-manager/alert-rules.yml:/etc/prometheus/alert-rules.yml:z command: - --config.file=/etc/prometheus/prometheus.yml - --storage.tsdb.retention.time=7d @@ -218,14 +218,25 @@ services: networks: - simulation + env_replacer: + image: alpine:3.19.1 + environment: + - DISCORD_WEBHOOK=$DISCORD_WEBHOOK + volumes: + - ./monitoring/alert-manager/alertmanager-config.yml.template:/etc/alertmanager/alertmanager.yml.template:z + - ./monitoring/alert-manager/alertmanager-config.yml:/etc/alertmanager/alertmanager.yml:z + command: ["/bin/sh", "-c", "apk add --no-cache gettext && envsubst < /etc/alertmanager/alertmanager.yml.template > /etc/alertmanager/alertmanager.yml"] + alertmanager: image: prom/alertmanager:latest volumes: - - ./monitoring/alertmanager-config.yml:/etc/alertmanager/alertmanager.yml:z + - ./monitoring/alert-manager/alertmanager-config.yml:/etc/alertmanager/alertmanager.yml:z command: - --config.file=/etc/alertmanager/alertmanager.yml ports: - 127.0.0.1:9093:9093 restart: on-failure networks: - - simulation \ No newline at end of file + - simulation + depends_on: + - env_replacer \ No newline at end of file diff --git a/monitoring/alert-rules.yml b/monitoring/alert-manager/alert-rules.yml similarity index 95% rename from monitoring/alert-rules.yml rename to monitoring/alert-manager/alert-rules.yml index 5d02aaa..e7bff1c 100644 --- a/monitoring/alert-rules.yml +++ b/monitoring/alert-manager/alert-rules.yml @@ -3,7 +3,7 @@ groups: rules: - alert: HighNimWakuMemUsage expr: > - nim_gc_mem_bytes{} > 1073741824 + nim_gc_mem_bytes{} > 1 for: 5m annotations: summary: "Too high memory usage for {{ $labels.instance }}" diff --git a/monitoring/alert-manager/alertmanager-config.yml.template b/monitoring/alert-manager/alertmanager-config.yml.template new file mode 100644 index 0000000..f08be0d --- /dev/null +++ b/monitoring/alert-manager/alertmanager-config.yml.template @@ -0,0 +1,11 @@ +route: + group_by: ['alertname'] + group_wait: 10s + group_interval: 10s + repeat_interval: 2m + receiver: 'discord' + +receivers: + - name: 'discord' + discord_configs: + - webhook_url: ${DISCORD_WEBHOOK} \ No newline at end of file diff --git a/monitoring/alertmanager-config.yml b/monitoring/alertmanager-config.yml deleted file mode 100644 index 77bf5fa..0000000 --- a/monitoring/alertmanager-config.yml +++ /dev/null @@ -1,11 +0,0 @@ -route: - group_by: ['alertname'] - group_wait: 60s - group_interval: 5m - repeat_interval: 1d - receiver: 'discord' - -receivers: - - name: 'discord' - discord_configs: - - webhook_url: # TO DO: fill discord webhook \ No newline at end of file