Merge 59fce1f32474a75eef53efa18e2e25ce23ccc67c into bdcef6e9f8232c2a4c43e4335ed228957eed770a

This commit is contained in:
gabrielmer 2024-04-30 17:23:37 +03:00 committed by GitHub
commit 644c525c8c
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 71 additions and 1 deletions

1
.gitignore vendored
View File

@ -1,3 +1,4 @@
# macOS folder metadata files, at any depth.
**/.DS_Store
# Ignore environment files, but re-include wakusim.env via the negation pattern.
*.env
!wakusim.env
# Rendered Alertmanager config, written by the env_replacer compose service
# from alertmanager-config.yml.template (holds the substituted Discord webhook).
alertmanager-config.yml

View File

@ -108,12 +108,15 @@ services:
image: prom/prometheus:latest
volumes:
- ./monitoring/prometheus-config.yml:/etc/prometheus/prometheus.yml:z
- ./monitoring/alert-manager/alert-rules.yml:/etc/prometheus/alert-rules.yml:z
command:
- --config.file=/etc/prometheus/prometheus.yml
- --storage.tsdb.retention.time=7d
ports:
- 127.0.0.1:9090:9090
restart: on-failure
depends_on:
- alertmanager
networks:
- simulation
@ -213,4 +216,27 @@ services:
- redis
- foundry
networks:
- simulation
- simulation
env_replacer:
  # One-shot helper: renders the Alertmanager config from its template by
  # substituting environment variables (DISCORD_WEBHOOK) with envsubst,
  # then exits.
  image: alpine:3.19.1
  environment:
    - DISCORD_WEBHOOK=$DISCORD_WEBHOOK
  volumes:
    # Mount the whole directory instead of the two individual files. The
    # rendered alertmanager-config.yml is git-ignored, so it does not exist on
    # a fresh checkout; a file-level bind mount (short syntax) would make
    # Docker create a *directory* at that host path and the redirect below
    # would fail with "Is a directory".
    - ./monitoring/alert-manager:/work:z
  command:
    - /bin/sh
    - "-c"
    # envsubst is shipped in the gettext package, which the base Alpine
    # image does not include.
    - >-
      apk add --no-cache gettext &&
      envsubst < /work/alertmanager-config.yml.template > /work/alertmanager-config.yml
alertmanager:
  # Alertmanager instance; its configuration file is the one rendered by the
  # env_replacer service from the template.
  image: prom/alertmanager:latest
  volumes:
    - ./monitoring/alert-manager/alertmanager-config.yml:/etc/alertmanager/alertmanager.yml:z
  command:
    - --config.file=/etc/alertmanager/alertmanager.yml
  ports:
    # Bound to loopback only; quoted so the mapping is always read as a string.
    - "127.0.0.1:9093:9093"
  restart: on-failure
  networks:
    - simulation
  depends_on:
    # The short form only waits for env_replacer to be *started*, which races
    # with the config being written. Wait until the one-shot container has
    # exited successfully so the rendered file exists before Alertmanager
    # reads it. Requires a Compose implementation that supports the long
    # depends_on form (Compose Specification).
    env_replacer:
      condition: service_completed_successfully

View File

@ -0,0 +1,23 @@
groups:
  - name: waku
    rules:
      # Fires when a node's Nim GC memory stays above 1 GB for 5 minutes.
      # nim_gc_mem_bytes is expressed in bytes, so the threshold must be 1e9;
      # the previous threshold of 1 (one byte) contradicted the description
      # below and made the alert fire for every running node.
      - alert: HighNimWakuMemUsage
        expr: >
          nim_gc_mem_bytes{} > 1e9
        for: 5m
        annotations:
          summary: "Too high memory usage for {{ $labels.instance }}"
          description: "Host {{ $labels.instance }} running nim-waku has GC memory usage higher 1GB"
          current_value: "{{ $value }}"
      # Recording rule: exposes libp2p_peers under a stable name that the
      # peers-decrease alert below queries.
      - record: job:waku_libp2p_peers
        expr: libp2p_peers{}
      # Fires when the current peer count has been below 50% of its own
      # 12-hour average for 15 minutes.
      - alert: NimWakuPeersDecrease
        expr: >
          (job:waku_libp2p_peers / avg_over_time(job:waku_libp2p_peers[12h])) < 0.50
        for: 15m
        annotations:
          summary: "Drop of libp2p_peers on {{ $labels.instance }}"
          description: "Host {{ $labels.instance }} running nim-waku has more than 50% drop of peers compared to 12h average"
          current_value: "{{ $value }}"

View File

@ -0,0 +1,11 @@
# Alertmanager configuration template. ${DISCORD_WEBHOOK} is a placeholder
# that is substituted (via envsubst) before Alertmanager loads the file.

# Single route: every alert goes to the Discord receiver, grouped by alert name.
route:
  receiver: discord
  group_by:
    - alertname
  group_wait: 10s
  group_interval: 10s
  repeat_interval: 2m

receivers:
  - name: discord
    discord_configs:
      - webhook_url: ${DISCORD_WEBHOOK}

View File

@ -4,6 +4,15 @@ global:
external_labels:
monitor: "Monitoring"
# Deliver alerts to the Alertmanager instance reachable as alertmanager:9093
# over plain HTTP.
alerting:
  alertmanagers:
    - scheme: http
      static_configs:
        - targets:
            - "alertmanager:9093"
# Alerting/recording rules to load.
# NOTE(review): the relative path presumably resolves next to this config
# file (the compose file mounts both under /etc/prometheus) - confirm.
rule_files:
  - "./alert-rules.yml"
scrape_configs:
- job_name: cadvisor
scrape_interval: 5s