diff --git a/.gitignore b/.gitignore
index 047d548..65e2a1a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,4 @@
 **/.DS_Store
 *.env
 !wakusim.env
+alertmanager-config.yml
diff --git a/docker-compose.yml b/docker-compose.yml
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -108,12 +108,15 @@ services:
     image: prom/prometheus:latest
     volumes:
       - ./monitoring/prometheus-config.yml:/etc/prometheus/prometheus.yml:z
+      - ./monitoring/alert-manager/alert-rules.yml:/etc/prometheus/alert-rules.yml:z
     command:
       - --config.file=/etc/prometheus/prometheus.yml
       - --storage.tsdb.retention.time=7d
     ports:
       - 127.0.0.1:9090:9090
     restart: on-failure
+    depends_on:
+      - alertmanager
     networks:
       - simulation
 
@@ -213,4 +216,35 @@ services:
       - redis
       - foundry
     networks:
-      - simulation
\ No newline at end of file
+      - simulation
+
+  # One-shot helper: renders alertmanager-config.yml from its template by
+  # substituting DISCORD_WEBHOOK with envsubst, then exits.
+  # NOTE(review): the output file is git-ignored; if it is missing on the host,
+  # the short-syntax bind mount may create a directory there instead - confirm
+  # and pre-create the file (e.g. with `touch`) if so.
+  env_replacer:
+    image: alpine:3.19.1
+    environment:
+      - DISCORD_WEBHOOK=$DISCORD_WEBHOOK
+    volumes:
+      - ./monitoring/alert-manager/alertmanager-config.yml.template:/etc/alertmanager/alertmanager.yml.template:z
+      - ./monitoring/alert-manager/alertmanager-config.yml:/etc/alertmanager/alertmanager.yml:z
+    command: ["/bin/sh", "-c", "apk add --no-cache gettext && envsubst < /etc/alertmanager/alertmanager.yml.template > /etc/alertmanager/alertmanager.yml"]
+
+  alertmanager:
+    image: prom/alertmanager:latest
+    volumes:
+      - ./monitoring/alert-manager/alertmanager-config.yml:/etc/alertmanager/alertmanager.yml:z
+    command:
+      - --config.file=/etc/alertmanager/alertmanager.yml
+    ports:
+      - 127.0.0.1:9093:9093
+    restart: on-failure
+    networks:
+      - simulation
+    depends_on:
+      # Wait for env_replacer to finish, not merely start, so the rendered
+      # config exists before Alertmanager reads it.
+      env_replacer:
+        condition: service_completed_successfully
diff --git a/monitoring/alert-manager/alert-rules.yml b/monitoring/alert-manager/alert-rules.yml
new file mode 100644
--- /dev/null
+++ b/monitoring/alert-manager/alert-rules.yml
@@ -0,0 +1,26 @@
+# Prometheus alerting and recording rules for nim-waku nodes.
+groups:
+  - name: waku
+    rules:
+      - alert: HighNimWakuMemUsage
+        # 1073741824 bytes = 1 GiB, matching the "1GB" in the description below.
+        expr: >
+          nim_gc_mem_bytes{} > 1073741824
+        for: 5m
+        annotations:
+          summary: "Too high memory usage for {{ $labels.instance }}"
+          description: "Host {{ $labels.instance }} running nim-waku has GC memory usage higher 1GB"
+          current_value: "{{ $value }}"
+
+      # Recorded series reused by the peer-drop alert below.
+      - record: job:waku_libp2p_peers
+        expr: libp2p_peers{}
+
+      - alert: NimWakuPeersDecrease
+        expr: >
+          (job:waku_libp2p_peers / avg_over_time(job:waku_libp2p_peers[12h])) < 0.50
+        for: 15m
+        annotations:
+          summary: "Drop of libp2p_peers on {{ $labels.instance }}"
+          description: "Host {{ $labels.instance }} running nim-waku has more than 50% drop of peers compared to 12h average"
+          current_value: "{{ $value }}"
diff --git a/monitoring/alert-manager/alertmanager-config.yml.template b/monitoring/alert-manager/alertmanager-config.yml.template
new file mode 100644
--- /dev/null
+++ b/monitoring/alert-manager/alertmanager-config.yml.template
@@ -0,0 +1,13 @@
+route:
+  group_by: ['alertname']
+  group_wait: 10s
+  group_interval: 10s
+  repeat_interval: 2m
+  receiver: 'discord'
+
+receivers:
+  - name: 'discord'
+    discord_configs:
+      # The placeholder below is filled in by envsubst; it is quoted so an
+      # empty or unusual value still parses as a string.
+      - webhook_url: '${DISCORD_WEBHOOK}'
diff --git a/monitoring/prometheus-config.yml b/monitoring/prometheus-config.yml
index 69102ab..322f305 100644
--- a/monitoring/prometheus-config.yml
+++ b/monitoring/prometheus-config.yml
@@ -4,6 +4,15 @@ global:
   external_labels:
     monitor: "Monitoring"
 
+alerting:
+  alertmanagers:
+    - scheme: http
+      static_configs:
+        - targets: [ 'alertmanager:9093' ]
+
+rule_files:
+  - "./alert-rules.yml"
+
 scrape_configs:
   - job_name: cadvisor
     scrape_interval: 5s