mirror of
https://github.com/logos-messaging/logos-messaging-simulator.git
synced 2026-02-26 16:53:13 +00:00
adding alert manager service
This commit is contained in:
parent
29c914995a
commit
8551bffaf0
@ -108,12 +108,15 @@ services:
|
||||
image: prom/prometheus:latest
|
||||
volumes:
|
||||
- ./monitoring/prometheus-config.yml:/etc/prometheus/prometheus.yml:z
|
||||
- ./monitoring/alert-rules.yml:/etc/prometheus/alert-rules.yml:z
|
||||
command:
|
||||
- --config.file=/etc/prometheus/prometheus.yml
|
||||
- --storage.tsdb.retention.time=7d
|
||||
ports:
|
||||
- 127.0.0.1:9090:9090
|
||||
restart: on-failure
|
||||
depends_on:
|
||||
- alertmanager
|
||||
networks:
|
||||
- simulation
|
||||
|
||||
@ -212,5 +215,17 @@ services:
|
||||
- mongodb
|
||||
- redis
|
||||
- foundry
|
||||
networks:
|
||||
- simulation
|
||||
|
||||
# Alertmanager container for the monitoring stack; the prometheus service
# lists it under depends_on.
alertmanager:
  image: "prom/alertmanager:latest"
  # The config file is bind-mounted from ./monitoring with the same `z`
  # volume option used by the other monitoring mounts in this file.
  volumes:
    - "./monitoring/alertmanager-config.yml:/etc/alertmanager/alertmanager.yml:z"
  # Point Alertmanager at the mounted config file.
  command:
    - "--config.file=/etc/alertmanager/alertmanager.yml"
  # Published on the host loopback address only.
  ports:
    - "127.0.0.1:9093:9093"
  restart: "on-failure"
  networks:
    - "simulation"
|
||||
49
monitoring/alert-rules.yml
Normal file
49
monitoring/alert-rules.yml
Normal file
@ -0,0 +1,49 @@
|
||||
---
# Prometheus rules for nim-waku hosts. Every selector below matches series
# whose `fleet` label starts with "shards.", "waku.", "status." or "wakuv2.".
groups:
  - name: waku
    rules:
      # Fires when the absolute value of the per-interface traffic metric
      # stays above 40000 for 5 minutes (kilobits/s per the metric name,
      # i.e. the 40Mbps quoted in the description). Interfaces whose
      # `family` label starts with wg, veth, br or docker are excluded.
      - alert: HighNimWakuNodeTraffic
        expr: >
          abs(
            netdata_net_net_kilobits_persec_average{
              family!~"^(wg|veth|br|docker).*",
              fleet=~"(shards|waku|status|wakuv2)\\..*"
            }) > 40000
        for: 5m
        annotations:
          summary: "Too high traffic for {{ $labels.instance }}"
          description: "Host {{ $labels.instance }} running nim-waku has {{ $labels.dimension }} traffic higher than 40Mbps"
          # Added so this alert reports its value like the other alerts here.
          current_value: "{{ $value }}"

      # Fires when nim_gc_mem_bytes stays above 1073741824 bytes (1GiB)
      # for 5 minutes.
      - alert: HighNimWakuMemUsage
        expr: >
          nim_gc_mem_bytes{
            fleet=~"(shards|waku|status|wakuv2)\\..*"
          } > 1073741824
        for: 5m
        annotations:
          summary: "Too high memory usage for {{ $labels.instance }}"
          description: "Host {{ $labels.instance }} running nim-waku has GC memory usage higher than 1GiB"
          current_value: "{{ $value }}"

      # Fires when the TCP socket count metric stays above 300 for 5 minutes.
      - alert: HighNimWakuNodeOpenSockets
        expr: >
          netdata_ipv4_sockstat_tcp_sockets_sockets_average{
            fleet=~"(shards|waku|status|wakuv2)\\..*"
          } > 300
        for: 5m
        annotations:
          summary: "Too high open sockets for {{ $labels.instance }}"
          description: "Host {{ $labels.instance }} running nim-waku has more than 300 open sockets"
          current_value: "{{ $value }}"

      # Recording rule: libp2p_peers restricted to the fleets above. It is
      # listed before NimWakuPeersDecrease, which reads the recorded series.
      - record: job:waku_libp2p_peers
        expr: libp2p_peers{fleet=~"(shards|waku|status|wakuv2)\\..*"}

      # Fires when the current peer count has been below 50% of its own
      # 12h average for 15 minutes.
      - alert: NimWakuPeersDecrease
        expr: >
          (job:waku_libp2p_peers / avg_over_time(job:waku_libp2p_peers[12h])) < 0.50
        for: 15m
        annotations:
          summary: "Drop of libp2p_peers on {{ $labels.instance }}"
          description: "Host {{ $labels.instance }} running nim-waku has more than 50% drop of peers compared to 12h average"
          current_value: "{{ $value }}"
|
||||
13
monitoring/alertmanager-config.yml
Normal file
13
monitoring/alertmanager-config.yml
Normal file
@ -0,0 +1,13 @@
|
||||
# Alertmanager configuration: a single route that sends every alert to the
# `discord` receiver.
global:
  resolve_timeout: 5m

route:
  # Alerts sharing the same alertname are batched into one notification group.
  group_by:
    - alertname
  group_wait: 10s
  group_interval: 10s
  repeat_interval: 1h
  receiver: discord

receivers:
  # NOTE(review): this receiver only has a name; no notification integration
  # is defined for it in this file. Confirm where the Discord webhook is
  # expected to be configured.
  - name: discord
|
||||
|
||||
@ -4,6 +4,12 @@ global:
|
||||
external_labels:
|
||||
monitor: "Monitoring"
|
||||
|
||||
# Send fired alerts to Alertmanager over plain HTTP. The target host name
# and port match the `alertmanager` compose service and its port 9093.
alerting:
  alertmanagers:
    - scheme: http
      static_configs:
        - targets:
            - "alertmanager:9093"
|
||||
|
||||
scrape_configs:
|
||||
- job_name: cadvisor
|
||||
scrape_interval: 5s
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user