add role files from infra-hq
Signed-off-by: Jakub Sokołowski <jakub@status.im>
This commit is contained in:
parent
0c7c8165c3
commit
dbb90251c2
|
@ -0,0 +1,50 @@
|
||||||
|
# Description
|
||||||
|
|
||||||
|
This role configures [AlertManager](https://prometheus.io/docs/alerting/alertmanager/) to notify people of threshold breaches in rules configured in Prometheus __master__ instance.
|
||||||
|
|
||||||
|
# Service
|
||||||
|
|
||||||
|
AlertManager runs in a cluster to achieve high availability. The peers connect via [Tinc VPN](https://github.com/status-im/infra-role-bootstrap/tree/master/tasks/tinc).
|
||||||
|
The service listens on `:9093` and the Prometheus instance connects to that port via the VPN to inform it of threshold breaches.
|
||||||
|
|
||||||
|
The service UI is available at: https://alerts.status.im/
|
||||||
|
|
||||||
|
# CLI Tool
|
||||||
|
|
||||||
|
You can manage existing alerts by using the `amtool` on any of the hosts running this:
|
||||||
|
```
|
||||||
|
> amtool alert
|
||||||
|
Alertname Starts At Summary
|
||||||
|
Test_Alert 2018-07-06 18:30:18 UTC This is a testing alert!
|
||||||
|
> amtool silence
|
||||||
|
ID Matchers Starts At Ends At Updated At Created By Comment
|
||||||
|
9635b573-5177-4601-a3b0-ac6a25d0a4ef alertname=InstanceDown 2018-07-06 12:37:04 UTC 2018-07-06 14:36:05 UTC 2018-07-06 12:37:04 UTC jakubgs test
|
||||||
|
```
|
||||||
|
|
||||||
|
# Configuration
|
||||||
|
|
||||||
|
The main configuration resides in [`templates/alertmanager.yml.j2`](templates/alertmanager.yml.j2). It configures all the receivers of alerts generated by Prometheus __master__ instance.
|
||||||
|
|
||||||
|
There are three main sections:
|
||||||
|
|
||||||
|
* `global` - Configure general auth related options for SMTP and Slack receivers.
|
||||||
|
* `receivers` - Defines destinations of alerts which can be used in the `route` section.
|
||||||
|
* `route` - Defines rules based on which alerts are directed to defined receivers.
|
||||||
|
|
||||||
|
For more details see: https://prometheus.io/docs/alerting/configuration/
|
||||||
|
|
||||||
|
# Ansible Variables
|
||||||
|
|
||||||
|
The bare minimum should be:
|
||||||
|
```yml
|
||||||
|
alertmanager_domain: 'alerts.example.org'
|
||||||
|
alertmanager_admin_email: 'admin@example.org'
|
||||||
|
alertmanager_smtp_host: 'smtp.mail.example.org'
|
||||||
|
alertmanager_smtp_from: 'alerts@example.org'
|
||||||
|
alertmanager_smtp_user: 'secret-smtp-user'
|
||||||
|
alertmanager_smtp_pass: 'secret-smtp-pass'
|
||||||
|
|
||||||
|
alertmanager_victorops_api_key: 'secret-victorops-api-key'
|
||||||
|
alertmanager_victorops_routing_key: 'alert-manager'
|
||||||
|
```
|
||||||
|
Take note you will have to create an `alert-manager` routing rule in VictorOps.
|
|
@ -0,0 +1,36 @@
|
||||||
|
---
# Container image, name, host-side volume and log level for AlertManager.
alertmanager_cont_tag: 'v0.21.0'
alertmanager_cont_image: 'quay.io/prometheus/alertmanager:{{ alertmanager_cont_tag }}'
alertmanager_cont_name: 'alertmanager'
alertmanager_cont_vol: '/docker/{{ alertmanager_cont_name }}'
alertmanager_cont_log_lvl: 'info'

# port used for the web ui
alertmanager_webui_port: 9093
# port used for clustering AlertManager peers
alertmanager_cluster_port: 9094

# SMTP settings used for e-mail notifications. Host, sender and credentials
# have no sensible default and must be provided by the inventory.
alertmanager_smtp_host: null
# The template renders smtp_smarthost as '<host>:<port>', so the port must not
# be empty. 587 is the standard mail submission port (STARTTLS), which matches
# the 'smtp_require_tls: true' set in the AlertManager config template.
alertmanager_smtp_port: 587
alertmanager_smtp_from: null
alertmanager_smtp_user: null
alertmanager_smtp_pass: null

# REQUIRED. Deliberately left undefined instead of defaulting to null: the
# `mandatory` filter below only fails on *undefined* variables, so a null
# default would let it pass and produce an external URL without a host.
# alertmanager_domain: 'alerts.example.org'
alertmanager_url: 'https://{{ alertmanager_domain | mandatory }}/'
alertmanager_admin_email: null

# VictorOps paging service
alertmanager_victorops_api_key: null
# AlertManager's own documented default endpoint. A null value here would be
# rendered as an empty 'victorops_api_url' in the config template.
alertmanager_victorops_service_url: 'https://alert.victorops.com/integrations/generic/20131114/alert/'
alertmanager_victorops_routing_key: null
alertmanager_victorops_message_type: 'ERROR'
# Wrapped in raw/endraw so that Ansible leaves the Go template for AlertManager.
alertmanager_victorops_state_message: '{% raw %}Alert: {{ .CommonLabels.alertname }}. Summary:{{ .CommonAnnotations.summary }}. RawData: {{ .CommonLabels }}{% endraw %}'

# For discovery of prometheus master nodes
consul_catalog_url: 'http://localhost:8500/v1/catalog'

# Generic container options
cont_state: started
cont_recreate: false
cont_restart: false
|
|
@ -0,0 +1,3 @@
|
||||||
|
---
# Dumps the currently loaded iptables rules into /etc/iptables/rules.v4.
- name: Save iptables rules
  shell: >-
    iptables-save > /etc/iptables/rules.v4
|
|
@ -0,0 +1,22 @@
|
||||||
|
---
# Host-side directories under the container volume root, one per item.
- name: Create container directories
  with_items:
    - 'data'
    - 'conf'
  file:
    state: directory
    path: '{{ alertmanager_cont_vol }}/{{ item }}'
    owner: dockremap
    group: docker
    recurse: true

# Renders every listed template into the 'conf' directory. The result is
# registered as 'alertmanager_cont_config' so later tasks can react to changes.
- name: Create config files
  with_items:
    - alertmanager.yml
    - amtool.yml
  register: alertmanager_cont_config
  template:
    src: '{{ item }}.j2'
    dest: '{{ alertmanager_cont_vol }}/conf/{{ item }}'
    owner: dockremap
    group: docker
|
|
@ -0,0 +1,19 @@
|
||||||
|
---
# Registers AlertManager as a Consul service, advertised on the Tinc VPN
# address, with an HTTP health check against the local web UI port.
- name: Create Consul service definition
  include_role:
    name: consul-service
  vars:
    consul_config_name: '{{ alertmanager_cont_name }}'
    consul_services:
      - name: '{{ alertmanager_cont_name }}'
        tags: ['metrics', 'alertmanager']
        port: '{{ alertmanager_webui_port }}'
        address: '{{ ansible_local.tinc.vpn_ip }}'
        checks:
          - id: alertmanager-status
            name: Alert Manager status
            type: http
            http: 'http://localhost:{{ alertmanager_webui_port }}/api/v1/receivers'

# We need to do this for discover step to work
- name: Reload Consul right away
  command: consul reload
|
|
@ -0,0 +1,29 @@
|
||||||
|
---
# Runs the AlertManager container with the rendered config files mounted
# read-only and one --cluster.peer flag per discovered peer address.
- name: Start container
  docker_container:
    name: '{{ alertmanager_cont_name }}'
    image: '{{ alertmanager_cont_image }}'
    user: root
    pull: true
    restart_policy: always
    state: '{{ cont_state }}'
    recreate: '{{ cont_recreate }}'
    # Restart when the config templates changed OR when a restart is requested
    # through 'cont_restart'. A plain `| default(cont_restart)` never used the
    # override, because a registered result always carries a 'changed' key.
    restart: '{{ (alertmanager_cont_config.changed | default(false)) or (cont_restart | bool) }}'
    ports:
      - '0.0.0.0:{{ alertmanager_cluster_port }}:{{ alertmanager_cluster_port }}'
      - '0.0.0.0:{{ alertmanager_webui_port }}:{{ alertmanager_webui_port }}'
    volumes:
      - '{{ alertmanager_cont_vol }}/conf/alertmanager.yml:/etc/alertmanager.yml:ro'
      - '{{ alertmanager_cont_vol }}/conf/amtool.yml:/etc/amtool/config.yml:ro'
      - '{{ alertmanager_cont_vol }}/data:/data'
      - '/certs:/certs'
    # 'cluster_peers' is the list of addresses set as a fact by the discover tasks.
    command: |
      --storage.path=/data
      --log.level={{ alertmanager_cont_log_lvl }}
      --config.file=/etc/alertmanager.yml
      --web.external-url={{ alertmanager_url }}
      --cluster.advertise-address="{{ ansible_local.tinc.vpn_ip }}:{{ alertmanager_cluster_port }}"
      --cluster.listen-address="0.0.0.0:{{ alertmanager_cluster_port }}"
      {% for peer in cluster_peers %}
      --cluster.peer={{ peer }}:{{ alertmanager_cluster_port }}
      {% endfor %}
|
|
@ -0,0 +1,21 @@
|
||||||
|
---
# Asks the Consul catalog for the list of known data centers.
- name: Get data centers
  uri:
    url: '{{ consul_catalog_url }}/datacenters'
  register: data_centers

# Queries the AlertManager service entries once per data center.
- name: Get AlertManager node addresses
  uri:
    url: '{{ consul_catalog_url }}/service/{{ alertmanager_cont_name }}?dc={{ item }}'
    method: GET
    validate_certs: false
  with_items: '{{ data_centers.json }}'
  register: alertmanager_services

# Concatenates the per-data-center JSON lists into one list and keeps only
# the 'ServiceAddress' of every entry.
- name: Extract IP addresses
  set_fact:
    cluster_peers: |
      {{ alertmanager_services.results
      | sum(attribute="json", start=[])
      | map(attribute="ServiceAddress")
      | list }}
|
|
@ -0,0 +1,6 @@
|
||||||
|
---
# Task files are included in this fixed order.

# Service registration comes first; its own comment says discovery needs it.
- include_tasks: 'consul.yml'
# Sets the 'cluster_peers' fact.
- include_tasks: 'discover.yml'
# Registers 'alertmanager_cont_config'.
- include_tasks: 'config.yml'
# Reads both 'cluster_peers' and 'alertmanager_cont_config'.
- include_tasks: 'container.yml'
# Installs the host-side amtool wrapper script.
- include_tasks: 'wrapper.yml'
|
|
@ -0,0 +1,8 @@
|
||||||
|
---
# Installs a host-side 'amtool' command that forwards all of its arguments to
# the amtool binary inside the AlertManager container.
- name: Create amtool wrapper
  copy:
    # "$@" is quoted so arguments containing spaces (e.g. silence comments)
    # reach amtool as single arguments.
    content: |
      #!/usr/bin/env bash
      docker exec "{{ alertmanager_cont_name }}" amtool "$@"
    dest: '/usr/local/bin/amtool'
    mode: '0755'
|
|
@ -0,0 +1,38 @@
|
||||||
|
global:
  # Outgoing mail server and credentials for e-mail notifications.
  smtp_from: '{{ alertmanager_smtp_from }}'
  smtp_smarthost: '{{ alertmanager_smtp_host }}:{{ alertmanager_smtp_port }}'
  smtp_auth_username: '{{ alertmanager_smtp_user }}'
  smtp_auth_password: '{{ alertmanager_smtp_pass }}'
  smtp_require_tls: true
  # VictorOps API credentials and endpoint.
  victorops_api_key: '{{ alertmanager_victorops_api_key }}'
  victorops_api_url: '{{ alertmanager_victorops_service_url }}'

route:
  # Default destination for all alerts
  receiver: admin-email
  # Labels used to group alerts together
  group_by:
    - alertname
    - cluster
  # Delay before the first notification of a new group.
  group_wait: 30s
  # Delay before sending another batch for an existing group.
  group_interval: 3m
  # Delay before a notification is sent again.
  repeat_interval: 1h

  routes:
    # Hand alerts to the VictorOps receiver.
    # NOTE(review): this child route has no matchers, so it appears to match
    # every alert; AlertManager only falls back to the parent receiver when no
    # child route matches, which would leave 'admin-email' unused. Confirm
    # whether both receivers are meant to be notified.
    - receiver: victorops-alerts

receivers:
  - name: admin-email
    email_configs:
      - to: '{{ alertmanager_admin_email }}'
        send_resolved: true

  - name: victorops-alerts
    victorops_configs:
      - routing_key: '{{ alertmanager_victorops_routing_key }}'
        state_message: '{{ alertmanager_victorops_state_message }}'
        message_type: '{{ alertmanager_victorops_message_type }}'
|
|
@ -0,0 +1,14 @@
|
||||||
|
# Define the path that `amtool` can find your `alertmanager` instance at
alertmanager.url: "http://localhost:{{ alertmanager_webui_port }}"

# Override the default author. (unset defaults to your username)
author: jakub@status.im

# Force amtool to give you an error if you don't include a comment on a silence
comment_required: true

# Set a default output format. (unset defaults to simple)
output: extended

# No default receiver is set. The previous value 'team-X-pager' was a
# placeholder that names no receiver in this role's AlertManager config, which
# only defines 'admin-email' and 'victorops-alerts'. Uncomment to set one.
# receiver: victorops-alerts
|
Loading…
Reference in New Issue