add role files from infra-hq

Signed-off-by: Jakub Sokołowski <jakub@status.im>
This commit is contained in:
Jakub Sokołowski 2020-12-07 12:20:33 +01:00
parent 0c7c8165c3
commit dbb90251c2
No known key found for this signature in database
GPG Key ID: 4EF064D0E6D63020
11 changed files with 246 additions and 0 deletions

50
README.md Normal file
View File

@ -0,0 +1,50 @@
# Description
This role configures [AlertManager](https://prometheus.io/docs/alerting/alertmanager/) to notify people of threshold breaches in rules configured in Prometheus __master__ instance.
# Service
AlertManager runs in a cluster to achieve high availability. The peers connect via [Tinc VPN](https://github.com/status-im/infra-role-bootstrap/tree/master/tasks/tinc).
The service listens on `:9093` and the Prometheus instance connects to that port via the VPN to inform it of threshold breaches.
The service UI is available at: https://alerts.status.im/
# CLI Tool
You can manage existing alerts by using the `amtool` on any of the hosts running this:
```
> amtool alert
Alertname Starts At Summary
Test_Alert 2018-07-06 18:30:18 UTC This is a testing alert!
> amtool silence
ID Matchers Starts At Ends At Updated At Created By Comment
9635b573-5177-4601-a3b0-ac6a25d0a4ef alertname=InstanceDown 2018-07-06 12:37:04 UTC 2018-07-06 14:36:05 UTC 2018-07-06 12:37:04 UTC jakubgs test
```
# Configuration
The main configuration resides in [`templates/alertmanager.yml.j2`](templates/alertmanager.yml.j2). It configures all the receivers of alerts generated by Prometheus __master__ instance.
There are three main sections:
* `global` - Configures general auth-related options for the SMTP and VictorOps receivers.
* `receivers` - Defines destinations of alerts which can be used in the `route` section.
* `route` - Defines rules based on which alerts are directed to defined receivers.
For more details see: https://prometheus.io/docs/alerting/configuration/
# Ansible Variables
The bare minimum should be:
```yml
alertmanager_domain: 'alerts.example.org'
alertmanager_admin_email: 'admin@example.org'
alertmanager_smtp_host: 'smtp.mail.example.org'
alertmanager_smtp_from: 'alerts@example.org'
alertmanager_smtp_user: 'secret-smtp-user'
alertmanager_smtp_pass: 'secret-smtp-pass'
alertmanager_victorops_api_key: 'secret-victorops-api-key'
alertmanager_victorops_routing_key: 'alert-manager'
```
Take note you will have to create an `alert-manager` routing rule in VictorOps.

36
defaults/main.yml Normal file
View File

@ -0,0 +1,36 @@
---
# Default variables for the AlertManager role.
# Variables left as `~` (null) have no sensible default and are expected to be
# provided by the inventory.

# Container image, name and host directory used for its volumes
alertmanager_cont_tag: 'v0.21.0'
alertmanager_cont_image: 'quay.io/prometheus/alertmanager:{{ alertmanager_cont_tag }}'
alertmanager_cont_name: 'alertmanager'
alertmanager_cont_vol: '/docker/{{ alertmanager_cont_name }}'
alertmanager_cont_log_lvl: info

# port used for the web ui
alertmanager_webui_port: 9093
# port used for clustering AlertManager peers
alertmanager_cluster_port: 9094

# SMTP settings used for e-mail notifications
alertmanager_smtp_host: ~
# The config template renders the smarthost as '<host>:<port>', so a null port
# yields an unusable address. Default to the mail submission port, which fits
# the `smtp_require_tls: true` setting in the template.
alertmanager_smtp_port: 587
alertmanager_smtp_from: ~
alertmanager_smtp_user: ~
alertmanager_smtp_pass: ~

# Public domain and URL of the web UI
alertmanager_domain: ~
alertmanager_url: 'https://{{ alertmanager_domain | mandatory }}/'
alertmanager_admin_email: ~

# VictorOps paging service
alertmanager_victorops_api_key: ~
# Same endpoint AlertManager itself uses when `victorops_api_url` is not set;
# a null value here would be rendered as an empty URL in the config template.
alertmanager_victorops_service_url: 'https://alert.victorops.com/integrations/generic/20131114/alert/'
alertmanager_victorops_routing_key: ~
alertmanager_victorops_message_type: 'ERROR'
# Wrapped in raw so the Go template reaches AlertManager untouched by Jinja
alertmanager_victorops_state_message: '{% raw %}Alert: {{ .CommonLabels.alertname }}. Summary:{{ .CommonAnnotations.summary }}. RawData: {{ .CommonLabels }}{% endraw %}'

# For discovery of prometheus master nodes
consul_catalog_url: 'http://localhost:8500/v1/catalog'

# Generic container options
cont_state: started
cont_recreate: false
cont_restart: false

3
handlers/main.yml Normal file
View File

@ -0,0 +1,3 @@
---
# Dumps the current IPv4 firewall rules into the rules file.
# A shell is required because of the output redirection.
- name: Save iptables rules
  shell: 'iptables-save > /etc/iptables/rules.v4'

22
tasks/config.yml Normal file
View File

@ -0,0 +1,22 @@
---
# Prepares the host directories and configuration files that are later
# mounted into the AlertManager container.

- name: Create container directories
  file:
    path: '{{ alertmanager_cont_vol }}/{{ item }}'
    state: 'directory'
    owner: 'dockremap'
    group: 'docker'
    recurse: true
  with_items:
    - data
    - conf

# alertmanager.yml is rendered with the SMTP password and the VictorOps API
# key, so the files get an explicit mode instead of relying on the umask and
# are not readable by other users on the host.
# The registered result is consumed by the container task to decide whether
# a restart is needed.
- name: Create config files
  template:
    src: '{{ item }}.j2'
    dest: '{{ alertmanager_cont_vol }}/conf/{{ item }}'
    owner: 'dockremap'
    group: 'docker'
    mode: '0640'
  register: alertmanager_cont_config
  with_items:
    - 'alertmanager.yml'
    - 'amtool.yml'

19
tasks/consul.yml Normal file
View File

@ -0,0 +1,19 @@
---
# Registers AlertManager as a Consul service, advertised on the host's
# Tinc VPN address, with an HTTP check against the local web UI port.
- name: Create Consul service definition
  include_role:
    name: consul-service
  vars:
    consul_config_name: '{{ alertmanager_cont_name }}'
    consul_services:
      - name: '{{ alertmanager_cont_name }}'
        tags:
          - 'metrics'
          - 'alertmanager'
        port: '{{ alertmanager_webui_port }}'
        address: '{{ ansible_local.tinc.vpn_ip }}'
        checks:
          - id: alertmanager-status
            name: Alert Manager status
            type: http
            http: 'http://localhost:{{ alertmanager_webui_port }}/api/v1/receivers'

# We need to do this for discover step to work
- name: Reload Consul right away
  command: consul reload

29
tasks/container.yml Normal file
View File

@ -0,0 +1,29 @@
---
# Runs the AlertManager container with its config, data and certificate
# volumes, and joins it to the cluster made of the discovered peers.
- name: Start container
  docker_container:
    name: '{{ alertmanager_cont_name }}'
    image: '{{ alertmanager_cont_image }}'
    user: root
    pull: true
    restart_policy: always
    state: '{{ cont_state }}'
    recreate: '{{ cont_recreate }}'
    # Restart whenever the "Create config files" task changed a file
    restart: '{{ alertmanager_cont_config.changed | default(cont_restart) }}'
    ports:
      # AlertManager clustering uses both TCP and UDP on the cluster port;
      # a mapping without a protocol suffix only publishes TCP.
      - '0.0.0.0:{{ alertmanager_cluster_port }}:{{ alertmanager_cluster_port }}/tcp'
      - '0.0.0.0:{{ alertmanager_cluster_port }}:{{ alertmanager_cluster_port }}/udp'
      - '0.0.0.0:{{ alertmanager_webui_port }}:{{ alertmanager_webui_port }}'
    volumes:
      - '{{ alertmanager_cont_vol }}/conf/alertmanager.yml:/etc/alertmanager.yml:ro'
      - '{{ alertmanager_cont_vol }}/conf/amtool.yml:/etc/amtool/config.yml:ro'
      - '{{ alertmanager_cont_vol }}/data:/data'
      - '/certs:/certs'
    # One --cluster.peer flag is generated per address in cluster_peers
    command: |
      --storage.path=/data
      --log.level={{ alertmanager_cont_log_lvl }}
      --config.file=/etc/alertmanager.yml
      --web.external-url={{ alertmanager_url }}
      --cluster.advertise-address="{{ ansible_local.tinc.vpn_ip }}:{{ alertmanager_cluster_port }}"
      --cluster.listen-address="0.0.0.0:{{ alertmanager_cluster_port }}"
      {% for peer in cluster_peers %}
      --cluster.peer={{ peer }}:{{ alertmanager_cluster_port }}
      {% endfor %}

21
tasks/discover.yml Normal file
View File

@ -0,0 +1,21 @@
---
# Builds the `cluster_peers` fact: the service addresses of every
# AlertManager instance registered in the Consul catalog, across all
# data centers.

- name: Get data centers
  uri:
    url: '{{ consul_catalog_url }}/datacenters'
  register: data_centers

# One catalog query per data center; results are collected per item
- name: Get AlertManager node addresses
  uri:
    url: '{{ consul_catalog_url }}/service/{{ alertmanager_cont_name }}?dc={{ item }}'
    method: GET
    validate_certs: false
  register: alertmanager_services
  with_items: '{{ data_centers.json }}'

# Concatenate the per-data-center service lists and keep only the addresses
- name: Extract IP addresses
  set_fact:
    cluster_peers: |
      {{ alertmanager_services.results
       | sum(attribute="json", start=[])
       | map(attribute="ServiceAddress")
       | list }}

6
tasks/main.yml Normal file
View File

@ -0,0 +1,6 @@
---
# Entry point of the role. The order matters:
# - consul.yml registers the service and reloads Consul so that the discover
#   step can find it,
# - discover.yml sets `cluster_peers`, which container.yml uses for the
#   --cluster.peer flags,
# - config.yml registers `alertmanager_cont_config`, which container.yml uses
#   to decide on a restart.
- name: Register service in Consul
  include_tasks: consul.yml

- name: Discover cluster peers
  include_tasks: discover.yml

- name: Render configuration
  include_tasks: config.yml

- name: Run the container
  include_tasks: container.yml

- name: Install amtool wrapper
  include_tasks: wrapper.yml

8
tasks/wrapper.yml Normal file
View File

@ -0,0 +1,8 @@
---
# Installs a host-side `amtool` command that runs amtool inside the
# AlertManager container.
- name: Create amtool wrapper
  copy:
    # - The shebang needs `#!`; a plain `#` line is only a comment.
    # - "$@" is quoted so arguments containing spaces (e.g. silence comments)
    #   are passed through as single arguments.
    # - The container name follows alertmanager_cont_name, the same variable
    #   used when the container is started.
    content: |
      #!/usr/bin/env bash
      docker exec "{{ alertmanager_cont_name }}" amtool "$@"
    dest: '/usr/local/bin/amtool'
    # Quoted so the mode is not subject to YAML integer parsing
    mode: '0755'

View File

@ -0,0 +1,38 @@
global:
  # SMTP authentication information.
  smtp_from: '{{ alertmanager_smtp_from }}'
  smtp_smarthost: '{{ alertmanager_smtp_host }}:{{ alertmanager_smtp_port }}'
  smtp_auth_username: '{{ alertmanager_smtp_user }}'
  smtp_auth_password: '{{ alertmanager_smtp_pass }}'
  smtp_require_tls: true
  # VictorOps configuration
  victorops_api_key: '{{ alertmanager_victorops_api_key }}'
  victorops_api_url: '{{ alertmanager_victorops_service_url }}'

route:
  # Default destination for all alerts
  receiver: 'admin-email'
  # How to group together alerts
  group_by: ['alertname', 'cluster']
  # Wait this much before initial notification to group them.
  group_wait: 30s
  # Wait before sending another batch for a group.
  group_interval: 3m
  # Wait this much to resend notifications.
  repeat_interval: 1h
  routes:
    # A child route without matchers matches every alert, and the parent
    # receiver is only used when no child matches. To notify both
    # destinations, e-mail is listed as a child route that continues on
    # to the next sibling.
    - receiver: 'admin-email'
      continue: true
    # send all notifications to VictorOps too
    - receiver: 'victorops-alerts'

receivers:
  - name: 'admin-email'
    email_configs:
      - to: '{{ alertmanager_admin_email }}'
        send_resolved: true
  - name: 'victorops-alerts'
    victorops_configs:
      - routing_key: '{{ alertmanager_victorops_routing_key }}'
        state_message: '{{ alertmanager_victorops_state_message }}'
        message_type: '{{ alertmanager_victorops_message_type }}'

14
templates/amtool.yml.j2 Normal file
View File

@ -0,0 +1,14 @@
# Configuration for amtool, mounted in the container as /etc/amtool/config.yml.
# Define the path that `amtool` can find your `alertmanager` instance at
alertmanager.url: "http://localhost:{{ alertmanager_webui_port }}"
# Override the default author. (unset defaults to your username)
author: jakub@status.im
# Force amtool to give you an error if you don't include a comment on a silence
comment_required: true
# Set a default output format. (unset defaults to simple)
output: extended
# No default receiver is set on purpose: the previous value `team-X-pager`
# was a placeholder that matches none of the receivers defined by this role
# (`admin-email`, `victorops-alerts`).
# NOTE(review): amtool appears to use this key as the default for its
# --receiver flag, which would hide all alerts in `amtool alert` - confirm.