add role files from infra-hq
Signed-off-by: Jakub Sokołowski <jakub@status.im>
This commit is contained in:
parent
0c7c8165c3
commit
dbb90251c2
|
@ -0,0 +1,50 @@
|
|||
# Description
|
||||
|
||||
This role configures [AlertManager](https://prometheus.io/docs/alerting/alertmanager/) to notify people of threshold breaches in rules configured in Prometheus __master__ instance.
|
||||
|
||||
# Service
|
||||
|
||||
AlertManager runs in a cluster to achieve high availability. The peers connect via [Tinc VPN](https://github.com/status-im/infra-role-bootstrap/tree/master/tasks/tinc).
|
||||
The service listens on `:9093` and the Prometheus instance connects to that port via the VPN to inform it of threshold breaches.
|
||||
|
||||
The service UI is available at: https://alerts.status.im/
|
||||
|
||||
# CLI Tool
|
||||
|
||||
You can manage existing alerts by using the `amtool` on any of the hosts running this:
|
||||
```
|
||||
> amtool alert
|
||||
Alertname Starts At Summary
|
||||
Test_Alert 2018-07-06 18:30:18 UTC This is a testing alert!
|
||||
> amtool silence
|
||||
ID Matchers Starts At Ends At Updated At Created By Comment
|
||||
9635b573-5177-4601-a3b0-ac6a25d0a4ef alertname=InstanceDown 2018-07-06 12:37:04 UTC 2018-07-06 14:36:05 UTC 2018-07-06 12:37:04 UTC jakubgs test
|
||||
```
|
||||
|
||||
# Configuration
|
||||
|
||||
The main configuration resides in [`templates/alertmanager.yml.j2`](templates/alertmanager.yml.j2). It configures all the receivers of alerts generated by Prometheus __master__ instance.
|
||||
|
||||
There are three main sections:
|
||||
|
||||
* `global` - Configures general auth related options for SMTP and VictorOps receivers.
|
||||
* `receivers` - Defines destinations of alerts which can be used in the `route` section.
|
||||
* `route` - Defines rules based on which alerts are directed to defined receivers.
|
||||
|
||||
For more details see: https://prometheus.io/docs/alerting/configuration/
|
||||
|
||||
# Ansible Variables
|
||||
|
||||
The bare minimum should be:
|
||||
```yml
|
||||
alertmanager_domain: 'alerts.example.org'
|
||||
alertmanager_admin_email: 'admin@example.org'
|
||||
alertmanager_smtp_host: 'smtp.mail.example.org'
|
||||
alertmanager_smtp_from: 'alerts@example.org'
|
||||
alertmanager_smtp_user: 'secret-smtp-user'
|
||||
alertmanager_smtp_pass: 'secret-smtp-pass'
|
||||
|
||||
alertmanager_victorops_api_key: 'secret-victorops-api-key'
|
||||
alertmanager_victorops_routing_key: 'alert-manager'
|
||||
```
|
||||
Take note you will have to create an `alert-manager` routing rule in VictorOps.
|
|
@ -0,0 +1,36 @@
|
|||
---
# Default variables for the AlertManager role.
# Variables set to null have no usable default and are expected to be
# provided by the inventory or playbook that applies this role.

# Container image and runtime settings
alertmanager_cont_tag: 'v0.21.0'
alertmanager_cont_image: 'quay.io/prometheus/alertmanager:{{ alertmanager_cont_tag }}'
alertmanager_cont_name: 'alertmanager'
alertmanager_cont_vol: '/docker/{{ alertmanager_cont_name }}'
alertmanager_cont_log_lvl: info

# port used for the web ui
alertmanager_webui_port: 9093
# port used for clustering AlertManager peers
alertmanager_cluster_port: 9094

# SMTP settings used for email notifications
alertmanager_smtp_host: null
# The port is joined with the host as '<host>:<port>' for smtp_smarthost,
# so it must not be empty. 587 is the mail submission port, which matches
# the 'smtp_require_tls: true' setting in the config template.
alertmanager_smtp_port: 587
alertmanager_smtp_from: null
alertmanager_smtp_user: null
alertmanager_smtp_pass: null

# Public address of the service; alertmanager_url fails templating
# (via the 'mandatory' filter) when alertmanager_domain is left undefined.
alertmanager_domain: null
alertmanager_url: 'https://{{ alertmanager_domain | mandatory }}/'
alertmanager_admin_email: null

# VictorOps paging service
alertmanager_victorops_api_key: null
# Same value AlertManager itself uses when victorops_api_url is not set.
alertmanager_victorops_service_url: 'https://alert.victorops.com/integrations/generic/20131114/alert/'
alertmanager_victorops_routing_key: null
alertmanager_victorops_message_type: 'ERROR'
# Wrapped in raw/endraw so Ansible leaves the Go template for AlertManager.
alertmanager_victorops_state_message: '{% raw %}Alert: {{ .CommonLabels.alertname }}. Summary:{{ .CommonAnnotations.summary }}. RawData: {{ .CommonLabels }}{% endraw %}'

# For discovery of prometheus master nodes
consul_catalog_url: 'http://localhost:8500/v1/catalog'

# Generic container options
cont_state: started
cont_recreate: false
cont_restart: false
|
|
@ -0,0 +1,3 @@
|
|||
---
# Handler: writes the output of iptables-save to /etc/iptables/rules.v4.
# A shell (not command) is required because of the output redirection.
- name: Save iptables rules
  shell:
    cmd: 'iptables-save > /etc/iptables/rules.v4'
|
|
@ -0,0 +1,22 @@
|
|||
---
# Creates the host directories under the container volume path.
- name: Create container directories
  file:
    state: 'directory'
    path: '{{ alertmanager_cont_vol }}/{{ item }}'
    owner: 'dockremap'
    group: 'docker'
    recurse: true
  loop:
    - data
    - conf

# Renders each '<name>.j2' template into the conf directory. The result is
# registered so the container task can restart when a config file changed.
- name: Create config files
  template:
    src: '{{ item }}.j2'
    dest: '{{ alertmanager_cont_vol }}/conf/{{ item }}'
    owner: 'dockremap'
    group: 'docker'
  loop:
    - 'alertmanager.yml'
    - 'amtool.yml'
  register: alertmanager_cont_config
|
|
@ -0,0 +1,19 @@
|
|||
---
# Registers AlertManager in Consul through the consul-service role. The
# service is advertised on the Tinc VPN address with an HTTP health check
# against the local web UI port.
- name: Create Consul service definition
  include_role:
    name: consul-service
  vars:
    consul_config_name: '{{ alertmanager_cont_name }}'
    consul_services:
      - name: '{{ alertmanager_cont_name }}'
        tags:
          - 'metrics'
          - 'alertmanager'
        port: '{{ alertmanager_webui_port }}'
        address: '{{ ansible_local.tinc.vpn_ip }}'
        checks:
          - id: alertmanager-status
            name: Alert Manager status
            type: http
            http: 'http://localhost:{{ alertmanager_webui_port }}/api/v1/receivers'

# We need to do this for discover step to work
- name: Reload Consul right away
  command:
    cmd: consul reload
|
|
@ -0,0 +1,29 @@
|
|||
---
# Runs the AlertManager container with the rendered config files mounted
# read-only and one --cluster.peer flag per address in cluster_peers.
- name: Start container
  docker_container:
    name: '{{ alertmanager_cont_name }}'
    image: '{{ alertmanager_cont_image }}'
    user: root
    pull: true
    restart_policy: always
    state: '{{ cont_state }}'
    recreate: '{{ cont_recreate }}'
    # Restart when the config task reported a change; fall back to
    # cont_restart when that result is not defined.
    restart: '{{ alertmanager_cont_config.changed | default(cont_restart) }}'
    ports:
      # AlertManager clustering needs both TCP and UDP on the cluster port;
      # a port mapping without a protocol suffix publishes TCP only.
      - '0.0.0.0:{{ alertmanager_cluster_port }}:{{ alertmanager_cluster_port }}'
      - '0.0.0.0:{{ alertmanager_cluster_port }}:{{ alertmanager_cluster_port }}/udp'
      - '0.0.0.0:{{ alertmanager_webui_port }}:{{ alertmanager_webui_port }}'
    volumes:
      - '{{ alertmanager_cont_vol }}/conf/alertmanager.yml:/etc/alertmanager.yml:ro'
      - '{{ alertmanager_cont_vol }}/conf/amtool.yml:/etc/amtool/config.yml:ro'
      - '{{ alertmanager_cont_vol }}/data:/data'
      - '/certs:/certs'
    command: |
      --storage.path=/data
      --log.level={{ alertmanager_cont_log_lvl }}
      --config.file=/etc/alertmanager.yml
      --web.external-url={{ alertmanager_url }}
      --cluster.advertise-address="{{ ansible_local.tinc.vpn_ip }}:{{ alertmanager_cluster_port }}"
      --cluster.listen-address="0.0.0.0:{{ alertmanager_cluster_port }}"
      {% for peer in cluster_peers %}
      --cluster.peer={{ peer }}:{{ alertmanager_cluster_port }}
      {% endfor %}
|
|
@ -0,0 +1,21 @@
|
|||
---
# Asks the Consul catalog for the list of data centers.
- name: Get data centers
  uri:
    url: '{{ consul_catalog_url }}/datacenters'
  register: data_centers

# Queries the catalog once per data center for the AlertManager service.
- name: Get AlertManager node addresses
  uri:
    url: '{{ consul_catalog_url }}/service/{{ alertmanager_cont_name }}?dc={{ item }}'
    method: GET
    validate_certs: false
  loop: '{{ data_centers.json }}'
  register: alertmanager_services

# Concatenates the JSON lists from every per-data-center result and keeps
# only the ServiceAddress of each entry.
- name: Extract IP addresses
  set_fact:
    cluster_peers: |
      {{ alertmanager_services.results
         | sum(attribute="json", start=[])
         | map(attribute="ServiceAddress")
         | list }}
|
|
@ -0,0 +1,6 @@
|
|||
---
# Role entry point. The Consul registration runs before discovery because
# consul.yml reloads Consul specifically so the discover step works.
- name: Register the service in Consul
  include_tasks: consul.yml

- name: Discover cluster peers
  include_tasks: discover.yml

- name: Render configuration files
  include_tasks: config.yml

- name: Run the container
  include_tasks: container.yml

- name: Install the amtool wrapper
  include_tasks: wrapper.yml
|
|
@ -0,0 +1,8 @@
|
|||
---
# Installs /usr/local/bin/amtool, a small script that runs amtool inside
# the AlertManager container.
- name: Create amtool wrapper
  copy:
    # The shebang needs the '#!' prefix to select an interpreter, and "$@"
    # is quoted so arguments containing spaces are passed through intact.
    content: |
      #!/usr/bin/env bash
      docker exec {{ alertmanager_cont_name }} amtool "$@"
    dest: '/usr/local/bin/amtool'
    mode: '0755'
|
|
@ -0,0 +1,38 @@
|
|||
global:
  # SMTP connection and authentication used by email receivers.
  smtp_from: '{{ alertmanager_smtp_from }}'
  smtp_smarthost: '{{ alertmanager_smtp_host }}:{{ alertmanager_smtp_port }}'
  smtp_auth_username: '{{ alertmanager_smtp_user }}'
  smtp_auth_password: '{{ alertmanager_smtp_pass }}'
  smtp_require_tls: true
  # VictorOps API access used by VictorOps receivers.
  victorops_api_key: '{{ alertmanager_victorops_api_key }}'
  victorops_api_url: '{{ alertmanager_victorops_service_url }}'

route:
  # Default destination for all alerts
  receiver: 'admin-email'
  # How to group together alerts
  group_by:
    - 'alertname'
    - 'cluster'
  # Wait this much before initial notification to group them.
  group_wait: 30s
  # Wait before sending another batch for a group.
  group_interval: 3m
  # Wait this much to resend notifications.
  repeat_interval: 1h

  routes:
    # Send all notifications to VictorOps.
    # NOTE(review): this child route has no matchers, so it looks like it
    # catches every alert and the top-level 'admin-email' receiver is never
    # notified. If email delivery is also intended, confirm and add a route
    # for it with 'continue: true'.
    - receiver: 'victorops-alerts'

receivers:
  - name: 'admin-email'
    email_configs:
      - to: '{{ alertmanager_admin_email }}'
        send_resolved: true

  - name: 'victorops-alerts'
    victorops_configs:
      - routing_key: '{{ alertmanager_victorops_routing_key }}'
        state_message: '{{ alertmanager_victorops_state_message }}'
        message_type: '{{ alertmanager_victorops_message_type }}'
|
|
@ -0,0 +1,14 @@
|
|||
# Define the path that `amtool` can find your `alertmanager` instance at
alertmanager.url: 'http://localhost:{{ alertmanager_webui_port }}'

# Override the default author. (unset defaults to your username)
author: 'jakub@status.im'

# Force amtool to give you an error if you don't include a comment on a silence
comment_required: true

# Set a default output format. (unset defaults to simple)
output: 'extended'

# Set a default receiver
# NOTE(review): 'team-X-pager' does not match any receiver defined in this
# role's alertmanager.yml.j2 ('admin-email', 'victorops-alerts') - confirm
# whether this placeholder is intended.
receiver: 'team-X-pager'
|
Loading…
Reference in New Issue