From dbb90251c25a82cc6233c5340f3077bf945399c9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Soko=C5=82owski?= Date: Mon, 7 Dec 2020 12:20:33 +0100 Subject: [PATCH] add role files from infra-hq MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Jakub Sokołowski --- README.md | 50 +++++++++++++++++++++++++++++++++++ defaults/main.yml | 36 +++++++++++++++++++++++++ handlers/main.yml | 3 +++ tasks/config.yml | 22 +++++++++++++++ tasks/consul.yml | 19 +++++++++++++ tasks/container.yml | 29 ++++++++++++++++++++ tasks/discover.yml | 21 +++++++++++++++ tasks/main.yml | 6 +++++ tasks/wrapper.yml | 8 ++++++++ templates/alertmanager.yml.j2 | 38 ++++++++++++++++++++++++++ templates/amtool.yml.j2 | 14 ++++++++++ 11 files changed, 246 insertions(+) create mode 100644 README.md create mode 100644 defaults/main.yml create mode 100644 handlers/main.yml create mode 100644 tasks/config.yml create mode 100644 tasks/consul.yml create mode 100644 tasks/container.yml create mode 100644 tasks/discover.yml create mode 100644 tasks/main.yml create mode 100644 tasks/wrapper.yml create mode 100644 templates/alertmanager.yml.j2 create mode 100644 templates/amtool.yml.j2 diff --git a/README.md b/README.md new file mode 100644 index 0000000..9001ed3 --- /dev/null +++ b/README.md @@ -0,0 +1,50 @@ +# Description + +This role configures [AlertManager](https://prometheus.io/docs/alerting/alertmanager/) to notify people of threshold breaches in rules configured in the Prometheus __master__ instance. + +# Service + +AlertManager runs in a cluster to achieve high availability. The peers connect via [Tinc VPN](https://github.com/status-im/infra-role-bootstrap/tree/master/tasks/tinc). +The service listens on `:9093` and the Prometheus instance connects to that port via the VPN to inform it of threshold breaches.
+ +The service UI is available at: https://alerts.status.im/ + +# CLI Tool + +You can manage existing alerts by using `amtool` on any of the hosts running this role: +``` + > amtool alert +Alertname Starts At Summary +Test_Alert 2018-07-06 18:30:18 UTC This is a testing alert! + > amtool silence +ID Matchers Starts At Ends At Updated At Created By Comment +9635b573-5177-4601-a3b0-ac6a25d0a4ef alertname=InstanceDown 2018-07-06 12:37:04 UTC 2018-07-06 14:36:05 UTC 2018-07-06 12:37:04 UTC jakubgs test +``` + +# Configuration + +The main configuration resides in [`templates/alertmanager.yml.j2`](templates/alertmanager.yml.j2). It configures all the receivers of alerts generated by the Prometheus __master__ instance. + +There are three main sections: + +* `global` - Configures general auth-related options for the SMTP and VictorOps receivers. +* `receivers` - Defines destinations of alerts which can be used in the `route` section. +* `route` - Defines rules based on which alerts are directed to defined receivers. + +For more details see: https://prometheus.io/docs/alerting/configuration/ + +# Ansible Variables + +The bare minimum should be: +```yml +alertmanager_domain: 'alerts.example.org' +alertmanager_admin_email: 'admin@example.org' +alertmanager_smtp_host: 'smtp.mail.example.org' +alertmanager_smtp_from: 'alerts@example.org' +alertmanager_smtp_user: 'secret-smtp-user' +alertmanager_smtp_pass: 'secret-smtp-pass' + +alertmanager_victorops_api_key: 'secret-victorops-api-key' +alertmanager_victorops_routing_key: 'alert-manager' +``` +Note that you will have to create an `alert-manager` routing rule in VictorOps.
diff --git a/defaults/main.yml b/defaults/main.yml
new file mode 100644
index 0000000..43d348d
--- /dev/null
+++ b/defaults/main.yml
@@ -0,0 +1,36 @@
+---
+alertmanager_cont_tag: 'v0.21.0'
+alertmanager_cont_image: 'quay.io/prometheus/alertmanager:{{ alertmanager_cont_tag }}'
+alertmanager_cont_name: 'alertmanager'
+alertmanager_cont_vol: '/docker/{{ alertmanager_cont_name }}'
+alertmanager_cont_log_lvl: info
+
+# port used for the web UI and API
+alertmanager_webui_port: 9093
+# port used for clustering AlertManager peers
+alertmanager_cluster_port: 9094
+
+alertmanager_smtp_host: ~  # SMTP settings are rendered into the `global` section of alertmanager.yml
+alertmanager_smtp_port: 587  # joined as 'host:port' into smtp_smarthost, so it must never render empty
+alertmanager_smtp_from: ~
+alertmanager_smtp_user: ~
+alertmanager_smtp_pass: ~
+
+alertmanager_domain: ~  # required: alertmanager_url below applies `mandatory` to it
+alertmanager_url: 'https://{{ alertmanager_domain | mandatory }}/'
+alertmanager_admin_email: ~
+
+# VictorOps paging service ({% raw %} keeps the Go template below from being evaluated by Jinja)
+alertmanager_victorops_api_key: ~
+alertmanager_victorops_service_url: ~
+alertmanager_victorops_routing_key: ~
+alertmanager_victorops_message_type: 'ERROR'
+alertmanager_victorops_state_message: '{% raw %}Alert: {{ .CommonLabels.alertname }}. Summary:{{ .CommonAnnotations.summary }}. RawData: {{ .CommonLabels }}{% endraw %}'
+
+# Consul catalog API used by tasks/discover.yml to find AlertManager cluster peers
+consul_catalog_url: 'http://localhost:8500/v1/catalog'
+
+# Generic container options
+cont_state: started
+cont_recreate: false
+cont_restart: false
diff --git a/handlers/main.yml b/handlers/main.yml
new file mode 100644
index 0000000..bd6da46
--- /dev/null
+++ b/handlers/main.yml
@@ -0,0 +1,3 @@
+---
+- name: Save iptables rules
+  shell: iptables-save > /etc/iptables/rules.v4
diff --git a/tasks/config.yml b/tasks/config.yml
new file mode 100644
index 0000000..4f51ee6
--- /dev/null
+++ b/tasks/config.yml
@@ -0,0 +1,22 @@
+---
+- name: Create container directories
+  file:
+    path: '{{ alertmanager_cont_vol }}/{{ item }}'
+    state: 'directory'
+    owner: 'dockremap'
+    group: 'docker'
+    recurse: true
+  with_items:
+    - data
+    - conf
+
+- name: Create config files
+  template:
+    src: '{{ item }}.j2'
+    dest: '{{ alertmanager_cont_vol }}/conf/{{ item }}'
+    owner: 'dockremap'
+    group: 'docker'
+  register: alertmanager_cont_config  # read by tasks/container.yml to decide on a restart
+  with_items:
+    - 'alertmanager.yml'
+    - 'amtool.yml'
diff --git a/tasks/consul.yml b/tasks/consul.yml
new file mode 100644
index 0000000..fc37c08
--- /dev/null
+++ b/tasks/consul.yml
@@ -0,0 +1,19 @@
+---
+- name: Create Consul service definition
+  include_role: name=consul-service
+  vars:
+    consul_config_name: '{{ alertmanager_cont_name }}'
+    consul_services:
+      - name: '{{ alertmanager_cont_name }}'
+        tags: ['metrics', 'alertmanager']
+        port: '{{ alertmanager_webui_port }}'
+        address: '{{ ansible_local.tinc.vpn_ip }}'
+        checks:
+          - id: alertmanager-status
+            name: Alert Manager status
+            type: http
+            http: 'http://localhost:{{ alertmanager_webui_port }}/api/v1/receivers'
+
+# Reload now so this host's service is already in the catalog when tasks/discover.yml queries it
+- name: Reload Consul right away
+  command: consul reload
diff --git a/tasks/container.yml b/tasks/container.yml
new file mode 100644
index 0000000..7db1d62
--- /dev/null
+++ b/tasks/container.yml
@@ -0,0 +1,29 @@
+---
+- name: Start container
+  docker_container:
+    name: '{{ alertmanager_cont_name }}'
+    image: '{{ alertmanager_cont_image }}'
+    user: root
+    pull: true
+    restart_policy: always
+    state: '{{ cont_state }}'
+    recreate: '{{ cont_recreate }}'
+    restart: '{{ alertmanager_cont_config.changed | default(cont_restart) }}'
+    ports:
+      - '0.0.0.0:{{ alertmanager_cluster_port }}:{{ alertmanager_cluster_port }}'
+      - '0.0.0.0:{{ alertmanager_webui_port }}:{{ alertmanager_webui_port }}'
+    volumes:
+      - '{{ alertmanager_cont_vol }}/conf/alertmanager.yml:/etc/alertmanager.yml:ro'
+      - '{{ alertmanager_cont_vol }}/conf/amtool.yml:/etc/amtool/config.yml:ro'
+      - '{{ alertmanager_cont_vol }}/data:/data'
+      - '/certs:/certs'
+    command: |
+      --storage.path=/data
+      --log.level={{ alertmanager_cont_log_lvl }}
+      --config.file=/etc/alertmanager.yml
+      --web.external-url={{ alertmanager_url }}
+      --cluster.advertise-address="{{ ansible_local.tinc.vpn_ip }}:{{ alertmanager_cluster_port }}"
+      --cluster.listen-address="0.0.0.0:{{ alertmanager_cluster_port }}"
+      {% for peer in cluster_peers %}
+      --cluster.peer={{ peer }}:{{ alertmanager_cluster_port }}
+      {% endfor %}
diff --git a/tasks/discover.yml b/tasks/discover.yml
new file mode 100644
index 0000000..28e7492
--- /dev/null
+++ b/tasks/discover.yml
@@ -0,0 +1,21 @@
+---
+- name: Get data centers
+  uri:
+    url: '{{ consul_catalog_url }}/datacenters'
+  register: data_centers
+
+- name: Get AlertManager node addresses
+  uri:
+    url: '{{ consul_catalog_url }}/service/{{ alertmanager_cont_name }}?dc={{ item }}'
+    method: GET
+    validate_certs: false
+  with_items: '{{ data_centers.json }}'
+  register: alertmanager_services
+
+- name: Extract IP addresses
+  set_fact:
+    cluster_peers: |
+      {{ alertmanager_services.results
+      | sum(attribute="json", start=[])
+      | map(attribute="ServiceAddress")
+      | list }}
diff --git a/tasks/main.yml b/tasks/main.yml
new file mode 100644
index 0000000..d23e183
--- /dev/null
+++ b/tasks/main.yml
@@ -0,0 +1,6 @@
+---
+- include_tasks: consul.yml
+- include_tasks: discover.yml
+- include_tasks: config.yml
+- include_tasks: container.yml
+- include_tasks: wrapper.yml
diff --git a/tasks/wrapper.yml b/tasks/wrapper.yml
new file mode 100644
index 0000000..405e033
--- /dev/null
+++ b/tasks/wrapper.yml
@@ -0,0 +1,8 @@
+---
+- name: Create amtool wrapper
+  copy:
+    content: |
+      #!/usr/bin/env bash
+      docker exec {{ alertmanager_cont_name }} amtool "$@"
+    dest: '/usr/local/bin/amtool'
+    mode: '0755'
diff --git a/templates/alertmanager.yml.j2 b/templates/alertmanager.yml.j2
new file mode 100644
index 0000000..8af83fa
--- /dev/null
+++ b/templates/alertmanager.yml.j2
@@ -0,0 +1,38 @@
+global:
+  # SMTP authentication information.
+  smtp_from: '{{ alertmanager_smtp_from }}'
+  smtp_smarthost: '{{ alertmanager_smtp_host }}:{{ alertmanager_smtp_port }}'
+  smtp_auth_username: '{{ alertmanager_smtp_user }}'
+  smtp_auth_password: '{{ alertmanager_smtp_pass }}'
+  smtp_require_tls: true
+  # VictorOps configuration
+  victorops_api_key: '{{ alertmanager_victorops_api_key }}'
+  victorops_api_url: '{{ alertmanager_victorops_service_url }}'
+
+route:
+  # Default destination for all alerts
+  receiver: 'admin-email'
+  # How to group together alerts
+  group_by: ['alertname', 'cluster']
+  # Wait this much before initial notification to group them.
+  group_wait: 30s
+  # Wait before sending another batch for a group.
+  group_interval: 3m
+  # Wait this much to resend notifications.
+  repeat_interval: 1h
+
+  routes:
+    # Send all alerts to VictorOps. NOTE(review): this catch-all child pre-empts 'admin-email' - confirm.
+    - receiver: 'victorops-alerts'
+
+receivers:
+  - name: 'admin-email'
+    email_configs:
+      - to: '{{ alertmanager_admin_email }}'
+        send_resolved: true
+
+  - name: 'victorops-alerts'
+    victorops_configs:
+      - routing_key: '{{ alertmanager_victorops_routing_key }}'
+        state_message: '{{ alertmanager_victorops_state_message }}'
+        message_type: '{{ alertmanager_victorops_message_type }}'
diff --git a/templates/amtool.yml.j2 b/templates/amtool.yml.j2
new file mode 100644
index 0000000..2acdd0e
--- /dev/null
+++ b/templates/amtool.yml.j2
@@ -0,0 +1,14 @@
+# Define the URL at which `amtool` can find your `alertmanager` instance
+alertmanager.url: "http://localhost:{{ alertmanager_webui_port }}"
+
+# Override the default author. (unset defaults to your username)
+author: jakub@status.im
+
+# Force amtool to give you an error if you don't include a comment on a silence
+comment_required: true
+
+# Set a default output format. (unset defaults to simple)
+output: extended
+
+# Set a default receiver. NOTE(review): 'team-X-pager' is not defined in alertmanager.yml.j2 - confirm.
+receiver: team-X-pager