nimbus-stats: port to use systemd timer, fix consul query

Signed-off-by: Jakub Sokołowski <jakub@status.im>
This commit is contained in:
Jakub Sokołowski 2020-02-19 11:51:09 +01:00
parent 9091772f37
commit fcd17845b8
No known key found for this signature in database
GPG Key ID: 4EF064D0E6D63020
8 changed files with 83 additions and 36 deletions

View File

@ -1,8 +1,12 @@
---
# general container config
cont_restart: false
cont_recreate: 'smart'
cont_recreate: false
cont_state: 'present'
# general compose config
compose_restart: false
compose_recreate: 'smart'
compose_state: 'present'
# CloudFlare Origin certificates
origin_certs:
@ -12,7 +16,7 @@ origin_certs:
default: true
# Consul Catalog Query URL
consul_catalog_url: 'http://localhost:8500/v1/catalog/service'
consul_catalog_url: 'http://localhost:8500/v1/catalog'
# Root password
bootstrap__root_pass: '{{lookup("passwordstore", "hosts/admin-pass")}}'

View File

@ -21,7 +21,7 @@
pre_tasks:
- name: Fetch available elasticsearch nodes
uri:
url: '{{ consul_catalog_url }}/elasticsearch?dc={{ data_center }}&tag={{ es_lb_cluster_name }}'
url: '{{ consul_catalog_url }}/service/elasticsearch?dc={{ data_center }}&tag={{ es_lb_cluster_name }}'
register: es_services
- name: Extract ElasticSearch hostnames
set_fact:

View File

@ -11,8 +11,8 @@
hosts:
- nimbus-master
roles:
#- { role: origin-certs, tags: origin-certs }
#- { role: nimbus-stats, tags: nimbus-stats }
- { role: origin-certs, tags: origin-certs }
- { role: nimbus-stats, tags: nimbus-stats }
- name: Configure Nimbus cluster
hosts:

View File

@ -53,3 +53,7 @@
- name: elasticsearch-lb
src: git@github.com:status-im/infra-role-elasticsearch-lb.git
scm: git
- name: systemd-timer
src: git@github.com:status-im/infra-role-systemd-timer.git
scm: git

View File

@ -7,6 +7,9 @@ This role defined a simple service which lists the current state of the nodes in
The only setting that should be usually changed is the domain:
```yaml
nimbus_stats_domain: my-amazing-domain.example.org
# Consul service name and tag of the ElasticSearch Load Balancer to query
consul_service_name: elasticsearch-lb
consul_service_tag: my-logs-cluster-name
```
# Script
@ -34,11 +37,13 @@ Options:
-p PROGRAM, --program=PROGRAM
Program to query for. (*beacon-node-*)
-s SINCE, --since=SINCE
Period for which to query logs. (now-30m)
Period for which to query logs. (now-15m)
-S PAGE_SIZE, --page-size=PAGE_SIZE
Size of results page. (10000)
-f FLEET, --fleet=FLEET
Fleet to query for. (nimbus.test)
-t TIMEOUT, --timeout=TIMEOUT
Connection timeout in seconds. (120)
-l LOG_LEVEL, --log-level=LOG_LEVEL
Logging level. (INFO)
-o OUTPUT_FILE, --output-file=OUTPUT_FILE
@ -47,7 +52,31 @@ Options:
Example: collect -i logstash-2019.03.01 output.json
```
# Timer
The script runs on a [systemd timer](https://github.com/status-im/infra-role-systemd-timer) which can be checked with:
```
$ sudo systemctl list-timers -a nimbus-stats.timer
NEXT LEFT LAST PASSED UNIT ACTIVATES
Wed 2020-02-19 10:50:00 UTC 37s left Wed 2020-02-19 10:45:00 UTC 4min 21s ago nimbus-stats.timer nimbus-stats.service
```
The timer triggers the `nimbus-stats` service:
```
$ sudo systemctl status nimbus-stats.service
● nimbus-stats.service - Generates stats for Nimbus cluster.
Loaded: loaded (/lib/systemd/system/nimbus-stats.service; static; vendor preset: enabled)
Active: inactive (dead) since Wed 2020-02-19 10:47:24 UTC; 2min 33s ago
Docs: https://github.com/status-im/infra-role-systemd-timer
Process: 24950 ExecStart=/usr/local/bin/nimbus-stats (code=exited, status=0/SUCCESS)
Main PID: 24950 (code=exited, status=0/SUCCESS)
Feb 19 10:47:21 master-01.aws-eu-central-1a.nimbus.test systemd[1]: Starting Generates stats for Nimbus cluster....
Feb 19 10:47:22 master-01.aws-eu-central-1a.nimbus.test nimbus-stats[24950]: [INFO]: Querying fleet: nimbus.test
Feb 19 10:47:24 master-01.aws-eu-central-1a.nimbus.test nimbus-stats[24950]: [INFO]: Found matching logs: 10000
Feb 19 10:47:24 master-01.aws-eu-central-1a.nimbus.test nimbus-stats[24950]: [INFO]: Saving to file: /var/www/nimbus/nimbus_stats.json
Feb 19 10:47:24 master-01.aws-eu-central-1a.nimbus.test systemd[1]: Started Generates stats for Nimbus cluster..
```
# Context
For more details see: https://github.com/status-im/infra-nimbus/issues/1

View File

@ -5,9 +5,10 @@ nimbus_stats_json_name: nimbus_stats.json
nimbus_stats_json: '{{ nimbus_stats_web_root }}/{{ nimbus_stats_json_name }}'
nimbus_stats_script: /usr/local/bin/collect_nimbus_stats.py
nimbus_stats_cron_script: /usr/local/bin/save_nimbus_stats.py
nimbus_stats_service_name: 'nimbus-stats'
# Consul settings needed to look up the ElasticSearch load balancer
consul_base_url: 'http://localhost:8500/v1/catalog'
consul_catalog_url: 'http://localhost:8500/v1/catalog'
consul_service_name: elasticsearch-lb
consul_service_tag: status-logs-search
consul_query_url: '{{ consul_base_url }}/service/{{ consul_service_name }}?tag={{ consul_service_tag }}'
consul_service_tag: nimbus-logs-search
consul_query_url: '{{ consul_catalog_url }}/service/{{ consul_service_name }}?tag={{ consul_service_tag }}'

View File

@ -20,14 +20,14 @@ DEFAULT_MESSAGES = [
ENV = os.environ
LOG = logging.getLogger('root')
handler = logging.StreamHandler(sys.stderr)
formatter = logging.Formatter('%(asctime)s [%(levelname)s]: %(message)s')
formatter = logging.Formatter('[%(levelname)s]: %(message)s')
handler.setFormatter(formatter)
LOG.addHandler(handler)
class ES:
def __init__(self, host, port, page_size):
def __init__(self, host, port, page_size, timeout):
self.page_size = page_size
self.es = Elasticsearch([host], port=port, timeout=30)
self.es = Elasticsearch([host], port=port, timeout=timeout)
def make_query(self, fleet, program, messages, after):
return {
@ -99,12 +99,14 @@ def parse_opts():
help='ElasticSearch port. (%default)')
parser.add_option('-p', '--program', default='*beacon-node-*',
help='Program to query for. (%default)')
parser.add_option('-s', '--since', default='now-30m',
parser.add_option('-s', '--since', default='now-15m',
help='Period for which to query logs. (%default)')
parser.add_option('-S', '--page-size', default=10000,
help='Size of results page. (%default)')
parser.add_option('-f', '--fleet', default='nimbus.test',
help='Fleet to query for. (%default)')
parser.add_option('-t', '--timeout', default=120,
help='Connection timeout in seconds. (%default)')
parser.add_option('-l', '--log-level', default='INFO',
help='Logging level. (%default)')
parser.add_option('-o', '--output-file',
@ -123,13 +125,13 @@ def main():
debug_options(opts)
es = ES(opts.es_host, opts.es_port, opts.page_size)
es = ES(opts.es_host, opts.es_port, opts.page_size, opts.timeout)
LOG.info('Querying fleet: %s', opts.fleet)
query = es.make_query(opts.fleet, opts.program, opts.messages, opts.since)
rval = es.get_logs(query)
LOG.info('Found matching logs: %d', rval['hits']['total'])
LOG.info('Found matching logs: %d', rval['hits']['total']['value'])
logs = rval['hits']['hits']
data = get_first_for_node(logs)

View File

@ -15,6 +15,12 @@
url: '{{ consul_query_url }}'
register: es_lbs
- name: Verify a load balancer was found
assert:
that: es_lbs.json | length > 0
quiet: true
fail_msg: 'No ElasticSearch LB found!'
- name: Install stat collecting script
copy:
src: collect.py
@ -22,27 +28,15 @@
mode: 0755
register: nimbus_script
- name: Create the cron script
copy:
dest: '{{ nimbus_stats_cron_script }}'
mode: 0755
content: |
#!/usr/bin/env bash
exec {{ nimbus_stats_script }} \
-H {{ es_lbs.json[0].ServiceAddress }} \
-o {{ nimbus_stats_json }}
- name: Create www directory
file:
path: '{{ nimbus_stats_web_root }}'
state: directory
owner: www-data
group: www-data
recurse: true
mode: 0755
- name: Run the script before Nginx configuration
command: '{{ nimbus_stats_cron_script }}'
when: nimbus_script.changed
- name: Create nginx config
template:
src: proxy.conf.j2
@ -67,9 +61,22 @@
notify:
- Save iptables rules
- name: Create a cron job for updating stats
cron:
name: Nimbus Fleet Stats
minute: '*/5'
user: root
job: '{{ nimbus_stats_cron_script }}'
- name: Set systemd timer
include_role: name=systemd-timer
vars:
systemd_timer_name: '{{ nimbus_stats_service_name }}'
systemd_timer_description: 'Generates stats for Nimbus cluster.'
systemd_timer_user: 'www-data'
systemd_timer_frequency: '*:0/5' # every 5 minutes
systemd_timer_timeout_sec: 120
systemd_timer_requires_extra: 'network.target'
systemd_timer_script_content: |
#!/usr/bin/env bash
exec {{ nimbus_stats_script }} \
-H {{ es_lbs.json[0].ServiceAddress }} \
-o {{ nimbus_stats_json }}
- name: Run the script before Nginx configuration
systemd:
name: '{{ nimbus_stats_service_name }}'
state: 'started'