mirror of
https://github.com/status-im/cabot.git
synced 2025-02-24 10:28:06 +00:00
Merge branch 'graphite-check-overhaul' of https://github.com/reddit/cabot into reddit-graphite-check-overhaul
This commit is contained in:
commit
ebbd626943
@ -79,6 +79,7 @@ def parse_metric(metric, mins_to_check=5):
|
||||
for target in data:
|
||||
series = {'values': [
|
||||
float(t[0]) for t in target['datapoints'][-mins_to_check:] if t[0] is not None]}
|
||||
series["target"] = target["target"]
|
||||
all_values.extend(series['values'])
|
||||
if series['values']:
|
||||
ret['num_series_with_data'] += 1
|
||||
|
@ -27,6 +27,7 @@ import re
|
||||
import time
|
||||
import os
|
||||
import subprocess
|
||||
import itertools
|
||||
|
||||
import requests
|
||||
from celery.utils.log import get_task_logger
|
||||
@ -593,6 +594,27 @@ class ICMPStatusCheck(StatusCheck):
|
||||
return result
|
||||
|
||||
|
||||
def minimize_targets(targets):
|
||||
split = [target.split(".") for target in targets]
|
||||
|
||||
prefix_nodes_in_common = 0
|
||||
for i, nodes in enumerate(itertools.izip(*split)):
|
||||
if any(node != nodes[0] for node in nodes):
|
||||
prefix_nodes_in_common = i
|
||||
break
|
||||
split = [nodes[prefix_nodes_in_common:] for nodes in split]
|
||||
|
||||
suffix_nodes_in_common = 0
|
||||
for i, nodes in enumerate(reversed(zip(*split))):
|
||||
if any(node != nodes[0] for node in nodes):
|
||||
suffix_nodes_in_common = i
|
||||
break
|
||||
if suffix_nodes_in_common:
|
||||
split = [nodes[:-suffix_nodes_in_common] for nodes in split]
|
||||
|
||||
return [".".join(nodes) for nodes in split]
|
||||
|
||||
|
||||
class GraphiteStatusCheck(StatusCheck):
|
||||
|
||||
class Meta(StatusCheck.Meta):
|
||||
@ -602,98 +624,80 @@ class GraphiteStatusCheck(StatusCheck):
|
||||
def check_category(self):
|
||||
return "Metric check"
|
||||
|
||||
def format_error_message(self, failure_value, actual_hosts, actual_failures):
|
||||
"""
|
||||
A summary of why the check is failing for inclusion in short alert messages
|
||||
Returns something like:
|
||||
"5.0 > 4 | 1/2 hosts"
|
||||
"""
|
||||
if isinstance(failure_value, (list, tuple)):
|
||||
failure_value = ', '.join([u'%0.1f' % v for v in failure_value])
|
||||
def format_error_message(self, failures, actual_hosts, hosts_by_target):
|
||||
if actual_hosts < self.expected_num_hosts:
|
||||
return "Hosts missing | %d/%d hosts" % (
|
||||
actual_hosts, self.expected_num_hosts)
|
||||
elif actual_hosts > 1:
|
||||
threshold = float(self.value)
|
||||
failures_by_host = ["%s: %s %s %0.1f" % (
|
||||
hosts_by_target[target], value, self.check_type, threshold)
|
||||
for target, value in failures]
|
||||
return ", ".join(failures_by_host)
|
||||
else:
|
||||
failure_value = u'%0.1f' % failure_value
|
||||
hosts_string = u''
|
||||
failures_string = u''
|
||||
if self.expected_num_hosts > 0:
|
||||
hosts_string = u' | %s/%s hosts' % (actual_hosts,
|
||||
self.expected_num_hosts)
|
||||
if self.expected_num_hosts > actual_hosts:
|
||||
return u'Hosts missing%s' % hosts_string
|
||||
if self.allowed_num_failures and actual_failures:
|
||||
failures_string = u' | %s/%s series failing (%s allowed)' % (
|
||||
actual_failures,
|
||||
actual_hosts,
|
||||
self.allowed_num_failures,
|
||||
)
|
||||
if failure_value is None:
|
||||
return "Failed to get metric from Graphite"
|
||||
return u"%s %s %0.1f%s%s" % (
|
||||
failure_value,
|
||||
self.check_type,
|
||||
float(self.value),
|
||||
hosts_string,
|
||||
failures_string,
|
||||
)
|
||||
target, value = failures[0]
|
||||
return "%s %s %0.1f" % (value, self.check_type, float(self.value))
|
||||
|
||||
def _run(self):
|
||||
result = StatusCheckResult(check=self)
|
||||
|
||||
failures = []
|
||||
graphite_output = parse_metric(self.metric, mins_to_check=self.frequency)
|
||||
if graphite_output['num_series_with_data'] > 0:
|
||||
result.average_value = graphite_output['average_value']
|
||||
failed = False
|
||||
for s in graphite_output['series']:
|
||||
failure_value = None
|
||||
if self.check_type == '<':
|
||||
failed = float(s['min']) < float(self.value)
|
||||
if failed:
|
||||
failure_value = s['min']
|
||||
elif self.check_type == '<=':
|
||||
failed = float(s['min']) <= float(self.value)
|
||||
if failed:
|
||||
failure_value = s['min']
|
||||
elif self.check_type == '>':
|
||||
failed = float(s['max']) > float(self.value)
|
||||
if failed:
|
||||
failure_value = s['max']
|
||||
elif self.check_type == '>=':
|
||||
failed = float(s['max']) >= float(self.value)
|
||||
if failed:
|
||||
failure_value = s['max']
|
||||
elif self.check_type == '==':
|
||||
failed = float(self.value) in s['values']
|
||||
if failed:
|
||||
failure_value = float(self.value)
|
||||
else:
|
||||
raise Exception(u'Check type %s not supported' %
|
||||
self.check_type)
|
||||
|
||||
if not failure_value is None:
|
||||
failures.append(failure_value)
|
||||
|
||||
allowed_num_failures = self.allowed_num_failures or 0
|
||||
# If there are more than expected failures
|
||||
if len(failures) - self.allowed_num_failures > 0:
|
||||
result.succeeded = False
|
||||
else:
|
||||
if graphite_output['error']:
|
||||
result.succeeded = False
|
||||
if graphite_output['num_series_with_data'] < self.expected_num_hosts:
|
||||
result.succeeded = False
|
||||
else:
|
||||
result.succeeded = True
|
||||
|
||||
try:
|
||||
result.raw_data = json.dumps(graphite_output['raw'])
|
||||
except:
|
||||
result.raw_data = graphite_output['raw']
|
||||
|
||||
if graphite_output["error"]:
|
||||
result.succeeded = False
|
||||
result.error = graphite_output["error"]
|
||||
return result
|
||||
|
||||
if graphite_output['num_series_with_data'] > 0:
|
||||
result.average_value = graphite_output['average_value']
|
||||
for s in graphite_output['series']:
|
||||
if not s["values"]:
|
||||
continue
|
||||
failure_value = None
|
||||
if self.check_type == '<':
|
||||
if float(s['min']) < float(self.value):
|
||||
failure_value = s['min']
|
||||
elif self.check_type == '<=':
|
||||
if float(s['min']) <= float(self.value):
|
||||
failure_value = s['min']
|
||||
elif self.check_type == '>':
|
||||
if float(s['max']) > float(self.value):
|
||||
failure_value = s['max']
|
||||
elif self.check_type == '>=':
|
||||
if float(s['max']) >= float(self.value):
|
||||
failure_value = s['max']
|
||||
elif self.check_type == '==':
|
||||
if float(self.value) in s['values']:
|
||||
failure_value = float(self.value)
|
||||
else:
|
||||
raise Exception(u'Check type %s not supported' %
|
||||
self.check_type)
|
||||
|
||||
if not failure_value is None:
|
||||
failures.append((s["target"], failure_value))
|
||||
|
||||
if len(failures) > self.allowed_num_failures:
|
||||
result.succeeded = False
|
||||
elif graphite_output['num_series_with_data'] < self.expected_num_hosts:
|
||||
result.succeeded = False
|
||||
else:
|
||||
result.succeeded = True
|
||||
|
||||
if not result.succeeded:
|
||||
targets = [s["target"] for s in graphite_output["series"]]
|
||||
hosts = minimize_targets(targets)
|
||||
hosts_by_target = dict(zip(targets, hosts))
|
||||
|
||||
result.error = self.format_error_message(
|
||||
failures,
|
||||
graphite_output['num_series_with_data'],
|
||||
len(failures),
|
||||
hosts_by_target,
|
||||
)
|
||||
|
||||
return result
|
||||
|
@ -23,7 +23,7 @@ from mock import Mock, patch
|
||||
from cabot.cabotapp.models import (
|
||||
GraphiteStatusCheck, JenkinsStatusCheck,
|
||||
HttpStatusCheck, ICMPStatusCheck, Service, Instance,
|
||||
StatusCheckResult, UserProfile)
|
||||
StatusCheckResult, UserProfile, minimize_targets)
|
||||
from cabot.cabotapp.views import StatusCheckReportForm
|
||||
from cabot.cabotapp.alert import send_alert
|
||||
from cabot.cabotapp.graphite import parse_metric
|
||||
@ -892,3 +892,42 @@ class TestAlerts(LocalTestCase):
|
||||
self.service.alert()
|
||||
self.assertEqual(fake_send_alert.call_count, 1)
|
||||
fake_send_alert.assert_called_with(self.service, duty_officers=[])
|
||||
|
||||
|
||||
class TestMinimizeTargets(LocalTestCase):
|
||||
def test_null(self):
|
||||
result = minimize_targets([])
|
||||
self.assertEqual(result, [])
|
||||
|
||||
def test_all_same(self):
|
||||
result = minimize_targets(["a", "a"])
|
||||
self.assertEqual(result, ["a", "a"])
|
||||
|
||||
def test_all_different(self):
|
||||
result = minimize_targets(["a", "b"])
|
||||
self.assertEqual(result, ["a", "b"])
|
||||
|
||||
def test_same_prefix(self):
|
||||
result = minimize_targets(["prefix.a", "prefix.b"])
|
||||
self.assertEqual(result, ["a", "b"])
|
||||
|
||||
result = minimize_targets(["prefix.second.a", "prefix.second.b"])
|
||||
self.assertEqual(result, ["a", "b"])
|
||||
|
||||
def test_same_suffix(self):
|
||||
result = minimize_targets(["a.suffix", "b.suffix"])
|
||||
self.assertEqual(result, ["a", "b"])
|
||||
|
||||
result = minimize_targets(["a.suffix.suffix", "b.suffix.suffix"])
|
||||
self.assertEqual(result, ["a", "b"])
|
||||
|
||||
result = minimize_targets(["a.b.suffix.suffix", "b.c.suffix.suffix"])
|
||||
self.assertEqual(result, ["a.b", "b.c"])
|
||||
|
||||
def test_same_prefix_and_suffix(self):
|
||||
result = minimize_targets(["prefix.a.suffix", "prefix.b.suffix"])
|
||||
self.assertEqual(result, ["a", "b"])
|
||||
|
||||
result = minimize_targets(["prefix.prefix.a.suffix.suffix",
|
||||
"prefix.prefix.b.suffix.suffix",])
|
||||
self.assertEqual(result, ["a", "b"])
|
||||
|
Loading…
x
Reference in New Issue
Block a user