Merge branch 'graphite-check-overhaul' of https://github.com/reddit/cabot into reddit-graphite-check-overhaul

2025-02-24 10:28:06 +00:00 · 2015-10-05 18:52:48 +01:00 · 2015-10-05 18:52:48 +01:00 · ebbd626943
commit ebbd626943
parent 80e56089a7 14f77c2f49
3 changed files with 120 additions and 76 deletions
--- a/cabot/cabotapp/graphite.py
+++ b/cabot/cabotapp/graphite.py
@ -79,6 +79,7 @@ def parse_metric(metric, mins_to_check=5):
    for target in data:
        series = {'values': [
            float(t[0]) for t in target['datapoints'][-mins_to_check:] if t[0] is not None]}
+        series["target"] = target["target"]
        all_values.extend(series['values'])
        if series['values']:
            ret['num_series_with_data'] += 1
--- a/cabot/cabotapp/models.py
+++ b/cabot/cabotapp/models.py
@ -27,6 +27,7 @@ import re
 import time
 import os
 import subprocess
+import itertools

 import requests
 from celery.utils.log import get_task_logger
@ -593,6 +594,27 @@ class ICMPStatusCheck(StatusCheck):
        return result


+def minimize_targets(targets):
+    split = [target.split(".") for target in targets]
+
+    prefix_nodes_in_common = 0
+    for i, nodes in enumerate(itertools.izip(*split)):
+        if any(node != nodes[0] for node in nodes):
+            prefix_nodes_in_common = i
+            break
+    split = [nodes[prefix_nodes_in_common:] for nodes in split]
+
+    suffix_nodes_in_common = 0
+    for i, nodes in enumerate(reversed(zip(*split))):
+        if any(node != nodes[0] for node in nodes):
+            suffix_nodes_in_common = i
+            break
+    if suffix_nodes_in_common:
+        split = [nodes[:-suffix_nodes_in_common] for nodes in split]
+
+    return [".".join(nodes) for nodes in split]
+
+
 class GraphiteStatusCheck(StatusCheck):

    class Meta(StatusCheck.Meta):
@ -602,98 +624,80 @@ class GraphiteStatusCheck(StatusCheck):
    def check_category(self):
        return "Metric check"

-    def format_error_message(self, failure_value, actual_hosts, actual_failures):
-        """
-        A summary of why the check is failing for inclusion in short alert messages
-        Returns something like:
-        "5.0 > 4 | 1/2 hosts"
-        """
-        if isinstance(failure_value, (list, tuple)):
-            failure_value = ', '.join([u'%0.1f' % v for v in failure_value])
+    def format_error_message(self, failures, actual_hosts, hosts_by_target):
+        if actual_hosts < self.expected_num_hosts:
+            return "Hosts missing | %d/%d hosts" % (
+                actual_hosts, self.expected_num_hosts)
+        elif actual_hosts > 1:
+            threshold = float(self.value)
+            failures_by_host = ["%s: %s %s %0.1f" % (
+                hosts_by_target[target], value, self.check_type, threshold)
+                for target, value in failures]
+            return ", ".join(failures_by_host)
        else:
-            failure_value = u'%0.1f' % failure_value
-        hosts_string = u''
-        failures_string = u''
-        if self.expected_num_hosts > 0:
-            hosts_string = u' | %s/%s hosts' % (actual_hosts,
-                                                self.expected_num_hosts)
-            if self.expected_num_hosts > actual_hosts:
-                return u'Hosts missing%s' % hosts_string
-        if self.allowed_num_failures and actual_failures:
-            failures_string = u' | %s/%s series failing (%s allowed)' % (
-                actual_failures,
-                actual_hosts,
-                self.allowed_num_failures,
-            )
-        if failure_value is None:
-            return "Failed to get metric from Graphite"
-        return u"%s %s %0.1f%s%s" % (
-            failure_value,
-            self.check_type,
-            float(self.value),
-            hosts_string,
-            failures_string,
-        )
+            target, value = failures[0]
+            return "%s %s %0.1f" % (value, self.check_type, float(self.value))

    def _run(self):
        result = StatusCheckResult(check=self)

        failures = []
        graphite_output = parse_metric(self.metric, mins_to_check=self.frequency)
-        if graphite_output['num_series_with_data'] > 0:
-            result.average_value = graphite_output['average_value']
-            failed = False
-            for s in graphite_output['series']:
-                failure_value = None
-                if self.check_type == '<':
-                    failed = float(s['min']) < float(self.value)
-                    if failed:
-                        failure_value = s['min']
-                elif self.check_type == '<=':
-                    failed = float(s['min']) <= float(self.value)
-                    if failed:
-                        failure_value = s['min']
-                elif self.check_type == '>':
-                    failed = float(s['max']) > float(self.value)
-                    if failed:
-                        failure_value = s['max']
-                elif self.check_type == '>=':
-                    failed = float(s['max']) >= float(self.value)
-                    if failed:
-                        failure_value = s['max']
-                elif self.check_type == '==':
-                    failed = float(self.value) in s['values']
-                    if failed:
-                        failure_value = float(self.value)
-                else:
-                    raise Exception(u'Check type %s not supported' %
-                                    self.check_type)
-
-                if not failure_value is None:
-                    failures.append(failure_value)
-
-        allowed_num_failures = self.allowed_num_failures or 0
-        # If there are more than expected failures
-        if len(failures) - self.allowed_num_failures > 0:
-            result.succeeded = False
-        else:
-            if graphite_output['error']:
-                result.succeeded = False
-            if graphite_output['num_series_with_data'] < self.expected_num_hosts:
-                result.succeeded = False
-            else:
-                result.succeeded = True

        try:
            result.raw_data = json.dumps(graphite_output['raw'])
        except:
            result.raw_data = graphite_output['raw']

+        if graphite_output["error"]:
+            result.succeeded = False
+            result.error = graphite_output["error"]
+            return result
+
+        if graphite_output['num_series_with_data'] > 0:
+            result.average_value = graphite_output['average_value']
+            for s in graphite_output['series']:
+                if not s["values"]:
+                    continue
+                failure_value = None
+                if self.check_type == '<':
+                    if float(s['min']) < float(self.value):
+                        failure_value = s['min']
+                elif self.check_type == '<=':
+                    if float(s['min']) <= float(self.value):
+                        failure_value = s['min']
+                elif self.check_type == '>':
+                    if float(s['max']) > float(self.value):
+                        failure_value = s['max']
+                elif self.check_type == '>=':
+                    if float(s['max']) >= float(self.value):
+                        failure_value = s['max']
+                elif self.check_type == '==':
+                    if float(self.value) in s['values']:
+                        failure_value = float(self.value)
+                else:
+                    raise Exception(u'Check type %s not supported' %
+                                    self.check_type)
+
+                if not failure_value is None:
+                    failures.append((s["target"], failure_value))
+
+        if len(failures) > self.allowed_num_failures:
+            result.succeeded = False
+        elif graphite_output['num_series_with_data'] < self.expected_num_hosts:
+            result.succeeded = False
+        else:
+            result.succeeded = True
+
        if not result.succeeded:
+            targets = [s["target"] for s in graphite_output["series"]]
+            hosts = minimize_targets(targets)
+            hosts_by_target = dict(zip(targets, hosts))
+
            result.error = self.format_error_message(
                failures,
                graphite_output['num_series_with_data'],
-                len(failures),
+                hosts_by_target,
            )

        return result
--- a/cabot/cabotapp/tests/tests_basic.py
+++ b/cabot/cabotapp/tests/tests_basic.py
@ -23,7 +23,7 @@ from mock import Mock, patch
 from cabot.cabotapp.models import (
    GraphiteStatusCheck, JenkinsStatusCheck,
    HttpStatusCheck, ICMPStatusCheck, Service, Instance,
-    StatusCheckResult, UserProfile)
+    StatusCheckResult, UserProfile, minimize_targets)
 from cabot.cabotapp.views import StatusCheckReportForm
 from cabot.cabotapp.alert import send_alert
 from cabot.cabotapp.graphite import parse_metric
@ -892,3 +892,42 @@ class TestAlerts(LocalTestCase):
        self.service.alert()
        self.assertEqual(fake_send_alert.call_count, 1)
        fake_send_alert.assert_called_with(self.service, duty_officers=[])
+
+
+class TestMinimizeTargets(LocalTestCase):
+    def test_null(self):
+        result = minimize_targets([])
+        self.assertEqual(result, [])
+
+    def test_all_same(self):
+        result = minimize_targets(["a", "a"])
+        self.assertEqual(result, ["a", "a"])
+
+    def test_all_different(self):
+        result = minimize_targets(["a", "b"])
+        self.assertEqual(result, ["a", "b"])
+
+    def test_same_prefix(self):
+        result = minimize_targets(["prefix.a", "prefix.b"])
+        self.assertEqual(result, ["a", "b"])
+
+        result = minimize_targets(["prefix.second.a", "prefix.second.b"])
+        self.assertEqual(result, ["a", "b"])
+
+    def test_same_suffix(self):
+        result = minimize_targets(["a.suffix", "b.suffix"])
+        self.assertEqual(result, ["a", "b"])
+
+        result = minimize_targets(["a.suffix.suffix", "b.suffix.suffix"])
+        self.assertEqual(result, ["a", "b"])
+
+        result = minimize_targets(["a.b.suffix.suffix", "b.c.suffix.suffix"])
+        self.assertEqual(result, ["a.b", "b.c"])
+
+    def test_same_prefix_and_suffix(self):
+        result = minimize_targets(["prefix.a.suffix", "prefix.b.suffix"])
+        self.assertEqual(result, ["a", "b"])
+
+        result = minimize_targets(["prefix.prefix.a.suffix.suffix",
+                                   "prefix.prefix.b.suffix.suffix",])
+        self.assertEqual(result, ["a", "b"])