Instances now get their status updated correctly. ICMPChecks can no longer have more than one Instance attached (this used to raise an error instead). Minor cleanups.

This commit is contained in:
Jonathan Montineri 2014-07-31 17:25:50 +01:00 committed by David Buxton
parent 6cb4bd4e97
commit 558f18c04e
5 changed files with 78 additions and 24 deletions

View File

@ -3,6 +3,7 @@ from django.conf import settings
from django.core.exceptions import ValidationError
from polymorphic import PolymorphicModel
from django.db.models import F
from django.core.urlresolvers import reverse
from django.contrib.admin.models import User
from jenkins import get_job_status
@ -10,7 +11,7 @@ from .alert import send_alert
from .calendar import get_events
from .graphite import parse_metric
from .alert import send_alert
from .tasks import update_service
from .tasks import update_service, update_instance
from datetime import datetime, timedelta
from django.utils import timezone
@ -18,6 +19,7 @@ import json
import re
import time
import os
import subprocess
import requests
from celery.utils.log import get_task_logger
@ -132,24 +134,6 @@ class CheckGroupMixin(models.Model):
def __unicode__(self):
    """Use the object's ``name`` as its text representation."""
    display_name = self.name
    return display_name
def update_status(self):
    """Recalculate the overall status, store a snapshot and alert if needed.

    The status held before the recalculation is kept on
    ``old_overall_status``. A ``ServiceStatusSnapshot`` carrying the
    active, passing and failing check counts is saved first, then the
    object itself. ``alert()`` is called unless both the previous and the
    new status are passing.
    """
    self.old_overall_status = self.overall_status

    # Only active checks feed into our calculation
    failing_count = self.all_failing_checks().count()
    self.overall_status = self.most_severe(self.all_failing_checks())

    active_count = self.active_status_checks().count()
    passing_count = self.active_status_checks().count() - failing_count

    self.snapshot = ServiceStatusSnapshot(
        service=self,
        num_checks_active=active_count,
        num_checks_passing=passing_count,
        num_checks_failing=failing_count,
        overall_status=self.overall_status,
        time=timezone.now(),
    )
    self.snapshot.save()
    self.save()

    # Stay quiet only when the status was passing and still is.
    now_passing = self.overall_status == Service.PASSING_STATUS
    was_passing = self.old_overall_status == Service.PASSING_STATUS
    if now_passing and was_passing:
        return
    self.alert()
def most_severe(self, check_list):
failures = [c.importance for c in check_list]
@ -233,6 +217,24 @@ class CheckGroupMixin(models.Model):
class Service(CheckGroupMixin):
def update_status(self):
    """Refresh this service's overall status and record a snapshot of it.

    Steps, in order:

    1. Remember the current status on ``old_overall_status``.
    2. Set ``overall_status`` to the most severe importance among the
       failing checks.
    3. Save a ``ServiceStatusSnapshot`` with the active / passing /
       failing counts, then save the service.
    4. Call ``alert()`` unless the status was passing before and is
       passing now.
    """
    self.old_overall_status = self.overall_status

    # Only active checks feed into our calculation
    num_failing = self.all_failing_checks().count()
    self.overall_status = self.most_severe(self.all_failing_checks())

    num_active = self.active_status_checks().count()
    num_passing = self.active_status_checks().count() - num_failing

    snapshot = ServiceStatusSnapshot(
        service=self,
        num_checks_active=num_active,
        num_checks_passing=num_passing,
        num_checks_failing=num_failing,
        overall_status=self.overall_status,
        time=timezone.now(),
    )
    self.snapshot = snapshot
    self.snapshot.save()
    self.save()

    # Alert on every outcome except "passing before and passing now".
    if (self.overall_status != Service.PASSING_STATUS
            or self.old_overall_status != Service.PASSING_STATUS):
        self.alert()
instances = models.ManyToManyField(
'Instance',
blank=True,
@ -252,6 +254,26 @@ class Service(CheckGroupMixin):
class Instance(CheckGroupMixin):
def update_status(self):
    """Refresh this instance's overall status and record a snapshot of it.

    The current status is first kept on ``old_overall_status``. The new
    ``overall_status`` is the most severe importance among the failing
    checks. An ``InstanceStatusSnapshot`` with the active / passing /
    failing counts is saved, followed by the instance itself, and
    ``alert()`` is called unless the status was passing before and is
    passing now.
    """
    self.old_overall_status = self.overall_status

    # Only active checks feed into our calculation
    num_failing = self.all_failing_checks().count()
    self.overall_status = self.most_severe(self.all_failing_checks())

    num_active = self.active_status_checks().count()
    num_passing = self.active_status_checks().count() - num_failing

    snapshot = InstanceStatusSnapshot(
        instance=self,
        num_checks_active=num_active,
        num_checks_passing=num_passing,
        num_checks_failing=num_failing,
        overall_status=self.overall_status,
        time=timezone.now(),
    )
    self.snapshot = snapshot
    self.snapshot.save()
    self.save()

    # The passing constant is read through Service, as in the original.
    if (self.overall_status != Service.PASSING_STATUS
            or self.old_overall_status != Service.PASSING_STATUS):
        self.alert()
class Meta:
ordering = ['name']
@ -279,6 +301,17 @@ class ServiceStatusSnapshot(models.Model):
def __unicode__(self):
    """Return ``"<service name>: <overall status>"`` as the display text."""
    service_name = self.service.name
    return u"%s: %s" % (service_name, self.overall_status)
class InstanceStatusSnapshot(models.Model):
    """Point-in-time record of an instance's check counts and overall status."""

    # Owning instance; reachable from the instance as ``snapshots``.
    instance = models.ForeignKey(Instance, related_name='snapshots')
    # When the snapshot was taken (indexed).
    time = models.DateTimeField(db_index=True)
    # Check counts at snapshot time.
    num_checks_active = models.IntegerField(default=0)
    num_checks_passing = models.IntegerField(default=0)
    num_checks_failing = models.IntegerField(default=0)
    # Defaults to the passing status constant read from Service.
    overall_status = models.TextField(default=Service.PASSING_STATUS)
    # Integer column with a boolean default, kept exactly as declared.
    did_send_alert = models.IntegerField(default=False)

    def __unicode__(self):
        """Return ``"<instance name>: <overall status>"``."""
        instance_name = self.instance.name
        return u"%s: %s" % (instance_name, self.overall_status)
class StatusCheck(PolymorphicModel):
@ -403,6 +436,11 @@ class StatusCheck(PolymorphicModel):
except Exception as e:
result = StatusCheckResult(check=self)
result.error = u'Error in performing check: %s' % (e,)
if result.error.startswith("Error in performing check: get() returned more than one Instance"):
first_instance = self.instance_set.all().order_by('id')[0]
self.instance_set = [first_instance]
first_instance_link = '<a href="%s">' % reverse('instance', kwargs={'pk': first_instance.pk}) + first_instance.name + "</a>"
result.error = "Error: This type of check can only be attached to one instance. All instances, apart from the oldest one (%s), have been detached from this check. The check will run normally next time." % first_instance_link
result.succeeded = False
finish = timezone.now()
result.time = start
@ -427,6 +465,7 @@ class StatusCheck(PolymorphicModel):
ret = super(StatusCheck, self).save(*args, **kwargs)
# Update linked services
self.update_related_services()
self.update_related_instances()
return ret
def update_related_services(self):
@ -434,6 +473,7 @@ class StatusCheck(PolymorphicModel):
for service in services:
update_service.delay(service.id)
def update_related_instances(self):
    """Queue a status refresh for every instance attached to this check.

    Each attached instance's id is handed to the ``update_instance`` task.
    The earlier code dispatched the service task (``update_service``) with
    an instance id, so the instance task was never queued from here.
    """
    for instance in self.instance_set.all():
        update_instance.delay(instance.id)
@ -451,12 +491,18 @@ class ICMPStatusCheck(StatusCheck):
result = StatusCheckResult(check=self)
instances = self.instance_set.all()
target = self.instance_set.get().address
response = os.system("ping -c 1 " + target)
#We need to read both STDOUT and STDERR because ping can write to both, depending on the kind of error. Thanks a lot, ping.
ping_process = subprocess.Popen("ping -c 1 " + target, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=True)
response = ping_process.wait()
if response == 0:
result.succeeded = True
else:
output = ping_process.stdout.read()
result.succeeded = False
result.error = "Could not connect, host is most likely down"
result.error = output
return result

View File

@ -71,6 +71,14 @@ def update_service(service_or_id):
service = service_or_id
service.update_status()
@task(ignore_result=True)
def update_instance(instance_or_id):
    """Recalculate the status of one instance.

    ``instance_or_id`` may be an ``Instance`` object or the id of one; an
    id is resolved with ``Instance.objects.get`` before ``update_status()``
    is called on the instance.
    """
    # Function-scope import, as in the original.
    # NOTE(review): presumably avoids a circular import, since models
    # imports from .tasks - confirm.
    from .models import Instance

    # The type check must be against Instance: the original tested
    # ``Service``, which this function never imports.
    if isinstance(instance_or_id, Instance):
        instance = instance_or_id
    else:
        instance = Instance.objects.get(id=instance_or_id)
    instance.update_status()
@task(ignore_result=True)
def update_shifts():

View File

@ -70,7 +70,7 @@
</td>
{% endif %}
<td title="">
{% if check.polymorphic_ctype.model == 'graphitestatuscheck' %}{{ check.metric|truncatechars:70 }} {{ check.check_type }} {{ check.value }}{% if check.expected_num_hosts %} (from {{ check.expected_num_hosts }} hosts){% endif %}{% elif check.polymorphic_ctype.model == 'httpstatuscheck' %}Status code {{ check.status_code }} from {{ check.endpoint }}{% if check.text_match %}; match text /{{ check.text_match }}/{% endif %}{% elif check.polymorphic_ctype.model == 'jenkinsstatuscheck' %}Monitor job {{ check.name }}{% if check.max_queued_build_time %}; check no build waiting for >{{ check.max_queued_build_time }} minutes{% endif %}{% endif %}
{% if check.polymorphic_ctype.model == 'graphitestatuscheck' %}{{ check.metric|truncatechars:70 }} {{ check.check_type }} {{ check.value }}{% if check.expected_num_hosts %} (from {{ check.expected_num_hosts }} hosts){% endif %}{% elif check.polymorphic_ctype.model == 'icmpstatuscheck' %}ICMP Reply from {{ check.instance_set.all.0.address }}{% elif check.polymorphic_ctype.model == 'httpstatuscheck' %}Status code {{ check.status_code }} from {{ check.endpoint }}{% if check.text_match %}; match text /{{ check.text_match }}/{% endif %}{% elif check.polymorphic_ctype.model == 'jenkinsstatuscheck' %}Monitor job {{ check.name }}{% if check.max_queued_build_time %}; check no build waiting for >{{ check.max_queued_build_time }} minutes{% endif %}{% endif %}
</td>
<td>{{ check.get_importance_display }}</td>
<td>

View File

@ -70,7 +70,7 @@
</td>
{% endif %}
<td title="">
{% if check.polymorphic_ctype.model == 'graphitestatuscheck' %}{{ check.metric|truncatechars:70 }} {{ check.check_type }} {{ check.value }}{% if check.expected_num_hosts %} (from {{ check.expected_num_hosts }} hosts){% endif %}{% elif check.polymorphic_ctype.model == 'httpstatuscheck' %}Status code {{ check.status_code }} from {{ check.endpoint }}{% if check.text_match %}; match text /{{ check.text_match }}/{% endif %}{% elif check.polymorphic_ctype.model == 'jenkinsstatuscheck' %}Monitor job {{ check.name }}{% if check.max_queued_build_time %}; check no build waiting for >{{ check.max_queued_build_time }} minutes{% endif %}{% endif %}
{% if check.polymorphic_ctype.model == 'graphitestatuscheck' %}{{ check.metric|truncatechars:70 }} {{ check.check_type }} {{ check.value }}{% if check.expected_num_hosts %} (from {{ check.expected_num_hosts }} hosts){% endif %}{% elif check.polymorphic_ctype.model == 'icmpstatuscheck' %}ICMP Reply from {{ check.instance_set.all.0.address }}{% elif check.polymorphic_ctype.model == 'httpstatuscheck' %}Status code {{ check.status_code }} from {{ check.endpoint }}{% if check.text_match %}; match text /{{ check.text_match }}/{% endif %}{% elif check.polymorphic_ctype.model == 'jenkinsstatuscheck' %}Monitor job {{ check.name }}{% if check.max_queued_build_time %}; check no build waiting for >{{ check.max_queued_build_time }} minutes{% endif %}{% endif %}
</td>
<td>{{ check.get_importance_display }}</td>
<td>

View File

@ -49,7 +49,7 @@
</td>
<td>{{ result.time_complete }}</td>
<td>{{ result.took }}</td>
<td>{{ result.error|default:"" }}</td>
<td>{% autoescape off %}{{ result.error|default:"" }}{% endautoescape %}</td>
</tr>
{% endfor %}
</tbody>