This commit is contained in:
David Buxton 2014-08-01 16:45:37 +01:00
parent 4e3968fd96
commit fb761f5603
7 changed files with 73 additions and 42 deletions

View File

@ -5,6 +5,7 @@ from polymorphic import PolymorphicModel
from django.db.models import F from django.db.models import F
from django.core.urlresolvers import reverse from django.core.urlresolvers import reverse
from django.contrib.admin.models import User from django.contrib.admin.models import User
from celery.exceptions import SoftTimeLimitExceeded
from jenkins import get_job_status from jenkins import get_job_status
from .alert import send_alert from .alert import send_alert
@ -13,6 +14,7 @@ from .graphite import parse_metric
from .tasks import update_service, update_instance from .tasks import update_service, update_instance
from datetime import datetime, timedelta from datetime import datetime, timedelta
from django.utils import timezone from django.utils import timezone
from django.db import transaction
import json import json
import re import re
@ -172,7 +174,7 @@ class CheckGroupMixin(models.Model):
self.save() self.save()
self.snapshot.did_send_alert = True self.snapshot.did_send_alert = True
self.snapshot.save() self.snapshot.save()
service_send_alert(self, duty_officers=get_duty_officers()) send_alert(self, duty_officers=get_duty_officers())
@property @property
def recent_snapshots(self): def recent_snapshots(self):
@ -261,7 +263,7 @@ class Instance(CheckGroupMixin):
new_instance.save() new_instance.save()
for check in checks: for check in checks:
check.duplicate(inst_set=[new_instance,], serv_set=check.service_set.all()) check.duplicate(inst_set=[new_instance], serv_set=())
return new_instance.pk return new_instance.pk
@ -281,12 +283,6 @@ class Instance(CheckGroupMixin):
) )
self.snapshot.save() self.snapshot.save()
self.save() self.save()
if not (self.overall_status == Service.PASSING_STATUS and self.old_overall_status == Service.PASSING_STATUS):
self.alert()
def alert():
return
#We don't want alerts for instances
class Meta: class Meta:
ordering = ['name'] ordering = ['name']
@ -302,9 +298,11 @@ class Instance(CheckGroupMixin):
def active_icmp_status_checks(self): def active_icmp_status_checks(self):
return self.icmp_status_checks().filter(active=True) return self.icmp_status_checks().filter(active=True)
class Snapshot(models.Model):
class Meta:
abstract = True
class ServiceStatusSnapshot(models.Model):
service = models.ForeignKey(Service, related_name='snapshots')
time = models.DateTimeField(db_index=True) time = models.DateTimeField(db_index=True)
num_checks_active = models.IntegerField(default=0) num_checks_active = models.IntegerField(default=0)
num_checks_passing = models.IntegerField(default=0) num_checks_passing = models.IntegerField(default=0)
@ -312,17 +310,14 @@ class ServiceStatusSnapshot(models.Model):
overall_status = models.TextField(default=Service.PASSING_STATUS) overall_status = models.TextField(default=Service.PASSING_STATUS)
did_send_alert = models.IntegerField(default=False) did_send_alert = models.IntegerField(default=False)
class ServiceStatusSnapshot(Snapshot):
service = models.ForeignKey(Service, related_name='snapshots')
def __unicode__(self): def __unicode__(self):
return u"%s: %s" % (self.service.name, self.overall_status) return u"%s: %s" % (self.service.name, self.overall_status)
class InstanceStatusSnapshot(models.Model): class InstanceStatusSnapshot(Snapshot):
instance = models.ForeignKey(Instance, related_name='snapshots') instance = models.ForeignKey(Instance, related_name='snapshots')
time = models.DateTimeField(db_index=True)
num_checks_active = models.IntegerField(default=0)
num_checks_passing = models.IntegerField(default=0)
num_checks_failing = models.IntegerField(default=0)
overall_status = models.TextField(default=Service.PASSING_STATUS)
did_send_alert = models.IntegerField(default=False)
def __unicode__(self): def __unicode__(self):
return u"%s: %s" % (self.instance.name, self.overall_status) return u"%s: %s" % (self.instance.name, self.overall_status)
@ -435,11 +430,11 @@ class StatusCheck(PolymorphicModel):
return self.name return self.name
def recent_results(self): def recent_results(self):
return self.statuscheckresult_set.all().order_by('-time_complete').defer('raw_data')[:10] return StatusCheckResult.objects.filter(check=self).order_by('-time_complete').defer('raw_data')[:10]
def last_result(self): def last_result(self):
try: try:
return self.recent_results()[0] return StatusCheckResult.objects.filter(check=self).order_by('-time_complete').defer('raw_data')[0]
except: except:
return None return None
@ -447,14 +442,13 @@ class StatusCheck(PolymorphicModel):
start = timezone.now() start = timezone.now()
try: try:
result = self._run() result = self._run()
except SoftTimeLimitExceeded as e:
result = StatusCheckResult(check=self)
result.error = u'Error in performing check: Celery soft time limit exceeded'
result.succeeded = False
except Exception as e: except Exception as e:
result = StatusCheckResult(check=self) result = StatusCheckResult(check=self)
result.error = u'Error in performing check: %s' % (e,) result.error = u'Error in performing check: %s' % (e,)
if result.error.startswith("Error in performing check: get() returned more than one Instance"):
first_instance = self.instance_set.all().order_by('id')[0]
self.instance_set = [first_instance]
first_instance_link = '<a href="%s">' % reverse('instance', kwargs={'pk': first_instance.pk}) + first_instance.name + "</a>"
result.error = "Error: This type of check can only be attached to one instance. All instances, apart from the oldest one (%s), have been detached from this check. The check will run normally next time." % first_instance_link
result.succeeded = False result.succeeded = False
finish = timezone.now() finish = timezone.now()
result.time = start result.time = start
@ -470,24 +464,49 @@ class StatusCheck(PolymorphicModel):
raise NotImplementedError('Subclasses should implement') raise NotImplementedError('Subclasses should implement')
def save(self, *args, **kwargs): def save(self, *args, **kwargs):
recent_results = self.recent_results() if self.pk:
# This should not be necessary
with transaction.commit_manually():
try:
recent_results = list(self.recent_results())
if calculate_debounced_passing(recent_results, self.debounce): if calculate_debounced_passing(recent_results, self.debounce):
self.calculated_status = Service.CALCULATED_PASSING_STATUS self.calculated_status = Service.CALCULATED_PASSING_STATUS
else: else:
self.calculated_status = Service.CALCULATED_FAILING_STATUS self.calculated_status = Service.CALCULATED_FAILING_STATUS
self.cached_health = serialize_recent_results(recent_results) self.cached_health = serialize_recent_results(recent_results)
transaction.commit()
except SoftTimeLimitExceeded as e:
# Something weird with postgres
transaction.rollback()
logger.error('Celery time limit exceeded for getting results for %s' % self.pk)
self.calculated_status = Service.CALCULATED_FAILING_STATUS
self.cached_health = '-1'
except Exception as e:
transaction.rollback()
logger.error('Got exception when saving check: %s' % e)
self.calculated_status = Service.CALCULATED_FAILING_STATUS
self.cached_health = '-1'
try:
updated = StatusCheck.objects.get(pk=self.pk)
except StatusCheck.DoesNotExist as e:
logger.error('Cannot find myself (check %s) in the database, presumably have been deleted' % self.pk)
return
else:
self.cached_health = ''
self.calculated_status = Service.CALCULATED_PASSING_STATUS
ret = super(StatusCheck, self).save(*args, **kwargs) ret = super(StatusCheck, self).save(*args, **kwargs)
# Update linked services
self.update_related_services() self.update_related_services()
self.update_related_instances() self.update_related_instances()
return ret return ret
def duplicate(self, inst_set=[None,], serv_set=[None,]): def duplicate(self, inst_set=None, serv_set=None):
new_check = self new_check = self
new_check.pk = None new_check.pk = None
new_check.id = None new_check.id = None
new_check.save() new_check.save()
if inst_set is not None:
new_check.instance_set = inst_set new_check.instance_set = inst_set
if serv_set is not None:
new_check.service_set = serv_set new_check.service_set = serv_set
new_check.save() new_check.save()
return new_check.pk return new_check.pk
@ -500,7 +519,7 @@ class StatusCheck(PolymorphicModel):
def update_related_instances(self): def update_related_instances(self):
instances = self.instance_set.all() instances = self.instance_set.all()
for instance in instances: for instance in instances:
update_service.delay(instance.id) update_instance.delay(instance.id)
class ICMPStatusCheck(StatusCheck): class ICMPStatusCheck(StatusCheck):
@ -516,7 +535,7 @@ class ICMPStatusCheck(StatusCheck):
instances = self.instance_set.all() instances = self.instance_set.all()
target = self.instance_set.get().address target = self.instance_set.get().address
#We need to read both STDOUT and STDERR because ping can write to both, depending on the kind of error. Thanks a lot, ping. # We need to read both STDOUT and STDERR because ping can write to both, depending on the kind of error. Thanks a lot, ping.
ping_process = subprocess.Popen("ping -c 1 " + target, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=True) ping_process = subprocess.Popen("ping -c 1 " + target, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=True)
response = ping_process.wait() response = ping_process.wait()

View File

@ -71,15 +71,17 @@ def update_service(service_or_id):
service = service_or_id service = service_or_id
service.update_status() service.update_status()
@task(ignore_result=True) @task(ignore_result=True)
def update_instance(instance_or_id): def update_instance(instance_or_id):
from .models import Instance from .models import Instance
if not isinstance(instance_or_id, Service): if not isinstance(instance_or_id, Instance):
instance = Instance.objects.get(id=instance_or_id) instance = Instance.objects.get(id=instance_or_id)
else: else:
instance = instance_or_id instance = instance_or_id
instance.update_status() instance.update_status()
@task(ignore_result=True) @task(ignore_result=True)
def update_shifts(): def update_shifts():
from .models import update_shifts as _update_shifts from .models import update_shifts as _update_shifts

View File

@ -85,6 +85,7 @@ class StatusCheckResultDetailView(LoginRequiredMixin, DetailView):
model = StatusCheckResult model = StatusCheckResult
context_object_name = 'result' context_object_name = 'result'
class SymmetricalForm(forms.ModelForm): class SymmetricalForm(forms.ModelForm):
symmetrical_fields = () # Iterable of 2-tuples (field, model) symmetrical_fields = () # Iterable of 2-tuples (field, model)
@ -115,7 +116,9 @@ base_widgets = {
class StatusCheckForm(SymmetricalForm): class StatusCheckForm(SymmetricalForm):
symmetrical_fields = ('service_set', 'instance_set') symmetrical_fields = ('service_set', 'instance_set')
service_set = forms.ModelMultipleChoiceField( service_set = forms.ModelMultipleChoiceField(
queryset=Service.objects.all(), queryset=Service.objects.all(),
required=False, required=False,
@ -140,6 +143,7 @@ class StatusCheckForm(SymmetricalForm):
) )
) )
class GraphiteStatusCheckForm(StatusCheckForm): class GraphiteStatusCheckForm(StatusCheckForm):
class Meta: class Meta:
@ -184,6 +188,7 @@ class ICMPStatusCheckForm(StatusCheckForm):
) )
widgets = dict(**base_widgets) widgets = dict(**base_widgets)
class HttpStatusCheckForm(StatusCheckForm): class HttpStatusCheckForm(StatusCheckForm):
class Meta: class Meta:
@ -244,6 +249,7 @@ class UserProfileForm(forms.ModelForm):
model = UserProfile model = UserProfile
exclude = ('user',) exclude = ('user',)
class InstanceForm(SymmetricalForm): class InstanceForm(SymmetricalForm):
symmetrical_fields = ('service_set',) symmetrical_fields = ('service_set',)
@ -291,8 +297,6 @@ class InstanceForm(SymmetricalForm):
return ret return ret
class ServiceForm(forms.ModelForm): class ServiceForm(forms.ModelForm):
class Meta: class Meta:
@ -523,6 +527,7 @@ class InstanceListView(LoginRequiredMixin, ListView):
def get_queryset(self): def get_queryset(self):
return Instance.objects.all().order_by('name').prefetch_related('status_checks') return Instance.objects.all().order_by('name').prefetch_related('status_checks')
class ServiceListView(LoginRequiredMixin, ListView): class ServiceListView(LoginRequiredMixin, ListView):
model = Service model = Service
context_object_name = 'services' context_object_name = 'services'
@ -629,12 +634,14 @@ class ServiceDeleteView(LoginRequiredMixin, DeleteView):
context_object_name = 'service' context_object_name = 'service'
template_name = 'cabotapp/service_confirm_delete.html' template_name = 'cabotapp/service_confirm_delete.html'
class InstanceDeleteView(LoginRequiredMixin, DeleteView): class InstanceDeleteView(LoginRequiredMixin, DeleteView):
model = Instance model = Instance
success_url = reverse_lazy('instances') success_url = reverse_lazy('instances')
context_object_name = 'instance' context_object_name = 'instance'
template_name = 'cabotapp/instance_confirm_delete.html' template_name = 'cabotapp/instance_confirm_delete.html'
class ShiftListView(LoginRequiredMixin, ListView): class ShiftListView(LoginRequiredMixin, ListView):
model = Shift model = Shift
context_object_name = 'shifts' context_object_name = 'shifts'

View File

@ -6,6 +6,8 @@ CELERY_IMPORTS = ('app.cabotapp.tasks', )
CELERYBEAT_SCHEDULER = "djcelery.schedulers.DatabaseScheduler" CELERYBEAT_SCHEDULER = "djcelery.schedulers.DatabaseScheduler"
CELERY_TASK_SERIALIZER = "json" CELERY_TASK_SERIALIZER = "json"
CELERY_ACCEPT_CONTENT = ['json', 'msgpack', 'yaml'] CELERY_ACCEPT_CONTENT = ['json', 'msgpack', 'yaml']
CELERYD_TASK_SOFT_TIME_LIMIT = 120
CELERYD_TASK_TIME_LIMIT = 150
CELERYBEAT_SCHEDULE = { CELERYBEAT_SCHEDULE = {
'run-all-checks': { 'run-all-checks': {

View File

@ -91,6 +91,7 @@ MIDDLEWARE_CLASSES = (
'django.middleware.csrf.CsrfViewMiddleware', 'django.middleware.csrf.CsrfViewMiddleware',
'django.contrib.auth.middleware.AuthenticationMiddleware', 'django.contrib.auth.middleware.AuthenticationMiddleware',
'django.contrib.messages.middleware.MessageMiddleware', 'django.contrib.messages.middleware.MessageMiddleware',
'django.middleware.transaction.TransactionMiddleware',
) )
ROOT_URLCONF = 'app.urls' ROOT_URLCONF = 'app.urls'

View File

@ -18,7 +18,7 @@
</div> </div>
{% if form.instance.id %} {% if form.instance.id %}
<div class="col-xs-4"> <div class="col-xs-4">
<a class="btn btn-danger" href="{% url delete-service form.instance.id %}">Delete instance</a> <a class="btn btn-danger" href="{% url delete-instance form.instance.id %}">Delete instance</a>
</div> </div>
{% endif %} {% endif %}
</div> </div>

View File

@ -1,4 +1,4 @@
Django==1.4.10 Django==1.4.13
PyJWT==0.1.2 PyJWT==0.1.2
South==0.7.6 South==0.7.6
amqp==1.3.3 amqp==1.3.3