From fb761f56034246452411cdacb9303a4b866beaf9 Mon Sep 17 00:00:00 2001 From: David Buxton Date: Fri, 1 Aug 2014 16:45:37 +0100 Subject: [PATCH] Bugfixes --- app/cabotapp/models.py | 93 ++++++++++++++--------- app/cabotapp/tasks.py | 4 +- app/cabotapp/views.py | 11 ++- app/celeryconfig.py | 2 + app/settings.py | 1 + app/templates/cabotapp/instance_form.html | 2 +- requirements.txt | 2 +- 7 files changed, 73 insertions(+), 42 deletions(-) diff --git a/app/cabotapp/models.py b/app/cabotapp/models.py index e4f6e3b..a22c137 100644 --- a/app/cabotapp/models.py +++ b/app/cabotapp/models.py @@ -5,6 +5,7 @@ from polymorphic import PolymorphicModel from django.db.models import F from django.core.urlresolvers import reverse from django.contrib.admin.models import User +from celery.exceptions import SoftTimeLimitExceeded from jenkins import get_job_status from .alert import send_alert @@ -13,6 +14,7 @@ from .graphite import parse_metric from .tasks import update_service, update_instance from datetime import datetime, timedelta from django.utils import timezone +from django.db import transaction import json import re @@ -172,7 +174,7 @@ class CheckGroupMixin(models.Model): self.save() self.snapshot.did_send_alert = True self.snapshot.save() - service_send_alert(self, duty_officers=get_duty_officers()) + send_alert(self, duty_officers=get_duty_officers()) @property def recent_snapshots(self): @@ -257,11 +259,11 @@ class Instance(CheckGroupMixin): new_instance.pk = None new_instance.id = None new_instance.name = "Copy of %s" % self.name - + new_instance.save() for check in checks: - check.duplicate(inst_set=[new_instance,], serv_set=check.service_set.all()) + check.duplicate(inst_set=[new_instance], serv_set=()) return new_instance.pk @@ -281,12 +283,6 @@ class Instance(CheckGroupMixin): ) self.snapshot.save() self.save() - if not (self.overall_status == Service.PASSING_STATUS and self.old_overall_status == Service.PASSING_STATUS): - self.alert() - - def alert(): - return -#We don't want alerts for instances class Meta: ordering = ['name'] @@ -302,9 +298,11 @@ class Instance(CheckGroupMixin): def active_icmp_status_checks(self): return self.icmp_status_checks().filter(active=True) +class Snapshot(models.Model): + + class Meta: + abstract = True -class ServiceStatusSnapshot(models.Model): - service = models.ForeignKey(Service, related_name='snapshots') time = models.DateTimeField(db_index=True) num_checks_active = models.IntegerField(default=0) num_checks_passing = models.IntegerField(default=0) @@ -312,17 +310,14 @@ class ServiceStatusSnapshot(models.Model): overall_status = models.TextField(default=Service.PASSING_STATUS) did_send_alert = models.IntegerField(default=False) +class ServiceStatusSnapshot(Snapshot): + service = models.ForeignKey(Service, related_name='snapshots') + def __unicode__(self): return u"%s: %s" % (self.service.name, self.overall_status) -class InstanceStatusSnapshot(models.Model): +class InstanceStatusSnapshot(Snapshot): instance = models.ForeignKey(Instance, related_name='snapshots') - time = models.DateTimeField(db_index=True) - num_checks_active = models.IntegerField(default=0) - num_checks_passing = models.IntegerField(default=0) - num_checks_failing = models.IntegerField(default=0) - overall_status = models.TextField(default=Service.PASSING_STATUS) - did_send_alert = models.IntegerField(default=False) def __unicode__(self): return u"%s: %s" % (self.instance.name, self.overall_status) @@ -360,7 +355,7 @@ class StatusCheck(PolymorphicModel): null=True, help_text='Number of successive failures permitted before check will be marked as failed. Default is 0, i.e. fail on first failure.' ) - created_by = models.ForeignKey(User, null=True) + created_by = models.ForeignKey(User, null=True) calculated_status = models.CharField( max_length=50, choices=Service.STATUSES, default=Service.CALCULATED_PASSING_STATUS, blank=True) last_run = models.DateTimeField(null=True) @@ -435,11 +430,11 @@ class StatusCheck(PolymorphicModel): return self.name def recent_results(self): - return self.statuscheckresult_set.all().order_by('-time_complete').defer('raw_data')[:10] + return StatusCheckResult.objects.filter(check=self).order_by('-time_complete').defer('raw_data')[:10] def last_result(self): try: - return self.recent_results()[0] + return StatusCheckResult.objects.filter(check=self).order_by('-time_complete').defer('raw_data')[0] except: return None @@ -447,14 +442,13 @@ class StatusCheck(PolymorphicModel): start = timezone.now() try: result = self._run() + except SoftTimeLimitExceeded as e: + result = StatusCheckResult(check=self) + result.error = u'Error in performing check: Celery soft time limit exceeded' + result.succeeded = False except Exception as e: result = StatusCheckResult(check=self) result.error = u'Error in performing check: %s' % (e,) - if result.error.startswith("Error in performing check: get() returned more than one Instance"): - first_instance = self.instance_set.all().order_by('id')[0] - self.instance_set = [first_instance] - first_instance_link = '' % reverse('instance', kwargs={'pk': first_instance.pk}) + first_instance.name + "" - result.error = "Error: This type of check can only be attached to one instance. All instances, apart from the oldest one (%s), have been detached from this check. The check will run normally next time." % first_instance_link result.succeeded = False finish = timezone.now() result.time = start @@ -470,25 +464,50 @@ class StatusCheck(PolymorphicModel): raise NotImplementedError('Subclasses should implement') def save(self, *args, **kwargs): - recent_results = self.recent_results() - if calculate_debounced_passing(recent_results, self.debounce): - self.calculated_status = Service.CALCULATED_PASSING_STATUS + if self.pk: + # This should not be necessary + with transaction.commit_manually(): + try: + recent_results = list(self.recent_results()) + if calculate_debounced_passing(recent_results, self.debounce): + self.calculated_status = Service.CALCULATED_PASSING_STATUS + else: + self.calculated_status = Service.CALCULATED_FAILING_STATUS + self.cached_health = serialize_recent_results(recent_results) + transaction.commit() + except SoftTimeLimitExceeded as e: + # Something weird with postgres + transaction.rollback() + logger.error('Celery time limit exceeded for getting results for %s' % self.pk) + self.calculated_status = Service.CALCULATED_FAILING_STATUS + self.cached_health = '-1' + except Exception as e: + transaction.rollback() + logger.error('Got exception when saving check: %s' % e) + self.calculated_status = Service.CALCULATED_FAILING_STATUS + self.cached_health = '-1' + try: + updated = StatusCheck.objects.get(pk=self.pk) + except StatusCheck.DoesNotExist as e: + logger.error('Cannot find myself (check %s) in the database, presumably have been deleted' % self.pk) + return else: - self.calculated_status = Service.CALCULATED_FAILING_STATUS - self.cached_health = serialize_recent_results(recent_results) + self.cached_health = '' + self.calculated_status = Service.CALCULATED_PASSING_STATUS ret = super(StatusCheck, self).save(*args, **kwargs) - # Update linked services self.update_related_services() self.update_related_instances() return ret - def duplicate(self, inst_set=[None,], serv_set=[None,]): + def duplicate(self, inst_set=None, serv_set=None): new_check = self new_check.pk = None new_check.id = None new_check.save() - new_check.instance_set = inst_set - new_check.service_set = serv_set + if inst_set is not None: + new_check.instance_set = inst_set + if serv_set is not None: + new_check.service_set = serv_set new_check.save() return new_check.pk @@ -500,7 +519,7 @@ class StatusCheck(PolymorphicModel): def update_related_instances(self): instances = self.instance_set.all() for instance in instances: - update_service.delay(instance.id) + update_instance.delay(instance.id) class ICMPStatusCheck(StatusCheck): @@ -516,7 +535,7 @@ class ICMPStatusCheck(StatusCheck): instances = self.instance_set.all() target = self.instance_set.get().address -#We need to read both STDOUT and STDERR because ping can write to both, depending on the kind of error. Thanks a lot, ping. + # We need to read both STDOUT and STDERR because ping can write to both, depending on the kind of error. Thanks a lot, ping. ping_process = subprocess.Popen("ping -c 1 " + target, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=True) response = ping_process.wait() diff --git a/app/cabotapp/tasks.py b/app/cabotapp/tasks.py index 6e0aeba..5b3d5aa 100644 --- a/app/cabotapp/tasks.py +++ b/app/cabotapp/tasks.py @@ -71,15 +71,17 @@ def update_service(service_or_id): service = service_or_id service.update_status() + @task(ignore_result=True) def update_instance(instance_or_id): from .models import Instance - if not isinstance(instance_or_id, Service): + if not isinstance(instance_or_id, Instance): instance = Instance.objects.get(id=instance_or_id) else: instance = instance_or_id instance.update_status() + @task(ignore_result=True) def update_shifts(): from .models import update_shifts as _update_shifts diff --git a/app/cabotapp/views.py b/app/cabotapp/views.py index a2dd6b7..a4f2127 100644 --- a/app/cabotapp/views.py +++ b/app/cabotapp/views.py @@ -85,6 +85,7 @@ class StatusCheckResultDetailView(LoginRequiredMixin, DetailView): model = StatusCheckResult context_object_name = 'result' + class SymmetricalForm(forms.ModelForm): symmetrical_fields = () # Iterable of 2-tuples (field, model) @@ -115,7 +116,9 @@ base_widgets = { class StatusCheckForm(SymmetricalForm): + symmetrical_fields = ('service_set', 'instance_set') + service_set = forms.ModelMultipleChoiceField( queryset=Service.objects.all(), required=False, @@ -140,6 +143,7 @@ class StatusCheckForm(SymmetricalForm): ) ) + class GraphiteStatusCheckForm(StatusCheckForm): class Meta: @@ -184,6 +188,7 @@ class ICMPStatusCheckForm(StatusCheckForm): ) widgets = dict(**base_widgets) + class HttpStatusCheckForm(StatusCheckForm): class Meta: @@ -244,6 +249,7 @@ class UserProfileForm(forms.ModelForm): model = UserProfile exclude = ('user',) + class InstanceForm(SymmetricalForm): symmetrical_fields = ('service_set',) @@ -291,8 +297,6 @@ class InstanceForm(SymmetricalForm): return ret - - class ServiceForm(forms.ModelForm): class Meta: @@ -523,6 +527,7 @@ class InstanceListView(LoginRequiredMixin, ListView): def get_queryset(self): return Instance.objects.all().order_by('name').prefetch_related('status_checks') + class ServiceListView(LoginRequiredMixin, ListView): model = Service context_object_name = 'services' @@ -629,12 +634,14 @@ class ServiceDeleteView(LoginRequiredMixin, DeleteView): context_object_name = 'service' template_name = 'cabotapp/service_confirm_delete.html' + class InstanceDeleteView(LoginRequiredMixin, DeleteView): model = Instance success_url = reverse_lazy('instances') context_object_name = 'instance' template_name = 'cabotapp/instance_confirm_delete.html' + class ShiftListView(LoginRequiredMixin, ListView): model = Shift context_object_name = 'shifts' diff --git a/app/celeryconfig.py b/app/celeryconfig.py index 1516062..11dab4b 100644 --- a/app/celeryconfig.py +++ b/app/celeryconfig.py @@ -6,6 +6,8 @@ CELERY_IMPORTS = ('app.cabotapp.tasks', ) CELERYBEAT_SCHEDULER = "djcelery.schedulers.DatabaseScheduler" CELERY_TASK_SERIALIZER = "json" CELERY_ACCEPT_CONTENT = ['json', 'msgpack', 'yaml'] +CELERYD_TASK_SOFT_TIME_LIMIT = 120 +CELERYD_TASK_TIME_LIMIT = 150 CELERYBEAT_SCHEDULE = { 'run-all-checks': { diff --git a/app/settings.py b/app/settings.py index 3f8075f..c972123 100644 --- a/app/settings.py +++ b/app/settings.py @@ -91,6 +91,7 @@ MIDDLEWARE_CLASSES = ( 'django.middleware.csrf.CsrfViewMiddleware', 'django.contrib.auth.middleware.AuthenticationMiddleware', 'django.contrib.messages.middleware.MessageMiddleware', + 'django.middleware.transaction.TransactionMiddleware', ) ROOT_URLCONF = 'app.urls' diff --git a/app/templates/cabotapp/instance_form.html b/app/templates/cabotapp/instance_form.html index 5da7d3c..c0d3063 100644 --- a/app/templates/cabotapp/instance_form.html +++ b/app/templates/cabotapp/instance_form.html @@ -18,7 +18,7 @@ {% if form.instance.id %}
- Delete instance + Delete instance
{% endif %} diff --git a/requirements.txt b/requirements.txt index 5478ca2..4f2f07e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -Django==1.4.10 +Django==1.4.13 PyJWT==0.1.2 South==0.7.6 amqp==1.3.3