This commit is contained in:
David Buxton 2014-08-01 16:45:37 +01:00
parent 4e3968fd96
commit fb761f5603
7 changed files with 73 additions and 42 deletions

View File

@ -5,6 +5,7 @@ from polymorphic import PolymorphicModel
from django.db.models import F
from django.core.urlresolvers import reverse
from django.contrib.admin.models import User
from celery.exceptions import SoftTimeLimitExceeded
from jenkins import get_job_status
from .alert import send_alert
@ -13,6 +14,7 @@ from .graphite import parse_metric
from .tasks import update_service, update_instance
from datetime import datetime, timedelta
from django.utils import timezone
from django.db import transaction
import json
import re
@ -172,7 +174,7 @@ class CheckGroupMixin(models.Model):
self.save()
self.snapshot.did_send_alert = True
self.snapshot.save()
service_send_alert(self, duty_officers=get_duty_officers())
send_alert(self, duty_officers=get_duty_officers())
@property
def recent_snapshots(self):
@ -257,11 +259,11 @@ class Instance(CheckGroupMixin):
new_instance.pk = None
new_instance.id = None
new_instance.name = "Copy of %s" % self.name
new_instance.save()
for check in checks:
check.duplicate(inst_set=[new_instance,], serv_set=check.service_set.all())
check.duplicate(inst_set=[new_instance], serv_set=())
return new_instance.pk
@ -281,12 +283,6 @@ class Instance(CheckGroupMixin):
)
self.snapshot.save()
self.save()
if not (self.overall_status == Service.PASSING_STATUS and self.old_overall_status == Service.PASSING_STATUS):
self.alert()
def alert():
return
#We don't want alerts for instances
class Meta:
ordering = ['name']
@ -302,9 +298,11 @@ class Instance(CheckGroupMixin):
def active_icmp_status_checks(self):
return self.icmp_status_checks().filter(active=True)
class Snapshot(models.Model):
class Meta:
abstract = True
class ServiceStatusSnapshot(models.Model):
service = models.ForeignKey(Service, related_name='snapshots')
time = models.DateTimeField(db_index=True)
num_checks_active = models.IntegerField(default=0)
num_checks_passing = models.IntegerField(default=0)
@ -312,17 +310,14 @@ class ServiceStatusSnapshot(models.Model):
overall_status = models.TextField(default=Service.PASSING_STATUS)
did_send_alert = models.IntegerField(default=False)
class ServiceStatusSnapshot(Snapshot):
service = models.ForeignKey(Service, related_name='snapshots')
def __unicode__(self):
return u"%s: %s" % (self.service.name, self.overall_status)
class InstanceStatusSnapshot(models.Model):
class InstanceStatusSnapshot(Snapshot):
instance = models.ForeignKey(Instance, related_name='snapshots')
time = models.DateTimeField(db_index=True)
num_checks_active = models.IntegerField(default=0)
num_checks_passing = models.IntegerField(default=0)
num_checks_failing = models.IntegerField(default=0)
overall_status = models.TextField(default=Service.PASSING_STATUS)
did_send_alert = models.IntegerField(default=False)
def __unicode__(self):
return u"%s: %s" % (self.instance.name, self.overall_status)
@ -360,7 +355,7 @@ class StatusCheck(PolymorphicModel):
null=True,
help_text='Number of successive failures permitted before check will be marked as failed. Default is 0, i.e. fail on first failure.'
)
created_by = models.ForeignKey(User, null=True)
created_by = models.ForeignKey(User, null=True)
calculated_status = models.CharField(
max_length=50, choices=Service.STATUSES, default=Service.CALCULATED_PASSING_STATUS, blank=True)
last_run = models.DateTimeField(null=True)
@ -435,11 +430,11 @@ class StatusCheck(PolymorphicModel):
return self.name
def recent_results(self):
return self.statuscheckresult_set.all().order_by('-time_complete').defer('raw_data')[:10]
return StatusCheckResult.objects.filter(check=self).order_by('-time_complete').defer('raw_data')[:10]
def last_result(self):
try:
return self.recent_results()[0]
return StatusCheckResult.objects.filter(check=self).order_by('-time_complete').defer('raw_data')[0]
except:
return None
@ -447,14 +442,13 @@ class StatusCheck(PolymorphicModel):
start = timezone.now()
try:
result = self._run()
except SoftTimeLimitExceeded as e:
result = StatusCheckResult(check=self)
result.error = u'Error in performing check: Celery soft time limit exceeded'
result.succeeded = False
except Exception as e:
result = StatusCheckResult(check=self)
result.error = u'Error in performing check: %s' % (e,)
if result.error.startswith("Error in performing check: get() returned more than one Instance"):
first_instance = self.instance_set.all().order_by('id')[0]
self.instance_set = [first_instance]
first_instance_link = '<a href="%s">' % reverse('instance', kwargs={'pk': first_instance.pk}) + first_instance.name + "</a>"
result.error = "Error: This type of check can only be attached to one instance. All instances, apart from the oldest one (%s), have been detached from this check. The check will run normally next time." % first_instance_link
result.succeeded = False
finish = timezone.now()
result.time = start
@ -470,25 +464,50 @@ class StatusCheck(PolymorphicModel):
raise NotImplementedError('Subclasses should implement')
def save(self, *args, **kwargs):
recent_results = self.recent_results()
if calculate_debounced_passing(recent_results, self.debounce):
self.calculated_status = Service.CALCULATED_PASSING_STATUS
if self.pk:
# This should not be necessary
with transaction.commit_manually():
try:
recent_results = list(self.recent_results())
if calculate_debounced_passing(recent_results, self.debounce):
self.calculated_status = Service.CALCULATED_PASSING_STATUS
else:
self.calculated_status = Service.CALCULATED_FAILING_STATUS
self.cached_health = serialize_recent_results(recent_results)
transaction.commit()
except SoftTimeLimitExceeded as e:
# Something weird with postgres
transaction.rollback()
logger.error('Celery time limit exceeded for getting results for %s' % self.pk)
self.calculated_status = Service.CALCULATED_FAILING_STATUS
self.cached_health = '-1'
except Exception as e:
transaction.rollback()
logger.error('Got exception when saving check: %s' % e)
self.calculated_status = Service.CALCULATED_FAILING_STATUS
self.cached_health = '-1'
try:
updated = StatusCheck.objects.get(pk=self.pk)
except StatusCheck.DoesNotExist as e:
logger.error('Cannot find myself (check %s) in the database, presumably have been deleted' % self.pk)
return
else:
self.calculated_status = Service.CALCULATED_FAILING_STATUS
self.cached_health = serialize_recent_results(recent_results)
self.cached_health = ''
self.calculated_status = Service.CALCULATED_PASSING_STATUS
ret = super(StatusCheck, self).save(*args, **kwargs)
# Update linked services
self.update_related_services()
self.update_related_instances()
return ret
def duplicate(self, inst_set=[None,], serv_set=[None,]):
def duplicate(self, inst_set=None, serv_set=None):
new_check = self
new_check.pk = None
new_check.id = None
new_check.save()
new_check.instance_set = inst_set
new_check.service_set = serv_set
if inst_set is not None:
new_check.instance_set = inst_set
if serv_set is not None:
new_check.service_set = serv_set
new_check.save()
return new_check.pk
@ -500,7 +519,7 @@ class StatusCheck(PolymorphicModel):
def update_related_instances(self):
instances = self.instance_set.all()
for instance in instances:
update_service.delay(instance.id)
update_instance.delay(instance.id)
class ICMPStatusCheck(StatusCheck):
@ -516,7 +535,7 @@ class ICMPStatusCheck(StatusCheck):
instances = self.instance_set.all()
target = self.instance_set.get().address
#We need to read both STDOUT and STDERR because ping can write to both, depending on the kind of error. Thanks a lot, ping.
# We need to read both STDOUT and STDERR because ping can write to both, depending on the kind of error. Thanks a lot, ping.
ping_process = subprocess.Popen("ping -c 1 " + target, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=True)
response = ping_process.wait()

View File

@ -71,15 +71,17 @@ def update_service(service_or_id):
service = service_or_id
service.update_status()
@task(ignore_result=True)
def update_instance(instance_or_id):
from .models import Instance
if not isinstance(instance_or_id, Service):
if not isinstance(instance_or_id, Instance):
instance = Instance.objects.get(id=instance_or_id)
else:
instance = instance_or_id
instance.update_status()
@task(ignore_result=True)
def update_shifts():
from .models import update_shifts as _update_shifts

View File

@ -85,6 +85,7 @@ class StatusCheckResultDetailView(LoginRequiredMixin, DetailView):
model = StatusCheckResult
context_object_name = 'result'
class SymmetricalForm(forms.ModelForm):
symmetrical_fields = () # Iterable of 2-tuples (field, model)
@ -115,7 +116,9 @@ base_widgets = {
class StatusCheckForm(SymmetricalForm):
symmetrical_fields = ('service_set', 'instance_set')
service_set = forms.ModelMultipleChoiceField(
queryset=Service.objects.all(),
required=False,
@ -140,6 +143,7 @@ class StatusCheckForm(SymmetricalForm):
)
)
class GraphiteStatusCheckForm(StatusCheckForm):
class Meta:
@ -184,6 +188,7 @@ class ICMPStatusCheckForm(StatusCheckForm):
)
widgets = dict(**base_widgets)
class HttpStatusCheckForm(StatusCheckForm):
class Meta:
@ -244,6 +249,7 @@ class UserProfileForm(forms.ModelForm):
model = UserProfile
exclude = ('user',)
class InstanceForm(SymmetricalForm):
symmetrical_fields = ('service_set',)
@ -291,8 +297,6 @@ class InstanceForm(SymmetricalForm):
return ret
class ServiceForm(forms.ModelForm):
class Meta:
@ -523,6 +527,7 @@ class InstanceListView(LoginRequiredMixin, ListView):
def get_queryset(self):
return Instance.objects.all().order_by('name').prefetch_related('status_checks')
class ServiceListView(LoginRequiredMixin, ListView):
model = Service
context_object_name = 'services'
@ -629,12 +634,14 @@ class ServiceDeleteView(LoginRequiredMixin, DeleteView):
context_object_name = 'service'
template_name = 'cabotapp/service_confirm_delete.html'
class InstanceDeleteView(LoginRequiredMixin, DeleteView):
model = Instance
success_url = reverse_lazy('instances')
context_object_name = 'instance'
template_name = 'cabotapp/instance_confirm_delete.html'
class ShiftListView(LoginRequiredMixin, ListView):
model = Shift
context_object_name = 'shifts'

View File

@ -6,6 +6,8 @@ CELERY_IMPORTS = ('app.cabotapp.tasks', )
CELERYBEAT_SCHEDULER = "djcelery.schedulers.DatabaseScheduler"
CELERY_TASK_SERIALIZER = "json"
CELERY_ACCEPT_CONTENT = ['json', 'msgpack', 'yaml']
CELERYD_TASK_SOFT_TIME_LIMIT = 120
CELERYD_TASK_TIME_LIMIT = 150
CELERYBEAT_SCHEDULE = {
'run-all-checks': {

View File

@ -91,6 +91,7 @@ MIDDLEWARE_CLASSES = (
'django.middleware.csrf.CsrfViewMiddleware',
'django.contrib.auth.middleware.AuthenticationMiddleware',
'django.contrib.messages.middleware.MessageMiddleware',
'django.middleware.transaction.TransactionMiddleware',
)
ROOT_URLCONF = 'app.urls'

View File

@ -18,7 +18,7 @@
</div>
{% if form.instance.id %}
<div class="col-xs-4">
<a class="btn btn-danger" href="{% url delete-service form.instance.id %}">Delete instance</a>
<a class="btn btn-danger" href="{% url delete-instance form.instance.id %}">Delete instance</a>
</div>
{% endif %}
</div>

View File

@ -1,4 +1,4 @@
Django==1.4.10
Django==1.4.13
PyJWT==0.1.2
South==0.7.6
amqp==1.3.3