Add failure tracking fields and error handling to alert backends

Co-authored-by: vanschelven <223833+vanschelven@users.noreply.github.com>
This commit is contained in:
copilot-swe-agent[bot]
2025-07-28 14:18:07 +00:00
committed by Klaas van Schelven
parent c6ced06a2f
commit b564774f21
4 changed files with 346 additions and 15 deletions

View File

@@ -0,0 +1,43 @@
# Generated by Django 4.2.23 on 2025-07-28 14:23
from django.db import migrations, models
class Migration(migrations.Migration):
    """Add the six ``last_failure_*`` columns to ``MessagingServiceConfig``.

    Every added column is nullable (``null=True, blank=True``), so existing
    rows need no default value when the migration is applied.
    """

    dependencies = [
        ('alerts', '0002_alter_messagingserviceconfig_project'),
    ]

    operations = [
        migrations.AddField(
            model_name='messagingserviceconfig',
            name='last_failure_error_message',
            field=models.TextField(
                null=True,
                blank=True,
                help_text='Error message from the exception',
            ),
        ),
        migrations.AddField(
            model_name='messagingserviceconfig',
            name='last_failure_error_type',
            field=models.CharField(
                max_length=100,
                null=True,
                blank=True,
                help_text="Type of error that occurred (e.g., 'requests.HTTPError')",
            ),
        ),
        migrations.AddField(
            model_name='messagingserviceconfig',
            name='last_failure_is_json',
            field=models.BooleanField(
                null=True,
                blank=True,
                help_text='Whether the response was valid JSON',
            ),
        ),
        migrations.AddField(
            model_name='messagingserviceconfig',
            name='last_failure_response_text',
            field=models.TextField(
                null=True,
                blank=True,
                help_text='Response text from the failed request',
            ),
        ),
        migrations.AddField(
            model_name='messagingserviceconfig',
            name='last_failure_status_code',
            field=models.IntegerField(
                null=True,
                blank=True,
                help_text='HTTP status code of the failed request',
            ),
        ),
        migrations.AddField(
            model_name='messagingserviceconfig',
            name='last_failure_timestamp',
            field=models.DateTimeField(
                null=True,
                blank=True,
                help_text='When the last failure occurred',
            ),
        ),
    ]

View File

@@ -12,7 +12,34 @@ class MessagingServiceConfig(models.Model):
kind = models.CharField(
    max_length=20,
    default="slack",
    choices=[("slack", "Slack (or compatible)")],
)
config = models.TextField(blank=False)

# Alert backend failure tracking.
# All six columns are nullable; clear_failure_status() resets every one of
# them to None, so "no failure recorded" is represented by NULLs.
last_failure_timestamp = models.DateTimeField(
    null=True,
    blank=True,
    help_text="When the last failure occurred",
)
last_failure_status_code = models.IntegerField(
    null=True,
    blank=True,
    help_text="HTTP status code of the failed request",
)
last_failure_response_text = models.TextField(
    null=True,
    blank=True,
    help_text="Response text from the failed request",
)
last_failure_is_json = models.BooleanField(
    null=True,
    blank=True,
    help_text="Whether the response was valid JSON",
)
last_failure_error_type = models.CharField(
    max_length=100,
    null=True,
    blank=True,
    help_text="Type of error that occurred (e.g., 'requests.HTTPError')",
)
last_failure_error_message = models.TextField(
    null=True,
    blank=True,
    help_text="Error message from the exception",
)
def get_backend(self):
    """Return the backend object that delivers messages for this config.

    Only the Slack backend is returned for now, whatever ``kind`` is set to.
    """
    # once we have multiple backends: lookup by kind.
    backend_class = SlackBackend
    return backend_class(self)
def clear_failure_status(self):
    """Clear all failure tracking fields on successful operation.

    Only the in-memory attributes are reset here; nothing is written to the
    database, so the caller has to ``save()`` afterwards.
    """
    failure_fields = (
        "last_failure_timestamp",
        "last_failure_status_code",
        "last_failure_response_text",
        "last_failure_is_json",
        "last_failure_error_type",
        "last_failure_error_message",
    )
    for field_name in failure_fields:
        setattr(self, field_name, None)
def has_recent_failure(self):
    """Tell whether a failure is currently recorded for this config.

    NOTE: despite the name, no time window is applied: any stored
    ``last_failure_timestamp`` counts, however old it is.
    """
    nothing_recorded = self.last_failure_timestamp is None
    return not nothing_recorded

View File

@@ -1,11 +1,13 @@
import json
import requests
from django.utils import timezone
from django import forms
from django.template.defaultfilters import truncatechars
from snappea.decorators import shared_task
from bugsink.app_settings import get_settings
from bugsink.transaction import immediate_atomic
from issues.models import Issue
@@ -32,8 +34,57 @@ def _safe_markdown(text):
return text.replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;").replace("*", "\\*").replace("_", "\\_")
def _store_failure_info(service_config_id, exception, response=None):
    """Record a failed delivery on the MessagingServiceConfig row.

    The timestamp, the exception's bare class name (e.g. ``"HTTPError"``, no
    module prefix) and its ``str()`` are always stored. When ``response`` is
    given, its status code, the first 2000 characters of its text and whether
    the full text parses as JSON are stored too; otherwise those three fields
    are reset to ``None``.

    A config that no longer exists is silently ignored.
    """
    # Imported locally; presumably to avoid a circular import, since the
    # models module refers to SlackBackend -- TODO confirm.
    from alerts.models import MessagingServiceConfig

    with immediate_atomic(only_if_needed=True):
        try:
            service_config = MessagingServiceConfig.objects.get(id=service_config_id)
        except MessagingServiceConfig.DoesNotExist:
            # Config was deleted while task was running
            return

        service_config.last_failure_timestamp = timezone.now()
        service_config.last_failure_error_type = type(exception).__name__
        service_config.last_failure_error_message = str(exception)

        if response is None:
            # Non-HTTP errors: there is no response to describe
            status_code = None
            body_excerpt = None
            body_is_json = None
        else:
            # Handle requests-specific errors
            status_code = response.status_code
            body_excerpt = response.text[:2000]  # Limit response text size

            # The JSON check runs on the full text, not on the stored excerpt
            try:
                json.loads(response.text)
            except ValueError:  # json.JSONDecodeError is a ValueError subclass
                body_is_json = False
            else:
                body_is_json = True

        service_config.last_failure_status_code = status_code
        service_config.last_failure_response_text = body_excerpt
        service_config.last_failure_is_json = body_is_json
        service_config.save()
def _store_success_info(service_config_id):
    """Wipe the stored failure details after a successful delivery.

    Looks the config up by id, resets its failure fields and saves it.
    A config that no longer exists is silently ignored.
    """
    from alerts.models import MessagingServiceConfig

    with immediate_atomic(only_if_needed=True):
        try:
            service_config = MessagingServiceConfig.objects.get(id=service_config_id)
        except MessagingServiceConfig.DoesNotExist:
            # Config was deleted while task was running
            return

        service_config.clear_failure_status()
        service_config.save()
@shared_task
def slack_backend_send_test_message(webhook_url, project_name, display_name):
def slack_backend_send_test_message(webhook_url, project_name, display_name, service_config_id):
# See Slack's Block Kit Builder
data = {"blocks": [
@@ -67,17 +118,35 @@ def slack_backend_send_test_message(webhook_url, project_name, display_name):
]}
result = requests.post(
webhook_url,
data=json.dumps(data),
headers={"Content-Type": "application/json"},
)
try:
result = requests.post(
webhook_url,
data=json.dumps(data),
headers={"Content-Type": "application/json"},
)
result.raise_for_status()
result.raise_for_status()
# Success - clear any previous failure status
_store_success_info(service_config_id)
except requests.RequestException as e:
# Store failure information for requests-related errors
# For HTTPError from raise_for_status(), the response is in the exception
response = getattr(e, 'response', None)
if response is None and 'result' in locals():
# Fallback: if no response in exception, try to get it from the result
response = result
_store_failure_info(service_config_id, e, response)
raise
except Exception as e:
# Store failure information for other errors
_store_failure_info(service_config_id, e)
raise
@shared_task
def slack_backend_send_alert(webhook_url, issue_id, state_description, alert_article, alert_reason, unmute_reason=None):
def slack_backend_send_alert(webhook_url, issue_id, state_description, alert_article, alert_reason, service_config_id, unmute_reason=None):
issue = Issue.objects.get(id=issue_id)
issue_url = get_settings().BASE_URL + issue.get_absolute_url()
@@ -134,13 +203,31 @@ def slack_backend_send_alert(webhook_url, issue_id, state_description, alert_art
},
]}
result = requests.post(
webhook_url,
data=json.dumps(data),
headers={"Content-Type": "application/json"},
)
try:
result = requests.post(
webhook_url,
data=json.dumps(data),
headers={"Content-Type": "application/json"},
)
result.raise_for_status()
result.raise_for_status()
# Success - clear any previous failure status
_store_success_info(service_config_id)
except requests.RequestException as e:
# Store failure information for requests-related errors
# For HTTPError from raise_for_status(), the response is in the exception
response = getattr(e, 'response', None)
if response is None and 'result' in locals():
# Fallback: if no response in exception, try to get it from the result
response = result
_store_failure_info(service_config_id, e, response)
raise
except Exception as e:
# Store failure information for other errors
_store_failure_info(service_config_id, e)
raise
class SlackBackend:
@@ -156,9 +243,10 @@ class SlackBackend:
json.loads(self.service_config.config)["webhook_url"],
self.service_config.project.name,
self.service_config.display_name,
self.service_config.id,
)
def send_alert(self, issue_id, state_description, alert_article, alert_reason, **kwargs):
slack_backend_send_alert.delay(
json.loads(self.service_config.config)["webhook_url"],
issue_id, state_description, alert_article, alert_reason, **kwargs)
issue_id, state_description, alert_article, alert_reason, self.service_config.id, **kwargs)

View File

@@ -1,14 +1,20 @@
from django.test import TestCase as DjangoTestCase
from unittest.mock import patch, Mock
import json
import requests
from django.core import mail
from django.contrib.auth import get_user_model
from django.template.loader import get_template
from django.utils import timezone
from issues.factories import get_or_create_issue
from projects.models import Project, ProjectMembership
from events.factories import create_event
from teams.models import Team, TeamMembership
from .models import MessagingServiceConfig
from .service_backends.slack import slack_backend_send_test_message, slack_backend_send_alert
from .tasks import send_new_issue_alert, send_regression_alert, send_unmute_alert, _get_users_for_email_alert
from .views import DEBUG_CONTEXTS
@@ -132,3 +138,170 @@ class TestAlertSending(DjangoTestCase):
user.send_email_alerts = True
user.save()
self.assertEqual(list(_get_users_for_email_alert(issue)), [user])
class TestSlackBackendErrorHandling(DjangoTestCase):
    """Failure bookkeeping done by the Slack tasks on MessagingServiceConfig.

    ``requests.post`` is patched in every test that sends something, so no
    network traffic takes place; the tasks are called directly as functions.
    """

    WEBHOOK_URL = "https://hooks.slack.com/test"

    def setUp(self):
        self.project = Project.objects.create(name="Test project")
        self.config = MessagingServiceConfig.objects.create(
            project=self.project,
            display_name="Test Slack",
            kind="slack",
            config=json.dumps({"webhook_url": self.WEBHOOK_URL}),
        )

    # -- helpers ----------------------------------------------------------

    @staticmethod
    def _ok_response():
        """Build a mocked 200 response whose raise_for_status() is a no-op."""
        response = Mock()
        response.status_code = 200
        response.raise_for_status.return_value = None
        return response

    @staticmethod
    def _http_error_response(status_code, text):
        """Build a mocked response whose raise_for_status() raises HTTPError."""
        response = Mock()
        response.status_code = status_code
        response.text = text

        # Create the HTTPError with response attached
        error = requests.HTTPError()
        error.response = response
        response.raise_for_status.side_effect = error
        return response

    def _send_test_message(self):
        """Run the test-message task synchronously against self.config."""
        slack_backend_send_test_message(
            self.WEBHOOK_URL,
            "Test project",
            "Test Slack",
            self.config.id,
        )

    # -- test message -----------------------------------------------------

    @patch('alerts.service_backends.slack.requests.post')
    def test_slack_test_message_success_clears_failure_status(self, mock_post):
        # A failure from an earlier attempt is on record
        self.config.last_failure_timestamp = timezone.now()
        self.config.last_failure_status_code = 500
        self.config.last_failure_response_text = "Server Error"
        self.config.save()

        mock_post.return_value = self._ok_response()

        self._send_test_message()

        # The successful send must have wiped the record
        self.config.refresh_from_db()
        self.assertIsNone(self.config.last_failure_timestamp)
        self.assertIsNone(self.config.last_failure_status_code)
        self.assertIsNone(self.config.last_failure_response_text)

    @patch('alerts.service_backends.slack.requests.post')
    def test_slack_test_message_http_error_stores_failure(self, mock_post):
        body = '{"error": "webhook_not_found"}'
        mock_post.return_value = self._http_error_response(404, body)

        # The task re-raises after recording the failure
        with self.assertRaises(requests.HTTPError):
            self._send_test_message()

        self.config.refresh_from_db()
        self.assertIsNotNone(self.config.last_failure_timestamp)
        self.assertEqual(self.config.last_failure_status_code, 404)
        self.assertEqual(self.config.last_failure_response_text, '{"error": "webhook_not_found"}')
        self.assertTrue(self.config.last_failure_is_json)
        self.assertEqual(self.config.last_failure_error_type, "HTTPError")

    @patch('alerts.service_backends.slack.requests.post')
    def test_slack_test_message_non_json_error_stores_failure(self, mock_post):
        mock_post.return_value = self._http_error_response(500, 'Internal Server Error')

        with self.assertRaises(requests.HTTPError):
            self._send_test_message()

        self.config.refresh_from_db()
        self.assertIsNotNone(self.config.last_failure_timestamp)
        self.assertEqual(self.config.last_failure_status_code, 500)
        self.assertEqual(self.config.last_failure_response_text, 'Internal Server Error')
        self.assertFalse(self.config.last_failure_is_json)

    @patch('alerts.service_backends.slack.requests.post')
    def test_slack_test_message_connection_error_stores_failure(self, mock_post):
        # The request itself fails, so there is never a response object
        mock_post.side_effect = requests.ConnectionError("Connection failed")

        with self.assertRaises(requests.ConnectionError):
            self._send_test_message()

        self.config.refresh_from_db()
        self.assertIsNotNone(self.config.last_failure_timestamp)
        self.assertIsNone(self.config.last_failure_status_code)  # No HTTP response
        self.assertIsNone(self.config.last_failure_response_text)
        self.assertIsNone(self.config.last_failure_is_json)
        self.assertEqual(self.config.last_failure_error_type, "ConnectionError")
        self.assertEqual(self.config.last_failure_error_message, "Connection failed")

    # -- alert message ----------------------------------------------------

    @patch('alerts.service_backends.slack.requests.post')
    def test_slack_alert_message_success_clears_failure_status(self, mock_post):
        # A failure from an earlier attempt is on record
        self.config.last_failure_timestamp = timezone.now()
        self.config.last_failure_status_code = 500
        self.config.save()

        issue, _ = get_or_create_issue(project=self.project)
        mock_post.return_value = self._ok_response()

        slack_backend_send_alert(
            self.WEBHOOK_URL,
            issue.id,
            "New issue",
            "a",
            "NEW",
            self.config.id,
        )

        self.config.refresh_from_db()
        self.assertIsNone(self.config.last_failure_timestamp)

    # -- model helpers ----------------------------------------------------

    def test_has_recent_failure_method(self):
        # Initially no failure
        self.assertFalse(self.config.has_recent_failure())

        # Set failure
        self.config.last_failure_timestamp = timezone.now()
        self.config.save()
        self.assertTrue(self.config.has_recent_failure())

        # Clear failure
        self.config.clear_failure_status()
        self.config.save()
        self.assertFalse(self.config.has_recent_failure())