Introduce 'Grouping' data-modeling

This commit is contained in:
Klaas van Schelven
2024-04-08 11:41:15 +02:00
parent 226415ce43
commit 48307daa0f
8 changed files with 159 additions and 34 deletions

View File

@@ -13,8 +13,8 @@ from rest_framework import exceptions
from compat.auth import parse_auth_header_value
from projects.models import Project
from issues.models import Issue, IssueStateManager
from issues.utils import get_hash_for_data
from issues.models import Issue, IssueStateManager, Grouping
from issues.utils import get_issue_grouper_for_data
from issues.regressions import issue_is_regression
import sentry_sdk_extensions
@@ -103,16 +103,30 @@ class BaseIngestAPIView(APIView):
# leave this at the top -- it may involve reading from the DB which should come before any DB writing
pc_registry = get_pc_registry()
hash_ = get_hash_for_data(event_data)
issue, issue_created = Issue.objects.get_or_create(
project=ingested_event.project,
hash=hash_,
defaults={
"first_seen": ingested_event.timestamp,
"last_seen": ingested_event.timestamp,
"event_count": 1,
},
)
grouping_key = get_issue_grouper_for_data(event_data)
if not Grouping.objects.filter(project=ingested_event.project, grouping_key=grouping_key).exists():
issue = Issue.objects.create(
project=ingested_event.project,
first_seen=ingested_event.timestamp,
last_seen=ingested_event.timestamp,
event_count=1,
)
# Even though in our data-model a grouping does not imply a single Issue (that is the whole
# point of modeling groupings separately), at creation time that implication does hold: manual
# information ("this grouper actually belongs to some other issue") cannot, by definition, have
# been provided yet.
issue_created = True
grouping = Grouping.objects.create(
project=ingested_event.project,
grouping_key=grouping_key,
issue=issue,
)
else:
grouping = Grouping.objects.get(project=ingested_event.project, grouping_key=grouping_key)
issue = grouping.issue
issue_created = False
event, event_created = Event.from_ingested(ingested_event, issue, event_data)
if not event_created:

View File

@@ -1,13 +1,21 @@
from django.contrib import admin
from .models import Issue
from .models import Issue, Grouping
class GroupingInline(admin.TabularInline):
    """Inline table of the Grouping rows that point at an Issue, shown on the Issue admin page."""

    model = Grouping
    extra = 0  # no blank "add" rows: groupings are created by the ingest pipeline, not by hand
    exclude = ['project']  # presumably hidden because it duplicates the parent Issue's project — verify
    readonly_fields = [
        'grouping_key',
    ]
@admin.register(Issue)
class IssueAdmin(admin.ModelAdmin):
fields = [
'project',
'hash',
'last_seen',
'first_seen',
'is_resolved',
@@ -19,9 +27,12 @@ class IssueAdmin(admin.ModelAdmin):
'event_count',
]
inlines = [
GroupingInline,
]
list_display = [
"title",
"hash",
"project",
"event_count", # expensive operation as written now (query in loop)
]

View File

@@ -4,8 +4,8 @@ from django.utils import timezone
from projects.models import Project
from .models import Issue
from .utils import get_hash_for_data
from .models import Issue, Grouping
from .utils import get_issue_grouper_for_data
def get_or_create_issue(project=None, event_data=None):
@@ -15,12 +15,26 @@ def get_or_create_issue(project=None, event_data=None):
if project is None:
project = Project.objects.create(name="Test project")
hash_ = get_hash_for_data(event_data)
issue, issue_created = Issue.objects.get_or_create(
project=project,
hash=hash_,
defaults=denormalized_issue_fields(),
)
grouping_key = get_issue_grouper_for_data(event_data)
if not Grouping.objects.filter(project=project, grouping_key=grouping_key).exists():
issue = Issue.objects.create(
project=project,
**denormalized_issue_fields(),
)
issue_created = True
grouping = Grouping.objects.create(
project=project,
grouping_key=grouping_key,
issue=issue,
)
else:
grouping = Grouping.objects.get(project=project, grouping_key=grouping_key)
issue = grouping.issue
issue_created = False
return issue, issue_created

View File

@@ -0,0 +1,28 @@
# Generated by Django 4.2.11 on 2024-04-08 08:59
from django.db import migrations, models
import django.db.models.deletion
class Migration(migrations.Migration):
    # Schema change: drop the denormalized Issue.hash column and introduce the Grouping
    # model that takes over its role. The follow-up data migration backfills Grouping rows
    # by recomputing the grouping key from stored event data (not from the dropped hash).

    dependencies = [
        ('projects', '0006_alter_projectmembership_unique_together'),
        ('issues', '0014_alter_issue_events_at_alter_issue_fixed_at'),
    ]

    operations = [
        migrations.RemoveField(
            model_name='issue',
            name='hash',
        ),
        migrations.CreateModel(
            name='Grouping',
            fields=[
                ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
                # deliberately unbounded text — see the model definition for the rationale
                ('grouping_key', models.TextField()),
                # null=True + SET_NULL: rows are detached rather than deleted when the target goes away
                ('issue', models.ForeignKey(null=True, on_delete=django.db.models.deletion.SET_NULL, to='issues.issue')),
                ('project', models.ForeignKey(null=True, on_delete=django.db.models.deletion.SET_NULL, to='projects.project')),
            ],
            # NOTE(review): no unique constraint on (project, grouping_key), although the ingest
            # path assumes at most one Grouping per key — consider adding one in a follow-up.
        ),
    ]

View File

@@ -0,0 +1,36 @@
import json
from django.db import migrations
from issues.utils import get_issue_grouper_for_data
def create_grouping_objects(apps, schema_editor):
    """Backfill one Grouping row per pre-existing Issue.

    The grouping key is recomputed from the event data of a single event of each issue;
    this works because events were already grouped per issue, so any one event of an
    issue yields that issue's grouping key.
    """
    Issue = apps.get_model('issues', 'Issue')
    Grouping = apps.get_model('issues', 'Grouping')

    # .iterator() avoids materializing every Issue at once on larger databases.
    for issue in Issue.objects.iterator():
        # The original assumed every issue has at least one event; guard anyway so a stray
        # event-less issue does not abort the whole migration with an AttributeError.
        some_random_event = issue.event_set.first()
        if some_random_event is None:
            continue

        event_data = json.loads(some_random_event.data)

        # NOTE: get_issue_grouper_for_data is imported from live code rather than inlined,
        # although inlining is standard good practice for data migrations. Acceptable per
        # the original author: this migration only exists to support their own development
        # process and is not required to be future-proof.
        grouping_key = get_issue_grouper_for_data(event_data)

        Grouping.objects.create(
            project=issue.project,
            issue=issue,
            grouping_key=grouping_key,
        )
class Migration(migrations.Migration):
    # Data migration: backfill Grouping rows for Issues that predate the Grouping model.

    dependencies = [
        ('issues', '0015_remove_issue_hash_grouping'),
    ]

    operations = [
        # reverse_code=noop keeps the migration reversible; nothing needs undoing here,
        # since rolling back past 0015 drops the Grouping table (and its rows) anyway.
        migrations.RunPython(create_grouping_objects, migrations.RunPython.noop),
    ]

View File

@@ -17,11 +17,15 @@ class IncongruentStateException(Exception):
class Issue(models.Model):
"""
An Issue models a group of similar events. In particular: it models the result of both automatic (client-side and
server-side) and manual ("merge") grouping.
"""
id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=False)
project = models.ForeignKey(
"projects.Project", blank=False, null=True, on_delete=models.SET_NULL) # SET_NULL: cleanup 'later'
hash = models.CharField(max_length=32, blank=False, null=False)
# denormalized fields:
last_seen = models.DateTimeField(blank=False, null=False) # based on event.server_side_timestamp
@@ -110,6 +114,24 @@ class Issue(models.Model):
]
class Grouping(models.Model):
    """A Grouping models an automatically calculated grouping key (from the event data, with a key role for the SDK-side
    fingerprint).

    Multiple Groupings may point at the same Issue — an Issue models the result of both
    automatic and manual ("merge") grouping, which is why this is a separate model rather
    than a field on Issue.
    """

    project = models.ForeignKey(
        "projects.Project", blank=False, null=True, on_delete=models.SET_NULL)  # SET_NULL: cleanup 'later'

    # NOTE: I don't want to have any principled maximum on the grouping key, nor do I want to prematurely optimize the
    # lookup. If lookups are slow, we _could_ examine whether manually hashing these values and matching on the hash
    # helps.
    grouping_key = models.TextField(blank=False, null=False)

    # NOTE(review): there is no uniqueness constraint on (project, grouping_key), yet the ingest
    # path does filter(...).exists() followed by get(...)/create(...), assuming at most one row
    # per key. Consider a UniqueConstraint to back that assumption and close the
    # check-then-create race under concurrent ingestion.
    issue = models.ForeignKey("Issue", blank=False, null=True, on_delete=models.SET_NULL)  # SET_NULL: cleanup 'later'

    def __str__(self):
        return self.grouping_key
def add_periods_to_datetime(dt, nr_of_periods, period_name):
dateutil_kwargs_map = {
"year": "years",

View File

@@ -1,5 +1,3 @@
import hashlib
from sentry.eventtypes.base import DefaultEvent
from sentry.eventtypes.error import ErrorEvent
@@ -30,12 +28,6 @@ def get_issue_grouper_for_data(data):
return default_issue_grouper(title, transaction, event_type_name)
def get_hash_for_data(data):
"""Generate hash used for grouping issues (note: not a cryptographically secure hash)"""
issue_grouper = get_issue_grouper_for_data(data)
return hashlib.md5(issue_grouper.encode()).hexdigest()
# utilities related to storing and retrieving release-versions; we use the fact that sentry (and we've adopted their
# limitation) disallows the use of newlines in release-versions, so we can use newlines as a separator

View File

@@ -8,7 +8,7 @@ from django.conf import settings
from performance.bursty_data import generate_bursty_data, buckets_to_points_in_time
from projects.models import Project
from issues.models import Issue
from issues.models import Issue, Grouping
from events.models import Event
@@ -20,6 +20,7 @@ class Command(BaseCommand):
raise ValueError("This command should only be run on the performance-test database")
Project.objects.all().delete()
Grouping.objects.all().delete()
Issue.objects.all().delete()
Event.objects.all().delete()
@@ -31,7 +32,14 @@ class Command(BaseCommand):
issues_by_project = {}
for p in projects:
issues_by_project[p.id] = [Issue.objects.create(project=p, hash="hash %d" % i) for i in range(100)]
issues_by_project[p.id] = []
for i in range(100):
issues_by_project[p.id].append(Issue.objects.create(project=p))
Grouping.objects.create(
project=p,
grouping_key="grouping key %d" % i,
issue=issues_by_project[p.id][i],
)
# now we have 10 projects, each with 100 issues. let's create 10k events for each project.
for p in projects: