diff --git a/ingest/views.py b/ingest/views.py
index 27abaa4..bfe3eee 100644
--- a/ingest/views.py
+++ b/ingest/views.py
@@ -13,8 +13,8 @@ from rest_framework import exceptions
 from compat.auth import parse_auth_header_value
 from projects.models import Project
 
-from issues.models import Issue, IssueStateManager
-from issues.utils import get_hash_for_data
+from issues.models import Issue, IssueStateManager, Grouping
+from issues.utils import get_issue_grouper_for_data
 from issues.regressions import issue_is_regression
 
 import sentry_sdk_extensions
@@ -103,16 +103,30 @@ class BaseIngestAPIView(APIView):
         # leave this at the top -- it may involve reading from the DB which should come before any DB writing
         pc_registry = get_pc_registry()
 
-        hash_ = get_hash_for_data(event_data)
-        issue, issue_created = Issue.objects.get_or_create(
-            project=ingested_event.project,
-            hash=hash_,
-            defaults={
-                "first_seen": ingested_event.timestamp,
-                "last_seen": ingested_event.timestamp,
-                "event_count": 1,
-            },
-        )
+        grouping_key = get_issue_grouper_for_data(event_data)
+
+        if not Grouping.objects.filter(project=ingested_event.project, grouping_key=grouping_key).exists():
+            issue = Issue.objects.create(
+                project=ingested_event.project,
+                first_seen=ingested_event.timestamp,
+                last_seen=ingested_event.timestamp,
+                event_count=1,
+            )
+            # even though in our data-model a given grouping does not imply a single Issue (in fact, that's the whole
+            # point of groupings as a data-model), at creation time such an implication does exist: manual information
+            # ("this grouper is actually part of some other issue") can by definition not yet have been specified.
+            issue_created = True
+
+            grouping = Grouping.objects.create(
+                project=ingested_event.project,
+                grouping_key=grouping_key,
+                issue=issue,
+            )
+
+        else:
+            grouping = Grouping.objects.get(project=ingested_event.project, grouping_key=grouping_key)
+            issue = grouping.issue
+            issue_created = False
 
         event, event_created = Event.from_ingested(ingested_event, issue, event_data)
         if not event_created:
diff --git a/issues/admin.py b/issues/admin.py
index 10a855f..4acdab5 100644
--- a/issues/admin.py
+++ b/issues/admin.py
@@ -1,13 +1,21 @@
 from django.contrib import admin
 
-from .models import Issue
+from .models import Issue, Grouping
+
+
+class GroupingInline(admin.TabularInline):
+    model = Grouping
+    extra = 0
+    exclude = ['project']
+    readonly_fields = [
+        'grouping_key',
+    ]
 
 
 @admin.register(Issue)
 class IssueAdmin(admin.ModelAdmin):
     fields = [
         'project',
-        'hash',
         'last_seen',
         'first_seen',
         'is_resolved',
@@ -19,9 +27,12 @@
         'event_count',
     ]
 
+    inlines = [
+        GroupingInline,
+    ]
+
     list_display = [
         "title",
-        "hash",
         "project",
         "event_count",  # expensive operation as written now (query in loop)
     ]
diff --git a/issues/factories.py b/issues/factories.py
index 7fa93a2..9e1d031 100644
--- a/issues/factories.py
+++ b/issues/factories.py
@@ -4,8 +4,8 @@ from django.utils import timezone
 
 from projects.models import Project
 
-from .models import Issue
-from .utils import get_hash_for_data
+from .models import Issue, Grouping
+from .utils import get_issue_grouper_for_data
 
 
 def get_or_create_issue(project=None, event_data=None):
@@ -15,12 +15,26 @@
     if project is None:
         project = Project.objects.create(name="Test project")
 
-    hash_ = get_hash_for_data(event_data)
-    issue, issue_created = Issue.objects.get_or_create(
-        project=project,
-        hash=hash_,
-        defaults=denormalized_issue_fields(),
-    )
+    grouping_key = get_issue_grouper_for_data(event_data)
+
+    if not Grouping.objects.filter(project=project, grouping_key=grouping_key).exists():
+        issue = Issue.objects.create(
+            project=project,
+            **denormalized_issue_fields(),
+        )
+        issue_created = True
+
+        grouping = Grouping.objects.create(
+            project=project,
+            grouping_key=grouping_key,
+            issue=issue,
+        )
+
+    else:
+        grouping = Grouping.objects.get(project=project, grouping_key=grouping_key)
+        issue = grouping.issue
+        issue_created = False
+
     return issue, issue_created
 
 
diff --git a/issues/migrations/0015_remove_issue_hash_grouping.py b/issues/migrations/0015_remove_issue_hash_grouping.py
new file mode 100644
index 0000000..ad59a66
--- /dev/null
+++ b/issues/migrations/0015_remove_issue_hash_grouping.py
@@ -0,0 +1,28 @@
+# Generated by Django 4.2.11 on 2024-04-08 08:59
+
+from django.db import migrations, models
+import django.db.models.deletion
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('projects', '0006_alter_projectmembership_unique_together'),
+        ('issues', '0014_alter_issue_events_at_alter_issue_fixed_at'),
+    ]
+
+    operations = [
+        migrations.RemoveField(
+            model_name='issue',
+            name='hash',
+        ),
+        migrations.CreateModel(
+            name='Grouping',
+            fields=[
+                ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
+                ('grouping_key', models.TextField()),
+                ('issue', models.ForeignKey(null=True, on_delete=django.db.models.deletion.SET_NULL, to='issues.issue')),
+                ('project', models.ForeignKey(null=True, on_delete=django.db.models.deletion.SET_NULL, to='projects.project')),
+            ],
+        ),
+    ]
diff --git a/issues/migrations/0016_create_grouping_objects.py b/issues/migrations/0016_create_grouping_objects.py
new file mode 100644
index 0000000..16251d0
--- /dev/null
+++ b/issues/migrations/0016_create_grouping_objects.py
@@ -0,0 +1,36 @@
+import json
+from django.db import migrations
+
+from issues.utils import get_issue_grouper_for_data
+
+
+def create_grouping_objects(apps, schema_editor):
+    Issue = apps.get_model('issues', 'Issue')
+    Grouping = apps.get_model('issues', 'Grouping')
+
+    for issue in Issue.objects.all():
+        # we can do this because we know that there is at least one event, and because the events are already grouped
+        # per issue (we just need to reconstruct the grouping key as implied by the hash)
+        some_random_event = issue.event_set.first()
+        event_data = json.loads(some_random_event.data)
+
+        # we have _not_ inlined the code (even though inlining is standard good practice when writing data
+        # migrations); this data migration is just here for my own development process, so it needn't be perfect.
+        grouping_key = get_issue_grouper_for_data(event_data)
+
+        Grouping.objects.create(
+            project=issue.project,
+            issue=issue,
+            grouping_key=grouping_key,
+        )
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('issues', '0015_remove_issue_hash_grouping'),
+    ]
+
+    operations = [
+        migrations.RunPython(create_grouping_objects),
+    ]
diff --git a/issues/models.py b/issues/models.py
index a698d40..abae33b 100644
--- a/issues/models.py
+++ b/issues/models.py
@@ -17,11 +17,15 @@ class IncongruentStateException(Exception):
 
 
 class Issue(models.Model):
+    """
+    An Issue models a group of similar events. In particular: it models the result of both automatic (client-side and
+    server-side) and manual ("merge") grouping.
+ """ + id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=False) project = models.ForeignKey( "projects.Project", blank=False, null=True, on_delete=models.SET_NULL) # SET_NULL: cleanup 'later' - hash = models.CharField(max_length=32, blank=False, null=False) # denormalized fields: last_seen = models.DateTimeField(blank=False, null=False) # based on event.server_side_timestamp @@ -110,6 +114,24 @@ class Issue(models.Model): ] +class Grouping(models.Model): + """A Grouping models an automatically calculated grouping key (from the event data, with a key role for the SDK-side + fingerprint). + """ + project = models.ForeignKey( + "projects.Project", blank=False, null=True, on_delete=models.SET_NULL) # SET_NULL: cleanup 'later' + + # NOTE: I don't want to have any principled maximum on the grouping key, nor do I want to prematurely optimize the + # lookup. If lookups are slow, we _could_ examine whether manually hashing these values and matching on the hash + # helps. + grouping_key = models.TextField(blank=False, null=False) + + issue = models.ForeignKey("Issue", blank=False, null=True, on_delete=models.SET_NULL) # SET_NULL: cleanup 'later' + + def __str__(self): + return self.grouping_key + + def add_periods_to_datetime(dt, nr_of_periods, period_name): dateutil_kwargs_map = { "year": "years", diff --git a/issues/utils.py b/issues/utils.py index 0dcc5e3..2a1491c 100644 --- a/issues/utils.py +++ b/issues/utils.py @@ -1,5 +1,3 @@ -import hashlib - from sentry.eventtypes.base import DefaultEvent from sentry.eventtypes.error import ErrorEvent @@ -30,12 +28,6 @@ def get_issue_grouper_for_data(data): return default_issue_grouper(title, transaction, event_type_name) -def get_hash_for_data(data): - """Generate hash used for grouping issues (note: not a cryptographically secure hash)""" - issue_grouper = get_issue_grouper_for_data(data) - return hashlib.md5(issue_grouper.encode()).hexdigest() - - # utilities related to storing and retrieving release-versions; we use the fact that sentry (and we've adopted their # limitation) disallows the use of newlines in release-versions, so we can use newlines as a separator diff --git a/performance/management/commands/load_performance_fixture.py b/performance/management/commands/load_performance_fixture.py index e03b194..7b1c510 100644 --- a/performance/management/commands/load_performance_fixture.py +++ b/performance/management/commands/load_performance_fixture.py @@ -8,7 +8,7 @@ from django.conf import settings from performance.bursty_data import generate_bursty_data, buckets_to_points_in_time from projects.models import Project -from issues.models import Issue +from issues.models import Issue, Grouping from events.models import Event @@ -20,6 +20,7 @@ class Command(BaseCommand): raise ValueError("This command should only be run on the performance-test database") Project.objects.all().delete() + Grouping.objects.all().delete() Issue.objects.all().delete() Event.objects.all().delete() @@ -31,7 +32,14 @@ class Command(BaseCommand): issues_by_project = {} for p in projects: - issues_by_project[p.id] = [Issue.objects.create(project=p, hash="hash %d" % i) for i in range(100)] + issues_by_project[p.id] = [] + for i in range(100): + issues_by_project[p.id].append(Issue.objects.create(project=p)) + Grouping.objects.create( + project=p, + grouping_key="grouping key %d" % i, + issue=issues_by_project[p.id][i], + ) # now we have 10 projects, each with 100 issues. let's create 10k events for each project. for p in projects: