Tags; deducing tags; search on tags; WIP

This commit is contained in:
Klaas van Schelven
2025-02-27 13:12:49 +01:00
parent 4b7ed8f4ec
commit 7a19e2d277
15 changed files with 447 additions and 8 deletions

View File

@@ -39,6 +39,19 @@ def pairwise(it):
prev = current
def tuplewise(iterable):
    """Yield non-overlapping pairs from `iterable`; a trailing odd element is dropped.

    >>> list(tuplewise(range(4)))
    [(0, 1), (2, 3)]
    """
    it = iter(iterable)
    # zipping an iterator with itself pairs up consecutive elements; zip stops (dropping the
    # already-consumed odd element) as soon as the iterator is exhausted mid-pair.
    yield from zip(it, it)
def batched(iterable, n):
# itertools.batched was introduced in Python 3.12, but it's "roughly equivalent" to this:

View File

@@ -78,6 +78,7 @@ BUGSINK_APPS = [
'ingest',
'issues',
'events',
'tags',
'alerts',
'performance',

View File

@@ -132,6 +132,7 @@ def eat_your_own_dogfood(sentry_dsn, **kwargs):
"sentry",
"sentry_sdk_extensions",
"snappea",
"tags",
"teams",
"theme",
"users",

View File

@@ -59,6 +59,10 @@ def write_to_storage(event_id, parsed_data):
class Event(models.Model):
# TODO now that the Tag models are introduced, a number of the below fields are actually already stored as tags.
# At some point we should decide whether to proceed with "just as tags" or "just in the event table". Will depend on
# findings about performance (and how generic our solution with tags really is).
# Lines quoted with ">" are from the following two resources:
# https://develop.sentry.dev/sdk/event-payloads/ (supposedly more human-readable)
# https://develop.sentry.dev/sdk/event-payloads/types/ (more up-to-date and complete)
@@ -155,7 +159,7 @@ class Event(models.Model):
# them to be [yet]):
#
# > Optional. A map or list of tags for this event. Each tag must be less than 200 characters.
# tags
# tags = # implemented in tags/models.py
#
# > A list of relevant modules and their versions.
# modules =

View File

@@ -35,6 +35,7 @@ from events.retention import evict_for_max_events, should_evict
from releases.models import create_release_if_needed
from alerts.tasks import send_new_issue_alert, send_regression_alert
from compat.timestamp import format_timestamp, parse_timestamp
from tags.models import digest_tags
from .parsers import StreamingEnvelopeParser, ParseError
from .filestore import get_filename_for_event_id
@@ -389,6 +390,10 @@ class BaseIngestAPIView(View):
issue.save()
# intentionally at the end: possible future optimization is to push this out of the transaction (or even use
# a separate DB for this)
digest_tags(event_data, event, issue)
@classmethod
def count_project_periods_and_act_on_it(cls, project, now):
# returns: True if any further processing should be done.

View File

@@ -1,10 +1,11 @@
from collections import namedtuple
import json
import sentry_sdk
import re
from django.utils import timezone
from django.shortcuts import render, get_object_or_404, redirect
from django.db.models import Q
from django.db.models import Q, Subquery
from django.http import HttpResponseRedirect, HttpResponseNotAllowed
from django.utils.safestring import mark_safe
from django.template.defaultfilters import date
@@ -13,6 +14,9 @@ from django.core.exceptions import PermissionDenied
from django.http import Http404
from django.core.paginator import Paginator
from sentry.utils.safe import get_path
from bugsink.moreiterutils import tuplewise
from bugsink.decorators import project_membership_required, issue_membership_required, atomic_for_request_method
from bugsink.transaction import durable_atomic
from bugsink.period_utils import add_periods_to_datetime
@@ -22,7 +26,7 @@ from events.ua_stuff import get_contexts_enriched_with_ua
from compat.timestamp import format_timestamp
from projects.models import ProjectMembership
from sentry.utils.safe import get_path
from tags.models import TagValue, IssueTag
from .models import Issue, IssueQuerysetStateManager, IssueStateManager, TurningPoint, TurningPointKind
from .forms import CommentForm
@@ -216,6 +220,60 @@ def _issue_list_pt_1(request, project, state_filter="open"):
return project, unapplied_issue_ids
def remove_slices(s, slices_to_remove):
    """Returns s with the slices removed.

    `slices_to_remove` is an ordered list of non-overlapping (start, stop) index pairs into `s`.
    """
    # Flatten the removed spans' endpoints and bracket them with the string's own endpoints
    # (0 and None); pairing the resulting boundary list back up yields exactly the spans to keep.
    boundaries = [0]
    for span in slices_to_remove:
        boundaries.extend(span)
    boundaries.append(None)

    pairs = iter(boundaries)
    return "".join(s[keep_start:keep_stop] for keep_start, keep_stop in zip(pairs, pairs))
def _and_join(q_objects):
if len(q_objects) == 0:
# we _could_ just return Q(), but I'd force the calling location to not do this
raise ValueError("empty list of Q objects")
result = q_objects[0]
for q in q_objects[1:]:
result &= q
return result
def _search(project, issue_list, q):
    """Filter `issue_list` (a queryset of Issues for `project`) by the search string `q`.

    The simplest possible query-language that could have any value: key:value is recognized as such; the rest is "free
    text"; no support for quoting of spaces.
    """
    slices_to_remove = []
    clauses = []

    # NOTE(review): with greedy \S+ a token like "a:b:c" parses as key="a:b", value="c"; tag keys may contain
    # colons per the documented format, so this ambiguity is inherent to the mini-language — confirm acceptable.
    for match in re.finditer(r"(\S+):(\S+)", q):
        slices_to_remove.append(match.span())
        key, value = match.groups()

        try:
            tag_value_obj = TagValue.objects.get(project=project, key__key=key, value=value)
        except TagValue.DoesNotExist:
            # if the tag doesn't exist, we can't have any issues with it; the below short-circuit is fine, I think (I
            # mean: we _could_ say "tag x is to blame" but that's not what one does generally in search, is it?
            return issue_list.none()

        # TODO: Extensive performance testing of various choices here is necessary; in particular the choice of
        # Subquery vs. joins; and the choice of a separate query to get TagValue v.s. doing everything in a single big
        # query will have different trade-offs _in practice_.
        clauses.append(
            Q(id__in=Subquery(IssueTag.objects.filter(value=tag_value_obj).values_list("issue_id", flat=True))))

    # this is really TSTTCPW (or more like a "fake it till you make it" thing); but I'd rather "have something" and then
    # have really-good-search than to have either nothing at all, or half-baked search. Note that we didn't even bother
    # to set indexes on the fields we search on (nor create a single searchable field for the whole of 'title').
    plain_text_q = remove_slices(q, slices_to_remove).strip()
    if plain_text_q:
        clauses.append(Q(Q(calculated_type__icontains=plain_text_q) | Q(calculated_value__icontains=plain_text_q)))

    if not clauses:
        # q was e.g. only whitespace: truthy for the caller's `request.GET.get("q")` check, but it yields neither a
        # key:value pair nor a plain-text term. Without this guard _and_join([]) would raise ValueError.
        return issue_list

    return issue_list.filter(_and_join(clauses))
def _issue_list_pt_2(request, project, state_filter, unapplied_issue_ids):
d_state_filter = {
"open": lambda qs: qs.filter(is_resolved=False, is_muted=False),
@@ -229,12 +287,8 @@ def _issue_list_pt_2(request, project, state_filter, unapplied_issue_ids):
Issue.objects.filter(project=project)
).order_by("-last_seen")
# this is really TSTTCPW (or more like a "fake it till you make it" thing); but I'd rather "have something" and then
# have really-good-search than to have either nothing at all, or half-baked search. Note that we didn't even bother
# to set indexes on the fields we search on (nor create a single searchable field for the whole of 'title').
if request.GET.get("q"):
issue_list = issue_list.filter(
Q(calculated_type__icontains=request.GET["q"]) | Q(calculated_value__icontains=request.GET["q"]))
issue_list = _search(project, issue_list, request.GET["q"])
paginator = Paginator(issue_list, 250)
page_number = request.GET.get("page")

View File

@@ -54,6 +54,7 @@ include = [
"sentry_sdk_extensions*",
"snappea*",
"static*",
"tags*",
"teams*",
"templates*",
"theme*",

0
tags/__init__.py Normal file
View File

0
tags/admin.py Normal file
View File

6
tags/apps.py Normal file
View File

@@ -0,0 +1,6 @@
from django.apps import AppConfig
class TagsConfig(AppConfig):
    """Standard Django AppConfig for the "tags" app (no startup hooks)."""
    default_auto_field = "django.db.models.BigAutoField"
    name = "tags"

View File

@@ -0,0 +1,169 @@
# Generated by Django 4.2.19 on 2025-02-27 12:04
from django.db import migrations, models
import django.db.models.deletion
class Migration(migrations.Migration):
    # Auto-generated (Django 4.2.19) initial migration for the "tags" app: creates TagKey, TagValue,
    # IssueTag and EventTag plus their indexes/uniqueness constraints. See tags/models.py for the
    # rationale behind the fields; do not hand-edit the operations below.

    initial = True

    dependencies = [
        ("events", "0019_event_storage_backend"),
        ("issues", "0010_issue_list_indexes"),
        ("projects", "0011_fill_stored_event_count"),
    ]

    operations = [
        migrations.CreateModel(
            name="TagKey",
            fields=[
                (
                    "id",
                    models.BigAutoField(
                        auto_created=True,
                        primary_key=True,
                        serialize=False,
                        verbose_name="ID",
                    ),
                ),
                ("key", models.CharField(max_length=32)),
                (
                    "project",
                    models.ForeignKey(
                        null=True,
                        on_delete=django.db.models.deletion.SET_NULL,
                        to="projects.project",
                    ),
                ),
            ],
        ),
        migrations.CreateModel(
            name="TagValue",
            fields=[
                (
                    "id",
                    models.BigAutoField(
                        auto_created=True,
                        primary_key=True,
                        serialize=False,
                        verbose_name="ID",
                    ),
                ),
                ("value", models.CharField(db_index=True, max_length=200)),
                (
                    "key",
                    models.ForeignKey(
                        on_delete=django.db.models.deletion.CASCADE, to="tags.tagkey"
                    ),
                ),
                (
                    "project",
                    models.ForeignKey(
                        null=True,
                        on_delete=django.db.models.deletion.SET_NULL,
                        to="projects.project",
                    ),
                ),
            ],
            options={
                "unique_together": {("project", "key", "value")},
            },
        ),
        migrations.CreateModel(
            name="IssueTag",
            fields=[
                (
                    "id",
                    models.BigAutoField(
                        auto_created=True,
                        primary_key=True,
                        serialize=False,
                        verbose_name="ID",
                    ),
                ),
                (
                    "issue",
                    models.ForeignKey(
                        null=True,
                        on_delete=django.db.models.deletion.SET_NULL,
                        related_name="tags",
                        to="issues.issue",
                    ),
                ),
                (
                    "project",
                    models.ForeignKey(
                        null=True,
                        on_delete=django.db.models.deletion.SET_NULL,
                        to="projects.project",
                    ),
                ),
                (
                    "value",
                    models.ForeignKey(
                        on_delete=django.db.models.deletion.CASCADE, to="tags.tagvalue"
                    ),
                ),
            ],
        ),
        migrations.CreateModel(
            name="EventTag",
            fields=[
                (
                    "id",
                    models.BigAutoField(
                        auto_created=True,
                        primary_key=True,
                        serialize=False,
                        verbose_name="ID",
                    ),
                ),
                (
                    "event",
                    models.ForeignKey(
                        null=True,
                        on_delete=django.db.models.deletion.SET_NULL,
                        related_name="tags",
                        to="events.event",
                    ),
                ),
                (
                    "project",
                    models.ForeignKey(
                        null=True,
                        on_delete=django.db.models.deletion.SET_NULL,
                        to="projects.project",
                    ),
                ),
                (
                    "value",
                    models.ForeignKey(
                        on_delete=django.db.models.deletion.CASCADE, to="tags.tagvalue"
                    ),
                ),
            ],
        ),
        migrations.AddIndex(
            model_name="tagkey",
            index=models.Index(fields=["key"], name="tags_tagkey_key_5e8a5a_idx"),
        ),
        migrations.AlterUniqueTogether(
            name="tagkey",
            unique_together={("project", "key")},
        ),
        migrations.AddIndex(
            model_name="issuetag",
            index=models.Index(
                fields=["issue", "value"], name="tags_issuet_issue_i_f06f20_idx"
            ),
        ),
        migrations.AlterUniqueTogether(
            name="issuetag",
            unique_together={("value", "issue")},
        ),
        migrations.AlterUniqueTogether(
            name="eventtag",
            unique_together={("value", "event")},
        ),
    ]

View File

103
tags/models.py Normal file
View File

@@ -0,0 +1,103 @@
"""
Tags provide support for arbitrary key/value pairs (both strings) on Events and Issues, allowing for searching &
counting. Some notes:
* Arbitrary Tags can be set programmatically in the SDKs, which we need to support (Sentry API Compatibility).
* Some "synthetic" Tags are introduced by Bugsink itself: attributes of an Event are deduced and stored explicitly as a
Tag. The main reason to do this: stay flexible in terms of DB design and allow for generic code for searching and
counting. _However_, we don't make a commitment to any particular implementation, and if the deduce-and-store approach
turns out to be a performance bottleneck, it may be replaced. Particular notes on what we deduce are in `deduce_tags`.
https://docs.sentry.io/platforms/python/enriching-events/tags/
> Tag keys have a maximum length of 32 characters and can contain only letters (a-zA-Z), numbers (0-9), underscores (_),
> periods (.), colons (:), and dashes (-).
>
> Tag values have a maximum length of 200 characters and they cannot contain the newline (\n) character.
"""
from django.db import models
from projects.models import Project
from tags.utils import deduce_tags
class TagKey(models.Model):
    """A distinct tag key (e.g. "environment") as observed for a given project."""

    project = models.ForeignKey(Project, blank=False, null=True, on_delete=models.SET_NULL)  # SET_NULL: cleanup 'later'
    key = models.CharField(max_length=32, blank=False, null=False)  # max_length=32: Sentry's documented tag-key limit

    # I briefly considered being explicit about is_deduced; but it's annoying to store this info on the TagKey, and it's
    # probably redundant if we just come up with a list of "reserved" tags or similar.
    # is_deduced = models.BooleanField(default=False)

    class Meta:
        unique_together = ('project', 'key')

        indexes = [
            # Untested assumption: when searching, we search by TagValue[..]key__key=key, so we need an
            # index-not-prefixed-by-project too
            models.Index(fields=['key']),
        ]
class TagValue(models.Model):
    """A distinct (key, value) combination as observed for a given project."""

    project = models.ForeignKey(Project, blank=False, null=True, on_delete=models.SET_NULL)  # SET_NULL: cleanup 'later'

    # CASCADE: TBD; one argument might be: decouple deletions from events ingestion, but at least have them internally
    # consistent
    key = models.ForeignKey(TagKey, blank=False, null=False, on_delete=models.CASCADE)
    value = models.CharField(max_length=200, blank=False, null=False, db_index=True)  # 200: Sentry's tag-value limit

    class Meta:
        # matches what we do in search
        unique_together = ('project', 'key', 'value')

    def __str__(self):
        return f"{self.key.key}:{self.value}"
class EventTag(models.Model):
    """Associates a TagValue with a single Event."""

    project = models.ForeignKey(Project, blank=False, null=True, on_delete=models.SET_NULL)  # SET_NULL: cleanup 'later'

    # value already implies key in our current setup.
    value = models.ForeignKey(TagValue, blank=False, null=False, on_delete=models.CASCADE)
    event = models.ForeignKey('events.Event', blank=False, null=True, on_delete=models.SET_NULL, related_name='tags')

    class Meta:
        unique_together = ('value', 'event')
# class GroupingTag is not needed (not even for future-proofing); it would only be needed if you'd want to "unmerge"
# manually merged issues while preserving the tags/counts. I think that's in "not worth it" territory.
class IssueTag(models.Model):
    """Associates a TagValue with an Issue (i.e. with all grouped events); used by search."""

    project = models.ForeignKey(Project, blank=False, null=True, on_delete=models.SET_NULL)  # SET_NULL: cleanup 'later'

    # value already implies key in our current setup.
    value = models.ForeignKey(TagValue, blank=False, null=False, on_delete=models.CASCADE)
    issue = models.ForeignKey('issues.Issue', blank=False, null=True, on_delete=models.SET_NULL, related_name='tags')

    class Meta:
        # searching: by value, then Issue (is this so though? .qs is already on the other!)
        unique_together = ('value', 'issue')

        indexes = [
            models.Index(fields=['issue', 'value']),  # make lookups by issue (for detail pages) faster
        ]
def digest_tags(event_data, event, issue):
    """Deduce tags from `event_data` and persist them as Tag objects tied to `event` and `issue`.

    Intended to be called once per ingested event (from the ingestion transaction).
    """
    tags = deduce_tags(event_data)

    for key, value in tags.items():
        # SDK-provided tag values are not guaranteed to be strings (e.g. ints); coerce first, since slicing a
        # non-string would raise TypeError.
        # The max length of 200 is from TFM for user-provided tags. Still, we just apply it on synthetic tags as well;
        # It's a reasonably safe guess that this will not terribly confuse people, and avoids triggering errors
        # on-save. Same reasoning for clamping keys to TagKey's max_length of 32.
        key = str(key)[:32]
        value = str(value)[:200]

        # TODO just use bulk_create for each of the types of objects
        # TODO check: event.project won't trigger a query, right? it's already loaded, right?
        tag_key, _ = TagKey.objects.get_or_create(project=event.project, key=key)
        tag_value, _ = TagValue.objects.get_or_create(project=event.project, key=tag_key, value=value)

        EventTag.objects.get_or_create(project=event.project, value=tag_value, event=event)
        IssueTag.objects.get_or_create(project=event.project, value=tag_value, issue=issue)

1
tags/tests.py Normal file
View File

@@ -0,0 +1 @@
# from django.test import TestCase

81
tags/utils.py Normal file
View File

@@ -0,0 +1,81 @@
from events.ua_stuff import get_contexts_enriched_with_ua
from sentry.utils.safe import get_path
# Maps tag key -> top-level event-payload field whose value is copied into the tag verbatim.
EVENT_DATA_CONVERSION_TABLE = {
    # NOTE that "level" is not included here; Sentry puts this front and center, and although we may give it _some_
    # place in the UI, Bugsink's take is that "level: Error" in an Error Tracker is an open door/waste of space.
    "server_name": "server_name",
    "release": "release",
    "environment": "environment",
    "transaction": "transaction",
}

# Maps tag key -> (context name, field) path looked up in the ua-enriched contexts dict.
CONTEXT_CONVERSION_TABLE = {
    "trace": ("trace", "trace_id"),
    "trace.span": ("trace", "span_id"),
    "browser.name": ("browser", "name"),
    "os.name": ("os", "name"),

    # TODO probably useful, simply not done yet:
    # runtime
    # runtime.name
    # device.something
}
def deduce_user_tag(contexts):
    """Pick the single most identifying value from the "user" context, or None when absent.

    Quick & dirty / barebones: no attempt to mimic Sentry's full behavior (Sentry has a whole "EventUser" concept,
    `src/sentry/models/eventuser.py`); we just return the most relevant truthy piece of the user context.
    """
    if "user" not in contexts:
        return None

    user_context = contexts["user"]
    # in decreasing order of preference
    for field in ("id", "username", "email", "ip_address"):
        candidate = user_context.get(field)
        if candidate:
            return candidate

    return None
def deduce_tags(event_data):
    """
    Deduce tags for `event_data`; returns a dict of tag-key -> tag-value. Used as an "opportunistic" (generic) way to
    implement counting and searching. Although Sentry does something similar, we're not striving to replicate Sentry's
    behavior (tag names etc). In particular, we feel that Sentry's choices are poorly documented, the separation of
    concerns between events/contexts/tags is unclear, and some choices are straight up not so great. We'd rather think
    about what information matters ourselves.
    """
    # we start with the explicitly provided tags. Per the event payload spec these may arrive as either a map or a
    # list of [key, value] pairs; dict() normalizes both. Taking a copy (rather than the dict from event_data itself)
    # also ensures the loops below never mutate the caller's event_data. `or {}` guards against an explicit null.
    tags = dict(event_data.get('tags') or {})

    for tag_key, lookup_path in EVENT_DATA_CONVERSION_TABLE.items():
        value = get_path(event_data, lookup_path)
        if value not in [None, ""]:
            tags[tag_key] = value

    # deduce from contexts
    contexts = get_contexts_enriched_with_ua(event_data)

    for tag_key, path in CONTEXT_CONVERSION_TABLE.items():
        value = get_path(contexts, *path)
        if value not in [None, ""]:
            tags[tag_key] = value

    if "trace" in tags and "trace.span" in tags:
        tags["trace.ctx"] = f"{tags['trace']}.{tags['trace.span']}"

    # NOTE(review): "browser.version" is never deduced above (only "browser.name" is in the conversion table), so this
    # combined tag only appears when the SDK provided browser.version as an explicit tag — confirm/extend (WIP).
    if "browser.name" in tags and (browser_version := tags.get("browser.version")):
        tags["browser"] = f"{tags['browser.name']} {browser_version}"

    if user_tag := deduce_user_tag(contexts):
        tags["user"] = user_tag

    # TODO further tags that are probably useful:
    # url
    # logger
    # mechanism

    return tags