Event (tag) search: performance improvement

Done by denormalizing the issue onto EventTag (EventTag.issue) and including that field in an index; a condensed sketch of the model change follows the list. Targets:

* get-event-within-query (when it's 'last' or 'first')
* .count (of search query results)
* min/max (for the first/prev/next/last buttons)
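
The model change, condensed (the full version is in the models.py hunk of this diff; only the "tags.TagValue" string label is an assumption here, the real module references the TagValue class directly, and other fields such as project are omitted):

```python
from django.db import models


class EventTag(models.Model):
    # pre-existing fields, shown for context ("tags.TagValue" label assumed)
    value = models.ForeignKey("tags.TagValue", blank=False, null=False, on_delete=models.CASCADE)
    event = models.ForeignKey("events.Event", blank=False, null=False, on_delete=models.DO_NOTHING,
                              related_name="tags")

    # new: the denormalized issue, so that a single-table index can answer
    # "events with tag value X within issue Y"
    issue = models.ForeignKey("issues.Issue", blank=False, null=True, on_delete=models.SET_NULL,
                              related_name="event_tags")

    class Meta:
        unique_together = ("value", "event")
        indexes = [
            models.Index(fields=["event"]),           # pre-existing: lookups by event
            models.Index(fields=["value", "issue"]),  # new: the search index
        ]
```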

(The min/max query's performance was significantly improved by the addition of the index, but the
query was also rewritten as a simple SELECT with ORDER BY/LIMIT 1 rather than a MIN/MAX aggregate;
a before/after sketch follows.)
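
The before/after shape of that query, simplified from the _has_next_prev hunk below (event_qs stands for the issue's, possibly search-filtered, Event queryset):

```python
from django.db.models import Min, Max

# event_qs: the issue's (possibly search-filtered) Event queryset

# before: a single aggregate over the queryset
d = event_qs.aggregate(lo=Min("digest_order"), hi=Max("digest_order"))

# after: two single-row lookups, one from each end of the ordering
# (per the commit: roughly 100x faster than the aggregate, on SQLite at least)
first = event_qs.order_by("digest_order").values("digest_order").first()
last = event_qs.order_by("-digest_order").values("digest_order").first()
```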

When this code was written, I thought I had spectacularly improved performance.
I now believe this was based on an error in my measurements, but that this
still represents (mostly) an improvement, so I'll let it stand and will take
it from here in subsequent commits.
Author: Klaas van Schelven
Date:   2025-03-11 12:53:59 +01:00
Parent: 0358af9a59
Commit: b031792784

5 changed files with 115 additions and 53 deletions

View File

@@ -2,7 +2,7 @@ from collections import namedtuple
import json
import sentry_sdk
from django.db.models import Min, Max, Q
from django.db.models import Q
from django.utils import timezone
from django.shortcuts import render, get_object_or_404, redirect
from django.http import HttpResponseRedirect, HttpResponseNotAllowed
@@ -333,9 +333,7 @@ def issue_event_stacktrace(request, issue, event_pk=None, digest_order=None, nav
if request.method == "POST":
return _handle_post(request, issue)
event_qs = Event.objects.filter(issue=issue)
if request.GET.get("q"):
event_qs = search_events(issue.project, event_qs, request.GET["q"])
event_qs = search_events(issue.project, issue, request.GET.get("q", ""))
try:
event = _get_event(event_qs, event_pk, digest_order, nav)
@@ -397,7 +395,7 @@ def issue_event_stacktrace(request, issue, event_pk=None, digest_order=None, nav
def issue_event_404(request, issue, event_qs, tab, this_view):
"""If the Event is 404, but the issue is not, we can still show the issue page; we show a message for the event"""
last_event = event_qs.last() # used for switching to an event-page (using tabs)
last_event = event_qs.order_by("digest_order").last() # used for switching to an event-page (using tabs)
return render(request, "issues/event_404.html", {
"tab": tab,
"this_view": this_view,
@@ -418,9 +416,7 @@ def issue_event_breadcrumbs(request, issue, event_pk=None, digest_order=None, na
if request.method == "POST":
return _handle_post(request, issue)
event_qs = Event.objects.filter(issue=issue)
if request.GET.get("q"):
event_qs = search_events(issue.project, event_qs, request.GET["q"])
event_qs = search_events(issue.project, issue, request.GET.get("q", ""))
try:
event = _get_event(event_qs, event_pk, digest_order, nav)
@@ -452,10 +448,16 @@ def _date_with_milis_html(timestamp):
def _has_next_prev(event, event_qs):
d = event_qs.aggregate(lo=Min("digest_order"), hi=Max("digest_order"))
# guarding against empty event_qs is not necessary, because we only get here if any event exists (because otherwise
# event would be None too)
# this was once implemented with Min/Max, but just doing 2 queries is (on sqlite at least) ~100× faster.
first = event_qs.order_by("digest_order").values("digest_order").first()
last = event_qs.order_by("-digest_order").values("digest_order").first()
return {
"has_prev": event.digest_order > d["lo"] if d.get("lo") is not None else False,
"has_next": event.digest_order < d["hi"] if d.get("hi") is not None else False,
"has_prev": event.digest_order > first["digest_order"],
"has_next": event.digest_order < last["digest_order"],
}
@@ -465,9 +467,7 @@ def issue_event_details(request, issue, event_pk=None, digest_order=None, nav=No
if request.method == "POST":
return _handle_post(request, issue)
event_qs = Event.objects.filter(issue=issue)
if request.GET.get("q"):
event_qs = search_events(issue.project, event_qs, request.GET["q"])
event_qs = search_events(issue.project, issue, request.GET.get("q", ""))
try:
event = _get_event(event_qs, event_pk, digest_order, nav)
@@ -571,8 +571,8 @@ def issue_history(request, issue):
if request.method == "POST":
return _handle_post(request, issue)
event_qs = search_events(issue.project, issue.event_set.order_by("digest_order"), request.GET.get("q", ""))
last_event = event_qs.last() # used for switching to an event-page (using tabs)
event_qs = search_events(issue.project, issue, request.GET.get("q", ""))
last_event = event_qs.order_by("digest_order").last() # used for switching to an event-page (using tabs)
return render(request, "issues/history.html", {
"tab": "history",
"project": issue.project,
@@ -590,8 +590,8 @@ def issue_tags(request, issue):
if request.method == "POST":
return _handle_post(request, issue)
event_qs = search_events(issue.project, issue.event_set.order_by("digest_order"), request.GET.get("q", ""))
last_event = event_qs.last() # used for switching to an event-page (using tabs)
event_qs = search_events(issue.project, issue, request.GET.get("q", ""))
last_event = event_qs.order_by("digest_order").last() # used for switching to an event-page (using tabs)
return render(request, "issues/tags.html", {
"tab": "tags",
"project": issue.project,
@@ -609,8 +609,8 @@ def issue_grouping(request, issue):
if request.method == "POST":
return _handle_post(request, issue)
event_qs = search_events(issue.project, issue.event_set.order_by("digest_order"), request.GET.get("q", ""))
last_event = event_qs.last() # used for switching to an event-page (using tabs)
event_qs = search_events(issue.project, issue, request.GET.get("q", ""))
last_event = event_qs.order_by("digest_order").last() # used for switching to an event-page (using tabs)
return render(request, "issues/grouping.html", {
"tab": "grouping",
"project": issue.project,
@@ -628,12 +628,11 @@ def issue_event_list(request, issue):
if request.method == "POST":
return _handle_post(request, issue)
event_list = issue.event_set.order_by("digest_order")
if "q" in request.GET:
event_list = search_events(issue.project, event_list, request.GET["q"])
event_list = search_events(issue.project, issue, request.GET["q"]).order_by("digest_order")
paginator = Paginator(event_list, 250) # might as well use Paginator; the cost of .count() is incurred anyway
else:
event_list = issue.event_set.order_by("digest_order")
# re 250: in general "big is good" because it allows a lot "at a glance".
paginator = KnownCountPaginator(event_list, 250, count=issue.stored_event_count)
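
Not part of this diff, but for context: KnownCountPaginator is Bugsink-internal; a minimal sketch, assuming it is nothing more than a Paginator that takes a precomputed count (here issue.stored_event_count) so that no COUNT(*) query is issued:

```python
from django.core.paginator import Paginator


class KnownCountPaginator(Paginator):
    """Sketch (assumption): a Paginator that trusts a caller-supplied count."""

    def __init__(self, object_list, per_page, count, **kwargs):
        super().__init__(object_list, per_page, **kwargs)
        self._known_count = count

    @property
    def count(self):
        # overrides Paginator.count, which would otherwise run COUNT(*) (or len())
        # on the object_list
        return self._known_count
```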

View File

@@ -8,8 +8,8 @@ class Migration(migrations.Migration):
dependencies = [
("issues", "0012_alter_issue_calculated_type_and_more"),
("projects", "0011_fill_stored_event_count"),
("events", "0019_event_storage_backend"),
("projects", "0011_fill_stored_event_count"),
]
operations = [
@@ -138,6 +138,15 @@ class Migration(migrations.Migration):
to="events.event",
),
),
(
"issue",
models.ForeignKey(
null=True,
on_delete=django.db.models.deletion.SET_NULL,
related_name="event_tags",
to="issues.issue",
),
),
(
"project",
models.ForeignKey(
@@ -157,7 +166,10 @@ class Migration(migrations.Migration):
"indexes": [
models.Index(
fields=["event"], name="tags_eventt_event_i_ac6453_idx"
)
),
models.Index(
fields=["value", "issue"], name="tags_eventt_value_i_255b9c_idx"
),
],
"unique_together": {("value", "event")},
},

View File

@@ -74,6 +74,11 @@ class EventTag(models.Model):
# value already implies key in our current setup.
value = models.ForeignKey(TagValue, blank=False, null=False, on_delete=models.CASCADE)
# issue is a denormalization that allows for a single-table-index for efficient search.
# SET_NULL: Issue deletion is not actually possible yet, so this is moot (for now).
issue = models.ForeignKey(
'issues.Issue', blank=False, null=True, on_delete=models.SET_NULL, related_name="event_tags")
# DO_NOTHING: we manually implement CASCADE (i.e. when an event is cleaned up, clean up associated tags) in the
# eviction process. Why CASCADE? [1] you'll have to do it "at some point", so you might as well do it right when
# evicting (async in the 'most resilient setup' anyway, b/c that happens when ingesting) [2] the order of magnitude
@@ -81,11 +86,16 @@ class EventTag(models.Model):
event = models.ForeignKey('events.Event', blank=False, null=False, on_delete=models.DO_NOTHING, related_name='tags')
class Meta:
# This is the obvious constraint, which doubles as a lookup index for search (a big OR-match on value).
# This is the obvious constraint
unique_together = ('value', 'event')
indexes = [
models.Index(fields=['event']), # for lookups by event (for event-details page, event-deletions)
# for search, which filters a list of EventTag down to those matching certain values and a given issue.
# (both orderings of the index would work for the current search query; if we ever introduce "search across
# issues" the below would work for that too (but the reverse wouldn't))
models.Index(fields=['value', 'issue']),
]
@@ -168,7 +178,8 @@ def store_tags(event, issue, tags):
# tag_key, _ = TagKey.objects.get_or_create(
# project_id=event.project_id, key=key, mostly_unique=is_mostly_unique(key))
# tag_value, _ = TagValue.objects.get_or_create(project_id=event.project_id, key=tag_key, value=value)
# EventTag.objects.get_or_create(project_id=event.project_id, value=tag_value, event=event)
# EventTag.objects.get_or_create(project_id=event.project_id, value=tag_value, event=event,
# defaults={'issue': issue})
# IssueTag.objects.get_or_create(project_id=event.project_id, value=tag_value, issue=issue)
#
# # the 0-case is implied here too, which avoids some further guards in the code below
@@ -200,7 +211,12 @@ def store_tags(event, issue, tags):
Q(key=key_obj, value=tags[key_obj.key]) for key_obj in tag_key_objects]))
EventTag.objects.bulk_create([
EventTag(project_id=event.project_id, value=tag_value, event=event) for tag_value in tag_value_objects
EventTag(
project_id=event.project_id,
value=tag_value,
event=event,
issue=issue
) for tag_value in tag_value_objects
], ignore_conflicts=True)
IssueTag.objects.bulk_create([

View File

@@ -9,6 +9,7 @@ from django.db.models import Q, Subquery
from collections import namedtuple
from bugsink.moreiterutils import tuplewise
from events.models import Event
from .models import TagValue, IssueTag, EventTag
@@ -61,9 +62,9 @@ def parse_query(q):
return ParsedQuery(tags, plain_text_q)
def _search(TagClz, fk_fieldname, project, obj_list, q):
def _search(m2m_qs, fk_fieldname, project, obj_list_all, obj_list_filtered, q):
if not q:
return obj_list
return obj_list_filtered
parsed = parse_query(q)
@@ -78,13 +79,17 @@ def _search(TagClz, fk_fieldname, project, obj_list, q):
except TagValue.DoesNotExist:
# if the tag doesn't exist, we can't have any issues with it; the below short-circuit is fine, I think (I
# mean: we _could_ say "tag x is to blame" but that's not what one does generally in search, is it?
return obj_list.none()
return obj_list_all.none()
# TODO: Extensive performance testing of various choices here is necessary; in particular the choice of Subquery
# vs. joins; and the choice of a separate query to get TagValue vs. doing everything in a single big query will
# have different trade-offs _in practice_.
clauses.append(
Q(id__in=Subquery(TagClz.objects.filter(value=tag_value_obj).values_list(fk_fieldname, flat=True))))
Q(id__in=Subquery(m2m_qs.filter(value=tag_value_obj).values_list(fk_fieldname, flat=True))))
# the idea is: if there are clauses from m2m tags, the filter (on Issue, for Events) is implied by those, and not
# involving the issue on the base_qs is more efficient; if there aren't any clauses, that filter is necessary though.
obj_list = obj_list_all if clauses else obj_list_filtered
# this is really TSTTCPW (or more like a "fake it till you make it" thing); but I'd rather "have something" and then
# have really-good-search than to have either nothing at all, or half-baked search. Note that we didn't even bother
@@ -111,8 +116,15 @@ def _search(TagClz, fk_fieldname, project, obj_list, q):
def search_issues(project, issue_list, q):
return _search(IssueTag, "issue_id", project, issue_list, q)
return _search(IssueTag.objects.all(), "issue_id", project, issue_list, issue_list, q)
def search_events(project, event_list, q):
return _search(EventTag, "event_id", project, event_list, q)
def search_events(project, issue, q):
return _search(
EventTag.objects.filter(issue=issue),
"event_id",
project,
Event.objects.all(),
Event.objects.filter(issue=issue),
q,
)
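
Concretely, the obj_list_all / obj_list_filtered split means an event search now takes one of two shapes (paraphrased from the code above; issue and tag_value_obj are stand-ins, and the tags.models import path is an assumption):

```python
from django.db.models import Subquery

from events.models import Event
from tags.models import EventTag  # assumed module path

# with at least one tag clause: the issue-scope is already baked into the EventTag
# subquery (via the new denormalized EventTag.issue), so the base queryset can stay
# unfiltered (obj_list_all)
with_tag_clause = Event.objects.filter(
    id__in=Subquery(
        EventTag.objects.filter(issue=issue, value=tag_value_obj)
        .values_list("event_id", flat=True)
    )
)

# without tag clauses (plain-text-only search, or an empty query): the issue filter
# has to sit on the base queryset itself (obj_list_filtered)
without_tag_clause = Event.objects.filter(issue=issue)
```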

View File

@@ -5,9 +5,8 @@ from projects.models import Project
from issues.factories import get_or_create_issue, denormalized_issue_fields
from events.factories import create_event
from issues.models import Issue
from events.models import Event
from .models import store_tags
from .models import store_tags, EventTag
from .utils import deduce_tags
from .search import search_events, search_issues, parse_query
@@ -88,6 +87,7 @@ class StoreTagsTestCase(DjangoTestCase):
self.assertEqual(self.issue.tags.count(), 1)
self.assertEqual(self.event.tags.first().value.value, "bar")
self.assertEqual(self.event.tags.first().issue, self.issue)
self.assertEqual(self.issue.tags.first().count, 1)
self.assertEqual(self.issue.tags.first().value.key.key, "foo")
@@ -164,46 +164,69 @@ class SearchTestCase(DjangoTestCase):
def setUp(self):
self.project = Project.objects.create(name="Test Project")
# we create a single issue to group all the events under; this is not what would happen in the real world
# scenario (in which there would be some relation between the tags of issues and events), but it allows us to
# test event_search more easily (if each event is tied to a different issue, searching for tags is meaningless,
# since you always search within the context of an issue).
self.global_issue = Issue.objects.create(project=self.project, **denormalized_issue_fields())
issue_with_tags_and_text = Issue.objects.create(project=self.project, **denormalized_issue_fields())
event_with_tags_and_text = create_event(self.project, issue=issue_with_tags_and_text)
event_with_tags_and_text = create_event(self.project, issue=self.global_issue)
issue_with_tags_no_text = Issue.objects.create(project=self.project, **denormalized_issue_fields())
event_with_tags_no_text = create_event(self.project, issue=issue_with_tags_no_text)
event_with_tags_no_text = create_event(self.project, issue=self.global_issue)
store_tags(event_with_tags_and_text, issue_with_tags_and_text, {f"k-{i}": f"v-{i}" for i in range(5)})
store_tags(event_with_tags_no_text, issue_with_tags_no_text, {f"k-{i}": f"v-{i}" for i in range(5)})
# fix the EventTag objects' issue, which is broken per the non-real-world setup (see above)
EventTag.objects.all().update(issue=self.global_issue)
issue_without_tags = Issue.objects.create(project=self.project, **denormalized_issue_fields())
event_without_tags = create_event(self.project, issue=issue_without_tags)
event_without_tags = create_event(self.project, issue=self.global_issue)
for obj in [issue_with_tags_and_text, event_with_tags_and_text, issue_without_tags, event_without_tags]:
obj.calculated_type = "FindableException"
obj.calculated_value = "findable value"
obj.save()
issue_with_nothing = Issue.objects.create(project=self.project, **denormalized_issue_fields())
create_event(self.project, issue=issue_with_nothing)
Issue.objects.create(project=self.project, **denormalized_issue_fields())
create_event(self.project, issue=self.global_issue)
def _test_search(self, search_x, clz):
# we create 2 items with tags
self.assertEqual(search_x(self.project, clz.objects.all(), "k-0:v-0").count(), 2)
def _test_search(self, search_x):
# no query: all results
self.assertEqual(search_x("").count(), search_x("").model.objects.count())
# in the above, we create 2 items with tags
self.assertEqual(search_x("k-0:v-0").count(), 2)
# non-matching tag: no results
self.assertEqual(search_x(self.project, clz.objects.all(), "k-0:nosuchthing").count(), 0)
self.assertEqual(search_x("k-0:nosuchthing").count(), 0)
# findable-by-text: 2 such items
self.assertEqual(search_x(self.project, clz.objects.all(), "findable value").count(), 2)
self.assertEqual(search_x(self.project, clz.objects.all(), "FindableException").count(), 2)
self.assertEqual(search_x("findable value").count(), 2)
self.assertEqual(search_x("FindableException").count(), 2)
# non-matching text: no results
self.assertEqual(search_x(self.project, clz.objects.all(), "nosuchthing").count(), 0)
self.assertEqual(search_x(self.project, clz.objects.all(), "k-0:v-0 nosuchthing").count(), 0)
self.assertEqual(search_x("nosuchthing").count(), 0)
self.assertEqual(search_x("k-0:v-0 nosuchthing").count(), 0)
# findable-by-text, tagged: 1 such item
self.assertEqual(search_x(self.project, clz.objects.all(), "findable value k-0:v-0").count(), 1)
self.assertEqual(search_x("findable value k-0:v-0").count(), 1)
def test_search_events(self):
self._test_search(search_events, Event)
self._test_search(lambda query: search_events(self.project, self.global_issue, query))
def test_search_events_wrong_issue(self):
issue_without_events = Issue.objects.create(project=self.project, **denormalized_issue_fields())
search_x = lambda query: search_events(self.project, issue_without_events, query)
# those lines from _test_search() that had non-zero results are now expected to have 0 results
self.assertEqual(search_x("").count(), 0)
self.assertEqual(search_x("k-0:v-0").count(), 0)
self.assertEqual(search_x("findable value").count(), 0)
self.assertEqual(search_x("FindableException").count(), 0)
self.assertEqual(search_x("findable value k-0:v-0").count(), 0)
def test_search_issues(self):
self._test_search(search_issues, Issue)
self._test_search(lambda query: search_issues(self.project, Issue.objects.all(), query))