Merge pull request #142 from bugsink/issue-tag-cleanup-as-vacuum

In the light of the discussion on #134, this implements the "clean up later"
solution: a vacuum task that deletes IssueTags no longer referenced by any
EventTag on the same Issue. 

In the same sweep TagValues are pruned.

Performance-wise, this is a relatively safe path forward; it can run off-hours
or not at all, depending on preferences.

Semantically it's the least clear: whether an Issue appears to be tagged may
now depend on whether vacuum has run, and `IssueTag.count` (representing total
observed) is left in a "weird place" in the sense that it may disappear from
the total more or less at random (i.e. depending on calling a command, and
depending on the current state of not-yet-evicted Events).
This commit is contained in:
Klaas van Schelven
2025-07-08 16:57:16 +02:00
committed by GitHub
5 changed files with 216 additions and 19 deletions

View File

@@ -206,29 +206,17 @@ def prune_orphans(model, d_ids_to_check):
1. the inline version is easier to reason about, it "just happens ASAP", and in the context of a given issue;
vacuum-based has to take into consideration the full DB including non-orphaned values.
2. repeated work is somewhat minimalized b/c of the IssueTag/EventTag relationship as described below.
2. repeated work is somewhat minimalized b/c of the IssueTag/EventTag relationship as described in prune_tagvalues.
"""
from tags.models import TagValue, IssueTag # avoid circular import
from tags.models import prune_tagvalues # avoid circular import
if model.__name__ != "IssueTag":
return # we only prune IssueTag orphans
ids_to_check = [d["value_id"] for d in d_ids_to_check]
ids_to_check = [d["value_id"] for d in d_ids_to_check] # d_ids_to_check: mirrors fields_for_prune_orphans(model)
# used_in_event check is not needed, because non-existence of IssueTag always implies non-existence of EventTag,
# since [1] EventTag creation implies IssueTag creation and [2] in the cleanup code EventTag is deleted first.
used_in_issue = set(
IssueTag.objects.filter(value_id__in=ids_to_check).values_list('value_id', flat=True)
)
unused = [pk for pk in ids_to_check if pk not in used_in_issue]
if unused:
TagValue.objects.filter(id__in=unused).delete()
# The principled approach would be to clean up TagKeys as well at this point, but in practice there will be orders
# of magnitude fewer TagKey objects, and they are much less likely to become dangling, so the GC-like algo of "just
# vacuuming once in a while" is a much better fit for that.
prune_tagvalues(ids_to_check)
def do_pre_delete(project_id, model, pks_to_delete, is_for_project):

View File

@@ -0,0 +1,10 @@
from django.core.management.base import BaseCommand
from tags.tasks import vacuum_eventless_issuetags
class Command(BaseCommand):
    """Management command: kick off the IssueTag vacuum via the task queue."""

    help = "Kick off tag cleanup by vacuuming IssueTag objects for which there is no EventTag equivalent"

    def handle(self, *args, **options):
        # Fire-and-forget: the task processes IssueTags in batches and reschedules itself until done,
        # so this command returns immediately after enqueueing.
        vacuum_eventless_issuetags.delay()
        self.stdout.write("Started tag vacuum via task queue.")

View File

@@ -23,6 +23,7 @@ from django.db.models import Q, F
from projects.models import Project
from tags.utils import deduce_tags, is_mostly_unique
from bugsink.moreiterutils import batched
# Notes on .project as it lives on TagValue, IssueTag and EventTag:
# In all cases, project could be derived through other means: for TagValue it's implied by TagKey.project; for IssueTag
@@ -52,6 +53,9 @@ class TagKey(models.Model):
# the obvious constraint, which doubles as a lookup index for store_tags and search.
unique_together = ('project', 'key')
def __str__(self):
return f"{self.key}"
class TagValue(models.Model):
project = models.ForeignKey(Project, blank=False, null=False, on_delete=models.DO_NOTHING)
@@ -74,7 +78,7 @@ class EventTag(models.Model):
# value already implies key in our current setup.
value = models.ForeignKey(TagValue, blank=False, null=False, on_delete=models.DO_NOTHING)
# issue is a denormalization that allows for a single-table-index for efficient search.
# issue is a denormalization that allows for a single-table-index for efficient search/vacuum_eventless_issuetags.
issue = models.ForeignKey(
'issues.Issue', blank=False, null=False, on_delete=models.DO_NOTHING, related_name="event_tags")
@@ -97,6 +101,7 @@ class EventTag(models.Model):
# for search, which filters a list of EventTag down to those matching certain values and a given issue.
# (both orderings of the (value, issue) would work for the current search query; if we ever introduce
# "search across issues" the below would work for that too (but the reverse wouldn't))
# also used by vacuum_eventless_issuetags (ORed Q(issue_id, value_id))
models.Index(fields=['value', 'issue', 'digest_order']),
]
@@ -161,6 +166,17 @@ def digest_tags(event_data, event, issue):
def store_tags(event, issue, tags):
    """Store `tags` for the given event/issue, delegating to `_store_tags` in small batches.

    Batching rationale:
    # observed: a non-batched implementation of store_tags() would crash (e.g. in sqlite: Expression tree is too large
    # (maximum depth 1000));

    The value of 64 was arrived at by trying all powers of 2 (locally, on sqlite), observing that 256 was the last
    non-failing one, and then taking a factor 4 safety-margin. Realistically, 64 tags _per event_ "should be enough
    for anyone".
    """
    for kv_batch in batched(tags.items(), 64):
        _store_tags(event, issue, dict(kv_batch))
def _store_tags(event, issue, tags):
if not tags:
return # short-circuit; which is a performance optimization which also avoids some the need for further guards
@@ -231,3 +247,19 @@ def store_tags(event, issue, tags):
IssueTag.objects.filter(value__in=tag_value_objects, issue=issue).update(
count=F('count') + 1
)
def prune_tagvalues(ids_to_check):
    """Delete those TagValue rows (given by id) that are no longer referenced by any IssueTag."""
    # A used_in_event check is not needed, because non-existence of IssueTag always implies non-existence of EventTag,
    # since [1] EventTag creation implies IssueTag creation and [2] in the cleanup code EventTag is deleted first.
    used_in_issuetag = set(
        IssueTag.objects.filter(value_id__in=ids_to_check).values_list('value_id', flat=True)
    )

    unused = [pk for pk in ids_to_check if pk not in used_in_issuetag]
    if unused:
        TagValue.objects.filter(id__in=unused).delete()

    # The principled approach would be to clean up TagKeys as well at this point, but in practice there will be orders
    # of magnitude fewer TagKey objects, and they are much less likely to become dangling, so the GC-like algo of "just
    # vacuuming once in a while" is a much better fit for that.

View File

@@ -1,7 +1,10 @@
from django.db.models import Q
from snappea.decorators import shared_task
from bugsink.moreiterutils import batched
from bugsink.transaction import immediate_atomic, delay_on_commit
from tags.models import TagValue, TagKey, EventTag, IssueTag
from tags.models import TagValue, TagKey, EventTag, IssueTag, _or_join, prune_tagvalues
BATCH_SIZE = 10_000
@@ -82,3 +85,73 @@ def vacuum_tagkeys(min_id=0):
# Defer next batch
vacuum_tagkeys.delay(ids_to_check[-1])
@shared_task
def vacuum_eventless_issuetags(min_id=0):
    """Vacuum task: delete IssueTag rows no longer backed by any EventTag on the same Issue.

    Works through the IssueTag table in id-order batches starting just above `min_id`, and
    reschedules itself with the last-seen id until the table is exhausted.
    """
    # This task removes IssueTag entries that are no longer referenced by any EventTag for an Event on the same Issue.
    #
    # Under normal operation, we evict Events and their EventTags. However, we do not track how many EventTags back
    # an IssueTag, so we have historically chosen not to clean up IssueTags during Event deletion. (see #134)
    #
    # This has the upside of being cheap and preserving all known values for an Issue (e.g. all environments/releases
    # ever seen). But it comes with downsides:
    #
    # * stale IssueTags remain for deleted Events
    # * search-by-tag may return Issues without matching Events
    # * TagValues will not be vacuumed as long as they're still referenced by an IssueTag
    #
    # This task aims to reconcile that, in a delayed and resumable fashion.

    # Empirically determined: at this size, each batch is approx .3s (local dev, sqlite); Note that we're "nearer to the
    # edge of the object-graph" than for e.g. event-retention, so we can both afford bigger batches (less cascading
    # effects per item) as well as need bigger batches (because there are more expected items in a fanning-out
    # object-graph).
    BATCH_SIZE = 2048

    # Community wisdom (says ChatGPT, w/o source): queries with dozens of OR clauses can slow down significantly. 64 is
    # a safe batch size that avoids planner overhead and keeps things fast across databases.
    INNER_BATCH_SIZE = 64

    with immediate_atomic():
        issue_tag_infos = list(
            IssueTag.objects
            .filter(id__gt=min_id)
            .order_by('id')
            .values('id', 'issue_id', 'value_id')[:BATCH_SIZE]
        )

        for issue_tag_infos_batch in batched(issue_tag_infos, INNER_BATCH_SIZE):
            # One OR-ed query per inner batch; backed by the (value, issue, digest_order) index on EventTag.
            matching_eventtags = _or_join([
                Q(issue_id=it['issue_id'], value_id=it['value_id']) for it in issue_tag_infos_batch
            ])

            if matching_eventtags:
                in_use_issue_value_pairs = set(
                    EventTag.objects
                    .filter(matching_eventtags)
                    .values_list('issue_id', 'value_id')
                )
            else:
                in_use_issue_value_pairs = set()

            stale_issuetags = [
                it
                for it in issue_tag_infos_batch
                if (it['issue_id'], it['value_id']) not in in_use_issue_value_pairs
            ]

            if stale_issuetags:
                IssueTag.objects.filter(id__in=[it['id'] for it in stale_issuetags]).delete()

                # inline pruning of TagValue (as opposed to using "vacuum later") following the same reasoning as in
                # prune_orphans.
                prune_tagvalues([it['value_id'] for it in stale_issuetags])

        if not issue_tag_infos:
            # We don't have a continuation for the "done" case. One could argue: kick off vacuum_tagvalues there, but
            # I'd rather build the toolbox of cleanup tasks first and see how they might fit together later. Because
            # the downside of triggering the next vacuum command would be that "more things might happen too soon".
            return

        # NOTE(review): self-reschedule happens via plain .delay while still inside immediate_atomic — confirm the
        # task queue defers execution until after commit (delay_on_commit is imported in this module for that purpose).
        vacuum_eventless_issuetags.delay(issue_tag_infos[-1]['id'])

View File

@@ -1,14 +1,16 @@
from unittest import TestCase as RegularTestCase
from django.test import TestCase as DjangoTestCase
from bugsink.test_utils import TransactionTestCase25251 as TransactionTestCase
from projects.models import Project
from issues.factories import get_or_create_issue, denormalized_issue_fields
from events.factories import create_event, create_event_data
from issues.models import Issue
from .models import store_tags, EventTag
from .models import store_tags, EventTag, IssueTag, TagValue
from .utils import deduce_tags
from .search import search_events, search_issues, parse_query, search_events_optimized
from .tasks import vacuum_eventless_issuetags
class DeduceTagsTestCase(RegularTestCase):
@@ -119,6 +121,23 @@ class StoreTagsTestCase(DjangoTestCase):
self.assertEqual(self.issue.tags.first().count, 2)
self.assertEqual(self.issue.tags.first().value.key.key, "foo")
    def test_store_same_tag_on_two_issues_creates_two_issuetags(self):
        # The same key/value stored on two different Issues must yield two separate IssueTag rows (one per Issue).
        store_tags(self.event, self.issue, {"foo": "bar"})

        other_issue, _ = get_or_create_issue(self.project, event_data=create_event_data("other_issue"))
        other_event = create_event(self.project, issue=other_issue)
        store_tags(other_event, other_issue, {"foo": "bar"})

        self.assertEqual(IssueTag.objects.count(), 2)
        self.assertEqual(2, IssueTag.objects.filter(value__key__key="foo").count())
    def test_store_many_tags(self):
        # observed: a non-batched implementation of store_tags() would crash (e.g. in sqlite: Expression tree is too
        # large (maximum depth 1000)); if the below doesn't crash, we've got a batched implementation that works
        event = create_event(self.project, issue=self.issue)
        store_tags(event, self.issue, {f"key-{i}": f"value-{i}" for i in range(512)})
        self.assertEqual(IssueTag.objects.filter(issue=self.issue).count(), 512)
class SearchParserTestCase(RegularTestCase):
@@ -247,3 +266,78 @@ class SearchTestCase(DjangoTestCase):
def test_search_issues(self):
self._test_search(lambda query: search_issues(self.project, Issue.objects.all(), query))
class VacuumEventlessIssueTagsTestCase(TransactionTestCase):
    """Tests for the vacuum_eventless_issuetags task."""
    # Note: this test depends on EAGER mode in both the setup (delete_deferred to trigger cascading deletes) and the
    # testing of the thing under test (vacuum_eventless_issuetags).

    def setUp(self):
        self.project = Project.objects.create(name="T")
        self.issue, _ = get_or_create_issue(self.project)

    def test_no_eventtags_means_vacuum(self):
        event = create_event(self.project, issue=self.issue)
        store_tags(event, self.issue, {"foo": "bar"})
        event.delete_deferred()

        self.assertEqual(IssueTag.objects.count(), 1)

        vacuum_eventless_issuetags()

        # in the above we deleted EventTag; implies 0 after-vacuum
        self.assertEqual(IssueTag.objects.count(), 0)

    def test_one_eventtag_preserves_issuetag(self):
        event = create_event(self.project, issue=self.issue)
        store_tags(event, self.issue, {"foo": "bar"})

        self.assertEqual(IssueTag.objects.count(), 1)

        vacuum_eventless_issuetags()

        # in the above we did not delete EventTag; implies 1 after-vacuum
        self.assertEqual(IssueTag.objects.count(), 1)

    def test_other_event_same_tag_same_issue_preserves(self):
        event1 = create_event(self.project, issue=self.issue)
        event2 = create_event(self.project, issue=self.issue)
        store_tags(event1, self.issue, {"foo": "bar"})
        store_tags(event2, self.issue, {"foo": "bar"})
        event1.delete_deferred()

        self.assertEqual(IssueTag.objects.count(), 1)

        vacuum_eventless_issuetags()

        # we deleted the EventTag for event1, but since event2 has the same tag, it should be preserved on the Issue
        self.assertEqual(IssueTag.objects.count(), 1)

    def test_other_event_same_tag_other_issue_does_not_preserve(self):
        event1 = create_event(self.project, issue=self.issue)
        store_tags(event1, self.issue, {"foo": "bar"})

        other_issue, _ = get_or_create_issue(self.project, event_data=create_event_data("other_issue"))
        event2 = create_event(self.project, issue=other_issue)
        store_tags(event2, other_issue, {"foo": "bar"})
        event1.delete_deferred()

        self.assertEqual(IssueTag.objects.filter(issue=self.issue).count(), 1)

        vacuum_eventless_issuetags()

        self.assertEqual(IssueTag.objects.filter(issue=self.issue).count(), 0)

    def test_many_tags_spanning_chunks(self):
        event = create_event(self.project, issue=self.issue)
        store_tags(event, self.issue, {f"key-{i}": f"value-{i}" for i in range(2048 + 1)})  # bigger than BATCH_SIZE

        # check setup: all issue tags are there
        self.assertEqual(IssueTag.objects.filter(issue=self.issue).count(), 2048 + 1)

        event.delete_deferred()

        vacuum_eventless_issuetags()

        # all tags should be gone after vacuum
        self.assertEqual(IssueTag.objects.filter(issue=self.issue).count(), 0)

    def test_tagvalue_is_pruned(self):
        event = create_event(self.project, issue=self.issue)
        store_tags(event, self.issue, {"foo": "bar"})
        event.delete_deferred()

        vacuum_eventless_issuetags()

        self.assertEqual(TagValue.objects.all().count(), 0)