Merge pull request #142 from bugsink/issue-tag-cleanup-as-vacuum

In the light of the discussion on #134, this implements the "clean up later"
solution: a vacuum task that deletes IssueTags no longer referenced by any
EventTag on the same Issue. 

In the same sweep TagValues are pruned.

Performance-wise, this is a relatively safe path forward; it can run off-hours
or not at all, depending on preferences.

Semantically it's the least clear: whether an Issue appears to be tagged may
now depend on whether vacuum has run, and `IssueTag.count` (representing total
observed) is left in a "weird place" in the sense that it may disappear from
the total more or less at random (i.e. depending on calling a command, and
depending on the current state of not-yet-evicted Events).
This commit is contained in:
Klaas van Schelven
2025-07-08 16:57:16 +02:00
committed by GitHub
5 changed files with 216 additions and 19 deletions

View File

@@ -206,29 +206,17 @@ def prune_orphans(model, d_ids_to_check):
1. the inline version is easier to reason about, it "just happens ASAP", and in the context of a given issue;
vacuum-based has to take into consideration the full DB including non-orphaned values.
2. repeated work is somewhat minimalized b/c of the IssueTag/EventTag relationship as described below.
2. repeated work is somewhat minimalized b/c of the IssueTag/EventTag relationship as described in prune_tagvalues.
"""
from tags.models import TagValue, IssueTag # avoid circular import
from tags.models import prune_tagvalues # avoid circular import
if model.__name__ != "IssueTag":
return # we only prune IssueTag orphans
ids_to_check = [d["value_id"] for d in d_ids_to_check]
ids_to_check = [d["value_id"] for d in d_ids_to_check] # d_ids_to_check: mirrors fields_for_prune_orphans(model)
# used_in_event check is not needed, because non-existence of IssueTag always implies non-existence of EventTag,
# since [1] EventTag creation implies IssueTag creation and [2] in the cleanup code EventTag is deleted first.
used_in_issue = set(
IssueTag.objects.filter(value_id__in=ids_to_check).values_list('value_id', flat=True)
)
unused = [pk for pk in ids_to_check if pk not in used_in_issue]
if unused:
TagValue.objects.filter(id__in=unused).delete()
# The principled approach would be to clean up TagKeys as well at this point, but in practice there will be orders
# of magnitude fewer TagKey objects, and they are much less likely to become dangling, so the GC-like algo of "just
# vacuuming once in a while" is a much better fit for that.
prune_tagvalues(ids_to_check)
def do_pre_delete(project_id, model, pks_to_delete, is_for_project):

View File

@@ -0,0 +1,10 @@
from django.core.management.base import BaseCommand
from tags.tasks import vacuum_eventless_issuetags
class Command(BaseCommand):
    """Management command: kick off the IssueTag vacuum via the task queue."""

    help = "Kick off tag cleanup by vacuuming IssueTag objects for which there is no EventTag equivalent"

    def handle(self, *args, **options):
        # Fire-and-forget: the task processes IssueTags in batches and reschedules itself until done,
        # so this command returns immediately after enqueueing.
        vacuum_eventless_issuetags.delay()
        self.stdout.write("Started tag vacuum via task queue.")

View File

@@ -23,6 +23,7 @@ from django.db.models import Q, F
from projects.models import Project
from tags.utils import deduce_tags, is_mostly_unique
from bugsink.moreiterutils import batched
# Notes on .project as it lives on TagValue, IssueTag and EventTag:
# In all cases, project could be derived through other means: for TagValue it's implied by TagKey.project; for IssueTag
@@ -52,6 +53,9 @@ class TagKey(models.Model):
# the obvious constraint, which doubles as a lookup index for store_tags and search.
unique_together = ('project', 'key')
def __str__(self):
return f"{self.key}"
class TagValue(models.Model):
project = models.ForeignKey(Project, blank=False, null=False, on_delete=models.DO_NOTHING)
@@ -74,7 +78,7 @@ class EventTag(models.Model):
# value already implies key in our current setup.
value = models.ForeignKey(TagValue, blank=False, null=False, on_delete=models.DO_NOTHING)
# issue is a denormalization that allows for a single-table-index for efficient search.
# issue is a denormalization that allows for a single-table-index for efficient search/vacuum_eventless_issuetags.
issue = models.ForeignKey(
'issues.Issue', blank=False, null=False, on_delete=models.DO_NOTHING, related_name="event_tags")
@@ -97,6 +101,7 @@ class EventTag(models.Model):
# for search, which filters a list of EventTag down to those matching certain values and a given issue.
# (both orderings of the (value, issue) would work for the current search query; if we ever introduce
# "search across issues" the below would work for that too (but the reverse wouldn't))
# also used by vacuum_eventless_issuetags (ORed Q(issue_id, value_id))
models.Index(fields=['value', 'issue', 'digest_order']),
]
@@ -161,6 +166,17 @@ def digest_tags(event_data, event, issue):
def store_tags(event, issue, tags):
    """Store `tags` for the given event/issue, delegating to `_store_tags` in small batches.

    Batching rationale:
    # observed: a non-batched implementation of store_tags() would crash (e.g. in sqlite: Expression tree is too large
    # (maximum depth 1000));

    The value of 64 was arrived at by trying all powers of 2 (locally, on sqlite), observing that 256 was the last
    non-failing one, and then taking a factor 4 safety-margin. Realistically, 64 tags _per event_ "should be enough
    for anyone".
    """
    for kv_batch in batched(tags.items(), 64):
        _store_tags(event, issue, dict(kv_batch))
def _store_tags(event, issue, tags):
if not tags:
return # short-circuit; which is a performance optimization which also avoids some the need for further guards
@@ -231,3 +247,19 @@ def store_tags(event, issue, tags):
IssueTag.objects.filter(value__in=tag_value_objects, issue=issue).update(
count=F('count') + 1
)
def prune_tagvalues(ids_to_check):
    """Delete those TagValue rows (given by id) that are no longer referenced by any IssueTag."""
    # A used_in_event check is not needed, because non-existence of IssueTag always implies non-existence of EventTag,
    # since [1] EventTag creation implies IssueTag creation and [2] in the cleanup code EventTag is deleted first.
    used_in_issuetag = set(
        IssueTag.objects.filter(value_id__in=ids_to_check).values_list('value_id', flat=True)
    )

    unused = [pk for pk in ids_to_check if pk not in used_in_issuetag]
    if unused:
        TagValue.objects.filter(id__in=unused).delete()

    # The principled approach would be to clean up TagKeys as well at this point, but in practice there will be orders
    # of magnitude fewer TagKey objects, and they are much less likely to become dangling, so the GC-like algo of "just
    # vacuuming once in a while" is a much better fit for that.

View File

@@ -1,7 +1,10 @@
from django.db.models import Q
from snappea.decorators import shared_task
from bugsink.moreiterutils import batched
from bugsink.transaction import immediate_atomic, delay_on_commit
from tags.models import TagValue, TagKey, EventTag, IssueTag
from tags.models import TagValue, TagKey, EventTag, IssueTag, _or_join, prune_tagvalues
BATCH_SIZE = 10_000
@@ -82,3 +85,73 @@ def vacuum_tagkeys(min_id=0):
# Defer next batch
vacuum_tagkeys.delay(ids_to_check[-1])
@shared_task
def vacuum_eventless_issuetags(min_id=0):
    """Vacuum task: delete IssueTag rows no longer backed by any EventTag on the same Issue.

    Works through the IssueTag table in id-order batches starting just above `min_id`, and
    reschedules itself with the last-seen id until the table is exhausted.
    """
    # This task removes IssueTag entries that are no longer referenced by any EventTag for an Event on the same Issue.
    #
    # Under normal operation, we evict Events and their EventTags. However, we do not track how many EventTags back
    # an IssueTag, so we have historically chosen not to clean up IssueTags during Event deletion. (see #134)
    #
    # This has the upside of being cheap and preserving all known values for an Issue (e.g. all environments/releases
    # ever seen). But it comes with downsides:
    #
    # * stale IssueTags remain for deleted Events
    # * search-by-tag may return Issues without matching Events
    # * TagValues will not be vacuumed as long as they're still referenced by an IssueTag
    #
    # This task aims to reconcile that, in a delayed and resumable fashion.

    # Empirically determined: at this size, each batch is approx .3s (local dev, sqlite); Note that we're "nearer to the
    # edge of the object-graph" than for e.g. event-retention, so we can both afford bigger batches (less cascading
    # effects per item) as well as need bigger batches (because there are more expected items in a fanning-out
    # object-graph).
    BATCH_SIZE = 2048

    # Community wisdom (says ChatGPT, w/o source): queries with dozens of OR clauses can slow down significantly. 64 is
    # a safe batch size that avoids planner overhead and keeps things fast across databases.
    INNER_BATCH_SIZE = 64

    with immediate_atomic():
        issue_tag_infos = list(
            IssueTag.objects
            .filter(id__gt=min_id)
            .order_by('id')
            .values('id', 'issue_id', 'value_id')[:BATCH_SIZE]
        )

        for issue_tag_infos_batch in batched(issue_tag_infos, INNER_BATCH_SIZE):
            # One OR-ed query per inner batch; backed by the (value, issue, digest_order) index on EventTag.
            matching_eventtags = _or_join([
                Q(issue_id=it['issue_id'], value_id=it['value_id']) for it in issue_tag_infos_batch
            ])

            if matching_eventtags:
                in_use_issue_value_pairs = set(
                    EventTag.objects
                    .filter(matching_eventtags)
                    .values_list('issue_id', 'value_id')
                )
            else:
                in_use_issue_value_pairs = set()

            stale_issuetags = [
                it
                for it in issue_tag_infos_batch
                if (it['issue_id'], it['value_id']) not in in_use_issue_value_pairs
            ]

            if stale_issuetags:
                IssueTag.objects.filter(id__in=[it['id'] for it in stale_issuetags]).delete()

                # inline pruning of TagValue (as opposed to using "vacuum later") following the same reasoning as in
                # prune_orphans.
                prune_tagvalues([it['value_id'] for it in stale_issuetags])

        if not issue_tag_infos:
            # We don't have a continuation for the "done" case. One could argue: kick off vacuum_tagvalues there, but
            # I'd rather build the toolbox of cleanup tasks first and see how they might fit together later. Because
            # the downside of triggering the next vacuum command would be that "more things might happen too soon".
            return

        # NOTE(review): self-reschedule happens via plain .delay while still inside immediate_atomic — confirm the
        # task queue defers execution until after commit (delay_on_commit is imported in this module for that purpose).
        vacuum_eventless_issuetags.delay(issue_tag_infos[-1]['id'])

View File

@@ -1,14 +1,16 @@
from unittest import TestCase as RegularTestCase
from django.test import TestCase as DjangoTestCase
from bugsink.test_utils import TransactionTestCase25251 as TransactionTestCase
from projects.models import Project
from issues.factories import get_or_create_issue, denormalized_issue_fields
from events.factories import create_event, create_event_data
from issues.models import Issue
from .models import store_tags, EventTag
from .models import store_tags, EventTag, IssueTag, TagValue
from .utils import deduce_tags
from .search import search_events, search_issues, parse_query, search_events_optimized
from .tasks import vacuum_eventless_issuetags
class DeduceTagsTestCase(RegularTestCase):
@@ -119,6 +121,23 @@ class StoreTagsTestCase(DjangoTestCase):
self.assertEqual(self.issue.tags.first().count, 2)
self.assertEqual(self.issue.tags.first().value.key.key, "foo")
    def test_store_same_tag_on_two_issues_creates_two_issuetags(self):
        # The same key/value stored on two different Issues must yield two separate IssueTag rows (one per Issue).
        store_tags(self.event, self.issue, {"foo": "bar"})

        other_issue, _ = get_or_create_issue(self.project, event_data=create_event_data("other_issue"))
        other_event = create_event(self.project, issue=other_issue)
        store_tags(other_event, other_issue, {"foo": "bar"})

        self.assertEqual(IssueTag.objects.count(), 2)
        self.assertEqual(2, IssueTag.objects.filter(value__key__key="foo").count())
    def test_store_many_tags(self):
        # observed: a non-batched implementation of store_tags() would crash (e.g. in sqlite: Expression tree is too
        # large (maximum depth 1000)); if the below doesn't crash, we've got a batched implementation that works
        event = create_event(self.project, issue=self.issue)
        store_tags(event, self.issue, {f"key-{i}": f"value-{i}" for i in range(512)})
        self.assertEqual(IssueTag.objects.filter(issue=self.issue).count(), 512)
class SearchParserTestCase(RegularTestCase):
@@ -247,3 +266,78 @@ class SearchTestCase(DjangoTestCase):
def test_search_issues(self):
self._test_search(lambda query: search_issues(self.project, Issue.objects.all(), query))
class VacuumEventlessIssueTagsTestCase(TransactionTestCase):
    """Tests for the vacuum_eventless_issuetags task."""
    # Note: this test depends on EAGER mode in both the setup (delete_deferred to trigger cascading deletes) and the
    # testing of the thing under test (vacuum_eventless_issuetags).

    def setUp(self):
        self.project = Project.objects.create(name="T")
        self.issue, _ = get_or_create_issue(self.project)

    def test_no_eventtags_means_vacuum(self):
        event = create_event(self.project, issue=self.issue)
        store_tags(event, self.issue, {"foo": "bar"})
        event.delete_deferred()

        self.assertEqual(IssueTag.objects.count(), 1)

        vacuum_eventless_issuetags()

        # in the above we deleted EventTag; implies 0 after-vacuum
        self.assertEqual(IssueTag.objects.count(), 0)

    def test_one_eventtag_preserves_issuetag(self):
        event = create_event(self.project, issue=self.issue)
        store_tags(event, self.issue, {"foo": "bar"})

        self.assertEqual(IssueTag.objects.count(), 1)

        vacuum_eventless_issuetags()

        # in the above we did not delete EventTag; implies 1 after-vacuum
        self.assertEqual(IssueTag.objects.count(), 1)

    def test_other_event_same_tag_same_issue_preserves(self):
        event1 = create_event(self.project, issue=self.issue)
        event2 = create_event(self.project, issue=self.issue)
        store_tags(event1, self.issue, {"foo": "bar"})
        store_tags(event2, self.issue, {"foo": "bar"})
        event1.delete_deferred()

        self.assertEqual(IssueTag.objects.count(), 1)

        vacuum_eventless_issuetags()

        # we deleted the EventTag for event1, but since event2 has the same tag, it should be preserved on the Issue
        self.assertEqual(IssueTag.objects.count(), 1)

    def test_other_event_same_tag_other_issue_does_not_preserve(self):
        event1 = create_event(self.project, issue=self.issue)
        store_tags(event1, self.issue, {"foo": "bar"})

        other_issue, _ = get_or_create_issue(self.project, event_data=create_event_data("other_issue"))
        event2 = create_event(self.project, issue=other_issue)
        store_tags(event2, other_issue, {"foo": "bar"})
        event1.delete_deferred()

        self.assertEqual(IssueTag.objects.filter(issue=self.issue).count(), 1)

        vacuum_eventless_issuetags()

        self.assertEqual(IssueTag.objects.filter(issue=self.issue).count(), 0)

    def test_many_tags_spanning_chunks(self):
        event = create_event(self.project, issue=self.issue)
        store_tags(event, self.issue, {f"key-{i}": f"value-{i}" for i in range(2048 + 1)})  # bigger than BATCH_SIZE

        # check setup: all issue tags are there
        self.assertEqual(IssueTag.objects.filter(issue=self.issue).count(), 2048 + 1)

        event.delete_deferred()

        vacuum_eventless_issuetags()

        # all tags should be gone after vacuum
        self.assertEqual(IssueTag.objects.filter(issue=self.issue).count(), 0)

    def test_tagvalue_is_pruned(self):
        event = create_event(self.project, issue=self.issue)
        store_tags(event, self.issue, {"foo": "bar"})
        event.delete_deferred()

        vacuum_eventless_issuetags()

        self.assertEqual(TagValue.objects.all().count(), 0)