diff --git a/bugsink/utils.py b/bugsink/utils.py
index 8ace562..def5795 100644
--- a/bugsink/utils.py
+++ b/bugsink/utils.py
@@ -206,29 +206,17 @@ def prune_orphans(model, d_ids_to_check):
     1. the inline version is easier to reason about, it "just happens ASAP", and in the context of a given issue;
        vacuum-based has to take into consideration the full DB including non-orphaned values.
-    2. repeated work is somewhat minimalized b/c of the IssueTag/EventTag relationship as described below.
+    2. repeated work is somewhat minimized b/c of the IssueTag/EventTag relationship as described in prune_tagvalues.
     """
-    from tags.models import TagValue, IssueTag  # avoid circular import
+    from tags.models import prune_tagvalues  # avoid circular import
 
     if model.__name__ != "IssueTag":
         return  # we only prune IssueTag orphans
 
-    ids_to_check = [d["value_id"] for d in d_ids_to_check]
+    ids_to_check = [d["value_id"] for d in d_ids_to_check]  # d_ids_to_check: mirrors fields_for_prune_orphans(model)
 
-    # used_in_event check is not needed, because non-existence of IssueTag always implies non-existince of EventTag,
-    # since [1] EventTag creation implies IssueTag creation and [2] in the cleanup code EventTag is deleted first.
-    used_in_issue = set(
-        IssueTag.objects.filter(value_id__in=ids_to_check).values_list('value_id', flat=True)
-    )
-    unused = [pk for pk in ids_to_check if pk not in used_in_issue]
-
-    if unused:
-        TagValue.objects.filter(id__in=unused).delete()
-
-    # The principled approach would be to clean up TagKeys as well at this point, but in practice there will be orders
-    # of magnitude fewer TagKey objects, and they are much less likely to become dangling, so the GC-like algo of "just
-    # vacuuming once in a while" is a much better fit for that.
+    prune_tagvalues(ids_to_check)
 
 
 def do_pre_delete(project_id, model, pks_to_delete, is_for_project):
diff --git a/tags/management/commands/vacuum_eventless_issuetags.py b/tags/management/commands/vacuum_eventless_issuetags.py
new file mode 100644
index 0000000..4c7f0d3
--- /dev/null
+++ b/tags/management/commands/vacuum_eventless_issuetags.py
@@ -0,0 +1,10 @@
+from django.core.management.base import BaseCommand
+from tags.tasks import vacuum_eventless_issuetags
+
+
+class Command(BaseCommand):
+    help = "Kick off tag cleanup by vacuuming IssueTag objects for which there is no EventTag equivalent"
+
+    def handle(self, *args, **options):
+        vacuum_eventless_issuetags.delay()
+        self.stdout.write("Started tag vacuum via task queue.")
diff --git a/tags/models.py b/tags/models.py
index 97a4eb2..37a9754 100644
--- a/tags/models.py
+++ b/tags/models.py
@@ -23,6 +23,7 @@
 from django.db.models import Q, F
 
 from projects.models import Project
 from tags.utils import deduce_tags, is_mostly_unique
+from bugsink.moreiterutils import batched
 
 
 # Notes on .project as it lives on TagValue, IssueTag and EventTag:
 # In all cases, project could be derived through other means: for TagValue it's implied by TagKey.project; for IssueTag
@@ -52,6 +53,9 @@ class TagKey(models.Model):
         # the obvious constraint, which doubles as a lookup index for store_tags and search.
         unique_together = ('project', 'key')
 
+    def __str__(self):
+        return f"{self.key}"
+
 
 class TagValue(models.Model):
     project = models.ForeignKey(Project, blank=False, null=False, on_delete=models.DO_NOTHING)
@@ -74,7 +78,7 @@ class EventTag(models.Model):
     # value already implies key in our current setup.
     value = models.ForeignKey(TagValue, blank=False, null=False, on_delete=models.DO_NOTHING)
 
-    # issue is a denormalization that allows for a single-table-index for efficient search.
+    # issue is a denormalization that allows for a single-table-index for efficient search/vacuum_eventless_issuetags.
     issue = models.ForeignKey(
         'issues.Issue', blank=False, null=False, on_delete=models.DO_NOTHING, related_name="event_tags")
 
@@ -97,6 +101,7 @@ class EventTag(models.Model):
             # for search, which filters a list of EventTag down to those matching certain values and a given issue.
             # (both orderings of the (value, issue) would work for the current search query; if we ever introduce
             # "search across issues" the below would work for that too (but the reverse wouldn't))
+            # also used by vacuum_eventless_issuetags (ORed Q(issue_id, value_id))
             models.Index(fields=['value', 'issue', 'digest_order']),
         ]
 
@@ -161,6 +166,17 @@ def digest_tags(event_data, event, issue):
 
 
 def store_tags(event, issue, tags):
+    # observed: a non-batched implementation of store_tags() would crash (e.g. in sqlite: Expression tree is too large
+    # (maximum depth 1000));
+
+    # The value of 64 was arrived at by trying all powers of 2 (locally, on sqlite), observing that 256 was the last
+    # non-failing one, and then taking a factor 4 safety-margin. Realistically, 64 tags _per event_ "should be enough
+    # for anyone".
+    for kv_batch in batched(tags.items(), 64):
+        _store_tags(event, issue, {k: v for k, v in kv_batch})
+
+
+def _store_tags(event, issue, tags):
     if not tags:
         return  # short-circuit; which is a performance optimization which also avoids some the need for further guards
 
@@ -231,3 +247,19 @@ def _store_tags(event, issue, tags):
     IssueTag.objects.filter(value__in=tag_value_objects, issue=issue).update(
         count=F('count') + 1
     )
+
+
+def prune_tagvalues(ids_to_check):
+    # used_in_event check is not needed, because non-existence of IssueTag always implies non-existence of EventTag,
+    # since [1] EventTag creation implies IssueTag creation and [2] in the cleanup code EventTag is deleted first.
+    used_in_issuetag = set(
+        IssueTag.objects.filter(value_id__in=ids_to_check).values_list('value_id', flat=True)
+    )
+    unused = [pk for pk in ids_to_check if pk not in used_in_issuetag]
+
+    if unused:
+        TagValue.objects.filter(id__in=unused).delete()
+
+    # The principled approach would be to clean up TagKeys as well at this point, but in practice there will be orders
+    # of magnitude fewer TagKey objects, and they are much less likely to become dangling, so the GC-like algo of "just
+    # vacuuming once in a while" is a much better fit for that.
diff --git a/tags/tasks.py b/tags/tasks.py
index 9d93846..13a9908 100644
--- a/tags/tasks.py
+++ b/tags/tasks.py
@@ -1,7 +1,10 @@
+from django.db.models import Q
+
 from snappea.decorators import shared_task
 
+from bugsink.moreiterutils import batched
 from bugsink.transaction import immediate_atomic, delay_on_commit
-from tags.models import TagValue, TagKey, EventTag, IssueTag
+from tags.models import TagValue, TagKey, EventTag, IssueTag, _or_join, prune_tagvalues
 
 BATCH_SIZE = 10_000
 
@@ -82,3 +85,73 @@ def vacuum_tagkeys(min_id=0):
 
     # Defer next batch
     vacuum_tagkeys.delay(ids_to_check[-1])
+
+
+@shared_task
+def vacuum_eventless_issuetags(min_id=0):
+    # This task removes IssueTag entries that are no longer referenced by any EventTag for an Event on the same Issue.
+    #
+    # Under normal operation, we evict Events and their EventTags. However, we do not track how many EventTags back
+    # an IssueTag, so we have historically chosen not to clean up IssueTags during Event deletion. (see #134)
+    #
+    # This has the upside of being cheap and preserving all known values for an Issue (e.g. all environments/releases
+    # ever seen). But it comes with downsides:
+    #
+    # * stale IssueTags remain for deleted Events
+    # * search-by-tag may return Issues without matching Events
+    # * TagValues will not be vacuumed as long as they’re still referenced by an IssueTag
+    #
+    # This task aims to reconcile that, in a delayed and resumable fashion.
+
+    # Empirically determined: at this size, each batch is approx .3s (local dev, sqlite); Note that we're "nearer to
+    # the edge of the object-graph" than for e.g. event-retention, so we can both afford bigger batches (less cascading
+    # effects per item) as well as need bigger batches (because there are more expected items in a fanning-out
+    # object-graph).
+    BATCH_SIZE = 2048
+
+    # Community wisdom (says ChatGPT, w/o source): queries with dozens of OR clauses can slow down significantly. 64 is
+    # a safe batch size that avoids planner overhead and keeps things fast across databases.
+    INNER_BATCH_SIZE = 64
+
+    with immediate_atomic():
+        issue_tag_infos = list(
+            IssueTag.objects
+            .filter(id__gt=min_id)
+            .order_by('id')
+            .values('id', 'issue_id', 'value_id')[:BATCH_SIZE]
+        )
+
+        for issue_tag_infos_batch in batched(issue_tag_infos, INNER_BATCH_SIZE):
+            matching_eventtags = _or_join([
+                Q(issue_id=it['issue_id'], value_id=it['value_id']) for it in issue_tag_infos_batch
+            ])
+
+            if matching_eventtags:
+                in_use_issue_value_pairs = set(
+                    EventTag.objects
+                    .filter(matching_eventtags)
+                    .values_list('issue_id', 'value_id')
+                )
+            else:
+                in_use_issue_value_pairs = set()
+
+            stale_issuetags = [
+                it
+                for it in issue_tag_infos_batch
+                if (it['issue_id'], it['value_id']) not in in_use_issue_value_pairs
+            ]
+
+            if stale_issuetags:
+                IssueTag.objects.filter(id__in=[it['id'] for it in stale_issuetags]).delete()
+
+                # inline pruning of TagValue (as opposed to using "vacuum later") following the same reasoning as in
+                # prune_orphans.
+                prune_tagvalues([it['value_id'] for it in stale_issuetags])
+
+    if not issue_tag_infos:
+        # We don't have a continuation for the "done" case. One could argue: kick off vacuum_tagvalues there, but I'd
+        # rather build the toolbox of cleanup tasks first and see how they might fit together later. Because the
+        # downside of triggering the next vacuum command would be that "more things might happen too soon".
+        return
+
+    vacuum_eventless_issuetags.delay(issue_tag_infos[-1]['id'])
diff --git a/tags/tests.py b/tags/tests.py
index b124b1e..318f1af 100644
--- a/tags/tests.py
+++ b/tags/tests.py
@@ -1,14 +1,16 @@
 from unittest import TestCase as RegularTestCase
 
 from django.test import TestCase as DjangoTestCase
+from bugsink.test_utils import TransactionTestCase25251 as TransactionTestCase
 
 from projects.models import Project
 from issues.factories import get_or_create_issue, denormalized_issue_fields
 from events.factories import create_event, create_event_data
 from issues.models import Issue
-from .models import store_tags, EventTag
+from .models import store_tags, EventTag, IssueTag, TagValue
 from .utils import deduce_tags
 from .search import search_events, search_issues, parse_query, search_events_optimized
+from .tasks import vacuum_eventless_issuetags
 
 
 class DeduceTagsTestCase(RegularTestCase):
@@ -119,6 +121,23 @@ class StoreTagsTestCase(DjangoTestCase):
         self.assertEqual(self.issue.tags.first().count, 2)
         self.assertEqual(self.issue.tags.first().value.key.key, "foo")
 
+    def test_store_same_tag_on_two_issues_creates_two_issuetags(self):
+        store_tags(self.event, self.issue, {"foo": "bar"})
+
+        other_issue, _ = get_or_create_issue(self.project, event_data=create_event_data("other_issue"))
+        other_event = create_event(self.project, issue=other_issue)
+        store_tags(other_event, other_issue, {"foo": "bar"})
+
+        self.assertEqual(IssueTag.objects.count(), 2)
+        self.assertEqual(2, IssueTag.objects.filter(value__key__key="foo").count())
+
+    def test_store_many_tags(self):
+        # observed: a non-batched implementation of store_tags() would crash (e.g. in sqlite: Expression tree is too
+        # large (maximum depth 1000)); if the below doesn't crash, we've got a batched implementation that works
+        event = create_event(self.project, issue=self.issue)
+        store_tags(event, self.issue, {f"key-{i}": f"value-{i}" for i in range(512)})
+        self.assertEqual(IssueTag.objects.filter(issue=self.issue).count(), 512)
+
 
 class SearchParserTestCase(RegularTestCase):
 
@@ -247,3 +266,78 @@ class SearchTestCase(DjangoTestCase):
 
     def test_search_issues(self):
         self._test_search(lambda query: search_issues(self.project, Issue.objects.all(), query))
+
+
+class VacuumEventlessIssueTagsTestCase(TransactionTestCase):
+    # Note: this test depends on EAGER mode in both the setup (delete_deferred to trigger cascading deletes) and the
+    # testing of the thing under test (vacuum_eventless_issuetags).
+
+    def setUp(self):
+        self.project = Project.objects.create(name="T")
+        self.issue, _ = get_or_create_issue(self.project)
+
+    def test_no_eventtags_means_vacuum(self):
+        event = create_event(self.project, issue=self.issue)
+        store_tags(event, self.issue, {"foo": "bar"})
+        event.delete_deferred()
+
+        self.assertEqual(IssueTag.objects.count(), 1)
+        vacuum_eventless_issuetags()
+        # in the above we deleted EventTag; implies 0 after-vacuum
+        self.assertEqual(IssueTag.objects.count(), 0)
+
+    def test_one_eventtag_preserves_issuetag(self):
+        event = create_event(self.project, issue=self.issue)
+        store_tags(event, self.issue, {"foo": "bar"})
+
+        self.assertEqual(IssueTag.objects.count(), 1)
+        vacuum_eventless_issuetags()
+        # in the above we did not delete EventTag; implies 1 after-vacuum
+        self.assertEqual(IssueTag.objects.count(), 1)
+
+    def test_other_event_same_tag_same_issue_preserves(self):
+        event1 = create_event(self.project, issue=self.issue)
+        event2 = create_event(self.project, issue=self.issue)
+        store_tags(event1, self.issue, {"foo": "bar"})
+        store_tags(event2, self.issue, {"foo": "bar"})
+        event1.delete_deferred()
+
+        self.assertEqual(IssueTag.objects.count(), 1)
+        vacuum_eventless_issuetags()
+        # we deleted the EventTag for event1, but since event2 has the same tag, it should be preserved on the Issue
+        self.assertEqual(IssueTag.objects.count(), 1)
+
+    def test_other_event_same_tag_other_issue_does_not_preserve(self):
+        event1 = create_event(self.project, issue=self.issue)
+        store_tags(event1, self.issue, {"foo": "bar"})
+
+        other_issue, _ = get_or_create_issue(self.project, event_data=create_event_data("other_issue"))
+        event2 = create_event(self.project, issue=other_issue)
+        store_tags(event2, other_issue, {"foo": "bar"})
+
+        event1.delete_deferred()
+
+        self.assertEqual(IssueTag.objects.filter(issue=self.issue).count(), 1)
+        vacuum_eventless_issuetags()
+        self.assertEqual(IssueTag.objects.filter(issue=self.issue).count(), 0)
+
+    def test_many_tags_spanning_chunks(self):
+        event = create_event(self.project, issue=self.issue)
+        store_tags(event, self.issue, {f"key-{i}": f"value-{i}" for i in range(2048 + 1)})  # bigger than BATCH_SIZE
+
+        # check setup: all issue tags are there
+        self.assertEqual(IssueTag.objects.filter(issue=self.issue).count(), 2048 + 1)
+
+        event.delete_deferred()
+        vacuum_eventless_issuetags()
+
+        # all tags should be gone after vacuum
+        self.assertEqual(IssueTag.objects.filter(issue=self.issue).count(), 0)
+
+    def test_tagvalue_is_pruned(self):
+        event = create_event(self.project, issue=self.issue)
+        store_tags(event, self.issue, {"foo": "bar"})
+        event.delete_deferred()
+
+        vacuum_eventless_issuetags()
+        self.assertEqual(TagValue.objects.all().count(), 0)