Mirror of https://github.com/bugsink/bugsink.git (synced 2025-12-30 09:50:11 -06:00)
Merge pull request #142 from bugsink/issue-tag-cleanup-as-vacuum
In light of the discussion on #134, this implements the "clean up later" solution: a vacuum task that deletes IssueTags no longer referenced by any EventTag on the same Issue. In the same sweep, TagValues are pruned. Performance-wise, this is a relatively safe path forward; it can run off-hours or not at all, depending on preferences. Semantically it is the least clear option: whether an Issue appears to be tagged may now depend on whether the vacuum has run, and `IssueTag.count` (representing the total observed) is left in a "weird place", in the sense that counts may disappear from the total more or less at random (i.e. depending on whether the vacuum command has been called, and on the current state of not-yet-evicted Events).
@@ -206,29 +206,17 @@ def prune_orphans(model, d_ids_to_check):
     1. the inline version is easier to reason about, it "just happens ASAP", and in the context of a given issue;
        vacuum-based has to take into consideration the full DB including non-orphaned values.
-    2. repeated work is somewhat minimized b/c of the IssueTag/EventTag relationship as described below.
+    2. repeated work is somewhat minimized b/c of the IssueTag/EventTag relationship as described in prune_tagvalues.
     """
 
-    from tags.models import TagValue, IssueTag  # avoid circular import
+    from tags.models import prune_tagvalues  # avoid circular import
 
     if model.__name__ != "IssueTag":
         return  # we only prune IssueTag orphans
 
-    ids_to_check = [d["value_id"] for d in d_ids_to_check]
+    ids_to_check = [d["value_id"] for d in d_ids_to_check]  # d_ids_to_check: mirrors fields_for_prune_orphans(model)
 
-    # used_in_event check is not needed, because non-existence of IssueTag always implies non-existence of EventTag,
-    # since [1] EventTag creation implies IssueTag creation and [2] in the cleanup code EventTag is deleted first.
-    used_in_issue = set(
-        IssueTag.objects.filter(value_id__in=ids_to_check).values_list('value_id', flat=True)
-    )
-    unused = [pk for pk in ids_to_check if pk not in used_in_issue]
-
-    if unused:
-        TagValue.objects.filter(id__in=unused).delete()
-
-    # The principled approach would be to clean up TagKeys as well at this point, but in practice there will be orders
-    # of magnitude fewer TagKey objects, and they are much less likely to become dangling, so the GC-like algo of "just
-    # vacuuming once in a while" is a much better fit for that.
+    prune_tagvalues(ids_to_check)
 
 
 def do_pre_delete(project_id, model, pks_to_delete, is_for_project):
tags/management/commands/vacuum_eventless_issuetags.py (new file, +10)
@@ -0,0 +1,10 @@
+from django.core.management.base import BaseCommand
+from tags.tasks import vacuum_eventless_issuetags
+
+
+class Command(BaseCommand):
+    help = "Kick off tag cleanup by vacuuming IssueTag objects for which there is no EventTag equivalent"
+
+    def handle(self, *args, **options):
+        vacuum_eventless_issuetags.delay()
+        self.stdout.write("Started tag vacuum via task queue.")
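
Not part of this PR: a minimal usage sketch for the command above, e.g. from a cron-driven script. It assumes DJANGO_SETTINGS_MODULE is set in the environment and that a snappea worker (or EAGER mode) will pick up the enqueued task.

import django

django.setup()  # assumption: DJANGO_SETTINGS_MODULE points at the Bugsink settings

from django.core.management import call_command

# equivalent to: python manage.py vacuum_eventless_issuetags
# the command only enqueues the snappea task, so it returns immediately
call_command("vacuum_eventless_issuetags")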
tags/models.py
@@ -23,6 +23,7 @@ from django.db.models import Q, F
 from projects.models import Project
 from tags.utils import deduce_tags, is_mostly_unique
+from bugsink.moreiterutils import batched
 
 # Notes on .project as it lives on TagValue, IssueTag and EventTag:
 # In all cases, project could be derived through other means: for TagValue it's implied by TagKey.project; for IssueTag
@@ -52,6 +53,9 @@ class TagKey(models.Model):
         # the obvious constraint, which doubles as a lookup index for store_tags and search.
         unique_together = ('project', 'key')
 
+    def __str__(self):
+        return f"{self.key}"
+
 
 class TagValue(models.Model):
     project = models.ForeignKey(Project, blank=False, null=False, on_delete=models.DO_NOTHING)
@@ -74,7 +78,7 @@ class EventTag(models.Model):
     # value already implies key in our current setup.
     value = models.ForeignKey(TagValue, blank=False, null=False, on_delete=models.DO_NOTHING)
 
-    # issue is a denormalization that allows for a single-table-index for efficient search.
+    # issue is a denormalization that allows for a single-table-index for efficient search/vacuum_eventless_issuetags.
     issue = models.ForeignKey(
         'issues.Issue', blank=False, null=False, on_delete=models.DO_NOTHING, related_name="event_tags")
@@ -97,6 +101,7 @@ class EventTag(models.Model):
             # for search, which filters a list of EventTag down to those matching certain values and a given issue.
             # (both orderings of the (value, issue) would work for the current search query; if we ever introduce
             # "search across issues" the below would work for that too (but the reverse wouldn't))
+            # also used by vacuum_eventless_issuetags (ORed Q(issue_id, value_id))
             models.Index(fields=['value', 'issue', 'digest_order']),
         ]
@@ -161,6 +166,17 @@ def digest_tags(event_data, event, issue):
 
 
+def store_tags(event, issue, tags):
+    # observed: a non-batched implementation of store_tags() would crash (e.g. in sqlite: Expression tree is too large
+    # (maximum depth 1000));
+
+    # The value of 64 was arrived at by trying all powers of 2 (locally, on sqlite), observing that 256 was the last
+    # non-failing one, and then taking a factor 4 safety-margin. Realistically, 64 tags _per event_ "should be enough
+    # for anyone".
+    for kv_batch in batched(tags.items(), 64):
+        _store_tags(event, issue, {k: v for k, v in kv_batch})
+
+
-def store_tags(event, issue, tags):
+def _store_tags(event, issue, tags):
     if not tags:
         return  # short-circuit; a performance optimization which also avoids the need for some further guards
@@ -231,3 +247,19 @@ def store_tags(event, issue, tags):
     IssueTag.objects.filter(value__in=tag_value_objects, issue=issue).update(
         count=F('count') + 1
     )
+
+
+def prune_tagvalues(ids_to_check):
+    # used_in_event check is not needed, because non-existence of IssueTag always implies non-existence of EventTag,
+    # since [1] EventTag creation implies IssueTag creation and [2] in the cleanup code EventTag is deleted first.
+    used_in_issuetag = set(
+        IssueTag.objects.filter(value_id__in=ids_to_check).values_list('value_id', flat=True)
+    )
+    unused = [pk for pk in ids_to_check if pk not in used_in_issuetag]
+
+    if unused:
+        TagValue.objects.filter(id__in=unused).delete()
+
+    # The principled approach would be to clean up TagKeys as well at this point, but in practice there will be orders
+    # of magnitude fewer TagKey objects, and they are much less likely to become dangling, so the GC-like algo of "just
+    # vacuuming once in a while" is a much better fit for that.
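
Aside (not in the diff): bugsink.moreiterutils.batched is imported above but not defined in this PR. Assuming it follows the standard recipe (equivalent to itertools.batched in Python 3.12+), a minimal sketch would be:

from itertools import islice


def batched(iterable, n):
    # yield successive tuples of at most n items; the final batch may be shorter
    it = iter(iterable)
    while batch := tuple(islice(it, n)):
        yield batch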
tags/tasks.py
@@ -1,7 +1,10 @@
+from django.db.models import Q
+
 from snappea.decorators import shared_task
 
+from bugsink.moreiterutils import batched
 from bugsink.transaction import immediate_atomic, delay_on_commit
-from tags.models import TagValue, TagKey, EventTag, IssueTag
+from tags.models import TagValue, TagKey, EventTag, IssueTag, _or_join, prune_tagvalues
 
 BATCH_SIZE = 10_000
@@ -82,3 +85,73 @@ def vacuum_tagkeys(min_id=0):
 
     # Defer next batch
     vacuum_tagkeys.delay(ids_to_check[-1])
+
+
+@shared_task
+def vacuum_eventless_issuetags(min_id=0):
+    # This task removes IssueTag entries that are no longer referenced by any EventTag for an Event on the same Issue.
+    #
+    # Under normal operation, we evict Events and their EventTags. However, we do not track how many EventTags back
+    # an IssueTag, so we have historically chosen not to clean up IssueTags during Event deletion (see #134).
+    #
+    # This has the upside of being cheap and preserving all known values for an Issue (e.g. all environments/releases
+    # ever seen). But it comes with downsides:
+    #
+    # * stale IssueTags remain for deleted Events
+    # * search-by-tag may return Issues without matching Events
+    # * TagValues will not be vacuumed as long as they're still referenced by an IssueTag
+    #
+    # This task aims to reconcile that, in a delayed and resumable fashion.
+
+    # Empirically determined: at this size, each batch takes approx. 0.3s (local dev, sqlite). Note that we're "nearer
+    # to the edge of the object-graph" than for e.g. event-retention, so we can both afford bigger batches (fewer
+    # cascading effects per item) and need bigger batches (because there are more expected items in a fanning-out
+    # object-graph).
+    BATCH_SIZE = 2048
+
+    # Community wisdom (says ChatGPT, w/o source): queries with dozens of OR clauses can slow down significantly. 64
+    # is a safe batch size that avoids planner overhead and keeps things fast across databases.
+    INNER_BATCH_SIZE = 64
+
+    with immediate_atomic():
+        issue_tag_infos = list(
+            IssueTag.objects
+            .filter(id__gt=min_id)
+            .order_by('id')
+            .values('id', 'issue_id', 'value_id')[:BATCH_SIZE]
+        )
+
+        for issue_tag_infos_batch in batched(issue_tag_infos, INNER_BATCH_SIZE):
+            matching_eventtags = _or_join([
+                Q(issue_id=it['issue_id'], value_id=it['value_id']) for it in issue_tag_infos_batch
+            ])
+
+            if matching_eventtags:
+                in_use_issue_value_pairs = set(
+                    EventTag.objects
+                    .filter(matching_eventtags)
+                    .values_list('issue_id', 'value_id')
+                )
+            else:
+                in_use_issue_value_pairs = set()
+
+            stale_issuetags = [
+                it
+                for it in issue_tag_infos_batch
+                if (it['issue_id'], it['value_id']) not in in_use_issue_value_pairs
+            ]
+
+            if stale_issuetags:
+                IssueTag.objects.filter(id__in=[it['id'] for it in stale_issuetags]).delete()
+
+                # inline pruning of TagValues (as opposed to "vacuum later"), following the same reasoning as in
+                # prune_orphans.
+                prune_tagvalues([it['value_id'] for it in stale_issuetags])
+
+    if not issue_tag_infos:
+        # We don't have a continuation for the "done" case. One could argue: kick off vacuum_tagvalues here, but I'd
+        # rather build the toolbox of cleanup tasks first and see how they might fit together later, because the
+        # downside of triggering the next vacuum command would be that "more things might happen too soon".
+        return
+
+    vacuum_eventless_issuetags.delay(issue_tag_infos[-1]['id'])
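
Aside (not in the diff): _or_join, imported from tags.models above, is not defined in this PR. A minimal sketch of what such a helper conventionally looks like — an assumption, not the repo's actual definition:

import operator
from functools import reduce


def _or_join(q_objects):
    # e.g. _or_join([Q(a=1), Q(b=2)]) -> Q(a=1) | Q(b=2)
    # assumes a non-empty list, which holds here because batched() never yields an empty batch
    return reduce(operator.or_, q_objects)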
tags/tests.py
@@ -1,14 +1,16 @@
 from unittest import TestCase as RegularTestCase
 from django.test import TestCase as DjangoTestCase
 
+from bugsink.test_utils import TransactionTestCase25251 as TransactionTestCase
 from projects.models import Project
 from issues.factories import get_or_create_issue, denormalized_issue_fields
 from events.factories import create_event, create_event_data
 from issues.models import Issue
 
-from .models import store_tags, EventTag
+from .models import store_tags, EventTag, IssueTag, TagValue
 from .utils import deduce_tags
 from .search import search_events, search_issues, parse_query, search_events_optimized
+from .tasks import vacuum_eventless_issuetags
 
 
 class DeduceTagsTestCase(RegularTestCase):
@@ -119,6 +121,23 @@ class StoreTagsTestCase(DjangoTestCase):
         self.assertEqual(self.issue.tags.first().count, 2)
         self.assertEqual(self.issue.tags.first().value.key.key, "foo")
 
+    def test_store_same_tag_on_two_issues_creates_two_issuetags(self):
+        store_tags(self.event, self.issue, {"foo": "bar"})
+
+        other_issue, _ = get_or_create_issue(self.project, event_data=create_event_data("other_issue"))
+        other_event = create_event(self.project, issue=other_issue)
+        store_tags(other_event, other_issue, {"foo": "bar"})
+
+        self.assertEqual(IssueTag.objects.count(), 2)
+        self.assertEqual(2, IssueTag.objects.filter(value__key__key="foo").count())
+
+    def test_store_many_tags(self):
+        # observed: a non-batched implementation of store_tags() would crash (e.g. in sqlite: Expression tree is too
+        # large (maximum depth 1000)); if the below doesn't crash, we've got a batched implementation that works
+        event = create_event(self.project, issue=self.issue)
+        store_tags(event, self.issue, {f"key-{i}": f"value-{i}" for i in range(512)})
+        self.assertEqual(IssueTag.objects.filter(issue=self.issue).count(), 512)
+
 
 class SearchParserTestCase(RegularTestCase):
@@ -247,3 +266,78 @@ class SearchTestCase(DjangoTestCase):
 
     def test_search_issues(self):
         self._test_search(lambda query: search_issues(self.project, Issue.objects.all(), query))
+
+
+class VacuumEventlessIssueTagsTestCase(TransactionTestCase):
+    # Note: this test depends on EAGER mode in both the setup (delete_deferred to trigger cascading deletes) and the
+    # testing of the thing under test (vacuum_eventless_issuetags).
+
+    def setUp(self):
+        self.project = Project.objects.create(name="T")
+        self.issue, _ = get_or_create_issue(self.project)
+
+    def test_no_eventtags_means_vacuum(self):
+        event = create_event(self.project, issue=self.issue)
+        store_tags(event, self.issue, {"foo": "bar"})
+        event.delete_deferred()
+
+        self.assertEqual(IssueTag.objects.count(), 1)
+        vacuum_eventless_issuetags()
+        # in the above we deleted the EventTag; this implies 0 after-vacuum
+        self.assertEqual(IssueTag.objects.count(), 0)
+
+    def test_one_eventtag_preserves_issuetag(self):
+        event = create_event(self.project, issue=self.issue)
+        store_tags(event, self.issue, {"foo": "bar"})
+
+        self.assertEqual(IssueTag.objects.count(), 1)
+        vacuum_eventless_issuetags()
+        # in the above we did not delete the EventTag; this implies 1 after-vacuum
+        self.assertEqual(IssueTag.objects.count(), 1)
+
+    def test_other_event_same_tag_same_issue_preserves(self):
+        event1 = create_event(self.project, issue=self.issue)
+        event2 = create_event(self.project, issue=self.issue)
+        store_tags(event1, self.issue, {"foo": "bar"})
+        store_tags(event2, self.issue, {"foo": "bar"})
+        event1.delete_deferred()
+
+        self.assertEqual(IssueTag.objects.count(), 1)
+        vacuum_eventless_issuetags()
+        # we deleted the EventTag for event1, but since event2 has the same tag, it should be preserved on the Issue
+        self.assertEqual(IssueTag.objects.count(), 1)
+
+    def test_other_event_same_tag_other_issue_does_not_preserve(self):
+        event1 = create_event(self.project, issue=self.issue)
+        store_tags(event1, self.issue, {"foo": "bar"})
+
+        other_issue, _ = get_or_create_issue(self.project, event_data=create_event_data("other_issue"))
+        event2 = create_event(self.project, issue=other_issue)
+        store_tags(event2, other_issue, {"foo": "bar"})
+
+        event1.delete_deferred()
+
+        self.assertEqual(IssueTag.objects.filter(issue=self.issue).count(), 1)
+        vacuum_eventless_issuetags()
+        self.assertEqual(IssueTag.objects.filter(issue=self.issue).count(), 0)
+
+    def test_many_tags_spanning_chunks(self):
+        event = create_event(self.project, issue=self.issue)
+        store_tags(event, self.issue, {f"key-{i}": f"value-{i}" for i in range(2048 + 1)})  # bigger than BATCH_SIZE
+
+        # check setup: all issue tags are there
+        self.assertEqual(IssueTag.objects.filter(issue=self.issue).count(), 2048 + 1)
+
+        event.delete_deferred()
+        vacuum_eventless_issuetags()
+
+        # all tags should be gone after vacuum
+        self.assertEqual(IssueTag.objects.filter(issue=self.issue).count(), 0)
+
+    def test_tagvalue_is_pruned(self):
+        event = create_event(self.project, issue=self.issue)
+        store_tags(event, self.issue, {"foo": "bar"})
+        event.delete_deferred()
+
+        vacuum_eventless_issuetags()
+        self.assertEqual(TagValue.objects.all().count(), 0)