Vacuum Tags command

See #135
This commit is contained in:
Klaas van Schelven
2025-07-02 17:34:33 +02:00
parent aed19e70d3
commit ee9add5e5f
3 changed files with 112 additions and 1 deletions
+18 -1
View File
@@ -25,6 +25,7 @@ from events.models import Event
from ingest.views import BaseIngestAPIView
from issues.factories import get_or_create_issue
from tags.models import store_tags
from tags.tasks import vacuum_tagvalues
from .models import Issue, IssueStateManager, TurningPoint, TurningPointKind
from .regressions import is_regression, is_regression_2, issue_is_regression
@@ -692,7 +693,12 @@ class IssueDeletionTestCase(TransactionTestCase):
'events.Event', 'tags.EventTag', 'issues.Issue',
]]
for model in models:
# 'vacuum' models are those that are not deleted when an issue is deleted, because they are not exclusively
# owned by any given issue.
vacuum_models = [apps.get_model(app_label=s.split('.')[0], model_name=s.split('.')[1].lower())
for s in ['tags.TagKey', 'tags.TagValue']]
for model in models + vacuum_models:
# test-the-test: make sure some instances of the models actually exist after setup
self.assertTrue(model.objects.exists(), f"Some {model.__name__} should exist")
@@ -701,3 +707,14 @@ class IssueDeletionTestCase(TransactionTestCase):
# tests run w/ TASK_ALWAYS_EAGER, so in the below we can just check the database directly
for model in models:
self.assertFalse(model.objects.exists(), f"No {model.__name__}s should exist after issue deletion")
for model in vacuum_models:
# 'should' in quotes: not because we believe it's better that they exist, but because the
# code currently does not delete them.
self.assertTrue(model.objects.exists(), f"Some {model.__name__}s 'should' exist after issue deletion")
vacuum_tagvalues()
# tests run w/ TASK_ALWAYS_EAGER, so any "delayed" (recursive) calls can be expected to have run
for model in vacuum_models:
self.assertFalse(model.objects.exists(), f"No {model.__name__}s should exist after vacuuming")
+10
View File
@@ -0,0 +1,10 @@
from django.core.management.base import BaseCommand
from tags.tasks import vacuum_tagvalues
class Command(BaseCommand):
    """Management command that triggers vacuuming of orphaned tag rows.

    The actual work happens asynchronously: this command only enqueues the
    ``vacuum_tagvalues`` task (which, when done, chains into TagKey cleanup).
    """

    help = "Kick off tag cleanup by vacuuming orphaned TagValue and TagKey entries."

    def handle(self, *args, **options):
        # Enqueue rather than run inline; the task batches itself via the task queue.
        vacuum_tagvalues.delay()
        self.stdout.write("Started tag vacuum via task queue.")
+84
View File
@@ -0,0 +1,84 @@
from snappea.decorators import shared_task
from bugsink.transaction import immediate_atomic, delay_on_commit
from tags.models import TagValue, TagKey, EventTag, IssueTag
BATCH_SIZE = 10_000


@shared_task
def vacuum_tagvalues(min_id=0):
    """Delete unreferenced TagValue rows, one batch at a time.

    A TagValue becomes unused when the IssueTag/EventTag rows pointing at it
    are deleted; deletion-time cleanup is skipped (to avoid repeated checks),
    so this task does the eventual cleanup to keep the database from bloating.

    Implementation notes:

    * We first select candidate ids, then check which of those are referenced
      by EventTag or IssueTag. This avoids a TagValue.exclude(some_usage_pattern)
      query, which may be slow and whose performance is hard to reason about.
    * Work is batched via a defer-with-min-id pattern: each invocation handles
      one id-ordered batch and schedules the next one.

    Known limitation: with _many_ TagValues (used or not) in EAGER mode the
    "delayed recursion" is not actually delayed and runs immediately, which
    overflows the stack. For big/serious setups: set up snappea.

    :param min_id: only TagValue ids strictly greater than this are considered.
    """
    with immediate_atomic():
        # One id-ordered batch of candidate TagValue ids above min_id.
        candidate_ids = list(
            TagValue.objects
            .filter(id__gt=min_id)
            .order_by('id')
            .values_list('id', flat=True)[:BATCH_SIZE]
        )

        if not candidate_ids:
            # All TagValues have been inspected; hand off to TagKey cleanup.
            delay_on_commit(vacuum_tagkeys, 0)
            return

        # Collect every candidate id that is still referenced somewhere.
        referenced = set(
            EventTag.objects.filter(value_id__in=candidate_ids).values_list('value_id', flat=True)
        )
        referenced |= set(
            IssueTag.objects.filter(value_id__in=candidate_ids).values_list('value_id', flat=True)
        )

        orphaned = [pk for pk in candidate_ids if pk not in referenced]
        if orphaned:
            TagValue.objects.filter(id__in=orphaned).delete()

        # Defer the next batch, resuming just past the highest id inspected here.
        # NOTE(review): this uses .delay() directly while the TagKey handoff above
        # uses delay_on_commit — presumably intentional, but worth confirming.
        vacuum_tagvalues.delay(candidate_ids[-1])
@shared_task
def vacuum_tagkeys(min_id=0):
    """Delete TagKey rows no longer referenced by any TagValue, batch-wise.

    Mirrors vacuum_tagvalues: select one id-ordered batch of candidates,
    determine which are still referenced, delete the rest, and defer the
    next batch until no candidates remain.

    :param min_id: only TagKey ids strictly greater than this are considered.
    """
    with immediate_atomic():
        # One id-ordered batch of candidate TagKey ids above min_id.
        candidate_ids = list(
            TagKey.objects
            .filter(id__gt=min_id)
            .order_by('id')
            .values_list('id', flat=True)[:BATCH_SIZE]
        )

        if not candidate_ids:
            # Nothing left to inspect — cleanup is complete.
            return

        # Candidate ids still referenced by at least one TagValue.
        referenced = set(
            TagValue.objects.filter(key_id__in=candidate_ids).values_list('key_id', flat=True)
        )

        orphaned = [pk for pk in candidate_ids if pk not in referenced]
        if orphaned:
            TagKey.objects.filter(id__in=orphaned).delete()

        # Defer the next batch, resuming just past the highest id inspected here.
        vacuum_tagkeys.delay(candidate_ids[-1])