diff --git a/events/utils.py b/events/utils.py index 20c9ffe..d8ea8a6 100644 --- a/events/utils.py +++ b/events/utils.py @@ -1,9 +1,15 @@ +from datetime import datetime, timezone from uuid import UUID import json import sourcemap from issues.utils import get_values +from bugsink.transaction import delay_on_commit + +from compat.timestamp import format_timestamp + from files.models import FileMetadata +from files.tasks import record_file_accesses # Dijkstra, Sourcemaps and Python lists start at 0, but editors and our UI show lines starting at 1. @@ -116,6 +122,9 @@ def apply_sourcemaps(event_data): debug_id__in=debug_id_for_filename.values(), file_type="source_map").select_related("file") } + metadata_ids = [metadata_obj.id for metadata_obj in metadata_obj_lookup.values()] + delay_on_commit(record_file_accesses, metadata_ids, format_timestamp(datetime.now(timezone.utc))) + filenames_with_metas = [ (filename, metadata_obj_lookup[debug_id]) for (filename, debug_id) in debug_id_for_filename.items() diff --git a/files/admin.py b/files/admin.py index 198bdf5..d9d79f5 100644 --- a/files/admin.py +++ b/files/admin.py @@ -7,14 +7,14 @@ from .models import Chunk, File, FileMetadata @admin.register(Chunk) class ChunkAdmin(admin.ModelAdmin): - list_display = ('checksum', 'size') + list_display = ('checksum', 'size', 'created_at') search_fields = ('checksum',) readonly_fields = ('data',) @admin.register(File) class FileAdmin(admin.ModelAdmin): - list_display = ('filename', 'checksum', 'size', 'download_link') + list_display = ('filename', 'checksum', 'size', 'download_link', 'created_at', 'accessed_at') search_fields = ('checksum',) readonly_fields = ('data', 'download_link') @@ -27,5 +27,5 @@ class FileAdmin(admin.ModelAdmin): @admin.register(FileMetadata) class FileMetadataAdmin(admin.ModelAdmin): - list_display = ('debug_id', 'file_type', 'file') + list_display = ('debug_id', 'file_type', 'file', 'created_at') search_fields = ('file__checksum', 'debug_id', 'file_type') diff 
--git a/files/management/__init__.py b/files/management/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/files/management/commands/__init__.py b/files/management/commands/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/files/management/commands/vacuum_files.py b/files/management/commands/vacuum_files.py
new file mode 100644
index 0000000..5161c85
--- /dev/null
+++ b/files/management/commands/vacuum_files.py
@@ -0,0 +1,11 @@
+from django.core.management.base import BaseCommand
+from files.tasks import vacuum_files
+
+
+class Command(BaseCommand):
+    help = "Kick off (sourcemaps-)files cleanup by vacuuming old entries."
+
+    def handle(self, *args, **options):
+        # fire-and-forget: this only hands the task off via .delay() (hence the message below); nothing is awaited.
+        vacuum_files.delay()
+        self.stdout.write("Called vacuum_files.delay(); the task will run in the background (snappea).")
diff --git a/files/migrations/0002_chunk_created_at_file_accessed_at_file_created_at_and_more.py b/files/migrations/0002_chunk_created_at_file_accessed_at_file_created_at_and_more.py
new file mode 100644
index 0000000..3585433
--- /dev/null
+++ b/files/migrations/0002_chunk_created_at_file_accessed_at_file_created_at_and_more.py
@@ -0,0 +1,44 @@
+from django.db import migrations, models
+import django.utils.timezone
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ("files", "0001_initial"),
+    ]
+
+    operations = [
+        migrations.AddField(
+            model_name="chunk",
+            name="created_at",
+            field=models.DateTimeField(
+                auto_now_add=True, db_index=True, default=django.utils.timezone.now
+            ),
+            preserve_default=False,
+        ),
+        migrations.AddField(
+            model_name="file",
+            name="accessed_at",
+            field=models.DateTimeField(
+                auto_now_add=True, db_index=True, default=django.utils.timezone.now
+            ),
+            preserve_default=False,
+        ),
+        migrations.AddField(
+            model_name="file",
+            name="created_at",
+            field=models.DateTimeField(
+                auto_now_add=True, db_index=True, default=django.utils.timezone.now
+            ),
+            preserve_default=False,
+        ),
+        migrations.AddField(
+
model_name="filemetadata", + name="created_at", + field=models.DateTimeField( + auto_now_add=True, db_index=True, default=django.utils.timezone.now + ), + preserve_default=False, + ), + ] diff --git a/files/models.py b/files/models.py index 91ba42c..0011df7 100644 --- a/files/models.py +++ b/files/models.py @@ -5,6 +5,7 @@ class Chunk(models.Model): checksum = models.CharField(max_length=40, unique=True) # unique implies index, which we also use for lookups size = models.PositiveIntegerField() data = models.BinaryField(null=False) # as with Events, we can "eventually" move this out of the database + created_at = models.DateTimeField(auto_now_add=True, editable=False, db_index=True) def __str__(self): return self.checksum @@ -23,6 +24,8 @@ class File(models.Model): size = models.PositiveIntegerField() data = models.BinaryField(null=False) # as with Events, we can "eventually" move this out of the database + created_at = models.DateTimeField(auto_now_add=True, editable=False, db_index=True) + accessed_at = models.DateTimeField(auto_now_add=True, editable=False, db_index=True) def __str__(self): return self.filename @@ -36,6 +39,7 @@ class FileMetadata(models.Model): debug_id = models.UUIDField(max_length=40, null=True, blank=True) file_type = models.CharField(max_length=255, null=True, blank=True) data = models.TextField() # we just dump the rest in here; let's see how much we really need. 
+ created_at = models.DateTimeField(auto_now_add=True, editable=False, db_index=True) def __str__(self): # somewhat useless when debug_id is None; but that's not the case we care about ATM diff --git a/files/tasks.py b/files/tasks.py index 53a9c8e..fe50614 100644 --- a/files/tasks.py +++ b/files/tasks.py @@ -1,12 +1,15 @@ +from datetime import timedelta from zipfile import ZipFile import json from hashlib import sha1 from io import BytesIO from os.path import basename +from django.utils import timezone +from compat.timestamp import parse_timestamp from snappea.decorators import shared_task -from bugsink.transaction import immediate_atomic +from bugsink.transaction import immediate_atomic, delay_on_commit from bugsink.app_settings import get_settings from .models import Chunk, File, FileMetadata @@ -92,3 +95,54 @@ def assemble_file(checksum, chunk_checksums, filename): # is unlikely. chunks.delete() return result + + +@shared_task +def record_file_accesses(metadata_ids, accessed_at): + # implemented as a task to get around the fact that file-access happens in an otherwise read-only view (and the fact + # that the access happened is a write to the DB). + + # a few thoughts on the context of "doing this as a task": [1] the expected throughput is relatively low (UI) so the + # task overhead should be OK [2] it's not "absolutely critical" to always record this (99% is enough) and [3] it's + # not related to the reading transaction _at all_ (all we need to record is the fact that it happened). + # + # thought on instead pulling it to the top of the UI's view: code-wise, it's annoying but doable (annoying b/c + # 'for_request_method' won't work anymore). But this would still make this key UI view depend on the write lock + # which is such a shame for responsiveness so we'll stick with task-based. 
+
+    with immediate_atomic():
+        parsed_accessed_at = parse_timestamp(accessed_at)
+
+        # note: filtering on IDs comes with "robust for deletions" out-of-the-box (and: 2 queries only)
+        file_ids = FileMetadata.objects.filter(id__in=metadata_ids).values_list("file_id", flat=True)
+        File.objects.filter(id__in=file_ids).update(accessed_at=parsed_accessed_at)
+
+
+@shared_task
+def vacuum_files():
+    """Delete expired Chunks/Files, at most `budget` rows per run; re-schedules itself if the budget is exhausted."""
+    now = timezone.now()
+    with immediate_atomic():
+        # budget is not yet tuned; reasons for high values: we're dealing with "leaves in the model-dep-tree here";
+        # reasons for low values: deletion of files might just be expensive.
+        budget = 500
+        num_deleted = 0
+
+        for model, field_name, max_days in [
+            (Chunk, 'created_at', 1,),  # 1 is already quite long... Chunks are used immediately, or not at all.
+            (File, 'accessed_at', 90),
+            # for FileMetadata we rely on cascading from File (which will always happen "eventually")
+        ]:
+            cutoff = now - timedelta(days=max_days)
+            while num_deleted < budget:
+                # only fetch what's left of the budget (it is shared across models); list() avoids a LIMITed subquery
+                ids = list(model.objects.filter(**{f"{field_name}__lt": cutoff})[:budget - num_deleted].
+                           values_list('id', flat=True))
+                if not ids:
+                    break
+                model.objects.filter(id__in=ids).delete()
+                num_deleted += len(ids)
+
+        if num_deleted >= budget:
+            # budget exhausted but possibly more to delete, so we re-schedule the task
+            delay_on_commit(vacuum_files)