From ea6aa9bbcac55a4c2e16fb81e28727480b42f366 Mon Sep 17 00:00:00 2001 From: Klaas van Schelven Date: Fri, 21 Jun 2024 11:50:13 +0200 Subject: [PATCH] Retention/quotas: something that 'seems to work' (doesn't immediately crash) --- .../0004_event_irrelevance_for_retention.py | 19 +++++ events/models.py | 3 +- events/retention.py | 71 +++++++++++-------- ingest/views.py | 25 +++++-- .../0003_project_retention_max_event_count.py | 16 +++++ projects/models.py | 7 +- 6 files changed, 100 insertions(+), 41 deletions(-) create mode 100644 events/migrations/0004_event_irrelevance_for_retention.py create mode 100644 projects/migrations/0003_project_retention_max_event_count.py diff --git a/events/migrations/0004_event_irrelevance_for_retention.py b/events/migrations/0004_event_irrelevance_for_retention.py new file mode 100644 index 0000000..df4a68a --- /dev/null +++ b/events/migrations/0004_event_irrelevance_for_retention.py @@ -0,0 +1,19 @@ +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('events', '0003_initial'), + ] + + operations = [ + migrations.AddField( + model_name='event', + name='irrelevance_for_retention', + # Given the (very) small group of test users, we can get away with just setting a default of 0 here (no + # datamigration) + field=models.PositiveIntegerField(default=0), + preserve_default=False, + ), + ] diff --git a/events/models.py b/events/models.py index 929a050..34b6350 100644 --- a/events/models.py +++ b/events/models.py @@ -179,8 +179,7 @@ class Event(models.Model): # below at least puts the parsed_data in the right place, and does some of the basic object set up (FKs to other # objects etc). - # +1 because we're about to add a new event and want the number for _that_ event - irrelevance_for_retention = get_random_irrelevance(stored_event_count + 1) + irrelevance_for_retention = get_random_irrelevance(stored_event_count) try: event = cls.objects.create( diff --git a/events/retention.py b/events/retention.py index 2ec9b79..a4bbc1c 100644 --- a/events/retention.py +++ b/events/retention.py @@ -22,7 +22,7 @@ def get_epoch_bounds(lower, upper=None): return Q() if lower is None: - return Q(timestamp__gte=datetime_for_epoch(lower)) + return Q(timestamp__lt=datetime_for_epoch(upper)) if upper is None: return Q(timestamp__gte=datetime_for_epoch(lower)) @@ -39,7 +39,6 @@ def nonzero_leading_bits(n): 101000 -> 3 110001 -> 6 """ - s = format(n, 'b') return len(s.rstrip('0')) @@ -58,14 +57,12 @@ def get_random_irrelevance(event_count): def should_evict(project, timestamp, stored_event_count): - if stored_event_count is None: - return False # no events, nothing to evict + # if/when we implement 'just drop' this might go somewhere (maybe not here) + # if (project.retention_last_eviction is not None and + # get_epoch(project.retention_last_eviction) != get_epoch(timestamp)): + # return True - if (project.retention_last_eviction is not None and - get_epoch(project.retention_last_eviction) != get_epoch(timestamp)): - return True - - if stored_event_count >= project.retention_max_event_count: # >= because this function is called right before add + if stored_event_count > project.retention_max_event_count: # > because: do something when _over_ the max return True return False @@ -101,15 +98,17 @@ def get_epoch_bounds_with_irrelevance(project, current_timestamp): from .models import Event # TODO 'first_seen' is cheaper? I don't think it exists at project-level though. 
- first_epoch = get_epoch(Event.objects.filter(project=project).aggregate(Min('timestamp'))) + # We can safely assume some Event exists when this point is reached because of the conditions in `should_evict` + first_epoch = get_epoch(Event.objects.filter(project=project).aggregate(val=Min('server_side_timestamp'))['val']) current_epoch = get_epoch(current_timestamp) difference = current_epoch - first_epoch - bounds = pairwise( + # because we construct in reverse order (from the most recent to the oldest) we end up with the pairs swapped + swapped_bounds = pairwise( [None] + [current_epoch - n for n in list(map_N_until(get_age_for_irrelevance, difference))] + [None]) - return [(bound, age_based_irrelevance) for age_based_irrelevance, bound in enumerate(bounds)] + return [((lb, ub), age_based_irrelevance) for age_based_irrelevance, (ub, lb) in enumerate(swapped_bounds)] def get_irrelevance_pairs(project, epoch_bounds_with_irrelevance): @@ -117,9 +116,10 @@ def get_irrelevance_pairs(project, epoch_bounds_with_irrelevance): from .models import Event for (lower_bound, upper_bound), age_based_irrelevance in epoch_bounds_with_irrelevance: - max_event_irrelevance = \ - Event.objects.filter(get_epoch_bounds(lower_bound, upper_bound)).aggregate(Max('irrelevance_for_retention')) - yield age_based_irrelevance, max_event_irrelevance + d = Event.objects.filter(get_epoch_bounds(lower_bound, upper_bound)).aggregate(Max('irrelevance_for_retention')) + max_event_irrelevance = d["irrelevance_for_retention__max"] or 0 + + yield (age_based_irrelevance, max_event_irrelevance) def map_N_until(f, until, onemore=False): @@ -144,21 +144,36 @@ def pairwise(it): prev = current +def filter_for_work(epoch_bounds_with_irrelevance, pairs, max_total_irrelevance): + # Here, we filter out epoch bounds for which there is obviously no work to be done, based on the pairs that we have + # selected at the beginning of the algo. We compare the selected irrelevances with the total, and if they do not + # exceed it no work will need to be done for that set of epochs. + + # Since it uses only the (already available) information that was gathered at the beginning of the algo, it is not a + # full filter for avoiding useless deletions, but at least we use the available info (from queries) to the fullest. + for pair, ebwi in zip(pairs, epoch_bounds_with_irrelevance): + if sum(pair) > max_total_irrelevance: # > because only if it is strictly greater will anything be evicted. 
+ yield ebwi + + def evict_for_max_events(project, timestamp, stored_event_count=None): from .models import Event if stored_event_count is None: # allowed as a pass-in to save a query (we generally start off knowing this) - stored_event_count = Event.objects.filter(project=project).count() + stored_event_count = Event.objects.filter(project=project).count() + 1 epoch_bounds_with_irrelevance = get_epoch_bounds_with_irrelevance(project, timestamp) # we start off with the currently observed max total irrelevance - # +1 to correct for -= 1 at the beginning of the loop - max_total_irrelevance = max(sum(pair) for pair in get_irrelevance_pairs(project, epoch_bounds_with_irrelevance)) + 1 + pairs = get_irrelevance_pairs(project, epoch_bounds_with_irrelevance) + max_total_irrelevance = max(sum(pair) for pair in pairs) - while stored_event_count >= project.retention_max_event_count: # >= as elsewhere: eviction is before add-new + while stored_event_count > project.retention_max_event_count: # > as in `should_evict` + # -1 at the beginning of the loop; this means the actually observed max value is precisely the first thing that + # will be evicted (since `evict_for_irrelevance` will evict anything above (but not including) the given value) max_total_irrelevance -= 1 + evict_for_irrelevance(max_total_irrelevance, epoch_bounds_with_irrelevance) stored_event_count = Event.objects.filter(project=project).count() @@ -173,27 +188,25 @@ def evict_for_max_events(project, timestamp, stored_event_count=None): def evict_for_irrelevance(max_total_irrelevance, epoch_bounds_with_irrelevance): + # print("evict_for_irrelevance(%d, %s)" % (max_total_irrelevance, epoch_bounds_with_irrelevance)) + # max_total_irrelevance, i.e. the total may not exceed this (but it may equal it) for (_, epoch_ub_exclusive), irrelevance_for_age in epoch_bounds_with_irrelevance: max_item_irrelevance = max_total_irrelevance - irrelevance_for_age - if max_item_irrelevance < -1: + evict_for_epoch_and_irrelevance(epoch_ub_exclusive, max_item_irrelevance) + + if max_item_irrelevance <= -1: # in the actual eviction, the test on max_item_irrelevance is done exclusively, i.e. only items of greater # irrelevance are evicted. The minimal actually occuring value is 0. Such items can be evicted with a call - # with max_item_irrelevance = -1. Once the value is smaller than that, there's nothing to evict left. - - # (the above is true under conditions, which we meet, namely): - # * the actual eviction takes only an UB for the epochs, so older epochs are always cleaned as part of the - # newer ones (see picture below) - # * we always have at least a single pass throught the loop for the current epoch (which has 0 irrelevance) - # and we always have at least one call with at least 0 max_total_irrelevance + # with max_item_irrelevance = -1. This means that if we just did such an eviction, we're done for all epochs break - evict_for_epoch_and_irrelevance(epoch_ub_exclusive, max_item_irrelevance) - def evict_for_epoch_and_irrelevance(max_epoch, max_irrelevance): + # print("evict_for_epoch_and_irrelevance(%s, %s)" % (max_epoch, max_irrelevance)) + from .models import Event # evicting "at", based on the total irrelevance split out into 2 parts: max item irrelevance, and an epoch as # implied by the age-based irrelevance. 
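
The retention code above evicts by total irrelevance: the sum of a per-event random irrelevance (assigned at
ingest time via get_random_irrelevance) and an age-based irrelevance derived from how many epochs old an event
is; evict_for_max_events then lowers a threshold on that total one notch per pass until the project is back
under its quota. Below is a minimal, framework-free sketch of that idea. nonzero_leading_bits is taken verbatim
from events/retention.py, the body of get_random_irrelevance is not shown in this patch (the random.randint
variant here is an assumption), and the in-memory dicts stand in for Event rows:

    import random

    def nonzero_leading_bits(n):
        # as in events/retention.py: length of the binary form with trailing zeros stripped
        s = format(n, 'b')
        return len(s.rstrip('0'))

    def get_random_irrelevance(event_count):
        # ASSUMPTION: the real body is not part of this diff; something in this spirit yields values up to
        # roughly log2(event_count), with lower values exponentially rarer, so each one-point drop of the
        # threshold below evicts about half of what is left.
        return nonzero_leading_bits(random.randint(1, max(event_count, 1)))

    def evict_for_irrelevance(events, max_total_irrelevance):
        # keep an event only if its total irrelevance does not exceed the threshold; the real code runs the
        # same test per epoch bound with DELETE queries instead of a list comprehension
        return [e for e in events if e["age_irrelevance"] + e["item_irrelevance"] <= max_total_irrelevance]

    # toy run: 20 events, quota of 8; age_irrelevance is an arbitrary stand-in for the epoch-derived value
    events = [
        {"id": i, "age_irrelevance": (20 - i) // 8, "item_irrelevance": get_random_irrelevance(i + 1)}
        for i in range(20)
    ]
    max_total_irrelevance = max(e["age_irrelevance"] + e["item_irrelevance"] for e in events)
    while len(events) > 8:
        max_total_irrelevance -= 1  # as in evict_for_max_events: lower the bar one notch, then re-count
        events = evict_for_irrelevance(events, max_total_irrelevance)
    print(len(events), "events kept; threshold ended at", max_total_irrelevance)

Read this way, the age-based component thins out older epochs more aggressively, while the random per-event
component spreads eviction within an epoch, so each one-point drop of the threshold removes roughly half of
what remains.
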
diff --git a/ingest/views.py b/ingest/views.py
index 76e74ca..27b39be 100644
--- a/ingest/views.py
+++ b/ingest/views.py
@@ -28,6 +28,7 @@ from bugsink.streams import content_encoding_reader, MaxDataReader, MaxDataWrite
 from bugsink.app_settings import get_settings
 
 from events.models import Event
+from events.retention import evict_for_max_events, should_evict
 from releases.models import create_release_if_needed
 from alerts.tasks import send_new_issue_alert, send_regression_alert
 from compat.timestamp import format_timestamp, parse_timestamp
@@ -194,20 +195,30 @@ class BaseIngestAPIView(View):
 
         # NOTE: possibly expensive. "in theory" we can just do some bookkeeping for a denormalized value, but that may
         # be hard to keep in-sync in practice. Let's check the actual cost first.
-        stored_event_count = issue.event_set.count()
+        # +1 because we're about to add one event.
+        project_stored_event_count = (project.event_set.count() or 0) + 1
+        issue_stored_event_count = (issue.event_set.count() or 0) + 1
 
-        if should_evict(project, timestamp, stored_event_count):
-            project.retention_last_eviction = timestamp
-            project.retention_max_total_irrelevance = evict_for_max_events(project, timestamp, stored_event_count)
-            # TODO: actually save the project? or use an update call?
-            # TODO: the idea of cooling off the max_total_irrelevance
+        if should_evict(project, timestamp, project_stored_event_count):
+            # Note: I considered pushing this into some async process, but it makes reasoning much harder, and it's
+            # doubtful whether it would help: in the end there's just a single pipeline of ingest-related work to do,
+            # so we might as well do it straight away. Similar thoughts apply to pushing this into something
+            # cron-like. (not exactly the same, because a cron-like setup can save time if it pushes the work outside
+            # of the 'rush hour' -- OTOH this also introduces a lot of complexity about "what is a limit anyway, if
+            # you can either go over it, or the work is done before the limit is reached")
+            evict_for_max_events(project, timestamp, project_stored_event_count)
+
+            # project.retention_last_eviction = timestamp
+            # project.retention_max_total_irrelevance
+            # TODO-if-the-above: actually save the project? or use an update call?
+            # TODO-if-the-above: the idea of cooling off the max_total_irrelevance
 
         # NOTE: an event always has a single (automatically calculated) Grouping associated with it. Since we have that
         # information available here, we could add it to the Event model.
event, event_created = Event.from_ingested( event_metadata, issue.event_count, - stored_event_count, + issue_stored_event_count, issue, event_data, denormalized_fields, diff --git a/projects/migrations/0003_project_retention_max_event_count.py b/projects/migrations/0003_project_retention_max_event_count.py new file mode 100644 index 0000000..e1c258f --- /dev/null +++ b/projects/migrations/0003_project_retention_max_event_count.py @@ -0,0 +1,16 @@ +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('projects', '0002_initial'), + ] + + operations = [ + migrations.AddField( + model_name='project', + name='retention_max_event_count', + field=models.PositiveIntegerField(default=10000), + ), + ] diff --git a/projects/models.py b/projects/models.py index 5d70623..776c35e 100644 --- a/projects/models.py +++ b/projects/models.py @@ -61,11 +61,12 @@ class Project(models.Model): visibility = models.IntegerField(choices=ProjectVisibility.choices, default=ProjectVisibility.TEAM_MEMBERS) # retention quota - retention_event_count_quota = models.PositiveIntegerField(default=10_000) + retention_max_event_count = models.PositiveIntegerField(default=10_000) # bookkeeping of the eviction algorithm - retention_last_eviction = models.DateTimeField(null=True, blank=True) - retention_max_total_irrelevance = models.PositiveIntegerField(null=True, blank=True) + # this will be needed if/when we take an approach of 'drop immediately' + # retention_last_eviction = models.DateTimeField(null=True, blank=True) + # retention_max_total_irrelevance = models.PositiveIntegerField(null=True, blank=True) def __str__(self): return self.name
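
Taken together, the pieces in this patch hook into ingest roughly as follows. This is a condensed sketch of the
flow that ingest/views.py now follows, not new behaviour; Project.objects.first() and timezone.now() are
illustrative stand-ins for the project resolved from the DSN and the timestamp parsed from the envelope:

    from django.utils import timezone

    from events.models import Event
    from events.retention import evict_for_max_events, should_evict
    from projects.models import Project

    project = Project.objects.first()
    timestamp = timezone.now()

    # +1 because we're about to add one event (the same convention the view uses)
    project_stored_event_count = Event.objects.filter(project=project).count() + 1

    if should_evict(project, timestamp, project_stored_event_count):
        # with the Project bookkeeping fields commented out, eviction is the only side effect here
        evict_for_max_events(project, timestamp, project_stored_event_count)

    # afterwards the project is back at (or under) its quota, with room for the new event
    assert Event.objects.filter(project=project).count() <= project.retention_max_event_count

If should_evict fires, evict_for_max_events keeps deleting and re-counting until the stored count is back at or
under retention_max_event_count, so the subsequent Event.from_ingested call can add the new event without
pushing the project far over its quota.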