diff --git a/bugsink/settings.py b/bugsink/settings.py
index 437d78c..ab5fade 100644
--- a/bugsink/settings.py
+++ b/bugsink/settings.py
@@ -41,6 +41,8 @@ INSTALLED_APPS = [
     'ingest',
     'issues',
     'events',
+
+    'performance',
 ]
 
 TAILWIND_APP_NAME = 'theme'
diff --git a/performance/__init__.py b/performance/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/performance/management/__init__.py b/performance/management/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/performance/management/commands/__init__.py b/performance/management/commands/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/performance/management/commands/document_performance_insights.py b/performance/management/commands/document_performance_insights.py
new file mode 100644
index 0000000..d95fa20
--- /dev/null
+++ b/performance/management/commands/document_performance_insights.py
@@ -0,0 +1,161 @@
+from django.core.management.base import BaseCommand
+
+import random
+import time
+from datetime import datetime, timezone
+
+from django.conf import settings
+
+from bugsink.period_counter import _prev_tup, PeriodCounter
+from performance.bursty_data import generate_bursty_data, buckets_to_points_in_time
+from bugsink.registry import get_pc_registry
+
+from projects.models import Project
+from issues.models import Issue
+from events.models import Event
+
+
+# this file is the beginning of an approach to getting a handle on performance.
+
+
+class Command(BaseCommand):
+    help = "..."
+
+    def handle(self, *args, **options):
+        if "performance" not in str(settings.DATABASES["default"]["NAME"]):
+            raise ValueError("This command should only be run on the performance-test database")
+
+        print_thoughts_about_prev_tup()
+        print_thoughts_about_inc()
+        print_thoughts_about_event_evaluation()
+        print_thoughts_about_pc_registry()
+
+
+class passed_time(object):
+    def __enter__(self):
+        self.t0 = time.time()
+        return self
+
+    def __exit__(self, type, value, traceback):
+        self.elapsed = (time.time() - self.t0) * 1_000  # milliseconds are a good unit for timing things
+
+
+def print_thoughts_about_prev_tup():
+    v = (2020, 1, 1, 10, 10)
+    with passed_time() as t:
+        for i in range(1_000):
+            v = _prev_tup(v)
+
+    print(f"""## _prev_tup()
+
+1_000 iterations of _prev_tup in {t.elapsed:.3f}ms. The main thing we care about is not this little
+private helper though, but PeriodCounter.inc(). Let's test that next.
+
+""")
+
+
+def print_thoughts_about_inc():
+    random.seed(42)
+
+    pc = PeriodCounter()
+
+    # make sure the pc has some data before we start
+    for point in buckets_to_points_in_time(
+            generate_bursty_data(num_buckets=350, expected_nr_of_bursts=10),
+            datetime(2020, 10, 15, tzinfo=timezone.utc),
+            datetime(2021, 10, 15, 10, 5, tzinfo=timezone.utc),
+            10_000,
+            ):
+        pc.inc(point)
+
+    points = buckets_to_points_in_time(
+        generate_bursty_data(num_buckets=25, expected_nr_of_bursts=5),
+        datetime(2021, 10, 15, 10, 5, tzinfo=timezone.utc),
+        datetime(2021, 10, 16, 10, 5, tzinfo=timezone.utc),
+        1000)
+
+    with passed_time() as t:
+        for point in points:
+            pc.inc(point)
+
+    print(f"""## PeriodCounter.inc()
+
+1_000 iterations of PeriodCounter.inc() in {t.elapsed:.3f}ms. We care about evaluation of some event more though. Let's
+test that next.
+""")
+
+
+def print_thoughts_about_event_evaluation():
+    random.seed(42)
+
+    pc = PeriodCounter()
+
+    def noop():
+        pass
+
+    # Now, let's add some event-listeners. These are chosen to match a typical setup of quota for a given Issue or
+    # Project. In this setup, the limits for the smaller periods are set a bit higher than an even split of the
+    # monthly maximum would give. Why? We want some flexibility for bursts of activity without using up the entire
+    # budget for a longer period all at once.
+    pc.add_event_listener("day", 30, 10_000, noop, noop, initial_event_state=False)  # 1 month rolling window
+    pc.add_event_listener("hour", 24, 1_000, noop, noop, initial_event_state=False)  # 1 day rolling window
+    pc.add_event_listener("minute", 60, 200, noop, noop, initial_event_state=False)  # 1 hour rolling window
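+    # (To make the "a bit higher than an even split" above concrete, using only the numbers in this test: an even
+    # split of the 10_000/month budget is ~333/day, and an even split of the 1_000/day budget is ~42/hour; the
+    # limits of 1_000/day and 200/hour are therefore roughly 3x and 5x their proportional shares, which is where
+    # the headroom for bursts comes from.)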
+
+    # make sure the pc has some data before we start. we pick a 1-month period to match the listeners above.
+    for point in buckets_to_points_in_time(
+            generate_bursty_data(num_buckets=350, expected_nr_of_bursts=10),
+            datetime(2021, 10, 15, tzinfo=timezone.utc),
+            datetime(2021, 11, 15, 10, 5, tzinfo=timezone.utc),
+            10_000,
+            ):
+        pc.inc(point)
+
+    # now we start the test: we generate a bursty data-set for a 1-day period, and see how long it takes to evaluate
+    points = buckets_to_points_in_time(
+        generate_bursty_data(num_buckets=25, expected_nr_of_bursts=5),
+        datetime(2021, 11, 15, 10, 5, tzinfo=timezone.utc),
+        datetime(2021, 11, 16, 10, 5, tzinfo=timezone.utc),
+        1000)
+
+    with passed_time() as t:
+        for point in points:
+            pc.inc(point)
+
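+    # A quick sanity check on the "3% of the budget" claim below: ~30ms to process 1_000 inc() calls means that if
+    # 1_000 events were to arrive within a single second, the period-counter alone would eat up ~30ms of that
+    # second, i.e. ~3%. (This covers only inc() itself; whatever else happens per event is not measured here.)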
+    print(f"""## PeriodCounter.inc()
+
+1_000 iterations of PeriodCounter.inc() in {t.elapsed:.3f}ms. (when 3 event-listeners are active). I'm not sure exactly
+what a good performance would be here, but I can say the following: this means when 1,000 events happen in a second,
+the period-counter uses up 3% of the budget. A first guess would be: this is good enough.""")
+
+
+def print_thoughts_about_pc_registry():
+    # note: in load_performance_fixture we use minimal (non-data-containing) events; this may not be representative
+    # of real-world performance. having said that: this immediately triggers the thought that for real initialization
+    # only timestamps and issue_ids are needed, and that we should adjust the code accordingly
+
+    with passed_time() as t:
+        get_pc_registry()
+
+    print(f"""## get_pc_registry()
+
+getting the pc-registry takes {t.elapsed:.3f}ms with the default fixtures, which contain:
+
+* { Project.objects.count() } projects,
+* { Issue.objects.count() } issues,
+* { Event.objects.count() } events
+
+This means (surprisingly) we can take our eye off optimizing this particular part of the code (for now), because:
+
+* in the (expected) production setup, where we cut ingestion and handling into 2 parts, a 6s delay on the handling
+  server boot is fine.
+* in the debugserver (integrated ingestion/handling) we don't expect 100k events; and even if we did, a 6s delay on
+  the first event/request is fine.
+
+Ways forward once we do decide to improve:
+
+* regular saving of state (savepoint in time, with "unhandled after") (the regularity of saving is left as an exercise
+  to the reader)
+* more granular caching/loading, e.g. load per project/issue on demand
+""")
diff --git a/performance/management/commands/load_performance_fixture.py b/performance/management/commands/load_performance_fixture.py
new file mode 100644
index 0000000..e03b194
--- /dev/null
+++ b/performance/management/commands/load_performance_fixture.py
@@ -0,0 +1,61 @@
+from django.core.management.base import BaseCommand
+
+import uuid
+import random
+from datetime import datetime, timezone
+
+from django.conf import settings
+
+from performance.bursty_data import generate_bursty_data, buckets_to_points_in_time
+from projects.models import Project
+from issues.models import Issue
+from events.models import Event
+
+
+class Command(BaseCommand):
+    help = "..."
+
+    def handle(self, *args, **options):
+        if "performance" not in str(settings.DATABASES["default"]["NAME"]):
+            raise ValueError("This command should only be run on the performance-test database")
+
+        Project.objects.all().delete()
+        Issue.objects.all().delete()
+        Event.objects.all().delete()
+
+        # as a first approach, let's focus on a 'typical' (whatever that means) local setup (not hosted), for a small
+        # team. maybe 10 people would work on at most 10 projects. let's assume we have 10k per-project limits for
+        # events set up. and let's assume 100 issues per project (far from inbox-zero, approaching bug-sewer territory)
+        #
+        projects = [Project.objects.create(name="project %s" % i) for i in range(10)]
+        issues_by_project = {}
+
+        for p in projects:
+            issues_by_project[p.id] = [Issue.objects.create(project=p, hash="hash %d" % i) for i in range(100)]
+
+        # now we have 10 projects, each with 100 issues. let's create 10k events for each project.
+        for p in projects:
+            print("loading 10k events for project", p.name)
+            points = buckets_to_points_in_time(
+                generate_bursty_data(num_buckets=350, expected_nr_of_bursts=10),
+                datetime(2020, 10, 15, tzinfo=timezone.utc),
+                datetime(2021, 10, 15, 10, 5, tzinfo=timezone.utc),
+                10_000,
+            )
+
+            for i, point in enumerate(points):
+                if i % 1_000 == 0:
+                    print("loaded", i, "events")
+
+                # note: because we use such minimal (non-data-containing) events here, the setup below may not be
+                # representative of real-world performance.
+                Event.objects.create(
+                    project=p,
+                    issue=random.choice(issues_by_project[p.id]),
+                    server_side_timestamp=point,
+                    timestamp=point,
+                    event_id=uuid.uuid4().hex,
+                    has_exception=True,
+                    has_logentry=True,
+                    data="{}",
+                )
diff --git a/performance/out/some_script.txt b/performance/out/some_script.txt
index 330408c..3df834f 100644
--- a/performance/out/some_script.txt
+++ b/performance/out/some_script.txt
@@ -1,16 +1,37 @@
 ## _prev_tup()
 
-1_000 iterations of _prev_tup in 0.816ms. The main thing we care about is not this little
+1_000 iterations of _prev_tup in 0.832ms. The main thing we care about is not this little
 private helper though, but PeriodCounter.inc(). Let's test that next.
 
 
 ## PeriodCounter.inc()
 
-1_000 iterations of PeriodCounter.inc() in 7.766ms. We care about evaluation of some event more though. Let's
+1_000 iterations of PeriodCounter.inc() in 7.885ms. We care about evaluation of some event more though. Let's
 test that next.
 
 ## PeriodCounter.inc()
 
-1_000 iterations of PeriodCounter.inc() in 29.593ms. (when 3 event-listeners are active). I'm not sure exactly
+1_000 iterations of PeriodCounter.inc() in 29.567ms. (when 3 event-listeners are active). I'm not sure exactly
 what a good performance would be here, but I can say the following: this means when a 1,000 events happen in a second,
 the period-counter uses up 3% of the budget. A first guess would be: this is good enough.
+## get_pc_registry()
+
+getting the pc-registry takes 6615.371ms with the default fixtures, which contain:
+
+* 10 projects,
+* 1000 issues,
+* 100000 events
+
+This means (surprisingly) we can take our eye off optimizing this particular part of the code (for now), because:
+
+* in the (expected) production setup, where we cut ingestion and handling into 2 parts, a 6s delay on the handling
+  server boot is fine.
+* in the debugserver (integrated ingestion/handling) we don't expect 100k events; and even if we did, a 6s delay on
+  the first event/request is fine.
+
+Ways forward once we do decide to improve:
+
+* regular saving of state (savepoint in time, with "unhandled after") (the regularity of saving is left as an exercise
+  to the reader)
+* more granular caching/loading, e.g. load per project/issue on demand
+
diff --git a/performance/some_script.py b/performance/some_script.py
index 9c9d6e0..88c97c1 100644
--- a/performance/some_script.py
+++ b/performance/some_script.py
@@ -5,6 +5,9 @@ from datetime import datetime, timezone
 from bugsink.period_counter import _prev_tup, PeriodCounter
 from performance.bursty_data import generate_bursty_data, buckets_to_points_in_time
+from projects.models import Project
+from issues.models import Issue
+from events.models import Event
 
 
 # this file is the beginning of an approach to getting a handle on performance.
 
@@ -109,6 +112,34 @@ what a good performance would be here, but I can say the following: this means w
 the period-counter uses up 3% of the budget. A first guess would be: this is good enough.""")
 
 
+def print_thoughts_about_pc_registry():
+    # as a first approach, let's focus on a 'typical' (whatever that means) local setup (not hosted), for a small
+    # team. maybe 10 people would work on at most 10 projects. let's assume we have 10k per-project limits for
+    # events set up. and let's assume 100 issues per project (far from inbox-zero, approaching bug-sewer territory)
+    #
+    projects = [Project.objects.create(name="project %s" % i) for i in range(10)]
+    issues_by_project = {}
+
+    for p in projects:
+        issues_by_project[p.id] = [Issue.objects.create(project=p, hash="hash %d" % i) for i in range(100)]
+
+    # now we have 10 projects, each with 100 issues. let's create 10k events for each project.
+    for p in projects:
+        points = buckets_to_points_in_time(
+            generate_bursty_data(num_buckets=350, expected_nr_of_bursts=10),
+            datetime(2020, 10, 15, tzinfo=timezone.utc),
+            datetime(2021, 10, 15, 10, 5, tzinfo=timezone.utc),
+            10_000,
+        )
+
+        for point in points:
+            # note: because we use such minimal (non-data-containing) events here, the setup below may not be
+            # representative of real-world performance. having said that: this immediately triggers the thought
+            # that for real initialization only timestamps and issue_ids are needed, and that we should adjust
+            # the code accordingly
+            Event.objects.create(project=p, issue=random.choice(issues_by_project[p.id]), server_side_timestamp=point)
+
+
 print_thoughts_about_prev_tup()
 print_thoughts_about_inc()
 print_thoughts_about_event_evaluation()