Fix 'different_runtime_limit' race conditions
This commit fixes 3 related issues with the way runtime_limit was administered,
which could lead to race conditions (and hence: the wrong runtime_limit
applying at some point in time). Post-fix, the following holds (a usage sketch
follows the list):
1. We use thread_locals to store this info, since there are at least 2 sources of
threaded code that touch this (snappea's workers and the django debugserver)
2. We distinguish between the "from connection settings" timeout and the
"temporarily overridden" ones, since we cannot assume
connection-initialization happens first (as per the comment in base.py)
3. We store runtime-limits per alias ('using'). Needed for [2] (each connection
may have a different moment-of-initialization, clobbering CM-set values from
the other connection) and also needed once you realize there may be
different defaults for the timeouts.
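As a usage sketch of the result (mine, not part of this commit; the import path is
hypothetical, since the helpers live in Bugsink's patched sqlite backend module whose
path isn't shown here): nested limits on different aliases no longer interfere, because
each alias keeps its own (connection-default, cm-override) pair per thread.

    # hypothetical import path; the helpers themselves exist in the diff below
    from patched_sqlite_backend import different_runtime_limit, _get_runtime_limit

    with different_runtime_limit(0.3):                        # overrides the "default" alias only
        with different_runtime_limit(0.1, using="snappea"):   # overrides the "snappea" alias only
            assert _get_runtime_limit() == 0.3
            assert _get_runtime_limit(using="snappea") == 0.1
        # on exit the "snappea" override is rolled back; the effective limit for that alias
        # is whatever it was before entering (typically its connection default)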
General context: I've recently started using the 'different runtime' helper
quite a bit more, and across connections (snappea!), which created more and
more doubt as to whether it actually works as advertised.
Thoughts on "using" being required. I used to think "you can reason about a
global timeout value, and the current transaction makes clear what you're
actually doing", but as per the notes above that doesn't really work.
Thoughts on reproducing:
Reproducing race conditions is always hairy. So in the end I settled on a
solution that's hopefully easy to reason about, even if it's verbose.
When I started work on this commit, I focussed on thread-safety; "proving the
problem" consisted of F5/^R on a web page with 2 context managers with different
timeouts, hoping to show that the stack unrolling didn't work properly.
However, during those "tests" I noticed quite a few resets-to-5s (from the
connection defaults), which prompted fix [2] from above.
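For the thread-safety part specifically, the property being relied on can be spelled out
as a small check (my sketch, not the commit's actual test; import path hypothetical as above):

    import threading

    from patched_sqlite_backend import different_runtime_limit, _get_runtime_limit  # hypothetical path

    seen = {}

    def worker(name, seconds):
        # each thread has its own thread_locals.runtime_limits, so concurrent CMs cannot clobber each other
        with different_runtime_limit(seconds):
            seen[name] = _get_runtime_limit()

    threads = [threading.Thread(target=worker, args=("a", 0.1)),
               threading.Thread(target=worker, args=("b", 0.2))]
    for t in threads:
        t.start()
    for t in threads:
        t.join()

    assert seen == {"a": 0.1, "b": 0.2}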
@@ -40,7 +40,7 @@ def get_snappea_warnings():
         return []
 
     with durable_atomic(using="snappea"):
-        with different_runtime_limit(0.1):
+        with different_runtime_limit(0.1, using="snappea"):
             try:
                 task_count = Task.objects.all().count()
             except OperationalError as e:
@@ -64,8 +64,12 @@ elif os.getenv("DB", "sqlite") == "sqlite":
     # store databases.
     DATABASES["default"]["NAME"] = BASE_DIR / 'db.sqlite3'
     DATABASES["default"]["TEST"]["NAME"] = BASE_DIR / 'test.sqlite3'
-    DATABASES["default"]["OPTIONS"]["query_timeout"] = 0.1  # canary config: fail-fast in development.
+    DATABASES["default"]["OPTIONS"]["query_timeout"] = 0.11  # canary config: fail-fast in development.
+
     DATABASES["snappea"]["NAME"] = BASE_DIR / 'snappea.sqlite3'
+    # canary config: fail-fast. slightly distinct value from the above to allow for easier debugging of the timeout
+    # mechanism itself. (i.e. to eyeball where the value came from)
+    DATABASES["snappea"]["OPTIONS"]["query_timeout"] = 0.12
 
 else:
     raise ValueError("Unknown DB", os.getenv("DB"))
@@ -1,33 +1,78 @@
+from collections import namedtuple
 from copy import deepcopy
 import time
 from contextlib import contextmanager
 from django.conf import settings
+from threading import local
 
 from django.db import DEFAULT_DB_ALIAS
 from django.db.backends.sqlite3.base import (
     DatabaseWrapper as UnpatchedDatabaseWrapper, SQLiteCursorWrapper as UnpatchedSQLiteCursorWrapper,
 )
 
+# We distinguish between the default runtime limit for a connection (set in the settings) and a runtime limit set by the
+# "with different_runtime_limit" idiom, i.e. temporarily. The reason we need to distinguish these two concepts (and keep
+# track of their values) explicitly, and provide the fallback getter mechanism (cm if available, otherwise
+# connection-default) rather than have the program's stack determine this implicitly, is that we do not generally know
+# the moment the connection default value is set. It's set when a connection is needed, which may in fact be _after_
+# some CMs have already been called. (For nested CMs we do not have this problem, so we just keep track of "old" values
+# inside the CMs as they live on the Python stack)
+Runtimes = namedtuple("Runtimes", ["default_for_connection", "override_by_cm"])
+NoneRuntimes = Runtimes(None, None)
 
-_runtime_limit = 5.0
+thread_locals = local()
 
 
+def _set_runtime_limit(using, is_default_for_connection, seconds):
+    if using is None:
+        using = DEFAULT_DB_ALIAS
+
+    if not hasattr(thread_locals, "runtime_limits"):
+        thread_locals.runtime_limits = {}
+
+    if is_default_for_connection:
+        thread_locals.runtime_limits[using] = Runtimes(
+            default_for_connection=seconds,
+            override_by_cm=thread_locals.runtime_limits.get(using, NoneRuntimes).override_by_cm,
+        )
+    else:
+        thread_locals.runtime_limits[using] = Runtimes(
+            default_for_connection=thread_locals.runtime_limits.get(using, NoneRuntimes).default_for_connection,
+            override_by_cm=seconds,
+        )
+
+
+def _get_runtime_limit(using=None):
+    if using is None:
+        using = DEFAULT_DB_ALIAS
+
+    if not hasattr(thread_locals, "runtime_limits"):
+        # somewhat overly defensive, since you'd always pass through the DatabaseWrapper which sets at least once.
+        thread_locals.runtime_limits = {}
+
+    tup = thread_locals.runtime_limits.get(using, NoneRuntimes)
+    return tup.override_by_cm if tup.override_by_cm is not None else tup.default_for_connection
+
+
 def allow_long_running_queries():
     """Set a global flag to allow long-running queries. Useful for one-off commands, where slowness is expected."""
-    global _runtime_limit
-    _runtime_limit = float("inf")
+    # we use is_default_for_connection=False here, to make this act like a "context manager which doesn't reset", i.e.
+    # what we do here takes precedence over the connection default value.
+    _set_runtime_limit(using=None, is_default_for_connection=False, seconds=float("inf"))
 
 
 @contextmanager
-def different_runtime_limit(seconds):
-    global _runtime_limit
-    old = _runtime_limit
-    _runtime_limit = seconds
+def different_runtime_limit(seconds, using=None):
+    if using is None:
+        using = DEFAULT_DB_ALIAS
+
+    old = _get_runtime_limit(using=using)
+    _set_runtime_limit(using=using, is_default_for_connection=False, seconds=seconds)
 
     try:
         yield
     finally:
-        _runtime_limit = old
+        _set_runtime_limit(using=using, is_default_for_connection=False, seconds=old)
 
 
 @contextmanager
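To make point [2] from the commit message concrete: a connection default that arrives after
a CM-style override must not clobber it, and the default should only become visible once the
override is lifted. A small illustration calling the helpers above directly (my sketch, not a
test from this commit):

    # the override arrives first, before the connection for this alias has been initialized
    _set_runtime_limit(using="snappea", is_default_for_connection=False, seconds=0.1)
    assert _get_runtime_limit(using="snappea") == 0.1

    # the connection default arrives later (from DatabaseWrapper.__init__); the override wins
    _set_runtime_limit(using="snappea", is_default_for_connection=True, seconds=0.12)
    assert _get_runtime_limit(using="snappea") == 0.1

    # lifting the override (which is what the CM's finally-block does when there was no
    # previous override) lets the connection default through
    _set_runtime_limit(using="snappea", is_default_for_connection=False, seconds=None)
    assert _get_runtime_limit(using="snappea") == 0.12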
@@ -35,7 +80,7 @@ def limit_runtime(conn):
     start = time.time()
 
     def check_time():
-        if time.time() > start + _runtime_limit:
+        if time.time() > start + _get_runtime_limit():
             return 1
 
         return 0
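check_time looks like a progress-handler callback: a non-zero return makes sqlite abort the
running statement, which then surfaces as OperationalError. Presumably limit_runtime wires it
up via conn.set_progress_handler in code this hunk doesn't show. The same mechanism,
illustrated standalone with the stdlib sqlite3 module (my sketch, not Bugsink code):

    import sqlite3
    import time

    conn = sqlite3.connect(":memory:")
    start = time.time()

    # invoked every ~1000 sqlite VM instructions; a non-zero return aborts the statement
    conn.set_progress_handler(lambda: 1 if time.time() > start + 0.1 else 0, 1000)

    try:
        # a deliberately endless query; it gets cut off after roughly 0.1s
        conn.execute("WITH RECURSIVE t(n) AS (SELECT 1 UNION ALL SELECT n + 1 FROM t) SELECT count(*) FROM t")
    except sqlite3.OperationalError:
        print("aborted by the progress handler")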
@@ -74,11 +119,9 @@ class PrintOnClose(object):
 class DatabaseWrapper(UnpatchedDatabaseWrapper):
 
     def __init__(self, settings_dict, alias=DEFAULT_DB_ALIAS):
-        global _runtime_limit
         settings_dict = deepcopy(settings_dict)
-        # clobbering of the global var should not be a problem, because connections are single-threaded in Django
-        configured_runtime_limit = settings_dict.get("OPTIONS").pop("query_timeout", 5.0)
-        _runtime_limit = configured_runtime_limit
+        configured_runtime_limit = settings_dict.get("OPTIONS", {}).pop("query_timeout", 5.0)
+        _set_runtime_limit(using=alias, is_default_for_connection=True, seconds=configured_runtime_limit)
 
         super().__init__(settings_dict, alias=alias)
 
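Combined with the canary settings from the DATABASES hunk above, the per-alias registration
means each alias ends up with its own default on the thread that instantiated its connection.
Roughly (my sketch; it assumes Django settings are configured and both wrappers have been
instantiated on this thread):

    from django.db import connections

    # resolving an alias instantiates the (patched) DatabaseWrapper for it on this thread,
    # which registers that alias' default via _set_runtime_limit(..., is_default_for_connection=True, ...)
    connections["default"]
    connections["snappea"]

    assert _get_runtime_limit(using="default") == 0.11   # DATABASES["default"]["OPTIONS"]["query_timeout"]
    assert _get_runtime_limit(using="snappea") == 0.12   # DATABASES["snappea"]["OPTIONS"]["query_timeout"]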
@@ -182,8 +182,8 @@ def counts(request):
 
     # when you have some 7 - 10 models (tag-related, events, issues) that can have many instances, spending max .3 on
     # each before giving up would seem reasonable to stay below the 5s limit; the rest is via the caches anyway.
-    with different_runtime_limit(0.3):
-        for app_label in interesting_apps:
+    for app_label in interesting_apps:
+        with different_runtime_limit(0.3, using="snappea" if app_label == "snappea" else "default"):
             counts[app_label] = {}
             app_config = apps.apps.get_app_config(app_label)
             for model in app_config.get_models():
@@ -107,7 +107,7 @@ class Stats:
         with immediate_atomic(using="snappea"):  # explicit is better than implicit; and we combine read/write here
             # having stats is great, but I don't want to hog task-processing too long (which would happen
             # precisely when the backlog grows large)
-            with different_runtime_limit(0.1):
+            with different_runtime_limit(0.1, using="snappea"):
                 try:
                     task_counts = Task.objects.values("task_name").annotate(count=Count("task_name"))
                 except OperationalError as e:
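Both call sites in this commit (the snappea warning near the top and this Stats query) follow
the same guarded-read pattern; spelled out in full it looks roughly like this (the except body
is my sketch, since the diff truncates right after the except line; Task and
different_runtime_limit are imported as in the files touched above):

    from django.db import OperationalError

    with different_runtime_limit(0.1, using="snappea"):
        try:
            task_count = Task.objects.all().count()
        except OperationalError:
            # the runtime limit tripped; degrade gracefully instead of blocking on a busy snappea DB
            task_count = None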