From 53d4be81833425f81084cb7386c37609faba9cf3 Mon Sep 17 00:00:00 2001 From: Klaas van Schelven Date: Tue, 22 Apr 2025 22:08:53 +0200 Subject: [PATCH] Fix 'different_runtime_limit' race conditions This commit fixes 3 related issues with the way runtime_limit was administered; which could lead to race conditions (and hence: the wrong runtime_limit applying at some point in time). Post-fix, the following holds: 1. We use thread_locals to store this info, since there are at least 2 sources of threaded code that touch this (snappea's workers and the django debugserver) 2. We distinguish between the "from connection settings" timeout and the "temporarily overridden" ones, since we cannot assume connection-initialization happens first (as per the comment in base.py) 3. We store runtime-limits per alias ('using'). Needed for [2] (each connection may have a different moment-of-initialization, clobbering CM-set values from the other connection) and also needed once you realize there may be different defaults for the timeouts. General context: I've recently started introducing the 'different runtime' helper quite a bit more; and across connections (snappea!), which created more and more doubts as to it actually working as advertised. Thoughts on "using" being required. I used to think "you can reason about a global timeout value, and the current transaction makes clear what you're actually doing", but as per the notes above that doesn't really work. Thoughts on reproducing: A few thoughts/notes on reproducing problems with race conditions. Basic note: that's always hairy. So in the end I settled on a solution that's hopefully easy to reason about, even if it's verbose. When I started work on this commit, I focussed on thread-safety; "proving the problem" consisted of F5/^R on a web page with 2 context managers with different timeouts, hoping to show that the stack unrolling didn't work properly. 
However, during those "tests" I noticed quite a few resets-to-5s (from the connection defaults), which prompted fix [2] from above. --- bugsink/context_processors.py | 2 +- bugsink/settings/development.py | 6 ++- bugsink/timed_sqlite_backend/base.py | 69 ++++++++++++++++++++++------ bugsink/views.py | 4 +- snappea/stats.py | 2 +- 5 files changed, 65 insertions(+), 18 deletions(-) diff --git a/bugsink/context_processors.py b/bugsink/context_processors.py index 76679c6..7093b8f 100644 --- a/bugsink/context_processors.py +++ b/bugsink/context_processors.py @@ -40,7 +40,7 @@ def get_snappea_warnings(): return [] with durable_atomic(using="snappea"): - with different_runtime_limit(0.1): + with different_runtime_limit(0.1, using="snappea"): try: task_count = Task.objects.all().count() except OperationalError as e: diff --git a/bugsink/settings/development.py b/bugsink/settings/development.py index 03bafdc..b6b23dc 100644 --- a/bugsink/settings/development.py +++ b/bugsink/settings/development.py @@ -64,8 +64,12 @@ elif os.getenv("DB", "sqlite") == "sqlite": # store databases. DATABASES["default"]["NAME"] = BASE_DIR / 'db.sqlite3' DATABASES["default"]["TEST"]["NAME"] = BASE_DIR / 'test.sqlite3' - DATABASES["default"]["OPTIONS"]["query_timeout"] = 0.1 # canary config: fail-fast in development. + DATABASES["default"]["OPTIONS"]["query_timeout"] = 0.11 # canary config: fail-fast in development. + DATABASES["snappea"]["NAME"] = BASE_DIR / 'snappea.sqlite3' + # canary config: fail-fast. slightly distinct value from the above to allow for easier debugging of the timeout + # mechanism itself. (i.e. 
to eyeball where the value came from) + DATABASES["snappea"]["OPTIONS"]["query_timeout"] = 0.12 else: raise ValueError("Unknown DB", os.getenv("DB")) diff --git a/bugsink/timed_sqlite_backend/base.py b/bugsink/timed_sqlite_backend/base.py index 4cca055..df76a4e 100644 --- a/bugsink/timed_sqlite_backend/base.py +++ b/bugsink/timed_sqlite_backend/base.py @@ -1,33 +1,78 @@ +from collections import namedtuple from copy import deepcopy import time from contextlib import contextmanager from django.conf import settings +from threading import local from django.db import DEFAULT_DB_ALIAS from django.db.backends.sqlite3.base import ( DatabaseWrapper as UnpatchedDatabaseWrapper, SQLiteCursorWrapper as UnpatchedSQLiteCursorWrapper, ) +# We distinguish between the default runtime limit for a connection (set in the settings) and a runtime limit set by the +# "with different_runtime_limit" idiom, i.e. temporarily. The reason we need to distinguish these two concepts (and keep +# track of their values) explicitly, and provide the fallback getter mechanism (cm if available, otherwise +# connection-default) rather than have the program's stack determine this implicitly, is that we do not generally know +# the moment the connection default value is set. It's set when a connection is needed, which may in fact be _after_ +# some CMs have already been called. 
(For nested CMs we do not have this problem, so we just keep track of "old" values +# inside the CMs as they live on the Python stack) +Runtimes = namedtuple("Runtimes", ["default_for_connection", "override_by_cm"]) +NoneRuntimes = Runtimes(None, None) -_runtime_limit = 5.0 +thread_locals = local() + + +def _set_runtime_limit(using, is_default_for_connection, seconds): + if using is None: + using = DEFAULT_DB_ALIAS + + if not hasattr(thread_locals, "runtime_limits"): + thread_locals.runtime_limits = {} + + if is_default_for_connection: + thread_locals.runtime_limits[using] = Runtimes( + default_for_connection=seconds, + override_by_cm=thread_locals.runtime_limits.get(using, NoneRuntimes).override_by_cm, + ) + else: + thread_locals.runtime_limits[using] = Runtimes( + default_for_connection=thread_locals.runtime_limits.get(using, NoneRuntimes).default_for_connection, + override_by_cm=seconds, + ) + + +def _get_runtime_limit(using=None): + if using is None: + using = DEFAULT_DB_ALIAS + + if not hasattr(thread_locals, "runtime_limits"): + # somewhat overly defensive, since you'd always pass through the DatabaseWrapper which sets at least once. + thread_locals.runtime_limits = {} + + tup = thread_locals.runtime_limits.get(using, NoneRuntimes) + return tup.override_by_cm if tup.override_by_cm is not None else tup.default_for_connection def allow_long_running_queries(): """Set a global flag to allow long-running queries. Useful for one-off commands, where slowness is expected.""" - global _runtime_limit - _runtime_limit = float("inf") + # we use is_default_for_connection=False here, to make this act like a "context manager which doesn't reset", i.e. + # what we do here takes precedence over the connection default value. 
+ _set_runtime_limit(using=None, is_default_for_connection=False, seconds=float("inf")) @contextmanager -def different_runtime_limit(seconds): - global _runtime_limit - old = _runtime_limit - _runtime_limit = seconds +def different_runtime_limit(seconds, using=None): + if using is None: + using = DEFAULT_DB_ALIAS + + old = _get_runtime_limit(using=using) + _set_runtime_limit(using=using, is_default_for_connection=False, seconds=seconds) try: yield finally: - _runtime_limit = old + _set_runtime_limit(using=using, is_default_for_connection=False, seconds=old) @contextmanager @@ -35,7 +80,7 @@ def limit_runtime(conn): start = time.time() def check_time(): - if time.time() > start + _runtime_limit: + if time.time() > start + _get_runtime_limit(): return 1 return 0 @@ -74,11 +119,9 @@ class PrintOnClose(object): class DatabaseWrapper(UnpatchedDatabaseWrapper): def __init__(self, settings_dict, alias=DEFAULT_DB_ALIAS): - global _runtime_limit settings_dict = deepcopy(settings_dict) - # clobbering of the global var should not be a problem, because connections are single-threaded in Django - configured_runtime_limit = settings_dict.get("OPTIONS").pop("query_timeout", 5.0) - _runtime_limit = configured_runtime_limit + configured_runtime_limit = settings_dict.get("OPTIONS", {}).pop("query_timeout", 5.0) + _set_runtime_limit(using=alias, is_default_for_connection=True, seconds=configured_runtime_limit) super().__init__(settings_dict, alias=alias) diff --git a/bugsink/views.py b/bugsink/views.py index 4858bbb..475844e 100644 --- a/bugsink/views.py +++ b/bugsink/views.py @@ -182,8 +182,8 @@ def counts(request): # when you have some 7 - 10 models (tag-related, events, issues) that can have many instances, spending max .3 on # each before giving up would seem reasonable to stay below th 5s limit; the rest is via the caches anyway. 
- with different_runtime_limit(0.3): - for app_label in interesting_apps: + for app_label in interesting_apps: + with different_runtime_limit(0.3, using="snappea" if app_label == "snappea" else "default"): counts[app_label] = {} app_config = apps.apps.get_app_config(app_label) for model in app_config.get_models(): diff --git a/snappea/stats.py b/snappea/stats.py index 53723e2..491f580 100644 --- a/snappea/stats.py +++ b/snappea/stats.py @@ -107,7 +107,7 @@ class Stats: with immediate_atomic(using="snappea"): # explicit is better than impl.; and we combine read/write here # having stats is great, but I don't want to hog task-processing too long (which would happen # precisely when the backlog grows large) - with different_runtime_limit(0.1): + with different_runtime_limit(0.1, using="snappea"): try: task_counts = Task.objects.values("task_name").annotate(count=Count("task_name")) except OperationalError as e: