Fix 'different_runtime_limit' race conditions
This commit fixes 3 related issues with the way runtime_limit was administered,
which could lead to race conditions (and hence: the wrong runtime_limit
applying at some point in time). Post-fix, the following holds (a usage sketch
follows the list):
1. We use thread_locals to store this info, since there are at least 2 sources of
threaded code that touch this (snappea's workers and the django debugserver)
2. We distinguish between the "from connection settings" timeout and the
"temporarily overridden" ones, since we cannot assume
connection-initialization happens first (as per the comment in base.py)
3. We store runtime-limits per alias ('using'). Needed for [2] (each connection
may have a different moment-of-initialization, clobbering CM-set values from
the other connection) and also needed once you realize there may be
different defaults for the timeouts.
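As a usage sketch of the result (mine, not part of this commit; the import path is
hypothetical, since the helpers live in Bugsink's patched sqlite backend module whose
path isn't shown here): nested limits on different aliases no longer interfere, because
each alias keeps its own (connection-default, cm-override) pair per thread.

    # hypothetical import path; the helpers themselves exist in the diff below
    from patched_sqlite_backend import different_runtime_limit, _get_runtime_limit

    with different_runtime_limit(0.3):                        # overrides the "default" alias only
        with different_runtime_limit(0.1, using="snappea"):   # overrides the "snappea" alias only
            assert _get_runtime_limit() == 0.3
            assert _get_runtime_limit(using="snappea") == 0.1
        # on exit the "snappea" override is rolled back; the effective limit for that alias
        # is whatever it was before entering (typically its connection default)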
General context: I've recently started using the 'different runtime' helper
quite a bit more, and across connections (snappea!), which created more and
more doubt as to whether it actually works as advertised.
Thoughts on "using" being required. I used to think "you can reason about a
global timeout value, and the current transaction makes clear what you're
actually doing", but as per the notes above that doesn't really work.
Thoughts on reproducing:
Reproducing race conditions is always hairy. So in the end I settled on a
solution that's hopefully easy to reason about, even if it's verbose.
When I started work on this commit, I focussed on thread-safety; "proving the
problem" consisted of F5/^R on a web page with 2 context managers with different
timeouts, hoping to show that the stack unrolling didn't work properly.
However, during those "tests" I noticed quite a few resets-to-5s (from the
connection defaults), which prompted fix [2] from above.
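For the thread-safety part specifically, the property being relied on can be spelled out
as a small check (my sketch, not the commit's actual test; import path hypothetical as above):

    import threading

    from patched_sqlite_backend import different_runtime_limit, _get_runtime_limit  # hypothetical path

    seen = {}

    def worker(name, seconds):
        # each thread has its own thread_locals.runtime_limits, so concurrent CMs cannot clobber each other
        with different_runtime_limit(seconds):
            seen[name] = _get_runtime_limit()

    threads = [threading.Thread(target=worker, args=("a", 0.1)),
               threading.Thread(target=worker, args=("b", 0.2))]
    for t in threads:
        t.start()
    for t in threads:
        t.join()

    assert seen == {"a": 0.1, "b": 0.2}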
@@ -40,7 +40,7 @@ def get_snappea_warnings():
         return []
 
     with durable_atomic(using="snappea"):
-        with different_runtime_limit(0.1):
+        with different_runtime_limit(0.1, using="snappea"):
             try:
                 task_count = Task.objects.all().count()
             except OperationalError as e:
@@ -64,8 +64,12 @@ elif os.getenv("DB", "sqlite") == "sqlite":
     # store databases.
     DATABASES["default"]["NAME"] = BASE_DIR / 'db.sqlite3'
     DATABASES["default"]["TEST"]["NAME"] = BASE_DIR / 'test.sqlite3'
-    DATABASES["default"]["OPTIONS"]["query_timeout"] = 0.1  # canary config: fail-fast in development.
+    DATABASES["default"]["OPTIONS"]["query_timeout"] = 0.11  # canary config: fail-fast in development.
+
     DATABASES["snappea"]["NAME"] = BASE_DIR / 'snappea.sqlite3'
+    # canary config: fail-fast. slightly distinct value from the above to allow for easier debugging of the timeout
+    # mechanism itself. (i.e. to eyeball where the value came from)
+    DATABASES["snappea"]["OPTIONS"]["query_timeout"] = 0.12
 
 else:
     raise ValueError("Unknown DB", os.getenv("DB"))
@@ -1,33 +1,78 @@
+from collections import namedtuple
 from copy import deepcopy
 import time
 from contextlib import contextmanager
 from django.conf import settings
+from threading import local
 
 from django.db import DEFAULT_DB_ALIAS
 from django.db.backends.sqlite3.base import (
     DatabaseWrapper as UnpatchedDatabaseWrapper, SQLiteCursorWrapper as UnpatchedSQLiteCursorWrapper,
 )
 
+# We distinguish between the default runtime limit for a connection (set in the settings) and a runtime limit set by the
+# "with different_runtime_limit" idiom, i.e. temporarily. The reason we need to distinguish these two concepts (and keep
+# track of their values) explicitly, and provide the fallback getter mechanism (cm if available, otherwise
+# connection-default) rather than have the program's stack determine this implicitly, is that we do not generally know
+# the moment the connection default value is set. It's set when a connection is needed, which may in fact be _after_
+# some CMs have already been called. (For nested CMs we do not have this problem, so we just keep track of "old" values
+# inside the CMs as they live on the Python stack)
+Runtimes = namedtuple("Runtimes", ["default_for_connection", "override_by_cm"])
+NoneRuntimes = Runtimes(None, None)
 
-_runtime_limit = 5.0
+thread_locals = local()
 
 
+def _set_runtime_limit(using, is_default_for_connection, seconds):
+    if using is None:
+        using = DEFAULT_DB_ALIAS
+
+    if not hasattr(thread_locals, "runtime_limits"):
+        thread_locals.runtime_limits = {}
+
+    if is_default_for_connection:
+        thread_locals.runtime_limits[using] = Runtimes(
+            default_for_connection=seconds,
+            override_by_cm=thread_locals.runtime_limits.get(using, NoneRuntimes).override_by_cm,
+        )
+    else:
+        thread_locals.runtime_limits[using] = Runtimes(
+            default_for_connection=thread_locals.runtime_limits.get(using, NoneRuntimes).default_for_connection,
+            override_by_cm=seconds,
+        )
+
+
+def _get_runtime_limit(using=None):
+    if using is None:
+        using = DEFAULT_DB_ALIAS
+
+    if not hasattr(thread_locals, "runtime_limits"):
+        # somewhat overly defensive, since you'd always pass through the DatabaseWrapper which sets at least once.
+        thread_locals.runtime_limits = {}
+
+    tup = thread_locals.runtime_limits.get(using, NoneRuntimes)
+    return tup.override_by_cm if tup.override_by_cm is not None else tup.default_for_connection
+
+
 def allow_long_running_queries():
     """Set a global flag to allow long-running queries. Useful for one-off commands, where slowness is expected."""
-    global _runtime_limit
-    _runtime_limit = float("inf")
+    # we use is_default_for_connection=False here, to make this act like a "context manager which doesn't reset", i.e.
+    # what we do here takes precedence over the connection default value.
+    _set_runtime_limit(using=None, is_default_for_connection=False, seconds=float("inf"))
 
 
 @contextmanager
-def different_runtime_limit(seconds):
-    global _runtime_limit
-    old = _runtime_limit
-    _runtime_limit = seconds
+def different_runtime_limit(seconds, using=None):
+    if using is None:
+        using = DEFAULT_DB_ALIAS
+
+    old = _get_runtime_limit(using=using)
+    _set_runtime_limit(using=using, is_default_for_connection=False, seconds=seconds)
 
     try:
         yield
     finally:
-        _runtime_limit = old
+        _set_runtime_limit(using=using, is_default_for_connection=False, seconds=old)
 
 
 @contextmanager
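To make point [2] from the commit message concrete: a connection default that arrives after
a CM-style override must not clobber it, and the default should only become visible once the
override is lifted. A small illustration calling the helpers above directly (my sketch, not a
test from this commit):

    # the override arrives first, before the connection for this alias has been initialized
    _set_runtime_limit(using="snappea", is_default_for_connection=False, seconds=0.1)
    assert _get_runtime_limit(using="snappea") == 0.1

    # the connection default arrives later (from DatabaseWrapper.__init__); the override wins
    _set_runtime_limit(using="snappea", is_default_for_connection=True, seconds=0.12)
    assert _get_runtime_limit(using="snappea") == 0.1

    # lifting the override (which is what the CM's finally-block does when there was no
    # previous override) lets the connection default through
    _set_runtime_limit(using="snappea", is_default_for_connection=False, seconds=None)
    assert _get_runtime_limit(using="snappea") == 0.12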
@@ -35,7 +80,7 @@ def limit_runtime(conn):
     start = time.time()
 
     def check_time():
-        if time.time() > start + _runtime_limit:
+        if time.time() > start + _get_runtime_limit():
             return 1
 
         return 0
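check_time looks like a progress-handler callback: a non-zero return makes sqlite abort the
running statement, which then surfaces as OperationalError. Presumably limit_runtime wires it
up via conn.set_progress_handler in code this hunk doesn't show. The same mechanism,
illustrated standalone with the stdlib sqlite3 module (my sketch, not Bugsink code):

    import sqlite3
    import time

    conn = sqlite3.connect(":memory:")
    start = time.time()

    # invoked every ~1000 sqlite VM instructions; a non-zero return aborts the statement
    conn.set_progress_handler(lambda: 1 if time.time() > start + 0.1 else 0, 1000)

    try:
        # a deliberately endless query; it gets cut off after roughly 0.1s
        conn.execute("WITH RECURSIVE t(n) AS (SELECT 1 UNION ALL SELECT n + 1 FROM t) SELECT count(*) FROM t")
    except sqlite3.OperationalError:
        print("aborted by the progress handler")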
@@ -74,11 +119,9 @@ class PrintOnClose(object):
 class DatabaseWrapper(UnpatchedDatabaseWrapper):
 
     def __init__(self, settings_dict, alias=DEFAULT_DB_ALIAS):
-        global _runtime_limit
         settings_dict = deepcopy(settings_dict)
-        # clobbering of the global var should not be a problem, because connections are single-threaded in Django
-        configured_runtime_limit = settings_dict.get("OPTIONS").pop("query_timeout", 5.0)
-        _runtime_limit = configured_runtime_limit
+        configured_runtime_limit = settings_dict.get("OPTIONS", {}).pop("query_timeout", 5.0)
+        _set_runtime_limit(using=alias, is_default_for_connection=True, seconds=configured_runtime_limit)
 
         super().__init__(settings_dict, alias=alias)
 
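Combined with the canary settings from the DATABASES hunk above, the per-alias registration
means each alias ends up with its own default on the thread that instantiated its connection.
Roughly (my sketch; it assumes Django settings are configured and both wrappers have been
instantiated on this thread):

    from django.db import connections

    # resolving an alias instantiates the (patched) DatabaseWrapper for it on this thread,
    # which registers that alias' default via _set_runtime_limit(..., is_default_for_connection=True, ...)
    connections["default"]
    connections["snappea"]

    assert _get_runtime_limit(using="default") == 0.11   # DATABASES["default"]["OPTIONS"]["query_timeout"]
    assert _get_runtime_limit(using="snappea") == 0.12   # DATABASES["snappea"]["OPTIONS"]["query_timeout"]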
@@ -182,8 +182,8 @@ def counts(request):
 
     # when you have some 7 - 10 models (tag-related, events, issues) that can have many instances, spending max .3 on
     # each before giving up would seem reasonable to stay below the 5s limit; the rest is via the caches anyway.
-    with different_runtime_limit(0.3):
-        for app_label in interesting_apps:
+    for app_label in interesting_apps:
+        with different_runtime_limit(0.3, using="snappea" if app_label == "snappea" else "default"):
             counts[app_label] = {}
             app_config = apps.apps.get_app_config(app_label)
             for model in app_config.get_models():
@@ -107,7 +107,7 @@ class Stats:
         with immediate_atomic(using="snappea"):  # explicit is better than implicit; and we combine read/write here
             # having stats is great, but I don't want to hog task-processing too long (which would happen
             # precisely when the backlog grows large)
-            with different_runtime_limit(0.1):
+            with different_runtime_limit(0.1, using="snappea"):
                 try:
                     task_counts = Task.objects.values("task_name").annotate(count=Count("task_name"))
                 except OperationalError as e:
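Both call sites in this commit (the snappea warning near the top and this Stats query) follow
the same guarded-read pattern; spelled out in full it looks roughly like this (the except body
is my sketch, since the diff truncates right after the except line; Task and
different_runtime_limit are imported as in the files touched above):

    from django.db import OperationalError

    with different_runtime_limit(0.1, using="snappea"):
        try:
            task_count = Task.objects.all().count()
        except OperationalError:
            # the runtime limit tripped; degrade gracefully instead of blocking on a busy snappea DB
            task_count = None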