From 74ace016bc6eed37a55c587d5746d2adfe3804f5 Mon Sep 17 00:00:00 2001 From: Klaas van Schelven Date: Wed, 29 Oct 2025 17:25:49 +0100 Subject: [PATCH 01/31] api_catch_all: fix for request.POST debugging --- files/views.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/files/views.py b/files/views.py index c29b6df..881584c 100644 --- a/files/views.py +++ b/files/views.py @@ -231,10 +231,10 @@ def api_catch_all(request, subpath): if request.GET: lines.append(f" GET: {request.GET.dict()}") + body = request.body # note: must be above request.POST access to avoid "You cannot access body after reading ..." if request.POST: lines.append(f" POST: {request.POST.dict()}") - body = request.body if body: try: decoded = body.decode("utf-8", errors="replace").strip() From b09e6d02a12b524ac92502d8008b4fe51540bb0b Mon Sep 17 00:00:00 2001 From: Klaas van Schelven Date: Tue, 4 Nov 2025 09:39:50 +0100 Subject: [PATCH 02/31] Minidump handling utils: from BSD-Licensed Sentry (minor adaptations) --- requirements.txt | 1 + sentry/minidump.py | 84 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 85 insertions(+) create mode 100644 sentry/minidump.py diff --git a/requirements.txt b/requirements.txt index 33292b3..8fe767c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -18,3 +18,4 @@ verbose_csrf_middleware==1.0.* ecma426>=0.2.0 djangorestframework==3.16.* drf-spectacular[sidecar] +symbolic==8.7.2 # the last version to support ProcessState (i.e. minidump parsing) diff --git a/sentry/minidump.py b/sentry/minidump.py new file mode 100644 index 0000000..4f15379 --- /dev/null +++ b/sentry/minidump.py @@ -0,0 +1,84 @@ +# copied from: +# https://github.com/getsentry/sentry/blob/f0ac91f2ec6b45ad18e5eea6df72c5c72573e964/src/sentry/models/minidump.py#L53 +# with (as it stands) minor modifications. + +import logging +from symbolic import ProcessState + + +LOG_LEVELS = { + logging.NOTSET: "sample", + logging.DEBUG: "debug", + logging.INFO: "info", + logging.WARNING: "warning", + logging.ERROR: "error", + logging.FATAL: "fatal", +} + +LOG_LEVELS_MAP = {v: k for k, v in LOG_LEVELS.items()} + + +def merge_minidump_event(data, minidump_path): + state = ProcessState.from_minidump(minidump_path) + + data['level'] = LOG_LEVELS_MAP['fatal'] if state.crashed else LOG_LEVELS_MAP['info'] + data['message'] = 'Assertion Error: %s' % state.assertion if state.assertion \ + else 'Fatal Error: %s' % state.crash_reason + + if state.timestamp: + data['timestamp'] = float(state.timestamp) + + # Extract as much system information as we can. TODO: We should create + # a custom context and implement a specific minidump view in the event + # UI. + info = state.system_info + context = data.setdefault('contexts', {}) + os = context.setdefault('os', {}) + device = context.setdefault('device', {}) + os['name'] = info.os_name + os['version'] = info.os_version + device['arch'] = info.cpu_family + + # We can extract stack traces here already but since CFI is not + # available yet (without debug symbols), the stackwalker will + # resort to stack scanning which yields low-quality results. If + # the user provides us with debug symbols, we will reprocess this + # minidump and add improved stacktraces later. 
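    # A sketch (added for orientation, not in the upstream Sentry code) of the per-thread dict the
    # comprehension below produces; instruction addresses come out as hex strings and 'function'
    # stays empty until symbolication is available:
    #
    #     {'id': 0, 'crashed': False,
    #      'stacktrace': {'frames': [{'instruction_addr': '0x7f2b4c1a2b30', 'function': ''}]}}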
+ threads = [{ + 'id': thread.thread_id, + 'crashed': False, + 'stacktrace': { + 'frames': [{ + 'instruction_addr': '0x%x' % frame.instruction, + 'function': '', # Required by interface + } for frame in thread.frames()], + }, + } for thread in state.threads()] + data.setdefault('threads', {})['values'] = threads + + # Mark the crashed thread and add its stacktrace to the exception + crashed_thread = threads[state.requesting_thread] + crashed_thread['crashed'] = True + + # Extract the crash reason and infos + exception = { + 'value': data['message'], + 'thread_id': crashed_thread['id'], + 'type': state.crash_reason, + # Move stacktrace here from crashed_thread (mutating!) + 'stacktrace': crashed_thread.pop('stacktrace'), + } + + data.setdefault('exception', {}) \ + .setdefault('values', []) \ + .append(exception) + + # Extract referenced (not all loaded) images + images = [{ + 'type': 'apple', # Required by interface + # 'uuid': module.uuid, NO_BANANA + 'image_addr': module.addr, + 'image_size': module.size, + # 'name': module.name, NO_BANANA + } for module in state.modules()] + data.setdefault('debug_meta', {})['images'] = images From e7aad45db22d569b2202cbc70dd713ffb70104b2 Mon Sep 17 00:00:00 2001 From: Klaas van Schelven Date: Tue, 4 Nov 2025 10:47:04 +0100 Subject: [PATCH 03/31] Minidumps: PoC for minidump 'endpoint' See #82 --- ingest/urls.py | 5 ++- ingest/views.py | 54 +++++++++++++++++++++++++ issues/templates/issues/stacktrace.html | 4 +- sentry/minidump.py | 23 ++++------- 4 files changed, 67 insertions(+), 19 deletions(-) diff --git a/ingest/urls.py b/ingest/urls.py index e075f75..9d42b03 100644 --- a/ingest/urls.py +++ b/ingest/urls.py @@ -1,9 +1,12 @@ from django.urls import path -from .views import IngestEventAPIView, IngestEnvelopeAPIView +from .views import IngestEventAPIView, IngestEnvelopeAPIView, MinidumpAPIView urlpatterns = [ # project_pk has to be an int per Sentry Client expectations. path("/store/", IngestEventAPIView.as_view()), path("/envelope/", IngestEnvelopeAPIView.as_view()), + + # is this "ingest"? it is at least in the sense that it matches the API schema and downstream auth etc. + path("/minidump/", MinidumpAPIView.as_view()), ] diff --git a/ingest/views.py b/ingest/views.py index fdae6c5..b9d47c9 100644 --- a/ingest/views.py +++ b/ingest/views.py @@ -39,6 +39,7 @@ from alerts.tasks import send_new_issue_alert, send_regression_alert from compat.timestamp import format_timestamp, parse_timestamp from tags.models import digest_tags from bsmain.utils import b108_makedirs +from sentry.minidump import merge_minidump_event from .parsers import StreamingEnvelopeParser, ParseError from .filestore import get_filename_for_event_id @@ -680,6 +681,59 @@ class IngestEnvelopeAPIView(BaseIngestAPIView): # +class MinidumpAPIView(BaseIngestAPIView): + # A Base "Ingest" APIView in the sense that it reuses some key building blocks (auth). + # I'm not 100% sure whether "philosophically" the minidump endpoint is also "ingesting"; we'll see. + + @classmethod + def _ingest(cls, ingested_at, event_data, project, request): + # TSTTCPW: just ingest the invent as normally after we've done the minidump-parsing "immediately". We make + # ready for the expectations of process_event (DIGEST_IMMEDIATELY/event_output_stream) with an if-statement + + event_output_stream = MaxDataWriter("MAX_EVENT_SIZE", io.BytesIO()) + if get_settings().DIGEST_IMMEDIATELY: + # in this case the stream will be an BytesIO object, so we can actually call .get_value() on it. 
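            # (strictly speaking io.BytesIO spells this .getvalue(); that is what process_event
            # calls on the DIGEST_IMMEDIATELY path to pull the serialized event back out of the
            # writer.)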
+ event_output_stream.write(json.dumps(event_data).encode("utf-8")) + + else: + # no need to actually touch event_output_stream for this case, we just need to write a file + filename = get_filename_for_event_id(event_data["event_id"]) + b108_makedirs(os.path.dirname(filename)) + with open(filename, 'w') as f: + json.dump(event_data, f) + + cls.process_event(ingested_at, event_data["event_id"], event_output_stream, project, request) + + def post(self, request, project_pk=None): + # not reusing the CORS stuff here; minidump-from-browser doesn't make sense. + + ingested_at = datetime.now(timezone.utc) + project = self.get_project_for_request(project_pk, request) + + try: + # in this flow, we don't get an event_id from the client, so we just generate one here. + event_id = uuid.uuid4().hex + + minidump_bytes = request.FILES["upload_file_minidump"].read() + + data = { + "event_id": event_id, + "platform": "native", + "extra": {}, + "errors": [], + } + + merge_minidump_event(data, minidump_bytes) + + self._ingest(ingested_at, data, project, request) + + return JsonResponse({"id": event_id}) + + except Exception as e: + raise + return JsonResponse({"detail": str(e)}, status=HTTP_400_BAD_REQUEST) + + @user_passes_test(lambda u: u.is_superuser) def download_envelope(request, envelope_id=None): envelope = get_object_or_404(Envelope, pk=envelope_id) diff --git a/issues/templates/issues/stacktrace.html b/issues/templates/issues/stacktrace.html index 870735c..9eb0a62 100644 --- a/issues/templates/issues/stacktrace.html +++ b/issues/templates/issues/stacktrace.html @@ -66,9 +66,9 @@
{# filename, function, lineno #} {% if frame.in_app %} - {{ frame.filename }}{% if frame.function %} in {{ frame.function }}{% endif %}{% if frame.lineno %} line {{ frame.lineno }}{% endif %}. + {{ frame.filename }}{% if frame.function %} in {{ frame.function }}{% endif %}{% if frame.lineno %} line {{ frame.lineno }}{% endif %}{% if frame.instruction_addr %} {{ frame.instruction_addr }}{% endif %}. {% else %} - {{ frame.filename }}{% if frame.function %} in {{ frame.function }}{% endif %}{% if frame.lineno%} line {{ frame.lineno }}{% endif %}. + {{ frame.filename }}{% if frame.function %} in {{ frame.function }}{% endif %}{% if frame.lineno%} line {{ frame.lineno }}{% endif %}{% if frame.instruction_addr %} {{ frame.instruction_addr }}{% endif %}. {% endif %}
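
For reference, the new endpoint can be exercised with Django's test client roughly as follows (a
sketch, not part of this patch; it assumes a Project instance and the DSN-derived auth header used
in the existing ingest tests):

    from django.test import Client
    from compat.dsn import get_header_value

    client = Client()
    auth = get_header_value(f"http://{ project.sentry_key }@hostisignored/{ project.id }")
    with open("crash.dmp", "rb") as f:  # "crash.dmp" is a placeholder minidump file
        response = client.post(
            f"/api/{ project.id }/minidump/",
            data={"upload_file_minidump": f},   # multipart; ends up in request.FILES
            headers={"X-Sentry-Auth": auth},
        )
    # on success the view answers with {"id": "<freshly generated event_id>"}
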
diff --git a/sentry/minidump.py b/sentry/minidump.py index 4f15379..8a126c3 100644 --- a/sentry/minidump.py +++ b/sentry/minidump.py @@ -6,24 +6,15 @@ import logging from symbolic import ProcessState -LOG_LEVELS = { - logging.NOTSET: "sample", - logging.DEBUG: "debug", - logging.INFO: "info", - logging.WARNING: "warning", - logging.ERROR: "error", - logging.FATAL: "fatal", -} +def merge_minidump_event(data, minidump_bytes): + state = ProcessState.from_minidump_buffer(minidump_bytes) -LOG_LEVELS_MAP = {v: k for k, v in LOG_LEVELS.items()} + data['level'] = 'fatal' if state.crashed else 'info' - -def merge_minidump_event(data, minidump_path): - state = ProcessState.from_minidump(minidump_path) - - data['level'] = LOG_LEVELS_MAP['fatal'] if state.crashed else LOG_LEVELS_MAP['info'] - data['message'] = 'Assertion Error: %s' % state.assertion if state.assertion \ + exception_value = 'Assertion Error: %s' % state.assertion if state.assertion \ else 'Fatal Error: %s' % state.crash_reason + # NO_BANANA: data['message'] is not the right target + # data['message'] = exception_value if state.timestamp: data['timestamp'] = float(state.timestamp) @@ -62,7 +53,7 @@ def merge_minidump_event(data, minidump_path): # Extract the crash reason and infos exception = { - 'value': data['message'], + 'value': exception_value, 'thread_id': crashed_thread['id'], 'type': state.crash_reason, # Move stacktrace here from crashed_thread (mutating!) From 391e22bcf0d3fefa69343a1ef1b4350571f93292 Mon Sep 17 00:00:00 2001 From: Klaas van Schelven Date: Tue, 4 Nov 2025 13:55:41 +0100 Subject: [PATCH 04/31] parser: event_output_stream closing pushed in in preparation of the minidump handling through the envelope path, which probably requires dealing with the whole envelope as a whole. "theoretically" this might be less efficient, but [a] see the notes at the top of the parser on how we think about streaming parsing and [b] the inefficiencies are in the "immediate" path anyway (which has the assumtion that it's not in high-performance envs). "blind commit" (tests not run against this commit). --- bugsink/streams.py | 9 +++++++++ ingest/parsers.py | 23 +++++++++++++---------- ingest/views.py | 36 ++++++++++++++---------------------- 3 files changed, 36 insertions(+), 32 deletions(-) diff --git a/bugsink/streams.py b/bugsink/streams.py index 295fa56..e3c6120 100644 --- a/bugsink/streams.py +++ b/bugsink/streams.py @@ -170,3 +170,12 @@ class NullWriter: def close(self): pass + + +class UnclosableBytesIO(io.BytesIO): + """Intentionally does nothing on-close: BytesIO normally discards its buffer on .close(), breaking .getvalue(); this + overrides it so that we can use it in code that usually deals with real files (and calls .close()) while still using + the in-memory data afterwards. 
We just rely on the garbage collector for the actual cleanup.""" + + def close(self): + pass diff --git a/ingest/parsers.py b/ingest/parsers.py index 3cd419d..bdc1401 100644 --- a/ingest/parsers.py +++ b/ingest/parsers.py @@ -155,7 +155,6 @@ class StreamingEnvelopeParser: def get_items(self, output_stream_factory): # yields the item_headers and item_output_streams (with the content of the items written into them) - # closing the item_output_stream is the responsibility of the calller self.get_envelope_headers() @@ -175,17 +174,21 @@ class StreamingEnvelopeParser: finder = NewlineFinder() item_output_stream = output_stream_factory(item_headers) - self.remainder, self.at_eof = readuntil( - self.input_stream, self.remainder, finder, item_output_stream, self.chunk_size) - if "length" in item_headers: - # items with an explicit length are terminated by a newline (if at EOF, this is optional as per the set - # of examples in the docs) - should_be_empty = io.BytesIO() + try: self.remainder, self.at_eof = readuntil( - self.input_stream, self.remainder, NewlineFinder(), should_be_empty, self.chunk_size) - if should_be_empty.getvalue() != b"": - raise ParseError("Item with explicit length not terminated by newline/EOF") + self.input_stream, self.remainder, finder, item_output_stream, self.chunk_size) + + if "length" in item_headers: + # items with an explicit length are terminated by a newline (if at EOF, this is optional as per the + # set of examples in the docs) + should_be_empty = io.BytesIO() + self.remainder, self.at_eof = readuntil( + self.input_stream, self.remainder, NewlineFinder(), should_be_empty, self.chunk_size) + if should_be_empty.getvalue() != b"": + raise ParseError("Item with explicit length not terminated by newline/EOF") + finally: + item_output_stream.close() yield item_headers, item_output_stream diff --git a/ingest/views.py b/ingest/views.py index b9d47c9..93b1a49 100644 --- a/ingest/views.py +++ b/ingest/views.py @@ -2,7 +2,6 @@ import uuid import hashlib import os import logging -import io from datetime import datetime, timezone import json import jsonschema @@ -29,7 +28,8 @@ from issues.regressions import issue_is_regression from bugsink.transaction import immediate_atomic, delay_on_commit from bugsink.exceptions import ViolatedExpectation -from bugsink.streams import content_encoding_reader, MaxDataReader, MaxDataWriter, NullWriter, MaxLengthExceeded +from bugsink.streams import ( + content_encoding_reader, MaxDataReader, MaxDataWriter, NullWriter, MaxLengthExceeded, UnclosableBytesIO) from bugsink.app_settings import get_settings from events.models import Event @@ -162,10 +162,6 @@ class BaseIngestAPIView(View): performance_logger.info("ingested event with %s bytes", len(event_data_bytes)) cls.digest_event(event_metadata, event_data) else: - # In this case the stream will be a file that has been written the event's content to it. 
- # To ensure that the (possibly EAGER) handling of the digest has the file available, we flush it here: - event_data_stream.flush() - performance_logger.info("ingested event with %s bytes", event_data_stream.bytes_written) digest.delay(event_id, event_metadata) @@ -621,7 +617,7 @@ class IngestEnvelopeAPIView(BaseIngestAPIView): def factory(item_headers): if item_headers.get("type") == "event": if get_settings().DIGEST_IMMEDIATELY: - return MaxDataWriter("MAX_EVENT_SIZE", io.BytesIO()) + return MaxDataWriter("MAX_EVENT_SIZE", UnclosableBytesIO()) # envelope_headers["event_id"] is required when type=event per the spec (and takes precedence over the # payload's event_id), so we can rely on it having been set. @@ -643,23 +639,19 @@ class IngestEnvelopeAPIView(BaseIngestAPIView): return NullWriter() for item_headers, event_output_stream in parser.get_items(factory): - try: - if item_headers.get("type") != "event": - logger.info("skipping non-event item: %s", item_headers.get("type")) + if item_headers.get("type") != "event": + logger.info("skipping non-event item: %s", item_headers.get("type")) - if item_headers.get("type") == "transaction": - # From the spec of type=event: This Item is mutually exclusive with `"transaction"` Items. - # i.e. when we see a transaction, a regular event will not be present and we can stop. - logger.info("discarding the rest of the envelope") - break + if item_headers.get("type") == "transaction": + # From the spec of type=event: This Item is mutually exclusive with `"transaction"` Items. + # i.e. when we see a transaction, a regular event will not be present and we can stop. + logger.info("discarding the rest of the envelope") + break - continue + continue - self.process_event(ingested_at, envelope_headers["event_id"], event_output_stream, project, request) - break # From the spec of type=event: This Item may occur at most once per Envelope. once seen: done - - finally: - event_output_stream.close() + self.process_event(ingested_at, envelope_headers["event_id"], event_output_stream, project, request) + break # From the spec of type=event: This Item may occur at most once per Envelope. once seen: done return HttpResponse() @@ -690,7 +682,7 @@ class MinidumpAPIView(BaseIngestAPIView): # TSTTCPW: just ingest the invent as normally after we've done the minidump-parsing "immediately". We make # ready for the expectations of process_event (DIGEST_IMMEDIATELY/event_output_stream) with an if-statement - event_output_stream = MaxDataWriter("MAX_EVENT_SIZE", io.BytesIO()) + event_output_stream = MaxDataWriter("MAX_EVENT_SIZE", UnclosableBytesIO()) if get_settings().DIGEST_IMMEDIATELY: # in this case the stream will be an BytesIO object, so we can actually call .get_value() on it. event_output_stream.write(json.dumps(event_data).encode("utf-8")) From de9a37aab625de4aa160701f6b19a312bc1d6f69 Mon Sep 17 00:00:00 2001 From: Klaas van Schelven Date: Wed, 5 Nov 2025 08:39:40 +0100 Subject: [PATCH 05/31] Fix tests for UnclosableBytesIO i.e. 
fix tests for 391e22bcf0d3 (the changes in the present commit in `ingest/tests.py` are not strictly necessary but they are principly right) --- ingest/parsers.py | 4 ++-- ingest/tests.py | 15 ++++++++------- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/ingest/parsers.py b/ingest/parsers.py index bdc1401..3f4b1ec 100644 --- a/ingest/parsers.py +++ b/ingest/parsers.py @@ -1,7 +1,7 @@ import json import io -from bugsink.streams import MaxDataWriter +from bugsink.streams import MaxDataWriter, UnclosableBytesIO from .exceptions import ParseError from .header_validators import filter_valid_envelope_headers, filter_valid_item_headers @@ -195,5 +195,5 @@ class StreamingEnvelopeParser: def get_items_directly(self): # this method is just convenience for testing - for item_headers, output_stream in self.get_items(lambda item_headers: io.BytesIO()): + for item_headers, output_stream in self.get_items(lambda item_headers: UnclosableBytesIO()): yield item_headers, output_stream.getvalue() diff --git a/ingest/tests.py b/ingest/tests.py index 3e0d634..2f81df5 100644 --- a/ingest/tests.py +++ b/ingest/tests.py @@ -25,6 +25,7 @@ from issues.factories import get_or_create_issue from issues.models import IssueStateManager, Issue, TurningPoint, TurningPointKind from issues.utils import get_values from bugsink.app_settings import override_settings +from bugsink.streams import UnclosableBytesIO from compat.timestamp import format_timestamp from compat.dsn import get_header_value from bsmain.management.commands.send_json import Command as SendJsonCommand @@ -631,7 +632,7 @@ class TestParser(RegularTestCase): initial_chunk = b"line 0\nline 1\n" input_stream.seek(0) - output_stream = io.BytesIO() + output_stream = UnclosableBytesIO() remainder, at_eof = readuntil(input_stream, initial_chunk, NewlineFinder(), output_stream, 3) self.assertFalse(at_eof) @@ -644,7 +645,7 @@ class TestParser(RegularTestCase): initial_chunk = b"lin" input_stream.seek(0) - output_stream = io.BytesIO() + output_stream = UnclosableBytesIO() remainder, at_eof = readuntil(input_stream, initial_chunk, NewlineFinder(), output_stream, 3) self.assertFalse(at_eof) @@ -657,7 +658,7 @@ class TestParser(RegularTestCase): initial_chunk = b"" input_stream.seek(0) - output_stream = io.BytesIO() + output_stream = UnclosableBytesIO() remainder, at_eof = readuntil(input_stream, initial_chunk, NewlineFinder(), output_stream, 3) self.assertFalse(at_eof) @@ -670,7 +671,7 @@ class TestParser(RegularTestCase): initial_chunk = b"" input_stream.seek(0) - output_stream = io.BytesIO() + output_stream = UnclosableBytesIO() remainder, at_eof = readuntil(input_stream, initial_chunk, NewlineFinder(), output_stream, 3) self.assertTrue(at_eof) @@ -683,7 +684,7 @@ class TestParser(RegularTestCase): initial_chunk = b"lin" input_stream.seek(0) - output_stream = io.BytesIO() + output_stream = UnclosableBytesIO() remainder, at_eof = readuntil(input_stream, initial_chunk, NewlineFinder(), output_stream, 1024) self.assertFalse(at_eof) @@ -696,7 +697,7 @@ class TestParser(RegularTestCase): initial_chunk = b"lin" input_stream.seek(0) - output_stream = io.BytesIO() + output_stream = UnclosableBytesIO() remainder, at_eof = readuntil(input_stream, initial_chunk, LengthFinder(10, "eof not ok"), output_stream, 3) self.assertFalse(at_eof) @@ -709,7 +710,7 @@ class TestParser(RegularTestCase): initial_chunk = b"lin" input_stream.seek(0) - output_stream = io.BytesIO() + output_stream = UnclosableBytesIO() with self.assertRaises(ParseError): remainder, at_eof = 
readuntil(input_stream, initial_chunk, LengthFinder(100, "EOF"), output_stream, 1000) From 7f831f52d4ca97ea01cd2e22f55a2596d46c675b Mon Sep 17 00:00:00 2001 From: Klaas van Schelven Date: Wed, 5 Nov 2025 09:03:17 +0100 Subject: [PATCH 06/31] Remove DIGEST_IMMEDIATELY option Although DIGEST_IMMEDIATELY=True is theoretically a nice thing to have, the upkeep is not worth it now that we're about to introduce minidump ingestion. The only thing that you're saving is the round-trip via the filesystem, but performance of that is negligable, and if you're configuring DIGEST_IMMEDIATELY you're actually _not_ in the performance-critical path anyway. Getting rid of it _also_ harmonizes/reduces the number of paths to test. It's approximately 1% of our installed base. --- bugsink/app_settings.py | 1 - bugsink/conf_templates/docker.py.template | 4 +- bugsink/conf_templates/local.py.template | 5 -- .../conf_templates/singleserver.py.template | 4 -- bugsink/settings/development.py | 2 - ingest/tests.py | 5 -- ingest/views.py | 46 ++++++------------- phonehome/tasks.py | 1 - 8 files changed, 16 insertions(+), 52 deletions(-) diff --git a/bugsink/app_settings.py b/bugsink/app_settings.py index 5bd1ce4..3ba51aa 100644 --- a/bugsink/app_settings.py +++ b/bugsink/app_settings.py @@ -42,7 +42,6 @@ DEFAULTS = { "TEAM_CREATION": CB_MEMBERS, # who can create new teams. default: members, which means "any member of the site" # System inner workings: - "DIGEST_IMMEDIATELY": True, "VALIDATE_ON_DIGEST": "none", # other legal values are "warn" and "strict" "KEEP_ENVELOPES": 0, # set to a number to store that many; 0 means "store none". This is for debugging. "API_LOG_UNIMPLEMENTED_CALLS": False, # if True, log unimplemented API calls; see #153 diff --git a/bugsink/conf_templates/docker.py.template b/bugsink/conf_templates/docker.py.template index f2b044b..71845c8 100644 --- a/bugsink/conf_templates/docker.py.template +++ b/bugsink/conf_templates/docker.py.template @@ -52,7 +52,7 @@ eat_your_own_dogfood(SENTRY_DSN) # Our Docker image is hard-coded to run with snappea in the background; this means we hard-code (as opposed to reading -# the from the env) certain variables: TASK_ALWAYS_EAGER, WORKAHOLIC and DIGEST_IMMEDIATELY. +# the from the env) certain variables: TASK_ALWAYS_EAGER and WORKAHOLIC SNAPPEA = { "TASK_ALWAYS_EAGER": False, # hard-coded, corresponds to Docker setup "WORKAHOLIC": True, # hard-coded, corresponds to Docker setup @@ -131,8 +131,6 @@ CB_NOBODY = "CB_NOBODY" BUGSINK = { - "DIGEST_IMMEDIATELY": False, # hard-coded, corresponds to Docker setup - # The URL where the Bugsink instance is hosted. This is used in the email notifications and to construct DSNs. "BASE_URL": os.getenv("BASE_URL", f"http://localhost:{_PORT}"), # no trailing slash diff --git a/bugsink/conf_templates/local.py.template b/bugsink/conf_templates/local.py.template index 63edee7..d645dfa 100644 --- a/bugsink/conf_templates/local.py.template +++ b/bugsink/conf_templates/local.py.template @@ -68,11 +68,6 @@ BUGSINK = { # you can customize this as e.g. "My Bugsink" or "Bugsink for My Company" # "SITE_TITLE": "Bugsink", - # When running locally, it is recommended to configure the Bugsink to digest events immediately. (This is basically - # implied by the "TASK_ALWAYS_EAGER" setting above, but setting DIGEST_IMMEDIATELY to True removes one more step - # from the process.) - "DIGEST_IMMEDIATELY": True, - # You are licenced to run Bugsink locally in single-user mode. 
By changing the settings below, you may open the door # to more uses; make sure to buy a licence if you do. "SINGLE_USER": True, diff --git a/bugsink/conf_templates/singleserver.py.template b/bugsink/conf_templates/singleserver.py.template index 854a482..20d735d 100644 --- a/bugsink/conf_templates/singleserver.py.template +++ b/bugsink/conf_templates/singleserver.py.template @@ -98,10 +98,6 @@ BUGSINK = { "SINGLE_TEAM": False, "TEAM_CREATION": CB_MEMBERS, # who can create new teams. default: members, which means "any member of the site" - # In the singleserver production setup, we do not digest events immediately, but instead offload this to Snappea. - # This ensures a more response and reliable server when there are peak loads in the events. - "DIGEST_IMMEDIATELY": False, - # "MAX_EVENT_SIZE": _MEBIBYTE, # "MAX_EVENT_COMPRESSED_SIZE": 200 * _KIBIBYTE, # "MAX_ENVELOPE_SIZE": 100 * _MEBIBYTE, diff --git a/bugsink/settings/development.py b/bugsink/settings/development.py index 4e1a0ca..45b8928 100644 --- a/bugsink/settings/development.py +++ b/bugsink/settings/development.py @@ -83,8 +83,6 @@ SERVER_EMAIL = DEFAULT_FROM_EMAIL = 'Klaas van Schelven ' BUGSINK = { - "DIGEST_IMMEDIATELY": False, - # "MAX_EVENT_SIZE": _MEBIBYTE, # "MAX_EVENT_COMPRESSED_SIZE": 200 * _KIBIBYTE, # "MAX_ENVELOPE_SIZE": 100 * _MEBIBYTE, diff --git a/ingest/tests.py b/ingest/tests.py index 2f81df5..986043d 100644 --- a/ingest/tests.py +++ b/ingest/tests.py @@ -394,11 +394,6 @@ class IngestViewTestCase(TransactionTestCase): with self.assertRaises(ViolatedExpectation): check() - @tag("samples") - def test_envelope_endpoint_digest_non_immediate(self): - with override_settings(DIGEST_IMMEDIATELY=False): - self.test_envelope_endpoint() - @tag("samples") def test_filestore(self): # quick & dirty way to test the filestore; in absence of a proper test for it, we just run a more-or-less diff --git a/ingest/views.py b/ingest/views.py index 93b1a49..f1b77b3 100644 --- a/ingest/views.py +++ b/ingest/views.py @@ -28,8 +28,7 @@ from issues.regressions import issue_is_regression from bugsink.transaction import immediate_atomic, delay_on_commit from bugsink.exceptions import ViolatedExpectation -from bugsink.streams import ( - content_encoding_reader, MaxDataReader, MaxDataWriter, NullWriter, MaxLengthExceeded, UnclosableBytesIO) +from bugsink.streams import content_encoding_reader, MaxDataReader, MaxDataWriter, NullWriter, MaxLengthExceeded from bugsink.app_settings import get_settings from events.models import Event @@ -154,16 +153,8 @@ class BaseIngestAPIView(View): @classmethod def process_event(cls, ingested_at, event_id, event_data_stream, project, request): event_metadata = cls.get_event_meta(event_id, ingested_at, request, project) - - if get_settings().DIGEST_IMMEDIATELY: - # in this case the stream will be an BytesIO object, so we can actually call .get_value() on it. 
- event_data_bytes = event_data_stream.getvalue() - event_data = json.loads(event_data_bytes.decode("utf-8")) - performance_logger.info("ingested event with %s bytes", len(event_data_bytes)) - cls.digest_event(event_metadata, event_data) - else: - performance_logger.info("ingested event with %s bytes", event_data_stream.bytes_written) - digest.delay(event_id, event_metadata) + performance_logger.info("ingested event with %s bytes", event_data_stream.bytes_written) + digest.delay(event_id, event_metadata) @classmethod def get_event_meta(cls, event_id, ingested_at, request, project): @@ -616,9 +607,6 @@ class IngestEnvelopeAPIView(BaseIngestAPIView): def factory(item_headers): if item_headers.get("type") == "event": - if get_settings().DIGEST_IMMEDIATELY: - return MaxDataWriter("MAX_EVENT_SIZE", UnclosableBytesIO()) - # envelope_headers["event_id"] is required when type=event per the spec (and takes precedence over the # payload's event_id), so we can rely on it having been set. if "event_id" not in envelope_headers: @@ -650,7 +638,10 @@ class IngestEnvelopeAPIView(BaseIngestAPIView): continue - self.process_event(ingested_at, envelope_headers["event_id"], event_output_stream, project, request) + performance_logger.info("ingested event with %s bytes", event_output_stream.bytes_written) + event_metadata = self.get_event_meta(envelope_headers["event_id"], ingested_at, request, project) + digest.delay(envelope_headers["event_id"], event_metadata) + break # From the spec of type=event: This Item may occur at most once per Envelope. once seen: done return HttpResponse() @@ -679,22 +670,15 @@ class MinidumpAPIView(BaseIngestAPIView): @classmethod def _ingest(cls, ingested_at, event_data, project, request): - # TSTTCPW: just ingest the invent as normally after we've done the minidump-parsing "immediately". We make - # ready for the expectations of process_event (DIGEST_IMMEDIATELY/event_output_stream) with an if-statement + # TSTTCPW: convert the minidump data to an event and then proceed as usual. + filename = get_filename_for_event_id(event_data["event_id"]) + b108_makedirs(os.path.dirname(filename)) + with open(filename, 'w') as f: + json.dump(event_data, f) - event_output_stream = MaxDataWriter("MAX_EVENT_SIZE", UnclosableBytesIO()) - if get_settings().DIGEST_IMMEDIATELY: - # in this case the stream will be an BytesIO object, so we can actually call .get_value() on it. - event_output_stream.write(json.dumps(event_data).encode("utf-8")) - - else: - # no need to actually touch event_output_stream for this case, we just need to write a file - filename = get_filename_for_event_id(event_data["event_id"]) - b108_makedirs(os.path.dirname(filename)) - with open(filename, 'w') as f: - json.dump(event_data, f) - - cls.process_event(ingested_at, event_data["event_id"], event_output_stream, project, request) + # performance_logger.info("ingested event with %s bytes", event_output_stream.bytes_written) TODO for minidump + event_metadata = cls.get_event_meta(event_data["event_id"], ingested_at, request, project) + digest.delay(event_data["event_id"], event_metadata) def post(self, request, project_pk=None): # not reusing the CORS stuff here; minidump-from-browser doesn't make sense. 
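
The flow that now always applies (an illustrative sketch; the pieces shown are the ones used
elsewhere in this series, with the digest side living in ingest/tasks.py):

    # ingest side: the event item is streamed straight to a file ...
    filename = get_filename_for_event_id(event_id)
    # ... and only the id plus metadata cross over to the task:
    digest.delay(event_id, event_metadata)

    # digest side: read the file back and hand it off to digest_event
    with open(get_filename_for_event_id(event_id), "rb") as f:
        event_data = json.loads(f.read().decode("utf-8"))

With snappea's TASK_ALWAYS_EAGER set (as in the local.py template), digest.delay presumably still
runs the task in-process, so local setups keep effectively-immediate digestion without the removed
setting.
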
diff --git a/phonehome/tasks.py b/phonehome/tasks.py index 17612f8..d14ee04 100644 --- a/phonehome/tasks.py +++ b/phonehome/tasks.py @@ -92,7 +92,6 @@ def _make_message_body(): "SINGLE_TEAM": get_settings().SINGLE_TEAM, "EMAIL_BACKEND": settings.EMAIL_BACKEND, "TASK_ALWAYS_EAGER": get_snappea_settings().TASK_ALWAYS_EAGER, - "DIGEST_IMMEDIATELY": get_settings().DIGEST_IMMEDIATELY, "IS_DOCKER": settings.IS_DOCKER, "DATABASE_ENGINE": settings.DATABASES["default"]["ENGINE"], }, From 31596a9b44f9331b2279b4d628e1f56b636984d7 Mon Sep 17 00:00:00 2001 From: Klaas van Schelven Date: Wed, 5 Nov 2025 09:20:35 +0100 Subject: [PATCH 07/31] /store/ endpoint: non-immediate digestion this makes this consistent with the work we did in the previous commit at the price of being slightly more inefficient. but it's a deprecated endpoint anyway --- ingest/views.py | 31 ++++++++++++++++--------------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/ingest/views.py b/ingest/views.py index f1b77b3..c28fb1b 100644 --- a/ingest/views.py +++ b/ingest/views.py @@ -503,30 +503,31 @@ class BaseIngestAPIView(View): class IngestEventAPIView(BaseIngestAPIView): def _post(self, request, project_pk=None): + # This endpoint is deprecated. Personally, I think it's the simpler (and given my goals therefore better) of the + # two, but fighting windmills and all... given that it's deprecated, I'm not going to give it quite as much love + # (at least for now). + # + # The main point of "inefficiency" is that the event data is parsed twice: once here (to get the event_id), and + # once in the actual digest.delay() ingested_at = datetime.now(timezone.utc) project = self.get_project_for_request(project_pk, request) if project.quota_exceeded_until is not None and ingested_at < project.quota_exceeded_until: return HttpResponse(status=HTTP_429_TOO_MANY_REQUESTS) - # This endpoint is deprecated. Personally, I think it's the simpler (and given my goals therefore better) of the - # two, but fighting windmills and all... given that it's deprecated, I'm not going to give it quite as much love - # (at least for now). Interfaces between the internal methods quite changed a bit recently, and this one did not - # keep up. - # - # In particular I'd like to just call process_event() here, but that takes both an event_id and an unparsed data - # stream, and we don't have an event_id here before parsing (and we don't want to parse twice). similarly, - # event_metadata construction requires the event_id. - # - # Instead, we just copy/pasted the relevant parts of process_event() here, and take only one branch (the one - # that digests immediately); i.e. we always digest immediately, independent of the setting. 
+ event_data_bytes = MaxDataReader( + "MAX_EVENT_SIZE", content_encoding_reader(MaxDataReader("MAX_EVENT_COMPRESSED_SIZE", request))).read() - event_data = json.loads( - MaxDataReader("MAX_EVENT_SIZE", content_encoding_reader( - MaxDataReader("MAX_EVENT_COMPRESSED_SIZE", request))).read()) + performance_logger.info("ingested event with %s bytes", len(event_data_bytes)) + + event_data = json.loads(event_data_bytes) + filename = get_filename_for_event_id(event_data["event_id"]) + b108_makedirs(os.path.dirname(filename)) + with open(filename, 'w') as f: + json.dump(event_data, f) event_metadata = self.get_event_meta(event_data["event_id"], ingested_at, request, project) - self.digest_event(event_metadata, event_data) + digest.delay(event_data["event_id"], event_metadata) return HttpResponse() From d945b39259876d29029efd9e95248d189fb22b05 Mon Sep 17 00:00:00 2001 From: Klaas van Schelven Date: Wed, 5 Nov 2025 09:27:10 +0100 Subject: [PATCH 08/31] Dead code removal this was inlined in 7f831f52d4ca --- ingest/views.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/ingest/views.py b/ingest/views.py index c28fb1b..91c7b3a 100644 --- a/ingest/views.py +++ b/ingest/views.py @@ -150,12 +150,6 @@ class BaseIngestAPIView(View): sentry_key = cls.get_sentry_key_for_request(request) return cls.get_project(project_pk, sentry_key) - @classmethod - def process_event(cls, ingested_at, event_id, event_data_stream, project, request): - event_metadata = cls.get_event_meta(event_id, ingested_at, request, project) - performance_logger.info("ingested event with %s bytes", event_data_stream.bytes_written) - digest.delay(event_id, event_metadata) - @classmethod def get_event_meta(cls, event_id, ingested_at, request, project): # Meta means: not part of the event data. Basically: information that is available at the time of ingestion, and From d807ea2c50a359ecc0ac0af6b0ea0ad8b5b13804 Mon Sep 17 00:00:00 2001 From: Klaas van Schelven Date: Wed, 5 Nov 2025 11:10:14 +0100 Subject: [PATCH 09/31] Minidump: via envelope interface See #82 --- bugsink/app_settings.py | 1 + bugsink/streams.py | 4 +- ingest/filestore.py | 8 ++- ingest/tasks.py | 8 ++- ingest/views.py | 152 +++++++++++++++++++++++++--------------- 5 files changed, 113 insertions(+), 60 deletions(-) diff --git a/bugsink/app_settings.py b/bugsink/app_settings.py index 3ba51aa..1d59347 100644 --- a/bugsink/app_settings.py +++ b/bugsink/app_settings.py @@ -49,6 +49,7 @@ DEFAULTS = { # MAX* below mirror the (current) values for the Sentry Relay "MAX_EVENT_SIZE": _MEBIBYTE, + "MAX_ATTACHMENT_SIZE": 100 * _MEBIBYTE, "MAX_EVENT_COMPRESSED_SIZE": 200 * _KIBIBYTE, # Note: this only applies to the deprecated "store" endpoint. 
"MAX_ENVELOPE_SIZE": 100 * _MEBIBYTE, "MAX_ENVELOPE_COMPRESSED_SIZE": 20 * _MEBIBYTE, diff --git a/bugsink/streams.py b/bugsink/streams.py index e3c6120..cb9d5b8 100644 --- a/bugsink/streams.py +++ b/bugsink/streams.py @@ -116,7 +116,7 @@ class MaxDataReader: self.bytes_read = 0 self.stream = stream - if isinstance(max_length, str): # reusing this is a bit of a hack, but leads to readable code at usage + if isinstance(max_length, str): # support for settings-name max_length makes both the code and errors better self.max_length = get_settings()[max_length] self.reason = "%s: %s" % (max_length, self.max_length) else: @@ -145,7 +145,7 @@ class MaxDataWriter: self.bytes_written = 0 self.stream = stream - if isinstance(max_length, str): # reusing this is a bit of a hack, but leads to readable code at usage + if isinstance(max_length, str): # support for settings-name max_length makes both the code and errors better self.max_length = get_settings()[max_length] self.reason = "%s: %s" % (max_length, self.max_length) else: diff --git a/ingest/filestore.py b/ingest/filestore.py index b8737cf..82a0321 100644 --- a/ingest/filestore.py +++ b/ingest/filestore.py @@ -4,7 +4,7 @@ from django.utils._os import safe_join from bugsink.app_settings import get_settings -def get_filename_for_event_id(event_id): +def get_filename_for_event_id(event_id, filetype="event"): # The idea of having some levels of directories here (to avoid too many files in a single dir) is not yet # implemented. However, counterpoint: when doing stress tests, it was quite hard to get a serious backlog going # (snappea was very well able to play catch-up). So this might not be necessary. @@ -15,4 +15,8 @@ def get_filename_for_event_id(event_id): # without needing to inspect all call-sites) event_id_normalized = uuid.UUID(event_id).hex - return safe_join(get_settings().INGEST_STORE_BASE_DIR, event_id_normalized) + basename = event_id_normalized + if filetype == "minidump": + basename += ".dmp" + + return safe_join(get_settings().INGEST_STORE_BASE_DIR, basename) diff --git a/ingest/tasks.py b/ingest/tasks.py index 362de72..522ee3d 100644 --- a/ingest/tasks.py +++ b/ingest/tasks.py @@ -18,8 +18,14 @@ def digest(event_id, event_metadata): with open(get_filename_for_event_id(event_id), "rb") as f: event_data = json.loads(f.read().decode("utf-8")) + if event_metadata.get("has_minidump"): + with open(get_filename_for_event_id(event_id, filetype="minidump"), "rb") as f: + minidump_bytes = f.read() + else: + minidump_bytes = None + try: - BaseIngestAPIView.digest_event(event_metadata, event_data) + BaseIngestAPIView.digest_event(event_metadata, event_data, minidump_bytes=minidump_bytes) except ValidationError as e: logger.warning("ValidationError in digest_event", exc_info=e) finally: diff --git a/ingest/views.py b/ingest/views.py index 91c7b3a..1a08819 100644 --- a/ingest/views.py +++ b/ingest/views.py @@ -1,3 +1,4 @@ +from collections import defaultdict import uuid import hashlib import os @@ -150,6 +151,34 @@ class BaseIngestAPIView(View): sentry_key = cls.get_sentry_key_for_request(request) return cls.get_project(project_pk, sentry_key) + @classmethod + def process_minidump(cls, ingested_at, minidump_bytes, project, request): + # This is for the "pure" minidump case, i.e. no associated event data. TSTTCPW: convert the minidump data to an + # event and then proceed as usual. 
+ + performance_logger.info("ingested minidump with %s bytes", len(minidump_bytes)) + + event_id = uuid.uuid4().hex + event_data = { + "event_id": event_id, + "platform": "native", + "extra": {}, + "errors": [], + } + + merge_minidump_event(event_data, minidump_bytes) + + # write the event data to disk: + filename = get_filename_for_event_id(event_data["event_id"]) + b108_makedirs(os.path.dirname(filename)) + with open(filename, 'w') as f: + json.dump(event_data, f) + + event_metadata = cls.get_event_meta(event_data["event_id"], ingested_at, request, project) + digest.delay(event_data["event_id"], event_metadata) + + return event_id + @classmethod def get_event_meta(cls, event_id, ingested_at, request, project): # Meta means: not part of the event data. Basically: information that is available at the time of ingestion, and @@ -224,7 +253,7 @@ class BaseIngestAPIView(View): @classmethod @immediate_atomic() - def digest_event(cls, event_metadata, event_data, digested_at=None): + def digest_event(cls, event_metadata, event_data, digested_at=None, minidump_bytes=None): # ingested_at is passed from the point-of-ingestion; digested_at is determined here. Because this happens inside # `immediate_atomic`, we know digestions are serialized, and assuming non-decreasing server clocks, not decrea- # sing. (no so for ingestion times: clock-watching happens outside the snappe transaction, and threading in the @@ -254,6 +283,10 @@ class BaseIngestAPIView(View): if get_settings().VALIDATE_ON_DIGEST in ["warn", "strict"]: cls.validate_event_data(event_data, get_settings().VALIDATE_ON_DIGEST) + if minidump_bytes is not None: + # we merge after validation: validation is about what's provided _externally_, not our own merging. + merge_minidump_event(event_data, minidump_bytes) + # I resisted the temptation to put `get_denormalized_fields_for_data` in an if-statement: you basically "always" # need this info... except when duplicate event-ids are sent. But the latter is the exception, and putting this # in an if-statement would require more rework (and possibly extra queries) than it's worth. @@ -601,43 +634,78 @@ class IngestEnvelopeAPIView(BaseIngestAPIView): return HttpResponse(status=HTTP_429_TOO_MANY_REQUESTS) def factory(item_headers): - if item_headers.get("type") == "event": - # envelope_headers["event_id"] is required when type=event per the spec (and takes precedence over the - # payload's event_id), so we can rely on it having been set. 
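            # For orientation, the envelope layout this factory has to cope with (a sketch, cf. the
            # tests added later in this series): newline-separated JSON headers, then payloads:
            #
            #     {"event_id": "9ec79c33ec9942ab8353589fcb2e04dc"}
            #     {"type": "event"}
            #     {...event payload...}
            #     {"type": "attachment", "attachment_type": "event.minidump", "length": 4095}
            #     <raw minidump bytes, which should start with the MDMP magic>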
- if "event_id" not in envelope_headers: - raise ParseError("event_id not found in envelope headers") + type_ = item_headers.get("type") - try: - # validate that the event_id is a valid UUID as per the spec (validate at the edge) - uuid.UUID(envelope_headers["event_id"]) - except ValueError: - raise ParseError("event_id in envelope headers is not a valid UUID") + if ((type_ not in ["event", "attachment"]) or + (item_headers.get("type") == "attachment" and + item_headers.get("attachment_type") != "event.minidump")): - filename = get_filename_for_event_id(envelope_headers["event_id"]) - b108_makedirs(os.path.dirname(filename)) - return MaxDataWriter("MAX_EVENT_SIZE", open(filename, 'wb')) + # non-event/minidumps can be discarded; (we don't check for individual size limits, because these differ + # per item type, we have the envelope limit to protect us, and we incur almost no cost (NullWriter)) + return NullWriter() - # everything else can be discarded; (we don't check for individual size limits, because these differ - # per item type, we have the envelope limit to protect us, and we incur almost no cost (NullWriter) anyway. - return NullWriter() + # envelope_headers["event_id"] is required when type in ["event", "attachment"] per the spec (and takes + # precedence over the payload's event_id), so we can rely on it having been set. + if "event_id" not in envelope_headers: + raise ParseError("event_id not found in envelope headers") - for item_headers, event_output_stream in parser.get_items(factory): - if item_headers.get("type") != "event": - logger.info("skipping non-event item: %s", item_headers.get("type")) + try: + # validate that the event_id is a valid UUID as per the spec (validate at the edge) + uuid.UUID(envelope_headers["event_id"]) + except ValueError: + raise ParseError("event_id in envelope headers is not a valid UUID") - if item_headers.get("type") == "transaction": - # From the spec of type=event: This Item is mutually exclusive with `"transaction"` Items. - # i.e. when we see a transaction, a regular event will not be present and we can stop. 
- logger.info("discarding the rest of the envelope") - break + filetype = "event" if type_ == "event" else "minidump" + filename = get_filename_for_event_id(envelope_headers["event_id"], filetype=filetype) + b108_makedirs(os.path.dirname(filename)) + size_conf = "MAX_EVENT_SIZE" if type_ == "event" else "MAX_ATTACHMENT_SIZE" + return MaxDataWriter(size_conf, open(filename, 'wb')) + + # We ingest the whole envelope first and organize by type; this enables "digest once" across envelope-parts + items_by_type = defaultdict(list) + for item_headers, output_stream in parser.get_items(factory): + type_ = item_headers.get("type") + if type_ not in ["event", "attachment"]: + logger.info("skipping non-supported envelope item: %s", item_headers.get("type")) continue - performance_logger.info("ingested event with %s bytes", event_output_stream.bytes_written) - event_metadata = self.get_event_meta(envelope_headers["event_id"], ingested_at, request, project) + if type_ == "attachment" and item_headers.get("attachment_type") != "event.minidump": + logger.info("skipping non-supported attachment type: %s", item_headers.get("attachment_type")) + continue + + performance_logger.info("ingested %s with %s bytes", type_, output_stream.bytes_written) + items_by_type[type_].append(output_stream) + + event_count = len(items_by_type.get("event", [])) + minidump_count = len(items_by_type.get("attachment", [])) + + if event_count > 1 or minidump_count > 1: + # TODO: we do 2 passes (one for storing, one for calling the right task), and we check certain conditions + # only on the second pass; this means that we may not clean up after ourselves yet. + # TODO we don't do any minidump files cleanup yet in any of the cases. + + logger.info( + "can only deal with one event/minidump per envelope but found %s/%s, ignoring this envelope.", + event_count, minidump_count) + return HttpResponse() + + event_metadata = self.get_event_meta(envelope_headers["event_id"], ingested_at, request, project) + + if event_count == 1: + if minidump_count == 1: + event_metadata["has_minidump"] = True digest.delay(envelope_headers["event_id"], event_metadata) - break # From the spec of type=event: This Item may occur at most once per Envelope. once seen: done + else: + # as it stands, we implement the minidump->event path for the minidump-only case on-ingest; we could push + # this to a task too if needed or for reasons of symmetry. + with open(get_filename_for_event_id(envelope_headers["event_id"], filetype="minidump"), 'rb') as f: + minidump_bytes = f.read() + + # TODO: error handling + # NOTE "The file should start with the MDMP magic bytes." is not checked yet. + self.process_minidump(ingested_at, minidump_bytes, project, request) return HttpResponse() @@ -663,18 +731,6 @@ class MinidumpAPIView(BaseIngestAPIView): # A Base "Ingest" APIView in the sense that it reuses some key building blocks (auth). # I'm not 100% sure whether "philosophically" the minidump endpoint is also "ingesting"; we'll see. - @classmethod - def _ingest(cls, ingested_at, event_data, project, request): - # TSTTCPW: convert the minidump data to an event and then proceed as usual. 
- filename = get_filename_for_event_id(event_data["event_id"]) - b108_makedirs(os.path.dirname(filename)) - with open(filename, 'w') as f: - json.dump(event_data, f) - - # performance_logger.info("ingested event with %s bytes", event_output_stream.bytes_written) TODO for minidump - event_metadata = cls.get_event_meta(event_data["event_id"], ingested_at, request, project) - digest.delay(event_data["event_id"], event_metadata) - def post(self, request, project_pk=None): # not reusing the CORS stuff here; minidump-from-browser doesn't make sense. @@ -682,26 +738,12 @@ class MinidumpAPIView(BaseIngestAPIView): project = self.get_project_for_request(project_pk, request) try: - # in this flow, we don't get an event_id from the client, so we just generate one here. - event_id = uuid.uuid4().hex - minidump_bytes = request.FILES["upload_file_minidump"].read() - data = { - "event_id": event_id, - "platform": "native", - "extra": {}, - "errors": [], - } - - merge_minidump_event(data, minidump_bytes) - - self._ingest(ingested_at, data, project, request) + event_id = self.process_minidump(ingested_at, minidump_bytes, project, request) return JsonResponse({"id": event_id}) - except Exception as e: - raise return JsonResponse({"detail": str(e)}, status=HTTP_400_BAD_REQUEST) From 2e2a8cfeeb9eca45785c8b0f044ba06f97fdd984 Mon Sep 17 00:00:00 2001 From: Klaas van Schelven Date: Wed, 5 Nov 2025 11:20:30 +0100 Subject: [PATCH 10/31] envelope endpoint tests: slight cleanup --- ingest/tests.py | 139 +++++++++++++++++++++--------------------------- 1 file changed, 61 insertions(+), 78 deletions(-) diff --git a/ingest/tests.py b/ingest/tests.py index 986043d..879f1a8 100644 --- a/ingest/tests.py +++ b/ingest/tests.py @@ -21,6 +21,7 @@ from projects.models import Project from events.factories import create_event_data, create_event from events.retention import evict_for_max_events from events.storage_registry import override_event_storages +from events.models import Event from issues.factories import get_or_create_issue from issues.models import IssueStateManager, Issue, TurningPoint, TurningPointKind from issues.utils import get_values @@ -293,38 +294,71 @@ class IngestViewTestCase(TransactionTestCase): sentry_auth_header = get_header_value(f"http://{ project.sentry_key }@hostisignored/{ project.id }") - # first, we ingest many issues - command = SendJsonCommand() - command.stdout = io.StringIO() - command.stderr = io.StringIO() + SAMPLES_DIR = os.getenv("SAMPLES_DIR", "../event-samples") + + filename = glob(SAMPLES_DIR + "/sentry/mobile1-xen.json")[0] # pick a fixed one for reproducibility + + for i, include_event_id in enumerate([True, False]): + with open(filename) as f: + data = json.loads(f.read()) + + data["event_id"] = uuid.uuid4().hex # for good measure we reset this to avoid duplicates. 
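            # (the two passes differ in where the event_id lives: first inside the payload as well,
            # then only in the envelope header, which per the spec takes precedence anyway)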
+ + if "timestamp" not in data: + # as per send_json command ("weirdly enough a large numer of sentry test data don't actually...") + data["timestamp"] = time.time() + + event_id = data["event_id"] + if not include_event_id: + del data["event_id"] + + data_bytes = json.dumps(data).encode("utf-8") + data_bytes = ( + b'{"event_id": "%s"}\n{"type": "event"}\n' % event_id.encode("utf-8") + data_bytes) + + response = self.client.post( + f"/api/{ project.id }/envelope/", + content_type="application/json", + headers={ + "X-Sentry-Auth": sentry_auth_header, + "X-BugSink-DebugInfo": filename, + }, + data=data_bytes, + ) + self.assertEqual( + 200, response.status_code, response.content if response.status_code != 302 else response.url) + + self.assertEqual(1 + i, Event.objects.count()) + + @tag("samples") + def test_envelope_endpoint_reused_ids_different_exceptions(self): + # dirty copy/paste from test_envelope_endpoint, + project = Project.objects.create(name="test") + + sentry_auth_header = get_header_value(f"http://{ project.sentry_key }@hostisignored/{ project.id }") SAMPLES_DIR = os.getenv("SAMPLES_DIR", "../event-samples") - event_samples = glob(SAMPLES_DIR + "/sentry/mobile1-xen.json") # pick a fixed one for reproducibility - known_broken = [SAMPLES_DIR + "/" + s.strip() for s in _readlines(SAMPLES_DIR + "/KNOWN-BROKEN")] + filename = glob(SAMPLES_DIR + "/sentry/mobile1-xen.json")[0] # this one has 'exception.values[0].type' - if len(event_samples) == 0: - raise Exception(f"No event samples found in {SAMPLES_DIR}; I insist on having some to test with.") + with open(filename) as f: + data = json.loads(f.read()) + data["event_id"] = uuid.uuid4().hex # we set it once, before the loop. - for include_event_id in [True, False]: - for filename in [sample for sample in event_samples if sample not in known_broken][:1]: # one is enough - with open(filename) as f: - data = json.loads(f.read()) + for type_ in ["Foo", "Bar"]: # forces different groupers, leading to separate Issue objects + data['exception']['values'][0]['type'] = type_ - data["event_id"] = uuid.uuid4().hex # for good measure we reset this to avoid duplicates. 
+ if "timestamp" not in data: + # as per send_json command ("weirdly enough a large numer of sentry test data don't actually...") + data["timestamp"] = time.time() - if "timestamp" not in data: - # as per send_json command ("weirdly enough a large numer of sentry test data don't actually...") - data["timestamp"] = time.time() + event_id = data["event_id"] - event_id = data["event_id"] - if not include_event_id: - del data["event_id"] - - data_bytes = json.dumps(data).encode("utf-8") - data_bytes = ( - b'{"event_id": "%s"}\n{"type": "event"}\n' % event_id.encode("utf-8") + data_bytes) + data_bytes = json.dumps(data).encode("utf-8") + data_bytes = ( + b'{"event_id": "%s"}\n{"type": "event"}\n' % event_id.encode("utf-8") + data_bytes) + def check(): response = self.client.post( f"/api/{ project.id }/envelope/", content_type="application/json", @@ -337,62 +371,11 @@ class IngestViewTestCase(TransactionTestCase): self.assertEqual( 200, response.status_code, response.content if response.status_code != 302 else response.url) - @tag("samples") - def test_envelope_endpoint_reused_ids_different_exceptions(self): - # dirty copy/paste from test_envelope_endpoint, - project = Project.objects.create(name="test") - - sentry_auth_header = get_header_value(f"http://{ project.sentry_key }@hostisignored/{ project.id }") - - # first, we ingest many issues - command = SendJsonCommand() - command.stdout = io.StringIO() - command.stderr = io.StringIO() - - SAMPLES_DIR = os.getenv("SAMPLES_DIR", "../event-samples") - - event_samples = glob(SAMPLES_DIR + "/sentry/mobile1-xen.json") # this one has 'exception.values[0].type' - known_broken = [SAMPLES_DIR + "/" + s.strip() for s in _readlines(SAMPLES_DIR + "/KNOWN-BROKEN")] - - if len(event_samples) == 0: - raise Exception(f"No event samples found in {SAMPLES_DIR}; I insist on having some to test with.") - - for filename in [sample for sample in event_samples if sample not in known_broken][:1]: # one is enough - with open(filename) as f: - data = json.loads(f.read()) - data["event_id"] = uuid.uuid4().hex # we set it once, before the loop. 
- - for type_ in ["Foo", "Bar"]: # forces different groupers, leading to separate Issue objects - data['exception']['values'][0]['type'] = type_ - - if "timestamp" not in data: - # as per send_json command ("weirdly enough a large numer of sentry test data don't actually...") - data["timestamp"] = time.time() - - event_id = data["event_id"] - - data_bytes = json.dumps(data).encode("utf-8") - data_bytes = ( - b'{"event_id": "%s"}\n{"type": "event"}\n' % event_id.encode("utf-8") + data_bytes) - - def check(): - response = self.client.post( - f"/api/{ project.id }/envelope/", - content_type="application/json", - headers={ - "X-Sentry-Auth": sentry_auth_header, - "X-BugSink-DebugInfo": filename, - }, - data=data_bytes, - ) - self.assertEqual( - 200, response.status_code, response.content if response.status_code != 302 else response.url) - - if type_ == "Foo": + if type_ == "Foo": + check() + else: + with self.assertRaises(ViolatedExpectation): check() - else: - with self.assertRaises(ViolatedExpectation): - check() @tag("samples") def test_filestore(self): From 48a818bed1f48184245414ea38597fcce71119c6 Mon Sep 17 00:00:00 2001 From: Klaas van Schelven Date: Wed, 5 Nov 2025 12:36:17 +0100 Subject: [PATCH 11/31] Tests for envelope minidump API --- ingest/tests.py | 92 +++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 89 insertions(+), 3 deletions(-) diff --git a/ingest/tests.py b/ingest/tests.py index 879f1a8..d7e96ea 100644 --- a/ingest/tests.py +++ b/ingest/tests.py @@ -296,7 +296,7 @@ class IngestViewTestCase(TransactionTestCase): SAMPLES_DIR = os.getenv("SAMPLES_DIR", "../event-samples") - filename = glob(SAMPLES_DIR + "/sentry/mobile1-xen.json")[0] # pick a fixed one for reproducibility + filename = glob(SAMPLES_DIR + "/bugsink/contexts.json")[0] # pick a fixed one for reproducibility for i, include_event_id in enumerate([True, False]): with open(filename) as f: @@ -330,6 +330,94 @@ class IngestViewTestCase(TransactionTestCase): self.assertEqual(1 + i, Event.objects.count()) + @tag("samples") + def test_envelope_endpoint_event_and_minidump(self): + # dirty copy/paste from the integration test, let's start with "something", we can always clean it later. + project = Project.objects.create(name="test") + + sentry_auth_header = get_header_value(f"http://{ project.sentry_key }@hostisignored/{ project.id }") + + SAMPLES_DIR = os.getenv("SAMPLES_DIR", "../event-samples") + + filename = glob(SAMPLES_DIR + "/bugsink/contexts.json")[0] # pick a fixed one for reproducibility + with open(filename) as f: + data = json.loads(f.read()) + + data["event_id"] = uuid.uuid4().hex # for good measure we reset this to avoid duplicates. 
+ + if "timestamp" not in data: + # as per send_json command ("weirdly enough a large numer of sentry test data don't actually...") + data["timestamp"] = time.time() + + filename = glob(SAMPLES_DIR + "/minidumps/linux_overflow.dmp")[0] # pick a fixed one for reproducibility + with open(filename, 'rb') as f: + minidump_bytes = f.read() + + event_id = data["event_id"] + + event_bytes = json.dumps(data).encode("utf-8") + data_bytes = ( + b'{"event_id": "%s"}\n' % event_id.encode("utf-8") + + b'{"type": "event"}\n' + event_bytes + b"\n" + + b'{"type": "attachment", "attachment_type": "event.minidump", "length": %d}\n' % len(minidump_bytes) + + minidump_bytes + ) + + response = self.client.post( + f"/api/{ project.id }/envelope/", + content_type="application/json", + headers={ + "X-Sentry-Auth": sentry_auth_header, + }, + data=data_bytes, + ) + self.assertEqual( + 200, response.status_code, response.content if response.status_code != 302 else response.url) + + self.assertEqual(1, Event.objects.count()) + event = Event.objects.get() + self.assertTrue("prod" in ([tag.value.value for tag in event.tags.all()])) # from the sample event + + self.assertEqual('SIGABRT: Fatal Error: SIGABRT', Event.objects.get().title()) + + @tag("samples") + def test_envelope_endpoint_minidump_only(self): + # dirty copy/paste from the integration test, let's start with "something", we can always clean it later. + project = Project.objects.create(name="test") + + sentry_auth_header = get_header_value(f"http://{ project.sentry_key }@hostisignored/{ project.id }") + + SAMPLES_DIR = os.getenv("SAMPLES_DIR", "../event-samples") + + filename = glob(SAMPLES_DIR + "/minidumps/linux_overflow.dmp")[0] # pick a fixed one for reproducibility + with open(filename, 'rb') as f: + minidump_bytes = f.read() + + event_id = uuid.uuid4().hex # required at the envelope level so we provide it. 
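For reference, a minimal self-contained sketch of the envelope framing these tests build by hand (the helper name is made up; unlike the tests above, which only set an explicit "length" for the binary minidump item, this sketch sets it for every item):

import json

def frame_envelope(event_id, items):
    # items: list of (item_headers: dict, payload: bytes)
    out = json.dumps({"event_id": event_id}).encode("utf-8") + b"\n"
    for item_headers, payload in items:
        item_headers = dict(item_headers, length=len(payload))  # explicit length, so binary payloads need no newline termination
        out += json.dumps(item_headers).encode("utf-8") + b"\n" + payload + b"\n"
    return out

# e.g.: frame_envelope(event_id, [
#     ({"type": "event"}, event_bytes),
#     ({"type": "attachment", "attachment_type": "event.minidump"}, minidump_bytes),
# ])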
+ + data_bytes = ( + b'{"event_id": "%s"}\n' % event_id.encode("utf-8") + + b'{"type": "attachment", "attachment_type": "event.minidump", "length": %d}\n' % len(minidump_bytes) + + minidump_bytes + ) + + response = self.client.post( + f"/api/{ project.id }/envelope/", + content_type="application/json", + headers={ + "X-Sentry-Auth": sentry_auth_header, + }, + data=data_bytes, + ) + self.assertEqual( + 200, response.status_code, response.content if response.status_code != 302 else response.url) + + self.assertEqual(1, Event.objects.count()) + event = Event.objects.get() + self.assertFalse("prod" in ([tag.value.value for tag in event.tags.all()])) # no sample event, so False + + self.assertEqual('SIGABRT: Fatal Error: SIGABRT', Event.objects.get().title()) + @tag("samples") def test_envelope_endpoint_reused_ids_different_exceptions(self): # dirty copy/paste from test_envelope_endpoint, @@ -364,7 +452,6 @@ class IngestViewTestCase(TransactionTestCase): content_type="application/json", headers={ "X-Sentry-Auth": sentry_auth_header, - "X-BugSink-DebugInfo": filename, }, data=data_bytes, ) @@ -450,7 +537,6 @@ class IngestViewTestCase(TransactionTestCase): content_type="application/json", headers={ "X-Sentry-Auth": sentry_auth_header, - "X-BugSink-DebugInfo": filename, }, data=data_bytes, ) From c4cf038a93df7843f69c10076e289c95b8375701 Mon Sep 17 00:00:00 2001 From: Klaas van Schelven Date: Wed, 5 Nov 2025 12:59:42 +0100 Subject: [PATCH 12/31] minidump after-digest cleanup envelope-based happy path only --- ingest/tasks.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/ingest/tasks.py b/ingest/tasks.py index 522ee3d..0077d46 100644 --- a/ingest/tasks.py +++ b/ingest/tasks.py @@ -1,3 +1,4 @@ +import contextlib import os import logging import json @@ -17,10 +18,12 @@ def digest(event_id, event_metadata): with open(get_filename_for_event_id(event_id), "rb") as f: event_data = json.loads(f.read().decode("utf-8")) + opened = [get_filename_for_event_id(event_id)] if event_metadata.get("has_minidump"): with open(get_filename_for_event_id(event_id, filetype="minidump"), "rb") as f: minidump_bytes = f.read() + opened += [get_filename_for_event_id(event_id, filetype="minidump")] else: minidump_bytes = None @@ -29,11 +32,10 @@ def digest(event_id, event_metadata): except ValidationError as e: logger.warning("ValidationError in digest_event", exc_info=e) finally: - # NOTE: if an SDK misbehaves, and sends the same event_id multiple times in quick succession, the line below - # will trigger a FileNotFoundError on the second attempt to delete the file (the files also overwrite each other - # on-ingest). In that case your logs will also a "ValidationError in digest_event". Although that means an error - # bubbles up from the below, at least for now I'm OK with that. (next steps _could_ be: [a] catching the error - # as expected [b] refusing to "just overwrite and doubly enqueue on-ingest" [c] reporting about this particular - # problem to the end-user etc... but at least "getting it really right" might actually be quite hard (race - # conditions) and I'm not so sure it's worth it. - os.unlink(get_filename_for_event_id(event_id)) + # NOTE: if an SDK misbehaves, and sends the same event_id multiple times in quick succession, the os.unlink + # below will trigger a FileNotFoundError on the second attempt to delete the file (the events also overwrite + # each other on-ingest, but that's separately dealt with, showing a "ValidationError in digest_event". 
We're + # just catching those and ignoring them (bubble-up is not desirable because it hinders follow-up cleanups) + for filename in opened: + with contextlib.suppress(FileNotFoundError): + os.unlink(filename) From cb8f913cbe6592a267fff0c1e7e05a61f82ed255 Mon Sep 17 00:00:00 2001 From: Klaas van Schelven Date: Wed, 5 Nov 2025 13:23:31 +0100 Subject: [PATCH 13/31] Document 2 TODOs --- ingest/views.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ingest/views.py b/ingest/views.py index 1a08819..84b3bc7 100644 --- a/ingest/views.py +++ b/ingest/views.py @@ -285,6 +285,8 @@ class BaseIngestAPIView(View): if minidump_bytes is not None: # we merge after validation: validation is about what's provided _externally_, not our own merging. + # TODO error handling + # TODO should not be inside immediate_atomic if it turns out to be slow merge_minidump_event(event_data, minidump_bytes) # I resisted the temptation to put `get_denormalized_fields_for_data` in an if-statement: you basically "always" From 21297eff2a726bd3e5348485d91bdcf8af032459 Mon Sep 17 00:00:00 2001 From: Klaas van Schelven Date: Wed, 5 Nov 2025 14:36:38 +0100 Subject: [PATCH 14/31] Comment about understanding (from reading the code, not actually double-checked right now) --- files/views.py | 1 + 1 file changed, 1 insertion(+) diff --git a/files/views.py b/files/views.py index 881584c..74db12e 100644 --- a/files/views.py +++ b/files/views.py @@ -151,6 +151,7 @@ def chunk_upload(request, organization_slug): # POST: upload (full-size) "chunks" and store them as Chunk objects; file.name whould be the sha1 of the content. chunks = [] if request.FILES: + # "file" and "file_gzip" are both possible multi-value keys for uploading (with associated semantics each) chunks = request.FILES.getlist("file") # NOTE: we read the whole unzipped file into memory; we _could_ take an approach like bugsink/streams.py. 
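To make the chunk-upload contract concrete, a hedged client-side sketch; the URL path, the Bearer auth scheme and the `requests` dependency are assumptions, the grounded part is that each part is sent under the multi-value key "file" and named by the sha1 of its content:

import hashlib

import requests  # assumed client-side dependency, not part of this codebase

def upload_chunks(base_url, organization_slug, token, blobs):
    files = [
        ("file", (hashlib.sha1(blob, usedforsecurity=False).hexdigest(), blob))
        for blob in blobs
    ]
    return requests.post(
        f"{base_url}/api/0/organizations/{organization_slug}/chunk-upload/",  # assumed path
        headers={"Authorization": f"Bearer {token}"},  # assumed auth scheme
        files=files,
    )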
From 7d008da4a136e254f02bea9f5973035569282351 Mon Sep 17 00:00:00 2001 From: Klaas van Schelven Date: Wed, 5 Nov 2025 21:40:07 +0100 Subject: [PATCH 15/31] PoC for difs_assemble works w/ sentry-client; no actual handling yet; TODOs in-code See #82 --- bugsink/urls.py | 5 ++- files/tasks.py | 8 ++--- files/views.py | 81 ++++++++++++++++++++++++++++++++++++++++++++++--- 3 files changed, 85 insertions(+), 9 deletions(-) diff --git a/bugsink/urls.py b/bugsink/urls.py index 2a65174..9e15360 100644 --- a/bugsink/urls.py +++ b/bugsink/urls.py @@ -14,7 +14,7 @@ from teams.views import debug_email as debug_teams_email from bugsink.app_settings import get_settings from users.views import signup, confirm_email, resend_confirmation, request_reset_password, reset_password, preferences from ingest.views import download_envelope -from files.views import chunk_upload, artifact_bundle_assemble, api_root, api_catch_all +from files.views import chunk_upload, artifact_bundle_assemble, difs_assemble, api_root, api_catch_all from bugsink.decorators import login_exempt from events.api_views import EventViewSet @@ -71,6 +71,9 @@ urlpatterns = [ path("api/0/organizations//artifactbundle/assemble/", artifact_bundle_assemble, name="artifact_bundle_assemble"), + path("api/0/projects///files/difs/assemble/", difs_assemble, + name="difs_assemble"), + path('api/', include('ingest.urls')), path('api/0/', api_root, name='api_root'), diff --git a/files/tasks.py b/files/tasks.py index a193708..4e6a62b 100644 --- a/files/tasks.py +++ b/files/tasks.py @@ -105,8 +105,8 @@ def assemble_file(checksum, chunk_checksums, filename): # NOTE: unimplemented checks/tricks # * total file-size v.s. some max - # * explicit check chunk availability (as it stands, our processing is synchronous, so no need) - # * skip-on-checksum-exists + # * explicit check chunk availability + # * skip this whole thing when the (whole-file) checksum exists chunks = Chunk.objects.filter(checksum__in=chunk_checksums) chunks_dicts = {chunk.checksum: chunk for chunk in chunks} @@ -117,7 +117,7 @@ def assemble_file(checksum, chunk_checksums, filename): if sha1(data, usedforsecurity=False).hexdigest() != checksum: raise Exception("checksum mismatch") - result = File.objects.get_or_create( + file, created = File.objects.get_or_create( checksum=checksum, defaults={ "size": len(data), @@ -129,7 +129,7 @@ def assemble_file(checksum, chunk_checksums, filename): # be used in multiple files (which are still being assembled) but with chunksizes in the order of 1MiB, I'd say this # is unlikely. chunks.delete() - return result + return file, created @shared_task diff --git a/files/views.py b/files/views.py index 74db12e..e357fe2 100644 --- a/files/views.py +++ b/files/views.py @@ -15,8 +15,8 @@ from bugsink.app_settings import get_settings from bugsink.transaction import durable_atomic, immediate_atomic from bsmain.models import AuthToken -from .models import Chunk, File -from .tasks import assemble_artifact_bundle +from .models import Chunk, File, FileMetadata +from .tasks import assemble_artifact_bundle, assemble_file logger = logging.getLogger("bugsink.api") @@ -86,7 +86,8 @@ def get_chunk_upload_settings(request, organization_slug): # yet. "release_files", - # this would seem to be the "javascript sourcemaps" thing, but how exactly I did not check yet. 
+ # on second reading I would say: this is "actual source code", but I did not check yet and "don't touch it" + # (even though we don't actually have an implementation for sources yet) "sources", # https://github.com/getsentry/sentry/discussions/46967 @@ -100,7 +101,7 @@ def get_chunk_upload_settings(request, organization_slug): # "artifact_bundles_v2", # the rest of the options are below: - # "debug_files", + "debug_files", # "release_files", # "pdbs", # "bcsymbolmaps", @@ -199,6 +200,78 @@ def artifact_bundle_assemble(request, organization_slug): return JsonResponse({"state": ChunkFileState.CREATED, "missingChunks": []}) +@csrf_exempt # we're in API context here; this could potentially be pulled up to a higher level though +@requires_auth_token +def difs_assemble(request, organization_slug, project_slug): + # TODO move to tasks.something.delay + # TODO think about the right transaction around this + data = json.loads(request.body) + + file_checksums = set(data.keys()) + + existing_files = { + f.file.checksum: f + for f in FileMetadata.objects.filter(file__checksum__in=file_checksums) + } + + all_requested_chunks = { + chunk + for file_info in data.values() + for chunk in file_info.get("chunks", []) + } + + available_chunks = set( + Chunk.objects.filter(checksum__in=all_requested_chunks).values_list("checksum", flat=True) + ) + + response = {} + + for file_checksum, file_info in data.items(): + if file_checksum in existing_files: + response[file_checksum] = { + "state": ChunkFileState.OK, + "missingChunks": [], + # "dif": serialize(existing_files[file_checksum]), # TODO: figure out if this is required. + } + continue + + file_chunks = file_info.get("chunks", []) + + # the sentry-cli sends an empty "chunks" list when just polling for file existence; since we already handled the + # case of existing files above, we can simply return NOT_FOUND here. + if not file_chunks: + response[file_checksum] = { + "state": ChunkFileState.NOT_FOUND, + "missingChunks": [], + } + continue + + missing_chunks = [c for c in file_chunks if c not in available_chunks] + if missing_chunks: + response[file_checksum] = { + "state": ChunkFileState.NOT_FOUND, + "missingChunks": missing_chunks, + } + continue + + file, _ = assemble_file(file_checksum, file_chunks, filename=file_info["name"]) + FileMetadata.objects.get_or_create( + debug_id=file_info["debug_id"], + file_type="dif", # I think? check! + defaults={ + "file": file, + "data": "{}", # this is the "catch all" field but I don't think we have anything in this case. + } + ) + + response[file_checksum] = { + "state": ChunkFileState.OK, + "missingChunks": [], + } + + return JsonResponse(response) + + @user_passes_test(lambda u: u.is_superuser) @durable_atomic def download_file(request, checksum): From ad077b405611db7058c9eea11708ad7ce494d6bf Mon Sep 17 00:00:00 2001 From: Klaas van Schelven Date: Sun, 9 Nov 2025 23:11:10 +0100 Subject: [PATCH 16/31] file_info's debug_id is optional (as per my notes; didn't recheck this when committing) --- files/views.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/files/views.py b/files/views.py index e357fe2..ce307bb 100644 --- a/files/views.py +++ b/files/views.py @@ -256,7 +256,7 @@ def difs_assemble(request, organization_slug, project_slug): file, _ = assemble_file(file_checksum, file_chunks, filename=file_info["name"]) FileMetadata.objects.get_or_create( - debug_id=file_info["debug_id"], + debug_id=file_info.get("debug_id"), file_type="dif", # I think? check! 
defaults={ "file": file, From 1ed03ce053c3004e5bec1de91d0e0265667025df Mon Sep 17 00:00:00 2001 From: Klaas van Schelven Date: Tue, 11 Nov 2025 09:39:04 +0100 Subject: [PATCH 17/31] Support request.body when doing Chuncked Transfer Encoding (ran into this b/c the native minidump upload uses chunked mode and our impl. of that looks at request.body (via FILES)) See #9 --- bugsink/wsgi.py | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/bugsink/wsgi.py b/bugsink/wsgi.py index b527904..8e87d2f 100644 --- a/bugsink/wsgi.py +++ b/bugsink/wsgi.py @@ -47,6 +47,29 @@ def allowed_hosts_error_message(domain, allowed_hosts): return msg + "Add '%s' to ALLOWED_HOSTS or configure proxy to use 'Host: %s'." % (domain, proxy_suggestion) +class NoopClose: + """Delegator: Gunicorn's Body doesn't implement .close(); Django calls that it in request.body's finally clause. + That .close() call in itself is slightly surprising to me (and I have not copied it in my own streaming reads) b/c + the [WSGI spec](https://peps.python.org/pep-3333/#input-and-error-streams) says: + + > Applications conforming to this specification must not use any other methods or attributes of the input or errors + > objects. In particular, applications must not attempt to close these streams, even if they possess close() + > methods. + + In the end, Django conforms to spec because LimitedStream _also_ drops the .close() (it's subclassing `io.IOBase`), + but one wonders why they call it in the first place. Anyway, just stub it and we're good. + """ + + def __init__(self, stream): + self._stream = stream + + def __getattr__(self, name): + return getattr(self._stream, name) + + def close(self): + return None + + class CustomWSGIRequest(WSGIRequest): """ Custom WSQIRequest subclass with 3 fixes/changes: @@ -71,7 +94,7 @@ class CustomWSGIRequest(WSGIRequest): super().__init__(environ) if "CONTENT_LENGTH" not in environ and "HTTP_TRANSFER_ENCODING" in environ: - self._stream = self.environ["wsgi.input"] + self._stream = NoopClose(self.environ["wsgi.input"]) def get_host(self): """ From 80f65c7058dbbac69bcad3bba7056f28e1e5d32b Mon Sep 17 00:00:00 2001 From: Klaas van Schelven Date: Tue, 11 Nov 2025 09:45:53 +0100 Subject: [PATCH 18/31] Comment: CustomWSGIRequest.get_host(): no changes needed for Django 5.2 upgrade this method wasn't changed upstream. See #89 --- bugsink/wsgi.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bugsink/wsgi.py b/bugsink/wsgi.py index 8e87d2f..2130b3b 100644 --- a/bugsink/wsgi.py +++ b/bugsink/wsgi.py @@ -107,7 +107,7 @@ class CustomWSGIRequest(WSGIRequest): # For /health/ endpoints, we skip the ALLOWED_HOSTS validation (see #140). return self._get_raw_host() - # copied from HttpRequest.get_host() in Django 4.2, with modifications. + # copied from HttpRequest.get_host() in Django 5.2, with modifications. 
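A small usage illustration of the NoopClose wrapper introduced above, with io.BytesIO standing in for Gunicorn's body object:

import io

from bugsink.wsgi import NoopClose

raw = io.BytesIO(b"chunked body")
wrapped = NoopClose(raw)
assert wrapped.read() == b"chunked body"  # attribute access is delegated to the wrapped stream
wrapped.close()                           # intercepted: Django's .close() in request.body becomes a no-op
assert raw.closed is False                # the underlying stream is left alone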
host = self._get_raw_host() From 54c96eb68001af01b176515caf0061aed43cb800 Mon Sep 17 00:00:00 2001 From: Klaas van Schelven Date: Tue, 11 Nov 2025 09:48:00 +0100 Subject: [PATCH 19/31] Minidump upload: more explicit errors (and logging) --- ingest/views.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/ingest/views.py b/ingest/views.py index edbb7ba..95478e7 100644 --- a/ingest/views.py +++ b/ingest/views.py @@ -39,6 +39,7 @@ from alerts.tasks import send_new_issue_alert, send_regression_alert from compat.timestamp import format_timestamp, parse_timestamp from tags.models import digest_tags from bsmain.utils import b108_makedirs +from sentry_sdk_extensions import capture_or_log_exception from sentry.minidump import merge_minidump_event from .parsers import StreamingEnvelopeParser, ParseError @@ -735,12 +736,17 @@ class MinidumpAPIView(BaseIngestAPIView): project = self.get_project_for_request(project_pk, request) try: - minidump_bytes = request.FILES["upload_file_minidump"].read() + if "upload_file_minidump" not in request.FILES: + return JsonResponse({"detail": "upload_file_minidump not found"}, status=HTTP_400_BAD_REQUEST) + minidump_bytes = request.FILES["upload_file_minidump"].read() event_id = self.process_minidump(ingested_at, minidump_bytes, project, request) return JsonResponse({"id": event_id}) except Exception as e: + # we're still figuring out what this endpoint should do; so we log errors to learn from them while saying + # to the client "400 Bad Request" if we can't handle their stuff. + capture_or_log_exception(e, logger) return JsonResponse({"detail": str(e)}, status=HTTP_400_BAD_REQUEST) From 72aab81d7d8bfca16fd1565d1ab9ab55ad962cfb Mon Sep 17 00:00:00 2001 From: Klaas van Schelven Date: Tue, 11 Nov 2025 13:39:44 +0100 Subject: [PATCH 20/31] Add ContentEncodingCheckMiddleware --- bugsink/middleware.py | 37 +++++++++++++++++++++++++++++++++++++ bugsink/settings/default.py | 1 + bugsink/tests.py | 7 +++++++ ingest/tests.py | 5 ++++- ingest/urls.py | 4 ++-- 5 files changed, 51 insertions(+), 3 deletions(-) diff --git a/bugsink/middleware.py b/bugsink/middleware.py index 807f617..c7a8c9d 100644 --- a/bugsink/middleware.py +++ b/bugsink/middleware.py @@ -9,11 +9,48 @@ from django.utils.translation import get_supported_language_variant from django.utils.translation.trans_real import parse_accept_lang_header from django.utils import translation from django.urls import get_script_prefix +from django.http import HttpResponseBadRequest performance_logger = logging.getLogger("bugsink.performance.views") +class ContentEncodingCheckMiddleware: + """ + We don't just globally interpret Content-Encoding for all views since: + + 1. this increases our attack service (or forces us to reason about how it doesn't) + 2. forces us to think about the interplay of Django's POST/FILES handling and maximums (DATA_UPLOAD_MAX_MEMORY_SIZE) + and our own maximums and handling. + 3. the various maximums for reading from streaming requests are per-view (underlying data-type) anyway. + + Instead, the only global thing we do is "fail explicitly". 
+ """ + + # NOTE: once this list becomes long, we could switch to a per-view decorator (with the maximum bytes as a value) + SUPPORTED_VIEWS = [ + "ingest-store", + "ingest-envelope", + ] + + def __init__(self, get_response): + self.get_response = get_response + + def __call__(self, request): + return self.get_response(request) + + def process_view(self, request, view_func, view_args, view_kwargs): + if request.resolver_match: + view_name = request.resolver_match.view_name + else: + view_name = "[unknown]" + + if "HTTP_CONTENT_ENCODING" in request.META and view_name not in self.SUPPORTED_VIEWS: + return HttpResponseBadRequest(f"Content-Encoding handling is not supported for endpoint `{view_name}`") + + return None # proceed normally + + class DisallowChunkedMiddleware: def __init__(self, get_response): self.get_response = get_response diff --git a/bugsink/settings/default.py b/bugsink/settings/default.py index 4591012..60867ba 100644 --- a/bugsink/settings/default.py +++ b/bugsink/settings/default.py @@ -129,6 +129,7 @@ AUTH_USER_MODEL = "users.User" TAILWIND_APP_NAME = 'theme' MIDDLEWARE = [ + "bugsink.middleware.ContentEncodingCheckMiddleware", 'bugsink.middleware.SetRemoteAddrMiddleware', 'bugsink.middleware.DisallowChunkedMiddleware', 'django.middleware.security.SecurityMiddleware', diff --git a/bugsink/tests.py b/bugsink/tests.py index 865e7f7..7154a87 100644 --- a/bugsink/tests.py +++ b/bugsink/tests.py @@ -401,6 +401,13 @@ class SetRemoteAddrMiddlewareTestCase(RegularTestCase): SetRemoteAddrMiddleware.parse_x_forwarded_for("123.123.123.123,1.2.3.4") +class ContentEncodingCheckMiddlewareTestCase(DjangoTestCase): + + def test_speak_brotli_with_arbitrary_view_fails(self): + response = self.client.post("/", headers={"Content-Encoding": "br"}) + self.assertTrue(b"Content-Encoding handling is not supported for endpoint `home`" in response.content) + + class AllowedHostsMsgTestCase(DjangoTestCase): def test_allowed_hosts_error_message(self): diff --git a/ingest/tests.py b/ingest/tests.py index b245ad1..1719297 100644 --- a/ingest/tests.py +++ b/ingest/tests.py @@ -471,7 +471,7 @@ class IngestViewTestCase(TransactionTestCase): data_bytes = BROTLI_BOMB_4G t0 = time.time() - self.client.post( + response = self.client.post( f"/api/{ project.id }/envelope/", content_type="application/json", headers={ @@ -486,6 +486,9 @@ class IngestViewTestCase(TransactionTestCase): # the failing version is well above 5s (I just stopped the process after ~30s) self.fail("Brotli bomb caused excessive processing time: %d seconds" % (time.time() - t0)) + self.assertTrue(b"Max length" in response.content, response.content) + self.assertTrue(b"exceeded" in response.content, response.content) + @tag("samples") def test_filestore(self): # quick & dirty way to test the filestore; in absence of a proper test for it, we just run a more-or-less diff --git a/ingest/urls.py b/ingest/urls.py index 9d42b03..d98eaf3 100644 --- a/ingest/urls.py +++ b/ingest/urls.py @@ -4,8 +4,8 @@ from .views import IngestEventAPIView, IngestEnvelopeAPIView, MinidumpAPIView urlpatterns = [ # project_pk has to be an int per Sentry Client expectations. - path("/store/", IngestEventAPIView.as_view()), - path("/envelope/", IngestEnvelopeAPIView.as_view()), + path("/store/", IngestEventAPIView.as_view(), name="ingest-store"), + path("/envelope/", IngestEnvelopeAPIView.as_view(), name="ingest-envelope"), # is this "ingest"? it is at least in the sense that it matches the API schema and downstream auth etc. 
path("/minidump/", MinidumpAPIView.as_view()), From 937df4cbb8d841667cbe5a091e79387aedbe8936 Mon Sep 17 00:00:00 2001 From: Klaas van Schelven Date: Tue, 11 Nov 2025 13:50:07 +0100 Subject: [PATCH 21/31] minidump endpoint: support content encoding adds readline() method to GeneratorReader (ChatGPT-generated; eyeballed for correctness) to match the Django FILES/POST handling expectations. --- bugsink/middleware.py | 1 + bugsink/streams.py | 52 +++++++++++++++++++++++++++++++++++++++---- ingest/urls.py | 2 +- ingest/views.py | 8 ++++++- 4 files changed, 57 insertions(+), 6 deletions(-) diff --git a/bugsink/middleware.py b/bugsink/middleware.py index c7a8c9d..0aa63fd 100644 --- a/bugsink/middleware.py +++ b/bugsink/middleware.py @@ -31,6 +31,7 @@ class ContentEncodingCheckMiddleware: SUPPORTED_VIEWS = [ "ingest-store", "ingest-envelope", + "ingest-minidump", ] def __init__(self, get_response): diff --git a/bugsink/streams.py b/bugsink/streams.py index 2453f08..c542009 100644 --- a/bugsink/streams.py +++ b/bugsink/streams.py @@ -5,6 +5,7 @@ import io import brotli from bugsink.app_settings import get_settings +from bugsink.utils import assert_ DEFAULT_CHUNK_SIZE = 8 * 1024 @@ -119,22 +120,65 @@ class GeneratorReader: del self.buffer[:size] return result + def readline(self, size=-1): + newline_index = self.buffer.find(b"\n") + while newline_index == -1: + chunk = self.read(DEFAULT_CHUNK_SIZE) + if not chunk: + break + self.buffer.extend(chunk) + newline_index = self.buffer.find(b"\n") + + if newline_index != -1: + end = newline_index + 1 + else: + end = len(self.buffer) + + if size >= 0: + end = min(end, size) + + result = bytes(self.buffer[:end]) + del self.buffer[:end] + return result + def content_encoding_reader(request): encoding = request.META.get("HTTP_CONTENT_ENCODING", "").lower() - if encoding == "gzip": - return GeneratorReader(zlib_generator(request, WBITS_PARAM_FOR_GZIP), bad_request_exceptions=(zlib.error,)) + return GeneratorReader( + zlib_generator(request._stream, WBITS_PARAM_FOR_GZIP), + bad_request_exceptions=(zlib.error,), + ) if encoding == "deflate": - return GeneratorReader(zlib_generator(request, WBITS_PARAM_FOR_DEFLATE), bad_request_exceptions=(zlib.error,)) + return GeneratorReader( + zlib_generator(request._stream, WBITS_PARAM_FOR_DEFLATE), + bad_request_exceptions=(zlib.error,) + ) if encoding == "br": - return GeneratorReader(brotli_generator(request), bad_request_exceptions=(brotli.error, BrotliError)) + return GeneratorReader( + brotli_generator(request._stream), + bad_request_exceptions=(brotli.error, BrotliError) + ) return request +def handle_request_content_encoding(request): + """Turns a request w/ Content-Encoding into an unpacked equivalent; for further "regular" (POST, FILES) handling + by Django. 
+ """ + + encoding = request.META.get("HTTP_CONTENT_ENCODING", "").lower() + if encoding in ["gzip", "deflate", "br"]: + assert_(not request._read_started) + request._stream = content_encoding_reader(request) + + request.META["CONTENT_LENGTH"] = str(pow(2, 32) - 1) # large enough (we can't predict the decompressed value) + request.META.pop("HTTP_CONTENT_ENCODING") # the resulting request is no longer encoded + + def compress_with_zlib(input_stream, wbits, chunk_size=DEFAULT_CHUNK_SIZE): # mostly useful for testing (compress-decompress cycles) diff --git a/ingest/urls.py b/ingest/urls.py index d98eaf3..202bc2c 100644 --- a/ingest/urls.py +++ b/ingest/urls.py @@ -8,5 +8,5 @@ urlpatterns = [ path("/envelope/", IngestEnvelopeAPIView.as_view(), name="ingest-envelope"), # is this "ingest"? it is at least in the sense that it matches the API schema and downstream auth etc. - path("/minidump/", MinidumpAPIView.as_view()), + path("/minidump/", MinidumpAPIView.as_view(), name="ingest-minidump"), ] diff --git a/ingest/views.py b/ingest/views.py index 95478e7..d286db5 100644 --- a/ingest/views.py +++ b/ingest/views.py @@ -29,7 +29,9 @@ from issues.regressions import issue_is_regression from bugsink.transaction import immediate_atomic, delay_on_commit from bugsink.exceptions import ViolatedExpectation -from bugsink.streams import content_encoding_reader, MaxDataReader, MaxDataWriter, NullWriter, MaxLengthExceeded +from bugsink.streams import ( + content_encoding_reader, MaxDataReader, MaxDataWriter, NullWriter, MaxLengthExceeded, + handle_request_content_encoding) from bugsink.app_settings import get_settings from events.models import Event @@ -732,6 +734,10 @@ class MinidumpAPIView(BaseIngestAPIView): def post(self, request, project_pk=None): # not reusing the CORS stuff here; minidump-from-browser doesn't make sense. + # TODO: actually implement max (we just use Django defaults now, which will switch to write-to-tmp after 2.5M + # for the file, but this can still swamp your CPU/tmp dir. + handle_request_content_encoding(request) + ingested_at = datetime.now(timezone.utc) project = self.get_project_for_request(project_pk, request) From ab065a632932e5c85cf714ad8c9bde987ca87116 Mon Sep 17 00:00:00 2001 From: Klaas van Schelven Date: Tue, 11 Nov 2025 15:25:51 +0100 Subject: [PATCH 22/31] api_catch_all: header-based rather than try-and-recover, just look at the headers and show body/POST etc. this avoids hard-to-reason about situations where either of those won't work because the other has already been executed; in combination with reasoning about max size usage the explicit solution is simply easier to reason about. further: * makes api_catch_all one of the content_encoding-ready views. 
* implement a max length for the ingest api view --- bugsink/middleware.py | 2 ++ bugsink/streams.py | 4 ++-- files/views.py | 42 +++++++++++++++++++++++++++++++----------- ingest/views.py | 5 ++--- 4 files changed, 37 insertions(+), 16 deletions(-) diff --git a/bugsink/middleware.py b/bugsink/middleware.py index 0aa63fd..27dddf8 100644 --- a/bugsink/middleware.py +++ b/bugsink/middleware.py @@ -32,6 +32,8 @@ class ContentEncodingCheckMiddleware: "ingest-store", "ingest-envelope", "ingest-minidump", + + "api_catch_all", ] def __init__(self, get_response): diff --git a/bugsink/streams.py b/bugsink/streams.py index c542009..3fc93b7 100644 --- a/bugsink/streams.py +++ b/bugsink/streams.py @@ -165,7 +165,7 @@ def content_encoding_reader(request): return request -def handle_request_content_encoding(request): +def handle_request_content_encoding(request, max_length): """Turns a request w/ Content-Encoding into an unpacked equivalent; for further "regular" (POST, FILES) handling by Django. """ @@ -173,7 +173,7 @@ def handle_request_content_encoding(request): encoding = request.META.get("HTTP_CONTENT_ENCODING", "").lower() if encoding in ["gzip", "deflate", "br"]: assert_(not request._read_started) - request._stream = content_encoding_reader(request) + request._stream = MaxDataReader(max_length, content_encoding_reader(request)) request.META["CONTENT_LENGTH"] = str(pow(2, 32) - 1) # large enough (we can't predict the decompressed value) request.META.pop("HTTP_CONTENT_ENCODING") # the resulting request is no longer encoded diff --git a/files/views.py b/files/views.py index ce307bb..96922b8 100644 --- a/files/views.py +++ b/files/views.py @@ -13,6 +13,7 @@ from sentry.assemble import ChunkFileState from bugsink.app_settings import get_settings from bugsink.transaction import durable_atomic, immediate_atomic +from bugsink.streams import handle_request_content_encoding from bsmain.models import AuthToken from .models import Chunk, File, FileMetadata @@ -292,6 +293,8 @@ def api_catch_all(request, subpath): # the existance of this view (and the associated URL pattern) has the effect of `APPEND_SLASH=False` for our API # endpoints, which is a good thing: for API enpoints you generally don't want this kind of magic (explicit breakage # is desirable for APIs, and redirects don't even work for POST/PUT data) + MAX_API_CATCH_ALL_SIZE = 1_000_000 # security and usability meet at this value (or below) + handle_request_content_encoding(request, MAX_API_CATCH_ALL_SIZE) if not get_settings().API_LOG_UNIMPLEMENTED_CALLS: raise Http404("Unimplemented API endpoint: /api/" + subpath) @@ -302,27 +305,44 @@ def api_catch_all(request, subpath): f" Method: {request.method}", ] + interesting_meta_keys = ["CONTENT_TYPE", "CONTENT_LENGTH", "HTTP_TRANSFER_ENCODING"] + interesting_headers = { + k: request.META[k] for k in interesting_meta_keys if k in request.META + } + + if interesting_headers: + lines.append(" Headers:") + for k, v in interesting_headers.items(): + lines.append(f" {k}: {v}") + if request.GET: lines.append(f" GET: {request.GET.dict()}") - body = request.body # note: must be above request.POST access to avoid "You cannot access body after reading ..." 
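A minimal sketch of exercising the content-encoding support from a Django test client (the path and payload are illustrative; the data/content_type/headers shapes mirror the existing ingest tests):

import gzip
import json

def post_gzipped(client, path, payload):
    body = gzip.compress(json.dumps(payload).encode("utf-8"))
    return client.post(
        path,
        data=body,
        content_type="application/json",
        headers={"Content-Encoding": "gzip"},
    )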
- if request.POST: - lines.append(f" POST: {request.POST.dict()}") + content_type = request.META.get("CONTENT_TYPE", "") + if content_type == "application/x-www-form-urlencoded" or content_type.startswith("multipart/form-data"): + if request.POST: + lines.append(f" POST: {request.POST.dict()}") + if request.FILES: + lines.append(f" FILES: {[f.name for f in request.FILES.values()]}") - if body: - try: - decoded = body.decode("utf-8", errors="replace").strip() - lines.append(" Body:") - lines.append(f" {decoded[:500]}") + else: + body = request.read(MAX_API_CATCH_ALL_SIZE) + decoded = body.decode("utf-8", errors="replace").strip() + + if content_type == "application/json": + shown_pretty = False try: parsed = json.loads(decoded) - pretty = json.dumps(parsed, indent=2)[:10_000] + pretty = json.dumps(parsed, indent=2) lines.append(" JSON body:") lines.extend(f" {line}" for line in pretty.splitlines()) + shown_pretty = True except json.JSONDecodeError: pass - except Exception as e: - lines.append(f" Body: ") + + if not shown_pretty: + lines.append(" Body:") + lines.append(f" {body}") logger.info("\n".join(lines)) raise Http404("Unimplemented API endpoint: /api/" + subpath) diff --git a/ingest/views.py b/ingest/views.py index d286db5..3aebb60 100644 --- a/ingest/views.py +++ b/ingest/views.py @@ -734,9 +734,8 @@ class MinidumpAPIView(BaseIngestAPIView): def post(self, request, project_pk=None): # not reusing the CORS stuff here; minidump-from-browser doesn't make sense. - # TODO: actually implement max (we just use Django defaults now, which will switch to write-to-tmp after 2.5M - # for the file, but this can still swamp your CPU/tmp dir. - handle_request_content_encoding(request) + # TODO: actually pick/configure max + handle_request_content_encoding(request, 50 * 1024 * 1024) ingested_at = datetime.now(timezone.utc) project = self.get_project_for_request(project_pk, request) From b60980c8f36b25e659f052d0965d29b763d8dbe9 Mon Sep 17 00:00:00 2001 From: Klaas van Schelven Date: Wed, 12 Nov 2025 14:57:02 +0100 Subject: [PATCH 23/31] PoC: Minidumps w/ symbolification Plenty of TODOs left; but this proves we can find: * file names * function names * line nos * source context See #82 --- files/minidump.py | 143 +++++++++++++++++++++++++++++++++++++++++++++ files/views.py | 8 ++- sentry/minidump.py | 59 +++++++++---------- 3 files changed, 177 insertions(+), 33 deletions(-) create mode 100644 files/minidump.py diff --git a/files/minidump.py b/files/minidump.py new file mode 100644 index 0000000..2ac523c --- /dev/null +++ b/files/minidump.py @@ -0,0 +1,143 @@ +import io +import zipfile +import symbolic +from sentry_sdk_extensions import capture_or_log_exception + +from bugsink.utils import assert_ +from .models import FileMetadata + + +def get_single_object(archive): + # our understanding: sentry-cli uploads single-object archives; we need to get the single object out of it... + # ...but this does raise the question of why archives exist at all... 
hence the assert + objects = list(archive.iter_objects()) + assert_(len(objects) == 1) + return objects[0] + + +def build_cfi_map_from_minidump_bytes(minidump_bytes): + process_state = symbolic.minidump.ProcessState.from_minidump_buffer(minidump_bytes) + + frame_info_map = symbolic.minidump.FrameInfoMap.new() + + for module in process_state.modules(): + if not module.debug_id: + continue + + dashed_debug_id = symbolic.debuginfo.id_from_breakpad(module.debug_id) + if FileMetadata.objects.filter(debug_id=dashed_debug_id, file_type="dbg").count() == 0: + continue + + dif_bytes = FileMetadata.objects.get(debug_id=dashed_debug_id, file_type="dbg").file.data + archive = symbolic.debuginfo.Archive.from_bytes(dif_bytes) + + debug_object = get_single_object(archive) + + cfi = symbolic.minidump.CfiCache.from_object(debug_object) + frame_info_map.add(module.debug_id, cfi) + + return frame_info_map + + +def extract_dif_metadata(dif_bytes): + try: + archive = symbolic.debuginfo.Archive.from_bytes(dif_bytes) + debug_object = get_single_object(archive) + return { + "kind": debug_object.kind, # "dbg", "lib", "src" + "code_id": debug_object.code_id, + "debug_id": debug_object.debug_id, + # "file_format": debug_object.file_format, # "elf", "macho", "pe", "sourcebundle" + } + except Exception as e: + raise # TODO stabalize what we do later + capture_or_log_exception(e) + return {} + + +def extract_source_context(src_bytes, filename, center_line, context=5): + + # TODO the usual worries about zip bombs/memory usage apply here. + with zipfile.ZipFile(io.BytesIO(src_bytes)) as zf: + # sourcebundle entries use relative paths like "src/main.c" or so says ChatGPT + candidates = [n for n in zf.namelist() if n.endswith(filename)] + + if not candidates: + return [], None, [] + + with zf.open(candidates[0]) as f: + lines = f.read().decode("utf-8").splitlines() + + # Clamp line range to valid indices + start = max(center_line - context - 1, 0) + end = min(center_line + context, len(lines)) + + pre_context = lines[start:center_line - 1] + context_line = lines[center_line - 1] if 0 <= center_line - 1 < len(lines) else None + post_context = lines[center_line:end] + + return pre_context, context_line, post_context + + +def _find_module_for_address(process_state, abs_addr: int): + for m in process_state.modules(): + if m.addr and m.size and m.addr <= abs_addr < (m.addr + m.size): + return m + return None + + +def event_threads_for_process_state(process_state): + threads = [] + for thread in process_state.threads(): + thread_frames = [] + + for frame in thread.frames(): + module = _find_module_for_address(process_state, frame.instruction) + fn = file = None + line = 0 + + if module and module.debug_id: + dashed_debug_id = symbolic.debuginfo.id_from_breakpad(module.debug_id) + + file_metadata = FileMetadata.objects.filter(debug_id=dashed_debug_id, file_type="dbg").first() + if file_metadata: + dif_bytes = file_metadata.file.data + + archive = symbolic.debuginfo.Archive.from_bytes(dif_bytes) + objects = list(archive.iter_objects()) + assert len(objects) == 1 + obj = objects[0] + + symcache = obj.make_symcache() + + rel = frame.instruction - module.addr + infos = symcache.lookup(rel) or symcache.lookup(rel - 1) # "or -1" from ChatGPT... should we do it? 
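A self-contained usage example of extract_source_context as defined above, with an in-memory zip standing in for a source bundle (using the "src/..."-style relative paths the comment mentions):

import io
import zipfile

from files.minidump import extract_source_context

buf = io.BytesIO()
with zipfile.ZipFile(buf, "w") as zf:
    zf.writestr("src/main.c", "a\nb\nc\nd\ne\n")

pre, line, post = extract_source_context(buf.getvalue(), "main.c", center_line=3, context=1)
assert (pre, line, post) == (["b"], "c", ["d"])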
+ if infos: + li = infos[0] + fn = li.function_name + file = li.filename + line = li.line + + # if we have line info, try source bundle + src_meta = FileMetadata.objects.filter(debug_id=dashed_debug_id, file_type="src").first() + if src_meta and file and line: + src_bytes = src_meta.file.data + pre_ctx, ctx_line, post_ctx = extract_source_context(src_bytes, file, line) + + thread_frames.append({ + "instruction_addr": f"0x{frame.instruction:x}", + "function": fn or "", + "filename": file, + "lineno": line, + "pre_context": pre_ctx, + "context_line": ctx_line, + "post_context": post_ctx, + }) + + threads.append({ + "id": thread.thread_id, + "crashed": (thread.thread_id == process_state.requesting_thread), + "stacktrace": {"frames": thread_frames}, + }) + + return threads diff --git a/files/views.py b/files/views.py index 96922b8..31ee4f2 100644 --- a/files/views.py +++ b/files/views.py @@ -18,6 +18,7 @@ from bsmain.models import AuthToken from .models import Chunk, File, FileMetadata from .tasks import assemble_artifact_bundle, assemble_file +from .minidump import extract_dif_metadata logger = logging.getLogger("bugsink.api") @@ -256,9 +257,12 @@ def difs_assemble(request, organization_slug, project_slug): continue file, _ = assemble_file(file_checksum, file_chunks, filename=file_info["name"]) + + symbolic_metadata = extract_dif_metadata(file.data) + FileMetadata.objects.get_or_create( - debug_id=file_info.get("debug_id"), - file_type="dif", # I think? check! + debug_id=file_info.get("debug_id"), # TODO : .get implies "no debug_id", but in that case it's useless + file_type=symbolic_metadata["kind"], # NOTE: symbolic's kind goes into file_type... defaults={ "file": file, "data": "{}", # this is the "catch all" field but I don't think we have anything in this case. diff --git a/sentry/minidump.py b/sentry/minidump.py index 8a126c3..a798505 100644 --- a/sentry/minidump.py +++ b/sentry/minidump.py @@ -2,27 +2,28 @@ # https://github.com/getsentry/sentry/blob/f0ac91f2ec6b45ad18e5eea6df72c5c72573e964/src/sentry/models/minidump.py#L53 # with (as it stands) minor modifications. -import logging -from symbolic import ProcessState +import symbolic +from files.minidump import build_cfi_map_from_minidump_bytes, event_threads_for_process_state def merge_minidump_event(data, minidump_bytes): - state = ProcessState.from_minidump_buffer(minidump_bytes) + frame_info_map = build_cfi_map_from_minidump_bytes(minidump_bytes) + process_state = symbolic.ProcessState.from_minidump_buffer(minidump_bytes, frame_infos=frame_info_map) - data['level'] = 'fatal' if state.crashed else 'info' + data['level'] = 'fatal' if process_state.crashed else 'info' - exception_value = 'Assertion Error: %s' % state.assertion if state.assertion \ - else 'Fatal Error: %s' % state.crash_reason + exception_value = 'Assertion Error: %s' % process_state.assertion if process_state.assertion \ + else 'Fatal Error: %s' % process_state.crash_reason # NO_BANANA: data['message'] is not the right target # data['message'] = exception_value - if state.timestamp: - data['timestamp'] = float(state.timestamp) + if process_state.timestamp: + data['timestamp'] = float(process_state.timestamp) # Extract as much system information as we can. TODO: We should create # a custom context and implement a specific minidump view in the event # UI. 
- info = state.system_info + info = process_state.system_info context = data.setdefault('contexts', {}) os = context.setdefault('os', {}) device = context.setdefault('device', {}) @@ -30,46 +31,42 @@ def merge_minidump_event(data, minidump_bytes): os['version'] = info.os_version device['arch'] = info.cpu_family - # We can extract stack traces here already but since CFI is not - # available yet (without debug symbols), the stackwalker will - # resort to stack scanning which yields low-quality results. If - # the user provides us with debug symbols, we will reprocess this - # minidump and add improved stacktraces later. - threads = [{ - 'id': thread.thread_id, - 'crashed': False, - 'stacktrace': { - 'frames': [{ - 'instruction_addr': '0x%x' % frame.instruction, - 'function': '', # Required by interface - } for frame in thread.frames()], - }, - } for thread in state.threads()] - data.setdefault('threads', {})['values'] = threads + threads = event_threads_for_process_state(process_state) + data.setdefault("threads", {})["values"] = threads # Mark the crashed thread and add its stacktrace to the exception - crashed_thread = threads[state.requesting_thread] + crashed_thread = threads[process_state.requesting_thread] crashed_thread['crashed'] = True # Extract the crash reason and infos exception = { 'value': exception_value, 'thread_id': crashed_thread['id'], - 'type': state.crash_reason, + 'type': process_state.crash_reason, # Move stacktrace here from crashed_thread (mutating!) 'stacktrace': crashed_thread.pop('stacktrace'), } + for frame in exception['stacktrace']['frames']: + frame['in_app'] = True # minidumps don't distinguish in_app frames; assume all are in_app + + exception['stacktrace']['frames'].reverse() # "Frames should be sorted from oldest to newest." + # TODO we don't have display-info for threads yet, I think? + # we may need to revert the per-thread stacktraces above as well then + data.setdefault('exception', {}) \ .setdefault('values', []) \ .append(exception) # Extract referenced (not all loaded) images images = [{ - 'type': 'apple', # Required by interface - # 'uuid': module.uuid, NO_BANANA + 'type': 'elf', # TODO not sure what this should _actually_ be 'image_addr': module.addr, 'image_size': module.size, - # 'name': module.name, NO_BANANA - } for module in state.modules()] + 'code_file': module.code_file, + 'code_id': module.code_id, + 'debug_file': module.debug_file, + 'debug_id': symbolic.debuginfo.id_from_breakpad(module.debug_id) if module.debug_id else None, + } for module in process_state.modules()] + data.setdefault('debug_meta', {})['images'] = images From 5757b4f9b52f43f177f69022a308ed0017522e7d Mon Sep 17 00:00:00 2001 From: Klaas van Schelven Date: Wed, 12 Nov 2025 21:03:03 +0100 Subject: [PATCH 24/31] Typo in comment --- issues/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/issues/utils.py b/issues/utils.py index 58a210a..e2348fc 100644 --- a/issues/utils.py +++ b/issues/utils.py @@ -120,7 +120,7 @@ def get_exception_type_and_value_for_exception(data): # From the sentry docs: # > An optional flag indicating that this error is synthetic. Synthetic errors are errors that carry little # > meaning by themselves. - # If this flag is set, we ignored the Exception's type and used the function name instead (if available). + # If this flag is set, we ignore the Exception's type and used the function name instead (if available). 
if get_path(exception, "mechanism", "synthetic"): _, function = get_crash_location(data) if function: From 54ec6eaceb18c0b97d39a8e23fddda46e7141865 Mon Sep 17 00:00:00 2001 From: Klaas van Schelven Date: Wed, 12 Nov 2025 21:04:49 +0100 Subject: [PATCH 25/31] Populate exception['value'] mirrors how we show fetch it in `get_exception_type_and_value_for_exception` --- sentry/minidump.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/sentry/minidump.py b/sentry/minidump.py index a798505..abf40c4 100644 --- a/sentry/minidump.py +++ b/sentry/minidump.py @@ -12,11 +12,6 @@ def merge_minidump_event(data, minidump_bytes): data['level'] = 'fatal' if process_state.crashed else 'info' - exception_value = 'Assertion Error: %s' % process_state.assertion if process_state.assertion \ - else 'Fatal Error: %s' % process_state.crash_reason - # NO_BANANA: data['message'] is not the right target - # data['message'] = exception_value - if process_state.timestamp: data['timestamp'] = float(process_state.timestamp) @@ -38,13 +33,18 @@ def merge_minidump_event(data, minidump_bytes): crashed_thread = threads[process_state.requesting_thread] crashed_thread['crashed'] = True + exception_value = 'Assertion Error: %s' % process_state.assertion if process_state.assertion \ + else 'Fatal Error: %s' %process_state.crash_reason + # Extract the crash reason and infos exception = { 'value': exception_value, 'thread_id': crashed_thread['id'], 'type': process_state.crash_reason, + # Move stacktrace here from crashed_thread (mutating!) 'stacktrace': crashed_thread.pop('stacktrace'), + 'value': exception_value, } for frame in exception['stacktrace']['frames']: From eea5f032e205e9b1e5acd6cd8399b86175b93ae9 Mon Sep 17 00:00:00 2001 From: Klaas van Schelven Date: Wed, 12 Nov 2025 21:33:18 +0100 Subject: [PATCH 26/31] Clarified meaning of process_state.requesting_thread (the now-removed 'treat as pid' was hallunicated by the bot; the taken-from-sentry version missed the guard against -1) > The index of the thread that requested a dump be written in the > threads vector. [..] If the dump was not produced as a result of an exception > [..] 
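For orientation, a hand-written illustration of the event shape merge_minidump_event produces after these changes; all values are invented and frames are left out:

event = {
    "level": "fatal",
    "contexts": {
        "os": {"name": "Linux", "version": "5.15.0"},
        "device": {"arch": "x86_64"},
    },
    "threads": {"values": [
        {"id": 0, "crashed": True},  # its 'stacktrace' was popped into the exception below
        {"id": 1, "crashed": False, "stacktrace": {"frames": []}},
    ]},
    "exception": {"values": [{
        "type": "SIGABRT",
        "value": "Fatal Error: SIGABRT",
        "thread_id": 0,
        "stacktrace": {"frames": []},  # reversed, oldest to newest
    }]},
    "debug_meta": {"images": []},
}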
this field will be set to -1, --- files/minidump.py | 4 ++-- sentry/minidump.py | 42 +++++++++++++++++++----------------------- 2 files changed, 21 insertions(+), 25 deletions(-) diff --git a/files/minidump.py b/files/minidump.py index 2ac523c..9739454 100644 --- a/files/minidump.py +++ b/files/minidump.py @@ -88,7 +88,7 @@ def _find_module_for_address(process_state, abs_addr: int): def event_threads_for_process_state(process_state): threads = [] - for thread in process_state.threads(): + for thread_index, thread in enumerate(process_state.threads()): thread_frames = [] for frame in thread.frames(): @@ -136,7 +136,7 @@ def event_threads_for_process_state(process_state): threads.append({ "id": thread.thread_id, - "crashed": (thread.thread_id == process_state.requesting_thread), + "crashed": thread_index == process_state.requesting_thread, "stacktrace": {"frames": thread_frames}, }) diff --git a/sentry/minidump.py b/sentry/minidump.py index abf40c4..3322649 100644 --- a/sentry/minidump.py +++ b/sentry/minidump.py @@ -29,34 +29,30 @@ def merge_minidump_event(data, minidump_bytes): threads = event_threads_for_process_state(process_state) data.setdefault("threads", {})["values"] = threads - # Mark the crashed thread and add its stacktrace to the exception - crashed_thread = threads[process_state.requesting_thread] - crashed_thread['crashed'] = True + if process_state.requesting_thread > -1: + crashed_thread = threads[process_state.requesting_thread] - exception_value = 'Assertion Error: %s' % process_state.assertion if process_state.assertion \ - else 'Fatal Error: %s' %process_state.crash_reason + exception_value = 'Assertion Error: %s' % process_state.assertion if process_state.assertion \ + else 'Fatal Error: %s' % process_state.crash_reason - # Extract the crash reason and infos - exception = { - 'value': exception_value, - 'thread_id': crashed_thread['id'], - 'type': process_state.crash_reason, + exception = { + 'value': exception_value, + 'thread_id': crashed_thread['id'], + 'type': process_state.crash_reason, + 'stacktrace': crashed_thread.pop('stacktrace'), + 'value': exception_value, + } - # Move stacktrace here from crashed_thread (mutating!) - 'stacktrace': crashed_thread.pop('stacktrace'), - 'value': exception_value, - } + for frame in exception['stacktrace']['frames']: + frame['in_app'] = True # minidumps don't distinguish in_app frames; assume all are in_app - for frame in exception['stacktrace']['frames']: - frame['in_app'] = True # minidumps don't distinguish in_app frames; assume all are in_app + exception['stacktrace']['frames'].reverse() # "Frames should be sorted from oldest to newest." + # TODO we don't have display-info for threads yet, I think? + # we may need to revert the per-thread stacktraces above as well then - exception['stacktrace']['frames'].reverse() # "Frames should be sorted from oldest to newest." - # TODO we don't have display-info for threads yet, I think? 
- # we may need to revert the per-thread stacktraces above as well then - - data.setdefault('exception', {}) \ - .setdefault('values', []) \ - .append(exception) + data.setdefault('exception', {}) \ + .setdefault('values', []) \ + .append(exception) # Extract referenced (not all loaded) images images = [{ From 9f2a7c67375abb98c0181f17fcc74dbfe7909e66 Mon Sep 17 00:00:00 2001 From: Klaas van Schelven Date: Wed, 12 Nov 2025 22:05:42 +0100 Subject: [PATCH 27/31] de-chatgptize event_threads_for_process_state this code was created in a REPL/ChatGPT/minidump-API/HITL session, until I had something that "seemed to work". the present commit is the result of rereading, refactoring for understanding etc. it's not "pure refacting" in the sense that it's behavior-changing, but AFAICT for the better. e.g. "line 0" => just leave that out and many similar changes. --- files/minidump.py | 54 +++++++++++++++++++++-------------------------- 1 file changed, 24 insertions(+), 30 deletions(-) diff --git a/files/minidump.py b/files/minidump.py index 9739454..a252372 100644 --- a/files/minidump.py +++ b/files/minidump.py @@ -88,13 +88,13 @@ def _find_module_for_address(process_state, abs_addr: int): def event_threads_for_process_state(process_state): threads = [] - for thread_index, thread in enumerate(process_state.threads()): - thread_frames = [] + for thread_index, symbolic_thread in enumerate(process_state.threads()): + frames = [] - for frame in thread.frames(): - module = _find_module_for_address(process_state, frame.instruction) - fn = file = None - line = 0 + for symbolic_frame in symbolic_thread.frames(): + module = _find_module_for_address(process_state, symbolic_frame.instruction) + + frame = {"instruction_addr": f"0x{symbolic_frame.instruction:x}"} if module and module.debug_id: dashed_debug_id = symbolic.debuginfo.id_from_breakpad(module.debug_id) @@ -104,40 +104,34 @@ def event_threads_for_process_state(process_state): dif_bytes = file_metadata.file.data archive = symbolic.debuginfo.Archive.from_bytes(dif_bytes) - objects = list(archive.iter_objects()) - assert len(objects) == 1 - obj = objects[0] + + obj = get_single_object(archive) symcache = obj.make_symcache() - rel = frame.instruction - module.addr - infos = symcache.lookup(rel) or symcache.lookup(rel - 1) # "or -1" from ChatGPT... should we do it? + rel = symbolic_frame.instruction - module.addr + infos = symcache.lookup(rel) if infos: - li = infos[0] - fn = li.function_name - file = li.filename - line = li.line + # tentative understanding: lookup may give multiple results (e.g. inlined code). 
we just pick + # the first arbitrarily which is "good enough for a PoC until proven otherwise" + line_info = infos[0] + + frame["function"] = line_info.function_name + if line_info.filename: + frame["filename"] = line_info.filename + frame["lineno"] = line_info.line - # if we have line info, try source bundle src_meta = FileMetadata.objects.filter(debug_id=dashed_debug_id, file_type="src").first() - if src_meta and file and line: - src_bytes = src_meta.file.data - pre_ctx, ctx_line, post_ctx = extract_source_context(src_bytes, file, line) + if src_meta and line_info.filename and line_info.line: + frame["pre_context"], frame["context_line"], frame["post_context"] = extract_source_context( + src_meta.file.data, line_info.filename, line_info.line) - thread_frames.append({ - "instruction_addr": f"0x{frame.instruction:x}", - "function": fn or "", - "filename": file, - "lineno": line, - "pre_context": pre_ctx, - "context_line": ctx_line, - "post_context": post_ctx, - }) + frames.append(frame) threads.append({ - "id": thread.thread_id, + "id": symbolic_thread.thread_id, "crashed": thread_index == process_state.requesting_thread, - "stacktrace": {"frames": thread_frames}, + "stacktrace": {"frames": frames}, }) return threads From 9f6cd88ec67e73f15c13a55ccf12fb3cdce5f277 Mon Sep 17 00:00:00 2001 From: Klaas van Schelven Date: Thu, 13 Nov 2025 08:33:12 +0100 Subject: [PATCH 28/31] Remove unneeded layer of indirection in query. --- files/views.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/files/views.py b/files/views.py index 31ee4f2..1aeb321 100644 --- a/files/views.py +++ b/files/views.py @@ -212,8 +212,8 @@ def difs_assemble(request, organization_slug, project_slug): file_checksums = set(data.keys()) existing_files = { - f.file.checksum: f - for f in FileMetadata.objects.filter(file__checksum__in=file_checksums) + file.checksum: file + for file in File.objects.filter(checksum__in=file_checksums) } all_requested_chunks = { From 97c1e4c71c86661ff209794349e34ff43c4e4768 Mon Sep 17 00:00:00 2001 From: Klaas van Schelven Date: Thu, 13 Nov 2025 08:33:31 +0100 Subject: [PATCH 29/31] Comment about difs_assemble endpoint --- files/views.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/files/views.py b/files/views.py index 1aeb321..79e5049 100644 --- a/files/views.py +++ b/files/views.py @@ -233,7 +233,9 @@ def difs_assemble(request, organization_slug, project_slug): response[file_checksum] = { "state": ChunkFileState.OK, "missingChunks": [], - # "dif": serialize(existing_files[file_checksum]), # TODO: figure out if this is required. + # if it is ever needed, we could add something akin to the below, but so far we've not seen client-side + # actually using this; let's add it on-demand. 
+ # "dif": json_repr_with_key_info_about(existing_files[file_checksum]), } continue From 661d83bd939ff2c46e91df9328694f570390933a Mon Sep 17 00:00:00 2001 From: Klaas van Schelven Date: Sat, 15 Nov 2025 13:33:49 +0100 Subject: [PATCH 30/31] minidumps: FEATURE flag --- bugsink/app_settings.py | 3 +++ bugsink/settings/development.py | 3 +++ files/views.py | 3 +++ ingest/views.py | 9 ++++++++- 4 files changed, 17 insertions(+), 1 deletion(-) diff --git a/bugsink/app_settings.py b/bugsink/app_settings.py index 1d59347..5d2da85 100644 --- a/bugsink/app_settings.py +++ b/bugsink/app_settings.py @@ -73,6 +73,9 @@ DEFAULTS = { # Security: "MINIMIZE_INFORMATION_EXPOSURE": False, "PHONEHOME": True, + + # Feature flags: + "FEATURE_MINIDUMPS": False, # minidumps are experimental/early-stage and likely a DOS-magnet; disabled by default } diff --git a/bugsink/settings/development.py b/bugsink/settings/development.py index 48e86c4..a221949 100644 --- a/bugsink/settings/development.py +++ b/bugsink/settings/development.py @@ -109,6 +109,9 @@ BUGSINK = { "MAX_EMAILS_PER_MONTH": 10, # for development: a thing to tune if you want to the the quota system "KEEP_ARTIFACT_BUNDLES": True, # in development: useful to preserve sourcemap uploads + + # in development we want optional features enabled to [1] play with them and [2] have the tests work + "FEATURE_MINIDUMPS": True, } diff --git a/files/views.py b/files/views.py index 79e5049..1f32a6f 100644 --- a/files/views.py +++ b/files/views.py @@ -205,6 +205,9 @@ def artifact_bundle_assemble(request, organization_slug): @csrf_exempt # we're in API context here; this could potentially be pulled up to a higher level though @requires_auth_token def difs_assemble(request, organization_slug, project_slug): + if not get_settings().FEATURE_MINIDUMPS: + return JsonResponse({"detail": "minidumps not enabled"}, status=404) + # TODO move to tasks.something.delay # TODO think about the right transaction around this data = json.loads(request.body) diff --git a/ingest/views.py b/ingest/views.py index 3aebb60..f48f8bb 100644 --- a/ingest/views.py +++ b/ingest/views.py @@ -53,6 +53,7 @@ from .models import StoreEnvelope, DontStoreEnvelope, Envelope HTTP_429_TOO_MANY_REQUESTS = 429 HTTP_400_BAD_REQUEST = 400 +HTTP_404_NOT_FOUND = 404 HTTP_501_NOT_IMPLEMENTED = 501 @@ -638,7 +639,10 @@ class IngestEnvelopeAPIView(BaseIngestAPIView): if ((type_ not in ["event", "attachment"]) or (item_headers.get("type") == "attachment" and - item_headers.get("attachment_type") != "event.minidump")): + item_headers.get("attachment_type") != "event.minidump") or + (item_headers.get("type") == "attachment" and + item_headers.get("attachment_type") == "event.minidump" and + not get_settings().FEATURE_MINIDUMPS)): # non-event/minidumps can be discarded; (we don't check for individual size limits, because these differ # per item type, we have the envelope limit to protect us, and we incur almost no cost (NullWriter)) @@ -732,6 +736,9 @@ class MinidumpAPIView(BaseIngestAPIView): # I'm not 100% sure whether "philosophically" the minidump endpoint is also "ingesting"; we'll see. def post(self, request, project_pk=None): + if not get_settings().FEATURE_MINIDUMPS: + return JsonResponse({"detail": "minidumps not enabled"}, status=HTTP_404_NOT_FOUND) + # not reusing the CORS stuff here; minidump-from-browser doesn't make sense. 
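A hedged sketch of a client upload to this endpoint; the `requests` dependency is an assumption, the X-Sentry-Auth header mirrors what the envelope tests send, and `upload_file_minidump` is the form field this view reads:

import requests  # assumed client-side dependency, not part of this codebase

def upload_minidump(base_url, project_id, sentry_auth_header, dump_path):
    with open(dump_path, "rb") as f:
        return requests.post(
            f"{base_url}/api/{project_id}/minidump/",
            headers={"X-Sentry-Auth": sentry_auth_header},
            files={"upload_file_minidump": f},
        )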
# TODO: actually pick/configure max From 7df7bc7f4dfecfc97cf092d9a7774e6e817326fd Mon Sep 17 00:00:00 2001 From: Klaas van Schelven Date: Sat, 15 Nov 2025 13:38:09 +0100 Subject: [PATCH 31/31] Minidump feature flag: configurable in docker --- bugsink/conf_templates/docker.py.template | 3 +++ 1 file changed, 3 insertions(+) diff --git a/bugsink/conf_templates/docker.py.template b/bugsink/conf_templates/docker.py.template index d938b8f..9f32b8e 100644 --- a/bugsink/conf_templates/docker.py.template +++ b/bugsink/conf_templates/docker.py.template @@ -179,6 +179,9 @@ BUGSINK = { os.getenv("MINIMIZE_INFORMATION_EXPOSURE", "false").lower() in ("true", "1", "yes"), "PHONEHOME": os.getenv("PHONEHOME", "true").lower() in ("true", "1", "yes"), + + # Feature flags + "FEATURE_MINIDUMPS": os.getenv("PHONEHOME", "false").lower() in ("true", "1", "yes"), }