From 74ace016bc6eed37a55c587d5746d2adfe3804f5 Mon Sep 17 00:00:00 2001 From: Klaas van Schelven Date: Wed, 29 Oct 2025 17:25:49 +0100 Subject: [PATCH 01/31] api_catch_all: fix for request.POST debugging --- files/views.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/files/views.py b/files/views.py index c29b6df..881584c 100644 --- a/files/views.py +++ b/files/views.py @@ -231,10 +231,10 @@ def api_catch_all(request, subpath): if request.GET: lines.append(f" GET: {request.GET.dict()}") + body = request.body # note: must be above request.POST access to avoid "You cannot access body after reading ..." if request.POST: lines.append(f" POST: {request.POST.dict()}") - body = request.body if body: try: decoded = body.decode("utf-8", errors="replace").strip() From b09e6d02a12b524ac92502d8008b4fe51540bb0b Mon Sep 17 00:00:00 2001 From: Klaas van Schelven Date: Tue, 4 Nov 2025 09:39:50 +0100 Subject: [PATCH 02/31] Minidump handling utils: from BSD-Licensed Sentry (minor adaptations) --- requirements.txt | 1 + sentry/minidump.py | 84 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 85 insertions(+) create mode 100644 sentry/minidump.py diff --git a/requirements.txt b/requirements.txt index 33292b3..8fe767c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -18,3 +18,4 @@ verbose_csrf_middleware==1.0.* ecma426>=0.2.0 djangorestframework==3.16.* drf-spectacular[sidecar] +symbolic==8.7.2 # the last version to support ProcessState (i.e. minidump parsing) diff --git a/sentry/minidump.py b/sentry/minidump.py new file mode 100644 index 0000000..4f15379 --- /dev/null +++ b/sentry/minidump.py @@ -0,0 +1,84 @@ +# copied from: +# https://github.com/getsentry/sentry/blob/f0ac91f2ec6b45ad18e5eea6df72c5c72573e964/src/sentry/models/minidump.py#L53 +# with (as it stands) minor modifications. + +import logging +from symbolic import ProcessState + + +LOG_LEVELS = { + logging.NOTSET: "sample", + logging.DEBUG: "debug", + logging.INFO: "info", + logging.WARNING: "warning", + logging.ERROR: "error", + logging.FATAL: "fatal", +} + +LOG_LEVELS_MAP = {v: k for k, v in LOG_LEVELS.items()} + + +def merge_minidump_event(data, minidump_path): + state = ProcessState.from_minidump(minidump_path) + + data['level'] = LOG_LEVELS_MAP['fatal'] if state.crashed else LOG_LEVELS_MAP['info'] + data['message'] = 'Assertion Error: %s' % state.assertion if state.assertion \ + else 'Fatal Error: %s' % state.crash_reason + + if state.timestamp: + data['timestamp'] = float(state.timestamp) + + # Extract as much system information as we can. TODO: We should create + # a custom context and implement a specific minidump view in the event + # UI. + info = state.system_info + context = data.setdefault('contexts', {}) + os = context.setdefault('os', {}) + device = context.setdefault('device', {}) + os['name'] = info.os_name + os['version'] = info.os_version + device['arch'] = info.cpu_family + + # We can extract stack traces here already but since CFI is not + # available yet (without debug symbols), the stackwalker will + # resort to stack scanning which yields low-quality results. If + # the user provides us with debug symbols, we will reprocess this + # minidump and add improved stacktraces later. 
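    # A sketch (added for orientation, not in the upstream Sentry code) of the per-thread dict the
    # comprehension below produces; instruction addresses come out as hex strings and 'function'
    # stays empty until symbolication is available:
    #
    #     {'id': 0, 'crashed': False,
    #      'stacktrace': {'frames': [{'instruction_addr': '0x7f2b4c1a2b30', 'function': ''}]}}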
+ threads = [{ + 'id': thread.thread_id, + 'crashed': False, + 'stacktrace': { + 'frames': [{ + 'instruction_addr': '0x%x' % frame.instruction, + 'function': '', # Required by interface + } for frame in thread.frames()], + }, + } for thread in state.threads()] + data.setdefault('threads', {})['values'] = threads + + # Mark the crashed thread and add its stacktrace to the exception + crashed_thread = threads[state.requesting_thread] + crashed_thread['crashed'] = True + + # Extract the crash reason and infos + exception = { + 'value': data['message'], + 'thread_id': crashed_thread['id'], + 'type': state.crash_reason, + # Move stacktrace here from crashed_thread (mutating!) + 'stacktrace': crashed_thread.pop('stacktrace'), + } + + data.setdefault('exception', {}) \ + .setdefault('values', []) \ + .append(exception) + + # Extract referenced (not all loaded) images + images = [{ + 'type': 'apple', # Required by interface + # 'uuid': module.uuid, NO_BANANA + 'image_addr': module.addr, + 'image_size': module.size, + # 'name': module.name, NO_BANANA + } for module in state.modules()] + data.setdefault('debug_meta', {})['images'] = images From e7aad45db22d569b2202cbc70dd713ffb70104b2 Mon Sep 17 00:00:00 2001 From: Klaas van Schelven Date: Tue, 4 Nov 2025 10:47:04 +0100 Subject: [PATCH 03/31] Minidumps: PoC for minidump 'endpoint' See #82 --- ingest/urls.py | 5 ++- ingest/views.py | 54 +++++++++++++++++++++++++ issues/templates/issues/stacktrace.html | 4 +- sentry/minidump.py | 23 ++++------- 4 files changed, 67 insertions(+), 19 deletions(-) diff --git a/ingest/urls.py b/ingest/urls.py index e075f75..9d42b03 100644 --- a/ingest/urls.py +++ b/ingest/urls.py @@ -1,9 +1,12 @@ from django.urls import path -from .views import IngestEventAPIView, IngestEnvelopeAPIView +from .views import IngestEventAPIView, IngestEnvelopeAPIView, MinidumpAPIView urlpatterns = [ # project_pk has to be an int per Sentry Client expectations. path("/store/", IngestEventAPIView.as_view()), path("/envelope/", IngestEnvelopeAPIView.as_view()), + + # is this "ingest"? it is at least in the sense that it matches the API schema and downstream auth etc. + path("/minidump/", MinidumpAPIView.as_view()), ] diff --git a/ingest/views.py b/ingest/views.py index fdae6c5..b9d47c9 100644 --- a/ingest/views.py +++ b/ingest/views.py @@ -39,6 +39,7 @@ from alerts.tasks import send_new_issue_alert, send_regression_alert from compat.timestamp import format_timestamp, parse_timestamp from tags.models import digest_tags from bsmain.utils import b108_makedirs +from sentry.minidump import merge_minidump_event from .parsers import StreamingEnvelopeParser, ParseError from .filestore import get_filename_for_event_id @@ -680,6 +681,59 @@ class IngestEnvelopeAPIView(BaseIngestAPIView): # +class MinidumpAPIView(BaseIngestAPIView): + # A Base "Ingest" APIView in the sense that it reuses some key building blocks (auth). + # I'm not 100% sure whether "philosophically" the minidump endpoint is also "ingesting"; we'll see. + + @classmethod + def _ingest(cls, ingested_at, event_data, project, request): + # TSTTCPW: just ingest the invent as normally after we've done the minidump-parsing "immediately". We make + # ready for the expectations of process_event (DIGEST_IMMEDIATELY/event_output_stream) with an if-statement + + event_output_stream = MaxDataWriter("MAX_EVENT_SIZE", io.BytesIO()) + if get_settings().DIGEST_IMMEDIATELY: + # in this case the stream will be an BytesIO object, so we can actually call .get_value() on it. 
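            # (strictly speaking io.BytesIO spells this .getvalue(); that is what process_event
            # calls on the DIGEST_IMMEDIATELY path to pull the serialized event back out of the
            # writer.)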
+ event_output_stream.write(json.dumps(event_data).encode("utf-8")) + + else: + # no need to actually touch event_output_stream for this case, we just need to write a file + filename = get_filename_for_event_id(event_data["event_id"]) + b108_makedirs(os.path.dirname(filename)) + with open(filename, 'w') as f: + json.dump(event_data, f) + + cls.process_event(ingested_at, event_data["event_id"], event_output_stream, project, request) + + def post(self, request, project_pk=None): + # not reusing the CORS stuff here; minidump-from-browser doesn't make sense. + + ingested_at = datetime.now(timezone.utc) + project = self.get_project_for_request(project_pk, request) + + try: + # in this flow, we don't get an event_id from the client, so we just generate one here. + event_id = uuid.uuid4().hex + + minidump_bytes = request.FILES["upload_file_minidump"].read() + + data = { + "event_id": event_id, + "platform": "native", + "extra": {}, + "errors": [], + } + + merge_minidump_event(data, minidump_bytes) + + self._ingest(ingested_at, data, project, request) + + return JsonResponse({"id": event_id}) + + except Exception as e: + raise + return JsonResponse({"detail": str(e)}, status=HTTP_400_BAD_REQUEST) + + @user_passes_test(lambda u: u.is_superuser) def download_envelope(request, envelope_id=None): envelope = get_object_or_404(Envelope, pk=envelope_id) diff --git a/issues/templates/issues/stacktrace.html b/issues/templates/issues/stacktrace.html index 870735c..9eb0a62 100644 --- a/issues/templates/issues/stacktrace.html +++ b/issues/templates/issues/stacktrace.html @@ -66,9 +66,9 @@
{# filename, function, lineno #} {% if frame.in_app %} - {{ frame.filename }}{% if frame.function %} in {{ frame.function }}{% endif %}{% if frame.lineno %} line {{ frame.lineno }}{% endif %}. + {{ frame.filename }}{% if frame.function %} in {{ frame.function }}{% endif %}{% if frame.lineno %} line {{ frame.lineno }}{% endif %}{% if frame.instruction_addr %} {{ frame.instruction_addr }}{% endif %}. {% else %} - {{ frame.filename }}{% if frame.function %} in {{ frame.function }}{% endif %}{% if frame.lineno%} line {{ frame.lineno }}{% endif %}. + {{ frame.filename }}{% if frame.function %} in {{ frame.function }}{% endif %}{% if frame.lineno%} line {{ frame.lineno }}{% endif %}{% if frame.instruction_addr %} {{ frame.instruction_addr }}{% endif %}. {% endif %}
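
For reference, the new endpoint can be exercised with Django's test client roughly as follows (a
sketch, not part of this patch; it assumes a Project instance and the DSN-derived auth header used
in the existing ingest tests):

    from django.test import Client
    from compat.dsn import get_header_value

    client = Client()
    auth = get_header_value(f"http://{ project.sentry_key }@hostisignored/{ project.id }")
    with open("crash.dmp", "rb") as f:  # "crash.dmp" is a placeholder minidump file
        response = client.post(
            f"/api/{ project.id }/minidump/",
            data={"upload_file_minidump": f},   # multipart; ends up in request.FILES
            headers={"X-Sentry-Auth": auth},
        )
    # on success the view answers with {"id": "<freshly generated event_id>"}
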
diff --git a/sentry/minidump.py b/sentry/minidump.py index 4f15379..8a126c3 100644 --- a/sentry/minidump.py +++ b/sentry/minidump.py @@ -6,24 +6,15 @@ import logging from symbolic import ProcessState -LOG_LEVELS = { - logging.NOTSET: "sample", - logging.DEBUG: "debug", - logging.INFO: "info", - logging.WARNING: "warning", - logging.ERROR: "error", - logging.FATAL: "fatal", -} +def merge_minidump_event(data, minidump_bytes): + state = ProcessState.from_minidump_buffer(minidump_bytes) -LOG_LEVELS_MAP = {v: k for k, v in LOG_LEVELS.items()} + data['level'] = 'fatal' if state.crashed else 'info' - -def merge_minidump_event(data, minidump_path): - state = ProcessState.from_minidump(minidump_path) - - data['level'] = LOG_LEVELS_MAP['fatal'] if state.crashed else LOG_LEVELS_MAP['info'] - data['message'] = 'Assertion Error: %s' % state.assertion if state.assertion \ + exception_value = 'Assertion Error: %s' % state.assertion if state.assertion \ else 'Fatal Error: %s' % state.crash_reason + # NO_BANANA: data['message'] is not the right target + # data['message'] = exception_value if state.timestamp: data['timestamp'] = float(state.timestamp) @@ -62,7 +53,7 @@ def merge_minidump_event(data, minidump_path): # Extract the crash reason and infos exception = { - 'value': data['message'], + 'value': exception_value, 'thread_id': crashed_thread['id'], 'type': state.crash_reason, # Move stacktrace here from crashed_thread (mutating!) From 391e22bcf0d3fefa69343a1ef1b4350571f93292 Mon Sep 17 00:00:00 2001 From: Klaas van Schelven Date: Tue, 4 Nov 2025 13:55:41 +0100 Subject: [PATCH 04/31] parser: event_output_stream closing pushed in in preparation of the minidump handling through the envelope path, which probably requires dealing with the whole envelope as a whole. "theoretically" this might be less efficient, but [a] see the notes at the top of the parser on how we think about streaming parsing and [b] the inefficiencies are in the "immediate" path anyway (which has the assumtion that it's not in high-performance envs). "blind commit" (tests not run against this commit). --- bugsink/streams.py | 9 +++++++++ ingest/parsers.py | 23 +++++++++++++---------- ingest/views.py | 36 ++++++++++++++---------------------- 3 files changed, 36 insertions(+), 32 deletions(-) diff --git a/bugsink/streams.py b/bugsink/streams.py index 295fa56..e3c6120 100644 --- a/bugsink/streams.py +++ b/bugsink/streams.py @@ -170,3 +170,12 @@ class NullWriter: def close(self): pass + + +class UnclosableBytesIO(io.BytesIO): + """Intentionally does nothing on-close: BytesIO normally discards its buffer on .close(), breaking .getvalue(); this + overrides it so that we can use it in code that usually deals with real files (and calls .close()) while still using + the in-memory data afterwards. 
We just rely on the garbage collector for the actual cleanup.""" + + def close(self): + pass diff --git a/ingest/parsers.py b/ingest/parsers.py index 3cd419d..bdc1401 100644 --- a/ingest/parsers.py +++ b/ingest/parsers.py @@ -155,7 +155,6 @@ class StreamingEnvelopeParser: def get_items(self, output_stream_factory): # yields the item_headers and item_output_streams (with the content of the items written into them) - # closing the item_output_stream is the responsibility of the calller self.get_envelope_headers() @@ -175,17 +174,21 @@ class StreamingEnvelopeParser: finder = NewlineFinder() item_output_stream = output_stream_factory(item_headers) - self.remainder, self.at_eof = readuntil( - self.input_stream, self.remainder, finder, item_output_stream, self.chunk_size) - if "length" in item_headers: - # items with an explicit length are terminated by a newline (if at EOF, this is optional as per the set - # of examples in the docs) - should_be_empty = io.BytesIO() + try: self.remainder, self.at_eof = readuntil( - self.input_stream, self.remainder, NewlineFinder(), should_be_empty, self.chunk_size) - if should_be_empty.getvalue() != b"": - raise ParseError("Item with explicit length not terminated by newline/EOF") + self.input_stream, self.remainder, finder, item_output_stream, self.chunk_size) + + if "length" in item_headers: + # items with an explicit length are terminated by a newline (if at EOF, this is optional as per the + # set of examples in the docs) + should_be_empty = io.BytesIO() + self.remainder, self.at_eof = readuntil( + self.input_stream, self.remainder, NewlineFinder(), should_be_empty, self.chunk_size) + if should_be_empty.getvalue() != b"": + raise ParseError("Item with explicit length not terminated by newline/EOF") + finally: + item_output_stream.close() yield item_headers, item_output_stream diff --git a/ingest/views.py b/ingest/views.py index b9d47c9..93b1a49 100644 --- a/ingest/views.py +++ b/ingest/views.py @@ -2,7 +2,6 @@ import uuid import hashlib import os import logging -import io from datetime import datetime, timezone import json import jsonschema @@ -29,7 +28,8 @@ from issues.regressions import issue_is_regression from bugsink.transaction import immediate_atomic, delay_on_commit from bugsink.exceptions import ViolatedExpectation -from bugsink.streams import content_encoding_reader, MaxDataReader, MaxDataWriter, NullWriter, MaxLengthExceeded +from bugsink.streams import ( + content_encoding_reader, MaxDataReader, MaxDataWriter, NullWriter, MaxLengthExceeded, UnclosableBytesIO) from bugsink.app_settings import get_settings from events.models import Event @@ -162,10 +162,6 @@ class BaseIngestAPIView(View): performance_logger.info("ingested event with %s bytes", len(event_data_bytes)) cls.digest_event(event_metadata, event_data) else: - # In this case the stream will be a file that has been written the event's content to it. 
- # To ensure that the (possibly EAGER) handling of the digest has the file available, we flush it here: - event_data_stream.flush() - performance_logger.info("ingested event with %s bytes", event_data_stream.bytes_written) digest.delay(event_id, event_metadata) @@ -621,7 +617,7 @@ class IngestEnvelopeAPIView(BaseIngestAPIView): def factory(item_headers): if item_headers.get("type") == "event": if get_settings().DIGEST_IMMEDIATELY: - return MaxDataWriter("MAX_EVENT_SIZE", io.BytesIO()) + return MaxDataWriter("MAX_EVENT_SIZE", UnclosableBytesIO()) # envelope_headers["event_id"] is required when type=event per the spec (and takes precedence over the # payload's event_id), so we can rely on it having been set. @@ -643,23 +639,19 @@ class IngestEnvelopeAPIView(BaseIngestAPIView): return NullWriter() for item_headers, event_output_stream in parser.get_items(factory): - try: - if item_headers.get("type") != "event": - logger.info("skipping non-event item: %s", item_headers.get("type")) + if item_headers.get("type") != "event": + logger.info("skipping non-event item: %s", item_headers.get("type")) - if item_headers.get("type") == "transaction": - # From the spec of type=event: This Item is mutually exclusive with `"transaction"` Items. - # i.e. when we see a transaction, a regular event will not be present and we can stop. - logger.info("discarding the rest of the envelope") - break + if item_headers.get("type") == "transaction": + # From the spec of type=event: This Item is mutually exclusive with `"transaction"` Items. + # i.e. when we see a transaction, a regular event will not be present and we can stop. + logger.info("discarding the rest of the envelope") + break - continue + continue - self.process_event(ingested_at, envelope_headers["event_id"], event_output_stream, project, request) - break # From the spec of type=event: This Item may occur at most once per Envelope. once seen: done - - finally: - event_output_stream.close() + self.process_event(ingested_at, envelope_headers["event_id"], event_output_stream, project, request) + break # From the spec of type=event: This Item may occur at most once per Envelope. once seen: done return HttpResponse() @@ -690,7 +682,7 @@ class MinidumpAPIView(BaseIngestAPIView): # TSTTCPW: just ingest the invent as normally after we've done the minidump-parsing "immediately". We make # ready for the expectations of process_event (DIGEST_IMMEDIATELY/event_output_stream) with an if-statement - event_output_stream = MaxDataWriter("MAX_EVENT_SIZE", io.BytesIO()) + event_output_stream = MaxDataWriter("MAX_EVENT_SIZE", UnclosableBytesIO()) if get_settings().DIGEST_IMMEDIATELY: # in this case the stream will be an BytesIO object, so we can actually call .get_value() on it. event_output_stream.write(json.dumps(event_data).encode("utf-8")) From de9a37aab625de4aa160701f6b19a312bc1d6f69 Mon Sep 17 00:00:00 2001 From: Klaas van Schelven Date: Wed, 5 Nov 2025 08:39:40 +0100 Subject: [PATCH 05/31] Fix tests for UnclosableBytesIO i.e. 
fix tests for 391e22bcf0d3 (the changes in the present commit in `ingest/tests.py` are not strictly necessary but they are principly right) --- ingest/parsers.py | 4 ++-- ingest/tests.py | 15 ++++++++------- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/ingest/parsers.py b/ingest/parsers.py index bdc1401..3f4b1ec 100644 --- a/ingest/parsers.py +++ b/ingest/parsers.py @@ -1,7 +1,7 @@ import json import io -from bugsink.streams import MaxDataWriter +from bugsink.streams import MaxDataWriter, UnclosableBytesIO from .exceptions import ParseError from .header_validators import filter_valid_envelope_headers, filter_valid_item_headers @@ -195,5 +195,5 @@ class StreamingEnvelopeParser: def get_items_directly(self): # this method is just convenience for testing - for item_headers, output_stream in self.get_items(lambda item_headers: io.BytesIO()): + for item_headers, output_stream in self.get_items(lambda item_headers: UnclosableBytesIO()): yield item_headers, output_stream.getvalue() diff --git a/ingest/tests.py b/ingest/tests.py index 3e0d634..2f81df5 100644 --- a/ingest/tests.py +++ b/ingest/tests.py @@ -25,6 +25,7 @@ from issues.factories import get_or_create_issue from issues.models import IssueStateManager, Issue, TurningPoint, TurningPointKind from issues.utils import get_values from bugsink.app_settings import override_settings +from bugsink.streams import UnclosableBytesIO from compat.timestamp import format_timestamp from compat.dsn import get_header_value from bsmain.management.commands.send_json import Command as SendJsonCommand @@ -631,7 +632,7 @@ class TestParser(RegularTestCase): initial_chunk = b"line 0\nline 1\n" input_stream.seek(0) - output_stream = io.BytesIO() + output_stream = UnclosableBytesIO() remainder, at_eof = readuntil(input_stream, initial_chunk, NewlineFinder(), output_stream, 3) self.assertFalse(at_eof) @@ -644,7 +645,7 @@ class TestParser(RegularTestCase): initial_chunk = b"lin" input_stream.seek(0) - output_stream = io.BytesIO() + output_stream = UnclosableBytesIO() remainder, at_eof = readuntil(input_stream, initial_chunk, NewlineFinder(), output_stream, 3) self.assertFalse(at_eof) @@ -657,7 +658,7 @@ class TestParser(RegularTestCase): initial_chunk = b"" input_stream.seek(0) - output_stream = io.BytesIO() + output_stream = UnclosableBytesIO() remainder, at_eof = readuntil(input_stream, initial_chunk, NewlineFinder(), output_stream, 3) self.assertFalse(at_eof) @@ -670,7 +671,7 @@ class TestParser(RegularTestCase): initial_chunk = b"" input_stream.seek(0) - output_stream = io.BytesIO() + output_stream = UnclosableBytesIO() remainder, at_eof = readuntil(input_stream, initial_chunk, NewlineFinder(), output_stream, 3) self.assertTrue(at_eof) @@ -683,7 +684,7 @@ class TestParser(RegularTestCase): initial_chunk = b"lin" input_stream.seek(0) - output_stream = io.BytesIO() + output_stream = UnclosableBytesIO() remainder, at_eof = readuntil(input_stream, initial_chunk, NewlineFinder(), output_stream, 1024) self.assertFalse(at_eof) @@ -696,7 +697,7 @@ class TestParser(RegularTestCase): initial_chunk = b"lin" input_stream.seek(0) - output_stream = io.BytesIO() + output_stream = UnclosableBytesIO() remainder, at_eof = readuntil(input_stream, initial_chunk, LengthFinder(10, "eof not ok"), output_stream, 3) self.assertFalse(at_eof) @@ -709,7 +710,7 @@ class TestParser(RegularTestCase): initial_chunk = b"lin" input_stream.seek(0) - output_stream = io.BytesIO() + output_stream = UnclosableBytesIO() with self.assertRaises(ParseError): remainder, at_eof = 
readuntil(input_stream, initial_chunk, LengthFinder(100, "EOF"), output_stream, 1000) From 7f831f52d4ca97ea01cd2e22f55a2596d46c675b Mon Sep 17 00:00:00 2001 From: Klaas van Schelven Date: Wed, 5 Nov 2025 09:03:17 +0100 Subject: [PATCH 06/31] Remove DIGEST_IMMEDIATELY option Although DIGEST_IMMEDIATELY=True is theoretically a nice thing to have, the upkeep is not worth it now that we're about to introduce minidump ingestion. The only thing that you're saving is the round-trip via the filesystem, but performance of that is negligable, and if you're configuring DIGEST_IMMEDIATELY you're actually _not_ in the performance-critical path anyway. Getting rid of it _also_ harmonizes/reduces the number of paths to test. It's approximately 1% of our installed base. --- bugsink/app_settings.py | 1 - bugsink/conf_templates/docker.py.template | 4 +- bugsink/conf_templates/local.py.template | 5 -- .../conf_templates/singleserver.py.template | 4 -- bugsink/settings/development.py | 2 - ingest/tests.py | 5 -- ingest/views.py | 46 ++++++------------- phonehome/tasks.py | 1 - 8 files changed, 16 insertions(+), 52 deletions(-) diff --git a/bugsink/app_settings.py b/bugsink/app_settings.py index 5bd1ce4..3ba51aa 100644 --- a/bugsink/app_settings.py +++ b/bugsink/app_settings.py @@ -42,7 +42,6 @@ DEFAULTS = { "TEAM_CREATION": CB_MEMBERS, # who can create new teams. default: members, which means "any member of the site" # System inner workings: - "DIGEST_IMMEDIATELY": True, "VALIDATE_ON_DIGEST": "none", # other legal values are "warn" and "strict" "KEEP_ENVELOPES": 0, # set to a number to store that many; 0 means "store none". This is for debugging. "API_LOG_UNIMPLEMENTED_CALLS": False, # if True, log unimplemented API calls; see #153 diff --git a/bugsink/conf_templates/docker.py.template b/bugsink/conf_templates/docker.py.template index f2b044b..71845c8 100644 --- a/bugsink/conf_templates/docker.py.template +++ b/bugsink/conf_templates/docker.py.template @@ -52,7 +52,7 @@ eat_your_own_dogfood(SENTRY_DSN) # Our Docker image is hard-coded to run with snappea in the background; this means we hard-code (as opposed to reading -# the from the env) certain variables: TASK_ALWAYS_EAGER, WORKAHOLIC and DIGEST_IMMEDIATELY. +# the from the env) certain variables: TASK_ALWAYS_EAGER and WORKAHOLIC SNAPPEA = { "TASK_ALWAYS_EAGER": False, # hard-coded, corresponds to Docker setup "WORKAHOLIC": True, # hard-coded, corresponds to Docker setup @@ -131,8 +131,6 @@ CB_NOBODY = "CB_NOBODY" BUGSINK = { - "DIGEST_IMMEDIATELY": False, # hard-coded, corresponds to Docker setup - # The URL where the Bugsink instance is hosted. This is used in the email notifications and to construct DSNs. "BASE_URL": os.getenv("BASE_URL", f"http://localhost:{_PORT}"), # no trailing slash diff --git a/bugsink/conf_templates/local.py.template b/bugsink/conf_templates/local.py.template index 63edee7..d645dfa 100644 --- a/bugsink/conf_templates/local.py.template +++ b/bugsink/conf_templates/local.py.template @@ -68,11 +68,6 @@ BUGSINK = { # you can customize this as e.g. "My Bugsink" or "Bugsink for My Company" # "SITE_TITLE": "Bugsink", - # When running locally, it is recommended to configure the Bugsink to digest events immediately. (This is basically - # implied by the "TASK_ALWAYS_EAGER" setting above, but setting DIGEST_IMMEDIATELY to True removes one more step - # from the process.) - "DIGEST_IMMEDIATELY": True, - # You are licenced to run Bugsink locally in single-user mode. 
By changing the settings below, you may open the door # to more uses; make sure to buy a licence if you do. "SINGLE_USER": True, diff --git a/bugsink/conf_templates/singleserver.py.template b/bugsink/conf_templates/singleserver.py.template index 854a482..20d735d 100644 --- a/bugsink/conf_templates/singleserver.py.template +++ b/bugsink/conf_templates/singleserver.py.template @@ -98,10 +98,6 @@ BUGSINK = { "SINGLE_TEAM": False, "TEAM_CREATION": CB_MEMBERS, # who can create new teams. default: members, which means "any member of the site" - # In the singleserver production setup, we do not digest events immediately, but instead offload this to Snappea. - # This ensures a more response and reliable server when there are peak loads in the events. - "DIGEST_IMMEDIATELY": False, - # "MAX_EVENT_SIZE": _MEBIBYTE, # "MAX_EVENT_COMPRESSED_SIZE": 200 * _KIBIBYTE, # "MAX_ENVELOPE_SIZE": 100 * _MEBIBYTE, diff --git a/bugsink/settings/development.py b/bugsink/settings/development.py index 4e1a0ca..45b8928 100644 --- a/bugsink/settings/development.py +++ b/bugsink/settings/development.py @@ -83,8 +83,6 @@ SERVER_EMAIL = DEFAULT_FROM_EMAIL = 'Klaas van Schelven ' BUGSINK = { - "DIGEST_IMMEDIATELY": False, - # "MAX_EVENT_SIZE": _MEBIBYTE, # "MAX_EVENT_COMPRESSED_SIZE": 200 * _KIBIBYTE, # "MAX_ENVELOPE_SIZE": 100 * _MEBIBYTE, diff --git a/ingest/tests.py b/ingest/tests.py index 2f81df5..986043d 100644 --- a/ingest/tests.py +++ b/ingest/tests.py @@ -394,11 +394,6 @@ class IngestViewTestCase(TransactionTestCase): with self.assertRaises(ViolatedExpectation): check() - @tag("samples") - def test_envelope_endpoint_digest_non_immediate(self): - with override_settings(DIGEST_IMMEDIATELY=False): - self.test_envelope_endpoint() - @tag("samples") def test_filestore(self): # quick & dirty way to test the filestore; in absence of a proper test for it, we just run a more-or-less diff --git a/ingest/views.py b/ingest/views.py index 93b1a49..f1b77b3 100644 --- a/ingest/views.py +++ b/ingest/views.py @@ -28,8 +28,7 @@ from issues.regressions import issue_is_regression from bugsink.transaction import immediate_atomic, delay_on_commit from bugsink.exceptions import ViolatedExpectation -from bugsink.streams import ( - content_encoding_reader, MaxDataReader, MaxDataWriter, NullWriter, MaxLengthExceeded, UnclosableBytesIO) +from bugsink.streams import content_encoding_reader, MaxDataReader, MaxDataWriter, NullWriter, MaxLengthExceeded from bugsink.app_settings import get_settings from events.models import Event @@ -154,16 +153,8 @@ class BaseIngestAPIView(View): @classmethod def process_event(cls, ingested_at, event_id, event_data_stream, project, request): event_metadata = cls.get_event_meta(event_id, ingested_at, request, project) - - if get_settings().DIGEST_IMMEDIATELY: - # in this case the stream will be an BytesIO object, so we can actually call .get_value() on it. 
- event_data_bytes = event_data_stream.getvalue() - event_data = json.loads(event_data_bytes.decode("utf-8")) - performance_logger.info("ingested event with %s bytes", len(event_data_bytes)) - cls.digest_event(event_metadata, event_data) - else: - performance_logger.info("ingested event with %s bytes", event_data_stream.bytes_written) - digest.delay(event_id, event_metadata) + performance_logger.info("ingested event with %s bytes", event_data_stream.bytes_written) + digest.delay(event_id, event_metadata) @classmethod def get_event_meta(cls, event_id, ingested_at, request, project): @@ -616,9 +607,6 @@ class IngestEnvelopeAPIView(BaseIngestAPIView): def factory(item_headers): if item_headers.get("type") == "event": - if get_settings().DIGEST_IMMEDIATELY: - return MaxDataWriter("MAX_EVENT_SIZE", UnclosableBytesIO()) - # envelope_headers["event_id"] is required when type=event per the spec (and takes precedence over the # payload's event_id), so we can rely on it having been set. if "event_id" not in envelope_headers: @@ -650,7 +638,10 @@ class IngestEnvelopeAPIView(BaseIngestAPIView): continue - self.process_event(ingested_at, envelope_headers["event_id"], event_output_stream, project, request) + performance_logger.info("ingested event with %s bytes", event_output_stream.bytes_written) + event_metadata = self.get_event_meta(envelope_headers["event_id"], ingested_at, request, project) + digest.delay(envelope_headers["event_id"], event_metadata) + break # From the spec of type=event: This Item may occur at most once per Envelope. once seen: done return HttpResponse() @@ -679,22 +670,15 @@ class MinidumpAPIView(BaseIngestAPIView): @classmethod def _ingest(cls, ingested_at, event_data, project, request): - # TSTTCPW: just ingest the invent as normally after we've done the minidump-parsing "immediately". We make - # ready for the expectations of process_event (DIGEST_IMMEDIATELY/event_output_stream) with an if-statement + # TSTTCPW: convert the minidump data to an event and then proceed as usual. + filename = get_filename_for_event_id(event_data["event_id"]) + b108_makedirs(os.path.dirname(filename)) + with open(filename, 'w') as f: + json.dump(event_data, f) - event_output_stream = MaxDataWriter("MAX_EVENT_SIZE", UnclosableBytesIO()) - if get_settings().DIGEST_IMMEDIATELY: - # in this case the stream will be an BytesIO object, so we can actually call .get_value() on it. - event_output_stream.write(json.dumps(event_data).encode("utf-8")) - - else: - # no need to actually touch event_output_stream for this case, we just need to write a file - filename = get_filename_for_event_id(event_data["event_id"]) - b108_makedirs(os.path.dirname(filename)) - with open(filename, 'w') as f: - json.dump(event_data, f) - - cls.process_event(ingested_at, event_data["event_id"], event_output_stream, project, request) + # performance_logger.info("ingested event with %s bytes", event_output_stream.bytes_written) TODO for minidump + event_metadata = cls.get_event_meta(event_data["event_id"], ingested_at, request, project) + digest.delay(event_data["event_id"], event_metadata) def post(self, request, project_pk=None): # not reusing the CORS stuff here; minidump-from-browser doesn't make sense. 
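
The flow that now always applies (an illustrative sketch; the pieces shown are the ones used
elsewhere in this series, with the digest side living in ingest/tasks.py):

    # ingest side: the event item is streamed straight to a file ...
    filename = get_filename_for_event_id(event_id)
    # ... and only the id plus metadata cross over to the task:
    digest.delay(event_id, event_metadata)

    # digest side: read the file back and hand it off to digest_event
    with open(get_filename_for_event_id(event_id), "rb") as f:
        event_data = json.loads(f.read().decode("utf-8"))

With snappea's TASK_ALWAYS_EAGER set (as in the local.py template), digest.delay presumably still
runs the task in-process, so local setups keep effectively-immediate digestion without the removed
setting.
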
diff --git a/phonehome/tasks.py b/phonehome/tasks.py index 17612f8..d14ee04 100644 --- a/phonehome/tasks.py +++ b/phonehome/tasks.py @@ -92,7 +92,6 @@ def _make_message_body(): "SINGLE_TEAM": get_settings().SINGLE_TEAM, "EMAIL_BACKEND": settings.EMAIL_BACKEND, "TASK_ALWAYS_EAGER": get_snappea_settings().TASK_ALWAYS_EAGER, - "DIGEST_IMMEDIATELY": get_settings().DIGEST_IMMEDIATELY, "IS_DOCKER": settings.IS_DOCKER, "DATABASE_ENGINE": settings.DATABASES["default"]["ENGINE"], }, From 31596a9b44f9331b2279b4d628e1f56b636984d7 Mon Sep 17 00:00:00 2001 From: Klaas van Schelven Date: Wed, 5 Nov 2025 09:20:35 +0100 Subject: [PATCH 07/31] /store/ endpoint: non-immediate digestion this makes this consistent with the work we did in the previous commit at the price of being slightly more inefficient. but it's a deprecated endpoint anyway --- ingest/views.py | 31 ++++++++++++++++--------------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/ingest/views.py b/ingest/views.py index f1b77b3..c28fb1b 100644 --- a/ingest/views.py +++ b/ingest/views.py @@ -503,30 +503,31 @@ class BaseIngestAPIView(View): class IngestEventAPIView(BaseIngestAPIView): def _post(self, request, project_pk=None): + # This endpoint is deprecated. Personally, I think it's the simpler (and given my goals therefore better) of the + # two, but fighting windmills and all... given that it's deprecated, I'm not going to give it quite as much love + # (at least for now). + # + # The main point of "inefficiency" is that the event data is parsed twice: once here (to get the event_id), and + # once in the actual digest.delay() ingested_at = datetime.now(timezone.utc) project = self.get_project_for_request(project_pk, request) if project.quota_exceeded_until is not None and ingested_at < project.quota_exceeded_until: return HttpResponse(status=HTTP_429_TOO_MANY_REQUESTS) - # This endpoint is deprecated. Personally, I think it's the simpler (and given my goals therefore better) of the - # two, but fighting windmills and all... given that it's deprecated, I'm not going to give it quite as much love - # (at least for now). Interfaces between the internal methods quite changed a bit recently, and this one did not - # keep up. - # - # In particular I'd like to just call process_event() here, but that takes both an event_id and an unparsed data - # stream, and we don't have an event_id here before parsing (and we don't want to parse twice). similarly, - # event_metadata construction requires the event_id. - # - # Instead, we just copy/pasted the relevant parts of process_event() here, and take only one branch (the one - # that digests immediately); i.e. we always digest immediately, independent of the setting. 
+ event_data_bytes = MaxDataReader( + "MAX_EVENT_SIZE", content_encoding_reader(MaxDataReader("MAX_EVENT_COMPRESSED_SIZE", request))).read() - event_data = json.loads( - MaxDataReader("MAX_EVENT_SIZE", content_encoding_reader( - MaxDataReader("MAX_EVENT_COMPRESSED_SIZE", request))).read()) + performance_logger.info("ingested event with %s bytes", len(event_data_bytes)) + + event_data = json.loads(event_data_bytes) + filename = get_filename_for_event_id(event_data["event_id"]) + b108_makedirs(os.path.dirname(filename)) + with open(filename, 'w') as f: + json.dump(event_data, f) event_metadata = self.get_event_meta(event_data["event_id"], ingested_at, request, project) - self.digest_event(event_metadata, event_data) + digest.delay(event_data["event_id"], event_metadata) return HttpResponse() From d945b39259876d29029efd9e95248d189fb22b05 Mon Sep 17 00:00:00 2001 From: Klaas van Schelven Date: Wed, 5 Nov 2025 09:27:10 +0100 Subject: [PATCH 08/31] Dead code removal this was inlined in 7f831f52d4ca --- ingest/views.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/ingest/views.py b/ingest/views.py index c28fb1b..91c7b3a 100644 --- a/ingest/views.py +++ b/ingest/views.py @@ -150,12 +150,6 @@ class BaseIngestAPIView(View): sentry_key = cls.get_sentry_key_for_request(request) return cls.get_project(project_pk, sentry_key) - @classmethod - def process_event(cls, ingested_at, event_id, event_data_stream, project, request): - event_metadata = cls.get_event_meta(event_id, ingested_at, request, project) - performance_logger.info("ingested event with %s bytes", event_data_stream.bytes_written) - digest.delay(event_id, event_metadata) - @classmethod def get_event_meta(cls, event_id, ingested_at, request, project): # Meta means: not part of the event data. Basically: information that is available at the time of ingestion, and From d807ea2c50a359ecc0ac0af6b0ea0ad8b5b13804 Mon Sep 17 00:00:00 2001 From: Klaas van Schelven Date: Wed, 5 Nov 2025 11:10:14 +0100 Subject: [PATCH 09/31] Minidump: via envelope interface See #82 --- bugsink/app_settings.py | 1 + bugsink/streams.py | 4 +- ingest/filestore.py | 8 ++- ingest/tasks.py | 8 ++- ingest/views.py | 152 +++++++++++++++++++++++++--------------- 5 files changed, 113 insertions(+), 60 deletions(-) diff --git a/bugsink/app_settings.py b/bugsink/app_settings.py index 3ba51aa..1d59347 100644 --- a/bugsink/app_settings.py +++ b/bugsink/app_settings.py @@ -49,6 +49,7 @@ DEFAULTS = { # MAX* below mirror the (current) values for the Sentry Relay "MAX_EVENT_SIZE": _MEBIBYTE, + "MAX_ATTACHMENT_SIZE": 100 * _MEBIBYTE, "MAX_EVENT_COMPRESSED_SIZE": 200 * _KIBIBYTE, # Note: this only applies to the deprecated "store" endpoint. 
"MAX_ENVELOPE_SIZE": 100 * _MEBIBYTE, "MAX_ENVELOPE_COMPRESSED_SIZE": 20 * _MEBIBYTE, diff --git a/bugsink/streams.py b/bugsink/streams.py index e3c6120..cb9d5b8 100644 --- a/bugsink/streams.py +++ b/bugsink/streams.py @@ -116,7 +116,7 @@ class MaxDataReader: self.bytes_read = 0 self.stream = stream - if isinstance(max_length, str): # reusing this is a bit of a hack, but leads to readable code at usage + if isinstance(max_length, str): # support for settings-name max_length makes both the code and errors better self.max_length = get_settings()[max_length] self.reason = "%s: %s" % (max_length, self.max_length) else: @@ -145,7 +145,7 @@ class MaxDataWriter: self.bytes_written = 0 self.stream = stream - if isinstance(max_length, str): # reusing this is a bit of a hack, but leads to readable code at usage + if isinstance(max_length, str): # support for settings-name max_length makes both the code and errors better self.max_length = get_settings()[max_length] self.reason = "%s: %s" % (max_length, self.max_length) else: diff --git a/ingest/filestore.py b/ingest/filestore.py index b8737cf..82a0321 100644 --- a/ingest/filestore.py +++ b/ingest/filestore.py @@ -4,7 +4,7 @@ from django.utils._os import safe_join from bugsink.app_settings import get_settings -def get_filename_for_event_id(event_id): +def get_filename_for_event_id(event_id, filetype="event"): # The idea of having some levels of directories here (to avoid too many files in a single dir) is not yet # implemented. However, counterpoint: when doing stress tests, it was quite hard to get a serious backlog going # (snappea was very well able to play catch-up). So this might not be necessary. @@ -15,4 +15,8 @@ def get_filename_for_event_id(event_id): # without needing to inspect all call-sites) event_id_normalized = uuid.UUID(event_id).hex - return safe_join(get_settings().INGEST_STORE_BASE_DIR, event_id_normalized) + basename = event_id_normalized + if filetype == "minidump": + basename += ".dmp" + + return safe_join(get_settings().INGEST_STORE_BASE_DIR, basename) diff --git a/ingest/tasks.py b/ingest/tasks.py index 362de72..522ee3d 100644 --- a/ingest/tasks.py +++ b/ingest/tasks.py @@ -18,8 +18,14 @@ def digest(event_id, event_metadata): with open(get_filename_for_event_id(event_id), "rb") as f: event_data = json.loads(f.read().decode("utf-8")) + if event_metadata.get("has_minidump"): + with open(get_filename_for_event_id(event_id, filetype="minidump"), "rb") as f: + minidump_bytes = f.read() + else: + minidump_bytes = None + try: - BaseIngestAPIView.digest_event(event_metadata, event_data) + BaseIngestAPIView.digest_event(event_metadata, event_data, minidump_bytes=minidump_bytes) except ValidationError as e: logger.warning("ValidationError in digest_event", exc_info=e) finally: diff --git a/ingest/views.py b/ingest/views.py index 91c7b3a..1a08819 100644 --- a/ingest/views.py +++ b/ingest/views.py @@ -1,3 +1,4 @@ +from collections import defaultdict import uuid import hashlib import os @@ -150,6 +151,34 @@ class BaseIngestAPIView(View): sentry_key = cls.get_sentry_key_for_request(request) return cls.get_project(project_pk, sentry_key) + @classmethod + def process_minidump(cls, ingested_at, minidump_bytes, project, request): + # This is for the "pure" minidump case, i.e. no associated event data. TSTTCPW: convert the minidump data to an + # event and then proceed as usual. 
+ + performance_logger.info("ingested minidump with %s bytes", len(minidump_bytes)) + + event_id = uuid.uuid4().hex + event_data = { + "event_id": event_id, + "platform": "native", + "extra": {}, + "errors": [], + } + + merge_minidump_event(event_data, minidump_bytes) + + # write the event data to disk: + filename = get_filename_for_event_id(event_data["event_id"]) + b108_makedirs(os.path.dirname(filename)) + with open(filename, 'w') as f: + json.dump(event_data, f) + + event_metadata = cls.get_event_meta(event_data["event_id"], ingested_at, request, project) + digest.delay(event_data["event_id"], event_metadata) + + return event_id + @classmethod def get_event_meta(cls, event_id, ingested_at, request, project): # Meta means: not part of the event data. Basically: information that is available at the time of ingestion, and @@ -224,7 +253,7 @@ class BaseIngestAPIView(View): @classmethod @immediate_atomic() - def digest_event(cls, event_metadata, event_data, digested_at=None): + def digest_event(cls, event_metadata, event_data, digested_at=None, minidump_bytes=None): # ingested_at is passed from the point-of-ingestion; digested_at is determined here. Because this happens inside # `immediate_atomic`, we know digestions are serialized, and assuming non-decreasing server clocks, not decrea- # sing. (no so for ingestion times: clock-watching happens outside the snappe transaction, and threading in the @@ -254,6 +283,10 @@ class BaseIngestAPIView(View): if get_settings().VALIDATE_ON_DIGEST in ["warn", "strict"]: cls.validate_event_data(event_data, get_settings().VALIDATE_ON_DIGEST) + if minidump_bytes is not None: + # we merge after validation: validation is about what's provided _externally_, not our own merging. + merge_minidump_event(event_data, minidump_bytes) + # I resisted the temptation to put `get_denormalized_fields_for_data` in an if-statement: you basically "always" # need this info... except when duplicate event-ids are sent. But the latter is the exception, and putting this # in an if-statement would require more rework (and possibly extra queries) than it's worth. @@ -601,43 +634,78 @@ class IngestEnvelopeAPIView(BaseIngestAPIView): return HttpResponse(status=HTTP_429_TOO_MANY_REQUESTS) def factory(item_headers): - if item_headers.get("type") == "event": - # envelope_headers["event_id"] is required when type=event per the spec (and takes precedence over the - # payload's event_id), so we can rely on it having been set. 
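            # For orientation, the envelope layout this factory has to cope with (a sketch, cf. the
            # tests added later in this series): newline-separated JSON headers, then payloads:
            #
            #     {"event_id": "9ec79c33ec9942ab8353589fcb2e04dc"}
            #     {"type": "event"}
            #     {...event payload...}
            #     {"type": "attachment", "attachment_type": "event.minidump", "length": 4095}
            #     <raw minidump bytes, which should start with the MDMP magic>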
- if "event_id" not in envelope_headers: - raise ParseError("event_id not found in envelope headers") + type_ = item_headers.get("type") - try: - # validate that the event_id is a valid UUID as per the spec (validate at the edge) - uuid.UUID(envelope_headers["event_id"]) - except ValueError: - raise ParseError("event_id in envelope headers is not a valid UUID") + if ((type_ not in ["event", "attachment"]) or + (item_headers.get("type") == "attachment" and + item_headers.get("attachment_type") != "event.minidump")): - filename = get_filename_for_event_id(envelope_headers["event_id"]) - b108_makedirs(os.path.dirname(filename)) - return MaxDataWriter("MAX_EVENT_SIZE", open(filename, 'wb')) + # non-event/minidumps can be discarded; (we don't check for individual size limits, because these differ + # per item type, we have the envelope limit to protect us, and we incur almost no cost (NullWriter)) + return NullWriter() - # everything else can be discarded; (we don't check for individual size limits, because these differ - # per item type, we have the envelope limit to protect us, and we incur almost no cost (NullWriter) anyway. - return NullWriter() + # envelope_headers["event_id"] is required when type in ["event", "attachment"] per the spec (and takes + # precedence over the payload's event_id), so we can rely on it having been set. + if "event_id" not in envelope_headers: + raise ParseError("event_id not found in envelope headers") - for item_headers, event_output_stream in parser.get_items(factory): - if item_headers.get("type") != "event": - logger.info("skipping non-event item: %s", item_headers.get("type")) + try: + # validate that the event_id is a valid UUID as per the spec (validate at the edge) + uuid.UUID(envelope_headers["event_id"]) + except ValueError: + raise ParseError("event_id in envelope headers is not a valid UUID") - if item_headers.get("type") == "transaction": - # From the spec of type=event: This Item is mutually exclusive with `"transaction"` Items. - # i.e. when we see a transaction, a regular event will not be present and we can stop. 
- logger.info("discarding the rest of the envelope") - break + filetype = "event" if type_ == "event" else "minidump" + filename = get_filename_for_event_id(envelope_headers["event_id"], filetype=filetype) + b108_makedirs(os.path.dirname(filename)) + size_conf = "MAX_EVENT_SIZE" if type_ == "event" else "MAX_ATTACHMENT_SIZE" + return MaxDataWriter(size_conf, open(filename, 'wb')) + + # We ingest the whole envelope first and organize by type; this enables "digest once" across envelope-parts + items_by_type = defaultdict(list) + for item_headers, output_stream in parser.get_items(factory): + type_ = item_headers.get("type") + if type_ not in ["event", "attachment"]: + logger.info("skipping non-supported envelope item: %s", item_headers.get("type")) continue - performance_logger.info("ingested event with %s bytes", event_output_stream.bytes_written) - event_metadata = self.get_event_meta(envelope_headers["event_id"], ingested_at, request, project) + if type_ == "attachment" and item_headers.get("attachment_type") != "event.minidump": + logger.info("skipping non-supported attachment type: %s", item_headers.get("attachment_type")) + continue + + performance_logger.info("ingested %s with %s bytes", type_, output_stream.bytes_written) + items_by_type[type_].append(output_stream) + + event_count = len(items_by_type.get("event", [])) + minidump_count = len(items_by_type.get("attachment", [])) + + if event_count > 1 or minidump_count > 1: + # TODO: we do 2 passes (one for storing, one for calling the right task), and we check certain conditions + # only on the second pass; this means that we may not clean up after ourselves yet. + # TODO we don't do any minidump files cleanup yet in any of the cases. + + logger.info( + "can only deal with one event/minidump per envelope but found %s/%s, ignoring this envelope.", + event_count, minidump_count) + return HttpResponse() + + event_metadata = self.get_event_meta(envelope_headers["event_id"], ingested_at, request, project) + + if event_count == 1: + if minidump_count == 1: + event_metadata["has_minidump"] = True digest.delay(envelope_headers["event_id"], event_metadata) - break # From the spec of type=event: This Item may occur at most once per Envelope. once seen: done + else: + # as it stands, we implement the minidump->event path for the minidump-only case on-ingest; we could push + # this to a task too if needed or for reasons of symmetry. + with open(get_filename_for_event_id(envelope_headers["event_id"], filetype="minidump"), 'rb') as f: + minidump_bytes = f.read() + + # TODO: error handling + # NOTE "The file should start with the MDMP magic bytes." is not checked yet. + self.process_minidump(ingested_at, minidump_bytes, project, request) return HttpResponse() @@ -663,18 +731,6 @@ class MinidumpAPIView(BaseIngestAPIView): # A Base "Ingest" APIView in the sense that it reuses some key building blocks (auth). # I'm not 100% sure whether "philosophically" the minidump endpoint is also "ingesting"; we'll see. - @classmethod - def _ingest(cls, ingested_at, event_data, project, request): - # TSTTCPW: convert the minidump data to an event and then proceed as usual. 
- filename = get_filename_for_event_id(event_data["event_id"]) - b108_makedirs(os.path.dirname(filename)) - with open(filename, 'w') as f: - json.dump(event_data, f) - - # performance_logger.info("ingested event with %s bytes", event_output_stream.bytes_written) TODO for minidump - event_metadata = cls.get_event_meta(event_data["event_id"], ingested_at, request, project) - digest.delay(event_data["event_id"], event_metadata) - def post(self, request, project_pk=None): # not reusing the CORS stuff here; minidump-from-browser doesn't make sense. @@ -682,26 +738,12 @@ class MinidumpAPIView(BaseIngestAPIView): project = self.get_project_for_request(project_pk, request) try: - # in this flow, we don't get an event_id from the client, so we just generate one here. - event_id = uuid.uuid4().hex - minidump_bytes = request.FILES["upload_file_minidump"].read() - data = { - "event_id": event_id, - "platform": "native", - "extra": {}, - "errors": [], - } - - merge_minidump_event(data, minidump_bytes) - - self._ingest(ingested_at, data, project, request) + event_id = self.process_minidump(ingested_at, minidump_bytes, project, request) return JsonResponse({"id": event_id}) - except Exception as e: - raise return JsonResponse({"detail": str(e)}, status=HTTP_400_BAD_REQUEST) From 2e2a8cfeeb9eca45785c8b0f044ba06f97fdd984 Mon Sep 17 00:00:00 2001 From: Klaas van Schelven Date: Wed, 5 Nov 2025 11:20:30 +0100 Subject: [PATCH 10/31] envelope endpoint tests: slight cleanup --- ingest/tests.py | 139 +++++++++++++++++++++--------------------------- 1 file changed, 61 insertions(+), 78 deletions(-) diff --git a/ingest/tests.py b/ingest/tests.py index 986043d..879f1a8 100644 --- a/ingest/tests.py +++ b/ingest/tests.py @@ -21,6 +21,7 @@ from projects.models import Project from events.factories import create_event_data, create_event from events.retention import evict_for_max_events from events.storage_registry import override_event_storages +from events.models import Event from issues.factories import get_or_create_issue from issues.models import IssueStateManager, Issue, TurningPoint, TurningPointKind from issues.utils import get_values @@ -293,38 +294,71 @@ class IngestViewTestCase(TransactionTestCase): sentry_auth_header = get_header_value(f"http://{ project.sentry_key }@hostisignored/{ project.id }") - # first, we ingest many issues - command = SendJsonCommand() - command.stdout = io.StringIO() - command.stderr = io.StringIO() + SAMPLES_DIR = os.getenv("SAMPLES_DIR", "../event-samples") + + filename = glob(SAMPLES_DIR + "/sentry/mobile1-xen.json")[0] # pick a fixed one for reproducibility + + for i, include_event_id in enumerate([True, False]): + with open(filename) as f: + data = json.loads(f.read()) + + data["event_id"] = uuid.uuid4().hex # for good measure we reset this to avoid duplicates. 
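            # (the two passes differ in where the event_id lives: first inside the payload as well,
            # then only in the envelope header, which per the spec takes precedence anyway)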
+ + if "timestamp" not in data: + # as per send_json command ("weirdly enough a large numer of sentry test data don't actually...") + data["timestamp"] = time.time() + + event_id = data["event_id"] + if not include_event_id: + del data["event_id"] + + data_bytes = json.dumps(data).encode("utf-8") + data_bytes = ( + b'{"event_id": "%s"}\n{"type": "event"}\n' % event_id.encode("utf-8") + data_bytes) + + response = self.client.post( + f"/api/{ project.id }/envelope/", + content_type="application/json", + headers={ + "X-Sentry-Auth": sentry_auth_header, + "X-BugSink-DebugInfo": filename, + }, + data=data_bytes, + ) + self.assertEqual( + 200, response.status_code, response.content if response.status_code != 302 else response.url) + + self.assertEqual(1 + i, Event.objects.count()) + + @tag("samples") + def test_envelope_endpoint_reused_ids_different_exceptions(self): + # dirty copy/paste from test_envelope_endpoint, + project = Project.objects.create(name="test") + + sentry_auth_header = get_header_value(f"http://{ project.sentry_key }@hostisignored/{ project.id }") SAMPLES_DIR = os.getenv("SAMPLES_DIR", "../event-samples") - event_samples = glob(SAMPLES_DIR + "/sentry/mobile1-xen.json") # pick a fixed one for reproducibility - known_broken = [SAMPLES_DIR + "/" + s.strip() for s in _readlines(SAMPLES_DIR + "/KNOWN-BROKEN")] + filename = glob(SAMPLES_DIR + "/sentry/mobile1-xen.json")[0] # this one has 'exception.values[0].type' - if len(event_samples) == 0: - raise Exception(f"No event samples found in {SAMPLES_DIR}; I insist on having some to test with.") + with open(filename) as f: + data = json.loads(f.read()) + data["event_id"] = uuid.uuid4().hex # we set it once, before the loop. - for include_event_id in [True, False]: - for filename in [sample for sample in event_samples if sample not in known_broken][:1]: # one is enough - with open(filename) as f: - data = json.loads(f.read()) + for type_ in ["Foo", "Bar"]: # forces different groupers, leading to separate Issue objects + data['exception']['values'][0]['type'] = type_ - data["event_id"] = uuid.uuid4().hex # for good measure we reset this to avoid duplicates. 
+ if "timestamp" not in data: + # as per send_json command ("weirdly enough a large numer of sentry test data don't actually...") + data["timestamp"] = time.time() - if "timestamp" not in data: - # as per send_json command ("weirdly enough a large numer of sentry test data don't actually...") - data["timestamp"] = time.time() + event_id = data["event_id"] - event_id = data["event_id"] - if not include_event_id: - del data["event_id"] - - data_bytes = json.dumps(data).encode("utf-8") - data_bytes = ( - b'{"event_id": "%s"}\n{"type": "event"}\n' % event_id.encode("utf-8") + data_bytes) + data_bytes = json.dumps(data).encode("utf-8") + data_bytes = ( + b'{"event_id": "%s"}\n{"type": "event"}\n' % event_id.encode("utf-8") + data_bytes) + def check(): response = self.client.post( f"/api/{ project.id }/envelope/", content_type="application/json", @@ -337,62 +371,11 @@ class IngestViewTestCase(TransactionTestCase): self.assertEqual( 200, response.status_code, response.content if response.status_code != 302 else response.url) - @tag("samples") - def test_envelope_endpoint_reused_ids_different_exceptions(self): - # dirty copy/paste from test_envelope_endpoint, - project = Project.objects.create(name="test") - - sentry_auth_header = get_header_value(f"http://{ project.sentry_key }@hostisignored/{ project.id }") - - # first, we ingest many issues - command = SendJsonCommand() - command.stdout = io.StringIO() - command.stderr = io.StringIO() - - SAMPLES_DIR = os.getenv("SAMPLES_DIR", "../event-samples") - - event_samples = glob(SAMPLES_DIR + "/sentry/mobile1-xen.json") # this one has 'exception.values[0].type' - known_broken = [SAMPLES_DIR + "/" + s.strip() for s in _readlines(SAMPLES_DIR + "/KNOWN-BROKEN")] - - if len(event_samples) == 0: - raise Exception(f"No event samples found in {SAMPLES_DIR}; I insist on having some to test with.") - - for filename in [sample for sample in event_samples if sample not in known_broken][:1]: # one is enough - with open(filename) as f: - data = json.loads(f.read()) - data["event_id"] = uuid.uuid4().hex # we set it once, before the loop. 
- - for type_ in ["Foo", "Bar"]: # forces different groupers, leading to separate Issue objects - data['exception']['values'][0]['type'] = type_ - - if "timestamp" not in data: - # as per send_json command ("weirdly enough a large numer of sentry test data don't actually...") - data["timestamp"] = time.time() - - event_id = data["event_id"] - - data_bytes = json.dumps(data).encode("utf-8") - data_bytes = ( - b'{"event_id": "%s"}\n{"type": "event"}\n' % event_id.encode("utf-8") + data_bytes) - - def check(): - response = self.client.post( - f"/api/{ project.id }/envelope/", - content_type="application/json", - headers={ - "X-Sentry-Auth": sentry_auth_header, - "X-BugSink-DebugInfo": filename, - }, - data=data_bytes, - ) - self.assertEqual( - 200, response.status_code, response.content if response.status_code != 302 else response.url) - - if type_ == "Foo": + if type_ == "Foo": + check() + else: + with self.assertRaises(ViolatedExpectation): check() - else: - with self.assertRaises(ViolatedExpectation): - check() @tag("samples") def test_filestore(self): From 48a818bed1f48184245414ea38597fcce71119c6 Mon Sep 17 00:00:00 2001 From: Klaas van Schelven Date: Wed, 5 Nov 2025 12:36:17 +0100 Subject: [PATCH 11/31] Tests for envelope minidump API --- ingest/tests.py | 92 +++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 89 insertions(+), 3 deletions(-) diff --git a/ingest/tests.py b/ingest/tests.py index 879f1a8..d7e96ea 100644 --- a/ingest/tests.py +++ b/ingest/tests.py @@ -296,7 +296,7 @@ class IngestViewTestCase(TransactionTestCase): SAMPLES_DIR = os.getenv("SAMPLES_DIR", "../event-samples") - filename = glob(SAMPLES_DIR + "/sentry/mobile1-xen.json")[0] # pick a fixed one for reproducibility + filename = glob(SAMPLES_DIR + "/bugsink/contexts.json")[0] # pick a fixed one for reproducibility for i, include_event_id in enumerate([True, False]): with open(filename) as f: @@ -330,6 +330,94 @@ class IngestViewTestCase(TransactionTestCase): self.assertEqual(1 + i, Event.objects.count()) + @tag("samples") + def test_envelope_endpoint_event_and_minidump(self): + # dirty copy/paste from the integration test, let's start with "something", we can always clean it later. + project = Project.objects.create(name="test") + + sentry_auth_header = get_header_value(f"http://{ project.sentry_key }@hostisignored/{ project.id }") + + SAMPLES_DIR = os.getenv("SAMPLES_DIR", "../event-samples") + + filename = glob(SAMPLES_DIR + "/bugsink/contexts.json")[0] # pick a fixed one for reproducibility + with open(filename) as f: + data = json.loads(f.read()) + + data["event_id"] = uuid.uuid4().hex # for good measure we reset this to avoid duplicates. 
+ + if "timestamp" not in data: + # as per send_json command ("weirdly enough a large numer of sentry test data don't actually...") + data["timestamp"] = time.time() + + filename = glob(SAMPLES_DIR + "/minidumps/linux_overflow.dmp")[0] # pick a fixed one for reproducibility + with open(filename, 'rb') as f: + minidump_bytes = f.read() + + event_id = data["event_id"] + + event_bytes = json.dumps(data).encode("utf-8") + data_bytes = ( + b'{"event_id": "%s"}\n' % event_id.encode("utf-8") + + b'{"type": "event"}\n' + event_bytes + b"\n" + + b'{"type": "attachment", "attachment_type": "event.minidump", "length": %d}\n' % len(minidump_bytes) + + minidump_bytes + ) + + response = self.client.post( + f"/api/{ project.id }/envelope/", + content_type="application/json", + headers={ + "X-Sentry-Auth": sentry_auth_header, + }, + data=data_bytes, + ) + self.assertEqual( + 200, response.status_code, response.content if response.status_code != 302 else response.url) + + self.assertEqual(1, Event.objects.count()) + event = Event.objects.get() + self.assertTrue("prod" in ([tag.value.value for tag in event.tags.all()])) # from the sample event + + self.assertEqual('SIGABRT: Fatal Error: SIGABRT', Event.objects.get().title()) + + @tag("samples") + def test_envelope_endpoint_minidump_only(self): + # dirty copy/paste from the integration test, let's start with "something", we can always clean it later. + project = Project.objects.create(name="test") + + sentry_auth_header = get_header_value(f"http://{ project.sentry_key }@hostisignored/{ project.id }") + + SAMPLES_DIR = os.getenv("SAMPLES_DIR", "../event-samples") + + filename = glob(SAMPLES_DIR + "/minidumps/linux_overflow.dmp")[0] # pick a fixed one for reproducibility + with open(filename, 'rb') as f: + minidump_bytes = f.read() + + event_id = uuid.uuid4().hex # required at the envelope level so we provide it. 
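For reference, a minimal self-contained sketch of the envelope framing these tests build by hand (the helper name is made up; unlike the tests above, which only set an explicit "length" for the binary minidump item, this sketch sets it for every item):

import json

def frame_envelope(event_id, items):
    # items: list of (item_headers: dict, payload: bytes)
    out = json.dumps({"event_id": event_id}).encode("utf-8") + b"\n"
    for item_headers, payload in items:
        item_headers = dict(item_headers, length=len(payload))  # explicit length, so binary payloads need no newline termination
        out += json.dumps(item_headers).encode("utf-8") + b"\n" + payload + b"\n"
    return out

# e.g.: frame_envelope(event_id, [
#     ({"type": "event"}, event_bytes),
#     ({"type": "attachment", "attachment_type": "event.minidump"}, minidump_bytes),
# ])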
+ + data_bytes = ( + b'{"event_id": "%s"}\n' % event_id.encode("utf-8") + + b'{"type": "attachment", "attachment_type": "event.minidump", "length": %d}\n' % len(minidump_bytes) + + minidump_bytes + ) + + response = self.client.post( + f"/api/{ project.id }/envelope/", + content_type="application/json", + headers={ + "X-Sentry-Auth": sentry_auth_header, + }, + data=data_bytes, + ) + self.assertEqual( + 200, response.status_code, response.content if response.status_code != 302 else response.url) + + self.assertEqual(1, Event.objects.count()) + event = Event.objects.get() + self.assertFalse("prod" in ([tag.value.value for tag in event.tags.all()])) # no sample event, so False + + self.assertEqual('SIGABRT: Fatal Error: SIGABRT', Event.objects.get().title()) + @tag("samples") def test_envelope_endpoint_reused_ids_different_exceptions(self): # dirty copy/paste from test_envelope_endpoint, @@ -364,7 +452,6 @@ class IngestViewTestCase(TransactionTestCase): content_type="application/json", headers={ "X-Sentry-Auth": sentry_auth_header, - "X-BugSink-DebugInfo": filename, }, data=data_bytes, ) @@ -450,7 +537,6 @@ class IngestViewTestCase(TransactionTestCase): content_type="application/json", headers={ "X-Sentry-Auth": sentry_auth_header, - "X-BugSink-DebugInfo": filename, }, data=data_bytes, ) From c4cf038a93df7843f69c10076e289c95b8375701 Mon Sep 17 00:00:00 2001 From: Klaas van Schelven Date: Wed, 5 Nov 2025 12:59:42 +0100 Subject: [PATCH 12/31] minidump after-digest cleanup envelope-based happy path only --- ingest/tasks.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/ingest/tasks.py b/ingest/tasks.py index 522ee3d..0077d46 100644 --- a/ingest/tasks.py +++ b/ingest/tasks.py @@ -1,3 +1,4 @@ +import contextlib import os import logging import json @@ -17,10 +18,12 @@ def digest(event_id, event_metadata): with open(get_filename_for_event_id(event_id), "rb") as f: event_data = json.loads(f.read().decode("utf-8")) + opened = [get_filename_for_event_id(event_id)] if event_metadata.get("has_minidump"): with open(get_filename_for_event_id(event_id, filetype="minidump"), "rb") as f: minidump_bytes = f.read() + opened += [get_filename_for_event_id(event_id, filetype="minidump")] else: minidump_bytes = None @@ -29,11 +32,10 @@ def digest(event_id, event_metadata): except ValidationError as e: logger.warning("ValidationError in digest_event", exc_info=e) finally: - # NOTE: if an SDK misbehaves, and sends the same event_id multiple times in quick succession, the line below - # will trigger a FileNotFoundError on the second attempt to delete the file (the files also overwrite each other - # on-ingest). In that case your logs will also a "ValidationError in digest_event". Although that means an error - # bubbles up from the below, at least for now I'm OK with that. (next steps _could_ be: [a] catching the error - # as expected [b] refusing to "just overwrite and doubly enqueue on-ingest" [c] reporting about this particular - # problem to the end-user etc... but at least "getting it really right" might actually be quite hard (race - # conditions) and I'm not so sure it's worth it. - os.unlink(get_filename_for_event_id(event_id)) + # NOTE: if an SDK misbehaves, and sends the same event_id multiple times in quick succession, the os.unlink + # below will trigger a FileNotFoundError on the second attempt to delete the file (the events also overwrite + # each other on-ingest, but that's separately dealt with, showing a "ValidationError in digest_event". 
We're + # just catching those and ignoring them (bubble-up is not desirable because it hinders follow-up cleanups) + for filename in opened: + with contextlib.suppress(FileNotFoundError): + os.unlink(filename) From cb8f913cbe6592a267fff0c1e7e05a61f82ed255 Mon Sep 17 00:00:00 2001 From: Klaas van Schelven Date: Wed, 5 Nov 2025 13:23:31 +0100 Subject: [PATCH 13/31] Document 2 TODOs --- ingest/views.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ingest/views.py b/ingest/views.py index 1a08819..84b3bc7 100644 --- a/ingest/views.py +++ b/ingest/views.py @@ -285,6 +285,8 @@ class BaseIngestAPIView(View): if minidump_bytes is not None: # we merge after validation: validation is about what's provided _externally_, not our own merging. + # TODO error handling + # TODO should not be inside immediate_atomic if it turns out to be slow merge_minidump_event(event_data, minidump_bytes) # I resisted the temptation to put `get_denormalized_fields_for_data` in an if-statement: you basically "always" From 21297eff2a726bd3e5348485d91bdcf8af032459 Mon Sep 17 00:00:00 2001 From: Klaas van Schelven Date: Wed, 5 Nov 2025 14:36:38 +0100 Subject: [PATCH 14/31] Comment about understanding (from reading the code, not actually double-checked right now) --- files/views.py | 1 + 1 file changed, 1 insertion(+) diff --git a/files/views.py b/files/views.py index 881584c..74db12e 100644 --- a/files/views.py +++ b/files/views.py @@ -151,6 +151,7 @@ def chunk_upload(request, organization_slug): # POST: upload (full-size) "chunks" and store them as Chunk objects; file.name whould be the sha1 of the content. chunks = [] if request.FILES: + # "file" and "file_gzip" are both possible multi-value keys for uploading (with associated semantics each) chunks = request.FILES.getlist("file") # NOTE: we read the whole unzipped file into memory; we _could_ take an approach like bugsink/streams.py. 
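To make the chunk-upload contract concrete, a hedged client-side sketch; the URL path, the Bearer auth scheme and the `requests` dependency are assumptions, the grounded part is that each part is sent under the multi-value key "file" and named by the sha1 of its content:

import hashlib

import requests  # assumed client-side dependency, not part of this codebase

def upload_chunks(base_url, organization_slug, token, blobs):
    files = [
        ("file", (hashlib.sha1(blob, usedforsecurity=False).hexdigest(), blob))
        for blob in blobs
    ]
    return requests.post(
        f"{base_url}/api/0/organizations/{organization_slug}/chunk-upload/",  # assumed path
        headers={"Authorization": f"Bearer {token}"},  # assumed auth scheme
        files=files,
    )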
From 7d008da4a136e254f02bea9f5973035569282351 Mon Sep 17 00:00:00 2001 From: Klaas van Schelven Date: Wed, 5 Nov 2025 21:40:07 +0100 Subject: [PATCH 15/31] PoC for difs_assemble works w/ sentry-client; no actual handling yet; TODOs in-code See #82 --- bugsink/urls.py | 5 ++- files/tasks.py | 8 ++--- files/views.py | 81 ++++++++++++++++++++++++++++++++++++++++++++++--- 3 files changed, 85 insertions(+), 9 deletions(-) diff --git a/bugsink/urls.py b/bugsink/urls.py index 2a65174..9e15360 100644 --- a/bugsink/urls.py +++ b/bugsink/urls.py @@ -14,7 +14,7 @@ from teams.views import debug_email as debug_teams_email from bugsink.app_settings import get_settings from users.views import signup, confirm_email, resend_confirmation, request_reset_password, reset_password, preferences from ingest.views import download_envelope -from files.views import chunk_upload, artifact_bundle_assemble, api_root, api_catch_all +from files.views import chunk_upload, artifact_bundle_assemble, difs_assemble, api_root, api_catch_all from bugsink.decorators import login_exempt from events.api_views import EventViewSet @@ -71,6 +71,9 @@ urlpatterns = [ path("api/0/organizations//artifactbundle/assemble/", artifact_bundle_assemble, name="artifact_bundle_assemble"), + path("api/0/projects///files/difs/assemble/", difs_assemble, + name="difs_assemble"), + path('api/', include('ingest.urls')), path('api/0/', api_root, name='api_root'), diff --git a/files/tasks.py b/files/tasks.py index a193708..4e6a62b 100644 --- a/files/tasks.py +++ b/files/tasks.py @@ -105,8 +105,8 @@ def assemble_file(checksum, chunk_checksums, filename): # NOTE: unimplemented checks/tricks # * total file-size v.s. some max - # * explicit check chunk availability (as it stands, our processing is synchronous, so no need) - # * skip-on-checksum-exists + # * explicit check chunk availability + # * skip this whole thing when the (whole-file) checksum exists chunks = Chunk.objects.filter(checksum__in=chunk_checksums) chunks_dicts = {chunk.checksum: chunk for chunk in chunks} @@ -117,7 +117,7 @@ def assemble_file(checksum, chunk_checksums, filename): if sha1(data, usedforsecurity=False).hexdigest() != checksum: raise Exception("checksum mismatch") - result = File.objects.get_or_create( + file, created = File.objects.get_or_create( checksum=checksum, defaults={ "size": len(data), @@ -129,7 +129,7 @@ def assemble_file(checksum, chunk_checksums, filename): # be used in multiple files (which are still being assembled) but with chunksizes in the order of 1MiB, I'd say this # is unlikely. chunks.delete() - return result + return file, created @shared_task diff --git a/files/views.py b/files/views.py index 74db12e..e357fe2 100644 --- a/files/views.py +++ b/files/views.py @@ -15,8 +15,8 @@ from bugsink.app_settings import get_settings from bugsink.transaction import durable_atomic, immediate_atomic from bsmain.models import AuthToken -from .models import Chunk, File -from .tasks import assemble_artifact_bundle +from .models import Chunk, File, FileMetadata +from .tasks import assemble_artifact_bundle, assemble_file logger = logging.getLogger("bugsink.api") @@ -86,7 +86,8 @@ def get_chunk_upload_settings(request, organization_slug): # yet. "release_files", - # this would seem to be the "javascript sourcemaps" thing, but how exactly I did not check yet. 
+ # on second reading I would say: this is "actual source code", but I did not check yet and "don't touch it" + # (even though we don't actually have an implementation for sources yet) "sources", # https://github.com/getsentry/sentry/discussions/46967 @@ -100,7 +101,7 @@ def get_chunk_upload_settings(request, organization_slug): # "artifact_bundles_v2", # the rest of the options are below: - # "debug_files", + "debug_files", # "release_files", # "pdbs", # "bcsymbolmaps", @@ -199,6 +200,78 @@ def artifact_bundle_assemble(request, organization_slug): return JsonResponse({"state": ChunkFileState.CREATED, "missingChunks": []}) +@csrf_exempt # we're in API context here; this could potentially be pulled up to a higher level though +@requires_auth_token +def difs_assemble(request, organization_slug, project_slug): + # TODO move to tasks.something.delay + # TODO think about the right transaction around this + data = json.loads(request.body) + + file_checksums = set(data.keys()) + + existing_files = { + f.file.checksum: f + for f in FileMetadata.objects.filter(file__checksum__in=file_checksums) + } + + all_requested_chunks = { + chunk + for file_info in data.values() + for chunk in file_info.get("chunks", []) + } + + available_chunks = set( + Chunk.objects.filter(checksum__in=all_requested_chunks).values_list("checksum", flat=True) + ) + + response = {} + + for file_checksum, file_info in data.items(): + if file_checksum in existing_files: + response[file_checksum] = { + "state": ChunkFileState.OK, + "missingChunks": [], + # "dif": serialize(existing_files[file_checksum]), # TODO: figure out if this is required. + } + continue + + file_chunks = file_info.get("chunks", []) + + # the sentry-cli sends an empty "chunks" list when just polling for file existence; since we already handled the + # case of existing files above, we can simply return NOT_FOUND here. + if not file_chunks: + response[file_checksum] = { + "state": ChunkFileState.NOT_FOUND, + "missingChunks": [], + } + continue + + missing_chunks = [c for c in file_chunks if c not in available_chunks] + if missing_chunks: + response[file_checksum] = { + "state": ChunkFileState.NOT_FOUND, + "missingChunks": missing_chunks, + } + continue + + file, _ = assemble_file(file_checksum, file_chunks, filename=file_info["name"]) + FileMetadata.objects.get_or_create( + debug_id=file_info["debug_id"], + file_type="dif", # I think? check! + defaults={ + "file": file, + "data": "{}", # this is the "catch all" field but I don't think we have anything in this case. + } + ) + + response[file_checksum] = { + "state": ChunkFileState.OK, + "missingChunks": [], + } + + return JsonResponse(response) + + @user_passes_test(lambda u: u.is_superuser) @durable_atomic def download_file(request, checksum): From ad077b405611db7058c9eea11708ad7ce494d6bf Mon Sep 17 00:00:00 2001 From: Klaas van Schelven Date: Sun, 9 Nov 2025 23:11:10 +0100 Subject: [PATCH 16/31] file_info's debug_id is optional (as per my notes; didn't recheck this when committing) --- files/views.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/files/views.py b/files/views.py index e357fe2..ce307bb 100644 --- a/files/views.py +++ b/files/views.py @@ -256,7 +256,7 @@ def difs_assemble(request, organization_slug, project_slug): file, _ = assemble_file(file_checksum, file_chunks, filename=file_info["name"]) FileMetadata.objects.get_or_create( - debug_id=file_info["debug_id"], + debug_id=file_info.get("debug_id"), file_type="dif", # I think? check! 
defaults={ "file": file, From 1ed03ce053c3004e5bec1de91d0e0265667025df Mon Sep 17 00:00:00 2001 From: Klaas van Schelven Date: Tue, 11 Nov 2025 09:39:04 +0100 Subject: [PATCH 17/31] Support request.body when doing Chuncked Transfer Encoding (ran into this b/c the native minidump upload uses chunked mode and our impl. of that looks at request.body (via FILES)) See #9 --- bugsink/wsgi.py | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/bugsink/wsgi.py b/bugsink/wsgi.py index b527904..8e87d2f 100644 --- a/bugsink/wsgi.py +++ b/bugsink/wsgi.py @@ -47,6 +47,29 @@ def allowed_hosts_error_message(domain, allowed_hosts): return msg + "Add '%s' to ALLOWED_HOSTS or configure proxy to use 'Host: %s'." % (domain, proxy_suggestion) +class NoopClose: + """Delegator: Gunicorn's Body doesn't implement .close(); Django calls that it in request.body's finally clause. + That .close() call in itself is slightly surprising to me (and I have not copied it in my own streaming reads) b/c + the [WSGI spec](https://peps.python.org/pep-3333/#input-and-error-streams) says: + + > Applications conforming to this specification must not use any other methods or attributes of the input or errors + > objects. In particular, applications must not attempt to close these streams, even if they possess close() + > methods. + + In the end, Django conforms to spec because LimitedStream _also_ drops the .close() (it's subclassing `io.IOBase`), + but one wonders why they call it in the first place. Anyway, just stub it and we're good. + """ + + def __init__(self, stream): + self._stream = stream + + def __getattr__(self, name): + return getattr(self._stream, name) + + def close(self): + return None + + class CustomWSGIRequest(WSGIRequest): """ Custom WSQIRequest subclass with 3 fixes/changes: @@ -71,7 +94,7 @@ class CustomWSGIRequest(WSGIRequest): super().__init__(environ) if "CONTENT_LENGTH" not in environ and "HTTP_TRANSFER_ENCODING" in environ: - self._stream = self.environ["wsgi.input"] + self._stream = NoopClose(self.environ["wsgi.input"]) def get_host(self): """ From 80f65c7058dbbac69bcad3bba7056f28e1e5d32b Mon Sep 17 00:00:00 2001 From: Klaas van Schelven Date: Tue, 11 Nov 2025 09:45:53 +0100 Subject: [PATCH 18/31] Comment: CustomWSGIRequest.get_host(): no changes needed for Django 5.2 upgrade this method wasn't changed upstream. See #89 --- bugsink/wsgi.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bugsink/wsgi.py b/bugsink/wsgi.py index 8e87d2f..2130b3b 100644 --- a/bugsink/wsgi.py +++ b/bugsink/wsgi.py @@ -107,7 +107,7 @@ class CustomWSGIRequest(WSGIRequest): # For /health/ endpoints, we skip the ALLOWED_HOSTS validation (see #140). return self._get_raw_host() - # copied from HttpRequest.get_host() in Django 4.2, with modifications. + # copied from HttpRequest.get_host() in Django 5.2, with modifications. 
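A small usage illustration of the NoopClose wrapper introduced above, with io.BytesIO standing in for Gunicorn's body object:

import io

from bugsink.wsgi import NoopClose

raw = io.BytesIO(b"chunked body")
wrapped = NoopClose(raw)
assert wrapped.read() == b"chunked body"  # attribute access is delegated to the wrapped stream
wrapped.close()                           # intercepted: Django's .close() in request.body becomes a no-op
assert raw.closed is False                # the underlying stream is left alone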
host = self._get_raw_host() From 54c96eb68001af01b176515caf0061aed43cb800 Mon Sep 17 00:00:00 2001 From: Klaas van Schelven Date: Tue, 11 Nov 2025 09:48:00 +0100 Subject: [PATCH 19/31] Minidump upload: more explicit errors (and logging) --- ingest/views.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/ingest/views.py b/ingest/views.py index edbb7ba..95478e7 100644 --- a/ingest/views.py +++ b/ingest/views.py @@ -39,6 +39,7 @@ from alerts.tasks import send_new_issue_alert, send_regression_alert from compat.timestamp import format_timestamp, parse_timestamp from tags.models import digest_tags from bsmain.utils import b108_makedirs +from sentry_sdk_extensions import capture_or_log_exception from sentry.minidump import merge_minidump_event from .parsers import StreamingEnvelopeParser, ParseError @@ -735,12 +736,17 @@ class MinidumpAPIView(BaseIngestAPIView): project = self.get_project_for_request(project_pk, request) try: - minidump_bytes = request.FILES["upload_file_minidump"].read() + if "upload_file_minidump" not in request.FILES: + return JsonResponse({"detail": "upload_file_minidump not found"}, status=HTTP_400_BAD_REQUEST) + minidump_bytes = request.FILES["upload_file_minidump"].read() event_id = self.process_minidump(ingested_at, minidump_bytes, project, request) return JsonResponse({"id": event_id}) except Exception as e: + # we're still figuring out what this endpoint should do; so we log errors to learn from them while saying + # to the client "400 Bad Request" if we can't handle their stuff. + capture_or_log_exception(e, logger) return JsonResponse({"detail": str(e)}, status=HTTP_400_BAD_REQUEST) From 72aab81d7d8bfca16fd1565d1ab9ab55ad962cfb Mon Sep 17 00:00:00 2001 From: Klaas van Schelven Date: Tue, 11 Nov 2025 13:39:44 +0100 Subject: [PATCH 20/31] Add ContentEncodingCheckMiddleware --- bugsink/middleware.py | 37 +++++++++++++++++++++++++++++++++++++ bugsink/settings/default.py | 1 + bugsink/tests.py | 7 +++++++ ingest/tests.py | 5 ++++- ingest/urls.py | 4 ++-- 5 files changed, 51 insertions(+), 3 deletions(-) diff --git a/bugsink/middleware.py b/bugsink/middleware.py index 807f617..c7a8c9d 100644 --- a/bugsink/middleware.py +++ b/bugsink/middleware.py @@ -9,11 +9,48 @@ from django.utils.translation import get_supported_language_variant from django.utils.translation.trans_real import parse_accept_lang_header from django.utils import translation from django.urls import get_script_prefix +from django.http import HttpResponseBadRequest performance_logger = logging.getLogger("bugsink.performance.views") +class ContentEncodingCheckMiddleware: + """ + We don't just globally interpret Content-Encoding for all views since: + + 1. this increases our attack service (or forces us to reason about how it doesn't) + 2. forces us to think about the interplay of Django's POST/FILES handling and maximums (DATA_UPLOAD_MAX_MEMORY_SIZE) + and our own maximums and handling. + 3. the various maximums for reading from streaming requests are per-view (underlying data-type) anyway. + + Instead, the only global thing we do is "fail explicitly". 
+ """ + + # NOTE: once this list becomes long, we could switch to a per-view decorator (with the maximum bytes as a value) + SUPPORTED_VIEWS = [ + "ingest-store", + "ingest-envelope", + ] + + def __init__(self, get_response): + self.get_response = get_response + + def __call__(self, request): + return self.get_response(request) + + def process_view(self, request, view_func, view_args, view_kwargs): + if request.resolver_match: + view_name = request.resolver_match.view_name + else: + view_name = "[unknown]" + + if "HTTP_CONTENT_ENCODING" in request.META and view_name not in self.SUPPORTED_VIEWS: + return HttpResponseBadRequest(f"Content-Encoding handling is not supported for endpoint `{view_name}`") + + return None # proceed normally + + class DisallowChunkedMiddleware: def __init__(self, get_response): self.get_response = get_response diff --git a/bugsink/settings/default.py b/bugsink/settings/default.py index 4591012..60867ba 100644 --- a/bugsink/settings/default.py +++ b/bugsink/settings/default.py @@ -129,6 +129,7 @@ AUTH_USER_MODEL = "users.User" TAILWIND_APP_NAME = 'theme' MIDDLEWARE = [ + "bugsink.middleware.ContentEncodingCheckMiddleware", 'bugsink.middleware.SetRemoteAddrMiddleware', 'bugsink.middleware.DisallowChunkedMiddleware', 'django.middleware.security.SecurityMiddleware', diff --git a/bugsink/tests.py b/bugsink/tests.py index 865e7f7..7154a87 100644 --- a/bugsink/tests.py +++ b/bugsink/tests.py @@ -401,6 +401,13 @@ class SetRemoteAddrMiddlewareTestCase(RegularTestCase): SetRemoteAddrMiddleware.parse_x_forwarded_for("123.123.123.123,1.2.3.4") +class ContentEncodingCheckMiddlewareTestCase(DjangoTestCase): + + def test_speak_brotli_with_arbitrary_view_fails(self): + response = self.client.post("/", headers={"Content-Encoding": "br"}) + self.assertTrue(b"Content-Encoding handling is not supported for endpoint `home`" in response.content) + + class AllowedHostsMsgTestCase(DjangoTestCase): def test_allowed_hosts_error_message(self): diff --git a/ingest/tests.py b/ingest/tests.py index b245ad1..1719297 100644 --- a/ingest/tests.py +++ b/ingest/tests.py @@ -471,7 +471,7 @@ class IngestViewTestCase(TransactionTestCase): data_bytes = BROTLI_BOMB_4G t0 = time.time() - self.client.post( + response = self.client.post( f"/api/{ project.id }/envelope/", content_type="application/json", headers={ @@ -486,6 +486,9 @@ class IngestViewTestCase(TransactionTestCase): # the failing version is well above 5s (I just stopped the process after ~30s) self.fail("Brotli bomb caused excessive processing time: %d seconds" % (time.time() - t0)) + self.assertTrue(b"Max length" in response.content, response.content) + self.assertTrue(b"exceeded" in response.content, response.content) + @tag("samples") def test_filestore(self): # quick & dirty way to test the filestore; in absence of a proper test for it, we just run a more-or-less diff --git a/ingest/urls.py b/ingest/urls.py index 9d42b03..d98eaf3 100644 --- a/ingest/urls.py +++ b/ingest/urls.py @@ -4,8 +4,8 @@ from .views import IngestEventAPIView, IngestEnvelopeAPIView, MinidumpAPIView urlpatterns = [ # project_pk has to be an int per Sentry Client expectations. - path("/store/", IngestEventAPIView.as_view()), - path("/envelope/", IngestEnvelopeAPIView.as_view()), + path("/store/", IngestEventAPIView.as_view(), name="ingest-store"), + path("/envelope/", IngestEnvelopeAPIView.as_view(), name="ingest-envelope"), # is this "ingest"? it is at least in the sense that it matches the API schema and downstream auth etc. 
path("/minidump/", MinidumpAPIView.as_view()), From 937df4cbb8d841667cbe5a091e79387aedbe8936 Mon Sep 17 00:00:00 2001 From: Klaas van Schelven Date: Tue, 11 Nov 2025 13:50:07 +0100 Subject: [PATCH 21/31] minidump endpoint: support content encoding adds readline() method to GeneratorReader (ChatGPT-generated; eyeballed for correctness) to match the Django FILES/POST handling expectations. --- bugsink/middleware.py | 1 + bugsink/streams.py | 52 +++++++++++++++++++++++++++++++++++++++---- ingest/urls.py | 2 +- ingest/views.py | 8 ++++++- 4 files changed, 57 insertions(+), 6 deletions(-) diff --git a/bugsink/middleware.py b/bugsink/middleware.py index c7a8c9d..0aa63fd 100644 --- a/bugsink/middleware.py +++ b/bugsink/middleware.py @@ -31,6 +31,7 @@ class ContentEncodingCheckMiddleware: SUPPORTED_VIEWS = [ "ingest-store", "ingest-envelope", + "ingest-minidump", ] def __init__(self, get_response): diff --git a/bugsink/streams.py b/bugsink/streams.py index 2453f08..c542009 100644 --- a/bugsink/streams.py +++ b/bugsink/streams.py @@ -5,6 +5,7 @@ import io import brotli from bugsink.app_settings import get_settings +from bugsink.utils import assert_ DEFAULT_CHUNK_SIZE = 8 * 1024 @@ -119,22 +120,65 @@ class GeneratorReader: del self.buffer[:size] return result + def readline(self, size=-1): + newline_index = self.buffer.find(b"\n") + while newline_index == -1: + chunk = self.read(DEFAULT_CHUNK_SIZE) + if not chunk: + break + self.buffer.extend(chunk) + newline_index = self.buffer.find(b"\n") + + if newline_index != -1: + end = newline_index + 1 + else: + end = len(self.buffer) + + if size >= 0: + end = min(end, size) + + result = bytes(self.buffer[:end]) + del self.buffer[:end] + return result + def content_encoding_reader(request): encoding = request.META.get("HTTP_CONTENT_ENCODING", "").lower() - if encoding == "gzip": - return GeneratorReader(zlib_generator(request, WBITS_PARAM_FOR_GZIP), bad_request_exceptions=(zlib.error,)) + return GeneratorReader( + zlib_generator(request._stream, WBITS_PARAM_FOR_GZIP), + bad_request_exceptions=(zlib.error,), + ) if encoding == "deflate": - return GeneratorReader(zlib_generator(request, WBITS_PARAM_FOR_DEFLATE), bad_request_exceptions=(zlib.error,)) + return GeneratorReader( + zlib_generator(request._stream, WBITS_PARAM_FOR_DEFLATE), + bad_request_exceptions=(zlib.error,) + ) if encoding == "br": - return GeneratorReader(brotli_generator(request), bad_request_exceptions=(brotli.error, BrotliError)) + return GeneratorReader( + brotli_generator(request._stream), + bad_request_exceptions=(brotli.error, BrotliError) + ) return request +def handle_request_content_encoding(request): + """Turns a request w/ Content-Encoding into an unpacked equivalent; for further "regular" (POST, FILES) handling + by Django. 
+ """ + + encoding = request.META.get("HTTP_CONTENT_ENCODING", "").lower() + if encoding in ["gzip", "deflate", "br"]: + assert_(not request._read_started) + request._stream = content_encoding_reader(request) + + request.META["CONTENT_LENGTH"] = str(pow(2, 32) - 1) # large enough (we can't predict the decompressed value) + request.META.pop("HTTP_CONTENT_ENCODING") # the resulting request is no longer encoded + + def compress_with_zlib(input_stream, wbits, chunk_size=DEFAULT_CHUNK_SIZE): # mostly useful for testing (compress-decompress cycles) diff --git a/ingest/urls.py b/ingest/urls.py index d98eaf3..202bc2c 100644 --- a/ingest/urls.py +++ b/ingest/urls.py @@ -8,5 +8,5 @@ urlpatterns = [ path("/envelope/", IngestEnvelopeAPIView.as_view(), name="ingest-envelope"), # is this "ingest"? it is at least in the sense that it matches the API schema and downstream auth etc. - path("/minidump/", MinidumpAPIView.as_view()), + path("/minidump/", MinidumpAPIView.as_view(), name="ingest-minidump"), ] diff --git a/ingest/views.py b/ingest/views.py index 95478e7..d286db5 100644 --- a/ingest/views.py +++ b/ingest/views.py @@ -29,7 +29,9 @@ from issues.regressions import issue_is_regression from bugsink.transaction import immediate_atomic, delay_on_commit from bugsink.exceptions import ViolatedExpectation -from bugsink.streams import content_encoding_reader, MaxDataReader, MaxDataWriter, NullWriter, MaxLengthExceeded +from bugsink.streams import ( + content_encoding_reader, MaxDataReader, MaxDataWriter, NullWriter, MaxLengthExceeded, + handle_request_content_encoding) from bugsink.app_settings import get_settings from events.models import Event @@ -732,6 +734,10 @@ class MinidumpAPIView(BaseIngestAPIView): def post(self, request, project_pk=None): # not reusing the CORS stuff here; minidump-from-browser doesn't make sense. + # TODO: actually implement max (we just use Django defaults now, which will switch to write-to-tmp after 2.5M + # for the file, but this can still swamp your CPU/tmp dir. + handle_request_content_encoding(request) + ingested_at = datetime.now(timezone.utc) project = self.get_project_for_request(project_pk, request) From ab065a632932e5c85cf714ad8c9bde987ca87116 Mon Sep 17 00:00:00 2001 From: Klaas van Schelven Date: Tue, 11 Nov 2025 15:25:51 +0100 Subject: [PATCH 22/31] api_catch_all: header-based rather than try-and-recover, just look at the headers and show body/POST etc. this avoids hard-to-reason about situations where either of those won't work because the other has already been executed; in combination with reasoning about max size usage the explicit solution is simply easier to reason about. further: * makes api_catch_all one of the content_encoding-ready views. 
* implement a max length for the ingest api view --- bugsink/middleware.py | 2 ++ bugsink/streams.py | 4 ++-- files/views.py | 42 +++++++++++++++++++++++++++++++----------- ingest/views.py | 5 ++--- 4 files changed, 37 insertions(+), 16 deletions(-) diff --git a/bugsink/middleware.py b/bugsink/middleware.py index 0aa63fd..27dddf8 100644 --- a/bugsink/middleware.py +++ b/bugsink/middleware.py @@ -32,6 +32,8 @@ class ContentEncodingCheckMiddleware: "ingest-store", "ingest-envelope", "ingest-minidump", + + "api_catch_all", ] def __init__(self, get_response): diff --git a/bugsink/streams.py b/bugsink/streams.py index c542009..3fc93b7 100644 --- a/bugsink/streams.py +++ b/bugsink/streams.py @@ -165,7 +165,7 @@ def content_encoding_reader(request): return request -def handle_request_content_encoding(request): +def handle_request_content_encoding(request, max_length): """Turns a request w/ Content-Encoding into an unpacked equivalent; for further "regular" (POST, FILES) handling by Django. """ @@ -173,7 +173,7 @@ def handle_request_content_encoding(request): encoding = request.META.get("HTTP_CONTENT_ENCODING", "").lower() if encoding in ["gzip", "deflate", "br"]: assert_(not request._read_started) - request._stream = content_encoding_reader(request) + request._stream = MaxDataReader(max_length, content_encoding_reader(request)) request.META["CONTENT_LENGTH"] = str(pow(2, 32) - 1) # large enough (we can't predict the decompressed value) request.META.pop("HTTP_CONTENT_ENCODING") # the resulting request is no longer encoded diff --git a/files/views.py b/files/views.py index ce307bb..96922b8 100644 --- a/files/views.py +++ b/files/views.py @@ -13,6 +13,7 @@ from sentry.assemble import ChunkFileState from bugsink.app_settings import get_settings from bugsink.transaction import durable_atomic, immediate_atomic +from bugsink.streams import handle_request_content_encoding from bsmain.models import AuthToken from .models import Chunk, File, FileMetadata @@ -292,6 +293,8 @@ def api_catch_all(request, subpath): # the existance of this view (and the associated URL pattern) has the effect of `APPEND_SLASH=False` for our API # endpoints, which is a good thing: for API enpoints you generally don't want this kind of magic (explicit breakage # is desirable for APIs, and redirects don't even work for POST/PUT data) + MAX_API_CATCH_ALL_SIZE = 1_000_000 # security and usability meet at this value (or below) + handle_request_content_encoding(request, MAX_API_CATCH_ALL_SIZE) if not get_settings().API_LOG_UNIMPLEMENTED_CALLS: raise Http404("Unimplemented API endpoint: /api/" + subpath) @@ -302,27 +305,44 @@ def api_catch_all(request, subpath): f" Method: {request.method}", ] + interesting_meta_keys = ["CONTENT_TYPE", "CONTENT_LENGTH", "HTTP_TRANSFER_ENCODING"] + interesting_headers = { + k: request.META[k] for k in interesting_meta_keys if k in request.META + } + + if interesting_headers: + lines.append(" Headers:") + for k, v in interesting_headers.items(): + lines.append(f" {k}: {v}") + if request.GET: lines.append(f" GET: {request.GET.dict()}") - body = request.body # note: must be above request.POST access to avoid "You cannot access body after reading ..." 
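A minimal sketch of exercising the content-encoding support from a Django test client (the path and payload are illustrative; the data/content_type/headers shapes mirror the existing ingest tests):

import gzip
import json

def post_gzipped(client, path, payload):
    body = gzip.compress(json.dumps(payload).encode("utf-8"))
    return client.post(
        path,
        data=body,
        content_type="application/json",
        headers={"Content-Encoding": "gzip"},
    )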
- if request.POST: - lines.append(f" POST: {request.POST.dict()}") + content_type = request.META.get("CONTENT_TYPE", "") + if content_type == "application/x-www-form-urlencoded" or content_type.startswith("multipart/form-data"): + if request.POST: + lines.append(f" POST: {request.POST.dict()}") + if request.FILES: + lines.append(f" FILES: {[f.name for f in request.FILES.values()]}") - if body: - try: - decoded = body.decode("utf-8", errors="replace").strip() - lines.append(" Body:") - lines.append(f" {decoded[:500]}") + else: + body = request.read(MAX_API_CATCH_ALL_SIZE) + decoded = body.decode("utf-8", errors="replace").strip() + + if content_type == "application/json": + shown_pretty = False try: parsed = json.loads(decoded) - pretty = json.dumps(parsed, indent=2)[:10_000] + pretty = json.dumps(parsed, indent=2) lines.append(" JSON body:") lines.extend(f" {line}" for line in pretty.splitlines()) + shown_pretty = True except json.JSONDecodeError: pass - except Exception as e: - lines.append(f" Body: ") + + if not shown_pretty: + lines.append(" Body:") + lines.append(f" {body}") logger.info("\n".join(lines)) raise Http404("Unimplemented API endpoint: /api/" + subpath) diff --git a/ingest/views.py b/ingest/views.py index d286db5..3aebb60 100644 --- a/ingest/views.py +++ b/ingest/views.py @@ -734,9 +734,8 @@ class MinidumpAPIView(BaseIngestAPIView): def post(self, request, project_pk=None): # not reusing the CORS stuff here; minidump-from-browser doesn't make sense. - # TODO: actually implement max (we just use Django defaults now, which will switch to write-to-tmp after 2.5M - # for the file, but this can still swamp your CPU/tmp dir. - handle_request_content_encoding(request) + # TODO: actually pick/configure max + handle_request_content_encoding(request, 50 * 1024 * 1024) ingested_at = datetime.now(timezone.utc) project = self.get_project_for_request(project_pk, request) From b60980c8f36b25e659f052d0965d29b763d8dbe9 Mon Sep 17 00:00:00 2001 From: Klaas van Schelven Date: Wed, 12 Nov 2025 14:57:02 +0100 Subject: [PATCH 23/31] PoC: Minidumps w/ symbolification Plenty of TODOs left; but this proves we can find: * file names * function names * line nos * source context See #82 --- files/minidump.py | 143 +++++++++++++++++++++++++++++++++++++++++++++ files/views.py | 8 ++- sentry/minidump.py | 59 +++++++++---------- 3 files changed, 177 insertions(+), 33 deletions(-) create mode 100644 files/minidump.py diff --git a/files/minidump.py b/files/minidump.py new file mode 100644 index 0000000..2ac523c --- /dev/null +++ b/files/minidump.py @@ -0,0 +1,143 @@ +import io +import zipfile +import symbolic +from sentry_sdk_extensions import capture_or_log_exception + +from bugsink.utils import assert_ +from .models import FileMetadata + + +def get_single_object(archive): + # our understanding: sentry-cli uploads single-object archives; we need to get the single object out of it... + # ...but this does raise the question of why archives exist at all... 
hence the assert + objects = list(archive.iter_objects()) + assert_(len(objects) == 1) + return objects[0] + + +def build_cfi_map_from_minidump_bytes(minidump_bytes): + process_state = symbolic.minidump.ProcessState.from_minidump_buffer(minidump_bytes) + + frame_info_map = symbolic.minidump.FrameInfoMap.new() + + for module in process_state.modules(): + if not module.debug_id: + continue + + dashed_debug_id = symbolic.debuginfo.id_from_breakpad(module.debug_id) + if FileMetadata.objects.filter(debug_id=dashed_debug_id, file_type="dbg").count() == 0: + continue + + dif_bytes = FileMetadata.objects.get(debug_id=dashed_debug_id, file_type="dbg").file.data + archive = symbolic.debuginfo.Archive.from_bytes(dif_bytes) + + debug_object = get_single_object(archive) + + cfi = symbolic.minidump.CfiCache.from_object(debug_object) + frame_info_map.add(module.debug_id, cfi) + + return frame_info_map + + +def extract_dif_metadata(dif_bytes): + try: + archive = symbolic.debuginfo.Archive.from_bytes(dif_bytes) + debug_object = get_single_object(archive) + return { + "kind": debug_object.kind, # "dbg", "lib", "src" + "code_id": debug_object.code_id, + "debug_id": debug_object.debug_id, + # "file_format": debug_object.file_format, # "elf", "macho", "pe", "sourcebundle" + } + except Exception as e: + raise # TODO stabalize what we do later + capture_or_log_exception(e) + return {} + + +def extract_source_context(src_bytes, filename, center_line, context=5): + + # TODO the usual worries about zip bombs/memory usage apply here. + with zipfile.ZipFile(io.BytesIO(src_bytes)) as zf: + # sourcebundle entries use relative paths like "src/main.c" or so says ChatGPT + candidates = [n for n in zf.namelist() if n.endswith(filename)] + + if not candidates: + return [], None, [] + + with zf.open(candidates[0]) as f: + lines = f.read().decode("utf-8").splitlines() + + # Clamp line range to valid indices + start = max(center_line - context - 1, 0) + end = min(center_line + context, len(lines)) + + pre_context = lines[start:center_line - 1] + context_line = lines[center_line - 1] if 0 <= center_line - 1 < len(lines) else None + post_context = lines[center_line:end] + + return pre_context, context_line, post_context + + +def _find_module_for_address(process_state, abs_addr: int): + for m in process_state.modules(): + if m.addr and m.size and m.addr <= abs_addr < (m.addr + m.size): + return m + return None + + +def event_threads_for_process_state(process_state): + threads = [] + for thread in process_state.threads(): + thread_frames = [] + + for frame in thread.frames(): + module = _find_module_for_address(process_state, frame.instruction) + fn = file = None + line = 0 + + if module and module.debug_id: + dashed_debug_id = symbolic.debuginfo.id_from_breakpad(module.debug_id) + + file_metadata = FileMetadata.objects.filter(debug_id=dashed_debug_id, file_type="dbg").first() + if file_metadata: + dif_bytes = file_metadata.file.data + + archive = symbolic.debuginfo.Archive.from_bytes(dif_bytes) + objects = list(archive.iter_objects()) + assert len(objects) == 1 + obj = objects[0] + + symcache = obj.make_symcache() + + rel = frame.instruction - module.addr + infos = symcache.lookup(rel) or symcache.lookup(rel - 1) # "or -1" from ChatGPT... should we do it? 
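A self-contained usage example of extract_source_context as defined above, with an in-memory zip standing in for a source bundle (using the "src/..."-style relative paths the comment mentions):

import io
import zipfile

from files.minidump import extract_source_context

buf = io.BytesIO()
with zipfile.ZipFile(buf, "w") as zf:
    zf.writestr("src/main.c", "a\nb\nc\nd\ne\n")

pre, line, post = extract_source_context(buf.getvalue(), "main.c", center_line=3, context=1)
assert (pre, line, post) == (["b"], "c", ["d"])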
+ if infos: + li = infos[0] + fn = li.function_name + file = li.filename + line = li.line + + # if we have line info, try source bundle + src_meta = FileMetadata.objects.filter(debug_id=dashed_debug_id, file_type="src").first() + if src_meta and file and line: + src_bytes = src_meta.file.data + pre_ctx, ctx_line, post_ctx = extract_source_context(src_bytes, file, line) + + thread_frames.append({ + "instruction_addr": f"0x{frame.instruction:x}", + "function": fn or "", + "filename": file, + "lineno": line, + "pre_context": pre_ctx, + "context_line": ctx_line, + "post_context": post_ctx, + }) + + threads.append({ + "id": thread.thread_id, + "crashed": (thread.thread_id == process_state.requesting_thread), + "stacktrace": {"frames": thread_frames}, + }) + + return threads diff --git a/files/views.py b/files/views.py index 96922b8..31ee4f2 100644 --- a/files/views.py +++ b/files/views.py @@ -18,6 +18,7 @@ from bsmain.models import AuthToken from .models import Chunk, File, FileMetadata from .tasks import assemble_artifact_bundle, assemble_file +from .minidump import extract_dif_metadata logger = logging.getLogger("bugsink.api") @@ -256,9 +257,12 @@ def difs_assemble(request, organization_slug, project_slug): continue file, _ = assemble_file(file_checksum, file_chunks, filename=file_info["name"]) + + symbolic_metadata = extract_dif_metadata(file.data) + FileMetadata.objects.get_or_create( - debug_id=file_info.get("debug_id"), - file_type="dif", # I think? check! + debug_id=file_info.get("debug_id"), # TODO : .get implies "no debug_id", but in that case it's useless + file_type=symbolic_metadata["kind"], # NOTE: symbolic's kind goes into file_type... defaults={ "file": file, "data": "{}", # this is the "catch all" field but I don't think we have anything in this case. diff --git a/sentry/minidump.py b/sentry/minidump.py index 8a126c3..a798505 100644 --- a/sentry/minidump.py +++ b/sentry/minidump.py @@ -2,27 +2,28 @@ # https://github.com/getsentry/sentry/blob/f0ac91f2ec6b45ad18e5eea6df72c5c72573e964/src/sentry/models/minidump.py#L53 # with (as it stands) minor modifications. -import logging -from symbolic import ProcessState +import symbolic +from files.minidump import build_cfi_map_from_minidump_bytes, event_threads_for_process_state def merge_minidump_event(data, minidump_bytes): - state = ProcessState.from_minidump_buffer(minidump_bytes) + frame_info_map = build_cfi_map_from_minidump_bytes(minidump_bytes) + process_state = symbolic.ProcessState.from_minidump_buffer(minidump_bytes, frame_infos=frame_info_map) - data['level'] = 'fatal' if state.crashed else 'info' + data['level'] = 'fatal' if process_state.crashed else 'info' - exception_value = 'Assertion Error: %s' % state.assertion if state.assertion \ - else 'Fatal Error: %s' % state.crash_reason + exception_value = 'Assertion Error: %s' % process_state.assertion if process_state.assertion \ + else 'Fatal Error: %s' % process_state.crash_reason # NO_BANANA: data['message'] is not the right target # data['message'] = exception_value - if state.timestamp: - data['timestamp'] = float(state.timestamp) + if process_state.timestamp: + data['timestamp'] = float(process_state.timestamp) # Extract as much system information as we can. TODO: We should create # a custom context and implement a specific minidump view in the event # UI. 
- info = state.system_info + info = process_state.system_info context = data.setdefault('contexts', {}) os = context.setdefault('os', {}) device = context.setdefault('device', {}) @@ -30,46 +31,42 @@ def merge_minidump_event(data, minidump_bytes): os['version'] = info.os_version device['arch'] = info.cpu_family - # We can extract stack traces here already but since CFI is not - # available yet (without debug symbols), the stackwalker will - # resort to stack scanning which yields low-quality results. If - # the user provides us with debug symbols, we will reprocess this - # minidump and add improved stacktraces later. - threads = [{ - 'id': thread.thread_id, - 'crashed': False, - 'stacktrace': { - 'frames': [{ - 'instruction_addr': '0x%x' % frame.instruction, - 'function': '', # Required by interface - } for frame in thread.frames()], - }, - } for thread in state.threads()] - data.setdefault('threads', {})['values'] = threads + threads = event_threads_for_process_state(process_state) + data.setdefault("threads", {})["values"] = threads # Mark the crashed thread and add its stacktrace to the exception - crashed_thread = threads[state.requesting_thread] + crashed_thread = threads[process_state.requesting_thread] crashed_thread['crashed'] = True # Extract the crash reason and infos exception = { 'value': exception_value, 'thread_id': crashed_thread['id'], - 'type': state.crash_reason, + 'type': process_state.crash_reason, # Move stacktrace here from crashed_thread (mutating!) 'stacktrace': crashed_thread.pop('stacktrace'), } + for frame in exception['stacktrace']['frames']: + frame['in_app'] = True # minidumps don't distinguish in_app frames; assume all are in_app + + exception['stacktrace']['frames'].reverse() # "Frames should be sorted from oldest to newest." + # TODO we don't have display-info for threads yet, I think? + # we may need to revert the per-thread stacktraces above as well then + data.setdefault('exception', {}) \ .setdefault('values', []) \ .append(exception) # Extract referenced (not all loaded) images images = [{ - 'type': 'apple', # Required by interface - # 'uuid': module.uuid, NO_BANANA + 'type': 'elf', # TODO not sure what this should _actually_ be 'image_addr': module.addr, 'image_size': module.size, - # 'name': module.name, NO_BANANA - } for module in state.modules()] + 'code_file': module.code_file, + 'code_id': module.code_id, + 'debug_file': module.debug_file, + 'debug_id': symbolic.debuginfo.id_from_breakpad(module.debug_id) if module.debug_id else None, + } for module in process_state.modules()] + data.setdefault('debug_meta', {})['images'] = images From 5757b4f9b52f43f177f69022a308ed0017522e7d Mon Sep 17 00:00:00 2001 From: Klaas van Schelven Date: Wed, 12 Nov 2025 21:03:03 +0100 Subject: [PATCH 24/31] Typo in comment --- issues/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/issues/utils.py b/issues/utils.py index 58a210a..e2348fc 100644 --- a/issues/utils.py +++ b/issues/utils.py @@ -120,7 +120,7 @@ def get_exception_type_and_value_for_exception(data): # From the sentry docs: # > An optional flag indicating that this error is synthetic. Synthetic errors are errors that carry little # > meaning by themselves. - # If this flag is set, we ignored the Exception's type and used the function name instead (if available). + # If this flag is set, we ignore the Exception's type and used the function name instead (if available). 
if get_path(exception, "mechanism", "synthetic"): _, function = get_crash_location(data) if function: From 54ec6eaceb18c0b97d39a8e23fddda46e7141865 Mon Sep 17 00:00:00 2001 From: Klaas van Schelven Date: Wed, 12 Nov 2025 21:04:49 +0100 Subject: [PATCH 25/31] Populate exception['value'] mirrors how we show fetch it in `get_exception_type_and_value_for_exception` --- sentry/minidump.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/sentry/minidump.py b/sentry/minidump.py index a798505..abf40c4 100644 --- a/sentry/minidump.py +++ b/sentry/minidump.py @@ -12,11 +12,6 @@ def merge_minidump_event(data, minidump_bytes): data['level'] = 'fatal' if process_state.crashed else 'info' - exception_value = 'Assertion Error: %s' % process_state.assertion if process_state.assertion \ - else 'Fatal Error: %s' % process_state.crash_reason - # NO_BANANA: data['message'] is not the right target - # data['message'] = exception_value - if process_state.timestamp: data['timestamp'] = float(process_state.timestamp) @@ -38,13 +33,18 @@ def merge_minidump_event(data, minidump_bytes): crashed_thread = threads[process_state.requesting_thread] crashed_thread['crashed'] = True + exception_value = 'Assertion Error: %s' % process_state.assertion if process_state.assertion \ + else 'Fatal Error: %s' %process_state.crash_reason + # Extract the crash reason and infos exception = { 'value': exception_value, 'thread_id': crashed_thread['id'], 'type': process_state.crash_reason, + # Move stacktrace here from crashed_thread (mutating!) 'stacktrace': crashed_thread.pop('stacktrace'), + 'value': exception_value, } for frame in exception['stacktrace']['frames']: From eea5f032e205e9b1e5acd6cd8399b86175b93ae9 Mon Sep 17 00:00:00 2001 From: Klaas van Schelven Date: Wed, 12 Nov 2025 21:33:18 +0100 Subject: [PATCH 26/31] Clarified meaning of process_state.requesting_thread (the now-removed 'treat as pid' was hallunicated by the bot; the taken-from-sentry version missed the guard against -1) > The index of the thread that requested a dump be written in the > threads vector. [..] If the dump was not produced as a result of an exception > [..] 
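For orientation, a hand-written illustration of the event shape merge_minidump_event produces after these changes; all values are invented and frames are left out:

event = {
    "level": "fatal",
    "contexts": {
        "os": {"name": "Linux", "version": "5.15.0"},
        "device": {"arch": "x86_64"},
    },
    "threads": {"values": [
        {"id": 0, "crashed": True},  # its 'stacktrace' was popped into the exception below
        {"id": 1, "crashed": False, "stacktrace": {"frames": []}},
    ]},
    "exception": {"values": [{
        "type": "SIGABRT",
        "value": "Fatal Error: SIGABRT",
        "thread_id": 0,
        "stacktrace": {"frames": []},  # reversed, oldest to newest
    }]},
    "debug_meta": {"images": []},
}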
this field will be set to -1, --- files/minidump.py | 4 ++-- sentry/minidump.py | 42 +++++++++++++++++++----------------------- 2 files changed, 21 insertions(+), 25 deletions(-) diff --git a/files/minidump.py b/files/minidump.py index 2ac523c..9739454 100644 --- a/files/minidump.py +++ b/files/minidump.py @@ -88,7 +88,7 @@ def _find_module_for_address(process_state, abs_addr: int): def event_threads_for_process_state(process_state): threads = [] - for thread in process_state.threads(): + for thread_index, thread in enumerate(process_state.threads()): thread_frames = [] for frame in thread.frames(): @@ -136,7 +136,7 @@ def event_threads_for_process_state(process_state): threads.append({ "id": thread.thread_id, - "crashed": (thread.thread_id == process_state.requesting_thread), + "crashed": thread_index == process_state.requesting_thread, "stacktrace": {"frames": thread_frames}, }) diff --git a/sentry/minidump.py b/sentry/minidump.py index abf40c4..3322649 100644 --- a/sentry/minidump.py +++ b/sentry/minidump.py @@ -29,34 +29,30 @@ def merge_minidump_event(data, minidump_bytes): threads = event_threads_for_process_state(process_state) data.setdefault("threads", {})["values"] = threads - # Mark the crashed thread and add its stacktrace to the exception - crashed_thread = threads[process_state.requesting_thread] - crashed_thread['crashed'] = True + if process_state.requesting_thread > -1: + crashed_thread = threads[process_state.requesting_thread] - exception_value = 'Assertion Error: %s' % process_state.assertion if process_state.assertion \ - else 'Fatal Error: %s' %process_state.crash_reason + exception_value = 'Assertion Error: %s' % process_state.assertion if process_state.assertion \ + else 'Fatal Error: %s' % process_state.crash_reason - # Extract the crash reason and infos - exception = { - 'value': exception_value, - 'thread_id': crashed_thread['id'], - 'type': process_state.crash_reason, + exception = { + 'value': exception_value, + 'thread_id': crashed_thread['id'], + 'type': process_state.crash_reason, + 'stacktrace': crashed_thread.pop('stacktrace'), + 'value': exception_value, + } - # Move stacktrace here from crashed_thread (mutating!) - 'stacktrace': crashed_thread.pop('stacktrace'), - 'value': exception_value, - } + for frame in exception['stacktrace']['frames']: + frame['in_app'] = True # minidumps don't distinguish in_app frames; assume all are in_app - for frame in exception['stacktrace']['frames']: - frame['in_app'] = True # minidumps don't distinguish in_app frames; assume all are in_app + exception['stacktrace']['frames'].reverse() # "Frames should be sorted from oldest to newest." + # TODO we don't have display-info for threads yet, I think? + # we may need to revert the per-thread stacktraces above as well then - exception['stacktrace']['frames'].reverse() # "Frames should be sorted from oldest to newest." - # TODO we don't have display-info for threads yet, I think? 
- # we may need to revert the per-thread stacktraces above as well then - - data.setdefault('exception', {}) \ - .setdefault('values', []) \ - .append(exception) + data.setdefault('exception', {}) \ + .setdefault('values', []) \ + .append(exception) # Extract referenced (not all loaded) images images = [{ From 9f2a7c67375abb98c0181f17fcc74dbfe7909e66 Mon Sep 17 00:00:00 2001 From: Klaas van Schelven Date: Wed, 12 Nov 2025 22:05:42 +0100 Subject: [PATCH 27/31] de-chatgptize event_threads_for_process_state this code was created in a REPL/ChatGPT/minidump-API/HITL session, until I had something that "seemed to work". the present commit is the result of rereading, refactoring for understanding etc. it's not "pure refacting" in the sense that it's behavior-changing, but AFAICT for the better. e.g. "line 0" => just leave that out and many similar changes. --- files/minidump.py | 54 +++++++++++++++++++++-------------------------- 1 file changed, 24 insertions(+), 30 deletions(-) diff --git a/files/minidump.py b/files/minidump.py index 9739454..a252372 100644 --- a/files/minidump.py +++ b/files/minidump.py @@ -88,13 +88,13 @@ def _find_module_for_address(process_state, abs_addr: int): def event_threads_for_process_state(process_state): threads = [] - for thread_index, thread in enumerate(process_state.threads()): - thread_frames = [] + for thread_index, symbolic_thread in enumerate(process_state.threads()): + frames = [] - for frame in thread.frames(): - module = _find_module_for_address(process_state, frame.instruction) - fn = file = None - line = 0 + for symbolic_frame in symbolic_thread.frames(): + module = _find_module_for_address(process_state, symbolic_frame.instruction) + + frame = {"instruction_addr": f"0x{symbolic_frame.instruction:x}"} if module and module.debug_id: dashed_debug_id = symbolic.debuginfo.id_from_breakpad(module.debug_id) @@ -104,40 +104,34 @@ def event_threads_for_process_state(process_state): dif_bytes = file_metadata.file.data archive = symbolic.debuginfo.Archive.from_bytes(dif_bytes) - objects = list(archive.iter_objects()) - assert len(objects) == 1 - obj = objects[0] + + obj = get_single_object(archive) symcache = obj.make_symcache() - rel = frame.instruction - module.addr - infos = symcache.lookup(rel) or symcache.lookup(rel - 1) # "or -1" from ChatGPT... should we do it? + rel = symbolic_frame.instruction - module.addr + infos = symcache.lookup(rel) if infos: - li = infos[0] - fn = li.function_name - file = li.filename - line = li.line + # tentative understanding: lookup may give multiple results (e.g. inlined code). 
we just pick + # the first arbitrarily which is "good enough for a PoC until proven otherwise" + line_info = infos[0] + + frame["function"] = line_info.function_name + if line_info.filename: + frame["filename"] = line_info.filename + frame["lineno"] = line_info.line - # if we have line info, try source bundle src_meta = FileMetadata.objects.filter(debug_id=dashed_debug_id, file_type="src").first() - if src_meta and file and line: - src_bytes = src_meta.file.data - pre_ctx, ctx_line, post_ctx = extract_source_context(src_bytes, file, line) + if src_meta and line_info.filename and line_info.line: + frame["pre_context"], frame["context_line"], frame["post_context"] = extract_source_context( + src_meta.file.data, line_info.filename, line_info.line) - thread_frames.append({ - "instruction_addr": f"0x{frame.instruction:x}", - "function": fn or "", - "filename": file, - "lineno": line, - "pre_context": pre_ctx, - "context_line": ctx_line, - "post_context": post_ctx, - }) + frames.append(frame) threads.append({ - "id": thread.thread_id, + "id": symbolic_thread.thread_id, "crashed": thread_index == process_state.requesting_thread, - "stacktrace": {"frames": thread_frames}, + "stacktrace": {"frames": frames}, }) return threads From 9f6cd88ec67e73f15c13a55ccf12fb3cdce5f277 Mon Sep 17 00:00:00 2001 From: Klaas van Schelven Date: Thu, 13 Nov 2025 08:33:12 +0100 Subject: [PATCH 28/31] Remove unneeded layer of indirection in query. --- files/views.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/files/views.py b/files/views.py index 31ee4f2..1aeb321 100644 --- a/files/views.py +++ b/files/views.py @@ -212,8 +212,8 @@ def difs_assemble(request, organization_slug, project_slug): file_checksums = set(data.keys()) existing_files = { - f.file.checksum: f - for f in FileMetadata.objects.filter(file__checksum__in=file_checksums) + file.checksum: file + for file in File.objects.filter(checksum__in=file_checksums) } all_requested_chunks = { From 97c1e4c71c86661ff209794349e34ff43c4e4768 Mon Sep 17 00:00:00 2001 From: Klaas van Schelven Date: Thu, 13 Nov 2025 08:33:31 +0100 Subject: [PATCH 29/31] Comment about difs_assemble endpoint --- files/views.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/files/views.py b/files/views.py index 1aeb321..79e5049 100644 --- a/files/views.py +++ b/files/views.py @@ -233,7 +233,9 @@ def difs_assemble(request, organization_slug, project_slug): response[file_checksum] = { "state": ChunkFileState.OK, "missingChunks": [], - # "dif": serialize(existing_files[file_checksum]), # TODO: figure out if this is required. + # if it is ever needed, we could add something akin to the below, but so far we've not seen client-side + # actually using this; let's add it on-demand. 
+ # "dif": json_repr_with_key_info_about(existing_files[file_checksum]), } continue From 661d83bd939ff2c46e91df9328694f570390933a Mon Sep 17 00:00:00 2001 From: Klaas van Schelven Date: Sat, 15 Nov 2025 13:33:49 +0100 Subject: [PATCH 30/31] minidumps: FEATURE flag --- bugsink/app_settings.py | 3 +++ bugsink/settings/development.py | 3 +++ files/views.py | 3 +++ ingest/views.py | 9 ++++++++- 4 files changed, 17 insertions(+), 1 deletion(-) diff --git a/bugsink/app_settings.py b/bugsink/app_settings.py index 1d59347..5d2da85 100644 --- a/bugsink/app_settings.py +++ b/bugsink/app_settings.py @@ -73,6 +73,9 @@ DEFAULTS = { # Security: "MINIMIZE_INFORMATION_EXPOSURE": False, "PHONEHOME": True, + + # Feature flags: + "FEATURE_MINIDUMPS": False, # minidumps are experimental/early-stage and likely a DOS-magnet; disabled by default } diff --git a/bugsink/settings/development.py b/bugsink/settings/development.py index 48e86c4..a221949 100644 --- a/bugsink/settings/development.py +++ b/bugsink/settings/development.py @@ -109,6 +109,9 @@ BUGSINK = { "MAX_EMAILS_PER_MONTH": 10, # for development: a thing to tune if you want to the the quota system "KEEP_ARTIFACT_BUNDLES": True, # in development: useful to preserve sourcemap uploads + + # in development we want optional features enabled to [1] play with them and [2] have the tests work + "FEATURE_MINIDUMPS": True, } diff --git a/files/views.py b/files/views.py index 79e5049..1f32a6f 100644 --- a/files/views.py +++ b/files/views.py @@ -205,6 +205,9 @@ def artifact_bundle_assemble(request, organization_slug): @csrf_exempt # we're in API context here; this could potentially be pulled up to a higher level though @requires_auth_token def difs_assemble(request, organization_slug, project_slug): + if not get_settings().FEATURE_MINIDUMPS: + return JsonResponse({"detail": "minidumps not enabled"}, status=404) + # TODO move to tasks.something.delay # TODO think about the right transaction around this data = json.loads(request.body) diff --git a/ingest/views.py b/ingest/views.py index 3aebb60..f48f8bb 100644 --- a/ingest/views.py +++ b/ingest/views.py @@ -53,6 +53,7 @@ from .models import StoreEnvelope, DontStoreEnvelope, Envelope HTTP_429_TOO_MANY_REQUESTS = 429 HTTP_400_BAD_REQUEST = 400 +HTTP_404_NOT_FOUND = 404 HTTP_501_NOT_IMPLEMENTED = 501 @@ -638,7 +639,10 @@ class IngestEnvelopeAPIView(BaseIngestAPIView): if ((type_ not in ["event", "attachment"]) or (item_headers.get("type") == "attachment" and - item_headers.get("attachment_type") != "event.minidump")): + item_headers.get("attachment_type") != "event.minidump") or + (item_headers.get("type") == "attachment" and + item_headers.get("attachment_type") == "event.minidump" and + not get_settings().FEATURE_MINIDUMPS)): # non-event/minidumps can be discarded; (we don't check for individual size limits, because these differ # per item type, we have the envelope limit to protect us, and we incur almost no cost (NullWriter)) @@ -732,6 +736,9 @@ class MinidumpAPIView(BaseIngestAPIView): # I'm not 100% sure whether "philosophically" the minidump endpoint is also "ingesting"; we'll see. def post(self, request, project_pk=None): + if not get_settings().FEATURE_MINIDUMPS: + return JsonResponse({"detail": "minidumps not enabled"}, status=HTTP_404_NOT_FOUND) + # not reusing the CORS stuff here; minidump-from-browser doesn't make sense. 
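A hedged sketch of a client upload to this endpoint; the `requests` dependency is an assumption, the X-Sentry-Auth header mirrors what the envelope tests send, and `upload_file_minidump` is the form field this view reads:

import requests  # assumed client-side dependency, not part of this codebase

def upload_minidump(base_url, project_id, sentry_auth_header, dump_path):
    with open(dump_path, "rb") as f:
        return requests.post(
            f"{base_url}/api/{project_id}/minidump/",
            headers={"X-Sentry-Auth": sentry_auth_header},
            files={"upload_file_minidump": f},
        )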
# TODO: actually pick/configure max From 7df7bc7f4dfecfc97cf092d9a7774e6e817326fd Mon Sep 17 00:00:00 2001 From: Klaas van Schelven Date: Sat, 15 Nov 2025 13:38:09 +0100 Subject: [PATCH 31/31] Minidump feature flag: configurable in docker --- bugsink/conf_templates/docker.py.template | 3 +++ 1 file changed, 3 insertions(+) diff --git a/bugsink/conf_templates/docker.py.template b/bugsink/conf_templates/docker.py.template index d938b8f..9f32b8e 100644 --- a/bugsink/conf_templates/docker.py.template +++ b/bugsink/conf_templates/docker.py.template @@ -179,6 +179,9 @@ BUGSINK = { os.getenv("MINIMIZE_INFORMATION_EXPOSURE", "false").lower() in ("true", "1", "yes"), "PHONEHOME": os.getenv("PHONEHOME", "true").lower() in ("true", "1", "yes"), + + # Feature flags + "FEATURE_MINIDUMPS": os.getenv("PHONEHOME", "false").lower() in ("true", "1", "yes"), }