Envelope parsing: validate headers as per the docs

"Headers" here means both envelope headers and item headers.

Provides more robustness and a layer of defense-in-depth.
Only those headers that we might rely on in the near future (event-based)
are included.

See #173
This commit is contained in:
Klaas van Schelven
2025-07-29 23:17:55 +02:00
parent 354edc81f9
commit 2fede39985
4 changed files with 124 additions and 3 deletions

View File

@@ -58,3 +58,18 @@ def get_header_value(sentry_dsn):
def get_sentry_key(sentry_dsn):
    """Return the username component of the DSN URL (the DSN's public key), or None if absent."""
    return urllib.parse.urlsplit(sentry_dsn).username
def validate_sentry_dsn(sentry_dsn):
    """Check that sentry_dsn looks like ``<scheme>://<public_key>@<host>/<project_id>``.

    Returns True when the DSN is acceptable.

    Raises ValueError when the value is not a string, lacks a scheme, hostname or public key, uses a scheme
    other than http/https, or has no non-empty final path segment (the project id).
    """
    # urlsplit() fails with AttributeError on e.g. ints or dicts; callers only expect ValueError, so reject
    # anything that is not a str up front.
    if not isinstance(sentry_dsn, str):
        raise ValueError("Invalid Sentry DSN format. It must be a string.")

    parts = urllib.parse.urlsplit(sentry_dsn)

    if not parts.scheme or not parts.hostname or not parts.username:
        raise ValueError("Invalid Sentry DSN format. It must contain a scheme, hostname, and public_key.")

    if parts.scheme not in ("http", "https"):
        raise ValueError("Invalid Sentry DSN scheme. It must be 'http' or 'https'.")

    # The project id is whatever follows the last "/" in the path; both the slash and the id must be present.
    _, slash, project_id = parts.path.rpartition("/")
    if not slash or not project_id:
        raise ValueError("Invalid DSN: path must include '/<project_id>'")

    return True

2
ingest/exceptions.py Normal file
View File

@@ -0,0 +1,2 @@
class ParseError(Exception):
    """Error raised for invalid envelope data, e.g. a header that fails validation."""

101
ingest/header_validators.py Normal file
View File

@@ -0,0 +1,101 @@
import re
from datetime import datetime
from compat.dsn import validate_sentry_dsn
from .exceptions import ParseError
# Based on the documentation here:
#
# https://develop.sentry.dev/sdk/data-model/envelopes/
# https://develop.sentry.dev/sdk/data-model/envelope-items/
#
# From the docs, we deduced validation for
#
# * envelope headers -> all of them
# * item headers -> only those that are relevant for "event" items
# Precompiled patterns for header values.
#
# * \Z anchors at the very end of the string; "$" would also match just before a trailing newline.
# * re.ASCII restricts \d to 0-9; without it \d also matches non-ASCII Unicode digits.
_RFC3339_Z = re.compile(r"^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}(?:\.\d+)?Z\Z", re.ASCII)
_UUID32 = re.compile(r"^[0-9a-fA-F]{32}\Z")
_UUID36 = re.compile(r"^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}\Z")
def validate_dsn(v):
    """Validate the "dsn" envelope header.

    Raises ParseError when v is not a string or when validate_sentry_dsn rejects it.
    """
    # Presumably header values come from decoded JSON and can therefore be of any JSON type. Only strings are
    # handed to the URL-based check, so that a non-string cannot surface as an AttributeError/TypeError
    # instead of a ParseError.
    if not isinstance(v, str):
        raise ParseError('Envelope header "dsn" invalid: must be a string')

    try:
        validate_sentry_dsn(v)
    except ValueError as e:
        # Chain the original error so the underlying cause stays visible in tracebacks.
        raise ParseError(f'Envelope header "dsn" invalid: {e}') from e
def validate_sdk(v):
    """Validate the "sdk" envelope header: it must be a dict; its contents are not inspected.

    Raises ParseError for any other type.
    """
    if isinstance(v, dict):
        return
    raise ParseError('Envelope header "sdk" must be an object')
def validate_sent_at(v):
    """Validate the "sent_at" envelope header.

    The value must be a string of the form ``YYYY-MM-DDTHH:MM:SS[.fraction]Z`` that also denotes an existing
    date and time.

    Raises ParseError otherwise.
    """
    message = f'Envelope header "sent_at" must be an RFC3339 UTC timestamp ending in Z: {v}'

    # fullmatch: the whole string must match, so a trailing newline is not silently accepted.
    if not isinstance(v, str) or not _RFC3339_Z.fullmatch(v):
        raise ParseError(message)

    # The pattern only checks the shape; values such as month 13 or hour 25 still match it. The first 19
    # characters are always "YYYY-MM-DDTHH:MM:SS" (guaranteed by the pattern), and the optional fraction is
    # just digits, so range-checking that prefix is sufficient and does not depend on how many fractional
    # digits the running Python's datetime parser supports.
    try:
        datetime.strptime(v[:19], "%Y-%m-%dT%H:%M:%S")
    except ValueError as e:
        # Report out-of-range values as a ParseError rather than leaking a raw ValueError.
        raise ParseError(message) from e
def validate_event_id(v):
    """Validate the "event_id" envelope header.

    Accepts a string of 32 hex digits, or the dashed 8-4-4-4-12 hex form.

    Raises ParseError otherwise.
    """
    # fullmatch (rather than match) so that a value with a trailing newline is rejected even though the
    # patterns end in "$", which on its own also matches just before a final "\n".
    if not isinstance(v, str) or not (_UUID32.fullmatch(v) or _UUID36.fullmatch(v)):
        raise ParseError(f'Envelope header "event_id" must be a valid UUID string: {v}')
# Envelope header name -> validator function, called with that header's value.
# Headers that are not listed here are not validated.
envelope_validators = {
    "dsn": validate_dsn,
    "sdk": validate_sdk,
    "sent_at": validate_sent_at,
    "event_id": validate_event_id,
}
def validate_envelope_headers(headers):
    """Run the registered validator, if any, for every envelope header in headers.

    Headers without an entry in envelope_validators are ignored; whatever a validator raises propagates.
    """
    for name, value in headers.items():
        check = envelope_validators.get(name)
        if check is not None:
            check(value)
# Known item types (see the envelope-items documentation referenced at the top of this module).
ALLOWED_TYPES = {
    "event",
    "transaction",
    "attachment",
    "session",
    "sessions",
    "feedback",
    "user_report",
    "client_report",
    "replay_event",
    "replay_recording",
    "profile",
    "profile_chunk",
    "check_in",
    "log",
    "otel_log",
}
def validate_type(v):
    """Accept any value for the item header "type"; no check is performed.

    Alternatives that were considered but are not implemented:

    1. Log a warning when v is not one of the known item types (presumably ALLOWED_TYPES). Sentry's protocol
       might add new item types in the future; we don't want to raise an error for those.
    2. Raise a ParseError when v is not one of the known item types.
    """
    return None
def _validate_length(v):
    """Validate the item header "length": it must be a non-negative integer.

    Raises ParseError otherwise.
    """
    # bool is a subclass of int, so without the explicit check True/False would be accepted as lengths 1/0.
    if isinstance(v, bool) or not isinstance(v, int) or v < 0:
        raise ParseError(f'Item header "length" must be a non-negative integer, got: {v}')
# Item header name -> validator function, called with that header's value.
item_validators = {
    "type": validate_type,
    "length": _validate_length,
}
def validate_item_headers(headers):
    """Validate the headers of a single envelope item.

    "length" is checked for every item type; the remaining registered validators are applied only to items
    of type "event". Whatever a validator raises (ParseError) propagates to the caller.
    """
    # "length" determines how many bytes the parser reads for the item's payload, whatever the item's type
    # is, so an invalid value must be rejected before it is used, also for item types we otherwise skip.
    if "length" in headers:
        _validate_length(headers["length"])

    if headers.get("type") != "event":
        # Only validate the remaining item headers for events. Reason: it's the only type of item that we
        # actually process; rather than trying to keep the validation in sync with a part of the protocol
        # that we don't use, we skip it.
        return

    for key, val in headers.items():
        # "length" has already been checked above.
        if key != "length" and key in item_validators:
            item_validators[key](val)

View File

@@ -3,9 +3,8 @@ import io
from bugsink.streams import MaxDataWriter
# NOTE(review): the later "from .exceptions import ParseError" rebinds this name, so this local
# definition appears to be superseded — confirm whether it should still be here.
class ParseError(Exception):
    pass
from .exceptions import ParseError
from .header_validators import validate_envelope_headers, validate_item_headers
class NewlineFinder:
@@ -150,6 +149,7 @@ class StreamingEnvelopeParser:
if self.envelope_headers is None:
# see test_eof_after_envelope_headers for why we don't error on EOF-after-header here
self.envelope_headers = self._parse_headers(empty_is_error=True, eof_after_header_is_error=False)
validate_envelope_headers(self.envelope_headers)
return self.envelope_headers
@@ -161,10 +161,13 @@ class StreamingEnvelopeParser:
while not self.at_eof:
item_headers = self._parse_headers(empty_is_error=False, eof_after_header_is_error=True)
if item_headers is None:
self.at_eof = True
break
validate_item_headers(item_headers)
if "length" in item_headers:
length = item_headers["length"]
finder = LengthFinder(length, error_for_eof="EOF while reading item with explicitly specified length")