diff --git a/compat/dsn.py b/compat/dsn.py index 561ebaa..bc7a297 100644 --- a/compat/dsn.py +++ b/compat/dsn.py @@ -58,3 +58,18 @@ def get_header_value(sentry_dsn): def get_sentry_key(sentry_dsn): parts = urllib.parse.urlsplit(sentry_dsn) return parts.username + + +def validate_sentry_dsn(sentry_dsn): + parts = urllib.parse.urlsplit(sentry_dsn) + + if not parts.scheme or not parts.hostname or not parts.username: + raise ValueError("Invalid Sentry DSN format. It must contain a scheme, hostname, and public_key.") + + if parts.scheme not in ("http", "https"): + raise ValueError("Invalid Sentry DSN scheme. It must be 'http' or 'https'.") + + if (not parts.path) or ("/" not in parts.path) or (not parts.path.rsplit("/", 1)[1]): + raise ValueError("Invalid DSN: path must include '/'") + + return True diff --git a/ingest/exceptions.py b/ingest/exceptions.py new file mode 100644 index 0000000..831cca9 --- /dev/null +++ b/ingest/exceptions.py @@ -0,0 +1,2 @@ +class ParseError(Exception): + pass diff --git a/ingest/header_validators.py b/ingest/header_validators.py new file mode 100644 index 0000000..c82caaa --- /dev/null +++ b/ingest/header_validators.py @@ -0,0 +1,101 @@ +import re +from datetime import datetime + +from compat.dsn import validate_sentry_dsn +from .exceptions import ParseError + + +# Based on the documentation here: +# +# https://develop.sentry.dev/sdk/data-model/envelopes/ +# https://develop.sentry.dev/sdk/data-model/envelope-items/ +# +# From the docs, we deduced validation for +# +# * envelope headers -> all of them +# * item headers -> only those that are relevant for "event" items + + +_RFC3339_Z = re.compile(r"^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}(?:\.\d+)?Z$") +_UUID32 = re.compile(r"^[0-9a-fA-F]{32}$") +_UUID36 = re.compile(r"^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}$") + + +def validate_dsn(v): + try: + validate_sentry_dsn(v) + except ValueError as e: + raise ParseError(f'Envelope header "dsn" invalid: {e}') + + 
+def validate_sdk(v): + if not isinstance(v, dict): + raise ParseError('Envelope header "sdk" must be an object') + + +def validate_sent_at(v): + if not isinstance(v, str) or not _RFC3339_Z.match(v): + raise ParseError(f'Envelope header "sent_at" must be an RFC3339 UTC timestamp ending in Z: {v}') + + try: + datetime.strptime(v, "%Y-%m-%dT%H:%M:%SZ") + except ValueError: + datetime.fromisoformat(v.replace("Z", "+00:00")) + + +def validate_event_id(v): + if not isinstance(v, str) or not (_UUID32.match(v) or _UUID36.match(v)): + raise ParseError(f'Envelope header "event_id" must be a valid UUID string: {v}') + + +envelope_validators = { + "dsn": validate_dsn, + "sdk": validate_sdk, + "sent_at": validate_sent_at, + "event_id": validate_event_id, +} + + +def validate_envelope_headers(headers): + for key, val in headers.items(): + if key in envelope_validators: + envelope_validators[key](val) + + +ALLOWED_TYPES = { + "event", "transaction", "attachment", "session", "sessions", "feedback", "user_report", "client_report", + "replay_event", "replay_recording", "profile", "profile_chunk", "check_in", "log", "otel_log" +} + + +def validate_type(v): + return + # alternatively (1): + # if v not in ALLOWED_TYPES: + # Sentry's protocol might add new item types in the future; we don't want to raise an error for those. + # logger.warning(f'Item header "type" is not recognized: {v}.') + # + # alternatively (2): + # raise ParseError(f'Item header "type" must be one of {ALLOWED_TYPES}, got: {v}') + + +def _validate_length(v): + if not isinstance(v, int) or v < 0: + raise ParseError(f'Item header "length" must be a non-negative integer, got: {v}') + + +item_validators = { + "type": validate_type, + "length": _validate_length, +} + + +def validate_item_headers(headers): + if headers.get("type") != "event": + # Only validate item headers for events. 
Reason: it's the only type of item that we actually process; rather + # than trying to keep the validation in sync for a part of the protocol that we don't use, we skip it. + return + + for key, val in headers.items(): + if key in item_validators: + item_validators[key](val) diff --git a/ingest/parsers.py b/ingest/parsers.py index 3da7b29..f60f8bc 100644 --- a/ingest/parsers.py +++ b/ingest/parsers.py @@ -3,9 +3,8 @@ import io from bugsink.streams import MaxDataWriter - -class ParseError(Exception): - pass +from .exceptions import ParseError +from .header_validators import validate_envelope_headers, validate_item_headers class NewlineFinder: @@ -150,6 +149,7 @@ class StreamingEnvelopeParser: if self.envelope_headers is None: # see test_eof_after_envelope_headers for why we don't error on EOF-after-header here self.envelope_headers = self._parse_headers(empty_is_error=True, eof_after_header_is_error=False) + validate_envelope_headers(self.envelope_headers) return self.envelope_headers @@ -161,10 +161,13 @@ class StreamingEnvelopeParser: while not self.at_eof: item_headers = self._parse_headers(empty_is_error=False, eof_after_header_is_error=True) + if item_headers is None: self.at_eof = True break + + validate_item_headers(item_headers) + if "length" in item_headers: length = item_headers["length"] finder = LengthFinder(length, error_for_eof="EOF while reading item with explicitly specified length")