Use fastjsonschema for validation

This commit is contained in:
Klaas van Schelven
2024-09-19 12:17:41 +02:00
parent d0eb6bea95
commit febbbc67c7
2 changed files with 39 additions and 15 deletions

View File

@@ -4,6 +4,7 @@ import io
from datetime import datetime, timezone
import json
import jsonschema
import fastjsonschema
from django.conf import settings
from django.shortcuts import get_object_or_404
@@ -122,29 +123,51 @@ class BaseIngestAPIView(View):
@classmethod
def validate_event_data(cls, data, validation_setting):
if not hasattr(cls, "_event_validator"):
# the schema is loaded once and cached on the class (it's in the 1-2ms range, but still seems worth it)
# rough notes on performance (50k event):
# fastjsonschema: compile: ~200ms, validation: ~2ms
# jsonschema: 'compile': ~2ms, validation ~15ms
#
# note that this method raising an exception ("strict") creates additional overhead in the ~100ms range;
# presumably because of the transaction rollback. Possible future direction: check pre-transaction. Not
# optimizing that yet though, because [1] presumably rare and [2] incorrect data might trigger arbitrary
# exceptions, so you'd have that cost-of-rollback anyway.
def get_schema():
schema_filename = settings.BASE_DIR / 'api/event.schema.json'
with open(schema_filename, 'r') as f:
event_schema = json.loads(f.read())
ValidatorClass = jsonschema.validators.validator_for(event_schema)
# left here for documentation purposes, not needed, because this already (implicitly) happens in tests
# ValidatorClass.check_schema(event_schema)
cls._event_validator = ValidatorClass(event_schema)
return json.loads(f.read())
def validate():
# helper function that wraps the idea of "validate quickly, but fail meaningfully"
try:
cls._event_validator(data_to_validate)
except fastjsonschema.exceptions.JsonSchemaValueException as fastjsonschema_e:
# fastjsonchema's exceptions provide almost no information (in the case of many anyOfs), so we just fall
# back to jsonschema for that. Slow (and double cost), but failing is the rare case, so we don't care.
# https://github.com/horejsek/python-fastjsonschema/issues/72 and 37 for some context
try:
jsonschema.validate(data_to_validate, get_schema())
except jsonschema.ValidationError as inner_e:
raise ValidationError(understandable_json_error(inner_e), code="invalid_event_data") from inner_e
# in the (presumably not-happening) case that our fallback validation succeeds, fail w/o useful message
raise ValidationError(fastjsonschema_e.message, code="invalid_event_data") from fastjsonschema_e
# the schema is loaded once and cached on the class (it's ~200ms)
# use_formats=False for "uint64", see https://github.com/horejsek/python-fastjsonschema/issues/108
if not hasattr(cls, "_event_validator"):
cls._event_validator = fastjsonschema.compile(get_schema(), use_formats=False)
# known fields that are not part of the schema (and that we don't want to validate)
data_to_validate = {k: v for k, v in data.items() if k != "_meta"}
if validation_setting == "strict":
try:
cls._event_validator.validate(data_to_validate)
except jsonschema.ValidationError as e:
raise ValidationError(understandable_json_error(e), code="invalid_event_data") from e
validate()
else: # "warn"
else: # i.e. "warn" - we never reach this function for "none"
try:
cls._event_validator.validate(data_to_validate)
except jsonschema.ValidationError as e:
logger.warn("event data validation failed: %s", understandable_json_error(e))
validate()
except ValidationError as e:
logger.warn("event data validation failed: %s", e)
@classmethod
@immediate_atomic()

View File

@@ -13,3 +13,4 @@ whitenoise
requests # for sentry-sdk-extensions, which is loaded in non-dev setup too
monofy
user_agents
fastjsonschema