rework URL normalization

This commit is contained in:
Jakob Pinterits
2025-02-22 21:10:28 +01:00
parent 3ababd0446
commit bbeb3e8e87
8 changed files with 155 additions and 95 deletions

View File

@@ -65,11 +65,7 @@ class AbstractAppServer(abc.ABC):
# This is currently handled as follows: If a base URL is specified, it
# must contain the full URL. If no base URL is specified, the URL
# provided in the request is used instead.
if base_url is None:
self.base_url = None
else:
self.base_url = utils.normalize_url(base_url)
self.base_url = base_url
self._session_serve_tasks = dict[rio.Session, asyncio.Task[object]]()
self._disconnected_sessions = dict[rio.Session, float]()
@@ -248,7 +244,10 @@ class AbstractAppServer(abc.ABC):
## Raises
`NavigationFailed`: If a page guard crashes
`NavigationFailed`: If a page guard crashes.
`NavigationFailed`: If the initial page URL is not a child of the app's
base URL.
"""
# Normalize and deduplicate the languages
preferred_languages: list[str] = []
@@ -324,7 +323,7 @@ class AbstractAppServer(abc.ABC):
# Prepare the initial URL. This will be exposed to the session as the
# `active_page_url`, but overridden later once the page guards have been
# run.
initial_page_url = rio.URL(initial_message.url.lower())
initial_page_url = rio.URL(initial_message.url)
# Determine the base URL. If one was explicitly provided when creating
# the app server, use that. Otherwise derive one from the initial HTTP
@@ -333,7 +332,6 @@ class AbstractAppServer(abc.ABC):
base_url = (
initial_page_url.with_path("").with_query("").with_fragment("")
)
base_url = utils.normalize_url(base_url)
else:
base_url = self.base_url
@@ -421,9 +419,17 @@ class AbstractAppServer(abc.ABC):
active_page_url_absolute,
) = routing.check_page_guards(sess, initial_page_url)
# The initial page URL can be problematic. Logically it must obviously
# be a child of the app's base URL, but that may not be the case if a
# malicious URL was sent or the base misconfigured. Check for this.
if active_page_instances_and_path_arguments is None:
raise rio.NavigationFailed(
f"The initial page URL `{initial_page_url}` is not a child of the app's base URL `{self.base_url}`."
)
# Is this a page, or a full URL to another site?
try:
utils.make_url_relative(
utils.url_relative_to_base(
sess._base_url,
active_page_url_absolute,
)

View File

@@ -11,7 +11,7 @@ import rio
import rio.app_server.fastapi_server
import rio.cli
from ... import nice_traceback, utils
from ... import nice_traceback
from . import run_models
@@ -140,17 +140,13 @@ class UvicornWorker:
# Store the new app
self.app_server = app_server
self.app_server.base_url = self.base_url
self.app_server.debug_mode = self.debug_mode
self.app_server.running_in_window = self.run_in_window
self.app_server.internal_on_app_start = (
lambda: self.on_server_is_ready_or_failed.set_result(None)
)
if self.base_url is None:
self.app_server.base_url = None
else:
self.app_server.base_url = utils.normalize_url(self.base_url)
# There is no need to inject the new app or server anywhere. Since
# uvicorn was fed a shim function instead of the app directly, any
# requests will automatically be redirected to the new server instance.

View File

@@ -588,7 +588,8 @@ def check_page_guards(
tuple[
tuple[ComponentPage, dict[str, object]],
...,
],
]
| None,
rio.URL,
]:
"""
@@ -605,6 +606,10 @@ def check_page_guards(
error. The result will still be valid. That is because navigation is
possible, it's just that some `PageView`s will display a 404 page.
If attempting to navigate to a URL that isn't path of the app, i.e. it's not
a child of the app's configured base URL, `None` is returned instead of
active pages.
This function does not perform any actual navigation. It simply checks
whether navigation to the target page is possible.
@@ -627,11 +632,13 @@ def check_page_guards(
past_redirects = [target_url_absolute]
while True:
# TODO: What if the URL is not a child of the base URL? i.e. redirecting
# to a completely different site
target_url_relative = utils.make_url_relative(
sess._base_url, target_url_absolute
)
# Strip the base. This also detects navigation to outside of the app
try:
target_url_relative = utils.url_relative_to_base(
sess._base_url, target_url_absolute
)
except ValueError:
return None, target_url_absolute
# Find all pages which would by activated by this navigation
active_page_instances_and_path_arguments = tuple(

View File

@@ -1008,35 +1008,12 @@ window.resizeTo(screen.availWidth, screen.availHeight);
replaced with the new page. If `False`, a new history entry is
created, allowing the user to go back to the previous page.
"""
# Normalize the target URL
target_url = utils.normalize_url(target_url)
if isinstance(target_url, str):
target_url = rio.URL(target_url)
# Determine the full page to navigate to
target_url_absolute = self.active_page_url.join(target_url)
# Is this a page, or a full URL to another site?
try:
utils.make_url_relative(
self._base_url,
target_url_absolute,
)
# This is an external URL. Let the browser handle the navigation.
except ValueError:
logging.debug(
f"Navigating to external site `{target_url_absolute}`"
)
async def history_worker() -> None:
await self._evaluate_javascript(
f"""
window.location.href = {json.dumps(str(target_url))};
""",
)
self.create_task(history_worker(), name="history worker")
return
# Is any guard opposed to this page?
active_page_instances_and_path_arguments, active_page_url = (
routing.check_page_guards(self, target_url_absolute)
@@ -1044,6 +1021,25 @@ window.location.href = {json.dumps(str(target_url))};
del target_url, target_url_absolute
# Is this one of the app's pages, or a full URL to another site? If
# navigating to another site, let the browser handle the navigation.
#
# TODO: What if running in a window? Navigating to another site doesn't
# make any sense in that case, since the window isn't supposed to be a
# browser.
if active_page_instances_and_path_arguments is None:
logging.debug(f"Navigating to external site `{active_page_url}`")
async def history_worker() -> None:
await self._evaluate_javascript(
f"""
window.location.href = {json.dumps(str(active_page_url))};
""",
)
self.create_task(history_worker(), name="history worker")
return
# Update the current page
logging.debug(f"Navigating to page `{active_page_url}`")
self._active_page_url = active_page_url
@@ -3505,34 +3501,38 @@ a.remove();
@unicall.local(name="openUrl")
async def _open_url(self, url: str) -> None:
# Case: Running as website
#
# If running in a browser, JS can take care of changing the URL or
# opening a new tab. The only reason why the frontend would tell us
# to open the url is because it's a local url.
#
# (Note: Of course an attacker could send us an external url to
# open, but `navigate_to` has to handle that gracefully anyway.)
if self.running_as_website:
# If running in a browser, JS can take care of changing the URL or
# opening a new tab. The only reason why the frontend would tell us
# to open the url is because it's a local url.
#
# (Note: Of course an attacker could send us an external url to
# open, but `navigate_to` has to handle that gracefully anyway.)
self.navigate_to(url)
return
# Normalize the URL to make for easier comparisons
yarl_url = utils.normalize_url(url)
# Case: Running in a window.
yarl_url = rio.URL(url)
del url
# If running_in_window, local urls are *always* navigated to, even if
# they're meant to be opened in a new tab. The `run_in_window` code
# isn't designed to handle multiple sessions, so we can't open a new
# tab or a 2nd window.
is_local_url = yarl_url.path.startswith(self._base_url.path)
if is_local_url:
self.navigate_to(yarl_url)
return
try:
utils.url_relative_to_base(yarl_url, self._base_url)
# And if it's an external url, it must be opened in a web browser.
import webbrowser
except ValueError:
import webbrowser
webbrowser.open(str(yarl_url))
webbrowser.open(str(yarl_url))
# Local URL
else:
self.navigate_to(yarl_url)
@unicall.local(name="ping")
async def _ping(self, ping: str) -> str:
@@ -3544,6 +3544,10 @@ a.remove();
Called by the client when the page changes.
"""
# Try to navigate to the new page
import revel
revel.debug(f"Called navigate to {new_url}")
self.navigate_to(
new_url,
replace=True,

View File

@@ -1,4 +1,6 @@
# <additional-imports>
# </additional-imports>
from dataclasses import field
from pathlib import Path
import numpy as np
@@ -10,8 +12,6 @@ import rio
from .. import components as comps
from .. import data_models
# </additional-imports>
# <component>
@rio.page(

View File

@@ -3,12 +3,13 @@ from __future__ import annotations
# <additional-imports>
import datetime
# </additional-imports>
from dataclasses import field
import rio
from ..data_models import TodoItem
# </additional-imports>
# <component>
class NewTodoItemInput(rio.Component):

View File

@@ -356,21 +356,28 @@ class FileInfo:
raise ValueError("Invalid type. Expected 'r' or 'rb'.")
def make_url_relative(base: URL, other: URL) -> URL:
def url_relative_to_base(base: URL, other: URL) -> URL:
"""
Returns `other` as a relative URL to `base`. Raises a `ValueError` if
`other` is not a child of `base`.
`other` is not a child of `base`. This will never generate URLs containing
`..`. If those would be needed a `ValueError` is raised instead.
This will never generate URLs containing `..`. If those would be needed a
`ValueError` is raised instead.
## Casing
The scheme and host are case insensitive, as by the URL standard. The path
is case sensitive, but a warning will be logged should that be the only
reason two URLs are not considered equal.
"""
# Verify the URLs have the same scheme and host
if base.scheme != other.scheme:
if base.scheme.lower() != other.scheme.lower():
raise ValueError(
f'URLs have different schemes: "{base.scheme}" and "{other.scheme}"'
)
if base.host != other.host:
base_host = None if base.host is None else base.host.lower()
other_host = None if other.host is None else other.host.lower()
if base_host != other_host:
raise ValueError(
f'URLs have different hosts: "{base.host}" and "{other.host}"'
)
@@ -387,8 +394,16 @@ def make_url_relative(base: URL, other: URL) -> URL:
other_parts = other_parts[:-1]
# See if the base is a parent of the other URL
if base_parts != other_parts[: len(base_parts)]:
raise ValueError(f'"{other}" is not a child of "{base}"')
for base_part, other_part in zip(base_parts, other_parts):
if base_part != other_part:
# Log a warning if the only difference is casing
if base_part.lower() == other_part.lower():
message = f"`{base}` is not considered a base URL of `{other}`, because the URL paths are cased differently. This is considered a different, as by the URL specification. If you were trying to navigate to your own site and it didn't pick this up check your spelling."
raise ValueError(message)
raise ValueError(
f"`{base}` is not a base URL of `{other}`. The paths differ at `{base_part}` and `{other_part}`"
)
# Remove the common parts from the URL
other_parts = other_parts[len(base_parts) :]
@@ -566,30 +581,33 @@ def safe_build(
return placeholder_component
def normalize_url(url: str | rio.URL) -> rio.URL:
"""
Returns a normalized version of the given URL.
# def normalize_url(url: str | rio.URL) -> rio.URL:
# """
# Returns a normalized version of the given URL.
This returns a new URL instance which is identical to the given URL, but
with the following guarantees:
# This returns a new URL instance which is identical to the given URL, but
# with the following guarantees:
- The origin is lowercase
- The URL has no trailing slashes
"""
url = rio.URL(url)
# - The protocol, host and path are lowercase
# - The URL has no trailing slashes
if url.scheme:
url = url.with_scheme(url.scheme.lower())
# Query parameters and fragments are considered case-sensitive, and are not
# modified.
# """
# url = rio.URL(url)
if url.host is not None:
url = url.with_host(url.host.lower())
# if url.scheme:
# url = url.with_scheme(url.scheme.lower())
if url.path != "/":
# Doing `url.with_path(url.path)` erases the query parameters.
# Unbelievable.
url = url.with_path(url.path.rstrip("/")).with_query(url.query)
# if url.host is not None:
# url = url.with_host(url.host.lower())
return url
# if url.path != "/":
# # Doing `url.with_path(url.path)` erases the query parameters.
# # Unbelievable.
# url = url.with_path(url.path.rstrip("/").lower()).with_query(url.query)
# return url
def is_python_script(path: Path) -> bool:

View File

@@ -131,18 +131,18 @@ def test_redirects(
# Create a fake session. It contains everything used by the routing system.
app = rio.App(pages=PAGES)
app_server = rio.app_server.TestingServer(app)
fake_session = app_server.create_dummy_session()
session = app_server.create_dummy_session()
# Determine the final URL
active_pages_and_path_arguments, absolute_url_after_redirects_is = (
rio.routing.check_page_guards(
fake_session,
fake_session._base_url.join(rio.URL(relative_url_before_redirects)),
session,
session._base_url.join(rio.URL(relative_url_before_redirects)),
)
)
# Verify the final URL
absolute_url_after_redirects_should = fake_session._base_url.join(
absolute_url_after_redirects_should = session._base_url.join(
rio.URL(relative_url_after_redirects_should)
)
@@ -179,6 +179,34 @@ def test_url_parameter_parsing_failure() -> None:
session._base_url.join(rio.URL("/3.5")),
)
assert isinstance(active_pages_and_path_arguments, list)
assert len(active_pages_and_path_arguments) == 1
assert active_pages_and_path_arguments[0][0] == float_page
assert active_pages_and_path_arguments[0][1] == {"path_param": 3.5}
def test_redirect_offsite() -> None:
"""
Redirect to a site other than this app.
"""
page = rio.ComponentPage(
name="Home",
url_segment="",
build=FakeComponent,
)
app = rio.App(pages=(page,))
app_server = rio.app_server.TestingServer(app)
session = app_server.create_dummy_session()
external_url = rio.URL("http://example.com")
active_pages_and_path_arguments, absolute_url_after_redirects = (
rio.routing.check_page_guards(
session,
external_url,
)
)
assert active_pages_and_path_arguments is None
assert absolute_url_after_redirects == external_url