rework URL normalization

2026-02-09 07:09:00 -06:00 · 2025-02-22 21:10:28 +01:00
parent 3ababd0446
commit bbeb3e8e87
8 changed files with 155 additions and 95 deletions
--- a/rio/app_server/abstract_app_server.py
+++ b/rio/app_server/abstract_app_server.py
@@ -65,11 +65,7 @@ class AbstractAppServer(abc.ABC):
        # This is currently handled as follows: If a base URL is specified, it
        # must contain the full URL. If no base URL is specified, the URL
        # provided in the request is used instead.
-        if base_url is None:
-            self.base_url = None
-        else:
-            self.base_url = utils.normalize_url(base_url)
-
+        self.base_url = base_url
        self._session_serve_tasks = dict[rio.Session, asyncio.Task[object]]()
        self._disconnected_sessions = dict[rio.Session, float]()

@@ -248,7 +244,10 @@ class AbstractAppServer(abc.ABC):

        ## Raises

-        `NavigationFailed`: If a page guard crashes
+        `NavigationFailed`: If a page guard crashes.
+
+        `NavigationFailed`: If the initial page URL is not a child of the app's
+            base URL.
        """
        # Normalize and deduplicate the languages
        preferred_languages: list[str] = []
@@ -324,7 +323,7 @@ class AbstractAppServer(abc.ABC):
        # Prepare the initial URL. This will be exposed to the session as the
        # `active_page_url`, but overridden later once the page guards have been
        # run.
-        initial_page_url = rio.URL(initial_message.url.lower())
+        initial_page_url = rio.URL(initial_message.url)

        # Determine the base URL. If one was explicitly provided when creating
        # the app server, use that. Otherwise derive one from the initial HTTP
@@ -333,7 +332,6 @@ class AbstractAppServer(abc.ABC):
            base_url = (
                initial_page_url.with_path("").with_query("").with_fragment("")
            )
-            base_url = utils.normalize_url(base_url)
        else:
            base_url = self.base_url

@@ -421,9 +419,17 @@ class AbstractAppServer(abc.ABC):
            active_page_url_absolute,
        ) = routing.check_page_guards(sess, initial_page_url)

+        # The initial page URL can be problematic. Logically it must obviously
+        # be a child of the app's base URL, but that may not be the case if a
+        # malicious URL was sent or the base misconfigured. Check for this.
+        if active_page_instances_and_path_arguments is None:
+            raise rio.NavigationFailed(
+                f"The initial page URL `{initial_page_url}` is not a child of the app's base URL `{self.base_url}`."
+            )
+
        # Is this a page, or a full URL to another site?
        try:
-            utils.make_url_relative(
+            utils.url_relative_to_base(
                sess._base_url,
                active_page_url_absolute,
            )
--- a/rio/cli/run_project/uvicorn_worker.py
+++ b/rio/cli/run_project/uvicorn_worker.py
@@ -11,7 +11,7 @@ import rio
 import rio.app_server.fastapi_server
 import rio.cli

-from ... import nice_traceback, utils
+from ... import nice_traceback
 from . import run_models


@@ -140,17 +140,13 @@ class UvicornWorker:
        # Store the new app
        self.app_server = app_server

+        self.app_server.base_url = self.base_url
        self.app_server.debug_mode = self.debug_mode
        self.app_server.running_in_window = self.run_in_window
        self.app_server.internal_on_app_start = (
            lambda: self.on_server_is_ready_or_failed.set_result(None)
        )

-        if self.base_url is None:
-            self.app_server.base_url = None
-        else:
-            self.app_server.base_url = utils.normalize_url(self.base_url)
-
        # There is no need to inject the new app or server anywhere. Since
        # uvicorn was fed a shim function instead of the app directly, any
        # requests will automatically be redirected to the new server instance.
--- a/rio/routing.py
+++ b/rio/routing.py
@@ -588,7 +588,8 @@ def check_page_guards(
    tuple[
        tuple[ComponentPage, dict[str, object]],
        ...,
-    ],
+    ]
+    | None,
    rio.URL,
 ]:
    """
@@ -605,6 +606,10 @@ def check_page_guards(
    error. The result will still be valid. That is because navigation is
    possible, it's just that some `PageView`s will display a 404 page.

+    If attempting to navigate to a URL that isn't path of the app, i.e. it's not
+    a child of the app's configured base URL, `None` is returned instead of
+    active pages.
+
    This function does not perform any actual navigation. It simply checks
    whether navigation to the target page is possible.

@@ -627,11 +632,13 @@ def check_page_guards(
    past_redirects = [target_url_absolute]

    while True:
-        # TODO: What if the URL is not a child of the base URL? i.e. redirecting
-        #   to a completely different site
-        target_url_relative = utils.make_url_relative(
-            sess._base_url, target_url_absolute
-        )
+        # Strip the base. This also detects navigation to outside of the app
+        try:
+            target_url_relative = utils.url_relative_to_base(
+                sess._base_url, target_url_absolute
+            )
+        except ValueError:
+            return None, target_url_absolute

        # Find all pages which would by activated by this navigation
        active_page_instances_and_path_arguments = tuple(
--- a/rio/session.py
+++ b/rio/session.py
@@ -1008,35 +1008,12 @@ window.resizeTo(screen.availWidth, screen.availHeight);
            replaced with the new page. If `False`, a new history entry is
            created, allowing the user to go back to the previous page.
        """
-        # Normalize the target URL
-        target_url = utils.normalize_url(target_url)
+        if isinstance(target_url, str):
+            target_url = rio.URL(target_url)

        # Determine the full page to navigate to
        target_url_absolute = self.active_page_url.join(target_url)

-        # Is this a page, or a full URL to another site?
-        try:
-            utils.make_url_relative(
-                self._base_url,
-                target_url_absolute,
-            )
-
-        # This is an external URL. Let the browser handle the navigation.
-        except ValueError:
-            logging.debug(
-                f"Navigating to external site `{target_url_absolute}`"
-            )
-
-            async def history_worker() -> None:
-                await self._evaluate_javascript(
-                    f"""
-window.location.href = {json.dumps(str(target_url))};
-""",
-                )
-
-            self.create_task(history_worker(), name="history worker")
-            return
-
        # Is any guard opposed to this page?
        active_page_instances_and_path_arguments, active_page_url = (
            routing.check_page_guards(self, target_url_absolute)
@@ -1044,6 +1021,25 @@ window.location.href = {json.dumps(str(target_url))};

        del target_url, target_url_absolute

+        # Is this one of the app's pages, or a full URL to another site? If
+        # navigating to another site, let the browser handle the navigation.
+        #
+        # TODO: What if running in a window? Navigating to another site doesn't
+        # make any sense in that case, since the window isn't supposed to be a
+        # browser.
+        if active_page_instances_and_path_arguments is None:
+            logging.debug(f"Navigating to external site `{active_page_url}`")
+
+            async def history_worker() -> None:
+                await self._evaluate_javascript(
+                    f"""
+window.location.href = {json.dumps(str(active_page_url))};
+""",
+                )
+
+            self.create_task(history_worker(), name="history worker")
+            return
+
        # Update the current page
        logging.debug(f"Navigating to page `{active_page_url}`")
        self._active_page_url = active_page_url
@@ -3505,34 +3501,38 @@ a.remove();

    @unicall.local(name="openUrl")
    async def _open_url(self, url: str) -> None:
+        # Case: Running as website
+        #
+        # If running in a browser, JS can take care of changing the URL or
+        # opening a new tab. The only reason why the frontend would tell us
+        # to open the url is because it's a local url.
+        #
+        # (Note: Of course an attacker could send us an external url to
+        # open, but `navigate_to` has to handle that gracefully anyway.)
        if self.running_as_website:
-            # If running in a browser, JS can take care of changing the URL or
-            # opening a new tab. The only reason why the frontend would tell us
-            # to open the url is because it's a local url.
-            #
-            # (Note: Of course an attacker could send us an external url to
-            # open, but `navigate_to` has to handle that gracefully anyway.)
            self.navigate_to(url)
            return

-        # Normalize the URL to make for easier comparisons
-        yarl_url = utils.normalize_url(url)
+        # Case: Running in a window.
+        yarl_url = rio.URL(url)
        del url

        # If running_in_window, local urls are *always* navigated to, even if
        # they're meant to be opened in a new tab. The `run_in_window` code
        # isn't designed to handle multiple sessions, so we can't open a new
        # tab or a 2nd window.
-        is_local_url = yarl_url.path.startswith(self._base_url.path)
-
-        if is_local_url:
-            self.navigate_to(yarl_url)
-            return
+        try:
+            utils.url_relative_to_base(yarl_url, self._base_url)

        # And if it's an external url, it must be opened in a web browser.
-        import webbrowser
+        except ValueError:
+            import webbrowser

-        webbrowser.open(str(yarl_url))
+            webbrowser.open(str(yarl_url))
+
+        # Local URL
+        else:
+            self.navigate_to(yarl_url)

    @unicall.local(name="ping")
    async def _ping(self, ping: str) -> str:
@@ -3544,6 +3544,10 @@ a.remove();
        Called by the client when the page changes.
        """
        # Try to navigate to the new page
+        import revel
+
+        revel.debug(f"Called navigate to {new_url}")
+
        self.navigate_to(
            new_url,
            replace=True,
--- a/rio/snippets/snippet-files/project-template-Crypto
+++ b/rio/snippets/snippet-files/project-template-Crypto
@@ -1,4 +1,6 @@
 # <additional-imports>
+# </additional-imports>
+from dataclasses import field
 from pathlib import Path

 import numpy as np
@@ -10,8 +12,6 @@ import rio
 from .. import components as comps
 from .. import data_models

-# </additional-imports>
-

 # <component>
@rio.page(
--- a/rio/snippets/snippet-files/project-template-Todo
+++ b/rio/snippets/snippet-files/project-template-Todo
@@ -3,12 +3,13 @@ from __future__ import annotations
 # <additional-imports>
 import datetime

+# </additional-imports>
+from dataclasses import field
+
 import rio

 from ..data_models import TodoItem

-# </additional-imports>
-

 # <component>
 class NewTodoItemInput(rio.Component):
--- a/rio/utils.py
+++ b/rio/utils.py
@@ -356,21 +356,28 @@ class FileInfo:
        raise ValueError("Invalid type. Expected 'r' or 'rb'.")


-def make_url_relative(base: URL, other: URL) -> URL:
+def url_relative_to_base(base: URL, other: URL) -> URL:
    """
    Returns `other` as a relative URL to `base`. Raises a `ValueError` if
-    `other` is not a child of `base`.
+    `other` is not a child of `base`. This will never generate URLs containing
+    `..`. If those would be needed a `ValueError` is raised instead.

-    This will never generate URLs containing `..`. If those would be needed a
-    `ValueError` is raised instead.
+    ## Casing
+
+    The scheme and host are case insensitive, as by the URL standard. The path
+    is case sensitive, but a warning will be logged should that be the only
+    reason two URLs are not considered equal.
    """
    # Verify the URLs have the same scheme and host
-    if base.scheme != other.scheme:
+    if base.scheme.lower() != other.scheme.lower():
        raise ValueError(
            f'URLs have different schemes: "{base.scheme}" and "{other.scheme}"'
        )

-    if base.host != other.host:
+    base_host = None if base.host is None else base.host.lower()
+    other_host = None if other.host is None else other.host.lower()
+
+    if base_host != other_host:
        raise ValueError(
            f'URLs have different hosts: "{base.host}" and "{other.host}"'
        )
@@ -387,8 +394,16 @@ def make_url_relative(base: URL, other: URL) -> URL:
        other_parts = other_parts[:-1]

    # See if the base is a parent of the other URL
-    if base_parts != other_parts[: len(base_parts)]:
-        raise ValueError(f'"{other}" is not a child of "{base}"')
+    for base_part, other_part in zip(base_parts, other_parts):
+        if base_part != other_part:
+            # Log a warning if the only difference is casing
+            if base_part.lower() == other_part.lower():
+                message = f"`{base}` is not considered a base URL of `{other}`, because the URL paths are cased differently. This is considered a different, as by the URL specification. If you were trying to navigate to your own site and it didn't pick this up check your spelling."
+                raise ValueError(message)
+
+            raise ValueError(
+                f"`{base}` is not a base URL of `{other}`. The paths differ at `{base_part}` and `{other_part}`"
+            )

    # Remove the common parts from the URL
    other_parts = other_parts[len(base_parts) :]
@@ -566,30 +581,33 @@ def safe_build(
    return placeholder_component


-def normalize_url(url: str | rio.URL) -> rio.URL:
-    """
-    Returns a normalized version of the given URL.
+# def normalize_url(url: str | rio.URL) -> rio.URL:
+#     """
+#     Returns a normalized version of the given URL.

-    This returns a new URL instance which is identical to the given URL, but
-    with the following guarantees:
+#     This returns a new URL instance which is identical to the given URL, but
+#     with the following guarantees:

-    - The origin is lowercase
-    - The URL has no trailing slashes
-    """
-    url = rio.URL(url)
+#     - The protocol, host and path are lowercase
+#     - The URL has no trailing slashes

-    if url.scheme:
-        url = url.with_scheme(url.scheme.lower())
+#     Query parameters and fragments are considered case-sensitive, and are not
+#     modified.
+#     """
+#     url = rio.URL(url)

-    if url.host is not None:
-        url = url.with_host(url.host.lower())
+#     if url.scheme:
+#         url = url.with_scheme(url.scheme.lower())

-    if url.path != "/":
-        # Doing `url.with_path(url.path)` erases the query parameters.
-        # Unbelievable.
-        url = url.with_path(url.path.rstrip("/")).with_query(url.query)
+#     if url.host is not None:
+#         url = url.with_host(url.host.lower())

-    return url
+#     if url.path != "/":
+#         # Doing `url.with_path(url.path)` erases the query parameters.
+#         # Unbelievable.
+#         url = url.with_path(url.path.rstrip("/").lower()).with_query(url.query)
+
+#     return url


 def is_python_script(path: Path) -> bool:
--- a/tests/test_routing.py
+++ b/tests/test_routing.py
@@ -131,18 +131,18 @@ def test_redirects(
    # Create a fake session. It contains everything used by the routing system.
    app = rio.App(pages=PAGES)
    app_server = rio.app_server.TestingServer(app)
-    fake_session = app_server.create_dummy_session()
+    session = app_server.create_dummy_session()

    # Determine the final URL
    active_pages_and_path_arguments, absolute_url_after_redirects_is = (
        rio.routing.check_page_guards(
-            fake_session,
-            fake_session._base_url.join(rio.URL(relative_url_before_redirects)),
+            session,
+            session._base_url.join(rio.URL(relative_url_before_redirects)),
        )
    )

    # Verify the final URL
-    absolute_url_after_redirects_should = fake_session._base_url.join(
+    absolute_url_after_redirects_should = session._base_url.join(
        rio.URL(relative_url_after_redirects_should)
    )

@@ -179,6 +179,34 @@ def test_url_parameter_parsing_failure() -> None:
        session._base_url.join(rio.URL("/3.5")),
    )

+    assert isinstance(active_pages_and_path_arguments, list)
    assert len(active_pages_and_path_arguments) == 1
    assert active_pages_and_path_arguments[0][0] == float_page
    assert active_pages_and_path_arguments[0][1] == {"path_param": 3.5}
+
+
+def test_redirect_offsite() -> None:
+    """
+    Redirect to a site other than this app.
+    """
+    page = rio.ComponentPage(
+        name="Home",
+        url_segment="",
+        build=FakeComponent,
+    )
+
+    app = rio.App(pages=(page,))
+    app_server = rio.app_server.TestingServer(app)
+    session = app_server.create_dummy_session()
+
+    external_url = rio.URL("http://example.com")
+
+    active_pages_and_path_arguments, absolute_url_after_redirects = (
+        rio.routing.check_page_guards(
+            session,
+            external_url,
+        )
+    )
+
+    assert active_pages_and_path_arguments is None
+    assert absolute_url_after_redirects == external_url