From bbeb3e8e87bca3857cf28293a0db36009547cc7e Mon Sep 17 00:00:00 2001
From: Jakob Pinterits <jakob.pinterits@gmail.com>
Date: Sat, 22 Feb 2025 21:10:28 +0100
Subject: [PATCH] rework URL normalization

---
 rio/app_server/abstract_app_server.py         | 24 ++++--
 rio/cli/run_project/uvicorn_worker.py         |  8 +-
 rio/routing.py                                | 19 +++--
 rio/session.py                                | 84 ++++++++++---------
 .../pages/dashboard_page.py                   |  4 +-
 .../components/new_todo_item_input.py         |  5 +-
 rio/utils.py                                  | 70 ++++++++++------
 tests/test_routing.py                         | 36 +++++++-
 8 files changed, 155 insertions(+), 95 deletions(-)

diff --git a/rio/app_server/abstract_app_server.py b/rio/app_server/abstract_app_server.py
index 04464095..27822cb1 100644
--- a/rio/app_server/abstract_app_server.py
+++ b/rio/app_server/abstract_app_server.py
@@ -65,11 +65,7 @@ class AbstractAppServer(abc.ABC):
         # This is currently handled as follows: If a base URL is specified, it
         # must contain the full URL. If no base URL is specified, the URL
         # provided in the request is used instead.
-        if base_url is None:
-            self.base_url = None
-        else:
-            self.base_url = utils.normalize_url(base_url)
-
+        self.base_url = base_url
         self._session_serve_tasks = dict[rio.Session, asyncio.Task[object]]()
         self._disconnected_sessions = dict[rio.Session, float]()
 
@@ -248,7 +244,10 @@ class AbstractAppServer(abc.ABC):
 
         ## Raises
 
-        `NavigationFailed`: If a page guard crashes
+        `NavigationFailed`: If a page guard crashes.
+
+        `NavigationFailed`: If the initial page URL is not a child of the app's
+            base URL.
         """
         # Normalize and deduplicate the languages
         preferred_languages: list[str] = []
@@ -324,7 +323,7 @@ class AbstractAppServer(abc.ABC):
         # Prepare the initial URL. This will be exposed to the session as the
         # `active_page_url`, but overridden later once the page guards have been
         # run.
-        initial_page_url = rio.URL(initial_message.url.lower())
+        initial_page_url = rio.URL(initial_message.url)
 
         # Determine the base URL. If one was explicitly provided when creating
         # the app server, use that. Otherwise derive one from the initial HTTP
@@ -333,7 +332,6 @@ class AbstractAppServer(abc.ABC):
             base_url = (
                 initial_page_url.with_path("").with_query("").with_fragment("")
             )
-            base_url = utils.normalize_url(base_url)
         else:
             base_url = self.base_url
 
@@ -421,9 +419,17 @@ class AbstractAppServer(abc.ABC):
             active_page_url_absolute,
         ) = routing.check_page_guards(sess, initial_page_url)
 
+        # The initial page URL can be problematic. Logically it must obviously
+        # be a child of the app's base URL, but that may not be the case if a
+        # malicious URL was sent or the base misconfigured. Check for this.
+        if active_page_instances_and_path_arguments is None:
+            raise rio.NavigationFailed(
+                f"The initial page URL `{initial_page_url}` is not a child of the app's base URL `{self.base_url}`."
+            )
+
         # Is this a page, or a full URL to another site?
         try:
-            utils.make_url_relative(
+            utils.url_relative_to_base(
                 sess._base_url,
                 active_page_url_absolute,
             )
diff --git a/rio/cli/run_project/uvicorn_worker.py b/rio/cli/run_project/uvicorn_worker.py
index 3302b166..7bf9edd0 100644
--- a/rio/cli/run_project/uvicorn_worker.py
+++ b/rio/cli/run_project/uvicorn_worker.py
@@ -11,7 +11,7 @@ import rio
 import rio.app_server.fastapi_server
 import rio.cli
 
-from ... import nice_traceback, utils
+from ... import nice_traceback
 from . import run_models
 
 
@@ -140,17 +140,13 @@ class UvicornWorker:
         # Store the new app
         self.app_server = app_server
 
+        self.app_server.base_url = self.base_url
         self.app_server.debug_mode = self.debug_mode
         self.app_server.running_in_window = self.run_in_window
         self.app_server.internal_on_app_start = (
             lambda: self.on_server_is_ready_or_failed.set_result(None)
         )
 
-        if self.base_url is None:
-            self.app_server.base_url = None
-        else:
-            self.app_server.base_url = utils.normalize_url(self.base_url)
-
         # There is no need to inject the new app or server anywhere. Since
         # uvicorn was fed a shim function instead of the app directly, any
         # requests will automatically be redirected to the new server instance.
diff --git a/rio/routing.py b/rio/routing.py
index 0f47f821..47a6293b 100644
--- a/rio/routing.py
+++ b/rio/routing.py
@@ -588,7 +588,8 @@ def check_page_guards(
     tuple[
         tuple[ComponentPage, dict[str, object]],
         ...,
-    ],
+    ]
+    | None,
     rio.URL,
 ]:
     """
@@ -605,6 +606,10 @@ def check_page_guards(
     error. The result will still be valid. That is because navigation is
     possible, it's just that some `PageView`s will display a 404 page.
 
+    If attempting to navigate to a URL that isn't path of the app, i.e. it's not
+    a child of the app's configured base URL, `None` is returned instead of
+    active pages.
+
     This function does not perform any actual navigation. It simply checks
     whether navigation to the target page is possible.
 
@@ -627,11 +632,13 @@ def check_page_guards(
     past_redirects = [target_url_absolute]
 
     while True:
-        # TODO: What if the URL is not a child of the base URL? i.e. redirecting
-        #   to a completely different site
-        target_url_relative = utils.make_url_relative(
-            sess._base_url, target_url_absolute
-        )
+        # Strip the base. This also detects navigation to outside of the app
+        try:
+            target_url_relative = utils.url_relative_to_base(
+                sess._base_url, target_url_absolute
+            )
+        except ValueError:
+            return None, target_url_absolute
 
         # Find all pages which would by activated by this navigation
         active_page_instances_and_path_arguments = tuple(
diff --git a/rio/session.py b/rio/session.py
index d0eaabe3..4701a29b 100644
--- a/rio/session.py
+++ b/rio/session.py
@@ -1008,35 +1008,12 @@ window.resizeTo(screen.availWidth, screen.availHeight);
             replaced with the new page. If `False`, a new history entry is
             created, allowing the user to go back to the previous page.
         """
-        # Normalize the target URL
-        target_url = utils.normalize_url(target_url)
+        if isinstance(target_url, str):
+            target_url = rio.URL(target_url)
 
         # Determine the full page to navigate to
         target_url_absolute = self.active_page_url.join(target_url)
 
-        # Is this a page, or a full URL to another site?
-        try:
-            utils.make_url_relative(
-                self._base_url,
-                target_url_absolute,
-            )
-
-        # This is an external URL. Let the browser handle the navigation.
-        except ValueError:
-            logging.debug(
-                f"Navigating to external site `{target_url_absolute}`"
-            )
-
-            async def history_worker() -> None:
-                await self._evaluate_javascript(
-                    f"""
-window.location.href = {json.dumps(str(target_url))};
-""",
-                )
-
-            self.create_task(history_worker(), name="history worker")
-            return
-
         # Is any guard opposed to this page?
         active_page_instances_and_path_arguments, active_page_url = (
             routing.check_page_guards(self, target_url_absolute)
@@ -1044,6 +1021,25 @@ window.location.href = {json.dumps(str(target_url))};
 
         del target_url, target_url_absolute
 
+        # Is this one of the app's pages, or a full URL to another site? If
+        # navigating to another site, let the browser handle the navigation.
+        #
+        # TODO: What if running in a window? Navigating to another site doesn't
+        # make any sense in that case, since the window isn't supposed to be a
+        # browser.
+        if active_page_instances_and_path_arguments is None:
+            logging.debug(f"Navigating to external site `{active_page_url}`")
+
+            async def history_worker() -> None:
+                await self._evaluate_javascript(
+                    f"""
+window.location.href = {json.dumps(str(active_page_url))};
+""",
+                )
+
+            self.create_task(history_worker(), name="history worker")
+            return
+
         # Update the current page
         logging.debug(f"Navigating to page `{active_page_url}`")
         self._active_page_url = active_page_url
@@ -3505,34 +3501,38 @@ a.remove();
 
     @unicall.local(name="openUrl")
     async def _open_url(self, url: str) -> None:
+        # Case: Running as website
+        #
+        # If running in a browser, JS can take care of changing the URL or
+        # opening a new tab. The only reason why the frontend would tell us
+        # to open the url is because it's a local url.
+        #
+        # (Note: Of course an attacker could send us an external url to
+        # open, but `navigate_to` has to handle that gracefully anyway.)
         if self.running_as_website:
-            # If running in a browser, JS can take care of changing the URL or
-            # opening a new tab. The only reason why the frontend would tell us
-            # to open the url is because it's a local url.
-            #
-            # (Note: Of course an attacker could send us an external url to
-            # open, but `navigate_to` has to handle that gracefully anyway.)
             self.navigate_to(url)
             return
 
-        # Normalize the URL to make for easier comparisons
-        yarl_url = utils.normalize_url(url)
+        # Case: Running in a window.
+        yarl_url = rio.URL(url)
         del url
 
         # If running_in_window, local urls are *always* navigated to, even if
         # they're meant to be opened in a new tab. The `run_in_window` code
         # isn't designed to handle multiple sessions, so we can't open a new
         # tab or a 2nd window.
-        is_local_url = yarl_url.path.startswith(self._base_url.path)
-
-        if is_local_url:
-            self.navigate_to(yarl_url)
-            return
+        try:
+            utils.url_relative_to_base(yarl_url, self._base_url)
 
         # And if it's an external url, it must be opened in a web browser.
-        import webbrowser
+        except ValueError:
+            import webbrowser
 
-        webbrowser.open(str(yarl_url))
+            webbrowser.open(str(yarl_url))
+
+        # Local URL
+        else:
+            self.navigate_to(yarl_url)
 
     @unicall.local(name="ping")
     async def _ping(self, ping: str) -> str:
@@ -3544,6 +3544,10 @@ a.remove();
         Called by the client when the page changes.
         """
         # Try to navigate to the new page
+        import revel
+
+        revel.debug(f"Called navigate to {new_url}")
+
         self.navigate_to(
             new_url,
             replace=True,
diff --git a/rio/snippets/snippet-files/project-template-Crypto Dashboard/pages/dashboard_page.py b/rio/snippets/snippet-files/project-template-Crypto Dashboard/pages/dashboard_page.py
index 9b99146f..97bcf632 100644
--- a/rio/snippets/snippet-files/project-template-Crypto Dashboard/pages/dashboard_page.py	
+++ b/rio/snippets/snippet-files/project-template-Crypto Dashboard/pages/dashboard_page.py	
@@ -1,4 +1,6 @@
 # <additional-imports>
+# </additional-imports>
+from dataclasses import field
 from pathlib import Path
 
 import numpy as np
@@ -10,8 +12,6 @@ import rio
 from .. import components as comps
 from .. import data_models
 
-# </additional-imports>
-
 
 # <component>
 @rio.page(
diff --git a/rio/snippets/snippet-files/project-template-Todo App/components/new_todo_item_input.py b/rio/snippets/snippet-files/project-template-Todo App/components/new_todo_item_input.py
index e31417ba..dc1b6393 100644
--- a/rio/snippets/snippet-files/project-template-Todo App/components/new_todo_item_input.py	
+++ b/rio/snippets/snippet-files/project-template-Todo App/components/new_todo_item_input.py	
@@ -3,12 +3,13 @@ from __future__ import annotations
 # <additional-imports>
 import datetime
 
+# </additional-imports>
+from dataclasses import field
+
 import rio
 
 from ..data_models import TodoItem
 
-# </additional-imports>
-
 
 # <component>
 class NewTodoItemInput(rio.Component):
diff --git a/rio/utils.py b/rio/utils.py
index adc7fc33..66046e24 100644
--- a/rio/utils.py
+++ b/rio/utils.py
@@ -356,21 +356,28 @@ class FileInfo:
         raise ValueError("Invalid type. Expected 'r' or 'rb'.")
 
 
-def make_url_relative(base: URL, other: URL) -> URL:
+def url_relative_to_base(base: URL, other: URL) -> URL:
     """
     Returns `other` as a relative URL to `base`. Raises a `ValueError` if
-    `other` is not a child of `base`.
+    `other` is not a child of `base`. This will never generate URLs containing
+    `..`. If those would be needed a `ValueError` is raised instead.
 
-    This will never generate URLs containing `..`. If those would be needed a
-    `ValueError` is raised instead.
+    ## Casing
+
+    The scheme and host are case insensitive, as by the URL standard. The path
+    is case sensitive, but a warning will be logged should that be the only
+    reason two URLs are not considered equal.
     """
     # Verify the URLs have the same scheme and host
-    if base.scheme != other.scheme:
+    if base.scheme.lower() != other.scheme.lower():
         raise ValueError(
             f'URLs have different schemes: "{base.scheme}" and "{other.scheme}"'
         )
 
-    if base.host != other.host:
+    base_host = None if base.host is None else base.host.lower()
+    other_host = None if other.host is None else other.host.lower()
+
+    if base_host != other_host:
         raise ValueError(
             f'URLs have different hosts: "{base.host}" and "{other.host}"'
         )
@@ -387,8 +394,16 @@ def make_url_relative(base: URL, other: URL) -> URL:
         other_parts = other_parts[:-1]
 
     # See if the base is a parent of the other URL
-    if base_parts != other_parts[: len(base_parts)]:
-        raise ValueError(f'"{other}" is not a child of "{base}"')
+    for base_part, other_part in zip(base_parts, other_parts):
+        if base_part != other_part:
+            # Log a warning if the only difference is casing
+            if base_part.lower() == other_part.lower():
+                message = f"`{base}` is not considered a base URL of `{other}`, because the URL paths are cased differently. This is considered a different, as by the URL specification. If you were trying to navigate to your own site and it didn't pick this up check your spelling."
+                raise ValueError(message)
+
+            raise ValueError(
+                f"`{base}` is not a base URL of `{other}`. The paths differ at `{base_part}` and `{other_part}`"
+            )
 
     # Remove the common parts from the URL
     other_parts = other_parts[len(base_parts) :]
@@ -566,30 +581,33 @@ def safe_build(
     return placeholder_component
 
 
-def normalize_url(url: str | rio.URL) -> rio.URL:
-    """
-    Returns a normalized version of the given URL.
+# def normalize_url(url: str | rio.URL) -> rio.URL:
+#     """
+#     Returns a normalized version of the given URL.
 
-    This returns a new URL instance which is identical to the given URL, but
-    with the following guarantees:
+#     This returns a new URL instance which is identical to the given URL, but
+#     with the following guarantees:
 
-    - The origin is lowercase
-    - The URL has no trailing slashes
-    """
-    url = rio.URL(url)
+#     - The protocol, host and path are lowercase
+#     - The URL has no trailing slashes
 
-    if url.scheme:
-        url = url.with_scheme(url.scheme.lower())
+#     Query parameters and fragments are considered case-sensitive, and are not
+#     modified.
+#     """
+#     url = rio.URL(url)
 
-    if url.host is not None:
-        url = url.with_host(url.host.lower())
+#     if url.scheme:
+#         url = url.with_scheme(url.scheme.lower())
 
-    if url.path != "/":
-        # Doing `url.with_path(url.path)` erases the query parameters.
-        # Unbelievable.
-        url = url.with_path(url.path.rstrip("/")).with_query(url.query)
+#     if url.host is not None:
+#         url = url.with_host(url.host.lower())
 
-    return url
+#     if url.path != "/":
+#         # Doing `url.with_path(url.path)` erases the query parameters.
+#         # Unbelievable.
+#         url = url.with_path(url.path.rstrip("/").lower()).with_query(url.query)
+
+#     return url
 
 
 def is_python_script(path: Path) -> bool:
diff --git a/tests/test_routing.py b/tests/test_routing.py
index dec08872..c87c704d 100644
--- a/tests/test_routing.py
+++ b/tests/test_routing.py
@@ -131,18 +131,18 @@ def test_redirects(
     # Create a fake session. It contains everything used by the routing system.
     app = rio.App(pages=PAGES)
     app_server = rio.app_server.TestingServer(app)
-    fake_session = app_server.create_dummy_session()
+    session = app_server.create_dummy_session()
 
     # Determine the final URL
     active_pages_and_path_arguments, absolute_url_after_redirects_is = (
         rio.routing.check_page_guards(
-            fake_session,
-            fake_session._base_url.join(rio.URL(relative_url_before_redirects)),
+            session,
+            session._base_url.join(rio.URL(relative_url_before_redirects)),
         )
     )
 
     # Verify the final URL
-    absolute_url_after_redirects_should = fake_session._base_url.join(
+    absolute_url_after_redirects_should = session._base_url.join(
         rio.URL(relative_url_after_redirects_should)
     )
 
@@ -179,6 +179,34 @@ def test_url_parameter_parsing_failure() -> None:
         session._base_url.join(rio.URL("/3.5")),
     )
 
+    assert isinstance(active_pages_and_path_arguments, list)
     assert len(active_pages_and_path_arguments) == 1
     assert active_pages_and_path_arguments[0][0] == float_page
     assert active_pages_and_path_arguments[0][1] == {"path_param": 3.5}
+
+
+def test_redirect_offsite() -> None:
+    """
+    Redirect to a site other than this app.
+    """
+    page = rio.ComponentPage(
+        name="Home",
+        url_segment="",
+        build=FakeComponent,
+    )
+
+    app = rio.App(pages=(page,))
+    app_server = rio.app_server.TestingServer(app)
+    session = app_server.create_dummy_session()
+
+    external_url = rio.URL("http://example.com")
+
+    active_pages_and_path_arguments, absolute_url_after_redirects = (
+        rio.routing.check_page_guards(
+            session,
+            external_url,
+        )
+    )
+
+    assert active_pages_and_path_arguments is None
+    assert absolute_url_after_redirects == external_url