table data ingestion unit tests & fixes

2026-05-02 08:59:27 -05:00 · 2024-10-31 23:05:50 +01:00
parent 5906b719ad
commit 31e28ed38c
4 changed files with 254 additions and 17 deletions
@@ -31,6 +31,9 @@ class Plot(FundamentalComponent):
    a plot using the library of your choice and pass it to the `Plot` component
    to display it in your app.

+    Plots created with `plotly` will be interactive when displayed in Rio. We
+    recommend using it over the other options.
+
    ## Attributes

    `figure`: The plot figure to display.
@@ -43,7 +46,8 @@ class Plot(FundamentalComponent):

    ## Examples

-    Here's a minimal example using a `plotly` plot:
+    Here's a minimal example using a `plotly` plot. Using `plotly` is
+    recommended, because the resulting plots are interactive.

    ```python
    import pandas as pd
@@ -74,9 +78,10 @@ class Plot(FundamentalComponent):
            )
    ```

-    Here's an example using a `matplotlib` plot:
+    Matplotlib plots are also supported:

    ```python
+    import pandas as pd
    import matplotlib.pyplot as plt


@@ -103,9 +108,10 @@ class Plot(FundamentalComponent):
            )
    ```

-    Here's an example using a `seaborn` plot:
+    `seaborn` plots are supported as well:

    ```python
+    import pandas as pd
    import seaborn as sns


@@ -217,9 +217,9 @@ def _indices_to_rectangle(
 def _data_to_columnar(
    data: pandas.DataFrame
    | polars.DataFrame
+    | numpy.ndarray
    | t.Mapping[str, t.Iterable[TableValue]]
-    | t.Iterable[t.Iterable[TableValue]]
-    | numpy.ndarray,
+    | t.Iterable[t.Iterable[TableValue]],
 ) -> tuple[
    list[str] | None,
    list[list[TableValue]],
@@ -237,15 +237,24 @@ def _data_to_columnar(
    # DataFrame
    #
    # Use narwhals to abstract away the dataframe provider
-    if isinstance(data, maybes.PANDAS_DATAFRAME_TYPES) or isinstance(
-        data, maybes.POLARS_DATAFRAME_TYPES
-    ):
+    if isinstance(data, maybes.DATAFRAME_TYPES):
        nw_data = nw.from_native(data)
        headers = nw_data.columns

        for column_name in headers:
            columns.append(nw_data[column_name].to_list())

+    # NumPy array
+    #
+    # These are neatly organized, just need to get the contents as columns
+    elif isinstance(data, maybes.NUMPY_ARRAY_TYPES):
+        print("HERE")
+        if data.ndim != 2:
+            raise ValueError("Table data must be two-dimensional")
+
+        for ii in range(data.shape[1]):
+            columns.append(data[:, ii].tolist())
+
    # Mapping
    #
    # The headers are trivially available. The columns can also be used as-is,
@@ -280,7 +289,7 @@ def _data_to_columnar(
            raise ValueError("All table rows must have the same length")

        # Black magic to transpose the data
-        data = list(map(list, zip(*data)))
+        columns = list(map(list, zip(*columns)))

    # Done
    return headers, columns
@@ -360,7 +369,7 @@ class Table(FundamentalComponent):  #
    show_row_numbers: bool = True

    # All headers, if present
-    _headers: list[str] | None = None
+    _headers: list[str] | None = field(default=None, init=False)

    # The data, as a list of columns ("column major"). This is set in
    # `__post_init__`.
@@ -13,14 +13,13 @@ import sys
 import typing as t

 import introspection
+import narwhals.typing as nwt

 if t.TYPE_CHECKING:
    import matplotlib.axes  # type: ignore
    import matplotlib.figure  # type: ignore
    import numpy  # type: ignore
-    import pandas  # type: ignore
    import plotly.graph_objects  # type: ignore
-    import polars  # type: ignore

 _IS_INITIALIZED = False

@@ -34,8 +33,7 @@ STR_TYPES = ()

 NUMPY_ARRAY_TYPES: tuple[type[numpy.ndarray], ...] = ()

-PANDAS_DATAFRAME_TYPES: tuple[type[pandas.DataFrame], ...] = ()
-POLARS_DATAFRAME_TYPES: tuple[type[polars.DataFrame], ...] = ()
+DATAFRAME_TYPES: tuple[type[nwt.IntoDataFrame], ...] = ()

 PLOTLY_GRAPH_TYPES: tuple[type[plotly.graph_objects.Figure], ...] = ()
 MATPLOTLIB_GRAPH_TYPES: tuple[type, ...] = ()
@@ -59,7 +57,7 @@ def initialize(force: bool = False) -> None:
    global _IS_INITIALIZED
    global FLOAT_TYPES, INT_TYPES, BOOL_TYPES, STR_TYPES
    global NUMPY_ARRAY_TYPES
-    global PANDAS_DATAFRAME_TYPES, POLARS_DATAFRAME_TYPES
+    global DATAFRAME_TYPES
    global PLOTLY_GRAPH_TYPES, MATPLOTLIB_GRAPH_TYPES, MATPLOTLIB_AXES_TYPES

    # Already initialized?
@@ -73,6 +71,8 @@ def initialize(force: bool = False) -> None:
    BOOL_TYPES = (bool,)
    STR_TYPES = (str,)

+    DATAFRAME_TYPES = ()
+
    # Is numpy available and loaded?
    if "numpy" in sys.modules:
        import numpy  # type: ignore
@@ -92,12 +92,12 @@ def initialize(force: bool = False) -> None:
    if "pandas" in sys.modules:
        import pandas  # type: ignore

-        PANDAS_DATAFRAME_TYPES = (pandas.DataFrame,)
+        DATAFRAME_TYPES += (pandas.DataFrame,)

    if "polars" in sys.modules:
        import polars  # type: ignore

-        POLARS_DATAFRAME_TYPES = (polars.DataFrame,)
+        DATAFRAME_TYPES += (polars.DataFrame,)

    if "plotly" in sys.modules:
        import plotly.graph_objects  # type: ignore
@@ -0,0 +1,222 @@
+"""
+Tests that different types of data formats accepted by tables are correctly
+turned into the internal column format.
+"""
+
+import typing as t
+
+import numpy as np
+import pandas as pd
+import polars as pl
+import pytest
+
+import rio
+import rio.maybes
+
+# Tables work with a lot of optionally supported modules. Make sure Rio is aware
+# of which ones are available.
+rio.maybes.initialize()
+
+
+def gen_valid_data() -> dict[str, t.Iterable[t.Any]]:
+    """
+    Generates a dictionary of valid data for use in a `rio.Table`. The columns
+    are intentionally of different types, some of which can only be used once.
+    """
+
+    return {
+        "Text": ["A", "B", "C", "D", "E"],
+        "Number List": [1, 2, 3, 4, 5],
+        "Number Tuple": (1, 2, 3, 4, 5),
+        "Number Generator": (i for i in range(1, 6)),
+    }
+
+
+def gen_too_short_column_data() -> dict[str, t.Iterable[t.Any]]:
+    """
+    Same as the function above, but one column is intentionally shorter than the
+    others.
+    """
+    data = gen_valid_data()
+    data["Short Column"] = [1, 2, 3, 4]
+    return data
+
+
+def gen_too_long_column_data() -> dict[str, t.Iterable[t.Any]]:
+    """
+    Same as `gen_valid_data`, but one column is intentionally longer than the
+    others.
+    """
+    data = gen_valid_data()
+    data["Long Column"] = [1, 2, 3, 4, 5, 6]
+    return data
+
+
+def into_all_formats(
+    datagen: t.Callable[
+        [],
+        dict[str, t.Iterable[t.Any]],
+    ],
+) -> t.Iterable[tuple[t.Any, bool]]:
+    """
+    Given a data generator, return the same data in all input formats supported
+    by `rio.Table`:
+
+    - pandas DataFrame
+    - polars DataFrame
+    - numpy array (currently disabled, see the comment in the body)
+    - mapping of iterables
+    - iterable of iterables
+
+    Each result is a tuple of the data and a boolean indicating whether the
+    resulting table should have headers.
+    """
+    # Pandas DataFrame
+    yield pd.DataFrame(datagen()), True
+
+    # Polars DataFrame
+    yield pl.DataFrame(datagen()), True
+
+    # NumPy array
+    #
+    # Numpy is problematic, because it cannot store different types of data in
+    # the same array. It implicitly converts everything to strings, which makes
+    # the later correctness checks fail.
+    #
+    # yield (
+    #     np.column_stack(
+    #         [list(column) for column in datagen().values()],
+    #     ),
+    #     False,
+    # )
+
+    # Mapping of iterables
+    yield datagen(), True
+
+    # Iterable of iterables
+    as_df = pl.DataFrame(datagen())
+    rows: list[list[t.Any]] = []
+
+    for row in as_df.rows():
+        rows.append(list(row))
+
+    yield rows, False
+
+
+def assert_columns_match_data(
+    datagen: t.Callable[
+        [],
+        dict[str, t.Iterable[t.Any]],
+    ],
+    should_have_headers: bool,
+    headers_are: list[str] | None,
+    columns_are: list[list[t.Any]],
+) -> None:
+    """
+    Asserts that columns in the standardized format used by `rio.Table` match
+    what would be expected from the data generator.
+    """
+    data = datagen()
+
+    # Do the headers match?
+    if should_have_headers:
+        assert headers_are is not None
+        assert headers_are == list(data.keys())
+    else:
+        assert headers_are is None
+
+    # Correct number of columns?
+    assert len(columns_are) == len(data)
+
+    # Column values
+    for ii, (column_name, column_should) in enumerate(data.items()):
+        column_should = list(column_should)
+        column_is = columns_are[ii]
+
+        assert column_is == column_should
+
+
+@pytest.mark.parametrize(
+    "data, should_have_headers",
+    into_all_formats(gen_valid_data),
+)
+def test_valid_data(data: t.Any, should_have_headers: bool) -> None:
+    """
+    Tests that valid data is correctly columnized.
+    """
+    headers_are, columns_are = rio.components.table._data_to_columnar(data)
+
+    assert_columns_match_data(
+        datagen=gen_valid_data,
+        should_have_headers=should_have_headers,
+        headers_are=headers_are,
+        columns_are=columns_are,
+    )
+
+
+def test_short_column() -> None:
+    """
+    Tests that data with a column that is too short fails as expected.
+    """
+    data = gen_too_short_column_data()
+
+    # The only format supporting too short data is a mapping. All other formats
+    # would immediately raise an error, before even being able to pass the data
+    # to a table.
+    #
+    # -> No need for fancy formats, just pass in the dict directly.
+    with pytest.raises(ValueError):
+        rio.components.table._data_to_columnar(data)
+
+
+def test_long_column() -> None:
+    """
+    Tests that data with a column that is too long fails as expected.
+    """
+    data = gen_too_long_column_data()
+
+    # The only format supporting too long data is a mapping. All other formats
+    # would immediately raise an error, before even being able to pass the data
+    # to a table.
+    #
+    # -> No need for fancy formats, just pass in the dict
+    with pytest.raises(ValueError):
+        rio.components.table._data_to_columnar(data)
+
+
+def test_1d_array() -> None:
+    """
+    Creating a table from a 1D array should fail, as tables are inherently
+    two-dimensional.
+    """
+    with pytest.raises(ValueError):
+        rio.components.table._data_to_columnar(np.array([1, 2, 3]))
+
+
+def test_2d_array() -> None:
+    """
+    Creating a table from a 2D array should work.
+    """
+    headers, columns = rio.components.table._data_to_columnar(
+        np.array([[1, 2, 3], [4, 5, 6]]),
+    )
+
+    assert_columns_match_data(
+        datagen=lambda: {
+            "0": [1, 4],
+            "1": [2, 5],
+            "2": [3, 6],
+        },
+        should_have_headers=False,
+        headers_are=headers,
+        columns_are=columns,
+    )
+
+
+def test_3d_array() -> None:
+    """
+    Creating a table from a 3D array should fail, as tables are inherently
+    two-dimensional.
+    """
+    with pytest.raises(ValueError):
+        rio.components.table._data_to_columnar(np.array([[[1, 2], [3, 4]]]))