# File: rio/tests/test_table_data_ingestion.py
# Last modified: 2025-11-13 21:29:49 +01:00
# 291 lines, 7.5 KiB, Python

"""
Tests that different types of data formats accepted by tables are correctly
turned into the internal column format.
"""
import sys
import typing as t
from datetime import datetime
import numpy as np
import pandas as pd
import polars as pl
import pyarrow as pa
import pytest
import rio
import rio.maybes
# Tables work with a lot of optionally supported modules. Make sure Rio is aware
# of which ones are available.
rio.maybes.initialize()

# Format string handed as the second argument to the table conversion helpers
# (`_data_to_columnar` / `_convert_iterable`) throughout this module.
# NOTE(review): looks like a strftime-style year-month-day pattern - confirm
# against how `rio.components.table` interprets it.
DATE_FORMAT_STRING: str = "%Y-%m-%d"
def gen_valid_data() -> dict[str, t.Iterable[t.Any]]:
    """
    Build a fresh mapping of column name -> column values that is valid input
    for a `rio.Table`.

    The columns deliberately use different container types. Some of them
    (the generator) can only be iterated once, which is why callers should
    request a new mapping whenever they need the data again.
    """
    day_numbers = range(1, 6)

    columns: dict[str, t.Iterable[t.Any]] = {}
    columns["Text"] = list("ABCDE")
    columns["Number List"] = list(day_numbers)
    columns["Number Tuple"] = tuple(day_numbers)
    # Must stay a real generator: it exercises single-use iterables.
    columns["Number Generator"] = (number for number in range(1, 6))
    columns["Date List"] = [datetime(2025, 1, day) for day in day_numbers]
    # A grab bag of unrelated Python objects in a single column.
    columns["Objects List"] = [
        {"foo": "bar"},
        datetime(2025, 1, 1),
        1,
        1.23,
        True,
    ]

    return columns
def gen_too_short_column_data() -> dict[str, t.Iterable[t.Any]]:
    """
    The valid data from `gen_valid_data`, plus one extra column that
    intentionally holds one value fewer than all the others.
    """
    return {
        **gen_valid_data(),
        "Short Column": [1, 2, 3, 4],
    }
def gen_too_long_column_data() -> dict[str, t.Iterable[t.Any]]:
    """
    The valid data from `gen_valid_data`, plus one extra column that
    intentionally holds one value more than all the others.
    """
    return {
        **gen_valid_data(),
        "Long Column": [1, 2, 3, 4, 5, 6],
    }
def into_all_formats(
    datagen: t.Callable[
        [],
        dict[str, t.Iterable[t.Any]],
    ],
) -> t.Iterable[tuple[t.Any, bool]]:
    """
    Yield the data produced by `datagen` once per input format supported by
    `rio.Table`:

    - pandas DataFrame
    - polars DataFrame
    - pyarrow Table
    - numpy array (currently disabled, see below)
    - mapping of iterables
    - iterable of iterables

    `datagen` is called anew for every format, so each variant is built from
    untouched data.

    Every yielded item is a `(data, should_have_headers)` tuple, where the
    boolean tells whether the resulting table is expected to have headers.
    """
    # pandas DataFrame
    pandas_frame = pd.DataFrame(datagen())
    yield pandas_frame, True

    # polars DataFrame
    #
    # A polars column only holds arbitrary Python objects if its dtype is
    # explicitly `pl.Object`, so that column is swapped for a typed series.
    polars_columns = datagen()
    object_values = polars_columns.pop("Objects List")
    polars_columns["Objects List"] = pl.Series(
        "objects",
        object_values,
        dtype=pl.Object,
    )
    yield pl.DataFrame(polars_columns), True

    # pyarrow Table
    #
    # Arbitrary Python objects are not supported by pyarrow; leave that column
    # out entirely.
    arrow_columns = datagen()
    del arrow_columns["Objects List"]
    yield pa.table(arrow_columns), True

    # NumPy array
    #
    # Disabled: a numpy array cannot store different types of data at once.
    # It implicitly converts everything to strings, which makes the later
    # correctness checks fail.
    #
    # yield (
    #     np.column_stack(
    #         [list(column) for column in datagen().values()],
    #     ),
    #     False,
    # )

    # Mapping of iterables: the generated data is already in this shape.
    yield datagen(), True

    # Iterable of iterables. This format is row-major, so the column-major
    # values are transposed first.
    column_values = datagen().values()
    yield list(zip(*column_values)), False
def assert_columns_match_data(
datagen: t.Callable[
[],
dict[str, t.Iterable[t.Any]],
],
should_have_headers: bool,
headers_are: list[str] | None,
columns_are: list[list[t.Any]],
allows_arbitrary_py_objects: bool = True,
) -> None:
"""
Asserts that columns in the standardized format used by `rio.Table` match
what would be expected from the data generator.
"""
data = datagen()
if not allows_arbitrary_py_objects:
del data["Objects List"]
# Do the headers match?
if should_have_headers:
assert headers_are is not None
assert headers_are == list(data.keys())
else:
assert headers_are is None
# Correct number of columns?
assert len(columns_are) == len(data)
# Column values
for ii, (column_name, column_raw) in enumerate(data.items()):
column_is = columns_are[ii]
column_should = rio.components.table._convert_iterable(
column_raw,
DATE_FORMAT_STRING,
)
print(ii, column_name)
print(f"Column is: {column_is}")
print(f"Column should: {column_should}")
assert column_is == column_should
@pytest.mark.parametrize(
    "data, should_have_headers",
    into_all_formats(gen_valid_data),
)
def test_valid_data(data: t.Any, should_have_headers: bool) -> None:
    """
    Valid data, in every supported input format, must be turned into the
    expected headers and columns.
    """
    try:
        headers_are, columns_are = rio.components.table._data_to_columnar(
            data,
            DATE_FORMAT_STRING,
        )
    except pa.ArrowInvalid:
        # Only tolerated on Windows, where it is put down to a pyarrow bug.
        # Everywhere else the error is a genuine failure.
        if sys.platform != "win32":
            raise

        pytest.xfail("Pyarrow bug")

    # The pyarrow variant of the data was built without the column of
    # arbitrary Python objects, so the reference data has to drop it as well.
    is_arrow_table = isinstance(data, pa.Table)

    assert_columns_match_data(
        datagen=gen_valid_data,
        should_have_headers=should_have_headers,
        headers_are=headers_are,
        columns_are=columns_are,
        allows_arbitrary_py_objects=not is_arrow_table,
    )
def test_short_column() -> None:
    """
    Data containing a column that is shorter than the rest must be rejected
    with a `ValueError`.
    """
    # A mapping is the only format that can carry a too short column this
    # far. All other formats would raise an error right away, before the data
    # could even be passed to a table. So the plain dict is all that is
    # needed here.
    ragged_columns = gen_too_short_column_data()

    with pytest.raises(ValueError):
        rio.components.table._data_to_columnar(
            ragged_columns,
            DATE_FORMAT_STRING,
        )
def test_long_column() -> None:
    """
    Data containing a column that is longer than the rest must be rejected
    with a `ValueError`.
    """
    # A mapping is the only format that can carry a too long column this far.
    # All other formats would raise an error right away, before the data
    # could even be passed to a table. So the plain dict is all that is
    # needed here.
    ragged_columns = gen_too_long_column_data()

    with pytest.raises(ValueError):
        rio.components.table._data_to_columnar(
            ragged_columns,
            DATE_FORMAT_STRING,
        )
def test_1d_array() -> None:
    """
    Tables are inherently two-dimensional, so a 1D array must be rejected
    with a `ValueError`.
    """
    flat_array = np.array([1, 2, 3])

    with pytest.raises(ValueError):
        rio.components.table._data_to_columnar(
            flat_array,
            DATE_FORMAT_STRING,
        )
def test_2d_array() -> None:
    """
    A 2D array is valid table data: it must convert without headers, and its
    columns must match the array's columns.
    """
    matrix = np.array([[1, 2, 3], [4, 5, 6]])

    def expected_columns() -> dict[str, t.Iterable[t.Any]]:
        # Column-major view of `matrix`. The keys are placeholders only,
        # since no headers are expected.
        return {
            "0": [1, 4],
            "1": [2, 5],
            "2": [3, 6],
        }

    headers, columns = rio.components.table._data_to_columnar(
        matrix,
        DATE_FORMAT_STRING,
    )

    assert_columns_match_data(
        datagen=expected_columns,
        should_have_headers=False,
        headers_are=headers,
        columns_are=columns,
    )
def test_3d_array() -> None:
    """
    Tables are inherently two-dimensional, so a 3D array must be rejected
    with a `ValueError`.
    """
    cube = np.array([[[1, 2], [3, 4]]])

    with pytest.raises(ValueError):
        rio.components.table._data_to_columnar(
            cube,
            DATE_FORMAT_STRING,
        )