-
Notifications
You must be signed in to change notification settings - Fork 185
Closed
Description
It might make more sense to do these before (#2343, #2344).
When I initially tried to fix this in (#2232 (comment)) - most of the 795 errors were from tests using IntoFrame/IntoDataFrame.
Lines 45 to 46 in d2a9f42
```python
Constructor: TypeAlias = Callable[[Any], IntoFrame]
ConstructorEager: TypeAlias = Callable[[Any], IntoDataFrame]
```
AFAICT, none of these constructors should need to use a TypeAlias that mentions nw.DataFrame or nw.LazyFrame.
They all return a native object
conftest.py
Lines 72 to 222 in d2a9f42
```python
def pandas_constructor(obj: dict[str, list[Any]]) -> IntoDataFrame:
    import pandas as pd

    return pd.DataFrame(obj)


def pandas_nullable_constructor(obj: dict[str, list[Any]]) -> IntoDataFrame:
    import pandas as pd

    return pd.DataFrame(obj).convert_dtypes(dtype_backend="numpy_nullable")


def pandas_pyarrow_constructor(obj: dict[str, list[Any]]) -> IntoDataFrame:
    import pandas as pd

    return pd.DataFrame(obj).convert_dtypes(dtype_backend="pyarrow")


def modin_constructor(obj: dict[str, list[Any]]) -> IntoDataFrame:  # pragma: no cover
    import modin.pandas as mpd
    import pandas as pd

    return mpd.DataFrame(pd.DataFrame(obj))  # type: ignore[no-any-return]


def modin_pyarrow_constructor(
    obj: dict[str, list[Any]],
) -> IntoDataFrame:  # pragma: no cover
    import modin.pandas as mpd
    import pandas as pd

    return mpd.DataFrame(pd.DataFrame(obj)).convert_dtypes(dtype_backend="pyarrow")  # type: ignore[no-any-return]


def cudf_constructor(obj: dict[str, list[Any]]) -> IntoDataFrame:  # pragma: no cover
    import cudf

    return cudf.DataFrame(obj)  # type: ignore[no-any-return]


def polars_eager_constructor(obj: dict[str, list[Any]]) -> IntoDataFrame:
    import polars as pl

    return pl.DataFrame(obj)


def polars_lazy_constructor(obj: dict[str, list[Any]]) -> pl.LazyFrame:
    import polars as pl

    return pl.LazyFrame(obj)


def duckdb_lazy_constructor(obj: dict[str, list[Any]]) -> duckdb.DuckDBPyRelation:
    import duckdb
    import polars as pl

    _df = pl.LazyFrame(obj)
    return duckdb.table("_df")


def dask_lazy_p1_constructor(obj: dict[str, list[Any]]) -> IntoFrame:  # pragma: no cover
    import dask.dataframe as dd

    return dd.from_dict(obj, npartitions=1)  # type: ignore[no-any-return]


def dask_lazy_p2_constructor(obj: dict[str, list[Any]]) -> IntoFrame:  # pragma: no cover
    import dask.dataframe as dd

    return dd.from_dict(obj, npartitions=2)  # type: ignore[no-any-return]


def pyarrow_table_constructor(obj: dict[str, Any]) -> IntoDataFrame:
    import pyarrow as pa

    return pa.table(obj)


def pyspark_lazy_constructor() -> Callable[[Any], IntoFrame]:  # pragma: no cover
    try:
        from pyspark.sql import SparkSession
    except ImportError:  # pragma: no cover
        pytest.skip("pyspark is not installed")
        return None

    import warnings
    from atexit import register

    with warnings.catch_warnings():
        # The spark session seems to trigger a polars warning.
        # Polars is imported in the tests, but not used in the spark operations
        warnings.filterwarnings(
            "ignore", r"Using fork\(\) can cause Polars", category=RuntimeWarning
        )
        session = (
            SparkSession.builder.appName("unit-tests")  # pyright: ignore[reportAttributeAccessIssue]
            .master("local[1]")
            .config("spark.ui.enabled", "false")
            # executing one task at a time makes the tests faster
            .config("spark.default.parallelism", "1")
            .config("spark.sql.shuffle.partitions", "2")
            # common timezone for all tests environments
            .config("spark.sql.session.timeZone", "UTC")
            .getOrCreate()
        )

        register(session.stop)

        def _constructor(obj: dict[str, list[Any]]) -> IntoFrame:
            _obj = deepcopy(obj)
            index_col_name = generate_temporary_column_name(n_bytes=8, columns=list(_obj))
            _obj[index_col_name] = list(range(len(_obj[next(iter(_obj))])))

            return (  # type: ignore[no-any-return]
                session.createDataFrame([*zip(*_obj.values())], schema=[*_obj.keys()])
                .repartition(2)
                .orderBy(index_col_name)
                .drop(index_col_name)
            )

        return _constructor


def sqlframe_pyspark_lazy_constructor(
    obj: dict[str, Any],
) -> IntoFrame:  # pragma: no cover
    from sqlframe.duckdb import DuckDBSession

    session = DuckDBSession()
    return session.createDataFrame([*zip(*obj.values())], schema=[*obj.keys()])


EAGER_CONSTRUCTORS: dict[str, Callable[[Any], IntoDataFrame]] = {
    "pandas": pandas_constructor,
    "pandas[nullable]": pandas_nullable_constructor,
    "pandas[pyarrow]": pandas_pyarrow_constructor,
    "pyarrow": pyarrow_table_constructor,
    "modin": modin_constructor,
    "modin[pyarrow]": modin_pyarrow_constructor,
    "cudf": cudf_constructor,
    "polars[eager]": polars_eager_constructor,
}
LAZY_CONSTRUCTORS: dict[str, Callable[[Any], IntoFrame]] = {
    "dask": dask_lazy_p2_constructor,
    "polars[lazy]": polars_lazy_constructor,
    "duckdb": duckdb_lazy_constructor,
    "pyspark": pyspark_lazy_constructor,  # type: ignore[dict-item]
    "sqlframe": sqlframe_pyspark_lazy_constructor,
}
GPU_CONSTRUCTORS: dict[str, Callable[[Any], IntoFrame]] = {"cudf": cudf_constructor}
```
Line 60 in d2a9f42
```python
IntoDataFrame: TypeAlias = Union["NativeFrame", "DataFrame[Any]", "DataFrameLike"]
```
Lines 75 to 77 in d2a9f42
```python
IntoFrame: TypeAlias = Union[
    "NativeFrame", "DataFrame[Any]", "LazyFrame[Any]", "DataFrameLike"
]
```
Reactions are currently unavailable