Skip to content

Narrow TypeAlias(s) used in Constructor, ConstructorEager #2346

@dangotbanned

Description

@dangotbanned

It might make more sense to do these before (#2343, #2344).

When I initially tried to fix this in (#2232 (comment)) - most of the 795 errors were from tests using IntoFrame/IntoDataFrame.

narwhals/tests/utils.py

Lines 45 to 46 in d2a9f42

# Aliases under review: they promise only that a constructor returns an
# Into* frame (native or narwhals), though — per this issue — every test
# constructor actually returns a *native* object.
Constructor: TypeAlias = Callable[[Any], IntoFrame]
ConstructorEager: TypeAlias = Callable[[Any], IntoDataFrame]

As far as I can tell, none of these constructors should need a TypeAlias that mentions nw.DataFrame or nw.LazyFrame:
they all return a native object.

conftest.py

narwhals/tests/conftest.py

Lines 72 to 222 in d2a9f42

def pandas_constructor(obj: dict[str, list[Any]]) -> IntoDataFrame:
    """Wrap a column mapping in a plain pandas DataFrame (default dtypes)."""
    import pandas as pd

    frame = pd.DataFrame(obj)
    return frame
def pandas_nullable_constructor(obj: dict[str, list[Any]]) -> IntoDataFrame:
    """Build a pandas DataFrame converted to numpy-nullable dtypes."""
    import pandas as pd

    frame = pd.DataFrame(obj)
    return frame.convert_dtypes(dtype_backend="numpy_nullable")
def pandas_pyarrow_constructor(obj: dict[str, list[Any]]) -> IntoDataFrame:
    """Build a pandas DataFrame converted to pyarrow-backed dtypes."""
    import pandas as pd

    frame = pd.DataFrame(obj)
    return frame.convert_dtypes(dtype_backend="pyarrow")
def modin_constructor(obj: dict[str, list[Any]]) -> IntoDataFrame:  # pragma: no cover
    """Build a modin DataFrame, going through an intermediate pandas frame."""
    import modin.pandas as mpd
    import pandas as pd

    pandas_frame = pd.DataFrame(obj)
    return mpd.DataFrame(pandas_frame)  # type: ignore[no-any-return]
def modin_pyarrow_constructor(
    obj: dict[str, list[Any]],
) -> IntoDataFrame:  # pragma: no cover
    """Build a modin DataFrame converted to pyarrow-backed dtypes."""
    import modin.pandas as mpd
    import pandas as pd

    frame = mpd.DataFrame(pd.DataFrame(obj))
    return frame.convert_dtypes(dtype_backend="pyarrow")  # type: ignore[no-any-return]
def cudf_constructor(obj: dict[str, list[Any]]) -> IntoDataFrame:  # pragma: no cover
    """Build a cuDF (GPU) DataFrame from a column mapping."""
    import cudf

    frame = cudf.DataFrame(obj)
    return frame  # type: ignore[no-any-return]
def polars_eager_constructor(obj: dict[str, list[Any]]) -> IntoDataFrame:
    """Build an eager Polars DataFrame from a column mapping."""
    import polars as pl

    frame = pl.DataFrame(obj)
    return frame
def polars_lazy_constructor(obj: dict[str, list[Any]]) -> pl.LazyFrame:
    """Build a lazy Polars frame from a column mapping."""
    import polars as pl

    lazy = pl.LazyFrame(obj)
    return lazy
def duckdb_lazy_constructor(obj: dict[str, list[Any]]) -> duckdb.DuckDBPyRelation:
    """Build a DuckDB relation backed by a Polars LazyFrame."""
    import duckdb
    import polars as pl
    _df = pl.LazyFrame(obj)
    # NOTE(review): `duckdb.table("_df")` appears to rely on DuckDB's Python
    # "replacement scan" resolving the *local variable* `_df` by name — the
    # string must match the variable name above; renaming either side would
    # break the lookup. Confirm against the DuckDB Python API docs.
    return duckdb.table("_df")
def dask_lazy_p1_constructor(obj: dict[str, list[Any]]) -> IntoFrame:  # pragma: no cover
    """Build a single-partition Dask DataFrame from a column mapping."""
    import dask.dataframe as dd

    frame = dd.from_dict(obj, npartitions=1)
    return frame  # type: ignore[no-any-return]
def dask_lazy_p2_constructor(obj: dict[str, list[Any]]) -> IntoFrame:  # pragma: no cover
    """Build a two-partition Dask DataFrame from a column mapping."""
    import dask.dataframe as dd

    frame = dd.from_dict(obj, npartitions=2)
    return frame  # type: ignore[no-any-return]
def pyarrow_table_constructor(obj: dict[str, Any]) -> IntoDataFrame:
    """Build a PyArrow Table from a column mapping."""
    import pyarrow as pa

    table = pa.table(obj)
    return table
def pyspark_lazy_constructor() -> Callable[[Any], IntoFrame]:  # pragma: no cover
    """Return a constructor closed over a shared local SparkSession.

    Unlike the other constructors this is a *factory* that takes no data
    argument itself — which is why its LAZY_CONSTRUCTORS entry needs a
    ``type: ignore[dict-item]``.
    """
    try:
        from pyspark.sql import SparkSession
    except ImportError:  # pragma: no cover
        pytest.skip("pyspark is not installed")
        # NOTE(review): pytest.skip raises, so this return should be
        # unreachable — presumably kept to satisfy type-checkers.
        return None
    import warnings
    from atexit import register
    with warnings.catch_warnings():
        # The spark session seems to trigger a polars warning.
        # Polars is imported in the tests, but not used in the spark operations
        warnings.filterwarnings(
            "ignore", r"Using fork\(\) can cause Polars", category=RuntimeWarning
        )
        session = (
            SparkSession.builder.appName("unit-tests")  # pyright: ignore[reportAttributeAccessIssue]
            .master("local[1]")
            .config("spark.ui.enabled", "false")
            # executing one task at a time makes the tests faster
            .config("spark.default.parallelism", "1")
            .config("spark.sql.shuffle.partitions", "2")
            # common timezone for all tests environments
            .config("spark.sql.session.timeZone", "UTC")
            .getOrCreate()
        )
        # Stop the shared session when the test process exits.
        register(session.stop)
    def _constructor(obj: dict[str, list[Any]]) -> IntoFrame:
        # Deep-copy so the caller's fixture dict is never mutated by the
        # temporary index column added below.
        _obj = deepcopy(obj)
        index_col_name = generate_temporary_column_name(n_bytes=8, columns=list(_obj))
        # Temporary index preserves input row order across the repartition.
        _obj[index_col_name] = list(range(len(_obj[next(iter(_obj))])))
        return (  # type: ignore[no-any-return]
            session.createDataFrame([*zip(*_obj.values())], schema=[*_obj.keys()])
            .repartition(2)
            .orderBy(index_col_name)
            .drop(index_col_name)
        )
    return _constructor
def sqlframe_pyspark_lazy_constructor(
    obj: dict[str, Any],
) -> IntoFrame:  # pragma: no cover
    """Build a sqlframe DataFrame (PySpark API over DuckDB) from a column mapping."""
    from sqlframe.duckdb import DuckDBSession

    rows = [*zip(*obj.values())]
    columns = [*obj.keys()]
    return DuckDBSession().createDataFrame(rows, schema=columns)
# Registry of eager backends: each value builds a native eager frame.
EAGER_CONSTRUCTORS: dict[str, Callable[[Any], IntoDataFrame]] = {
    "pandas": pandas_constructor,
    "pandas[nullable]": pandas_nullable_constructor,
    "pandas[pyarrow]": pandas_pyarrow_constructor,
    "pyarrow": pyarrow_table_constructor,
    "modin": modin_constructor,
    "modin[pyarrow]": modin_pyarrow_constructor,
    "cudf": cudf_constructor,
    "polars[eager]": polars_eager_constructor,
}
# Registry of lazy backends: each value builds a native lazy frame.
LAZY_CONSTRUCTORS: dict[str, Callable[[Any], IntoFrame]] = {
    "dask": dask_lazy_p2_constructor,
    "polars[lazy]": polars_lazy_constructor,
    "duckdb": duckdb_lazy_constructor,
    # pyspark_lazy_constructor is a zero-argument factory, not a
    # Callable[[Any], IntoFrame] itself — hence the ignore.
    "pyspark": pyspark_lazy_constructor,  # type: ignore[dict-item]
    "sqlframe": sqlframe_pyspark_lazy_constructor,
}
# Backends that require a GPU at runtime.
GPU_CONSTRUCTORS: dict[str, Callable[[Any], IntoFrame]] = {"cudf": cudf_constructor}

# Anything accepted where an eager frame is expected: a native frame, a
# narwhals DataFrame, or a DataFrameLike object (presumably the dataframe
# interchange protocol — verify against narwhals.typing).
IntoDataFrame: TypeAlias = Union["NativeFrame", "DataFrame[Any]", "DataFrameLike"]

# Superset of IntoDataFrame that additionally admits a narwhals LazyFrame.
IntoFrame: TypeAlias = Union[
    "NativeFrame", "DataFrame[Any]", "LazyFrame[Any]", "DataFrameLike"
]

Metadata

Metadata

Assignees

No one assigned

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions