diff --git a/docs/api-reference/dataframe.md b/docs/api-reference/dataframe.md index 84b279c512..fe44cd3e71 100644 --- a/docs/api-reference/dataframe.md +++ b/docs/api-reference/dataframe.md @@ -14,6 +14,7 @@ - estimated_size - explode - filter + - from_dict - gather_every - get_column - group_by diff --git a/narwhals/dataframe.py b/narwhals/dataframe.py index 5ac80cbcd8..56aaa3ccc6 100644 --- a/narwhals/dataframe.py +++ b/narwhals/dataframe.py @@ -6,6 +6,7 @@ TYPE_CHECKING, Any, Callable, + ClassVar, Generic, Literal, NoReturn, @@ -28,6 +29,7 @@ generate_repr, is_compliant_dataframe, is_compliant_lazyframe, + is_eager_allowed, is_index_selector, is_list_of, is_sequence_like, @@ -38,12 +40,13 @@ ) from narwhals.dependencies import get_polars, is_numpy_array from narwhals.exceptions import InvalidIntoExprError, InvalidOperationError +from narwhals.functions import _from_dict_no_backend from narwhals.schema import Schema from narwhals.series import Series from narwhals.translate import to_native if TYPE_CHECKING: - from collections.abc import Iterable, Iterator, Sequence + from collections.abc import Iterable, Iterator, Mapping, Sequence from io import BytesIO from pathlib import Path from types import ModuleType @@ -55,6 +58,7 @@ from narwhals._compliant import CompliantDataFrame, CompliantLazyFrame from narwhals._compliant.typing import CompliantExprAny, EagerNamespaceAny + from narwhals.dtypes import DType from narwhals.group_by import GroupBy, LazyGroupBy from narwhals.typing import ( AsofJoinStrategy, @@ -409,6 +413,8 @@ class DataFrame(BaseFrame[DataFrameT]): ``` """ + _version: ClassVar[Version] = Version.MAIN + def _extract_compliant(self, arg: Any) -> Any: from narwhals.expr import Expr from narwhals.series import Series @@ -452,6 +458,68 @@ def __init__(self, df: Any, *, level: Literal["full", "lazy", "interchange"]) -> msg = f"Expected an object which implements `__narwhals_dataframe__`, got: {type(df)}" raise AssertionError(msg) + @classmethod + def 
from_dict( + cls, + data: Mapping[str, Any], + schema: Mapping[str, DType] | Schema | None = None, + *, + backend: ModuleType | Implementation | str | None = None, + ) -> DataFrame[Any]: + """Instantiate DataFrame from dictionary. + + Indexes (if present, for pandas-like backends) are aligned following + the [left-hand-rule](../concepts/pandas_index.md/). + + Notes: + For pandas-like dataframes, conversion to schema is applied after dataframe + creation. + + Arguments: + data: Dictionary to create DataFrame from. + schema: The DataFrame schema as Schema or dict of {name: type}. If not + specified, the schema will be inferred by the native library. + backend: Specifies which eager backend to instantiate. Only + necessary if inputs are not Narwhals Series. + + `backend` can be specified in various ways: + + - As `Implementation.<BACKEND>` with `BACKEND` being `PANDAS`, `PYARROW`, + `POLARS`, `MODIN` or `CUDF`. + - As a string: `"pandas"`, `"pyarrow"`, `"polars"`, `"modin"` or `"cudf"`. + - Directly as a module `pandas`, `pyarrow`, `polars`, `modin` or `cudf`. + + Returns: + A new DataFrame. 
+ + Examples: + >>> import pandas as pd + >>> import narwhals as nw + >>> data = {"c": [5, 2], "d": [1, 4]} + >>> nw.DataFrame.from_dict(data, backend="pandas") + ┌──────────────────┐ + |Narwhals DataFrame| + |------------------| + | c d | + | 0 5 1 | + | 1 2 4 | + └──────────────────┘ + """ + if backend is None: + data, backend = _from_dict_no_backend(data) + implementation = Implementation.from_backend(backend) + if is_eager_allowed(implementation): + ns = cls._version.namespace.from_backend(implementation).compliant + compliant = ns._dataframe.from_dict(data, schema=schema, context=ns) + return cls(compliant, level="full") + # NOTE: (#2786) needs resolving for extensions + msg = ( + f"{implementation} support in Narwhals is lazy-only, but `DataFrame.from_dict` is an eager-only function.\n\n" + "Hint: you may want to use an eager backend and then call `.lazy`, e.g.:\n\n" + f" nw.DataFrame.from_dict({{'a': [1, 2]}}, backend='pyarrow').lazy('{implementation}')" + ) + raise ValueError(msg) + @property def implementation(self) -> Implementation: """Return implementation of native frame. diff --git a/narwhals/stable/v1/__init__.py b/narwhals/stable/v1/__init__.py index 896d13cfcb..57455f7ef5 100644 --- a/narwhals/stable/v1/__init__.py +++ b/narwhals/stable/v1/__init__.py @@ -96,6 +96,8 @@ class DataFrame(NwDataFrame[IntoDataFrameT]): + _version = Version.V1 + @inherit_doc(NwDataFrame) def __init__(self, df: Any, *, level: Literal["full", "lazy", "interchange"]) -> None: assert df._version is Version.V1 # noqa: S101 @@ -104,6 +106,17 @@ def __init__(self, df: Any, *, level: Literal["full", "lazy", "interchange"]) -> # We need to override any method which don't return Self so that type # annotations are correct. 
+ @classmethod + def from_dict( + cls, + data: Mapping[str, Any], + schema: Mapping[str, DType] | Schema | None = None, + *, + backend: ModuleType | Implementation | str | None = None, + ) -> DataFrame[Any]: + result = super().from_dict(data, schema, backend=backend) + return cast("DataFrame[Any]", result) + @property def _series(self) -> type[Series[Any]]: return cast("type[Series[Any]]", Series) diff --git a/tests/conftest.py b/tests/conftest.py index b9094b13d5..a55fa01270 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -4,11 +4,12 @@ import uuid from copy import deepcopy from functools import lru_cache +from importlib.util import find_spec from typing import TYPE_CHECKING, Any, Callable, cast import pytest -from narwhals._utils import generate_temporary_column_name +from narwhals._utils import Implementation, generate_temporary_column_name from tests.utils import PANDAS_VERSION if TYPE_CHECKING: @@ -23,6 +24,7 @@ from pyspark.sql import DataFrame as PySparkDataFrame from typing_extensions import TypeAlias + from narwhals._namespace import EagerAllowed from narwhals._spark_like.dataframe import SQLFrameDataFrame from narwhals.typing import NativeFrame, NativeLazyFrame from tests.utils import Constructor, ConstructorEager, ConstructorLazy @@ -308,3 +310,20 @@ def pytest_generate_tests(metafunc: pytest.Metafunc) -> None: ) elif "constructor" in metafunc.fixturenames: metafunc.parametrize("constructor", constructors, ids=constructors_ids) + + +TEST_EAGER_BACKENDS: list[EagerAllowed] = [] +TEST_EAGER_BACKENDS.extend( + (Implementation.POLARS, "polars") if find_spec("polars") is not None else () +) +TEST_EAGER_BACKENDS.extend( + (Implementation.PANDAS, "pandas") if find_spec("pandas") is not None else () +) +TEST_EAGER_BACKENDS.extend( + (Implementation.PYARROW, "pyarrow") if find_spec("pyarrow") is not None else () +) + + +@pytest.fixture(params=TEST_EAGER_BACKENDS) +def eager_backend(request: pytest.FixtureRequest) -> EagerAllowed: + return request.param 
# type: ignore[no-any-return] diff --git a/tests/frame/from_dict_test.py b/tests/frame/from_dict_test.py new file mode 100644 index 0000000000..0b952077df --- /dev/null +++ b/tests/frame/from_dict_test.py @@ -0,0 +1,94 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING + +import pytest + +import narwhals as nw +from narwhals._utils import Implementation +from tests.utils import Constructor, assert_equal_data + +if TYPE_CHECKING: + from narwhals._namespace import EagerAllowed + + +def test_from_dict(eager_backend: EagerAllowed) -> None: + result = nw.DataFrame.from_dict({"c": [1, 2], "d": [5, 6]}, backend=eager_backend) + expected = {"c": [1, 2], "d": [5, 6]} + assert_equal_data(result, expected) + assert isinstance(result, nw.DataFrame) + + +def test_from_dict_schema(eager_backend: EagerAllowed) -> None: + schema = {"c": nw.Int16(), "d": nw.Float32()} + result = nw.DataFrame.from_dict( + {"c": [1, 2], "d": [5, 6]}, backend=eager_backend, schema=schema + ) + assert result.collect_schema() == schema + + +@pytest.mark.parametrize("backend", [Implementation.POLARS, "polars"]) +def test_from_dict_without_backend( + constructor: Constructor, backend: EagerAllowed +) -> None: + pytest.importorskip("polars") + + df = ( + nw.from_native(constructor({"a": [1, 2, 3], "b": [4, 5, 6]})) + .lazy() + .collect(backend=backend) + ) + result = nw.DataFrame.from_dict({"c": df["a"], "d": df["b"]}) + assert_equal_data(result, {"c": [1, 2, 3], "d": [4, 5, 6]}) + + +def test_from_dict_without_backend_invalid(constructor: Constructor) -> None: + df = nw.from_native(constructor({"a": [1, 2, 3], "b": [4, 5, 6]})).lazy().collect() + with pytest.raises(TypeError, match="backend"): + nw.DataFrame.from_dict({"c": nw.to_native(df["a"]), "d": nw.to_native(df["b"])}) + + +def test_from_dict_with_backend_invalid() -> None: + pytest.importorskip("duckdb") + with pytest.raises(ValueError, match="lazy-only"): + nw.DataFrame.from_dict({"c": [1, 2], "d": [5, 6]}, 
backend="duckdb") + + +@pytest.mark.parametrize("backend", [Implementation.POLARS, "polars"]) +def test_from_dict_one_native_one_narwhals( + constructor: Constructor, backend: EagerAllowed +) -> None: + pytest.importorskip("polars") + + df = ( + nw.from_native(constructor({"a": [1, 2, 3], "b": [4, 5, 6]})) + .lazy() + .collect(backend=backend) + ) + result = nw.DataFrame.from_dict({"c": nw.to_native(df["a"]), "d": df["b"]}) + expected = {"c": [1, 2, 3], "d": [4, 5, 6]} + assert_equal_data(result, expected) + + +def test_from_dict_empty(eager_backend: EagerAllowed) -> None: + result = nw.DataFrame.from_dict({}, backend=eager_backend) + assert result.shape == (0, 0) + + +def test_from_dict_empty_with_schema(eager_backend: EagerAllowed) -> None: + schema = nw.Schema({"a": nw.String(), "b": nw.Int8()}) + result = nw.DataFrame.from_dict({}, schema, backend=eager_backend) + assert result.schema == schema + + +def test_alignment() -> None: + pytest.importorskip("pandas") + import pandas as pd + + # https://github.com/narwhals-dev/narwhals/issues/1474 + df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + result = nw.DataFrame.from_dict( + {"a": df["a"], "b": df["a"].sort_values(ascending=False)}, backend=pd + ).to_native() + expected = pd.DataFrame({"a": [1, 2, 3], "b": [3, 2, 1]}) + pd.testing.assert_frame_equal(result, expected) diff --git a/tests/v1_test.py b/tests/v1_test.py index b30c3e3f14..8d6d79ddf1 100644 --- a/tests/v1_test.py +++ b/tests/v1_test.py @@ -35,6 +35,7 @@ if TYPE_CHECKING: from typing_extensions import assert_type + from narwhals._namespace import EagerAllowed from narwhals.typing import IntoDataFrameT from tests.utils import Constructor, ConstructorEager @@ -570,7 +571,8 @@ def test_dataframe_recursive_v1() -> None: pl_frame = pl.DataFrame({"a": [1, 2, 3]}) nw_frame = nw_v1.from_native(pl_frame) - with pytest.raises(AttributeError): + # NOTE: (#2629) combined with passing in `nw_v1.DataFrame` (w/ a `_version`) into itself changes the error + with 
pytest.raises(AssertionError): nw_v1.DataFrame(nw_frame, level="full") nw_frame_early_return = nw_v1.from_native(nw_frame) @@ -988,3 +990,13 @@ def minimal_function(data: nw_v1.Series[Any]) -> None: # check this doesn't raise type-checking errors minimal_function(col) assert isinstance(col, nw_v1.Series) + + +def test_dataframe_from_dict(eager_backend: EagerAllowed) -> None: + schema = {"c": nw_v1.Int16(), "d": nw_v1.Float32()} + result = nw_v1.DataFrame.from_dict( + {"c": [1, 2], "d": [5, 6]}, backend=eager_backend, schema=schema + ) + assert result.collect_schema() == schema + assert result._version is Version.V1 + assert isinstance(result, nw_v1.DataFrame) diff --git a/utils/check_api_reference.py b/utils/check_api_reference.py index d9de26d689..34aa838a89 100644 --- a/utils/check_api_reference.py +++ b/utils/check_api_reference.py @@ -8,6 +8,7 @@ from collections import deque from inspect import isfunction from pathlib import Path +from types import MethodType from typing import TYPE_CHECKING, Any import polars as pl @@ -22,15 +23,16 @@ if sys.version_info >= (3, 13): def _is_public_method_or_property(obj: Any) -> bool: - return (isfunction(obj) or isinstance(obj, property)) and obj.__name__.startswith( - LOWERCASE - ) + return ( + isfunction(obj) or isinstance(obj, (MethodType, property)) + ) and obj.__name__.startswith(LOWERCASE) else: def _is_public_method_or_property(obj: Any) -> bool: - return (isfunction(obj) and obj.__name__.startswith(LOWERCASE)) or ( - isinstance(obj, property) and obj.fget.__name__.startswith(LOWERCASE) - ) + return ( + (isfunction(obj) or isinstance(obj, MethodType)) + and obj.__name__.startswith(LOWERCASE) + ) or (isinstance(obj, property) and obj.fget.__name__.startswith(LOWERCASE)) def iter_api_reference_names(tp: type[Any]) -> Iterator[str]: