Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/api-reference/dataframe.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
- estimated_size
- explode
- filter
- from_dict
- gather_every
- get_column
- group_by
Expand Down
70 changes: 69 additions & 1 deletion narwhals/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
TYPE_CHECKING,
Any,
Callable,
ClassVar,
Generic,
Literal,
NoReturn,
Expand All @@ -28,6 +29,7 @@
generate_repr,
is_compliant_dataframe,
is_compliant_lazyframe,
is_eager_allowed,
is_index_selector,
is_list_of,
is_sequence_like,
Expand All @@ -38,12 +40,13 @@
)
from narwhals.dependencies import get_polars, is_numpy_array
from narwhals.exceptions import InvalidIntoExprError, InvalidOperationError
from narwhals.functions import _from_dict_no_backend
from narwhals.schema import Schema
from narwhals.series import Series
from narwhals.translate import to_native

if TYPE_CHECKING:
from collections.abc import Iterable, Iterator, Sequence
from collections.abc import Iterable, Iterator, Mapping, Sequence
from io import BytesIO
from pathlib import Path
from types import ModuleType
Expand All @@ -55,6 +58,7 @@

from narwhals._compliant import CompliantDataFrame, CompliantLazyFrame
from narwhals._compliant.typing import CompliantExprAny, EagerNamespaceAny
from narwhals.dtypes import DType
from narwhals.group_by import GroupBy, LazyGroupBy
from narwhals.typing import (
AsofJoinStrategy,
Expand Down Expand Up @@ -409,6 +413,8 @@ class DataFrame(BaseFrame[DataFrameT]):
```
"""

_version: ClassVar[Version] = Version.MAIN
Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@MarcoGorelli

Doing this is gonna be quite handy for the rest of (#2116)

Since we're already inside the class, we can just start from the known version there

ns = cls._version.namespace.from_backend(implementation).compliant

instead of using `_stablify` afterwards 😄

ns = Version.MAIN.namespace.from_backend(implementation).compliant

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nice!


def _extract_compliant(self, arg: Any) -> Any:
from narwhals.expr import Expr
from narwhals.series import Series
Expand Down Expand Up @@ -452,6 +458,68 @@ def __init__(self, df: Any, *, level: Literal["full", "lazy", "interchange"]) ->
msg = f"Expected an object which implements `__narwhals_dataframe__`, got: {type(df)}"
raise AssertionError(msg)

@classmethod
def from_dict(
    cls,
    data: Mapping[str, Any],
    schema: Mapping[str, DType] | Schema | None = None,
    *,
    backend: ModuleType | Implementation | str | None = None,
) -> DataFrame[Any]:
    """Instantiate DataFrame from dictionary.

    Indexes (if present, for pandas-like backends) are aligned following
    the [left-hand-rule](../concepts/pandas_index.md/).

    Notes:
        For pandas-like dataframes, conversion to schema is applied after dataframe
        creation.

    Arguments:
        data: Dictionary to create DataFrame from.
        schema: The DataFrame schema as Schema or dict of {name: type}. If not
            specified, the schema will be inferred by the native library.
        backend: Specifies which eager backend to instantiate. Only
            necessary if inputs are not Narwhals Series.

            `backend` can be specified in various ways

            - As `Implementation.<BACKEND>` with `BACKEND` being `PANDAS`, `PYARROW`,
                `POLARS`, `MODIN` or `CUDF`.
            - As a string: `"pandas"`, `"pyarrow"`, `"polars"`, `"modin"` or `"cudf"`.
            - Directly as a module `pandas`, `pyarrow`, `polars`, `modin` or `cudf`.

    Returns:
        A new DataFrame.

    Raises:
        ValueError: If `backend` resolves to an implementation that Narwhals
            supports lazily only.

    Examples:
        >>> import pandas as pd
        >>> import narwhals as nw
        >>> data = {"c": [5, 2], "d": [1, 4]}
        >>> nw.DataFrame.from_dict(data, backend="pandas")
        ┌──────────────────┐
        |Narwhals DataFrame|
        |------------------|
        |        c  d      |
        |     0  5  1      |
        |     1  2  4      |
        └──────────────────┘
    """
    if backend is None:
        # No explicit backend: the helper hands back both the data to build
        # from and the backend to use.
        data, backend = _from_dict_no_backend(data)
    implementation = Implementation.from_backend(backend)
    if is_eager_allowed(implementation):
        # Resolve the namespace through `cls._version` so that a subclass which
        # overrides `_version` builds frames of its own version.
        ns = cls._version.namespace.from_backend(implementation).compliant
        compliant = ns._dataframe.from_dict(data, schema=schema, context=ns)
        return cls(compliant, level="full")
    # NOTE: (#2786) needs resolving for extensions
    msg = (
        f"{implementation} support in Narwhals is lazy-only, but `DataFrame.from_dict` is an eager-only function.\n\n"
        "Hint: you may want to use an eager backend and then call `.lazy`, e.g.:\n\n"
        f"    nw.DataFrame.from_dict({{'a': [1, 2]}}, backend='pyarrow').lazy('{implementation}')"
    )
    raise ValueError(msg)
Comment on lines +511 to +521
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm confused as to why this is necessary - if we have a DataFrame class, aren't we already eager?

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I was just adapting the existing error in nw.from_dict

msg = (
f"{implementation} support in Narwhals is lazy-only, but `from_dict` is an eager-only function.\n\n"
"Hint: you may want to use an eager backend and then call `.lazy`, e.g.:\n\n"
f" nw.from_dict({{'a': [1, 2]}}, backend='pyarrow').lazy('{implementation}')"
)

Did you want this version to do something different if we wrote?

import narwhals as nw

nw.DataFrame.from_dict({'a': [1, 2]}, backend="pyspark")

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ah I see sorry, this is indeed needed and correct, thanks!

my only other comment would be - do we want to repeat the docstring, or do like numpy and say "see [...] for full docstring"? probably fine to repeat?

https://numpy.org/doc/stable/reference/generated/numpy.ndarray.std.html

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

"see [...] for full docstring"? probably fine to repeat?

Definitely repeat!

I'm guessing even to get that doc you'd need to not have type stubs or have a jupyter kernel running?

image

I'm always gonna feel strongly about IDE-experience-first-docs, since that's how I consume them 🙂

Sadly useful docs + static typing often mean we need to repeat ourselves.
But putting in that extra effort is something I really appreciate from polars vs pandas


@property
def implementation(self) -> Implementation:
"""Return implementation of native frame.
Expand Down
13 changes: 13 additions & 0 deletions narwhals/stable/v1/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,8 @@


class DataFrame(NwDataFrame[IntoDataFrameT]):
_version = Version.V1

@inherit_doc(NwDataFrame)
def __init__(self, df: Any, *, level: Literal["full", "lazy", "interchange"]) -> None:
assert df._version is Version.V1 # noqa: S101
Expand All @@ -104,6 +106,17 @@ def __init__(self, df: Any, *, level: Literal["full", "lazy", "interchange"]) ->
# We need to override any method which don't return Self so that type
# annotations are correct.

@classmethod
def from_dict(
    cls,
    data: Mapping[str, Any],
    schema: Mapping[str, DType] | Schema | None = None,
    *,
    backend: ModuleType | Implementation | str | None = None,
) -> DataFrame[Any]:
    # Overridden only so the return annotation names this module's `DataFrame`;
    # construction itself is delegated unchanged to the parent classmethod.
    return cast("DataFrame[Any]", super().from_dict(data, schema, backend=backend))

@property
def _series(self) -> type[Series[Any]]:
return cast("type[Series[Any]]", Series)
Expand Down
21 changes: 20 additions & 1 deletion tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,12 @@
import uuid
from copy import deepcopy
from functools import lru_cache
from importlib.util import find_spec
from typing import TYPE_CHECKING, Any, Callable, cast

import pytest

from narwhals._utils import generate_temporary_column_name
from narwhals._utils import Implementation, generate_temporary_column_name
from tests.utils import PANDAS_VERSION

if TYPE_CHECKING:
Expand All @@ -23,6 +24,7 @@
from pyspark.sql import DataFrame as PySparkDataFrame
from typing_extensions import TypeAlias

from narwhals._namespace import EagerAllowed
from narwhals._spark_like.dataframe import SQLFrameDataFrame
from narwhals.typing import NativeFrame, NativeLazyFrame
from tests.utils import Constructor, ConstructorEager, ConstructorLazy
Expand Down Expand Up @@ -308,3 +310,20 @@ def pytest_generate_tests(metafunc: pytest.Metafunc) -> None:
)
elif "constructor" in metafunc.fixturenames:
metafunc.parametrize("constructor", constructors, ids=constructors_ids)


# Eager backends to parametrize tests over. Every backend that `find_spec`
# can locate in the current environment contributes two entries: its
# `Implementation` member and its string name, so both spellings of a
# `backend=` argument get exercised. Backends that are not installed
# contribute nothing (the empty tuple).
TEST_EAGER_BACKENDS: list[EagerAllowed] = []
TEST_EAGER_BACKENDS.extend(
    (Implementation.POLARS, "polars") if find_spec("polars") is not None else ()
)
TEST_EAGER_BACKENDS.extend(
    (Implementation.PANDAS, "pandas") if find_spec("pandas") is not None else ()
)
TEST_EAGER_BACKENDS.extend(
    (Implementation.PYARROW, "pyarrow") if find_spec("pyarrow") is not None else ()
)


@pytest.fixture(params=TEST_EAGER_BACKENDS)
def eager_backend(request: pytest.FixtureRequest) -> EagerAllowed:
    """Return the `TEST_EAGER_BACKENDS` entry selected for the current test run."""
    # `request.param` is untyped; binding it to an annotated local gives the
    # return a concrete type.
    backend: EagerAllowed = request.param
    return backend
94 changes: 94 additions & 0 deletions tests/frame/from_dict_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
from __future__ import annotations

from typing import TYPE_CHECKING

import pytest

import narwhals as nw
from narwhals._utils import Implementation
from tests.utils import Constructor, assert_equal_data

if TYPE_CHECKING:
from narwhals._namespace import EagerAllowed


def test_from_dict(eager_backend: EagerAllowed) -> None:
    """Plain column data with an explicit eager backend round-trips unchanged."""
    columns = {"c": [1, 2], "d": [5, 6]}
    frame = nw.DataFrame.from_dict(columns, backend=eager_backend)
    expected = {"c": [1, 2], "d": [5, 6]}
    assert_equal_data(frame, expected)
    assert isinstance(frame, nw.DataFrame)


def test_from_dict_schema(eager_backend: EagerAllowed) -> None:
    """An explicit `schema` determines the schema of the resulting frame."""
    schema = {"c": nw.Int16(), "d": nw.Float32()}
    columns = {"c": [1, 2], "d": [5, 6]}
    frame = nw.DataFrame.from_dict(columns, backend=eager_backend, schema=schema)
    assert frame.collect_schema() == schema


@pytest.mark.parametrize("backend", [Implementation.POLARS, "polars"])
def test_from_dict_without_backend(
    constructor: Constructor, backend: EagerAllowed
) -> None:
    """When every value is a Narwhals Series, `backend` may be omitted."""
    pytest.importorskip("polars")

    native = constructor({"a": [1, 2, 3], "b": [4, 5, 6]})
    df = nw.from_native(native).lazy().collect(backend=backend)
    columns = {"c": df["a"], "d": df["b"]}
    result = nw.DataFrame.from_dict(columns)
    assert_equal_data(result, {"c": [1, 2, 3], "d": [4, 5, 6]})


def test_from_dict_without_backend_invalid(constructor: Constructor) -> None:
    """Native-only values with no `backend` raise a `TypeError` mentioning "backend"."""
    native = constructor({"a": [1, 2, 3], "b": [4, 5, 6]})
    df = nw.from_native(native).lazy().collect()
    native_columns = {"c": nw.to_native(df["a"]), "d": nw.to_native(df["b"])}
    with pytest.raises(TypeError, match="backend"):
        nw.DataFrame.from_dict(native_columns)


def test_from_dict_with_backend_invalid() -> None:
    """Passing the duckdb backend raises a `ValueError` mentioning "lazy-only"."""
    pytest.importorskip("duckdb")
    columns = {"c": [1, 2], "d": [5, 6]}
    with pytest.raises(ValueError, match="lazy-only"):
        nw.DataFrame.from_dict(columns, backend="duckdb")


@pytest.mark.parametrize("backend", [Implementation.POLARS, "polars"])
def test_from_dict_one_native_one_narwhals(
    constructor: Constructor, backend: EagerAllowed
) -> None:
    """A mix of one native and one Narwhals Series works without `backend`."""
    pytest.importorskip("polars")

    native = constructor({"a": [1, 2, 3], "b": [4, 5, 6]})
    df = nw.from_native(native).lazy().collect(backend=backend)
    mixed_columns = {"c": nw.to_native(df["a"]), "d": df["b"]}
    result = nw.DataFrame.from_dict(mixed_columns)
    expected = {"c": [1, 2, 3], "d": [4, 5, 6]}
    assert_equal_data(result, expected)


def test_from_dict_empty(eager_backend: EagerAllowed) -> None:
    """An empty mapping produces a frame whose shape is (0, 0)."""
    empty_frame = nw.DataFrame.from_dict({}, backend=eager_backend)
    assert empty_frame.shape == (0, 0)


def test_from_dict_empty_with_schema(eager_backend: EagerAllowed) -> None:
    """An empty mapping plus a positional `schema` yields a frame with that schema."""
    expected_schema = nw.Schema({"a": nw.String(), "b": nw.Int8()})
    # `schema` is deliberately passed positionally here.
    frame = nw.DataFrame.from_dict({}, expected_schema, backend=eager_backend)
    assert frame.schema == expected_schema


def test_alignment() -> None:
    """pandas inputs with differing indexes are combined positionally.

    Column "b" is column "a" sorted descending, so its index labels are
    reversed. The expected frame keeps its values in that order ([3, 2, 1])
    instead of re-ordering them by index label.
    """
    pytest.importorskip("pandas")
    import pandas as pd

    # https://github.com/narwhals-dev/narwhals/issues/1474
    source = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
    columns = {"a": source["a"], "b": source["a"].sort_values(ascending=False)}
    native_result = nw.DataFrame.from_dict(columns, backend=pd).to_native()
    expected = pd.DataFrame({"a": [1, 2, 3], "b": [3, 2, 1]})
    pd.testing.assert_frame_equal(native_result, expected)
14 changes: 13 additions & 1 deletion tests/v1_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@
if TYPE_CHECKING:
from typing_extensions import assert_type

from narwhals._namespace import EagerAllowed
from narwhals.typing import IntoDataFrameT
from tests.utils import Constructor, ConstructorEager

Expand Down Expand Up @@ -570,7 +571,8 @@ def test_dataframe_recursive_v1() -> None:

pl_frame = pl.DataFrame({"a": [1, 2, 3]})
nw_frame = nw_v1.from_native(pl_frame)
with pytest.raises(AttributeError):
# NOTE: (#2629) combined with passing in `nw_v1.DataFrame` (w/ a `_version`) into itself changes the error
with pytest.raises(AssertionError):
nw_v1.DataFrame(nw_frame, level="full")

nw_frame_early_return = nw_v1.from_native(nw_frame)
Expand Down Expand Up @@ -988,3 +990,13 @@ def minimal_function(data: nw_v1.Series[Any]) -> None:
# check this doesn't raise type-checking errors
minimal_function(col)
assert isinstance(col, nw_v1.Series)


def test_dataframe_from_dict(eager_backend: EagerAllowed) -> None:
    """`nw_v1.DataFrame.from_dict` returns a V1 frame that honours the schema."""
    schema = {"c": nw_v1.Int16(), "d": nw_v1.Float32()}
    columns = {"c": [1, 2], "d": [5, 6]}
    frame = nw_v1.DataFrame.from_dict(columns, backend=eager_backend, schema=schema)
    assert frame.collect_schema() == schema
    assert frame._version is Version.V1
    assert isinstance(frame, nw_v1.DataFrame)
14 changes: 8 additions & 6 deletions utils/check_api_reference.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from collections import deque
from inspect import isfunction
from pathlib import Path
from types import MethodType
from typing import TYPE_CHECKING, Any

import polars as pl
Expand All @@ -22,15 +23,16 @@
if sys.version_info >= (3, 13):

def _is_public_method_or_property(obj: Any) -> bool:
return (isfunction(obj) or isinstance(obj, property)) and obj.__name__.startswith(
LOWERCASE
)
return (
isfunction(obj) or isinstance(obj, (MethodType, property))
) and obj.__name__.startswith(LOWERCASE)
else:

def _is_public_method_or_property(obj: Any) -> bool:
return (isfunction(obj) and obj.__name__.startswith(LOWERCASE)) or (
isinstance(obj, property) and obj.fget.__name__.startswith(LOWERCASE)
)
return (
(isfunction(obj) or isinstance(obj, MethodType))
and obj.__name__.startswith(LOWERCASE)
) or (isinstance(obj, property) and obj.fget.__name__.startswith(LOWERCASE))


def iter_api_reference_names(tp: type[Any]) -> Iterator[str]:
Expand Down
Loading