-
Notifications
You must be signed in to change notification settings - Fork 185
feat: Add DataFrame.from_dict
#2885
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
2b40ede
5562b00
24997d2
b523edc
695f9ac
bbefe38
ba1ea35
55ba43a
325f466
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -14,6 +14,7 @@ | |
| - estimated_size | ||
| - explode | ||
| - filter | ||
| - from_dict | ||
| - gather_every | ||
| - get_column | ||
| - group_by | ||
|
|
||
| Original file line number | Diff line number | Diff line change | ||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
|
@@ -6,6 +6,7 @@ | |||||||||||
| TYPE_CHECKING, | ||||||||||||
| Any, | ||||||||||||
| Callable, | ||||||||||||
| ClassVar, | ||||||||||||
| Generic, | ||||||||||||
| Literal, | ||||||||||||
| NoReturn, | ||||||||||||
|
|
@@ -28,6 +29,7 @@ | |||||||||||
| generate_repr, | ||||||||||||
| is_compliant_dataframe, | ||||||||||||
| is_compliant_lazyframe, | ||||||||||||
| is_eager_allowed, | ||||||||||||
| is_index_selector, | ||||||||||||
| is_list_of, | ||||||||||||
| is_sequence_like, | ||||||||||||
|
|
@@ -38,12 +40,13 @@ | |||||||||||
| ) | ||||||||||||
| from narwhals.dependencies import get_polars, is_numpy_array | ||||||||||||
| from narwhals.exceptions import InvalidIntoExprError, InvalidOperationError | ||||||||||||
| from narwhals.functions import _from_dict_no_backend | ||||||||||||
| from narwhals.schema import Schema | ||||||||||||
| from narwhals.series import Series | ||||||||||||
| from narwhals.translate import to_native | ||||||||||||
|
|
||||||||||||
| if TYPE_CHECKING: | ||||||||||||
| from collections.abc import Iterable, Iterator, Sequence | ||||||||||||
| from collections.abc import Iterable, Iterator, Mapping, Sequence | ||||||||||||
| from io import BytesIO | ||||||||||||
| from pathlib import Path | ||||||||||||
| from types import ModuleType | ||||||||||||
|
|
@@ -55,6 +58,7 @@ | |||||||||||
|
|
||||||||||||
| from narwhals._compliant import CompliantDataFrame, CompliantLazyFrame | ||||||||||||
| from narwhals._compliant.typing import CompliantExprAny, EagerNamespaceAny | ||||||||||||
| from narwhals.dtypes import DType | ||||||||||||
| from narwhals.group_by import GroupBy, LazyGroupBy | ||||||||||||
| from narwhals.typing import ( | ||||||||||||
| AsofJoinStrategy, | ||||||||||||
|
|
@@ -409,6 +413,8 @@ class DataFrame(BaseFrame[DataFrameT]): | |||||||||||
| ``` | ||||||||||||
| """ | ||||||||||||
|
|
||||||||||||
| _version: ClassVar[Version] = Version.MAIN | ||||||||||||
|
|
||||||||||||
| def _extract_compliant(self, arg: Any) -> Any: | ||||||||||||
| from narwhals.expr import Expr | ||||||||||||
| from narwhals.series import Series | ||||||||||||
|
|
@@ -452,6 +458,68 @@ def __init__(self, df: Any, *, level: Literal["full", "lazy", "interchange"]) -> | |||||||||||
| msg = f"Expected an object which implements `__narwhals_dataframe__`, got: {type(df)}" | ||||||||||||
| raise AssertionError(msg) | ||||||||||||
|
|
||||||||||||
| @classmethod | ||||||||||||
| def from_dict( | ||||||||||||
| cls, | ||||||||||||
| data: Mapping[str, Any], | ||||||||||||
| schema: Mapping[str, DType] | Schema | None = None, | ||||||||||||
| *, | ||||||||||||
| backend: ModuleType | Implementation | str | None = None, | ||||||||||||
| ) -> DataFrame[Any]: | ||||||||||||
| """Instantiate DataFrame from dictionary. | ||||||||||||
|
|
||||||||||||
| Indexes (if present, for pandas-like backends) are aligned following | ||||||||||||
| the [left-hand-rule](../concepts/pandas_index.md/). | ||||||||||||
|
|
||||||||||||
| Notes: | ||||||||||||
| For pandas-like dataframes, conversion to schema is applied after dataframe | ||||||||||||
| creation. | ||||||||||||
|
|
||||||||||||
| Arguments: | ||||||||||||
| data: Dictionary to create DataFrame from. | ||||||||||||
| schema: The DataFrame schema as Schema or dict of {name: type}. If not | ||||||||||||
| specified, the schema will be inferred by the native library. | ||||||||||||
| backend: specifies which eager backend to instantiate. Only | ||||||||||||
| necessary if inputs are not Narwhals Series. | ||||||||||||
|
|
||||||||||||
| `backend` can be specified in various ways | ||||||||||||
|
|
||||||||||||
| - As `Implementation.<BACKEND>` with `BACKEND` being `PANDAS`, `PYARROW`, | ||||||||||||
| `POLARS`, `MODIN` or `CUDF`. | ||||||||||||
| - As a string: `"pandas"`, `"pyarrow"`, `"polars"`, `"modin"` or `"cudf"`. | ||||||||||||
| - Directly as a module `pandas`, `pyarrow`, `polars`, `modin` or `cudf`. | ||||||||||||
|
|
||||||||||||
| Returns: | ||||||||||||
| A new DataFrame. | ||||||||||||
|
|
||||||||||||
| Examples: | ||||||||||||
| >>> import pandas as pd | ||||||||||||
| >>> import narwhals as nw | ||||||||||||
| >>> data = {"c": [5, 2], "d": [1, 4]} | ||||||||||||
| >>> nw.DataFrame.from_dict(data, backend="pandas") | ||||||||||||
| ┌──────────────────┐ | ||||||||||||
| |Narwhals DataFrame| | ||||||||||||
| |------------------| | ||||||||||||
| | c d | | ||||||||||||
| | 0 5 1 | | ||||||||||||
| | 1 2 4 | | ||||||||||||
| └──────────────────┘ | ||||||||||||
| """ | ||||||||||||
| if backend is None: | ||||||||||||
| data, backend = _from_dict_no_backend(data) | ||||||||||||
| implementation = Implementation.from_backend(backend) | ||||||||||||
| if is_eager_allowed(implementation): | ||||||||||||
| ns = cls._version.namespace.from_backend(implementation).compliant | ||||||||||||
| compliant = ns._dataframe.from_dict(data, schema=schema, context=ns) | ||||||||||||
| return cls(compliant, level="full") | ||||||||||||
| # NOTE: (#2786) needs resolving for extensions | ||||||||||||
| msg = ( | ||||||||||||
| f"{implementation} support in Narwhals is lazy-only, but `DataFrame.from_dict` is an eager-only function.\n\n" | ||||||||||||
| "Hint: you may want to use an eager backend and then call `.lazy`, e.g.:\n\n" | ||||||||||||
| f" nw.DataFrame.from_dict({{'a': [1, 2]}}, backend='pyarrow').lazy('{implementation}')" | ||||||||||||
| ) | ||||||||||||
| raise ValueError(msg) | ||||||||||||
|
Comment on lines
+511
to
+521
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. i'm confused as to why is this necessary - if we have a …
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I was just adapting the existing error in narwhals/narwhals/functions.py, Lines 328 to 332 in ee06d33.
Did you want this version to do something different if we wrote the following?
import narwhals as nw
nw.DataFrame.from_dict({'a': [1, 2]}, backend="pyspark")
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. ah I see sorry, this is indeed needed and correct, thanks! my only other comment would be - do we want to repeat the docstring, or do like numpy and say "see [...] for full docstring"? probably fine to repeat? https://numpy.org/doc/stable/reference/generated/numpy.ndarray.std.html
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Definitely repeat! I'm guessing even to get that doc you'd need to not have type stubs or have a jupyter kernel running?
I'm always gonna feel strongly about IDE-experience-first-docs, since that's how I consume them. Sadly, useful docs + static typing often mean we need to repeat ourselves. |
||||||||||||
|
|
||||||||||||
| @property | ||||||||||||
| def implementation(self) -> Implementation: | ||||||||||||
| """Return implementation of native frame. | ||||||||||||
|
|
||||||||||||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,94 @@ | ||
| from __future__ import annotations | ||
|
|
||
| from typing import TYPE_CHECKING | ||
|
|
||
| import pytest | ||
|
|
||
| import narwhals as nw | ||
| from narwhals._utils import Implementation | ||
| from tests.utils import Constructor, assert_equal_data | ||
|
|
||
| if TYPE_CHECKING: | ||
| from narwhals._namespace import EagerAllowed | ||
|
|
||
|
|
||
| def test_from_dict(eager_backend: EagerAllowed) -> None: | ||
| result = nw.DataFrame.from_dict({"c": [1, 2], "d": [5, 6]}, backend=eager_backend) | ||
| expected = {"c": [1, 2], "d": [5, 6]} | ||
| assert_equal_data(result, expected) | ||
| assert isinstance(result, nw.DataFrame) | ||
|
|
||
|
|
||
| def test_from_dict_schema(eager_backend: EagerAllowed) -> None: | ||
| schema = {"c": nw.Int16(), "d": nw.Float32()} | ||
| result = nw.DataFrame.from_dict( | ||
| {"c": [1, 2], "d": [5, 6]}, backend=eager_backend, schema=schema | ||
| ) | ||
| assert result.collect_schema() == schema | ||
|
|
||
|
|
||
| @pytest.mark.parametrize("backend", [Implementation.POLARS, "polars"]) | ||
| def test_from_dict_without_backend( | ||
| constructor: Constructor, backend: EagerAllowed | ||
| ) -> None: | ||
| pytest.importorskip("polars") | ||
|
|
||
| df = ( | ||
| nw.from_native(constructor({"a": [1, 2, 3], "b": [4, 5, 6]})) | ||
| .lazy() | ||
| .collect(backend=backend) | ||
| ) | ||
| result = nw.DataFrame.from_dict({"c": df["a"], "d": df["b"]}) | ||
| assert_equal_data(result, {"c": [1, 2, 3], "d": [4, 5, 6]}) | ||
|
|
||
|
|
||
| def test_from_dict_without_backend_invalid(constructor: Constructor) -> None: | ||
| df = nw.from_native(constructor({"a": [1, 2, 3], "b": [4, 5, 6]})).lazy().collect() | ||
| with pytest.raises(TypeError, match="backend"): | ||
| nw.DataFrame.from_dict({"c": nw.to_native(df["a"]), "d": nw.to_native(df["b"])}) | ||
|
|
||
|
|
||
| def test_from_dict_with_backend_invalid() -> None: | ||
| pytest.importorskip("duckdb") | ||
| with pytest.raises(ValueError, match="lazy-only"): | ||
| nw.DataFrame.from_dict({"c": [1, 2], "d": [5, 6]}, backend="duckdb") | ||
|
|
||
|
|
||
| @pytest.mark.parametrize("backend", [Implementation.POLARS, "polars"]) | ||
| def test_from_dict_one_native_one_narwhals( | ||
| constructor: Constructor, backend: EagerAllowed | ||
| ) -> None: | ||
| pytest.importorskip("polars") | ||
|
|
||
| df = ( | ||
| nw.from_native(constructor({"a": [1, 2, 3], "b": [4, 5, 6]})) | ||
| .lazy() | ||
| .collect(backend=backend) | ||
| ) | ||
| result = nw.DataFrame.from_dict({"c": nw.to_native(df["a"]), "d": df["b"]}) | ||
| expected = {"c": [1, 2, 3], "d": [4, 5, 6]} | ||
| assert_equal_data(result, expected) | ||
|
|
||
|
|
||
| def test_from_dict_empty(eager_backend: EagerAllowed) -> None: | ||
| result = nw.DataFrame.from_dict({}, backend=eager_backend) | ||
| assert result.shape == (0, 0) | ||
|
|
||
|
|
||
| def test_from_dict_empty_with_schema(eager_backend: EagerAllowed) -> None: | ||
| schema = nw.Schema({"a": nw.String(), "b": nw.Int8()}) | ||
| result = nw.DataFrame.from_dict({}, schema, backend=eager_backend) | ||
| assert result.schema == schema | ||
|
|
||
|
|
||
| def test_alignment() -> None: | ||
| pytest.importorskip("pandas") | ||
| import pandas as pd | ||
|
|
||
| # https://github.com/narwhals-dev/narwhals/issues/1474 | ||
| df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) | ||
| result = nw.DataFrame.from_dict( | ||
| {"a": df["a"], "b": df["a"].sort_values(ascending=False)}, backend=pd | ||
| ).to_native() | ||
| expected = pd.DataFrame({"a": [1, 2, 3], "b": [3, 2, 1]}) | ||
| pd.testing.assert_frame_equal(result, expected) |

There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@MarcoGorelli
Doing this is gonna be quite handy for the rest of (#2116)
Since we're already inside the class, we can just start from the known version there
narwhals/narwhals/dataframe.py
Line 512 in 695f9ac
instead of using `_stablify` afterwards:
narwhals/narwhals/functions.py
Line 316 in f8f7f65
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
nice!