Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
12330d5
feat(typing): Add `ArrowConvertible`
dangotbanned Mar 28, 2025
83b2752
feat: Add `CompliantDataFrame.from_arrow`
dangotbanned Mar 28, 2025
7c306dc
feat(DRAFT): Add `PolarsDataFrame.from_arrow`
dangotbanned Mar 28, 2025
038c8da
feat(DRAFT): Add `ArrowDataFrame.from_arrow`
dangotbanned Mar 28, 2025
51bef83
feat(DRAFT): Add `PandasLikeDataFrame.from_arrow`
dangotbanned Mar 28, 2025
79f080a
chore: Add `version` to `_from_arrow_impl`
dangotbanned Mar 28, 2025
1812d28
fix(typing): Help `mypy`
dangotbanned Mar 28, 2025
1f92a58
refactor: Reuse `ArrowDataFrame` impl for `PolarsDataFrame`
dangotbanned Mar 28, 2025
41e4034
fix: `UnboundLocalError` oops
dangotbanned Mar 28, 2025
e52edee
refactor: Naive re-implement `from_arrow`
dangotbanned Mar 28, 2025
8d95cb2
ignore banned import
dangotbanned Mar 28, 2025
299be9c
recover `PandasLike` assertion error
dangotbanned Mar 28, 2025
976cffb
fix: correct typo in original message
dangotbanned Mar 28, 2025
7a31e65
chore: Add `_into_arrow_table`
dangotbanned Mar 28, 2025
f5c79cc
refactor: Use `_into_arrow_table`
dangotbanned Mar 28, 2025
e84a035
chore: Fill in missing error branches
dangotbanned Mar 28, 2025
392a611
chore(typing): Remove unused `native` annotation
dangotbanned Mar 28, 2025
cb6c86f
fix typo
dangotbanned Mar 28, 2025
883b927
docs: Satisfy `darglint`
dangotbanned Mar 28, 2025
655da1d
test: Get and ignore more coverage
dangotbanned Mar 28, 2025
adb6bf1
docs: Add `IntoArrowTable` alias
dangotbanned Mar 28, 2025
bdeaaac
test: Try `test_from_arrow_to_polars`
dangotbanned Mar 28, 2025
7bf8b27
test: no cover
dangotbanned Mar 28, 2025
3843bc2
Merge branch 'main' into df-from-arrow
dangotbanned Mar 28, 2025
ced6d07
Merge branch 'main' into df-from-arrow
dangotbanned Mar 29, 2025
5802361
Merge branch 'main' into df-from-arrow
dangotbanned Mar 29, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 23 additions & 0 deletions narwhals/_arrow/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from functools import partial
from typing import TYPE_CHECKING
from typing import Any
from typing import Collection
from typing import Iterator
from typing import Literal
from typing import Mapping
Expand Down Expand Up @@ -32,6 +33,7 @@
from narwhals.utils import parse_columns_to_drop
from narwhals.utils import parse_version
from narwhals.utils import scale_bytes
from narwhals.utils import supports_arrow_c_stream
from narwhals.utils import validate_backend_version

if TYPE_CHECKING:
Expand All @@ -52,6 +54,7 @@
from narwhals._arrow.typing import Indices # type: ignore[attr-defined]
from narwhals._arrow.typing import Mask # type: ignore[attr-defined]
from narwhals._arrow.typing import Order # type: ignore[attr-defined]
from narwhals._translate import IntoArrowTable
from narwhals.dtypes import DType
from narwhals.schema import Schema
from narwhals.typing import CompliantDataFrame
Expand Down Expand Up @@ -93,6 +96,26 @@ def __init__(
self._version = version
validate_backend_version(self._implementation, self._backend_version)

@classmethod
def from_arrow(cls, data: IntoArrowTable, /, *, context: _FullContext) -> Self:
backend_version = context._backend_version
if isinstance(data, pa.Table):
native = data
elif backend_version >= (14,) or isinstance(data, Collection):
Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Supporting Collection here might seem strange.

My thinking is that were a list | tuple | dict of pyarrow objects passed - pa.table would still be capable of turning that into a pa.Table:
https://github.com/apache/arrow/blob/7c1b96e26910dd553ee0ccf0792e1854af6b0021/python/pyarrow/table.pxi#L4993-L5000

native = pa.table(data)
elif supports_arrow_c_stream(data): # pragma: no cover
msg = f"PyArrow>=14.0.0 is required for `from_arrow` for object of type {type(data).__name__!r}."
raise ModuleNotFoundError(msg)
else: # pragma: no cover
msg = f"`from_arrow` is not supported for object of type {type(data).__name__!r}."
raise TypeError(msg)
return cls(
native,
backend_version=backend_version,
version=context._version,
validate_column_names=True,
)

@classmethod
def from_dict(
cls,
Expand Down
5 changes: 5 additions & 0 deletions narwhals/_compliant/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
from narwhals._compliant.typing import EagerSeriesT
from narwhals._compliant.typing import NativeFrameT_co
from narwhals._expression_parsing import evaluate_output_names_and_aliases
from narwhals._translate import ArrowConvertible
from narwhals._translate import DictConvertible
from narwhals._translate import NumpyConvertible
from narwhals.utils import Version
Expand All @@ -35,6 +36,7 @@
from typing_extensions import TypeAlias

from narwhals._compliant.group_by import CompliantGroupBy
from narwhals._translate import IntoArrowTable
from narwhals.dtypes import DType
from narwhals.schema import Schema
from narwhals.typing import SizeUnit
Expand All @@ -54,6 +56,7 @@
class CompliantDataFrame(
NumpyConvertible["_2DArray", "_2DArray"],
DictConvertible["_ToDict[CompliantSeriesT]", Mapping[str, Any]],
ArrowConvertible["pa.Table", "IntoArrowTable"],
_StoresNative[NativeFrameT_co],
Sized,
Protocol[CompliantSeriesT, CompliantExprT_contra, NativeFrameT_co],
Expand All @@ -66,6 +69,8 @@ class CompliantDataFrame(
def __narwhals_dataframe__(self) -> Self: ...
def __narwhals_namespace__(self) -> Any: ...
@classmethod
def from_arrow(cls, data: IntoArrowTable, /, *, context: _FullContext) -> Self: ...
@classmethod
def from_dict(
cls,
data: Mapping[str, Any],
Expand Down
25 changes: 25 additions & 0 deletions narwhals/_pandas_like/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
from narwhals.exceptions import InvalidOperationError
from narwhals.exceptions import ShapeError
from narwhals.utils import Implementation
from narwhals.utils import _into_arrow_table
from narwhals.utils import _remap_full_join_keys
from narwhals.utils import check_column_exists
from narwhals.utils import generate_temporary_column_name
Expand All @@ -55,6 +56,7 @@
from narwhals._pandas_like.expr import PandasLikeExpr
from narwhals._pandas_like.group_by import PandasLikeGroupBy
from narwhals._pandas_like.namespace import PandasLikeNamespace
from narwhals._translate import IntoArrowTable
from narwhals.dtypes import DType
from narwhals.schema import Schema
from narwhals.typing import CompliantDataFrame
Expand Down Expand Up @@ -114,6 +116,29 @@ def __init__(
if validate_column_names:
check_column_names_are_unique(native_dataframe.columns)

@classmethod
def from_arrow(cls, data: IntoArrowTable, /, *, context: _FullContext) -> Self:
implementation = context._implementation
tbl = _into_arrow_table(data, context)
if implementation.is_pandas():
native = tbl.to_pandas()
elif implementation.is_modin(): # pragma: no cover
from modin.pandas.utils import from_arrow as mpd_from_arrow

native = mpd_from_arrow(tbl)
elif implementation.is_cudf(): # pragma: no cover
native = implementation.to_native_namespace().DataFrame.from_arrow(tbl)
else: # pragma: no cover
msg = "congratulations, you entered unreachable code - please report a bug"
raise AssertionError(msg)
Comment on lines +132 to +133
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

lol, can't believe i wrote this, we may want to make a utility with a consistent message for these at some point

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Certainly made me smile when I copied it over 😄

return cls(
native,
implementation=implementation,
backend_version=context._backend_version,
version=context._version,
validate_column_names=True,
)

@classmethod
def from_dict(
cls,
Expand Down
12 changes: 12 additions & 0 deletions narwhals/_polars/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from typing import Literal
from typing import Mapping
from typing import Sequence
from typing import cast
from typing import overload

import polars as pl
Expand All @@ -17,6 +18,7 @@
from narwhals._polars.utils import native_to_narwhals_dtype
from narwhals.exceptions import ColumnNotFoundError
from narwhals.utils import Implementation
from narwhals.utils import _into_arrow_table
from narwhals.utils import is_sequence_but_not_str
from narwhals.utils import parse_columns_to_drop
from narwhals.utils import parse_version
Expand All @@ -35,6 +37,7 @@
from narwhals._polars.group_by import PolarsGroupBy
from narwhals._polars.group_by import PolarsLazyGroupBy
from narwhals._polars.series import PolarsSeries
from narwhals._translate import IntoArrowTable
from narwhals.dtypes import DType
from narwhals.schema import Schema
from narwhals.typing import CompliantDataFrame
Expand Down Expand Up @@ -94,6 +97,15 @@ def __init__(
self._version = version
validate_backend_version(self._implementation, self._backend_version)

@classmethod
def from_arrow(cls, data: IntoArrowTable, /, *, context: _FullContext) -> Self:
    """Construct a frame from a `pyarrow.Table` or an Arrow-stream-exportable object.

    Arguments:
        data: Object to convert.
        context: Supplies the `_backend_version` and `_version` stored on the
            new frame.

    Returns:
        A new instance wrapping the constructed `pl.DataFrame`.
    """
    polars_version = context._backend_version
    if polars_version < (1, 3):
        # Before Polars 1.3 the input is first normalised with
        # `_into_arrow_table` and then handed to `pl.from_arrow`.
        arrow_table = _into_arrow_table(data, context)
        frame = cast("pl.DataFrame", pl.from_arrow(arrow_table))
    else:
        # From 1.3 onwards the `pl.DataFrame` constructor is given `data` directly.
        frame = pl.DataFrame(data)
    return cls(frame, backend_version=polars_version, version=context._version)

@classmethod
def from_dict(
cls,
Expand Down
34 changes: 34 additions & 0 deletions narwhals/_translate.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,9 @@
from typing import Protocol

if TYPE_CHECKING:
import pyarrow as pa
from typing_extensions import Self
from typing_extensions import TypeAlias
from typing_extensions import TypeVar


Expand Down Expand Up @@ -39,6 +41,10 @@ def TypeVar( # noqa: ANN202, N802
)


class ArrowStreamExportable(Protocol):
    """Structural type for objects that define `__arrow_c_stream__`."""

    def __arrow_c_stream__(self, requested_schema: object | None = None) -> object:
        """Export this object as an Arrow C stream."""
        ...


ToNumpyT_co = TypeVar("ToNumpyT_co", covariant=True)
FromNumpyDT_contra = TypeVar(
"FromNumpyDT_contra", contravariant=True, default=ToNumpyT_co
Expand Down Expand Up @@ -98,3 +104,31 @@ class DictConvertible(
FromDict[FromDictDT_contra],
Protocol[ToDictDT_co, FromDictDT_contra],
): ...


# Written as a string so it is not evaluated at runtime: `pa` is only imported
# under `TYPE_CHECKING` in this module.
IntoArrowTable: TypeAlias = "ArrowStreamExportable | pa.Table"
"""An object supporting the [Arrow PyCapsule Interface], or a native [`pyarrow.Table`].

[Arrow PyCapsule Interface]: https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html#arrowstream-export
[`pyarrow.Table`]: https://arrow.apache.org/docs/python/generated/pyarrow.Table.html
"""
# Covariant: the type returned by `ToArrow.to_arrow`.
ToArrowT_co = TypeVar("ToArrowT_co", covariant=True)
# Contravariant: the type accepted as `data` by `FromArrow.from_arrow`.
# Falls back to `IntoArrowTable` when not parametrized.
FromArrowDT_contra = TypeVar(
"FromArrowDT_contra", contravariant=True, default=IntoArrowTable
)


class ToArrow(Protocol[ToArrowT_co]):
    """Protocol for objects that provide a `to_arrow` export method."""

    def to_arrow(self, *args: Any, **kwds: Any) -> ToArrowT_co:
        """Return this object converted to its Arrow form (`ToArrowT_co`)."""
        ...


class FromArrow(Protocol[FromArrowDT_contra]):
    """Protocol for types that provide a `from_arrow` alternate constructor."""

    @classmethod
    def from_arrow(cls, data: FromArrowDT_contra, *args: Any, **kwds: Any) -> Self:
        """Build an instance of `cls` from `data`."""
        ...


class ArrowConvertible(
    ToArrow[ToArrowT_co],
    FromArrow[FromArrowDT_contra],
    Protocol[ToArrowT_co, FromArrowDT_contra],
):
    """Protocol combining `ToArrow` and `FromArrow` into a two-way conversion."""
65 changes: 16 additions & 49 deletions narwhals/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@
from typing import Iterable
from typing import Literal
from typing import Mapping
from typing import Protocol
from typing import Sequence
from typing import cast
from typing import overload
Expand All @@ -26,6 +25,7 @@
from narwhals.dependencies import is_narwhals_series
from narwhals.dependencies import is_numpy_array
from narwhals.dependencies import is_numpy_array_2d
from narwhals.dependencies import is_pyarrow_table
from narwhals.expr import Expr
from narwhals.series import Series
from narwhals.translate import from_native
Expand All @@ -39,6 +39,7 @@
from narwhals.utils import is_eager_allowed
from narwhals.utils import is_sequence_but_not_str
from narwhals.utils import parse_version
from narwhals.utils import supports_arrow_c_stream
from narwhals.utils import validate_laziness

if TYPE_CHECKING:
Expand All @@ -50,6 +51,7 @@

from narwhals._compliant import CompliantExpr
from narwhals._compliant import CompliantNamespace
from narwhals._translate import IntoArrowTable
from narwhals.dataframe import DataFrame
from narwhals.dataframe import LazyFrame
from narwhals.dtypes import DType
Expand All @@ -65,11 +67,6 @@

_IntoSchema: TypeAlias = "Mapping[str, DType] | Schema | Sequence[str] | None"

class ArrowStreamExportable(Protocol):
def __arrow_c_stream__(
self, requested_schema: object | None = None
) -> object: ...


@overload
def concat(
Expand Down Expand Up @@ -486,7 +483,7 @@ def _is_into_schema(obj: Any) -> TypeIs[_IntoSchema]:

@deprecate_native_namespace(warn_version="1.31.0", required=True)
def from_arrow(
native_frame: ArrowStreamExportable,
native_frame: IntoArrowTable,
*,
backend: ModuleType | Implementation | str | None = None,
native_namespace: ModuleType | None = None, # noqa: ARG001
Expand Down Expand Up @@ -535,63 +532,33 @@ def from_arrow(
└──────────────────┘
"""
backend = cast("ModuleType | Implementation | str", backend)
return _from_arrow_impl(native_frame, backend=backend)
return _from_arrow_impl(native_frame, backend=backend, version=Version.MAIN)


def _from_arrow_impl(
data: ArrowStreamExportable, *, backend: ModuleType | Implementation | str
data: IntoArrowTable,
*,
backend: ModuleType | Implementation | str,
version: Version,
) -> DataFrame[Any]:
if not hasattr(data, "__arrow_c_stream__"):
if not (supports_arrow_c_stream(data) or is_pyarrow_table(data)):
msg = f"Given object of type {type(data)} does not support PyCapsule interface"
raise TypeError(msg)

implementation = Implementation.from_backend(backend)
native_namespace = implementation.to_native_namespace()

if implementation.is_polars() and parse_version(native_namespace) >= (1, 3):
native_frame = native_namespace.DataFrame(data)
elif implementation in {
Implementation.PANDAS,
Implementation.MODIN,
Implementation.CUDF,
Implementation.POLARS,
}:
# These don't (yet?) support the PyCapsule Interface for import
# so we go via PyArrow
try:
import pyarrow as pa # ignore-banned-import
except ModuleNotFoundError as exc: # pragma: no cover
msg = f"PyArrow>=14.0.0 is required for `from_arrow` for object of type {native_namespace}"
raise ModuleNotFoundError(msg) from exc
if parse_version(pa) < (14, 0): # pragma: no cover
msg = f"PyArrow>=14.0.0 is required for `from_arrow` for object of type {native_namespace}"
raise ModuleNotFoundError(msg) from None

tbl = pa.table(data)
if implementation is Implementation.PANDAS:
native_frame = tbl.to_pandas()
elif implementation is Implementation.MODIN: # pragma: no cover
from modin.pandas.utils import from_arrow

native_frame = from_arrow(tbl)
elif implementation is Implementation.CUDF: # pragma: no cover
native_frame = native_namespace.DataFrame.from_arrow(tbl)
elif implementation is Implementation.POLARS: # pragma: no cover
native_frame = native_namespace.from_arrow(tbl)
else: # pragma: no cover
msg = "congratulations, you entered unrecheable code - please report a bug"
raise AssertionError(msg)
elif implementation is Implementation.PYARROW:
native_frame = native_namespace.table(data)
if is_eager_allowed(implementation):
ns = _into_compliant_namespace(implementation, version)
frame = ns._dataframe.from_arrow(data, context=ns)
return from_native(frame, eager_only=True)
else: # pragma: no cover
native_namespace = implementation.to_native_namespace()
try:
# implementation is UNKNOWN, Narwhals extension using this feature should
# implement PyCapsule support
native_frame = native_namespace.DataFrame(data)
except AttributeError as e:
msg = "Unknown namespace is expected to implement `DataFrame` class which accepts object which supports PyCapsule Interface."
raise AttributeError(msg) from e
return from_native(native_frame, eager_only=True)
return from_native(native_frame, eager_only=True)


def _get_sys_info() -> dict[str, str]:
Expand Down
6 changes: 3 additions & 3 deletions narwhals/stable/v1/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,8 +90,8 @@
from typing_extensions import Self
from typing_extensions import TypeVar

from narwhals._translate import IntoArrowTable
from narwhals.dtypes import DType
from narwhals.functions import ArrowStreamExportable
from narwhals.typing import IntoExpr
from narwhals.typing import IntoFrame
from narwhals.typing import IntoLazyFrameT
Expand Down Expand Up @@ -2213,7 +2213,7 @@ def new_series(

@deprecate_native_namespace(required=True)
def from_arrow(
native_frame: ArrowStreamExportable,
native_frame: IntoArrowTable,
*,
backend: ModuleType | Implementation | str | None = None,
native_namespace: ModuleType | None = None, # noqa: ARG001
Expand Down Expand Up @@ -2242,7 +2242,7 @@ def from_arrow(
"""
backend = cast("ModuleType | Implementation | str", backend)
return _stableify( # type: ignore[no-any-return]
_from_arrow_impl(native_frame, backend=backend)
_from_arrow_impl(native_frame, backend=backend, version=Version.V1)
)


Expand Down
Loading
Loading