Skip to content

Commit 008e082

Browse files
AlenkaF, raulcd and jorisvandenbossche
authored
GH-49002: [Python] Fix array.to_pandas string type conversion for arrays with None (#49247)
### Rationale for this change The conversion from array with string type to pandas series, when array only has a `None` element, has been taking the old code path even with pandas 3.0. ### What changes are included in this PR? Always check `dtype` in the `_array_like_to_pandas` conversion and use pandas' new default string `dtype` if available. ### Are these changes tested? Yes. ### Are there any user-facing changes? No, only a bug fix. * GitHub Issue: #49002 Lead-authored-by: Alenka Frim <AlenkaF@users.noreply.github.com> Co-authored-by: AlenkaF <frim.alenka@gmail.com> Co-authored-by: Raúl Cumplido <raulcumplido@gmail.com> Co-authored-by: Joris Van den Bossche <jorisvandenbossche@gmail.com> Signed-off-by: AlenkaF <frim.alenka@gmail.com>
1 parent 3c25e69 commit 008e082

File tree

2 files changed

+52
-3
lines changed

2 files changed

+52
-3
lines changed

python/pyarrow/array.pxi

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2349,6 +2349,13 @@ cdef _array_like_to_pandas(obj, options, types_mapper):
23492349
dtype = original_type.to_pandas_dtype()
23502350
except NotImplementedError:
23512351
pass
2352+
elif pandas_api.uses_string_dtype() and not options["strings_to_categorical"] and (
2353+
original_type.id == _Type_STRING or
2354+
original_type.id == _Type_LARGE_STRING or
2355+
original_type.id == _Type_STRING_VIEW
2356+
):
2357+
# for pandas 3.0+, use pandas' new default string dtype
2358+
dtype = pandas_api.pd.StringDtype(na_value=np.nan)
23522359

23532360
# Only call __from_arrow__ for Arrow extension types or when explicitly
23542361
# overridden via types_mapper

python/pyarrow/tests/test_pandas.py

Lines changed: 45 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2975,7 +2975,9 @@ def check_zero_copy_failure(self, arr):
29752975
arr.to_pandas(zero_copy_only=True)
29762976

29772977
def test_zero_copy_failure_on_object_types(self):
2978-
self.check_zero_copy_failure(pa.array(['A', 'B', 'C']))
2978+
if Version(pd.__version__) < Version("3.0.0"):
2979+
# pandas 3.0 includes default string dtype support
2980+
self.check_zero_copy_failure(pa.array(['A', 'B', 'C']))
29792981

29802982
def test_zero_copy_failure_with_int_when_nulls(self):
29812983
self.check_zero_copy_failure(pa.array([0, 1, None]))
@@ -3047,6 +3049,10 @@ def test_all_none_category(self):
30473049

30483050
def test_empty_arrays(self):
30493051
for dtype_str, pa_type in self.type_pairs:
3052+
if (Version(pd.__version__) >= Version("3.0.0") and
3053+
pa_type == pa.string()):
3054+
# PyArrow backed string dtype are set by default
3055+
dtype_str = 'str'
30503056
arr = np.array([], dtype=np.dtype(dtype_str))
30513057
_check_array_roundtrip(arr, type=pa_type)
30523058

@@ -3231,13 +3237,19 @@ def test_convert_empty_table(self):
32313237
empty_objects = pd.Series(np.array([], dtype=object))
32323238
tm.assert_series_equal(arr.to_pandas(),
32333239
pd.Series(np.array([], dtype=np.int64)))
3234-
arr = pa.array([], type=pa.string())
3235-
tm.assert_series_equal(arr.to_pandas(), empty_objects)
32363240
arr = pa.array([], type=pa.list_(pa.int64()))
32373241
tm.assert_series_equal(arr.to_pandas(), empty_objects)
32383242
arr = pa.array([], type=pa.struct([pa.field('a', pa.int64())]))
32393243
tm.assert_series_equal(arr.to_pandas(), empty_objects)
32403244

3245+
arr = pa.array([], type=pa.string())
3246+
if Version(pd.__version__) >= Version("3.0.0"):
3247+
# PyArrow backed string dtype are set by default
3248+
empty_str = pd.Series([], dtype=str)
3249+
tm.assert_series_equal(arr.to_pandas(), empty_str)
3250+
else:
3251+
tm.assert_series_equal(arr.to_pandas(), empty_objects)
3252+
32413253
def test_non_natural_stride(self):
32423254
"""
32433255
ARROW-2172: converting from a Numpy array with a stride that's
@@ -4652,6 +4664,36 @@ def test_chunked_array_to_pandas_types_mapper():
46524664
assert result.dtype == np.dtype("int64")
46534665

46544666

4667+
@pytest.mark.parametrize(
4668+
"string_type", [pa.string(), pa.large_string(), pa.string_view()]
4669+
)
4670+
@pytest.mark.parametrize("data", [[], [None]])
4671+
def test_array_to_pandas_string_dtype(string_type, data):
4672+
# GH-49002
4673+
if Version(pd.__version__) < Version("3.0.0"):
4674+
pytest.skip("PyArrow backed string dtype missing")
4675+
4676+
arr = pa.array(data, type=string_type)
4677+
result = arr.to_pandas()
4678+
assert result.dtype == pd.StringDtype(na_value=np.nan)
4679+
4680+
arr = pa.chunked_array([data], type=string_type)
4681+
result = arr.to_pandas()
4682+
assert result.dtype == pd.StringDtype(na_value=np.nan)
4683+
4684+
# Test types_mapper takes precedence
4685+
types_mapper = {string_type: None}.get
4686+
result = arr.to_pandas(types_mapper=types_mapper)
4687+
assert result.dtype == np.dtype("object")
4688+
4689+
# Test strings_to_categorical
4690+
result = arr.to_pandas(strings_to_categorical=False)
4691+
assert result.dtype == pd.StringDtype(na_value=np.nan)
4692+
result = arr.to_pandas(strings_to_categorical=True)
4693+
assert result.dtype == pd.CategoricalDtype(categories=[],
4694+
ordered=False)
4695+
4696+
46554697
# ----------------------------------------------------------------------
46564698
# Legacy metadata compatibility tests
46574699

0 commit comments

Comments
 (0)