Skip to content

Commit 008e082

Browse files
AlenkaF, raulcd and jorisvandenbossche
authored
GH-49002: [Python] Fix array.to_pandas string type conversion for arrays with None (#49247)
### Rationale for this change The conversion from array with string type to pandas series, when array only has a `None` element, has been taking the old code path even with pandas 3.0. ### What changes are included in this PR? Always check `dtype` in the `_array_like_to_pandas` conversion and use pandas' new default string `dtype` if available. ### Are these changes tested? Yes. ### Are there any user-facing changes? No, only a bug fix. * GitHub Issue: #49002 Lead-authored-by: Alenka Frim <AlenkaF@users.noreply.github.com> Co-authored-by: AlenkaF <frim.alenka@gmail.com> Co-authored-by: Raúl Cumplido <raulcumplido@gmail.com> Co-authored-by: Joris Van den Bossche <jorisvandenbossche@gmail.com> Signed-off-by: AlenkaF <frim.alenka@gmail.com>
1 parent 3c25e69 commit 008e082

File tree

2 files changed

+52
-3
lines changed

2 files changed

+52
-3
lines changed

python/pyarrow/array.pxi

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2349,6 +2349,13 @@ cdef _array_like_to_pandas(obj, options, types_mapper):
23492349
dtype = original_type.to_pandas_dtype()
23502350
except NotImplementedError:
23512351
pass
2352+
elif pandas_api.uses_string_dtype() and not options["strings_to_categorical"] and (
2353+
original_type.id == _Type_STRING or
2354+
original_type.id == _Type_LARGE_STRING or
2355+
original_type.id == _Type_STRING_VIEW
2356+
):
2357+
# for pandas 3.0+, use pandas' new default string dtype
2358+
dtype = pandas_api.pd.StringDtype(na_value=np.nan)
23522359

23532360
# Only call __from_arrow__ for Arrow extension types or when explicitly
23542361
# overridden via types_mapper

python/pyarrow/tests/test_pandas.py

Lines changed: 45 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2975,7 +2975,9 @@ def check_zero_copy_failure(self, arr):
29752975
arr.to_pandas(zero_copy_only=True)
29762976

29772977
def test_zero_copy_failure_on_object_types(self):
2978-
self.check_zero_copy_failure(pa.array(['A', 'B', 'C']))
2978+
if Version(pd.__version__) < Version("3.0.0"):
2979+
# pandas 3.0 includes default string dtype support
2980+
self.check_zero_copy_failure(pa.array(['A', 'B', 'C']))
29792981

29802982
def test_zero_copy_failure_with_int_when_nulls(self):
29812983
self.check_zero_copy_failure(pa.array([0, 1, None]))
@@ -3047,6 +3049,10 @@ def test_all_none_category(self):
30473049

30483050
def test_empty_arrays(self):
30493051
for dtype_str, pa_type in self.type_pairs:
3052+
if (Version(pd.__version__) >= Version("3.0.0") and
3053+
pa_type == pa.string()):
3054+
# PyArrow backed string dtype are set by default
3055+
dtype_str = 'str'
30503056
arr = np.array([], dtype=np.dtype(dtype_str))
30513057
_check_array_roundtrip(arr, type=pa_type)
30523058

@@ -3231,13 +3237,19 @@ def test_convert_empty_table(self):
32313237
empty_objects = pd.Series(np.array([], dtype=object))
32323238
tm.assert_series_equal(arr.to_pandas(),
32333239
pd.Series(np.array([], dtype=np.int64)))
3234-
arr = pa.array([], type=pa.string())
3235-
tm.assert_series_equal(arr.to_pandas(), empty_objects)
32363240
arr = pa.array([], type=pa.list_(pa.int64()))
32373241
tm.assert_series_equal(arr.to_pandas(), empty_objects)
32383242
arr = pa.array([], type=pa.struct([pa.field('a', pa.int64())]))
32393243
tm.assert_series_equal(arr.to_pandas(), empty_objects)
32403244

3245+
arr = pa.array([], type=pa.string())
3246+
if Version(pd.__version__) >= Version("3.0.0"):
3247+
# PyArrow backed string dtype are set by default
3248+
empty_str = pd.Series([], dtype=str)
3249+
tm.assert_series_equal(arr.to_pandas(), empty_str)
3250+
else:
3251+
tm.assert_series_equal(arr.to_pandas(), empty_objects)
3252+
32413253
def test_non_natural_stride(self):
32423254
"""
32433255
ARROW-2172: converting from a Numpy array with a stride that's
@@ -4652,6 +4664,36 @@ def test_chunked_array_to_pandas_types_mapper():
46524664
assert result.dtype == np.dtype("int64")
46534665

46544666

4667+
@pytest.mark.parametrize(
4668+
"string_type", [pa.string(), pa.large_string(), pa.string_view()]
4669+
)
4670+
@pytest.mark.parametrize("data", [[], [None]])
4671+
def test_array_to_pandas_string_dtype(string_type, data):
4672+
# GH-49002
4673+
if Version(pd.__version__) < Version("3.0.0"):
4674+
pytest.skip("PyArrow backed string dtype missing")
4675+
4676+
arr = pa.array(data, type=string_type)
4677+
result = arr.to_pandas()
4678+
assert result.dtype == pd.StringDtype(na_value=np.nan)
4679+
4680+
arr = pa.chunked_array([data], type=string_type)
4681+
result = arr.to_pandas()
4682+
assert result.dtype == pd.StringDtype(na_value=np.nan)
4683+
4684+
# Test types_mapper takes precedence
4685+
types_mapper = {string_type: None}.get
4686+
result = arr.to_pandas(types_mapper=types_mapper)
4687+
assert result.dtype == np.dtype("object")
4688+
4689+
# Test strings_to_categorical
4690+
result = arr.to_pandas(strings_to_categorical=False)
4691+
assert result.dtype == pd.StringDtype(na_value=np.nan)
4692+
result = arr.to_pandas(strings_to_categorical=True)
4693+
assert result.dtype == pd.CategoricalDtype(categories=[],
4694+
ordered=False)
4695+
4696+
46554697
# ----------------------------------------------------------------------
46564698
# Legacy metadata compatibility tests
46574699

0 commit comments

Comments
 (0)