diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 8900ecda9e8..6282b3cd0b7 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -740,6 +740,10 @@ def to_array( to_dtype = np.dtype(object) if isinstance(to_dtype, cudf.CategoricalDtype): to_dtype = to_dtype.categories.dtype + # In pandas 3.0+ string categories have StringDtype, which is + # not a numpy dtype. Map it to object so numpy can hold it. + if is_dtype_obj_string(to_dtype): + to_dtype = np.dtype(object) if not isinstance(to_dtype, np.dtype): raise NotImplementedError( diff --git a/python/cudf/cudf/core/resample.py b/python/cudf/cudf/core/resample.py index bdddd574f86..a1f295b2c40 100644 --- a/python/cudf/cudf/core/resample.py +++ b/python/cudf/cudf/core/resample.py @@ -164,6 +164,7 @@ def deserialize(cls, header, frames): return out def _handle_frequency_grouper(self, by): + from pandas.tseries.offsets import Day # if `by` is a time frequency grouper, we bin the key column # using bin intervals specified by `by.freq`, then use *that* # as the groupby key @@ -221,6 +222,13 @@ def _handle_frequency_grouper(self, by): closed=closed, ) + # Track the natural end before adding the safety margin. + # When closed='right' and max_date falls exactly on a bin right + # boundary, _get_timestamp_range_edges returns end == max_date + # (the "already the end of the road" case in _adjust_dates_anchored). + # In that case pandas includes one trailing empty bin, so we must too. + natural_end = end + # in some cases, an extra time stamp is required in order to # bin all the values. It's OK if we generate more labels than # we need, as we remove any unused labels below @@ -269,8 +277,19 @@ def _handle_frequency_grouper(self, by): else: cast_bin_labels = cast_bin_labels[:-1] - # if we have more labels than bins, remove the extras labels: + # if we have more labels than bins, remove the extra labels. + # When closed='right' and max_date was exactly on a bin right boundary + # (natural_end == max_date), include one trailing empty bin to match + # pandas behavior. This only applies to Day offsets; sub-day Tick + # offsets (e.g. Second) do not exhibit this behaviour. + nbins = bin_numbers.max() + 1 + if ( + isinstance(offset, Day) + and closed == "right" + and natural_end == pd.Timestamp(max_date) + ): + nbins = min(nbins + 1, len(cast_bin_labels)) if len(cast_bin_labels) > nbins: cast_bin_labels = cast_bin_labels[:nbins] @@ -329,7 +348,7 @@ def _get_timestamp_range_edges( """ from pandas.tseries.offsets import Day, Tick - if isinstance(freq, Tick): + if isinstance(freq, (Tick, Day)): index_tz = first.tz if isinstance(origin, pd.Timestamp) and (origin.tz is None) != ( index_tz is None diff --git a/python/cudf/cudf/tests/groupby/test_reductions.py b/python/cudf/cudf/tests/groupby/test_reductions.py index ed9084e3f0b..4995b728f4f 100644 --- a/python/cudf/cudf/tests/groupby/test_reductions.py +++ b/python/cudf/cudf/tests/groupby/test_reductions.py @@ -436,7 +436,7 @@ def test_groupby_index_type(): df["string_col"] = ["a", "b", "c"] df["counts"] = [1, 2, 3] res = df.groupby(by="string_col").counts.sum() - assert res.index.dtype == cudf.dtype("object") + assert res.index.dtype == cudf.dtype("str") @pytest.mark.parametrize(