Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions python/cudf/cudf/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -740,6 +740,10 @@ def to_array(
to_dtype = np.dtype(object)
if isinstance(to_dtype, cudf.CategoricalDtype):
to_dtype = to_dtype.categories.dtype
# In pandas 3.0+ string categories have StringDtype, which is
# not a numpy dtype. Map it to object so numpy can hold it.
if is_dtype_obj_string(to_dtype):
to_dtype = np.dtype(object)

if not isinstance(to_dtype, np.dtype):
raise NotImplementedError(
Expand Down
23 changes: 21 additions & 2 deletions python/cudf/cudf/core/resample.py
Original file line number Diff line number Diff line change
Expand Up @@ -164,6 +164,7 @@ def deserialize(cls, header, frames):
return out

def _handle_frequency_grouper(self, by):
from pandas.tseries.offsets import Day
# if `by` is a time frequency grouper, we bin the key column
# using bin intervals specified by `by.freq`, then use *that*
# as the groupby key
Expand Down Expand Up @@ -221,6 +222,13 @@ def _handle_frequency_grouper(self, by):
closed=closed,
)

# Track the natural end before adding the safety margin.
# When closed='right' and max_date falls exactly on a bin right
# boundary, _get_timestamp_range_edges returns end == max_date
# (the "already the end of the road" case in _adjust_dates_anchored).
# In that case pandas includes one trailing empty bin, so we must too.
natural_end = end

# in some cases, an extra time stamp is required in order to
# bin all the values. It's OK if we generate more labels than
# we need, as we remove any unused labels below
Expand Down Expand Up @@ -269,8 +277,19 @@ def _handle_frequency_grouper(self, by):
else:
cast_bin_labels = cast_bin_labels[:-1]

# if we have more labels than bins, remove the extras labels:
# if we have more labels than bins, remove the extra labels.
# When closed='right' and max_date was exactly on a bin right boundary
# (natural_end == max_date), include one trailing empty bin to match
# pandas behavior. This only applies to Day offsets; sub-day Tick
# offsets (e.g. Second) do not exhibit this behavior.

nbins = bin_numbers.max() + 1
if (
isinstance(offset, Day)
and closed == "right"
and natural_end == pd.Timestamp(max_date)
):
nbins = min(nbins + 1, len(cast_bin_labels))
if len(cast_bin_labels) > nbins:
cast_bin_labels = cast_bin_labels[:nbins]

Expand Down Expand Up @@ -329,7 +348,7 @@ def _get_timestamp_range_edges(
"""
from pandas.tseries.offsets import Day, Tick

if isinstance(freq, Tick):
if isinstance(freq, (Tick, Day)):
index_tz = first.tz
if isinstance(origin, pd.Timestamp) and (origin.tz is None) != (
index_tz is None
Expand Down
2 changes: 1 addition & 1 deletion python/cudf/cudf/tests/groupby/test_reductions.py
Original file line number Diff line number Diff line change
Expand Up @@ -436,7 +436,7 @@ def test_groupby_index_type():
df["string_col"] = ["a", "b", "c"]
df["counts"] = [1, 2, 3]
res = df.groupby(by="string_col").counts.sum()
assert res.index.dtype == cudf.dtype("object")
assert res.index.dtype == cudf.dtype("str")


@pytest.mark.parametrize(
Expand Down
Loading